Exemplo n.º 1
0
 def fogbugzOnFail(self,logfp):
     """
     Open a FogBugz ticket for a failed parser and attach its log file.

     The site, user and password are read from the "fogbugz" section of the
     configuration file. At most three attempts are made. Errors are printed
     together with their traceback frames and are never raised to the caller.

     *Required Parameter*

     :param logfp: log file path
     """
     print("Creating FogBuz Ticket")
     cfp=Config(self.__fpath)
     maxattempts=3
     attempts=0
     created=False
     while created is False and attempts < maxattempts:
         #count every try, not only the successful one, so that a persistent
         #failure cannot keep this loop running forever
         attempts+=1
         try:
             site=FogBugz(cfp.getVar("fogbugz","site","string"))
             try:
                 site.logon(cfp.getVar("fogbugz","user","string"), cfp.getVar("fogbugz","pass","string"))
                 with open(logfp,'rb') as fp:
                     print(site.new(sTitle="The Parser "+os.path.join(self.__logbase,self.__execute)+" Failed",ixPersonAssignedTo="Andy",Files={"faillog.txt":fp}))
                 created=True
             except Exception as e:
                 print(str(e))
                 for frame in traceback.extract_tb(sys.exc_info()[2]):
                     print('\n'.join([str(x) for x in frame]))
             finally:
                 #always release the session, even when the ticket was not filed
                 site.logoff()
         except Exception as e:
             #reached when the connection itself or the logoff fails
             print(str(e))
             for frame in traceback.extract_tb(sys.exc_info()[2]):
                 print('\n'.join([str(x) for x in frame]))
Exemplo n.º 2
0
class ProxyManager(object):
    """
    Small HTTP client for the proxy manager service whose address is stored
    under proxymanager/server in the configuration file.
    """

    def __init__(self,fpath):
        """
        Load the configuration.

        :param fpath: path to the configuration file; it must exist
        :raises ConfigurationNotFoundException: when fpath is None or does not exist
        """
        self.__fpath = fpath
        if fpath is None or not os.path.exists(fpath):
            raise ConfigurationNotFoundException("Valid Configuration must be Generated")
        self._cfp = Config(fpath)

    def getProxies(self,number,domain):
        """
        Call the getProxy endpoint and return the decoded JSON response.

        :param number: value sent as the ``number`` query parameter
        :param domain: value sent (stripped) as the ``domain`` query parameter
        """
        server = self._cfp.getVar("proxymanager", "server", "String").strip()
        endpoint = "getProxy?auth=&number={}&domain={}".format(number,domain.strip())
        response = urllib2.urlopen(urlparse.urljoin(server,endpoint), None, timeout = 120)
        return json.loads(response.read())

    def dropProxy(self,domain,ip):
        """
        Call the dropProxyForJson endpoint and return the decoded JSON response.

        :param domain: value sent (stripped) as the ``domain`` query parameter
        :param ip: value sent (stripped) as the ``ip`` query parameter
        """
        server = self._cfp.getVar("proxymanager", "server", "String").strip()
        endpoint = "dropProxyForJson?auth=&domain={}&ip={}".format(domain.strip(),ip.strip())
        response = urllib2.urlopen(urlparse.urljoin(server,endpoint), None, timeout = 120)
        return json.loads(response.read())

    def dropDomain(self,domain):
        """
        Call the dropDomainForJsony endpoint and return the decoded JSON response.

        NOTE(review): the endpoint name ends in "Jsony", which looks like a typo --
        confirm against the server before changing it.

        :param domain: value sent (stripped) as the ``domain`` query parameter
        """
        server = self._cfp.getVar("proxymanager", "server", "String").strip()
        endpoint = "dropDomainForJsony?auth=&domain={}".format(domain.strip())
        response = urllib2.urlopen(urlparse.urljoin(server,endpoint), None, timeout = 120)
        return json.loads(response.read())
Exemplo n.º 3
0
 def run(self):
     """
     Executes the configured jar as a separate process and waits for completion. The worker itself is a Thread so run is the
     necessary name of the method.

     The process output (stdout and stderr) is written to a time-stamped file under <logbase>logs/. The
     crawlers.ca_crim_parsers_* tables are updated before and after the run, and on a non-zero exit code either
     fogbugzOnFail or emailOnFail is called, depending on notification/err_type in the configuration.
     The semaphore is incremented for the duration of the run and always decremented afterwards.
     Exceptions are printed, not raised.
     """
     print("Executing "+self.__execute)
     p=Psyco(self.__fpath)
     cfp=Config(self.__fpath)
     
     if self.__execute is not None and self.__logbase is not None:
         try:
             logfp=self.__logbase+"logs/"+self.__execute.replace(".xml","")+str(int(round(time.time())))+".txt"
             self.__sema.increment(self.__lock)
             try:
                 p.execute("INSERT INTO crawlers.ca_crim_parsers_running(name) VALUES('"+self.__execute+"')")
                 p.execute("DELETE FROM crawlers.ca_crim_parsers_not_run WHERE name LIKE '"+self.__execute+"'")
                 
                 #the log is closed as soon as the process ends so that the failure
                 #handlers below read a complete file and the handle is not leaked
                 with open(logfp,'w+') as stdfile:
                     #the products config file will be in the base directory
                     cmd="/usr/bin/java -Xms"+cfp.getVar("Pentaho","xms","string").strip()+" -Xmx"+cfp.getVar("Pentaho","xmx","string").strip()+" -XX:+UseConcMarkSweepGC -Xcompactexplicitgc -Dbeansfile="+self.__logbase+self.__execute+" -jar "+self.__jarfile
                     print(cmd)
                     pipe=subprocess.Popen(shlex.split(cmd), stdout=stdfile,stderr=subprocess.STDOUT,shell=False)
                     ret=pipe.wait()
                     
                 print("Completed "+self.__execute)
                 p.execute("DELETE FROM crawlers.ca_crim_parsers_running WHERE name LIKE '"+self.__execute+"'")
                 #compare by value; identity comparison of ints only works by accident
                 if ret == 0:
                     p.execute("INSERT INTO crawlers.ca_crim_parsers_complete (name,crawlstamp) VALUES('"+self.__execute+"','"+str(self.__datestamp)+"')")
                 else:
                     print("PARSERS- Premature Detonation!!! Failure "+self.__execute)
                     print(str(ret))
                     if cfp.getVar("notification", "err_type","string") == 'fog':
                         self.fogbugzOnFail(logfp)
                     else:
                         self.emailOnFail(logfp)
                     
             finally:
                 print("Decrementing")
                 self.__sema.decrement(self.__lock)
         except Exception as e:
             print("Process Failed")
             print(str(e))
             for frame in traceback.extract_tb(sys.exc_info()[2]):
                 fname,lineno,fn,text = frame
                 print("Error in %s on line %d" % (fname, lineno))
         print("Worker Complete "+str(time.time()))
Exemplo n.º 4
0
 def __init__(self, fpath=None, logger=None):
     """
     Initialise the instance state.

     *Optional Parameters*
     :param fpath:   Path handed to Config; when None no configuration is loaded.
     :param logger:  Object stored on the instance as ``logger``.
     """
     self._errors, self._fpath, self._cfp = {}, fpath, None
     # -1 is the initial error code
     self.errCo = -1
     if fpath is not None:
         self._cfp = Config(fpath)
     self.logger = logger
Exemplo n.º 5
0
 def __init__(self, fpath=None, logger=None):
     """
     Initialise the instance state.

     *Optional Parameters*
     :param fpath:   Path handed to Config; when None no configuration is loaded.
     :param logger:  Object stored on the instance as ``logger``.
     """
     self._errors = dict()
     self._fpath = fpath
     self._cfp = None
     # -1 is the initial error code
     self.errCo = -1
     config_given = fpath is not None
     if config_given:
         self._cfp = Config(fpath)
     self.logger = logger
Exemplo n.º 6
0
 def setup(self,folder,table,confPath,createImages=False):
     '''
     Setup status table and folder table.

     Creates <Images/fpath>FL/SOR/<folder>. When createImages is True the folder is
     also registered in folders.names and an Images/ sub-folder is created inside it.
          
     *Required Parameters*
     :param folder: the folder name to use
     :param table: the table to use in setting up the status (currently not used by this method)
     :param confPath: the configuration path
     
     *Optional Parameters*
     :param createImages: whether or not to create an images folder

     :raises OSError: when a folder cannot be created, e.g. because it already exists
     '''
     cfp=Config(confPath)
     basedir=cfp.getVar("Images","fpath","string")+"FL/SOR/"+folder
     os.mkdir(basedir)
     
     p=Psyco(confPath)
     
     if createImages is True:
         #update crawl specific tables
         p.execute("INSERT INTO folders.names (folder,schema) VALUES ('"+folder+"','us_fl_crawlsor')")
         #the base folder already exists at this point, only the sub-folder is
         #created here; creating the base folder twice raised OSError
         os.mkdir(basedir+"/Images/")
Exemplo n.º 7
0
 def changeOnComplete(self,alterSchema,confPath,folder=None):
     '''
     Handles the final cleanup on the last part of the crawl.

     Renames the us_fl_crawlsor schema to us_fl_crawlsor<timestamp>, optionally
     marks the folder as complete in folders.names, and records the source file
     name together with the new schema name in the table configured under
     finished/alterschema and finished/altertable.
     
     *Required Parameters*
     :param alterSchema: the schema to rename and use (currently not used by this method)
     :param confPath: the configuration path to use 
     
     *Optional Parameters*
     :param folder: folder name to use
     
     NOTE: The folder name should be the same as provided from getFolder and/or to setup if used.
     '''
     cfp=Config(confPath)
     p=Psyco(confPath)
     rename="us_fl_crawlsor"+datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d%H%M%S')
     print("Renaming Schema to "+rename)
     p.execute("ALTER SCHEMA us_fl_crawlsor RENAME TO "+rename)
     
     if folder is not None:
         #update the folder name information; both values are text and must be
         #quoted, otherwise they are parsed as identifiers
         p.execute("UPDATE folders.names SET status=true,schema='"+rename+"' WHERE folder='"+folder+"'")
     
     #file name of this source file, without its directory
     srcpath=inspect.getfile(inspect.currentframe())
     found=re.search(r'.*?\/([^\/]+\.py.*)',srcpath)
     #fall back to the unmodified path when it contains no directory part
     filename=found.group(1) if found is not None else srcpath
     p.execute("INSERT INTO "+cfp.getVar("finished","alterschema","string")+"."+cfp.getVar("finished","altertable","string")+"(filename,schema) VALUES('"+filename+"','"+rename+"')")
Exemplo n.º 8
0
 def emailOnFail(self,logfp):
     """
     Email on a failed crawler. Sender should not be internal for outlook purposes.

     The log file is attached as faillog.txt. Host, sender and the comma separated
     recipient list are read from the "mail" section of the configuration file.
     
     *Required Parameter*
     
     :param logfp: log file path
     """
     print("Sending Alert Message")
     
     cfp=Config(self.__fpath)
     sender=cfp.getVar("mail","sender","string")
     recipients=cfp.getVar("mail","recipients","string")
     
     #get log as attachment
     with open(logfp,'r') as fp:
         att=fp.read()
     
     #prep
     body=MIMEMultipart('alternative')
     body.attach(MIMEText("PARSER - Premature Detonation!!! Failed or Terminated Parser: "+os.path.join(self.__logbase,self.__execute)))
     
     msg=MIMEMultipart()
     msg.attach(body)
     msg['From']=sender
     msg['To']=recipients
     msg['Subject']="Failed CACRIM Parser"
     part=MIMEBase('application',"octet-stream")
     part.set_payload(att)
     Encoders.encode_base64(part)
     part.add_header('Content-Disposition','attachment',filename="faillog.txt")
     
     #attach
     msg.attach(part)
     
     #send; sendmail expects a flat list of address strings, one per recipient
     addresses=[x.strip() for x in recipients.split(",") if x.strip()]
     mailobj=smtplib.SMTP(cfp.getVar("mail","host","string"))
     try:
         mailobj.sendmail(sender,addresses,msg.as_string())
     finally:
         #close the connection even when sending fails
         mailobj.quit()
     print("Mail Sent")
Exemplo n.º 9
0
class Errors(object):
    """
    Errors is the class that handles all error processing.

    The class derives from ``object`` because property setters such as
    ``errCo`` are only honoured on new-style classes under Python 2.
    """

    # constructor
    def __init__(self, fpath=None, logger=None):
        """
        *Optional Parameters*
        :param fpath:   The file path to use.
        :param logger:  The logger to use.
        """
        self._errors = {}
        self._fpath = fpath
        self._cfp = None
        self.errCo = -1
        if self._fpath is not None:
            self._cfp = Config(self._fpath)
        self.logger = logger

    @property
    def errCo(self):
        """The code stored for errors that carry no code of their own (initially -1)."""
        return self.__errCo

    @errCo.setter
    def errCo(self, errCo):
        self.__errCo = errCo

    @staticmethod
    def _sqlLiteral(text):
        """
        Wraps text in single quotes for use as a SQL string literal and
        doubles every embedded single quote.

        :param text: The string to quote.
        """
        return "'" + text.replace("'", "''") + "'"

    def printError(self, msg):
        """
        General Error Printing that checks for a log (Python2.7 cannot override print)
        
        :param msg: The message 
        """
        if self.logger is not None:
            self.logger.error(msg)
        else:
            print(msg)

    # print traceback frames
    def printFrames(self, frames):
        """
        Prints traceback frames.
        :param frames: The exception frames.
        """
        for frame in frames:
            fname, lineno, fn, text = frame
            self.printError("Error in %s on line %d" % (fname, lineno))

    def postToErrorTables(self, sourcename, e, frames):
        """
        Posts an error to the database table configured under Error/table. The
        expectation is that the data collected here is useful in
        anomaly detection.
        
        The stored code is e.code for an HTTPError, otherwise e.errno when it is
        set, otherwise the errCo property (-1 unless changed).
        
        Nothing is inserted, and a message is printed instead, when sourcename is
        None or when no configuration path was given to the constructor.
        
        :param sourcename: The name of the source.
        :param e: The error, Exception instance.
        :param frames: The traceback frames from the error (currently not stored).
        """
        code = self.__errCo
        etype = str(type(e))

        if "HTTPError" in etype:
            code = e.code
        elif getattr(e, "errno", None) is not None:
            # IOError based exceptions always have the attribute, but it is
            # None unless an OS error number was supplied
            code = e.errno

        if "URLError" in etype:
            # reason is an attribute, not a method
            description = str(e.reason)
        else:
            description = str(e)

        if sourcename is None:
            self.printError("Please Specify a Source Name")
            return

        if self._fpath is None or self._cfp is None:
            self.printError("Please Specify a Config Path")
            return

        # etype always contains single quotes ("<class '...'>") and messages
        # often do, so every value has to be escaped before it is embedded
        values = ",".join(
            [
                self._sqlLiteral(sourcename),
                self._sqlLiteral(etype),
                self._sqlLiteral(str(code)),
                self._sqlLiteral(description),
            ]
        )
        p = Psyco(self._fpath)
        p.execute(
            "INSERT INTO "
            + self._cfp.getVar("Error", "table", "string")
            + " (source,type,code,description) VALUES("
            + values
            + ")"
        )

    # handle http error
    def httpError(self, e, link, infailcodes, inkillswitch, incurrpos, frames):
        """
        Handles httpErrors
        
        *Required Parameters*
        
        :param e: error instance
        :param link: url address when the error occurred
        :param infailcodes: fail codes for the url
        :param inkillswitch: number of failures to the current point
        :param incurrpos: current position in the link list on fail
        :param frames: the traceback frames at failure
        
        :return: tuple (failcodes, killswitch, currpos). The reason, code or
                 message of the error is appended to failcodes followed by a
                 comma. killswitch is incremented while it is below 5; once it
                 has reached 5 it is reset to 0 and currpos is advanced by one.
        """
        currpos = incurrpos
        killswitch = inkillswitch
        failcodes = infailcodes
        etype = str(type(e))

        print("Failed to Get Page " + link)
        if "URLError" in etype:
            self.printError("URLError")
            print(e.reason)
            failcodes += str(e.reason) + ","
        elif "HTTPError" in etype:
            self.printError("HTTPError")
            print(e.code)
            failcodes += str(e.code) + ","
        else:
            self.printError("Exception Error")
            print(str(e))
            failcodes += str(e) + ","

        if killswitch < 5:
            killswitch += 1
        else:
            self.printError("Critical Failure for " + link)
            # give up on this link: restart the failure count for the next one
            killswitch = 0
            currpos += 1

        self.printFrames(frames)

        return (failcodes, killswitch, currpos)

    # crawlersetup failure
    def crawlerSetupError(self, e, frames):
        """
        Handles failures in crawler setup while taking in the error 
        instance and traceback frames.
        
        :param e: The exception.
        :param frames: The exception frames.
        """
        self.printError("Crawler Setup Failed")
        self.error(e, frames)

    # crawler process failure other than in the pull
    def crawlerFail(self, e, frames, restart):
        """
        Handles a failure in the crawler.
        :param e: The exception.
        :param frames: The frames to use.
        :param restart: Whether or not the process will restart.
        """
        print("Failed to Get Pages.")
        if restart is True:
            self.printError("Will Restart This Iteration")
        self.error(e, frames)

    # Prints an error without a header
    def error(self, e, frames):
        """
        General error method for handling error instances and traceback
        frames.
        
        :param e: The exception.
        :param frames: The exception frames.
        """
        self.printError(str(e))
        self.printFrames(frames)
Exemplo n.º 10
0
    def parse(self):
        """
        Run the parser.
        """
        cfp=Config(self.__fpath)
        runlist=[]
        
        files=self.getXmlFiles(cfp.getVar("base", "basedir","string"))
        print "Files Found: ",
        print sorted(files)
        
        #get completed crawlers and parsers
        p=Psyco(self.__fpath)
        data=p.getData("SELECT q1.name,q1.stamp FROM (SELECT * FROM (SELECT name,max(date) as stamp FROM crawlers.ca_crim_weekly_complete GROUP BY name) as q1 UNION (SELECT name,max(date) FROM crawlers.ca_crim_monthly_complete GROUP BY name UNION SELECT name,max(date) FROM crawlers.ca_crim_daily_complete GROUP BY name)) as q1 LEFT OUTER JOIN (SELECT name,crawlstamp FROM crawlers.ca_crim_parsers_complete) as q2 ON q1.stamp = q2.crawlstamp AND regexp_replace(q1.name,'.py','') = regexp_replace(q2.name,'.xml','') WHERE q2.name IS NULL")      
        
        
        nilList=[]
        for xfile in data:
            if re.sub('\.py.*','',xfile[0])+".xml" not in files:
                nilList.append(re.sub('\.py.*','',xfile[0])+".xml") 
                
        if len(nilList) >0:
            print "Files Do Not Exist for the Following (the name but not the extension must match)",
            print sorted(nilList)
        else:
            print "All Crawlers Have Corresponding Parsers"
        
        #get datestamp dict -- assumes that the query gets the max(date)
        dates={}
        for xfile in data:
            fp=re.sub('\.py.*','',xfile[0])+".xml"
                     
            if fp not in dates:
                dates[fp]=xfile[1]

        
        for xfile in data:
            if xfile is not None:
                fp=xfile[0]
                
                if fp.replace('.py','.xml') in files:
                    runlist.append(fp.replace('.py','.xml'))
                   
        print "Execution List: ",
        print sorted(runlist)
        
        nilList=[]
        for xfile in data:
            if re.sub('\.py.*','',xfile[0])+".xml" not in files:
                nilList.append(re.sub('\.py.*','',xfile[0])+".xml") 
        
        if len(nilList) >0:
            print "Parsers that may not have been Found",
            print sorted(nilList)
        else:
            print "All Completed and Found crawlers Accounted For"

        #run the crawlers from the run list
        if len(runlist) > 0:
            pnum=0
            pool=[]
            #get the semaphore and lock
            sema=IncSemaphore()
            lock=Lock()
            
            #max processes
            maxprocs=cfp.getVar("opts","maxdel","int")
            print "Max Processes: "+str(maxprocs)
            #run
            cfp=Config(self.__fpath)
            while len(runlist) >0:
        
                if pnum<maxprocs and len(runlist)>0 and runlist[0] in files:
                    ex=runlist.pop()
                    print "Starting "+ex
                    w=Worker(self.__fpath,ex.strip(),cfp.getVar("base","logbase","string"),cfp.getVar("db", "dbname","string"),cfp.getVar("db", "user","string"),cfp.getVar("db", "passw","string"),cfp.getVar("db", "host","string"), cfp.getVar("db", "port","string"),sema,lock,dates[ex],cfp.getVar("jars","parser","string"))
                    w.start()
                    pool.append(w)
                    pnum+=1
                    print "Continuing"
                
                    
                while sema.getValue() >= maxprocs and len(pool) > 0:
                    print "Waiting for Parsers to Complete"
                    time.sleep(random.randint(1,120))
                    
                    for proc in pool:
                        if not proc.isAlive():
                            pool.remove(proc)
                            del proc
    
                            pnum-=1
                        
                            if len(pool) is 0 and sema.getValue() >0:
                                sema.setValue(0, lock)
                            
                            gc.collect()
                            del gc.garbage[:]
                
                for proc in pool:
                    if not proc.isAlive():
                        pool.remove(proc)
                        del proc

                        pnum-=1
                    
                        if len(pool) is 0 and sema.getValue() >0:
                            sema.setValue(0, lock)
                        
                        gc.collect()
                        del gc.garbage[:]
                
                if sema.getValue()==0 and len(runlist)==0:
                    break
            time.sleep(30)
            
            print "Completed Loop. Awaiting any final Completions."
            gc.collect()
            del gc.garbage[:]
            
            while sema.getValue()>0 and len(pool)>0:
                print "Waiting for Completion"
                time.sleep(random.randint(1,120))
            
            if len(pool) is 0 and sema.getValue() >0:
                    sema.setValue(0, lock)
            
                
                
            print "Current Execution List Complete. Will Restart Soon!"
Exemplo n.º 11
0
 def start(self):
     """
     Start the controller. The controller is not multi-processed.
     Paths are configured, executables obtained, workers run; logs compressed from
     here.
     """
     #get configuration file
     cfp=Config(self.__fpath)
         
     if not os.path.isdir(cfp.getVar("base","basedir","string")+"logs"):
         os.mkdir(cfp.getVar("base","basedir","string")+"logs")
     
     #setup variables
     maxprocs=cfp.getVar("opts", "maxdel","int")
     #the threadpool
     pool=[]
     
     #set up db
     p=Psyco(cfp.getVar("db", "dbname","string"),cfp.getVar("db", "user","string"),cfp.getVar("db", "passw","string"),cfp.getVar("db", "host","string"), cfp.getVar("db", "port","string"))
     p.prep()
     p.execute("CREATE TABLE IF NOT EXISTS data.failedurls(source text,url text)")
     
     prioritylist=[]
     if cfp.getVar("priorities","usepriority","int") is 1:
         prioritylist=cfp.getVar("priorities","priority_list","list")
     else:
         prioritylist=[x[0] for x in p.getData("SELECT name FROM crawlers.sor_complete  WHERE name NOT ILIKE '%ParserController%' ORDER BY date ASC")]
     
     #get the python files
     
     #get the semaphore and lock
     sema=IncSemaphore()
     lock=Lock()
     
     #the loop
     run=True
     
     print "Starting Crawlers"
     print "MAX: "+str(maxprocs)
     procnum=0
     print "Base Directory is "+os.path.split(inspect.stack()[0][1])[0]
     while run is True:
         execs=self.cleanUp(self.getPyFiles(os.path.split(inspect.stack()[0][1])[0]),cfp.getVar("restart","waitdays","string"),p)
         
         execs=list(set(execs))
         
         execs=[x for x in execs if x not in prioritylist]
         prioritylist.extend(execs)
         execs=prioritylist
         
         print "Executables: "+str(execs)
         
         startnew=maxprocs-sema.getValue()
         
         while  len(pool) >0 or procnum is 0 and len(execs)>0:
             
             
             print "Processes Left "+str(startnew)
             print "Number of Running Processes "+str(sema.getValue())
             for proc in pool:
                 if not proc.isAlive():
                     pool.remove(proc)
                     del proc
                     procnum-=1
                     execs=self.repopulate(execs,cfp.getVar("restart","waitdays","int"),p,maxprocs-sema.getValue(),cfp.getVar("base","basedir","string"))
                     execs=list(set(self.cleanUp(self.getPyFiles(cfp.getVar("base","basedir","string")),cfp.getVar("restart","waitdays","string"),p)))
             if len(pool) is 0 and sema.getValue() >0:
                 sema.setValue(0, lock)
             
             print "Pnum: "+str(procnum)
             
             if procnum < len(execs) and sema.getValue() < maxprocs:
                 
                 if float(psutil.swap_memory()[3])>2.0 and float(psutil.cpu_times_percent(interval=1,percpu=False)[4])>10.0:
                     print "Resources Low! Waiting for resources to free."
                     print "SMEM: "+str(psutil.swap_memory()[3])
                     print "IOWait"+str(float(psutil.cpu_times_percent(interval=1,percpu=False)[4]))
                     run=True
                     while run is True:
                         avgsmem=0
                         avgiowait=0
                         avgmem=0
                         for i in range (0,32):
                             time.sleep(.5)
                         
                             avgsmem+=psutil.swap_memory()[3]
                             avgmem+=psutil.virtual_memory()[2]
                             avgiowait+=float(psutil.cpu_times_percent(interval=1,percpu=False)[4])
                         
                         avgsmem/=32
                         avgmem/=32
                         avgiowait/=32
                         
                         print "Avg. Swap Space: "+str(avgsmem)
                         print "Avg. Mem: "+str(avgmem)
                         print "Avg. IO Wait: "+str(avgiowait)
                         
                         if avgiowait<10 and avgsmem<2:
                             print "Resource Level Acceptable. Continuing!"
                             run=False
                         else:
                             print "Resources Low! Waiting for resources to free."
                 
                 try:
                     ex=execs[procnum]
                     print "Starting "+ex
                     w=Worker(self.__fpath,ex.strip(),cfp.getVar("base","logbase","string"),cfp.getVar("db", "dbname","string"),cfp.getVar("db", "user","string"),cfp.getVar("db", "passw","string"),cfp.getVar("db", "host","string"), cfp.getVar("db", "port","string"),sema,lock)
                     w.start()
                     pool.append(w)
                     time.sleep(5)
                     procnum+=1
                     execs.remove(ex)
                 except Exception, e:
                     print "Failed to Start a Crawler"
                     print "Crawler Was: "+ex
                     print str(e)
                     for frame in traceback.extract_tb(sys.exc_info()[2]):
                         fname,lineno,fn,text = frame
                         print "Error in %s on line %d" % (fname, lineno)
             elif sema.getValue()>0:
                 print "Waiting for Crawlers to Complete "+str(sema.getValue())
                 print "Waiting To Start ",
                 print execs
                 time.sleep(30)
             startnew=maxprocs-sema.getValue()
Exemplo n.º 12
0
 def __init__(self,fpath):
     """
     Load the configuration.

     :param fpath: path to the configuration file; it must exist
     :raises ConfigurationNotFoundException: when fpath is None or does not exist
     """
     self.__fpath = fpath
     config_missing = fpath is None or not os.path.exists(fpath)
     if config_missing:
         raise ConfigurationNotFoundException("Valid Configuration must be Generated")
     self._cfp = Config(fpath)