예제 #1
0
    def postToErrorTables(self, sourcename, e, frames):
        """
        Record an exception as one row in the configured error table.

        The row holds the source name, the exception type (``str(type(e))``),
        an error code and a description. The expectation is that the data
        collected here is useful in anomaly detection.

        The code starts as ``self.__errCo`` (the original notes describe the
        default as -1, meaning "no code", and as a changeable property). It is
        replaced by ``e.code`` for HTTPError-like exceptions, or by ``e.errno``
        when the exception carries a non-None errno.

        Nothing is inserted, and a message is sent to ``self.printError``, when
        ``sourcename`` is None or when ``self._fpath`` / ``self._cfp`` is unset.

        :param sourcename: The name of the source.
        :param e: The error, Exception instance.
        :param frames: The traceback frames from the error. Accepted for
            interface compatibility; this method does not store them.
        """
        etype = str(type(e))
        code = self.__errCo

        if "HTTPError" in etype:
            code = e.code
        elif getattr(e, "errno", None) is not None:
            # Only override the default when the exception really has an errno;
            # many OSError subclasses expose errno=None.
            code = e.errno

        if "URLError" in etype:
            # URLError.reason is an attribute (a string or a nested exception),
            # not a method, so it must not be called.
            description = str(e.reason)
        else:
            description = str(e)

        if sourcename is None:
            self.printError("Please Specify a Source Name")
            return

        if self._fpath is None or self._cfp is None:
            self.printError("Please Specify a Config Path")
            return

        def sqlLiteral(value):
            # Double embedded single quotes so the value cannot terminate its
            # SQL string literal. str(type(e)) always contains quotes, so
            # without this the statement is malformed for every exception.
            return "'" + value.replace("'", "''") + "'"

        values = ",".join(
            [
                sqlLiteral(sourcename),
                sqlLiteral(etype),
                sqlLiteral(str(code)),
                sqlLiteral(description),
            ]
        )

        # NOTE(review): the table name comes from configuration and is used
        # verbatim as an identifier.
        p = Psyco(self._fpath)
        p.execute(
            "INSERT INTO "
            + self._cfp.getVar("Error", "table", "string")
            + " (source,type,code,description) VALUES("
            + values
            + ")"
        )
예제 #2
0
 def run(self):
     """
     Crawl the Australian National Security listed-terrorist-organisations
     index and pass the linked pages to the parser.

     The index page is fetched through the opener returned by
     ``GetPage.getOpener()``, then ``crawler.loop`` is called with a single
     ``Parser`` built from the category word-list files. After the loop
     returns, a 'finished' marker row is inserted into au_parse_test.terms.

     Any exception raised by the fetch, the crawl or the marker insert is
     reported through ``self.__err.crawlerFail`` with the extracted traceback
     frames and is not re-raised.
     """
     p=Psyco(self.__fpath)
     crawler=GetPage(self.__proxy)
     opener=crawler.getOpener()

     try:
         # Fetch inside the try so a network failure is reported through
         # crawlerFail like every other crawl error.
         # NOTE(review): presumably a urllib2-style opener, making None the
         # request body and 120 the timeout in seconds; confirm.
         html=opener.open("http://www.nationalsecurity.gov.au/Listedterroristorganisations/Pages/default.aspx",None,120).read()

         spynMap={
         }

         confMap={
             "fpath":self.__fpath
         }

         # Category name -> word-list file handed to Parser.
         # NOTE(review): these are absolute paths on one developer machine, and
         # "nrecruitment.txt" looks like a typo for "recruitment.txt"; confirm.
         cats={
             "targets":"/home/aevans/Documents/cats/terror/targets.txt",
             "activities":"/home/aevans/Documents/cats/terror/activities.txt",
             "attacks":"/home/aevans/Documents/cats/terror/attacks.txt",
             "finance":"/home/aevans/Documents/cats/terror/finance.txt",
             "charges":"/home/aevans/Documents/cats/terror/charges.txt",
             "convictions":"/home/aevans/Documents/cats/terror/convictions.txt",
             "risk":"/home/aevans/Documents/cats/terror/risk.txt",
             "leadership":"/home/aevans/Documents/cats/terror/leadership.txt",
             "background":"/home/aevans/Documents/cats/terror/background.txt",
             "disclaimer":"/home/aevans/Documents/cats/terror/disclaimer.txt",
             "family":"/home/aevans/Documents/cats/terror/family.txt",
             "noninfo":"/home/aevans/Documents/cats/terror/noninfo.txt",
             "recruitment":"/home/aevans/Documents/cats/terror/nrecruitment.txt"
         }

         parseMap=[{
            "class":Parser(cats),
            "passSpynner":False
         }]

         # Raw string: the same pattern as before, without relying on Python
         # passing the unrecognised "\/" escape through unchanged.
         linkPattern=re.compile(r"www.nationalsecurity.gov.au\/Listedterroristorganisations\/")

         pages,faillist=crawler.loop(
             html,
             linktag="a",
             linkattr="href",
             linklimiters={"href":linkPattern},
             pageId=self.__pid,
             maxproxies=self.__maxproxies,
             spynnerMap=spynMap,
             opener=opener,
             waitload=120,
             proxy=self.__proxy,
             hashName="hash",
             table="au_parse_test.html",
             test=False,
             printTest=False,
             faillist=[],
             database=p,
             cfpMap=confMap,
             parserClassMap=parseMap,
             commitSize=100,
             checkLinks=True
         )
         p.execute("INSERT INTO au_parse_test.terms VALUES('finished')")
     except Exception as e:
         self.__err.crawlerFail(e,traceback.extract_tb(sys.exc_info()[2]), True)
예제 #3
0
 def setup(self,folder,table,confPath,createImages=False):
     '''
     Create the crawl output folder and, optionally, register it and add an
     Images subfolder.

     The folder is created at ``<Images.fpath>FL/SOR/<folder>``, where
     ``Images.fpath`` is read from the configuration file. When createImages
     is True, a row for the folder is also inserted into folders.names with
     schema 'us_fl_crawlsor', and ``<folder>/Images/`` is created.

     *Required Parameters*
     :param folder: the folder name to use
     :param table: the table to use in setting up the status
                   (NOTE(review): not referenced by this method; confirm)
     :param confPath: the configuration path

     *Optional Parameters*
     :param createImages: whether or not to create an images folder

     :raises OSError: from os.mkdir, e.g. when the folder already exists
     '''
     cfp=Config(confPath)
     crawlDir=cfp.getVar("Images","fpath","string")+"FL/SOR/"+folder
     os.mkdir(crawlDir)

     p=Psyco(confPath)

     if createImages is True:
         #update crawl specific tables
         p.execute("INSERT INTO folders.names (folder,schema) VALUES ('"+folder+"','us_fl_crawlsor')")
         # The crawl folder already exists at this point, so only the Images
         # subfolder is created here. Creating the crawl folder a second time
         # raised OSError, and the old Images call passed the config keys to
         # os.mkdir instead of a path.
         os.mkdir(crawlDir+"/Images/")
예제 #4
0
 def run(self):
     """
     Executes the crawler as a separate process and monitors for completion. The worker itself is a Thread so run is the
     necessary name of the method.

     Does nothing beyond building the database and config helpers unless both
     ``self.__execute`` and ``self.__logbase`` are set. Otherwise it:

     1. increments the shared semaphore,
     2. moves the job name from ca_crim_parsers_not_run to
        ca_crim_parsers_running,
     3. runs the Java jar with heap settings from the "Pentaho" config
        section, sending stdout and stderr to a timestamped log file under
        ``<logbase>logs/``,
     4. removes the job from the running table and, on exit status 0, records
        it in ca_crim_parsers_complete; on any other status it notifies via
        FogBugz or e-mail depending on the "notification"/"err_type" setting,
     5. always decrements the semaphore.

     Exceptions are printed with their traceback locations and are not
     re-raised.
     """
     # Guarded so a missing job name cannot raise on string concatenation
     # before the None check below is reached.
     if self.__execute is not None:
         print("Executing "+self.__execute)
     p=Psyco(self.__fpath)
     cfp=Config(self.__fpath)

     if self.__execute is not None and self.__logbase is not None:
         try:
             logfp=self.__logbase+"logs/"+self.__execute.replace(".xml","")+str(int(round(time.time())))+".txt"
             self.__sema.increment(self.__lock)
             try:
                 # NOTE(review): LIKE treats "_" and "%" in the job name as
                 # wildcards; "=" may be what is intended. Confirm.
                 p.execute("INSERT INTO crawlers.ca_crim_parsers_running(name) VALUES('"+self.__execute+"')")
                 p.execute("DELETE FROM crawlers.ca_crim_parsers_not_run WHERE name LIKE '"+self.__execute+"'")

                 #the products config file will be in the base directory
                 cmd="/usr/bin/java -Xms"+cfp.getVar("Pentaho","xms","string").strip()+" -Xmx"+cfp.getVar("Pentaho","xmx","string").strip()+" -XX:+UseConcMarkSweepGC -Xcompactexplicitgc -Dbeansfile="+self.__logbase+self.__execute+" -jar "+self.__jarfile
                 print(cmd)

                 # The context manager closes the log file once the child has
                 # exited; the handle was previously never closed.
                 with open(logfp,'w+') as stdfile:
                     pipe=subprocess.Popen(shlex.split(cmd), stdout=stdfile,stderr=subprocess.STDOUT,shell=False)
                     ret=pipe.wait()

                 print("Completed "+self.__execute)
                 p.execute("DELETE FROM crawlers.ca_crim_parsers_running WHERE name LIKE '"+self.__execute+"'")
                 # Compare by value: "is 0" only works through CPython's
                 # small-integer caching.
                 if ret == 0:
                     p.execute("INSERT INTO crawlers.ca_crim_parsers_complete (name,crawlstamp) VALUES('"+self.__execute+"','"+str(self.__datestamp)+"')")
                 else:
                     print("PARSERS- Premature Detonation!!! Failure "+self.__execute)
                     print(str(ret))
                     if cfp.getVar("notification", "err_type","string") == 'fog':
                         self.fogbugzOnFail(logfp)
                     else:
                         self.emailOnFail(logfp)

             finally:
                 print("Decrementing")
                 self.__sema.decrement(self.__lock)
         except Exception as e:
             print("Process Failed")
             print(str(e))
             for frame in traceback.extract_tb(sys.exc_info()[2]):
                 fname,lineno,fn,text = frame
                 print("Error in %s on line %d" % (fname, lineno))
         print("Worker Complete "+str(time.time()))
예제 #5
0
 def changeOnComplete(self,alterSchema,confPath,folder=None):
     '''
     Handles the final cleanup on the last part of the crawl.

     Renames the us_fl_crawlsor schema to ``us_fl_crawlsor<YYYYmmddHHMMSS>``
     using the current local time. If a folder is given, its folders.names row
     is marked complete and pointed at the renamed schema. Finally a
     (filename, schema) row is written to the table named by the
     "finished"/"alterschema" and "finished"/"altertable" settings.

     *Required Parameters*
     :param alterSchema: the schema to rename and use
                         (NOTE(review): not referenced here; the schema name
                         us_fl_crawlsor is hard-coded. Confirm.)
     :param confPath: the configuration path to use

     *Optional Parameters*
     :param folder: folder name to use

     NOTE: The folder name should be the same as provided from getFolder and/or to setup if used.
     '''
     cfp=Config(confPath)
     p=Psyco(confPath)
     rename="us_fl_crawlsor"+datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d%H%M%S')
     print("Renaming Schema to "+rename)
     p.execute("ALTER SCHEMA us_fl_crawlsor RENAME TO "+rename)

     if folder is not None:
         #update the folder name information
         # Both values are text, so they must be quoted string literals;
         # unquoted they are not string literals and the statement fails.
         p.execute("UPDATE folders.names SET status=true,schema='"+rename+"' WHERE folder='"+folder+"'")

     # Record which script finished: the part of this file's path from the
     # first component containing ".py". Fall back to the whole path when it
     # contains no "/", which would otherwise raise on the failed match.
     srcPath=inspect.getfile(inspect.currentframe())
     found=re.search(r'.*?\/([^\/]+\.py.*)',srcPath)
     filename=found.group(1) if found is not None else srcPath

     p.execute("INSERT INTO "+cfp.getVar("finished","alterschema","string")+"."+cfp.getVar("finished","altertable","string")+"(filename,schema) VALUES('"+filename+"','"+rename+"')")