def postToErrorTables(self, sourcename, e, frames):
    """
    Takes in an error and posts it to the configured error table.

    The expectation is that the data collected here is useful in anomaly
    detection. One row is written per call holding the source name, the
    exception type, an error code and a description.

    The code is taken from ``e.code`` for HTTP errors and from ``e.errno``
    for errors that carry one. When the exception supplies no code the
    instance default (``self.__errCo``) is stored instead; the original
    documentation describes that default as -1 and as changeable.

    Nothing is inserted, and a message is printed through ``printError``,
    when ``sourcename`` is None or when the config path / config parser
    have not been set on the instance.

    :param sourcename: The name of the source.
    :param e: The error, Exception instance.
    :param frames: The traceback frames from the error. Accepted for
                   interface compatibility; not written to the table.
    """
    def quote(text):
        # str(type(e)) always contains single quotes (e.g. "<class 'X'>")
        # and messages frequently do, so embedded quotes are doubled to
        # keep the statement well formed.
        # NOTE(review): switch to bound parameters if Psyco.execute
        # supports them - confirm against the Psyco wrapper.
        return "'" + text.replace("'", "''") + "'"

    etype = str(type(e))

    # Error code: fall back to the instance default when the exception
    # has none (URLError, for instance, usually has errno set to None).
    code = self.__errCo
    if "HTTPError" in etype:
        code = e.code
    else:
        errno = getattr(e, "errno", None)
        if errno is not None:
            code = errno

    # URLError exposes ``reason`` as an attribute (a string or a nested
    # exception), not as a method, so it must not be called.
    if "URLError" in etype:
        description = str(getattr(e, "reason", e))
    else:
        description = str(e)

    if sourcename is None:
        self.printError("Please Specify a Source Name")
        return

    if self._fpath is None or self._cfp is None:
        self.printError("Please Specify a Config Path")
        return

    p = Psyco(self._fpath)
    values = ",".join([
        quote(sourcename),
        quote(etype),
        quote(str(code)),
        quote(description),
    ])
    p.execute(
        "INSERT INTO "
        + self._cfp.getVar("Error", "table", "string")
        + " (source,type,code,description) VALUES(" + values + ")"
    )
def run(self):
    """
    Crawl the pages linked from the listed-organisations start page.

    The start page is downloaded with the crawler's opener (120 second
    timeout) and handed to ``crawler.loop`` together with a ``Parser``
    built from the category term files. When the loop returns, a
    'finished' marker row is inserted. Any exception raised after the
    start page has been read is forwarded to ``crawlerFail`` on the
    error handler together with the extracted traceback frames.
    """
    database = Psyco(self.__fpath)
    crawler = GetPage(self.__proxy)
    opener = crawler.getOpener()
    response = opener.open(
        "http://www.nationalsecurity.gov.au/Listedterroristorganisations/Pages/default.aspx",
        None,
        120,
    )
    html = response.read()

    try:
        # Term files, one per category, consumed by the Parser.
        categories = {
            "targets": "/home/aevans/Documents/cats/terror/targets.txt",
            "activities": "/home/aevans/Documents/cats/terror/activities.txt",
            "attacks": "/home/aevans/Documents/cats/terror/attacks.txt",
            "finance": "/home/aevans/Documents/cats/terror/finance.txt",
            "charges": "/home/aevans/Documents/cats/terror/charges.txt",
            "convictions": "/home/aevans/Documents/cats/terror/convictions.txt",
            "risk": "/home/aevans/Documents/cats/terror/risk.txt",
            "leadership": "/home/aevans/Documents/cats/terror/leadership.txt",
            "background": "/home/aevans/Documents/cats/terror/background.txt",
            "disclaimer": "/home/aevans/Documents/cats/terror/disclaimer.txt",
            "family": "/home/aevans/Documents/cats/terror/family.txt",
            "noninfo": "/home/aevans/Documents/cats/terror/noninfo.txt",
            "recruitment": "/home/aevans/Documents/cats/terror/nrecruitment.txt",
        }
        parser_specs = [{"class": Parser(categories), "passSpynner": False}]
        config_map = {"fpath": self.__fpath}
        spynner_map = {}
        # Only links pointing back into the listing section are followed.
        link_filter = {
            "href": re.compile(r"www.nationalsecurity.gov.au\/Listedterroristorganisations\/")
        }

        # The result is unpacked (and otherwise unused) so that an
        # unexpected return shape still reaches the handler below.
        _pages, _failed = crawler.loop(
            html,
            linktag="a",
            linkattr="href",
            linklimiters=link_filter,
            pageId=self.__pid,
            maxproxies=self.__maxproxies,
            spynnerMap=spynner_map,
            opener=opener,
            waitload=120,
            proxy=self.__proxy,
            hashName="hash",
            table="au_parse_test.html",
            test=False,
            printTest=False,
            faillist=[],
            database=database,
            cfpMap=config_map,
            parserClassMap=parser_specs,
            commitSize=100,
            checkLinks=True,
        )
        database.execute("INSERT INTO au_parse_test.terms VALUES('finished')")
    except Exception as e:
        frames = traceback.extract_tb(sys.exc_info()[2])
        self.__err.crawlerFail(e, frames, True)
def setup(self,folder,table,confPath,createImages=False):
    '''
    Setup status table and folder table.

    Creates the crawl folder ``<Images fpath>FL/SOR/<folder>``. When
    createImages is True the folder is additionally registered in
    folders.names under the us_fl_crawlsor schema and an ``Images``
    sub-folder is created inside it.

    *Required Parameters*

    :param folder: the folder name to use
    :param table: the table to use in setting up the status
                  (accepted for interface compatibility; not read here)
    :param confPath: the configuration path

    *Optional Parameters*

    :param createImages: whether or not to create an images folder

    Raises OSError (from os.mkdir) when the crawl folder already exists
    or its parent directory is missing.
    '''
    cfp = Config(confPath)
    crawl_dir = cfp.getVar("Images", "fpath", "string") + "FL/SOR/" + folder
    os.mkdir(crawl_dir)

    p = Psyco(confPath)
    if createImages is True:
        # update crawl specific tables
        # NOTE(review): registration only happens together with image
        # folder creation, as in the original - confirm this is intended.
        p.execute("INSERT INTO folders.names (folder,schema) VALUES ('"+folder+"','us_fl_crawlsor')")
        # The crawl folder already exists at this point, so only the
        # images sub-folder is created; it is built from the configured
        # image root rather than from the config lookup keys themselves.
        os.mkdir(crawl_dir + "/Images/")
def run(self):
    """
    Executes the crawler as a separate process and monitors for completion.
    The worker itself is a Thread so run is the necessary name of the method.

    The job named by ``self.__execute`` is moved from the not-run table to
    the running table, a java process is started with its output
    redirected to a timestamped log file under ``<logbase>logs/``, and
    once the process exits the job is removed from the running table. An
    exit status of 0 records the job as complete; any other status
    triggers a FogBugz or e-mail notification, chosen by the
    ``notification/err_type`` config value.

    The semaphore is incremented before the work starts and is always
    decremented afterwards. Exceptions are printed, not re-raised.
    Nothing is executed when the job name or the log base is None.
    """
    # str() keeps this line from raising before the None guard below.
    print("Executing " + str(self.__execute))
    p = Psyco(self.__fpath)
    cfp = Config(self.__fpath)

    if self.__execute is not None and self.__logbase is not None:
        try:
            logfp = (
                self.__logbase + "logs/"
                + self.__execute.replace(".xml", "")
                + str(int(round(time.time()))) + ".txt"
            )
            self.__sema.increment(self.__lock)
            try:
                p.execute("INSERT INTO crawlers.ca_crim_parsers_running(name) VALUES('"+self.__execute+"')")
                p.execute("DELETE FROM crawlers.ca_crim_parsers_not_run WHERE name LIKE '"+self.__execute+"'")

                #the products config file will be in the base directory
                cmd = (
                    "/usr/bin/java -Xms" + cfp.getVar("Pentaho", "xms", "string").strip()
                    + " -Xmx" + cfp.getVar("Pentaho", "xmx", "string").strip()
                    + " -XX:+UseConcMarkSweepGC -Xcompactexplicitgc -Dbeansfile="
                    + self.__logbase + self.__execute
                    + " -jar " + self.__jarfile
                )
                print(cmd)

                # The log handle is closed as soon as the child exits so it
                # is not leaked and is released before any failure
                # notification is sent for the log file.
                with open(logfp, 'w+') as stdfile:
                    pipe = subprocess.Popen(
                        shlex.split(cmd),
                        stdout=stdfile,
                        stderr=subprocess.STDOUT,
                        shell=False,
                    )
                    ret = pipe.wait()

                print("Completed " + self.__execute)
                p.execute("DELETE FROM crawlers.ca_crim_parsers_running WHERE name LIKE '"+self.__execute+"'")

                # Compare by value; identity comparison on ints only works
                # by accident of the interpreter's small-int cache.
                if ret == 0:
                    p.execute("INSERT INTO crawlers.ca_crim_parsers_complete (name,crawlstamp) VALUES('"+self.__execute+"','"+str(self.__datestamp)+"')")
                else:
                    print("PARSERS- Premature Detonation!!! Failure " + self.__execute)
                    print(str(ret))
                    if cfp.getVar("notification", "err_type", "string") == 'fog':
                        self.fogbugzOnFail(logfp)
                    else:
                        self.emailOnFail(logfp)
            finally:
                print("Decrementing")
                self.__sema.decrement(self.__lock)
        except Exception as e:
            print("Process Failed")
            print(str(e))
            for fname, lineno, _fn, _text in traceback.extract_tb(sys.exc_info()[2]):
                print("Error in %s on line %d" % (fname, lineno))

    print("Worker Complete " + str(time.time()))
def changeOnComplete(self,alterSchema,confPath,folder=None):
    '''
    Handles the final cleanup on the last part of the crawl.

    Renames the us_fl_crawlsor schema to us_fl_crawlsor<YYYYmmddHHMMSS>
    (local time), optionally marks the folder's row in folders.names as
    complete with the new schema name, and records this script's file
    name and the new schema in the configured "finished" table.

    *Required Parameters*

    :param alterSchema: the schema to rename and use
    :param confPath: the configuration path to use

    *Optional Parameters*

    :param folder: folder name to use

    NOTE: The folder name should be the same as provided from getFolder and/or to setup if used.
    '''
    # NOTE(review): alterSchema is accepted but never read; the schema
    # name is hard-coded below. Confirm with callers before wiring it in.
    cfp = Config(confPath)
    p = Psyco(confPath)

    rename = "us_fl_crawlsor" + datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d%H%M%S')
    print("Renaming Schema to "+rename)
    p.execute("ALTER SCHEMA us_fl_crawlsor RENAME TO "+rename)

    if folder is not None:
        #update the folder name information
        # Both values are strings and must be quoted; unquoted they are
        # parsed as column identifiers and the statement fails.
        p.execute("UPDATE folders.names SET status=true,schema='"+rename+"' WHERE folder='"+folder+"'")

    # File name of this script without its directory. Fall back to the
    # whole path when it has no '/' in it, where the pattern cannot match.
    source_path = inspect.getfile(inspect.currentframe())
    match = re.search(r'.*?\/([^\/]+\.py.*)', source_path)
    filename = match.group(1) if match is not None else source_path

    finished_table = cfp.getVar("finished","alterschema","string")+"."+cfp.getVar("finished","altertable","string")
    p.execute("INSERT INTO "+finished_table+"(filename,schema) VALUES('"+filename+"','"+rename+"')")