class AUPageGrabber(Process):
    """Crawl the Australian Government listed-terrorist-organisations pages.

    Runs as a separate process: fetches the listing page, follows matching
    organisation links via GetPage.loop, parses them with Parser using the
    terror category keyword files, and records completion in the database.
    """

    def __init__(self, args):
        """Store crawl configuration.

        Expected keys in ``args``: ``fpath`` (database config path for Psyco),
        ``proxy``, ``maxproxies``, ``term``, ``pid``.
        """
        Process.__init__(self)
        self.__fpath = args["fpath"]
        self.__proxy = args["proxy"]
        self.__maxproxies = args["maxproxies"]
        self.__term = args["term"]
        self.__pid = args["pid"]
        self.__err = Errors()

    def run(self):
        """Fetch and crawl the listing page; route any failure to Errors.

        On success inserts a 'finished' marker row; on any exception calls
        ``Errors.crawlerFail`` with the extracted traceback.
        """
        p = Psyco(self.__fpath)
        crawler = GetPage(self.__proxy)
        opener = crawler.getOpener()
        try:
            # Fetch inside the try so network/timeout failures also reach
            # crawlerFail (originally the fetch preceded the try and any
            # urllib error would have escaped the handler entirely).
            html = opener.open(
                "http://www.nationalsecurity.gov.au/Listedterroristorganisations/Pages/default.aspx",
                None, 120).read()
            spynMap = {}
            confMap = {"fpath": self.__fpath}
            # Category-name -> keyword-file mapping consumed by Parser.
            # NOTE(review): "recruitment" points at "nrecruitment.txt" —
            # possible filename typo; confirm the file exists as spelled.
            cats = {
                "targets": "/home/aevans/Documents/cats/terror/targets.txt",
                "activities": "/home/aevans/Documents/cats/terror/activities.txt",
                "attacks": "/home/aevans/Documents/cats/terror/attacks.txt",
                "finance": "/home/aevans/Documents/cats/terror/finance.txt",
                "charges": "/home/aevans/Documents/cats/terror/charges.txt",
                "convictions": "/home/aevans/Documents/cats/terror/convictions.txt",
                "risk": "/home/aevans/Documents/cats/terror/risk.txt",
                "leadership": "/home/aevans/Documents/cats/terror/leadership.txt",
                "background": "/home/aevans/Documents/cats/terror/background.txt",
                "disclaimer": "/home/aevans/Documents/cats/terror/disclaimer.txt",
                "family": "/home/aevans/Documents/cats/terror/family.txt",
                "noninfo": "/home/aevans/Documents/cats/terror/noninfo.txt",
                "recruitment": "/home/aevans/Documents/cats/terror/nrecruitment.txt"
            }
            parseMap = [{
                "class": Parser(cats),
                "passSpynner": False
            }]
            pages, faillist = crawler.loop(
                html,
                linktag="a",
                linkattr="href",
                # Raw string: pattern bytes identical to the original
                # (``\/`` matches a literal '/').
                linklimiters={"href": re.compile(r"www.nationalsecurity.gov.au\/Listedterroristorganisations\/")},
                pageId=self.__pid,
                maxproxies=self.__maxproxies,
                spynnerMap=spynMap,
                opener=opener,
                waitload=120,
                proxy=self.__proxy,
                hashName="hash",
                table="au_parse_test.html",
                test=False,
                printTest=False,
                faillist=[],
                database=p,
                cfpMap=confMap,
                parserClassMap=parseMap,
                commitSize=100,
                checkLinks=True)
            p.execute("INSERT INTO au_parse_test.terms VALUES('finished')")
        except Exception as e:
            # ``as`` form is valid on Python 2.6+ and Python 3.
            self.__err.crawlerFail(e, traceback.extract_tb(sys.exc_info()[2]), True)
        finally:
            # Original source had an empty ``finally:`` suite (syntax error);
            # no cleanup resources are visible here, so explicitly no-op.
            pass
class OIGPageGrabber(Process):
    """Crawl the HHS OIG fugitive-profiles page and parse it.

    Runs as a separate process: fetches the profiles page once and runs it
    through Parser with the finance category keyword files. Unlike
    AUPageGrabber, it parses only the single fetched page (no link loop).
    """

    def __init__(self, args):
        """Store crawl configuration.

        Expected keys in ``args``: ``fpath`` (database config path for Psyco),
        ``proxy``, ``maxproxies``, ``term``, ``pid``.
        """
        Process.__init__(self)
        self.__fpath = args["fpath"]
        self.__proxy = args["proxy"]
        self.__maxproxies = args["maxproxies"]
        self.__term = args["term"]
        self.__pid = args["pid"]
        self.__err = Errors()

    def run(self):
        """Fetch the profiles page and parse it; route failures to Errors.

        On any exception calls ``Errors.crawlerFail`` with the extracted
        traceback.
        """
        p = Psyco(self.__fpath)
        crawler = GetPage(self.__proxy)
        opener = crawler.getOpener()
        try:
            # Fetch inside the try so network/timeout failures also reach
            # crawlerFail (originally the fetch preceded the try and any
            # urllib error would have escaped the handler entirely).
            html = opener.open(
                "https://oig.hhs.gov/fraud/fugitives/profiles.asp#allonce",
                None, 120).read()
            spynMap = {}
            confMap = {"fpath": self.__fpath}
            # Category-name -> keyword-file mapping consumed by Parser.
            # NOTE(review): key "convicitons" looks like a typo for
            # "convictions" (the file path is convictions.txt) — left
            # unchanged because downstream code may match this exact key;
            # confirm against Parser's category handling.
            cats = {
                "charges": "/home/aevans/Documents/cats/finance/charges.txt",
                "attacks": "/home/aevans/Documents/cats/finance/attacks.txt",
                "convicitons": "/home/aevans/Documents/cats/finance/convictions.txt",
                "family": "/home/aevans/Documents/cats/finance/family.txt",
                "finance": "/home/aevans/Documents/cats/terror/finance.txt"
            }
            parser = Parser(cats)
            parser.parsePage(html, 0)
        except Exception as e:
            # ``as`` form is valid on Python 2.6+ and Python 3.
            self.__err.crawlerFail(e, traceback.extract_tb(sys.exc_info()[2]), True)
        finally:
            # Original source had an empty ``finally:`` suite (syntax error);
            # no cleanup resources are visible here, so explicitly no-op.
            pass