Example #1
import re
import sys
import traceback
from multiprocessing import Process

# Psyco, GetPage, Parser and Errors are project-specific classes that are not shown here.


class AUPageGrabber(Process):

    def __init__(self,args):
        Process.__init__(self)
        self.__fpath=args["fpath"]
        self.__proxy=args["proxy"]
        self.__maxproxies=args["maxproxies"]
        self.__term=args["term"]
        self.__pid=args["pid"]
        
        self.__err=Errors()
        
    def run(self):
        p=Psyco(self.__fpath)
        crawler=GetPage(self.__proxy)
        opener=crawler.getOpener()
        
        html=opener.open("http://www.nationalsecurity.gov.au/Listedterroristorganisations/Pages/default.aspx",None,120).read()
        try:
            spynMap={
            }
            
            confMap={
                "fpath":self.__fpath
            }
            
            cats={
                "targets":"/home/aevans/Documents/cats/terror/targets.txt",
                "activities":"/home/aevans/Documents/cats/terror/activities.txt",
                "attacks":"/home/aevans/Documents/cats/terror/attacks.txt",
                "finance":"/home/aevans/Documents/cats/terror/finance.txt",
                "charges":"/home/aevans/Documents/cats/terror/charges.txt",
                "convictions":"/home/aevans/Documents/cats/terror/convictions.txt",
                "risk":"/home/aevans/Documents/cats/terror/risk.txt",
                "leadership":"/home/aevans/Documents/cats/terror/leadership.txt",
                "background":"/home/aevans/Documents/cats/terror/background.txt",
                "disclaimer":"/home/aevans/Documents/cats/terror/disclaimer.txt",
                "family":"/home/aevans/Documents/cats/terror/family.txt",
                "noninfo":"/home/aevans/Documents/cats/terror/noninfo.txt",
                "recruitment":"/home/aevans/Documents/cats/terror/nrecruitment.txt"
            }
            
            parseMap=[{
               "class":Parser(cats),
               "passSpynner":False
            }]
            
            # follow the links matching linklimiters, parse each page with the
            # configured parser classes and store the results via the database handle
            pages, faillist = crawler.loop(
                html,
                linktag="a",
                linkattr="href",
                linklimiters={"href": re.compile(r"www\.nationalsecurity\.gov\.au/Listedterroristorganisations/")},
                pageId=self.__pid,
                maxproxies=self.__maxproxies,
                spynnerMap=spynMap,
                opener=opener,
                waitload=120,
                proxy=self.__proxy,
                hashName="hash",
                table="au_parse_test.html",
                test=False,
                printTest=False,
                faillist=[],
                database=p,
                cfpMap=confMap,
                parserClassMap=parseMap,
                commitSize=100,
                checkLinks=True)
            p.execute("INSERT INTO au_parse_test.terms VALUES('finished')")
        except Exception as e:
            self.__err.crawlerFail(e, traceback.extract_tb(sys.exc_info()[2]), True)
        finally:
            pass  # cleanup body not included in this example
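
A minimal usage sketch, assuming AUPageGrabber is importable from the surrounding project; the argument values below (paths, proxy, ids) are placeholders, not values taken from the original code.

if __name__ == "__main__":
    args = {
        "fpath": "/path/to/db.conf",  # placeholder config path handed to Psyco
        "proxy": None,                # placeholder proxy handed to GetPage
        "maxproxies": 1,
        "term": "",                   # stored by __init__ but unused in run()
        "pid": 0,
    }
    grabber = AUPageGrabber(args)
    grabber.start()   # run() executes in a child process
    grabber.join()
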
Example #2
import sys
import traceback
from multiprocessing import Process

# Psyco, GetPage, Parser and Errors are project-specific classes that are not shown here.


class OIGPageGrabber(Process):

    def __init__(self,args):
        Process.__init__(self)
        self.__fpath=args["fpath"]
        self.__proxy=args["proxy"]
        self.__maxproxies=args["maxproxies"]
        self.__term=args["term"]
        self.__pid=args["pid"]
        
        self.__err=Errors()
        
    def run(self):
        p=Psyco(self.__fpath)
        crawler=GetPage(self.__proxy)
        opener=crawler.getOpener()
        
        html=opener.open("https://oig.hhs.gov/fraud/fugitives/profiles.asp#allonce",None,120).read()
        try:
            spynMap={
            }
            
            confMap={
                "fpath":self.__fpath
            }
            
            cats={
                "charges":"/home/aevans/Documents/cats/finance/charges.txt",
                "attacks":"/home/aevans/Documents/cats/finance/attacks.txt",
                "convicitons":"/home/aevans/Documents/cats/finance/convictions.txt",
                "family":"/home/aevans/Documents/cats/finance/family.txt",
                "finance":"/home/aevans/Documents/cats/terror/finance.txt"
            }
            
            parser=Parser(cats)
            parser.parsePage(html,0)
        except Exception as e:
            self.__err.crawlerFail(e, traceback.extract_tb(sys.exc_info()[2]), True)
        finally:
            pass  # cleanup body not included in this example
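
The same pattern applies here; a sketch assuming OIGPageGrabber is importable, again with placeholder argument values:

if __name__ == "__main__":
    args = {"fpath": "/path/to/db.conf", "proxy": None,
            "maxproxies": 1, "term": "", "pid": 0}  # placeholder values
    grabber = OIGPageGrabber(args)
    grabber.start()
    grabber.join()
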