# Example 1
    if len(cr['serverErrors']) > 0 or len(cr['browserErrors']) > 0:
        cr['errorsPresent'] = True


def processCrawlJob(crawlJob):
    """Run a single crawl job against its agent and re-queue the URL.

    The URL is taken off the crawl queue while the agent works on it and
    put back afterwards, which is what keeps the crawl loop continuous.

    Parameters:
        crawlJob: a CrawlJob carrying the agent name/url and the target URL.

    Returns:
        The same CrawlJob, with ``success`` set to True only if the agent
        call and response processing both completed without raising.
    """
    DB.removeFromCrawlQueue(crawlJob.url)
    try:
        resp = callAgent(crawlJob)
        processAgentResponse(resp)
        # Only mark success once the whole pipeline actually finished;
        # previously this was set unconditionally.
        crawlJob.success = True
    finally:
        # Re-queue even on failure: without this, an exception in
        # callAgent/processAgentResponse would permanently drop the URL
        # from the crawl (it was already removed above).
        DB.addToCrawlQueue(crawlJob.url)
    return crawlJob


# Loop flag; it is never set to False in this chunk, so the crawl runs
# until the process is stopped externally (presumably by a signal/kill).
running = True
if __name__ == '__main__':
    # Green-thread pool sized to allow 4 concurrent jobs per agent.
    pool = eventlet.GreenPool(size=4*len(agents))

    DB.ensure_indexes()
    # Seed the crawl queue with the configured start URL exactly once.
    if not DB.inCrawlQueue(config['startUrl']):
        DB.addToCrawlQueue(config['startUrl'])

    while running:
        for crawlDoc in DB.getCrawlQueue():
            if urlAllowed(crawlDoc['url']):
                # Fan the URL out to every configured agent; each job is
                # handled asynchronously by processCrawlJob, which pulls
                # the URL off the queue and re-adds it when done, keeping
                # the crawl continuous.
                for agent in agents:
                    job = CrawlJob(agent['name'], agent['url'], crawlDoc['url'])
                    pool.spawn(processCrawlJob, job)
            else:
                # Disallowed URLs are dropped from the queue for good.
                print "Removing URL: ", crawlDoc['url']
                DB.removeFromCrawlQueue(crawlDoc['url'])