def run(self):
    """Run the main crawl loop for up to ``self.ROUND_NUMBER`` rounds.

    Every round:
      1. crawls new start nodes when ``self.dataBase.needMoreStartNode()``
         says more are needed,
      2. runs the multi-threaded crawl with ``self.THREAD_NUMBER`` threads,
      3. checks for a stop signal and leaves the loop if one is raised,
      4. disconnects and reconnects the router's PPPoE link.

    NOTE(review): this block arrived with its indentation collapsed; the
    nesting below (steps 2-4 executed once per round) is inferred from the
    statement order -- confirm against the original file.
    """
    self.dataBase = createProdDataBase()
    self.renrenAccountPool = createProdRenrenAccountPool()

    for roundIndex in range(self.ROUND_NUMBER):
        log.info('>>>>>>>> Main Crawl Thread Round(%s) <<<<<<<<' %
                 (roundIndex + 1))

        if self.dataBase.needMoreStartNode():
            startNodeCrawler = StartNodeCrawler(
                self.dataBase, self.renrenAccountPool)
            startNodeCrawler.startCrawling()

        self.startMultiThreadCrawling(self.THREAD_NUMBER)
        # Disabled alternatives:
        #self.startMultiThreadCrawlingWithProxy(1)
        #manager.startSignleThreadCrawling()

        # detectStopSignal() reports a stop request by raising; any
        # exception ends the loop, but the reason is logged instead of
        # being dropped silently.
        try:
            Crawler.detectStopSignal()
        except Exception as e:
            log.info("Main crawl thread end, reason: " + str(e))
            break

        log.info('>>>>>> Router disconnect PPPoE <<<<<<')
        router.disconnectPPPoE()
        time.sleep(2)
        log.info('>>>>>> Router connect PPPoE <<<<<<')
        router.connectPPPoE()
        # Wait for the connection being established.
        time.sleep(10)
def test():
    """Manually exercise one crawl run against the production account pool.

    Configures logging, builds the tables (create, drop, create again),
    takes the first account handed out by the pool, logs in with it and
    crawls from a fixed start id. A CrawlerException ends the run; its
    reason is logged, and a stop-signal error code is additionally echoed
    to stdout.

    The crawler instance is published through the module-level name
    ``crawler``.
    """
    global crawler

    log.config(GC.LOG_FILE_DIR + 'crawler_test', 'info', 'info')

    # Create / drop / create, in that order, exactly as the tables helpers
    # are meant to be exercised here (presumably to start from empty tables).
    connection = createConnection()
    createTables(connection)
    dropTables(connection)
    createTables(connection)

    accountPool = renrenaccountpool.createProdRenrenAccountPool()
    firstAccount = accountPool.getAccounts(1)[0]

    try:
        crawler = Crawler(connection)
        renrenAgent = RenrenAgent(firstAccount)
        renrenAgent.login()
        crawler.setAgent(renrenAgent)
        crawler.crawl("322601086", 30)
    except CrawlerException as error:
        log.info("Crawler end, reason: " + str(error))
        if error.errorCode == CrawlerErrorCode.DETECT_STOP_SIGNAL:
            print("detect int signal")
# importFromFile(fname): configure the 'import_accounts' log, create the
# production account pool, then read every line of the file `fname`.
# Lines are split on whitespace; lines with fewer than two fields are
# skipped, otherwise field 0 is taken as the user name and field 1 as the
# password. At the end it logs the verify success/fail counts and the
# import success/fail counts.
# NOTE(review): the text of this function is masked ("******") in two places
# and the code between them is not visible here, so where
# verifySuccessCount, verifyFailCount, importSuccessCount and
# importFailCount are computed -- and how `accounts` and `pool` are used --
# cannot be told from this view. Restore the original text before editing.
def importFromFile(fname): log.config(GC.LOG_FILE_DIR + 'import_accounts', 'info', 'info') fileName = fname accounts = [] pool = createProdRenrenAccountPool() with open(fileName) as importedFile: lines = importedFile.readlines() for line in lines: strs = line.split() if len(strs) < 2: continue # May be not a valid account username = strs[0] # User name first. password = strs[1] # And then password. log.info("Find username: "******" " +\ "password: "******"Finish importing..........\n" +\ "Success on verify accounts number: " +\ str(verifySuccessCount) + "\n" +\ "Fail on verify accounts number: " +\ str(verifyFailCount)) log.info('Success imported number: %s' % importSuccessCount) log.info('Fail imported number: %s' % importFailCount)
def __init__(self, dataBase=None, accountPool=None):
    """Wire up storage and the account pool, then reset crawl bookkeeping.

    Args:
        dataBase: Database object to use. When None, one is obtained from
            createProdDataBase().
        accountPool: Account pool to use. When None, one is obtained from
            createProdRenrenAccountPool().

    After the dependencies are set, releaseAllStartNode() is called on the
    database and the per-instance lists, counter and set start out empty.
    """
    # Compare against None instead of relying on truthiness: an injected
    # object that evaluates as false (e.g. it defines __len__ and is
    # currently empty) must not be silently swapped for the production one.
    if dataBase is None:
        dataBase = createProdDataBase()
    if accountPool is None:
        accountPool = createProdRenrenAccountPool()

    self.dataBase = dataBase
    self.renrenAccountPool = accountPool
    self.dataBase.releaseAllStartNode()

    self.userList = []
    self.shareList = []
    self.requestCount = 0
    self.crawledShareSet = set()
def main():
    """Script entry point: set up the 'save_accounts' log, then hand the
    production account pool to saveInUsingAccounts() and
    saveInvalidAccount(), in that order.
    """
    logPath = GC.LOG_FILE_DIR + 'save_accounts'
    log.config(logPath, 'info', 'info')

    accountPool = createProdRenrenAccountPool()
    saveInUsingAccounts(accountPool)
    saveInvalidAccount(accountPool)