def __init__(self, dataBase, pool, proxy=None):
    """Set up a crawl thread with its database, account pool and an
    optional proxy, and log its creation."""
    threading.Thread.__init__(self)
    self.dataBase = dataBase
    self.renrenAccountPool = pool
    self.proxy = proxy
    self.threadId = self.getThreadId()
    log.info('>>>>>> Create thread %s. <<<<<<' % self.threadId)
def main():
    """Read start-node ids from the data file and insert each into the
    production start list, then log an import summary."""
    log.config(GC.LOG_FILE_DIR + "import_start_nodes", "info", "info")
    dataBase = database.createProdDataBase()
    importCount = 0
    failCount = 0
    with open("tools/data/start_nodes") as importedFile:
        for line in importedFile.readlines():
            strs = line.split()
            if not strs:
                continue  # May be not a valid account
            id = strs[0]  # Start node id.
            log.info("Importing start node: " + id)
            if dataBase.insertIntoStartList(id):
                importCount += 1
            else:
                failCount += 1
    log.info(
        "Finish importing..........\n" +
        "Total imported start nodes number: " + str(importCount) + "\n" +
        "Fail start nodes number: " + str(failCount))
def investigate(self):
    """Log every known proxy, ordered by address, together with its
    average response time and success/test counters."""
    self.allProxies.sort(key=lambda p: p.addr)
    for candidate in self.allProxies:
        log.info('%s %s/%s %s' % (
            candidate.averageTime,
            candidate.successCount,
            candidate.testCount,
            candidate.getAllString()))
def test(): log.config(GC.LOG_FILE_DIR + 'crawler_test', 'info', 'info') db = createConnection() createTables(db) dropTables(db) createTables(db) pool = renrenaccountpool.createProdRenrenAccountPool() accounts = pool.getAccounts(1) account = accounts[0] global crawler try: crawler = Crawler(db) agent = RenrenAgent(account) agent.login() crawler.setAgent(agent) id = "322601086" crawler.crawl(id, 30) except CrawlerException, e: log.info("Crawler end, reason: " + str(e)) if e.errorCode == CrawlerErrorCode.DETECT_STOP_SIGNAL: print "detect int signal" return
def saveInvalidAccount(pool): command = """ SELECT username, password FROM RenrenAccounts WHERE is_valid = 0 ORDER BY last_used_time DESC; """ pool.cursor.execute(command) rows = pool.cursor.fetchall() log.info('Total InValid account:' + str(len(rows))) failCount = 0 for row in rows: username = row[0] password = row[1] saveSuccess = False if pool.onceSaveFail(username, password): continue if failCount > 100: break; try: time.sleep(2) agent = RenrenAgent(username, password) agent.login() saveSuccess = agent.isLogin except Exception, e: log.warning('Save login fail: ' + str(e)) finally:
def login(self): assert self.account.isLogin == False loginData={ 'email': self.username, 'password': self.password, 'originUrl': '', 'formName': '', 'method': '', 'isplogin': '******', 'submit': '登陆' } postData = urlencode(loginData) req = urllib2.Request(self.loginUrl, postData) if self.proxy: log.info('Try to login with proxy: (%s, %s) %s %s ====' %\ (self.username, self.password, self.proxy.protocol, self.proxy.getProxyString())) else: log.info('Try to login: (%s, %s) ====' %\ (self.username, self.password)) try: response = self.opener.open(req, timeout=self.TIME_OUT) except urllib2.URLError, e: #print 'Login error: ' + e.reason log.warning('Login fail when requesting: ' + str(e.reason)) return ({}, ErrorCode.URL_ERROR)
def start(self):
    """Spawn the main crawl thread, block until a signal arrives, then
    join the thread before returning."""
    worker = MainCrawlThread()
    worker.start()
    # Wait for a signal
    signal.pause()
    worker.join()
    log.info(">>>>>>>>>> Main thread end.... <<<<<<<<<<")
def showHeadProxies(self, proxies, number):
    """Log statistics for at most the first `number` proxies."""
    for position, proxy in enumerate(proxies):
        if position >= number:
            break
        log.info('>>> ' + str(proxy.averageTime) + ' ' +
                 str(proxy.successCount) + '/' + str(proxy.testCount) +
                 ' ' + proxy.getAllString())
def analyse(self):
    """Analyse the data: process every profile, optionally filter the
    result (behind the 'use_result_filter' flag), then calculate it."""
    allProfiles = self.getProfiles()
    log.info('Total Profile number: %s' % len(allProfiles))
    self.processProfiles(allProfiles)
    if flag.getFlag('use_result_filter'):
        self.result.filter()
    self.result.caculate()
def testAanlysedDataBaseImportFromResult():
    """Import a random result object directly into a test analysed
    database and verify the round trip."""
    wdb = createTestAnalysedDataBase()
    randomResult = getRandomResult(100)
    wdb.importResult(randomResult)
    wdb.close()
    assertEqual(randomResult)
    log.info("Pass the 1/2 test!")
def main():
    """Entry point: parse flags, set up logging and SIGINT handling,
    honour the configured start delay, then run the crawl manager."""
    flag.processArguments()
    log.config(GC.LOG_FILE_DIR + 'CrawlManager', 'info', 'info')
    signal.signal(signal.SIGINT, detectSignal)
    minutesToWait = flag.getFlag('waiting_time')
    log.info('Wait for: ' + str(minutesToWait) + ' minutes')
    time.sleep(minutesToWait * 60)
    CrawlManager().start()
def pingServer(self): if time.time() - self.lastPing <= self.PING_INTERVAL: return log.info('Ping Mysql Server.') self.lastPing = time.time() try: self.mdbConnection.ping() except Exception, e: log.warning('Mysql server gone: %s' % e) self._init()
def main():
    """Exercise the database module: rebuild the tables, run both test
    suites, then drop everything and close the connection."""
    log.config(GC.LOG_FILE_DIR + 'database_test', 'debug', 'debug')
    connection = createConnection()
    createTables(connection)
    dropTables(connection)
    createTables(connection)
    test(connection)
    testStartList(connection)
    dropTables(connection)
    connection.close()
    log.info("Pass the test!")
def removeDuplicate(self):
    """Drop proxies sharing the same (address, port) pair, keeping the
    first one seen, and store the survivors in self.proxiesToTest.

    Fix: the key used to be the string concatenation addr + port, which
    is ambiguous ('1.2.3.4' + '56' collides with '1.2.3.45' + '6'); a
    tuple key cannot collide.
    """
    proxyMap = {}
    for proxy in self.allProxies:
        #key = proxy.addr[0 : proxy.addr.rfind('.')]
        key = (proxy.addr, proxy.port)
        if key not in proxyMap:
            proxyMap[key] = proxy
    self.proxiesToTest = list(proxyMap.values())
    log.info(
        'After remove duplicate, proxy number: ' +
        str(len(self.proxiesToTest)))
def main():
    """Exercise the account pool: rebuild its tables, run the renren
    account and proxy pool tests, then drop the tables again."""
    log.config(GC.LOG_FILE_DIR + 'account_pool_test', 'debug', 'debug')
    pool = createTestRenrenAccountPool()
    createTables(pool)
    dropTables(pool)
    createTables(pool)
    testRenrenAccountPool()
    testProxyPool()
    dropTables(pool)
    log.info("Pass the test!")
def testAanlysedDataBaseImportFromFile():
    """Serialize a random result to a temp file, import it into a test
    analysed database from that file, and verify the round trip."""
    tmpFileName = "files/tmpserializedata"
    randomResult = getRandomResult(100)
    randomResult.writeToFile(tmpFileName)
    wdb = createTestAnalysedDataBase()
    wdb.importResultFromFile(tmpFileName)
    wdb.close()
    assertEqual(randomResult)
    log.info("Pass the 2/2 test!")
def printInfo(info):
    """Log a parsed user profile, one labelled field per line."""
    fields = [
        ('name', info.name),
        ('gender', info.gender),
        ('hometown', info.hometown),
        ('residence', info.residence),
        ('birthday', info.birthday),
        ('visitedNum', info.visitedNum),
        ('friendNum', info.friendNum),
        ('vistorListNum', len(info.recentVisitedList)),
        ('vistorList', info.recentVisitedList),
        ('friendListNum', len(info.friendList)),
        ('friendList', info.friendList),
    ]
    lines = ['Parsed user info:']
    for label, value in fields:
        lines.append(label + ': ' + str(value))
    log.info('\n'.join(lines))
def testAll(self):
    """Run a ProxyTester for every pending proxy on a 200-worker thread
    pool and log the total wall-clock time."""
    startedAt = time.time()
    testers = [ProxyTester(proxy) for proxy in self.proxiesToTest]
    workerPool = ThreadPool(200)
    workerPool.start(testers)
    log.info(
        'Test finish. Total test time: ' +
        str(time.time() - startedAt) + 's')
def saveInUsingAccounts(pool):
    """Release valid accounts that have been marked in-use for more than
    a day, resetting their last-used time to the epoch sentinel."""
    command = """
        SELECT username, password FROM RenrenAccounts
        WHERE is_using = 1 AND is_valid = 1 AND
            last_used_time <= DATE_SUB(NOW(), INTERVAL 1 DAY)
        ORDER BY last_used_time ASC;
    """
    pool.cursor.execute(command)
    rows = pool.cursor.fetchall()
    log.info('Total InUsing account:' + str(len(rows)))
    for username, password in rows:
        pool.saveAccount(username, password, True, '1971-1-1')
def recursiveProfileTest( username, password, testInterval, totalCount, startList): """Run a recursive get profile test.""" generator = recursiveTestGenerator( username, password, testInterval, totalCount, startList) while True: try: id, info, errorCode = generator.next() if not errorCode: log.info('Profile url: ' + RenrenAgent.getProfileUrl(id)) path = util.saveTestPage(info.html, id) log.info('Profile local path: file://'+path) printInfo(info) except Exception, e: log.error('Error happen or end: ' + str(e)) break
def expandSharePage(self, sharePageInfo, agent):
    """Recursively crawl the popular-share links of a share page,
    collecting comment authors, until the request budget runs out."""
    for shareUrl in sharePageInfo.popularShareList:
        if shareUrl in self.crawledShareSet:
            continue  # Already visited this share page.
        self.crawledShareSet.add(shareUrl)
        if self.requestCount >= self.REQUEST_LIMIT:
            return  # Request budget exhausted.
        log.info('Share agent is crawling url: %s' % shareUrl)
        pageInfo = agent.getSharePageInfo(shareUrl)
        self.requestCount += 1
        if pageInfo:
            for commenter in pageInfo.commentUserList:
                self.addUser(commenter)
            self.shareList.extend(pageInfo.popularShareList)
        time.sleep(1)  # Throttle between requests.
        if pageInfo:
            self.expandSharePage(pageInfo, agent)
def verify(self):
    """Attempt a login with this account, store the outcome on
    self.success, and update the shared success/fail counters."""
    loginAgent = RenrenAgent(self.account)
    loginAgent.login()
    self.success = True if loginAgent.isLogin else False
    log.info('Account Verify result: (%s, %s) >>> %s' %
             (self.account.username, self.account.password, self.success))
    global lock, verifySuccessCount, verifyFailCount
    # Counters are shared across verifier threads.
    lock.acquire()
    if self.success:
        verifySuccessCount += 1
    else:
        verifyFailCount += 1
    lock.release()
def run(self): self.dataBase = createProdDataBase() self.renrenAccountPool = createProdRenrenAccountPool() for i in range(0, self.ROUND_NUMBER): log.info('>>>>>>>> Main Crawl Thread Round(%s) <<<<<<<<' % (i+1)) if self.dataBase.needMoreStartNode(): startNodeCrawler = StartNodeCrawler(\ self.dataBase, self.renrenAccountPool) startNodeCrawler.startCrawling() self.startMultiThreadCrawling(self.THREAD_NUMBER) #self.startMultiThreadCrawlingWithProxy(1) #manager.startSignleThreadCrawling() try: Crawler.detectStopSignal() except Exception, e: break log.info('>>>>>> Router disconnect PPPoE <<<<<<') router.disconnectPPPoE() time.sleep(2) log.info('>>>>>> Router connect PPPoE <<<<<<') router.connectPPPoE() # Wait for the connection being established. time.sleep(10)
def getProfileTest(agent, id, filePath=''):
    """Fetch-and-parse profile test.

    If filePath is given, parse that local html file; otherwise request
    the profile with the given id online. Either way the parsed info is
    dumped and the page saved for inspection.

    Fix: the local html file is now opened with `with` so the handle is
    closed instead of leaked.
    """
    if filePath:
        log.info('================= Get Profile test (Local Html) ======' +
                 '=======================')
        log.info('Local Profile path: file://' + filePath)
        with open(filePath) as localFile:
            html = localFile.read()
        info, errorCode = agent.parseProfileHtml(html)
        if errorCode:
            log.error('Error happen in parse local html, path: ' + filePath)
            return
    else:
        log.info('================= Get Profile test (Online Html) =====' +
                 '=======================')
        log.info('Profile url: ' + agent.getProfileUrl(id))
        info, errorCode = agent.getProfile(id)
        if errorCode:
            log.error('Error happen in get profile, id: ' + id)
            return
    if not info.html:
        log.warning('No html')
        return
    path = util.saveTestPage(info.html, id)
    log.info('Online Profile path: file://' + path)
    printInfo(info)
def handleUserList(self):
    """After crawling start nodes, queue every collected user id that
    has not yet been expanded into the start list.

    Fix: corrected the broken log messages ("Inser" -> "Insert",
    "have been" -> "has been").
    """
    log.info('Finish crawling start nodes, total user number: %s' %
             len(self.userList))
    for id in self.userList:
        status = self.dataBase.getStatus(id)
        if status != Status.expanded:
            # Insert into start list.
            self.dataBase.insertIntoStartList(id)
            log.info('Insert into start list: %s' % id)
        else:
            log.info('This id has been expanded: %s' % id)
def importFromFile(fname): log.config(GC.LOG_FILE_DIR + 'import_accounts', 'info', 'info') fileName = fname accounts = [] pool = createProdRenrenAccountPool() with open(fileName) as importedFile: lines = importedFile.readlines() for line in lines: strs = line.split() if len(strs) < 2: continue # May be not a valid account username = strs[0] # User name first. password = strs[1] # And then password. log.info("Find username: "******" " +\ "password: "******"Finish importing..........\n" +\ "Success on verify accounts number: " +\ str(verifySuccessCount) + "\n" +\ "Fail on verify accounts number: " +\ str(verifyFailCount)) log.info('Success imported number: %s' % importSuccessCount) log.info('Fail imported number: %s' % importFailCount)
def main():
    """Run both analysed-database import tests in sequence."""
    log.config(GC.LOG_FILE_DIR + "analysed_data_base_test", "debug", "debug")
    for testCase in (testAanlysedDataBaseImportFromResult,
                     testAanlysedDataBaseImportFromFile):
        testCase()
    log.info("Pass the all the test!")
def expand(self, id, opt_connection=None):
    """Expand the node with given id.

    This function will:
    1) If not provided connection, means the status of the id may be
       unrecorded. We need to check the status and makeHTTPRequest if
       needed. Then get the connection of the node.
    2) If provided opt_connection, use the connection.
    3) With the connection, get the profile (and with connection if
       availabel) of it's home page friend and recent visitors.
    3) Calculates the connected upexpanded nodes' score.
    4) Picks the node with highest score and return it for next expand.

    Returns:
        UserNode: the node for next expand.
    Raise:
        CrawlerException: happen.
    """
    log.info("Expand node: " + id)
    # Get the connection of the node.
    if not opt_connection:
        # If opt_connection is not provided.
        # Check the status and get the connection.
        status = self.dataBase.getStatus(id)
        if status == database.Status.unrecorded:
            node = self.makeRequsetAndSave(id)
            if not node:
                raise CrawlerException(
                    "Failed to get expanding node's user profile!",
                    CrawlerErrorCode.GET_EXPANDING_NODE_FAILED)
            connection = node.connection
        elif status == database.Status.unexpanded or\
                status == database.Status.expanded:
            connection = self.dataBase.getConnection(id)
        # TODO: Bring it back later.
        #elif status == database.Status.expanded:
        #    raise CrawlerException(
        #        "Try to expand expanded node.",
        #        CrawlerErrorCode.EXPAND_EXPANDED_NODE)
        else:
            raise CrawlerException("WTF??", CrawlerErrorCode.UNKNOWN)
    else:
        connection = opt_connection
    # Get the profile of connected nodes.
    connectedUnexpandedNodes = []
    allConnection =\
        connection.recentVisitorList + connection.homePageFriendList
    # NOTE(review): this loop rebinds `id`, shadowing the function
    # parameter; after the loop `id` is the LAST connected id, not the
    # expanding node's id — the setStatus call below appears to rely on
    # the parameter value. Looks like a latent bug; confirm intent.
    for id in allConnection:
        status, profile = self.dataBase.getStatusAndProfile(id)
        if status == database.Status.unrecorded:
            node = self.makeRequsetAndSave(id)
            if node:
                connectedUnexpandedNodes.append(node)
        elif status == database.Status.unexpanded:
            node = UserNode(id, status, profile)
            connectedUnexpandedNodes.append(node)
    # Set node status to expanded.
    # Raise exception when there is nothing left to expand.
    if not len(connectedUnexpandedNodes):
        self.dataBase.setStatus(id, database.Status.expanded)
        raise CrawlerException(
            "There is no availabel node to expand.",
            CrawlerErrorCode.NO_NODE_TO_EXPAND)
    # Calculate the score.
    scores = [self.calculateScore(node) \
        for node in connectedUnexpandedNodes]
    # Random pick one base on the score (cubing the scores skews the
    # pick towards higher-scored nodes).
    weights = [x*x*x for x in scores]
    index = util.weightPickInt(weights)
    nextToExpend = connectedUnexpandedNodes[index]
    # Occasionally donate a second node to the start list when more
    # start nodes are needed.
    #if self.dataBase.needMoreStartNode() and util.randomTrue(1.0/60):
    if self.dataBase.needMoreStartNode() and util.randomTrue(1.0/10):
        del connectedUnexpandedNodes[index]
        del weights[index]
        if (len(connectedUnexpandedNodes) > 0):
            index = util.weightPickInt(weights)
            newStartNode = connectedUnexpandedNodes[index]
            self.dataBase.insertIntoStartList(newStartNode.id)
    return nextToExpend
def recursiveTestGenerator(
        username, password, testInterval, totalCount, startList):
    """A recursive test generator.

    Start from a list of user id, and get all the profile of these id
    and their friends and friend of the friends. Every time it gets a
    user profile, it will yield the (id, UserInfo, ErrorCode)

    Args:
        @username {string} the user name of the agent.
        @password {string} the password of the agent.
        @testInterval {float} the interval time between every request.
        @totalCount {integer} total number of profile to get.
        @startList {List} a list of user id to start test.
    """
    agent = RenrenAgent(username, password)
    info, error = agent.login()
    if not error:
        log.info(info['name'])
        log.info(info['href'])
    else:
        log.error('Login error(username, password): ' +\
            username + ', ' + password)
    count = 1
    # visitList holds (id, referrer_id) pairs; referrer is None for the
    # seed ids and is only used for error reporting below.
    visitList = []
    for elem in startList:
        visitList.append((elem, None))
    while visitList:
        # Get the element to do the request (FIFO order).
        elem = visitList[0]
        id = elem[0]
        log.info('processing(' + str(count) + '): ' + id)
        visitList = visitList[1:]
        info, errorCode = agent.getProfile(id)
        # Error handle: log (with the referring profile when known) and
        # skip — note errors are NOT yielded, despite the docstring.
        if errorCode:
            if elem[1]:
                log.warning('Error happen when getProfile. Refer id: ' +\
                    str(elem[1]) + '. Refer page url: ' +\
                    agent.getProfileUrl(str(elem[1])))
            else:
                log.warning('Error happen when getProfile, no refer id.')
            continue
        # Yield result
        yield (id, info, errorCode)
        # Result handle: only enqueue new ids while the backlog is
        # smaller than the remaining quota.
        if len(visitList) < totalCount - count:
            newList = []
            if info.friendList:
                for ele in info.friendList:
                    newList += [(ele, id)]
            if info.recentVisitedList:
                for ele in info.recentVisitedList:
                    newList += [(ele, id)]
            visitList += newList
        # Acc
        count += 1
        if count > totalCount:
            return
        time.sleep(testInterval)
def main():
    """Configure logging, then run the result serialization test."""
    log.config(GC.LOG_FILE_DIR + 'result_test', 'debug', 'debug')
    testResultSerialization()
    log.info("Pass the test!")