def _init(self):
    """Actually initialize the MySQL connection."""
    try:
        self.mdbConnection = mdb.connect(
            self.host, self.username, self.password, self.database,
            charset='utf8')
        self.cursor = self.mdbConnection.cursor()
        success = True
    except mdb.Error, e:
        log.error("Cannot establish connection to MySQL: " + str(e))
        success = False
    return success
def recursiveProfileTest(
        username, password, testInterval, totalCount, startList):
    """Run a recursive get-profile test."""
    generator = recursiveTestGenerator(
        username, password, testInterval, totalCount, startList)
    while True:
        try:
            id, info, errorCode = generator.next()
            if not errorCode:
                log.info('Profile url: ' + RenrenAgent.getProfileUrl(id))
                path = util.saveTestPage(info.html, id)
                log.info('Profile local path: file://' + path)
                printInfo(info)
        except Exception, e:
            log.error('Error happened or iteration ended: ' + str(e))
            break
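# A hypothetical invocation of recursiveProfileTest; the credentials, request
# interval (seconds), total count, and start id below are placeholders, not
# values from this project:
def _exampleRecursiveProfileTest():
    recursiveProfileTest('someuser@example.com', 'secret', 5.0, 20,
                         ['230760442'])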
def getGlobalInfo(self):
    """Get the global information."""
    AnalysedDataBase._acquireLock()
    self.pingServer()
    try:
        command = """SELECT info FROM GlobalNameInfo WHERE id = 1;"""
        self.cursor.execute(command)
        rows = self.cursor.fetchall()
        if not rows:
            log.error('Reading global information failed!')
            return None
        string = rows[0][0]
        globalInfo = GlobalNameInfo.FromString(string)
        return globalInfo
    except Exception, e:
        log.warning("Get global info failed! " + str(e))
        self.mdbConnection.rollback()
        return None
    finally:
        AnalysedDataBase._releaseLock()
def init(self, host, username, password, database):
    """Initialize the mysql connection.

    Args:
        @host {string} the name of the host, e.g. 'localhost'.
        @username {string} the user name of the database account.
        @password {string} the password.
        @database {string} the name of the database.

    Returns:
        True if the action succeeded, False if it failed.
    """
    try:
        self.mdbConnection = mdb.connect(host, username, password, database)
        self.cursor = self.mdbConnection.cursor()
        success = True
    except mdb.Error, e:
        log.error("Cannot establish connection to MySQL: " + str(e))
        success = False
    return success
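# A minimal usage sketch for init(); that this method lives on a class named
# AnalysedDataBase with a no-argument constructor, and the connection values
# below, are assumptions for illustration only:
def _exampleInitDataBase():
    db = AnalysedDataBase()
    if not db.init('localhost', 'dbuser', 'dbpassword', 'renren'):
        log.error('Database initialization failed.')
    return db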
def _getInfo(self, key, tableName):
    """Get the RawInfo from the map for the given key.

    Returns:
        {RawNameItemInfo} the raw info.
    """
    AnalysedDataBase._acquireLock()
    self.pingServer()
    try:
        command = "SELECT info FROM %s WHERE s_key = %s;" % (tableName, '%s')
        self.cursor.execute(command, [key])
        rows = self.cursor.fetchall()
        if len(rows):
            if len(rows) > 1:
                log.error("Multiple results for key: " + key)
            infoString = rows[0][0]
            info = RawNameItemInfo.FromString(infoString)
            return info
        else:
            return None
    finally:
        AnalysedDataBase._releaseLock()
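# Note on the query above: the DB driver can only bind values, not identifiers,
# so the table name is interpolated with % first and only the key is passed to
# execute() for escaping. A hypothetical call sketch (the table name and key
# below are placeholders, not confirmed schema names):
def _exampleGetInfo(db):
    return db._getInfo('someKey', 'SomeNameTable')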
def getProfileTest(agent, id, filePath=''):
    """Get one profile, from a local html file or online, and print its info."""
    if filePath:
        log.info('================= Get Profile test (Local Html) ======' +\
                 '=======================')
        log.info('Local Profile path: file://' + filePath)
        html = open(filePath).read()
        info, errorCode = agent.parseProfileHtml(html)
        if errorCode:
            log.error('Error happened while parsing local html, path: ' + filePath)
            return
    else:
        log.info('================= Get Profile test (Online Html) =====' +\
                 '=======================')
        log.info('Profile url: ' + agent.getProfileUrl(id))
        info, errorCode = agent.getProfile(id)
        if errorCode:
            log.error('Error happened in get profile, id: ' + id)
            return
    if not info.html:
        log.warning('No html')
        return
    path = util.saveTestPage(info.html, id)
    log.info('Online Profile path: file://' + path)
    printInfo(info)
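# A hypothetical invocation sketch for getProfileTest; the credentials, user
# id, and local file path are placeholders:
def _exampleGetProfileTest():
    agent = RenrenAgent('someuser@example.com', 'secret')
    agent.login()
    # Fetch and parse the profile online.
    getProfileTest(agent, '230760442')
    # Parse a previously saved local page instead.
    getProfileTest(agent, '230760442', filePath='/tmp/profile.html')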
def recursiveTestGenerator(
        username, password, testInterval, totalCount, startList):
    """A recursive test generator.

    Start from a list of user ids, and get the profiles of these ids, their
    friends, and the friends of the friends. Every time it gets a user
    profile, it yields (id, UserInfo, ErrorCode).

    Args:
        @username {string} the user name of the agent.
        @password {string} the password of the agent.
        @testInterval {float} the interval time between every request.
        @totalCount {integer} total number of profiles to get.
        @startList {List} a list of user ids to start the test from.
    """
    agent = RenrenAgent(username, password)
    info, error = agent.login()
    if not error:
        log.info(info['name'])
        log.info(info['href'])
    else:
        log.error('Login error(username, password): ' +\
                  username + ', ' + password)
    count = 1
    visitList = []
    for elem in startList:
        visitList.append((elem, None))
    while visitList:
        # Get the element to request.
        elem = visitList[0]
        id = elem[0]
        log.info('processing(' + str(count) + '): ' + id)
        visitList = visitList[1:]
        info, errorCode = agent.getProfile(id)
        # Error handling.
        if errorCode:
            if elem[1]:
                log.warning('Error happened in getProfile. Refer id: ' +\
                            str(elem[1]) + '. Refer page url: ' +\
                            agent.getProfileUrl(str(elem[1])))
            else:
                log.warning('Error happened in getProfile, no refer id.')
            continue
        # Yield the result.
        yield (id, info, errorCode)
        # Result handling: enqueue friends and recent visitors.
        if len(visitList) < totalCount - count:
            newList = []
            if info.friendList:
                for ele in info.friendList:
                    newList += [(ele, id)]
            if info.recentVisitedList:
                for ele in info.recentVisitedList:
                    newList += [(ele, id)]
            visitList += newList
        # Accumulate the count.
        count += 1
        if count > totalCount:
            return
        time.sleep(testInterval)
def run(self):
    log.info('>>>>>> Thread %s start. <<<<<<' % self.threadId)
    crawler = Crawler(self.dataBase)
    dataBase = self.dataBase
    agent = None
    account = None
    startNode = None
    startNodeRowId = None
    try:
        while True:
            # Prepare the agent, account and start node.
            if not startNode:
                startNode, startNodeRowId = dataBase.getStartNode()
                log.info('Thread %s, start node: %s, %s' %\
                         (self.threadId, startNode, startNodeRowId))
                if not startNode or not startNodeRowId:
                    # No available start node, exit crawling.
                    log.error(
                        'No start node for thread %s, exit crawling.' %\
                        (self.threadId, ))
                    break
            if not agent or not account:
                agent, account = self.getAgentWithAccount()
                if not agent or not account:
                    # No available account, exit crawling.
                    log.warning(
                        'No available agent for thread %s, exit crawling.' %\
                        (self.threadId, ))
                    break
            # One crawling pass.
            crawler.setAgent(agent)
            try:
                crawler.crawl(startNode)
            except CrawlerException, e:
                log.info('Thread %s gets exception: %s' %\
                         (self.threadId, str(e)))
                if e.errorCode == CrawlerErrorCode.DETECT_STOP_SIGNAL:
                    log.info("Thread " + str(self.threadId) +\
                             " stops crawling because of the stop signal.")
                    break
                if e.errorCode ==\
                   CrawlerErrorCode.GET_EXPANDING_NODE_FAILED or\
                   e.errorCode == CrawlerErrorCode.EXPAND_EXPANDED_NODE or\
                   e.errorCode == CrawlerErrorCode.NO_NODE_TO_EXPAND:
                    # The start node is bad.
                    log.warning('Thread %s, bad start node: %s, %s' %\
                                (self.threadId, startNode, startNodeRowId))
                    dataBase.deleteFromStartList(startNode)
                    startNode = startNodeRowId = None
                if e.errorCode == CrawlerErrorCode.REQUEST_FAILED:
                    # Still the start node's fault.
                    # TODO: Implement invalid user-node test support in the
                    # database to change this.
                    log.warning('Thread %s, bad start node: %s, %s' %\
                                (self.threadId, startNode, startNodeRowId))
                    dataBase.deleteFromStartList(startNode)
                    startNode = startNodeRowId = None
                if e.errorCode == CrawlerErrorCode.REACH_REQUEST_LIMIT:
                    # Use a new account.
                    account.finishUsing()
                    account = agent = None
            finally:
                # The start node changes every time crawler.expand() is
                # called, so it cannot be reused after an exception. Release
                # it and use a new one.
                if startNodeRowId:
                    dataBase.releaseStartNode(startNodeRowId)
                startNode = startNodeRowId = None
    except Exception, e:
        log.error('Thread %s gets exception, exit crawling: %s' %\
                  (self.threadId, str(e)))
    finally:
        # Release resources.
        if account:
            account.finishUsing()
        if startNodeRowId:
            dataBase.releaseStartNode(startNodeRowId)
        log.info('>>>>>> Thread %s end. <<<<<<' % self.threadId)


class MainCrawlThread(threading.Thread):
    dataBase = None
    renrenAccountPool = None
    THREAD_NUMBER = flag.getFlag('thread_number')
    ROUND_NUMBER = flag.getFlag('round_number')