def get100KProfiles(self):
    """Return the next page of profiles from the Persons table.

    Each call reads up to ``self.step`` rows starting at ``self.offset`` and
    then advances ``self.offset`` by the number of rows actually read, so
    successive calls walk through the table page by page.

    Returns:
        [Profile] The profiles of this page. The list is empty when there
        are no more rows or when the query failed.
    """
    ReadOnlyDataStore.acquireLock()
    profiles = []
    try:
        command = """
            SELECT id, name, gender, hometown, residence, birthday,
                visitor_number, friend_number, recent_visitor_number,
                home_page_friend_number
            FROM Persons
            LIMIT %s OFFSET %s;
        """
        self.cursor.execute(command, [self.step, self.offset])
        rows = self.cursor.fetchall()
        self.offset += len(rows)
        for row in rows:
            profiles.append(self.convertToProfile(row))
    except Exception as e:
        log.warning("Get 100K profiles failed!" + str(e))
    finally:
        # Always hand the lock back, otherwise the next call blocks forever.
        # NOTE(review): assumes ReadOnlyDataStore.releaseLock() is the
        # counterpart of acquireLock() -- confirm.
        ReadOnlyDataStore.releaseLock()
    return profiles
def getAgentWithAccount(self):
    """Log in with accounts from the pool until one of them succeeds.

    At most ``self.FAIL_LOGIN_ACCOUNT_LIMIT`` accounts are tried, and never
    more than ``self.ACCOUNTS_LIMIT`` accounts in total for this object
    (tracked in ``self.accountUsed``).

    Returns:
        (agent, account): the logged-in agent and the account it uses.
        (None, None): if no login succeeded.
    """
    loginFailLimit = self.FAIL_LOGIN_ACCOUNT_LIMIT
    pool = self.renrenAccountPool
    loginFailAccounts = []
    account = None
    agent = None
    for _ in range(loginFailLimit):
        if self.accountUsed >= self.ACCOUNTS_LIMIT:
            # Run out of accounts credit.
            break
        self.accountUsed += 1
        account = pool.getAccount()
        if not account:
            # No available account in the database.
            break
        agent = RenrenAgent(account, self.proxy)
        agent.login()
        time.sleep(1)
        if agent.isLogin:
            # Login success.
            break
        log.warning('Thread %s login fail.' % self.threadId)
        loginFailAccounts.append(account)

    if agent and agent.isLogin:
        # Use a separate loop variable: reusing ``account`` here would make
        # the function return the last failed account instead of the one
        # that actually logged in.
        for failedAccount in loginFailAccounts:
            failedAccount.reportInvalidAccount(
                RenrenAccountErrorCode.ERROR_WHEN_LOGIN)
        return agent, account

    # Nothing logged in: the failures may not be the accounts' fault, so
    # they are only released, not reported as invalid.
    for failedAccount in loginFailAccounts:
        failedAccount.finishUsing()
    return None, None
def getStartNodes(self, number=1):
    """Reserve start nodes for one or several crawl threads.

    Selects up to ``number`` StartList rows that are not in use, oldest
    ``last_modified`` first, and marks every selected row as in use.

    Args:
        number: the number of nodes you would like to get.

    Returns:
        The selected rows, each an (id, table_id) pair in the column order
        of the SELECT. An empty tuple if the operation failed.
    """
    DataBase.acquireLock()
    try:
        selectCommand = """
            SELECT id, table_id FROM StartList
            WHERE is_using = 0
            ORDER BY last_modified ASC
            LIMIT %s;
        """
        self.cursor.execute(selectCommand, [number])
        rows = self.cursor.fetchall()
        updateCommand = """
            UPDATE StartList
            SET is_using = 1
            WHERE table_id = %s;
        """
        for row in rows:
            # row[1] is table_id, the second selected column.
            self.cursor.execute(updateCommand, [row[1]])
        self.mdbConnection.commit()
        return rows
    except Exception as e:
        log.warning("Get start node failed!" + str(e))
        self.mdbConnection.rollback()
        return ()
    finally:
        # Runs on both return paths above, so the lock is never leaked.
        DataBase.releaseLock()
def login(self):
    """Post the account credentials to ``self.loginUrl``.

    Builds the login form from ``self.username`` / ``self.password``, logs
    the attempt and opens the request through ``self.opener`` with a timeout
    of ``self.TIME_OUT``.

    Returns:
        ({}, ErrorCode.URL_ERROR) when opening the request raises
        urllib2.URLError.

    NOTE(review): the handling of a successful response is not visible in
    this excerpt; ``response`` is assigned but not used here.
    """
    # An account that is already logged in must not log in again.
    assert self.account.isLogin == False
    loginData={
        'email': self.username,
        'password': self.password,
        'originUrl': '',
        'formName': '',
        'method': '',
        'isplogin': '******',
        'submit': '登陆'
    }
    postData = urlencode(loginData)
    # Passing a data argument makes urllib2 issue a POST request.
    req = urllib2.Request(self.loginUrl, postData)
    # NOTE(review): both log lines below write the plain-text password to
    # the log -- consider masking it.
    if self.proxy:
        log.info('Try to login with proxy: (%s, %s) %s %s ====' %\
            (self.username, self.password,
            self.proxy.protocol, self.proxy.getProxyString()))
    else:
        log.info('Try to login: (%s, %s) ====' %\
            (self.username, self.password))
    try:
        response = self.opener.open(req, timeout=self.TIME_OUT)
    except urllib2.URLError, e:
        #print 'Login error: ' + e.reason
        log.warning('Login fail when requesting: ' + str(e.reason))
        return ({}, ErrorCode.URL_ERROR)
def startCrawling(self):
    """Crawl the start share page and store the next start url.

    Obtains a logged-in agent, fetches the share page for the current start
    url, expands it, handles the collected users and, when any share was
    collected, records the last one as the new start url.

    Returns:
        True if success, otherwise False.
    """
    account = None
    try:
        url = self.getStartUrl()
        agent, account = self.getAgentWithAccount()
        if not agent:
            raise Exception('No account to crawl start nodes.')
        info = agent.getSharePageInfo(url)
        if not info:
            raise Exception('Get start share url fail.')
        time.sleep(0.8)
        self.requestCount = 1
        self.expandSharePage(info, agent)
        self.handleUserList()
        # Update the new share url
        if self.shareList:
            self.writeStartUrl(self.shareList[-1])
        return True
    except Exception as e:
        log.warning('Crawl start node fail: %s' % str(e))
        return False
    finally:
        # Hand the account back whether the crawl succeeded or not, so it
        # does not stay reserved.
        # NOTE(review): assumes nothing called above already releases the
        # account -- confirm to avoid releasing it twice.
        if account:
            account.finishUsing()
def getAgentWithAccount(self):
    """Log in a ShareAgent with accounts from the pool until one succeeds.

    At most five accounts are tried.

    Returns:
        (agent, account): the logged-in agent and the account it uses.
        (None, None): if no login succeeded.
    """
    loginFailLimit = 5
    pool = self.renrenAccountPool
    loginFailAccounts = []
    account = None
    agent = None
    for _ in range(loginFailLimit):
        account = pool.getAccount()
        if not account:
            # No available account in the database.
            break
        agent = ShareAgent(account)
        agent.login()
        time.sleep(1)
        if agent.isLogin:
            # Login success.
            break
        log.warning('Start node crawler login fail.')
        loginFailAccounts.append(account)

    if agent and agent.isLogin:
        # Use a separate loop variable: reusing ``account`` here would make
        # the function return the last failed account instead of the one
        # that actually logged in.
        for failedAccount in loginFailAccounts:
            failedAccount.reportInvalidAccount(
                RenrenAccountErrorCode.ERROR_WHEN_LOGIN)
        return agent, account

    # Nothing logged in: the failures may not be the accounts' fault, so
    # they are only released, not reported as invalid.
    for failedAccount in loginFailAccounts:
        failedAccount.finishUsing()
    return None, None
def saveInvalidAccount(pool):
    """Try to log in again with every account that is marked invalid.

    Reads all accounts with is_valid = 0, most recently used first, and
    attempts a login with each one that ``pool.onceSaveFail`` does not
    reject.

    Args:
        pool: object exposing a database ``cursor`` and ``onceSaveFail``.

    NOTE(review): this excerpt ends at the ``finally:`` keyword; its body,
    including whatever updates ``failCount``, is not visible here.
    """
    command = """ SELECT username, password FROM RenrenAccounts WHERE is_valid = 0 ORDER BY last_used_time DESC; """
    pool.cursor.execute(command)
    rows = pool.cursor.fetchall()
    log.info('Total InValid account:' + str(len(rows)))
    failCount = 0
    for row in rows:
        username = row[0]
        password = row[1]
        saveSuccess = False
        # Skip accounts for which onceSaveFail() returns a true value.
        if pool.onceSaveFail(username, password):
            continue
        # Give up once more than 100 failures have been counted.
        if failCount > 100:
            break;
        try:
            # Pause before each login attempt.
            time.sleep(2)
            # NOTE(review): called with (username, password) here -- confirm
            # this matches the RenrenAgent constructor signature.
            agent = RenrenAgent(username, password)
            agent.login()
            saveSuccess = agent.isLogin
        except Exception, e:
            log.warning('Save login fail: ' + str(e))
        finally:
def insertProxy(self, proxy):
    """Insert a proxy into the Proxies table.

    Args:
        proxy: object providing addr, port, protocol, info, source,
            testCount, successCount and averageTime. A missing protocol is
            stored as 'http'; missing info or source is stored as NULL.

    Returns:
        True if the row was committed, False if the insert failed and was
        rolled back.
    """
    ProxyPool.acquireLock()
    try:
        command = """
            INSERT INTO Proxies (
                address, port, protocol, info, source,
                test_count, success_count, average_time
            ) VALUES (
                %s, %s, %s, %s, %s, %s, %s, %s
            );
        """
        address = proxy.addr.encode('utf-8')
        port = proxy.port.encode('utf-8')
        if proxy.protocol:
            protocol = proxy.protocol.encode('utf-8')
        else:
            protocol = u'http'
        info = proxy.info.encode('utf-8') if proxy.info else None
        source = proxy.source.encode('utf-8') if proxy.source else None
        self.cursor.execute(command, (
            address, port, protocol, info, source,
            proxy.testCount, proxy.successCount, proxy.averageTime))
        self.mdbConnection.commit()
        success = True
    except Exception as e:
        log.warning('Proxy pool insert fail >>>>> ' + str(e))
        self.mdbConnection.rollback()
        success = False
    finally:
        # NOTE(review): assumes ProxyPool.releaseLock() is the counterpart
        # of acquireLock() -- confirm.
        ProxyPool.releaseLock()
    return success
def _getRankNeighbors(self, rank, arrayName):
    """Get the keys whose rank is close to ``rank`` in a rank array.

    The window covers ``NEIGHBOR_COUNT // 2`` ranks below and above
    ``rank`` (the element at ``rank`` itself included); the lower bound is
    clamped to 1.

    Args:
        rank: the rank to look around.
        arrayName: name of the table to query. It is interpolated into the
            SQL text as an identifier, so it must never come from
            untrusted input.

    Returns:
        [key] A list with the neighbor keys, ordered by ascending rank.
    """
    NEIGHBOR_COUNT = 7
    AnalysedDataBase._acquireLock()
    try:
        # Ping inside the try block so that a failing ping still releases
        # the lock.
        self.pingServer()
        # Only the table name is filled in here; the two bounds stay as
        # '%s' placeholders for the driver to bind.
        command = """SELECT s_key FROM %s
            WHERE rank BETWEEN %s AND %s
            ORDER BY rank ASC;
        """ % (arrayName, '%s', '%s')
        # Explicit floor division keeps the bounds integral.
        halfWindow = NEIGHBOR_COUNT // 2
        bottom = max(1, rank - halfWindow)
        upper = rank + halfWindow
        self.cursor.execute(command, [bottom, upper])
        rows = self.cursor.fetchall()
        array = [x[0] for x in rows]
        if not len(array) or len(array) > NEIGHBOR_COUNT:
            log.warning('Wrong result in rank neighbors, rank %s' % rank)
        return array
    finally:
        AnalysedDataBase._releaseLock()
def pingServer(self):
    """Ping the MySQL connection if the last ping is old enough.

    Does nothing while no more than ``self.PING_INTERVAL`` seconds have
    passed since ``self.lastPing``. Otherwise records the new ping time and
    pings the connection; if the ping raises, the failure is logged and
    ``self._init()`` is called.
    """
    sinceLastPing = time.time() - self.lastPing
    if sinceLastPing > self.PING_INTERVAL:
        log.info('Ping Mysql Server.')
        self.lastPing = time.time()
        try:
            self.mdbConnection.ping()
        except Exception as error:
            log.warning('Mysql server gone: %s' % error)
            self._init()
def reportInvalidAccount(self, account, errorCode, errorInfo=None):
    """Update the database when a RenrenAccount becomes invalid.

    Sets is_using = 0 and is_valid = 0 for the account, adds its login and
    request counts, stores the error, and inserts a BECOME_INVALID row into
    the log table.

    Args:
        account: object providing username, password, isLogin and
            requestCount.
        errorCode: a RenrenAccountErrorCode value.
        errorInfo: optional description; a default text is derived from
            ``errorCode`` when it is empty.
    """
    RenrenAccountPool.acquireLock()
    try:
        updateCommand = """
            UPDATE RenrenAccounts
            SET is_using = 0,
                is_valid = 0,
                login_count = %s + login_count,
                request_count = %s + request_count,
                last_used_time = NOW(),
                become_invalid_time = NOW(),
                error_code = %s,
                error_info = %s
            WHERE username = %s AND password = %s;
        """
        insertCommand = """
            INSERT INTO RenrenAccountsLog (
                username, password, event, is_login, request_count)
            VALUES( %s, %s, %s, %s, %s);
        """
        loginCount = 1 if account.isLogin else 0
        if not errorInfo:
            if errorCode == RenrenAccountErrorCode.ERROR_WHEN_LOGIN:
                errorInfo = "Get error when login."
            elif errorCode == RenrenAccountErrorCode.ERROR_WHEN_REQUEST:
                errorInfo = "Get error when making request."
            else:
                errorInfo = "Unknown error."
        self.cursor.execute(updateCommand, [
            loginCount, account.requestCount,
            errorCode, errorInfo,
            account.username, account.password])
        self.cursor.execute(insertCommand, [
            account.username, account.password,
            RenrenAccountLogEvent.BECOME_INVALID,
            loginCount, account.requestCount])
        self.mdbConnection.commit()
    except Exception as e:
        # The password is deliberately left out of the log message.
        log.warning(
            "RenrenAccountPool: report invalid failed! " +
            "username: %s %s" % (account.username, e))
        self.mdbConnection.rollback()
    finally:
        # NOTE(review): assumes RenrenAccountPool.releaseLock() is the
        # counterpart of acquireLock() -- confirm.
        RenrenAccountPool.releaseLock()
def run(self):
    """Fetch ``self.url``, parse the proxies on it and import them.

    The page is opened with a 10 second timeout and handed to
    ``self.parser``. Every parsed proxy is first passed to
    ``self.importer.addProxy``, then the whole batch is written to the debug
    log. Any exception is logged as a warning and swallowed.
    """
    try:
        page = urllib2.build_opener().open(self.url, timeout=10)
        found = self.parser.parse(page, self.url)
        for entry in found:
            self.importer.addProxy(entry)
        log.debug('Crawl proxies from ' + str(self.url) + ':')
        for entry in found:
            log.debug('>>>>>' + entry.getAllString())
    except Exception as error:
        log.warning('Crawling thread exception: ' + str(error))
def deleteFromStartList(self, id):
    """Delete a node from the start list.

    Args:
        id: the node id; it is encoded as UTF-8 before being bound.

    Returns:
        True if the delete was committed, False if it failed and was
        rolled back.
    """
    DataBase.acquireLock()
    try:
        command = "DELETE FROM StartList WHERE id = %s;"
        self.cursor.execute(command, [id.encode('utf-8')])
        self.mdbConnection.commit()
        success = True
    except Exception as e:
        log.warning("Delete from start list failed!" + str(e))
        self.mdbConnection.rollback()
        success = False
    finally:
        DataBase.releaseLock()
    return success
def releaseAllStartNode(self):
    """Release all start nodes by setting is_using = 0 on every row.

    A failure is logged and rolled back; nothing is raised to the caller.
    """
    DataBase.acquireLock()
    try:
        command = """
            UPDATE StartList
            SET is_using = 0;
        """
        self.cursor.execute(command)
        self.mdbConnection.commit()
    except Exception as e:
        log.warning("Release all start list node failed!" + str(e))
        self.mdbConnection.rollback()
    finally:
        DataBase.releaseLock()
def clearAllStartNode(self):
    """Delete every row of the StartList table.

    A failure is logged and rolled back; nothing is raised to the caller.
    """
    DataBase.acquireLock()
    try:
        command = """
            DELETE FROM StartList;
        """
        self.cursor.execute(command)
        self.mdbConnection.commit()
    except Exception as e:
        log.warning("Clear start list node failed!" + str(e))
        self.mdbConnection.rollback()
    finally:
        DataBase.releaseLock()
def finishUsing(self, account):
    """Update the database when a RenrenAccount is no longer being used.

    Sets is_using = 0 for the account. If the account logged in or made
    requests, its counters and last_used_time are updated and a FINISH_USE
    row is inserted into the log table; an account that was never used is
    only released.

    Args:
        account: object providing username, password, isLogin and
            requestCount.
    """
    RenrenAccountPool.acquireLock()
    try:
        updateCommand = """
            UPDATE RenrenAccounts
            SET is_using = 0,
                login_count = %s + login_count,
                request_count = %s + request_count,
                last_used_time = NOW()
            WHERE username = %s AND password = %s;
        """
        updateCommandNoUse = """
            UPDATE RenrenAccounts
            SET is_using = 0
            WHERE username = %s AND password = %s;
        """
        insertCommand = """
            INSERT INTO RenrenAccountsLog (
                username, password, event, is_login, request_count)
            VALUES( %s, %s, %s, %s, %s);
        """
        loginCount = 1 if account.isLogin else 0
        if not account.isLogin and account.requestCount == 0:
            # Never used: release it without touching counters or the log.
            self.cursor.execute(
                updateCommandNoUse,
                [account.username, account.password])
        else:
            self.cursor.execute(updateCommand, [
                loginCount, account.requestCount,
                account.username, account.password])
            self.cursor.execute(insertCommand, [
                account.username, account.password,
                RenrenAccountLogEvent.FINISH_USE,
                loginCount, account.requestCount])
        self.mdbConnection.commit()
    except Exception as e:
        # The password is deliberately left out of the log message.
        log.warning(
            "RenrenAccountPool: finish use failed! " +
            "username: %s %s" % (account.username, e))
        self.mdbConnection.rollback()
    finally:
        # NOTE(review): assumes RenrenAccountPool.releaseLock() is the
        # counterpart of acquireLock() -- confirm.
        RenrenAccountPool.releaseLock()
def saveAccount(self, username, password, successSave, time=None):
    """Record the outcome of an attempt to re-validate an account.

    When ``successSave`` is true the account is marked valid and not in
    use. In both cases a SAVE_ACCOUNT_SUCCESS or SAVE_ACCOUNT_FAIL row is
    written to the log table.

    Args:
        username: account user name.
        password: account password.
        successSave: whether the re-validation succeeded.
        time: optional value for last_used_time; NOW() is used when it is
            empty.

    Returns:
        True if the changes were committed, False if they failed and were
        rolled back.
    """
    RenrenAccountPool.acquireLock()
    try:
        accountCommandNoTime = """
            UPDATE RenrenAccounts
            SET is_using = 0,
                is_valid = 1,
                last_used_time = NOW()
            WHERE username = %s AND password = %s;
        """
        accountCommandWithTime = """
            UPDATE RenrenAccounts
            SET is_using = 0,
                is_valid = 1,
                last_used_time = %s
            WHERE username = %s AND password = %s;
        """
        logCommand = """
            INSERT INTO RenrenAccountsLog (
                username, password, event
            ) VALUES (%s, %s, %s);
        """
        if successSave:
            if time:
                self.cursor.execute(
                    accountCommandWithTime, [time, username, password])
            else:
                self.cursor.execute(
                    accountCommandNoTime, [username, password])
            event = RenrenAccountLogEvent.SAVE_ACCOUNT_SUCCESS
        else:
            event = RenrenAccountLogEvent.SAVE_ACCOUNT_FAIL
        self.cursor.execute(logCommand, [username, password, event])
        self.mdbConnection.commit()
        success = True
    except Exception as e:
        # The password is deliberately left out of the log message.
        log.warning(
            "RenrenAccountPool: save account operation failed! " +
            "username: %s %s" % (username, e))
        self.mdbConnection.rollback()
        success = False
    finally:
        # NOTE(review): assumes RenrenAccountPool.releaseLock() is the
        # counterpart of acquireLock() -- confirm.
        RenrenAccountPool.releaseLock()
    return success
def parseOldProfilePageWithoutAccess(self, document):
    """Parse an old-style profile page that the viewer cannot access.

    Args:
        document: the parsed page; it is searched with ``find``.

    Returns:
        {UserInfo} the user information with ``name`` and ``visitedNum``
        filled in, or None if either of them could not be read.
    """
    info = UserInfo()
    # Name & visitedNum
    try:
        info.name = document.find('div', class_='add-guide').h3.string
        statusHolder = document.find('div', class_='status-holder')
        # The counter is a 'count' span nested inside another 'count' span.
        spanTag = statusHolder.find(
            'span', class_='count').find('span', class_='count')
        info.visitedNum = int(spanTag.string)
    except Exception as e:
        log.warning('Shit happen in parse old without access: ' + str(e))
        return None  # Lack of critical information
    # Hand the parsed result back; without this the success path would
    # return None just like the failure path.
    return info
def parse(self, html, source):
    """Parse an html page and return every proxy found in it.

    Only the first ``<p>`` element is inspected; each of its stripped
    strings is handed to ``Proxy.parse`` and the truthy results are kept.

    Returns:
        A list of proxies; empty when the page has no ``<p>`` element.
    """
    document = BeautifulSoup(html)
    content = document.find('p')
    if not content:
        log.warning('Can not find content element.')
        return []
    candidates = (
        Proxy.parse(text, source) for text in content.stripped_strings)
    return [proxy for proxy in candidates if proxy]
def insertIntoStartList(self, id, opt_lastModified=None):
    """Insert a node into the start list.

    Args:
        id: the node id; it is encoded as UTF-8 before being bound.
        opt_lastModified: optional value for the last_modified column.

    Returns:
        True if the insert was committed, False if it failed and was
        rolled back.
    """
    DataBase.acquireLock()
    try:
        command = \
            "INSERT INTO StartList (id, last_modified) VALUES(%s, %s);"
        self.cursor.execute(command, [id.encode('utf-8'), opt_lastModified])
        self.mdbConnection.commit()
        success = True
    except Exception as e:
        log.warning("Insert into start list failed!" + str(e))
        self.mdbConnection.rollback()
        success = False
    finally:
        DataBase.releaseLock()
    return success
def replaceStartNode(self, originId, newId):
    """Replace the id of a start list node with a new id.

    Args:
        originId: the id currently stored in StartList.
        newId: the id to store instead.

    Returns:
        True if the update was committed, False if it failed and was
        rolled back.
    """
    DataBase.acquireLock()
    try:
        command = "UPDATE StartList SET id = %s WHERE id = %s;"
        self.cursor.execute(command, [
            newId.encode('utf-8'), originId.encode('utf-8')])
        self.mdbConnection.commit()
        success = True
    except Exception as e:
        log.warning("Replace start list node failed!" + str(e))
        self.mdbConnection.rollback()
        success = False
    finally:
        DataBase.releaseLock()
    return success
def addRecord(self, id, userInfo, opt_referenceId=None):
    """Insert a person into the database from crawled user information.

    Writes one Persons row with status ``Status.unexpanded`` plus one
    RecentVisitors row per recent visitor and one HomePageFriends row per
    home page friend, all in a single transaction. A failure is logged and
    rolled back.

    Args:
        id: the user id.
        userInfo: the crawled information; it is split into a profile and a
            connection object by ``convert``.
        opt_referenceId: optional id stored in the reference_id column.

    Returns:
        None.
        NOTE(review): earlier documentation promised a UserNode on success,
        but nothing visible here builds one -- confirm with the callers.
    """
    DataBase.acquireLock()
    try:
        personsCommand = (
            "INSERT INTO Persons ("
            "id, status, "
            "name, gender, hometown, "
            "residence, birthday, "
            "visitor_number, friend_number, "
            "recent_visitor_number, home_page_friend_number, "
            "create_time, reference_id) "
            "VALUES(%s, %s, "
            "%s, %s, %s, "
            "%s, %s, "
            "%s, %s, "
            "%s, %s, NOW(), %s);")
        visitorsCommand = (
            "INSERT INTO RecentVisitors ("
            "id, visitor) VALUES(%s, %s);")
        friendsCommand = (
            "INSERT INTO HomePageFriends ("
            "id, friend) VALUES(%s, %s);")
        profile, connection = convert(userInfo)
        # Empty optional text fields are stored as NULL.
        self.cursor.execute(personsCommand, (
            id.encode('utf-8'), Status.unexpanded,
            profile.name.encode('utf-8'),
            profile.gender if profile.gender else None,
            profile.hometown.encode('utf-8')
                if profile.hometown else None,
            profile.residence.encode('utf-8')
                if profile.residence else None,
            profile.birthday.encode('utf-8')
                if profile.birthday else None,
            profile.visitorNum, profile.friendNum,
            profile.recentVisitorNum, profile.homePageFriendNum,
            opt_referenceId.encode('utf-8') if opt_referenceId else None))
        for visitor in connection.recentVisitorList:
            self.cursor.execute(visitorsCommand, (id, visitor))
        for friend in connection.homePageFriendList:
            self.cursor.execute(friendsCommand, (id, friend))
        self.mdbConnection.commit()
    except Exception as e:
        log.warning("Add Record Failed! ("+ str(id) + ") " + str(e))
        self.mdbConnection.rollback()
    finally:
        DataBase.releaseLock()
def deleteAllProxies(self):
    """Delete all proxies from the database.

    Returns:
        True if the delete was committed, False if it failed and was
        rolled back.
    """
    ProxyPool.acquireLock()
    try:
        command = """
            DELETE FROM Proxies;
        """
        self.cursor.execute(command)
        self.mdbConnection.commit()
        success = True
    except Exception as e:
        log.warning('Proxies pool delete all fail >>>>> ' + str(e))
        self.mdbConnection.rollback()
        success = False
    finally:
        # NOTE(review): assumes ProxyPool.releaseLock() is the counterpart
        # of acquireLock() -- confirm.
        ProxyPool.releaseLock()
    return success
def parseTimelineProfilePage(self, document):
    """Parse a timeline-style profile page.

    Args:
        document: the parsed page; it is searched with ``find``.

    Returns:
        {UserInfo} the user information with ``name`` and ``visitedNum``
        filled in, or None if either of them could not be read.
    """
    info = UserInfo()
    # Name & visitedNum
    nameNode = document.find('h1', class_='avatar_title')
    try:
        # The first stripped string of the title node is the name.
        info.name = next(nameNode.stripped_strings)
        visitedString = next(nameNode.span.strings)
        # Pick the first run of digits out of the visit counter text. A
        # missing match raises on .group() and is handled below.
        mat = re.match(r'[^\d]*(\d+)[^\d]*', visitedString)
        info.visitedNum = int(mat.group(1))
    except Exception as e:
        log.warning('Shit happen in parse time line page: ' + str(e))
        return None  # Lack of critical information
    # Hand the parsed result back; without this the success path would
    # return None just like the failure path.
    return info
def needMoreStartNode(self):
    """Return whether we need more start nodes.

    Returns:
        True if fewer than ``self.MIN_START_NODE_COUNT`` start nodes are
        not in use; False otherwise, or if the count could not be read.
    """
    DataBase.acquireLock()
    try:
        command = """
            SELECT COUNT(*) FROM StartList
            WHERE is_using = FALSE;
        """
        self.cursor.execute(command)
        rows = self.cursor.fetchall()
        count = rows[0][0]
        return count < self.MIN_START_NODE_COUNT
    except Exception as e:
        log.warning("Get start node count fail!" + str(e))
        self.mdbConnection.rollback()
        return False
    finally:
        DataBase.releaseLock()
def getSharePageInfo(self, url):
    """Get share page info.

    Opens ``url`` through ``self.opener`` with a timeout of
    ``self.TIME_OUT`` and reads the final url and the page body.

    Returns:
        None if an error happens while requesting the page.
        SharePageInfo otherwise (NOTE(review): the code that builds it from
        ``realUrl`` and ``html`` is not visible in this excerpt).

    Raises:
        Exception: if the agent is not logged in.
    """
    if not self.isLogin:
        raise Exception("Account is not login")
    try:
        response = self.opener.open(url, timeout=self.TIME_OUT)
        # geturl() gives the url after any redirect was followed.
        realUrl = response.geturl()
        html = response.read()
    except Exception, e:
        log.warning("Get share url error: " + str(e) + ". Share url: " + url)
        return None
def releaseStartNode(self, tableId):
    """Release a start node by setting is_using = 0 on its row.

    A failure is logged and rolled back; nothing is raised to the caller.

    Args:
        tableId: the id of the table row.
    """
    DataBase.acquireLock()
    try:
        command = """
            UPDATE StartList
            SET is_using = 0
            WHERE table_id = %s;
        """
        self.cursor.execute(command, [tableId])
        self.mdbConnection.commit()
    except Exception as e:
        log.warning("Release start list node failed!" + str(e))
        self.mdbConnection.rollback()
    finally:
        DataBase.releaseLock()
def getAccounts(self, number):
    """Get a list of RenrenAccount.

    Reads up to ``number`` valid accounts that are not in use and were last
    used at least one day ago (least recently used first), marks them as
    in use and writes a USE row per account to the log table.

    Args:
        number: the maximum number of accounts to reserve.

    Returns:
        [RenrenAccount] The reserved accounts. The list is empty when no
        account qualifies or when reserving them failed.
    """
    RenrenAccountPool.acquireLock()
    accounts = []
    try:
        selectCommand = """
            SELECT username, password FROM RenrenAccounts
            WHERE is_using = 0 AND
                is_valid = 1 AND
                last_used_time <= DATE_SUB(NOW(), INTERVAL 1 DAY)
            ORDER BY last_used_time ASC
            LIMIT %s;
        """
        self.cursor.execute(selectCommand, [number])
        rows = self.cursor.fetchall()
        for row in rows:
            accounts.append(RenrenAccount(row[0], row[1], self))
        updateCommand = """
            UPDATE RenrenAccounts
            SET is_using = 1
            WHERE username = %s AND password = %s;
        """
        insertCommand = """
            INSERT INTO RenrenAccountsLog (
                username, password, event)
            VALUES( %s, %s, %s);
        """
        for account in accounts:
            self.cursor.execute(
                updateCommand, [account.username, account.password])
            self.cursor.execute(insertCommand, [
                account.username, account.password,
                RenrenAccountLogEvent.USE])
        self.mdbConnection.commit()
    except Exception as e:
        log.warning(
            "RenrenAccountPool: set is_using = True failed! " + str(e))
        self.mdbConnection.rollback()
        # The reservation was rolled back, so hand out nothing rather than
        # accounts that are not marked as in use.
        accounts = []
    finally:
        # NOTE(review): assumes RenrenAccountPool.releaseLock() is the
        # counterpart of acquireLock() -- confirm.
        RenrenAccountPool.releaseLock()
    return accounts
def getStatus(self, id):
    """Look up the status stored for a person id.

    Returns:
        {Status} the integer status of the first matching row, or
        ``Status.unrecorded`` when no row has this id. More than one match
        is logged as a warning.
    """
    DataBase.acquireLock()
    try:
        query = "SELECT status FROM Persons WHERE id = %s;"
        self.cursor.execute(query, [id.encode('utf-8')])
        matches = self.cursor.fetchall()
        matchCount = len(matches)
        if matchCount == 0:
            return Status.unrecorded
        if matchCount > 1:
            log.warning("Mutiple result for id: " + id)
        return matches[0][0]
    finally:
        DataBase.releaseLock()
def getAllCreatedTime(self):
    """Return every create_time stored in the Persons table.

    Returns:
        [time] The create_time values, one per row. The list is empty when
        the table is empty or when the query failed.
    """
    ReadOnlyDataStore.acquireLock()
    times = []
    try:
        command = """
            SELECT create_time FROM Persons;
        """
        self.cursor.execute(command)
        rows = self.cursor.fetchall()
        for row in rows:
            times.append(row[0])
    except Exception as e:
        log.warning("Get all profiles failed!" + str(e))
    finally:
        # NOTE(review): assumes ReadOnlyDataStore.releaseLock() is the
        # counterpart of acquireLock() -- confirm.
        ReadOnlyDataStore.releaseLock()
    return times