示例#1
0
    def get100KProfiles(self):
        """Fetch the next page of profiles from the Persons table.

        Every call selects at most ``self.step`` rows starting at
        ``self.offset`` and then advances ``self.offset`` by the number of
        rows fetched, so repeated calls walk through the table page by page.
        Once no rows are left, the collected list stays empty.

        NOTE(review): the method name suggests a page of 100,000 rows, but
        the real page size is whatever ``self.step`` holds -- confirm.

        Returns: [Profile] The profile list (the return statement itself is
            not part of the excerpt visible here).
        """
        # NOTE(review): no matching lock release is visible in this excerpt.
        ReadOnlyDataStore.acquireLock()
        profiles = []
        try:
            # NOTE(review): there is no ORDER BY, so the row order across
            # successive LIMIT/OFFSET pages is left to the database engine.
            command = """
                SELECT id, name, gender, hometown,
                   residence, birthday,
                   visitor_number, friend_number,
                   recent_visitor_number, home_page_friend_number
                FROM Persons
                LIMIT %s OFFSET %s;
                """
            self.cursor.execute(command, [self.step, self.offset])
            rows = self.cursor.fetchall()
            # Advance by what was actually read, not by the requested size.
            self.offset += len(rows)
            if len(rows) > 0:
                for row in rows:
                    profiles.append(self.convertToProfile(row))
        except Exception, e:
            # A failure is only logged; ``profiles`` keeps what was converted
            # before the error.
            log.warning("Get 100K profiles failed!" + str(e))
示例#2
0
    def getAgentWithAccount(self):
        """Log in with an account taken from the pool.

        Makes up to ``self.FAIL_LOGIN_ACCOUNT_LIMIT`` attempts. Each attempt
        counts against ``self.ACCOUNTS_LIMIT`` through ``self.accountUsed``,
        takes a fresh account from ``self.renrenAccountPool`` and tries to
        log in through ``self.proxy``. The loop stops at the first successful
        login, when the pool has no account left, or when the account credit
        is exhausted.

        Accounts whose login failed are reported as invalid when a later
        attempt succeeded; when no attempt succeeded they are only released
        with ``finishUsing()``.

        Returns:
            (agent, account): the logged-in agent and the account it uses.
            (None, None): if no login succeeded.
        """
        pool = self.renrenAccountPool
        loginFailAccounts = []

        account = None
        agent = None
        for _ in range(self.FAIL_LOGIN_ACCOUNT_LIMIT):
            if self.accountUsed >= self.ACCOUNTS_LIMIT:
                # Run out of accounts credit.
                break
            self.accountUsed += 1
            account = pool.getAccount()
            if not account:
                # No available account in the database.
                break
            agent = RenrenAgent(account, self.proxy)
            agent.login()
            time.sleep(1)
            if agent.isLogin:
                # Login success.
                break
            log.warning('Thread %s login fail.' % self.threadId)
            loginFailAccounts.append(account)

        if agent and agent.isLogin:
            # The loop variable must not be called ``account``: a for-loop
            # target stays bound after the loop, so reusing that name made
            # this method return the last *failed* account.
            for failedAccount in loginFailAccounts:
                failedAccount.reportInvalidAccount(
                    RenrenAccountErrorCode.ERROR_WHEN_LOGIN)
            return agent, account

        # Nothing worked: release the tried accounts without marking them
        # invalid.
        for failedAccount in loginFailAccounts:
            failedAccount.finishUsing()
        return None, None
示例#3
0
    def getStartNodes(self, number=1):
        """Reserve start nodes for one or several crawl threads.

        Selects up to ``number`` StartList rows that are not in use, least
        recently modified first, flags every selected row with is_using = 1
        and commits.

        Args:
            number: the number of nodes you would like to get.

        Returns:
            ((id, tableId)): The selected rows as returned by the cursor;
                each row holds the columns in SELECT order, i.e. id first
                and table_id second. An empty tuple if the operation failed.
        """
        # NOTE(review): no matching lock release is visible in this excerpt.
        DataBase.acquireLock()
        try:
            selectCommand = """
                SELECT id, table_id FROM StartList
                WHERE is_using = 0
                ORDER BY last_modified ASC
                LIMIT %s;
            """
            self.cursor.execute(selectCommand, [number])
            rows = self.cursor.fetchall()

            updateCommand = """
                UPDATE StartList SET is_using = 1
                WHERE table_id = %s;
            """
            for row in rows:
                # row[1] is table_id (second SELECT column).
                self.cursor.execute(updateCommand, [row[1]])
            self.mdbConnection.commit()
            return rows
        except Exception, e:
            log.warning("Get start node failed!" + str(e))
            # Undo any is_using flags set before the failure.
            self.mdbConnection.rollback()
            return ()
示例#4
0
 def login(self):
     """Post the login form to ``self.loginUrl``.

     Builds a URL-encoded form from ``self.username`` / ``self.password``
     and sends it through ``self.opener`` with a timeout of
     ``self.TIME_OUT``. The handling of the response is not part of the
     excerpt visible here.

     Returns:
         ({}, ErrorCode.URL_ERROR) if the request raises urllib2.URLError.
     """
     # Precondition: the account must not be logged in yet.
     # NOTE(review): assert statements are skipped when Python runs with -O.
     assert self.account.isLogin == False
     loginData={
             'email': self.username,
             'password': self.password,
             'originUrl': '',
             'formName': '',
             'method': '',
             'isplogin': '******',
             'submit': '登陆'
     }
     postData = urlencode(loginData)
     # Passing a data argument makes this a POST request.
     req = urllib2.Request(self.loginUrl, postData)
     # NOTE(review): both branches write the plain-text password to the log.
     if self.proxy:
         log.info('Try to login with proxy: (%s, %s) %s %s ====' %\
             (self.username, self.password, self.proxy.protocol,
                 self.proxy.getProxyString())) 
     else:
         log.info('Try to login: (%s, %s) ====' %\
             (self.username, self.password))
     try:
         response = self.opener.open(req, timeout=self.TIME_OUT)
     except urllib2.URLError, e:
         #print 'Login error: ' + e.reason
         log.warning('Login fail when requesting: ' + str(e.reason))
         return ({}, ErrorCode.URL_ERROR)
示例#5
0
    def startCrawling(self):
        """Crawl start nodes.

        Reads the start URL, obtains a logged-in agent through
        ``self.getAgentWithAccount()``, loads the share page info for the
        URL, then calls ``self.expandSharePage(info, agent)`` and
        ``self.handleUserList()``. If ``self.shareList`` is non-empty
        afterwards, its last element is written back as the new start URL.

        Returns:
            True if success, otherwise False. Every exception raised inside
            the try body is logged as a warning and turned into False.
        """
        # Bound up front so the name exists even if the try body fails
        # before the assignment below.
        account = None
        try:
            url = self.getStartUrl()
            agent, account = self.getAgentWithAccount()
            if not agent:
                raise Exception('No account to crawl start nodes.')

            info = agent.getSharePageInfo(url)
            if not info:
                raise Exception('Get start share url fail.')
            # Short pause after the first request.
            time.sleep(0.8)

            # One request (the start share page) has been made so far.
            self.requestCount = 1
            self.expandSharePage(info, agent)
            self.handleUserList()
            # Update the new share url
            if self.shareList:
                self.writeStartUrl(self.shareList[-1])
            return True
        except Exception, e:
            log.warning('Crawl start node fail: %s' % str(e))
            return False
示例#6
0
    def getAgentWithAccount(self):
        """Log in a ShareAgent with an account taken from the pool.

        Makes up to five attempts. Each attempt takes a fresh account from
        ``self.renrenAccountPool`` and tries to log in with it; the loop
        stops at the first successful login or when the pool has no account
        left.

        Accounts whose login failed are reported as invalid when a later
        attempt succeeded; when no attempt succeeded they are only released
        with ``finishUsing()``.

        Returns:
            (agent, account): the logged-in agent and the account it uses.
            (None, None): if no login succeeded.
        """
        loginFailLimit = 5
        pool = self.renrenAccountPool
        loginFailAccounts = []

        account = None
        agent = None
        for _ in range(loginFailLimit):
            account = pool.getAccount()
            if not account:
                # No available account in the database.
                break
            agent = ShareAgent(account)
            agent.login()
            time.sleep(1)
            if agent.isLogin:
                # Login success.
                break
            log.warning('Start node crawler login fail.')
            loginFailAccounts.append(account)

        if agent and agent.isLogin:
            # The loop variable must not be called ``account``: a for-loop
            # target stays bound after the loop, so reusing that name made
            # this method return the last *failed* account.
            for failedAccount in loginFailAccounts:
                failedAccount.reportInvalidAccount(
                    RenrenAccountErrorCode.ERROR_WHEN_LOGIN)
            return agent, account

        # Nothing worked: release the tried accounts without marking them
        # invalid.
        for failedAccount in loginFailAccounts:
            failedAccount.finishUsing()
        return None, None
示例#7
0
def saveInvalidAccount(pool):
    """Try to log in again with every account currently marked invalid.

    Reads all RenrenAccounts rows with is_valid = 0, most recently used
    first, and attempts a login for each one. Accounts for which
    ``pool.onceSaveFail(username, password)`` returns a true value are
    skipped. What happens with the login result is not part of the excerpt
    visible here.

    Args:
        pool: object exposing ``cursor`` and ``onceSaveFail``
            (presumably a RenrenAccountPool -- confirm).
    """
    command = """
        SELECT username, password FROM RenrenAccounts
        WHERE is_valid = 0
        ORDER BY last_used_time DESC;
    """
    pool.cursor.execute(command)
    rows = pool.cursor.fetchall()
    log.info('Total InValid account:' + str(len(rows)))
    # Not incremented within the visible part of this function.
    failCount = 0

    for row in rows:
        username = row[0]
        password = row[1]
        saveSuccess = False
        if pool.onceSaveFail(username, password):
            continue
        # Give up on the remaining accounts after more than 100 failures.
        if failCount > 100:
            break;
        try:
            # Two-second pause before every login attempt.
            time.sleep(2)
            agent = RenrenAgent(username, password)
            agent.login()
            saveSuccess = agent.isLogin
        except Exception, e:
            log.warning('Save login fail:  ' + str(e))
        finally:
示例#8
0
    def insertProxy(self, proxy):
        """Insert a proxy to database.

        Writes one row into Proxies built from the proxy's address, port,
        protocol, info, source and test statistics. The insert is committed
        on success and rolled back on any exception, which is logged as a
        warning.

        Args:
            proxy: object exposing addr, port, protocol, info, source,
                testCount, successCount and averageTime.
        """
        # NOTE(review): no matching lock release is visible in this excerpt.
        ProxyPool.acquireLock()
        try:
            command = """
                INSERT INTO Proxies (
                    address, port, protocol,
                    info, source,
                    test_count, success_count, average_time
                ) VALUES (
                    %s, %s, %s,
                    %s, %s,
                    %s, %s, %s
                );
            """
            # addr and port are encoded unconditionally, so both must be
            # present (presumably unicode strings -- confirm).
            address = proxy.addr.encode('utf-8')
            port = proxy.port.encode('utf-8')
            # A missing protocol falls back to http.
            protocol = proxy.protocol.encode('utf-8') if proxy.protocol\
                else u'http'

            # Optional fields are bound as None when empty.
            info = proxy.info.encode('utf-8') if proxy.info else None
            source = proxy.source.encode('utf-8') if proxy.source else None

            self.cursor.execute(command, (
                address, port, protocol,
                info, source,
                proxy.testCount, proxy.successCount, proxy.averageTime))
            
            self.mdbConnection.commit()
            # Outcome flag; not read within the visible part.
            success = True
        except Exception, e:
            log.warning('Proxy pool insert fail >>>>> ' + str(e))
            self.mdbConnection.rollback()
            success = False
示例#9
0
    def _getRankNeighbors(self, rank, arrayName):
        """Get the keys whose rank lies in a window centred on ``rank``.

        The window reaches NEIGHBOR_COUNT // 2 ranks below and above
        ``rank`` (the lower bound is clamped to 1) and includes ``rank``
        itself.

        Args:
            rank: the rank the window is centred on.
            arrayName: name of the rank table. It is interpolated into the
                SQL text, so it must be a trusted identifier and never
                user-supplied input.

        Returns: [key...] A list of the neighbor keys, ordered by ascending
            rank.
        """
        NEIGHBOR_COUNT = 7
        # Floor division keeps the window bounds integral on every Python
        # version.
        halfWindow = NEIGHBOR_COUNT // 2
        AnalysedDataBase._acquireLock()
        try:
            # Inside the try block so the lock is released even when the
            # ping (or the reconnect it may trigger) raises.
            self.pingServer()
            # Only the table name is filled in here; the two bounds stay as
            # %s placeholders for the database driver.
            command = """SELECT s_key FROM %s
                WHERE rank BETWEEN %s AND %s
                ORDER BY rank ASC;
            """ % (arrayName, '%s', '%s')
            # BETWEEN is inclusive at both ends.
            bottom = max(rank - halfWindow, 1)
            upper = rank + halfWindow

            self.cursor.execute(command, [bottom, upper])
            rows = self.cursor.fetchall()
            array = [x[0] for x in rows]
            if not len(array) or len(array) > NEIGHBOR_COUNT:
                log.warning('Wrong result in rank neighbors, rank %s' % rank)
            return array
        finally:
            AnalysedDataBase._releaseLock()
示例#10
0
 def pingServer(self):
     """Ping the MySQL server unless it was pinged recently.

     Does nothing while no more than ``self.PING_INTERVAL`` seconds have
     passed since ``self.lastPing``. Otherwise records the current time as
     the last ping and pings the connection; if the ping raises, the
     failure is logged and ``self._init()`` is called.
     """
     sinceLastPing = time.time() - self.lastPing
     if sinceLastPing <= self.PING_INTERVAL:
         return

     log.info('Ping Mysql Server.')
     self.lastPing = time.time()
     try:
         self.mdbConnection.ping()
     except Exception as e:
         log.warning('Mysql server gone: %s' % e)
         self._init()
示例#11
0
 def reportInvalidAccount(self, account, errorCode, errorInfo=None):
     """Update database when a RenrenAccount becomes invalid.

     Sets is_using = 0 and is_valid = 0 for the account, adds the account's
     login and request counts to the stored totals, stores the error code
     and error information, and inserts a BECOME_INVALID row into the log
     table. Both statements are committed together and rolled back on any
     exception.

     Args:
         account: object exposing username, password, isLogin and
             requestCount.
         errorCode: value stored in error_code; also selects the default
             error text.
         errorInfo: optional error text. When empty, a text is derived from
             errorCode.
     """
     # NOTE(review): no matching lock release is visible in this excerpt.
     RenrenAccountPool.acquireLock()
     try:
         updateCommand = """
             UPDATE RenrenAccounts
             SET is_using = 0,
                 is_valid = 0,
                 login_count = %s + login_count,
                 request_count = %s + request_count,
                 last_used_time = NOW(),
                 become_invalid_time = NOW(),
                 error_code = %s,
                 error_info = %s
             WHERE username = %s AND password = %s;
         """
         insertCommand = """
             INSERT INTO RenrenAccountsLog (
                 username, password, event, is_login, request_count) VALUES(
                 %s, %s, %s, %s, %s);
         """
         # A session contributes at most one login to the stored total.
         loginCount = 1 if account.isLogin else 0
         if not errorInfo:
             if errorCode == RenrenAccountErrorCode.ERROR_WHEN_LOGIN:
                 errorInfo = "Get error when login."
             elif errorCode == RenrenAccountErrorCode.ERROR_WHEN_REQUEST: 
                 errorInfo = "Get error when making request."
             else:
                 errorInfo = "Unknown error."
         self.cursor.execute(
             updateCommand, [
                 loginCount,
                 account.requestCount,
                 errorCode,
                 errorInfo,
                 account.username,
                 account.password]);
         self.cursor.execute(
             insertCommand, [
                 account.username,
                 account.password,
                 RenrenAccountLogEvent.BECOME_INVALID,
                 loginCount,
                 account.requestCount]);
         self.mdbConnection.commit()
     except Exception, e:
         # NOTE(review): the username/password operands below appear masked
         # ("******") in this copy and the expression is not valid Python as
         # shown -- restore it from the original source.
         log.warning(
             "RenrenAccountPool: report invalid failed! " +\
             "username: "******"  " +\
             "password: "******"  " + str(e))
         self.mdbConnection.rollback()
示例#12
0
 def run(self):
     """Download ``self.url``, parse the proxies on it and import them.

     The opened response is handed to ``self.parser.parse`` together with
     the URL. Every parsed proxy is passed to ``self.importer.addProxy``
     and then written to the debug log. Exceptions derived from Exception
     are caught and logged as a warning instead of being propagated.
     """
     try:
         opener = urllib2.build_opener()
         response = opener.open(self.url, timeout=10)
         try:
             proxies = self.parser.parse(response, self.url)
         finally:
             # Release the HTTP connection whether or not parsing succeeded.
             response.close()
         for proxy in proxies:
             self.importer.addProxy(proxy)
         log.debug('Crawl proxies from ' + str(self.url) + ':')
         for proxy in proxies:
             log.debug('>>>>>' + proxy.getAllString())
     except Exception as e:
         log.warning('Crawling thread exception: ' + str(e))
示例#13
0
 def deleteFromStartList(self, id):
     """Delete a node from start list.

     Removes the StartList rows whose id equals ``id``. The statement is
     committed on success and rolled back on any exception, which is logged
     as a warning.

     Args:
         id: node id; it is UTF-8 encoded before being bound to the query.
     """
     # NOTE(review): no matching lock release is visible in this excerpt.
     DataBase.acquireLock()
     try:
         command = "DELETE FROM StartList WHERE id = %s;"
         self.cursor.execute(command, [id.encode('utf-8')])
         self.mdbConnection.commit()
         # Outcome flag (spelled ``sucess``); not read within the visible
         # part.
         sucess = True
     except Exception, e:
         log.warning("Delete from start list failed!" + str(e))
         self.mdbConnection.rollback()
         sucess = False
示例#14
0
 def releaseAllStartNode(self):
     """Release all start nodes.

     Sets is_using = 0 on every StartList row. The update is committed on
     success and rolled back on any exception, which is logged as a
     warning.
     """
     # NOTE(review): no matching lock release is visible in this excerpt.
     DataBase.acquireLock()
     try:
         # No WHERE clause: every row is affected.
         command = """
             UPDATE StartList SET is_using = 0;
         """
         self.cursor.execute(command)
         self.mdbConnection.commit()
     except Exception, e:
         log.warning("Release all start list node failed!" + str(e))
         self.mdbConnection.rollback()
示例#15
0
 def clearAllStartNode(self):
     """Delete all start nodes in the table.

     Removes every row from StartList. The delete is committed on success
     and rolled back on any exception, which is logged as a warning.
     """
     # NOTE(review): no matching lock release is visible in this excerpt.
     DataBase.acquireLock()
     try:
         # No WHERE clause: the whole table is emptied.
         command = """
             DELETE FROM StartList;
         """
         self.cursor.execute(command)
         self.mdbConnection.commit()
     except Exception, e:
         log.warning("Clear start list node failed!" + str(e))
         self.mdbConnection.rollback()
示例#16
0
    def finishUsing(self, account):
        """Update database when a RenrenAccount is no longer being used.

        Sets is_using = 0 for the account. If the account logged in or made
        at least one request, its login and request counts are added to the
        stored totals and last_used_time is refreshed; otherwise only the
        is_using flag is cleared. In both cases a FINISH_USE row is inserted
        into the log table. The statements are committed together and rolled
        back on any exception.

        Args:
            account: object exposing username, password, isLogin and
                requestCount.
        """
        # NOTE(review): no matching lock release is visible in this excerpt.
        RenrenAccountPool.acquireLock()
        try:
            updateCommand = """
                UPDATE RenrenAccounts
                SET is_using = 0,
                    login_count = %s + login_count,
                    request_count = %s + request_count,
                    last_used_time = NOW()
                WHERE username = %s AND password = %s;
            """
            # Variant for an account that was taken but never used: it
            # leaves the counters and last_used_time untouched.
            updateCommandNoUse = """
                UPDATE RenrenAccounts
                SET is_using = 0
                WHERE username = %s AND password = %s;
            """
            insertCommand = """
                INSERT INTO RenrenAccountsLog (
                    username, password, event, is_login, request_count) VALUES(
                    %s, %s, %s, %s, %s);
            """
            # A session contributes at most one login to the stored total.
            loginCount = 1 if account.isLogin else 0
            if not account.isLogin and account.requestCount == 0:
                self.cursor.execute(
                    updateCommandNoUse, [
                        account.username,
                        account.password]);
            else:
                self.cursor.execute(
                    updateCommand, [
                        loginCount,
                        account.requestCount,
                        account.username,
                        account.password]);
            self.cursor.execute(
                insertCommand, [
                    account.username,
                    account.password,
                    RenrenAccountLogEvent.FINISH_USE,
                    loginCount,
                    account.requestCount]);
            self.mdbConnection.commit()
        except Exception, e:
            # NOTE(review): the username/password operands below appear
            # masked ("******") in this copy and the expression is not valid
            # Python as shown -- restore it from the original source.
            log.warning(
                "RenrenAccountPool: finish use failed! " +\
                "username: "******"  " +\
                "password: "******"  " + str(e))
            self.mdbConnection.rollback()
示例#17
0
    def saveAccount(self, username, password, successSave, time=None):
        """Record the outcome of an attempt to re-validate an account.

        When ``successSave`` is true, the account row is set to
        is_using = 0 and is_valid = 1, and last_used_time becomes ``time``
        if given, otherwise NOW(). Whether or not the attempt succeeded, a
        log row with event SAVE_ACCOUNT_SUCCESS or SAVE_ACCOUNT_FAIL is
        inserted. The statements are committed together and rolled back on
        any exception.

        Args:
            username: account user name.
            password: account password.
            successSave: whether the re-validation succeeded.
            time: optional value for last_used_time. Note that inside this
                method the name shadows any module-level ``time``.
        """
        # NOTE(review): no matching lock release is visible in this excerpt.
        RenrenAccountPool.acquireLock()
        try:
            accountCommandNoTime = """
                UPDATE RenrenAccounts
                SET is_using = 0,
                    is_valid = 1,
                    last_used_time = NOW()
                WHERE username = %s AND password = %s;
            """
            accountCommandWithTime = """
                UPDATE RenrenAccounts
                SET is_using = 0,
                    is_valid = 1,
                    last_used_time = %s
                WHERE username = %s AND password = %s;
            """
            logCommand = """
                INSERT INTO RenrenAccountsLog (
                    username, password, event
                ) VALUES (%s, %s, %s);
            """
            # The account row is only touched after a successful attempt.
            if successSave:
                if time:
                    self.cursor.execute(
                        accountCommandWithTime, [
                            time,
                            username,
                            password]);
                else:
                    self.cursor.execute(
                        accountCommandNoTime, [
                            username,
                            password]);

            # The log row is written for both outcomes.
            event = RenrenAccountLogEvent.SAVE_ACCOUNT_SUCCESS if successSave\
                else RenrenAccountLogEvent.SAVE_ACCOUNT_FAIL
            self.cursor.execute(
                logCommand, [
                    username,
                    password,
                    event]);
            self.mdbConnection.commit()
            # Outcome flag; not read within the visible part.
            success = True
        except Exception, e:
            # NOTE(review): the username/password operands below appear
            # masked ("******") in this copy and the expression is not valid
            # Python as shown -- restore it from the original source.
            log.warning(
                "RenrenAccountPool: save account operation failed! " +\
                "username: "******"  " +\
                "password: "******"  " + str(e))
            self.mdbConnection.rollback()
            success = False
示例#18
0
 def parseOldProfilePageWithoutAccess(self, document):
     """Extract user information from an old-style profile page.

     Args:
         document: parsed page exposing ``find`` (presumably a
             BeautifulSoup document -- confirm).

     Returns: {UserInfo} the user information, or None if the name or the
         visit count cannot be read from the page.
     """
     info = UserInfo()
     # Name & visitedNum
     try:
         # Name: text of the <h3> inside <div class="add-guide">.
         info.name = document.find('div', class_='add-guide').h3.string
         # Visit count: a span.count nested in another span.count inside
         # <div class="status-holder">.
         spanTag = document.find(
             'div', class_='status-holder').find(
             'span', class_='count').find('span', class_='count')
         info.visitedNum = int(spanTag.string)
     except Exception, e:
         # A missing element or a non-numeric count ends up here.
         log.warning('Shit happen in parse old without access: ' + str(e))
         return None  # Lack of critical information
示例#19
0
 def parse(self, html, source):
     """Collect every proxy found in the first <p> element of a page.

     Each stripped string of that element is handed to ``Proxy.parse``
     together with ``source``; results that evaluate as false are dropped.
     When the page has no <p> element a warning is logged.

     Returns: the list of parsed proxies, empty if none were found.
     """
     paragraph = BeautifulSoup(html).find('p')
     if not paragraph:
         log.warning('Can not find content element.')
         return []

     candidates = (
         Proxy.parse(text, source) for text in paragraph.stripped_strings)
     return [candidate for candidate in candidates if candidate]
示例#20
0
 def insertIntoStartList(self, id, opt_lastModified=None):
     """Insert a node into start list.

     The insert is committed on success and rolled back on any exception,
     which is logged as a warning.

     Args:
         id: node id; it is UTF-8 encoded before being bound to the query.
         opt_lastModified: value for the last_modified column; None is
             bound as-is when the argument is omitted.
     """
     # NOTE(review): no matching lock release is visible in this excerpt.
     DataBase.acquireLock()
     try:
         command = \
             "INSERT INTO StartList (id, last_modified) VALUES(%s, %s);"
         self.cursor.execute(command, [id.encode('utf-8'), opt_lastModified])
         self.mdbConnection.commit()
         # Outcome flag (spelled ``sucess``); not read within the visible
         # part.
         sucess = True
     except Exception, e:
         log.warning("Insert into start list failed!" + str(e))
         self.mdbConnection.rollback()
         sucess = False
示例#21
0
 def replaceStartNode(self, originId, newId):
     """Replace the id of an existing start node with a new id.

     The update is committed on success and rolled back on any exception,
     which is logged as a warning.

     Args:
         originId: id currently stored in StartList.
         newId: id to store instead.
         Both are UTF-8 encoded before being bound to the query.
     """
     # NOTE(review): no matching lock release is visible in this excerpt.
     DataBase.acquireLock()
     try:
         command = "UPDATE StartList SET id = %s WHERE id = %s;"
         # Placeholder order: new id first (SET), old id second (WHERE).
         self.cursor.execute(command, [
             newId.encode('utf-8'),
             originId.encode('utf-8')])
         self.mdbConnection.commit()
         # Outcome flag (spelled ``sucess``); not read within the visible
         # part.
         sucess = True
     except Exception, e:
         log.warning("Replace start list node failed!" + str(e))
         self.mdbConnection.rollback()
         sucess = False
示例#22
0
    def addRecord(self, id, userInfo, opt_referenceId=None):
        """Insert a person into database, provided user id and userInfo
            from crawler.

        Writes one Persons row with status ``Status.unexpanded`` plus one
        RecentVisitors row per recent visitor and one HomePageFriends row
        per home page friend. All inserts are committed together and rolled
        back on any exception, which is logged as a warning.

        Args:
            id: user id; UTF-8 encoded for the Persons row.
            userInfo: crawled information, split by ``convert`` into a
                profile and a connection object.
            opt_referenceId: optional reference id; stored as NULL when
                empty.

        Returns:
            UserNode: the user node convert from the userInfo.
            None: if the operation failed.
            (The return statements are not part of the excerpt visible
            here.)
        """
        # NOTE(review): no matching lock release is visible in this excerpt.
        DataBase.acquireLock()
        try:
            personsCommand = "INSERT INTO Persons (" +\
                "id, status, " +\
                "name, gender, hometown, " +\
                "residence, birthday, " +\
                "visitor_number, friend_number, " +\
                "recent_visitor_number, home_page_friend_number, " +\
                "create_time, reference_id) " +\
                "VALUES(%s, %s, " +\
                "%s, %s, %s, " +\
                "%s, %s, " +\
                "%s, %s, " +\
                "%s, %s, NOW(), %s);"
            visitorsCommand = "INSERT INTO RecentVisitors (" +\
                "id, visitor) VALUES(%s, %s);"
            friendsCommand = "INSERT INTO HomePageFriends (" +\
                "id, friend) VALUES(%s, %s);"
            profile, connection = convert(userInfo)
            # name is encoded unconditionally, so it must be present; the
            # other text fields are bound as None when empty.
            self.cursor.execute(personsCommand, (
                id.encode('utf-8'), Status.unexpanded,
                profile.name.encode('utf-8'),
                profile.gender if profile.gender else None,
                profile.hometown.encode('utf-8') \
                    if profile.hometown else None,
                profile.residence.encode('utf-8') \
                    if profile.residence else None,
                profile.birthday.encode('utf-8') \
                    if profile.birthday else None,
                profile.visitorNum, profile.friendNum,
                profile.recentVisitorNum, profile.homePageFriendNum,
                opt_referenceId.encode('utf-8') if opt_referenceId else None))
            # The relation rows bind ``id`` and the related id as given,
            # without the explicit encoding used above.
            for visitor in connection.recentVisitorList:
                self.cursor.execute(visitorsCommand, (id, visitor))
            for friend in connection.homePageFriendList:
                self.cursor.execute(friendsCommand, (id, friend))
            self.mdbConnection.commit()
            # Outcome flag (spelled ``sucess``); not read within the visible
            # part.
            sucess = True
        except Exception, e:
            log.warning("Add Record Failed! ("+ str(id) + ") " + str(e))
            self.mdbConnection.rollback()
            sucess = False
示例#23
0
 def deleteAllProxies(self):
     """Delete all proxies from database.

     Removes every row from Proxies. The delete is committed on success and
     rolled back on any exception, which is logged as a warning.
     """
     # NOTE(review): no matching lock release is visible in this excerpt.
     ProxyPool.acquireLock()
     try:
         # No WHERE clause: the whole table is emptied.
         command = """
             DELETE FROM Proxies;
         """
         self.cursor.execute(command)
         self.mdbConnection.commit()
         # Outcome flag; not read within the visible part.
         success = True
     except Exception, e:
         log.warning('Proxies pool delete all fail >>>>> ' + str(e))
         self.mdbConnection.rollback()
         success = False
示例#24
0
 def parseTimelineProfilePage(self, document):
     """Extract user information from a timeline-style profile page.

     Args:
         document: parsed page exposing ``find`` (presumably a
             BeautifulSoup document -- confirm).

     Returns: {UserInfo} the user information, or None if the name or the
         visit count cannot be read from the page.
     """
     info = UserInfo()
     # Name & visitedNum
     nameNode = document.find('h1', class_='avatar_title')
     try:
         # Name: first stripped string of <h1 class="avatar_title">.
         info.name = nameNode.stripped_strings.next()
         # Visit count: first run of digits in the first string of the
         # heading's <span>.
         visitedString = nameNode.span.strings.next()
         pat = re.compile('[^\d]*(\d+)[^\d]*',)
         mat = pat.match(visitedString)
         info.visitedNum = int(mat.group(1))
     except Exception, e:
         # A missing heading/span or a string without digits ends up here.
         log.warning('Shit happen in parse time line page: ' + str(e))
         return None  # Lack of critical information
示例#25
0
 def needMoreStartNode(self):
     """Return whether we need more start nodes.

     Counts the StartList rows that are not in use and returns True when
     that count is below ``self.MIN_START_NODE_COUNT``. No other return
     statement is part of the excerpt visible here.
     """
     # NOTE(review): no matching lock release is visible in this excerpt.
     DataBase.acquireLock()
     try:
         command = """
             SELECT COUNT(*) FROM StartList WHERE is_using = FALSE;
         """
         self.cursor.execute(command)
         rows = self.cursor.fetchall()
         # COUNT(*) yields a single row with a single column.
         count = rows[0][0]
         if count < self.MIN_START_NODE_COUNT:
             return True
     except Exception, e:
         log.warning("Get start node count fail!" + str(e))
         self.mdbConnection.rollback()
示例#26
0
 def getSharePageInfo(self, url):
     """Get share page info.

     Opens ``url`` through ``self.opener`` with a timeout of
     ``self.TIME_OUT`` and reads both the final URL and the page body. The
     processing of the page is not part of the excerpt visible here.

     Args:
         url: address of the share page.

     Returns:
         None if error happen.
         SharePageInfo

     Raises:
         Exception: if ``self.isLogin`` is false.
     """
     if not self.isLogin:
         raise Exception("Account is not login")
     try:
         response = self.opener.open(url, timeout=self.TIME_OUT)
         # URL of the response actually retrieved, as reported by geturl().
         realUrl = response.geturl()
         html = response.read()
     except Exception, e:
         log.warning("Get share url error: " + str(e) + ". Share url: " + url)
         return None
示例#27
0
    def releaseStartNode(self, tableId):
        """Release a start node by a table id.

        Sets is_using = 0 on the StartList row with the given table_id. The
        update is committed on success and rolled back on any exception,
        which is logged as a warning.

        Args:
            tableId: the id of the table row.
        """
        # NOTE(review): no matching lock release is visible in this excerpt.
        DataBase.acquireLock()
        try:
            command = """
                UPDATE StartList SET is_using = 0 WHERE table_id = %s;
            """
            self.cursor.execute(command, [tableId])
            self.mdbConnection.commit()
        except Exception, e:
            log.warning("Release start list node failed!" + str(e))
            self.mdbConnection.rollback()
示例#28
0
    def getAccounts(self, number):
        """Get a list of RenrenAccount.

        Reads up to ``number`` accounts that are valid, not in use and were
        last used at least one day ago (least recently used first), marks
        each of them as in use and writes a USE row to the log table.

        Args:
            number: maximum number of accounts to select.

        Returns:
            Presumably the ``accounts`` list -- the return statement is not
            part of the excerpt visible here.
        """
        # NOTE(review): no matching lock release is visible in this excerpt.
        RenrenAccountPool.acquireLock()
        accounts = []
        try:
            selectCommand = """
                SELECT username, password FROM RenrenAccounts
                WHERE is_using = 0 AND
                    is_valid = 1 AND
                    last_used_time <= DATE_SUB(NOW(), INTERVAL 1 DAY)
                ORDER BY last_used_time ASC
                LIMIT %s;
            """
            self.cursor.execute(selectCommand, [number])
            rows = self.cursor.fetchall()
            for row in rows:
                # row = (username, password); the pool itself is passed as
                # the third constructor argument.
                accounts.append(RenrenAccount(row[0], row[1], self))

            updateCommand = """
                UPDATE RenrenAccounts
                SET is_using = 1
                WHERE username = %s AND password = %s;
            """
            insertCommand = """
                INSERT INTO RenrenAccountsLog (
                    username, password, event) VALUES(
                    %s, %s, %s);
            """
            for account in accounts:
                self.cursor.execute(
                    updateCommand, [account.username, account.password]);
                self.cursor.execute(
                    insertCommand, [
                        account.username,
                        account.password,
                        RenrenAccountLogEvent.USE]);
                # The commit sits inside the loop, so each account is
                # committed separately; a later failure rolls back only the
                # account being processed at that point.
                self.mdbConnection.commit()
        except Exception, e:
            log.warning(
                "RenrenAccountPool: set is_using = True failed! " + str(e))
            self.mdbConnection.rollback()
示例#29
0
    def getStatus(self, id):
        """Look up the stored status of a person.

        Args:
            id: person id; it is UTF-8 encoded before being bound to the
                query.

        Returns: {Status} the status of the first matching row, which is a
            integer, or ``Status.unrecorded`` when no row matches. A warning
            is logged when more than one row matches.
        """
        DataBase.acquireLock()
        try:
            self.cursor.execute(
                "SELECT status FROM Persons WHERE id = %s;",
                [id.encode('utf-8')])
            matches = self.cursor.fetchall()
            if not len(matches):
                return Status.unrecorded
            if len(matches) > 1:
                log.warning("Mutiple result for id: " + id)
            return matches[0][0]
        finally:
            DataBase.releaseLock()
示例#30
0
    def getAllCreatedTime(self):
        """Collect the create_time of every row in the Persons table.

        A failure is only logged as a warning; the list then keeps whatever
        was collected before the error.

        Returns: [time] The create_time list (the return statement itself is
            not part of the excerpt visible here).
        """
        # NOTE(review): no matching lock release is visible in this excerpt.
        ReadOnlyDataStore.acquireLock()
        times = []
        try:
            command = """
                SELECT create_time
                FROM Persons;
                """
            self.cursor.execute(command)
            # fetchall() loads every selected row at once.
            rows = self.cursor.fetchall()
            for row in rows:
                times.append(row[0])
        except Exception, e:
            # NOTE(review): the message mentions profiles although this
            # query reads create_time values.
            log.warning("Get all profiles failed!" + str(e))