Exemplo n.º 1
0
 def __init__(self, dataBase, pool, proxy=None):
     """Initialise the thread and remember its collaborators.

     Args:
         dataBase: stored on the instance as ``dataBase``.
         pool: stored on the instance as ``renrenAccountPool``.
         proxy: stored on the instance as ``proxy``; defaults to None.
     """
     threading.Thread.__init__(self)
     # The id is resolved and logged before the remaining attributes are
     # assigned.
     self.threadId = self.getThreadId()
     log.info('>>>>>>  Create thread %s.  <<<<<<' % self.threadId)
     self.dataBase, self.renrenAccountPool, self.proxy = (
         dataBase, pool, proxy)
Exemplo n.º 2
0
def main():
    """Import start node ids from ``tools/data/start_nodes``.

    The first whitespace-separated token of every non-blank line is used
    as a start node id and passed to ``insertIntoStartList``.  A summary
    with the number of successful and failed inserts is logged at the end.
    """
    log.config(GC.LOG_FILE_DIR + "import_start_nodes", "info", "info")
    importCount = failCount = 0
    dataBase = database.createProdDataBase()
    with open("tools/data/start_nodes") as importedFile:
        for line in importedFile.readlines():
            tokens = line.split()
            if not tokens:
                continue  # Blank line: no id to import.
            nodeId = tokens[0]
            log.info("Importing start node: " + nodeId)
            if dataBase.insertIntoStartList(nodeId):
                importCount += 1
            else:
                failCount += 1

    summary = (
        "Finish importing..........\n"
        "Total imported start nodes number: %s\n"
        "Fail start nodes number: %s"
    ) % (importCount, failCount)
    log.info(summary)
Exemplo n.º 3
0
 def investigate(self):
     """Sort ``allProxies`` by address in place and log one line per proxy.

     Each line reads
     ``<averageTime>    <successCount>/<testCount>    <getAllString()>``.
     """
     self.allProxies.sort(key=lambda candidate: candidate.addr)
     for entry in self.allProxies:
         ratio = str(entry.successCount) + '/' + str(entry.testCount)
         log.info('    '.join(
             [str(entry.averageTime), ratio, entry.getAllString()]))
Exemplo n.º 4
0
def test():
    """Smoke-test the crawler with one account from the production pool.

    Recreates the tables, takes a single account from the production
    account pool, logs in with it and crawls from a fixed start id.  The
    crawler instance is assigned to the module-level ``crawler`` global.
    A ``CrawlerException`` ends the run and its reason is logged.
    """
    global crawler

    log.config(GC.LOG_FILE_DIR + 'crawler_test', 'info', 'info')
    db = createConnection()
    # create / drop / create: presumably guarantees empty tables whether or
    # not they existed before the test started.
    createTables(db)
    dropTables(db)
    createTables(db)

    pool = renrenaccountpool.createProdRenrenAccountPool()
    accounts = pool.getAccounts(1)
    account = accounts[0]

    try:
        crawler = Crawler(db)
        agent = RenrenAgent(account)
        agent.login()
        crawler.setAgent(agent)
        startId = "322601086"
        crawler.crawl(startId, 30)
    except CrawlerException as e:
        # ``as`` works on Python 2.6+ and 3; the old comma form is a
        # SyntaxError on Python 3.
        log.info("Crawler end, reason: " + str(e))
        if e.errorCode == CrawlerErrorCode.DETECT_STOP_SIGNAL:
            print("detect int signal")
            return
Exemplo n.º 5
0
def saveInvalidAccount(pool):
    """Retry logging in with the accounts currently flagged as invalid.

    Reads every (username, password) row with ``is_valid = 0`` from
    ``RenrenAccounts``, ordered by ``last_used_time`` descending, and
    attempts a login for each one that ``pool.onceSaveFail`` does not
    reject.

    NOTE(review): this copy of the function is cut off right after
    ``finally:``.  What is done with ``saveSuccess`` and where
    ``failCount`` is incremented is not visible here.

    Args:
        pool: object exposing ``cursor`` and ``onceSaveFail`` as used below.
    """
    command = """
        SELECT username, password FROM RenrenAccounts
        WHERE is_valid = 0
        ORDER BY last_used_time DESC;
    """
    pool.cursor.execute(command)
    rows = pool.cursor.fetchall()
    log.info('Total InValid account:' + str(len(rows)))
    failCount = 0

    for row in rows:
        username = row[0]
        password = row[1]
        saveSuccess = False
        # Skip accounts that onceSaveFail reports as truthy.
        if pool.onceSaveFail(username, password):
            continue
        # Give up on the remaining rows once more than 100 failures have
        # been counted.
        if failCount > 100:
            break;
        try:
            # Pause two seconds before every login attempt.
            time.sleep(2)
            agent = RenrenAgent(username, password)
            agent.login()
            saveSuccess = agent.isLogin
        except Exception, e:
            # Any error during the attempt is logged; saveSuccess keeps the
            # value it had when the error was raised.
            log.warning('Save login fail:  ' + str(e))
        finally:
Exemplo n.º 6
0
 def login(self):
     """Post the login form to ``self.loginUrl``.

     The form carries ``self.username`` and ``self.password`` and is sent
     through ``self.opener`` with a timeout of ``self.TIME_OUT``.  The
     attempt is logged first, including the proxy details when
     ``self.proxy`` is set.

     Returns:
         ``({}, ErrorCode.URL_ERROR)`` when opening the request raises
         ``urllib2.URLError``.  No other return statement is present in
         this block; on success the response is only bound to a local.
     """
     assert self.account.isLogin == False
     loginData = {
         'email': self.username,
         'password': self.password,
         'originUrl': '',
         'formName': '',
         'method': '',
         'isplogin': '******',
         'submit': '登陆',
     }
     postData = urlencode(loginData)
     req = urllib2.Request(self.loginUrl, postData)
     # NOTE(review): both messages below write the password to the log in
     # clear text -- consider masking it.
     if self.proxy:
         log.info('Try to login with proxy: (%s, %s) %s %s ====' % (
             self.username, self.password, self.proxy.protocol,
             self.proxy.getProxyString()))
     else:
         log.info('Try to login: (%s, %s) ====' % (
             self.username, self.password))
     try:
         response = self.opener.open(req, timeout=self.TIME_OUT)
     except urllib2.URLError as e:
         # ``as`` works on Python 2.6+ and 3; the old comma form is a
         # SyntaxError on Python 3.
         log.warning('Login fail when requesting: ' + str(e.reason))
         return ({}, ErrorCode.URL_ERROR)
Exemplo n.º 7
0
 def start(self):
     """Run a MainCrawlThread and wait for a signal before joining it."""
     crawlThread = MainCrawlThread()
     crawlThread.start()

     # signal.pause() sleeps until a signal is received; only afterwards
     # is the crawl thread joined.
     signal.pause()
     crawlThread.join()
     log.info(">>>>>>>>>>  Main thread end....  <<<<<<<<<<")
Exemplo n.º 8
0
 def showHeadProxies(self, proxies, number):
     """Log the statistics of at most ``number`` leading proxies.

     Args:
         proxies: indexable sequence of proxies.
         number: upper bound on how many entries are logged; fewer are
             logged when ``proxies`` is shorter.
     """
     limit = min(number, len(proxies))
     for position in range(0, limit):
         head = proxies[position]
         ratio = str(head.successCount) + '/' + str(head.testCount)
         log.info('    '.join(
             ['>>>  ' + str(head.averageTime), ratio, head.getAllString()]))
Exemplo n.º 9
0
 def analyse(self):
     """Process all profiles and finalise ``self.result``.

     The result is filtered first when the ``use_result_filter`` flag is
     set, then ``caculate()`` is called on it.
     """
     allProfiles = self.getProfiles()
     log.info('Total Profile number:  %s' % len(allProfiles))
     self.processProfiles(allProfiles)
     filterEnabled = flag.getFlag('use_result_filter')
     if filterEnabled:
         self.result.filter()
     self.result.caculate()
Exemplo n.º 10
0
def testAanlysedDataBaseImportFromResult():
    """Import a random result object into the test database and verify it.

    A result built by ``getRandomResult(100)`` is imported directly, the
    database is closed, and ``assertEqual`` is then called with the result.
    """
    analysedDb = createTestAnalysedDataBase()
    randomResult = getRandomResult(100)
    analysedDb.importResult(randomResult)
    analysedDb.close()

    # The check runs only after the database handle has been closed.
    assertEqual(randomResult)
    log.info("Pass the 1/2 test!")
Exemplo n.º 11
0
def main():
    """Parse flags, wait the configured number of minutes, start crawling.

    Installs ``detectSignal`` as the SIGINT handler before sleeping for
    ``waiting_time`` minutes and starting a ``CrawlManager``.
    """
    flag.processArguments()
    log.config(GC.LOG_FILE_DIR + 'CrawlManager', 'info', 'info')
    signal.signal(signal.SIGINT, detectSignal)
    delayMinutes = flag.getFlag('waiting_time')
    log.info('Wait for: %s minutes' % (delayMinutes,))
    # The flag is expressed in minutes; time.sleep() expects seconds.
    time.sleep(delayMinutes * 60)
    CrawlManager().start()
Exemplo n.º 12
0
 def pingServer(self):
     """Ping the MySQL connection, at most once per ``PING_INTERVAL``.

     Does nothing when the last ping happened no more than
     ``PING_INTERVAL`` seconds ago.  Otherwise the ping time is recorded
     and the connection is pinged; if the ping raises, a warning is logged
     and ``self._init()`` is called.
     """
     if time.time() - self.lastPing <= self.PING_INTERVAL:
         return
     log.info('Ping Mysql Server.')
     self.lastPing = time.time()
     try:
         self.mdbConnection.ping()
     except Exception as e:
         # ``as`` works on Python 2.6+ and 3; the old comma form is a
         # SyntaxError on Python 3.
         log.warning('Mysql server gone: %s' % e)
         self._init()
Exemplo n.º 13
0
def main():
    """Run the database tests against freshly recreated tables.

    The tables are created, dropped and created again before ``test`` and
    ``testStartList`` run; afterwards they are dropped and the connection
    is closed.
    """
    log.config(GC.LOG_FILE_DIR + 'database_test', 'debug', 'debug')
    db = createConnection()
    for prepare in (createTables, dropTables, createTables):
        prepare(db)
    for case in (test, testStartList):
        case(db)
    dropTables(db)
    db.close()
    log.info("Pass the test!")
Exemplo n.º 14
0
    def removeDuplicate(self):
        """Fill ``proxiesToTest`` with one proxy per distinct address and port.

        For every (addr, port) pair the first proxy found in ``allProxies``
        is kept and later ones are dropped.  The order of ``proxiesToTest``
        follows the iteration order of the intermediate dict.  The number
        of remaining proxies is logged.
        """
        uniqueProxies = {}
        for proxy in self.allProxies:
            # Key on the (addr, port) pair rather than on the concatenated
            # string: "1.2.3.4" + "56" and "1.2.3.45" + "6" would otherwise
            # collapse into one key and a distinct proxy would be dropped.
            uniqueProxies.setdefault((proxy.addr, proxy.port), proxy)

        self.proxiesToTest = list(uniqueProxies.values())
        log.info('After remove duplicate, proxy number: ' +
            str(len(self.proxiesToTest)))
Exemplo n.º 15
0
def main():
    """Run the account pool and proxy pool tests on recreated tables.

    The tables of the test account pool are created, dropped and created
    again before the tests run, and dropped once more afterwards.
    """
    log.config(GC.LOG_FILE_DIR + 'account_pool_test', 'debug', 'debug')
    pool = createTestRenrenAccountPool()
    for prepare in (createTables, dropTables, createTables):
        prepare(pool)

    testRenrenAccountPool()
    testProxyPool()

    dropTables(pool)
    log.info("Pass the test!")
Exemplo n.º 16
0
def testAanlysedDataBaseImportFromFile():
    """Import a serialized random result from a file and verify it.

    A result built by ``getRandomResult(100)`` is written to
    ``files/tmpserializedata``, imported from that file into the test
    database, and ``assertEqual`` is called with the original result.
    """
    serializedPath = "files/tmpserializedata"
    randomResult = getRandomResult(100)
    randomResult.writeToFile(serializedPath)

    analysedDb = createTestAnalysedDataBase()
    analysedDb.importResultFromFile(serializedPath)
    analysedDb.close()

    # The check runs only after the database handle has been closed.
    assertEqual(randomResult)
    log.info("Pass the 2/2 test!")
Exemplo n.º 17
0
def printInfo(info):
    """Log every parsed field of ``info`` as one multi-line message.

    Args:
        info: object exposing the attributes listed below;
            ``recentVisitedList`` and ``friendList`` must support ``len``.
    """
    # (label, value) pairs in the order in which they are printed.
    fields = (
        ('name', info.name),
        ('gender', info.gender),
        ('hometown', info.hometown),
        ('residence', info.residence),
        ('birthday', info.birthday),
        ('visitedNum', info.visitedNum),
        ('friendNum', info.friendNum),
        ('vistorListNum', len(info.recentVisitedList)),
        ('vistorList', info.recentVisitedList),
        ('friendListNum', len(info.friendList)),
        ('friendList', info.friendList),
    )
    lines = ['Parsed user info:']
    lines.extend(label + ': ' + str(value) for label, value in fields)
    log.info('\n'.join(lines))
Exemplo n.º 18
0
    def testAll(self):
        """Run a ProxyTester for every proxy in ``proxiesToTest``.

        The testers are handed to a ``ThreadPool(200)`` and the time that
        elapsed until ``start()`` returned is logged.
        """
        startedAt = time.time()
        testers = [ProxyTester(candidate) for candidate in self.proxiesToTest]

        workers = ThreadPool(200)
        workers.start(testers)

        elapsed = time.time() - startedAt
        log.info('Test finish.  Total test time:  ' + str(elapsed) + 's')
Exemplo n.º 19
0
def saveInUsingAccounts(pool):
    """Call ``saveAccount`` for valid accounts stuck in the "using" state.

    Selects the accounts with ``is_using = 1`` and ``is_valid = 1`` whose
    ``last_used_time`` is at least one day old, oldest first, and calls
    ``pool.saveAccount(username, password, True, '1971-1-1')`` for each.

    Args:
        pool: account pool exposing ``cursor`` and ``saveAccount``.
    """
    getAllUsingAccountsCommand = """
        SELECT username, password FROM RenrenAccounts
        WHERE is_using = 1 AND
            is_valid = 1 AND
            last_used_time <= DATE_SUB(NOW(), INTERVAL 1 DAY)
        ORDER BY last_used_time ASC;
    """
    pool.cursor.execute(getAllUsingAccountsCommand)
    staleAccounts = pool.cursor.fetchall()
    log.info('Total InUsing account:' + str(len(staleAccounts)))
    for record in staleAccounts:
        username, password = record[0], record[1]
        pool.saveAccount(username, password, True, '1971-1-1')
Exemplo n.º 20
0
def recursiveProfileTest(
    username, password, testInterval, totalCount, startList):
    """Run a recursive get profile test.

    Drives ``recursiveTestGenerator`` and, for every profile it yields
    without an error code, logs the profile url, saves the page through
    ``util.saveTestPage`` and prints the parsed info.

    The loop stops on the first exception raised inside it.  This
    includes the ``StopIteration`` raised when the generator is exhausted,
    so a normal end is also logged through ``log.error``.

    Args:
        username, password, testInterval, totalCount, startList: passed
            unchanged to ``recursiveTestGenerator``.
    """
    generator = recursiveTestGenerator(
        username, password, testInterval, totalCount, startList)
    while True:
        try:
            # next() works on Python 2.6+ and 3; generator.next() exists
            # only on Python 2.
            userId, info, errorCode = next(generator)
            if not errorCode:
                log.info('Profile url: ' + RenrenAgent.getProfileUrl(userId))
                path = util.saveTestPage(info.html, userId)
                log.info('Profile local path: file://'+path)
                printInfo(info)
        except Exception as e:
            log.error('Error happen or end: ' + str(e))
            break
Exemplo n.º 21
0
 def expandSharePage(self, sharePageInfo, agent):
     """Recursively crawl the popular shares reachable from a share page.

     Every url not yet in ``crawledShareSet`` is marked as crawled and
     fetched through ``agent`` until ``requestCount`` reaches
     ``REQUEST_LIMIT``.  For each fetched page the comment users are added
     through ``addUser``, its popular shares are appended to ``shareList``
     and, after a one second pause, the page is expanded in turn.

     Args:
         sharePageInfo: page info exposing ``popularShareList``.
         agent: object exposing ``getSharePageInfo(url)``.
     """
     for url in sharePageInfo.popularShareList:
         if url in self.crawledShareSet:
             continue
         # The url is marked as crawled before the request limit is
         # checked.
         self.crawledShareSet.add(url)
         if self.requestCount >= self.REQUEST_LIMIT:
             return
         log.info('Share agent is crawling url: %s' % url)
         pageInfo = agent.getSharePageInfo(url)
         self.requestCount += 1
         if not pageInfo:
             # Nothing to harvest; still pause before the next request.
             time.sleep(1)
             continue
         for commenter in pageInfo.commentUserList:
             self.addUser(commenter)
         for nestedUrl in pageInfo.popularShareList:
             self.shareList.append(nestedUrl)
         time.sleep(1)
         self.expandSharePage(pageInfo, agent)
Exemplo n.º 22
0
    def verify(self):
        """Try to log in with ``self.account`` and record the outcome.

        ``self.success`` is set to whether the agent reports being logged
        in, the outcome is logged, and the module-level counters
        ``verifySuccessCount`` / ``verifyFailCount`` are updated while
        holding the module-level ``lock``.
        """
        global lock
        global verifySuccessCount
        global verifyFailCount

        agent = RenrenAgent(self.account)
        agent.login()
        self.success = bool(agent.isLogin)
        log.info('Account Verify result: (%s, %s)  >>>  %s' % (
            self.account.username, self.account.password, self.success))

        lock.acquire()
        try:
            if self.success:
                verifySuccessCount += 1
            else:
                verifyFailCount += 1
        finally:
            # Release even if the update raises, so that other verifier
            # threads are not blocked forever.
            lock.release()
Exemplo n.º 23
0
    def run(self):
        """Crawl for ``ROUND_NUMBER`` rounds, reconnecting PPPoE in between.

        Each round crawls start nodes when the database needs more of
        them, then runs the multi-thread crawl with ``THREAD_NUMBER``
        threads.  The loop ends early when ``Crawler.detectStopSignal()``
        raises.  Otherwise the router's PPPoE link is disconnected and
        connected again before the next round.
        """
        self.dataBase = createProdDataBase()
        self.renrenAccountPool = createProdRenrenAccountPool()
        for i in range(0, self.ROUND_NUMBER):
            log.info('>>>>>>>>  Main Crawl Thread Round(%s)  <<<<<<<<' % (i+1))

            if self.dataBase.needMoreStartNode():
                startNodeCrawler = StartNodeCrawler(
                    self.dataBase, self.renrenAccountPool)
                startNodeCrawler.startCrawling()

            self.startMultiThreadCrawling(self.THREAD_NUMBER)
            #self.startMultiThreadCrawlingWithProxy(1)
            #manager.startSignleThreadCrawling()

            try:
                Crawler.detectStopSignal()
            except Exception:
                # Any exception from the stop-signal check ends the rounds.
                break

            # NOTE(review): presumably the reconnect obtains a new IP
            # address for the next round -- confirm.
            log.info('>>>>>> Router disconnect PPPoE  <<<<<<')
            router.disconnectPPPoE()
            time.sleep(2)
            log.info('>>>>>> Router connect PPPoE  <<<<<<')
            router.connectPPPoE()
            # Wait for the connection being established.
            time.sleep(10)
Exemplo n.º 24
0
def getProfileTest(agent, id, filePath=''):
    """Parse one profile, from a local html file or online, and print it.

    Args:
        agent: object exposing ``parseProfileHtml``, ``getProfile`` and
            ``getProfileUrl``.
        id: profile id; used for the online request and for the name of
            the saved page.
        filePath: when non-empty, the html is read from this local file
            and no online request is made.

    Returns:
        None.  The function returns early, after logging, when parsing or
        fetching reports an error code or the online profile has no html.
    """
    if filePath:
        log.info('================= Get Profile test (Local Html) ======' +
            '=======================')
        log.info('Local Profile path: file://' + filePath)
        # Close the file deterministically instead of leaking the handle.
        with open(filePath) as htmlFile:
            html = htmlFile.read()
        info, errorCode = agent.parseProfileHtml(html)
        if errorCode:
            log.error('Error happen in parse local html, path: ' + filePath)
            return
    else:
        log.info('================= Get Profile test (Online Html) =====' +
            '=======================')
        log.info('Profile url: ' + agent.getProfileUrl(id))
        info, errorCode = agent.getProfile(id)
        if errorCode:
            log.error('Error happen in get profile, id: ' + id)
            return
        if not info.html:
            log.warning('No html')
            return
        path = util.saveTestPage(info.html, id)
        log.info('Online Profile path: file://'+path)
    printInfo(info)
Exemplo n.º 25
0
 def handleUserList(self):
     """Insert every collected user that is not yet expanded as a start node.

     Users whose database status is ``Status.expanded`` are only logged;
     all others are inserted into the start list.
     """
     log.info('Finish crawling start nodes, total user number: %s' % len(
         self.userList))
     for userId in self.userList:
         if self.dataBase.getStatus(userId) == Status.expanded:
             log.info('This id have been expanded: %s' % userId)
             continue
         # Not expanded yet: queue it as a start node.
         self.dataBase.insertIntoStartList(userId)
         log.info('Inser into start list: %s' % userId)
Exemplo n.º 26
0
def importFromFile(fname):
    """Import account credentials from the whitespace-separated file `fname`.

    Each line is expected to hold a user name followed by a password; lines
    with fewer than two tokens are skipped.

    NOTE(review): the statement that logs the credentials is garbled in
    this copy (part of the line appears to have been replaced by ``******``)
    and the code between it and the final summary is missing.  The counters
    used in the summary (``verifySuccessCount``, ``verifyFailCount``,
    ``importSuccessCount``, ``importFailCount``) are not assigned anywhere
    in the visible body -- restore the original before relying on this
    function.

    Args:
        fname: path of the file to read.
    """
    log.config(GC.LOG_FILE_DIR + 'import_accounts', 'info', 'info')
    fileName = fname
    accounts = []
    pool = createProdRenrenAccountPool()

    with open(fileName) as importedFile:
        lines = importedFile.readlines()
        for line in lines:
            strs = line.split()
            if len(strs) < 2:
                continue # May be not a valid account
            username = strs[0] # User name first.
            password = strs[1] # And then password.
            # NOTE(review): the next statement is corrupted (see docstring).
            log.info("Find username: "******"  " +\
                "password: "******"Finish importing..........\n" +\
        "Success on verify accounts number: " +\
        str(verifySuccessCount) + "\n" +\
        "Fail on verify accounts number: " +\
        str(verifyFailCount))
    log.info('Success imported number: %s' % importSuccessCount)
    log.info('Fail imported number: %s' % importFailCount)
Exemplo n.º 27
0
def main():
    """Run both analysed-database import tests and log success."""
    logFile = GC.LOG_FILE_DIR + "analysed_data_base_test"
    log.config(logFile, "debug", "debug")
    for case in (
            testAanlysedDataBaseImportFromResult,
            testAanlysedDataBaseImportFromFile):
        case()
    log.info("Pass the all the test!")
Exemplo n.º 28
0
    def expand(self, id, opt_connection=None):
        """Expand the node with the given id and pick the next one to expand.

        This function will:
        1) If `opt_connection` is not provided, look up the status of `id`.
           An unrecorded node is requested and saved first; for an
           unexpanded or expanded node the stored connection is loaded.
        2) If `opt_connection` is provided, use it as is.
        3) For every id in the connection's recent visitor list and home
           page friend list, collect the nodes that are still unexpanded,
           requesting and saving the unrecorded ones.
        4) Calculate the score of every collected node.
        5) Pick one node through ``util.weightPickInt`` using the cubed
           scores as weights and return it for the next expand.
        6) When the database needs more start nodes and
           ``util.randomTrue(1.0/10)`` is true, pick a second node from
           the remaining ones the same way and insert it into the start
           list.

        Args:
            id: id of the node to expand.
            opt_connection: connection of the node, if already known.

        Returns:
            UserNode: the node for next expand.

        Raises:
            CrawlerException: with GET_EXPANDING_NODE_FAILED when the
                profile of an unrecorded node cannot be fetched, with
                UNKNOWN for an unexpected status, and with
                NO_NODE_TO_EXPAND when no connected unexpanded node exists
                (the node `id` is marked as expanded first).
        """
        log.info("Expand node: " + id)
        # Get the connection of the node.
        if opt_connection:
            connection = opt_connection
        else:
            # Check the status and get the connection.
            status = self.dataBase.getStatus(id)
            if status == database.Status.unrecorded:
                node = self.makeRequsetAndSave(id)
                if not node:
                    raise CrawlerException(
                        "Failed to get expanding node's user profile!",
                        CrawlerErrorCode.GET_EXPANDING_NODE_FAILED)
                connection = node.connection
            elif status in (database.Status.unexpanded,
                            database.Status.expanded):
                # TODO: Bring back raising EXPAND_EXPANDED_NODE for an
                # already expanded node later.
                connection = self.dataBase.getConnection(id)
            else:
                raise CrawlerException("WTF??", CrawlerErrorCode.UNKNOWN)

        # Get the profile of connected nodes.  The loop variable must not
        # be called `id`: it would overwrite the id of the node being
        # expanded, which is still needed below.
        connectedUnexpandedNodes = []
        allConnection = (
            connection.recentVisitorList + connection.homePageFriendList)
        for connectedId in allConnection:
            status, profile = self.dataBase.getStatusAndProfile(connectedId)
            if status == database.Status.unrecorded:
                node = self.makeRequsetAndSave(connectedId)
                if node:
                    connectedUnexpandedNodes.append(node)
            elif status == database.Status.unexpanded:
                node = UserNode(connectedId, status, profile)
                connectedUnexpandedNodes.append(node)

        # Nothing left to go to: mark the expanded node itself as expanded
        # and report it.
        if not connectedUnexpandedNodes:
            self.dataBase.setStatus(id, database.Status.expanded)
            raise CrawlerException(
                "There is no availabel node to expand.",
                CrawlerErrorCode.NO_NODE_TO_EXPAND)

        # Calculate the score.
        scores = [self.calculateScore(node)
            for node in connectedUnexpandedNodes]

        # Random pick one base on the score.
        weights = [x*x*x for x in scores]
        index = util.weightPickInt(weights)

        nextToExpend = connectedUnexpandedNodes[index]

        if self.dataBase.needMoreStartNode() and util.randomTrue(1.0/10):
            # Remove the node that was just chosen so that the new start
            # node is a different one.
            del connectedUnexpandedNodes[index]
            del weights[index]
            if connectedUnexpandedNodes:
                index = util.weightPickInt(weights)
                newStartNode = connectedUnexpandedNodes[index]
                self.dataBase.insertIntoStartList(newStartNode.id)

        return nextToExpend
Exemplo n.º 29
0
def recursiveTestGenerator(
    username, password, testInterval, totalCount, startList):
    """Yield user profiles, walking outwards from a list of start ids.

    The ids in `startList` are requested first, followed by the friends
    and recent visitors found on every fetched profile, in first-in
    first-out order.  Each successfully fetched profile is yielded as
    ``(id, UserInfo, ErrorCode)``.  A failed request is logged and skipped
    without counting towards `totalCount` and without sleeping.

    A login error is logged, but the requests are attempted anyway.

    Args:
        @username {string} the user name of the agent.
        @password {string} the password of the agent.
        @testInterval {float} the interval time between every request.
        @totalCount {integer} total number of profile to get.
        @startList {List} a list of user id to start test.
    """
    agent = RenrenAgent(username, password)
    loginInfo, loginError = agent.login()
    if loginError:
        log.error('Login error(username, password): ' +
                username + ', ' + password)
    else:
        log.info(loginInfo['name'])
        log.info(loginInfo['href'])

    processed = 1
    # Queue of (user id, id of the profile that referred to it).
    pending = [(startId, None) for startId in startList]
    while pending:
        userId, referrerId = pending.pop(0)
        log.info('processing(' + str(processed) + '): ' + userId)
        info, errorCode = agent.getProfile(userId)
        if errorCode:
            if referrerId:
                log.warning('Error happen when getProfile. Refer id: ' +
                            str(referrerId) + '. Refer page url: ' +
                            agent.getProfileUrl(str(referrerId)))
            else:
                log.warning('Error happen when getProfile, no refer id.')
            continue

        yield (userId, info, errorCode)

        # Only enqueue more ids while the queue cannot yet cover the
        # remaining number of profiles.
        if len(pending) < totalCount - processed:
            for source in (info.friendList, info.recentVisitedList):
                if source:
                    pending.extend((found, userId) for found in source)

        processed += 1
        if processed > totalCount:
            return
        time.sleep(testInterval)
Exemplo n.º 30
0
def main():
    """Run the result serialization test and log success."""
    logFile = GC.LOG_FILE_DIR + 'result_test'
    log.config(logFile, 'debug', 'debug')
    testResultSerialization()
    log.info("Pass the test!")