Пример #1
0
    def get_timelines(self, uid):
        """
        Collect every timeline page of the user ``uid`` into
        ``self.timeline_list``.

        The first page is returned by ``get_timeline_page_num`` together with
        the page count. Each further page is requested in three parts
        (``bnum`` 0..2) through ``fetch_timelines_by_page_bar``, with a random
        pause after every request. After each page the working account is
        rotated once it has been in use for more than
        ``Config.ACCOUNT_CHANGE_TIME`` seconds.

        :param uid: id of the user whose timelines are crawled
        :return: None; parsed timelines are appended to ``self.timeline_list``
        """
        timeline_page_num, first_page = self.get_timeline_page_num(uid)
        if timeline_page_num == 0:
            print('No any posts.')
            return

        # The first page was already downloaded while counting the pages.
        for pt in first_page:
            self.timeline_list.extend(
                self.parser.parse_timelines(pt, uid, datetime.now()))
        if timeline_page_num == 1:
            print('He/She just has one page timeline.')
            return

        for pnum in xrange(2, timeline_page_num + 1):
            print('There are totally %d timeline pages.' % (
                timeline_page_num, ))
            for bnum in xrange(3):
                html = self.fetch_timelines_by_page_bar(uid, pnum, bnum)
                # Pause after every request, whether or not it succeeded.
                time.sleep(
                    random.randint(Config.SLEEP_BETWEEN_2FPAGES,
                                   2 * Config.SLEEP_BETWEEN_2FPAGES))
                if html is not None:
                    self.timeline_list.extend(
                        self.parser.parse_timelines(
                            html, uid, datetime.now()))

            self.end_time = datetime.now()
            elapsed = self.end_time - self.start_time
            # total_seconds() includes whole days; the ``seconds`` attribute
            # alone wraps around every 24 hours and would under-report the
            # time an account has been in use.
            if elapsed.total_seconds() > Config.ACCOUNT_CHANGE_TIME:
                self.main_fetcher = loop_increase(self.main_fetcher,
                                                  len(self.fetchers))
                self.start_time = datetime.now()
                emphasis_print('Account changed!!!')
                emphasis_print('Now %d of %d accounts are working!' %
                               (self.main_fetcher + 1, len(self.fetchers)))
            time.sleep(
                random.randint(Config.SLEEP_BETWEEN_TIMELINE_PAGES,
                               2 * Config.SLEEP_BETWEEN_TIMELINE_PAGES))
Пример #2
0
    def get_timelines(self, uid):
        """
        Collect every timeline page of the user ``uid`` into ``self.timeline_list``.

        The first page is returned by ``get_timeline_page_num`` together with the
        page count. Each further page is requested in three parts (``bnum`` 0..2)
        through ``fetch_timelines_by_page_bar``, with a random pause after every
        request. After each page the working account is rotated once it has been
        in use for more than ``Config.ACCOUNT_CHANGE_TIME`` seconds.

        :param uid: id of the user whose timelines are crawled
        :return: None; parsed timelines are appended to ``self.timeline_list``
        """
        timeline_page_num, first_page = self.get_timeline_page_num(uid)
        if timeline_page_num == 0:
            print('No any posts.')
            return

        # The first page was already downloaded while counting the pages.
        for pt in first_page:
            self.timeline_list.extend(self.parser.parse_timelines(pt, uid, datetime.now()))
        if timeline_page_num == 1:
            print('He/She just has one page timeline.')
            return

        for pnum in xrange(2, timeline_page_num+1):
            print('There are totally %d timeline pages.' % (timeline_page_num,))
            for bnum in xrange(3):
                html = self.fetch_timelines_by_page_bar(uid, pnum, bnum)
                # Pause after every request, whether or not it succeeded.
                time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES, 2*Config.SLEEP_BETWEEN_2FPAGES))
                if html is not None:
                    self.timeline_list.extend(self.parser.parse_timelines(html, uid, datetime.now()))

            self.end_time = datetime.now()
            elapsed = self.end_time - self.start_time
            # total_seconds() includes whole days; the ``seconds`` attribute alone
            # wraps around every 24 hours and would under-report the time an
            # account has been in use.
            if elapsed.total_seconds() > Config.ACCOUNT_CHANGE_TIME:
                self.main_fetcher = loop_increase(self.main_fetcher, len(self.fetchers))
                self.start_time = datetime.now()
                emphasis_print('Account changed!!!')
                emphasis_print('Now %d of %d accounts are working!' % (self.main_fetcher+1, len(self.fetchers)))
            time.sleep(random.randint(Config.SLEEP_BETWEEN_TIMELINE_PAGES, 2*Config.SLEEP_BETWEEN_TIMELINE_PAGES))
Пример #3
0
                while True:  # in case of connection lost
                    try:
                        spider.save()
                        break
                    except Exception as e:
                        print e.message, uid
                        if 'Lost connection to MySQL server during query' in e.message:
                            continue
                        else:
                            break

                crawled_list.append(uid)
                spider.end_time = datetime.now()
                duration = spider.end_time - spider.start_time
                if duration.seconds > ACCOUNT_CHANGE_TIME:
                    spider.main_fetcher = loop_increase(
                        spider.main_fetcher, len(spider.fetchers))
                    spider.start_time = datetime.now()
                    emphasis_print('Account changed!!!')

            print 'Complete a batch of tasks!'
            print 'Getting new tasks...'
            uid_list = get_tasks(TASK_NUM)
            if len(uid_list) == 0:
                print 'No tasks to proceed!'
                exit(-1)
    except Exception as e:
        print e.message
        log.error('Problematic UID: %s' % (uid, ))
    finally:
        reset(user_list, uid_list, crawled_list)  # reset
Пример #4
0
                while True: # in case of connection lost
                    try:
                        spider.save()
                        break
                    except Exception as e:
                        print e.message, uid
                        if 'Lost connection to MySQL server during query' in e.message:
                            continue
                        else:
                            break

                crawled_list.append(uid)
                spider.end_time = datetime.now()
                duration = spider.end_time - spider.start_time
                if duration.seconds > ACCOUNT_CHANGE_TIME:
                    spider.main_fetcher = loop_increase(spider.main_fetcher, len(spider.fetchers))
                    spider.start_time = datetime.now()
                    emphasis_print('Account changed!!!')

            print 'Complete a batch of tasks!'
            print 'Getting new tasks...'
            uid_list = get_tasks(TASK_NUM)
            if len(uid_list) == 0:
                print 'No tasks to proceed!'
                exit(-1)
    except Exception as e:
        print e.message
        log.error('Problematic UID: %s' % (uid, ))
    finally:
        reset(user_list, uid_list, crawled_list) # reset