示例#1
0
class CrawlSlave(LocalProc):
    """Worker that takes user ids from a queue and crawls each user's tweets.

    Ids are read from ``todo``; an id is put on ``done`` once its user has
    been crawled and saved without error.
    """

    def __init__(self, slave_id, todo, done):
        """Set up the worker.

        Args:
            slave_id: Identifier forwarded to ``LocalProc.__init__``.
            todo: Queue of user ids to crawl; must provide ``get`` and
                ``task_done``.
            done: Queue that receives each successfully crawled user id.
        """
        LocalProc.__init__(self, 'crawl', slave_id)
        self.twitter = TwitterResource()
        self.todo = todo
        self.done = done

    def run(self):
        """Crawl queued users until the module-level ``HALT`` flag is set.

        Any exception raised while handling one id is logged and the loop
        continues with the next id.
        """
        Tweet.database = CouchDB(settings.couchdb_root + "hou_new_tweet", True)
        while not HALT:
            user = None
            try:
                uid = self.todo.get()
                try:
                    user = User.get_id(uid)
                    self.crawl(user)
                    self.done.put(uid)
                finally:
                    # Balance every successful get(), even when the crawl
                    # fails; otherwise the queue's unfinished-task count
                    # never drops and any join() on it would block forever.
                    self.todo.task_done()
                if self.twitter.remaining < 10:
                    dt = self.twitter.reset_time - datetime.utcnow()
                    logging.info("goodnight for %r", dt)
                    # dt.seconds alone ignores the days field: a reset time
                    # that has already passed (days == -1) would otherwise
                    # turn into a sleep of almost a full day.
                    wait = dt.days * 24 * 3600 + dt.seconds
                    if wait > 0:
                        time.sleep(wait)
            except Exception:
                if user:
                    logging.exception("exception for user %s" % user.to_d())
                else:
                    logging.exception("exception and user is None")
            logging.info("api calls remaining: %d", self.twitter.remaining)
        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print("slave is done")

    def crawl(self, user):
        """Save new tweets for ``user`` and schedule the next crawl.

        Updates ``last_tid`` (when tweets were returned), ``tweets_per_hour``,
        ``next_crawl_date`` and ``last_crawl_date`` on ``user``, then saves it.

        Args:
            user: User record exposing ``_id``, ``screen_name``, ``last_tid``,
                ``last_crawl_date``, ``tweets_per_hour`` and ``save()``.
        """
        logging.debug("visiting %s - %s", user._id, user.screen_name)
        tweets = self.twitter.save_timeline(user._id, user.last_tid)
        if tweets:
            user.last_tid = tweets[0]._id
        # Treat a falsy result (empty list or None) as zero new tweets.
        fetched = len(tweets) if tweets else 0

        now = datetime.utcnow()
        last = user.last_crawl_date
        if last is None:
            # Fallback start date for users that have never been crawled.
            last = datetime(2010, 11, 12)
        delta = now - last
        seconds = delta.seconds + delta.days * 24 * 3600
        if seconds <= 0:
            # Crawled less than a second ago (or the stored date is ahead of
            # the clock): avoid dividing by zero or a negative interval.
            seconds = 1

        # New estimate is the mean of the rate just observed and the stored one.
        tph = (3600.0 * fetched / seconds + user.tweets_per_hour) / 2
        user.tweets_per_hour = tph
        if tph > 0:
            # Hours needed to accumulate tweets_per_crawl tweets, capped.
            hours = min(settings.tweets_per_crawl / tph, settings.max_hours)
        else:
            # No measurable activity: wait the maximum instead of dividing
            # by zero.
            hours = settings.max_hours
        user.next_crawl_date = now + timedelta(hours=hours)
        user.last_crawl_date = now
        user.save()
示例#2
0
class CrawlSlave(LocalProc):
    """Worker that takes user ids from a queue and crawls each user's tweets.

    Ids are read from ``todo``; an id is put on ``done`` once its user has
    been crawled and saved without error.
    """

    def __init__(self, slave_id, todo, done):
        """Set up the worker.

        Args:
            slave_id: Identifier forwarded to ``LocalProc.__init__``.
            todo: Queue of user ids to crawl; must provide ``get`` and
                ``task_done``.
            done: Queue that receives each successfully crawled user id.
        """
        LocalProc.__init__(self, 'crawl', slave_id)
        self.twitter = TwitterResource()
        self.todo = todo
        self.done = done

    def run(self):
        """Crawl queued users until the module-level ``HALT`` flag is set.

        Any exception raised while handling one id is logged and the loop
        continues with the next id.
        """
        Tweet.database = CouchDB(settings.couchdb_root + "hou_new_tweet", True)
        while not HALT:
            user = None
            try:
                uid = self.todo.get()
                try:
                    user = User.get_id(uid)
                    self.crawl(user)
                    self.done.put(uid)
                finally:
                    # Balance every successful get(), even when the crawl
                    # fails; otherwise the queue's unfinished-task count
                    # never drops and any join() on it would block forever.
                    self.todo.task_done()
                if self.twitter.remaining < 10:
                    dt = self.twitter.reset_time - datetime.utcnow()
                    logging.info("goodnight for %r", dt)
                    # dt.seconds alone ignores the days field: a reset time
                    # that has already passed (days == -1) would otherwise
                    # turn into a sleep of almost a full day.
                    wait = dt.days * 24 * 3600 + dt.seconds
                    if wait > 0:
                        time.sleep(wait)
            except Exception:
                if user:
                    logging.exception("exception for user %s" % user.to_d())
                else:
                    logging.exception("exception and user is None")
            logging.info("api calls remaining: %d", self.twitter.remaining)
        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print("slave is done")

    def crawl(self, user):
        """Save new tweets for ``user`` and schedule the next crawl.

        Updates ``last_tid`` (when tweets were returned), ``tweets_per_hour``,
        ``next_crawl_date`` and ``last_crawl_date`` on ``user``, then saves it.

        Args:
            user: User record exposing ``_id``, ``screen_name``, ``last_tid``,
                ``last_crawl_date``, ``tweets_per_hour`` and ``save()``.
        """
        logging.debug("visiting %s - %s", user._id, user.screen_name)
        tweets = self.twitter.save_timeline(user._id, user.last_tid)
        if tweets:
            user.last_tid = tweets[0]._id
        # Treat a falsy result (empty list or None) as zero new tweets.
        fetched = len(tweets) if tweets else 0

        now = datetime.utcnow()
        last = user.last_crawl_date
        if last is None:
            # Fallback start date for users that have never been crawled.
            last = datetime(2010, 11, 12)
        delta = now - last
        seconds = delta.seconds + delta.days * 24 * 3600
        if seconds <= 0:
            # Crawled less than a second ago (or the stored date is ahead of
            # the clock): avoid dividing by zero or a negative interval.
            seconds = 1

        # New estimate is the mean of the rate just observed and the stored one.
        tph = (3600.0 * fetched / seconds + user.tweets_per_hour) / 2
        user.tweets_per_hour = tph
        if tph > 0:
            # Hours needed to accumulate tweets_per_crawl tweets, capped.
            hours = min(settings.tweets_per_crawl / tph, settings.max_hours)
        else:
            # No measurable activity: wait the maximum instead of dividing
            # by zero.
            hours = settings.max_hours
        user.next_crawl_date = now + timedelta(hours=hours)
        user.last_crawl_date = now
        user.save()