class CrawlSlave(LocalProc):
    """Worker that crawls the timelines of users taken from a queue.

    Each loop iteration takes a user id from ``todo``, loads the user,
    saves the user's new tweets via ``TwitterResource.save_timeline``,
    reschedules the user, and puts the id on ``done``.
    """

    def __init__(self, slave_id, todo, done):
        """Set up the worker.

        Args:
            slave_id: Identifier forwarded to ``LocalProc.__init__``.
            todo: Queue-like source of user ids (``get`` / ``task_done``).
            done: Queue-like sink for the ids that were crawled (``put``).
        """
        LocalProc.__init__(self, 'crawl', slave_id)
        self.twitter = TwitterResource()
        self.todo = todo
        self.done = done

    @staticmethod
    def _whole_seconds(delta):
        """Return the signed length of ``delta`` in whole seconds.

        ``timedelta.seconds`` alone is only the seconds *component*: it
        drops ``days`` and, for a negative delta, is a large positive number
        (``timedelta(seconds=-5)`` is stored as ``days=-1, seconds=86395``).
        """
        return delta.days * 24 * 3600 + delta.seconds

    def run(self):
        """Crawl users from ``todo`` until the module-level ``HALT`` is set.

        Failures for a single user are logged and the loop moves on to the
        next id. When fewer than 10 API calls remain, the worker sleeps
        until ``self.twitter.reset_time``.
        """
        Tweet.database = CouchDB(settings.couchdb_root + "hou_new_tweet", True)
        while not HALT:
            user = None
            try:
                uid = self.todo.get()
                try:
                    user = User.get_id(uid)
                    self.crawl(user)
                    self.done.put(uid)
                finally:
                    # Match every successful get() with exactly one
                    # task_done(), even when the crawl fails; otherwise a
                    # todo.join() elsewhere would never return.
                    self.todo.task_done()
                if self.twitter.remaining < 10:
                    dt = self.twitter.reset_time - datetime.utcnow()
                    logging.info("goodnight for %r", dt)
                    wait = self._whole_seconds(dt)
                    # If the reset time has already passed there is nothing
                    # to wait for; sleeping dt.seconds here used to block
                    # for almost a full day.
                    if wait > 0:
                        time.sleep(wait)
            except Exception:
                if user:
                    logging.exception("exception for user %s", user.to_d())
                else:
                    logging.exception("exception and user is None")
            logging.info("api calls remaining: %d", self.twitter.remaining)
        # Parenthesised form prints the same text under Python 2 and also
        # parses under Python 3.
        print("slave is done")

    def crawl(self, user):
        """Fetch new tweets for ``user`` and schedule the next crawl.

        The user's ``tweets_per_hour`` becomes the mean of the rate observed
        since the last crawl and the previous estimate. The next crawl is
        scheduled ``settings.tweets_per_crawl / tweets_per_hour`` hours from
        now, capped at ``settings.max_hours``. The user is saved at the end.

        Args:
            user: User object; ``last_tid``, ``tweets_per_hour``,
                ``next_crawl_date`` and ``last_crawl_date`` are updated.
        """
        logging.debug("visiting %s - %s", user._id, user.screen_name)
        tweets = self.twitter.save_timeline(user._id, user.last_tid)
        if tweets:
            user.last_tid = tweets[0]._id
        fetched = len(tweets) if tweets else 0

        now = datetime.utcnow()
        last = user.last_crawl_date
        if last is None:
            # Fixed fallback date for users that have never been crawled.
            last = datetime(2010, 11, 12)

        # Clamp to one second so a re-crawl within the same second (or a
        # last_crawl_date slightly in the future) cannot divide by zero or
        # produce a negative rate.
        seconds = max(self._whole_seconds(now - last), 1)
        tph = (3600.0 * fetched / seconds + user.tweets_per_hour) / 2
        user.tweets_per_hour = tph

        if tph > 0:
            hours = min(settings.tweets_per_crawl / tph, settings.max_hours)
        else:
            # No observed activity: wait the longest allowed interval
            # instead of raising ZeroDivisionError and never rescheduling.
            hours = settings.max_hours
        user.next_crawl_date = now + timedelta(hours=hours)
        user.last_crawl_date = now
        user.save()
class CrawlSlave(LocalProc):
    """Worker that crawls the timelines of users taken from a queue.

    Each loop iteration takes a user id from ``todo``, loads the user,
    saves the user's new tweets via ``TwitterResource.save_timeline``,
    reschedules the user, and puts the id on ``done``.
    """

    def __init__(self, slave_id, todo, done):
        """Set up the worker.

        Args:
            slave_id: Identifier forwarded to ``LocalProc.__init__``.
            todo: Queue-like source of user ids (``get`` / ``task_done``).
            done: Queue-like sink for the ids that were crawled (``put``).
        """
        LocalProc.__init__(self, 'crawl', slave_id)
        self.twitter = TwitterResource()
        self.todo = todo
        self.done = done

    @staticmethod
    def _whole_seconds(delta):
        """Return the signed length of ``delta`` in whole seconds.

        ``timedelta.seconds`` alone is only the seconds *component*: it
        drops ``days`` and, for a negative delta, is a large positive number
        (``timedelta(seconds=-5)`` is stored as ``days=-1, seconds=86395``).
        """
        return delta.days * 24 * 3600 + delta.seconds

    def run(self):
        """Crawl users from ``todo`` until the module-level ``HALT`` is set.

        Failures for a single user are logged and the loop moves on to the
        next id. When fewer than 10 API calls remain, the worker sleeps
        until ``self.twitter.reset_time``.
        """
        Tweet.database = CouchDB(settings.couchdb_root + "hou_new_tweet", True)
        while not HALT:
            user = None
            try:
                uid = self.todo.get()
                try:
                    user = User.get_id(uid)
                    self.crawl(user)
                    self.done.put(uid)
                finally:
                    # Match every successful get() with exactly one
                    # task_done(), even when the crawl fails; otherwise a
                    # todo.join() elsewhere would never return.
                    self.todo.task_done()
                if self.twitter.remaining < 10:
                    dt = self.twitter.reset_time - datetime.utcnow()
                    logging.info("goodnight for %r", dt)
                    wait = self._whole_seconds(dt)
                    # If the reset time has already passed there is nothing
                    # to wait for; sleeping dt.seconds here used to block
                    # for almost a full day.
                    if wait > 0:
                        time.sleep(wait)
            except Exception:
                if user:
                    logging.exception("exception for user %s", user.to_d())
                else:
                    logging.exception("exception and user is None")
            logging.info("api calls remaining: %d", self.twitter.remaining)
        # Parenthesised form prints the same text under Python 2 and also
        # parses under Python 3.
        print("slave is done")

    def crawl(self, user):
        """Fetch new tweets for ``user`` and schedule the next crawl.

        The user's ``tweets_per_hour`` becomes the mean of the rate observed
        since the last crawl and the previous estimate. The next crawl is
        scheduled ``settings.tweets_per_crawl / tweets_per_hour`` hours from
        now, capped at ``settings.max_hours``. The user is saved at the end.

        Args:
            user: User object; ``last_tid``, ``tweets_per_hour``,
                ``next_crawl_date`` and ``last_crawl_date`` are updated.
        """
        logging.debug("visiting %s - %s", user._id, user.screen_name)
        tweets = self.twitter.save_timeline(user._id, user.last_tid)
        if tweets:
            user.last_tid = tweets[0]._id
        fetched = len(tweets) if tweets else 0

        now = datetime.utcnow()
        last = user.last_crawl_date
        if last is None:
            # Fixed fallback date for users that have never been crawled.
            last = datetime(2010, 11, 12)

        # Clamp to one second so a re-crawl within the same second (or a
        # last_crawl_date slightly in the future) cannot divide by zero or
        # produce a negative rate.
        seconds = max(self._whole_seconds(now - last), 1)
        tph = (3600.0 * fetched / seconds + user.tweets_per_hour) / 2
        user.tweets_per_hour = tph

        if tph > 0:
            hours = min(settings.tweets_per_crawl / tph, settings.max_hours)
        else:
            # No observed activity: wait the longest allowed interval
            # instead of raising ZeroDivisionError and never rescheduling.
            hours = settings.max_hours
        user.next_crawl_date = now + timedelta(hours=hours)
        user.last_crawl_date = now
        user.save()