class LookupSlave(LocalProc):
    """Worker that drains the 'lookup' beanstalk tube: resolves queued twitter
    user ids into full user records, crawls users who look local, and enqueues
    follow-up lookup jobs for their contacts.

    NOTE(review): relies on project types (TwitterResource, GisgraphyResource,
    CouchDB, LookupJobBody, User) whose contracts are assumed from usage here.
    """

    def __init__(self, slave_id):
        LocalProc.__init__(self, 'lookup', slave_id)
        self.twitter = TwitterResource()
        self.gisgraphy = GisgraphyResource()
        self.orig_db = CouchDB(settings.couchdb_root + "orig_houtx")

    def run(self):
        """Main loop: batch-reserve up to 20 jobs, bulk-lookup their users on
        twitter, then crawl/save each user, deleting or burying its job."""
        while True:
            jobs = []
            for x in xrange(20):
                # reserve blocks to wait when x is 0, but returns None for 1-19
                try:
                    j = self.stalk.reserve(0 if x else None)
                except beanstalkc.DeadlineSoon:
                    break
                if j is None:
                    break
                jobs.append(j)
            bodies = [LookupJobBody.from_job(j) for j in jobs]
            users = self.twitter.user_lookup([b._id for b in bodies])
            logging.info("looking at %r" % [getattr(u, 'screen_name', '') for u in users])
            for job, body, user in zip(jobs, bodies, users):
                if user is None:
                    # twitter returned no record for this id; leave the job
                    # reserved so it times out and gets retried elsewhere
                    continue
                try:
                    if self.twitter.remaining < 30:
                        dt = (self.twitter.reset_time - datetime.utcnow())
                        logging.info("goodnight for %r", dt)
                        # FIX: clamp at zero. The old code slept dt.seconds,
                        # and a *negative* timedelta's .seconds wraps into
                        # [0, 86400), stalling the worker for up to a day
                        # whenever reset_time was already in the past.
                        time.sleep(max(dt.total_seconds(), 0))
                    logging.info("look at %s", user.screen_name)
                    # skip users we already have in either database
                    if user._id in User.database or user._id in self.orig_db:
                        job.delete()
                        continue
                    self.crawl_user(user)
                    user.save()
                    job.delete()
                except Exception:
                    # FIX: was a bare `except:`, which also swallowed
                    # KeyboardInterrupt/SystemExit and blocked clean shutdown.
                    logging.exception("exception for job %s" % job.body)
                    job.bury()
            logging.info("api calls remaining: %d", self.twitter.remaining)

    def crawl_user(self, user):
        """Score the user's locality; for definitely-local, unprotected users,
        fetch and persist their relationships and recent tweets, then enqueue
        scoring jobs for the people they interact with."""
        user.local_prob = guess_location(user, self.gisgraphy)
        if user.local_prob != 1.0 or user.protected:
            return
        rels = None
        tweets = None
        if user.followers_count > 0 and user.friends_count > 0:
            rels = self.twitter.get_relationships(user._id)
            rels.attempt_save()
        if user.statuses_count > 0:
            tweets = self.twitter.user_timeline(user._id, since_id=settings.min_tweet_id)
            for tweet in tweets:
                tweet.attempt_save()
        if tweets:
            user.next_crawl_date = datetime.utcnow()
            user.last_crawl_date = datetime.utcnow()
            user.tweets_per_hour = settings.tweets_per_hour
            # timeline is newest-first, so tweets[0] is the latest tweet id
            user.last_tid = tweets[0]._id
        user.lookup_done = True
        # always true here given the early return above; kept for safety in
        # case the guard condition changes
        if user.local_prob == 1.0:
            self.score_new_users(user, rels, tweets)

    def score_new_users(self, user, rels, tweets):
        """Build LookupJobBody entries: mark this user done, award rfriend
        points (split evenly) and mention points, then push all jobs."""
        jobs = defaultdict(LookupJobBody)
        jobs[user._id].done = True
        if rels:
            rfriends = rels.rfriends()
            # only spread points when the group is small enough for each
            # member to score at least one point (integer division below)
            if len(rfriends) < RFRIEND_POINTS:
                for u in rfriends:
                    jobs[u].rfriends_score = RFRIEND_POINTS / len(rfriends)
        if tweets:
            ats = defaultdict(int)
            for tweet in tweets:
                for uid in tweet.mentions:
                    ats[uid] += 1
            for u, c in ats.iteritems():
                points = c * MENTION_POINTS
                if points > 0:
                    jobs[u].mention_score = points
        for k, j in jobs.iteritems():
            j._id = k
            j.put(self.stalk)
class LookupSlave(LocalProc):
    """Consumes the 'lookup' beanstalk tube: batches reserved jobs, resolves
    the twitter accounts behind them, crawls the ones that appear local, and
    fans out new lookup jobs to their contacts."""

    def __init__(self, slave_id):
        LocalProc.__init__(self, 'lookup', slave_id)
        self.twitter = TwitterResource()
        self.gisgraphy = GisgraphyResource()
        self.orig_db = CouchDB(settings.couchdb_root + "orig_houtx")

    def run(self):
        """Endless loop: grab up to 20 jobs, bulk-resolve their users, then
        handle each job, deleting on success and burying on failure."""
        while True:
            reserved = []
            for attempt in xrange(20):
                # reserve blocks to wait when attempt is 0, but returns None for 1-19
                try:
                    picked = self.stalk.reserve(0 if attempt else None)
                except beanstalkc.DeadlineSoon:
                    break
                if picked is None:
                    break
                reserved.append(picked)
            payloads = [LookupJobBody.from_job(job) for job in reserved]
            found = self.twitter.user_lookup([p._id for p in payloads])
            logging.info("looking at %r" % [getattr(u, 'screen_name', '') for u in found])
            for job, payload, user in zip(reserved, payloads, found):
                if user is None:
                    continue
                try:
                    # throttle when the rate limit window is nearly exhausted
                    if self.twitter.remaining < 30:
                        pause = (self.twitter.reset_time - datetime.utcnow())
                        logging.info("goodnight for %r", pause)
                        time.sleep(pause.seconds)
                    logging.info("look at %s", user.screen_name)
                    # already known in either store -> nothing more to do
                    if user._id in User.database or user._id in self.orig_db:
                        job.delete()
                        continue
                    self.crawl_user(user)
                    user.save()
                    job.delete()
                except:
                    logging.exception("exception for job %s" % job.body)
                    job.bury()
            logging.info("api calls remaining: %d", self.twitter.remaining)

    def crawl_user(self, user):
        """Estimate how local this user is; for unprotected users with a
        locality score of exactly 1.0, persist their edges and timeline and
        schedule scoring for the people they talk to."""
        user.local_prob = guess_location(user, self.gisgraphy)
        if user.local_prob != 1.0 or user.protected:
            return
        rels = None
        tweets = None
        has_edges = user.followers_count > 0 and user.friends_count > 0
        if has_edges:
            rels = self.twitter.get_relationships(user._id)
            rels.attempt_save()
        if user.statuses_count > 0:
            tweets = self.twitter.user_timeline(user._id, since_id=settings.min_tweet_id)
            for item in tweets:
                item.attempt_save()
        if tweets:
            user.next_crawl_date = datetime.utcnow()
            user.last_crawl_date = datetime.utcnow()
            user.tweets_per_hour = settings.tweets_per_hour
            user.last_tid = tweets[0]._id
        user.lookup_done = True
        if user.local_prob == 1.0:
            self.score_new_users(user, rels, tweets)

    def score_new_users(self, user, rels, tweets):
        """Create follow-up LookupJobBody records — done-marker for this user,
        rfriend scores shared across the group, mention scores per mention
        count — and push every job onto the tube."""
        pending = defaultdict(LookupJobBody)
        pending[user._id].done = True
        if rels:
            mutuals = rels.rfriends()
            if len(mutuals) < RFRIEND_POINTS:
                share = RFRIEND_POINTS / len(mutuals)
                for uid in mutuals:
                    pending[uid].rfriends_score = share
        if tweets:
            mention_counts = defaultdict(int)
            for tweet in tweets:
                for uid in tweet.mentions:
                    mention_counts[uid] += 1
            for uid, count in mention_counts.iteritems():
                score = count * MENTION_POINTS
                if score > 0:
                    pending[uid].mention_score = score
        for uid, body in pending.iteritems():
            body._id = uid
            body.put(self.stalk)