class CrawlSlave(LocalProc): def __init__(self, slave_id, todo, done): LocalProc.__init__(self, 'crawl', slave_id) self.twitter = TwitterResource() self.todo = todo self.done = done def run(self): Tweet.database = CouchDB(settings.couchdb_root + "hou_new_tweet", True) #pdb.Pdb(stdin=open('/dev/stdin', 'r+'), stdout=open('/dev/stdout', 'r+')).set_trace() while not HALT: user = None try: uid = self.todo.get() user = User.get_id(uid) self.crawl(user) self.done.put(uid) self.todo.task_done() if self.twitter.remaining < 10: dt = (self.twitter.reset_time - datetime.utcnow()) logging.info("goodnight for %r", dt) time.sleep(dt.seconds) except Exception as ex: if user: logging.exception("exception for user %s" % user.to_d()) else: logging.exception("exception and user is None") logging.info("api calls remaining: %d", self.twitter.remaining) print "slave is done" def crawl(self, user): logging.debug("visiting %s - %s", user._id, user.screen_name) tweets = self.twitter.save_timeline(user._id, user.last_tid) if tweets: user.last_tid = tweets[0]._id now = datetime.utcnow() last = user.last_crawl_date if user.last_crawl_date is not None else datetime( 2010, 11, 12) delta = now - last seconds = delta.seconds + delta.days * 24 * 3600 tph = (3600.0 * len(tweets) / seconds + user.tweets_per_hour) / 2 user.tweets_per_hour = tph hours = min(settings.tweets_per_crawl / tph, settings.max_hours) user.next_crawl_date = now + timedelta(hours=hours) user.last_crawl_date = now user.save()
class CrawlSlave(LocalProc): def __init__(self, slave_id, todo, done): LocalProc.__init__(self,'crawl', slave_id) self.twitter = TwitterResource() self.todo = todo self.done = done def run(self): Tweet.database = CouchDB(settings.couchdb_root+"hou_new_tweet",True) #pdb.Pdb(stdin=open('/dev/stdin', 'r+'), stdout=open('/dev/stdout', 'r+')).set_trace() while not HALT: user=None try: uid = self.todo.get() user = User.get_id(uid) self.crawl(user) self.done.put(uid) self.todo.task_done() if self.twitter.remaining < 10: dt = (self.twitter.reset_time-datetime.utcnow()) logging.info("goodnight for %r",dt) time.sleep(dt.seconds) except Exception as ex: if user: logging.exception("exception for user %s"%user.to_d()) else: logging.exception("exception and user is None") logging.info("api calls remaining: %d",self.twitter.remaining) print "slave is done" def crawl(self, user): logging.debug("visiting %s - %s",user._id,user.screen_name) tweets = self.twitter.save_timeline(user._id, user.last_tid) if tweets: user.last_tid = tweets[0]._id now = datetime.utcnow() last = user.last_crawl_date if user.last_crawl_date is not None else datetime(2010,11,12) delta = now - last seconds = delta.seconds + delta.days*24*3600 tph = (3600.0*len(tweets)/seconds + user.tweets_per_hour)/2 user.tweets_per_hour = tph hours = min(settings.tweets_per_crawl/tph, settings.max_hours) user.next_crawl_date = now+timedelta(hours=hours) user.last_crawl_date = now user.save()
def __init__(self, slave_id, todo, done):
    """Initialize the crawl worker: base-class setup, a Twitter API
    handle, and the shared todo/done queues.
    """
    LocalProc.__init__(self, 'crawl', slave_id)
    self.twitter = TwitterResource()
    self.todo = todo
    self.done = done
def __init__(self, slave_id, todo, done):
    """Construct a crawl worker bound to the given queues.

    Registers with LocalProc under the 'crawl' role and opens a fresh
    Twitter API resource for this process.
    """
    LocalProc.__init__(self, 'crawl', slave_id)
    self.twitter = TwitterResource()
    self.todo = todo
    self.done = done
def __init__(self, slave_id):
    """Initialize the lookup worker: base-class setup plus the Twitter,
    Gisgraphy, and original-users couch resources it queries.
    """
    LocalProc.__init__(self, 'lookup', slave_id)
    self.twitter = TwitterResource()
    self.gisgraphy = GisgraphyResource()
    self.orig_db = CouchDB(settings.couchdb_root + "orig_houtx")
class LookupSlave(LocalProc):
    """Worker that pulls lookup jobs from beanstalk, resolves the users via
    the Twitter API, crawls likely-local users, and enqueues follow-up
    lookup jobs for their friends and mentions.
    """

    def __init__(self, slave_id):
        LocalProc.__init__(self, 'lookup', slave_id)
        self.twitter = TwitterResource()
        self.gisgraphy = GisgraphyResource()
        # Previously-imported users; used to skip ids we already have.
        self.orig_db = CouchDB(settings.couchdb_root + "orig_houtx")

    def run(self):
        """Forever: batch up to 20 beanstalk jobs, look the users up in one
        API call, and process each. Failed jobs are buried, not lost.
        """
        while True:
            jobs = []
            for x in xrange(20):
                # reserve blocks to wait when x is 0, but returns None for 1-19
                try:
                    j = self.stalk.reserve(0 if x else None)
                except beanstalkc.DeadlineSoon:
                    break
                if j is None:
                    break
                jobs.append(j)
            bodies = [LookupJobBody.from_job(j) for j in jobs]
            users = self.twitter.user_lookup([b._id for b in bodies])
            logging.info("looking at %r", [getattr(u, 'screen_name', '') for u in users])
            for job, body, user in zip(jobs, bodies, users):
                if user is None:
                    continue
                try:
                    if self.twitter.remaining < 30:
                        dt = self.twitter.reset_time - datetime.utcnow()
                        # Skip the nap if reset_time already passed: a
                        # negative timedelta normalizes, making .seconds huge.
                        if dt.days >= 0:
                            logging.info("goodnight for %r", dt)
                            time.sleep(dt.seconds)
                    logging.info("look at %s", user.screen_name)
                    # Already stored (new or original import)? Drop the job.
                    if user._id in User.database or user._id in self.orig_db:
                        job.delete()
                        continue
                    self.crawl_user(user)
                    user.save()
                    job.delete()
                except Exception:
                    # Was a bare `except:`; narrowed so KeyboardInterrupt/
                    # SystemExit can still stop the worker.
                    logging.exception("exception for job %s", job.body)
                    job.bury()
            logging.info("api calls remaining: %d", self.twitter.remaining)

    def crawl_user(self, user):
        """Estimate whether `user` is local; if certainly local and public,
        fetch relationships and recent tweets, then score follow-up users.
        """
        user.local_prob = guess_location(user, self.gisgraphy)
        if user.local_prob != 1.0 or user.protected:
            return
        rels = None
        tweets = None
        if user.followers_count > 0 and user.friends_count > 0:
            rels = self.twitter.get_relationships(user._id)
            rels.attempt_save()
        if user.statuses_count > 0:
            tweets = self.twitter.user_timeline(user._id, since_id=settings.min_tweet_id)
            for tweet in tweets:
                tweet.attempt_save()
            if tweets:
                user.next_crawl_date = datetime.utcnow()
                user.last_crawl_date = datetime.utcnow()
                user.tweets_per_hour = settings.tweets_per_hour
                user.last_tid = tweets[0]._id
        user.lookup_done = True
        # NOTE(review): local_prob is always 1.0 here (earlier guard), so
        # this check is redundant; kept for safety.
        if user.local_prob == 1.0:
            self.score_new_users(user, rels, tweets)

    def score_new_users(self, user, rels, tweets):
        """Enqueue lookup jobs for reciprocal friends and mentioned users,
        weighted by RFRIEND_POINTS and MENTION_POINTS.
        """
        jobs = defaultdict(LookupJobBody)
        jobs[user._id].done = True
        if rels:
            rfriends = rels.rfriends()
            # `rfriends` check prevents ZeroDivisionError on an empty list
            # (0 < RFRIEND_POINTS would otherwise pass and then divide by 0).
            if rfriends and len(rfriends) < RFRIEND_POINTS:
                for u in rfriends:
                    jobs[u].rfriends_score = RFRIEND_POINTS / len(rfriends)
        if tweets:
            ats = defaultdict(int)
            for tweet in tweets:
                for uid in tweet.mentions:
                    ats[uid] += 1
            for u, c in ats.iteritems():
                points = c * MENTION_POINTS
                if points > 0:
                    jobs[u].mention_score = points
        for k, j in jobs.iteritems():
            j._id = k
            j.put(self.stalk)
def __init__(self, slave_id):
    """Set up a lookup worker under the 'lookup' role.

    Opens the Twitter and Gisgraphy API resources and the couch database
    of originally-imported users.
    """
    LocalProc.__init__(self, 'lookup', slave_id)
    self.twitter = TwitterResource()
    self.gisgraphy = GisgraphyResource()
    self.orig_db = CouchDB(settings.couchdb_root + "orig_houtx")
class LookupSlave(LocalProc):
    """Lookup worker: drains beanstalk jobs in batches, resolves users via
    the Twitter API, crawls the ones that look local, and schedules
    follow-up lookups for their contacts.
    """

    def __init__(self, slave_id):
        LocalProc.__init__(self, 'lookup', slave_id)
        self.twitter = TwitterResource()
        self.gisgraphy = GisgraphyResource()
        self.orig_db = CouchDB(settings.couchdb_root + "orig_houtx")

    def run(self):
        """Forever: grab up to 20 jobs, look the users up in one batched
        API call, process each job, and bury the ones that fail.
        """
        while True:
            batch = []
            for attempt in xrange(20):
                # reserve blocks to wait when x is 0, but returns None for 1-19
                try:
                    reserved = self.stalk.reserve(0 if attempt else None)
                except beanstalkc.DeadlineSoon:
                    break
                if reserved is None:
                    break
                batch.append(reserved)
            bodies = [LookupJobBody.from_job(item) for item in batch]
            users = self.twitter.user_lookup([b._id for b in bodies])
            logging.info("looking at %r" % [getattr(u, 'screen_name', '') for u in users])
            for job, body, user in zip(batch, bodies, users):
                if user is None:
                    continue
                try:
                    if self.twitter.remaining < 30:
                        pause = self.twitter.reset_time - datetime.utcnow()
                        logging.info("goodnight for %r", pause)
                        time.sleep(pause.seconds)
                    logging.info("look at %s", user.screen_name)
                    if user._id in User.database or user._id in self.orig_db:
                        job.delete()
                        continue
                    self.crawl_user(user)
                    user.save()
                    job.delete()
                except:
                    logging.exception("exception for job %s" % job.body)
                    job.bury()
            logging.info("api calls remaining: %d", self.twitter.remaining)

    def crawl_user(self, user):
        """Guess locality; for definitely-local public users, pull their
        relationships and recent tweets and hand off to score_new_users.
        """
        user.local_prob = guess_location(user, self.gisgraphy)
        if user.local_prob != 1.0 or user.protected:
            return
        rels = None
        tweets = None
        if user.followers_count > 0 and user.friends_count > 0:
            rels = self.twitter.get_relationships(user._id)
            rels.attempt_save()
        if user.statuses_count > 0:
            tweets = self.twitter.user_timeline(user._id, since_id=settings.min_tweet_id)
            for tweet in tweets:
                tweet.attempt_save()
            if tweets:
                user.next_crawl_date = datetime.utcnow()
                user.last_crawl_date = datetime.utcnow()
                user.tweets_per_hour = settings.tweets_per_hour
                user.last_tid = tweets[0]._id
        user.lookup_done = True
        if user.local_prob == 1.0:
            self.score_new_users(user, rels, tweets)

    def score_new_users(self, user, rels, tweets):
        """Queue lookup jobs for reciprocal friends and mentioned users,
        scored by RFRIEND_POINTS and MENTION_POINTS.
        """
        pending = defaultdict(LookupJobBody)
        pending[user._id].done = True
        if rels:
            rfriends = rels.rfriends()
            if len(rfriends) < RFRIEND_POINTS:
                for friend_id in rfriends:
                    pending[friend_id].rfriends_score = RFRIEND_POINTS / len(rfriends)
        if tweets:
            mention_counts = defaultdict(int)
            for tweet in tweets:
                for uid in tweet.mentions:
                    mention_counts[uid] += 1
            for target, count in mention_counts.iteritems():
                score = count * MENTION_POINTS
                if score > 0:
                    pending[target].mention_score = score
        for uid, job_body in pending.iteritems():
            job_body._id = uid
            job_body.put(self.stalk)