示例#1
0
class LookupSlave(LocalProc):
    """Worker that reserves lookup jobs and crawls the users they refer to.

    Each pass of run() reserves up to 20 jobs from ``self.stalk``, resolves
    them in one ``self.twitter.user_lookup`` call, and crawls every returned
    user that is not already stored.  ``self.stalk`` is not assigned in this
    class; presumably LocalProc sets it up -- TODO confirm.
    """

    def __init__(self, slave_id):
        """Create the API clients and the handle to the "orig_houtx" database.

        slave_id is forwarded to LocalProc.__init__ with the label 'lookup'.
        """
        LocalProc.__init__(self, 'lookup', slave_id)
        self.twitter = TwitterResource()
        self.gisgraphy = GisgraphyResource()
        self.orig_db = CouchDB(settings.couchdb_root+"orig_houtx")

    def run(self):
        """Loop forever: reserve a batch of jobs, look the users up, crawl them.

        A job is deleted once its user has been crawled and saved, or when the
        user is already in User.database or self.orig_db.  A job whose
        processing raises an Exception is logged and buried.
        """
        while True:
            jobs = []
            for x in xrange(20):
                # The first reserve (timeout=None) blocks until a job shows
                # up; reserves 1-19 use timeout=0 and return None when the
                # queue has nothing ready.
                try:
                    j = self.stalk.reserve(0 if x else None)
                except beanstalkc.DeadlineSoon:
                    break
                if j is None:
                    break
                jobs.append(j)

            if not jobs:
                # DeadlineSoon on the very first reserve leaves the batch
                # empty; do not send an empty lookup request.
                continue

            bodies = [LookupJobBody.from_job(j) for j in jobs]
            users = self.twitter.user_lookup([b._id for b in bodies])

            logging.info("looking at %r",
                         [getattr(u, 'screen_name', '') for u in users])
            for job, user in zip(jobs, users):
                if user is None:
                    # NOTE(review): the job is neither deleted nor buried on
                    # this path, so it stays reserved -- confirm intended.
                    continue
                try:
                    if self.twitter.remaining < 30:
                        dt = self.twitter.reset_time - datetime.utcnow()
                        logging.info("goodnight for %r", dt)
                        # timedelta.seconds is only the seconds field: it
                        # drops whole days and is ~86399 when the reset time
                        # has just passed.  Use the full, clamped duration.
                        time.sleep(max(0, dt.total_seconds()))
                    logging.info("look at %s", user.screen_name)
                    if user._id in User.database or user._id in self.orig_db:
                        job.delete()
                        continue
                    self.crawl_user(user)
                    user.save()
                    job.delete()
                except Exception:
                    # Not a bare except: KeyboardInterrupt and SystemExit must
                    # still be able to stop the worker.
                    logging.exception("exception for job %s", job.body)
                    job.bury()
            logging.info("api calls remaining: %d", self.twitter.remaining)

    def crawl_user(self, user):
        """Score the user's location and, if local, fetch and store their data.

        Sets user.local_prob from guess_location().  Nothing else happens
        unless that value is exactly 1.0 and the user is not protected; in
        that case relationships and tweets are fetched and saved, crawl
        bookkeeping fields are set on the user, and follow-up jobs are
        enqueued through score_new_users().
        """
        user.local_prob = guess_location(user, self.gisgraphy)
        if user.local_prob != 1.0 or user.protected:
            return

        rels = None
        tweets = None
        if user.followers_count > 0 and user.friends_count > 0:
            rels = self.twitter.get_relationships(user._id)
            rels.attempt_save()

        if user.statuses_count > 0:
            tweets = self.twitter.user_timeline(
                user._id, since_id=settings.min_tweet_id)
            for tweet in tweets:
                tweet.attempt_save()

        if tweets:
            # Read the clock once so both dates carry the same timestamp.
            now = datetime.utcnow()
            user.next_crawl_date = now
            user.last_crawl_date = now
            user.tweets_per_hour = settings.tweets_per_hour
            user.last_tid = tweets[0]._id

        user.lookup_done = True
        # local_prob is known to be 1.0 here because of the early return.
        self.score_new_users(user, rels, tweets)

    def score_new_users(self, user, rels, tweets):
        """Enqueue a LookupJobBody for the user and for each related user id.

        The crawled user's own entry is marked done.  When rels is truthy and
        there are fewer than RFRIEND_POINTS rfriends, each one receives
        rfriends_score = RFRIEND_POINTS / len(rfriends).  When tweets is
        truthy, each mentioned id receives mention_score = mentions *
        MENTION_POINTS if that is positive.  Every entry is then put on
        self.stalk.
        """
        jobs = defaultdict(LookupJobBody)
        jobs[user._id].done = True

        if rels:
            rfriends = rels.rfriends()
            count = len(rfriends)
            # The lower bound keeps the hoisted division safe when empty.
            if 0 < count < RFRIEND_POINTS:
                share = RFRIEND_POINTS/count
                for u in rfriends:
                    jobs[u].rfriends_score = share

        if tweets:
            ats = defaultdict(int)
            for tweet in tweets:
                for uid in tweet.mentions:
                    ats[uid] += 1
            for u, c in ats.iteritems():
                points = c*MENTION_POINTS
                if points > 0:
                    jobs[u].mention_score = points

        for k, j in jobs.iteritems():
            j._id = k
            j.put(self.stalk)
示例#2
0
class LookupSlave(LocalProc):
    """Worker that reserves lookup jobs and crawls the users they refer to.

    Each pass of run() reserves up to 20 jobs from ``self.stalk``, resolves
    them in one ``self.twitter.user_lookup`` call, and crawls every returned
    user that is not already stored.  ``self.stalk`` is not assigned in this
    class; presumably LocalProc sets it up -- TODO confirm.
    """

    def __init__(self, slave_id):
        """Create the API clients and the handle to the "orig_houtx" database.

        slave_id is forwarded to LocalProc.__init__ with the label 'lookup'.
        """
        LocalProc.__init__(self, 'lookup', slave_id)
        self.twitter = TwitterResource()
        self.gisgraphy = GisgraphyResource()
        self.orig_db = CouchDB(settings.couchdb_root + "orig_houtx")

    def run(self):
        """Loop forever: reserve a batch of jobs, look the users up, crawl them.

        A job is deleted once its user has been crawled and saved, or when the
        user is already in User.database or self.orig_db.  A job whose
        processing raises an Exception is logged and buried.
        """
        while True:
            jobs = []
            for x in xrange(20):
                # The first reserve (timeout=None) blocks until a job shows
                # up; reserves 1-19 use timeout=0 and return None when the
                # queue has nothing ready.
                try:
                    j = self.stalk.reserve(0 if x else None)
                except beanstalkc.DeadlineSoon:
                    break
                if j is None:
                    break
                jobs.append(j)

            if not jobs:
                # DeadlineSoon on the very first reserve leaves the batch
                # empty; do not send an empty lookup request.
                continue

            bodies = [LookupJobBody.from_job(j) for j in jobs]
            users = self.twitter.user_lookup([b._id for b in bodies])

            logging.info("looking at %r",
                         [getattr(u, 'screen_name', '') for u in users])
            for job, user in zip(jobs, users):
                if user is None:
                    # NOTE(review): the job is neither deleted nor buried on
                    # this path, so it stays reserved -- confirm intended.
                    continue
                try:
                    if self.twitter.remaining < 30:
                        dt = self.twitter.reset_time - datetime.utcnow()
                        logging.info("goodnight for %r", dt)
                        # timedelta.seconds is only the seconds field: it
                        # drops whole days and is ~86399 when the reset time
                        # has just passed.  Use the full, clamped duration.
                        time.sleep(max(0, dt.total_seconds()))
                    logging.info("look at %s", user.screen_name)
                    if user._id in User.database or user._id in self.orig_db:
                        job.delete()
                        continue
                    self.crawl_user(user)
                    user.save()
                    job.delete()
                except Exception:
                    # Not a bare except: KeyboardInterrupt and SystemExit must
                    # still be able to stop the worker.
                    logging.exception("exception for job %s", job.body)
                    job.bury()
            logging.info("api calls remaining: %d", self.twitter.remaining)

    def crawl_user(self, user):
        """Score the user's location and, if local, fetch and store their data.

        Sets user.local_prob from guess_location().  Nothing else happens
        unless that value is exactly 1.0 and the user is not protected; in
        that case relationships and tweets are fetched and saved, crawl
        bookkeeping fields are set on the user, and follow-up jobs are
        enqueued through score_new_users().
        """
        user.local_prob = guess_location(user, self.gisgraphy)
        if user.local_prob != 1.0 or user.protected:
            return

        rels = None
        tweets = None
        if user.followers_count > 0 and user.friends_count > 0:
            rels = self.twitter.get_relationships(user._id)
            rels.attempt_save()

        if user.statuses_count > 0:
            tweets = self.twitter.user_timeline(
                user._id, since_id=settings.min_tweet_id)
            for tweet in tweets:
                tweet.attempt_save()

        if tweets:
            # Read the clock once so both dates carry the same timestamp.
            now = datetime.utcnow()
            user.next_crawl_date = now
            user.last_crawl_date = now
            user.tweets_per_hour = settings.tweets_per_hour
            user.last_tid = tweets[0]._id

        user.lookup_done = True
        # local_prob is known to be 1.0 here because of the early return.
        self.score_new_users(user, rels, tweets)

    def score_new_users(self, user, rels, tweets):
        """Enqueue a LookupJobBody for the user and for each related user id.

        The crawled user's own entry is marked done.  When rels is truthy and
        there are fewer than RFRIEND_POINTS rfriends, each one receives
        rfriends_score = RFRIEND_POINTS / len(rfriends).  When tweets is
        truthy, each mentioned id receives mention_score = mentions *
        MENTION_POINTS if that is positive.  Every entry is then put on
        self.stalk.
        """
        jobs = defaultdict(LookupJobBody)
        jobs[user._id].done = True

        if rels:
            rfriends = rels.rfriends()
            count = len(rfriends)
            # The lower bound keeps the hoisted division safe when empty.
            if 0 < count < RFRIEND_POINTS:
                share = RFRIEND_POINTS / count
                for u in rfriends:
                    jobs[u].rfriends_score = share

        if tweets:
            ats = defaultdict(int)
            for tweet in tweets:
                for uid in tweet.mentions:
                    ats[uid] += 1
            for u, c in ats.iteritems():
                points = c * MENTION_POINTS
                if points > 0:
                    jobs[u].mention_score = points

        for k, j in jobs.iteritems():
            j._id = k
            j.put(self.stalk)