Пример #1
0
class CrawlSlave(LocalProc):
    """Worker that crawls the timelines of users whose ids arrive on a queue.

    Ids are read from ``todo``; every id whose crawl succeeds is put on
    ``done``.
    """

    def __init__(self, slave_id, todo, done):
        """Create the worker.

        slave_id -- identifier passed through to ``LocalProc.__init__``
        todo -- queue-like object supplying user ids (``get``/``task_done``)
        done -- queue-like object receiving crawled user ids (``put``)
        """
        LocalProc.__init__(self, 'crawl', slave_id)
        self.twitter = TwitterResource()
        self.todo = todo
        self.done = done

    def run(self):
        """Crawl users from ``todo`` until the global ``HALT`` becomes true.

        Any exception raised while handling one user is logged and the loop
        moves on to the next id.
        """
        Tweet.database = CouchDB(settings.couchdb_root + "hou_new_tweet", True)
        while not HALT:
            user = None
            try:
                uid = self.todo.get()
                try:
                    user = User.get_id(uid)
                    self.crawl(user)
                    self.done.put(uid)
                finally:
                    # Balance every successful get(), even when the crawl
                    # fails; otherwise the queue's unfinished-task count
                    # never reaches zero and anything joining it hangs.
                    self.todo.task_done()
                if self.twitter.remaining < 10:
                    self._sleep_until_reset()
            except Exception:
                if user is not None:
                    logging.exception("exception for user %s", user.to_d())
                else:
                    logging.exception("exception and user is None")
            logging.info("api calls remaining: %d", self.twitter.remaining)
        print("slave is done")

    def _sleep_until_reset(self):
        """Sleep until ``self.twitter.reset_time``; return at once if it has passed."""
        dt = self.twitter.reset_time - datetime.utcnow()
        logging.info("goodnight for %r", dt)
        # timedelta.seconds is always within [0, 86400): it drops whole days
        # and turns a slightly negative delta into almost a full day.  Build
        # the signed total by hand and never sleep for a negative duration.
        wait = dt.days * 24 * 3600 + dt.seconds
        if wait > 0:
            time.sleep(wait)

    def crawl(self, user):
        """Save new tweets for ``user`` and schedule the next crawl.

        Updates ``last_tid`` (when tweets were returned), ``tweets_per_hour``,
        ``next_crawl_date`` and ``last_crawl_date`` on ``user``, then saves it.
        """
        logging.debug("visiting %s - %s", user._id, user.screen_name)
        tweets = self.twitter.save_timeline(user._id, user.last_tid)
        if tweets:
            user.last_tid = tweets[0]._id
        now = datetime.utcnow()
        last = user.last_crawl_date
        if last is None:
            # Fallback start date for users that were never crawled.
            last = datetime(2010, 11, 12)
        delta = now - last
        # Clamp to one second so a zero or negative interval cannot cause a
        # division by zero below.
        seconds = max(delta.seconds + delta.days * 24 * 3600, 1)
        fetched = len(tweets) if tweets else 0
        # New rate is the mean of the rate just observed and the stored rate.
        tph = (3600.0 * fetched / seconds + user.tweets_per_hour) / 2
        user.tweets_per_hour = tph
        if tph > 0:
            hours = min(settings.tweets_per_crawl / tph, settings.max_hours)
        else:
            # No measurable activity: wait the maximum allowed interval
            # instead of dividing by zero.
            hours = settings.max_hours
        user.next_crawl_date = now + timedelta(hours=hours)
        user.last_crawl_date = now
        user.save()
Пример #2
0
class CrawlSlave(LocalProc):
    """Crawl worker: reads user ids from ``todo``, crawls each user's
    timeline, and puts the id on ``done`` once the crawl has succeeded.
    """

    def __init__(self, slave_id, todo, done):
        """Store the two queues and create the Twitter client.

        slave_id -- identifier passed through to ``LocalProc.__init__``
        todo -- source of user ids; must offer ``get()`` and ``task_done()``
        done -- sink for crawled user ids; must offer ``put()``
        """
        LocalProc.__init__(self, 'crawl', slave_id)
        self.twitter = TwitterResource()
        self.todo = todo
        self.done = done

    def run(self):
        """Main loop; runs until the global ``HALT`` is true.

        Errors for a single user are logged and do not stop the loop.
        """
        Tweet.database = CouchDB(settings.couchdb_root + "hou_new_tweet", True)
        while not HALT:
            user = None
            try:
                uid = self.todo.get()
                try:
                    user = User.get_id(uid)
                    self.crawl(user)
                    self.done.put(uid)
                finally:
                    # A failed crawl must still be marked as finished, or the
                    # queue's unfinished-task counter stays elevated forever.
                    self.todo.task_done()
                if self.twitter.remaining < 10:
                    dt = self.twitter.reset_time - datetime.utcnow()
                    logging.info("goodnight for %r", dt)
                    # dt.seconds alone is wrong for negative deltas (it wraps
                    # to nearly 24h) and for deltas longer than a day, so use
                    # the signed total and skip the sleep when it is not
                    # positive.
                    wait = dt.days * 24 * 3600 + dt.seconds
                    if wait > 0:
                        time.sleep(wait)
            except Exception:
                if user is not None:
                    logging.exception("exception for user %s", user.to_d())
                else:
                    logging.exception("exception and user is None")
            logging.info("api calls remaining: %d", self.twitter.remaining)
        print("slave is done")

    def crawl(self, user):
        """Save the user's new tweets and work out when to crawl again.

        Writes ``last_tid`` (if any tweets came back), ``tweets_per_hour``,
        ``next_crawl_date`` and ``last_crawl_date`` on ``user`` and saves it.
        """
        logging.debug("visiting %s - %s", user._id, user.screen_name)
        tweets = self.twitter.save_timeline(user._id, user.last_tid)
        if tweets:
            user.last_tid = tweets[0]._id
        tweet_count = len(tweets) if tweets else 0

        now = datetime.utcnow()
        if user.last_crawl_date is not None:
            last = user.last_crawl_date
        else:
            # Users without a previous crawl are measured from this date.
            last = datetime(2010, 11, 12)
        delta = now - last
        # At least one second, so the rate computation cannot divide by zero.
        seconds = max(delta.seconds + delta.days * 24 * 3600, 1)

        # Average the freshly observed hourly rate with the stored one.
        tph = (3600.0 * tweet_count / seconds + user.tweets_per_hour) / 2
        user.tweets_per_hour = tph

        # A non-positive rate would divide by zero (or schedule in the past);
        # fall back to the longest permitted interval.
        if tph > 0:
            hours = min(settings.tweets_per_crawl / tph, settings.max_hours)
        else:
            hours = settings.max_hours
        user.next_crawl_date = now + timedelta(hours=hours)
        user.last_crawl_date = now
        user.save()
Пример #3
0
 def __init__(self, slave_id, todo, done):
     """Register as a 'crawl' process, create the Twitter client, and keep
     references to the ``todo`` and ``done`` objects handed in by the caller.
     """
     LocalProc.__init__(self, 'crawl', slave_id)
     self.twitter = TwitterResource()
     self.todo, self.done = todo, done
Пример #4
0
 def __init__(self, slave_id, todo, done):
     """Initialise the base class under the name 'crawl', build a
     TwitterResource, and store ``todo`` and ``done`` on the instance.
     """
     LocalProc.__init__(self, 'crawl', slave_id)
     self.twitter = TwitterResource()
     self.todo, self.done = todo, done
Пример #5
0
 def __init__(self, slave_id):
     """Register as a 'lookup' process and create the Twitter client, the
     Gisgraphy client and a handle to the "orig_houtx" CouchDB database.
     """
     LocalProc.__init__(self, 'lookup', slave_id)
     self.twitter = TwitterResource()
     self.gisgraphy = GisgraphyResource()
     orig_db_url = settings.couchdb_root + "orig_houtx"
     self.orig_db = CouchDB(orig_db_url)
Пример #6
0
class LookupSlave(LocalProc):
    """Worker that reserves lookup jobs, fetches the matching Twitter
    profiles, crawls users not yet stored, and queues follow-up jobs for
    the users they are connected to.
    """

    def __init__(self, slave_id):
        """Create the API clients and open the "orig_houtx" database.

        slave_id -- identifier passed through to ``LocalProc.__init__``
        """
        LocalProc.__init__(self, 'lookup', slave_id)
        self.twitter = TwitterResource()
        self.gisgraphy = GisgraphyResource()
        self.orig_db = CouchDB(settings.couchdb_root + "orig_houtx")

    def run(self):
        """Loop forever: reserve up to 20 jobs, look the users up in one
        call, then process each user in turn.

        A job is deleted once its user was handled (or is already stored) and
        buried when handling raised an exception.
        """
        while True:
            jobs = self._reserve_batch(20)
            bodies = [LookupJobBody.from_job(j) for j in jobs]
            # NOTE(review): the zip below assumes user_lookup returns one
            # entry per requested id, in order, with None for missing
            # profiles - confirm against TwitterResource.
            users = self.twitter.user_lookup([b._id for b in bodies])

            logging.info(
                "looking at %r",
                [getattr(u, 'screen_name', '') for u in users])
            for job, user in zip(jobs, users):
                if user is None:
                    # NOTE(review): this job is neither deleted nor buried,
                    # so it presumably stays reserved until beanstalk times
                    # it out - confirm that this is intended.
                    continue
                try:
                    if self.twitter.remaining < 30:
                        self._sleep_until_reset()
                    logging.info("look at %s", user.screen_name)
                    if user._id in User.database or user._id in self.orig_db:
                        job.delete()
                        continue
                    self.crawl_user(user)
                    user.save()
                    job.delete()
                except Exception:
                    # Not a bare except: KeyboardInterrupt and SystemExit must
                    # be able to stop the worker instead of burying a job.
                    logging.exception("exception for job %s", job.body)
                    job.bury()
            logging.info("api calls remaining: %d", self.twitter.remaining)

    def _reserve_batch(self, limit):
        """Reserve at most ``limit`` jobs from ``self.stalk`` and return them."""
        # NOTE(review): self.stalk is not assigned in this class; presumably
        # LocalProc provides it.
        jobs = []
        for x in xrange(limit):
            # reserve blocks to wait when x is 0, but returns None afterwards
            try:
                j = self.stalk.reserve(0 if x else None)
            except beanstalkc.DeadlineSoon:
                break
            if j is None:
                break
            jobs.append(j)
        return jobs

    def _sleep_until_reset(self):
        """Sleep until ``self.twitter.reset_time``; return at once if it has passed."""
        dt = self.twitter.reset_time - datetime.utcnow()
        logging.info("goodnight for %r", dt)
        # timedelta.seconds is always within [0, 86400): it drops whole days
        # and turns a slightly negative delta into almost a full day.  Use
        # the signed total and never sleep for a negative duration.
        wait = dt.days * 24 * 3600 + dt.seconds
        if wait > 0:
            time.sleep(wait)

    def crawl_user(self, user):
        """Estimate the user's location and, for users with probability 1.0
        that are not protected, save relationships and tweets and score the
        users they are connected to.

        The caller is responsible for saving ``user`` afterwards.
        """
        user.local_prob = guess_location(user, self.gisgraphy)
        if user.local_prob != 1.0 or user.protected:
            return
        rels = None
        tweets = None
        if user.followers_count > 0 and user.friends_count > 0:
            rels = self.twitter.get_relationships(user._id)
            rels.attempt_save()

        if user.statuses_count > 0:
            tweets = self.twitter.user_timeline(
                user._id, since_id=settings.min_tweet_id)
            for tweet in tweets:
                tweet.attempt_save()
        if tweets:
            # Seed the crawl schedule so the user is due for a crawl now.
            user.next_crawl_date = datetime.utcnow()
            user.last_crawl_date = datetime.utcnow()
            user.tweets_per_hour = settings.tweets_per_hour
            user.last_tid = tweets[0]._id

        user.lookup_done = True
        # local_prob is known to equal 1.0 here (see the early return above),
        # so the scoring step needs no further check.
        self.score_new_users(user, rels, tweets)

    def score_new_users(self, user, rels, tweets):
        """Queue one LookupJobBody per related user.

        ``user`` itself gets a body marked ``done``. Users returned by
        ``rels.rfriends()`` get an ``rfriends_score`` and users mentioned in
        ``tweets`` get a ``mention_score``. Every body is then put on
        ``self.stalk``.
        """
        jobs = defaultdict(LookupJobBody)
        jobs[user._id].done = True

        if rels:
            rfriends = rels.rfriends()
            # The lower bound keeps the hoisted division safe when the
            # collection is empty.
            if 0 < len(rfriends) < RFRIEND_POINTS:
                score = RFRIEND_POINTS / len(rfriends)
                for u in rfriends:
                    jobs[u].rfriends_score = score

        if tweets:
            # Count how often each user id is mentioned across the tweets.
            ats = defaultdict(int)
            for tweet in tweets:
                for uid in tweet.mentions:
                    ats[uid] += 1
            for u, c in ats.iteritems():
                points = c * MENTION_POINTS
                if points > 0:
                    jobs[u].mention_score = points

        for k, j in jobs.iteritems():
            j._id = k
            j.put(self.stalk)
Пример #7
0
 def __init__(self, slave_id):
     """Initialise the base class under the name 'lookup', then build the
     Twitter and Gisgraphy clients and open the "orig_houtx" database.
     """
     LocalProc.__init__(self, 'lookup', slave_id)
     self.twitter = TwitterResource()
     self.gisgraphy = GisgraphyResource()
     orig_db_url = settings.couchdb_root + "orig_houtx"
     self.orig_db = CouchDB(orig_db_url)
Пример #8
0
class LookupSlave(LocalProc):
    """Lookup worker: takes batches of jobs from ``self.stalk``, fetches the
    Twitter profiles, crawls users that are not stored yet, and enqueues
    scored follow-up jobs.
    """

    def __init__(self, slave_id):
        """Build the Twitter and Gisgraphy clients and open "orig_houtx".

        slave_id -- identifier passed through to ``LocalProc.__init__``
        """
        LocalProc.__init__(self, 'lookup', slave_id)
        self.twitter = TwitterResource()
        self.gisgraphy = GisgraphyResource()
        self.orig_db = CouchDB(settings.couchdb_root + "orig_houtx")

    def run(self):
        """Endless loop: reserve up to 20 jobs, look their users up in a
        single call, and handle each user.

        Successfully handled and already-known users have their job deleted;
        a job whose handling raises is buried.
        """
        while True:
            # NOTE(review): self.stalk is not assigned in this class;
            # presumably LocalProc provides it.
            jobs = []
            for x in xrange(20):
                # reserve blocks to wait when x is 0, but returns None for 1-19
                try:
                    j = self.stalk.reserve(0 if x else None)
                except beanstalkc.DeadlineSoon:
                    break
                if j is None:
                    break
                jobs.append(j)

            bodies = [LookupJobBody.from_job(j) for j in jobs]
            # NOTE(review): pairing jobs with users below assumes user_lookup
            # keeps the order of the ids and yields None for unknown ones -
            # confirm against TwitterResource.
            users = self.twitter.user_lookup([b._id for b in bodies])

            logging.info("looking at %r",
                         [getattr(u, 'screen_name', '') for u in users])
            for job, user in zip(jobs, users):
                if user is None:
                    # NOTE(review): the job is left untouched (not deleted,
                    # not buried); verify this is the intended handling.
                    continue
                try:
                    if self.twitter.remaining < 30:
                        dt = self.twitter.reset_time - datetime.utcnow()
                        logging.info("goodnight for %r", dt)
                        # dt.seconds alone wraps to nearly 24h for a negative
                        # delta and drops whole days; use the signed total
                        # and skip the sleep when nothing is left to wait.
                        wait = dt.days * 24 * 3600 + dt.seconds
                        if wait > 0:
                            time.sleep(wait)
                    logging.info("look at %s", user.screen_name)
                    if user._id in User.database or user._id in self.orig_db:
                        job.delete()
                        continue
                    self.crawl_user(user)
                    user.save()
                    job.delete()
                except Exception:
                    # Deliberately not a bare except, so KeyboardInterrupt
                    # and SystemExit still terminate the worker.
                    logging.exception("exception for job %s", job.body)
                    job.bury()
            logging.info("api calls remaining: %d", self.twitter.remaining)

    def crawl_user(self, user):
        """Set ``user.local_prob`` and, when it is 1.0 and the user is not
        protected, save relationships and tweets, fill in the crawl fields
        and score connected users.

        ``user`` is modified in place but not saved here.
        """
        user.local_prob = guess_location(user, self.gisgraphy)
        if user.local_prob != 1.0 or user.protected:
            return
        rels = None
        tweets = None
        if user.followers_count > 0 and user.friends_count > 0:
            rels = self.twitter.get_relationships(user._id)
            rels.attempt_save()

        if user.statuses_count > 0:
            tweets = self.twitter.user_timeline(
                user._id, since_id=settings.min_tweet_id)
            for tweet in tweets:
                tweet.attempt_save()
        if tweets:
            # Both dates are set to now, so the user is due immediately.
            user.next_crawl_date = datetime.utcnow()
            user.last_crawl_date = datetime.utcnow()
            user.tweets_per_hour = settings.tweets_per_hour
            user.last_tid = tweets[0]._id

        user.lookup_done = True
        # The early return above guarantees local_prob == 1.0 at this point.
        self.score_new_users(user, rels, tweets)

    def score_new_users(self, user, rels, tweets):
        """Create and enqueue a LookupJobBody for ``user`` and for every user
        reachable through ``rels.rfriends()`` or mentioned in ``tweets``.

        ``user``'s own body is flagged ``done``; the others carry an
        ``rfriends_score`` and/or a ``mention_score``.
        """
        jobs = defaultdict(LookupJobBody)
        jobs[user._id].done = True

        if rels:
            rfriends = rels.rfriends()
            friend_count = len(rfriends)
            # Requiring at least one friend keeps the division, now done
            # once instead of per friend, safe for an empty result.
            if 0 < friend_count < RFRIEND_POINTS:
                score = RFRIEND_POINTS / friend_count
                for u in rfriends:
                    jobs[u].rfriends_score = score

        if tweets:
            # Tally mentions per user id over all tweets.
            ats = defaultdict(int)
            for tweet in tweets:
                for uid in tweet.mentions:
                    ats[uid] += 1
            for u, c in ats.iteritems():
                points = c * MENTION_POINTS
                if points > 0:
                    jobs[u].mention_score = points

        for k, j in jobs.iteritems():
            j._id = k
            j.put(self.stalk)