示例#1
0
    def map(self, items):
        """Crawl every user in *items*, yielding ``None`` once per user.

        A fresh ``TwitterResource`` is stored on ``self.twitter`` and
        ``Model.database`` is bound to the configured MongoDB before the
        first user is processed.  A failure while crawling one user is
        logged with its traceback and does not stop the loop.
        """
        # Setup happens here rather than in __init__.
        # NOTE(review): presumably because map runs in a separate worker
        # process -- confirm against SplitProcess.
        self.twitter = TwitterResource()
        Model.database = MongoDB(name=self.db_name, host=settings.db_host)

        for account in items:
            try:
                self.crawl(account)
                # Also inside the try: a failure here is logged too.
                self.twitter.sleep_if_needed()
            except Exception:
                details = account.to_d()
                logging.exception("exception for user %s" % details)
            # One placeholder result per input, whether or not it succeeded.
            yield None
示例#2
0
class CrawlProcess(SplitProcess):
    """Split process that re-crawls the timelines of users that are due.

    ``produce`` selects the users whose ``next_crawl_date`` has passed,
    ``map`` crawls each of them, and ``crawl`` saves the new tweets and
    reschedules the user from the observed tweet rate.
    """

    def __init__(self, db_name, **kwargs):
        """Remember which database to bind to.

        db_name -- name handed to ``MongoDB`` in ``produce`` and ``map``.
        kwargs  -- forwarded unchanged to ``SplitProcess.__init__``.
        """
        SplitProcess.__init__(self, **kwargs)
        self.db_name = db_name
        # Not read anywhere in this class.
        # NOTE(review): possibly used by SplitProcess or left over -- confirm.
        self.waiting = set()

    def produce(self):
        """Return the users whose ``next_crawl_date`` is before now (UTC).

        The result of ``User.find`` is returned as-is, sorted by
        ``next_crawl_date``.
        """
        Model.database = MongoDB(name=self.db_name, host=settings.db_host)
        endtime = datetime.utcnow()
        # NOTE(review): timeout=False presumably keeps the cursor from
        # expiring during a long crawl -- confirm against User.find.
        return User.find(User.next_crawl_date < endtime, sort=User.next_crawl_date, timeout=False)

    def map(self, items):
        """Crawl every user in *items*, yielding ``None`` once per user.

        Creates the Twitter client and binds ``Model.database`` first.  An
        exception raised for one user is logged with its traceback and the
        loop moves on to the next user.
        """
        self.twitter = TwitterResource()
        Model.database = MongoDB(name=self.db_name, host=settings.db_host)

        for user in items:
            try:
                self.crawl(user)
                self.twitter.sleep_if_needed()
            except Exception:
                # Let logging do the formatting; the emitted text is unchanged.
                logging.exception("exception for user %s", user.to_d())
            yield None

    def crawl(self, user):
        """Save the user's new tweets and schedule the next crawl.

        The new ``tweets_per_hour`` is the mean of the stored value and the
        rate observed since the last crawl.  The next crawl is scheduled so
        that roughly ``settings.tweets_per_crawl`` tweets are expected by
        then, capped at ``settings.max_hours``.  The user is saved at the end.
        """
        logging.info("visiting %s - %s", user._id, user.screen_name)
        tweets = self.twitter.save_timeline(user._id, user.last_tid)
        if tweets:
            # NOTE(review): assumes save_timeline returns newest tweet first.
            user.last_tid = tweets[0]._id
        logging.info("saved %d for %s", len(tweets), user.screen_name)

        now = datetime.utcnow()
        last = user.last_crawl_date
        if last is None:
            # Fallback start date for users that were never crawled.
            last = datetime(2010, 11, 12)
        delta = now - last
        seconds = delta.seconds + delta.days * 24 * 3600

        if seconds > 0:
            observed = 3600.0 * len(tweets) / seconds
            tph = (observed + user.tweets_per_hour) / 2
        else:
            # No measurable interval (same second, or a last_crawl_date in
            # the future): dividing would raise or give a negative rate, so
            # keep the stored estimate.
            tph = user.tweets_per_hour
        user.tweets_per_hour = tph

        if tph > 0:
            hours = min(settings.tweets_per_crawl / tph, settings.max_hours)
        else:
            # A zero rate used to raise ZeroDivisionError, which left the
            # user unsaved and due again on every run.  Wait the maximum.
            hours = settings.max_hours
        user.next_crawl_date = now + timedelta(hours=hours)
        user.last_crawl_date = now
        user.save()
示例#3
0
 def __init__(self, slave_id):
     """Initialise the worker and create the service clients it uses.

     slave_id is handed to ``LocalProc.__init__`` together with the label
     'lookup'.
     """
     LocalProc.__init__(self, 'lookup', slave_id)
     # Twitter API client, stored for later use by the worker.
     self.twitter = TwitterResource()
     # Gisgraphy client, stored for later use by the worker.
     self.gisgraphy = GisgraphyResource()
示例#4
0
class LookupSlave(LocalProc):
    """Worker that reserves lookup jobs, fetches the matching Twitter
    profiles and crawls the ones that qualify.

    Jobs are taken from ``self.stalk``.
    NOTE(review): ``self.stalk`` is presumably the beanstalk connection set
    up by ``LocalProc`` -- confirm.
    """

    def __init__(self, slave_id):
        """Register as the 'lookup' process and create the API clients."""
        LocalProc.__init__(self, 'lookup', slave_id)
        self.twitter = TwitterResource()
        self.gisgraphy = GisgraphyResource()

    def run(self):
        """Process lookup jobs forever, in batches of up to 100.

        Every reserved job is finished one way or another: deleted when it
        is done or has no profile, buried when processing it raises.
        """
        while True:
            jobs = []
            for x in xrange(100):
                try:
                    # reserve blocks to wait when x is 0, but returns None for 1-99
                    j = self.stalk.reserve(0 if x else None)
                except beanstalkc.DeadlineSoon:
                    break
                if j is None:
                    break
                jobs.append(j)

            if not jobs:
                # DeadlineSoon on the very first reserve leaves nothing to
                # look up; don't spend an API call on an empty id list.
                continue

            bodies = [LookupJobBody.from_job(j) for j in jobs]
            ids = [b._id for b in bodies]
            try:
                users = self.twitter.user_lookup(ids)
            except ResourceNotFound:
                logging.info("no profile for %r", ids)
                # None of the ids has a profile.  Delete the jobs, as the
                # per-user "no profile" branch below does; otherwise they
                # stay reserved until their TTR expires and are retried
                # forever.
                for job in jobs:
                    job.delete()
                continue

            logging.info("looking at %r", [getattr(u, 'screen_name', '') for u in users])
            # NOTE(review): assumes user_lookup returns one entry per id, in
            # order, with None for missing profiles -- confirm.
            for job, body, user in zip(jobs, bodies, users):
                if user is None:
                    logging.info("no profile for %d", body._id)
                    job.delete()
                    continue
                try:
                    self.twitter.sleep_if_needed()
                    logging.info("look at %s", user.screen_name)
                    if (not body.force) and User.in_db(user._id):
                        job.delete()
                        continue
                    self.crawl_user(user, body.force)
                    user.save()
                    job.delete()
                except Exception:
                    # Not a bare except: KeyboardInterrupt and SystemExit
                    # must still be able to stop the worker.
                    logging.exception("exception for job %s", job.body)
                    job.bury()
            logging.info("api calls remaining: %d", self.twitter.remaining)

    def crawl_user(self, user, force):
        """Fill in *user* from a location guess, its edges and its timeline.

        ``user.local_prob`` is always set.  Nothing else happens for
        protected users, or for users whose ``local_prob`` is not 1.0
        unless *force* is true.  The caller is responsible for saving
        *user*.
        """
        user.local_prob = guess_location(user, self.gisgraphy)
        if (user.local_prob != 1.0 and not force) or user.protected:
            return
        rels = None
        tweets = None
        if user.followers_count > 0 and user.friends_count > 0:
            rels = self.twitter.get_edges(user._id)
            rels.attempt_save()

        if user.statuses_count > 0:
            tweets = self.twitter.save_timeline(user._id, last_tid=settings.min_tweet_id)
        if tweets:
            # One timestamp so both dates are exactly equal.
            now = datetime.utcnow()
            user.next_crawl_date = now
            user.last_crawl_date = now
            user.tweets_per_hour = settings.tweets_per_hour
            user.last_tid = tweets[0]._id

        user.lookup_done = True
        if user.local_prob == 1.0 and not force:
            self.score_new_users(user, rels, tweets)

    def score_new_users(self, user, rels, tweets):
        """Queue lookup jobs for accounts related to *user*.

        Each id in ``rels.rfriends()`` gets an equal share of
        ``RFRIEND_POINTS``, but only when there are fewer of them than
        ``RFRIEND_POINTS``.  Each mentioned id gets ``MENTION_POINTS`` per
        mention.  A job for *user* itself is queued with ``done`` set.
        """
        jobs = defaultdict(LookupJobBody)
        jobs[user._id].done = True

        if rels:
            rfriends = rels.rfriends()
            # The lower bound keeps the hoisted division away from an empty
            # collection; the original never divided in that case either.
            if 0 < len(rfriends) < RFRIEND_POINTS:
                share = RFRIEND_POINTS / len(rfriends)
                for u in rfriends:
                    jobs[u].rfriends_score = share

        if tweets:
            ats = defaultdict(int)
            for tweet in tweets:
                for uid in tweet.mentions:
                    ats[uid] += 1
            for u, c in ats.iteritems():
                points = c * MENTION_POINTS
                if points > 0:
                    jobs[u].mention_score = points

        for k, j in jobs.iteritems():
            j._id = k
            j.put(self.stalk)