def __init__(self):
    """Initialise the lookup process state.

    Calls ``LocalProc.__init__`` with the string "lookup", creates an
    empty ``Scores`` container, fills it from ``settings.lookup_in`` when
    that setting is truthy, and records the lookup count reported by the
    loaded scores.
    """
    LocalProc.__init__(self, "lookup")

    loaded = Scores()
    self.scores = loaded
    # Only read a previous score file when one has been configured.
    if settings.lookup_in:
        loaded.read(settings.lookup_in)

    self.lookups = loaded.count_lookups()
    # Cleared here; other code on the class sets it to request a stop.
    self.halt = False
def _users_from_scores():
    """Yield every id in the saved score file whose log score is >= 11.

    The scores are read from ``settings.lookup_out``; each entry is split
    into its parts and the two score components are passed to
    ``log_score``.
    """
    saved = Scores()
    saved.read(settings.lookup_out)
    for user_id in saved:
        # The first element of the split (the state) is not needed here.
        _state, friend_part, mention_part = saved.split(user_id)
        bucket = log_score(friend_part, mention_part)
        if bucket >= 11:
            yield user_id
def force_lookup(to_db="hou", start_id='', end_id=None):
    """Lookup users who were not included in the original crawl.

    Walks the user documents of the current ``Model.database`` in the key
    range ``'U' + start_id`` .. ``'U' + end_id`` (or ``'V'`` when no
    ``end_id`` is given), skips everyone that fails the filters below and
    calls ``user_lookup`` on the rest.

    to_db    -- name passed to ``connect``; the result replaces
                ``Model.database`` before users are processed.
    start_id -- suffix appended to 'U' to form the first key.
    end_id   -- suffix appended to 'U' to form the last key; falsy means
                the range ends at 'V'.
    """
    first_key = 'U' + start_id
    if end_id:
        last_key = 'U' + end_id
    else:
        last_key = 'V'

    # The source view is requested from the *current* Model.database; only
    # afterwards is Model.database re-pointed at to_db.  Keep this order.
    source_rows = Model.database.paged_view(
        '_all_docs', include_docs=True, startkey=first_key, endkey=last_key)
    candidates = (User(row['doc']) for row in source_rows)
    Model.database = connect(to_db)

    # Ids in the same key range of the "houtx" database are skipped below.
    found_db = connect("houtx")
    found_rows = found_db.paged_view(
        '_all_docs', startkey=first_key, endkey=last_key)
    already_found = set(row['id'] for row in found_rows)

    saved = Scores()
    saved.read(settings.lookup_out)
    region = ("Texas", "United States")

    for user in candidates:
        int_uid = as_int_id(user._id)

        # Guard clauses, evaluated in the same order as the original
        # single ``or`` chain so short-circuiting is unchanged.
        if user.lookup_done:
            continue
        if user.protected:
            continue
        if int_uid not in saved:
            continue
        if user.local_prob == 1:
            continue
        if user.local_prob == 0 and user.geonames_place.name not in region:
            continue
        if user._id in already_found:
            continue

        _state, rfs, ats = saved.split(int_uid)
        # Users with a UTC offset of -21600 are scored with weight .9 and
        # a threshold of 1; everyone else uses the default weight and
        # settings.non_local_cutoff.
        if user.utc_offset == -21600:
            below_threshold = log_score(rfs, ats, .9) < 1
        else:
            below_threshold = log_score(rfs, ats) < settings.non_local_cutoff
        if below_threshold:
            continue

        user_lookup(user)
def __init__(self):
    """Prepare a lookup process with freshly loaded scores.

    ``LocalProc.__init__`` is invoked with "lookup".  A new ``Scores``
    object is created and, if ``settings.lookup_in`` is set, populated
    from it.  ``self.lookups`` is taken from ``Scores.count_lookups`` and
    the halt flag starts out False.
    """
    LocalProc.__init__(self, "lookup")

    self.scores = Scores()
    input_path = settings.lookup_in
    if input_path:
        # Resume from an earlier score dump.
        self.scores.read(input_path)

    self.lookups = self.scores.count_lookups()
    self.halt = False
def analyze(): "Find out how the scoring algorithm did." scores = Scores() scores.read(settings.lookup_out) local_db = CouchDB('http://127.0.0.1:5984/hou',True) local_view = local_db.paged_view('_all_docs',startkey='U',endkey='V') local_users = set(r['id'] for r in local_view) locs = (-1,0,.5,1) weights =(.1,.3,.5,.7,.9) counts = dict( (score, dict( (loc, dict( (weight,0) for weight in weights)) for loc in locs)) for score in xrange(BUCKETS)) for user in all_users(): if user['doc'].get('utco')!=-21600: continue state, rfs, ats = scores.split(as_int_id(user['id'])) if user['id'] in local_users: loc = 1 else: try: loc = .5 if user['doc']['prob']==.5 else 0 except ResourceNotFound: loc = -1 for weight in weights: score = log_score(rfs,ats,weight) counts[score][loc][weight]+=1 print "todo\t\t\t\t\tnon\t\t\t\t\tunk\t\t\t\t\tlocal" for score in xrange(BUCKETS): for loc in locs: for weight in weights: print "%d\t"%counts[score][loc][weight], print
def analyze(): "Find out how the scoring algorithm did." scores = Scores() scores.read(settings.lookup_out) local_db = CouchDB('http://127.0.0.1:5984/hou', True) local_view = local_db.paged_view('_all_docs', startkey='U', endkey='V') local_users = set(r['id'] for r in local_view) locs = (-1, 0, .5, 1) weights = (.1, .3, .5, .7, .9) counts = dict((score, dict((loc, dict((weight, 0) for weight in weights)) for loc in locs)) for score in xrange(BUCKETS)) for user in all_users(): if user['doc'].get('utco') != -21600: continue state, rfs, ats = scores.split(as_int_id(user['id'])) if user['id'] in local_users: loc = 1 else: try: loc = .5 if user['doc']['prob'] == .5 else 0 except ResourceNotFound: loc = -1 for weight in weights: score = log_score(rfs, ats, weight) counts[score][loc][weight] += 1 print "todo\t\t\t\t\tnon\t\t\t\t\tunk\t\t\t\t\tlocal" for score in xrange(BUCKETS): for loc in locs: for weight in weights: print "%d\t" % counts[score][loc][weight], print
class LookupMaster(LocalProc): def __init__(self): LocalProc.__init__(self,"lookup") self.scores = Scores() if settings.lookup_in: self.scores.read(settings.lookup_in) self.lookups = self.scores.count_lookups() self.halt = False def run(self): print "starting lookup" logging.info("started lookup") try: while not self.halt: ready = self.stalk.stats_tube(self.stalk.using())['current-jobs-ready'] logging.info("ready is %d",ready) if ready<1000: cutoff = self.calc_cutoff() if cutoff==0: self.halt=True print "halt because cutoff is 0" break logging.info("pick_users with score %d", cutoff) self.pick_users(max(settings.min_cutoff,cutoff)) print "scores:%d lookups:%d"%(len(self.scores),self.lookups) logging.info("read_scores") self.read_scores() except: logging.exception("exception caused HALT") self.read_scores() self.scores.dump(settings.lookup_out) print "Lookup is done!" def read_scores(self): job = None stop = 10000000 if self.halt else 100000 for x in xrange(stop): try: job = self.stalk.reserve(120) if job is None: logging.info("loaded %d scores",x) return if job.body=="halt": self.halt=True print "starting to halt..." 
logging.info("starting to halt...") job.delete() return body = LookupJobBody.from_job(job) if body.done: self.scores.set_state(as_int_id(body._id), scoredict.DONE) else: self.scores.increment( as_int_id(body._id), body.rfriends_score, body.mention_score ) job.delete() except: logging.exception("exception in read_scores caused HALT") self.halt = True if job: job.bury() return def calc_cutoff(self): self.stats = [0 for x in xrange(BUCKETS)] for u in self.scores: state, rfs, ats = self.scores.split(u) if state==scoredict.NEW: self.stats[log_score(rfs,ats)]+=1 for count,score in zip(self.stats,xrange(BUCKETS)): logging.info("%d %d",score,count) total = 0 for i in xrange(BUCKETS-1,-1,-1): total+=self.stats[i] if total > settings.crawl_ratio*(len(self.scores)-self.lookups): return i return 0 def pick_users(self, cutoff): for uid in self.scores: state, rfs, ats = self.scores.split(uid) if state==scoredict.NEW and log_score(rfs,ats) >= cutoff: job = LookupJobBody( _id=as_local_id('U',uid), rfriends_score=rfs, mention_score=ats, ) job.put(self.stalk) self.scores.set_state(uid, scoredict.LOOKUP) self.lookups+=1
class LookupMaster(LocalProc): def __init__(self): LocalProc.__init__(self,"lookup") self.scores = Scores() if settings.lookup_in: self.scores.read(settings.lookup_in) self.lookups = self.scores.count_lookups() self.halt = False def run(self): print "starting lookup" logging.info("started lookup") try: while not self.halt: tube = self.stalk.using() ready = self.stalk.stats_tube(tube)['current-jobs-ready'] logging.info("ready is %d",ready) if ready<1000: cutoff = self.calc_cutoff() old_lookups = self.lookups if cutoff<settings.min_cutoff and self.lookups>100: self.pick_users(settings.min_cutoff) else: self.pick_users(cutoff) print "scores:%d lookups:%d"%(len(self.scores),self.lookups) if old_lookups == self.lookups: print "halt because no new lookups" self.halt=True self.read_scores() self.force_lookup() logging.info("read_scores") self.read_scores() except: logging.exception("exception caused HALT") self.read_scores() self.scores.dump(settings.lookup_out) print "Lookup is done!" def read_scores(self): job = None stop = 10000000 if self.halt else 100000 for x in xrange(stop): try: job = self.stalk.reserve(35) if job is None: logging.info("loaded %d scores",x) return if job.body=="halt": self.halt=True print "starting to halt..." 
logging.info("starting to halt...") job.delete() return body = LookupJobBody.from_job(job) if body.done: self.scores.set_state(body._id, scoredict.DONE) else: self.scores.increment( body._id, body.rfriends_score, body.mention_score ) job.delete() except: logging.exception("exception in read_scores caused HALT") self.halt = True if job: job.bury() return def calc_cutoff(self): self.stats = [0 for x in xrange(BUCKETS)] for u in self.scores: state, rfs, ats = self.scores.split(u) if state==scoredict.NEW: self.stats[log_score(rfs,ats)]+=1 for count,score in zip(self.stats,xrange(BUCKETS)): logging.info("%d %d",score,count) total = 0 for i in xrange(BUCKETS-1,-1,-1): total+=self.stats[i] if total > settings.crawl_ratio*(len(self.scores)-self.lookups): return i return 0 def pick_users(self, cutoff): logging.info("pick_users with score %d", cutoff) for uid in self.scores: state, rfs, ats = self.scores.split(uid) if state==scoredict.NEW and log_score(rfs,ats) >= cutoff: self._send_job(uid,rfs,ats) self.lookups+=1 def _send_job(self, uid, rfs, ats, force=None): job = LookupJobBody( _id=uid, rfriends_score=rfs, mention_score=ats, force=force ) job.put(self.stalk) self.scores.set_state(uid, scoredict.LOOKUP) def force_lookup(self): "Lookup users who were not included in the original crawl." for user in User.get_all(): if ( user.lookup_done or user.protected or user._id not in self.scores or user.local_prob==1 ): continue state, rfs, ats = self.scores.split(user._id) reasons = [ user.utc_offset == settings.utc_offset, log_score(rfs,ats) >= settings.non_local_cutoff, user.local_prob == .5, ] if sum(reasons)>=2: logging.info("force %s - %d for %r", user.screen_name, user._id, reasons) self._send_job(user._id,rfs,ats,True)
class LookupMaster(LocalProc): def __init__(self): LocalProc.__init__(self, "lookup") self.scores = Scores() if settings.lookup_in: self.scores.read(settings.lookup_in) self.lookups = self.scores.count_lookups() self.halt = False def run(self): print "starting lookup" logging.info("started lookup") try: while not self.halt: ready = self.stalk.stats_tube( self.stalk.using())['current-jobs-ready'] logging.info("ready is %d", ready) if ready < 1000: cutoff = self.calc_cutoff() if cutoff == 0: self.halt = True print "halt because cutoff is 0" break logging.info("pick_users with score %d", cutoff) self.pick_users(max(settings.min_cutoff, cutoff)) print "scores:%d lookups:%d" % (len( self.scores), self.lookups) logging.info("read_scores") self.read_scores() except: logging.exception("exception caused HALT") self.read_scores() self.scores.dump(settings.lookup_out) print "Lookup is done!" def read_scores(self): job = None stop = 10000000 if self.halt else 100000 for x in xrange(stop): try: job = self.stalk.reserve(120) if job is None: logging.info("loaded %d scores", x) return if job.body == "halt": self.halt = True print "starting to halt..." 
logging.info("starting to halt...") job.delete() return body = LookupJobBody.from_job(job) if body.done: self.scores.set_state(as_int_id(body._id), scoredict.DONE) else: self.scores.increment(as_int_id(body._id), body.rfriends_score, body.mention_score) job.delete() except: logging.exception("exception in read_scores caused HALT") self.halt = True if job: job.bury() return def calc_cutoff(self): self.stats = [0 for x in xrange(BUCKETS)] for u in self.scores: state, rfs, ats = self.scores.split(u) if state == scoredict.NEW: self.stats[log_score(rfs, ats)] += 1 for count, score in zip(self.stats, xrange(BUCKETS)): logging.info("%d %d", score, count) total = 0 for i in xrange(BUCKETS - 1, -1, -1): total += self.stats[i] if total > settings.crawl_ratio * (len(self.scores) - self.lookups): return i return 0 def pick_users(self, cutoff): for uid in self.scores: state, rfs, ats = self.scores.split(uid) if state == scoredict.NEW and log_score(rfs, ats) >= cutoff: job = LookupJobBody( _id=as_local_id('U', uid), rfriends_score=rfs, mention_score=ats, ) job.put(self.stalk) self.scores.set_state(uid, scoredict.LOOKUP) self.lookups += 1