Exemplo n.º 1
0
 def __init__(self):
     """Initialise the base class as "lookup" and load any saved scores.

     Scores are read from ``settings.lookup_in`` only when that setting is
     truthy. ``self.lookups`` starts at whatever ``count_lookups()`` reports
     for the loaded table, and ``self.halt`` starts out False.
     """
     LocalProc.__init__(self, "lookup")
     table = Scores()
     saved_path = settings.lookup_in
     if saved_path:
         table.read(saved_path)
     self.scores = table
     self.lookups = table.count_lookups()
     self.halt = False
Exemplo n.º 2
0
def _users_from_scores():
    """Yield every uid in the saved scores whose log score is at least 11.

    The score table is read from ``settings.lookup_out``.
    """
    table = Scores()
    table.read(settings.lookup_out)
    for user_id in table:
        _state, rfriends, mentions = table.split(user_id)
        bucket = log_score(rfriends, mentions)
        if bucket >= 11:
            yield user_id
Exemplo n.º 3
0
def force_lookup(to_db="hou",start_id='',end_id=None):
    """Call user_lookup on stored users that pass the filters below.

    Users come from '_all_docs' of the current ``Model.database`` over the
    key range 'U'+start_id up to 'U'+end_id (or 'V' when end_id is falsy).
    ``Model.database`` is then replaced with ``connect(to_db)``.

    A user is skipped when any of these holds, checked in this order:
    lookup already done, protected, no entry in the saved scores,
    local_prob == 1, local_prob == 0 with a geonames place name outside
    ("Texas", "United States"), or the id already exists in "houtx".
    The remaining users are looked up only if their log score reaches the
    threshold that applies to their utc_offset.
    """
    first_key = 'U' + start_id
    last_key = 'U' + end_id if end_id else 'V'
    user_view = Model.database.paged_view(
        '_all_docs', include_docs=True, startkey=first_key, endkey=last_key)
    users = (User(row['doc']) for row in user_view)
    Model.database = connect(to_db)

    # Ids already stored in "houtx" are excluded further down.
    found_view = connect("houtx").paged_view(
        '_all_docs', startkey=first_key, endkey=last_key)
    found = set(row['id'] for row in found_view)

    scores = Scores()
    scores.read(settings.lookup_out)
    region = ("Texas", "United States")

    for user in users:
        int_uid = as_int_id(user._id)
        # Guards run in the same order as the original short-circuit chain,
        # so geonames_place is only touched when local_prob == 0.
        if user.lookup_done or user.protected:
            continue
        if int_uid not in scores:
            continue
        if user.local_prob == 1:
            continue
        if user.local_prob == 0 and user.geonames_place.name not in region:
            continue
        if user._id in found:
            continue

        _state, rfs, ats = scores.split(int_uid)
        if user.utc_offset == -21600:
            below_cutoff = log_score(rfs, ats, .9) < 1
        else:
            below_cutoff = log_score(rfs, ats) < settings.non_local_cutoff
        if not below_cutoff:
            user_lookup(user)
Exemplo n.º 4
0
 def __init__(self):
     """Set up the "lookup" process state.

     Builds a ``Scores`` table, filling it from ``settings.lookup_in`` when
     that setting is truthy, then records the table's ``count_lookups()``
     in ``self.lookups`` and clears the ``self.halt`` flag.
     """
     LocalProc.__init__(self, "lookup")
     loaded = Scores()
     infile = settings.lookup_in
     if infile:
         loaded.read(infile)
     self.scores = loaded
     self.lookups = loaded.count_lookups()
     self.halt = False
Exemplo n.º 5
0
def analyze():
    "Find out how the scoring algorithm did."
    scores = Scores()
    scores.read(settings.lookup_out)
    local_db = CouchDB('http://127.0.0.1:5984/hou',True)
    local_view = local_db.paged_view('_all_docs',startkey='U',endkey='V')
    local_users = set(r['id'] for r in local_view)

    locs = (-1,0,.5,1)
    weights =(.1,.3,.5,.7,.9)
    counts = dict(
        (score, dict(
            (loc, dict(
                (weight,0)
                for weight in weights))
            for loc in locs))
        for score in xrange(BUCKETS))
    

    for user in all_users():
        if user['doc'].get('utco')!=-21600:
            continue
        state, rfs, ats = scores.split(as_int_id(user['id']))
        if user['id'] in local_users:
            loc = 1
        else:
            try:
                loc = .5 if user['doc']['prob']==.5 else 0
            except ResourceNotFound:
                loc = -1

        for weight in weights:
            score = log_score(rfs,ats,weight)
            counts[score][loc][weight]+=1

    print "todo\t\t\t\t\tnon\t\t\t\t\tunk\t\t\t\t\tlocal"
    for score in xrange(BUCKETS):
        for loc in locs:
            for weight in weights:
                print "%d\t"%counts[score][loc][weight],
        print
Exemplo n.º 6
0
def analyze():
    "Find out how the scoring algorithm did."
    scores = Scores()
    scores.read(settings.lookup_out)
    local_db = CouchDB('http://127.0.0.1:5984/hou', True)
    local_view = local_db.paged_view('_all_docs', startkey='U', endkey='V')
    local_users = set(r['id'] for r in local_view)

    locs = (-1, 0, .5, 1)
    weights = (.1, .3, .5, .7, .9)
    counts = dict((score,
                   dict((loc, dict((weight, 0) for weight in weights))
                        for loc in locs)) for score in xrange(BUCKETS))

    for user in all_users():
        if user['doc'].get('utco') != -21600:
            continue
        state, rfs, ats = scores.split(as_int_id(user['id']))
        if user['id'] in local_users:
            loc = 1
        else:
            try:
                loc = .5 if user['doc']['prob'] == .5 else 0
            except ResourceNotFound:
                loc = -1

        for weight in weights:
            score = log_score(rfs, ats, weight)
            counts[score][loc][weight] += 1

    print "todo\t\t\t\t\tnon\t\t\t\t\tunk\t\t\t\t\tlocal"
    for score in xrange(BUCKETS):
        for loc in locs:
            for weight in weights:
                print "%d\t" % counts[score][loc][weight],
        print
Exemplo n.º 7
0
class LookupMaster(LocalProc):
    def __init__(self):
        LocalProc.__init__(self,"lookup")
        self.scores = Scores()
        if settings.lookup_in:
            self.scores.read(settings.lookup_in)
        self.lookups = self.scores.count_lookups()
        self.halt = False

    def run(self):
        print "starting lookup"
        logging.info("started lookup")
        try:
            while not self.halt:
                ready = self.stalk.stats_tube(self.stalk.using())['current-jobs-ready']
                logging.info("ready is %d",ready)
                if ready<1000:
                    cutoff = self.calc_cutoff()

                    if cutoff==0:
                        self.halt=True
                        print "halt because cutoff is 0"
                        break
                    logging.info("pick_users with score %d", cutoff)
                    self.pick_users(max(settings.min_cutoff,cutoff))
                    print "scores:%d lookups:%d"%(len(self.scores),self.lookups)

                logging.info("read_scores")
                self.read_scores()
        except:
            logging.exception("exception caused HALT")
        self.read_scores()
        self.scores.dump(settings.lookup_out)
        print "Lookup is done!"

    def read_scores(self):
        job = None
        stop = 10000000 if self.halt else 100000
        for x in xrange(stop):
            try:
                job = self.stalk.reserve(120)
                if job is None:
                    logging.info("loaded %d scores",x)
                    return
                if job.body=="halt":
                    self.halt=True
                    print "starting to halt..."
                    logging.info("starting to halt...")
                    job.delete()
                    return
                body = LookupJobBody.from_job(job)
                if body.done:
                    self.scores.set_state(as_int_id(body._id), scoredict.DONE)
                else:
                    self.scores.increment(
                        as_int_id(body._id),
                        body.rfriends_score,
                        body.mention_score
                    )
                job.delete()
            except:
                logging.exception("exception in read_scores caused HALT")
                self.halt = True
                if job:
                    job.bury()
                return

    def calc_cutoff(self):
        self.stats = [0 for x in xrange(BUCKETS)]
        for u in self.scores:
            state, rfs, ats = self.scores.split(u)
            if state==scoredict.NEW:
                self.stats[log_score(rfs,ats)]+=1
        for count,score in zip(self.stats,xrange(BUCKETS)):
            logging.info("%d %d",score,count)
        total = 0
        for i in xrange(BUCKETS-1,-1,-1):
            total+=self.stats[i]
            if total > settings.crawl_ratio*(len(self.scores)-self.lookups):
                return i
        return 0

    def pick_users(self, cutoff):
        for uid in self.scores:
            state, rfs, ats = self.scores.split(uid)
            if state==scoredict.NEW and log_score(rfs,ats) >= cutoff:
                job = LookupJobBody(
                    _id=as_local_id('U',uid),
                    rfriends_score=rfs,
                    mention_score=ats,
                )
                job.put(self.stalk)
                self.scores.set_state(uid, scoredict.LOOKUP)
                self.lookups+=1
Exemplo n.º 8
0
class LookupMaster(LocalProc):
    def __init__(self):
        LocalProc.__init__(self,"lookup")
        self.scores = Scores()
        if settings.lookup_in:
            self.scores.read(settings.lookup_in)
        self.lookups = self.scores.count_lookups()
        self.halt = False

    def run(self):
        print "starting lookup"
        logging.info("started lookup")
        try:
            while not self.halt:
                tube = self.stalk.using()
                ready = self.stalk.stats_tube(tube)['current-jobs-ready']
                logging.info("ready is %d",ready)
                if ready<1000:
                    cutoff = self.calc_cutoff()
                    old_lookups = self.lookups

                    if cutoff<settings.min_cutoff and self.lookups>100:
                        self.pick_users(settings.min_cutoff)
                    else:
                        self.pick_users(cutoff)
                    print "scores:%d lookups:%d"%(len(self.scores),self.lookups)
                    if old_lookups == self.lookups:
                        print "halt because no new lookups"
                        self.halt=True
                        self.read_scores()
                        self.force_lookup()

                logging.info("read_scores")
                self.read_scores()
        except:
            logging.exception("exception caused HALT")
        self.read_scores()
        self.scores.dump(settings.lookup_out)
        print "Lookup is done!"

    def read_scores(self):
        job = None
        stop = 10000000 if self.halt else 100000
        for x in xrange(stop):
            try:
                job = self.stalk.reserve(35)
                if job is None:
                    logging.info("loaded %d scores",x)
                    return
                if job.body=="halt":
                    self.halt=True
                    print "starting to halt..."
                    logging.info("starting to halt...")
                    job.delete()
                    return
                body = LookupJobBody.from_job(job)
                if body.done:
                    self.scores.set_state(body._id, scoredict.DONE)
                else:
                    self.scores.increment(
                        body._id,
                        body.rfriends_score,
                        body.mention_score
                    )
                job.delete()
            except:
                logging.exception("exception in read_scores caused HALT")
                self.halt = True
                if job:
                    job.bury()
                return

    def calc_cutoff(self):
        self.stats = [0 for x in xrange(BUCKETS)]
        for u in self.scores:
            state, rfs, ats = self.scores.split(u)
            if state==scoredict.NEW:
                self.stats[log_score(rfs,ats)]+=1
        for count,score in zip(self.stats,xrange(BUCKETS)):
            logging.info("%d %d",score,count)
        total = 0
        for i in xrange(BUCKETS-1,-1,-1):
            total+=self.stats[i]
            if total > settings.crawl_ratio*(len(self.scores)-self.lookups):
                return i
        return 0

    def pick_users(self, cutoff):
        logging.info("pick_users with score %d", cutoff)
        for uid in self.scores:
            state, rfs, ats = self.scores.split(uid)
            if state==scoredict.NEW and log_score(rfs,ats) >= cutoff:
                self._send_job(uid,rfs,ats)
                self.lookups+=1

    def _send_job(self, uid, rfs, ats, force=None):
        job = LookupJobBody(
            _id=uid,
            rfriends_score=rfs,
            mention_score=ats,
            force=force
        )
        job.put(self.stalk)
        self.scores.set_state(uid, scoredict.LOOKUP)

    def force_lookup(self):
        "Lookup users who were not included in the original crawl."
        for user in User.get_all():
            if (    user.lookup_done or
                    user.protected or
                    user._id not in self.scores or
                    user.local_prob==1
               ):
                continue

            state, rfs, ats = self.scores.split(user._id)
            reasons = [
                user.utc_offset == settings.utc_offset,
                log_score(rfs,ats) >= settings.non_local_cutoff,
                user.local_prob == .5,
            ]
            if sum(reasons)>=2:
                logging.info("force %s - %d for %r", user.screen_name, user._id, reasons)
                self._send_job(user._id,rfs,ats,True)
Exemplo n.º 9
0
class LookupMaster(LocalProc):
    def __init__(self):
        LocalProc.__init__(self, "lookup")
        self.scores = Scores()
        if settings.lookup_in:
            self.scores.read(settings.lookup_in)
        self.lookups = self.scores.count_lookups()
        self.halt = False

    def run(self):
        print "starting lookup"
        logging.info("started lookup")
        try:
            while not self.halt:
                ready = self.stalk.stats_tube(
                    self.stalk.using())['current-jobs-ready']
                logging.info("ready is %d", ready)
                if ready < 1000:
                    cutoff = self.calc_cutoff()

                    if cutoff == 0:
                        self.halt = True
                        print "halt because cutoff is 0"
                        break
                    logging.info("pick_users with score %d", cutoff)
                    self.pick_users(max(settings.min_cutoff, cutoff))
                    print "scores:%d lookups:%d" % (len(
                        self.scores), self.lookups)

                logging.info("read_scores")
                self.read_scores()
        except:
            logging.exception("exception caused HALT")
        self.read_scores()
        self.scores.dump(settings.lookup_out)
        print "Lookup is done!"

    def read_scores(self):
        job = None
        stop = 10000000 if self.halt else 100000
        for x in xrange(stop):
            try:
                job = self.stalk.reserve(120)
                if job is None:
                    logging.info("loaded %d scores", x)
                    return
                if job.body == "halt":
                    self.halt = True
                    print "starting to halt..."
                    logging.info("starting to halt...")
                    job.delete()
                    return
                body = LookupJobBody.from_job(job)
                if body.done:
                    self.scores.set_state(as_int_id(body._id), scoredict.DONE)
                else:
                    self.scores.increment(as_int_id(body._id),
                                          body.rfriends_score,
                                          body.mention_score)
                job.delete()
            except:
                logging.exception("exception in read_scores caused HALT")
                self.halt = True
                if job:
                    job.bury()
                return

    def calc_cutoff(self):
        self.stats = [0 for x in xrange(BUCKETS)]
        for u in self.scores:
            state, rfs, ats = self.scores.split(u)
            if state == scoredict.NEW:
                self.stats[log_score(rfs, ats)] += 1
        for count, score in zip(self.stats, xrange(BUCKETS)):
            logging.info("%d %d", score, count)
        total = 0
        for i in xrange(BUCKETS - 1, -1, -1):
            total += self.stats[i]
            if total > settings.crawl_ratio * (len(self.scores) -
                                               self.lookups):
                return i
        return 0

    def pick_users(self, cutoff):
        for uid in self.scores:
            state, rfs, ats = self.scores.split(uid)
            if state == scoredict.NEW and log_score(rfs, ats) >= cutoff:
                job = LookupJobBody(
                    _id=as_local_id('U', uid),
                    rfriends_score=rfs,
                    mention_score=ats,
                )
                job.put(self.stalk)
                self.scores.set_state(uid, scoredict.LOOKUP)
                self.lookups += 1