def rebuild_index():
    """
    Rebuild the K-nearest neighbors index based on 50000 of the most
    active users (ignoring the top 500 most active).

    """
    pipe = get_pipeline()
    usernames = pipe.zrevrange(format_key("user"), 500, 50500).execute()[0]

    # Queue up the redis requests describing each user (eight commands
    # per user; see get_vector).
    for user in usernames:
        get_vector(user, pipe=pipe)

    # Parse the replies into a matrix with one behavior vector per row.
    results = pipe.execute()
    points = np.zeros([len(usernames), nvector])
    for i in range(len(usernames)):
        points[i, :] = parse_vector(results[8 * i:8 * (i + 1)])

    # Build the K-nearest neighbors index.
    flann = pyflann.FLANN()
    flann.build_index(points)

    # Save the index.
    fn1 = _h5_filename(index_filename)
    tmp1 = fn1 + ".tmp"
    flann.save_index(tmp1)

    # Save the index coordinates.
    fn2 = _h5_filename(points_filename)
    tmp2 = fn2 + ".tmp"
    with h5py.File(tmp2, "w") as f:
        f["points"] = points
        f["names"] = usernames

    # Atomically move the index files into place.
    shutil.move(tmp1, fn1)
    shutil.move(tmp2, fn2)
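# A usage sketch, not part of the original module: how the files written by
# rebuild_index() might be loaded and queried later. find_neighbors() is a
# hypothetical name; it assumes the same globals used above (index_filename,
# points_filename, _h5_filename) and that parse_vector() returns a
# length-nvector array.
def find_neighbors(user, num_neighbors=5):
    # Load the coordinates and usernames saved by rebuild_index().
    with h5py.File(_h5_filename(points_filename), "r") as f:
        points = f["points"][...]
        names = list(f["names"])

    # pyflann needs the original dataset when re-loading a saved index.
    flann = pyflann.FLANN()
    flann.load_index(_h5_filename(index_filename), points)

    # Build the query vector and find the nearest users.
    vector = np.atleast_2d(parse_vector(get_vector(user)))
    inds, dists = flann.nn_index(vector, num_neighbors=num_neighbors)
    return [names[int(i)] for i in inds[0]]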
def set_expire():
    pipe = get_pipeline()

    # Get the list of all keys.
    keys = pipe.keys().execute()[0]
    n = float(len(keys))
    print("Found {0:.0f} keys".format(n))

    # Loop over the keys and deal with each one.
    for i, key in enumerate(keys):
        # Skip the opt-out keys.
        if key.endswith(":optout"):
            continue

        # Deal with temporary keys.
        if any(imap(key.endswith, [":name", ":etag", ":gravatar", ":tz"])):
            pipe.expire(key, TEMP_TTL)
            continue

        # Everything else should get the default TTL.
        pipe.expire(key, DEFAULT_TTL)

        # Execute the updates in batches of 5000.
        if (i + 1) % 5000 == 0:
            print("Finished {0} keys [{1:.2f} %]".format(
                i + 1, (i + 1) / n * 100))
            pipe.execute()

    pipe.execute()
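# A standalone sketch (hypothetical demo, not part of the module) of the
# batching idiom used above: flushing every few thousand commands bounds
# the pipeline's client-side buffer, and a redis-py pipeline can keep
# being reused after execute().
def _batch_demo():
    import redis
    r = redis.StrictRedis()
    pipe = r.pipeline(transaction=False)
    for i in range(20000):
        pipe.set("demo:{0}".format(i), i)
        if (i + 1) % 5000 == 0:
            pipe.execute()  # flush this batch
    pipe.execute()          # flush the remainder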
def get_vector(user, pipe=None):
    """
    Given a username, fetch all of the data needed to build a behavior
    vector from the database.

    :param user: The GitHub username.
    :param pipe: (optional) if provided, simply add the requests to the
                 existing redis pipeline and don't execute the request.

    """
    no_pipe = False
    if pipe is None:
        pipe = get_pipeline()
        no_pipe = True

    user = user.lower()
    pipe.zscore(format_key("user"), user)
    pipe.hgetall(format_key("user:{0}:day".format(user)))
    pipe.zrevrange(format_key("user:{0}:event".format(user)), 0, -1,
                   withscores=True)
    pipe.zcard(format_key("user:{0}:contribution".format(user)))
    pipe.zcard(format_key("user:{0}:connection".format(user)))
    pipe.zcard(format_key("user:{0}:repo".format(user)))
    pipe.zcard(format_key("user:{0}:lang".format(user)))
    pipe.zrevrange(format_key("user:{0}:lang".format(user)), 0, -1,
                   withscores=True)

    if no_pipe:
        return pipe.execute()
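# A minimal usage sketch (the helper name is hypothetical): get_vector()
# always enqueues exactly eight redis commands per user, which is why
# rebuild_index() slices its pipeline results in blocks of eight
# (results[8 * i:8 * (i + 1)]). Called without a pipe, it executes
# immediately and returns those eight replies.
def _vector_for(user):
    raw = get_vector(user)     # a list of eight redis replies
    return parse_vector(raw)   # parse_vector() is defined elsewhere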
def del_connections():
    pipe = get_pipeline()

    # Get the list of all the connection keys.
    keys = pipe.keys(format_key("social:connection:*")).execute()[0]
    n = float(len(keys))
    print("Found {0:.0f} keys".format(n))

    # Queue up a delete for each key and execute them in one shot.
    for key in keys:
        pipe.delete(key)
    pipe.execute()
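# An alternative sketch using incremental SCAN instead of KEYS, which
# blocks the server while it walks the whole keyspace. This assumes the
# raw redis-py client is available (the argument here is hypothetical).
def del_connections_scan(redis_client):
    pipe = redis_client.pipeline(transaction=False)
    for key in redis_client.scan_iter(match=format_key("social:connection:*")):
        pipe.delete(key)
    pipe.execute()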
def get_repo_info(username, reponame, maxusers=5, max_recommend=5):
    if _is_robot():
        return None

    # Normalize the repository name.
    repo = "{0}/{1}".format(username, reponame)
    rkey = format_key("social:repo:{0}".format(repo))
    recommend_key = format_key("social:recommend:{0}".format(repo))

    # Get the list of users.
    pipe = get_pipeline()
    pipe.exists(format_key("user:{0}:optout".format(username.lower())))
    pipe.exists(rkey)
    pipe.exists(recommend_key)
    pipe.zrevrange(recommend_key, 0, max_recommend - 1)
    pipe.zrevrange(rkey, 0, maxusers - 1, withscores=True)
    flag0, flag1, flag2, recommendations, users = pipe.execute()
    if flag0 or not flag1:
        return None

    if not flag2:
        # Compute the repository similarities.
        [pipe.zrevrange(format_key("social:user:{0}".format(u)), 0, -1)
         for u, count in users]
        repos = pipe.execute()
        [pipe.zincrby(recommend_key, r, 1) for l in repos for r in l
         if r != repo]
        pipe.expire(recommend_key, 172800)
        pipe.zrevrange(recommend_key, 0, max_recommend - 1)
        recommendations = pipe.execute()[-1]

    # Get the contributor names.
    users = [(u, c) for u, c in users if int(c) > 1]
    [pipe.get(format_key("user:{0}:name".format(u))) for u, count in users]
    names = pipe.execute()

    return {
        "repository": repo,
        "recommendations": recommendations,
        "contributors": [{"username": u,
                          "name": n.decode("utf-8") if n is not None else u,
                          "count": int(count)}
                         for (u, count), n in zip(users, names)],
    }
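# A usage sketch with example arguments: the first call computes the
# recommendation zset and caches it for two days (the 172800-second expire
# above); later calls inside that window reuse the cached scores.
def _repo_demo():
    info = get_repo_info("example-user", "example-repo")
    if info is not None:
        print(info["recommendations"])
        for c in info["contributors"]:
            print("{0}: {1}".format(c["username"], c["count"]))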
def _google_geocode(location):
    # Check for quota limits.
    pipe = get_pipeline()
    usage_key = format_key("google_usage_limit")
    usage = pipe.get(usage_key).execute()[0]
    if usage is not None:
        logging.warn("Skipping Google geocode request due to usage limits.")
        return None

    # Submit the request.
    params = dict(
        address=location,
        sensor="false",
        key=flask.current_app.config["GOOGLE_KEY"],
    )
    r = requests.get(goapi_url, params=params)
    if r.status_code != requests.codes.ok:
        logging.error(r.content)
        return None
    data = r.json()

    # Try not to go over the usage limits.
    status = data.get("status", None)
    if status == "OVER_QUERY_LIMIT":
        pipe.set(usage_key, 1).expire(usage_key, 60 * 60)
        pipe.execute()
        return None

    # Parse the results.
    results = data.get("results", [])
    if not len(results):
        return None

    # Find the coordinates.
    loc = results[0].get("geometry", {}).get("location", None)
    return loc
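# A usage sketch: the usage-limit key above acts as a one-hour circuit
# breaker, so after Google answers OVER_QUERY_LIMIT every call returns
# None until the key expires. The Geocoding API reports coordinates as a
# dict with "lat" and "lng" keys.
def _geocode_demo():
    loc = _google_geocode("Manhattan, NY")  # example location string
    if loc is not None:
        print("{0:.4f}, {1:.4f}".format(loc["lat"], loc["lng"]))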
def get_comparison(user1, user2):
    # Normalize the usernames.
    user1, user2 = user1.lower(), user2.lower()

    # Grab the stats from the database.
    pipe = get_pipeline()
    pipe.zscore(format_key("user"), user1)
    pipe.zscore(format_key("user"), user2)
    pipe.zrevrange(format_key("user:{0}:event".format(user1)), 0, -1,
                   withscores=True)
    pipe.zrevrange(format_key("user:{0}:event".format(user2)), 0, -1,
                   withscores=True)
    pipe.zrevrange(format_key("user:{0}:lang".format(user1)), 0, -1,
                   withscores=True)
    pipe.zrevrange(format_key("user:{0}:lang".format(user2)), 0, -1,
                   withscores=True)
    pipe.hgetall(format_key("user:{0}:day".format(user1)))
    pipe.hgetall(format_key("user:{0}:day".format(user2)))
    raw = pipe.execute()

    # Get the total number of events.
    total1 = float(raw[0]) if raw[0] is not None else 0
    total2 = float(raw[1]) if raw[1] is not None else 0
    if not total1:
        return "is more active on GitHub"
    elif not total2:
        return "is less active on GitHub"

    # Load the event types from disk.
    with flask.current_app.open_resource("event_types.json") as f:
        evttypes = json.load(f)

    # Compare the fractional event types.
    evts1 = dict(raw[2])
    evts2 = dict(raw[3])
    diffs = []
    for e, desc in evttypes.iteritems():
        if e in evts1 and e in evts2:
            d = float(evts2[e]) / total2 / float(evts1[e]) * total1
            if d != 1:
                more = "more" if d > 1 else "less"
                if d > 1:
                    d = 1.0 / d
                diffs.append((desc.format(more=more, user=user2), d * d))

    # Compare language usage.
    langs1 = dict(raw[4])
    langs2 = dict(raw[5])
    for l in set(langs1.keys()) | set(langs2.keys()):
        n = float(langs1.get(l, 0)) / total1
        d = float(langs2.get(l, 0)) / total2
        if n != d and d > 0:
            if n > 0:
                d = d / n
            else:
                d = 1.0 / d
            more = "more" if d > 1 else "less"
            desc = "is {{more}} of a {0} aficionado".format(l)
            if d > 1:
                d = 1.0 / d
            diffs.append((desc.format(more=more), d * d))

    # Number of languages. Cast to float so that integer division doesn't
    # floor the weight to zero.
    nl1, nl2 = len(raw[4]), len(raw[5])
    if nl1 and nl2:
        desc = "speaks {more} languages"
        if nl1 > nl2:
            diffs.append((desc.format(more="fewer"),
                          float(nl2 * nl2) / (nl1 * nl1)))
        else:
            diffs.append((desc.format(user=user2, more="more"),
                          float(nl1 * nl1) / (nl2 * nl2)))

    # Compare the average weekly schedules.
    week1 = map(lambda v: int(v[1]), raw[6].iteritems())
    week2 = map(lambda v: int(v[1]), raw[7].iteritems())
    mu1, mu2 = sum(week1) / 7.0, sum(week2) / 7.0
    var1 = np.sqrt(sum(map(lambda v: (v - mu1) ** 2, week1)) / 7.0) / mu1
    var2 = np.sqrt(sum(map(lambda v: (v - mu2) ** 2, week2)) / 7.0) / mu2
    # Parenthesized: "or" binds looser than "and", so without the
    # parentheses the var1 != var2 check was skipped whenever var1 != 0.
    if (var1 or var2) and var1 != var2:
        if var1 > var2:
            diffs.append(("has a more consistent weekly schedule",
                          var2 / var1))
        else:
            diffs.append(("has a less consistent weekly schedule",
                          var1 / var2))

    # Compute the relative probabilities of the comparisons and normalize.
    ps = map(lambda v: v[1], diffs)
    norm = sum(ps)

    # Choose a random description weighted by the probabilities.
    return np.random.choice([d[0] for d in diffs], p=[p / norm for p in ps])
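# A worked sketch of the weighting above: each description carries the
# squared small-over-large ratio of the two users' rates, so the weight
# approaches one as the users get closer, and the draw favors the subtlest
# (non-tied) differences over the starkest ones. With made-up weights:
def _choice_demo():
    diffs = [("is more of a Python aficionado", 0.04),
             ("has a more consistent weekly schedule", 0.64)]
    ps = [w for _, w in diffs]
    norm = sum(ps)
    # The schedule comparison wins about 94% of the draws here.
    return np.random.choice([d for d, _ in diffs],
                            p=[p / norm for p in ps])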
def get_usage_stats(username):
    user = username.lower()
    pipe = get_pipeline()

    # Get the total number of events performed by this user.
    pipe.zscore(format_key("user"), user)

    # The timezone estimate.
    pipe.get(format_key("user:{0}:tz".format(user)))

    # Get the top <= 5 most common events.
    pipe.zrevrangebyscore(format_key("user:{0}:event".format(user)),
                          "+inf", 0, 0, 5, withscores=True)

    # The average daily and weekly schedules.
    pipe.hgetall(format_key("user:{0}:hour".format(user)))
    pipe.hgetall(format_key("user:{0}:day".format(user)))

    # The language stats.
    pipe.zrevrange(format_key("user:{0}:lang".format(user)), 0, -1,
                   withscores=True)

    # Parse the results.
    results = pipe.execute()
    total_events = int(results[0]) if results[0] is not None else 0
    if not total_events:
        return None
    timezone = results[1]
    offset = int(timezone) + 8 if timezone is not None else 0
    event_counts = results[2]
    daily_histogram = make_histogram(results[3].items(), 24, offset)
    weekly_histogram = make_histogram(results[4].items(), 7)
    languages = results[5]

    # Parse the languages into a nicer form and get the quantiles.
    [(pipe.zcount(format_key("lang:{0}:user".format(l)), 100, "+inf"),
      pipe.zrevrank(format_key("lang:{0}:user".format(l)), user))
     for l, c in languages]
    quants = pipe.execute()
    languages = [{"language": l,
                  "quantile": (min([100, int(100 * float(pos) / tot) + 1])
                               if tot > 0 and pos is not None else 100),
                  "count": int(c)}
                 for (l, c), tot, pos in zip(languages, quants[::2],
                                             quants[1::2])]

    # Generate the schedule stats for each of the specific event types.
    [(pipe.hgetall(format_key("user:{0}:event:{1}:day".format(user, e))),
      pipe.hgetall(format_key("user:{0}:event:{1}:hour".format(user, e))))
     for e, c in event_counts]
    results = pipe.execute()
    events = [{"type": e[0],
               "total": int(e[1]),
               "week": map(int, make_histogram(w.items(), 7)),
               "day": map(int, make_histogram(d.items(), 24, offset))}
              for e, w, d in zip(event_counts, results[::2], results[1::2])]

    return {
        "total": total_events,
        "events": events,
        "day": map(int, daily_histogram),
        "week": map(int, weekly_histogram),
        "languages": languages,
    }
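# make_histogram() isn't shown in this listing; this reconstruction is an
# assumption inferred from the call sites above: it takes (bucket, count)
# pairs from a redis hash, a histogram size, and an optional rotation
# offset used for the timezone shift.
def _make_histogram_sketch(data, size, offset=0):
    result = [0] * size
    for k, v in data:
        # Hash fields and values come back from redis as strings.
        result[(int(k) + offset) % size] = int(v)
    return result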
def get_user_info(username):
    # Normalize the username.
    user = username.lower()

    # Get the cached information.
    pipe = get_pipeline()
    pipe.get(format_key("user:{0}:name".format(user)))
    pipe.get(format_key("user:{0}:etag".format(user)))
    pipe.get(format_key("user:{0}:gravatar".format(user)))
    pipe.get(format_key("user:{0}:tz".format(user)))
    pipe.exists(format_key("user:{0}:optout".format(user)))
    name, etag, gravatar, timezone, optout = pipe.execute()
    if optout:
        return None, True
    if name is not None:
        name = name.decode("utf-8")

    # Only hit the GitHub API if the request didn't come from a robot.
    if not _is_robot():
        # Work out the authentication parameters.
        auth = {}
        client_id = flask.current_app.config.get("GITHUB_ID", None)
        client_secret = flask.current_app.config.get("GITHUB_SECRET", None)
        if client_id is not None and client_secret is not None:
            auth["client_id"] = client_id
            auth["client_secret"] = client_secret

        # Perform a conditional fetch against the GitHub API.
        headers = {}
        if etag is not None:
            headers = {"If-None-Match": etag}

        r = requests.get(ghapi_url.format(username=username), params=auth,
                         headers=headers)

        # A 304 means the cached values are still valid; only parse and
        # re-cache on a fresh 200.
        if r.status_code == requests.codes.ok:
            data = r.json()
            name = data.get("name") or data.get("login") or username
            etag = r.headers["ETag"]
            gravatar = data.get("gravatar_id", "none")
            location = data.get("location", None)
            if location is not None:
                tz = estimate_timezone(location)
                if tz is not None:
                    timezone = tz

            # Update the cache.
            _redis_execute(pipe, "set", "user:{0}:name".format(user), name)
            _redis_execute(pipe, "set", "user:{0}:etag".format(user), etag)
            _redis_execute(pipe, "set", "user:{0}:gravatar".format(user),
                           gravatar)
            if timezone is not None:
                _redis_execute(pipe, "set", "user:{0}:tz".format(user),
                               timezone)
            pipe.execute()

    return {
        "username": username,
        "name": name if name is not None else username,
        "gravatar": gravatar if gravatar is not None else "none",
        "timezone": int(timezone) if timezone is not None else None,
    }, False
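# A usage sketch: the cached ETag makes repeat lookups cheap, since GitHub
# answers 304 Not Modified and the cached name/gravatar/timezone are
# returned unchanged. The second return value flags opted-out users.
def _user_demo():
    info, optout = get_user_info("example-user")
    if optout or info is None:
        return None
    return "{0} (tz: {1})".format(info["name"], info["timezone"])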