示例#1
0
def rebuild_index():
    """
    Rebuild the K-nearest neighbors index based on 50000 of the most active
    users (ignoring the top 500 most active).

    """
    pipe = get_pipeline()

    # zrevrange bounds are inclusive, so 500..50499 yields exactly the
    # 50000 users ranked just below the top 500.  (The previous upper
    # bound of 50500 fetched 50001 users.)
    usernames = pipe.zrevrange(format_key("user"), 500, 50499).execute()[0]

    # Queue every user's data requests on a single pipeline.  Each call
    # to get_vector pushes 8 redis commands, hence the stride of 8 when
    # slicing the flat result list below.
    for user in usernames:
        get_vector(user, pipe=pipe)

    results = pipe.execute()
    points = np.zeros([len(usernames), nvector])
    for i in range(len(usernames)):
        points[i, :] = parse_vector(results[8 * i:8 * (i + 1)])

    # Build the FLANN index over the behavior vectors.
    flann = pyflann.FLANN()
    flann.build_index(points)

    # Save the index to a temporary file first.
    fn1 = _h5_filename(index_filename)
    tmp1 = fn1 + ".tmp"
    flann.save_index(tmp1)

    # Save the index coordinates the same way.
    fn2 = _h5_filename(points_filename)
    tmp2 = fn2 + ".tmp"
    with h5py.File(tmp2, "w") as f:
        f["points"] = points
        f["names"] = usernames

    # Atomically move the index files into place so concurrent readers
    # never observe a half-written index.
    shutil.move(tmp1, fn1)
    shutil.move(tmp2, fn2)
示例#2
0
def set_expire():
    """
    Attach a TTL to every key in the database: opt-out flags are left
    untouched, short-lived cache keys get ``TEMP_TTL``, and everything
    else gets ``DEFAULT_TTL``.

    """
    pipe = get_pipeline()

    # Get the list of all keys.
    keys = pipe.keys().execute()[0]
    n = float(len(keys))
    print("Found {0:.0f} keys".format(n))

    # Loop over the keys and deal with each one.
    for i, key in enumerate(keys):
        # Skip the opt-out keys: they must never expire.
        if key.endswith(":optout"):
            continue

        # Deal with temporary (cache) keys.  str.endswith accepts a
        # tuple of suffixes, so no any(imap(...)) round-trip is needed.
        if key.endswith((":name", ":etag", ":gravatar", ":tz")):
            pipe.expire(key, TEMP_TTL)
            continue

        # Everything else should get the default TTL.
        pipe.expire(key, DEFAULT_TTL)

        # Execute the updates in batches of 5000 to bound the size of
        # the pipeline buffer.
        if (i + 1) % 5000 == 0:
            print("Finished {0} keys [{1:.2f} %]".format(
                i + 1, (i + 1) / n * 100))
            pipe.execute()

    # Flush whatever commands remain queued after the last full batch.
    pipe.execute()
示例#3
0
def set_expire():
    """
    Attach a TTL to every key in the database: opt-out flags are left
    untouched, short-lived cache keys get ``TEMP_TTL``, and everything
    else gets ``DEFAULT_TTL``.

    """
    pipe = get_pipeline()

    # Get the list of all keys.
    keys = pipe.keys().execute()[0]
    n = float(len(keys))
    print("Found {0:.0f} keys".format(n))

    # Loop over the keys and deal with each one.
    for i, key in enumerate(keys):
        # Skip the opt-out keys: they must never expire.
        if key.endswith(":optout"):
            continue

        # Deal with temporary (cache) keys.  str.endswith accepts a
        # tuple of suffixes, so no any(imap(...)) round-trip is needed.
        if key.endswith((":name", ":etag", ":gravatar", ":tz")):
            pipe.expire(key, TEMP_TTL)
            continue

        # Everything else should get the default TTL.
        pipe.expire(key, DEFAULT_TTL)

        # Execute the updates in batches of 5000 to bound the size of
        # the pipeline buffer.
        if (i + 1) % 5000 == 0:
            print("Finished {0} keys [{1:.2f} %]".format(
                i + 1, (i + 1) / n * 100))
            pipe.execute()

    # Flush whatever commands remain queued after the last full batch.
    pipe.execute()
示例#4
0
def get_vector(user, pipe=None):
    """
    Given a username, fetch all of the data needed to build a behavior vector
    from the database.

    :param user: The GitHub username.
    :param pipe: (optional) if provided, simply add the requests to the
                 existing redis pipeline and don't execute the request.

    """
    owns_pipe = pipe is None
    if owns_pipe:
        pipe = get_pipeline()

    user = user.lower()

    # Overall activity score and the schedule/event breakdowns.
    pipe.zscore(format_key("user"), user)
    pipe.hgetall(format_key("user:{0}:day".format(user)))
    pipe.zrevrange(format_key("user:{0}:event".format(user)), 0, -1,
                   withscores=True)

    # Cardinalities of the per-user sorted sets, queued in a fixed order
    # (downstream parsing relies on the positions of these results).
    for fmt in ("user:{0}:contribution", "user:{0}:connection",
                "user:{0}:repo", "user:{0}:lang"):
        pipe.zcard(format_key(fmt.format(user)))

    # The full language histogram with scores.
    pipe.zrevrange(format_key("user:{0}:lang".format(user)), 0, -1,
                   withscores=True)

    # Only run the pipeline if this call created it.
    if owns_pipe:
        return pipe.execute()
示例#5
0
def del_connections():
    """
    Delete every ``social:connection:*`` key from the database.

    """
    pipe = get_pipeline()

    # Get the list of all matching keys.
    keys = pipe.keys(format_key("social:connection:*")).execute()[0]
    n = float(len(keys))
    print("Found {0:.0f} keys".format(n))

    # Queue one delete per key (the previous enumerate index was unused).
    for key in keys:
        pipe.delete(key)

    pipe.execute()
示例#6
0
def del_connections():
    """
    Delete every ``social:connection:*`` key from the database.

    """
    pipe = get_pipeline()

    # Get the list of all matching keys.
    keys = pipe.keys(format_key("social:connection:*")).execute()[0]
    n = float(len(keys))
    print("Found {0:.0f} keys".format(n))

    # Queue one delete per key (the previous enumerate index was unused).
    for key in keys:
        pipe.delete(key)

    pipe.execute()
示例#7
0
def get_repo_info(username, reponame, maxusers=5, max_recommend=5):
    """
    Fetch the contributor list and similar-repository recommendations
    for one repository.

    :param username: The repository owner's GitHub username.
    :param reponame: The repository name.
    :param maxusers: (optional) maximum number of contributors returned.
    :param max_recommend: (optional) maximum number of recommendations.

    Returns ``None`` for robots, for opted-out owners, and for unknown
    repositories; otherwise a dict with the repository name, the
    recommendation list, and the contributor details.

    """
    if _is_robot():
        return None

    # Normalize the repository name.
    repo = "{0}/{1}".format(username, reponame)
    rkey = format_key("social:repo:{0}".format(repo))
    recommend_key = format_key("social:recommend:{0}".format(repo))

    # Get the list of users.
    pipe = get_pipeline()
    pipe.exists(format_key("user:{0}:optout".format(username.lower())))
    pipe.exists(rkey)
    pipe.exists(recommend_key)
    pipe.zrevrange(recommend_key, 0, max_recommend - 1)
    pipe.zrevrange(rkey, 0, maxusers - 1, withscores=True)
    optout, repo_known, recs_cached, recommendations, users = pipe.execute()
    if optout or not repo_known:
        return None

    if not recs_cached:
        # Compute the repository similarities.  Plain loops instead of
        # side-effect list comprehensions: no throwaway lists are built.
        for u, count in users:
            pipe.zrevrange(format_key("social:user:{0}".format(u)), 0, -1)
        repos = pipe.execute()
        for user_repos in repos:
            for r in user_repos:
                if r != repo:
                    pipe.zincrby(recommend_key, r, 1)
        # Cache the recommendation set for 48 hours.
        pipe.expire(recommend_key, 172800)
        pipe.zrevrange(recommend_key, 0, max_recommend - 1)
        recommendations = pipe.execute()[-1]

    # Get the names of the contributors with more than one event.
    users = [(u, c) for u, c in users if int(c) > 1]
    for u, count in users:
        pipe.get(format_key("user:{0}:name".format(u)))
    names = pipe.execute()

    return {
        "repository": repo,
        "recommendations": recommendations,
        "contributors": [{"username": u, "name": n.decode("utf-8")
                          if n is not None else u,
                          "count": int(count)}
                         for (u, count), n in zip(users, names)]
    }
示例#8
0
def _google_geocode(location):
    """
    Geocode a free-form location string using the Google geocoding API.

    :param location: The location string to geocode.

    Returns the ``location`` object from the first result (presumably
    lat/lng per the API — confirm against ``goapi_url``) or ``None`` on
    any failure or while the usage-limit backoff key is set.

    """
    # Check for quota limits: a sentinel key set below blocks requests.
    pipe = get_pipeline()
    usage_key = format_key("google_usage_limit")
    usage = pipe.get(usage_key).execute()[0]
    if usage is not None:
        # logging.warn is a deprecated alias for logging.warning.
        logging.warning("Skipping Google geocode request for usage limits.")
        return None

    # Submit the request.
    params = dict(
        address=location,
        sensor="false",
        key=flask.current_app.config["GOOGLE_KEY"],
    )
    r = requests.get(goapi_url, params=params)
    if r.status_code != requests.codes.ok:
        logging.error(r.content)
        return None

    data = r.json()

    # Try not to go over usage limits: if the API reports the quota is
    # exhausted, set a one-hour backoff sentinel and bail out.
    status = data.get("status", None)
    if status == "OVER_QUERY_LIMIT":
        pipe.set(usage_key, 1).expire(usage_key, 60 * 60)
        pipe.execute()
        return None

    # Parse the results.
    results = data.get("results", [])
    if not results:
        return None

    # Find the coordinates of the first (best) match.
    loc = results[0].get("geometry", {}).get("location", None)
    return loc
示例#9
0
def get_comparison(user1, user2):
    """
    Build a random comparison phrase describing how ``user2`` differs
    from ``user1``, weighted toward the axes where they differ most.

    :param user1: The first GitHub username.
    :param user2: The second GitHub username.

    Returns a phrase string (e.g. "is more active on GitHub").

    """
    # Normalize the usernames.
    user1, user2 = user1.lower(), user2.lower()

    # Grab the stats from the database.
    pipe = get_pipeline()
    pipe.zscore(format_key("user"), user1)
    pipe.zscore(format_key("user"), user2)
    pipe.zrevrange(format_key("user:{0}:event".format(user1)), 0, -1,
                   withscores=True)
    pipe.zrevrange(format_key("user:{0}:event".format(user2)), 0, -1,
                   withscores=True)
    pipe.zrevrange(format_key("user:{0}:lang".format(user1)), 0, -1,
                   withscores=True)
    pipe.zrevrange(format_key("user:{0}:lang".format(user2)), 0, -1,
                   withscores=True)
    pipe.hgetall(format_key("user:{0}:day".format(user1)))
    pipe.hgetall(format_key("user:{0}:day".format(user2)))
    raw = pipe.execute()

    # Get the total number of events.  A user missing from the sorted
    # set short-circuits to a generic activity comparison.
    total1 = float(raw[0]) if raw[0] is not None else 0
    total2 = float(raw[1]) if raw[1] is not None else 0
    if not total1:
        return "is more active on GitHub"
    elif not total2:
        return "is less active on GitHub"

    # Load the event types from disk.
    with flask.current_app.open_resource("event_types.json") as f:
        evttypes = json.load(f)

    # Compare the fractional event types.  Each diff is weighted by the
    # squared ratio (flipped below 1 so that 0 < weight <= 1).
    evts1 = dict(raw[2])
    evts2 = dict(raw[3])
    diffs = []
    for e, desc in evttypes.iteritems():
        if e in evts1 and e in evts2:
            d = float(evts2[e]) / total2 / float(evts1[e]) * total1
            if d != 1:
                more = "more" if d > 1 else "less"
                if d > 1:
                    d = 1.0 / d
                diffs.append((desc.format(more=more, user=user2), d * d))

    # Compare language usage.
    langs1 = dict(raw[4])
    langs2 = dict(raw[5])
    for l in set(langs1.keys()) | set(langs2.keys()):
        n = float(langs1.get(l, 0)) / total1
        d = float(langs2.get(l, 0)) / total2
        if n != d and d > 0:
            if n > 0:
                d = d / n
            else:
                d = 1.0 / d
            more = "more" if d > 1 else "less"
            desc = "is {{more}} of a {0} aficionado".format(l)
            if d > 1:
                d = 1.0 / d
            diffs.append((desc.format(more=more), d * d))

    # Number of languages.
    nl1, nl2 = len(raw[4]), len(raw[5])
    if nl1 and nl2:
        desc = "speaks {more} languages"
        if nl1 > nl2:
            diffs.append((desc.format(more="fewer"),
                          nl2 * nl2 / nl1 / nl1))
        else:
            diffs.append((desc.format(user=user2, more="more"),
                          nl1 * nl1 / nl2 / nl2))

    # Compare the average weekly schedules using the coefficient of
    # variation of the per-day event counts.
    # NOTE(review): the divisions by mu1/mu2 assume a user with any
    # events has a nonzero weekly mean — confirm raw[6]/raw[7] can never
    # be all-zero when total1/total2 are nonzero.
    week1 = map(lambda v: int(v[1]), raw[6].iteritems())
    week2 = map(lambda v: int(v[1]), raw[7].iteritems())
    mu1, mu2 = sum(week1) / 7.0, sum(week2) / 7.0
    var1 = np.sqrt(sum(map(lambda v: (v - mu1) ** 2, week1)) / 7.0) / mu1
    var2 = np.sqrt(sum(map(lambda v: (v - mu2) ** 2, week2)) / 7.0) / mu2
    # BUGFIX: the original `var1 or var2 and var1 != var2` parsed as
    # `var1 or (var2 and var1 != var2)`, so any nonzero var1 entered the
    # branch even when the two variabilities were equal.
    if (var1 or var2) and var1 != var2:
        if var1 > var2:
            diffs.append(("has a more consistent weekly schedule", var2/var1))
        else:
            diffs.append(("has a less consistent weekly schedule", var1/var2))

    # Compute the relative probabilities of the comparisons and normalize.
    ps = map(lambda v: v[1], diffs)
    norm = sum(ps)

    # Choose a random description weighted by the probabilities.
    return np.random.choice([d[0] for d in diffs], p=[p / norm for p in ps])
示例#10
0
def get_usage_stats(username):
    """
    Assemble the usage statistics for one user: totals, schedules,
    top event types, and language quantiles.

    :param username: The GitHub username.

    Returns ``None`` for users with no recorded events, otherwise a dict
    with ``total``, ``events``, ``day``, ``week``, and ``languages``.

    """
    user = username.lower()
    pipe = get_pipeline()

    # Get the total number of events performed by this user.
    pipe.zscore(format_key("user"), user)

    # The timezone estimate.
    pipe.get(format_key("user:{0}:tz".format(user)))

    # Get the top <= 5 most common events.
    pipe.zrevrangebyscore(format_key("user:{0}:event".format(user)),
                          "+inf", 0, 0, 5, withscores=True)

    # The average daily and weekly schedules.
    pipe.hgetall(format_key("user:{0}:hour".format(user)))
    pipe.hgetall(format_key("user:{0}:day".format(user)))

    # The language stats.
    pipe.zrevrange(format_key("user:{0}:lang".format(user)), 0, -1,
                   withscores=True)

    # Parse the results positionally (same order as the calls above).
    results = pipe.execute()
    total_events = int(results[0]) if results[0] is not None else 0
    if not total_events:
        return None
    timezone = results[1]
    # NOTE(review): the +8 offset presumably converts the stored tz into
    # the histogram's reference frame — confirm against make_histogram.
    offset = int(timezone) + 8 if timezone is not None else 0
    event_counts = results[2]
    daily_histogram = make_histogram(results[3].items(), 24, offset)
    weekly_histogram = make_histogram(results[4].items(), 7)
    languages = results[5]

    # Queue two requests per language (count of active users, and this
    # user's rank) so quants[::2]/quants[1::2] split them back apart.
    for l, c in languages:
        pipe.zcount(format_key("lang:{0}:user".format(l)), 100, "+inf")
        pipe.zrevrank(format_key("lang:{0}:user".format(l)), user)
    quants = pipe.execute()
    languages = [{"language": l,
                  "quantile": (min([100, int(100 * float(pos) / tot) + 1])
                               if tot > 0 and pos is not None
                               else 100),
                  "count": int(c)}
                 for (l, c), tot, pos in zip(languages, quants[::2],
                                             quants[1::2])]

    # Queue two requests per event type (day and hour histograms); the
    # results interleave in the same way as the language quantiles above.
    for e, c in event_counts:
        pipe.hgetall(format_key("user:{0}:event:{1}:day".format(user, e)))
        pipe.hgetall(format_key("user:{0}:event:{1}:hour".format(user, e)))
    results = pipe.execute()
    events = [{"type": e[0],
               "total": int(e[1]),
               "week": map(int, make_histogram(w.items(), 7)),
               "day": map(int, make_histogram(d.items(), 24, offset))}
              for e, w, d in zip(event_counts, results[::2], results[1::2])]

    return {
        "total": total_events,
        "events": events,
        "day": map(int, daily_histogram),
        "week": map(int, weekly_histogram),
        "languages": languages,
    }
示例#11
0
def get_user_info(username):
    """
    Fetch (and refresh) the cached profile information for one user.

    :param username: The GitHub username.

    Returns a ``(info_dict, optout)`` tuple; ``info_dict`` is ``None``
    when the user has opted out.

    """
    # Normalize the username.
    user = username.lower()

    # Get the cached information.
    pipe = get_pipeline()
    pipe.get(format_key("user:{0}:name".format(user)))
    pipe.get(format_key("user:{0}:etag".format(user)))
    pipe.get(format_key("user:{0}:gravatar".format(user)))
    pipe.get(format_key("user:{0}:tz".format(user)))
    pipe.exists(format_key("user:{0}:optout".format(user)))
    name, etag, gravatar, timezone, optout = pipe.execute()
    if optout:
        return None, True

    if name is not None:
        name = name.decode("utf-8")

    # Skip the GitHub API round-trip for robots; they get whatever is
    # already cached.
    if not _is_robot():
        # Work out the authentication headers.
        auth = {}
        client_id = flask.current_app.config.get("GITHUB_ID", None)
        client_secret = flask.current_app.config.get("GITHUB_SECRET", None)
        if client_id is not None and client_secret is not None:
            auth["client_id"] = client_id
            auth["client_secret"] = client_secret

        # Perform a conditional fetch on the database: a matching ETag
        # yields a cheap 304 and leaves the cache untouched.
        headers = {}
        if etag is not None:
            headers = {"If-None-Match": etag}

        r = requests.get(ghapi_url.format(username=username), params=auth,
                         headers=headers)
        # A 200 response can never be a 304, so the old
        # `code != 304 and code == requests.codes.ok` test was redundant.
        if r.status_code == requests.codes.ok:
            data = r.json()
            name = data.get("name") or data.get("login") or username
            etag = r.headers["ETag"]
            gravatar = data.get("gravatar_id", "none")
            location = data.get("location", None)
            if location is not None:
                tz = estimate_timezone(location)
                if tz is not None:
                    timezone = tz

            # Update the cache.
            _redis_execute(pipe, "set", "user:{0}:name".format(user), name)
            _redis_execute(pipe, "set", "user:{0}:etag".format(user), etag)
            _redis_execute(pipe, "set", "user:{0}:gravatar".format(user),
                           gravatar)
            if timezone is not None:
                _redis_execute(pipe, "set", "user:{0}:tz".format(user),
                               timezone)
            pipe.execute()

    return {
        "username": username,
        "name": name if name is not None else username,
        "gravatar": gravatar if gravatar is not None else "none",
        "timezone": int(timezone) if timezone is not None else None,
    }, False