Пример #1
0
def demographics(con):
    """Plot the number of trips per birth year, one scatter plot per gender.

    Counts trips grouped by (gender, birth_year), ignoring rows where either
    column is NULL, and writes one scatter plot with a logarithmic y axis per
    gender to ``gender.<gender>.pdf`` in the current working directory.

    Args:
        con: Open DB-API connection with a ``trips`` table.  Rows are read by
            column name, so the connection presumably uses a name-indexable
            row factory such as ``sqlite3.Row`` -- TODO confirm with caller.
    """
    data = {}

    cur = con.cursor()
    try:
        cur.execute("SELECT gender, birth_year, COUNT(trip_id) trip_count "
                    "FROM trips WHERE gender NOT NULL AND birth_year NOT NULL "
                    "GROUP BY gender, birth_year ORDER BY trip_count DESC")
        # Fetch in batches so the whole result set is never held at once.
        while True:
            rows = cur.fetchmany(1000)
            if not rows:
                break
            for row in rows:
                put_val(data, row["gender"], row["birth_year"], row["trip_count"])
    finally:
        # Release the cursor even if the query or a fetch fails.
        cur.close()

    # NOTE(review): traverse() is assumed to yield, per gender, a dict holding
    # the "gender" key plus the birth_year -> trip_count mapping -- confirm
    # against its definition.  The loop variable is deliberately not called
    # `data` so it does not shadow the structure being traversed.
    for kvs, by_year in traverse(data, "gender"):
        gender = kvs["gender"]
        xs = sorted(by_year.keys())
        ys = [by_year[x] for x in xs]
        total = sum(ys)

        fig = plt.figure()
        try:
            plt.scatter(xs, ys)
            plt.yscale("log")
            plt.xlabel("birth year")
            plt.ylabel("number of rides")
            plt.grid(axis="y")
            plt.title("%s n=%d" % (gender, total))
            plt.savefig("gender.%s.pdf" % gender, bbox_inches="tight")
        finally:
            # pyplot keeps every figure alive until it is closed explicitly;
            # close it here so figures do not pile up across genders.
            plt.close(fig)
Пример #2
0
def demographics(con):
    """Plot the number of trips per birth year, one scatter plot per gender.

    Counts trips grouped by (gender, birth_year), ignoring rows where either
    column is NULL, and writes one scatter plot with a logarithmic y axis per
    gender to ``gender.<gender>.pdf`` in the current working directory.

    Args:
        con: Open DB-API connection with a ``trips`` table.  Rows are read by
            column name, so the connection presumably uses a name-indexable
            row factory such as ``sqlite3.Row`` -- TODO confirm with caller.
    """
    data = {}

    cur = con.cursor()
    try:
        cur.execute("SELECT gender, birth_year, COUNT(trip_id) trip_count "
            "FROM trips WHERE gender NOT NULL AND birth_year NOT NULL "
            "GROUP BY gender, birth_year ORDER BY trip_count DESC")
        # Fetch in batches so the whole result set is never held at once.
        while True:
            rows = cur.fetchmany(1000)
            if not rows:
                break
            for row in rows:
                put_val(data, row["gender"], row["birth_year"], row["trip_count"])
    finally:
        # Release the cursor even if the query or a fetch fails.
        cur.close()

    # NOTE(review): traverse() is assumed to yield, per gender, a dict holding
    # the "gender" key plus the birth_year -> trip_count mapping -- confirm
    # against its definition.  The loop variable is deliberately not called
    # `data` so it does not shadow the structure being traversed.
    for kvs, by_year in traverse(data, "gender"):
        gender = kvs["gender"]
        xs = sorted(by_year.keys())
        ys = [by_year[x] for x in xs]
        total = sum(ys)

        fig = plt.figure()
        try:
            plt.scatter(xs, ys)
            plt.yscale("log")
            plt.xlabel("birth year")
            plt.ylabel("number of rides")
            plt.grid(axis="y")
            plt.title("%s n=%d" % (gender, total))
            plt.savefig("gender.%s.pdf" % gender, bbox_inches="tight")
        finally:
            # pyplot keeps every figure alive until it is closed explicitly;
            # close it here so figures do not pile up across genders.
            plt.close(fig)
Пример #3
0
def symmetry(con, threshold=75):
    """Compare trip statistics between the two directions of each route.

    A route is an ordered (from_station_name, to_station_name) pair with
    different endpoints.  Only routes where both directions have at least
    ``threshold`` trips are kept.  For each kept route the trip count and the
    mean, median and standard deviation of ``trip_duration`` are collected.
    For every statistic one scatter plot is written to
    ``symmetry_<stat>.pdf``, with one point per unordered station pair:
    x is the smaller and y the larger of the two directional values.

    Args:
        con: Open DB-API connection with a ``trips`` table.  Rows are read by
            column name and the query uses ``?`` placeholders, so this
            presumably is a sqlite3 connection with a name-indexable row
            factory -- TODO confirm with caller.
        threshold: Minimum number of trips (inclusive) required in each
            direction for a route to be analysed.
    """
    params = ["num", "mean", "median", "std"]
    data = {}

    cur = con.cursor()
    try:
        cur.execute(
            "SELECT from_station_name, to_station_name, count(trip_id) num "
            "FROM trips WHERE from_station_name != to_station_name "
            "GROUP BY from_station_name, to_station_name HAVING num >= ?",
            (threshold, ))
        while True:
            rows = cur.fetchmany(1000)
            if not rows:
                break
            for row in rows:
                put_val(data,
                        (row["from_station_name"], row["to_station_name"]),
                        "num", row["num"])

        print("have %d routes with at least %d trips" % (len(data), threshold))

        # Drop routes where either direction has fewer trips than the
        # threshold.  The comparison is strict so that a route with exactly
        # `threshold` trips is kept, matching the inclusive `HAVING num >= ?`
        # filter and the "at least" wording of the messages.
        # Iterate over a snapshot of the keys because entries are deleted.
        # NOTE(review): get(..., leaf=0) is assumed to return 0 when the
        # entry is missing -- confirm against its definition.
        for pair in list(data):
            rev_pair = tuple(reversed(pair))
            if (get(data, pair, "num", leaf=0) < threshold
                    or rev_pair not in data
                    or get(data, rev_pair, "num", leaf=0) < threshold):
                del data[pair]

        print("have %d routes remaining after filtering for reverse trip meeting threshold" % len(data))

        # Collect per-route duration statistics, one query per direction.
        for i, pair in enumerate(data, 1):
            durs = []
            cur.execute(
                "SELECT trip_duration FROM trips WHERE from_station_name = ? "
                "AND to_station_name = ?", pair)
            while True:
                rows = cur.fetchmany(1000)
                if not rows:
                    break
                for row in rows:
                    durs.append(row["trip_duration"])

            durs_a = array(durs)
            put_val(data, pair, "mean", mean(durs_a))
            put_val(data, pair, "median", median(durs_a))
            put_val(data, pair, "std", std(durs_a))

            if i % 100 == 0:
                print("getting duration stats; %d routes remaining" % (len(data) - i))
    finally:
        # Release the cursor even if a query or a fetch fails.
        cur.close()

    plot_data = {}

    # Emit one point per unordered pair.  `done` records the directions
    # already handled; when the opposite direction comes up later, its
    # reverse is found in `done` and the pair is skipped.
    done = set()
    for pair in data:
        rev_pair = tuple(reversed(pair))
        if rev_pair in done:
            continue
        done.add(pair)

        for param in params:
            val1 = get(data, pair, param)
            val2 = get(data, rev_pair, param)

            put_list(plot_data, param, "x", min(val1, val2))
            put_list(plot_data, param, "y", max(val1, val2))

    for param in params:
        # With no surviving routes nothing was recorded for this statistic;
        # fall back to an empty plot instead of failing on a missing key.
        series = plot_data.get(param)
        xs = series["x"] if series is not None else []
        ys = series["y"] if series is not None else []

        fig = plt.figure()
        try:
            plt.scatter(xs, ys)
            plt.xlabel("min(%s)" % param)
            plt.ylabel("max(%s)" % param)
            plt.savefig("symmetry_%s.pdf" % param, bbox_inches="tight")
        finally:
            # pyplot keeps figures alive until closed explicitly.
            plt.close(fig)
Пример #4
0
def symmetry(con, threshold=75):
    """Compare trip statistics between the two directions of each route.

    A route is an ordered (from_station_name, to_station_name) pair with
    different endpoints.  Only routes where both directions have at least
    ``threshold`` trips are kept.  For each kept route the trip count and the
    mean, median and standard deviation of ``trip_duration`` are collected.
    For every statistic one scatter plot is written to
    ``symmetry_<stat>.pdf``, with one point per unordered station pair:
    x is the smaller and y the larger of the two directional values.

    Args:
        con: Open DB-API connection with a ``trips`` table.  Rows are read by
            column name and the query uses ``?`` placeholders, so this
            presumably is a sqlite3 connection with a name-indexable row
            factory -- TODO confirm with caller.
        threshold: Minimum number of trips (inclusive) required in each
            direction for a route to be analysed.
    """
    params = ["num", "mean", "median", "std"]
    data = {}

    cur = con.cursor()
    try:
        cur.execute("SELECT from_station_name, to_station_name, count(trip_id) num "
            "FROM trips WHERE from_station_name != to_station_name "
            "GROUP BY from_station_name, to_station_name HAVING num >= ?", (threshold,))
        while True:
            rows = cur.fetchmany(1000)
            if not rows:
                break
            for row in rows:
                put_val(data,
                        (row["from_station_name"], row["to_station_name"]),
                        "num", row["num"])

        print("have %d routes with at least %d trips" % (len(data), threshold))

        # Drop routes where either direction has fewer trips than the
        # threshold.  The comparison is strict so that a route with exactly
        # `threshold` trips is kept, matching the inclusive `HAVING num >= ?`
        # filter and the "at least" wording of the messages.
        # Iterate over a snapshot of the keys because entries are deleted.
        # NOTE(review): get(..., leaf=0) is assumed to return 0 when the
        # entry is missing -- confirm against its definition.
        for pair in list(data):
            rev_pair = tuple(reversed(pair))
            if (get(data, pair, "num", leaf=0) < threshold
                or rev_pair not in data
                or get(data, rev_pair, "num", leaf=0) < threshold):
                del data[pair]

        print("have %d routes remaining after filtering for reverse trip meeting threshold" % len(data))

        # Collect per-route duration statistics, one query per direction.
        for i, pair in enumerate(data, 1):
            durs = []
            cur.execute("SELECT trip_duration FROM trips WHERE from_station_name = ? "
                        "AND to_station_name = ?", pair)
            while True:
                rows = cur.fetchmany(1000)
                if not rows:
                    break
                for row in rows:
                    durs.append(row["trip_duration"])

            durs_a = array(durs)
            put_val(data, pair, "mean", mean(durs_a))
            put_val(data, pair, "median", median(durs_a))
            put_val(data, pair, "std", std(durs_a))

            if i % 100 == 0:
                print("getting duration stats; %d routes remaining" % (len(data) - i))
    finally:
        # Release the cursor even if a query or a fetch fails.
        cur.close()

    plot_data = {}

    # Emit one point per unordered pair.  `done` records the directions
    # already handled; when the opposite direction comes up later, its
    # reverse is found in `done` and the pair is skipped.
    done = set()
    for pair in data:
        rev_pair = tuple(reversed(pair))
        if rev_pair in done:
            continue
        done.add(pair)

        for param in params:
            val1 = get(data, pair, param)
            val2 = get(data, rev_pair, param)

            put_list(plot_data, param, "x", min(val1, val2))
            put_list(plot_data, param, "y", max(val1, val2))

    for param in params:
        # With no surviving routes nothing was recorded for this statistic;
        # fall back to an empty plot instead of failing on a missing key.
        series = plot_data.get(param)
        xs = series["x"] if series is not None else []
        ys = series["y"] if series is not None else []

        fig = plt.figure()
        try:
            plt.scatter(xs, ys)
            plt.xlabel("min(%s)" % param)
            plt.ylabel("max(%s)" % param)
            plt.savefig("symmetry_%s.pdf" % param, bbox_inches="tight")
        finally:
            # pyplot keeps figures alive until closed explicitly.
            plt.close(fig)