Example #1
0
def agglo_logit_calc(Xps1, Yps1, nonmusic_subreddits):
    """ Handles fitting and scoring of the agglomeration->logistic regression
        machine learning scheme.

        Arguments:
        Xps1 - sparse feature matrix (samples x subreddit features); densified here
        Yps1 - target labels, one per sample
        nonmusic_subreddits - feature names (unused here; kept for interface parity
                              with sibling routines)

        Returns the tuple:
        (n_lo, n_hi, logit_1, logit_20, n_groups_gen, agglo_1s, agglo_20s,
         params, logit_params)
        where logit_1 / logit_20 are the cross-validated mean accuracies (%) of
        plain logistic regression on all test samples / on test samples pruned
        at threshold 20 and rebalanced, and agglo_1s / agglo_20s hold the same
        two scores for each candidate number of agglomerated groups.
    """

    # Agglomeration and fancy indexing below want a dense array.
    Xps1 = Xps1.toarray()

    logit = LogisticRegression()
    (n_samples_1, _) = Xps1.shape

    # 4-fold shuffled CV with a fixed seed for reproducibility.
    n_folds = 4
    rand = 0
    kf = KFold(n_samples_1, n_folds=n_folds, shuffle=True, random_state=rand)

    # Accumulated (then averaged) percentage scores for the raw-feature baseline.
    logit_1 = 0.0
    logit_20 = 0.0

    # Sweep of candidate group counts for the agglomeration step.
    n_lo = 1
    n_hi = 155
    step = 1
    n_groups_gen = range(n_lo, n_hi + 1, step)

    # Fold-averaged scores, one slot per candidate group count.
    agglo_1s = [0.0 for _ in n_groups_gen]
    agglo_20s = [0.0 for _ in n_groups_gen]

    # Fitted coefficients per (group count, fold), and per fold for the baseline.
    params = np.empty([len(n_groups_gen), n_folds], dtype=object)
    logit_params = []

    for i_fold, (train, test) in enumerate(kf):
        # print() with a single argument behaves identically on Python 2 and 3;
        # the original `print i_fold` statement is a syntax error on Python 3.
        print(i_fold)

        # Baseline: logistic regression on the raw (un-agglomerated) features.
        logit.fit(Xps1[train], Yps1[train])
        logit_params.append(logit.coef_)

        logit_1 += 100.0 * logit.score(Xps1[test], Yps1[test])

        # Re-score on the denser test fans (threshold 20), class-rebalanced.
        (Xps20_test, Yps20_test) = prune_sparse_samples(Xps1[test], Yps1[test], threshold=20)
        (Xps20_test, Yps20_test) = balance_data(Xps20_test, Yps20_test)

        logit_20 += 100.0 * logit.score(Xps20_test, Yps20_test)

        for j, n_groups in enumerate(n_groups_gen):

            # Agglomerate features into n_groups groups, then refit on the
            # reduced representation.
            agglo = phi_agglomerate(N=n_groups).fit(Xps1[train], Yps1[train])
            Xagglo_train_1, _ = agglo.transform(Xps1[train])
            Xagglo_test_1, _ = agglo.transform(Xps1[test])
            Xagglo_test_20, _ = agglo.transform(Xps20_test)

            logit.fit(Xagglo_train_1, Yps1[train])

            # Single tuple index instead of chained [j][i_fold] indexing.
            params[j, i_fold] = logit.coef_

            # Divide by n_folds here so the lists accumulate fold averages.
            agglo_1s[j] += 100.0 * logit.score(Xagglo_test_1, Yps1[test]) / n_folds
            agglo_20s[j] += 100.0 * logit.score(Xagglo_test_20, Yps20_test) / n_folds

    logit_1 /= n_folds
    logit_20 /= n_folds

    return (n_lo, n_hi, logit_1, logit_20, n_groups_gen, agglo_1s, agglo_20s, params, logit_params)
Example #2
0
def graph_music_taste(Xps1, Yps1, nonmusic_subreddits, n_groups=20, node_cut=2000, edge_cut=0.15):
    """ Creates a graph of connected subreddits, colour-coded by which rank
        they come under.

        Keyword args:
        n_groups - # feature groups used by the agglomeration/colouring step
        node_cut - # fans who need to post in a subreddit for it to be included
                     in visualisation
        edge_cut - # weakest edge that will be included in visualisation

        Side effects: caches the constructed graph in pickles/agglo_graph.pickle
        and writes the drawing to README_figs/top_subreddits_graph.svg.
    """

    (n_samples, n_features) = Xps1.shape
    pickle_fn = "pickles/agglo_graph.pickle"

    try:
        # Pickle data is binary: open in "rb"/"wb" (text mode corrupts the
        # stream on Windows and fails outright on Python 3).
        with open(pickle_fn, "rb") as graph_file:
            g = pickle.load(graph_file)

    except IOError:
        # print() with one argument works identically on Python 2 and 3.
        print("No pickle of graph. Constructing.")

        Xps1 = Xps1.toarray()
        # Group subreddits, then colour each group by its logistic-regression
        # coefficient on the agglomerated features.
        (Xps1_agglo, sub_group) = phi_agglomerate(N=n_groups).fit(Xps1, Yps1).transform(Xps1)
        coefs = LogisticRegression().fit(Xps1_agglo, Yps1).coef_[0]
        colors = get_color_rgba(coefs)

        # Subreddit size = column totals; used below for node masking/sizing.
        sub_size = Xps1.sum(axis=0)

        # n_connections[a, b] = # fans active in both subreddits a and b.
        # Only the upper triangle (a < b) is filled, matching combinations().
        n_connections = np.zeros([n_features, n_features], dtype=int)

        for i_fan in range(n_samples):
            subs = np.nonzero(Xps1[i_fan])[0]
            for sub1, sub2 in combinations(subs, r=2):
                n_connections[sub1, sub2] += 1

        # Make vertices and assign properties
        g = Graph(directed=False)
        verts = g.add_vertex(n=n_features)
        verts = list(verts)

        sub_name = g.new_vertex_property("string")
        group = g.new_vertex_property("int")
        group_colour = g.new_vertex_property("vector<double>")
        sub_size_v = g.new_vertex_property("float")
        for i_vert in range(n_features):
            sub_name[verts[i_vert]] = nonmusic_subreddits[i_vert]
            group[verts[i_vert]] = sub_group[i_vert]
            group_colour[verts[i_vert]] = colors[sub_group[i_vert]]
            sub_size_v[verts[i_vert]] = sub_size[i_vert]

        # Make edges and assign properties
        connections = g.new_edge_property("int")
        group_av_colour = g.new_edge_property("vector<double>")
        group_av = g.new_edge_property("int")
        for a, b in combinations(range(n_features), r=2):
            e = g.add_edge(verts[a], verts[b])
            connections[e] = n_connections[a][b]
            # Explicit floor division: the property is an int, and // keeps
            # Python 2 and Python 3 behaviour identical (plain / returns a
            # float on Python 3).
            group_av[e] = (sub_group[a] + sub_group[b]) // 2
            group_av_colour[e] = colors[group_av[e]]

        # Make all properties internal for pickling
        g.vertex_properties["sub_name"] = sub_name
        g.vertex_properties["sub_size"] = sub_size_v
        g.vertex_properties["group"] = group
        g.vertex_properties["group_colour"] = group_colour
        g.edge_properties["connections"] = connections
        g.edge_properties["group_av"] = group_av
        g.edge_properties["group_color"] = group_av_colour

        with open(pickle_fn, "wb") as graph_file:
            pickle.dump(g, graph_file)

    # Mask small subreddits (less than node_cut users)
    # Take log of subreddit size for size representations
    vertex_filter = g.new_vertex_property("bool")
    g.vp.sub_size_log = g.new_vertex_property("float")
    biggest = 0
    root_vert = None  # initialised so an empty graph fails loudly, not with UnboundLocalError
    for vert in g.vertices():
        vertex_filter[vert] = g.vp.sub_size[vert] > node_cut
        g.vp.sub_size_log[vert] = np.log(g.vp.sub_size[vert])

        # Track biggest node to use as root
        if g.vp.sub_size[vert] > biggest:
            root_vert = vert
            biggest = g.vp.sub_size[vert]

    g.set_vertex_filter(vertex_filter)

    # Mask weakest edges (weight less than edge_cut)
    # Divide through # connections to make line thickness
    g.ep.line_thickness = g.new_edge_property("float")
    g.ep.line_thick_log = g.new_edge_property("float")
    edge_weight_threshold = g.new_edge_property("bool")
    for edge in g.edges():
        g.ep.line_thickness[edge] = g.ep.connections[edge] * 0.003
        a = edge.source()
        b = edge.target()
        # Edge weight = shared-fan count as a fraction of each endpoint's
        # size; take the weaker of the two directions.
        edge_weight = min(
            float(g.ep.connections[edge]) / g.vp.sub_size[a], float(g.ep.connections[edge]) / g.vp.sub_size[b]
        )
        edge_weight_threshold[edge] = edge_weight > edge_cut
    g.set_edge_filter(edge_weight_threshold)

    # Mask nodes with no edges (needs to converge)
    for vert in g.vertices():
        if len(list(vert.all_edges())) == 0:
            vertex_filter[vert] = False

    g.set_vertex_filter(vertex_filter)

    pos = radial_tree_layout(g, root_vert)
    graph_draw(
        g,
        pos=pos,
        output_size=(1000, 800),
        output="README_figs/top_subreddits_graph.svg",
        vertex_font_size=10,
        vertex_text=g.vp.sub_name,
        vertex_fill_color=g.vp.group_colour,
        vertex_size=g.vp.sub_size_log,
        edge_pen_width=g.edge_properties.line_thickness,
        edge_color=g.edge_properties.group_color,
    )