from itertools import combinations
import pickle

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold  # pre-0.18 scikit-learn API
from graph_tool.all import Graph, graph_draw, radial_tree_layout

# prune_sparse_samples, balance_data, phi_agglomerate and get_color_rgba
# are helper functions defined elsewhere in this repo.


def agglo_logit_calc(Xps1, Yps1, nonmusic_subreddits):
    """
    Handles fitting and scoring of the agglomeration->logistic regression
    machine learning scheme.
    """
    Xps1 = Xps1.toarray()
    logit = LogisticRegression()

    (n_samples_1, _) = Xps1.shape
    n_folds = 4
    rand = 0
    kf = KFold(n_samples_1, n_folds=n_folds, shuffle=True, random_state=rand)

    logit_1 = 0.0
    logit_20 = 0.0

    # Numbers of agglomerated feature groups to sweep over
    n_lo = 1
    n_hi = 155
    step = 1
    n_groups_gen = range(n_lo, n_hi + 1, step)
    agglo_1s = [0.0 for _ in n_groups_gen]
    agglo_20s = [0.0 for _ in n_groups_gen]
    params = np.empty([len(n_groups_gen), n_folds], dtype=object)
    logit_params = []

    for i_fold, (train, test) in enumerate(kf):
        print i_fold

        # Baseline: logistic regression on the raw (unagglomerated) features
        logit.fit(Xps1[train], Yps1[train])
        logit_params.append(logit.coef_)
        logit_1 += 100.0 * logit.score(Xps1[test], Yps1[test])

        # Score the same model on a pruned (threshold=20) and balanced test set
        (Xps20_test, Yps20_test) = prune_sparse_samples(
            Xps1[test], Yps1[test], threshold=20)
        (Xps20_test, Yps20_test) = balance_data(Xps20_test, Yps20_test)
        logit_20 += 100.0 * logit.score(Xps20_test, Yps20_test)

        # Agglomerate features into n_groups clusters, then refit and score
        for j, n_groups in enumerate(n_groups_gen):
            agglo = phi_agglomerate(N=n_groups).fit(Xps1[train], Yps1[train])
            Xagglo_train_1, _ = agglo.transform(Xps1[train])
            Xagglo_test_1, _ = agglo.transform(Xps1[test])
            Xagglo_test_20, _ = agglo.transform(Xps20_test)

            logit.fit(Xagglo_train_1, Yps1[train])
            params[j][i_fold] = logit.coef_
            agglo_1s[j] += 100.0 * logit.score(Xagglo_test_1, Yps1[test]) / n_folds
            agglo_20s[j] += 100.0 * logit.score(Xagglo_test_20, Yps20_test) / n_folds

    logit_1 /= n_folds
    logit_20 /= n_folds

    return (n_lo, n_hi, logit_1, logit_20, n_groups_gen,
            agglo_1s, agglo_20s, params, logit_params)
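# A minimal usage sketch (assumption: Xps1 is the sparse fans-by-subreddits
# matrix and Yps1 the label vector built elsewhere in this repo; the variable
# names here are hypothetical):
#
#     (n_lo, n_hi, logit_1, logit_20, n_groups_gen,
#      agglo_1s, agglo_20s, params, logit_params) = agglo_logit_calc(
#         Xps1, Yps1, nonmusic_subreddits)
#     best = int(np.argmax(agglo_1s))
#     print "raw logit accuracy: %.1f%%" % logit_1
#     print "best agglomeration: %d groups, %.1f%%" % (n_groups_gen[best],
#                                                      agglo_1s[best])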
def graph_music_taste(Xps1, Yps1, nonmusic_subreddits, n_groups=20,
                      node_cut=2000, edge_cut=0.15):
    """
    Creates a graph of connected subreddits, colour-coded by the
    agglomerated group they come under.

    Keyword args:
    node_cut -- number of fans who need to post in a subreddit for it to be
                included in the visualisation
    edge_cut -- weight of the weakest edge that will be included in the
                visualisation (shared fans as a fraction of the larger
                subreddit's size)
    """
    (n_samples, n_features) = Xps1.shape

    pickle_fn = "pickles/agglo_graph.pickle"
    try:
        with open(pickle_fn, "rb") as graph_file:
            g = pickle.load(graph_file)
    except IOError:
        print "No pickle of graph. Constructing."
        Xps1 = Xps1.toarray()
        (Xps1_agglo, sub_group) = phi_agglomerate(N=n_groups).fit(
            Xps1, Yps1).transform(Xps1)
        coefs = LogisticRegression().fit(Xps1_agglo, Yps1).coef_[0]
        colors = get_color_rgba(coefs)

        # Subreddit sizes, used later to mask out subreddits below a
        # threshold size
        sub_size = Xps1.sum(axis=0)

        # Connections matrix: number of users posting in both subreddits
        n_connections = np.zeros([n_features, n_features], dtype=int)
        for i_fan in range(n_samples):
            subs = np.nonzero(Xps1[i_fan])[0]
            for sub1, sub2 in combinations(subs, r=2):
                n_connections[sub1, sub2] += 1

        # Make vertices and assign properties
        g = Graph(directed=False)
        verts = list(g.add_vertex(n=n_features))
        sub_name = g.new_vertex_property("string")
        group = g.new_vertex_property("int")
        group_colour = g.new_vertex_property("vector<double>")
        sub_size_v = g.new_vertex_property("float")
        for i_vert in range(n_features):
            sub_name[verts[i_vert]] = nonmusic_subreddits[i_vert]
            group[verts[i_vert]] = sub_group[i_vert]
            group_colour[verts[i_vert]] = colors[sub_group[i_vert]]
            sub_size_v[verts[i_vert]] = sub_size[i_vert]

        # Make edges and assign properties
        connections = g.new_edge_property("int")
        group_av_colour = g.new_edge_property("vector<double>")
        group_av = g.new_edge_property("int")
        for a, b in combinations(range(n_features), r=2):
            e = g.add_edge(verts[a], verts[b])
            connections[e] = n_connections[a][b]
            group_av[e] = (sub_group[a] + sub_group[b]) / 2
            group_av_colour[e] = colors[group_av[e]]

        # Make all properties internal for pickling
        g.vertex_properties["sub_name"] = sub_name
        g.vertex_properties["sub_size"] = sub_size_v
        g.vertex_properties["group"] = group
        g.vertex_properties["group_colour"] = group_colour
        g.edge_properties["connections"] = connections
        g.edge_properties["group_av"] = group_av
        g.edge_properties["group_color"] = group_av_colour

        with open(pickle_fn, "wb") as graph_file:
            pickle.dump(g, graph_file)

    # Mask small subreddits (node_cut users or fewer) and take the log of
    # subreddit size for the size representation
    vertex_filter = g.new_vertex_property("bool")
    g.vp.sub_size_log = g.new_vertex_property("float")
    biggest = 0
    root_vert = None
    for vert in g.vertices():
        vertex_filter[vert] = g.vp.sub_size[vert] > node_cut
        g.vp.sub_size_log[vert] = np.log(g.vp.sub_size[vert])
        # Track biggest node to use as root of the radial layout
        if g.vp.sub_size[vert] > biggest:
            root_vert = vert
            biggest = g.vp.sub_size[vert]
    g.set_vertex_filter(vertex_filter)

    # Mask weakest edges (weight less than edge_cut); line thickness is
    # proportional to the number of shared users
    g.ep.line_thickness = g.new_edge_property("float")
    g.ep.line_thick_log = g.new_edge_property("float")
    edge_weight_threshold = g.new_edge_property("bool")
    for edge in g.edges():
        g.ep.line_thickness[edge] = g.ep.connections[edge] * 0.003
        # g.ep.line_thick_log[edge] = np.log(g.ep.connections[edge])
        a = edge.source()
        b = edge.target()
        edge_weight = min(
            float(g.ep.connections[edge]) / g.vp.sub_size[a],
            float(g.ep.connections[edge]) / g.vp.sub_size[b],
        )
        edge_weight_threshold[edge] = edge_weight > edge_cut
    g.set_edge_filter(edge_weight_threshold)

    # Mask nodes left with no visible edges (needs to converge)
    for vert in g.vertices():
        if len(list(vert.all_edges())) == 0:
            vertex_filter[vert] = False
    g.set_vertex_filter(vertex_filter)

    pos = radial_tree_layout(g, root_vert)
    graph_draw(
        g,
        pos=pos,
        output_size=(1000, 800),
        output="README_figs/top_subreddits_graph.svg",
        vertex_font_size=10,
        vertex_text=g.vp.sub_name,
        vertex_fill_color=g.vp.group_colour,
        vertex_size=g.vp.sub_size_log,
        edge_pen_width=g.edge_properties.line_thickness,
        edge_color=g.edge_properties.group_color,
    )
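# A minimal driver sketch (assumption: the same Xps1/Yps1/nonmusic_subreddits
# objects as above, built by the data-loading step elsewhere in this repo):
#
#     graph_music_taste(Xps1, Yps1, nonmusic_subreddits,
#                       n_groups=20, node_cut=2000, edge_cut=0.15)
#
# This writes README_figs/top_subreddits_graph.svg, caching the constructed
# graph in pickles/agglo_graph.pickle so that later runs only redo the
# masking and drawing steps.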