def check_degree_corr(adj_mat_infile, edges_infile):
    """
    Print and return the correlation between the vector of people's affil degrees
    and the vector of people's num-friends degrees.
    :param adj_mat_infile: bipartite graph (people x affils)
    :param edges_infile: true pairs (friendship edges)
    :return:
    """
    adj_mat, row_labels, _ = read_loc_adj_mat(adj_mat_infile)
    edge_matrix, _ = load_edge_matrix(edges_infile)
    pass  # not finished, not needed now
def loc_compute_corrs():
    adj_mat_infile = '/home/lfriedl/ASOUND-bipartite/data-prep/loc_data/interim/gowalla/stratified/bipartite_adj_6friends.txt.gz'
    edges_infile = '/home/lfriedl/ASOUND-bipartite/data-prep/loc_data/interim/gowalla/stratified/loc-edges_6friends.txt.gz'
    # or original files (on local machine)
    adj_mat_infile = '/Users/lfriedl/Documents/dissertation/real-data/gowalla/bipartite_adj.txt'
    edges_infile = '/Users/lfriedl/Documents/dissertation/real-data/gowalla/loc-gowalla_edges.txt'

    adj_mat, row_names, affil_names = read_loc_adj_mat(adj_mat_infile)
    edge_matrix, edge_row_labels_map = load_edge_matrix(edges_infile, row_names)

    # adj_mat is people x affils, so its row sums are affil degrees; edge_matrix holds
    # friendship edges, so its row sums are friend counts. Flatten the sparse row sums,
    # which come back as (n, 1) matrices.
    affils_per_item = np.asarray(adj_mat.sum(axis=1)).squeeze()
    friends_per_item = np.asarray(edge_matrix.sum(axis=1)).squeeze()
    # With two 1-D inputs, np.corrcoef treats each as one variable, so [0, 1] is their correlation.
    correlation = np.corrcoef(friends_per_item, affils_per_item)[0, 1]
    # print("correlation: " + str(correlation) + " for gowalla 6friends")
    print("correlation: " + str(correlation) + " for gowalla original")
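
# Illustrative sketch only (not part of the pipeline): the np.corrcoef call pattern
# used above, on made-up degree vectors.
def _demo_degree_corrcoef():
    """Toy example: correlate two hypothetical per-person degree vectors."""
    import numpy as np
    affil_degrees = np.array([3, 7, 2, 10, 5])   # made-up affil counts
    friend_degrees = np.array([1, 4, 1, 6, 3])   # made-up friend counts
    r = np.corrcoef(friend_degrees, affil_degrees)[0, 1]  # single scalar in [-1, 1]
    print("demo correlation: " + str(r))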
def read_sample_save(adj_mat_infile, edges_infile, num_nodes, rows_outfile):
    adj_mat, item_names, _ = read_loc_adj_mat(adj_mat_infile)
    row_ids_to_keep = set(random.sample(list(range(adj_mat.shape[0])), num_nodes))  # indices w/in adj_mat
    adj_mat_to_keep = adj_mat[sorted(row_ids_to_keep), :]
    item_names_to_keep = [item_names[i] for i in sorted(row_ids_to_keep)]  # oddly, subset notation above doesn't work
    # challenge: adj_mat_to_keep doesn't remember the old/semantically meaningful row labels.
    # Need to keep these around to send to the pair generators.
    print("Sampled " + str(num_nodes) + " nodes")
    with open(rows_outfile, 'wt') as fp:
        fp.write(" ".join(map(str, sorted(row_ids_to_keep))))  # probably need better syntax

    # edges can be stored efficiently in another sparse matrix
    label_generator = get_label_generator_from_edgefile(edges_infile, item_names_to_keep)
    return adj_mat_to_keep, item_names_to_keep, label_generator
def get_loc_expt_data(adj_mat_infile, edges_infile, row_ids_infile):
    """
    :param adj_mat_infile: bipartite graph. Each line = 1 item (name), followed by pipe-separated affils.
    :param edges_infile: true pairs. Each line = 2 item names (tab-separated).
    :param row_ids_infile: this expt will use the subgraph induced by this set of items. N.B. stored as indices, not names.
    :return:
    """
    adj_mat, row_labels, _ = read_loc_adj_mat(adj_mat_infile)
    with open(row_ids_infile, 'r') as fin:
        row_ids_to_keep = sorted(map(int, fin.readline().split()))  # row ids are all on one line, space-separated
    adj_mat_to_keep = adj_mat[row_ids_to_keep, :]
    row_labels_to_keep = [row_labels[i] for i in row_ids_to_keep]
    label_generator = get_label_generator_from_edgefile(edges_infile, row_labels_to_keep)
    return adj_mat_to_keep, row_labels_to_keep, label_generator
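
# Illustrative sketch only: the intended round trip between read_sample_save() and
# get_loc_expt_data(). All file paths here are hypothetical placeholders.
def _demo_sample_then_reload():
    adj_mat_infile = 'bipartite_adj.txt'   # hypothetical paths
    edges_infile = 'loc-edges.txt'
    rows_file = 'sampled_row_ids.txt'
    # First run: sample 500 rows and record which indices were used.
    read_sample_save(adj_mat_infile, edges_infile, num_nodes=500, rows_outfile=rows_file)
    # Later runs: rebuild exactly the same induced subgraph from the saved indices.
    adj_mat, row_labels, label_generator = get_loc_expt_data(adj_mat_infile, edges_infile, rows_file)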
def run_all_pairs_loc_data(adj_mat_infile, edges_infile, outdir, tag):
    # modified just a bit from get_loc_expt_data()
    adj_mat, row_names, affil_names = read_loc_adj_mat(adj_mat_infile)
    label_generator = get_label_generator_from_edgefile(edges_infile, row_names)

    evals_outfile = outdir + '/results_' + tag + '.txt'
    scored_pairs_outfile = outdir + '/scoredPairs_' + tag + '.csv.gz'
    all_methods_to_run = ['jaccard', 'cosine', 'cosineIDF', 'shared_size', 'hamming', 'pearson',
                          'shared_weight11', 'shared_weight1100', 'adamic_adar', 'newman',
                          'weighted_corr', 'mixed_pairs']  # (for gowalla, just ran standard before)
    mixed_pairs_sims = [.001, .005, .01, .05, .1, .2, .3, .4, .5]
    score_data.run_and_eval(adj_mat, true_labels_func=label_generator,
                            method_spec=all_methods_to_run,  # or a single [method]
                            evals_outfile=evals_outfile, mixed_pairs_sims=mixed_pairs_sims,
                            pair_scores_outfile=scored_pairs_outfile, row_labels=row_names,
                            remove_boundary_items=False, print_timing=True, prefer_faiss=True)
def affil_subsets_loc_data(adj_mat_infile, edges_infile, outdir, tag, affil_subset_fraction, affil_subset_type):
    adj_mat, row_names, affil_names = read_loc_adj_mat(adj_mat_infile)
    label_generator = get_label_generator_from_edgefile(edges_infile, row_names)

    # choose a subset of affils (columns) based on their empirical frequencies
    tmp_pi_vector = np.asarray(adj_mat.sum(axis=0)).squeeze() / float(adj_mat.shape[0])
    affils_to_keep = affil_subsets.compute_affil_subsets(tmp_pi_vector, affil_subset_fraction, affil_subset_type)
    adj_mat = adj_mat[:, affils_to_keep]

    evals_outfile = outdir + '/results_' + tag + '.txt'
    scored_pairs_outfile = outdir + '/scoredPairs_' + tag + '.csv.gz'
    all_methods_to_run = ['jaccard', 'cosine', 'cosineIDF', 'shared_size', 'hamming', 'pearson',
                          'shared_weight11', 'shared_weight1100', 'adamic_adar', 'newman',
                          'weighted_corr', 'mixed_pairs']
    score_data.run_and_eval(adj_mat, true_labels_func=label_generator,
                            method_spec=all_methods_to_run,  # or a single [method]
                            evals_outfile=evals_outfile,
                            pair_scores_outfile=scored_pairs_outfile, row_labels=row_names,
                            remove_boundary_items=False, print_timing=True, prefer_faiss=True)
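
# Illustrative sketch only: the column-filtering pattern used above, with a hypothetical
# stand-in for affil_subsets.compute_affil_subsets() that keeps the most frequent fraction
# of affils (the real function also supports other affil_subset_type options).
def _demo_affil_subset_mask():
    import numpy as np
    from scipy import sparse
    adj = sparse.csr_matrix((np.random.rand(20, 10) < 0.3).astype(int))  # toy people x affils
    pi_vector = np.asarray(adj.sum(axis=0)).squeeze() / float(adj.shape[0])
    num_keep = int(round(0.5 * len(pi_vector)))
    cols_to_keep = np.argsort(-pi_vector)[:num_keep]  # top half by frequency
    adj_subset = adj[:, cols_to_keep]
    print(adj_subset.shape)  # (20, 5)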
def stratify_by_num_edges(adj_mat_infile, edges_infile, outdir, min_edges, max_edges):
    """
    Given a data set, calculate the number of true pairs each item participates in --> its degree.
    Then partition the data set by item degree. Write out a separate adj_mat and edges_file for
    items of each degree, from min_edges to max_edges.
    :param adj_mat_infile: The full bipartite graph for a data set.
    :param edges_infile: Full set of true pairs.
    :param outdir: directory for the per-degree output files
    :param min_edges: smallest degree to write out
    :param max_edges: largest degree to write out
    :return:
    """
    # people are numbered 0 through max in orig file
    edge_matrix, edge_row_labels_map = prelim_loc_data_expts.load_edge_matrix(edges_infile)
    edge_row_indices_to_labels = {v: k for k, v in edge_row_labels_map.items()}  # simply reversed, var[index] = label
    # adj_mat is missing some people; row_labels is what matches edge_matrix
    adj_mat, row_labels, loc_labels = loc_data.read_loc_adj_mat(adj_mat_infile)

    num_friends = np.asarray(edge_matrix.sum(axis=0)).squeeze()
    for edge_count in range(min_edges, max_edges + 1):
        # filter both matrices to people with exactly edge_count friends
        people_to_keep_edgemat_index = set(np.nonzero(num_friends == edge_count)[0])
        print("found " + str(len(people_to_keep_edgemat_index)) + " people having " +
              str(edge_count) + " friends each")
        people_labels_to_keep = set([label for (ind, label) in edge_row_indices_to_labels.items()
                                     if ind in people_to_keep_edgemat_index])

        # save files
        adj_mat_file = outdir + "/bipartite_adj_" + str(edge_count) + "friends.txt.gz"
        edge_mat_file = outdir + "/loc-edges_" + str(edge_count) + "friends.txt.gz"
        with gzip.open(adj_mat_file, 'wt') as fout:
            # match formatting of orig file: row_id,loc|loc|loc|...
            fout.write("V1,checkins\n")
            for i in range(adj_mat.shape[0]):
                if row_labels[i] in people_labels_to_keep:
                    fout.write(str(row_labels[i]) + "," +
                               "|".join([loc_labels[j] for j in np.nonzero(adj_mat[i, ].toarray()[0])[0]]) + "\n")

        with gzip.open(edge_mat_file, 'wt') as fout:
            # orig format: i<tab>j. (Was symmetric, and we'll store it as such.)
            for (i, j) in zip(*edge_matrix.nonzero()):
                if (edge_row_indices_to_labels[i] in people_labels_to_keep and
                        edge_row_indices_to_labels[j] in people_labels_to_keep):
                    fout.write(str(edge_row_indices_to_labels[i]) + "\t" +
                               str(edge_row_indices_to_labels[j]) + "\n")
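
# Illustrative sketch only: the exact-degree filter used in stratify_by_num_edges(),
# on a made-up symmetric edge matrix.
def _demo_degree_filter():
    import numpy as np
    from scipy import sparse
    # 4 people: 0 is friends with 1, 2, and 3; 1 and 2 are also friends with each other.
    edges = np.array([[0, 1, 1, 1],
                      [1, 0, 1, 0],
                      [1, 1, 0, 0],
                      [1, 0, 0, 0]])
    edge_matrix = sparse.csr_matrix(edges)
    num_friends = np.asarray(edge_matrix.sum(axis=0)).squeeze()
    print(num_friends)  # [3 2 2 1]
    people_with_2_friends = set(np.nonzero(num_friends == 2)[0])
    print(people_with_2_friends)  # {1, 2}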
def compare_timings_faiss_normal(adj_mat_infile, evals_outfile, scored_pairs_outfile):
    # note: currently reads a hardcoded local path rather than adj_mat_infile
    infile = "/Users/lfriedl/Documents/dissertation/real-data/brightkite/bipartite_adj.txt"
    num_nodes = (100, 1000, 5000)  # my OS kills it at 10000 (due to memory)
    # num_nodes = [2000]
    for num_to_try in num_nodes:
        adj_mat, _, _ = loc_data.read_loc_adj_mat(infile, max_rows=num_to_try)
        print("\n*** Running all faiss methods ***\n")
        print("(asked for " + str(num_to_try) + " nodes)")

        methods_to_run = scoring_with_faiss.all_faiss_methods
        start = timer()
        score_data.run_and_eval(adj_mat,
                                true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
                                method_spec=methods_to_run, evals_outfile=evals_outfile,
                                pair_scores_outfile=scored_pairs_outfile, print_timing=True)
        end = timer()
        print("ran all " + str(len(methods_to_run)) + " methods in " + str(end - start) + " seconds")

        print("Now running normal versions for comparison")
        normal_versions = [x[:-6] for x in methods_to_run]  # strip the '_faiss' suffix
        start = timer()
        score_data.run_and_eval(adj_mat,
                                true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
                                method_spec=normal_versions, evals_outfile=evals_outfile,
                                pair_scores_outfile=scored_pairs_outfile, print_timing=True,
                                make_dense=True)
        end = timer()
        print("ran all " + str(len(normal_versions)) + " methods in " + str(end - start) + " seconds")
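
# Illustrative sketch only: the '_faiss' suffix-stripping used above, assuming every
# entry of scoring_with_faiss.all_faiss_methods ends in '_faiss' (6 characters).
def _demo_faiss_name_mapping():
    faiss_methods = ['weighted_corr_faiss', 'cosine_faiss']  # made-up subset
    normal_versions = [m[:-6] for m in faiss_methods]
    print(normal_versions)  # ['weighted_corr', 'cosine']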
def resources_test_wc():
    # Smaller-scale companion to resources_test() below: times only weighted_corr
    # (plain vs. faiss) on the dense matrix. (Renamed from resources_test to avoid
    # clobbering the fuller version below.)
    infile = "/Users/lfriedl/Documents/dissertation/real-data/brightkite/bipartite_adj.txt"
    num_nodes = (100, 1000, 5000)  # my OS kills it at 10000 (due to memory)
    for num_to_try in num_nodes:
        adj_mat, _, _ = loc_data.read_loc_adj_mat(infile, max_rows=num_to_try)
        pi_vector_learned = score_data.learn_pi_vector(adj_mat)
        pi_vector_preproc, adj_mat_preproc = expts_labeled_data.adjust_pi_vector(pi_vector_learned, adj_mat)

        # plain WC uses "transform" when dense, "terms" when sparse -- speed varies accordingly
        methods_to_run = ['weighted_corr', 'weighted_corr_faiss']

        adj_mat_preproc_dense = adj_mat_preproc.toarray()
        print("\ndense version takes up " + str(sys.getsizeof(adj_mat_preproc_dense)) + " bytes")

        start = timer()
        # scores_faiss = scoring_with_faiss.score_pairs_faiss(adj_mat, methods_to_run, print_timing=True,
        #                                                     pi_vector=pi_vector_preproc)
        score_data.scoring_methods.score_pairs(score_data.gen_all_pairs, adj_mat_preproc_dense,
                                               which_methods=methods_to_run,
                                               pi_vector=pi_vector_preproc, back_compat=True,
                                               num_docs=adj_mat_preproc.shape[0],
                                               mixed_pairs_sims=[.01], print_timing=True)
        end = timer()
        print("for matrix with " + str(adj_mat_preproc.shape[0]) + " items, " +
              str(adj_mat_preproc.shape[1]) + " affils, ")
        print("ran all methods using dense matrix in " + str(end - start) + " seconds")
def resources_test(run_all_implementations=True, use_faiss=False):
    # Read in portions of a big matrix in increasing size, and for each size, score all pairs
    # (both sparse and dense). This shows how things scale and where memory limits come in.
    infile = "/Users/lfriedl/Documents/dissertation/real-data/brightkite/bipartite_adj.txt"
    # num_nodes = (100, 1000, 10000, 100000)
    # num_nodes = [10000]  # this size: no run finished in the length of time I was willing to wait
    num_nodes = (100, 500, 1000, 5000)
    # num_nodes = [5000]
    for num_to_try in num_nodes:
        adj_mat, _, _ = loc_data.read_loc_adj_mat(infile, max_rows=num_to_try)
        pi_vector_learned = score_data.learn_pi_vector(adj_mat)
        pi_vector_preproc, adj_mat_preproc = expts_labeled_data.adjust_pi_vector(pi_vector_learned, adj_mat)

        # (order given here doesn't matter)
        methods_to_run = ['cosine', 'cosineIDF',  # use fast "transform"
                          'shared_size', 'adamic_adar', 'newman', 'shared_weight11',  # medium
                          'hamming', 'pearson', 'jaccard',
                          # WC uses "transform" when dense, "terms" when sparse -- speed varies accordingly
                          'weighted_corr', 'weighted_corr_exp',
                          'shared_weight1100', 'mixed_pairs']  # only have slow "terms" method

        adj_mat_preproc_dense = adj_mat_preproc.toarray()
        print("\ndense version takes up " + str(sys.getsizeof(adj_mat_preproc_dense)) + " bytes")

        want_exp_model = ('weighted_corr_exp' in methods_to_run) or \
                         ('weighted_corr_exp_faiss' in methods_to_run) or ('all' in methods_to_run)
        start = timer()
        graph_models = bipartite_fitting.learn_graph_models(adj_mat, bernoulli=False, pi_vector=None,
                                                            exponential=want_exp_model)
        end = timer()
        if want_exp_model:
            print("time for learning exponential model: " + str(end - start) + " seconds")

        start = timer()
        score_data.scoring_methods.score_pairs(score_data.gen_all_pairs, adj_mat_preproc_dense,
                                               which_methods=methods_to_run,
                                               pi_vector=pi_vector_preproc, back_compat=True,
                                               num_docs=adj_mat_preproc.shape[0],
                                               mixed_pairs_sims=[.01], print_timing=True,
                                               exp_model=graph_models.get('exponential', None),
                                               run_all_implementations=run_all_implementations,
                                               prefer_faiss=use_faiss)
        end = timer()
        print("for matrix with " + str(adj_mat_preproc.shape[0]) + " items, " +
              str(adj_mat_preproc.shape[1]) + " affils, ")
        print("ran all methods using dense matrix in " + str(end - start) + " seconds")

        print("\nsparse adj_matrix takes up " + str(asizeof.asizeof(adj_mat_preproc)) + " bytes;")
        start = timer()
        score_data.scoring_methods.score_pairs(score_data.gen_all_pairs, adj_mat_preproc,
                                               which_methods=methods_to_run,
                                               pi_vector=pi_vector_preproc, back_compat=True,
                                               num_docs=adj_mat_preproc.shape[0],
                                               mixed_pairs_sims=[.01], print_timing=True,
                                               exp_model=graph_models.get('exponential', None),
                                               run_all_implementations=run_all_implementations)
        end = timer()
        print("for matrix with " + str(adj_mat_preproc.shape[0]) + " items, " +
              str(adj_mat_preproc.shape[1]) + " affils, ")
        print("ran all methods using sparse matrix in " + str(end - start) + " seconds")
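
# Illustrative sketch only: the dense-vs-sparse memory comparison pattern used above, on a
# made-up matrix. sys.getsizeof() covers a NumPy array's buffer, while pympler's asizeof()
# is needed to traverse a sparse matrix's internal arrays.
def _demo_dense_vs_sparse_memory():
    import sys
    from scipy import sparse
    from pympler import asizeof
    mat = sparse.random(1000, 500, density=0.01, format='csr')  # toy matrix, ~1% nonzero
    print("sparse: " + str(asizeof.asizeof(mat)) + " bytes")
    print("dense:  " + str(sys.getsizeof(mat.toarray())) + " bytes")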
def test_cosine_versions():
    infile = "/Users/lfriedl/Documents/dissertation/real-data/brightkite/bipartite_adj.txt"
    # num_nodes = (100, 500, 1000, 5000)
    num_nodes = [1000, 2000]
    for num_to_try in num_nodes:
        adj_mat, _, _ = loc_data.read_loc_adj_mat(infile, max_rows=num_to_try)
        pi_vector_learned = score_data.learn_pi_vector(adj_mat)
        pi_vector_preproc, adj_mat_preproc = expts_labeled_data.adjust_pi_vector(pi_vector_learned, adj_mat)
        print("\nmatrix has " + str(adj_mat_preproc.shape[0]) + " items, " +
              str(adj_mat_preproc.shape[1]) + " affils ")
        print("process memory: ")
        print(get_process_memory())

        print("\n** sklearn sparse cosine **")
        scoring_methods_fast.simple_only_cosine(score_data.gen_all_pairs, adj_mat_preproc,
                                                print_timing=True, use_package=True)
        print(get_process_memory())

        print("\n** sklearn, called on a sparse.csr copy of the matrix **")
        start = timer()
        cos = []
        all_pairs_scores = cosine_similarity(sparse.csr_matrix(adj_mat_preproc))
        for (row_idx1, row_idx2, _, _, _, _) in score_data.gen_all_pairs(adj_mat_preproc):
            score = all_pairs_scores[row_idx1, row_idx2]
            cos.append(score if not np.isnan(score) else 0)
        end = timer()
        print("duration: " + str(end - start) + " seconds")
        print(get_process_memory())

        adj_mat_preproc_dense = adj_mat_preproc.toarray()
        print("\nmade matrix dense")
        print(get_process_memory())

        print("\n** home-grown dense cosine **")
        scoring_methods_fast.simple_only_cosine(score_data.gen_all_pairs, adj_mat_preproc_dense,
                                                print_timing=True, use_package=False)
        print(get_process_memory())

        print("\n** sklearn dense, using batches **")
        start = timer()
        cos = []
        all_pairs_scores = scoring_methods_fast.cosine_similarity_n_space(adj_mat_preproc_dense,
                                                                          adj_mat_preproc_dense, verbose=True)
        for (row_idx1, row_idx2, _, _, _, _) in score_data.gen_all_pairs(adj_mat_preproc_dense):
            score = all_pairs_scores[row_idx1, row_idx2]
            cos.append(score if not np.isnan(score) else 0)
        end = timer()
        print("duration: " + str(end - start) + " seconds")
        print(get_process_memory())

        print("\n** faiss (dense) ** ")
        scoring_with_faiss.score_pairs_faiss_all_exact(adj_mat_preproc_dense, ['cosine_faiss'],
                                                       print_timing=True)
        print(get_process_memory())

        print("\n** sklearn dense cosine **")
        scoring_methods_fast.simple_only_cosine(score_data.gen_all_pairs, adj_mat_preproc_dense,
                                                print_timing=True, use_package=True)
        print(get_process_memory())
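
# Illustrative sketch only: a quick check that sklearn's cosine_similarity gives the same
# all-pairs matrix for sparse and dense input, which is the equivalence the timing
# comparisons above rely on.
def _demo_cosine_agreement():
    import numpy as np
    from scipy import sparse
    from sklearn.metrics.pairwise import cosine_similarity
    dense = (np.random.rand(50, 20) < 0.2).astype(float)  # toy 0/1 matrix
    sim_dense = cosine_similarity(dense)
    sim_sparse = cosine_similarity(sparse.csr_matrix(dense))
    print(np.allclose(sim_dense, sim_sparse))  # True (up to float tolerance)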