def test_adj_and_phi():
    """
    Check that learning, preprocessing and flipping pi_vector match what the R code produced.

    Input:      'ng_aa_data1/data15_adj_mat.mtx.gz'
    References: 'ng_aa_data1/data15' + [.dataphi.txt.gz, .dataphipreproc.txt.gz,
                .dataphiflipped.txt.gz, .adj_mat_flipped.mtx.gz]

    Raises AssertionError on the first mismatch.
    """
    print(
        "\n*** Testing reading adjacency matrix and computing pi_vector ***\n")

    # The "data15*" example files hold the contents of the expt data file alt.atheism/data1.Rdata.
    # (Not used: "ng_aa_data1/data15_phi.txt.gz", which is the phi of the whole, larger data set.)
    adj_mat_infile = "ng_aa_data1/data15_adj_mat.mtx.gz"

    # Quirk from R: it saved floating point data with 7 digits of precision (see getOptions("digits")
    # and format()), so those phi files can only be compared to within 1e-07 unless they are re-converted
    # with higher precision. Upping the precision only revealed that the R code is imprecise anyway: its
    # Bernoulli <-> multinomial conversion doesn't keep the exact probabilities (possibly a bug; the other
    # place the code does this conversion fixes it explicitly).
    tolerance = 1e-07

    def largest_gap(ours, reference):
        # Largest element-wise absolute difference between two vectors.
        return max(abs(ours - reference))

    # Reference vectors, constructed by hand with the existing R code in experimentRunner.R
    learned_ref = expts_labeled_data.load_pi_from_file(
        "ng_aa_data1/data15.dataphi.txt.gz")
    preproc_ref = expts_labeled_data.load_pi_from_file(
        "ng_aa_data1/data15.dataphipreproc.txt.gz")

    adj_mat = score_data.load_adj_mat(adj_mat_infile)
    learned = score_data.learn_pi_vector(adj_mat)
    preproc, adj_mat_preproc = expts_labeled_data.adjust_pi_vector(
        learned, adj_mat)

    # Learned vector: one entry per column of adj_mat, and equal to the R version.
    assert learned.shape[0] == adj_mat.shape[1]
    assert largest_gap(learned, learned_ref) < tolerance

    # Preprocessed vector: one entry per column of adj_mat_preproc, and equal to the R version.
    assert preproc.shape[0] == adj_mat_preproc.shape[1]
    assert largest_gap(preproc, preproc_ref) < tolerance

    # Flipping
    flipped_ref = expts_labeled_data.load_pi_from_file(
        "ng_aa_data1/data15.dataphiflipped.txt.gz")
    adj_mat_flipped_ref = score_data.load_adj_mat(
        "ng_aa_data1/data15.adj_mat_flipped.mtx.gz")
    flipped, adj_mat_flipped = expts_labeled_data.adjust_pi_vector(
        learned, adj_mat, flip_high_ps=True)

    # Flipped vector and flipped matrix must both agree with their R counterparts.
    assert flipped.shape == preproc.shape
    assert largest_gap(flipped, flipped_ref) < tolerance
    assert adj_mat_flipped_ref.shape == adj_mat_flipped.shape
    assert abs(adj_mat_flipped_ref - adj_mat_flipped).max() < tolerance
def test_score_wc_faiss():
    """
    Smoke-test the faiss-based scoring entry points on "reality_appweek_50/data50_adjMat.mtx.gz".

    Runs weighted_corr_faiss through score_pairs_faiss(), then adamic_adar_faiss and pearson_faiss
    (sparse and dense input) through score_pairs_faiss_all_exact(), printing the head of each result.
    Nothing is asserted; the test passes if every call returns.
    """

    def show_sample(frame):
        print('scores look like (sample):\n' + str(frame.head()))

    raw_adj_mat = score_data.load_adj_mat("reality_appweek_50/data50_adjMat.mtx.gz")
    learned = score_data.learn_pi_vector(raw_adj_mat)
    pi_vector, adj_mat = expts_labeled_data.adjust_pi_vector(learned, raw_adj_mat)

    wc_scores = scoring_with_faiss.score_pairs_faiss(
        adj_mat,
        which_methods=['weighted_corr_faiss'],
        how_many_neighbors=-1,
        print_timing=True,
        pi_vector=pi_vector)
    show_sample(wc_scores)
    # note for later: wc_scores.reset_index() makes it save item1 & item2 as regular columns,
    # and defaults back to an index of row numbers

    print("calling adamic-adar")
    exact_scores = scoring_with_faiss.score_pairs_faiss_all_exact(
        adj_mat,
        'adamic_adar_faiss',
        pi_vector=pi_vector,
        num_docs=adj_mat.shape[0])
    show_sample(exact_scores)

    print("calling pearson")
    exact_scores = scoring_with_faiss.score_pairs_faiss_all_exact(
        adj_mat, 'pearson_faiss')
    show_sample(exact_scores)

    print("(dense input)")
    exact_scores = scoring_with_faiss.score_pairs_faiss_all_exact(
        adj_mat.toarray(), 'pearson_faiss')
    show_sample(exact_scores)
def test_only_wc(adj_mat_infile, scored_pairs_file_R, scored_pairs_file_new):
    """
    Like test_pair_scores_against_R(), but checks scores & timing of simple_only_weighted_corr()
    (the first scoring method implemented using a transform of the adj_matrix).

    :param adj_mat_infile: local path ending in .mtx.gz
    :param scored_pairs_file_R: local path ending in .csv.gz; read for its "pearsonWeighted" column
    :param scored_pairs_file_new: path the newly computed scores are written to (gzipped csv), then
                                  read back for the "weighted_corr" column
    Raises AssertionError if the two columns differ by 1e-05 or more.
    """
    print(
        "\n*** Checking simple_only_weighted_corr against scores from R ***\n")

    # Read adj data and prep pi_vector
    adj_mat = score_data.load_adj_mat(adj_mat_infile)
    pi_vector_learned = score_data.learn_pi_vector(adj_mat)
    pi_vector_preproc, adj_mat_preproc = expts_labeled_data.adjust_pi_vector(
        pi_vector_learned, adj_mat)

    def item_pairs():
        # Generator handed to to_csv_gz(); built lazily, each time it is called.
        return score_data.ij_gen(adj_mat_preproc.shape[0])

    # Score all pairs straight into the storage object, then dump it to disk.
    scores_storage = magic_dictionary.make_me_a_dict(adj_mat_preproc.shape[0])
    scoring_methods.extra_implementations.simple_only_weighted_corr(
        score_data.gen_all_pairs,
        adj_mat_preproc,
        scores_storage.create_and_store_array("weighted_corr"),
        pi_vector_preproc,
        print_timing=True)
    scores_storage.to_csv_gz(scored_pairs_file_new, item_pairs)

    # Reload both score files and compare the weighted-correlation columns.
    with gzip.open(scored_pairs_file_new, 'r') as new_fp:
        wc_frame = pd.read_csv(new_fp)
    with gzip.open(scored_pairs_file_R, 'r') as ref_fp:
        scores_data_frame_R = pd.read_csv(ref_fp)

    abs_diffs = abs(wc_frame["weighted_corr"] - scores_data_frame_R["pearsonWeighted"])
    print("max diff: " + str(abs_diffs.max()))
    assert max(abs_diffs) < 1e-05
def test_adj_and_phi2():
    """
    Check that pi_vector can be learned from the adjacency matrix of a second data set.

    Files: "reality_appweek_50/data50_adjMat.mtx.gz" (input) and
           "reality_appweek_50/data50-inference-allto6.phi.csv.gz" (expected preprocessed phi).
    Raises AssertionError if the vectors differ by 1e-07 or more.
    """
    print(
        "\n*** Testing reading adjacency matrix and computing pi_vector (2) ***\n"
    )

    # Deliberately not newsgroups data: those files are too complicated because they were run early.
    # Goal: learn phi from the adjacency matrix and end up with the version in the inference file.
    expected_phi = expts_labeled_data.load_pi_from_file(
        "reality_appweek_50/data50-inference-allto6.phi.csv.gz")

    adj_mat = score_data.load_adj_mat("reality_appweek_50/data50_adjMat.mtx.gz")
    learned_phi = score_data.learn_pi_vector(adj_mat)
    # The adjusted matrix is returned too, but only the vector is checked here.
    adjusted_phi, _adj_mat_preproc = expts_labeled_data.adjust_pi_vector(
        learned_phi, adj_mat)

    worst_gap = max(abs(adjusted_phi - expected_phi))
    assert worst_gap < 1e-07
def test_faiss_basic_calls():
    """
    Exercise the basic faiss calls (build a flat inner-product index, add rows, search) on
    "reality_appweek_50/data50_adjMat.mtx.gz".

    Nothing is asserted; the test passes if every call returns.
    """
    raw_adj_mat = score_data.load_adj_mat("reality_appweek_50/data50_adjMat.mtx.gz")
    learned = score_data.learn_pi_vector(raw_adj_mat)
    _pi_vector, adj_mat = expts_labeled_data.adjust_pi_vector(learned, raw_adj_mat)

    # Dot product on the plain adj matrix -- just computes sharedSize.
    num_cols = adj_mat.shape[1]
    index = faiss.IndexFlatIP(num_cols)  # takes numCols as arg

    # (The tutorial example instead adds np.random.random((100, num_cols)).astype('float32').)
    # adj_mat is sparse, but faiss wants dense. and, apparently, wants float32.
    dense_rows = adj_mat.toarray().astype('float32')
    index.add(dense_rows)
    print("index.is_trained: " + str(index.is_trained) + ", index.total: " +
          str(index.ntotal))

    # 10 nearest neighbors of each input row; results are not inspected.
    _distances10, _neighbors10 = index.search(dense_rows, 10)
    # Every row against every row, i.e. all pairs; results are not inspected.
    _distances, _neighbors = index.search(dense_rows, dense_rows.shape[0])

    print('basic calls ran')
def resources_test():
    """
    Time weighted_corr and weighted_corr_faiss on dense matrices built from increasingly large
    prefixes of the brightkite adjacency file, printing matrix size and elapsed time for each.

    NOTE(review): a second resources_test() defined later in this file rebinds the name, so this
    version is not reachable by name once the whole module has been loaded.
    """
    infile = "/Users/lfriedl/Documents/dissertation/real-data/brightkite/bipartite_adj.txt"
    row_counts = (100, 1000, 5000)  # my OS kills it at 10000 (due to memory)

    # plain WC uses "transform" when dense, "terms" when sparse -- speed varies accordingly
    methods_to_run = ['weighted_corr', 'weighted_corr_faiss']

    for max_rows in row_counts:
        adj_mat, _, _ = loc_data.read_loc_adj_mat(infile, max_rows=max_rows)
        learned = score_data.learn_pi_vector(adj_mat)
        pi_vector_preproc, adj_mat_preproc = expts_labeled_data.adjust_pi_vector(
            learned, adj_mat)
        num_items, num_affils = adj_mat_preproc.shape[0], adj_mat_preproc.shape[1]

        dense = adj_mat_preproc.toarray()
        print("\ndense version takes up " + str(sys.getsizeof(dense)) + " bytes")

        # (An alternative that was tried here: scoring_with_faiss.score_pairs_faiss(adj_mat,
        #  methods_to_run, print_timing=True, pi_vector=pi_vector_preproc).)
        began = timer()
        score_data.scoring_methods.score_pairs(
            score_data.gen_all_pairs,
            dense,
            which_methods=methods_to_run,
            pi_vector=pi_vector_preproc,
            back_compat=True,
            num_docs=num_items,
            mixed_pairs_sims=[.01],
            print_timing=True)
        elapsed = timer() - began

        print("for matrix with " + str(num_items) + " items, " + str(num_affils)
              + " affils, ")
        print("ran all methods using dense matrix in " + str(elapsed) + " seconds")
def test_pair_scores_against_R(adj_mat_infile,
                               scored_pairs_file_R,
                               scored_pairs_file_new,
                               make_dense=False,
                               flip_high_ps=False,
                               run_all=0,
                               prefer_faiss=False):
    """
    Starting from an adj matrix, score pairs (using current implementation) and compare to a reference
    file run from R. Similar contents to score_data.run_and_eval().

    :param adj_mat_infile: path of the adjacency matrix to load
    :param scored_pairs_file_R: gzipped csv of reference scores from R (same all-pairs order as ours)
    :param scored_pairs_file_new: path the newly computed scores are written to, then read back from
    :param make_dense: if true, convert the preprocessed matrix with toarray() before scoring
    :param flip_high_ps: forwarded to expts_labeled_data.adjust_pi_vector()
    :param run_all: set to 2 (or 1) to run and time all (or more) implementations. However, we only
                    look at the scores of the last one.
    :param prefer_faiss: forwarded to score_pairs(); also loosens the comparison tolerance to 1e-04
    :return: data frame of the new scores, with an added 'label' column
    Raises AssertionError if any compared method differs from R by its tolerance or more.
    """
    print("\n*** Testing scores computed for pairs ***\n")
    print("Adj matrix infile: " + adj_mat_infile +
          "; scored pairs reference file: " + scored_pairs_file_R)

    # Read adj data and prep pi_vector
    adj_mat = score_data.load_adj_mat(adj_mat_infile)
    pi_vector_learned = score_data.learn_pi_vector(adj_mat)
    pi_vector_preproc, adj_mat_preproc = expts_labeled_data.adjust_pi_vector(
        pi_vector_learned, adj_mat, flip_high_ps=flip_high_ps)

    methods_to_run = [
        'jaccard', 'cosine', 'cosineIDF', 'shared_size', 'hamming', 'pearson',
        'weighted_corr', 'shared_weight11', 'shared_weight1100', 'adamic_adar',
        'newman', 'mixed_pairs'
    ]
    mixed_pairs_sims = [.01, .001]

    start = timer()
    if make_dense:
        adj_mat_preproc = adj_mat_preproc.toarray()

    scoring_methods.score_pairs(score_data.gen_all_pairs,
                                adj_mat_preproc,
                                which_methods=methods_to_run,
                                outfile_csv_gz=scored_pairs_file_new,
                                pi_vector=pi_vector_preproc,
                                back_compat=True,
                                num_docs=adj_mat_preproc.shape[0],
                                mixed_pairs_sims=mixed_pairs_sims,
                                print_timing=True,
                                run_all_implementations=run_all,
                                prefer_faiss=prefer_faiss)
    with gzip.open(scored_pairs_file_new, 'r') as new_fp:
        scores_data_frame = pd.read_csv(new_fp)
    scores_data_frame['label'] = expts_labeled_data.get_true_labels_expt_data(
        num_true_pairs=5,
        pairs_generator=score_data.gen_all_pairs(adj_mat_preproc))
    end = timer()

    # 'mixed_pairs' is scored once per similarity value, so it counts for len(mixed_pairs_sims) methods.
    num_methods = len(methods_to_run)
    if 'mixed_pairs' in methods_to_run:
        num_methods += len(mixed_pairs_sims) - 1
    num_rows = adj_mat.shape[0]
    num_pairs = num_rows * (num_rows - 1) / float(2)
    variants_note = "(plus variants) " if run_all else ""
    print("ran " + str(num_methods) + " methods " + variants_note + "on " +
          str(num_pairs) + " pairs")
    print("num seconds: " + str(end - start))

    # Read scores from R and compare
    with gzip.open(scored_pairs_file_R, 'r') as ref_fp:
        scores_data_frame_R = pd.read_csv(ref_fp)

    for R_method, our_method in mapping_from_R_methods.items():
        if our_method not in scores_data_frame.columns:
            continue  # method wasn't scored in this run

        print("Checking " + our_method)
        # R data doesn't have item numbers, but is in the same all-pairs order as ours
        abs_diffs = abs(scores_data_frame[our_method] - scores_data_frame_R[R_method])
        print("max diff: " + str(abs_diffs.max()))

        # Sadly, the p_i vectors are off by a smidgen (see the precision notes in test_adj_and_phi()),
        # so anything that uses them can differ too. sharedWeight11 vals differed by > 1e-06, and that
        # was with only 65 affils.
        if prefer_faiss:
            tolerance = 1e-04
        elif our_method in our_pi_methods:
            tolerance = 1e-05
        else:
            tolerance = 1e-10
        assert max(abs_diffs) < tolerance

    return scores_data_frame
def resources_test(run_all_implementations=True, use_faiss=False):
    """
    Read in portions of a big matrix in increasing size and, for each size, score all pairs on both the
    dense and the sparse form. Shows how things scale and where memory limits come in.

    :param run_all_implementations: forwarded to score_pairs() for both the dense and the sparse run
    :param use_faiss: forwarded to score_pairs() as prefer_faiss, for the dense run only
    Prints matrix sizes and elapsed times; returns nothing.
    """
    infile = "/Users/lfriedl/Documents/dissertation/real-data/brightkite/bipartite_adj.txt"
    # Sizes tried earlier: (100, 1000, 10000, 100000). At 10000, no run finished in the length of
    # time I was willing to wait. Other single-size settings tried: [10000], [5000].
    num_nodes = (100, 500, 1000, 5000)

    for num_to_try in num_nodes:
        adj_mat, _, _ = loc_data.read_loc_adj_mat(infile, max_rows=num_to_try)
        pi_vector_learned = score_data.learn_pi_vector(adj_mat)
        pi_vector_preproc, adj_mat_preproc = expts_labeled_data.adjust_pi_vector(
            pi_vector_learned, adj_mat)
        num_items, num_affils = adj_mat_preproc.shape[0], adj_mat_preproc.shape[1]

        # (order given here doesn't matter)
        methods_to_run = [
            'cosine',
            'cosineIDF',
            # use fast "transform"
            'shared_size',
            'adamic_adar',
            'newman',
            'shared_weight11',
            # medium
            'hamming',
            'pearson',
            'jaccard',
            # WC uses "transform" when dense, "terms" when sparse -- speed varies accordingly
            'weighted_corr',
            'weighted_corr_exp',
            # only have slow "terms" method
            'shared_weight1100',
            'mixed_pairs'
        ]

        adj_mat_preproc_dense = adj_mat_preproc.toarray()
        print("\ndense version takes up " +
              str(sys.getsizeof(adj_mat_preproc_dense)) + " bytes")

        # The exponential model is only needed when one of these methods is requested.
        want_exp_model = any(
            name in methods_to_run
            for name in ('weighted_corr_exp', 'weighted_corr_exp_faiss', 'all'))

        # NOTE(review): the model is learned on adj_mat, while scoring below uses adj_mat_preproc --
        # confirm that is intended.
        start = timer()
        graph_models = bipartite_fitting.learn_graph_models(
            adj_mat,
            bernoulli=False,
            pi_vector=None,
            exponential=want_exp_model)
        end = timer()
        if want_exp_model:
            print("time for learning exponential model: " + str(end - start) +
                  " seconds")
        else:
            print("")

        def timed_scoring(matrix, **extra_kwargs):
            # Score all pairs of `matrix` with the settings shared by both runs; return elapsed seconds.
            began = timer()
            score_data.scoring_methods.score_pairs(
                score_data.gen_all_pairs,
                matrix,
                which_methods=methods_to_run,
                pi_vector=pi_vector_preproc,
                back_compat=True,
                num_docs=adj_mat_preproc.shape[0],
                mixed_pairs_sims=[.01],
                print_timing=True,
                exp_model=graph_models.get('exponential', None),
                run_all_implementations=run_all_implementations,
                **extra_kwargs)
            return timer() - began

        # Dense run
        dense_seconds = timed_scoring(adj_mat_preproc_dense, prefer_faiss=use_faiss)
        print("for matrix with " + str(num_items) + " items, " + str(num_affils)
              + " affils, ")
        print("ran all methods using dense matrix in " + str(dense_seconds) +
              " seconds")

        # Sparse run
        # NOTE(review): prefer_faiss is not passed for the sparse run -- confirm that is deliberate.
        print("\nsparse adj_matrix takes up " +
              str(asizeof.asizeof(adj_mat_preproc)) + " bytes;")
        sparse_seconds = timed_scoring(adj_mat_preproc)
        print("for matrix with " + str(num_items) + " items, " + str(num_affils)
              + " affils, ")
        print("ran all methods using sparse matrix in " + str(sparse_seconds) +
              " seconds")
def test_cosine_versions():
    """
    Run several cosine-similarity implementations (sklearn sparse/dense, home-grown dense, batched
    sklearn, faiss) on prefixes of the brightkite adjacency file, printing timing and the value of
    get_process_memory() after each one.

    Nothing is asserted; this is a timing/memory comparison.
    """
    infile = "/Users/lfriedl/Documents/dissertation/real-data/brightkite/bipartite_adj.txt"
    # Sizes tried earlier: (100, 500, 1000, 5000)
    num_nodes = [1000, 2000]

    def collect_pair_scores(score_matrix, data_matrix):
        # Look up the score of every generated pair, replacing NaN by 0.
        looked_up = (
            score_matrix[row_idx1, row_idx2]
            for (row_idx1, row_idx2, _, _, _, _) in score_data.gen_all_pairs(data_matrix))
        return [0 if np.isnan(score) else score for score in looked_up]

    for num_to_try in num_nodes:
        adj_mat, _, _ = loc_data.read_loc_adj_mat(infile, max_rows=num_to_try)
        pi_vector_learned = score_data.learn_pi_vector(adj_mat)
        _pi_vector_preproc, adj_mat_preproc = expts_labeled_data.adjust_pi_vector(
            pi_vector_learned, adj_mat)

        num_items, num_affils = adj_mat_preproc.shape[0], adj_mat_preproc.shape[1]
        print("\nmatrix has " + str(num_items) + " items, " + str(num_affils)
              + " affils ")
        print("process memory: ")
        print(get_process_memory())

        print("\n** sklearn sparse cosine **")
        scoring_methods_fast.simple_only_cosine(score_data.gen_all_pairs,
                                                adj_mat_preproc,
                                                print_timing=True,
                                                use_package=True)
        print(get_process_memory())

        # NOTE(review): the label below says "sparse.csc" but the call builds a csr_matrix.
        print(
            "\n** sklearn, but called on sparse.csc of dense 'transformed' matrix **"
        )
        began = timer()
        all_pairs_scores = cosine_similarity(sparse.csr_matrix(adj_mat_preproc))
        # The collected list is only built so that its cost is included in the timing.
        cos = collect_pair_scores(all_pairs_scores, adj_mat_preproc)
        finished = timer()
        print("duration: " + str(finished - began) + " seconds")
        print(get_process_memory())

        adj_mat_preproc_dense = adj_mat_preproc.toarray()
        print("\nmade matrix dense")
        print(get_process_memory())

        print("\n** home-grown dense cosine **")
        scoring_methods_fast.simple_only_cosine(score_data.gen_all_pairs,
                                                adj_mat_preproc_dense,
                                                print_timing=True,
                                                use_package=False)
        print(get_process_memory())

        print("\n** sklearn dense, using batches **")
        began = timer()
        all_pairs_scores = scoring_methods_fast.cosine_similarity_n_space(
            adj_mat_preproc_dense, adj_mat_preproc_dense, verbose=True)
        cos = collect_pair_scores(all_pairs_scores, adj_mat_preproc_dense)
        finished = timer()
        print("duration: " + str(finished - began) + " seconds")
        print(get_process_memory())

        print("\n** faiss (dense) ** ")
        # NOTE(review): test_score_wc_faiss() passes the method to this function as a plain string;
        # here it is a one-element list -- confirm both forms are accepted.
        scoring_with_faiss.score_pairs_faiss_all_exact(adj_mat_preproc_dense,
                                                       ['cosine_faiss'],
                                                       print_timing=True)
        print(get_process_memory())

        print("\n** sklearn dense cosine **")
        # Dense copy is rebuilt from the sparse matrix before this last run.
        adj_mat_preproc_dense = adj_mat_preproc.toarray()
        scoring_methods_fast.simple_only_cosine(score_data.gen_all_pairs,
                                                adj_mat_preproc_dense,
                                                print_timing=True,
                                                use_package=True)
        print(get_process_memory())