def run_expts_loc_data(loc_data_name='brightkite', existing_data=False, inference_subdir='inference'):
    """Run and evaluate all scoring methods for trials 70 through 78.

    :param loc_data_name: used only to build the experiment directory name
        (``<loc_data_name>-expts``); the input data paths below are hard-coded
        to brightkite files.
    :param existing_data: if True, load each trial through get_loc_expt_data
        using the trial's existing rowIDs file; otherwise call
        read_sample_save with num_nodes=500, handing it the rowIDs path as
        rows_outfile.
    :param inference_subdir: subdirectory of the experiment directory that
        receives the results and scoredPairs files.
    """
    # Inputs are pinned to the "10friends" brightkite files. Earlier runs pointed
    # adj_mat_infile at other bipartite_adj variants (plain, round0-3, *_filter)
    # and edges_infile at loc-<name>_edges.txt under real-data/<loc_data_name>/.
    adj_mat_infile = '/Users/lfriedl/Documents/dissertation/real-data/brightkite/bipartite_adj_10friends.txt'
    edges_infile = '/Users/lfriedl/Documents/dissertation/real-data/brightkite/loc-edges_10friends.txt'
    exptdir = '/Users/lfriedl/Documents/dissertation/binary-ndim/' + loc_data_name + '-expts'

    for trial in range(70, 79):
        trial_tag = str(trial)
        rowIDs_file = exptdir + '/data' + trial_tag + '.rowIDs'
        evals_outfile = exptdir + '/' + inference_subdir + '/results' + trial_tag + '.txt'
        scored_pairs_outfile = exptdir + '/' + inference_subdir + '/scoredPairs' + trial_tag + ".csv.gz"

        if existing_data:
            loaded = get_loc_expt_data(adj_mat_infile, edges_infile, rowIDs_file)
        else:
            loaded = read_sample_save(adj_mat_infile, edges_infile, num_nodes=500,
                                      rows_outfile=rowIDs_file)
        adj_mat, row_labels, label_generator = loaded

        if label_generator is None:
            # Nothing to evaluate for this trial; move on to the next one.
            print("Found no edges; stopping")
            continue

        score_data.run_and_eval(adj_mat,
                                true_labels_func=label_generator,
                                method_spec="all",
                                evals_outfile=evals_outfile,
                                pair_scores_outfile=scored_pairs_outfile,
                                row_labels=row_labels,
                                print_timing=True,
                                prefer_faiss=True)  # , expt1=True)
def congress_all_pairs(base_dir_expts, adj_mat_dir, flip=False, special_extra_s=False):
    """Score and evaluate all pairs for each (party, session) voting matrix.

    For every party/session combination this loads
    ``<party>Votes<sess>.mtx.gz`` and the first file matching
    ``<party>Cospons<sess>GE*.mtx`` from adj_mat_dir, then calls
    score_data.run_and_eval with every method in
    scoring_methods.all_defined_methods except 'weighted_corr_exp'.

    :param base_dir_expts: directory that receives the results / scoredPairs files
    :param adj_mat_dir: directory holding the vote and cosponsorship matrices
    :param flip: forwarded to run_and_eval as flip_high_ps; also switches the
        output file names to the "Flip_" variants
    :param special_extra_s: if True, only run 'rep' for sessions 110 and 111,
        with additional mixed_pairs_sims values up to .9
    :return: None. Prints a message and returns early if the true-pairs matrix
        is not square or its size does not match the adjacency matrix's row count.
    :raises IndexError: if no cosponsorship file matches the glob pattern
    """
    parties = ['dem', 'rep']
    sessions = range(110, 114)
    mixed_pairs_sims = [.001, .005, .01, .05, .1, .2, .3, .4, .5]
    methods = set(scoring_methods.all_defined_methods) - {'weighted_corr_exp'}

    if special_extra_s:
        # just a few settings where we need to try additional s_hats (was still increasing at .5)
        parties = ['rep']
        sessions = [110, 111]
        mixed_pairs_sims = [.001, .005, .01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9]

    # Output files are named results_<...> / scoredPairs_<...>, or
    # resultsFlip_<...> / scoredPairsFlip_<...> when flip is set.
    name_infix = "Flip_" if flip else "_"

    for party in parties:
        for sess in sessions:
            adj_mat_infile = adj_mat_dir + "/" + party + "Votes" + str(sess) + ".mtx.gz"
            adj_mat = score_data.load_adj_mat(adj_mat_infile)

            true_pairs_infile = glob.glob(adj_mat_dir + "/" + party + "Cospons" + str(sess) + "GE*.mtx")[0]
            with open(true_pairs_infile) as fin:
                true_pairs_mat = mmread(fin).astype(int, copy=False).tocsc()

            # The true-pairs matrix must be square AND have one row per row of
            # adj_mat; failing either condition is an error. (The earlier check
            # used `and`, which let a single mismatch slip through.)
            num_rows, num_cols = true_pairs_mat.shape
            if num_rows != num_cols or num_rows != adj_mat.shape[0]:
                print("error in matrix sizes for " + adj_mat_infile + " and " + true_pairs_infile)
                return

            # create label generator that uses this matrix.
            def get_true_labels_given_my_edges(pairs_generator):
                return experiment_runner.get_true_labels_from_matrix(pairs_generator, true_pairs_mat)

            evals_outfile = base_dir_expts + "/results" + name_infix + party + str(sess) + ".txt"
            pair_scores_outfile = base_dir_expts + "/scoredPairs" + name_infix + party + str(sess) + ".csv.gz"

            score_data.run_and_eval(
                adj_mat,
                true_labels_func=get_true_labels_given_my_edges,
                method_spec=methods,
                evals_outfile=evals_outfile,
                pair_scores_outfile=pair_scores_outfile,
                print_timing=True,
                prefer_faiss=True,
                mixed_pairs_sims=mixed_pairs_sims,
                flip_high_ps=flip,
                remove_boundary_items=False,
                remove_boundary_affils=True)  # some votes are unanimous w/in party
def case1_no_bdry_nodes(adj_mat_infile, results_dir, aucs_file_to_match):
    """Case 1: score the matrix as loaded, leaving run_and_eval's boundary options at their defaults.

    Writes evaluations to ``<results_dir>/evals-case1.txt`` and then compares
    that file against aucs_file_to_match via compare_auc_files.
    """
    print("\nCase 1\n")
    new_evals_file = results_dir + "/evals-case1.txt"
    adj_mat = score_data.load_adj_mat(adj_mat_infile)

    run_options = dict(
        true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
        method_spec=['weighted_corr', 'weighted_corr_exp'],
        evals_outfile=new_evals_file,
        pair_scores_outfile=None,
        print_timing=True,
    )
    score_data.run_and_eval(adj_mat, **run_options)

    compare_auc_files(new_evals_file, aucs_file_to_match)
def test_faiss_plus_normal():
    """Run 'weighted_corr' and 'weighted_corr_faiss' together on the reality_appweek_50 sample.

    Paths are relative to the current working directory. Evaluations and
    scored pairs are written under ``reality_appweek_50/``.
    """
    data_dir = "reality_appweek_50"
    adj_mat = score_data.load_adj_mat("reality_appweek_50/data50_adjMat.mtx.gz")

    # method_spec="all" is the alternative to the explicit pair of methods below.
    methods = ['weighted_corr', 'weighted_corr_faiss']
    score_data.run_and_eval(
        adj_mat,
        true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
        method_spec=methods,
        evals_outfile=data_dir + "/python-out/evals-test.txt",
        pair_scores_outfile=data_dir + '/tmp.scoredPairs.csv.gz',
        print_timing=True)
def case4_0item_no_bdry_affils(adj_mat_infile, results_dir, aucs_file_to_match):
    """Case 4: keep boundary items, but ask run_and_eval to remove boundary affils.

    Writes evaluations to ``<results_dir>/evals-case4.txt`` and compares them
    against aucs_file_to_match via compare_auc_files.
    """
    print("\nCase 4\n")
    adj_mat = score_data.load_adj_mat(adj_mat_infile)
    new_evals_file = results_dir + "/evals-case4.txt"

    # Keep the natural all-0 item, and tell program to remove boundary affils
    boundary_handling = {'remove_boundary_items': False,
                         'remove_boundary_affils': True}
    score_data.run_and_eval(
        adj_mat,
        true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
        method_spec=['weighted_corr', 'weighted_corr_exp'],
        evals_outfile=new_evals_file,
        pair_scores_outfile=None,
        print_timing=True,
        **boundary_handling)

    compare_auc_files(new_evals_file, aucs_file_to_match)
def case5_0item_keep_0affils(adj_mat_infile, results_dir, aucs_file_to_match):
    """Case 5: zero out affil column 115, then score with no boundary removal at all.

    Writes evaluations to ``<results_dir>/evals-case5.txt`` and compares them
    against aucs_file_to_match via compare_auc_files.
    """
    print("\nCase 5\n")
    adj_mat = score_data.load_adj_mat(adj_mat_infile)

    # want only all-0 affils, so set affil[,115] to all 0's
    adj_mat[:, 115] = 0

    new_evals_file = results_dir + "/evals-case5.txt"
    labels_func = expts_labeled_data.true_labels_for_expts_with_5pairs
    methods = ['weighted_corr', 'weighted_corr_exp']
    score_data.run_and_eval(adj_mat,
                            true_labels_func=labels_func,
                            method_spec=methods,
                            evals_outfile=new_evals_file,
                            pair_scores_outfile=None,
                            print_timing=True,
                            remove_boundary_items=False,
                            remove_boundary_affils=False)

    compare_auc_files(new_evals_file, aucs_file_to_match)
def case3_keep_0and1affils(adj_mat_infile, results_dir, aucs_file_to_match):
    """Case 3: zero out item row 26, then score while keeping boundary affils.

    remove_boundary_items is not passed, so run_and_eval's default applies.
    Writes evaluations to ``<results_dir>/evals-case3.txt`` and compares them
    against aucs_file_to_match via compare_auc_files.
    """
    print("\nCase 3\n")
    adj_mat = score_data.load_adj_mat(adj_mat_infile)

    # Want the natural all-0 and all-1 affils, but still want item 26 to stay out.
    adj_mat[26, :] = 0

    new_evals_file = results_dir + "/evals-case3.txt"
    run_options = {
        'true_labels_func': expts_labeled_data.true_labels_for_expts_with_5pairs,
        'method_spec': ['weighted_corr', 'weighted_corr_exp'],
        'evals_outfile': new_evals_file,
        'pair_scores_outfile': None,
        'print_timing': True,
        'remove_boundary_affils': False,
    }
    score_data.run_and_eval(adj_mat, **run_options)

    compare_auc_files(new_evals_file, aucs_file_to_match)
def case6_0item_keep_0and1affils(adj_mat_infile, results_dir, aucs_file_to_match):
    """Case 6: score the matrix exactly as loaded, with no boundary removal.

    Writes evaluations to ``<results_dir>/evals-case6.txt`` and compares them
    against aucs_file_to_match via compare_auc_files.
    """
    print("\nCase 6\n")
    new_evals_file = results_dir + "/evals-case6.txt"
    adj_mat = score_data.load_adj_mat(adj_mat_infile)

    # score matrix the way it comes: with all-0 and all-1 affils, and an item that's all-0 once the all-1 affil is gone
    # Note: that all-0 item (an induced boundary node) can't be handled quite correctly by the exp model. But it works
    # out well enough, because it ends up with a parameter very close to zero.
    keep_everything = dict(remove_boundary_items=False, remove_boundary_affils=False)
    score_data.run_and_eval(
        adj_mat,
        true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
        method_spec=['weighted_corr', 'weighted_corr_exp'],
        evals_outfile=new_evals_file,
        pair_scores_outfile=None,
        print_timing=True,
        **keep_everything)

    compare_auc_files(new_evals_file, aucs_file_to_match)
def demo_run_and_eval(adj_mat_infile, pair_scores_outfile, evals_outfile, prefer_faiss=False):
    """Load an adjacency matrix and run every scoring method on it.

    :param adj_mat_infile: path handed to score_data.load_adj_mat
    :param pair_scores_outfile: forwarded to run_and_eval
    :param evals_outfile: forwarded to run_and_eval
    :param prefer_faiss: forwarded to run_and_eval
    """
    labels_func = expts_labeled_data.true_labels_for_expts_with_5pairs
    score_data.run_and_eval(score_data.load_adj_mat(adj_mat_infile),
                            true_labels_func=labels_func,
                            method_spec="all",
                            evals_outfile=evals_outfile,
                            pair_scores_outfile=pair_scores_outfile,
                            print_timing=True,
                            prefer_faiss=prefer_faiss)
def case8_01items_no_bdry_affils(adj_mat_infile, results_dir, aucs_file_to_match):
    """Case 8: append an almost-all-1 item, keep boundary items, remove boundary affils.

    The matrix is resized in place to 76x206 and the new last row is set to 1
    in every affil column that had a nonzero sum beforehand. Writes
    evaluations to ``<results_dir>/evals-case8.txt`` and compares them against
    aucs_file_to_match via compare_auc_files.
    """
    print("\nCase 8\n")
    adj_mat = score_data.load_adj_mat(adj_mat_infile)

    # Column sums are taken before the extra row exists.
    affil_degrees = np.asarray(adj_mat.sum(axis=0)).squeeze()
    adj_mat.resize((76, 206))  # orig shape was 75x206
    nonempty_affils = affil_degrees > 0
    adj_mat[75, nonempty_affils] = 1  # new almost-all-1 item (preserves orig all-0 affils)

    new_evals_file = results_dir + "/evals-case8.txt"
    score_data.run_and_eval(
        adj_mat,
        true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
        method_spec=['weighted_corr', 'weighted_corr_exp'],
        evals_outfile=new_evals_file,
        pair_scores_outfile=None,
        print_timing=True,
        remove_boundary_items=False,
        remove_boundary_affils=True)

    compare_auc_files(new_evals_file, aucs_file_to_match)
def case9_01items_keep_all(adj_mat_infile, results_dir, aucs_file_to_match):
    """Case 9: append an almost-all-1 item and score with no boundary removal.

    The matrix is resized in place to 76x206 and the new last row is set to 1
    in every affil column that had a nonzero sum beforehand. Writes
    evaluations to ``<results_dir>/evals-case9.txt`` and compares them against
    aucs_file_to_match via compare_auc_files.
    """
    print("\nCase 9\n")
    adj_mat = score_data.load_adj_mat(adj_mat_infile)

    # Column sums are taken before the extra row exists.
    affil_degrees = np.asarray(adj_mat.sum(axis=0)).squeeze()
    adj_mat.resize((76, 206))  # orig shape was 75x206
    nonempty_affils = affil_degrees > 0
    adj_mat[75, nonempty_affils] = 1  # new almost-all-1 item (preserves orig all-0 affils)

    # Note: similar to case 6, the all-1 item (induced boundary node) can't be handled by the exp model. There is no
    # max likelihood solution for this graph. In practice, the algorithm times out -- but even if it ran longer,
    # there's no good solution to converge to. The parameter for that item needs to be near-infinity, but not infinity.
    new_evals_file = results_dir + "/evals-case9.txt"
    run_options = dict(
        true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
        method_spec=['weighted_corr', 'weighted_corr_exp'],
        evals_outfile=new_evals_file,
        pair_scores_outfile=None,
        print_timing=True,
        remove_boundary_items=False,
        remove_boundary_affils=False,
    )
    score_data.run_and_eval(adj_mat, **run_options)

    compare_auc_files(new_evals_file, aucs_file_to_match)
def run_all_pairs_loc_data(adj_mat_infile, edges_infile, outdir, tag):
    """Score all pairs of a full location data set with a fixed list of methods.

    :param adj_mat_infile: bipartite adjacency file read via read_loc_adj_mat
    :param edges_infile: edge file used to build the true-label generator
    :param outdir: directory receiving ``results_<tag>.txt`` and ``scoredPairs_<tag>.csv.gz``
    :param tag: label embedded in the two output file names
    """
    # modified just a bit from get_loc_expt_data()
    adj_mat, row_names, _affil_names = read_loc_adj_mat(adj_mat_infile)
    label_generator = get_label_generator_from_edgefile(edges_infile, row_names)

    evals_outfile = outdir + '/results_' + tag + '.txt'
    scored_pairs_outfile = outdir + '/scoredPairs_' + tag + '.csv.gz'

    # (for gowalla, just ran standard before)
    mixed_pairs_sims = [.001, .005, .01, .05, .1, .2, .3, .4, .5]
    all_methods_to_run = [
        'jaccard', 'cosine', 'cosineIDF', 'shared_size', 'hamming', 'pearson',
        'shared_weight11', 'shared_weight1100', 'adamic_adar', 'newman',
        'weighted_corr', 'mixed_pairs',
    ]

    score_data.run_and_eval(
        adj_mat,
        true_labels_func=label_generator,
        method_spec=all_methods_to_run,  # [method],
        evals_outfile=evals_outfile,
        mixed_pairs_sims=mixed_pairs_sims,
        pair_scores_outfile=scored_pairs_outfile,
        row_labels=row_names,
        remove_boundary_items=False,
        print_timing=True,
        prefer_faiss=True)
def affil_subsets_loc_data(adj_mat_infile, edges_infile, outdir, tag, affil_subset_fraction, affil_subset_type):
    """Score all pairs of a location data set using only a subset of its affil columns.

    The columns to keep are chosen by affil_subsets.compute_affil_subsets from
    the per-affil column means of the loaded matrix.

    :param adj_mat_infile: bipartite adjacency file read via read_loc_adj_mat
    :param edges_infile: edge file used to build the true-label generator
    :param outdir: directory receiving ``results_<tag>.txt`` and ``scoredPairs_<tag>.csv.gz``
    :param tag: label embedded in the two output file names
    :param affil_subset_fraction: forwarded to compute_affil_subsets
    :param affil_subset_type: forwarded to compute_affil_subsets
    """
    adj_mat, row_names, _affil_names = read_loc_adj_mat(adj_mat_infile)
    label_generator = get_label_generator_from_edgefile(edges_infile, row_names)

    # Column sum divided by the number of rows, one entry per affil.
    num_items = float(adj_mat.shape[0])
    tmp_pi_vector = np.asarray(adj_mat.sum(axis=0)).squeeze() / num_items
    affils_to_keep = affil_subsets.compute_affil_subsets(tmp_pi_vector, affil_subset_fraction,
                                                         affil_subset_type)
    adj_mat = adj_mat[:, affils_to_keep]

    all_methods_to_run = [
        'jaccard', 'cosine', 'cosineIDF', 'shared_size', 'hamming', 'pearson',
        'shared_weight11', 'shared_weight1100', 'adamic_adar', 'newman',
        'weighted_corr', 'mixed_pairs',
    ]
    evals_outfile = outdir + '/results_' + tag + '.txt'
    scored_pairs_outfile = outdir + '/scoredPairs_' + tag + '.csv.gz'

    score_data.run_and_eval(
        adj_mat,
        true_labels_func=label_generator,
        method_spec=all_methods_to_run,  # [method],
        evals_outfile=evals_outfile,
        pair_scores_outfile=scored_pairs_outfile,
        row_labels=row_names,
        remove_boundary_items=False,
        print_timing=True,
        prefer_faiss=True)
def demo_loc_data():
    """Demo: load brightkite data through loc_data.read_sample_save (num_nodes=300) and score it.

    Row IDs go to ``brightkite/data-ex1.txt``; evaluations and scored pairs go
    to ``brightkite/evals-ex1.txt`` and ``brightkite/scoredPairs-ex1.csv.gz``.
    """
    # todo: set random seed so this is actually repeatable
    adj_mat_infile = '/Users/lfriedl/Documents/dissertation/real-data/brightkite/bipartite_adj.txt'
    edges_infile = '/Users/lfriedl/Documents/dissertation/real-data/brightkite/loc-brightkite_edges.txt'
    rows_outfile = 'brightkite/data-ex1.txt'

    adj_mat, row_labels, label_generator = loc_data.read_sample_save(
        adj_mat_infile, edges_infile, num_nodes=300, rows_outfile=rows_outfile)

    if label_generator is None:
        print("Found no edges; stopping")
        return

    score_data.run_and_eval(
        adj_mat,
        true_labels_func=label_generator,
        method_spec="all",
        evals_outfile="brightkite/evals-ex1.txt",
        pair_scores_outfile="brightkite/scoredPairs-ex1.csv.gz",
        row_labels=row_labels,
        print_timing=True)
def compare_timings_faiss_normal(adj_mat_infile, evals_outfile, scored_pairs_outfile):
    """Time the faiss scoring methods against their non-faiss counterparts.

    For each data size (100, 1000, 5000 rows) the brightkite matrix is loaded
    with that max_rows, all methods in scoring_with_faiss.all_faiss_methods
    are run and timed, and then the same method names with their last six
    characters stripped are run and timed with make_dense=True.

    :param adj_mat_infile: NOTE(review): not used -- the input path is hard-coded below
    :param evals_outfile: forwarded to every run_and_eval call
    :param scored_pairs_outfile: forwarded to every run_and_eval call
    """
    infile = "/Users/lfriedl/Documents/dissertation/real-data/brightkite/bipartite_adj.txt"

    def _run_timed(matrix, method_names, **extra_options):
        # Run one batch of methods and report the wall-clock time it took.
        started = timer()
        score_data.run_and_eval(matrix,
                                true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
                                method_spec=method_names,
                                evals_outfile=evals_outfile,
                                pair_scores_outfile=scored_pairs_outfile,
                                print_timing=True,
                                **extra_options)
        finished = timer()
        print("ran all " + str(len(method_names)) + " methods in " + str(finished - started) + " seconds")

    num_nodes = (100, 1000, 5000)  # my OS kills it at 10000 (due to memory)
    # num_nodes = [2000]
    for num_to_try in num_nodes:
        adj_mat, _, _ = loc_data.read_loc_adj_mat(infile, max_rows=num_to_try)

        print("\n*** Running all faiss methods ***\n")
        print("(asked for " + str(num_to_try) + " nodes)")
        methods_to_run = scoring_with_faiss.all_faiss_methods
        _run_timed(adj_mat, methods_to_run)

        print("Now running normal versions for comparison")
        # Drop the trailing six characters of each faiss method name.
        normal_versions = [name[:-6] for name in methods_to_run]
        _run_timed(adj_mat, normal_versions, make_dense=True)
def congress_affil_subsets(base_dir_expts, adj_mat_dir):
    """Score each (party, session) voting matrix on three 25% affil subsets.

    For every party/session combination, the vote matrix and matching
    cosponsorship matrix are loaded from adj_mat_dir, and run_and_eval is
    called once per subset type (1, 2, 3), writing files tagged
    ``_subsetRand.25``, ``_subsetMax.25`` and ``_subsetMin.25`` respectively.
    One mixed_pairs similarity value is consumed per party/session, in the
    order listed below.

    :param base_dir_expts: directory receiving the results / scoredPairs files
    :param adj_mat_dir: directory holding the vote and cosponsorship matrices
    """
    parties = ['dem', 'rep']
    sessions = range(110, 114)
    # determined by manual inspection; handed out one per (party, session), in this order
    best_sims_in_order = iter([0.05, 0.001, 0.001, 0.001, 0.5, 0.7, 0.4, 0.2])
    methods = set(scoring_methods.all_defined_methods) - {'weighted_corr_exp'}

    # (file-name tag, affil_subset_type) for the three subsets, run in this order
    subset_variants = (
        ("_subsetRand.25", 1),  # affil subset #1
        ("_subsetMax.25", 2),   # affil subset #2
        ("_subsetMin.25", 3),   # affil subset #3
    )

    for party in parties:
        for sess in sessions:
            adj_mat_infile = adj_mat_dir + "/" + party + "Votes" + str(sess) + ".mtx.gz"
            adj_mat = score_data.load_adj_mat(adj_mat_infile)

            true_pairs_infile = glob.glob(adj_mat_dir + "/" + party + "Cospons" + str(sess) + "GE*.mtx")[0]
            with open(true_pairs_infile) as fin:
                true_pairs_mat = mmread(fin).astype(int, copy=False).tocsc()

            # create label generator that uses this matrix.
            def get_true_labels_given_my_edges(pairs_generator):
                return experiment_runner.get_true_labels_from_matrix(pairs_generator, true_pairs_mat)

            mixed_pairs_sim = [next(best_sims_in_order)]

            # N.B. Decided to use original pi_vector here, including boundary affils, when choosing affil_subsets.
            # (Some items will end up all-0.)
            # This is different than when using sampled graphs. There, there's a universal pi_vector (always > 0), and
            # your data set might not see some affils. Here, even the universal pi_vector can be 0.
            tmp_pi_vector = np.asarray(adj_mat.sum(axis=0)).squeeze() / float(adj_mat.shape[0])

            output_stem = party + str(sess)
            for subset_tag, subset_type in subset_variants:
                evals_outfile = base_dir_expts + "/results_" + output_stem + subset_tag + ".txt"
                pair_scores_outfile = base_dir_expts + "/scoredPairs_" + output_stem + subset_tag + ".csv.gz"

                affils_to_keep = affil_subsets.compute_affil_subsets(
                    tmp_pi_vector, affil_subset_fraction=.25, affil_subset_type=subset_type)
                adj_mat_to_use = adj_mat[:, affils_to_keep]

                score_data.run_and_eval(
                    adj_mat_to_use,
                    true_labels_func=get_true_labels_given_my_edges,
                    method_spec=methods,
                    evals_outfile=evals_outfile,
                    pair_scores_outfile=pair_scores_outfile,
                    print_timing=True,
                    prefer_faiss=True,
                    mixed_pairs_sims=mixed_pairs_sim,
                    remove_boundary_items=False,
                    remove_boundary_affils=True)
def score_data_set(data_dir, trial_num, inference_dir_name, method_spec="all", save_pair_scores=True,
                   sims_for_mixed_pairs=None, flip_high_ps=False, remove_boundary_items=False,
                   remove_boundary_affils=True, pi_vector_to_use=None, prefer_faiss=True,
                   affil_subset_every_1_4=False, affil_subset_type=0, affil_subset_fraction=0,
                   loc_data_bipartite_file=None, loc_data_true_pairs_file=None, verbose=False):
    """
    Score and evaluate one trial's data set, extracting its input files from a tar archive if needed.

    This function hard-codes some file names (to standardize):
    data_dir/allInputDataFiles.tgz contains files of the form data1_adjMat.mtx.gz, data1_numPos.txt
    and data1_phi.txt.gz
    output: data_dir/inference_dir_name (created if needed) will have results1.txt and (maybe)
    scoredPairs1.csv.gz

    Input files that had to be extracted from the archive are deleted again once loaded; files that
    were already present are left alone.

    :param data_dir: directory holding the input files and/or allInputDataFiles.tgz
    :param trial_num: integer
    :param inference_dir_name: subdirectory will be constructed (if necessary) under data_dir
    :param method_spec: forwarded to score_data.run_and_eval
    :param save_pair_scores: whether to write a scoredPairs file
    :param sims_for_mixed_pairs: default runs a standard set of 3
    :param flip_high_ps: forwarded to score_data.run_and_eval
    :param remove_boundary_items: forwarded to score_data.run_and_eval
    :param remove_boundary_affils: forwarded to score_data.run_and_eval
    :param pi_vector_to_use: forwarded to score_data.run_and_eval, after the same affil subsetting
                             that is applied to the adjacency matrix
    :param prefer_faiss: forwarded to score_data.run_and_eval
    :param affil_subset_every_1_4: to match existing handling of newsgroups data, keep only the 1st
                                   of every 4 affils (pruned before handing to score_data package).
    :param affil_subset_type: if > 0, keep only the affils chosen by
                              affil_subsets.compute_affil_subsets (based on pi_vector_to_use when
                              given, else on the matrix's own column means)
    :param affil_subset_fraction: forwarded to affil_subsets.compute_affil_subsets
    :param loc_data_bipartite_file: with loc_data_true_pairs_file, switches to the location-data
                                    format: only data<trial_num>.rowIDs is taken from data_dir and
                                    the data is loaded via prelim_loc_data_expts.get_loc_expt_data
    :param loc_data_true_pairs_file: see loc_data_bipartite_file (both must be given)
    :param verbose: forwarded to score_data.run_and_eval as print_timing
    :return: None
    """
    # 1a. construct input filenames and extract data file(s) from tar archive, if not already found locally.
    loc_data_format = (loc_data_bipartite_file is not None and loc_data_true_pairs_file is not None)
    if loc_data_format:
        basenamefiles = ["./data" + str(trial_num) + ".rowIDs"]
    else:
        # usual format
        # what we need are adjMat and numPos.
        adj_mat_file_basename = "./data" + str(trial_num) + "_adjMat.mtx.gz"  # nb: "./" required to access it from tar
        num_pos_file_basename = "./data" + str(trial_num) + "_numPos.txt"
        adj_mat_infile = data_dir + "/" + adj_mat_file_basename
        num_pos_infile = data_dir + "/" + num_pos_file_basename
        basenamefiles = [adj_mat_file_basename, num_pos_file_basename]

    fullpathfiles = [data_dir + "/" + basefile for basefile in basenamefiles]
    files_already_present = all(os.path.isfile(path) for path in fullpathfiles)
    if not files_already_present:
        tar_infile = data_dir + "/allInputDataFiles.tgz"
        # Context manager guarantees the archive handle is closed, even if a member is missing.
        with tarfile.open(tar_infile) as tf:
            members = [tf.getmember(filename) for filename in basenamefiles]
            tf.extractall(path=data_dir, members=members)

    # 1b. construct other variables

    # outfiles
    # create or locate outputDir
    out_dir = data_dir + "/" + inference_dir_name
    if not os.path.isdir(out_dir):
        try:
            os.mkdir(out_dir)
        except OSError:
            # Another thread/process might have created it meanwhile; only a
            # directory that still does not exist is a real error.
            if not os.path.isdir(out_dir):
                raise

    # (full paths)
    evals_outfile = out_dir + "/results" + str(trial_num) + ".txt"
    if save_pair_scores:
        pair_scores_outfile = out_dir + "/scoredPairs" + str(trial_num) + ".csv.gz"
    else:
        pair_scores_outfile = None

    if sims_for_mixed_pairs is None:
        sims_for_mixed_pairs = "standard"  # magic word for default

    # load data into variables
    if loc_data_format:
        adj_mat, item_names, true_labels_func = prelim_loc_data_expts.get_loc_expt_data(
            loc_data_bipartite_file, loc_data_true_pairs_file, fullpathfiles[0])
    else:
        adj_mat = score_data.load_adj_mat(adj_mat_infile)
        item_names = None
        with open(num_pos_infile) as fin:
            num_true_pos = int(fin.readline())
        true_labels_func = partial(expts_labeled_data.get_true_labels_expt_data, num_true_pos)

    if affil_subset_every_1_4:
        num_kept_affils = adj_mat.shape[1] // 4
        subset_adj_mat = sparse.csc_matrix((adj_mat.shape[0], num_kept_affils), dtype='int')
        # copy over every 4th column
        for i in range(num_kept_affils):
            subset_adj_mat[:, i] = adj_mat[:, 4 * i]
        adj_mat = subset_adj_mat
        if pi_vector_to_use is not None:
            # Apply the same every-4th selection to the pi vector.
            subset_pi_vector = np.array(
                [pi_vector_to_use[4 * i] for i in range(len(pi_vector_to_use) // 4)])
            pi_vector_to_use = subset_pi_vector

    if affil_subset_type > 0:
        if pi_vector_to_use is not None:
            affils_to_keep = affil_subsets.compute_affil_subsets(
                pi_vector_to_use, affil_subset_fraction, affil_subset_type)
            pi_vector_to_use = pi_vector_to_use[affils_to_keep]
        else:
            # No pi vector supplied: base the choice on the matrix's own column means.
            tmp_pi_vector = np.asarray(adj_mat.sum(axis=0)).squeeze() / float(adj_mat.shape[0])
            affils_to_keep = affil_subsets.compute_affil_subsets(
                tmp_pi_vector, affil_subset_fraction, affil_subset_type)
        adj_mat = adj_mat[:, affils_to_keep]

    # Clean up only the files this call extracted.
    if not files_already_present:
        for infile in fullpathfiles:
            os.remove(infile)

    # did affil_subsetting remove all data? If so, going to get errors. Don't run anything, just print.
    if affil_subset_type > 0 and np.sum(adj_mat) == 0:
        # NOTE(review): num_true_pos is only assigned in the usual (non-loc-data) format, so this
        # branch raises UnboundLocalError when loc_data_format is True -- confirm what
        # print_results_for_0data should receive in that case.
        affil_subsets.print_results_for_0data(
            num_true_pos, adj_mat,
            true_labels_func=true_labels_func,
            method_spec=method_spec,
            evals_outfile=evals_outfile,
            pair_scores_outfile=pair_scores_outfile,
            row_labels=item_names,
            print_timing=verbose,
            prefer_faiss=prefer_faiss,
            mixed_pairs_sims=sims_for_mixed_pairs,
            flip_high_ps=flip_high_ps,
            pi_vector_to_use=pi_vector_to_use,
            remove_boundary_items=remove_boundary_items,
            remove_boundary_affils=remove_boundary_affils)
        return

    # 2. run the expt
    score_data.run_and_eval(adj_mat,
                            true_labels_func=true_labels_func,
                            method_spec=method_spec,
                            evals_outfile=evals_outfile,
                            pair_scores_outfile=pair_scores_outfile,
                            row_labels=item_names,
                            print_timing=verbose,
                            prefer_faiss=prefer_faiss,
                            mixed_pairs_sims=sims_for_mixed_pairs,
                            flip_high_ps=flip_high_ps,
                            pi_vector_to_use=pi_vector_to_use,
                            remove_boundary_items=remove_boundary_items,
                            remove_boundary_affils=remove_boundary_affils)