def run_expts_loc_data(loc_data_name='brightkite', existing_data=False, inference_subdir='inference'):
    """
    Run and evaluate all scoring methods on trials 70 through 78 of a location data set.

    Each trial either reloads the rows listed in its saved rowIDs file or draws a fresh sample of
    500 nodes (writing that rowIDs file), then hands the data to score_data.run_and_eval().

    :param loc_data_name: selects the experiment directory, <loc_data_name>-expts.
        N.B. the two input files below are hard-coded to the brightkite "10friends" data and do not
        depend on this value.
    :param existing_data: True to reuse data<i>.rowIDs; False to sample anew and save the row IDs there
    :param inference_subdir: subdirectory of the experiment directory that receives the output files
    """
    # Earlier runs read other inputs from the per-data-set directory instead: bipartite_adj.txt, its
    # round0 - round3 variants (some with a _filter suffix), and loc-<loc_data_name>_edges.txt.
    adj_mat_infile = '/Users/lfriedl/Documents/dissertation/real-data/brightkite/bipartite_adj_10friends.txt'
    edges_infile = '/Users/lfriedl/Documents/dissertation/real-data/brightkite/loc-edges_10friends.txt'

    exptdir = '/Users/lfriedl/Documents/dissertation/binary-ndim/' + loc_data_name + '-expts'
    for trial in range(70, 79):
        trial_str = str(trial)
        rowIDs_file = exptdir + '/data' + trial_str + '.rowIDs'
        inference_dir = exptdir + '/' + inference_subdir
        evals_outfile = inference_dir + '/results' + trial_str + '.txt'
        scored_pairs_outfile = inference_dir + '/scoredPairs' + trial_str + ".csv.gz"

        if existing_data:
            loaded = get_loc_expt_data(adj_mat_infile, edges_infile, rowIDs_file)
        else:
            loaded = read_sample_save(adj_mat_infile, edges_infile, num_nodes=500,
                                      rows_outfile=rowIDs_file)
        adj_mat, row_labels, label_generator = loaded

        if label_generator is None:
            # Despite the wording, only this trial is skipped; the loop moves on to the next one.
            print("Found no edges; stopping")
            continue

        score_data.run_and_eval(adj_mat,
                                true_labels_func=label_generator,
                                method_spec="all",
                                evals_outfile=evals_outfile,
                                pair_scores_outfile=scored_pairs_outfile,
                                row_labels=row_labels,
                                print_timing=True,
                                prefer_faiss=True)  # , expt1=True)
示例#2
0
def congress_all_pairs(base_dir_expts,
                       adj_mat_dir,
                       flip=False,
                       special_extra_s=False):
    """
    Run and evaluate the scoring methods for every (party, session) combination.

    For each combination, loads <party>Votes<sess>.mtx.gz as the adjacency matrix and the first file
    matching <party>Cospons<sess>GE*.mtx as the matrix of true pairs, both from adj_mat_dir, then calls
    score_data.run_and_eval(). Stops (returning None) at the first combination whose matrices have
    inconsistent sizes.

    :param base_dir_expts: directory that receives the results and scoredPairs files
    :param adj_mat_dir: directory holding the input matrices
    :param flip: forwarded to run_and_eval as flip_high_ps; also switches the output file names to
        resultsFlip_* / scoredPairsFlip_*
    :param special_extra_s: restrict the run to 'rep', sessions 110 and 111, with a longer list of
        mixed_pairs_sims
    :raises IndexError: if no true-pairs file matches the glob pattern
    """
    parties = ['dem', 'rep']
    sessions = range(110, 114)
    mixed_pairs_sims = [.001, .005, .01, .05, .1, .2, .3, .4, .5]
    methods = set(scoring_methods.all_defined_methods) - {'weighted_corr_exp'}
    if special_extra_s:  # just a few settings where we need to try additional s_hats (was still increasing at .5)
        parties = ['rep']
        sessions = [110, 111]
        mixed_pairs_sims = [
            .001, .005, .01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9
        ]

    # Output names differ only in this infix: results_dem110.txt vs. resultsFlip_dem110.txt
    name_infix = "Flip_" if flip else "_"

    for party in parties:
        for sess in sessions:
            adj_mat_infile = adj_mat_dir + "/" + party + "Votes" + str(
                sess) + ".mtx.gz"
            adj_mat = score_data.load_adj_mat(adj_mat_infile)

            true_pairs_infile = glob.glob(adj_mat_dir + "/" + party +
                                          "Cospons" + str(sess) + "GE*.mtx")[0]
            with open(true_pairs_infile) as fin:
                true_pairs_mat = mmread(fin).astype(int, copy=False).tocsc()

            # The true-pairs matrix must be square AND have one row per row of adj_mat; either
            # violation is an error. (This was previously joined with "and", which only caught the
            # case where both conditions failed at once.)
            num_rows, num_cols = true_pairs_mat.shape
            if num_rows != num_cols or num_rows != adj_mat.shape[0]:
                print("error in matrix sizes for " + adj_mat_infile + " and " +
                      true_pairs_infile)
                return

            # create label generator that uses this matrix.
            def get_true_labels_given_my_edges(pairs_generator):
                return experiment_runner.get_true_labels_from_matrix(
                    pairs_generator, true_pairs_mat)

            file_tag = name_infix + party + str(sess)
            evals_outfile = base_dir_expts + "/results" + file_tag + ".txt"
            pair_scores_outfile = base_dir_expts + "/scoredPairs" + file_tag + ".csv.gz"

            score_data.run_and_eval(
                adj_mat,
                true_labels_func=get_true_labels_given_my_edges,
                method_spec=methods,
                evals_outfile=evals_outfile,
                pair_scores_outfile=pair_scores_outfile,
                print_timing=True,
                prefer_faiss=True,
                mixed_pairs_sims=mixed_pairs_sims,
                flip_high_ps=flip,
                remove_boundary_items=False,
                remove_boundary_affils=True
            )  # some votes are unanimous w/in party
示例#3
0
def case1_no_bdry_nodes(adj_mat_infile, results_dir, aucs_file_to_match):
    """
    Case 1: score the matrix exactly as loaded, leaving run_and_eval's boundary-handling options at
    their defaults, then pass the evals file and the reference file to compare_auc_files().

    :param adj_mat_infile: file to load with score_data.load_adj_mat()
    :param results_dir: directory in which evals-case1.txt is written
    :param aucs_file_to_match: reference file handed to compare_auc_files()
    """
    print("\nCase 1\n")
    matrix = score_data.load_adj_mat(adj_mat_infile)
    evals_path = results_dir + "/evals-case1.txt"
    run_options = dict(
        true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
        method_spec=['weighted_corr', 'weighted_corr_exp'],
        evals_outfile=evals_path,
        pair_scores_outfile=None,
        print_timing=True,
    )
    score_data.run_and_eval(matrix, **run_options)
    compare_auc_files(evals_path, aucs_file_to_match)
示例#4
0
def test_faiss_plus_normal():
    """
    Run 'weighted_corr' and 'weighted_corr_faiss' in a single call on the reality_appweek_50 sample.

    All paths are relative to the current working directory.
    """
    matrix = score_data.load_adj_mat("reality_appweek_50/data50_adjMat.mtx.gz")

    # method_spec="all" is the alternative that was used here at some point.
    methods_under_test = ['weighted_corr', 'weighted_corr_faiss']
    label_func = expts_labeled_data.true_labels_for_expts_with_5pairs

    score_data.run_and_eval(matrix,
                            true_labels_func=label_func,
                            method_spec=methods_under_test,
                            evals_outfile="reality_appweek_50/python-out/evals-test.txt",
                            pair_scores_outfile='reality_appweek_50/tmp.scoredPairs.csv.gz',
                            print_timing=True)
示例#5
0
def case4_0item_no_bdry_affils(adj_mat_infile, results_dir, aucs_file_to_match):
    """
    Case 4: keep the data's natural all-0 item (remove_boundary_items=False) while telling
    run_and_eval to remove boundary affils, then pass the evals file and the reference file to
    compare_auc_files().

    :param adj_mat_infile: file to load with score_data.load_adj_mat()
    :param results_dir: directory in which evals-case4.txt is written
    :param aucs_file_to_match: reference file handed to compare_auc_files()
    """
    print("\nCase 4\n")
    matrix = score_data.load_adj_mat(adj_mat_infile)
    evals_path = results_dir + "/evals-case4.txt"
    run_options = dict(
        true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
        method_spec=['weighted_corr', 'weighted_corr_exp'],
        evals_outfile=evals_path,
        pair_scores_outfile=None,
        print_timing=True,
        remove_boundary_items=False,
        remove_boundary_affils=True,
    )
    score_data.run_and_eval(matrix, **run_options)
    compare_auc_files(evals_path, aucs_file_to_match)
示例#6
0
def case5_0item_keep_0affils(adj_mat_infile, results_dir, aucs_file_to_match):
    """
    Case 5: keep boundary items and boundary affils, after zeroing column 115 so that the only
    boundary affils left are all-0 ones. Then pass the evals file and the reference file to
    compare_auc_files().

    :param adj_mat_infile: file to load with score_data.load_adj_mat()
    :param results_dir: directory in which evals-case5.txt is written
    :param aucs_file_to_match: reference file handed to compare_auc_files()
    """
    print("\nCase 5\n")
    matrix = score_data.load_adj_mat(adj_mat_infile)
    # Only all-0 affils are wanted here, so blank out affil 115.
    matrix[:, 115] = 0
    evals_path = results_dir + "/evals-case5.txt"
    run_options = dict(
        true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
        method_spec=['weighted_corr', 'weighted_corr_exp'],
        evals_outfile=evals_path,
        pair_scores_outfile=None,
        print_timing=True,
        remove_boundary_items=False,
        remove_boundary_affils=False,
    )
    score_data.run_and_eval(matrix, **run_options)
    compare_auc_files(evals_path, aucs_file_to_match)
示例#7
0
def case3_keep_0and1affils(adj_mat_infile, results_dir, aucs_file_to_match):
    """
    Case 3: keep the natural all-0 and all-1 affils (remove_boundary_affils=False) but zero out
    row 26 so that item still stays out. Then pass the evals file and the reference file to
    compare_auc_files().

    :param adj_mat_infile: file to load with score_data.load_adj_mat()
    :param results_dir: directory in which evals-case3.txt is written
    :param aucs_file_to_match: reference file handed to compare_auc_files()
    """
    print("\nCase 3\n")
    matrix = score_data.load_adj_mat(adj_mat_infile)
    # Blank item 26; remove_boundary_items is left at run_and_eval's default.
    matrix[26, :] = 0
    evals_path = results_dir + "/evals-case3.txt"
    run_options = dict(
        true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
        method_spec=['weighted_corr', 'weighted_corr_exp'],
        evals_outfile=evals_path,
        pair_scores_outfile=None,
        print_timing=True,
        remove_boundary_affils=False,
    )
    score_data.run_and_eval(matrix, **run_options)
    compare_auc_files(evals_path, aucs_file_to_match)
示例#8
0
def case6_0item_keep_0and1affils(adj_mat_infile, results_dir, aucs_file_to_match):
    """
    Case 6: score the matrix the way it comes, keeping boundary items and boundary affils. The data
    has all-0 and all-1 affils, plus an item that becomes all-0 once the all-1 affil is gone.

    Note: that all-0 item (an induced boundary node) can't be handled quite correctly by the exp
    model. It works out well enough, because it ends up with a parameter very close to zero.

    :param adj_mat_infile: file to load with score_data.load_adj_mat()
    :param results_dir: directory in which evals-case6.txt is written
    :param aucs_file_to_match: reference file handed to compare_auc_files()
    """
    print("\nCase 6\n")
    matrix = score_data.load_adj_mat(adj_mat_infile)
    evals_path = results_dir + "/evals-case6.txt"
    run_options = dict(
        true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
        method_spec=['weighted_corr', 'weighted_corr_exp'],
        evals_outfile=evals_path,
        pair_scores_outfile=None,
        print_timing=True,
        remove_boundary_items=False,
        remove_boundary_affils=False,
    )
    score_data.run_and_eval(matrix, **run_options)
    compare_auc_files(evals_path, aucs_file_to_match)
示例#9
0
def demo_run_and_eval(adj_mat_infile,
                      pair_scores_outfile,
                      evals_outfile,
                      prefer_faiss=False):
    """
    Load an adjacency matrix and run every method ("all") on it via score_data.run_and_eval().

    :param adj_mat_infile: file to load with score_data.load_adj_mat()
    :param pair_scores_outfile: forwarded to run_and_eval
    :param evals_outfile: forwarded to run_and_eval
    :param prefer_faiss: forwarded to run_and_eval
    """
    run_options = {
        'true_labels_func': expts_labeled_data.true_labels_for_expts_with_5pairs,
        'method_spec': "all",
        'evals_outfile': evals_outfile,
        'pair_scores_outfile': pair_scores_outfile,
        'print_timing': True,
        'prefer_faiss': prefer_faiss,
    }
    score_data.run_and_eval(score_data.load_adj_mat(adj_mat_infile), **run_options)
示例#10
0
def case8_01items_no_bdry_affils(adj_mat_infile, results_dir, aucs_file_to_match):
    """
    Case 8: append an almost-all-1 item to the matrix, keep boundary items, and remove boundary
    affils. Then pass the evals file and the reference file to compare_auc_files().

    The row index and target shape are hard-coded for a 75x206 input.

    :param adj_mat_infile: file to load with score_data.load_adj_mat()
    :param results_dir: directory in which evals-case8.txt is written
    :param aucs_file_to_match: reference file handed to compare_auc_files()
    """
    print("\nCase 8\n")
    matrix = score_data.load_adj_mat(adj_mat_infile)

    # Column sums must be taken before the matrix grows.
    affil_degrees = np.asarray(matrix.sum(axis=0)).squeeze()
    has_members = affil_degrees > 0
    matrix.resize((76, 206))   # orig shape was 75x206
    # The new item gets a 1 everywhere except in affils that were all-0, which stay all-0.
    matrix[75, has_members] = 1

    evals_path = results_dir + "/evals-case8.txt"
    run_options = dict(
        true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
        method_spec=['weighted_corr', 'weighted_corr_exp'],
        evals_outfile=evals_path,
        pair_scores_outfile=None,
        print_timing=True,
        remove_boundary_items=False,
        remove_boundary_affils=True,
    )
    score_data.run_and_eval(matrix, **run_options)
    compare_auc_files(evals_path, aucs_file_to_match)
示例#11
0
def case9_01items_keep_all(adj_mat_infile, results_dir, aucs_file_to_match):
    """
    Case 9: append an almost-all-1 item to the matrix and keep both boundary items and boundary
    affils. Then pass the evals file and the reference file to compare_auc_files().

    The row index and target shape are hard-coded for a 75x206 input.

    Note: similar to case 6, the all-1 item (induced boundary node) can't be handled by the exp
    model. There is no max likelihood solution for this graph. In practice, the algorithm times out --
    but even if it ran longer, there's no good solution to converge to. The parameter for that item
    needs to be near-infinity, but not infinity.

    :param adj_mat_infile: file to load with score_data.load_adj_mat()
    :param results_dir: directory in which evals-case9.txt is written
    :param aucs_file_to_match: reference file handed to compare_auc_files()
    """
    print("\nCase 9\n")
    matrix = score_data.load_adj_mat(adj_mat_infile)

    # Column sums must be taken before the matrix grows.
    affil_degrees = np.asarray(matrix.sum(axis=0)).squeeze()
    has_members = affil_degrees > 0
    matrix.resize((76, 206))   # orig shape was 75x206
    # The new item gets a 1 everywhere except in affils that were all-0, which stay all-0.
    matrix[75, has_members] = 1

    evals_path = results_dir + "/evals-case9.txt"
    run_options = dict(
        true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
        method_spec=['weighted_corr', 'weighted_corr_exp'],
        evals_outfile=evals_path,
        pair_scores_outfile=None,
        print_timing=True,
        remove_boundary_items=False,
        remove_boundary_affils=False,
    )
    score_data.run_and_eval(matrix, **run_options)
    compare_auc_files(evals_path, aucs_file_to_match)
def run_all_pairs_loc_data(adj_mat_infile, edges_infile, outdir, tag):
    """
    Score and evaluate a whole location data set (no sampling of rows) with a fixed list of methods.

    Modified just a bit from get_loc_expt_data().

    :param adj_mat_infile: file read by read_loc_adj_mat()
    :param edges_infile: edge file from which the true-label generator is built
    :param outdir: directory that receives results_<tag>.txt and scoredPairs_<tag>.csv.gz
    :param tag: string embedded in both output file names
    """
    adj_mat, row_names, _ = read_loc_adj_mat(adj_mat_infile)
    label_generator = get_label_generator_from_edgefile(edges_infile, row_names)

    # (for gowalla, just ran standard before)
    mixed_pairs_sims = [.001, .005, .01, .05, .1, .2, .3, .4, .5]
    all_methods_to_run = [
        'jaccard', 'cosine', 'cosineIDF', 'shared_size', 'hamming', 'pearson',
        'shared_weight11', 'shared_weight1100', 'adamic_adar', 'newman', 'weighted_corr',
        'mixed_pairs',
    ]

    run_options = dict(
        true_labels_func=label_generator,
        method_spec=all_methods_to_run,
        evals_outfile=outdir + '/results_' + tag + '.txt',
        mixed_pairs_sims=mixed_pairs_sims,
        pair_scores_outfile=outdir + '/scoredPairs_' + tag + '.csv.gz',
        row_labels=row_names,
        remove_boundary_items=False,
        print_timing=True,
        prefer_faiss=True,
    )
    score_data.run_and_eval(adj_mat, **run_options)
def affil_subsets_loc_data(adj_mat_infile, edges_infile, outdir, tag, affil_subset_fraction, affil_subset_type):
    """
    Score and evaluate a location data set after keeping only a subset of its affils (columns).

    :param adj_mat_infile: file read by read_loc_adj_mat()
    :param edges_infile: edge file from which the true-label generator is built
    :param outdir: directory that receives results_<tag>.txt and scoredPairs_<tag>.csv.gz
    :param tag: string embedded in both output file names
    :param affil_subset_fraction: forwarded to affil_subsets.compute_affil_subsets()
    :param affil_subset_type: forwarded to affil_subsets.compute_affil_subsets()
    """
    full_adj_mat, row_names, _ = read_loc_adj_mat(adj_mat_infile)
    label_generator = get_label_generator_from_edgefile(edges_infile, row_names)

    # Column sums divided by the number of rows; used only to choose which affils to keep.
    column_sums = np.asarray(full_adj_mat.sum(axis=0)).squeeze()
    tmp_pi_vector = column_sums / float(full_adj_mat.shape[0])
    affils_to_keep = affil_subsets.compute_affil_subsets(tmp_pi_vector, affil_subset_fraction, affil_subset_type)
    adj_mat = full_adj_mat[:, affils_to_keep]

    all_methods_to_run = [
        'jaccard', 'cosine', 'cosineIDF', 'shared_size', 'hamming', 'pearson',
        'shared_weight11', 'shared_weight1100', 'adamic_adar', 'newman', 'weighted_corr',
        'mixed_pairs',
    ]

    run_options = dict(
        true_labels_func=label_generator,
        method_spec=all_methods_to_run,
        evals_outfile=outdir + '/results_' + tag + '.txt',
        pair_scores_outfile=outdir + '/scoredPairs_' + tag + '.csv.gz',
        row_labels=row_names,
        remove_boundary_items=False,
        print_timing=True,
        prefer_faiss=True,
    )
    score_data.run_and_eval(adj_mat, **run_options)
示例#14
0
def demo_loc_data():
    """
    Demo: sample 300 nodes from the brightkite data, save their row IDs, and run all methods.

    Output goes to paths under brightkite/ relative to the current working directory. Nothing is
    scored when the sample yields no label generator.
    """
    # todo: set random seed so this is actually repeatable
    adj_mat_infile = '/Users/lfriedl/Documents/dissertation/real-data/brightkite/bipartite_adj.txt'
    edges_infile = '/Users/lfriedl/Documents/dissertation/real-data/brightkite/loc-brightkite_edges.txt'
    rows_outfile = 'brightkite/data-ex1.txt'

    sample = loc_data.read_sample_save(adj_mat_infile,
                                       edges_infile,
                                       num_nodes=300,
                                       rows_outfile=rows_outfile)
    adj_mat, row_labels, label_generator = sample

    if label_generator is None:
        print("Found no edges; stopping")
        return

    score_data.run_and_eval(adj_mat,
                            true_labels_func=label_generator,
                            method_spec="all",
                            evals_outfile="brightkite/evals-ex1.txt",
                            pair_scores_outfile="brightkite/scoredPairs-ex1.csv.gz",
                            row_labels=row_labels,
                            print_timing=True)
示例#15
0
def compare_timings_faiss_normal(adj_mat_infile, evals_outfile,
                                 scored_pairs_outfile):
    """
    Time the faiss scoring methods against their non-faiss counterparts on growing row counts.

    For each requested row count, the data is read once, then scored twice: first with every method
    in scoring_with_faiss.all_faiss_methods, then with the same names minus their last 6 characters
    (presumably the "_faiss" suffix) and make_dense=True. Both runs write to the same output files.

    :param adj_mat_infile: NOTE(review): currently unused; input is always read from the hard-coded
        brightkite path below
    :param evals_outfile: forwarded to run_and_eval
    :param scored_pairs_outfile: forwarded to run_and_eval
    """
    infile = "/Users/lfriedl/Documents/dissertation/real-data/brightkite/bipartite_adj.txt"

    def run_and_report(matrix, method_names, **extra_options):
        # Time one run_and_eval call and print the elapsed wall-clock time.
        began = timer()
        score_data.run_and_eval(
            matrix,
            true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
            method_spec=method_names,
            evals_outfile=evals_outfile,
            pair_scores_outfile=scored_pairs_outfile,
            print_timing=True,
            **extra_options)
        finished = timer()
        print("ran all " + str(len(method_names)) + " methods in " +
              str(finished - began) + " seconds")

    # my OS kills it at 10000 (due to memory); [2000] was another setting tried
    for num_to_try in (100, 1000, 5000):
        adj_mat, _, _ = loc_data.read_loc_adj_mat(infile, max_rows=num_to_try)

        print("\n*** Running all faiss methods ***\n")
        print("(asked for " + str(num_to_try) + " nodes)")
        faiss_methods = scoring_with_faiss.all_faiss_methods
        run_and_report(adj_mat, faiss_methods)

        print("Now running normal versions for comparison")
        normal_versions = [name[:-6] for name in faiss_methods]
        run_and_report(adj_mat, normal_versions, make_dense=True)
示例#16
0
def congress_affil_subsets(base_dir_expts, adj_mat_dir):
    """
    For every (party, session) combination, run and evaluate the scoring methods on three different
    affil subsets of the votes matrix, each with affil_subset_fraction=.25.

    :param base_dir_expts: directory that receives the results_* and scoredPairs_* files
    :param adj_mat_dir: directory holding the <party>Votes<sess>.mtx.gz and
        <party>Cospons<sess>GE*.mtx input files
    """
    parties = ['dem', 'rep']
    sessions = range(110, 114)
    # One value per (party, session), in loop order; determined by manual inspection.
    mixed_pairs_best_sims = [0.05, 0.001, 0.001, 0.001, 0.5, 0.7, 0.4, 0.2]
    methods = set(scoring_methods.all_defined_methods) - {'weighted_corr_exp'}

    # (file-name suffix, affil_subset_type) for the three runs, in the order they are executed
    subset_variants = [("subsetRand.25", 1), ("subsetMax.25", 2), ("subsetMin.25", 3)]

    combos = [(party, sess) for party in parties for sess in sessions]
    for (party, sess), best_sim in zip(combos, mixed_pairs_best_sims):
        adj_mat_infile = adj_mat_dir + "/" + party + "Votes" + str(sess) + ".mtx.gz"
        adj_mat = score_data.load_adj_mat(adj_mat_infile)

        true_pairs_infile = glob.glob(adj_mat_dir + "/" + party +
                                      "Cospons" + str(sess) + "GE*.mtx")[0]
        with open(true_pairs_infile) as fin:
            true_pairs_mat = mmread(fin).astype(int, copy=False).tocsc()

        # create label generator that uses this matrix.
        def get_true_labels_given_my_edges(pairs_generator):
            return experiment_runner.get_true_labels_from_matrix(
                pairs_generator, true_pairs_mat)

        mixed_pairs_sim = [best_sim]

        # N.B. Decided to use original pi_vector here, including boundary affils, when choosing affil_subsets.
        # (Some items will end up all-0.)
        # This is different than when using sampled graphs. There, there's a universal pi_vector (always > 0), and your
        # data set might not see some affils. Here, even the universal pi_vector can be 0.
        column_sums = np.asarray(adj_mat.sum(axis=0)).squeeze()
        tmp_pi_vector = column_sums / float(adj_mat.shape[0])

        for subset_name, subset_type in subset_variants:
            file_stem = party + str(sess) + "_" + subset_name
            evals_outfile = base_dir_expts + "/results_" + file_stem + ".txt"
            pair_scores_outfile = base_dir_expts + "/scoredPairs_" + file_stem + ".csv.gz"

            affils_to_keep = affil_subsets.compute_affil_subsets(
                tmp_pi_vector, affil_subset_fraction=.25, affil_subset_type=subset_type)
            score_data.run_and_eval(
                adj_mat[:, affils_to_keep],
                true_labels_func=get_true_labels_given_my_edges,
                method_spec=methods,
                evals_outfile=evals_outfile,
                pair_scores_outfile=pair_scores_outfile,
                print_timing=True,
                prefer_faiss=True,
                mixed_pairs_sims=mixed_pairs_sim,
                remove_boundary_items=False,
                remove_boundary_affils=True)
示例#17
0
def score_data_set(data_dir,
                   trial_num,
                   inference_dir_name,
                   method_spec="all",
                   save_pair_scores=True,
                   sims_for_mixed_pairs=None,
                   flip_high_ps=False,
                   remove_boundary_items=False,
                   remove_boundary_affils=True,
                   pi_vector_to_use=None,
                   prefer_faiss=True,
                   affil_subset_every_1_4=False,
                   affil_subset_type=0,
                   affil_subset_fraction=0,
                   loc_data_bipartite_file=None,
                   loc_data_true_pairs_file=None,
                   verbose=False):
    """
    Load the data for one trial, optionally subset its affils, and run/evaluate the scoring methods.

    This function hard-codes some file names (to standardize):
        data_dir/allInputDataFiles.tgz contains files of the form data1_adjMat.mtx.gz, data1_numPos.txt and data1_phi.txt.gz
        output: data_dir/inference_dir_name (created if needed) will have results1.txt and (maybe) scoredPairs1.csv.gz

    Input files that are not already present in data_dir are extracted from the tar archive and
    deleted again once the data has been loaded.

    :param data_dir: directory holding the input files (or the tar archive) and the output subdirectory
    :param trial_num: integer
    :param inference_dir_name: subdirectory will be constructed (if necessary) under data_dir
    :param method_spec: forwarded to score_data.run_and_eval
    :param save_pair_scores: whether to write a scoredPairs file
    :param sims_for_mixed_pairs: default runs a standard set of 3
    :param flip_high_ps: forwarded to score_data.run_and_eval
    :param remove_boundary_items: forwarded to score_data.run_and_eval
    :param remove_boundary_affils: forwarded to score_data.run_and_eval
    :param pi_vector_to_use: forwarded to score_data.run_and_eval, after being subset the same way as
        the affils when one of the affil_subset options is active
    :param prefer_faiss: forwarded to score_data.run_and_eval
    :param affil_subset_every_1_4: to match existing handling of newsgroups data, keep only the 1st of every 4 affils
        (pruned before handing to score_data package).
    :param affil_subset_type: when > 0, keep only the affils selected by
        affil_subsets.compute_affil_subsets() for this type and affil_subset_fraction
    :param affil_subset_fraction: forwarded to affil_subsets.compute_affil_subsets()
    :param loc_data_bipartite_file: when this and loc_data_true_pairs_file are both given, the trial's
        input is data<trial_num>.rowIDs and the data is loaded with
        prelim_loc_data_expts.get_loc_expt_data() instead of from adjMat/numPos files
    :param loc_data_true_pairs_file: see loc_data_bipartite_file
    :param verbose: forwarded to score_data.run_and_eval as print_timing
    :return: None
    """

    # 1a. construct input filenames and extract data file(s) from tar archive, if not already found locally.
    loc_data_format = (loc_data_bipartite_file is not None
                       and loc_data_true_pairs_file is not None)
    if loc_data_format:
        basenamefiles = ["./data" + str(trial_num) + ".rowIDs"]
    else:  # usual format
        # what we need are adjMat and numPos.
        adj_mat_file_basename = "./data" + str(
            trial_num
        ) + "_adjMat.mtx.gz"  # nb: "./" required to access it from tar
        num_pos_file_basename = "./data" + str(trial_num) + "_numPos.txt"
        adj_mat_infile = data_dir + "/" + adj_mat_file_basename
        num_pos_infile = data_dir + "/" + num_pos_file_basename
        basenamefiles = [adj_mat_file_basename, num_pos_file_basename]

    fullpathfiles = [data_dir + "/" + basefile for basefile in basenamefiles]
    files_already_present = all(os.path.isfile(path) for path in fullpathfiles)

    if not files_already_present:
        tar_infile = data_dir + "/allInputDataFiles.tgz"
        # Context manager makes sure the archive handle is closed even if a member is missing.
        with tarfile.open(tar_infile) as tf:
            members = [tf.getmember(filename) for filename in basenamefiles]
            tf.extractall(path=data_dir, members=members)

    # 1b. construct other variables

    # outfiles
    # create or locate outputDir
    out_dir = data_dir + "/" + inference_dir_name
    if not (os.path.isdir(out_dir)):
        try:
            os.mkdir(out_dir)
        except OSError:
            # test again, because another thread might have created it meanwhile
            ok = os.path.isdir(out_dir)
            if not ok:
                os.mkdir(out_dir)  # an error here stops the function

    # (full paths)
    evals_outfile = out_dir + "/results" + str(trial_num) + ".txt"
    if save_pair_scores:
        pair_scores_outfile = out_dir + "/scoredPairs" + str(
            trial_num) + ".csv.gz"
    else:
        pair_scores_outfile = None
    if sims_for_mixed_pairs is None:
        sims_for_mixed_pairs = "standard"  # magic word for default

    # load data into variables
    if loc_data_format:
        adj_mat, item_names, true_labels_func = prelim_loc_data_expts.get_loc_expt_data(
            loc_data_bipartite_file, loc_data_true_pairs_file,
            fullpathfiles[0])
    else:
        adj_mat = score_data.load_adj_mat(adj_mat_infile)
        item_names = None
        with open(num_pos_infile) as fin:
            num_true_pos = int(fin.readline())
        true_labels_func = partial(
            expts_labeled_data.get_true_labels_expt_data, num_true_pos)

    if affil_subset_every_1_4:
        subset_adj_mat = sparse.csc_matrix(
            (adj_mat.shape[0], int(adj_mat.shape[1] / 4)), dtype='int')
        # copy over every 4th column
        for i in range(int(adj_mat.shape[1] / 4)):
            subset_adj_mat[:, i] = adj_mat[:, 4 * i]
        adj_mat = subset_adj_mat
        if pi_vector_to_use is not None:
            subset_pi_vector = np.array([
                pi_vector_to_use[4 * i]
                for i in range(int(len(pi_vector_to_use) / 4))
            ])
            pi_vector_to_use = subset_pi_vector

    if affil_subset_type > 0:
        if pi_vector_to_use is not None:
            affils_to_keep = affil_subsets.compute_affil_subsets(
                pi_vector_to_use, affil_subset_fraction, affil_subset_type)
            pi_vector_to_use = pi_vector_to_use[affils_to_keep]
        else:
            # No pi_vector supplied: use column sums divided by the number of rows to pick the subset.
            tmp_pi_vector = np.asarray(adj_mat.sum(axis=0)).squeeze() / float(
                adj_mat.shape[0])
            affils_to_keep = affil_subsets.compute_affil_subsets(
                tmp_pi_vector, affil_subset_fraction, affil_subset_type)
        adj_mat = adj_mat[:, affils_to_keep]

    # Clean up only what this call extracted; files that were already there are left alone.
    if not files_already_present:
        for infile in fullpathfiles:
            os.remove(infile)

    # did affil_subsetting remove all data? If so, going to get errors. Don't run anything, just print.
    if affil_subset_type > 0 and np.sum(adj_mat) == 0:
        # NOTE(review): num_true_pos is only assigned in the usual (non-loc-data) format, so reaching
        # this branch with loc_data_format=True raises UnboundLocalError. Left as is, because the
        # right value to pass depends on print_results_for_0data -- confirm and fix there.
        affil_subsets.print_results_for_0data(
            num_true_pos,
            adj_mat,
            true_labels_func=true_labels_func,
            method_spec=method_spec,
            evals_outfile=evals_outfile,
            pair_scores_outfile=pair_scores_outfile,
            row_labels=item_names,
            print_timing=verbose,
            prefer_faiss=prefer_faiss,
            mixed_pairs_sims=sims_for_mixed_pairs,
            flip_high_ps=flip_high_ps,
            pi_vector_to_use=pi_vector_to_use,
            remove_boundary_items=remove_boundary_items,
            remove_boundary_affils=remove_boundary_affils)
        return

    # 2. run the expt
    score_data.run_and_eval(adj_mat,
                            true_labels_func=true_labels_func,
                            method_spec=method_spec,
                            evals_outfile=evals_outfile,
                            pair_scores_outfile=pair_scores_outfile,
                            row_labels=item_names,
                            print_timing=verbose,
                            prefer_faiss=prefer_faiss,
                            mixed_pairs_sims=sims_for_mixed_pairs,
                            flip_high_ps=flip_high_ps,
                            pi_vector_to_use=pi_vector_to_use,
                            remove_boundary_items=remove_boundary_items,
                            remove_boundary_affils=remove_boundary_affils)