Пример #1
0
def test_adj_and_phi():
    """
    Verify that learning, preprocessing and flipping pi_vector reproduce what the R code produced.

    Input:      'ng_aa_data1/data15_adj_mat.mtx.gz'
    References: 'ng_aa_data1/data15' + [.dataphi.txt.gz, .dataphipreproc.txt.gz, .dataphiflipped.txt.gz,
                                        .adj_mat_flipped.mtx.gz]

    Raises AssertionError when a shape disagrees or any entry is off by 1e-07 or more.
    """
    print(
        "\n*** Testing reading adjacency matrix and computing pi_vector ***\n")
    # The "data15*" example files hold the contents of the expt data file alt.atheism/data1.Rdata.
    # ("ng_aa_data1/data15_phi.txt.gz" is the phi of the whole, larger data set and is deliberately not used here.)

    # R wrote its floating point data with 7 digits of precision (see getOptions("digits") and format()), so the
    # reference phi files are only compared to within 1e-07; to use them for anything else they should be
    # re-converted with higher precision. Raising the precision only revealed that the R code's
    # Bernoulli <-> multinomial conversion doesn't keep the exact probabilities anyway -- possibly a bug, since the
    # other place the code does that conversion explicitly fixes it.
    tolerance = 1e-07

    # Reference vectors, built step by step with the existing R code in experimentRunner.R
    r_pi_learned = expts_labeled_data.load_pi_from_file(
        "ng_aa_data1/data15.dataphi.txt.gz")
    r_pi_preproc = expts_labeled_data.load_pi_from_file(
        "ng_aa_data1/data15.dataphipreproc.txt.gz")

    # Same pipeline on the Python side
    input_adj = score_data.load_adj_mat("ng_aa_data1/data15_adj_mat.mtx.gz")
    pi_learned = score_data.learn_pi_vector(input_adj)
    pi_preproc, adj_preproc = expts_labeled_data.adjust_pi_vector(
        pi_learned, input_adj)

    # Learned vector: one entry per column of the adjacency matrix, agreeing with R
    assert pi_learned.shape[0] == input_adj.shape[1]
    learned_gap = abs(pi_learned - r_pi_learned)
    assert max(learned_gap) < tolerance

    # Preprocessed vector: one entry per column of the preprocessed matrix, agreeing with R
    assert pi_preproc.shape[0] == adj_preproc.shape[1]
    preproc_gap = abs(pi_preproc - r_pi_preproc)
    assert max(preproc_gap) < tolerance

    # Flipping: both the vector and the adjacency matrix must agree with their R counterparts
    r_pi_flipped = expts_labeled_data.load_pi_from_file(
        "ng_aa_data1/data15.dataphiflipped.txt.gz")
    r_adj_flipped = score_data.load_adj_mat(
        "ng_aa_data1/data15.adj_mat_flipped.mtx.gz")
    pi_flipped, adj_flipped = expts_labeled_data.adjust_pi_vector(
        pi_learned, input_adj, flip_high_ps=True)

    assert pi_flipped.shape == pi_preproc.shape
    flipped_pi_gap = abs(pi_flipped - r_pi_flipped)
    assert max(flipped_pi_gap) < tolerance
    assert r_adj_flipped.shape == adj_flipped.shape
    flipped_adj_gap = abs(r_adj_flipped - adj_flipped)
    assert flipped_adj_gap.max() < tolerance
Пример #2
0
def test_score_wc_faiss():
    """
    Smoke-test the faiss scoring entry points on "reality_appweek_50/data50_adjMat.mtx.gz".

    Runs 'weighted_corr_faiss' through score_pairs_faiss, then 'adamic_adar_faiss' and 'pearson_faiss' (the latter
    on both the matrix as preprocessed and its .toarray() form) through score_pairs_faiss_all_exact. Each result's
    head() is printed; nothing is asserted.
    """
    def show_sample(frame):
        # Print the first rows of a scores data frame
        print('scores look like (sample):\n' + str(frame.head()))

    raw_adj = score_data.load_adj_mat("reality_appweek_50/data50_adjMat.mtx.gz")
    pi_vector, adj_preproc = expts_labeled_data.adjust_pi_vector(
        score_data.learn_pi_vector(raw_adj), raw_adj)

    wc_scores = scoring_with_faiss.score_pairs_faiss(
        adj_preproc,
        which_methods=['weighted_corr_faiss'],
        how_many_neighbors=-1,
        print_timing=True,
        pi_vector=pi_vector)
    show_sample(wc_scores)
    # Tip for later: wc_scores.reset_index() turns item1 & item2 into regular columns and restores a plain
    # row-number index.

    print("calling adamic-adar")
    show_sample(
        scoring_with_faiss.score_pairs_faiss_all_exact(
            adj_preproc,
            'adamic_adar_faiss',
            pi_vector=pi_vector,
            num_docs=adj_preproc.shape[0]))

    print("calling pearson")
    show_sample(
        scoring_with_faiss.score_pairs_faiss_all_exact(adj_preproc,
                                                       'pearson_faiss'))

    print("(dense input)")
    show_sample(
        scoring_with_faiss.score_pairs_faiss_all_exact(adj_preproc.toarray(),
                                                       'pearson_faiss'))
Пример #3
0
def case1_no_bdry_nodes(adj_mat_infile, results_dir, aucs_file_to_match):
    """
    Case 1: score the matrix exactly as loaded, leaving the remove_boundary_* options at run_and_eval's defaults.

    :param adj_mat_infile: path handed to score_data.load_adj_mat
    :param results_dir: directory in which "evals-case1.txt" is written
    :param aucs_file_to_match: reference file passed to compare_auc_files alongside the new evals file
    """
    print("\nCase 1\n")
    loaded_adj = score_data.load_adj_mat(adj_mat_infile)
    evals_path = results_dir + "/evals-case1.txt"

    run_options = dict(
        true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
        method_spec=['weighted_corr', 'weighted_corr_exp'],
        evals_outfile=evals_path,
        pair_scores_outfile=None,
        print_timing=True)
    score_data.run_and_eval(loaded_adj, **run_options)

    compare_auc_files(evals_path, aucs_file_to_match)
Пример #4
0
def test_faiss_plus_normal():
    """
    Run 'weighted_corr' and 'weighted_corr_faiss' together through run_and_eval on the reality_appweek_50 data.

    Writes evals to "reality_appweek_50/python-out/evals-test.txt" and pair scores to
    'reality_appweek_50/tmp.scoredPairs.csv.gz'. Nothing is asserted here.
    """
    loaded_adj = score_data.load_adj_mat(
        "reality_appweek_50/data50_adjMat.mtx.gz")

    # NOTE: method_spec="all" is the alternative to try in place of the two-method list below.
    run_options = dict(
        true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
        method_spec=['weighted_corr', 'weighted_corr_faiss'],
        evals_outfile="reality_appweek_50/python-out/evals-test.txt",
        pair_scores_outfile='reality_appweek_50/tmp.scoredPairs.csv.gz',
        print_timing=True)
    score_data.run_and_eval(loaded_adj, **run_options)
Пример #5
0
def case4_0item_no_bdry_affils(adj_mat_infile, results_dir, aucs_file_to_match):
    """
    Case 4: keep the natural all-0 item, but have run_and_eval remove boundary affils.

    :param adj_mat_infile: path handed to score_data.load_adj_mat
    :param results_dir: directory in which "evals-case4.txt" is written
    :param aucs_file_to_match: reference file passed to compare_auc_files alongside the new evals file
    """
    print("\nCase 4\n")
    loaded_adj = score_data.load_adj_mat(adj_mat_infile)
    evals_path = results_dir + "/evals-case4.txt"

    run_options = dict(
        true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
        method_spec=['weighted_corr', 'weighted_corr_exp'],
        evals_outfile=evals_path,
        pair_scores_outfile=None,
        print_timing=True,
        remove_boundary_items=False,   # the all-0 item stays in
        remove_boundary_affils=True)
    score_data.run_and_eval(loaded_adj, **run_options)

    compare_auc_files(evals_path, aucs_file_to_match)
Пример #6
0
def case5_0item_keep_0affils(adj_mat_infile, results_dir, aucs_file_to_match):
    """
    Case 5: keep boundary items and boundary affils, after zeroing affil column 115.

    :param adj_mat_infile: path handed to score_data.load_adj_mat
    :param results_dir: directory in which "evals-case5.txt" is written
    :param aucs_file_to_match: reference file passed to compare_auc_files alongside the new evals file
    """
    print("\nCase 5\n")
    loaded_adj = score_data.load_adj_mat(adj_mat_infile)
    # Only all-0 affils are wanted in this case, so affil 115 is blanked out to all 0's.
    loaded_adj[:, 115] = 0
    evals_path = results_dir + "/evals-case5.txt"

    run_options = dict(
        true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
        method_spec=['weighted_corr', 'weighted_corr_exp'],
        evals_outfile=evals_path,
        pair_scores_outfile=None,
        print_timing=True,
        remove_boundary_items=False,
        remove_boundary_affils=False)
    score_data.run_and_eval(loaded_adj, **run_options)

    compare_auc_files(evals_path, aucs_file_to_match)
Пример #7
0
def case3_keep_0and1affils(adj_mat_infile, results_dir, aucs_file_to_match):
    """
    Case 3: keep the natural all-0 and all-1 affils, with item 26 zeroed so that it stays out.

    :param adj_mat_infile: path handed to score_data.load_adj_mat
    :param results_dir: directory in which "evals-case3.txt" is written
    :param aucs_file_to_match: reference file passed to compare_auc_files alongside the new evals file
    """
    print("\nCase 3\n")
    loaded_adj = score_data.load_adj_mat(adj_mat_infile)
    # Blank out row 26: the boundary affils are kept, but this item should still be excluded.
    # (remove_boundary_items is left at run_and_eval's default.)
    loaded_adj[26, :] = 0
    evals_path = results_dir + "/evals-case3.txt"

    run_options = dict(
        true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
        method_spec=['weighted_corr', 'weighted_corr_exp'],
        evals_outfile=evals_path,
        pair_scores_outfile=None,
        print_timing=True,
        remove_boundary_affils=False)
    score_data.run_and_eval(loaded_adj, **run_options)

    compare_auc_files(evals_path, aucs_file_to_match)
Пример #8
0
def case6_0item_keep_0and1affils(adj_mat_infile, results_dir, aucs_file_to_match):
    """
    Case 6: score the matrix just as it comes, keeping both boundary items and boundary affils.

    The data has all-0 and all-1 affils, plus an item that becomes all-0 once the all-1 affil is gone.
    That induced boundary item can't be handled quite correctly by the exp model, but it works out well enough
    because the item ends up with a parameter very close to zero.

    :param adj_mat_infile: path handed to score_data.load_adj_mat
    :param results_dir: directory in which "evals-case6.txt" is written
    :param aucs_file_to_match: reference file passed to compare_auc_files alongside the new evals file
    """
    print("\nCase 6\n")
    loaded_adj = score_data.load_adj_mat(adj_mat_infile)
    evals_path = results_dir + "/evals-case6.txt"

    run_options = dict(
        true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
        method_spec=['weighted_corr', 'weighted_corr_exp'],
        evals_outfile=evals_path,
        pair_scores_outfile=None,
        print_timing=True,
        remove_boundary_items=False,
        remove_boundary_affils=False)
    score_data.run_and_eval(loaded_adj, **run_options)

    compare_auc_files(evals_path, aucs_file_to_match)
Пример #9
0
def demo_run_and_eval(adj_mat_infile,
                      pair_scores_outfile,
                      evals_outfile,
                      prefer_faiss=False):
    """
    Load an adjacency matrix and run score_data.run_and_eval on it with method_spec="all".

    :param adj_mat_infile: path handed to score_data.load_adj_mat
    :param pair_scores_outfile: forwarded to run_and_eval as pair_scores_outfile
    :param evals_outfile: forwarded to run_and_eval as evals_outfile
    :param prefer_faiss: forwarded to run_and_eval unchanged
    """
    run_options = dict(
        true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
        method_spec="all",
        evals_outfile=evals_outfile,
        pair_scores_outfile=pair_scores_outfile,
        print_timing=True,
        prefer_faiss=prefer_faiss)
    score_data.run_and_eval(score_data.load_adj_mat(adj_mat_infile),
                            **run_options)
Пример #10
0
def case8_01items_no_bdry_affils(adj_mat_infile, results_dir, aucs_file_to_match):
    """
    Case 8: append an almost-all-1 item, keep boundary items, and have run_and_eval remove boundary affils.

    The appended item has a 1 for every affil whose column sum is > 0, so affils that were all-0 in the loaded
    matrix stay all-0.

    :param adj_mat_infile: path handed to score_data.load_adj_mat (the reference data is 75 items x 206 affils)
    :param results_dir: directory in which "evals-case8.txt" is written
    :param aucs_file_to_match: reference file passed to compare_auc_files alongside the new evals file
    """
    print("\nCase 8\n")
    adj_mat = score_data.load_adj_mat(adj_mat_infile)
    # Column sums must be taken before the new row is added
    affil_degrees = np.asarray(adj_mat.sum(axis=0)).squeeze()

    # Grow the matrix by exactly one row. The size comes from the loaded matrix instead of a hard-coded
    # (76, 206), so an input of any other shape is neither truncated nor padded by accident.
    num_items, num_affils = adj_mat.shape
    adj_mat.resize((num_items + 1, num_affils))
    adj_mat[num_items, affil_degrees > 0] = 1  # new almost-all-1 item (preserves orig all-0 affils)

    new_evals_file = results_dir + "/evals-case8.txt"
    score_data.run_and_eval(adj_mat,
                            true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
                            method_spec=['weighted_corr', 'weighted_corr_exp'],
                            evals_outfile=new_evals_file,
                            pair_scores_outfile=None,
                            print_timing=True, remove_boundary_items=False, remove_boundary_affils=True)
    compare_auc_files(new_evals_file, aucs_file_to_match)
Пример #11
0
def case9_01items_keep_all(adj_mat_infile, results_dir, aucs_file_to_match):
    """
    Case 9: append an almost-all-1 item and keep every boundary item and boundary affil.

    The appended item has a 1 for every affil whose column sum is > 0, so affils that were all-0 in the loaded
    matrix stay all-0.

    Note: similar to case 6, the all-1 item (an induced boundary node) can't be handled by the exp model. There is
    no max likelihood solution for this graph. In practice the algorithm times out -- but even if it ran longer,
    there's no good solution to converge to. The parameter for that item needs to be near-infinity, but not
    infinity.

    :param adj_mat_infile: path handed to score_data.load_adj_mat (the reference data is 75 items x 206 affils)
    :param results_dir: directory in which "evals-case9.txt" is written
    :param aucs_file_to_match: reference file passed to compare_auc_files alongside the new evals file
    """
    print("\nCase 9\n")
    adj_mat = score_data.load_adj_mat(adj_mat_infile)
    # Column sums must be taken before the new row is added
    affil_degrees = np.asarray(adj_mat.sum(axis=0)).squeeze()

    # Grow the matrix by exactly one row. The size comes from the loaded matrix instead of a hard-coded
    # (76, 206), so an input of any other shape is neither truncated nor padded by accident.
    num_items, num_affils = adj_mat.shape
    adj_mat.resize((num_items + 1, num_affils))
    adj_mat[num_items, affil_degrees > 0] = 1  # new almost-all-1 item (preserves orig all-0 affils)

    new_evals_file = results_dir + "/evals-case9.txt"
    score_data.run_and_eval(adj_mat,
                            true_labels_func=expts_labeled_data.true_labels_for_expts_with_5pairs,
                            method_spec=['weighted_corr', 'weighted_corr_exp'],
                            evals_outfile=new_evals_file,
                            pair_scores_outfile=None,
                            print_timing=True, remove_boundary_items=False, remove_boundary_affils=False)
    compare_auc_files(new_evals_file, aucs_file_to_match)
Пример #12
0
def test_only_wc(adj_mat_infile, scored_pairs_file_R, scored_pairs_file_new):
    """
    Like test_pair_scores_against_R(), but checks scores & timing of the function simple_only_weighted_corr().
    (This was the first scoring method implemented using a transform of the adj_matrix.)

    Fails with AssertionError unless every "weighted_corr" score is within 1e-05 of R's "pearsonWeighted".

    :param adj_mat_infile: local path ending in .mtx.gz
    :param scored_pairs_file_R: local path ending in .csv.gz, holding the reference scores from R
    :param scored_pairs_file_new: path the freshly computed scores are written to (gzipped csv) and read back from
    """

    print(
        "\n*** Checking simple_only_weighted_corr against scores from R ***\n")

    # Load the adjacency data, then learn and preprocess pi_vector
    raw_adj = score_data.load_adj_mat(adj_mat_infile)
    pi_preproc, adj_preproc = expts_labeled_data.adjust_pi_vector(
        score_data.learn_pi_vector(raw_adj), raw_adj)

    # Compute the scores into a fresh storage object and dump them to disk
    num_items = adj_preproc.shape[0]
    scores_storage = magic_dictionary.make_me_a_dict(num_items)
    wc_target = scores_storage.create_and_store_array("weighted_corr")
    scoring_methods.extra_implementations.simple_only_weighted_corr(
        score_data.gen_all_pairs,
        adj_preproc,
        wc_target,
        pi_preproc,
        print_timing=True)
    scores_storage.to_csv_gz(scored_pairs_file_new,
                             lambda: score_data.ij_gen(num_items))

    # Read back our scores, then the reference ones
    with gzip.open(scored_pairs_file_new, 'r') as new_in:
        our_frame = pd.read_csv(new_in)
    with gzip.open(scored_pairs_file_R, 'r') as ref_in:
        r_frame = pd.read_csv(ref_in)

    abs_diff = abs(our_frame["weighted_corr"] - r_frame["pearsonWeighted"])
    print("max diff: " + str(abs_diff.max()))
    assert max(abs_diff) < 1e-05
Пример #13
0
def test_adj_and_phi2():
    """
    Check, on a second data set, that the pi_vector learned from the adjacency matrix matches the reference.

    Input:     "reality_appweek_50/data50_adjMat.mtx.gz"
    Reference: "reality_appweek_50/data50-inference-allto6.phi.csv.gz"

    Raises AssertionError if any preprocessed entry is off from the reference by 1e-07 or more.
    """
    print(
        "\n*** Testing reading adjacency matrix and computing pi_vector (2) ***\n"
    )
    # Deliberately not the newsgroups data: those runs were done early and are too complicated.

    # The reference is the phi stored in the inference file; learning from the adjacency matrix and
    # preprocessing should land on the same values.
    r_pi_preproc = expts_labeled_data.load_pi_from_file(
        "reality_appweek_50/data50-inference-allto6.phi.csv.gz")

    raw_adj = score_data.load_adj_mat("reality_appweek_50/data50_adjMat.mtx.gz")
    pi_preproc, _adj_preproc = expts_labeled_data.adjust_pi_vector(
        score_data.learn_pi_vector(raw_adj), raw_adj)

    preproc_gap = abs(pi_preproc - r_pi_preproc)
    assert max(preproc_gap) < 1e-07
Пример #14
0
def test_faiss_basic_calls():
    """
    Smoke-test the basic faiss calls (IndexFlatIP, add, search) on the preprocessed reality_appweek_50 matrix.

    Nothing is asserted; the test passes if every call runs.
    """
    raw_adj = score_data.load_adj_mat("reality_appweek_50/data50_adjMat.mtx.gz")
    _pi_vector, adj_preproc = expts_labeled_data.adjust_pi_vector(
        score_data.learn_pi_vector(raw_adj), raw_adj)

    # A dot product on the plain adj matrix just computes sharedSize.
    num_cols = adj_preproc.shape[1]
    index = faiss.IndexFlatIP(num_cols)  # constructor takes numCols

    # adj_preproc is sparse, but faiss wants dense -- and, apparently, float32.
    # (The tutorial example this mimics added np.random.random((100, num_cols)).astype('float32') instead.)
    dense_rows = adj_preproc.toarray().astype('float32')
    index.add(dense_rows)
    print("index.is_trained: " + str(index.is_trained) + ", index.total: " +
          str(index.ntotal))

    # 10 nearest neighbors of each input row
    _dist_top10, _nbrs_top10 = index.search(dense_rows, 10)

    # every row against every row, i.e. all pairs
    num_rows = dense_rows.shape[0]
    _dist_all, _nbrs_all = index.search(dense_rows, num_rows)
    print('basic calls ran')
Пример #15
0
def test_pair_scores_against_R(adj_mat_infile,
                               scored_pairs_file_R,
                               scored_pairs_file_new,
                               make_dense=False,
                               flip_high_ps=False,
                               run_all=0,
                               prefer_faiss=False):
    """
    Starting from an adj matrix, score pairs (using current implementation) and compare to reference file run from R.
    Similar contents to score_data.run_and_eval().

    :param adj_mat_infile: path handed to score_data.load_adj_mat
    :param scored_pairs_file_R: gzipped csv of reference scores from R, in the same all-pairs order as ours
    :param scored_pairs_file_new: gzipped csv the new scores are written to, then read back from
    :param make_dense: if true, the preprocessed adj matrix is converted with .toarray() before scoring
    :param flip_high_ps: forwarded to expts_labeled_data.adjust_pi_vector
    :param run_all: set to 2 (or 1) to run and time all (or more) implementations.
                    However, we only look at the scores of the last one.
    :param prefer_faiss: forwarded to score_pairs; also loosens the comparison tolerance to 1e-04
    :return: data frame of the new scores, with a 'label' column added
    """
    print("\n*** Testing scores computed for pairs ***\n")
    print("Adj matrix infile: " + adj_mat_infile +
          "; scored pairs reference file: " + scored_pairs_file_R)

    # Load the adjacency data, then learn and preprocess pi_vector
    raw_adj = score_data.load_adj_mat(adj_mat_infile)
    pi_preproc, adj_preproc = expts_labeled_data.adjust_pi_vector(
        score_data.learn_pi_vector(raw_adj), raw_adj, flip_high_ps=flip_high_ps)

    methods_to_run = [
        'jaccard', 'cosine', 'cosineIDF', 'shared_size', 'hamming', 'pearson',
        'weighted_corr', 'shared_weight11', 'shared_weight1100', 'adamic_adar',
        'newman', 'mixed_pairs'
    ]
    mixed_pairs_sims = [.01, .001]
    time_started = timer()

    if make_dense:
        adj_preproc = adj_preproc.toarray()
    scoring_methods.score_pairs(score_data.gen_all_pairs,
                                adj_preproc,
                                which_methods=methods_to_run,
                                outfile_csv_gz=scored_pairs_file_new,
                                pi_vector=pi_preproc,
                                back_compat=True,
                                num_docs=adj_preproc.shape[0],
                                mixed_pairs_sims=mixed_pairs_sims,
                                print_timing=True,
                                run_all_implementations=run_all,
                                prefer_faiss=prefer_faiss)
    with gzip.open(scored_pairs_file_new, 'r') as new_in:
        our_scores = pd.read_csv(new_in)

    our_scores['label'] = expts_labeled_data.get_true_labels_expt_data(
        num_true_pairs=5,
        pairs_generator=score_data.gen_all_pairs(adj_preproc))
    time_finished = timer()

    # Summary line: 'mixed_pairs' counts once per entry of mixed_pairs_sims
    num_methods = len(methods_to_run)
    if 'mixed_pairs' in methods_to_run:
        num_methods += len(mixed_pairs_sims) - 1
    variants_note = "(plus variants) " if run_all else ""
    num_items = raw_adj.shape[0]
    num_pairs = num_items * (num_items - 1) / float(2)
    print("ran " + str(num_methods) + " methods " + variants_note + "on " +
          str(num_pairs) + " pairs")
    print("num seconds: " + str(time_finished - time_started))

    # Load the R scores and compare method by method
    with gzip.open(scored_pairs_file_R, 'r') as ref_in:
        r_scores = pd.read_csv(ref_in)

    our_columns = list(our_scores)
    for R_method, our_method in mapping_from_R_methods.items():
        if our_method not in our_columns:
            continue

        print("Checking " + our_method)
        # The R data has no item numbers, but its rows are in the same all-pairs order as ours
        abs_diff = abs(our_scores[our_method] - r_scores[R_method])
        print("max diff: " + str(abs_diff.max()))

        # The p_i vectors are off by a smidgen between R and here, so anything that uses them can differ too:
        # sharedWeight11 vals differed by > 1e-06, and that was with only 65 affils.
        if prefer_faiss:
            tolerance = 1e-04
        elif our_method in our_pi_methods:
            tolerance = 1e-05
        else:
            tolerance = 1e-10
        assert max(abs_diff) < tolerance

    return our_scores