# Run one dev/test verification experiment with word features for the
# current (vsm, dm) combination: record the best F1 in fscore_row and
# report a Kolmogorov-Smirnov test comparing same-author vs.
# different-author score distributions.
# NOTE(review): relies on names bound elsewhere in this script
# (random_state, dm, n_features, n_dev_pairs, vsm, X_dev, X_test,
# Verification, evaluate, fscore_row, sb) -- confirm against the full file.
verifier = Verification(random_state=random_state,
                        metric=dm,
                        feature_type="words",
                        sample_authors=False,
                        sample_features=False,
                        n_features=n_features,
                        n_test_pairs=0,
                        n_dev_pairs=n_dev_pairs,
                        em_iterations=100,
                        vector_space_model=vsm,
                        weight=0.2,
                        eps=0.01,
                        norm="l2",
                        balanced_pairs=True)
logging.info("Starting verification [dev / test]")
verifier.vectorize(X_dev, X_test)
dev_results = verifier.fit()
logging.info("Computing results")
dev_f, dev_p, dev_r, dev_t = evaluate(dev_results)
# Best F1 across all decision thresholds (NaN-safe).
max_f = np.nanmax(dev_f)
# FIX: original used Python 2 print statements, inconsistent with the
# Python 3 print() calls used elsewhere in this file.
print("\t\t + F1: " + str(max_f))
fscore_row.append(format(max_f * 100, '.1f'))
# distribution of scores:
same_author_densities = np.asarray(
    [sc for c, sc in dev_results if c == "same_author"])
diff_author_densities = np.asarray(
    [sc for c, sc in dev_results if c == "diff_author"])
# Two-sample KS test: how separable are the two score distributions?
D, p = ks_2samp(same_author_densities, diff_author_densities)
print("\t\t- KS: D = " + str(D) + " (p = " + str(p) + ")")
sb.set_style("dark")
# Sweep every distance metric for the current vector-space model on
# character 4-grams; collect each metric's best F1 (as a percentage
# string) into fscore_row, and report a KS test on the score
# distributions per metric.
fscore_row = [vsm]
for dm_cnt, dm in enumerate(dms):
    print('\t\t* ' + dm)
    verifier = Verification(random_state=random_state, metric=dm,
                            feature_type='chars', ngram_range=4,
                            sample_authors=False, sample_features=False,
                            n_features=n_features, n_test_pairs=None,
                            n_dev_pairs=n_dev_pairs,
                            vector_space_model=vsm,
                            balanced_pairs=balanced_pairs)
    logging.info("Starting verification [dev / test]")
    verifier.vectorize(test_dataset)
    dev_results = verifier.fit()
    logging.info("Computing results")
    dev_f, dev_p, dev_r, dev_t = evaluate(dev_results)
    max_f = np.nanmax(dev_f)
    print('\t\t + F1: ', max_f)
    fscore_row.append(format(max_f * 100, '.1f'))
    # distribution of scores:
    same_scores = []
    diff_scores = []
    for label, score in dev_results:
        if label == "same_author":
            same_scores.append(score)
        elif label == "diff_author":
            diff_scores.append(score)
    same_author_densities = np.asarray(same_scores)
    diff_author_densities = np.asarray(diff_scores)
    D, p = ks_2samp(same_author_densities, diff_author_densities)
    print("\t\t- KS: D = %s (p = %s)" % (D, p))
if fit:
    # We fit a verifier with the best parametrization obtained during
    # the development phase, dump the full pairwise distance table to
    # disk, then reload it for the steps below.
    verifier = Verification(random_state=1066,
                            metric='minmax',
                            feature_type='chars',
                            ngram_range=4,
                            sample_authors=False,
                            sample_features=False,
                            n_features=10000,
                            # effectively unbounded: use every available pair
                            n_dev_pairs=1000000000000,
                            vector_space_model='std',
                            balanced_pairs=False)
    verifier.vectorize(verif_dataset)
    dev_results = verifier.fit()
    dev_f, dev_p, dev_r, dev_t = evaluate(dev_results)
    max_f = np.nanmax(dev_f)
    print('\t\t + F1 (pairwise):', max_f)
    print('getting distance table')
    df = verifier.get_distance_table(verifier.dev_dists,
                                     verifier.dev_pairs, 'dev')
    df.to_csv('outputs/dm_no_sampl.csv')
    print('saved dist table!')
    # round-trip through CSV so downstream code works from the saved file
    df = pd.read_csv('outputs/dm_no_sampl.csv').set_index('id')
    print('loaded dist table!')
    top_scores = 0
# Same metric sweep as elsewhere in this file (character 4-grams per
# vector-space model), but additionally prepares a seaborn-styled
# subplot in the outer grid for the score-distribution figure.
fscore_row = [vsm]
for dm_cnt, dm in enumerate(dms):
    print('\t\t* '+dm)
    verifier = Verification(random_state=random_state, metric=dm,
                            feature_type='chars', ngram_range=4,
                            sample_authors=False, sample_features=False,
                            n_features=n_features, n_test_pairs=None,
                            n_dev_pairs=n_dev_pairs,
                            vector_space_model=vsm,
                            balanced_pairs=balanced_pairs)
    logging.info("Starting verification [dev / test]")
    verifier.vectorize(test_dataset)
    dev_results = verifier.fit()
    logging.info("Computing results")
    dev_f, dev_p, dev_r, dev_t = evaluate(dev_results)
    max_f = np.nanmax(dev_f)
    print('\t\t + F1: ', max_f)
    fscore_row.append(format(max_f*100, '.1f'))
    # distribution of scores:
    same_scores = []
    diff_scores = []
    for label, score in dev_results:
        if label == "same_author":
            same_scores.append(score)
        elif label == "diff_author":
            diff_scores.append(score)
    same_author_densities = np.asarray(same_scores)
    diff_author_densities = np.asarray(diff_scores)
    D, p = ks_2samp(same_author_densities, diff_author_densities)
    print("\t\t- KS: D = %s (p = %s)" %(D, p))
    sb.set_style("dark")
    # NOTE(review): sb.plt is a legacy seaborn alias for matplotlib.pyplot,
    # removed in modern seaborn -- confirm the pinned seaborn version.
    ax = sb.plt.Subplot(fig, outer_grid[cnt])
# NOTE(review): this fragment appears to be a truncated duplicate of the
# block that follows it -- the dendrogram(...) call is cut off after
# link_color_func=None, leaving the statement unfinished. Confirm against
# the original script before using this fragment.
# It also uses Python 2 print statements, unlike most of this file.
for vsm_cnt, vsm in enumerate(vsms):
    print vsm
    # Run dev verification with the current vector-space model, build a
    # pairwise distance table, then cluster and draw a dendrogram.
    verifier = Verification(random_state=1000, sample_features=False,
                            metric=dm, sample_authors=False,
                            n_features=5000, n_dev_pairs=10000,
                            em_iterations=100, vector_space_model=vsm,
                            weight=0.2, eps=0.01, norm="l2",
                            balanced_pairs=True)
    logging.info("Starting verification [train / test]")
    verifier.vectorize(X_dev)
    dev_results = verifier.fit()
    logging.info("Computing results")
    test_df = verifier.get_distance_table(verifier.dev_dists,
                                          verifier.dev_pairs, "dev")
    ax = plt.Subplot(fig, outer_grid[cnt])
    # complete-linkage hierarchical clustering over the distance table
    linkage_matrix = linkage(test_df, 'complete')
    f = dendrogram(linkage_matrix, truncate_mode='lastp',
                   show_leaf_counts=True, ax=ax, orientation='right',
                   labels=test_df.columns, leaf_font_size=0.5,
                   link_color_func=None,
# For each vector-space model: run dev verification, derive the pairwise
# distance table, cluster it with complete linkage, and draw a truncated
# dendrogram into the current cell of the outer plot grid.
# NOTE(review): relies on names bound elsewhere in this script
# (vsms, dm, X_dev, Verification, fig, outer_grid, cnt, plt,
# linkage, dendrogram) -- confirm against the full file.
for vsm_cnt, vsm in enumerate(vsms):
    # FIX: original used a Python 2 print statement, inconsistent with
    # the Python 3 print() calls used elsewhere in this file.
    print(vsm)
    verifier = Verification(random_state=1000, sample_features=False,
                            metric=dm, sample_authors=False,
                            n_features=5000, n_dev_pairs=10000,
                            em_iterations=100, vector_space_model=vsm,
                            weight=0.2, eps=0.01, norm="l2",
                            balanced_pairs=True)
    logging.info("Starting verification [train / test]")
    verifier.vectorize(X_dev)
    dev_results = verifier.fit()
    logging.info("Computing results")
    test_df = verifier.get_distance_table(verifier.dev_dists,
                                          verifier.dev_pairs, "dev")
    ax = plt.Subplot(fig, outer_grid[cnt])
    # complete-linkage hierarchical clustering over the distance table
    linkage_matrix = linkage(test_df, 'complete')
    # color_threshold=np.inf keeps the whole dendrogram a single color
    f = dendrogram(linkage_matrix, truncate_mode='lastp',
                   show_leaf_counts=True, ax=ax, orientation='right',
                   labels=test_df.columns, leaf_font_size=0.5,
                   link_color_func=None, color_threshold=np.inf)