# Evaluate one verifier per distance metric in `dms` and record its best
# dev-set F1 plus a two-sample test on the same/different-author scores.
for dm_cnt, dm in enumerate(dms):
    print('\t\t* ' + dm)

    # Only `metric` varies per iteration; every other setting is taken from
    # names defined outside this loop or is a fixed literal.
    verifier = Verification(
        random_state=random_state,
        metric=dm,
        feature_type='chars',
        ngram_range=4,
        sample_authors=False,
        sample_features=False,
        n_features=n_features,
        n_test_pairs=None,
        n_dev_pairs=n_dev_pairs,
        vector_space_model=vsm,
        balanced_pairs=balanced_pairs,
    )

    logging.info("Starting verification [dev / test]")
    verifier.vectorize(test_dataset)
    dev_results = verifier.fit()

    logging.info("Computing results")
    dev_f, dev_p, dev_r, dev_t = evaluate(dev_results)
    # nanmax ignores NaN entries in the F1 values when picking the maximum.
    max_f = np.nanmax(dev_f)
    print('\t\t + F1: ', max_f)
    # Stored as a percentage string with one decimal.
    fscore_row.append(format(max_f * 100, '.1f'))

    # distribution of scores:
    # dev_results is iterated as (label, score) pairs and split by label.
    same_author_densities = np.asarray(
        [score for label, score in dev_results if label == "same_author"]
    )
    diff_author_densities = np.asarray(
        [score for label, score in dev_results if label == "diff_author"]
    )
    D, p = ks_2samp(same_author_densities, diff_author_densities)
    print("\t\t- KS: D = %s (p = %s)" % (D, p))

    sb.set_style("dark")
# NOTE(review): this fragment opens partway through a call's keyword-argument
# list; the callee and the start of the call are not visible here (presumably
# a Verification(...) constructor assigned to `verifier` -- confirm).
# After the call closes, the code vectorizes X_dev / X_test, fits the verifier,
# evaluates the dev results, appends the best F1 (as a one-decimal percentage
# string) to fscore_row, runs ks_2samp on the "same_author" vs "diff_author"
# scores, and creates a subplot at outer_grid[cnt].
# The two `print` statements use Python 2 statement syntax and will not parse
# under Python 3.
metric=dm, feature_type="words", sample_authors=False, sample_features=False, n_features=n_features, n_test_pairs=0, n_dev_pairs=n_dev_pairs, em_iterations=100, vector_space_model=vsm, weight=0.2, eps=0.01, norm="l2", balanced_pairs=True) logging.info("Starting verification [dev / test]") verifier.vectorize(X_dev, X_test) dev_results = verifier.fit() logging.info("Computing results") dev_f, dev_p, dev_r, dev_t = evaluate(dev_results) max_f = np.nanmax(dev_f) print "\t\t + F1: "+str(max_f) fscore_row.append(format(max_f*100, '.1f')) # distribution of scores: same_author_densities = np.asarray([sc for c, sc in dev_results if c == "same_author"]) diff_author_densities = np.asarray([sc for c, sc in dev_results if c == "diff_author"]) D, p = ks_2samp(same_author_densities, diff_author_densities) print "\t\t- KS: D = "+str(D)+" (p = "+str(p)+")" sb.set_style("dark") ax = sb.plt.Subplot(fig, outer_grid[cnt])
# Evaluate one verifier per distance metric in `dms`, record its best dev-set
# F1 and a two-sample test on the score groups, then set up the subplot.
for dm_cnt, dm in enumerate(dms):
    print('\t\t* '+dm)

    # Only `metric` varies per iteration; every other setting is taken from
    # names defined outside this loop or is a fixed literal.
    verifier = Verification(
        random_state=random_state,
        metric=dm,
        feature_type='chars',
        ngram_range=4,
        sample_authors=False,
        sample_features=False,
        n_features=n_features,
        n_test_pairs=None,
        n_dev_pairs=n_dev_pairs,
        vector_space_model=vsm,
        balanced_pairs=balanced_pairs,
    )

    logging.info("Starting verification [dev / test]")
    verifier.vectorize(test_dataset)
    dev_results = verifier.fit()

    logging.info("Computing results")
    dev_f, dev_p, dev_r, dev_t = evaluate(dev_results)
    # nanmax ignores NaN entries in the F1 values when picking the maximum.
    max_f = np.nanmax(dev_f)
    print('\t\t + F1: ', max_f)
    # Stored as a percentage string with one decimal.
    fscore_row.append(format(max_f*100, '.1f'))

    # distribution of scores:
    # dev_results is iterated as (label, score) pairs and split by label.
    same_author_densities = np.asarray(
        [score for label, score in dev_results if label == "same_author"]
    )
    diff_author_densities = np.asarray(
        [score for label, score in dev_results if label == "diff_author"]
    )
    D, p = ks_2samp(same_author_densities, diff_author_densities)
    print("\t\t- KS: D = %s (p = %s)" %(D, p))

    sb.set_style("dark")
    # NOTE(review): `sb.plt` relies on seaborn re-exporting pyplot; newer
    # seaborn releases may not expose it -- confirm the pinned version.
    # The grid cell is selected with `cnt`, not this loop's `dm_cnt`.
    ax = sb.plt.Subplot(fig, outer_grid[cnt])
    ax.set_xlim([0, 1])