Exemplo n.º 1
0
        # Configure a verifier for the current distance metric (`dm`) and
        # vector space model (`vsm`) on word features, without author or
        # feature sampling.  n_test_pairs=0: only development pairs are
        # requested here.
        verifier = Verification(random_state=random_state,
                                metric=dm,
                                feature_type="words",
                                sample_authors=False,
                                sample_features=False,
                                n_features=n_features,
                                n_test_pairs=0,
                                n_dev_pairs=n_dev_pairs,
                                em_iterations=100,
                                vector_space_model=vsm,
                                weight=0.2,
                                eps=0.01,
                                norm="l2",
                                balanced_pairs=True)
        logging.info("Starting verification [dev / test]")
        verifier.vectorize(X_dev, X_test)
        dev_results = verifier.fit()

        logging.info("Computing results")
        dev_f, dev_p, dev_r, dev_t = evaluate(dev_results)
        # nanmax skips NaN entries in the F-score array.
        # NOTE(review): dev_f/dev_p/dev_r/dev_t are presumably F1, precision,
        # recall and threshold arrays -- confirm against `evaluate`.
        max_f = np.nanmax(dev_f)
        # Function-call form of print: with a single argument it produces the
        # same output on Python 2 and is valid syntax on Python 3.
        print("\t\t + F1: " + str(max_f))
        # Store the best F-score as a percentage string with one decimal.
        fscore_row.append(format(max_f * 100, '.1f'))

        # Distribution of scores: split the (label, score) pairs returned by
        # fit() into same-author and different-author score arrays.
        same_author_densities = np.asarray([sc for c, sc in dev_results if c == "same_author"])
        diff_author_densities = np.asarray([sc for c, sc in dev_results if c == "diff_author"])

        # Compare the two score samples; D and p are the values returned by
        # ks_2samp (presumably scipy.stats.ks_2samp -- confirm the import).
        D, p = ks_2samp(same_author_densities, diff_author_densities)
        print("\t\t- KS: D = " + str(D) + " (p = " + str(p) + ")")
        sb.set_style("dark")
Exemplo n.º 2
0
    # One results row per vector space model: the model name first, then one
    # formatted F-score per distance metric.
    fscore_row = [vsm]
    for dm_cnt, dm in enumerate(dms):
        print('\t\t* ' + dm)
        # Character 4-gram verifier for this metric / vector space model,
        # with author and feature sampling switched off.
        verifier = Verification(
            random_state=random_state,
            metric=dm,
            feature_type='chars',
            ngram_range=4,
            sample_authors=False,
            sample_features=False,
            n_features=n_features,
            n_test_pairs=None,
            n_dev_pairs=n_dev_pairs,
            vector_space_model=vsm,
            balanced_pairs=balanced_pairs)
        logging.info("Starting verification [dev / test]")
        verifier.vectorize(test_dataset)
        dev_results = verifier.fit()
        logging.info("Computing results")
        dev_f, dev_p, dev_r, dev_t = evaluate(dev_results)
        # Best F-score, ignoring NaN entries.
        max_f = np.nanmax(dev_f)
        print('\t\t + F1: ', max_f)
        # Recorded as a percentage string with one decimal.
        fscore_row.append(format(max_f * 100, '.1f'))

        # Distribution of scores, split by the label of each
        # (label, score) pair in the fit() results.
        same_author_densities = np.asarray(
            [score for label, score in dev_results
             if label == "same_author"])
        diff_author_densities = np.asarray(
            [score for label, score in dev_results
             if label == "diff_author"])

        # Compare the same-author and different-author score samples.
        D, p = ks_2samp(same_author_densities, diff_author_densities)
        print("\t\t- KS: D = %s (p = %s)" % (D, p))
Exemplo n.º 3
0
if fit:
    # Fit a vectorizer with the best parametrization obtained during the
    # development phase, then dump the pairwise distance table to CSV.
    verifier = Verification(
        random_state=1066,
        metric='minmax',
        feature_type='chars',
        ngram_range=4,
        sample_authors=False,
        sample_features=False,
        n_features=10000,
        n_dev_pairs=1000000000000,
        vector_space_model='std',
        balanced_pairs=False)
    verifier.vectorize(verif_dataset)
    dev_results = verifier.fit()
    dev_f, dev_p, dev_r, dev_t = evaluate(dev_results)
    # Best F-score, ignoring NaN entries.
    max_f = np.nanmax(dev_f)
    print('\t\t + F1 (pairwise):', max_f)

    print('getting distance table')
    df = verifier.get_distance_table(
        verifier.dev_dists,
        verifier.dev_pairs,
        'dev')
    df.to_csv('outputs/dm_no_sampl.csv')
    print('saved dist table!')

# Always work from the table on disk (freshly written above when `fit` is
# true), indexed by its 'id' column.
df = pd.read_csv('outputs/dm_no_sampl.csv').set_index('id')
print('loaded dist table!')

top_scores = 0
Exemplo n.º 4
0
    # One results row per vector space model: the model name first, then one
    # formatted F-score appended per distance metric.
    fscore_row = [vsm]
    for dm_cnt, dm in enumerate(dms):
        print('\t\t* '+dm)
        # Character 4-gram verifier for this metric / vector space model,
        # with author and feature sampling switched off.
        verifier = Verification(random_state=random_state,
                                metric=dm,
                                feature_type='chars',
                                ngram_range=4,
                                sample_authors=False,
                                sample_features=False,
                                n_features=n_features,
                                n_test_pairs=None,
                                n_dev_pairs=n_dev_pairs,
                                vector_space_model=vsm,
                                balanced_pairs=balanced_pairs)
        logging.info("Starting verification [dev / test]")
        verifier.vectorize(test_dataset)
        dev_results = verifier.fit()
        logging.info("Computing results")
        # NOTE(review): presumably F1, precision, recall and threshold
        # arrays -- confirm against `evaluate`.
        dev_f, dev_p, dev_r, dev_t = evaluate(dev_results)
        # Best F-score, ignoring NaN entries.
        max_f = np.nanmax(dev_f)
        print('\t\t + F1: ', max_f)
        # Recorded as a percentage string with one decimal.
        fscore_row.append(format(max_f*100, '.1f'))

        # Distribution of scores: split the (label, score) pairs from fit()
        # into same-author and different-author score arrays.
        same_author_densities = np.asarray([sc for c, sc in dev_results if c == "same_author"])
        diff_author_densities = np.asarray([sc for c, sc in dev_results if c == "diff_author"])

        # Compare the same-author and different-author score samples.
        D, p = ks_2samp(same_author_densities, diff_author_densities)
        print("\t\t- KS: D = %s (p = %s)" %(D, p))
        sb.set_style("dark")
        # NOTE(review): `sb.plt` is not a documented seaborn attribute and
        # may be missing in newer seaborn releases -- confirm, or use
        # matplotlib.pyplot directly.
        # NOTE(review): `cnt` is not assigned in the visible code (the loop
        # counter here is `dm_cnt`) -- verify where it is maintained.
        ax = sb.plt.Subplot(fig, outer_grid[cnt])
Exemplo n.º 5
0
    # For each vector space model: fit a verifier on the development data,
    # build its distance table and cluster it for a dendrogram plot.
    for vsm_cnt, vsm in enumerate(vsms):
        # Function-call form of print: identical output on Python 2 for a
        # single argument, and valid syntax on Python 3.
        print(vsm)
        verifier = Verification(random_state=1000,
                                sample_features=False,
                                metric=dm,
                                sample_authors=False,
                                n_features=5000,
                                n_dev_pairs=10000,
                                em_iterations=100,
                                vector_space_model=vsm,
                                weight=0.2,
                                eps=0.01,
                                norm="l2",
                                balanced_pairs=True)
        logging.info("Starting verification [train / test]")
        verifier.vectorize(X_dev)
        dev_results = verifier.fit()
        logging.info("Computing results")

        test_df = verifier.get_distance_table(verifier.dev_dists,
                                              verifier.dev_pairs, "dev")
        # NOTE(review): `cnt` is not assigned in the visible code (the loop
        # counter here is `vsm_cnt`) -- verify where it is maintained.
        ax = plt.Subplot(fig, outer_grid[cnt])
        # Complete-linkage hierarchical clustering of the table.
        # NOTE(review): if `linkage` is scipy.cluster.hierarchy.linkage, a
        # 2-D input is treated as observation vectors, not as a square
        # distance matrix; a square distance table would normally be
        # condensed first (scipy.spatial.distance.squareform) -- confirm
        # which behaviour is intended.
        linkage_matrix = linkage(test_df, 'complete')
        f = dendrogram(linkage_matrix,
                       truncate_mode='lastp',
                       show_leaf_counts=True,
                       ax=ax,
                       orientation='right',
                       labels=test_df.columns,
                       leaf_font_size=0.5,
                       link_color_func=None,
Exemplo n.º 6
0
 # For each vector space model: fit a verifier on the development data,
 # build its distance table and draw a complete-linkage dendrogram of it.
 for vsm_cnt, vsm in enumerate(vsms):
     # Function-call form of print: identical output on Python 2 for a
     # single argument, and valid syntax on Python 3.
     print(vsm)
     verifier = Verification(random_state=1000,
                             sample_features=False,
                             metric=dm,
                             sample_authors=False,
                             n_features=5000,
                             n_dev_pairs=10000,
                             em_iterations=100,
                             vector_space_model=vsm,
                             weight=0.2,
                             eps=0.01,
                             norm="l2",
                             balanced_pairs=True)
     logging.info("Starting verification [train / test]")
     verifier.vectorize(X_dev)
     dev_results = verifier.fit()
     logging.info("Computing results")

     test_df = verifier.get_distance_table(verifier.dev_dists, verifier.dev_pairs, "dev")
     # NOTE(review): `cnt` is not assigned in the visible code (the loop
     # counter here is `vsm_cnt`) -- verify where it is maintained.
     ax = plt.Subplot(fig, outer_grid[cnt])
     # NOTE(review): if `linkage` is scipy.cluster.hierarchy.linkage, a 2-D
     # input is treated as observation vectors, not as a square distance
     # matrix; a square distance table would normally be condensed first
     # (scipy.spatial.distance.squareform) -- confirm the intent.
     linkage_matrix = linkage(test_df, 'complete')
     # color_threshold=np.inf together with link_color_func=None: no
     # per-link colour function is supplied and the colour threshold is
     # set to infinity.
     f = dendrogram(linkage_matrix,
                    truncate_mode='lastp',
                    show_leaf_counts=True,
                    ax=ax,
                    orientation='right',
                    labels=test_df.columns,
                    leaf_font_size=0.5,
                    link_color_func=None,
                    color_threshold=np.inf)