Exemplo n.º 1
0
                            feature_type='chars',
                            ngram_range=4,
                            sample_authors=False,
                            sample_features=False,
                            n_features=10000,
                            n_dev_pairs=1000000000000,
                            vector_space_model='std',
                            balanced_pairs=False)
    # Vectorize the verification dataset, fit the verifier on it, and report
    # the best pairwise F1 among the evaluated dev scores. np.nanmax skips NaN
    # entries, so undefined F1 values do not mask the maximum.
    # NOTE(review): evaluate() is defined elsewhere; the four return values are
    # presumably F1 / precision / recall / threshold arrays -- confirm.
    verifier.vectorize(verif_dataset)
    dev_results = verifier.fit()
    dev_f, dev_p, dev_r, dev_t = evaluate(dev_results)
    max_f = np.nanmax(dev_f)
    print('\t\t + F1 (pairwise):', max_f)

    # Build the distance table for the dev pairs and cache it as CSV so that
    # later steps can reload it from disk instead of recomputing it.
    print('getting distance table')
    df = verifier.get_distance_table(verifier.dev_dists, verifier.dev_pairs, 'dev')
    df.to_csv('outputs/dm_no_sampl.csv')
    print('saved dist table!')

# Reload the cached distance table from disk and key its rows by the 'id'
# column written out with the CSV.
df = pd.read_csv('outputs/dm_no_sampl.csv').set_index('id')
print('loaded dist table!')

top_scores = 0
if top_scores:
    scores = []
    for author1 in df.index:
        for author2 in df.columns:
            if author1.split('_', 1)[0] != author2.split('_', 1)[0]:
                sc = df[author1][author2]
                if sc > 0:
Exemplo n.º 2
0
# Train/test verification run: no dev pairs are requested and the cap on test
# pairs is set very high.
verifier = Verification(random_state=1000,
                        metric=dm,
                        n_features=10000,
                        n_dev_pairs=0,
                        n_test_pairs=99999999,
                        vector_space_model=vsm,
                        balanced_pairs=False,
                        control_pairs=False)

logging.info("Starting verification [train / test]")
verifier.vectorize(X_dev, X_test)
train_results, test_results = verifier.predict(filter_imposters=False)
logging.info("Computing results")

# Persist the test distance table, then reload it so the frame used below has
# the on-disk layout (including the "id" column that becomes the index).
test_df = verifier.get_distance_table(verifier.test_dists, verifier.test_pairs,
                                      "test")
test_df.to_csv("../outputs/caesar_test.csv")

test_df = pd.read_csv("../outputs/caesar_test.csv")
test_df = test_df.set_index("id")
# Scale by 1000 and truncate towards zero to integers before correlating the
# columns. This is the vectorized equivalent of applying
# ``int(x * 1000)`` element-wise, without the deprecated DataFrame.applymap.
test_df = (test_df * 1000).astype("int64").corr()

# heatmap plotting:
# sb.heatmap returns the Axes it drew on; use it (and its figure) directly
# instead of the `sb.plt` alias, which current seaborn no longer exposes.
ax = sb.heatmap(test_df)
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontname('Arial')
    label.set_fontsize(3)
ax.figure.savefig("../outputs/caesar_imposter_heatmap.pdf")
# Clear the figure so later plots do not draw on top of this heatmap.
ax.figure.clf()
Exemplo n.º 3
0
# Configure the verifier for a train/test run (zero dev pairs, very high cap
# on the number of test pairs).
verifier = Verification(random_state=1000,
                        metric=dm,
                        n_features=10000,
                        n_dev_pairs=0,
                        n_test_pairs=99999999,
                        vector_space_model=vsm,
                        balanced_pairs=False,
                        control_pairs=False)

logging.info("Starting verification [train / test]")
verifier.vectorize(X_dev, X_test)
train_results, test_results = verifier.predict(filter_imposters=False)
logging.info("Computing results")

# Write the test distance table to disk and read it back, so the frame below
# carries the CSV layout with an "id" column to index on.
test_df = verifier.get_distance_table(verifier.test_dists, verifier.test_pairs, "test")
test_df.to_csv("../outputs/caesar_test.csv")

test_df = pd.read_csv("../outputs/caesar_test.csv")
test_df = test_df.set_index("id")
# Column-wise correlation of the distances after scaling by 1000 and
# truncating to integers. Done as a vectorized cast because
# DataFrame.applymap is deprecated in recent pandas releases.
test_df = (test_df * 1000).astype("int64").corr()

# heatmap plotting:
# Work with the Axes returned by sb.heatmap rather than `sb.plt`: the pyplot
# alias is not available on current seaborn and raises AttributeError.
ax = sb.heatmap(test_df)
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontname('Arial')
    label.set_fontsize(3)
ax.figure.savefig("../outputs/caesar_imposter_heatmap.pdf")
# Reset the figure so that subsequent plotting starts from a clean canvas.
ax.figure.clf()
Exemplo n.º 4
0
                                metric=dm,
                                sample_authors=False,
                                n_features=5000,
                                n_dev_pairs=10000,
                                em_iterations=100,
                                vector_space_model=vsm,
                                weight=0.2,
                                eps=0.01,
                                norm="l2",
                                balanced_pairs=True)
        # Fit the verifier on the development data only and draw its distance
        # table as a dendrogram in cell `cnt` of the outer grid. Note that the
        # log message still says "[train / test]" although only X_dev is used.
        logging.info("Starting verification [train / test]")
        verifier.vectorize(X_dev)
        dev_results = verifier.fit()
        logging.info("Computing results")

        # Despite its name, test_df holds the *dev* distance table here.
        test_df = verifier.get_distance_table(verifier.dev_dists,
                                              verifier.dev_pairs, "dev")
        # NOTE(review): the Subplot is created but not attached to `fig` in
        # the lines visible here -- presumably done further down; confirm.
        ax = plt.Subplot(fig, outer_grid[cnt])
        # NOTE(review): the table is passed to linkage() as-is. If it is a
        # square pairwise-distance matrix, SciPy treats each row as an
        # observation vector rather than as precomputed distances (which would
        # need the condensed form) -- confirm this is intended.
        linkage_matrix = linkage(test_df, 'complete')
        # orientation='right' places the leaf labels on the y axis, which is
        # why the y tick labels are the ones restyled below.
        f = dendrogram(linkage_matrix,
                       truncate_mode='lastp',
                       show_leaf_counts=True,
                       ax=ax,
                       orientation='right',
                       labels=test_df.columns,
                       leaf_font_size=0.5,
                       link_color_func=None,
                       color_threshold=np.inf)
        # Shrink and grey out the leaf labels; this overrides the
        # leaf_font_size passed to dendrogram() above.
        tickL = ax.yaxis.get_ticklabels()
        for t in tickL:
            t.set_fontsize(1)
            t.set_color('grey')
Exemplo n.º 5
0
                         metric=dm,
                         sample_authors=False,
                         n_features=5000,
                         n_dev_pairs=10000,
                         em_iterations=100,
                         vector_space_model=vsm,
                         weight=0.2,
                         eps=0.01,
                         norm="l2",
                         balanced_pairs=True)
 # Fit the verifier on the development data only and draw its distance table
 # as a dendrogram in cell `cnt` of the outer grid. The log message still says
 # "[train / test]" although only X_dev is used.
 logging.info("Starting verification [train / test]")
 verifier.vectorize(X_dev)
 dev_results = verifier.fit()
 logging.info("Computing results")
 
 # Despite its name, test_df holds the *dev* distance table here.
 test_df = verifier.get_distance_table(verifier.dev_dists, verifier.dev_pairs, "dev")
 # NOTE(review): the Subplot is created but not attached to `fig` in the
 # lines visible here -- presumably done further down; confirm.
 ax = plt.Subplot(fig, outer_grid[cnt])
 # NOTE(review): the table is passed to linkage() as-is. If it is a square
 # pairwise-distance matrix, SciPy treats each row as an observation vector
 # rather than as precomputed distances -- confirm this is intended.
 linkage_matrix = linkage(test_df, 'complete')
 # orientation='right' places the leaf labels on the y axis, which is why the
 # y tick labels are the ones restyled below.
 f = dendrogram(linkage_matrix,
            truncate_mode='lastp',
            show_leaf_counts=True,
            ax=ax,
            orientation='right',
            labels=test_df.columns,
            leaf_font_size=0.5,
            link_color_func=None,
            color_threshold=np.inf)
 # Shrink and grey out the leaf labels; this overrides the leaf_font_size
 # passed to dendrogram() above.
 tickL = ax.yaxis.get_ticklabels()
 for t in tickL:
     t.set_fontsize(1)
     t.set_color('grey')