# ---- Example 1 ----
# Experiment configuration: fixed seed plus corpus / pairing / feature sizes.
random_state = 1000
data_path = "../data/"
corpus = "gr_articles"
n_pairs = 100000
n_features = 10000

logging.info("preparing corpus")
# Build the corpus and split it into dev and test partitions, with authors
# controlled across the split so both sides share the author pool.
corpus_data = prepare_corpus(data_path + corpus)
X_dev, X_test = split_corpus(corpus_data, controlled="authors", random_state=random_state)

# Single verifier: minmax metric over word features in a standard ("std")
# vector space, no author/feature sampling.
verifier = Verification(metric="minmax",
                        vector_space_model="std",
                        feature_type="words",
                        n_features=n_features,
                        n_dev_pairs=n_pairs,
                        n_test_pairs=n_pairs,
                        sample_authors=False,
                        sample_features=False,
                        em_iterations=100,
                        weight=0.2,
                        eps=0.01,
                        norm="l2",
                        balanced_pairs=False,
                        random_state=random_state)
logging.info("Starting verification [dev / test]")
verifier.vectorize(X_dev, X_test)
dev_results, test_results = verifier.predict()
logging.info("Computing results")

# Precision/recall curve over the test results:
test_Fs, test_Ps, test_Rs, test_Ts = evaluate(test_results)
fig = sb.plt.figure()
sb.plt.xlabel("recall", fontsize=10)
# ---- Example 2 ----
# Baseline grid: for every (vector space model, distance metric) combination,
# fit a plain (non-sampling) verifier and record the best dev F1-score.
df = pd.DataFrame(columns=["vector space model"] + list(dms))

for vsm_cnt, vsm in enumerate(vsms):
    print("\t+ " + vsm)
    fscore_row = [vsm]
    for dm_cnt, dm in enumerate(dms):
        print("* " + dm)
        verifier = Verification(random_state=random_state,
                                metric=dm,
                                feature_type="words",
                                sample_authors=False,
                                sample_features=False,
                                n_features=n_features,
                                n_test_pairs=0,
                                n_dev_pairs=n_dev_pairs,
                                em_iterations=100,
                                vector_space_model=vsm,
                                weight=0.2,
                                eps=0.01,
                                norm="l2",
                                balanced_pairs=True)
        logging.info("Starting verification [dev / test]")
        verifier.vectorize(X_dev, X_test)
        dev_results = verifier.fit()

        logging.info("Computing results")
        dev_f, dev_p, dev_r, dev_t = evaluate(dev_results)
        # Best F1 across thresholds; nanmax skips undefined (NaN) scores.
        max_f = np.nanmax(dev_f)
        # Fixed: was a Python 2 print statement; the rest of this file uses
        # the Python 3 print() function.
        print("\t\t + F1: " + str(max_f))
        fscore_row.append(format(max_f * 100, '.1f'))
        # NOTE(review): fscore_row is never written into df in this snippet —
        # presumably truncated; confirm against the full script.
# Sampling grid: sweep sampling-iteration counts against sampling proportions
# for the imposter-sampling verifier and collect the best dev F1 per cell.
df_dev = pd.DataFrame(columns=["nb_iterations"] + [str(n) for n in prop_ranges])

for i, iteration in enumerate(iteration_ranges):
    dev_row = [str(iteration)]
    # Fixed: Python 2 print statements converted to print() calls for
    # consistency with the Python 3 style used elsewhere in this file.
    print("* nr of sampling iterations: " + str(iteration))
    for prop in prop_ranges:
        print("\t+ sampling proportion: " + str(prop))
        # Verifier with author AND feature sampling enabled; imposter pool
        # of 60 candidates, 10 actually sampled per iteration.
        verifier = Verification(n_features=mfw,
                                feature_type="words",
                                random_prop=prop,
                                sample_features=True,
                                sample_authors=True,
                                metric=dm,
                                text_cutoff=None,
                                sample_iterations=iteration,
                                n_potential_imposters=60,
                                n_actual_imposters=10,
                                n_test_pairs=0,
                                n_dev_pairs=n_dev_pairs,
                                random_state=random_state,
                                top_rank=10,
                                vector_space_model=vsm,
                                balanced_pairs=True)
        logging.info("Starting verification [dev / test]")
        verifier.vectorize(X_dev, X_test)
        dev_results = verifier.fit()
        logging.info("Computing results")
        # get dev results:
        dev_f, dev_p, dev_r, dev_t = evaluate(dev_results)
        max_f = np.nanmax(dev_f)
        dev_row.append(max_f)
# Figure grid: one subplot cell per (vector space model, distance metric)
# combination; each cell gets a fresh non-sampling verifier.
fig = sb.plt.figure(figsize=(len(vsms), len(dms)))
cnt = 0
outer_grid = gridspec.GridSpec(len(vsms), len(dms), wspace=0.1, hspace=0.1)
c1, c2 = sb.color_palette("Set1")[:2]

for vsm_cnt, vsm in enumerate(vsms):
    print("\t+ " + vsm)
    fscore_row = [vsm]
    for dm_cnt, dm in enumerate(dms):
        print("\t\t* " + dm)
        verifier = Verification(random_state=random_state,
                                metric=dm,
                                feature_type="words",
                                sample_authors=False,
                                sample_features=False,
                                n_features=n_features,
                                n_test_pairs=0,
                                n_dev_pairs=n_dev_pairs,
                                vector_space_model=vsm,
                                balanced_pairs=True)
        logging.info("Starting verification [dev / test]")
        verifier.vectorize(X_dev, X_test)
        dev_results = verifier.fit()

        logging.info("Computing results")
        dev_f, dev_p, dev_r, dev_t = evaluate(dev_results)
        # Best F1 across thresholds (NaN-safe).
        max_f = np.nanmax(dev_f)
        # Fixed: was a Python 2 print statement; rest of the file is py3.
        print("\t\t\t+ F1: " + str(max_f))
        fscore_row.append(format(max_f * 100, '.1f'))

        # distribution of scores:
# ---- Example 5 ----
# Fit a char-ngram minmax verifier on the soldier_letters corpus and dump
# its dev distance table to CSV (guarded by a manual toggle).
corpus = 'soldier_letters'

logging.info('preparing corpus')
# Full verification dataset; no dev/test split in this experiment.
verif_dataset = prepare_corpus(data_path+corpus)

# Manual toggle: set to 1 to (re)fit the verifier and write the table.
fit = 0
if fit:
    """
    We fit a vectorizer with the best parametrization
    we obtained during the development phase.
    """
    # Char 4-gram minmax verifier in a standard vector space.
    # NOTE(review): the huge n_dev_pairs presumably means "use all available
    # pairs" — confirm against Verification's pair-sampling logic.
    verifier = Verification(random_state=1066,
                            metric='minmax',
                            feature_type='chars',
                            ngram_range=4,
                            sample_authors=False,
                            sample_features=False,
                            n_features=10000,
                            n_dev_pairs=1000000000000,
                            vector_space_model='std',
                            balanced_pairs=False)
    verifier.vectorize(verif_dataset)
    dev_results = verifier.fit()
    dev_f, dev_p, dev_r, dev_t = evaluate(dev_results)
    # Best pairwise F1 across thresholds (NaN-safe).
    max_f = np.nanmax(dev_f)
    print('\t\t + F1 (pairwise):', max_f)

    print('getting distance table')
    # Persist the pairwise dev distances for later analysis.
    df = verifier.get_distance_table(verifier.dev_dists, verifier.dev_pairs, 'dev')
    df.to_csv('outputs/dm_no_sampl.csv')
    print('saved dist table!')
# ---- Example 6 ----
# Character-ngram baseline: sweep every (vector space model, distance metric)
# pair and record the best dev F1 for each combination.
c1, c2 = sb.color_palette('Set1')[:2]

df = pd.DataFrame(columns=['vector space model'] + list(dms))

for vsm_cnt, vsm in enumerate(vsms):
    print('\t+ ' + vsm)
    fscore_row = [vsm]
    for dm_cnt, dm in enumerate(dms):
        print('\t\t* ' + dm)
        # Fresh, non-sampling verifier for this grid cell: char 4-grams,
        # dev pairs only (n_test_pairs=None).
        verifier = Verification(metric=dm,
                                vector_space_model=vsm,
                                feature_type='chars',
                                ngram_range=4,
                                n_features=n_features,
                                n_dev_pairs=n_dev_pairs,
                                n_test_pairs=None,
                                sample_authors=False,
                                sample_features=False,
                                balanced_pairs=balanced_pairs,
                                random_state=random_state)
        logging.info("Starting verification [dev / test]")
        verifier.vectorize(test_dataset)
        dev_results = verifier.fit()
        logging.info("Computing results")
        dev_f, dev_p, dev_r, dev_t = evaluate(dev_results)
        # Best F1 across thresholds, ignoring NaN entries.
        max_f = np.nanmax(dev_f)
        print('\t\t + F1: ', max_f)
        fscore_row.append(format(max_f * 100, '.1f'))

        # distribution of scores:
# ---- Example 7 ----

# Character-ngram baseline sweep: best dev F1 per (vsm, metric) combination.
# NOTE(review): this block duplicates the preceding char-ngram baseline almost
# verbatim; consider factoring the grid search into a shared helper.
# first baseline:
df = pd.DataFrame(columns=['vector space model']+list(dms))

for vsm_cnt, vsm in enumerate(vsms):
    print('\t+ '+vsm)
    fscore_row = [vsm]
    for dm_cnt, dm in enumerate(dms):
        print('\t\t* '+dm)
        # Non-sampling verifier over char 4-grams; dev pairs only.
        verifier = Verification(random_state=random_state,
                                metric=dm,
                                feature_type='chars',
                                ngram_range=4,
                                sample_authors=False,
                                sample_features=False,
                                n_features=n_features,
                                n_test_pairs=None,
                                n_dev_pairs=n_dev_pairs,
                                vector_space_model=vsm,
                                balanced_pairs=balanced_pairs)
        logging.info("Starting verification [dev / test]")
        verifier.vectorize(test_dataset)
        dev_results = verifier.fit()
        logging.info("Computing results")
        dev_f, dev_p, dev_r, dev_t = evaluate(dev_results)
        # Best F1 across thresholds (NaN-safe).
        max_f = np.nanmax(dev_f)
        print('\t\t + F1: ', max_f)
        fscore_row.append(format(max_f*100, '.1f'))

        # distribution of scores:
# ---- Example 8 ----
from itertools import chain

from verification.verification import Verification
from verification.preprocessing import prepare_corpus, split_corpus

data_path = "../data/"
corpora = ["du_essays", "gr_articles", "sp_articles"]
random_state = 1000
# Per-corpus summary table: token counts plus same-author (SADP) and
# different-author (DADP) document-pair counts on the dev split.
df = pd.DataFrame(columns=["name", "total words", "unique words", "authors", "docs", "SADPs", "DADPs"])

for corpus in corpora:
    print("=== " + corpus + " ===")
    # prepare data:
    X_dev, X_test = split_corpus(prepare_corpus(data_path + corpus), controlled="authors", random_state=random_state)
    # Verifier used only to enumerate *all* pairs (no pair limits).
    verifier = Verification(random_state=random_state,
                            n_test_pairs=None,
                            n_dev_pairs=None,
                            balanced_pairs=False)
    verifier.vectorize(X_dev, X_test)
    # Dev-split statistics:
    nr_docs = len(X_dev.texts)
    total_nr_words = sum(len(doc) for doc in X_dev.texts)
    # chain.from_iterable flattens in O(total tokens); the original
    # sum(X_dev.texts, []) built the concatenation quadratically.
    unique_words = len(set(chain.from_iterable(X_dev.texts)))
    distinct_authors = len(set(X_dev.authors))
    dev_pairs = verifier._setup_pairs(phase="dev")
    SADPs, DADPs = 0, 0
    for i, j in dev_pairs:
        if X_dev.authors[i] == X_dev.authors[j]:
            SADPs += 1
        else:
            DADPs += 1
    row = [corpus + " (dev)", total_nr_words, unique_words, distinct_authors, nr_docs, SADPs, DADPs]
    # NOTE(review): row is never appended to df in this snippet — presumably
    # truncated; confirm against the full script.
# ---- Example 9 ----
# Score all pairs of the Caesar test corpus with a tf/minmax verifier and
# dump the resulting distance table to CSV.
logging.info("preparing corpus")
# Dev and test are deliberately the same data: with n_dev_pairs=0 every
# pair is scored in the test phase.
X_dev = prepare_corpus(test)
X_test = prepare_corpus(test)

dm = 'minmax'
vsm = 'tf'

# Fixed: Python 2 print statements converted to print() calls, matching
# the Python 3 style used in the rest of this file.
print(dm)
print(vsm)

verifier = Verification(random_state=1000,
                        metric=dm,
                        n_features=10000,
                        n_dev_pairs=0,
                        n_test_pairs=99999999,
                        vector_space_model=vsm,
                        balanced_pairs=False,
                        control_pairs=False)

logging.info("Starting verification [train / test]")
verifier.vectorize(X_dev, X_test)
train_results, test_results = verifier.predict(filter_imposters=False)
logging.info("Computing results")

# Persist the pairwise test distances, then reload them indexed by doc id.
test_df = verifier.get_distance_table(verifier.test_dists, verifier.test_pairs,
                                      "test")
test_df.to_csv("../outputs/caesar_test.csv")

test_df = pd.read_csv("../outputs/caesar_test.csv")
test_df = test_df.set_index("id")
# ---- Example 10 ----
vsms = ('std', 'plm', 'tf')

fig = plt.figure()
cnt = 0
outer_grid = gridspec.GridSpec(len(dms), len(vsms))

for dm_cnt, dm in enumerate(dms):
    print dm
    for vsm_cnt, vsm in enumerate(vsms):
        print vsm
        verifier = Verification(random_state=1000,
                                sample_features=False,
                                metric=dm,
                                sample_authors=False,
                                n_features=5000,
                                n_dev_pairs=10000,
                                em_iterations=100,
                                vector_space_model=vsm,
                                weight=0.2,
                                eps=0.01,
                                norm="l2",
                                balanced_pairs=True)
        logging.info("Starting verification [train / test]")
        verifier.vectorize(X_dev)
        dev_results = verifier.fit()
        logging.info("Computing results")

        test_df = verifier.get_distance_table(verifier.dev_dists,
                                              verifier.dev_pairs, "dev")
        ax = plt.Subplot(fig, outer_grid[cnt])
        linkage_matrix = linkage(test_df, 'complete')
        f = dendrogram(linkage_matrix,
# ---- Example 11 ----
# Score all Caesar test pairs with a tf/minmax verifier, dump the distance
# table, then quantize it and compute column-wise correlations.
logging.info("preparing corpus")
# Dev and test share the same data: n_dev_pairs=0 routes all pairs to test.
X_dev = prepare_corpus(test)
X_test = prepare_corpus(test)


dm = 'minmax'
vsm = 'tf'

# Fixed: Python 2 print statements converted to print() calls, matching
# the Python 3 style used in the rest of this file.
print(dm)
print(vsm)

verifier = Verification(random_state=1000,
                        metric=dm,
                        n_features=10000,
                        n_dev_pairs=0,
                        n_test_pairs=99999999,
                        vector_space_model=vsm,
                        balanced_pairs=False,
                        control_pairs=False)

logging.info("Starting verification [train / test]")
verifier.vectorize(X_dev, X_test)
train_results, test_results = verifier.predict(filter_imposters=False)
logging.info("Computing results")

test_df = verifier.get_distance_table(verifier.test_dists, verifier.test_pairs, "test")
test_df.to_csv("../outputs/caesar_test.csv")

test_df = pd.read_csv("../outputs/caesar_test.csv")
test_df = test_df.set_index("id")
# Quantize distances to integer thousandths, then correlate the columns.
test_df = test_df.applymap(lambda x: int(x * 1000)).corr()
# ---- Example 12 ----
# Repeat the (vsm x metric) baseline sweep once per feature type, recording
# the best dev F1 for each combination.
for ftype in ftypes:
    # first baseline:
    df = pd.DataFrame(columns=["vector space model"] + list(dms))

    for vsm_cnt, vsm in enumerate(vsms):
        print("\t+ " + vsm)
        fscore_row = [vsm]
        for dm_cnt, dm in enumerate(dms):
            print("\t\t* " + dm)
            verifier = Verification(random_state=random_state,
                                    metric=dm,
                                    feature_type=ftype,
                                    sample_authors=False,
                                    sample_features=False,
                                    n_features=n_features,
                                    n_test_pairs=0,
                                    ngram_range=4,
                                    n_dev_pairs=n_dev_pairs,
                                    vector_space_model=vsm,
                                    balanced_pairs=True)
            logging.info("Starting verification [dev / test]")
            verifier.vectorize(X_dev, X_test)
            dev_results = verifier.fit()

            logging.info("Computing results")
            dev_f, dev_p, dev_r, dev_t = evaluate(dev_results)
            # Best F1 across thresholds (NaN-safe).
            max_f = np.nanmax(dev_f)
            # Fixed: was a Python 2 print statement; rest of the file is py3.
            print("\t\t\t + F1: " + str(max_f))
            fscore_row.append(format(max_f * 100, '.1f'))
# ---- Example 13 ----
# Dendrogram grid: one clustering cell per (distance metric, vector space
# model) combination over the dev distance table.
fig = plt.figure()
cnt = 0
outer_grid = gridspec.GridSpec(len(dms), len(vsms))


for dm_cnt, dm in enumerate(dms):
    # Fixed: Python 2 print statements converted to print() calls.
    print(dm)
    for vsm_cnt, vsm in enumerate(vsms):
        print(vsm)
        verifier = Verification(random_state=1000,
                                sample_features=False,
                                metric=dm,
                                sample_authors=False,
                                n_features=5000,
                                n_dev_pairs=10000,
                                em_iterations=100,
                                vector_space_model=vsm,
                                weight=0.2,
                                eps=0.01,
                                norm="l2",
                                balanced_pairs=True)
        logging.info("Starting verification [train / test]")
        verifier.vectorize(X_dev)
        dev_results = verifier.fit()
        logging.info("Computing results")

        test_df = verifier.get_distance_table(verifier.dev_dists, verifier.dev_pairs, "dev")
        ax = plt.Subplot(fig, outer_grid[cnt])
        # Complete-linkage hierarchical clustering over the distance table.
        linkage_matrix = linkage(test_df, 'complete')
        f = dendrogram(linkage_matrix,
                   truncate_mode='lastp',