示例#1
0
                        metric="minmax",
                        feature_type="words",
                        sample_authors=False,
                        sample_features=False,
                        n_features=n_features,
                        n_test_pairs=n_pairs,
                        n_dev_pairs=n_pairs,
                        em_iterations=100,
                        vector_space_model="std",
                        weight=0.2,
                        eps=0.01,
                        norm="l2",
                        balanced_pairs=False)
# Vectorize both splits and run the verifier; predict() hands back one result
# set for the dev split and one for the test split, in that order.
logging.info("Starting verification [dev / test]")
verifier.vectorize(X_dev, X_test)
dev_results, test_results = verifier.predict()
logging.info("Computing results")

# Precision-recall curve of the baseline on the test results.
# NOTE(review): evaluate() presumably returns parallel sequences of F-scores,
# precisions, recalls and thresholds (inferred from the names) -- confirm.
test_Fs, test_Ps, test_Rs, test_Ts = evaluate(test_results)
fig = sb.plt.figure()
sb.plt.xlabel("recall", fontsize=10)
sb.plt.ylabel("precision", fontsize=10)
sb.plt.xlim(0.4, 1)
sb.plt.ylim(0.4, 1.05)
sb.plt.plot(test_Rs, test_Ps, label="baseline")

# Select the threshold on the *dev* results (previously this evaluated the
# test results, which tuned the threshold on the data it is then scored on
# and left dev_results unused). nanargmax skips NaN F-scores.
dev_Fs, dev_Ps, dev_Rs, dev_Ts = evaluate(dev_results)
best_t = dev_Ts[np.nanargmax(dev_Fs)]
# Score the test results at the dev-selected threshold.
baseline_test_f, test_p, test_r = evaluate_with_threshold(test_results, t=best_t)
示例#2
0
# Show which distance metric / vector space model this run uses.
# Single-argument print(...) behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a syntax error on Python 3.
print(dm)
print(vsm)

# NOTE(review): n_dev_pairs=0 with a very large n_test_pairs presumably means
# "no dev pairs, keep every test pair" -- confirm against Verification.
verifier = Verification(
    random_state=1000,
    metric=dm,
    n_features=10000,
    n_dev_pairs=0,
    n_test_pairs=99999999,
    vector_space_model=vsm,
    balanced_pairs=False,
    control_pairs=False,
)

logging.info("Starting verification [train / test]")
verifier.vectorize(X_dev, X_test)
train_results, test_results = verifier.predict(filter_imposters=False)
logging.info("Computing results")

# Build the distance table for the test pairs and persist it to disk.
test_df = verifier.get_distance_table(
    verifier.test_dists, verifier.test_pairs, "test"
)
test_df.to_csv("../outputs/caesar_test.csv")

# Reload the table from the CSV just written and index it by its "id" column.
test_df = pd.read_csv("../outputs/caesar_test.csv")
test_df = test_df.set_index("id")
# Scale every cell by 1000 and truncate to int, then correlate the columns.
test_df = test_df.applymap(lambda x: int(x * 1000)).corr()

# heatmap plotting:
sb.heatmap(test_df)
ax = sb.plt.gca()
# Apply the same font to the tick labels on both axes.
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontname('Arial')
示例#3
0
                        metric="minmax",
                        feature_type="words",
                        sample_authors=False,
                        sample_features=False,
                        n_features=n_features,
                        n_test_pairs=n_pairs,
                        n_dev_pairs=n_pairs,
                        em_iterations=100,
                        vector_space_model="std",
                        weight=0.2,
                        eps=0.01,
                        norm="l2",
                        balanced_pairs=False)
# Vectorize both splits and run the verifier; predict() hands back one result
# set for the dev split and one for the test split, in that order.
logging.info("Starting verification [dev / test]")
verifier.vectorize(X_dev, X_test)
dev_results, test_results = verifier.predict()
logging.info("Computing results")

# Precision-recall curve of the baseline on the test results.
# NOTE(review): evaluate() presumably returns parallel sequences of F-scores,
# precisions, recalls and thresholds (inferred from the names) -- confirm.
test_Fs, test_Ps, test_Rs, test_Ts = evaluate(test_results)
fig = sb.plt.figure()
sb.plt.xlabel("recall", fontsize=10)
sb.plt.ylabel("precision", fontsize=10)
sb.plt.xlim(0.4, 1)
sb.plt.ylim(0.4, 1.05)
sb.plt.plot(test_Rs, test_Ps, label="baseline")

# Select the threshold on the *dev* results (previously this evaluated the
# test results, which tuned the threshold on the data it is then scored on
# and left dev_results unused). nanargmax skips NaN F-scores.
dev_Fs, dev_Ps, dev_Rs, dev_Ts = evaluate(dev_results)
best_t = dev_Ts[np.nanargmax(dev_Fs)]
# Score the test results at the dev-selected threshold. The original line was
# cut off after the first argument; it is completed here with t=best_t, the
# only threshold computed above.
baseline_test_f, test_p, test_r = evaluate_with_threshold(test_results, t=best_t)
# Show which distance metric / vector space model this run uses.
# Single-argument print(...) behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a syntax error on Python 3.
print(dm)
print(vsm)

# NOTE(review): n_dev_pairs=0 with a very large n_test_pairs presumably means
# "no dev pairs, keep every test pair" -- confirm against Verification.
verifier = Verification(
    random_state=1000,
    metric=dm,
    n_features=10000,
    n_dev_pairs=0,
    n_test_pairs=99999999,
    vector_space_model=vsm,
    balanced_pairs=False,
    control_pairs=False,
)

logging.info("Starting verification [train / test]")
verifier.vectorize(X_dev, X_test)
train_results, test_results = verifier.predict(filter_imposters=False)
logging.info("Computing results")

# Build the distance table for the test pairs and persist it to disk.
test_df = verifier.get_distance_table(
    verifier.test_dists, verifier.test_pairs, "test"
)
test_df.to_csv("../outputs/caesar_test.csv")

# Reload the table from the CSV just written and index it by its "id" column.
test_df = pd.read_csv("../outputs/caesar_test.csv")
test_df = test_df.set_index("id")
# Scale every cell by 1000 and truncate to int, then correlate the columns.
test_df = test_df.applymap(lambda x: int(x * 1000)).corr()

# heatmap plotting:
sb.heatmap(test_df)
ax = sb.plt.gca()
# Apply the same font and size to the tick labels on both axes.
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontname('Arial')
    label.set_fontsize(3)