示例#1
0
    # For every entry of `dms` (used both as a printable label and as the
    # `metric` argument), run one verification experiment and summarise how
    # well its scores separate same-author from different-author pairs.
    for dm_cnt, dm in enumerate(dms):
        print('\t\t* ' + dm)
        # One fresh verifier per metric; every other setting is shared.
        # feature_type='chars' with ngram_range=4 presumably selects character
        # 4-grams -- TODO confirm against the Verification class.
        verifier = Verification(random_state=random_state,
                                metric=dm,
                                feature_type='chars',
                                ngram_range=4,
                                sample_authors=False,
                                sample_features=False,
                                n_features=n_features,
                                n_test_pairs=None,
                                n_dev_pairs=n_dev_pairs,
                                vector_space_model=vsm,
                                balanced_pairs=balanced_pairs)
        logging.info("Starting verification [dev / test]")
        # NOTE(review): the input is named `test_dataset` but the output of
        # fit() is treated as dev results below -- confirm the naming.
        verifier.vectorize(test_dataset)
        dev_results = verifier.fit()
        logging.info("Computing results")
        # evaluate() yields four values; by their names these are presumably
        # F-scores, precisions, recalls and thresholds -- TODO confirm.
        dev_f, dev_p, dev_r, dev_t = evaluate(dev_results)
        # nanmax: take the best F-score while ignoring NaN entries.
        max_f = np.nanmax(dev_f)
        # NOTE(review): with the print function this emits two spaces before
        # the value (the literal's trailing space plus print's separator).
        print('\t\t + F1: ', max_f)
        # Store the best F-score as a percentage string with one decimal.
        fscore_row.append(format(max_f * 100, '.1f'))

        # distribution of scores:
        # dev_results is iterated as (label, score) pairs; split the scores
        # by their "same_author" / "diff_author" label.
        same_author_densities = np.asarray(
            [sc for c, sc in dev_results if c == "same_author"])
        diff_author_densities = np.asarray(
            [sc for c, sc in dev_results if c == "diff_author"])

        # Compare the two score samples; presumably scipy.stats.ks_2samp
        # (two-sample Kolmogorov-Smirnov test: statistic D and p-value).
        D, p = ks_2samp(same_author_densities, diff_author_densities)
        print("\t\t- KS: D = %s (p = %s)" % (D, p))
        # Presumably seaborn's style setter (`sb`), applied before plotting.
        sb.set_style("dark")
示例#2
0
                                metric=dm,
                                feature_type="words",
                                sample_authors=False,
                                sample_features=False,
                                n_features=n_features,
                                n_test_pairs=0,
                                n_dev_pairs=n_dev_pairs,
                                em_iterations=100,
                                vector_space_model=vsm,
                                weight=0.2,
                                eps=0.01,
                                norm="l2",
                                balanced_pairs=True)
        logging.info("Starting verification [dev / test]")
        # Vectorize both datasets, then collect the verifier's results.
        verifier.vectorize(X_dev, X_test)
        dev_results = verifier.fit()

        logging.info("Computing results")
        # evaluate() yields four values; by their names these are presumably
        # F-scores, precisions, recalls and thresholds -- TODO confirm.
        dev_f, dev_p, dev_r, dev_t = evaluate(dev_results)
        # nanmax: take the best F-score while ignoring NaN entries.
        max_f = np.nanmax(dev_f)
        # Single-argument parenthesised print: same output as the former
        # Python 2 print statement, and also valid on Python 3.
        print("\t\t + F1: "+str(max_f))
        # Store the best F-score as a percentage string with one decimal.
        fscore_row.append(format(max_f*100, '.1f'))

        # distribution of scores:
        # dev_results is iterated as (label, score) pairs; split the scores
        # by their "same_author" / "diff_author" label.
        same_author_densities = np.asarray([sc for c, sc in dev_results if c == "same_author"])
        diff_author_densities = np.asarray([sc for c, sc in dev_results if c == "diff_author"])

        # Compare the two score samples; presumably scipy.stats.ks_2samp
        # (two-sample Kolmogorov-Smirnov test: statistic D and p-value).
        D, p = ks_2samp(same_author_densities, diff_author_densities)
        print("\t\t- KS: D = "+str(D)+" (p = "+str(p)+")")
        sb.set_style("dark")
        # Build an Axes for cell `cnt` of `outer_grid` on `fig`; `cnt`, `fig`
        # and `outer_grid` are defined outside this excerpt.
        # NOTE(review): `sb.plt` assumes `sb` is seaborn re-exporting
        # matplotlib.pyplot; recent seaborn releases may not expose it --
        # confirm against the pinned version.
        ax = sb.plt.Subplot(fig, outer_grid[cnt])
示例#3
0
    # For every entry of `dms` (used both as a printable label and as the
    # `metric` argument), run one verification experiment and summarise how
    # well its scores separate same-author from different-author pairs.
    for dm_cnt, dm in enumerate(dms):
        print('\t\t* '+dm)
        # One fresh verifier per metric; every other setting is shared.
        # feature_type='chars' with ngram_range=4 presumably selects character
        # 4-grams -- TODO confirm against the Verification class.
        verifier = Verification(random_state=random_state,
                                metric=dm,
                                feature_type='chars',
                                ngram_range=4,
                                sample_authors=False,
                                sample_features=False,
                                n_features=n_features,
                                n_test_pairs=None,
                                n_dev_pairs=n_dev_pairs,
                                vector_space_model=vsm,
                                balanced_pairs=balanced_pairs)
        logging.info("Starting verification [dev / test]")
        # NOTE(review): the input is named `test_dataset` but the output of
        # fit() is treated as dev results below -- confirm the naming.
        verifier.vectorize(test_dataset)
        dev_results = verifier.fit()
        logging.info("Computing results")
        # evaluate() yields four values; by their names these are presumably
        # F-scores, precisions, recalls and thresholds -- TODO confirm.
        dev_f, dev_p, dev_r, dev_t = evaluate(dev_results)
        # nanmax: take the best F-score while ignoring NaN entries.
        max_f = np.nanmax(dev_f)
        # NOTE(review): with the print function this emits two spaces before
        # the value (the literal's trailing space plus print's separator).
        print('\t\t + F1: ', max_f)
        # Store the best F-score as a percentage string with one decimal.
        fscore_row.append(format(max_f*100, '.1f'))

        # distribution of scores:
        # dev_results is iterated as (label, score) pairs; split the scores
        # by their "same_author" / "diff_author" label.
        same_author_densities = np.asarray([sc for c, sc in dev_results if c == "same_author"])
        diff_author_densities = np.asarray([sc for c, sc in dev_results if c == "diff_author"])

        # Compare the two score samples; presumably scipy.stats.ks_2samp
        # (two-sample Kolmogorov-Smirnov test: statistic D and p-value).
        D, p = ks_2samp(same_author_densities, diff_author_densities)
        print("\t\t- KS: D = %s (p = %s)" %(D, p))
        # Presumably seaborn's style setter (`sb`), applied before plotting.
        sb.set_style("dark")
        # Build an Axes for cell `cnt` of `outer_grid` on `fig`; `cnt`, `fig`
        # and `outer_grid` are defined outside this excerpt.
        # NOTE(review): `sb.plt` assumes `sb` is seaborn re-exporting
        # matplotlib.pyplot; recent seaborn releases may not expose it --
        # confirm against the pinned version.
        ax = sb.plt.Subplot(fig, outer_grid[cnt])
        # Fix the x-axis to [0, 1]; presumably the score range -- TODO confirm.
        ax.set_xlim([0, 1])