def make_plot_from_text(sim_function,
                        texts=['../data/enwiki_pairs.txt'],
                        labels=['pairs'],
                        colors=['red'],
                        N=5100000,
                        w2v_model=None,
                        docfreqs=None,
                        output='plot.png'):
    """Plot histograms of ``process_pairs`` results and report the best split.

    Every path in ``texts`` is handed to ``process_pairs`` together with
    ``sim_function``, ``N``, ``w2v_model`` and ``docfreqs``.  The results are
    drawn on a freshly cleared figure as 300-bin ``step`` histograms
    (``normed=1``) using ``labels`` and ``colors``, and the figure is saved
    to ``output``.

    When at least two data sets were produced, the first two are passed to
    ``metrics.optimal_error_rate`` and the resulting error and split point
    are printed.  With fewer than two data sets (which includes the default
    ``texts``) that step is skipped instead of failing with ``IndexError``
    after the plot has already been written.

    The list defaults are only read, never modified, so sharing them between
    calls is harmless.

    Returns:
        None.  The function works through its side effects (saved figure and
        printed report).
    """
    # NOTE(review): this module defines make_plot_from_text twice; the later
    # definition is the one that stays bound after import.
    # Presumably each result is a sequence of similarity scores -- confirm
    # against process_pairs.
    data = [process_pairs(sim_function, t, N, w2v_model, docfreqs)
            for t in texts]

    plt.clf()
    plt.hist(data,
             300,
             normed=1,
             histtype='step',
             label=labels,
             color=colors)
    # plt.legend()  -- disabled in the original; labels are set but not drawn.
    plt.savefig(output)

    # The error rate compares exactly two distributions; without a second one
    # there is nothing to compare, so stop after saving the plot.
    if len(data) < 2:
        print('Skipping optimal error rate: need at least two data sets.')
        return

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Calculating optimal error rate...')
    (error, split) = metrics.optimal_error_rate(data[0], data[1])
    print('Optimal error: %.5f' % error)
    print('Optimal split point: %.5f' % split)
def make_plot_from_text(sim_function,
                        texts=['../data/enwiki_pairs.txt'],
                        labels=['pairs'],
                        colors=['red'],
                        N=5100000,
                        w2v_model=None,
                        docfreqs=None,
                        output='plot.png'):
    """Plot histograms of ``process_pairs`` results and report the best split.

    Every path in ``texts`` is handed to ``process_pairs`` together with
    ``sim_function``, ``N``, ``w2v_model`` and ``docfreqs``.  The results are
    drawn on a freshly cleared figure as 300-bin ``step`` histograms
    (``normed=1``) using ``labels`` and ``colors``, and the figure is saved
    to ``output``.

    When at least two data sets were produced, the first two are passed to
    ``metrics.optimal_error_rate`` and the resulting error and split point
    are printed.  With fewer than two data sets (which includes the default
    ``texts``) that step is skipped instead of failing with ``IndexError``
    after the plot has already been written.

    The list defaults are only read, never modified, so sharing them between
    calls is harmless.

    Returns:
        None.  The function works through its side effects (saved figure and
        printed report).
    """
    # NOTE(review): this redefines a function of the same name that appears
    # earlier in the module; this later definition is the one that is used.
    # Presumably each result is a sequence of similarity scores -- confirm
    # against process_pairs.
    data = [process_pairs(sim_function, t, N, w2v_model, docfreqs)
            for t in texts]

    plt.clf()
    plt.hist(data, 300, normed=1, histtype='step', label=labels, color=colors)
    # plt.legend()  -- disabled in the original; labels are set but not drawn.
    plt.savefig(output)

    # The error rate compares exactly two distributions; without a second one
    # there is nothing to compare, so stop after saving the plot.
    if len(data) < 2:
        print('Skipping optimal error rate: need at least two data sets.')
        return

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Calculating optimal error rate...')
    (error, split) = metrics.optimal_error_rate(data[0], data[1])
    print('Optimal error: %.5f' % error)
    print('Optimal split point: %.5f' % split)
def calculate_split_from_table(tables=['../data/tfidf-pairs.txt'],
                               verbose=True,
                               normalize=(0.0, 1.0)):
    """Load saved arrays, rescale them and return the optimal split point.

    Each path in ``tables`` is opened in binary mode and read with
    ``np.load``.  Every loaded array is then rescaled as
    ``(x - normalize[0]) / (normalize[1] - normalize[0])`` and the first two
    arrays are passed to ``metrics.optimal_error_rate``.

    Args:
        tables: Paths of the files to load.  At least two are required; note
            that the default lists only one.
        verbose: When true, print progress plus the error and split point.
        normalize: Pair ``(low, high)`` used for the rescaling above.

    Returns:
        The split point returned by ``metrics.optimal_error_rate``.

    Raises:
        IndexError: If fewer than two tables were supplied.
    """
    # NOTE(review): this module defines calculate_split_from_table twice; the
    # later definition is the one that stays bound after import.
    data = []
    for path in tables:
        # Close each file as soon as it has been read instead of leaking the
        # handle until garbage collection.
        with open(path, 'rb') as handle:
            data.append(np.load(handle))

    low = normalize[0]
    span = normalize[1] - normalize[0]
    data = [(values - low) / span for values in data]

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    if verbose:
        print('Calculating optimal error rate...')
    (error, split) = metrics.optimal_error_rate(data[0], data[1])
    if verbose:
        print('Optimal error: %.5f' % error)
        print('Optimal split point: %.5f' % split)
    return split
def calculate_split_from_table(tables=['../data/tfidf-pairs.txt'],
                               verbose=True,
                               normalize=(0.0, 1.0)):
    """Load saved arrays, rescale them and return the optimal split point.

    Each path in ``tables`` is opened in binary mode and read with
    ``np.load``.  Every loaded array is then rescaled as
    ``(x - normalize[0]) / (normalize[1] - normalize[0])`` and the first two
    arrays are passed to ``metrics.optimal_error_rate``.

    Args:
        tables: Paths of the files to load.  At least two are required; note
            that the default lists only one.
        verbose: When true, print progress plus the error and split point.
        normalize: Pair ``(low, high)`` used for the rescaling above.

    Returns:
        The split point returned by ``metrics.optimal_error_rate``.

    Raises:
        IndexError: If fewer than two tables were supplied.
    """
    # NOTE(review): this redefines a function of the same name that appears
    # earlier in the module; this later definition is the one that is used.
    data = []
    for path in tables:
        # Close each file as soon as it has been read instead of leaking the
        # handle until garbage collection.
        with open(path, 'rb') as handle:
            data.append(np.load(handle))

    low = normalize[0]
    span = normalize[1] - normalize[0]
    data = [(values - low) / span for values in data]

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    if verbose:
        print('Calculating optimal error rate...')
    (error, split) = metrics.optimal_error_rate(data[0], data[1])
    if verbose:
        print('Optimal error: %.5f' % error)
        print('Optimal split point: %.5f' % split)
    return split