# sp.make_plot_from_table(tables, labels=labels, colors=colors, output='minmaxcontop-google.png', log=False, normalize=(min, max))
# print ""
#
# tables = ['../data/google/pairs/sets/wmean_no_pairs_r-validation.npy', '../data/google/pairs/sets/wmean_pairs_r-validation.npy']
# min, max = sp.calculate_min_max_from_table(tables)
# split = sp.calculate_split_from_table(tables, verbose=False, normalize=(min, max))
# sp.make_plot_from_table(tables, labels=labels, colors=colors, output='wmean-google-val.png', log=False, normalize=(min, max))
# tables = ['../data/google/pairs/sets/wmean_no_pairs_r-test.npy', '../data/google/pairs/sets/wmean_pairs_r-test.npy']
# sp.calculate_error_rate_from_table(tables, split, normalize=(min, max))
# sp.calculate_hellinger_distance_from_table(tables, normalize=(min, max))
# sp.make_plot_from_table(tables, labels=labels, colors=colors, output='wmean-google.png', log=False, normalize=(min, max))
# print ""
#

# Active experiment: google / nntopmedian.
# The min/max bounds and the split are derived from the validation tables and
# then reused, unchanged, when the test tables are evaluated.
# NOTE(review): statement boundaries were reconstructed from a collapsed
# one-line paste -- confirm which statements are meant to be live.
tables = [
    '../data/google/pairs/sets/nntopmedian_no_pairs_r-validation.npy',
    '../data/google/pairs/sets/nntopmedian_pairs_r-validation.npy',
]
# NOTE: `min` and `max` shadow the builtins from here on. The names are kept
# because code outside this section may read them.
min, max = sp.calculate_min_max_from_table(tables)
split = sp.calculate_split_from_table(tables, verbose=True, normalize=(min, max))
# sp.make_plot_from_table(tables, labels=labels, colors=colors, output='nntop-google-val.png', log=False, normalize=(min, max))

# Re-point `tables` at the test set; `split`, `min` and `max` still come from
# the validation set above.
tables = [
    '../data/google/pairs/sets/nntopmedian_no_pairs_r-test.npy',
    '../data/google/pairs/sets/nntopmedian_pairs_r-test.npy',
]
sp.calculate_error_rate_from_table(tables, split, verbose=True, normalize=(min, max))
sp.calculate_JS_from_table(tables, verbose=True, normalize=(min, max))
# sp.make_plot_from_table(tables, labels=labels, colors=colors, output='nntop-google.png', log=False, normalize=(min, max))
# Single-argument print with parentheses behaves the same on Python 2 and 3.
print("")

# PROCESSING
# texts1 = ['../data/wiki/pairs/sets/enwiki_no_pairs_r-validation.txt', '../data/wiki/pairs/sets/enwiki_pairs_r-validation.txt']
# output1 = ['../data/google/pairs/sets/nntopmedian_no_pairs_r-validation.npy',
#            '../data/google/pairs/sets/nntopmedian_pairs_r-validation.npy']
# p1 = Process(target=sp.process_to_file_with_filter, args=(metrics.NNVarMean(metrics.euclidean, 'VM'), texts1, output1, 2000000, w, docfreqs))
# p1.start()
#
# texts2 = ['../data/wiki/pairs/sets/enwiki_no_pairs_r-test.txt', '../data/wiki/pairs/sets/enwiki_pairs_r-test.txt']
# output2 = ['../data/google/pairs/sets/nntopmedian_no_pairs_r-test.npy', '../data/google/pairs/sets/nntopmedian_pairs_r-test.npy']
# min, max = sp.calculate_min_max_from_table(tables)
# split = sp.calculate_split_from_table(tables, verbose=False, normalize=(min, max))
# sp.make_plot_from_table(tables, labels=labels, colors=colors, output='wmean-google-val.png', log=False, normalize=(min, max))
# tables = ['../data/google/pairs/sets/wmean_no_pairs_r-test.npy', '../data/google/pairs/sets/wmean_pairs_r-test.npy']
# sp.calculate_error_rate_from_table(tables, split, normalize=(min, max))
# sp.calculate_hellinger_distance_from_table(tables, normalize=(min, max))
# sp.make_plot_from_table(tables, labels=labels, colors=colors, output='wmean-google.png', log=False, normalize=(min, max))
# print ""
#

# Active experiment: google / nntopmedian.
# Bounds and split are computed on the validation tables, then applied to the
# test tables below.
# NOTE(review): statement boundaries were reconstructed from a collapsed
# one-line paste -- confirm which statements are meant to be live.
tables = [
    '../data/google/pairs/sets/nntopmedian_no_pairs_r-validation.npy',
    '../data/google/pairs/sets/nntopmedian_pairs_r-validation.npy',
]
# NOTE: `min` and `max` shadow the builtins from here on. The names are kept
# because code outside this section may read them.
min, max = sp.calculate_min_max_from_table(tables)
split = sp.calculate_split_from_table(tables, verbose=True, normalize=(min, max))
# sp.make_plot_from_table(tables, labels=labels, colors=colors, output='nntop-google-val.png', log=False, normalize=(min, max))

# Switch to the test tables, reusing the validation-derived split and bounds.
tables = [
    '../data/google/pairs/sets/nntopmedian_no_pairs_r-test.npy',
    '../data/google/pairs/sets/nntopmedian_pairs_r-test.npy',
]
sp.calculate_error_rate_from_table(tables, split, verbose=True, normalize=(min, max))
sp.calculate_JS_from_table(tables, verbose=True, normalize=(min, max))
# sp.make_plot_from_table(tables, labels=labels, colors=colors, output='nntop-google.png', log=False, normalize=(min, max))
# Single-argument print with parentheses behaves the same on Python 2 and 3.
print("")

# PROCESSING
# tablesA = ['../data/google/pairs/sets/wmean_no_pairs_r-test.npy', '../data/google/pairs/sets/wmean_pairs_r-test.npy']
# sp.calculate_error_rate_from_table(tablesA, splitA, normalize=(minA, maxA))
# sp.calculate_JS_from_table(tablesA, normalize=(minA, maxA), verbose=True)
# print ""
#
# tablesA = ['../data/google/pairs/sets/nntop_no_pairs_r-validation.npy', '../data/google/pairs/sets/nntop_pairs_r-validation.npy']
# minA, maxA = sp.calculate_min_max_from_table(tablesA)
# splitA = sp.calculate_split_from_table(tablesA, verbose=True, normalize=(minA, maxA))
# tablesA = ['../data/google/pairs/sets/nntop_no_pairs_r-test.npy', '../data/google/pairs/sets/nntop_pairs_r-test.npy']
# sp.calculate_error_rate_from_table(tablesA, splitA, normalize=(minA, maxA))
# sp.calculate_JS_from_table(tablesA, normalize=(minA, maxA), verbose=True)
# print ""

# Active experiment "A": tweets / nntopcontr.
# Bounds (minA, maxA) and splitA are computed on the validation tables and
# then applied to the test tables.
# NOTE(review): statement boundaries were reconstructed from a collapsed
# one-line paste -- confirm which statements are meant to be live.
tablesA = [
    '../data/tweets/pairs/sets/nntopcontr_no_pairs-validation.npy',
    '../data/tweets/pairs/sets/nntopcontr_pairs-validation.npy',
]
minA, maxA = sp.calculate_min_max_from_table(tablesA)
splitA = sp.calculate_split_from_table(tablesA, verbose=True, normalize=(minA, maxA))

# Re-point tablesA at the test set; splitA/minA/maxA stay validation-derived.
tablesA = [
    '../data/tweets/pairs/sets/nntopcontr_no_pairs-test.npy',
    '../data/tweets/pairs/sets/nntopcontr_pairs-test.npy',
]
sp.calculate_error_rate_from_table(tablesA, splitA, normalize=(minA, maxA))
sp.calculate_JS_from_table(tablesA, normalize=(minA, maxA), verbose=True)
# Single-argument print with parentheses behaves the same on Python 2 and 3.
print("")

#sp.calculate_bootstrap_test_from_table(tablesB, tablesA, splitB, splitA, True, 5000, (minB, maxB), (minA, maxA))

# LDA-code
# dictionary = gensim.corpora.Dictionary.load('../data/model/wiki_wordids_filtered_2.dict')
# dictionary.num_docs = metrics.N_DOCUMENTS
#
# Start the second worker, then wait for both workers to finish before the
# tables are read.
# `p1`, `texts2`, `output2`, `w` and `docfreqs` are defined outside this section.
# NOTE(review): statement boundaries (and any enclosing loop over `i`) were
# reconstructed from a collapsed one-line paste -- confirm against the
# original layout.
p2 = Process(
    target=sp.process_to_file_with_filter,
    args=(metrics.NNVarMean(metrics.euclidean, length='V'), texts2, output2, 20100, w, docfreqs),
)
p2.start()
p1.join()
p2.join()

#Test
# Bounds and split are computed on the train tables; `ev` is the error rate on
# those same tables.
tablesA = [
    '../data/tweets/pairs/sets/nntoptweets_no_pairs-train.npy',
    '../data/tweets/pairs/sets/nntoptweets_pairs-train.npy',
]
minA, maxA = sp.calculate_min_max_from_table(tablesA)
splitA = sp.calculate_split_from_table(tablesA, verbose=False, normalize=(minA, maxA))
ev = sp.calculate_error_rate_from_table(tablesA, splitA, normalize=(minA, maxA))

# `et` is the error rate on the test tables, using the train-derived split and
# bounds.
tablesA = [
    '../data/tweets/pairs/sets/nntoptweets_no_pairs-test.npy',
    '../data/tweets/pairs/sets/nntoptweets_pairs-test.npy',
]
et = sp.calculate_error_rate_from_table(tablesA, splitA, normalize=(minA, maxA))

#Write
# Mode was 'aw', which is not a valid mode string: Python 3 raises ValueError
# and Python 2 hands it to the platform's fopen(). Plain append is used.
# NOTE(review): `f` is neither written to nor closed in the visible part of
# the script; the result line below goes to stdout. Confirm whether it should
# be written to `f`, and close the handle once it is no longer needed.
f = open(print_file, 'a')
# `i` and `print_file` are defined outside this section.
print(str(i) + '\t' + str(ev) + '\t' + str(et))