def main():
    """Compute the k nearest neighbors for a set of target words.

    Loads a pickled semantic space, finds the k nearest neighbours of
    each target under cosine similarity, converts similarities to
    cosine distances, and writes one tab-separated line per target to
    <outPath>.csv. Targets come from column <co> of <testset> when one
    is given, otherwise from every row of the space.
    """
    # Parse command-line arguments.
    args = docopt("""Compute k nearest neighbors for targets.

    Usage:
        knn.py <spacePrefix1> <k> <outPath> [<testset> <co>]

        <spacePrefix1> = path to pickled space without suffix
        <testset> = path to file with tab-separated word pairs
        <co> = column index for targets
        <k> = parameter k (k nearest neighbors)
        <outPath> = output path for result file

    Note: ...

    """)

    spacePrefix1 = args['<spacePrefix1>']
    testset = args['<testset>']
    co = int(args['<co>'])
    outPath = args['<outPath>']
    k = int(args['<k>'])

    logging.config.dictConfig({
        'version': 1,
        'disable_existing_loggers': True,
    })
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load space
    space1 = load_pkl_files(spacePrefix1)

    if testset is not None:  # fixed: identity comparison with None, not '!='
        # Targets are read from the given column of the tab-separated file.
        with codecs.open(testset, 'r', 'utf8') as f_in:
            targets = [line.strip().split('\t')[co] for line in f_in]
    else:
        # If no test set is provided, compute values for all targets in the space.
        targets = [target.decode('utf8') for target in space1.get_row2id()]

    target2neighbors = {}
    for t1 in targets:  # index was unused; plain iteration suffices
        try:
            neighbors1 = space1.get_neighbours(t1.encode('utf8'), k,
                                               CosSimilarity())
            # Drop the first neighbour: a known target is its own nearest
            # neighbour with similarity 1.0.
            del neighbors1[0]
        except KeyError:
            # Target not in the space: emit a NaN placeholder pair.
            neighbors1 = [('nan', float('nan'))]
        target2neighbors[t1] = neighbors1

    # Convert cosine similarity to cosine distance, export nearest neighbors.
    # (f_out.write replaces the py2-only 'print >> f_out' with identical output.)
    with codecs.open(outPath + '.csv', 'w', 'utf-8') as f_out:
        for t1 in targets:
            f_out.write(t1 + '\t' + ' '.join(
                [str((n, 1 - v)) for (n, v) in target2neighbors[t1]]) + '\n')

    logging.info("--- %s seconds ---" % (time.time() - start_time))
# NOTE(review): this chunk starts mid-loop — 'verbs_for_mean_bytes.append(verb_b)'
# is the body of an enclosing loop that is outside this view, so the text is
# kept verbatim rather than re-indented by guesswork.
# What the visible code does: prints the sizes of the two verb lists, opens two
# CSV output files under FLAGS.filebase, then for each byte-string verb asks
# this_space for its nearest neighbours (cosine similarity), writes the verb and
# its neighbour words to neighbours_list.csv, and builds a 'Sim.,...' line of
# 7-decimal similarities plus a 'Verb,...' line marking with '1' each neighbour
# that is itself in the main verb list. The chunk is cut off before those two
# lines are written anywhere.
# NOTE(review): py2-only code — print statements, str.decode on byte strings.
verbs_for_mean_bytes.append(verb_b) print str(len(verbs_bytes)) + ' verbs in main list\n' print str(len(verbs_for_mean_bytes)) + ' verbs in mean list\n' f_list = io.open(FLAGS.filebase + '/neighbours_list.csv', 'w', encoding='utf8') f_mean = io.open(FLAGS.filebase + '/distance_to_mean.csv', 'w', encoding='utf8') for verb in verbs_bytes: neighbour_list = this_space.get_neighbours(verb, FLAGS.number_neighbours, CosSimilarity()) neighbour_words = [item[0] for item in neighbour_list] neighbour_similarities = [item[1] for item in neighbour_list] f_list.write(verb.decode('utf8') + ',') f_list.write(','.join(neighbour_words).decode('utf8') + '\n') floats_line = 'Sim.,' for sim in neighbour_similarities: floats_line += '%.7f,' % sim verbs_line = 'Verb,' for word in neighbour_words: if word in verbs_bytes: verbs_line += '1,' else: verbs_line += ','
# NOTE(review): chunk starts mid-function — 'els_for_comp.append(element) /
# return els_for_comp' is the tail of an unseen helper (presumably
# elements_for_composition, which is called below), so the text is kept
# verbatim rather than re-indented by guesswork.
# What the visible code does: builds a typological space and a distributional
# space, applies PPMI weighting and 300-dim SVD to the distributional one,
# composes phrase vectors with WeightedAdditive(alpha=1, beta=1), then
# correlates (Spearman/Pearson) cosine similarities of pairs in the two spaces.
# NOTE(review): 'pairs = pairs(items)' rebinds the helper function name 'pairs'
# to its own result — it works once, but the function is unusable afterwards;
# consider a distinct variable name (e.g. 'item_pairs').
# NOTE(review): py2-only print statements.
els_for_comp.append(element) return els_for_comp typ_space = create_space(TypDmFile, TypRowsFile) distr_space = create_space(DistrDmFile, DistrRowsFile) #load a space from a pickle file #my_space = io_utils.load("./sharp/lexfunc/lexfunc_Ridge_pract.pkl") #distributional vectors processing distr_space = distr_space.apply(PpmiWeighting()) distr_space = distr_space.apply(Svd(300)) #io_utils.save(distr_space, "./spaces/smooth_phrases_ppmi.pkl") items = items_from_file(itemsFile) els_for_comp = elements_for_composition(items) my_comp = WeightedAdditive(alpha=1, beta=1) distr_space = my_comp.compose(els_for_comp, distr_space) pairs = pairs(items) predicted = distr_space.get_sims(pairs, CosSimilarity()) gold = typ_space.get_sims(pairs, CosSimilarity()) #compute correlations print "Spearman" print scoring_utils.score(gold, predicted, "spearman") print "Pearson" print scoring_utils.score(gold, predicted, "pearson")
#ex08.py
#-------
from composes.utils import io_utils
from composes.similarity.cos import CosSimilarity

#load a space
my_space = io_utils.load("./data/out/ex01.pkl")

#get the top 2 neighbours of "car"
# fixed: py2-only 'print expr' -> parenthesized form; with a single argument
# the output is identical under both Python 2 and Python 3.
print(my_space.get_neighbours("car", 2, CosSimilarity()))
# Train and evaluate three composition models (Lexical Function, Full
# Additive, Weighted Additive) on the ML08 noun-verb data, scoring each by
# Spearman correlation against gold similarities.
# NOTE(review): relies on train_data, space, per_space and data_path defined
# earlier in the file (outside this chunk); comp_model/composed_space/pred are
# rebound per model and may be used after the chunk, so names are unchanged.
print("Training Lexical Function composition model...")
comp_model = LexicalFunction(learner=RidgeRegressionLearner(param=2))
comp_model.train(train_data, space, per_space)

print("Composing phrases...")
test_phrases_file = data_path + "ML08nvs_test.txt"
test_phrases = io_utils.read_tuple_list(test_phrases_file, fields=[0, 1, 2])
composed_space = comp_model.compose(test_phrases, space)

print("Reading similarity test data...")
test_similarity_file = data_path + "ML08data_new.txt"
test_pairs = io_utils.read_tuple_list(test_similarity_file, fields=[0, 1])
# Gold similarity scores live in the third column of the same file.
gold = io_utils.read_list(test_similarity_file, field=2)

print("Computing similarity with lexical function...")
pred = composed_space.get_sims(test_pairs, CosSimilarity())
#use this composed space to assign similarities

print("Scoring lexical function...")
print(scoring_utils.score(gold, pred, "spearman"))

print("Training Full Additive composition model...")
comp_model = FullAdditive(learner=RidgeRegressionLearner(param=2))
comp_model.train(train_data, space, per_space)
composed_space = comp_model.compose(test_phrases, space)
pred = composed_space.get_sims(test_pairs, CosSimilarity())
print(scoring_utils.score(gold, pred, "spearman"))

print("Training Weighted Additive composition model...")
comp_model = WeightedAdditive()
comp_model.train(train_data, space, per_space)
# ex07.py
# -------
from composes.utils import io_utils
from composes.similarity.cos import CosSimilarity

# Load the core space and the peripheral space.
my_space = io_utils.load("./data/out/ex01.pkl")
my_per_space = io_utils.load("./data/out/PER_SS.ex05.pkl")

# Show which rows (words/phrases) each space contains.
for loaded in (my_space, my_per_space):
    print(loaded.id2row)

# Cross-space similarity: a word from the core space against a phrase
# from the peripheral space.
sim = my_space.get_sim("car", "sports_car", CosSimilarity(),
                       space2=my_per_space)
print(sim)
word_pairs = io_utils.read_tuple_list(fname, fields=[0, 1])

lengths = []
for wp in word_pairs:
    # Skip pairs where either word is missing from the space.
    # BUG FIX: the original set a single 'found' flag before the loop and
    # never reset it, so one missing pair silently skipped every later pair.
    try:
        v1 = my_space.get_row(wp[0])
        v2 = my_space.get_row(wp[1])
    except KeyError:
        #print wp[0],"or",wp[1],"not found"
        continue

    # Compose the pair into a single "_composed_" vector and inspect its
    # neighbourhood in the original space.
    composed_space = add.compose([(wp[0], wp[1], "_composed_")], my_space)
    neighbours = composed_space.get_neighbours("_composed_", 10,
                                               CosSimilarity(),
                                               space2=my_space)
    print(wp[0], wp[1])
    print(neighbours)

    # Neighbourhood density = mean similarity over the 10 nearest neighbours.
    density = sum(n[1] for n in neighbours) / 10
    print("Density", density)

    c = composed_space.get_row("_composed_")
    print("Norm ", c.norm())

    cos = composed_space.get_sim("_composed_", wp[1], CosSimilarity(),
                                 space2=my_space)
    print("Cos ", cos)
# ex20.py
# -------
from composes.utils import io_utils
from composes.utils import scoring_utils
from composes.similarity.cos import CosSimilarity

# Load the semantic space.
my_space = io_utils.load("data/out/ex01.pkl")

# Predict a cosine similarity for every word pair in the evaluation file.
fname = "data/in/word_sims.txt"
word_pairs = io_utils.read_tuple_list(fname, fields=[0, 1])
predicted = my_space.get_sims(word_pairs, CosSimilarity())

# Correlate predictions with the gold scores (third column of the file).
gold = io_utils.read_list(fname, field=2)
for method in ("spearman", "pearson"):
    print(method.capitalize())
    print(scoring_utils.score(gold, predicted, method))
#ex06.py
#-------
from composes.utils import io_utils
from composes.similarity.cos import CosSimilarity

#load a space
my_space = io_utils.load("./data/out/ex01.pkl")

# fixed: py2-only 'print expr' -> parenthesized form; with a single argument
# the output is identical under both Python 2 and Python 3.
print(my_space.cooccurrence_matrix)
print(my_space.id2row)

#compute similarity between two words in the space
print(my_space.get_sim("car", "car", CosSimilarity()))
print(my_space.get_sim("car", "book", CosSimilarity()))
def computeAnalogy(w1, w2, w3):
    """Solve the analogy 'w1 : w2 :: w3 : ?' and return the best guess.

    Composes w1 with w2 via the module-level ``sub`` model into "step1",
    composes that with w3 via the module-level ``add`` model into
    "step2", then returns the single nearest neighbour of "step2" in
    ``space`` under cosine similarity.
    (Presumably ``sub``/``add`` implement vector offset — confirm their
    definitions elsewhere in the file.)
    """
    # First composition step: combine w1 and w2.
    diff_space = sub.compose([(w1, w2, "step1")], space)
    # Second step: combine the intermediate vector with w3.
    analogy_space = add.compose([("step1", w3, "step2")],
                                (diff_space, space))
    # The nearest neighbour of the composed vector is the analogy guess.
    return analogy_space.get_neighbours("step2", 1, CosSimilarity(), space)
#kneighbours.py
#USAGE: python kneighbours [space file] [word] [k]
#EXAMPLE: python2.7 kneighbours.py ~/UkWac/dissect-data/ANs/out/CORE_SS.ans.ppmi.row.pkl car-n 30
#-------
from composes.utils import io_utils
from composes.similarity.cos import CosSimilarity
import sys

#load a space
my_space = io_utils.load(sys.argv[1])

#get the top k neighbours of the given word
# fixed: py2-only 'print expr' -> parenthesized form; with a single argument
# the output is identical under both Python 2 and Python 3.
print(my_space.get_neighbours(sys.argv[2], int(sys.argv[3]), CosSimilarity()))
from composes.utils import io_utils
from composes.utils import scoring_utils
from composes.similarity.cos import CosSimilarity
import sys

#read in a space
my_space = io_utils.load(sys.argv[1])

#compute similarities of a list of word pairs
fname = sys.argv[2]
word_pairs = io_utils.read_tuple_list(fname, fields=[0, 1, 2])

predicted = []
gold = []
for wp in word_pairs:
    # fixed: a bare 'except:' swallowed everything, including SystemExit and
    # KeyboardInterrupt; 'except Exception' keeps the original best-effort
    # behaviour (skip the pair with a message) without masking exit signals.
    try:
        cos = my_space.get_sim(wp[0], wp[1], CosSimilarity())
    except Exception:
        print("Couldn't measure cosine...")
        continue
    # Keep only positive similarities, pairing each prediction with its
    # gold score (third column of the input file).
    if cos > 0:
        #print wp[0],wp[1],cos
        predicted.append(cos)
        gold.append(wp[2])

#compute correlations
print("Spearman")
print(scoring_utils.score(gold, predicted, "spearman"))
print("Pearson")
print(scoring_utils.score(gold, predicted, "pearson"))