def main():
    """Run a truncated SVD on a stored PPMI matrix and save factors plus vocabularies."""
    args = docopt("""
    Usage:
        pmi2svd.py [options] <pmi_path> <output_path>

    Options:
        --dim NUM    Dimensionality of eigenvectors [default: 500]
        --neg NUM    Number of negative samples; subtracts its log from PMI [default: 1]
    """)
    pmi_path, output_path = args['<pmi_path>'], args['<output_path>']
    dim, neg = int(args['--dim']), int(args['--neg'])

    # Load the explicit PPMI representation (neg shifts PMI by -log(neg)).
    explicit = PositiveExplicit(pmi_path, normalize=False, neg=neg)

    start = time.time()
    ut, s, vt = sparsesvd(explicit.m.tocsc(), dim)
    print("Time elapsed for SVD: %f" % (time.time() - start))

    # Persist the three SVD factors and both vocabularies next to output_path.
    for suffix, factor in (('.ut.npy', ut), ('.s.npy', s), ('.vt.npy', vt)):
        np.save(output_path + suffix, factor)
    save_vocabulary(output_path + '.words.vocab', explicit.iw)
    save_vocabulary(output_path + '.contexts.vocab', explicit.ic)
def get_sim_pair_ppmi(corpus, target_word1, target_word2, year, results_dir):
    """Append the PPMI cosine between two words for one year to a results file.

    An identical result line already present in the file is not written again.
    """
    results_pair = '{}-{}-cosines.tsv'.format(target_word1, target_word2)
    embedd = PositiveExplicit.load(corpus + "/" + str(year))
    cos = embedd.similarity(target_word1, target_word2)

    out_path = results_dir + results_pair
    existing_results = []
    if os.path.isfile(out_path):
        print('file exists')
        with open(out_path) as infile:
            existing_results = infile.read().split('\n')

    result = '{}-{}\t{}\t{}\n'.format(target_word1, target_word2, year, cos)
    with open(out_path, 'a') as outfile:
        if result.strip() in existing_results:
            print('result already there')
        else:
            outfile.write(result)
    print(cos)
def _counts2PMI(self):
    """Turn the accumulated (count, word, context) pair file into a PPMI matrix.

    Builds a sparse count matrix over the sorted word/context vocabularies,
    converts it to PMI, saves the matrix and vocabularies to disk, and caches
    a PositiveExplicit view of the result in self.explicit.
    """
    sorted_words = sorted(self.words.keys())
    sorted_contexts = sorted(self.contexts.keys())
    word_index = {w: i for i, w in enumerate(sorted_words)}
    context_index = {c: i for i, c in enumerate(sorted_contexts)}

    # Accumulate single-cell writes in a DOK matrix (cheap item assignment)
    # and periodically fold it into a CSR total to keep memory bounded.
    flush_every = 100000
    totals = csr_matrix((len(word_index), len(context_index)), dtype=np.float32)
    pending = dok_matrix((len(word_index), len(context_index)), dtype=np.float32)
    n_pending = 0
    with open(self.count_pair_file) as pairs:
        for line in pairs:
            count, word, context = line.strip().split()
            if word in word_index and context in context_index:
                pending[word_index[word], context_index[context]] = int(count)
                n_pending += 1
                if n_pending == flush_every:
                    totals = totals + pending.tocsr()
                    pending = dok_matrix((len(word_index), len(context_index)), dtype=np.float32)
                    n_pending = 0
    # Fold in whatever remained after the last flush.
    totals = totals + pending.tocsr()

    pmi = self.calc_pmi(totals, self.cds)
    save_matrix(self.pmi_file, pmi)
    save_vocabulary(self.pmi_file + '.words.vocab', sorted_words)
    save_vocabulary(self.pmi_file + '.contexts.vocab', sorted_contexts)
    self.explicit = PositiveExplicit(self.pmi_file, normalize=False, neg=self.neg)
    parts = self.dict_name.split('/')
    cf.saveDictionary(self.explicit, parts[0] + '/' + parts[1] + '_explicit_ppmi.bin')
def create_representation(args):
    """Instantiate the word representation named in *args* (a docopt dict).

    Supported <representation> values: PPMI, SVD, GLOVE; anything else falls
    back to plain word2vec-style text embeddings. --w+c selects an ensemble of
    word and context vectors where implemented.
    """
    rep_type = args['<representation>']
    path = args['<representation_path>']
    neg = int(args['--neg'])
    w_c = args['--w+c']
    eig = float(args['--eig'])

    if rep_type == 'PPMI':
        if w_c:
            raise Exception('w+c is not implemented for PPMI.')
        return PositiveExplicit(path, True, neg)
    if rep_type == 'SVD':
        if w_c:
            return EnsembleEmbedding(SVDEmbedding(path, False, eig, False),
                                     SVDEmbedding(path, False, eig, True), True)
        return SVDEmbedding(path, True, eig)
    if rep_type == 'GLOVE':
        return GLOVEEmbedding(path, True)
    if w_c:
        return EnsembleEmbedding(Embedding(path + '.words', False),
                                 Embedding(path + '.contexts', False), True)
    return Embedding(path + '.words', True)
def main():
    """Factorize a saved PMI matrix with sparse SVD under a chosen PMI variant."""
    args = docopt("""
    Usage:
        pmi2svd.py [options] <repres> <pmi_path> <output_path>

    Options:
        --dim NUM    Dimensionality of eigenvectors [default: 500]
        --neg NUM    Number of negative samples; subtracts its log from PMI [default: 1]
        --k NUM    [default: 1]
    """)
    repres = args['<repres>']
    pmi_path = args['<pmi_path>']
    output_path = args['<output_path>']
    dim = int(args['--dim'])
    neg = int(args['--neg'])
    k = int(args['--k'])

    # Select the PMI weighting scheme; PPMI (PositiveExplicit) is the default.
    if repres == "BPMI":
        explicit = BinExplicit(pmi_path, normalize=False)
    elif repres == "PMI":
        explicit = NoExplicit(pmi_path, normalize=False, k=k)
    elif repres == "NPMI":
        explicit = NegExplicit(pmi_path, normalize=False)
    else:
        explicit = PositiveExplicit(pmi_path, normalize=False, neg=neg)

    ut, s, vt = sparsesvd(explicit.m.tocsc(), dim)

    # Persist the three SVD factors and both vocabularies next to output_path.
    for suffix, factor in (('.ut.npy', ut), ('.s.npy', s), ('.vt.npy', vt)):
        np.save(output_path + suffix, factor)
    save_vocabulary(output_path + '.words.vocab', explicit.iw)
    save_vocabulary(output_path + '.contexts.vocab', explicit.ic)
def main():
    """Factorize a saved PPMI matrix with sparse SVD and store the three factors."""
    args = docopt("""
    Usage:
        ppmi2svd.py [options] <ppmi> <output>

    Options:
        --dim NUM    Dimensionality of eigenvectors [default: 300]
        --neg NUM    Number of negative samples; subtracts its log from PMI [default: 1]
    """)
    ppmi_path, output_path = args['<ppmi>'], args['<output>']
    dim, neg = int(args['--dim']), int(args['--neg'])

    explicit = PositiveExplicit(ppmi_path, normalize=False, neg=neg)
    ut, s, vt = sparsesvd(explicit.m.tocsc(), dim)

    for suffix, factor in (('.ut.npy', ut), ('.s.npy', s), ('.vt.npy', vt)):
        np.save(output_path + suffix, factor)
def folder2chi(folder):
    """Load the 'chi' PPMI model stored under *folder* and return its bound first-order similarity function."""
    model = PositiveExplicit(join(folder, "chi"))
    return model.similarity_first_order
def get_sim_neighbors_ppmi(corpus, target_word1, target_word2, year1, year2, n, results_dir):
    """Compare two usages via second-order (nearest-neighbor) PPMI vectors.

    Two supported modes: either two different years and one target word
    (year1 != year2, target_word1 == target_word2), or the same year and two
    different target words (year1 == year2, target_word1 != target_word2).

    Writes the paired neighbor lists to results_dir/neighbors/ and appends the
    cosine to a per-pair results file, skipping result lines already present.
    Out-of-vocabulary targets are recorded as 'OOV'.

    Raises:
        ValueError: if the year/word combination matches neither mode.
    """
    if not os.path.isdir(results_dir + 'neighbors'):
        os.mkdir(results_dir + 'neighbors')
    results_words = ('neighbors/' + target_word1 + '-' + target_word2 + '-'
                     + str(year1) + '-' + str(year2) + '.tsv')

    if (year1 != year2) and (target_word1 == target_word2):
        # Diachronic mode: one word compared across two year slices.
        results_cosine = 'cosines-' + target_word1 + '-n-' + str(n) + '.tsv'
        embedd_year1 = PositiveExplicit.load(corpus + "/" + str(year1))
        embedd_year2 = PositiveExplicit.load(corpus + "/" + str(year2))
        # NOTE(review): these vocabularies are loaded but unused since the
        # in-vocab filtering moved into filter_union; kept so missing index
        # files still fail loudly here. TODO: confirm they can be dropped.
        with open(corpus + '/' + str(year1) + '-index.pkl', 'rb') as infile:
            year1_vocab = pickle.load(infile, encoding='utf-8')
        with open(corpus + '/' + str(year2) + '-index.pkl', 'rb') as infile:
            year2_vocab = pickle.load(infile, encoding='utf-8')
        if (embedd_year1.represent(target_word1).nnz != 0) and (embedd_year2.represent(target_word1).nnz != 0):
            neighbors_year1 = get_nearest_neighbors(embedd_year1, target_word1, n)
            neighbors_year2 = get_nearest_neighbors(embedd_year2, target_word1, n)
            union = get_union(neighbors_year1, neighbors_year2)
            # Restrict the union to neighbors usable in both spaces before
            # building the second-order vectors.
            filtered_union = filter_union(union, embedd_year1, embedd_year2, target_word1)
            vec1 = get_second_order_vector(embedd_year1, filtered_union, target_word1)
            vec2 = get_second_order_vector(embedd_year2, filtered_union, target_word1)
            neighbor_words1 = get_nearest_neighbor_words(neighbors_year1)
            neighbor_words2 = get_nearest_neighbor_words(neighbors_year2)
            cos = get_cosine(vec1, vec2)
        else:
            print('word out of vocab')
            cos = 'OOV'
            neighbor_words1 = ['OOV']
            neighbor_words2 = ['OOV']
    elif (year1 == year2) and (target_word1 != target_word2):
        # Synchronic mode: two words compared within one year slice.
        results_cosine = 'cosines-' + target_word1 + '-' + target_word2 + '-n-' + str(n) + '.tsv'
        embedd_year = PositiveExplicit.load(corpus + "/" + str(year1))
        if (embedd_year.represent(target_word1).nnz) != 0 and (embedd_year.represent(target_word2).nnz != 0):
            neighbors_word1 = get_nearest_neighbors(embedd_year, target_word1, n)
            neighbors_word2 = get_nearest_neighbors(embedd_year, target_word2, n)
            union = get_union(neighbors_word1, neighbors_word2)
            vec1 = get_second_order_vector(embedd_year, union, target_word1)
            vec2 = get_second_order_vector(embedd_year, union, target_word2)
            neighbor_words1 = get_nearest_neighbor_words(neighbors_word1)
            neighbor_words2 = get_nearest_neighbor_words(neighbors_word2)
            cos = get_cosine(vec1, vec2)
        else:
            print('word out of vocab')
            cos = 'OOV'
            neighbor_words1 = ['OOV']
            neighbor_words2 = ['OOV']
    else:
        # BUG FIX: any other combination used to fall through both branches and
        # crash below with a NameError on 'results_cosine'. Fail fast instead.
        raise ValueError(
            'get_sim_neighbors_ppmi needs either two different years with one '
            'target word, or one year with two different target words.')

    # Load previously written results so duplicates can be skipped.
    if os.path.isfile(results_dir + results_cosine):
        print('file exists')
        with open(results_dir + results_cosine) as infile:
            existing_results = infile.read().split('\n')
    else:
        existing_results = []
    with open(results_dir + results_words, 'w') as outfile1:
        for word1, word2 in zip(neighbor_words1, neighbor_words2):
            outfile1.write(word1 + '\t' + word2 + '\n')
    # BUG FIX: this path previously used results_dir + '/' + results_cosine,
    # which diverges from the isfile() check above whenever results_dir lacks
    # a trailing slash (duplicate detection would then silently never fire).
    with open(results_dir + results_cosine, 'a') as outfile2:
        result = (target_word1 + '-' + target_word2 + '\t' + str(year1) + '-'
                  + str(year2) + '\t' + str(cos) + '\n')
        if result.strip() in existing_results:
            print('result already there')
        else:
            outfile2.write(result)
    print(cos)