def main():
    args = docopt("""
    Usage:
        pmi2svd.py [options] <pmi_path> <output_path>

    Options:
        --dim NUM    Dimensionality of eigenvectors [default: 500]
        --neg NUM    Number of negative samples; subtracts its log from PMI [default: 1]
    """)

    pmi_path = args['<pmi_path>']
    output_path = args['<output_path>']
    dim = int(args['--dim'])
    neg = int(args['--neg'])

    # Load the explicit PPMI matrix (shifted by log(neg)) and factorize it with a truncated SVD.
    explicit = PositiveExplicit(pmi_path, normalize=False, neg=neg)

    start = time.time()
    ut, s, vt = sparsesvd(explicit.m.tocsc(), dim)
    print("Time elapsed for SVD: %f" % (time.time() - start))

    # Persist the SVD factors and the row/column vocabularies.
    np.save(output_path + '.ut.npy', ut)
    np.save(output_path + '.s.npy', s)
    np.save(output_path + '.vt.npy', vt)
    save_vocabulary(output_path + '.words.vocab', explicit.iw)
    save_vocabulary(output_path + '.contexts.vocab', explicit.ic)
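# A minimal sketch (not part of this repository) of how the factors saved by
# main() above can be combined into dense word embeddings. The eigenvalue
# weighting `eig` and the helper name `load_svd_embeddings` are illustrative
# assumptions, not values or APIs defined by pmi2svd.py.
import numpy as np

def load_svd_embeddings(output_path, eig=0.5):
    ut = np.load(output_path + '.ut.npy')   # shape: (dim, n_words)
    s = np.load(output_path + '.s.npy')     # shape: (dim,)
    # Scale each singular dimension by a power of its singular value and
    # transpose so that each row is one word's embedding.
    return (ut.T * (s ** eig)).astype(np.float32)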
def main():
    args = docopt("""
    Usage:
        counts2pmi.py [options] <counts> <output_path>

    Options:
        --cds NUM    Context distribution smoothing [default: 1.0]
    """)

    counts_path = args['<counts>']
    vectors_path = args['<output_path>']
    cds = float(args['--cds'])

    # Build the sparse word-context count matrix and its vocabularies,
    # then convert the counts into a PMI matrix.
    counts, iw, ic = read_counts_matrix(counts_path)
    pmi = calc_pmi(counts, cds, alpha=1.0)

    save_matrix(vectors_path + '.count_matrix', counts)
    save_matrix(vectors_path, pmi)
    save_vocabulary(vectors_path + '.words.vocab', iw)
    save_vocabulary(vectors_path + '.contexts.vocab', ic)
def main():
    args = docopt("""
    Usage:
        pmi2svd.py [options] <repres> <pmi_path> <output_path>

    Options:
        --dim NUM    Dimensionality of eigenvectors [default: 500]
        --neg NUM    Number of negative samples; subtracts its log from PMI [default: 1]
        --k NUM      [default: 1]
    """)

    repres = args['<repres>']
    pmi_path = args['<pmi_path>']
    output_path = args['<output_path>']
    dim = int(args['--dim'])
    neg = int(args['--neg'])
    k = int(args['--k'])

    # Select the explicit representation to factorize; PPMI is the default.
    if repres == "BPMI":
        explicit = BinExplicit(pmi_path, normalize=False)
    elif repres == "PMI":
        explicit = NoExplicit(pmi_path, normalize=False, k=k)
    elif repres == "NPMI":
        explicit = NegExplicit(pmi_path, normalize=False)
    else:
        explicit = PositiveExplicit(pmi_path, normalize=False, neg=neg)

    ut, s, vt = sparsesvd(explicit.m.tocsc(), dim)

    np.save(output_path + '.ut.npy', ut)
    np.save(output_path + '.s.npy', s)
    np.save(output_path + '.vt.npy', vt)
    save_vocabulary(output_path + '.words.vocab', explicit.iw)
    save_vocabulary(output_path + '.contexts.vocab', explicit.ic)
def _counts2PMI(self):
    words = list(self.words.keys())
    contexts = list(self.contexts.keys())

    iw = sorted(words)
    ic = sorted(contexts)
    wi = dict([(w, i) for i, w in enumerate(iw)])
    ci = dict([(c, i) for i, c in enumerate(ic)])

    # Accumulate word-context counts: a dok_matrix absorbs incremental writes
    # cheaply and is folded into the csr_matrix every `update_threshold` pairs
    # so the intermediate dictionary buffer stays small.
    counts = csr_matrix((len(wi), len(ci)), dtype=np.float32)
    tmp_counts = dok_matrix((len(wi), len(ci)), dtype=np.float32)
    update_threshold = 100000
    i = 0
    with open(self.count_pair_file) as f:
        for line in f:
            count, word, context = line.strip().split()
            if word in wi and context in ci:
                tmp_counts[wi[word], ci[context]] = int(count)
                i += 1
                if i == update_threshold:
                    counts = counts + tmp_counts.tocsr()
                    tmp_counts = dok_matrix((len(wi), len(ci)), dtype=np.float32)
                    i = 0
    counts = counts + tmp_counts.tocsr()

    pmi = self.calc_pmi(counts, self.cds)
    save_matrix(self.pmi_file, pmi)
    save_vocabulary(self.pmi_file + '.words.vocab', iw)
    save_vocabulary(self.pmi_file + '.contexts.vocab', ic)

    # Reload the saved PMI matrix as a PPMI representation and persist it.
    self.explicit = PositiveExplicit(self.pmi_file, normalize=False, neg=self.neg)
    cf.saveDictionary(self.explicit,
                      self.dict_name.split('/')[0] + '/' + self.dict_name.split('/')[1] + '_explicit_ppmi.bin')
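# A standalone illustration, not code from this repository, of the buffered
# accumulation pattern used in _counts2PMI above: a dok_matrix absorbs
# incremental writes cheaply and is periodically folded into a csr_matrix so
# the dictionary buffer stays small. Shapes, `flush_every`, and the triple
# stream are made up for the example.
import numpy as np
from scipy.sparse import csr_matrix, dok_matrix

def accumulate_counts(triples, n_rows, n_cols, flush_every=100000):
    counts = csr_matrix((n_rows, n_cols), dtype=np.float32)
    buf = dok_matrix((n_rows, n_cols), dtype=np.float32)
    pending = 0
    for count, row, col in triples:
        buf[row, col] += count
        pending += 1
        if pending == flush_every:
            counts = counts + buf.tocsr()   # fold the buffer into the CSR total
            buf = dok_matrix((n_rows, n_cols), dtype=np.float32)
            pending = 0
    return counts + buf.tocsr()

# Example: accumulate_counts([(3, 0, 1), (5, 2, 0)], n_rows=4, n_cols=4)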
def main():
    args = docopt("""
    Usage:
        text2numpy.py <path>
    """)

    path = args['<path>']
    matrix = read_vectors(path)
    iw = sorted(matrix.keys())

    # Stack the word vectors into a dense matrix, one row per (sorted) word.
    new_matrix = np.zeros(shape=(len(iw), len(matrix[iw[0]])), dtype=np.float32)
    for i, word in enumerate(iw):
        if word in matrix:
            new_matrix[i, :] = matrix[word]

    # Abort if any vector contains NaNs, reporting the offending rows.
    if np.isnan(new_matrix).any():
        print("Warning! {0} contains 1 or more `nan` values!".format(path))
        truth = np.isnan(new_matrix)
        rows = np.array([np.any(x) for x in truth], dtype=bool).nonzero()[0]
        print("Target includes {0} rows: {1}".format(rows.shape[0], rows))
        print(new_matrix[0])
        exit(-1)

    np.save(path + '.npy', new_matrix)
    save_vocabulary(path + '.vocab', iw)
def main():
    args = docopt("""
    Usage:
        counts2pmi.py [options] <counts> <output_path>

    Options:
        --cds NUM    Context distribution smoothing [default: 1.0]
    """)

    counts_path = args['<counts>']
    vectors_path = args['<output_path>']
    cds = float(args['--cds'])

    # Rewrite the counts file with normalized line endings before reading it.
    o = open(counts_path + '-new', "w")
    for line in open(counts_path):
        o.write(line.strip() + "\n")
    o.close()
    counts_path_new = counts_path + '-new'

    counts, iw, ic = read_counts_matrxi_fast(counts_path, counts_path_new)
    pmi = calc_pmi(counts, cds)

    save_matrix(vectors_path, pmi)
    save_vocabulary(vectors_path + '.words.vocab', iw)
    save_vocabulary(vectors_path + '.contexts.vocab', ic)
    savePmiNonzeroTerm_fast(counts, vectors_path + '.cooccurrence')

    # calc_pmi is treated here as returning the un-logged PMI ratio: remember
    # which entries exceed 1 (i.e. have positive PMI), then take the log.
    remain_index = pmi.data > 1
    pmi.data = np.log(pmi.data)
    savePmiNonzeroTerm_fast(pmi, vectors_path + '.PMI')

    # Keep only the co-occurrence counts whose PMI is positive (this assumes
    # `counts` and `pmi` share the same sparsity structure).
    counts.data = counts.data * remain_index
    counts.eliminate_zeros()
    savePmiNonzeroTerm_fast(counts, vectors_path + '.PPMIcooccurrence')

    # Clip negative PMI values to obtain PPMI and drop the resulting zeros.
    pmi.data[pmi.data < 0] = 0
    pmi.eliminate_zeros()
    savePmiNonzeroTerm_fast(pmi, vectors_path + '.PPMI')
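# A self-contained illustration (not taken from this repository) of the PPMI
# clipping performed at the end of main() above: negative entries of a sparse
# PMI matrix are zeroed and then removed from the sparsity structure. The toy
# ratio matrix below is made up for the example.
import numpy as np
from scipy.sparse import csr_matrix

ratios = np.array([[4.0, 0.5, 1.0],
                   [0.25, 2.0, 8.0]])
pmi = csr_matrix(np.log(ratios))   # PMI = log of the co-occurrence ratio
pmi.data[pmi.data < 0] = 0         # clip negative PMI values -> PPMI
pmi.eliminate_zeros()              # drop the explicit zeros created by clipping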
def text2numpy_nonewline(path):
    matrix = read_vectors(path)
    iw = sorted(matrix.keys())

    new_matrix = np.zeros(shape=(len(iw), len(matrix[iw[0]])), dtype=np.float32)
    for i, word in enumerate(iw):
        if word in matrix:
            new_matrix[i, :] = matrix[word]

    npy_file = path + '.npy'
    vocab_file = path + '.vocab'
    np.save(npy_file, new_matrix)
    save_vocabulary(vocab_file, iw)
    return [npy_file, vocab_file]
def main():
    args = docopt("""
    Usage:
        text2numpy.py <path>
    """)

    path = args['<path>']
    matrix = read_vectors(path)
    iw = sorted(matrix.keys())

    new_matrix = np.zeros(shape=(len(iw), len(matrix[iw[0]])), dtype=np.float32)
    for i, word in enumerate(iw):
        if word in matrix:
            new_matrix[i, :] = matrix[word]

    np.save(path + '.npy', new_matrix)
    save_vocabulary(path + '.vocab', iw)
def main():
    args = docopt("""
    Usage:
        counts2ica.py [options] <counts> <output_path>

    Options:
        --cps NUM    Number of ICA components to obtain [default: 50]
    """)

    counts_path = args['<counts>']
    vectors_path = args['<output_path>']

    counts, iw, ic = read_counts_matrix(counts_path)
    # docopt returns option values as strings, so cast the component count.
    embeddings = calc_ica(counts, int(args['--cps']))

    save_matrix(vectors_path, embeddings)
    save_vocabulary(vectors_path + '.words.vocab', iw)
    save_vocabulary(vectors_path + '.contexts.vocab', ic)
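# calc_ica is a project-local helper; the sketch below is only one plausible
# way to realize it with scikit-learn's FastICA and is an assumption for
# illustration, not this repository's implementation. Densifying the counts
# matrix is required by FastICA and may be expensive for large vocabularies.
import numpy as np
from sklearn.decomposition import FastICA

def ica_sketch(counts, n_components=50):
    dense = np.asarray(counts.todense(), dtype=np.float64)  # FastICA expects a dense array
    ica = FastICA(n_components=int(n_components), random_state=0)
    return ica.fit_transform(dense)   # one n_components-dimensional row per word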
def main():
    args = docopt("""
    Usage:
        counts2pmi.py [options] <counts> <output_path>

    Options:
        --cds NUM    Context distribution smoothing [default: 1.0]
    """)

    counts_path = args['<counts>']
    vectors_path = args['<output_path>']
    cds = float(args['--cds'])

    counts, iw, ic = read_counts_matrix(counts_path)
    pmi = calc_pmi(counts, cds)

    save_matrix(vectors_path, pmi)
    save_vocabulary(vectors_path + '.words.vocab', iw)
    save_vocabulary(vectors_path + '.contexts.vocab', ic)
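# calc_pmi is a project-local helper; the sketch below shows one standard way
# (the common "context distribution smoothing" recipe) to compute the un-logged
# PMI ratio from a sparse counts matrix. It is an assumption for illustration,
# not necessarily what this repository's calc_pmi does.
import numpy as np
from scipy.sparse import csr_matrix

def smoothed_pmi_ratio(counts, cds=0.75):
    counts = csr_matrix(counts, dtype=np.float64)
    sum_w = np.asarray(counts.sum(axis=1)).ravel()          # word marginals #(w)
    sum_c = np.asarray(counts.sum(axis=0)).ravel() ** cds   # smoothed context marginals #(c)**cds
    z = sum_c.sum()
    ratio = counts.tocoo()
    # ratio(w, c) = #(w, c) * z / (#(w) * #(c)**cds), over observed pairs only;
    # taking the log of .data gives PMI, and clipping negatives afterwards gives PPMI.
    ratio.data = ratio.data * z / (sum_w[ratio.row] * sum_c[ratio.col])
    return ratio.tocsr()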
def main():
    args = docopt("""
    Usage:
        pmi2svd.py [options] <pmi_path> <output_path>

    Options:
        --dim NUM    Dimensionality of eigenvectors [default: 500]
        --neg NUM    Number of negative samples; subtracts its log from PMI [default: 1]
    """)

    pmi_path = args['<pmi_path>']
    output_path = args['<output_path>']
    dim = int(args['--dim'])
    neg = int(args['--neg'])

    explicit = PositiveExplicit(pmi_path, normalize=False, neg=neg)
    ut, s, vt = sparsesvd(explicit.m.tocsc(), dim)

    np.save(output_path + '.ut.npy', ut)
    np.save(output_path + '.s.npy', s)
    np.save(output_path + '.vt.npy', vt)
    save_vocabulary(output_path + '.words.vocab', explicit.iw)
    save_vocabulary(output_path + '.contexts.vocab', explicit.ic)