def __init__(self, path, normalize=True):
    self.wi, self.iw = load_vocabulary(path + '.words.vocab')
    self.ci, self.ic = load_vocabulary(path + '.contexts.vocab')
    self.m = load_matrix(path)
    self.m.data = np.log(self.m.data)
    self.normal = normalize
    if normalize:
        self.normalize()
def __init__(self, path, normalize=True, k=1):
    Explicit.__init__(self, path, False)
    self.wi, self.iw = load_vocabulary(path + '.words.vocab')
    self.ci, self.ic = load_vocabulary(path + '.contexts.vocab')
    self.m = load_matrix(path)
    self.m.data = self.m.data - np.log(k)
    # self.normal = normalize
    if normalize:
        self.normalize()
def __init__(self, path, normalize=True, glen=5):
    self.wi, self.iw = load_vocabulary(path + '.words.vocab')
    self.ci, self.ic = load_vocabulary(path + '.contexts.vocab')
    self.sz, self.ng_freqs = self.load_counts(path)
    self.m = load_matrix(path)
    self.m.data = np.log(self.m.data)
    self.normal = normalize
    self.glen = glen
    if normalize:
        self.normalize()
def __init__(self, path, normalize=True):
    Explicit.__init__(self, path, False)
    self.wi, self.iw = load_vocabulary(path + '.words.vocab')
    self.ci, self.ic = load_vocabulary(path + '.contexts.vocab')
    self.m = load_matrix(path)
    self.m.data = np.log(self.m.data)
    self.m.data[self.m.data <= 0] = 0
    self.m.data[self.m.data > 0] = 1
    # self.normal = normalize
    if normalize:
        self.normalize()
def main(): args = docopt(""" Usage: word2vecf.py [options] <pairs> <words> <contexts> <outputs> Options: --processes_num NUM The number of processes [default: 12] --negative NUM Negative sampling [default: 5] --size NUM Embedding size [default: 300] --iters NUM The number of iterations [default: 1] """) words_path = args['<words>'] contexts_path = args['<contexts>'] pairs_path = args['<pairs>'] outputs_path = args['<outputs>'] size = int(args['--size']) processes_num = int(args['--processes_num']) negative = int(args['--negative']) iters = int(args['--iters']) w2i, i2w = load_vocabulary(words_path) c2i, i2c = load_vocabulary(contexts_path) words = load_count_vocabulary(words_path) contexts = load_count_vocabulary(contexts_path) pairs_num = 0 with open(pairs_path, 'r') as f: for l in f: pairs_num += 1 global_word_count = Value('l', 0) alpha = 0.025 syn0, syn1 = init_net(size, len(words), len(contexts)) table = UnigramTable(i2c, contexts) print() for i in range(iters): pool = Pool(processes=processes_num, initializer=__init_process, initargs=(w2i, c2i, syn0, syn1, table, negative, size, alpha, processes_num, global_word_count, pairs_num, iters, pairs_path)) pool.map(train_process, range(processes_num)) save(i2w, syn0, outputs_path) print("word2vecf finished")
def __init__(self, path, normalize=True):
    self.tmp_m = []
    self.m = []
    for i in range(5):
        ind = (i + 1) * 100
        self.tmp_m.append(np.load(path + '_' + str(ind) + '.words.npy'))
    self.wi, self.iw = load_vocabulary(path + '_500.words.vocab')
    diff_norms = np.linalg.norm(self.tmp_m[4], ord=2, axis=1)
    p_scores = [np.percentile(diff_norms, i)
                for i in [0.0, 20.0, 40.0, 60.0, 80.0, 100.0]]
    for i in range(self.tmp_m[0].shape[0]):
        norm = diff_norms[i]
        ind = 5
        for j in range(len(p_scores)):
            if norm < p_scores[j]:
                ind = j
                break
        self.m.append(np.concatenate(
            (self.tmp_m[ind - 1][i], np.zeros(500 - ind * 100))))
    self.m = np.asarray(self.m)
    if normalize:
        self.normalize()
    self.dim = self.m.shape[1]
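# The constructor above assigns each row to a quintile bucket of its L2 norm with a
# manual loop over percentile edges. The standalone sketch below, on random data,
# shows a compact np.searchsorted formulation of the same bucketing idea; it is an
# illustration under that assumption, not code from the project.
import numpy as np

rng = np.random.default_rng(0)
vectors = rng.normal(size=(10, 500))
norms = np.linalg.norm(vectors, ord=2, axis=1)
edges = np.percentile(norms, [0.0, 20.0, 40.0, 60.0, 80.0, 100.0])
# bucket j means the norm falls at or below edges[j]; clip keeps the result in 1..5
buckets = np.clip(np.searchsorted(edges, norms, side='right'), 1, 5)
print(buckets)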
def load(cls, path, normalize=True, restricted_context=None, **kwargs):
    mat = load_matrix(path)
    word_vocab, context_vocab = load_vocabulary(mat, path)
    return cls(mat, word_vocab, context_vocab,
               normalize=normalize,
               restricted_context=restricted_context)
def __init__(self, path):
    self.m = []
    for line in open(path, 'r'):
        self.m.append([float(elem) for elem in line.split()[1:]])
    self.m = np.asarray(self.m)
    self.wi, self.iw = load_vocabulary(path + '.words.vocab')
    self.dim = self.m.shape[1]
    self.normalize()
def __init__(self, path, normalize=True, glen=5):
    self.m = np.load(path + '.npy')
    self.sz, self.ng_freqs = self.load_counts(path)
    self.glen = glen
    if normalize:
        self.normalize()
    self.dim = self.m.shape[1]
    self.wi, self.iw = load_vocabulary(path + '.vocab')
def main(): args = docopt(""" Usage: word2vecf.py [options] <pairs> <words> <contexts> <outputs> Options: --processes_num NUM The number of processes [default: 12] --negative NUM Negative sampling [default: 5] --size NUM Embedding size [default: 300] --iters NUM The number of iterations [default: 1] """) words_path = args['<words>'] contexts_path = args['<contexts>'] pairs_path = args['<pairs>'] outputs_path = args['<outputs>'] size = int(args['--size']) processes_num = int(args['--processes_num']) negative = int(args['--negative']) iters = int(args['--iters']) w2i, i2w = load_vocabulary(words_path) c2i, i2c = load_vocabulary(contexts_path) words = load_count_vocabulary(words_path) contexts = load_count_vocabulary(contexts_path) pairs_num = 0 with open(pairs_path, 'r') as f: for l in f: pairs_num += 1 global_word_count = Value('l', 0) alpha = 0.025 syn0, syn1 = init_net(size, len(words), len(contexts)) table = UnigramTable(i2c, contexts) print () for i in range(iters): pool = Pool(processes=processes_num, initializer=__init_process, initargs=(w2i, c2i, syn0, syn1, table, negative, size, alpha, processes_num, global_word_count, pairs_num, iters, pairs_path)) pool.map(train_process, range(processes_num)) save(i2w, syn0, outputs_path) print ("word2vecf finished")
def __init__(self, path, normalize=True, eig=0.0, transpose=False):
    if transpose:
        ut = np.load(path + '.vt.npy')
        self.wi, self.iw = load_vocabulary(path + '.contexts.vocab')
    else:
        ut = np.load(path + '.ut.npy')
        self.wi, self.iw = load_vocabulary(path + '.words.vocab')
    s = np.load(path + '.s.npy')

    if eig == 0.0:
        self.m = ut.T
    elif eig == 1.0:
        self.m = s * ut.T
    else:
        self.m = np.power(s, eig) * ut.T
    self.dim = self.m.shape[1]

    diff_norms = np.linalg.norm(self.m, ord=2, axis=1)
    p_scores = [np.percentile(diff_norms, i)
                for i in [0.0, 20.0, 40.0, 60.0, 80.0, 100.0]]
    print(self.m.shape)
    dim = [600, 700, 800, 900, 1000]
    # dim = [1000, 1000, 1000, 1000, 1000]
    for i in range(self.m.shape[0]):
        norm = diff_norms[i]
        # ind = [j for j in range(len(p_scores)) if p_scores[j] > norm]
        # ind = ind[0]
        ind = 0
        for j in range(len(p_scores)):
            if norm < p_scores[j]:
                ind = j
                break
        # print(ind)
        self.m[i] = ut.T[i] * np.power(
            np.concatenate((s[:dim[ind - 1]],
                            np.zeros(self.dim - dim[ind - 1]))), eig)
    if normalize:
        self.normalize()
def __init__(self, path, pmi, normalize=True, neg=1):
    self.wi, self.iw = load_vocabulary(path + '.words.vocab')
    self.m = pmi
    self.m.data = np.log(self.m.data)
    self.m.data -= np.log(neg)
    self.m.data[self.m.data < 0] = 0
    self.m.eliminate_zeros()
    if normalize:
        self.normalize()
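# A small, self-contained sketch of the shifted-PPMI transform performed by the
# constructor above, applied to a toy ratio matrix with a shift of neg=5. Only the
# transform is shown; vocabulary loading and normalization are project helpers and
# the numbers are made up.
import numpy as np
from scipy.sparse import csr_matrix

neg = 5
pmi = csr_matrix(np.array([[8.0, 0.2, 0.0],
                           [0.0, 12.0, 3.0]]))   # hypothetical P(w,c) / (P(w) * P(c)) ratios
pmi.data = np.log(pmi.data)        # PMI on the stored (nonzero) entries
pmi.data -= np.log(neg)            # shift by log k, as in SGNS with k negative samples
pmi.data[pmi.data < 0] = 0         # clip to obtain shifted positive PMI
pmi.eliminate_zeros()              # drop entries that were clipped to zero
print(pmi.toarray())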
def read_counts_matrix(words_path, contexts_path, counts_path):
    wi, iw = load_vocabulary(words_path)
    ci, ic = load_vocabulary(contexts_path)

    counts_num = 0
    row = []
    col = []
    data = []
    with open(counts_path) as f:
        print str(counts_num / 1000**2) + "M counts processed."
        for line in f:
            if counts_num % 1000**2 == 0:
                print "\x1b[1A" + str(counts_num / 1000**2) + "M counts processed."
            word, context, count = line.strip().split()
            row.append(int(word))
            col.append(int(context))
            data.append(int(float(count)))
            counts_num += 1
    counts = csr_matrix((data, (row, col)), shape=(len(wi), len(ci)), dtype=np.float32)
    return counts
def __init__(self, path, normalize=True, eig=0.0, transpose=False):
    if transpose:
        ut = np.load(path + '.vt.npy')
        self.wi, self.iw = load_vocabulary(path + '.contexts.vocab')
    else:
        ut = np.load(path + '.ut.npy')
        self.wi, self.iw = load_vocabulary(path + '.words.vocab')
    s = np.load(path + '.s.npy')

    if eig == 0.0:
        self.m = ut.T
    elif eig == 1.0:
        self.m = s * ut.T
    else:
        self.m = np.power(s, eig) * ut.T
    self.dim = self.m.shape[1]
    if normalize:
        self.normalize()
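# A standalone sketch of the eigenvalue-weighting step used above: given the SVD
# factors of a (toy, dense) matrix, the word vectors are U * S**eig, which is what
# np.power(s, eig) * ut.T computes when ut holds U transposed. The on-disk file
# names and the surrounding class are project-specific and not reproduced here.
import numpy as np

rng = np.random.default_rng(1)
m = rng.random((6, 8))                      # stand-in for a PPMI-like matrix
u, s, vt = np.linalg.svd(m, full_matrices=False)
ut = u.T                                    # the code above loads this from '<path>.ut.npy'

for eig in (0.0, 0.5, 1.0):
    vectors = np.power(s, eig) * ut.T       # rows are word vectors; eig=0 keeps U, eig=1 gives U*S
    print(eig, vectors.shape)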
def read_counts_matrix(words_path, contexts_path, counts_path):
    wi, iw = load_vocabulary(words_path)
    ci, ic = load_vocabulary(contexts_path)

    counts_num = 0
    row = []
    col = []
    data = []
    with open(counts_path) as f:
        print str(counts_num / 1000**2) + "M counts processed."
        for line in f:
            if counts_num % 1000**2 == 0:
                print "\x1b[1A" + str(counts_num / 1000**2) + "M counts processed."
            word, context, count = line.strip().split()
            row.append(int(word))
            col.append(int(context))
            data.append(int(float(count)))
            counts_num += 1
    counts = csr_matrix((data, (row, col)), shape=(len(wi), len(ci)), dtype=np.float32)
    return counts
def main(): args = docopt(""" Usage: word2vecf.py [options] <pairs> <words> <contexts> <outputs> Options: --negative NUM Negative sampling [default: 5] --size NUM Embedding size [default: 100] --iters NUM The number of iterations [default: 1] """) words_path = args['<words>'] contexts_path = args['<contexts>'] pairs_path = args['<pairs>'] outputs_path = args['<outputs>'] size = int(args['--size']) negative = int(args['--negative']) iters = int(args['--iters']) w2i, i2w = load_vocabulary(words_path) c2i, i2c = load_vocabulary(contexts_path) words = load_count_vocabulary(words_path) contexts = load_count_vocabulary(contexts_path) pairs_num = 0 with open(pairs_path, 'r') as f: for l in f: pairs_num += 1 alpha = 0.025 syn0, syn1 = init_net(size, len(words), len(contexts)) table = UnigramTable(i2c, contexts) for i in range(iters): train_process(pairs_path, size, syn0, syn1, w2i, c2i, table, alpha, negative, pairs_num, iters) save(i2w, syn0, outputs_path) print("word2vecf finished")
def load(cls, path, normalize=True, restricted_context=None, thresh=None, neg=1):
    mat = load_matrix(path, thresh)
    word_vocab, context_vocab = load_vocabulary(mat, path)
    return cls(mat, word_vocab, context_vocab, normalize, restricted_context, neg=neg)
def main(): args = docopt(""" Usage: word2vecf.py [options] <pairs> <words> <contexts> <outputs> Options: --negative NUM Negative sampling [default: 5] --size NUM Embedding size [default: 100] --iters NUM The number of iterations [default: 1] """) words_path = args['<words>'] contexts_path = args['<contexts>'] pairs_path = args['<pairs>'] outputs_path = args['<outputs>'] size = int(args['--size']) negative = int(args['--negative']) iters = int(args['--iters']) w2i, i2w = load_vocabulary(words_path) c2i, i2c = load_vocabulary(contexts_path) words = load_count_vocabulary(words_path) contexts = load_count_vocabulary(contexts_path) pairs_num = 0 with open(pairs_path, 'r') as f: for l in f: pairs_num += 1 alpha = 0.025 syn0, syn1 = init_net(size, len(words), len(contexts)) table = UnigramTable(i2c, contexts) for i in range(iters): train_process(pairs_path, size, syn0, syn1, w2i, c2i, table, alpha, negative, pairs_num, iters) save(i2w, syn0, outputs_path) print ("word2vecf finished")
def main(): args = docopt(""" Usage: text2numpy.py <path> """) path = args['<path>'] matrix = read_vectors(path) wi, iw = load_vocabulary(path + ".vocab") new_matrix = np.zeros(shape=(len(iw), len(matrix[iw[0]])), dtype=np.float32) for i, word in enumerate(iw): if word in matrix: new_matrix[i, :] = matrix[word] np.save(path + '.npy', new_matrix)
def load(cls, path, normalize=True, restricted_context=None, thresh=None, neg=1):
    # This line produces an error because load_matrix takes only one argument:
    # mat = load_matrix(path, thresh)
    # Changed to:
    mat = load_matrix(path)
    word_vocab, context_vocab = load_vocabulary(mat, path)
    return cls(mat, word_vocab, context_vocab, normalize, restricted_context, neg=neg)
def __init__(self, path, normalize=True):
    self.m = np.load(path + '.npy')
    if normalize:
        self.normalize()
    self.dim = self.m.shape[1]
    self.wi, self.iw = load_vocabulary(path + '.vocab')
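# A minimal sketch of what normalize() is assumed to do for the dense embeddings
# loaded above (unit L2 norm per row), and of how normalized rows turn cosine
# similarity into a plain dot product. This mirrors the usual hyperwords-style
# convention but is written here as an assumption, not the project's code.
import numpy as np

m = np.array([[3.0, 4.0],
              [1.0, 0.0],
              [0.0, 0.0]])                 # toy embedding matrix, one row per word
norms = np.linalg.norm(m, axis=1, keepdims=True)
norms[norms == 0] = 1.0                    # guard against all-zero rows
m_normed = m / norms
print(m_normed @ m_normed[0])              # cosine similarities to the first word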
def main(): # get all parameters. args = docopt(""" Usage: pairs2counts.py [options] <pairs> <vocab_word> <vocab_context> <counts> Options: --memory_size NUM Memory size available [default: 8.0] """) print "**********************" print "pairs2counts" wi, iw =load_vocabulary(args['<vocab_word>']) ci, ic = load_vocabulary(args['<vocab_context>']) max_product = 10000 memory_size = float(args['--memory_size']) * 1000**3 D = {} #store bottom-right part of co-occurrence matrix in dictionary tmpfile_num = 1 memory_size_used = 0 #store top-left corner of co-occurrence matrix in array, which is the strategy used in GloVe lookup = [0,] for i in xrange(len(iw)): if max_product / (i + 1) == 0: break if max_product / (i + 1) > len(iw): lookup.append(lookup[-1] + len(iw)) else: lookup.append(lookup[-1] + max_product / (i + 1)) M = np.zeros(lookup[-1] + 1, dtype=np.int32) with open(args['<pairs>']) as f: pairs_num = 0 print str(pairs_num/1000**2) + "M pairs processed." for line in f: pairs_num += 1 if pairs_num % 1000**2 == 0: print "\x1b[1A" + str(pairs_num/1000**2) + "M pairs processed." if getsizeof(D) + memory_size_used + getsizeof(M) > memory_size * 0.8: #write dictionary to disk when memory is insufficient with open(args['<counts>'] + '_' + str(tmpfile_num), 'wb') as f: tmp_sorted = sorted(D.keys()) for i in tmp_sorted: pickle.dump((i, D[i]), f, True) D.clear() memory_size_used = 0 tmpfile_num += 1 pair = line.strip().split() word_index = wi[pair[0]] context_index = ci[pair[1]] if (word_index + 1) * (context_index + 1) <= max_product: #store top-left corner in M, which stays in memory all time M[lookup[word_index] + context_index] += 1 else: #store bottom-right part in D, which is written to disk when memory is insufficient if word_index in D: tmp_size = getsizeof(D[word_index]) D[word_index].update({context_index: 1}) memory_size_used += getsizeof(D[word_index]) - tmp_size #estimate the size of memory used else: D[word_index] = Counter({context_index: 1}) memory_size_used += getsizeof(D[word_index]) with open(args['<counts>'] + '_' + str(tmpfile_num), 'wb') as f: tmp_sorted = sorted(D.keys()) for i in tmp_sorted: pickle.dump((i, D[i]), f, True) D.clear() tmpfile_num += 1 for i in xrange(len(lookup)): #transform M to dictionary structure D[i] = Counter() if i == len(lookup) - 1: if M[lookup[i] + j] > 0: D[i].update({j: M[lookup[i] + j]}) break for j in xrange(lookup[i+1] - lookup[i]): if M[lookup[i] + j] > 0: D[i].update({j: M[lookup[i] + j]}) with open(args['<counts>'] + '_' + str(0), 'wb') as f: #write top-left corner to disk tmp_sorted = sorted(D.keys()) for i in tmp_sorted: pickle.dump((i, D[i]), f, True) D.clear() #merge tmpfiles to co-occurrence matrix tmpfiles = [] top_buffer = [] #store top elements of tmpfiles counts_num = 0 counts_file = open(args['<counts>'], 'w') for i in xrange(tmpfile_num): tmpfiles.append(open(args['<counts>'] + '_' + str(i), 'rb')) top_buffer.append(pickle.load(tmpfiles[i])) old = top_buffer[0] top_buffer[0] = pickle.load(tmpfiles[0]) print str(counts_num/1000**2) + "M counts processed." 
while True: arg_min = np.argmin(np.asarray([c[0] for c in top_buffer])) #find the element with smallest key (center word) if top_buffer[arg_min][0] == old[0]: #merge values when keys are the same old[1].update(top_buffer[arg_min][1]) else: tmp_sorted = sorted(old[1].keys()) #write the old element when keys are different (which means all pairs whose center words are [old.key] are aggregated) for w in tmp_sorted: counts_num += 1 if counts_num % 1000**2 == 0: print "\x1b[1A" + str(counts_num/1000**2) + "M counts processed." counts_file.write(str(old[0]) + " " + str(w) + " " + str(old[1][w]) + "\n") old = top_buffer[arg_min] try: top_buffer[arg_min] = pickle.load(tmpfiles[arg_min]) except EOFError: #when elements in file are exhausted top_buffer[arg_min] = (np.inf, Counter()) tmpfile_num -= 1 if tmpfile_num == 0: tmp_sorted = sorted(old[1].keys()) for w in tmp_sorted: counts_num += 1 counts_file.write(str(old[0]) + " " + str(w) + " " + str(old[1][w]) + "\n") break counts_file.close() print "number of counts: ", counts_num for i in xrange(len(top_buffer)): #remove tmpfiles os.remove(args['<counts>'] + '_' + str(i)) print "pairs2counts finished"
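# The merge loop above repeatedly scans top_buffer with np.argmin to find the temp
# file whose next (word_id, Counter) record has the smallest key. The standalone
# sketch below shows the same idea with heapq.merge over already-sorted streams; it
# runs on in-memory lists purely for illustration, not on the pickled temp files the
# script writes.
import heapq
from collections import Counter
from itertools import groupby

stream_a = [(0, Counter({1: 2})), (2, Counter({0: 1}))]   # sorted by center-word id
stream_b = [(0, Counter({3: 4})), (1, Counter({1: 1}))]

merged = heapq.merge(stream_a, stream_b, key=lambda rec: rec[0])
for word_id, group in groupby(merged, key=lambda rec: rec[0]):
    total = Counter()
    for _, partial in group:
        total.update(partial)               # aggregate partial context counts per word
    for context_id in sorted(total):
        print(word_id, context_id, total[context_id])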
def main(): # get all parameters. args = docopt(""" Usage: pairs2counts.py [options] <pairs> <vocab_word> <vocab_context> <counts> Options: --memory_size NUM Memory size available [default: 8.0] """) print "**********************" print "pairs2counts" wi, iw = load_vocabulary(args['<vocab_word>']) ci, ic = load_vocabulary(args['<vocab_context>']) memory_size = float(args['--memory_size']) * 1000**3 D = {} #store co-occurrence matrix in dictionary tmpfile_num = 0 memory_size_used = 0 with open(args['<pairs>']) as f: pairs_num = 0 print str(pairs_num / 1000**2) + "M pairs processed." for line in f: pairs_num += 1 if pairs_num % 1000**2 == 0: print "\x1b[1A" + str( pairs_num / 1000**2) + "M pairs processed." if getsizeof( D ) + memory_size_used > memory_size * 0.8: #write dictionary to disk when memory is insufficient with open(args['<counts>'] + '_' + str(tmpfile_num), 'wb') as f: tmp_sorted = sorted(D.keys()) for i in tmp_sorted: pickle.dump((i, D[i]), f, True) D.clear() memory_size_used = 0 tmpfile_num += 1 pair = line.strip().split() word_index = wi[pair[0]] context_index = ci[pair[1]] if word_index in D: tmp_size = getsizeof(D[word_index]) D[word_index].update({context_index: 1}) memory_size_used += getsizeof( D[word_index] ) - tmp_size #estimate the size of memory used else: D[word_index] = Counter({context_index: 1}) memory_size_used += getsizeof(D[word_index]) with open(args['<counts>'] + '_' + str(tmpfile_num), 'wb') as f: tmp_sorted = sorted(D.keys()) for i in tmp_sorted: pickle.dump((i, D[i]), f, True) D.clear() tmpfile_num += 1 #merge tmpfiles to co-occurrence matrix tmpfiles = [] top_buffer = [] #store top elements of tmpfiles counts_num = 0 counts_file = open(args['<counts>'], 'w') for i in xrange(tmpfile_num): tmpfiles.append(open(args['<counts>'] + '_' + str(i), 'rb')) top_buffer.append(pickle.load(tmpfiles[i])) old = top_buffer[0] top_buffer[0] = pickle.load(tmpfiles[0]) print str(counts_num / 1000**2) + "M counts processed." while True: arg_min = np.argmin(np.asarray([ c[0] for c in top_buffer ])) #find the element with smallest key (center word) if top_buffer[arg_min][0] == old[ 0]: #merge values when keys are the same old[1].update(top_buffer[arg_min][1]) else: tmp_sorted = sorted( old[1].keys() ) #write the old element when keys are different (which means all pairs whose center words are [old.key] are aggregated) for w in tmp_sorted: counts_num += 1 if counts_num % 1000**2 == 0: print "\x1b[1A" + str( counts_num / 1000**2) + "M counts processed." counts_file.write( str(old[0]) + " " + str(w) + " " + str(old[1][w]) + "\n") old = top_buffer[arg_min] try: top_buffer[arg_min] = pickle.load(tmpfiles[arg_min]) except EOFError: #when elements in file are exhausted top_buffer[arg_min] = (np.inf, Counter()) tmpfile_num -= 1 if tmpfile_num == 0: tmp_sorted = sorted(old[1].keys()) for w in tmp_sorted: counts_num += 1 counts_file.write( str(old[0]) + " " + str(w) + " " + str(old[1][w]) + "\n") break counts_file.close() print "number of counts: ", counts_num for i in xrange(len(top_buffer)): #remove tmpfiles os.remove(args['<counts>'] + '_' + str(i)) print "pairs2counts finished"
def main(): # get all parameters. args = docopt(""" Usage: pairs2counts.py [options] <pairs> <vocab_word> <vocab_context> <counts> Options: --memory_size NUM Memory size available [default: 8.0] """) print "**********************" print "pairs2counts" wi, iw =load_vocabulary(args['<vocab_word>']) ci, ic = load_vocabulary(args['<vocab_context>']) memory_size = float(args['--memory_size']) * 1000**3 D = {} #store co-occurrence matrix in dictionary tmpfile_num = 0 memory_size_used = 0 with open(args['<pairs>']) as f: pairs_num = 0 print str(pairs_num/1000**2) + "M pairs processed." for line in f: pairs_num += 1 if pairs_num % 1000**2 == 0: print "\x1b[1A" + str(pairs_num/1000**2) + "M pairs processed." if getsizeof(D) + memory_size_used > memory_size * 0.8: #write dictionary to disk when memory is insufficient with open(args['<counts>'] + '_' + str(tmpfile_num), 'wb') as f: tmp_sorted = sorted(D.keys()) for i in tmp_sorted: pickle.dump((i, D[i]), f, True) D.clear() memory_size_used = 0 tmpfile_num += 1 pair = line.strip().split() word_index = wi[pair[0]] context_index = ci[pair[1]] if word_index in D: tmp_size = getsizeof(D[word_index]) D[word_index].update({context_index: 1}) memory_size_used += getsizeof(D[word_index]) - tmp_size #estimate the size of memory used else: D[word_index] = Counter({context_index: 1}) memory_size_used += getsizeof(D[word_index]) with open(args['<counts>'] + '_' + str(tmpfile_num), 'wb') as f: tmp_sorted = sorted(D.keys()) for i in tmp_sorted: pickle.dump((i, D[i]), f, True) D.clear() tmpfile_num += 1 #merge tmpfiles to co-occurrence matrix tmpfiles = [] top_buffer = [] #store top elements of tmpfiles counts_num = 0 counts_file = open(args['<counts>'], 'w') for i in xrange(tmpfile_num): tmpfiles.append(open(args['<counts>'] + '_' + str(i), 'rb')) top_buffer.append(pickle.load(tmpfiles[i])) old = top_buffer[0] top_buffer[0] = pickle.load(tmpfiles[0]) print str(counts_num/1000**2) + "M counts processed." while True: arg_min = np.argmin(np.asarray([c[0] for c in top_buffer])) #find the element with smallest key (center word) if top_buffer[arg_min][0] == old[0]: #merge values when keys are the same old[1].update(top_buffer[arg_min][1]) else: tmp_sorted = sorted(old[1].keys()) #write the old element when keys are different (which means all pairs whose center words are [old.key] are aggregated) for w in tmp_sorted: counts_num += 1 if counts_num % 1000**2 == 0: print "\x1b[1A" + str(counts_num/1000**2) + "M counts processed." counts_file.write(str(old[0]) + " " + str(w) + " " + str(old[1][w]) + "\n") old = top_buffer[arg_min] try: top_buffer[arg_min] = pickle.load(tmpfiles[arg_min]) except EOFError: #when elements in file are exhausted top_buffer[arg_min] = (np.inf, Counter()) tmpfile_num -= 1 if tmpfile_num == 0: tmp_sorted = sorted(old[1].keys()) for w in tmp_sorted: counts_num += 1 counts_file.write(str(old[0]) + " " + str(w) + " " + str(old[1][w]) + "\n") break counts_file.close() print "number of counts: ", counts_num for i in xrange(len(top_buffer)): #remove tmpfiles os.remove(args['<counts>'] + '_' + str(i)) print "pairs2counts finished"