import sys
from math import sqrt
from multiprocessing import Pool, Value
from random import Random

import numpy as np
import pandas as pd
import six
from docopt import docopt
from scipy.sparse import csr_matrix, dok_matrix

# Helpers such as load_vocabulary, load_count_vocabulary, init_net,
# train_process, save, UnigramTable, ngram_ngram, word_word, word_character,
# line2features, getNgram and check_word are defined elsewhere in the repo.


def read_counts_matrix(counts_path):
    """
    Reads the counts into a sparse matrix (CSR) from the
    count-word-context textual format.
    """
    words = load_count_vocabulary(counts_path + '.words.vocab')
    contexts = load_count_vocabulary(counts_path + '.contexts.vocab')
    words = list(words.keys())
    contexts = list(contexts.keys())
    iw = sorted(words)
    ic = sorted(contexts)
    wi = dict([(w, i) for i, w in enumerate(iw)])  # word -> row index
    ci = dict([(c, i) for i, c in enumerate(ic)])  # context -> column index
    counts = csr_matrix((len(wi), len(ci)), dtype=np.float32)
    tmp_counts = dok_matrix((len(wi), len(ci)), dtype=np.float32)
    update_threshold = 100000
    i = 0
    with open(counts_path) as f:
        for line in f:
            count, word, context = line.strip().split()
            if word in wi and context in ci:
                tmp_counts[wi[word], ci[context]] = int(count)
            i += 1
            # Fold the DOK buffer into the CSR matrix in batches to keep
            # memory bounded on large counts files.
            if i == update_threshold:
                counts = counts + tmp_counts.tocsr()
                tmp_counts = dok_matrix((len(wi), len(ci)), dtype=np.float32)
                i = 0
    counts = counts + tmp_counts.tocsr()
    return counts, iw, ic
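# Hedged usage sketch for read_counts_matrix. Assumptions (not in the
# original code): the counts file holds whitespace-separated
# "count word context" triples, with companion '<path>.words.vocab' and
# '<path>.contexts.vocab' files; the path and words below are illustrative.
def _demo_read_counts_matrix():
    counts, iw, ic = read_counts_matrix('data/counts')
    print(counts.shape)  # (len(iw), len(ic))
    print(counts[iw.index('dog'), ic.index('cat')])  # raw co-occurrence count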
def main(): args = docopt(""" Usage: word2vecf.py [options] <pairs> <words> <contexts> <outputs> Options: --processes_num NUM The number of processes [default: 12] --negative NUM Negative sampling [default: 5] --size NUM Embedding size [default: 300] --iters NUM The number of iterations [default: 1] """) words_path = args['<words>'] contexts_path = args['<contexts>'] pairs_path = args['<pairs>'] outputs_path = args['<outputs>'] size = int(args['--size']) processes_num = int(args['--processes_num']) negative = int(args['--negative']) iters = int(args['--iters']) w2i, i2w = load_vocabulary(words_path) c2i, i2c = load_vocabulary(contexts_path) words = load_count_vocabulary(words_path) contexts = load_count_vocabulary(contexts_path) pairs_num = 0 with open(pairs_path, 'r') as f: for l in f: pairs_num += 1 global_word_count = Value('l', 0) alpha = 0.025 syn0, syn1 = init_net(size, len(words), len(contexts)) table = UnigramTable(i2c, contexts) print() for i in range(iters): pool = Pool(processes=processes_num, initializer=__init_process, initargs=(w2i, c2i, syn0, syn1, table, negative, size, alpha, processes_num, global_word_count, pairs_num, iters, pairs_path)) pool.map(train_process, range(processes_num)) save(i2w, syn0, outputs_path) print("word2vecf finished")
def c2p(args, tid):
    pairs_file = open(args['<pairs>'] + "_" + str(tid), 'w')
    threads_num = int(args['--threads_num'])
    subsample = float(args['--sub'])
    sub = subsample != 0
    vocab = load_count_vocabulary(args['<vocab>'])  # load vocabulary (generated in corpus2vocab stage)
    train_uni_num = 0  # number of (unigram) tokens in corpus
    for w, c in six.iteritems(vocab):
        if '@$' not in w:
            train_uni_num += c
    train_num = sum(vocab.values())  # number of (ngram) tokens in corpus
    subsample *= train_uni_num
    if sub:
        subsampler = dict([(word, 1 - sqrt(subsample / count))
                           for word, count in six.iteritems(vocab)
                           if count > subsample])  # subsampling technique
    if tid == 0:
        print('vocabulary size: ' + str(len(vocab)))
    with open(args['<corpus>']) as f:
        line_num = 0
        for line in f:
            line_num += 1
            if line_num % 1000 == 0 and tid == 0:
                sys.stdout.write("\r" + str(int(line_num / 1000)) + "K lines processed.")
                sys.stdout.flush()
            if line_num % threads_num != tid:
                continue
            ngram_ngram(line, args, vocab, pairs_file, sub, subsampler)
            # word_word(line, args, vocab, pairs_file, sub, subsampler)
            # word_text(line, args, vocab, pairs_file, sub, subsampler, line_num)
            # word_wordPos(line, args, vocab, pairs_file, sub, subsampler)
    pairs_file.close()
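# Worked example of the subsampling formula used in c2p. The subsampler
# stores, for each sufficiently frequent token, the probability of
# *discarding* an occurrence: 1 - sqrt(subsample / count), where subsample
# is the --sub threshold scaled by the unigram corpus size. The numbers
# below are made up for illustration.
def _demo_subsample_prob():
    threshold = 1e-5 * 1_000_000  # --sub 1e-5 on a 1M-token corpus -> 10.0
    count = 40_000                # occurrences of a very frequent word
    discard_prob = 1 - sqrt(threshold / count)
    print(round(discard_prob, 4))  # 0.9842 -> keep only ~1.6% of occurrences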
def c2p(args, tid):
    pairs_file = open(args['<pairs>'] + "_" + str(tid), 'w')
    threads_num = int(args['--threads_num'])
    subsample = float(args['--sub'])
    sub = subsample != 0
    vocab = load_count_vocabulary(args['<vocab>'])  # load vocabulary (generated in corpus2vocab stage)
    train_uni_num = 0  # number of (unigram) tokens in corpus
    for w, c in six.iteritems(vocab):
        if '@$' not in w:
            train_uni_num += c
    train_num = sum(vocab.values())  # number of (ngram) tokens in corpus
    subsample *= train_uni_num
    if sub:
        subsampler = dict([(word, 1 - sqrt(subsample / count))
                           for word, count in six.iteritems(vocab)
                           if count > subsample])  # subsampling technique
    if tid == 0:
        print('vocabulary size: ' + str(len(vocab)))
    with open(args['<corpus>']) as f:
        line_num = 0
        if tid == 0:
            print(str(line_num // 1000) + "K lines processed.")
        for line in f:
            line_num += 1
            if line_num % 1000 == 0 and tid == 0:
                # "\x1b[1A" moves the cursor up one line so the counter
                # overwrites itself in place.
                print("\x1b[1A" + str(line_num // 1000) + "K lines processed.")
            if line_num % threads_num != tid:
                continue
            line2features(line, args, vocab, pairs_file, sub, subsampler)
    pairs_file.close()
def main(): args = docopt(""" Usage: word2vecf.py [options] <pairs> <words> <contexts> <outputs> Options: --processes_num NUM The number of processes [default: 12] --negative NUM Negative sampling [default: 5] --size NUM Embedding size [default: 300] --iters NUM The number of iterations [default: 1] """) words_path = args['<words>'] contexts_path = args['<contexts>'] pairs_path = args['<pairs>'] outputs_path = args['<outputs>'] size = int(args['--size']) processes_num = int(args['--processes_num']) negative = int(args['--negative']) iters = int(args['--iters']) w2i, i2w = load_vocabulary(words_path) c2i, i2c = load_vocabulary(contexts_path) words = load_count_vocabulary(words_path) contexts = load_count_vocabulary(contexts_path) pairs_num = 0 with open(pairs_path, 'r') as f: for l in f: pairs_num += 1 global_word_count = Value('l', 0) alpha = 0.025 syn0, syn1 = init_net(size, len(words), len(contexts)) table = UnigramTable(i2c, contexts) print () for i in range(iters): pool = Pool(processes=processes_num, initializer=__init_process, initargs=(w2i, c2i, syn0, syn1, table, negative, size, alpha, processes_num, global_word_count, pairs_num, iters, pairs_path)) pool.map(train_process, range(processes_num)) save(i2w, syn0, outputs_path) print ("word2vecf finished")
def read_counts_matrix_fast(counts_path, counts_path_new):
    """
    Reads the counts into a sparse matrix (CSR) from the
    count-word-context textual format, parsing the whole file at once
    with pandas instead of looping line by line.
    """
    df = pd.read_csv(counts_path_new, sep=" ",
                     names=["num", "word", "context"],
                     converters={"num": np.float32, "word": str, "context": str},
                     header=None)
    words = load_count_vocabulary(counts_path + '.words.vocab')  # dict: word -> occurrence count
    contexts = load_count_vocabulary(counts_path + '.contexts.vocab')
    words = list(words.keys())
    contexts = list(contexts.keys())
    iw = sorted(words)  # sorted word list
    ic = sorted(contexts)  # sorted context list
    wi = pd.Series(index=iw, data=np.arange(len(iw)))  # maps word -> row index
    ci = pd.Series(index=ic, data=np.arange(len(ic)))  # maps context -> column index
    return (csr_matrix((df.num, (wi[df.word], ci[df.context])),
                       shape=(len(iw), len(ic)), dtype=np.float32),
            list(iw), list(ic))
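# Hedged equivalence check between the two readers above (paths are
# illustrative): the pandas-based reader builds the CSR matrix in one
# constructor call instead of batched dok_matrix updates, trading memory
# for speed, so both should produce the same matrix on the same input.
def _demo_compare_readers():
    slow, iw1, ic1 = read_counts_matrix('data/counts')
    fast, iw2, ic2 = read_counts_matrix_fast('data/counts', 'data/counts')
    assert iw1 == iw2 and ic1 == ic2
    assert (slow != fast).nnz == 0  # no differing entries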
def main(): args = docopt(""" Usage: word2vecf.py [options] <pairs> <words> <contexts> <outputs> Options: --negative NUM Negative sampling [default: 5] --size NUM Embedding size [default: 100] --iters NUM The number of iterations [default: 1] """) words_path = args['<words>'] contexts_path = args['<contexts>'] pairs_path = args['<pairs>'] outputs_path = args['<outputs>'] size = int(args['--size']) negative = int(args['--negative']) iters = int(args['--iters']) w2i, i2w = load_vocabulary(words_path) c2i, i2c = load_vocabulary(contexts_path) words = load_count_vocabulary(words_path) contexts = load_count_vocabulary(contexts_path) pairs_num = 0 with open(pairs_path, 'r') as f: for l in f: pairs_num += 1 alpha = 0.025 syn0, syn1 = init_net(size, len(words), len(contexts)) table = UnigramTable(i2c, contexts) for i in range(iters): train_process(pairs_path, size, syn0, syn1, w2i, c2i, table, alpha, negative, pairs_num, iters) save(i2w, syn0, outputs_path) print("word2vecf finished")
def main(): args = docopt(""" Usage: word2vecf.py [options] <pairs> <words> <contexts> <outputs> Options: --negative NUM Negative sampling [default: 5] --size NUM Embedding size [default: 100] --iters NUM The number of iterations [default: 1] """) words_path = args['<words>'] contexts_path = args['<contexts>'] pairs_path = args['<pairs>'] outputs_path = args['<outputs>'] size = int(args['--size']) negative = int(args['--negative']) iters = int(args['--iters']) w2i, i2w = load_vocabulary(words_path) c2i, i2c = load_vocabulary(contexts_path) words = load_count_vocabulary(words_path) contexts = load_count_vocabulary(contexts_path) pairs_num = 0 with open(pairs_path, 'r') as f: for l in f: pairs_num += 1 alpha = 0.025 syn0, syn1 = init_net(size, len(words), len(contexts)) table = UnigramTable(i2c, contexts) for i in range(iters): train_process(pairs_path, size, syn0, syn1, w2i, c2i, table, alpha, negative, pairs_num, iters) save(i2w, syn0, outputs_path) print ("word2vecf finished")
def c2p(args, tid):
    pairs_file = open(args['<pairs>'] + "_" + str(tid), 'w')
    # Features, also known as co-occurrence types, are critical to the
    # properties of word representations. Supports ngram-ngram, word-word,
    # word-character, and so on.
    feature = args['--feature']
    threads_num = int(args['--threads_num'])
    subsample = float(args['--sub'])
    sub = subsample != 0
    vocab = load_count_vocabulary(args['<vocab>'])  # load vocabulary (generated in corpus2vocab stage)
    train_uni_num = 0  # number of (unigram) tokens in corpus
    for w, c in six.iteritems(vocab):
        if '@$' not in w:
            train_uni_num += c
    train_num = sum(vocab.values())  # number of (ngram) tokens in corpus
    subsample *= train_uni_num
    if sub:
        subsampler = dict([(word, 1 - sqrt(subsample / count))
                           for word, count in six.iteritems(vocab)
                           if count > subsample])  # subsampling technique
    if tid == 0:
        print('vocabulary size: ' + str(len(vocab)))
    with open(args['<corpus>']) as f:
        line_num = 0
        for line in f:
            line_num += 1
            if line_num % 1000 == 0 and tid == 0:
                sys.stdout.write("\r" + str(int(line_num / 1000)) + "K lines processed.")
                sys.stdout.flush()
            if line_num % threads_num != tid:
                continue
            if feature == 'ngram-ngram':
                ngram_ngram(line, args, vocab, pairs_file, sub, subsampler)
            elif feature == 'word-word':  # identical to word2vec
                word_word(line, args, vocab, pairs_file, sub, subsampler)
            elif feature == 'word-character':  # similar to fastText
                word_character(line, args, vocab, pairs_file, sub, subsampler)
            else:
                break
            # word_text(line, args, vocab, pairs_file, sub, subsampler, line_num)
            # word_wordPos(line, args, vocab, pairs_file, sub, subsampler)
    pairs_file.close()
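# Illustration of the co-occurrence types dispatched above, for the
# sentence "the cat sat" with a window of 1. The pair formats are assumed
# (the '@$' ngram separator is inferred from the vocabulary filter above):
#   word-word:      (cat, the), (cat, sat)            -- plain word2vec pairs
#   ngram-ngram:    (the@$cat, sat), (cat@$sat, the)  -- ngrams as units
#   word-character: (cat, c), (cat, a), (cat, t)      -- fastText-style subunits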
def main(): args = docopt(""" Usage: corpus2pairs.py [options] <corpus> <vocab> <pairs> Options: --win NUM Window size [default: 2] --sub NUM Subsampling threshold [default: 0] --ngram_word NUM (Center) word vocabulary includes grams of 1st to nth order [default: 1] --ngram_context NUM Context vocabulary includes grams of 1st to nth order [default: 1] --overlap Whether overlaping pairs are allowed or not """) print("**********************") print("corpus2pairs") pairs_file = open(args['<pairs>'], 'w') subsample = float(args['--sub']) sub = subsample != 0 vocab = load_count_vocabulary( args['<vocab>']) #load vocabulary (generated in corpus2vocab stage) train_uni_num = 0 #number of (unigram) tokens in corpus for w, c in six.iteritems(vocab): if '@$' not in w: train_uni_num += c train_num = sum(vocab.values()) #number of (ngram) tokens in corpus subsample *= train_uni_num if sub: subsampler = dict([(word, 1 - sqrt(subsample / count)) for word, count in six.iteritems(vocab) if count > subsample]) #subsampling technique print('vocabulary size: ' + str(len(vocab))) with open(args['<corpus>']) as f: line_num = 0 print(str(int(line_num / 1000**1)) + "K lines processed.") for line in f: line_num += 1 if ((line_num) % 1000) == 0: print("\x1b[1A" + str(int(line_num / 1000)) + "K lines processed.") line2features(line, args, vocab, pairs_file, sub, subsampler) pairs_file.close() print("corpus2pairs finished")
def main(): args = docopt(""" Usage: corpus2pairs.py [options] <corpus> <vocab> <pairs> Options: --win NUM Window size [default: 2] --sub NUM Subsampling threshold [default: 0] --ngram_word NUM (Center) word vocabulary includes grams of 1st to nth order [default: 1] --ngram_context NUM Context vocabulary includes grams of 1st to nth order [default: 1] --overlap Whether overlaping pairs are allowed or not """) print ("**********************") print ("corpus2pairs") pairs_file = open(args['<pairs>'], 'w') subsample = float(args['--sub']) sub = subsample != 0 vocab = load_count_vocabulary(args['<vocab>']) #load vocabulary (generated in corpus2vocab stage) train_uni_num = 0 #number of (unigram) tokens in corpus for w, c in six.iteritems(vocab): if '@$' not in w: train_uni_num += c train_num = sum(vocab.values()) #number of (ngram) tokens in corpus subsample *= train_uni_num if sub: subsampler = dict([(word, 1 - sqrt(subsample / count)) for word, count in six.iteritems(vocab) if count > subsample]) #subsampling technique print ('vocabulary size: ' + str(len(vocab))) with open(args['<corpus>']) as f: line_num = 0 for line in f: line_num += 1 if ((line_num) % 1000) == 0: sys.stdout.write("\r" + str(int(line_num/1000)) + "K lines processed.") line2features(line, args, vocab, pairs_file, sub, subsampler) pairs_file.close() print ("corpus2pairs finished")
def load_counts(self, path):
    # Look for counts.words.vocab in the same directory as `path`.
    count_path = path[:path.rfind('/') + 1] + 'counts.words.vocab'
    ng_freqs = load_count_vocabulary(count_path)  # dict: ngram -> frequency
    sz = sum(int(v) for v in ng_freqs.values())   # total token count
    return sz, ng_freqs
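# Hedged usage: for path 'output/sgns.words', count_path resolves to
# 'output/counts.words.vocab'; sz is the total token count and ng_freqs the
# per-ngram frequencies. The receiver and path below are illustrative.
#
#   sz, ng_freqs = model.load_counts('output/sgns.words')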
def c2p(args, tid):
    pairs_file = open(args['<pairs>'] + "_" + str(tid), 'w')
    win = int(args['--win'])
    subsample = float(args['--sub'])
    sub = subsample != 0
    ngram_word = int(args['--ngram_word'])
    ngram_context = int(args['--ngram_context'])
    overlap = args['--overlap']
    threads_num = int(args['--threads_num'])
    vocab = load_count_vocabulary(args['<vocab>'])  # load vocabulary (generated in corpus2vocab stage)
    train_uni_num = 0  # number of (unigram) tokens in corpus
    for w, c in six.iteritems(vocab):
        if '@$' not in w:
            train_uni_num += c
    train_num = sum(vocab.values())  # number of (ngram) tokens in corpus
    if tid == 0:
        print('vocabulary size: ' + str(len(vocab)))
        print('number of training words (uni-grams): ' + str(train_uni_num))
        print('number of training n-grams: ' + str(train_num))
    subsample *= train_uni_num
    if sub:
        subsampler = dict([(word, 1 - sqrt(subsample / count))
                           for word, count in six.iteritems(vocab)
                           if count > subsample])  # subsampling technique
    rnd = Random(17)
    with open(args['<corpus>']) as f:
        line_num = 0
        if tid == 0:
            print(str(line_num // 1000) + "K lines processed.")
        for line in f:
            line_num += 1
            if line_num % 1000 == 0 and tid == 0:
                print("\x1b[1A" + str(line_num // 1000) + "K lines processed.")
            if line_num % threads_num != tid:
                continue
            tokens = line.strip().split()
            for i in range(len(tokens)):  # loop over each position in a line
                for gram_word in range(1, ngram_word + 1):  # orders of the (center) word
                    word = getNgram(tokens, i, gram_word)
                    word = check_word(word, vocab, sub, subsampler, rnd)
                    if word is None:
                        continue
                    for gram_context in range(1, ngram_context + 1):  # orders of the context
                        start = i - win + gram_word - 1
                        end = i + win - gram_context + 1
                        for j in range(start, end + 1):
                            if overlap:
                                if i == j and gram_word == gram_context:
                                    continue
                            else:
                                # Skip pairs whose token spans intersect.
                                if len(set(range(i, i + gram_word)) &
                                       set(range(j, j + gram_context))) > 0:
                                    continue
                            context = getNgram(tokens, j, gram_context)
                            context = check_word(context, vocab, sub, subsampler, rnd)
                            if context is None:
                                continue
                            pairs_file.write(word + ' ' + context + "\n")  # write pair to file
    pairs_file.close()
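# Worked example of the overlap test in c2p above: with a center bigram at
# i=3 (covering positions {3, 4}) and a context unigram at j=4 (position
# {4}), the position sets intersect, so the pair is skipped unless
# --overlap is on. The numbers are made up for illustration.
def _demo_overlap_check():
    i, gram_word = 3, 2      # center ngram start position and order
    j, gram_context = 4, 1   # context ngram start position and order
    overlapping = len(set(range(i, i + gram_word)) &
                      set(range(j, j + gram_context))) > 0
    print(overlapping)  # True -> pair skipped when overlap is disallowed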