def line2features(line, args, vocab, pairs_file, sub, subsampler):
    """Write (center, context) n-gram pairs for one corpus line.

    For every position i, every center n-gram order up to --ngram_word and
    every context n-gram order up to --ngram_context, emits ``center context``
    lines to pairs_file.  When --overlap is falsy, any pair whose token spans
    overlap is skipped; when truthy, only the pair of identical spans is
    skipped.

    line       -- one raw text line of the corpus
    args       -- docopt-style dict with '--win', '--ngram_word',
                  '--ngram_context', '--overlap'
    vocab      -- token -> count mapping used by check_word
    pairs_file -- writable file object receiving "word context\n" lines
    sub        -- whether subsampling is enabled
    subsampler -- token -> drop-probability table (consulted by check_word)
    """
    win = int(args['--win'])
    ngram_word = int(args['--ngram_word'])
    ngram_context = int(args['--ngram_context'])
    overlap = args['--overlap']
    rnd = Random(17)  # fixed seed: per-line subsampling is reproducible
    tokens = line.strip().split()
    for i in range(len(tokens)):  # loop for each position in a line
        for gram_word in range(1, ngram_word + 1):  # center n-gram order
            word = getNgram(tokens, i, gram_word)
            word = check_word(word, vocab, sub, subsampler, rnd)
            if word is None:
                continue
            for gram_context in range(1, ngram_context + 1):  # context n-gram order
                start = i - win + gram_word - 1
                end = i + win - gram_context + 1
                for j in range(start, end + 1):
                    if overlap:
                        # partial overlap allowed; skip only the identical span
                        if i == j and gram_word == gram_context:
                            continue
                    else:
                        # Skip when [i, i+gram_word) and [j, j+gram_context)
                        # intersect.  O(1) interval test — equivalent to the
                        # former set(range(...)) & set(range(...)) check but
                        # without building two throwaway sets per pair.
                        if max(i, j) < min(i + gram_word, j + gram_context):
                            continue
                    context = getNgram(tokens, j, gram_context)
                    context = check_word(context, vocab, sub, subsampler, rnd)
                    if context is None:
                        continue
                    pairs_file.write(word + ' ' + context + "\n")  # write pairs to the file
def line2features(line, args, vocab, pairs_file, sub, subsampler):
    """Emit n-gram (center, context) pairs for a single corpus line.

    Honors --win, --ngram_word, --ngram_context and --overlap from args;
    filtering/subsampling of candidates is delegated to check_word.
    """
    window = int(args['--win'])
    max_word_order = int(args['--ngram_word'])
    max_ctx_order = int(args['--ngram_context'])
    allow_overlap = args['--overlap']
    rnd = Random(17)  # deterministic seed per line
    tokens = line.strip().split()
    for pos in range(len(tokens)):
        for w_order in range(1, max_word_order + 1):
            center = check_word(getNgram(tokens, pos, w_order),
                                vocab, sub, subsampler, rnd)
            if center is None:
                continue
            for c_order in range(1, max_ctx_order + 1):
                first = pos - window + w_order - 1
                last = pos + window - c_order + 1
                for ctx_pos in range(first, last + 1):
                    if allow_overlap:
                        # only the exactly-identical span is excluded
                        if ctx_pos == pos and c_order == w_order:
                            continue
                    elif set(range(pos, pos + w_order)) & set(range(ctx_pos, ctx_pos + c_order)):
                        # spans share at least one token position
                        continue
                    context = check_word(getNgram(tokens, ctx_pos, c_order),
                                         vocab, sub, subsampler, rnd)
                    if context is not None:
                        pairs_file.write(center + ' ' + context + "\n")
def word_word(line, args, vocab, pairs_file, sub, subsampler):  # identical to the word2vec toolkit; dynamic and dirty window!
    """Emit uni-gram (word, context) pairs, word2vec-toolkit style.

    The window size is drawn uniformly from [1, --win] ("dynamic window") and
    dropped tokens are removed before pairing, so the window spans surviving
    tokens only ("dirty window").
    """
    rnd = Random(17)
    win = rnd.randint(1, int(args['--win']))  # dynamic window
    kept = []
    for t in line.strip().split():
        if t not in vocab:
            kept.append(None)
        elif sub and t in subsampler and rnd.random() <= subsampler[t]:
            kept.append(None)  # subsampled away
        else:
            kept.append(t)
    tokens = [t for t in kept if t is not None]  # dirty window
    for i in range(len(tokens)):  # loop for each position in a line
        word = getNgram(tokens, i, 1)
        if word is None:
            continue
        for j in range(i - win, i + win + 1):
            if j == i:
                continue
            context = getNgram(tokens, j, 1)
            if context is None:
                continue
            pairs_file.write(word + ' ' + context + "\n")
def word_character(line, args, vocab, pairs_file, sub, subsampler):  #identical to the word2vec toolkit; dynamic and dirty window!
    """Write (word, context) pairs; at the center position the word is paired
    with its own character n-grams instead of with itself.

    NOTE(review): relies on str.decode('utf-8') / unicode.encode('utf-8'),
    i.e. Python 2 byte-string semantics — confirm the runtime before porting.
    """
    rnd = Random(17)  # fixed seed: window draw and subsampling are reproducible
    char_range = (int(args['--ngram_char_low']), int(args['--ngram_char_up']))  #character range
    win = int(args['--win'])
    dynamic = args['--dynamic_win']
    if dynamic:
        win = rnd.randint(1, win)  #dynamic window
    # Replace out-of-vocabulary tokens with None so positions stay aligned.
    tokens = [t if t in vocab else None for t in line.strip().split()]
    if sub:
        # Subsampling: a token listed in subsampler survives with
        # probability 1 - subsampler[t].
        tokens = [t if t not in subsampler or rnd.random() > subsampler[t] else None for t in tokens]
    for i in range(len(tokens)):  #loop for each position in a line
        word = getNgram(tokens, i, 1)
        if word is None:
            continue
        start = i - win
        end = i + win
        for j in range(start, end + 1):
            context = getNgram(tokens, j, 1)
            if context is None:
                continue
            if i == j:
                # Center position: decode to code points and emit every
                # character n-gram with order in [low, high].
                characters = []
                for character in context.decode('utf-8'):
                    characters.append(character)
                for char_ngram in range(char_range[0], char_range[1] + 1):
                    for char_start in range(len(characters)):
                        char_end = char_start + char_ngram
                        if char_end > len(characters):
                            break  # remaining starts can only be shorter
                        pairs_file.write(word + ' ' + ''.join([char.encode('utf-8') for char in characters[char_start: char_end]]) + "\n")
                continue
            pairs_file.write(word + ' ' + context + "\n")
def word_word(line, args, vocab, pairs_file, sub, subsampler):
    """Emit uni-gram (word, context) pairs within a fixed symmetric window.

    Vocabulary filtering and subsampling are delegated to check_word; the
    window is the clean +/- --win span around each position.
    """
    window = int(args['--win'])
    rnd = Random(17)  # deterministic per-line seed
    tokens = line.strip().split()
    for center_pos in range(len(tokens)):
        center = check_word(getNgram(tokens, center_pos, 1),
                            vocab, sub, subsampler, rnd)
        if center is None:
            continue
        for offset in range(-window, window + 1):
            if offset == 0:
                continue  # never pair a word with itself
            ctx = check_word(getNgram(tokens, center_pos + offset, 1),
                             vocab, sub, subsampler, rnd)
            if ctx is None:
                continue
            pairs_file.write(center + ' ' + ctx + "\n")
def word_character(line, args, vocab, pairs_file, sub, subsampler):  # identical to the word2vec toolkit; dynamic and dirty window!
    """word2vec-style pairs where the center word is additionally paired with
    its own character n-grams (subword contexts).

    NOTE(review): str.decode / unicode.encode imply Python 2 byte strings.
    """
    rnd = Random(17)
    low = int(args['--ngram_char_low'])
    high = int(args['--ngram_char_up'])  # character n-gram order range
    win = int(args['--win'])
    if args['--dynamic_win']:
        win = rnd.randint(1, win)  # dynamic window
    tokens = []
    for t in line.strip().split():
        tokens.append(t if t in vocab else None)  # keep positions aligned
    if sub:
        tokens = [None if (t in subsampler and rnd.random() <= subsampler[t]) else t
                  for t in tokens]
    for i in range(len(tokens)):  # each position in the line
        word = getNgram(tokens, i, 1)
        if word is None:
            continue
        for j in range(i - win, i + win + 1):
            context = getNgram(tokens, j, 1)
            if context is None:
                continue
            if j != i:
                pairs_file.write(word + ' ' + context + "\n")
                continue
            # center position: emit the word with each character n-gram
            characters = list(context.decode('utf-8'))
            for order in range(low, high + 1):
                for s in range(len(characters)):
                    e = s + order
                    if e > len(characters):
                        break
                    gram = ''.join(c.encode('utf-8') for c in characters[s:e])
                    pairs_file.write(word + ' ' + gram + "\n")
def word_text(line, args, vocab, pairs_file, sub, subsampler, text_id):
    """Pair each surviving token of the line with the line's text id.

    Writes "#<text_id> word" lines; lines with fewer than 200 tokens are
    skipped entirely.
    """
    rnd = Random(17)
    tokens = line.strip().split()
    if len(tokens) < 200:  # ignore short texts
        return
    tag = '#' + str(text_id)
    for pos in range(len(tokens)):
        word = check_word(getNgram(tokens, pos, 1), vocab, sub, subsampler, rnd)
        if word is not None:
            pairs_file.write(tag + ' ' + word + "\n")
def word_word(line, args, vocab, pairs_file, sub, subsampler):  # identical to the word2vec toolkit; dynamic and dirty window!
    """word2vec-toolkit-style uni-gram pairs: dynamic window size, and the
    window runs over the compacted ("dirty") token list after filtering."""
    rnd = Random(17)
    win = rnd.randint(1, int(args['--win']))  # dynamic window
    tokens = [t if t in vocab else None for t in line.strip().split()]
    if sub:
        tokens = [t if t not in subsampler or rnd.random() > subsampler[t] else None
                  for t in tokens]
    tokens = [t for t in tokens if t is not None]  # dirty window
    total = len(tokens)
    for i in range(total):
        word = getNgram(tokens, i, 1)
        if word is None:
            continue
        for j in range(i - win, i + win + 1):
            if j == i:
                continue
            context = getNgram(tokens, j, 1)
            if context is not None:
                pairs_file.write(word + ' ' + context + "\n")
def word_wordLR(line, args, vocab, pairs_file, sub, subsampler):
    """Like word_word, but each context is tagged with its side relative to
    the center word: '#L' for left contexts, '#R' for right contexts."""
    win = int(args['--win'])
    rnd = Random(17)
    tokens = line.strip().split()
    for i in range(len(tokens)):
        word = check_word(getNgram(tokens, i, 1), vocab, sub, subsampler, rnd)
        if word is None:
            continue
        for j in range(i - win, i + win + 1):
            if j == i:
                continue
            context = check_word(getNgram(tokens, j, 1), vocab, sub, subsampler, rnd)
            if context is None:
                continue
            side = '#L' if j < i else '#R'
            pairs_file.write(word + ' ' + context + side + "\n")
def c2p(args, tid):
    """Worker for one thread: stream the corpus and write n-gram pairs.

    Thread `tid` handles the lines where line_num % --threads_num == tid and
    writes them to "<pairs>_<tid>".  Thread 0 additionally prints progress.

    NOTE(review): Python 2 only (print statements, iteritems, xrange).
    """
    pairs_file = open(args['<pairs>'] + "_" + str(tid), 'w')
    win = int(args['--win'])
    subsample = float(args['--sub'])
    sub = subsample != 0  # subsampling enabled iff --sub is non-zero
    ngram_word = int(args['--ngram_word'])
    ngram_context = int(args['--ngram_context'])
    overlap = args['--overlap']
    threads_num = int(args['--threads_num'])
    vocab = load_count_vocabulary(args['<vocab>'])  #load vocabulary (generated in corpus2vocab stage)
    train_uni_num = 0  #number of (unigram) tokens in corpus
    for w, c in vocab.iteritems():
        if '@$' not in w:  # '@$' marks multi-token n-gram entries; skip them
            train_uni_num += c
    train_num = sum(vocab.values())  #number of (ngram) tokens in corpus
    if tid == 0:
        print 'vocabulary size: ' + str(len(vocab))
        print 'number of training words (uni-grams): ' + str(train_uni_num)
        print 'number of training n-grams: ' + str(train_num)
    subsample *= train_uni_num  # scale threshold by corpus size
    if sub:
        # Drop-probability table for frequent words; entries exist only for
        # counts above the threshold.
        # NOTE(review): subsampler is unbound when sub is False — check_word
        # presumably never touches it then; confirm.
        subsampler = dict([(word, 1 - sqrt(subsample / count)) for word, count in vocab.iteritems() if count > subsample])  #subsampling technique
    rnd = Random(17)  # fixed seed: reproducible subsampling
    with open(args['<corpus>']) as f:
        line_num = 0
        if tid == 0:
            print str(line_num / 1000**1) + "K lines processed."
        for line in f:
            line_num += 1
            if ((line_num) % 1000) == 0 and tid == 0:
                # "\x1b[1A" moves the cursor up one line to overwrite the counter
                print "\x1b[1A" + str(line_num / 1000) + "K lines processed."
            if line_num % threads_num != tid:
                continue  # line belongs to another worker thread
            tokens = line.strip().split()
            for i in xrange(len(tokens)):  #loop for each position in a line
                for gram_word in xrange(1, ngram_word + 1):  #loop for grams of different orders in (center) word
                    word = getNgram(tokens, i, gram_word)
                    word = check_word(word, vocab, sub, subsampler, rnd)
                    if word is None:
                        continue
                    for gram_context in xrange(1, ngram_context + 1):  #loop for grams of different orders in context
                        start = i - win + gram_word - 1
                        end = i + win - gram_context + 1
                        for j in xrange(start, end + 1):
                            if overlap:
                                # partial overlap allowed; skip identical spans
                                if i == j and gram_word == gram_context:
                                    continue
                            else:
                                # skip any overlap between the two token spans
                                if len(set(range(i, i + gram_word)) & set(range(j, j + gram_context))) > 0:
                                    continue
                            context = getNgram(tokens, j, gram_context)
                            context = check_word(context, vocab, sub, subsampler, rnd)
                            if context is None:
                                continue
                            pairs_file.write(word + ' ' + context + "\n")  #write pairs to the file
    pairs_file.close()