# These xy-generators assume a modified `train_cbow_pair` (defined elsewhere in
# this module) that yields training tuples, unlike gensim's stock implementation,
# which applies the weight updates in place.
def train_batch_dm_xy_generator(model, docs):
    """Yield PV-DM training samples: one per context window of each document."""
    for doc in docs:
        indexed_doctags = model.docvecs.indexed_doctags(doc.tags)
        doctag_indexes, doctag_vectors, doctag_locks, ignored = indexed_doctags
        # Keep in-vocabulary words, randomly discarding frequent ones (downsampling).
        word_vocabs = [model.wv.vocab[w] for w in doc.words
                       if w in model.wv.vocab
                       and model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)  # `b` in the original doc2vec code
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
            word2_indexes = [word2.index for pos2, word2 in window_pos if pos2 != pos]
            xy_gen = train_cbow_pair(model, word, word2_indexes)
            for xy in xy_gen:
                if xy is not None:
                    # Splice the document-tag indexes in as a second model input.
                    yield [xy[0], doctag_indexes, xy[1], xy[2]]
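# A minimal consumption sketch for the generator above, illustrative only. It
# assumes a pre-4.0 gensim whose Doc2Vec still exposes `docvecs.indexed_doctags`
# and `wv.vocab`, and it assumes the modified `train_cbow_pair` generator is in
# scope; the function name `_demo_train_batch_dm` is hypothetical.
def _demo_train_batch_dm():
    from gensim.models.doc2vec import Doc2Vec, TaggedDocument
    docs = [TaggedDocument(words='the quick brown fox jumps'.split(), tags=[0])]
    model = Doc2Vec(docs, min_count=1)  # builds the vocab and doctag arrays
    for sample in train_batch_dm_xy_generator(model, docs):
        # Each sample is [context_word_indexes, doctag_indexes, xy[1], xy[2]],
        # where xy[1]/xy[2] come from the modified train_cbow_pair.
        print(sample)
        break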
def train_document_dm_concat_xy_generator(model, docs):
    """Yield PV-DM training samples for the concatenated-context (dm_concat) mode."""
    for doc in docs:
        indexed_doctags = model.docvecs.indexed_doctags(doc.tags)
        doctag_indexes, doctag_vectors, doctag_locks, ignored = indexed_doctags
        word_vocabs = [model.wv.vocab[w] for w in doc.words
                       if w in model.wv.vocab
                       and model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
        # Pad both ends with the null word so every position has a full window.
        null_word = model.wv.vocab['\0']
        pre_pad_count = model.window
        post_pad_count = model.window
        padded_document_indexes = (
            (pre_pad_count * [null_word.index])  # pre-padding
            + [word.index for word in word_vocabs if word is not None]  # elide out-of-vocabulary words
            + (post_pad_count * [null_word.index]))  # post-padding
        for pos in range(pre_pad_count, len(padded_document_indexes) - post_pad_count):
            word_context_indexes = (
                padded_document_indexes[(pos - pre_pad_count):pos]  # preceding words
                + padded_document_indexes[(pos + 1):(pos + 1 + post_pad_count)])  # following words
            predict_word = model.wv.vocab[model.wv.index2word[padded_document_indexes[pos]]]
            xy_gen = train_cbow_pair(model, predict_word, word_context_indexes)
            for xy in xy_gen:
                if xy is not None:
                    yield [xy[0], doctag_indexes, xy[1], xy[2]]
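# Hypothetical usage of the dm_concat generator. Assumptions: constructing the
# Doc2Vec with dm=1, dm_concat=1 makes gensim register the '\0' null word the
# padding above relies on, and the modified `train_cbow_pair` is in scope.
def _demo_train_dm_concat():
    from gensim.models.doc2vec import Doc2Vec, TaggedDocument
    docs = [TaggedDocument(words='the quick brown fox jumps'.split(), tags=[0])]
    model = Doc2Vec(docs, dm=1, dm_concat=1, min_count=1)  # dm_concat=1 adds '\0' to the vocab
    for sample in train_document_dm_concat_xy_generator(model, docs):
        print(sample)  # [word_context_indexes, doctag_indexes, xy[1], xy[2]]
        break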
def train_batch_score_cbow_xy_generator(model, scored_word_sentences):
    """Yield CBOW training samples, each carrying its word's score as the final element."""
    for scored_word_sentence in scored_word_sentences:
        # Keep in-vocabulary (word, score) pairs, downsampling frequent words.
        scored_word_vocabs = [[model.vocab[w], s] for w, s in scored_word_sentence
                              if w in model.vocab
                              and model.vocab[w].sample_int > model.random.rand() * 2**32]
        for pos, scored_word in enumerate(scored_word_vocabs):
            reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(scored_word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
            word2_indices = [scored_word2[0].index for pos2, scored_word2 in window_pos
                             if scored_word2 is not None and scored_word2[0] is not None and pos2 != pos]
            xy_gen = train_cbow_pair(model, scored_word[0], word2_indices, None, None)
            for xy in xy_gen:
                if xy is not None:
                    # Append the word's score as a one-element list.
                    yield [xy[0], xy[1], xy[2], [scored_word[1]]]
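# Sketch of feeding scored sentences to the generator above. The input shape is
# inferred from the code: each sentence is a list of [word, score] pairs. The
# constant score of 1.0 and the helper name `_demo_score_cbow` are illustrative;
# the modified `train_cbow_pair` must again be in scope.
def _demo_score_cbow():
    from gensim.models import Word2Vec
    sentences = [['the', 'quick', 'brown', 'fox', 'jumps']]
    model = Word2Vec(sentences, min_count=1)
    scored = [[[w, 1.0] for w in sentence] for sentence in sentences]
    for sample in train_batch_score_cbow_xy_generator(model, scored):
        print(sample)  # [xy[0], xy[1], xy[2], [score]]
        break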