import numpy as np

# `train_sg_pair`, `scored_word2word`, and `scored_word2score` are assumed to
# be defined elsewhere in this module (modified gensim helpers that return or
# yield (index, point, code) training data instead of applying updates).


def train_batch_dbow(model, docs, alpha, work=None, train_words=False,
                     learn_doctags=True, learn_words=True, learn_hidden=True,
                     word_vectors=None, word_locks=None,
                     doctag_vectors=None, doctag_locks=None, batch_size=100):
    # List-based variant: pairs every document tag with every word in the
    # document and yields one dict of numpy arrays per `batch_size` pairs.
    batch_count = 0
    train_x0 = [[0]] * batch_size
    train_x1 = [[0]] * batch_size
    train_y = [[0]] * batch_size
    while 1:
        for doc in docs:
            for doctag_index in doc.tags:
                for word in doc.words:
                    xy = train_sg_pair(
                        model, word, doctag_index, alpha,
                        learn_vectors=learn_doctags,
                        learn_hidden=learn_hidden,
                        context_vectors=doctag_vectors,
                        context_locks=doctag_locks)
                    if xy is not None:
                        (x0, x1, y) = xy
                        train_x0[batch_count] = [x0]
                        train_x1[batch_count] = x1
                        train_y[batch_count] = y
                        batch_count += 1
                        if batch_count >= batch_size:
                            yield {'index': np.array(train_x0),
                                   'point': np.array(train_x1),
                                   'code': np.array(train_y)}
                            batch_count = 0
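# Usage sketch for the list-based generator above (illustrative only; `model`
# and `docs` stand in for a gensim-style Doc2Vec model and an iterable of
# tagged documents with `.tags` and `.words`):
#
#     gen = train_batch_dbow(model, docs, alpha=0.025)
#     batch = next(gen)         # dict with keys 'index', 'point', 'code'
#     batch['index'].shape      # (batch_size, 1) doctag indices
#
# Note that 'point' and 'code' stack one hierarchical-softmax path per pair,
# so they only form regular 2-D arrays when all Huffman paths happen to have
# equal length; the fixed-shape variant below avoids this by emitting scalars.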
def train_batch_dbow(model, docs, sub_batch_size=256, batch_size=256):
    # Fixed-shape variant: packs scalar (doctag index, hs node, code bit)
    # triples into (batch_size, sub_batch_size) arrays suitable for Keras
    # named inputs. Here `train_sg_pair` is expected to yield one scalar
    # triple per hierarchical-softmax node.
    batch_count = 0
    sub_batch_count = 0
    train_x0 = np.zeros((batch_size, sub_batch_size), dtype='int32')
    train_x1 = np.zeros((batch_size, sub_batch_size), dtype='int32')
    train_y = np.zeros((batch_size, sub_batch_size), dtype='int8')
    while 1:
        for doc in docs:
            for doctag_index in doc.tags:
                for word in doc.words:
                    xy_gen = train_sg_pair(model, word, doctag_index)
                    for xy in xy_gen:
                        if xy is not None:
                            (x0, x1, y) = xy
                            train_x0[batch_count][sub_batch_count] = x0
                            train_x1[batch_count][sub_batch_count] = x1
                            train_y[batch_count][sub_batch_count] = y
                            sub_batch_count += 1
                            if sub_batch_count >= sub_batch_size:
                                batch_count += 1
                                sub_batch_count = 0
                                if batch_count >= batch_size:
                                    # Note: the arrays are reused in place,
                                    # so consumers must copy a batch if they
                                    # keep it across iterations.
                                    yield {'index': train_x0,
                                           'point': train_x1,
                                           'code': train_y}
                                    batch_count = 0
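# A minimal smoke test for the fixed-shape generator above. This is an
# illustrative sketch: `model` and `docs` are placeholders for a real
# gensim-style model and corpus, not part of this module.
def _check_dbow_batch_shapes(model, docs, sub_batch_size=256, batch_size=256):
    gen = train_batch_dbow(model, docs,
                           sub_batch_size=sub_batch_size,
                           batch_size=batch_size)
    batch = next(gen)
    # Every output is a fixed-shape array, ready for Keras named inputs.
    assert batch['index'].shape == (batch_size, sub_batch_size)
    assert batch['point'].shape == (batch_size, sub_batch_size)
    assert batch['code'].shape == (batch_size, sub_batch_size)
    return batch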
def train_batch_score_sg(model, scored_word_sentences, alpha=None, work=None,
                         batch_size=100):
    # List-based variant for scored sentences: each skip-gram pair also
    # carries the score attached to the predicted word.
    batch_count = 0
    train_x0 = [[0]] * batch_size
    train_x1 = [[0]] * batch_size
    train_y0 = [[0]] * batch_size
    train_y1 = [[0]] * batch_size
    while 1:
        for scored_word_sentence in scored_word_sentences:
            # Keep only in-vocabulary words, applying the usual
            # frequency-based downsampling test.
            word_vocabs = [
                [model.vocab[w], s] for [w, s] in scored_word_sentence
                if w in model.vocab and
                model.vocab[w].sample_int > model.random.rand() * 2**32]
            for pos, scored_word in enumerate(word_vocabs):
                # `b` in the original word2vec code
                reduced_window = model.random.randint(model.window)
                word = scored_word2word(scored_word)
                # Now go over all words from the (reduced) window,
                # predicting each one in turn.
                start = max(0, pos - model.window + reduced_window)
                for pos2, scored_word2 in enumerate(
                        word_vocabs[start:(pos + model.window + 1 -
                                           reduced_window)],
                        start):
                    word2 = scored_word2word(scored_word2)
                    # Don't train on the `word` itself.
                    if pos2 != pos:
                        xy = train_sg_pair(
                            model, model.index2word[word.index],
                            word2.index, alpha)
                        if xy is not None:
                            (x0, x1, y0) = xy
                            y1 = scored_word2score(scored_word)
                            train_x0[batch_count] = [x0]
                            train_x1[batch_count] = x1
                            train_y0[batch_count] = y0
                            train_y1[batch_count] = y1
                            batch_count += 1
                            if batch_count >= batch_size:
                                yield {'index': np.array(train_x0),
                                       'point': np.array(train_x1),
                                       'code': np.array(train_y0),
                                       'score': np.array(train_y1)}
                                batch_count = 0
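# Usage sketch for the list-based scored generator above (illustrative only;
# `scored_sentences` stands in for an iterable of [word, score] sentences):
#
#     gen = train_batch_score_sg(model, scored_sentences, alpha=0.025)
#     batch = next(gen)   # dict with keys 'index', 'point', 'code', 'score'
#
# As with the dbow variant, 'point' and 'code' hold one hierarchical-softmax
# path per pair, so batches only form regular arrays for equal path lengths.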
def train_batch_score_sg(model, scored_word_sentences, score_vector_size,
                         alpha=None, work=None,
                         sub_batch_size=256, batch_size=256):
    # Fixed-shape variant: like train_batch_dbow above, but the extra
    # 'score' output carries a trailing axis of length `score_vector_size`.
    batch_count = 0
    sub_batch_count = 0
    train_x0 = np.zeros((batch_size, sub_batch_size), dtype='int32')
    train_x1 = np.zeros((batch_size, sub_batch_size), dtype='int32')
    train_y0 = np.zeros((batch_size, sub_batch_size), dtype='int8')
    train_y1 = np.zeros((batch_size, sub_batch_size, score_vector_size),
                        dtype='float32')
    while 1:
        for scored_word_sentence in scored_word_sentences:
            # Keep only in-vocabulary words, applying the usual
            # frequency-based downsampling test.
            word_vocabs = [
                [model.vocab[w], s] for [w, s] in scored_word_sentence
                if w in model.vocab and
                model.vocab[w].sample_int > model.random.rand() * 2**32]
            for pos, scored_word in enumerate(word_vocabs):
                # `b` in the original word2vec code
                reduced_window = model.random.randint(model.window)
                word = scored_word2word(scored_word)
                # Now go over all words from the (reduced) window,
                # predicting each one in turn.
                start = max(0, pos - model.window + reduced_window)
                for pos2, scored_word2 in enumerate(
                        word_vocabs[start:(pos + model.window + 1 -
                                           reduced_window)],
                        start):
                    word2 = scored_word2word(scored_word2)
                    # Don't train on the `word` itself.
                    if pos2 != pos:
                        xy_gen = train_sg_pair(
                            model, model.index2word[word.index], word2.index)
                        for xy in xy_gen:
                            if xy is not None:
                                (x0, x1, y0) = xy
                                y1 = scored_word2score(scored_word)
                                train_x0[batch_count][sub_batch_count] = x0
                                train_x1[batch_count][sub_batch_count] = x1
                                train_y0[batch_count][sub_batch_count] = y0
                                train_y1[batch_count][sub_batch_count] = y1
                                sub_batch_count += 1
                                if sub_batch_count >= sub_batch_size:
                                    batch_count += 1
                                    sub_batch_count = 0
                                    if batch_count >= batch_size:
                                        # Arrays are reused in place; copy a
                                        # batch if it must outlive the next
                                        # iteration.
                                        yield {'index': train_x0,
                                               'point': train_x1,
                                               'code': train_y0,
                                               'score': train_y1}
                                        batch_count = 0
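# A minimal smoke test for the fixed-shape scored generator above. This is an
# illustrative sketch: `model` and `scored_sentences` are placeholders for a
# real gensim-style model and a corpus of [word, score] sentences.
def _check_score_sg_batch_shapes(model, scored_sentences, score_vector_size=1,
                                 sub_batch_size=256, batch_size=256):
    gen = train_batch_score_sg(model, scored_sentences, score_vector_size,
                               sub_batch_size=sub_batch_size,
                               batch_size=batch_size)
    batch = next(gen)
    assert batch['index'].shape == (batch_size, sub_batch_size)
    assert batch['point'].shape == (batch_size, sub_batch_size)
    assert batch['code'].shape == (batch_size, sub_batch_size)
    assert batch['score'].shape == (batch_size, sub_batch_size,
                                    score_vector_size)
    return batch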