def calc_sent_loss(sent):
    # Create a computation graph
    dy.renew_cg()
    # Get embeddings for the sentence
    emb = [W_w_p[x] for x in sent]
    # Sample K negative words for each predicted word at each position
    all_neg_words = np.random.choice(nwords, size=2 * N * K * len(emb), replace=True, p=word_probabilities)

    # W_w = dy.parameter(W_w_p)

    # Step through the sentence and calculate the negative and positive losses
    all_losses = []
    for i, my_emb in enumerate(emb):
        neg_words = all_neg_words[i * K * 2 * N:(i + 1) * K * 2 * N]
        pos_words = ([sent[x] if x >= 0 else S for x in range(i - N, i)] +
                     [sent[x] if x < len(sent) else S for x in range(i + 1, i + N + 1)])
        neg_loss = -dy.log(dy.logistic(-dy.dot_product(my_emb, dy.lookup_batch(W_c_p, neg_words))))
        pos_loss = -dy.log(dy.logistic(dy.dot_product(my_emb, dy.lookup_batch(W_c_p, pos_words))))
        all_losses.append(dy.sum_batches(neg_loss) + dy.sum_batches(pos_loss))
    return dy.esum(all_losses)
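# A hedged driver sketch (not part of the original): one way calc_sent_loss could be trained.
# The names model, trainer, training_corpus and EMB_SIZE are illustrative assumptions; nwords,
# N, K, S and word_probabilities are assumed to be defined elsewhere in the original program.
import dynet as dy
import numpy as np

EMB_SIZE = 64                                              # assumed embedding dimension
model = dy.ParameterCollection()
W_w_p = model.add_lookup_parameters((nwords, EMB_SIZE))    # word embeddings
W_c_p = model.add_lookup_parameters((nwords, EMB_SIZE))    # context embeddings
trainer = dy.SimpleSGDTrainer(model)

for sent in training_corpus:        # training_corpus: iterable of word-id lists (assumed)
    loss = calc_sent_loss(sent)
    train_loss = loss.value()       # forward pass
    loss.backward()                 # backpropagation
    trainer.update()                # parameter update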
def train(self, rnnlm, train_quatrains, dev_quatrains):
    min_dev_loss = sys.maxsize
    for i in tqdm(range(self.epochs), desc='Training'):
        losses = []
        tqdm.write('Epoch {}'.format(i))
        total_loss = 0
        state = rnnlm.initialize()
        for count, quatrain in enumerate(train_quatrains):
            for token, (next_word, _, _, _) in zip(quatrain, quatrain[1:]):
                state, probs = rnnlm.add_input(state, token)
                loss = -dy.log(dy.pick(probs, next_word))
                losses.append(loss)
            if count % self.BATCH_SIZE == 0:
                loss = dy.esum(losses)
                total_loss += loss.value()
                loss.backward()
                self.trainer.update()
                losses = []
                dy.renew_cg()
                state = rnnlm.initialize()
            # if (count + 1) % 4 == 0:
            #     dy.renew_cg()
            #     state = rnnlm.initialize()

        dev_loss = 0
        state = rnnlm.initialize()
        for count, quatrain in enumerate(dev_quatrains):
            for token, (next_word, _, _, _) in zip(quatrain, quatrain[1:]):
                state, probs = rnnlm.add_input(state, token)
                loss = -dy.log(dy.pick(probs, next_word))
                dev_loss += loss.value()
            if (count + 1) % 4 == 0:
                dy.renew_cg()
                state = rnnlm.initialize()

        tqdm.write('Dev loss: {}'.format(dev_loss))
        if dev_loss < min_dev_loss:
            tqdm.write('Best dev loss. Saving parameters...')
            self.pc.save('model.pt')
            min_dev_loss = dev_loss
        else:
            tqdm.write('Not best dev loss. Restarting with smaller learning rate...')
            self.lr = self.lr * .5
            self.trainer.restart(self.lr)
        tqdm.write('Training Loss: {}'.format(total_loss))
        rnnlm.generate(rnnlm.initialize())
def decode(dec_lstm, vectors, output):
    output = [EOS] + list(output) + [EOS]
    output = [char2int[c] for c in output]

    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)
    w1 = dy.parameter(attention_w1)
    input_mat = dy.concatenate_cols(vectors)
    w1dt = None

    last_output_embeddings = output_lookup[char2int[EOS]]
    s = dec_lstm.initial_state().add_input(dy.concatenate([dy.vecInput(STATE_SIZE * 2), last_output_embeddings]))
    loss = []

    for char in output:
        # w1dt can be computed and cached once for the entire decoding phase
        w1dt = w1dt or w1 * input_mat
        vector = dy.concatenate([attend(input_mat, s, w1dt), last_output_embeddings])
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector)
        last_output_embeddings = output_lookup[char]
        loss.append(-dy.log(dy.pick(probs, char)))
    loss = dy.esum(loss)
    return loss
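# The decoder above calls attend(input_mat, s, w1dt), which is not shown here. The following is
# a hedged sketch of an MLP-style attention helper consistent with those calls; attention_w2 and
# attention_v are assumed model parameters, and the original definition may differ.
def attend(input_mat, state, w1dt):
    w2 = dy.parameter(attention_w2)
    v = dy.parameter(attention_v)
    # w1dt: precomputed projection of the encoder matrix (one column per source position)
    # w2dt: projection of the current decoder state, broadcast over every column
    w2dt = w2 * dy.concatenate(list(state.s()))
    # one unnormalized attention score per source position
    unnormalized = dy.transpose(v * dy.tanh(dy.colwise_add(w1dt, w2dt)))
    att_weights = dy.softmax(unnormalized)
    # context vector: attention-weighted sum of the encoder columns
    context = input_mat * att_weights
    return context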
def decode(dec_lstm, vectors, output):
    output = [EOS] + list(output) + [EOS]
    output = [char2int[c] for c in output]

    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)

    last_output_embeddings = output_lookup[char2int[EOS]]
    s = dec_lstm.initial_state().add_input(dy.concatenate([dy.vecInput(STATE_SIZE * 2), last_output_embeddings]))
    loss = []
    for char in output:
        vector = dy.concatenate([attend(vectors, s), last_output_embeddings])
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector)
        last_output_embeddings = output_lookup[char]
        loss.append(-dy.log(dy.pick(probs, char)))
    loss = dy.esum(loss)
    return loss
def create_network_return_loss(inputs, expected_output):
    '''
    inputs is a list of numbers
    '''
    dy.renew_cg()
    W = dy.parameter(pW)  # from parameters to expressions
    b = dy.parameter(pB)

    if len(inputs) > documentLength:
        inputs = inputs[0:documentLength]

    emb_vectors = [lookup[i] for i in inputs]
    while len(emb_vectors) < documentLength:
        pad = dy.vecInput(embDimension)
        pad.set(np.zeros(embDimension))
        emb_vectors.append(pad)

    net_input = dy.concatenate(emb_vectors)
    net_output = dy.softmax((W * net_input) + b)
    loss = -dy.log(dy.pick(net_output, expected_output))
    return loss
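# A hedged usage sketch (not in the original): pW, pB and lookup are assumed to live in a
# ParameterCollection `m` created elsewhere; the trainer and the toy inputs are illustrative.
trainer = dy.SimpleSGDTrainer(m)
loss = create_network_return_loss([1, 2, 3], expected_output=0)
print(loss.value())    # forward pass
loss.backward()        # backward pass
trainer.update()       # gradient step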
def CalculateLossForWord(word_obj, fValidation=False, fRunning=False):
    dy.renew_cg()

    if not fRunning:
        gold_lang = word_obj['tag']

    # add a bos before and after
    seq = ['*BOS*'] + list(word_obj['word']) + ['*BOS*']

    # get all the char encodings for the daf
    char_embeds = [let_enc(let) for let in seq]

    # run it through the bilstm
    char_bilstm_outputs = bilstm(char_embeds)
    bilstm_output = dy.concatenate([char_bilstm_outputs[0], char_bilstm_outputs[-1]])

    mlp_input = bilstm_output
    mlp_out = lang_mlp(mlp_input)
    predicted_lang = lang_tags[np.argmax(mlp_out.npvalue())]
    confidence = (mlp_out.npvalue()[:2] / np.sum(mlp_out.npvalue()[:2])).tolist()  # skip ambiguous

    # if we aren't doing validation, calculate the loss
    if not fValidation and not fRunning:
        loss = -dy.log(dy.pick(mlp_out, gold_lang))
    # otherwise, set the answer to be the argmax
    elif not fRunning and fValidation:
        loss = None
        lang_conf_matrix(np.argmax(mlp_out.npvalue()), gold_lang)
    else:
        return predicted_lang, confidence

    pos_prec = 1 if predicted_lang == lang_tags[gold_lang] else 0

    tagged_word = {'word': word_obj['word'], 'tag': predicted_lang, 'confidence': confidence,
                   'gold_tag': lang_tags[gold_lang]}

    if fValidation:
        return pos_prec, tagged_word

    return loss, pos_prec
def CalculateLossForDaf(daf, fValidation=False, fRunning=False):
    dy.renew_cg()
    tagged_daf = {"words": [], "file": daf["file"]}
    daf = daf["words"]

    # add a bos before and after
    seq = ['*BOS*'] + list(' '.join([word for word, _, _, _ in daf])) + ['*BOS*']
    # get all the char encodings for the daf
    char_embeds = [let_enc(let) for let in seq]
    # run it through the bilstm
    char_bilstm_outputs = bilstm(char_embeds)
    # now iterate and get all the separate word representations by concatenating the bilstm output
    # before and after the word
    word_bilstm_outputs = []
    iLet_start = 0
    for iLet, char in enumerate(seq):
        # if it is a bos, check if it's at the end of the sequence
        if char == '*BOS*':
            if iLet + 1 == len(seq):
                char = ' '
            else:
                continue
        # if we are at a space, take this bilstm output and the one at the letter start
        if char == ' ':
            cur_word_bilstm_output = dy.concatenate([char_bilstm_outputs[iLet_start], char_bilstm_outputs[iLet]])
            # add it in
            word_bilstm_outputs.append(cur_word_bilstm_output)
            # set the iLet_start counter to here
            iLet_start = iLet

    # safe-check, make sure word bilstm outputs length is the same as the daf
    if len(word_bilstm_outputs) != len(daf):
        log_message('Size mismatch!! word_bilstm_outputs: ' + str(len(word_bilstm_outputs)) + ', daf: ' + str(len(daf)))

    prev_pos_lstm_state = prev_pos_lstm.initial_state().add_input(pos_enc('*BOS*'))

    all_losses = []
    pos_prec = 0.0
    rough_pos_prec = 0.0
    pos_items = 0
    class_prec = 0.0
    class_items = 0.0

    # now iterate through the bilstm outputs, and each word in the daf
    for (word, gold_word_class, gold_word_pos, gold_word_lang), bilstm_output in zip(daf, word_bilstm_outputs):
        should_backprop = gold_word_class == 1

        # create the mlp input, a concatenate of the bilstm output and of the prev pos output
        mlp_input = dy.concatenate([bilstm_output, prev_pos_lstm_state.output()])

        # run through the class mlp
        class_mlp_output = class_mlp(mlp_input)
        predicted_word_class = np.argmax(class_mlp_output.npvalue())
        confidence = np.max(class_mlp_output.npvalue()) / np.sum(class_mlp_output.npvalue())

        # prec
        if should_backprop:
            class_prec += 1 if predicted_word_class == gold_word_class else 0
            class_items += 1

        # if we aren't doing validation, calculate the loss
        if not fValidation and not fRunning:
            if should_backprop:
                all_losses.append(-dy.log(dy.pick(class_mlp_output, gold_word_class)))
            word_class_ans = gold_word_class
        # otherwise, set the answer to be the argmax
        else:
            word_class_ans = predicted_word_class

        # if the word_class answer is 1, do the pos!
        # alternatively, if validating and it's Aramaic, do the pos!
        if word_class_ans or (fValidation and gold_word_lang) or (fRunning and gold_word_lang):
            # run the pos mlp output
            pos_mlp_output = pos_mlp(mlp_input)
            try:
                temp_pos_array = pos_mlp_output.npvalue()
                possible_pos_array = np.zeros(temp_pos_array.shape)
                pos_list = pos_hashtable[word]
                # pos_list.add('')  # concat 'unknown' as possible pos
                possible_pos_indices = [pos_vocab[temp_pos] for temp_pos in pos_list]
                possible_pos_array[possible_pos_indices] = temp_pos_array[possible_pos_indices]
            except KeyError:
                possible_pos_array = pos_mlp_output.npvalue()
                # if fValidation:
                #     possible_pos_array[pos_vocab['']] = 0.0  # don't allow validation to guess UNK b/c it never trained against that TODO this makes sense, right?

            predicted_word_pos = pos_vocab.getItem(np.argmax(possible_pos_array))
            confidence = np.max(possible_pos_array) / np.sum(possible_pos_array)

            # prec
            if should_backprop:
                pos_prec += 1 if predicted_word_pos == gold_word_pos else 0
                rough_pos_prec += 1 if predicted_word_pos[0] == gold_word_pos[0] else 0  # you got at least the rough pos right
                pos_items += 1

            # if we aren't doing validation, calculate the loss
            if not fValidation and not fRunning:
                if should_backprop:
                    all_losses.append(-dy.log(dy.pick(pos_mlp_output, pos_vocab[gold_word_pos])))
                word_pos_ans = gold_word_pos
            # otherwise, set the answer to be the argmax
            elif not fRunning and fValidation:
                if should_backprop:
                    pos_conf_matrix(pos_vocab[predicted_word_pos], pos_vocab[gold_word_pos])
                word_pos_ans = predicted_word_pos
            else:
                word_pos_ans = predicted_word_pos

            # run through the prev-pos-mlp
            predicted = predicted_word_pos
            prev_pos_lstm_state = prev_pos_lstm_state.add_input(pos_enc(word_pos_ans))
        # if the answer is 0, put a '' through the prev-pos lstm
        else:
            predicted = 'UNK'
            prev_pos_lstm_state = prev_pos_lstm_state.add_input(pos_enc(''))

        tagged_daf["words"].append({"word": word, "gold_pos": gold_word_pos, "gold_class": gold_word_class,
                                    "predicted": predicted, "confidence": confidence, "lang": gold_word_lang})

    if fRunning:
        return tagged_daf

    pos_prec = pos_prec / pos_items if pos_items > 0 else None
    rough_pos_prec = rough_pos_prec / pos_items if pos_items > 0 else None
    class_prec = class_prec / class_items if class_items > 0 else None

    if fValidation:
        return class_prec, pos_prec, tagged_daf, rough_pos_prec

    total_loss = dy.esum(all_losses) if len(all_losses) > 0 else None
    return total_loss, class_prec, pos_prec, rough_pos_prec
def CalculateLossForDaf(daf, fValidation=False, fRunning=False):
    dy.renew_cg()
    tagged_daf = {"words": []}

    # add a bos before and after
    seq = ['*BOS*'] + list(' '.join([word for word, _ in daf])) + ['*BOS*']
    # get all the char encodings for the daf
    char_embeds = [let_enc(let) for let in seq]
    # run it through the bilstm
    char_bilstm_outputs = bilstm(char_embeds)
    # now iterate and get all the separate word representations by concatenating the bilstm output
    # before and after the word
    word_bilstm_outputs = []
    iLet_start = 0
    for iLet, char in enumerate(seq):
        # if it is a bos, check if it's at the end of the sequence
        if char == '*BOS*':
            if iLet + 1 == len(seq):
                char = ' '
            else:
                continue
        # if we are at a space, take this bilstm output and the one at the letter start
        if char == ' ':
            cur_word_bilstm_output = dy.concatenate([char_bilstm_outputs[iLet_start], char_bilstm_outputs[iLet]])
            # add it in
            word_bilstm_outputs.append(cur_word_bilstm_output)
            # set the iLet_start counter to here
            iLet_start = iLet

    # safe-check, make sure word bilstm outputs length is the same as the daf
    if len(word_bilstm_outputs) != len(daf):
        log_message('Size mismatch!! word_bilstm_outputs: ' + str(len(word_bilstm_outputs)) + ', daf: ' + str(len(daf)))

    prev_lang_lstm_state = prev_lang_lstm.initial_state().add_input(lang_enc('*BOS*'))

    all_losses = []
    lang_prec = 0.0
    lang_items = 0

    # now iterate through the bilstm outputs, and each word in the daf
    for (word, gold_word_lang), bilstm_output in zip(daf, word_bilstm_outputs):
        # create the mlp input, a concatenate of the bilstm output and of the prev lang output
        mlp_input = dy.concatenate([bilstm_output, prev_lang_lstm_state.output()])

        # run through the lang mlp
        lang_mlp_output = lang_mlp(mlp_input)
        predicted_word_lang = lang_vocab.getItem(np.argmax(lang_mlp_output.npvalue()))
        confidence = np.max(lang_mlp_output.npvalue()) / np.sum(lang_mlp_output.npvalue())

        lang_prec += 1 if predicted_word_lang == gold_word_lang else 0
        lang_items += 1

        tagged_daf["words"].append(
            {"word": word, "predicted_lang": predicted_word_lang, "confidence": confidence})

        # if we aren't doing validation, calculate the loss
        if not fValidation and not fRunning:
            all_losses.append(-dy.log(dy.pick(lang_mlp_output, lang_vocab[gold_word_lang])))
            word_lang_ans = gold_word_lang
        # otherwise, set the answer to be the argmax
        elif not fRunning and fValidation:
            lang_conf_matrix(lang_vocab[predicted_word_lang], lang_vocab[gold_word_lang])
            word_lang_ans = predicted_word_lang
        else:
            continue

        # run the answer through the prev-lang lstm
        prev_lang_lstm_state = prev_lang_lstm_state.add_input(lang_enc(word_lang_ans))
        # prev_pos_lstm_state = prev_pos_lstm_state.add_input(pos_enc(''))

    lang_prec = lang_prec / lang_items if lang_items > 0 else None
    # class_prec = class_prec / class_items if class_items > 0 else None

    if fValidation:
        return lang_prec, tagged_daf
    if fRunning:
        return tagged_daf

    total_loss = dy.esum(all_losses) if len(all_losses) > 0 else None
    return total_loss, lang_prec
# Assumed setup (not shown in the original snippet): a parameter collection `m` and a lookup
# table `lp` of 10-dimensional vectors, so the shapes below line up with the (5, 10) matrix W.
import dynet as dy
import numpy as np

m = dy.ParameterCollection()
lp = m.add_lookup_parameters((10, 10))

# regular lookup
a = lp[1].npvalue()
b = lp[2].npvalue()
c = lp[3].npvalue()

# batch lookup instead of single elements.
# two ways of doing this.
abc1 = dy.lookup_batch(lp, [1, 2, 3])
print(abc1.npvalue())

abc2 = lp.batch([1, 2, 3])
print(abc2.npvalue())

print(np.hstack([a, b, c]))

# use pick and pickneglogsoftmax in batch mode
# (must be used in conjunction with lookup_batch):
print("\nPick")
W = dy.parameter(m.add_parameters((5, 10)))
h = W * lp.batch([1, 2, 3])
print(h.npvalue())
print(dy.pick_batch(h, [1, 2, 3]).npvalue())
print(dy.pick(W * lp[1], 1).value(), dy.pick(W * lp[2], 2).value(), dy.pick(W * lp[3], 3).value())

# using pickneglogsoftmax_batch
print("\nPick neg log softmax")
print((-dy.log(dy.softmax(h))).npvalue())
print(dy.pickneglogsoftmax_batch(h, [1, 2, 3]).npvalue())
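# A hedged follow-on (not in the original): the batched negative log-likelihoods above can be
# collapsed into a single scalar training loss with dy.sum_batches.
batched_nll = dy.pickneglogsoftmax_batch(h, [1, 2, 3])
total_loss = dy.sum_batches(batched_nll)
print(total_loss.value())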