def generate(model, input, enc_fwd_lstm, enc_bwd_lstm, dec_lstm):
    def sample(probs):
        rnd = random.random()
        for i, p in enumerate(probs):
            rnd -= p
            if rnd <= 0:
                break
        return i

    embedded = embedd_sentence(model, input)
    encoded = encode_sentence(model, enc_fwd_lstm, enc_bwd_lstm, embedded)

    w = pc.parameter(model["decoder_w"])
    b = pc.parameter(model["decoder_b"])

    s = dec_lstm.initial_state().add_input(pc.vecInput(STATE_SIZE * 2))
    out = ''
    count_EOS = 0
    for i in range(len(input) * 2):
        if count_EOS == 2:
            break
        vector = attend(model, encoded, s)
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = pc.softmax(out_vector)
        probs = probs.vec_value()
        next_char = sample(probs)
        if int2char[next_char] == EOS:
            count_EOS += 1
            continue
        out += int2char[next_char]
    return out
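# Illustrative alternative to the inverse-CDF sample() helper above (not part of the
# original code): NumPy's np.random.choice draws an index from the same categorical
# distribution given the softmax probabilities.
import numpy as np

probs = [0.1, 0.7, 0.2]                          # e.g. the list returned by probs.vec_value()
next_char = np.random.choice(len(probs), p=probs)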
def predict(sent, model, builders):
    """
    predict tags and demographic labels
    :param sent:
    :param model:
    :param builders:
    :return: tag and label predictions
    """
    if MLP:
        H = pycnn.parameter(pH)
        O = pycnn.parameter(pO)
    else:
        O = pycnn.parameter(pO)

    tags = []
    for forward_state, backward_state in build_tagging_graph(sent, model, builders):
        if MLP:
            r_t = O * (pycnn.tanh(H * pycnn.concatenate([forward_state, backward_state])))
        else:
            r_t = O * pycnn.concatenate([forward_state, backward_state])
        out = pycnn.softmax(r_t)
        chosen = np.argmax(out.npvalue())
        tags.append(vocab_tags.i2w[chosen])
    return tags
def tag_sentence(self, sentence):
    word_expression_list = self._build_word_expression_list(sentence, is_train=False)
    for word, word_expression in zip(sentence, word_expression_list):
        out = softmax(word_expression)
        tag_index = np.argmax(out.npvalue())
        word.tag = self.tag_indexer.get_object(tag_index)
def predict(word_indices, model, builder, target):
    """
    predict demographic label(s)
    :param word_indices:
    :param model:
    :param builder:
    :param target:
    :return: predicted label index, or [age, gender] indices when target == 'both'
    """
    forward_states = build_tagging_graph(word_indices, model, builder)
    final_state = forward_states[-1]

    H = pycnn.parameter(pH)
    bias_H = pycnn.parameter(biasH)
    H2 = pycnn.parameter(pH2)
    bias_H2 = pycnn.parameter(biasH2)

    if target in ['age', 'both', 'joint']:
        O = pycnn.parameter(pOutAge)
        bias_O = pycnn.parameter(biasOutAge)
    elif target == 'gender':
        O = pycnn.parameter(pOutGender)
        bias_O = pycnn.parameter(biasOutGender)

    if target == 'both':
        O2 = pycnn.parameter(pOutGender)
        bias_O2 = pycnn.parameter(biasOutGender)

    if target == 'both':
        # hidden = bias_H2 + pycnn.tanh(H2 * (bias_H + pycnn.tanh(H * final_state)))
        hidden = bias_H + pycnn.tanh(H * final_state)
        r_age = bias_O + (O * hidden)
        r_gender = bias_O2 + (O2 * hidden)
        out_age = pycnn.softmax(r_age)
        out_gender = pycnn.softmax(r_gender)
        return [np.argmax(out_age.npvalue()), np.argmax(out_gender.npvalue())]
    else:
        # r_t = bias_O + (O * (bias_H2 + pycnn.tanh(H2 * (bias_H + pycnn.tanh(H * final_state)))))
        r_t = bias_O + (O * (bias_H + (H * final_state)))
        out = pycnn.softmax(r_t)
        chosen = np.argmax(out.npvalue())
        return chosen
def predict_output_sequence(model, encoder_frnn, encoder_rrnn, decoder_rnn, lemma, feats, alphabet_index,
                            inverse_alphabet_index, feat_index, feature_types):
    pc.renew_cg()

    # read the parameters
    char_lookup = model["char_lookup"]
    feat_lookup = model["feat_lookup"]
    R = pc.parameter(model["R"])
    bias = pc.parameter(model["bias"])
    W_c = pc.parameter(model["W_c"])
    W__a = pc.parameter(model["W__a"])
    U__a = pc.parameter(model["U__a"])
    v__a = pc.parameter(model["v__a"])

    # encode the lemma
    blstm_outputs = encode_chars(alphabet_index, char_lookup, encoder_frnn, encoder_rrnn, lemma)

    # convert features to matching embeddings, if UNK handle properly
    feats_input = encode_feats(feat_index, feat_lookup, feats, feature_types)

    # initialize the decoder rnn
    s_0 = decoder_rnn.initial_state()
    s = s_0

    # set prev_output_vec for first lstm step as BEGIN_WORD
    prev_output_vec = char_lookup[alphabet_index[BEGIN_WORD]]
    i = 0
    predicted_sequence = []

    # run the decoder through the sequence and predict characters
    while i < MAX_PREDICTION_LEN:

        # get current h of the decoder
        s = s.add_input(pc.concatenate([prev_output_vec, feats_input]))
        decoder_rnn_output = s.output()

        # perform attention step
        attention_output_vector, alphas, W = task1_attention_implementation.attend(blstm_outputs, decoder_rnn_output,
                                                                                   W_c, v__a, W__a, U__a)

        # compute output probabilities
        # print 'computing readout layer...'
        readout = R * attention_output_vector + bias

        # find best candidate output
        probs = pc.softmax(readout)
        next_char_index = common.argmax(probs.vec_value())
        predicted_sequence.append(inverse_alphabet_index[next_char_index])

        # check if reached end of word
        if predicted_sequence[-1] == END_WORD:
            break

        # prepare for the next iteration - "feedback"
        prev_output_vec = char_lookup[next_char_index]
        i += 1

    # remove the end word symbol
    return predicted_sequence[0:-1]
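# Hedged usage sketch for the attention-based predictor above: the model components and
# index dictionaries are assumed to have been built during training; only the call
# pattern comes from the function signature. The predicted characters are joined into
# the output word.
predicted_chars = predict_output_sequence(model, encoder_frnn, encoder_rrnn, decoder_rnn,
                                          lemma, feats, alphabet_index, inverse_alphabet_index,
                                          feat_index, feature_types)
predicted_word = ''.join(predicted_chars)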
def tag_sentence(self, sentence):
    renew_cg()
    for word in sentence:
        word.vector = self._get_word_vector(word, use_dropout=False)
    sentence_expressions = self._build_sentence_expressions(sentence)

    for word, word_expression in zip(sentence, sentence_expressions):
        out = softmax(word_expression)
        tag_index = np.argmax(out.npvalue())
        word.tag = self.tag_indexer.get_object(tag_index)
def attend(model, input_vectors, state):
    w1 = pc.parameter(model['attention_w1'])
    w2 = pc.parameter(model['attention_w2'])
    v = pc.parameter(model['attention_v'])
    attention_weights = []

    w2dt = w2 * pc.concatenate(list(state.s()))
    for input_vector in input_vectors:
        attention_weight = v * pc.tanh(w1 * input_vector + w2dt)
        attention_weights.append(attention_weight)

    attention_weights = pc.softmax(pc.concatenate(attention_weights))

    output_vectors = pc.esum([vector * attention_weight
                              for vector, attention_weight in zip(input_vectors, attention_weights)])
    return output_vectors
def attend(model, vectors, state):
    w = pc.parameter(model['attention_w'])
    attention_weights = []
    for vector in vectors:
        # concatenate each encoded vector with the current decoder state
        attention_input = pc.concatenate([vector, pc.concatenate(list(state.s()))])
        # get the attention weight for the encoded vector
        attention_weights.append(w * attention_input)
    # normalize the weights
    attention_weights = pc.softmax(pc.concatenate(attention_weights))
    # apply the weights
    vectors = pc.esum([vector * attention_weight
                       for vector, attention_weight in zip(vectors, attention_weights)])
    return vectors
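# The two attend() variants above differ only in how each encoder vector h_i is scored
# against the decoder state s: the first uses an MLP score v . tanh(W1 @ h_i + W2 @ s),
# the second a single linear layer over the concatenation [h_i; s]. Below is a minimal
# NumPy sketch of the MLP (Bahdanau-style) scoring; the shapes are illustrative
# assumptions, not values taken from the original code.
import numpy as np

def additive_attention(H, s, W1, W2, v):
    # H: (n, d_enc) encoder vectors, s: (d_dec,) decoder state
    scores = np.array([v @ np.tanh(W1 @ h + W2 @ s) for h in H])
    weights = np.exp(scores - scores.max())
    weights /= weights.sum()                     # softmax over encoder positions
    return weights @ H                           # weighted sum of encoder vectors

rng = np.random.default_rng(0)
H = rng.normal(size=(5, 8))                      # 5 encoder vectors of size 8
s = rng.normal(size=4)                           # decoder state of size 4
W1, W2, v = rng.normal(size=(6, 8)), rng.normal(size=(6, 4)), rng.normal(size=6)
context = additive_attention(H, s, W1, W2, v)    # -> context vector of size 8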
def decode(model, dec_lstm, vectors, output):
    output = [EOS] + list(output) + [EOS]
    output = [char2int[c] for c in output]

    w = pc.parameter(model["decoder_w"])
    b = pc.parameter(model["decoder_b"])

    s = dec_lstm.initial_state().add_input(pc.vecInput(STATE_SIZE * 2))
    loss = []
    for char in output:
        vector = attend(model, vectors, s)
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = pc.softmax(out_vector)
        loss.append(-pc.log(pc.pick(probs, char)))
    loss = pc.esum(loss)
    return loss
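# Hedged training-loop sketch for decode() above: embedd_sentence / encode_sentence are
# the helpers referenced elsewhere in this listing, while `trainer` and the
# (input_sentence, output_sentence) pairs are assumptions. The graph is rebuilt per
# example, the per-character cross-entropy loss returned by decode() is backpropagated,
# and the trainer updates the parameters.
for input_sentence, output_sentence in training_pairs:
    pc.renew_cg()
    embedded = embedd_sentence(model, input_sentence)
    encoded = encode_sentence(model, enc_fwd_lstm, enc_bwd_lstm, embedded)
    loss = decode(model, dec_lstm, encoded, output_sentence)
    loss_value = loss.scalar_value()
    loss.backward()
    trainer.update()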
def one_word_loss(model, encoder_frnn, encoder_rrnn, decoder_rnn, lemma, feats, word, alphabet_index, aligned_pair,
                  feat_index, feature_types):
    pc.renew_cg()

    # read the parameters
    char_lookup = model["char_lookup"]
    feat_lookup = model["feat_lookup"]
    R = pc.parameter(model["R"])
    bias = pc.parameter(model["bias"])

    padded_lemma = BEGIN_WORD + lemma + END_WORD

    # convert characters to matching embeddings
    lemma_char_vecs = []
    for char in padded_lemma:
        try:
            lemma_char_vecs.append(char_lookup[alphabet_index[char]])
        except KeyError:
            # handle UNK
            lemma_char_vecs.append(char_lookup[alphabet_index[UNK]])

    # convert features to matching embeddings, if UNK handle properly
    feat_vecs = []
    for feat in sorted(feature_types):
        # TODO: is it OK to use same UNK for all feature types? and for unseen feats as well?
        # if this feature has a value, take it from the lookup. otherwise use UNK
        if feat in feats:
            feat_str = feat + ':' + feats[feat]
            try:
                feat_vecs.append(feat_lookup[feat_index[feat_str]])
            except KeyError:
                # handle UNK or dropout
                feat_vecs.append(feat_lookup[feat_index[UNK_FEAT]])
        else:
            feat_vecs.append(feat_lookup[feat_index[UNK_FEAT]])
    feats_input = pc.concatenate(feat_vecs)

    # BiLSTM forward pass
    s_0 = encoder_frnn.initial_state()
    s = s_0
    frnn_outputs = []
    for c in lemma_char_vecs:
        s = s.add_input(c)
        frnn_outputs.append(s.output())

    # BiLSTM backward pass
    s_0 = encoder_rrnn.initial_state()
    s = s_0
    rrnn_outputs = []
    for c in reversed(lemma_char_vecs):
        s = s.add_input(c)
        rrnn_outputs.append(s.output())

    # BiLSTM outputs
    blstm_outputs = []
    lemma_char_vecs_len = len(lemma_char_vecs)
    for i in xrange(lemma_char_vecs_len):
        blstm_outputs.append(pc.concatenate([frnn_outputs[i], rrnn_outputs[lemma_char_vecs_len - i - 1]]))

    # initialize the decoder rnn
    s_0 = decoder_rnn.initial_state()
    s = s_0

    # set prev_output_vec for first lstm step as BEGIN_WORD
    prev_output_vec = char_lookup[alphabet_index[BEGIN_WORD]]
    prev_char_vec = char_lookup[alphabet_index[BEGIN_WORD]]
    loss = []

    # i is input index, j is output index
    i = 0
    j = 0

    # go through alignments, progress j when new output is introduced, progress i when new char is seen on lemma (no ~)
    # TODO: try sutskever flip trick?
    # TODO: attention on the lemma chars/feats could help here?
    aligned_lemma, aligned_word = aligned_pair
    aligned_lemma += END_WORD
    aligned_word += END_WORD

    # run through the alignments
    for index, (input_char, output_char) in enumerate(zip(aligned_lemma, aligned_word)):
        possible_outputs = []

        # feedback, i, j, blstm[i], feats
        decoder_input = pc.concatenate([prev_output_vec,
                                        prev_char_vec,
                                        # char_lookup[alphabet_index[str(i)]],
                                        # char_lookup[alphabet_index[str(j)]],
                                        blstm_outputs[i],
                                        feats_input])

        # if reached the end word symbol
        if output_char == END_WORD:
            s = s.add_input(decoder_input)
            decoder_rnn_output = s.output()
            probs = pc.softmax(R * decoder_rnn_output + bias)

            # compute local loss
            loss.append(-pc.log(pc.pick(probs, alphabet_index[END_WORD])))
            continue

        # if there is no prefix, step
        if padded_lemma[i] == BEGIN_WORD and aligned_lemma[index] != ALIGN_SYMBOL:
            # perform rnn step
            # feedback, i, j, blstm[i], feats
            s = s.add_input(decoder_input)
            decoder_rnn_output = s.output()
            probs = pc.softmax(R * decoder_rnn_output + bias)

            # compute local loss
            loss.append(-pc.log(pc.pick(probs, alphabet_index[STEP])))

            # prepare for the next iteration - "feedback"
            prev_output_vec = char_lookup[alphabet_index[STEP]]
            prev_char_vec = char_lookup[alphabet_index[EPSILON]]
            i += 1

        # if there is new output
        if aligned_word[index] != ALIGN_SYMBOL:
            decoder_input = pc.concatenate([prev_output_vec,
                                            prev_char_vec,
                                            # char_lookup[alphabet_index[str(i)]],
                                            # char_lookup[alphabet_index[str(j)]],
                                            blstm_outputs[i],
                                            feats_input])

            # copy i action - maybe model as a single action?
            if padded_lemma[i] == aligned_word[j]:
                possible_outputs.append(str(i))
                possible_outputs.append(padded_lemma[i])
            else:
                possible_outputs.append(aligned_word[index])

            # perform rnn step
            s = s.add_input(decoder_input)
            decoder_rnn_output = s.output()
            probs = pc.softmax(R * decoder_rnn_output + bias)

            local_loss = pc.scalarInput(0)
            max_output_loss = -pc.log(pc.pick(probs, alphabet_index[possible_outputs[0]]))
            max_likelihood_output = possible_outputs[0]

            # sum over all correct output possibilities and pick feedback output to be the one with the highest
            # probability
            for output in possible_outputs:
                neg_log_likelihood = -pc.log(pc.pick(probs, alphabet_index[output]))
                if neg_log_likelihood < max_output_loss:
                    max_likelihood_output = output
                    max_output_loss = neg_log_likelihood
                local_loss += neg_log_likelihood
            loss.append(local_loss)

            # prepare for the next iteration - "feedback"
            prev_output_vec = char_lookup[alphabet_index[max_likelihood_output]]
            prev_char_vec = char_lookup[alphabet_index[aligned_word[index]]]
            j += 1

        # now check if it's time to progress on input
        if i < len(padded_lemma) - 1 and aligned_lemma[index + 1] != ALIGN_SYMBOL:
            # perform rnn step
            # feedback, i, j, blstm[i], feats
            decoder_input = pc.concatenate([prev_output_vec,
                                            prev_char_vec,
                                            # char_lookup[alphabet_index[str(i)]],
                                            # char_lookup[alphabet_index[str(j)]],
                                            blstm_outputs[i],
                                            feats_input])
            s = s.add_input(decoder_input)
            decoder_rnn_output = s.output()
            probs = pc.softmax(R * decoder_rnn_output + bias)

            # compute local loss
            loss.append(-pc.log(pc.pick(probs, alphabet_index[STEP])))

            # prepare for the next iteration - "feedback"
            prev_output_vec = char_lookup[alphabet_index[STEP]]
            prev_char_vec = char_lookup[alphabet_index[EPSILON]]
            i += 1

    # TODO: maybe here a "special" loss function is appropriate?
    # loss = esum(loss)
    loss = pc.average(loss)

    return loss
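# Illustrative note on the aligned_pair argument of one_word_loss(): the comments above
# suggest ALIGN_SYMBOL is '~', so a pair such as lemma "flog" -> word "flogged" might
# arrive as the two character-aligned strings below. This particular alignment is an
# assumption for illustration, not taken from the original data; the loop then emits
# STEP actions while consuming lemma characters and character/copy actions while
# producing word characters.
aligned_lemma = "flog~~~"    # '~' = ALIGN_SYMBOL (assumed)
aligned_word = "flogged"
aligned_pair = (aligned_lemma, aligned_word)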
def predict_output_sequence(model, encoder_frnn, encoder_rrnn, decoder_rnn, lemma, feats, alphabet_index,
                            inverse_alphabet_index, feat_index, feature_types):
    pc.renew_cg()

    # read the parameters
    char_lookup = model["char_lookup"]
    feat_lookup = model["feat_lookup"]
    R = pc.parameter(model["R"])
    bias = pc.parameter(model["bias"])

    # convert characters to matching embeddings, if UNK handle properly
    padded_lemma = BEGIN_WORD + lemma + END_WORD
    lemma_char_vecs = []
    for char in padded_lemma:
        try:
            lemma_char_vecs.append(char_lookup[alphabet_index[char]])
        except KeyError:
            # handle UNK
            lemma_char_vecs.append(char_lookup[alphabet_index[UNK]])

    # convert features to matching embeddings, if UNK handle properly
    feat_vecs = []
    for feat in sorted(feature_types):
        # TODO: is it OK to use same UNK for all feature types? and for unseen feats as well?
        # if this feature has a value, take it from the lookup. otherwise use UNK
        if feat in feats:
            feat_str = feat + ':' + feats[feat]
            try:
                feat_vecs.append(feat_lookup[feat_index[feat_str]])
            except KeyError:
                # handle UNK or dropout
                feat_vecs.append(feat_lookup[feat_index[UNK_FEAT]])
        else:
            feat_vecs.append(feat_lookup[feat_index[UNK_FEAT]])
    feats_input = pc.concatenate(feat_vecs)

    # BiLSTM forward pass
    s_0 = encoder_frnn.initial_state()
    s = s_0
    frnn_outputs = []
    for c in lemma_char_vecs:
        s = s.add_input(c)
        frnn_outputs.append(s.output())

    # BiLSTM backward pass
    s_0 = encoder_rrnn.initial_state()
    s = s_0
    rrnn_outputs = []
    for c in reversed(lemma_char_vecs):
        s = s.add_input(c)
        rrnn_outputs.append(s.output())

    # BiLSTM outputs
    blstm_outputs = []
    lemma_char_vecs_len = len(lemma_char_vecs)
    for i in xrange(lemma_char_vecs_len):
        blstm_outputs.append(pc.concatenate([frnn_outputs[i], rrnn_outputs[lemma_char_vecs_len - i - 1]]))

    # initialize the decoder rnn
    s_0 = decoder_rnn.initial_state()
    s = s_0

    # set prev_output_vec for first lstm step as BEGIN_WORD
    prev_output_vec = char_lookup[alphabet_index[BEGIN_WORD]]
    prev_char_vec = char_lookup[alphabet_index[BEGIN_WORD]]

    # i is input index, j is output index
    i = j = 0
    num_outputs = 0
    predicted_output_sequence = []

    # run the decoder through the sequence and predict characters; allow up to three times
    # the max prediction length, since step outputs are added alongside character outputs
    while num_outputs < MAX_PREDICTION_LEN * 3:

        # prepare input vector and perform LSTM step
        decoder_input = pc.concatenate([prev_output_vec,
                                        prev_char_vec,
                                        # char_lookup[alphabet_index[str(i)]],
                                        # char_lookup[alphabet_index[str(j)]],
                                        blstm_outputs[i],
                                        feats_input])
        s = s.add_input(decoder_input)

        # compute softmax probs vector and predict with argmax
        decoder_rnn_output = s.output()
        probs = pc.softmax(R * decoder_rnn_output + bias)
        probs = probs.vec_value()
        predicted_output_index = common.argmax(probs)
        predicted_output = inverse_alphabet_index[predicted_output_index]
        predicted_output_sequence.append(predicted_output)

        # check if step or char output to promote i or j
        if predicted_output == STEP:
            prev_char_vec = char_lookup[alphabet_index[EPSILON]]
            if i < len(padded_lemma) - 1:
                i += 1
        else:
            if predicted_output.isdigit():
                # handle copy
                # try:
                #     prev_char_vec = char_lookup[alphabet_index[padded_lemma[i]]]
                # except KeyError:
                #     prev_char_vec = char_lookup[alphabet_index[UNK]]
                try:
                    # this way END_WORD cannot be copied (as it is in the training stage)
                    if i < len(lemma) + 1:
                        prev_char_vec = char_lookup[alphabet_index[padded_lemma[i]]]
                    else:
                        # if trying to copy from a non-existent index, pad with last lemma character
                        prev_char_vec = char_lookup[alphabet_index[lemma[-1]]]
                except KeyError:
                    prev_char_vec = char_lookup[alphabet_index[UNK]]
            else:
                # handle char
                prev_char_vec = char_lookup[predicted_output_index]

            j += 1

        num_outputs += 1

        # check if reached end of word
        if predicted_output_sequence[-1] == END_WORD:
            break

        # prepare for the next iteration - "feedback"
        prev_output_vec = char_lookup[predicted_output_index]

    # remove the end word symbol
    return predicted_output_sequence[0:-1]
def train(
    feature_mapper,
    word_dims,
    tag_dims,
    lstm_units,
    hidden_units,
    epochs,
    batch_size,
    train_data_file,
    dev_data_file,
    model_save_file,
    droprate,
    unk_param,
    alpha=1.0,
    beta=0.0,
):
    start_time = time.time()

    fm = feature_mapper
    word_count = fm.total_words()
    tag_count = fm.total_tags()

    network = Network(
        word_count=word_count,
        tag_count=tag_count,
        word_dims=word_dims,
        tag_dims=tag_dims,
        lstm_units=lstm_units,
        hidden_units=hidden_units,
        struct_out=2,
        label_out=fm.total_label_actions(),
        droprate=droprate,
    )
    network.init_params()

    print('Hidden units: {}, per-LSTM units: {}'.format(
        hidden_units,
        lstm_units,
    ))
    print('Embeddings: word={} tag={}'.format(
        (word_count, word_dims),
        (tag_count, tag_dims),
    ))
    print('Dropout rate: {}'.format(droprate))
    print('Parameters initialized in [-0.01, 0.01]')
    print('Random UNKing parameter z = {}'.format(unk_param))
    print('Exploration: alpha={} beta={}'.format(alpha, beta))

    training_data = fm.gold_data_from_file(train_data_file)
    num_batches = -(-len(training_data) // batch_size)
    print('Loaded {} training sentences ({} batches of size {})!'.format(
        len(training_data),
        num_batches,
        batch_size,
    ))
    parse_every = -(-num_batches // 4)

    dev_trees = PhraseTree.load_treefile(dev_data_file)
    print('Loaded {} validation trees!'.format(len(dev_trees)))

    best_acc = FScore()

    for epoch in xrange(1, epochs + 1):
        print('........... epoch {} ...........'.format(epoch))

        total_cost = 0.0
        total_states = 0
        training_acc = FScore()

        np.random.shuffle(training_data)

        for b in xrange(num_batches):
            batch = training_data[(b * batch_size):((b + 1) * batch_size)]

            explore = [
                Parser.exploration(
                    example,
                    fm,
                    network,
                    alpha=alpha,
                    beta=beta,
                )
                for example in batch
            ]
            for (_, acc) in explore:
                training_acc += acc
            batch = [example for (example, _) in explore]

            pycnn.renew_cg()
            network.prep_params()

            errors = []

            for example in batch:

                ## random UNKing ##
                for (i, w) in enumerate(example['w']):
                    if w <= 2:
                        continue
                    freq = fm.word_freq_list[w]
                    drop_prob = unk_param / (unk_param + freq)
                    r = np.random.random()
                    if r < drop_prob:
                        example['w'][i] = 0

                fwd, back = network.evaluate_recurrent(
                    example['w'],
                    example['t'],
                )

                for (left, right), correct in example['struct_data'].items():
                    scores = network.evaluate_struct(fwd, back, left, right)
                    probs = pycnn.softmax(scores)
                    loss = -pycnn.log(pycnn.pick(probs, correct))
                    errors.append(loss)
                total_states += len(example['struct_data'])

                for (left, right), correct in example['label_data'].items():
                    scores = network.evaluate_label(fwd, back, left, right)
                    probs = pycnn.softmax(scores)
                    loss = -pycnn.log(pycnn.pick(probs, correct))
                    errors.append(loss)
                total_states += len(example['label_data'])

            batch_error = pycnn.esum(errors)
            total_cost += batch_error.scalar_value()
            batch_error.backward()
            network.trainer.update()

            mean_cost = total_cost / total_states

            print(
                '\rBatch {} Mean Cost {:.4f} [Train: {}]'.format(
                    b,
                    mean_cost,
                    training_acc,
                ),
                end='',
            )
            sys.stdout.flush()

            if ((b + 1) % parse_every) == 0 or b == (num_batches - 1):
                dev_acc = Parser.evaluate_corpus(
                    dev_trees,
                    fm,
                    network,
                )
                print(' [Val: {}]'.format(dev_acc))

                if dev_acc > best_acc:
                    best_acc = dev_acc
                    network.save(model_save_file)
                    print(' [saved model: {}]'.format(model_save_file))

        current_time = time.time()
        runmins = (current_time - start_time) / 60.
        print(' Elapsed time: {:.2f}m'.format(runmins))
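# Small aside on the batch counts above: -(-n // b) is the integer ceiling-division
# idiom, i.e. ceil(n / b) for positive integers, used for both num_batches and
# parse_every. For example:
assert -(-10 // 3) == 4
assert -(-9 // 3) == 3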