def fit(words, tags, labels, model, builders):
    """
    compute the joint error of the tag predictions for a sentence
    :param words: list of indices
    :param tags: list of indices
    :param labels: index
    :param model: current model to access parameters
    :param builders: builder to create state combinations
    :return: joint error
    """
    # retrieve model parameters
    if MLP:
        H = pycnn.parameter(pH)
        O = pycnn.parameter(pO)
    else:
        O = pycnn.parameter(pO)

    errs = []
    for (forward_state, backward_state), tag in zip(build_tagging_graph(words, model, builders), tags):
        f_b = pycnn.concatenate([forward_state, backward_state])
        if MLP:
            # TODO: add bias terms
            r_t = O * (pycnn.tanh(H * f_b))
        else:
            r_t = O * f_b
        err = pycnn.pickneglogsoftmax(r_t, tag)
        errs.append(err)
    return pycnn.esum(errs)
def calc_sentence_error(self, sentence):
    word_expression_list = self._build_word_expression_list(sentence, is_train=True)
    sentence_errors = []
    for word, word_expression in zip(sentence, word_expression_list):
        gold_label_index = self.tag_indexer.get_index(word.gold_label)
        word_error = pickneglogsoftmax(word_expression, gold_label_index)
        sentence_errors.append(word_error)
    return esum(sentence_errors)
def compute_loss(model, encoder_frnn, encoder_rrnn, decoder_rnn, lemma, feats, word, alphabet_index, feat_index,
                 feature_types):
    pc.renew_cg()

    # read the parameters
    char_lookup = model["char_lookup"]
    feat_lookup = model["feat_lookup"]
    R = pc.parameter(model["R"])
    bias = pc.parameter(model["bias"])
    W_c = pc.parameter(model["W_c"])
    W__a = pc.parameter(model["W__a"])
    U__a = pc.parameter(model["U__a"])
    v__a = pc.parameter(model["v__a"])

    blstm_outputs = encode_chars(alphabet_index, char_lookup, encoder_frnn, encoder_rrnn, lemma)

    # initialize the decoder rnn
    s_0 = decoder_rnn.initial_state()
    s = s_0

    # convert features to matching embeddings, handle UNK properly
    feats_input = encode_feats(feat_index, feat_lookup, feats, feature_types)

    # set prev_output_vec for the first lstm step as BEGIN_WORD
    prev_output_vec = char_lookup[alphabet_index[BEGIN_WORD]]
    loss = []
    padded_word = word + END_WORD

    # run the decoder through the output sequence and aggregate loss
    for i, output_char in enumerate(padded_word):
        # get current h of the decoder
        s = s.add_input(pc.concatenate([prev_output_vec, feats_input]))
        decoder_rnn_output = s.output()
        attention_output_vector, alphas, W = task1_attention_implementation.attend(
            blstm_outputs, decoder_rnn_output, W_c, v__a, W__a, U__a)

        # compute output probabilities
        readout = R * attention_output_vector + bias
        current_loss = pc.pickneglogsoftmax(readout, alphabet_index[output_char])
        loss.append(current_loss)

        # prepare for the next iteration - "feedback"
        prev_output_vec = char_lookup[alphabet_index[output_char]]

    total_sequence_loss = pc.esum(loss)
    return total_sequence_loss
def compute_loss(model, encoder_frnn, encoder_rrnn, decoder_rnn, lemma, feats, word, alphabet_index, feat_index,
                 feature_types):
    pc.renew_cg()

    # read the parameters
    char_lookup = model["char_lookup"]
    feat_lookup = model["feat_lookup"]
    R = pc.parameter(model["R"])
    bias = pc.parameter(model["bias"])
    W_c = pc.parameter(model["W_c"])
    W__a = pc.parameter(model["W__a"])
    U__a = pc.parameter(model["U__a"])
    v__a = pc.parameter(model["v__a"])

    blstm_outputs = encode_chars(alphabet_index, char_lookup, encoder_frnn, encoder_rrnn, lemma)

    # initialize the decoder rnn
    s_0 = decoder_rnn.initial_state()
    s = s_0

    # convert features to matching embeddings, handle UNK properly
    feats_input = encode_feats(feat_index, feat_lookup, feats, feature_types)

    # set prev_output_vec for the first lstm step as BEGIN_WORD
    prev_output_vec = char_lookup[alphabet_index[BEGIN_WORD]]
    loss = []
    padded_word = word + END_WORD

    # run the decoder through the output sequence and aggregate loss
    for i, output_char in enumerate(padded_word):
        # get current h of the decoder
        s = s.add_input(pc.concatenate([prev_output_vec, feats_input]))
        decoder_rnn_output = s.output()
        attention_output_vector, alphas, W = task1_attention_implementation.attend(blstm_outputs, decoder_rnn_output,
                                                                                   W_c, v__a, W__a, U__a)

        # compute output probabilities
        readout = R * attention_output_vector + bias
        current_loss = pc.pickneglogsoftmax(readout, alphabet_index[output_char])
        loss.append(current_loss)

        # prepare for the next iteration - "feedback"
        prev_output_vec = char_lookup[alphabet_index[output_char]]

    total_sequence_loss = pc.esum(loss)
    return total_sequence_loss
def attend(model, vectors, state):
    w = pc.parameter(model['attention_w'])
    attention_weights = []
    for vector in vectors:
        # concatenate each encoded vector with the current decoder state
        attention_input = pc.concatenate([vector, pc.concatenate(list(state.s()))])
        # get the attention weight for the encoded vector
        attention_weights.append(w * attention_input)
    # normalize the weights
    attention_weights = pc.softmax(pc.concatenate(attention_weights))
    # apply the weights
    vectors = pc.esum([vector * attention_weight
                       for vector, attention_weight in zip(vectors, attention_weights)])
    return vectors
def attend(model, input_vectors, state):
    w1 = pc.parameter(model['attention_w1'])
    w2 = pc.parameter(model['attention_w2'])
    v = pc.parameter(model['attention_v'])
    attention_weights = []

    w2dt = w2 * pc.concatenate(list(state.s()))
    for input_vector in input_vectors:
        attention_weight = v * pc.tanh(w1 * input_vector + w2dt)
        attention_weights.append(attention_weight)
    attention_weights = pc.softmax(pc.concatenate(attention_weights))

    output_vectors = pc.esum([vector * attention_weight
                              for vector, attention_weight in zip(input_vectors, attention_weights)])
    return output_vectors
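# --- Illustrative sketch (not from the snippets above): parameter shapes assumed by attend() ---
# A minimal, hypothetical setup for the MLP-style attention in attend(model, input_vectors, state),
# using the old pycnn named-parameter API. The names ENC_VEC_SIZE, DEC_STATE_SIZE and ATTENTION_SIZE
# are illustrative assumptions, not values taken from the original code.
import pycnn as pc

ENC_VEC_SIZE = 128     # size of each vector in input_vectors (e.g. fwd+bwd encoder states concatenated)
DEC_STATE_SIZE = 256   # size of pc.concatenate(list(state.s())), i.e. all decoder layer states stacked
ATTENTION_SIZE = 32    # hidden size of the attention MLP

model = pc.Model()
model.add_parameters("attention_w1", (ATTENTION_SIZE, ENC_VEC_SIZE))    # scores each encoder vector
model.add_parameters("attention_w2", (ATTENTION_SIZE, DEC_STATE_SIZE))  # projects the decoder state once per step
model.add_parameters("attention_v", (1, ATTENTION_SIZE))                # reduces the tanh output to a scalar score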
def fit(self, list_folders_name, num_iterations, train_algo, dev=None):
    """
    train the tagger
    """
    print("read training data", file=sys.stderr)

    nb_tasks = len(list_folders_name)

    train_X, train_Y, task_labels, w2i, c2i, task2t2i = self.get_train_data(list_folders_name)

    ## after calling get_train_data we have self.tasks_ids
    self.task2layer = {task_id: out_layer for task_id, out_layer in zip(self.tasks_ids, self.pred_layer)}
    print("task2layer", self.task2layer, file=sys.stderr)

    # store mappings of words and tags to indices
    self.set_indices(w2i, c2i, task2t2i)

    # init lookup parameters and define graph
    print("build graph", file=sys.stderr)

    num_words = len(self.w2i)
    num_chars = len(self.c2i)

    assert nb_tasks == len(self.pred_layer)

    self.predictors, self.char_rnn, self.wembeds, self.cembeds = self.build_computation_graph(num_words, num_chars)

    if train_algo == "sgd":
        trainer = pycnn.SimpleSGDTrainer(self.model)
    elif train_algo == "adam":
        trainer = pycnn.AdamTrainer(self.model)
    else:
        # fail early instead of hitting an unbound `trainer` below
        raise ValueError("unknown train_algo: {}".format(train_algo))

    train_data = list(zip(train_X, train_Y, task_labels))

    for iter in range(num_iterations):
        total_loss = 0.0
        total_tagged = 0.0
        random.shuffle(train_data)
        for ((word_indices, char_indices), y, task_of_instance) in train_data:
            # use same predict function for training and testing
            output = self.predict(word_indices, char_indices, task_of_instance, train=True)
            loss1 = pycnn.esum([self.pick_neg_log(pred, gold) for pred, gold in zip(output, y)])
            lv = loss1.value()
            total_loss += lv
            total_tagged += len(word_indices)
            loss1.backward()
            trainer.update()
        print("iter {2} {0:>12}: {1:.2f}".format("total loss", total_loss / total_tagged, iter), file=sys.stderr)
def calc_sentence_error(self, sentence):
    renew_cg()
    for word in sentence:
        # word.vector = noise(self._get_word_vector(word), 0.1)
        word.vector = self._get_word_vector(word, use_dropout=True)
    sentence_expressions = self._build_sentence_expressions(sentence)

    sentence_errors = []
    for word, word_expression in zip(sentence, sentence_expressions):
        gold_label_index = self.tag_indexer.get_index(word.gold_label)
        word_error = pickneglogsoftmax(word_expression, gold_label_index)
        sentence_errors.append(word_error)

    return esum(sentence_errors)
def decode(model, dec_lstm, vectors, output):
    output = [EOS] + list(output) + [EOS]
    output = [char2int[c] for c in output]

    w = pc.parameter(model["decoder_w"])
    b = pc.parameter(model["decoder_b"])

    s = dec_lstm.initial_state().add_input(pc.vecInput(STATE_SIZE * 2))
    loss = []
    for char in output:
        vector = attend(model, vectors, s)

        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = pc.softmax(out_vector)
        loss.append(-pc.log(pc.pick(probs, char)))
    loss = pc.esum(loss)
    return loss
def decode(model, dec_lstm, vectors, output):
    output = [EOS] + list(output) + [EOS]
    output = [char2int[c] for c in output]

    w = pc.parameter(model["decoder_w"])
    b = pc.parameter(model["decoder_b"])

    s = dec_lstm.initial_state().add_input(pc.vecInput(STATE_SIZE * 2))
    loss = []
    for char in output:
        vector = attend(model, vectors, s)

        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = pc.softmax(out_vector)
        loss.append(-pc.log(pc.pick(probs, char)))
    loss = pc.esum(loss)
    return loss
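# --- Illustrative sketch (not from the snippets above): driving the decode() loss in a training step ---
# A minimal, hypothetical training loop for a loss expression like the one returned by decode().
# It assumes `import pycnn as pc`, plus the `model`, `dec_lstm` and attend()/decode() defined above;
# `encode_sentence` and `training_pairs` are placeholders for an encoder helper and the training data,
# and SimpleSGDTrainer is only one possible trainer choice.
trainer = pc.SimpleSGDTrainer(model)
for input_string, output_string in training_pairs:
    pc.renew_cg()                                    # fresh computation graph per example
    vectors = encode_sentence(model, input_string)   # hypothetical: returns the encoded input vectors
    loss = decode(model, dec_lstm, vectors, output_string)
    loss_value = loss.value()                        # forward pass; useful for logging
    loss.backward()                                  # backprop through the whole graph
    trainer.update()                                 # apply the gradient update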
def train(
    feature_mapper,
    word_dims,
    tag_dims,
    lstm_units,
    hidden_units,
    epochs,
    batch_size,
    train_data_file,
    dev_data_file,
    model_save_file,
    droprate,
    unk_param,
    alpha=1.0,
    beta=0.0,
):
    start_time = time.time()

    fm = feature_mapper
    word_count = fm.total_words()
    tag_count = fm.total_tags()

    network = Network(
        word_count=word_count,
        tag_count=tag_count,
        word_dims=word_dims,
        tag_dims=tag_dims,
        lstm_units=lstm_units,
        hidden_units=hidden_units,
        struct_out=2,
        label_out=fm.total_label_actions(),
        droprate=droprate,
    )
    network.init_params()

    print('Hidden units: {}, per-LSTM units: {}'.format(
        hidden_units,
        lstm_units,
    ))
    print('Embeddings: word={} tag={}'.format(
        (word_count, word_dims),
        (tag_count, tag_dims),
    ))
    print('Dropout rate: {}'.format(droprate))
    print('Parameters initialized in [-0.01, 0.01]')
    print('Random UNKing parameter z = {}'.format(unk_param))
    print('Exploration: alpha={} beta={}'.format(alpha, beta))

    training_data = fm.gold_data_from_file(train_data_file)
    num_batches = -(-len(training_data) // batch_size)
    print('Loaded {} training sentences ({} batches of size {})!'.format(
        len(training_data),
        num_batches,
        batch_size,
    ))
    parse_every = -(-num_batches // 4)

    dev_trees = PhraseTree.load_treefile(dev_data_file)
    print('Loaded {} validation trees!'.format(len(dev_trees)))

    best_acc = FScore()

    for epoch in xrange(1, epochs + 1):
        print('........... epoch {} ...........'.format(epoch))
        total_cost = 0.0
        total_states = 0
        training_acc = FScore()

        np.random.shuffle(training_data)

        for b in xrange(num_batches):
            batch = training_data[(b * batch_size):((b + 1) * batch_size)]

            explore = [
                Parser.exploration(
                    example,
                    fm,
                    network,
                    alpha=alpha,
                    beta=beta,
                )
                for example in batch
            ]
            for (_, acc) in explore:
                training_acc += acc
            batch = [example for (example, _) in explore]

            pycnn.renew_cg()
            network.prep_params()

            errors = []

            for example in batch:

                ## random UNKing ##
                for (i, w) in enumerate(example['w']):
                    if w <= 2:
                        continue

                    freq = fm.word_freq_list[w]
                    drop_prob = unk_param / (unk_param + freq)
                    r = np.random.random()
                    if r < drop_prob:
                        example['w'][i] = 0

                fwd, back = network.evaluate_recurrent(
                    example['w'],
                    example['t'],
                )

                for (left, right), correct in example['struct_data'].items():
                    scores = network.evaluate_struct(fwd, back, left, right)
                    probs = pycnn.softmax(scores)
                    loss = -pycnn.log(pycnn.pick(probs, correct))
                    errors.append(loss)
                total_states += len(example['struct_data'])

                for (left, right), correct in example['label_data'].items():
                    scores = network.evaluate_label(fwd, back, left, right)
                    probs = pycnn.softmax(scores)
                    loss = -pycnn.log(pycnn.pick(probs, correct))
                    errors.append(loss)
                total_states += len(example['label_data'])

            batch_error = pycnn.esum(errors)
            total_cost += batch_error.scalar_value()
            batch_error.backward()
            network.trainer.update()

            mean_cost = total_cost / total_states

            print(
                '\rBatch {} Mean Cost {:.4f} [Train: {}]'.format(
                    b,
                    mean_cost,
                    training_acc,
                ),
                end='',
            )
            sys.stdout.flush()

            if ((b + 1) % parse_every) == 0 or b == (num_batches - 1):
                dev_acc = Parser.evaluate_corpus(
                    dev_trees,
                    fm,
                    network,
                )
                print(' [Val: {}]'.format(dev_acc))

                if dev_acc > best_acc:
                    best_acc = dev_acc
                    network.save(model_save_file)
                    print(' [saved model: {}]'.format(model_save_file))

                current_time = time.time()
                runmins = (current_time - start_time) / 60.
                print(' Elapsed time: {:.2f}m'.format(runmins))
def compute_loss(model, encoder_frnn, encoder_rrnn, decoder_rnn, lemma, feats, word, alphabet_index, feat_index,
                 feature_types, alignment):
    pc.renew_cg()

    # read the parameters
    char_lookup = model["char_lookup"]
    feat_lookup = model["feat_lookup"]
    R = pc.parameter(model["R"])
    bias = pc.parameter(model["bias"])
    W_c = pc.parameter(model["W_c"])
    W__a = pc.parameter(model["W__a"])
    U__a = pc.parameter(model["U__a"])
    v__a = pc.parameter(model["v__a"])

    template = task1_ms2s.generate_template_from_alignment(alignment)

    blstm_outputs = encode_chars(alphabet_index, char_lookup, encoder_frnn, encoder_rrnn, lemma)

    # initialize the decoder rnn
    s_0 = decoder_rnn.initial_state()
    s = s_0

    # convert features to matching embeddings, handle UNK properly
    feats_input = encode_feats(feat_index, feat_lookup, feats, feature_types)

    # set prev_output_vec for the first lstm step as BEGIN_WORD
    prev_output_vec = char_lookup[alphabet_index[BEGIN_WORD]]
    prev_char_vec = char_lookup[alphabet_index[BEGIN_WORD]]
    loss = []
    padded_word = word + END_WORD
    padded_template = template + [END_WORD]

    # run the decoder through the output sequence and aggregate loss
    for i, output_char in enumerate(padded_word):
        # find all possible actions - copy from index, output specific character etc.
        possible_outputs = list(set([padded_template[i]] + [output_char]))

        # get current h of the decoder
        s = s.add_input(pc.concatenate([prev_output_vec, prev_char_vec, feats_input]))
        decoder_rnn_output = s.output()
        attention_output_vector, alphas, W = task1_attention_implementation.attend(
            blstm_outputs, decoder_rnn_output, W_c, v__a, W__a, U__a)

        # compute output probabilities
        readout = R * attention_output_vector + bias

        # choose the feedback with the minimum neg. log likelihood; initialize with the gold character's loss
        # (compare scalar values, since comparing Expression objects directly does not compare the losses)
        min_neg_log_loss = pc.pickneglogsoftmax(readout, alphabet_index[output_char]).value()
        prev_output_char = output_char
        prev_output_action = output_char
        for output in possible_outputs:
            current_loss = pc.pickneglogsoftmax(readout, alphabet_index[output])

            # append the loss of all options
            loss.append(current_loss)
            if current_loss.value() < min_neg_log_loss:
                min_neg_log_loss = current_loss.value()
                prev_output_action = output

        # prepare for the next iteration - "feedback"
        prev_output_vec = char_lookup[alphabet_index[prev_output_action]]
        prev_char_vec = char_lookup[alphabet_index[prev_output_char]]

    total_sequence_loss = pc.esum(loss)
    return total_sequence_loss
def compute_loss(model, encoder_frnn, encoder_rrnn, decoder_rnn, lemma, feats, word, alphabet_index, feat_index,
                 feature_types, alignment):
    pc.renew_cg()

    # read the parameters
    char_lookup = model["char_lookup"]
    feat_lookup = model["feat_lookup"]
    R = pc.parameter(model["R"])
    bias = pc.parameter(model["bias"])
    W_c = pc.parameter(model["W_c"])
    W__a = pc.parameter(model["W__a"])
    U__a = pc.parameter(model["U__a"])
    v__a = pc.parameter(model["v__a"])

    template = task1_ms2s.generate_template_from_alignment(alignment)

    blstm_outputs = task1_attention_implementation.encode_feats_and_chars(alphabet_index, char_lookup, encoder_frnn,
                                                                          encoder_rrnn, feat_index, feat_lookup,
                                                                          feats, feature_types, lemma)

    # initialize the decoder rnn
    s_0 = decoder_rnn.initial_state()
    s = s_0

    # set prev_output_vec for the first lstm step as BEGIN_WORD
    prev_output_vec = char_lookup[alphabet_index[BEGIN_WORD]]
    prev_char_vec = char_lookup[alphabet_index[BEGIN_WORD]]
    loss = []
    padded_word = word + END_WORD
    padded_template = template + [END_WORD]

    # run the decoder through the output sequence and aggregate loss
    for i, output_char in enumerate(padded_word):
        # find all possible actions - copy from index, output specific character etc.
        possible_outputs = list(set([padded_template[i]]))  # + [output_char]))

        # get current h of the decoder
        s = s.add_input(pc.concatenate([prev_output_vec, prev_char_vec]))
        decoder_rnn_output = s.output()
        attention_output_vector, alphas, W = task1_attention_implementation.attend(
            blstm_outputs, decoder_rnn_output, W_c, v__a, W__a, U__a)

        # compute output probabilities
        readout = R * attention_output_vector + bias

        # choose the feedback with the minimum neg. log likelihood; initialize with the gold character's loss
        # (compare scalar values, since comparing Expression objects directly does not compare the losses)
        min_neg_log_loss = pc.pickneglogsoftmax(readout, alphabet_index[output_char]).value()
        prev_output_char = output_char
        prev_output_action = output_char
        for output in possible_outputs:
            current_loss = pc.pickneglogsoftmax(readout, alphabet_index[output])

            # append the loss of all options
            loss.append(current_loss)
            if current_loss.value() < min_neg_log_loss:
                min_neg_log_loss = current_loss.value()
                prev_output_action = output

        # prepare for the next iteration - "feedback"
        prev_output_vec = char_lookup[alphabet_index[prev_output_action]]
        prev_char_vec = char_lookup[alphabet_index[prev_output_char]]

    total_sequence_loss = pc.esum(loss)
    return total_sequence_loss