def epoch_train(self, examples):
    count = 0
    dy.renew_cg()
    current_losses = []
    loss_list = []
    for word, context in examples:
        loss = self.get_score(word, context)
        current_losses.append(loss)
        loss_list.append(loss.value())
        if len(current_losses) >= self.batch_size:
            mean_loss = dy.esum(current_losses) / float(len(current_losses))
            mean_loss.forward()
            mean_loss.backward()
            self.optimizer.update()
            current_losses = []
            dy.renew_cg()
        count += 1
        # Print the average loss every 1M examples
        if count % 1000000 == 0:
            print(count, np.mean(np.array(loss_list)))
            loss_list = []
    if current_losses:
        mean_loss = dy.esum(current_losses) / float(len(current_losses))
        mean_loss.forward()
        mean_loss.backward()
        self.optimizer.update()
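# A minimal, self-contained sketch of the pattern epoch_train relies on:
# accumulate per-example losses, collapse them with dy.esum once the batch is
# full, then backprop and renew the graph. The toy model and data below are
# illustrative, not from the original code; direct use of Parameters as
# Expressions assumes DyNet >= 2.1.
import dynet as dy
import numpy as np

pc = dy.ParameterCollection()
W = pc.add_parameters((1, 4))
optimizer = dy.SimpleSGDTrainer(pc)
data = [(np.random.rand(4).tolist(), 1.0) for _ in range(10)]
batch_size = 4
current_losses = []

dy.renew_cg()
for x, y in data:
    pred = dy.logistic(W * dy.inputVector(x))
    current_losses.append(dy.squared_distance(pred, dy.scalarInput(y)))
    if len(current_losses) >= batch_size:
        # mean batch loss, one backward pass and update per batch
        mean_loss = dy.esum(current_losses) / float(len(current_losses))
        mean_loss.forward()
        mean_loss.backward()
        optimizer.update()
        current_losses = []
        dy.renew_cg()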
def decode_loss(self, src1, src2, tgt):
    src1_mat, src2_mat, src1_w1dt, src2_w1dt, decoder_state = self.encoder_forward(
        src1, src2
    )
    _, prev_coverage = self.get_coverage(
        a_t=dy.vecInput(len(src1)), prev_coverage=dy.vecInput(len(src1))
    )
    loss = []
    cov_loss = []
    diag_loss = []
    embedded_tgt = self.embed_idx(tgt, self.tgt_lookup)
    last_output_embeddings = self.tgt_lookup[self.tgt_vocab.str2int(EOS)]

    for t, (char, embedded_char) in enumerate(zip(tgt, embedded_tgt)):
        a_t, c1_t = self.attend(
            src1_mat,
            decoder_state,
            src1_w1dt,
            self.att1_w2,
            self.att1_v,
            prev_coverage,
        )
        if not self.single_source:
            _, c2_t = self.attend(
                src2_mat, decoder_state, src2_w1dt, self.att2_w2, self.att2_v, None
            )
        else:
            c2_t = dy.vecInput(2 * HIDDEN_DIM)
        x_t = dy.concatenate([c1_t, c2_t, last_output_embeddings])
        decoder_state = decoder_state.add_input(x_t)
        out_vector = self.dec_w * decoder_state.output() + self.dec_b
        probs = dy.softmax(out_vector)
        probs, _ = self.get_pointergen_probs(
            c1_t, decoder_state, x_t, a_t, probs, src1
        )
        loss.append(-dy.log(dy.pick(probs, char)))
        cov_loss_cur, prev_coverage = self.get_coverage(a_t, prev_coverage)
        cov_loss.append(cov_loss_cur)
        diag_loss.append(self.get_diag_loss(a_t, t))
        last_output_embeddings = embedded_char

    loss = dy.esum(loss)
    cov_loss = dy.esum(cov_loss)
    diag_loss = dy.esum(diag_loss)
    return loss + COV_LOSS_WEIGHT * cov_loss + DIAG_LOSS_WEIGHT * diag_loss
def get_pointergen_probs(self, c_t, state, x_t, a_t, probs, src1):
    if not self.pointer_gen:
        return probs, 1.0
    unk_idx = self.tgt_vocab.str2int(UNK)
    p_gen = dy.logistic(
        self.ptr_w_c * c_t
        + self.ptr_w_s * dy.concatenate(list(state.s()))
        + self.ptr_w_x * x_t
    )
    gen_probs = probs * p_gen
    copy_probs = a_t * (1 - p_gen)
    # one bucket per target-vocabulary entry, seeded with the generation probability
    copy_probs_update = [[i] for i in gen_probs]
    for char, prob in zip(src1, copy_probs):
        cur_idx = self.tgt_vocab.str2int(self.src1_vocab.int2str(char))
        if cur_idx == unk_idx:
            continue
        if isinstance(cur_idx, int):
            copy_probs_update[cur_idx].append(prob)
        else:
            # ambiguous mapping: spread the copy probability over all candidates
            for idx in cur_idx:
                copy_probs_update[idx].append(prob / len(cur_idx))
    sum_probs = dy.concatenate([dy.esum(exps) for exps in copy_probs_update])
    return sum_probs, p_gen.scalar_value()
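# Toy illustration of the scatter-add above: generation probabilities seed one
# bucket per target-vocab id, copy probabilities are added to the bucket of the
# (mapped) source token, and dy.esum collapses each bucket. The vocab, p_gen
# value, and mapping below are made up for illustration.
import dynet as dy

dy.renew_cg()
gen_probs = dy.inputVector([0.2, 0.5, 0.3]) * 0.8   # p_gen = 0.8
copy_probs = dy.inputVector([0.9, 0.1]) * 0.2       # 1 - p_gen = 0.2
src_to_tgt = [1, 1]   # both source tokens map to target id 1
buckets = [[p] for p in gen_probs]
for tgt_idx, cp in zip(src_to_tgt, copy_probs):
    buckets[tgt_idx].append(cp)
final = dy.concatenate([dy.esum(b) for b in buckets])
# the mixture stays a proper distribution
assert abs(final.npvalue().sum() - 1.0) < 1e-6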
def get_diag_loss(self, a_t, t):
    if self.diag_loss < 0:
        return dy.scalarInput(0)
    # penalize attention mass falling outside a band of width diag_loss
    # around the diagonal (position t)
    off_diag_elems = [dy.scalarInput(0)]
    for i, prob in enumerate(a_t):
        if i < (t - self.diag_loss) or i > (t + self.diag_loss):
            off_diag_elems.append(prob)
    return dy.esum(off_diag_elems)
def compute_loss_multilabel(self, task, seq, multi_y):
    """
    Computes the loss for multi-label instances by averaging the negative
    log probabilities of all correct labels.
    """
    out_probs = self(task, seq)
    losses = []
    for y in multi_y:
        assigned_prob = dn.pick(out_probs, y)
        losses.append(-dn.log(assigned_prob) / len(multi_y))
    return dn.esum(losses)
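# Quick check of compute_loss_multilabel's arithmetic: summing -log(p_y)/K over
# K gold labels equals the mean negative log-likelihood of those labels. The
# probability vector below is made up for illustration.
import dynet as dn
import numpy as np

dn.renew_cg()
out_probs = dn.inputVector([0.1, 0.6, 0.3])
multi_y = [1, 2]
losses = [-dn.log(dn.pick(out_probs, y)) / len(multi_y) for y in multi_y]
loss = dn.esum(losses)
expected = np.mean([-np.log(0.6), -np.log(0.3)])
assert abs(loss.value() - expected) < 1e-6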
def train_batch(self, words):
    losses = []
    W = dy.parameter(self.W)
    b = dy.parameter(self.b)
    for word in words:
        wlosses = []
        word = self.word_to_indices(word)
        s = self.lstm.initial_state()
        for c, next_c in zip(word, word[1:]):
            s = s.add_input(self.lookup[c])
            unnormalized = dy.affine_transform([b, W, s.output()])
            wlosses.append(dy.pickneglogsoftmax(unnormalized, next_c))
        losses.append(dy.esum(wlosses) / len(word))
    return dy.esum(losses) / len(words)
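# dy.pickneglogsoftmax(v, i) used above is the fused, numerically stabler
# equivalent of -log(softmax(v)[i]); a quick equivalence check on made-up
# scores:
import dynet as dy

dy.renew_cg()
scores = dy.inputVector([1.0, 2.0, 0.5])
fused = dy.pickneglogsoftmax(scores, 1)
manual = -dy.log(dy.pick(dy.softmax(scores), 1))
assert abs(fused.value() - manual.value()) < 1e-6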
def calculate_loss(self, sents):
    dy.renew_cg()
    losses = []
    for sent in sents:
        features, t_features, feat_reconstruct = self.get_features_for_tagging(
            sent, True
        )
        gold_tags = [tag for chars, word, feats, tag in sent]
        cur_loss = self.crf_module.negative_log_loss(
            features, t_features, gold_tags
        )
        if self.autoencoder:
            autoencoder_loss = [
                dy.binary_log_loss(reconstruct, dy.inputTensor(feats))
                for reconstruct, (chars, word, feats, tag) in zip(
                    feat_reconstruct, sent
                )
            ]
        else:
            # no autoencoder: contribute zero loss
            autoencoder_loss = [dy.scalarInput(0)]
        losses.append(cur_loss + (dy.esum(autoencoder_loss) / self.featsize))
    return dy.esum(losses)
def calc_attend(self, a_vecs, b_vecs, dropout):
    l_a = a_vecs.dim()[1]
    l_b = b_vecs.dim()[1]
    fa = self.attend.evaluate_network(a_vecs, True, dropout)
    fb = self.attend.evaluate_network(b_vecs, True, dropout)

    # unnormalized attention scores e_ij = F(a_i) . F(b_j)
    e_ij = []
    for i in range(l_a):
        e_ij.append([])
        for j in range(l_b):
            e_ij[i].append(
                dy.dot_product(dy.pick_batch_elem(fa, i),
                               dy.pick_batch_elem(fb, j)))

    # beta_i: attention over b for each a_i (normalize over j)
    beta_softmaxes = [
        dy.softmax(dy.concatenate(e_ij[i])) for i in range(l_a)
    ]
    # alpha_j: attention over a for each b_j (normalize over i; the original
    # normalized over j here, which duplicated the beta weights)
    alpha_softmaxes = [
        dy.softmax(dy.concatenate([e_ij[i][j] for i in range(l_a)]))
        for j in range(l_b)
    ]
    betas = [
        dy.esum([
            dy.pick_batch_elem(b_vecs, j) * beta_softmaxes[i][j]
            for j in range(l_b)
        ]) for i in range(l_a)
    ]
    alphas = [
        dy.esum([
            dy.pick_batch_elem(a_vecs, i) * alpha_softmaxes[j][i]
            for i in range(l_a)
        ]) for j in range(l_b)
    ]
    return alphas, betas
def set_initial_states(self, x):
    self.xt_embs = [dy.lookup(self.F, x_t) for x_t in x]
    if self.encoder_type == 'bow':
        self.W_enc = self.W * dy.average(self.xt_embs)
    elif self.encoder_type == 'attention':
        # smoothed embeddings for the attention-based encoder: sum a window
        # of 2q+1 embeddings around each position and scale by 1/q
        self.xb = dy.concatenate([
            dy.esum(self.xt_embs[max(i - self.q, 0):min(len(x), i + self.q + 1)]) / self.q
            for i in range(len(x))
        ], d=1)
        self.xt = dy.transpose(dy.concatenate(self.xt_embs, d=1))
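# The slice bounds above implement the smoothed encoder: position i sums the
# embeddings in a window [i-q, i+q], clipped at the sequence ends, and scales
# by 1/q. A plain-numpy sketch of the same indexing (q and the toy embeddings
# are illustrative):
import numpy as np

q = 2
embs = [np.ones(4) * i for i in range(7)]   # stand-ins for xt_embs
xb = [sum(embs[max(i - q, 0):min(len(embs), i + q + 1)]) / q
      for i in range(len(embs))]
# e.g. at i=0 only positions 0..2 exist, so fewer than 2q+1 terms are summed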
def predict_sequence(self, seq, inputs, train=False, output_confidences=False,
                     unk_tag=None, dictionary=None, type_constraint=False,
                     **kwargs):
    output = [self.network_builder(x, **kwargs) for x in inputs]
    if not train:
        if dictionary and type_constraint:
            # apply type-constrained decoding only during testing
            pred_tags = []
            for i, o in enumerate(output):
                softmax_distr = o.npvalue()
                word = seq.words[i]
                softmax_distr = self.prune_softmax(softmax_distr, word, dictionary)
                tag_best = self.index2tag[np.argmax(softmax_distr)]
                pred_tags.append(tag_best)
            seq.pred_tags = pred_tags
        else:
            # map log probabilities back to tag indices
            seq.pred_tags = [self.index2tag[np.argmax(o.npvalue())] for o in output]
        if output_confidences:
            seq.tag_confidences = array.array(
                'f', [np.max(o.npvalue()) for o in output])
    if train:
        # return the summed loss over all tags
        gold_tag_indices = array.array('I', [self.tag2index[t] for t in seq.tags])
        return dynet.esum([
            pick_neg_log(pred, gold)
            for pred, gold in zip(output, gold_tag_indices)
        ])
def train(builder, model, model_parameters, X_train, y_train, nepochs,
          alpha=0.01, update=True, dropout=0.0, x_y_vectors=None,
          num_hidden_layers=0):
    """
    Train the LSTM
    :param builder: the LSTM builder
    :param model: LSTM RNN model
    :param model_parameters: the model parameters
    :param X_train: the lstm instances
    :param y_train: the lstm labels
    :param nepochs: number of epochs
    :param alpha: the learning rate (only for SGD)
    :param update: whether to update the lemma embeddings
    :param dropout: dropout probability for all component embeddings
    :param x_y_vectors: the word vectors of x and y
    :param num_hidden_layers: the number of hidden layers for the term-pair classification network
    """
    trainer = dy.AdamTrainer(model, alpha=alpha)
    minibatch_size = min(MINIBATCH_SIZE, len(y_train))
    nminibatches = int(math.ceil(len(y_train) / minibatch_size))
    previous_loss = 1000

    for epoch in range(nepochs):
        total_loss = 0.0
        epoch_indices = np.random.permutation(len(y_train))

        for minibatch in range(nminibatches):
            path_cache = {}
            batch_indices = epoch_indices[minibatch * minibatch_size:
                                          (minibatch + 1) * minibatch_size]
            dy.renew_cg()
            loss = dy.esum([
                -dy.log(dy.pick(
                    process_one_instance(
                        builder, model, model_parameters,
                        X_train[batch_indices[i]], path_cache, update, dropout,
                        x_y_vectors=x_y_vectors[batch_indices[i]]
                        if x_y_vectors is not None else None,
                        num_hidden_layers=num_hidden_layers),
                    y_train[batch_indices[i]]))
                for i in range(minibatch_size)
            ])
            total_loss += loss.value()  # forward computation
            loss.backward()
            trainer.update()

        # trainer.update_epoch() is deprecated and requires an argument
        # (would be the epoch, presumably):
        # http://dynet.readthedocs.io/en/latest/python_ref.html#optimizers
        trainer.update()

        total_loss /= len(y_train)
        print 'Epoch', (epoch + 1), '/', nepochs, 'Loss =', total_loss

        # Early stopping
        if math.fabs(previous_loss - total_loss) < LOSS_EPSILON:
            break
        previous_loss = total_loss
def process_one_instance(builder, model, model_parameters, instance, path_cache,
                         update=True, dropout=0.0, x_y_vectors=None,
                         num_hidden_layers=0):
    """
    Return the LSTM output vector of a single term-pair - the average path embedding
    :param builder: the LSTM builder
    :param model: the LSTM model
    :param model_parameters: the model parameters
    :param instance: a Counter object with paths
    :param path_cache: the cache for path embeddings
    :param update: whether to update the lemma embeddings
    :param dropout: word dropout rate
    :param x_y_vectors: the current word vectors for x and y
    :param num_hidden_layers: the number of hidden layers for the term-pair classification network
    :return: the LSTM output vector of a single term-pair
    """
    W1 = dy.parameter(model_parameters['W1'])
    b1 = dy.parameter(model_parameters['b1'])
    W2 = None
    b2 = None
    if num_hidden_layers == 1:
        W2 = dy.parameter(model_parameters['W2'])
        b2 = dy.parameter(model_parameters['b2'])

    lemma_lookup = model_parameters['lemma_lookup']
    pos_lookup = model_parameters['pos_lookup']
    dep_lookup = model_parameters['dep_lookup']
    dir_lookup = model_parameters['dir_lookup']

    # Use the LSTM output vector and feed it to the MLP

    # Add the empty path
    paths = instance
    if len(paths) == 0:
        paths[EMPTY_PATH] = 1

    # Compute the averaged path
    num_paths = reduce(lambda x, y: x + y, instance.itervalues())
    path_embeddings = [
        get_path_embedding_from_cache(path_cache, builder, lemma_lookup,
                                      pos_lookup, dep_lookup, dir_lookup,
                                      path, update, dropout) * count
        for path, count in instance.iteritems()
    ]
    input_vec = dy.esum(path_embeddings) * (1.0 / num_paths)

    # Concatenate x and y embeddings
    if x_y_vectors is not None:
        x_vector = dy.lookup(lemma_lookup, x_y_vectors[0])
        y_vector = dy.lookup(lemma_lookup, x_y_vectors[1])
        input_vec = dy.concatenate([x_vector, input_vec, y_vector])

    h = W1 * input_vec + b1
    if num_hidden_layers == 1:
        h = W2 * dy.tanh(h) + b2
    output = dy.softmax(h)
    return output
def __call__(self, x, tm1s=None, test=False):
    if test:
        # Initial states
        s_tm1 = tm1s[0]
        c_tm1 = tm1s[1]
        w_tm1 = x

        # GRU
        s_t = self.GRUBuilder.initial_state().set_s([s_tm1]).add_input(
            dy.concatenate([w_tm1, c_tm1])).output()

        # Attention
        e_t = dy.pick(
            self.va * dy.tanh(dy.colwise_add(self.Ua * self.hp, self.Wa * s_tm1)), 0)
        a_t = dy.softmax(e_t)
        c_t = dy.esum([
            dy.cmult(a_t_i, h_i)
            for a_t_i, h_i in zip(a_t, dy.transpose(self.hp))
        ])
        # c_t = self.hp * a_t  # memory error?

        # Output
        r_t = dy.concatenate_cols([
            Wr_j * w_tm1 + Ur_j * c_t + Vr_j * s_t
            for Wr_j, Ur_j, Vr_j in zip(self.Wr, self.Ur, self.Vr)
        ])

        # Maxout
        m_t = dy.max_dim(r_t, d=1)

        y_t = dy.softmax(self.Wo * m_t)
        return s_t, c_t, y_t
    else:
        w_embs = x

        # Initial states
        s_tm1 = self.s_0
        c_tm1 = self.c_0
        GRU = self.GRUBuilder.initial_state().set_s([s_tm1])

        y = []
        for w_tm1 in w_embs:
            # GRU
            GRU = GRU.add_input(dy.concatenate([w_tm1, c_tm1]))
            s_t = GRU.output()

            # Attention
            e_t = dy.pick(
                self.va * dy.tanh(
                    dy.colwise_add(self.Ua * self.hp, self.Wa * s_tm1)), 0)
            a_t = dy.softmax(e_t)
            c_t = dy.esum([
                dy.cmult(a_t_i, h_i)
                for a_t_i, h_i in zip(a_t, dy.transpose(self.hp))
            ])
            # c_t = self.hp * a_t  # memory error?

            # Output
            r_t = dy.concatenate_cols([
                Wr_j * w_tm1 + Ur_j * c_t + Vr_j * s_t
                for Wr_j, Ur_j, Vr_j in zip(self.Wr, self.Ur, self.Vr)
            ])

            # Maxout
            m_t = dy.max_dim(r_t, d=1)

            y_t = self.Wo * m_t
            y.append(y_t)

            # t -> tm1
            s_tm1 = s_t
            c_tm1 = c_t
        return y
def main():
    parser = argparse.ArgumentParser(
        description='Selective Encoding for Abstractive Sentence Summarization in DyNet')
    parser.add_argument('--gpu', type=str, default='0',
                        help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument('--n_epochs', type=int, default=3,
                        help='Number of epochs [default: 3]')
    parser.add_argument('--n_train', type=int, default=3803957,
                        help='Number of training data (up to 3803957 in gigaword) [default: 3803957]')
    parser.add_argument('--n_valid', type=int, default=189651,
                        help='Number of validation data (up to 189651 in gigaword) [default: 189651]')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Mini batch size [default: 32]')
    parser.add_argument('--vocab_size', type=int, default=124404,
                        help='Vocabulary size [default: 124404]')
    parser.add_argument('--emb_dim', type=int, default=256,
                        help='Embedding size [default: 256]')
    parser.add_argument('--hid_dim', type=int, default=256,
                        help='Hidden state size [default: 256]')
    parser.add_argument('--maxout_dim', type=int, default=2,
                        help='Maxout size [default: 2]')
    parser.add_argument('--alloc_mem', type=int, default=10000,
                        help='Amount of memory to allocate [mb] [default: 10000]')
    args = parser.parse_args()
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    N_EPOCHS = args.n_epochs
    N_TRAIN = args.n_train
    N_VALID = args.n_valid
    BATCH_SIZE = args.batch_size
    VOCAB_SIZE = args.vocab_size
    EMB_DIM = args.emb_dim
    HID_DIM = args.hid_dim
    MAXOUT_DIM = args.maxout_dim
    ALLOC_MEM = args.alloc_mem

    # File paths
    TRAIN_X_FILE = './data/train.article.txt'
    TRAIN_Y_FILE = './data/train.title.txt'
    VALID_X_FILE = './data/valid.article.filter.txt'
    VALID_Y_FILE = './data/valid.title.filter.txt'

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_autobatch(True)
    dyparams.set_random_seed(RANDOM_SEED)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Build dataset
    dataset = Dataset(
        TRAIN_X_FILE,
        TRAIN_Y_FILE,
        VALID_X_FILE,
        VALID_Y_FILE,
        vocab_size=VOCAB_SIZE,
        batch_size=BATCH_SIZE,
        n_train=N_TRAIN,
        n_valid=N_VALID
    )
    VOCAB_SIZE = len(dataset.w2i)
    print('VOCAB_SIZE', VOCAB_SIZE)

    # Build model
    model = dy.Model()
    trainer = dy.AdamTrainer(model)

    V = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))
    encoder = SelectiveBiGRU(model, EMB_DIM, HID_DIM)
    decoder = AttentionalGRU(model, EMB_DIM, HID_DIM, MAXOUT_DIM, VOCAB_SIZE)

    # Train model
    start_time = time.time()
    for epoch in range(N_EPOCHS):
        # Train
        loss_all_train = []
        dataset.reset_train_iter()
        for train_x_mb, train_y_mb in tqdm(dataset.train_iter):
            # Create a new computation graph
            dy.renew_cg()
            associate_parameters([encoder, decoder])

            losses = []
            for x, t in zip(train_x_mb, train_y_mb):
                t_in, t_out = t[:-1], t[1:]

                # Encoder
                x_embs = [dy.lookup(V, x_t) for x_t in x]
                hp, hb_1 = encoder(x_embs)

                # Decoder
                decoder.set_initial_states(hp, hb_1)
                t_embs = [dy.lookup(V, t_t) for t_t in t_in]
                y = decoder(t_embs)

                # Loss
                loss = dy.esum(
                    [dy.pickneglogsoftmax(y_t, t_t) for y_t, t_t in zip(y, t_out)]
                )
                losses.append(loss)

            mb_loss = dy.average(losses)

            # Forward prop
            loss_all_train.append(mb_loss.value())

            # Backward prop
            mb_loss.backward()
            trainer.update()

        # Valid
        loss_all_valid = []
        dataset.reset_valid_iter()
        for valid_x_mb, valid_y_mb in dataset.valid_iter:
            # Create a new computation graph
            dy.renew_cg()
            associate_parameters([encoder, decoder])

            losses = []
            for x, t in zip(valid_x_mb, valid_y_mb):
                t_in, t_out = t[:-1], t[1:]

                # Encoder
                x_embs = [dy.lookup(V, x_t) for x_t in x]
                hp, hb_1 = encoder(x_embs)

                # Decoder
                decoder.set_initial_states(hp, hb_1)
                t_embs = [dy.lookup(V, t_t) for t_t in t_in]
                y = decoder(t_embs)

                # Loss
                loss = dy.esum(
                    [dy.pickneglogsoftmax(y_t, t_t) for y_t, t_t in zip(y, t_out)]
                )
                losses.append(loss)

            mb_loss = dy.average(losses)

            # Forward prop
            loss_all_valid.append(mb_loss.value())

        print('EPOCH: %d, Train Loss: %.3f, Valid Loss: %.3f, Time: %.3f[s]' % (
            epoch+1,
            np.mean(loss_all_train),
            np.mean(loss_all_valid),
            time.time()-start_time
        ))

        # Save model
        dy.save('./model_e'+str(epoch+1), [V, encoder, decoder])
        with open('./w2i.dump', 'wb') as f_w2i, open('./i2w.dump', 'wb') as f_i2w:
            pickle.dump(dataset.w2i, f_w2i)
            pickle.dump(dataset.i2w, f_i2w)
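# dy.average(losses) used above equals dy.esum(losses) / len(losses); with
# autobatching enabled, the per-example losses are batched lazily when
# mb_loss.value() or mb_loss.backward() runs. A quick equivalence check on
# made-up scalars:
import dynet as dy

dy.renew_cg()
ls = [dy.scalarInput(v) for v in (1.0, 2.0, 4.0)]
assert abs(dy.average(ls).value() - (dy.esum(ls) / len(ls)).value()) < 1e-9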
def train_batched(self, tasks, batch_size, scale_gradient_factor,
                  validation_data, seqs_trg, early_stopping, patience,
                  num_epochs, min_num_epochs, num_updates, prob_main_task,
                  prob_adv):
    trainer = dn.SimpleSGDTrainer(self.model)
    # stores best observed validation accuracy
    val_best = 0
    # stores the number of iterations without improvement
    no_improvement = 0
    val_prev = 0

    for epoch in range(num_epochs):
        sum_losses = 0
        adversarial_loss = 0
        losses_prediction_task = []
        losses_aux_task = []

        batch_dict = self.generate_batches_across_tasks(tasks, batch_size)
        # number of updates is twice the length of the main task batch list
        num_updates = len(batch_dict[self.prediction_layer]) * 2
        print(num_updates)
        # logging.info('Number of updates to do: {}'.format(num_updates))

        # sample batches according to some schema
        update_counter = 0
        while update_counter <= num_updates:
            update_counter += 1
            # with prob 1 - prob_adv, do a task update
            outcome = np.random.binomial(1, prob_adv, size=None)
            if outcome == 0:
                task_id, batch_ids = self.sample_task_batch(
                    batch_dict, prob_main_task=prob_main_task)
                losses = []
                dn.renew_cg()
                # iterate through the batch
                for i in batch_ids:
                    seq = tasks[task_id].train_seqs[i]
                    label = tasks[task_id].train_labels[i]
                    loss = self.compute_loss_multilabel(task_id, seq, label)
                    losses.append(loss)
                batch_loss = dn.esum(losses) / len(batch_ids)
                batch_loss_value = batch_loss.value()
                batch_loss.backward()
                trainer.update()
                sum_losses += batch_loss_value
                if task_id == self.prediction_layer:
                    losses_prediction_task.append(batch_loss_value)
                else:
                    losses_aux_task.append(batch_loss_value)
            else:
                # do adversarial step
                losses = []
                dn.renew_cg()
                seqs, labels = self.generate_adversarial_batch(
                    seqs_src=tasks[self.src_domain].train_seqs,
                    seqs_trg=seqs_trg,
                    batch_size=batch_size)
                for i in range(len(seqs)):
                    seq = seqs[i]
                    label = labels[i]
                    loss = self.compute_loss_multilabel(
                        task='adversarial', seq=seq, multi_y=label)
                    losses.append(loss)
                batch_loss = dn.esum(losses) / len(seqs)
                batch_loss_value = batch_loss.value()
                batch_loss.backward()
                trainer.update()
                adversarial_loss += batch_loss_value

        # compute the validation accuracy to monitor early stopping;
        # use the micro-averaged f-score as criterion
        res = evaluate_model_predictions(
            self.predict(self.main_task, validation_data['seq']),
            validation_data['label'], validation_data['labelset'])
        f_avg = res['f_avg']

        logging.info(
            'Epoch {}. Sum loss: {}. Avg loss: {}. Avg loss predtask {}. Avg loss aux tasks: {}. No improv: {}. Best f_val: {}. Avg f_val: {}'
            .format(epoch, sum_losses, sum_losses / num_updates,
                    np.mean(losses_prediction_task), np.mean(losses_aux_task),
                    no_improvement, val_best, f_avg))
        logging.info(
            'Epoch {}. Adv loss: {}. Avg loss: {}. Avg loss predtask {}. Avg loss aux tasks: {}. No improv: {}. Best f_val: {}. Avg f_val: {}'
            .format(epoch, adversarial_loss, sum_losses / num_updates,
                    np.mean(losses_prediction_task), np.mean(losses_aux_task),
                    no_improvement, val_best, f_avg))

        # init early stopping after min number of epochs
        if epoch == min_num_epochs - 1:
            val_prev = f_avg
            no_improvement = 0
            self.save(self.exp_path)

        if f_avg <= val_prev:
            no_improvement += 1
            if early_stopping:
                if no_improvement >= patience and epoch > min_num_epochs:
                    break
        else:
            if epoch >= min_num_epochs:
                self.save(self.exp_path)
            no_improvement = 0
            if f_avg >= val_best:
                val_best = f_avg
            val_prev = f_avg

    return epoch, f_avg, sum_losses, no_improvement, val_best
def do_one_sentence(encoder, decoder, params_encoder, params_decoder,
                    sentence, output, env, first, previous):
    pos_lookup = params_encoder["pos_lookup"]
    char_lookup = params_encoder["char_lookup"]
    char_v = params_decoder["attention_v"]
    char_w1 = params_decoder["attention_wc"]
    char_w2 = params_decoder["attention_bc"]

    # encode each world state with the character encoder
    sc_vector = []
    for i, world in enumerate(_state(env)):
        sc = char_encoder.initial_state()
        for char in world:
            sc = sc.add_input(char_lookup[char2int[char]])
        sc_vector.append(dy.concatenate([sc.output(), pos_lookup[i]]))
    dy_sc_vector = dy.concatenate(sc_vector, d=1)

    s = encoder.initial_state()
    lookup = params_encoder["lookup"]
    attention_w = params_decoder["attention_w"]
    attention_b = params_decoder["attention_b"]

    sentence = sentence + ' <end>'
    sentence = [
        vocab.index(c) if c in vocab else vocab.index('<unknown>')
        for c in sentence.split(' ')
    ]

    loss = []
    generate = []
    s_vector = []
    for word in sentence:
        s = s.add_input(lookup[word])
        s_vector.append(dy.softmax(attention_w * s.output() + attention_b))
    encode_output = s.output()
    dy_s_vector = dy.concatenate(s_vector, d=1)

    _s = decoder.initial_state(s.s())
    R = params_decoder["R"]
    bias = params_decoder["bias"]
    index = 1
    input_word = "<start>"
    _lookup = params_decoder["lookup"]

    while True:
        dy_env = dy.inputTensor(get_state_embed3(env))
        word = vocab_out.index(input_word)
        gt_y = vocab_out.index(output[index])

        # attention over the encoded sentence and the encoded world states
        weight = dy.softmax(
            dy.concatenate([dy.dot_product(x, _s.output()) for x in s_vector]))
        weight_char = dy.softmax(
            dy.concatenate([
                char_v * dy.tanh(char_w1 * x + char_w2 * _s.output())
                for x in sc_vector
            ]))
        encode_output = dy_s_vector * weight
        encode_state = dy_sc_vector * weight_char

        _s = _s.add_input(
            dy.concatenate([_lookup[word], encode_output, encode_state]))
        probs = dy.softmax(R * _s.output() + bias)
        prediction = np.argsort(probs.npvalue())[-1]
        if vocab_out[prediction] == '<start>':
            prediction = np.argsort(probs.npvalue())[-2]
        generate.append(vocab_out[prediction])
        loss.append(-dy.log(dy.pick(probs, gt_y)))

        if output[index] == '<end>':
            break
        index += 1
        input_word = vocab_out[prediction]
        if input_word == '<end>':
            continue
        env = str(execute(env, [input_word]))
        if env == 'None':
            env = '1:_ 2:_ 3:_ 4:_ 5:_ 6:_ 7:_'

    loss = dy.esum(loss)
    while '<start>' in generate:
        generate.remove('<start>')
    previous = s.output()
    return loss, generate, previous
def fit(self, train_dict, num_epochs, val_X=None, val_Y=None, patience=2,
        model_path=None, seed=None, word_dropout_rate=0.25, trg_vectors=None,
        unsup_weight=1.0, clip_threshold=5.0, orthogonality_weight=0.0,
        adversarial=False, adversarial_weight=1.0, ignore_src_Ft=False):
    """
    Train the tagger.
    :param trg_vectors: the prediction targets used for the unsupervised loss
                        in temporal ensembling
    :param unsup_weight: weight for the unsupervised consistency loss used in
                         temporal ensembling
    :param adversarial: note: if we want to use adversarial training, we have
                        to call add_adversarial_loss before
    :param adversarial_weight: 1 by default (do not weigh adv loss)
    :param ignore_src_Ft: if asymm. tri-training 2nd stage, do not further
                          train Ft on 'src'
    :param train_dict: a dictionary mapping tasks ("F0", "F1", and "Ft") to a
                       dictionary {"X": list of examples, "Y": list of labels,
                       "domain": list of domain tags (0, 1) of the examples}
    Note: if a task 'src' is given, then a single model with three heads is
    trained where all data is given to all outputs.
    """
    print("read training data")

    widCount = Counter()
    train_data = []
    for task, task_dict in train_dict.items():  # task: e.g. "F0"
        for key in ["X", "Y", "domain"]:
            assert key in task_dict, "Error: %s is not available." % key
        examples, labels, domain_tags = (task_dict["X"], task_dict["Y"],
                                         task_dict["domain"])
        assert len(examples) == len(labels)
        if word_dropout_rate > 0.0:
            # keep track of the counts for word dropout
            for sentence, _ in examples:
                widCount.update([w for w in sentence])
        # train data is a list of 4-tuples: (example, label, task_id, domain_id)
        train_data += list(zip(examples, labels, [task] * len(labels), domain_tags))

    # if we use target vectors, keep track of the targets per sentence
    if trg_vectors is not None:
        trg_start_id = 0
        sentence_trg_vectors = []
        for example, _, _, _ in train_data:
            sentence_trg_vectors.append(
                trg_vectors[trg_start_id:trg_start_id + len(example[0]), :])
            trg_start_id += len(example[0])
        assert trg_start_id == len(trg_vectors), \
            'Error: Idx {} is not at {}.'.format(trg_start_id, len(trg_vectors))

    print('Starting training for {} epochs...'.format(num_epochs))
    best_val_acc, epochs_no_improvement = 0., 0

    if val_X is not None and val_Y is not None and model_path is not None:
        print('Using early stopping with patience of {}...'.format(patience))

    if seed:
        random.seed(seed)

    for cur_iter in range(num_epochs):
        bar = Bar('Training epoch {}/{}...'.format(cur_iter + 1, num_epochs),
                  max=len(train_data), flush=True)

        random_indices = np.arange(len(train_data))
        random.shuffle(random_indices)

        total_loss, total_tagged, total_constraint, total_adversarial = 0.0, 0.0, 0.0, 0.0
        total_orth_constr = 0  # count how many updates

        # log separate losses
        log_losses = {}
        log_total = {}
        for task_id in self.task_ids:
            log_losses[task_id] = 0.0
            log_total[task_id] = 0

        for i, idx in enumerate(random_indices):
            (word_indices, char_indices), y, task_id, domain_id = train_data[idx]

            if word_dropout_rate > 0.0:
                word_indices = [
                    self.w2i["_UNK"] if
                    (random.random() >
                     (widCount.get(w) / (word_dropout_rate + widCount.get(w))))
                    else w for w in word_indices
                ]

            output, constraint, adv = self.predict(
                word_indices,
                char_indices,
                task_id,
                train=True,
                orthogonality_weight=orthogonality_weight,
                domain_id=domain_id if adversarial else None)

            if task_id not in ['src', 'trg']:
                if len(y) == 1 and y[0] == 0:
                    # in temporal ensembling, we assign a dummy label of [0] for
                    # unlabeled sequences; we skip the supervised loss for these
                    loss = dynet.scalarInput(0)
                else:
                    loss = dynet.esum([
                        self.pick_neg_log(pred, gold)
                        for pred, gold in zip(output, y)
                    ])

                if trg_vectors is not None:
                    # the consistency loss in temporal ensembling is used for
                    # both supervised and unsupervised input
                    targets = sentence_trg_vectors[idx]
                    assert len(output) == len(targets)
                    other_loss = unsup_weight * dynet.average([
                        dynet.squared_distance(o, dynet.inputVector(t))
                        for o, t in zip(output, targets)
                    ])
                    loss += other_loss

                if orthogonality_weight != 0.0 and task_id != 'Ft':
                    # add the orthogonality constraint to the loss
                    total_constraint += constraint.value() * orthogonality_weight
                    total_orth_constr += 1
                    loss += constraint * orthogonality_weight

                if adversarial:
                    total_adversarial += adv.value() * adversarial_weight
                    loss += adv * adversarial_weight

                total_loss += loss.value()  # for output
                log_losses[task_id] += total_loss
                total_tagged += len(word_indices)
                log_total[task_id] += total_tagged

                loss.backward()
                self.trainer.update()
                bar.next()
            else:
                # bootstrap=False: the output contains a list of outputs,
                # one for each task
                assert trg_vectors is None, 'temporal ensembling not implemented for bootstrap=False'
                loss = dynet.scalarInput(1)  # initialize
                if ignore_src_Ft:
                    # ignore last head (= Ft) when further training with 'src'
                    output = output[:-1]

                for t_i, output_t in enumerate(output):  # get loss for each task
                    loss += dynet.esum([
                        self.pick_neg_log(pred, gold)
                        for pred, gold in zip(output_t, y)
                    ])
                    task_id = self.task_ids[t_i]
                    log_losses[task_id] += total_loss
                    log_total[task_id] += total_tagged

                if orthogonality_weight != 0.0:
                    # add the orthogonality constraint to the loss
                    total_constraint += constraint.value() * orthogonality_weight
                    total_orth_constr += 1
                    loss += constraint * orthogonality_weight

                if adversarial:
                    total_adversarial += adv.value() * adversarial_weight
                    loss += adv * adversarial_weight

                total_loss += loss.value()  # for output
                total_tagged += len(word_indices)

                loss.backward()
                self.trainer.update()
                bar.next()

        if adversarial and orthogonality_weight:
            print(
                "iter {}. Total loss: {:.3f}, total penalty: {:.3f}, total weighted adv loss: {:.3f}"
                .format(cur_iter, total_loss / total_tagged,
                        total_constraint / total_orth_constr,
                        total_adversarial / total_tagged),
                file=sys.stderr)
        elif orthogonality_weight:
            print("iter {}. Total loss: {:.3f}, total penalty: {:.3f}".format(
                cur_iter, total_loss / total_tagged,
                total_constraint / total_orth_constr),
                  file=sys.stderr)
        else:
            print("iter {}. Total loss: {:.3f}".format(
                cur_iter, total_loss / total_tagged),
                  file=sys.stderr)

        for task_id in self.task_ids:
            if log_total[task_id] > 0:
                print("{0}: {1:.3f}".format(
                    task_id, log_losses[task_id] / log_total[task_id]))

        if val_X is not None and val_Y is not None and model_path is not None:
            # get the best accuracy on the validation set
            val_correct, val_total = self.evaluate(val_X, val_Y)
            val_accuracy = val_correct / val_total

            if val_accuracy > best_val_acc:
                print('Accuracy {:.4f} is better than best val accuracy {:.4f}.'
                      .format(val_accuracy, best_val_acc))
                best_val_acc = val_accuracy
                epochs_no_improvement = 0
                save_tagger(self, model_path)
            else:
                print('Accuracy {:.4f} is worse than best val accuracy {:.4f}.'
                      .format(val_accuracy, best_val_acc))
                epochs_no_improvement += 1
            if epochs_no_improvement == patience:
                print('No improvement for {} epochs. Early stopping...'
                      .format(epochs_no_improvement))
                break
def process_one(self, i, j, tree, mode):
    def process_one_instance(instance, update=True, x_y_vectors=None,
                             features=None, mode='train'):
        lemma_lookup = self.model_parameters['lemma_lookup']
        if self.opt['use_path']:
            pos_lookup = self.model_parameters['pos_lookup']
            dep_lookup = self.model_parameters['dep_lookup']
            dir_lookup = self.model_parameters['dir_lookup']

            # Add the empty path
            paths = instance
            if len(paths) == 0:
                paths[EMPTY_PATH] = 1

            # Compute the averaged path
            num_paths = reduce(lambda x, y: x + y, instance.itervalues())
            path_embeddings = [
                self.get_path_embedding_from_cache(
                    lemma_lookup, pos_lookup, dep_lookup, dir_lookup,
                    path, update, mode) * count
                for path, count in instance.iteritems()
            ]
            input_vec = dy.esum(path_embeddings) * (1.0 / num_paths)

        # Concatenate x and y embeddings
        if self.opt['use_xy_embeddings']:
            x_vector = dy.lookup(lemma_lookup, x_y_vectors[0])
            y_vector = dy.lookup(lemma_lookup, x_y_vectors[1])
            if self.opt['use_path']:
                input_vec = dy.concatenate([x_vector, input_vec, y_vector])
            else:
                input_vec = dy.concatenate([x_vector, y_vector])

        if self.opt['use_features']:
            for k in feat_dims:
                if 'diff' in k and not self.opt['use_freq_features']:
                    continue
                feat = dy.lookup(self.model_parameters[k], features[k])
                input_vec = dy.concatenate([input_vec, feat])

        if self.opt['use_height_ebd']:
            h = tree.get_height(j) - 1 if j in tree.term_height else 0
            height_vector = dy.lookup(self.model_parameters['height_lookup'], h)
            input_vec = dy.concatenate([input_vec, height_vector])

        return input_vec

    if (i, j) not in self.f_cache:
        data = self.get_data(i, j)
        f = process_one_instance(instance=data[0],
                                 update=self.opt['update_word_ebd'],
                                 x_y_vectors=data[1],
                                 features=data[2],
                                 mode=mode)
        self.f_cache[(i, j)] = f

    if not self.opt['use_sibling']:
        # return dy.concatenate([self.f_cache[(i, j)], self.history[0].output()])
        return self.f_cache[(i, j)]
    else:
        sib = [self.f_cache[(sibling, j)] for sibling in tree.get_children(j)]
        if len(sib) == 0:
            return self.f_cache[(i, j)]
        return self.f_cache[(i, j)] + dy.esum(sib) / len(sib)
def calc_aggregate(self, v1_i, v2_j, dropout):
    v1 = dy.esum(v1_i)
    v2 = dy.esum(v2_j)
    ret = self.aggregate.evaluate_network(dy.concatenate([v1, v2]), False, dropout)
    return ret
def fit(self, train_X, train_Y, num_epochs, val_X=None, val_Y=None, patience=2,
        model_path=None, seed=None, word_dropout_rate=0.25, trg_vectors=None,
        unsup_weight=1.0, variance_weights=None, labeled_weight_proportion=1.0):
    """
    Train the tagger.
    :param trg_vectors: the prediction targets used for the unsupervised loss
                        in temporal ensembling
    :param unsup_weight: weight for the unsupervised consistency loss used in
                         temporal ensembling
    :param variance_weights: per-token weights (normalized mean variance) for
                             the consistency loss
    :param labeled_weight_proportion: proportion of the unsupervised weight
                                      that should be assigned to labeled examples
    """
    print("read training data", file=sys.stderr)

    if variance_weights is not None:
        print('First 20 variance weights:', variance_weights[:20])

    if seed:
        print(">>> using seed: ", seed, file=sys.stderr)
        random.seed(seed)  # setting random seed

    # if we use word dropout, keep track of counts
    if word_dropout_rate > 0.0:
        widCount = Counter()
        for sentence, _ in train_X:
            widCount.update([w for w in sentence])

    assert len(train_X) == len(train_Y)
    train_data = list(zip(train_X, train_Y))

    # if we use target vectors, keep track of the targets per sentence
    if trg_vectors is not None:
        trg_start_id = 0
        sentence_trg_vectors = []
        sentence_var_weights = []
        for i, (example, y) in enumerate(train_data):
            sentence_trg_vectors.append(
                trg_vectors[trg_start_id:trg_start_id + len(example[0]), :])
            if variance_weights is not None:
                sentence_var_weights.append(
                    variance_weights[trg_start_id:trg_start_id + len(example[0])])
            trg_start_id += len(example[0])
        assert trg_start_id == len(trg_vectors), \
            'Error: Idx {} is not at {}.'.format(trg_start_id, len(trg_vectors))
        assert len(sentence_trg_vectors) == len(train_X)
        if variance_weights is not None:
            assert trg_start_id == len(variance_weights)
            assert len(sentence_var_weights) == len(train_X)

    print('Starting training for {} epochs...'.format(num_epochs))
    best_val_acc, epochs_no_improvement = 0., 0

    if val_X is not None and val_Y is not None and model_path is not None:
        print('Using early stopping with patience of {}...'.format(patience))

    for cur_iter in range(num_epochs):
        bar = Bar('Training epoch {}/{}...'.format(cur_iter + 1, num_epochs),
                  max=len(train_data), flush=True)

        total_loss = 0.0
        total_tagged = 0.0
        total_other_loss, total_other_loss_weighted = 0.0, 0.0

        random_indices = np.arange(len(train_data))
        random.shuffle(random_indices)

        for i, idx in enumerate(random_indices):
            (word_indices, char_indices), y = train_data[idx]

            if word_dropout_rate > 0.0:
                word_indices = [
                    self.w2i["_UNK"] if
                    (random.random() >
                     (widCount.get(w) / (word_dropout_rate + widCount.get(w))))
                    else w for w in word_indices
                ]

            output = self.predict(word_indices, char_indices, train=True)

            if len(y) == 1 and y[0] == 0:
                # in temporal ensembling, we assign a dummy label of [0] for
                # unlabeled sequences; we skip the supervised loss for these
                loss = dynet.scalarInput(0)
            else:
                loss = dynet.esum([
                    self.pick_neg_log(pred, gold)
                    for pred, gold in zip(output, y)
                ])

            if trg_vectors is not None:
                # the consistency loss in temporal ensembling is used for
                # both supervised and unsupervised input
                targets = sentence_trg_vectors[idx]
                assert len(output) == len(targets)

                if variance_weights is not None:
                    var_weights = sentence_var_weights[idx]
                    assert len(output) == len(var_weights)
                    # multiply the normalized mean variance with each loss
                    other_loss = dynet.esum([
                        v * dynet.squared_distance(o, dynet.inputVector(t))
                        for o, t, v in zip(output, targets, var_weights)
                    ])
                else:
                    other_loss = dynet.esum([
                        dynet.squared_distance(o, dynet.inputVector(t))
                        for o, t in zip(output, targets)
                    ])
                total_other_loss += other_loss.value()

                if len(y) == 1 and y[0] == 0:  # unlabeled example
                    other_loss += other_loss * unsup_weight
                else:  # labeled example
                    # assign the unsupervised weight for labeled examples
                    other_loss += other_loss * unsup_weight * labeled_weight_proportion

                # keep track for logging
                total_loss += loss.value()  # main loss
                total_tagged += len(word_indices)
                total_other_loss_weighted += other_loss.value()

                # combine losses
                loss += other_loss
            else:
                # keep track for logging
                total_loss += loss.value()
                total_tagged += len(word_indices)

            loss.backward()
            self.trainer.update()
            bar.next()

        if trg_vectors is None:
            print("iter {2} {0:>12}: {1:.2f}".format(
                "total loss", total_loss / total_tagged, cur_iter),
                  file=sys.stderr)
        else:
            print(
                "iter {2} {0:>12}: {1:.2f} unsupervised loss: {3:.2f} (weighted: {4:.2f})"
                .format("supervised loss", total_loss / total_tagged, cur_iter,
                        total_other_loss / total_tagged,
                        total_other_loss_weighted / total_tagged),
                file=sys.stderr)

        if val_X is not None and val_Y is not None and model_path is not None:
            # get the best accuracy on the validation set
            val_correct, val_total = self.evaluate(val_X, val_Y)
            val_accuracy = val_correct / val_total

            if val_accuracy > best_val_acc:
                print('Accuracy {:.4f} is better than best val accuracy {:.4f}.'
                      .format(val_accuracy, best_val_acc))
                best_val_acc = val_accuracy
                epochs_no_improvement = 0
                save_tagger(self, model_path)
            else:
                print('Accuracy {:.4f} is worse than best val accuracy {:.4f}.'
                      .format(val_accuracy, best_val_acc))
                epochs_no_improvement += 1
            if epochs_no_improvement == patience:
                print('No improvement for {} epochs. Early stopping...'
                      .format(epochs_no_improvement))
                break
def main():
    parser = argparse.ArgumentParser(
        description='A Neural Attention Model for Abstractive Sentence Summarization in DyNet')
    parser.add_argument('--gpu', type=str, default='0',
                        help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument('--n_epochs', type=int, default=10,
                        help='Number of epochs [default: 10]')
    parser.add_argument('--n_train', type=int, default=3803957,
                        help='Number of training data (up to 3803957 in gigaword) [default: 3803957]')
    parser.add_argument('--n_valid', type=int, default=189651,
                        help='Number of validation data (up to 189651 in gigaword) [default: 189651]')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Mini batch size [default: 32]')
    parser.add_argument('--vocab_size', type=int, default=60000,
                        help='Vocabulary size [default: 60000]')
    parser.add_argument('--emb_dim', type=int, default=256,
                        help='Embedding size [default: 256]')
    parser.add_argument('--hid_dim', type=int, default=256,
                        help='Hidden state size [default: 256]')
    parser.add_argument('--encoder_type', type=str, default='attention',
                        help='Encoder type. bow: Bag-of-words encoder. attention: Attention-based encoder [default: attention]')
    parser.add_argument('--c', type=int, default=5,
                        help='Window size in neural language model [default: 5]')
    parser.add_argument('--q', type=int, default=2,
                        help='Window size in attention-based encoder [default: 2]')
    parser.add_argument('--alloc_mem', type=int, default=4096,
                        help='Amount of memory to allocate [mb] [default: 4096]')
    args = parser.parse_args()
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    N_EPOCHS = args.n_epochs
    N_TRAIN = args.n_train
    N_VALID = args.n_valid
    BATCH_SIZE = args.batch_size
    VOCAB_SIZE = args.vocab_size
    EMB_DIM = args.emb_dim
    HID_DIM = args.hid_dim
    ENCODER_TYPE = args.encoder_type
    C = args.c
    Q = args.q
    ALLOC_MEM = args.alloc_mem

    # File paths
    TRAIN_X_FILE = './data/train.article.txt'
    TRAIN_Y_FILE = './data/train.title.txt'
    VALID_X_FILE = './data/valid.article.filter.txt'
    VALID_Y_FILE = './data/valid.title.filter.txt'

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_autobatch(True)
    dyparams.set_random_seed(RANDOM_STATE)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Build dataset ====================================================================================
    w2c = build_word2count(TRAIN_X_FILE, n_data=N_TRAIN)
    w2c = build_word2count(TRAIN_Y_FILE, w2c=w2c, n_data=N_TRAIN)

    train_X, w2i, i2w = build_dataset(TRAIN_X_FILE, w2c=w2c, padid=False,
                                      eos=True, unksym='<unk>', target=False,
                                      n_data=N_TRAIN, vocab_size=VOCAB_SIZE)
    train_y, _, _ = build_dataset(TRAIN_Y_FILE, w2i=w2i, target=True, n_data=N_TRAIN)

    valid_X, _, _ = build_dataset(VALID_X_FILE, w2i=w2i, target=False, n_data=N_VALID)
    valid_y, _, _ = build_dataset(VALID_Y_FILE, w2i=w2i, target=True, n_data=N_VALID)

    VOCAB_SIZE = len(w2i)
    OUT_DIM = VOCAB_SIZE
    print('VOCAB_SIZE:', VOCAB_SIZE)

    # Build model ======================================================================================
    model = dy.Model()
    trainer = dy.AdamTrainer(model)

    rush_abs = ABS(model, EMB_DIM, HID_DIM, VOCAB_SIZE, Q, C,
                   encoder_type=ENCODER_TYPE)

    # Padding
    train_y = [[w2i['<s>']]*(C-1) + instance_y for instance_y in train_y]
    valid_y = [[w2i['<s>']]*(C-1) + instance_y for instance_y in valid_y]

    n_batches_train = math.ceil(len(train_X)/BATCH_SIZE)
    n_batches_valid = math.ceil(len(valid_X)/BATCH_SIZE)
    start_time = time.time()

    for epoch in range(N_EPOCHS):
        # Train
        train_X, train_y = shuffle(train_X, train_y)
        loss_all_train = []
        for i in tqdm(range(n_batches_train)):
            # Create a new computation graph
            dy.renew_cg()
            rush_abs.associate_parameters()

            # Create a mini batch
            start = i*BATCH_SIZE
            end = start + BATCH_SIZE
            train_X_mb = train_X[start:end]
            train_y_mb = train_y[start:end]

            losses = []
            for x, t in zip(train_X_mb, train_y_mb):
                t_in, t_out = t[:-1], t[C:]

                y = rush_abs(x, t_in)
                loss = dy.esum([dy.pickneglogsoftmax(y_t, t_t)
                                for y_t, t_t in zip(y, t_out)])
                losses.append(loss)

            mb_loss = dy.average(losses)

            # Forward prop
            loss_all_train.append(mb_loss.value())

            # Backward prop
            mb_loss.backward()
            trainer.update()

        # Valid
        loss_all_valid = []
        for i in range(n_batches_valid):
            # Create a new computation graph
            dy.renew_cg()
            rush_abs.associate_parameters()

            # Create a mini batch
            start = i*BATCH_SIZE
            end = start + BATCH_SIZE
            valid_X_mb = valid_X[start:end]
            valid_y_mb = valid_y[start:end]

            losses = []
            for x, t in zip(valid_X_mb, valid_y_mb):
                t_in, t_out = t[:-1], t[C:]

                y = rush_abs(x, t_in)
                loss = dy.esum([dy.pickneglogsoftmax(y_t, t_t)
                                for y_t, t_t in zip(y, t_out)])
                losses.append(loss)

            mb_loss = dy.average(losses)

            # Forward prop
            loss_all_valid.append(mb_loss.value())

        print('EPOCH: %d, Train Loss: %.3f, Valid Loss: %.3f' % (
            epoch+1,
            np.mean(loss_all_train),
            np.mean(loss_all_valid)
        ))

        # Save model ========================================================================
        dy.save('./model_e'+str(epoch+1), [rush_abs])
        with open('./w2i.dump', 'wb') as f_w2i, open('./i2w.dump', 'wb') as f_i2w:
            pickle.dump(w2i, f_w2i)
            pickle.dump(i2w, f_i2w)
def fit(self, train, num_iterations, dev=None, model_path=None, patience=0,
        minibatch_size=0, log_losses=False):
    """
    train the tagger
    """
    losses_log = {}  # log losses

    print("init parameters")
    self.init_parameters(train)

    # init lookup parameters and define graph
    print("build graph")
    self.build_computation_graph(len(self.w2i), len(self.c2i))

    update_embeds = True
    if self.backprob_embeds == False:  # disable backprop into embeds
        print(">>> disable wembeds update <<<")
        update_embeds = False

    best_val_acc, epochs_no_improvement = 0.0, 0

    if dev and model_path is not None and patience > 0:
        print('Using early stopping with patience of {}...'.format(patience))

    batch = []
    print("train..")
    for iteration in range(num_iterations):

        total_loss = 0.0
        total_tagged = 0.0

        indices = [i for i in range(len(train.seqs))]
        random.shuffle(indices)

        loss_accum_loss = defaultdict(float)
        loss_accum_tagged = defaultdict(float)

        for idx in indices:
            seq = train.seqs[idx]

            if seq.task_id not in losses_log:
                losses_log[seq.task_id] = []  # initialize

            if minibatch_size > 1:
                # accumulate instances for minibatch update
                loss1 = self.predict(seq, train=True, update_embeds=update_embeds)
                total_tagged += len(seq.words)
                batch.append(loss1)
                if len(batch) == minibatch_size:
                    loss = dynet.esum(batch)
                    total_loss += loss.value()

                    # logging
                    loss_accum_tagged[seq.task_id] += len(seq.words)
                    loss_accum_loss[seq.task_id] += loss.value()

                    loss.backward()
                    self.trainer.update()
                    # use a new computation graph for each BATCH when batching is active
                    dynet.renew_cg()
                    batch = []
            else:
                dynet.renew_cg()  # new graph per item
                loss1 = self.predict(seq, train=True, update_embeds=update_embeds)
                total_tagged += len(seq.words)
                lv = loss1.value()
                total_loss += lv

                # logging
                loss_accum_tagged[seq.task_id] += len(seq.words)
                loss_accum_loss[seq.task_id] += loss1.value()

                loss1.backward()
                self.trainer.update()

        print("iter {2} {0:>12}: {1:.2f}".format("total loss",
                                                 total_loss / total_tagged,
                                                 iteration))

        # log losses
        for task_id in sorted(losses_log):
            losses_log[task_id].append(
                loss_accum_loss[task_id] / loss_accum_tagged[task_id])

        if log_losses:
            dill.dump(losses_log,
                      open(model_path + ".model" + ".losses.pickle", "wb"))

        if dev:
            # evaluate after every epoch
            correct, total = self.evaluate(dev, "task0")
            val_accuracy = correct / total
            print("dev accuracy: {0:.4f}".format(val_accuracy))

            if val_accuracy > best_val_acc:
                print('Accuracy {0:.4f} is better than best val accuracy '
                      '{1:.4f}.'.format(val_accuracy, best_val_acc))
                best_val_acc = val_accuracy
                epochs_no_improvement = 0
                save(self, model_path)
            else:
                print('Accuracy {0:.4f} is worse than best val accuracy '
                      '{1:.4f}.'.format(val_accuracy, best_val_acc))
                epochs_no_improvement += 1

            if patience > 0:
                if epochs_no_improvement == patience:
                    print('No improvement for {} epochs. Early stopping...'
                          .format(epochs_no_improvement))
                    break
def train_network(self, train_data, epochs=3, dev_data=None, test_data=None):
    trainer = dy.SimpleSGDTrainer(self.pc, 0.05)
    i = 0
    mloss = 0.
    goods = 0.
    loss = []
    dy.renew_cg()
    max_dev_acc = MIN_SAVE_ACC

    run_id = randint(0, 9999)
    save_path = "{}{:04d}".format(SAVE_TO, run_id)
    report_path = "{}{:04d}.txt".format(SAVE_REPORT_TO, run_id)
    test_path = "{}{:04d}.txt".format(SAVE_TAGGED_TEST_TO, run_id)
    rprt = open(report_path, 'wt')
    print report_path

    for e in range(epochs):
        shuffle(train_data)
        for x, y in train_data:
            i = i + 1
            loss = loss + [self.eval_loss(x, y, dropout=True)]
            good = y == self.last_case_class
            goods += int(good)

            if i % UPDATE_EVERY == 0:
                losses = dy.esum(loss)
                mloss += losses.value()
                losses.backward()
                trainer.update()
                loss = []
                dy.renew_cg()

            if i % EVALUATE_LOSS_EVERY == 1000:
                # evaluate on the dev set
                goods_dev = 0.
                j = 0
                for d in dev_data or []:
                    dy.renew_cg()
                    j += 1
                    x, y = d
                    self.eval_loss(x, y)
                    goods_dev += 1 if y == self.last_case_class else 0
                dev_acc = goods_dev / len(dev_data or 'a')

                message = "{} average loss after {} iterations: {} acc: {}".format(
                    now_string(), i, mloss / EVALUATE_LOSS_EVERY,
                    goods / EVALUATE_LOSS_EVERY)
                dev_acc_str = " dev acc: {}".format(dev_acc) if dev_data else ""
                print(message + dev_acc_str)
                rprt.write(message + dev_acc_str + '\n')
                mloss = 0.
                goods = 0.

                if dev_acc > max_dev_acc and i > START_SAVE_AFTER:
                    max_dev_acc = dev_acc
                    print("saving.")
                    rprt.write("saving.\n")
                    self.save(save_path)
                    if test_data:
                        outf = open(test_path, 'wt')
                        k = 0
                        goods_test = 0.
                        print("tagging test data.")
                        for dd in test_data:
                            dy.renew_cg()
                            k += 1
                            x, y = dd
                            self.eval_loss(x, y)
                            y_hat = self.last_case_class
                            goods_test += 1 if y == y_hat else 0
                            outf.write("{}{}{}\n".format(x, y, y_hat))
                        outf.close()
                        test_acc = goods_test / len(test_data)
                        print("accuracy on test: {}".format(test_acc))
                rprt.flush()
def compute_decoder_batch_loss(self, encoded_inputs, input_masks,
                               output_word_ids, output_masks, batch_size):
    self.readout = dn.parameter(self.params['readout'])
    self.bias = dn.parameter(self.params['bias'])
    self.w_c = dn.parameter(self.params['w_c'])
    self.u_a = dn.parameter(self.params['u_a'])
    self.v_a = dn.parameter(self.params['v_a'])
    self.w_a = dn.parameter(self.params['w_a'])

    # initialize the decoder rnn
    s_0 = self.decoder_rnn.initial_state()

    # initial "input feeding" vectors to feed decoder - 3*h
    init_input_feeding = dn.lookup_batch(self.init_lookup, [0] * batch_size)

    # initial feedback embeddings for the decoder, use begin seq symbol embedding
    init_feedback = dn.lookup_batch(
        self.output_lookup, [self.y2int[common.BEGIN_SEQ]] * batch_size)

    # init decoder rnn
    decoder_init = dn.concatenate([init_feedback, init_input_feeding])
    s = s_0.add_input(decoder_init)

    # loss per timestep
    losses = []

    # run the decoder through the output sequences and aggregate loss
    for i, step_word_ids in enumerate(output_word_ids):

        # returns h x batch size matrix
        decoder_rnn_output = s.output()

        # compute attention context vector for each sequence in the batch
        # (returns 2h x batch size matrix)
        attention_output_vector, alphas = self.attend(
            encoded_inputs, decoder_rnn_output, input_masks)

        # compute output scores (returns vocab_size x batch size matrix)
        # h = readout * attention_output_vector + bias
        h = dn.affine_transform([self.bias, self.readout, attention_output_vector])

        # encourage diversity by punishing highly confident predictions
        # TODO: support batching - esp. w.r.t. scalar inputs
        if self.diverse:
            soft = dn.softmax(dn.tanh(h))
            batch_loss = dn.pick_batch(-dn.log(soft), step_word_ids) \
                - dn.log(dn.scalarInput(1) - dn.pick_batch(soft, step_word_ids)) \
                - dn.log(dn.scalarInput(4))
        else:
            # get batch loss for this timestep
            batch_loss = dn.pickneglogsoftmax_batch(h, step_word_ids)

        # mask the loss if at least one sentence is shorter
        if output_masks and output_masks[i][-1] != 1:
            mask_expr = dn.inputVector(output_masks[i])
            # noinspection PyArgumentList
            mask_expr = dn.reshape(mask_expr, (1,), batch_size)
            batch_loss = batch_loss * mask_expr

        # input feeding approach - input h (attention_output_vector) to the decoder
        # prepare for the next iteration - "feedback"
        feedback_embeddings = dn.lookup_batch(self.output_lookup, step_word_ids)
        decoder_input = dn.concatenate([feedback_embeddings, attention_output_vector])
        s = s.add_input(decoder_input)

        losses.append(batch_loss)

    # sum the loss over the time steps and batch
    total_batch_loss = dn.sum_batches(dn.esum(losses))
    return total_batch_loss
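# Sketch of the masking trick used above: a per-timestep 0/1 mask is reshaped
# to a (1,) x batch expression so elementwise multiplication zeroes the loss
# of already-finished sequences before summing over the batch. The dimensions
# and loss values below are illustrative only.
import dynet as dn

dn.renew_cg()
batch_size = 3
# per-sequence losses at one timestep (batched scalar expression)
step_loss = dn.inputTensor([2.0, 1.5, 0.7], batched=True)
mask = [1, 1, 0]   # third sequence already ended
mask_expr = dn.reshape(dn.inputVector(mask), (1,), batch_size)
masked = step_loss * mask_expr
total = dn.sum_batches(masked)   # only the two live sequences contribute
assert abs(total.value() - 3.5) < 1e-6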
def __call__(self, x=None, t=None, test=False):
    if test:
        tt_embs = [dy.lookup(self.E, t_t) for t_t in t]

        if self.encoder_type == 'bow':
            # Neural language model
            tt_c = dy.concatenate(tt_embs)
            h = dy.tanh(self.U * tt_c)

            # Output with softmax
            y_t = dy.softmax(self.V * h + self.W_enc)

        elif self.encoder_type == 'attention':
            ttp_embs = [dy.lookup(self.G, t_t) for t_t in t]

            # Neural language model
            tt_c = dy.concatenate(tt_embs)
            h = dy.tanh(self.U * tt_c)

            # Attention
            ttp_c = dy.concatenate(ttp_embs)
            p = dy.softmax(self.xt * self.P * ttp_c)  # Attention weight
            enc = self.xb * p  # Context vector

            # Output with softmax
            y_t = dy.softmax(self.V * h + self.W * enc)

        return y_t
    else:
        xt_embs = [dy.lookup(self.F, x_t) for x_t in x]
        tt_embs = [dy.lookup(self.E, t_t) for t_t in t]

        y = []
        if self.encoder_type == 'bow':
            # BoW
            enc = dy.average(xt_embs)
            W_enc = self.W * enc
            for i in range(len(t) - self.c + 1):
                # Neural language model
                tt_c = dy.concatenate(tt_embs[i:i + self.c])
                h = dy.tanh(self.U * tt_c)

                # Output without softmax
                y_t = self.V * h + W_enc
                y.append(y_t)

        elif self.encoder_type == 'attention':
            xb = dy.concatenate([
                dy.esum(xt_embs[max(i - self.q, 0):min(len(x), i + self.q + 1)]) / self.q
                for i in range(len(x))
            ], d=1)
            xt = dy.transpose(dy.concatenate(xt_embs, d=1))
            ttp_embs = [dy.lookup(self.G, t_t) for t_t in t]

            for i in range(len(t) - self.c + 1):
                # Neural language model
                tt_c = dy.concatenate(tt_embs[i:i + self.c])
                h = dy.tanh(self.U * tt_c)

                # Attention
                ttp_c = dy.concatenate(ttp_embs[i:i + self.c])  # Window-sized embedding
                p = dy.softmax(xt * self.P * ttp_c)  # Attention weight
                enc = xb * p  # Context vector

                # Output without softmax
                y_t = self.V * h + self.W * enc
                y.append(y_t)

        return y
def main():
    parser = argparse.ArgumentParser(
        description='Deep Recurrent Generative Decoder for Abstractive Text Summarization in DyNet')
    parser.add_argument('--gpu', type=str, default='0',
                        help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument('--n_epochs', type=int, default=3,
                        help='Number of epochs [default: 3]')
    parser.add_argument('--n_train', type=int, default=3803957,
                        help='Number of training examples (up to 3803957 in gigaword) [default: 3803957]')
    parser.add_argument('--n_valid', type=int, default=189651,
                        help='Number of validation examples (up to 189651 in gigaword) [default: 189651]')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Mini batch size [default: 32]')
    parser.add_argument('--emb_dim', type=int, default=256,
                        help='Embedding size [default: 256]')
    parser.add_argument('--hid_dim', type=int, default=256,
                        help='Hidden state size [default: 256]')
    parser.add_argument('--lat_dim', type=int, default=256,
                        help='Latent size [default: 256]')
    parser.add_argument('--alloc_mem', type=int, default=8192,
                        help='Amount of memory to allocate [mb] [default: 8192]')
    args = parser.parse_args()
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    N_EPOCHS = args.n_epochs
    N_TRAIN = args.n_train
    N_VALID = args.n_valid
    BATCH_SIZE = args.batch_size
    VOCAB_SIZE = 60000
    EMB_DIM = args.emb_dim
    HID_DIM = args.hid_dim
    LAT_DIM = args.lat_dim
    ALLOC_MEM = args.alloc_mem

    # File paths
    TRAIN_X_FILE = './data/train.article.txt'
    TRAIN_Y_FILE = './data/train.title.txt'
    VALID_X_FILE = './data/valid.article.filter.txt'
    VALID_Y_FILE = './data/valid.title.filter.txt'

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_autobatch(True)
    dyparams.set_random_seed(RANDOM_STATE)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Build dataset ====================================================================================
    w2c = build_word2count(TRAIN_X_FILE, n_data=N_TRAIN)
    w2c = build_word2count(TRAIN_Y_FILE, w2c=w2c, n_data=N_TRAIN)

    train_X, w2i, i2w = build_dataset(TRAIN_X_FILE, w2c=w2c, padid=False,
                                      eos=True, unksym='<unk>', target=False,
                                      n_data=N_TRAIN, vocab_size=VOCAB_SIZE)
    train_y, _, _ = build_dataset(TRAIN_Y_FILE, w2i=w2i, target=True, n_data=N_TRAIN)

    valid_X, _, _ = build_dataset(VALID_X_FILE, w2i=w2i, target=False, n_data=N_VALID)
    valid_y, _, _ = build_dataset(VALID_Y_FILE, w2i=w2i, target=True, n_data=N_VALID)

    VOCAB_SIZE = len(w2i)
    OUT_DIM = VOCAB_SIZE
    print(VOCAB_SIZE)

    # Build model ======================================================================================
    model = dy.Model()
    trainer = dy.AdamTrainer(model)

    V = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))
    encoder = BiGRU(model, EMB_DIM, 2 * HID_DIM)
    decoder = RecurrentGenerativeDecoder(model, EMB_DIM, 2 * HID_DIM,
                                         LAT_DIM, OUT_DIM)

    # Train model =======================================================================================
    n_batches_train = math.ceil(len(train_X) / BATCH_SIZE)
    n_batches_valid = math.ceil(len(valid_X) / BATCH_SIZE)
    start_time = time.time()

    for epoch in range(N_EPOCHS):
        # Train
        train_X, train_y = shuffle(train_X, train_y)
        loss_all_train = []
        for i in tqdm(range(n_batches_train)):
            # Create a new computation graph
            dy.renew_cg()
            encoder.associate_parameters()
            decoder.associate_parameters()

            # Create a mini batch
            start = i * BATCH_SIZE
            end = start + BATCH_SIZE
            train_X_mb = train_X[start:end]
            train_y_mb = train_y[start:end]

            losses = []
            for x, t in zip(train_X_mb, train_y_mb):
                t_in, t_out = t[:-1], t[1:]

                # Encoder
                x_embs = [dy.lookup(V, x_t) for x_t in x]
                he = encoder(x_embs)

                # Decoder
                t_embs = [dy.lookup(V, t_t) for t_t in t_in]
                decoder.set_initial_states(he)
                y, KL = decoder(t_embs)

                loss = dy.esum([
                    dy.pickneglogsoftmax(y_t, t_t) + KL_t
                    for y_t, t_t, KL_t in zip(y, t_out, KL)
                ])
                losses.append(loss)

            mb_loss = dy.average(losses)

            # Forward prop
            loss_all_train.append(mb_loss.value())

            # Backward prop
            mb_loss.backward()
            trainer.update()

        # Valid
        loss_all_valid = []
        for i in range(n_batches_valid):
            # Create a new computation graph
            dy.renew_cg()
            encoder.associate_parameters()
            decoder.associate_parameters()

            # Create a mini batch
            start = i * BATCH_SIZE
            end = start + BATCH_SIZE
            valid_X_mb = valid_X[start:end]
            valid_y_mb = valid_y[start:end]

            losses = []
            for x, t in zip(valid_X_mb, valid_y_mb):
                t_in, t_out = t[:-1], t[1:]

                # Encoder
                x_embs = [dy.lookup(V, x_t) for x_t in x]
                he = encoder(x_embs)

                # Decoder
                t_embs = [dy.lookup(V, t_t) for t_t in t_in]
                decoder.set_initial_states(he)
                y, KL = decoder(t_embs)

                loss = dy.esum([
                    dy.pickneglogsoftmax(y_t, t_t) + KL_t
                    for y_t, t_t, KL_t in zip(y, t_out, KL)
                ])
                losses.append(loss)

            mb_loss = dy.average(losses)

            # Forward prop
            loss_all_valid.append(mb_loss.value())

        print('EPOCH: %d, Train Loss: %.3f, Valid Loss: %.3f' %
              (epoch + 1, np.mean(loss_all_train), np.mean(loss_all_valid)))

        # Save model ======================================================================================
        dy.save('./model_e' + str(epoch + 1), [V, encoder, decoder])
        with open('./w2i.dump', 'wb') as f_w2i, open('./i2w.dump', 'wb') as f_i2w:
            pickle.dump(w2i, f_w2i)
            pickle.dump(i2w, f_i2w)