def forward(self, observations):
    # calculate the forward pass of the CRF
    def log_sum_exp(scores):
        npval = scores.npvalue()
        argmax_score = np.argmax(npval)
        max_score_expr = dynet.pick(scores, argmax_score)
        max_score_expr_broadcast = dynet.concatenate([max_score_expr] * self.num_tags)
        return max_score_expr + dynet.logsumexp_dim(scores - max_score_expr_broadcast, 0)

    init_alphas = [-1e10] * self.num_tags
    init_alphas[START_TAG] = 0
    for_expr = dynet.inputVector(init_alphas)
    for obs in observations:
        alphas_t = []
        for next_tag in range(self.num_tags):
            obs_broadcast = dynet.concatenate([dynet.pick(obs, next_tag)] * self.num_tags)
            next_tag_expr = for_expr + self.trans_mat[next_tag] + obs_broadcast
            alphas_t.append(log_sum_exp(next_tag_expr))
        for_expr = dynet.concatenate(alphas_t)
    terminal_expr = for_expr + self.trans_mat[END_TAG]
    alpha = log_sum_exp(terminal_expr)
    return alpha
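# Hedged sketch of how forward() is typically used: the CRF negative
# log-likelihood is the partition value (forward) minus the gold-path score.
# `score_sentence` is an assumed companion method, not shown above.
def neg_log_loss(self, observations, tags):
    gold_score = self.score_sentence(observations, tags)  # assumed helper
    return self.forward(observations) - gold_score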
def pick_neg_log(self, pred, gold):
    # TODO make this a static function in both classes
    if not isinstance(gold, int) and not isinstance(gold, np.int64):
        # calculate cross-entropy loss against the whole vector
        dy_gold = dynet.inputVector(gold)
        return -dynet.sum_elems(dynet.cmult(dy_gold, dynet.log(pred)))
    return -dynet.log(dynet.pick(pred, gold))
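# Hedged usage sketch for pick_neg_log: an int gold picks a single log
# probability; a gold vector (e.g. soft labels from a teacher model) gives
# full cross-entropy. `model` is an assumed instance of the class above.
import dynet

dynet.renew_cg()
pred = dynet.softmax(dynet.inputVector([1.0, 2.0, 0.5]))
hard_loss = model.pick_neg_log(pred, 1)                # -log p(class 1)
soft_loss = model.pick_neg_log(pred, [0.1, 0.8, 0.1])  # -sum_i q_i log p_i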
def get_top_k_paths(self, all_paths, k=None, threshold=None):
    """
    Get the top k scoring paths
    """
    cg = renew_cg()
    path_scores = []
    lemma_lookup = self.model_parameters["lemma_lookup"]
    pos_lookup = self.model_parameters["pos_lookup"]
    dep_lookup = self.model_parameters["dep_lookup"]
    dir_lookup = self.model_parameters["dir_lookup"]
    builder = self.builder
    W = parameter(self.model_parameters["W"])

    for path in all_paths:
        path_embedding = get_path_embedding(builder, lemma_lookup, pos_lookup,
                                            dep_lookup, dir_lookup, path)
        if self.use_xy_embeddings:
            zero_word = inputVector([0.0] * self.lemma_dim)
            path_embedding = concatenate([zero_word, path_embedding, zero_word])
        path_scores.append(softmax(W * path_embedding).npvalue()[1])

    path_scores = np.array(path_scores)
    indices = np.argsort(-path_scores)
    if k is not None:
        indices = indices[:k]
    top_paths = [(all_paths[index], path_scores[index]) for index in indices
                 if threshold is None or path_scores[index] >= threshold]
    return top_paths
def get_summer(s, size):
    # list of values (bidirectional) => one value
    if s == "avg":
        return dy.average
    else:
        mask = [0.] * (size // 2) + [1.] * (size // 2)
        mask2 = [1.] * (size // 2) + [0.] * (size // 2)
        if s == "fend":
            return lambda x: dy.cmult(dy.inputVector(mask2), x[-1])
        elif s == "bend":
            return lambda x: dy.cmult(dy.inputVector(mask), x[0])
        elif s == "ends":
            return lambda x: (dy.cmult(dy.inputVector(mask2), x[-1]) +
                              dy.cmult(dy.inputVector(mask), x[0]))
        else:
            return None
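# Hedged usage sketch for get_summer: with a 100-dim BiLSTM output (50
# forward + 50 backward dims concatenated per step), "ends" keeps the
# forward half of the last state plus the backward half of the first state.
# `states` below is a stand-in, not real BiLSTM output.
import dynet as dy
import numpy as np

dy.renew_cg()
summer = get_summer("ends", 100)
states = [dy.inputVector(np.random.randn(100)) for _ in range(5)]  # fake BiLSTM states
sentence_repr = summer(states)  # 100-dim summary vector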
def predict(self, feature_vector, task_ids, train=False, soft_labels=False,
            temperature=None, dropout_rate=0.0, orthogonality_weight=0.0,
            domain_id=None):
    dynet.renew_cg()  # new graph
    feature_vector = feature_vector.toarray()
    feature_vector = np.squeeze(feature_vector, axis=0)

    # self.input = dynet.vecInput(self.vocab_size)
    # self.input.set(feature_vector)
    # TODO this takes too long; can we speed this up somehow?
    input = dynet.inputVector(feature_vector)
    for i in range(self.h_layers):
        if train:  # add some noise
            input = dynet.noise(input, self.noise_sigma)
            input = dynet.dropout(input, dropout_rate)
        input = self.layers[i](input)
    outputs = []
    for task_id in task_ids:
        output = self.output_layers_dict[task_id](input, soft_labels=soft_labels,
                                                  temperature=temperature)
        outputs.append(output)

    constraint, adv_loss = 0, 0
    if orthogonality_weight != 0:
        # put the orthogonality constraint either directly on the
        # output layer or on the hidden layer if it's an MLP
        F0_layer = self.output_layers_dict["F0"]
        F1_layer = self.output_layers_dict["F1"]
        F0_param = F0_layer.W_mlp if self.add_hidden else F0_layer.W
        F1_param = F1_layer.W_mlp if self.add_hidden else F1_layer.W
        F0_W = dynet.parameter(F0_param)
        F1_W = dynet.parameter(F1_param)

        # calculate the matrix product of the task matrix with both others
        matrix_product = dynet.transpose(F0_W) * F1_W

        # take the squared Frobenius norm by squaring
        # every element and then summing them
        squared_frobenius_norm = dynet.sum_elems(dynet.square(matrix_product))
        constraint += squared_frobenius_norm
        # print('Constraint with first matrix:', squared_frobenius_norm.value())

    if domain_id is not None:
        # flip the gradient when back-propagating through here
        adv_input = dynet.flip_gradient(input)  # last state
        adv_output = self.adv_layer(adv_input)
        adv_loss = self.pick_neg_log(adv_output, domain_id)
        # print('Adversarial loss:', avg_adv_loss.value())
    return outputs, constraint, adv_loss
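# Hedged standalone sketch of the orthogonality penalty computed above: the
# squared Frobenius norm of W_F0^T W_F1 is zero exactly when the two task
# subspaces are orthogonal. Shapes here are illustrative assumptions.
import dynet
import numpy as np

dynet.renew_cg()
W_F0 = dynet.inputTensor(np.random.randn(100, 10))  # hidden_dim x out_dim (assumed)
W_F1 = dynet.inputTensor(np.random.randn(100, 10))
penalty = dynet.sum_elems(dynet.square(dynet.transpose(W_F0) * W_F1))
# added to the training loss scaled by orthogonality_weight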
def get_top_k_paths(self, all_paths, relation_index, threshold):
    """
    Get the top k scoring paths
    """
    builder = self.builder
    model = self.model
    model_parameters = self.model_parameters
    lemma_lookup = model_parameters['lemma_lookup']
    pos_lookup = model_parameters['pos_lookup']
    dep_lookup = model_parameters['dep_lookup']
    dir_lookup = model_parameters['dir_lookup']

    path_scores = []
    for i, path in enumerate(all_paths):
        if i % 1000 == 0:
            cg = dy.renew_cg()
            W1 = dy.parameter(model_parameters['W1'])
            b1 = dy.parameter(model_parameters['b1'])
            W2 = None
            b2 = None
            if self.num_hidden_layers == 1:
                W2 = dy.parameter(model_parameters['W2'])
                b2 = dy.parameter(model_parameters['b2'])

        path_embedding = get_path_embedding(builder, lemma_lookup, pos_lookup,
                                            dep_lookup, dir_lookup, path)
        if self.use_xy_embeddings:
            zero_word = dy.inputVector([0.0] * self.lemma_embeddings_dim)
            path_embedding = dy.concatenate([zero_word, path_embedding, zero_word])

        h = W1 * path_embedding + b1
        if self.num_hidden_layers == 1:
            h = W2 * dy.tanh(h) + b2
        path_score = dy.softmax(h).npvalue().T
        path_scores.append(path_score)

    path_scores = np.vstack(path_scores)
    top_paths = []
    for i in range(len(relation_index)):
        indices = np.argsort(-path_scores[:, i])
        top_paths.append([(all_paths[index], path_scores[index, i])
                          for index in indices
                          if threshold is None or path_scores[index, i] >= threshold])
    return top_paths
def augment(scores, oracle_index, crossing=False):
    '''
    Add a hinge-style margin to the scores: +1 for every candidate except
    the oracle (the oracle also gets +1 only if `crossing` is True).
    '''
    assert isinstance(scores, dy.Expression)
    shape = scores.dim()[0]
    assert len(shape) == 1
    increment = np.ones(shape)
    increment[oracle_index] = crossing
    return scores + dy.inputVector(increment)
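# Hedged sketch of loss-augmented decoding with augment(): the margin is
# added before taking the argmax, so the model must beat every wrong
# candidate by at least 1. `label_scores` and `oracle` are illustrative.
import dynet as dy
import numpy as np

dy.renew_cg()
label_scores = dy.inputVector([2.0, 1.5, 0.1])
oracle = 0
augmented = augment(label_scores, oracle)
pred = int(np.argmax(augmented.npvalue()))
if pred != oracle:
    # hinge loss: score(pred) + 1 - score(oracle)
    loss = dy.pick(augmented, pred) - dy.pick(augmented, oracle)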
def get_w_repr(self, word, train=False, update=True):
    """
    Get representation of word (word embedding)
    """
    if train:
        if self.w_dropout_rate > 0.0:
            w_id = self.w2i[UNK] if drop(word, self.wcount, self.w_dropout_rate) \
                else self.w2i.get(word, self.w2i[UNK])
        else:
            # no word dropout: plain lookup (otherwise w_id would be unbound)
            w_id = self.w2i.get(word, self.w2i[UNK])
    else:
        if self.mimickx_model_path:  # if given, use MIMICKX
            if word not in self.w2i:
                # print("predict with MIMICKX for: ", word)
                return dynet.inputVector(self.mimickx_model.predict(word).npvalue())
        w_id = self.w2i.get(word, self.w2i[UNK])
    if not update:
        return dynet.nobackprop(self.wembeds[w_id])
    else:
        return self.wembeds[w_id]
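# Hedged sketch of the drop() helper assumed above; it mirrors the inline
# word-dropout formula used in the fit() functions further down: word x is
# dropped (replaced by UNK) with probability rate / (count(x) + rate), so
# rare words are dropped more often than frequent ones.
import random

def drop(x, xcount, dropout_rate):
    return random.random() > (xcount.get(x, 0) / (dropout_rate + xcount.get(x, 0)))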
def viterbi(self, observations, unk_tag=None, dictionary=None):
    # if dictionary:
    #     raise NotImplementedError("type constraints not yet implemented for CRF")
    backpointers = []
    init_vvars = [-1e10] * self.num_tags
    init_vvars[START_TAG] = 0  # <Start> has all the probability
    for_expr = dynet.inputVector(init_vvars)
    trans_exprs = [self.trans_mat[idx] for idx in range(self.num_tags)]
    for obs in observations:
        bptrs_t = []
        vvars_t = []
        for next_tag in range(self.num_tags):
            next_tag_expr = for_expr + trans_exprs[next_tag]
            next_tag_arr = next_tag_expr.npvalue()
            best_tag_id = np.argmax(next_tag_arr)
            if unk_tag:
                best_tag = self.index2tag[best_tag_id]
                if best_tag == unk_tag:
                    next_tag_arr[np.argmax(next_tag_arr)] = 0  # set to 0
                    best_tag_id = np.argmax(next_tag_arr)  # get second best
            bptrs_t.append(best_tag_id)
            vvars_t.append(dynet.pick(next_tag_expr, best_tag_id))
        for_expr = dynet.concatenate(vvars_t) + obs
        backpointers.append(bptrs_t)
    # Perform final transition to terminal
    terminal_expr = for_expr + trans_exprs[END_TAG]
    terminal_arr = terminal_expr.npvalue()
    best_tag_id = np.argmax(terminal_arr)
    path_score = dynet.pick(terminal_expr, best_tag_id)
    # Reverse over the backpointers to get the best path
    best_path = [best_tag_id]  # Start with the tag that was best for terminal
    for bptrs_t in reversed(backpointers):
        best_tag_id = bptrs_t[best_tag_id]
        best_path.append(best_tag_id)
    start = best_path.pop()  # Remove the start symbol
    best_path.reverse()
    assert start == START_TAG
    # Return best path and best path's score
    return best_path, path_score
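# Hedged decode sketch: per-token emission scores come from the tagger
# (`build_tagging_graph` is an assumed helper, not shown here); viterbi()
# then returns the best tag sequence and its score.
dynet.renew_cg()
observations = crf.build_tagging_graph(word_indices)  # assumed emission scores
best_path, path_score = crf.viterbi(observations)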
def evaluate_adversary(self, dataset):
    loss = 0
    acc = 0
    tot = len(dataset)
    predictions = []
    for i, ex in enumerate(dataset):
        dy.renew_cg()
        vec, labels = ex
        vec = dy.inputVector(vec)
        l, p = self.adversary_classifier.get_loss_and_prediction(vec, labels)
        predictions.append(p)
        if p == labels:
            acc += 1
        loss += l.value()
    return loss / tot, acc / tot * 100, predictions
def predict(self, feature_vector, train=False, soft_labels=False,
            temperature=None, dropout_rate=None):
    dynet.renew_cg()  # new graph
    feature_vector = feature_vector.toarray()
    feature_vector = np.squeeze(feature_vector, axis=0)

    # self.input = dynet.vecInput(self.vocab_size)
    # self.input.set(feature_vector)
    # TODO this takes too long; can we speed this up somehow?
    input = dynet.inputVector(feature_vector)
    for i in range(self.h_layers - 1):
        if train:  # add some noise
            input = dynet.noise(input, self.noise_sigma)
            input = dynet.dropout(input, dropout_rate)
        input = self.layers[i](input)
    output = self.layers[-1](input, soft_labels=soft_labels,
                             temperature=temperature)
    return output
def fit(self, train_X, train_Y, num_epochs, val_X=None, val_Y=None,
        patience=2, model_path=None, seed=None, word_dropout_rate=0.25,
        trg_vectors=None, unsup_weight=1.0, variance_weights=None,
        labeled_weight_proportion=1.0):
    """
    train the tagger
    :param trg_vectors: the prediction targets used for the unsupervised
                        loss in temporal ensembling
    :param unsup_weight: weight for the unsupervised consistency loss
                         used in temporal ensembling
    :param labeled_weight_proportion: proportion of the unsupervised weight
                                      that should be assigned to labeled
                                      examples
    """
    print("read training data", file=sys.stderr)
    if variance_weights is not None:
        print('First 20 variance weights:', variance_weights[:20])
    if seed:
        print(">>> using seed: ", seed, file=sys.stderr)
        random.seed(seed)  # setting random seed

    # if we use word dropout, keep track of counts
    if word_dropout_rate > 0.0:
        widCount = Counter()
        for sentence, _ in train_X:
            widCount.update([w for w in sentence])

    assert len(train_X) == len(train_Y)
    train_data = list(zip(train_X, train_Y))

    # if we use target vectors, keep track of the targets per sentence
    if trg_vectors is not None:
        trg_start_id = 0
        sentence_trg_vectors = []
        sentence_var_weights = []
        for i, (example, y) in enumerate(train_data):
            sentence_trg_vectors.append(
                trg_vectors[trg_start_id:trg_start_id + len(example[0]), :])
            if variance_weights is not None:
                sentence_var_weights.append(
                    variance_weights[trg_start_id:trg_start_id + len(example[0])])
            trg_start_id += len(example[0])
        assert trg_start_id == len(trg_vectors),\
            'Error: Idx {} is not at {}.'.format(trg_start_id, len(trg_vectors))
        assert len(sentence_trg_vectors) == len(train_X)
        if variance_weights is not None:
            assert trg_start_id == len(variance_weights)
            assert len(sentence_var_weights) == len(train_X)

    print('Starting training for {} epochs...'.format(num_epochs))
    best_val_acc, epochs_no_improvement = 0., 0
    if val_X is not None and val_Y is not None and model_path is not None:
        print('Using early stopping with patience of {}...'.format(patience))

    for cur_iter in range(num_epochs):
        bar = Bar('Training epoch {}/{}...'.format(cur_iter + 1, num_epochs),
                  max=len(train_data), flush=True)
        total_loss = 0.0
        total_tagged = 0.0
        total_other_loss, total_other_loss_weighted = 0.0, 0.0

        random_indices = np.arange(len(train_data))
        random.shuffle(random_indices)

        for i, idx in enumerate(random_indices):
            (word_indices, char_indices), y = train_data[idx]

            if word_dropout_rate > 0.0:
                # drop word w with probability rate / (count(w) + rate)
                word_indices = [
                    self.w2i["_UNK"] if
                    (random.random() >
                     (widCount.get(w) / (word_dropout_rate + widCount.get(w))))
                    else w for w in word_indices
                ]
            output = self.predict(word_indices, char_indices, train=True)

            if len(y) == 1 and y[0] == 0:
                # in temporal ensembling, we assign a dummy label of [0] for
                # unlabeled sequences; we skip the supervised loss for these
                loss = dynet.scalarInput(0)
            else:
                loss = dynet.esum([
                    self.pick_neg_log(pred, gold)
                    for pred, gold in zip(output, y)
                ])

            if trg_vectors is not None:
                # the consistency loss in temporal ensembling is used for
                # both supervised and unsupervised input
                targets = sentence_trg_vectors[idx]
                assert len(output) == len(targets)
                if variance_weights is not None:
                    var_weights = sentence_var_weights[idx]
                    assert len(output) == len(var_weights)
                    # multiply the normalized mean variance with each loss
                    other_loss = dynet.esum([
                        v * dynet.squared_distance(o, dynet.inputVector(t))
                        for o, t, v in zip(output, targets, var_weights)
                    ])
                else:
                    other_loss = dynet.esum([
                        dynet.squared_distance(o, dynet.inputVector(t))
                        for o, t in zip(output, targets)
                    ])
                total_other_loss += other_loss.value()

                if len(y) == 1 and y[0] == 0:  # unlabeled example
                    other_loss = other_loss * unsup_weight
                else:  # labeled example
                    # assign the unsupervised weight for labeled examples
                    other_loss = other_loss * unsup_weight * labeled_weight_proportion

                # keep track for logging
                total_loss += loss.value()  # main loss
                total_tagged += len(word_indices)
                total_other_loss_weighted += other_loss.value()

                # combine losses
                loss += other_loss
            else:
                # keep track for logging
                total_loss += loss.value()
                total_tagged += len(word_indices)

            loss.backward()
            self.trainer.update()
            bar.next()

        if trg_vectors is None:
            print("iter {2} {0:>12}: {1:.2f}".format(
                "total loss", total_loss / total_tagged, cur_iter),
                file=sys.stderr)
        else:
            print("iter {2} {0:>12}: {1:.2f} unsupervised loss: {3:.2f} "
                  "(weighted: {4:.2f})".format(
                      "supervised loss", total_loss / total_tagged, cur_iter,
                      total_other_loss / total_tagged,
                      total_other_loss_weighted / total_tagged),
                  file=sys.stderr)

        if val_X is not None and val_Y is not None and model_path is not None:
            # get the best accuracy on the validation set
            val_correct, val_total = self.evaluate(val_X, val_Y)
            val_accuracy = val_correct / val_total

            if val_accuracy > best_val_acc:
                print('Accuracy {:.4f} is better than best val accuracy '
                      '{:.4f}.'.format(val_accuracy, best_val_acc))
                best_val_acc = val_accuracy
                epochs_no_improvement = 0
                save_tagger(self, model_path)
            else:
                print('Accuracy {:.4f} is worse than best val accuracy '
                      '{:.4f}.'.format(val_accuracy, best_val_acc))
                epochs_no_improvement += 1
            if epochs_no_improvement == patience:
                print('No improvement for {} epochs. Early stopping...'
                      .format(epochs_no_improvement))
                break
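# Hedged sketch of how the temporal-ensembling targets (trg_vectors) used by
# fit() are commonly produced (Laine & Aila, 2017): an exponential moving
# average of past model predictions with bias correction. All names below
# are illustrative assumptions, not part of the tagger above.
ensemble_preds = np.zeros((num_tokens, num_labels))  # running EMA of predictions
alpha = 0.6
for epoch in range(num_epochs):
    preds = collect_predictions(tagger)  # assumed: (num_tokens, num_labels)
    ensemble_preds = alpha * ensemble_preds + (1 - alpha) * preds
    trg_vectors = ensemble_preds / (1 - alpha ** (epoch + 1))  # bias correction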
def train_adversary(self, train, dev):
    lr = self.args.learning_rate
    dc = self.args.decay_constant
    random.shuffle(train)
    sample_train = train[:len(dev)]
    self.trainer.learning_rate = lr
    epochs = self.args.iterations_adversary
    n_updates = 0
    best = 0
    ibest = 0
    for epoch in range(self.args.iterations_adversary):
        random.shuffle(train)
        for i, example in enumerate(train):
            dy.renew_cg()
            vec, label = example
            vec = dy.inputVector(vec)
            sys.stderr.write("\r{}%".format(i / len(train) * 100))
            loss = self.adversary_classifier.get_loss(vec, label)
            loss.backward()
            self.trainer.update()
            # inverse time decay of the learning rate
            self.trainer.learning_rate = lr / (1 + n_updates * dc)
            n_updates += 1
        sys.stderr.write("\r")
        targets_t = [label for _, label in sample_train]
        targets_d = [label for _, label in dev]
        loss_t, acc_t, predictions_t = self.evaluate_adversary(sample_train)
        loss_d, acc_d, predictions_d = self.evaluate_adversary(dev)
        ftrain = compute_eval_metrics(self.adversary_classifier.output_size(),
                                      targets_t, predictions_t)
        fdev = compute_eval_metrics(self.adversary_classifier.output_size(),
                                    targets_d, predictions_d)
        Fscore = "F: t = {} d = {}".format(ftrain, fdev)
        compare = fdev[2]
        if "tp" in self.args.dataset or "bl" in self.args.dataset:
            acc_all = fdev[3]
            compare = sum(acc_all) / len(acc_all)
        if compare >= best:
            best = compare
            ibest = epoch
            self.model.save("{}/adverse_model{}".format(self.output_folder, ibest))
        print("Epoch {} train: l={:.4f} acc={:.2f} dev: l={:.4f} acc={:.2f} {}"
              .format(epoch, loss_t, acc_t, loss_d, acc_d, Fscore), flush=True)
    if epochs > 0:
        self.model.populate("{}/adverse_model{}".format(self.output_folder, ibest))
    return best
def pick_neg_log(self, pred, gold):
    if not isinstance(gold, int):
        # calculate cross-entropy loss against the whole vector
        dy_gold = dynet.inputVector(gold)
        return -dynet.sum_elems(dynet.cmult(dy_gold, dynet.log(pred)))
    return -dynet.log(dynet.pick(pred, gold))
def fit(self, train_dict, num_epochs, val_X=None, val_Y=None, patience=2,
        model_path=None, seed=None, word_dropout_rate=0.25, trg_vectors=None,
        unsup_weight=1.0, clip_threshold=5.0, orthogonality_weight=0.0,
        adversarial=False, adversarial_weight=1.0, ignore_src_Ft=False):
    """
    train the tagger
    :param trg_vectors: the prediction targets used for the unsupervised
                        loss in temporal ensembling
    :param unsup_weight: weight for the unsupervised consistency loss
                         used in temporal ensembling
    :param clip_threshold: use gradient clipping with threshold
                           (on if >0; default: 5.0)
    :param adversarial: note: if we want to use adversarial, we have to
                        call add_adversarial_loss before
    :param adversarial_weight: 1 by default (do not weigh adv loss)
    :param ignore_src_Ft: if asymm. tri. 2nd stage, do not further train
                          Ft on 'src'
    :param train_dict: a dictionary mapping tasks ("F0", "F1", and "Ft")
                       to a dictionary
                       {"X": list of examples,
                        "Y": list of labels,
                        "domain": list of domain tags (0,1) of examples}
                       Three tasks are indexed as "F0", "F1" and "Ft".
                       Note: if a task 'src' is given, then a single model
                       with three heads is trained where all data is given
                       to all outputs
    """
    print("read training data")

    widCount = Counter()
    train_data = []
    for task, task_dict in train_dict.items():  # task: e.g. "F0"
        for key in ["X", "Y", "domain"]:
            assert key in task_dict, "Error: %s is not available." % key
        examples, labels, domain_tags = \
            task_dict["X"], task_dict["Y"], task_dict["domain"]
        assert len(examples) == len(labels)
        if word_dropout_rate > 0.0:
            # keep track of the counts for word dropout
            for sentence, _ in examples:
                widCount.update([w for w in sentence])

        # train data is a list of 4-tuples: (example, label, task_id, domain_id)
        train_data += list(zip(examples, labels,
                               [[task] * len(labels)][0], domain_tags))

    # if we use target vectors, keep track of the targets per sentence
    if trg_vectors is not None:
        trg_start_id = 0
        sentence_trg_vectors = []
        for i, (example, y) in enumerate(train_data):
            sentence_trg_vectors.append(
                trg_vectors[trg_start_id:trg_start_id + len(example[0]), :])
            trg_start_id += len(example[0])
        assert trg_start_id == len(trg_vectors),\
            'Error: Idx {} is not at {}.'.format(trg_start_id, len(trg_vectors))

    print('Starting training for {} epochs...'.format(num_epochs))
    best_val_acc, epochs_no_improvement = 0., 0
    if val_X is not None and val_Y is not None and model_path is not None:
        print('Using early stopping with patience of {}...'.format(patience))

    if seed:
        random.seed(seed)

    for cur_iter in range(num_epochs):
        bar = Bar('Training epoch {}/{}...'.format(cur_iter + 1, num_epochs),
                  max=len(train_data), flush=True)

        random_indices = np.arange(len(train_data))
        random.shuffle(random_indices)

        total_loss, total_tagged, total_constraint, total_adversarial = \
            0.0, 0.0, 0.0, 0.0
        total_orth_constr = 0  # count how many updates

        # log separate losses
        log_losses = {}
        log_total = {}
        for task_id in self.task_ids:
            log_losses[task_id] = 0.0
            log_total[task_id] = 0

        for i, idx in enumerate(random_indices):
            (word_indices, char_indices), y, task_id, domain_id = train_data[idx]

            if word_dropout_rate > 0.0:
                # drop word w with probability rate / (count(w) + rate)
                word_indices = [
                    self.w2i["_UNK"] if
                    (random.random() >
                     (widCount.get(w) / (word_dropout_rate + widCount.get(w))))
                    else w for w in word_indices
                ]
            output, constraint, adv = self.predict(
                word_indices, char_indices, task_id, train=True,
                orthogonality_weight=orthogonality_weight,
                domain_id=domain_id if adversarial else None)

            if task_id not in ['src', 'trg']:
                if len(y) == 1 and y[0] == 0:
                    # in temporal ensembling, we assign a dummy label of [0] for
                    # unlabeled sequences; we skip the supervised loss for these
                    loss = dynet.scalarInput(0)
                else:
                    loss = dynet.esum([
                        self.pick_neg_log(pred, gold)
                        for pred, gold in zip(output, y)
                    ])

                if trg_vectors is not None:
                    # the consistency loss in temporal ensembling is used for
                    # both supervised and unsupervised input
                    targets = sentence_trg_vectors[idx]
                    assert len(output) == len(targets)
                    other_loss = unsup_weight * dynet.average([
                        dynet.squared_distance(o, dynet.inputVector(t))
                        for o, t in zip(output, targets)
                    ])
                    loss += other_loss

                if orthogonality_weight != 0.0 and task_id != 'Ft':
                    # add the orthogonality constraint to the loss
                    total_constraint += constraint.value() * orthogonality_weight
                    total_orth_constr += 1
                    loss += constraint * orthogonality_weight

                if adversarial:
                    total_adversarial += adv.value() * adversarial_weight
                    loss += adv * adversarial_weight

                # keep track of per-task and overall losses for logging
                total_loss += loss.value()  # for output
                log_losses[task_id] += loss.value()
                total_tagged += len(word_indices)
                log_total[task_id] += len(word_indices)

                loss.backward()
                self.trainer.update()
                bar.next()
            else:
                # bootstrap=False: the output contains a list of outputs,
                # one for each task
                assert trg_vectors is None, \
                    'temporal ensembling not implemented for bootstrap=False'
                loss = dynet.scalarInput(1)  # initialize
                if ignore_src_Ft:
                    # ignore last head (= Ft) when further training with 'src'
                    output = output[:-1]

                for t_i, output_t in enumerate(output):  # get loss for each task
                    task_loss = dynet.esum([
                        self.pick_neg_log(pred, gold)
                        for pred, gold in zip(output_t, y)
                    ])
                    loss += task_loss
                    task_id = self.task_ids[t_i]
                    log_losses[task_id] += task_loss.value()
                    log_total[task_id] += len(word_indices)

                if orthogonality_weight != 0.0:
                    # add the orthogonality constraint to the loss
                    total_constraint += constraint.value() * orthogonality_weight
                    total_orth_constr += 1
                    loss += constraint * orthogonality_weight

                if adversarial:
                    total_adversarial += adv.value() * adversarial_weight
                    loss += adv * adversarial_weight

                total_loss += loss.value()  # for output
                total_tagged += len(word_indices)

                loss.backward()
                self.trainer.update()
                bar.next()

        if adversarial and orthogonality_weight:
            print("iter {}. Total loss: {:.3f}, total penalty: {:.3f}, "
                  "total weighted adv loss: {:.3f}".format(
                      cur_iter, total_loss / total_tagged,
                      total_constraint / total_orth_constr,
                      total_adversarial / total_tagged),
                  file=sys.stderr)
        elif orthogonality_weight:
            print("iter {}. Total loss: {:.3f}, total penalty: {:.3f}".format(
                cur_iter, total_loss / total_tagged,
                total_constraint / total_orth_constr),
                file=sys.stderr)
        else:
            print("iter {}. Total loss: {:.3f}".format(
                cur_iter, total_loss / total_tagged), file=sys.stderr)

        for task_id in self.task_ids:
            if log_total[task_id] > 0:
                print("{0}: {1:.3f}".format(
                    task_id, log_losses[task_id] / log_total[task_id]))

        if val_X is not None and val_Y is not None and model_path is not None:
            # get the best accuracy on the validation set
            val_correct, val_total = self.evaluate(val_X, val_Y)
            val_accuracy = val_correct / val_total

            if val_accuracy > best_val_acc:
                print('Accuracy {:.4f} is better than best val accuracy '
                      '{:.4f}.'.format(val_accuracy, best_val_acc))
                best_val_acc = val_accuracy
                epochs_no_improvement = 0
                save_tagger(self, model_path)
            else:
                print('Accuracy {:.4f} is worse than best val accuracy '
                      '{:.4f}.'.format(val_accuracy, best_val_acc))
                epochs_no_improvement += 1
            if epochs_no_improvement == patience:
                print('No improvement for {} epochs. Early stopping...'
                      .format(epochs_no_improvement))
                break
def compute_decoder_batch_loss(self, encoded_inputs, input_masks,
                               output_word_ids, output_masks, batch_size):
    self.readout = dn.parameter(self.params['readout'])
    self.bias = dn.parameter(self.params['bias'])
    self.w_c = dn.parameter(self.params['w_c'])
    self.u_a = dn.parameter(self.params['u_a'])
    self.v_a = dn.parameter(self.params['v_a'])
    self.w_a = dn.parameter(self.params['w_a'])

    # initialize the decoder rnn
    s_0 = self.decoder_rnn.initial_state()

    # initial "input feeding" vectors to feed decoder - 3*h
    init_input_feeding = dn.lookup_batch(self.init_lookup, [0] * batch_size)

    # initial feedback embeddings for the decoder, use begin seq symbol embedding
    init_feedback = dn.lookup_batch(
        self.output_lookup, [self.y2int[common.BEGIN_SEQ]] * batch_size)

    # init decoder rnn
    decoder_init = dn.concatenate([init_feedback, init_input_feeding])
    s = s_0.add_input(decoder_init)

    # loss per timestep
    losses = []

    # run the decoder through the output sequences and aggregate loss
    for i, step_word_ids in enumerate(output_word_ids):

        # returns h x batch size matrix
        decoder_rnn_output = s.output()

        # compute attention context vector for each sequence in the batch
        # (returns 2h x batch size matrix)
        attention_output_vector, alphas = self.attend(
            encoded_inputs, decoder_rnn_output, input_masks)

        # compute output scores (returns vocab_size x batch size matrix)
        # h = readout * attention_output_vector + bias
        h = dn.affine_transform(
            [self.bias, self.readout, attention_output_vector])

        # encourage diversity by punishing highly confident predictions
        # TODO: support batching - esp. w.r.t. scalar inputs
        if self.diverse:
            soft = dn.softmax(dn.tanh(h))
            batch_loss = dn.pick_batch(-dn.log(soft), step_word_ids) \
                - dn.log(dn.scalarInput(1) - dn.pick_batch(soft, step_word_ids)) \
                - dn.log(dn.scalarInput(4))
        else:
            # get batch loss for this timestep
            batch_loss = dn.pickneglogsoftmax_batch(h, step_word_ids)

        # mask the loss if at least one sentence is shorter
        if output_masks and output_masks[i][-1] != 1:
            mask_expr = dn.inputVector(output_masks[i])
            # noinspection PyArgumentList
            mask_expr = dn.reshape(mask_expr, (1,), batch_size)
            batch_loss = batch_loss * mask_expr

        # input feeding approach - input h (attention_output_vector) to the
        # decoder; prepare for the next iteration - "feedback"
        feedback_embeddings = dn.lookup_batch(self.output_lookup, step_word_ids)
        decoder_input = dn.concatenate(
            [feedback_embeddings, attention_output_vector])
        s = s.add_input(decoder_input)

        losses.append(batch_loss)

    # sum the loss over the time steps and batch
    total_batch_loss = dn.sum_batches(dn.esum(losses))
    return total_batch_loss
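# Hedged sketch of the masking trick above: the 0/1 mask for one timestep is
# reshaped into a (1,)-dim expression with batch dimension batch_size, so the
# elementwise product zeroes out losses of sequences that already ended.
# The loss values below are stand-ins.
import dynet as dn
import numpy as np

dn.renew_cg()
batch_size = 3
step_mask = [1.0, 1.0, 0.0]  # third sequence has already ended
fake_losses = dn.inputTensor(np.array([0.7, 1.2, 0.9]), batched=True)
mask_expr = dn.reshape(dn.inputVector(step_mask), (1,), batch_size)
masked_loss = fake_losses * mask_expr  # loss of the finished sequence becomes 0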
def fit(self, train_X, train_Y, num_epochs, val_X=None, val_Y=None,
        patience=2, model_path=None, seed=None, word_dropout_rate=0.25,
        trg_vectors=None, unsup_weight=1.0, labeled_weight_proportion=1.0):
    """
    train the model
    :param trg_vectors: the prediction targets used for the unsupervised
                        loss in temporal ensembling
    :param unsup_weight: weight for the unsupervised consistency loss
                         used in temporal ensembling
    """
    if seed:
        print(">>> using seed: ", seed, file=sys.stderr)
        random.seed(seed)  # setting random seed

    assert train_X.shape[0] == len(train_Y), \
        '# examples %d != # labels %d.' % (train_X.shape[0], len(train_Y))
    train_data = list(zip(train_X, train_Y))

    print('Starting training for %d epochs...' % num_epochs)
    best_val_f1, epochs_no_improvement = 0., 0
    if val_X is not None and val_Y is not None and model_path is not None:
        print('Using early stopping with patience of %d...' % patience)

    for cur_iter in range(num_epochs):
        bar = Bar('Training epoch %d/%d...' % (cur_iter + 1, num_epochs),
                  max=len(train_data), flush=True)
        total_loss = 0.0

        random_indices = np.arange(len(train_data))
        random.shuffle(random_indices)

        for i, idx in enumerate(random_indices):
            x, y = train_data[idx]
            output = self.predict(x, train=True, dropout_rate=word_dropout_rate)

            # in temporal ensembling, we assign a dummy label of -1 for
            # unlabeled sequences; we skip the supervised loss for these
            loss = dynet.scalarInput(0) if y == -1 else self.pick_neg_log(output, y)

            if trg_vectors is not None:
                # the consistency loss in temporal ensembling is used for
                # both supervised and unsupervised input
                target = trg_vectors[idx]
                other_loss = dynet.squared_distance(
                    output, dynet.inputVector(target))
                if y != -1:
                    other_loss *= labeled_weight_proportion
                loss += other_loss * unsup_weight

            total_loss += loss.value()
            loss.backward()
            self.trainer.update()
            bar.next()

        print(" iter {2} {0:>12}: {1:.2f}".format(
            "total loss", total_loss / len(train_data), cur_iter),
            file=sys.stderr)

        if val_X is not None and val_Y is not None and model_path is not None:
            # get the best F1 score on the validation set
            val_f1 = self.evaluate(val_X, val_Y)
            if val_f1 > best_val_f1:
                print('F1 %.4f is better than best val F1 %.4f.' %
                      (val_f1, best_val_f1))
                best_val_f1 = val_f1
                epochs_no_improvement = 0
                save_model(self, model_path)
            else:
                print('F1 %.4f is worse than best val F1 %.4f.' %
                      (val_f1, best_val_f1))
                epochs_no_improvement += 1
            if epochs_no_improvement == patience:
                print('No improvement for %d epochs. Early stopping...' %
                      epochs_no_improvement)
                break
def pick_neg_log(self, pred, gold):
    if hasattr(gold, "__len__"):
        # calculate cross-entropy loss against the whole vector
        dy_gold = dynet.inputVector(gold)
        return -dynet.sum_elems(dynet.cmult(dy_gold, dynet.log(pred)))
    return -dynet.log(dynet.pick(pred, gold))
def fit(self, train_dict, num_epochs, val_X=None, val_Y=None, patience=2,
        model_path=None, seed=None, word_dropout_rate=0.25, trg_vectors=None,
        unsup_weight=1.0, orthogonality_weight=0.0, adversarial=False):
    """
    train the model
    :param trg_vectors: the prediction targets used for the unsupervised
                        loss in temporal ensembling
    :param unsup_weight: weight for the unsupervised consistency loss
                         used in temporal ensembling
    """
    if seed:
        print(">>> using seed: ", seed, file=sys.stderr)
        random.seed(seed)  # setting random seed

    train_data = []
    for task, task_dict in train_dict.items():
        for key in ["X", "Y", "domain"]:
            assert key in task_dict, "Error: %s is not available." % key
        examples, labels, domain_tags = \
            task_dict["X"], task_dict["Y"], task_dict["domain"]
        assert examples.shape[0] == len(labels)

        # train data is a list of 4-tuples: (example, label, task_id, domain_id)
        train_data += list(zip(examples, labels,
                               [[task] * len(labels)][0], domain_tags))

    print('Starting training for %d epochs...' % num_epochs)
    best_val_f1, epochs_no_improvement = 0., 0
    if val_X is not None and val_Y is not None and model_path is not None:
        print('Using early stopping with patience of %d...' % patience)

    for cur_iter in range(num_epochs):
        bar = Bar('Training epoch %d/%d...' % (cur_iter + 1, num_epochs),
                  max=len(train_data), flush=True)
        total_loss, total_constraint, total_adversarial = 0.0, 0.0, 0.0

        random_indices = np.arange(len(train_data))
        random.shuffle(random_indices)

        for i, idx in enumerate(random_indices):
            x, y, task_id, domain_id = train_data[idx]
            task_ids = [task_id]
            if task_id == 'src':
                # we train both F0 and F1 on source data
                task_ids = ['F0', 'F1']
            elif task_id == 'src_all':
                # we train F0, F1, and Ft on source data for base training
                task_ids = ['F0', 'F1', 'Ft']

            loss = 0
            outputs, constraint, adv = self.predict(
                x, task_ids, train=True, dropout_rate=word_dropout_rate,
                orthogonality_weight=orthogonality_weight,
                domain_id=domain_id if adversarial else None)

            # in temporal ensembling, we assign a dummy label of -1 for
            # unlabeled sequences; we skip the supervised loss for these
            for output in outputs:
                loss += dynet.scalarInput(0) if y == -1 \
                    else self.pick_neg_log(output, y)

                if trg_vectors is not None:
                    # the consistency loss in temporal ensembling is used for
                    # both supervised and unsupervised input
                    target = trg_vectors[idx]
                    other_loss = dynet.squared_distance(
                        output, dynet.inputVector(target))
                    loss += other_loss * unsup_weight

            # the orthogonality weight is the same for every prediction,
            # so we can add it in the end
            if orthogonality_weight != 0.0:
                # add the orthogonality constraint to the loss
                loss += constraint * orthogonality_weight
                total_constraint += constraint.value()
            if adversarial:
                total_adversarial += adv.value()
                loss += adv

            total_loss += loss.value()
            loss.backward()
            self.trainer.update()
            bar.next()

        print("\niter {}. Total loss: {:.3f}, total penalty: {:.3f}, "
              "adv: {:.3f}".format(cur_iter, total_loss / len(train_data),
                                   total_constraint / len(train_data),
                                   total_adversarial / len(train_data)),
              file=sys.stderr)

        if val_X is not None and val_Y is not None and model_path is not None:
            # get the best F1 score on the validation set
            val_f1 = self.evaluate(val_X, val_Y, 'F0')
            if val_f1 > best_val_f1:
                print('F1 %.4f is better than best val F1 %.4f.' %
                      (val_f1, best_val_f1))
                best_val_f1 = val_f1
                epochs_no_improvement = 0
                save_mttri_model(self, model_path)
            else:
                print('F1 %.4f is worse than best val F1 %.4f.' %
                      (val_f1, best_val_f1))
                epochs_no_improvement += 1
            if epochs_no_improvement == patience:
                print('No improvement for %d epochs. Early stopping...' %
                      epochs_no_improvement)
                break