def do_cpu():
    import time
    import _dynet as C
    C.init()
    cm = C.Model()
    cpW = cm.add_parameters((1000, 1000))
    s = time.time()
    C.renew_cg()
    W = C.parameter(cpW)
    W = W * W * W * W * W * W * W
    z = C.squared_distance(W, W)
    z.value()
    z.backward()
    print("CPU time:", time.time() - s)
def do_gpu():
    import time
    import sys
    import _dynet as G
    sys.argv.append('--dynet-devices')
    sys.argv.append('GPU:0')
    G.init()
    gm = G.Model()
    gpW = gm.add_parameters((1000, 1000))
    s = time.time()
    G.renew_cg()
    W = G.parameter(gpW)
    W = W * W * W * W * W * W * W
    z = G.squared_distance(W, W)
    z.value()
    z.backward()
    print("GPU time:", time.time() - s)
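# A minimal driver sketch for the two benchmarks above. DyNet can only be
# initialized once per process, so do_cpu() and do_gpu() are run in separate
# child processes here; this multiprocessing-based driver is an assumption and
# not part of the original snippets.
if __name__ == '__main__':
    from multiprocessing import Process
    for fn in (do_cpu, do_gpu):
        p = Process(target=fn)
        p.start()
        p.join()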
def fit(self, train_X, train_Y, num_epochs, val_X=None, val_Y=None,
        patience=2, model_path=None, seed=None, word_dropout_rate=0.25,
        trg_vectors=None, unsup_weight=1.0, labeled_weight_proportion=1.0):
    """
    train the model
    :param trg_vectors: the prediction targets used for the unsupervised
                        loss in temporal ensembling
    :param unsup_weight: weight for the unsupervised consistency loss
                         used in temporal ensembling
    """
    if seed:
        print(">>> using seed: ", seed, file=sys.stderr)
        random.seed(seed)  # setting random seed

    assert train_X.shape[0] == len(train_Y), \
        '# examples %d != # labels %d.' % (train_X.shape[0], len(train_Y))
    train_data = list(zip(train_X, train_Y))

    print('Starting training for %d epochs...' % num_epochs)
    best_val_f1, epochs_no_improvement = 0., 0

    if val_X is not None and val_Y is not None and model_path is not None:
        print('Using early stopping with patience of %d...' % patience)

    for cur_iter in range(num_epochs):
        bar = Bar('Training epoch %d/%d...' % (cur_iter + 1, num_epochs),
                  max=len(train_data), flush=True)
        total_loss = 0.0

        random_indices = np.arange(len(train_data))
        random.shuffle(random_indices)

        for i, idx in enumerate(random_indices):
            x, y = train_data[idx]
            output = self.predict(x, train=True,
                                  dropout_rate=word_dropout_rate)

            # in temporal ensembling, we assign a dummy label of -1 for
            # unlabeled sequences; we skip the supervised loss for these
            loss = dynet.scalarInput(0) if y == -1 else \
                self.pick_neg_log(output, y)

            if trg_vectors is not None:
                # the consistency loss in temporal ensembling is used for
                # both supervised and unsupervised input
                target = trg_vectors[idx]
                other_loss = dynet.squared_distance(
                    output, dynet.inputVector(target))
                if y != -1:
                    other_loss *= labeled_weight_proportion
                loss += other_loss * unsup_weight
            total_loss += loss.value()

            loss.backward()
            self.trainer.update()
            bar.next()

        print(" iter {2} {0:>12}: {1:.2f}".format(
            "total loss", total_loss / len(train_data), cur_iter),
            file=sys.stderr)

        if val_X is not None and val_Y is not None and model_path is not None:
            # get the best F1 score on the validation set
            val_f1 = self.evaluate(val_X, val_Y)

            if val_f1 > best_val_f1:
                print('F1 %.4f is better than best val F1 %.4f.' %
                      (val_f1, best_val_f1))
                best_val_f1 = val_f1
                epochs_no_improvement = 0
                save_model(self, model_path)
            else:
                print('F1 %.4f is worse than best val F1 %.4f.' %
                      (val_f1, best_val_f1))
                epochs_no_improvement += 1
            if epochs_no_improvement == patience:
                print('No improvement for %d epochs. Early stopping...' %
                      epochs_no_improvement)
                break
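# A hypothetical invocation of the fit() method above with temporal ensembling
# enabled. The names `model`, `train_X`, `train_Y`, `dev_X`, `dev_Y`, and
# `ensemble_targets` are assumptions for illustration; unlabeled examples
# carry the dummy label -1, as the training loop above expects.
model.fit(train_X, train_Y, num_epochs=10,
          val_X=dev_X, val_Y=dev_Y, patience=2, model_path='best_model',
          trg_vectors=ensemble_targets,  # one target vector per example
          unsup_weight=1.0, labeled_weight_proportion=1.0)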
def fit(self, train_dict, num_epochs, val_X=None, val_Y=None, patience=2,
        model_path=None, seed=None, word_dropout_rate=0.25,
        trg_vectors=None, unsup_weight=1.0, orthogonality_weight=0.0,
        adversarial=False):
    """
    train the model
    :param trg_vectors: the prediction targets used for the unsupervised
                        loss in temporal ensembling
    :param unsup_weight: weight for the unsupervised consistency loss
                         used in temporal ensembling
    """
    if seed:
        print(">>> using seed: ", seed, file=sys.stderr)
        random.seed(seed)  # setting random seed

    train_data = []
    for task, task_dict in train_dict.items():
        for key in ["X", "Y", "domain"]:
            assert key in task_dict, "Error: %s is not available." % key
        examples, labels, domain_tags = task_dict["X"], task_dict["Y"], \
            task_dict["domain"]
        assert examples.shape[0] == len(labels)
        # train data is a list of 4-tuples: (example, label, task_id, domain_id)
        train_data += list(
            zip(examples, labels, [task] * len(labels), domain_tags))

    print('Starting training for %d epochs...' % num_epochs)
    best_val_f1, epochs_no_improvement = 0., 0

    if val_X is not None and val_Y is not None and model_path is not None:
        print('Using early stopping with patience of %d...' % patience)

    for cur_iter in range(num_epochs):
        bar = Bar('Training epoch %d/%d...' % (cur_iter + 1, num_epochs),
                  max=len(train_data), flush=True)
        total_loss, total_constraint, total_adversarial = 0.0, 0.0, 0.0

        random_indices = np.arange(len(train_data))
        random.shuffle(random_indices)

        for i, idx in enumerate(random_indices):
            x, y, task_id, domain_id = train_data[idx]
            task_ids = [task_id]
            if task_id == 'src':
                # we train both F0 and F1 on source data
                task_ids = ['F0', 'F1']
            elif task_id == 'src_all':
                # we train F0, F1, and Ft on source data for base training
                task_ids = ['F0', 'F1', 'Ft']

            loss = 0
            outputs, constraint, adv = self.predict(
                x, task_ids, train=True, dropout_rate=word_dropout_rate,
                orthogonality_weight=orthogonality_weight,
                domain_id=domain_id if adversarial else None)

            # in temporal ensembling, we assign a dummy label of -1 for
            # unlabeled sequences; we skip the supervised loss for these
            for output in outputs:
                loss += dynet.scalarInput(0) if y == -1 else \
                    self.pick_neg_log(output, y)

                if trg_vectors is not None:
                    # the consistency loss in temporal ensembling is used for
                    # both supervised and unsupervised input
                    target = trg_vectors[idx]
                    other_loss = dynet.squared_distance(
                        output, dynet.inputVector(target))
                    loss += other_loss * unsup_weight

            # the orthogonality weight is the same for every prediction,
            # so we can add it in the end
            if orthogonality_weight != 0.0:
                # add the orthogonality constraint to the loss
                loss += constraint * orthogonality_weight
                total_constraint += constraint.value()
            if adversarial:
                total_adversarial += adv.value()
                loss += adv

            total_loss += loss.value()
            loss.backward()
            self.trainer.update()
            bar.next()

        print("\niter {}. Total loss: {:.3f}, total penalty: {:.3f}, "
              "adv: {:.3f}".format(cur_iter, total_loss / len(train_data),
                                   total_constraint / len(train_data),
                                   total_adversarial / len(train_data)),
              file=sys.stderr)

        if val_X is not None and val_Y is not None and model_path is not None:
            # get the best F1 score on the validation set
            val_f1 = self.evaluate(val_X, val_Y, 'F0')

            if val_f1 > best_val_f1:
                print('F1 %.4f is better than best val F1 %.4f.' %
                      (val_f1, best_val_f1))
                best_val_f1 = val_f1
                epochs_no_improvement = 0
                save_mttri_model(self, model_path)
            else:
                print('F1 %.4f is worse than best val F1 %.4f.' %
                      (val_f1, best_val_f1))
                epochs_no_improvement += 1
            if epochs_no_improvement == patience:
                print('No improvement for %d epochs. Early stopping...' %
                      epochs_no_improvement)
                break
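# A minimal sketch of the `train_dict` structure the fit() method above
# expects, based on its assertions: each task ("F0", "F1", "Ft", or 'src')
# maps to an array of examples ("X"), labels ("Y"), and per-example domain
# tags ("domain"). The concrete arrays (src_X, src_Y, trg_X, trg_Y, dev_X,
# dev_Y) are assumptions for illustration.
train_dict = {
    'src': {'X': src_X, 'Y': src_Y, 'domain': [0] * len(src_Y)},
    'Ft':  {'X': trg_X, 'Y': trg_Y, 'domain': [1] * len(trg_Y)},
}
model.fit(train_dict, num_epochs=10, val_X=dev_X, val_Y=dev_Y,
          patience=2, model_path='best_model')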
def fit(self, train_X, train_Y, num_epochs, val_X=None, val_Y=None,
        patience=2, model_path=None, seed=None, word_dropout_rate=0.25,
        trg_vectors=None, unsup_weight=1.0, variance_weights=None,
        labeled_weight_proportion=1.0):
    """
    train the tagger
    :param trg_vectors: the prediction targets used for the unsupervised
                        loss in temporal ensembling
    :param unsup_weight: weight for the unsupervised consistency loss
                         used in temporal ensembling
    :param clip_threshold: use gradient clipping with threshold (on if >0;
                           default: 5.0)
    :param labeled_weight_proportion: proportion of the unsupervised weight
                                      that should be assigned to labeled
                                      examples
    """
    print("read training data", file=sys.stderr)

    if variance_weights is not None:
        print('First 20 variance weights:', variance_weights[:20])

    if seed:
        print(">>> using seed: ", seed, file=sys.stderr)
        random.seed(seed)  # setting random seed

    # if we use word dropout, keep track of counts
    if word_dropout_rate > 0.0:
        widCount = Counter()
        for sentence, _ in train_X:
            widCount.update([w for w in sentence])

    assert len(train_X) == len(train_Y)
    train_data = list(zip(train_X, train_Y))

    # if we use target vectors, keep track of the targets per sentence
    if trg_vectors is not None:
        trg_start_id = 0
        sentence_trg_vectors = []
        sentence_var_weights = []
        for i, (example, y) in enumerate(train_data):
            sentence_trg_vectors.append(
                trg_vectors[trg_start_id:trg_start_id + len(example[0]), :])
            if variance_weights is not None:
                sentence_var_weights.append(
                    variance_weights[trg_start_id:trg_start_id +
                                     len(example[0])])
            trg_start_id += len(example[0])
        assert trg_start_id == len(trg_vectors), \
            'Error: Idx {} is not at {}.'.format(trg_start_id, len(trg_vectors))
        assert len(sentence_trg_vectors) == len(train_X)
        if variance_weights is not None:
            assert trg_start_id == len(variance_weights)
            assert len(sentence_var_weights) == len(train_X)

    print('Starting training for {} epochs...'.format(num_epochs))
    best_val_acc, epochs_no_improvement = 0., 0

    if val_X is not None and val_Y is not None and model_path is not None:
        print('Using early stopping with patience of {}...'.format(patience))

    for cur_iter in range(num_epochs):
        bar = Bar('Training epoch {}/{}...'.format(cur_iter + 1, num_epochs),
                  max=len(train_data), flush=True)
        total_loss = 0.0
        total_tagged = 0.0
        total_other_loss, total_other_loss_weighted = 0.0, 0.0

        random_indices = np.arange(len(train_data))
        random.shuffle(random_indices)

        for i, idx in enumerate(random_indices):
            (word_indices, char_indices), y = train_data[idx]

            if word_dropout_rate > 0.0:
                word_indices = [
                    self.w2i["_UNK"] if
                    (random.random() >
                     (widCount.get(w) /
                      (word_dropout_rate + widCount.get(w)))) else w
                    for w in word_indices
                ]
            output = self.predict(word_indices, char_indices, train=True)

            if len(y) == 1 and y[0] == 0:
                # in temporal ensembling, we assign a dummy label of [0] for
                # unlabeled sequences; we skip the supervised loss for these
                loss = dynet.scalarInput(0)
            else:
                loss = dynet.esum([
                    self.pick_neg_log(pred, gold)
                    for pred, gold in zip(output, y)
                ])

            if trg_vectors is not None:
                # the consistency loss in temporal ensembling is used for
                # both supervised and unsupervised input
                targets = sentence_trg_vectors[idx]
                assert len(output) == len(targets)

                if variance_weights is not None:
                    var_weights = sentence_var_weights[idx]
                    assert len(output) == len(var_weights)
                    # multiply the normalized mean variance with each loss
                    other_loss = dynet.esum([
                        v * dynet.squared_distance(o, dynet.inputVector(t))
                        for o, t, v in zip(output, targets, var_weights)
                    ])
                else:
                    other_loss = dynet.esum([
                        dynet.squared_distance(o, dynet.inputVector(t))
                        for o, t in zip(output, targets)
                    ])
                total_other_loss += other_loss.value()

                if len(y) == 1 and y[0] == 0:  # unlabeled example
                    other_loss += other_loss * unsup_weight
                else:  # labeled example
                    # assign the unsupervised weight for labeled examples
                    other_loss += other_loss * unsup_weight * \
                        labeled_weight_proportion

                # keep track for logging
                total_loss += loss.value()  # main loss
                total_tagged += len(word_indices)
                total_other_loss_weighted += other_loss.value()

                # combine losses
                loss += other_loss
            else:
                # keep track for logging
                total_loss += loss.value()
                total_tagged += len(word_indices)

            loss.backward()
            self.trainer.update()
            bar.next()

        if trg_vectors is None:
            print("iter {2} {0:>12}: {1:.2f}".format(
                "total loss", total_loss / total_tagged, cur_iter),
                file=sys.stderr)
        else:
            print("iter {2} {0:>12}: {1:.2f} unsupervised loss: {3:.2f} "
                  "(weighted: {4:.2f})".format(
                      "supervised loss", total_loss / total_tagged, cur_iter,
                      total_other_loss / total_tagged,
                      total_other_loss_weighted / total_tagged),
                  file=sys.stderr)

        if val_X is not None and val_Y is not None and model_path is not None:
            # get the best accuracy on the validation set
            val_correct, val_total = self.evaluate(val_X, val_Y)
            val_accuracy = val_correct / val_total

            if val_accuracy > best_val_acc:
                print('Accuracy {:.4f} is better than best val accuracy '
                      '{:.4f}.'.format(val_accuracy, best_val_acc))
                best_val_acc = val_accuracy
                epochs_no_improvement = 0
                save_tagger(self, model_path)
            else:
                print('Accuracy {:.4f} is worse than best val accuracy '
                      '{:.4f}.'.format(val_accuracy, best_val_acc))
                epochs_no_improvement += 1
            if epochs_no_improvement == patience:
                print('No improvement for {} epochs. Early stopping...'.
                      format(epochs_no_improvement))
                break
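# A minimal sketch of how the flat `trg_vectors` and `variance_weights`
# arrays sliced above might be built: per-token targets (and weights) are
# concatenated over all training sentences in order, so that the slice
# trg_vectors[start:start + sentence_length] recovers one sentence. The
# `per_sentence_targets` and `per_sentence_variances` names are assumptions
# for illustration.
import numpy as np

# each element: array of shape (num_tokens_in_sentence, num_labels)
trg_vectors = np.concatenate(per_sentence_targets, axis=0)
# each element: array of shape (num_tokens_in_sentence,)
variance_weights = np.concatenate(per_sentence_variances, axis=0)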
def fit(self, train_dict, num_epochs, val_X=None, val_Y=None, patience=2,
        model_path=None, seed=None, word_dropout_rate=0.25,
        trg_vectors=None, unsup_weight=1.0, clip_threshold=5.0,
        orthogonality_weight=0.0, adversarial=False, adversarial_weight=1.0,
        ignore_src_Ft=False):
    """
    train the tagger
    :param trg_vectors: the prediction targets used for the unsupervised
                        loss in temporal ensembling
    :param unsup_weight: weight for the unsupervised consistency loss
                         used in temporal ensembling
    :param adversarial: note: if we want to use adversarial, we have to
                        call add_adversarial_loss before
    :param adversarial_weight: 1 by default (do not weigh adv loss)
    :param ignore_src_Ft: if asymm. tri. 2nd stage, do not further train
                          Ft on 'src'
    :param train_dict: a dictionary mapping tasks ("F0", "F1", and "Ft")
                       to a dictionary
                       {"X": list of examples,
                        "Y": list of labels,
                        "domain": list of domain tags (0, 1) of examples}
                       Three tasks are indexed as "F0", "F1" and "Ft".
                       Note: if a task 'src' is given, then a single model
                       with three heads is trained where all data is given
                       to all outputs
    """
    print("read training data")

    widCount = Counter()
    train_data = []
    for task, task_dict in train_dict.items():  # task: e.g. "F0"
        for key in ["X", "Y", "domain"]:
            assert key in task_dict, "Error: %s is not available." % key
        examples, labels, domain_tags = task_dict["X"], task_dict["Y"], \
            task_dict["domain"]
        assert len(examples) == len(labels)
        if word_dropout_rate > 0.0:
            # keep track of the counts for word dropout
            for sentence, _ in examples:
                widCount.update([w for w in sentence])
        # train data is a list of 4-tuples: (example, label, task_id, domain_id)
        train_data += list(
            zip(examples, labels, [task] * len(labels), domain_tags))

    # if we use target vectors, keep track of the targets per sentence
    if trg_vectors is not None:
        trg_start_id = 0
        sentence_trg_vectors = []
        for i, (example, y) in enumerate(train_data):
            sentence_trg_vectors.append(
                trg_vectors[trg_start_id:trg_start_id + len(example[0]), :])
            trg_start_id += len(example[0])
        assert trg_start_id == len(trg_vectors), \
            'Error: Idx {} is not at {}.'.format(trg_start_id, len(trg_vectors))

    print('Starting training for {} epochs...'.format(num_epochs))
    best_val_acc, epochs_no_improvement = 0., 0

    if val_X is not None and val_Y is not None and model_path is not None:
        print('Using early stopping with patience of {}...'.format(patience))

    if seed:
        random.seed(seed)

    for cur_iter in range(num_epochs):
        bar = Bar('Training epoch {}/{}...'.format(cur_iter + 1, num_epochs),
                  max=len(train_data), flush=True)

        random_indices = np.arange(len(train_data))
        random.shuffle(random_indices)

        total_loss, total_tagged, total_constraint, total_adversarial = \
            0.0, 0.0, 0.0, 0.0
        total_orth_constr = 0  # count how many updates

        # log separate losses
        log_losses = {}
        log_total = {}
        for task_id in self.task_ids:
            log_losses[task_id] = 0.0
            log_total[task_id] = 0

        for i, idx in enumerate(random_indices):
            (word_indices, char_indices), y, task_id, domain_id = \
                train_data[idx]

            if word_dropout_rate > 0.0:
                word_indices = [
                    self.w2i["_UNK"] if
                    (random.random() >
                     (widCount.get(w) /
                      (word_dropout_rate + widCount.get(w)))) else w
                    for w in word_indices
                ]
            output, constraint, adv = self.predict(
                word_indices, char_indices, task_id, train=True,
                orthogonality_weight=orthogonality_weight,
                domain_id=domain_id if adversarial else None)

            if task_id not in ['src', 'trg']:
                if len(y) == 1 and y[0] == 0:
                    # in temporal ensembling, we assign a dummy label of [0]
                    # for unlabeled sequences; we skip the supervised loss
                    # for these
                    loss = dynet.scalarInput(0)
                else:
                    loss = dynet.esum([
                        self.pick_neg_log(pred, gold)
                        for pred, gold in zip(output, y)
                    ])

                if trg_vectors is not None:
                    # the consistency loss in temporal ensembling is used for
                    # both supervised and unsupervised input
                    targets = sentence_trg_vectors[idx]
                    assert len(output) == len(targets)
                    other_loss = unsup_weight * dynet.average([
                        dynet.squared_distance(o, dynet.inputVector(t))
                        for o, t in zip(output, targets)
                    ])
                    loss += other_loss

                if orthogonality_weight != 0.0 and task_id != 'Ft':
                    # add the orthogonality constraint to the loss
                    total_constraint += constraint.value() * \
                        orthogonality_weight
                    total_orth_constr += 1
                    loss += constraint * orthogonality_weight

                if adversarial:
                    total_adversarial += adv.value() * adversarial_weight
                    loss += adv * adversarial_weight

                total_loss += loss.value()  # for output
                log_losses[task_id] += total_loss
                total_tagged += len(word_indices)
                log_total[task_id] += total_tagged

                loss.backward()
                self.trainer.update()
                bar.next()
            else:
                # bootstrap=False, the output contains a list of outputs,
                # one for each task
                assert trg_vectors is None, \
                    'temporal ensembling not implemented for bootstrap=False'
                loss = dynet.scalarInput(1)  # initialize
                if ignore_src_Ft:
                    # ignore last = Ft when further training with 'src'
                    output = output[:-1]

                for t_i, output_t in enumerate(output):
                    # get loss for each task
                    loss += dynet.esum([
                        self.pick_neg_log(pred, gold)
                        for pred, gold in zip(output_t, y)
                    ])
                    task_id = self.task_ids[t_i]
                    log_losses[task_id] += total_loss
                    log_total[task_id] += total_tagged

                if orthogonality_weight != 0.0:
                    # add the orthogonality constraint to the loss
                    total_constraint += constraint.value() * \
                        orthogonality_weight
                    total_orth_constr += 1
                    loss += constraint * orthogonality_weight

                if adversarial:
                    total_adversarial += adv.value() * adversarial_weight
                    loss += adv * adversarial_weight

                total_loss += loss.value()  # for output
                total_tagged += len(word_indices)

                loss.backward()
                self.trainer.update()
                bar.next()

        if adversarial and orthogonality_weight:
            print("iter {}. Total loss: {:.3f}, total penalty: {:.3f}, "
                  "total weighted adv loss: {:.3f}".format(
                      cur_iter, total_loss / total_tagged,
                      total_constraint / total_orth_constr,
                      total_adversarial / total_tagged),
                  file=sys.stderr)
        elif orthogonality_weight:
            print("iter {}. Total loss: {:.3f}, total penalty: {:.3f}".format(
                cur_iter, total_loss / total_tagged,
                total_constraint / total_orth_constr),
                file=sys.stderr)
        else:
            print("iter {}. Total loss: {:.3f} ".format(
                cur_iter, total_loss / total_tagged),
                file=sys.stderr)

        for task_id in self.task_ids:
            if log_total[task_id] > 0:
                print("{0}: {1:.3f}".format(
                    task_id, log_losses[task_id] / log_total[task_id]))

        if val_X is not None and val_Y is not None and model_path is not None:
            # get the best accuracy on the validation set
            val_correct, val_total = self.evaluate(val_X, val_Y)
            val_accuracy = val_correct / val_total

            if val_accuracy > best_val_acc:
                print('Accuracy {:.4f} is better than best val accuracy '
                      '{:.4f}.'.format(val_accuracy, best_val_acc))
                best_val_acc = val_accuracy
                epochs_no_improvement = 0
                save_tagger(self, model_path)
            else:
                print('Accuracy {:.4f} is worse than best val accuracy '
                      '{:.4f}.'.format(val_accuracy, best_val_acc))
                epochs_no_improvement += 1
            if epochs_no_improvement == patience:
                print('No improvement for {} epochs. Early stopping...'.
                      format(epochs_no_improvement))
                break
def eval_loss(self, case_x, case_class):
    output = self.evaluate_network(case_x)
    y_param = dy.vecInput(2)
    y_param.set(one_hot(2, case_class))
    loss = dy.squared_distance(output, y_param)
    return loss
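# `one_hot` is not defined in this snippet; a minimal sketch of a helper with
# the shape eval_loss expects (a list of length `size` with a 1.0 at `index`,
# suitable for vecInput.set) might look like this:
def one_hot(size, index):
    """Return a dense one-hot list of the given size."""
    vec = [0.0] * size
    vec[index] = 1.0
    return vec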