def train_network(params, ntags, train_data, dev_set, telemetry_file, randstring, very_common_tag=-1):
    global MIN_ACC
    prev_acc = 0
    m = params[0]
    t0 = time.clock()
    # train the network
    trainer = dy.SimpleSGDTrainer(m)
    total_loss = 0
    seen_instances = 0
    train_good = 0
    very_common_tag_count = 0
    for x_data, train_y in train_data:
        dy.renew_cg()
        output = build_network(params, x_data)
        # l2 regularization did not look promising at all, so it's commented out
        loss = -dy.log(output[train_y])  # + REG_LAMBDA * sum([dy.l2_norm(p) for p in params[2:]])
        if train_y == np.argmax(output.npvalue()):
            train_good += 1
        seen_instances += 1
        total_loss += loss.value()
        loss.backward()
        trainer.update()

        if seen_instances % 20000 == 0:
            # measure elapsed seconds
            secs = time.clock() - t0
            t0 = time.clock()
            good = case = 0
            max_dev_instances = 70 * 1000
            dev_instances = 0
            for x_tuple, dev_y in dev_set:
                # evaluate each dev example on a fresh graph; the training update
                # for this iteration has already been applied above
                dy.renew_cg()
                output = build_network(params, x_tuple)
                y_hat = np.argmax(output.npvalue())
                case += 1
                if y_hat == dev_y and y_hat == very_common_tag:
                    case -= 1  # don't count this case
                    very_common_tag_count += 1
                elif y_hat == dev_y:
                    good += 1
                dev_instances += 1
                if dev_instances >= max_dev_instances:
                    break
            acc = float(good) / case
            print("iterations: {}. train_accuracy: {} accuracy: {} avg loss: {} secs per 1000: {}".format(
                seen_instances, float(train_good) / 20000, acc,
                total_loss / (seen_instances + 1), secs / 20))
            train_good = 0
            if acc > MIN_ACC and acc > prev_acc:
                print("saving.")
                dy.save("params_" + randstring, list(params)[1:])
                prev_acc = acc
            telemetry_file.write("{}\t{}\t{}\t{}\n".format(
                seen_instances, acc, total_loss / (seen_instances + 1), secs / 20))
    print("very common tag count: {}".format(very_common_tag_count))
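
# The loop above relies on an external build_network(params, x_data) that returns a
# softmax-normalized expression over ntags (the loss is -log(output[train_y])).
# The helper below is a purely illustrative sketch of such a function, not the
# original implementation: it assumes params is laid out as
# (model, lookup, W1, b1, W2, b2), which is consistent with params[0] being the
# model and params[2:] being the weight parameters above, and that x_data is a
# tuple of word indices forming a context window. It also assumes a DyNet version
# that allows Parameters to be used directly inside expressions.
def build_network_sketch(params, x_data):
    _model, lookup, W1, b1, W2, b2 = params
    # concatenate the window's embeddings into one input vector
    x = dy.concatenate([dy.lookup(lookup, i) for i in x_data])
    h = dy.tanh(W1 * x + b1)          # hidden layer
    return dy.softmax(W2 * h + b2)    # probability distribution over the ntags tags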
def train_batched(self, tasks, batch_size, scale_gradient_factor, validation_data, seqs_trg,
                  early_stopping, patience, num_epochs, min_num_epochs, num_updates,
                  prob_main_task, prob_adv):
    trainer = dn.SimpleSGDTrainer(self.model)
    # stores best observed validation accuracy
    val_best = 0
    # stores the number of iterations without improvement
    no_improvement = 0
    val_prev = 0
    for epoch in range(num_epochs):
        sum_losses = 0
        adversarial_loss = 0
        losses_prediction_task = []
        losses_aux_task = []
        batch_dict = self.generate_batches_across_tasks(tasks, batch_size)
        # number of updates is twice the length of the main task batch list
        num_updates = len(batch_dict[self.prediction_layer]) * 2
        print(num_updates)
        # logging.info('Number of updates to do: {}'.format(num_updates))

        # sample batches according to some schema
        update_counter = 0
        while update_counter <= num_updates:
            update_counter += 1
            # with prob 1-prob_adv, do a task update
            outcome = np.random.binomial(1, prob_adv, size=None)
            if outcome == 0:
                task_id, batch_ids = self.sample_task_batch(batch_dict, prob_main_task=prob_main_task)
                losses = []
                dn.renew_cg()
                # iterate through the batch
                for i in batch_ids:
                    seq = tasks[task_id].train_seqs[i]
                    label = tasks[task_id].train_labels[i]
                    loss = self.compute_loss_multilabel(task_id, seq, label)
                    losses.append(loss)
                batch_loss = dn.esum(losses) / len(batch_ids)
                batch_loss_value = batch_loss.value()
                batch_loss.backward()
                trainer.update()
                sum_losses += batch_loss_value
                if task_id == self.prediction_layer:
                    losses_prediction_task.append(batch_loss_value)
                else:
                    losses_aux_task.append(batch_loss_value)
            else:
                # do adversarial step
                losses = []
                dn.renew_cg()
                seqs, labels = self.generate_adversarial_batch(
                    seqs_src=tasks[self.src_domain].train_seqs,
                    seqs_trg=seqs_trg,
                    batch_size=batch_size)
                for i in range(len(seqs)):
                    seq = seqs[i]
                    label = labels[i]
                    loss = self.compute_loss_multilabel(task='adversarial', seq=seq, multi_y=label)
                    losses.append(loss)
                batch_loss = dn.esum(losses) / len(seqs)
                batch_loss_value = batch_loss.value()
                batch_loss.backward()
                trainer.update()
                adversarial_loss += batch_loss_value

        # compute the validation accuracy to monitor early stopping
        # use the micro averaged f as criterion
        res = evaluate_model_predictions(
            self.predict(self.main_task, validation_data['seq']),
            validation_data['label'], validation_data['labelset'])
        f_avg = res['f_avg']
        logging.info(
            'Epoch {}. Sum loss: {}. Avg loss: {}. Avg loss predtask {}. Avg loss aux tasks: {}. '
            'No improv: {}. Best f_val: {}. Avg f_val: {}'.format(
                epoch, sum_losses, sum_losses / num_updates, np.mean(losses_prediction_task),
                np.mean(losses_aux_task), no_improvement, val_best, f_avg))
        logging.info(
            'Epoch {}. Adv loss: {}. Avg loss: {}. Avg loss predtask {}. Avg loss aux tasks: {}. '
            'No improv: {}. Best f_val: {}. Avg f_val: {}'.format(
                epoch, adversarial_loss, sum_losses / num_updates, np.mean(losses_prediction_task),
                np.mean(losses_aux_task), no_improvement, val_best, f_avg))

        # init early stopping after min number of epochs
        if epoch == min_num_epochs - 1:
            val_prev = f_avg
            no_improvement = 0
            self.save(self.exp_path)

        # if early_stopping:
        if f_avg <= val_prev:
            no_improvement += 1
            if early_stopping:
                if no_improvement >= patience and epoch > min_num_epochs:
                    break
        else:
            if epoch >= min_num_epochs:
                self.save(self.exp_path)
            no_improvement = 0
            if f_avg >= val_best:
                val_best = f_avg
            val_prev = f_avg

    return epoch, f_avg, sum_losses, no_improvement, val_best
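
# The schedule above alternates main-task, auxiliary-task, and adversarial updates.
# The helper below is a purely illustrative sketch (not part of the original class)
# of what sample_task_batch could do under that scheme: pick the main prediction
# task with probability prob_main_task, otherwise an auxiliary task uniformly at
# random, and return one pre-generated batch of training indices for the chosen task.
def sample_task_batch_sketch(self, batch_dict, prob_main_task=0.5):
    if np.random.binomial(1, prob_main_task) == 1:
        task_id = self.prediction_layer
    else:
        aux_tasks = [t for t in batch_dict if t != self.prediction_layer]
        task_id = aux_tasks[np.random.randint(len(aux_tasks))]
    # each entry of batch_dict[task_id] is one batch, i.e. a list of training indices
    batch_ids = batch_dict[task_id][np.random.randint(len(batch_dict[task_id]))]
    return task_id, batch_ids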
    # last = dy.cmult(layers[-1], me) + e
    # print("gradient", last.value())
    # log_loss = dy.log(last + epsilon)
    # print(log_loss.value())
    ys = dy.vecInput(self.dim_out)
    ys.set([1 if i in targets else 0 for i in range(self.dim_out)])
    loss = dy.binary_log_loss(layers[-1], ys)
    return dy.sum_elems(loss)


if __name__ == "__main__":
    import dynet as dy

    model = dy.Model()
    trainer = dy.SimpleSGDTrainer(model)
    classifier = MLP_sigmoid(2, 2, 2, 10, dy.rectify, model)
    dataset = [([-1, -1], {0}), ([-1, 1], {1}), ([1, -1], {1}), ([1, 1], {0})]
    for e in range(10040):
        for xs, y in dataset:
            dy.renew_cg()
            x = dy.vecInput(2)
            x.set(xs)
            l = classifier.get_loss(x, y)
            l.backward()
            trainer.update()
    loss = 0
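
    # A small sanity check one could append after training (illustrative only;
    # it uses nothing beyond the get_loss method defined above): the summed
    # binary log loss over the four XOR examples should be close to zero once
    # training has converged.
    total = 0.0
    for xs, y in dataset:
        dy.renew_cg()
        x = dy.vecInput(2)
        x.set(xs)
        total += classifier.get_loss(x, y).value()
    print("total loss on the XOR dataset: {}".format(total))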
params_decoder["R"] = pc.add_parameters((VOCAB_SIZE_out, HIDDEN_DIM)) params_decoder["bias"] = pc.add_parameters((VOCAB_SIZE_out)) params_decoder["attention_w"] = pc.add_parameters((ATTENTION_DIM, HIDDEN_DIM)) params_decoder["attention_b"] = pc.add_parameters((ATTENTION_DIM)) params_decoder["attention_wc"] = pc.add_parameters((ATTENTION_DIM, HIDDEN_DIM)) params_decoder["attention_bc"] = pc.add_parameters((ATTENTION_DIM, HIDDEN_DIM)) params_decoder["attention_v"] = pc.add_parameters((1, ATTENTION_DIM)) char_encoder = dy.CompactVanillaLSTMBuilder(LAYERS, 50, 75, pc) params_encoder["char_lookup"] = pc.add_lookup_parameters((VOCAB_char, 50)) params_encoder["pos_lookup"] = pc.add_lookup_parameters((7, 25)) dropout = 0.05 encoder.set_dropouts(0, dropout) decoder.set_dropouts(0, dropout) char_encoder.set_dropouts(0, dropout) trainer = dy.SimpleSGDTrainer(pc) ## TRAIN ### 200 epoch unless dev acc. on instruction above 0.58 dev_interaction = [] dev_instruction = [] for i in range(200): print('Epoch%d' % i) count = 0 sum = 0 batch_loss = [] dy.renew_cg() previous = None first = True for sentence, output, env in zip(ins, act, env_int): if count % 5 != 0:
def __init__(self, model, type, lrate, moment=None):
    # construct only the requested trainer: building all three eagerly would
    # fail when moment is None and would allocate optimizer state needlessly
    self._tt = {
        "sgd": lambda: dy.SimpleSGDTrainer(model, lrate),
        "momentum": lambda: dy.MomentumSGDTrainer(model, lrate, moment),
        "adam": lambda: dy.AdamTrainer(model, lrate),
    }[type]()
def train_network(self, train_data, epochs=3, dev_data=None, test_data=None):
    trainer = dy.SimpleSGDTrainer(self.pc, 0.05)
    i = 0
    mloss = 0.
    goods = 0.
    loss = []
    dy.renew_cg()
    max_dev_acc = MIN_SAVE_ACC
    run_id = randint(0, 9999)
    save_path = "{}{:04d}".format(SAVE_TO, run_id)
    report_path = "{}{:04d}.txt".format(SAVE_REPORT_TO, run_id)
    test_path = "{}{:04d}.txt".format(SAVE_TAGGED_TEST_TO, run_id)
    rprt = open(report_path, 'wt')
    print(report_path)
    for e in range(epochs):
        shuffle(train_data)
        for x, y in train_data:
            i = i + 1
            loss = loss + [self.eval_loss(x, y, dropout=True)]
            good = y == self.last_case_class
            goods += int(good)
            if i % UPDATE_EVERY == 0:
                losses = dy.esum(loss)
                mloss += losses.value()
                losses.backward()
                trainer.update()
                loss = []
                dy.renew_cg()
            if i % EVALUATE_LOSS_EVERY == 1000:
                goods_dev = 0.
                j = 0
                for d in dev_data or []:
                    dy.renew_cg()
                    j += 1
                    x, y = d
                    self.eval_loss(x, y)
                    goods_dev += 1 if y == self.last_case_class else 0
                dev_acc = goods_dev / len(dev_data or 'a')
                message = "{} average loss after {} iterations: {} acc: {}".format(
                    now_string(), i, mloss / EVALUATE_LOSS_EVERY, goods / EVALUATE_LOSS_EVERY)
                dev_acc_str = " dev acc: {}".format(dev_acc) if dev_data else ""
                print(message + dev_acc_str)
                rprt.write(message + dev_acc_str + '\n')
                mloss = 0.
                goods = 0.
                if dev_acc > max_dev_acc and i > START_SAVE_AFTER:
                    max_dev_acc = dev_acc
                    print("saving.")
                    rprt.write("saving.\n")
                    self.save(save_path)
                    if test_data:
                        outf = open(test_path, 'wt')
                        k = 0
                        goods_test = 0.
                        print("tagging test data.")
                        for dd in test_data:
                            dy.renew_cg()
                            k += 1
                            x, y = dd
                            self.eval_loss(x, y)
                            y_hat = self.last_case_class
                            goods_test += 1 if y == y_hat else 0
                            outf.write("{}{}{}\n".format(x, y, y_hat))
                        outf.close()
                        test_acc = goods_test / len(test_data)
                        print("accuracy on test: {}".format(test_acc))
                rprt.flush()
def train_model(model, encoder, decoder, params, train_inputs, train_outputs, dev_inputs,
                dev_outputs, y2int, int2y, epochs, optimization, results_file_path, plot,
                batch_size, eval_after, min_epochs):
    print 'training...'
    sys.stdout.flush()

    np.random.seed(17)
    random.seed(17)

    # sort training sentences by length in descending order
    train_data = zip(train_inputs, train_outputs)
    train_data.sort(key=lambda t: -len(t[0]))
    train_order = [x * batch_size for x in range(len(train_data) / batch_size + 1)]

    # sort dev sentences by length in descending order
    dev_batch_size = 1
    dev_data = zip(dev_inputs, dev_outputs)
    dev_data.sort(key=lambda t: -len(t[0]))
    dev_order = [x * dev_batch_size for x in range(len(dev_data) / dev_batch_size + 1)]

    if optimization == 'ADAM':
        trainer = dn.AdamTrainer(model)  # lam=REGULARIZATION, alpha=LEARNING_RATE, beta_1=0.9, beta_2=0.999, eps=1e-8
    elif optimization == 'MOMENTUM':
        trainer = dn.MomentumSGDTrainer(model)
    elif optimization == 'SGD':
        trainer = dn.SimpleSGDTrainer(model)
    elif optimization == 'ADAGRAD':
        trainer = dn.AdagradTrainer(model)
    elif optimization == 'ADADELTA':
        trainer = dn.AdadeltaTrainer(model)
    else:
        trainer = dn.SimpleSGDTrainer(model)

    trainer.set_clip_threshold(float(arguments['--grad-clip']))
    seen_examples_count = 0
    total_loss = 0
    best_dev_epoch = 0
    best_train_epoch = 0
    patience = 0
    train_len = len(train_outputs)
    dev_len = len(dev_inputs)
    avg_train_loss = -1
    train_loss_patience = 0
    train_loss_patience_threshold = 99999999
    max_patience = int(arguments['--max-patience'])
    log_path = results_file_path + '_log.txt'
    start_epoch, checkpoints_x, train_loss_y, dev_loss_y, dev_accuracy_y = read_from_log(log_path)

    if len(train_loss_y) > 0:
        total_batches = checkpoints_x[-1]
        best_avg_train_loss = max(train_loss_y)
        best_dev_accuracy = max(dev_accuracy_y)
        best_dev_loss = max(dev_loss_y)
    else:
        total_batches = 0
        best_avg_train_loss = 999999
        best_dev_loss = 999999
        best_dev_accuracy = 0

    # progress bar init
    # noinspection PyArgumentList
    # widgets = [progressbar.Bar('>'), ' ', progressbar.ETA()]
    # train_progress_bar = progressbar.ProgressBar(widgets=widgets, maxval=epochs).start()

    e = -1
    for e in xrange(start_epoch, epochs):
        try:
            # shuffle the batch start indices in each epoch
            random.shuffle(train_order)
            batches_per_epoch = len(train_order)
            start = time.time()

            # go through batches
            for i, batch_start_index in enumerate(train_order, start=1):
                # get batch examples
                batch_inputs = [x[0] for x in train_data[batch_start_index:batch_start_index + batch_size]]
                batch_outputs = [x[1] for x in train_data[batch_start_index:batch_start_index + batch_size]]
                actual_batch_size = len(batch_inputs)

                # skip empty batches
                if actual_batch_size == 0 or len(batch_inputs[0]) == 0:
                    continue

                # compute batch loss
                # debug prints for batch seq lengths
                # print 'batch {} seq lens'.format(i)
                # print [len(s) for s in batch_inputs]
                loss = compute_batch_loss(encoder, decoder, batch_inputs, batch_outputs, y2int)

                # forward pass
                total_loss += loss.scalar_value()
                loss.backward()
                total_batches += 1

                # update parameters
                trainer.update()

                seen_examples_count += actual_batch_size

                # avg loss per sample
                avg_train_loss = total_loss / float(i * batch_size + e * train_len)

                # start patience counts only after 20 batches
                if avg_train_loss < best_avg_train_loss and total_batches > 20:
                    best_avg_train_loss = avg_train_loss
                    train_loss_patience = 0
                else:
                    train_loss_patience += 1
                    if train_loss_patience > train_loss_patience_threshold:
                        print 'train loss patience exceeded: {}'.format(train_loss_patience)
                        sys.stdout.flush()
                        return model, params, e, best_dev_epoch

                if total_batches % 100 == 0 and total_batches > 0:
                    print 'epoch {}: {} batches out of {} ({} examples out of {}) total: {} batches, {} examples. ' \
                          'avg loss per example: {}'.format(e, i, batches_per_epoch, i * batch_size, train_len,
                                                            total_batches, total_batches * batch_size, avg_train_loss)
                    sys.stdout.flush()

                    # print sentences per second
                    end = time.time()
                    elapsed_seconds = end - start
                    print '{} sentences per second'.format(seen_examples_count / elapsed_seconds)
                    sys.stdout.flush()
                    seen_examples_count = 0
                    start = time.time()

                # checkpoint
                if total_batches % eval_after == 0:
                    print 'starting checkpoint evaluation'
                    sys.stdout.flush()
                    dev_bleu, dev_loss = checkpoint_eval(
                        encoder, decoder, params, dev_batch_size, dev_data, dev_inputs, dev_len,
                        dev_order, dev_outputs, int2y, y2int, results_file_path=results_file_path)

                    log_to_file(log_path, e, total_batches, avg_train_loss, dev_loss, dev_bleu)
                    save_model(model, results_file_path, total_batches,
                               models_to_save=int(arguments['--models-to-save']))

                    if dev_bleu > best_dev_accuracy:
                        best_dev_accuracy = dev_bleu
                        best_dev_epoch = e
                        # save best model to disk
                        save_best_model(model, results_file_path)
                        print 'saved new best model'
                        sys.stdout.flush()
                        patience = 0
                    else:
                        patience += 1

                    if dev_loss < best_dev_loss:
                        best_dev_loss = dev_loss

                    print 'epoch: {0} train loss: {1:.4f} dev loss: {2:.4f} dev bleu: {3:.4f} ' \
                          'best dev bleu {4:.4f} (epoch {5}) patience = {6}'.format(
                              e, avg_train_loss, dev_loss, dev_bleu, best_dev_accuracy, best_dev_epoch, patience)
                    sys.stdout.flush()

                    if (patience == max_patience) and (e >= min_epochs):
                        print 'out of patience after {0} checkpoints'.format(str(e))
                        sys.stdout.flush()
                        # train_progress_bar.finish()
                        if plot:
                            plt.cla()
                        print 'checkpoint patience exceeded'
                        sys.stdout.flush()
                        return model, params, e, best_dev_epoch

                    # plotting results from checkpoint evaluation
                    if plot:
                        train_loss_y.append(avg_train_loss)
                        checkpoints_x.append(total_batches)
                        dev_accuracy_y.append(dev_bleu)
                        dev_loss_y.append(dev_loss)
                        y_vals = [('train_loss', train_loss_y), ('dev loss', dev_loss_y),
                                  ('dev_bleu', dev_accuracy_y)]
                        common.plot_to_file(y_vals, x_name='total batches', x_vals=checkpoints_x,
                                            file_path=results_file_path + '_learning_curve.png')
        except RuntimeError as exception:
            # sometimes the above two instructions fail due to memory allocation failure.
            # I was unable to find a fix for these failures.
            # perhaps we can just "skip" the failures.
            print 'WARNING: Skipping epoch due to RuntimeError (' + str(exception) + ')'
            sys.stdout.flush()

        # update progress bar after completing epoch
        # train_progress_bar.update(e)

    # update progress bar after completing training
    # train_progress_bar.finish()
    if plot:
        # clear plot when done
        plt.cla()

    print 'finished training. average loss: {} best epoch on dev: {} best epoch on train: {}'.format(
        str(avg_train_loss), best_dev_epoch, best_train_epoch)
    sys.stdout.flush()

    return model, params, e, best_dev_epoch