import json
import logging
import os
import sys
import time

import numpy as np
import tensorflow as tf

# NOTE: FLAGS, pair_iter, prepare_nlc_data, initialize_vocabulary, get_tokenizer,
# create_model, decode_beam, detokenize, detokenize_tgt, network_score, lm_rank,
# lm_rank_score, lm and reverse_vocab are assumed to be defined elsewhere in this
# module / project.


def validate(model, sess, x_dev, y_dev):
    """Compute the average per-token cost of the model on the dev set."""
    valid_costs, valid_lengths = [], []
    for source_tokens, source_mask, target_tokens, target_mask in pair_iter(
            x_dev, y_dev, FLAGS.batch_size, FLAGS.num_layers):
        cost = model.test(sess, source_tokens, source_mask, target_tokens, target_mask)
        # Weight each batch cost by the batch width so batches of different sizes
        # contribute proportionally, then normalize by the total target length.
        valid_costs.append(cost * target_mask.shape[1])
        valid_lengths.append(np.sum(target_mask[1:, :]))
    valid_cost = sum(valid_costs) / float(sum(valid_lengths))
    return valid_cost
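
# Illustrative sketch (not part of the original code): the normalization used by
# validate() on toy numbers, assuming model.test() returns a per-example mean cost
# (hence the re-weighting by the batch width target_mask.shape[1]). The numbers are
# made up for illustration only.
def _toy_valid_cost_example():
    batch_costs = [2.0, 3.0]   # mean cost from model.test() for two dev batches
    batch_widths = [4, 2]      # target_mask.shape[1] for each batch
    token_counts = [28, 15]    # np.sum(target_mask[1:, :]) for each batch
    # (2.0 * 4 + 3.0 * 2) / (28 + 15) ~= 0.326 cost per target token
    return sum(c * w for c, w in zip(batch_costs, batch_widths)) / float(sum(token_counts))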
def batch_decode(model, sess, x_dev, y_dev, alpha):
    error_source, error_target, error_generated = [], [], []
    generated_score, generated_lm_score, generated_nw_score = [], [], []
    target_lm_score, target_nw_score = [], []
    count = 0
    for source_tokens, source_mask, target_tokens, target_mask in pair_iter(
            x_dev, y_dev, 1, FLAGS.num_layers, sort_and_shuffle=False):
        src_sent = detokenize_tgt(source_tokens, reverse_vocab)
        tgt_sent = detokenize_tgt(target_tokens, reverse_vocab)
        # Encode
        encoder_output = model.encode(sess, source_tokens, source_mask)
        # Decode
        beam_toks, probs = decode_beam(model, sess, encoder_output, FLAGS.beam_size)
        # De-tokenize
        beam_strs = detokenize(beam_toks, reverse_vocab)
        tgt_nw_score = network_score(model, sess, encoder_output, target_tokens)
        print("pair: %d network score: %f" % (count + 1, tgt_nw_score))
        # Language model ranking
        if not FLAGS.score:
            best_str = lm_rank(beam_strs, probs)
        else:
            best_str, rerank_score, nw_score, lm_score = lm_rank_score(beam_strs, probs)
            tgt_lm_score = lm.score(tgt_sent) / len(tgt_sent.split())
        # Print the triple to eyeball whether the hypotheses are reasonable.
        print("%s | %s | %s" % (src_sent, tgt_sent, best_str))

        error_source.append(src_sent)
        error_target.append(tgt_sent)
        error_generated.append(best_str)
        if FLAGS.score:
            target_lm_score.append(tgt_lm_score)
            target_nw_score.append(tgt_nw_score)
            generated_score.append(rerank_score)
            generated_nw_score.append(nw_score)
            generated_lm_score.append(lm_score)
        count += 1

    # CSV dump of reranking diagnostics, currently disabled:
    """
    print("outputting in csv file...")
    # dump it out in train_dir
    with open("err_val_alpha_" + str(alpha) + ".csv", 'wb') as f:
        wrt = csv.writer(f)
        wrt.writerow(['Bad Input', 'Ground Truth', 'Network Score', 'LM Score',
                      'Generated Hypothesis', 'Combined Score', 'Network Score', 'LM Score'])
        if not FLAGS.score:
            for s, t, g in itertools.izip(error_source, error_target, error_generated):
                wrt.writerow([s, t, g])  # source, correct target, wrong target
        else:
            for s, t, tns, tls, g, gs, gns, gls in itertools.izip(
                    error_source, error_target, target_nw_score, target_lm_score,
                    error_generated, generated_score, generated_nw_score, generated_lm_score):
                wrt.writerow([s, t, tns, tls, g, gs, gns, gls])
    """
    # print("err_val_alpha_" + str(alpha) + ".csv" + "file finished")

    with open(FLAGS.tokenizer.lower() + "_runs" + str(FLAGS.beam_size) +
              "/alpha" + str(alpha) + ".txt", 'wb') as f:
        f.write("\n".join(error_generated))
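
# Hypothetical sketch (the real lm_rank / lm_rank_score helpers are defined elsewhere
# in the project and may differ): one common way to rerank beam hypotheses with a
# language model is to interpolate the length-normalized network score with the
# length-normalized LM score, weighted by alpha. Assumes `probs` holds the model's
# score for each hypothesis and `lm` exposes a KenLM-style .score(str) method.
def _lm_rank_sketch(beam_strs, probs, lm, alpha):
    best_str, best_score = beam_strs[0], float('-inf')
    for hyp, network_prob in zip(beam_strs, probs):
        n_toks = len(hyp.split()) + 1
        nw_score = network_prob / n_toks       # length-normalized network score
        lm_score = lm.score(hyp) / n_toks      # length-normalized LM score
        combined = (1.0 - alpha) * nw_score + alpha * lm_score
        if combined > best_score:
            best_str, best_score = hyp, combined
    return best_str, best_score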
def train():
    """Train a translation model using NLC data."""
    # Prepare NLC data.
    logging.info("Preparing NLC data in %s" % FLAGS.data_dir)
    x_train, y_train, x_dev, y_dev, vocab_path = prepare_nlc_data(
        FLAGS.data_dir, FLAGS.max_vocab_size, tokenizer=get_tokenizer())
    vocab, _ = initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    logging.info("Vocabulary size: %d" % vocab_size)

    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    file_handler = logging.FileHandler("{0}/log.txt".format(FLAGS.train_dir))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.train_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        logging.info("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)

        logging.info('Initial validation cost: %f' % validate(model, sess, x_dev, y_dev))

        epoch = 0
        previous_losses = []
        exp_cost = None
        exp_length = None
        exp_norm = None
        total_iters = 0
        start_time = time.time()
        while FLAGS.epochs == 0 or epoch < FLAGS.epochs:
            epoch += 1
            current_step = 0

            # Train
            epoch_tic = time.time()
            for source_tokens, source_mask, target_tokens, target_mask in pair_iter(
                    x_train, y_train, FLAGS.batch_size, FLAGS.num_layers):
                # Get a batch and make a step.
                grad_norm, cost, param_norm = model.train(
                    sess, source_tokens, source_mask, target_tokens, target_mask)

                total_iters += np.sum(target_mask)
                tps = total_iters / (time.time() - start_time)
                current_step += 1

                lengths = np.sum(target_mask, axis=0)
                mean_length = np.mean(lengths)
                std_length = np.std(lengths)

                # Exponential moving averages of cost, length and gradient norm.
                if not exp_cost:
                    exp_cost = cost
                    exp_length = mean_length
                    exp_norm = grad_norm
                else:
                    exp_cost = 0.99 * exp_cost + 0.01 * cost
                    exp_length = 0.99 * exp_length + 0.01 * mean_length
                    exp_norm = 0.99 * exp_norm + 0.01 * grad_norm

                cost = cost / mean_length

                if current_step % FLAGS.print_every == 0:
                    logging.info(
                        'epoch %d, iter %d, cost %f, exp_cost %f, grad norm %f, param norm %f, '
                        'tps %f, length mean/std %f/%f' %
                        (epoch, current_step, cost, exp_cost / exp_length, grad_norm,
                         param_norm, tps, mean_length, std_length))
            epoch_toc = time.time()

            # Checkpoint
            checkpoint_path = os.path.join(FLAGS.train_dir, "best.ckpt")

            # Validate
            valid_cost = validate(model, sess, x_dev, y_dev)
            logging.info("Epoch %d Validation cost: %f time: %f" %
                         (epoch, valid_cost, epoch_toc - epoch_tic))

            # If validation cost stops improving, anneal the learning rate and roll
            # back to the best checkpoint; otherwise record the loss and save a new
            # best checkpoint.
            if len(previous_losses) > 2 and valid_cost > previous_losses[-1]:
                logging.info("Annealing learning rate by %f" % FLAGS.learning_rate_decay_factor)
                sess.run(model.learning_rate_decay_op)
                model.saver.restore(sess, checkpoint_path)
            else:
                previous_losses.append(valid_cost)
                model.saver.save(sess, checkpoint_path)
            sys.stdout.flush()
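
# Hypothetical usage sketch (the project's real entry point lives elsewhere): a
# minimal main() that runs training via tf.app.run, assuming FLAGS is a tf.app.flags
# FLAGS object as used above. Left commented out so importing this module does not
# trigger training.
def _main_sketch(_):
    train()

# if __name__ == "__main__":
#     tf.app.run(main=_main_sketch)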