def batch_decode(model, sess, x_dev, y_dev, alpha):
    """Decode every dev pair and dump an error-analysis CSV.

    Each pair produced by ``pair_iter`` (batch size 1) is encoded,
    beam-decoded with ``FLAGS.beam_size`` and re-ranked with ``lm_rank`` or,
    when ``FLAGS.score`` is set, ``lm_rank_score``.  Hypotheses that differ
    from the target string are written first, followed by one empty row and
    then the hypotheses that match the target exactly.

    Row layout:
        * ``FLAGS.score`` off: source, target, hypothesis
        * ``FLAGS.score`` on:  source, target, target score, hypothesis,
          hypothesis score

    Args:
        model: object providing ``encode(sess, tokens, mask)``; also handed
            to ``decode_beam``.
        sess: session object passed through to the model calls.
        x_dev: source-side data handed to ``pair_iter``.
        y_dev: target-side data handed to ``pair_iter``.
        alpha: only used to build the output file name
            ``<FLAGS.train_dir>/err_analysis/err_val_alpha_<alpha>.csv``.
    """
    import os  # local import: needed only to create the output directory

    error_rows = []
    correct_rows = []

    for source_tokens, source_mask, target_tokens, target_mask in pair_iter(
            x_dev, y_dev, 1, FLAGS.num_layers):
        src_sent = detokenize_tgt(source_tokens, reverse_vocab)
        tgt_sent = detokenize_tgt(target_tokens, reverse_vocab)

        # Encode
        encoder_output = model.encode(sess, source_tokens, source_mask)
        # Decode
        beam_toks, probs = decode_beam(model, sess, encoder_output, FLAGS.beam_size)
        # De-tokenize
        beam_strs = detokenize(beam_toks, reverse_vocab)

        # Language Model ranking; the row is built right away so that the
        # five/three columns cannot drift out of step with each other.
        if FLAGS.score:
            best_str, tgt_score, rerank_score = lm_rank_score(beam_strs, tgt_sent, probs)
            row = [src_sent, tgt_sent, tgt_score, best_str, rerank_score]
        else:
            best_str = lm_rank(beam_strs, probs)
            row = [src_sent, tgt_sent, best_str]

        if best_str != tgt_sent:
            error_rows.append(row)
        else:
            correct_rows.append(row)

    print("outputting in csv file...")

    # dump it out in train_dir
    out_dir = FLAGS.train_dir + "/err_analysis/"
    out_name = "err_val_alpha_" + str(alpha) + ".csv"
    if not os.path.isdir(out_dir):
        # Without this the open() below fails on a fresh train_dir.
        os.makedirs(out_dir)

    with open(out_dir + out_name, 'wb') as f:
        wrt = csv.writer(f)
        wrt.writerows(error_rows)
        wrt.writerow([])  # space between error and correct
        wrt.writerows(correct_rows)

    print(out_name + " file finished")
def validate(model, sess, x_dev, y_dev):
    """Return the length-normalised validation cost over the dev data.

    For every batch from ``pair_iter`` the value returned by ``model.test``
    is multiplied by ``target_mask.shape[1]`` and accumulated; the total is
    divided by the sum of all ``target_mask`` entries excluding the first
    row of each mask.

    Args:
        model: object providing ``test(sess, src, src_mask, tgt, tgt_mask)``.
        sess: session object passed through to ``model.test``.
        x_dev, y_dev: source / target data handed to ``pair_iter``.

    Returns:
        The accumulated cost divided by the accumulated mask count.

    Raises:
        ZeroDivisionError: if ``pair_iter`` yields no batches.
    """
    cost_total = 0
    length_total = 0
    batches = pair_iter(x_dev, y_dev, FLAGS.batch_size, FLAGS.num_layers)
    for source_tokens, source_mask, target_tokens, target_mask in batches:
        batch_cost = model.test(sess, source_tokens, source_mask,
                                target_tokens, target_mask)
        # presumably shape[1] is the batch dimension - TODO confirm
        cost_total += batch_cost * target_mask.shape[1]
        # the first mask row is left out of the length count
        length_total += np.sum(target_mask[1:, :])
    return cost_total / float(length_total)
def batch_decode(model, sess, x_dev, y_dev, alpha):
    """Decode every dev pair and write all results to a CSV file.

    Each pair produced by ``pair_iter`` (batch size 1) is encoded,
    beam-decoded with ``FLAGS.beam_size`` and re-ranked.  The network score
    of the ground-truth target (``network_score``) is printed for every pair.

    With ``FLAGS.score`` set, ``lm_rank_score`` is used and each row holds:
    source, target, target network score, target LM score, hypothesis,
    combined score, hypothesis network score, hypothesis LM score.
    Otherwise ``lm_rank`` is used and each row holds: source, target,
    hypothesis.  The header row is chosen to match the row layout.

    Args:
        model: object providing ``encode(sess, tokens, mask)``; also handed
            to ``decode_beam`` and ``network_score``.
        sess: session object passed through to the model calls.
        x_dev: source-side data handed to ``pair_iter``.
        y_dev: target-side data handed to ``pair_iter``.
        alpha: only used to build the output file name
            ``<FLAGS.train_dir>/err_analysis/err_val_alpha_<alpha>.csv``.
    """
    import os  # local import: needed only to create the output directory

    rows = []

    for source_tokens, source_mask, target_tokens, target_mask in pair_iter(
            x_dev, y_dev, 1, FLAGS.num_layers):
        src_sent = detokenize_tgt(source_tokens, reverse_vocab)
        tgt_sent = detokenize_tgt(target_tokens, reverse_vocab)

        # Encode
        encoder_output = model.encode(sess, source_tokens, source_mask)
        # Decode
        beam_toks, probs = decode_beam(model, sess, encoder_output, FLAGS.beam_size)
        # De-tokenize
        beam_strs = detokenize(beam_toks, reverse_vocab)

        tgt_nw_score = network_score(model, sess, encoder_output, target_tokens)
        print("Network score: %f" % tgt_nw_score)

        # Language Model ranking
        if FLAGS.score:
            best_str, rerank_score, nw_score, lm_score = lm_rank_score(beam_strs, probs)
            tgt_lm_score = lm.score(tgt_sent)
            rows.append([src_sent, tgt_sent, tgt_nw_score, tgt_lm_score,
                         best_str, rerank_score, nw_score, lm_score])
        else:
            best_str = lm_rank(beam_strs, probs)
            rows.append([src_sent, tgt_sent, best_str])

    print("outputting in csv file...")

    # The header must describe the columns that are actually written.
    if FLAGS.score:
        header = ['Bad Input', 'Ground Truth', 'Network Score', 'LM Score',
                  'Generated Hypothesis', 'Combined Score', 'Network Score',
                  'LM Score']
    else:
        header = ['Bad Input', 'Ground Truth', 'Generated Hypothesis']

    # dump it out in train_dir
    out_dir = FLAGS.train_dir + "/err_analysis/"
    out_name = "err_val_alpha_" + str(alpha) + ".csv"
    if not os.path.isdir(out_dir):
        # Without this the open() below fails on a fresh train_dir.
        os.makedirs(out_dir)

    with open(out_dir + out_name, 'wb') as f:
        wrt = csv.writer(f)
        wrt.writerow(header)
        wrt.writerows(rows)

    print(out_name + " file finished")
def validate(model, sess, x_dev, y_dev):
    """Return the mean per-batch validation cost.

    For each batch from ``pair_iter`` the value returned by ``model.test``
    is divided by the mean of the column sums of ``target_mask``; the
    function returns the average of these quotients over all batches.

    Args:
        model: object providing ``test(sess, src, src_mask, tgt, tgt_mask)``.
        sess: session object passed through to ``model.test``.
        x_dev, y_dev: source / target data handed to ``pair_iter``.

    Raises:
        ZeroDivisionError: if ``pair_iter`` yields no batches.
    """
    running_cost = 0
    num_batches = 0
    batches = pair_iter(x_dev, y_dev, FLAGS.batch_size, FLAGS.num_layers)
    for source_tokens, source_mask, target_tokens, target_mask in batches:
        batch_cost = model.test(sess, source_tokens, source_mask,
                                target_tokens, target_mask)
        # presumably axis 0 is time, so each column sum is one target
        # length - TODO confirm
        avg_length = np.mean(np.sum(target_mask, axis=0))
        running_cost += batch_cost / avg_length
        num_batches += 1
    return running_cost / num_batches
def cer_evaluate(model, sess, x_dev, y_dev, curr_epoch, sample_rate=0.005, delay_sampling=10):
    """Return the mean ``compute_cer`` value over the dev pairs.

    Every pair from ``pair_iter`` (batch size 1) is encoded, decoded with a
    beam of width 1 and scored with ``compute_cer`` against the de-tokenized
    target.  From epoch ``delay_sampling`` onwards, a pair is printed with
    probability ``sample_rate``.

    Args:
        model: object providing ``encode(sess, tokens, mask)``; also handed
            to ``decode_beam``.
        sess: session object passed through to the model calls.
        x_dev, y_dev: source / target data handed to ``pair_iter``.
        curr_epoch: current epoch number, compared against ``delay_sampling``.
        sample_rate: probability of printing a decoded pair.
        delay_sampling: first epoch at which pairs may be printed.

    Raises:
        ZeroDivisionError: if ``pair_iter`` yields no pairs.
    """
    cers = []
    pairs = pair_iter(x_dev, y_dev, 1, FLAGS.num_layers)
    for source_tokens, source_mask, target_tokens, target_mask in pairs:
        encoded = model.encode(sess, source_tokens, source_mask)

        # beam decode might only work on GPU...so we use greedy decode
        # (beam width 1)
        beam_toks, probs = decode_beam(model, sess, encoded, 1)

        hypotheses = detokenize(beam_toks, rev_vocab)
        target_str = detokenize_tgt(target_tokens, rev_vocab)

        # return first MML-based string
        best_str = lm_rank(hypotheses, probs)

        cers.append(compute_cer(target_str, best_str))

        # `and` short-circuits, so the random draw only happens once the
        # delay has passed - exactly as with nested ifs.
        if curr_epoch >= delay_sampling and np.random.sample() <= sample_rate:
            print("sampled target str: %s" % target_str)
            print("sampled best str: %s" % best_str)

    return sum(cers) / float(len(cers))
def train():
    """Train a translation model using NLC data.

    Prepares the data with ``nlc_data.prepare_nlc_data``, builds the model
    with ``create_model`` and trains epoch by epoch (forever when
    ``FLAGS.epochs == 0``).  After every epoch the model is saved to
    ``<FLAGS.train_dir>/translate.ckpt``, the validation cost is printed and
    the learning-rate decay op is run when that cost is higher than each of
    the three most recent validation costs.
    """
    # Prepare NLC data.
    print("Preparing NLC data in %s" % FLAGS.data_dir)

    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size,
        tokenizer=get_tokenizer(FLAGS))
    vocab, _ = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    with tf.Session() as sess:
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)

        print('Initial validation cost: %f' % validate(model, sess, x_dev, y_dev))

        checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt")

        epoch = 0
        previous_losses = []
        # Exponential moving averages of the raw batch cost and of the mean
        # target length; None until the first batch has been seen.
        exp_cost = None
        exp_length = None
        while FLAGS.epochs == 0 or epoch < FLAGS.epochs:
            epoch += 1
            current_step = 0

            ## Train
            for source_tokens, source_mask, target_tokens, target_mask in pair_iter(
                    x_train, y_train, FLAGS.batch_size, FLAGS.num_layers):
                # Get a batch and make a step.
                tic = time.time()
                grad_norm, cost, param_norm = model.train(
                    sess, source_tokens, source_mask, target_tokens, target_mask)
                iter_time = time.time() - tic
                current_step += 1

                lengths = np.sum(target_mask, axis=0)
                mean_length = np.mean(lengths)
                std_length = np.std(lengths)

                # Compare against None explicitly: a truthiness test would
                # re-seed the averages whenever the running cost is 0.0.
                if exp_cost is None:
                    exp_cost = cost
                    exp_length = mean_length
                else:
                    exp_cost = 0.99 * exp_cost + 0.01 * cost
                    exp_length = 0.99 * exp_length + 0.01 * mean_length

                cost = cost / mean_length

                if current_step % FLAGS.print_every == 0:
                    print('epoch %d, iter %d, cost %f, exp_cost %f, grad norm %f, param norm %f, batch time %f, length mean/std %f/%f' %
                          (epoch, current_step, cost, exp_cost / exp_length,
                           grad_norm, param_norm, iter_time, mean_length, std_length))

            ## Checkpoint
            model.saver.save(sess, checkpoint_path, global_step=model.global_step)

            ## Validate
            valid_cost = validate(model, sess, x_dev, y_dev)
            print("Epoch %d Validation cost: %f" % (epoch, valid_cost))

            if len(previous_losses) > 2 and valid_cost > max(previous_losses[-3:]):
                sess.run(model.learning_rate_decay_op)
            previous_losses.append(valid_cost)
            sys.stdout.flush()
def train():
    """Train a translation model using NLC data.

    Prepares the data with ``nlc_data.prepare_nlc_data``, makes sure
    ``FLAGS.train_dir`` exists, mirrors log records to
    ``<train_dir>/log.txt``, dumps the flags to ``<train_dir>/flags.json``
    and then trains epoch by epoch (forever when ``FLAGS.epochs == 0``).

    After each epoch the validation cost decides what happens:
        * if at least three epochs have been accepted and the cost is higher
          than the last accepted one, the learning-rate decay op is run and
          the weights of ``best_epoch`` are restored;
        * otherwise the cost is recorded, the epoch becomes ``best_epoch``
          and the model is saved as ``<train_dir>/best.ckpt-<epoch>``.
    """
    # Prepare NLC data.
    logging.info("Preparing NLC data in %s", FLAGS.data_dir)

    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size,
        tokenizer=get_tokenizer(FLAGS))
    vocab, _ = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    logging.info("Vocabulary size: %d", vocab_size)

    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    file_handler = logging.FileHandler("{0}/log.txt".format(FLAGS.train_dir))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.train_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        logging.info("Creating %d layers of %d units.", FLAGS.num_layers, FLAGS.size)
        model = create_model(sess, vocab_size, False)

        logging.info('Initial validation cost: %f',
                     validate(model, sess, x_dev, y_dev))

        checkpoint_path = os.path.join(FLAGS.train_dir, "best.ckpt")

        epoch = 0
        best_epoch = 0
        previous_losses = []
        # Exponential moving averages of the raw batch cost and of the mean
        # target length; None until the first batch has been seen.
        exp_cost = None
        exp_length = None
        # Running sum of target_mask entries, used for the "tps" log field.
        total_iters = 0
        start_time = time.time()
        while FLAGS.epochs == 0 or epoch < FLAGS.epochs:
            epoch += 1
            current_step = 0

            ## Train
            epoch_tic = time.time()
            for source_tokens, source_mask, target_tokens, target_mask in pair_iter(
                    x_train, y_train, FLAGS.batch_size, FLAGS.num_layers):
                # Get a batch and make a step.
                grad_norm, cost, param_norm = model.train(
                    sess, source_tokens, source_mask, target_tokens, target_mask)

                total_iters += np.sum(target_mask)
                # Measured against start_time, so time spent validating in
                # earlier epochs is included in the denominator.
                tps = total_iters / (time.time() - start_time)
                current_step += 1

                lengths = np.sum(target_mask, axis=0)
                mean_length = np.mean(lengths)
                std_length = np.std(lengths)

                # Compare against None explicitly: a truthiness test would
                # re-seed the averages whenever the running cost is 0.0.
                if exp_cost is None:
                    exp_cost = cost
                    exp_length = mean_length
                else:
                    exp_cost = 0.99 * exp_cost + 0.01 * cost
                    exp_length = 0.99 * exp_length + 0.01 * mean_length

                cost = cost / mean_length

                if current_step % FLAGS.print_every == 0:
                    logging.info(
                        'epoch %d, iter %d, cost %f, exp_cost %f, grad norm %f, param norm %f, tps %f, length mean/std %f/%f',
                        epoch, current_step, cost, exp_cost / exp_length,
                        grad_norm, param_norm, tps, mean_length, std_length)
            epoch_toc = time.time()

            ## Validate
            valid_cost = validate(model, sess, x_dev, y_dev)

            logging.info("Epoch %d Validation cost: %f time: %f",
                         epoch, valid_cost, epoch_toc - epoch_tic)

            if len(previous_losses) > 2 and valid_cost > previous_losses[-1]:
                logging.info("Annealing learning rate by %f",
                             FLAGS.learning_rate_decay_factor)
                sess.run(model.learning_rate_decay_op)
                # Roll back to the last accepted weights.
                model.saver.restore(sess, checkpoint_path + ("-%d" % best_epoch))
            else:
                previous_losses.append(valid_cost)
                best_epoch = epoch
                model.saver.save(sess, checkpoint_path, global_step=epoch)
            sys.stdout.flush()
def train():
    """Train a translation model using NLC data.

    Prepares the data with ``nlc_data.prepare_nlc_data``, makes sure
    ``FLAGS.train_dir`` exists, mirrors log records to
    ``<train_dir>/log.txt``, dumps the flags to ``<train_dir>/flags.json``
    and then trains epoch by epoch (forever when ``FLAGS.epochs == 0``).

    After each epoch the validation cost decides what happens:
        * if at least three epochs have been accepted and the cost is higher
          than the last accepted one, the learning-rate decay op is run and
          the weights of ``best_epoch`` are restored;
        * otherwise the cost is recorded, the epoch becomes ``best_epoch``
          and the model is saved as ``<train_dir>/best.ckpt-<epoch>``.
    """
    # Prepare NLC data.
    logging.info("Preparing NLC data in %s", FLAGS.data_dir)

    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size,
        tokenizer=get_tokenizer(FLAGS))
    vocab, _ = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    logging.info("Vocabulary size: %d", vocab_size)

    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    file_handler = logging.FileHandler("{0}/log.txt".format(FLAGS.train_dir))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.train_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        logging.info("Creating %d layers of %d units.", FLAGS.num_layers, FLAGS.size)
        model = create_model(sess, vocab_size, False)

        logging.info('Initial validation cost: %f',
                     validate(model, sess, x_dev, y_dev))

        checkpoint_path = os.path.join(FLAGS.train_dir, "best.ckpt")

        epoch = 0
        best_epoch = 0
        previous_losses = []
        # Exponential moving averages of the raw batch cost and of the mean
        # target length; None until the first batch has been seen.
        exp_cost = None
        exp_length = None
        while FLAGS.epochs == 0 or epoch < FLAGS.epochs:
            epoch += 1
            current_step = 0

            ## Train
            epoch_tic = time.time()
            for source_tokens, source_mask, target_tokens, target_mask in pair_iter(
                    x_train, y_train, FLAGS.batch_size, FLAGS.num_layers):
                # Get a batch and make a step.
                tic = time.time()
                grad_norm, cost, param_norm = model.train(
                    sess, source_tokens, source_mask, target_tokens, target_mask)
                iter_time = time.time() - tic
                current_step += 1

                lengths = np.sum(target_mask, axis=0)
                mean_length = np.mean(lengths)
                std_length = np.std(lengths)

                # Compare against None explicitly: a truthiness test would
                # re-seed the averages whenever the running cost is 0.0.
                if exp_cost is None:
                    exp_cost = cost
                    exp_length = mean_length
                else:
                    exp_cost = 0.99 * exp_cost + 0.01 * cost
                    exp_length = 0.99 * exp_length + 0.01 * mean_length

                cost = cost / mean_length

                if current_step % FLAGS.print_every == 0:
                    logging.info(
                        'epoch %d, iter %d, cost %f, exp_cost %f, grad norm %f, param norm %f, batch time %f, length mean/std %f/%f',
                        epoch, current_step, cost, exp_cost / exp_length,
                        grad_norm, param_norm, iter_time, mean_length, std_length)
            epoch_toc = time.time()

            ## Validate
            valid_cost = validate(model, sess, x_dev, y_dev)

            logging.info("Epoch %d Validation cost: %f time: %f",
                         epoch, valid_cost, epoch_toc - epoch_tic)

            if len(previous_losses) > 2 and valid_cost > previous_losses[-1]:
                logging.info("Annealing learning rate by %f",
                             FLAGS.learning_rate_decay_factor)
                sess.run(model.learning_rate_decay_op)
                # Roll back to the last accepted weights.
                model.saver.restore(sess, checkpoint_path + ("-%d" % best_epoch))
            else:
                previous_losses.append(valid_cost)
                best_epoch = epoch
                model.saver.save(sess, checkpoint_path, global_step=epoch)
            sys.stdout.flush()
def train_seq2seq(model, sess, x_dev, y_dev, x_train, y_train):
    """Run the seq2seq training loop on an already-built model.

    Trains epoch by epoch (forever when ``FLAGS.epochs == 0``).  After every
    epoch the model is saved to ``<FLAGS.train_dir>/translate.ckpt``, the
    validation cost is printed, the CER on the dev data is printed when
    ``FLAGS.evaluate == "CER"``, and the learning-rate decay op is run when
    the validation cost is higher than each of the three most recent ones.

    Args:
        model: object providing ``train``, ``saver``, ``global_step`` and
            ``learning_rate_decay_op``; also handed to ``validate`` and
            ``cer_evaluate``.
        sess: session object used for all model calls.
        x_dev, y_dev: validation source / target data.
        x_train, y_train: training source / target data.

    Returns:
        The same ``model`` object, once ``FLAGS.epochs`` epochs have run.
    """
    print('Initial validation cost: %f' % validate(model, sess, x_dev, y_dev))

    checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt")

    epoch = 0
    previous_losses = []
    # Exponential moving averages of the raw batch cost and of the mean
    # target length; None until the first batch has been seen.
    exp_cost = None
    exp_length = None
    while FLAGS.epochs == 0 or epoch < FLAGS.epochs:
        epoch += 1
        current_step = 0

        ## Train
        for source_tokens, source_mask, target_tokens, target_mask in pair_iter(
                x_train, y_train, FLAGS.batch_size, FLAGS.num_layers):
            # Get a batch and make a step.
            tic = time.time()
            grad_norm, cost, param_norm = model.train(
                sess, source_tokens, source_mask, target_tokens, target_mask)
            iter_time = time.time() - tic
            current_step += 1

            lengths = np.sum(target_mask, axis=0)
            mean_length = np.mean(lengths)
            std_length = np.std(lengths)

            # Compare against None explicitly: a truthiness test would
            # re-seed the averages whenever the running cost is 0.0.
            if exp_cost is None:
                exp_cost = cost
                exp_length = mean_length
            else:
                exp_cost = 0.99 * exp_cost + 0.01 * cost
                exp_length = 0.99 * exp_length + 0.01 * mean_length

            cost = cost / mean_length

            if current_step % FLAGS.print_every == 0:
                print('epoch %d, iter %d, cost %f, exp_cost %f, grad norm %f, param norm %f, batch time %f, length mean/std %f/%f' %
                      (epoch, current_step, cost, exp_cost / exp_length,
                       grad_norm, param_norm, iter_time, mean_length, std_length))

        ## Checkpoint
        model.saver.save(sess, checkpoint_path, global_step=model.global_step)

        ## Validate
        valid_cost = validate(model, sess, x_dev, y_dev)
        print("Epoch %d Validation cost: %f" % (epoch, valid_cost))

        ## Evaluate
        if FLAGS.evaluate == "CER":
            # CER evaluate does not do beam-decode with n-gram LM, Max Likelihood decode
            # because we don't have a language model (chop-off is clean-cut)
            # we evaluate on validation set
            cer = cer_evaluate(model, sess, x_dev, y_dev, epoch, delay_sampling=10)
            print("Epoch %d CER: %f" % (epoch, cer))

        if len(previous_losses) > 2 and valid_cost > max(previous_losses[-3:]):
            sess.run(model.learning_rate_decay_op)
        previous_losses.append(valid_cost)
        sys.stdout.flush()

    return model
def train():
    """Train a translation model using NLC data.

    Prepares character-tokenized data with ``nlc_data.prepare_nlc_data``,
    makes sure ``FLAGS.train_dir`` exists, mirrors log records to
    ``<train_dir>/log.txt``, dumps the flags to ``<train_dir>/flags.json``
    and then trains epoch by epoch (forever when ``FLAGS.epochs == 0``).

    During training, batches whose length-normalised cost reaches
    ``FLAGS.anomaly_threshold`` are written out with ``write_anomaly`` once
    ``epoch >= FLAGS.anomaly_epochs``.

    After each epoch the validation cost decides what happens:
        * if at least three epochs have been accepted and the cost is higher
          than the last accepted one, the learning-rate decay op is run and
          the weights of ``best_epoch`` are restored;
        * otherwise the cost is recorded, the epoch becomes ``best_epoch``
          and the model is saved as ``<train_dir>/best.ckpt-<epoch>``.

    The per-epoch training and validation costs are pickled to
    ``costs_data.pkl`` in the current working directory.
    """
    # Prepare NLC data.
    logging.info("Preparing NLC data in %s", FLAGS.data_dir)

    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        FLAGS.data_dir + os.sep + FLAGS.tokenizer.lower(),
        FLAGS.max_vocab_size,
        tokenizer=nlc_data.char_tokenizer)
    vocab, _ = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    logging.info("Vocabulary size: %d", vocab_size)

    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    file_handler = logging.FileHandler("{0}/log.txt".format(FLAGS.train_dir))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.train_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        logging.info("Creating %d layers of %d units.", FLAGS.num_layers, FLAGS.size)
        model = create_model(sess, vocab_size, False)

        # Count trainable parameters by evaluating each variable's shape.
        tic = time.time()
        num_params = sum(np.prod(tf.shape(var.value()).eval())
                         for var in tf.trainable_variables())
        toc = time.time()
        print("Number of params: %d (retrieval took %f secs)" %
              (num_params, toc - tic))

        checkpoint_path = os.path.join(FLAGS.train_dir, "best.ckpt")

        epoch = 0
        best_epoch = 0
        train_costs = []
        valid_costs = []
        previous_valid_losses = []
        while FLAGS.epochs == 0 or epoch < FLAGS.epochs:
            epoch += 1
            current_step = 0
            epoch_cost = 0

            epoch_tic = time.time()
            for source_tokens, source_mask, target_tokens, target_mask in pair_iter(
                    x_train, y_train, FLAGS.batch_size, FLAGS.num_layers):
                # Get a batch and make a step.
                grad_norm, cost, param_norm = model.train(
                    sess, source_tokens, source_mask, target_tokens, target_mask)

                lengths = np.sum(target_mask, axis=0)
                mean_length = np.mean(lengths)
                std_length = np.std(lengths)

                cost = cost / mean_length
                epoch_cost += cost
                current_step += 1

                if current_step % FLAGS.print_every == 0:
                    logging.info(
                        'epoch %d, iter %d, cost %f, length mean/std %f/%f',
                        epoch, current_step, cost, mean_length, std_length)

                # NOTE(review): assumed to run for every batch, not only on
                # print steps - confirm against the original layout.
                if (epoch >= FLAGS.anomaly_epochs) and \
                        (cost >= FLAGS.anomaly_threshold):
                    suffix = '_' + str(epoch) + '_' + str(current_step)
                    write_anomaly(source_tokens, vocab_path, SOURCE_PATH + suffix)
                    write_anomaly(target_tokens, vocab_path, TARGET_PATH + suffix)

            # One epoch average train cost
            train_costs.append(epoch_cost / current_step)

            # After one epoch average validate cost
            epoch_toc = time.time()
            epoch_time = epoch_toc - epoch_tic
            valid_cost = validate(model, sess, x_dev, y_dev)
            valid_costs.append(valid_cost)

            # NOTE(review): "%2f" is a minimum width of 2, not two decimals;
            # the format string is left untouched - confirm intent.
            logging.info("Epoch %d Validation cost: %f time:to %2fs",
                         epoch, valid_cost, epoch_time)

            # Checkpoint
            if len(previous_valid_losses) > 2 and \
                    valid_cost > previous_valid_losses[-1]:
                logging.info("Annealing learning rate by %f",
                             FLAGS.learning_rate_decay_factor)
                sess.run(model.learning_rate_decay_op)
                # Roll back to the last accepted weights.
                model.saver.restore(sess, checkpoint_path + ("-%d" % best_epoch))
            else:
                previous_valid_losses.append(valid_cost)
                best_epoch = epoch
                model.saver.save(sess, checkpoint_path, global_step=epoch)

            # Rewritten at the end of every epoch so the curves are on disk
            # even when the loop never terminates (FLAGS.epochs == 0); the
            # `with` block closes the file instead of leaking the handle.
            with open('costs_data.pkl', 'wb') as costs_file:
                pickle.dump([train_costs, valid_costs], costs_file)
# Stand-alone decoding run: restore a fixed checkpoint and beam-decode the
# validation pairs one at a time.
best_epoch = 2
vocab_size = 42
checkpoint_path = os.path.join(FLAGS.train_dir, "best.ckpt")

# A GPU device count of 0 is requested, i.e. the session should run on CPU.
config = tf.ConfigProto(device_count={'GPU': 0})

with tf.Session(config=config) as sess:
    logging.info("Creating %d layers of %d units.", FLAGS.num_layers, FLAGS.size)
    model = create_model(sess, vocab_size, False)
    model.saver.restore(sess, checkpoint_path + ("-%d" % best_epoch))

    # Only referenced by the cost computation that is currently disabled.
    valid_costs, valid_lengths = [], []

    for source_tokens, source_mask, target_tokens, target_mask in pair_iter(
            "data/char/valid.ids.x", "data/char/valid.ids.y", 1,
            FLAGS.num_layers):
        # Encode
        encoder_output = model.encode(sess, source_tokens, source_mask)
        # Decode
        beam_toks, probs = decode_beam(model, sess, encoder_output,
                                       FLAGS.beam_size)
        # De-tokenize
        beam_strs = detokenize(beam_toks, reverse_vocab)

        # Rebuild the raw strings of the first (only) column of each batch.
        # NOTE(review): orig_str comes from the *source* tokens and noisy_str
        # from the *target* tokens - confirm the names are not swapped.
        orig_str = "".join(reverse_vocab[tok] for tok in source_tokens.T[0])
        noisy_str = "".join(reverse_vocab[tok] for tok in target_tokens.T[0])
def batch_decode(model, sess, x_dev, y_dev, alpha):
    """Decode every dev pair and write the best hypotheses to a text file.

    Pairs are taken from ``pair_iter`` one at a time with
    ``sort_and_shuffle=False``.  Each pair is encoded, beam-decoded with
    ``FLAGS.beam_size`` and re-ranked with ``lm_rank`` or, when
    ``FLAGS.score`` is set, ``lm_rank_score``.  The network score of the
    ground-truth target and the (source | target | hypothesis) triple are
    printed for every pair.

    The chosen hypotheses are written, one per line and in iteration order,
    to ``<tokenizer>_runs<beam_size>/alpha<alpha>.txt``.

    Args:
        model: object providing ``encode(sess, tokens, mask)``; also handed
            to ``decode_beam`` and ``network_score``.
        sess: session object passed through to the model calls.
        x_dev: source-side data handed to ``pair_iter``.
        y_dev: target-side data handed to ``pair_iter``.
        alpha: only used to build the output file name.
    """
    import os  # local import: needed only to create the output directory

    best_hypotheses = []

    pairs = pair_iter(x_dev, y_dev, 1, FLAGS.num_layers, sort_and_shuffle=False)
    for pair_no, (source_tokens, source_mask, target_tokens, target_mask) in enumerate(pairs, 1):
        src_sent = detokenize_tgt(source_tokens, reverse_vocab)
        tgt_sent = detokenize_tgt(target_tokens, reverse_vocab)

        # Encode
        encoder_output = model.encode(sess, source_tokens, source_mask)
        # Decode
        beam_toks, probs = decode_beam(model, sess, encoder_output, FLAGS.beam_size)
        # De-tokenize
        beam_strs = detokenize(beam_toks, reverse_vocab)

        tgt_nw_score = network_score(model, sess, encoder_output, target_tokens)
        print("pair: %d network score: %f" % (pair_no, tgt_nw_score))

        # Language Model ranking.  Only the winning string is kept: the
        # individual scores were consumed solely by a CSV dump that had been
        # disabled, and computing the per-word target LM score divided by
        # zero for an empty target sentence.
        if not FLAGS.score:
            best_str = lm_rank(beam_strs, probs)
        else:
            best_str, _, _, _ = lm_rank_score(beam_strs, probs)

        print("%s | %s | %s" % (src_sent, tgt_sent, best_str))
        best_hypotheses.append(best_str)

    out_path = (FLAGS.tokenizer.lower() + "_runs" + str(FLAGS.beam_size) +
                "/alpha" + str(alpha) + ".txt")
    out_dir = os.path.dirname(out_path)
    if out_dir and not os.path.isdir(out_dir):
        # Without this the open() below fails on the first run.
        os.makedirs(out_dir)

    with open(out_path, 'wb') as f:
        f.write("\n".join(best_hypotheses))
def batch_decode(model, sess, x_dev, y_dev, alpha):
    """Beam-decode the dev data and save one best hypothesis per line.

    For each pair yielded by ``pair_iter`` (batch size 1,
    ``sort_and_shuffle=False``) the source is encoded, decoded with a beam of
    ``FLAGS.beam_size`` and the beam is re-ranked by ``lm_rank`` (or by
    ``lm_rank_score`` when ``FLAGS.score`` is set).  Progress is printed per
    pair: first the network score of the reference target, then
    ``source | target | hypothesis``.

    Output file: ``<tokenizer>_runs<beam_size>/alpha<alpha>.txt``, holding
    the selected hypotheses joined by newlines.

    Args:
        model: object providing ``encode(sess, tokens, mask)``; also handed
            to ``decode_beam`` and ``network_score``.
        sess: session object passed through to the model calls.
        x_dev: source-side data handed to ``pair_iter``.
        y_dev: target-side data handed to ``pair_iter``.
        alpha: only used to build the output file name.
    """
    import os  # local import: needed only to create the output directory

    generated = []
    count = 0

    for source_tokens, source_mask, target_tokens, target_mask in pair_iter(
            x_dev, y_dev, 1, FLAGS.num_layers, sort_and_shuffle=False):
        count += 1
        src_sent = detokenize_tgt(source_tokens, reverse_vocab)
        tgt_sent = detokenize_tgt(target_tokens, reverse_vocab)

        # Encode
        encoder_output = model.encode(sess, source_tokens, source_mask)
        # Decode
        beam_toks, probs = decode_beam(model, sess, encoder_output,
                                       FLAGS.beam_size)
        # De-tokenize
        beam_strs = detokenize(beam_toks, reverse_vocab)

        tgt_nw_score = network_score(model, sess, encoder_output,
                                     target_tokens)
        print("pair: %d network score: %f" % (count, tgt_nw_score))

        # Language Model ranking.  The score values are discarded: nothing
        # reads them any more, and deriving the per-word LM score of the
        # target raised ZeroDivisionError on an empty target sentence.
        if FLAGS.score:
            best_str, _, _, _ = lm_rank_score(beam_strs, probs)
        else:
            best_str = lm_rank(beam_strs, probs)

        print("%s | %s | %s" % (src_sent, tgt_sent, best_str))
        generated.append(best_str)

    run_dir = FLAGS.tokenizer.lower() + "_runs" + str(FLAGS.beam_size)
    if not os.path.isdir(run_dir):
        # Without this the open() below fails on the first run.
        os.makedirs(run_dir)

    with open(run_dir + "/alpha" + str(alpha) + ".txt", 'wb') as f:
        f.write("\n".join(generated))
def train():
    """Train a translation model using NLC data.

    Prepares the data with ``nlc_data.prepare_nlc_data``, builds the model
    with ``create_model`` and trains epoch by epoch (forever when
    ``FLAGS.epochs == 0``).  After every epoch the model is saved to
    ``<FLAGS.train_dir>/translate.ckpt``, the validation cost is printed and
    the learning-rate decay op is run when that cost is higher than each of
    the three most recent validation costs.
    """
    # Prepare NLC data.
    print("Preparing NLC data in %s" % FLAGS.data_dir)

    data_root = FLAGS.data_dir + '/' + FLAGS.tokenizer.lower()
    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        data_root, FLAGS.max_vocab_size, tokenizer=get_tokenizer(FLAGS))
    vocab, _ = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    with tf.Session() as sess:
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)

        print('Initial validation cost: %f' % validate(model, sess, x_dev, y_dev))

        checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt")

        epoch = 0
        previous_losses = []
        # Exponential moving averages of the raw batch cost and of the mean
        # target length; None until the first batch has been seen.
        exp_cost = None
        exp_length = None
        while FLAGS.epochs == 0 or epoch < FLAGS.epochs:
            epoch += 1
            current_step = 0

            ## Train
            for source_tokens, source_mask, target_tokens, target_mask in pair_iter(
                    x_train, y_train, FLAGS.batch_size, FLAGS.num_layers):
                # Get a batch and make a step.
                tic = time.time()
                grad_norm, cost, param_norm = model.train(
                    sess, source_tokens, source_mask, target_tokens, target_mask)
                toc = time.time()
                iter_time = toc - tic
                current_step += 1

                lengths = np.sum(target_mask, axis=0)
                mean_length = np.mean(lengths)
                std_length = np.std(lengths)

                # `is None`, not truthiness: an average of exactly 0.0 must
                # not be mistaken for "not initialised yet".
                if exp_cost is None:
                    exp_cost = cost
                    exp_length = mean_length
                else:
                    exp_cost = 0.99 * exp_cost + 0.01 * cost
                    exp_length = 0.99 * exp_length + 0.01 * mean_length

                cost = cost / mean_length

                if current_step % FLAGS.print_every == 0:
                    print('epoch %d, iter %d, cost %f, exp_cost %f, grad norm %f, param norm %f, batch time %f, length mean/std %f/%f' %
                          (epoch, current_step, cost, exp_cost / exp_length,
                           grad_norm, param_norm, iter_time, mean_length, std_length))

            ## Checkpoint
            model.saver.save(sess, checkpoint_path, global_step=model.global_step)

            ## Validate
            valid_cost = validate(model, sess, x_dev, y_dev)
            print("Epoch %d Validation cost: %f" % (epoch, valid_cost))

            if len(previous_losses) > 2 and valid_cost > max(previous_losses[-3:]):
                sess.run(model.learning_rate_decay_op)
            previous_losses.append(valid_cost)
            sys.stdout.flush()
def _assemble_sample_batch(rewards, actions_dist, actions, source_tokenss, target_tokenss):
    """Pad the collected per-sample data and pack it into one batch tuple."""
    # notice the transpose for source, not for target
    x_padded = np.array(padded(source_tokenss, FLAGS.num_layers)).T
    source_masks = (x_padded != nlc_data.PAD_ID).astype(np.int32)
    y_padded = np.array(padded(target_tokenss, 1))
    target_masks = (y_padded != nlc_data.PAD_ID).astype(np.int32)
    return (np.array(rewards), np.concatenate(actions_dist, axis=1),
            np.array(actions), x_padded, source_masks, y_padded, target_masks)


def process_samples(sess, actor, x, y):
    """Yield batches of actor roll-outs with their rewards.

    Pairs are drawn one at a time from ``pair_iter`` (with
    ``add_sos_eos_bool=True``).  For every pair the actor encodes the source,
    decodes it with a beam of width 1, forces the last decoded token to
    ``nlc_data.EOS_ID``, pads the decoded tokens to a fixed length of 32 and
    feeds them back through ``actor.decode``.  The reward for the pair comes
    from ``decompose_reward(target_tokens, decoded_tokens)``.

    Once ``FLAGS.batch_size`` pairs have been collected they are yielded as a
    batch; whatever is left over at the end is yielded as a final, smaller
    batch.  Nothing is yielded for an empty remainder.

    Args:
        sess: session object passed through to the actor calls.
        actor: object providing ``encode`` and ``decode``; also handed to
            ``decode_beam``.
        x, y: source / target data handed to ``pair_iter``.

    Yields:
        7-tuples ``(rewards, actions_dist, actions, x_padded, source_masks,
        y_padded, target_masks)``.  ``actions_dist`` is the per-sample
        decoder outputs concatenated along axis 1 (presumably
        ``[T, batch_size, vocab_size]`` - TODO confirm); the masks mark
        positions that differ from ``nlc_data.PAD_ID``.
    """
    rewards, actions_dist, actions = [], [], []
    source_tokenss, target_tokenss = [], []

    # target_tokens now have SOS, EOS
    for source_tokens, source_mask, target_tokens, target_mask in pair_iter(
            x, y, 1, FLAGS.num_layers, add_sos_eos_bool=True):
        source_tokenss.append(np.squeeze(source_tokens).tolist())
        target_tokenss.append(np.squeeze(target_tokens).tolist())

        encoder_output = actor.encode(sess, source_tokens, source_mask)
        best_tok, _ = decode_beam(actor, sess, encoder_output, 1)
        best_tok[0][-1] = nlc_data.EOS_ID  # last data mark as EOS

        # Padding to a common length is what lets the per-sample decoder
        # outputs be concatenated into one batch later.
        # TODO: remember to switch to a universal pad list
        padded_best_tok = padded(best_tok, depth=1, batch_pad=32)

        decoder_output, _, _ = actor.decode(sess, encoder_output,
                                            np.matrix(padded_best_tok).T)

        # Highest-scoring token at each step; deliberately not clipped
        # after EOS.
        tok_highest_prob = np.argmax(np.squeeze(decoder_output), axis=1)

        # TODO: test reward
        reward = decompose_reward(np.squeeze(target_tokens),
                                  np.array(best_tok[0], dtype=np.int32))

        rewards.append(reward)
        actions.append(tok_highest_prob)
        actions_dist.append(decoder_output)

        if len(rewards) % FLAGS.batch_size == 0:
            batch = _assemble_sample_batch(rewards, actions_dist, actions,
                                           source_tokenss, target_tokenss)
            rewards, actions_dist, actions = [], [], []
            source_tokenss, target_tokenss = [], []
            yield batch

    # for residuals: skip when nothing is left, otherwise np.concatenate
    # raises on the empty list (sample count a multiple of the batch size,
    # or no samples at all).
    if rewards:
        yield _assemble_sample_batch(rewards, actions_dist, actions,
                                     source_tokenss, target_tokenss)
def make_dot(self, fd):
    """Write the graph to ``fd`` as a Graphviz ``digraph``.

    Sections are emitted in this order: the timeline chain built from
    ``sorted(self.times)``; a same-rank group with ``"past"`` and all
    threads; one same-rank group per entry of ``self.time_events_th_uniq``;
    the parent -> child edges of every thread's node chain; the links around
    each invisible node; the attribute lines of all events and invisible
    nodes; and finally the IPC links.

    Args:
        fd: writable file-like object; only ``fd.write`` is used.
    """
    lg.info(funcname())

    def dot_names(nodes):
        # DOT identifiers of the given nodes, in iteration order.
        return [node.get_dot_name() for node in nodes]

    # header
    fd.write('digraph {\n')

    # timeline
    fd.write('{\n')
    fd.write('node [shape=plaintext];\n')
    fd.write(' -> '.join(dot_names(sorted(self.times))))
    fd.write(';\n')
    fd.write('}\n\n')

    # threads list
    fd.write('{\n')
    fd.write('rank = same; "past"; ')
    fd.write('; '.join(dot_names(self.threads)))
    fd.write('\n}\n\n')

    # time ranking
    fd.write('node [shape=box];\n')
    for tm, elist in self.time_events_th_uniq.items():
        fd.write('{{ rank = same; {0}; '.format(tm.get_dot_name()))
        fd.write('; '.join(dot_names(elist)))
        fd.write('}\n')
    fd.write('\n')

    # events
    def node_list(node):
        # Follow the .child links until a falsy node ends the chain.  The
        # generator simply returns when done: an explicit
        # `raise StopIteration` becomes a RuntimeError under PEP 479.
        while node:
            yield node
            node = node.child

    for th in self.threads:
        for a, b in pair_iter(node_list(th)):
            fd.write('{0} -> {1};\n'.format(a.get_dot_name(), b.get_dot_name()))

    # invisible nodes
    for ie in self.invis_nodes:
        fd.write(InvisibleLink(ie.parent, ie).get_dot_code())
        fd.write('\n')
        fd.write(InvisibleLink(ie, ie.child).get_dot_code())
        fd.write('\n')
    fd.write('\n')

    # nodes attributes
    def write_attribs(lis):
        # One attribute line per node, then a blank separator line.
        for ev in lis:
            fd.write(ev.get_dot_node_name_attrib())
            fd.write('\n')
        fd.write('\n')

    write_attribs(self.events)
    write_attribs(self.invis_nodes)

    # ipc links
    for il in self.ipc_links:
        fd.write(il.get_dot_code())
        fd.write('\n')

    # footer
    fd.write('}')
def make_graph(self):
    """Build the node structures of the graph from ``self.raw_events``.

    Steps:
        1. Wrap every raw event in an ``EventNode`` and register its
           ``TimeNode`` / ``ThreadNode`` in the per-time and per-thread
           tables.
        2. Chain each thread's events together with ``set_parent``; the
           first event gets the thread itself as parent.
        3. For each time, keep one event per thread in
           ``self.time_events_th_uniq``.
        4. For consecutive times (a, b): for every thread that has an event
           at a but none at b, insert an ``InvisibleNode`` at time b after
           the thread's last event at time a, provided that event has a
           child.
        5. Call ``self.find_hor_links()`` and ``self.shrink_graph()``.
    """
    lg.info(funcname())

    # parse all events and fill base structures
    for _key, raw in self.raw_events.iteritems():
        time_node = TimeNode(raw.time)
        if time_node not in self.times:
            self.times.add(time_node)
            self.time_events[time_node] = []

        thread_node = ThreadNode(raw.thread, raw.proc)
        if thread_node not in self.threads:
            self.threads.add(thread_node)
            self.thread_events[thread_node] = []

        event = EventNode(time_node, thread_node, raw)
        self.thread_events[thread_node].append(event)
        self.time_events[time_node].append(event)
        self.events.append(event)

    # build the linked list of event and thread nodes
    for thread_node, thread_evs in self.thread_events.iteritems():
        head = thread_evs[0]
        head.set_parent(thread_node)
        head.first = True
        for prev_ev, next_ev in pair_iter(thread_evs):
            next_ev.set_parent(prev_ev)

    # build time events list unique by thread
    for time_node, time_evs in self.time_events.iteritems():
        self.time_events_th_uniq[time_node] = list(
            unique_everseen(time_evs, lambda ev: ev.thread))

    # set invisible nodes to fix time ranking
    for time_a, time_b in pair_iter(sorted(self.times)):
        events_a = self.time_events_th_uniq[time_a]
        events_b = self.time_events_th_uniq[time_b]
        threads_at_a = {ev.thread for ev in events_a}
        threads_at_b = {ev.thread for ev in events_b}

        for thread_node in self.threads - threads_at_b:
            if thread_node not in threads_at_a:
                continue

            # First event of this thread at time a, then walk forward to the
            # thread's last event that still carries the same time.
            cur = next(ev for ev in events_a if ev.thread == thread_node)
            while cur.child and cur.child.time == cur.time:
                cur = cur.child

            if not cur.child:
                continue

            invis = InvisibleNode(str(thread_node) + str(time_b) + 'invis')
            invis.thread = thread_node
            invis.time = time_b
            # The order of the four linking calls below is kept exactly as
            # is: they read cur.child between calls that may update links.
            cur.set_sec_child(invis)
            cur.child.set_sec_parent(invis)
            invis.set_parent(cur)
            invis.set_child(cur.child)
            self.time_events_th_uniq[time_b].append(invis)
            self.invis_nodes.append(invis)

    self.find_hor_links()
    self.shrink_graph()