Example #1
def validate(model, sess, x_dev, y_dev):
    """Compute the model's average per-token cost on the dev set."""
    valid_costs, valid_lengths = [], []
    for source_tokens, source_mask, target_tokens, target_mask in pair_iter(x_dev, y_dev, FLAGS.batch_size,
                                                                            FLAGS.num_layers):
        cost = model.test(sess, source_tokens, source_mask, target_tokens, target_mask)
        valid_costs.append(cost * target_mask.shape[1])  # scale the batch-mean cost by the batch size
        valid_lengths.append(np.sum(target_mask[1:, :]))  # number of target tokens (first mask row excluded)
    valid_cost = sum(valid_costs) / float(sum(valid_lengths))
    return valid_cost
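Because validate() returns a length-normalized cost, it can be read as an average per-token cross-entropy. A minimal usage sketch (assuming the cost is measured in nats, which the code above does not state) that also reports the corresponding perplexity:

dev_cost = validate(model, sess, x_dev, y_dev)
print("dev cost: %f, dev perplexity: %f" % (dev_cost, np.exp(dev_cost)))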
Example #2
def batch_decode(model, sess, x_dev, y_dev, alpha):
    error_source = []
    error_target = []
    error_generated = []
    generated_score = []
    generated_lm_score = []
    generated_nw_score = []
    target_lm_score = []
    target_nw_score = []

    count = 0
    for source_tokens, source_mask, target_tokens, target_mask in pair_iter(x_dev, y_dev, 1,
                                                                            FLAGS.num_layers, sort_and_shuffle=False):
        src_sent = detokenize_tgt(source_tokens, reverse_vocab)
        tgt_sent = detokenize_tgt(target_tokens, reverse_vocab)

        # Encode
        encoder_output = model.encode(sess, source_tokens, source_mask)
        # Decode
        beam_toks, probs = decode_beam(model, sess, encoder_output, FLAGS.beam_size)
        # De-tokenize
        beam_strs = detokenize(beam_toks, reverse_vocab)
        tgt_nw_score = network_score(model, sess, encoder_output, target_tokens)
        print("pair: %d network score: %f" % (count + 1, tgt_nw_score))
        # Language Model ranking
        if not FLAGS.score:
            best_str = lm_rank(beam_strs, probs)
        else:
            best_str, rerank_score, nw_score, lm_score = lm_rank_score(beam_strs, probs)
            tgt_lm_score = lm.score(tgt_sent) / len(tgt_sent.split())

        print("%s | %s | %s" % (src_sent, tgt_sent, best_str))

        # see if this is too stupid, or doesn't work at all
        error_source.append(src_sent)
        error_target.append(tgt_sent)
        error_generated.append(best_str)
        if FLAGS.score:
            target_lm_score.append(tgt_lm_score)
            target_nw_score.append(tgt_nw_score)
            generated_score.append(rerank_score)
            generated_nw_score.append(nw_score)
            generated_lm_score.append(lm_score)
        count += 1

    with open(FLAGS.tokenizer.lower() + "_runs" + str(FLAGS.beam_size) + "/alpha" + str(alpha) + ".txt", 'wb') as f:
        f.write("\n".join(error_generated))
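lm_rank and lm_rank_score are not shown here. As a rough illustration of the reranking step, a minimal sketch follows; it assumes `lm.score(sentence)` returns a log-probability (as used above for tgt_lm_score), that `probs` holds one network score per beam candidate, and that the two are interpolated with the weight `alpha`. The actual implementation may differ.

def lm_rank_sketch(beam_strs, probs, alpha):
    # Pick the candidate with the best interpolation of network and LM scores.
    best_str, best_score = beam_strs[0], -float("inf")
    for s, p in zip(beam_strs, probs):
        lm_score = lm.score(s) / max(len(s.split()), 1)  # length-normalized LM log-probability
        combined = (1.0 - alpha) * p + alpha * lm_score  # weighted combination of the two scores
        if combined > best_score:
            best_str, best_score = s, combined
    return best_str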
Example #3
def train():
    """Train a translation model using NLC data."""
    # Prepare NLC data.
    logging.info("Preparing NLC data in %s" % FLAGS.data_dir)
    x_train, y_train, x_dev, y_dev, vocab_path = prepare_nlc_data(
        FLAGS.data_dir, FLAGS.max_vocab_size, tokenizer=get_tokenizer())

    vocab, _ = initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    logging.info("Vocabulary size: %d" % vocab_size)

    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    file_handler = logging.FileHandler("{0}/log.txt".format(FLAGS.train_dir))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))

    with tf.Session() as sess:
        logging.info("Creating %d layers of %d units." %
                     (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)

        logging.info('Initial validation cost: %f' %
                     validate(model, sess, x_dev, y_dev))

        epoch = 0
        previous_losses = []
        exp_cost = None
        exp_length = None
        exp_norm = None
        total_iters = 0
        start_time = time.time()
        while FLAGS.epochs == 0 or epoch < FLAGS.epochs:
            epoch += 1
            current_step = 0

            # Train
            epoch_tic = time.time()
            for source_tokens, source_mask, target_tokens, target_mask in pair_iter(
                    x_train, y_train, FLAGS.batch_size, FLAGS.num_layers):
                # Get a batch and make a step.
                grad_norm, cost, param_norm = model.train(
                    sess, source_tokens, source_mask, target_tokens,
                    target_mask)
                total_iters += np.sum(target_mask)
                tps = total_iters / (time.time() - start_time)
                current_step += 1

                lengths = np.sum(target_mask, axis=0)
                mean_length = np.mean(lengths)
                std_length = np.std(lengths)

                if not exp_cost:
                    exp_cost = cost
                    exp_length = mean_length
                    exp_norm = grad_norm
                else:
                    exp_cost = 0.99 * exp_cost + 0.01 * cost
                    exp_length = 0.99 * exp_length + 0.01 * mean_length
                    exp_norm = 0.99 * exp_norm + 0.01 * grad_norm

                cost = cost / mean_length

                if current_step % FLAGS.print_every == 0:
                    logging.info(
                        'epoch %d, iter %d, cost %f, exp_cost %f, grad norm %f, param norm %f, tps %f, '
                        'length mean/std %f/%f' %
                        (epoch, current_step, cost, exp_cost / exp_length,
                         grad_norm, param_norm, tps, mean_length, std_length))
            epoch_toc = time.time()

            # Checkpoint
            checkpoint_path = os.path.join(FLAGS.train_dir, "best.ckpt")

            # Validate
            valid_cost = validate(model, sess, x_dev, y_dev)

            logging.info("Epoch %d Validation cost: %f time: %f" %
                         (epoch, valid_cost, epoch_toc - epoch_tic))

            if len(previous_losses) > 2 and valid_cost > previous_losses[-1]:
                logging.info("Annealing learning rate by %f" %
                             FLAGS.learning_rate_decay_factor)
                sess.run(model.learning_rate_decay_op)
                model.saver.restore(sess, checkpoint_path)
            else:
                previous_losses.append(valid_cost)
                model.saver.save(sess, checkpoint_path)
            sys.stdout.flush()
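The loop above anneals the learning rate through model.learning_rate_decay_op. A minimal sketch of how such an op is typically wired up in TF1-style model code (FLAGS.learning_rate is an assumed flag; the project's actual model class may differ):

# Inside the model's __init__:
self.learning_rate = tf.Variable(float(FLAGS.learning_rate), trainable=False)
self.learning_rate_decay_op = self.learning_rate.assign(
    self.learning_rate * FLAGS.learning_rate_decay_factor)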
Example #4
def batch_decode(model, sess, x_dev, y_dev, alpha):
    error_source = []
    error_target = []
    error_generated = []
    generated_score = []
    generated_lm_score = []
    generated_nw_score = []
    target_lm_score = []
    target_nw_score = []

    count = 0
    for source_tokens, source_mask, target_tokens, target_mask in pair_iter(x_dev, y_dev, 1,
                                                                            FLAGS.num_layers, sort_and_shuffle=False):
        src_sent = detokenize_tgt(source_tokens, reverse_vocab)
        tgt_sent = detokenize_tgt(target_tokens, reverse_vocab)

        # Encode
        encoder_output = model.encode(sess, source_tokens, source_mask)
        # Decode
        beam_toks, probs = decode_beam(model, sess, encoder_output, FLAGS.beam_size)
        # De-tokenize
        beam_strs = detokenize(beam_toks, reverse_vocab)
        tgt_nw_score = network_score(model, sess, encoder_output, target_tokens)
        print("pair: %d network score: %f" % (count + 1, tgt_nw_score))
        # Language Model ranking
        if not FLAGS.score:
            best_str = lm_rank(beam_strs, probs)
        else:
            best_str, rerank_score, nw_score, lm_score = lm_rank_score(beam_strs, probs)
            tgt_lm_score = lm.score(tgt_sent) / len(tgt_sent.split())

        print("%s | %s | %s" % (src_sent, tgt_sent, best_str))

        # see if this is too stupid, or doesn't work at all
        error_source.append(src_sent)
        error_target.append(tgt_sent)
        error_generated.append(best_str)
        if FLAGS.score:
            target_lm_score.append(tgt_lm_score)
            target_nw_score.append(tgt_nw_score)
            generated_score.append(rerank_score)
            generated_nw_score.append(nw_score)
            generated_lm_score.append(lm_score)
        count += 1

    """
    print("outputting in csv file...")

    # dump it out in train_dir
    with open("err_val_alpha_" + str(alpha) + ".csv", 'wb') as f:
      wrt = csv.writer(f)
      wrt.writerow(['Bad Input', 'Ground Truth', 'Network Score', 'LM Score', 'Generated Hypothesis', 'Combined Score', 'Network Score', 'LM Score'])
      if not FLAGS.score:
        for s, t, g in itertools.izip(error_source, error_target, error_generated):
          wrt.writerow([s, t, g])  # source, correct target, wrong target
      else:
        for s, t, tns, tls, g, gs, gns, gls in itertools.izip(error_source, error_target, target_nw_score, target_lm_score, error_generated, generated_score, generated_nw_score, generated_lm_score):
          wrt.writerow([s, t, tns, tls, g, gs, gns, gls])
    """

    # print("err_val_alpha_" + str(alpha) + ".csv" + "file finished")

    with open(FLAGS.tokenizer.lower() + "_runs" + str(FLAGS.beam_size) + "/alpha" + str(alpha) + ".txt", 'wb') as f:
        f.write("\n".join(error_generated))
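The final write assumes the "<tokenizer>_runs<beam_size>" directory already exists. A defensive variant (an assumption, not part of the original code) would create it first:

out_dir = FLAGS.tokenizer.lower() + "_runs" + str(FLAGS.beam_size)
if not os.path.exists(out_dir):
    os.makedirs(out_dir)
with open(os.path.join(out_dir, "alpha" + str(alpha) + ".txt"), 'wb') as f:
    f.write("\n".join(error_generated))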
Example #5
def train():
    """Train a translation model using NLC data."""
    # Prepare NLC data.
    logging.info("Preparing NLC data in %s" % FLAGS.data_dir)
    x_train, y_train, x_dev, y_dev, vocab_path = prepare_nlc_data(
        FLAGS.data_dir, FLAGS.max_vocab_size, tokenizer=get_tokenizer())

    vocab, _ = initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    logging.info("Vocabulary size: %d" % vocab_size)

    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    file_handler = logging.FileHandler("{0}/log.txt".format(FLAGS.train_dir))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.train_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        logging.info("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)

        logging.info('Initial validation cost: %f' % validate(model, sess, x_dev, y_dev))

        epoch = 0
        previous_losses = []
        exp_cost = None
        exp_length = None
        exp_norm = None
        total_iters = 0
        start_time = time.time()
        while FLAGS.epochs == 0 or epoch < FLAGS.epochs:
            epoch += 1
            current_step = 0

            # Train
            epoch_tic = time.time()
            for source_tokens, source_mask, target_tokens, target_mask in pair_iter(x_train, y_train, FLAGS.batch_size,
                                                                                    FLAGS.num_layers):
                # Get a batch and make a step.
                grad_norm, cost, param_norm = model.train(sess, source_tokens, source_mask, target_tokens, target_mask)
                total_iters += np.sum(target_mask)
                tps = total_iters / (time.time() - start_time)
                current_step += 1

                lengths = np.sum(target_mask, axis=0)
                mean_length = np.mean(lengths)
                std_length = np.std(lengths)

                if not exp_cost:
                    exp_cost = cost
                    exp_length = mean_length
                    exp_norm = grad_norm
                else:
                    exp_cost = 0.99 * exp_cost + 0.01 * cost
                    exp_length = 0.99 * exp_length + 0.01 * mean_length
                    exp_norm = 0.99 * exp_norm + 0.01 * grad_norm

                cost = cost / mean_length

                if current_step % FLAGS.print_every == 0:
                    logging.info(
                        'epoch %d, iter %d, cost %f, exp_cost %f, grad norm %f, param norm %f, tps %f, '
                        'length mean/std %f/%f' %
                        (epoch, current_step, cost, exp_cost / exp_length, grad_norm, param_norm, tps, mean_length,
                         std_length))
            epoch_toc = time.time()

            # Checkpoint
            checkpoint_path = os.path.join(FLAGS.train_dir, "best.ckpt")

            # Validate
            valid_cost = validate(model, sess, x_dev, y_dev)

            logging.info("Epoch %d Validation cost: %f time: %f" % (epoch, valid_cost, epoch_toc - epoch_tic))

            if len(previous_losses) > 2 and valid_cost > previous_losses[-1]:
                logging.info("Annealing learning rate by %f" % FLAGS.learning_rate_decay_factor)
                sess.run(model.learning_rate_decay_op)
                model.saver.restore(sess, checkpoint_path)
            else:
                previous_losses.append(valid_cost)
                model.saver.save(sess, checkpoint_path)
            sys.stdout.flush()
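For reference, the FLAGS values used throughout are TF1-style command-line flags. A hypothetical definition block is sketched below; the flag names are taken from the code, but the defaults and help strings are illustrative and not from the original project:

tf.app.flags.DEFINE_string("data_dir", "/tmp", "Directory containing the NLC data.")
tf.app.flags.DEFINE_string("train_dir", "/tmp", "Directory for checkpoints and logs.")
tf.app.flags.DEFINE_integer("batch_size", 128, "Batch size used for training.")
tf.app.flags.DEFINE_integer("num_layers", 3, "Number of encoder/decoder layers.")
tf.app.flags.DEFINE_integer("size", 400, "Number of units per layer.")
tf.app.flags.DEFINE_integer("epochs", 0, "Training epochs; 0 means train indefinitely.")
tf.app.flags.DEFINE_integer("print_every", 100, "Log training statistics every N steps.")
tf.app.flags.DEFINE_integer("max_vocab_size", 40000, "Maximum vocabulary size.")
tf.app.flags.DEFINE_float("learning_rate_decay_factor", 0.95, "Multiplier applied when annealing the learning rate.")
FLAGS = tf.app.flags.FLAGS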