Example #1
File: decode.py  Project: windweller/nlc
def batch_decode(model, sess, x_dev, y_dev, alpha):

    error_source = []; correct_source = []
    error_target = []; correct_target = []
    error_generated = []; correct_generated = []
    target_score = []; correct_target_score = []
    generated_score = []; correct_generated_score = []

    for source_tokens, source_mask, target_tokens, target_mask in pair_iter(x_dev, y_dev, 1,
                                                                            FLAGS.num_layers):
      src_sent = detokenize_tgt(source_tokens, reverse_vocab)
      tgt_sent = detokenize_tgt(target_tokens, reverse_vocab)

      # Encode
      encoder_output = model.encode(sess, source_tokens, source_mask)
      # Decode
      beam_toks, probs = decode_beam(model, sess, encoder_output, FLAGS.beam_size)
      # De-tokenize
      beam_strs = detokenize(beam_toks, reverse_vocab)
      # Language Model ranking
      if not FLAGS.score:
        best_str = lm_rank(beam_strs, probs)
      else:
        best_str, tgt_score, rerank_score = lm_rank_score(beam_strs, tgt_sent, probs)

      if best_str != tgt_sent:
        # see if this is too stupid, or doesn't work at all
        error_source.append(src_sent)
        error_target.append(tgt_sent)
        error_generated.append(best_str)
        if FLAGS.score:
          target_score.append(tgt_score)
          generated_score.append(rerank_score)
      else:
        correct_source.append(src_sent)
        correct_target.append(tgt_sent)
        correct_generated.append(best_str)
        if FLAGS.score:
          correct_target_score.append(tgt_score)
          correct_generated_score.append(rerank_score)

    print("outputting in csv file...")

    # dump it out in train_dir
    with open(FLAGS.train_dir + "/err_analysis/" + "err_val_alpha_" + str(alpha) + ".csv", 'wb') as f:
      wrt = csv.writer(f)
      if not FLAGS.score:
        for s, t, g in itertools.izip(error_source, error_target, error_generated):
          wrt.writerow([s, t, g])  # source, correct target, wrong target
        wrt.writerow([])  # space between error and correct
        for s, t, g in itertools.izip(correct_source, correct_target, correct_generated):
          wrt.writerow([s, t, g])
      else:
        for s, t, g, ts, gs in itertools.izip(error_source, error_target, error_generated, target_score, generated_score):
          wrt.writerow([s, t, ts, g, gs])  # source, correct target, target score, generated, generated score
        wrt.writerow([])  # space between error and correct
        for s, t, g, ts, gs in itertools.izip(correct_source, correct_target, correct_generated, correct_target_score, correct_generated_score):
          wrt.writerow([s, t, ts, g, gs])

    print("err_val_alpha_" + str(alpha) + ".csv" + "file finished")
Example #2
File: train.py  Project: windweller/nlc
def validate(model, sess, x_dev, y_dev):
  valid_costs, valid_lengths = [], []
  for source_tokens, source_mask, target_tokens, target_mask in pair_iter(x_dev, y_dev, FLAGS.batch_size, FLAGS.num_layers):
    cost = model.test(sess, source_tokens, source_mask, target_tokens, target_mask)
    valid_costs.append(cost * target_mask.shape[1])
    valid_lengths.append(np.sum(target_mask[1:, :]))
  valid_cost = sum(valid_costs) / float(sum(valid_lengths))
  return valid_cost
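
Here model.test is assumed to return a cost averaged over the batch; multiplying by target_mask.shape[1] (the batch size) recovers a batch total, and dividing the summed totals by the number of non-padding target positions (the first time step is skipped) gives a per-token validation cost. A tiny NumPy sketch of that arithmetic with made-up masks:

import numpy as np

# Hypothetical (time, batch) target masks for two batches of different widths.
mask_a = np.array([[1, 1], [1, 1], [1, 0], [0, 0]])   # batch of 2 sentences
mask_b = np.array([[1], [1], [1], [1]])                # batch of 1 sentence

costs, lengths = [], []
for cost, mask in [(2.0, mask_a), (3.0, mask_b)]:      # cost = per-batch average
    costs.append(cost * mask.shape[1])                 # scale back to a batch total
    lengths.append(np.sum(mask[1:, :]))                # non-pad targets, first step skipped
print(sum(costs) / float(sum(lengths)))                # length-normalised validation cost
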
Example #3
def validate(model, sess, x_dev, y_dev):
  valid_costs, valid_lengths = [], []
  for source_tokens, source_mask, target_tokens, target_mask in pair_iter(x_dev, y_dev, FLAGS.batch_size, FLAGS.num_layers):
    cost = model.test(sess, source_tokens, source_mask, target_tokens, target_mask)
    valid_costs.append(cost * target_mask.shape[1])
    valid_lengths.append(np.sum(target_mask[1:, :]))
  valid_cost = sum(valid_costs) / float(sum(valid_lengths))
  return valid_cost
Example #4
def batch_decode(model, sess, x_dev, y_dev, alpha):

    error_source = []
    error_target = []
    error_generated = []
    generated_score = []
    generated_lm_score = []
    generated_nw_score = []
    target_lm_score = []
    target_nw_score = []

    for source_tokens, source_mask, target_tokens, target_mask in pair_iter(x_dev, y_dev, 1,
                                                                            FLAGS.num_layers):
      src_sent = detokenize_tgt(source_tokens, reverse_vocab)
      tgt_sent = detokenize_tgt(target_tokens, reverse_vocab)

      # Encode
      encoder_output = model.encode(sess, source_tokens, source_mask)
      # Decode
      beam_toks, probs = decode_beam(model, sess, encoder_output, FLAGS.beam_size)
      # De-tokenize
      beam_strs = detokenize(beam_toks, reverse_vocab)
      tgt_nw_score = network_score(model, sess, encoder_output, target_tokens)
      print("Network score: %f" % tgt_nw_score)
      # Language Model ranking
      if not FLAGS.score:
        best_str = lm_rank(beam_strs, probs)
      else:
        best_str, rerank_score, nw_score, lm_score = lm_rank_score(beam_strs, probs)
        tgt_lm_score = lm.score(tgt_sent)

      # see if this is too stupid, or doesn't work at all
      error_source.append(src_sent)
      error_target.append(tgt_sent)
      error_generated.append(best_str)
      if FLAGS.score:
        target_lm_score.append(tgt_lm_score)
        target_nw_score.append(tgt_nw_score)
        generated_score.append(rerank_score)
        generated_nw_score.append(nw_score)
        generated_lm_score.append(lm_score)

    print("outputting in csv file...")

    # dump it out in train_dir
    with open(FLAGS.train_dir + "/err_analysis/" + "err_val_alpha_" + str(alpha) + ".csv", 'wb') as f:
      wrt = csv.writer(f)
      wrt.writerow(['Bad Input', 'Ground Truth', 'Target Network Score', 'Target LM Score', 'Generated Hypothesis', 'Combined Score', 'Generated Network Score', 'Generated LM Score'])
      if not FLAGS.score:
        for s, t, g in itertools.izip(error_source, error_target, error_generated):
          wrt.writerow([s, t, g])  # source, correct target, wrong target
      else:
        for s, t, tns, tls, g, gs, gns, gls in itertools.izip(error_source, error_target, target_nw_score, target_lm_score, error_generated, generated_score, generated_nw_score, generated_lm_score):
          wrt.writerow([s, t, tns, tls, g, gs, gns, gls]) 

    print("err_val_alpha_" + str(alpha) + ".csv" + "file finished")
Example #5
def validate(model, sess, x_dev, y_dev):
    cost_all = 0
    step = 0
    for source_tokens, source_mask, target_tokens, target_mask in pair_iter(
            x_dev, y_dev, FLAGS.batch_size, FLAGS.num_layers):
        cost = model.test(sess, source_tokens, source_mask, target_tokens,
                          target_mask)
        lengths = np.sum(target_mask, axis=0)
        mean_length = np.mean(lengths)
        cost = cost / mean_length
        cost_all += cost
        step += 1
    return cost_all / step
Example #6
def cer_evaluate(model, sess, x_dev, y_dev, curr_epoch, sample_rate=0.005, delay_sampling=10):
    valid_cers = []
    for source_tokens, source_mask, target_tokens, target_mask in pair_iter(x_dev, y_dev, 1,
                                                                            FLAGS.num_layers):
        # Encode
        encoder_output = model.encode(sess, source_tokens, source_mask)
        # Decode
        # beam decode might only work on GPU...so we use greedy decode
        beam_toks, probs = decode_beam(model, sess, encoder_output, 1)
        # De-tokenize
        beam_strs = detokenize(beam_toks, rev_vocab)
        target_str = detokenize_tgt(target_tokens, rev_vocab)
        # Language Model ranking
        best_str = lm_rank(beam_strs, probs)  # return first MML-based string

        valid_cers.append(compute_cer(target_str, best_str))
        if curr_epoch >= delay_sampling:
            if np.random.sample() <= sample_rate:  # don't know performance penalty of np.random.sample()
                print("sampled target str: %s" % target_str)
                print("sampled best str: %s" % best_str)

    mean_valid_cer = sum(valid_cers) / float(len(valid_cers))
    return mean_valid_cer
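
compute_cer is also not shown; character error rate is conventionally the character-level edit distance between the decoded string and the reference, divided by the reference length. A small self-contained sketch under that assumption:

def compute_cer_sketch(ref, hyp):
    # Levenshtein distance over characters, normalised by reference length.
    prev = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, 1):
        curr = [i]
        for j, h in enumerate(hyp, 1):
            curr.append(min(prev[j] + 1,              # deletion
                            curr[j - 1] + 1,          # insertion
                            prev[j - 1] + (r != h)))  # substitution
        prev = curr
    return prev[-1] / float(max(len(ref), 1))
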
Example #7
def train():
    """Train a translation model using NLC data."""
    # Prepare NLC data.
    print("Preparing NLC data in %s" % FLAGS.data_dir)

    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(),
        FLAGS.max_vocab_size,
        tokenizer=get_tokenizer(FLAGS))
    vocab, _ = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    with tf.Session() as sess:
        print("Creating %d layers of %d units." %
              (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)

        print('Initial validation cost: %f' %
              validate(model, sess, x_dev, y_dev))

        if False:
            tic = time.time()
            params = tf.trainable_variables()
            num_params = sum(
                map(lambda t: np.prod(tf.shape(t.value()).eval()), params))
            toc = time.time()
            print("Number of params: %d (retreival took %f secs)" %
                  (num_params, toc - tic))

        epoch = 0
        previous_losses = []
        exp_cost = None
        exp_length = None
        exp_norm = None
        while (FLAGS.epochs == 0 or epoch < FLAGS.epochs):
            epoch += 1
            current_step = 0

            ## Train
            for source_tokens, source_mask, target_tokens, target_mask in pair_iter(
                    x_train, y_train, FLAGS.batch_size, FLAGS.num_layers):
                # Get a batch and make a step.
                tic = time.time()

                grad_norm, cost, param_norm = model.train(
                    sess, source_tokens, source_mask, target_tokens,
                    target_mask)

                toc = time.time()
                iter_time = toc - tic
                current_step += 1

                lengths = np.sum(target_mask, axis=0)
                mean_length = np.mean(lengths)
                std_length = np.std(lengths)

                if not exp_cost:
                    exp_cost = cost
                    exp_length = mean_length
                    exp_norm = grad_norm
                else:
                    exp_cost = 0.99 * exp_cost + 0.01 * cost
                    exp_length = 0.99 * exp_length + 0.01 * mean_length
                    exp_norm = 0.99 * exp_norm + 0.01 * grad_norm

                cost = cost / mean_length

                if current_step % FLAGS.print_every == 0:
                    print(
                        'epoch %d, iter %d, cost %f, exp_cost %f, grad norm %f, param norm %f, batch time %f, length mean/std %f/%f'
                        % (epoch, current_step, cost, exp_cost / exp_length,
                           grad_norm, param_norm, iter_time, mean_length,
                           std_length))

            ## Checkpoint
            checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt")
            model.saver.save(sess,
                             checkpoint_path,
                             global_step=model.global_step)

            ## Validate
            valid_cost = validate(model, sess, x_dev, y_dev)

            print("Epoch %d Validation cost: %f" % (epoch, valid_cost))

            if len(previous_losses) > 2 and valid_cost > max(
                    previous_losses[-3:]):
                sess.run(model.learning_rate_decay_op)
            previous_losses.append(valid_cost)
            sys.stdout.flush()
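
Two different annealing triggers appear in these training loops. This one decays the learning rate when the new validation cost exceeds the worst of the last three recorded costs; Examples #8, #9 and #11 decay as soon as it exceeds the previous cost and additionally roll back to the best checkpoint. A toy trace of this example's trigger with made-up costs:

# Toy trace of the decay trigger used above (hypothetical validation costs).
previous_losses = []
for epoch, valid_cost in enumerate([2.0, 1.5, 1.4, 2.1, 1.3], 1):
    if len(previous_losses) > 2 and valid_cost > max(previous_losses[-3:]):
        print("epoch %d: anneal learning rate" % epoch)  # fires only at epoch 4
    previous_losses.append(valid_cost)
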
Example #8
def train():
    """Train a translation model using NLC data."""
    # Prepare NLC data.
    logging.info("Preparing NLC data in %s" % FLAGS.data_dir)

    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(),
        FLAGS.max_vocab_size,
        tokenizer=get_tokenizer(FLAGS))
    vocab, _ = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    logging.info("Vocabulary size: %d" % vocab_size)

    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    file_handler = logging.FileHandler("{0}/log.txt".format(FLAGS.train_dir))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.train_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        logging.info("Creating %d layers of %d units." %
                     (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)

        logging.info('Initial validation cost: %f' %
                     validate(model, sess, x_dev, y_dev))

        if False:
            tic = time.time()
            params = tf.trainable_variables()
            num_params = sum(
                map(lambda t: np.prod(tf.shape(t.value()).eval()), params))
            toc = time.time()
            print("Number of params: %d (retreival took %f secs)" %
                  (num_params, toc - tic))

        epoch = 0
        best_epoch = 0
        previous_losses = []
        exp_cost = None
        exp_length = None
        exp_norm = None
        total_iters = 0
        start_time = time.time()
        while (FLAGS.epochs == 0 or epoch < FLAGS.epochs):
            epoch += 1
            current_step = 0

            ## Train
            epoch_tic = time.time()
            for source_tokens, source_mask, target_tokens, target_mask in pair_iter(
                    x_train, y_train, FLAGS.batch_size, FLAGS.num_layers):
                # Get a batch and make a step.
                tic = time.time()

                grad_norm, cost, param_norm = model.train(
                    sess, source_tokens, source_mask, target_tokens,
                    target_mask)

                toc = time.time()
                iter_time = toc - tic
                total_iters += np.sum(target_mask)
                tps = total_iters / (time.time() - start_time)
                current_step += 1

                lengths = np.sum(target_mask, axis=0)
                mean_length = np.mean(lengths)
                std_length = np.std(lengths)

                if not exp_cost:
                    exp_cost = cost
                    exp_length = mean_length
                    exp_norm = grad_norm
                else:
                    exp_cost = 0.99 * exp_cost + 0.01 * cost
                    exp_length = 0.99 * exp_length + 0.01 * mean_length
                    exp_norm = 0.99 * exp_norm + 0.01 * grad_norm

                cost = cost / mean_length

                if current_step % FLAGS.print_every == 0:
                    logging.info(
                        'epoch %d, iter %d, cost %f, exp_cost %f, grad norm %f, param norm %f, tps %f, length mean/std %f/%f'
                        %
                        (epoch, current_step, cost, exp_cost / exp_length,
                         grad_norm, param_norm, tps, mean_length, std_length))
            epoch_toc = time.time()

            ## Checkpoint
            checkpoint_path = os.path.join(FLAGS.train_dir, "best.ckpt")

            ## Validate
            valid_cost = validate(model, sess, x_dev, y_dev)

            logging.info("Epoch %d Validation cost: %f time: %f" %
                         (epoch, valid_cost, epoch_toc - epoch_tic))

            if len(previous_losses) > 2 and valid_cost > previous_losses[-1]:
                logging.info("Annealing learning rate by %f" %
                             FLAGS.learning_rate_decay_factor)
                sess.run(model.learning_rate_decay_op)
                model.saver.restore(sess,
                                    checkpoint_path + ("-%d" % best_epoch))
            else:
                previous_losses.append(valid_cost)
                best_epoch = epoch
                model.saver.save(sess, checkpoint_path, global_step=epoch)
            sys.stdout.flush()
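
Unlike Example #7, this loop saves a checkpoint only when validation improves and, on a regression, anneals the learning rate and restores the weights from the best epoch so far. A framework-agnostic sketch of that control flow; run_epoch, validate, save, restore and decay_lr are stand-ins for the TensorFlow calls above:

def train_with_rollback(run_epoch, validate, save, restore, decay_lr, max_epochs):
    # save(epoch)/restore(epoch) stand in for model.saver.save/restore.
    previous_losses, best_epoch = [], 0
    for epoch in range(1, max_epochs + 1):
        run_epoch()
        valid_cost = validate()
        if len(previous_losses) > 2 and valid_cost > previous_losses[-1]:
            decay_lr()           # anneal the learning rate
            restore(best_epoch)  # roll back to the best weights so far
        else:
            previous_losses.append(valid_cost)
            best_epoch = epoch
            save(epoch)
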
Example #9
File: train.py  Project: windweller/nlc
def train():
  """Train a translation model using NLC data."""
  # Prepare NLC data.
  logging.info("Preparing NLC data in %s" % FLAGS.data_dir)

  x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
    FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size,
    tokenizer=get_tokenizer(FLAGS))
  vocab, _ = nlc_data.initialize_vocabulary(vocab_path)
  vocab_size = len(vocab)
  logging.info("Vocabulary size: %d" % vocab_size)

  if not os.path.exists(FLAGS.train_dir):
    os.makedirs(FLAGS.train_dir)
  file_handler = logging.FileHandler("{0}/log.txt".format(FLAGS.train_dir))
  logging.getLogger().addHandler(file_handler)

  print(vars(FLAGS))
  with open(os.path.join(FLAGS.train_dir, "flags.json"), 'w') as fout:
    json.dump(FLAGS.__flags, fout)

  with tf.Session() as sess:
    logging.info("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
    model = create_model(sess, vocab_size, False)

    logging.info('Initial validation cost: %f' % validate(model, sess, x_dev, y_dev))

    if False:
      tic = time.time()
      params = tf.trainable_variables()
      num_params = sum(map(lambda t: np.prod(tf.shape(t.value()).eval()), params))
      toc = time.time()
      print ("Number of params: %d (retreival took %f secs)" % (num_params, toc - tic))

    epoch = 0
    best_epoch = 0
    previous_losses = []
    exp_cost = None
    exp_length = None
    exp_norm = None
    while (FLAGS.epochs == 0 or epoch < FLAGS.epochs):
      epoch += 1
      current_step = 0

      ## Train
      epoch_tic = time.time()
      for source_tokens, source_mask, target_tokens, target_mask in pair_iter(x_train, y_train, FLAGS.batch_size, FLAGS.num_layers):
        # Get a batch and make a step.
        tic = time.time()

        grad_norm, cost, param_norm = model.train(sess, source_tokens, source_mask, target_tokens, target_mask)

        toc = time.time()
        iter_time = toc - tic
        current_step += 1

        lengths = np.sum(target_mask, axis=0)
        mean_length = np.mean(lengths)
        std_length = np.std(lengths)

        if not exp_cost:
          exp_cost = cost
          exp_length = mean_length
          exp_norm = grad_norm
        else:
          exp_cost = 0.99*exp_cost + 0.01*cost
          exp_length = 0.99*exp_length + 0.01*mean_length
          exp_norm = 0.99*exp_norm + 0.01*grad_norm

        cost = cost / mean_length

        if current_step % FLAGS.print_every == 0:
          logging.info('epoch %d, iter %d, cost %f, exp_cost %f, grad norm %f, param norm %f, batch time %f, length mean/std %f/%f' %
                (epoch, current_step, cost, exp_cost / exp_length, grad_norm, param_norm, iter_time, mean_length, std_length))
      epoch_toc = time.time()

      ## Checkpoint
      checkpoint_path = os.path.join(FLAGS.train_dir, "best.ckpt")

      ## Validate
      valid_cost = validate(model, sess, x_dev, y_dev)

      logging.info("Epoch %d Validation cost: %f time: %f" % (epoch, valid_cost, epoch_toc - epoch_tic))

      if len(previous_losses) > 2 and valid_cost > previous_losses[-1]:
        logging.info("Annealing learning rate by %f" % FLAGS.learning_rate_decay_factor)
        sess.run(model.learning_rate_decay_op)
        model.saver.restore(sess, checkpoint_path + ("-%d" % best_epoch))
      else:
        previous_losses.append(valid_cost)
        best_epoch = epoch
        model.saver.save(sess, checkpoint_path, global_step=epoch)
      sys.stdout.flush()
Example #10
def train_seq2seq(model, sess, x_dev, y_dev, x_train, y_train):
    print('Initial validation cost: %f' % validate(model, sess, x_dev, y_dev))

    if False:
        tic = time.time()
        params = tf.trainable_variables()
        num_params = sum(map(lambda t: np.prod(tf.shape(t.value()).eval()), params))
        toc = time.time()
        print("Number of params: %d (retreival took %f secs)" % (num_params, toc - tic))

    epoch = 0
    previous_losses = []
    exp_cost = None
    exp_length = None
    exp_norm = None
    while (FLAGS.epochs == 0 or epoch < FLAGS.epochs):
        epoch += 1
        current_step = 0

        ## Train
        for source_tokens, source_mask, target_tokens, target_mask in pair_iter(x_train, y_train, FLAGS.batch_size,
                                                                                FLAGS.num_layers):
            # Get a batch and make a step.
            tic = time.time()

            grad_norm, cost, param_norm = model.train(sess, source_tokens, source_mask, target_tokens, target_mask)

            toc = time.time()
            iter_time = toc - tic
            current_step += 1

            lengths = np.sum(target_mask, axis=0)
            mean_length = np.mean(lengths)
            std_length = np.std(lengths)

            if not exp_cost:
                exp_cost = cost
                exp_length = mean_length
                exp_norm = grad_norm
            else:
                exp_cost = 0.99 * exp_cost + 0.01 * cost
                exp_length = 0.99 * exp_length + 0.01 * mean_length
                exp_norm = 0.99 * exp_norm + 0.01 * grad_norm

            cost = cost / mean_length

            if current_step % FLAGS.print_every == 0:
                print(
                    'epoch %d, iter %d, cost %f, exp_cost %f, grad norm %f, param norm %f, batch time %f, length mean/std %f/%f' %
                    (epoch, current_step, cost, exp_cost / exp_length, grad_norm, param_norm, iter_time,
                     mean_length,
                     std_length))

        ## Checkpoint
        checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt")
        model.saver.save(sess, checkpoint_path, global_step=model.global_step)

        ## Validate
        valid_cost = validate(model, sess, x_dev, y_dev)

        print("Epoch %d Validation cost: %f" % (epoch, valid_cost))

        ## Evaluate
        if FLAGS.evaluate == "CER":
            # CER evaluate does not do beam-decode with n-gram LM, Max Likelihood decode
            # because we don't have a language model (chop-off is clean-cut)

            # we evaluate on validation set
            cer = cer_evaluate(model, sess, x_dev, y_dev, epoch, delay_sampling=10)
            print("Epoch %d CER: %f" % (epoch, cer))

        if len(previous_losses) > 2 and valid_cost > max(previous_losses[-3:]):
            sess.run(model.learning_rate_decay_op)
        previous_losses.append(valid_cost)
        sys.stdout.flush()

    return model
Example #11
def train():
    """Train a translation model using NLC data."""
    # Prepare NLC data.
    logging.info("Preparing NLC data in %s" % FLAGS.data_dir)
    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        FLAGS.data_dir + os.sep + FLAGS.tokenizer.lower(),
        FLAGS.max_vocab_size,
        tokenizer=nlc_data.char_tokenizer)
    vocab, _ = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    logging.info("Vocabulary size: %d" % vocab_size)

    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    file_handler = logging.FileHandler("{0}/log.txt".format(FLAGS.train_dir))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.train_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        logging.info("Creating %d layers of %d units." %
                     (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)

        tic = time.time()
        params = tf.trainable_variables()
        num_params = sum(
            map(lambda t: np.prod(tf.shape(t.value()).eval()), params))
        toc = time.time()
        print("Number of params: %d (retrieval took %f secs)" %
              (num_params, toc - tic))

        epoch = 0
        best_epoch = 0
        train_costs = []
        valid_costs = []
        previous_valid_losses = []
        while FLAGS.epochs == 0 or epoch < FLAGS.epochs:
            epoch += 1
            current_step = 0
            epoch_cost = 0
            epoch_tic = time.time()
            for source_tokens, source_mask, target_tokens, target_mask in pair_iter(
                    x_train, y_train, FLAGS.batch_size, FLAGS.num_layers):
                # Get a batch and make a step.
                grad_norm, cost, param_norm = model.train(
                    sess, source_tokens, source_mask, target_tokens,
                    target_mask)

                lengths = np.sum(target_mask, axis=0)
                mean_length = np.mean(lengths)
                std_length = np.std(lengths)

                cost = cost / mean_length
                epoch_cost += cost
                current_step += 1

                if current_step % FLAGS.print_every == 0:
                    logging.info(
                        'epoch %d, iter %d, cost %f, length mean/std %f/%f' %
                        (epoch, current_step, cost, mean_length, std_length))

                    if (epoch >= FLAGS.anomaly_epochs) and \
                            (cost >= FLAGS.anomaly_threshold):
                        write_anomaly(
                            source_tokens, vocab_path, SOURCE_PATH + '_' +
                            str(epoch) + '_' + str(current_step))
                        write_anomaly(
                            target_tokens, vocab_path, TARGET_PATH + '_' +
                            str(epoch) + '_' + str(current_step))

            # One epoch average train cost
            train_costs.append(epoch_cost / current_step)

            # After one epoch average validate cost
            epoch_toc = time.time()
            epoch_time = epoch_toc - epoch_tic
            valid_cost = validate(model, sess, x_dev, y_dev)
            valid_costs.append(valid_cost)
            logging.info("Epoch %d Validation cost: %f time:to %2fs" %
                         (epoch, valid_cost, epoch_time))

            # Checkpoint
            checkpoint_path = os.path.join(FLAGS.train_dir, "best.ckpt")
            if len(previous_valid_losses
                   ) > 2 and valid_cost > previous_valid_losses[-1]:
                logging.info("Annealing learning rate by %f" %
                             FLAGS.learning_rate_decay_factor)
                sess.run(model.learning_rate_decay_op)
                model.saver.restore(sess,
                                    checkpoint_path + ("-%d" % best_epoch))
            else:
                previous_valid_losses.append(valid_cost)
                best_epoch = epoch
                model.saver.save(sess, checkpoint_path, global_step=epoch)

        pickle.dump([train_costs, valid_costs], open('costs_data.pkl', 'wb'))
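This variant also accumulates per-epoch average train and validation costs and pickles them at the end of training; reading them back is a short sketch:

import pickle

with open('costs_data.pkl', 'rb') as f:
    train_costs, valid_costs = pickle.load(f)
print("epochs: %d, final validation cost: %f" % (len(valid_costs), valid_costs[-1]))
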
Example #12
File: test.py  Project: davidrossouw/nlc
best_epoch = 2
vocab_size = 42
checkpoint_path = os.path.join(FLAGS.train_dir, "best.ckpt")

config = tf.ConfigProto(
    device_count={'GPU': 0}
)

with tf.Session(config=config) as sess:
    logging.info("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
    model = create_model(sess, vocab_size, False)

    model.saver.restore(sess, checkpoint_path + ("-%d" % best_epoch))

    valid_costs, valid_lengths = [], []
    for source_tokens, source_mask, target_tokens, target_mask in pair_iter("data/char/valid.ids.x", "data/char/valid.ids.y", 1,
                                                                            FLAGS.num_layers):
        # cost = model.test(sess, source_tokens, source_mask, target_tokens, target_mask)
        # valid_costs.append(cost * target_mask.shape[1])
        # valid_lengths.append(np.sum(target_mask[1:, :]))
        # enc = model.encode(sess, source_tokens, source_mask) # (48, 128, 256)
        # print(enc.shape)
        # pdb.set_trace()
        # dec = model.decode(sess, enc, target_tokens, target_mask) # (50, 128, 42)
        # dec = model.decode_beam(sess, enc)
        encoder_output = model.encode(sess, source_tokens, source_mask)
        # Decode
        beam_toks, probs = decode_beam(model, sess, encoder_output, FLAGS.beam_size)
        # De-tokenize
        beam_strs = detokenize(beam_toks, reverse_vocab)
        orig_str = "".join(reverse_vocab[x] for x in source_tokens.T[0])
        noisy_str = "".join(reverse_vocab[x] for x in target_tokens.T[0])
Example #13
def batch_decode(model, sess, x_dev, y_dev, alpha):

    error_source = []
    error_target = []
    error_generated = []
    generated_score = []
    generated_lm_score = []
    generated_nw_score = []
    target_lm_score = []
    target_nw_score = []

    count = 0
    for source_tokens, source_mask, target_tokens, target_mask in pair_iter(x_dev, y_dev, 1,
                                                                            FLAGS.num_layers, sort_and_shuffle=False):
      src_sent = detokenize_tgt(source_tokens, reverse_vocab)
      tgt_sent = detokenize_tgt(target_tokens, reverse_vocab)

      # Encode
      encoder_output = model.encode(sess, source_tokens, source_mask)
      # Decode
      beam_toks, probs = decode_beam(model, sess, encoder_output, FLAGS.beam_size)
      # De-tokenize
      beam_strs = detokenize(beam_toks, reverse_vocab)
      tgt_nw_score = network_score(model, sess, encoder_output, target_tokens)
      print("pair: %d network score: %f" % (count+1, tgt_nw_score))
      # Language Model ranking
      if not FLAGS.score:
        best_str = lm_rank(beam_strs, probs)
      else:
        best_str, rerank_score, nw_score, lm_score = lm_rank_score(beam_strs, probs)
        tgt_lm_score = lm.score(tgt_sent) / len(tgt_sent.split())

      print("%s | %s | %s" % (src_sent, tgt_sent, best_str))

      # see if this is too stupid, or doesn't work at all
      error_source.append(src_sent)
      error_target.append(tgt_sent)
      error_generated.append(best_str)
      if FLAGS.score:
        target_lm_score.append(tgt_lm_score)
        target_nw_score.append(tgt_nw_score)
        generated_score.append(rerank_score)
        generated_nw_score.append(nw_score)
        generated_lm_score.append(lm_score)
      count += 1

    """
    print("outputting in csv file...")

    # dump it out in train_dir
    with open("err_val_alpha_" + str(alpha) + ".csv", 'wb') as f:
      wrt = csv.writer(f)
      wrt.writerow(['Bad Input', 'Ground Truth', 'Network Score', 'LM Score', 'Generated Hypothesis', 'Combined Score', 'Network Score', 'LM Score'])
      if not FLAGS.score:
        for s, t, g in itertools.izip(error_source, error_target, error_generated):
          wrt.writerow([s, t, g])  # source, correct target, wrong target
      else:
        for s, t, tns, tls, g, gs, gns, gls in itertools.izip(error_source, error_target, target_nw_score, target_lm_score, error_generated, generated_score, generated_nw_score, generated_lm_score):
          wrt.writerow([s, t, tns, tls, g, gs, gns, gls])
    """

    #print("err_val_alpha_" + str(alpha) + ".csv" + "file finished")

    with open(FLAGS.tokenizer.lower() + "_runs" + str(FLAGS.beam_size) + "/alpha" + str(alpha) + ".txt", 'wb') as f:
      f.write("\n".join(error_generated))
Example #14
def batch_decode(model, sess, x_dev, y_dev, alpha):
    error_source = []
    error_target = []
    error_generated = []
    generated_score = []
    generated_lm_score = []
    generated_nw_score = []
    target_lm_score = []
    target_nw_score = []

    count = 0
    for source_tokens, source_mask, target_tokens, target_mask in pair_iter(
            x_dev, y_dev, 1, FLAGS.num_layers, sort_and_shuffle=False):
        src_sent = detokenize_tgt(source_tokens, reverse_vocab)
        tgt_sent = detokenize_tgt(target_tokens, reverse_vocab)

        # Encode
        encoder_output = model.encode(sess, source_tokens, source_mask)
        # Decode
        beam_toks, probs = decode_beam(model, sess, encoder_output,
                                       FLAGS.beam_size)
        # De-tokenize
        beam_strs = detokenize(beam_toks, reverse_vocab)
        tgt_nw_score = network_score(model, sess, encoder_output,
                                     target_tokens)
        print("pair: %d network score: %f" % (count + 1, tgt_nw_score))
        # Language Model ranking
        if not FLAGS.score:
            best_str = lm_rank(beam_strs, probs)
        else:
            best_str, rerank_score, nw_score, lm_score = lm_rank_score(
                beam_strs, probs)
            tgt_lm_score = lm.score(tgt_sent) / len(tgt_sent.split())

        print("%s | %s | %s" % (src_sent, tgt_sent, best_str))

        # see if this is too stupid, or doesn't work at all
        error_source.append(src_sent)
        error_target.append(tgt_sent)
        error_generated.append(best_str)
        if FLAGS.score:
            target_lm_score.append(tgt_lm_score)
            target_nw_score.append(tgt_nw_score)
            generated_score.append(rerank_score)
            generated_nw_score.append(nw_score)
            generated_lm_score.append(lm_score)
        count += 1
    """
    print("outputting in csv file...")

    # dump it out in train_dir
    with open("err_val_alpha_" + str(alpha) + ".csv", 'wb') as f:
      wrt = csv.writer(f)
      wrt.writerow(['Bad Input', 'Ground Truth', 'Network Score', 'LM Score', 'Generated Hypothesis', 'Combined Score', 'Network Score', 'LM Score'])
      if not FLAGS.score:
        for s, t, g in itertools.izip(error_source, error_target, error_generated):
          wrt.writerow([s, t, g])  # source, correct target, wrong target
      else:
        for s, t, tns, tls, g, gs, gns, gls in itertools.izip(error_source, error_target, target_nw_score, target_lm_score, error_generated, generated_score, generated_nw_score, generated_lm_score):
          wrt.writerow([s, t, tns, tls, g, gs, gns, gls])
    """

    # print("err_val_alpha_" + str(alpha) + ".csv" + "file finished")

    with open(
            FLAGS.tokenizer.lower() + "_runs" + str(FLAGS.beam_size) +
            "/alpha" + str(alpha) + ".txt", 'wb') as f:
        f.write("\n".join(error_generated))
Example #15
File: train.py  Project: NaveenAri/nlc
def train():
  """Train a translation model using NLC data."""
  # Prepare NLC data.
  print("Preparing NLC data in %s" % FLAGS.data_dir)

  x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
    FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size,
    tokenizer=get_tokenizer(FLAGS))
  vocab, _ = nlc_data.initialize_vocabulary(vocab_path)
  vocab_size = len(vocab)
  print("Vocabulary size: %d" % vocab_size)

  with tf.Session() as sess:
    print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
    model = create_model(sess, vocab_size, False)

    print('Initial validation cost: %f' % validate(model, sess, x_dev, y_dev))

    if False:
      tic = time.time()
      params = tf.trainable_variables()
      num_params = sum(map(lambda t: np.prod(tf.shape(t.value()).eval()), params))
      toc = time.time()
      print ("Number of params: %d (retreival took %f secs)" % (num_params, toc - tic))

    epoch = 0
    previous_losses = []
    exp_cost = None
    exp_length = None
    exp_norm = None
    while (FLAGS.epochs == 0 or epoch < FLAGS.epochs):
      epoch += 1
      current_step = 0

      ## Train
      for source_tokens, source_mask, target_tokens, target_mask in pair_iter(x_train, y_train, FLAGS.batch_size, FLAGS.num_layers):
        # Get a batch and make a step.
        tic = time.time()

        grad_norm, cost, param_norm = model.train(sess, source_tokens, source_mask, target_tokens, target_mask)

        toc = time.time()
        iter_time = toc - tic
        current_step += 1

        lengths = np.sum(target_mask, axis=0)
        mean_length = np.mean(lengths)
        std_length = np.std(lengths)

        if not exp_cost:
          exp_cost = cost
          exp_length = mean_length
          exp_norm = grad_norm
        else:
          exp_cost = 0.99*exp_cost + 0.01*cost
          exp_length = 0.99*exp_length + 0.01*mean_length
          exp_norm = 0.99*exp_norm + 0.01*grad_norm

        cost = cost / mean_length

        if current_step % FLAGS.print_every == 0:
          print('epoch %d, iter %d, cost %f, exp_cost %f, grad norm %f, param norm %f, batch time %f, length mean/std %f/%f' %
                (epoch, current_step, cost, exp_cost / exp_length, grad_norm, param_norm, iter_time, mean_length, std_length))

      ## Checkpoint
      checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt")
      model.saver.save(sess, checkpoint_path, global_step=model.global_step)

      ## Validate
      valid_cost = validate(model, sess, x_dev, y_dev)

      print("Epoch %d Validation cost: %f" % (epoch, valid_cost))

      if len(previous_losses) > 2 and valid_cost > max(previous_losses[-3:]):
        sess.run(model.learning_rate_decay_op)
      previous_losses.append(valid_cost)
      sys.stdout.flush()
Example #16
def process_samples(sess, actor, x, y):
    # this batch things together based on the batch size
    # in the end, we can just izip the arrays, and iterate on them
    rewards, actions_dist, actions, actions_mask = [], [], [], []
    source_tokenss, target_tokenss = [], []
    # actions: (time, batch_size, vocab) # condition on ground truth targets

    # for universal padding, we can iterate through the dataset, and determine the
    # optimal batch_max_len for each batch, then pass in
    # batch_pads can be a list, we keep track of an iterator, and each turn just pass it in

    # Note: action_dist is [T, batch_size, vocab_size]
    # target_tokens now have SOS, EOS
    for source_tokens, source_mask, target_tokens, target_mask in pair_iter(x, y, 1,
                                                                            FLAGS.num_layers,
                                                                            add_sos_eos_bool=True):

        source_tokenss.append(np.squeeze(source_tokens).tolist())
        target_tokenss.append(np.squeeze(target_tokens).tolist())

        encoder_output = actor.encode(sess, source_tokens, source_mask)
        best_tok, _ = decode_beam(actor, sess, encoder_output, 1)
        best_tok[0][-1] = nlc_data.EOS_ID  # last data mark as EOS
        padded_best_tok = padded(best_tok, depth=1, batch_pad=32)  # TODO: remember to switch to a universal pad list

        # best_tok has <SOS> and <EOS> now
        # way to solve batch problem - pad best_tok!

        decoder_output, _, _ = actor.decode(sess, encoder_output, np.matrix(padded_best_tok).T)

        tok_highest_prob = np.argmax(np.squeeze(decoder_output), axis=1)
        # clipped_tok_highest_prob = clip_after_eos(tok_highest_prob)  # hmmm, not sure if we should clip after eos
        clipped_tok_highest_prob = tok_highest_prob

        # print("beam token: {}".format(best_tok))
        # print("token with highest prob: ")
        # print(clipped_tok_highest_prob)
        # print("target toks: ")
        # print(np.squeeze(target_tokens))

        # TODO: test reward :(
        # TODO: if something is still not certain in this model, it's the reward
        reward = decompose_reward(np.squeeze(target_tokens), np.array(best_tok[0], dtype=np.int32))
        # print(reward)
        rewards.append(reward)

        # need to pad actions and make masks...
        # print("action shape: %s" % (best_tok.shape,))
        # print(best_tok[0])
        # print("action dist shape: %s" % (tok_prob.shape,))

        # print("token len: {}".format(clipped_tok_highest_prob.shape))
        # print("target len: {}".format(target_tokens.shape))
        # print("action dist shape: {}".format(decoder_output.shape))

        actions.append(clipped_tok_highest_prob)
        actions_dist.append(decoder_output)

        if len(rewards) % FLAGS.batch_size == 0:
            # padding problem solved!!
            # TODO: concatenate failed (why?)
            batch = (np.array(rewards), np.concatenate(actions_dist, axis=1), np.array(actions))

            # notice the transpose for source, not for target
            # notice no sos_eos for target!
            x_padded = np.array(padded(source_tokenss, FLAGS.num_layers)).T
            source_masks = (x_padded != nlc_data.PAD_ID).astype(np.int32)
            y_padded = np.array(padded(target_tokenss, 1))
            target_masks = (y_padded != nlc_data.PAD_ID).astype(np.int32)

            batch += (x_padded, source_masks, y_padded, target_masks)

            rewards, actions_dist, actions = [], [], []
            source_tokenss, target_tokenss = [], []

            yield batch

    # for residuals
    x_padded = np.array(padded(source_tokenss, FLAGS.num_layers)).T
    source_masks = (x_padded != nlc_data.PAD_ID).astype(np.int32)
    y_padded = np.array(padded(target_tokenss, 1))
    target_masks = (y_padded != nlc_data.PAD_ID).astype(np.int32)

    yield (np.array(rewards), np.concatenate(actions_dist, axis=1), np.array(actions),
           x_padded, source_masks, y_padded, target_masks)

    return
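
The batch assembly above right-pads the collected token lists and derives masks by comparing against the PAD id. The padded helper and nlc_data.PAD_ID belong to the nlc code base; a tiny standalone sketch of the same step with a stand-in pad function and PAD_ID = 0:

import numpy as np

PAD_ID = 0  # assumed padding id

def pad_to_max(seqs, pad_id=PAD_ID):
    # Minimal stand-in for the padded() helper: right-pad to the longest sequence.
    maxlen = max(len(s) for s in seqs)
    return [s + [pad_id] * (maxlen - len(s)) for s in seqs]

source_tokenss = [[5, 7, 9], [4, 6]]                  # hypothetical id sequences
x_padded = np.array(pad_to_max(source_tokenss)).T     # (time, batch), as above
source_masks = (x_padded != PAD_ID).astype(np.int32)
print(x_padded.shape, source_masks.sum(axis=0))       # lengths 3 and 2
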
Example #17
    def make_dot(self, fd):
        lg.info(funcname())

        # header        
        fd.write('digraph {\n')
        
        # timeline
        fd.write('{\n')
        fd.write('node [shape=plaintext];\n')
        fd.write(' -> '.join(map(lambda x: x.get_dot_name(), sorted(self.times))))
        fd.write(';\n')
        fd.write('}\n\n')
        
        # threads list
        fd.write('{\n')
        fd.write('rank = same; "past"; ')
        fd.write('; '.join(map(lambda x: x.get_dot_name(), self.threads)))
        fd.write('\n}\n\n')
        
        # time ranking
        fd.write('node [shape=box];\n')
        for tm, elist in self.time_events_th_uniq.iteritems():
            fd.write('{{ rank = same; {0}; '.format(tm.get_dot_name()))
            fd.write('; '.join(map(lambda x: x.get_dot_name(), elist)))
            fd.write('}\n')
        fd.write('\n')
        
        # events
        def node_list(node): 
            while node:
                yield node
                node = node.child
            raise StopIteration
        for th in self.threads:
            for a, b in pair_iter(node_list(th)):
                fd.write('{0} -> {1};\n'.format(a.get_dot_name(), b.get_dot_name()))
                pass
            pass
        
        # invisible nodes
        for ie in self.invis_nodes:
            fd.write(InvisibleLink(ie.parent, ie).get_dot_code()); fd.write('\n')
            fd.write(InvisibleLink(ie, ie.child).get_dot_code()); fd.write('\n')
        fd.write('\n')
        
        # nodes attributes
        def write_attribs(lis):
            for ev in lis:
                fd.write(ev.get_dot_node_name_attrib())
                fd.write('\n')
                pass
            fd.write('\n')
            pass
        write_attribs(self.events)
        write_attribs(self.invis_nodes)
        
        # ipc links
        for il in self.ipc_links:
            fd.write(il.get_dot_code()); fd.write('\n')
        
        
        
        # footer
        fd.write('}')
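
In this example and the next, pair_iter is not the NLC data iterator but a pairwise helper that yields consecutive (a, b) pairs from a sequence (here the chain of event nodes). A minimal sketch of such a helper, equivalent to the standard itertools pairwise recipe:

import itertools

def pair_iter_sketch(iterable):
    # Yield overlapping pairs: (x0, x1), (x1, x2), ...
    a, b = itertools.tee(iterable)
    next(b, None)
    return zip(a, b)  # itertools.izip(a, b) under Python 2

print(list(pair_iter_sketch([1, 2, 3, 4])))  # [(1, 2), (2, 3), (3, 4)]
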
Example #18
 def make_graph(self):
     lg.info(funcname())
     
     # parse all events and fill base structures
     for _,v in self.raw_events.iteritems():
         tm = TimeNode(v.time)
         if not tm in self.times:
             self.times.add(tm)
             self.time_events[tm] = list()
             pass
         
         th = ThreadNode(v.thread, v.proc)
         if not th in self.threads:
             self.threads.add(th)
             self.thread_events[th] = list()
             pass
         
         ev = EventNode(tm, th, v)
         self.thread_events[th].append(ev)
         self.time_events[tm].append(ev)
         self.events.append(ev)
         
         pass
     
     # build the linked list of event and thread nodes
     for th, elist in self.thread_events.iteritems():
         elist[0].set_parent(th)
         elist[0].first = True
         
         for a,b in pair_iter(elist):
             b.set_parent(a)
             pass
         pass
     
     # build time events list unique by thread
     for tm, elist in self.time_events.iteritems():
         self.time_events_th_uniq[tm] = list(unique_everseen(elist, lambda x: x.thread))
     
     # set invisible nodes to fix time ranking
     for tma, tmb in pair_iter(sorted(self.times)):
         elista = self.time_events_th_uniq[tma]
         elistb = self.time_events_th_uniq[tmb]
         have_threads_a = {e.thread for e in elista}
         have_threads_b = {e.thread for e in elistb}
         lack_threads_b = self.threads - have_threads_b
         
         for th in lack_threads_b:
             if th in have_threads_a:
                 e = (e for e in elista if e.thread == th).next()
                 while e.child and e.child.time == e.time: 
                     e = e.child
                     pass
                 if e.child:
                     ie = InvisibleNode(str(th) + str(tmb) + 'invis')
                     ie.thread = th
                     ie.time = tmb
                     e.set_sec_child(ie)
                     e.child.set_sec_parent(ie)
                     ie.set_parent(e)
                     ie.set_child(e.child)
                     self.time_events_th_uniq[tmb].append(ie)
                     
                     self.invis_nodes.append(ie)
                     pass
                 pass
             pass # for th in lac_th
         pass
     
     self.find_hor_links()
     self.shrink_graph()
     
     pass