Example #1
File: decode.py Project: sdlg/nlc
def decode():
  # Prepare NLC data.
  global reverse_vocab, vocab, lm

  if FLAGS.lmfile is not None:
    print("Loading Language model from %s" % FLAGS.lmfile)
    lm = kenlm.LanguageModel(FLAGS.lmfile)

  print("Preparing NLC data in %s" % FLAGS.data_dir)

  x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
    FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size,
    tokenizer=get_tokenizer(FLAGS))
  vocab, reverse_vocab = nlc_data.initialize_vocabulary(vocab_path)
  vocab_size = len(vocab)
  print("Vocabulary size: %d" % vocab_size)

  with tf.Session() as sess:
    print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
    model = create_model(sess, vocab_size, False)

    while True:
      sent = raw_input("Enter a sentence: ")

      output_sent = fix_sent(model, sess, sent)

      print("Candidate: ", output_sent)
Example #2
def decode():
    # Prepare NLC data.
    global reverse_vocab, vocab, lm

    if FLAGS.lmfile is not None:
        print("Loading Language model from %s" % FLAGS.lmfile)
        lm = kenlm.LanguageModel(FLAGS.lmfile)

    print("Preparing NLC data in %s" % FLAGS.data_dir)

    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size,
        tokenizer=get_tokenizer(FLAGS))
    vocab, reverse_vocab = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    config = tf.ConfigProto(
        device_count={'GPU': 0}
    )
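    # With every GPU hidden via device_count={'GPU': 0}, this decoder runs on the CPU only.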
    with tf.Session(config=config) as sess:
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)

        while True:
            sent = input("Enter a sentence: ")

            output_sent = fix_sent(model, sess, sent)

            print("Candidate: ", output_sent)
Example #3
def minimize_language(args):
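    # If args.splits is not set, minimize the single file in args.filename as a "conll" partition;
    # otherwise minimize the standard dev/test/train v4_gold_conll partitions.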
    tokenizer = util.get_tokenizer(args.tokenizer_name)

    if not args.splits:
        minimize_partition(args.filename, "conll", args, tokenizer)
    else:
        minimize_partition('dev', 'v4_gold_conll', args, tokenizer)
        minimize_partition('test', 'v4_gold_conll', args, tokenizer)
        minimize_partition('train', 'v4_gold_conll', args, tokenizer)
Example #4
    def __init__(self, config, language='english'):
        self.config = config
        self.language = language

        self.max_seg_len = config['max_segment_len']
        self.max_training_seg = config['max_training_sentences']
        self.data_dir = config['data_dir']

        self.tokenizer = util.get_tokenizer(config['bert_tokenizer_name'])
        self.tensor_samples, self.stored_info = None, None  # For dataset samples; lazy loading
Example #5
def tokenize(sent, vocab, depth=FLAGS.num_layers):
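    # Convert a sentence to token ids plus a 0/1 mask, padding the length up to a multiple of 2**(depth - 1).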
    align = pow(2, depth - 1)
    token_ids = nlc_data.sentence_to_token_ids(sent, vocab, get_tokenizer(FLAGS))
    ones = [1] * len(token_ids)
    pad = (align - len(token_ids)) % align

    token_ids += [nlc_data.PAD_ID] * pad
    ones += [0] * pad

    source = np.array(token_ids).reshape([-1, 1])
    mask = np.array(ones).reshape([-1, 1])

    return source, mask
Example #6
File: decode.py Project: sdlg/nlc
def tokenize(sent, vocab, depth=FLAGS.num_layers):
  align = pow(2, depth - 1)
  token_ids = nlc_data.sentence_to_token_ids(sent, vocab, get_tokenizer(FLAGS))
  ones = [1] * len(token_ids)
  pad = (align - len(token_ids)) % align

  token_ids += [nlc_data.PAD_ID] * pad
  ones += [0] * pad

  source = np.array(token_ids).reshape([-1, 1])
  mask = np.array(ones).reshape([-1, 1])

  return source, mask
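
A minimal usage sketch (assuming vocab has already been loaded with nlc_data.initialize_vocabulary, as in the decode() examples above; the sample sentence is only illustrative):

  source, mask = tokenize("she go to school yesterday", vocab)
  # source and mask are column vectors of shape (padded_length, 1)
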
Example #7
def decode():
    # Prepare NLC data.
    global reverse_vocab, vocab, lm

    if FLAGS.lmfile is not None:
        print("Loading Language model from %s" % FLAGS.lmfile)
        lm = kenlm.LanguageModel(FLAGS.lmfile)

    print("Preparing NLC data in %s" % FLAGS.data_dir)

    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(),
        FLAGS.max_vocab_size,
        tokenizer=get_tokenizer(FLAGS))
    vocab, reverse_vocab = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    with tf.Session() as sess:
        print("Creating %d layers of %d units." %
              (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)
        # import codecs
        # outfile = open('predictions.txt','w')
        # outfile2 = open('predictions_all.txt','w')
        # outfile = open('lambda_train.txt','w')
        # infile = open(FLAGS.data_dir+'/'+FLAGS.tokenizer.lower()+'/test.y.txt')
        # lines = infile.readlines()
        # index = 0
        # with codecs.open(FLAGS.data_dir+'/'+FLAGS.tokenizer.lower()+'/test.x.txt', encoding='utf-8') as fr:
        # with codecs.open('ytc_test.txt', encoding='utf-8') as fr:
        # for sent in fr:
        # print("Original: ", sent.strip().encode('utf-8'))
        # output_sent,all_sents,all_prob,all_lmscore = fix_sent(model, sess, sent.encode('utf-8'))
        # print("Revised: ", output_sent)
        # print('*'*30)
        # outfile.write(output_sent+'\n')
        # outfile2.write('\t'.join(all_sents)+'\n')
        # correct_sent = lines[index].strip('\n').strip('\r')
        # for i in range(len(all_sents)):
        #    if all_sents[i] == correct_sent:
        #        outfile.write('10 qid:'+str(index)+' 1:'+str(all_prob[i])+' 2:'+str(all_lmscore[i])+' #'+all_sents[i]+'\n')
        #    else:
        #        outfile.write('0 qid:'+str(index)+' 1:'+str(all_prob[i])+' 2:'+str(all_lmscore[i])+' #'+all_sents[i]+'\n')
        # index+=1
        while True:
            sent = raw_input("Enter a sentence: ")
            output_sent, _, _, _ = fix_sent(model, sess, sent)
            print("Candidate: ", output_sent)
Example #8
def setup_batch_decode(sess):
  # decode for dev-sets, in batches
  global reverse_vocab, vocab, lm

  if FLAGS.lmfile is not None:
    print("Loading Language model from %s" % FLAGS.lmfile)
    lm = kenlm.LanguageModel(FLAGS.lmfile)

  print("Preparing NLC data in %s" % FLAGS.data_dir)

  x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
    FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size,
    tokenizer=get_tokenizer(FLAGS), other_dev_path="/deep/group/nlp_data/nlc_data/ourdev/bpe")
  vocab, reverse_vocab = nlc_data.initialize_vocabulary(vocab_path, bpe=(FLAGS.tokenizer.lower()=="bpe"))
  vocab_size = len(vocab)
  print("Vocabulary size: %d" % vocab_size)

  print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
  model = create_model(sess, vocab_size, False)

  return model, x_dev, y_dev
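
A minimal usage sketch (session handling mirrors the decode() examples; consuming x_dev/y_dev afterwards, e.g. with pair_iter as in the train() examples below, is an assumption):

  with tf.Session() as sess:
    model, x_dev, y_dev = setup_batch_decode(sess)
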
Example #9
def setup_batch_decode(sess):
  # decode for dev-sets, in batches
  global reverse_vocab, vocab, lm

  if FLAGS.lmfile is not None:
    print("Loading Language model from %s" % FLAGS.lmfile)
    lm = kenlm.LanguageModel(FLAGS.lmfile)

  print("Preparing NLC data in %s" % FLAGS.data_dir)

  x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
    FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size,
    tokenizer=get_tokenizer(FLAGS), other_dev_path="/deep/group/nlp_data/nlc_data/ourdev/bpe")
  vocab, reverse_vocab = nlc_data.initialize_vocabulary(vocab_path, bpe=(FLAGS.tokenizer.lower()=="bpe"))
  vocab_size = len(vocab)
  print("Vocabulary size: %d" % vocab_size)

  print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
  model = create_model(sess, vocab_size, False)

  return model, x_dev, y_dev
Example #10
def decode():
    # Prepare NLC data.
    global reverse_vocab, vocab, lm

    if FLAGS.lmfile is not None:
        print("Loading Language model from %s" % FLAGS.lmfile)
        lm = kenlm.LanguageModel(FLAGS.lmfile)

    print("Preparing NLC data in %s" % FLAGS.data_dir)

    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size,
        tokenizer=get_tokenizer(FLAGS))
    vocab, reverse_vocab = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    print("Vocabulary size: %d" % vocab_size)

    with tf.Session() as sess:
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)
        if FLAGS.interactive:
            while True:
                sent = raw_input("Enter a sentence: ")
                if sent == 'exit':
                    exit(0)
                output_sent = fix_sent(model, sess, sent.decode('utf-8'))
                print("Candidate: ", output_sent)
        else:
            test_x_data = os.path.join(FLAGS.data_dir, FLAGS.tokenizer.lower()+'/test.x.txt')
            if not os.path.exists(test_x_data):
                print("Please provide {} to test.".format(test_x_data))
                exit(-1)
            with codecs.open(test_x_data, encoding='utf-8') as fr:
                for sent in fr:
                    print("Original: ", sent.strip().encode('utf-8'))
                    output_sent = fix_sent(model, sess, sent)
                    print("Revised: ", output_sent.encode('utf-8'))
                    print('*'*30)
Example #11
def minimize_language(args):
    tokenizer = util.get_tokenizer(args.tokenizer_name)

    minimize_partition('dev', 'v4_gold_conll', args, tokenizer)
    minimize_partition('test', 'v4_gold_conll', args, tokenizer)
    minimize_partition('train', 'v4_gold_conll', args, tokenizer)
Example #12
                        nargs='?',
                        default=256)
    parser.add_argument('-ep', '--epochs', type=int, nargs='?', default=7)

    args = parser.parse_args()

    batch_size = args.batch_size
    epochs = args.epochs

    # 1. data process
    logger.info("start train...")

    x_train, y_train, x_dev, y_dev, x_test, y_test = get_x_and_y_for_train_dev_test()

    tokenizer = util.get_tokenizer(num_words=constant.MAX_NUM_WORDS,
                                   texts=list(x_train) + list(x_dev))

    train_sequences, dev_sequences, test_sequences = util.get_train_dev_test_sequences(
        tokenizer, x_train, x_dev, x_test)

    padded_train_sequences, padded_dev_sequences, padded_test_sequences = util.get_train_dev_test_padded_sequences(
        maxlen=constant.MAX_LEN,
        train_sequences=train_sequences,
        dev_sequences=dev_sequences,
        test_sequences=test_sequences)

    # 2. embedding_matrix

    from config import glove_embedding_data_path

    num_words = constant.MAX_NUM_WORDS
Example #13
def train():
    """Train a translation model using NLC data."""
    # Prepare NLC data.
    logging.info("Preparing NLC data in %s" % FLAGS.data_dir)

    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(),
        FLAGS.max_vocab_size,
        tokenizer=get_tokenizer(FLAGS))
    vocab, _ = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    logging.info("Vocabulary size: %d" % vocab_size)

    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    file_handler = logging.FileHandler("{0}/log.txt".format(FLAGS.train_dir))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.train_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        logging.info("Creating %d layers of %d units." %
                     (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)

        logging.info('Initial validation cost: %f' %
                     validate(model, sess, x_dev, y_dev))

        if False:
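            # Disabled diagnostic: when enabled, counts and prints the number of trainable parameters.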
            tic = time.time()
            params = tf.trainable_variables()
            num_params = sum(
                map(lambda t: np.prod(tf.shape(t.value()).eval()), params))
            toc = time.time()
            print("Number of params: %d (retrieval took %f secs)" %
                  (num_params, toc - tic))

        epoch = 0
        best_epoch = 0
        previous_losses = []
        exp_cost = None
        exp_length = None
        exp_norm = None
        total_iters = 0
        start_time = time.time()
        while (FLAGS.epochs == 0 or epoch < FLAGS.epochs):
            epoch += 1
            current_step = 0

            ## Train
            epoch_tic = time.time()
            for source_tokens, source_mask, target_tokens, target_mask in pair_iter(
                    x_train, y_train, FLAGS.batch_size, FLAGS.num_layers):
                # Get a batch and make a step.
                tic = time.time()

                grad_norm, cost, param_norm = model.train(
                    sess, source_tokens, source_mask, target_tokens,
                    target_mask)

                toc = time.time()
                iter_time = toc - tic
                total_iters += np.sum(target_mask)
                tps = total_iters / (time.time() - start_time)
                current_step += 1

                lengths = np.sum(target_mask, axis=0)
                mean_length = np.mean(lengths)
                std_length = np.std(lengths)

                if not exp_cost:
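                    # First batch: seed the exponential moving averages of cost, target length and gradient norm.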
                    exp_cost = cost
                    exp_length = mean_length
                    exp_norm = grad_norm
                else:
                    exp_cost = 0.99 * exp_cost + 0.01 * cost
                    exp_length = 0.99 * exp_length + 0.01 * mean_length
                    exp_norm = 0.99 * exp_norm + 0.01 * grad_norm

                cost = cost / mean_length

                if current_step % FLAGS.print_every == 0:
                    logging.info(
                        'epoch %d, iter %d, cost %f, exp_cost %f, grad norm %f, param norm %f, tps %f, length mean/std %f/%f'
                        %
                        (epoch, current_step, cost, exp_cost / exp_length,
                         grad_norm, param_norm, tps, mean_length, std_length))
            epoch_toc = time.time()

            ## Checkpoint
            checkpoint_path = os.path.join(FLAGS.train_dir, "best.ckpt")

            ## Validate
            valid_cost = validate(model, sess, x_dev, y_dev)

            logging.info("Epoch %d Validation cost: %f time: %f" %
                         (epoch, valid_cost, epoch_toc - epoch_tic))

            if len(previous_losses) > 2 and valid_cost > previous_losses[-1]:
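                # Validation cost got worse: anneal the learning rate and roll back to the best checkpoint.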
                logging.info("Annealing learning rate by %f" %
                             FLAGS.learning_rate_decay_factor)
                sess.run(model.learning_rate_decay_op)
                model.saver.restore(sess,
                                    checkpoint_path + ("-%d" % best_epoch))
            else:
                previous_losses.append(valid_cost)
                best_epoch = epoch
                model.saver.save(sess, checkpoint_path, global_step=epoch)
            sys.stdout.flush()
Example #14
def train():
  """Train a translation model using NLC data."""
  # Prepare NLC data.
  logging.info("Preparing NLC data in %s" % FLAGS.data_dir)

  x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
    FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(), FLAGS.max_vocab_size,
    tokenizer=get_tokenizer(FLAGS))
  vocab, _ = nlc_data.initialize_vocabulary(vocab_path)
  vocab_size = len(vocab)
  logging.info("Vocabulary size: %d" % vocab_size)

  if not os.path.exists(FLAGS.train_dir):
    os.makedirs(FLAGS.train_dir)
  file_handler = logging.FileHandler("{0}/log.txt".format(FLAGS.train_dir))
  logging.getLogger().addHandler(file_handler)

  print(vars(FLAGS))
  with open(os.path.join(FLAGS.train_dir, "flags.json"), 'w') as fout:
    json.dump(FLAGS.__flags, fout)

  with tf.Session() as sess:
    logging.info("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
    model = create_model(sess, vocab_size, False)

    logging.info('Initial validation cost: %f' % validate(model, sess, x_dev, y_dev))

    if False:
      tic = time.time()
      params = tf.trainable_variables()
      num_params = sum(map(lambda t: np.prod(tf.shape(t.value()).eval()), params))
      toc = time.time()
      print("Number of params: %d (retrieval took %f secs)" % (num_params, toc - tic))

    epoch = 0
    best_epoch = 0
    previous_losses = []
    exp_cost = None
    exp_length = None
    exp_norm = None
    while (FLAGS.epochs == 0 or epoch < FLAGS.epochs):
      epoch += 1
      current_step = 0

      ## Train
      epoch_tic = time.time()
      for source_tokens, source_mask, target_tokens, target_mask in pair_iter(x_train, y_train, FLAGS.batch_size, FLAGS.num_layers):
        # Get a batch and make a step.
        tic = time.time()

        grad_norm, cost, param_norm = model.train(sess, source_tokens, source_mask, target_tokens, target_mask)

        toc = time.time()
        iter_time = toc - tic
        current_step += 1

        lengths = np.sum(target_mask, axis=0)
        mean_length = np.mean(lengths)
        std_length = np.std(lengths)

        if not exp_cost:
          exp_cost = cost
          exp_length = mean_length
          exp_norm = grad_norm
        else:
          exp_cost = 0.99*exp_cost + 0.01*cost
          exp_length = 0.99*exp_length + 0.01*mean_length
          exp_norm = 0.99*exp_norm + 0.01*grad_norm

        cost = cost / mean_length

        if current_step % FLAGS.print_every == 0:
          logging.info('epoch %d, iter %d, cost %f, exp_cost %f, grad norm %f, param norm %f, batch time %f, length mean/std %f/%f' %
                (epoch, current_step, cost, exp_cost / exp_length, grad_norm, param_norm, iter_time, mean_length, std_length))
      epoch_toc = time.time()

      ## Checkpoint
      checkpoint_path = os.path.join(FLAGS.train_dir, "best.ckpt")

      ## Validate
      valid_cost = validate(model, sess, x_dev, y_dev)

      logging.info("Epoch %d Validation cost: %f time: %f" % (epoch, valid_cost, epoch_toc - epoch_tic))

      if len(previous_losses) > 2 and valid_cost > previous_losses[-1]:
        logging.info("Annealing learning rate by %f" % FLAGS.learning_rate_decay_factor)
        sess.run(model.learning_rate_decay_op)
        model.saver.restore(sess, checkpoint_path + ("-%d" % best_epoch))
      else:
        previous_losses.append(valid_cost)
        best_epoch = epoch
        model.saver.save(sess, checkpoint_path, global_step=epoch)
      sys.stdout.flush()
Example #15
def train():
    """Train a translation model using NLC data."""
    # Prepare NLC data.
    logging.info("Preparing NLC data in %s" % FLAGS.data_dir)
    x_train, y_train, x_dev, y_dev, vocab_path = nlc_data.prepare_nlc_data(
        FLAGS.data_dir + '/' + FLAGS.tokenizer.lower(),
        FLAGS.max_vocab_size,
        tokenizer=get_tokenizer(FLAGS))
    vocab, _ = nlc_data.initialize_vocabulary(vocab_path)
    vocab_size = len(vocab)
    logging.info("Vocabulary size: %d" % vocab_size)

    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    file_handler = logging.FileHandler("{0}/log.txt".format(FLAGS.train_dir))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.train_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Session() as sess:
        logging.info("Creating %d layers of %d units." %
                     (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, vocab_size, False)

        tic = time.time()
        params = tf.trainable_variables()
        num_params = sum(
            map(lambda t: np.prod(tf.shape(t.value()).eval()), params))
        toc = time.time()
        print("Number of params: %d (retrieval took %f secs)" %
              (num_params, toc - tic))

        epoch = 0
        best_epoch = 0
        train_costs = []
        valid_costs = []
        previous_valid_losses = []
        while (FLAGS.epochs == 0 or epoch < FLAGS.epochs):
            epoch += 1
            current_step = 0
            epoch_cost = 0
            epoch_tic = time.time()
            for source_tokens, source_mask, target_tokens, target_mask in pair_iter(
                    x_train, y_train, FLAGS.batch_size, FLAGS.num_layers):
                # Get a batch and make a step.
                grad_norm, cost, param_norm = model.train(
                    sess, source_tokens, source_mask, target_tokens,
                    target_mask)

                lengths = np.sum(target_mask, axis=0)
                mean_length = np.mean(lengths)
                std_length = np.std(lengths)

                cost = cost / mean_length
                epoch_cost += cost
                current_step += 1

                if current_step % FLAGS.print_every == 0:
                    logging.info(
                        'epoch %d, iter %d, cost %f, length mean/std %f/%f' %
                        (epoch, current_step, cost, mean_length, std_length))

            # One epoch average train cost
            train_costs.append(epoch_cost / current_step)

            # After one epoch average validate cost
            epoch_toc = time.time()
            epoch_time = epoch_toc - epoch_tic
            valid_cost = validate(model, sess, x_dev, y_dev)
            valid_costs.append(valid_cost)
            logging.info("Epoch %d Validation cost: %f time: %2fs" %
                         (epoch, valid_cost, epoch_time))

            # Checkpoint
            checkpoint_path = os.path.join(FLAGS.train_dir, "best.ckpt")
            if len(previous_valid_losses) > 2 and valid_cost > previous_valid_losses[-1]:
                logging.info("Annealing learning rate by %f" %
                             FLAGS.learning_rate_decay_factor)
                sess.run(model.learning_rate_decay_op)
                model.saver.restore(sess,
                                    checkpoint_path + ("-%d" % best_epoch))
            else:
                previous_valid_losses.append(valid_cost)
                best_epoch = epoch
                model.saver.save(sess, checkpoint_path, global_step=epoch)

        import cPickle as pickle
        pickle.dump([train_costs, valid_costs], open('costs_data.pkl', 'wb'))