Example #1
def main(_):
  if not FLAGS.data_path:
    raise ValueError("Must set --data_path to RNNLM data directory")

  raw_data = reader.rnnlm_raw_data(FLAGS.data_path, FLAGS.vocab_path)
  train_data, valid_data, _, word_map = raw_data

  config = get_config()
  config.hidden_size = FLAGS.hidden_size
  config.vocab_size = len(word_map)
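  # separate config for evaluation: process the data one word at a time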
  eval_config = get_config()
  eval_config.batch_size = 1
  eval_config.num_steps = 1

  with tf.Graph().as_default():
    initializer = tf.random_uniform_initializer(-config.init_scale,
                                                config.init_scale)

    with tf.name_scope("Train"):
      train_input = RnnlmInput(config=config, data=train_data, name="TrainInput")
      with tf.variable_scope("Model", reuse=None, initializer=initializer):
        m = RnnlmModel(is_training=True, config=config, input_=train_input)
      tf.summary.scalar("Training Loss", m.cost)
      tf.summary.scalar("Learning Rate", m.lr)

    with tf.name_scope("Valid"):
      valid_input = RnnlmInput(config=config, data=valid_data, name="ValidInput")
      with tf.variable_scope("Model", reuse=True, initializer=initializer):
        mvalid = RnnlmModel(is_training=False, config=config, input_=valid_input)
      tf.summary.scalar("Validation Loss", mvalid.cost)

    sv = tf.train.Supervisor(logdir=FLAGS.save_path)
    with sv.managed_session() as session:
      for i in range(config.max_max_epoch):
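        # hold the base learning rate for the first max_epoch epochs, then decay it exponentially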
        lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)

        m.assign_lr(session, config.learning_rate * lr_decay)

        print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
        train_perplexity = run_epoch(session, m, eval_op=m.train_op,
                                     verbose=True)

        print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))
        valid_perplexity = run_epoch(session, mvalid)
        print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity))

      if FLAGS.save_path:
        print("Saving model to %s." % FLAGS.save_path)
        sv.saver.save(session, FLAGS.save_path)
Example #2
File: lstm.py Project: opendp/sotto-voce
def main(_):
  # Turn this on to try the model code with this source file itself!
  __TESTING = False

  if __TESTING:
    (train_data, valid_data), word_map = reader.rnnlm_gen_data(__file__, reader.__file__)
  else:
    if not FLAGS.data_path:
      raise ValueError("Must set --data_path to RNNLM data directory")

    raw_data = reader.rnnlm_raw_data(FLAGS.data_path, FLAGS.vocab_path)
    train_data, valid_data, _, word_map = raw_data

  config = get_config()
  config.hidden_size = FLAGS.hidden_size
  config.vocab_size = len(word_map)

  if __TESTING:
    # use a much smaller scale on our tiny test data
    config.num_steps = 8
    config.batch_size = 4

  model = RNNLMModel(config)
  train_producer = reader.RNNLMProducer(train_data, config.batch_size, config.num_steps)
  trainer = RNNLMModelTrainer(model, config)

  valid_producer = reader.RNNLMProducer(valid_data, config.batch_size, config.num_steps)

  # Save variables to disk so training can resume after a crash.
  # The data producer can also be saved to preserve feeding progress.
  checkpoint = tf.train.Checkpoint(trainer=trainer, data_feeder=train_producer)
  manager = tf.train.CheckpointManager(checkpoint, "checkpoints/", 5)
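  # One possible way to resume after an interruption (sketch, not part of the
  # original example) is to restore the latest checkpoint before the loop:
  #   checkpoint.restore(manager.latest_checkpoint)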

  for i in range(config.max_max_epoch):
    lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
    lr = config.learning_rate * lr_decay
    trainer.train_one_epoch(train_producer, lr)
    manager.save()

    eval_loss = trainer.evaluate(valid_producer)
    print("validating: loss={}".format(eval_loss))

  # Export
  print("Saving model to %s." % FLAGS.save_path)
  spec = [tf.TensorSpec(shape=[config.num_layers, 2, 1, config.hidden_size], dtype=data_type(), name="context"),
          tf.TensorSpec(shape=[1, 1], dtype=tf.int32, name="word_id")]
  cfunc = model.single_step.get_concrete_function(*spec)
  cfunc2 = model.get_initial_state.get_concrete_function()
  tf.saved_model.save(model, FLAGS.save_path, signatures={"single_step": cfunc, "get_initial_state": cfunc2})
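
A rough sketch of how the exported model could be loaded back through the TF2 SavedModel API; the path, the start word id, and the output keys of each signature are assumptions that depend on the model code above:

import tensorflow as tf

loaded = tf.saved_model.load("path/to/saved_model")        # same path as FLAGS.save_path
get_initial_state = loaded.signatures["get_initial_state"]
single_step = loaded.signatures["single_step"]

state = get_initial_state()                    # dict of output tensors
word_id = tf.constant([[0]], dtype=tf.int32)   # e.g. a start-of-sentence word id
# a decoding step would then look like:
#   outputs = single_step(context=<state tensor>, word_id=word_id)
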
Example #3
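        # (fragment) tail of a skip-gram batch generator: for each centre word it
        # samples num_skips distinct context positions inside the window and weights
        # each (centre, context) pair by the inverse of their distance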
        target = skip_window
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
            weights[i * num_skips + j] = abs(1.0 / (target - skip_window))
        buffer.append(train_data[data_index])
        data_index = (data_index + 1) % len(train_data)
    return batch, labels, weights


# Read the data
raw_data = reader.rnnlm_raw_data(FLAGS.data_path, FLAGS.vocab_path)
train_data, valid_data, _, word_map = raw_data
# train_data: the training word ids
reverse_wordmap = dict(zip(word_map.values(), word_map.keys()))
vocabulary_size = len(word_map)
cooc_data_index = 0
cooc_mat = lil_matrix((vocabulary_size, vocabulary_size), dtype=np.float32)
dataset_size = len(train_data)
print(cooc_mat.shape)


def generate_cooc(train_data, embed_batch_size, num_skips, skip_window):
    data_index = 0
    print('Running %d iterations to compute the co-occurrence matrix' %
          (dataset_size // embed_batch_size))
    for i in range(dataset_size // embed_batch_size):
Example #4
def main(_):
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to RNNLM data directory")

    raw_data = reader.rnnlm_raw_data(FLAGS.data_path, FLAGS.vocab_path)
    train_data, valid_data, _, word_map = raw_data
    reverse_wordmap = dict(zip(word_map.values(), word_map.keys()))
    vocabulary_size = len(word_map)

    config = get_config()
    config.vocab_size = len(word_map)
    config.hidden_size = FLAGS.hidden_size
    config.num_layers = FLAGS.num_layers
    config.batch_size = FLAGS.batch_size
    config.keep_prob = FLAGS.keep_prob
    config.max_max_epoch = FLAGS.max_epoch

    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1

    # word embedding parameter settings
    embed_batch_size = 128
    embedding_size = 200
    skip_window = 1
    num_skips = 2

    valid_size = 16
    valid_window = 100
    embed_num_steps = 100001

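    # validation words: half sampled from ids 0-99 and half from ids 1000-1099
    # (assuming frequency-ordered ids) so nearest neighbours can be inspected during training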
    valid_examples = np.array(
        random.sample(range(valid_window), valid_size // 2))
    valid_examples = np.append(
        valid_examples,
        random.sample(range(1000, 1000 + valid_window), valid_size // 2))
    num_sampled = 64

    graph_skipgram = tf.Graph()
    with graph_skipgram.as_default():
        train_dataset = tf.placeholder(tf.int32, shape=[embed_batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[embed_batch_size, 1])
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        softmax_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))

        embed = tf.nn.embedding_lookup(embeddings, train_dataset)
        print("Embed size: %s" % embed.get_shape().as_list())

        loss = tf.reduce_mean(
            tf.nn.sampled_softmax_loss(weights=softmax_weights,
                                       biases=softmax_biases,
                                       inputs=embed,
                                       labels=train_labels,
                                       num_sampled=num_sampled,
                                       num_classes=vocabulary_size))

        optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)
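        # unit-normalise the embeddings so the matmul below gives cosine similarity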
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        normalized_embeddings = embeddings / norm
        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                                  valid_dataset)
        similarity = tf.matmul(valid_embeddings,
                               tf.transpose(normalized_embeddings))

    with tf.Session(graph=graph_skipgram) as session:
        tf.global_variables_initializer().run()
        print("Initialized!")
        average_loss = 0
        for step in range(embed_num_steps):
            batch_data, batch_labels = generate_batch(
                train_data=train_data,
                embed_batch_size=embed_batch_size,
                num_skips=num_skips,
                skip_window=skip_window)
            feed_dict = {train_dataset: batch_data, train_labels: batch_labels}
            _, lo = session.run([optimizer, loss], feed_dict=feed_dict)
            average_loss += lo
            if step % 2000 == 0:
                if step > 0:
                    average_loss = average_loss / 2000
                print("Averge loss at step %d: %f" % (step, average_loss))
                average_loss = 0
            if step % 10000 == 0:
                sim = similarity.eval()
                for i in range(valid_size):
                    valid_word = reverse_wordmap[valid_examples[i]]
                    top_k = 8
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    log = "Nearest to %s:" % valid_word
                    for k in range(top_k):
                        close_word = reverse_wordmap[nearest[k]]
                        log = log + " " + close_word + ","
                    print(log)
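        # keep the trained (unit-norm) word vectors; they are passed to the LSTM LM below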
        final_embeddings = normalized_embeddings.eval()

    graph_lstmlm = tf.Graph()
    with graph_lstmlm.as_default():
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)

        with tf.name_scope("Train"):
            train_input = RnnlmInput(config=config,
                                     data=train_data,
                                     name="TrainInput")
            with tf.variable_scope("Model",
                                   reuse=None,
                                   initializer=initializer):
                m = RnnlmModel(is_training=True,
                               config=config,
                               input_=train_input,
                               skipgram_embeddings=final_embeddings)
            tf.summary.scalar("Training Loss", m.cost)
            tf.summary.scalar("Learning Rate", m.lr)

        with tf.name_scope("Valid"):
            valid_input = RnnlmInput(config=config,
                                     data=valid_data,
                                     name="ValidInput")
            with tf.variable_scope("Model",
                                   reuse=True,
                                   initializer=initializer):
                mvalid = RnnlmModel(is_training=False,
                                    config=config,
                                    input_=valid_input,
                                    skipgram_embeddings=final_embeddings)
            tf.summary.scalar("Validation Loss", mvalid.cost)

        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        with sv.managed_session() as session:
            for i in range(config.max_max_epoch):
                lr_decay = config.lr_decay**max(i + 1 - config.max_epoch, 0.0)
                m.assign_lr(session, config.learning_rate * lr_decay)

                print("Epoch: %d Learning rate: %.3f" %
                      (i + 1, session.run(m.lr)))
                train_perplexity = run_epoch(session,
                                             m,
                                             eval_op=m.train_op,
                                             verbose=True)

                print("Epoch: %d Train Perplexity: %.3f" %
                      (i + 1, train_perplexity))
                valid_perplexity = run_epoch(session, mvalid)
                print("Epoch: %d Valid Perplexity: %.3f" %
                      (i + 1, valid_perplexity))

            if FLAGS.save_path:
                print("Saving model to %s." % FLAGS.save_path)
                sv.saver.save(session, FLAGS.save_path)
Example #5
def main(_):
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to RNNLM data directory")

    raw_data = reader.rnnlm_raw_data(FLAGS.data_path, FLAGS.vocab_path)
    train_data, valid_data, _, word_map = raw_data

    config = get_config()
    config.vocab_size = len(word_map)
    config.hidden_size = FLAGS.hidden_size
    config.num_layers = FLAGS.num_layers
    config.batch_size = FLAGS.batch_size
    config.keep_prob = FLAGS.keep_prob
    config.max_max_epoch = FLAGS.max_epoch

    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1

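    # recreate the embedding variables with the shapes used during pre-training
    # so the skip-gram and CBOW checkpoints can be restored into them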
    skipgram_embeddings = tf.Variable(tf.random_normal(
        shape=[config.vocab_size, 100], stddev=1.0),
                                      dtype=tf.float32)
    cbow_embeddings = tf.Variable(tf.random_normal(
        shape=[config.vocab_size, 100], stddev=1.0),
                                  dtype=tf.float32)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, FLAGS.skip_ckpt)
        skipgram_embed = skipgram_embeddings.eval()
        saver.restore(sess, FLAGS.cbow_ckpt)
        cbow_embed = cbow_embeddings.eval()

        skipgram_embeddings = tf.constant(skipgram_embed,
                                          shape=[config.vocab_size, 100],
                                          dtype=tf.float32)
        cbow_embeddings = tf.constant(cbow_embed,
                                      shape=[config.vocab_size, 100],
                                      dtype=tf.float32)

        final_embed = tf.concat([skipgram_embeddings, cbow_embeddings], axis=1)
        final_embeddings = final_embed.eval()
        print(final_embeddings[1:5])

    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)

        with tf.name_scope("Train"):
            train_input = RnnlmInput(config=config,
                                     data=train_data,
                                     name="TrainInput")
            with tf.variable_scope("Model",
                                   reuse=None,
                                   initializer=initializer):
                m = RnnlmModel(is_training=True,
                               config=config,
                               input_=train_input,
                               concat_embeddings=final_embeddings)
            tf.summary.scalar("Training Loss", m.cost)
            tf.summary.scalar("Learning Rate", m.lr)

        with tf.name_scope("Valid"):
            valid_input = RnnlmInput(config=config,
                                     data=valid_data,
                                     name="ValidInput")
            with tf.variable_scope("Model",
                                   reuse=True,
                                   initializer=initializer):
                mvalid = RnnlmModel(is_training=False,
                                    config=config,
                                    input_=valid_input,
                                    concat_embeddings=final_embeddings)
            tf.summary.scalar("Validation Loss", mvalid.cost)

        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        with sv.managed_session() as session:
            for i in range(config.max_max_epoch):
                lr_decay = config.lr_decay**max(i + 1 - config.max_epoch, 0.0)
                m.assign_lr(session, config.learning_rate * lr_decay)

                print("Epoch: %d Learning rate: %.3f" %
                      (i + 1, session.run(m.lr)))
                train_perplexity = run_epoch(session,
                                             m,
                                             eval_op=m.train_op,
                                             verbose=True)

                print("Epoch: %d Train Perplexity: %.3f" %
                      (i + 1, train_perplexity))
                valid_perplexity = run_epoch(session, mvalid)
                print("Epoch: %d Valid Perplexity: %.3f" %
                      (i + 1, valid_perplexity))

            if FLAGS.save_path:
                print("Saving model to %s." % FLAGS.save_path)
                sv.saver.save(session, FLAGS.save_path)