def main(_):
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to RNNLM data directory")

    raw_data = reader.rnnlm_raw_data(FLAGS.data_path, FLAGS.vocab_path)
    train_data, valid_data, _, word_map = raw_data

    config = get_config()
    config.hidden_size = FLAGS.hidden_size
    config.vocab_size = len(word_map)

    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1

    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)

        with tf.name_scope("Train"):
            train_input = RnnlmInput(config=config, data=train_data,
                                     name="TrainInput")
            with tf.variable_scope("Model", reuse=None, initializer=initializer):
                m = RnnlmModel(is_training=True, config=config, input_=train_input)
            tf.summary.scalar("Training Loss", m.cost)
            tf.summary.scalar("Learning Rate", m.lr)

        with tf.name_scope("Valid"):
            valid_input = RnnlmInput(config=config, data=valid_data,
                                     name="ValidInput")
            with tf.variable_scope("Model", reuse=True, initializer=initializer):
                mvalid = RnnlmModel(is_training=False, config=config,
                                    input_=valid_input)
            tf.summary.scalar("Validation Loss", mvalid.cost)

        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        with sv.managed_session() as session:
            for i in range(config.max_max_epoch):
                lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
                m.assign_lr(session, config.learning_rate * lr_decay)

                print("Epoch: %d Learning rate: %.3f"
                      % (i + 1, session.run(m.lr)))
                train_perplexity = run_epoch(session, m, eval_op=m.train_op,
                                             verbose=True)
                print("Epoch: %d Train Perplexity: %.3f"
                      % (i + 1, train_perplexity))
                valid_perplexity = run_epoch(session, mvalid)
                print("Epoch: %d Valid Perplexity: %.3f"
                      % (i + 1, valid_perplexity))

            if FLAGS.save_path:
                print("Saving model to %s." % FLAGS.save_path)
                sv.saver.save(session, FLAGS.save_path)
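
# Illustration (values assumed, not taken from get_config): the schedule above keeps
# the learning rate flat for the first max_epoch epochs, then decays it geometrically.
learning_rate, lr_decay, max_epoch = 1.0, 0.5, 4
for i in range(8):
    decay = lr_decay ** max(i + 1 - max_epoch, 0.0)
    print("epoch %d: lr = %.3f" % (i + 1, learning_rate * decay))
# epochs 1-4 stay at 1.000, then 0.500, 0.250, 0.125, 0.062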
def main(_):
    # Turn this on to try the model code with this source file itself!
    __TESTING = False

    if __TESTING:
        (train_data, valid_data), word_map = reader.rnnlm_gen_data(
            __file__, reader.__file__)
    else:
        if not FLAGS.data_path:
            raise ValueError("Must set --data_path to RNNLM data directory")

        raw_data = reader.rnnlm_raw_data(FLAGS.data_path, FLAGS.vocab_path)
        train_data, valid_data, _, word_map = raw_data

    config = get_config()
    config.hidden_size = FLAGS.hidden_size
    config.vocab_size = len(word_map)
    if __TESTING:
        # Use a much smaller scale on our tiny test data.
        config.num_steps = 8
        config.batch_size = 4

    model = RNNLMModel(config)
    train_producer = reader.RNNLMProducer(train_data, config.batch_size,
                                          config.num_steps)
    trainer = RNNLMModelTrainer(model, config)
    valid_producer = reader.RNNLMProducer(valid_data, config.batch_size,
                                          config.num_steps)

    # Save variables to disk so training can resume after a crash.
    # The data producer can also be checkpointed to preserve feeding progress.
    checkpoint = tf.train.Checkpoint(trainer=trainer, data_feeder=train_producer)
    manager = tf.train.CheckpointManager(checkpoint, "checkpoints/", 5)

    for i in range(config.max_max_epoch):
        lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
        lr = config.learning_rate * lr_decay
        trainer.train_one_epoch(train_producer, lr)
        manager.save()

        eval_loss = trainer.evaluate(valid_producer)
        print("validating: loss={}".format(eval_loss))

    # Export the trained model as a SavedModel with explicit signatures.
    print("Saving model to %s." % FLAGS.save_path)
    spec = [
        tf.TensorSpec(shape=[config.num_layers, 2, 1, config.hidden_size],
                      dtype=data_type(), name="context"),
        tf.TensorSpec(shape=[1, 1], dtype=tf.int32, name="word_id"),
    ]
    cfunc = model.single_step.get_concrete_function(*spec)
    cfunc2 = model.get_initial_state.get_concrete_function()
    tf.saved_model.save(model, FLAGS.save_path,
                        signatures={"single_step": cfunc,
                                    "get_initial_state": cfunc2})
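
# A minimal inference sketch (not part of the original source): load the SavedModel
# exported above and generate words one step at a time. The signature names
# ("single_step", "get_initial_state") and input names ("context", "word_id") come
# from the export code; the output keys below are assumptions and depend on what
# RNNLMModel.single_step actually returns.
loaded = tf.saved_model.load(FLAGS.save_path)
single_step = loaded.signatures["single_step"]
get_initial_state = loaded.signatures["get_initial_state"]

state = list(get_initial_state().values())[0]            # [num_layers, 2, 1, hidden_size]
word_id = tf.constant([[0]], dtype=tf.int32)             # e.g. a start-of-sentence id
for _ in range(20):
    outputs = single_step(context=state, word_id=word_id)
    logits, state = outputs["logits"], outputs["state"]  # assumed output keys
    word_id = tf.reshape(
        tf.argmax(logits, axis=-1, output_type=tf.int32), [1, 1])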
        # Tail of a skip-gram batch generator that, unlike the plain generate_batch,
        # also returns a 1/distance weight for each (center, context) pair.
        target = skip_window
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
            weights[i * num_skips + j] = abs(1.0 / (target - skip_window))
        buffer.append(train_data[data_index])
        data_index = (data_index + 1) % len(train_data)
    return batch, labels, weights


# Read the data.
raw_data = reader.rnnlm_raw_data(FLAGS.data_path, FLAGS.vocab_path)
train_data, valid_data, _, word_map = raw_data  # train_data: word-id sequence
reverse_wordmap = dict(zip(word_map.values(), word_map.keys()))
vocabulary_size = len(word_map)

cooc_data_index = 0
cooc_mat = lil_matrix((vocabulary_size, vocabulary_size), dtype=np.float32)
dataset_size = len(train_data)
print(cooc_mat.shape)


def generate_cooc(train_data, embed_batch_size, num_skips, skip_window):
    data_index = 0
    print('Running %d iterations to compute the co-occurrence matrix'
          % (dataset_size // embed_batch_size))
    for i in range(dataset_size // embed_batch_size):
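
# Quick illustration (standalone, not part of the original file): the weight above is
# the reciprocal of the distance between the context position and the center position,
# so for skip_window = 2 the immediate neighbours count 1.0 and the outer ones 0.5.
skip_window = 2
for target in (0, 1, 3, 4):                       # window positions; the center is index 2
    print(target, abs(1.0 / (target - skip_window)))
# -> 0 0.5, 1 1.0, 3 1.0, 4 0.5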
def main(_):
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to RNNLM data directory")

    raw_data = reader.rnnlm_raw_data(FLAGS.data_path, FLAGS.vocab_path)
    train_data, valid_data, _, word_map = raw_data
    reverse_wordmap = dict(zip(word_map.values(), word_map.keys()))
    vocabulary_size = len(word_map)

    config = get_config()
    config.vocab_size = len(word_map)
    config.hidden_size = FLAGS.hidden_size
    config.num_layers = FLAGS.num_layers
    config.batch_size = FLAGS.batch_size
    config.keep_prob = FLAGS.keep_prob
    config.max_max_epoch = FLAGS.max_epoch

    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1

    # Word-embedding (skip-gram) hyperparameters.
    embed_batch_size = 128
    embedding_size = 200
    skip_window = 1
    num_skips = 2
    valid_size = 16
    valid_window = 100
    embed_num_steps = 100001
    valid_examples = np.array(
        random.sample(range(valid_window), valid_size // 2))
    valid_examples = np.append(
        valid_examples,
        random.sample(range(1000, 1000 + valid_window), valid_size // 2))
    num_sampled = 64

    # Build the skip-gram graph used to pre-train the word embeddings.
    graph_skipgram = tf.Graph()
    with graph_skipgram.as_default():
        train_dataset = tf.placeholder(tf.int32, shape=[embed_batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[embed_batch_size, 1])
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        softmax_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))

        embed = tf.nn.embedding_lookup(embeddings, train_dataset)
        print("Embed size: %s" % embed.get_shape().as_list())

        loss = tf.reduce_mean(
            tf.nn.sampled_softmax_loss(weights=softmax_weights,
                                       biases=softmax_biases,
                                       inputs=embed,
                                       labels=train_labels,
                                       num_sampled=num_sampled,
                                       num_classes=vocabulary_size))
        optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        normalized_embeddings = embeddings / norm
        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                                  valid_dataset)
        similarity = tf.matmul(valid_embeddings,
                               tf.transpose(normalized_embeddings))

    # Pre-train the skip-gram embeddings.
    with tf.Session(graph=graph_skipgram) as session:
        tf.global_variables_initializer().run()
        print("Initialized!")
        average_loss = 0
        for step in range(embed_num_steps):
            batch_data, batch_labels = generate_batch(
                train_data=train_data, embed_batch_size=embed_batch_size,
                num_skips=num_skips, skip_window=skip_window)
            feed_dict = {train_dataset: batch_data, train_labels: batch_labels}
            _, lo = session.run([optimizer, loss], feed_dict=feed_dict)
            average_loss += lo

            if step % 2000 == 0:
                if step > 0:
                    average_loss = average_loss / 2000
                print("Average loss at step %d: %f" % (step, average_loss))
                average_loss = 0

            if step % 10000 == 0:
                sim = similarity.eval()
                for i in range(valid_size):
                    valid_word = reverse_wordmap[valid_examples[i]]
                    top_k = 8
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    log = "Nearest to %s:" % valid_word
                    for k in range(top_k):
                        close_word = reverse_wordmap[nearest[k]]
                        log = log + " " + close_word + ","
                    print(log)

        final_embeddings = normalized_embeddings.eval()

    # Build the LSTM language model, feeding it the pre-trained embeddings.
    graph_lstmlm = tf.Graph()
    with graph_lstmlm.as_default():
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)

        with tf.name_scope("Train"):
            train_input = RnnlmInput(config=config, data=train_data,
                                     name="TrainInput")
            with tf.variable_scope("Model", reuse=None, initializer=initializer):
                m = RnnlmModel(is_training=True, config=config,
                               input_=train_input,
                               skipgram_embeddings=final_embeddings)
            tf.summary.scalar("Training Loss", m.cost)
            tf.summary.scalar("Learning Rate", m.lr)

        with tf.name_scope("Valid"):
            valid_input = RnnlmInput(config=config, data=valid_data,
                                     name="ValidInput")
            with tf.variable_scope("Model", reuse=True, initializer=initializer):
                mvalid = RnnlmModel(is_training=False, config=config,
                                    input_=valid_input,
                                    skipgram_embeddings=final_embeddings)
            tf.summary.scalar("Validation Loss", mvalid.cost)

        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        with sv.managed_session() as session:
            for i in range(config.max_max_epoch):
                lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
                m.assign_lr(session, config.learning_rate * lr_decay)

                print("Epoch: %d Learning rate: %.3f"
                      % (i + 1, session.run(m.lr)))
                train_perplexity = run_epoch(session, m, eval_op=m.train_op,
                                             verbose=True)
                print("Epoch: %d Train Perplexity: %.3f"
                      % (i + 1, train_perplexity))
                valid_perplexity = run_epoch(session, mvalid)
                print("Epoch: %d Valid Perplexity: %.3f"
                      % (i + 1, valid_perplexity))

            if FLAGS.save_path:
                print("Saving model to %s." % FLAGS.save_path)
                sv.saver.save(session, FLAGS.save_path)
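
# Hypothetical sketch (RnnlmModel's internals are not shown in this excerpt): one
# common way to consume a pre-trained matrix like final_embeddings is to use it as
# the initial value of the model's embedding table instead of a random init. The
# names input_.input_data and "embedding" follow the PTB-style model convention and
# are assumptions here.
with tf.device("/cpu:0"):
    embedding = tf.get_variable(
        "embedding",
        initializer=tf.constant(skipgram_embeddings, dtype=tf.float32),
        trainable=True)  # set False to freeze the pre-trained vectors
    inputs = tf.nn.embedding_lookup(embedding, input_.input_data)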
def main(_):
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to RNNLM data directory")

    raw_data = reader.rnnlm_raw_data(FLAGS.data_path, FLAGS.vocab_path)
    train_data, valid_data, _, word_map = raw_data

    config = get_config()
    config.vocab_size = len(word_map)
    config.hidden_size = FLAGS.hidden_size
    config.num_layers = FLAGS.num_layers
    config.batch_size = FLAGS.batch_size
    config.keep_prob = FLAGS.keep_prob
    config.max_max_epoch = FLAGS.max_epoch

    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1

    # Restore the pre-trained skip-gram and CBOW embedding tables from their
    # checkpoints, then concatenate them into a single [vocab_size, 200] matrix.
    skipgram_embeddings = tf.Variable(tf.random_normal(
        shape=[config.vocab_size, 100], stddev=1.0), dtype=tf.float32)
    cbow_embeddings = tf.Variable(tf.random_normal(
        shape=[config.vocab_size, 100], stddev=1.0), dtype=tf.float32)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        saver.restore(sess, FLAGS.skip_ckpt)
        skipgram_embed = skipgram_embeddings.eval()
        saver.restore(sess, FLAGS.cbow_ckpt)
        cbow_embed = cbow_embeddings.eval()

        skipgram_embeddings = tf.constant(skipgram_embed,
                                          shape=[config.vocab_size, 100],
                                          dtype=tf.float32)
        cbow_embeddings = tf.constant(cbow_embed,
                                      shape=[config.vocab_size, 100],
                                      dtype=tf.float32)
        final_embed = tf.concat([skipgram_embeddings, cbow_embeddings], axis=1)
        final_embeddings = final_embed.eval()
        print(final_embeddings[1:5])

    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)

        with tf.name_scope("Train"):
            train_input = RnnlmInput(config=config, data=train_data,
                                     name="TrainInput")
            with tf.variable_scope("Model", reuse=None, initializer=initializer):
                m = RnnlmModel(is_training=True, config=config,
                               input_=train_input,
                               concat_embeddings=final_embeddings)
            tf.summary.scalar("Training Loss", m.cost)
            tf.summary.scalar("Learning Rate", m.lr)

        with tf.name_scope("Valid"):
            valid_input = RnnlmInput(config=config, data=valid_data,
                                     name="ValidInput")
            with tf.variable_scope("Model", reuse=True, initializer=initializer):
                mvalid = RnnlmModel(is_training=False, config=config,
                                    input_=valid_input,
                                    concat_embeddings=final_embeddings)
            tf.summary.scalar("Validation Loss", mvalid.cost)

        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        with sv.managed_session() as session:
            for i in range(config.max_max_epoch):
                lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
                m.assign_lr(session, config.learning_rate * lr_decay)

                print("Epoch: %d Learning rate: %.3f"
                      % (i + 1, session.run(m.lr)))
                train_perplexity = run_epoch(session, m, eval_op=m.train_op,
                                             verbose=True)
                print("Epoch: %d Train Perplexity: %.3f"
                      % (i + 1, train_perplexity))
                valid_perplexity = run_epoch(session, mvalid)
                print("Epoch: %d Valid Perplexity: %.3f"
                      % (i + 1, valid_perplexity))

            if FLAGS.save_path:
                print("Saving model to %s." % FLAGS.save_path)
                sv.saver.save(session, FLAGS.save_path)
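
# Quick shape check (illustration only, placeholder values): concatenating the two
# 100-dimensional tables along axis 1 yields one [vocab_size, 200] embedding matrix.
import numpy as np

vocab_size = 10000                                   # assumed vocabulary size
skipgram_embed = np.zeros((vocab_size, 100), dtype=np.float32)
cbow_embed = np.zeros((vocab_size, 100), dtype=np.float32)
final_embeddings = np.concatenate([skipgram_embed, cbow_embed], axis=1)
assert final_embeddings.shape == (vocab_size, 200)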