def test_read_train_dev_test():
    data_dir = os.path.join(root_dir, "data/ptb")
    train_path = os.path.join(data_dir, "train")
    dev_path = os.path.join(data_dir, "valid")
    test_path = os.path.join(data_dir, "test")
    cache_dir = os.path.join(root_dir, "data/ptb/cache")
    vocab_size = 20000
    if not os.path.exists(cache_dir):
        os.mkdir(cache_dir)

    train_data_bucket, dev_data_bucket, _buckets, vocab_path = data_util.read_train_dev(
        cache_dir, train_path, dev_path, vocab_size, 100, 10)
    test_data_bucket, _buckets_test = data_util.read_test(
        cache_dir, test_path, vocab_path, vocab_size, 100, 10)

    def print_bucket_data(data):
        # one entry per bucket: how many sentences landed in it
        sizes = [len(x) for x in data]
        print(sizes)

    print("_buckets: {}\n".format(_buckets))
    print_bucket_data(train_data_bucket)
    print_bucket_data(dev_data_bucket)
    print("_buckets_test: {}\n".format(_buckets_test))
    print_bucket_data(test_data_bucket)
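
# The bucket report printed above is easier to interpret with the bucketing
# rule in mind: each sentence goes into the smallest bucket whose length can
# hold it, and sentences longer than the largest bucket are dropped. A
# minimal, self-contained sketch of that assignment (the real logic lives in
# data_util.read_train_dev; the bucket lengths and sentences below are made
# up for illustration):
def _bucketing_sketch():
    buckets = [10, 20, 50, 100]               # hypothetical bucket lengths
    sentences = [["a"] * 4, ["b"] * 15, ["c"] * 120]
    bucketed = [[] for _ in buckets]
    for sent in sentences:
        for b, size in enumerate(buckets):
            if len(sent) <= size:             # smallest bucket that fits
                bucketed[b].append(sent)
                break
        # sentences longer than the largest bucket fall through and are skipped
    print([len(b) for b in bucketed])         # -> [1, 1, 0]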
def dump_lstm():
    # dump the LSTM hidden states to FLAGS.dump_file
    mylog_section("READ DATA")

    test_data_bucket, _buckets, test_data_order = read_test(
        FLAGS.data_cache_dir, FLAGS.test_path,
        get_vocab_path(FLAGS.data_cache_dir), FLAGS.L, FLAGS.n_bucket)
    vocab_path = get_vocab_path(FLAGS.data_cache_dir)
    real_vocab_size = get_real_vocab_size(vocab_path)

    FLAGS._buckets = _buckets
    FLAGS.real_vocab_size = real_vocab_size
    test_bucket_sizes = [len(test_data_bucket[b]) for b in range(len(_buckets))]
    test_total_size = int(sum(test_bucket_sizes))

    # reports
    mylog_section("REPORT")
    mylog("real_vocab_size: {}".format(FLAGS.real_vocab_size))
    mylog("_buckets: {}".format(FLAGS._buckets))
    mylog("DUMP_LSTM:")
    mylog("total: {}".format(test_total_size))
    mylog("buckets: {}".format(test_bucket_sizes))

    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = FLAGS.allow_growth

    with tf.Session(config=config) as sess:

        # runtime profile
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog_section("MODEL")
        mylog("Creating Model")
        model = create_model(sess, run_options, run_metadata)
        mylog("Init tensors to dump")
        model.init_dump_states()

        # dump_graph('graph.txt')
        mylog_section("All Variables")
        show_all_variables()

        sess.run(model.dropoutRate.assign(1.0))

        start_id = 0
        n_steps = 0
        batch_size = FLAGS.batch_size

        mylog_section("Data Iterators")
        dite = DataIterator(model, test_data_bucket, len(_buckets),
                            batch_size, None, data_order=test_data_order)
        ite = dite.next_original()

        fdump = open(FLAGS.dump_file, 'wb')

        mylog_section("DUMP_LSTM")

        i_sent = 0
        for inputs, outputs, weights, bucket_id in ite:
            # inputs: [[_GO], [1], [2], [3], [_EOS], [pad_id], [pad_id]]
            # positions: [4]
            mylog("--- decoding {}/{} sent ---".format(i_sent, test_total_size))
            i_sent += 1

            L, states = model.step(sess, inputs, outputs, weights, bucket_id,
                                   forward_only=True, dump_lstm=True)
            mylog("LOSS: {}".format(L))

            sw = StateWrapper()
            sw.create(inputs, outputs, weights, states)
            sw.save_to_stream(fdump)

        fdump.close()
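
# dump_lstm streams one record per sentence into a single binary file. The
# actual on-disk layout is whatever StateWrapper.save_to_stream writes; the
# sketch below only illustrates the general append-then-replay pattern using
# pickle, with hypothetical field names that are not the repo's real format:
import pickle

def _dump_and_reload_sketch(path="states.bin"):
    records = [{"inputs": [1, 2, 3], "states": [0.1, 0.2]},
               {"inputs": [4, 5], "states": [0.3]}]
    with open(path, "wb") as f:
        for rec in records:                   # append one record per sentence
            pickle.dump(rec, f)
    loaded = []
    with open(path, "rb") as f:
        while True:                           # replay records until EOF
            try:
                loaded.append(pickle.load(f))
            except EOFError:
                break
    assert loaded == records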
def force_decode():
    # force-decode the test set: generate a file containing each sentence's score (loss)
    mylog_section("READ DATA")

    # Read the test data. No new vocabulary is built for the test set; the
    # vocabulary created during training is loaded and reused directly.
    test_data_bucket, _buckets, test_data_order = read_test(
        FLAGS.data_cache_dir, FLAGS.test_path,
        get_vocab_path(FLAGS.data_cache_dir), FLAGS.L, FLAGS.n_bucket)
    vocab_path = get_vocab_path(FLAGS.data_cache_dir)
    real_vocab_size = get_real_vocab_size(vocab_path)

    FLAGS._buckets = _buckets
    FLAGS.real_vocab_size = real_vocab_size
    test_bucket_sizes = [len(test_data_bucket[b]) for b in range(len(_buckets))]
    test_total_size = int(sum(test_bucket_sizes))

    # reports
    mylog_section("REPORT")
    mylog("real_vocab_size: {}".format(FLAGS.real_vocab_size))
    mylog("_buckets: {}".format(FLAGS._buckets))
    mylog("FORCE_DECODE:")
    mylog("total: {}".format(test_total_size))
    mylog("bucket_sizes: {}".format(test_bucket_sizes))

    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = FLAGS.allow_growth

    mylog_section("IN TENSORFLOW")

    with tf.Session(config=config) as sess:

        # runtime profile
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog("Creating Model")
        model = create_model(sess, run_options, run_metadata)

        mylog_section("All Variables")
        show_all_variables()

        sess.run(model.dropoutRate.assign(1.0))

        batch_size = FLAGS.batch_size

        mylog_section("Data Iterators")
        dite = DataIterator(model, test_data_bucket, len(_buckets),
                            batch_size, None, data_order=test_data_order)
        ite = dite.next_original()

        fdump = open(FLAGS.score_file, 'w')

        i_sent = 0

        mylog_section("FORCE_DECODING")

        for inputs, outputs, weights, bucket_id in ite:
            # inputs: [[_GO], [1], [2], [3], [_EOS], [pad_id], [pad_id]]
            # positions: [4]
            mylog("--- decoding {}/{} sent ---".format(i_sent, test_total_size))
            i_sent += 1
            L = model.step(sess, inputs, outputs, weights, bucket_id,
                           forward_only=True, dump_lstm=False)
            mylog("LOSS: {}".format(L))
            fdump.write("{}\n".format(L))

        fdump.close()
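
# force_decode leaves one loss per line in FLAGS.score_file. Assuming the
# loss returned by model.step is the mean per-token negative log likelihood
# (the usual convention, but verify against the model before relying on it),
# each line converts to a sentence-level perplexity as exp(loss). A minimal
# sketch of that post-processing step (the file name is a placeholder):
import math

def _scores_to_perplexity(score_path="scores.txt"):
    with open(score_path) as f:
        losses = [float(line) for line in f if line.strip()]
    return [math.exp(loss) for loss in losses]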
def beam_decode():
    # not yet tested:
    # known issues:
    # should use next_original

    mylog("Reading Data...")
    test_data_bucket, _buckets, test_data_order = read_test(
        FLAGS.data_cache_dir, FLAGS.test_path,
        get_vocab_path(FLAGS.data_cache_dir), FLAGS.L, FLAGS.n_bucket)
    vocab_path = get_vocab_path(FLAGS.data_cache_dir)
    real_vocab_size = get_real_vocab_size(vocab_path)

    FLAGS._buckets = _buckets
    FLAGS.real_vocab_size = real_vocab_size
    test_bucket_sizes = [len(test_data_bucket[b]) for b in range(len(_buckets))]
    test_total_size = int(sum(test_bucket_sizes))

    # reports
    mylog("real_vocab_size: {}".format(FLAGS.real_vocab_size))
    mylog("_buckets: {}".format(FLAGS._buckets))
    mylog("BEAM_DECODE:")
    mylog("total: {}".format(test_total_size))
    mylog("buckets: {}".format(test_bucket_sizes))

    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = FLAGS.allow_growth

    with tf.Session(config=config) as sess:

        # runtime profile
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog("Creating Model")
        model = create_model(sess, run_options, run_metadata)

        mylog("before init_beam_decoder()")
        show_all_variables()

        model.init_beam_decoder(beam_size=FLAGS.beam_size,
                                max_steps=FLAGS.beam_step)
        model.init_beam_variables(sess)

        mylog("after init_beam_decoder()")
        show_all_variables()

        sess.run(model.dropoutRate.assign(1.0))

        start_id = 0
        n_steps = 0
        batch_size = FLAGS.batch_size

        dite = DataIterator(model, test_data_bucket, len(_buckets),
                            batch_size, None)
        ite = dite.next_sequence(stop=True, test=True)

        i_sent = 0
        for inputs, positions, valids, bucket_id in ite:
            # user: [0]
            # inputs: [[_GO], [1], [2], [3], [_EOS], [pad_id], [pad_id]]
            # positions: [4]
            print("--- decoding {}/{} sent ---".format(i_sent, test_total_size))
            i_sent += 1

            # do the following convert:
            # inputs: [[pad_id], [1], [2], [pad_id], [pad_id], [pad_id]]
            # positions: [2]
            PAD_ID = 0
            last_history = inputs[positions[0]]
            inputs_beam = [last_history * FLAGS.beam_size]
            inputs[positions[0]] = list([PAD_ID] * FLAGS.beam_size)
            inputs[positions[0] - 1] = list([PAD_ID] * FLAGS.beam_size)
            positions[0] = positions[0] - 2 if positions[0] >= 2 else 0

            scores = [0.0] * FLAGS.beam_size
            sentences = [[] for _ in range(FLAGS.beam_size)]
            beam_parent = list(range(FLAGS.beam_size))

            for i in range(FLAGS.beam_step):
                if i == 0:
                    top_value, top_index = model.beam_step(
                        sess, index=i, word_inputs_history=inputs,
                        sequence_length=positions,
                        word_inputs_beam=inputs_beam)
                else:
                    top_value, top_index = model.beam_step(
                        sess, index=i, word_inputs_beam=inputs_beam,
                        beam_parent=beam_parent)

                # expand: every (beam, word) pair becomes a candidate,
                # scored by accumulated log probability
                global_queue = []
                if i == 0:
                    nrow = 1
                else:
                    nrow = top_index[0].shape[0]

                for row in range(nrow):
                    for col in range(top_index[0].shape[1]):
                        score = scores[row] + np.log(top_value[0][row, col])
                        word_index = top_index[0][row, col]
                        beam_index = row
                        # optionally forbid repeating a word within a beam
                        if FLAGS.no_repeat and word_index in sentences[beam_index]:
                            continue
                        global_queue.append((score, beam_index, word_index))

                # prune: keep only the beam_size best candidates
                global_queue = sorted(global_queue, key=lambda x: -x[0])

                inputs_beam = []
                beam_parent = []
                scores = []
                temp_sentences = []

                if FLAGS.print_beam:
                    print("--------- Step {} --------".format(i))

                for j, (score, beam_index, word_index) in enumerate(
                        global_queue[:FLAGS.beam_size]):
                    if FLAGS.print_beam:
                        print("Beam:{} Father:{} word:{} score:{}".format(
                            j, beam_index, word_index, score))
                    beam_parent.append(beam_index)
                    inputs_beam.append(word_index)
                    scores.append(score)
                    temp_sentences.append(sentences[beam_index] + [word_index])

                inputs_beam = [inputs_beam]
                sentences = temp_sentences

            if FLAGS.print_beam:
                print(sentences)
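
# The heart of beam_decode is the expand-then-prune step: every live beam
# proposes its top words, all candidates are ranked by accumulated log
# probability, and only beam_size survivors continue to the next step. A
# self-contained sketch of one such step on a made-up probability matrix
# (the shapes mirror top_value[0] / top_index[0] above):
import numpy as np

def _beam_expand_sketch():
    beam_size = 2
    scores = [0.0, -0.5]                       # accumulated log-probs per beam
    probs = np.array([[0.6, 0.3],              # row = beam, col = candidate word
                      [0.5, 0.4]])
    words = np.array([[7, 2],
                      [7, 9]])
    queue = [(scores[r] + np.log(probs[r, c]), r, words[r, c])
             for r in range(beam_size) for c in range(probs.shape[1])]
    queue.sort(key=lambda x: -x[0])            # best accumulated score first
    survivors = queue[:beam_size]              # prune back to beam_size
    return survivors                           # (score, parent_beam, word)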