def force_decode():
    # force_decode: run the model over the test set and write every
    # sentence's score to a file.
    mylog_section("READ DATA")

    # Read in the test data. The test set does not build a new vocabulary;
    # it simply reuses the one built during training.
    test_data_bucket, _buckets, test_data_order = read_test(
        FLAGS.data_cache_dir, FLAGS.test_path,
        get_vocab_path(FLAGS.data_cache_dir), FLAGS.L, FLAGS.n_bucket)
    vocab_path = get_vocab_path(FLAGS.data_cache_dir)
    real_vocab_size = get_real_vocab_size(vocab_path)

    FLAGS._buckets = _buckets
    FLAGS.real_vocab_size = real_vocab_size

    test_bucket_sizes = [
        len(test_data_bucket[b]) for b in range(len(_buckets))
    ]
    test_total_size = int(sum(test_bucket_sizes))

    # reports
    mylog_section("REPORT")
    mylog("real_vocab_size: {}".format(FLAGS.real_vocab_size))
    mylog("_buckets: {}".format(FLAGS._buckets))
    mylog("FORCE_DECODE:")
    mylog("total: {}".format(test_total_size))
    mylog("bucket_sizes: {}".format(test_bucket_sizes))

    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = FLAGS.allow_growth

    mylog_section("IN TENSORFLOW")

    with tf.Session(config=config) as sess:

        # runtime profile
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog("Creating Model")
        model = create_model(sess, run_options, run_metadata)

        mylog_section("All Variables")
        show_all_variables()

        # Disable dropout at test time.
        sess.run(model.dropoutRate.assign(1.0))

        batch_size = FLAGS.batch_size

        mylog_section("Data Iterators")
        dite = DataIterator(model, test_data_bucket, len(_buckets),
                            batch_size, None, data_order=test_data_order)
        ite = dite.next_original()

        fdump = open(FLAGS.score_file, 'w')

        i_sent = 0

        mylog_section("FORCE_DECODING")

        for inputs, outputs, weights, bucket_id in ite:
            # inputs:    [[_GO], [1], [2], [3], [_EOS], [pad_id], [pad_id]]
            # positions: [4]
            mylog("--- decoding {}/{} sent ---".format(i_sent, test_total_size))
            i_sent += 1

            L = model.step(sess, inputs, outputs, weights, bucket_id,
                           forward_only=True, dump_lstm=False)
            mylog("LOSS: {}".format(L))
            fdump.write("{}\n".format(L))

        fdump.close()
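# force_decode() writes one loss per line to FLAGS.score_file. Below is a
# minimal sketch (not part of the original pipeline) of how that file could
# be consumed to turn the per-sentence losses into a corpus-level perplexity.
# It assumes each dumped loss is a mean per-token cross-entropy; whether that
# holds depends on how model.step() normalizes its return value.

import math

def corpus_perplexity(score_file):
    # Average the per-sentence losses, then exponentiate.
    losses = [float(line) for line in open(score_file) if line.strip()]
    return math.exp(sum(losses) / len(losses))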
def dump_lstm():
    # dump_lstm: run the model forward and dump the LSTM hidden states to a file.
    mylog_section("READ DATA")

    test_data_bucket, _buckets, test_data_order = read_test(
        FLAGS.data_cache_dir, FLAGS.test_path,
        get_vocab_path(FLAGS.data_cache_dir), FLAGS.L, FLAGS.n_bucket)
    vocab_path = get_vocab_path(FLAGS.data_cache_dir)
    real_vocab_size = get_real_vocab_size(vocab_path)

    FLAGS._buckets = _buckets
    FLAGS.real_vocab_size = real_vocab_size

    test_bucket_sizes = [
        len(test_data_bucket[b]) for b in range(len(_buckets))
    ]
    test_total_size = int(sum(test_bucket_sizes))

    # reports
    mylog_section("REPORT")
    mylog("real_vocab_size: {}".format(FLAGS.real_vocab_size))
    mylog("_buckets: {}".format(FLAGS._buckets))
    mylog("DUMP_LSTM:")
    mylog("total: {}".format(test_total_size))
    mylog("buckets: {}".format(test_bucket_sizes))

    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = FLAGS.allow_growth

    with tf.Session(config=config) as sess:

        # runtime profile
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog_section("MODEL")

        mylog("Creating Model")
        model = create_model(sess, run_options, run_metadata)

        mylog("Init tensors to dump")
        model.init_dump_states()

        # dump_graph('graph.txt')
        mylog_section("All Variables")
        show_all_variables()

        # Disable dropout at test time.
        sess.run(model.dropoutRate.assign(1.0))

        batch_size = FLAGS.batch_size

        mylog_section("Data Iterators")
        dite = DataIterator(model, test_data_bucket, len(_buckets),
                            batch_size, None, data_order=test_data_order)
        ite = dite.next_original()

        fdump = open(FLAGS.dump_file, 'wb')

        mylog_section("DUMP_LSTM")

        i_sent = 0
        for inputs, outputs, weights, bucket_id in ite:
            # inputs:    [[_GO], [1], [2], [3], [_EOS], [pad_id], [pad_id]]
            # positions: [4]
            mylog("--- decoding {}/{} sent ---".format(i_sent, test_total_size))
            i_sent += 1

            L, states = model.step(sess, inputs, outputs, weights, bucket_id,
                                   forward_only=True, dump_lstm=True)
            mylog("LOSS: {}".format(L))

            sw = StateWrapper()
            sw.create(inputs, outputs, weights, states)
            sw.save_to_stream(fdump)

            # do the following convert:
            # inputs:    [[pad_id], [1], [2], [pad_id], [pad_id], [pad_id]]
            # positions: [2]

        fdump.close()
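# dump_lstm() serializes one StateWrapper per sentence into FLAGS.dump_file.
# A minimal sketch of reading the dump back; `load_from_stream` is a
# hypothetical counterpart to the save_to_stream() call above, and the real
# reader depends on StateWrapper's actual serialization format.

def read_dumped_states(dump_file):
    wrappers = []
    with open(dump_file, 'rb') as f:
        while True:
            sw = StateWrapper()
            # hypothetical API: returns False once the stream is exhausted
            if not sw.load_from_stream(f):
                break
            wrappers.append(sw)
    return wrappers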
def beam_decode():
    mylog("Reading Data...")

    from_vocab_path, to_vocab_path, real_vocab_size_from, real_vocab_size_to = \
        data_utils.get_vocab_info(FLAGS.data_cache_dir)

    FLAGS._buckets = _buckets
    FLAGS._beam_buckets = _beam_buckets
    FLAGS.real_vocab_size_from = real_vocab_size_from
    FLAGS.real_vocab_size_to = real_vocab_size_to

    # Get the path of the test file after it has been converted to word ids.
    from_test = data_utils.prepare_test_data(FLAGS.data_cache_dir,
                                             FLAGS.test_path_from,
                                             from_vocab_path)

    test_data_bucket, test_data_order = read_data_test(from_test)

    test_bucket_sizes = [
        len(test_data_bucket[b]) for b in xrange(len(_beam_buckets))
    ]
    test_total_size = int(sum(test_bucket_sizes))

    # reports
    mylog("from_vocab_size: {}".format(FLAGS.from_vocab_size))
    mylog("to_vocab_size: {}".format(FLAGS.to_vocab_size))
    mylog("_beam_buckets: {}".format(FLAGS._beam_buckets))
    mylog("BEAM_DECODE:")
    mylog("total: {}".format(test_total_size))
    mylog("buckets: {}".format(test_bucket_sizes))

    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = FLAGS.allow_growth

    with tf.Session(config=config) as sess:

        # runtime profile
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog("Creating Model")
        model = create_model(sess, run_options, run_metadata)
        show_all_variables()

        # Disable dropout at test time.
        sess.run(model.dropoutRate.assign(1.0))

        batch_size = FLAGS.batch_size

        dite = DataIterator(model, test_data_bucket, len(_beam_buckets),
                            batch_size, None, data_order=test_data_order)
        ite = dite.next_original()

        i_sent = 0
        targets = []

        for source_inputs, bucket_id, length in ite:

            print("--- decoding {}/{} sent ---".format(i_sent, test_total_size))
            i_sent += 1

            results = []  # list of (sentence, score)
            scores = [0.0] * FLAGS.beam_size
            sentences = [[] for x in xrange(FLAGS.beam_size)]
            beam_parent = range(FLAGS.beam_size)

            target_inputs = [data_utils.GO_ID] * FLAGS.beam_size

            min_target_length = int(length * FLAGS.min_ratio) + 1
            max_target_length = int(length * FLAGS.max_ratio) + 1  # include EOS

            for i in xrange(max_target_length):
                if i == 0:
                    top_value, top_index, eos_value = model.beam_step(
                        sess, bucket_id, index=i, sources=source_inputs,
                        target_inputs=target_inputs)
                else:
                    top_value, top_index, eos_value = model.beam_step(
                        sess, bucket_id, index=i, target_inputs=target_inputs,
                        beam_parent=beam_parent)

                # top_value = [array[batch_size, batch_size]]
                # top_index = [array[batch_size, batch_size]]
                # eos_value = [array[batch_size, 1]]

                # expand: global_queue is re-created before each word is
                # predicted; it collects (score, parent beam, word) for every
                # candidate extension, and the best candidates are then chosen
                # by score.
                global_queue = []

                # On the first decoder step all beams are identical, so only
                # the first row is expanded; its output becomes the input of
                # the second step.
                if i == 0:
                    nrow = 1
                else:
                    nrow = FLAGS.beam_size

                if i == max_target_length - 1:
                    # last step: force every beam to end with EOS
                    for row in xrange(nrow):
                        score = scores[row] + np.log(eos_value[0][row, 0])
                        word_index = data_utils.EOS_ID
                        beam_index = row
                        global_queue.append((score, beam_index, word_index))
                else:
                    # Loop over every parent beam (row) and each of its
                    # candidate words (col); each column of top_index is one
                    # candidate. The new score is the parent's accumulated
                    # score plus the log-probability of the new word, i.e. a
                    # product of probabilities in log space.
                    for row in xrange(nrow):
                        for col in xrange(top_index[0].shape[1]):
                            score = scores[row] + np.log(top_value[0][row, col])
                            word_index = top_index[0][row, col]
                            beam_index = row  # parent
                            global_queue.append((score, beam_index, word_index))

                global_queue = sorted(global_queue, key=lambda x: -x[0])

                if FLAGS.print_beam:
                    print("--------- Step {} --------".format(i))

                # Take the top beam_size entries of the sorted global_queue
                # into target_inputs / beam_parent / scores / temp_sentences
                # for the next step.
                target_inputs = []
                beam_parent = []
                scores = []
                temp_sentences = []

                for j, (score, beam_index, word_index) in enumerate(global_queue):
                    if word_index == data_utils.EOS_ID:
                        if len(sentences[beam_index]) + 1 < min_target_length:
                            continue
                        # Every completed sentence is added to results.
                        results.append(
                            (sentences[beam_index] + [word_index], score))
                        if FLAGS.print_beam:
                            print("*Beam:{} Father:{} word:{} score:{}".format(
                                j, beam_index, word_index, score))
                        continue

                    if FLAGS.print_beam:
                        print("Beam:{} Father:{} word:{} score:{}".format(
                            j, beam_index, word_index, score))

                    beam_parent.append(beam_index)
                    target_inputs.append(word_index)
                    scores.append(score)
                    temp_sentences.append(sentences[beam_index] + [word_index])

                    # Keep only the top beam_size candidates for the next step.
                    if len(scores) >= FLAGS.beam_size:
                        break

                # Cannot fill beam_size (too many beams ended in EOS):
                # pad the beam by repeating the last surviving candidate.
                while len(scores) < FLAGS.beam_size and i < max_target_length - 1:
                    beam_parent.append(beam_parent[-1])
                    target_inputs.append(target_inputs[-1])
                    scores.append(scores[-1])
                    temp_sentences.append(temp_sentences[-1])

                sentences = temp_sentences

            # print the 1 best: sort all completed hypotheses for this source
            # sentence and keep the best one.
            results = sorted(results, key=lambda x: -x[1])
            targets.append(results[0][0])

        # Convert all predicted id sequences back to tokens and write them to
        # the output file.
        data_utils.ids_to_tokens(targets, to_vocab_path, FLAGS.decode_output)