def encode(sess, model, config, sentences):
    # Load vocabularies.
    en_vocab_path = os.path.join(config.data_dir,
                                 "vocab%d.in" % config.en_vocab_size)
    fr_vocab_path = os.path.join(config.data_dir,
                                 "vocab%d.out" % config.fr_vocab_size)
    en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
    _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

    means = []
    logvars = []
    for sentence in sentences:
        # Get token-ids for the input sentence.
        token_ids = data_utils.sentence_to_token_ids(sentence, en_vocab)
        # Which bucket does it belong to?
        bucket_id = len(config.buckets) - 1
        for i, bucket in enumerate(config.buckets):
            if bucket[0] >= len(token_ids):
                bucket_id = i
                break
        else:
            logging.warning("Sentence truncated: %s", sentence)
        # Get a 1-element batch to feed the sentence to the model.
        encoder_inputs, _, _ = model.get_batch(
            {bucket_id: [(token_ids, [])]}, bucket_id)
        # Encode the sentence to the parameters of its latent Gaussian.
        mean, logvar = model.encode_to_latent(sess, encoder_inputs, bucket_id)
        means.append(mean)
        logvars.append(logvar)
    return means, logvars
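# Usage sketch for encode(), hedged: sess/model/config are assumed to come
# from this project's usual create_model setup; the sentences and the assert
# are hypothetical, not from the original source.
sentences = ["how are you ?", "see you tomorrow ."]
means, logvars = encode(sess, model, config, sentences)
# One (mean, logvar) pair per sentence: the parameters of the latent
# Gaussian the encoder inferred for that input.
assert len(means) == len(sentences) and len(logvars) == len(sentences)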
def prepare_data(gen_config):
    train_path = gen_config.train_dir
    voc_file_path = [train_path + _incorpus, train_path + _outcorpus]
    vocab_path = os.path.join(gen_config.train_dir,
                              "vocab%d.all" % gen_config.vocab_size)
    data_utils.create_vocabulary(vocab_path, voc_file_path,
                                 gen_config.vocab_size)
    vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

    query_path = os.path.join(train_path + _incorpus)
    answer_path = os.path.join(train_path + _outcorpus)
    null_path = os.path.join(train_path + _nullcorpus)
    gen_path = os.path.join(train_path + _gencorpus)

    dev_set = read_data(gen_config, query_path, answer_path)
    train_set = read_data(gen_config, query_path, answer_path,
                          gen_config.max_train_data_size)
    negative_train_set = read_data(gen_config, null_path, gen_path,
                                   gen_config.max_train_data_size)
    null_train_set = read_data(gen_config, null_path, answer_path,
                               gen_config.max_train_data_size)

    return vocab, rev_vocab, dev_set, train_set, negative_train_set, null_train_set
def get_dataset(gen_config):
    """Fetch the training data.

    :return: vocab, rev_vocab, dev_set, train_set
    """
    train_path = os.path.join(gen_config.train_dir, "chitchat.train")
    voc_file_path = [train_path + ".answer", train_path + ".query"]
    vocab_path = os.path.join(gen_config.train_dir,
                              "vocab%d.all" % gen_config.vocab_size)
    data_utils.create_vocabulary(vocab_path, voc_file_path,
                                 gen_config.vocab_size)
    # vocab maps token -> id, e.g. {dog: 0, cat: 1}; rev_vocab is the
    # reverse list, e.g. [dog, cat].
    vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

    print(just("Preparing Chitchat gen_data in %s" % gen_config.train_dir))
    train_query, train_answer, dev_query, dev_answer = data_utils.prepare_chitchat_data(
        gen_config.train_dir, vocab, gen_config.vocab_size)

    # Read disc_data into buckets and compute their sizes.
    print(just("Reading development and training gen_data (limit: %d)."
               % gen_config.max_train_data_size))
    dev_set = read_data(gen_config, dev_query, dev_answer)
    train_set = read_data(gen_config, train_query, train_answer,
                          gen_config.max_train_data_size)

    return vocab, rev_vocab, dev_set, train_set
def test_decoder(gen_config):
    with tf.Session() as sess:
        model = create_model(sess, gen_config, forward_only=True,
                             name_scope=gen_config.name_model)
        model.batch_size = 1

        train_path = os.path.join(gen_config.train_dir, "chitchat.train")
        voc_file_path = [train_path + ".answer", train_path + ".query"]
        vocab_path = os.path.join(gen_config.train_dir,
                                  "vocab%d.all" % gen_config.vocab_size)
        data_utils.create_vocabulary(vocab_path, voc_file_path,
                                     gen_config.vocab_size)
        vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), vocab)
            print("token_id: ", token_ids)
            # Which bucket does it belong to?
            bucket_id = len(gen_config.buckets) - 1
            for i, bucket in enumerate(gen_config.buckets):
                if bucket[0] >= len(token_ids):
                    bucket_id = i
                    break
            else:
                print("Sentence truncated: %s" % sentence)

            encoder_inputs, decoder_inputs, target_weights, _, _ = model.get_batch(
                {bucket_id: [(token_ids, [1])]}, bucket_id, model.batch_size,
                type=0)
            print("bucket_id: ", bucket_id)
            print("encoder_inputs:", encoder_inputs)
            print("decoder_inputs:", decoder_inputs)
            print("target_weights:", target_weights)

            _, _, output_logits = model.step(sess, encoder_inputs,
                                             decoder_inputs, target_weights,
                                             bucket_id, True)
            print("output_logits", np.shape(output_logits))

            # Greedy decoding: take the argmax token at every step and cut
            # the output at the first EOS symbol.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            print(" ".join(
                [tf.compat.as_str(rev_vocab[output]) for output in outputs]))

            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def init_session(sess, gen_config):
    model = gens.create_model(sess, gen_config, forward_only=True,
                              name_scope="genModel")
    vocab_path = os.path.join(gen_config.train_dir,
                              "vocab%d.all" % gen_config.vocab_size)
    vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)
    return sess, model, vocab, rev_vocab
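# Usage sketch for init_session, hedged: gen_config is assumed to carry
# train_dir/vocab_size as in the surrounding snippets, and the follow-up
# tokenization call is an assumption modeled on test_decoder above.
with tf.Session() as sess:
    sess, model, vocab, rev_vocab = init_session(sess, gen_config)
    # Tokenize a query with the vocabulary the helper just loaded.
    token_ids = data_utils.sentence_to_token_ids(
        tf.compat.as_bytes("hello !"), vocab)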
def reward_evaluate(disc_config):
    beam_path = os.path.join(disc_config.beam_dir, disc_config.beam_file)
    vocab_path = os.path.join(disc_config.data_dir,
                              "vocab%d.all" % disc_config.vocab_size)
    vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

    answer_train_ids_path = beam_path + (".ids%d.answer" % disc_config.vocab_size)
    query_train_ids_path = beam_path + (".ids%d.query" % disc_config.vocab_size)
    data_utils.data_to_token_ids(beam_path + ".gen", answer_train_ids_path, vocab)
    data_utils.data_to_token_ids(beam_path + ".query", query_train_ids_path, vocab)

    unique_list = []
    beam_set, unique_list = read_data(disc_config, query_train_ids_path,
                                      answer_train_ids_path,
                                      unique_list=unique_list)
    for bucket in beam_set:
        print(len(bucket))

    tf_config = tf.ConfigProto(allow_soft_placement=True,
                               device_count={'GPU': 1})
    g1 = tf.Graph()
    with g1.as_default():
        sess_r = tf.Session(config=tf_config, graph=g1)
        disc_model = create_model(sess_r, disc_config,
                                  disc_config.name_model, vocab)

    buckets_num = len(disc_config.buckets)
    reward_sum = 0
    length_sum = 0
    line_num = 0
    for bucket_id in range(buckets_num):
        gen_data_reader = Gen_Data_Reader(beam_set[bucket_id])
        batch_number = gen_data_reader.get_batch_num(disc_config.batch_size)
        line_num += batch_number * disc_config.batch_size
        for batch_id in range(batch_number):
            train_batch = gen_data_reader.generate_testing_batch(
                disc_config.batch_size)
            train_query, train_answer = get_beam_batch(train_batch,
                                                       disc_config.batch_size)
            reward_batch_sum, length_batch_sum = reward_beam_fetch(
                sess=sess_r,
                disc_config=disc_config,
                disc_model=disc_model,
                bucket_id=bucket_id,
                queries=train_query,
                answers=train_answer)
            reward_sum += reward_batch_sum
            length_sum += length_batch_sum

    print('the average reward of {}: {}'.format(disc_config.beam_file,
                                                reward_sum / line_num))
    print('the average length of {}: {}'.format(disc_config.beam_file,
                                                length_sum / line_num))
def prepare_data(config):
    train_path = os.path.join(config.train_dir, "train")
    voc_file_path = [train_path + ".query", train_path + ".answer",
                     train_path + ".gen"]
    vocab_path = os.path.join(config.train_dir,
                              "vocab%d.all" % config.vocab_size)
    data_utils.create_vocabulary(vocab_path, voc_file_path, config.vocab_size)
    vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

    print("Preparing train disc_data in %s" % config.train_dir)
    train_query_path, train_answer_path, train_gen_path = data_utils.hier_prepare_disc_data(
        config.train_dir, vocab, config.vocab_size)
    query_set, answer_set, gen_set = hier_read_data(
        config, train_query_path, train_answer_path, train_gen_path)
    return query_set, answer_set, gen_set
def test():
    # Data preparation.
    vocab_input, _ = data_utils.initialize_vocabulary(FLAGS.vocab_input_file)
    _, rev_vocab_output = data_utils.initialize_vocabulary(
        FLAGS.vocab_output_file)

    config = Config.ModelConfig()
    config.batch_size = 1  # only 1 batch for inferencing
    config.num_input_symbols = len(vocab_input)
    config.num_output_symbols = len(rev_vocab_output)

    # Read in testing data if there is any.
    if FLAGS.data_test_path:
        with open(FLAGS.data_test_path, 'rb') as f:
            data = json.load(f)

    with tf.Session() as sess:
        model = create_model(sess, config, 'inference', True,
                             cell_mode=FLAGS.cell_mode)
        test_total_losses, count_sample = 0, 0
        for instance in data:
            inp_ids, tgt_ids = instance[0], instance[1]
            inputs, targets, target_weights = model.get_batch([[inp_ids, tgt_ids]])
            # DBRNN: output logits, brnn (forward then backward) output logits,
            #        tuple of [state, losses, fw_losses, bw_losses]
            # BASIC RNN: output logits, losses, state
            # ENC_DEC_ATT: output logits, losses, state
            output_logits, loss_or_brnn_logits, state = model.step(
                sess, inputs, targets, target_weights, True)
            if FLAGS.model == "DBRNN":
                _, test_losses, test_losses_fw, test_losses_bw = state
            else:
                test_losses = loss_or_brnn_logits

            # Accumulate the test loss per sample sentence.
            test_total_losses += test_losses
            count_sample += 1

        print('PPX: ' + str(math.exp(test_total_losses / float(count_sample))))
def test_file_decoder(gen_config, input_file, output_file):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        model = create_model(sess, gen_config, forward_only=True,
                             name_scope=gen_config.name_model)
        model.batch_size = 1

        train_path = os.path.join(gen_config.train_dir, "chitchat.train")
        voc_file_path = [train_path + ".answer", train_path + ".query"]
        vocab_path = os.path.join(gen_config.train_dir,
                                  "vocab%d.all" % gen_config.vocab_size)
        data_utils.create_vocabulary(vocab_path, voc_file_path,
                                     gen_config.vocab_size)
        vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

        with open(output_file, 'w') as fout:
            with open(input_file, 'r') as fin:
                for sent in fin:
                    print(sent)
                    token_ids = data_utils.sentence_to_token_ids(
                        tf.compat.as_str(sent), vocab)
                    print("token_id: ", token_ids)
                    bucket_id = len(gen_config.buckets) - 1
                    for i, bucket in enumerate(gen_config.buckets):
                        if bucket[0] >= len(token_ids):
                            bucket_id = i
                            break
                    else:
                        print("Sentence truncated: %s" % sent)

                    encoder_inputs, decoder_inputs, target_weights, _, _ = model.get_batch(
                        {bucket_id: [(token_ids, [1])]}, bucket_id,
                        model.batch_size, type=0)
                    _, _, output_logits = model.step(sess, encoder_inputs,
                                                     decoder_inputs,
                                                     target_weights,
                                                     bucket_id, True)
                    outputs = [int(np.argmax(logit, axis=1))
                               for logit in output_logits]
                    if data_utils.EOS_ID in outputs:
                        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                    out_sent = " ".join([tf.compat.as_str(rev_vocab[output])
                                         for output in outputs])
                    fout.write(out_sent + '\n')
                    print(out_sent)
def prepare_data(gen_config):
    # Load the cached data if every pickle exists; otherwise build and cache it.
    if (os.path.exists('vocab') and os.path.exists('rev_vocab')
            and os.path.exists('dev_set') and os.path.exists('train_set')):
        with open('vocab', 'rb') as fr_vocab:
            vocab = pickle.load(fr_vocab)
        with open('rev_vocab', 'rb') as fr_rev_vocab:
            rev_vocab = pickle.load(fr_rev_vocab)
        with open('dev_set', 'rb') as fr_dev_set:
            dev_set = pickle.load(fr_dev_set)
        with open('train_set', 'rb') as fr_train_set:
            train_set = pickle.load(fr_train_set)
    else:
        train_path = os.path.join(gen_config.train_dir, "chitchat.train")
        voc_file_path = [train_path + ".answer", train_path + ".query"]
        vocab_path = os.path.join(gen_config.train_dir,
                                  "vocab%d.all" % gen_config.vocab_size)
        data_utils.create_vocabulary(vocab_path, voc_file_path,
                                     gen_config.vocab_size)
        vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

        print("Preparing Chitchat gen_data in %s" % gen_config.train_dir)
        train_query, train_answer, dev_query, dev_answer = data_utils.prepare_chitchat_data(
            gen_config.train_dir, vocab, gen_config.vocab_size)

        # Read disc_data into buckets and compute their sizes.
        print("Reading development and training gen_data (limit: %d)."
              % gen_config.max_train_data_size)
        dev_set = read_data(gen_config, dev_query, dev_answer)
        train_set = read_data(gen_config, train_query, train_answer,
                              gen_config.max_train_data_size)

        with open('vocab', 'wb') as fw_vocab:
            pickle.dump(vocab, fw_vocab)
        with open('rev_vocab', 'wb') as fw_rev_vocab:
            pickle.dump(rev_vocab, fw_rev_vocab)
        with open('dev_set', 'wb') as fw_dev_set:
            pickle.dump(dev_set, fw_dev_set)
        with open('train_set', 'wb') as fw_train_set:
            pickle.dump(train_set, fw_train_set)

    return vocab, rev_vocab, dev_set, train_set
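# The load-or-build pattern above can be factored into a small helper.
# Minimal sketch: `cached` is a hypothetical utility, not part of the
# original code, and it assumes every object is picklable.
import os
import pickle

def cached(name, build):
    """Load `name` from disk if it exists; otherwise build and persist it."""
    if os.path.exists(name):
        with open(name, 'rb') as f:
            return pickle.load(f)
    obj = build()
    with open(name, 'wb') as f:
        pickle.dump(obj, f)
    return obj

# e.g. train_set = cached('train_set', lambda: read_data(gen_config, q, a))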
def prepare_data(gen_config):
    train_path = os.path.join(gen_config.data_dir, "chitchat.train")
    voc_file_path = [train_path + ".answer", train_path + ".query"]
    vocab_path = os.path.join(gen_config.data_dir,
                              "vocab%d.all" % gen_config.vocab_size)
    data_utils.create_vocabulary(vocab_path, voc_file_path,
                                 gen_config.vocab_size)
    vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

    print("Preparing Chitchat data in %s" % gen_config.data_dir)
    train_query, train_answer, dev_query, dev_answer = data_utils.prepare_chitchat_data(
        gen_config.data_dir, vocab, gen_config.vocab_size)

    # Read data into buckets and compute their sizes.
    print("Reading development and training data (limit: %d)."
          % gen_config.max_train_data_size)
    dev_set = read_data(dev_query, dev_answer)
    train_set = read_data(train_query, train_answer,
                          gen_config.max_train_data_size)

    return vocab, rev_vocab, dev_set, train_set
def prepare_data(gen_config):
    train_path = os.path.join(gen_config.data_dir, "train")
    test_path = os.path.join(gen_config.data_dir, "test")
    dev_path = os.path.join(gen_config.data_dir, "dev")
    voc_file_path = [
        train_path + ".answer", train_path + ".query",
        test_path + ".answer", test_path + ".query",
        dev_path + ".answer", dev_path + ".query"
    ]
    vocab_path = os.path.join(gen_config.data_dir,
                              "vocab%d.all" % gen_config.vocab_size)
    data_utils.create_vocabulary(vocab_path, voc_file_path,
                                 gen_config.vocab_size)
    # vocab_path = os.path.join(gen_config.data_dir, "vocab%d.all" % 30000)  # TODO: change 30000 to 2500

    # vocab maps word -> id; rev_vocab is a list holding every word.
    vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)
    gen_config.vocab_size = len(vocab)

    # prepare_chitchat_data returns the paths of the corresponding id files.
    train_query, train_answer, dev_query, dev_answer, test_query, test_answer = data_utils.prepare_chitchat_data(
        gen_config.data_dir, vocab, gen_config.vocab_size)

    # Read disc_data into buckets and compute their sizes.
    print("Reading development and training gen_data (limit: %d)."
          % gen_config.max_train_data_size)
    unique_list = []
    train_set, unique_list = read_data(gen_config, train_query, train_answer,
                                       unique_list=unique_list)
    dev_set, unique_list = read_data(gen_config, dev_query, dev_answer,
                                     unique_list=unique_list)
    test_set, unique_list = read_data(gen_config, test_query, test_answer,
                                      unique_list=unique_list)
    return vocab, rev_vocab, test_set, dev_set, train_set
def query_embedding():
    q2v = Query2vec()
    # Create model and load parameters.
    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            gpu_options=gpu_options,
            intra_op_parallelism_threads=20)) as sess:
        # Create model.
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        with tf.device("/gpu:0"):
            model = model_helper.create_model(sess, True)
        model.batch_size = FLAGS.batch_size

        # Load vocabularies.
        vocab_path = os.path.join(FLAGS.data_dir, "vocab%d" % FLAGS.vocab_size)
        _, vocab_rev = data_utils.initialize_vocabulary(vocab_path)

        q2v.embedding_batch(sess, model, vocab_rev)
def decode(sess, model, config, means, logvars, bucket_id):
    vocab_path = os.path.join(config.data_dir, "vocab%d" % config.vocab_size)
    _, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

    _, decoder_inputs, target_weights = model.get_batch(
        {bucket_id: [([], [])]}, bucket_id)

    outputs = []
    for mean, logvar in zip(means, logvars):
        mean = mean.reshape(1, -1)
        logvar = logvar.reshape(1, -1)
        output_logits = model.decode_from_latent(sess, mean, logvar, bucket_id,
                                                 decoder_inputs, target_weights)
        output = [int(np.argmax(logit, axis=1)) for logit in output_logits]
        # If there is an EOS symbol in outputs, cut them at that point.
        if data_utils.EOS_ID in output:
            output = output[:output.index(data_utils.EOS_ID)]
        output = " ".join([rev_vocab[word] for word in output]) + "\n"
        outputs.append(output)
    return outputs
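# If this decode() is paired with the encode() helper shown earlier (both
# operate on latent means/logvars), a round trip might look like this sketch.
# Decoding in the largest bucket is an assumption, not the source's choice,
# and the two snippets may come from differently configured models.
means, logvars = encode(sess, model, config, ["hello there .", "good night ."])
bucket_id = len(config.buckets) - 1
for line in decode(sess, model, config, means, logvars, bucket_id):
    print(line, end="")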
def prepare_data(gen_config):
    train_path = os.path.join(gen_config.train_dir, "train")
    voc_file_path = [train_path + ".answer", train_path + ".query"]
    vocab_path = os.path.join(gen_config.train_dir,
                              "vocab%d.all" % gen_config.vocab_size)
    data_utils.create_vocabulary(vocab_path, voc_file_path,
                                 gen_config.vocab_size)
    vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

    print("Preparing Chitchat gen_data in %s" % gen_config.train_dir)
    train_query, train_answer, dev_query, dev_answer = data_utils.prepare_chitchat_data(
        gen_config.train_dir, vocab, gen_config.vocab_size)

    # Read disc_data into buckets and compute their sizes.
    print("Reading development and training gen_data (limit: %d)."
          % gen_config.max_train_data_size)
    dev_set = read_data(gen_config, dev_query, dev_answer)
    # Data format: train_set = [[ [[source], [target]], [[source], [target]] ], ...];
    # the outermost dimension is the number of buckets.
    train_set = read_data(gen_config, train_query, train_answer,
                          gen_config.max_train_data_size)
    return vocab, rev_vocab, dev_set, train_set
def prepare_data(gen_config):
    """
    1. data_utils.create_vocabulary: build the vocabulary.
    2. data_utils.initialize_vocabulary: return the id paths for the train
       and dev answer/query data.
    3. train_set = read_data: build the buckets (each bucket holds the
       token-id pair of one question and one answer per line); buckets are
       built for both dev and train.
       buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]
       e.g. Q: How old are you? = [5, 7, 4, 2, 3]; len = 5
            A: I'm six.         = [44, 6, 8];      len = 3
       bucket = [[[5, 7, 4, 2, 3], [44, 6, 8]], [], [], []]
       That is, each QA pair is placed into the bucket whose fixed length
       bounds fit it.
    """
    train_path = os.path.join(gen_config.train_dir, "chitchat.train")
    voc_file_path = [train_path + ".answer", train_path + ".query"]
    vocab_path = os.path.join(gen_config.train_dir,
                              "vocab%d.all" % gen_config.vocab_size)  # e.g. 35,000 tokens
    data_utils.create_vocabulary(vocab_path, voc_file_path,
                                 gen_config.vocab_size)
    vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)  # vocab & reverse vocab

    print("Preparing Chitchat gen_data in %s" % gen_config.train_dir)
    train_query, train_answer, dev_query, dev_answer = data_utils.prepare_chitchat_data(
        gen_config.train_dir, vocab, gen_config.vocab_size)

    # Read disc_data into buckets and compute their sizes.
    print("Reading development and training gen_data (limit: %d)."
          % gen_config.max_train_data_size)
    dev_set = read_data(gen_config, dev_query, dev_answer)
    train_set = read_data(gen_config, train_query, train_answer,
                          gen_config.max_train_data_size)

    print("see what bucket is:")
    print("\n")
    print(dev_set)
    return vocab, rev_vocab, dev_set, train_set
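# Standalone sketch of the bucketing rule the docstring above describes.
# The token ids are hypothetical, and the exact comparison (<= vs <, plus
# GO/EOS padding) depends on read_data's real implementation.
buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]

def bucket_for(query_ids, answer_ids):
    """Return the index of the smallest bucket that fits both sequences."""
    for i, (q_max, a_max) in enumerate(buckets):
        if len(query_ids) <= q_max and len(answer_ids) <= a_max:
            return i
    return None  # the pair is too long for every bucket

print(bucket_for([5, 7, 4, 2, 3], [44, 6, 8]))  # -> 0: fits in (5, 10)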
from tensorflow_serving_client.protos import predict_pb2, prediction_service_pb2
from grpc.beta import implementations
import tensorflow as tf
import time
from keras.preprocessing.sequence import pad_sequences
from utils import data_utils, data_process
from lib import config

# Load the vocabulary and preprocess the input.
vocb, rev_vocb = data_utils.initialize_vocabulary(config.VOCABULARY_PATH)
test_sentence_ = ["我", "讨厌", "这", "车"]
test_token_sentence = [[vocb.get(w.encode('utf-8'), 1) for w in test_sentence_]]

# Pack multiple copies of the sentence into a single request:
for _ in range(128):
    test_token_sentence.append(
        [vocb.get(w.encode('utf-8'), 1) for w in test_sentence_])
padding_sentence = pad_sequences(test_token_sentence,
                                 maxlen=config.MAX_SEQUENCE_LENGTH)

# Start timing.
start_time = time.time()

# Set up the connection.
IP = "your ip address"
port = 8000  # replace this with your server port
channel = implementations.insecure_channel(IP, port)
stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)
request = predict_pb2.PredictRequest()
# Both names are fixed at export/serving time: model_spec.name is the
# model_name configured when TensorFlow Serving was launched, and the
# signature name is the method name used when the model was saved.
request.model_spec.name = "sentiment_classification"
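# The snippet above stops before the request is populated and sent. Under
# the same beta gRPC API it would typically continue along these lines;
# the signature name and input key below are assumptions fixed by whoever
# exported the model, not values from the original source.
request.model_spec.signature_name = "predict"  # assumed signature name
request.inputs["input_x"].CopyFrom(            # assumed input key
    tf.contrib.util.make_tensor_proto(padding_sentence, dtype=tf.int32))
result = stub.Predict(request, 10.0)  # 10-second timeout
print("latency: %.3fs" % (time.time() - start_time))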
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        en_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.en" % FLAGS.en_vocab_size)
        ja_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.ja" % FLAGS.target_vocab_size)
        en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
        _, rev_ja_vocab = data_utils.initialize_vocabulary(ja_vocab_path)
        _, rev_en_vocab = data_utils.initialize_vocabulary(en_vocab_path)
        if len(rev_ja_vocab) < FLAGS.target_vocab_size:
            rev_ja_vocab += ["_" for i in range(FLAGS.target_vocab_size
                                                - len(rev_ja_vocab))]

        # Prepare visual context integration.
        for file in os.listdir(FLAGS.data_dir):
            if file.endswith("ids{0}.{1}".format(str(FLAGS.target_vocab_size),
                                                 FLAGS.target_language)):
                target_id_file = os.path.join(FLAGS.data_dir, file)
        W, bias_vec = train_visual(
            target_id_file,
            os.path.join(FLAGS.data_dir, FLAGS.visual_vec_file_name),
            ja_vocab_path, FLAGS.target_vocab_size, num_epochs=100)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        visual_vec = []
        while sentence:
            # Create visual context vector if given.
            for el in reversed(sentence.split()):
                if el.isdigit():
                    visual_vec = [int(el)] + visual_vec

            # Get token-ids for the input sentence.
            sentence = " ".join(sentence.split()[:len(sentence.split())
                                                 - len(visual_vec)])
            token_ids = data_utils.sentence_to_token_ids(sentence, en_vocab)
            print("token ids: " + str(token_ids))

            # Which bucket does it belong to?
            bucket_id = min([b for b in xrange(len(_buckets))
                             if _buckets[b][0] > len(token_ids)])

            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)

            # Get output logits for the sentence.
            # step(session, encoder_inputs, decoder_inputs, target_weights,
            #      bucket_id, forward_only)
            # output_logits = [[log_prob(target1), ..., log_prob(target_n)], ...]
            # with the length of the decode bucket, e.g. "Hello!" -> input
            # bucket 5 -> decode bucket 10.
            _, _, output_logits = model.step(sess, encoder_inputs,
                                             decoder_inputs, target_weights,
                                             bucket_id, True)

            # Store tensorflow scores for each output until the
            # end-of-sentence symbol.
            output_list = []
            for logits in output_logits:
                max_id = np.argmax(logits, axis=1)
                if max_id == data_utils.EOS_ID:
                    break
                nmt_score_dict = dict(enumerate(logits[0]))
                output_list.append(nmt_score_dict)

            # Store visual scores if given.
            if visual_vec != [] and FLAGS.visual:
                visual_scores = feedforward(W, visual_vec, bias_vec)
                # Turn probabilities into logits.
                visual_scores = np.array([math.log(prob / (1 - prob))
                                          for prob in visual_scores])
                visual_score_dict = dict(enumerate(visual_scores))
            else:
                visual_score_dict = {}

            # Integrate visual scores if given and output the result.
            print("--result--")
            outputs = []
            for dic in output_list:
                for k in visual_score_dict:
                    dic[k] += visual_score_dict[k]
                outputs.append(max(dic.iteritems(),
                                   key=operator.itemgetter(1))[0])
                for k, v in sorted(dic.items(), key=lambda x: x[1],
                                   reverse=True):
                    print(rev_ja_vocab[k] + ":" + str(v), end=" ")
                print("\n")
            print(" ".join([rev_ja_vocab[output] for output in outputs]))

            print("> ", end="")
            visual_vec = []
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def reconstruct(sess, model, config):
    model.batch_size = 1  # We decode one sentence at a time.
    model.probabilistic = config.probabilistic
    beam_size = config.beam_size

    # Load vocabularies.
    vocab_path = os.path.join(config.data_dir, "vocab%d.in" % config.vocab_size)
    en_vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

    # Decode sentences read from the input file.
    outputs = []
    with gfile.GFile(FLAGS.input, "r") as fs:
        sentences = fs.readlines()
    for sentence in sentences:
        # Get token-ids for the input sentence.
        token_ids = data_utils.sentence_to_token_ids(sentence, en_vocab)
        # Which bucket does it belong to?
        bucket_id = len(config.buckets) - 1
        for i, bucket in enumerate(config.buckets):
            if bucket[0] >= len(token_ids):
                bucket_id = i
                break
        else:
            logging.warning("Sentence truncated: %s", sentence)

        encoder_inputs, decoder_inputs, target_weights = model.get_batch(
            {bucket_id: [(token_ids, [])]}, bucket_id)

        if beam_size > 1:
            path, symbol, output_logits = model.step(
                sess, encoder_inputs, decoder_inputs, target_weights,
                bucket_id, True, config.probabilistic, beam_size)
            # Walk the beam back from the last step to recover each hypothesis.
            paths = [[] for _ in range(beam_size)]
            curr = list(range(beam_size))
            num_steps = len(path)
            for i in range(num_steps - 1, -1, -1):
                for kk in range(beam_size):
                    paths[kk].append(symbol[i][curr[kk]])
                    curr[kk] = path[i][curr[kk]]
            for kk in range(beam_size):
                output = [int(logit) for logit in paths[kk][::-1]]
                if EOS_ID in output:
                    output = output[:output.index(EOS_ID)]
                output = " ".join([rev_vocab[word] for word in output]) + "\n"
                outputs.append(output)
        else:
            # Get output logits for the sentence.
            _, _, _, output_logits = model.step(sess, encoder_inputs,
                                                decoder_inputs, target_weights,
                                                bucket_id, True,
                                                config.probabilistic)
            # This is a greedy decoder - outputs are just argmaxes of
            # output_logits.
            output = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in output:
                output = output[:output.index(data_utils.EOS_ID)]
            output = " ".join([rev_vocab[word] for word in output]) + "\n"
            outputs.append(output)

    with gfile.GFile(FLAGS.output, "w") as enc_dec_f:
        for output in outputs:
            enc_dec_f.write(output)
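# Toy illustration of the beam backtrace above, with hypothetical data.
# symbol[t][k] is the token emitted by beam k at step t; path[t][k] is the
# index of its parent beam at step t-1. Walking from the last step back to
# step 0 recovers every hypothesis in reverse order.
beam_size = 2
symbol = [[7, 3], [5, 9]]  # two steps, two beams
path = [[0, 0], [1, 0]]    # beam 0 at step 1 came from beam 1 at step 0

paths = [[] for _ in range(beam_size)]
curr = list(range(beam_size))
for t in range(len(path) - 1, -1, -1):
    for k in range(beam_size):
        paths[k].append(symbol[t][curr[k]])
        curr[k] = path[t][curr[k]]
print([p[::-1] for p in paths])  # [[3, 5], [7, 9]]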
def train(config):
    # Prepare WMT data.
    print("Preparing WMT data in %s" % config.data_dir)
    train, dev, _ = data_utils.prepare_wmt_data(config.data_dir,
                                                config.vocab_size)

    with tf.Session() as sess:
        if not os.path.exists(FLAGS.model_dir):
            os.makedirs(FLAGS.model_dir)

        # Create model.
        print("Creating %d layers of %d units." % (config.num_layers,
                                                   config.size))
        model = create_model(sess, config, False)
        if not config.probabilistic:
            model.kl_rate_update(0.0)

        train_writer = tf.summary.FileWriter(
            os.path.join(FLAGS.model_dir, "train"), graph=sess.graph)
        dev_writer = tf.summary.FileWriter(
            os.path.join(FLAGS.model_dir, "test"), graph=sess.graph)

        # Read data into buckets and compute their sizes.
        print("Reading development and training data (limit: %d)."
              % config.max_train_data_size)
        dev_set = read_data(dev, config)
        train_set = read_data(train, config, config.max_train_data_size)
        train_bucket_sizes = [len(train_set[b])
                              for b in xrange(len(config.buckets))]
        train_total_size = float(sum(train_bucket_sizes))

        # A bucket scale is a list of increasing numbers from 0 to 1 that
        # we'll use to select a bucket. The length of [scale[i], scale[i+1]]
        # is proportional to the size of the i-th training bucket, as used
        # later.
        train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                               for i in xrange(len(train_bucket_sizes))]

        # Load vocabularies.
        vocab_path = os.path.join(config.data_dir,
                                  "vocab%d" % config.vocab_size)
        vocab, _ = data_utils.initialize_vocabulary(vocab_path)

        # Load pretrained word embeddings into both embedding matrices.
        print("Loading pretrained word embeddings.")
        with tf.variable_scope("", reuse=True):
            enc_embedding = tf.get_variable("enc_embedding")
            dec_embedding = tf.get_variable("dec_embedding")
        embedding_matrix = load_embeddings(vocab, config)
        sess.run(tf.assign(enc_embedding, embedding_matrix))
        sess.run(tf.assign(dec_embedding, embedding_matrix))

        # This is the training loop.
        print("Starting training loop.")
        step_time, loss = 0.0, 0.0
        KL_loss = 0.0
        current_step = model.global_step.eval()
        step_loss_summaries = []
        step_KL_loss_summaries = []
        overall_start_time = time.time()
        while True:
            # Choose a bucket according to the data distribution. We pick a
            # random number in [0, 1] and use the corresponding interval in
            # train_buckets_scale.
            random_number_01 = np.random.random_sample()
            bucket_id = min([i for i in xrange(len(train_buckets_scale))
                             if train_buckets_scale[i] > random_number_01])

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                train_set, bucket_id)
            _, step_loss, step_KL_loss, _ = model.step(
                sess, encoder_inputs, decoder_inputs, target_weights,
                bucket_id, False, config.probabilistic)

            # Anneal the KL rate once past the configured rise time.
            if (config.anneal
                    and model.global_step.eval() > config.kl_rate_rise_time
                    and model.kl_rate < 1):
                new_kl_rate = model.kl_rate.eval() + config.kl_rate_rise_factor
                sess.run(model.kl_rate_update,
                         feed_dict={'new_kl_rate': new_kl_rate})

            step_time += (time.time() - start_time) / config.steps_per_checkpoint
            step_loss_summaries.append(tf.Summary(value=[
                tf.Summary.Value(tag="step loss",
                                 simple_value=float(step_loss))]))
            step_KL_loss_summaries.append(tf.Summary(value=[
                tf.Summary.Value(tag="KL step loss",
                                 simple_value=float(step_KL_loss))]))
            loss += step_loss / config.steps_per_checkpoint
            KL_loss += step_KL_loss / config.steps_per_checkpoint
            current_step = model.global_step.eval()

            # Once in a while, we save checkpoint, print statistics, and run
            # evals.
            if current_step % config.steps_per_checkpoint == 0:
                # Print statistics for the previous epoch.
                perplexity = math.exp(float(loss)) if loss < 300 else float("inf")
                print("global step %d learning rate %.4f step-time %.2f "
                      "perplexity %.2f"
                      % (model.global_step.eval(), model.learning_rate.eval(),
                         step_time, perplexity))
                print("global step %d learning rate %.4f step-time %.2f "
                      "KL divergence %.2f"
                      % (model.global_step.eval(), model.learning_rate.eval(),
                         step_time, KL_loss))
                wall_time = time.time() - overall_start_time
                print("time passed: {0}".format(wall_time))

                # Add perplexity, KL divergence to summary and stats.
                perp_summary = tf.Summary(value=[
                    tf.Summary.Value(tag="train perplexity",
                                     simple_value=perplexity)])
                train_writer.add_summary(perp_summary, current_step)
                KL_loss_summary = tf.Summary(value=[
                    tf.Summary.Value(tag="KL divergence",
                                     simple_value=KL_loss)])
                train_writer.add_summary(KL_loss_summary, current_step)

                for i, summary in enumerate(step_loss_summaries):
                    train_writer.add_summary(summary, current_step - 200 + i)
                step_loss_summaries = []
                for i, summary in enumerate(step_KL_loss_summaries):
                    train_writer.add_summary(summary, current_step - 200 + i)
                step_KL_loss_summaries = []

                # Save checkpoint and zero timer and loss.
                checkpoint_path = os.path.join(FLAGS.model_dir,
                                               FLAGS.model_name + ".ckpt")
                model.saver.save(sess, checkpoint_path,
                                 global_step=model.global_step)
                step_time, loss, KL_loss = 0.0, 0.0, 0.0

                # Run evals on development set and print their perplexity.
                eval_losses = []
                eval_KL_losses = []
                eval_bucket_num = 0
                for bucket_id in xrange(len(config.buckets)):
                    if len(dev_set[bucket_id]) == 0:
                        print("  eval: empty bucket %d" % bucket_id)
                        continue
                    eval_bucket_num += 1
                    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                        dev_set, bucket_id)
                    _, eval_loss, eval_KL_loss, _ = model.step(
                        sess, encoder_inputs, decoder_inputs, target_weights,
                        bucket_id, True, config.probabilistic)
                    eval_losses.append(float(eval_loss))
                    eval_KL_losses.append(float(eval_KL_loss))
                    eval_ppx = math.exp(float(eval_loss)) if eval_loss < 300 else float("inf")
                    print("  eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))
                    eval_perp_summary = tf.Summary(value=[
                        tf.Summary.Value(
                            tag="eval perplexity for bucket {0}".format(bucket_id),
                            simple_value=eval_ppx)])
                    dev_writer.add_summary(eval_perp_summary, current_step)

                mean_eval_loss = sum(eval_losses) / float(eval_bucket_num)
                mean_eval_KL_loss = sum(eval_KL_losses) / float(eval_bucket_num)
                mean_eval_ppx = math.exp(float(mean_eval_loss))
                print("  eval: mean perplexity {0}".format(mean_eval_ppx))

                eval_loss_summary = tf.Summary(value=[
                    tf.Summary.Value(tag="mean eval loss",
                                     simple_value=float(mean_eval_ppx))])
                dev_writer.add_summary(eval_loss_summary, current_step)
                # Use a distinct tag for the KL summary so it does not
                # collide with the mean eval loss above.
                eval_KL_loss_summary = tf.Summary(value=[
                    tf.Summary.Value(tag="mean eval KL loss",
                                     simple_value=float(mean_eval_KL_loss))])
                dev_writer.add_summary(eval_KL_loss_summary, current_step)
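# Standalone sketch of the bucket sampling used in the training loop above:
# buckets are drawn with probability proportional to their size via the
# cumulative train_buckets_scale. The sizes here are illustrative only.
import numpy as np

train_bucket_sizes = [400, 300, 200, 100]
train_total_size = float(sum(train_bucket_sizes))
train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                       for i in range(len(train_bucket_sizes))]
# -> [0.4, 0.7, 0.9, 1.0]

random_number_01 = np.random.random_sample()
bucket_id = min([i for i in range(len(train_buckets_scale))
                 if train_buckets_scale[i] > random_number_01])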