def generate(middle_sentence, forwards_sentence, backwards_sentence):
    '''Generates forwards and backwards sentences given a middle sentence.

    Args:
        middle_sentence: middle sentence (not tokenized)
        forwards_sentence: preceding sentence (not tokenized)
        backwards_sentence: following sentence (not tokenized)
    '''
    train_path, vocab_path, train_ids_path = data_utils.prepare_skip_thought_data(
        FLAGS.data_dir, FLAGS.train_data_name, FLAGS.vocab_size)
    with tf.Session() as sess:
        m = SkipThoughtModel(FLAGS.vocab_size,
                             max_sentence_len=FLAGS.max_sentence_len,
                             batch_size=FLAGS.batch_size,
                             learning_rate=FLAGS.learning_rate,
                             learning_rate_decay_factor=FLAGS.learning_rate_decay_factor,
                             encoder_cell_size=FLAGS.encoder_cell_size,
                             word_embedding_size=FLAGS.word_embedding_size,
                             decoder_cell_size=FLAGS.decoder_cell_size,
                             max_gradient_norm=FLAGS.max_gradient_norm,
                             initial_decoder_state=None)
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path):
            print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
            m.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("No model found")
            return
        vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)
        # Tokenize the three sentences; the backwards sentence is reversed
        # word by word before tokenization.
        tokenized_middle_sentence = data_utils.sentence_to_token_ids(
            middle_sentence, vocab)
        tokenized_forwards_sentence = data_utils.sentence_to_token_ids(
            forwards_sentence, vocab)
        tokenized_backwards_sentence = data_utils.sentence_to_token_ids(
            " ".join(reversed(backwards_sentence.split())), vocab)
        forwards_batch_logits, backwards_batch_logits = m.step(
            sess,
            [m.forwards_batch_logits_tensor, m.backwards_batch_logits_tensor],
            *m.prep_data([tokenized_middle_sentence],
                         [tokenized_forwards_sentence],
                         [tokenized_backwards_sentence]))
        # Keep the logits of the single sentence in the batch.
        forwards_logits = forwards_batch_logits[:, 0, :]
        backwards_logits = backwards_batch_logits[:, 0, :]
        # Greedy decoding: map each step's argmax back to a vocabulary word.
        forwards_sentence = map(lambda x: rev_vocab[x],
                                map(np.argmax, forwards_logits))
        backwards_sentence = map(lambda x: rev_vocab[x],
                                 map(np.argmax, backwards_logits))
        print("Generated Forwards Sentence")
        print(" ".join(forwards_sentence))
        print("Generated Backwards Sentence")
        print(" ".join(backwards_sentence))
def decode(): with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.en" % FLAGS.en_vocab_size) fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.fr" % FLAGS.fr_vocab_size) en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path) _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path) # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids(sentence, en_vocab) # Which bucket does it belong to? bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)]) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch({bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[: outputs.index(data_utils.EOS_ID)] # Print out French sentence corresponding to outputs. print(" ".join([rev_fr_vocab[output] for output in outputs])) print("> ", end="") sys.stdout.flush() sentence = sys.stdin.readline()
def decode_from_stdin(show_all_n_best=False, FLAGS=None, buckets=None):
    assert FLAGS is not None
    assert buckets is not None

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        # Create model and load parameters.
        model = create_seq2seq_model(sess, True, FLAGS, buckets, translate=True)

        # Load vocabularies.
        source_vocab_file = FLAGS.data_dir + \
            (FLAGS.train_data % str(FLAGS.src_vocab_size)) + \
            ('.vocab.%s' % FLAGS.source_lang)
        target_vocab_file = FLAGS.data_dir + \
            (FLAGS.train_data % str(FLAGS.tgt_vocab_size)) + \
            ('.vocab.%s' % FLAGS.target_lang)
        src_vocab, _ = data_utils.initialize_vocabulary(source_vocab_file)
        _, rev_tgt_vocab = data_utils.initialize_vocabulary(target_vocab_file)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(sentence, src_vocab)
            # Get the n-best hypotheses for the sentence via beam search.
            output_hypotheses, output_scores = model.translation_step(
                sess, token_ids, beam_size=FLAGS.beam_size, dump_remaining=False)
            # If there is an EOS symbol in a hypothesis, cut it at that point.
            outputs = []
            for x in output_hypotheses:
                try:
                    outputs.append(x[:x.index(data_utils.EOS_ID)])
                except ValueError:
                    pass
            output_hypotheses = outputs
            # Print the translations, each preceded by its score.
            if show_all_n_best:
                for x in xrange(len(outputs)):
                    out = outputs[x]
                    print(str(numpy.exp(-output_scores[x])) + "\t" +
                          " ".join([rev_tgt_vocab[output] for output in out]))
            else:
                out = outputs[0]
                print(str(numpy.exp(-output_scores[0])) + "\t" +
                      " ".join([rev_tgt_vocab[output] for output in out]))
            # Wait for a new sentence to translate.
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def decode(): with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.tgt" % FLAGS.en_vocab_size) fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.src" % FLAGS.fr_vocab_size) en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path ) _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path ) # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() sentence = map(lambda x:x.decode('utf-8'), ['こんにちは']).pop() with open('./narou/narou_dev.src.txt', 'r') as f: lines = f.read().split('\n') print(lines) #while sentence: for sentence in lines: print(sentence) # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab) print(token_ids) # Which bucket does it belong to? try: bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)]) except: continue # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Print out French sentence corresponding to outputs. print("ANS:>", " ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs])) print("> ", end="") continue sys.stdout.flush() sentence = sys.stdin.readline()
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        ast_vocab_path = os.path.join(FLAGS.data_dir,
                                      "vocab%d.ast" % FLAGS.ast_vocab_size)
        nl_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.nl" % FLAGS.nl_vocab_size)
        ast_vocab, _ = data_utils.initialize_vocabulary(ast_vocab_path)
        _, rev_nl_vocab = data_utils.initialize_vocabulary(nl_vocab_path)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), ast_vocab)
            # Which bucket does it belong to?
            bucket_id = len(_buckets) - 1
            for i, bucket in enumerate(_buckets):
                if bucket[0] >= len(token_ids):
                    bucket_id = i
                    break
            else:
                logging.warning("Sentence truncated: %s", sentence)
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Print out the natural-language sentence corresponding to outputs.
            print(" ".join([tf.compat.as_str(rev_nl_vocab[output])
                            for output in outputs]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def decode():
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2)
    config = tf.ConfigProto(gpu_options=gpu_options)
    with tf.Session(config=config) as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        enc_vocab_path = os.path.join(working_directory,
                                      "vocab%d.enc" % enc_vocab_size)
        dec_vocab_path = os.path.join(working_directory,
                                      "vocab%d.dec" % dec_vocab_size)
        enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
        _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), enc_vocab)
            # Which bucket does it belong to?
            bucket_id = min([b for b in xrange(len(_buckets))
                             if _buckets[b][0] > len(token_ids)])
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Only answer inputs seen in the training lines; when the trigger
            # fires, print the response without its final token.
            if sentence[:-1] in lines:
                temp_output = " ".join([tf.compat.as_str(rev_dec_vocab[output])
                                        for output in outputs])
                trigger_check = trigger_activator(temp_output)
                if trigger_check:
                    print(" ".join([tf.compat.as_str(rev_dec_vocab[output])
                                    for output in outputs[:-1]]))
                else:
                    print(temp_output)
            else:
                print("i don't understand you")
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def scorer():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)

        # Load vocabularies.
        enc_vocab_path = os.path.join(gConfig['working_directory'],
                                      "vocab%d.enc" % gConfig['enc_vocab_size'])
        dec_vocab_path = os.path.join(gConfig['working_directory'],
                                      "vocab%d.dec" % gConfig['dec_vocab_size'])
        enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
        _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)

        test_file = os.path.join(gConfig['encoder_test_file'])
        test_captions = open(test_file, 'r').readlines()
        model.batch_size = 1  # We decode one sentence at a time.
        output_captions = []
        for sentence in test_captions:
            # Get token-ids for the input sentence, truncated to 40 tokens.
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), enc_vocab)
            token_ids = token_ids[:40]
            # Which bucket does it belong to?
            try:
                bucket_id = min([b for b in xrange(len(_buckets))
                                 if _buckets[b][0] >= len(token_ids)])
            except:
                # Sentence length greater than the largest bucket size.
                pdb.set_trace()
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Collect the decoded caption corresponding to outputs.
            output_caption = " ".join([tf.compat.as_str(rev_dec_vocab[output])
                                       for output in outputs])
            output_captions.append(output_caption)

        # Score the predictions against the ground truth with COCOScorer.
        gt_info = pkl.load(open(test_file[:-4] + '.pkl', 'r'))
        id_list = gt_info['ids']
        gt_dict = gt_info['gt']
        pred_dict = {idx: [{'image_id': idx, 'caption': sent}]
                     for idx, sent in enumerate(output_captions)}
        with open(gConfig['result_dir'] +
                  gConfig['encoder_test_file'].split('/')[-1][:-4] +
                  '_output.txt', 'w') as f:
            for caption in output_captions:
                f.write(caption + '\n')
        scorer = COCOScorer()
        total_score = scorer.score(gt_dict, pred_dict, id_list)
def encode():
    """Encode all of the sentences to vector form"""
    train, dev, test = loader.getData()
    sentences = []

    # Load the vocab
    en_vocab = get_english_vocab(DATA_DIR, VOCAB_SIZE)

    # Collect all the training sentences
    for i, row in pd.concat((train, test)).iterrows():
        if isinstance(row["sentence1"], basestring) and isinstance(row["sentence2"], basestring):
            sentences.append(row["sentence1"])
            sentences.append(row["sentence2"])

    # Allocate the sentences to buckets
    bucketed = {}
    for sentence in sentences:
        bucket_id = get_bucket(en_vocab, sentence)
        bucketed.setdefault(bucket_id, [])
        bucketed[bucket_id].append(sentence)

    mapped = {}
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True, train_dir=TRAIN_DIR)
        model.batch_size = BATCH_SIZE  # We decode BATCH_SIZE sentences at a time.
        # Iterate over each bucket
        for bucket_id, sentences in bucketed.iteritems():
            for batch in chunker(sentences, BATCH_SIZE):
                data = []
                for sentence in batch:
                    token_ids = data_utils.sentence_to_token_ids(
                        tf.compat.as_bytes(sentence), en_vocab)
                    expected_output = []
                    data.append((token_ids, expected_output))
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                    {bucket_id: data}, bucket_id)
                contexts = model.step_context(sess, encoder_inputs, decoder_inputs,
                                              target_weights, bucket_id)
                features = np.hstack(contexts)
                print 'Extracted another set of features with shape:', features.shape
                # Now we align sentences with their contexts
                for i, sentence in enumerate(batch):
                    mapped[sentence] = features[i, :].tolist()

    print "Saving sentences to %s" % JSON_NAME
    with open(JSON_NAME, 'w') as file:
        json.dump(mapped, file)
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        enc_vocab_path = os.path.join(gConfig['working_directory'],
                                      "vocab%d.enc" % gConfig['enc_vocab_size'])
        dec_vocab_path = os.path.join(gConfig['working_directory'],
                                      "vocab%d.dec" % gConfig['dec_vocab_size'])
        enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
        _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), enc_vocab)
            # Which bucket does it belong to?
            bucket_id = min([b for b in xrange(len(_buckets))
                             if _buckets[b][0] > len(token_ids)])
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Print out the corresponding response.
            print(" ".join([tf.compat.as_str(rev_dec_vocab[output])
                            for output in outputs]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def train(self, pretrained_model_path=None):
    # Get captions and feats (feats shape: [5000, 192, 512]).
    captions = data_utils.get_some_captions(5000)
    feats = data_utils.get_features(FEATURES_PATH)
    maxlen = self.n_time_step

    # Get word2ix, ixtoword dictionaries.
    word2ix, ixtoword = data_utils.initialize_vocabulary(VOCAB_PATH)
    learning_rate = self.learning_rate
    n_words = len(word2ix)

    sess = tf.InteractiveSession()
    loss, context, sentence = self.build_model()
    saver = tf.train.Saver(max_to_keep=25)
    train_op = self.optimizer(learning_rate, loss)
    tf.initialize_all_variables().run()

    if pretrained_model_path is not None:
        print("Starting with pretrained model")
        saver.restore(sess, pretrained_model_path)

    for epoch in range(self.epochs):
        for start, end in zip(range(0, len(captions), self.batch_size),
                              range(self.batch_size, len(captions), self.batch_size)):
            current_feats = feats[start:end]
            current_feats = current_feats.reshape(-1, self.D, self.L).swapaxes(1, 2)
            current_captions = captions[start:end]
            current_captions_ind = []
            for caption in current_captions:
                # Convert the caption to token ids; wrap short captions in
                # GO/EOS markers and pad them out to maxlen.
                caption2id = data_utils.sentence_to_token_ids(caption, word2ix)
                if len(caption2id) < maxlen:
                    caption2id = [data_utils.GO_ID] + caption2id + [data_utils.EOS_ID]
                    caption2id = caption2id + [data_utils.PAD_ID] * (maxlen - len(caption2id))
                current_captions_ind.append(caption2id)
            current_captions_ind = np.asarray(current_captions_ind)
            _, loss_value = sess.run([train_op, loss],
                                     feed_dict={context: current_feats,
                                                sentence: current_captions_ind})
            print("Epoch:%d, Current loss:" % epoch, loss_value)
        saver.save(sess, MODEL_PATH, global_step=epoch)
def multi_test():
    """Generate paraphrases for every input sentence in a directory of files."""
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        enc_vocab_path = os.path.join(gConfig['working_directory'],
                                      "vocab%d.enc" % gConfig['enc_vocab_size'])
        dec_vocab_path = os.path.join(gConfig['working_directory'],
                                      "vocab%d.dec" % gConfig['dec_vocab_size'])
        enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
        _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)

        test_dir = 'data/top5/'
        output_dir = 'data/top5/'
        for filename in os.listdir(test_dir):
            if filename.endswith(".txt"):
                test_captions = open(test_dir + filename, 'r').readlines()
                output_captions = []
                for sentence in test_captions:
                    # Get token-ids for the input sentence, truncated to 40 tokens.
                    token_ids = data_utils.sentence_to_token_ids(
                        tf.compat.as_bytes(sentence.lower()), enc_vocab)
                    token_ids = token_ids[:40]
                    # Which bucket does it belong to?
                    try:
                        bucket_id = min([b for b in xrange(len(_buckets))
                                         if _buckets[b][0] >= len(token_ids)])
                    except:
                        # Sentence length greater than the largest bucket size.
                        pdb.set_trace()
                    # Get a 1-element batch to feed the sentence to the model.
                    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                        {bucket_id: [(token_ids, [])]}, bucket_id)
                    # Get output logits for the sentence.
                    _, _, output_logits, project = model.step(
                        sess, encoder_inputs, decoder_inputs, target_weights,
                        bucket_id, True)
                    # This is a greedy decoder - outputs are just argmaxes of output_logits.
                    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
                    # If there is an EOS symbol in outputs, cut them at that point.
                    if data_utils.EOS_ID in outputs:
                        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                    # Collect the paraphrase corresponding to outputs.
                    output_caption = " ".join([tf.compat.as_str(rev_dec_vocab[output])
                                               for output in outputs])
                    output_captions.append(output_caption)
                with open(output_dir + 'paraphrase_' + filename, 'w') as f:
                    for item in output_captions:
                        f.write("%s\n" % item)
def inter_decode():
    if not (FLAGS.inter_decode_sent and FLAGS.inter_decode_position
            and FLAGS.inter_decode_map):
        raise ValueError("Invalid argument, please set inter_decode setting!")

    with tf.Session() as sess:
        # Load dictionaries.
        srce_vocab_path = os.path.join(FLAGS.data_dir, "train",
                                       "vocab%d.srce" % FLAGS.srce_vocab_min)
        trgt_vocab_path = os.path.join(FLAGS.data_dir, "train",
                                       "vocab%d.trgt" % FLAGS.trgt_vocab_min)
        srce_vocab, re_srce_vocab = data_utils.initialize_vocabulary(srce_vocab_path)
        trgt_vocab, re_trgt_vocab = data_utils.initialize_vocabulary(trgt_vocab_path)

        # Create model.
        model = create_model(sess, len(re_srce_vocab), len(re_trgt_vocab), True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Read the sentence and the supplementary inputs (children positions
        # and map) from the flags instead of standard input; the position and
        # map flags hold Python literals.
        sentence = FLAGS.inter_decode_sent
        init_pos = eval(FLAGS.inter_decode_position)
        mapp = eval(FLAGS.inter_decode_map)

        # Get token-ids for the input sentence.
        token_ids = data_utils.sentence_to_token_ids(sentence, srce_vocab)
        # Which bucket does it belong to?
        bucket_id = min([b for b in xrange(len(_buckets))
                         if _buckets[b][0] > len(token_ids)])
        # Get a 1-element batch to feed the sentence to the model.
        encoder_input, decoder_input, target_weight, pos, maps = model.get_batch(
            {bucket_id: [(token_ids, [], init_pos, mapp)]}, bucket_id)
        # Get output logits for the sentence.
        _, _, output_logits, attentions, env, out_pos = model.step(
            sess, encoder_input, decoder_input, target_weight, bucket_id, True,
            decoder_inputs_positions=pos, decoder_inputs_maps=maps)
        # This is a greedy decoder - outputs are just argmaxes of output_logits.
        outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
        # If there is an EOS symbol in outputs, cut them at that point.
        if data_utils.EOS_ID in outputs:
            outputs = outputs[:outputs.index(data_utils.EOS_ID)]

        # Concatenate the predicted positions for every output step.
        final_pos = out_pos[0].tolist()
        for l in xrange(len(outputs) - 1):
            final_pos.extend(out_pos[l + 1].tolist())
        return final_pos
def evaluate(filename):
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        en_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.en" % FLAGS.en_vocab_size)
        fr_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.fr" % FLAGS.fr_vocab_size)
        en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
        _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

        # Decode every sentence of the input file and write the predictions.
        writer = open('pred.txt', 'w')
        count = 0
        with open(filename, 'r') as reader:
            for sentence in reader:
                count += 1
                if count % 1000 == 0:
                    print(count)
                chunks = parser(sentence)
                for sen in chunks:
                    # Get token-ids for the input chunk.
                    token_ids = data_utils.sentence_to_token_ids(sen.strip('\n'),
                                                                 en_vocab)
                    # Which bucket does it belong to?
                    bucket_id = min([b for b in xrange(len(_buckets))
                                     if _buckets[b][0] > len(token_ids)])
                    # Get a 1-element batch to feed the sentence to the model.
                    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                        {bucket_id: [(token_ids, [])]}, bucket_id)
                    # Get output logits for the sentence.
                    _, _, output_logits = model.step(sess, encoder_inputs,
                                                     decoder_inputs, target_weights,
                                                     bucket_id, True)
                    # This is a greedy decoder - outputs are just argmaxes of output_logits.
                    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
                    # If there is an EOS symbol in outputs, cut them at that point.
                    if data_utils.EOS_ID in outputs:
                        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                    # Write the first predicted token for this chunk.
                    output = " ".join([rev_fr_vocab[output] for output in outputs])
                    writer.write(output.split()[0] + '\n')
        writer.close()
def test():
    print("Loading data...")
    vocab_word, vocab_word_list, train_words, train_labels, test_words, test_labels \
        = data_utils.prepare_data(args.data_path)
    max_text_len = max([len(words) for words in train_words])
    vocab_len = len(vocab_word_list)

    # Build the model.
    model = models.TextCNN(max_text_len, 2, vocab_len, args.embedding_size,
                           list(map(int, args.filter_sizes.split(","))),
                           args.num_filters, args.max_gradient_norm,
                           args.learning_rate, args.l2_reg_lambda)

    # Test the model on sample inputs. The reviews below are kept in Chinese
    # because they are the model's input data.
    # A negative restaurant review:
    text = '''说实话没吃成, 但是对这家太不满意了, 所有都给差评!到那之后满屋子都是座, 服务员非得给安排在一个犄角旮旯, 黑咕隆咚的冷气还吹不到. 点了餐喊半天服务员都不来拿单子, 而且是眼看着服务员从身边经过, 喊着服务员服务员, 她们就只当没听见. 然后我自己换了个显眼的位置, 举着手喊服务员, 她们还是无视的从我旁边走过.我k, 你又不服务, 没事走来走去干什么?所以后来干脆不吃了. 请问, 您家是要做生意么?'''
    words_ids = data_utils.sentence_to_token_ids(text, vocab_word)
    predicts = model.predict(words_ids, args.model_path)
    print text
    print "Prediction:", predicts
    if predicts == [0]:
        print "Positive review"
    else:
        print "Negative review"

    # A positive restaurant review:
    text = '''瘦了点,可能和季节有关吧吃完加点青菜做泡饭满嗲的~孔雀开屏 45.00很大一条鱼,摆盘很漂亮,肉质挺嫩,如果加点醋更好, 去腥更美味~~香菇菜心这个 我喜欢的呀~上面酱很嗲~ 香菇很入味,菜心很爽口~ 解油腻 总体来说这里感觉很实惠,虽然价格不贵,但是品质却不错, 摆盘很用心很漂亮。酒香不怕巷子深 用在这里真是非常合适~雨天滴滴答答,不是很舒服,但却并没影响到FB的心情~~~店开在 比较老式的弄堂里, 周围都是居民区,门面并不大,不过据说这里生意很好。性价比高么做的是绍兴菜,装修比较朴素,菜单也是很简单的A4纸 塑封一下总体来说这里感觉很实惠,虽然价格不贵,但是品质却不错,摆盘很用心很漂亮。酒香不怕巷子深 用在这里真是非常合适~'''
    words_ids = data_utils.sentence_to_token_ids(text, vocab_word)
    predicts = model.predict(words_ids, args.model_path)
    print text
    print "Prediction:", predicts
    if predicts == [0]:
        print "Positive review"
    else:
        print "Negative review"
def decode():
    '''Read sentences interactively from standard input and print the
    generated headline for each.'''
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        # Repeat a single sentence FLAGS.batch_size times to form one batch.
        model.batch_size = FLAGS.batch_size

        # Load vocabularies.
        vocab_path = os.path.join(FLAGS.data_dir, "vocab")
        vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

        # Decode from standard input interactively.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            if len(sentence.strip('\n')) == 0:
                sys.stdout.flush()
                sentence = sys.stdin.readline()
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), vocab)
            # Which bucket does it belong to?
            bucket_id = min([b for b in xrange(len(buckets))
                             if buckets[b][0] > len(token_ids)])
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits_batch = model.step(sess, encoder_inputs,
                                                   decoder_inputs, target_weights,
                                                   bucket_id, True)
            # Keep only the logits of the first sentence in the batch.
            output_logits = []
            for item in output_logits_batch:
                output_logits.append(item[0])
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            print(" ".join([tf.compat.as_str(rev_vocab[output])
                            for output in outputs]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def translate_fun(sentence, sess, model, nl_vocab, rev_cm_vocab, FLAGS):
    # Get token-ids for the input sentence.
    if FLAGS.char:
        token_ids = data_utils.sentence_to_token_ids(sentence, nl_vocab,
                                                     data_tools.char_tokenizer,
                                                     data_tools.basic_tokenizer)
    else:
        token_ids = data_utils.sentence_to_token_ids(sentence, nl_vocab,
                                                     data_tools.basic_tokenizer,
                                                     None)
    # Which bucket does it belong to?
    bucket_id = min([b for b in xrange(len(model.buckets))
                     if model.buckets[b][0] > len(token_ids)])
    # Get a 1-element batch to feed the sentence to the model.
    formatted_example = model.format_example([token_ids], [[data_utils.ROOT_ID]],
                                             bucket_id=bucket_id)
    # Get output for the sentence.
    output_symbols, output_logits, losses, attn_masks = \
        model.step(sess, formatted_example, bucket_id, forward_only=True)
    batch_outputs = decode(output_symbols, rev_cm_vocab, FLAGS)
    return batch_outputs, output_logits
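# Usage sketch for translate_fun (not part of the original source): a minimal
# REPL driver under the assumption that a create_model helper and vocabulary
# paths like those in the surrounding snippets exist; translate_repl,
# nl_vocab_path and cm_vocab_path are illustrative names, not a confirmed API.
def translate_repl(FLAGS, nl_vocab_path, cm_vocab_path):
    with tf.Session() as sess:
        model = create_model(sess, True)  # assumed helper, forward_only=True
        nl_vocab, _ = data_utils.initialize_vocabulary(nl_vocab_path)
        _, rev_cm_vocab = data_utils.initialize_vocabulary(cm_vocab_path)
        sys.stdout.write("> ")
        sys.stdout.flush()
        for sentence in sys.stdin:
            batch_outputs, _ = translate_fun(sentence, sess, model,
                                             nl_vocab, rev_cm_vocab, FLAGS)
            print(batch_outputs[0])
            sys.stdout.write("> ")
            sys.stdout.flush()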
def decode():
    with tf.Session() as sess:
        # Load vocabularies.
        src_vocab_path = os.path.join(FLAGS.data_dir,
                                      "vocab%d.src" % FLAGS.src_vocab_size)
        trg_vocab_path = os.path.join(FLAGS.data_dir,
                                      "vocab%d.trg" % FLAGS.trg_vocab_size)
        src_vocab, rev_src_vocab = data_utils.initialize_vocabulary(src_vocab_path)
        trg_vocab, rev_trg_vocab = data_utils.initialize_vocabulary(trg_vocab_path)
        # Clamp the configured vocabulary sizes to the actual vocabularies.
        if FLAGS.src_vocab_size > len(src_vocab):
            FLAGS.src_vocab_size = len(src_vocab)
        if FLAGS.trg_vocab_size > len(trg_vocab):
            FLAGS.trg_vocab_size = len(trg_vocab)

        # Create model and load parameters.
        model = create_model(sess, True, FLAGS.model)
        model.batch_size = 1  # We decode one sentence at a time.

        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), src_vocab)
            token_ids.append(data_utils.EOS_ID)
            # Which bucket does it belong to?
            bucket_id = min([b for b in xrange(len(_buckets))
                             if _buckets[b][0] > len(token_ids)])
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, encoder_mask, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, encoder_mask,
                                             decoder_inputs, target_weights,
                                             bucket_id, True)
            # This is a beam search decoder - output is the best result from beam search.
            outputs = [int(logit) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            print(" ".join([tf.compat.as_str(rev_trg_vocab[output])
                            for output in outputs]))
            sentence = sys.stdin.readline()
def reply_all(message):
    # sess, model, the vocabularies, _buckets and bot are module-level globals.
    sentence = (message.text).lower()
    # Get token-ids for the input sentence.
    token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence),
                                                 enc_vocab)
    # Which bucket does it belong to?
    bucket_id = min([b for b in xrange(len(_buckets))
                     if _buckets[b][0] > len(token_ids)])
    # Get a 1-element batch to feed the sentence to the model.
    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
        {bucket_id: [(token_ids, [])]}, bucket_id)
    # Get output logits for the sentence.
    _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                     target_weights, bucket_id, True)
    # This is a greedy decoder - outputs are just argmaxes of output_logits.
    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
    # If there is an EOS symbol in outputs, cut them at that point.
    if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
    message_text = " ".join([tf.compat.as_str(rev_dec_vocab[output])
                             for output in outputs])
    bot.reply_to(message, message_text)
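# Wiring sketch for reply_all (not part of the original source): how the
# handler might be registered, assuming `bot` is a pyTelegramBotAPI TeleBot
# (the bot.reply_to call above suggests that library); the token string is a
# placeholder, not a real credential.
import telebot

bot = telebot.TeleBot("BOT_TOKEN_PLACEHOLDER")  # hypothetical token

@bot.message_handler(func=lambda message: True)
def on_message(message):
    reply_all(message)

bot.polling()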
def decode(config):
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, config, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), config.embeddings_mentions)
            # Which bucket does it belong to?
            bucket_id = len(config.buckets) - 1
            for i, bucket in enumerate(config.buckets):
                if bucket[0] >= len(token_ids):
                    bucket_id = i
                    break
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, target_weights = model.get_batch([token_ids], bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, target_weights,
                                             bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit[0])) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Print out the mentions corresponding to outputs, skipping ids
            # that fall outside the embedding list.
            out_str = ""
            for output in outputs:
                try:
                    out_str += config.embeddings_mentions_list[output] + ' '
                except IndexError:
                    pass
            print(out_str)
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def decode(originalText):
    global sess
    global model
    global en_vocab_path
    global cn_vocab_path
    global en_vocab
    global rev_cn_vocab

    if model is None:
        # Create model and load parameters.
        sess = tf.Session()
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one text at a time.
        # Load vocabularies.
        en_vocab_path = os.path.join(FLAGS.train_dir,
                                     "vocab%d.en" % FLAGS.en_vocab_size)
        cn_vocab_path = os.path.join(FLAGS.train_dir,
                                     "vocab%d.cn" % FLAGS.cn_vocab_size)
        en_vocab, _ = data_utils.initialize_vocabulary(
            en_vocab_path, data_utils.basic_decoder)
        _, rev_cn_vocab = data_utils.initialize_vocabulary(
            cn_vocab_path, data_utils.chinese_decoder)

    # Get token-ids for the input text.
    token_ids = data_utils.sentence_to_token_ids(originalText, en_vocab,
                                                 data_utils.basic_decoder)
    # Which bucket does it belong to?
    bucket_id = len(_buckets) - 1
    for i, bucket in enumerate(_buckets):
        if bucket[0] >= len(token_ids):
            bucket_id = i
            break
    else:
        logging.warning("originalText truncated: %s", originalText)
    # Get a 1-element batch to feed the text to the model.
    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
        {bucket_id: [(token_ids, [])]}, bucket_id)
    # Get output logits for the text.
    _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                     target_weights, bucket_id, True)
    # This is a greedy decoder - outputs are just argmaxes of output_logits.
    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
    # If there is an EOS symbol in outputs, cut them at that point.
    if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
    # Return the Chinese text corresponding to outputs.
    res = "".join([rev_cn_vocab[output] for output in outputs])
    return res
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        enc_vocab_path = os.path.join(gConfig['working_directory'],
                                      "vocab%d.enc" % gConfig['enc_vocab_size'])
        dec_vocab_path = os.path.join(gConfig['working_directory'],
                                      "vocab%d.dec" % gConfig['dec_vocab_size'])
        enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
        _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)

        # Decode from standard input.
        sys.stdout.write(">")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(sentence, enc_vocab)
            # Which bucket does it belong to?
            bucket_id = min([b for b in range(len(_buckets))
                             if _buckets[b][0] > len(token_ids)])
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Print out the response corresponding to outputs.
            res_str = ""
            for output in outputs:
                res_str += rev_dec_vocab[output][0] + " "
            print(res_str)
            print(">", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def testBLEU():
    source = sys.argv[1]
    target = sys.argv[2]
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        s_vocab_path = os.path.join(FLAGS.data_dir,
                                    "vocab%d.%s" % (FLAGS.s_vocab_size, source))
        t_vocab_path = os.path.join(FLAGS.data_dir,
                                    "vocab%d.%s" % (FLAGS.t_vocab_size, target))
        s_vocab, _ = data_utils.initialize_vocabulary(s_vocab_path)
        _, rev_t_vocab = data_utils.initialize_vocabulary(t_vocab_path)

        # Score the test set, collecting BLEU per bucket.
        BLEUscore = {0: [], 1: [], 2: [], 3: []}
        s_test_path = os.path.join(FLAGS.data_dir, "test.%s" % source)
        t_test_path = os.path.join(FLAGS.data_dir, "test.%s" % target)
        f_s = open(s_test_path, 'r')
        f_t = open(t_test_path, 'r')
        step = 0
        for sentence in f_s:
            print(step)
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), s_vocab)
            # Which bucket does it belong to?
            bucket_id = len(_buckets) - 1
            for i, bucket in enumerate(_buckets):
                if bucket[0] >= len(token_ids):
                    bucket_id = i
                    break
            else:
                logging.warning("Sentence truncated: %s", sentence)
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Score the candidate against the reference translation.
            candidate = [tf.compat.as_str(rev_t_vocab[output]) for output in outputs]
            reference = f_t.readline().split(' ')
            try:
                temp_score = nltk.translate.bleu_score.sentence_bleu(
                    [reference], candidate)
            except:
                # Fall back to bigram weights for very short candidates.
                temp_score = nltk.translate.bleu_score.sentence_bleu(
                    [reference], candidate, weights=(.5, .5))
            BLEUscore[bucket_id].append(temp_score)
            step += 1
            print(temp_score)
        for key, val in BLEUscore.iteritems():
            print(key, ": ", np.mean(val))
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        in_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.in" % FLAGS.in_vocab_size)
        out_vocab_path = os.path.join(FLAGS.data_dir,
                                      "vocab%d.out" % FLAGS.out_vocab_size)
        in_vocab, _ = data_utils.initialize_vocabulary(in_vocab_path)
        _, rev_out_vocab = data_utils.initialize_vocabulary(out_vocab_path)

        # Decode the first line of the test file.
        with gfile.GFile("test.txt", "r") as f:
            sentence = f.readline()
        # Get token-ids for the input sentence.
        token_ids = data_utils.sentence_to_token_ids(sentence, in_vocab)
        # Which bucket does it belong to?
        bucket_id = len(_buckets) - 1
        for i, bucket in enumerate(_buckets):
            if bucket[0] >= len(token_ids):
                bucket_id = i
                break
        else:
            logging.warning("Sentence truncated: %s", sentence)
        # Get a 1-element batch to feed the sentence to the model.
        encoder_inputs, decoder_inputs, target_weights = model.get_batch(
            {bucket_id: [(token_ids, [])]}, bucket_id)
        # Get output logits for the sentence.
        _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, bucket_id, True)
        # This is a greedy decoder - outputs are just argmaxes of output_logits.
        outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
        # If there is an EOS symbol in outputs, cut them at that point.
        if data_utils.EOS_ID in outputs:
            outputs = outputs[:outputs.index(data_utils.EOS_ID)]
        # Write out the decoded sentence corresponding to outputs.
        with gfile.GFile("output.txt", "w") as f:
            f.write(" ".join([rev_out_vocab[output] for output in outputs]))
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = FLAGS.batch_size  # We decode a full batch at a time.

        # Load vocabularies.
        en_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.fr" % FLAGS.en_vocab_size)
        fr_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.en" % FLAGS.fr_vocab_size)
        en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
        _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

        # Decode the prediction input file, batch by batch.
        predif = open(FLAGS.predifname).readlines()
        predof = open(FLAGS.predofname, 'w')
        batch_decode = []
        for predin in predif:
            token_ids = data_utils.sentence_to_token_ids(predin, en_vocab)
            # All sentences go to the first bucket.
            bucket_id = 0
            batch_decode.append((token_ids, []))
            if len(batch_decode) == FLAGS.batch_size:
                # Feed the accumulated batch to the model.
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                    {bucket_id: batch_decode}, bucket_id)
                _, _, output_logits = model.step(sess, encoder_inputs,
                                                 decoder_inputs, target_weights,
                                                 bucket_id, True)
                # Greedy decoding: argmax over the vocabulary at each step.
                outputs = np.transpose(np.array(output_logits), (1, 0, 2))
                outputs = np.argmax(outputs, axis=2)
                # If there is an EOS symbol in an output, cut it at that point.
                for ii, out in enumerate(outputs):
                    idxx = np.where(out == data_utils.EOS_ID)[0]
                    if len(idxx) > 0:
                        out = out[:idxx[0]]
                    predo = " ".join([rev_fr_vocab[word] for word in out])
                    print(predo)
                    predof.write(predo + '\n')
                batch_decode = []
def decode(): with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. enc_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d_enc.txt" % gConfig['enc_vocab_size']) dec_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d_dec.txt" % gConfig['dec_vocab_size']) enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path) _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path) # Decode sentence and store it with open(gConfig["test_enc"], 'r') as test_enc: with open(gConfig["output"], 'w') as predicted_headline: sentence_count = 0 for sentence in test_enc: # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids(sentence, enc_vocab) # Which bucket does it belong to? And place the sentence to the last bucket if its token length is larger then X. bucket_id = min([b for b in range(len(_buckets)) if _buckets[b][0] > len(token_ids)] + [len(_buckets)-1]) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Write predicted headline corresponding to article. predicted_headline.write(" ".join([tf.compat.as_str(rev_dec_vocab[output]) for output in outputs])+'\n') sentence_count += 1 if sentence_count % 100 == 0: print("predicted data line %d" % sentence_count) sys.stdout.flush() predicted_headline.close() test_enc.close() print("Finished decoding and stored predicted results in %s!" % gConfig["output"])
def decode(): with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.en" % FLAGS.en_vocab_size) fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.fr" % FLAGS.fr_vocab_size) en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path) _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path) # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: # Get token-ids for the input sentence. token_idsgb = data_utils.sentence_to_token_ids(sentence, en_vocab) # Truncate sentence to the maximum bucket size token_ids = token_idsgb[0:479] # Which bucket does it belong to? bucket_id = min([ b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids) ]) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [ int(np.argmax(logit, axis=1)) for logit in output_logits ] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Print out French sentence corresponding to outputs. print(" ".join([rev_fr_vocab[output] for output in outputs])) print("> ", end="") sys.stdout.flush() sentence = sys.stdin.readline()
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        source_vocab_path = os.path.join(
            FLAGS.data_dir,
            ("vocab%d." + FLAGS.source_ext) % FLAGS.source_vocab_size)
        target_vocab_path = os.path.join(
            FLAGS.data_dir,
            ("vocab%d." + FLAGS.target_ext) % FLAGS.target_vocab_size)
        source_vocab, _ = data_utils.initialize_vocabulary(source_vocab_path)
        _, rev_target_vocab = data_utils.initialize_vocabulary(target_vocab_path)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), source_vocab)
            # Which bucket does it belong to?
            bucket_id = min([b for b in xrange(len(_buckets))
                             if _buckets[b][0] > len(token_ids)])
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Print out target language sentence corresponding to outputs.
            out_sentence = " ".join([tf.compat.as_str(rev_target_vocab[output])
                                     for output in outputs])
            print(out_sentence)
            # Optionally append the source/target pair to a translation file.
            if FLAGS.translation_file != "":
                with gfile.GFile(FLAGS.translation_file, mode="ab") as fw:
                    fw.write(FLAGS.source_ext + "> " + sentence)
                    fw.write(FLAGS.target_ext + "> " + out_sentence + b"\n\n")
                    fw.flush()
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def decode():
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2)
    config = tf.ConfigProto(gpu_options=gpu_options)
    with tf.Session(config=config) as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        enc_vocab_path = os.path.join(gConfig['working_directory'],
                                      "vocab%d.enc" % gConfig['enc_vocab_size'])
        dec_vocab_path = os.path.join(gConfig['working_directory'],
                                      "vocab%d.dec" % gConfig['dec_vocab_size'])
        enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
        _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), enc_vocab)
            # Which bucket does it belong to?
            bucket_id = min([b for b in xrange(len(_buckets))
                             if _buckets[b][0] > len(token_ids)])
            # Get the required batch.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Fall back to a stock reply when the response contains unknown tokens.
            final_output = " ".join([tf.compat.as_str(rev_dec_vocab[output])
                                     for output in outputs])
            if '_UNK' in final_output:
                final_output = "I didn't learn how to respond to that."
            print(final_output)
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def decode_input():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        enc_vocab_path = os.path.join(gConfig['working_directory'],
                                      "vocab%d_enc.txt" % gConfig['enc_vocab_size'])
        dec_vocab_path = os.path.join(gConfig['working_directory'],
                                      "vocab%d_dec.txt" % gConfig['dec_vocab_size'])
        enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
        _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(sentence, enc_vocab)
            # Which bucket does it belong to? Fall back to the last bucket
            # for very long sentences.
            bucket_id = min([b for b in range(len(_buckets))
                             if _buckets[b][0] > len(token_ids)]
                            + [len(_buckets) - 1])
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            print(" ".join([tf.compat.as_str(rev_dec_vocab[output])
                            for output in outputs]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.in" % FLAGS.vocab_size)
        vocab, vocab_rev = data_utils.initialize_vocabulary(vocab_path)

        # Decode from standard input.
        sys.stdout.write("You> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline().lower()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), vocab)
            # Which bucket does it belong to?
            bucket_id = min([b for b in xrange(len(_buckets))
                             if _buckets[b][0] > len(token_ids)])
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Print out our response sentence corresponding to outputs.
            try:
                print('%s: %s' % (name, buildSentence(outputs, vocab_rev)))
            except Exception as e:
                print(e)
            print("You> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def catreco():
    # Parse GET request parameters, e.g.: /api/catreco?title=iphone5s&nbest=10
    title = request.args.get('title')
    if 'nbest' in request.args:
        nbest = int(request.args.get('nbest'))
    else:
        nbest = 10

    # Run inference with the TensorFlow model.
    # Get token-ids for the input title, truncated or padded to max_seq_length.
    source_tokens = data_utils.sentence_to_token_ids(
        tf.compat.as_bytes(title), app.src_vocab, normalize_digits=True)
    src_len = len(source_tokens)
    if src_len > FLAGS.max_seq_length:
        source_tokens = source_tokens[:FLAGS.max_seq_length]
    else:
        source_tokens = source_tokens + [data_utils.PAD_ID] * (FLAGS.max_seq_length - src_len)

    feed_dict = app.model.get_predict_feed_dict(np.array([source_tokens]),
                                                app.target_inputs,
                                                np.array([src_len]),
                                                app.target_lens)
    pred_conf, pred_labels = app.sess.run(
        [app.model.predicted_tgts_score, app.model.predicted_labels],
        feed_dict=feed_dict)
    pred_labels = np.vstack(pred_labels)
    pred_conf = np.vstack(pred_conf)

    # Collect the n-best categories with their confidence scores.
    top_confs = pred_conf[0][:nbest]
    top_tgtIDs = [app.fullLabel_tgtID_Map[lbl] for lbl in pred_labels[0][:nbest]]
    top_tgtNames = [app.tgtID_Name_Map[id] for id in top_tgtIDs]
    topCategories = []
    for idx in range(nbest):
        print('top%d: %s , %f , %s ' % (idx + 1, top_tgtIDs[idx],
                                        top_confs[idx], top_tgtNames[idx]))
        entry = {}
        entry['leafCatId'] = top_tgtIDs[idx]
        entry['leafCatName'] = top_tgtNames[idx]
        entry['confScore'] = float(top_confs[idx])
        topCategories.append(entry)

    return jsonify({'RequestTitle': title, 'ClassifyResults': topCategories})
def decode(): """Propagate forward and create a response to an input sentence""" with tf.Session() as sess: # Create model and load parameters. model = create_model( sess, True ) # forward_only is True, because we don't need to backpropagate model.batch_size = 1 # We decode one sentence at a time. # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids(sentence) # Which bucket does it belong to? bucket_id = min([ b for b in xrange(len(buckets)) if buckets[b][0] > len(token_ids) ]) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch( # Creating dictionary, not list, because there's only one bucket_id which is maybe != 0 {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [ int(np.argmax(logit, axis=1)) for logit in output_logits ] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: # TODO rewrite according to data_utils outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Print out the response sentence corresponding to outputs. print(data_utils.TOTAL_VOCAB[output] for output in outputs) # Read next input line print("> ", end="") sys.stdout.flush() sentence = sys.stdin.readline()
def run(self, sentence):
    # Get token-ids for the input sentence.
    token_ids = data_utils.sentence_to_token_ids(sentence, self.en_vocab)
    # Which bucket does it belong to?
    bucket_id = min(b for b in xrange(len(_buckets))
                    if _buckets[b][0] > len(token_ids))
    # Get a 1-element batch to feed the sentence to the model.
    encoder_inputs, decoder_inputs, target_weights = self.model.get_batch(
        {bucket_id: [(token_ids, [])]}, bucket_id)
    # Get output logits for the sentence.
    _, _, output_logits = self.model.step(self.sess, encoder_inputs,
                                          decoder_inputs, target_weights,
                                          bucket_id, True)
    # This is a greedy decoder - outputs are just argmaxes of output_logits.
    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
    # If there is an EOS symbol in outputs, cut them at that point.
    if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
    # Return the output sentence corresponding to outputs.
    return "".join([self.rev_fr_vocab[output] for output in outputs])
def decode_line(sess, model, enc_vocab, rev_dec_vocab, sentence):
    """Greedily decode a single input sentence and return the response string."""
    # Get token-ids for the input sentence.
    token_ids = data_utils.sentence_to_token_ids(
        tensorflow.compat.as_bytes(sentence), enc_vocab)
    # Which bucket does it belong to?
    bucket_id = min([b for b in range(len(_buckets))
                     if _buckets[b][0] > len(token_ids)])
    # Get a 1-element batch to feed the sentence to the model.
    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
        {bucket_id: [(token_ids, [])]}, bucket_id)
    # Get output logits for the sentence.
    _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                     target_weights, bucket_id, True)
    # This is a greedy decoder - outputs are just argmaxes of output_logits.
    outputs = [int(numpy.argmax(logit, axis=1)) for logit in output_logits]
    # If there is an EOS symbol in outputs, cut them at that point.
    if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
    return " ".join([tensorflow.compat.as_str(rev_dec_vocab[output])
                     for output in outputs])
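# Usage sketch for decode_line (not part of the original source): a simple
# chat loop, assuming a create_model helper and vocabulary paths like those in
# the surrounding snippets; enc_vocab_path and dec_vocab_path are illustrative
# names, not a confirmed API.
def chat_loop(enc_vocab_path, dec_vocab_path):
    with tensorflow.Session() as sess:
        model = create_model(sess, True)  # assumed helper, forward_only=True
        model.batch_size = 1
        enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
        _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)
        for line in sys.stdin:
            print(decode_line(sess, model, enc_vocab, rev_dec_vocab, line))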
def decode():
    # Pin the graph to the CPU while decoding.
    with tf.device("/cpu:0"), tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        en_vocab_path = './en-vocab.txt'
        ner_vocab_path = './ner-vocab.txt'
        en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
        _, rev_ner_vocab = data_utils.initialize_vocabulary(ner_vocab_path)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), en_vocab)
            # Which bucket does it belong to?
            bucket_id = min([b for b in xrange(len(_buckets))
                             if _buckets[b][0] > len(token_ids)])
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Pair each input word with its predicted NER tag.
            inp_sentence = list(sentence.split())
            out_sentence = [tf.compat.as_str(rev_ner_vocab[output])
                            for output in outputs]
            tagging = zip(inp_sentence, out_sentence)
            for tags in tagging:
                print(tags)
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def decode():
    """Decode interactively from standard input, one sentence at a time."""
    gpu_options = tensorflow.GPUOptions(per_process_gpu_memory_fraction=0.2)
    config = tensorflow.ConfigProto(gpu_options=gpu_options)
    with tensorflow.Session(config=config) as sess:
        model = create_model(sess, True)
        model.batch_size = 1
        enc_vocab_path = os.path.join(
            gConfig['working_directory'], "vocab%d.enc" % gConfig['enc_vocab_size'])
        dec_vocab_path = os.path.join(
            gConfig['working_directory'], "vocab%d.dec" % gConfig['dec_vocab_size'])
        enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
        _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            token_ids = data_utils.sentence_to_token_ids(
                tensorflow.compat.as_bytes(sentence), enc_vocab)
            bucket_id = min([
                b for b in range(len(_buckets)) if _buckets[b][0] > len(token_ids)
            ])
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            outputs = [int(numpy.argmax(logit, axis=1)) for logit in output_logits]
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            print(" ".join([
                tensorflow.compat.as_str(rev_dec_vocab[output]) for output in outputs
            ]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def decode():
    # Build the network, restore trained weights, and serve predicted scores
    # over a plain TCP socket.
    net = network_building()
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.load(MODEL_PATH)
    en_vocab, _ = data_utils.initialize_vocabulary(vocabulary_Path)
    # text = 'The paper is great. However, it comes glued and to start the roll there is quite a bit of waste. Perhaps, you can find a way to package it. Thanks'
    print('start commenting')
    HOST = ''    # Symbolic name meaning all available interfaces
    PORT = 8082  # Arbitrary non-privileged port
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.bind((HOST, PORT))
    s.listen(1)
    conn, addr = s.accept()
    print('Connected by', addr)
    while True:
        text = conn.recv(1024)
        if not text:
            break
        print(str(text, 'utf-8').rstrip())
        # tokenizer=None selects the default basic_tokenizer in data_utils.
        text = data_utils.sentence_to_token_ids(tf.compat.as_bytes(text),
                                                vocabulary=en_vocab,
                                                tokenizer=None)
        # Pad the single example to the fixed input length of the network.
        datalist = pad_sequences([text], maxlen=300, value=0.)
        result = model.predict(datalist)
        print(result)
        # Pick the class with the highest predicted probability.
        resultlist = result[0]
        maxnum = 0.0
        score = 0
        for i in range(len(resultlist)):
            if resultlist[i] > maxnum:
                maxnum = resultlist[i]
                score = i
        print(score)
        conn.sendall(bytes(str(score) + '\n', 'utf-8'))
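# The manual max-scan over resultlist above is just an argmax; a one-line
# equivalent with numpy (the values here are illustrative):
import numpy as np

resultlist = [0.1, 0.7, 0.2]
score = int(np.argmax(resultlist))
assert score == 1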
def decode_tester(sess, model):
    model.batch_size = 1  # We decode one sentence at a time.
    # Load vocabularies.
    en_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.from" % FLAGS.from_vocab_size)
    fr_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.to" % FLAGS.to_vocab_size)
    en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
    _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)
    # Decode a fixed test sentence instead of reading from standard input.
    sentence = "Who is the president of the United States?"
    # Get token-ids for the input sentence.
    token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab)
    # Which bucket does it belong to?
    bucket_id = len(_buckets) - 1
    for i, bucket in enumerate(_buckets):
        if bucket[0] >= len(token_ids):
            bucket_id = i
            break
    else:
        logging.warning("Sentence truncated: %s", sentence)
    # Get a 1-element batch to feed the sentence to the model.
    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
        {bucket_id: [(token_ids, [])]}, bucket_id)
    # Get output logits for the sentence.
    _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                     target_weights, bucket_id, True)
    # This is a greedy decoder - outputs are just argmaxes of output_logits.
    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
    # If there is an EOS symbol in outputs, cut them at that point.
    if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
    # Print out the decoded answer corresponding to outputs.
    print("\toutput: " + " ".join(
        [tf.compat.as_str(rev_fr_vocab[output]) for output in outputs]))
    sys.stdout.flush()
def decode():
    with tf.Session(config=config) as sess:
        model = create_model(sess, True)
        model.batch_size = 1
        in_vocab_path = os.path.join(FLAGS.data_dir, "vocab_in.txt")
        out_vocab_path = os.path.join(FLAGS.data_dir, "vocab_out.txt")
        in_vocab, _ = data_utils.initialize_vocabulary(in_vocab_path)
        _, rev_out_vocab = data_utils.initialize_vocabulary(out_vocab_path)
        print("Hello!!")
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Segment the input so it can be split on whitespace.
            sentence = wakati(sentence)
            token_ids = data_utils.sentence_to_token_ids(sentence, in_vocab)
            bucket_id = min([
                b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)
            ])
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            print("".join([rev_out_vocab[output] for output in outputs]))
            print("\n> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
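# wakati() above is assumed to be a Japanese word segmenter that inserts
# spaces so sentence_to_token_ids can split on whitespace. If MeCab is the
# backend, a minimal sketch would be (an assumption, not the original code):
# import MeCab
# def wakati(sentence):
#     return MeCab.Tagger("-Owakati").parse(sentence)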
def decode():
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2)
    config = tf.ConfigProto(gpu_options=gpu_options)
    with tf.Session(config=config) as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.
        # Load vocabularies.
        enc_vocab_path = os.path.join(gConfig['working_directory'],
                                      "vocab%d.enc" % gConfig['enc_vocab_size'])
        dec_vocab_path = os.path.join(gConfig['working_directory'],
                                      "vocab%d.dec" % gConfig['dec_vocab_size'])
        enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
        _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)
        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), enc_vocab)
            # Which bucket does it belong to?
            bucket_id = min([b for b in xrange(len(_buckets))
                             if _buckets[b][0] > len(token_ids)])
            # Get the required batch.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            print(" ".join([tf.compat.as_str(rev_dec_vocab[output]) for output in outputs]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
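# Several decoders here cap GPU memory with a fixed
# per_process_gpu_memory_fraction; the common TF1 alternative is to let the
# allocation grow on demand instead:
# config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))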
def decode_word(word, sess, model, gr_vocab, rev_ph_vocab):
    # Get token-ids for the input word.
    token_ids = data_utils.sentence_to_token_ids(word, gr_vocab)
    # Which bucket does it belong to?
    bucket_id = min([b for b in xrange(len(_buckets))
                     if _buckets[b][0] > len(token_ids)])
    # Get a 1-element batch to feed the word to the model.
    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
        {bucket_id: [(token_ids, [])]}, bucket_id)
    # Get output logits for the word.
    _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                     target_weights, bucket_id, True)
    # This is a greedy decoder - outputs are just argmaxes of output_logits.
    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
    # If there is an EOS symbol in outputs, cut them at that point.
    if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
    # Return the phoneme sequence corresponding to outputs.
    res_phoneme_seq = " ".join([rev_ph_vocab[output] for output in outputs])
    return res_phoneme_seq
def decode():
    print("Decoding interactively")
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.
        # Load vocabularies.
        vocab_path = "vocab%d" % FLAGS.vocab_size
        vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)
        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), vocab)
            # Which bucket does it belong to?
            bucket_id = len(_buckets) - 1
            for i, bucket in enumerate(_buckets):
                if bucket[0] >= len(token_ids):
                    bucket_id = i
                    break
            else:
                logging.warning("Sentence truncated: %s", sentence)
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Print out the sentence corresponding to outputs.
            print(" ".join([tf.compat.as_str(rev_vocab[output]) for output in outputs]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def decode_line(sess, model, enc_vocab, rev_dec_vocab, sentence): # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), enc_vocab) # Which bucket does it belong to? bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)]) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch({bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] return " ".join([tf.compat.as_str(rev_dec_vocab[output]) for output in outputs])
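# How decode_line above is typically wired up (a sketch; create_model and the
# vocabulary paths are assumptions carried over from the neighboring decoders):
# with tf.Session() as sess:
#     model = create_model(sess, True)
#     model.batch_size = 1
#     enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
#     _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)
#     print(decode_line(sess, model, enc_vocab, rev_dec_vocab, "hello world"))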
def decode():
    with tf.Session() as sess:
        print("Hello!!")
        model = create_model(sess, True)
        model.batch_size = 1
        in_vocab_path = os.path.join(FLAGS.data_dir, "vocab_in.txt")
        out_vocab_path = os.path.join(FLAGS.data_dir, "vocab_out.txt")
        in_vocab, _ = data_utils.initialize_vocabulary(in_vocab_path)
        _, rev_out_vocab = data_utils.initialize_vocabulary(out_vocab_path)
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            token_ids = data_utils.sentence_to_token_ids(sentence, in_vocab)
            bucket_id = min([b for b in xrange(len(_buckets))
                             if _buckets[b][0] > len(token_ids)])
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            print(" ".join([rev_out_vocab[output] for output in outputs]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def inter_decode(sent, position, mapp):
    with tf.Session() as sess:
        # Load dictionaries.
        srce_vocab_path = os.path.join(data_dir, "train", "vocab%d.srce" % 2)
        trgt_vocab_path = os.path.join(data_dir, "train", "vocab%d.trgt" % 0)
        srce_vocab, re_srce_vocab = data_utils.initialize_vocabulary(srce_vocab_path)
        trgt_vocab, re_trgt_vocab = data_utils.initialize_vocabulary(trgt_vocab_path)
        # Create model.
        model = create_model(sess, len(re_srce_vocab), len(re_trgt_vocab), True)
        # model.batch_size = 1  # We decode one sentence at a time.
        sentence = sent
        # Parse the position and map arguments. literal_eval only accepts
        # Python literals, unlike the bare eval used previously.
        import ast
        init_pos = ast.literal_eval(position)
        mapp = ast.literal_eval(mapp)
        # Get token-ids for the input sentence.
        token_ids = data_utils.sentence_to_token_ids(sentence, srce_vocab)
        # Which bucket does it belong to?
        bucket_id = min([b for b in xrange(len(_buckets))
                         if _buckets[b][0] > len(token_ids)])
        # Get a 1-element batch to feed the sentence to the model.
        encoder_input, decoder_input, target_weight, pos, maps = model.get_batch(
            {bucket_id: [(token_ids, [], init_pos, mapp)]}, bucket_id)
        # Get output logits for the sentence.
        _, _, output_logits, attentions, env, out_pos = model.step(
            sess, encoder_input, decoder_input, target_weight, bucket_id, True,
            decoder_inputs_positions=pos, decoder_inputs_maps=maps)
        # This is a greedy decoder - outputs are just argmaxes of output_logits.
        outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
        # If there is an EOS symbol in outputs, cut them at that point.
        if data_utils.EOS_ID in outputs:
            outputs = outputs[:outputs.index(data_utils.EOS_ID)]
        # Flatten the predicted positions for the decoded tokens.
        final_pos = out_pos[0].tolist()
        for l in xrange(len(outputs) - 1):
            final_pos.extend(out_pos[l + 1].tolist())
        return final_pos
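# The position-flattening loop at the end of inter_decode is equivalent, when
# at least one output token exists, to a single comprehension over the first
# len(outputs) position arrays:
# final_pos = [p for step in out_pos[:len(outputs)] for p in step.tolist()]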
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # At prediction time we feed one sentence at a time.
        # Load the vocabularies.
        en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.en" % FLAGS.en_vocab_size)
        fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.fr" % FLAGS.fr_vocab_size)
        en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
        _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)
        # Translate: read English sentences from the console.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        # Decode each input sentence.
        while sentence:
            # First convert the input words to their index form.
            token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab)
            # Choose the bucket according to the sentence length.
            bucket_id = min([b for b in xrange(len(_buckets))
                             if _buckets[b][0] > len(token_ids)])
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get the sequence of output probabilities.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            # Take the argmax word at each output step.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If EOS_ID appears in the translation, keep only the words before it.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Print the result.
            print(" ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        use_beamsearch = FLAGS.beam_size > 0
        model = create_model(sess, True, use_beamsearch=use_beamsearch)
        model.batch_size = 1  # We decode one sentence at a time.
        if FLAGS.use_ori:
            tokenizer = useori_tokenizer
            vocab_data_dir = os.path.join(FLAGS.data_dir, 'ori')
        else:
            tokenizer = cut_tokenizer
            vocab_data_dir = os.path.join(FLAGS.data_dir, 'cut')
        # Load vocabularies.
        en_vocab_path = os.path.join(vocab_data_dir, "vocab%d.q" % FLAGS.en_vocab_size)
        fr_vocab_path = os.path.join(vocab_data_dir, "vocab%d.a" % FLAGS.fr_vocab_size)
        en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
        _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)
        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            try:
                # Get token-ids for the input sentence.
                token_ids = data_utils.sentence_to_token_ids(
                    tf.compat.as_bytes(sentence), en_vocab, tokenizer=tokenizer)
                if len(token_ids) >= _buckets[-1][0]:
                    token_ids = token_ids[0:(_buckets[-1][0] - 1)]
                print(token_ids)
                # Which bucket does it belong to?
                bucket_id = min([b for b in xrange(len(_buckets))
                                 if _buckets[b][0] > len(token_ids)])
                # TODO: indeed can produce longer answers, but with some repeat
                # parts consequently
                # bucket_id = len(_buckets) - 1
                if FLAGS.beam_size > 0:
                    def cal_function(decoder_token_ids, idx):
                        print('decoder_token_ids:', decoder_token_ids)
                        # Get a 1-element batch to feed the sentence to the model.
                        encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                            {bucket_id: [(token_ids, decoder_token_ids)]}, bucket_id)
                        # Get output logits for the sentence.
                        _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                                         target_weights, bucket_id, True)
                        fake_logits = output_logits[idx - 1].reshape([-1])
                        return log_sigmoid(inputs=fake_logits)
                    beam_search = BeamSearch(beam_size=FLAGS.beam_size)
                    beam_search.run(max_step=model.buckets[bucket_id][1],
                                    cal_function=cal_function)
                    final_token_paths = beam_search.get_final_token_paths()
                    for outputs in final_token_paths:
                        print(outputs)
                        if data_utils.EOS_ID in outputs:
                            outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                        # Print out the answer corresponding to outputs.
                        print(" ".join([tf.compat.as_str(rev_fr_vocab[output])
                                        for output in outputs]))
                        print('done')
                else:
                    model.use_beamsearch = False
                    # Get a 1-element batch to feed the sentence to the model.
                    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                        {bucket_id: [(token_ids, [])]}, bucket_id)
                    # Get output logits for the sentence.
                    _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                                     target_weights, bucket_id, True)
                    # This is a greedy decoder - outputs are just argmaxes of output_logits.
                    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
                    print(outputs)
                    # If there is an EOS symbol in outputs, cut them at that point.
                    if data_utils.EOS_ID in outputs:
                        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                    # Print out the answer corresponding to outputs.
                    print(" ".join([tf.compat.as_str(rev_fr_vocab[output])
                                    for output in outputs]))
                    print('done')
            except ValueError as e:
                print(e)
                print("Bad input! Try again:")
            finally:
                # Always prompt for the next sentence, even after bad input.
                print("> ", end="")
                sys.stdout.flush()
                sentence = sys.stdin.readline()
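# The beam-search scorer above runs raw logits through log_sigmoid, which does
# not produce a normalized distribution over the vocabulary. A standalone
# numpy sketch of the more usual log-softmax scoring (illustrative, not the
# original code):
import numpy as np

def log_softmax(logits):
    """Numerically stable log-softmax over the last axis."""
    shifted = logits - np.max(logits, axis=-1, keepdims=True)
    return shifted - np.log(np.sum(np.exp(shifted), axis=-1, keepdims=True))

# Beam hypotheses are then ranked by the running sum of the log_softmax value
# of the token chosen at each step.
assert abs(np.exp(log_softmax(np.array([1.0, 2.0, 3.0]))).sum() - 1.0) < 1e-6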
def decode():
    with tf.Session() as sess:
        # Create model and load parameters. The second argument means the
        # model is built for decoding, not training.
        model = create_model(sess, True)
        # We decode one sentence at a time.
        model.batch_size = 1
        # Load vocabularies.
        vocab_path = os.path.join(FLAGS.data_dir, "Word_map.txt")
        vocab, Q_vocab = data_utils.initialize_vocabulary(vocab_path)
        while 1:
            # Get token-ids for the input sentence.
            sys.stdout.write("Input >> ")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
            token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), vocab)
            # Which bucket does it belong to?
            bucket_id = min([b for b in xrange(len(_buckets))
                             if _buckets[b][0] > len(token_ids)])
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            bucket_length = _buckets[bucket_id][1]
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            max_outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # Alternative decoder, kept for reference: sample each token from
            # the softmax distribution instead of taking the argmax.
            # outputs = []
            # for i in range(bucket_length):
            #     softmax_output_logits = sess.run(tf.nn.softmax(output_logits[i]))
            #     cum_sum = np.cumsum(softmax_output_logits)
            #     random_number_02 = np.random.random_sample()
            #     output = min([j for j in xrange(len(cum_sum))
            #                   if cum_sum[j] > random_number_02])
            #     outputs.append(output)
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in max_outputs:
                max_outputs = max_outputs[:max_outputs.index(data_utils.EOS_ID)]
            print("output >>")
            print(" ".join([tf.compat.as_str(Q_vocab[output]) for output in max_outputs]))
            print("=====================")
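# A self-contained numpy version of the sampling idea kept in comments above:
# softmax the step's logits, then invert the CDF with a uniform draw.
import numpy as np

def sample_token(logits, rng=np.random):
    """Sample one token id from a single decode step's logit row."""
    shifted = logits - np.max(logits)
    probs = np.exp(shifted) / np.sum(np.exp(shifted))
    return int(np.searchsorted(np.cumsum(probs), rng.random_sample()))

# sample_token(np.array([0.0, 5.0, 0.0])) almost always returns 1.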
def decode_from_file(files, model_path=None, use_best=False, get_ids=True,
                     FLAGS=None, buckets=None):
    assert FLAGS is not None
    assert buckets is not None
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        # Load model parameters.
        model = create_seq2seq_model(sess, model_path=model_path, forward_only=True,
                                     use_best=use_best, FLAGS=FLAGS, buckets=buckets,
                                     translate=True)
        # Load vocabularies.
        source_vocab_file = FLAGS.data_dir + \
            (FLAGS.train_data % str(FLAGS.src_vocab_size)) + \
            ('.vocab.%s' % FLAGS.source_lang)
        target_vocab_file = FLAGS.data_dir + \
            (FLAGS.train_data % str(FLAGS.tgt_vocab_size)) + \
            ('.vocab.%s' % FLAGS.target_lang)
        src_vocab, _ = data_utils.initialize_vocabulary(source_vocab_file)
        _, rev_tgt_vocab = data_utils.initialize_vocabulary(target_vocab_file)
        start_total_time = time.time()
        total_sentence_count = 0
        for file_path in files:
            print("Translating file %s\n" % file_path)
            sentence_count = 0
            # Decode from file.
            with gfile.GFile(file_path, mode='r') as source:
                with gfile.GFile(file_path + '.trans', mode='w') as destiny:
                    sentence = source.readline()
                    start_time = time.time()
                    while sentence:
                        sentence_count += 1
                        print("Translating sentence %d" % sentence_count)
                        if get_ids:
                            # Get token-ids for the input sentence.
                            token_ids = data_utils.sentence_to_token_ids(sentence, src_vocab)
                        else:
                            # If the sentence is already converted, just split the ids.
                            token_ids = [int(ss) for ss in sentence.strip().split()]
                        # Get output hypotheses for the sentence via beam search.
                        output_hypotheses, output_scores = model.translation_step(
                            sess, token_ids, FLAGS.beam_size,
                            normalize=True, dump_remaining=True)
                        outputs = output_hypotheses[0]
                        # Write out the sentence corresponding to outputs.
                        destiny.write(" ".join([rev_tgt_vocab[output] for output in outputs]))
                        destiny.write("\n")
                        sentence = source.readline()
                    elapsed = time.time() - start_time
            print("\nDone file %s" % file_path)
            print("Avg. %.3f sentences/sec" % (sentence_count / elapsed))
            total_sentence_count += sentence_count
        total_elapsed = time.time() - start_total_time
        print("\nDone!")
        print("Avg. %.3f sentences/sec" % (total_sentence_count / total_elapsed))
def translate_file(source_path=dev_code_file, target_path=translated_dev_code):
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.
        # Load vocabularies.
        code_vocab_path = os.path.join(data_dir, "vocab%d.code" % FLAGS.code_vocab_size)
        en_vocab_path = os.path.join(data_dir, "vocab%d.en" % FLAGS.en_vocab_size)
        code_vocab, _ = data_utils.initialize_vocabulary(code_vocab_path)
        _, rev_en_vocab = data_utils.initialize_vocabulary(en_vocab_path)
        with tf.gfile.GFile(source_path, mode="r") as source_file:
            with tf.gfile.GFile(target_path, mode="w") as translated_file:
                sentence = source_file.readline()
                counter = 0
                print(" Translating file %s " % source_path)
                while sentence:
                    # Get token-ids for the input sentence.
                    token_ids = data_utils.sentence_to_token_ids(
                        tf.compat.as_bytes(sentence), code_vocab)
                    # Which bucket does it belong to? Emit _UNK when the
                    # sentence is too long for every bucket.
                    buckets = [b for b in xrange(len(_buckets))
                               if _buckets[b][0] > len(token_ids)]
                    if buckets:
                        bucket_id = min(buckets)
                    else:
                        translated_file.write("_UNK \n")
                        sentence = source_file.readline()
                        continue
                    # Get a 1-element batch to feed the sentence to the model.
                    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                        {bucket_id: [(token_ids, [])]}, bucket_id)
                    # Get output logits for the sentence.
                    _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                                     target_weights, bucket_id, True)
                    # This is a greedy decoder - outputs are just argmaxes of output_logits.
                    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
                    # If there is an EOS symbol in outputs, cut them at that point.
                    if data_utils.EOS_ID in outputs:
                        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                    # Write translated sentence to the translation file.
                    translated_file.write(" ".join(
                        [tf.compat.as_str(rev_en_vocab[output]) for output in outputs]) + "\n")
                    # Get next sentence and print checkpoints.
                    counter += 1
                    sentence = source_file.readline()
                    if counter % 500 == 0:
                        print(" Line %d translated" % counter)
        print(" File translated")
def test_BLEU():
    # Compute per-bucket BLEU scores on the test set.
    with tf.Session() as sess:
        model = create_model(sess, True, False)
        source = sys.argv[1]
        target = sys.argv[2]
        model.batch_size = 1  # We decode one sentence at a time.
        # Load vocabularies.
        s_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.%s" % (FLAGS.s_vocab_size, source))
        t_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.%s" % (FLAGS.t_vocab_size, target))
        s_vocab, _ = data_utils.initialize_vocabulary(s_vocab_path)
        _, rev_t_vocab = data_utils.initialize_vocabulary(t_vocab_path)
        # Decode the test file, accumulating BLEU per bucket.
        BLEUscore = {0: [], 1: [], 2: [], 3: []}
        s_test_path = os.path.join(FLAGS.data_dir, "test.%s" % source)
        t_test_path = os.path.join(FLAGS.data_dir, "test.%s" % target)
        f_s = open(s_test_path, 'r')
        f_t = open(t_test_path, 'r')
        step = 0
        for sentence in f_s:
            print(step)
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), s_vocab)
            # Which bucket does it belong to?
            bucket_id = len(_buckets) - 1
            for i, bucket in enumerate(_buckets):
                if bucket[0] >= len(token_ids):
                    bucket_id = i
                    break
            else:
                logging.warning("Sentence truncated: %s", sentence)
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Score the decoded sentence against the reference.
            candidate = [tf.compat.as_str(rev_t_vocab[output]) for output in outputs]
            reference = f_t.readline().split(' ')
            print(candidate, reference)
            try:
                temp_score = nltk.translate.bleu_score.sentence_bleu([reference], candidate)
            except:
                # Fall back to bigram-only weights when 4-gram BLEU fails,
                # e.g. on very short candidates.
                temp_score = nltk.translate.bleu_score.sentence_bleu(
                    [reference], candidate, weights=(.5, .5))
            BLEUscore[bucket_id].append(temp_score)
            step += 1
            print(temp_score)
        for key, val in BLEUscore.iteritems():
            print(key, ": ", np.mean(val))
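# The try/except above retries with bigram-only weights when 4-gram BLEU fails
# on short outputs; nltk also provides SmoothingFunction for exactly this
# case. A minimal sketch:
import nltk

smoother = nltk.translate.bleu_score.SmoothingFunction().method1
score = nltk.translate.bleu_score.sentence_bleu(
    [["the", "cat", "sat"]], ["the", "cat", "sat"],
    smoothing_function=smoother)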
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.
        # Load the vocabularies from plain word-id files instead of data_utils.
        chinese_word2id = {}
        with open('chinese_word2id.txt', 'r') as vocab_file:
            for line in vocab_file:
                if not line.strip():
                    continue
                words = line.split(' ')
                chinese_word2id[words[0]] = int(words[1].strip('\n'))
        english_id2word = {}
        with open('english_word2id.txt', 'r') as vocab_file:
            for line in vocab_file:
                if not line.strip():
                    continue
                words = line.split(' ')
                english_id2word[int(words[1].strip('\n'))] = words[0]
        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence),
                                                         chinese_word2id)
            # Which bucket does it belong to?
            bucket_id = len(_buckets) - 1
            for i, bucket in enumerate(_buckets):
                if bucket[0] >= len(token_ids):
                    bucket_id = i
                    break
            else:
                logging.warning("Sentence truncated: %s", sentence)
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Print out the English sentence corresponding to outputs.
            print(" ".join([tf.compat.as_str(english_id2word[output]) for output in outputs]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
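# A more idiomatic sketch of the two word-id loaders above (same mapping, with
# the files iterated and closed properly; file names carried over from the
# original):
def load_word2id(path):
    word2id = {}
    with open(path, 'r') as f:
        for line in f:
            if not line.strip():
                continue
            word, idx = line.split()
            word2id[word] = int(idx)
    return word2id

# chinese_word2id = load_word2id('chinese_word2id.txt')
# english_id2word = {i: w for w, i in load_word2id('english_word2id.txt').items()}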