def interactive(): with tf.Session() as sess: # Create model and load parameters. gr_vocab_path = os.path.join(FLAGS.model, "vocab.grapheme") ph_vocab_path = os.path.join(FLAGS.model, "vocab.phoneme") gr_vocab_size = data_utils.get_vocab_size(gr_vocab_path) ph_vocab_size = data_utils.get_vocab_size(ph_vocab_path) model = create_model(sess, True, gr_vocab_size, ph_vocab_size) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. gr_vocab, _ = data_utils.initialize_vocabulary(gr_vocab_path) _, rev_ph_vocab = data_utils.initialize_vocabulary(ph_vocab_path) # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() w = sys.stdin.readline() word = " ".join(list(w)) while word: gr_absent = [gr for gr in w.replace('\n','') if gr not in gr_vocab] if not gr_absent: res_phoneme_seq = decode_word(word, sess, model, gr_vocab, rev_ph_vocab) print(res_phoneme_seq) else: print("Symbols: %s not in trained model's vocabulary" % ",".join(gr_absent) ) print("> ", end="") sys.stdout.flush() w = sys.stdin.readline() word = " ".join(list(w))
def decode_from_stdin(show_all_n_best=False, FLAGS=None, buckets=None): assert FLAGS is not None assert buckets is not None # with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)) as sess: with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: # Create model and load parameters. model = create_seq2seq_model(sess, True, FLAGS, buckets, translate=True) # Load vocabularies. source_vocab_file = FLAGS.data_dir + \ (FLAGS.train_data % str(FLAGS.src_vocab_size)) + \ ('.vocab.%s' % FLAGS.source_lang) target_vocab_file = FLAGS.data_dir + \ (FLAGS.train_data % str(FLAGS.tgt_vocab_size)) + \ ('.vocab.%s' % FLAGS.target_lang) src_vocab, _ = data_utils.initialize_vocabulary(source_vocab_file) _, rev_tgt_vocab = data_utils.initialize_vocabulary(target_vocab_file) # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids(sentence, src_vocab) # Get output logits for the sentence. output_hypotheses, output_scores = model.translation_step(sess, token_ids, beam_size=FLAGS.beam_size, dump_remaining=False) outputs = [] for x in output_hypotheses: try: outputs.append(x[:x.index(data_utils.EOS_ID)]) except ValueError: pass output_hypotheses = outputs # print translations if show_all_n_best: for x in xrange(len(outputs)): out = outputs[x] # Print out French sentence corresponding to outputs. print(str(numpy.exp(-output_scores[x])) + "\t" + " ".join([rev_tgt_vocab[output] for output in out])) else: out = outputs[0] # Print out French sentence corresponding to outputs. print(str(numpy.exp(-output_scores[0])) + "\t" + " ".join([rev_tgt_vocab[output] for output in out])) # wait for a new sentence to translate print("> ", end="") sys.stdout.flush() sentence = sys.stdin.readline()
def decode(): with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.en" % FLAGS.en_vocab_size) fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.fr" % FLAGS.fr_vocab_size) en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path) _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path) # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids(sentence, en_vocab) # Which bucket does it belong to? bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)]) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch({bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[: outputs.index(data_utils.EOS_ID)] # Print out French sentence corresponding to outputs. print(" ".join([rev_fr_vocab[output] for output in outputs])) print("> ", end="") sys.stdout.flush() sentence = sys.stdin.readline()
def decode(): with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.tgt" % FLAGS.en_vocab_size) fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.src" % FLAGS.fr_vocab_size) en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path) _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path) # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() sentence = map(lambda x:x.decode('utf-8'), ['こんにちは']).pop() with open('./narou/narou_dev.src.txt', 'r') as f: lines = f.read().split('\n') print(lines) #while sentence: for sentence in lines: print(sentence) # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab) print(token_ids) # Which bucket does it belong to? try: bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)]) except ValueError: continue # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Print out the decoded sentence corresponding to outputs. print("ANS:>", " ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs])) print("> ", end="")
def eval(): # Load vocabularies. nl_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.nl" % FLAGS.nl_vocab_size) cm_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.cm.ast" % FLAGS.cm_vocab_size) nl_vocab, rev_nl_vocab = data_utils.initialize_vocabulary(nl_vocab_path) cm_vocab, rev_cm_vocab = data_utils.initialize_vocabulary(cm_vocab_path) train_set, dev_set, _ = load_data() model = knn.KNNModel() model.train(train_set) eval_tools.eval_set(model, dev_set, rev_nl_vocab, FLAGS)
def evaluate(filename): with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.en" % FLAGS.en_vocab_size) fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.fr" % FLAGS.fr_vocab_size) en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path) _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path) # Decode from standard input. writer = open('pred.txt', 'w') count = 0 with open(filename, 'r') as reader: for sentence in reader: count += 1 if count % 1000 == 0: print (count) chunks = parser(sentence) #print (chunks) # Get token-ids for the input sentence. for sen in chunks: token_ids = data_utils.sentence_to_token_ids(sen.strip('\n'), en_vocab) # Which bucket does it belong to? bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)]) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. # print ("previous: ") # print (outputs) if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Print out French sentence corresponding to outputs. # print ("after: ") # print (outputs) output = (" ".join([rev_fr_vocab[output] for output in outputs])) #print (rev_fr_vocab) #print ("output: %s" % output) writer.write(output.split()[0]+'\n') writer.close()
def inter_decode(): if not (FLAGS.inter_decode_sent and FLAGS.inter_decode_position and FLAGS.inter_decode_map): raise ValueError(" Invalid argument, please set inter_decode setting! ") with tf.Session() as sess: # Load dictionary srce_vocab_path = os.path.join(FLAGS.data_dir, "train", "vocab%d.srce" % FLAGS.srce_vocab_min) trgt_vocab_path = os.path.join(FLAGS.data_dir, "train", "vocab%d.trgt" % FLAGS.trgt_vocab_min) srce_vocab, re_srce_vocab = data_utils.initialize_vocabulary(srce_vocab_path) trgt_vocab, re_trgt_vocab = data_utils.initialize_vocabulary(trgt_vocab_path) # Create model model = create_model(sess, len(re_srce_vocab), len(re_trgt_vocab), True) model.batch_size = 1 # We decode one sentence at a time. # Decode from standard input. ---> interactive decoding # sys.stdout.write("> ") # sys.stdout.flush() # sentence = sys.stdin.readline() sentence = FLAGS.inter_decode_sent # read supplement input: children, weight. # init_pos = eval(sys.stdin.readline()) # mapp = eval(sys.stdin.readline()) init_pos = eval(FLAGS.inter_decode_position) mapp = eval(FLAGS.inter_decode_map) # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids(sentence, srce_vocab) # Which bucket does it belong to? bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)]) # Get a 1-element batch to feed the sentence to the model. # pdb.set_trace() encoder_input, decoder_input, target_weight, pos, maps = model.get_batch( {bucket_id: [(token_ids, [], init_pos, mapp)]}, bucket_id) # Get output logits for the sentence. _, _, output_logits, attentions, env, out_pos = model.step(sess, encoder_input, decoder_input, target_weight, bucket_id, True, decoder_inputs_positions=pos, decoder_inputs_maps=maps) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] final_pos = out_pos[0].tolist() for l in xrange(len(outputs)-1): final_pos.extend(out_pos[l+1].tolist()) return final_pos
def decode(): # Load vocabularies. nl_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.nl" % FLAGS.nl_vocab_size) cm_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.cm" % FLAGS.cm_vocab_size) nl_vocab, rev_nl_vocab = data_utils.initialize_vocabulary(nl_vocab_path) cm_vocab, rev_cm_vocab = data_utils.initialize_vocabulary(cm_vocab_path) train_set, dev_set, _ = load_data() model = knn.KNNModel() model.train(train_set) decode_set(model, dev_set, rev_nl_vocab, rev_cm_vocab)
def get_vocabs(): """Initialize and return vocabularies and paths to them. Returns: gr_vocab: Graphemes vocabulary; rev_ph_vocab: Reversed phonemes vocabulary; gr_vocab_path: Path to the graphemes vocabulary; ph_vocab_path: Path to the phonemes vocabulary. """ # Initialize vocabularies gr_vocab_path = os.path.join(FLAGS.model, "vocab.grapheme") ph_vocab_path = os.path.join(FLAGS.model, "vocab.phoneme") gr_vocab, _ = data_utils.initialize_vocabulary(gr_vocab_path) _, rev_ph_vocab = data_utils.initialize_vocabulary(ph_vocab_path) return (gr_vocab, rev_ph_vocab, gr_vocab_path, ph_vocab_path)
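For context, a minimal sketch (not part of the original sources) of how the tuple returned by get_vocabs() feeds the grapheme-to-phoneme decoding path shown in interactive() and decode() above; the helper name pronounce is hypothetical, and decode_word, create_model, data_utils.get_vocab_size and FLAGS are assumed to exist as in the surrounding snippets.

def pronounce(sess, w):
    # Load vocabularies and build the model exactly as interactive()/decode() do.
    gr_vocab, rev_ph_vocab, gr_vocab_path, ph_vocab_path = get_vocabs()
    gr_vocab_size = data_utils.get_vocab_size(gr_vocab_path)
    ph_vocab_size = data_utils.get_vocab_size(ph_vocab_path)
    model = create_model(sess, True, gr_vocab_size, ph_vocab_size)
    model.batch_size = 1  # decode one word at a time
    # Split the word into space-separated graphemes and reject unknown symbols.
    clean = w.replace('\n', '')
    gr_absent = [gr for gr in clean if gr not in gr_vocab]
    if gr_absent:
        raise ValueError("Symbols: %s not in trained model's vocabulary" % ",".join(gr_absent))
    word = " ".join(list(clean))
    return decode_word(word, sess, model, gr_vocab, rev_ph_vocab)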
def __init__(self): self.sess = tf.Session() self.download_trained_if_not_exists() # Create model and load parameters. self.model = create_model(self.sess, True) self.model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.en" % FLAGS.en_vocab_size) fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.fr" % FLAGS.fr_vocab_size) self.en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path) _, self.rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)
def init_session(sess, conf='seq2seq.ini'): """Create the model, restore its parameters, and return the session, model, and vocabularies.""" global gConfig gConfig = get_config(conf) model = create_model(sess, True) model.batch_size = 1 enc_vocab_path = os.path.join(gConfig['working_directory'], "vocab%d.enc" % gConfig['enc_vocab_size']) dec_vocab_path = os.path.join(gConfig['working_directory'], "vocab%d.dec" % gConfig['dec_vocab_size']) enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path) _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path) return sess, model, enc_vocab, rev_dec_vocab
def decode(): input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. in_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.in" % FLAGS.in_vocab_size) out_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.out" % FLAGS.out_vocab_size) in_vocab, _ = data_utils.initialize_vocabulary(in_vocab_path) _, rev_out_vocab = data_utils.initialize_vocabulary(out_vocab_path) # Decode a sentence read from test.txt. with gfile.GFile("test.txt", "r") as f: sentence = f.readline() for _ in range(1): # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids(sentence, in_vocab) # Which bucket does it belong to? bucket_id = len(_buckets) - 1 for i, bucket in enumerate(_buckets): if bucket[0] >= len(token_ids): bucket_id = i break else: logging.warning("Sentence truncated: %s", sentence) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [ int(np.argmax(logit, axis=1)) for logit in output_logits ] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Write the output sentence corresponding to outputs to output.txt. with gfile.GFile("output.txt", "w") as f: f.write(" ".join([rev_out_vocab[output] for output in outputs]))
def decode(): with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.en" % FLAGS.en_vocab_size) fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.fr" % FLAGS.fr_vocab_size) en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path) _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path) # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids( tf.compat.as_bytes(sentence), en_vocab) # Which bucket does it belong to? bucket_id = min([ b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids) ]) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [ int(np.argmax(logit, axis=1)) for logit in output_logits ] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Print out French sentence corresponding to outputs. print(" ".join([ tf.compat.as_str(rev_fr_vocab[output]) for output in outputs ])) print("> ", end="") sys.stdout.flush() sentence = sys.stdin.readline()
def decode(): with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. enc_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d_enc.txt" % gConfig['enc_vocab_size']) dec_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d_dec.txt" % gConfig['dec_vocab_size']) enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path) _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path) # Decode sentence and store it with open(gConfig["test_enc"], 'r') as test_enc: with open(gConfig["output"], 'w') as predicted_headline: sentence_count = 0 for sentence in test_enc: # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids(sentence, enc_vocab) # Which bucket does it belong to? Place the sentence in the last bucket if its token length is larger than the largest bucket size. bucket_id = min([b for b in range(len(_buckets)) if _buckets[b][0] > len(token_ids)] + [len(_buckets)-1]) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Write predicted headline corresponding to article. predicted_headline.write(" ".join([tf.compat.as_str(rev_dec_vocab[output]) for output in outputs])+'\n') sentence_count += 1 if sentence_count % 100 == 0: print("predicted data line %d" % sentence_count) sys.stdout.flush() predicted_headline.close() test_enc.close() print("Finished decoding and stored predicted results in %s!" % gConfig["output"])
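The bucket lookup in the headline decoder above appends len(_buckets)-1 so that an input longer than every bucket still falls into the last bucket, whereas the bare min(...) used in several of the other decode() snippets raises ValueError on an empty list. A small self-contained illustration follows; the bucket sizes are made up for the example and are not taken from any of the original configurations.

# Illustration of the bucket-selection fallback (hypothetical bucket sizes).
_buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]

def pick_bucket(token_ids):
    # Same expression as in the decoder above: the appended index guarantees a
    # non-empty list, so over-long sentences land in the last bucket.
    return min([b for b in range(len(_buckets)) if _buckets[b][0] > len(token_ids)]
               + [len(_buckets) - 1])

print(pick_bucket([1] * 3))    # 0 -> bucket (5, 10)
print(pick_bucket([1] * 12))   # 2 -> bucket (20, 25)
print(pick_bucket([1] * 100))  # 3 -> last bucket instead of a ValueError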
def decode(): with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = FLAGS.batch_size # We decode batch_size sentences at a time. # Load vocabularies. en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.fr" % FLAGS.en_vocab_size) fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.en" % FLAGS.fr_vocab_size) en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path) _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path) # Decode from the prediction input file. #sys.stdout.write("> ") #sys.stdout.flush() predif = open(FLAGS.predifname).readlines() predof = open(FLAGS.predofname, 'w') #sentence = predif.readline() #count = 0 batch_decode = [] for predin in predif: token_ids = data_utils.sentence_to_token_ids(predin, en_vocab) # Which bucket does it belong to? bucket_id = 0#min([b for b in xrange(len(_buckets)) # if _buckets[b][0] > len(token_ids)]) batch_decode.append((token_ids, [])) if len(batch_decode) == FLAGS.batch_size: # Get a batch_size-element batch to feed the sentences to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: batch_decode}, bucket_id) _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) #embed() outputs = np.transpose(np.array(output_logits), (1, 0, 2)) outputs = np.argmax(outputs, axis=2) # If there is an EOS symbol in outputs, cut them at that point. for ii, out in enumerate(outputs): idxx = np.where(out == data_utils.EOS_ID)[0] if len(idxx)>0: out = out[:idxx[0]] predo = " ".join([rev_fr_vocab[word] for word in out]) print (predo) predof.write(predo + '\n') batch_decode = []
def testBLEU(): source = sys.argv[1] target = sys.argv[2] with tf.Session() as sess: model = create_model(sess, True, True) model.batch_size = 1 s_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.%s" % (FLAGS.s_vocab_size, source)) t_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.%s" % (FLAGS.t_vocab_size, target)) s_vocab, _ = data_utils.initialize_vocabulary(s_vocab_path) _, rev_t_vocab = data_utils.initialize_vocabulary(t_vocab_path) BLEUscore = {0:[], 1:[], 2:[], 3:[]} s_test_path = os.path.join(FLAGS.data_dir, "test.%s" % source) t_test_path = os.path.join(FLAGS.data_dir, "test.%s" % target) f_s = open(s_test_path, 'r') f_t = open(t_test_path, 'r') step = 0 for sentence in f_s: print(step) token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), s_vocab) bucket_id = len(_buckets) - 1 for i, bucket in enumerate(_buckets): if bucket[0] >= len(token_ids): bucket_id = i break else: logging.warning("Sentence truncated: %s", sentence) encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] candidate = [tf.compat.as_str(rev_t_vocab[output]) for output in outputs] reference = f_t.readline().split(' ') try: temp_score = nltk.translate.bleu_score.sentence_bleu([reference], candidate) except: temp_score = nltk.translate.bleu_score.sentence_bleu([reference], candidate, weights=(.5, .5)) BLEUscore[bucket_id].append(temp_score) step += 1 print(temp_score) for key,val in BLEUscore.iteritems(): print(key, ": ", np.mean(val))
def init_session(sess, conf='seq2seq.ini'): global gConfig gConfig = get_config(conf) # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. enc_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d.enc" % gConfig['enc_vocab_size']) dec_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d.dec" % gConfig['dec_vocab_size']) enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path) _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path) return sess, model, enc_vocab, rev_dec_vocab
def chat(): with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, forward_only=True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.in" % FLAGS.vocab_size) vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path) # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() sentence = re.sub(u'[^\u4e00-\u9fa5,。;:?!‘’“”、]', '', sentence.decode('utf-8')) sentence = re.sub(u'(?P<chinese>[\u4e00-\u9fa5,。;:?!‘’“”、])', add_space, sentence) while sentence: predicted_sentence = get_predicted_sentence( sentence, vocab, rev_vocab, model, sess) print(predicted_sentence) print("> ") sys.stdout.flush() sentence = sys.stdin.readline() sentence = re.sub(u'[^\u4e00-\u9fa5,。;:?!‘’“”、]', '', sentence.decode('utf-8')) sentence = re.sub(u'(?P<chinese>[\u4e00-\u9fa5,。;:?!‘’“”、])', add_space, sentence)
def predict(): def _get_test_dataset(): with open(TEST_DATASET_PATH) as test_fh: test_sentences = [s.strip() for s in test_fh.readlines()] return test_sentences results_filename = '_'.join([ 'results', str(FLAGS.num_layers), str(FLAGS.size), str(FLAGS.vocab_size) ]) results_path = os.path.join(FLAGS.results_dir, results_filename) with tf.Session() as sess, open(results_path, 'w') as results_fh: # Create model and load parameters. model = create_model(sess, forward_only=True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.in" % FLAGS.vocab_size) vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path) test_dataset = _get_test_dataset() for sentence in test_dataset: # Get token-ids for the input sentence. predicted_sentence = get_predicted_sentence( sentence, vocab, rev_vocab, model, sess) print(sentence + ' -> ' + predicted_sentence) results_fh.write(predicted_sentence + '\n')
def __init__(self): chatbot.FLAGS.train_dir = 'tmp' chatbot.FLAGS.data_dir = 'tmp' self.sess = tf.InteractiveSession() self.model = chatbot.create_model(self.sess, True) self.model.batch_size = 1 data_dir = 'tmp' input_vocab_size = 40000 output_vocab_size = 40000 input_vocab_path = os.path.join(data_dir, "vocab%d.in" % input_vocab_size) output_vocab_path = os.path.join(data_dir, "vocab%d.out" % output_vocab_size) self.in_vocab, _ = data_utils.initialize_vocabulary(input_vocab_path) _, self.rev_out_vocab = data_utils.initialize_vocabulary( output_vocab_path)
def __init__(self): self.VOCAB_SIZE = 1000 self.SEQ_LEN = 20 vocab_path = 'data/sequence/vocab{}'.format(self.VOCAB_SIZE + 4) if os.path.exists(vocab_path): vocab, self.rev_vocab = data_utils.initialize_vocabulary( vocab_path) def gen_data(f, num_data, test=False): for _ in range(num_data): inp_init = np.random.randint(self.VOCAB_SIZE) inp_len = np.random.randint(1, high=self.SEQ_LEN + 1) inp, out_init = self.compute(inp_init, inp_len) buf = ' '.join(str(i) for i in inp) buf += '\n' f.write(buf) if not test: out_len = np.random.randint(1, high=self.SEQ_LEN + 1) out, _ = self.compute(out_init, out_len) buf = ' '.join(str(i) for i in out) buf += '\n' f.write(buf) if not os.path.exists('data/sequence/train_sequence.txt'): with open('data/sequence/train_sequence.txt', 'w') as f: gen_data(f, 100000) with open('data/sequence/dev_sequence.txt', 'w') as f: gen_data(f, 10000) with open('data/sequence/test_sequence.txt', 'w') as f: gen_data(f, 10000, test=True)
def read_data(src_path, vocab_path): data_set = [] max_length1, max_length2 = 0, 0 from_vocab, rev_from_vocab = data_utils.initialize_vocabulary(vocab_path) with tf.gfile.GFile(src_path, mode="r") as src_file: src = src_file.readline() counter = 0 while src: if counter % 100000 == 0: print(" reading data line %d" % counter) sys.stdout.flush() # if counter > 100000: # break sentences = [] s = [] for x in src.split(" "): id = int(x) if id != -1: s.append(id) else: if len(s) > max_length1: max_length1 = len(s) if len(s) > 25: s = s[:25] sentences.append(s) s = [] data_set.append(sentences) counter += 1 src = src_file.readline() print(counter) print(max_length1) return data_set
def predict(): with tf.Session() as sess: model_obj = model.Seq2SeqModel(config, 'decode') model_obj.batch_size = 1 model_obj.model_restore(sess) vocab_path = config.source_vocabulary vocab, vocab_list = data_utils.initialize_vocabulary(vocab_path) while True: question = input("输入:") if question == "" or question == 'exit': break sentence = " ".join(list(jieba.cut(question))) token_ids_sentence = data_utils.sentence_to_token_ids( sentence, vocab) if config.beam_with > 1: predicted_sentence = model_obj.predict_beam_search( sess, np.array([token_ids_sentence]), np.array([len(token_ids_sentence)]), vocab_list) else: predicted_sentence = model_obj.predict( sess, np.array([token_ids_sentence]), np.array([len(token_ids_sentence)]), vocab_list) print("输出:", predicted_sentence)
def read_chat_data(data_path, vocabulary_path, max_size=None): counter = 0 vocab, _ = initialize_vocabulary(vocabulary_path) print("size of vocab: %s" % len(vocab)) print("max size: %s" % max_size) data_set = [[] for _ in _buckets] with codecs.open(data_path, "rb") as fi: for line in fi.readlines(): counter += 1 if max_size != 0 and counter > max_size: break if counter % 10000 == 0: print(" reading data line %d" % counter) sys.stdout.flush() entities = line.decode().lower().split("\t") # print entities if len(entities) == 2: source = entities[0] target = entities[1] source_ids = [ int(x) for x in sentence_to_token_ids(source, vocab) ] target_ids = [ int(x) for x in sentence_to_token_ids(target, vocab) ] target_ids.append(EOS_ID) for bucket_id, (source_size, target_size) in enumerate(_buckets): if len(source_ids) < source_size and len( target_ids) < target_size: data_set[bucket_id].append([source_ids, target_ids]) break return data_set
def __init__(self): self.SEQ_LEN = 10 vocab_path = 'data/addition/vocab{}'.format(10 + 4) if os.path.exists(vocab_path): _, self.rev_vocab = data_utils.initialize_vocabulary(vocab_path) def gen_data(f, num_data, test=False): for _ in range(num_data): seq_len = np.random.randint(2, high=self.SEQ_LEN + 1) seq = np.random.randint(10, size=seq_len) inp_seq = ' '.join(str(i) for i in seq) inp_seq += '\n' f.write(inp_seq) if not test: flag = np.random.randint(1, high=seq_len) num1 = ''.join(str(i) for i in seq[:flag]) num1 = int(num1) num2 = ''.join(str(i) for i in seq[flag:]) num2 = int(num2) out = str(num1 + num2) out_seq = ' '.join(i for i in out) out_seq += '\n' f.write(out_seq) if not os.path.exists('data/addition/train_addition.txt'): with open('data/addition/train_addition.txt', 'w') as f: gen_data(f, 100000) with open('data/addition/dev_addition.txt', 'w') as f: gen_data(f, 10000) with open('data/addition/test_addition.txt', 'w') as f: gen_data(f, 10000, test=True)
def __init__(self): self.UPBOUND = 9 self.SEQ_LEN = 10 vocab_path = 'data/counting/vocab{}'.format(self.UPBOUND + 1 + 4) vocab, self.rev_vocab = data_utils.initialize_vocabulary(vocab_path) self.number_rev_vocab = tf.string_to_number( tf.constant(self.rev_vocab[4:]), tf.int32) def gen_data(f, num_data, test=False): for _ in range(num_data): inp_len = np.random.randint(1, high=self.SEQ_LEN) inp = np.random.randint(self.UPBOUND + 1, size=inp_len) buf = ' '.join(str(i) for i in inp) buf += '\n' f.write(buf) if not test: #out_flags_num = np.random.randint(inp_len + 1) out_flag = np.random.randint(inp_len) out = [out_flag, inp[out_flag], len(inp) - out_flag - 1] buf = ' '.join(str(i) for i in out) buf += '\n' f.write(buf) if not os.path.exists('data/counting/train_counting.txt'): with open('data/counting/train_counting.txt', 'w') as f: gen_data(f, 100000) with open('data/counting/dev_counting.txt', 'w') as f: gen_data(f, 10000) with open('data/counting/test_counting.txt', 'w') as f: gen_data(f, 10000, test=True)
def prepare_data(config): train_path = os.path.join(config.train_dir, "chitchat.train") data_path_list = [train_path + ".answer", train_path + ".query"] vocab_path = os.path.join(config.train_dir, "vocab%d.all" % config.vocab_size) data_utils.create_vocabulary(vocab_path, data_path_list, config.vocab_size) vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path) # # if os.path.isfile(config.dev_set) and os.path.isfile(config.train_set): # dev_set_file = open(config.dev_set, "rb") # dev_set = pickle.load(dev_set_file) # dev_set_file.close() # # train_set_file = open(config.train_set, "rb") # train_set = pickle.load(train_set_file) # train_set_file.close() # else: print("Prepare Chitchat data in %s" % config.train_dir) train_query, train_answer, dev_query, dev_answer = data_utils.prepare_chitchat_data( config.train_dir, vocab, config.vocab_size) print("Reading development and training data (limit: %d)." % config.max_train_data_size) dev_set = read_data(config, dev_query, dev_answer) train_set = read_data(config, train_query, train_answer) # dev_set_file = open(config.dev_set, "wb") # pickle.dump(dev_set, dev_set_file) # dev_set_file.close() # # train_set_file = open(config.train_set, "wb") # pickle.dump(train_set, train_set_file) # train_set_file.close() return vocab, rev_vocab, dev_set, train_set
def chat(args): with tf.Session() as sess: # Create model and load parameters. args.batch_size = 1 # We decode one sentence at a time. model = create_model(sess, args) # Load vocabularies. vocab_path = os.path.join(args.data_dir, "vocab%d.in" % args.vocab_size) vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path) # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: predicted_sentence = get_predicted_sentence( args, sentence, vocab, rev_vocab, model, sess) # print(predicted_sentence) if isinstance(predicted_sentence, list): for sent in predicted_sentence: print(" (%s) -> %s" % (sent['prob'], sent['dec_inp'])) else: print(sentence, ' -> ', predicted_sentence) sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline()
def decode(): ''' Manually input sentence interactively and the headline will be printed out ''' with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = FLAGS.batch_size # repeat the single input sentence batch_size times as one batch # Load vocabularies. vocab_path = os.path.join(FLAGS.data_dir, "vocab") vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path) # Decode from standard input interactively sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: sentence = SeqSentence(sentence) if (len(sentence.strip('\n')) == 0): sys.stdout.flush() sentence = sys.stdin.readline() continue # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids( tf.compat.as_bytes(sentence), vocab) # print (token_ids) # print token ids # Which bucket does it belong to? bucket_id = min([ b for b in xrange(len(buckets)) if buckets[b][0] > len(token_ids) ]) # Get a 1-element batch to feed the sentence to the model. # print ("current bucket id" + str(bucket_id)) encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits_batch = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. output_logits = [] for item in output_logits_batch: output_logits.append(item[0]) #print (output_logits) #print (len(output_logits)) #print (output_logits[0]) outputs = [int(np.argmax(logit)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] print(" ".join( [tf.compat.as_str(rev_vocab[output]) for output in outputs])) print("> ", end="") sys.stdout.flush() sentence = sys.stdin.readline()
def decode(): with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.from" % FLAGS.from_vocab_size) fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.to" % FLAGS.to_vocab_size) en_vocab, rev_en_vocab = data_utils.initialize_vocabulary( en_vocab_path) #_, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path) # Decode from standard input. #sys.stdout.write("> ") #sys.stdout.flush() #sentence = sys.stdin.readline() #while sentence: id_counts = 0 with open('./wikisql_in_nmt/dev.seq') as f, open( 'tmp.eval.ids.true', 'w') as ft: lines = f.readlines() for l in tqdm(lines, total=len(lines)): # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids( tf.compat.as_bytes(l), en_vocab) token_ids.append(data_utils.EOS_ID) # Which bucket does it belong to? bucket_id = len(_buckets) - 1 for i, bucket in enumerate(_buckets): if bucket[0] >= len(token_ids): bucket_id = i break else: logging.warning("Sentence truncated: %s", l) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights, target_id, sent_id = model.get_batch( {bucket_id: [(token_ids, [], 1)]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, target_id, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [ int(np.argmax(logit, axis=1)) for logit in output_logits ] # If there is an EOS symbol in outputs, cut them at that point. #if data_utils.EOS_ID in outputs: # outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Write out the sentence corresponding to outputs. ft.write(" ".join([ tf.compat.as_str(rev_en_vocab[int(encoder_inputs[output])]) for output in outputs ]) + "|" + str(id_counts) + '\n') id_counts += 1
def decode(): with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. source_vocab_path = os.path.join(FLAGS.data_dir, ("vocab%d." + FLAGS.source_ext) % FLAGS.source_vocab_size) target_vocab_path = os.path.join(FLAGS.data_dir, ("vocab%d." + FLAGS.target_ext) % FLAGS.target_vocab_size) source_vocab, _ = data_utils.initialize_vocabulary(source_vocab_path) _, rev_target_vocab = data_utils.initialize_vocabulary(target_vocab_path) # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), source_vocab) # Which bucket does it belong to? bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)]) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Print out target language sentence corresponding to outputs. out_sentence = " ".join([tf.compat.as_str(rev_target_vocab[output]) for output in outputs]) print(out_sentence) if FLAGS.translation_file != "": with gfile.GFile(FLAGS.translation_file, mode="ab") as fw: fw.write(FLAGS.source_ext + "> " + sentence) fw.write(FLAGS.target_ext + "> " + out_sentence + b"\n\n") fw.flush() print("> ", end="") sys.stdout.flush() sentence = sys.stdin.readline()
def decode_input(): with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. enc_vocab_path = os.path.join( gConfig['working_directory'], "vocab%d_enc.txt" % gConfig['enc_vocab_size']) dec_vocab_path = os.path.join( gConfig['working_directory'], "vocab%d_dec.txt" % gConfig['dec_vocab_size']) enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path) _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path) # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: token_ids = data_utils.sentence_to_token_ids(sentence, enc_vocab) bucket_id = min([ b for b in range(len(_buckets)) if _buckets[b][0] > len(token_ids) ] + [len(_buckets) - 1]) encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) outputs = [ int(np.argmax(logit, axis=1)) for logit in output_logits ] if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] print(" ".join([ tf.compat.as_str(rev_dec_vocab[output]) for output in outputs ])) print("> ", end="") sys.stdout.flush() sentence = sys.stdin.readline()
def decode(): gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2) config = tf.ConfigProto(gpu_options=gpu_options) with tf.Session(config=config) as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. enc_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d.enc" % gConfig['enc_vocab_size']) dec_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d.dec" % gConfig['dec_vocab_size']) enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path) _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path) # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), enc_vocab) # bucket_belong??? bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)]) # Get the required batch. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output_logits _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] final_output = " ".join([tf.compat.as_str(rev_dec_vocab[output]) for output in outputs]) if('_UNK' in final_output ): final_output = "I didn\'t learn how to respond to that." print(final_output) print("> ", end="") sys.stdout.flush() sentence = sys.stdin.readline()
def original(): # test_vec = [26, 12, 10, 11, 15, 17, 28, 171, 18, 339] # print "[command] ", decode_vec_to_str(test_vec, nl_dictionary) # find k nearest neighbor # knn = find_k_nearest_neighbor(test_vec, nl_vec_list, 1) # for p in knn: # print "[nn vec] ", p # print the decoding result of these filters # print "[nearest neighbor] ", decode_vec_to_str(p[0], nl_dictionary) sys.stdout = open('result.txt', 'w') # Load vocabularies. nl_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.nl" % FLAGS.nl_vocab_size) cm_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.cm" % FLAGS.cm_vocab_size) nl_vocab, rev_nl_vocab = data_utils.initialize_vocabulary(nl_vocab_path) cm_vocab, rev_cm_vocab = data_utils.initialize_vocabulary(cm_vocab_path) # the file containing traning nl vectors and cmd vectors train_set, dev_set, _ = load_data() model = knn.KNNModel() model.train(train_set) test_cmd_vec_list = [cmd_vec for _, _, _, cmd_vec in dev_set] test_nl_vec_list = [nl_vec for _, _, nl_vec, _ in dev_set] for i in range(len(test_nl_vec_list)): test_vec = test_nl_vec_list[i] cmd_vec = test_cmd_vec_list[i] nl, cmd, score = model.test(test_vec, 1) print "[text-case ", i, "] =========================================================" print " [original-pair]" print " ", knn.decode_vec_to_str(test_vec, rev_nl_vocab) print " ", knn.decode_vec_to_str(cmd_vec, rev_cm_vocab) print " [new-pair]" print " ", knn.decode_vec_to_str(nl, rev_nl_vocab) print " ", knn.decode_vec_to_str(cmd, rev_cm_vocab) print knn.decode_vec_to_str(cmd, rev_cm_vocab)
def get_mem_s2t(): slines = open("./data/train.ids30000.src") tlines = open("./data/train.ids30000.trg") mlines = open("./data/aligns") mem = {} for sline, tline, mline in zip(slines, tlines, mlines): zh_words = sline.strip().split(' ') en_words = tline.strip().split(' ') maps = mline.strip().split(' ') for m in maps: zhid, enid = m.split('-') zh_word = zh_words[int(zhid)] if int(zh_word) == 3: continue en_word = en_words[int(enid)] if int(en_word) == 3: continue if int(zh_word) not in mem: mem[int(zh_word)] = [] mem[int(zh_word)].append(int(en_word)) for m in mem: l = len(mem[m]) words = Counter(mem[m]) words = sorted(words.items(), key=lambda x: x[1], reverse=True) mem[m] = map(lambda x: (x[0], x[1] / float(l)), words) del slines del tlines del mlines en_vocab_path = "./data/vocab30000.src" fr_vocab_path = "./data/vocab30000.trg" en_vocab, rev_en_vocab = data_utils.initialize_vocabulary(en_vocab_path) fr_vocab, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path) for i, word in enumerate(rev_en_vocab): if i not in mem: if word in fr_vocab: mem[i] = [(fr_vocab[word], 1.0), (fr_vocab['_NULL'], 0.0)] else: mem[i] = [(fr_vocab['_NULL'], 0.0), (fr_vocab['_NULL'], 0.0)] f = open("./data/mems2t.pkl", 'wb') pkl.dump(mem, f)
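To make the expected input format of get_mem_s2t() concrete, here is a small worked sketch of its alignment-counting step; the id values are invented for the example, and the check against id 3 mirrors the function above, where 3 is assumed to be the UNK id.

from collections import Counter

# Hypothetical sample: token ids of one sentence pair plus its word alignment
# in "srcpos-trgpos" format, as read from train.ids30000.src/.trg and aligns.
sline = "15 27 15"
tline = "102 88 102"
mline = "0-0 1-1 2-2"

src_ids = sline.strip().split(' ')
trg_ids = tline.strip().split(' ')
mem = {}
for m in mline.strip().split(' '):
    src_pos, trg_pos = m.split('-')
    s, t = int(src_ids[int(src_pos)]), int(trg_ids[int(trg_pos)])
    if s == 3 or t == 3:  # skip UNK ids, as get_mem_s2t() does
        continue
    mem.setdefault(s, []).append(t)

# Turn raw co-occurrence counts into relative frequencies, most frequent first.
for s, targets in mem.items():
    total = float(len(targets))
    mem[s] = sorted(((t, c / total) for t, c in Counter(targets).items()),
                    key=lambda x: x[1], reverse=True)

print(mem)  # {15: [(102, 1.0)], 27: [(88, 1.0)]}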
def decode(): # Only allocate part of the gpu memory when predicting. gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2) config = tf.ConfigProto(gpu_options=gpu_options) with tf.Session(config=config) as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. enc_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d.enc" % gConfig['enc_vocab_size']) dec_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d.dec" % gConfig['dec_vocab_size']) enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path) _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path) # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), enc_vocab) # Which bucket does it belong to? bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)]) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Print out French sentence corresponding to outputs. print(" ".join([tf.compat.as_str(rev_dec_vocab[output]) for output in outputs])) print("> ", end="") sys.stdout.flush() sentence = sys.stdin.readline()
def decode_tester(sess, model): model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.from" % FLAGS.from_vocab_size) fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.to" % FLAGS.to_vocab_size) en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path) _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path) # Decode from standard input. # sys.stdout.write("> ") # sys.stdout.flush() sentence = "Who is the president of the United States?" # print(" input: " + sentence) # while sentence: # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab) # Which bucket does it belong to? bucket_id = len(_buckets) - 1 for i, bucket in enumerate(_buckets): if bucket[0] >= len(token_ids): bucket_id = i break else: logging.warning("Sentence truncated: %s", sentence) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Print out French sentence corresponding to outputs. print("\toutput: " + " ".join( [tf.compat.as_str(rev_fr_vocab[output]) for output in outputs])) sys.stdout.flush()
def decode(): """ DOCSTRING """ gpu_options = tensorflow.GPUOptions(per_process_gpu_memory_fraction=0.2) config = tensorflow.ConfigProto(gpu_options=gpu_options) with tensorflow.Session(config=config) as sess: model = create_model(sess, True) model.batch_size = 1 enc_vocab_path = os.path.join( gConfig['working_directory'], "vocab%d.enc" % gConfig['enc_vocab_size']) dec_vocab_path = os.path.join( gConfig['working_directory'], "vocab%d.dec" % gConfig['dec_vocab_size']) enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path) _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path) sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: token_ids = data_utils.sentence_to_token_ids( tensorflow.compat.as_bytes(sentence), enc_vocab) bucket_id = min([ b for b in range(len(_buckets)) if _buckets[b][0] > len(token_ids) ]) encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) outputs = [ int(numpy.argmax(logit, axis=1)) for logit in output_logits ] if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] print(" ".join([ tensorflow.compat.as_str(rev_dec_vocab[output]) for output in outputs ])) print("> ", end="") sys.stdout.flush() sentence = sys.stdin.readline()
def decode(input): with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.from" % FLAGS.from_vocab_size) fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.to" % FLAGS.to_vocab_size) en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path) _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path) # Decode from passed input variable sentence = input # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids( tf.compat.as_bytes(sentence), en_vocab) # Which bucket does it belong to? bucket_id = len(_buckets) - 1 for i, bucket in enumerate(_buckets): if bucket[0] >= len(token_ids): bucket_id = i break else: logging.warning("Sentence truncated: %s", sentence) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Print out French sentence corresponding to outputs. #-- print(" ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs])) return " ".join( [tf.compat.as_str(rev_fr_vocab[output]) for output in outputs])
def generate(middle_sentence, forwards_sentence, backwards_sentence): ''' Generates forwards and backwards sentences given a middle sentence. Args: middle_sentence: middle sentence (not tokenized) forwards_sentence: preceding sentence (not tokenized) backwards_sentence: following sentence (not tokenized) ''' train_path, vocab_path, train_ids_path = data_utils.prepare_skip_thought_data( FLAGS.data_dir, FLAGS.train_data_name, FLAGS.vocab_size) with tf.Session() as sess: m = SkipThoughtModel(FLAGS.vocab_size, max_sentence_len=FLAGS.max_sentence_len, batch_size=FLAGS.batch_size, learning_rate=FLAGS.learning_rate, learning_rate_decay_factor=FLAGS.learning_rate_decay_factor, encoder_cell_size=FLAGS.encoder_cell_size, word_embedding_size=FLAGS.word_embedding_size, decoder_cell_size=FLAGS.decoder_cell_size, max_gradient_norm=FLAGS.max_gradient_norm, initial_decoder_state=None) ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir) if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path): print("Reading model parameters from %s" % ckpt.model_checkpoint_path) m.saver.restore(sess, ckpt.model_checkpoint_path) else: print("No model found") return vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path) tokenized_middle_sentence = data_utils.sentence_to_token_ids( middle_sentence, vocab) tokenized_forwards_sentence = data_utils.sentence_to_token_ids( forwards_sentence, vocab) tokenized_backwards_sentence = data_utils.sentence_to_token_ids( " ".join(reversed(backwards_sentence.split())), vocab) forwards_batch_logits, backwards_batch_logits = m.step(sess, [m.forwards_batch_logits_tensor, m.backwards_batch_logits_tensor], *m.prep_data( [tokenized_middle_sentence], [tokenized_forwards_sentence], [tokenized_backwards_sentence])) forwards_logits = forwards_batch_logits[:, 0, :] backwards_logits = backwards_batch_logits[:, 0, :] print(forwards_logits) print(forwards_logits.shape) forwards_sentence = map( lambda x: rev_vocab[x], map(np.argmax, forwards_logits)) backwards_sentence = map( lambda x: rev_vocab[x], map(np.argmax, backwards_logits)) print("Generated Forwards Sentence") print(" ".join(forwards_sentence)) print("Generated Backwards Sentence") print(" ".join(backwards_sentence))
def decode(): with tf.Session(config=config) as sess: #print ("Hello!!") model = create_model(sess, True) model.batch_size = 1 in_vocab_path = os.path.join(FLAGS.data_dir, "vocab_in.txt") out_vocab_path = os.path.join(FLAGS.data_dir, "vocab_out.txt") in_vocab, _ = data_utils.initialize_vocabulary(in_vocab_path) _, rev_out_vocab = data_utils.initialize_vocabulary(out_vocab_path) print("Hello!!") sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: sentence = wakati(sentence) token_ids = data_utils.sentence_to_token_ids(sentence, in_vocab) bucket_id = min([ b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids) ]) encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) outputs = [ int(np.argmax(logit, axis=1)) for logit in output_logits ] if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] print("".join([rev_out_vocab[output] for output in outputs])) print("\n> ", end="") sys.stdout.flush() sentence = sys.stdin.readline()
def decode(): with tf.Session() as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. first_vocab_path = os.path.join( FLAGS.train_dir, "vocab%d.first" % FLAGS.first_vocab_size) last_vocab_path = os.path.join(FLAGS.train_dir, "vocab%d.last" % FLAGS.last_vocab_size) first_vocab, _ = data_utils.initialize_vocabulary(first_vocab_path) _, rev_last_vocab = data_utils.initialize_vocabulary(last_vocab_path) # Decode the sentence passed in via FLAGS.input. sys.stdout.write("> ") sys.stdout.flush() sentence = FLAGS.input # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids(sentence, first_vocab) # Which bucket does it belong to? bucket_id = min([ b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids) ]) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Print out the output sentence corresponding to outputs. result = (" ".join([rev_last_vocab[output] for output in outputs])) print(result) output = os.path.join(FLAGS.output_dir, str(int(time.time())) + ".txt") with open(output, "w") as text_file: text_file.write(result) print(output) sys.stdout.flush()
def decode(): gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2) config = tf.ConfigProto(gpu_options=gpu_options) with tf.Session(config=config) as sess: # Create model and load parameters. model = create_model(sess, True) model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. enc_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d.enc" % gConfig['enc_vocab_size']) dec_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d.dec" % gConfig['dec_vocab_size']) enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path) _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path) # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), enc_vocab) # bucket_belong??? bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)]) # Get the required batch. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output_logits _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] print(" ".join([tf.compat.as_str(rev_dec_vocab[output]) for output in outputs])) print("> ", end="") sys.stdout.flush() sentence = sys.stdin.readline()
def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    gr_vocab_path = os.path.join(FLAGS.model, "vocab.grapheme")
    ph_vocab_path = os.path.join(FLAGS.model, "vocab.phoneme")
    gr_vocab_size = data_utils.get_vocab_size(gr_vocab_path)
    ph_vocab_size = data_utils.get_vocab_size(ph_vocab_path)
    model = create_model(sess, True, gr_vocab_size, ph_vocab_size)
    model.batch_size = 1  # We decode one word at a time.

    # Load vocabularies.
    gr_vocab, _ = data_utils.initialize_vocabulary(gr_vocab_path)
    _, rev_ph_vocab = data_utils.initialize_vocabulary(ph_vocab_path)

    # Decode from input file.
    graphemes = open(FLAGS.decode).readlines()
    output_file_path = FLAGS.output
    if output_file_path:
      with gfile.GFile(output_file_path, mode="w") as output_file:
        for w in graphemes:
          word = " ".join(list(w))
          gr_absent = [gr for gr in w.replace('\n', '') if gr not in gr_vocab]
          if not gr_absent:
            res_phoneme_seq = decode_word(word, sess, model, gr_vocab, rev_ph_vocab)
            output_file.write(w.replace('\n', ' '))
            output_file.write(res_phoneme_seq)
            output_file.write('\n')
          else:
            raise ValueError("Symbols: %s not in trained model's vocabulary"
                             % ",".join(gr_absent))
    else:
      for w in graphemes:
        word = " ".join(list(w))
        gr_absent = [gr for gr in w.replace('\n', '') if gr not in gr_vocab]
        if not gr_absent:
          res_phoneme_seq = decode_word(word, sess, model, gr_vocab, rev_ph_vocab)
          print(w.replace('\n', ' ') + res_phoneme_seq)
          sys.stdout.flush()
        else:
          raise ValueError("Symbols: %s not in trained model's vocabulary"
                           % ",".join(gr_absent))
def __init__(self, save_dir, source='10.0.2.32', is_local=False): super().__init__(source, is_local) self.client = MongoClient(source) self.corpus = 'cornell-corpus' self.col = 'dialogs' self.open() self.buckets = _buckets self.vocabfileA = os.path.join(save_dir, self.corpus + '_vocabfileA') self.vocabfileB = os.path.join(save_dir, self.corpus + '_vocabfileB') if not os.path.isfile(self.vocabfileA): self.create_vocab("A", self.vocabfileA) if not os.path.isfile(self.vocabfileB): self.create_vocab("B", self.vocabfileB) print("initializing vocab") self.vocabA, self.vocabA_rev = data_utils.initialize_vocabulary( self.vocabfileA) self.vocabB, self.vocabB_rev = data_utils.initialize_vocabulary( self.vocabfileB) print("vocab initialized")
def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    in_vocab_path = os.path.join(FLAGS.data_dir, "vocab_in.txt")
    out_vocab_path = os.path.join(FLAGS.data_dir, "vocab_out.txt")
    in_vocab, _ = data_utils.initialize_vocabulary(in_vocab_path)
    _, rev_out_vocab = data_utils.initialize_vocabulary(out_vocab_path)

    # Decode from standard input.
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
      token_ids = data_utils.sentence_to_token_ids(sentence, in_vocab)
      # Which bucket does it belong to?
      bucket_id = min([b for b in xrange(len(_buckets))
                       if _buckets[b][0] > len(token_ids)])
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
      # Greedy decoder: outputs are argmaxes of output_logits.
      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      print(" ".join([rev_out_vocab[output] for output in outputs]))
      print("> ", end="")
      sys.stdout.flush()
      sentence = sys.stdin.readline()
def inter_decode(sent, position, mapp): with tf.Session() as sess: # Load dictionary srce_vocab_path = os.path.join(data_dir, "train", "vocab%d.srce" % 2) trgt_vocab_path = os.path.join(data_dir, "train", "vocab%d.trgt" % 0) srce_vocab, re_srce_vocab = data_utils.initialize_vocabulary(srce_vocab_path) trgt_vocab, re_trgt_vocab = data_utils.initialize_vocabulary(trgt_vocab_path) # Create model model = create_model(sess, len(re_srce_vocab), len(re_trgt_vocab), True) # model.batch_size = 1 # We decode one sentence at a time. sentence = sent init_pos = eval(position) mapp = eval(mapp) # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids(sentence, srce_vocab) # Which bucket does it belong to? bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)]) # Get a 1-element batch to feed the sentence to the model. encoder_input, decoder_input, target_weight, pos, maps = model.get_batch( {bucket_id: [(token_ids, [], init_pos, mapp)]}, bucket_id) # Get output logits for the sentence. _, _, output_logits, attentions, env, out_pos = model.step(sess, encoder_input, decoder_input, target_weight, bucket_id, True, decoder_inputs_positions=pos, decoder_inputs_maps=maps) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] final_pos = out_pos[0].tolist() for l in xrange(len(outputs)-1): final_pos.extend(out_pos[l+1].tolist()) return final_pos
def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # At prediction time we feed only one sentence.

    # Load vocabularies.
    en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.en" % FLAGS.en_vocab_size)
    fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.fr" % FLAGS.fr_vocab_size)
    en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
    _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

    # Translate English sentences typed on the console.
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    # Decode each input sentence.
    while sentence:
      # First convert the input words to token ids.
      token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab)
      # Pick the bucket the sentence belongs to, based on its length.
      bucket_id = min([b for b in xrange(len(_buckets))
                       if _buckets[b][0] > len(token_ids)])
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get the output logits for the sentence.
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
      # Take the argmax (most probable word) at every output position.
      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
      # If EOS_ID appears in the outputs, keep only the words before it.
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      # Print the result.
      print(" ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs]))
      print("> ", end="")
      sys.stdout.flush()
      sentence = sys.stdin.readline()
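The greedy decoding steps above (pick a bucket, build a one-element batch, run a forward-only step, take per-step argmaxes, truncate at EOS) recur in most of the decode() variants in this collection. Below is a minimal sketch of how that block could be factored into a reusable helper; it assumes the same model API (model.get_batch, model.step) and the data_utils module used by the snippets above, and the helper name greedy_decode is illustrative, not part of the original code.

# Illustrative sketch only: assumes the seq2seq model API and data_utils module
# used in the snippets above.
import numpy as np
import data_utils


def greedy_decode(sess, model, token_ids, buckets):
  """Decode one tokenized sentence greedily; return the list of output ids."""
  # Pick the smallest bucket that fits the input (caller must ensure one exists).
  bucket_id = min(b for b in range(len(buckets)) if buckets[b][0] > len(token_ids))
  # Build a 1-element batch for that bucket.
  encoder_inputs, decoder_inputs, target_weights = model.get_batch(
      {bucket_id: [(token_ids, [])]}, bucket_id)
  # Forward pass only (forward_only=True).
  _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                   target_weights, bucket_id, True)
  # Greedy decoder: take the argmax at every output position.
  outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
  # Truncate at the first EOS symbol, if any.
  if data_utils.EOS_ID in outputs:
    outputs = outputs[:outputs.index(data_utils.EOS_ID)]
  return outputs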
def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    if FLAGS.beam_size > 0:
      use_beamsearch = True
    else:
      use_beamsearch = False
    model = create_model(sess, True, use_beamsearch=use_beamsearch)
    model.batch_size = 1  # We decode one sentence at a time.

    if FLAGS.use_ori:
      tokenizer = useori_tokenizer
      vocab_data_dir = os.path.join(FLAGS.data_dir, 'ori')
    else:
      tokenizer = cut_tokenizer
      vocab_data_dir = os.path.join(FLAGS.data_dir, 'cut')

    # Load vocabularies.
    en_vocab_path = os.path.join(vocab_data_dir, "vocab%d.q" % FLAGS.en_vocab_size)
    fr_vocab_path = os.path.join(vocab_data_dir, "vocab%d.a" % FLAGS.fr_vocab_size)
    en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
    _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

    # Decode from standard input.
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
      try:
        # Get token-ids for the input sentence.
        token_ids = data_utils.sentence_to_token_ids(
            tf.compat.as_bytes(sentence), en_vocab, tokenizer=tokenizer)
        if len(token_ids) >= _buckets[-1][0]:
          token_ids = token_ids[0:(_buckets[-1][0] - 1)]
        print(token_ids)
        # Which bucket does it belong to?
        bucket_id = min([b for b in xrange(len(_buckets))
                         if _buckets[b][0] > len(token_ids)])
        # TODO: the last bucket can produce longer answers, but with some repeated parts.
        # bucket_id = len(_buckets) - 1

        if FLAGS.beam_size > 0:
          def cal_function(decoder_token_ids, idx):
            print('decoder_token_ids:', decoder_token_ids)
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, decoder_token_ids)]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            fake_logits = output_logits[idx - 1].reshape([-1])
            return log_sigmoid(inputs=fake_logits)

          beam_search = BeamSearch(beam_size=FLAGS.beam_size)
          beam_search.run(max_step=model.buckets[bucket_id][1], cal_function=cal_function)
          final_token_paths = beam_search.get_final_token_paths()
          for outputs in final_token_paths:
            print(outputs)
            if data_utils.EOS_ID in outputs:
              outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Print out the answer sentence corresponding to outputs.
            print(" ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs]))
          print('done')
        else:
          model.use_beamsearch = False
          # Get a 1-element batch to feed the sentence to the model.
          encoder_inputs, decoder_inputs, target_weights = model.get_batch(
              {bucket_id: [(token_ids, [])]}, bucket_id)
          # Get output logits for the sentence.
          _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                           target_weights, bucket_id, True)
          # This is a greedy decoder - outputs are just argmaxes of output_logits.
          outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
          print(outputs)
          # If there is an EOS symbol in outputs, cut them at that point.
          if data_utils.EOS_ID in outputs:
            outputs = outputs[:outputs.index(data_utils.EOS_ID)]
          # Print out the answer sentence corresponding to outputs.
          print(" ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs]))
          print('done')
      except ValueError as e:
        print(e)
        print("Bad input! Try again:")
      finally:
        # Wait for a new sentence to translate.
        print("> ", end="")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
import tensorflow as tf
import numpy as np
from model import create_model, buildSentence, respond
from config.config import FLAGS, _buckets, name
import data_utils
import os.path

sess = tf.Session()
# Create model and load parameters.
model = create_model(sess, True)
model.batch_size = 1  # We decode one sentence at a time.

# Load vocabularies.
vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.in" % FLAGS.vocab_size)
vocab, vocab_rev = data_utils.initialize_vocabulary(vocab_path)

print('%s: %s' % (name, respond('hi.', sess, model, vocab, vocab_rev)))
print('%s: %s' % (name, respond('hello.', sess, model, vocab, vocab_rev)))
print('%s: %s' % (name, respond('hey.', sess, model, vocab, vocab_rev)))
print('%s: %s' % (name, respond('how are you?', sess, model, vocab, vocab_rev)))
print('%s: %s' % (name, respond('what is the meaning of life?', sess, model, vocab, vocab_rev)))
print('%s: %s' % (name, respond('you are a machine.', sess, model, vocab, vocab_rev)))
print('%s: %s' % (name, respond('you\'re a machine.', sess, model, vocab, vocab_rev)))
def translate_file(source_path=dev_code_file, target_path=translated_dev_code):
  gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
  with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    code_vocab_path = os.path.join(data_dir, "vocab%d.code" % FLAGS.code_vocab_size)
    en_vocab_path = os.path.join(data_dir, "vocab%d.en" % FLAGS.en_vocab_size)
    code_vocab, _ = data_utils.initialize_vocabulary(code_vocab_path)
    _, rev_en_vocab = data_utils.initialize_vocabulary(en_vocab_path)

    with tf.gfile.GFile(source_path, mode="r") as source_file:
      with tf.gfile.GFile(target_path, mode="w") as translated_file:
        sentence = source_file.readline()
        counter = 0
        print(" Translating file %s " % dev_code_file)
        while sentence:
          # Get token-ids for the input sentence.
          token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), code_vocab)
          # Which bucket does it belong to? Emit _UNK for inputs longer than the largest bucket.
          buckets = [b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)]
          if buckets:
            bucket_id = min(buckets)
          else:
            translated_file.write("_UNK \n")
            sentence = source_file.readline()
            continue
          # Get a 1-element batch to feed the sentence to the model.
          encoder_inputs, decoder_inputs, target_weights = model.get_batch(
              {bucket_id: [(token_ids, [])]}, bucket_id)
          # Get output logits for the sentence.
          _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                           target_weights, bucket_id, True)
          # This is a greedy decoder - outputs are just argmaxes of output_logits.
          outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
          # If there is an EOS symbol in outputs, cut them at that point.
          if data_utils.EOS_ID in outputs:
            outputs = outputs[:outputs.index(data_utils.EOS_ID)]
          # Write the translated sentence to the translation file.
          translated_file.write(" ".join(
              [tf.compat.as_str(rev_en_vocab[output]) for output in outputs]) + "\n")
          # Get the next sentence and print progress checkpoints.
          counter += 1
          sentence = source_file.readline()
          if counter % 500 == 0:
            print(" Line %d translated" % counter)
    print(" File translated")
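translate_file above guards against inputs longer than the largest bucket by first collecting the buckets that fit and only then taking the minimum. A small helper capturing that pattern might look like the sketch below; the name select_bucket is illustrative, not part of the original code.

# Illustrative sketch: select_bucket is a hypothetical helper name.
def select_bucket(token_ids, buckets):
  """Return the smallest bucket index that fits token_ids, or None if none fits."""
  fitting = [b for b in range(len(buckets)) if buckets[b][0] > len(token_ids)]
  return min(fitting) if fitting else None


# Example usage mirroring translate_file: skip over-long inputs.
# bucket_id = select_bucket(token_ids, _buckets)
# if bucket_id is None:
#   translated_file.write("_UNK \n")
#   continue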
def decode_from_file(files, model_path=None, use_best=False, get_ids=True, FLAGS=None, buckets=None):
  assert FLAGS is not None
  assert buckets is not None

  with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
    # Load model parameters.
    model = create_seq2seq_model(sess, model_path=model_path, forward_only=True,
                                 use_best=use_best, FLAGS=FLAGS, buckets=buckets,
                                 translate=True)

    # Load vocabularies.
    source_vocab_file = FLAGS.data_dir + \
        (FLAGS.train_data % str(FLAGS.src_vocab_size)) + \
        ('.vocab.%s' % FLAGS.source_lang)
    target_vocab_file = FLAGS.data_dir + \
        (FLAGS.train_data % str(FLAGS.tgt_vocab_size)) + \
        ('.vocab.%s' % FLAGS.target_lang)
    src_vocab, _ = data_utils.initialize_vocabulary(source_vocab_file)
    _, rev_tgt_vocab = data_utils.initialize_vocabulary(target_vocab_file)

    start_total_time = time.time()
    total_sentence_count = 0

    for file_path in files:
      print("Translating file %s\n" % file_path)
      sentence_count = 0
      # Decode from file.
      with gfile.GFile(file_path, mode='r') as source:
        with gfile.GFile(file_path + '.trans', mode='w') as destiny:
          sentence = source.readline()
          start_time = time.time()
          while sentence:
            sentence_count += 1
            print("Translating sentence %d" % sentence_count)
            if get_ids:
              # Get token-ids for the input sentence.
              token_ids = data_utils.sentence_to_token_ids(sentence, src_vocab)
            else:
              # If the sentence is already converted, just split the ids.
              token_ids = [int(ss) for ss in sentence.strip().split()]
            # Get output hypotheses for the sentence.
            output_hypotheses, output_scores = model.translation_step(
                sess, token_ids, FLAGS.beam_size, normalize=True, dump_remaining=True)
            outputs = output_hypotheses[0]
            # Write out the target sentence corresponding to outputs.
            destiny.write(" ".join([rev_tgt_vocab[output] for output in outputs]))
            destiny.write("\n")
            sentence = source.readline()
          end_time = time.time() - start_time
          print("\nDone file %s" % file_path)
          print("Avg. %.3f sentences/sec" % (sentence_count / end_time))
      total_sentence_count += sentence_count

    end_total_time = time.time() - start_total_time
    print("\nDone!")
    print("Avg. %.3f sentences/sec" % (total_sentence_count / end_total_time))
def test_BLEU(): # Perform BLEU score testing here with tf.Session() as sess: model = create_model(sess, True, False) source = sys.argv[1] target = sys.argv[2] model.batch_size = 1 # We decode one sentence at a time. # Load vocabularies. s_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.%s" % (FLAGS.s_vocab_size, source)) t_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.%s" % (FLAGS.t_vocab_size, target)) s_vocab, _ = data_utils.initialize_vocabulary(s_vocab_path) _, rev_t_vocab = data_utils.initialize_vocabulary(t_vocab_path) # Decode from standard input. BLEUscore = {0:[], 1:[], 2:[], 3:[]} s_test_path = os.path.join(FLAGS.data_dir, "test.%s" % source) t_test_path = os.path.join(FLAGS.data_dir, "test.%s" % target) f_s = open(s_test_path, 'r') f_t = open(t_test_path, 'r') # print(f_s.readline()) step = 0 for sentence in f_s: print(step) # sentence = f_ja.readline() # Get token-ids for the input sentence. token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), s_vocab) # Which bucket does it belong to? bucket_id = len(_buckets) - 1 for i, bucket in enumerate(_buckets): if bucket[0] >= len(token_ids): bucket_id = i break else: logging.warning("Sentence truncated: %s", sentence) # Get a 1-element batch to feed the sentence to the model. encoder_inputs, decoder_inputs, target_weights = model.get_batch( {bucket_id: [(token_ids, [])]}, bucket_id) # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Print out Japanese sentence corresponding to outputs. candidate = [tf.compat.as_str(rev_t_vocab[output]) for output in outputs] reference = f_t.readline().split(' ') print(candidate, reference) try: temp_score = nltk.translate.bleu_score.sentence_bleu([reference], candidate) except: temp_score = nltk.translate.bleu_score.sentence_bleu([reference], candidate, weights=(.5, .5)) BLEUscore[bucket_id].append(temp_score) step += 1 print(temp_score) for key,val in BLEUscore.iteritems(): print(key, ": ", np.mean(val))
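test_BLEU above falls back to bigram-only weights when sentence_bleu raises, which typically happens when a short hypothesis has no higher-order n-gram overlap with the reference. NLTK's built-in smoothing handles the same situation without an exception handler; the sketch below shows that alternative and is a suggestion, not what the original code does.

# Sketch: smoothed sentence-level BLEU with NLTK, as an alternative to the
# try/except weight fallback used in test_BLEU above.
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

smoothie = SmoothingFunction().method1

reference = "the cat is on the mat".split()
candidate = "the cat sat on the mat".split()

# method1 adds a small constant to zero n-gram counts, so short or partially
# matching hypotheses still receive a nonzero, comparable score.
score = sentence_bleu([reference], candidate, smoothing_function=smoothie)
print("smoothed BLEU: %.4f" % score)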
def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    # The second argument means this is not a training model.
    model = create_model(sess, True)
    # We decode one sentence at a time.
    model.batch_size = 1

    # Load vocabularies.
    vocab_path = os.path.join(FLAGS.data_dir, "Word_map.txt")
    vocab, Q_vocab = data_utils.initialize_vocabulary(vocab_path)

    while 1:
      # Get token-ids for the input sentence.
      sys.stdout.write("Input >> ")
      sys.stdout.flush()
      sentence = sys.stdin.readline()
      token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), vocab)
      # Which bucket does it belong to?
      bucket_id = min([b for b in xrange(len(_buckets))
                       if _buckets[b][0] > len(token_ids)])
      # Get a 1-element batch to feed the sentence to the model.
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get output logits for the sentence.
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
      # Greedy decoder: take the argmax of the logits at every step.
      max_outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
      # If there is an EOS symbol in outputs, cut them at that point.
      if data_utils.EOS_ID in max_outputs:
        max_outputs = max_outputs[:max_outputs.index(data_utils.EOS_ID)]
      print("output >>")
      print(" ".join([tf.compat.as_str(Q_vocab[output]) for output in max_outputs]))
      print("=====================")
def decode(): with tf.Session() as sess: # load dictionary srce_vocab_path = os.path.join(FLAGS.data_dir, "train", "vocab%d.srce" % FLAGS.srce_vocab_min) trgt_vocab_path = os.path.join(FLAGS.data_dir, "train", "vocab%d.trgt" % FLAGS.trgt_vocab_min) _, re_srce_vocab = data_utils.initialize_vocabulary(srce_vocab_path) _, re_trgt_vocab = data_utils.initialize_vocabulary(trgt_vocab_path) # Load test data. if FLAGS.decode_test: srce_test_ids_path = os.path.join(FLAGS.data_dir, "test", "ids%d.srce" % FLAGS.srce_vocab_min) trgt_test_ids_path = os.path.join(FLAGS.data_dir, "test", "ids.trgt") srce_test_data_path = os.path.join(FLAGS.data_dir, "test/data.srce") trgt_test_data_path = os.path.join(FLAGS.data_dir, "test/data.trgt") # Prepare test data data_utils.data_to_token_ids(srce_test_data_path, srce_test_ids_path, srce_vocab_path) data_utils.data_to_token_ids(trgt_test_data_path, trgt_test_ids_path, trgt_vocab_path) trgt_test_pos = os.path.join(FLAGS.data_dir, "test", "positions.trgt") trgt_test_map = os.path.join(FLAGS.data_dir, "test", "map.srce") test_set = read_data(srce_test_ids_path, trgt_test_ids_path, trgt_test_pos, trgt_test_map) elif FLAGS.decode_dev: srce_dev_ids_path = os.path.join(FLAGS.data_dir, "dev", "ids%d.srce" % FLAGS.srce_vocab_min) trgt_dev_ids_path = os.path.join(FLAGS.data_dir, "dev", "ids%d.trgt" % FLAGS.trgt_vocab_min) trgt_dev_pos = os.path.join(FLAGS.data_dir, "dev", "positions.trgt") trgt_dev_map = os.path.join(FLAGS.data_dir, "dev", "map.srce") test_set = read_data(srce_dev_ids_path, trgt_dev_ids_path, trgt_dev_pos, trgt_dev_map) else: raise ValueError(" Please set decode_test or decode_dev to True! ") # Create model and load parameters. model = create_model(sess, len(re_srce_vocab), len(re_trgt_vocab), True) model.batch_size = 1 # We decode one sentence at a time. # Decode test data. ---> read from files decode_result_path = os.path.join(FLAGS.data_dir, ("result/result_size%d_dropout%.2f" % (FLAGS.size, FLAGS.keep_prob))) decode_data_path = os.path.join(FLAGS.data_dir, ("result/gold_size%d_dropout%.2f" % (FLAGS.size, FLAGS.keep_prob))) test_bucket_sizes = [len(test_set[b]) for b in xrange(len(_buckets))] print ("test bucket size: ", test_bucket_sizes) count = 0 correct = 0 with open(decode_result_path, 'w') as fpred: with open(decode_data_path, 'w') as fgold: # note that the test data has been sorted by bucket size for b in xrange(len(_buckets)): print ("bucket%d:" % b) if len(test_set[b]) == 0: # empty bucket continue for sent in test_set[b]: encoder_input, decoder_input, target_weight, pos, maps = model.get_batch({b: [sent]}, b) # get output_logits _, _, output_logits, _, _, _= model.step(sess, encoder_input, decoder_input, target_weight, b, True, decoder_inputs_positions=pos, decoder_inputs_maps=maps) # greedy decoder: outputs are argmax of output_logits outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] # If there is an EOS symbol in outputs, cut them at that point. 
if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # write to file fpred.write(data_utils.token_ids_to_sentence(outputs, re_trgt_vocab) + '\n') gold = sent[1] if data_utils.EOS_ID in sent[1]: gold = sent[1][:sent[1].index(data_utils.EOS_ID)] fgold.write(data_utils.token_ids_to_sentence(gold, re_trgt_vocab) + '\n') if gold == outputs: correct += 1 # else: # print ("source: ", data_utils.token_ids_to_sentence(sent[0], re_srce_vocab), '\t', pos, '\t', maps) # print ("target: ", data_utils.token_ids_to_sentence(gold, re_trgt_vocab)) # print ("predict: ", data_utils.token_ids_to_sentence(outputs, re_trgt_vocab) + '\n') count += 1 print("count = %d, correct = %d, accuracy = %f" % (count, correct, float(correct)/count))
def evaluate(): with tf.Session() as sess: # Create model and load parameters. gr_vocab_path = os.path.join(FLAGS.model, "vocab.grapheme") ph_vocab_path = os.path.join(FLAGS.model, "vocab.phoneme") gr_vocab_size = data_utils.get_vocab_size(gr_vocab_path) ph_vocab_size = data_utils.get_vocab_size(ph_vocab_path) model = create_model(sess, True, gr_vocab_size, ph_vocab_size) model.batch_size = 1 # We decode one word at a time. # Load vocabularies. gr_vocab_path = os.path.join(FLAGS.model, "vocab.grapheme") ph_vocab_path = os.path.join(FLAGS.model, "vocab.phoneme") gr_vocab, _ = data_utils.initialize_vocabulary(gr_vocab_path) _, rev_ph_vocab = data_utils.initialize_vocabulary(ph_vocab_path) # Decode from input file. test = open(FLAGS.evaluate).read().split('\n') test_graphemes = [] test_phonemes = [] for line in test: lst = line.split() if len(lst)>=2: test_graphemes.append(lst[0]) test_phonemes.append(" ".join(lst[1:])) duplicates = {} total_dupl_num = 0 for i, gr in enumerate(test_graphemes): if test_graphemes.count(gr) > 1: total_dupl_num += test_graphemes.count(gr) - 1 if gr in duplicates: duplicates[gr].append(test_phonemes[i]) else: duplicates[gr] = [test_phonemes[i]] errors = 0 counter = 0 dupl_error_calculated = [] for i, w in enumerate(test_graphemes): if w not in duplicates: counter += 1 word = " ".join(list(w)) gr_absent = [gr for gr in w.replace('\n','') if gr not in gr_vocab] if not gr_absent: model_assumption = decode_word(word, sess, model, gr_vocab, rev_ph_vocab) if model_assumption != test_phonemes[i]: errors += 1 else: raise ValueError("Symbols: %s not in trained model's vocabulary" % ",".join(gr_absent) ) elif w not in dupl_error_calculated: counter += 1 dupl_error_calculated.append(w) word = " ".join(list(w)) gr_absent = [gr for gr in w.replace('\n','') if gr not in gr_vocab] if not gr_absent: model_assumption = decode_word(word, sess, model, gr_vocab, rev_ph_vocab) if model_assumption not in duplicates[w]: errors += 1 else: raise ValueError("Symbols: %s not in trained model's vocabulary" % ",".join(gr_absent) ) print("WER : ", errors/counter ) print("Accuracy : ", (1-errors/counter) )
def train(): print ('Applying Parameters:') for k,v in FLAGS.__dict__['__flags'].iteritems(): print ('%s: %s' % (k, str(v))) print("Preparing data in %s" % FLAGS.data_dir) vocab_path = '' tag_vocab_path = '' label_vocab_path = '' in_seq_train, out_seq_train, label_train, in_seq_dev, out_seq_dev, label_dev, in_seq_test, out_seq_test, label_test, vocab_path, tag_vocab_path, label_vocab_path = data_utils.prepare_multi_task_data( FLAGS.data_dir, FLAGS.in_vocab_size, FLAGS.out_vocab_size) result_dir = FLAGS.train_dir + '/test_results' if not os.path.isdir(result_dir): os.makedirs(result_dir) current_taging_valid_out_file = result_dir + '/tagging.valid.hyp.txt' current_taging_test_out_file = result_dir + '/tagging.test.hyp.txt' vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path) tag_vocab, rev_tag_vocab = data_utils.initialize_vocabulary(tag_vocab_path) label_vocab, rev_label_vocab = data_utils.initialize_vocabulary(label_vocab_path) with tf.Session() as sess: # Create model. print("Max sequence length: %d." % _buckets[0][0]) print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size)) model, model_test = create_model(sess, len(vocab), len(tag_vocab), len(label_vocab)) print ("Creating model with source_vocab_size=%d, target_vocab_size=%d, and label_vocab_size=%d." % (len(vocab), len(tag_vocab), len(label_vocab))) # Read data into buckets and compute their sizes. print ("Reading train/valid/test data (training set limit: %d)." % FLAGS.max_train_data_size) dev_set = read_data(in_seq_dev, out_seq_dev, label_dev) test_set = read_data(in_seq_test, out_seq_test, label_test) train_set = read_data(in_seq_train, out_seq_train, label_train) train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))] train_total_size = float(sum(train_bucket_sizes)) train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size for i in xrange(len(train_bucket_sizes))] # This is the training loop. step_time, loss = 0.0, 0.0 current_step = 0 best_valid_score = 0 best_test_score = 0 while model.global_step.eval() < FLAGS.max_training_steps: random_number_01 = np.random.random_sample() bucket_id = min([i for i in xrange(len(train_buckets_scale)) if train_buckets_scale[i] > random_number_01]) # Get a batch and make a step. start_time = time.time() encoder_inputs, tags, tag_weights, batch_sequence_length, labels = model.get_batch(train_set, bucket_id) if task['joint'] == 1: _, step_loss, tagging_logits, classification_logits = model.joint_step(sess, encoder_inputs, tags, tag_weights, labels, batch_sequence_length, bucket_id, False) elif task['tagging'] == 1: _, step_loss, tagging_logits = model.tagging_step(sess, encoder_inputs, tags, tag_weights, batch_sequence_length, bucket_id, False) elif task['intent'] == 1: _, step_loss, classification_logits = model.classification_step(sess, encoder_inputs, labels, batch_sequence_length, bucket_id, False) step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint loss += step_loss / FLAGS.steps_per_checkpoint current_step += 1 # Once in a while, we save checkpoint, print statistics, and run evals. if current_step % FLAGS.steps_per_checkpoint == 0: perplexity = math.exp(loss) if loss < 300 else float('inf') print ("global step %d step-time %.2f. Training perplexity %.2f" % (model.global_step.eval(), step_time, perplexity)) sys.stdout.flush() # Save checkpoint and zero timer and loss. 
checkpoint_path = os.path.join(FLAGS.train_dir, "model.ckpt") model.saver.save(sess, checkpoint_path, global_step=model.global_step) step_time, loss = 0.0, 0.0 def run_valid_test(data_set, mode): # mode: Eval, Test # Run evals on development/test set and print the accuracy. word_list = list() ref_tag_list = list() hyp_tag_list = list() ref_label_list = list() hyp_label_list = list() correct_count = 0 accuracy = 0.0 tagging_eval_result = dict() for bucket_id in xrange(len(_buckets)): eval_loss = 0.0 count = 0 for i in xrange(len(data_set[bucket_id])): count += 1 encoder_inputs, tags, tag_weights, sequence_length, labels = model_test.get_one( data_set, bucket_id, i) tagging_logits = [] classification_logits = [] if task['joint'] == 1: _, step_loss, tagging_logits, classification_logits = model_test.joint_step(sess, encoder_inputs, tags, tag_weights, labels, sequence_length, bucket_id, True) elif task['tagging'] == 1: _, step_loss, tagging_logits = model_test.tagging_step(sess, encoder_inputs, tags, tag_weights, sequence_length, bucket_id, True) elif task['intent'] == 1: _, step_loss, classification_logits = model_test.classification_step(sess, encoder_inputs, labels, sequence_length, bucket_id, True) eval_loss += step_loss / len(data_set[bucket_id]) hyp_label = None if task['intent'] == 1: ref_label_list.append(rev_label_vocab[labels[0][0]]) hyp_label = np.argmax(classification_logits[0],0) hyp_label_list.append(rev_label_vocab[hyp_label]) if labels[0] == hyp_label: correct_count += 1 if task['tagging'] == 1: word_list.append([rev_vocab[x[0]] for x in encoder_inputs[:sequence_length[0]]]) ref_tag_list.append([rev_tag_vocab[x[0]] for x in tags[:sequence_length[0]]]) hyp_tag_list.append([rev_tag_vocab[np.argmax(x)] for x in tagging_logits[:sequence_length[0]]]) accuracy = float(correct_count)*100/count if task['intent'] == 1: print(" %s accuracy: %.2f %d/%d" % (mode, accuracy, correct_count, count)) sys.stdout.flush() if task['tagging'] == 1: if mode == 'Eval': taging_out_file = current_taging_valid_out_file elif mode == 'Test': taging_out_file = current_taging_test_out_file tagging_eval_result = conlleval(hyp_tag_list, ref_tag_list, word_list, taging_out_file) print(" %s f1-score: %.2f" % (mode, tagging_eval_result['f1'])) sys.stdout.flush() return accuracy, tagging_eval_result # valid valid_accuracy, valid_tagging_result = run_valid_test(dev_set, 'Eval') if task['tagging'] == 1 and valid_tagging_result['f1'] > best_valid_score: best_valid_score = valid_tagging_result['f1'] # save the best output file subprocess.call(['mv', current_taging_valid_out_file, current_taging_valid_out_file + '.best_f1_%.2f' % best_valid_score]) # test, run test after each validation for development purpose. test_accuracy, test_tagging_result = run_valid_test(test_set, 'Test') if task['tagging'] == 1 and test_tagging_result['f1'] > best_test_score: best_test_score = test_tagging_result['f1'] # save the best output file subprocess.call(['mv', current_taging_test_out_file, current_taging_test_out_file + '.best_f1_%.2f' % best_test_score])
def train():
  """Train a en->fr translation model using WMT data."""
  # Prepare WMT data.
  print("Preparing WMT data in %s" % FLAGS.data_dir)
  en_train, fr_train, en_dev, fr_dev, _, _ = data_utils.prepare_wmt_data(
      FLAGS.data_dir, FLAGS.en_vocab_size, FLAGS.fr_vocab_size)

  fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.fr" % FLAGS.fr_vocab_size)
  #en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
  _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

  gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=1)
  with tf.Session(config=tf.ConfigProto(device_count={'GPU': 1},
                                        gpu_options=gpu_options)) as sess:
    # Create model.
    print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
    model = create_model(sess, False)

    # Read data into buckets and compute their sizes.
    print("Reading development and training data (limit: %d)."
          % FLAGS.max_train_data_size)
    dev_set = read_data(en_dev, fr_dev)
    train_set = read_data(en_train, fr_train, FLAGS.max_train_data_size)
    #embed()
    train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
    train_total_size = float(sum(train_bucket_sizes))

    # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
    # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
    # the size of the i-th training bucket, as used later.
    train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                           for i in xrange(len(train_bucket_sizes))]

    # This is the training loop.
    step_time, loss = 0.0, 0.0
    current_step = 0
    previous_losses = []
    while True:
      # Choose a bucket according to data distribution. We pick a random number
      # in [0, 1] and use the corresponding interval in train_buckets_scale.
      random_number_01 = np.random.random_sample()
      bucket_id = min([i for i in xrange(len(train_buckets_scale))
                       if train_buckets_scale[i] > random_number_01])

      # Get a batch and make a step.
      start_time = time.time()
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          train_set, bucket_id)
      _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                   target_weights, bucket_id, False)
      step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
      loss += step_loss / FLAGS.steps_per_checkpoint
      current_step += 1

      # Once in a while, we save checkpoint, print statistics, and run evals.
      if current_step % FLAGS.steps_per_checkpoint == 0:
        # Print statistics for the previous epoch.
        perplexity = math.exp(loss) if loss < 300 else float('inf')
        print("global step %d learning rate %.4f step-time %.2f perplexity "
              "%.2f" % (model.global_step.eval(), model.learning_rate.eval(),
                        step_time, perplexity))
        print("step loss: %.4f" % step_loss)
        # Decrease learning rate if no improvement was seen over last 3 times.
        if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
          sess.run(model.learning_rate_decay_op)
        previous_losses.append(loss)
        # Save checkpoint and zero timer and loss.
        checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt" + str(loss))
        model.saver = tf.train.Saver(tf.all_variables(), max_to_keep=0)
        model.saver.save(sess, checkpoint_path, global_step=model.global_step)
        step_time, loss = 0.0, 0.0
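Both training loops here sample a bucket in proportion to its size by building a cumulative scale and taking the first entry that exceeds a uniform random draw. A standalone sketch of that sampling step, under the assumption that bucket sizes are plain integer counts, is shown below (sample_bucket is an illustrative name, not from the original code).

# Sketch: sample a bucket index with probability proportional to its size,
# mirroring the train_buckets_scale logic in the training loops above.
import numpy as np


def sample_bucket(train_bucket_sizes):
  """Return a bucket index drawn with probability proportional to its size."""
  total = float(sum(train_bucket_sizes))
  # Cumulative fractions: scale[i] = sum(sizes[:i+1]) / total.
  scale = [sum(train_bucket_sizes[:i + 1]) / total
           for i in range(len(train_bucket_sizes))]
  r = np.random.random_sample()
  return min(i for i in range(len(scale)) if scale[i] > r)


# Example: buckets with 10, 30 and 60 examples are picked roughly 10%, 30%
# and 60% of the time over many draws.
# counts = [0, 0, 0]
# for _ in range(10000):
#   counts[sample_bucket([10, 30, 60])] += 1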
def orig_train(): """Train a en->fr translation model using WMT data.""" # Prepare WMT data. print("Preparing WMT data in %s" % FLAGS.data_dir) en_train, fr_train, en_dev, fr_dev, en_vocab_path, fr_vocab_path = data_utils.prepare_wmt_data( FLAGS.data_dir, FLAGS.en_vocab_size, FLAGS.fr_vocab_size) _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path) with tf.Session() as sess: # Create model. print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size)) model = create_model(sess, False) # Read data into buckets and compute their sizes. print ("Reading development training data (limit: %d)." % FLAGS.max_train_data_size) dev_set = read_data(en_dev, fr_dev) train_set = read_data(en_train, fr_train, FLAGS.max_train_data_size) train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))] train_total_size = float(sum(train_bucket_sizes)) # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to # the size if i-th training bucket, as used later. train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size for i in xrange(len(train_bucket_sizes))] # This is the training loop. step_time, loss = 0.0, 0.0 current_step = 0 previous_losses = [] while True: # Choose a bucket according to data distribution. We pick a random number # in [0, 1] and use the corresponding interval in train_buckets_scale. random_number_01 = np.random.random_sample() bucket_id = min([i for i in xrange(len(train_buckets_scale)) if train_buckets_scale[i] > random_number_01]) # Get a batch and make a step. start_time = time.time() encoder_inputs, decoder_inputs, target_weights = model.get_batch( train_set, bucket_id) _, step_loss, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, False) step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint loss += step_loss / FLAGS.steps_per_checkpoint current_step += 1 # Once in a while, we save checkpoint, print statistics, and run evals. if current_step % FLAGS.steps_per_checkpoint == 0: # Print statistics for the previous epoch. perplexity = math.exp(float(loss)) if loss < 300 else float("inf") print ("global step %d learning rate %.4f step-time %.2f perplexity " "%.2f" % (model.global_step.eval(), model.learning_rate.eval(), step_time, perplexity)) # Decrease learning rate if no improvement was seen over last 3 times. if len(previous_losses) > 2 and loss > max(previous_losses[-3:]): sess.run(model.learning_rate_decay_op) previous_losses.append(loss) # Save checkpoint and zero timer and loss. checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt") model.saver.save(sess, checkpoint_path, global_step=model.global_step) step_time, loss = 0.0, 0.0 # output train/hyp out_batch_size = output_logits[0].shape[0] rand_idx = np.random.randint(0, out_batch_size) inputs = [int(x[rand_idx]) for x in decoder_inputs] outputs = [int(np.argmax(logit[rand_idx])) for logit in output_logits] if data_utils.EOS_ID in inputs: inputs = inputs[:inputs.index(data_utils.EOS_ID)] if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] print(" trg = " + " ".join([tf.compat.as_str(rev_fr_vocab[input]) for input in inputs])) print(" hyp = " + " ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs])) # Run evals on development set and print their perplexity. 
for bucket_id in xrange(len(_buckets)): if len(dev_set[bucket_id]) == 0: print(" eval: empty bucket %d" % (bucket_id)) continue encoder_inputs, decoder_inputs, target_weights = model.get_batch( dev_set, bucket_id) _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) eval_ppx = math.exp(float(eval_loss)) if eval_loss < 300 else float( "inf") print(" eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx)) sys.stdout.flush()
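The training and evaluation code above converts a cross-entropy loss into perplexity with the same guarded expression each time. A tiny helper for that conversion, as a sketch (safe_perplexity is an illustrative name):

import math


def safe_perplexity(loss):
  """Perplexity = exp(cross-entropy), capped to avoid overflow for huge losses."""
  return math.exp(float(loss)) if loss < 300 else float("inf")


# e.g. print("eval: bucket %d perplexity %.2f" % (bucket_id, safe_perplexity(eval_loss)))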