# Shared imports assumed by the snippets in this file; module-level names such
# as BUCKETS, _buckets, FLAGS, DATA_DIR, TRAIN_DIR, VOCAB_SIZE, BATCH_SIZE,
# create_model() and get_english_vocab() are defined elsewhere in the project.
from __future__ import print_function

import heapq
import logging
import os
import sys
from random import random

import numpy as np
import tensorflow as tf
from tensorflow.python.platform import gfile

import data_utils


def get_predicted_sentence(input_sentence, vocab, rev_vocab, model, sess):
    """Greedily decode a single input sentence into an output sentence."""
    input_token_ids = data_utils.sentence_to_token_ids(input_sentence, vocab)
    # Pick the smallest bucket that can hold the input.
    bucket_id = min([b for b in xrange(len(BUCKETS))
                     if BUCKETS[b][0] > len(input_token_ids)])
    outputs = []
    feed_data = {bucket_id: [(input_token_ids, outputs)]}
    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
        feed_data, bucket_id)
    _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                     target_weights, bucket_id,
                                     forward_only=True)
    outputs = []
    # Greedy decoding: take the argmax token at every step, stop at EOS.
    for logit in output_logits:
        selected_token_id = int(np.argmax(logit, axis=1))
        if selected_token_id == data_utils.EOS_ID:
            break
        else:
            outputs.append(selected_token_id)
    output_sentence = ' '.join([rev_vocab[output] for output in outputs])
    return output_sentence
def interactive_comparison():
    """Read sentences from stdin one at a time and compare each consecutive
    pair by the cosine similarity and Euclidean distance of their encoder
    context vectors."""
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        en_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.from" % FLAGS.from_vocab_size)
        fr_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.to" % FLAGS.to_vocab_size)
        en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
        _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

        # Decode from standard input.
        sys.stdout.write("(1) > ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        contexts = []
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), en_vocab)
            print("tokenids:", token_ids)
            # Which bucket does it belong to?
            bucket_id = get_bucket(en_vocab, sentence)
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get the output context vector.
            output_context = model.step_context(sess, encoder_inputs,
                                                decoder_inputs, target_weights,
                                                bucket_id)
            # Append the context so we can compute the similarity metrics.
            contexts.append(output_context)
            # Display the output.
            print("bucket_id: ", bucket_id)
            print("output_context", output_context)
            # Once we have a pair of contexts, compute the similarity metrics.
            if len(contexts) == 2:
                cosine_distance = cosine_similarity(*contexts)
                euclid_distance = np.linalg.norm(contexts[1] - contexts[0])
                print('cosine_similarity', cosine_distance)
                print('euclid_distance', euclid_distance)
                print('-------------------------------')
                contexts = []  # Start again.
            next_sentence = len(contexts) + 1
            print("(%i) > " % next_sentence, end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
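# interactive_comparison() above calls a cosine_similarity() helper that is not
# shown in this file. A minimal sketch is given below; the name matches the call
# site, but the implementation is an assumption, not the project's original.
def cosine_similarity(a, b):
    """Cosine similarity between two context vectors (flattened to 1-D)."""
    a = np.asarray(a).ravel()
    b = np.asarray(b).ravel()
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(np.dot(a, b) / denom) if denom else 0.0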
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        en_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.from" % FLAGS.from_vocab_size)
        fr_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.to" % FLAGS.to_vocab_size)
        en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
        _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), en_vocab)
            print("tokenids:", token_ids)
            # Which bucket does it belong to?
            bucket_id = get_bucket(en_vocab, sentence)
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs,
                                             decoder_inputs, target_weights,
                                             bucket_id, True)
            # Get the output context vector.
            output_context = model.step_context(sess, encoder_inputs,
                                                decoder_inputs, target_weights,
                                                bucket_id)
            # Display the output.
            print("bucket_id: ", bucket_id)
            print("output_context", output_context)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Print out French sentence corresponding to outputs.
            print(" ".join([tf.compat.as_str(rev_fr_vocab[output])
                            for output in outputs]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def get_bucket(en_vocab, sentence):
    """Return the bucket_id that the sentence belongs to."""
    token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence),
                                                 en_vocab)
    # Which bucket does it belong to? Fall back to the largest bucket
    # (truncating the sentence) if none is big enough.
    bucket_id = len(_buckets) - 1
    for i, bucket in enumerate(_buckets):
        if bucket[0] >= len(token_ids):
            bucket_id = i
            break
    else:
        logging.warning("Sentence truncated: %s", sentence)
    return bucket_id
def get_context(sess, model, en_vocab, sentence):
    """Return the context vector for the sentence."""
    token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence),
                                                 en_vocab)
    # Which bucket does it belong to?
    bucket_id = get_bucket(en_vocab, sentence)
    # Get a 1-element batch to feed the sentence to the model.
    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
        {bucket_id: [(token_ids, [])]}, bucket_id)
    # Get the output context vector.
    return model.step_context(sess, encoder_inputs, decoder_inputs,
                              target_weights, bucket_id)
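# A small usage sketch tying get_context() to the cosine_similarity() helper
# sketched above: embed two sentences and report how similar their encoder
# contexts are. This function is illustrative only and assumes the
# one-sentence-at-a-time setup used in decode() (model.batch_size = 1).
def compare_sentences(sess, model, en_vocab, sentence_a, sentence_b):
    context_a = np.hstack(get_context(sess, model, en_vocab, sentence_a))
    context_b = np.hstack(get_context(sess, model, en_vocab, sentence_b))
    return cosine_similarity(context_a, context_b)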
def get_sentence_to_context_map(sentences):
    """Process all of the sentences with the model.

    Return a map between sentence text and the context vectors.
    The order of the map is undefined due to the bucketing process.
    """
    # Load the vocab.
    en_vocab = get_english_vocab(DATA_DIR, VOCAB_SIZE)

    # Allocate the sentences to buckets.
    bucketed = {}
    for sentence in sentences:
        bucket_id = get_bucket(en_vocab, sentence)
        bucketed.setdefault(bucket_id, [])
        bucketed[bucket_id].append(sentence)

    mapped = {}
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True, train_dir=TRAIN_DIR)
        model.batch_size = BATCH_SIZE  # We decode BATCH_SIZE sentences at a time.
        # Iterate over each bucket.
        for bucket_id, sentences in bucketed.iteritems():
            for batch in chunker(sentences, BATCH_SIZE):
                data = []
                # Tokenize each sentence in the batch.
                for sentence in batch:
                    token_ids = data_utils.sentence_to_token_ids(
                        tf.compat.as_bytes(sentence), en_vocab)
                    expected_output = []
                    data.append((token_ids, expected_output))
                # Use the model to obtain contexts for each sentence in the batch.
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                    {bucket_id: data}, bucket_id)
                contexts = model.step_context(sess, encoder_inputs,
                                              decoder_inputs, target_weights,
                                              bucket_id)
                features = np.hstack(contexts)
                print('Encoded {0} sentences into {1} dimensional vectors'.format(
                    *features.shape))
                # Now we align sentences with their contexts.
                for i, sentence in enumerate(batch):
                    mapped[sentence] = features[i, :].tolist()
    return mapped
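# get_sentence_to_context_map() relies on a chunker() helper defined elsewhere
# in the project. A plausible minimal version is sketched below; it is an
# assumption, not necessarily the original implementation.
def chunker(seq, size):
    """Yield successive size-sized slices of seq; the last slice may be shorter."""
    for start in range(0, len(seq), size):
        yield seq[start:start + size]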
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):
    """Tokenize data file and turn into token-ids using given vocabulary file.

    This function loads data line-by-line from data_path, calls
    data_utils.sentence_to_token_ids on each tab-separated utterance, and
    saves the result to target_path. See the comment for sentence_to_token_ids
    on the details of the token-ids format.

    Args:
      data_path: path to the data file, one example per line; each line may
        hold several utterances separated by tabs.
      target_path: path where the file with token-ids will be created.
      vocabulary_path: path to the vocabulary file.
      tokenizer: a function to use to tokenize each sentence; if None,
        basic_tokenizer will be used.
      normalize_digits: Boolean; if true, all digits are replaced by 0s.
    """
    if not gfile.Exists(target_path):
        print("Tokenizing data in %s" % data_path)
        vocab, _ = data_utils.initialize_vocabulary(vocabulary_path)
        with gfile.GFile(data_path, mode="rb") as data_file:
            with gfile.GFile(target_path, mode="w") as tokens_file:
                counter = 0
                for line in data_file:
                    counter += 1
                    if counter % 100000 == 0:
                        print("  tokenizing line %d" % counter)
                    # Each line holds several tab-separated utterances.
                    utterances = line.split('\t')
                    tokenized_utterances = []
                    for utterance in utterances:
                        token_ids = data_utils.sentence_to_token_ids(
                            tf.compat.as_bytes(utterance), vocab, tokenizer,
                            normalize_digits)
                        tokenized_utterances.append(
                            " ".join([str(tok) for tok in token_ids]))
                    tokens_file.write("\t".join(tokenized_utterances) + "\n")
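# A minimal usage sketch for data_to_token_ids(); the file names are
# hypothetical placeholders and the vocabulary file is assumed to exist:
#
#   data_to_token_ids(data_path="data/chat_corpus.txt",
#                     target_path="data/chat_corpus.ids.txt",
#                     vocabulary_path="data/vocab40000.from")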
def get_predicted_sentence(args, input_sentence, vocab, rev_vocab, model, sess,
                           debug=False, return_raw=False):

    def model_step(enc_inp, dec_inp, dptr, target_weights, bucket_id):
        # Run one forward pass and return the softmax distribution over the
        # vocabulary at decoder position dptr.
        _, _, logits = model.step(sess, enc_inp, dec_inp, target_weights,
                                  bucket_id, forward_only=True)
        prob = softmax(logits[dptr][0])
        return prob

    def greedy_dec(output_logits, rev_vocab):
        # Greedy decoding: argmax at every step, truncated at the first EOS.
        selected_token_ids = [int(np.argmax(logit, axis=1))
                              for logit in output_logits]
        if data_utils.EOS_ID in selected_token_ids:
            eos = selected_token_ids.index(data_utils.EOS_ID)
            selected_token_ids = selected_token_ids[:eos]
        output_sentence = ' '.join(
            [dict_lookup(rev_vocab, t) for t in selected_token_ids])
        return output_sentence

    input_token_ids = data_utils.sentence_to_token_ids(input_sentence, vocab)
    # Which bucket does it belong to?
    bucket_id = min([b for b in range(len(args.buckets))
                     if args.buckets[b][0] > len(input_token_ids)])
    outputs = []
    feed_data = {bucket_id: [(input_token_ids, outputs)]}

    # Get a 1-element batch to feed the sentence to the model.
    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
        feed_data, bucket_id)
    if debug:
        print("\n[get_batch]\n", encoder_inputs, decoder_inputs, target_weights)

    # Original greedy decoding.
    if args.beam_size == 1:
        _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, bucket_id,
                                         forward_only=True)
        return [{'dec_inp': greedy_dec(output_logits, rev_vocab), 'prob': 1}]

    # Beam search: each beam is a (prob, tie-breaker, candidate) tuple, where
    # the candidate tracks its decoder inputs and its seq2seq, language-model
    # and adjusted probabilities.
    beams, new_beams, results = [(1, 0, {'eos': 0,
                                         'dec_inp': decoder_inputs,
                                         'prob': 1,
                                         'prob_ts': 1,
                                         'prob_t': 1})], [], []
    # Dummy (all-PAD) encoder inputs, used to estimate the language-model-only
    # probability for the anti-LM correction.
    dummy_encoder_inputs = [np.array([data_utils.PAD_ID])
                            for _ in range(len(encoder_inputs))]

    for dptr in range(len(decoder_inputs) - 1):
        if dptr > 0:
            target_weights[dptr] = [1.]
            # Promote the best candidates from the previous step.
            beams, new_beams = new_beams[:args.beam_size], []
        if debug:
            print("=====[beams]=====", beams)
        heapq.heapify(beams)  # since we will remove something

        for prob, _, cand in beams:
            if cand['eos']:
                results += [(prob, 0, cand)]
                continue

            # Normal seq2seq probability (conditioned on the real input).
            if debug:
                print(cand['prob'], " ".join(
                    [dict_lookup(rev_vocab, w) for w in cand['dec_inp']]))
            all_prob_ts = model_step(encoder_inputs, cand['dec_inp'], dptr,
                                     target_weights, bucket_id)

            if args.antilm:
                # Anti-LM: probability under a dummy (empty) input.
                all_prob_t = model_step(dummy_encoder_inputs, cand['dec_inp'],
                                        dptr, target_weights, bucket_id)
                # Adjusted probability.
                all_prob = all_prob_ts - args.antilm * all_prob_t
                # + args.n_bonus * dptr + random() * 1e-50
            else:
                all_prob_t = [0] * len(all_prob_ts)
                all_prob = all_prob_ts

            # Suppress copy-cat responses (responding with the input itself).
            if dptr < len(input_token_ids):
                all_prob[input_token_ids[dptr]] = (
                    all_prob[input_token_ids[dptr]] * 0.01)

            # For debug use.
            if return_raw:
                return all_prob, all_prob_ts, all_prob_t

            # Beam search: expand with the top beam_size tokens.
            for c in np.argsort(all_prob)[::-1][:args.beam_size]:
                new_cand = {
                    'eos': (c == data_utils.EOS_ID),
                    'dec_inp': [(np.array([c]) if i == (dptr + 1) else k)
                                for i, k in enumerate(cand['dec_inp'])],
                    'prob_ts': cand['prob_ts'] * all_prob_ts[c],
                    'prob_t': cand['prob_t'] * all_prob_t[c],
                    'prob': cand['prob'] * all_prob[c],
                }
                # Insert a random tie-breaker so heapq never has to compare
                # the candidate dicts themselves.
                new_cand = (new_cand['prob'], random(), new_cand)

                try:
                    if len(new_beams) < args.beam_size:
                        heapq.heappush(new_beams, new_cand)
                    elif new_cand[0] > new_beams[0][0]:
                        heapq.heapreplace(new_beams, new_cand)
                except Exception as e:
                    print("[Error]", e)
                    print("-----[new_beams]-----\n", new_beams)
                    print("-----[new_cand]-----\n", new_cand)

    results += new_beams  # flush the last candidates

    # Post-process results.
    res_cands = []
    for prob, _, cand in sorted(results, reverse=True):
        cand['dec_inp'] = " ".join(
            [dict_lookup(rev_vocab, w) for w in cand['dec_inp']])
        res_cands.append(cand)
    return res_cands[:args.beam_size]
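# The beam-search decoder above depends on softmax() and dict_lookup() helpers
# defined elsewhere in the project. Minimal sketches consistent with how they
# are called here are given below; the exact originals may differ, and the
# "_UNK" fallback is an assumption.
def softmax(x):
    """Numerically stable softmax over a 1-D array of logits."""
    e = np.exp(x - np.max(x))
    return e / e.sum()


def dict_lookup(rev_vocab, out):
    """Map a token id (an int or a 1-element numpy array) back to its word."""
    token_id = int(out)
    word = rev_vocab[token_id] if 0 <= token_id < len(rev_vocab) else "_UNK"
    return tf.compat.as_str(word)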