def main():
    with tf.Session() as sess:
        model = load_model(sess, FLAGS.checkpoint_dir)
        model.batch_size = 1
        model.dropout = 1
        vocab = vocab_utils.VocabMapper(FLAGS.data_dir)
        sys.stdout.write(">")
        sys.stdout.flush()
        sentence = sys.stdin.readline().lower()
        conversation_history = [sentence]
        while sentence:
            sentence = util.tokenizer.basic_tokenizer(sentence)
            #token_ids = list(reversed(vocab.token_2_indices(" ".join(conversation_history))))
            token_ids = list(reversed(vocab.token_2_indices(sentence)))
            source = np.zeros(shape=[1, len(token_ids)], dtype=np.int32)
            for i, j in enumerate(token_ids):
                source[0, i] = j
            # the encoder needs the true length of the (single) source sequence
            source_lengths = [len(token_ids)]
            output_logits = model.test(sess, source, source,
                                       source_lengths, source_lengths)
            #TODO implement beam search
            #outputs = outputs[:outputs.index(5)]
            print(output_logits)  # ids of output words sequence
            convo_output = " ".join(vocab.indices_2_tokens(output_logits))
            conversation_history.append(convo_output)
            print(convo_output)
            sys.stdout.write(">")
            sys.stdout.flush()
            sentence = sys.stdin.readline().lower()
            conversation_history.append(sentence)
            conversation_history = conversation_history[-convo_hist_limit:]
def main():
    with tf.Session() as sess:
        model = loadModel(sess, FLAGS.checkpoint_dir)
        print _buckets
        model.batch_size = 1
        vocab = vocab_utils.VocabMapper(FLAGS.data_dir)
        sys.stdout.write(">")
        sys.stdout.flush()
        sentence = sys.stdin.readline().lower()
        conversation_history = [sentence]
        while sentence:
            use_static_match = False
            if len(static_sources) > 0:
                #static_match = process.extractOne(sentence, static_sources)
                # Check whether the static match is close enough to the original input
                best_ratio = 0
                static_match = ""
                for s in static_sources:
                    score = fuzz.partial_ratio(sentence, s)
                    if score > best_ratio:
                        static_match = s
                        best_ratio = score
                if best_ratio > FLAGS.static_temp:
                    use_static_match = True
                    # Find the corresponding target in the static list, bypassing the neural net output
                    convo_output = static_targets[static_sources.index(static_match)]
            if not use_static_match:
                token_ids = list(reversed(vocab.tokens2Indices(" ".join(conversation_history))))
                #token_ids = list(reversed(vocab.tokens2Indices(sentence)))
                bucket_id = min([b for b in xrange(len(_buckets))
                                 if _buckets[b][0] > len(token_ids)])
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                    {bucket_id: [(token_ids, [])]}, bucket_id)
                _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                                 target_weights, bucket_id, True)
                #TODO implement beam search
                outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
                if vocab_utils.EOS_ID in outputs:
                    outputs = outputs[:outputs.index(vocab_utils.EOS_ID)]
                convo_output = " ".join(vocab.indices2Tokens(outputs))
            conversation_history.append(convo_output)
            print convo_output
            sys.stdout.write(">")
            sys.stdout.flush()
            sentence = sys.stdin.readline().lower()
            conversation_history.append(sentence)
            conversation_history = conversation_history[-convo_hist_limit:]
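# The loop above scores the user's sentence against each canned prompt with
# fuzzywuzzy's partial_ratio and, if the best score clears FLAGS.static_temp,
# returns the paired canned response instead of querying the model. The helper
# below is a minimal sketch of that lookup pulled out on its own; the function
# name and the explicit threshold/targets parameters are illustrative and are
# not part of this repo.
from fuzzywuzzy import fuzz

def best_static_match(sentence, sources, targets, threshold):
    """Return the canned response whose prompt best matches `sentence`,
    or None if no prompt scores above `threshold` (0-100 scale)."""
    best_ratio, best_index = 0, -1
    for i, s in enumerate(sources):
        score = fuzz.partial_ratio(sentence, s)  # similarity score in [0, 100]
        if score > best_ratio:
            best_ratio, best_index = score, i
    if best_ratio > threshold:
        return targets[best_index]
    return None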
def run(self):
    if not self.data_files_exist:
        print("Obtaining raw text conversation files...")
        text_files = self.get_raw_file_list(self.source_data_path)
        # randomly shuffle order of files
        shuffle(text_files)
        num_train_files = int(self.train_frac * len(text_files))
        #create vocab file
        if not self.vocab_exists:
            vocab_builder = vocab_utils.VocabBuilder(self.max_vocab_size,
                                                     self.processed_data_path)
            print("Building vocab...")
            #loop through data
            for text_file in text_files:
                with open(text_file, "rb") as f:
                    vocab_builder.grow_vocab(f.read())
            print("Creating vocab file...")
            vocab_builder.create_vocab_file()
        if not self.data_files_exist:
            self.vocab_mapper = vocab_utils.VocabMapper(self.processed_data_path)
            #create source and target token id files
            processes = []
            print("Creating token id data source and target train files...")
            if len(text_files) == 1:
                num_train_files = 1
                text_files = self.split_single_2_many(text_files[0], self.train_frac)
            p1 = Process(target=self.loop_parse_text_files,
                         args=([text_files[:num_train_files]], True))
            p1.start()
            processes.append(p1)
            print("Creating token id data source and target test files...")
            print("This is going to take a while...")
            p2 = Process(target=self.loop_parse_text_files,
                         args=([text_files[num_train_files:]], False))
            p2.start()
            processes.append(p2)
            for p in processes:
                if p.is_alive():
                    p.join()
    print("Done data pre-processing...")
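# The vocab_utils classes used throughout these scripts are not shown in this
# section. The sketch below is only a guess at the minimal interface the
# pipeline relies on (grow_vocab / create_vocab_file on the builder;
# get_vocab_size / token_2_indices / indices_2_tokens on the mapper); the real
# implementations in the repo will differ in file format, special tokens
# (PAD/GO/EOS/UNK), and tokenization.
import os
from collections import Counter

class MinimalVocabBuilder(object):
    def __init__(self, max_vocab_size, out_dir):
        self.max_vocab_size = max_vocab_size
        self.out_path = os.path.join(out_dir, "vocab.txt")  # hypothetical file name
        self.counts = Counter()

    def grow_vocab(self, text):
        # Accumulate token counts from one raw text file.
        self.counts.update(text.lower().split())

    def create_vocab_file(self):
        # Keep the most frequent tokens, one per line, most frequent first.
        tokens = [t for t, _ in self.counts.most_common(self.max_vocab_size)]
        with open(self.out_path, "w") as f:
            f.write("\n".join(tokens))

class MinimalVocabMapper(object):
    def __init__(self, data_dir):
        with open(os.path.join(data_dir, "vocab.txt")) as f:
            self.tokens = f.read().split("\n")
        self.index = {t: i for i, t in enumerate(self.tokens)}

    def get_vocab_size(self):
        return len(self.tokens)

    def token_2_indices(self, sentence):
        return [self.index.get(t, 0) for t in sentence.split()]  # 0 stands in for UNK here

    def indices_2_tokens(self, ids):
        return [self.tokens[i] for i in ids]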
def main():
    with tf.Session() as sess:
        model = load_model(sess, FLAGS.checkpoint_dir)
        model.batch_size = 1
        model.dropout = 1
        vocab = vocab_utils.VocabMapper(FLAGS.data_dir)
        sys.stdout.write(">")
        sys.stdout.flush()
        sentence = sys.stdin.readline().lower()
        conversation_history = [sentence]
        while sentence:
            token_ids = list(reversed(vocab.token_2_indices(" ".join(conversation_history))))
            #token_ids = list(reversed(vocab.token_2_indices(sentence)))
            bucket_id = min([b for b in xrange(len(_buckets))
                             if _buckets[b][0] > len(token_ids)])
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            #TODO implement beam search
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            if vocab_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(vocab_utils.EOS_ID)]
            convo_output = " ".join(vocab.indices_2_tokens(outputs))
            conversation_history.append(convo_output)
            print(convo_output)
            sys.stdout.write(">")
            sys.stdout.flush()
            sentence = sys.stdin.readline().lower()
            conversation_history.append(sentence)
            conversation_history = conversation_history[-convo_hist_limit:]
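# The decode loops above take a per-position argmax over output_logits, which is
# greedy decoding; the "TODO implement beam search" would replace that argmax
# with something like the sketch below. This is only an illustration: it assumes
# a hypothetical decode_step(prefix_ids) callback that returns a 1-D numpy array
# of next-token log-probabilities given the partial output, which the model
# classes in this repo do not expose as written.
import numpy as np

def beam_search(decode_step, eos_id, beam_width=4, max_len=30):
    """Return the highest-scoring token id sequence under `decode_step`."""
    beams = [([], 0.0)]  # (token ids so far, cumulative log-probability)
    finished = []
    for _ in range(max_len):
        candidates = []
        for prefix, score in beams:
            log_probs = decode_step(prefix)
            # Keep only the top `beam_width` extensions of this prefix.
            for token in np.argsort(log_probs)[-beam_width:]:
                candidates.append((prefix + [int(token)],
                                   score + float(log_probs[token])))
        # Prune to the globally best `beam_width` hypotheses.
        candidates.sort(key=lambda c: c[1], reverse=True)
        beams = []
        for prefix, score in candidates[:beam_width]:
            if prefix[-1] == eos_id:
                finished.append((prefix[:-1], score))  # drop the EOS marker
            else:
                beams.append((prefix, score))
        if not beams:
            break
    finished.extend(beams)
    return max(finished, key=lambda c: c[1])[0]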
def main():
    if not os.path.exists(FLAGS.checkpoint_dir):
        os.mkdir(FLAGS.checkpoint_dir)
    path = get_checkpoint_path()
    print("path is {0}".format(path))
    data_processor = data_utils.DataProcessor(
        FLAGS.vocab_size, FLAGS.raw_data_dir, FLAGS.data_dir, FLAGS.train_frac,
        FLAGS.tokenizer, FLAGS.convo_limits, FLAGS.max_target_length,
        FLAGS.max_source_length)
    data_processor.run()
    #create model
    print("Creating model with...")
    print("Number of hidden layers: {0}".format(FLAGS.num_layers))
    print("Number of units per layer: {0}".format(FLAGS.hidden_size))
    print("Dropout: {0}".format(FLAGS.dropout))
    vocab_mapper = vocab_utils.VocabMapper(FLAGS.data_dir)
    vocab_size = vocab_mapper.get_vocab_size()
    print("Vocab size is: {0}".format(vocab_size))
    FLAGS.vocab_size = vocab_size
    last_test_loss = float('inf')
    with tf.Session() as sess:
        model = create_model(sess, path, vocab_size)
        #train model and save to checkpoint
        print("Beginning training...")
        print("Maximum number of epochs to train for: {0}".format(FLAGS.max_epoch))
        print("Batch size: {0}".format(FLAGS.batch_size))
        print("Starting learning rate: {0}".format(FLAGS.learning_rate))
        print("Learning rate decay factor: {0}".format(FLAGS.lr_decay_factor))
        source_train_file_path = data_processor.data_source_train
        target_train_file_path = data_processor.data_target_train
        source_test_file_path = data_processor.data_source_test
        target_test_file_path = data_processor.data_target_test
        print(source_train_file_path)
        print(target_train_file_path)
        train_set = read_data(source_train_file_path, target_train_file_path,
                              FLAGS.max_train_data_size)
        random.shuffle(train_set)
        test_set = read_data(source_test_file_path, target_test_file_path,
                             FLAGS.max_train_data_size)
        random.shuffle(test_set)
        step_time, train_loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        num_batches = len(train_set) // FLAGS.batch_size
        batch_pointer = 0
        while True:
            # Get a batch and make a step.
            start_time = time.time()
            start_index = int(batch_pointer * FLAGS.batch_size)
            end_index = int(start_index + FLAGS.batch_size)
            inputs, targets, input_lengths, target_lengths = \
                model.get_batch(train_set[start_index:end_index])
            step_loss = model.step(sess, inputs, targets, input_lengths, target_lengths)
            batch_pointer = (batch_pointer + 1) % num_batches
            step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
            train_loss += step_loss / FLAGS.steps_per_checkpoint
            current_step += 1
            # Once in a while, save a checkpoint, show statistics, and run tests.
            if current_step % FLAGS.steps_per_checkpoint == 0:
                # Show statistics for the previous checkpoint period.
                print("Step {0} learning rate {1} step-time {2} training loss {3}"
                      .format(model.global_step.eval(),
                              round(model.learning_rate, 4),
                              round(step_time, 4), round(train_loss, 4)))
                # Decrease learning rate if no improvement was seen over last 3 times.
                #if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                #    sess.run(model.learning_rate_decay_op)
                previous_losses.append(train_loss)
                # Run the model over the test set and report the mean test loss.
                test_losses = []
                num_test_batches = int(len(test_set) / FLAGS.batch_size)
                for test_pointer in range(0, num_test_batches):
                    start_index = test_pointer * FLAGS.batch_size
                    inputs, targets, input_lengths, target_lengths = \
                        model.get_batch(test_set[start_index:start_index + FLAGS.batch_size])
                    test_loss = model.step(sess, inputs, targets, input_lengths,
                                           target_lengths, test_mode=True)
                    test_losses.append(test_loss)
                test_loss = float(np.mean(test_losses))
                print(" step: {0} test loss: {1}".format(
                    model.global_step.eval(), round(test_loss, 4)))
                # Save a checkpoint only when the test loss improves, then zero timer and loss.
                if test_loss < last_test_loss:
                    checkpoint_path = os.path.join(path, "chatbot")
                    model.saver.save(sess, checkpoint_path, global_step=model.global_step)
                last_test_loss = test_loss
                step_time, train_loss = 0.0, 0.0
                sys.stdout.flush()
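# read_data, called by the training loop above, is not shown in this section.
# The sketch below follows the usual seq2seq-tutorial convention and matches how
# the non-bucketed trainer above uses its return value (a flat, shuffleable list
# of pairs): the source and target files are assumed to be parallel, with one
# whitespace-separated list of token ids per line, keeping at most max_size
# pairs. The repo's real function may differ.
def read_data(source_path, target_path, max_size=None):
    """Read parallel token-id files into a list of (source_ids, target_ids) pairs."""
    data_set = []
    with open(source_path) as source_file, open(target_path) as target_file:
        for source_line, target_line in zip(source_file, target_file):
            if max_size and len(data_set) >= max_size:
                break
            source_ids = [int(x) for x in source_line.split()]
            target_ids = [int(x) for x in target_line.split()]
            data_set.append((source_ids, target_ids))
    return data_set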
def main():
    config.read(FLAGS.config_file)
    max_num_lines = int(config.get("max_data_sizes", "num_lines"))
    max_target_length = int(config.get("max_data_sizes", "max_target_length"))
    max_source_length = int(config.get("max_data_sizes", "max_source_length"))
    if not os.path.exists(FLAGS.checkpoint_dir):
        os.mkdir(FLAGS.checkpoint_dir)
    path = getCheckpointPath()
    print "path is {0}".format(path)
    data_processor = data_utils.DataProcessor(FLAGS.vocab_size,
        FLAGS.raw_data_dir, FLAGS.data_dir, FLAGS.train_frac, FLAGS.tokenizer,
        max_num_lines, max_target_length, max_source_length,
        FLAGS.is_discrete, FLAGS.extra_discrete_data)
    data_processor.run()
    #create model
    print "Creating model with..."
    print "Number of hidden layers: {0}".format(FLAGS.num_layers)
    print "Number of units per layer: {0}".format(FLAGS.hidden_size)
    print "Dropout: {0}".format(FLAGS.dropout)
    vocab_mapper = vocab_utils.VocabMapper(FLAGS.data_dir)
    vocab_size = vocab_mapper.getVocabSize()
    print "Vocab size is: {0}".format(vocab_size)
    FLAGS.vocab_size = vocab_size
    with tf.Session() as sess:
        writer = tf.train.SummaryWriter("/tmp/tb_logs_chatbot", sess.graph)
        model = createModel(sess, path, vocab_size)
        print "Using bucket sizes:"
        print _buckets
        #train model and save to checkpoint
        print "Beginning training..."
        print "Maximum number of epochs to train for: {0}".format(FLAGS.max_epoch)
        print "Batch size: {0}".format(FLAGS.batch_size)
        print "Starting learning rate: {0}".format(FLAGS.learning_rate)
        print "Learning rate decay factor: {0}".format(FLAGS.lr_decay_factor)
        source_train_file_path = data_processor.data_source_train
        target_train_file_path = data_processor.data_target_train
        source_test_file_path = data_processor.data_source_test
        target_test_file_path = data_processor.data_target_test
        print source_train_file_path
        print target_train_file_path
        train_set = readData(source_train_file_path, target_train_file_path,
                             FLAGS.max_train_data_size)
        test_set = readData(source_test_file_path, target_test_file_path,
                            FLAGS.max_train_data_size)
        train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
        print "bucket sizes = {0}".format(train_bucket_sizes)
        train_total_size = float(sum(train_bucket_sizes))
        train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                               for i in xrange(len(train_bucket_sizes))]
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        while True:
            # Choose a bucket according to data distribution. We pick a random number
            # in [0, 1] and use the corresponding interval in train_buckets_scale.
            random_number_01 = np.random.random_sample()
            bucket_id = min([i for i in xrange(len(train_buckets_scale))
                             if train_buckets_scale[i] > random_number_01])
            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                train_set, bucket_id)
            _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, bucket_id, False)
            step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
            loss += step_loss / FLAGS.steps_per_checkpoint
            current_step += 1
            # Once in a while, we save checkpoint, print statistics, and run evals.
            if current_step % FLAGS.steps_per_checkpoint == 0:
                train_loss_summary = tf.Summary()
                str_summary_train_loss = train_loss_summary.value.add()
                str_summary_train_loss.simple_value = loss
                str_summary_train_loss.tag = "train_loss"
                writer.add_summary(train_loss_summary, current_step)
                # Print statistics for the previous checkpoint period.
                perplexity = math.exp(loss) if loss < 300 else float('inf')
                print ("global step %d learning rate %.4f step-time %.2f perplexity "
                       "%.2f" % (model.global_step.eval(), model.learning_rate.eval(),
                                 step_time, perplexity))
                # Decrease learning rate if no improvement was seen over last 3 times.
                if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                    sess.run(model.learning_rate_decay_op)
                previous_losses.append(loss)
                # Save checkpoint and zero timer and loss.
                checkpoint_path = os.path.join(path, "chatbot.ckpt")
                model.saver.save(sess, checkpoint_path, global_step=model.global_step)
                step_time, loss = 0.0, 0.0
                # Run evals on development set and print their perplexity.
                perplexity_summary = tf.Summary()
                eval_loss_summary = tf.Summary()
                for bucket_id in xrange(len(_buckets)):
                    if len(test_set[bucket_id]) == 0:
                        print(" eval: empty bucket %d" % (bucket_id))
                        continue
                    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                        test_set, bucket_id)
                    _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                                 target_weights, bucket_id, True)
                    eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf')
                    print(" eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))
                    str_summary_ppx = perplexity_summary.value.add()
                    str_summary_ppx.simple_value = eval_ppx
                    str_summary_ppx.tag = "perplexity_bucket_%d" % bucket_id
                    str_summary_eval_loss = eval_loss_summary.value.add()
                    #need to convert from numpy.float32 to native float type
                    str_summary_eval_loss.simple_value = float(eval_loss)
                    str_summary_eval_loss.tag = "eval_loss_bucket_%d" % bucket_id
                writer.add_summary(perplexity_summary, current_step)
                writer.add_summary(eval_loss_summary, current_step)
                sys.stdout.flush()
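# How the bucket sampling in the training loop above behaves, shown with
# illustrative numbers (the actual _buckets tuple and bucket sizes come from the
# repo's config and data, not from here). train_buckets_scale is a running
# cumulative fraction of the training pairs, so drawing a uniform number in
# [0, 1] selects each bucket with probability proportional to its size.
import numpy as np

_buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]   # (max source len, max target len), illustrative
train_bucket_sizes = [2000, 5000, 2500, 500]          # illustrative pair counts per bucket
train_total_size = float(sum(train_bucket_sizes))
train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                       for i in range(len(train_bucket_sizes))]
# -> [0.2, 0.7, 0.95, 1.0]

random_number_01 = np.random.random_sample()
bucket_id = min([i for i in range(len(train_buckets_scale))
                 if train_buckets_scale[i] > random_number_01])
# e.g. a draw of 0.42 falls in the (0.2, 0.7] interval, so bucket_id == 1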
def run(self):
    if not self.data_files_exist:
        print "Obtaining raw text conversation files..."
        text_files = self.getRawFileList(self.source_data_path)
        if not self.extra_discrete_data == "":
            extra_files = self.getRawFileList(self.extra_discrete_data)
        else:
            extra_files = []
        # randomly shuffle order of files
        shuffle(text_files)
        num_train_files = int(self.train_frac * len(text_files))
        #create vocab file
        if not self.vocab_exists:
            vocab_builder = vocab_utils.VocabBuilder(self.max_vocab_size,
                                                     self.processed_data_path)
            print "Building vocab..."
            #loop through continuous/discrete data
            for text_file in text_files:
                with open(text_file, "r+") as f:
                    vocab_builder.growVocab(f.read())
            #loop through extra discrete data
            for text_file in extra_files:
                with open(text_file, "r+") as f:
                    vocab_builder.growVocab(f.read())
            print "Creating vocab file..."
            vocab_builder.createVocabFile()
        if not self.data_files_exist:
            self.vocab_mapper = vocab_utils.VocabMapper(self.processed_data_path)
            #create source and target token id files
            processes = []
            print "Creating token id data source and target train files..."
            if len(text_files) == 1:
                num_train_files = 1
                text_files = self.splitSingle2Many(text_files[0], self.train_frac)
            if len(extra_files) == 1:
                num_extra_files = 1
                extra_files = self.splitSingle2Many(extra_files[0], self.train_frac)
            else:
                num_extra_files = len(extra_files)
            p1 = Process(target=self.loopParseTextFiles,
                         args=([text_files[:num_train_files]], True, self.is_discrete))
            p1.start()
            processes.append(p1)
            print "Creating token id data source and target test files..."
            print "This is going to take a while..."
            p2 = Process(target=self.loopParseTextFiles,
                         args=([text_files[num_train_files:]], False, self.is_discrete))
            p2.start()
            processes.append(p2)
            for p in processes:
                if p.is_alive():
                    p.join()
            if len(extra_files) > 0:
                p2 = Process(target=self.loopParseTextFiles,
                             args=([extra_files[num_extra_files:]], False, True))
                p2.start()
                processes.append(p2)
                p1 = Process(target=self.loopParseTextFiles,
                             args=([extra_files[:num_extra_files]], True, True))
                p1.start()
                processes.append(p1)
                for p in processes:
                    if p.is_alive():
                        p.join()
    print "Done data pre-processing..."