Example #1
def main():
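        # Interactive chat loop: load a trained model checkpoint and decode a reply
        # for each line read from stdin.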
        with tf.Session() as sess:
                model = load_model(sess, FLAGS.checkpoint_dir)
                model.batch_size = 1
                model.dropout = 1
                vocab = vocab_utils.VocabMapper(FLAGS.data_dir)
                sys.stdout.write(">")
                sys.stdout.flush()
                sentence = sys.stdin.readline().lower()
                conversation_history = [sentence]
                while sentence:
                        sentence = util.tokenizer.basic_tokenizer(sentence)
                        #token_ids = list(reversed(vocab.token_2_indices(" ".join(conversation_history))))
                        token_ids = list(reversed(vocab.token_2_indices(sentence)))
                        source = np.zeros(shape=[1, len(token_ids)], dtype=np.int32)

                        for i, j in enumerate(token_ids):
                                source[0, i] = j
                        # assuming source_lengths holds per-sequence token counts (was hard-coded to 1)
                        source_lengths = [len(token_ids)]
                        output_logits = model.test(sess, source, source, source_lengths, source_lengths)

                        #TODO implement beam search
                        #outputs = outputs[:outputs.index(5)]
                        print(output_logits)  # ids of the output word sequence
                        convo_output = " ".join(vocab.indices_2_tokens(output_logits))
                        conversation_history.append(convo_output)
                        print(convo_output)
                        sys.stdout.write(">")
                        sys.stdout.flush()
                        sentence = sys.stdin.readline().lower()
                        conversation_history.append(sentence)
                        conversation_history = conversation_history[-convo_hist_limit:]
Example #2
def main():
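	# Chat loop with a fuzzy-match fallback: if the input is close enough to a known
	# static source, return its paired static target; otherwise decode with the seq2seq model.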
	with tf.Session() as sess:
		model = loadModel(sess, FLAGS.checkpoint_dir)
		print _buckets
		model.batch_size = 1
		vocab = vocab_utils.VocabMapper(FLAGS.data_dir)
		sys.stdout.write(">")
		sys.stdout.flush()
		sentence = sys.stdin.readline().lower()
		conversation_history = [sentence]
		while sentence:

			use_static_match = False
			if len(static_sources) > 0:
				#static_match = process.extractOne(sentence, static_sources)
				#Check if the static match is close enough to the original input
				best_ratio = 0
				static_match = ""
				for s in static_sources:
					score = fuzz.partial_ratio(sentence, s)
					if score > best_ratio:
						static_match = s
						best_ratio = score
				if best_ratio > FLAGS.static_temp:
					use_static_match = True
					#Find corresponding target in static list, bypass neural net output
					convo_output = static_targets[static_sources.index(static_match)]

			if not use_static_match:
				token_ids = list(reversed(vocab.tokens2Indices(" ".join(conversation_history))))
				#token_ids = list(reversed(vocab.tokens2Indices(sentence)))
				bucket_id = min([b for b in xrange(len(_buckets))
					if _buckets[b][0] > len(token_ids)])

				encoder_inputs, decoder_inputs, target_weights = model.get_batch(
				{bucket_id: [(token_ids, [])]}, bucket_id)

				_, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
					target_weights, bucket_id, True)

				#TODO implement beam search
				outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]

				if vocab_utils.EOS_ID in outputs:
					outputs = outputs[:outputs.index(vocab_utils.EOS_ID)]

				convo_output = " ".join(vocab.indices2Tokens(outputs))

			conversation_history.append(convo_output)
			print convo_output
			sys.stdout.write(">")
			sys.stdout.flush()
			sentence = sys.stdin.readline().lower()
			conversation_history.append(sentence)
			conversation_history = conversation_history[-convo_hist_limit:]
Example #3
    def run(self):
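        # Pre-process raw conversation data: build the vocab file if needed, then spawn
        # worker processes to write the token-id source/target train and test files.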
        if not self.data_files_exist:
            print("Obtaining raw text conversation files...")
            text_files = self.get_raw_file_list(self.source_data_path)
            # randomly shuffle order of files
            shuffle(text_files)
            num_train_files = int(self.train_frac * len(text_files))

        #create vocab file
        if not self.vocab_exists:
            vocab_builder = vocab_utils.VocabBuilder(self.max_vocab_size,
                                                     self.processed_data_path)
            print("Building vocab...")
            #loop through data
            for text_file in text_files:
                with open(text_file, "rb") as f:
                    vocab_builder.grow_vocab(f.read())
            print("Creating vocab file...")
            vocab_builder.create_vocab_file()

        if not self.data_files_exist:
            self.vocab_mapper = vocab_utils.VocabMapper(
                self.processed_data_path)
            #create source and target token id files
            processes = []
            print("Creating token id data source and target train files...")

            if len(text_files) == 1:
                num_train_files = 1
                text_files = self.split_single_2_many(text_files[0],
                                                      self.train_frac)

            p1 = Process(target=self.loop_parse_text_files,
                         args=([text_files[:num_train_files]], True))
            p1.start()
            processes.append(p1)
            print("Creating token id data source and target test files...")
            print("This is going to take a while...")
            p2 = Process(target=self.loop_parse_text_files,
                         args=([text_files[num_train_files:]], False))
            p2.start()
            processes.append(p2)

            for p in processes:
                if p.is_alive():
                    p.join()

            print("Done data pre-processing...")
Example #4
def main():
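	# Interactive chat loop (bucketed model): feed the running conversation history to the
	# seq2seq model and print the decoded reply after each stdin line.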
	with tf.Session() as sess:
		model = load_model(sess, FLAGS.checkpoint_dir)
		model.batch_size = 1
		model.dropout = 1
		vocab = vocab_utils.VocabMapper(FLAGS.data_dir)
		sys.stdout.write(">")
		sys.stdout.flush()
		sentence = sys.stdin.readline().lower()
		conversation_history = [sentence]
		while sentence:

			token_ids = list(reversed(vocab.token_2_indices(" ".join(conversation_history))))
			#token_ids = list(reversed(vocab.token_2_indices(sentence)))
			bucket_id = min([b for b in xrange(len(_buckets))
				if _buckets[b][0] > len(token_ids)])

			encoder_inputs, decoder_inputs, target_weights = model.get_batch(
			{bucket_id: [(token_ids, [])]}, bucket_id)

			_, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
				target_weights, bucket_id, True)

			#TODO implement beam search
			outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]

			if vocab_utils.EOS_ID in outputs:
				outputs = outputs[:outputs.index(vocab_utils.EOS_ID)]

			convo_output = " ".join(vocab.indices_2_tokens(outputs))

			conversation_history.append(convo_output)
			print(convo_output)
			sys.stdout.write(">")
			sys.stdout.flush()
			sentence = sys.stdin.readline().lower()
			conversation_history.append(sentence)
			conversation_history = conversation_history[-convo_hist_limit:]
Example #5
def main():
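    # End-to-end training entry point: run data pre-processing, build the model, then loop
    # over training batches with periodic evaluation and checkpointing.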
    if not os.path.exists(FLAGS.checkpoint_dir):
        os.mkdir(FLAGS.checkpoint_dir)
    path = get_checkpoint_path()
    print("path is {0}".format(path))
    data_processor = data_utils.DataProcessor(
        FLAGS.vocab_size, FLAGS.raw_data_dir, FLAGS.data_dir, FLAGS.train_frac,
        FLAGS.tokenizer, FLAGS.convo_limits, FLAGS.max_target_length,
        FLAGS.max_source_length)
    data_processor.run()
    #create model
    print("Creating model with...")
    print("Number of hidden layers: {0}".format(FLAGS.num_layers))
    print("Number of units per layer: {0}".format(FLAGS.hidden_size))
    print("Dropout: {0}".format(FLAGS.dropout))
    vocab_mapper = vocab_utils.VocabMapper(FLAGS.data_dir)
    vocab_size = vocab_mapper.get_vocab_size()
    print("Vocab size is: {0}".format(vocab_size))
    FLAGS.vocab_size = vocab_size

    last_test_loss = float('inf')
    with tf.Session() as sess:
        model = create_model(sess, path, vocab_size)
        #train model and save to checkpoint
        print("Beggining training...")
        print("Maximum number of epochs to train for: {0}".format(
            FLAGS.max_epoch))
        print("Batch size: {0}".format(FLAGS.batch_size))
        print("Starting learning rate: {0}".format(FLAGS.learning_rate))
        print("Learning rate decay factor: {0}".format(FLAGS.lr_decay_factor))

        source_train_file_path = data_processor.data_source_train
        target_train_file_path = data_processor.data_target_train
        source_test_file_path = data_processor.data_source_test
        target_test_file_path = data_processor.data_target_test
        print(source_train_file_path)
        print(target_train_file_path)

        train_set = read_data(source_train_file_path, target_train_file_path,
                              FLAGS.max_train_data_size)
        random.shuffle(train_set)
        test_set = read_data(source_test_file_path, target_test_file_path,
                             FLAGS.max_train_data_size)
        random.shuffle(test_set)

        step_time, train_loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        num_batches = len(train_set) // FLAGS.batch_size
        batch_pointer = 0

        while True:
            # Get a batch and make a step.
            start_time = time.time()
            start_index = int(batch_pointer * FLAGS.batch_size)
            end_index = int(start_index + FLAGS.batch_size)
            inputs, targets, input_lengths, target_lengths =\
              model.get_batch(train_set[start_index : end_index])
            step_loss = model.step(sess, inputs, targets, input_lengths,
                                   target_lengths)
            batch_pointer = (batch_pointer + 1) % num_batches
            step_time += (time.time() -
                          start_time) / FLAGS.steps_per_checkpoint
            train_loss += step_loss / FLAGS.steps_per_checkpoint
            current_step += 1

            # Once in a while, we save checkpoint, show statistics, and run tests.
            if current_step % FLAGS.steps_per_checkpoint == 0:
                # show statistics for the previous epoch.
                print("Step {0} learning rate {1} step-time {2} training loss {3}"\
                .format(model.global_step.eval(), round(model.learning_rate,4),
                   round(step_time, 4), round(train_loss,4)))
                # Decrease learning rate if no improvement was seen over last 3 times.
                #if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                #	sess.run(model.learning_rate_decay_op)
                previous_losses.append(train_loss)

                # Run tests on test set and show their perplexity.
                test_losses = []
                num_test_batches = int(len(test_set) / FLAGS.batch_size)
                for test_pointer in range(0, num_test_batches):
                    start_index = test_pointer * FLAGS.batch_size
                    inputs, targets, input_lengths, target_lengths =\
                     model.get_batch(test_set[start_index : start_index + FLAGS.batch_size])
                    test_loss = model.step(sess,
                                           inputs,
                                           targets,
                                           input_lengths,
                                           target_lengths,
                                           test_mode=True)
                    test_losses.append(test_loss)

                test_loss = float(np.mean(test_losses))

                print(" step: {0} test loss: {1}".format(
                    model.global_step.eval(), round(test_loss, 4)))
                # Save checkpoint and zero timer and loss.
                if test_loss < last_test_loss:
                    checkpoint_path = os.path.join(path, "chatbot")
                    model.saver.save(sess,
                                     checkpoint_path,
                                     global_step=model.global_step)
                last_test_loss = test_loss
                step_time, train_loss = 0.0, 0.0

                sys.stdout.flush()
Example #6
def main():
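	# Bucketed seq2seq training loop (old TF API): samples a bucket per step, logs TensorBoard
	# summaries, decays the learning rate when loss stops improving, and checkpoints regularly.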
	config.read(FLAGS.config_file)

	max_num_lines = int(config.get("max_data_sizes", "num_lines"))
	max_target_length = int(config.get("max_data_sizes", "max_target_length"))
	max_source_length = int(config.get("max_data_sizes", "max_source_length"))

	if not os.path.exists(FLAGS.checkpoint_dir):
		os.mkdir(FLAGS.checkpoint_dir)
	path = getCheckpointPath()
	print "path is {0}".format(path)
	data_processor = data_utils.DataProcessor(FLAGS.vocab_size,
		FLAGS.raw_data_dir,FLAGS.data_dir, FLAGS.train_frac, FLAGS.tokenizer,
		max_num_lines, max_target_length, max_source_length, FLAGS.is_discrete,
		FLAGS.extra_discrete_data)
	data_processor.run()
	#create model
	print "Creating model with..."
	print "Number of hidden layers: {0}".format(FLAGS.num_layers)
	print "Number of units per layer: {0}".format(FLAGS.hidden_size)
	print "Dropout: {0}".format(FLAGS.dropout)
	vocab_mapper = vocab_utils.VocabMapper(FLAGS.data_dir)
	vocab_size = vocab_mapper.getVocabSize()
	print "Vocab size is: {0}".format(vocab_size)
	FLAGS.vocab_size = vocab_size
	with tf.Session() as sess:
		writer = tf.train.SummaryWriter("/tmp/tb_logs_chatbot", sess.graph)
		model = createModel(sess, path, vocab_size)
		print "Using bucket sizes:"
		print _buckets
		#train model and save to checkpoint
		print "Beggining training..."
		print "Maximum number of epochs to train for: {0}".format(FLAGS.max_epoch)
		print "Batch size: {0}".format(FLAGS.batch_size)
		print "Starting learning rate: {0}".format(FLAGS.learning_rate)
		print "Learning rate decay factor: {0}".format(FLAGS.lr_decay_factor)

		source_train_file_path = data_processor.data_source_train
		target_train_file_path = data_processor.data_target_train
		source_test_file_path = data_processor.data_source_test
		target_test_file_path = data_processor.data_target_test
		print source_train_file_path
		print target_train_file_path

		train_set = readData(source_train_file_path, target_train_file_path,
			FLAGS.max_train_data_size)
		test_set = readData(source_test_file_path, target_test_file_path,
			FLAGS.max_train_data_size)

		train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
		print "bucket sizes = {0}".format(train_bucket_sizes)
		train_total_size = float(sum(train_bucket_sizes))

		train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
			for i in xrange(len(train_bucket_sizes))]
		step_time, loss = 0.0, 0.0
		current_step = 0
		previous_losses = []
		while True:
			# Choose a bucket according to data distribution. We pick a random number
			# in [0, 1] and use the corresponding interval in train_buckets_scale.
			random_number_01 = np.random.random_sample()
			bucket_id = min([i for i in xrange(len(train_buckets_scale))
					   if train_buckets_scale[i] > random_number_01])

			# Get a batch and make a step.
			start_time = time.time()
			encoder_inputs, decoder_inputs, target_weights = model.get_batch(
			train_set, bucket_id)
			_, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
				target_weights, bucket_id, False)
			step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
			loss += step_loss / FLAGS.steps_per_checkpoint
			current_step += 1

			# Once in a while, we save checkpoint, print statistics, and run evals.
			if current_step % FLAGS.steps_per_checkpoint == 0:
				train_loss_summary = tf.Summary()
				str_summary_train_loss = train_loss_summary.value.add()
				str_summary_train_loss.simple_value = loss
				str_summary_train_loss.tag = "train_loss"
				writer.add_summary(train_loss_summary, current_step)
				# Print statistics for the previous epoch.
				perplexity = math.exp(loss) if loss < 300 else float('inf')
				print ("global step %d learning rate %.4f step-time %.2f perplexity "
					"%.2f" % (model.global_step.eval(), model.learning_rate.eval(),
						 step_time, perplexity))
				# Decrease learning rate if no improvement was seen over last 3 times.
				if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
					sess.run(model.learning_rate_decay_op)
				previous_losses.append(loss)
				# Save checkpoint and zero timer and loss.
				checkpoint_path = os.path.join(path, "chatbot.ckpt")
				model.saver.save(sess, checkpoint_path, global_step=model.global_step)
				step_time, loss = 0.0, 0.0
				# Run evals on development set and print their perplexity.
				perplexity_summary = tf.Summary()
				eval_loss_summary = tf.Summary()
				for bucket_id in xrange(len(_buckets)):
					if len(test_set[bucket_id]) == 0:
						print("  eval: empty bucket %d" % (bucket_id))
						continue
					encoder_inputs, decoder_inputs, target_weights = model.get_batch(
						test_set, bucket_id)
					_, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
						target_weights, bucket_id, True)
					eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf')
					print("  eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))
					str_summary_ppx = perplexity_summary.value.add()
					str_summary_ppx.simple_value = eval_ppx
					str_summary_ppx.tag = "perplexity_bucket_%d" % bucket_id

					str_summary_eval_loss = eval_loss_summary.value.add()
					#need to convert from numpy.float32 to float native type
					str_summary_eval_loss.simple_value = float(eval_loss)
					str_summary_eval_loss.tag = "eval_loss_bucket_%d" % bucket_id
					writer.add_summary(perplexity_summary, current_step)
					writer.add_summary(eval_loss_summary, current_step)
				sys.stdout.flush()
Example #7
    def run(self):
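        # Pre-processing variant that also folds optional "extra discrete" conversation files
        # into the vocab and the token-id train/test files.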
        if not self.data_files_exist:
            print "Obtaining raw text conversation files..."
            text_files = self.getRawFileList(self.source_data_path)
            if self.extra_discrete_data != "":
                extra_files = self.getRawFileList(self.extra_discrete_data)
            else:
                extra_files = []
            # randomly shuffle order of files
            shuffle(text_files)
            num_train_files = int(self.train_frac * len(text_files))

        #create vocab file
        if not self.vocab_exists:
            vocab_builder = vocab_utils.VocabBuilder(self.max_vocab_size,
                                                     self.processed_data_path)
            print "Building vocab..."
            #loop through continuous/discrete data
            for text_file in text_files:
                with open(text_file, "r+") as f:
                    vocab_builder.growVocab(f.read())
            #loop through extra discrete data
            for text_file in extra_files:
                with open(text_file, "r+") as f:
                    vocab_builder.growVocab(f.read())
            print "Creating vocab file..."
            vocab_builder.createVocabFile()

        if not self.data_files_exist:
            self.vocab_mapper = vocab_utils.VocabMapper(
                self.processed_data_path)
            #create source and target token id files
            processes = []
            print "Creating token id data source and target train files..."

            if len(text_files) == 1:
                num_train_files = 1
                text_files = self.splitSingle2Many(text_files[0],
                                                   self.train_frac)
            if len(extra_files) == 1:
                num_extra_files = 1
                extra_files = self.splitSingle2Many(extra_files[0],
                                                    self.train_frac)
            else:
                num_extra_files = len(extra_files)

            p1 = Process(target=self.loopParseTextFiles,
                         args=([text_files[:num_train_files]], True,
                               self.is_discrete))
            p1.start()
            processes.append(p1)
            print "Creating token id data source and target test files..."
            print "This is going to take a while..."
            p2 = Process(target=self.loopParseTextFiles,
                         args=([text_files[num_train_files:]], False,
                               self.is_discrete))
            p2.start()
            processes.append(p2)

            for p in processes:
                if p.is_alive():
                    p.join()

            if len(extra_files) > 0:
                p2 = Process(target=self.loopParseTextFiles,
                             args=([extra_files[num_extra_files:]], False,
                                   True))
                p2.start()
                processes.append(p2)
                p1 = Process(target=self.loopParseTextFiles,
                             args=([extra_files[:num_extra_files]], True,
                                   True))
                p1.start()
                processes.append(p1)

            for p in processes:
                if p.is_alive():
                    p.join()
            print "Done data pre-processing..."