示例#1
0
def decoder(params):

	data_path = params['data_path']
	glove_embed = params['glove_embedding']
	oov_embed = params['oov_embedding']
	preprocessed_data = params['preprocessed_data']
	decode_path = params['decode_path']
	model_path = params['model_path']
	result_path = params['result_path']
	file_name = params['file_name']
	weights = params['weights']

	encoder_length = params['encoder_length']
	decoder_length = params['decoder_length']
	max_sents = params['max_sents']
	embedding_dim = params['embedding_dim']
	birnn_dim = params['birnn_dim']
	rnn_dim = params['rnn_dim']
	vocab_size = params['vocab_size']
	batch_size = params['batch_size']
	epoch = params['epoch']
	

	'''
	Reading vocabulary dictionaries

	'''
	indices_words_connector = DataConnector(preprocessed_data, 'all_idxword_vocabulary_sent_fsoftmax.pkl', data=None)
	indices_words_connector.read_pickle()
	indices_words = indices_words_connector.read_file

	words_indices_connector = DataConnector(preprocessed_data, 'all_wordidx_vocabulary_sent_fsoftmax.pkl', data=None)
	words_indices_connector.read_pickle()
	words_indices = words_indices_connector.read_file

	y_test_true_connector = DataConnector(data_path, 'test_sent_output_tokens.npy', data=None)
	y_test_true_connector.read_numpys()
	y_test_true = y_test_true_connector.read_file


	# non-paired data set


	X_test_connector = DataConnector(preprocessed_data, 'X_test_pad_sent_fsoftmax.npy', data=None)
	X_test_connector.read_numpys()
	X_test = X_test_connector.read_file

	


	'''
	Decoder model for inference stage
	Return: generated keyphrases
	'''

	glove_embedding_conn = DataConnector(preprocessed_data, glove_embed, data=None)
	glove_embedding_conn.read_pickle()
	pretrained_embedding = glove_embedding_conn.read_file

	print("pretrained_embedding shape: %s"%str(pretrained_embedding.shape))
	print("pretrained_embedding [0][:10]: %s"%str(pretrained_embedding[0,:10]))
	print("pretrained_embedding [1][:10]: %s"%str(pretrained_embedding[1,:10]))

	oov_embedding_conn = DataConnector(preprocessed_data, oov_embed, data=None)
	oov_embedding_conn.read_pickle()
	oov_embedding = oov_embedding_conn.read_file

	print("oov_embedding shape: %s"%str(oov_embedding.shape))
	print("oov_embedding [0][:10]: %s"%str(oov_embedding[0,:10]))
	print("oov_embedding [1][:10]: %s"%str(oov_embedding[1,:10]))
	print("oov_embedding [2][:10]: %s"%str(oov_embedding[2,:10]))

	full_softmax = HierarchyFullSoftmax(encoder_length=encoder_length, decoder_length=decoder_length, max_sents=max_sents, embedding_dim=embedding_dim, birnn_dim=birnn_dim, rnn_dim=rnn_dim, vocab_size=vocab_size, filepath=result_path, filename=file_name, batch_train_iter=None, batch_val_iter=None, batch_size=None, steps_epoch=None, val_steps=None, epochs=None)

	# skeleton of model architecture
	full_softmax.train_hier_seq2seq(pretrained_embedding, oov_embedding)
	encoder_model = full_softmax.encoder_model

	predict_softmax_model = full_softmax.predict_seq2seq(weights)
	
	decoder_model = full_softmax.create_decoder_model()

	
	# transform tokenized y_true (ground truth of keyphrases) into full sentences / keyphrases
	keyphrases_transform =  TrueKeyphrases(y_test_true)
	keyphrases_transform.get_true_keyphrases()
	keyphrases_transform.get_stat_keyphrases()
	y_true = keyphrases_transform.y_true
	max_kp_num = keyphrases_transform.max_kp_num
	mean_kp_num = keyphrases_transform.mean_kp_num
	std_kp_num = keyphrases_transform.std_kp_num

	print("Maximum number of key phrases per document in corpus: %s" %max_kp_num)
	sys.stdout.flush()
	print("Average number of key phrases per document in corpus: %s" %mean_kp_num)
	sys.stdout.flush()
	print("Standard Deviation of number of key phrases per document in corpus: %s" %std_kp_num)
	sys.stdout.flush()

	# round up function for computing beam width 
	def roundup(x):
		return x if x % 5 == 0 else x + 5 - x % 5

	beam_width = int(roundup(mean_kp_num + (3 * std_kp_num)))
	print("\nBeam width: %s\n" %beam_width)
	sys.stdout.flush()
	num_hypotheses = beam_width

	print(str(datetime.now()))
	sys.stdout.flush()

	t0 = time.time()
	print("Start decoding...")
	sys.stdout.flush()

	inference_mode = DecodingSoftmax(encoder_model=encoder_model, decoder_model=decoder_model, indices_words=indices_words, words_indices=words_indices, enc_in_seq=None, decoder_length=decoder_length, rnn_dim=rnn_dim, beam_width=beam_width, num_hypotheses=num_hypotheses, filepath=decode_path, filename=file_name)

	
	t0_1 = time.time()
	print("Start beam decoding...")
	sys.stdout.flush()

	beam_keyphrases = inference_mode.beam_decoder(X_test[:500])
	
	beam_decode_connector = DataConnector(decode_path, 'beam_kp-%s.npy'%(file_name), beam_keyphrases)
	beam_decode_connector.save_numpys()

	t1_1 = time.time()
	print("Beam decoding is done in %.3fsec" % (t1_1 - t0_1))
	sys.stdout.flush()
def decoder(params):

	data_path = params['data_path']
	kp20k_path = params['kp20k_path']
	glove_embed = params['glove_embedding']
	oov_embed = params['oov_embedding']
	preprocessed_v2 = params['preprocessed_v2']
	preprocessed_data = params['preprocessed_data']
	decode_path = params['decode_path']
	model_path = params['model_path']
	result_path = params['result_path']
	result_kp20k = params['result_kp20k']
	file_name = params['file_name']
	weights = params['weights']

	encoder_length = params['encoder_length']
	decoder_length = params['decoder_length']
	max_sents = params['max_sents']
	embedding_dim = params['embedding_dim']
	birnn_dim = params['birnn_dim']
	rnn_dim = params['rnn_dim']
	vocab_size = params['vocab_size']
	batch_size = params['batch_size']
	epoch = params['epoch']
	

	'''
	Reading vocabulary dictionaries

	'''
	indices_words_connector = DataConnector(preprocessed_v2, 'all_idxword_vocabulary_sent_fsoftmax.pkl', data=None)
	indices_words_connector.read_pickle()
	indices_words = indices_words_connector.read_file

	words_indices_connector = DataConnector(preprocessed_v2, 'all_wordidx_vocabulary_sent_fsoftmax.pkl', data=None)
	words_indices_connector.read_pickle()
	words_indices = words_indices_connector.read_file

	## merge all set into one test set for trained model

	train_outputs_conn = DataConnector(data_path, 'train_sent_output_tokens.npy', data=None)
	train_outputs_conn.read_numpys()
	train_outputs = train_outputs_conn.read_file


	test_outputs_conn = DataConnector(data_path, 'test_sent_output_tokens.npy', data=None)
	test_outputs_conn.read_numpys()
	test_outputs = test_outputs_conn.read_file

	y_test_true = np.concatenate((train_outputs, test_outputs))

	print("Ground truth of keyphrases shape: %s"%str(y_test_true.shape)) # input for encoder
	sys.stdout.flush()

	

	# non-paired data set


	X_train_connector = DataConnector(preprocessed_data, 'X_train_pad_sent_fsoftmax.npy', data=None)
	X_train_connector.read_numpys()
	X_train = X_train_connector.read_file

	
	X_test_connector = DataConnector(preprocessed_data, 'X_test_pad_sent_fsoftmax.npy', data=None)
	X_test_connector.read_numpys()
	X_test = X_test_connector.read_file

	X_in = np.concatenate((X_train, X_test))

	print("\n Non-paired test set: \n")
	sys.stdout.flush()
	print("X (input for encoder) shape: %s"%str(X_in.shape)) # input for encoder
	sys.stdout.flush()
	

	'''
	Decoder model for inference stage
	Return: generated keyphrases
	'''

	glove_embedding_conn = DataConnector(preprocessed_v2, glove_embed, data=None)
	glove_embedding_conn.read_pickle()
	pretrained_embedding = glove_embedding_conn.read_file

	print("pretrained_embedding shape: %s"%str(pretrained_embedding.shape))
	print("pretrained_embedding [0][:10]: %s"%str(pretrained_embedding[0,:10]))
	print("pretrained_embedding [1][:10]: %s"%str(pretrained_embedding[1,:10]))

	oov_embedding_conn = DataConnector(preprocessed_v2, oov_embed, data=None)
	oov_embedding_conn.read_pickle()
	oov_embedding = oov_embedding_conn.read_file

	print("oov_embedding shape: %s"%str(oov_embedding.shape))
	print("oov_embedding [0][:10]: %s"%str(oov_embedding[0,:10]))
	print("oov_embedding [1][:10]: %s"%str(oov_embedding[1,:10]))
	print("oov_embedding [2][:10]: %s"%str(oov_embedding[2,:10]))

	full_softmax = HierarchyFullSoftmax(encoder_length=encoder_length, decoder_length=decoder_length, max_sents=max_sents, embedding_dim=embedding_dim, birnn_dim=birnn_dim, rnn_dim=rnn_dim, vocab_size=vocab_size, filepath=result_kp20k, filename=file_name, batch_train_iter=None, batch_val_iter=None, batch_size=None, steps_epoch=None, val_steps=None, epochs=None)

	# skeleton of model architecture
	full_softmax.train_hier_seq2seq(pretrained_embedding, oov_embedding)
	

	'''
	Model for retrieving softmax probability
	Return: softmax probability of prediction layer
	'''

	full_softmax.predict_seq2seq(weights)
	encoder_model = full_softmax.encoder_model
	# 1. Prediction model after being trained on sampled softmax setting
	predict_softmax_model = full_softmax.prediction_model

	'''

	Inference stage
	Model: layers from prediction model and decoder model
	Inference (text generation) approach: 
	1. One best search decoding (Greedy search): 
	   Return one best (top) probable word sequence, from joint probability of words within decoder time steps (decoder sequence length)
	2. N-Beam search decoding: 
	   Return N-top best most probable word sequences, by utilizing beam tree search per time steps and joint probability within decoder time steps (decoder sequence length)

	'''

	decoder_model = full_softmax.create_decoder_model()

	
	# transform tokenized y_true (ground truth of keyphrases) into full sentences / keyphrases
	keyphrases_transform =  TrueKeyphrases(y_test_true)
	keyphrases_transform.get_true_keyphrases()
	keyphrases_transform.get_stat_keyphrases()
	y_true = keyphrases_transform.y_true
	max_kp_num = keyphrases_transform.max_kp_num
	mean_kp_num = keyphrases_transform.mean_kp_num
	std_kp_num = keyphrases_transform.std_kp_num

	print("Maximum number of key phrases per document in corpus: %s" %max_kp_num)
	sys.stdout.flush()
	print("Average number of key phrases per document in corpus: %s" %mean_kp_num)
	sys.stdout.flush()
	print("Standard Deviation of number of key phrases per document in corpus: %s" %std_kp_num)
	sys.stdout.flush()

	# round up function for computing beam width 
	def roundup(x):
		return x if x % 5 == 0 else x + 5 - x % 5

	beam_width = int(roundup(mean_kp_num + (3 * std_kp_num)))
	num_hypotheses = beam_width
	print("\nBeam width: %s\n" %beam_width)
	sys.stdout.flush()


	inference_mode = DecodingSoftmax(encoder_model=encoder_model, decoder_model=decoder_model, indices_words=indices_words, words_indices=words_indices, enc_in_seq=None, decoder_length=decoder_length, rnn_dim=rnn_dim, beam_width=beam_width, num_hypotheses=num_hypotheses, filepath=decode_path, filename=file_name)

	t0_1 = time.time()
	print("Start beam decoding...")
	sys.stdout.flush()

	beam_keyphrases = inference_mode.beam_decoder(X_in)
	
	beam_decode_connector = DataConnector(decode_path, 'beam_kp-hier-%s.npy'%(file_name), beam_keyphrases)
	beam_decode_connector.save_numpys()

	t1_1 = time.time()
	print("Beam decoding is done in %.3fsec" % (t1_1 - t0_1))
	sys.stdout.flush()
示例#3
0
def decoder(params):

	data_path = params['data_path']
	preprocessed_v2 = params['preprocessed_v2']
	preprocessed_data = params['preprocessed_data']
	decode_path = params['decode_path']
	model_path = params['model_path']
	result_path = params['result_path']
	result_kp20k = params['result_kp20k']
	file_name = params['file_name']
	weights = params['weights']

	encoder_length = params['encoder_length']
	decoder_length = params['decoder_length']
	embedding_dim = params['embedding_dim']
	birnn_dim = params['birnn_dim']
	rnn_dim = params['rnn_dim']
	vocab_size = params['vocab_size']
	batch_size = params['batch_size']
	epoch = params['epoch']

	'''
	Reading vocabulary dictionaries

	'''
	indices_words_connector = DataConnector(preprocessed_v2, 'all_indices_words_fsoftmax.pkl', data=None)
	indices_words_connector.read_pickle()
	indices_words = indices_words_connector.read_file

	words_indices_connector = DataConnector(preprocessed_v2, 'all_words_indices_fsoftmax.pkl', data=None)
	words_indices_connector.read_pickle()
	words_indices = words_indices_connector.read_file

	## merge all set into one test set for trained model

	train_outputs_conn = DataConnector(data_path, 'train_output_tokens.npy', data=None)
	train_outputs_conn.read_numpys()
	train_outputs = train_outputs_conn.read_file

	valid_outputs_conn = DataConnector(data_path, 'val_output_tokens.npy', data=None)
	valid_outputs_conn.read_numpys()
	valid_outputs = valid_outputs_conn.read_file

	test_outputs_conn = DataConnector(data_path, 'test_output_tokens.npy', data=None)
	test_outputs_conn.read_numpys()
	test_outputs = test_outputs_conn.read_file

	y_test_true = np.concatenate((train_outputs, valid_outputs, test_outputs))

	print("Ground truth of keyphrases shape: %s"%str(y_test_true.shape)) # input for encoder
	sys.stdout.flush()


	# non-paired data set

	X_train_connector = DataConnector(preprocessed_data, 'X_train_pad_fsoftmax.npy', data=None)
	X_train_connector.read_numpys()
	X_train = X_train_connector.read_file

	
	X_valid_connector = DataConnector(preprocessed_data, 'X_valid_pad_fsoftmax.npy', data=None)
	X_valid_connector.read_numpys()
	X_valid = X_valid_connector.read_file

	
	X_test_connector = DataConnector(preprocessed_data, 'X_test_pad_fsoftmax.npy', data=None)
	X_test_connector.read_numpys()
	X_test = X_test_connector.read_file


	X_in = np.concatenate((X_train, X_valid, X_test))
	
	print("\n Non-paired test set: \n")
	sys.stdout.flush()
	print("X (input for encoder) shape: %s"%str(X_in.shape)) # input for encoder
	sys.stdout.flush()

	full_softmax = FullSoftmax(encoder_length=encoder_length, decoder_length=decoder_length, embedding_dim=embedding_dim, birnn_dim=birnn_dim, rnn_dim=rnn_dim, vocab_size=vocab_size, filepath=result_kp20k, filename=file_name, batch_train_iter=None, batch_val_iter=None, batch_size=None, steps_epoch=None, val_steps=None, epochs=None)

	full_softmax.train_seq2seq()
	predict_softmax_model = full_softmax.predict_seq2seq(weights)
	encoder_model = full_softmax.encoder_model
	

	decoder_model = full_softmax.create_decoder_model()
	
	# transform tokenized y_true (ground truth of keyphrases) into full sentences / keyphrases
	keyphrases_transform =  TrueKeyphrases(y_test_true)
	keyphrases_transform.get_true_keyphrases()
	keyphrases_transform.get_stat_keyphrases()
	y_true = keyphrases_transform.y_true
	max_kp_num = keyphrases_transform.max_kp_num
	mean_kp_num = keyphrases_transform.mean_kp_num
	std_kp_num = keyphrases_transform.std_kp_num

	print("Maximum number of key phrases per document in corpus: %s" %max_kp_num)
	sys.stdout.flush()
	print("Average number of key phrases per document in corpus: %s" %mean_kp_num)
	sys.stdout.flush()
	print("Standard Deviation of number of key phrases per document in corpus: %s" %std_kp_num)
	sys.stdout.flush()

	# round up function for computing beam width 
	def roundup(x):
		return x if x % 5 == 0 else x + 5 - x % 5

	beam_width = int(roundup(mean_kp_num + (3 * std_kp_num)))
	num_hypotheses = beam_width
	print("\nBeam width: %s\n" %beam_width)
	sys.stdout.flush()


	inference_mode = DecodingSoftmax(encoder_model=encoder_model, decoder_model=decoder_model, indices_words=indices_words, words_indices=words_indices, enc_in_seq=None, decoder_length=decoder_length, rnn_dim=rnn_dim, beam_width=beam_width, num_hypotheses=num_hypotheses, filepath=decode_path, filename=file_name)


	t0_1 = time.time()
	print("Start beam decoding...")
	sys.stdout.flush()

	beam_keyphrases = inference_mode.beam_decoder(X_in[:500])
	
	beam_decode_connector = DataConnector(decode_path, 'beam_kp-%s.npy'%(file_name), beam_keyphrases)
	beam_decode_connector.save_numpys()

	t1_1 = time.time()
	print("Beam decoding is done in %.3fsec" % (t1_1 - t0_1))
	sys.stdout.flush()