def load_testing_data():
    Dp.establish_db_connection()
    testing_dataset = Dp.load_dataset_by_type("testing")

    x = Dp.load_nn_input_dataset_string(testing_dataset[:, [0, 6]])
    y = Dp.load_nn_labels_dataset_string(testing_dataset[:, [0, 1]])
    sent_num, sen_len = Dp.load_nn_seq_lengths(testing_dataset[:, [3]])
    sentences_padded, vocabulary, vocabulary_inv = Dp.pad_sentences(x, sen_len, 4, 10)

    return sentences_padded, y, vocabulary, vocabulary_inv
def get_intent(sentence):
    x_ = load_test_data(sentence)
    # note
    x_ = data_helper.pad_sentences(x_, forced_sequence_length=param_length)
    x_ = map_word_to_index(x_, words_index)
    x_test = np.asarray(x_)

    predict_labels = []
    batch_predictions = predict_step(x_test)[0]
    for batch_prediction in batch_predictions:
        predict_labels.append(labels[batch_prediction])
    return predict_labels
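# map_word_to_index is assumed by get_intent and the prediction functions below
# but is not shown here. A minimal sketch of what it likely does, given the call
# sites (the fallback of unknown words to index 0 is an assumption, not
# confirmed by the source):
def map_word_to_index(examples, words_index):
    x_ = []
    for example in examples:
        temp = []
        for word in example:
            if word in words_index:
                temp.append(words_index[word])
            else:
                temp.append(0)  # assumed: out-of-vocabulary words map to index 0
        x_.append(temp)
    return x_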
def load_training_data():
    dp.establish_db_connection()
    training_dataset = DBHelperMethod.load_dataset_by_type("training")

    # x = dp.load_nn_input_dataset_string(training_dataset[:, [0, 6]])
    x = dp.load_nn_input_dataset_string_space_only(training_dataset[:, [0, 6]])
    y = dp.load_nn_labels_dataset_string(training_dataset[:, [0, 1]])
    sent_num, sen_len = dp.load_nn_seq_lengths(training_dataset[:, [3]])
    sentences_padded, vocabulary, vocabulary_inv = dp.pad_sentences(x, sen_len, 4, 10)

    return sentences_padded, y, vocabulary, vocabulary_inv
def load_testing_data():
    dp.establish_db_connection()
    testing_dataset = dp.load_testing_dataset()

    x = dp.load_nn_input_dataset_string(testing_dataset[:, [0, 6]])
    y = dp.load_nn_labels_dataset_string(testing_dataset[:, [0, 1]])
    sent_num, sen_len = dp.load_nn_seq_lengths(testing_dataset[:, [3]])
    sentences_padded, vocabulary, vocabulary_inv = dp.pad_sentences(x, sen_len, 4, 10)

    testing_words = np.take(testing_dataset, 4, axis=1)
    input_testing_letters = np.take(testing_dataset, 0, axis=1)
    op_testing_letters = np.take(testing_dataset, 5, axis=1)
    sent_num = np.take(testing_dataset, 3, axis=1)
    letters_loc = np.take(testing_dataset, 6, axis=1)
    undiac_word = np.take(testing_dataset, 7, axis=1)

    return sentences_padded, y, vocabulary, vocabulary_inv, testing_words, input_testing_letters, \
        op_testing_letters, sent_num, letters_loc, undiac_word
def predict_unseen_data():
    trained_dir = sys.argv[1]
    if not trained_dir.endswith('/'):
        trained_dir += '/'
    test_file = sys.argv[2]

    params, words_index, labels, embedding_mat = load_trained_params(trained_dir)
    x_, y_, df = load_test_data(test_file, labels)
    x_ = data_helper.pad_sentences(x_, forced_sequence_length=params['sequence_length'])
    x_ = map_word_to_index(x_, words_index)

    x_test, y_test = np.asarray(x_), None
    if y_ is not None:
        y_test = np.asarray(y_)

    timestamp = trained_dir.split('/')[-2].split('_')[-1]
    predicted_dir = './predicted_results_' + timestamp + '/'
    if os.path.exists(predicted_dir):
        shutil.rmtree(predicted_dir)
    os.makedirs(predicted_dir)

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(
                embedding_mat=embedding_mat,
                non_static=params['non_static'],
                hidden_unit=params['hidden_unit'],
                sequence_length=len(x_test[0]),
                max_pool_size=params['max_pool_size'],
                filter_sizes=map(int, params['filter_sizes'].split(",")),
                num_filters=params['num_filters'],
                num_classes=len(labels),
                embedding_size=params['embedding_dim'],
                l2_reg_lambda=params['l2_reg_lambda'])

            def real_len(batches):
                # position of the first pad token (index 0), scaled by the pooling size
                return [np.ceil(np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                        for batch in batches]

            def predict_step(x_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.dropout_keep_prob: 1.0,  # no dropout at inference time
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                predictions = sess.run([cnn_rnn.predictions], feed_dict)
                return predictions

            checkpoint_file = trained_dir + 'best_model.ckpt'
            saver = tf.train.Saver(tf.all_variables())
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)
            logging.critical('{} has been loaded'.format(checkpoint_file))

            batches = data_helper.batch_iter(list(x_test), params['batch_size'], 1, shuffle=False)

            predictions, predict_labels = [], []
            for x_batch in batches:
                batch_predictions = predict_step(x_batch)[0]
                for batch_prediction in batch_predictions:
                    predictions.append(batch_prediction)
                    predict_labels.append(labels[batch_prediction])

            df['PREDICTED'] = predict_labels
            columns = sorted(df.columns, reverse=True)
            df.to_csv(predicted_dir + 'predictions_all.csv', index=False, columns=columns, sep='|')

            if y_test is not None:
                y_test = np.array(np.argmax(y_test, axis=1))
                accuracy = sum(np.array(predictions) == y_test) / float(len(y_test))
                logging.critical('The prediction accuracy is: {}'.format(accuracy))

            logging.critical('Prediction is complete, all files have been saved: {}'.format(predicted_dir))
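# load_trained_params is assumed to read back the artifacts saved at training
# time (hyperparameters, vocabulary index, label set, embedding matrix). A
# plausible sketch; the file names below are assumptions about the trained_dir
# layout, not confirmed by the source:
import json
import pickle
import numpy as np

def load_trained_params(trained_dir):
    params = json.loads(open(trained_dir + 'trained_parameters.json').read())
    words_index = json.loads(open(trained_dir + 'words_index.json').read())
    labels = json.loads(open(trained_dir + 'labels.json').read())
    with open(trained_dir + 'embeddings.pickle', 'rb') as input_file:
        fetched_embedding = pickle.load(input_file)
    embedding_mat = np.array(fetched_embedding, dtype=np.float32)
    return params, words_index, labels, embedding_mat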
if __name__ == '__main__':
    print('please enter "exit" to exit.')
    logging.critical('The maximum length is {}'.format(param_length))

    while True:
        x_ = load_test_data()
        if x_ == -1:
            break
        # note
        x_ = data_helper.pad_sentences(x_, forced_sequence_length=param_length)
        x_ = map_word_to_index(x_, words_index)
        x_test = np.asarray(x_)

        predict_labels = []
        batch_predictions = predict_step(x_test)[0]
        for batch_prediction in batch_predictions:
            predict_labels.append(labels[batch_prediction])
        print('The intent is [{}]'.format(predict_labels[0]))
def predict():
    # trained_dir = sys.argv[1]
    trained_dir = './trained_results_1575177514/'
    # if not trained_dir.endswith('/'):
    #     trained_dir += '/'
    # test_file = sys.argv[2]

    if request.method == 'POST':
        file = request.files['ReceivedFile']
        logging.critical('Received Filename from App: {}'.format(file))
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
            test_file = './uploads/' + filename
            df = pd.read_csv(test_file)
            text = df[['Descript']]
            params, words_index, labels, embedding_mat = load_trained_params(trained_dir)
            flag = 0
            x_, y_, df = load_test_data(test_file, labels, flag)
            x_ = data_helper.pad_sentences(x_, forced_sequence_length=params['sequence_length'])
            x_ = map_word_to_index(x_, words_index)
        else:
            # text = request.json["text"]
            text = request.form["query"]
            params, words_index, labels, embedding_mat = load_trained_params(trained_dir)
            flag = 1
            x_, y_, df = load_test_data(text, labels, flag)
            x_ = data_helper.pad_sentences(x_, forced_sequence_length=params['sequence_length'])
            x_ = map_word_to_index(x_, words_index)

    x_test, y_test = np.asarray(x_), None
    if y_ is not None:
        y_test = np.asarray(y_)

    timestamp = trained_dir.split('/')[-2].split('_')[-1]
    predicted_dir = './predicted_results_' + timestamp + '/'
    if os.path.exists(predicted_dir):
        shutil.rmtree(predicted_dir)
    os.makedirs(predicted_dir)

    with tf.Graph().as_default():
        session_conf = tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.compat.v1.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(
                embedding_mat=embedding_mat,
                non_static=params['non_static'],
                hidden_unit=params['hidden_unit'],
                sequence_length=len(x_test[0]),
                max_pool_size=params['max_pool_size'],
                filter_sizes=map(int, params['filter_sizes'].split(",")),
                num_filters=params['num_filters'],
                num_classes=len(labels),
                embedding_size=params['embedding_dim'],
                l2_reg_lambda=params['l2_reg_lambda'])

            def real_len(batches):
                return [np.ceil(np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                        for batch in batches]

            def predict_step(x_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.dropout_keep_prob: 1.0,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                predictions = sess.run([cnn_rnn.predictions], feed_dict)
                return predictions

            checkpoint_file = trained_dir + 'best_model.ckpt'
            saver = tf.compat.v1.train.Saver(tf.compat.v1.all_variables())
            saver = tf.compat.v1.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)
            logging.critical('{} has been loaded'.format(checkpoint_file))

            batches = data_helper.batch_iter(list(x_test), params['batch_size'], 1, shuffle=False)

            predictions, predict_labels = [], []
            for x_batch in batches:
                batch_predictions = predict_step(x_batch)[0]
                for batch_prediction in batch_predictions:
                    predictions.append(batch_prediction)
                    predict_labels.append(labels[batch_prediction])

            logging.critical('Prediction is complete Class belongs to: {}'.format(predict_labels[0]))
            # if os.path.exists(test_file):
            #     os.remove(test_file)

    return render_template('results.html', prediction=predict_labels[0], name=text)
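# allowed_file is not shown in the snippet above. A minimal sketch following the
# standard Flask file-upload pattern; restricting uploads to CSV is an
# assumption based on the pd.read_csv call:
ALLOWED_EXTENSIONS = {'csv'}

def allowed_file(filename):
    # accept only filenames with an extension in the whitelist
    return '.' in filename and \
        filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS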
def test_data():
    trained_dir = sys.argv[1]
    if not trained_dir.endswith('/'):
        trained_dir += '/'
    test_file = sys.argv[2]

    params, words_index, labels, embedding_mat = load_trained_params(trained_dir)
    x_, y_, df = load_test_data(test_file, labels)
    x_ = data_helper.pad_sentences(x_, forced_sequence_length=params['sequence_length'])
    x_ = map_word_to_index(x_, words_index)

    x_test, y_test = np.asarray(x_), None
    if y_ is not None:
        y_test = np.asarray(y_)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # logging.info(params)
            cnn_rnn = TextCNNRNN(
                embedding_mat=embedding_mat,
                non_static=params['non_static'],
                hidden_unit=params['hidden_unit'],
                sequence_length=len(x_test[0]),
                max_pool_size=params['max_pool_size'],
                filter_sizes=map(int, params['filter_sizes'].split(",")),
                num_filters=params['num_filters'],
                num_classes=len(labels),
                embedding_size=params['embedding_dim'],
                l2_reg_lambda=params['l2_reg_lambda'])

            def real_len(batches):
                return [np.ceil(np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                        for batch in batches]

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: 1.0,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                loss, num_correct, predictions = sess.run(
                    [cnn_rnn.loss, cnn_rnn.num_correct, cnn_rnn.predictions], feed_dict)
                return loss, num_correct, predictions

            checkpoint_file = trained_dir + 'model-4'
            saver = tf.train.Saver(tf.global_variables())
            logging.info(checkpoint_file)
            saver.restore(sess, checkpoint_file)
            logging.critical('{} has been loaded'.format(checkpoint_file))

            total_test_correct = 0
            loss, num_test_correct, predictions = dev_step(x_test, y_test)
            logging.info(num_test_correct)
            logging.info(predictions)
            total_test_correct += int(num_test_correct)

            logging.critical('Accuracy on test set: {}'.format(float(total_test_correct) / len(y_test)))
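# Example invocation of test_data() (the script name and paths are
# illustrative only; the function reads the trained-results directory and the
# test file from sys.argv):
#   python test.py ./trained_results_<timestamp>/ ./data/test_file.csv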
def predict_raw_data(argv):
    ###################################################################
    # Make sure to enter the trained_dir you want to load.            #
    #                                                                 #
    # At predicted_dir there is a predict_labels.txt, which stores    #
    # the prediction results for the test file.                       #
    ###################################################################
    in_file = ''
    out_file = ''
    try:
        opts, args = getopt.getopt(argv, "h:t:i:o:",
                                   ["trained_dir=", "in_filepath=", "out_filepath="])
    except getopt.GetoptError:
        print("python main.py -i <in_filepath> -o <out_filepath> -t <trained_dir>")
        sys.exit(2)

    trained_dir = './trained_results/'
    for opt, arg in opts:
        if opt == '-h':
            print("python main.py -i <in_filepath> -o <out_filepath>")
            sys.exit()
        elif opt in ("-i", "--in_filepath"):
            in_file = arg
        elif opt in ("-o", "--out_filepath"):
            out_file = arg
        elif opt in ("-t", "--trained_dir"):
            trained_dir = arg

    params, words_index, labels, embedding_mat = load_trained_params(trained_dir)
    original_x, x_, y_ = load_test_data(in_file)
    x_ = data_helper.pad_sentences(x_, forced_sequence_length=params['sequence_length'])
    x_ = map_word_to_index(x_, words_index)

    x_test, y_test = np.asarray(x_), None
    if y_ is not None:
        y_test = np.asarray(y_)

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(
                embedding_mat=embedding_mat,
                non_static=params['non_static'],
                hidden_unit=params['hidden_unit'],
                sequence_length=len(x_test[0]),
                max_pool_size=params['max_pool_size'],
                filter_sizes=map(int, params['filter_sizes'].split(",")),
                num_filters=params['num_filters'],
                num_classes=len(labels),
                embedding_size=params['embedding_dim'],
                l2_reg_lambda=params['l2_reg_lambda'])

            def real_len(batches):
                return [np.ceil(np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                        for batch in batches]

            def predict_step(x_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.dropout_keep_prob: 1.0,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                predictions, scores = sess.run([cnn_rnn.predictions, cnn_rnn.scores], feed_dict)
                return predictions, scores

            checkpoint_file = trained_dir + 'best_model.ckpt'
            saver = tf.train.Saver(tf.global_variables())
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)
            logging.critical('{} has been loaded'.format(checkpoint_file))

            batches = data_helper.batch_iter(list(x_test), params['batch_size'], 1, shuffle=False)

            predictions, predict_labels, predict_probs = [], [], []
            for x_test_batch in batches:
                # run the graph once per batch; predict_step returns (predictions, scores)
                batch_predictions, batch_prop_preds = predict_step(x_test_batch)
                for batch_prediction, batch_prop_pred in zip(batch_predictions, batch_prop_preds):
                    predictions.append(batch_prediction)
                    predict_labels.append(labels[batch_prediction])
                    predict_probs.append(batch_prop_pred[batch_prediction])

            with open(out_file, "w", encoding='utf-8') as f:
                for original_x_, predict_label, predict_prob in zip(original_x, predict_labels, predict_probs):
                    print_prob = round(predict_prob * 100, 2)
                    f.write(str(original_x_) + '\t' + str(predict_label) + '\t' + str(print_prob) + '\n')

            if y_test is not None:
                y_test = np.array(np.argmax(y_test, axis=1))
                accuracy = sum(np.array(predictions) == y_test) / float(len(y_test))
                logging.critical('The prediction accuracy is: {}'.format(accuracy))

            logging.critical('Prediction is complete')
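# data_helper.batch_iter is used throughout to feed the graph in fixed-size
# chunks. A minimal sketch of a compatible generator; the signature is inferred
# from the call sites, and the shuffle handling is an assumption:
import numpy as np

def batch_iter(data, batch_size, num_epochs, shuffle=True):
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
    for epoch in range(num_epochs):
        if shuffle:
            # reshuffle once per epoch
            shuffled_data = data[np.random.permutation(np.arange(data_size))]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start = batch_num * batch_size
            end = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start:end]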
def predict_unseen_data():
    test_x = []
    # test_input = os.environ.get('TEST_X', None)
    test_input = "What time is the class"
    if test_input is None:
        logging.critical(' TEST_X is not found ')
        sys.exit()
    test_x.append(test_input.split(' '))

    trained_dir = "trained_results_1512435063"  # os.environ.get('TRAINED_RESULTS', None)
    if trained_dir is None:
        logging.critical(' TRAINED_RESULTS is not found ')
        sys.exit()
    if not trained_dir.endswith('/'):
        trained_dir += '/'

    x_ = data_helper.pad_sentences(test_x, forced_sequence_length=params['sequence_length'])
    x_ = map_word_to_index(x_, words_index)
    x_test, y_test = np.asarray(x_), None

    timestamp = trained_dir.split('/')[-2].split('_')[-1]

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(
                embedding_mat=embedding_mat,
                non_static=params['non_static'],
                hidden_unit=params['hidden_unit'],
                sequence_length=len(x_test[0]),
                max_pool_size=params['max_pool_size'],
                filter_sizes=map(int, params['filter_sizes'].split(",")),
                num_filters=params['num_filters'],
                num_classes=len(labels),
                embedding_size=params['embedding_dim'],
                l2_reg_lambda=params['l2_reg_lambda'])

            def real_len(batches):
                return [np.ceil(np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                        for batch in batches]

            def predict_step(x_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.dropout_keep_prob: 1.0,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                scores, predictions = sess.run([cnn_rnn.scores, cnn_rnn.predictions], feed_dict)
                return scores, predictions

            checkpoint_file = trained_dir + 'best_model.ckpt'
            saver = tf.train.Saver(tf.all_variables())
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)
            logging.critical('{} has been loaded'.format(checkpoint_file))

            batches = data_helper.batch_iter(list(x_test), params['batch_size'], 1, shuffle=False)

            response = ""
            predictions, predict_labels = [], []
            for x_batch in batches:
                scores, batch_predictions = predict_step(x_batch)
                print(scores)
                score = normalize(scores[0])
                print(score)
                print(score.max())

                mscore = score.max()
                range_perc = 0.01
                max_range = mscore + (mscore * range_perc)
                min_range = mscore - (mscore * range_perc)
                for s in score:
                    if min_range < s < max_range:
                        max_score = score.max()

                if max_score > 0.1:
                    print(scores)
                    for batch_prediction in batch_predictions:
                        predictions.append(batch_prediction)
                        predict_labels.append(labels[batch_prediction])
                    response = predict_labels[0]
                else:
                    # confidence too low: do not commit to any intent
                    response = "Fall back!"

            sys.stdout.write(response)
            print(response)
            os.environ['PRED_LABEL'] = response
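# normalize is not defined in the snippet above; it evidently squashes the raw
# logits in scores[0] into comparable values in [0, 1]. A softmax is one
# plausible reading (an assumption, the original may use a different scheme):
import numpy as np

def normalize(raw_scores):
    # subtract the max before exponentiating for numerical stability
    exp_scores = np.exp(raw_scores - np.max(raw_scores))
    return exp_scores / exp_scores.sum()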