def load_test_data(test_file, labels, flag):
    # df = pd.read_csv(test_file, sep='|')
    logging.critical('{} test_file received'.format(test_file))
    select = ['Descript']
    if flag == 0:
        # test_file is a path to a CSV file
        df = pd.read_csv(test_file)
        df = df.dropna(axis=0, how='any', subset=select)
        test_examples = df[select[0]].apply(lambda x: data_helper.clean_str(x).split(' ')).tolist()
    else:
        # test_file is raw text; wrap it in a single-column DataFrame
        df = pd.Series(test_file)
        df = pd.DataFrame(df.values, columns=select)  # select is already a list of column names
        test_examples = df.iloc[0].apply(lambda x: data_helper.clean_str(x).split(' ')).tolist()
    logging.critical('{} df received'.format(df))

    num_labels = len(labels)
    one_hot = np.zeros((num_labels, num_labels), int)
    np.fill_diagonal(one_hot, 1)
    label_dict = dict(zip(labels, one_hot))

    y_ = None
    if 'Category' in df.columns:
        select.append('Category')
        y_ = df[select[1]].apply(lambda x: label_dict[x]).tolist()

    not_select = list(set(df.columns) - set(select))
    df = df.drop(not_select, axis=1)
    return test_examples, y_, df
def load_test_data(test_file, labels):
    df = pd.read_csv(test_file, sep=',', encoding="utf8", error_bad_lines=False)
    select = ['Descript']
    df = df.dropna(axis=0, how='any', subset=select)
    test_examples = df[select[0]].apply(
        lambda x: data_helper.clean_str(x).split(' ')).tolist()
    # logging.info(test_examples)

    num_labels = len(labels)
    one_hot = np.zeros((num_labels, num_labels), int)
    np.fill_diagonal(one_hot, 1)
    label_dict = dict(zip(labels, one_hot))

    y_ = None
    if 'Category' in df.columns:
        select.append('Category')
        y_ = df[select[1]].apply(lambda x: label_dict[x]).tolist()

    not_select = list(set(df.columns) - set(select))
    df = df.drop(not_select, axis=1)
    return test_examples, y_, df
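# A minimal usage sketch for the loader above. The CSV path and the labels file are
# assumptions for illustration, not taken from the original code; in the related
# prediction scripts, `labels` is typically read back from a labels.json saved at
# training time.
if __name__ == '__main__':
    labels = json.loads(open('./labels.json').read())                 # hypothetical file from training
    x_test, y_test, df = load_test_data('./data/test.csv', labels)    # hypothetical test CSV
    print('loaded {} examples'.format(len(x_test)))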
def load_test_data(sentence=None):
    if not sentence:
        sentence = raw_input("input >>> ")
    if sentence == 'exit':
        return -1
    test_examples = [data_helper.clean_str(sentence).split(' ')]
    return test_examples
def load_test_data(test_file, labels):
    df = pd.read_csv(test_file, encoding='utf-8')
    select = ['WYSText']
    df = df.dropna(axis=0, how='any', subset=select)
    test_examples = df[select[0]].apply(lambda x: data_helper.clean_str(x).split(' ')).tolist()

    num_labels = len(labels)
    one_hot = np.zeros((num_labels, num_labels), int)
    np.fill_diagonal(one_hot, 1)
    label_dict = dict(zip(labels, one_hot))

    y_ = None
    if 'category' in df.columns:
        select.append('category')
        # y_ = df[select[1]].apply(lambda x: label_dict[x]).tolist()
        y_ = []
        labels = label_dict.keys()
        for i in range(len(df)):
            if df.iloc[i, 0] in labels:
                y_.append(label_dict[df.iloc[i, 0]].tolist())
            else:
                y_.append(label_dict['<UNK>'].tolist())

    not_select = list(set(df.columns) - set(select))
    df = df.drop(not_select, axis=1)
    return test_examples, y_, df
def load_file(hotel_name, filepath):
    review = data_helper.clean_str(filepath[0])
    ratings = product_info.parse_rating(filepath[1])
    num_prod_mentions = []
    mention_count = product_info.count_product_mentions(hotel_name, review)
    num_prod_mentions.append(mention_count)
    return review, num_prod_mentions, ratings
def load_test_data(test_file, labels):
    df = pd.read_csv(test_file)
    df.loc[df.severity == "crit", "severity"] = 1
    df.loc[df.severity == "err", "severity"] = 1
    df.loc[df.severity != 1, "severity"] = 0
    # print(df)
    select = ['message']
    df = df.dropna(axis=0, how='any', subset=select)
    test_examples = df[select[0]].apply(
        lambda x: data_helper.clean_str(x).split(' ')).tolist()

    num_labels = len(labels)
    one_hot = np.zeros((num_labels, num_labels), int)
    np.fill_diagonal(one_hot, 1)
    label_dict = dict(zip(labels, one_hot))

    y_ = None
    if 'severity' in df.columns:
        select.append('severity')
        y_ = df[select[1]].apply(lambda x: label_dict[x]).tolist()

    not_select = list(set(df.columns) - set(select))
    df = df.drop(not_select, axis=1)
    return test_examples, y_, df
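# Hedged note on the variant above: because severity is remapped to the integers
# 1 ("crit"/"err") and 0 (everything else), the `labels` argument is presumably the
# two-element list [0, 1], so that label_dict maps each remapped severity to a
# one-hot row. A tiny illustration (assumed label list, not from the source; numpy
# imported as np, as in the snippets above):
labels = [0, 1]
one_hot = np.zeros((2, 2), int)
np.fill_diagonal(one_hot, 1)
label_dict = dict(zip(labels, one_hot))
print(label_dict[1])   # -> array([0, 1])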
def load_test_data(test_file, labels):
    df = pd.read_csv(test_file, sep='|')
    select = ['Descript']
    # turn each description into a list of space-separated tokens
    df = df.dropna(axis=0, how='any', subset=select)
    test_examples = df[select[0]].apply(
        lambda x: data_helper.clean_str(x).split(' ')).tolist()
    print("test example:", test_examples)

    # store every label as a one-hot vector
    num_labels = len(labels)
    one_hot = np.zeros((num_labels, num_labels), int)
    np.fill_diagonal(one_hot, 1)
    label_dict = dict(zip(labels, one_hot))
    # print("label_dict", label_dict)

    # look up the one-hot vector for each test example's label
    y_ = None
    if 'Category' in df.columns:
        select.append('Category')
        y_ = df[select[1]].apply(lambda x: label_dict[x]).tolist()
    # print("y_", y_)

    # drop the columns that were not selected
    not_select = list(set(df.columns) - set(select))
    print('not_select', not_select)
    df = df.drop(not_select, axis=1)
    print("df:", df)
    return test_examples, y_, df
def load_file(filename):
    hotel_name = product_info.parse_product_name(re.sub(r"_", " ", filename[2]).lower())
    ratings = product_info.parse_rating(filename[1])
    hotel_data = data_helper.clean_str(filename[0])
    num_prod_mentions = []
    mention_count = product_info.count_product_mentions(hotel_name, hotel_data)
    num_prod_mentions.append(mention_count)
    return hotel_name, hotel_data, ratings, num_prod_mentions
def predict_new_data():
    """Step 0: load trained model and parameters"""
    params = json.loads(open('./parameters.json').read())
    checkpoint_dir = sys.argv[1]
    if not checkpoint_dir.endswith('/'):
        checkpoint_dir += '/'
    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir + 'checkpoints')
    logging.critical('Loaded the trained model: {}'.format(checkpoint_file))

    """Step 1: load data for prediction"""
    test_file = sys.argv[2]
    test_examples = json.loads(open(test_file).read())

    # labels.json was saved during training, and it has to be loaded during prediction
    labels = json.loads(open('./labels.json').read())
    one_hot = np.zeros((len(labels), len(labels)), int)
    np.fill_diagonal(one_hot, 1)
    label_dict = dict(zip(labels, one_hot))

    x_raw = [example['consumer_complaint_narrative'] for example in test_examples]
    x_test = [data_helper.clean_str(x) for x in x_raw]
    logging.info('The number of x_test: {}'.format(len(x_test)))

    y_test = None
    if 'product' in test_examples[0]:
        y_raw = [example['product'] for example in test_examples]
        y_test = [label_dict[y] for y in y_raw]
        logging.info('The number of y_test: {}'.format(len(y_test)))

    vocab_path = os.path.join(checkpoint_dir, "vocab.pickle")
    vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
    x_test = np.array(list(vocab_processor.transform(x_test)))

    """Step 2: compute the predictions"""
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            input_x = graph.get_operation_by_name("input_x").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
            predictions = graph.get_operation_by_name("output/predictions").outputs[0]

            batches = data_helper.batch_iter(list(x_test), params['batch_size'], 1, shuffle=False)
            all_predictions = []
            for x_test_batch in batches:
                batch_predictions = sess.run(predictions, {input_x: x_test_batch, dropout_keep_prob: 1.0})
                all_predictions = np.concatenate([all_predictions, batch_predictions])

    if y_test is not None:
        y_test = np.argmax(y_test, axis=1)
        correct_predictions = sum(all_predictions == y_test)
        logging.critical('The accuracy is: {}'.format(correct_predictions / float(len(y_test))))
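# How the function above is presumably driven from the command line (the script name
# is an assumption; the argv[1]/argv[2] usage is taken from the code itself):
#
#   python predict.py ./trained_model_1234567890/ ./data/test_examples.json
#
# argv[1] is the directory holding the `checkpoints` sub-folder and vocab.pickle;
# argv[2] is a JSON list of objects with a 'consumer_complaint_narrative' field and,
# optionally, a 'product' field used to compute accuracy.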
def input_valid_data(input_file):
    df = pd.read_excel(input_file)
    contents = df['web_content'].values
    valid_data = []
    for content in contents:
        content = data_helper.clean_str(find_chinese(content))
        content = data_helper.seperate_line(content)
        valid_data.append(content)
    return valid_data
def load_test_data(test_file, labels):
    df = pd.read_csv(test_file, sep='|')
    select = ['Descript']
    df = df.dropna(axis=0, how='any', subset=select)
    test_examples = df[select[0]].apply(
        lambda x: data_helper.clean_str(x).split(' ')).tolist()

    num_labels = len(labels)
    one_hot = np.zeros((num_labels, num_labels), int)
    np.fill_diagonal(one_hot, 1)
    label_dict = dict(zip(labels, one_hot))

    y_ = None
    if 'Category' in df.columns:
        select.append('Category')
        y_ = df[select[1]].apply(lambda x: label_dict[x]).tolist()

    not_select = list(set(df.columns) - set(select))
    df = df.drop(not_select, axis=1)
    return test_examples, y_, df
def predict_unseen_data(userMessage):
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    # logging.getLogger().setLevel(logging.INFO)

    """Step 0: load trained model and parameters"""
    mainDir = "C:/Users/sid/Desktop/Darwin/DarwinBot/process_message/intentClassifier/"
    params = json.loads(open(mainDir + 'parameters.json').read())
    # checkpoint_dir = mainDir + "trained_model_1519274258/"
    checkpoint_dir = mainDir + "trained_model_1522994422"
    if not checkpoint_dir.endswith('/'):
        checkpoint_dir += '/'
    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir + 'checkpoints')
    # logging.critical('Loaded the trained model: {}'.format(checkpoint_file))

    """Step 1: load data for prediction"""
    # labels.json was saved during training, and it has to be loaded during prediction
    labels = json.loads(open(mainDir + 'labels.json').read())
    one_hot = np.zeros((len(labels), len(labels)), int)
    np.fill_diagonal(one_hot, 1)
    label_dict = dict(zip(labels, one_hot))

    x_raw = userMessage
    x_test = [data_helper.clean_str(x_raw)]
    # logging.info('The number of x_test: {}'.format(len(x_test)))

    vocab_path = os.path.join(checkpoint_dir, "vocab.pickle")
    vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
    x_test = np.array(list(vocab_processor.transform(x_test)))

    """Step 2: compute the predictions"""
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            input_x = graph.get_operation_by_name("input_x").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
            predictions = graph.get_operation_by_name("output/predictions").outputs[0]

            batches = data_helper.batch_iter(list(x_test), params['batch_size'], 1, shuffle=False)
            all_predictions = []
            for x_test_batch in batches:
                batch_predictions = sess.run(predictions, {input_x: x_test_batch, dropout_keep_prob: 1.0})
                all_predictions = np.concatenate([all_predictions, batch_predictions])

    all_predictions = all_predictions.tolist()
    return labels[int(all_predictions[0])]
def load_test_data(test_file, column):
    df = pd.read_csv(test_file, sep=',')
    select = ['content']
    df = df.dropna(axis=0, how='any', subset=select)
    test_examples = df[select[0]].apply(
        lambda x: data_helper.clean_str(x).split(' ')).tolist()

    y_ = None
    if column in df.columns:
        select.append(column)

    not_select = list(set(df.columns) - set(select))
    df = df.drop(not_select, axis=1)
    return test_examples, y_, df
def load_test_data(test_file, labels, column):
    df = pd.read_csv(test_file, sep=',')
    select = ['content']
    df = df.dropna(axis=0, how='any', subset=select)
    test_examples = df[select[0]].apply(
        lambda x: data_helper.clean_str(x).split(' ')).tolist()

    num_labels = len(labels)
    one_hot = np.zeros((num_labels, num_labels), int)
    np.fill_diagonal(one_hot, 1)
    label_dict = dict(zip(labels, one_hot))

    y_ = None
    if column in df.columns:
        select.append(column)
        y_ = df[select[1]].apply(lambda x: label_dict[x]).tolist()

    not_select = list(set(df.columns) - set(select))
    df = df.drop(not_select, axis=1)
    return test_examples, y_, df
def load_test_data(test_file, labels): df = pd.read_csv(test_file, sep="|") select = ["Descript"] df = df.dropna(axis=0, how="any", subset=select) test_examples = (df[select[0]].apply( lambda x: data_helper.clean_str(x).split(" ")).tolist()) num_labels = len(labels) one_hot = np.zeros((num_labels, num_labels), int) np.fill_diagonal(one_hot, 1) label_dict = dict(zip(labels, one_hot)) y_ = None if "Category" in df.columns: select.append("Category") y_ = df[select[1]].apply(lambda x: label_dict[x]).tolist() not_select = list(set(df.columns) - set(select)) df = df.drop(not_select, axis=1) return test_examples, y_, df
def load_test_data(test_file, labels):
    # df = pd.read_csv(test_file, sep='|')
    # select = ['Descript']
    # turn x into a list of space-separated tokens
    test_examples_test = []
    # df = df.dropna(axis=0, how='any', subset=select)
    test_str = sys.argv[1]
    # test_examples = df[select[0]].apply(lambda x: data_helper.clean_str(x).split(' ')).tolist()
    clean_str = list(data_helper.clean_str(test_str).split(' '))
    test_examples_test.append(clean_str)
    # print("test example:", test_examples)
    # print("test example type:", type(test_examples))
    # print('test example test:', test_examples_test)
    # print('test example test type:', type(test_examples_test))
    test_examples = test_examples_test

    # store every label as a one-hot vector
    num_labels = len(labels)
    one_hot = np.zeros((num_labels, num_labels), int)
    np.fill_diagonal(one_hot, 1)
    label_dict = dict(zip(labels, one_hot))
    # print("label_dict", label_dict)

    # look up the one-hot vector for each test example's label
    y_ = None
    # if 'Category' in df.columns:
    #     select.append('Category')
    #     y_ = df[select[1]].apply(lambda x: label_dict[x]).tolist()
    # print("y_", y_)

    # drop the columns that were not selected
    # not_select = list(set(df.columns) - set(select))
    # print('not_select', not_select)
    # df = df.drop(not_select, axis=1)
    # print("df:", df)
    df = pd.DataFrame()
    return test_examples, y_, df
def load_test_data(test_file, labels):
    df = pd.read_csv(test_file)
    select = ['Descript']
    df = df.dropna(axis=0, how='any', subset=select)
    test_examples = df[select[0]].apply(
        lambda x: data_helper.clean_str(x).split(' ')).tolist()

    vocabulary, vocabulary_inv = data_helper.build_vocab(test_examples)
    x = np.array([[vocabulary[word] for word in sentence] for sentence in test_examples])

    num_labels = len(labels)
    one_hot = np.zeros((num_labels, num_labels), int)
    np.fill_diagonal(one_hot, 1)
    label_dict = dict(zip(labels, one_hot))

    y_ = None
    if 'Category' in df.columns:
        select.append('Category')
        y_ = df[select[1]].apply(lambda x: label_dict[x]).tolist()

    not_select = list(set(df.columns) - set(select))
    df = df.drop(not_select, axis=1)
    return x, y_, df
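# data_helper.build_vocab is not shown in this corpus. A plausible sketch of what such
# a helper returns (a word -> index mapping plus its inverse list), purely for
# orientation; this is an assumption, not the actual data_helper implementation:
from collections import Counter

def build_vocab(sentences):
    # sentences: list of token lists, as produced by clean_str(...).split(' ')
    counts = Counter(word for sentence in sentences for word in sentence)
    vocabulary_inv = [word for word, _ in counts.most_common()]   # index -> word
    vocabulary = {word: i for i, word in enumerate(vocabulary_inv)}  # word -> index
    return vocabulary, vocabulary_inv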
df_sports = pd.read_json("Sports_and_Outdoors_5.json", lines=True)

# Concat the dataframes
df = pd.concat([df_clothing, df_sports])

# delete this later
df = df[:1000]

# Get the text from the dataframe
text = df["reviewText"].values

# Reduce memory
del df

# clean strings
text = [data_helper.clean_str(sent) for sent in text]

'''
if max sent length = 4

Vocab processor:    Our text                                    Our new data
                    ["wow what great stuff", "good stuff"]  --> [[1,2,3], [4,5]]
'''

# set max sentence length
max_sent_length = 100

# make vocab processor
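# A self-contained illustration of the word-id mapping the comment above describes
# (plain Python, not the actual TensorFlow VocabularyProcessor; id 0 is reserved for
# padding, as that processor does):
def to_word_ids(sentences, max_len):
    vocab = {}
    ids = []
    for sent in sentences:
        row = []
        for word in sent.split(' ')[:max_len]:
            row.append(vocab.setdefault(word, len(vocab) + 1))  # new words get the next id
        row += [0] * (max_len - len(row))                       # pad short sentences with 0
        ids.append(row)
    return ids, vocab

print(to_word_ids(["wow what great stuff", "good stuff"], 4)[0])
# -> [[1, 2, 3, 4], [5, 4, 0, 0]]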
def read_Surveycsv(filepath):
    """
    author: Pengfei
    date: 07/06/2017
    """
    df = pd.read_csv(filepath, usecols=['Q3_1', 'Q3_2', 'Q3_3', 'Q3_4', 'Q3_5', 'Q3_6',
                                        'Q3_7', 'Q3_8', 'Q3_9', 'Q3_10', 'Q4', 'Q5'])
    # print(df.describe())
    q4 = df['Q4'].astype('category')
    # print("\nCount summary for each question:")
    # print(q4.value_counts())
    q5 = df['Q5']

    def corpus_and_indices(answer):
        # For one Q4 answer: the row indices that chose it and the cleaned Q5 comments.
        mask = q4 == answer
        idx = [i for i, flag in enumerate(mask) if flag]
        corpus = [data_helper.clean_str(sent) for sent in q5[mask].tolist()]
        return [corpus, idx]

    # One [corpus, indices] pair per Q4 answer, in the order expected by the caller.
    return [
        corpus_and_indices("No improvement require"),
        corpus_and_indices("Arranging service appointment/visits to the dealership"),
        corpus_and_indices("Dealership opening/closing days and time"),
        corpus_and_indices("The explanations given by dealership staff during your service visit (e.g., helpful/detailed)"),
        corpus_and_indices("The condition and cleanliness of vehicle when you received your vehicle after servicing"),
        corpus_and_indices("Quality of the work performed on your vehicle (e.g., fixing of issues during this servicing visit)"),
        corpus_and_indices("The dealership location"),
        corpus_and_indices("The waiting area (e.g., comfort, cleanness, facilities)"),
        corpus_and_indices("The total time taken to complete the servicing of your vehicle"),
        corpus_and_indices("The charges you paid for the last service"),
        corpus_and_indices("The follow-up calls made by the dealership post servicing of your vehicle to check your service experience and car condition"),
    ]
data_frame_sports = pd.read_json("reviews_Sports_and_Outdoors_5.json", lines=True)
data_frame = pd.concat([data_frame_clothing, data_frame_sports])

# Reduce size temporarily while iterating
data_frame = data_frame[:1000]

# Retrieve text from the data frame
review_text = data_frame["reviewText"].values

# Tidy
del data_frame

# clean strings
review_text = [data_helper.clean_str(x) for x in review_text]

# Assign unique word ids to each word in a corpus
max_sentence_length = 100
vocab_processor = tflearn.data_utils.VocabularyProcessor(max_sentence_length)
word_ids = list(vocab_processor.fit_transform(review_text))

# Total number of unique words
vocabulary_size = len(vocab_processor.vocabulary_)

# Convert np arrays to lists to support popping of 0 values
word_ids = [i.tolist() for i in word_ids]

# Remove trailing 0s for sentences not 100 words long
# TIL: I tried benchmarking np.trim_zeros here to avoid the type conversion,
def demo_cnn_rnn(demo_model):
    # load training parameters
    params, words_index, labels, embedding_mat = load_trained_params(
        'data_path_save/cnn_rnn_' + demo_model + '/trained_results/')

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 sequence_length=params['sequence_length'],
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=map(int, params['filter_sizes'].split(",")),
                                 num_filters=params['num_filters'],
                                 num_classes=len(labels),
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            def real_len(batches):
                return [np.ceil(np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                        for batch in batches]

            def predict_step(x_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.dropout_keep_prob: 1.0,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch)
                }
                predictions = sess.run([cnn_rnn.predictions], feed_dict=feed_dict)
                return predictions

            checkpoint_file = tf.train.latest_checkpoint(
                'data_path_save/cnn_rnn_' + demo_model + '/checkpoints/')
            saver = tf.train.Saver(tf.all_variables())
            saver = tf.train.import_meta_graph('{}.meta'.format(checkpoint_file))
            saver.restore(sess, checkpoint_file)
            logging.critical('{} has been loaded'.format(checkpoint_file))

            while True:
                print('Please input your sentence:')
                input_sentence = input()
                if input_sentence == '' or input_sentence.isspace():
                    print('See you next time!')
                    break
                else:
                    x_ = data_helper.clean_str(input_sentence).split(' ')

                    # Prediction: cut off the sentence if it is longer than the sequence length
                    sequence_length = params['sequence_length']
                    num_padding = sequence_length - len(x_)
                    padded_sentence = []
                    if num_padding < 0:
                        logging.info('This sentence has to be cut off because it is longer than trained sequence length')
                        padded_sentence = x_[0:sequence_length]
                    else:
                        padded_sentence = x_ + ['<PAD/>'] * num_padding

                    # Get word index
                    temp = []
                    for word in padded_sentence:
                        if word in words_index:
                            temp.append(words_index[word])
                        else:
                            temp.append(0)
                    temp = np.asarray(temp)
                    x_test = np.expand_dims(temp, axis=0)

                    prediction = predict_step(x_test)[0][0]
                    predicted_label = labels[prediction]
                    print('\nDisease category: ' + predicted_label + '\n')
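# A quick illustration of the pad-or-truncate step used above (the sentence, vocabulary
# and sequence length are made up; '<PAD/>' and the unknown-word index 0 match the code):
words_index = {'fever': 1, 'and': 2, 'cough': 3}   # hypothetical vocabulary
sequence_length = 5
x_ = 'fever and dry cough'.split(' ')
padded = (x_ + ['<PAD/>'] * (sequence_length - len(x_)))[:sequence_length]
ids = [words_index.get(w, 0) for w in padded]
print(padded)  # ['fever', 'and', 'dry', 'cough', '<PAD/>']
print(ids)     # [1, 2, 0, 3, 0]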
def predict_unseen_data():
    x_test = []
    # y_test = []

    """Step 0: load trained model and parameters"""
    params = json.loads(open('./parameters.json').read())
    checkpoint_dir = sys.argv[1]
    if not checkpoint_dir.endswith('/'):
        checkpoint_dir += '/'
    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir + 'checkpoints')
    logging.critical('Loaded the trained model: {}'.format(checkpoint_file))

    """Step 1: load data for prediction"""
    test_file = sys.argv[2]
    # test_examples = json.loads(open(test_file).read())
    test_examples = pd.read_csv(test_file, sep='\t')

    # labels.json was saved during training, and it has to be loaded during prediction
    labels = json.loads(open('./labels.json').read())
    one_hot = np.zeros((len(labels), len(labels)), int)
    np.fill_diagonal(one_hot, 1)
    label_dict = dict(zip(labels, one_hot))

    # x_raw = [example['consumer_complaint_narrative'] for example in test_examples]
    for x in range(test_examples.consumer_complaint_narrative.shape[0]):
        x_raw = test_examples.consumer_complaint_narrative[x]
        # y_raw = test_examples.tag[x]
        # y_test.append(y_raw)
        temp_str = data_helper.clean_str(x_raw)
        x_test.append(temp_str)
    # x_test = [data_helper.clean_str(x) for x in x_raw]
    logging.info('The number of x_test: {}'.format(len(x_test)))

    # y_test = None
    # if 'product' in test_examples[0]:
    #     y_raw = [example['product'] for example in test_examples]
    #     y_test = [label_dict[y] for y in y_raw]
    #     logging.info('The number of y_test: {}'.format(len(y_test)))

    vocab_path = os.path.join(checkpoint_dir, "vocab.pickle")
    vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
    x_test = np.array(list(vocab_processor.transform(x_test)))

    """Step 2: compute the predictions"""
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            input_x = graph.get_operation_by_name("input_x").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
            predictions = graph.get_operation_by_name("output/predictions").outputs[0]

            batches = data_helper.batch_iter(list(x_test), params['batch_size'], 1, shuffle=False)
            all_predictions = []
            nf = open("tags.txt", 'a+')
            for x_test_batch in batches:
                batch_predictions = sess.run(predictions, {input_x: x_test_batch, dropout_keep_prob: 1.0})
                # print(batch_predictions)
                for x in batch_predictions:
                    tot = str(x) + "\n"
                    nf.write(tot)
                all_predictions = np.concatenate([all_predictions, batch_predictions])
            # print(all_predictions)
            # for x in all_predictions:
            #     tot = str(x) + "\n"
            #     nf.write(tot)
            nf.close()
def predict_unseen_data(): """Step 0: load trained model and parameters""" params = json.loads(open(root_path + 'parameters.json').read()) checkpoint_dir = root_path + "trained_model\\" # if not checkpoint_dir.endswith('/'): # checkpoint_dir += '/' checkpoint_file = tf.train.latest_checkpoint( "E:\工作文档\bs\textcnn-multi-class\textcnn-multi-class\trained_model_1558186060\checkpoints" ) logging.critical('Loaded the trained model: {}'.format(checkpoint_file)) """Step 1: load data for prediction""" # 自定义参数 text = sys.argv[2] text = processData(text) # test_df = pd.read_csv(sys.argv[1]) # test_examples = json.loads(open(test_file).read()) # labels.json was saved during training, and it has to be loaded during prediction labels = json.loads(open(root_path + 'labels.json').read()) one_hot = np.zeros((len(labels), len(labels)), int) np.fill_diagonal(one_hot, 1) label_dict = dict(zip(labels, one_hot)) # x_text = data_helper.clean_str(text) x_raw = [sys.argv[1]] x_test = [data_helper.clean_str(x) for x in x_raw] logging.info('The number of x_test: {}'.format(len(x_test))) # print(s_text) y_test = None # y_raw = test_df['category'] # y_test = [label_dict[y] for y in y_raw] # logging.info('The number of y_test: {}'.format(len(y_test))) vocab_path = os.path.join(checkpoint_dir, "vocab.pickle") vocab_processor = learn.preprocessing.VocabularyProcessor.restore( vocab_path) x_test = np.array(list(vocab_processor.transform(x_test))) """Step 2: compute the predictions""" graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): saver = tf.train.import_meta_graph( "{}.meta".format(checkpoint_file)) saver.restore(sess, checkpoint_file) input_x = graph.get_operation_by_name("input_x").outputs[0] dropout_keep_prob = graph.get_operation_by_name( "dropout_keep_prob").outputs[0] predictions = graph.get_operation_by_name( "output/predictions").outputs[0] batches = data_helper.batch_iter(list(x_test), params['batch_size'], 1, shuffle=False) all_predictions = [] for x_test_batch in batches: batch_predictions = sess.run(predictions, { input_x: x_test_batch, dropout_keep_prob: 1 }) all_predictions = np.concatenate( [all_predictions, batch_predictions]) actual_labels = [labels[int(prediction)] for prediction in all_predictions] print(actual_labels[0])
def predict_unseen_data(): """Step 0: load trained model and parameters""" params = json.loads(open('./parameters.json').read()) checkpoint_dir = "trained_model_1516138925/" if not checkpoint_dir.endswith('/'): checkpoint_dir += '/' checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir + 'checkpoints') #logging.critical('Loaded the trained model: {}'.format(checkpoint_file)) """Step 1: load data for prediction""" data = {} data = sys.argv[1] message = ('[{"content": "%s"}]') % (data) test_examples = json.loads(message) # labels.json was saved during training, and it has to be loaded during prediction labels = json.loads(open('./labels.json').read()) one_hot = np.zeros((len(labels), len(labels)), int) np.fill_diagonal(one_hot, 1) label_dict = dict(zip(labels, one_hot)) x_raw = [example['content'] for example in test_examples] x_test = [data_helper.clean_str(x) for x in x_raw] logging.info('The number of x_test: {}'.format(len(x_test))) y_test = None if 'label' in test_examples[0]: y_raw = [example['label'] for example in test_examples] y_test = [label_dict[y] for y in y_raw] logging.info('The number of y_test: {}'.format(len(y_test))) vocab_path = os.path.join(checkpoint_dir, "vocab.pickle") vocab_processor = learn.preprocessing.VocabularyProcessor.restore( vocab_path) x_test = np.array(list(vocab_processor.transform(x_test))) """Step 2: compute the predictions""" graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): saver = tf.train.import_meta_graph( "{}.meta".format(checkpoint_file)) saver.restore(sess, checkpoint_file) input_x = graph.get_operation_by_name("input_x").outputs[0] dropout_keep_prob = graph.get_operation_by_name( "dropout_keep_prob").outputs[0] predictions = graph.get_operation_by_name( "output/predictions").outputs[0] batches = data_helper.batch_iter(list(x_test), params['batch_size'], 1, shuffle=False) all_predictions = [] for x_test_batch in batches: batch_predictions = sess.run(predictions, { input_x: x_test_batch, dropout_keep_prob: 1.0 }) all_predictions = np.concatenate( [all_predictions, batch_predictions]) print("Predictions:") looper = 0 length = len(all_predictions) location_dataset = "data/Dataset.csv" while looper < length: label = get_label(all_predictions[looper]) #Change," Updated the rule file" print(label) print(data) dataset = "\n%s,\"%s\" " % (label, data) f = open(location_dataset, "a") f.write(dataset) f.close looper += 1 if y_test is not None: y_test = np.argmax(y_test, axis=1) correct_predictions = sum(all_predictions == y_test) logging.critical('The accuracy is: {}'.format(correct_predictions / float(len(y_test)))) logging.critical('The prediction is complete')
def predict_unseen_data(): """Step 0: load trained model and parameters""" params = json.loads( open('./parameters.json').read()) #load parameter dari parameters.json checkpoint_dir = sys.argv[1] #path/alamat ke checkpoint (pada arg1) if not checkpoint_dir.endswith('/'): #tambah / ke alamat checkpoint checkpoint_dir += '/' checkpoint_file = tf.train.latest_checkpoint( checkpoint_dir + 'checkpoints') #train checkpoint terakhir logging.critical( 'Loaded the trained model: {}'.format(checkpoint_file)) #log """Step 1: load data for prediction""" test_file = sys.argv[2] #lokasi test_file (pada arg2) test_examples = json.loads( open(test_file).read()) #load contoh test dari direktori test_file # labels.json was saved during training, and it has to be loaded during prediction labels = json.loads( open('./labels.json').read()) #label yang ditentukan dalam bentuk json one_hot = np.zeros((len(labels), len(labels)), int) #buat array isi nol dari panjang labels np.fill_diagonal(one_hot, 1) #isi diagonal dari array one_hot dengan angka 1 label_dict = dict(zip(labels, one_hot)) x_raw = [ example['consumer_complaint_narrative'] for example in test_examples ] x_test = [data_helper.clean_str(x) for x in x_raw] logging.info('The number of x_test: {}'.format(len(x_test))) y_test = None if 'product' in test_examples[0]: y_raw = [example['product'] for example in test_examples] y_test = [label_dict[y] for y in y_raw] logging.info('The number of y_test: {}'.format(len(y_test))) vocab_path = os.path.join(checkpoint_dir, "vocab.pickle") vocab_processor = learn.preprocessing.VocabularyProcessor.restore( vocab_path) x_test = np.array(list(vocab_processor.transform(x_test))) """Step 2: compute the predictions""" graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): saver = tf.train.import_meta_graph( "{}.meta".format(checkpoint_file)) saver.restore(sess, checkpoint_file) input_x = graph.get_operation_by_name("input_x").outputs[0] dropout_keep_prob = graph.get_operation_by_name( "dropout_keep_prob").outputs[0] predictions = graph.get_operation_by_name( "output/predictions").outputs[0] batches = data_helper.batch_iter(list(x_test), params['batch_size'], 1, shuffle=False) all_predictions = [] for x_test_batch in batches: batch_predictions = sess.run(predictions, { input_x: x_test_batch, dropout_keep_prob: 1.0 }) all_predictions = np.concatenate( [all_predictions, batch_predictions]) if y_test is not None: y_test = np.argmax(y_test, axis=1) correct_predictions = sum(all_predictions == y_test) # Save the actual labels back to file actual_labels = [ labels[int(prediction)] for prediction in all_predictions ] for idx, example in enumerate(test_examples): example['new_prediction'] = actual_labels[idx] with open('./data/small_samples_prediction.json', 'w') as outfile: json.dump(test_examples, outfile, indent=4) logging.critical('The accuracy is: {}'.format(correct_predictions / float(len(y_test)))) logging.critical('The prediction is complete')
def predict_unseen_data(): """Step 0: load trained model and parameters""" params = { "num_epochs": 25, "batch_size": 30, "num_filters": 128, "filter_sizes": "3,4,5", "embedding_dim": 300, "l2_reg_lambda": 0.15, "evaluate_every": 200, "dropout_keep_prob": 0.5 } curr_checkpoint = 'trained_model_1589201852' checkpoint_dir = file_dir + '/' + curr_checkpoint + '/' if not checkpoint_dir.endswith('/'): checkpoint_dir += '/' checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir + 'checkpoints') logging.critical('Loaded the trained model: {}'.format(checkpoint_file)) """Step 1: load data for prediction""" test_file = file_dir + '/data/small_samples.json' test_examples = json.loads(open(test_file).read()) # labels.json was saved during training, and it has to be loaded during prediction labels = json.loads(open(file_dir+'/labels.json').read()) one_hot = np.zeros((len(labels), len(labels)), int) np.fill_diagonal(one_hot, 1) label_dict = dict(zip(labels, one_hot)) x_raw = [example['comments'] for example in test_examples] x_test = [data_helper.clean_str(x) for x in x_raw] logging.info('The number of x_test: {}'.format(len(x_test))) y_test = None if 'category' in test_examples[0]: y_raw = [example['category'] for example in test_examples] y_test = [label_dict[y] for y in y_raw] logging.info('The number of y_test: {}'.format(len(y_test))) vocab_path = os.path.join(checkpoint_dir, "vocab.pickle") vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path) x_test = np.array(list(vocab_processor.transform(x_test))) """Step 2: compute the predictions""" graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file)) saver.restore(sess, checkpoint_file) input_x = graph.get_operation_by_name("input_x").outputs[0] dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0] predictions = graph.get_operation_by_name("output/predictions").outputs[0] batches = data_helper.batch_iter(list(x_test), params['batch_size'], 1, shuffle=False) all_predictions = [] for x_test_batch in batches: batch_predictions = sess.run(predictions, {input_x: x_test_batch, dropout_keep_prob: 1.0}) all_predictions = np.concatenate([all_predictions, batch_predictions]) if y_test is not None: y_test = np.argmax(y_test, axis=1) correct_predictions = sum(all_predictions == y_test) # Save the actual labels back to file actual_labels = [labels[int(prediction)] for prediction in all_predictions] for idx, example in enumerate(test_examples): example['new_prediction'] = actual_labels[idx] with open(file_dir+'/data/small_samples_prediction.json', 'w') as outfile: json.dump(test_examples, outfile, indent=4) logging.critical('The accuracy is: {}'.format(correct_predictions / float(len(y_test)))) logging.critical('The prediction is complete')
def predict_unseen_data(): """Step 0: load trained model and parameters""" # print ("loading") params = json.loads(open('./parameters.json').read()) checkpoint_dir = sys.argv[1] if not checkpoint_dir.endswith('/'): checkpoint_dir += '/' checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir + 'checkpoints') # logging.critical('Loaded the trained model: {}'.format(checkpoint_file)) """Step 1: load data for prediction""" test_file = sys.argv[2] # test_examples = json.loads(open(test_file).read()) df = pd.read_csv(test_file, sep="|", encoding="unicode_escape") df.columns = ['label', 'text'] # labels.json was saved during training, and it has to be loaded during prediction labels = json.loads(open('./labels.json').read()) one_hot = np.zeros((len(labels), len(labels)), int) np.fill_diagonal(one_hot, 1) label_dict = dict(zip(labels, one_hot)) x_raw = df['text'].apply(lambda x: str(x)).tolist() x_test = [data_helper.clean_str(x) for x in x_raw] # logging.info('The number of x_test: {}'.format(len(x_test))) y_test = None # if 'label' in test_examples[0]: # y_raw = [example['label'] for example in df] y_raw = df['label'].tolist() # y_raw = df['label'].apply(lambda y: y)]).tolist() y_test = [label_dict[str(y)] for y in y_raw] # logging.info('The number of y_test: {}'.format(len(y_test))) vocab_path = os.path.join(checkpoint_dir, "vocab.pickle") vocab_processor = learn.preprocessing.VocabularyProcessor.restore( vocab_path) x_test = np.array(list(vocab_processor.transform(x_test))) # print ("computing") """Step 2: compute the predictions""" graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): saver = tf.train.import_meta_graph( "{}.meta".format(checkpoint_file)) saver.restore(sess, checkpoint_file) input_x = graph.get_operation_by_name("input_x").outputs[0] dropout_keep_prob = graph.get_operation_by_name( "dropout_keep_prob").outputs[0] predictions = graph.get_operation_by_name( "output/predictions").outputs[0] batches = data_helper.batch_iter(list(x_test), params['batch_size'], 1, shuffle=False) all_predictions = [] for x_test_batch in batches: batch_predictions = sess.run(predictions, { input_x: x_test_batch, dropout_keep_prob: 1.0 }) all_predictions = np.concatenate( [all_predictions, batch_predictions]) # print ("done computing") if y_test is not None: y_test = np.argmax(y_test, axis=1) correct_predictions = sum(all_predictions == y_test) # Save the actual labels back to file actual_labels = [ labels[int(prediction)] for prediction in all_predictions ] # print (all_predictions) labels = [int(pred) for pred in all_predictions] # for idx, example in enumerate(df): # example['new_prediction'] = actual_labels[idx] # # with open('./data/small_samples_prediction.json', 'w') as outfile: # json.dump(df, outfile, indent=4) # logging.critical('The accuracy is: {}'.format(correct_predictions / float(len(y_test)))) # logging.critical('The prediction is complete') print("Accuracy: ", accuracy_score(y_raw, labels)) print(classification_report(y_raw, labels))
def nps_prediction():
    print("Sentiment being processed")

    publish_date = []
    first_name = []
    last_name = []
    activity = []
    channel_code = []
    customer_id = []
    response_date = []
    rating = []
    why = []
    what = []
    system_date = []
    sentiment_why = []
    sentiment_what = []
    sentiment = []
    non_digital_why = []
    non_digital_what = []
    non_digital = []
    id_val = []

    s = solr.Solr('http://localhost:8983/solr/' + table_name)
    url_call = 'http://localhost:8983/solr/' + table_name + '/select?indent=on&q=*:*&wt=python'
    conn = urlopen(url_call)
    resp = eval(conn.read())
    resp_rows = str(resp['response']['numFound'])
    print(resp_rows)  # How many records
    url_string = url_call + '&rows=' + resp_rows + '&start=0'
    print(url_string)
    conn = urlopen(url_string)
    resp = eval(conn.read())

    for doc1 in resp['response']['docs']:
        if 'why' in doc1.keys():
            why.append(doc1['why'][0])
        else:
            why.append('no comment')
        if 'what' in doc1.keys():
            what.append(doc1['what'][0])
        else:
            what.append('no comment')
        if 'last_name' in doc1.keys():
            last_name.append(doc1['last_name'][0])
        else:
            last_name.append('Business User')
        publish_date.append(doc1['publish_date'][0])
        first_name.append(doc1['first_name'][0])
        activity.append(doc1['activity'][0])
        channel_code.append(doc1['channel_code'][0])
        customer_id.append(doc1['customer_id'][0])
        response_date.append(doc1['response_date'][0])
        rating.append(doc1['rating'][0])
        system_date.append(doc1['system_date'][0])
        # Initial flags
        sentiment.append('no')
        non_digital.append('no')
        id_val.append(doc1['id_val'][0])

    for prediction in ['sentiment', 'digital']:
        if prediction == 'sentiment':
            params = json.loads(open('./parameters_sentiment.json').read())
            checkpoint_dir = "C:\\Code_Sketch\\NPS\\S3134076\\PycharmProjects\\nps\\trained_model_1509637563"
            labels = json.loads(open('./labels_sentiment.json').read())
            vocab_path = os.path.join(checkpoint_dir, "vocab_sentiment.pickle")
        elif prediction == 'digital':
            params = json.loads(open('./parameters_digital.json').read())
            checkpoint_dir = "C:\\Code_Sketch\\NPS\\S3134076\\PycharmProjects\\nps\\trained_model_1508867627"
            labels = json.loads(open('./labels_digital.json').read())
            vocab_path = os.path.join(checkpoint_dir, "vocab_digital.pickle")

        logging.getLogger().setLevel(logging.INFO)
        # if not checkpoint_dir.endswith('/'):
        #     checkpoint_dir += '/'
        checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir + '\\checkpoints')
        logging.critical('Loaded the trained model: {}'.format(checkpoint_file))

        one_hot = np.zeros((len(labels), len(labels)), int)
        np.fill_diagonal(one_hot, 1)
        vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
        x_test_why = [data_helper.clean_str(x) for x in why]
        x_test_why = np.array(list(vocab_processor.transform(x_test_why)))
        x_text_what = [data_helper.clean_str(x) for x in what]
        x_test_what = np.array(list(vocab_processor.transform(x_text_what)))

        graph = tf.Graph()
        with graph.as_default():
            session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
            sess = tf.Session(config=session_conf)
            with sess.as_default():
                saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
                saver.restore(sess, checkpoint_file)

                input_x_why = graph.get_operation_by_name("input_x").outputs[0]
                dropout_keep_prob_why = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
                predictions_why = graph.get_operation_by_name("output/predictions").outputs[0]
                input_x_what = graph.get_operation_by_name("input_x").outputs[0]
                dropout_keep_prob_what = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
                predictions_what = graph.get_operation_by_name("output/predictions").outputs[0]

                batches_why = data_helper.batch_iter(list(x_test_why), params['batch_size'], 1, shuffle=False)
                batches_what = data_helper.batch_iter(list(x_test_what), params['batch_size'], 1, shuffle=False)
                all_predictions_why = []
                all_predictions_what = []

                # Giving prediction batch by batch
                for x_test_batch in batches_why:
                    batch_predictions_why = sess.run(predictions_why,
                                                     {input_x_why: x_test_batch, dropout_keep_prob_why: 1.0})
                    all_predictions_why = np.concatenate([all_predictions_why, batch_predictions_why])
                for x_test_batch in batches_what:
                    batch_predictions_what = sess.run(predictions_what,
                                                      {input_x_what: x_test_batch, dropout_keep_prob_what: 1.0})
                    all_predictions_what = np.concatenate([all_predictions_what, batch_predictions_what])

                for predict in all_predictions_why:
                    # print(labels[int(predict)])
                    if prediction == 'sentiment':
                        if labels[int(predict)] == 'positive':
                            sentiment_why.append("positive")
                        else:
                            sentiment_why.append("negative")
                    elif prediction == 'digital':
                        if labels[int(predict)] == 'non_digital':
                            non_digital_why.append("non_digital")
                        else:
                            non_digital_why.append("digital")

                for predict in all_predictions_what:
                    if prediction == 'sentiment':
                        if labels[int(predict)] == 'positive':
                            sentiment_what.append("positive")
                        else:
                            sentiment_what.append("negative")
                    elif prediction == 'digital':
                        if labels[int(predict)] == 'non_digital':
                            non_digital_what.append("non_digital")
                        else:
                            non_digital_what.append("digital")

    current_directory = os.getcwd()
    os.chdir(solr_path)
    os.system('solr delete -c ' + table_name)
    os.system('solr create -c ' + table_name)
    os.chdir(current_directory)
    doc = []
def main(_):
    print(time.ctime())
    host, port = FLAGS.server.split(':')
    channel = implementations.insecure_channel(host, int(port))
    stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)

    # Send request
    params = json.loads(open('./parameters.json').read())
    checkpoint_dir = './trained_model_1531212474/'
    if not checkpoint_dir.endswith('/'):
        checkpoint_dir += '/'

    """Step 1: load data for prediction"""
    test_file = './data/data.csv'
    # test_examples = json.loads(open(test_file).read())
    test_examples = pd.read_csv(test_file, dtype={'data': object})
    df = pd.read_csv(test_file, dtype={'data': object})
    # selected = ['text', 'title', 'style', 'structural', 'tag']
    selected = ['data', 'tag']
    non_selected = list(set(df.columns) - set(selected))
    df = df.drop(non_selected, axis=1)  # Drop non selected columns
    df = df.dropna(axis=0, how='any', subset=selected)  # Drop null rows
    df = df.reindex(np.random.permutation(df.index))  # Shuffle the dataframe

    # Map the actual labels to one hot labels
    labels = sorted(list(set(df[selected[1]].tolist())))
    one_hot = np.zeros((len(labels), len(labels)), int)
    np.fill_diagonal(one_hot, 1)
    label_dict = dict(zip(labels, one_hot))

    vocab_path = os.path.join(checkpoint_dir, "vocab.pickle")
    vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
    x_raw = df[selected[0]].apply(lambda x: clean_str(x)).tolist()
    x_test = [data_helper.clean_str(x) for x in x_raw]
    x_test = np.array(list(vocab_processor.transform(x_test)))
    logging.info('The number of x_test: {}'.format(len(x_test)))

    y_test = None
    y_raw = df[selected[1]].apply(lambda y: label_dict[y]).tolist()
    y_test = np.array(y_raw)
    logging.info('The number of y_test: {}'.format(len(y_test)))

    batches = data_helper.batch_iter(list(zip(x_test, y_test)), params['batch_size'], 1, shuffle=False)
    all_predictions = []
    start = time.time()
    for test_batch in batches:
        x_test_batch, y_test_batch = zip(*test_batch)
        x_test_batch = np.array(x_test_batch)
        x_test_batch = numpy_array_int32(x_test_batch)
        y_test_batch = np.array(y_test_batch)
        y_test_batch = numpy_array_float(y_test_batch)
        print(type(y_test_batch))

        request = predict_pb2.PredictRequest()
        request.model_spec.name = 'saved_model'
        request.model_spec.version.value = 1
        request.model_spec.signature_name = 'predict_label'
        request.inputs['input_x'].CopyFrom(tf.contrib.util.make_tensor_proto(x_test_batch))
        request.inputs['input_y'].CopyFrom(tf.contrib.util.make_tensor_proto(y_test_batch))
        print("lalalala")
        request.inputs['dropout_keep_prob'].CopyFrom(tf.contrib.util.make_tensor_proto(1.0))
        print("hahahahaha")

        result = stub.Predict(request, 10.0)  # 10 secs timeout
        print(result.outputs['accuracy/accuracy'])
        batch_predictions = (result.outputs['output/predictions'].int64_val)
        all_predictions = np.concatenate([all_predictions, batch_predictions])
        print(batch_predictions)
        print(result)
        print("batch_predictions", batch_predictions)
        print("all_predictions", all_predictions)
    print(time.time() - start)

    if y_test is not None:
        y_test = np.argmax(y_test, axis=1)
        correct_predictions = sum(all_predictions == y_test)
        print(correct_predictions)

        # Save the actual labels back to file
        for idx, example in enumerate(test_examples):
            print(idx, '\t', example)
            # example['tag'] = actual_labels[idx]
        with open('./data/small_samples_prediction.json', 'w') as outfile:
            json.dump(test_examples.to_json(), outfile, indent=4)

        logging.critical('The accuracy is: {}'.format(correct_predictions / float(len(y_test))))
    logging.critical('The prediction is complete')
def predict_unseen_data(): """Step 0: load trained model and parameters""" params = json.loads(open('./parameters.json').read()) checkpoint_dir = sys.argv[1] if not checkpoint_dir.endswith('/'): checkpoint_dir += '/' checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir + 'checkpoints') logging.critical('Loaded the trained model: {}'.format(checkpoint_file)) """Step 1: load data for prediction""" test_file = sys.argv[2] test_examples = json.loads(open(test_file).read()) # labels.json was saved during training, and it has to be loaded during prediction labels = json.loads(open('./labels.json').read()) one_hot = np.zeros((len(labels), len(labels)), int) np.fill_diagonal(one_hot, 1) label_dict = dict(zip(labels, one_hot)) x_raw = [example['consumer_complaint_narrative'] for example in test_examples] x_test = [data_helper.clean_str(x) for x in x_raw] logging.info('The number of x_test: {}'.format(len(x_test))) y_test = None if 'product' in test_examples[0]: y_raw = [example['product'] for example in test_examples] y_test = [label_dict[y] for y in y_raw] logging.info('The number of y_test: {}'.format(len(y_test))) vocab_path = os.path.join(checkpoint_dir, "vocab.pickle") vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path) x_test = np.array(list(vocab_processor.transform(x_test))) """Step 2: compute the predictions""" graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) sess = tf.Session(config=session_conf) with sess.as_default(): saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file)) saver.restore(sess, checkpoint_file) input_x = graph.get_operation_by_name("input_x").outputs[0] dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0] predictions = graph.get_operation_by_name("output/predictions").outputs[0] batches = data_helper.batch_iter(list(x_test), params['batch_size'], 1, shuffle=False) all_predictions = [] for x_test_batch in batches: batch_predictions = sess.run(predictions, {input_x: x_test_batch, dropout_keep_prob: 1.0}) all_predictions = np.concatenate([all_predictions, batch_predictions]) if y_test is not None: y_test = np.argmax(y_test, axis=1) correct_predictions = sum(all_predictions == y_test) # Save the actual labels back to file actual_labels = [labels[int(prediction)] for prediction in all_predictions] for idx, example in enumerate(test_examples): example['new_prediction'] = actual_labels[idx] with open('./data/small_samples_prediction.json', 'w') as outfile: json.dump(test_examples, outfile, indent=4) logging.critical('The accuracy is: {}'.format(correct_predictions / float(len(y_test)))) logging.critical('The prediction is complete')
def console_predict():
    """Step 0: load trained model and parameters"""
    params = json.loads(open('./parameters.json').read())
    checkpoint_dir = sys.argv[1]
    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir + 'checkpoints')
    logging.critical('Loaded the trained model: {}'.format(checkpoint_file))

    # labels.json was saved during training, and it has to be loaded during prediction
    labels = json.loads(open('./labels.json').read())
    one_hot = np.zeros((len(labels), len(labels)), int)
    np.fill_diagonal(one_hot, 1)
    label_dict = dict(zip(labels, one_hot))

    ### For console input
    # text = input("Input : ")
    ### For C# inter-processing
    # text = sys.argv[2]
    ### For URL request
    text = flask.request.args['query']
    logging.info('ENCODING : {}'.format(flask.request.args['query']))
    li = [text]

    x_raw = li
    x_test = [data_helper.clean_str(x) for x in x_raw]
    logging.info('The number of x_test: {}'.format(len(x_test)))
    y_test = None

    vocab_path = os.path.join(checkpoint_dir, "vocab.pickle")
    vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
    x_test = np.array(list(vocab_processor.transform(x_test)))

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            input_x = graph.get_operation_by_name("input_x").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
            predictions = graph.get_operation_by_name("output/predictions").outputs[0]

            batches = data_helper.batch_iter(list(x_test), params['batch_size'], 1, shuffle=False)
            all_predictions = []
            for x_test_batch in batches:
                batch_predictions = sess.run(predictions, {input_x: x_test_batch, dropout_keep_prob: 1.0})
                all_predictions = np.concatenate([all_predictions, batch_predictions])

    # Save the actual labels back to file
    actual_labels = [labels[int(prediction)] for prediction in all_predictions]
    logging.info('RESULT : {}'.format(actual_labels))
    print(actual_labels[0])
    result = {'result': actual_labels[0]}
    return flask.jsonify(result)
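# The function above reads flask.request and returns flask.jsonify(...), so it is
# evidently meant to run inside a Flask request context. A minimal wiring sketch
# (the route name and port are assumptions, not from the original code):
import flask

app = flask.Flask(__name__)

@app.route('/predict')
def predict_route():
    # e.g. GET /predict?query=some+text
    return console_predict()

if __name__ == '__main__':
    app.run(port=5000)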