-
Notifications
You must be signed in to change notification settings - Fork 0
/
ClassificationInterpreterCustom1.py
111 lines (99 loc) · 5.55 KB
/
ClassificationInterpreterCustom1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from Vectorizer import Vectorizer
from Misc import Misc
from Storage import Storage
from SessionLogger import SessionLogger
from CategoryListHandler import CategoryListHandler
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import numpy as np
from SessionConfigReader import SessionConfigReader
class ClassificationInterpreterCustom1:
col_name_categories = 'categories'
new_col_name_cat_vec = 'categories vector'
col_name_class_out = 'classification output'
col_name_result = 'result'
one_word_cat = 'one word category'
ext_out_vecs = '_category_vectors'
ext_categorized = '_categorized'
similarity_function_key = 'similarity_function'
sim_func_cosine = 'cosine_similarity'
sim_func_eucl_dist = 'euclidean_distance'
# expects list of strings
# returns the first word from the first string element in the list
@staticmethod
def extract_one_word_cat(str_list):
if len(str_list) == 0:
return ''
else:
return Misc.extract_first_word(str_list[0])
# expects a pandas data frame, containing a category column
# creates an output vector for training from each entry in the category column from the data frame and stores it in a new column
# returns pandas data frame with added column for added output vectors
@staticmethod
def create_out_vectors(data_frame, col_name=col_name_categories, new_col_name=new_col_name_cat_vec, storage_level=0, storage_name=''):
df = data_frame.copy()
df[ClassificationInterpreterCustom1.one_word_cat] = df.apply(lambda x: ClassificationInterpreterCustom1.extract_one_word_cat(x[col_name]), axis=1)
vectorized_df = Vectorizer.vectorize(df, col_name=ClassificationInterpreterCustom1.one_word_cat, new_col_name=new_col_name, storage_level=0, log=0)
vectorized_df = vectorized_df.drop(columns=[ClassificationInterpreterCustom1.one_word_cat])
vectorized_df[new_col_name] = vectorized_df.apply(lambda x: (x[new_col_name]+1)/2, axis=1) # adjust to softmax codomain
log_text = 'Category vectors for classifier training have been created (' + str(len(data_frame.index)) + ' entries).'
if storage_level >= 1 and storage_name != '':
storage_name = storage_name + ClassificationInterpreterCustom1.ext_out_vecs
Storage.store_pd_frame(vectorized_df, storage_name)
log_text = log_text + ' Stored in \'' + storage_name + '\' (column: \'' + new_col_name + '\').'
SessionLogger.log(log_text)
return vectorized_df
# expects a vector and two lists with word strings and their vector representations
# returns the word with the highest similarity in relation to the vector
@staticmethod
def get_highest_similarity(vec, word_list, word_vec_list):
vec = vec*2-1 # adjust from softmax codomain
sim_func = SessionConfigReader.read_value(ClassificationInterpreterCustom1.similarity_function_key)
idx = 0
sim = 0
highest_word = ''
for word in word_list:
word_vec = word_vec_list[idx]
new_sim = 0
if word_vec is not None:
if sim_func == ClassificationInterpreterCustom1.sim_func_cosine:
new_sim = cosine_similarity(np.asarray([word_vec]), [vec])[0][0]
elif sim_func == ClassificationInterpreterCustom1.sim_func_eucl_dist:
new_sim = euclidean_distances([word_vec], [vec])[0][0]
if new_sim < 0:
new_sim = new_sim*-1
if new_sim > sim:
sim = new_sim
highest_word = word
idx = idx+1
return highest_word
# expects a pandas data frame, containing a column with classification output vectors
# creates a list of categories from each classification output vector and stores it in a new column
# returns pandas data frame with added column for added category lists
@staticmethod
def interpret_output(data_frame, col_name=col_name_class_out, new_col_name=col_name_result, storage_level=0, storage_name='', log=1):
df = data_frame.copy()
category_list = CategoryListHandler.read_categories()
category_vectors = Vectorizer.get_word_vectors(category_list)
df[new_col_name] = df.apply(lambda x: [ClassificationInterpreterCustom1.get_highest_similarity(x[col_name], category_list, category_vectors)], axis=1)
log_text = 'Categories have been determined (' + str(len(df.index)) + ' entries).'
if storage_level >= 1 and storage_name != '':
storage_name = storage_name + ClassificationInterpreterCustom1.ext_categorized
Storage.store_pd_frame(df, storage_name)
log_text = log_text + ' Stored in \'' + storage_name + '\' (column: \'' + new_col_name + '\').'
if log:
SessionLogger.log(log_text)
return df
# expects a pandas data frame, containing a column with categories and a column with interpreted classification outputs
# compares content from both columns and makes a summarizing statement about accuracy
# returns accuracy in percent
@staticmethod
def evaluate_output(data_frame, col_name_categories=col_name_categories, col_name_outputs=col_name_result):
idx = 0
matches = 0
for index, row in data_frame.iterrows():
category = ClassificationInterpreterCustom1.extract_one_word_cat(row[col_name_categories])
output = row[col_name_outputs][0]
if category == output:
matches = matches+1
idx = idx+1
return matches/idx