/
DataPreprocessorOld.py
158 lines (114 loc) · 4.06 KB
/
DataPreprocessorOld.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import numpy
#from sklearn.neighbors import KDTree
from sklearn.neighbors import LSHForest
from collections import Counter
import csv
import spacy
import codecs
# Number of categories used to describe the channel; Classify passes this
# value as n_neighbors to its nearest-neighbour search.
TOP_N_COUNT = 3
#from scipy import spatial
# ToDo: benchmark (1 - scipy.spatial.distance.cosine(one.vector, three.vector))
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    Args:
        vec1: First vector (any 1-D array-like accepted by ``numpy.dot``).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (norm(vec1) * norm(vec2))``. When either vector
        has zero norm the ratio is undefined; ``0.0`` is returned instead of
        the NaN (and RuntimeWarning) that the bare division would produce.
    """
    denominator = numpy.linalg.norm(vec1) * numpy.linalg.norm(vec2)
    if denominator == 0:
        # An all-zero vector has no direction, so report "no similarity".
        return 0.0
    return numpy.dot(vec1, vec2) / denominator
def GetVerticalsListFromFile(
        path='/home/pkonovalov/PycharmProjects/ChannelClassifier/Data/verticals.csv'):
    """Read category ("vertical") names from the first column of a CSV file.

    The first row is treated as a header and skipped. Every ``&`` character
    is removed from each name (surrounding whitespace is left untouched).
    Blank rows are ignored.

    Args:
        path: CSV file to read. Defaults to the path this function
            historically had hard-coded, so existing zero-argument calls
            behave as before.

    Returns:
        A list of the cleaned names, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline='' is what the csv module expects so that it can handle
    # line endings (including ones embedded in quoted fields) itself.
    with open(path, 'rt', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader, None)  # skip the headers
        # csv.reader yields [] for a blank line; indexing it would raise
        # IndexError, so such rows are filtered out.
        return [row[0].replace("&", "") for row in reader if row]
def GetVerticalsVectorsDict(nlp, verticals):
    """Build a ``{category name: vector}`` mapping.

    A category's vector is the sum of the vectors of the tokens that ``nlp``
    produces for the category name (with any ``&`` removed first).

    Args:
        nlp: Callable mapping a string to an iterable of tokens that expose
            ``.vector``; it must also expose ``nlp.vocab.vectors_length``.
        verticals: Iterable of category names.

    Returns:
        A dict mapping each category name to its own numpy array of length
        ``nlp.vocab.vectors_length``.
    """
    verticalsDict = {}
    for category in verticals:
        # The accumulator must be created per category: ``+=`` mutates the
        # array in place, so a single shared array would end up stored under
        # every key and all categories would share one (grand-total) vector.
        categoryVector = numpy.zeros(nlp.vocab.vectors_length)
        for word in nlp(category.replace("&", "")):
            categoryVector += word.vector
        verticalsDict[category] = categoryVector
    return verticalsDict
def Classify(nlp, keywords, categories):
    """Print the TOP_N_COUNT categories nearest to a bag of keywords.

    The keywords are embedded as the count-weighted sum of their individual
    ``nlp(word).vector`` values. That vector is then used as the query of an
    LSHForest fitted on the category vectors, and each neighbour found is
    printed together with its own distance.

    Args:
        nlp: Callable mapping a string to an object exposing ``.vector``; it
            must also expose ``nlp.vocab.vectors_length``.
        keywords: List of keyword strings (duplicates allowed).
        categories: Dict mapping category name to category vector.

    Returns:
        None. Results are written to stdout.
    """
    # NOTE(review): LSHForest was deprecated in scikit-learn 0.19 and removed
    # in 0.21 -- confirm the scikit-learn version this module is pinned to.

    # Collapse duplicate keywords so each distinct one is embedded only once.
    counterDict = Counter(keywords)
    sumVector = numpy.zeros(nlp.vocab.vectors_length)
    for word, repCount in counterDict.items():  # summing word vectors
        sumVector += nlp(word).vector * repCount

    # Temporary diagnostic: compare the vector of the joined text with the
    # manually summed vector.
    text = ' '.join(keywords)
    vec = nlp(text).vector
    sim = cosine_similarity(vec, sumVector)
    print("Sim: " + str(sim))

    # keys() and values() of an unmodified dict iterate in the same order,
    # so row i of catArray belongs to catKeys[i].
    catKeys = list(categories.keys())
    catArray = numpy.array(list(categories.values()))

    print("Creating LSHForest...")
    lshf = LSHForest(n_candidates=70, n_estimators=30, n_neighbors=TOP_N_COUNT)
    lshf.fit(catArray)
    print("LSHForest was created")

    print("Getting neighbors...")
    distances, indices = lshf.kneighbors(sumVector.reshape((1, -1)))
    print("Got neighbors.")

    # A single query row was submitted, so row 0 holds every neighbour.
    # Walk indices and distances together so each category is reported with
    # its own distance rather than the whole distance matrix.
    for curIndex, curDistance in zip(indices[0], distances[0]):
        print("Found category: " + str(catKeys[curIndex]))
        print("with distance: " + str(curDistance))
#
# def Process(nlp):
# words = nlp(u'BMW Mercedes Toyota Lexus Ford window number')
# car = nlp(u'car')
# motorcycle = nlp(u'motorcycle')
#
# shape = nlp.vocab.vectors_length
# sumVector = numpy.zeros(shape)
#
# for word in words:
# sumVector += word.vector
#
# sim1 = cosine_similarity(sumVector, car.vector)
# sim2 = cosine_similarity(sumVector, motorcycle.vector)
#
# print(sim1)
# print(sim2)
#X = numpy.array([[ 1., 2.], [ 10., 20]])
#tree = KDTree(X, leaf_size=2, metric='pyfunc', func=cosine_similarity)
#target = numpy.array([ 3.7]).reshape(-1, 1)
#target = numpy.array( [ 9., 11.])
#dist, ind = tree.query(target, k=1)
#print (dist) # distances to 3 closest neighbors
#print (ind)
#X = numpy.array([ 1., 2., 3., 4., 5.]).reshape(-1, 1)
#X1 = numpy.random.random((4, 1))
# tree = KDTree(X, leaf_size=2, metric='pyfunc', func=cosine_similarity)
#
# #target = numpy.array([ 3.7]).reshape(-1, 1)
#
# dist, ind = tree.query(target, k=1)
# print (dist) # distances to 3 closest neighbors
# print (ind)
# Load English tokenizer, tagger, parser, NER and word vectors
#nlp = spacy.load('en')
#Process(nlp)
# s1 = one.similarity(three)
# s1_test = 1 - spatial.distance.cosine(one.vector, three.vector)
# s1_test2 = cosine_similarity(numpy.float64(one.vector), numpy.float64(three.vector))
# s2 = two.similarity(three)
#
# sum1 = (s1 + s2)/2
#
# vecSum = one.vector + two.vector
# sum2 = 1 - spatial.distance.cosine(three.vector, vecSum)
#
#
#
# print("s1 = " + str(s1))
# print("s1_test = " + str(s1_test))
# print("s1_test2 = " + str(s1_test2))
#
# print("s2 = " + str(s2))
#
#
# print("Sum1 = " + str(sum1))
# print("Sum2 = " + str(sum2))
#sum2 = car.similarity()
#print(vecSum)
#print(oranges.similarity(oranges))