# VectorSpace_chinese.py
from pprint import pprint
from Parser import Parser
from textblob import TextBlob as tb
import util
import tfidf
import jieba
import nltk   #needed by getRelevenceVector for POS tagging
class VectorSpace:
    """ An algebraic model for representing text documents as vectors of identifiers.
    A document is represented as a vector. Each dimension of the vector corresponds to a
    separate term. If a term occurs in the document, then the value in the vector is non-zero.
    """
    #Collection of document term vectors
    documentVectors = {}
    #Mapping of vector index to keyword
    vectorKeywordIndex = []
    #Tidies terms
    parser = None
    def __init__(self, documents={}, weighting='tf'):
        self.documentVectors = {}
        self.parser = Parser()
        self.blobList = []
        if len(documents) > 0:
            self.build(documents, weighting)
    def build(self, documents, weighting):
        """ Create the vector space for the passed document strings """
        self.vectorKeywordIndex = self.getVectorKeywordIndex(list(documents.values()))
        for key, value in documents.items():
            self.blobList.append(tb(value))
            self.documentVectors[key] = self.makeVector(value, weighting)
        #print(self.vectorKeywordIndex)
        #print(self.documentVectors)
    def getVectorKeywordIndex(self, documentList):
        """ Create the keyword-to-position mapping that defines the dimensions of the document vectors """
        uniqueVocabularyList = []
        for document in documentList:
            #Segment each document with jieba and collect every distinct term
            seg_list = jieba.cut_for_search(document)
            for word in seg_list:
                if word not in uniqueVocabularyList:
                    uniqueVocabularyList.append(word)
        vectorIndex = {}
        offset = 0
        #Associate a position with each keyword; this position is the dimension of the vector used to represent that word
        for word in uniqueVocabularyList:
            vectorIndex[word] = offset
            offset += 1
        return vectorIndex  #(keyword:position)
    def makeVector(self, wordString, weighting):
        """ @pre: unique(vectorIndex) """
        #Initialise vector with 0's
        vector = [0] * len(self.vectorKeywordIndex)
        wordList = self.parser.tokenise(wordString)
        wordList = self.parser.removeStopWords(wordList)
        documentString = " ".join(wordList)
        blob = tb(documentString)
        ### term weighting
        for word in wordList:
            #Skip tokens that are not in the keyword index so unknown terms cannot raise a KeyError
            if word not in self.vectorKeywordIndex:
                continue
            if weighting == 'tf':
                vector[self.vectorKeywordIndex[word]] += 1 / len(wordList)  #normalised term frequency
                # vector[self.vectorKeywordIndex[word]] = tfidf.tf(word, blob)
            elif weighting == 'tfidf':
                vector[self.vectorKeywordIndex[word]] = tfidf.tfidf(word, blob, self.blobList)
        return vector
    def buildQueryVector(self, termList, weighting):
        """ Convert a query string into a term vector """
        query = self.makeVector(" ".join(termList), weighting)
        return query
    def related(self, documentId):
        """ Find documents that are related to the document indexed by the passed id within the document vectors """
        ratings = {}
        for key, value in self.documentVectors.items():
            rating = util.cosine(self.documentVectors[documentId], value)
            #ratings.sort(reverse=True)
            ratings[key] = rating
        return ratings
    def search(self, searchList, formula="cosine", weighting="tf"):
        """ Search for documents that match based on a list of terms """
        ratings = {}
        queryVector = self.buildQueryVector(searchList, weighting)
        for key, value in self.documentVectors.items():
            if formula == "cosine":
                rating = util.cosine(queryVector, value)
            elif formula == "euclidean":
                rating = util.euclidean(queryVector, value)
            ratings[key] = rating
        ratings = {k: v for k, v in sorted(ratings.items(), key=lambda item: item[1], reverse=True)}
        return ratings
    def relevence_search(self, searchVector, formula="cosine", weighting='tf'):
        """ Rank the documents against an externally supplied query vector (e.g. a relevance-feedback vector) """
        ratings = {}
        for key, value in self.documentVectors.items():
            if formula == "cosine":
                rating = util.cosine(searchVector, value)
            elif formula == "euclidean":
                rating = util.euclidean(searchVector, value)
            ratings[key] = rating
        ratings = {k: v for k, v in sorted(ratings.items(), key=lambda item: item[1], reverse=True)}
        return ratings
    def getRelevenceVector(self, wordString):
        """ Build a tf-idf feedback vector from the verbs and nouns of the passed text """
        wordList = self.parser.tokenise(wordString)
        wordList = self.parser.removeStopWords(wordList)
        feedbackWord = []
        pos_result = nltk.pos_tag(wordList)
        for word in pos_result:
            #Keep only verbs and nouns for the feedback vector
            if word[1] in ('VB', 'NN'):
                feedbackWord.append(word[0])
        weighting = "tfidf"
        feedbackVector = self.makeVector(" ".join(feedbackWord), weighting)
        return feedbackVector
if __name__ == '__main__':
    #test data
    documents = {"N1": "The cat in the hat disabled",
                 "N2": "A cat is a fine pet ponies.",
                 "N3": "Dogs and cats make good pets.",
                 "N4": "I haven't got a hat."}
    vectorSpace = VectorSpace(documents)
    #print(vectorSpace.vectorKeywordIndex)
    #print("document Vectors: ", vectorSpace.documentVectors)
    #print(vectorSpace.related("N1"))
    print(vectorSpace.search(["cat", "dog"]))
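    ### additional usage sketches ###
    #The lines below are illustrative only: they exercise the parameters this class already
    #exposes (weighting='tfidf', formula="euclidean", and the relevance-feedback helpers) and
    #assume util.euclidean, tfidf.tfidf and the NLTK POS-tagger data behave as the methods above expect.
    tfidfSpace = VectorSpace(documents, weighting='tfidf')            #tf-idf weighted document vectors
    print(tfidfSpace.search(["cat", "dog"], weighting='tfidf'))       #tf-idf weighted query
    print(vectorSpace.search(["cat", "dog"], formula="euclidean"))    #distance-based ranking
    feedbackVector = vectorSpace.getRelevenceVector(documents["N1"])  #feedback vector from one document
    print(vectorSpace.relevence_search(feedbackVector))               #re-rank documents against that vector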
###################################################