/
QueryTermsScoring.py
48 lines (35 loc) · 1.76 KB
/
QueryTermsScoring.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import Tokenizer
import numpy
class QueryTermsScoring:
    """Score query terms by how often they co-occur with each emotion class.

    Workflow: fit() loads the query-term lexicon for one emotion,
    labelQueryTerm() collects the class label of every tweet containing a
    known term, and getScore() turns a tokenized tweet into one relevance
    score per class.
    """

    def __init__(self):
        # Emotion name -> path of the text file listing that emotion's query terms.
        self.query_list = {
            "anger": r"D:\Thesis\Thesis-CE\Phyton Program\QueryTerms\anger.txt",
            "joy": r"D:\Thesis\Thesis-CE\Phyton Program\QueryTerms\joy.txt",
            "fear": r"D:\Thesis\Thesis-CE\Phyton Program\QueryTerms\fear.txt",
            "sadness": r"D:\Thesis\Thesis-CE\Phyton Program\QueryTerms\sadness.txt"
        }
        # term -> list of observed class labels (before scoring), then
        # term -> {class label: relative frequency} (after _scoreQueryTerms()).
        self.queryDict = dict()

    def fit(self, emotion: str):
        """Register every query term of *emotion* with an empty label list."""
        with open(self.query_list[emotion], 'r', errors="surrogateescape") as file:
            for line in file:
                self.queryDict[line.strip('\n')] = []

    def labelQueryTerm(self, tweetsList):
        """Collect tweet labels per query term, then compute term scores.

        Each tweet appears to be a sequence with the class label at index 1
        and the raw text at index 2 -- TODO confirm against the caller.
        """
        tokenizer = Tokenizer.Tokenizer()
        for tweet in tweetsList:
            for term in tokenizer.tokenize(tweet[2], 'simple'):
                # Dict membership is O(1); the original rebuilt a full key
                # list for every token, making this loop quadratic.
                if term in self.queryDict:
                    self.queryDict[term].append(tweet[1])
        self._scoreQueryTerms()

    def getScore(self, tokenizedTweet, noOfClasses=4):
        # in our work each tweet might be from one of the four possible classes
        """Return per-class scores for a tokenized tweet.

        scores[c] is the maximum relative frequency, over all query terms
        present in the tweet, with which a term co-occurred with class c.
        """
        scores = numpy.zeros(noOfClasses)
        for term in tokenizedTweet:
            if term in self.queryDict:
                # Index by the class label itself. The original enumerated a
                # compacted proportion list (numpy.unique drops absent
                # labels), so a term seen only with labels {0, 2} credited
                # label 2's score to class 1.
                for label, score in self.queryDict[term].items():
                    idx = int(label)  # assumes integer class ids 0..noOfClasses-1
                    scores[idx] = max(scores[idx], score)
        return scores.tolist()

    def _scoreQueryTerms(self):
        """Replace each term's label list with {label: relative frequency}."""
        for term, labels in self.queryDict.items():
            unique, counts = numpy.unique(numpy.array(labels), return_counts=True)
            total = counts.sum()
            # A term never observed in any tweet gets an empty mapping
            # instead of the original's silent empty-array division.
            self.queryDict[term] = (
                {int(lab): cnt / total for lab, cnt in zip(unique, counts)}
                if total else {}
            )