-
Notifications
You must be signed in to change notification settings - Fork 0
/
New_Topic_Scoring.py
241 lines (222 loc) · 9.82 KB
/
New_Topic_Scoring.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
import gensim
import nltk
import warnings
import re
from nltk.corpus import wordnet
from pywsd.lesk import adapted_lesk
import spacy
import numpy as np
from gensim.models import LdaModel, LsiModel
from gensim.corpora import Dictionary
from PyDictionary import PyDictionary
# Silence every warning category for the rest of the process.
warnings.filterwarnings('ignore')
# Module-level PyDictionary instance; none of the functions visible in this file read it.
dictionary = PyDictionary()
"""Compute similarity between two synsets
synset1.wup_similarity(synset2): Wu-Palmer Similarity: Return a score denoting how similar two word senses are,
based on the depth of the two senses in the taxonomy and that of their
Least Common Subsumer (most specific ancestor node)"""
def compute_similarity(synsets1, synsets2):
    """Return the mean best Wu-Palmer similarity from ``synsets1`` to ``synsets2``.

    For every synset in ``synsets1`` the highest ``wup_similarity`` against any
    synset in ``synsets2`` is taken (0 when no score could be computed), and
    those per-synset maxima are averaged.

    Args:
        synsets1: Sequence of synsets to score. ``None`` entries are skipped.
        synsets2: Indexable sequence of synsets to compare against. ``None``
            entries are skipped.

    Returns:
        The average best similarity as a float, or ``0.0`` when ``synsets1``
        contains no usable synset (instead of dividing by zero).
    """
    print(synsets1)
    print(synsets2)
    print("------------------")

    total, counted = 0.0, 0
    for synset in synsets1:
        if synset is None:
            # No word sense was found for this topic; nothing to compare.
            continue

        # Keep each score together with its position in synsets2 so the
        # position printed below really refers to synsets2 (wup_similarity
        # may return None, so a filtered list alone loses the position).
        scored = []
        for position, other in enumerate(synsets2):
            if other is None:
                continue
            similarity = synset.wup_similarity(other)
            if similarity is not None:
                scored.append((similarity, position))

        if scored:
            # max() returns the first maximum, matching list.index(max(...)).
            best_score, best_position = max(scored, key=lambda pair: pair[0])
            print(str(best_position) + " " + str(synsets2[best_position]))
            print([similarity for similarity, _ in scored])
        else:
            # Nothing comparable: this synset contributes a score of 0.
            best_score = 0
            print([0])

        total += best_score
        counted += 1

    if counted == 0:
        return 0.0
    # Average the per-synset best scores.
    return total / counted
def _keep_topic_candidates(words):
    """Return ``words`` minus those NLTK tags as VB/VBD/VBN/VBP/VBZ/VBG, JJ or RB."""
    # Verbs, adjectives and adverbs are dropped because topics are generally nouns.
    excluded_tags = {'VB', 'VBD', 'VBN', 'VBP', 'VBZ', 'VBG', 'JJ', 'RB'}
    return [word for word, pos in nltk.pos_tag(words) if pos not in excluded_tags]


# Getting topic from the text using Gensim LDA and LSI Topic Modeling Techniques. It also uses Spacy English model.
def get_topic(text):
    """Extract topic words from ``text`` that both the LDA and LSI models agree on.

    The text is tokenised and lemmatised with spaCy, bigrams are detected with
    gensim ``Phrases``, and a 10-topic LSI model and a 10-topic LDA model are
    fitted on the resulting single-document corpus. Candidate words are
    filtered by part of speech and intersected across the two models.

    Args:
        text: The raw text to analyse.

    Returns:
        A sorted list of topic words (each at least two characters long) that
        appear both as the top word of some LDA topic and among the LSI topic
        words. Sorted so the order does not depend on set iteration order.
    """
    np.random.seed(100)
    # NOTE(review): the 'en' shortcut name is not accepted by spaCy v3+;
    # confirm the installed spaCy version before changing the model name.
    nlp = spacy.load('en')
    my_stop_words = [u'say', u'\'s', u'Mr', u'be', u'said', u'says', u'saying', u'get']
    for stopword in my_stop_words:
        nlp.vocab[stopword].is_stop = True

    # Keep the lemma of every token that is not a newline, stop word,
    # punctuation mark or number-like token.
    article = [
        token.lemma_
        for token in nlp(text)
        if token.text != '\n'
        and not token.is_stop
        and not token.is_punct
        and not token.like_num
    ]
    # The whole text is treated as one document.
    texts = [article]

    # Getting bigrams out of words using gensim.
    bigram = gensim.models.Phrases(texts)
    texts = [bigram[line] for line in texts]

    # Creating the corpus. The local is named id2word so it does not shadow
    # the module-level ``dictionary`` object.
    id2word = Dictionary(texts)
    corpus = [id2word.doc2bow(doc) for doc in texts]

    # Applying LDA and LSI models.
    lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=id2word)
    ldamodel = LdaModel(corpus=corpus, num_topics=10, id2word=id2word)
    lsitopics = [[word for word, prob in topic] for topicid, topic in lsimodel.show_topics(formatted=False)]
    ldatopics = [[word for word, prob in topic] for topicid, topic in ldamodel.show_topics(formatted=False)]

    # LDA contributes only the first word of each topic; LSI contributes all
    # of its topic words.
    lda_candidates = _keep_topic_candidates([words[0] for words in ldatopics])
    lsi_candidates = _keep_topic_candidates([word for words in lsitopics for word in words])

    # Intersection of results from both models, dropping one-character words.
    agreed = set(lda_candidates) & set(lsi_candidates)
    return sorted(word for word in agreed if len(word) >= 2)
# It does all preprocessing for the suggested answer and returns the clean text along with the topics in that text
def get_suggested_answer_topics(path="demo1.txt"):
    """Read the suggested answer, normalise its whitespace and extract its topics.

    Args:
        path: UTF-8 text file holding the suggested answer. Defaults to
            ``demo1.txt``.

    Returns:
        A ``(topics, text)`` tuple: the list returned by ``get_topic`` and the
        cleaned text it was computed from.
    """
    # The context manager closes the file handle once the text is read.
    with open(path, encoding='utf8') as handle:
        que = handle.read().lstrip()
    # Collapse every run of whitespace (including newlines) into one space.
    que = re.sub(r'\s+', ' ', que)
    que_topics = get_topic(que)
    print("Suggested answer topics")
    print(que_topics)
    return que_topics, que
def get_student_answer_topics(path="demo2.txt"):
    """Read the student answer, normalise its whitespace and extract its topics.

    Args:
        path: UTF-8 text file holding the student answer. Defaults to
            ``demo2.txt``.

    Returns:
        A ``(topics, text)`` tuple: the list returned by ``get_topic`` and the
        cleaned text it was computed from.
    """
    # The context manager closes the file handle once the text is read.
    with open(path, encoding="utf8") as handle:
        text = handle.read().lstrip()
    # Collapse every run of whitespace (including newlines) into one space.
    text = re.sub(r'\s+', ' ', text)
    print(text)
    answer_topics = get_topic(text)
    print("student answer Topics")
    print(answer_topics)
    return answer_topics, text
def topic_match(que_topics, ans_topics, topic_match_length):
    """Score how well the student's topics cover the suggested-answer topics.

    ``match_percent`` is the share of suggested topics that were matched, e.g.
    4 matched out of 6 suggested topics gives (4 / 6) * 100 = 66.66. That
    value is then penalised depending on how the two topic lists compare.

    Args:
        que_topics: Topics of the suggested answer.
        ans_topics: Topics of the student answer.
        topic_match_length: Number of topics present in both lists.

    Returns:
        The penalised match score. ``100`` when both lists have the same
        length and every topic matched; ``0.0`` when ``que_topics`` is empty
        (there is nothing to match against).
    """
    if not que_topics:
        # Avoid dividing by zero when no suggested topics exist.
        return 0.0

    que_count = len(que_topics)
    ans_count = len(ans_topics)
    match_percent = (topic_match_length / que_count) * 100

    # Case 1: both lists have the same number of topics,
    # e.g. [1, 2, 3, 4] and [2, 3, 4, 5].
    if que_count == ans_count:
        # Sub case 1: every topic matched.
        if topic_match_length == que_count:
            return 100
        # Sub case 2: fewer than 40 % matched, penalty 0.3 of the match.
        # e.g. 1 of 4 matched: 25 - 25 * 0.3 = 17.5
        if match_percent < 40:
            return match_percent - match_percent * 0.3
        # Sub case 3: 40 % <= match < 50 %, penalty 0.25 of the match.
        if match_percent < 50:
            return match_percent - match_percent * 0.25
        # Sub case 4: 50 % <= match < 70 %, penalty 0.2 of the unmatched share.
        # e.g. 4 of 6 matched: 66.67 - (100 - 66.67) * 0.2 = 60.0
        if match_percent < 70:
            return match_percent - (100 - match_percent) * 0.2
        # Sub case 5: 70 % or more, penalty 0.1 of the unmatched share.
        return match_percent - (100 - match_percent) * 0.1

    # Case 2: more suggested topics than student topics.
    if que_count > ans_count:
        # Everything the student wrote is relevant: no penalty.
        if topic_match_length >= ans_count:
            return match_percent
        # Share of the student's topics that matched nothing.
        # e.g. suggested [1..6], student [1, 2, 8, 9]: match 33.33 %,
        # irrelevant 50 % -> 33.33 - 33.33 * 0.4 = 20.0
        irrelevant_percent = (ans_count - topic_match_length) / ans_count * 100
        if irrelevant_percent >= 60:
            return match_percent * 0.5
        if irrelevant_percent >= 50:
            return match_percent - match_percent * 0.4
        if irrelevant_percent >= 40:
            return match_percent - match_percent * 0.3
        return match_percent - match_percent * 0.1

    # Case 3: fewer suggested topics than student topics.
    # e.g. suggested [1, 2, 3, 4], student [1, 2, 3, 7, 8, 9]:
    # match 75 %, irrelevancy 50 % -> 75 - 50 * 0.3 = 60
    irrelevancy = (ans_count - topic_match_length) / ans_count * 100
    # The upper bounds of the middle buckets compare the truncated
    # percentage, as the original conditions did (so 40.5 still counts as 40).
    if irrelevancy <= 30:
        factor = 0.1
    elif int(irrelevancy) <= 40:
        factor = 0.2
    elif int(irrelevancy) <= 50:
        factor = 0.3
    else:
        factor = 0.4
    return match_percent - irrelevancy * factor
def get_similarity():
    """Score the student answer against the suggested answer and print the result.

    Combines two measures: the topic-overlap score from ``topic_match`` and a
    WordNet Wu-Palmer similarity (as a percentage) between the word senses of
    the topics, disambiguated with ``adapted_lesk``.

    Returns:
        The average of the topic score and the truncated similarity
        percentage, which is also printed.
    """
    que_topics, que = get_suggested_answer_topics()
    answer_topics, text = get_student_answer_topics()

    matched = len(set(que_topics) & set(answer_topics))
    print(str(matched) + " topics matched")

    # Calculating the score based on number of topics matched.
    topics_score = abs(topic_match(que_topics, answer_topics, matched))
    print("")
    print(topics_score)

    # Getting the appropriate sense of each topic from its text using Lesk
    # word-sense disambiguation. Topics with no sense (None) are dropped on
    # both sides, since they cannot be compared.
    synsets_que_topics = []
    for topic in que_topics:
        sense = adapted_lesk(que, topic, pos='n')
        if sense is not None:
            synsets_que_topics.append(sense)

    synsets_ans_topics = []
    for topic in answer_topics:
        sense = adapted_lesk(text, topic, pos='n')
        print(str(sense) + '..')
        if sense is not None:
            synsets_ans_topics.append(sense)

    print("Similarity Score")
    if synsets_que_topics:
        sim_score = compute_similarity(synsets_que_topics, synsets_ans_topics) * 100
    else:
        # No question topic has a usable sense: there is nothing to average.
        sim_score = 0.0
    print(sim_score)
    print("")

    # Parenthesised so both scores are averaged; without the parentheses only
    # the similarity score was halved.
    average_score = abs((topics_score + int(sim_score)) / 2)
    print("Average Score: " + str(average_score))
    return average_score
# Run the full scoring pipeline; this call is unguarded, so it also runs when the module is imported.
get_similarity()