Ranked Retrieval Model.py
# -*- coding: utf-8 -*-
import sys
import math
from functools import reduce  # Python 3: reduce lives in functools
from textblob import TextBlob
from textblob import Word
from collections import defaultdict

# Field-name tokens that should never be indexed as content terms.
uselessTerm = ["username", "text", "tweetid"]
postings = defaultdict(dict)           # postings[term][tweetid] = term frequency in that tweet
document_frequency = defaultdict(int)  # document_frequency[term] = number of tweets containing term
document_lengths = defaultdict(int)    # document_lengths[tweetid] = number of tokens in the tweet
document_numbers = 0                   # total number of tweets; set after the corpus is loaded
avdl = 0                               # average tweet length; set by initialize_avdl()
def main():
    get_postings_dl()
    initialize_document_frequencies()
    initialize_avdl()
    # print(postings)
    # print(document_lengths)
    # print("Average tweet length: " + str(avdl))
    # You can change which scores do_search() uses to get results from a different
    # model, and write them to my_result_PLN.txt / my_result_BM25.txt etc.
    # result_name = "my_result_PLN_BM25.txt"
    # get_result(result_name)
    while True:
        do_search()
def tokenize_tweet(document):
    global uselessTerm
    document = document.lower()
    a = document.index("username")
    b = document.index("clusterno")
    c = document.rindex("tweetid") - 1
    d = document.rindex("errorcode")
    e = document.index("text")
    f = document.index("timestr") - 3
    # Keep only the three useful parts of each record: tweetid, username and tweet text.
    document = document[c:d] + document[a:b] + document[e:f]
    terms = TextBlob(document).words.singularize()
    result = []
    for word in terms:
        expected_str = Word(word)
        expected_str = expected_str.lemmatize("v")
        if expected_str not in uselessTerm:
            result.append(expected_str)
    return result
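# Hypothetical example of an input line (the real tweets.txt layout may differ),
# illustrating the field order the slicing in tokenize_tweet() relies on:
#   username: alice clusterno: 3 ... tweetid: 123456789 errorcode: 200 text: some tweet words timestr: Mon Jan 01 ...
# The slices keep the "tweetid ...", "username ..." and "text ..." segments; the
# field names themselves are dropped afterwards via uselessTerm.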
# This is the same as in the previous script: build the postings lists and per-tweet lengths.
def get_postings_dl():
    global postings, document_lengths
    f = open(
        r"C:\Users\Lenovo\Desktop\tweets.txt")
    lines = f.readlines()  # read the whole file
    for line in lines:
        line = tokenize_tweet(line)
        tweetid = line[0]  # remember the tweetid, then pop it off
        line.pop(0)
        document_lengths[tweetid] = len(line)  # document length = number of remaining tokens
        unique_terms = set(line)
        for te in unique_terms:
            postings[te][tweetid] = line.count(te)
    # Sorting postings lexicographically returns a list and loses the dict mapping,
    # so it stays commented out:
    # postings = sorted(postings.items(), key=lambda asd: asd[0], reverse=False)
    mylog = open(r"C:\Users\Lenovo\Desktop\Invertedtf.txt", mode='a', encoding='utf-8')
    print(postings, file=mylog)
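# Illustrative (made-up) contents after indexing two tiny tweets, to show the
# shape of the structures built above:
#   postings         == {'world': {'101': 1, '102': 2}, 'cup': {'101': 1}}
#   document_lengths == {'101': 5, '102': 7}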
def initialize_document_frequencies():
    global document_frequency, postings
    for term in postings:
        document_frequency[term] = len(postings[term])
    mylog2 = open(r"C:\Users\Lenovo\Desktop\Inverteddf.txt", mode='a', encoding='utf-8')
    print(document_frequency, file=mylog2)
# Compute the average document (tweet) length, and record the collection size.
def initialize_avdl():
    global document_lengths, avdl, document_numbers
    count = 0
    for twid in document_lengths:
        count += document_lengths[twid]
    avdl = count / len(document_lengths)
    # The number of documents is only known once the corpus has been read; setting it
    # here (instead of at module level, where it was always 0) keeps the idf term
    # log((N + 1) / df) from going negative.
    document_numbers = len(document_lengths)
# Normalise the input query text (lower-case, singularise, lemmatise verbs).
def token(doc):
    doc = doc.lower()
    terms = TextBlob(doc).words.singularize()
    result = []
    for word in terms:
        expected_str = Word(word)
        expected_str = expected_str.lemmatize("v")
        result.append(expected_str)
    return result
# Run every query from the query file and write the ranked tweetids to file_name.
# NOTE: do_search() as defined below takes no argument and reads from stdin, so this
# helper only works if do_search() is changed to accept a query string and return its
# result list.
def get_result(file_name):
    with open(file_name, 'w', encoding='utf-8') as f_out:
        Quaries = get_queries()
        qkeys = Quaries.keys()
        # Iterate over the query ids.
        for key in qkeys:
            q_result = do_search(Quaries[key])
            # Quaries[key] is the raw query string for that id.
            for tweetid in q_result:
                f_out.write(str(key) + ' ' + tweetid + '\n')
# queries maps each query id to its query string, built from the qrels file.
def get_queries():
    # Input: qrels2014.txt
    # Output: {query id: query string}; the string is tokenised later.
    # (Can be ignored for now if only interactive search is needed.)
    queries = {}
    keyid = 171
    fq = open(
        r"C:\Users\Lenovo\Desktop\qrels2014.txt")
    lines = fq.readlines()
    for line in lines:
        index1 = line.find("<query>")
        if index1 >= 0:
            # Take the text between <query> and </query>.
            index1 += 8
            index2 = line.find("</query>")
            # print(line[index1:index2])
            # Add it to the queries dict.
            queries[keyid] = line[index1:index2]
            keyid += 1
    return queries
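# Hypothetical example of a line get_queries() matches (the real qrels2014.txt
# markup may differ). Note that index1 += 8 skips len("<query>") + 1 characters,
# i.e. it assumes one extra character (such as a space) after the opening tag:
#   <query> world cup 2014 </query>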
# The two functions above run the queries from a given qrels file; they can simply be
# deleted if that batch mode is not needed.
def do_search():
    query = token(input("Search query >> "))
    result = []  # ranked list of tweetids returned for this query
    # query = token(query)
    if query == []:
        sys.exit()
    unique_query = set(query)
    # Instead of scoring every tweet, first collect the candidate tweetids: a tweet is
    # a candidate as soon as it contains at least one of the query terms.
    # (postings.get avoids creating empty defaultdict entries for unseen terms.)
    relevant_tweetids = Union([set(postings.get(term, {}).keys()) for term in unique_query])
    # print(relevant_tweetids)
    print("<<<<<Score(PLN)--Tweetid>>>>>")
    print("PLN found " + str(len(relevant_tweetids)) + " relevant tweets in total!")
    if not relevant_tweetids:
        print("No tweets matched any query terms for")
        print(query)
    # The scoring below is the core of this code.
    else:
        # PLN
        scores3 = sorted([(id, similarity_PLN(query, id)) for id in relevant_tweetids],
                         key=lambda x: x[1], reverse=True)
        # BM25
        # scores2 = sorted([(id, similarity_BM25(query, id))
        #                   for id in relevant_tweetids],
        #                   key=lambda x: x[1],
        #                   reverse=False)
        # PLN + BM25
        # scores3 = sorted([(id, similarity_BM25(query, id) + similarity_PLN(query, id))
        #                   for id in relevant_tweetids],
        #                   key=lambda x: x[1],
        #                   reverse=False)
        i = 1
        for (id, score) in scores3:
            if i <= 100:
                result.append(id)
                print(str(score) + ": " + id)
                i = i + 1
            else:
                break
        print("finished")
        # return result
        # for (id, score) in scores1:
        #     print(str(score) + ": " + id)
        #
        # print("<<<<<Score(BM25)--Tweetid>>>>>")
        # print("BM25 found " + str(len(scores2)) + " relevant tweets in total!")
        # for (id, score) in scores2:
        #     print(str(score) + ": " + id)
def similarity_PLN(query, id):
    global postings, avdl
    # Pivoted length-normalisation denominator: 1 - s + s * |d| / avdl, with s = 0.1.
    # (Only used by the commented-out variant below.)
    fenmu = 1 - 0.1 + 0.1 * (document_lengths[id] / avdl)
    similarity = 0.0
    unique_query = set(query)
    for term in unique_query:
        wtq = query.count(term) / len(query)  # query term weight
        if (term in postings) and (id in postings[term].keys()):
            # Document term weight: (1 + ln tf) * ln((N + 1) / df); accumulate over query terms.
            wtd = (1 + math.log(postings[term][id])) * math.log((document_numbers + 1) / document_frequency[term])
            similarity += wtq * wtd
            # With ln(1 + ln(C(w,d) + 1)) the relevance scores all came out negative and very small:
            # similarity += ((query.count(term)) * (math.log(math.log(postings[term][id] + 1) + 1)) * math.log(
            #     (document_numbers + 1) / document_frequency[term])) / fenmu
    return similarity
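# Score computed by similarity_PLN as written above (length normalisation is
# computed in `fenmu` but only applied in the commented-out variant):
#   score(q, d) = sum over query terms t of
#                 (c(t, q) / |q|) * (1 + ln c(t, d)) * ln((N + 1) / df(t))
# with c(t, d) the term frequency in the tweet, N = document_numbers and
# df(t) = document_frequency[t].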
#
def similarity_BM25(query, id):
    global postings, avdl
    # Length-normalisation denominator: 1 - b + b * |d| / avdl, with b = 0.2.
    fenmu = 1 - 0.2 + 0.2 * (document_lengths[id] / avdl)
    k = 1
    similarity = 0.00
    unique_query = set(query)
    for term in unique_query:
        if (term in postings) and (id in postings[term].keys()):
            C_wd = postings[term][id]
            # With ln(1 + ln(C(w,d) + 1)) the relevance scores all came out negative and very small.
            similarity += (query.count(term) * (k + 1) * C_wd * math.log(
                (document_numbers + 1) / document_frequency[term])) / (k * fenmu + C_wd)
    return similarity
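# Score computed by similarity_BM25 (a BM25-style weighting with k = 1, b = 0.2 and
# no extra query-frequency saturation):
#   score(q, d) = sum over query terms t of
#                 c(t, q) * ((k + 1) * c(t, d)) / (k * (1 - b + b * |d| / avdl) + c(t, d))
#                          * ln((N + 1) / df(t))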
def Union(sets):
    # Union of a list of sets (one candidate-tweetid set per query term).
    return reduce(set.union, sets)


if __name__ == "__main__":
    main()
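# Example interactive session (hypothetical output; the hard-coded file paths above
# must exist before running):
#   $ python "Ranked Retrieval Model.py"
#   Search query >> world cup
#   <<<<<Score(PLN)--Tweetid>>>>>
#   PLN found 42 relevant tweets in total!
#   3.81: 1234567890
#   ...
#   finished
#   Search query >>
# Entering an empty query exits the program.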