'''
This module acts as the indexer: it contains all of the index-building
steps and persists the results to a MySQL database.
There are five tables:
Doc       - [doc_id], [url], [brief website description]
TokenT    - [unique tokens]
Web_index - [token], [doc_id], [tf], [wt]
Ranking   - [token], [doc_id], [tf-idf]
Normalize - [token], [doc_id], [normalized tf-idf]
'''
from Corpus import Corpus
from Tokenizer import Tokenizer
import mysql.connector
from math import log10, sqrt
from bs4 import BeautifulSoup
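# The CREATE TABLE statements are not part of this file; the sketch below
# is a guess at a minimal schema, with table and column names taken from
# the queries in this module (`id`, `description`, `tf_idf`, `nmlz`) and
# the column types and sizes assumed from how they are used.
#
# CREATE TABLE doc (
#     id          INT AUTO_INCREMENT PRIMARY KEY,
#     url         VARCHAR(1000),
#     description TEXT
# );
# CREATE TABLE tokenT (
#     token VARCHAR(255) PRIMARY KEY   -- unique key needed for INSERT IGNORE
# );
# CREATE TABLE web_index (
#     token  VARCHAR(255),
#     doc_id INT,
#     tf     INT,
#     wt     FLOAT
# );
# CREATE TABLE ranking (
#     token  VARCHAR(255),
#     doc_id INT,
#     tf_idf DOUBLE
# );
# CREATE TABLE normalize (
#     token  VARCHAR(255),
#     doc_id INT,
#     nmlz   DOUBLE
# );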
class Indexer():
    def __init__(self):
        self.mydb = mysql.connector.connect(
            host="localhost",
            user="root",
            password="mf15837165780",
            database="web",
            buffered=True,
            auth_plugin='mysql_native_password'
        )
        self.mycursor = self.mydb.cursor()
        # Total number of documents in the corpus, used as N in idf = log10(N/df).
        self.total = 36265
    def build_index(self):
        '''
        This function builds the inverted index: it inserts each URL into
        the doc table (which assigns it a doc_id), inserts each token into
        the tokenT table, and inserts (token, doc_id, term frequency,
        weight) rows into the web_index table.
        '''
        c = Corpus()
        t = Tokenizer()
        for url, name in c.get_file_name():
            if len(url) > 1000:
                continue
            # tokenize() is expected to return a dict mapping
            # token -> (term frequency, weight).
            result = t.tokenize(name)
            if len(result) == 0:
                continue
            print(url)
            # Insert the URL into the doc table; the AUTO_INCREMENT id of
            # the new row becomes this document's doc_id.
            sql = "INSERT INTO web.doc(url) values (%s)"
            val = (url,)
            self.mycursor.execute(sql, val)
            self.mydb.commit()
            print(self.mycursor.rowcount, "was inserted in DOC.")
            doc_id = self.mycursor.lastrowid
            print("DOC_ID IS " + str(doc_id))
            # Insert (token, doc_id, tf, wt) rows into web_index.
            t_sql = "INSERT INTO web.web_index(token, doc_id, tf, wt) values (%s,%s,%s,%s)"
            t_val = []
            for token in result.keys():
                t_val.append((token, doc_id, result[token][0], result[token][1]))
            self.mycursor.executemany(t_sql, t_val)
            self.mydb.commit()
            print(self.mycursor.rowcount, "was inserted in WEB_INDEX.")
            # Insert each distinct token into tokenT; INSERT IGNORE skips
            # tokens that are already present.
            tq = "INSERT IGNORE INTO tokenT VALUES (%s)"
            tv = [(token,) for token in result.keys()]
            self.mycursor.executemany(tq, tv)
            self.mydb.commit()
            print("inserted " + str(len(tv)) + " tokens")
    def calculate(self, tf, df):
        '''
        This function calculates the tf-idf value from the tf and df given
        as parameters: (1 + log10(tf)) * log10(N / df), where N is the
        total number of documents in the corpus.
        '''
        result = 1.0 + log10(float(tf))
        result *= log10(self.total / df)
        return result
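    # A worked example of the weighting above (values chosen for
    # illustration): with tf = 3, df = 10, and N = 36265,
    #   (1 + log10(3)) * log10(36265 / 10)
    #   = 1.4771... * 3.5595...
    #   ≈ 5.258
    # so a term that is rare in the corpus but repeated within a document
    # scores higher than a term that appears almost everywhere.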
    def update(self):
        '''
        This function takes every (token, doc_id) pair, calculates its
        tf-idf, and inserts the result into the ranking table.
        '''
        # Take all the distinct tokens out of the database.
        self.mycursor.execute("select token from tokenT")
        myresult = self.mycursor.fetchall()
        for (token,) in myresult:
            print("**********Token is " + token + " ***********")
            # df = the number of documents the token appears in, i.e. the
            # number of web_index rows for that token.
            c_sql = "select count(*) from web_index where token=%s"
            c_val = (token,)
            self.mycursor.execute(c_sql, c_val)
            c_result = self.mycursor.fetchone()
            df = c_result[0]
            print("************* DF IS " + str(df))
            t_sql = "select token, doc_id, tf from web_index where token=%s"
            self.mycursor.execute(t_sql, c_val)
            t_list = self.mycursor.fetchall()
            u_val = []
            for tok, doc_id, tf in t_list:
                tfidf = self.calculate(tf, df)
                u_val.append((tok, doc_id, tfidf))
            u_sql = "insert into ranking(token, doc_id, tf_idf) values (%s, %s, %s)"
            self.mycursor.executemany(u_sql, u_val)
            self.mydb.commit()
            print(self.mycursor.rowcount, "was inserted in RANKING.")
    def normalize(self):
        '''
        This function calculates the normalized tf-idf of every
        (token, doc_id) pair and inserts it into the normalize table.
        '''
        # Get all the doc_ids, then all tf-idf values for each doc_id.
        self.mycursor.execute("select id from doc")
        myresult = self.mycursor.fetchall()
        for (doc_id,) in myresult:
            print("**********Doc ID is " + str(doc_id) + " ********")
            # Get all tf-idf rows for this document.
            tf_sql = "select * from ranking where doc_id = %s"
            tf_val = (doc_id,)
            self.mycursor.execute(tf_sql, tf_val)
            tf_result = self.mycursor.fetchall()
            # The document length is the Euclidean norm of its tf-idf vector.
            sqrsum = 0
            for row in tf_result:
                sqrsum += row[2] ** 2
            doc_length = sqrt(sqrsum)
            if doc_length == 0:
                # No ranking rows for this document; nothing to normalize.
                continue
            i_val = []
            for tok, d_id, tf_idf in tf_result:
                norm = tf_idf / doc_length
                i_val.append((tok, d_id, norm))
            i_sql = "insert into normalize(token, doc_id, nmlz) values(%s, %s, %s)"
            self.mycursor.executemany(i_sql, i_val)
            self.mydb.commit()
            print(self.mycursor.rowcount, "was inserted in NORMALIZE.")
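    # A worked example of the normalization above (values chosen for
    # illustration): a document whose tf-idf vector is (3.0, 4.0) has
    # length sqrt(3.0**2 + 4.0**2) = 5.0, so the stored normalized values
    # are 3.0/5.0 = 0.6 and 4.0/5.0 = 0.8. After this step every document
    # vector has unit length, so a dot product of normalized vectors is a
    # cosine similarity.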
    def get_description(self):
        '''
        This function fetches every URL, extracts its description text
        from the page, and updates the doc table with it.
        '''
        c = Corpus()
        self.mycursor.execute("select id,url from doc")
        myresult = self.mycursor.fetchall()
        for doc_id, url in myresult:
            name = c.url_to_dir(url)
            with open(name, "rb") as file:
                content = file.read()
            soup = BeautifulSoup(content, "lxml")
            # Prefer the page's own <meta name="description"> or
            # <meta name="keywords"> content.
            metas = soup.find_all("meta")
            result = ''
            for meta in metas:
                if ('content' in meta.attrs) and ('name' in meta.attrs) and \
                        ((meta.attrs['name'] == 'description') or (meta.attrs['name'] == 'keywords')):
                    result = " ".join(meta.attrs['content'].split())
            # If the HTML has no description/keywords meta tag, fall back
            # to the first heading, title, or bold text on the page.
            if result == '':
                script = soup.find(["h1", "h2", "h3", "h4", "h5", "strong", "title", "b"])
                if script:
                    temp = " ".join(script.text.split())
                    result += temp if len(temp) < 200 else ""
            print(result)
            i_sql = "update doc set description =%s where id = %s"
            i_val = (result, doc_id)
            self.mycursor.execute(i_sql, i_val)
            self.mydb.commit()
            print(self.mycursor.rowcount, "was updated in DOC, DOC ID IS " + str(doc_id))
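    # For example (illustrative HTML, not from the corpus):
    #   <meta name="description" content="  A   course   page  ">
    # yields the description "A course page" (whitespace collapsed), while
    # a page without such a tag falls back to its first heading/title/bold
    # text, provided that text is under 200 characters.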
if __name__ == "__main__":
    i = Indexer()
    i.get_description()
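    # Note: running this file as-is only fills in the descriptions. The
    # assumed full pipeline, inferred from the methods above, would be:
    #   i.build_index()   # populate doc, tokenT, and web_index
    #   i.update()        # compute tf-idf into ranking
    #   i.normalize()     # compute normalized tf-idf into normalize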