-
Notifications
You must be signed in to change notification settings - Fork 4
/
process_wiki_dump.py
205 lines (175 loc) · 7.38 KB
/
process_wiki_dump.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
import time
import os
import sys
import logging
import itertools
import gc
import numpy as np
import gensim
from gensim.utils import smart_open, simple_preprocess
from gensim.corpora.wikicorpus import _extract_pages, filter_wiki
from gensim.parsing.preprocessing import STOPWORDS
# Console logging for the whole process: level name and message only.
logging.basicConfig(level=logging.INFO, format='%(levelname)s : %(message)s')
# IPython sometimes messes up the logging setup; force the root level back to INFO.
logging.root.level = logging.INFO
# Record layout shared by the module-level file loggers created further down.
FORMAT = '%(asctime)-15s :: %(levelname)s :: %(name)s :: %(message)s'
formatter = logging.Formatter(fmt=FORMAT)
class ProcessWiki(object):
    """Three-pass LDA topic pipeline over a Wikipedia XML dump.

    Pass 1 (`prepare_dictionary_from_docs`) builds the token dictionary,
    pass 2 (`update_lda_model`) trains the LDA model and pass 3
    (`print_document_topics`) logs the strongest topic of every article.
    The dictionary, the model and the log file live under ``/data/logs`` and
    are tagged ``distributed`` or ``normal`` according to the run type.
    """

    # "Namespace:" title prefixes of meta pages that are not real articles.
    IGNORED_PREFIXES = tuple(
        ns + ':' for ns in
        'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split())
    # Articles with fewer tokens than this (after stop-word removal) are skipped.
    MIN_TOKENS = 50
    # Number of bag-of-words documents handed to each `lda.update` call.
    LDA_BATCH_SIZE = 5

    def __init__(self, dump_file, distributed):
        """Set up paths, the file logger and the (possibly pre-built) dictionary.

        Args:
            dump_file: path of the Wikipedia dump, opened with `smart_open`.
            distributed: truthy to train the LDA model in gensim's distributed
                mode; also selects the ``distributed``/``normal`` file tag.
        """
        self.dump_file = dump_file
        self.distributed = distributed
        # Upper bound on the number of articles fed to the LDA training pass.
        self.clip_docs = 5
        self.lda = None

        vt = 'distributed' if distributed else 'normal'
        self.DICT_PATH = "/data/logs/wiki_dump_{0}.dict".format(vt)
        self.MODEL_PATH = "/data/logs/wiki_dump_{0}.lda".format(vt)
        log_file = '/data/logs/wiki_dump_{0}.log'.format(vt)

        self.logger = logging.getLogger('wiki_log')
        self.logger.setLevel(logging.DEBUG)
        # 'wiki_log' is a process-wide logger: attach the file handler only
        # once, otherwise every extra instance duplicates each log line.
        target = os.path.abspath(log_file)
        already_attached = any(
            getattr(handler, 'baseFilename', None) == target
            for handler in self.logger.handlers)
        if not already_attached:
            ch = logging.FileHandler(log_file)
            ch.setLevel(logging.DEBUG)
            ch.setFormatter(formatter)
            self.logger.addHandler(ch)
        self.logger.info("Tag wiki initialized")

        # Reuse a dictionary saved by an earlier run, else start empty.
        if os.path.exists(self.DICT_PATH):
            self.dictionary = gensim.corpora.Dictionary.load(self.DICT_PATH)
        else:
            self.dictionary = gensim.corpora.Dictionary()

    @classmethod
    def _is_meta_title(cls, title):
        """Return True when `title` starts with one of the ignored namespaces."""
        return title.startswith(cls.IGNORED_PREFIXES)

    @staticmethod
    def _batched(items, size):
        """Yield consecutive lists of at most `size` items; the last may be shorter."""
        batch = []
        for item in items:
            batch.append(item)
            if len(batch) >= size:
                yield batch
                batch = []
        if batch:
            yield batch

    def iter_wiki(self):
        """Yield each article from the Wikipedia dump, as a `(title, tokens)` 2-tuple.

        Meta pages (see `IGNORED_PREFIXES`) and articles shorter than
        `MIN_TOKENS` tokens are skipped.
        """
        stream = smart_open(self.dump_file)
        try:
            for title, text, pageid in _extract_pages(stream):
                # Cheap title test first: no point cleaning and tokenising a
                # page that is going to be thrown away anyway.
                if self._is_meta_title(title):
                    continue
                tokens = [token for token in simple_preprocess(filter_wiki(text))
                          if token not in STOPWORDS]
                if len(tokens) < self.MIN_TOKENS:
                    continue  # ignore short articles
                yield title, tokens
        finally:
            # Runs on exhaustion and when a partially consumed generator is
            # closed (e.g. after `itertools.islice`), so the dump is released.
            stream.close()

    def _init_lda(self):
        """Create a fresh LDA model bound to `self.dictionary`.

        Must be called only after the dictionary is prepared (built in pass 1
        or loaded from `DICT_PATH`). Reloading a previously saved model from
        `MODEL_PATH` was disabled in the original code and stays disabled.
        """
        # chunksize determines the number of documents to be processed in a worker.
        self.lda = gensim.models.ldamodel.LdaModel(
            corpus=None, id2word=self.dictionary, num_topics=100,
            update_every=1, chunksize=10000, passes=1, distributed=self.distributed)

    # Pass 1: Prepare Dictionary
    def prepare_dictionary_from_docs(self):
        """Build the dictionary from every article and save it to `DICT_PATH`.

        Does nothing when `DICT_PATH` already exists. Always returns True.
        """
        if os.path.exists(self.DICT_PATH):
            return True
        self.logger.info("START PREPARING DICT")
        for title, tokens in self.iter_wiki():
            # Lazy %-formatting: a non-ASCII title can no longer raise while
            # the message is built and cause the article to be skipped.
            self.logger.info("dict update %s", title)
            try:
                self.dictionary.add_documents([tokens])
            except UnicodeError:
                self.logger.warning("dict update failed for %r", title)
        # Save once, after the full scan: saving per article rewrote the file
        # for every document and left a partial dictionary behind on a crash,
        # which the existence check above would then treat as complete.
        self.dictionary.save(self.DICT_PATH)
        return True

    def _iter_bows(self, docs):
        """Yield the bag-of-words vector of each `(title, tokens)` pair in `docs`."""
        for title, tokens in docs:
            self.logger.info("updating lda: %s", title)
            try:
                bow = self.dictionary.doc2bow(tokens)
            except UnicodeError:
                self.logger.info("PROCESSING FAILED!")
                continue
            yield bow

    # Pass 2: Process topics
    def update_lda_model(self):
        """Train a fresh LDA model on the first `clip_docs` articles and save it.

        Documents are fed to `lda.update` in batches of `LDA_BATCH_SIZE`,
        including the final partial batch. Always returns True.
        """
        self.logger.info("START UPDATING LDA")
        self._init_lda()
        docs = itertools.islice(self.iter_wiki(), self.clip_docs)
        # The previous counter-based loop flushed only on the 6th document and
        # never flushed the remainder, so with clip_docs == 5 the model was
        # saved without ever being updated.
        for batch in self._batched(self._iter_bows(docs), self.LDA_BATCH_SIZE):
            self.lda.update(batch)
        self.lda.save(self.MODEL_PATH)
        return True

    # Pass 3: Print topic for each document
    def print_document_topics(self):
        """Log ``title :: topic`` for the strongest topic of every article.

        Returns True.

        Raises:
            RuntimeError: if no model is available (`update_lda_model` has not
                been run on this instance).
        """
        if self.lda is None:
            raise RuntimeError("LDA model not initialised; run update_lda_model() first")
        self.logger.info("START PRINTING DOCUMENTS")
        for title, tokens in self.iter_wiki():
            try:
                bow = self.dictionary.doc2bow(tokens)
                topics = self.lda[bow]
                if not topics:
                    # gensim omits topics below the model's minimum
                    # probability, so the list can legitimately be empty.
                    self.logger.info("%s :: no topic above the probability threshold", title)
                    continue
                # max() returns the first of equally probable topics, exactly
                # like taking element 0 of a stable descending sort.
                best_topic_id = max(topics, key=lambda pair: pair[1])[0]
                topic = self.lda.print_topic(best_topic_id)
                self.logger.info("%s :: %s\n", title, topic)
            except UnicodeError:
                self.logger.warning("could not print topic for %r", title)
        return True
def main(wiki_path, run_type):
    """Run the three processing passes over a dump and log wall-clock timings.

    Args:
        wiki_path: path of the Wikipedia dump, handed to `ProcessWiki`.
        run_type: ``'true'`` for distributed LDA, ``'false'`` for a
            single-machine run (case-insensitive).

    Any other `run_type` writes ``Invalid input`` to stderr and exits with
    status 1, so that calling scripts can detect the failure.
    """
    mode = run_type.lower()
    if mode not in ('true', 'false'):
        sys.stderr.write('Invalid input\n')
        sys.exit(1)

    distributed = mode == 'true'
    fn = '/data/logs/wiki_dump_module_{0}.log'.format(
        'distributed' if distributed else 'normal')

    # No logging.basicConfig(filename=fn, ...) here: the root logger is already
    # configured when this module is imported, which makes a second
    # basicConfig call a no-op. The file handler below does the actual work.
    module_logger = logging.getLogger('wiki_module_logger')
    module_logger.setLevel(logging.DEBUG)
    # set file handler
    fh = logging.FileHandler(fn)
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    module_logger.addHandler(fh)

    start_time = time.time()
    module_logger.info("START TIME :{0}".format(start_time))
    wiki = ProcessWiki(wiki_path, distributed)

    # PASS 1
    wiki.prepare_dictionary_from_docs()
    dict_prepare_time = time.time()
    module_logger.info("TIME AFTER DICTIONARY PREPARATION :{0}".format(dict_prepare_time))

    # PASS 2
    wiki.update_lda_model()
    first_pass = time.time()
    module_logger.info("TIME AFTER FIRST PASS :{0}".format(first_pass))

    # PASS 3
    wiki.print_document_topics()
    second_pass = time.time()
    module_logger.info("TIME AFTER DOC PRINT :{0}".format(second_pass))

    # Total is reported in minutes.
    total_time = (second_pass - start_time) / 60
    module_logger.info("TOTAL TIME ELAPSED :{0}".format(total_time))
def create_wiki_dict(wiki_path, run_type):
    """Build a dictionary from the whole dump with `WikiCorpus` and save it.

    Args:
        wiki_path: path of the Wikipedia dump, passed to `WikiCorpus`.
        run_type: unused by this function.

    Progress is logged to ``/data/logs/create_wiki_dict.log``; the dictionary
    is written to ``/data/logs/wiki_dump_dict.dict``.
    """
    from gensim.corpora.wikicorpus import WikiCorpus

    log_path = '/data/logs/create_wiki_dict.log'
    logging.basicConfig(filename=log_path, level=logging.DEBUG, format=FORMAT)

    # File handler for the module logger.
    file_handler = logging.FileHandler(log_path)
    file_handler.setFormatter(formatter)
    file_handler.setLevel(logging.DEBUG)

    log = logging.getLogger('wiki_module_logger')
    log.setLevel(logging.DEBUG)
    log.addHandler(file_handler)

    log.info("START")
    # This will take many hours! Output is Wikipedia in bag-of-words (BOW)
    # sparse-matrix form.
    corpus = WikiCorpus(wiki_path)
    log.info("Wiki corpus ready")
    corpus.dictionary.save("/data/logs/wiki_dump_dict.dict")
    log.info("Dictionary Created")
if __name__ == '__main__':
    # Command line: <script> <wiki_dump_path> <true|false>
    # Fail with a usage message instead of a bare IndexError when arguments
    # are missing. Extra arguments are ignored, as before.
    if len(sys.argv) < 3:
        sys.stderr.write(
            'usage: {0} <wiki_dump_path> <true|false>\n'.format(sys.argv[0]))
        sys.exit(2)
    wiki_path = sys.argv[1]
    run_type = sys.argv[2]
    main(wiki_path, run_type)
    # Alternative entry point that only builds the dictionary via WikiCorpus:
    #create_wiki_dict(wiki_path, run_type)