"""
Usage:
python extract_wiki_data.py --outfolder en.csv --lang en
Where outfolder is a folder containing files called unprocessed.csv and processed.csv.
The first will be tokens, i.e. a bag of words. The second will be a list of numeric data.
"""
import os, sys, inspect  # all of this is needed to put the Chinese segmenter (smallseg) on sys.path
cmd_subfolder = os.path.realpath(os.path.abspath(os.path.split(inspect.getfile( inspect.currentframe() ))[0]) + "/smallseg_0.6")
if cmd_subfolder not in sys.path:
sys.path.insert(0, cmd_subfolder)
from argparse import ArgumentParser
from collections import OrderedDict
from text.blob import TextBlob
from nltk.util import bigrams
from multiprocessing import Pool
from traceback import format_exc
import nltk.stem.snowball as snowball # Mod. Isaac [for foreign language parsing?]
from nltk.tokenize.regexp import WhitespaceTokenizer
from nltk.corpus import stopwords
from boto import connect_s3
import requests
import codecs
import traceback
#import wikipedia  # TODO: before full automation, use a better API. NOTE: get_wikipedia_title and get_wikipedia_desc_from_wikia_title below require this import.
import re
from re import compile as _Re
import string
from smallseg import SEG
# stops, tokenizer and stemmer (still global) were originally declared out here. I moved them into the main method
# so they could be tailored to the language.
code_to_language_name = {'en': u'english',
'fr': u'french',
'zh': u'chinese',
'de': u'german',
'ru': u'russian',
'pl': u'polish',
'ja': u'japanese',
'pt': u'portuguese',
'es': u'spanish'
}
MIN_WORD_LENGTH = 4  # the program changes this to 1 for Chinese and Japanese
stemmers = {}
tokenizers = {}
def get_stemmer(lang):
global stemmers
if lang not in stemmers:
stemmers[lang] = new_stemmer(lang)
return stemmers[lang]
def get_tokenizer(lang):
global tokenizers
    if lang not in tokenizers:
tokenizers[lang] = new_tokenizer(lang)
return tokenizers[lang]
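# Usage sketch (hypothetical session): both helpers memoize one object per language,
# so repeated lookups are cheap and return the same instance:
#   >>> get_stemmer('en') is get_stemmer('en')
#   True
#   >>> get_tokenizer('zh')   # a smallseg-backed ChineseTokenizer; see new_tokenizer below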
def get_wikipedia_title(wikia_title):
search_results = wikipedia.search(wikia_title)
if search_results == []:
return "NO_SEARCH_RESULTS_FOUND"
else:
return search_results[0]
def get_wikipedia_desc_from_wikia_title(wikia_title):
wikia_title = re.sub(" Wiki", "", wikia_title)
try:
wikia_title = get_wikipedia_title(wikia_title)
wikipedia_page = wikipedia.page(get_wikipedia_title(wikia_title))
#return wikipedia_page.summary.encode('utf-8').split('\n')[0]
return wikipedia_page.summary.encode('utf-8')
except Exception as e:
print "\nERROR with %s:"%wikia_title
print str(e).split('\n')[0]
print wikia_title
#Even the disambiguation desc. (which we assume the error to be) may
#have something like Violeta (TV) in it!!! We want that 'TV'!
disambig_string = str(e)#.decode('utf-8')
disambig_string = re.sub(wikia_title, "", disambig_string)
return disambig_string
#return wikipedia_page.summary
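# NOTE: the two Wikipedia helpers above depend on the `wikipedia` package, whose
# import is commented out near the top of this file. A hypothetical call:
#   get_wikipedia_desc_from_wikia_title(u'Stargate Wiki')   # strips " Wiki", then searches Wikipedia for u'Stargate'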
def get_args():
ap = ArgumentParser()
ap.add_argument(u'--num-processes', dest=u"num_processes", default=8, type=int)
    ap.add_argument(u'--solr-host', dest=u"solr_host", default=u"http://search-s10:8983")  # 8983 is Solr's default HTTP port
    ap.add_argument(u'--outfolder', dest=u'outfolder', default=u'wiki_features_data')  # modified by Isaac; alas, this default cannot account
    # for the language. Fortunately, no one needs to call this from the command line any more.
ap.add_argument(u'--s3dest', dest=u's3dest')
ap.add_argument(u'--lang', dest=u'lang', default = u'en')
return ap.parse_args()
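# Example invocation (folder name illustrative):
#   python extract_wiki_data.py --outfolder fr_features --lang fr --num-processes 4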
"""
def get_wiki_data(args):
# Gets wiki data as JSON docs for all English wikis with 50 or more articles (content pages).
fl = u'id,top_categories_mv_%s,hub_s,top_articles_mv_%s,description_txt,sitename_txt'%(LANGUAGE, LANGUAGE)
params = {u'fl': fl,
u'start': 0,
u'wt': u'json',
u'q': u'lang_s:%s AND articles_i:[50 TO *]'%LANGUAGE, #lang_s is not working properly
u'rows': 500}
data = []
while True:
response = requests.get(u'%s/solr/xwiki/select' % args.solr_host, params=params).json()
data += response[u'response'][u'docs']
if response[u'response'][u'numFound'] < params[u'rows'] + params[u'start']:
return OrderedDict([(d[u'id'], d) for d in data])
params[u'start'] += params[u'rows']
"""
def get_mainpage_text(args, wikis):
"""
Get mainpage text for each wiki
now modified to get other numerical features, as well as wikipedia features.
seems to be confusing and redundant to have this as a separate function than get_wikidata_from_list!
wikis, an ordered dict created by get_wiki_data, is modified so each value (itself a dict)
also contains the key-value pair ('main_page_text', [mainpage text])
:param wikis: our wiki data set
:type wikis:class:`collections.OrderedDict`
:return: OrderedDict of search docs, id to doc, with mainpage raw text added
:rtype:class:`collections.OrderedDict`
"""
    for i in range(0, len(wikis), 100):
        query = u'(%s) AND is_main_page:true' % u' OR '.join([u"wid:%s" % wid for wid in wikis.keys()[i:i+100]])
        fl = u'wid,html_%s,wikiviews_monthly,wam,lang' % LANGUAGE  # field list trimmed 29 July 2014
        #fl = u'wid,html_%s,wikiviews_monthly,wam,edits_i,videos_i,promoted_b,active_users_i,alldomains_mv_wd'%LANGUAGE  # TODO: this longer field list failed for unknown reasons
params = {u'wt': u'json',
u'start': 0,
u'rows': 100,
u'q': query,
u'fl': fl}
response = requests.get(u'%s/solr/main/select' % args.solr_host, params=params).json()
for result in response[u'response'][u'docs']:
if u'html_%s'%LANGUAGE in result: #added by Isaac. For some reason, French and Chinese had some entries without this key.
wikis[str(result[u'wid'])][u'main_page_text'] = result[u'html_%s'%LANGUAGE]
wikis[str(result[u'wid'])][u'wam'] = result[u'wam']
wikis[str(result[u'wid'])][u'wikiviews_monthly'] = result[u'wikiviews_monthly']
wikis[str(result[u'wid'])][u'lang'] = result[u'lang']
return wikis
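# Illustration (wids hypothetical): a batch of wikis.keys()[0:2] == ['831', '3035']
# produces the Solr query
#   (wid:831 OR wid:3035) AND is_main_page:true
# against the /solr/main/select endpoint, 100 wikis per request.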
def normalize(wordstring, lang):
    if lang is None:
        lang = 'en'
    global stops
    stemmer = get_stemmer(lang)
    tokenizer = get_tokenizer(lang)
    try:
        return [stemmer.stem(word.strip(string.punctuation)) for word in tokenizer.tokenize(wordstring.lower())
                if len(word) >= MIN_WORD_LENGTH and word not in stops]  # MIN_WORD_LENGTH added by Isaac to handle e.g. Japanese
    except Exception as e:  # IndexError was listed separately, but it is already a subclass of Exception
        print "Caught error (don't worry, dude):\n|''''''''''''''''''''''''''''''|"
        print traceback.format_exc()  # previously the formatted traceback was built and discarded; print it
        print e
        print wordstring
        print tokenizer.tokenize(wordstring.lower())
        print "|______________________________|"
        return [u'']  # so the calling function won't crash
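# Example (hypothetical output; the global `stops` must already be set, as done in
# extract_features_from_list_of_wids): lowercase, drop short words and stopwords,
# strip punctuation, then stem. Something like:
#   >>> normalize(u"The Amazing Stories!", 'en')
#   [u'amaz', u'stori']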
def wiki_to_feature(wiki):
"""
Specifically handles a single wiki document
NOTE!!!!! I,saac changed TOP_ART:, TOP_CAT:, DESC: and ORIGINAL_HUB: to
TOP_ART_, TOP_CAT_, DESC_ and ORIGINAL_HUB_ so they don't get split up in the tokenizer later on.
As it was before, these features were split by a tokenizer later on.
Also added Wikipedia features
:param wiki: dict for wiki fields
:type wiki: dict
:return: tuple with wiki id and list of feature strings
:rtype: tuple
"""
try:
lang = wiki.get(u'lang')
features = [] # strings representing categories, art, old hubs, etc
numeric_features = [] # wam, pg count, hub ct, etc ADDED BY ISAAC
bow = []
features += [u'ORIGINAL_HUB_%s' % wiki.get(u'hub_s', u'')]
features += [u'TOP_CAT_%s' % u'_'.join(normalize(c, lang)) for c in wiki.get(u'categories_mv_%s'%LANGUAGE, [])] # Added lang --Isaac
bow += [u"_".join(normalize(c, lang)) for c in wiki.get(u'categories_mv_%s'%LANGUAGE, [])] # added lang. --Isaac
features += [u'TOP_ART_%s' % u"_".join(normalize(a, lang)) for a in wiki.get(u'top_articles_mv_%s'%LANGUAGE, [])] # added lang. --Isaac
bow += [u"_".join(normalize(a, lang)) for a in wiki.get(u'top_articles_mv_%s'%LANGUAGE, [])]
desc_ngrams = [u"_".join(n) for grouping in
[bigrams(normalize(np, lang))
for np in TextBlob(wiki.get(u'description_txt', [u''])[0]).noun_phrases]
for n in grouping]
        # Note: splitting the descriptions is likely more meaningful, because unsplit they contain compound tokens such as book_seri.
        # Problem though: comic_book and book_seri would map to the same thing, as would book_seri and seri_star. Thus the full descs must also be added.
split_descs = [d for ngram in desc_ngrams for d in ngram.split("_")]
bow += desc_ngrams
features += [u'DESC_%s' % d for d in desc_ngrams]
#features += [u'SPLIT_DESC_%s' % d for d in split_descs]
bow += split_descs
if u'sitename_txt' in wiki:
bow += [u"_".join(b) for b in bigrams(normalize(wiki.get(u'sitename_txt')[0], lang))]
#mp_nps = TextBlob(wiki.get(u'main_page_text', u'')).noun_phrases
mp_nps = normalize(wiki.get(u'main_page_text', u''), lang)
#bow += [u"_".join(bg) for grouping in [bigrams(normalize(n)) for n in mp_nps] for bg in grouping]
bow += [u''.join(normalize(w, lang)) for words in [np.split(u" ") for np in mp_nps] for w in words]
#WIKIPEDIA FIRST PARAGRAPHS!
#wikipedia_desc = get_wikipedia_desc_from_wikia_title(wiki[u'sitename_txt'][0])
#wikipedia_desc = wikipedia_desc.decode("utf-8")
#mp_nps = TextBlob(wikipedia_desc).noun_phrases
#mp_nps = normalize(wikipedia_desc)
#bow += [u"_".join(bg) for grouping in [bigrams(normalize(n)) for n in mp_nps] for bg in grouping]
#bow += [u''.join(normalize(w)) for words in [np.split(u" ") for np in mp_nps] for w in words]
#EXTRALINGUISTIC NUMERIC FEATURES
numeric_features.append(wiki.get(u'wam'))
numeric_features.append(wiki.get(u'wikiviews_monthly'))
#Note: All wikis where the 'id' field is missing will be hashed to a 'garbage' field with None key.
return wiki.get(u'id'), (bow + features, numeric_features)
# return wiki[u'id'], bow + features # OLD
    except Exception as e:
        print "Caught error (don't worry, dude):\n|````````````````````````````|"
        print e, format_exc()
        print "|____________________________|"
        return None, None  # callers must filter out this (None, None) sentinel
#raise e
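# Return-shape sketch (values illustrative): a wiki id paired with
# (linguistic features + bag of words, numeric features), e.g.
#   (u'831', ([u'ORIGINAL_HUB_Gaming', u'TOP_CAT_video_gam', u'video_gam', ...],
#             [85.3, 123456]))
# On error it returns (None, None), which downstream writers should skip.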
def wikis_to_features(args, wikis):
"""
Turns wikis into a set of features
:param args:argparse namespace
:type args:class:`argparse.Namespace`
:param wikis: our wiki data set
:type wikis:class:`collections.OrderedDict`
:return: OrderedDict of features, id to featureset
:rtype:class:`collections.OrderedDict`
"""
p = Pool(processes=args.num_processes)
res = OrderedDict(p.map_async(wiki_to_feature, wikis.values()).get())
p.close()
p.join()
return res
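# Note: map_async(...).get() blocks until every worker finishes, fanning
# wiki_to_feature out across args.num_processes worker processes (default 8).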
def get_wiki_data_from_list(args, wid_list):
"""
A copy of get_wiki_data, except that this one gets data only from those articles specified by the id_list,
instead of the full cerca 20k gotten by get_wiki_data
"""
its = 0
step = 99
data = []
while True: # not getting the request all in one go because that can give a 413 error (request too large)
if its>=len(wid_list):
break
        query = u' OR '.join([u"id:%s" % wid for wid in wid_list[its:its+step]])
#fl = u'id,top_categories_mv_%s,hub_s,top_articles_mv_%s,description_txt,sitename_txt,wikiviews_monthly, wam'%(LANGUAGE, LANGUAGE)
#fl = u'id,top_categories_mv_%s,hub_s,top_articles_mv_%s,description_txt,sitename_txt'%(LANGUAGE, LANGUAGE) #commented out 29 July 2014
fl = u'id,categories_mv_%s,hub_s,top_articles_mv_%s,description_txt,sitename_txt'%(LANGUAGE, LANGUAGE)
        its += step
params = {u'fl': fl,
u'start': 0,
u'wt': u'json',
u'q': query,
u'rows': 500}
while True:
#response = requests.get(u'%s/solr/xwiki/select' % args.solr_host, params=params)
response = requests.get(u'%s/solr/xwiki/select' % args.solr_host, params=params).json()
data += response[u'response'][u'docs']
if response[u'response'][u'numFound'] < params[u'rows'] + params[u'start']:
break
params[u'start'] += params[u'rows']
return OrderedDict([(d[u'id'], d) for d in data])
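# Illustration (wids hypothetical): one outer iteration over wid_list[0:3] == [831, 3035, 113]
# issues q = u'id:831 OR id:3035 OR id:113' to /solr/xwiki/select, then the inner
# loop pages through the results 500 rows at a time until numFound is exhausted.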
def init_args(num_processes, solr_host, outfolder, s3dest):
"""
this rigamarole is so that this module can be called from command line and from scripts. calling from command line,
argparse makes an args object. This function simulates that.
"""
class ArgumentObject: # TODO: the default for outfolder cannot account for lang, so cannot be used. default should be nixed.
def __init__(self, num_processes=8, solr_host = u"http://search-s10:8983", outfolder = u'wiki_features_data', s3dest = None): #lang = u'en'):
self.num_processes = num_processes
self.solr_host = solr_host
self.outfolder = outfolder
self.s3dest = s3dest
return ArgumentObject(num_processes, solr_host, outfolder, s3dest)
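# e.g. (values illustrative) init_args(4, u'http://search-s10:8983', u'en_features', None)
# mimics the namespace that get_args() would build from the command line.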
def new_tokenizer(lang):
class ChineseTokenizer(): # a wrapper class to support the duck typing
def __init__(self):
self._seg = SEG()
def tokenize(self, text):
wlist = self._seg.cut(text)
            #wlist.reverse()  # unneeded under the bag-of-words assumption
return wlist
class UnicodeTokenizer():
        def __init__(self):
            # split text into single characters, keeping UTF-16 surrogate pairs together
            self._splitter = _Re( '(?s)((?:[\ud800-\udbff][\udc00-\udfff])|.)' ).split
def tokenize(self, text):
return [ch for ch in self._splitter(text) if ch]
if lang == "zh":
#return UnicodeTokenizer()
return ChineseTokenizer()
if lang == "ja":
#return ChineseTokenizer()
return UnicodeTokenizer()
else:
return WhitespaceTokenizer()
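# Behavior sketch (example strings hypothetical): Chinese text goes through the
# smallseg segmenter, Japanese is split into single characters, and everything
# else is split on whitespace:
#   >>> new_tokenizer('ja').tokenize(u'アニメ')        # [u'ア', u'ニ', u'メ']
#   >>> new_tokenizer('en').tokenize(u'foo bar baz')  # [u'foo', u'bar', u'baz']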
def new_stemmer(lang):
"""
returns a stemmer for the appropriate language, or a dummy object.
The dummy object simply returns an unstemmed word when the .stem(word)
method is called. This model actuall makes sense with an isolating language
like Chinese, but alas will do Polish and Japanese a disservice.
"""
class NonStemmer():
def __init__(self):
pass
def stem(self, word):
return word
return { #Yay python pseudo switch!
'en': snowball.EnglishStemmer(),
'es': snowball.SpanishStemmer(),
'fr': snowball.FrenchStemmer(),
'de': snowball.GermanStemmer(),
'pt': snowball.PortugueseStemmer(),
'ru': snowball.RussianStemmer()
}.get(lang, NonStemmer())
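# Fallback sketch: unsupported languages get the identity NonStemmer, e.g.
#   >>> new_stemmer('pl').stem(u'kota')    # u'kota' (unchanged)
#   >>> new_stemmer('en').stem(u'running') # u'run'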
def extract_features_from_list_of_wids(wid_list, num_processes=8, solr_host = u"http://search-s10:8983", outfolder = None, lang = u'en', s3dest = None):
"""
for each id in wid_list, extracts content from the corresponding wiki, parses it into meaningful
features and saves it to the specified file. A replacement for extract_features_and_write_to_file
(which contains the functionality from Robert's original script) which was grossly inefficient,
scraping around 20,000 wikis, when only 100-200 are needed per language (100x space storage), and
ommitting smaller wikis, which are useful so sample for randomization porpoises.
"""
#TODO: label outfolder such that it's clear that it's a microfolder of two files
if not outfolder:
outfolder = u"wiki_features_data_%s"%lang
args = init_args(num_processes, solr_host, outfolder, s3dest)
    global LANGUAGE
    LANGUAGE = lang  # a global, because otherwise it's a pain to make map_async accept multiple arguments :( --Isaac
    global stops
    stops = stopwords.words(code_to_language_name[LANGUAGE])  # added by Isaac; kept global, as Robert had it, but moved here so the language can be specified
    stops = [stop.decode('utf-8') for stop in stops]  # to handle e.g. Japanese --Isaac
if lang in [u'zh', u'ja']:
global MIN_WORD_LENGTH
MIN_WORD_LENGTH = 1
print u"set minium word length to 1, because we're dealing with %s here..."%code_to_language_name[lang]
# features: mapping of wid to (linguistic feats, numeric feats)
features = wikis_to_features(args, get_mainpage_text(args, get_wiki_data_from_list(args, wid_list)))
    #flname = args.s3dest if args.s3dest else args.outfile  # Robert wrote this line; its purpose is unclear
    # NUMERIC FEATURES TODO: delete the line above; make the output directory if needed
if not os.path.exists(args.outfolder): #note: this can fail in race conditions, but that's liable never to be a problem here....
os.makedirs(args.outfolder)
processed_flname = "%s/processed.csv"%args.outfolder # for numeric features
unprocessed_flname = "%s/unprocessed.csv"%args.outfolder # for linguistic features
    with codecs.open(unprocessed_flname, u'w', encoding=u'utf8') as unprocessed_fl, \
            codecs.open(processed_flname, u'w', encoding=u'utf8') as processed_fl:
        for wid, feats in features.items():  # loop variable renamed so it no longer shadows `features`
            if wid is None:
                continue  # wiki_to_feature returns (None, None) on error; skip those entries
            unprocessed_fl.write(u",".join([wid, u",".join(feats[0])]) + u"\n")
            processed_fl.write(u",".join([wid, u",".join([unicode(i) for i in feats[1]])]) + u"\n")
    if args.s3dest:  # optionally push a copy up to S3
        b = connect_s3().get_bucket(u'nlp-data')
        k = b.new_key(args.s3dest)  # get_key() returns None when the key doesn't exist yet; new_key() always yields a writable key
        k.set_contents_from_filename(args.s3dest)
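# Example call (wiki ids illustrative): writes <outfolder>/unprocessed.csv
# (bag-of-words / linguistic features) and <outfolder>/processed.csv (numeric features):
#   extract_features_from_list_of_wids([831, 3035], lang=u'en', outfolder=u'en_features')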
def main():
print "U better be running this as a test, because this just has some hardcoded list it's calling. Functionality\
of main() has been exported to the callable methods extract_features_from_list_of_wids(), or if you're crazy, extract_features_and_write_to_file()"
args = get_args()
extract_features_from_list_of_wids([737061, 163085, 7045, 113, 277, 832, 932], args.num_processes, args.solr_host, args.outfolder, args.lang, args.s3dest)
#get_wiki_data_from_list(args, [1221, 14316, 23556])
#extract_features_and_write_to_file(args.num_processes, args.solr_host, args.outfile, args.lang, args.s3dest)
"""
def main():
args = get_args()
features = wikis_to_features(args, get_mainpage_text(args, get_wiki_data(args)))
flname = args.s3dest if args.s3dest else args.outfile
with codecs.open(flname, u'w', encoding=u'utf8') as fl:
for wid, features in features.items():
line_for_writing = u",".join([wid, u",".join(features)]) + u"\n"
fl.write(line_for_writing)
if args.s3dest:
b = connect_s3().get_bucket(u'nlp-data')
k = b.get_key(args.s3dest)
k.set_contents_from_filename(args.s3dest)
def extract_features_and_write_to_file(num_processes=8, solr_host = u"http://search-s10:8983", outfile = u'wiki_data.csv', lang = u'en', s3dest = None):
#essentially a version of main() to be called from within a python script.
#once finished, main() will become a wrapper for this function.
#@Author: Isaac
#:param s3dest: I have no idea
print "WARNING !! Thus function is deprecated, and may be incompatible with other scripts in this module.\
For instance, \'outfile\' has been replaced with \'outfolder\', in order to account for numeric features.\
User is reccommended to familiarize him or herself with/modify the code."
class ArgumentObject:
def __init__(self, num_processes=8, solr_host = u"http://search-s10:8983", outfile = u'wiki_data.csv', s3dest = None): #lang = u'en'):
self.num_processes = num_processes
self.solr_host = solr_host
self.outfile = outfile
# self.lang = lang # deprecated in favor of global --Isaac
self.s3dest = s3dest
args = ArgumentObject(num_processes, solr_host, outfile, s3dest)
global LANGUAGE
LANGUAGE = lang # I use a global because otherwise it's a pain to make map_async accept multiple arguments :( --Isaac
global stops
stops = stopwords.words(code_to_language_name[LANGUAGE]) # Added --Isaac. kept global, as Robert had written it, but moved it here so the language could be specified.
features = wikis_to_features(args, get_mainpage_text(args, get_wiki_data(args)))
flname = args.s3dest if args.s3dest else args.outfile
with codecs.open(flname, u'w', encoding=u'utf8') as fl:
for wid, features in features.items():
line_for_writing = u",".join([wid, u",".join(features)]) + u"\n"
fl.write(line_for_writing)
if args.s3dest:
b = connect_s3().get_bucket(u'nlp-data')
k = b.get_key(args.s3dest)
k.set_contents_from_filename(args.s3dest)
"""
# pre-warm the English tokenizer and stemmer caches at import time
tokenizers['en'] = get_tokenizer('en')
stemmers['en'] = get_stemmer('en')
if __name__ == u'__main__':
main()