# linkextractors.py
import Queue
import lxml.etree as etree
import lxml.html
from lxml.html.clean import Cleaner as html_clr
import lxml.html.soupparser as soup
from StringIO import StringIO
# NOTE: only the urlparse *module* is imported; importing both the module and
# the urlparse() function under the same name would shadow one another, so all
# call sites below use urlparse.urlparse()
import urlparse
import re
from multiprocessing import Pool, Process, Manager, current_process, TimeoutError
from multiprocessing import Queue as pQueue
from multiprocessing.managers import BaseManager
# Import the custom thread-pool
from threadpool import ThreadPool
import HTMLParser
import hashlib
# #### HACK ####
# NOTE: This is a hack for enabling multiprocessing.Pool to execute (bound)
# class methods. Ideally this pickling hack would be scoped to
# LinkExtractorPPool only, instead of being registered globally.
import types
import copy_reg


def _pickle_method(method):
    func_name = method.im_func.__name__
    # obj = method.im_self  # Use this only for classes that do not hold a
    #                       # multiprocessing.Pool themselves
    cls = method.im_class
    return _unpickle_method, (func_name, cls)


def _unpickle_method(func_name, cls):
    for cls in cls.mro():
        try:
            func = cls.__dict__[func_name]
        except KeyError:
            pass
        else:
            break
    return func.__get__(cls)

copy_reg.pickle(types.MethodType, _pickle_method, _unpickle_method)
# #### HACK ENDS HERE ####
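

# A minimal, hedged sketch (added for illustration; not part of the original
# module) of what the copy_reg registration above enables: without it, Python 2
# pickling rejects bound methods, so multiprocessing.Pool.apply_async() cannot
# ship them to worker processes. _PickleDemo and _pool_pickle_demo are
# hypothetical names. Note that the hack drops im_self, so it only suits
# methods that do not touch per-instance state.
class _PickleDemo(object):

    def double(self, x):
        return 2 * x


def _pool_pickle_demo():
    pool = Pool(2)
    # apply_async() pickles the bound method via _pickle_method() above
    result = pool.apply_async(_PickleDemo().double, (21,))
    print result.get(timeout=5)  # -> 42
    pool.close()
    pool.join()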
class XLinkExtractor(object):
    """ TO BE FIXED """

    def __init__(self, feed=True):
        # Thread pool for dispatching the parsing jobs
        if feed:
            self.__tpool = ThreadPool(100)
        # ETREE queue where the parsed xhtml trees are stored
        self.__etree_q = Queue.Queue()
        # Default queue returned by the iterator interface of this class
        self.__call_q = self.__etree_q
        # XPath objects for extracting the URL types
        self.__extract_site_urls = etree.XPath("/html/body//a/@href")
        self.__extract_media_urls = etree.XPath("//img/@src | //embed/@src | //source/@src")
        self.__extract_scripts_urls = etree.XPath("//script/@src")
        # Regular expression object for extracting <a href="..."> URLs
        self._url_href = re.compile('<a href="([^"]+)"')
    def __iter__(self):
        """Be careful when using the XLinkExtractor as an iterator"""
        return self

    def next(self):
        """Be careful: the class as 'Iterator' returns the etrees queue by
        default, or whichever queue the functions below select"""
        try:
            print "GET ITER"
            return self.__call_q.get(timeout=1)  # timeout may need tuning
        except Queue.Empty:
            print "EMPTY ITER"
            raise StopIteration
    def feed(self, xhtml):
        self.__tpool.dispatch(self._parseto_xtree, xhtml, self.__callback_chain)

    def l_feed(self, xhtml_l):
        if isinstance(xhtml_l, list):
            self.__tpool.map(self._parseto_xtree, self.__callback_chain, xhtml_l)
        else:
            raise Exception("XLinkExtractor.l_feed() Error: list() argument was expected")

    def __callback_chain(self, etree):
        # Minimal callback: store the parsed etree for the iterator interface
        self.__etree_q.put(etree)
    def all_links(self, etree):
        links = list()
        for link in etree.iterlinks():
            links.append(link)
        return links

    def sites_links(self, xhtml):
        url_l = self._url_href.findall(xhtml['xhtml_s'])
        for i, url in enumerate(url_l):
            prsd_url = urlparse.urlparse(url)
            if not prsd_url.netloc:
                url_l[i] = xhtml['base_url'] + url
        return url_l
    def __ret_q(self, q):
        """A callable for iterators to return the contents of Queues; True is
        (presumably) the sentinel value for iter(callable, sentinel)"""
        if q.empty():
            return True
        else:
            return q.get()

    def all_links_iter(self):
        try:
            etree = self.__etree_q.get(timeout=2)
        except Queue.Empty:
            raise StopIteration
        else:
            return etree.iterlinks()
    def _parseto_xtree(self, xhtml_s):
        if isinstance(xhtml_s, dict):
            base_url = xhtml_s.pop("base_url", None)
            resolve_base = xhtml_s.pop("resolve_base", True)
            clean_xhtml = xhtml_s.pop("clean_xhtml", False)
            xhtml_s = xhtml_s.pop("xhtml_s", None)
            assert xhtml_s,\
                "_parseto_xtree() Error: Dictionary with <None> xhtml source"
        elif isinstance(xhtml_s, str):
            clean_xhtml = False
            base_url = None
        else:
            raise Exception(
                "_parseto_xtree() Error: string or dictionary instance expected"
            )
        if clean_xhtml:
            # meta=False because we need the meta info
            xhtml_clr = html_clr(
                scripts=True, javascript=True, comments=True, style=True,
                links=True, meta=False, page_structure=False, processing_instructions=True,
                embedded=True, annoying_tags=True, remove_unknown_tags=True
            )
            xhtml_s = xhtml_clr.clean_html(xhtml_s)
        # The HTMLParser(s) should be defined in the thread (or process) where
        # lxml.html.parse is dispatched; recover mode and DTD download enabled
        htmlparser = lxml.html.HTMLParser(recover=True, no_network=False)
        # Now parse the XHTML source
        try:
            etree = lxml.html.parse(StringIO(xhtml_s), parser=htmlparser)
        except Exception as e:
            print("LinkExtractor Error: %s" % e)
            print("LinkExtractor: Now trying with the SOUP parser")
            try:
                etree = soup.parse(StringIO(xhtml_s))
            except Exception as e:
                raise Exception("LinkExtractor Error: %s" % e)
        if base_url:
            eroot = etree.getroot()
            try:
                eroot.make_links_absolute(base_url, resolve_base_href=resolve_base)
            except Exception as e:
                raise Exception(
                    "_parseto_xtree() while making links absolute Error: %s" % e
                )
        # Return the etree just created
        return etree
class LinkExtractor(object):

    def __init__(self, base_url=None, iterable=None):
        # Regular expression objects for extracting the URL/Link types.
        # NOTE: only the <a href> pattern existed originally; the media and
        # script patterns are minimal assumptions so that set_base() has an
        # extractor method to bind for every link type it references.
        self.__url_a_href = re.compile('<a href="([^"]+)"')
        self.__url_media_src = re.compile(r'<(?:img|embed|source|video|audio)[^>]*\ssrc="([^"]+)"')
        self.__url_script_src = re.compile(r'<script[^>]*\ssrc="([^"]+)"')
        # Set self.base_url and choose the proper functions to be used
        self.set_base(base_url)
        # The iterable argument should ONLY be used when the class instance is
        # used as an iterator
        if iterable:
            self.__iter = iterable.__iter__()
        else:
            self.__iter = None

    def __iter__(self):
        return self

    def next(self):
        return self.all_links(self.__iter.next())

    def set_base(self, base_url):
        # Bind the proper extractor functions and set a class-scope variable
        if base_url:
            self.base_url = base_url
            self.site_links = self.__site_links_base
            self.media_links = self.__media_links_base
            self.script_links = self.__script_links_base
            self.undefined_links = self.__undefined_links_base
        else:
            self.site_links = self.__site_links
            self.media_links = self.__media_links
            self.script_links = self.__script_links
            self.undefined_links = self.__undefined_links

    def all_links(self, xhtml):
        all_links = list()
        all_links.extend(self.site_links(xhtml))
        all_links.extend(self.media_links(xhtml))
        all_links.extend(self.script_links(xhtml))
        all_links.extend(self.undefined_links(xhtml))
        return all_links

    def __to_absolute(self, url_l):
        # Prefix URLs that lack a network location with self.base_url
        for i, url in enumerate(url_l):
            prsd_url = urlparse.urlparse(url)
            if not prsd_url.netloc:
                url_l[i] = self.base_url + url
        return url_l

    def __site_links(self, xhtml):
        return self.__url_a_href.findall(xhtml)

    def __site_links_base(self, xhtml):
        return self.__to_absolute(self.__site_links(xhtml))

    def __media_links(self, xhtml):
        return self.__url_media_src.findall(xhtml)

    def __media_links_base(self, xhtml):
        return self.__to_absolute(self.__media_links(xhtml))

    def __script_links(self, xhtml):
        return self.__url_script_src.findall(xhtml)

    def __script_links_base(self, xhtml):
        return self.__to_absolute(self.__script_links(xhtml))

    def __undefined_links(self, xhtml):
        # No pattern was defined for "undefined" links; return an empty list
        # so that all_links() keeps working
        return []

    def __undefined_links_base(self, xhtml):
        return self.__undefined_links(xhtml)
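
# A small hedged usage sketch (added for illustration; not part of the original
# module) for the plain, single-threaded LinkExtractor: relative hrefs get
# prefixed with base_url, absolute ones pass through. The HTML snippet and the
# _link_extractor_demo name are hypothetical.
def _link_extractor_demo():
    src = '<html><body><a href="/about">About</a>' \
          '<a href="http://other.example.org/">Other</a></body></html>'
    extractor = LinkExtractor(base_url="http://example.com")
    print extractor.site_links(src)
    # -> ['http://example.com/about', 'http://other.example.org/']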
class LinkExtractorTPool(XLinkExtractor):
    """ TO BE FIXED """
    # NOTE: _parseto_xtree() is inherited from XLinkExtractor

    def __init__(self, feed=True):
        # Thread pool
        if feed:
            self.__tpool = ThreadPool(100)
        # ETREE queue for the parsed xhtml(s) to be stored
        self.__etree_q = Queue.Queue()
        self.__site_links_q = Queue.Queue()
        self.__media_links_q = Queue.Queue()
        self.__scripts_links_q = Queue.Queue()
        self.__undefined_links_q = Queue.Queue()
        # Define a default queue to be returned by the iterator or callable
        # instance of this class
        self.__call_q = self.__etree_q
        # XPath objects for extracting the URL types
        self.__extract_site_urls = etree.XPath("/html/body//a/@href")
        self.__extract_media_urls = etree.XPath("//img/@src | //embed/@src | //source/@src")
        self.__extract_scripts_urls = etree.XPath("//script/@src")
        self._url_href = re.compile('<a href="([^"]+)"')
    def __iter__(self):
        """Be careful when using the LinkExtractorTPool as an iterator"""
        return self

    def next(self):
        """Be careful: the class as 'Iterator' returns the etrees queue by
        default, or the one selected by the *_iter() functions below"""
        try:
            print "GET ITER"
            return self.__call_q.get(timeout=1)  # timeout may need tuning
        except Queue.Empty:
            print "EMPTY ITER"
            raise StopIteration

    def __call__(self):
        """Be careful: the class as 'Callable' returns the etrees queue by
        default, or the one selected by the *_iter() functions below"""
        try:
            return self.__call_q.get(timeout=1)  # timeout may need tuning
        except Queue.Empty:
            return False
    def feed(self, xhtml):
        self.__tpool.dispatch(self._parseto_xtree, xhtml, self.__callback_chain)

    def l_feed(self, xhtml_l):
        if isinstance(xhtml_l, list):
            self.__tpool.map(self._parseto_xtree, self.__callback_chain, xhtml_l)
        else:
            raise Exception("LinkExtractorTPool.l_feed() Error: list() argument was expected")
    def __callback_chain(self, etree):
        # Put the etree into the etree-queue so all the URLs stay available
        self.__etree_q.put(etree)
        # Find links to other sites and put them in the queue
        site_links = self.__site_links(etree)
        if site_links:
            self.__site_links_q.put(site_links)
        # Find links of media and put them in the queue
        media_links = self.__media_links(etree)
        if media_links:
            self.__media_links_q.put(media_links)
        # Find links of scripts and put them in the queue
        script_links = self.__script_links(etree)
        if script_links:
            self.__scripts_links_q.put(script_links)
        undefined_links = self.__undefined_links(etree)
        if undefined_links:
            self.__undefined_links_q.put(undefined_links)

    def __site_links(self, etree):
        return self.__extract_site_urls(etree)

    def __media_links(self, etree):
        return self.__extract_media_urls(etree)

    def __script_links(self, etree):
        return self.__extract_scripts_urls(etree)

    def __undefined_links(self, etree):
        # No extractor was ever defined for "undefined" links; nothing is
        # collected here for now
        return []
    def all_links(self, etree):
        links = list()
        for link in etree.iterlinks():
            links.append(link)
        return links

    def sites_links(self, xhtml):
        url_l = self._url_href.findall(xhtml['xhtml_s'])
        for i, url in enumerate(url_l):
            # URLs carrying a fragment are reduced to the base URL itself
            if url.find('#') > -1:
                url = ""
            prsd_url = urlparse.urlparse(url)
            if not prsd_url.netloc:
                url_l[i] = xhtml['base_url'] + url
        return url_l
    def __ret_q(self, q):
        """A callable for iterators to return the contents of Queues; True is
        (presumably) the sentinel value for iter(callable, sentinel)"""
        if q.empty():
            return True
        else:
            return q.get()

    def all_links_iter(self):
        try:
            etree = self.__etree_q.get(timeout=2)
        except Queue.Empty:
            raise StopIteration
        else:
            return etree.iterlinks()
    def sites_links_iter(self):
        self.__call_q = self.__site_links_q
        return self

    def media_links_iter(self):
        self.__call_q = self.__media_links_q
        return self

    def scripts_links_iter(self):
        self.__call_q = self.__scripts_links_q
        return self

    def undefined_links_iter(self):
        self.__call_q = self.__undefined_links_q
        return self

    def close(self):
        self.__tpool.join_all()
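
# A hedged usage sketch (added for illustration; not part of the original
# module) for LinkExtractorTPool, assuming the custom threadpool.ThreadPool
# behaves as its call sites imply: dispatch() runs _parseto_xtree() in a worker
# thread and hands the resulting etree to the callback chain, which fills the
# per-type queues. The snippet and the _tpool_demo name are hypothetical.
def _tpool_demo():
    extractor = LinkExtractorTPool()
    extractor.feed({'xhtml_s': '<html><body><a href="/a">a</a></body></html>',
                    'base_url': 'http://example.com'})
    # Iterate over the site-links queue filled by the callback chain; the
    # one-second Queue.get() timeout in next() may need to be longer in practice
    for links in extractor.sites_links_iter():
        print links  # -> ['http://example.com/a']
    extractor.close()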
class LinkExtractorPPool(LinkExtractor):

    def __init__(self, base_url=None, size=3):
        LinkExtractor.__init__(self, base_url)
        # Process pool definition
        self.__pl_size = size
        self.__ppool = Pool(size)

    def __iter__(self):
        return self

    def next(self):
        try:
            print "GET ITER"
            return self.__all_links.get(timeout=1)  # timeout may need tuning
        except TimeoutError:
            print "EMPTY ITER"
            raise StopIteration

    def __call__(self):
        """Be careful: the class as 'Callable' returns the all-links result by
        default; note that AsyncResult.get() raises multiprocessing.TimeoutError,
        not Queue.Empty"""
        try:
            return self.__all_links.get(timeout=1)  # timeout may need tuning
        except TimeoutError:
            return False
    def all_feed(self, xhtml):
        self.__all_links = self.__ppool.apply_async(self.all_links, (xhtml,))

    def url_feed(self, xhtml):
        self.__url_links = self.__ppool.apply_async(self.site_links, (xhtml,))

    def med_feed(self, xhtml):
        self.__med_links = self.__ppool.apply_async(self.media_links, (xhtml,))

    def scr_feed(self, xhtml):
        self.__scr_links = self.__ppool.apply_async(self.script_links, (xhtml,))

    def und_feed(self, xhtml):
        self.__und_links = self.__ppool.apply_async(self.undefined_links, (xhtml,))

    def get_all(self):
        return self.__all_links.get()

    def get_url(self):
        return self.__url_links.get()

    def get_med(self):
        return self.__med_links.get()

    def get_scr(self):
        return self.__scr_links.get()

    def get_und(self):
        return self.__und_links.get()
    def all_imap(self, xhtml_list):
        return self.__ppool.imap(self.all_links, xhtml_list, self.__pl_size)

    def url_imap(self, xhtml_list):
        return self.__ppool.imap(self.site_links, xhtml_list, self.__pl_size)

    def url_map(self, xhtml_list):
        return self.__ppool.map(self.site_links, xhtml_list, self.__pl_size)

    def close(self):
        self.__ppool.close()
        self.__ppool.join()
class ExtractLinks(HTMLParser.HTMLParser):

    def __init__(self, urldata):
        HTMLParser.HTMLParser.__init__(self)
        self.urldata = urldata

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    self.urldata.append(value)

    def handle_startendtag(self, tag, attrs):
        # Self-closing <a .../> tags are handled exactly like opening tags
        self.handle_starttag(tag, attrs)
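
# A short sketch (added for illustration; not part of the original module):
# ExtractLinks appends every <a href> value, from opening and self-closing
# tags alike, to the list it is handed. The snippet is hypothetical.
def _extract_links_demo():
    found = []
    parser = ExtractLinks(found)
    parser.feed('<p><a href="http://example.com/x">x</a><a href="/y"/></p>')
    print found  # -> ['http://example.com/x', '/y']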
class SynCScanner(Process):
    """SynCScanner Process: extracts URLs from HTML sources and hashes them"""
    PROCESSNUM = 0

    def __init__(self, myPendingScanQ, urlLQ):
        Process.__init__(self)
        SynCScanner.PROCESSNUM += 1
        self.myPendingScanQ = myPendingScanQ
        self.urlLQ = urlLQ
        self.htmlSrcTuple = None

    def run(self):
        # print "SynCScanner Process with PID:%s and PCN:%s - Engaged" % (current_process().pid, SynCScanner.PROCESSNUM)
        while True:
            self.htmlSrcTuple = self.myPendingScanQ.get()
            if self.htmlSrcTuple is None:
                if SynCScanner.PROCESSNUM > 1:
                    print "SynCScanner Process with PID:%s and PCN:%s - Terminated (None...to do)" % (current_process().pid, SynCScanner.PROCESSNUM)
                SynCScanner.PROCESSNUM -= 1
                return
            listURLs = self._extracturls()
            digestedURLsLL = self._url_hash(listURLs)
            self.urlLQ.put(digestedURLsLL)

    def _extracturls(self):
        urls = []
        htmlsrc, charset, parenturl = self.htmlSrcTuple
        if htmlsrc is not None:
            resulturls = []
            urlExtractor = ExtractLinks(resulturls)
            try:
                if charset is None:
                    urlExtractor.feed(htmlsrc)
                else:
                    urlExtractor.feed(htmlsrc.decode(charset))
            except HTMLParser.HTMLParseError:
                pass
            # reset() is used instead of close(): close() would raise on
            # truncated input, which needs special treatment
            urlExtractor.reset()
            # This piece of code turns the URIs into full URLs by joining the
            # parenturl with the extracted URLs that lack a network location
            for i in xrange(len(resulturls)):  # xrange replaces range() for performance reasons
                urlres = urlparse.urlparse(resulturls[i], "http")
                if urlres.netloc == "":
                    resulturls[i] = urlparse.urljoin(parenturl, resulturls[i])
            urls.extend(resulturls)
        return urls

    def _url_hash(self, listURLs):
        digestedURLsLL = list()
        for i in xrange(len(listURLs)):  # xrange replaces range() for performance reasons
            md5_hash = hashlib.md5()
            md5_hash.update(listURLs[i])
            digestedURLsLL.append([md5_hash.digest(), listURLs[i]])
        return digestedURLsLL
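
# A hedged sketch (added for illustration; not part of the original module) of
# driving a single SynCScanner worker over multiprocessing queues; a None item
# on the pending queue is the shutdown sentinel. The snippet and the
# _syncscanner_demo name are hypothetical.
def _syncscanner_demo():
    pending_q = pQueue()
    url_q = pQueue()
    scanner = SynCScanner(pending_q, url_q)
    scanner.start()
    # (html source, charset, parent url) tuples are the expected work items
    pending_q.put(('<a href="/p">p</a>', None, 'http://example.com/'))
    print url_q.get(timeout=5)  # -> [[<md5 digest>, 'http://example.com/p']]
    pending_q.put(None)  # tell the worker to exit
    scanner.join()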
# Unit test
if __name__ == "__main__":
    import urllib2
    rq = urllib2.Request(
        "http://www.extremepro.gr",
        headers={'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-GB; rv:1.9.1.9)'}
    )
    socket = urllib2.urlopen(rq)
    # Extraction of URLs pointing to other sites
    # link_extro = LinkExtractor(base_url="http://www.extremepro.gr/")
    link_extro = LinkExtractorPPool(base_url="http://www.extremepro.gr/")
    src = socket.read()
    # String source as input
    link_extro.url_feed(src)
    print link_extro.get_url()
    # Shut the process pool down cleanly
    link_extro.close()
    print "Thank you and Goodbye!"