forked from chrisspen/webarticle2text
-
Notifications
You must be signed in to change notification settings - Fork 0
/
webarticle2text.py
executable file
·640 lines (550 loc) · 19.9 KB
/
webarticle2text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
#!/usr/bin/env python
"""
File: webarticle2text.py
Copyright (C) 2008 Chris Spencer (chrisspen at gmail dot com)
Attempts to locate and extract the largest cluster of text in a
webpage. It does this by walking the DOM-tree, identifying all text
segments and their depth inside the DOM, appends all text at roughly
the same depth, and then returns the chunk with the largest total
length.
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
"""
# Package version as a tuple; __version__ is the dotted-string form.
VERSION = (2, 0, 1)
__version__ = '.'.join(map(str, VERSION))
import os
import sys
import time
import formatter
#import htmlentitydefs
#import htmllib
#import httplib
#import HTMLParser
import mimetypes
import re
#import StringIO
#import urllib2
import hashlib
#import robotparser
# http://pythonhosted.org/six/
# Note, you may need to install pip for python3 via:
# sudo apt-get install python3-pip
import six
from six.moves import html_entities as htmlentitydefs
from six.moves.html_parser import HTMLParser
from six.moves import http_client as httplib
try:
from six.moves import cStringIO as StringIO
except ImportError:
from six import StringIO
from six.moves.urllib import parse as urlparse
from six.moves.urllib.error import HTTPError
from six.moves.urllib.request import OpenerDirector, Request, urlopen
from six.moves.urllib import robotparser
# Python 2/3 compatibility aliases provided by six.
u = six.u
unicode = six.text_type
unichr = six.unichr
def get_unicode(text):
    """
    Returns *text* as a unicode string, decoding from UTF-8 when given bytes.

    If *text* is already unicode (or otherwise not decodable input),
    ``unicode(text, 'utf-8')`` raises TypeError and the value is
    returned unchanged.
    """
    try:
        text = unicode(text, 'utf-8')
    except TypeError:
        # Already unicode (or not a byte string); pass through as-is.
        pass
    # Bug fix: the original returned None on the successful-decode path
    # because it only returned from the except branch.
    return text
def unescapeHTMLEntities(text):
    """
    Replaces HTML/XML character references and named entities in *text*
    with their unicode equivalents.

    Malformed numeric references and unknown entity names are left
    untouched, so literal &, >, < sequences in source survive unchanged.

    Adapted from Fredrik Lundh,
    http://effbot.org/zone/re-sub.htm#unescape-html
    """
    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # Numeric character reference, e.g. &#38; or &#x26;.
            try:
                # HTML allows both &#x...; and &#X...; for hex references.
                if text[:3] in ("&#x", "&#X"):
                    return unichr(int(text[3:-1], 16))
                else:
                    return unichr(int(text[2:-1]))
            except ValueError:
                pass  # malformed number; leave untouched
        else:
            # Named entity, e.g. &amp;.
            try:
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                pass  # unknown entity name; leave untouched
        return text  # leave as is
    # Raw string avoids the deprecated "\w" escape in a plain literal.
    return re.sub(r"&#?\w+;", fixup, text)
# Tags whose text content is skipped entirely during extraction, since
# they typically hold navigation, form controls, or other non-article markup.
IGNORED_TAGS = (
    'script','style','option','ul','li','legend','object','noscript',
    #'h1','h2','h3','h4','h5','h6',
    'label', 'footer', 'nav', 'aside',
)
class TextExtractor(HTMLParser):
    """
    Attempts to extract the main body of text from an HTML document.

    This is a messy task, and certain assumptions about the story text
    must be made:

    The story text:
    1. Is the largest block of text in the document.
    2. Sections all exist at the same relative depth.
    """

    # Kept as class attributes for backwards compatibility, but each
    # instance now gets its own copies in __init__: the original code
    # mutated these class-level lists through `self` (e.g.
    # `self.path += [...]` extends the shared class list in place),
    # leaking parser state between instances.
    dom = []
    path = [0]

    # Number of trailing path components ignored when bucketing text,
    # so segments at "roughly" the same depth group together.
    pathBlur = 5

    def __init__(self):
        HTMLParser.__init__(self)
        self._ignore = False       # currently inside an ignored subtree?
        self._ignorePath = None    # path at which ignoring began
        self._lasttag = None
        self._depth = 0
        self.depthText = {}  # path:text
        self.counting = 0
        self.lastN = 0
        # Bug fix: per-instance copies so parses don't share the
        # class-level mutable lists declared above.
        self.dom = []
        self.path = [0]

    def handle_starttag(self, tag, attrs):
        ignore0 = self._ignore
        tag = tag.lower()
        if tag in IGNORED_TAGS:
            self._ignore = True
        attrd = dict(attrs)
        self._lasttag = tag.lower()
        self._depth += 1
        # Descend: push the sibling counter and reset it for children.
        self.path += [self.lastN]
        self.lastN = 0
        # Ignore footer garbage.
        if 'id' in attrd and 'footer' in attrd['id'].lower():
            self._ignore = True
        elif 'id' in attrd and 'copyright' in attrd['id'].lower():
            self._ignore = True
        elif 'class' in attrd and 'footer' in attrd['class'].lower():
            self.counting = max(self.counting, 1)
            self._ignore = True
        elif 'class' in attrd and 'copyright' in attrd['class'].lower():
            self._ignore = True
        # If we just started ignoring, then remember the initial path
        # so we can later know when to start un-ignoring again.
        if self._ignore and not ignore0:
            self._ignorePath = tuple(self.path)

    def handle_startendtag(self, tag, attrs):
        # Self-closing tags contribute neither depth nor text.
        pass

    def handle_endtag(self, tag):
        # Stop ignoring once we exit the subtree where ignoring began.
        if self._ignore and tuple(self.path) == self._ignorePath:
            self._ignore = False
        self._depth -= 1
        if len(self.path):
            self.lastN = self.path.pop()
        else:
            self.lastN = 0
        self.lastN += 1

    def handle_data(self, data, entity=False):
        if len(data) > 0 and not self._ignore:
            # Skip blocks of text beginning with 'copyright', which usually
            # indicates a copyright notice.
            _data = data.strip().lower()
            if _data.startswith('copyright') and not self._ignore:
                self._ignore = True
                self._ignorePath = tuple(self.path)
                return
            if data:
                # Bucket by path with the last pathBlur components dropped.
                rpath = tuple(self.path[:-self.pathBlur])
                self.depthText.setdefault(rpath, [])
                self.depthText[rpath] += [data]
                # Allow one more layer below, to include
                # text inside <i></i> or <b></b> tags.
                # Unfortunately, this will include a lot of crap
                # in the page's header and footer, so we'll
                # prefix this text with '#' and strip these out later.
                rpath2 = tuple(self.path[:-self.pathBlur-1])
                self.depthText.setdefault(rpath2, [])
                self.depthText[rpath2] += ['#'+data]

    def handle_charref(self, name):
        # Decode numeric (&#65;) references; non-digit names are routed
        # through the named-entity form.
        if name.isdigit():
            text = unescapeHTMLEntities(get_unicode('&#'+name+';'))
        else:
            text = unescapeHTMLEntities(get_unicode('&'+name+';'))
        self.handle_data(text, entity=True)

    def handle_entityref(self, name):
        self.handle_charref(name)

    def get_plaintext(self):
        """
        Returns the largest whitespace-normalized text cluster collected
        during parsing.
        """
        maxLen, maxPath, maxText, maxTextList = 0, (), '', []
        for path, textList in six.iteritems(self.depthText):
            # Strip off header segments, prefixed with a '#'.
            start = True
            text = []
            for t in textList:
                if len(t.strip()):
                    if t.startswith('#') and start:
                        continue
                    start = False
                text.append(t)
            # Strip off footer segments, prefixed with a '#'.
            start = True
            textList = reversed(text)
            text = []
            for t in textList:
                if len(t.strip()):
                    if t.startswith('#') and start:
                        continue
                    start = False
                text.append(t)
            text = reversed(text)
            text = u('').join(text).replace('#','')
            text = text.replace(u('\xa0'),' ')
            text = text.replace(u('\u2019'),"'")
            # Compress whitespace.
            text = re.sub("[\\n\\s]+", u(' '), text).strip()
            # Tuple comparison: length first, so the longest cluster wins.
            maxLen, maxPath, maxText, maxTextList = max(
                (maxLen, maxPath, maxText, maxTextList),
                (len(text), path, text, textList),
            )
        return maxText

    def parse_endtag(self, i):
        # This is necessary because the underlying HTMLParser is buggy and
        # unreliable.
        try:
            return HTMLParser.parse_endtag(self, i)
        except AttributeError:
            return -1

    def error(self, msg):
        # Ignore all parse errors; extraction is best-effort.
        pass
#class HTMLParserNoFootNote(htmllib.HTMLParser):
class HTMLParserNoFootNote(HTMLParser):
    """
    Ignores link footnotes, image tags, and other useless things.
    """

    # Optional compiled regex; when set, only its matches are kept
    # from each text node.
    textPattern = None

    # Index path of the current node within the DOM.
    path = [0]

    def handle_starttag(self, tag, attrs, *args):
        # Bug fix: removed a stray time.sleep(0.5) here, which stalled
        # parsing by half a second for EVERY start tag.
        self.path += [0]
        if tag == 'script':
            pass

    def handle_endtag(self, tag, *args):
        # Ascend one level and advance the sibling counter.
        self.path.pop()
        self.path[-1] += 1
        if tag == 'script':
            pass

    def anchor_end(self):
        if self.anchor:
            #self.handle_data("[%d]" % len(self.anchorlist))
            self.anchor = None

    def handle_image(self, src, alt, *args):
        # Images contribute no text.
        pass

    def handle_data(self, data):
        if self.textPattern:
            data = ' '.join(self.textPattern.findall(data))
        #htmllib.HTMLParser.handle_data(self, data)
        HTMLParser.handle_data(self, data)
def extractFromHTML(html):
    """
    Extracts the main article text from an HTML string.

    Returns a whitespace-normalized unicode string containing the
    largest text cluster found by TextExtractor.
    """
    html = unicode(html)
    assert isinstance(html, unicode)

    # Bug fix: removed the dead StringIO buffer and
    # formatter.AbstractFormatter construction — neither was ever used,
    # and the `formatter` module was removed in Python 3.10.

    # Parse the document and pick the largest text cluster.
    p = TextExtractor()
    p.feed(html)
    p.close()
    text = p.get_plaintext()

    # Remove stand-alone punctuation.
    text = re.sub(r"\s[\(\),;\.\?\!](?=\s)", " ", text).strip()
    # Compress whitespace.
    text = re.sub(r"[\n\s]+", " ", text).strip()
    # Remove consecutive dashes.
    text = re.sub(r"\-{2,}", "", text).strip()
    # Remove consecutive periods.
    text = re.sub(r"\.{2,}", "", text).strip()

    return text
def tidyHTML(dirtyHTML):
    """
    Normalizes an arbitrary HTML string by running it through Tidy,
    returning well-formed XHTML.

    Raises ImportError when pytidylib is not installed.
    """
    try:
        from tidylib import tidy_document
    except ImportError as e:
        message = ("%s\nYou need to install pytidylib.\n"
                   "e.g. sudo pip install pytidylib") % e
        raise ImportError(message)
    tidy_options = {
        'output-xhtml': 1,
        #add_xml_decl=1,#option in tidy but not pytidylib
        'indent': 1,
        'tidy-mark': 1,
        #'char-encoding':'utf8',
        'char-encoding': 'raw',
    }
    cleaned, _errors = tidy_document(dirtyHTML, options=tidy_options)
    return cleaned
def generate_key(s, pattern="%s.txt"):
    """
    Builds a cache key for string *s* by hashing it with SHA-1 and
    substituting the hex digest into *pattern*.
    """
    digest = hashlib.sha1(s.encode('utf-8')).hexdigest()
    return pattern % digest
def cache_get(cache_dir, cache_key, default=None):
    """
    Reads and returns the cached content stored under *cache_key* in
    *cache_dir*, or *default* when no such cache file exists.
    """
    path = os.path.join(cache_dir, cache_key)
    if not os.path.isfile(path):
        return default
    with open(path, 'r') as fin:
        return fin.read()
def cache_set(cache_dir, cache_key, content):
    """
    Writes *content* to the cache file named *cache_key* inside
    *cache_dir*, replacing any existing entry.
    """
    path = os.path.join(cache_dir, cache_key)
    with open(path, 'w') as fout:
        fout.write(content)
def cache_info(cache_dir, cache_key):
    """
    Returns the modification time of the cache file for *cache_key*,
    or 0 when the file does not exist.
    """
    path = os.path.join(cache_dir, cache_key)
    if not os.path.exists(path):
        return 0
    return os.path.getmtime(path)
def fetch(url, timeout=5, userAgent=None, only_mime_types=None):
    """
    Retrieves the raw content of the URL.

    Parameters:
    url := string
        The resource to download.
    timeout := int
        Socket timeout in seconds.
    userAgent := string
        Optional User-agent header value to send with the request.
    only_mime_types := tuple or list of strings
        When given, None is returned unless the resource's mime-type
        is in this collection.
    """
    headers = {}
    if userAgent:
        headers['User-agent'] = str(userAgent)
    request = Request(url=url, headers=headers)
    response = urlopen(request, timeout=timeout)
    # Return nothing if the content isn't one of the target mime-types.
    if only_mime_types:
        assert isinstance(only_mime_types, (tuple, list))
        # Check for mimetype by looking at pattern in the URL.
        # Not super accurate, but very fast.
        ct, mt_encoding = mimetypes.guess_type(url)
        # Then check for mimetype by actually requesting the resource and
        # looking at the response.
        # More accurate, but slower since we actually have to send a request.
        if not ct:
            response_info = response.info()
            # Bug fix: .getheader() does not exist on Python 3's
            # email.message.Message and raised AttributeError; .get()
            # works on both Python 2's and Python 3's message classes.
            ct = (response_info.get('Content-Type') or '').split(';')[0]
        #TODO:if still undefined, use magic.Magic(mime=True).from_file(url)?
        if ct not in only_mime_types:
            return
    try:
        return response.read()
    except httplib.IncompleteRead as e:
        # This should rarely happen, and is often the fault of the server
        # sending a malformed response.
        #TODO:just abandon all content and return '' instead?
        return e.partial
def check_robotstxt(url, useCache, cache_dir, userAgent=None):
    """
    Returns True when the robots.txt of the url's host permits fetching
    *url*, False otherwise.

    When *useCache* is true, the robots.txt content is cached in
    *cache_dir* and refreshed once the cached copy is older than 7 days.
    """
    scheme, netloc, url_path, query, fragment = urlparse.urlsplit(url)
    # Build the canonical robots.txt URL for the host.
    robotstxt_url = urlparse.urlunsplit((
        scheme,
        netloc,
        '/robots.txt',
        '',
        '',
    ))
    key = generate_key(robotstxt_url)
    robots_parser = robotparser.RobotFileParser()
    cached_content = cache_get(cache_dir, key) if useCache else ''
    # Re-fetch when there is no cached copy or it is older than one week.
    threshold = (time.time() - 86400 * 7)
    if not cached_content or cache_info(cache_dir, key) < threshold:
        try:
            cached_content = fetch(robotstxt_url, userAgent=userAgent)
            if useCache:
                cache_set(cache_dir, key, cached_content)
        except HTTPError as he:
            # this block mimics the behaviour in the robotparser.read() method
            if he.code in (401, 403):
                robots_parser.disallow_all = True
            elif he.code >= 400:
                robots_parser.allow_all = True
            else:
                raise he
            cached_content = ''
    try:
        # Python 3: decode fetched bytes to str; on Python 2 (or when the
        # content is already text) this str() call raises TypeError and
        # the content is left as-is.
        cached_content = str(cached_content, encoding='utf8')
    except TypeError:
        pass
    robots_parser.parse((x for x in cached_content.split('\n')))
    # Fall back to urllib's default User-agent header when none was given.
    default_useragent = None
    for k, v in OpenerDirector().addheaders:
        if k == "User-agent":
            default_useragent = v
            break
    return robots_parser.can_fetch(userAgent or default_useragent, url)
def extractFromURL(url,
    cache=False,
    cacheDir='_cache',
    verbose=False,
    encoding=None,
    filters=None,
    userAgent=None,
    timeout=5,
    ignore_robotstxt=False,
    only_mime_types=None,
    raw=False):
    """
    Extracts text from a URL.

    Parameters:
    url := string
        Remote URL or local filename where HTML will be read.
    cache := bool
        True=store and retrieve url from cache
        False=always retrieve url from the web
    cacheDir := str
        Directory where cached url contents will be stored.
    verbose := bool
        True=print logging messages
        False=print no output
    encoding := string
        The encoding of the page contents.
        If none given, it will attempt to guess the encoding.
        See http://docs.python.org/howto/unicode.html for further info
        on Python Unicode and encoding support.
    filters := string
        Comma-delimited list of filters to apply before parsing.
    userAgent := string
        Optional User-agent header value for the HTTP request.
    timeout := int
        Socket timeout in seconds.
    ignore_robotstxt := bool
        True=fetch the content regardless of robots.txt rules.
    only_mime_types := list of strings
        A list of mime-types to limit parsing to.
        If the mime-type of the raw-content retrieved does not match
        one of these, a value of None will be returned.
    raw := bool
        True=return the tidied unicode HTML instead of extracted text.
    """
    try:
        import chardet
    except ImportError as e:
        raise ImportError(("%s\nYou need to install chardet.\n" + \
            "e.g. sudo pip install chardet") % e)

    # Bug fix: `basestring` does not exist on Python 3 and raised
    # NameError here; six.string_types covers both Python versions.
    if only_mime_types and isinstance(only_mime_types, six.string_types):
        only_mime_types = only_mime_types.split(',')

    # Load url from cache if enabled.
    if cache:
        if not os.path.isdir(cacheDir):
            cache_perms = 488 # 750 in octal, '-rwxr-x---'
            os.makedirs(cacheDir, cache_perms)
        cache_key = generate_key(url)
        cached_content = cache_get(cacheDir, cache_key)
        if cached_content:
            return cached_content

    if not ignore_robotstxt:
        if not check_robotstxt(url, cache, cacheDir, userAgent=userAgent):
            if verbose: print("Request denied by robots.txt")
            return ''

    # Otherwise download the url.
    if verbose: print('Reading %s...' % url)
    html = fetch(
        url,
        timeout=timeout,
        userAgent=userAgent,
        only_mime_types=only_mime_types)
    if not html:
        return ''

    # If no encoding guess given, then attempt to determine
    # encoding automatically.
    if not encoding:
        encoding_opinion = chardet.detect(html)
        encoding = encoding_opinion['encoding']
        if verbose: print('Using encoding %s.' % encoding)

    # Save raw contents to cache if enabled.
    if verbose: print('Read %i characters.' % len(html))
    if cache:
        raw_key = generate_key(url, "%s.raw")
        cache_set(cacheDir, raw_key, html)

    # Apply filters.
    if filters:
        filter_names = map(str.strip, filters.split(','))
        for filter_name in filter_names:
            filter = get_filter(filter_name)
            html = filter(html)

    # Clean up HTML.
    html = tidyHTML(html)
    if verbose: print('Extracted %i characters.' % len(html))

    # Convert to Unicode.
    if not html:
        return ''
    html = unicode(html, encoding=encoding, errors='replace')
    if raw:
        return html

    # Extract text from HTML.
    res = extractFromHTML(html)
    assert isinstance(res, unicode)

    # Save extracted text to cache if enabled.
    res = res.encode(encoding, 'ignore')
    if cache:
        cache_set(cacheDir, cache_key, res)
    return res
def filter_remove_entities(text):
    """
    Pre-parse filter: strips '&#'-prefixed alphabetic entity fragments
    from *text*.
    """
    pattern = "&#[a-zA-Z]+"
    return re.sub(pattern, '', text)
def get_filter_names():
    """
    Returns the names of all pre-processing filters defined at module
    level, i.e. globals named ``filter_<name>``, with the prefix removed.
    """
    # Iterate a snapshot of the keys only; the values were fetched and
    # discarded in the original (and six is unnecessary here).
    return [
        k.replace('filter_', '')
        for k in list(globals())
        if k.startswith('filter_')
    ]
def get_filter(name):
    """
    Resolves a filter name (as listed by get_filter_names()) to its
    module-level ``filter_<name>`` function.

    Raises NameError when no such filter exists, matching the behaviour
    of the original eval()-based lookup.
    """
    # Sanitize the name, then look it up directly in the module
    # namespace instead of eval(), which would execute arbitrary text.
    key = 'filter_' + re.sub('[^a-zA-Z_]', '', name)
    try:
        return globals()[key]
    except KeyError:
        raise NameError("name '%s' is not defined" % key)
if __name__ == '__main__':
    # Command-line entry point: extract article text from a URL or local
    # file and print it to stdout.
    from optparse import OptionParser
    usage = "usage: %prog [options] <remote url or local filename>"
    parser = OptionParser(usage=usage)
    parser.add_option(
        "-e", "--encoding", dest="encoding",
        default=None,
        help='Manually specifies the encoding to use when '
        'interpreting the url.')
    parser.add_option(
        "-c", "--cache", dest="cache",
        action='store_true',
        default=False,
        help="Stores and loads data from cache.")
    parser.add_option(
        "-d", "--cacheDir", dest="cacheDir",
        default='_cache',
        help="The directory where cache files will be stored.")
    parser.add_option(
        "-u", "--userAgent", dest="userAgent",
        default=None,
        help="The user-agent to use when requesting URLs.")
    parser.add_option(
        "-f", "--filters", dest="filters",
        default=None,
        choices=get_filter_names(),
        help=('A comma-delimited list of pre-processing filters '
        'to apply, one of [%s].') % '|'.join(get_filter_names()))
    parser.add_option(
        "-v", "--verbose", dest="verbose",
        action='store_true',
        default=False,
        help="Displays status messages.")
    parser.add_option(
        '-i', '--ignore-robotstxt', dest="ignore_robotstxt",
        default=False, action="store_true",
        help="Ignore robots.txt when fetching the content.")
    parser.add_option(
        '-m', '--only-mime-types', dest="only_mime_types",
        default=None,
        help="A comma-delimited list of mime-types to limit retrieval to.")
    (options, args) = parser.parse_args()
    if len(args) < 1:
        parser.print_help()
        sys.exit()
    url = args[0]
    # Option names match extractFromURL's keyword parameters, so the
    # parsed options can be splatted directly into the call.
    s = extractFromURL(url=url, **options.__dict__)
    # extractFromURL returns encoded bytes; decode before printing.
    s = s.decode('utf8')
    sys.stdout.write(s)
    sys.stdout.write('\n')