/
download_cc.py
92 lines (63 loc) · 2.24 KB
/
download_cc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import StringIO
import gzip
import os
import requests
import sys
import time
import argparse
import warc
import eatiht.v2 as v2
from multiprocessing import Pool
reload(sys)
sys.setdefaultencoding('utf8')
def download_page(record):
offset, length = int(record['offset']), int(record['length'])
offset_end = offset + length - 1
prefix = 'https://aws-publicdatasets.s3.amazonaws.com/'
resp = requests.get(prefix + record['filename'], headers={'Range': 'bytes={}-{}'.format(offset, offset_end)})
raw_data = StringIO.StringIO(resp.content)
f = gzip.GzipFile(fileobj=raw_data)
data = f.read()
return data
def clean_warc(input):
text = v2.extract(input)
warc_content = warc.WARCFile(fileobj=StringIO.StringIO(input))
for record in warc_content:
url,date = record['WARC-Target-URI'], record['WARC-Date']
return '%s,%s\n%s' % (url, date, text)
def run(item):
global output
record = eval(item)
# if record['mime'] != 'text/html':
# return
try:
warc_data = download_page(record)
text = clean_warc(warc_data)
file_name = record['url'].replace('https://','').replace('/','_')[:60]
with open('%s/%s.text' % (output, file_name), 'wb') as f:
f.write(text)
except:
print 'encountered error in %s, ignoring' % record['url']
parser = argparse.ArgumentParser()
parser.add_argument('-p','--pools', action='store', dest='num_pools', help='Number of pools', type=int)
parser.add_argument('-i','--input', action='store', dest='input', help='Input file with commoncrawl records')
parser.add_argument('-o','--output', action='store', dest='output', help='Output directory to write files')
args = parser.parse_args()
if not (args.num_pools and args.input):
print 'missing args'
parser.print_help()
sys.exit()
if not os.path.exists(args.output):
print args.output + ' doesnt exist, creating it...'
os.makedirs(args.output)
output = args.output
print 'reading records'
with open(args.input, 'rb') as f:
records = f.read().split('\n')
print "starting download"
pool = Pool(args.num_pools)
start = time.time()
total = len(records)
print "Starting parallel processes with number of entries: %d" % total
for idx, x in enumerate(pool.imap_unordered(run, records)):
print 'elapsed time for file %d out of %d : %d' %(idx, total,int(time.time() - start))