-
Notifications
You must be signed in to change notification settings - Fork 0
/
pipeline.py
417 lines (344 loc) · 13.6 KB
/
pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
r"""
This file defines a seesaw pipeline for the ArchiveTeam Warrior.
It can also be run standalone:
pip install --user seesaw
~/.local/bin/run-pipeline pipeline.py YOURNICKNAME
(or run run-pipeline --help for more details)
This pipeline relies on this code inserted into your universal-tracker redis database:
$ redis-cli
redis 127.0.0.1:6379> select 13
OK
redis 127.0.0.1:6379[13]> set greader-directory:extra_parameters 'data["cookie_id"], data["cookie_value"] = open("/home/ivan/directory/cookie", "r") {|f| f.read().strip().split("|", 2)}; data["task_urls"] = item.split("|").map{|q| "https://www.google.com/reader/directory/search?q=" + q}; data["user_agent"] = "Wget/1.14 gzip ArchiveTeam"; data["wget_timeout"] = "60"; data["wget_tries"] = "20"; data["wget_waitretry"] = "5";'
OK
items should be query|another%20query|etc where all queries are already urllib.quote encoded.
"""
import os
import sys
import json
import os.path
import shutil
import time
import zlib
from hashlib import md5
from distutils.version import StrictVersion
# check the seesaw version before importing any other components
import seesaw
# Refuse to load under a seesaw release older than 0.0.12.
_minimum_seesaw = StrictVersion("0.0.12")
if StrictVersion(seesaw.__version__) < _minimum_seesaw:
    raise Exception("This pipeline needs seesaw version 0.0.12 or higher.")
from seesaw.project import Project
from seesaw.config import NumberConfigValue, realize
from seesaw.item import ItemInterpolation, ItemValue
from seesaw.task import SimpleTask, LimitConcurrent
from seesaw.pipeline import Pipeline
from seesaw.externalprocess import ExternalProcess, WgetDownload, AsyncPopen
from seesaw.tracker import TrackerRequest, UploadWithTracker, SendDoneToTracker, PrepareStatsForTracker
# run-pipeline changes the cwd to the directory containing pipeline.py, then
# execs the contents of pipeline.py, so the cwd read here is the directory
# this file lives in.
PIPELINE_DIR = os.getcwd()
# Certificate directory next to pipeline.py; handed to the Wget+Lua task
# through the SSL_CERT_DIR environment variable in the pipeline definition.
SSL_CERT_DIR = os.path.join(PIPELINE_DIR, "certs")
## Begin AsyncPopen fix
import pty
import fcntl
import subprocess
import seesaw.externalprocess
from tornado.ioloop import IOLoop, PeriodicCallback
class AsyncPopenFixed(AsyncPopen):
    """
    Start the wait_callback after setting self.pipe, to prevent an infinite spew of
    "AttributeError: 'AsyncPopen' object has no attribute 'pipe'"

    Only run() is overridden here; everything else comes from AsyncPopen.
    """
    def run(self):
        """
        Start the subprocess with stdout and stderr attached to a pty and
        register the callbacks that watch its output and its exit.

        The statement order matters: self.pipe is assigned before the
        periodic exit check is started.
        """
        self.ioloop = IOLoop.instance()
        # Allocate a pseudo-terminal pair: the child gets the slave end,
        # this process reads from the master end.
        (master_fd, slave_fd) = pty.openpty()

        # make stdout, stderr non-blocking
        fcntl.fcntl(master_fd, fcntl.F_SETFL, fcntl.fcntl(master_fd, fcntl.F_GETFL) | os.O_NONBLOCK)

        self.master_fd = master_fd
        self.master = os.fdopen(master_fd)

        # listen to stdout, stderr
        # (_handle_subprocess_stdout is inherited; it is invoked by the
        # IOLoop whenever the master end becomes readable)
        self.ioloop.add_handler(master_fd, self._handle_subprocess_stdout, self.ioloop.READ)

        # Both output streams of the child go to the same slave end.
        slave = os.fdopen(slave_fd)
        self.kwargs["stdout"] = slave
        self.kwargs["stderr"] = slave
        self.kwargs["close_fds"] = True
        self.pipe = subprocess.Popen(*self.args, **self.kwargs)

        self.stdin = self.pipe.stdin

        # check for process exit
        # Started only now that self.pipe exists (this ordering is the fix).
        # The interval of 250 is presumably milliseconds, as in Tornado's
        # PeriodicCallback -- confirm against the installed Tornado version.
        self.wait_callback = PeriodicCallback(self._wait_for_end, 250)
        self.wait_callback.start()

# Replace the class inside seesaw.externalprocess so that code resolving
# AsyncPopen through that module picks up the fixed run().
seesaw.externalprocess.AsyncPopen = AsyncPopenFixed
## End AsyncPopen fix
def gunzip_string(s):
    """Return the decompressed payload of the gzip-format byte string *s*."""
    # A wbits value of 16 + MAX_WBITS makes zlib expect a gzip header and
    # trailer instead of a bare zlib stream.
    gzip_wbits = zlib.MAX_WBITS + 16
    return zlib.decompress(s, gzip_wbits)
class GetItemFromTracker(TrackerRequest):
    """
    Tracker request that asks for a new item and copies every field of the
    tracker's JSON reply onto the item.
    """
    def __init__(self, tracker_url, downloader, version=None):
        TrackerRequest.__init__(
            self, "GetItemFromTracker", tracker_url, "request",
            may_be_canceled=True)
        self.downloader = downloader
        self.version = version

    def data(self, item):
        """
        Build the request payload: the realized downloader name, the API
        version and, when a version was configured, the realized version.
        """
        payload = {"api_version": "2", "downloader": realize(self.downloader, item)}
        if self.version:
            payload["version"] = realize(self.version, item)
        return payload

    def process_body(self, body, item):
        """
        Parse the tracker reply. A reply without "item_name" schedules a
        retry; otherwise all reply fields are stored on the item and the
        item is completed.
        """
        reply = json.loads(body)
        if "item_name" not in reply:
            item.log_output("Tracker responded with empty response.\n")
            self.schedule_retry(item)
            return

        for key in reply:
            item[key] = reply[key]
        # The log line reads item["cookie_id"], so the reply is expected to
        # carry that field (the module docstring shows the tracker-side
        # extra_parameters that set it).
        item.log_output("Received item %r from tracker; using cookie %r\n" % (item["item_name"], item["cookie_id"]))
        self.complete_item(item)
# stdin_data_function added in seesaw 0.14
class WgetDownloadWithStdin(WgetDownload):
    """
    WgetDownload variant whose stdin contents come from a caller-supplied
    function that is called with the item.
    """
    # NOTE(review): the [0] default is one list object shared by every call
    # that omits accept_on_exit_code; that is only safe while nothing
    # mutates it -- confirm against WgetDownload.
    def __init__(self, args, max_tries=1, accept_on_exit_code=[0], retry_on_exit_code=None, env=None, stdin_data_function=None):
        super(WgetDownloadWithStdin, self).__init__(
            args, max_tries, accept_on_exit_code, retry_on_exit_code, env)
        self.stdin_data_function = stdin_data_function

    def stdin_data(self, item):
        """Return the stdin text for *item*, or "" when no function was given."""
        producer = self.stdin_data_function
        if not producer:
            return ""
        return producer(item)
#---------------------------------------
# This is an updated version of test_executable.
# This can be removed when all warriors have updated
# the seesaw-kit. (Needs at least version 0.0.15.)
#
import subprocess
def test_executable(name, version, path):
print "Looking for %s in %s" % (name, path)
try:
process = subprocess.Popen([path, "-V"], stdout=subprocess.PIPE)
result = process.communicate()[0]
if not process.returncode == 0:
print "%s: Returned code %d" % (path, process.returncode)
return False
if isinstance(version, basestring):
if not version in result:
print "%s: Incorrect %s version (want %s)." % (path, name, version)
return False
elif hasattr(version, "search"):
if not version.search(result):
print "%s: Incorrect %s version." % (path, name)
return False
elif hasattr(version, "__iter__"):
if not any((v in result) for v in version):
print "%s: Incorrect %s version (want %s)." % (path, name, str(version))
return False
print "Found usable %s in %s" % (name, path)
return True
except OSError as e:
print "%s:" % path, e
return False
def find_executable(name, version, paths):
    """
    Return the first entry of *paths* that test_executable accepts for
    *name* and *version*, or None when no entry is accepted.
    """
    # The generator is lazy, so later paths are not probed once one matches.
    usable = (candidate for candidate in paths
              if test_executable(name, version, candidate))
    return next(usable, None)
#---------------------------------------
###########################################################################
# Find a useful Wget+Lua executable.
#
# WGET_LUA will be set to the first path that
# 1. does not crash with --version, and
# 2. prints the required version string
WGET_LUA = find_executable(
    "Wget+Lua",
    # Accepted version strings: the output of `<path> -V` must contain at
    # least one of these.
    ["GNU Wget 1.14.lua.20130523-9a5c"],
    # Candidate locations, tried in this order; the first usable one wins.
    [
        "./wget-lua",
        "./wget-lua-warrior",
        "./wget-lua-local",
        "../wget-lua",
        "../../wget-lua",
        "/home/warrior/wget-lua",
        "/usr/bin/wget-lua"
    ]
)

# find_executable returns None when no candidate qualifies; abort while the
# pipeline file is being loaded in that case.
if not WGET_LUA:
    raise Exception("No usable Wget+Lua found.")
###########################################################################
# The version number of this pipeline definition.
#
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker:
# the pipeline definition writes it into a --warc-header and passes it to
# the tracker tasks.
VERSION = "20130620.01"
###########################################################################
# This section defines project-specific tasks.
#
# Simple tasks (tasks that do not need any concurrency) are based on the
# SimpleTask class and have a process(item) method that is called for
# each item.
def hash_unicode(s):
    """Return the hexadecimal MD5 digest of the text *s*, hashed as UTF-8."""
    utf8_bytes = s.encode("utf-8")
    return md5(utf8_bytes).hexdigest()
class PrepareDirectories(SimpleTask):
    """
    Task that gives each item a fresh working directory and a WARC file name.

    Based on the previously set item["item_name"] it sets:
      item["item_dir"]       = "<data_dir>/<md5 of item_name>"
      item["warc_file_base"] = "<warc_prefix>-<md5 of item_name>-<timestamp>"
    Later tasks (e.g. the Wget call) use these values.

    * warc_prefix should be the project name.
    * item["data_dir"] is set by the environment: it points to a working
      directory reserved for this item.
    * item["item_dir"] is meant for temporary files.
    """
    def __init__(self, warc_prefix):
        SimpleTask.__init__(self, "PrepareDirectories")
        self.warc_prefix = warc_prefix

    def process(self, item):
        """Recreate the item directory, record the names and create an empty WARC file."""
        name_hash = hash_unicode(item["item_name"])
        work_dir = "/".join([item["data_dir"], name_hash])

        # Start from an empty directory even if an earlier run left one behind.
        if os.path.isdir(work_dir):
            shutil.rmtree(work_dir)
        os.makedirs(work_dir)

        item["item_dir"] = work_dir
        stamp = time.strftime("%Y%m%d-%H%M%S")
        item["warc_file_base"] = "%s-%s-%s" % (self.warc_prefix, name_hash, stamp)

        # Create (or truncate) the .warc.gz file so that it exists up front.
        with open("%(item_dir)s/%(warc_file_base)s.warc.gz" % item, "w"):
            pass
class MoveFiles(SimpleTask):
    """
    Task that runs after the download: it moves the .warc.gz file out of
    item["item_dir"] into item["data_dir"] and then deletes the
    item["item_dir"] directory with everything left in it.
    """
    def __init__(self):
        SimpleTask.__init__(self, "MoveFiles")

    def process(self, item):
        """Move the WARC file up into data_dir and remove the item directory."""
        downloaded_warc = "%(item_dir)s/%(warc_file_base)s.warc.gz" % item
        final_warc = "%(data_dir)s/%(warc_file_base)s.warc.gz" % item
        os.rename(downloaded_warc, final_warc)

        shutil.rmtree("%(item_dir)s" % item)
class CookWARC(ExternalProcess):
    """
    External process that runs warc2warc_greader.py (located next to this
    pipeline file) with the current Python interpreter, reading the item's
    .warc.gz and writing a .cooked.warc.gz beside it in data_dir.
    """
    def __init__(self):
        converter = os.path.join(PIPELINE_DIR, "warc2warc_greader.py")
        source_warc = ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz")
        cooked_warc = ItemInterpolation("%(data_dir)s/%(warc_file_base)s.cooked.warc.gz")
        command = [
            sys.executable,
            converter,
            "--gzip",
            "--decode_http",
            "--output", cooked_warc,
            source_warc,
        ]
        ExternalProcess.__init__(self, "CookWARC", command)
###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
# Project description object; the title and the HTML fragment are what gets
# shown for this project.
# NOTE(review): the separator between the two links in project_html is a
# literal non-ASCII character and no coding declaration is visible in this
# file -- confirm it loads correctly under Python 2 (the original may have
# used an HTML entity instead).
project = Project(
    title="Google Reader Feed Directory",
    project_html="""
<h2>Google Reader Feed Directory <span class="links"><a href="http://www.google.com/reader/">Website</a> · <a
href="http://tracker.archiveteam.org/greader/">Leaderboard</a></span></h2>
<p><i>Google Reader</i> is closing July 1st, 2013</p>
"""
)
###########################################################################
# The tracker endpoint can be overridden with the
# GREADER_DIRECTORY_TRACKER_URL environment variable; otherwise the default
# tracker is used.
TRACKER_URL = os.environ.get(
    "GREADER_DIRECTORY_TRACKER_URL",
    "http://tracker-alt.dyn.ludios.net:9292/greader-directory")
###########################################################################
# The pipeline.
#
# Items move through each task on the pipeline.
# Items are dicts, so tasks can set properties and can use properties set
# by earlier tasks (such as the item["item_name"] property).
#
# The tasks below run in the order listed. `downloader` is not defined in
# this file; it is expected to be provided by whatever executes it (see the
# comment on the first task).
pipeline = Pipeline(
    # request an item from the tracker (using the universal-tracker protocol)
    # the downloader variable will be set by the warrior environment
    #
    # this task will wait for an item and sets item["item_name"] to the item name
    # before finishing (it copies every field of the tracker reply onto the
    # item, which is where user_agent, cookie_value, task_urls and the
    # wget_* settings used below come from)
    GetItemFromTracker(TRACKER_URL, downloader, VERSION),

    # create the directories and initialize the filenames (see above)
    # warc_prefix is the first part of the warc filename
    #
    # this task will set item["item_dir"] and item["warc_file_base"]
    PrepareDirectories(warc_prefix="greaderdirectory"),

    # execute Wget+Lua
    #
    # the ItemInterpolation() objects are resolved during runtime
    # (when there is an Item with values that can be added to the strings)
    WgetDownloadWithStdin([
            WGET_LUA,
            "-U", ItemInterpolation("%(user_agent)s"),
            "-nv",
            "-o", ItemInterpolation("%(item_dir)s/wget.log"),
            "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
            "--truncate-output",
            "-e", "robots=off",
            "--rotate-dns",
            "--timeout", ItemInterpolation("%(wget_timeout)s"),
            "--tries", ItemInterpolation("%(wget_tries)s"),
            "--waitretry", ItemInterpolation("%(wget_waitretry)s"),
            "--header", ItemInterpolation("Cookie: %(cookie_value)s"),
            "--header", "Accept-Encoding: gzip",
            "--lua-script", "greader-directory.lua",
            "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
            "--warc-header", "operator: Archive Team",
            "--warc-header", "greader-directory-dld-script-version: " + VERSION,
            # the URL list is read from stdin (see stdin_data_function below)
            "--input", "-"
        ],
        max_tries=2,
        accept_on_exit_code=[0, 8], # which Wget exit codes count as a success?
        env=dict(SSL_CERT_DIR=SSL_CERT_DIR),
        # stdin for Wget: the item's task_urls, UTF-8 encoded, one per line
        stdin_data_function=(lambda item: "\n".join(u.encode("utf-8") for u in item["task_urls"]) + "\n"),
    ),

    # remove the temporary files, move the warc file from
    # item["item_dir"] to item["data_dir"]
    MoveFiles(),

    # create a .cooked.warc.gz based on the .warc.gz. The cooked WARC has
    # gunzipped HTTP responses. Note that the .gz compression on the WARC
    # itself remains.
    CookWARC(),

    # this will set the item["stats"] string that is sent to the tracker (see below)
    PrepareStatsForTracker(
        # there are a few normal values that need to be sent
        defaults={"downloader": downloader, "version": VERSION},
        # this is used for the size counter on the tracker:
        # the groups should correspond with the groups set configured on the tracker
        file_groups={
            # there can be multiple groups with multiple files
            # file sizes are measured per group
            "data": [ItemInterpolation("%(data_dir)s/%(warc_file_base)s.cooked.warc.gz")]
        },
        # extra identifying data: the user agent the item was fetched with
        id_function=(lambda item: {"ua": item["user_agent"]})
    ),

    # there can be multiple items in the pipeline, but this wrapper limits
    # how many of them are uploading at the same time (1 to 4, default 2)
    #
    # the NumberConfigValue can be changed in the configuration panel
    LimitConcurrent(
        NumberConfigValue(
            min=1, max=4, default="2", name="shared:rsync_threads", title="Rsync threads",
            description="The maximum number of concurrent uploads."),
        # this upload task asks the tracker for an upload target
        # this can be HTTP or rsync and can be changed in the tracker admin panel
        UploadWithTracker(
            TRACKER_URL,
            downloader=downloader,
            version=VERSION,
            # list the files that should be uploaded.
            # this may include directory names.
            # note: HTTP uploads will only upload the first file on this list
            # (only the cooked WARC is uploaded here)
            files=[
                ItemInterpolation("%(data_dir)s/%(warc_file_base)s.cooked.warc.gz")
            ],
            # the relative path for the rsync command
            # (this defines if the files are uploaded to a subdirectory on the server)
            rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
            # extra rsync parameters (probably standard)
            rsync_extra_args=[
                "--recursive",
                "--partial",
                "--partial-dir", ".rsync-tmp"
            ]
        ),
    ),

    # if the item passed every task, notify the tracker and report the statistics
    # (item["stats"] was filled in by PrepareStatsForTracker above)
    SendDoneToTracker(
        tracker_url=TRACKER_URL,
        stats=ItemValue("stats")
    )
)