"data": [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")] }, id_function=stats_id_function, ), MoveFiles(), LimitConcurrent( NumberConfigValue( min=1, max=4, default="1", name="shared:rsync_threads", title="Rsync threads", description="The maximum number of concurrent uploads."), UploadWithTracker( "http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader=downloader, version=VERSION, files=[ ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz") ], rsync_target_source_path=ItemInterpolation("%(data_dir)s/"), rsync_extra_args=[ "--recursive", "--partial", "--partial-dir", ".rsync-tmp", ]), ), SendDoneToTracker(tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID), stats=ItemValue("stats")))
[ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.zst')] }, id_function=stats_id_function, ), MoveFiles(), LimitConcurrent( NumberConfigValue( min=1, max=20, default='2', name='shared:rsync_threads', title='Rsync threads', description='The maximum number of concurrent uploads.'), UploadWithTracker( 'http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader=downloader, version=VERSION, files=[ ItemInterpolation( '%(data_dir)s/%(warc_file_base)s.%(dict_project)s.%(dict_id)s.warc.zst' ), ItemInterpolation('%(data_dir)s/%(warc_file_base)s_data.txt') ], rsync_target_source_path=ItemInterpolation('%(data_dir)s/'), rsync_extra_args=[ '--recursive', '--partial', '--partial-dir', '.rsync-tmp', '--min-size', '1', '--no-compress', '--compress-level', '0' ]), ), SendDoneToTracker(tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID), stats=ItemValue('stats')))
max_tries=1, accept_on_exit_code=[0, 4, 8], env={ 'item_dir': ItemValue('item_dir'), 'warc_file_base': ItemValue('warc_file_base'), }), SetBadUrls(), PrepareStatsForTracker( defaults={ 'downloader': downloader, 'version': VERSION }, file_groups={ 'data': [ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.gz')] }, id_function=stats_id_function, ), MoveFiles(), LimitConcurrent( NumberConfigValue( min=1, max=20, default='2', name='shared:rsync_threads', title='Rsync threads', description='The maximum number of concurrent uploads.'), ChooseTargetAndUpload(), ), MaybeSendDoneToTracker(tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID), stats=ItemValue('stats')))
pipeline_id, downloader, ao_only=env.get('AO_ONLY'), large=env.get('LARGE')), StartHeartbeat(control), SetFetchDepth(), PreparePaths(), WriteInfo(), DownloadUrlFile(control), WgetDownload(wpull_args, accept_on_exit_code=AcceptAny(), env={ 'ITEM_IDENT': ItemInterpolation('%(ident)s'), 'LOG_KEY': ItemInterpolation('%(log_key)s'), 'REDIS_URL': REDIS_URL, 'PATH': os.environ['PATH'] }), RelabelIfAborted(control), WriteInfo(), MoveFiles(), LimitConcurrent( 2, RsyncUpload(target=RSYNC_URL, target_source_path=ItemInterpolation("%(data_dir)s"), files=ItemValue("all_target_files"), extra_args=['--partial', '--partial-dir', '.rsync-tmp'])), StopHeartbeat(), MarkItemAsDone(control, EXPIRE_TIME)) def stop_control(): #control.flag_logging_thread_for_termination() control.unregister_pipeline(pipeline_id) pipeline.on_cleanup += stop_control pipeline.running_status = "Running"
'data': [ ItemInterpolation( '%(data_dir)s/%(warc_file_base)s.warc.gz') # TODO ? ] }, id_function=stats_id_function, ), LimitConcurrent( NumberConfigValue( min=1, max=20, default='20', name='shared:rsync_threads', title='Rsync threads', description='The maximum number of concurrent uploads.'), UploadWithTracker( 'http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader=downloader, # noqa: F821 version=VERSION, files=ItemValue('files'), rsync_target_source_path=ItemInterpolation('%(data_dir)s/'), rsync_extra_args=[ '--recursive', '--partial', '--partial-dir', '.rsync-tmp', ]), ), SendDoneToTracker(tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID), stats=ItemValue('stats')))
# # the NumberConfigValue can be changed in the configuration panel LimitConcurrent( NumberConfigValue( min=1, max=4, default="1", name="shared:rsync_threads", title="Rsync threads", description="The maximum number of concurrent uploads."), # this upload task asks the tracker for an upload target # this can be HTTP or rsync and can be changed in the tracker admin panel UploadWithTracker( TRACKER_URL, downloader=downloader, version=VERSION, # list the files that should be uploaded. # this may include directory names. # note: HTTP uploads will only upload the first file on this list files=[ ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz") ], # the relative path for the rsync command # (this defines if the files are uploaded to a subdirectory on the server) rsync_target_source_path=ItemInterpolation("%(data_dir)s/"), # extra rsync parameters (probably standard) rsync_extra_args=[ "--recursive", "--partial", "--partial-dir", ".rsync-tmp" ] ), ), # if the item passed every task, notify the tracker and report the statistics
"data": FilesToUpload(), }, id_function=prepare_stats_id_function, ), CleanUpItemDir(), LimitConcurrent( NumberConfigValue(min=1, max=4, default="1", name="shared:rsync_threads", title="Rsync threads", description="The maximum number of concurrent uploads."), ConditionalTask( files_to_upload, UploadWithTracker2( "http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader=downloader, version=VERSION, files=FilesToUpload(), rsync_target_source_path=ItemInterpolation("%(data_dir)s/"), rsync_extra_args=[ "--recursive", "--partial", "--partial-dir", ".rsync-tmp" ] ) ) ), SendDoneToTracker( tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID), stats=ItemValue("stats") ) )
MoveFiles(), LimitConcurrent( NumberConfigValue( min=1, max=20, default="20", name="shared:rsync_threads", title="Rsync threads", description="The maximum number of concurrent uploads."), UploadWithTracker( "http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader=downloader, version=VERSION, files=[ ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz"), ItemInterpolation("%(data_dir)s/%(warc_file_base)s_data.txt") ], rsync_target_source_path=ItemInterpolation("%(data_dir)s/"), rsync_extra_args=[ "--sockopts=SO_SNDBUF=8388608,SO_RCVBUF=8388608", # 02:50 <Kenshin> the extra options should improve rsync speeds when the latency is higher "--recursive", "--partial", "--partial-dir", ".rsync-tmp", "--min-size", "1", "--no-compress", "--compress-level=0" ]), ), SendDoneToTracker(tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
IOLoop.instance().add_timeout(datetime.timedelta(seconds=10), functools.partial(self._finish, item)) def _finish(self, item): item.may_be_canceled = False self.complete_item(item) class IdleTask(Task): def __init__(self): Task.__init__(self, 'IdleTask') def enqueue(self, item): self.start_item(item) item.may_be_canceled = True item.log_output('Pausing for 60 seconds...') IOLoop.instance().add_timeout(datetime.timedelta(seconds=60), functools.partial(self._finish, item)) def _finish(self, item): item.may_be_canceled = False self.complete_item(item) pipeline = Pipeline( WarningTask(), LimitConcurrent(1, ExternalProcess('Install', ['./install.sh'])), IdleTask(), )
# This will be shown in the warrior management panel. The logo should not # be too big. The deadline is optional. project = Project( title="sourceforgersync", project_html=""" <img class="project-logo" alt="Project logo" src="" height="50px" title=""/> <h2>sourceforge.net <span class="links"><a href="http://sourceforge.net/">Website</a> · <a href="http://tracker.archiveteam.org/sourceforge/">Leaderboard</a></span></h2> <p>Saving all project from SourceForge. rsyncing all of the source code repositories.</p> """ ) pipeline = Pipeline( CheckIP(), GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader, VERSION), ExternalProcess("Size Test",[RSYNC_TEST,"-t",getRsyncURL("foo"),"-m",MAX_RSYNC]), LimitConcurrent(1,ExternalProcess("rsync", ["rsync", "-av", getRsyncURL("foo"), cleanItem("%(data_dir)s/%(item_name)s")])), ExternalProcess("tar", ["tar", "-czf", cleanItem("%(data_dir)s/%(item_name)s.tar.gz"), "-C", ItemInterpolation("%(data_dir)s/"), "--owner=1999", "--group=2015", "--no-same-permissions", cleanItem("%(item_name)s")]), LimitConcurrent(NumberConfigValue(min=1, max=4, default="1", name="shared:rsync_threads", title="Rsync threads", description="The maximum number of concurrent uploads."), UploadWithTracker( "http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader=downloader, version=VERSION, files=[ cleanItem("%(data_dir)s/%(item_name)s.tar.gz") #ItemInterpolation("foo.tar.gz") ], rsync_target_source_path=ItemInterpolation("%(data_dir)s/"), rsync_extra_args=[ "--recursive",
functools.partial(self._finish, item)) def _finish(self, item): item.may_be_canceled = False self.complete_item(item) class IdleTask(Task): def __init__(self): Task.__init__(self, 'IdleTask') def enqueue(self, item): self.start_item(item) item.may_be_canceled = True item.log_output('Pausing for 60 seconds...') IOLoop.instance().add_timeout(datetime.timedelta(seconds=60), functools.partial(self._finish, item)) def _finish(self, item): item.may_be_canceled = False self.complete_item(item) pipeline = Pipeline( WarningTask(), LimitConcurrent( 1, ExternalProcess('Install Python 3.5', ['install-python3.5.sh'])), IdleTask(), )
# Task chain for a single item. The tasks are handed to Pipeline in the
# order they are listed here.
pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker(
        "http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
        downloader,
        VERSION,
    ),
    PrepareDirectories(),
    WgetDownload(WgetArgs()),
    DeduplicateWarcExtProc(DedupeArgs()),
    PrepareStatsForTracker(
        defaults={
            "downloader": downloader,
            "version": VERSION,
        },
        file_groups={
            # The "data" group points at the .deduplicated.warc.gz path
            # under data_dir, named after the item.
            "data": [
                ItemInterpolation(
                    "%(data_dir)s/%(item_name)s.deduplicated.warc.gz"
                ),
            ],
        },
        id_function=stats_id_function,
    ),
    MoveFiles(),
    LimitConcurrent(
        # The limit is the "shared:rsync_threads" config value
        # (min 1, max 4, default "1"); it wraps the UploadToIA task.
        NumberConfigValue(
            min=1,
            max=4,
            default="1",
            name="shared:rsync_threads",
            title="Rsync threads",
            description="The maximum number of concurrent uploads.",
        ),
        UploadToIA(UploadToIAArgs()),
    ),
    DeleteFiles(),
    # Reports to the same tracker URL the item was fetched from, passing
    # along the item's "stats" value.
    SendDoneToTracker(
        tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
        stats=ItemValue("stats"),
    ),
)
'user_agent': user_agent, 'bind_address': globals().get('bind_address', ''), 'disco_tracker': DISCO_TRACKER_URL, "item_dir": ItemValue("item_dir"), }, accept_on_exit_code=[0], ), LimitConcurrent( NumberConfigValue( min=1, max=6, default=globals().get("num_procs", "1"), name="shared:fagrab:num_procs", title="Number of Processes", description="The maximum number of concurrent download processes." ), WgetDownload(WgetArgs(), max_tries=1, accept_on_exit_code=[0, 4, 7, 8], env={ "item_dir": ItemValue("item_dir"), "downloader": downloader, "item_name": ItemValue("item_name"), }), ), ExternalProcess( 'End', [sys.executable, 'helper.py', 'end'], env={ 'user_agent': user_agent, 'bind_address': globals().get('bind_address', ''), 'disco_tracker': DISCO_TRACKER_URL,
LimitConcurrent( NumberConfigValue( min=1, max=10, default="10", name="isohunt:download_threads", title="Isohunt downloading threads", description= "How many threads downloading Isohunt torrents and pages can run at once, to avoid throttling." ), WgetDownloadTorrentRange( [ WGET_LUA, "-U", USER_AGENT, "--no-check-certificate", "-e", "robots=off", "--rotate-dns", "--timeout", "60", "--level=inf", "--tries", "20", "--waitretry", "5", # "--bind-address", "%BIND_ADDRESS%", ], max_tries=5, accept_on_exit_code=[0]), ),
<h2>freeml.com <span class="links"><a href="http://www.freeml.com/">Website</a> · <a href="http://tracker.archiveteam.org/freeml/">Leaderboard</a></span></h2> ''') pipeline = Pipeline( CheckIP(), GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader, VERSION), PrepareDirectories(warc_prefix='freeml'), LimitConcurrent( NumberConfigValue( min=1, max=1, default='1', name='shared:wget_download', title='wget-lua threads', description='The maximum number of concurrent downloads.'), WgetDownload(WgetArgs(), max_tries=2, accept_on_exit_code=[0, 4, 8], env={ 'item_dir': ItemValue('item_dir'), 'item_value': ItemValue('item_value'), 'item_type': ItemValue('item_type'), 'warc_file_base': ItemValue('warc_file_base') }), ), PrepareStatsForTracker( defaults={ 'downloader': downloader, 'version': VERSION }, file_groups={ 'data':