'data': [ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.gz')] }, id_function=stats_id_function, ), MoveFiles(), LimitConcurrent( NumberConfigValue( min=1, max=20, default='20', name='shared:rsync_threads', title='Rsync threads', description='The maximum number of concurrent uploads.'), UploadWithTracker( 'http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader=downloader, version=VERSION, files=[ ItemInterpolation('%(data_dir)s/%(warc_file_base)s.warc.gz') ], rsync_target_source_path=ItemInterpolation('%(data_dir)s/'), rsync_extra_args=[ '--recursive', '--partial', '--partial-dir', '.rsync-tmp', ]), ), SendDoneToTracker(tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID), stats=ItemValue('stats')))
ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz"), ] }, id_function=stats_id_function, ), MoveFiles(), LimitConcurrent( NumberConfigValue(min=1, max=4, default="1", name="shared:rsync_threads", title="Rsync threads", description="The maximum number of concurrent uploads."), UploadWithTracker( "http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader=downloader, version=VERSION, files=[ ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz"), ], rsync_target_source_path=ItemInterpolation("%(data_dir)s/"), rsync_extra_args=[ "--recursive", "--partial", "--partial-dir", ".rsync-tmp", ] ), ), SendDoneToTracker( tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID), stats=ItemValue("stats") ) )
}, id_function=stats_id_function, ), MoveFiles(), LimitConcurrent(NumberConfigValue(min=1, max=20, default='2', name='shared:rsync_threads', title='Rsync threads', description='The maximum number of concurrent uploads.'), UploadWithTracker( 'http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader=downloader, version=VERSION, files=[ ItemInterpolation('%(data_dir)s/%(warc_file_base)s.%(dict_project)s.%(dict_id)s.warc.zst'), ItemInterpolation('%(data_dir)s/%(warc_file_base)s_data.txt') ], rsync_target_source_path=ItemInterpolation('%(data_dir)s/'), rsync_extra_args=[ '--recursive', '--partial', '--partial-dir', '.rsync-tmp', '--min-size', '1', '--no-compress', '--compress-level', '0' ] ), ), SendDoneToTracker( tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID), stats=ItemValue('stats') ) )
max=4, default="2", name="shared:rsync_threads", title="Rsync threads", description="The maximum number of concurrent uploads."), # this upload task asks the tracker for an upload target # this can be HTTP or rsync and can be changed in the tracker admin panel UploadWithTracker( TRACKER_URL, downloader=downloader, version=VERSION, # list the files that should be uploaded. # this may include directory names. # note: HTTP uploads will only upload the first file on this list files=[ ItemInterpolation("%(data_dir)s/%(warc_file_base)s.hrefs.bz2"), ItemInterpolation( "%(data_dir)s/%(warc_file_base)s.cooked.warc.gz") ], # the relative path for the rsync command # (this defines if the files are uploaded to a subdirectory on the server) rsync_target_source_path=ItemInterpolation("%(data_dir)s/"), # extra rsync parameters (probably standard) rsync_extra_args=[ "--recursive", "--partial", "--partial-dir", ".rsync-tmp" ]), ), # if the item passed every task, notify the tracker and report the statistics SendDoneToTracker(tracker_url=TRACKER_URL, stats=ItemValue("stats")))