Example #1
                                     MULTI_ITEM_SIZE), downloader, VERSION),
 PrepareDirectories(warc_prefix='halo'),
 WgetDownload(WgetArgs(),
              max_tries=1,
              accept_on_exit_code=[0, 4, 8],
              env={
                  'item_dir': ItemValue('item_dir'),
                  'warc_file_base': ItemValue('warc_file_base'),
                  'item_name_newline': ItemValue('item_name_newline')
              }), SetBadUrls(),
 PrepareStatsForTracker(
     defaults={
         'downloader': downloader,
         'version': VERSION
     },
     file_groups={
         'data':
         [ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.gz')]
     },
     id_function=stats_id_function,
 ), MoveFiles(),
 LimitConcurrent(
     NumberConfigValue(
         min=1,
         max=20,
         default='2',
         name='shared:rsync_threads',
         title='Rsync threads',
         description='The maximum number of concurrent uploads.'),
     UploadWithTracker(
         'http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
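Example #1 (and most of the examples below) passes a stats_id_function that the snippets never show. In seesaw projects this is conventionally a small module-level helper that returns extra identifying fields for the tracker; a minimal sketch, with purely illustrative field names:

import sys

import seesaw


def stats_id_function(item):
    # Illustrative only: attach a few fields that help the tracker tell
    # script versions apart. Real projects often add a script hash here too.
    return {
        'python_version': sys.version,
        'seesaw_version': seesaw.__version__,  # assumed available in seesaw's __init__
    }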
Example #2
     NumberConfigValue(
         min=1,
         max=20,
         default="1",
         name="shared:dedupe_threads",
         title="Deduplicate threads",
         description="The maximum number of concurrent dedupes."),
     DeduplicateWarcExtProc(),
 ),
 PrepareStatsForTracker(
     defaults={
         "downloader": downloader,
         "version": VERSION
     },
     file_groups={
         "data": [
             ItemInterpolation(
                 "%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz")
         ]
     },
     id_function=stats_id_function,
 ), MoveFiles(),
 LimitConcurrent(
     NumberConfigValue(
         min=1,
         max=4,
         default="1",
         name="shared:rsync_threads",
         title="Rsync threads",
         description="The maximum number of concurrent uploads."),
     UploadWithTracker(
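The listing is cut off at the UploadWithTracker( call. Judging by Example #3 below, the call continues with the tracker URL, the downloader and version, the files to upload, and an rsync target; a hedged sketch of how Example #2 likely continues (the file path reuses this example's own file_groups entry, and the rsync flags are placeholders):

 UploadWithTracker(
     "http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
     downloader=downloader,
     version=VERSION,
     files=[
         ItemInterpolation(
             "%(data_dir)s/%(warc_file_base)s-deduplicated.warc.gz")
     ],
     rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
     rsync_extra_args=["--recursive", "--partial",
                       "--partial-dir", ".rsync-tmp"]),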
Example #3
     YgaArgs(),
     max_tries=1,              # 2,          #changed
     accept_on_exit_code=[0],  # [0, 4, 8],  #changed
     env={
         'item_dir': ItemValue('item_dir'),
         'item_value': ItemValue('item_value'),
         'item_type': ItemValue('item_type'),
         'warc_file_base': ItemValue('warc_file_base'),
     }
 ),
 MoveFiles(),
 PrepareStatsForTracker(
     defaults={'downloader': downloader, 'version': VERSION},    # noqa: F821
     file_groups={
         'data': [
             ItemInterpolation('%(data_dir)s/%(warc_file_base)s.warc.gz')  #TODO ?
         ]
     },
     id_function=stats_id_function,
 ),
 LimitConcurrent(NumberConfigValue(min=1, max=20, default='2',
                                   name='shared:rsync_threads', title='Rsync threads',
                                   description='The maximum number of concurrent uploads.'),
                 UploadWithTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
                                   downloader=downloader,        # noqa: F821
                                   version=VERSION,
                                   files=ItemValue('files'),
                                   rsync_target_source_path=ItemInterpolation('%(data_dir)s/'),
                                   rsync_extra_args=[
                                                      '--recursive',
                                                      '--partial',
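ItemValue('files') and ItemInterpolation('%(data_dir)s/') above are lazy references: they are only resolved against the per-item dictionary when the task actually runs. A small standalone sketch of that behaviour, assuming seesaw's realize() helper (imported here from seesaw.config) and a plain dict standing in for the pipeline's item:

from seesaw.config import realize
from seesaw.item import ItemInterpolation, ItemValue

# A plain dict standing in for the item object the pipeline threads through its tasks.
item = {
    'data_dir': '/tmp/grab',
    'warc_file_base': 'example-20200101-000000',
    'files': ['/tmp/grab/example-20200101-000000.warc.gz'],
}

print(realize(ItemInterpolation('%(data_dir)s/%(warc_file_base)s.warc.gz'), item))
print(realize(ItemValue('files'), item))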
Example #4
      """,
                  utc_deadline=datetime.datetime(2013, 12, 26, 0, 0, 1))

pipeline = Pipeline(
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix='yahooblog'),
    WgetDownload(
        wget_args,
        max_tries=2,
        accept_on_exit_code=[0, 8],
    ),
    PrepareStatsForTracker(
        defaults={
            "downloader": downloader,
            "version": VERSION
        },
        file_groups={
            "data":
            [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")]
        }), MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=4,
            default="1",
            name="shared:rsync_threads",
            title="Rsync threads",
            description="The maximum number of concurrent uploads."),
        UploadWithTracker(
            "http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
            downloader=downloader,
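The pipeline in Example #4 downloads with a wget_args list defined earlier in the script and not shown here. A hypothetical sketch of what such a list tends to look like in WARC-writing grabs; every value below is illustrative, including the WGET_LUA and USER_AGENT constants:

wget_args = [
    WGET_LUA,                      # assumed path to the wget binary
    '-U', USER_AGENT,              # assumed user-agent string
    '-nv',
    '--no-cookies',
    '--timeout', '30',
    '--tries', 'inf',
    '--waitretry', '30',
    '-o', ItemInterpolation('%(item_dir)s/wget.log'),
    '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
    '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
    '--warc-header', 'operator: Archive Team',
    ItemInterpolation('%(item_value)s'),   # hypothetical item key holding the seed URL
]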
Example #5
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
project = Project(
    title='load',
    project_html='''
        <img class="project-logo" alt="Project logo" src="https://www.archiveteam.org/images/b/b5/Reddit_logo.png" height="50px" title=""/>
        <h2>reddit.com <span class="links"><a href="https://reddit.com/">Website</a> &middot; <a href="http://tracker.archiveteam.org/reddit/">Leaderboard</a></span></h2>
        <p>Archiving everything from reddit.</p>
    '''
)

pipeline = Pipeline(
    GetItemFromTracker('http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader,
        VERSION),
    PrepareStatsForTracker(
        defaults={'downloader': downloader, 'version': VERSION},
        file_groups={
            'data': [
                '/dev/null'
            ]
        },
        id_function=stats_id_function,
    ),
    SendDoneToTracker(
        tracker_url='http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
        stats=ItemValue('stats')
    )
)
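For context, the constants this (and every other) snippet references are normally plain module-level definitions near the top of the script, while downloader is injected by the seesaw runner at runtime (which is why Example #3 marks it with # noqa: F821). A hypothetical sketch with illustrative values:

VERSION = '20240101.01'          # bumped whenever the script changes
TRACKER_ID = 'reddit'
TRACKER_HOST = 'tracker.archiveteam.org'
# 'downloader' is intentionally NOT defined here: the warrior/runner supplies it.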

Example #6
    # gunzipped HTTP responses.  Note that the .gz compression on the WARC
    # itself remains.
    CookWARC(),

    # this will set the item["stats"] string that is sent to the tracker (see below)
    PrepareStatsForTracker(
        # there are a few normal values that need to be sent
        defaults={
            "downloader": downloader,
            "version": VERSION
        },
        # this is used for the size counter on the tracker:
        # the group names should correspond to the file groups configured on the tracker
        file_groups={
            # there can be multiple groups with multiple files
            # file sizes are measured per group
            "data": [
                ItemInterpolation(
                    "%(data_dir)s/%(warc_file_base)s.cooked.warc.gz")
            ],
            "hrefs":
            [ItemInterpolation("%(data_dir)s/%(warc_file_base)s.hrefs.bz2")]
        },
        id_function=(lambda item: {
            "ua": item["user_agent"]
        })),

    # there can be multiple items in the pipeline, but this wrapper ensures
    # that there is only one item uploading at a time
    #
    # the NumberConfigValue can be changed in the configuration panel
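The listing is cut off before the wrapper those last comments describe. Based on Examples #1-#3 above, the block conventionally combines LimitConcurrent, NumberConfigValue and UploadWithTracker; a hedged sketch that reuses the two file paths from this example's file_groups (the rsync flags are placeholders):

    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=20,
            default="2",
            name="shared:rsync_threads",
            title="Rsync threads",
            description="The maximum number of concurrent uploads."),
        UploadWithTracker(
            "http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
            downloader=downloader,
            version=VERSION,
            files=[
                ItemInterpolation(
                    "%(data_dir)s/%(warc_file_base)s.cooked.warc.gz"),
                ItemInterpolation(
                    "%(data_dir)s/%(warc_file_base)s.hrefs.bz2")
            ],
            rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
            rsync_extra_args=["--recursive", "--partial",
                              "--partial-dir", ".rsync-tmp"]),
    ),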