예제 #1
0
    def test_max_items_with_subproc(self):
        """A runner capped at max_items=3 must process exactly three items
        and stop cleanly, even when the pipeline ends in a subprocess task."""
        pipeline = Pipeline(
            PrintItem(), PrintItem(), ExternalProcess("pwd", ["pwd"]))
        pipeline.has_failed = None

        def mark_failure(task, item):
            pipeline.has_failed = True

        pipeline.on_fail_item += mark_failure

        runner = SimpleRunner(pipeline, max_items=3)

        def stop_runaway(runner, pipeline, item):
            # Safety net: abort the test outright if the runner blows
            # far past its cap instead of looping forever.
            if runner.item_count > 10:
                raise Exception('Too many items.')

        runner.on_pipeline_finish_item += stop_runaway
        runner.start()

        self.assertFalse(pipeline.has_failed)
        self.assertEqual(3, runner.item_count)
        self.assertIOLoopOK()
예제 #2
0
             <a href="http://tracker.archiveteam.org/quizilladisco/">Leaderboard</a>
             <a href="http://archiveteam.org/index.php?title=Quizilla">Wiki</a> &middot;
         </span>
        </h2>
        <p>Quizilla shuts down. This is phase 1: content discovery.</p>
    """,
                  utc_deadline=datetime.datetime(2014, 10, 1, 23, 59, 0))

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
                       downloader, VERSION),
    PrepareDirectories(warc_prefix="quizilladisco"),
    ExternalProcess('Scraper',
                    CustomProcessArgs(),
                    max_tries=2,
                    accept_on_exit_code=[0],
                    env={"item_dir": ItemValue("item_dir")}),
    PrepareStatsForTracker(
        defaults={
            "downloader": downloader,
            "version": VERSION
        },
        file_groups={
            "data":
            [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.txt.gz")]
        },
        id_function=stats_id_function,
    ), MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(
예제 #3
0
                  "warc_file_base": ItemValue("warc_file_base"),
              }),
 PrepareStatsForTracker(
     defaults={
         "downloader": downloader,
         "version": VERSION
     },
     file_groups={
         "data":
         [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")]
     },
     id_function=stats_id_function,
 ), MoveFiles(),
 ExternalProcess("rsync", [
     "rsync", "-avz", "--progress",
     ItemInterpolation("%(data_dir)s/%(warc_file_base)s_data.txt"),
     "rsync://storage.harrycross.me/dev/bayimg"
 ]),
 LimitConcurrent(
     NumberConfigValue(
         min=1,
         max=4,
         default="1",
         name="shared:rsync_threads",
         title="Rsync threads",
         description="The maximum number of concurrent uploads."),
     UploadWithTracker(
         "http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
         downloader=downloader,
         version=VERSION,
         files=[
예제 #4
0
# Project metadata shown in the warrior management panel; the HTML is
# interpolated with the tracker host/ID for the leaderboard link.
project = Project(
    title="Yahoo! Blog & Wretch Username",
    project_html="""
    <img class="project-logo" alt="" src="http://archiveteam.org/images/7/76/Archiveteam1.png" height="50" />
    <h2>Yahoo! Blog & Wretch <span class="links"><a href="http://blog.yahoo.com/">Yahoo! Blog</a> &middot; <a href="http://www.wretch.cc/">Wretch</a> &middot; <a href="http://%s/%s/">Leaderboard</a></span></h2>
    <p><b>Yahoo!</b> is a horrible monster.</p>
    """ % (TRACKER_HOST, TRACKER_ID),
    utc_deadline=datetime.datetime(2013, 12, 26, 0, 0, 1),
)

pipeline = Pipeline(
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
                       downloader, VERSION),
    PrepareDirectories(warc_prefix="ybw-username"),
    ExternalProcess(
        'Scraper', [
            "python", "scraper.py",
            ItemInterpolation("%(item_name_punycode)s"),
            ItemInterpolation("%(item_dir)s/%(warc_file_base)s")
        ],
        env={'SCRAPER_BIND_ADDRESS': globals().get('bind_address', '')}),
    PrepareStatsForTracker(
        defaults={
            "downloader": downloader,
            "version": VERSION
        },
        file_groups={
            "data": [
                ItemInterpolation(
                    "%(item_dir)s/%(warc_file_base)s.wretch.txt"),
                ItemInterpolation("%(item_dir)s/%(warc_file_base)s.yahoo.txt"),
            ]
        }), MoveFiles(),
예제 #5
0
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
# Project metadata for the warrior management panel.
# Fix: user-facing text read "Saving all project" — corrected to
# "Saving all projects" (panel HTML, so the typo was visible to users).
project = Project(
	title="sourceforgersync",
	project_html="""
		<img class="project-logo" alt="Project logo" src="" height="50px" title=""/>
		<h2>sourceforge.net <span class="links"><a href="http://sourceforge.net/">Website</a> &middot; <a href="http://tracker.archiveteam.org/sourceforge/">Leaderboard</a></span></h2>
		<p>Saving all projects from SourceForge. rsyncing all of the source code repositories.</p>
	"""
)

pipeline = Pipeline(
	CheckIP(),
	GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader, VERSION),
	ExternalProcess("Size Test",[RSYNC_TEST,"-t",getRsyncURL("foo"),"-m",MAX_RSYNC]),
	LimitConcurrent(1,ExternalProcess("rsync", ["rsync", "-av", getRsyncURL("foo"), cleanItem("%(data_dir)s/%(item_name)s")])),
	ExternalProcess("tar", ["tar", "-czf", cleanItem("%(data_dir)s/%(item_name)s.tar.gz"), "-C", ItemInterpolation("%(data_dir)s/"), "--owner=1999", "--group=2015", "--no-same-permissions", cleanItem("%(item_name)s")]),
	LimitConcurrent(NumberConfigValue(min=1, max=4, default="1",
		name="shared:rsync_threads", title="Rsync threads",
		description="The maximum number of concurrent uploads."),
		UploadWithTracker(
			"http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
			downloader=downloader,
			version=VERSION,
			files=[
				cleanItem("%(data_dir)s/%(item_name)s.tar.gz")
				#ItemInterpolation("foo.tar.gz")
			],
			rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
			rsync_extra_args=[
        IOLoop.instance().add_timeout(datetime.timedelta(seconds=10),
                                      functools.partial(self._finish, item))

    def _finish(self, item):
        # Timeout callback: the scheduled pause is over, so the item can no
        # longer be cancelled; hand it back to the pipeline as completed.
        item.may_be_canceled = False
        self.complete_item(item)


class IdleTask(Task):
    """Pause the pipeline for 60 seconds between items.

    While idling, the item is flagged cancellable so the warrior UI can
    stop it; the IOLoop timeout then clears the flag and completes it.
    """

    def __init__(self):
        Task.__init__(self, 'IdleTask')

    def enqueue(self, item):
        self.start_item(item)
        item.may_be_canceled = True
        item.log_output('Pausing for 60 seconds...')

        # Schedule the wake-up on the shared IOLoop instead of blocking.
        wake_up = functools.partial(self._finish, item)
        IOLoop.instance().add_timeout(datetime.timedelta(seconds=60), wake_up)

    def _finish(self, item):
        # The pause ended; the item may no longer be cancelled.
        item.may_be_canceled = False
        self.complete_item(item)


# Warn the operator, run the install script (serialized to one instance),
# then idle so the warrior stays responsive between attempts.
pipeline = Pipeline(
    WarningTask(),
    LimitConcurrent(
        1,
        ExternalProcess('Install', ['./install.sh']),
    ),
    IdleTask(),
)
                                      functools.partial(self._finish, item))

    def _finish(self, item):
        # Timeout callback: the scheduled pause is over, so the item can no
        # longer be cancelled; hand it back to the pipeline as completed.
        item.may_be_canceled = False
        self.complete_item(item)


class IdleTask(Task):
    """Idle for 60 seconds so the warrior has a chance to intervene."""

    def __init__(self):
        Task.__init__(self, 'IdleTask')

    def enqueue(self, item):
        self.start_item(item)
        # The item is cancellable only while we are idling.
        item.may_be_canceled = True
        item.log_output('Pausing for 60 seconds...')

        IOLoop.instance().add_timeout(
            datetime.timedelta(seconds=60),
            functools.partial(self._finish, item))

    def _finish(self, item):
        item.may_be_canceled = False  # pause over: no longer cancellable
        self.complete_item(item)


# Warn the operator, install Python 3.5 (only one install at a time),
# then idle so the warrior stays responsive between attempts.
pipeline = Pipeline(
    WarningTask(),
    LimitConcurrent(
        1,
        ExternalProcess('Install Python 3.5', ['install-python3.5.sh']),
    ),
    IdleTask(),
)
예제 #8
0
            ]
        },
        id_function=stats_id_function,
    ),
    MoveFiles(),
    # LimitConcurrent(
    #     NumberConfigValue(min=1, max=4, default="1",
    #                       name="shared:rsync_threads", title="Rsync threads",
    #                       description="The maximum number of concurrent uploads."),
    #     UploadWithTracker(
    #         "http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
    #         downloader=downloader,
    #         version=VERSION,
    #         files=[
    #             ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz"),
    #         ],
    #         rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
    #         rsync_extra_args=[
    #             "--recursive",
    #             "--partial",
    #             "--partial-dir", ".rsync-tmp",
    #             ]
    #     ),
    # ),
    # SendDoneToTracker(
    #     tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
    #     stats=ItemValue("stats")
    # )
    ExternalProcess("sleep", ["sleep", "60"]),
)
예제 #9
0
        <p>Downloading FurAffinity</p>
        <!--<p class="projectBroadcastMessage"></p>-->
    """,
    # utc_deadline=datetime.datetime(2000, 1, 1, 23, 59, 0)
)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix="furaffinity"),
    ExternalProcess(
        'Begin',
        [sys.executable, 'helper.py', 'begin'],
        env={
            'user_agent': user_agent,
            'bind_address': globals().get('bind_address', ''),
            'disco_tracker': DISCO_TRACKER_URL,
            "item_dir": ItemValue("item_dir"),
        },
        accept_on_exit_code=[0],
    ),
    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=6,
            default=globals().get("num_procs", "1"),
            name="shared:fagrab:num_procs",
            title="Number of Processes",
            description="The maximum number of concurrent download processes."
        ),
        WgetDownload(WgetArgs(),