def __init__(self, args):
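 # Debug aid: echo the argument vector before handing it to ExternalProcess.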
     print(args)
     ExternalProcess.__init__(
         self,
         "UploadToIA",
         args=args,
     )
Example #2
 def __init__(self, args, max_tries=1, accept_on_exit_code=None,
              retry_on_exit_code=None, env=None, stdin_data_function=None):
     ExternalProcess.__init__(
         self, "YgaDownload",
         args=args, max_tries=max_tries,
         accept_on_exit_code=(accept_on_exit_code
                              if accept_on_exit_code is not None else [0]),
         retry_on_exit_code=retry_on_exit_code,
         env=env)
     self.stdin_data_function = stdin_data_function
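The "accept_on_exit_code if ... is not None else [0]" pattern above is the standard way to avoid a mutable default argument in Python: the signature defaults to None and the list is substituted at call time. A minimal sketch of the same idiom (EchoProcess and its echo command are hypothetical; the constructor keywords are the ones used throughout these examples):

from seesaw.externalprocess import ExternalProcess

class EchoProcess(ExternalProcess):
    def __init__(self, accept_on_exit_code=None):
        ExternalProcess.__init__(
            self, "EchoProcess", args=["echo", "ok"],
            # Substitute [0] at call time instead of in the signature so the
            # default list is never shared between instances.
            accept_on_exit_code=(accept_on_exit_code
                                 if accept_on_exit_code is not None else [0]))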
Example #3
	def __init__(self):
		args = [
			sys.executable,
			os.path.join(PIPELINE_DIR, "warc2warc_greader.py"),
			"--gzip",
			"--decode_http",
			"--output", ItemInterpolation("%(data_dir)s/%(warc_file_base)s.cooked.warc.gz"),
			ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz")
		]
		ExternalProcess.__init__(self, "CookWARC", args)
Example #4
    def __init__(self):
        env = {'PYTHONPATH': 'terroroftinytown'}

        ExternalProcess.__init__(
            self,
            'RunScraper', [
                sys.executable, 'scraper.py', TRACKER_HOST, VERSION,
                globals()['downloader'],
                globals().get('bind_address', ''), USER_AGENT
            ],
            env=env)
Example #5
 def __init__(self):
     args = [
         sys.executable,
         os.path.join(PIPELINE_DIR, "warc2warc_greader.py"), "--gzip",
         "--decode_http", "--strip-404s", "--json-hrefs-file",
         ItemInterpolation("%(data_dir)s/%(warc_file_base)s.hrefs.bz2"),
         "--output",
         ItemInterpolation(
             "%(data_dir)s/%(warc_file_base)s.cooked.warc.gz"),
         ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz")
     ]
     ExternalProcess.__init__(self, "CookWARC", args)
Example #6
    def test_max_items_with_subproc(self):
        pipeline = Pipeline(PrintItem(), PrintItem(),
                            ExternalProcess("pwd", ["pwd"]))
        pipeline.has_failed = None

        def fail_callback(task, item):
            pipeline.has_failed = True

        pipeline.on_fail_item += fail_callback

        runner = SimpleRunner(pipeline, max_items=3)

        def finish_item_callback(runner, pipeline, item):
            if runner.item_count > 10:
                raise Exception('Too many items.')

        runner.on_pipeline_finish_item += finish_item_callback
        runner.start()

        self.assertFalse(pipeline.has_failed)
        self.assertEqual(3, runner.item_count)
        self.assertIOLoopOK()
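For reference, a minimal standalone run of an ExternalProcess under SimpleRunner; a sketch assuming seesaw-kit's module layout (seesaw.pipeline, seesaw.externalprocess, seesaw.runner):

from seesaw.pipeline import Pipeline
from seesaw.externalprocess import ExternalProcess
from seesaw.runner import SimpleRunner

# Run "pwd" once; start() drives the IOLoop until max_items items finish.
pipeline = Pipeline(ExternalProcess("pwd", ["pwd"]))
runner = SimpleRunner(pipeline, max_items=1)
runner.start()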
Example #7
project = Project(title="Yahoo! Blog & Wretch Username",
                  project_html="""
    <img class="project-logo" alt="" src="http://archiveteam.org/images/7/76/Archiveteam1.png" height="50" />
    <h2>Yahoo! Blog & Wretch <span class="links"><a href="http://blog.yahoo.com/">Yahoo! Blog</a> &middot; <a href="http://www.wretch.cc/">Wretch</a> &middot; <a href="http://%s/%s/">Leaderboard</a></span></h2>
    <p><b>Yahoo!</b> is a horrible monster.</p>
    """ % (TRACKER_HOST, TRACKER_ID),
                  utc_deadline=datetime.datetime(2013, 12, 26, 00, 00, 1))

pipeline = Pipeline(
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
                       downloader, VERSION),
    PrepareDirectories(warc_prefix="ybw-username"),
    ExternalProcess(
        'Scraper', [
            "python", "scraper.py",
            ItemInterpolation("%(item_name_punycode)s"),
            ItemInterpolation("%(item_dir)s/%(warc_file_base)s")
        ],
        env={'SCRAPER_BIND_ADDRESS': globals().get('bind_address', '')}),
    PrepareStatsForTracker(
        defaults={
            "downloader": downloader,
            "version": VERSION
        },
        file_groups={
            "data": [
                ItemInterpolation(
                    "%(item_dir)s/%(warc_file_base)s.wretch.txt"),
                ItemInterpolation("%(item_dir)s/%(warc_file_base)s.yahoo.txt"),
            ]
        }), MoveFiles(),
Example #8
             <a href="http://tracker.archiveteam.org/quizilladisco/">Leaderboard</a>
             <a href="http://archiveteam.org/index.php?title=Quizilla">Wiki</a> &middot;
         </span>
        </h2>
        <p>Quizilla shuts down. This is phase 1: content discovery.</p>
    """,
                  utc_deadline=datetime.datetime(2014, 10, 1, 23, 59, 0))

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
                       downloader, VERSION),
    PrepareDirectories(warc_prefix="quizilladisco"),
    ExternalProcess('Scraper',
                    CustomProcessArgs(),
                    max_tries=2,
                    accept_on_exit_code=[0],
                    env={"item_dir": ItemValue("item_dir")}),
    PrepareStatsForTracker(
        defaults={
            "downloader": downloader,
            "version": VERSION
        },
        file_groups={
            "data":
            [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.txt.gz")]
        },
        id_function=stats_id_function,
    ), MoveFiles(),
    LimitConcurrent(
        NumberConfigValue(
Example #9
 def on_subprocess_stdout(self, pipe, item, data):
     ExternalProcess.on_subprocess_stdout(self, pipe, item, data)
     self.output_buffer.write(data.decode('utf8', 'replace'))
Example #10
                  "warc_file_base": ItemValue("warc_file_base"),
              }),
 PrepareStatsForTracker(
     defaults={
         "downloader": downloader,
         "version": VERSION
     },
     file_groups={
         "data":
         [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")]
     },
     id_function=stats_id_function,
 ), MoveFiles(),
 ExternalProcess("rsync", [
     "rsync", "-avz", "--progress",
     ItemInterpolation("%(data_dir)s/%(warc_file_base)s_data.txt"),
     "rsync://storage.harrycross.me/dev/bayimg"
 ]),
 LimitConcurrent(
     NumberConfigValue(
         min=1,
         max=4,
         default="1",
         name="shared:rsync_threads",
         title="Rsync threads",
         description="The maximum number of concurrent uploads."),
     UploadWithTracker(
         "http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
         downloader=downloader,
         version=VERSION,
         files=[
Example #11
 def on_subprocess_stdout(self, pipe, item, data):
     ExternalProcess.on_subprocess_stdout(self, pipe, item, data)
     self.output_buffer.write(data)
Example #12
 def on_subprocess_end(self, item, returncode):
     ExternalProcess.on_subprocess_end(self, item, returncode)
     self.return_code = returncode
     self.exit_count += 1
Example #13
 def __init__(self, *args, **kwargs):
     ExternalProcess.__init__(self, *args, **kwargs)
     self.output_buffer = StringIO()
     self.return_code = None
     self.exit_count = 0
     self.retry_delay = 0.1
Example #14
 def __init__(self, *args, **kwargs):
     ExternalProcess.__init__(self, *args, **kwargs)
     self.output_buffer = StringIO()
     self.return_code = None
     self.exit_count = 0
     self.retry_delay = 0.1
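Examples #9 and #12-#14 are fragments of one recurring test helper. A minimal sketch combining them (the name TestableProcess is hypothetical; the hooks and attributes are exactly those shown above, and StringIO is io.StringIO on Python 3):

from io import StringIO

from seesaw.externalprocess import ExternalProcess


class TestableProcess(ExternalProcess):
    def __init__(self, *args, **kwargs):
        ExternalProcess.__init__(self, *args, **kwargs)
        self.output_buffer = StringIO()   # captured stdout (Example #13)
        self.return_code = None
        self.exit_count = 0
        self.retry_delay = 0.1            # retry quickly in tests

    def on_subprocess_stdout(self, pipe, item, data):
        # Chain to the base class, then keep a decoded copy (Example #9).
        ExternalProcess.on_subprocess_stdout(self, pipe, item, data)
        self.output_buffer.write(data.decode('utf8', 'replace'))

    def on_subprocess_end(self, item, returncode):
        # Record the exit code and count exits (Example #12).
        ExternalProcess.on_subprocess_end(self, item, returncode)
        self.return_code = returncode
        self.exit_count += 1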
Example #15
 def __init__(self):
     ExternalProcess.__init__(self,
                              'UpdateSubmodule',
                              self.NEW_ARGS,
                              max_tries=5,
                              retry_delay=2)
Example #16
 def on_subprocess_end(self, item, returncode):
     ExternalProcess.on_subprocess_end(self, item, returncode)
     self.return_code = returncode
     self.exit_count += 1
Example #17
        IOLoop.instance().add_timeout(datetime.timedelta(seconds=10),
                                      functools.partial(self._finish, item))

    def _finish(self, item):
        item.may_be_canceled = False
        self.complete_item(item)


class IdleTask(Task):
    def __init__(self):
        Task.__init__(self, 'IdleTask')

    def enqueue(self, item):
        self.start_item(item)
        item.may_be_canceled = True
        item.log_output('Pausing for 60 seconds...')

        IOLoop.instance().add_timeout(datetime.timedelta(seconds=60),
                                      functools.partial(self._finish, item))

    def _finish(self, item):
        item.may_be_canceled = False
        self.complete_item(item)


pipeline = Pipeline(
    WarningTask(),
    LimitConcurrent(
        1, ExternalProcess('Install Python 3.5', ['install-python3.5.sh'])),
    IdleTask(),
)
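Note the LimitConcurrent(1, ...) wrapper: even when the warrior runs several items at once, at most one copy of the install script executes at a time.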
Example #18
            ]
        },
        id_function=stats_id_function,
    ),
    MoveFiles(),
    # LimitConcurrent(
    #     NumberConfigValue(min=1, max=4, default="1",
    #                       name="shared:rsync_threads", title="Rsync threads",
    #                       description="The maximum number of concurrent uploads."),
    #     UploadWithTracker(
    #         "http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
    #         downloader=downloader,
    #         version=VERSION,
    #         files=[
    #             ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz"),
    #         ],
    #         rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
    #         rsync_extra_args=[
    #             "--recursive",
    #             "--partial",
    #             "--partial-dir", ".rsync-tmp",
    #             ]
    #     ),
    # ),
    # SendDoneToTracker(
    #     tracker_url="http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
    #     stats=ItemValue("stats")
    # )
    ExternalProcess("sleep", ["sleep", "60"]),
)
Example #19
 def __init__(self, args):
     ExternalProcess.__init__(
         self,
         "WgetDownload",
         args=args,
     )
Example #20
 def __init__(self, args):
     ExternalProcess.__init__(
         self,
         "DeduplicateWarcExtProc",
         args=args,
     )
Example #21
        <p>Downloading FurAffinity</p>
        <!--<p class="projectBroadcastMessage"></p>-->
    """,
    # utc_deadline=datetime.datetime(2000, 1, 1, 23, 59, 0)
)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                       VERSION), PrepareDirectories(warc_prefix="furaffinity"),
    ExternalProcess(
        'Begin',
        [sys.executable, 'helper.py', 'begin'],
        env={
            'user_agent': user_agent,
            'bind_address': globals().get('bind_address', ''),
            'disco_tracker': DISCO_TRACKER_URL,
            "item_dir": ItemValue("item_dir"),
        },
        accept_on_exit_code=[0],
    ),
    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=6,
            default=globals().get("num_procs", "1"),
            name="shared:fagrab:num_procs",
            title="Number of Processes",
            description="The maximum number of concurrent download processes."
        ),
        WgetDownload(WgetArgs(),
Example #22
 def handle_process_error(self, exit_code, item):
     self.args = self.OLD_ARGS
     item.log_output('Submodule could not be automatically updated.')
     item.log_output('* It is safe to ignore the following error. *')
     ExternalProcess.handle_process_error(self, exit_code, item)
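Read together with Example #15, this is a retry-with-fallback pattern: start with NEW_ARGS, and on failure swap in OLD_ARGS before the next of the five tries. A minimal sketch (the concrete git commands are assumptions for illustration; only the class structure is taken from the examples):

from seesaw.externalprocess import ExternalProcess


class UpdateSubmodule(ExternalProcess):
    # Hypothetical commands; the real NEW_ARGS/OLD_ARGS are not shown above.
    NEW_ARGS = ['git', 'submodule', 'update', '--init', '--recursive']
    OLD_ARGS = ['git', 'submodule', 'update']

    def __init__(self):
        ExternalProcess.__init__(self, 'UpdateSubmodule', self.NEW_ARGS,
                                 max_tries=5, retry_delay=2)

    def handle_process_error(self, exit_code, item):
        # Fall back to the older arguments for the remaining retries.
        self.args = self.OLD_ARGS
        item.log_output('Submodule could not be automatically updated.')
        item.log_output('* It is safe to ignore the following error. *')
        ExternalProcess.handle_process_error(self, exit_code, item)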
Example #23
        IOLoop.instance().add_timeout(datetime.timedelta(seconds=10),
                                      functools.partial(self._finish, item))

    def _finish(self, item):
        item.may_be_canceled = False
        self.complete_item(item)


class IdleTask(Task):
    def __init__(self):
        Task.__init__(self, 'IdleTask')

    def enqueue(self, item):
        self.start_item(item)
        item.may_be_canceled = True
        item.log_output('Pausing for 60 seconds...')

        IOLoop.instance().add_timeout(datetime.timedelta(seconds=60),
                                      functools.partial(self._finish, item))

    def _finish(self, item):
        item.may_be_canceled = False
        self.complete_item(item)


pipeline = Pipeline(
    WarningTask(),
    LimitConcurrent(1, ExternalProcess('Install', ['./install.sh'])),
    IdleTask(),
)
Example #24
 def __init__(self, args):
     ExternalProcess.__init__(
         self, "DeduplicateWarcExtProc", args=args, accept_on_exit_code=[0],
         retry_on_exit_code=[2])
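Unlike Example #20, this version treats exit code 0 as success, retries on exit code 2, and fails the item on any other exit code.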
Example #25
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
project = Project(
	title="sourceforgersync",
	project_html="""
		<img class="project-logo" alt="Project logo" src="" height="50px" title=""/>
		<h2>sourceforge.net <span class="links"><a href="http://sourceforge.net/">Website</a> &middot; <a href="http://tracker.archiveteam.org/sourceforge/">Leaderboard</a></span></h2>
		<p>Saving all project from SourceForge. rsyncing all of the source code repositories.</p>
	"""
)

pipeline = Pipeline(
	CheckIP(),
	GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader, VERSION),
	ExternalProcess("Size Test",[RSYNC_TEST,"-t",getRsyncURL("foo"),"-m",MAX_RSYNC]),
	LimitConcurrent(1,ExternalProcess("rsync", ["rsync", "-av", getRsyncURL("foo"), cleanItem("%(data_dir)s/%(item_name)s")])),
	ExternalProcess("tar", ["tar", "-czf", cleanItem("%(data_dir)s/%(item_name)s.tar.gz"), "-C", ItemInterpolation("%(data_dir)s/"), "--owner=1999", "--group=2015", "--no-same-permissions", cleanItem("%(item_name)s")]),
	LimitConcurrent(NumberConfigValue(min=1, max=4, default="1",
		name="shared:rsync_threads", title="Rsync threads",
		description="The maximum number of concurrent uploads."),
		UploadWithTracker(
			"http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
			downloader=downloader,
			version=VERSION,
			files=[
				cleanItem("%(data_dir)s/%(item_name)s.tar.gz")
				#ItemInterpolation("foo.tar.gz")
			],
			rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
			rsync_extra_args=[