Пример #1
0
 def data(self, item):
     """Build the payload dict for ``item``.

     The dict always holds the realized ``downloader`` and the fixed
     ``api_version`` string "2". The realized ``version`` is added only
     when ``self.version`` is truthy.
     """
     payload = dict(
         downloader=realize(self.downloader, item),
         api_version="2"
     )
     if not self.version:
         return payload
     payload["version"] = realize(self.version, item)
     return payload
Пример #2
0
 def data(self, item):
     """Build the payload dict for ``item``.

     The dict always holds the realized ``downloader`` and the fixed
     ``api_version`` string "2". The realized ``version`` is added only
     when ``self.version`` is truthy.
     """
     payload = dict(
         downloader=realize(self.downloader, item),
         api_version="2"
     )
     if not self.version:
         return payload
     payload["version"] = realize(self.version, item)
     return payload
Пример #3
0
 def data(self, item):
     """Build the payload dict for ``item``.

     The dict always holds the realized ``downloader`` and the item's
     ``item_name``. The realized ``version`` is added only when
     ``self.version`` is truthy.
     """
     payload = dict(
         downloader=realize(self.downloader, item),
         item_name=item["item_name"]
     )
     if not self.version:
         return payload
     payload["version"] = realize(self.version, item)
     return payload
Пример #4
0
 def stdin_data(self, item):
     """Return the file list as UTF-8 bytes, one relative path per line.

     Every realized entry of ``self.files`` is made relative to the
     realized ``self.target_source_path`` and terminated with a newline.
     """
     lines = []
     for entry in realize(self.files, item):
         relative = os.path.relpath(
             realize(entry, item),
             realize(self.target_source_path, item)
         )
         lines.append("%s\n" % relative)
     return "".join(lines).encode('utf-8')
Пример #5
0
def start_warrior_server(warrior,
                         bind_address="localhost",
                         port_number=8001,
                         http_username=None,
                         http_password=None):
    '''Start the warrior web interface.

    Hooks the ``SeesawConnection`` handlers onto the events of ``warrior``
    and ``warrior.runner``, starts the two periodic broadcasts (interval
    1000) and makes the web application listen on ``port_number`` at
    ``bind_address``.

    A falsy ``http_username`` / ``http_password`` falls back to the value
    stored on ``warrior``. Authentication counts as enabled only while the
    realized password is non-blank; a blank realized username accepts any
    username.
    '''
    conn = SeesawConnection
    conn.warrior = warrior

    # Events raised by the warrior itself.
    warrior.on_projects_loaded += conn.handle_projects_loaded
    warrior.on_project_refresh += conn.handle_project_refresh
    warrior.on_project_installing += conn.handle_project_installing
    warrior.on_project_installed += conn.handle_project_installed
    warrior.on_project_installation_failed += \
        conn.handle_project_installation_failed
    warrior.on_project_selected += conn.handle_project_selected
    warrior.on_broadcast_message_received += conn.handle_broadcast_message
    warrior.on_status += conn.handle_warrior_status

    # Events raised by the warrior's runner.
    runner = warrior.runner
    runner.on_pipeline_start_item += conn.handle_start_item
    runner.on_pipeline_finish_item += conn.handle_finish_item
    runner.on_status += conn.handle_runner_status

    http_username = http_username or warrior.http_username
    http_password = http_password or warrior.http_password

    for broadcast in (conn.broadcast_bandwidth, conn.broadcast_timestamp):
        ioloop.PeriodicCallback(broadcast, 1000).start()

    def auth_enabled():
        # Credentials are realized on every call, not captured once.
        return (realize(http_password) or "").strip() != ""

    def check_auth(r, username, password):
        if password != realize(http_password):
            return False
        return (realize(http_username) or "").strip() in ["", username]

    routes = [
        (r"/(.*\.(html|css|js|swf|png|ico))$",
         web.StaticFileHandler, {"path": PUBLIC_PATH}),
        ("/", IndexHandler),
        ("/api/(.+)$", ApiHandler, {"warrior": warrior}),
    ]
    router = SockJSRouter(conn)

    application = web.Application(
        router.apply_routes(routes),
        #   flash_policy_port = 843,
        #   flash_policy_file = os.path.join(PUBLIC_PATH, "flashpolicy.xml"),
        socket_io_address=bind_address,
        socket_io_port=port_number,

        # settings for AuthenticatedApplication
        auth_enabled=auth_enabled,
        check_auth=check_auth,
        auth_realm="ArchiveTeam Warrior",
        skip_auth=[])

    application.listen(port_number, bind_address)
Пример #6
0
def start_warrior_server(warrior, bind_address="localhost", port_number=8001,
                         http_username=None, http_password=None):
    '''Start the warrior web interface.

    Hooks the ``SeesawConnection`` handlers onto the events of ``warrior``
    and ``warrior.runner``, starts the two periodic broadcasts (interval
    1000) and makes the web application listen on ``port_number`` at
    ``bind_address``.

    A falsy ``http_username`` / ``http_password`` falls back to the value
    stored on ``warrior``. Authentication counts as enabled only while the
    realized password is non-blank; a blank realized username accepts any
    username.
    '''
    conn = SeesawConnection
    conn.warrior = warrior

    # Events raised by the warrior itself.
    warrior.on_projects_loaded += conn.handle_projects_loaded
    warrior.on_project_refresh += conn.handle_project_refresh
    warrior.on_project_installing += conn.handle_project_installing
    warrior.on_project_installed += conn.handle_project_installed
    warrior.on_project_installation_failed += \
        conn.handle_project_installation_failed
    warrior.on_project_selected += conn.handle_project_selected
    warrior.on_broadcast_message_received += conn.handle_broadcast_message
    warrior.on_status += conn.handle_warrior_status

    # Events raised by the warrior's runner.
    runner = warrior.runner
    runner.on_pipeline_start_item += conn.handle_start_item
    runner.on_pipeline_finish_item += conn.handle_finish_item
    runner.on_status += conn.handle_runner_status

    http_username = http_username or warrior.http_username
    http_password = http_password or warrior.http_password

    for broadcast in (conn.broadcast_bandwidth, conn.broadcast_timestamp):
        ioloop.PeriodicCallback(broadcast, 1000).start()

    def auth_enabled():
        # Credentials are realized on every call, not captured once.
        return (realize(http_password) or "").strip() != ""

    def check_auth(r, username, password):
        if password != realize(http_password):
            return False
        return (realize(http_username) or "").strip() in ["", username]

    routes = [
        (r"/(.*\.(html|css|js|swf|png|ico))$",
         web.StaticFileHandler, {"path": PUBLIC_PATH}),
        ("/", IndexHandler),
        ("/api/(.+)$", ApiHandler, {"warrior": warrior}),
    ]
    router = SockJSRouter(conn)

    application = web.Application(
        router.apply_routes(routes),
        #   flash_policy_port = 843,
        #   flash_policy_file = os.path.join(PUBLIC_PATH, "flashpolicy.xml"),
        socket_io_address=bind_address,
        socket_io_port=port_number,

        # settings for AuthenticatedApplication
        auth_enabled=auth_enabled,
        check_auth=check_auth,
        auth_realm="ArchiveTeam Warrior",
        skip_auth=[]
    )

    application.listen(port_number, bind_address)
Пример #7
0
	def process(self, item):
		"""Compute per-group byte totals and store the stats on ``item``.

		For every ``(group, files)`` pair in ``self.file_groups`` the sizes
		of the realized files are summed. The result is merged with
		``self.defaults``, the item name and, when ``self.id_function`` is
		set, the id it returns for ``item``. The realized dict is written
		to ``item["stats"]``.

		Raises whatever ``os.path.getsize`` raises for a missing file.
		"""
		# ``items()`` exists on both Python 2 and 3; ``iteritems()`` does not.
		total_bytes = {}
		for group, files in self.file_groups.items():
			total_bytes[group] = sum(
				os.path.getsize(f) for f in realize(files, item))

		stats = {}
		stats.update(self.defaults)
		stats["item"] = item["item_name"]
		stats["bytes"] = total_bytes

		if self.id_function:
			stats["id"] = self.id_function(item)

		item["stats"] = realize(stats, item)
Пример #8
0
    def process(self, item):
        """Compute per-group byte totals and store the stats on ``item``.

        For every ``(group, files)`` pair in ``self.file_groups`` the sizes
        of the realized files are summed. The result is merged with
        ``self.defaults``, the item name and, when ``self.id_function`` is
        set, the id it returns for ``item``. The realized dict is written
        to ``item["stats"]``.

        Raises whatever ``os.path.getsize`` raises for a missing file.
        """
        # ``items()`` exists on both Python 2 and 3; ``iteritems()`` does not.
        total_bytes = {}
        for group, files in self.file_groups.items():
            total_bytes[group] = sum(
                os.path.getsize(f) for f in realize(files, item))

        stats = {}
        stats.update(self.defaults)
        stats["item"] = item["item_name"]
        stats["bytes"] = total_bytes

        if self.id_function:
            stats["id"] = self.id_function(item)

        item["stats"] = realize(stats, item)
Пример #9
0
    def process(self, item):
        """Launch the subprocess for ``item`` and feed it its stdin data.

        Runs inside ``self.task_cwd()``. The realized ``self.args`` and
        ``self.env`` configure the ``AsyncPopen``; output and end events are
        routed to ``on_subprocess_stdout`` / ``on_subprocess_end``. After
        the process is started, ``self.stdin_data(item)`` is written to its
        stdin, which is then closed.
        """
        with self.task_cwd():
            popen_options = dict(
                args=realize(self.args, item),
                env=realize(self.env, item),
                stdin=subprocess.PIPE,
                close_fds=True)
            proc = AsyncPopen(**popen_options)

            # Handlers must be attached before the process is started.
            stdout_handler = functools.partial(
                self.on_subprocess_stdout, proc, item)
            end_handler = functools.partial(self.on_subprocess_end, item)
            proc.on_output += stdout_handler
            proc.on_end += end_handler

            proc.run()

            proc.stdin.write(self.stdin_data(item))
            proc.stdin.close()
Пример #10
0
	def process_body(self, body, item):
		"""Start the upload described by the JSON response ``body``.

		``body`` is parsed as JSON. Without an ``upload_target`` key a retry
		is scheduled. An ``rsync://`` target is handed to ``RsyncUpload``; an
		``http(s)://`` target is handed to ``CurlUpload``, which requires
		exactly one realized file. Any other target, or a wrong file count
		for Curl, fails the item. The chosen inner task is wired to the
		``_inner_task_*`` callbacks and enqueued with ``item``.
		"""
		response = json.loads(body)
		if "upload_target" not in response:
			item.log_output("Tracker did not provide an upload target.")
			self.schedule_retry(item)
			return

		target = response["upload_target"]
		files = realize(self.files, item)

		if re.match(r"^rsync://", target):
			item.log_output("Uploading with Rsync to %s" % target)
			inner_task = RsyncUpload(
				target,
				files,
				target_source_path=self.rsync_target_source_path,
				bwlimit=self.rsync_bwlimit,
				extra_args=self.rsync_extra_args,
				max_tries=1)
		elif re.match(r"^https?://", target):
			item.log_output("Uploading with Curl to %s" % target)
			if len(files) != 1:
				item.log_output("Curl expects to upload a single file.")
				self.fail_item(item)
				return
			inner_task = CurlUpload(
				target,
				files[0],
				self.curl_connect_timeout,
				self.curl_speed_limit,
				self.curl_speed_time,
				max_tries=1)
		else:
			item.log_output("Received invalid upload type.")
			self.fail_item(item)
			return

		inner_task.on_complete_item += self._inner_task_complete_item
		inner_task.on_fail_item += self._inner_task_fail_item
		inner_task.enqueue(item)
Пример #11
0
    def realize(self, item):
        """Assemble the ``WGET_AT`` command line for one mercurial item.

        The item name is used unchanged as the item value: it is stored in
        ``item['item_value']``, recorded in a ``mercurial-repository`` WARC
        header and, with ``?cmd=capabilities`` appended, used as the seed
        URL. A module-level ``bind_address`` global, when defined, adds
        ``--bind-address``.

        Returns the result of the module-level ``realize`` for the argument
        list and ``item``.
        """
        command = [
            WGET_AT,
            '-U', USER_AGENT,
            '-nv',
            '--no-cookies',
            '--content-on-error',
            '--lua-script', 'mercurial.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output',
            '-e', 'robots=off',
            '--rotate-dns',
            '--recursive', '--level=inf',
            '--no-parent',
            '--page-requisites',
            '--timeout', '30',
            '--tries', 'inf',
            '--span-hosts',
            '--waitretry', '30',
            '--warc-file',
            ItemInterpolation('%(item_dir)s/%(warc_file_base)s-main'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'mercurial-dld-script-version: ' + VERSION,
            '--warc-header',
            ItemInterpolation('mercurial-item: %(item_name)s'),
            '--warc-dedup-url-agnostic',
        ]

        repository = item['item_name']
        item['item_value'] = repository

        command += [
            '--warc-header', 'mercurial-repository: ' + str(repository),
            '--warc-header', 'warc-type: main',
            repository + '?cmd=capabilities',
        ]

        module_globals = globals()
        if 'bind_address' in module_globals:
            command += ['--bind-address', module_globals['bind_address']]
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                module_globals['bind_address']))
            print('')

        return realize(command, item)
Пример #12
0
  def process(self, item):
    """Launch the subprocess for ``item`` and feed it its stdin data.

    Runs inside ``self.task_cwd()``. The realized ``self.args`` and
    ``self.env`` configure the ``AsyncPopen``; output and end events are
    routed to ``on_subprocess_stdout`` / ``on_subprocess_end``. After the
    process is started, ``self.stdin_data(item)`` is written to its stdin,
    which is then closed.
    """
    with self.task_cwd():
      popen_options = dict(
          args=realize(self.args, item),
          env=realize(self.env, item),
          stdin=subprocess.PIPE,
          close_fds=True)
      proc = AsyncPopen(**popen_options)

      # Handlers must be attached before the process is started.
      stdout_handler = functools.partial(
          self.on_subprocess_stdout, proc, item)
      end_handler = functools.partial(self.on_subprocess_end, item)
      proc.on_output += stdout_handler
      proc.on_end += end_handler

      proc.run()

      proc.stdin.write(self.stdin_data(item))
      proc.stdin.close()
Пример #13
0
    def process_body(self, body, item):
        """Start the upload described by the JSON response ``body``.

        ``body`` is parsed as JSON. Without an ``upload_target`` key a retry
        is scheduled. An ``rsync://`` target is handed to ``RsyncUpload``; an
        ``http(s)://`` target is handed to ``CurlUpload``, which requires
        exactly one realized file. Any other target, or a wrong file count
        for Curl, fails the item. The chosen inner task is wired to the
        ``_inner_task_*`` callbacks and enqueued with ``item``.
        """
        response = json.loads(body)
        if "upload_target" not in response:
            item.log_output("Tracker did not provide an upload target.")
            self.schedule_retry(item)
            return

        target = response["upload_target"]
        files = realize(self.files, item)

        if re.match(r"^rsync://", target):
            item.log_output("Uploading with Rsync to %s" % target)
            inner_task = RsyncUpload(
                target,
                files,
                target_source_path=self.rsync_target_source_path,
                bwlimit=self.rsync_bwlimit,
                extra_args=self.rsync_extra_args,
                max_tries=1)
        elif re.match(r"^https?://", target):
            item.log_output("Uploading with Curl to %s" % target)
            if len(files) != 1:
                item.log_output("Curl expects to upload a single file.")
                self.fail_item(item)
                return
            inner_task = CurlUpload(
                target,
                files[0],
                self.curl_connect_timeout,
                self.curl_speed_limit,
                self.curl_speed_time,
                max_tries=1)
        else:
            item.log_output("Received invalid upload type.")
            self.fail_item(item)
            return

        inner_task.on_complete_item += self._inner_task_complete_item
        inner_task.on_fail_item += self._inner_task_fail_item
        inner_task.enqueue(item)
Пример #14
0
    def realize(self, item):
        """Assemble the ``WGET_AT`` command line for one domain item.

        The item name is the domain: it is passed to ``--domains``, recorded
        in a ``domain`` WARC header and seeded as both ``http://`` and
        ``https://`` URLs. When the name contains exactly one dot, the
        ``www.`` variants are seeded as well. A module-level
        ``bind_address`` global, when defined, adds ``--bind-address``.

        Returns the result of the module-level ``realize`` for the argument
        list and ``item``.
        """
        command = [
            WGET_AT,
            '-U', USER_AGENT,
            '-nv',
            '--content-on-error',
            '--lua-script', 'domains.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output',
            '-e', 'robots=off',
            '--rotate-dns',
            '--recursive', '--level=inf',
            '--no-parent',
            '--page-requisites',
            '--timeout', '30',
            '--tries', 'inf',
            '--span-hosts',
            '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'domains-dld-script-version: ' + VERSION,
            '--warc-header', ItemInterpolation('domains-item: %(item_name)s'),
            '--warc-dedup-url-agnostic',
        ]

        domain = item['item_name']

        command += ['--domains', domain]
        command += ['--warc-header', 'domain: ' + domain]

        seeds = [
            'http://{}/'.format(domain),
            'https://{}/'.format(domain),
        ]
        if domain.count('.') == 1:
            seeds += [
                'http://www.{}/'.format(domain),
                'https://www.{}/'.format(domain),
            ]
        command.extend(seeds)

        module_globals = globals()
        if 'bind_address' in module_globals:
            command += ['--bind-address', module_globals['bind_address']]
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                module_globals['bind_address']))
            print('')

        return realize(command, item)
Пример #15
0
 def realize(self, item):
     """Assemble the ``WGET_LUA`` command line for one furaffinity item.

     The item name has the form ``<type>:<value>`` and both parts are
     stored back on ``item``. For either supported type (``image`` or
     ``imagelogin``) one view URL is seeded for every two-character suffix
     drawn from the digits and ASCII lowercase letters, appended to the
     value. ``image`` crawls with ``--no-cookies``; ``imagelogin`` loads
     ``cookies.txt``. A module-level ``bind_address`` global, when defined,
     adds ``--bind-address``.

     Returns the result of the module-level ``realize`` for the argument
     list and ``item``.

     Raises AssertionError for a name without ``:`` or an unsupported type
     (``Exception('Unknown item')`` when assertions are disabled).
     """
     wget_args = [
         WGET_LUA,
         "-U", USER_AGENT,
         "-nv",
         "--lua-script", "furaffinity.lua",
         "-o", ItemInterpolation("%(item_dir)s/wget.log"),
         "--no-check-certificate",
         "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
         "--truncate-output",
         "-e", "robots=off",
         "--rotate-dns",
         "--recursive", "--level=inf",
         "--no-parent",
         "--page-requisites",
         "--timeout", "30",
         "--tries", "inf",
         "--domains", "furaffinity.net",
         "--span-hosts",
         "--waitretry", "30",
         "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
         "--warc-header", "operator: Archive Team",
         "--warc-header", "furaffinity-dld-script-version: " + VERSION,
         "--warc-header", ItemInterpolation("furaffinity-user: %(item_name)s"),
     ]

     item_name = item['item_name']
     assert ':' in item_name
     item_type, item_value = item_name.split(':', 1)

     item['item_type'] = item_type
     item['item_value'] = item_value

     assert item_type in ('image', 'imagelogin')

     # The two item types differ only in how cookies are handled.
     if item_type == 'image':
         cookie_args = ["--no-cookies"]
     elif item_type == 'imagelogin':
         cookie_args = ["--load-cookies", "cookies.txt"]
     else:
         raise Exception('Unknown item')

     # ``ascii_lowercase`` exists on Python 2 and 3 and is not locale
     # dependent, unlike the Python 2 only ``string.lowercase``.
     suffixes = string.digits + string.ascii_lowercase
     wget_args.extend(
         'http://www.furaffinity.net/view/{0}{1}{2}/'.format(item_value, a, b)
         for a in suffixes
         for b in suffixes)
     wget_args.extend(cookie_args)

     if 'bind_address' in globals():
         wget_args.extend(['--bind-address', globals()['bind_address']])
         print('')
         print('*** Wget will bind address at {0} ***'.format(
             globals()['bind_address']))
         print('')

     return realize(wget_args, item)
Пример #16
0
    def realize(self, item):
        """Assemble the ``WGET_LUA`` command line for one tumblr-static item.

        The item name has the form ``<type>:<value>`` and both parts are
        stored back on ``item``. Only the ``list`` type is supported: the
        URL list named by the value is downloaded, and every usable URL is
        added together with a ``tumblr-static-url`` WARC header. A line
        containing ``%20`` is split on it, and the line with every ``%20``
        removed is tried as well. Empty entries, entries that do not look
        like ``http(s)://host/...`` and entries containing
        ``www.tumblr.com`` are skipped. A module-level ``bind_address``
        global, when defined, adds ``--bind-address``.

        Returns the result of the module-level ``realize`` for the argument
        list and ``item``.
        """
        command = [
            WGET_LUA,
            '-U', USER_AGENT,
            '-nv',
            '--no-cookies',
            '--lua-script', 'tumblr-static.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output',
            '-e', 'robots=off',
            '--rotate-dns',
            '--no-parent',
            '--page-requisites',
            '--timeout', '30',
            '--tries', 'inf',
            '--span-hosts',
            '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'tumblr-static-dld-script-version: ' + VERSION,
            '--warc-header', ItemInterpolation('tumblr-static-item: %(item_name)s'),
        ]

        full_name = item['item_name']
        assert ':' in full_name
        item_type, item_value = full_name.split(':', 1)

        item['item_type'] = item_type
        item['item_value'] = item_value

        if item_type != 'list':
            raise Exception('Unknown item')

        listing = requests.get(
            'http://grafana.fvz.io/.well-known/at/{}'.format(item_value))
        if listing.status_code != 200:
            # NOTE(review): the message says "github" although the list is
            # fetched from grafana.fvz.io; text kept unchanged.
            raise Exception('Could not get URLs list from github.')

        for line in listing.text.splitlines():
            line = line.strip()
            if '%20' in line:
                candidates = line.split('%20')
                candidates.append(line.replace('%20', ''))
            else:
                candidates = [line]

            for candidate in candidates:
                usable = (
                    len(candidate) != 0
                    and re.search(r'^https?://[^/]+/', candidate)
                    and 'www.tumblr.com' not in candidate
                )
                if not usable:
                    continue
                command += [
                    '--warc-header',
                    'tumblr-static-url: {}'.format(candidate),
                    candidate,
                ]

        module_globals = globals()
        if 'bind_address' in module_globals:
            command += ['--bind-address', module_globals['bind_address']]
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                module_globals['bind_address']))
            print('')

        return realize(command, item)
Пример #17
0
    def realize(self, item):
        """Assemble the ``yahoo.py`` command line for one item.

        The item name has the form ``<type>:<value>`` and both parts are
        stored back on ``item``. For ``group`` the value is passed as the
        only extra argument. For ``group_cookie`` the cookies for the value
        are fetched first and passed with ``-cy`` / ``-ct`` ahead of the
        value.

        Returns the result of the module-level ``realize`` for the argument
        list and ``item``.

        Raises ValueError when the cookie endpoint answers with a status
        other than 200, and ``Exception('Unknown item')`` for any other
        item type.
        """
        yga_args = [PYTHON, '../../../yahoo.py', '-a', '-t', '-w']

        item_name = item['item_name']
        assert ':' in item_name
        item_type, item_value = item_name.split(':', 1)

        item['item_type'] = item_type
        item['item_value'] = item_value

        if item_type == 'group':
            yga_args.append(item_value)
        elif item_type == 'group_cookie':
            # Create the client only where it is needed, and always close
            # it so that its resources are released.
            http_client = httpclient.HTTPClient()
            try:
                cookie_json = http_client.fetch(
                    'https://df58.host.cs.st-andrews.ac.uk/yahoogroups/cookieget/'
                    + item_value + '/',
                    method='GET')
            finally:
                http_client.close()

            if cookie_json.code != 200:
                raise ValueError('Got bad status code {}.'.format(
                    cookie_json.code))

            cookies = json.loads(cookie_json.body.decode('utf-8', 'ignore'))
            yga_args.extend(['-cy', "%s" % cookies["cookie_Y"]])
            yga_args.extend(['-ct', "%s" % cookies["cookie_T"]])
            yga_args.append(item_value)
        else:
            raise Exception('Unknown item')

        return realize(yga_args, item)
Пример #18
0
    def realize(self, item):
        """Assemble the ``WGET_LUA`` command line for one rutracker item.

        The item name has the form ``<type>:<value>`` and both parts are
        stored back on ``item``. For ``thread`` and ``forum`` items each
        decimal digit is appended to the value in turn, and the page URL
        plus three API URLs are seeded for the resulting id. A module-level
        ``bind_address`` global, when defined, adds ``--bind-address``.

        Returns the result of the module-level ``realize`` for the argument
        list and ``item``.
        """
        command = [
            WGET_LUA,
            "-U", USER_AGENT,
            "-nv",
            "--lua-script", "rutracker.lua",
            "-o", ItemInterpolation("%(item_dir)s/wget.log"),
            "--no-check-certificate",
            "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
            "--truncate-output",
            "-e", "robots=off",
            "--rotate-dns",
            "--recursive", "--level=inf",
            "--no-parent",
            "--page-requisites",
            "--timeout", "30",
            "--tries", "inf",
            "--domains", "rutracker.org",
            "--span-hosts",
            "--waitretry", "30",
            "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
            "--warc-header", "operator: Archive Team",
            "--warc-header", "rutracker-dld-script-version: " + VERSION,
            "--warc-header", ItemInterpolation("rutracker-user: %(item_name)s"),
        ]

        full_name = item['item_name']
        assert ':' in full_name
        item_type, item_value = full_name.split(':', 1)

        item['item_type'] = item_type
        item['item_value'] = item_value

        # URL templates per item type; {0} is the value, {1} the digit.
        templates = {
            'thread': (
                'http://rutracker.org/forum/viewtopic.php?t={0}{1}',
                'http://api.rutracker.org/v1/get_peer_stats?by=topic_id&val={0}{1}',
                'http://api.rutracker.org/v1/get_tor_hash?by=topic_id&val={0}{1}',
                'http://api.rutracker.org/v1/get_tor_topic_data?by=topic_id&val={0}{1}',
            ),
            'forum': (
                'http://rutracker.org/forum/viewforum.php?f={0}{1}',
                'http://api.rutracker.org/v1/get_forum_name?by=forum_id&val={0}{1}',
                'http://api.rutracker.org/v1/get_forum_data?by=forum_id&val={0}{1}',
                'http://api.rutracker.org/v1/static/pvc/f/{0}{1}',
            ),
        }

        assert item_type in ('thread', 'forum')

        if item_type not in templates:
            raise Exception('Unknown item')

        for digit in string.digits:
            for template in templates[item_type]:
                command.append(template.format(item_value, digit))

        module_globals = globals()
        if 'bind_address' in module_globals:
            command += ['--bind-address', module_globals['bind_address']]
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                module_globals['bind_address']))
            print('')

        return realize(command, item)
Пример #19
0
    def realize(self, item):
        """Assemble the ``WGET_AT`` command line for a batch of parler items.

        Writes the dictionary returned by ``ZstdDict.get_dict()`` to
        ``<item_dir>/zstdict`` and records its id and ``TRACKER_ID`` on
        ``item``. ``item['item_name']`` holds NUL-separated
        ``<type>:<value>`` names. Each name gets an
        ``x-wget-at-project-item-name`` header and an ``item-name://``
        marker, followed by the URL for its type (``post``, ``profile`` or
        ``url``). The newline-separated form of the names is stored in
        ``item['item_name_newline']``. A module-level ``bind_address``
        global, when defined, adds ``--bind-address``.

        Returns the result of the module-level ``realize`` for the argument
        list and ``item``.

        Raises ValueError for an unsupported item type.
        """
        command = [
            WGET_AT,
            '-U', USER_AGENT,
            '-nv',
            '--content-on-error',
            '--lua-script', 'parler.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output',
            '-e', 'robots=off',
            '--rotate-dns',
            '--recursive', '--level=inf',
            '--no-parent',
            '--page-requisites',
            '--timeout', '30',
            '--tries', 'inf',
            '--domains', 'parler.com',
            '--span-hosts',
            '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'x-wget-at-project-version: ' + VERSION,
            '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID,
            '--warc-dedup-url-agnostic',
            '--warc-compression-use-zstd',
            '--warc-zstd-dict-no-include',
        ]

        zstd_info = ZstdDict.get_dict()
        dict_path = os.path.join(item['item_dir'], 'zstdict')
        with open(dict_path, 'wb') as dict_file:
            dict_file.write(zstd_info['dict'])
        item['dict_id'] = zstd_info['id']
        item['dict_project'] = TRACKER_ID
        command += [
            '--warc-zstd-dict',
            ItemInterpolation('%(item_dir)s/zstdict'),
        ]

        page_templates = {
            'post': 'https://parler.com/post/{}',
            'profile': 'https://parler.com/profile/{}',
        }

        for name in item['item_name'].split('\0'):
            command += [
                '--warc-header', 'x-wget-at-project-item-name: ' + name,
                'item-name://' + name,
            ]
            kind, value = name.split(':', 1)
            if kind in page_templates:
                # NOTE(review): profiles are also recorded under the
                # "parler-post" header; kept as found, confirm intent.
                command += [
                    '--warc-header', 'parler-post: {}'.format(value),
                    page_templates[kind].format(value),
                ]
            elif kind == 'url':
                command.append(value)
            else:
                raise ValueError('item_type not supported.')

        item['item_name_newline'] = item['item_name'].replace('\0', '\n')

        module_globals = globals()
        if 'bind_address' in module_globals:
            command += ['--bind-address', module_globals['bind_address']]
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                module_globals['bind_address']))
            print('')

        return realize(command, item)
Пример #20
0
    def realize(self, item):
        """Assemble the ``WGET_LUA`` command line for one Firefox add-on item.

        The item name has the form ``<type>:<value>`` and both parts are
        stored back on ``item``. Only the ``ffaddon`` type is supported:
        the value is recorded in a ``firefox-addon-identifier`` WARC header
        and seeded as the en-US add-on page plus the v3 and v4 API URLs. A
        module-level ``bind_address`` global, when defined, adds
        ``--bind-address``.

        Returns the result of the module-level ``realize`` for the argument
        list and ``item``.
        """
        command = [
            WGET_LUA,
            '-U', USER_AGENT,
            '-nv',
            '--no-cookies',
            '--lua-script', 'firefox-addons.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output',
            '-e', 'robots=off',
            '--rotate-dns',
            '--recursive', '--level=inf',
            '--no-parent',
            '--page-requisites',
            '--timeout', '30',
            '--tries', 'inf',
            '--domains', 'mozilla.org',
            '--span-hosts',
            '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'firefox-addons-dld-script-version: ' + VERSION,
            '--warc-header', ItemInterpolation('firefox-addons-item: %(item_name)s'),
        ]

        full_name = item['item_name']
        assert ':' in full_name
        item_type, item_value = full_name.split(':', 1)

        item['item_type'] = item_type
        item['item_value'] = item_value

        if item_type != 'ffaddon':
            raise Exception('Unknown item')

        command += [
            '--warc-header', 'firefox-addon-identifier: {}'.format(item_value),
            'https://addons.mozilla.org/en-US/firefox/addon/{}/'.format(item_value),
            'https://services.addons.mozilla.org/api/v3/addons/addon/{}/'.format(item_value),
            'https://services.addons.mozilla.org/api/v4/addons/addon/{}/'.format(item_value),
        ]
        # Further seed URLs, currently disabled:
        #wget_args.append('https://addons.mozilla.org/en/firefox/addon/{}/'.format(item_value))
        #wget_args.append('https://addons.mozilla.org/firefox/addon/{}/?src=homepage-collection-featured'.format(item_value))
        #wget_args.append('https://addons.mozilla.org/firefox/addon/{}/?src=featured'.format(item_value))
        #wget_args.append('https://addons.mozilla.org/firefox/addon/{}/?src=hp-dl-promo'.format(item_value))
        #wget_args.append('https://addons.mozilla.org/firefox/addon/{}/?src=collection'.format(item_value))
        #wget_args.append('https://addons.mozilla.org/firefox/addon/{}/?src=hotness'.format(item_value))
        #wget_args.append('https://addons.mozilla.org/firefox/addon/{}/?src=rating'.format(item_value))
        #wget_args.append('https://addons.mozilla.org/firefox/addon/{}/?src=recommended_fallback'.format(item_value))
        #wget_args.append('https://addons.mozilla.org/firefox/addon/{}/'.format(item_value))

        module_globals = globals()
        if 'bind_address' in module_globals:
            command += ['--bind-address', module_globals['bind_address']]
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                module_globals['bind_address']))
            print('')

        return realize(command, item)
    def realize(self, item):
        """Assemble the ``WPULL_EXE`` command line for one newsgrabber item.

        The item name has the form ``<type>:<value>`` and both parts are
        stored back on ``item``. A value containing ``-videos`` enables the
        youtube-dl options. The URL list for the value is downloaded from
        master.newsbuddy.net; on a 200 response every stripped line is
        added as an argument, otherwise no URLs are added. A module-level
        ``bind_address`` global, when defined, adds ``--bind-address``.

        Returns the result of the module-level ``realize`` for the argument
        list and ``item``.
        """
        item_type, item_value = item['item_name'].split(':', 1)

        item['item_type'] = item_type
        item['item_value'] = item_value

        command = [
            WPULL_EXE,
            '-nv',
            '-U', 'ArchiveTeam; Googlebot/2.1',
            '--no-check-certificate',
            '--no-robots',
            '--dns-timeout', '20',
            '--connect-timeout', '20',
            '--read-timeout', '900',
            '--session-timeout', '1800',
            '--tries', '5',
            '--waitretry', '5',
            '--max-redirect', '20',
            '--output-file', ItemInterpolation("%(item_dir)s/wpull.log"),
            '--database', ItemInterpolation("%(item_dir)s/wpull.db"),
            '--delete-after',
            '--page-requisites',
            '--no-parent',
            '--concurrent', '5',
            '--warc-file', ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
            '--level', '0',
            '--page-requisites-level', '5',
            '--span-hosts-allow', 'page-requisites',
            '--warc-header', 'pipeline-py-sha256: ' + PIPELINE_SHA256,
            '--warc-header', 'warrior-install-sh-sha256: ' + WARRIOR_INSTALL_SHA256,
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'newsgrabber-dld-script-version: ' + VERSION,
            '--warc-header', ItemInterpolation('ftp-item: %(item_name)s'),
            '--reject-regex', r'^https?://launcher\.spot\.im/spot/(www\.spot\.im/launcher/|launcher\.spot\.im/|modules/launcher/){3,}bundle\.js$'
        ]

        if '-videos' in item_value:
            command += ['--youtube-dl', '--youtube-dl-exe', YOUTUBE_DL_EXE]

        # The list URL itself is not crawled, only the URLs it contains.
        listing = requests.get('http://master.newsbuddy.net/' + item_value)
        if listing.status_code == 200:
            command.extend(
                line.strip() for line in listing.text.splitlines())

        module_globals = globals()
        if 'bind_address' in module_globals:
            command += ['--bind-address', module_globals['bind_address']]
            print('')
            print('*** Wpull will bind address at {0} ***'.format(
                module_globals['bind_address']))
            print('')

        return realize(command, item)
Пример #22
0
 def realize(self, item):
     """Assemble the ``WGET_LUA`` command line for one gamefront item.

     The item name has the form ``<type>:<value>`` and both parts are
     stored back on ``item``. A ``file`` item seeds one file URL per
     decimal digit appended to the value. A ``singlefile`` item seeds the
     single file URL and then, in a ``requests`` session, loads the file
     page, posts the ``plopMe`` token found in it (if any) to the request
     service and finally fetches the thank-you page. A module-level
     ``bind_address`` global, when defined, adds ``--bind-address``.

     Returns the result of the module-level ``realize`` for the argument
     list and ``item``.
     """
     command = [
         WGET_LUA,
         "-U", USER_AGENT,
         "-nv",
         "--lua-script", "gamefront.lua",
         "-o", ItemInterpolation("%(item_dir)s/wget.log"),
         "--no-check-certificate",
         "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
         "--truncate-output",
         "-e", "robots=off",
         "--rotate-dns",
         "--recursive", "--level=inf",
         "--no-parent",
         "--page-requisites",
         "--timeout", "30",
         "--tries", "inf",
         "--domains", "gamefront.com",
         "--span-hosts",
         "--waitretry", "30",
         "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
         "--warc-header", "operator: Archive Team",
         "--warc-header", "gamefront-dld-script-version: " + VERSION,
         "--warc-header", ItemInterpolation("gamefront-user: %(item_name)s"),
     ]

     full_name = item['item_name']
     assert ':' in full_name
     item_type, item_value = full_name.split(':', 1)

     item['item_type'] = item_type
     item['item_value'] = item_value

     assert item_type in ('file', 'singlefile')

     if item_type == 'file':
         command.extend(
             'http://www.gamefront.com/files/{0}{1}'.format(item_value, digit)
             for digit in string.digits)
     elif item_type == 'singlefile':
         command.append(
             'http://www.gamefront.com/files/{0}'.format(item_value))

         file_page = 'http://www.gamefront.com/files/' + item_value
         session = requests.Session()
         page_html = session.get(file_page).text

         if re.search(r"plopMe\('[0-9]+',\s+'[^']+'\)", page_html):
             token = re.search(
                 r"plopMe\('[0-9]+',\s+'([^']+)'\)", page_html).group(1)
             print('Received token ' + token + '.')
             reply = session.post(
                 'http://www.gamefront.com/files/service/request',
                 data={'token': token},
                 headers={'referer': file_page})
             print('Received ' + reply.text + '.')

         session.get(
             'http://www.gamefront.com/files/service/thankyou?id=' + item_value,
             headers={'referer': file_page})
     else:
         raise Exception('Unknown item')

     module_globals = globals()
     if 'bind_address' in module_globals:
         command += ['--bind-address', module_globals['bind_address']]
         print('')
         print('*** Wget will bind address at {0} ***'.format(
             module_globals['bind_address']))
         print('')

     return realize(command, item)
 def realize(self, item):
     """Return the realized command that runs deduplicate.py on the item's WARC."""
     warc_path = ItemInterpolation("%(data_dir)s/%(item_name)s.warc.gz")
     command = [PYTHON35_EXE, "-u", "deduplicate.py", warc_path]
     return realize(command, item)
Пример #24
0
 def enqueue(self, item):
     """Start this task for ``item`` and hand it over to ``self.process``.

     Marks the item as started, writes a "Starting ..." line to the item's
     log output, initialises the per-item state keys (``tries`` and the
     ``WgetDownloadMany.*`` entries) and then calls ``self.process(item)``.
     """
     self.start_item(item)
     item.log_output("Starting %s for %s\n" % (self, item.description()))
     # Attempt counter starts at one for a freshly enqueued item.
     item["tries"] = 1
     # Resolve the URL list against this item (presumably expands any
     # item-dependent values in self.unrealized_urls -- confirm in realize()).
     item['WgetDownloadMany.urls'] = realize(self.unrealized_urls, item)
     # Presumably the position within the URL list and the URL in flight;
     # both start out "nothing processed yet" -- verify against process().
     item['WgetDownloadMany.urls_index'] = 0
     item['WgetDownloadMany.current_url'] = None
     self.process(item)
Пример #25
0
    def realize(self, item):
        """Build the wget-at argument list for one pastebin item.

        Besides assembling the arguments, this writes the zstd dictionary
        returned by ``ZstdDict.get_dict()`` to ``<item_dir>/zstdict`` and
        stores ``dict_id``, ``dict_project`` and ``item_value`` on the item.
        Item names longer than eight characters are treated as base-36
        numbers (optionally prefixed ``b36.``) and converted with
        ``self.int_to_str``.
        """
        wget_args = [WGET_AT, '-U', USER_AGENT, '-nv', '--no-cookies',
                     '--content-on-error']
        wget_args += ['--lua-script', 'pastebin.lua']
        wget_args += ['-o', ItemInterpolation('%(item_dir)s/wget.log')]
        wget_args += ['--no-check-certificate']
        wget_args += ['--output-document',
                      ItemInterpolation('%(item_dir)s/wget.tmp')]
        wget_args += ['--truncate-output', '-e', 'robots=off', '--rotate-dns']
        wget_args += ['--recursive', '--level=inf', '--no-parent',
                      '--page-requisites']
        wget_args += ['--timeout', '30', '--tries', 'inf']
        wget_args += ['--domains', 'pastebin.com', '--span-hosts']
        wget_args += ['--waitretry', '30']
        wget_args += ['--warc-file',
                      ItemInterpolation('%(item_dir)s/%(warc_file_base)s')]
        wget_args += ['--warc-header', 'operator: Archive Team']
        wget_args += ['--warc-header', 'pastebin-dld-script-version: ' + VERSION]
        wget_args += ['--warc-header',
                      ItemInterpolation('pastebin-item: %(item_name)s')]
        wget_args += ['--warc-dedup-url-agnostic',
                      '--warc-compression-use-zstd',
                      '--warc-zstd-dict-no-include']

        # Store the compression dictionary next to the item and point wget at it.
        zstd_info = ZstdDict.get_dict()
        dict_path = os.path.join(item['item_dir'], 'zstdict')
        with open(dict_path, 'wb') as dict_file:
            dict_file.write(zstd_info['dict'])
        item['dict_id'] = zstd_info['id']
        item['dict_project'] = TRACKER_ID
        wget_args += ['--warc-zstd-dict',
                      ItemInterpolation('%(item_dir)s/zstdict')]

        item_name = item['item_name']
        if len(item_name) > 8:
            # Long names carry a base-36 encoded number.
            number = int(item_name.replace('b36.', ''), 36)
            item_value = self.int_to_str(number)
        else:
            item_value = item_name
        item['item_value'] = item_value

        wget_args += [
            '--warc-header', 'pastebin-paste: ' + str(item_value),
            'https://pastebin.com/{}'.format(item_value),
        ]

        module_scope = globals()
        if 'bind_address' in module_scope:
            address = module_scope['bind_address']
            wget_args += ['--bind-address', address]
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(wget_args, item)
Пример #26
0
    def realize(self, item):
        """Build the wget-lua argument list for one musicbrainz item.

        The item name has the form ``<sort>:<archive item>:<file>``. The
        named file is downloaded from archive.org and every non-blank line
        in it is appended to the arguments as a URL to fetch.

        Raises:
            Exception: if archive.org does not answer the list request
                with status 200.
        """
        wget_args = [
            WGET_LUA,
            "-U",
            USER_AGENT,
            "-nv",
            "--lua-script",
            "musicbrainz.lua",
            "-o",
            ItemInterpolation("%(item_dir)s/wget.log"),
            "--no-check-certificate",
            "--output-document",
            ItemInterpolation("%(item_dir)s/wget.tmp"),
            "--truncate-output",
            "-e",
            "robots=off",
            "--rotate-dns",
            "--no-parent",
            "--page-requisites",
            "--timeout",
            "30",
            "--tries",
            "inf",
            "--span-hosts",
            "--waitretry",
            "30",
            "--warc-file",
            ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
            "--warc-header",
            "operator: Archive Team",
            "--warc-header",
            "musicbrainz-dld-script-version: " + VERSION,
            "--warc-header",
            ItemInterpolation("musicbrainz-user: %(item_name)s"),
        ]

        item_name = item["item_name"]
        assert ":" in item_name
        # The leading "sort" component is not used here.
        _item_sort, item_item, item_file = item_name.split(":", 2)

        item["item_item"] = item_item

        # Build the list URL once so the error message reports exactly the
        # URL that was requested.
        list_url = "http://archive.org/download/{0}/{1}".format(item_item, item_file)
        item_list = requests.get(list_url)
        if item_list.status_code != 200:
            raise Exception(
                "You received status code %d with URL %s"
                % (item_list.status_code, list_url)
            )
        for line in item_list.text.splitlines():
            url = line.strip()
            # Blank lines would otherwise reach wget as empty URL arguments.
            if not url:
                continue
            wget_args.append("{0}".format(url))

        if "bind_address" in globals():
            wget_args.extend(["--bind-address", globals()["bind_address"]])
            print("")
            print("*** Wget will bind address at {0} ***".format(globals()["bind_address"]))
            print("")

        return realize(wget_args, item)
Пример #27
0
    def realize(self, item):
        """Build the wget-lua argument list for one twitpic-api item.

        The item name has the form ``<type>:<value>``. Only ``imageapi``
        items are implemented: for each suffix in ``0-9a-z`` the media and
        first comments-page API URLs of ``<value><suffix>`` are queued.

        Raises:
            AssertionError: if the name has no colon or the type is unknown.
            Exception: for a recognised type other than ``imageapi``.
        """
        wget_args = [
            WGET_LUA,
            "-U", random.choice(USER_AGENTS),
            "-nv",
            "--lua-script", "twitpic-api.lua",
            "-o", ItemInterpolation("%(item_dir)s/wget.log"),
            "--no-check-certificate",
            "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
            "--truncate-output",
            "-e", "robots=off",
            "--no-cookies",
            "--rotate-dns",
            "--recursive", "--level=inf",
            "--no-parent",
            "--page-requisites",
            "--timeout", "30",
            "--tries", "inf",
            "--span-hosts",
            "--waitretry", "30",
            "--domains", "twitpic.com,cloudfront.net,twimg.com,amazonaws.com",
            "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
            "--warc-header", "operator: Archive Team",
            "--warc-header", "twitpic-api-dld-script-version: " + VERSION,
            "--warc-header", ItemInterpolation("twitpic-api-user: %(item_name)s"),
            "--header", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "--header", "DNT: 1",
            "--header", random.choice(ACCEPT_LANGUAGE_HEADERS),
        ]

        item_name = item['item_name']
        assert ':' in item_name
        item_type, item_value = item_name.split(':', 1)

        item['item_type'] = item_type
        item['item_value'] = item_value

        # 'imageapi' must be accepted here: it is the only type handled
        # below, so without it every item fails. The other types stay
        # listed so they still end in the 'Unknown item' exception.
        assert item_type in ('imageapi', 'image', 'user', 'tag', 'event')

        if item_type == 'imageapi':
            # ascii_lowercase exists on Python 2 and 3 and, unlike
            # string.lowercase, does not depend on the locale.
            suffixes = string.digits + string.ascii_lowercase

            for s in suffixes:
                wget_args.append(
                    'http://api.twitpic.com/2/media/show.json?id={0}{1}'.format(item_value, s))
                wget_args.append(
                    'http://api.twitpic.com/2/comments/show.json?media_id={0}{1}&page=1'.format(item_value, s))
        else:
            raise Exception('Unknown item')

        if 'bind_address' in globals():
            wget_args.extend(['--bind-address', globals()['bind_address']])
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)
Пример #28
0
    def realize(self, item):
        """Build the wget-lua argument list for one portalgraphics item.

        The item name has the form ``<type>:<value>`` with type ``image_id``
        or ``user_id``; both halves are stored on the item and the matching
        portalgraphics.net URLs are queued.

        Raises:
            AssertionError: if the name has no colon or the type is unknown.
        """
        wget_args = [
            WGET_LUA,
            "-U", USER_AGENT,
            "-nv",
            "--no-cookies",
            "--lua-script", "portalgraphics.lua",
            "-o", ItemInterpolation("%(item_dir)s/wget.log"),
            "--no-check-certificate",
            "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
            "--truncate-output",
            "-e", "robots=off",
            "--rotate-dns",
            "--recursive", "--level=inf",
            "--no-parent",
            "--page-requisites",
            "--timeout", "30",
            "--tries", "inf",
            "--domains", "portalgraphics.net",
            "--span-hosts",
            "--waitretry", "30",
            "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
            "--warc-header", "operator: Archive Team",
            "--warc-header", "portalgraphics-dld-script-version: " + VERSION,
            "--warc-header", ItemInterpolation("portalgraphics-user: %(item_name)s"),
        ]

        item_name = item['item_name']
        assert ':' in item_name
        # Split on the first colon only: with maxsplit=2 a value containing
        # a colon produced three parts and broke the two-name unpacking.
        item_type, item_value = item_name.split(':', 1)

        item['item_type'] = item_type
        item['item_value'] = item_value

        assert item_type in ('image_id', 'user_id')

        if item_type == 'image_id':
            image_url_templates = (
                'http://www.portalgraphics.net/pg/illust/?image_id={0}',
                'http://www.portalgraphics.net/pg/illust/?image_id={0}&lang=ja',
                'http://www.portalgraphics.net/pg/illust/?image_id={0}&lang=en',
                'http://www.portalgraphics.net/pg/movie/pg_player/res_movie_data.php?mid={0}',
                'http://www.portalgraphics.net/pg/movie/pg_player/res_movie_data.php?mid={0}&lang=ja',
                'http://www.portalgraphics.net/pg/movie/pg_player/res_movie_data.php?mid={0}&lang=en',
                'http://www.portalgraphics.net/pg/movie/address.php?image%5Fid={0}',
                'http://www.portalgraphics.net/pg/movie/address.php?image_id={0}',
            )
            for template in image_url_templates:
                wget_args.append(template.format(item_value))
        elif item_type == 'user_id':
            wget_args.append('http://portalgraphics.net/pg/profile/?user_id={0}'.format(item_value))
        else:
            raise Exception('Unknown item')

        if 'bind_address' in globals():
            wget_args.extend(['--bind-address', globals()['bind_address']])
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)
Пример #29
0
    def realize(self, item):
        """Build the wget-lua argument list for one sketch item.

        The item name has the form ``<type>:<value>``:

        * ``sketches`` -- a list of sketch ids is fetched from the list
          server and each id is queued with its own WARC header.
        * ``user`` -- the artist API URL of that user id is queued.

        Raises:
            AssertionError: if the name has no colon.
            Exception: if the item type is neither ``sketches`` nor ``user``.
        """
        wget_args = [
            WGET_LUA, '-U', USER_AGENT, '-nv', '--no-cookies', '--lua-script',
            'sketch.lua', '-o',
            ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate', '--output-document',
            ItemInterpolation('%(item_dir)s/wget.tmp'), '--truncate-output',
            '-e', 'robots=off', '--rotate-dns', '--recursive', '--level=inf',
            '--no-parent', '--page-requisites', '--timeout', '30', '--tries',
            'inf', '--domains', 'sonymobile.com', '--span-hosts',
            '--waitretry', '30', '--warc-file',
            ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team', '--warc-header',
            'sketch-dld-script-version: ' + VERSION, '--warc-header',
            ItemInterpolation('sketch-item: %(item_name)s'), '--header',
            'Accept-Encoding: gzip', '--compression', 'gzip'
        ]

        item_name = item['item_name']
        assert ':' in item_name
        item_type, item_value = item_name.split(':', 1)

        item['item_type'] = item_type
        item['item_value'] = item_value

        if item_type == 'sketches':
            # The client is only needed for this branch; close it even when
            # the fetch raises so it is never leaked.
            http_client = httpclient.HTTPClient()
            try:
                r = http_client.fetch(
                    'http://103.230.141.2/sketch/' + item_value, method='GET')
            finally:
                http_client.close()
            for s in r.body.decode('utf-8', 'ignore').splitlines():
                s = s.strip()
                if not s:
                    continue
                wget_args.extend(
                    ['--warc-header', 'sketch-sketch-id: {}'.format(s)])
                wget_args.append(
                    'https://sketch.sonymobile.com/api/1/sharedsketch/{}'.
                    format(s))
        elif item_type == 'user':
            # The placeholder was missing, so the header never carried the id.
            wget_args.extend(
                ['--warc-header', 'sketch-user-id: {}'.format(item_value)])
            wget_args.append(
                'https://sketch.sonymobile.com/api/1/artist/{}'.format(
                    item_value))
        else:
            raise Exception('Unknown item')

        if 'bind_address' in globals():
            wget_args.extend(['--bind-address', globals()['bind_address']])
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)
Пример #30
0
    def realize(self, item):
        """Build the wget-lua argument list for one panoramio item.

        The item name has the form ``<type>:<start>-<stop>`` with type
        ``photos`` or ``users``. Every id in the inclusive range is queued
        together with a WARC header naming it.
        """
        def id_range(spec):
            # "<start>-<stop>" -> inclusive range of integer ids.
            first, last = spec.split('-')
            return range(int(first), int(last) + 1)

        wget_args = [WGET_LUA, "-U", USER_AGENT, "-nv", "--no-cookies"]
        wget_args += ["--lua-script", "panoramio.lua"]
        wget_args += ["-o", ItemInterpolation("%(item_dir)s/wget.log")]
        wget_args += ["--no-check-certificate"]
        wget_args += ["--output-document",
                      ItemInterpolation("%(item_dir)s/wget.tmp")]
        wget_args += ["--truncate-output", "-e", "robots=off", "--rotate-dns"]
        wget_args += ["--recursive", "--level=inf", "--no-parent",
                      "--page-requisites"]
        wget_args += ["--timeout", "30", "--tries", "inf"]
        wget_args += ["--domains", "panoramio.com", "--span-hosts"]
        wget_args += ["--waitretry", "30"]
        wget_args += ["--warc-file",
                      ItemInterpolation("%(item_dir)s/%(warc_file_base)s")]
        wget_args += ["--warc-header", "operator: Archive Team"]
        wget_args += ["--warc-header", "panoramio-dld-script-version: " + VERSION]
        wget_args += ["--warc-header",
                      ItemInterpolation("panoramio-item: %(item_name)s")]

        full_name = item['item_name']
        assert ':' in full_name
        kind, value = full_name.split(':', 1)
        item['item_type'] = kind
        item['item_value'] = value
        assert kind in ('photos', 'users')

        if kind == 'photos':
            for number in id_range(value):
                wget_args += [
                    '--warc-header', 'panoramio-photo: {i}'.format(i=number),
                    'http://www.panoramio.com/photo/{i}'.format(i=number),
                ]
        elif kind == 'users':
            for number in id_range(value):
                wget_args += [
                    '--warc-header', 'panoramio-user: {i}'.format(i=number),
                    'http://www.panoramio.com/user/{i}'.format(i=number),
                ]
        else:
            raise Exception('Unknown item')

        module_scope = globals()
        if 'bind_address' in module_scope:
            address = module_scope['bind_address']
            wget_args += ['--bind-address', address]
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(wget_args, item)
 def realize(self, item):
     """Return the realized command that runs dedupe.py on the item's WARC.

     Input and output paths are formatted from the item right away; the
     output is the input name with ``-deduplicated`` inserted.
     """
     source_warc = '%(item_dir)s/%(warc_file_base)s.warc.gz' % item
     target_warc = '%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz' % item
     # '-u' turns off output buffering in the child interpreter.
     command = [PYTHON2_EXE, '-u', 'dedupe.py', source_warc, target_warc]
     return realize(command, item)
Пример #32
0
    def realize(self, item):
        """Build the wget-lua argument list for one 500px item.

        The item name has the form ``<type>:<value>``:

        * ``photos`` -- value is a ``;``-separated list of photo ids; the
          photo page plus the comments and photo-details API URLs are queued.
        * ``all`` -- value is ``<start>-<end>``; only the photo page of each
          id in the inclusive range is queued.
        """
        comments_api = 'https://api.500px.com/v1/photos/{}/comments?sort=created_at&include_subscription=1&include_flagged=1&nested=1&page=1&rpp=30'
        details_api = 'https://api.500px.com/v1/photos?image_size%5B%5D=1&image_size%5B%5D=2&image_size%5B%5D=32&image_size%5B%5D=31&image_size%5B%5D=33&image_size%5B%5D=34&image_size%5B%5D=35&image_size%5B%5D=36&image_size%5B%5D=2048&image_size%5B%5D=4&image_size%5B%5D=14&expanded_user_info=true&include_tags=true&include_geo=true&include_equipment_info=true&include_licensing=true&include_releases=true&liked_by=1&following_sample=100&ids={}'

        wget_args = [WGET_LUA, '-U', USER_AGENT, '-nv', '--no-cookies']
        wget_args += ['--lua-script', '500px.lua']
        wget_args += ['-o', ItemInterpolation('%(item_dir)s/wget.log')]
        wget_args += ['--no-check-certificate']
        wget_args += ['--output-document',
                      ItemInterpolation('%(item_dir)s/wget.tmp')]
        wget_args += ['--truncate-output', '-e', 'robots=off', '--rotate-dns']
        wget_args += ['--recursive', '--level=inf', '--no-parent',
                      '--page-requisites']
        wget_args += ['--timeout', '30', '--tries', 'inf']
        wget_args += ['--domains', '500px.com', '--span-hosts']
        wget_args += ['--waitretry', '30']
        wget_args += ['--warc-file',
                      ItemInterpolation('%(item_dir)s/%(warc_file_base)s')]
        wget_args += ['--warc-header', 'operator: Archive Team']
        wget_args += ['--warc-header', '500px-dld-script-version: ' + VERSION]
        wget_args += ['--warc-header',
                      ItemInterpolation('500px-item: %(item_name)s')]

        full_name = item['item_name']
        assert ':' in full_name
        kind, value = full_name.split(':', 1)
        item['item_type'] = kind
        item['item_value'] = value

        if kind == 'photos':
            for photo in value.split(';'):
                wget_args += [
                    '--warc-header', '500px-photo: {}'.format(photo),
                    'https://500px.com/photo/{}'.format(photo),
                    comments_api.format(photo),
                    details_api.format(photo),
                ]
                # Navigation endpoint is deliberately left disabled:
                #wget_args.append('https://api.500px.com/v1/photos/{}/navigation?from=user&formats=jpeg%2Clytro&image_size%5B%5D=1&image_size%5B%5D=2&image_size%5B%5D=32&image_size%5B%5D=31&image_size%5B%5D=33&image_size%5B%5D=34&image_size%5B%5D=35&image_size%5B%5D=36&image_size%5B%5D=2048&image_size%5B%5D=4&image_size%5B%5D=14'.format(id_))
        elif kind == 'all':
            first, last = value.split('-')
            for photo in range(int(first), int(last) + 1):
                wget_args += [
                    '--warc-header', '500px-photo: {}'.format(photo),
                    'https://500px.com/photo/{}'.format(photo),
                ]
        else:
            raise Exception('Unknown item')

        module_scope = globals()
        if 'bind_address' in module_scope:
            address = module_scope['bind_address']
            wget_args += ['--bind-address', address]
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(wget_args, item)
Пример #33
0
 def realize(self, item):
     """Build the wget-lua argument list for one sourceforge item.

     The item name has the form ``project:<name>``; the project's
     sourceforge.net pages, REST endpoints and project web site are queued.

     Raises:
         AssertionError: if the name has no colon or the type is not
             exactly ``project``.
     """
     wget_args = [
         WGET_LUA,
         "-U", USER_AGENT,
         "-nv",
         "--lua-script", "sourceforge.lua",
         "-o", ItemInterpolation("%(item_dir)s/wget.log"),
         "--no-check-certificate",
         "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
         "--truncate-output",
         "-e", "robots=off",
         "--rotate-dns",
         "--recursive", "--level=inf",
         "--no-parent",
         "--page-requisites",
         "--timeout", "30",
         "--tries", "inf",
         "--domains", "sourceforge.net",
         "--span-hosts",
         "--waitretry", "30",
         "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
         "--warc-header", "operator: Archive Team",
         "--warc-header", "sourceforge-dld-script-version: " + VERSION,
         "--warc-header", ItemInterpolation("sourceforge-user: %(item_name)s"),
     ]

     item_name = item['item_name']
     assert ':' in item_name
     item_type, item_value = item_name.split(':', 1)

     item['item_type'] = item_type
     item['item_value'] = item_value

     # A one-element tuple needs the trailing comma; ('project') is just the
     # string 'project', which turned this into a substring test that let
     # '', 'p', 'proj', ... through.
     assert item_type in ('project',)

     if item_type == 'project':
         # NOTE(review): the plain /projects/<name>/ URL is listed twice;
         # kept so the queued URL list is unchanged.
         url_templates = (
             'http://sourceforge.net/projects/{0}/',
             'http://sourceforge.net/projects/{0}/?source=directory',
             'http://sourceforge.net/projects/{0}/?source=directory-featured',
             'http://sourceforge.net/projects/{0}/?source=frontpage&position=1',
             'http://sourceforge.net/projects/{0}/?source=frontpage',
             'http://sourceforge.net/projects/{0}/',
             'http://sourceforge.net/p/{0}/',
             'http://sourceforge.net/rest/p/{0}/',
             'http://sourceforge.net/rest/p/{0}?doap',
             'http://{0}.sourceforge.net/',
         )
         for template in url_templates:
             wget_args.append(template.format(item_value))
     else:
         raise Exception('Unknown item')

     if 'bind_address' in globals():
         wget_args.extend(['--bind-address', globals()['bind_address']])
         print('')
         print('*** Wget will bind address at {0} ***'.format(
             globals()['bind_address']))
         print('')

     return realize(wget_args, item)
Пример #34
0
    def realize(self, item):
        """Build the wget-lua argument list for one codebender item.

        The item name has the form ``<type>:<value>``. ``sketches`` items
        carry a ``<start>-<stop>`` range; for each id in the inclusive range
        the sketch page and download URL are queued, each with and without
        ``?noCookies=true``. The type ``user`` passes the assertion but is
        not implemented and raises ``Exception('Unknown item')``.
        """
        # Cookies are loaded from cookies.txt; "--no-cookies" is not used here.
        wget_args = [WGET_LUA, "-U", USER_AGENT, "-nv"]
        wget_args += ["--load-cookies", "cookies.txt"]
        wget_args += ["--lua-script", "codebender.lua"]
        wget_args += ["-o", ItemInterpolation("%(item_dir)s/wget.log")]
        wget_args += ["--no-check-certificate"]
        wget_args += ["--output-document",
                      ItemInterpolation("%(item_dir)s/wget.tmp")]
        wget_args += ["--truncate-output", "-e", "robots=off", "--rotate-dns"]
        wget_args += ["--recursive", "--level=inf", "--no-parent",
                      "--page-requisites"]
        wget_args += ["--timeout", "30", "--tries", "inf"]
        wget_args += ["--domains", "codebender.cc", "--span-hosts"]
        wget_args += ["--waitretry", "30"]
        wget_args += ["--warc-file",
                      ItemInterpolation("%(item_dir)s/%(warc_file_base)s")]
        wget_args += ["--warc-header", "operator: Archive Team"]
        wget_args += ["--warc-header", "codebender-dld-script-version: " + VERSION]
        wget_args += ["--warc-header",
                      ItemInterpolation("codebender-item: %(item_name)s")]

        full_name = item['item_name']
        assert ':' in full_name
        kind, value = full_name.split(':', 1)
        item['item_type'] = kind
        item['item_value'] = value
        assert kind in ('sketches', 'user')

        if kind != 'sketches':
            raise Exception('Unknown item')

        first, last = value.split('-')
        for number in range(int(first), int(last) + 1):
            wget_args += [
                '--warc-header', 'codebender-sketch: {i}'.format(i=number),
                'https://codebender.cc/sketch:{i}'.format(i=number),
                'https://codebender.cc/sketch:{i}?noCookies=true'.format(i=number),
                'https://codebender.cc/utilities/download/{i}'.format(i=number),
                'https://codebender.cc/utilities/download/{i}?noCookies=true'.format(i=number),
            ]

        module_scope = globals()
        if 'bind_address' in module_scope:
            address = module_scope['bind_address']
            wget_args += ['--bind-address', address]
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(wget_args, item)
Пример #35
0
    def realize(self, item):
        """Build the wget-lua argument list for one yuku item.

        The item name has four colon-separated parts, for example
        ``yuku:10threads:deltasforest29697:17``: a prefix, the type
        (``thread`` or ``10threads``), the forum subdomain and the thread
        number. ``thread`` queues that single topic; ``10threads`` queues
        the ten topics formed by appending each digit to the thread number.
        """
        wget_args = [WGET_LUA, "-U", USER_AGENT, "-nv"]
        wget_args += ["--lua-script", "yuku.lua"]
        wget_args += ["-o", ItemInterpolation("%(item_dir)s/wget.log")]
        wget_args += ["--no-check-certificate"]
        wget_args += ["--output-document",
                      ItemInterpolation("%(item_dir)s/wget.tmp")]
        wget_args += ["--truncate-output", "-e", "robots=off", "--rotate-dns"]
        wget_args += ["--recursive", "--level=inf", "--no-parent",
                      "--no-cookies", "--page-requisites"]
        wget_args += ["--timeout", "30", "--tries", "inf"]
        wget_args += ["--domains", "yuku.com", "--span-hosts"]
        wget_args += ["--waitretry", "30"]
        wget_args += ["--warc-file",
                      ItemInterpolation("%(item_dir)s/%(warc_file_base)s")]
        wget_args += ["--warc-header", "operator: Archive Team"]
        wget_args += ["--warc-header", "yuku-dld-script-version: " + VERSION]
        wget_args += ["--warc-header",
                      ItemInterpolation("yuku-user: %(item_name)s")]

        full_name = item['item_name']
        assert ':' in full_name
        # prefix : type : forum : thread
        _prefix, kind, forum, thread = full_name.split(':', 3)
        item['item_type'] = kind
        item['item_value'] = forum
        item['item_thread'] = thread

        assert kind in ('thread', '10threads')

        if kind == 'thread':
            wget_args.append('http://%s.yuku.com/topic/%s/'%(forum, thread))
        elif kind == '10threads':
            wget_args.extend(
                'http://%s.yuku.com/topic/%s%s/'%(forum, thread, digit)
                for digit in string.digits
            )
        else:
            raise Exception('Unknown item')

        module_scope = globals()
        if 'bind_address' in module_scope:
            address = module_scope['bind_address']
            wget_args += ['--bind-address', address]
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(wget_args, item)
Пример #36
0
    def realize(self, item):
        """Build the wpull argument list for one ftp item.

        The item name has the form ``<sort>:<archive item>:<file>``. The
        named list file is downloaded from archive.org; it must contain an
        ``ITEM_TOTAL_SIZE: <bytes>`` marker, and every line starting with
        ``ftp://`` is queued as a URL.

        Raises:
            AssertionError: if the item name has no colon.
            Exception: if the list cannot be fetched, carries no size
                marker, describes more than ``MAX_SIZE`` bytes, or contains
                a URL with a ``#`` character.
        """
        # urllib.unquote only exists on Python 2; on Python 3 the function
        # lives in urllib.parse. Import locally so either interpreter works.
        try:
            from urllib.parse import unquote
        except ImportError:
            from urllib import unquote

        wget_args = [
            WPULL_EXE,
            "-nv",
            "--python-script", "ftp.py",
            "-o", ItemInterpolation("%(item_dir)s/wpull.log"),
            "--no-check-certificate",
            "--database", ItemInterpolation("%(item_dir)s/wpull.db"),
            "--delete-after",
            "--no-robots",
            "--no-cookies",
            "--rotate-dns",
            "--timeout", "60",
            "--tries", "inf",
            "--wait", "0.5",
            "--random-wait",
            "--waitretry", "5",
            "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
            "--warc-header", "operator: Archive Team",
            "--warc-header", "ftp-dld-script-version: " + VERSION,
            "--warc-header", ItemInterpolation("ftp-user: %(item_name)s"),
            ]

        item_name = item['item_name']
        assert ':' in item_name
        # The leading "sort" component is not used here.
        _item_sort, item_item, item_file = item_name.split(':', 2)

        item['item_item'] = item_item

        MAX_SIZE = 10737418240

        # Build the list URL once so the error messages report exactly the
        # URL that was requested.
        list_url = 'http://archive.org/download/{0}/{1}'.format(item_item, item_file)
        item_list = requests.get(list_url)
        if item_list.status_code != 200:
            raise Exception('You received status code %d with URL %s'%(item_list.status_code, list_url))

        # Fail with a clear message instead of an AttributeError on None
        # when the size marker is missing from the list.
        size_match = re.search(r'ITEM_TOTAL_SIZE: ([0-9]+)', item_list.text)
        if size_match is None:
            raise Exception('No ITEM_TOTAL_SIZE found in %s'%(list_url))
        itemsize = int(size_match.group(1))
        if itemsize > MAX_SIZE:
            raise Exception('Item is %d bytes. This is larger then %d bytes.'%(itemsize, MAX_SIZE))

        for url in item_list.text.splitlines():
            if not url.startswith('ftp://'):
                continue
            # NOTE(review): a second .replace('&', '&') used to follow here;
            # it replaced a string with itself and was dropped. It looks
            # like an HTML-entity decode that lost its entity -- confirm.
            url = url.replace(' ', '%20')
            url = unquote(url)
            if item_item == 'archiveteam_ftp_items_2015120102':
                url = url.replace('ftp://ftp.research.microsoft.com/downloads/downloads/', 'ftp://ftp.research.microsoft.com/downloads/')
            if '#' in url:
                raise Exception('%s containes a bad character.'%(url))
            wget_args.append("{0}".format(url))

        if 'bind_address' in globals():
            wget_args.extend(['--bind-address', globals()['bind_address']])
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)
Пример #37
0
    def realize(self, item):
        """Build the wget-lua argument list for one canvas item.

        The item name has the form ``<type>:<value>`` with type ``user`` or
        ``homepage``. A ``user`` item queues that user's canv.as page and
        enables infinite recursion; a ``homepage`` item queues only the
        front page, without recursion.
        """
        wget_args = [WGET_LUA, "-U", USER_AGENT, "-nv"]
        wget_args += ["--lua-script", "canvas.lua"]
        wget_args += ["-o", ItemInterpolation("%(item_dir)s/wget.log")]
        wget_args += ["--no-check-certificate"]
        wget_args += ["--output-document",
                      ItemInterpolation("%(item_dir)s/wget.tmp")]
        wget_args += ["--truncate-output", "-e", "robots=off"]
        # Recursion is not part of the base options; it is added per item
        # type further down.
        wget_args += ["--no-cookies", "--rotate-dns", "--no-parent",
                      "--page-requisites"]
        wget_args += ["--timeout", "60", "--tries", "inf", "--span-hosts"]
        wget_args += ["--waitretry", "3600"]
        wget_args += ["--domains",
                      "canv.as,drawquest-export.s3-website-us-east-1.amazonaws.com"]
        wget_args += ["--warc-file",
                      ItemInterpolation("%(item_dir)s/%(warc_file_base)s")]
        wget_args += ["--warc-header", "operator: Archive Team"]
        wget_args += ["--warc-header",
                      "canvas-archive-dld-script-version: " + VERSION]
        wget_args += ["--warc-header",
                      ItemInterpolation("canvas-user: %(item_name)s")]
        wget_args += ["--header",
                      "Host: drawquest-export.s3-website-us-east-1.amazonaws.com"]

        kind, value = item['item_name'].split(':', 1)
        item['item_type'] = kind
        item['item_value'] = value

        assert kind in ('user', 'homepage')

        if kind == 'user':
            wget_args += [
                'http://canv.as/{0}/'.format(value),
                "--recursive", "--level=inf",
            ]
        elif kind == 'homepage':
            wget_args.append('http://canv.as/')
        else:
            raise Exception('Unknown item')

        module_scope = globals()
        if 'bind_address' in module_scope:
            address = module_scope['bind_address']
            wget_args += ['--bind-address', address]
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(wget_args, item)
Пример #38
0
    def realize(self, item):
        """Assemble the Wget-AT argument list for So-net U-Page+ items.

        ``item['item_name']`` may carry several names separated by NUL
        characters.  For every name all ``http://`` occurrences are removed,
        the name is recorded in a WARC header and as an ``item-name://``
        pseudo-URL, and the name must look like
        ``userdir:<hostname>/<directory>``; four URL spellings of that
        directory are queued.

        Side effects: stores a newline-separated copy of the names under
        ``item['item_name_newline']`` and prints a notice when a
        module-level ``bind_address`` exists.

        Raises:
            ValueError: a name has no ``:`` or its type is not ``userdir``.
            IndexError: a ``userdir`` value contains no ``/``.

        Returns the result of the module-level ``realize`` helper applied to
        the argument list and ``item`` (presumably resolving the
        ``ItemInterpolation`` placeholders).
        """
        command = [
            WGET_AT, '-U', USER_AGENT, '-nv', '--content-on-error',
            '--lua-script', 'so-net-u-page-plus.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output', '-e', 'robots=off', '--rotate-dns',
            '--recursive', '--level=inf', '--no-parent', '--page-requisites',
            '--timeout', '30', '--tries', 'inf',
            '--domains', 'upp.so-net.ne.jp', '--span-hosts',
            '--waitretry', '30',
            '--warc-file',
            ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'x-wget-at-project-version: ' + VERSION,
            '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID,
            '--warc-dedup-url-agnostic',
        ]

        for raw_name in item['item_name'].split('\0'):
            name = raw_name.replace('http://', '')
            command += [
                '--warc-header', 'x-wget-at-project-item-name: ' + name,
                'item-name://' + name,
            ]
            kind, value = name.split(':', 1)
            if kind != "userdir":
                raise ValueError('item_type not supported.')

            command += ['--warc-header', 'so-net-u-page-plus-userdir: ' + value]
            pieces = value.split("/")
            hostname, user_dir_name = pieces[0], pieces[1]
            bare_url = 'http://{}.upp.so-net.ne.jp/{}'.format(
                hostname, user_dir_name)
            # Several spellings of the same directory are requested because
            # it is unclear which form the server treats as canonical.
            command += [
                bare_url + '/',
                bare_url + '/index.htm',
                bare_url + '/index.html',
                bare_url,
            ]

        item['item_name_newline'] = item['item_name'].replace('\0', '\n')

        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            command += ['--bind-address', address]
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(command, item)
Пример #39
0
    def realize(self, item):
        """Assemble the wget-lua argument list for a GameFront forum item.

        ``item['item_name']`` must be ``forums:<prefix>`` or
        ``members:<prefix>``.  The two halves are stored back on the item as
        ``item_type`` and ``item_value``, and ten URLs are queued: the
        prefix followed by each decimal digit.

        Raises:
            AssertionError: the name has no ``:`` or the type is neither
                ``forums`` nor ``members`` (when assertions are enabled).
            Exception: unsupported type when assertions are disabled.

        Returns the result of the module-level ``realize`` helper applied to
        the argument list and ``item``.
        """
        command = [
            WGET_LUA, '-U', USER_AGENT, '-nv',
            '--lua-script', 'gamefront.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate', '--no-cookies',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output', '-e', 'robots=off', '--rotate-dns',
            '--recursive', '--level=inf', '--no-parent', '--page-requisites',
            '--timeout', '30', '--tries', 'inf',
            '--domains', 'gamefront.com,filefront.com', '--span-hosts',
            '--waitretry', '30',
            '--warc-file',
            ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'gamefront-dld-script-version: ' + VERSION,
            '--warc-header', ItemInterpolation('gamefront-user: %(item_name)s'),
        ]

        full_name = item['item_name']
        assert ':' in full_name
        kind, value = full_name.split(':', 1)
        item['item_type'] = kind
        item['item_value'] = value

        assert kind in ('forums', 'members')

        digits = string.digits
        url_templates = {
            'forums': 'http://forums.filefront.com/showthread.php?t={0}{1}',
            'members': 'http://forums.filefront.com/member.php?u={0}{1}',
        }
        template = url_templates.get(kind)
        if template is None:
            raise Exception('Unknown item')
        # One URL per trailing digit 0-9 appended to the item value.
        command.extend(template.format(value, digit) for digit in digits)

        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            command += ['--bind-address', address]
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(command, item)
Пример #40
0
    def realize(self, item):
        """Build the wget-lua argument list for an Olympe ``site:<name>`` item.

        Splits ``item['item_name']`` at the first ``:`` and stores the halves
        on the item as ``item_type`` / ``item_value``.  For a ``site`` item
        it requests ``http://<name>.olympe.in/`` with ``requests`` and uses
        ``response.url`` (the URL the request ended up at) as a second seed,
        writes the seed host names to the file ``seedurls`` in the current
        directory, and limits wget to those hosts with ``--domains``.

        Raises:
            AssertionError: the name has no ``:`` or the type is not
                ``site`` (when assertions are enabled).
            Exception: unsupported type when assertions are disabled.
            AttributeError: ``response.url`` does not start with an
                ``http(s)://host`` prefix (the regex match is ``None``).

        Returns the result of the module-level ``realize`` helper applied to
        the argument list and ``item``.
        """
        wget_args = [
            WGET_LUA,
            "-U", USER_AGENT,
            "-nv",
            "--lua-script", "olympe.lua",
            "-o", ItemInterpolation("%(item_dir)s/wget.log"),
            "--no-check-certificate",
            "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
            "--truncate-output",
            "-e", "robots=off",
            "--rotate-dns",
            "--recursive", "--level=inf",
            "--no-parent",
            "--page-requisites",
            "--timeout", "30",
            "--tries", "inf",
            "--span-hosts",
            "--waitretry", "30",
            "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
            "--warc-header", "operator: Archive Team",
            "--warc-header", "olympe-dld-script-version: " + VERSION,
            "--warc-header", ItemInterpolation("olympe-user: %(item_name)s"),
        ]

        item_name = item['item_name']
        assert ':' in item_name
        item_type, item_value = item_name.split(':', 1)

        item['item_type'] = item_type
        item['item_value'] = item_value

        # The trailing comma matters: ('site') is just the string 'site', so
        # `in` would be a substring test that also accepts '', 'it', 'si', ...
        assert item_type in ('site',)

        if item_type == 'site':
            urls = ['http://' + item_value + '.olympe.in/']
            response = requests.get(urls[0])
            # Second seed: scheme + host of wherever the request ended up.
            urls.append(re.search(r'^(https?://[^/]+)', response.url).group(1))
            # Host names of both seeds, computed once and reused below.
            hosts = [re.search(r'^https?://([^/]+)', url).group(1)
                     for url in urls]
            with open('seedurls', 'w') as seed_file:
                seed_file.write('\n'.join(hosts))
            wget_args.extend(urls)
            wget_args.append(response.url)
            wget_args += ["--domains", ','.join(hosts)]
        else:
            raise Exception('Unknown item')

        if 'bind_address' in globals():
            wget_args.extend(['--bind-address', globals()['bind_address']])
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)
Пример #41
0
    def realize(self, item):
        """Build the wget-lua argument list for a Yahoo Maps ``tiles`` item.

        ``item['item_name']`` has the form
        ``tiles:<layer>:<number>:<start>-<end>``.  The type and the remainder
        are stored on the item as ``item_type`` / ``item_value``.  For every
        integer in ``range(start, end)`` (``end`` itself is excluded) three
        tile URLs are queued: base map, hybrid and satellite.

        Side effects: prints ``start`` and ``end``; prints a notice when a
        module-level ``bind_address`` exists.

        Raises:
            AssertionError: the name has no ``:`` or the type is not
                ``tiles`` (when assertions are enabled).
            ValueError: the value does not split into exactly three
                ``:``-separated parts / two ``-``-separated bounds, or a
                bound is not an integer.
            Exception: unsupported type when assertions are disabled.

        Returns the result of the module-level ``realize`` helper applied to
        the argument list and ``item``.
        """
        wget_args = [
            WGET_LUA,
            "-U", USER_AGENT,
            "-nv",
            "--lua-script", "yahoomaps.lua",
            "-o", ItemInterpolation("%(item_dir)s/wget.log"),
            "--no-check-certificate",
            "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
            "--truncate-output",
            "-e", "robots=off",
            "--rotate-dns",
            # "--recursive", "--level=inf" and "--page-requisites" are
            # deliberately not passed: only the listed tile URLs are fetched.
            "--no-parent",
            "--timeout", "30",
            "--tries", "inf",
            "--domains", "yahoo.com,here.com",
            "--span-hosts",
            "--waitretry", "30",
            "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
            "--warc-header", "operator: Archive Team",
            # NOTE(review): header name says "sourceforge" in a yahoomaps
            # script -- looks like a copy-paste leftover; left unchanged
            # because it is written into the WARC.
            "--warc-header", "sourceforge-dld-script-version: " + VERSION,
            "--warc-header", ItemInterpolation("yahoomaps-user: %(item_name)s"),
        ]

        item_name = item['item_name']
        assert ':' in item_name
        item_type, item_value = item_name.split(':', 1)

        item['item_type'] = item_type
        item['item_value'] = item_value

        # The trailing comma matters: ('tiles') is just the string 'tiles',
        # so `in` would be a substring test instead of tuple membership.
        assert item_type in ('tiles',)

        if item_type == 'tiles':
            tile_layer, tile_num, tile_range = item_value.split(':')
            tile_start, tile_end = tile_range.split('-')
            print(tile_start)
            print(tile_end)
            for tilenum in range(int(tile_start), int(tile_end)):
                wget_args.append('http://1.base.maps.api.here.com/maptile/2.1/maptile/187ddf591c/normal.day/{0}/{1}/{2}/256/png8?lg=ENG&token=TrLJuXVK62IQk0vuXFzaig%3D%3D&requestid=yahoo.prod&app_id=eAdkWGYRoc4RfxVo0Z4B'.format(tile_layer, tile_num, tilenum))
                wget_args.append('http://1.aerial.maps.api.here.com/maptile/2.1/maptile/187ddf591c/hybrid.day/{0}/{1}/{2}/256/jpg?lg=ENG&token=TrLJuXVK62IQk0vuXFzaig%3D%3D&requestid=yahoo.prod&app_id=eAdkWGYRoc4RfxVo0Z4B'.format(tile_layer, tile_num, tilenum))
                wget_args.append('http://1.aerial.maps.api.here.com/maptile/2.1/maptile/187ddf591c/satellite.day/{0}/{1}/{2}/256/jpg?lg=ENG&token=TrLJuXVK62IQk0vuXFzaig%3D%3D&requestid=yahoo.prod&app_id=eAdkWGYRoc4RfxVo0Z4B'.format(tile_layer, tile_num, tilenum))
        else:
            raise Exception('Unknown item')

        if 'bind_address' in globals():
            wget_args.extend(['--bind-address', globals()['bind_address']])
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)
Пример #42
0
    def realize(self, item):
        """Assemble the wpull argument list for an ``examplecity`` item.

        ``item['item_name']`` is used verbatim as the host (and optional
        path) of the single seed URL ``http://<item_name>``.  Recursion is
        limited to depth 2 and the user-agent / domain restrictions are left
        at wpull's defaults.

        Side effect: prints a notice when a module-level ``bind_address``
        exists.

        Returns the result of the module-level ``realize`` helper applied to
        the argument list and ``item``.
        """
        command = [
            WPULL_EXE, '-nv',
            '--python-script', 'examplecity.py',
            '-o', ItemInterpolation('%(item_dir)s/wpull.log'),
            '--no-check-certificate',
            '--database', ItemInterpolation('%(item_dir)s/wpull.db'),
            '--delete-after', '--no-robots', '--no-cookies', '--rotate-dns',
            '--recursive', '--level=2', '--no-parent', '--page-requisites',
            '--span-hosts-allow', 'page-requisites,linked-pages',
            '--timeout', '30',
            '--tries', '2',
            '--wait', '0.5', '--random-wait',
            '--waitretry', '5',
            '--warc-file',
            ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'examplecity-dld-script-version: ' + VERSION,
            '--warc-header',
            ItemInterpolation('examplecity-user: %(item_name)s'),
        ]

        seed_url = "http://{0}".format(item['item_name'])
        command.append(seed_url)

        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            command += ['--bind-address', address]
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(command, item)
Пример #43
0
    def realize(self, item):
        """Build the wget-lua argument list for a Coursera ``oldcourse`` item.

        Splits ``item['item_name']`` at the first ``:`` and stores the halves
        on the item as ``item_type`` / ``item_value``.  For an ``oldcourse``
        item it first runs a separate wget-lua login request through
        ``os.system`` (cookies are saved to ``cookies.txt``, which the main
        command loads via ``--load-cookies``), deletes the file ``v3`` from
        the current directory (presumably the login response saved by wget),
        and queues ``https://www.coursera.org/course/<value>``.

        Raises:
            AssertionError: the name has no ``:`` or the type is not
                ``oldcourse`` (when assertions are enabled).
            OSError: the file ``v3`` does not exist when it is removed.
            Exception: unsupported type when assertions are disabled.

        Returns the result of the module-level ``realize`` helper applied to
        the argument list and ``item``.
        """
        wget_args = [
            WGET_LUA,
            "-U", USER_AGENT,
            "-nv",
            "--load-cookies", "cookies.txt",
            "--lua-script", "coursera.lua",
            "-o", ItemInterpolation("%(item_dir)s/wget.log"),
            "--no-check-certificate",
            "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
            "--truncate-output",
            "-e", "robots=off",
            "--rotate-dns",
            "--recursive", "--level=inf",
            "--no-parent",
            "--page-requisites",
            "--timeout", "30",
            "--tries", "inf",
            "--domains", "coursera.org",
            "--span-hosts",
            "--waitretry", "30",
            "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
            "--warc-header", "operator: Archive Team",
            "--warc-header", "coursera-dld-script-version: " + VERSION,
            "--warc-header", ItemInterpolation("coursera-user: %(item_name)s"),
        ]

        item_name = item['item_name']
        assert ':' in item_name
        item_type, item_value = item_name.split(':', 1)

        item['item_type'] = item_type
        item['item_value'] = item_value

        # The trailing comma matters: ('oldcourse') is just a string, so `in`
        # would be a substring test instead of tuple membership.
        assert item_type in ('oldcourse',)

        if item_type == 'oldcourse':
            # Made-up CSRF values: digit strings sent both as cookie and as
            # headers so that they match each other.
            X_CSRFToken = ''.join(random.choice(string.digits) for _ in range(20))
            X_CSRF2_Cookie = 'csrf2_token_' + ''.join(random.choice(string.digits) for _ in range(8))
            X_CSRF2_Token = ''.join(random.choice(string.digits) for _ in range(24))
            Cookie = "csrftoken=%s; %s=%s" % (X_CSRFToken, X_CSRF2_Cookie, X_CSRF2_Token)
            # NOTE(review): the e-mail in --post-data appears to have been
            # mangled by an e-mail obfuscator ("[email protected]"); the real
            # value must be restored by someone who knows it.  The command is
            # a shell string, but every interpolated value is generated above
            # from digits only.
            os.system(WGET_LUA + " --save-cookies cookies.txt --keep-session-cookies --post-data '[email protected]&password=123456&webrequest=true' --header='Cookie: " + Cookie + "' --header='X-CSRFToken: " + X_CSRFToken + "' --header='X-CSRF2-Cookie: " + X_CSRF2_Cookie + "' --header='X-CSRF2-Token: " + X_CSRF2_Token + "' https://www.coursera.org/api/login/v3")
            os.remove('v3')
            wget_args.append('https://www.coursera.org/course/' + item_value)
        else:
            raise Exception('Unknown item')

        if 'bind_address' in globals():
            wget_args.extend(['--bind-address', globals()['bind_address']])
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)
Пример #44
0
 def realize(self, item):
     """Build the wget-lua argument list for a Toshiba ``download`` item.

     Splits ``item['item_name']`` at the first ``:`` and stores the halves
     on the item as ``item_type`` / ``item_value``.  For a ``download``
     item the value is a content-id prefix: for each of three support-site
     endpoints, all 100 two-digit suffixes (``00`` .. ``99``) are appended
     to the prefix and queued, endpoint by endpoint.

     Raises:
         AssertionError: the name has no ``:`` or the type is not
             ``download`` (when assertions are enabled).
         Exception: unsupported type when assertions are disabled.

     Returns the result of the module-level ``realize`` helper applied to
     the argument list and ``item``.
     """
     wget_args = [
         WGET_LUA,
         "-U", USER_AGENT,
         "-nv",
         "--lua-script", "toshiba.lua",
         "-o", ItemInterpolation("%(item_dir)s/wget.log"),
         "--no-check-certificate",
         "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
         "--truncate-output",
         "-e", "robots=off",
         "--rotate-dns",
         "--recursive", "--level=inf",
         "--no-parent",
         "--page-requisites",
         "--timeout", "30",
         "--tries", "inf",
         "--domains", "toshiba.com",
         "--span-hosts",
         "--waitretry", "30",
         "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
         "--warc-header", "operator: Archive Team",
         "--warc-header", "toshiba-dld-script-version: " + VERSION,
         "--warc-header", ItemInterpolation("toshiba-user: %(item_name)s"),
     ]

     item_name = item['item_name']
     assert ':' in item_name
     item_type, item_value = item_name.split(':', 1)

     item['item_type'] = item_type
     item['item_value'] = item_value

     # The trailing comma matters: ('download') is just a string, so `in`
     # would be a substring test instead of tuple membership.
     assert item_type in ('download',)

     if item_type == 'download':
         suffixes = string.digits
         url_templates = (
             'http://support.toshiba.com/sscontent?contentId={0}{1}{2}',
             'http://support.toshiba.com/support/viewContentDetail?contentId={0}{1}{2}',
             'http://support.toshiba.com/support/staticContentDetail?contentId={0}{1}{2}',
         )
         # Same order as three separate loops: all suffixes for the first
         # endpoint, then the second, then the third.
         for template in url_templates:
             for a in suffixes:
                 for b in suffixes:
                     wget_args.append(template.format(item_value, a, b))
     else:
         raise Exception('Unknown item')

     if 'bind_address' in globals():
         wget_args.extend(['--bind-address', globals()['bind_address']])
         print('')
         print('*** Wget will bind address at {0} ***'.format(
             globals()['bind_address']))
         print('')

     return realize(wget_args, item)
 def realize(self, item):
     """Assemble a plain ``wget`` command that downloads an item's megawarc.

     The output path is ``<item['data_dir']>/<item['item_name']>.warc.gz``
     and the source is the matching ``archiveteam_<item_name>`` download on
     archive.org, expressed as an ``ItemInterpolation``.

     Returns the result of the module-level ``realize`` helper applied to
     the argument list and ``item``.
     """
     name = item['item_name']
     target_path = item['data_dir'] + "/" + name + ".warc.gz"
     source_url = ItemInterpolation(
         "https://archive.org/download/archiveteam_%(item_name)s/%(item_name)s.megawarc.warc.gz"
     )
     command = [
         'wget',
         '-nv',
         '-U', 'ArchiveTeam; Googlebot/2.1',
         '--tries', '5',
         '--waitretry', '5',
         '-O', target_path,
         source_url,
     ]
     return realize(command, item)
Пример #46
0
    def realize(self, item):
        """Assemble the wget-lua argument list for a plays.tv item.

        ``item['item_name']`` is ``user:<id>`` or ``video:<id>[;<id>...]``.
        The two halves are stored on the item as ``item_type`` and
        ``item_value``.  A user item queues the user API URL; a video item
        queues one media API URL per ``;``-separated id.  Each id is also
        recorded in its own WARC header.

        Side effects: prints every video id; prints a notice when a
        module-level ``bind_address`` exists.

        Raises:
            AssertionError: the name has no ``:`` (when assertions are
                enabled).
            Exception: the type is neither ``user`` nor ``video``.

        Returns the result of the module-level ``realize`` helper applied to
        the argument list and ``item``.
        """
        command = [
            WGET_LUA, '-U', USER_AGENT, '-nv', '--no-cookies',
            '--content-on-error',
            '--lua-script', 'playstv.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output', '-e', 'robots=off', '--rotate-dns',
            '--recursive', '--level=inf', '--no-parent', '--page-requisites',
            '--timeout', '30', '--tries', 'inf',
            '--domains', 'plays.tv', '--span-hosts',
            '--waitretry', '30',
            '--warc-file',
            ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'playstv-dld-script-version: ' + VERSION,
            '--warc-header', ItemInterpolation('playstv-item: %(item_name)s'),
        ]

        full_name = item['item_name']
        assert ':' in full_name
        kind, value = full_name.split(':', 1)
        item['item_type'] = kind
        item['item_value'] = value

        if kind not in ('user', 'video'):
            raise Exception('Unknown item')

        if kind == 'user':
            command += [
                '--warc-header', 'playstv-user-id: ' + value,
                'https://plays.tv/playsapi/usersys/v1/user/' + value,
            ]
        else:
            for video_id in value.split(';'):
                print(video_id)
                command += [
                    '--warc-header', 'playstv-video-id: ' + video_id,
                    'https://plays.tv/playsapi/feedsys/v1/media/' + video_id,
                ]

        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            command += ['--bind-address', address]
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(command, item)
Пример #47
0
 def realize(self, item):
     """Assemble the wget-lua argument list for a LayerVault item.

     ``item['item_name']`` is ``project:<prefix>`` or ``file:<prefix>``.
     The two halves are stored on the item as ``item_type`` and
     ``item_value``.  All 100 two-digit suffixes (``00`` .. ``99``) are
     appended to the prefix and queued against the matching v2 API
     endpoint.

     Raises:
         AssertionError: the name has no ``:`` or the type is neither
             ``project`` nor ``file`` (when assertions are enabled).
         Exception: unsupported type when assertions are disabled.

     Returns the result of the module-level ``realize`` helper applied to
     the argument list and ``item``.
     """
     command = [
         WGET_LUA, '-U', USER_AGENT, '-nv',
         '--lua-script', 'layervault.lua',
         '-o', ItemInterpolation('%(item_dir)s/wget.log'),
         '--no-check-certificate',
         '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
         '--truncate-output', '-e', 'robots=off', '--rotate-dns',
         '--recursive', '--level=inf', '--no-parent', '--page-requisites',
         '--timeout', '30', '--tries', 'inf',
         '--domains', 'layervault.com,layervau.lt', '--span-hosts',
         '--waitretry', '30',
         '--warc-file',
         ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
         '--warc-header', 'operator: Archive Team',
         '--warc-header', 'layervault-dld-script-version: ' + VERSION,
         '--warc-header', ItemInterpolation('layervault-user: %(item_name)s'),
     ]

     full_name = item['item_name']
     assert ':' in full_name
     kind, value = full_name.split(':', 1)
     item['item_type'] = kind
     item['item_value'] = value

     assert kind in ('project', 'file')

     digits = string.digits

     if kind == 'project':
         template = 'https://layervault.com/api/v2/projects/{0}{1}{2}'
     elif kind == 'file':
         template = 'https://layervault.com/api/v2/files/{0}{1}{2}'
     else:
         raise Exception('Unknown item')

     # Tens digit varies slowest, units digit fastest.
     for tens in digits:
         for units in digits:
             command.append(template.format(value, tens, units))

     module_globals = globals()
     if 'bind_address' in module_globals:
         address = module_globals['bind_address']
         command += ['--bind-address', address]
         print('')
         print('*** Wget will bind address at {0} ***'.format(address))
         print('')

     return realize(command, item)
Пример #48
0
    def process_one(self, item):
        """Start one download subprocess for the item's current URL.

        Runs inside ``self.task_cwd()``.  The URL is read from
        ``item['WgetDownloadMany.current_url']`` and appended to the realized
        ``self.args``; the environment comes from the realized ``self.env``.
        Output and end-of-process events are routed to
        ``self.on_subprocess_stdout`` / ``self.on_subprocess_end``.  After
        the process is started, ``self.stdin_data(item)`` is written to its
        stdin, which is then closed.
        """
        with self.task_cwd():
            current_url = item['WgetDownloadMany.current_url']
            item.log_output("Start downloading URL %s" % current_url)

            command = realize(self.args, item) + [current_url]
            environment = realize(self.env, item)
            proc = seesaw.externalprocess.AsyncPopen(
                args=command,
                env=environment,
                stdin=subprocess.PIPE,
                close_fds=True,
            )

            # Handlers are attached before run() so no event is missed.
            stdout_handler = functools.partial(
                self.on_subprocess_stdout, proc, item)
            end_handler = functools.partial(self.on_subprocess_end, item)
            proc.on_output += stdout_handler
            proc.on_end += end_handler

            proc.run()

            proc.stdin.write(self.stdin_data(item))
            proc.stdin.close()
    def realize(self, item):
        """Build the Wget-AT argument list for Super Mario Maker Bookmark items.

        ``item['item_name']`` may carry several names separated by NUL
        characters; a newline-separated copy is stored under
        ``item['item_name_newline']``.  Every name is recorded in a WARC
        header and as an ``item-name://`` pseudo-URL and must be
        ``user:<id>`` or ``course:<id>``; the id is recorded in a
        type-specific WARC header and the matching profile / course page is
        queued.

        Side effect: prints a notice when a module-level ``bind_address``
        exists.

        Raises:
            ValueError: a name has no ``:`` or its type is neither ``user``
                nor ``course``.

        Returns the result of the module-level ``realize`` helper applied to
        the argument list and ``item``.
        """
        wget_args = [
            WGET_AT, '-U', USER_AGENT, '-nv', '--content-on-error',
            '--load-cookies', 'cookies.txt', '--lua-script',
            'super-mario-maker-bookmarks.lua', '-o',
            ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate', '--output-document',
            ItemInterpolation('%(item_dir)s/wget.tmp'), '--truncate-output',
            '-e', 'robots=off', '--rotate-dns', '--recursive', '--level=inf',
            '--no-parent', '--page-requisites', '--timeout', '30', '--tries',
            # NOTE(review): 'voat.co' looks like a leftover from another
            # project; left unchanged because the intended value is unknown.
            'inf', '--domains', 'voat.co', '--span-hosts', '--waitretry', '30',
            '--warc-file',
            ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team', '--warc-header',
            'x-wget-at-project-version: ' + VERSION, '--warc-header',
            'x-wget-at-project-name: ' + TRACKER_ID,
            '--warc-dedup-url-agnostic'
        ]

        item_names = item['item_name'].split('\0')
        item['item_name_newline'] = item['item_name'].replace('\0', '\n')

        for item_name in item_names:
            wget_args.extend(
                ['--warc-header', 'x-wget-at-project-item-name: ' + item_name])
            wget_args.append('item-name://' + item_name)
            item_type, item_value = item_name.split(':', 1)
            if item_type == 'user':
                # This branch was corrupted in the source text; restored to
                # mirror the 'course' branch below (header, then page URL).
                wget_args.extend([
                    '--warc-header',
                    'super-mario-world-bookmarks-user: ' + item_value
                ])
                wget_args.append(
                    'https://supermariomakerbookmark.nintendo.net/profile/' +
                    item_value)
            elif item_type == 'course':
                wget_args.extend([
                    '--warc-header',
                    'super-mario-world-bookmarks-course: ' + item_value
                ])
                wget_args.append(
                    'https://supermariomakerbookmark.nintendo.net/courses/' +
                    item_value)
            else:
                raise ValueError('item_type not supported.')

        if 'bind_address' in globals():
            wget_args.extend(['--bind-address', globals()['bind_address']])
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)
Пример #50
0
 def realize(self, item):
     """Build the wget-lua argument list for a Jux ``jux:<subdomain>`` item.

     Splits ``item['item_name']`` at the first ``:`` and stores the halves
     on the item as ``item_type`` / ``item_value``.  For a ``jux`` item six
     URLs on ``http://<subdomain>.jux.com/`` are queued: the front page,
     ``robots.txt``, ``sitemap.xml``, ``owner.json`` and ``quarks.json``
     with and without a very large ``per_page``.

     Raises:
         AssertionError: the name has no ``:`` or the type is not ``jux``
             (when assertions are enabled).
         Exception: unsupported type when assertions are disabled.

     Returns the result of the module-level ``realize`` helper applied to
     the argument list and ``item``.
     """
     wget_args = [
         WGET_LUA,
         "-U", USER_AGENT,
         "-nv",
         "--lua-script", "jux.lua",
         "-o", ItemInterpolation("%(item_dir)s/wget.log"),
         "--no-check-certificate",
         "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
         "--truncate-output",
         "-e", "robots=off",
         "--rotate-dns",
         "--recursive", "--level=inf",
         "--no-parent",
         "--page-requisites",
         "--timeout", "30",
         "--tries", "inf",
         "--domains", "jux.com",
         "--span-hosts",
         "--waitretry", "30",
         "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
         "--warc-header", "operator: Archive Team",
         "--warc-header", "jux-dld-script-version: " + VERSION,
         "--warc-header", ItemInterpolation("jux-user: %(item_name)s"),
     ]

     item_name = item['item_name']
     assert ':' in item_name
     item_type, item_value = item_name.split(':', 1)

     item['item_type'] = item_type
     item['item_value'] = item_value

     # The trailing comma matters: ("jux") is just the string "jux", so
     # `in` would be a substring test that also accepts '', 'j', 'ux', ...
     assert item_type in ("jux",)

     if item_type == 'jux':
         site_root = 'http://{0}.jux.com/'.format(item_value)
         for path in ('', 'robots.txt', 'sitemap.xml', 'owner.json',
                      'quarks.json', 'quarks.json?per_page=1000000000'):
             wget_args.append(site_root + path)
     else:
         raise Exception('Unknown item')

     if 'bind_address' in globals():
         wget_args.extend(['--bind-address', globals()['bind_address']])
         print('')
         print('*** Wget will bind address at {0} ***'.format(
             globals()['bind_address']))
         print('')

     return realize(wget_args, item)
Пример #51
0
    def realize(self, item):
        """Assemble the Wget-AT argument list for a Bitbucket ``hg`` item.

        Fetches a zstd dictionary via ``ZstdDict.get_dict()``, writes its
        ``'dict'`` entry to ``<item_dir>/zstdict`` and records the
        dictionary id and ``TRACKER_ID`` on the item as ``dict_id`` /
        ``dict_project``.  ``item['item_name']`` is split at the first ``:``
        into ``item_type`` / ``item_value`` (both stored on the item); for
        ``hg`` items four repository URLs are queued.

        Side effect: prints a notice when a module-level ``bind_address``
        exists.

        Raises:
            ValueError: the name has no ``:``.
            Exception: the type is not ``hg``.

        Returns the result of the module-level ``realize`` helper applied to
        the argument list and ``item``.
        """
        command = [
            WGET_AT,
            '-U', USER_AGENT,
            '-nv',
            '--no-cookies',
            '--content-on-error',
            '--lua-script', 'bitbucket.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output',
            '-e', 'robots=off',
            '--rotate-dns',
            '--recursive', '--level=inf',
            '--no-parent',
            '--page-requisites',
            '--timeout', '30',
            '--tries', 'inf',
            '--domains', 'bitbucket.com',
            '--span-hosts',
            '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'bitbucket-dld-script-version: ' + VERSION,
            '--warc-header', ItemInterpolation('bitbucket-item: %(item_name)s'),
            '--warc-dedup-url-agnostic',
            '--warc-compression-use-zstd',
            '--warc-zstd-dict-no-include',
        ]

        zstd_info = ZstdDict.get_dict()
        dict_path = os.path.join(item['item_dir'], 'zstdict')
        with open(dict_path, 'wb') as dict_file:
            dict_file.write(zstd_info['dict'])
        item['dict_id'] = zstd_info['id']
        item['dict_project'] = TRACKER_ID
        command += [
            '--warc-zstd-dict', ItemInterpolation('%(item_dir)s/zstdict')]

        kind, value = item['item_name'].split(':', 1)
        item['item_type'] = kind
        item['item_value'] = value

        if kind != 'hg':
            raise Exception('Unknown item')

        repo_url = 'https://bitbucket.org/' + value
        command += [
            repo_url + '/src/',
            repo_url,
            repo_url + '/src/default/',
            'https://bitbucket.org/!api/2.0/repositories/' + value,
        ]

        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            command += ['--bind-address', address]
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(command, item)
Пример #52
0
 def realize(self, item):
     """Build the wget-lua argument list for an Ancestry ``genforum`` item.

     Splits ``item['item_name']`` at the first ``:`` and stores the halves
     on the item as ``item_type`` / ``item_value``.  For a ``genforum``
     item the forum path is queued on both ``genforum.genealogy.com`` and
     ``genforum.com``, and ``--no-cookies`` is added.

     Raises:
         AssertionError: the name has no ``:`` or the type is not
             ``genforum`` (when assertions are enabled).
         Exception: unsupported type when assertions are disabled.

     Returns the result of the module-level ``realize`` helper applied to
     the argument list and ``item``.
     """
     wget_args = [
         WGET_LUA,
         "-U", USER_AGENT,
         "-nv",
         "--lua-script", "ancestry.lua",
         "-o", ItemInterpolation("%(item_dir)s/wget.log"),
         "--no-check-certificate",
         "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
         "--truncate-output",
         "-e", "robots=off",
         "--rotate-dns",
         "--recursive", "--level=inf",
         "--no-parent",
         "--page-requisites",
         "--timeout", "30",
         "--tries", "inf",
         "--domains", "mundia.com,muncn.com,genealogy.com,familyorigins.com,genforum.com,myfamily.com",
         "--span-hosts",
         "--waitretry", "30",
         "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
         "--warc-header", "operator: Archive Team",
         "--warc-header", "ancestry-dld-script-version: " + VERSION,
         "--warc-header", ItemInterpolation("ancestry-user: %(item_name)s"),
     ]

     # Example item names seen for related item types (only "genforum" is
     # handled below):
     #   genealogy:users:c:o:x:Helen-Cox-NJ
     #   familytreemaker:users:s:c:h:Aaron-J-Schwartz
     #   familyorigins:users:s:c:h:Beverly-G-Schweppe
     item_name = item['item_name']
     assert ':' in item_name
     item_type, item_value = item_name.split(':', 1)

     item['item_type'] = item_type
     item['item_value'] = item_value

     # The trailing comma matters: ("genforum") is just a string, so `in`
     # would be a substring test (accepting e.g. 'forum' or 'gen').
     assert item_type in ("genforum",)

     if item_type == "genforum":
         wget_args.append('http://genforum.genealogy.com/{0}/'.format(item_value))
         wget_args.append('http://genforum.com/{0}/'.format(item_value))
         wget_args.extend(["--no-cookies"])
     else:
         raise Exception('Unknown item')

     if 'bind_address' in globals():
         wget_args.extend(['--bind-address', globals()['bind_address']])
         print('')
         print('*** Wget will bind address at {0} ***'.format(
             globals()['bind_address']))
         print('')

     return realize(wget_args, item)
Пример #53
0
    def process(self, item):
        """Start the external process for ``item`` and feed its stdin.

        Runs inside ``self.task_cwd()``.  The command and environment come
        from the realized ``self.args`` / ``self.env``.  Output and
        end-of-process events are routed to ``self.on_subprocess_stdout`` /
        ``self.on_subprocess_end``.  After the process is started,
        ``self.stdin_data(item)`` is written to its stdin; any exception
        from that write is logged on the item and flagged via
        ``item["ExternalProcess.stdin_write_error"]`` instead of being
        raised.  Stdin is closed afterwards in either case.
        """
        with self.task_cwd():
            command = realize(self.args, item)
            environment = realize(self.env, item)
            proc = AsyncPopen2(
                args=command,
                env=environment,
                stdin=subprocess.PIPE,
                close_fds=True,
            )

            # Handlers are attached before run() so no event is missed.
            stdout_handler = functools.partial(
                self.on_subprocess_stdout, proc, item)
            end_handler = functools.partial(self.on_subprocess_end, item)
            proc.on_output += stdout_handler
            proc.on_end += end_handler

            proc.run()

            try:
                proc.stdin.write(self.stdin_data(item))
            except Exception as error:
                # FIXME: We need to properly propagate errors
                message = "Error writing to process: %s" % str(error)
                item.log_output(message)
                item["ExternalProcess.stdin_write_error"] = True

            proc.stdin.close()
Пример #54
0
    def realize(self, item):
        """Build the wget-lua argument list for a FriendFeed ``account`` item.

        Splits ``item['item_name']`` at the first ``:`` and stores the halves
        on the item as ``item_type`` / ``item_value``.  For an ``account``
        item the first page of the v2 feed API and the public profile page
        are queued.  Recursion and page requisites are not enabled on the
        wget command line.

        Raises:
            AssertionError: the name has no ``:`` or the type is not
                ``account`` (when assertions are enabled).
            Exception: unsupported type when assertions are disabled.

        Returns the result of the module-level ``realize`` helper applied to
        the argument list and ``item``.
        """
        wget_args = [
            WGET_LUA,
            "-U", USER_AGENT,
            "-nv",
            "--lua-script", "friendfeed.lua",
            "-o", ItemInterpolation("%(item_dir)s/wget.log"),
            "--no-check-certificate",
            "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
            "--truncate-output",
            "-e", "robots=off",
            "--rotate-dns",
            # "--recursive", "--level=inf" and "--page-requisites" are
            # deliberately not passed here.
            "--no-parent",
            "--timeout", "30",
            "--tries", "inf",
            "--domains", "friendfeed.com,friendfeed-media.com,friendfeed-api.com",
            "--span-hosts",
            "--waitretry", "30",
            "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
            "--warc-header", "operator: Archive Team",
            "--warc-header", "friendfeed-dld-script-version: " + VERSION,
            "--warc-header", ItemInterpolation("friendfeed-user: %(item_name)s"),
        ]

        item_name = item['item_name']
        assert ':' in item_name
        item_type, item_value = item_name.split(':', 1)

        item['item_type'] = item_type
        item['item_value'] = item_value

        # The trailing comma matters: ('account') is just a string, so `in`
        # would be a substring test (accepting e.g. 'count' or '').
        assert item_type in ('account',)

        if item_type == 'account':
            # Picture endpoints are currently disabled:
            # wget_args.append('http://friendfeed-api.com/v2/picture/{0}'.format(item_value))
            # wget_args.append('http://friendfeed-api.com/v2/picture/{0}?size=small'.format(item_value))
            # wget_args.append('http://friendfeed-api.com/v2/picture/{0}?size=medium'.format(item_value))
            # wget_args.append('http://friendfeed-api.com/v2/picture/{0}?size=large'.format(item_value))
            wget_args.append('http://friendfeed-api.com/v2/feed/{0}?pretty=1&num=100&start=0&hidden=1&raw=1'.format(item_value))
            wget_args.append('http://friendfeed.com/{0}'.format(item_value))
        else:
            raise Exception('Unknown item')

        if 'bind_address' in globals():
            wget_args.extend(['--bind-address', globals()['bind_address']])
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)
Пример #55
0
    def realize(self, item):
        """Assemble the Wget-Lua arguments for a YoYo Games sandbox item.

        The item name is expected as ``<type>:<value>`` with ``<type>`` being
        ``user`` or ``game``.  Both parts are stored on ``item`` under
        ``item_type`` and ``item_value``.  A ``user`` item adds the user page
        URL; a ``game`` item adds the game page URL plus a download URL built
        from the part of the value before the first ``-``.

        When a module-level global ``bind_address`` is defined, a
        ``--bind-address`` option is appended and a notice is printed.

        Returns the result of the free function ``realize`` applied to the
        argument list and ``item``.
        """
        log_path = ItemInterpolation("%(item_dir)s/wget.log")
        tmp_path = ItemInterpolation("%(item_dir)s/wget.tmp")
        warc_path = ItemInterpolation("%(item_dir)s/%(warc_file_base)s")
        user_header = ItemInterpolation("gamemakersandbox-user: %(item_name)s")

        wget_args = [WGET_LUA, "-U", USER_AGENT, "-nv"]
        wget_args += ["--lua-script", "sandbox.lua"]
        wget_args += ["-o", log_path]
        wget_args += ["--no-check-certificate"]
        wget_args += ["--output-document", tmp_path]
        wget_args += ["--truncate-output"]
        wget_args += ["-e", "robots=off"]
        wget_args += ["--rotate-dns"]
        # "--recursive", "--level=inf" are left disabled here.
        wget_args += ["--no-parent", "--page-requisites"]
        wget_args += ["--timeout", "30"]
        wget_args += ["--tries", "inf"]
        wget_args += ["--domains", "yoyogames.com"]
        wget_args += ["--span-hosts"]
        wget_args += ["--waitretry", "30"]
        wget_args += ["--warc-file", warc_path]
        wget_args += ["--warc-header", "operator: Archive Team"]
        wget_args += ["--warc-header",
                      "gamemakersandbox-dld-script-version: " + VERSION]
        wget_args += ["--warc-header", user_header]

        full_name = item['item_name']
        assert ':' in full_name
        kind, value = full_name.split(':', 1)

        item['item_type'] = kind
        item['item_value'] = value

        assert kind in ("user", 'game')

        site = 'http://sandbox.yoyogames.com/'
        if kind == 'user':
            wget_args.append(site + 'users/' + value)
        elif kind == 'game':
            wget_args.append(site + 'games/' + value)

            # The download URL uses only the leading id, not the full slug.
            game_id = value.split('-', 1)[0]
            wget_args.append(site + 'games/' + game_id + '/download')
        else:
            raise Exception('Unknown item')

        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            wget_args.extend(['--bind-address', address])
            banner = '*** Wget will bind address at {0} ***'.format(address)
            print('')
            print(banner)
            print('')

        return realize(wget_args, item)
Пример #56
0
    def realize(self, item):
        """Assemble the Wget-Lua arguments for a gfycat ``disco`` item.

        The item name is expected as ``<type>:<value>``; both parts are
        stored on ``item`` as ``item_type`` and ``item_value``.  For a
        ``disco`` item, one API URL is added per line of the file ``animals``
        that does not start with ``#``: the URL is the gfycats API prefix,
        the item value, and the stripped line.  Any other type raises
        ``Exception('Unknown item')``.

        When a module-level global ``bind_address`` is defined, a
        ``--bind-address`` option is appended and a notice is printed.

        Returns the result of the free function ``realize`` applied to the
        argument list and ``item``.
        """
        log_path = ItemInterpolation('%(item_dir)s/wget.log')
        tmp_path = ItemInterpolation('%(item_dir)s/wget.tmp')
        warc_path = ItemInterpolation('%(item_dir)s/%(warc_file_base)s')
        item_header = ItemInterpolation('gfycat-item: %(item_name)s')

        wget_args = [WGET_LUA, '-U', USER_AGENT, '-nv']
        wget_args += ['--no-cookies', '--content-on-error']
        wget_args += ['--lua-script', 'gfycat.lua']
        wget_args += ['-o', log_path]
        wget_args += ['--no-check-certificate']
        wget_args += ['--output-document', tmp_path]
        wget_args += ['--truncate-output']
        wget_args += ['-e', 'robots=off']
        wget_args += ['--rotate-dns']
        wget_args += ['--recursive', '--level=inf']
        wget_args += ['--no-parent', '--page-requisites']
        wget_args += ['--timeout', '30']
        wget_args += ['--tries', 'inf']
        wget_args += ['--domains', 'gfycat.com']
        wget_args += ['--span-hosts']
        wget_args += ['--waitretry', '30']
        wget_args += ['--warc-file', warc_path]
        wget_args += ['--warc-header', 'operator: Archive Team']
        wget_args += ['--warc-header', 'gfycat-dld-script-version: ' + VERSION]
        wget_args += ['--warc-header', item_header]

        full_name = item['item_name']
        assert ':' in full_name
        kind, value = full_name.split(':', 1)

        item['item_type'] = kind
        item['item_value'] = value

        if kind != 'disco':
            raise Exception('Unknown item')

        url_prefix = 'https://api.gfycat.com/v1/gfycats/' + value
        with open('animals', 'r') as handle:
            # Lines beginning with '#' are skipped; the rest become URLs.
            wget_args.extend(
                url_prefix + row.strip()
                for row in handle
                if not row.startswith('#')
            )

        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            wget_args.extend(['--bind-address', address])
            banner = '*** Wget will bind address at {0} ***'.format(address)
            print('')
            print(banner)
            print('')

        return realize(wget_args, item)
Пример #57
0
    def realize(self, item):
        """Build the wget-at argument list for a batch of bintray items.

        ``item['item_name']`` holds one or more item names separated by NUL
        characters, each of the form ``<type>:<value>``.  A newline-separated
        copy is stored on ``item`` as ``item_name_newline``.  Every item name
        adds an ``x-wget-at-project-item-name`` WARC header and an
        ``item-name://`` pseudo-URL, followed by type-specific arguments:

        * ``user``: a ``bintray-user`` WARC header and the user's bintray
          URL, both without and with a trailing slash.
        * ``file``: a ``bintray-file`` WARC header and the value itself,
          which must start with ``http``.

        If a module-level global named ``bind_address`` exists, a
        ``--bind-address`` option is added and a notice is printed.

        Returns:
            Whatever the free function ``realize`` returns for the finished
            argument list and ``item``.

        Raises:
            AssertionError: for the item names ``user:account`` and
                ``user:assets``, or a ``file`` value not starting with
                ``http``.
            ValueError: if an item type is neither ``user`` nor ``file``, or
                an item name contains no ``:``.
        """
        wget_args = [
            WGET_AT, '-U', USER_AGENT, '-nv', '--content-on-error',
            '--load-cookies', 'cookies.txt', '--lua-script', 'bintray.lua',
            '-o',
            ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate', '--output-document',
            ItemInterpolation('%(item_dir)s/wget.tmp'), '--truncate-output',
            '-e', 'robots=off', '--rotate-dns', '--recursive', '--level=inf',
            # NOTE(review): 'voat.co' looks like a leftover from another
            # project; confirm the intended --domains value.
            '--no-parent', '--page-requisites', '--timeout', '30', '--tries',
            'inf', '--domains', 'voat.co', '--span-hosts', '--waitretry', '30',
            '--warc-file',
            ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team', '--warc-header',
            'x-wget-at-project-version: ' + VERSION, '--warc-header',
            'x-wget-at-project-name: ' + TRACKER_ID,
            '--warc-dedup-url-agnostic'
        ]

        item_names = item['item_name'].split('\0')
        item['item_name_newline'] = item['item_name'].replace('\0', '\n')

        for item_name in item_names:
            assert item_name not in {'user:account', 'user:assets'
                                     }, 'Doing this out of caution'
            wget_args.extend(
                ['--warc-header', 'x-wget-at-project-item-name: ' + item_name])
            wget_args.append('item-name://' + item_name)
            item_type, item_value = item_name.split(':', 1)
            if item_type == 'user':
                wget_args.extend(
                    ['--warc-header', 'bintray-user: ' + item_value])
                # Both URL forms are queued: without and with trailing slash.
                wget_args.append(f'https://bintray.com/{item_value}')
                wget_args.append(f'https://bintray.com/{item_value}/')
            elif item_type == 'file':
                wget_args.extend(
                    ['--warc-header', 'bintray-file: ' + item_value])
                assert item_value.startswith(
                    "http"), "If this fails, something strange has happened"
                wget_args.append(item_value)
            else:
                raise ValueError('item_type not supported.')

        if 'bind_address' in globals():
            wget_args.extend(['--bind-address', globals()['bind_address']])
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)