def realize(self, item):
    """Assemble the wget-lua command line for a ``list:<name>`` item.

    The item name is split on its first ``:`` into a type and a value, both
    of which are stored back on ``item``. For a ``list`` item the URL list is
    fetched over HTTP and every acceptable URL is queued together with its
    own ``tumblr-static-url`` WARC header.

    Args:
        item: Pipeline item; ``item['item_name']`` is read and
            ``item['item_type']`` / ``item['item_value']`` are written.

    Returns:
        The result of the module-level ``realize(wget_args, item)`` call.

    Raises:
        AssertionError: If the item name contains no ``:``.
        Exception: If the list request does not return HTTP 200 or the item
            type is not ``list``.
    """
    wget_args = [
        WGET_LUA,
        '-U', USER_AGENT,
        '-nv',
        '--no-cookies',
        '--lua-script', 'tumblr-static.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'tumblr-static-dld-script-version: ' + VERSION,
        '--warc-header', ItemInterpolation('tumblr-static-item: %(item_name)s'),
    ]

    full_name = item['item_name']
    assert ':' in full_name
    kind, value = full_name.split(':', 1)
    item['item_type'] = kind
    item['item_value'] = value

    if kind != 'list':
        raise Exception('Unknown item')

    response = requests.get(
        'http://grafana.fvz.io/.well-known/at/{}'.format(value))
    if response.status_code != 200:
        raise Exception('Could not get URLs list from github.')

    for raw_line in response.text.splitlines():
        line = raw_line.strip()
        # A line containing "%20" yields each piece on its own plus the
        # whole line with every "%20" removed; otherwise just the line.
        candidates = line.split('%20')
        if len(candidates) > 1:
            candidates.append(line.replace('%20', ''))
        for candidate in candidates:
            acceptable = (
                len(candidate) != 0
                and re.search(r'^https?://[^/]+/', candidate)
                and 'www.tumblr.com' not in candidate
            )
            if acceptable:
                wget_args += [
                    '--warc-header',
                    'tumblr-static-url: {}'.format(candidate),
                    candidate,
                ]

    if 'bind_address' in globals():
        bind_address = globals()['bind_address']
        wget_args += ['--bind-address', bind_address]
        print('')
        print('*** Wget will bind address at {0} ***'.format(bind_address))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Assemble the wget-at command line for crawling a single domain.

    The item name is used verbatim as the domain. Its ``http`` and ``https``
    roots are queued, and the ``www.`` variants too when the name contains
    exactly one dot.

    Args:
        item: Pipeline item; only ``item['item_name']`` is read.

    Returns:
        The result of the module-level ``realize(wget_args, item)`` call.
    """
    wget_args = [
        WGET_AT,
        '-U', USER_AGENT,
        '-nv',
        '--content-on-error',
        '--lua-script', 'domains.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'domains-dld-script-version: ' + VERSION,
        '--warc-header', ItemInterpolation('domains-item: %(item_name)s'),
        '--warc-dedup-url-agnostic',
    ]

    domain = item['item_name']
    wget_args += ['--domains', domain]
    wget_args += ['--warc-header', 'domain: ' + domain]

    seed_templates = ['http://{}/', 'https://{}/']
    if domain.count('.') == 1:
        # Exactly one dot: also try the "www." host.
        seed_templates += ['http://www.{}/', 'https://www.{}/']
    wget_args.extend(template.format(domain) for template in seed_templates)

    if 'bind_address' in globals():
        bind_address = globals()['bind_address']
        wget_args += ['--bind-address', bind_address]
        print('')
        print('*** Wget will bind address at {0} ***'.format(bind_address))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Assemble the wget-at command line for a Mercurial repository item.

    The item name is taken as the repository URL, copied to
    ``item['item_value']``, and queued with ``?cmd=capabilities`` appended.

    Args:
        item: Pipeline item; ``item['item_name']`` is read and
            ``item['item_value']`` is written.

    Returns:
        The result of the module-level ``realize(wget_args, item)`` call.
    """
    wget_args = [
        WGET_AT,
        '-U', USER_AGENT,
        '-nv',
        '--no-cookies',
        '--content-on-error',
        '--lua-script', 'mercurial.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file',
        ItemInterpolation('%(item_dir)s/%(warc_file_base)s-main'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'mercurial-dld-script-version: ' + VERSION,
        '--warc-header', ItemInterpolation('mercurial-item: %(item_name)s'),
        '--warc-dedup-url-agnostic',
    ]

    repository = item['item_name']
    item['item_value'] = repository

    wget_args += [
        '--warc-header', 'mercurial-repository: ' + str(repository),
        '--warc-header', 'warc-type: main',
        repository + '?cmd=capabilities',
    ]

    if 'bind_address' in globals():
        bind_address = globals()['bind_address']
        wget_args += ['--bind-address', bind_address]
        print('')
        print('*** Wget will bind address at {0} ***'.format(bind_address))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Assemble the wpull command line for a newsgrabber URL-list item.

    The item name is split on its first ``:`` into a type and a value, both
    stored on ``item``. The value names a list fetched from
    ``http://master.newsbuddy.net/``; when that request returns HTTP 200,
    every line of the body (stripped) is queued as a URL. Any other status
    queues nothing and raises nothing.

    Args:
        item: Pipeline item; ``item['item_name']`` is read and
            ``item['item_type']`` / ``item['item_value']`` are written.

    Returns:
        The result of the module-level ``realize(wpull_args, item)`` call.

    Raises:
        ValueError: If the item name contains no ``:`` (unpacking fails).
    """
    kind, value = item['item_name'].split(':', 1)
    item['item_type'] = kind
    item['item_value'] = value

    wpull_args = [
        WPULL_EXE,
        '-nv',
        '-U', 'ArchiveTeam; Googlebot/2.1',
        '--no-check-certificate',
        '--no-robots',
        '--dns-timeout', '20',
        '--connect-timeout', '20',
        '--read-timeout', '900',
        '--session-timeout', '1800',
        '--tries', '5',
        '--waitretry', '5',
        '--max-redirect', '20',
        '--output-file', ItemInterpolation("%(item_dir)s/wpull.log"),
        '--database', ItemInterpolation("%(item_dir)s/wpull.db"),
        '--delete-after',
        '--page-requisites',
        '--no-parent',
        '--concurrent', '5',
        '--warc-file', ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        '--level', '0',
        '--page-requisites-level', '5',
        '--span-hosts-allow', 'page-requisites',
        '--warc-header', 'pipeline-py-sha256: ' + PIPELINE_SHA256,
        '--warc-header',
        'warrior-install-sh-sha256: ' + WARRIOR_INSTALL_SHA256,
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'newsgrabber-dld-script-version: ' + VERSION,
        '--warc-header', ItemInterpolation('ftp-item: %(item_name)s'),
        '--reject-regex',
        r'^https?://launcher\.spot\.im/spot/(www\.spot\.im/launcher/|launcher\.spot\.im/|modules/launcher/){3,}bundle\.js$',
    ]

    if '-videos' in value:
        wpull_args += ['--youtube-dl', '--youtube-dl-exe', YOUTUBE_DL_EXE]

    # The list URL itself is not queued; only the URLs it contains are.
    listing = requests.get('http://master.newsbuddy.net/' + value)
    if listing.status_code == 200:
        wpull_args.extend(
            entry.strip() for entry in listing.text.splitlines())

    if 'bind_address' in globals():
        bind_address = globals()['bind_address']
        wpull_args += ['--bind-address', bind_address]
        print('')
        print('*** Wpull will bind address at {0} ***'.format(bind_address))
        print('')

    return realize(wpull_args, item)
def realize(self, item):
    """Assemble the wget-lua command line for a ``ffaddon:<id>`` item.

    The item name is split on its first ``:`` into a type and a value, both
    stored on ``item``. For an ``ffaddon`` item the en-US add-on page and
    the v3 and v4 services API endpoints are queued.

    Args:
        item: Pipeline item; ``item['item_name']`` is read and
            ``item['item_type']`` / ``item['item_value']`` are written.

    Returns:
        The result of the module-level ``realize(wget_args, item)`` call.

    Raises:
        AssertionError: If the item name contains no ``:``.
        Exception: If the item type is not ``ffaddon``.
    """
    wget_args = [
        WGET_LUA,
        '-U', USER_AGENT,
        '-nv',
        '--no-cookies',
        '--lua-script', 'firefox-addons.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--domains', 'mozilla.org',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'firefox-addons-dld-script-version: ' + VERSION,
        '--warc-header',
        ItemInterpolation('firefox-addons-item: %(item_name)s'),
    ]

    full_name = item['item_name']
    assert ':' in full_name
    kind, value = full_name.split(':', 1)
    item['item_type'] = kind
    item['item_value'] = value

    if kind != 'ffaddon':
        raise Exception('Unknown item')

    wget_args += [
        '--warc-header', 'firefox-addon-identifier: {}'.format(value)]
    # Only these three seeds are queued; the other page variants
    # (/en/, un-localised, and the various ?src=... forms) are disabled.
    seed_templates = (
        'https://addons.mozilla.org/en-US/firefox/addon/{}/',
        'https://services.addons.mozilla.org/api/v3/addons/addon/{}/',
        'https://services.addons.mozilla.org/api/v4/addons/addon/{}/',
    )
    wget_args.extend(template.format(value) for template in seed_templates)

    if 'bind_address' in globals():
        bind_address = globals()['bind_address']
        wget_args += ['--bind-address', bind_address]
        print('')
        print('*** Wget will bind address at {0} ***'.format(bind_address))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Assemble the wget-at command line for a batch of Parler items.

    ``item['item_name']`` holds one or more ``<type>:<value>`` names
    separated by NUL characters. Each name gets an
    ``x-wget-at-project-item-name`` WARC header and an ``item-name://``
    pseudo-URL, followed by the real seed URL for its type (``post``,
    ``profile`` or ``url``). The zstd dictionary returned by
    ``ZstdDict.get_dict()`` is written to ``<item_dir>/zstdict`` and passed
    to wget.

    Args:
        item: Pipeline item; ``item['item_name']`` and ``item['item_dir']``
            are read, and ``item['dict_id']``, ``item['dict_project']`` and
            ``item['item_name_newline']`` are written.

    Returns:
        The result of the module-level ``realize(wget_args, item)`` call.

    Raises:
        ValueError: If a name has no ``:`` or an unsupported type.
    """
    wget_args = [
        WGET_AT,
        '-U', USER_AGENT,
        '-nv',
        '--content-on-error',
        '--lua-script', 'parler.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--domains', 'parler.com',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'x-wget-at-project-version: ' + VERSION,
        '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID,
        '--warc-dedup-url-agnostic',
        '--warc-compression-use-zstd',
        '--warc-zstd-dict-no-include',
    ]

    # Store the compression dictionary next to the item's other files.
    dict_data = ZstdDict.get_dict()
    with open(os.path.join(item['item_dir'], 'zstdict'), 'wb') as f:
        f.write(dict_data['dict'])
    item['dict_id'] = dict_data['id']
    item['dict_project'] = TRACKER_ID
    wget_args.extend([
        '--warc-zstd-dict', ItemInterpolation('%(item_dir)s/zstdict'),
    ])

    for item_name in item['item_name'].split('\0'):
        wget_args.extend(
            ['--warc-header', 'x-wget-at-project-item-name: ' + item_name])
        wget_args.append('item-name://' + item_name)
        item_type, item_value = item_name.split(':', 1)
        if item_type == 'post':
            wget_args.extend(
                ['--warc-header', 'parler-post: {}'.format(item_value)])
            wget_args.append('https://parler.com/post/{}'.format(item_value))
        elif item_type == 'profile':
            # Fixed: this branch used to emit a "parler-post" header, which
            # recorded profiles as posts in the WARC metadata.
            wget_args.extend(
                ['--warc-header', 'parler-profile: {}'.format(item_value)])
            wget_args.append(
                'https://parler.com/profile/{}'.format(item_value))
        elif item_type == 'url':
            wget_args.append(item_value)
        else:
            raise ValueError('item_type not supported.')

    item['item_name_newline'] = item['item_name'].replace('\0', '\n')

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Assemble the wget-at command line for a single Pastebin paste.

    The item name is used as the paste identifier. When it is longer than
    eight characters, the text ``b36.`` is removed, the remainder is parsed
    as a base-36 integer and converted with ``self.int_to_str``. The zstd
    dictionary returned by ``ZstdDict.get_dict()`` is written to
    ``<item_dir>/zstdict`` and passed to wget.

    Args:
        item: Pipeline item; ``item['item_name']`` and ``item['item_dir']``
            are read, and ``item['dict_id']``, ``item['dict_project']`` and
            ``item['item_value']`` are written.

    Returns:
        The result of the module-level ``realize(wget_args, item)`` call.
    """
    wget_args = [
        WGET_AT,
        '-U', USER_AGENT,
        '-nv',
        '--no-cookies',
        '--content-on-error',
        '--lua-script', 'pastebin.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--domains', 'pastebin.com',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'pastebin-dld-script-version: ' + VERSION,
        '--warc-header', ItemInterpolation('pastebin-item: %(item_name)s'),
        '--warc-dedup-url-agnostic',
        '--warc-compression-use-zstd',
        '--warc-zstd-dict-no-include',
    ]

    zstd_info = ZstdDict.get_dict()
    dict_path = os.path.join(item['item_dir'], 'zstdict')
    with open(dict_path, 'wb') as dict_file:
        dict_file.write(zstd_info['dict'])
    item['dict_id'] = zstd_info['id']
    item['dict_project'] = TRACKER_ID
    wget_args += [
        '--warc-zstd-dict', ItemInterpolation('%(item_dir)s/zstdict')]

    raw_name = item['item_name']
    if len(raw_name) > 8:
        # Long names carry a base-36 encoded number.
        paste_id = self.int_to_str(int(raw_name.replace('b36.', ''), 36))
    else:
        paste_id = raw_name
    item['item_value'] = paste_id

    wget_args += ['--warc-header', 'pastebin-paste: ' + str(paste_id)]
    wget_args.append('https://pastebin.com/{}'.format(paste_id))

    if 'bind_address' in globals():
        bind_address = globals()['bind_address']
        wget_args += ['--bind-address', bind_address]
        print('')
        print('*** Wget will bind address at {0} ***'.format(bind_address))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Assemble the wget-lua command line for a Sony Sketch item.

    The item name is split on its first ``:`` into a type and a value, both
    stored on ``item``. A ``sketches`` item fetches a newline-separated id
    list over HTTP and queues one shared-sketch API URL per non-empty line.
    A ``user`` item queues the artist API URL for the value.

    Args:
        item: Pipeline item; ``item['item_name']`` is read and
            ``item['item_type']`` / ``item['item_value']`` are written.

    Returns:
        The result of the module-level ``realize(wget_args, item)`` call.

    Raises:
        AssertionError: If the item name contains no ``:``.
        Exception: If the item type is neither ``sketches`` nor ``user``.
    """
    wget_args = [
        WGET_LUA,
        '-U', USER_AGENT,
        '-nv',
        '--no-cookies',
        '--lua-script', 'sketch.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--domains', 'sonymobile.com',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'sketch-dld-script-version: ' + VERSION,
        '--warc-header', ItemInterpolation('sketch-item: %(item_name)s'),
        '--header', 'Accept-Encoding: gzip',
        '--compression', 'gzip',
    ]

    item_name = item['item_name']
    assert ':' in item_name
    item_type, item_value = item_name.split(':', 1)
    item['item_type'] = item_type
    item['item_value'] = item_value

    http_client = httpclient.HTTPClient()
    try:
        if item_type == 'sketches':
            r = http_client.fetch(
                'http://103.230.141.2/sketch/' + item_value, method='GET')
            for s in r.body.decode('utf-8', 'ignore').splitlines():
                s = s.strip()
                if not s:
                    continue
                wget_args.extend(
                    ['--warc-header', 'sketch-sketch-id: {}'.format(s)])
                wget_args.append(
                    'https://sketch.sonymobile.com/api/1/sharedsketch/{}'
                    .format(s))
        elif item_type == 'user':
            # Fixed: the template had no "{}" placeholder, so the header
            # was always the bare "sketch-user-id: " with no id.
            wget_args.extend(
                ['--warc-header', 'sketch-user-id: {}'.format(item_value)])
            wget_args.append(
                'https://sketch.sonymobile.com/api/1/artist/{}'.format(
                    item_value))
        else:
            raise Exception('Unknown item')
    finally:
        # Close the client even when the fetch fails or the type is unknown.
        http_client.close()

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Assemble the wget-lua command line for a codebender sketch range.

    The item name is split on its first ``:`` into a type and a value, both
    stored on ``item``. A ``sketches`` value has the form ``<start>-<stop>``;
    every id in that inclusive range gets a ``codebender-sketch`` WARC
    header and four seed URLs.

    Args:
        item: Pipeline item; ``item['item_name']`` is read and
            ``item['item_type']`` / ``item['item_value']`` are written.

    Returns:
        The result of the module-level ``realize(wget_args, item)`` call.

    Raises:
        AssertionError: If the item name has no ``:`` or the type is neither
            ``sketches`` nor ``user``.
        Exception: If the type is not ``sketches`` (``user`` passes the
            assertion but has no handler).
    """
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        # Cookies are loaded from cookies.txt; "--no-cookies" is disabled.
        "--load-cookies", "cookies.txt",
        "--lua-script", "codebender.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "codebender.cc",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "codebender-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("codebender-item: %(item_name)s"),
    ]

    full_name = item['item_name']
    assert ':' in full_name
    kind, value = full_name.split(':', 1)
    item['item_type'] = kind
    item['item_value'] = value

    assert kind in ('sketches', 'user')

    if kind != 'sketches':
        raise Exception('Unknown item')

    first, last = value.split('-')
    seed_templates = (
        'https://codebender.cc/sketch:{i}',
        'https://codebender.cc/sketch:{i}?noCookies=true',
        'https://codebender.cc/utilities/download/{i}',
        'https://codebender.cc/utilities/download/{i}?noCookies=true',
    )
    for sketch_id in range(int(first), int(last) + 1):
        wget_args += [
            '--warc-header', 'codebender-sketch: {i}'.format(i=sketch_id)]
        wget_args.extend(
            template.format(i=sketch_id) for template in seed_templates)

    if 'bind_address' in globals():
        bind_address = globals()['bind_address']
        wget_args += ['--bind-address', bind_address]
        print('')
        print('*** Wget will bind address at {0} ***'.format(bind_address))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Assemble the wget-at command line for So-net U-Page+ user directories.

    ``item['item_name']`` holds one or more NUL-separated names. Every
    ``http://`` occurrence is removed from each name, which is then split on
    its first ``:``. For a ``userdir`` item the value is read as
    ``<hostname>/<dir>`` and four spellings of the directory URL are queued.

    Args:
        item: Pipeline item; ``item['item_name']`` is read and
            ``item['item_name_newline']`` is written.

    Returns:
        The result of the module-level ``realize(wget_args, item)`` call.

    Raises:
        ValueError: If a name has no ``:`` or a type other than ``userdir``.
        IndexError: If a ``userdir`` value contains no ``/``.
    """
    wget_args = [
        WGET_AT,
        '-U', USER_AGENT,
        '-nv',
        '--content-on-error',
        '--lua-script', 'so-net-u-page-plus.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--domains', 'upp.so-net.ne.jp',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'x-wget-at-project-version: ' + VERSION,
        '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID,
        '--warc-dedup-url-agnostic',
    ]

    # Several spellings of the same directory are queued because it is not
    # known which one the server answers on.
    seed_templates = (
        'http://{}.upp.so-net.ne.jp/{}/',
        'http://{}.upp.so-net.ne.jp/{}/index.htm',
        'http://{}.upp.so-net.ne.jp/{}/index.html',
        'http://{}.upp.so-net.ne.jp/{}',
    )

    for raw_name in item['item_name'].split('\0'):
        name = raw_name.replace('http://', '')
        wget_args += [
            '--warc-header', 'x-wget-at-project-item-name: ' + name]
        wget_args.append('item-name://' + name)

        kind, value = name.split(':', 1)
        if kind != "userdir":
            raise ValueError('item_type not supported.')

        wget_args += ['--warc-header', 'so-net-u-page-plus-userdir: ' + value]
        path_parts = value.split("/")
        hostname = path_parts[0]
        user_dir_name = path_parts[1]
        wget_args.extend(
            template.format(hostname, user_dir_name)
            for template in seed_templates)

    item['item_name_newline'] = item['item_name'].replace('\0', '\n')

    if 'bind_address' in globals():
        bind_address = globals()['bind_address']
        wget_args += ['--bind-address', bind_address]
        print('')
        print('*** Wget will bind address at {0} ***'.format(bind_address))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Assemble the wget-lua command line for a Coursera ``oldcourse`` item.

    The item name is split on its first ``:`` into a type and a value, both
    stored on ``item``. For an ``oldcourse`` item a login request is made
    with a separate wget-lua call that saves its cookies to ``cookies.txt``.
    The downloaded ``v3`` response file is then deleted and the course page
    is queued.

    Args:
        item: Pipeline item; ``item['item_name']`` is read and
            ``item['item_type']`` / ``item['item_value']`` are written.

    Returns:
        The result of the module-level ``realize(wget_args, item)`` call.

    Raises:
        AssertionError: If the item name has no ``:`` or the type is not
            ``oldcourse``.
        Exception: If the type is not ``oldcourse`` and assertions are
            disabled.
        OSError: If the ``v3`` file cannot be removed.
    """
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--load-cookies", "cookies.txt",
        "--lua-script", "coursera.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "coursera.org",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "coursera-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("coursera-user: %(item_name)s"),
    ]

    item_name = item['item_name']
    assert ':' in item_name
    item_type, item_value = item_name.split(':', 1)
    item['item_type'] = item_type
    item['item_value'] = item_value

    # Fixed: ('oldcourse') is a plain string, so the old check was a
    # substring test ('old', 'course' and '' all passed). The trailing comma
    # makes it a one-element tuple.
    assert item_type in ('oldcourse',)

    if item_type != 'oldcourse':
        raise Exception('Unknown item')

    # Random digit strings used as CSRF cookie/header values for the login.
    csrf_token = ''.join(random.choice(string.digits) for _ in range(20))
    csrf2_cookie = 'csrf2_token_' + ''.join(
        random.choice(string.digits) for _ in range(8))
    csrf2_token = ''.join(random.choice(string.digits) for _ in range(24))
    cookie = "csrftoken=%s; %s=%s" % (csrf_token, csrf2_cookie, csrf2_token)

    # The shell string is built only from WGET_LUA and the digit strings
    # above; no item data is interpolated into it.
    # NOTE(review): the "[email protected]" text in --post-data looks like a
    # scraping/redaction artefact of the original credentials - confirm.
    os.system(
        WGET_LUA
        + " --save-cookies cookies.txt --keep-session-cookies --post-data '[email protected]&password=123456&webrequest=true' --header='Cookie: "
        + cookie
        + "' --header='X-CSRFToken: "
        + csrf_token
        + "' --header='X-CSRF2-Cookie: "
        + csrf2_cookie
        + "' --header='X-CSRF2-Token: "
        + csrf2_token
        + "' https://www.coursera.org/api/login/v3")
    # The login response is saved as "v3"; only the cookies are needed.
    os.remove('v3')

    wget_args.append('https://www.coursera.org/course/' + item_value)

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Assemble the wpull command line for crawling one domain.

    The item name is used verbatim as the domain and ``http://<domain>`` is
    queued as the only seed URL.

    Args:
        item: Pipeline item; only ``item['item_name']`` is read.

    Returns:
        The result of the module-level ``realize(wget_args, item)`` call.
    """
    wget_args = [
        WPULL_EXE,
        "-nv",
        # Disabled: "--user-agent", USER_AGENT
        "--python-script", "examplecity.py",
        "-o", ItemInterpolation("%(item_dir)s/wpull.log"),
        "--no-check-certificate",
        "--database", ItemInterpolation("%(item_dir)s/wpull.db"),
        "--delete-after",
        "--no-robots",
        "--no-cookies",
        "--rotate-dns",
        # Depth is limited to 2; "--level=inf" is disabled.
        "--recursive", "--level=2",
        "--no-parent",
        "--page-requisites",
        "--span-hosts-allow", "page-requisites,linked-pages",
        "--timeout", "30",
        "--tries", "2",
        "--wait", "0.5",
        "--random-wait",
        "--waitretry", "5",
        # Disabled: "--domains" and "--hostnames" restrictions.
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "examplecity-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("examplecity-user: %(item_name)s"),
    ]

    wget_args.append("http://{0}".format(item['item_name']))

    if 'bind_address' in globals():
        bind_address = globals()['bind_address']
        wget_args += ['--bind-address', bind_address]
        print('')
        print('*** Wget will bind address at {0} ***'.format(bind_address))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Assemble the wget-lua command line for a plays.tv item.

    The item name is split on its first ``:`` into a type and a value, both
    stored on ``item``. A ``user`` item queues the user API URL. A ``video``
    item treats the value as a ``;``-separated id list, prints each id and
    queues its media API URL.

    Args:
        item: Pipeline item; ``item['item_name']`` is read and
            ``item['item_type']`` / ``item['item_value']`` are written.

    Returns:
        The result of the module-level ``realize(wget_args, item)`` call.

    Raises:
        AssertionError: If the item name contains no ``:``.
        Exception: If the item type is neither ``user`` nor ``video``.
    """
    wget_args = [
        WGET_LUA,
        '-U', USER_AGENT,
        '-nv',
        '--no-cookies',
        '--content-on-error',
        '--lua-script', 'playstv.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--domains', 'plays.tv',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'playstv-dld-script-version: ' + VERSION,
        '--warc-header', ItemInterpolation('playstv-item: %(item_name)s'),
    ]

    full_name = item['item_name']
    assert ':' in full_name
    kind, value = full_name.split(':', 1)
    item['item_type'] = kind
    item['item_value'] = value

    if kind == 'user':
        wget_args += [
            '--warc-header', 'playstv-user-id: ' + value,
            'https://plays.tv/playsapi/usersys/v1/user/' + value,
        ]
    elif kind == 'video':
        for video_id in value.split(';'):
            print(video_id)
            wget_args += [
                '--warc-header', 'playstv-video-id: ' + video_id,
                'https://plays.tv/playsapi/feedsys/v1/media/' + video_id,
            ]
    else:
        raise Exception('Unknown item')

    if 'bind_address' in globals():
        bind_address = globals()['bind_address']
        wget_args += ['--bind-address', bind_address]
        print('')
        print('*** Wget will bind address at {0} ***'.format(bind_address))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Assemble the wget-at command line for Super Mario Maker Bookmark items.

    ``item['item_name']`` holds one or more NUL-separated
    ``<type>:<value>`` names. Each name gets an
    ``x-wget-at-project-item-name`` WARC header and an ``item-name://``
    pseudo-URL, followed by the profile URL (``user``) or the course URL
    (``course``).

    Args:
        item: Pipeline item; ``item['item_name']`` is read and
            ``item['item_name_newline']`` is written.

    Returns:
        The result of the module-level ``realize(wget_args, item)`` call.

    Raises:
        ValueError: If a name has no ``:`` or an unsupported type.
    """
    wget_args = [
        WGET_AT,
        '-U', USER_AGENT,
        '-nv',
        '--content-on-error',
        '--load-cookies', 'cookies.txt',
        '--lua-script', 'super-mario-maker-bookmarks.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        # NOTE(review): the seeds live on supermariomakerbookmark.nintendo.net,
        # so 'voat.co' looks copied from another project - confirm.
        '--domains', 'voat.co',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'x-wget-at-project-version: ' + VERSION,
        '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID,
        '--warc-dedup-url-agnostic',
    ]

    item_names = item['item_name'].split('\0')
    item['item_name_newline'] = item['item_name'].replace('\0', '\n')

    for item_name in item_names:
        wget_args.extend(
            ['--warc-header', 'x-wget-at-project-item-name: ' + item_name])
        wget_args.append('item-name://' + item_name)
        item_type, item_value = item_name.split(':', 1)
        if item_type == 'user':
            # Rebuilt: this branch was garbled in the source (the header
            # value and the append call had been fused together). It now
            # follows the same shape as the 'course' branch below.
            wget_args.extend([
                '--warc-header',
                'super-mario-world-bookmarks-user: ' + item_value,
            ])
            wget_args.append(
                'https://supermariomakerbookmark.nintendo.net/profile/'
                + item_value)
        elif item_type == 'course':
            wget_args.extend([
                '--warc-header',
                'super-mario-world-bookmarks-course: ' + item_value,
            ])
            wget_args.append(
                'https://supermariomakerbookmark.nintendo.net/courses/'
                + item_value)
        else:
            raise ValueError('item_type not supported.')

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Assemble the wget-at command line for a Bitbucket ``hg:<repo>`` item.

    The item name is split on its first ``:`` into a type and a value, both
    stored on ``item``. For an ``hg`` item the repository page, two ``/src``
    views and the 2.0 API endpoint are queued. The zstd dictionary returned
    by ``ZstdDict.get_dict()`` is written to ``<item_dir>/zstdict`` and
    passed to wget.

    Args:
        item: Pipeline item; ``item['item_name']`` and ``item['item_dir']``
            are read, and ``item['dict_id']``, ``item['dict_project']``,
            ``item['item_type']`` and ``item['item_value']`` are written.

    Returns:
        The result of the module-level ``realize(wget_args, item)`` call.

    Raises:
        ValueError: If the item name contains no ``:`` (unpacking fails).
        Exception: If the item type is not ``hg``.
    """
    wget_args = [
        WGET_AT,
        '-U', USER_AGENT,
        '-nv',
        '--no-cookies',
        '--content-on-error',
        '--lua-script', 'bitbucket.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        # NOTE(review): the seeds below use bitbucket.org, not
        # bitbucket.com - confirm this value is intended.
        '--domains', 'bitbucket.com',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'bitbucket-dld-script-version: ' + VERSION,
        '--warc-header', ItemInterpolation('bitbucket-item: %(item_name)s'),
        '--warc-dedup-url-agnostic',
        '--warc-compression-use-zstd',
        '--warc-zstd-dict-no-include',
    ]

    zstd_info = ZstdDict.get_dict()
    dict_path = os.path.join(item['item_dir'], 'zstdict')
    with open(dict_path, 'wb') as dict_file:
        dict_file.write(zstd_info['dict'])
    item['dict_id'] = zstd_info['id']
    item['dict_project'] = TRACKER_ID
    wget_args += [
        '--warc-zstd-dict', ItemInterpolation('%(item_dir)s/zstdict')]

    kind, value = item['item_name'].split(':', 1)
    item['item_type'] = kind
    item['item_value'] = value

    if kind != 'hg':
        raise Exception('Unknown item')

    repo_root = 'https://bitbucket.org/' + value
    wget_args += [
        repo_root + '/src/',
        repo_root,
        repo_root + '/src/default/',
        'https://bitbucket.org/!api/2.0/repositories/' + value,
    ]

    if 'bind_address' in globals():
        bind_address = globals()['bind_address']
        wget_args += ['--bind-address', bind_address]
        print('')
        print('*** Wget will bind address at {0} ***'.format(bind_address))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Assemble the wget-lua command line for a gfycat ``disco:<prefix>`` item.

    The item name is split on its first ``:`` into a type and a value, both
    stored on ``item``. For a ``disco`` item, every line of the local file
    ``animals`` that does not start with ``#`` is stripped and appended to
    the API URL prefix plus the item value; each result is queued.

    Args:
        item: Pipeline item; ``item['item_name']`` is read and
            ``item['item_type']`` / ``item['item_value']`` are written.

    Returns:
        The result of the module-level ``realize(wget_args, item)`` call.

    Raises:
        AssertionError: If the item name contains no ``:``.
        Exception: If the item type is not ``disco``.
    """
    wget_args = [
        WGET_LUA,
        '-U', USER_AGENT,
        '-nv',
        '--no-cookies',
        '--content-on-error',
        '--lua-script', 'gfycat.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--domains', 'gfycat.com',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'gfycat-dld-script-version: ' + VERSION,
        '--warc-header', ItemInterpolation('gfycat-item: %(item_name)s'),
    ]

    full_name = item['item_name']
    assert ':' in full_name
    kind, value = full_name.split(':', 1)
    item['item_type'] = kind
    item['item_value'] = value

    if kind != 'disco':
        raise Exception('Unknown item')

    url_prefix = 'https://api.gfycat.com/v1/gfycats/' + value
    with open('animals', 'r') as word_file:
        # Lines starting with '#' are skipped; every other line, including
        # a blank one, contributes a URL.
        wget_args.extend(
            url_prefix + entry.strip()
            for entry in word_file
            if not entry.startswith('#'))

    if 'bind_address' in globals():
        bind_address = globals()['bind_address']
        wget_args += ['--bind-address', bind_address]
        print('')
        print('*** Wget will bind address at {0} ***'.format(bind_address))
        print('')

    return realize(wget_args, item)
def __init__(self):
    """Configure the "CookWARC" external process.

    The command runs ``warc2warc_greader.py`` from ``PIPELINE_DIR`` with the
    current Python interpreter. It reads ``<warc_file_base>.warc.gz`` from
    the item's data directory and is given ``.hrefs.bz2`` and
    ``.cooked.warc.gz`` paths for its outputs.
    """
    script_path = os.path.join(PIPELINE_DIR, "warc2warc_greader.py")
    hrefs_path = ItemInterpolation(
        "%(data_dir)s/%(warc_file_base)s.hrefs.bz2")
    cooked_path = ItemInterpolation(
        "%(data_dir)s/%(warc_file_base)s.cooked.warc.gz")
    source_path = ItemInterpolation(
        "%(data_dir)s/%(warc_file_base)s.warc.gz")

    command = [
        sys.executable,
        script_path,
        "--gzip",
        "--decode_http",
        "--strip-404s",
        "--json-hrefs-file", hrefs_path,
        "--output", cooked_path,
        source_path,
    ]
    ExternalProcess.__init__(self, "CookWARC", command)
def realize(self, item):
    """Assemble the wget-at command line for a batch of Google Poly items.

    ``item['item_name']`` holds one or more NUL-separated
    ``<type>:<value>`` names. Each name gets an
    ``x-wget-at-project-item-name`` WARC header and an ``item-name://``
    pseudo-URL, followed by the view URL (``poly``) or the user URL
    (``user``).

    Args:
        item: Pipeline item; ``item['item_name']`` is read and
            ``item['item_name_newline']`` is written.

    Returns:
        The result of the module-level ``realize(wget_args, item)`` call.

    Raises:
        ValueError: If a name has no ``:`` or an unsupported type.
    """
    wget_args = [
        WGET_AT,
        '-U', USER_AGENT,
        '-nv',
        '--content-on-error',
        '--lua-script', 'google-poly.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--domains', 'poly.google.com',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'x-wget-at-project-version: ' + VERSION,
        '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID,
        '--warc-dedup-url-agnostic',
        '--header', 'Accept-Language: en-US;q=0.9, en;q=0.8',
    ]

    for item_name in item['item_name'].split('\0'):
        wget_args.extend(
            ['--warc-header', 'x-wget-at-project-item-name: ' + item_name])
        wget_args.append('item-name://' + item_name)
        item_type, item_value = item_name.split(':', 1)
        if item_type == 'poly':
            wget_args.extend(
                ['--warc-header', 'google-poly-item: ' + item_value])
            wget_args.append('https://poly.google.com/view/' + item_value)
        elif item_type == 'user':
            # Rebuilt: this branch was garbled in the source (the header
            # value and the append call had been fused together). It now
            # follows the same shape as the 'poly' branch above.
            wget_args.extend(
                ['--warc-header', 'google-poly-user: ' + item_value])
            wget_args.append('https://poly.google.com/user/' + item_value)
        else:
            raise ValueError('item_type not supported.')

    item['item_name_newline'] = item['item_name'].replace('\0', '\n')

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Assemble the wget-lua command line for a vid.me ``video:<id>`` item.

    The item name is split on its first ``:`` into a type and a value, both
    stored on ``item``. For a ``video`` item the video, upnext, likes and
    comments API endpoints are queued.

    Args:
        item: Pipeline item; ``item['item_name']`` is read and
            ``item['item_type']`` / ``item['item_value']`` are written.

    Returns:
        The result of the module-level ``realize(wget_args, item)`` call.

    Raises:
        AssertionError: If the item name contains no ``:``.
        Exception: If the item type is not ``video``.
    """
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--no-cookies",
        "--content-on-error",
        "--lua-script", "vidme.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "vid.me",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "vidme-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("vidme-item: %(item_name)s"),
    ]

    full_name = item['item_name']
    assert ':' in full_name
    kind, value = full_name.split(':', 1)
    item['item_type'] = kind
    item['item_value'] = value

    if kind != 'video':
        raise Exception('Unknown item')

    wget_args += ['--warc-header', 'vidme-video-id: {i}'.format(i=value)]
    seed_templates = (
        'https://api.vid.me/video/{i}',
        'https://api.vid.me/video/{i}/upnext',
        'https://api.vid.me/video/{i}/likes?offset=0&limit=10',
        'https://api.vid.me/video/{i}/comments?offsetAtParentLevel=true&order=score&offset=0&limit=20',
    )
    wget_args.extend(template.format(i=value) for template in seed_templates)

    if 'bind_address' in globals():
        bind_address = globals()['bind_address']
        wget_args += ['--bind-address', bind_address]
        print('')
        print('*** Wget will bind address at {0} ***'.format(bind_address))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Assemble the wget-at command line for a batch of Bintray items.

    ``item['item_name']`` holds one or more NUL-separated
    ``<type>:<value>`` names. Each name gets an
    ``x-wget-at-project-item-name`` WARC header and an ``item-name://``
    pseudo-URL. A ``user`` item then queues the user page with and without a
    trailing slash. A ``file`` item queues its value, which must start with
    ``http``, as the URL.

    Args:
        item: Pipeline item; ``item['item_name']`` is read and
            ``item['item_name_newline']`` is written.

    Returns:
        The result of the module-level ``realize(wget_args, item)`` call.

    Raises:
        AssertionError: If a name is ``user:account`` or ``user:assets``, or
            a ``file`` value does not start with ``http``.
        ValueError: If a name has no ``:`` or an unsupported type.
    """
    wget_args = [
        WGET_AT,
        '-U', USER_AGENT,
        '-nv',
        '--content-on-error',
        '--load-cookies', 'cookies.txt',
        '--lua-script', 'bintray.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        # NOTE(review): the seeds live on bintray.com, so 'voat.co' looks
        # copied from another project - confirm.
        '--domains', 'voat.co',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'x-wget-at-project-version: ' + VERSION,
        '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID,
        '--warc-dedup-url-agnostic',
    ]

    item_names = item['item_name'].split('\0')
    item['item_name_newline'] = item['item_name'].replace('\0', '\n')

    for item_name in item_names:
        # These two names are refused outright as a precaution.
        assert item_name not in {'user:account', 'user:assets'}, \
            'Doing this out of caution'
        wget_args.extend(
            ['--warc-header', 'x-wget-at-project-item-name: ' + item_name])
        wget_args.append('item-name://' + item_name)
        item_type, item_value = item_name.split(':', 1)
        if item_type == 'user':
            # Rebuilt: this branch was garbled in the source (the header
            # value and the first append call had been fused together). It
            # now follows the same shape as the 'file' branch below.
            wget_args.extend(
                ['--warc-header', 'bintray-user: ' + item_value])
            wget_args.append(f'https://bintray.com/{item_value}')
            wget_args.append(f'https://bintray.com/{item_value}/')
        elif item_type == 'file':
            wget_args.extend(
                ['--warc-header', 'bintray-file: ' + item_value])
            assert item_value.startswith(
                "http"), "If this fails, something strange has happened"
            wget_args.append(item_value)
        else:
            raise ValueError('item_type not supported.')

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Assemble the wget-at command line for one mixer.com item.

    ``item['item_name']`` is split at the first ``:`` into ``item_type`` and
    ``item_value`` (both stored back on the item).  ``clip`` items queue the
    clips API URL; ``rec`` and ``rec-meta`` items queue the recordings API
    URL.

    Raises:
        ValueError: from the tuple unpacking when the name contains no ``:``.
        Exception: for any other item type.

    Returns:
        The result of ``realize(command, item)`` (a helper defined outside
        this block -- presumably the seesaw one; confirm against the file's
        imports).
    """
    command = [
        WGET_AT,
        '-U', USER_AGENT,
        '-nv',
        '--no-cookies',
        '--content-on-error',
        '--lua-script', 'mixer.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--domains', 'mixer.com',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'mixer-dld-script-version: ' + VERSION,
        '--warc-header', ItemInterpolation('mixer-item: %(item_name)s'),
        '--warc-dedup-url-agnostic',
    ]

    kind, value = item['item_name'].split(':', 1)
    item['item_type'] = kind
    item['item_value'] = value

    # item type -> (WARC header prefix, API URL prefix)
    recording = ('mixer-recording: ', 'https://mixer.com/api/v1/recordings/')
    targets = {
        'clip': ('mixer-clip: ', 'https://mixer.com/api/v1/clips/'),
        'rec': recording,
        'rec-meta': recording,
    }
    if kind not in targets:
        raise Exception('Unknown item')

    header_prefix, url_prefix = targets[kind]
    command += ['--warc-header', header_prefix + value]
    command.append(url_prefix + value)

    # An optional module-level ``bind_address`` pins the outgoing interface.
    module_scope = globals()
    if 'bind_address' in module_scope:
        address = module_scope['bind_address']
        command += ['--bind-address', address]
        print('')
        print('*** Wget will bind address at {0} ***'.format(address))
        print('')

    return realize(command, item)
def realize(self, item):
    """Assemble the wget command line for a tumblr blog item.

    ``item['item_name']`` is expected to look like
    ``tumblr-blog:<blog>[:<blog>...]``.  The type and the remainder are
    stored on the item as ``item_type`` / ``item_value``; for every blog
    name the front page and ``sitemap.xml`` are queued.

    Raises:
        AssertionError: if the item name contains no ``:``.
        Exception: if the item type is not ``tumblr-blog``.

    Returns:
        The result of ``realize(command, item)`` (a helper defined outside
        this block -- presumably the seesaw one; confirm against the file's
        imports).
    """
    command = [
        WGET_LUA,
        '-U', USER_AGENT,
        '-nv',
        '--no-cookies',
        '--lua-script', 'tumblr.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--domains', 'tumblr.com',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'tumblr-dld-script-version: ' + VERSION,
        '--warc-header', ItemInterpolation('tumblr-blog: %(item_name)s')
    ]

    name = item['item_name']
    assert ':' in name
    kind, value = name.split(':', 1)
    item['item_type'] = kind
    item['item_value'] = value

    if kind != 'tumblr-blog':
        raise Exception('Unknown item')

    # One item may bundle several blogs, separated by further colons.
    for blog in value.split(':'):
        command += [
            '--warc-header', 'tumblr-blog: ' + blog,
            'http://{}.tumblr.com/'.format(blog),
            'http://{}.tumblr.com/sitemap.xml'.format(blog),
        ]

    # An optional module-level ``bind_address`` pins the outgoing interface.
    module_scope = globals()
    if 'bind_address' in module_scope:
        address = module_scope['bind_address']
        command += ['--bind-address', address]
        print('')
        print('*** Wget will bind address at {0} ***'.format(address))
        print('')

    return realize(command, item)
def realize(self, item):
    """Assemble the wget command line for a batch of Sketch images.

    ``item['item_name']`` is split at the first ``:`` into ``item_type`` and
    ``item_value`` (both stored back on the item).  For ``sketches`` and
    ``tests`` items a list of ids is downloaded from the ``sketch-items``
    repository on raw.githubusercontent.com, and one image URL per non-blank
    line is queued.

    The HTTP client is always closed, including when the download fails.

    Raises:
        ValueError: from the tuple unpacking when the name contains no ``:``.
        Exception: if the item type is neither ``sketches`` nor ``tests``.
        Whatever ``http_client.fetch`` raises when the list cannot be
        downloaded.

    Returns:
        The result of ``realize(wget_args, item)`` (a helper defined outside
        this block -- presumably the seesaw one; confirm against the file's
        imports).
    """
    wget_args = [
        WGET_LUA,
        '-U', USER_AGENT,
        '-nv',
        '--lua-script', 'sketch-static.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--domains', 'sketch.sonymobile.com,sketch-cloud-storage.s3.amazonaws.com',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'sketch-dld-script-version: ' + VERSION,
        '--warc-header', ItemInterpolation('sketches-created-on: %(item_value)s')
    ]

    item_name = item['item_name']
    item_type, item_value = item_name.split(':', 1)
    item['item_type'] = item_type
    item['item_value'] = item_value

    # Reject unknown types before opening any network resources.
    if item_type not in ('sketches', 'tests'):
        raise Exception('Unknown item')

    http_client = httpclient.HTTPClient()
    try:
        response = http_client.fetch(
            'https://raw.githubusercontent.com/marked/sketch-items/master/'
            + item_type + "/" + item_value,
            method='GET')
        for line in response.body.decode('utf-8', 'ignore').splitlines():
            sketch_id = line.strip()
            if not sketch_id:
                continue
            wget_args.append(
                'https://storage.sketch.sonymobile.com/feed/{}/image'
                .format(sketch_id))
    finally:
        # Previously the client was only closed on the success path.
        http_client.close()

    # An optional module-level ``bind_address`` pins the outgoing interface.
    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Assemble the wget command line for a tumblr blog item (with cookies).

    Same shape as a plain tumblr crawl, except that a fixed ``Cookie``
    request header is sent instead of ``--no-cookies``.

    ``item['item_name']`` is expected to look like
    ``tumblr-blog:<blog>[:<blog>...]``.  The type and the remainder are
    stored on the item as ``item_type`` / ``item_value``; for every blog
    name the front page and ``sitemap.xml`` are queued.

    Raises:
        AssertionError: if the item name contains no ``:``.
        Exception: if the item type is not ``tumblr-blog``.

    Returns:
        The result of ``realize(command, item)`` (a helper defined outside
        this block -- presumably the seesaw one; confirm against the file's
        imports).
    """
    command = [
        WGET_LUA,
        '-U', USER_AGENT,
        '-nv',
        # Hard-coded cookie header, sent verbatim with every request.
        '--header', 'Cookie: rxx=5xsz8gpps7w.1cxbbha8&v=1; _ga=GA1.2.67023728.1544887148; _gid=GA1.2.55111282.1544887148; __utma=189990958.67023728.1544887148.1544887148.1544887148.1; __utmb=189990958.0.10.1544887148; __utmc=189990958; __utmz=189990958.1544887148.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); language=%2Cen_US; logged_in=1; pfx=025a404c8fe0a59b8c35f218ec03d27834e8b87ef79b414c0872edf4ff0e146c%230%234099695469; pfg=253dab60b55590b025d9ae175a9442d5895d72b2f7aeb68188ca352cc075242c%23%7B%22eu_resident%22%3A1%2C%22gdpr_is_acceptable_age%22%3A1%2C%22gdpr_consent_core%22%3A1%2C%22gdpr_consent_first_party_ads%22%3A1%2C%22gdpr_consent_third_party_ads%22%3A1%2C%22gdpr_consent_search_history%22%3A1%2C%22exp%22%3A1576423244%2C%22vc%22%3A%22%22%7D%230120119809; tmgioct=5c151bcc4067d10993384260',
        '--lua-script', 'tumblr.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--domains', 'tumblr.com',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'tumblr-dld-script-version: ' + VERSION,
        '--warc-header', ItemInterpolation('tumblr-blog: %(item_name)s')
    ]

    name = item['item_name']
    assert ':' in name
    kind, value = name.split(':', 1)
    item['item_type'] = kind
    item['item_value'] = value

    if kind != 'tumblr-blog':
        raise Exception('Unknown item')

    # One item may bundle several blogs, separated by further colons.
    for blog in value.split(':'):
        command.append('--warc-header')
        command.append('tumblr-blog: ' + blog)
        command.append('http://{}.tumblr.com/'.format(blog))
        command.append('http://{}.tumblr.com/sitemap.xml'.format(blog))

    # An optional module-level ``bind_address`` pins the outgoing interface.
    module_scope = globals()
    if 'bind_address' in module_scope:
        address = module_scope['bind_address']
        command += ['--bind-address', address]
        print('')
        print('*** Wget will bind address at {0} ***'.format(address))
        print('')

    return realize(command, item)
def realize(self, item):
    """Assemble the wget command line for one justin.tv user item.

    Two start URLs are queued: the channel page built from
    ``item['item_name']`` and the value of ``item['video_url']``.

    Raises:
        KeyError: if ``item`` lacks ``item_name`` or ``video_url``
            (assuming ``item`` behaves like a dict -- confirm).

    Returns:
        The result of ``realize(command, item)`` (a helper defined outside
        this block -- presumably the seesaw one; confirm against the file's
        imports).
    """
    command = [
        WGET_LUA,
        '-U', USER_AGENT,
        '-nv',
        '--lua-script', 'justintv.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--no-cookies',
        '--rotate-dns',
        # Disabled: "--recursive", "--level=inf"
        '--page-requisites',
        '--timeout', '60',
        '--tries', 'inf',
        '--span-hosts',
        '--waitretry', '3600',
        # Disabled: "--domains", "canv.as,canvasugc.com"
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'justintv-dld-script-version: ' + VERSION,
        '--warc-header', ItemInterpolation('justintv-user: %(item_name)s'),
    ]

    user = item['item_name']
    command += ['http://justin.tv/{0}'.format(user), item['video_url']]

    # An optional module-level ``bind_address`` pins the outgoing interface.
    module_scope = globals()
    if 'bind_address' in module_scope:
        address = module_scope['bind_address']
        command += ['--bind-address', address]
        print('')
        print('*** Wget will bind address at {0} ***'.format(address))
        print('')

    return realize(command, item)
def realize(self, item):
    """Project template: assemble the wget command line for one item.

    The spots marked ``Insert project`` are placeholders (Lua script name,
    domain list, WARC header prefixes, item handling) to be filled in per
    project.  As written, the item name is split at the first ``:`` into
    ``item_type`` / ``item_value`` (stored on the item), an HTTP client is
    opened and closed again, and no start URL is queued.

    Raises:
        ValueError: from the tuple unpacking when the name contains no ``:``.

    Returns:
        The result of ``realize(command, item)`` (a helper defined outside
        this block -- presumably the seesaw one; confirm against the file's
        imports).
    """
    command = [
        WGET_LUA,
        '-U', USER_AGENT,
        '-nv',
        '--no-cookies',
        # Insert project
        '--lua-script', 'project.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        # Insert project
        '--domains', '',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        # Insert project
        '--warc-header', '-dld-script-version: ' + VERSION,
        '--warc-header', ItemInterpolation('-item: %(item_name)s')
    ]

    kind, value = item['item_name'].split(':', 1)
    item['item_type'] = kind
    item['item_value'] = value

    client = httpclient.HTTPClient()

    # Insert project item code

    client.close()

    # An optional module-level ``bind_address`` pins the outgoing interface.
    module_scope = globals()
    if 'bind_address' in module_scope:
        address = module_scope['bind_address']
        command += ['--bind-address', address]
        print('')
        print('*** Wget will bind address at {0} ***'.format(address))
        print('')

    return realize(command, item)
def realize(self, item):
    """Assemble the wget command line for one canv.as item.

    ``item['item_name']`` is split at the first ``:`` into ``item_type`` and
    ``item_data`` (both stored back on the item).  ``drawing`` items queue
    the ``/p/<data>`` page and ``profile`` items the ``/user/<data>`` page.

    Raises:
        ValueError: from the tuple unpacking when the name contains no ``:``.
        Exception: for any other item type.

    Returns:
        The result of ``realize(command, item)`` (a helper defined outside
        this block -- presumably the seesaw one; confirm against the file's
        imports).
    """
    command = [
        WGET_LUA,
        '-U', USER_AGENT,
        '-nv',
        '--lua-script', 'canvas.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--no-cookies',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--page-requisites',
        '--timeout', '60',
        '--tries', 'inf',
        '--span-hosts',
        '--waitretry', '3600',
        '--domains', 'canv.as,canvasugc.com',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'canvas-dld-script-version: ' + VERSION,
        '--warc-header', ItemInterpolation('canvas-user: %(item_name)s'),
    ]

    kind, data = item['item_name'].split(':', 1)
    item['item_type'] = kind
    item['item_data'] = data

    # item type -> start URL pattern
    url_patterns = {
        'drawing': 'http://canv.as/p/%s',
        'profile': 'http://canv.as/user/%s',
    }
    if kind not in url_patterns:
        raise Exception('Unknown item_type')
    command.append(url_patterns[kind] % data)

    # An optional module-level ``bind_address`` pins the outgoing interface.
    module_scope = globals()
    if 'bind_address' in module_scope:
        address = module_scope['bind_address']
        command += ['--bind-address', address]
        print('')
        print('*** Wget will bind address at {0} ***'.format(address))
        print('')

    return realize(command, item)
def realize(self, item):
    """Assemble the wget command line for a range of JamiiForums threads.

    ``item['item_name']`` is expected to look like ``threads:<start>-<end>``.
    The type and the range text are stored on the item as ``item_type`` /
    ``item_value``; every thread id from ``start`` to ``end`` inclusive gets
    a WARC header and a start URL.

    Raises:
        AssertionError: if the item name contains no ``:``.
        ValueError: if the range is not exactly two integers joined by ``-``.
        Exception: if the item type is not ``threads``.

    Returns:
        The result of ``realize(command, item)`` (a helper defined outside
        this block -- presumably the seesaw one; confirm against the file's
        imports).
    """
    command = [
        WGET_LUA,
        '-U', USER_AGENT,
        '-nv',
        '--no-cookies',
        '--lua-script', 'jamiiforums.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--domains', 'jamiiforums.com',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'jamiiforums-dld-script-version: ' + VERSION,
        '--warc-header', ItemInterpolation('jamiiforums-item: %(item_name)s')
    ]

    name = item['item_name']
    assert ':' in name
    kind, value = name.split(':', 1)
    item['item_type'] = kind
    item['item_value'] = value

    if kind != 'threads':
        raise Exception('Unknown item')

    first, last = map(int, value.split('-'))
    thread_id = first
    while thread_id <= last:  # the upper bound is inclusive
        command += [
            '--warc-header',
            'jamiiforums-thread-id: {}'.format(thread_id),
            'https://www.jamiiforums.com/threads/x.{}/'.format(thread_id),
        ]
        thread_id += 1

    # An optional module-level ``bind_address`` pins the outgoing interface.
    module_scope = globals()
    if 'bind_address' in module_scope:
        address = module_scope['bind_address']
        command += ['--bind-address', address]
        print('')
        print('*** Wget will bind address at {0} ***'.format(address))
        print('')

    return realize(command, item)
def realize(self, item):
    """Assemble the wget command line for one swipnet.se site.

    ``item['item_name']`` is expected to look like ``<subdomain>:<path>``;
    both halves must be non-empty.  They are stored on the item as
    ``item_type`` / ``item_value`` and combined into the single start URL
    ``http://<subdomain>.swipnet.se/<path>/``.

    Raises:
        AssertionError: if the name has no ``:`` or either half is empty.

    Returns:
        The result of ``realize(command, item)`` (a helper defined outside
        this block -- presumably the seesaw one; confirm against the file's
        imports).
    """
    command = [
        WGET_LUA,
        '-U', USER_AGENT,
        '-nv',
        '--lua-script', 'swipnet.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--no-cookies',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--span-hosts',
        '--waitretry', '30',
        '--domains', 'swipnet.se',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'swipnet-dld-script-version: ' + VERSION,
        '--warc-header', ItemInterpolation('swipnet-user: %(item_name)s'),
    ]

    name = item['item_name']
    assert ':' in name
    subdomain, path = name.split(':', 1)
    assert subdomain
    assert path
    item['item_type'] = subdomain
    item['item_value'] = path

    start_url = 'http://{0}.swipnet.se/{1}/'.format(subdomain, path)
    command.append(start_url)
    # Disabled alternative start URL:
    # 'http://home.swipnet.se/{0}/'.format(item_name)

    # An optional module-level ``bind_address`` pins the outgoing interface.
    module_scope = globals()
    if 'bind_address' in module_scope:
        address = module_scope['bind_address']
        command += ['--bind-address', address]
        print('')
        print('*** Wget will bind address at {0} ***'.format(address))
        print('')

    return realize(command, item)
def realize(self, item):
    """Assemble the wget command line for one internetcentrum site.

    ``item['item_name']`` is expected to look like ``site:<url>``.  The type
    and the URL are stored on the item as ``item_type`` / ``item_value`` and
    the URL is queued as the only start URL.

    Raises:
        AssertionError: if the name has no ``:`` or the type is not ``site``.
        Exception: if the type is not ``site`` and assertions are disabled
            (``python -O``).

    Returns:
        The result of ``realize(wget_args, item)`` (a helper defined outside
        this block -- presumably the seesaw one; confirm against the file's
        imports).
    """
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        # Disabled: "--lua-script", "cobook.lua"
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        # Disabled: "--domains", "cobook.co"
        "--header", "Cookie: iccmtspmvrfy=ano",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "internetcentrum-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("internetcentrum-user: %(item_name)s"),
    ]

    item_name = item['item_name']
    assert ':' in item_name
    item_type, item_value = item_name.split(':', 1)

    item['item_type'] = item_type
    item['item_value'] = item_value

    # ('site') is just the string 'site', which made this a substring test
    # that let '', 's', 'it', ... through; a one-element tuple needs the
    # trailing comma.
    assert item_type in ('site',)

    if item_type == 'site':
        wget_args.append(item_value)
    else:
        # Still reachable when assertions are stripped with -O.
        raise Exception('Unknown item')

    # An optional module-level ``bind_address`` pins the outgoing interface.
    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)