async def test_http_retry_then_succeed(self):
    async def handle_post(_):
        return web.Response(text='the-response-data')
    app = web.Application()
    app.add_routes([web.post('/page', handle_post)])
    runner = web.AppRunner(app)

    request, close = Pool()
    self.add_async_cleanup(close)

    retriable_request = retry(
        request,
        exception_intervals=((HttpConnectionError, (0, 1, 2)), ),
    )

    async def delayed_start():
        await asyncio.sleep(0.5)
        await runner.setup()
        self.add_async_cleanup(runner.cleanup)
        site = web.TCPSite(runner, '0.0.0.0', 8080)
        await site.start()

    asyncio.ensure_future(delayed_start())

    _, _, body = await retriable_request(
        b'POST', 'http://localhost:8080/page',
        body=streamed(b'some-data'),
        headers=((b'content-length', b'9'), ))

    self.assertEqual(b'the-response-data', await buffered(body))
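The retry wrapper used in this test comes from the HTTP library the project uses; exception_intervals=((HttpConnectionError, (0, 1, 2)), ) reads as: on HttpConnectionError, wait 0, then 1, then 2 seconds between successive attempts before giving up. As a rough illustration of those assumed semantics only, not the library's actual implementation, a minimal wrapper could look like this:

# A minimal sketch of a retry wrapper with the interface used above. The
# semantics assumed here: each (exception_class, intervals) pair gives the
# seconds to sleep between successive attempts, and once the intervals are
# exhausted the next failure propagates to the caller.
import asyncio

def retry(request, exception_intervals=()):
    async def retriable_request(method, url, *args, **kwargs):
        # Flatten to a single list of (exception_class, seconds_to_wait) retries;
        # a fuller implementation might track intervals per exception class
        retries = [
            (exception_class, interval)
            for exception_class, intervals in exception_intervals
            for interval in intervals
        ]
        for exception_class, interval in retries:
            try:
                return await request(method, url, *args, **kwargs)
            except exception_class:
                await asyncio.sleep(interval)
        # Final attempt: any exception now propagates
        return await request(method, url, *args, **kwargs)

    return retriable_request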
async def test_http_post_small_buffered_streamed(self):
    request, close = Pool()
    self.add_async_cleanup(close)

    code, headers, body = await request(
        b'POST', 'http://postman-echo.com/post',
        (),
        (
            (b'content-length', b'19'),
            (b'content-type', b'application/x-www-form-urlencoded'),
        ),
        streamed(b'some-data=something'),
    )
    body_bytes = await buffered(body)
    headers_dict = dict(headers)
    response_dict = json.loads(body_bytes)

    self.assertEqual(code, b'200')
    self.assertEqual(headers_dict[b'content-type'], b'application/json; charset=utf-8')
    self.assertEqual(response_dict['headers']['host'], 'postman-echo.com')
    self.assertEqual(response_dict['headers']['content-length'], '19')
    self.assertEqual(response_dict['form'], {'some-data': 'something'})
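Both tests pass request bodies with streamed and read response bodies with buffered. These helpers come from the HTTP library the tests exercise; assuming bodies are exposed as async iterators of bytes, which is how they are consumed throughout this code, minimal stand-ins would look roughly like this:

# Illustrative stand-ins only, assuming the request body is a zero-argument
# callable returning an async iterator of bytes, and the response body is an
# async iterator of bytes; the real helpers come from the HTTP library itself.
def streamed(data):
    async def iterator():
        yield data
    return iterator

async def buffered(body):
    return b''.join([chunk async for chunk in body])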
async def list_packages():
    request_body = (
        b'<?xml version="1.0"?>'
        b"<methodCall><methodName>list_packages</methodName></methodCall>")
    _, _, body = await request(
        b"POST",
        source_base + "/pypi",
        body=streamed(request_body),
        headers=(
            (b"content-type", b"text/xml"),
            (b"content-length", str(len(request_body)).encode()),
        ),
    )
    return [
        package.text for package in ET.fromstring(await buffered(
            body)).findall("./params/param/value/array/data/value/string")
    ]
async def list_packages():
    request_body = (
        b'<?xml version="1.0"?>'
        b'<methodCall><methodName>list_packages</methodName></methodCall>')
    _, _, body = await request(
        b'POST', source_base + '/pypi',
        body=streamed(request_body),
        headers=(
            (b'content-type', b'text/xml'),
            (b'content-length', str(len(request_body)).encode()),
        ),
    )
    return [
        package.text
        for package in ET.fromstring(await buffered(body)).findall(
            './params/param/value/array/data/value/string')
    ]
async def changelog(sync_changes_after):
    request_body = (
        b'<?xml version="1.0"?>'
        b"<methodCall><methodName>changelog</methodName><params>"
        b"<param><value>"
        b"<int>" + str(sync_changes_after).encode() + b"</int>"
        b"</value></param>"
        b"</params></methodCall>")
    _, _, body = await request(
        b"POST",
        source_base + "/pypi",
        body=streamed(request_body),
        headers=(
            (b"content-type", b"text/xml"),
            (b"content-length", str(len(request_body)).encode()),
        ),
    )
    return [
        package.text for package in ET.fromstring(
            (await buffered(body)).replace(b"\x1b", b"")).findall(
                "./params/param/value/array/data/value/array/data/value[1]/string"
            )
    ]
async def changelog(sync_changes_after):
    request_body = (
        b'<?xml version="1.0"?>'
        b'<methodCall><methodName>changelog</methodName><params>'
        b'<param><value>'
        b'<int>' + str(sync_changes_after).encode() + b'</int>'
        b'</value></param>'
        b'</params></methodCall>')
    _, _, body = await request(
        b'POST', source_base + '/pypi',
        body=streamed(request_body),
        headers=(
            (b'content-type', b'text/xml'),
            (b'content-length', str(len(request_body)).encode()),
        ),
    )
    return [
        package.text
        for package in ET.fromstring(await buffered(body)).findall(
            './params/param/value/array/data/value/array/data/value[1]/string'
        )
    ]
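Both XML-RPC helpers pick the package names out of the response with an ElementTree XPath. To make the path concrete, here is a small self-contained example with a hand-written methodResponse of the shape that list_packages expects; the package names are invented for illustration:

import xml.etree.ElementTree as ET

# Hand-written example of the XML-RPC response shape that list_packages parses;
# the package names are invented for illustration
sample_response = b'''<?xml version="1.0"?>
<methodResponse><params><param><value><array><data>
<value><string>some-package</string></value>
<value><string>another-package</string></value>
</data></array></value></param></params></methodResponse>'''

names = [
    package.text
    for package in ET.fromstring(sample_response).findall(
        './params/param/value/array/data/value/string')
]
print(names)  # ['some-package', 'another-package']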
async def nltk_mirror(logger, request, s3_context, s3_prefix):
    base = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/"
    input_index_url = f"{base}/index.xml"

    # Fetch index file
    code, headers, body = await request(b"GET", input_index_url)
    if code != b"200":
        raise Exception("Unable to fetch index")
    headers_xml_lower = dict((key.lower(), value) for key, value in headers)
    index_xml_content_type = headers_xml_lower[b"content-type"]
    index_xml = ET.fromstring(await buffered(body))

    # Transfer package contents to S3 if they haven't already been transferred
    for package in index_xml.findall("./packages/package"):
        package_original_url = package.attrib["url"]
        package_checksum = package.attrib["checksum"]
        package_id = package.attrib["id"]

        # The NLTK downloader is sensitive to the extension of the file
        _, ext = os.path.splitext(package_original_url)
        new_path = f"/{s3_prefix}{package_id}-{package_checksum}{ext}"
        package.attrib[
            "url"] = f"https://{s3_context.bucket.host}/{s3_context.bucket.name}{new_path}"

        # If the file already exists, don't re-upload. Named including the
        # checksum deliberately so we can do this
        code, _ = await s3_request_full(
            logger,
            s3_context,
            b"HEAD",
            new_path,
            (),
            headers,
            empty_async_iterator,
            "UNSIGNED-PAYLOAD",
        )
        logger.info("%s %s", f"{s3_context.bucket.name}{new_path}", code)
        if code == b"200":
            continue

        # Stream the file to S3
        code, headers, body = await request(b"GET", package_original_url)
        if code != b"200":
            await blackhole(body)
            raise Exception(f"{code} {package_original_url}")

        headers_lower = dict((key.lower(), value) for key, value in headers)
        headers = (
            (b"content-length", headers_lower[b"content-length"]),
            (b"content-type", headers_lower[b"content-type"]),
        )
        code, _ = await s3_request_full(
            logger,
            s3_context,
            b"PUT",
            new_path,
            (),
            headers,
            lambda: body,
            "UNSIGNED-PAYLOAD",
        )
        if code != b"200":
            raise Exception()

    # The ascii encoding is important: the nltk downloader seems to assume
    # attributes are ascii
    output_xml_str = ET.tostring(index_xml, encoding="ascii", method="xml")
    index_xml_content_length = str(len(output_xml_str)).encode("ascii")

    headers = (
        (b"content-length", index_xml_content_length),
        (b"content-type", index_xml_content_type),
    )
    code, _ = await s3_request_full(
        logger,
        s3_context,
        b"PUT",
        f"/{s3_prefix}index.xml",
        (),
        headers,
        streamed(output_xml_str),
        "UNSIGNED-PAYLOAD",
    )
    if code != b"200":
        raise Exception()
async def conda_mirror(logger, request, s3_context, source_base_url, s3_prefix):
    arch_dirs = ["noarch/", "linux-64/"]
    repodatas = []
    queue = asyncio.Queue()

    logger.info("Finding existing files")
    existing_files = {
        key
        async for key in s3_list_keys_relative_to_prefix(
            logger, s3_context, s3_prefix)
    }

    for arch_dir in arch_dirs:
        code, _, body = await request(
            b"GET", source_base_url + arch_dir + "repodata.json")
        if code != b"200":
            raise Exception()
        source_repodata_raw = await buffered(body)
        source_repodata = json.loads(source_repodata_raw)

        for package_suffix, _ in source_repodata["packages"].items():
            await queue.put(arch_dir + package_suffix)

        repodatas.append((arch_dir + "repodata.json", source_repodata_raw))

        code, _, body = await request(
            b"GET", source_base_url + arch_dir + "repodata.json.bz2")
        if code != b"200":
            raise Exception()
        repodatas.append((arch_dir + "repodata.json.bz2", await buffered(body)))

    async def transfer_package(package_suffix):
        source_package_url = source_base_url + package_suffix
        target_package_key = s3_prefix + package_suffix

        exists = package_suffix in existing_files
        if exists:
            logger.debug("Skipping transfer of {}".format("/" + target_package_key))
            return

        code, headers, body = await request(b"GET", source_package_url)
        if code != b"200":
            response = await buffered(body)
            raise Exception("Exception GET {} {} {}".format(
                source_package_url, code, response))
        headers_lower = dict((key.lower(), value) for key, value in headers)
        headers = ((b"content-length", headers_lower[b"content-length"]), )

        code, body = await s3_request_full(
            logger,
            s3_context,
            b"PUT",
            "/" + target_package_key,
            (),
            headers,
            lambda: body,
            "UNSIGNED-PAYLOAD",
        )
        if code != b"200":
            raise Exception(  # pylint: disable=broad-except
                "Exception PUT {} {} {}".format("/" + target_package_key, code, body))

    async def transfer_task():
        while True:
            package_suffix = await queue.get()
            try:
                for _ in range(0, 10):
                    try:
                        await transfer_package(package_suffix)
                    except (HttpConnectionError, HttpDataError):
                        await asyncio.sleep(10)
                    else:
                        break
            except Exception:  # pylint: disable=broad-except
                logger.exception("Exception transferring %s", package_suffix)
            finally:
                queue.task_done()

    tasks = [asyncio.ensure_future(transfer_task()) for _ in range(0, 10)]
    try:
        await queue.join()
    finally:
        for task in tasks:
            task.cancel()
        await asyncio.sleep(0)

    for path, data in repodatas:
        target_repodata_key = s3_prefix + path
        headers = ((b"content-length", str(len(data)).encode("ascii")), )
        code, _ = await s3_request_full(
            logger,
            s3_context,
            b"PUT",
            "/" + target_repodata_key,
            (),
            headers,
            streamed(data),
            s3_hash(data),
        )
        if code != b"200":
            raise Exception()
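conda_mirror, and the CRAN and PyPI mirrors below, all share the same concurrency shape: a fixed pool of worker tasks drains an asyncio.Queue, queue.join() waits until every item has been processed, and the workers are then cancelled. Stripped of the transfer logic, the pattern is roughly the following sketch, where process stands in for the per-item work such as transfer_package:

import asyncio

async def process(item):
    # Stand-in for the per-item work, e.g. transfer_package(item)
    await asyncio.sleep(0)

async def run_all(items, num_workers=10):
    queue = asyncio.Queue()
    for item in items:
        await queue.put(item)

    async def worker():
        while True:
            item = await queue.get()
            try:
                await process(item)
            except Exception:
                pass  # Log and carry on so queue.join() still completes
            finally:
                queue.task_done()

    tasks = [asyncio.ensure_future(worker()) for _ in range(0, num_workers)]
    try:
        # Returns once task_done() has been called for every queued item
        await queue.join()
    finally:
        for task in tasks:
            task.cancel()
        # Give the cancelled tasks a chance to be collected
        await asyncio.sleep(0)

asyncio.get_event_loop().run_until_complete(run_all(['a', 'b', 'c']))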
async def cran_mirror(logger, request, s3_context):
    source_base = "https://cran.ma.imperial.ac.uk/"
    source_base_url = source_base + "web/packages/available_packages_by_name.html"
    source_base_parsed = urllib.parse.urlparse(source_base_url)
    cran_prefix = "cran/"

    done = set()
    queue = asyncio.Queue()
    await queue.put(source_base_url)

    # Main package file. Maybe better to parse this than crawl the HTML?
    package_index = "src/contrib/PACKAGES"
    code, _, body = await request(b"GET", source_base + package_index)
    package_index_body = await buffered(body)
    if code != b"200":
        raise Exception()

    logger.info("Finding existing files")
    existing_files = {
        key
        async for key in s3_list_keys_relative_to_prefix(
            logger, s3_context, cran_prefix)
    }

    async def crawl(url):
        key_suffix = urllib.parse.urlparse(url).path[1:]  # Without leading /
        if key_suffix in existing_files and (key_suffix.endswith(".tar.gz")
                                             or key_suffix.endswith(".tgz")
                                             or key_suffix.endswith(".zip")
                                             or key_suffix.endswith(".pdf")):
            # The package files have a version in the file name. Other files like html and pdf
            # don't, but they don't seem to actually be used when installing packages from R
            return

        code, headers, body = await request(b"GET", url)
        if code != b"200":
            await blackhole(body)
            raise Exception()

        headers_lower = dict((key.lower(), value) for key, value in headers)
        content_type = headers_lower.get(b"content-type", None)
        content_length = headers_lower[b"content-length"]
        target_key = cran_prefix + key_suffix

        if content_type == b"text/html":
            data = await buffered(body)
            soup = BeautifulSoup(data, "html.parser")
            links = soup.find_all("a")
            for link in links:
                absolute = urllib.parse.urljoin(url, link.get("href"))
                absolute_no_frag = absolute.split("#")[0]
                should_enqueue = (
                    urllib.parse.urlparse(absolute_no_frag).netloc ==
                    source_base_parsed.netloc
                    and absolute_no_frag not in done)
                if should_enqueue:
                    done.add(absolute_no_frag)
                    await queue.put(absolute_no_frag)
            return

        if key_suffix in existing_files:
            await blackhole(body)
            return

        headers = ((b"content-length", content_length), )
        code, _ = await s3_request_full(
            logger,
            s3_context,
            b"PUT",
            "/" + target_key,
            (),
            headers,
            lambda: body,
            "UNSIGNED-PAYLOAD",
        )
        if code != b"200":
            raise Exception()

    async def transfer_task():
        while True:
            url = await queue.get()
            try:
                for _ in range(0, 10):
                    try:
                        await crawl(url)
                    except (HttpConnectionError, HttpDataError):
                        await asyncio.sleep(10)
                    else:
                        break
            except Exception:  # pylint: disable=broad-except
                logger.exception("Exception crawling %s", url)
            finally:
                queue.task_done()

    tasks = [asyncio.ensure_future(transfer_task()) for _ in range(0, 10)]
    try:
        await queue.join()
    finally:
        for task in tasks:
            task.cancel()
        await asyncio.sleep(0)

    headers = ((b"content-length", str(len(package_index_body)).encode("ascii")), )
    code, _ = await s3_request_full(
        logger,
        s3_context,
        b"PUT",
        "/" + cran_prefix + package_index,
        (),
        headers,
        streamed(package_index_body),
        s3_hash(package_index_body),
    )
    if code != b"200":
        raise Exception()
async def transfer_project(project_name, project_url):
    code, _, body = await request(b"GET", project_url)
    data = await buffered(body)
    if code != b"200":
        raise Exception("Failed GET {}".format(code))

    soup = BeautifulSoup(data, "html.parser")
    links = soup.find_all("a")
    link_data = []

    logger.info("Finding existing files")
    existing_project_filenames = {
        key
        async for key in s3_list_keys_relative_to_prefix(
            logger, s3_context, f"{pypi_prefix}{project_name}/")
    }

    for link in links:
        absolute = link.get("href")
        absolute_no_frag, frag = absolute.split("#")
        filename = str(link.string)
        python_version = link.get("data-requires-python")
        has_python_version = python_version is not None
        python_version_attr = (' data-requires-python="' +
                               html.escape(python_version) + '"'
                               if has_python_version else "")
        s3_path = f"/{pypi_prefix}{project_name}/{filename}"
        link_data.append((s3_path, filename, frag, python_version_attr))

        exists = filename in existing_project_filenames
        if exists:
            logger.debug("Skipping transfer of %s", s3_path)
            continue

        for _ in range(0, 10):
            try:
                code, headers, body = await request(b"GET", absolute_no_frag)
                if code != b"200":
                    await blackhole(body)
                    raise Exception("Failed GET {}".format(code))

                content_length = dict(
                    (key.lower(), value)
                    for key, value in headers)[b"content-length"]
                headers = ((b"content-length", content_length), )
                code, _ = await s3_request_full(
                    logger,
                    s3_context,
                    b"PUT",
                    s3_path,
                    (),
                    headers,
                    lambda: body,
                    "UNSIGNED-PAYLOAD",
                )
            except (HttpConnectionError, HttpDataError):
                await asyncio.sleep(10)
            else:
                break

        if code != b"200":
            raise Exception("Failed PUT {}".format(code))

    html_str = ("<!DOCTYPE html>" + "<html>" + "<body>" + "".join([
        f'<a href="https://{s3_context.bucket.host}/{s3_context.bucket.name}{s3_path}'
        f'#{frag}"{python_version_attr}>{filename}</a>'
        for s3_path, filename, frag, python_version_attr in link_data
    ]) + "</body>" + "</html>")
    html_bytes = html_str.encode("ascii")

    s3_path = f"/{pypi_prefix}{project_name}/"
    headers = (
        (b"content-type", b"text/html"),
        (b"content-length", str(len(html_bytes)).encode("ascii")),
    )

    for _ in range(0, 5):
        try:
            code, _ = await s3_request_full(
                logger,
                s3_context,
                b"PUT",
                s3_path,
                (),
                headers,
                streamed(html_bytes),
                s3_hash(html_bytes),
            )
        except (HttpConnectionError, HttpDataError):
            await asyncio.sleep(10)
        else:
            break

    if code != b"200":
        raise Exception("Failed PUT {}".format(code))
async def pypi_mirror(logger, request, s3_context):
    def normalise(name):
        return re.sub(r"[-_.]+", "-", name).lower()

    async def list_packages():
        request_body = (
            b'<?xml version="1.0"?>'
            b"<methodCall><methodName>list_packages</methodName></methodCall>")
        _, _, body = await request(
            b"POST",
            source_base + "/pypi",
            body=streamed(request_body),
            headers=(
                (b"content-type", b"text/xml"),
                (b"content-length", str(len(request_body)).encode()),
            ),
        )
        return [
            package.text for package in ET.fromstring(await buffered(
                body)).findall("./params/param/value/array/data/value/string")
        ]

    async def changelog(sync_changes_after):
        request_body = (
            b'<?xml version="1.0"?>'
            b"<methodCall><methodName>changelog</methodName><params>"
            b"<param><value>"
            b"<int>" + str(sync_changes_after).encode() + b"</int>"
            b"</value></param>"
            b"</params></methodCall>")
        _, _, body = await request(
            b"POST",
            source_base + "/pypi",
            body=streamed(request_body),
            headers=(
                (b"content-type", b"text/xml"),
                (b"content-length", str(len(request_body)).encode()),
            ),
        )
        return [
            package.text for package in ET.fromstring(
                (await buffered(body)).replace(b"\x1b", b"")).findall(
                    "./params/param/value/array/data/value/array/data/value[1]/string"
                )
        ]

    source_base = "https://pypi.python.org"
    pypi_prefix = "pypi/"

    # We may have overlap, but that's fine
    sync_changes_after_key = "__sync_changes_after"
    # Paranoia: the reference implementation at https://bitbucket.org/loewis/pep381client has -1
    started = int(time()) - 1

    # Determine the point after which to fetch changes. There is an eventual consistency issue
    # storing this on S3, but at worst we'll be unnecessarily re-fetching updates, rather than
    # missing them. Plus, given the time to run a sync and frequency, this is unlikely anyway
    code, data = await s3_request_full(
        logger,
        s3_context,
        b"GET",
        "/" + pypi_prefix + sync_changes_after_key,
        (),
        (),
        empty_async_iterator,
        s3_hash(b""),
    )
    if code not in [b"200", b"404"]:
        raise Exception("Failed GET of __sync_changes_after {} {}".format(
            code, data))
    sync_changes_after = int(data) if code == b"200" else 0

    # changelog doesn't seem to have changes older than two years, so for all projects on initial
    # import, we need to call list_packages
    project_names_with_duplicates = ((await list_packages())
                                     if sync_changes_after == 0 else
                                     (await changelog(sync_changes_after)))
    project_names = sorted(list(set(project_names_with_duplicates)))

    queue = asyncio.Queue()
    for project_name in project_names:
        normalised_project_name = normalise(project_name)
        await queue.put((
            normalised_project_name,
            source_base + f"/simple/{normalised_project_name}/",
        ))

    async def transfer_project(project_name, project_url):
        code, _, body = await request(b"GET", project_url)
        data = await buffered(body)
        if code != b"200":
            raise Exception("Failed GET {}".format(code))

        soup = BeautifulSoup(data, "html.parser")
        links = soup.find_all("a")
        link_data = []

        logger.info("Finding existing files")
        existing_project_filenames = {
            key
            async for key in s3_list_keys_relative_to_prefix(
                logger, s3_context, f"{pypi_prefix}{project_name}/")
        }

        for link in links:
            absolute = link.get("href")
            absolute_no_frag, frag = absolute.split("#")
            filename = str(link.string)
            python_version = link.get("data-requires-python")
            has_python_version = python_version is not None
            python_version_attr = (' data-requires-python="' +
                                   html.escape(python_version) + '"'
                                   if has_python_version else "")
            s3_path = f"/{pypi_prefix}{project_name}/{filename}"
            link_data.append((s3_path, filename, frag, python_version_attr))

            exists = filename in existing_project_filenames
            if exists:
                logger.debug("Skipping transfer of %s", s3_path)
                continue

            for _ in range(0, 10):
                try:
                    code, headers, body = await request(
                        b"GET", absolute_no_frag)
                    if code != b"200":
                        await blackhole(body)
                        raise Exception("Failed GET {}".format(code))

                    content_length = dict(
                        (key.lower(), value)
                        for key, value in headers)[b"content-length"]
                    headers = ((b"content-length", content_length), )
                    code, _ = await s3_request_full(
                        logger,
                        s3_context,
                        b"PUT",
                        s3_path,
                        (),
                        headers,
                        lambda: body,
                        "UNSIGNED-PAYLOAD",
                    )
                except (HttpConnectionError, HttpDataError):
                    await asyncio.sleep(10)
                else:
                    break

            if code != b"200":
                raise Exception("Failed PUT {}".format(code))

        html_str = ("<!DOCTYPE html>" + "<html>" + "<body>" + "".join([
            f'<a href="https://{s3_context.bucket.host}/{s3_context.bucket.name}{s3_path}'
            f'#{frag}"{python_version_attr}>{filename}</a>'
            for s3_path, filename, frag, python_version_attr in link_data
        ]) + "</body>" + "</html>")
        html_bytes = html_str.encode("ascii")

        s3_path = f"/{pypi_prefix}{project_name}/"
        headers = (
            (b"content-type", b"text/html"),
            (b"content-length", str(len(html_bytes)).encode("ascii")),
        )

        for _ in range(0, 5):
            try:
                code, _ = await s3_request_full(
                    logger,
                    s3_context,
                    b"PUT",
                    s3_path,
                    (),
                    headers,
                    streamed(html_bytes),
                    s3_hash(html_bytes),
                )
            except (HttpConnectionError, HttpDataError):
                await asyncio.sleep(10)
            else:
                break

        if code != b"200":
            raise Exception("Failed PUT {}".format(code))

    async def transfer_task():
        while True:
            project_name, project_url = await queue.get()
            logger.info("Transferring project %s %s", project_name, project_url)
            try:
                await transfer_project(project_name, project_url)
            except Exception:  # pylint: disable=broad-except
                logger.exception("Exception crawling %s", project_url)
            finally:
                queue.task_done()

    tasks = [asyncio.ensure_future(transfer_task()) for _ in range(0, 10)]
    try:
        await queue.join()
    finally:
        for task in tasks:
            task.cancel()
        await asyncio.sleep(0)

    started_bytes = str(started).encode("ascii")
    headers = ((b"content-length", str(len(started_bytes)).encode("ascii")), )
    for _ in range(0, 10):
        try:
            code, _ = await s3_request_full(
                logger,
                s3_context,
                b"PUT",
                "/" + pypi_prefix + sync_changes_after_key,
                (),
                headers,
                streamed(started_bytes),
                s3_hash(started_bytes),
            )
        except (HttpConnectionError, HttpDataError):
            pass
        else:
            break

    if code != b"200":
        raise Exception()
async def nltk_mirror(logger, request, s3_context, s3_prefix):
    base = 'https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/'
    input_index_url = f'{base}/index.xml'

    # Fetch index file
    code, headers, body = await request(b'GET', input_index_url)
    if code != b'200':
        raise Exception('Unable to fetch index')
    headers_xml_lower = dict((key.lower(), value) for key, value in headers)
    index_xml_content_type = headers_xml_lower[b'content-type']
    index_xml = ET.fromstring(await buffered(body))

    # Transfer package contents to S3 if they haven't already been transferred
    for package in index_xml.findall('./packages/package'):
        package_original_url = package.attrib['url']
        package_checksum = package.attrib['checksum']
        package_id = package.attrib['id']

        # The NLTK downloader is sensitive to the extension of the file
        _, ext = os.path.splitext(package_original_url)
        new_path = f'/{s3_prefix}{package_id}-{package_checksum}{ext}'
        package.attrib[
            'url'
        ] = f'https://{s3_context.bucket.host}/{s3_context.bucket.name}{new_path}'

        # If the file already exists, don't re-upload. Named including the
        # checksum deliberately so we can do this
        code, _ = await s3_request_full(
            logger,
            s3_context,
            b'HEAD',
            new_path,
            (),
            headers,
            empty_async_iterator,
            'UNSIGNED-PAYLOAD',
        )
        logger.info('%s %s', f'{s3_context.bucket.name}{new_path}', code)
        if code == b'200':
            continue

        # Stream the file to S3
        code, headers, body = await request(b'GET', package_original_url)
        if code != b'200':
            await blackhole(body)
            raise Exception(f'{code} {package_original_url}')

        headers_lower = dict((key.lower(), value) for key, value in headers)
        headers = (
            (b'content-length', headers_lower[b'content-length']),
            (b'content-type', headers_lower[b'content-type']),
        )
        code, _ = await s3_request_full(
            logger,
            s3_context,
            b'PUT',
            new_path,
            (),
            headers,
            lambda: body,
            'UNSIGNED-PAYLOAD',
        )
        if code != b'200':
            raise Exception()

    # The ascii encoding is important: the nltk downloader seems to assume
    # attributes are ascii
    output_xml_str = ET.tostring(index_xml, encoding='ascii', method='xml')
    index_xml_content_length = str(len(output_xml_str)).encode('ascii')

    headers = (
        (b'content-length', index_xml_content_length),
        (b'content-type', index_xml_content_type),
    )
    code, _ = await s3_request_full(
        logger,
        s3_context,
        b'PUT',
        f'/{s3_prefix}index.xml',
        (),
        headers,
        streamed(output_xml_str),
        'UNSIGNED-PAYLOAD',
    )
    if code != b'200':
        raise Exception()
async def conda_mirror(logger, request, s3_context, source_base_url, s3_prefix):
    arch_dirs = ['noarch/', 'linux-64/']
    repodatas = []
    queue = asyncio.Queue()

    logger.info('Finding existing files')
    existing_files = {
        key
        async for key in s3_list_keys_relative_to_prefix(logger, s3_context, s3_prefix)
    }

    for arch_dir in arch_dirs:
        code, _, body = await request(
            b'GET', source_base_url + arch_dir + 'repodata.json'
        )
        if code != b'200':
            raise Exception()
        source_repodata_raw = await buffered(body)
        source_repodata = json.loads(source_repodata_raw)

        for package_suffix, _ in source_repodata['packages'].items():
            await queue.put(arch_dir + package_suffix)

        repodatas.append((arch_dir + 'repodata.json', source_repodata_raw))

        code, _, body = await request(
            b'GET', source_base_url + arch_dir + 'repodata.json.bz2'
        )
        if code != b'200':
            raise Exception()
        repodatas.append((arch_dir + 'repodata.json.bz2', await buffered(body)))

    async def transfer_package(package_suffix):
        source_package_url = source_base_url + package_suffix
        target_package_key = s3_prefix + package_suffix

        exists = package_suffix in existing_files
        if exists:
            logger.debug('Skipping transfer of {}'.format('/' + target_package_key))
            return

        code, headers, body = await request(b'GET', source_package_url)
        if code != b'200':
            response = await buffered(body)
            raise Exception(
                'Exception GET {} {} {}'.format(source_package_url, code, response)
            )
        headers_lower = dict((key.lower(), value) for key, value in headers)
        headers = ((b'content-length', headers_lower[b'content-length']),)

        code, body = await s3_request_full(
            logger,
            s3_context,
            b'PUT',
            '/' + target_package_key,
            (),
            headers,
            lambda: body,
            'UNSIGNED-PAYLOAD',
        )
        if code != b'200':
            raise Exception(
                'Exception PUT {} {} {}'.format('/' + target_package_key, code, body)
            )

    async def transfer_task():
        while True:
            package_suffix = await queue.get()
            try:
                for _ in range(0, 10):
                    try:
                        await transfer_package(package_suffix)
                    except (HttpConnectionError, HttpDataError):
                        await asyncio.sleep(10)
                    else:
                        break
            except Exception:
                logger.exception('Exception transferring %s', package_suffix)
            finally:
                queue.task_done()

    tasks = [asyncio.ensure_future(transfer_task()) for _ in range(0, 10)]
    try:
        await queue.join()
    finally:
        for task in tasks:
            task.cancel()
        await asyncio.sleep(0)

    for path, data in repodatas:
        target_repodata_key = s3_prefix + path
        headers = ((b'content-length', str(len(data)).encode('ascii')),)
        code, _ = await s3_request_full(
            logger,
            s3_context,
            b'PUT',
            '/' + target_repodata_key,
            (),
            headers,
            streamed(data),
            s3_hash(data),
        )
        if code != b'200':
            raise Exception()
async def transfer_project(project_name, project_url):
    code, _, body = await request(b'GET', project_url)
    data = await buffered(body)
    if code != b'200':
        raise Exception('Failed GET {}'.format(code))

    soup = BeautifulSoup(data, 'html.parser')
    links = soup.find_all('a')
    link_data = []

    logger.info('Finding existing files')
    existing_project_filenames = {
        key
        async for key in s3_list_keys_relative_to_prefix(
            logger, s3_context, f'{pypi_prefix}{project_name}/'
        )
    }

    for link in links:
        absolute = link.get('href')
        absolute_no_frag, frag = absolute.split('#')
        filename = str(link.string)
        python_version = link.get('data-requires-python')
        has_python_version = python_version is not None
        python_version_attr = (
            ' data-requires-python="' + html.escape(python_version) + '"'
            if has_python_version
            else ''
        )
        s3_path = f'/{pypi_prefix}{project_name}/{filename}'
        link_data.append((s3_path, filename, frag, python_version_attr))

        exists = filename in existing_project_filenames
        if exists:
            logger.debug('Skipping transfer of %s', s3_path)
            continue

        for _ in range(0, 10):
            try:
                code, headers, body = await request(b'GET', absolute_no_frag)
                if code != b'200':
                    await blackhole(body)
                    raise Exception('Failed GET {}'.format(code))

                content_length = dict(
                    (key.lower(), value) for key, value in headers
                )[b'content-length']
                headers = ((b'content-length', content_length),)
                code, _ = await s3_request_full(
                    logger,
                    s3_context,
                    b'PUT',
                    s3_path,
                    (),
                    headers,
                    lambda: body,
                    'UNSIGNED-PAYLOAD',
                )
            except (HttpConnectionError, HttpDataError):
                await asyncio.sleep(10)
            else:
                break

        if code != b'200':
            raise Exception('Failed PUT {}'.format(code))

    html_str = (
        '<!DOCTYPE html>'
        + '<html>'
        + '<body>'
        + ''.join(
            [
                f'<a href="https://{s3_context.bucket.host}/{s3_context.bucket.name}{s3_path}'
                f'#{frag}"{python_version_attr}>{filename}</a>'
                for s3_path, filename, frag, python_version_attr in link_data
            ]
        )
        + '</body>'
        + '</html>'
    )
    html_bytes = html_str.encode('ascii')

    s3_path = f'/{pypi_prefix}{project_name}/'
    headers = (
        (b'content-type', b'text/html'),
        (b'content-length', str(len(html_bytes)).encode('ascii')),
    )

    for _ in range(0, 5):
        try:
            code, _ = await s3_request_full(
                logger,
                s3_context,
                b'PUT',
                s3_path,
                (),
                headers,
                streamed(html_bytes),
                s3_hash(html_bytes),
            )
        except (HttpConnectionError, HttpDataError):
            await asyncio.sleep(10)
        else:
            break

    if code != b'200':
        raise Exception('Failed PUT {}'.format(code))
async def pypi_mirror(logger, request, s3_context):
    def normalise(name):
        return re.sub(r'[-_.]+', '-', name).lower()

    async def list_packages():
        request_body = (
            b'<?xml version="1.0"?>'
            b'<methodCall><methodName>list_packages</methodName></methodCall>'
        )
        _, _, body = await request(
            b'POST', source_base + '/pypi',
            body=streamed(request_body),
            headers=(
                (b'content-type', b'text/xml'),
                (b'content-length', str(len(request_body)).encode()),
            ),
        )
        return [
            package.text
            for package in ET.fromstring(await buffered(body)).findall(
                './params/param/value/array/data/value/string'
            )
        ]

    async def changelog(sync_changes_after):
        request_body = (
            b'<?xml version="1.0"?>'
            b'<methodCall><methodName>changelog</methodName><params>'
            b'<param><value>'
            b'<int>' + str(sync_changes_after).encode() + b'</int>'
            b'</value></param>'
            b'</params></methodCall>'
        )
        _, _, body = await request(
            b'POST', source_base + '/pypi',
            body=streamed(request_body),
            headers=(
                (b'content-type', b'text/xml'),
                (b'content-length', str(len(request_body)).encode()),
            ),
        )
        return [
            package.text
            for package in ET.fromstring(await buffered(body)).findall(
                './params/param/value/array/data/value/array/data/value[1]/string'
            )
        ]

    source_base = 'https://pypi.python.org'
    pypi_prefix = 'pypi/'

    # We may have overlap, but that's fine
    sync_changes_after_key = '__sync_changes_after'
    # Paranoia: the reference implementation at https://bitbucket.org/loewis/pep381client has -1
    started = int(time()) - 1

    # Determine the point after which to fetch changes. There is an eventual consistency issue
    # storing this on S3, but at worst we'll be unnecessarily re-fetching updates, rather than
    # missing them. Plus, given the time to run a sync and frequency, this is unlikely anyway
    code, data = await s3_request_full(
        logger,
        s3_context,
        b'GET',
        '/' + pypi_prefix + sync_changes_after_key,
        (),
        (),
        empty_async_iterator,
        s3_hash(b''),
    )
    if code not in [b'200', b'404']:
        raise Exception('Failed GET of __sync_changes_after {} {}'.format(code, data))
    sync_changes_after = int(data) if code == b'200' else 0

    # changelog doesn't seem to have changes older than two years, so for all projects on initial
    # import, we need to call list_packages
    project_names_with_duplicates = (
        (await list_packages())
        if sync_changes_after == 0
        else (await changelog(sync_changes_after))
    )
    project_names = sorted(list(set(project_names_with_duplicates)))

    queue = asyncio.Queue()
    for project_name in project_names:
        normalised_project_name = normalise(project_name)
        await queue.put(
            (
                normalised_project_name,
                source_base + f'/simple/{normalised_project_name}/',
            )
        )

    async def transfer_project(project_name, project_url):
        code, _, body = await request(b'GET', project_url)
        data = await buffered(body)
        if code != b'200':
            raise Exception('Failed GET {}'.format(code))

        soup = BeautifulSoup(data, 'html.parser')
        links = soup.find_all('a')
        link_data = []

        logger.info('Finding existing files')
        existing_project_filenames = {
            key
            async for key in s3_list_keys_relative_to_prefix(
                logger, s3_context, f'{pypi_prefix}{project_name}/'
            )
        }

        for link in links:
            absolute = link.get('href')
            absolute_no_frag, frag = absolute.split('#')
            filename = str(link.string)
            python_version = link.get('data-requires-python')
            has_python_version = python_version is not None
            python_version_attr = (
                ' data-requires-python="' + html.escape(python_version) + '"'
                if has_python_version
                else ''
            )
            s3_path = f'/{pypi_prefix}{project_name}/{filename}'
            link_data.append((s3_path, filename, frag, python_version_attr))

            exists = filename in existing_project_filenames
            if exists:
                logger.debug('Skipping transfer of %s', s3_path)
                continue

            for _ in range(0, 10):
                try:
                    code, headers, body = await request(b'GET', absolute_no_frag)
                    if code != b'200':
                        await blackhole(body)
                        raise Exception('Failed GET {}'.format(code))

                    content_length = dict(
                        (key.lower(), value) for key, value in headers
                    )[b'content-length']
                    headers = ((b'content-length', content_length),)
                    code, _ = await s3_request_full(
                        logger,
                        s3_context,
                        b'PUT',
                        s3_path,
                        (),
                        headers,
                        lambda: body,
                        'UNSIGNED-PAYLOAD',
                    )
                except (HttpConnectionError, HttpDataError):
                    await asyncio.sleep(10)
                else:
                    break

            if code != b'200':
                raise Exception('Failed PUT {}'.format(code))

        html_str = (
            '<!DOCTYPE html>'
            + '<html>'
            + '<body>'
            + ''.join(
                [
                    f'<a href="https://{s3_context.bucket.host}/{s3_context.bucket.name}{s3_path}'
                    f'#{frag}"{python_version_attr}>{filename}</a>'
                    for s3_path, filename, frag, python_version_attr in link_data
                ]
            )
            + '</body>'
            + '</html>'
        )
        html_bytes = html_str.encode('ascii')

        s3_path = f'/{pypi_prefix}{project_name}/'
        headers = (
            (b'content-type', b'text/html'),
            (b'content-length', str(len(html_bytes)).encode('ascii')),
        )

        for _ in range(0, 5):
            try:
                code, _ = await s3_request_full(
                    logger,
                    s3_context,
                    b'PUT',
                    s3_path,
                    (),
                    headers,
                    streamed(html_bytes),
                    s3_hash(html_bytes),
                )
            except (HttpConnectionError, HttpDataError):
                await asyncio.sleep(10)
            else:
                break

        if code != b'200':
            raise Exception('Failed PUT {}'.format(code))

    async def transfer_task():
        while True:
            project_name, project_url = await queue.get()
            logger.info('Transferring project %s %s', project_name, project_url)
            try:
                await transfer_project(project_name, project_url)
            except Exception:
                logger.exception('Exception crawling %s', project_url)
            finally:
                queue.task_done()

    tasks = [asyncio.ensure_future(transfer_task()) for _ in range(0, 10)]
    try:
        await queue.join()
    finally:
        for task in tasks:
            task.cancel()
        await asyncio.sleep(0)

    started_bytes = str(started).encode('ascii')
    headers = ((b'content-length', str(len(started_bytes)).encode('ascii')),)
    for _ in range(0, 10):
        try:
            code, _ = await s3_request_full(
                logger,
                s3_context,
                b'PUT',
                '/' + pypi_prefix + sync_changes_after_key,
                (),
                headers,
                streamed(started_bytes),
                s3_hash(started_bytes),
            )
        except (HttpConnectionError, HttpDataError):
            pass
        else:
            break

    if code != b'200':
        raise Exception()
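For completeness, a rough sketch of how one of these coroutines might be driven, assuming the Pool and retry interface shown in the tests at the top of this section. The s3_context value is whatever object the project's s3_request_full and s3_list_keys_relative_to_prefix helpers expect; it is left as a placeholder rather than invented here.

import asyncio
import logging

async def main():
    logger = logging.getLogger('mirror')
    request, close = Pool()
    try:
        retriable_request = retry(
            request,
            exception_intervals=((HttpConnectionError, (0, 10, 30)),),
        )
        s3_context = ...  # Placeholder: constructed by the surrounding project
        await pypi_mirror(logger, retriable_request, s3_context)
    finally:
        await close()

asyncio.get_event_loop().run_until_complete(main())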