import asyncio
import json
from asyncio import FIRST_COMPLETED
from contextlib import suppress
from urllib.parse import parse_qs, urlencode, urlparse

# Note: ProgressBar and parse_roles are assumed to come from the surrounding plugin
# code (e.g. pulpcore's ProgressBar model and a local parse_roles() helper); they are
# not defined in this snippet.


def fetch_roles(remote):
    """
    Fetch the roles in a remote repository.

    Args:
        remote (AnsibleRemote): A remote.

    Returns:
        list: a list of dicts that represent roles
    """
    page_count = 0

    def role_page_url(remote, page=1):
        parsed = urlparse(remote.url)
        new_query = parse_qs(parsed.query)
        new_query['page'] = page
        return parsed.scheme + '://' + parsed.netloc + parsed.path + '?' + urlencode(
            new_query, doseq=True)

    def parse_metadata(path):
        metadata = json.load(open(path))
        page_count = metadata['num_pages']
        return page_count, parse_roles(metadata)

    downloader = remote.get_downloader(role_page_url(remote))
    downloader.fetch()

    page_count, roles = parse_metadata(downloader.path)

    progress_bar = ProgressBar(message='Parsing Pages from Galaxy Roles API', total=page_count,
                               done=1, state='running')
    progress_bar.save()

    def downloader_coroutines():
        for page in range(2, page_count + 1):
            downloader = remote.get_downloader(role_page_url(remote, page))
            yield downloader.run()

    loop = asyncio.get_event_loop()
    downloaders = downloader_coroutines()

    # Prime the pool with up to 20 concurrent page downloads.
    not_done = set()
    with suppress(StopIteration):
        for i in range(20):
            not_done.add(next(downloaders))

    while True:
        if not_done == set():
            break
        done, not_done = loop.run_until_complete(
            asyncio.wait(not_done, return_when=FIRST_COMPLETED))
        for item in done:
            download_result = item.result()
            new_page_count, new_roles = parse_metadata(download_result.path)
            roles.extend(new_roles)
            progress_bar.increment()
            # Top the pool back up with the next pending page, if any.
            with suppress(StopIteration):
                not_done.add(next(downloaders))

    progress_bar.state = 'completed'
    progress_bar.save()
    return roles
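# Hedged illustration (not part of the original source): a standalone sketch of the
# query-string rewriting technique used by role_page_url() above. The Galaxy URL in
# the example comment is only an assumed sample value.
from urllib.parse import parse_qs, urlencode, urlparse


def paged_url(url, page=1):
    """Return ``url`` with its ``page`` query parameter set to ``page``."""
    parsed = urlparse(url)
    query = parse_qs(parsed.query)
    query['page'] = page
    return '{}://{}{}?{}'.format(
        parsed.scheme, parsed.netloc, parsed.path, urlencode(query, doseq=True))


# paged_url('https://galaxy.ansible.com/api/v1/roles/?namespace=foo', page=3)
# -> 'https://galaxy.ansible.com/api/v1/roles/?namespace=foo&page=3'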
async def run(self):
    """
    DockerFirstStage.
    """
    future_manifests = []
    tag_list = []
    to_download = []
    man_dcs = {}
    total_blobs = []

    with ProgressBar(message='Downloading tag list', total=1) as pb:
        repo_name = self.remote.namespaced_upstream_name
        relative_url = '/v2/{name}/tags/list'.format(name=repo_name)
        tag_list_url = urljoin(self.remote.url, relative_url)
        list_downloader = self.remote.get_downloader(url=tag_list_url)
        await list_downloader.run(extra_data={'repo_name': repo_name})

        with open(list_downloader.path) as tags_raw:
            tags_dict = json.loads(tags_raw.read())
            tag_list = tags_dict['tags']

        # check for the presence of the pagination link header
        link = list_downloader.response_headers.get('Link')
        await self.handle_pagination(link, repo_name, tag_list)
        whitelist_tags = self.remote.whitelist_tags
        if whitelist_tags:
            tag_list = list(set(tag_list) & set(whitelist_tags.split(',')))
        pb.increment()

    msg = 'Creating Download requests for v2 Tags'
    with ProgressBar(message=msg, total=len(tag_list)) as pb:
        for tag_name in tag_list:
            relative_url = '/v2/{name}/manifests/{tag}'.format(
                name=self.remote.namespaced_upstream_name,
                tag=tag_name,
            )
            url = urljoin(self.remote.url, relative_url)
            downloader = self.remote.get_downloader(url=url)
            to_download.append(downloader.run(extra_data={'headers': V2_ACCEPT_HEADERS}))
            pb.increment()

    pb_parsed_tags = ProgressBar(message='Processing v2 Tags', state='running')
    pb_parsed_ml_tags = ProgressBar(message='Parsing Manifest List Tags', state='running')
    pb_parsed_m_tags = ProgressBar(message='Parsing Manifests Tags', state='running')
    # shared as a module-level global so helpers outside this coroutine can update it
    global pb_parsed_blobs
    pb_parsed_blobs = ProgressBar(message='Parsing Blobs', state='running')
    pb_parsed_man = ProgressBar(message='Parsing Manifests', state='running')

    for download_tag in asyncio.as_completed(to_download):
        tag = await download_tag
        with open(tag.path) as content_file:
            raw = content_file.read()
        content_data = json.loads(raw)
        mediatype = content_data.get('mediaType')
        tag.artifact_attributes['file'] = tag.path
        saved_artifact = Artifact(**tag.artifact_attributes)
        try:
            saved_artifact.save()
        except IntegrityError:
            # an identical artifact already exists; reuse the saved one
            del tag.artifact_attributes['file']
            saved_artifact = Artifact.objects.get(**tag.artifact_attributes)
        tag_dc = self.create_tag(mediatype, saved_artifact, tag.url)

        if type(tag_dc.content) is ManifestListTag:
            list_dc = self.create_tagged_manifest_list(tag_dc, content_data)
            await self.put(list_dc)
            pb_parsed_ml_tags.increment()
            tag_dc.extra_data['list_relation'] = list_dc
            for manifest_data in content_data.get('manifests'):
                man_dc = self.create_manifest(list_dc, manifest_data)
                future_manifests.append(man_dc.get_or_create_future())
                man_dcs[man_dc.content.digest] = man_dc
                await self.put(man_dc)
                pb_parsed_man.increment()
        elif type(tag_dc.content) is ManifestTag:
            man_dc = self.create_tagged_manifest(tag_dc, content_data)
            await self.put(man_dc)
            pb_parsed_m_tags.increment()
            tag_dc.extra_data['man_relation'] = man_dc
            self.handle_blobs(man_dc, content_data, total_blobs)

        await self.put(tag_dc)
        pb_parsed_tags.increment()

    pb_parsed_tags.state = 'completed'
    pb_parsed_tags.total = pb_parsed_tags.done
    pb_parsed_tags.save()
    pb_parsed_ml_tags.state = 'completed'
    pb_parsed_ml_tags.total = pb_parsed_ml_tags.done
    pb_parsed_ml_tags.save()
    pb_parsed_m_tags.state = 'completed'
    pb_parsed_m_tags.total = pb_parsed_m_tags.done
    pb_parsed_m_tags.save()
    pb_parsed_man.state = 'completed'
    pb_parsed_man.total = pb_parsed_man.done
    pb_parsed_man.save()

    # process blobs for manifests referenced from manifest lists once those manifests
    # have been saved and their futures have resolved
    for manifest_future in asyncio.as_completed(future_manifests):
        man = await manifest_future
        with man._artifacts.get().file.open() as content_file:
            raw = content_file.read()
        content_data = json.loads(raw)
        man_dc = man_dcs[man.digest]
        self.handle_blobs(man_dc, content_data, total_blobs)
    for blob in total_blobs:
        await self.put(blob)

    pb_parsed_blobs.state = 'completed'
    pb_parsed_blobs.total = pb_parsed_blobs.done
    pb_parsed_blobs.save()
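# Hedged illustration (not from the original source): a minimal, self-contained example
# of the asyncio.as_completed() pattern the stage above relies on, which processes each
# tag download as soon as it finishes rather than in submission order. All names here
# (fake_download, the tag strings) are hypothetical.
import asyncio


async def fake_download(tag, delay):
    await asyncio.sleep(delay)
    return tag


async def process_tags():
    to_download = [fake_download('latest', 0.2), fake_download('v1', 0.1)]
    for coroutine in asyncio.as_completed(to_download):
        tag = await coroutine      # yields results in completion order: 'v1', then 'latest'
        print('processed', tag)


asyncio.run(process_tags())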
async def run(self):
    """
    DockerFirstStage.
    """
    future_manifests = []
    tag_list = []
    to_download = []
    man_dcs = {}
    total_blobs = []

    with ProgressBar(message='Downloading tag list', total=1) as pb:
        repo_name = self.remote.namespaced_upstream_name
        relative_url = '/v2/{name}/tags/list'.format(name=repo_name)
        tag_list_url = urljoin(self.remote.url, relative_url)
        list_downloader = self.remote.get_downloader(url=tag_list_url)
        await list_downloader.run(extra_data={'repo_name': repo_name})

        with open(list_downloader.path) as tags_raw:
            tags_dict = json.loads(tags_raw.read())
            tag_list = tags_dict['tags']

        # check for the presence of the pagination link header
        link = list_downloader.response_headers.get('Link')
        await self.handle_pagination(link, repo_name, tag_list)
        whitelist_tags = self.remote.whitelist_tags
        if whitelist_tags:
            tag_list = list(set(tag_list) & set(whitelist_tags.split(',')))
        pb.increment()

    for tag_name in tag_list:
        relative_url = '/v2/{name}/manifests/{tag}'.format(
            name=self.remote.namespaced_upstream_name,
            tag=tag_name,
        )
        url = urljoin(self.remote.url, relative_url)
        downloader = self.remote.get_downloader(url=url)
        to_download.append(downloader.run(extra_data={'headers': V2_ACCEPT_HEADERS}))

    pb_parsed_tags = ProgressBar(message='Processing Tags', state='running')

    for download_tag in asyncio.as_completed(to_download):
        tag = await download_tag
        with open(tag.path, 'rb') as content_file:
            raw_data = content_file.read()
        content_data = json.loads(raw_data)
        media_type = content_data.get('mediaType')
        tag.artifact_attributes['file'] = tag.path
        saved_artifact = Artifact(**tag.artifact_attributes)
        try:
            saved_artifact.save()
        except IntegrityError:
            del tag.artifact_attributes['file']
            saved_artifact = Artifact.objects.get(**tag.artifact_attributes)
        tag_dc = self.create_tag(saved_artifact, tag.url)

        if media_type == MEDIA_TYPE.MANIFEST_LIST:
            list_dc = self.create_tagged_manifest_list(tag_dc, content_data)
            await self.put(list_dc)
            tag_dc.extra_data['man_relation'] = list_dc
            for manifest_data in content_data.get('manifests'):
                man_dc = self.create_manifest(list_dc, manifest_data)
                future_manifests.append(man_dc.get_or_create_future())
                man_dcs[man_dc.content.digest] = man_dc
                await self.put(man_dc)
        else:
            man_dc = self.create_tagged_manifest(tag_dc, content_data, raw_data)
            await self.put(man_dc)
            tag_dc.extra_data['man_relation'] = man_dc
            self.handle_blobs(man_dc, content_data, total_blobs)

        await self.put(tag_dc)
        pb_parsed_tags.increment()

    pb_parsed_tags.state = 'completed'
    pb_parsed_tags.total = pb_parsed_tags.done
    pb_parsed_tags.save()

    for manifest_future in asyncio.as_completed(future_manifests):
        man = await manifest_future
        with man._artifacts.get().file.open() as content_file:
            raw = content_file.read()
        content_data = json.loads(raw)
        man_dc = man_dcs[man.digest]
        self.handle_blobs(man_dc, content_data, total_blobs)
    for blob in total_blobs:
        await self.put(blob)
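# Hedged illustration (not from the original source): how the '/v2/<name>/manifests/<tag>'
# endpoint URL is assembled with urljoin(). The registry URL and repository name below
# are assumed sample values.
from urllib.parse import urljoin

relative_url = '/v2/{name}/manifests/{tag}'.format(name='library/busybox', tag='latest')
url = urljoin('https://registry-1.docker.io', relative_url)
# -> 'https://registry-1.docker.io/v2/library/busybox/manifests/latest'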
async def run(self):
    """
    Build `DeclarativeContent` from the repodata.
    """
    packages_pb = ProgressBar(message='Parsed Packages')
    erratum_pb = ProgressBar(message='Parsed Erratum')

    packages_pb.save()
    erratum_pb.save()

    with ProgressBar(message='Downloading Metadata Files') as metadata_pb:
        downloader = self.remote.get_downloader(
            url=urljoin(self.remote.url, 'repodata/repomd.xml'))
        # TODO: decide how to distinguish between a mirror list and a normal repo
        result = await downloader.run()
        metadata_pb.increment()

        repomd_path = result.path
        repomd = cr.Repomd(repomd_path)
        package_repodata_urls = {}
        downloaders = []

        for record in repomd.records:
            if record.type in PACKAGE_REPODATA:
                package_repodata_urls[record.type] = urljoin(self.remote.url,
                                                             record.location_href)
            elif record.type in UPDATE_REPODATA:
                updateinfo_url = urljoin(self.remote.url, record.location_href)
                downloader = self.remote.get_downloader(url=updateinfo_url)
                downloaders.append([downloader.run()])
            else:
                log.info(_('Unknown repodata type: {t}. Skipped.').format(t=record.type))
                # TODO: skip databases, save unknown types to publish them as-is

        # to preserve order, downloaders are created after all repodata urls are identified
        package_repodata_downloaders = []
        for repodata_type in PACKAGE_REPODATA:
            downloader = self.remote.get_downloader(url=package_repodata_urls[repodata_type])
            package_repodata_downloaders.append(downloader.run())

        downloaders.append(package_repodata_downloaders)

        # asyncio.gather is used to preserve the order of results for package repodata
        pending = [asyncio.gather(*downloaders_group) for downloaders_group in downloaders]

        while pending:
            done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
            for downloader in done:
                results = downloader.result()
                if results[0].url == package_repodata_urls['primary']:
                    primary_xml_path = results[0].path
                    filelists_xml_path = results[1].path
                    other_xml_path = results[2].path
                    metadata_pb.done += 3
                    metadata_pb.save()

                    packages = await RpmFirstStage.parse_repodata(primary_xml_path,
                                                                  filelists_xml_path,
                                                                  other_xml_path)
                    packages_pb.total = len(packages)
                    packages_pb.state = 'running'
                    packages_pb.save()

                    for pkg in packages.values():
                        package = Package(**Package.createrepo_to_dict(pkg))
                        artifact = Artifact(size=package.size_package)
                        checksum_type = getattr(CHECKSUM_TYPES, package.checksum_type.upper())
                        setattr(artifact, checksum_type, package.pkgId)
                        url = urljoin(self.remote.url, package.location_href)
                        filename = os.path.basename(package.location_href)
                        da = DeclarativeArtifact(
                            artifact=artifact,
                            url=url,
                            relative_path=filename,
                            remote=self.remote,
                            deferred_download=self.deferred_download)
                        dc = DeclarativeContent(content=package, d_artifacts=[da])
                        packages_pb.increment()
                        await self.put(dc)

                elif results[0].url == updateinfo_url:
                    updateinfo_xml_path = results[0].path
                    metadata_pb.increment()

                    updates = await RpmFirstStage.parse_updateinfo(updateinfo_xml_path)
                    erratum_pb.total = len(updates)
                    erratum_pb.state = 'running'
                    erratum_pb.save()

                    for update in updates:
                        update_record = UpdateRecord(**UpdateRecord.createrepo_to_dict(update))
                        update_record.digest = RpmFirstStage.hash_update_record(update)

                        future_relations = {'collections': defaultdict(list), 'references': []}
                        for collection in update.collections:
                            coll_dict = UpdateCollection.createrepo_to_dict(collection)
                            coll = UpdateCollection(**coll_dict)

                            for package in collection.packages:
                                pkg_dict = UpdateCollectionPackage.createrepo_to_dict(package)
                                pkg = UpdateCollectionPackage(**pkg_dict)
                                future_relations['collections'][coll].append(pkg)

                        for reference in update.references:
                            reference_dict = UpdateReference.createrepo_to_dict(reference)
                            ref = UpdateReference(**reference_dict)
                            future_relations['references'].append(ref)

                        erratum_pb.increment()
                        dc = DeclarativeContent(content=update_record)
                        dc.extra_data = future_relations
                        await self.put(dc)

    packages_pb.state = 'completed'
    erratum_pb.state = 'completed'
    packages_pb.save()
    erratum_pb.save()
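# Hedged illustration (not from the original source): the gather-inside-wait pattern used
# above. Each group is wrapped in asyncio.gather() so its results keep their submission
# order (primary, filelists, other), while asyncio.wait() lets whole groups finish in any
# order. All names here (fetch, the metadata labels) are hypothetical.
import asyncio


async def fetch(name, delay):
    await asyncio.sleep(delay)
    return name


async def download_metadata():
    pending = {
        asyncio.gather(fetch('primary', 0.3), fetch('filelists', 0.1), fetch('other', 0.2)),
        asyncio.gather(fetch('updateinfo', 0.05)),
    }
    while pending:
        done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
        for group in done:
            print(group.result())  # e.g. ['updateinfo'], then ['primary', 'filelists', 'other']


asyncio.run(download_metadata())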
async def pre_migrate_content(content_model):
    """
    A coroutine to pre-migrate Pulp 2 content.

    Args:
        content_model: Models for content which is being migrated.
    """
    batch_size = 10000
    content_type = content_model.pulp2.type
    pulp2content = []

    # the latest timestamp we have in the migration tool Pulp2Content table for this content type
    content_qs = Pulp2Content.objects.filter(pulp2_content_type_id=content_type)
    last_updated = content_qs.aggregate(Max('pulp2_last_updated'))['pulp2_last_updated__max'] or 0
    _logger.debug('The latest migrated {type} content has {timestamp} timestamp.'.format(
        type=content_type,
        timestamp=last_updated))

    # query only newly created/updated items
    mongo_content_qs = content_model.pulp2.objects(_last_updated__gte=last_updated)
    total_content = mongo_content_qs.count()
    _logger.debug('Total count for {type} content to migrate: {total}'.format(
        type=content_type,
        total=total_content))

    pulp2content_pb = ProgressBar(
        message='Pre-migrating Pulp 2 {} content (general info)'.format(content_type.upper()),
        total=total_content,
        state=TASK_STATES.RUNNING)
    pulp2content_pb.save()
    pulp2detail_pb = ProgressBar(
        message='Pre-migrating Pulp 2 {} content (detail info)'.format(content_type.upper()),
        total=total_content,
        state=TASK_STATES.RUNNING)
    pulp2detail_pb.save()
    existing_count = 0

    for i, record in enumerate(mongo_content_qs.only('id',
                                                     '_storage_path',
                                                     '_last_updated',
                                                     '_content_type_id',
                                                     'downloaded').batch_size(batch_size)):
        if record['_last_updated'] == last_updated:
            # corner case - content with the last ``last_updated`` date might be pre-migrated;
            # check if this content is already pre-migrated
            migrated = Pulp2Content.objects.filter(pulp2_last_updated=last_updated,
                                                   pulp2_id=record['id'])
            if migrated:
                existing_count += 1

                # it has to be updated here and not later, in case all items were migrated before
                # and no new content will be saved.
                pulp2content_pb.total -= 1
                pulp2content_pb.save()
                pulp2detail_pb.total -= 1
                pulp2detail_pb.save()
                continue

        item = Pulp2Content(pulp2_id=record['id'],
                            pulp2_content_type_id=record['_content_type_id'],
                            pulp2_last_updated=record['_last_updated'],
                            pulp2_storage_path=record['_storage_path'],
                            downloaded=record['downloaded'])
        _logger.debug('Add content item to the list to migrate: {item}'.format(item=item))
        pulp2content.append(item)

        # flush on every full batch, and on the last (possibly partial) batch
        save_batch = (i and not (i + 1) % batch_size or i == total_content - 1)
        if save_batch:
            _logger.debug('Bulk save for generic content info, saved so far: {index}'.format(
                index=i + 1))
            pulp2content_batch = Pulp2Content.objects.bulk_create(pulp2content,
                                                                  ignore_conflicts=True)
            content_saved = len(pulp2content_batch) - existing_count
            pulp2content_pb.done += content_saved
            pulp2content_pb.save()

            await content_model.pulp_2to3_detail.pre_migrate_content_detail(pulp2content_batch)
            pulp2detail_pb.done += content_saved
            pulp2detail_pb.save()

            pulp2content = []
            existing_count = 0

    pulp2content_pb.state = TASK_STATES.COMPLETED
    pulp2content_pb.save()
    pulp2detail_pb.state = TASK_STATES.COMPLETED
    pulp2detail_pb.save()
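# Hedged illustration (not from the original source): the batched flush condition used by
# pre_migrate_content(), shown on a plain list instead of a MongoDB queryset. The helper
# name `batched` is hypothetical.
def batched(records, batch_size):
    """Yield ``records`` in lists of at most ``batch_size``, including the final partial batch."""
    batch = []
    total = len(records)
    for i, record in enumerate(records):
        batch.append(record)
        # flush on every full batch, and on the last (possibly partial) batch
        if (i and not (i + 1) % batch_size) or i == total - 1:
            yield batch
            batch = []


# list(batched(list(range(5)), batch_size=2)) -> [[0, 1], [2, 3], [4]]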