def _compress_archive_tar(in_path: Path, out_path: Path):
    """Create a new gzip-compressed tar from a folder."""
    with get_progress_reporter()(total=1, desc='Compressing to tar') as progress:
        _callback = create_callback(progress)
        with TarPath(out_path, mode='w:gz', dereference=True) as path:
            path.puttree(in_path, check_exists=False, callback=_callback, cb_descript='Compressing to tar')
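# A minimal usage sketch (not part of the original module): compress a
# throwaway folder with ``_compress_archive_tar`` and verify that the result
# is a readable tar whose members can be listed through ``TarPath``. The
# helper name and folder layout are hypothetical; ``tarfile`` and ``TarPath``
# are assumed to be imported as elsewhere in this file.
def _example_compress_roundtrip(tmp_path: Path):
    src = tmp_path / 'src'
    src.mkdir()
    (src / 'data.txt').write_text('hello')

    out = tmp_path / 'export.tar.gz'
    _compress_archive_tar(src, out)

    # the output should be a valid (gzipped) tarball with at least one member
    assert tarfile.is_tarfile(str(out))
    with TarPath(out, mode='r:*') as tar_path:
        assert list(tar_path.glob('**/*'))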
def _extract_archive(self, filepath: Path, callback: Callable[[str, Any], None]):
    """Extract the archive contents to ``filepath``, wrapping unreadable files in ``CorruptArchive``."""
    try:
        TarPath(self.filepath, mode='r:*', pax_format=tarfile.PAX_FORMAT).extract_tree(
            filepath, allow_dev=False, allow_symlink=False, callback=callback
        )
    except tarfile.ReadError as error:
        raise CorruptArchive(f'The input file cannot be read: {error}') from error
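# A hedged, standalone sketch of the extraction step above (hypothetical
# helper name): extract an archive into a temporary folder and load its
# ``metadata.json``, wrapping unreadable tars in ``CorruptArchive`` in the
# same way as ``_extract_archive``. ``tempfile`` and ``json`` are assumed to
# be imported as elsewhere in this file.
def _example_read_metadata(archive: Path) -> dict:
    out_path = Path(tempfile.mkdtemp())
    try:
        TarPath(archive, mode='r:*', pax_format=tarfile.PAX_FORMAT).extract_tree(
            out_path, allow_dev=False, allow_symlink=False
        )
    except tarfile.ReadError as error:
        raise CorruptArchive(f'The input file cannot be read: {error}') from error
    return json.loads((out_path / 'metadata.json').read_text('utf8'))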
def open(self):  # pylint: disable=attribute-defined-outside-init
    self.assert_within_context()
    # create a temporary folder in which to perform the write
    self._temp_path: Path = Path(tempfile.mkdtemp())
    # open a gzipped tarfile in write mode to export to
    self._archivepath: TarPath = TarPath(self._temp_path / 'export', mode='w:gz', dereference=True)
    # setup data to store
    self._data: Dict[str, Any] = {
        'node_attributes': {},
        'node_extras': {},
        'export_data': {},
        'links_uuid': [],
        'groups_uuid': {},
    }
def _extract(self, *, path_prefix: str, callback: Callable[[str, Any], None] = null_callback):
    self.assert_within_context()
    assert self._sandbox is not None  # required by mypy
    try:
        TarPath(self.filename, mode='r:*').joinpath(path_prefix).extract_tree(
            self._sandbox.abspath,
            allow_dev=False,
            allow_symlink=False,
            callback=callback,
            cb_descript='Extracting repository files',
        )
    except tarfile.ReadError as error:
        raise CorruptArchive(f'The input file cannot be read: {error}') from error
    except NotADirectoryError as error:
        raise CorruptArchive(f'Unable to find required folder in archive: {error}') from error
def test_glob_all_tar(tmp_path):
    """Test that the `**/*` pattern matches the full member list of the tar."""
    for name in ("a", "b", "c"):
        tmp_path.joinpath(name).touch()
    tmp_path.joinpath("d").mkdir()
    tmp_path.joinpath("e").joinpath("f").mkdir(parents=True)
    for name in ("x", "y", "z"):
        tmp_path.joinpath("e").joinpath("f").joinpath(name).touch()
    with tarfile.TarFile(tmp_path / "archive.tar", "w") as tar_file:
        for path in tmp_path.glob("**/*"):
            if path.name in ("archive.tar", "e"):
                continue
            tar_file.add(str(path), path.relative_to(tmp_path).as_posix(), recursive=False)
        namelist = sorted(n.rstrip("/") for n in tar_file.getnames())
    with TarPath(tmp_path / "archive.tar") as zpath:
        assert sorted(p.at for p in zpath.glob("**/*", include_virtual=False)) == namelist
def _migrate(filename_archive, version_old, version_new, migration_method, archive_kwargs=None):
    """Migrate one of the archives from `aiida-export-migration-tests`.

    :param filename_archive: the relative file name of the archive
    :param version_old: version of the archive
    :param version_new: version to migrate to
    :param migration_method: the migration method that should convert between version_old and version_new
    :param archive_kwargs: optional keyword arguments passed to `get_archive_file`, defaulting to the
        external `aiida-export-migration-tests` archives
    :return: the migrated metadata and data as a tuple
    """
    archive_path = get_archive_file(
        filename_archive, **(archive_kwargs or {
            'filepath': 'archives',
            'external_module': 'aiida-export-migration-tests'
        })
    )
    # ``tmp_path`` is expected to be provided by the enclosing (fixture) scope
    out_path = tmp_path / 'out.aiida'

    if zipfile.is_zipfile(archive_path):
        ZipPath(archive_path).extract_tree(out_path)
    elif tarfile.is_tarfile(archive_path):
        TarPath(archive_path).extract_tree(out_path)
    else:
        raise ValueError('invalid file format, expected either a zip archive or gzipped tarball')

    folder = CacheFolder(out_path)
    _, old_metadata = folder.load_json('metadata.json')
    verify_metadata_version(old_metadata, version=version_old)

    migration_method(folder)

    _, metadata = folder.load_json('metadata.json')
    verify_metadata_version(metadata, version=version_new)
    _, data = folder.load_json('data.json')

    return metadata, data
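# A hedged usage sketch (assumes ``_migrate`` is available to the test, e.g.
# via its enclosing fixture, and that the external ``export_v0.3.aiida``
# archive is available as in the tests below): run a single migration step and
# inspect the returned dicts. The test name is hypothetical.
def _example_migrate_v3_to_v4():
    metadata, data = _migrate('export_v0.3.aiida', '0.3', '0.4', migrate_v3_to_v4)
    # ``_migrate`` already verifies the version bump internally; the returned
    # metadata/data can then be checked for the schema changes of interest
    assert 'links_uuid' in data
    assert 'all_fields_info' in metadata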
def test_illegal_create_links(external_archive, tmp_path):
    """Test illegal create links from workchain are detected and removed from exports using v0.3"""
    # Initialization
    dirpath_archive = get_archive_file('export_v0.3.aiida', **external_archive)
    known_illegal_links = 2

    out_path = tmp_path / 'aiida.out'

    # Migrate
    if zipfile.is_zipfile(dirpath_archive):
        ZipPath(dirpath_archive).extract_tree(out_path)
    elif tarfile.is_tarfile(dirpath_archive):
        TarPath(dirpath_archive).extract_tree(out_path)
    else:
        raise ValueError('invalid file format, expected either a zip archive or gzipped tarball')

    try:
        data = json.loads((out_path / 'data.json').read_text('utf8'))
    except IOError:
        raise NotExistent(f'export archive does not contain the required file {out_path}')

    # Check that the illegal create links are present in the original archive file
    links_count = len(data['links_uuid'])
    links_count_migrated = links_count - known_illegal_links

    workfunc_uuids = {
        value['uuid']
        for value in data['export_data']['Node'].values()
        if value['type'].startswith('calculation.function') or value['type'].startswith('calculation.work')
    }
    violations = []
    for link in data['links_uuid']:
        if link['input'] in workfunc_uuids and link['type'] == 'createlink':
            violations.append(link)
    assert len(violations) == known_illegal_links, (
        f'{known_illegal_links} illegal create links were expected, instead {len(violations)} was/were found'
    )

    # Migrate to v0.4
    folder = CacheFolder(out_path)
    migrate_v3_to_v4(folder)
    _, data = folder.load_json('data.json')

    # Check that the illegal create links were removed
    assert len(data['links_uuid']) == links_count_migrated, (
        f"{links_count_migrated} links were expected, instead {len(data['links_uuid'])} was/were found"
    )

    workfunc_uuids = {
        value['uuid']
        for value in data['export_data']['Node'].values()
        if value['node_type'].find('WorkFunctionNode') != -1 or value['node_type'].find('WorkChainNode') != -1
    }
    violations = []
    for link in data['links_uuid']:
        if link['input'] in workfunc_uuids and link['type'] == 'create':
            violations.append(link)
    assert len(violations) == 0, f'0 illegal links were expected, instead {len(violations)} was/were found'
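# The zip-or-tar dispatch used above is repeated verbatim in several of these
# tests; a minimal sketch of a shared helper (hypothetical name) that could
# replace those copies. ``zipfile``, ``tarfile``, ``ZipPath`` and ``TarPath``
# are assumed to be imported as elsewhere in this file.
def _example_extract_any_archive(archive_path, out_path):
    """Extract a zip archive or (gzipped) tarball to ``out_path``."""
    if zipfile.is_zipfile(archive_path):
        ZipPath(archive_path).extract_tree(out_path)
    elif tarfile.is_tarfile(archive_path):
        TarPath(archive_path).extract_tree(out_path)
    else:
        raise ValueError('invalid file format, expected either a zip archive or gzipped tarball')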
def test_migrate_external(external_archive, tmp_path):
    """Test migration for file containing complete v0.3 era possibilities"""
    # Get metadata.json and data.json as dicts from v0.3 file archive
    dirpath_archive = get_archive_file('export_v0.3.aiida', **external_archive)

    out_path = tmp_path / 'aiida.out'

    # Migrate
    if zipfile.is_zipfile(dirpath_archive):
        ZipPath(dirpath_archive).extract_tree(out_path)
    elif tarfile.is_tarfile(dirpath_archive):
        TarPath(dirpath_archive).extract_tree(out_path)
    else:
        raise ValueError('invalid file format, expected either a zip archive or gzipped tarball')

    try:
        metadata = json.loads((out_path / 'metadata.json').read_text('utf8'))
        data = json.loads((out_path / 'data.json').read_text('utf8'))
    except IOError:
        raise NotExistent(f'export archive does not contain the required file {out_path}')

    verify_metadata_version(metadata, version='0.3')

    # Save pre-migration info
    links_count_org = len(data['links_uuid'])
    work_uuids = {
        value['uuid']
        for value in data['export_data']['Node'].values()
        if value['type'].startswith('calculation.function') or value['type'].startswith('calculation.work')
    }
    illegal_links = []
    for link in data['links_uuid']:
        if link['input'] in work_uuids and link['type'] == 'createlink':
            illegal_links.append(link)

    # Migrate to v0.4
    folder = CacheFolder(out_path)
    migrate_v3_to_v4(folder)
    _, metadata = folder.load_json('metadata.json')
    _, data = folder.load_json('data.json')

    verify_metadata_version(metadata, version='0.4')

    ## The following checks are based on the archive file,
    ## which means there are more legal entities; they are simply not relevant here.

    # Check schema changes
    new_node_attrs = {'node_type', 'process_type'}
    for change in new_node_attrs:
        # data.json
        for node in data['export_data']['Node'].values():
            assert change in node, f"'{change}' not found for {node}"
        # metadata.json
        assert change in metadata['all_fields_info']['Node'], f"'{change}' not found in metadata.json for Node"

    # Check Node types
    legal_node_types = {
        'data.float.Float.', 'data.int.Int.', 'data.dict.Dict.', 'data.code.Code.',
        'data.structure.StructureData.', 'data.folder.FolderData.', 'data.remote.RemoteData.', 'data.upf.UpfData.',
        'data.array.ArrayData.', 'data.array.bands.BandsData.', 'data.array.kpoints.KpointsData.',
        'data.array.trajectory.TrajectoryData.', 'process.workflow.workchain.WorkChainNode.',
        'process.calculation.calcjob.CalcJobNode.'
    }
    legal_process_types = {'', 'aiida.calculations:quantumespresso.pw'}
    for node in data['export_data']['Node'].values():
        assert node['node_type'] in legal_node_types, (
            f"{node['node_type']} is not a legal node_type. Legal node types: {legal_node_types}"
        )
        assert node['process_type'] in legal_process_types, (
            f"{node['process_type']} is not a legal process_type. Legal process types: {legal_process_types}"
        )

    # Check links
    # Make sure the two illegal create links were removed during the migration
    assert len(data['links_uuid']) == links_count_org - 2, (
        'Two of the original {} links should have been removed during the migration, '
        'instead there are now {} links'.format(links_count_org, len(data['links_uuid']))
    )
    legal_link_types = {'unspecified', 'create', 'return', 'input_calc', 'input_work', 'call_calc', 'call_work'}
    for link in data['links_uuid']:
        assert link['type'] in legal_link_types
    for link in illegal_links:
        assert link not in data['links_uuid'], f'{link} should not be in the migrated archive file'

    # Check Groups
    # There is one Group in the archive file, it is a user group
    updated_attrs = {'label', 'type_string'}
    legal_group_type = {'user'}
    for attr in updated_attrs:
        # data.json
        for group in data['export_data']['Group'].values():
            assert attr in group, f'{attr} not found in Group {group}'
            assert group['type_string'] in legal_group_type, (
                f"{group['type_string']} is not a legal Group type_string"
            )
        # metadata.json
        assert attr in metadata['all_fields_info']['Group'], f'{attr} not found in metadata.json'

    # Check node_attributes*
    calcjob_nodes = []
    process_nodes = []
    for node_id, content in data['export_data']['Node'].items():
        if content['node_type'] == 'process.calculation.calcjob.CalcJobNode.':
            calcjob_nodes.append(node_id)
        elif content['node_type'].startswith('process.'):
            process_nodes.append(node_id)

    mandatory_updated_calcjob_attrs = {'resources', 'parser_name'}
    optional_updated_calcjob_attrs = {'custom_environment_variables': 'environment_variables'}
    updated_process_attrs = {'process_label'}
    fields = {'node_attributes', 'node_attributes_conversion'}
    for field in fields:
        for node_id in calcjob_nodes:
            for attr in mandatory_updated_calcjob_attrs:
                assert attr in data[field][node_id], (
                    f"Updated attribute name '{attr}' not found in {field} for node_id: {node_id}"
                )
            for old, new in optional_updated_calcjob_attrs.items():
                assert old not in data[field][node_id], (
                    "Old attribute '{}' found in {} for node_id: {}. "
                    "It should now be updated to '{}' or not exist".format(old, field, node_id, new)
                )
        for node_id in process_nodes:
            for attr in updated_process_attrs:
                assert attr in data[field][node_id], (
                    f"Updated attribute name '{attr}' not found in {field} for node_id: {node_id}"
                )

    # Check TrajectoryData
    # There should be at least one TrajectoryData in the archive file
    trajectorydata_nodes = []
    for node_id, content in data['export_data']['Node'].items():
        if content['node_type'] == 'data.array.trajectory.TrajectoryData.':
            trajectorydata_nodes.append(node_id)

    updated_attrs = {'symbols'}
    fields = {'node_attributes', 'node_attributes_conversion'}
    for field in fields:
        for node_id in trajectorydata_nodes:
            for attr in updated_attrs:
                assert attr in data[field][node_id], (
                    f"Updated attribute name '{attr}' not found in {field} for TrajectoryData node_id: {node_id}"
                )

    # Check Computer
    removed_attrs = {'enabled'}
    for attr in removed_attrs:
        # data.json
        for computer in data['export_data']['Computer'].values():
            assert attr not in computer, f"'{attr}' should have been removed from Computer {computer['name']}"
        # metadata.json
        assert attr not in metadata['all_fields_info']['Computer'], (
            f"'{attr}' should have been removed from Computer in metadata.json"
        )

    # Check new entities
    new_entities = {'Log', 'Comment'}
    fields = {'all_fields_info', 'unique_identifiers'}
    for entity in new_entities:
        for field in fields:
            assert entity in metadata[field], f'{entity} not found in {field} in metadata.json'

    # Check extras
    # Dicts with keys and values equal to node_id and {} should be present,
    # which means they should have the same length as data['export_data']['Node'] or 'node_attributes*'
    attrs_count = len(data['node_attributes'])
    new_fields = {'node_extras', 'node_extras_conversion'}
    for field in new_fields:
        assert field in data, f"New field '{field}' not found in data.json"
        assert len(data[field]) == attrs_count, (
            f"New field '{field}' found to have only {len(data[field])} entries, "
            f'but should have had {attrs_count} entries'
        )