def run(self): """Get dataset 1 from Metax. :returns: ``None`` """ config_object = Configuration(self.config) metax_client = Metax( config_object.get('metax_url'), config_object.get('metax_user'), config_object.get('metax_password'), verify=config_object.getboolean('metax_ssl_verification')) metax_client.get_dataset('1')
def find_file_categories(self):
    """Create logical structure map of dataset files.

    Returns a dictionary with file categories as keys and lists of file
    paths as values.

    :returns: logical structure map dictionary
    """
    config_object = Configuration(self.config)
    metax_client = Metax(
        config_object.get('metax_url'),
        config_object.get('metax_user'),
        config_object.get('metax_password'),
        verify=config_object.getboolean('metax_ssl_verification'))
    dataset_files = metax_client.get_dataset_files(self.dataset_id)
    dataset_metadata = metax_client.get_dataset(self.dataset_id)
    languages = get_dataset_languages(dataset_metadata)
    dirpath2usecategory = get_dirpath_dict(metax_client, dataset_metadata)
    logical_struct = dict()

    for dataset_file in dataset_files:
        file_id = dataset_file['identifier']

        # Get the use category of the file. The path of the file in the
        # logical structmap is stored in 'use_category' in Metax.
        filecategory = find_file_use_category(file_id, dataset_metadata)

        # If a file listed in datasets/<id>/files is not listed in the
        # 'files' section of the dataset metadata, look up the
        # parent_directory of the file in the 'directories' section. The
        # "use_category" of the file is then the "use_category" of its
        # parent directory.
        if filecategory is None:
            name_len = len(dataset_file["file_name"])
            filecategory = find_dir_use_category(
                dataset_file["file_path"][:-name_len],
                dirpath2usecategory,
                languages)

        # If the file category was not found even for the parent
        # directory, raise an error.
        if filecategory is None:
            raise InvalidDatasetMetadataError(
                "File category for file {} was not found".format(file_id))

        # Append the path to the logical_struct[filecategory] list,
        # creating the list if it does not exist yet.
        if filecategory not in logical_struct:
            logical_struct[filecategory] = []
        logical_struct[filecategory].append(dataset_file['file_path'])

    return logical_struct
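# For illustration, the logical structure map returned above maps each use
# category to the file paths that carry it, roughly like this (category names
# and paths are hypothetical, not taken from a real dataset):
#
#     {
#         "documentation": ["/doc/readme.txt"],
#         "source": ["/data/measurements_1.csv", "/data/measurements_2.csv"]
#     }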
def get_provenance_ids(self):
    """List identifiers of provenance events.

    Gets the list of dataset provenance events from Metax, and reads the
    provenance IDs of the events from the event.xml files found in the
    workspace directory.

    :returns: list of provenance IDs
    """
    config_object = Configuration(self.config)
    metax_client = Metax(
        config_object.get('metax_url'),
        config_object.get('metax_user'),
        config_object.get('metax_password'),
        verify=config_object.getboolean('metax_ssl_verification'))
    metadata = metax_client.get_dataset(self.dataset_id)
    languages = get_dataset_languages(metadata)

    # Get the reference file path from the Luigi task input.
    # It already contains the workspace path.
    event_ids = get_md_references(read_md_references(
        self.workspace,
        os.path.basename(
            self.input()['create_provenance_information'].path)
    ))

    event_type_ids = {}
    for event_id in event_ids:
        event_file = event_id[1:] + "-PREMIS%3AEVENT-amd.xml"
        event_file_path = os.path.join(self.sip_creation_path, event_file)
        if not os.path.exists(event_file_path):
            continue
        root = ET.parse(encode_path(event_file_path)).getroot()
        event_type = root.xpath("//premis:eventType",
                                namespaces=NAMESPACES)[0].text
        event_type_ids[event_type] = event_id

    provenance_ids = []
    for provenance in metadata["research_dataset"]["provenance"]:
        event_type = get_localized_value(
            provenance["preservation_event"]["pref_label"],
            languages=languages)
        provenance_ids += [event_type_ids[event_type]]

    return provenance_ids
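# For illustration: each reference ID read from the reference file is mapped
# to a PREMIS event document in the SIP creation directory by stripping its
# leading character and appending the fixed suffix, so a hypothetical
# reference "_e49c2175" would be looked up as "e49c2175-PREMIS%3AEVENT-amd.xml".
# The premis:eventType element of that document then links the reference back
# to a provenance event in the dataset metadata.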
def run(self): """Copy datacite.xml metadatafile from Metax. Creates a METS document that contains dmdSec element with datacite metadata. :returns: ``None`` """ # Get datacite.xml from Metax config_object = Configuration(self.config) metax_client = Metax( config_object.get('metax_url'), config_object.get('metax_user'), config_object.get('metax_password'), verify=config_object.getboolean('metax_ssl_verification')) dataset = metax_client.get_dataset(self.dataset_id) datacite = metax_client.get_datacite(dataset['identifier']) # Write datacite.xml to file datacite_path = os.path.join(self.workspace, 'datacite.xml') datacite.write(datacite_path) tmp = os.path.join(config_object.get('packaging_root'), 'tmp/') with TemporaryDirectory(prefix=tmp) as temporary_workspace: # Create output files with siptools import_description.import_description( dmdsec_location=datacite_path, workspace=temporary_workspace, without_uuid=True) # Move created files to SIP creation directory. PREMIS event # reference file is moved to output target path after # everything else is done. with self.output().temporary_path() as target_path: shutil.move( os.path.join(temporary_workspace, 'premis-event-md-references.jsonl'), target_path) for file_ in os.listdir(temporary_workspace): shutil.move(os.path.join(temporary_workspace, file_), self.sip_creation_path)
def validate_metadata(dataset_id, config="/etc/siptools_research.conf",
                      dummy_doi="false"):
    """Validate dataset.

    Reads dataset metadata, file metadata, and additional techMD XML from
    Metax and validates them against schemas. Raises InvalidDatasetError
    if the dataset is invalid.

    :param dataset_id: dataset identifier
    :param config: configuration file path
    :param dummy_doi: 'true' if a dummy preservation identifier is to be
        used
    :returns: ``True``, if dataset metadata is valid.
    """
    conf = Configuration(config)
    metax_client = Metax(conf.get('metax_url'),
                         conf.get('metax_user'),
                         conf.get('metax_password'),
                         verify=conf.getboolean('metax_ssl_verification'))

    # Get dataset metadata from Metax
    dataset_metadata = metax_client.get_dataset(dataset_id)

    # Validate dataset metadata
    _validate_dataset_metadata(dataset_metadata, dummy_doi=dummy_doi)

    # Validate dataset localization
    _validate_dataset_localization(dataset_metadata)

    # Validate contract metadata
    _validate_contract_metadata(dataset_metadata['contract']['identifier'],
                                metax_client)

    # Validate file metadata for each file in dataset files
    _validate_file_metadata(dataset_metadata, metax_client, conf)

    # Validate datacite provided by Metax
    _validate_datacite(dataset_id, metax_client, dummy_doi=dummy_doi)

    return True
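# A minimal usage sketch (the dataset identifier is hypothetical, and the
# import path of InvalidDatasetError is assumed rather than shown here):
#
#     try:
#         validate_metadata("urn:nbn:fi:att:example-dataset")
#     except InvalidDatasetError as exception:
#         print("Dataset is not valid: {}".format(exception))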
def run(self): """Compile all metadata files into METS document. :returns: ``None`` """ config_object = Configuration(self.config) metax_client = Metax( config_object.get('metax_url'), config_object.get('metax_user'), config_object.get('metax_password'), verify=config_object.getboolean('metax_ssl_verification')) metadata = metax_client.get_dataset(self.dataset_id) # Get preservation_identifier from Metax preservation_id = metadata["preservation_identifier"] # Get contract data from Metax contract_id = metadata["contract"]["identifier"] contract_metadata = metax_client.get_contract(contract_id) contract_identifier = contract_metadata["contract_json"]["identifier"] contract_org_name \ = contract_metadata["contract_json"]["organization"]["name"] # Compile METS mets = compile_mets.create_mets(workspace=self.sip_creation_path, mets_profile='tpas', contractid=contract_identifier, objid=preservation_id, organization_name=contract_org_name, packagingservice='Packaging Service') with self.output().open('wb') as outputfile: mets.write(outputfile, pretty_print=True, xml_declaration=True, encoding='UTF-8')
class MetaxClient(object):
    """Class for handling Metax metadata."""

    def __init__(self, url=None, user=None, password=None, verify=None):
        """Init MetaxClient instances."""
        # If any of the params is not provided, read them from app.config
        if url is None or user is None or password is None:
            url = CONFIG.get("METAX_URL")
            user = CONFIG.get("METAX_USER")
            password = CONFIG.get("METAX_PASSWORD")
        if verify is None:
            verify = CONFIG.get("METAX_SSL_VERIFICATION", True)

        self.client = Metax(url, user, password, verify=verify)
        # dataset_id => preservation_state dict
        self.dataset_cache = {}

    def get_files_dict(self, project):
        """Return dict {fpath: id} of all the files of a given project."""
        return self.client.get_files_dict(project)

    def post_metadata(self, fpaths, root_upload_path, username, storage_id):
        """Generate file metadata and POST it to Metax in 5k chunks.

        :param fpaths: list of files for which to generate the metadata
        :param root_upload_path: root upload directory
        :param username: current user
        :param storage_id: PAS storage identifier in Metax
        :returns: Stripped HTTP response returned by Metax. The success
            list contains successfully generated file metadata in the
            format:

            [
                {
                    "object": {
                        "identifier": identifier,
                        "file_path": file_path,
                        "checksum": {"value": checksum},
                        "parent_directory": {
                            "identifier": identifier
                        }
                    }
                },
                ...
            ]
        """
        database = db.Database()
        project = database.user(username).get_project()
        checksums = database.checksums.get_checksums()

        metadata = []
        responses = []
        i = 0
        for fpath in fpaths:
            metadata.append(_generate_metadata(
                fpath, root_upload_path, project, storage_id, checksums
            ))

            # POST metadata to Metax every 5k steps
            i += 1
            if i % 5000 == 0:
                response = self.client.post_file(metadata)
                responses.append(_strip_metax_response(response))

                # Add created identifiers to Mongo
                if "success" in response and response["success"]:
                    database.store_identifiers(
                        response["success"], root_upload_path, username
                    )
                metadata = []

        # POST remaining metadata
        if metadata:
            response = self.client.post_file(metadata)
            responses.append(_strip_metax_response(response))

            # Add created identifiers to Mongo
            if "success" in response and response["success"]:
                database.store_identifiers(
                    response["success"], root_upload_path, username
                )

        # Merge all responses into one response
        response = {"success": [], "failed": []}
        for metax_response in responses:
            if "success" in metax_response:
                response["success"].extend(metax_response["success"])
            if "failed" in metax_response:
                response["failed"].extend(metax_response["failed"])

        return response

    def delete_metadata(self, project, fpaths):
        """DELETE metadata from Metax.
        :param project: project identifier
        :param fpaths: list of file paths to remove
        :returns: HTTP response returned by Metax
        """
        files_dict = self.client.get_files_dict(project)

        # Retrieve "file -> dataset" association map
        file_ids = [
            file_["identifier"] for file_ in six.itervalues(files_dict)
        ]
        file2datasets = {}
        if file_ids:
            file2datasets = self.client.get_file2dataset_dict(file_ids)

        # Delete metadata if the file exists in fpaths AND it doesn't
        # belong to any dataset
        file_ids_to_delete = []
        for metax_path, file_ in six.iteritems(files_dict):
            path_exists = metax_path in fpaths
            dataset_exists = file2datasets.get(file_["identifier"], None)
            if path_exists and not dataset_exists:
                file_ids_to_delete.append(file_["identifier"])

        if not file_ids_to_delete:
            return {"deleted_files_count": 0}

        return self.client.delete_files(file_ids_to_delete)

    def delete_file_metadata(self, project, fpath, root_upload_path=None,
                             force=False):
        """Delete file metadata from Metax if the file is not associated
        with any dataset.

        If the force parameter is True, the metadata is deleted even if
        the file belongs to a dataset, as long as that dataset has not
        been accepted to preservation.
        """
        self.dataset_cache.clear()
        files_dict = self.client.get_files_dict(project)
        metax_path = get_metax_path(fpath, root_upload_path)

        if metax_path not in files_dict:
            raise MetaxClientError("Metadata not found in Metax")

        file_metadata = files_dict[metax_path]
        if file_metadata["storage_identifier"] != PAS_FILE_STORAGE_ID:
            raise MetaxClientError("Incorrect file storage")
        if not force and self.file_has_dataset(metax_path, files_dict):
            raise MetaxClientError("Metadata is part of a dataset")
        if self.file_has_accepted_dataset(metax_path, files_dict):
            raise MetaxClientError(
                "Metadata is part of an accepted dataset"
            )

        file_id = six.text_type(file_metadata["id"])
        return self.client.delete_file(file_id)

    def delete_all_metadata(self, project, fpath, root_upload_path,
                            force=False):
        """Delete all file metadata from Metax found under the directory
        fpath that is not associated with any dataset and is stored in
        the PAS file storage.

        If the force parameter is True, metadata is deleted even if the
        file belongs to a dataset, as long as that dataset has not been
        accepted to preservation.
        """
        self.dataset_cache.clear()
        files_dict = self.client.get_files_dict(project)
        files_to_delete = {}

        # Iterate through all files under the directory fpath
        for dirpath, _, files in os.walk(fpath):
            for _file in files:
                fpath = os.path.join(dirpath, _file)
                metax_path = get_metax_path(fpath, root_upload_path)

                if metax_path not in files_dict:
                    continue
                storage_id = files_dict[metax_path]["storage_identifier"]
                if storage_id != PAS_FILE_STORAGE_ID:
                    continue

                files_to_delete[metax_path] = files_dict[metax_path]

        if force:
            # Delete metadata for files which don't belong to accepted
            # datasets.
            # FIXME: Deleting all file metadata when 'force' is in use is
            # inefficient at the moment, because each check requires a
            # separate API call.
            file_ids_to_delete = [
                file_["identifier"]
                for metax_path, file_ in six.iteritems(files_to_delete)
                if not self.file_has_accepted_dataset(metax_path, files_dict)
            ]
        else:
            # Delete metadata for files that don't belong to any dataset
            file_ids = [
                file_["identifier"]
                for file_ in six.itervalues(files_to_delete)
            ]

            # Retrieve related datasets in a single bulk operation
            file2datasets = {}
            if file_ids:
                file2datasets = self.client.get_file2dataset_dict(file_ids)

            file_ids_to_delete = [
                file_["identifier"]
                for metax_path, file_ in six.iteritems(files_to_delete)
                if not file2datasets.get(file_["identifier"], None)
            ]

        if not file_ids_to_delete:
            return {"deleted_files_count": 0}

        return self.client.delete_files(file_ids_to_delete)

    def get_all_ids(self, project_list):
        """Get a set of all identifiers of files in any of the projects
        in project_list.
        """
        id_set = set()

        # Iterate all projects
        for project in project_list:
            # Find all identifiers in one project
            files_dict = self.get_files_dict(project)
            project_id_set = {
                _file["identifier"] for _file in files_dict.values()
            }

            # Add the identifiers to id_set
            id_set |= project_id_set

        return id_set

    def file_has_dataset(self, metax_path, files_dict):
        """Check if file belongs to any dataset."""
        if metax_path not in files_dict:
            return False

        file_id = files_dict[metax_path]["id"]
        datasets = self.client.get_file_datasets(file_id)

        return len(datasets) != 0

    def file_has_accepted_dataset(self, metax_path, files_dict):
        """Check if file belongs to a dataset accepted to preservation."""
        if metax_path in files_dict:
            file_id = files_dict[metax_path]["id"]
            dataset_ids = self.client.get_file_datasets(file_id)

            for dataset_id in dataset_ids:
                if dataset_id not in self.dataset_cache:
                    dataset = self.client.get_dataset(dataset_id)
                    self.dataset_cache[dataset_id] = \
                        dataset['preservation_state']

                dataset_state = self.dataset_cache[dataset_id]

                if (DS_STATE_ACCEPTED_TO_DIGITAL_PRESERVATION
                        <= dataset_state
                        <= DS_STATE_IN_DIGITAL_PRESERVATION):
                    return True

        return False
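# A minimal usage sketch of MetaxClient (the project name is hypothetical;
# the constructor falls back to METAX_* values from app.config when no
# arguments are given):
#
#     client = MetaxClient()
#     files_dict = client.get_files_dict("example_project")
#     client.delete_metadata("example_project", list(files_dict))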