Пример #1
0
    def __init__(self, url=None, user=None, password=None, verify=None):
        """Create the wrapped ``Metax`` client and an empty dataset cache.

        :param url: Metax URL
        :param user: Metax user name
        :param password: Metax password
        :param verify: value passed to ``Metax`` as ``verify``; when
            ``None`` it is read from ``METAX_SSL_VERIFICATION`` in
            ``CONFIG`` (default ``True``)
        """
        # NOTE(review): when even one of url/user/password is missing,
        # all three are replaced with the configured values, so any
        # explicitly passed value is discarded -- confirm this is intended.
        credentials_missing = any(
            value is None for value in (url, user, password)
        )
        if credentials_missing:
            url, user, password = (
                CONFIG.get("METAX_URL"),
                CONFIG.get("METAX_USER"),
                CONFIG.get("METAX_PASSWORD"),
            )

        if verify is None:
            verify = CONFIG.get("METAX_SSL_VERIFICATION", True)

        self.client = Metax(url, user, password, verify=verify)

        # Maps dataset_id => preservation_state
        self.dataset_cache = {}
def test_successful_directory_validation_fails(requests_mock):
    """Test that an invalid directory tree is rejected.

    The mocked root directory has no ``directory_path`` attribute, so
    validating ``FILE_METADATA`` must raise
    ``InvalidDatasetMetadataError`` after each directory in the chain
    has been requested from Metax exactly once.

    :returns: ``None``
    """
    # Metax client built from the unit test configuration
    conf = Configuration(tests.conftest.UNIT_TEST_CONFIG_FILE)
    client = Metax(
        conf.get('metax_url'),
        conf.get('metax_user'),
        conf.get('metax_password'),
        verify=conf.getboolean('metax_ssl_verification')
    )

    directories_url = tests.conftest.METAX_URL + '/directories/'

    # Mocked Metax responses for the directory chain
    # first_par -> second_par -> root
    first_par_metadata = {
        'identifier': 'first_par',
        'directory_path': '/second_par/first_par',
        'parent_directory': {'identifier': 'second_par'}
    }
    second_par_metadata = {
        'identifier': 'second_par',
        'directory_path': '/second_par',
        'parent_directory': {'identifier': 'root'}
    }
    # The root directory deliberately lacks 'directory_path'
    root_metadata = {'identifier': 'root'}

    first_par_dir_adapter = requests_mock.get(
        directories_url + 'first_par',
        json=first_par_metadata,
        status_code=200
    )
    second_par_dir_adapter = requests_mock.get(
        directories_url + 'second_par',
        json=second_par_metadata,
        status_code=200
    )
    root_dir_adapter = requests_mock.get(
        directories_url + 'root',
        json=root_metadata,
        status_code=200
    )

    with pytest.raises(InvalidDatasetMetadataError) as exc_info:
        DirectoryValidation(client).is_valid_for_file(FILE_METADATA)

    expected_prefix = ("Validation error in metadata of root: "
                       "'directory_path' is a required property")
    assert str(exc_info.value).startswith(expected_prefix)

    # Every directory is fetched from Metax exactly once
    for adapter in (first_par_dir_adapter,
                    second_par_dir_adapter,
                    root_dir_adapter):
        assert adapter.call_count == 1
def test_verify_file_contained_by_dataset_directories(requests_mock):
    """Test ``DatasetConsistency.is_consistent_for_file``.

    The file's parent directory is a child of ``root_directory``, which
    is listed in the dataset directories, so the consistency check must
    not raise ``InvalidDatasetMetadataError``.

    :returns: ``None``
    """
    # Metax client built from the unit test configuration
    conf = Configuration(tests.conftest.UNIT_TEST_CONFIG_FILE)
    client = Metax(conf.get('metax_url'),
                   conf.get('metax_user'),
                   conf.get('metax_password'),
                   verify=conf.getboolean('metax_ssl_verification'))

    # Dataset that lists no files directly, only the root directory
    research_dataset = {
        'files': [],
        'directories': [{'identifier': 'root_directory'}]
    }
    dataset = {
        'identifier': 'dataset_identifier',
        'research_dataset': research_dataset
    }

    file_metadata = {
        'identifier': 'file_identifier',
        'file_path': "/path/to/file",
        'parent_directory': {'identifier': 'parent_directory_identifier'}
    }

    # Metax reports that the parent directory lives in root_directory
    parent_directory_metadata = {
        'identifier': 'parent_directory_identifier',
        'parent_directory': {'identifier': 'root_directory'}
    }
    requests_mock.get(
        tests.conftest.METAX_URL + '/directories/parent_directory_identifier',
        json=parent_directory_metadata,
        status_code=200
    )

    try:
        consistency = DatasetConsistency(client, dataset)
        consistency.is_consistent_for_file(file_metadata)
    except InvalidDatasetMetadataError as exc:
        pytest.fail(
            '_verify_file_contained_by_dataset raised exception: ' + str(exc)
        )
def test_successful_directory_validation(requests_mock):
    """Test directory validation of a valid directory tree.

    Every mocked directory in the chain first_par -> second_par -> root
    has a ``directory_path``, so validating ``FILE_METADATA`` must
    succeed and each directory must be requested from Metax exactly
    once.

    :returns: ``None``
    """
    # Metax client built from the unit test configuration
    conf = Configuration(tests.conftest.UNIT_TEST_CONFIG_FILE)
    client = Metax(
        conf.get('metax_url'),
        conf.get('metax_user'),
        conf.get('metax_password'),
        verify=conf.getboolean('metax_ssl_verification')
    )

    directories_url = tests.conftest.METAX_URL + '/directories/'

    # Mocked Metax responses for each directory of the chain
    first_par_metadata = {
        'identifier': 'first_par',
        'directory_path': '/second_par/first_par',
        'parent_directory': {'identifier': 'second_par'}
    }
    second_par_metadata = {
        'identifier': 'second_par',
        'directory_path': '/second_par',
        'parent_directory': {'identifier': 'root'}
    }
    root_metadata = {
        'identifier': 'root',
        'directory_path': '/'
    }

    first_par_dir_adapter = requests_mock.get(
        directories_url + 'first_par',
        json=first_par_metadata,
        status_code=200
    )
    second_par_dir_adapter = requests_mock.get(
        directories_url + 'second_par',
        json=second_par_metadata,
        status_code=200
    )
    root_dir_adapter = requests_mock.get(
        directories_url + 'root',
        json=root_metadata,
        status_code=200
    )

    try:
        DirectoryValidation(client).is_valid_for_file(FILE_METADATA)
    except InvalidDatasetMetadataError as exc:
        pytest.fail('test_successful_directory_validation fails: ' + str(exc))

    # Every directory is fetched from Metax exactly once
    for adapter in (first_par_dir_adapter,
                    second_par_dir_adapter,
                    root_dir_adapter):
        assert adapter.call_count == 1
    def get_provenance_ids(self):
        """List identifiers of provenance events.

        Fetches the dataset metadata from Metax, maps the event type of
        each PREMIS event XML file found in the SIP creation directory
        to its event identifier, and then resolves every provenance
        entry of the dataset to the identifier of the event that has
        the same (localized) event type.

        :returns: list of provenance IDs
        """
        config_object = Configuration(self.config)
        metax_client = Metax(
            config_object.get('metax_url'),
            config_object.get('metax_user'),
            config_object.get('metax_password'),
            verify=config_object.getboolean('metax_ssl_verification')
        )
        metadata = metax_client.get_dataset(self.dataset_id)
        languages = get_dataset_languages(metadata)

        # The Luigi task input holds the full path of the reference
        # file; only its file name is needed because the workspace path
        # is given separately.
        reference_file_name = os.path.basename(
            self.input()['create_provenance_information'].path
        )
        references = read_md_references(self.workspace, reference_file_name)
        event_ids = get_md_references(references)

        # event type => event identifier
        event_type_ids = {}
        for event_id in event_ids:
            # The first character of the identifier is not part of the
            # file name
            event_file_path = os.path.join(
                self.sip_creation_path,
                event_id[1:] + "-PREMIS%3AEVENT-amd.xml"
            )
            if os.path.exists(event_file_path):
                root = ET.parse(encode_path(event_file_path)).getroot()
                event_type_elements = root.xpath("//premis:eventType",
                                                 namespaces=NAMESPACES)
                event_type_ids[event_type_elements[0].text] = event_id

        return [
            event_type_ids[
                get_localized_value(
                    provenance["preservation_event"]["pref_label"],
                    languages=languages
                )
            ]
            for provenance in metadata["research_dataset"]["provenance"]
        ]
    def run(self):
        """Read list of required files from Metax and download them.

        Files are downloaded into a temporary directory, each to the
        relative location given by its ``file_path`` in Metax. The
        temporary directory becomes the output target when the
        ``temporary_path`` context exits.

        :raises InvalidFileMetadataError: if the ``file_path`` of a file
            does not resolve to a location strictly inside the download
            directory (for example because it contains ``..`` segments
            or is empty)
        :returns: ``None``
        """
        upload_database = upload_rest_api.database.Database()

        # Find file identifiers from Metax dataset metadata.
        config_object = Configuration(self.config)
        metax_client = Metax(
            config_object.get('metax_url'),
            config_object.get('metax_user'),
            config_object.get('metax_password'),
            verify=config_object.getboolean('metax_ssl_verification'))
        dataset_files = metax_client.get_dataset_files(self.dataset_id)

        # Download files to temporary directory which will be moved to
        # output target path when all files have been downloaded
        with self.output().temporary_path() as temporary_directory:
            os.mkdir(temporary_directory)

            # Normalized download directory with a trailing path
            # separator. A bare string-prefix comparison against the
            # directory name would also accept sibling directories whose
            # name merely starts with the same characters (for example
            # "<dir>-evil/file"), allowing writes outside the download
            # directory.
            download_root = os.path.join(
                os.path.normpath(temporary_directory), '')

            for dataset_file in dataset_files:
                identifier = dataset_file["identifier"]

                # Full path to file
                target_path = os.path.normpath(
                    os.path.join(temporary_directory,
                                 dataset_file["file_path"].strip('/')))
                if not target_path.startswith(download_root):
                    raise InvalidFileMetadataError(
                        'The file path of file %s is invalid: %s' %
                        (identifier, dataset_file["file_path"]))

                # Create the download directory for file if it does not
                # exist already
                target_directory = os.path.dirname(target_path)
                if not os.path.isdir(target_directory):
                    # TODO: Use exist_ok -parameter when moving to
                    # python3
                    os.makedirs(target_directory)

                download_file(dataset_file, target_path, self.config,
                              upload_database)
def test_verify_file_contained_by_dataset_missing_from_dataset(requests_mock):
    """Test ``DatasetConsistency.is_consistent_for_file``.

    The dataset lists neither files nor directories, so the consistency
    check must raise ``InvalidDatasetMetadataError`` with a descriptive
    error message.

    :returns: ``None``
    """
    # Metax client built from the unit test configuration
    conf = Configuration(tests.conftest.UNIT_TEST_CONFIG_FILE)
    client = Metax(conf.get('metax_url'),
                   conf.get('metax_user'),
                   conf.get('metax_password'),
                   verify=conf.getboolean('metax_ssl_verification'))

    # Dataset with empty file and directory listings
    research_dataset = {'files': [], 'directories': []}
    dataset = {
        'identifier': 'dataset_identifier',
        'research_dataset': research_dataset
    }

    file_metadata = {
        'identifier': 'file_identifier',
        'file_path': "/path/to/file",
        'parent_directory': {'identifier': 'parent_directory_identifier'}
    }

    # The mocked parent directory has no parent of its own
    requests_mock.get(
        tests.conftest.METAX_URL + '/directories/parent_directory_identifier',
        json={'identifier': 'parent_directory_identifier'},
        status_code=200
    )

    with pytest.raises(InvalidDatasetMetadataError) as exc_info:
        consistency = DatasetConsistency(client, dataset)
        consistency.is_consistent_for_file(file_metadata)

    expected_message = ("File not found from dataset files nor "
                        "directories: /path/to/file")
    assert str(exc_info.value) == expected_message
    def run(self):
        """Report preservation status to Metax.

        Inspects the path of the ingest report found by the input task.
        If the first path component is ``accepted``, the preservation
        state of the dataset is updated in Metax and the output file is
        written. If it is ``rejected``, or anything else, an exception
        is raised.

        :raises InvalidSIPError: if the report is in ``rejected``
        :raises ValueError: if the report is in any other location
        :returns: ``None``
        """
        # All matching paths found by the input task
        ingest_report_paths = self.input()[0].existing_paths()

        # Exactly one ingest report is expected
        assert len(ingest_report_paths) == 1
        report_path = ingest_report_paths[0]

        # First path component: 'accepted' or 'rejected'
        directory = report_path.split('/')[0]

        if directory == 'rejected':
            # Tell the event handler that the dataset did not pass
            # validation
            raise InvalidSIPError("SIP was rejected")

        if directory != 'accepted':
            raise ValueError(
                'Report was found in incorrect path: %s' % report_path
            )

        # The SIP was accepted: update the preservation state in Metax
        config_object = Configuration(self.config)
        metax_client = Metax(
            config_object.get('metax_url'),
            config_object.get('metax_user'),
            config_object.get('metax_password'),
            verify=config_object.getboolean('metax_ssl_verification')
        )
        metax_client.set_preservation_state(
            self.dataset_id,
            state=DS_STATE_IN_DIGITAL_PRESERVATION,
            system_description='Accepted to preservation'
        )

        with self.output().open('w') as output:
            output.write('Dataset id=' + self.dataset_id)
Пример #9
0
    def run(self):
        """Import datacite metadata of the dataset from Metax.

        Writes the datacite document to ``datacite.xml`` in the
        workspace, runs the siptools ``import_description`` on it in a
        temporary directory, and moves the created files to the SIP
        creation directory.

        :returns: ``None``
        """
        config_object = Configuration(self.config)
        metax_client = Metax(
            config_object.get('metax_url'),
            config_object.get('metax_user'),
            config_object.get('metax_password'),
            verify=config_object.getboolean('metax_ssl_verification')
        )

        # Fetch the datacite document using the identifier found in the
        # dataset metadata
        dataset = metax_client.get_dataset(self.dataset_id)
        datacite = metax_client.get_datacite(dataset['identifier'])

        # Store the datacite document in the workspace
        datacite_path = os.path.join(self.workspace, 'datacite.xml')
        datacite.write(datacite_path)

        # Temporary directories are created under <packaging_root>/tmp/
        tmp_prefix = os.path.join(config_object.get('packaging_root'),
                                  'tmp/')
        with TemporaryDirectory(prefix=tmp_prefix) as scratch_directory:
            # Let siptools create its output files
            import_description.import_description(
                dmdsec_location=datacite_path,
                workspace=scratch_directory,
                without_uuid=True
            )

            reference_file = os.path.join(scratch_directory,
                                          'premis-event-md-references.jsonl')

            # The PREMIS event reference file becomes the output target
            # only after the temporary_path context exits, i.e. once
            # every other file has been moved to the SIP creation
            # directory.
            with self.output().temporary_path() as target_path:
                shutil.move(reference_file, target_path)
                for created_file in os.listdir(scratch_directory):
                    shutil.move(
                        os.path.join(scratch_directory, created_file),
                        self.sip_creation_path
                    )
    def get_identifiers(self):
        """Get file identifiers.

        Lists the identifiers of the dataset files known to Metax
        together with the path of the file cache directory. When Metax
        reports that the dataset is not available, the identifier list
        is empty.

        :returns: Tuple (list of identifiers, cache_path)
        """
        config_object = Configuration(self.config)
        cache_path = os.path.join(config_object.get("packaging_root"),
                                  "file_cache")

        metax_client = Metax(
            config_object.get('metax_url'),
            config_object.get('metax_user'),
            config_object.get('metax_password'),
            verify=config_object.getboolean('metax_ssl_verification')
        )

        try:
            identifiers = [
                dataset_file["identifier"]
                for dataset_file
                in metax_client.get_dataset_files(self.dataset_id)
            ]
        except DatasetNotAvailableError:
            identifiers = []

        return identifiers, cache_path
Пример #11
0
def validate_metadata(dataset_id,
                      config="/etc/siptools_research.conf",
                      dummy_doi="false"):
    """Validate dataset.

    Reads the dataset metadata from Metax and runs the validation steps
    in order: dataset metadata, dataset localization, contract metadata,
    file metadata, and the datacite provided by Metax. An error raised
    by any step propagates to the caller.

    :param dataset_id: dataset identifier
    :param config: configuration file path
    :param dummy_doi: 'true' if dummy preservation identifier is to be
        used
    :returns: ``True``, if dataset metadata is valid.
    """
    configuration = Configuration(config)
    metax_client = Metax(
        configuration.get('metax_url'),
        configuration.get('metax_user'),
        configuration.get('metax_password'),
        verify=configuration.getboolean('metax_ssl_verification')
    )

    # Dataset metadata is fetched once and shared by the checks below
    dataset_metadata = metax_client.get_dataset(dataset_id)

    # Dataset metadata and its localization
    _validate_dataset_metadata(dataset_metadata, dummy_doi=dummy_doi)
    _validate_dataset_localization(dataset_metadata)

    # Contract referred to by the dataset
    contract_id = dataset_metadata['contract']['identifier']
    _validate_contract_metadata(contract_id, metax_client)

    # Metadata of each file in the dataset
    _validate_file_metadata(dataset_metadata, metax_client, configuration)

    # Datacite provided by Metax
    _validate_datacite(dataset_id, metax_client, dummy_doi=dummy_doi)

    return True
    def run(self):
        """Compile all metadata files into METS document.

        The preservation identifier and the contract information needed
        for the METS document are read from Metax. The resulting
        document is written to the output target.

        :returns: ``None``
        """
        config_object = Configuration(self.config)
        metax_client = Metax(
            config_object.get('metax_url'),
            config_object.get('metax_user'),
            config_object.get('metax_password'),
            verify=config_object.getboolean('metax_ssl_verification')
        )
        dataset = metax_client.get_dataset(self.dataset_id)

        # The preservation identifier is used as the METS OBJID
        preservation_id = dataset["preservation_identifier"]

        # Contract identifier and organization name come from the
        # contract that the dataset refers to
        contract = metax_client.get_contract(
            dataset["contract"]["identifier"]
        )
        contract_json = contract["contract_json"]
        contract_identifier = contract_json["identifier"]
        organization_name = contract_json["organization"]["name"]

        # Compile METS
        mets = compile_mets.create_mets(
            workspace=self.sip_creation_path,
            mets_profile='tpas',
            contractid=contract_identifier,
            objid=preservation_id,
            organization_name=organization_name,
            packagingservice='Packaging Service'
        )

        with self.output().open('wb') as outputfile:
            mets.write(outputfile,
                       pretty_print=True,
                       xml_declaration=True,
                       encoding='UTF-8')
def test_dataset_directories_caching_works(requests_mock):
    """Test ``DatasetConsistency.is_consistent_for_file`` caching.

    Checks that dataset directories are cached in ``DatasetConsistency``
    so that no extra calls are made to Metax. The dataset directories
    contain a single entry, ``root_dir``, and two files are checked
    that share the same parent directory chain:

    first_par_dir -> second_par_dir -> root_dir

    :returns: ``None``
    """
    # Metadata shared by both files; only the identifier differs
    common_metadata = {
        'file_path': "/path/to/file1",
        'parent_directory': {'identifier': 'first_par_dir'},
        "checksum": {"algorithm": "md5", "value": "foobar"},
        "file_characteristics": {"file_format": "text/csv"},
        "file_storage": {"identifier": "foobar", "id": 1}
    }
    file_1 = copy.deepcopy(common_metadata)
    file_1['identifier'] = 'file_identifier1'
    file_2 = copy.deepcopy(common_metadata)
    file_2['identifier'] = 'file_identifier2'

    # Metax client built from the unit test configuration
    conf = Configuration(tests.conftest.UNIT_TEST_CONFIG_FILE)
    client = Metax(conf.get('metax_url'),
                   conf.get('metax_user'),
                   conf.get('metax_password'),
                   verify=conf.getboolean('metax_ssl_verification'))

    dataset = {
        'identifier': 'dataset_identifier',
        'research_dataset': {
            'files': [],
            'directories': [{'identifier': 'root_dir'}]
        }
    }

    directories_url = tests.conftest.METAX_URL + '/directories/'
    first_par_dir_adapter = requests_mock.get(
        directories_url + 'first_par_dir',
        json={
            'identifier': 'first_par_dir',
            'parent_directory': {'identifier': 'second_par_dir'}
        },
        status_code=200
    )
    second_par_dir_adapter = requests_mock.get(
        directories_url + 'second_par_dir',
        json={
            'identifier': 'second_par_dir',
            'parent_directory': {'identifier': 'root_dir'}
        },
        status_code=200
    )

    try:
        consistency = DatasetConsistency(client, dataset)
        consistency.is_consistent_for_file(file_1)
        consistency.is_consistent_for_file(file_2)
    except InvalidDatasetMetadataError as exc:
        pytest.fail(
            '_verify_file_contained_by_dataset raised exception: ' + str(exc)
        )

    # With working caching, each parent directory is requested from
    # Metax only once even though two files were checked.
    assert first_par_dir_adapter.call_count == 1
    assert second_par_dir_adapter.call_count == 1