Exemplo n.º 1
0
    def find_subdirectory_errors(path):
        """Yield an error for every sub-folder that has no same-named counterpart.

        The entries of ``<path>/preservation`` are compared with the entries of
        ``<path>/access``. Any name that appears on only one side is reported.

        Args:
            path: Folder that contains both a "preservation" and an "access"
                directory.

        Yields:
            error_message.ValidationError: One per unmatched entry, with
                ``group`` and ``source`` both set to ``path``.

        Raises:
            OSError: Propagated from ``os.listdir`` when either directory
                cannot be listed.
        """
        preservation_folder = os.path.join(path, "preservation")
        access_folder = os.path.join(path, "access")
        preservation_folders = set(os.listdir(preservation_folder))
        access_folders = set(os.listdir(access_folder))

        # Set differences are used instead of set.remove(): remove() raises
        # KeyError on the first name missing from the other side, which is
        # precisely the mismatch this function is meant to report.
        # sorted() keeps the order of the reported errors deterministic.

        # Entries found under preservation with no same-named access entry
        for item_left in sorted(preservation_folders - access_folders):
            new_error = error_message.ValidationError(
                "missing matching {} in {}".format(item_left,
                                                   preservation_folder),
                group=path)
            new_error.source = path
            yield new_error

        # Entries found under access with no same-named preservation entry
        for item_left in sorted(access_folders - preservation_folders):
            new_error = error_message.ValidationError(
                "missing matching {} in {}".format(item_left, access_folder),
                group=path)
            new_error.source = path
            yield new_error
Exemplo n.º 2
0
    def check(self, path):
        """Validate the extension and the name of one preservation file.

        Files whose extension is listed in ``self.ignore_extension`` are
        accepted without further checks. Otherwise the extension must be in
        ``self.valid_extensions``, and a ``.tif`` file whose name does not
        contain "target" must match ``PresNamingChecker.valid_naming_scheme``.

        Args:
            path: Path of the file to check.

        Returns:
            checkers.Results carrying this checker's name, the validity flag
            and the list of errors found.
        """
        folder = os.path.dirname(path)
        stem, suffix = os.path.splitext(os.path.basename(path))
        problems = []

        def report(message, group):
            # Every error raised here points back at the file being checked.
            problem = error_message.ValidationError(message, group=group)
            problem.source = path
            problems.append(problem)

        if suffix not in self.ignore_extension:
            if suffix not in self.valid_extensions:
                report(
                    "Invalid preservation file extension: \"{}\"".format(suffix),
                    path)

            # Only non-target tiff images are held to the naming scheme
            needs_name_check = suffix == ".tif" and "target" not in stem
            if needs_name_check and \
                    PresNamingChecker.valid_naming_scheme.match(stem) is None:
                report(
                    "Does not match the valid preservation file naming pattern",
                    folder.split(os.sep)[-1])

        # The file is valid exactly when nothing was reported
        return checkers.Results(self.checker_name(),
                                valid=not problems,
                                errors=problems)
Exemplo n.º 3
0
    def find_root_directory_errors(path: str):
        """Yield an error for each problem found directly inside ``path``.

        The folder is expected to hold exactly two directories, "access" and
        "preservation". Any other directory, any regular file, and any of the
        two required directories that is absent is reported.

        Args:
            path: Folder to inspect.

        Yields:
            error_message.ValidationError: One per problem, with ``group`` and
                ``source`` both set to ``path``.
        """
        still_missing = {"access", "preservation"}

        def make_error(message):
            problem = error_message.ValidationError(message, group=path)
            problem.source = path
            return problem

        for entry in os.scandir(path):
            if entry.is_dir():
                if entry.name in still_missing:
                    # A required directory has been found; tick it off
                    still_missing.remove(entry.name)
                    continue
                yield make_error(
                    "{} is an invalid folder.".format(entry.path))
            elif entry.is_file():
                yield make_error("{} is an invalid file.".format(entry.path))

        # Whatever was never ticked off does not exist in the folder
        for folder in still_missing:
            yield make_error(
                "{} is missing required folder {}".format(path, folder))
Exemplo n.º 4
0
    def check(self, path):
        """Check that access and preservation folders hold matching files.

        NOTE: this takes the whole package, not a plain path, because of the
        way hathi packages are formatted.

        Args:
            path: Package object exposing ``directories`` (a mapping with
                "access" and "preservation" entries) and ``identifier``.

        Returns:
            checkers.Results carrying this checker's name, the validity flag
            and the list of errors found. The result is invalid when files are
            missing from either folder.
        """
        valid = True
        errors = []
        access_folder = path.directories["access"]
        preservation_folder = path.directories["preservation"]

        # Everything in the access folder must also be in preservation
        missing_pres_files = self.check_for_missing_matching_preservation(
            access_folder=access_folder,
            preservation_folder=preservation_folder)
        if missing_pres_files:
            valid = False
            new_error = error_message.ValidationError(
                "The files [{}] were found in the access but not in the preservation folder"
                .format(", ".join(
                    [os.path.basename(f) for f in missing_pres_files])),
                group=path.identifier)
            new_error.source = access_folder
            errors.append(new_error)

        # Everything in the preservation folder must also be in access
        missing_access_files = self.check_for_missing_matching_access(
            access_folder=access_folder,
            preservation_folder=preservation_folder)
        if missing_access_files:
            # Previously this branch recorded the error but left valid True,
            # so the result claimed success while carrying an error.
            valid = False
            new_error = error_message.ValidationError(
                "The files [{}] were found in the preservation folder but not in the access folder"
                .format(", ".join(
                    [os.path.basename(f) for f in missing_access_files])),
                group=path.identifier)
            new_error.source = preservation_folder
            errors.append(new_error)
        return checkers.Results(self.checker_name(),
                                valid=valid,
                                errors=errors)
Exemplo n.º 5
0
    def check(self, path: str):
        """Check that an access folder contains every file it should.

        Two things are verified: that ``self.find_missing_by_number`` reports
        no missing files, and that the required files ``checksum.md5``,
        ``marc.xml`` and ``meta.yml`` each exist somewhere beneath ``path``.

        Args:
            path: Path to the folder to check

        Returns:
            checkers.Results carrying this checker's name, the validity flag
            and the list of errors found.

        """
        required_files = {"checksum.md5", "marc.xml", "meta.yml"}
        # All errors from this check are grouped under the folder's own name
        error_group = path.split(os.sep)[-1]
        errors = []
        valid = True
        try:
            missing = list(self.find_missing_by_number(path))
            if missing:
                valid = False
                new_error = error_message.ValidationError(
                    "Expected files [{}] not found in access folder".format(", ".join(missing)),
                    group=error_group)
                new_error.source = path
                errors.append(new_error)
        except ValueError as e:
            valid = False
            new_error = error_message.ValidationError("Error trying to find missing files. Reason: {}".format(e),
                                                      group=error_group)
            new_error.source = path
            errors.append(new_error)

        # Tick off every required file found anywhere under path
        for _root, _dirs, files in os.walk(path):
            required_files.difference_update(files)

        # Any file still in the required_files set is missing.
        if required_files:
            valid = False
            # sorted() keeps the message stable; group matches the other
            # errors reported by this check.
            new_error = error_message.ValidationError(
                "Missing expected file(s), [{}]".format(
                    ", ".join(sorted(required_files))),
                group=error_group)
            new_error.source = path
            errors.append(new_error)

        return checkers.Results(self.checker_name(), valid=valid, errors=errors)
Exemplo n.º 6
0
    def check(self, path):
        """Check that a preservation folder contains every file it should.

        Two things are verified: that ``self.find_missing_by_number`` reports
        no missing files, and that ``self.find_missing_required_files`` finds
        all four target images (``target_l_001.tif``, ``target_l_002.tif``,
        ``target_r_001.tif``, ``target_r_002.tif``).

        Args:
            path: Path to the preservation folder.

        Returns:
            checkers.Results carrying this checker's name, the validity flag
            and the list of errors found.
        """
        expected_targets = (
            "target_l_001.tif",
            "target_l_002.tif",
            "target_r_001.tif",
            "target_r_002.tif",
        )
        error_group = path.split(os.sep)[-1]
        errors = []

        def report(message):
            # All errors share the folder name as group and path as source
            problem = error_message.ValidationError(message, group=error_group)
            problem.source = path
            errors.append(problem)

        try:
            absent_numbers = list(self.find_missing_by_number(path))
            if absent_numbers:
                report("Expected files [{}] not found in preservation folder"
                       .format(", ".join(absent_numbers)))
        except ValueError as e:
            report("Error trying to find missing files. Reason: {}".format(e))
        except FileNotFoundError as e:
            # The exception itself is handed over as the message
            report(e)

        # The required-file lookup runs even if the numbered check failed
        absent_targets = list(
            self.find_missing_required_files(path=path,
                                             expected_files=expected_targets))
        if absent_targets:
            report("Missing expected file(s), [{}]".format(
                ", ".join(absent_targets)))

        # The folder is valid exactly when nothing was reported
        return checkers.Results(self.checker_name(),
                                valid=not errors,
                                errors=errors)
    def setup(self):
        """Locate the Hathi packages under ``self.path`` and queue their checks.

        Every package produced by ``packages.create_package("Hathi", ...)`` is
        stored in ``self.packages``. A ``packages.PackageError`` marks the
        whole run invalid and is recorded in ``self.errors``. For each package
        collected, one task holding all of its validation processes is pushed
        onto ``self.manager``.
        """
        self.packages = []

        try:
            for found in packages.create_package("Hathi", self.path):
                self.packages.append(found)
        except packages.PackageError as e:
            self.valid = False
            failure = error_message.ValidationError(e, group=self.path)
            failure.source = self.path
            self.errors.append(failure)

        def enqueue(task, processor, target):
            # Prepare the processor, point it at its input, attach it
            processor.setup()
            processor.set_input(target)
            task.add_process(processor)

        # Build one validation task per package
        for hathi_package in self.packages:
            preservation_dir = hathi_package.directories["preservation"]
            label = preservation_dir.split(os.path.sep)[-1]
            task = tasks.Task(
                description="Validating {} in {}".format(label,
                                                         hathi_package.root))

            # Package structure completeness
            enqueue(task,
                    validation_processors.PackageStructureComplete(),
                    hathi_package.root)

            # Package component completeness (takes the package itself)
            enqueue(task,
                    validation_processors.PackageComponentComplete(),
                    hathi_package)

            # Preservation folder completeness
            enqueue(task,
                    validation_processors.PackagePreservationComplete(),
                    hathi_package.directories["preservation"])

            # Access folder completeness
            enqueue(task,
                    validation_processors.PackageAccessComplete(),
                    hathi_package.directories['access'])

            # One naming check per entry of the preservation folder
            for entry in os.scandir(hathi_package.directories["preservation"]):
                enqueue(task,
                        validation_processors.PreservationFileNaming(),
                        entry.path)

            # One naming check per entry of the access folder
            for entry in os.scandir(hathi_package.directories["access"]):
                enqueue(task,
                        validation_processors.AccessFileNaming(),
                        entry.path)

            self.manager.push(task)
Exemplo n.º 8
0
    def check(self, path):
        """Validate the extension and the name of one file.

        Files whose extension is listed in ``self.ignore_extension`` are
        accepted without further checks. Otherwise the extension must be in
        ``self.valid_extensions`` and the base name must match
        ``self.valid_naming_scheme``; each failure is reported separately.

        An earlier draft also restricted ``.xml`` files to ``marc.xml`` and
        ``.yml`` files to ``meta.yml``. Those checks are disabled and are not
        performed here.

        Args:
            path: Path of the file to check.

        Returns:
            checkers.Results carrying this checker's name, the validity flag
            and the list of errors found.
        """
        folder = os.path.dirname(path)
        stem, suffix = os.path.splitext(os.path.basename(path))

        if suffix in self.ignore_extension:
            # Ignored extensions are never inspected
            return checkers.Results(self.checker_name(),
                                    valid=True,
                                    errors=[])

        problems = []

        def report(message, group):
            problem = error_message.ValidationError(message, group=group)
            problem.source = path
            problems.append(problem)

        if suffix not in self.valid_extensions:
            # Grouped under the file's own name
            report("Invalid file type", path.split(os.sep)[-1])

        if self.valid_naming_scheme.match(stem) is None:
            # Grouped under the name of the containing folder
            report(
                "Does not match the valid file pattern for preservation files",
                folder.split(os.sep)[-1])

        # The file is valid exactly when nothing was reported
        return checkers.Results(self.checker_name(),
                                valid=not problems,
                                errors=problems)
Exemplo n.º 9
0
    def check(self, path):
        """Validate the base name of one file against the naming scheme.

        Only files whose extension is listed in ``self.extensions_to_check``
        are inspected; their base name must match
        ``self.valid_naming_scheme``. Every other file is reported as valid.

        Args:
            path: Path of the file to check.

        Returns:
            checkers.Results carrying this checker's name, the validity flag
            and the list of errors found.
        """
        # Errors are grouped under the name of the containing folder
        parent_name = os.path.dirname(path).split(os.sep)[-1]
        stem, suffix = os.path.splitext(os.path.basename(path))
        problems = []

        must_inspect = suffix in self.extensions_to_check
        if must_inspect and self.valid_naming_scheme.match(stem) is None:
            problem = error_message.ValidationError(
                "Does not match the valid file pattern for preservation files",
                group=parent_name)
            problem.source = path
            problems.append(problem)

        # The file is valid exactly when nothing was reported
        return checkers.Results(self.checker_name(),
                                valid=not problems,
                                errors=problems)
Exemplo n.º 10
0
    def setup(self):

        package_searcher = self.profile.get_package_type
        package_searcher.root_path = self.path
        try:
            for package in package_searcher:
                self.packages.append(package)
        except packages.PackageError as e:
            self.valid = False
            new_error = error_message.ValidationError(e, group=self.path)
            new_error.source = self.path
            self.errors.append(new_error)

        for package in self.packages:
            self.manager.push(
                self.profile.create_validate_package_task(package))