def work(self) -> bool:
        """Validate the files listed in the package's ``checksum.md5``.

        The report is looked up directly inside ``self.package_path``.
        Validation findings, plus any error recorded because a file could
        not be found, are stored with ``self.set_results``.

        Returns:
            False when a PermissionError interrupts the validation (the
            results then hold only the permission error), True otherwise.
        """
        report_file = os.path.join(self.package_path, "checksum.md5")
        findings: typing.List[hathi_result.Result] = []

        validate_logger = logging.getLogger(hathi_validate.__name__)
        validate_logger.setLevel(logging.INFO)

        with self.log_config(validate_logger):
            summary = hathi_result.SummaryDirector(source=report_file)

            try:
                # Only the file names are needed here, to report how many
                # entries the checksum report contains.
                listed_files = [
                    listed_name
                    for _, listed_name
                    in validate_process.extracts_checksums(report_file)
                ]

                self.log(
                    "Validating checksums of the {} files "
                    "included in {}".format(len(listed_files), report_file)
                )

                failures = validate_process.run_validation(
                    validator.ValidateChecksumReport(
                        self.package_path, report_file
                    )
                )
                if failures:
                    findings.extend(failures)
                else:
                    self.log(
                        "All checksums in {} successfully validated".format(
                            report_file
                        )
                    )
            except FileNotFoundError as missing:
                summary.add_error(
                    "Unable to validate checksums. Reason: {}".format(missing)
                )
            except PermissionError as denied:
                # A permission failure is reported against the package
                # itself and ends the task early.
                denied_summary = hathi_result.SummaryDirector(
                    source=self.package_path
                )
                denied_summary.add_error(
                    "Permission issues. \"{}\"".format(denied)
                )
                self.set_results(denied_summary.construct())
                return False

            findings.extend(summary.construct())
            self.set_results(findings)
        return True
Exemplo n.º 2
0
    def work(self) -> bool:
        """Check the package files against its ``checksum.md5`` report.

        The outcome is stored with ``self.set_results``: the validation
        findings followed by any error recorded because a file was missing.

        Returns:
            False when a PermissionError stops the validation (only the
            permission error is stored in that case), True otherwise.
        """
        checksum_report = os.path.join(self.package_path, "checksum.md5")
        collected: List[hathi_result.Result] = []

        validation_logger = logging.getLogger(hathi_validate.__name__)
        validation_logger.setLevel(logging.INFO)

        with self.log_config(validation_logger):
            summary = hathi_result.SummaryDirector(source=checksum_report)

            try:
                # Gather the listed file names so their count can be logged.
                files_to_check = []
                for _, file_name in \
                        validate_process.extracts_checksums(checksum_report):
                    files_to_check.append(file_name)

                self.log(
                    f"Validating checksums of the {len(files_to_check)} files "
                    f"included in {checksum_report}")

                report_validator = validator.ValidateChecksumReport(
                    self.package_path,
                    checksum_report
                )
                failures: List[hathi_result.Result] = \
                    validate_process.run_validation(report_validator)

                if failures:
                    collected.extend(failures)
                else:
                    self.log(
                        f"All checksums in {checksum_report} successfully "
                        f"validated")
            except FileNotFoundError as file_missing_error:
                summary.add_error("Unable to validate checksums. "
                                  f"Reason: {file_missing_error}")
            except PermissionError as permission_error:
                # Permission problems are reported against the package and
                # abort the task.
                denied_summary = hathi_result.SummaryDirector(
                    source=self.package_path)
                denied_summary.add_error(
                    f'Permission issues. "{permission_error}"')
                self.set_results(denied_summary.construct())
                return False

            collected.extend(summary.construct())
            self.set_results(collected)
        return True
    def work(self) -> bool:
        """Validate the ``marc.xml`` file located in the package directory.

        A missing file is only logged as skipped. Validation findings are
        logged one by one and stored with ``self.set_results``.

        Returns:
            False when a PermissionError interrupts the validation (the
            results then hold only the permission error), True otherwise.
        """
        marc_file = os.path.join(self.package_path, "marc.xml")
        summary = hathi_result.SummaryDirector(source=marc_file)
        found: typing.List[hathi_result.Result] = []

        validate_logger = logging.getLogger(hathi_validate.__name__)
        validate_logger.setLevel(logging.INFO)

        with self.log_config(validate_logger):
            try:
                if os.path.exists(marc_file):
                    self.log(
                        "Validating marc.xml in {}".format(self.package_path)
                    )

                    marc_errors = validate_process.run_validation(
                        validator.ValidateMarc(marc_file)
                    )

                    if marc_errors:
                        for marc_error in marc_errors:
                            self.log(marc_error.message)
                            found.append(marc_error)
                    else:
                        self.log("{} successfully validated".format(marc_file))
                else:
                    self.log(
                        "Skipping \'{}\' due to file not found".format(
                            marc_file
                        )
                    )
            except FileNotFoundError as missing:
                summary.add_error(
                    "Unable to Validate Marc. Reason: {}".format(missing)
                )
            except PermissionError as denied:
                # Permission problems are reported against the package and
                # abort the task.
                denied_summary = hathi_result.SummaryDirector(
                    source=self.package_path
                )
                denied_summary.add_error(
                    "Permission issues. \"{}\"".format(denied)
                )
                self.set_results(denied_summary.construct())
                return False

            found.extend(summary.construct())
            self.set_results(found)
        return True
    def work(self) -> bool:
        """Look for missing component files in the package directory.

        Files whose names match ``^[0-9]{8}$`` are checked for ``.txt`` and
        ``.jp2`` companions, plus ``.xml`` when ``self.check_ocr`` is set.
        Findings are logged and stored with ``self.set_results``.

        Returns:
            False when the validation raises FileNotFoundError or
            PermissionError (a single explanatory error is stored), True
            otherwise.
        """
        found: typing.List[hathi_result.Result] = []
        component_extensions = [".txt", ".jp2"]
        validate_logger = logging.getLogger(hathi_validate.__name__)
        validate_logger.setLevel(logging.INFO)

        with self.log_config(validate_logger):
            if self.check_ocr:
                component_extensions.append(".xml")

            try:
                missing_components = validate_process.run_validation(
                    validator.ValidateComponents(
                        self.package_path,
                        "^[0-9]{8}$",
                        *component_extensions
                    )
                )
            except FileNotFoundError:
                failure_text = \
                    "No files located with expected file naming scheme in path"
            except PermissionError as denied:
                failure_text = "Permission issues. \"{}\"".format(denied)
            else:
                failure_text = None

            # Both failure modes are reported the same way: one error against
            # the package path, then the task stops.
            if failure_text is not None:
                failure_summary = hathi_result.SummaryDirector(
                    source=self.package_path
                )
                failure_summary.add_error(failure_text)
                self.set_results(failure_summary.construct())
                return False

            if missing_components:
                for component_error in missing_components:
                    self.log(component_error.message)
                    found.append(component_error)
            else:
                self.log(
                    "Found no missing component files in {}".format(
                        self.package_path
                    )
                )
            self.set_results(found)
        return True
    def work(self) -> bool:
        """Validate the OCR files found in the package directory.

        Every validation finding is logged and stored with
        ``self.set_results``; when there are none, a confirmation message is
        logged instead.

        Returns:
            False when a PermissionError interrupts the validation (the
            results then hold only the permission error), True otherwise.

        Raises:
            Exception: Any other error raised by the validation is logged
                and re-raised unchanged.
        """
        errors: typing.List[hathi_result.Result] = []
        my_logger = logging.getLogger(hathi_validate.__name__)
        my_logger.setLevel(logging.INFO)

        with self.log_config(my_logger):
            # Report progress through the task log rather than stdout, as the
            # rest of the messages in this method do.
            self.log("Running ocr Validation")
            try:
                ocr_errors = validate_process.run_validation(
                    validator.ValidateOCRFiles(path=self.package_path))

            except PermissionError as e:
                report_builder = hathi_result.SummaryDirector(
                    source=self.package_path)
                report_builder.add_error("Permission issues. \"{}\"".format(e))
                self.set_results(report_builder.construct())
                return False

            except Exception as e:
                self.log(str(e))
                raise

            if not ocr_errors:
                # Only claim success when nothing was reported, and include
                # the package path in the message.
                self.log("No validation errors found in {}".format(
                    self.package_path))
            else:
                for error in ocr_errors:
                    self.log(error.message)
                    errors.append(error)
            self.set_results(errors)
        return True
Exemplo n.º 6
0
    def work(self) -> bool:
        """Validate the ``meta.yml`` file located in the package directory.

        A missing file is only logged as skipped. Validation findings are
        logged one by one and stored with ``self.set_results``, together
        with any error recorded because a file could not be found.

        Returns:
            Always True.
        """
        yml_file = os.path.join(self.package_path, "meta.yml")
        collected: List[hathi_result.Result] = []
        validation_logger = logging.getLogger(hathi_validate.__name__)
        validation_logger.setLevel(logging.INFO)

        with self.log_config(validation_logger):
            summary = hathi_result.SummaryDirector(source=yml_file)

            try:
                if os.path.exists(yml_file):
                    self.log(f"Validating meta.yml in {self.package_path}")

                    yml_validator = validator.ValidateMetaYML(
                        yaml_file=yml_file,
                        path=self.package_path,
                        required_page_data=True)
                    issues = validate_process.run_validation(yml_validator)

                    if issues:
                        for issue in issues:
                            self.log(issue.message)
                            collected.append(issue)
                    else:
                        self.log(f"{yml_file} successfully validated")
                else:
                    self.log(f"Skipping '{yml_file}' due to file not found")
            except FileNotFoundError as file_not_found_error:
                summary.add_error(
                    f"Unable to validate YAML. Reason: {file_not_found_error}")

            collected.extend(summary.construct())
            self.set_results(collected)
        return True
Exemplo n.º 7
0
def find_errors_marc(filename) -> result.ResultSummary:
    """Validate a MARC XML file against the bundled MARC XSD schema.

    Args:
        filename: Path of the MARC XML file to check.

    Returns:
        Summary built with ``filename`` as its source. It holds an error
        when the file is missing, is not well-formed XML, or does not
        validate against the schema.

    """
    summary_builder = result.SummaryDirector(source=filename)

    xsd = etree.XML(xml_schemes.MARC_XSD)  # type: ignore
    scheme = etree.XMLSchema(xsd)
    try:
        with open(filename, "r", encoding="utf8") as f:
            raw_data = f.read()
        # lxml rejects str input that carries an XML encoding declaration
        # (it raises ValueError), so hand the parser bytes instead, the same
        # way the OCR validation does.
        doc = etree.fromstring(raw_data.encode("utf8"))
        if not scheme.validate(doc):  # type: ignore
            summary_builder.add_error("Unable to validate")
    except FileNotFoundError:
        summary_builder.add_error("File missing")
    except etree.XMLSyntaxError as e:
        summary_builder.add_error("Syntax error: {}".format(e))
    return summary_builder.construct()
    def work(self) -> bool:
        """Report any extra subdirectories found in the package directory.

        Findings are logged one by one and stored with
        ``self.set_results``.

        Returns:
            False when a PermissionError interrupts the validation (the
            results then hold only the permission error), True otherwise.
        """
        found: typing.List[hathi_result.Result] = []
        validate_logger = logging.getLogger(hathi_validate.__name__)
        validate_logger.setLevel(logging.INFO)

        with self.log_config(validate_logger):
            try:
                subdirectory_errors = validate_process.run_validation(
                    validator.ValidateExtraSubdirectories(
                        path=self.package_path))
            except PermissionError as denied:
                # Permission problems are reported against the package and
                # abort the task.
                denied_summary = hathi_result.SummaryDirector(
                    source=self.package_path)
                denied_summary.add_error(
                    "Permission issues. \"{}\"".format(denied))
                self.set_results(denied_summary.construct())
                return False

            if subdirectory_errors:
                for subdirectory_error in subdirectory_errors:
                    self.log(subdirectory_error.message)
                    found.append(subdirectory_error)
            else:
                self.log("No extra subdirectories found in {}".format(
                    self.package_path))

            self.set_results(found)
        return True
Exemplo n.º 9
0
def step_impl(context):
    """Store a summary without any errors on the behave context.

    Args:
        context (behave.runner.Context): Receives the constructed summary
            as ``context.summary``.
    """
    context.summary = result.SummaryDirector(source="spam_source").construct()
Exemplo n.º 10
0
def find_errors_meta(filename, path, require_page_data=True):
    """Check a meta.yml file for missing or invalid metadata.

    Args:
        filename: Path of the YAML file to parse.
        path: Directory in which the files named under ``pagedata`` are
            looked up.
        require_page_data: When true, the ``pagedata`` section is checked
            as well.

    Returns:
        The summary constructed from every problem recorded for
        ``filename``.

    """
    def capture_date_problems(metadata):
        value = metadata["capture_date"]
        if isinstance(value, datetime.datetime):
            return
        if not isinstance(value, str):
            yield "Invalid YAML data type for in capture_date"
            return
        # A string the YAML parser did not turn into a datetime is still
        # accepted when it matches DATE_REGEX.
        if DATE_REGEX.fullmatch(value) is None:
            yield "Invalid YAML capture_date {}".format(value)

    def capture_agent_problems(metadata):
        agent = metadata["capture_agent"]
        if not isinstance(agent, str):
            yield "Invalid YAML capture_agent: {}".format(agent)

    def pagedata_problems(metadata):
        for image_name, _ in metadata["pagedata"].items():
            if not os.path.exists(os.path.join(path, image_name)):
                yield "The pagedata {} contains an nonexistent file {}".format(
                    filename, image_name)

    # The checks run in this order; the first missing key stops the rest.
    checks = [capture_date_problems, capture_agent_problems]
    if require_page_data:
        checks.append(pagedata_problems)

    summary_builder = result.SummaryDirector(source=filename)
    try:
        yml_metadata = parse_yaml(filename=filename)

        try:
            for check in checks:
                for message in check(yml_metadata):
                    summary_builder.add_error(message)
        except KeyError as missing_key:
            summary_builder.add_error("{} is missing key, {}".format(
                filename, missing_key))
    except yaml.YAMLError as yaml_error:
        summary_builder.add_error("Unable to read {}. Reason:{}".format(
            filename, yaml_error))
    except FileNotFoundError as missing_file:
        summary_builder.add_error("Missing {}".format(missing_file))
    return summary_builder.construct()
Exemplo n.º 11
0
def step_impl(context):
    """Store a summary holding four "missing file" errors on the context.

    Args:
        context (behave.runner.Context): Receives the constructed summary
            as ``context.summary``.
    """
    messages = (
        "Missing 0001.xml",
        "Missing 0002.xml",
        "Missing 0003.xml",
        "Missing 0004.xml",
    )
    summary_builder = result.SummaryDirector(source="spam_source")
    for message in messages:
        summary_builder.add_error(message)
    context.summary = summary_builder.construct()
Exemplo n.º 12
0
def find_non_utf8_characters(file_path: str) -> result.ResultSummary:
    """Report every line of a file that cannot be decoded as UTF-8.

    Args:
        file_path: Path of the file to scan; it is read in binary mode.

    Returns:
        Summary with one error per offending line, numbered from 1.
    """
    summary = result.SummaryDirector(source=file_path)
    with open(file_path, "rb") as binary_file:
        for line_number, raw_line in enumerate(binary_file, start=1):
            try:
                raw_line.decode("utf-8", errors="strict")
            except UnicodeDecodeError as decode_error:
                summary.add_error(
                    "Line {} contains illegal characters. Details: {}".format(
                        line_number, decode_error))

    return summary.construct()
Exemplo n.º 13
0
def find_errors_ocr(path) -> result.ResultSummary:
    """Validate the XML files in a directory against the ALTO schema.

    Every regular file directly inside ``path`` with an ``.xml`` extension
    is checked, except files whose base name is ``marc`` (both compared
    case-insensitively).

    Args:
        path: Directory to scan for ALTO XML files.

    Returns:
        Summary with an error for each file that is missing, is not
        well-formed XML, or does not validate against the schema.

    """
    def is_alto_candidate(entry):
        if not entry.is_file():
            return False
        stem, suffix = os.path.splitext(entry.name)
        return suffix.lower() == ".xml" and stem.lower() != "marc"

    logger = logging.getLogger(__name__)
    alto_scheme = etree.XMLSchema(etree.XML(xml_schemes.get_scheme("alto")))

    summary_builder = result.SummaryDirector(source=path)
    candidates = (
        entry for entry in os.scandir(path) if is_alto_candidate(entry)
    )
    for xml_file in candidates:
        try:
            with open(xml_file.path, "r", encoding="utf8") as xml_handle:
                xml_text = xml_handle.read()

            document = etree.fromstring(xml_text.encode("utf8"))

            if alto_scheme.validate(document):
                logger.info("{} validates to the ALTO XML scheme".format(
                    xml_file.name))
            else:
                summary_builder.add_error(
                    "{} does not validate to ALTO scheme".format(
                        xml_file.name))

        except FileNotFoundError:
            summary_builder.add_error("File missing")
        except etree.XMLSyntaxError as syntax_error:
            summary_builder.add_error("Syntax error: {}".format(syntax_error))
    return summary_builder.construct()
Exemplo n.º 14
0
def find_extra_subdirectory(path) -> result.ResultSummary:
    """Report every subdirectory found directly inside a directory.

    Args:
        path: Directory to scan.

    Returns:
        Summary with one "Extra subdirectory" error per directory entry.

    """
    summary_builder = result.SummaryDirector(source=path)
    subdirectories = (entry for entry in os.scandir(path) if entry.is_dir())
    for subdirectory in subdirectories:
        summary_builder.add_error(
            "Extra subdirectory {}".format(subdirectory.name))
    return summary_builder.construct()
Exemplo n.º 15
0
def find_failing_checksums(path, report) -> result.ResultSummary:
    """Compare the md5 hash of each listed file with a checksum report.

    Args:
        path: Directory containing the files named in the report.
        report: Checksum report listing the expected hash of each file.

    Returns:
        Summary with one error per file whose hash differs or that cannot
        be found, plus a "File missing" error when reading the report
        itself raises FileNotFoundError.

    """
    logger = logging.getLogger(__name__)
    summary = result.SummaryDirector(source=path)
    try:
        for expected_hash, filename in extracts_checksums(report):
            logger.debug(
                "Calculating the md5 checksum hash for {}".format(filename))
            target = os.path.join(path, filename)
            try:
                actual_hash = calculate_md5(filename=target)
                if is_same_hash(actual_hash, expected_hash):
                    logger.info(
                        "{} successfully matches md5 hash in {}".format(
                            filename, os.path.basename(report)))
                    continue

                logger.debug(
                    'Hash mismatch for "{}". (Actual ({}): expected ({}))'.
                    format(target, actual_hash, expected_hash))
                summary.add_error(
                    "Checksum listed in \"{}\" doesn't match for \"{}\"".
                    format(os.path.basename(report), filename))
            except FileNotFoundError:
                # The same text goes to the log and into the summary.
                missing_text = \
                    "Unable to run checksum for missing file, {}".format(
                        filename)
                logger.info(missing_text)
                summary.add_error(missing_text)
    except FileNotFoundError:
        summary.add_error("File missing")
    return summary.construct()
Exemplo n.º 16
0
def find_missing_files(path: str) -> result.ResultSummary:
    """Check that the expected package files exist inside a directory.

    The files looked for are ``checksum.md5``, ``marc.xml`` and
    ``meta.yml``.

    Args:
        path: Directory that should contain the files.

    Returns:
        Summary with one "Missing file" error per file that is absent.

    """
    required_names = ("checksum.md5", "marc.xml", "meta.yml")

    summary_builder = result.SummaryDirector(source=path)
    for required_name in required_names:
        if os.path.exists(os.path.join(path, required_name)):
            continue
        summary_builder.add_error("Missing file: {}".format(required_name))
    return summary_builder.construct()
Exemplo n.º 17
0
def multiple_summary():
    """Build a summary for ``eggs_source`` that holds two errors."""
    builder = result.SummaryDirector(source="eggs_source")
    for message in ("Some Error", "Another Error"):
        builder.add_error(message)
    return builder.construct()
Exemplo n.º 18
0
def simple_summary():
    """Build a summary for ``spam_source`` that holds a single error."""
    builder = result.SummaryDirector(source="spam_source")
    builder.add_error("Not valid")
    return builder.construct()