Exemplo n.º 1
0
    def discover_task_metadata(
        self, initial_results: Sequence[Any], additional_data,
        **user_args: Union[str, bool]
    ) -> List[Dict[str, Union[str, Collection[str]]]]:
        """Create a list of metadata that the jobs will need in order to work.

        One job is produced for every entry of the user-selected directory
        that is accepted by ``self.filter_bib_id_folders``.

        Args:
            initial_results: Not used by this method.
            additional_data: Not used by this method.
            **user_args: User supplied options. The input directory, the
                identifier type and the optional 955/035 field toggles are
                read from here.

        Returns:
            list of dictionaries of job metadata

        Raises:
            MissingConfiguration: If the global settings hold no value for
                "getmarc_server_url".

        """
        server_url = self.global_settings.get("getmarc_server_url")
        if server_url is None:
            raise MissingConfiguration("getmarc_server_url")

        # Scan inside a context manager so the directory handle is released
        # right away, even when building one of the jobs raises part way
        # through the listing.
        with os.scandir(user_args[OPTION_USER_INPUT]) as entries:
            return [{
                "directory": {
                    "value": folder.name,
                    "type": user_args[IDENTIFIER_TYPE],
                },
                "enhancements": {
                    # Both enhancements are off unless the user enabled them.
                    "955": user_args.get(OPTION_955_FIELD, False),
                    "035": user_args.get(OPTION_035_FIELD, False)
                },
                "api_server": server_url,
                "path": folder.path
            } for folder in filter(self.filter_bib_id_folders, entries)]
Exemplo n.º 2
0
    def discover_task_metadata(self,
                               initial_results: List[
                                   speedwagon.tasks.Result],
                               additional_data: Dict[str, Any],
                               **user_args: str) -> List[dict]:
        """Build one OCR job description for every image in the results.

        Args:
            initial_results: Results whose ``data`` attribute is iterated for
                image file paths.
            additional_data: Not used by this method.
            **user_args: User options; "Language" is matched against the
                values of ``ocr.LANGUAGE_CODES``.

        Returns:
            One dictionary per image file holding the source file path, the
            destination directory, the output text file name and the
            language code.

        Raises:
            MissingConfiguration: If ``self.tessdata_path`` is set but does
                not exist on disk.
            ValueError: If no entry of ``ocr.LANGUAGE_CODES`` has the
                requested language as its value.
        """
        path_is_usable = \
            self.tessdata_path is None or os.path.exists(self.tessdata_path)
        if not path_is_usable:
            raise MissingConfiguration("tessdata_path")

        jobs: List[dict] = []
        not_found = object()
        every_image = (
            image for result in initial_results for image in result.data
        )

        for source in every_image:
            directory, file_name = os.path.split(source)
            stem = os.path.splitext(file_name)[0]
            text_file = f"{stem}.txt"

            # The first key whose value equals the requested language wins.
            requested = user_args["Language"]
            matching_codes = (
                code
                for code, name in ocr.LANGUAGE_CODES.items()
                if name == requested
            )
            language_code = next(matching_codes, not_found)
            if language_code is not_found:
                raise ValueError(
                    f"Unable to look up language code for {requested}"
                )

            jobs.append({
                "source_file_path": source,
                "destination_path": directory,
                "output_file_name": text_file,
                "lang_code": language_code
            })
        return jobs
Exemplo n.º 3
0
    def discover_task_metadata(self, initial_results: List[Any],
                               additional_data, **user_args) -> List[dict]:
        """Create the OCR job metadata for every image in the results.

        Args:
            initial_results: Results whose ``data`` attribute is iterated for
                image file paths.
            additional_data: Not used by this method.
            **user_args: User options; "Language" is matched against the
                values of ``ocr.LANGUAGE_CODES``.

        Returns:
            list of dictionaries, one per image file, with the source file
            path, destination directory, output file name and language code.

        Raises:
            MissingConfiguration: If ``self.tessdata_path`` is unset or does
                not exist on disk.
            ValueError: If the requested language has no matching code.
        """
        # os.path.exists(None) raises TypeError, so an unset path has to be
        # reported as missing configuration explicitly.
        if self.tessdata_path is None or \
                not os.path.exists(self.tessdata_path):
            raise MissingConfiguration("tessdata_path")

        new_tasks = []

        for result in initial_results:
            for image_file in result.data:
                image_path = os.path.dirname(image_file)
                base_name = os.path.splitext(os.path.basename(image_file))[0]
                ocr_file_name = "{}.txt".format(base_name)

                # Reverse lookup: the mapping is keyed by language code and
                # the user supplies the language name stored as the value.
                language = user_args["Language"]
                for code, name in ocr.LANGUAGE_CODES.items():
                    if name == language:
                        language_code = code
                        break
                else:
                    raise ValueError("Unable to look up language code "
                                     "for {}".format(language))

                new_tasks.append({
                    "source_file_path": image_file,
                    "destination_path": image_path,
                    "output_file_name": ocr_file_name,
                    "lang_code": language_code
                })
        return new_tasks
Exemplo n.º 4
0
    def __init__(self, *args, **kwargs) -> None:
        """Set up the workflow and build its user-facing description.

        Args:
            *args: Forwarded unchanged to the parent initializer.
            **kwargs: Forwarded unchanged to the parent initializer. The
                optional "global_settings" mapping is read for the
                "tessdata" directory path.

        Raises:
            MissingConfiguration: If no "tessdata" value is configured.
        """
        super().__init__(*args, **kwargs)

        # An explicit ``global_settings=None`` is handled like an absent one,
        # so the lookup below ends in MissingConfiguration instead of an
        # AttributeError on None.
        settings = kwargs.get('global_settings')
        self.global_settings = {} if settings is None else settings

        self.tessdata_path = self.global_settings.get("tessdata")
        if self.tessdata_path is None:
            raise MissingConfiguration(
                "Required setting not configured: tessdata")

        if not os.path.exists(self.tessdata_path):
            # makedirs also creates missing parent directories, and
            # exist_ok=True tolerates another process creating the directory
            # between the check above and this call.
            os.makedirs(self.tessdata_path, exist_ok=True)

        engine_version = ocr.Engine(self.tessdata_path).get_version()
        description = (
            "Create OCR text files for images. \n"
            "\n"
            "Settings: \n"
            "    Path: Path containing tiff or jp2 files. \n"
            "    Image File Type: The type of Image file to use.\n"
            "\n"
            "\n"
            "Adding Additional Languages:\n"
            "    To modify the available languages, place "
            "Tesseract traineddata files for "
            f"version {engine_version} "
            "into the following directory:\n"
            "\n"
            f"{self.tessdata_path}.\n"
            "\n"
            "Note:\n"
            "    It's important to use the correct version of the "
            "traineddata files. Using incorrect versions won't crash the "
            "program but they may produce unexpected results.\n"
            "\n"
            "For more information about these files, go to "
            "https://github.com/tesseract-ocr/tesseract/wiki/Data-Files\n"
        )
        self.set_description(description)
Exemplo n.º 5
0
    def discover_task_metadata(self, initial_results: Sequence[Any],
                               additional_data,
                               **user_args) -> List[Dict[Any, Any]]:
        """Create a list of metadata that the jobs will need in order to work.

        One job is produced for every entry of the "Input" directory that is
        accepted by ``self.filter_bib_id_folders``.

        Args:
            initial_results: Not used by this method.
            additional_data: Not used by this method.
            **user_args: User supplied options. "Input" is the directory that
                gets scanned and "Identifier type" is copied into each job.

        Returns:
            list of dictionaries of job metadata

        Raises:
            MissingConfiguration: If the global settings hold no value for
                "getmarc_server_url".

        """
        server_url = self.global_settings.get("getmarc_server_url")
        if server_url is None:
            raise MissingConfiguration("getmarc_server_url")

        # Scan inside a context manager so the directory handle is released
        # right away, even when building one of the jobs raises part way
        # through the listing.
        with os.scandir(user_args["Input"]) as entries:
            return [
                {
                    "identifier": {
                        "value": folder.name,
                        "type": user_args['Identifier type'],
                    },
                    "api_server": server_url,
                    "path": folder.path
                }
                for folder in filter(self.filter_bib_id_folders, entries)
            ]
Exemplo n.º 6
0
    def __init__(self,
                 global_settings: Optional[Dict[str, str]] = None) -> None:
        """Initialize the workflow and verify its required settings.

        Args:
            global_settings:
                Optional settings mapping. When given, it replaces
                ``self.global_settings`` before the required keys are
                checked.

        Raises:
            MissingConfiguration: If any key listed in
                ``GenerateMarcXMLFilesWorkflow.required_settings_keys`` has
                no value in ``self.global_settings``.
        """
        super().__init__()

        settings_were_given = global_settings is not None
        if settings_were_given:
            self.global_settings = global_settings

        required_keys = GenerateMarcXMLFilesWorkflow.required_settings_keys
        for setting_name in required_keys:
            if self.global_settings.get(setting_name) is not None:
                continue
            # The first required key without a value aborts construction.
            raise MissingConfiguration(f"Missing value for {setting_name}")