예제 #1
0
 def __init__(self, process, ifgs, ifs):
     """
     Create a page alignment from a list of input files.

     The arguments are stored on the instance unchanged, a logger named
     'PageAlignment' is attached and ``align_lines()`` is run right away.

     Args:
         process: stored as ``self.process``
         ifgs: stored as ``self.ifgs`` (presumably the input file groups -- TODO confirm)
         ifs: stored as ``self.ifs`` (presumably the input files -- TODO confirm)
     """
     self.process, self.ifgs, self.ifs = process, ifgs, ifs
     self.log = getLogger('PageAlignment')
     # The alignment is computed eagerly as part of construction.
     self.align_lines()
예제 #2
0
    def add_files_to_mets(self, convention, mets, directory):
        """
        Add files from folder to METS, according to a file structure convention.

        Only the ``'ocrd-gt'`` convention is implemented; for any other value
        nothing is added.

        Args:
            convention (string) : Which file structure convention to adhere to.

                'ocrd-gt'::

                    File in root folder ==> mets:fileGrp @USE == 'OCR-D-IMG'
                    Subfolder name ==> mets:fileGrp @USE
                        'page' => 'OCR-D-OCR-PAGE'
                        'alto' => 'OCR-D-OCR-ALTO'
                        any other name => upper(subfolder name)
                    upper(fileGrp + '_' + file name with '.' replaced by '_') == mets:file @ID
                    Extension ==> mets:file @MIMETYPE, looked up in EXT_TO_MIME
                        (fallback: 'application/octet-stream'; files in the
                        'alto' subfolder are always 'application/alto+xml')

                Only the root folder and its direct subfolders are read;
                files named 'mets.xml' are skipped.

            mets : METS object the files are registered with through ``mets.add_file(...)``.
            directory (string) : Folder to read the files from, with or
                without a trailing path separator.
        """
        log = getLogger('ocrd.resolver.add_files_to_mets')  # pylint: disable=redefined-outer-name
        log.debug("Reading files in '%s' according to '%s' convention",
                  directory, convention)

        if convention != 'ocrd-gt':
            return

        for root, dirs, files in os.walk(directory):
            # Folder being visited, relative to `directory`. relpath (rather
            # than slicing the prefix off `root`) gives the same answer whether
            # or not `directory` ends with a separator and on any platform.
            dirname = os.path.relpath(root, directory)
            if dirname == os.curdir:
                dirname = ''
                fileGrp = 'OCR-D-IMG'
            else:
                # Direct subfolders are the file groups: do not descend further.
                del dirs[:]
                if dirname == 'alto':
                    fileGrp = 'OCR-D-OCR-ALTO'
                elif dirname == 'page':
                    fileGrp = 'OCR-D-OCR-PAGE'
                else:
                    fileGrp = dirname.upper()
            for f in files:
                if f == 'mets.xml':
                    continue
                if dirname == 'alto':
                    mimetype = 'application/alto+xml'
                else:
                    # First matching extension wins; unknown extensions get a generic type.
                    mimetype = next(
                        (EXT_TO_MIME[ext] for ext in EXT_TO_MIME if f.endswith(ext)),
                        'application/octet-stream')
                local_filename = os.path.join(directory, dirname, f)
                x = mets.add_file(
                    fileGrp,
                    mimetype=mimetype,
                    local_filename=local_filename,
                    ID='_'.join([fileGrp, f.replace('.', '_')]).upper(),
                    url='file://' + local_filename,
                )
                log.debug("Added as %s", x)
예제 #3
0
    def download_to_directory(self, directory, url, basename=None, overwrite=False, subdir=None):
        """
        Download a file to the workspace.

        Early Shortcut: If url is a file://-URL and that file is already in the directory, keep it there.

        If basename is not given but subdir is, assume user knows what she's doing and use last URL segment as the basename.
        If basename is not given and no subdir is given, use the alnum characters in the URL as the basename.

        Args:
            directory (string): Directory to download files to
            url (string): URL to download from. ``file://`` URLs are copied, everything else is fetched with ``requests``.
            basename (string, None): basename part of the filename on disk.
            overwrite (boolean): Whether to overwrite existing files with that name
            subdir (string, None): Subdirectory to create within the directory. Think fileGrp.

        Returns:
            Local filename

        Raises:
            Exception: if the HTTP response status is not 200.
        """
        log = getLogger('ocrd.resolver.download_to_directory') # pylint: disable=redefined-outer-name
        log.debug("directory=|%s| url=|%s| basename=|%s| overwrite=|%s| subdir=|%s|", directory, url, basename, overwrite, subdir)
        if basename is None:
            if (subdir is not None) or \
                (directory and url.startswith('file://%s' % directory)): # in case downloading a url 'file:///tmp/foo/bar' to directory '/tmp/foo'
                basename = url.rsplit('/', 1)[-1]
            else:
                basename = safe_filename(url)

        if subdir is not None:
            basename = os.path.join(subdir, basename)

        outfilename = os.path.join(directory, basename)

        if os.path.exists(outfilename) and not overwrite:
            log.debug("File already exists and overwrite=False: %s", outfilename)
            return outfilename

        # os.path.dirname yields '' for a bare file name; splitting on '/' by
        # hand would return the file name itself and a directory would then be
        # created in the place where the file is supposed to go.
        outfiledir = os.path.dirname(outfilename)
        if outfiledir and not os.path.isdir(outfiledir):
            os.makedirs(outfiledir)

        log.debug("Downloading <%s> to '%s'", url, outfilename)
        if url.startswith('file://'):
            copyfile(url[len('file://'):], outfilename)
        else:
            response = requests.get(url)
            if response.status_code != 200:
                raise Exception("Not found: %s (HTTP %d)" % (url, response.status_code))
            with open(outfilename, 'wb') as outfile:
                outfile.write(response.content)

        return outfilename
예제 #4
0
    def download_to_directory(self,
                              directory,
                              url,
                              basename=None,
                              overwrite=False,
                              subdir=None,
                              prefer_symlink=None):
        """
        Download a file to the workspace.

        If basename is not given but subdir is, assume user knows what she's doing and use last URL segment as the basename.
        If basename is not given and no subdir is given, use the alnum characters in the URL as the basename.

        When caching is enabled, a cached copy of the URL is used if there is
        one; otherwise the freshly retrieved file is put into the cache.

        Args:
            directory (string): Directory to download files to
            url (string): URL to download from
            basename (string, None): basename part of the filename on disk.
            overwrite (boolean): Whether to overwrite existing files with that name
            subdir (string, None): Subdirectory to create within the directory. Think fileGrp.
            prefer_symlink (boolean): Whether to use symlinks instead of copying. Overrides self.prefer_symlink

        Returns:
            Local filename

        Raises:
            Exception: if the HTTP response status is not 200.
        """
        log = getLogger('ocrd.resolver.download_to_directory')  # pylint: disable=redefined-outer-name
        if basename is None:
            if subdir is not None:
                basename = url.rsplit('/', 1)[-1]
            else:
                basename = safe_filename(url)

        if subdir is not None:
            basename = os.path.join(subdir, basename)

        outfilename = os.path.join(directory, basename)

        if os.path.exists(outfilename) and not overwrite:
            log.debug("File already exists and overwrite=False: %s",
                      outfilename)
            return outfilename

        # os.path.dirname yields '' for a bare file name; splitting on '/' by
        # hand would return the file name itself and a directory would then be
        # created in the place where the file is supposed to go.
        outfiledir = os.path.dirname(outfilename)
        if outfiledir and not os.path.isdir(outfiledir):
            os.makedirs(outfiledir)

        cached_filename = self.cache.get(url) if self.cache_enabled else False

        if cached_filename:
            log.debug("Found cached version of <%s> at '%s'", url,
                      cached_filename)
            self._copy_or_symlink(cached_filename, outfilename, prefer_symlink)
        else:
            log.debug("Downloading <%s> to '%s'", url, outfilename)
            if url.startswith('file://'):
                self._copy_or_symlink(url[len('file://'):], outfilename,
                                      prefer_symlink)
            else:
                # Check the response before touching the output file: opening
                # it first would leave an empty file behind on failure, which
                # the "already exists" shortcut above would later return.
                response = requests.get(url)
                if response.status_code != 200:
                    raise Exception("Not found: %s (HTTP %d)" %
                                    (url, response.status_code))
                with open(outfilename, 'wb') as outfile:
                    outfile.write(response.content)

        if self.cache_enabled and not cached_filename:
            cached_filename = self.cache.put(url, filename=outfilename)
            log.debug("Stored in cache <%s> at '%s'", url, cached_filename)

        return outfilename
예제 #5
0
import os
from shutil import copyfile
from zipfile import ZipFile
import tempfile
import requests

from ocrd.constants import METS_XML_EMPTY, TMP_PREFIX, EXT_TO_MIME
from ocrd.utils import getLogger, safe_filename
from ocrd.resolver_cache import ResolverCache
from ocrd.workspace import Workspace
from ocrd.model import OcrdMets

# Module-level logger for the resolver.
log = getLogger('ocrd.resolver')
# Process-wide setting: make the tempfile module create its temporary files
# and directories under /tmp instead of the platform/TMPDIR default.
tempfile.tempdir = '/tmp'


class Resolver(object):
    """
    Handle uploads, downloads, repository access and manage temporary
    directories. Files can optionally be cached.

    Args:
        cache_enabled (Boolean): Whether to cache files. If True, the remaining keyword arguments are passed to ~ResolverCache.
        prefer_symlink (Boolean): If True, symlink from cached file to the workspace instead of copying to reduce I/O.
    """
    def __init__(self, cache_enabled=False, prefer_symlink=False, **kwargs):
        """
        Remember the caching/symlink preferences and set up the cache on demand.
        """
        self.cache_enabled = cache_enabled
        self.prefer_symlink = prefer_symlink
        # The cache object only exists when caching was requested.
        if cache_enabled:
            self.cache = ResolverCache(**kwargs)
        else:
            self.cache = None
예제 #6
0
from __future__ import absolute_import

from tesserocr import RIL, PSM, PyTessBaseAPI, get_languages, iterate_level
from ocrd.utils import getLogger, concat_padded, xywh_from_points, points_from_x0y0x1y1
from ocrd.model.ocrd_page import from_file, to_xml, TextEquivType, CoordsType, GlyphType
from ocrd import Processor, MIMETYPE_PAGE
from ocrd_tesserocr.config import TESSDATA_PREFIX, OCRD_TOOL

log = getLogger('processor.TesserocrRecognize')

class TesserocrRecognize(Processor):
    """
    OCR-D processor for text recognition with Tesseract (through tesserocr).
    """

    def __init__(self, *args, **kwargs):
        """
        Inject the tool description and version of
        'ocrd-tesserocr-recognize' before handing over to ``Processor``.
        """
        kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-tesserocr-recognize']
        kwargs['version'] = OCRD_TOOL['version']
        super(TesserocrRecognize, self).__init__(*args, **kwargs)

    def process(self):
        """
        Performs the (text) recognition.

        The model is taken from the 'model' parameter if present, otherwise
        the last installed model is used.

        Raises:
            Exception: if parameter 'textequiv_level' is neither 'line' nor
                'glyph', or if the configured model is not installed.
        """
        # Log the parameters instead of printing them to stdout.
        log.debug("Parameters: %s", self.parameter)
        if self.parameter['textequiv_level'] not in ['line', 'glyph']:
            raise Exception("currently only implemented at the line/glyph level")
        # get_languages() returns (tessdata path, installed models); ask once.
        tessdata_path, installed_models = get_languages()
        if 'model' in self.parameter:
            model = self.parameter['model']
            if model not in installed_models:
                raise Exception("configured model " + model + " is not installed")
        else:
            model = installed_models[-1] # last installed model
        with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=model) as tessapi:
            log.info("Using model %s in %s for recognition", model, tessdata_path)
예제 #7
0
 def __init__(self, *args, **kwargs):
     """
     Set up the aligner processor.

     The 'ocrd-cis-align' tool description and the version reported by
     ``get_ocrd_tool()`` are injected into the keyword arguments before
     they are handed to the parent constructor.
     """
     tool_description = get_ocrd_tool()
     kwargs.update(
         ocrd_tool=tool_description['tools']['ocrd-cis-align'],
         version=tool_description['version'],
     )
     super(Aligner, self).__init__(*args, **kwargs)
     self.log = getLogger('Processor.Aligner')
예제 #8
0
import os

from ocrd.constants import DEFAULT_CACHE_FOLDER

from ocrd.utils import getLogger, safe_filename
log = getLogger('ocrd.cache')

class ResolverCache(object):
    """
    Cache of downloads, based on URL.

    Args:
        cache_directory (string): Where to store cached files

    """

    def __init__(self, cache_directory=DEFAULT_CACHE_FOLDER):
        """
        Instantiate a cache rooted at ``cache_directory``.

        The folder is created (including parents) if it does not exist yet.
        """
        self.directory = cache_directory
        if os.path.isdir(self.directory):
            return
        log.info("Cache directory does not exist, creating: '%s'", self.directory)
        os.makedirs(self.directory)

    def get(self, url):
        """
        Look up the cached copy of ``url``.

        Returns:
            Path of the file named ``safe_filename(url)`` inside the cache
            directory, or ``None`` if there is no such file.
        """
        candidate = os.path.join(self.directory, safe_filename(url))
        return candidate if os.path.exists(candidate) else None

    def put(self, url, filename=None, content=None):
예제 #9
0
 def __init__(self, jar, main, input_str, args):
     """
     Store the description of a Java invocation and attach a logger.

     Args:
         jar: stored as ``self.jar`` (presumably the path of the jar file -- TODO confirm)
         main: stored as ``self.main`` (presumably the main class -- TODO confirm)
         input_str: stored as ``self.input_str`` (presumably fed to the process as input -- TODO confirm)
         args: stored as ``self.args`` (presumably the command line arguments -- TODO confirm)
     """
     self.jar, self.main = jar, main
     self.input_str, self.args = input_str, args
     self.log = getLogger('JavaProcess')
예제 #10
0
 def __init__(self, *args, **kwargs):
     """
     Set up the typegroups classifier processor.

     The 'ocrd-typegroups-classifier' tool description and the version
     from ``OCRD_TOOL`` are injected into the keyword arguments before
     they are handed to the parent constructor.
     """
     kwargs.update(
         ocrd_tool=OCRD_TOOL['tools']['ocrd-typegroups-classifier'],
         version=OCRD_TOOL['version'],
     )
     super(TypegroupsClassifierProcessor, self).__init__(*args, **kwargs)
     self.log = getLogger('ocrd_typegroups_classifier')
예제 #11
0
import os
import json
import subprocess
from deprecated.sphinx import deprecated
from ocrd.utils import getLogger
from ocrd.validator import ParameterValidator

log = getLogger('ocrd.processor')


def _get_workspace(workspace=None,
                   resolver=None,
                   mets_url=None,
                   working_dir=None):
    """
    Return ``workspace`` if one is given, otherwise create one.

    A new workspace is obtained from
    ``resolver.workspace_from_url(mets_url, directory=working_dir)``.

    Raises:
        Exception: if no workspace is given and ``resolver`` or ``mets_url``
            is missing.
    """
    if workspace is not None:
        return workspace
    if resolver is None:
        raise Exception("Need to pass a resolver to create a workspace")
    if mets_url is None:
        raise Exception("Need to pass mets_url to create a workspace")
    return resolver.workspace_from_url(mets_url, directory=working_dir)


def run_processor(
    processorClass,
    ocrd_tool=None,
    mets_url=None,
    resolver=None,
    workspace=None,
    group_id=None,
예제 #12
0
import json
import re

from jsonschema import Draft4Validator, validators  # pylint: disable=import-error

from ocrd.constants import FILE_GROUP_CATEGORIES, FILE_GROUP_PREFIX, OCRD_TOOL_SCHEMA
from ocrd.utils import getLogger

log = getLogger('ocrd.validator')


# http://python-jsonschema.readthedocs.io/en/latest/faq/
def extend_with_default(validator_class):
    """
    Derive a validator class that also fills in schema defaults.

    The returned class is ``validator_class`` extended so that its
    "properties" check first sets every missing property that has a
    "default" in the schema, and then runs the original check.
    """
    stock_properties_check = validator_class.VALIDATORS["properties"]

    def set_defaults(validator, properties, instance, schema):
        """Apply the defaults to ``instance``, then yield the stock check's errors."""
        for name, subschema in properties.items():
            if "default" not in subschema:
                continue
            instance.setdefault(name, subschema["default"])

        stock_errors = stock_properties_check(validator, properties, instance,
                                              schema)
        for error in stock_errors:
            yield error

    return validators.extend(validator_class, {"properties": set_defaults})


# Draft4Validator variant that also fills in "default" values of properties
# on the validated instance (see extend_with_default).
DefaultValidatingDraft4Validator = extend_with_default(Draft4Validator)

#
# -------------------------------------------------
예제 #13
0
from __future__ import absolute_import
import tesserocr
from ocrd.utils import getLogger, concat_padded, points_from_xywh
from ocrd.model.ocrd_page import (ReadingOrderType, RegionRefIndexedType,
                                  TextRegionType, CoordsType, OrderedGroupType,
                                  from_file, to_xml)
from ocrd import Processor, MIMETYPE_PAGE

from ocrd_tesserocr.config import TESSDATA_PREFIX, OCRD_TOOL

log = getLogger('processor.TesserocrSegmentRegion')


class TesserocrSegmentRegion(Processor):
    """
    OCR-D processor for region segmentation with Tesseract (through tesserocr).
    """
    def __init__(self, *args, **kwargs):
        """
        Inject the tool description and version of
        'ocrd-tesserocr-segment-region' before handing over to ``Processor``.
        """
        kwargs['ocrd_tool'] = OCRD_TOOL['tools'][
            'ocrd-tesserocr-segment-region']
        kwargs['version'] = OCRD_TOOL['version']
        super(TesserocrSegmentRegion, self).__init__(*args, **kwargs)

    def process(self):
        """
        Performs the region segmentation.
        """
        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
            # Log the input file group instead of printing it to stdout.
            log.debug("Input file group: %s", self.input_file_grp)
            for (n, input_file) in enumerate(self.input_files):
                # Download the input file and parse it as PAGE XML.
                pcgts = from_file(self.workspace.download_file(input_file))
                image = self.workspace.resolve_image_as_pil(
                    pcgts.get_Page().imageFilename)
                log.debug("Detecting regions with tesseract")
예제 #14
0
import os
import sys
import shutil

import cv2
from PIL import Image
import numpy as np

from ocrd.model import OcrdMets, OcrdExif
from ocrd.utils import getLogger
log = getLogger('ocrd.workspace')


class Workspace(object):
    """
    A workspace is a temporary directory set up for a processor. It's the
    interface to the METS/PAGE XML and delegates download and upload to the
    Resolver.

    Args:

        resolver : Resolver this workspace delegates to; stored as ``self.resolver``.
        directory (string) : Folder to work in
        mets (:class:`OcrdMets`) : OcrdMets representing this workspace. Loaded from 'mets.xml' if ``None``.
        mets_basename (string) : Basename of the METS XML file. Default: 'mets.xml'.
    """
    def __init__(self,
                 resolver,
                 directory,
                 mets=None,
                 mets_basename='mets.xml'):
        self.resolver = resolver
예제 #15
0
파일: binarize.py 프로젝트: kba/ocrd_kraken
from __future__ import absolute_import
import kraken
from ocrd.utils import getLogger, mets_file_id
from ocrd import Processor, OcrdPage, MIMETYPE_PAGE

log = getLogger('processor.KrakenBinarize')


class KrakenBinarize(Processor):
    def process(self):
        """
        Performs the binarization.
        """
        for (n, input_file) in enumerate(self.input_files):
            log.info("INPUT FILE %i / %s", n, input_file)
            self.workspace.download_file(input_file)
            page = OcrdPage.from_file(input_file)
            image_url = page.imageFileName
            log.info("page %s", page)
            for region in page.list_textregions():
                textlines = region.list_textlines()
                log.info("About to binarize %i lines of region '%s'",
                         len(textlines), region.ID)
                for (line_no, line) in enumerate(textlines):
                    log.debug("Binarizing line '%s' in region '%s'", line_no,
                              region.ID)
                    image = self.workspace.resolve_image_as_pil(
                        image_url, line.coords)
                    bin_image = kraken.binarization.nlbin(image)
            '''
            self.add_output_file(
예제 #16
0
from __future__ import absolute_import
from tesserocr import PyTessBaseAPI, RIL
from ocrd import Processor, MIMETYPE_PAGE
from ocrd.utils import getLogger, concat_padded, points_from_xywh, polygon_from_points, xywh_from_points
from ocrd.model.ocrd_page import (CoordsType, TextLineType, from_file, to_xml)

from ocrd_tesserocr.config import TESSDATA_PREFIX, OCRD_TOOL

log = getLogger('processor.TesserocrSegmentLine')


class TesserocrSegmentLine(Processor):
    """
    OCR-D processor for line segmentation with Tesseract (through tesserocr).
    """
    def __init__(self, *args, **kwargs):
        """
        Inject the tool description and version of
        'ocrd-tesserocr-segment-line' before handing over to ``Processor``.
        """
        kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-tesserocr-segment-line']
        kwargs['version'] = OCRD_TOOL['version']
        super(TesserocrSegmentLine, self).__init__(*args, **kwargs)

    def process(self):
        """
        Performs the line segmentation.

        Every input file is downloaded and parsed, and each text region of
        its page is processed in turn.
        """
        with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
            for (n, input_file) in enumerate(self.input_files):
                # Download the input file and parse it as PAGE XML.
                pcgts = from_file(self.workspace.download_file(input_file))
                image_url = pcgts.get_Page().imageFilename
                for region in pcgts.get_Page().get_TextRegion():
                    log.debug("Detecting lines in %s with tesseract",
                              region.id)
                    # NOTE(review): presumably yields the page image restricted
                    # to the region's polygon -- confirm against
                    # Workspace.resolve_image_as_pil.
                    image = self.workspace.resolve_image_as_pil(
                        image_url,
                        polygon_from_points(region.get_Coords().points))
예제 #17
0
from __future__ import absolute_import
from tesserocr import RIL, PyTessBaseAPI, OEM, PSM
from ocrd import Processor, MIMETYPE_PAGE
from ocrd.utils import getLogger, concat_padded, points_from_xywh, polygon_from_points, xywh_from_points
from ocrd.model.ocrd_page import (CoordsType, WordType, from_file, to_xml)

from ocrd_tesserocr.config import TESSDATA_PREFIX, OCRD_TOOL

log = getLogger('processor.TesserocrSegmentWord')


class TesserocrSegmentWord(Processor):
    """
    OCR-D processor for word segmentation with Tesseract (through tesserocr).
    """
    def __init__(self, *args, **kwargs):
        """
        Inject the tool description and version of
        'ocrd-tesserocr-segment-word' before handing over to ``Processor``.
        """
        kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-tesserocr-segment-word']
        kwargs['version'] = OCRD_TOOL['version']
        super(TesserocrSegmentWord, self).__init__(*args, **kwargs)

    def process(self):
        """
        Performs the word segmentation.

        Every input file is downloaded and parsed, and each text line of
        each text region of its page is processed in turn.
        """
        # The API is opened in Tesseract's single-line page segmentation mode.
        with PyTessBaseAPI(
                psm=PSM.SINGLE_LINE,
                path=TESSDATA_PREFIX,
        ) as tessapi:
            for (n, input_file) in enumerate(self.input_files):
                # Download the input file and parse it as PAGE XML.
                pcgts = from_file(self.workspace.download_file(input_file))
                image_url = pcgts.get_Page().imageFilename
                for region in pcgts.get_Page().get_TextRegion():
                    for line in region.get_TextLine():
                        log.debug("Detecting words in line '%s'", line.id)
예제 #18
0
import re

from ocrd.constants import FILE_GROUP_CATEGORIES, FILE_GROUP_PREFIX
from .report import ValidationReport
from ocrd.utils import getLogger

log = getLogger('ocrd.workspace_validator')

#
# -------------------------------------------------
#


class WorkspaceValidator(object):
    """
    Validates an OCR-D/METS workspace against the specs.

    Args:
        resolver (:class:`Resolver`) : Instance of a resolver
        mets_url (string) : URL of the METS file. If ``None``, the workspace
            is opened from ``'<directory>/mets.xml'`` instead.
        directory (string) : Passed on to ``resolver.workspace_from_url``.
    """
    def __init__(self, resolver, mets_url, directory=None):
        self.resolver = resolver
        # NOTE: keeps the value exactly as passed in (possibly None), not the
        # fallback URL computed below.
        self.mets_url = mets_url
        self.report = ValidationReport()
        log.debug('resolver=%s mets_url=%s directory=%s', resolver, mets_url,
                  directory)
        if mets_url is not None:
            workspace_url = mets_url
        else:
            workspace_url = '%s/mets.xml' % directory
        self.workspace = self.resolver.workspace_from_url(
            workspace_url, directory=directory)