Example #1
import os
import shutil
import zipfile
from pathlib import Path

import h5py
import numpy as np
import toml
from boltons.cacheutils import cachedproperty
from tensorflow.keras.utils import to_categorical

from text_recognizer.datasets.dataset import _download_raw_dataset, Dataset, _parse_args

SAMPLE_TO_BALANCE = True

RAW_DATA_DIRNAME = Dataset.data_dirname() / 'raw' / 'emnist'
METADATA_FILENAME = RAW_DATA_DIRNAME / 'metadata.toml'

PROCESSED_DATA_DIRNAME = Dataset.data_dirname() / 'processed' / 'emnist'
PROCESSED_DATA_FILENAME = PROCESSED_DATA_DIRNAME / 'byclass.h5'

ESSENTIALS_FILENAME = Path(__file__).parents[0].resolve() / 'emnist_essentials.json'


class EmnistDataset(Dataset):
    """
    "The EMNIST dataset is a set of handwritten character digits derived from the NIST Special Database 19
    and converted to a 28x28 pixel image format and dataset structure that directly matches the MNIST dataset."
    From https://www.nist.gov/itl/iad/image-group/emnist-dataset
Example #2
"""Emnist Lines dataset: synthetic handwriting lines dataset made from EMNIST characters."""
import os
from collections import defaultdict
from pathlib import Path

import h5py
import numpy as np
from tensorflow.keras.utils import to_categorical

from text_recognizer.datasets.dataset import Dataset
from text_recognizer.datasets.emnist_dataset import EmnistDataset


DATA_DIRNAME = Dataset.data_dirname() / "processed" / "emnist_lines"
ESSENTIALS_FILENAME = Path(__file__).parents[0].resolve() / "emnist_lines_essentials.json"


class EmnistLinesDataset(Dataset):
    """
    EmnistLinesDataset class.

    Parameters
    ----------
    max_length
        Max line length in characters.
    max_overlap
        Max overlap between characters in a line.
    num_train
        Number of training examples to generate.
    num_test
        Number of test examples to generate.
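The excerpt cuts off here; a brief usage sketch based on the docstring above (load_or_generate_data appears in this project's Dataset interface, but the exact attributes populated afterwards are assumptions):

dataset = EmnistLinesDataset(max_length=34, max_overlap=0.33, num_train=10000, num_test=1000)
dataset.load_or_generate_data()
print(dataset)  # __repr__ typically summarizes the generated train/test shapes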
Example #3
from pathlib import Path

import numpy as np

from text_recognizer.datasets.dataset import Dataset

DATA_DIRNAME = Dataset.data_dirname() / 'processed' / 'emnist_lines'
ESSENTIALS_FILENAME = Path(__file__).parents[0].resolve() / 'emnist_lines_essentials.json'


class EmnistLinesDataset(Dataset):
    def __init__(self,
                 max_length: int = 34,
                 max_overlap: float = 0.33,
                 num_train: int = 10000,
                 num_test: int = 1000):
        pass

    @property
    def data_filename(self):
        pass

    def load_or_generate_data(self):
        pass

    def __repr__(self):
        pass

    def _load_data(self):
        pass
Example #4
import sys
import zipfile
from pathlib import Path

from boltons.cacheutils import cachedproperty
from tensorflow.keras.utils import to_categorical
import h5py
import numpy as np
import toml

sys.path.append(r"C:\Users\bcche\fsdl-text-recognizer-project\lab1")

from text_recognizer.datasets.dataset import _download_raw_dataset, Dataset, _parse_args

SAMPLE_TO_BALANCE = True  # If true, take at most the mean number of instances per class.

RAW_DATA_DIRNAME = Dataset.data_dirname() / "raw" / "emnist"
METADATA_FILENAME = RAW_DATA_DIRNAME / "metadata.toml"

PROCESSED_DATA_DIRNAME = Dataset.data_dirname() / "processed" / "emnist"
PROCESSED_DATA_FILENAME = PROCESSED_DATA_DIRNAME / "byclass.h5"

ESSENTIALS_FILENAME = Path(__file__).parents[0].resolve() / "emnist_essentials.json"


class EmnistDataset(Dataset):
    """
    "The EMNIST dataset is a set of handwritten character digits derived from the NIST Special Database 19
    and converted to a 28x28 pixel image format and dataset structure that directly matches the MNIST dataset."
    From https://www.nist.gov/itl/iad/image-group/emnist-dataset

    The data split we will use is
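SAMPLE_TO_BALANCE is only a flag in these excerpts; below is a hedged sketch of the per-class balancing such a flag could trigger. The helper name _sample_to_balance and its exact behavior are assumptions, not shown in the source:

import numpy as np


def _sample_to_balance(x, y):
    """Keep at most the mean number of instances per class, assuming y holds integer class labels."""
    np.random.seed(42)  # deterministic subsampling
    num_to_sample = int(np.bincount(y.flatten()).mean())
    sampled_inds = []
    for label in np.unique(y.flatten()):
        inds = np.where(y == label)[0]
        sampled_inds.append(np.unique(np.random.choice(inds, num_to_sample)))
    ind = np.concatenate(sampled_inds)
    return x[ind], y[ind]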
Example #5
"""Class for loading our own FSDL Handwriting dataset, which encompasses both paragraphs and lines."""
import json

import numpy as np
import toml

from text_recognizer import util
from text_recognizer.datasets.dataset import Dataset

RAW_DATA_DIRNAME = Dataset.data_dirname() / 'raw' / 'fsdl_handwriting'
METADATA_FILENAME = RAW_DATA_DIRNAME / 'metadata.toml'
PAGES_DIRNAME = RAW_DATA_DIRNAME / 'pages'


class FsdlHandwritingDataset(Dataset):
    """
    FSDL Handwriting dataset gathered in class.
    """
    def __init__(self):
        self.metadata = toml.load(METADATA_FILENAME)
        with open(RAW_DATA_DIRNAME / self.metadata['filename']) as f:
            page_data = [json.loads(line) for line in f.readlines()]
        self.data_by_page_id = {
            id_: data
            for id_, data in (_extract_id_and_data(page_datum)
                              for page_datum in page_data)
        }

    def load_or_generate_data(self):
        if len(self.page_filenames) < len(self.data_by_page_id):
            self._download_pages()
Example #6
"""SentenceGenerator class and supporting functions."""
from typing import Optional

from text_recognizer.datasets.dataset import Dataset

NLTK_DATA_DIRNAME = Dataset.data_dirname() / 'raw' / 'nltk'


class SentenceGenerator:
    """Generate text sentences using the Brown corpus."""
    def __init__(self, max_length: Optional[int] = None):
        pass

    def generate(self, max_length: Optional[int] = None) -> str:
        pass


def brown_text():
    pass


def load_nltk_brown_corpus():
    pass
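The stubs above could be filled in roughly as follows; this is a sketch under the assumption that the Brown corpus is fetched with NLTK into NLTK_DATA_DIRNAME (the actual implementation is not shown in this excerpt):

import nltk


def load_nltk_brown_corpus():
    """Load the Brown corpus, downloading it into NLTK_DATA_DIRNAME on first use."""
    nltk.data.path.append(str(NLTK_DATA_DIRNAME))
    try:
        nltk.corpus.brown.sents()
    except LookupError:
        NLTK_DATA_DIRNAME.mkdir(parents=True, exist_ok=True)
        nltk.download("brown", download_dir=str(NLTK_DATA_DIRNAME))
    return nltk.corpus.brown.sents()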
"""IamParagraphsDataset class and functions for data processing."""
from boltons.cacheutils import cachedproperty
import cv2
import numpy as np

from text_recognizer.datasets.dataset import Dataset, _parse_args
from text_recognizer.datasets.iam_dataset import IamDataset
from text_recognizer import util

INTERIM_DATA_DIRNAME = Dataset.data_dirname() / 'interim' / 'iam_paragraphs'
DEBUG_CROPS_DIRNAME = INTERIM_DATA_DIRNAME / 'debug_crops'
PROCESSED_DATA_DIRNAME = Dataset.data_dirname() / 'processed' / 'iam_paragraphs'
CROPS_DIRNAME = PROCESSED_DATA_DIRNAME / 'crops'
GT_DIRNAME = PROCESSED_DATA_DIRNAME / 'gt'

PARAGRAPH_BUFFER = 50  # pixels in the IAM form images to leave around the lines
TEST_FRACTION = 0.2


class IamParagraphsDataset(Dataset):
    """
    Paragraphs from the IAM dataset.
    """
    def __init__(self, load_data: bool = True, subsample_fraction: float = None):
        self.iam_dataset = IamDataset()
        if load_data:
            self.iam_dataset.load_or_generate_data()

        self.num_classes = 3
        self.input_shape = (256, 256)
        self.output_shape = (256, 256, self.num_classes)
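PARAGRAPH_BUFFER above is consumed when cropping paragraph regions out of the scanned forms; a minimal, hypothetical sketch of applying such a buffer to a bounding box (the helper name and tuple layout are assumptions):

def _add_buffer(region, buffer=PARAGRAPH_BUFFER):
    """Grow an (x1, y1, x2, y2) crop region by `buffer` pixels on every side, clamped at 0."""
    x1, y1, x2, y2 = region
    return max(0, x1 - buffer), max(0, y1 - buffer), x2 + buffer, y2 + buffer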
Example #8
"""Class for loading our own FSDL Handwriting dataset, which encompasses both paragraphs and lines."""
import json

import numpy as np
import toml

from text_recognizer import util
from text_recognizer.datasets.dataset import Dataset

RAW_DATA_DIRNAME = Dataset.data_dirname() / "raw" / "fsdl_handwriting"
METADATA_FILENAME = RAW_DATA_DIRNAME / "metadata.toml"
PAGES_DIRNAME = RAW_DATA_DIRNAME / "pages"


class FsdlHandwritingDataset(Dataset):
    """
    FSDL Handwriting dataset gathered in class.
    """
    def __init__(self):
        self.metadata = toml.load(METADATA_FILENAME)
        with open(RAW_DATA_DIRNAME / self.metadata["filename"]) as f:
            page_data = [json.loads(line) for line in f.readlines()]
        # NOTE: pylint bug https://github.com/PyCQA/pylint/issues/3164
        # pylint: disable=unnecessary-comprehension
        self.data_by_page_id = {
            id_: data
            for id_, data in (_extract_id_and_data(page_datum)
                              for page_datum in page_data)
        }
        # pylint: enable=unnecessary-comprehension
"""SentenceGenerator class and supporting functions."""
import itertools
import re
import string
from typing import Optional

import nltk
import numpy as np

from text_recognizer.datasets.dataset import Dataset

NLTK_DATA_DIRNAME = Dataset.data_dirname() / "raw" / "nltk"


class SentenceGenerator:
    """Generate text sentences using the Brown corpus."""
    def __init__(self, max_length: Optional[int] = None):
        self.text = brown_text()
        self.word_start_inds = [0] + [
            _.start(0) + 1 for _ in re.finditer(" ", self.text)
        ]
        self.max_length = max_length

    def generate(self, max_length: Optional[int] = None) -> str:
        """
        Sample a string from text of the Brown corpus of length at least one word and at most max_length,
        padding it to max_length with the '_' character.
        """
        if max_length is None:
            max_length = self.max_length
        if max_length is None:
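The method is cut off here; a short usage sketch consistent with the docstring above (the length and padding behavior are taken from the docstring, not from code shown in this excerpt):

sentence_generator = SentenceGenerator(max_length=34)
line = sentence_generator.generate()
assert len(line) == 34    # padded to max_length with '_'
assert line.rstrip("_")   # contains at least one word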
Example #10
"""
IamLinesDataset class.

We will use a processed version of this dataset, without including code that did the processing.
We will look at how to generate processed data from raw IAM data in the IamParagraphsDataset.
"""

from boltons.cacheutils import cachedproperty
import h5py
from tensorflow.keras.utils import to_categorical

from text_recognizer import util
from text_recognizer.datasets.dataset import Dataset, _parse_args
from text_recognizer.datasets.emnist_dataset import EmnistDataset


PROCESSED_DATA_DIRNAME = Dataset.data_dirname() / 'processed' / 'iam_lines'
PROCESSED_DATA_FILENAME = PROCESSED_DATA_DIRNAME / 'iam_lines.h5'
PROCESSED_DATA_URL = 'https://s3-us-west-2.amazonaws.com/fsdl-public-assets/iam_lines.h5'


class IamLinesDataset(Dataset):
    """

    Note that we use cachedproperty because data takes time to load.
    """
    def __init__(self, subsample_fraction: float = None):
        self.mapping = EmnistDataset().mapping
        self.inverse_mapping = {v: k for k, v in self.mapping.items()}
        self.num_classes = len(self.mapping)
        self.input_shape = (28, 952)
        self.output_shape = (97, self.num_classes)
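A minimal illustration (separate from the source) of why cachedproperty from boltons is used here: the decorated method runs once, and its result is stored on the instance for later accesses.

from boltons.cacheutils import cachedproperty


class _Expensive:
    @cachedproperty
    def data(self):
        print("loading from disk...")  # runs only on the first access
        return [1, 2, 3]


obj = _Expensive()
obj.data  # prints "loading from disk..." and caches the result
obj.data  # served from the cache; no second load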
Example #11
"""Class for loading the IAM dataset, which encompasses both paragraphs and lines, with associated utilities."""
import os
from typing import Dict, List
import xml.etree.ElementTree as ElementTree
import zipfile

from boltons.cacheutils import cachedproperty
import toml

from text_recognizer.datasets.dataset import Dataset, _download_raw_dataset


RAW_DATA_DIRNAME = Dataset.data_dirname() / 'raw' / 'iam'
METADATA_FILENAME = RAW_DATA_DIRNAME / 'metadata.toml'
EXTRACTED_DATASET_DIRNAME = RAW_DATA_DIRNAME / 'iamdb'

DOWNSAMPLE_FACTOR = 2  # If images were downsampled, the regions must also be.
LINE_REGION_PADDING = 0  # add this many pixels around the exact coordinates


class IamDataset(Dataset):
    """
    "The IAM Lines dataset, first published at the ICDAR 1999, contains forms of unconstrained handwritten text,
    which were scanned at a resolution of 300dpi and saved as PNG images with 256 gray levels."
    From http://www.fki.inf.unibe.ch/databases/iam-handwriting-database

    The data split we will use is
    IAM lines Large Writer Independent Text Line Recognition Task (lwitlrt): 9,862 text lines.
        The validation set has been merged into the train set.
        The train set has 7,101 lines from 326 writers.
        The test set has 1,861 lines from 128 writers.
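A hypothetical sketch of how the two constants above might be applied to line-region coordinates read from the IAM XML (the helper and dict keys are illustrative, not from the source):

def _scale_and_pad_region(region, factor=DOWNSAMPLE_FACTOR, padding=LINE_REGION_PADDING):
    """Scale XML coordinates down to match downsampled form images, then pad each side."""
    return {
        "x1": region["x1"] // factor - padding,
        "y1": region["y1"] // factor - padding,
        "x2": region["x2"] // factor + padding,
        "y2": region["y2"] // factor + padding,
    }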
Example #12
"""Class for loading the IAM dataset, which encompasses both paragraphs and lines, with associated utilities."""

import os
from typing import Dict, List
import xml.etree.ElementTree as ElementTree
import zipfile

from boltons.cacheutils import cachedproperty
import toml

from text_recognizer.datasets.dataset import Dataset, _download_raw_dataset

RAW_DATA_DIRNAME = Dataset.data_dirname() / "raw" / "iam"
METADATA_FILENAME = RAW_DATA_DIRNAME / "metadata.toml"
EXTRACTED_DATASET_DIRNAME = RAW_DATA_DIRNAME / "iamdb"

DOWNSAMPLE_FACTOR = 2  # If images were downsampled, the regions must also be.
LINE_REGION_PADDING = 0  # add this many pixels around the exact coordinates


class IamDataset(Dataset):
    """
    "The IAM Lines dataset, first published at the ICDAR 1999, contains forms of unconstrained handwritten text,
    which were scanned at a resolution of 300dpi and saved as PNG images with 256 gray levels."
    From http://www.fki.inf.unibe.ch/databases/iam-handwriting-database

    The data split we will use is
    IAM lines Large Writer Independent Text Line Recognition Task (lwitlrt): 9,862 text lines.
        The validation set has been merged into the train set.
        The train set has 7,101 lines from 326 writers.
        The test set has 1,861 lines from 128 writers.
Example #13
"""IamParagraphsDataset class and functions for data processing."""
from boltons.cacheutils import cachedproperty
from tensorflow.keras.utils import to_categorical
import cv2
import numpy as np

from text_recognizer.datasets.dataset import Dataset, _parse_args
from text_recognizer.datasets.iam_dataset import IamDataset
from text_recognizer import util

INTERIM_DATA_DIRNAME = Dataset.data_dirname() / "interim" / "iam_paragraphs"
DEBUG_CROPS_DIRNAME = INTERIM_DATA_DIRNAME / "debug_crops"
PROCESSED_DATA_DIRNAME = Dataset.data_dirname() / "processed" / "iam_paragraphs"
CROPS_DIRNAME = PROCESSED_DATA_DIRNAME / "crops"
GT_DIRNAME = PROCESSED_DATA_DIRNAME / "gt"

PARAGRAPH_BUFFER = 50  # pixels in the IAM form images to leave around the lines
TEST_FRACTION = 0.2


class IamParagraphsDataset(Dataset):
    """
    Paragraphs from the IAM dataset.
    """
    def __init__(self, subsample_fraction: float = None):
        self.iam_dataset = IamDataset()
        self.iam_dataset.load_or_generate_data()

        self.num_classes = 3
        self.input_shape = (256, 256)
Example #14
"""
IamLinesDataset class.

We will use a processed version of this dataset, without including code that did the processing.
We will look at how to generate processed data from raw IAM data in the IamParagraphsDataset.
"""

from boltons.cacheutils import cachedproperty
import h5py
from tensorflow.keras.utils import to_categorical

from text_recognizer import util
from text_recognizer.datasets.dataset import Dataset, _parse_args
from text_recognizer.datasets.emnist_lines_dataset import EmnistLinesDataset


PROCESSED_DATA_DIRNAME = Dataset.data_dirname() / "processed" / "iam_lines"
PROCESSED_DATA_FILENAME = PROCESSED_DATA_DIRNAME / "iam_lines.h5"
PROCESSED_DATA_URL = "https://s3-us-west-2.amazonaws.com/fsdl-public-assets/iam_lines.h5"


class IamLinesDataset(Dataset):
    """
    Note that we use cachedproperty because data takes time to load.

    Parameters
    ----------
    categorical_format
        If True, then y labels are given as one-hot vectors.
    with_start_and_end_tokens
        If True, start and end each sequence with special tokens
    subsample_fraction
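The excerpt ends mid-docstring; a hedged instantiation sketch using the parameters documented above (the keyword names come from the docstring, while the values shown are assumptions):

dataset = IamLinesDataset(categorical_format=True, with_start_and_end_tokens=True, subsample_fraction=None)
dataset.load_or_generate_data()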