def __init__(self, subsample_fraction: float = None):
        """Set up label metadata and empty data slots; nothing is loaded here.

        Args:
            subsample_fraction: Kept on the instance unchanged as
                ``self.subsample_fraction``. Defaults to ``None``.
        """
        # Label metadata is taken from a freshly constructed EmnistDataset.
        emnist_mapping = EmnistDataset().mapping
        self.mapping = emnist_mapping
        self.inverse_mapping = {
            value: key for key, value in emnist_mapping.items()
        }
        self.num_classes = len(emnist_mapping)

        # Hard-coded shapes.
        # NOTE(review): presumably (height, width) of a line image and
        # (max characters per line, classes) -- confirm against the data file.
        self.input_shape = (28, 952)
        self.output_shape = (97, self.num_classes)

        self.subsample_fraction = subsample_fraction

        # Data placeholders, filled in later by the loading code.
        self.x_train = None
        self.x_test = None
        self.y_train_int = None
        self.y_test_int = None
示例#2
0
 def __init__(
     self,
     max_length: int = 34,
     min_overlap: float = 0,
     max_overlap: float = 0.33,
     num_train: int = 10000,
     num_test: int = 1000,
 ):
     """Configure the dataset; no samples are created or loaded here.

     Args:
         max_length: Stored as ``self.max_length``; also scales the second
             input dimension and is the first output dimension.
         min_overlap: Stored as ``self.min_overlap``.
         max_overlap: Stored as ``self.max_overlap``.
         num_train: Used only when the ``COLAB_GPU`` environment variable
             is set; otherwise ``self.num_train`` is forced to 500.
         num_test: Used only when the ``COLAB_GPU`` environment variable
             is set; otherwise ``self.num_test`` is forced to 100.
     """
     self.emnist = EmnistDataset()
     self.mapping = self.emnist.mapping
     self.num_classes = len(self.mapping)

     self.max_length = max_length
     self.min_overlap = min_overlap
     self.max_overlap = max_overlap

     # The input keeps EMNIST's first dimension and repeats the second one
     # max_length times.
     emnist_shape = self.emnist.input_shape
     self.input_shape = (emnist_shape[0], emnist_shape[1] * max_length)
     self.output_shape = (max_length, self.num_classes)

     # The requested sizes are honoured only when COLAB_GPU is present in the
     # environment; everywhere else small fixed sizes are used instead.
     colab_gpu_present = 'COLAB_GPU' in os.environ
     self.num_train = num_train if colab_gpu_present else 500
     self.num_test = num_test if colab_gpu_present else 100

     # Data placeholders.
     self.x_train = None
     self.y_train = None
     self.x_test = None
     self.y_test = None
示例#3
0
class IamLinesDataset(Dataset):
    """
    IAM lines dataset backed by a processed HDF5 file.

    The file is downloaded on first use by ``load_or_generate_data``. Until
    that method has run, ``x_train``, ``x_test``, ``y_train_int`` and
    ``y_test_int`` are ``None``.

    Note that we use cachedproperty because data takes time to load: the
    categorical labels ``y_train`` / ``y_test`` are computed from the integer
    labels on first access only.
    """
    def __init__(self, subsample_fraction: float = None):
        """Set up label metadata and empty data slots; nothing is loaded here.

        Args:
            subsample_fraction: If not ``None``, only this leading fraction
                of the train and test arrays is kept after loading.
        """
        self.mapping = EmnistDataset().mapping
        self.inverse_mapping = {v: k for k, v in self.mapping.items()}
        self.num_classes = len(self.mapping)
        # Hard-coded shapes.
        # NOTE(review): presumably (height, width) of a line image and
        # (max characters per line, classes) -- confirm against the data file.
        self.input_shape = (28, 952)
        self.output_shape = (97, self.num_classes)

        self.subsample_fraction = subsample_fraction
        self.x_train = None
        self.x_test = None
        self.y_train_int = None
        self.y_test_int = None

    def load_or_generate_data(self):
        """Download the processed file if it is missing, then load it.

        Reads the ``x_train``, ``y_train``, ``x_test`` and ``y_test`` datasets
        fully into memory and applies subsampling.
        """
        if not PROCESSED_DATA_FILENAME.exists():
            PROCESSED_DATA_DIRNAME.mkdir(parents=True, exist_ok=True)
            print("Downloading IAM lines...")
            util.download_url(PROCESSED_DATA_URL, PROCESSED_DATA_FILENAME)
        with h5py.File(PROCESSED_DATA_FILENAME, "r") as f:
            # [:] copies each dataset into memory so it outlives the file.
            self.x_train = f["x_train"][:]
            self.y_train_int = f["y_train"][:]
            self.x_test = f["x_test"][:]
            self.y_test_int = f["y_test"][:]
        self._subsample()

    def _subsample(self):
        """Keep only the leading ``subsample_fraction`` of each split.

        Does nothing when ``subsample_fraction`` is ``None``.
        """
        if self.subsample_fraction is None:
            return
        num_train = int(self.x_train.shape[0] * self.subsample_fraction)
        num_test = int(self.x_test.shape[0] * self.subsample_fraction)
        self.x_train = self.x_train[:num_train]
        self.y_train_int = self.y_train_int[:num_train]
        self.x_test = self.x_test[:num_test]
        self.y_test_int = self.y_test_int[:num_test]

    @cachedproperty
    def y_train(self):
        """Return the training labels converted with ``to_categorical``."""
        return to_categorical(self.y_train_int, self.num_classes)

    @cachedproperty
    def y_test(self):
        """Return the test labels converted with ``to_categorical``."""
        return to_categorical(self.y_test_int, self.num_classes)

    def __repr__(self):
        """Return a summary of the dataset.

        The class count and mapping are always included. The train/test
        shapes are appended only once data has been loaded, so calling
        ``repr()`` on a freshly constructed instance does not fail on the
        ``None`` placeholders.
        """
        summary = ("IAM Lines Dataset\n"  # pylint: disable=no-member
                   f"Num classes: {self.num_classes}\n"
                   f"Mapping: {self.mapping}\n")
        if self.x_train is None or self.x_test is None:
            # Data not loaded yet: there are no shapes to report.
            return summary
        return (summary +
                f"Train: {self.x_train.shape} {self.y_train.shape}\n"
                f"Test: {self.x_test.shape} {self.y_test.shape}\n")
    def _generate_data(self) -> None:
        """Generates a dataset with the Brown corpus and Emnist characters.

        Builds ``self.num_samples`` images with ``create_dataset_of_images``,
        converts their string targets to categorical labels, and writes both
        to ``self.data_filename`` as the HDF5 datasets ``"data"`` and
        ``"targets"``. Returns nothing.
        """
        logger.debug("Generating data...")

        sentence_generator = SentenceGenerator(self.max_length)

        # Load emnist dataset.
        emnist = EmnistDataset(train=self.train,
                               sample_to_balance=True,
                               pad_token=self.pad_token)
        emnist.load_or_generate_data()

        samples_by_character = get_samples_by_character(
            emnist.data.numpy(),
            emnist.targets.numpy(),
            self.mapper.mapping,
        )

        # Generate everything in memory before touching the output file:
        # opening it in "a" mode creates it, so a failure during generation
        # would otherwise leave an empty file behind.
        data, targets = create_dataset_of_images(
            self.num_samples,
            samples_by_character,
            sentence_generator,
            self.min_overlap,
            self.max_overlap,
        )

        # NOTE(review): samples are grouped with self.mapper.mapping but the
        # labels are encoded with emnist.inverse_mapping -- confirm that the
        # two mappings agree.
        targets = convert_strings_to_categorical_labels(
            targets, emnist.inverse_mapping)

        DATA_DIRNAME.mkdir(parents=True, exist_ok=True)
        with h5py.File(self.data_filename, "a") as f:
            f.create_dataset("data", data=data, dtype="u1", compression="lzf")
            f.create_dataset("targets",
                             data=targets,
                             dtype="u1",
                             compression="lzf")
示例#5
0
 def __init__(self,
              max_length: int = 34,
              max_overlap: float = 0.33,
              num_train: int = 10000,
              num_test: int = 1000):
     """Configure the dataset; no samples are created or loaded here.

     Args:
         max_length: Stored as ``self.max_length``; also scales the second
             input dimension and is the first output dimension.
         max_overlap: Stored as ``self.max_overlap``.
         num_train: Stored as ``self.num_train``.
         num_test: Stored as ``self.num_test``.
     """
     self.emnist = EmnistDataset()
     self.mapping = self.emnist.mapping
     self.num_classes = len(self.mapping)

     # Plain configuration values.
     self.max_length = max_length
     self.max_overlap = max_overlap
     self.num_train = num_train
     self.num_test = num_test

     # The input keeps EMNIST's first dimension and repeats the second one
     # max_length times.
     emnist_shape = self.emnist.input_shape
     self.input_shape = (emnist_shape[0], emnist_shape[1] * max_length)
     self.output_shape = (max_length, self.num_classes)

     # Data placeholders.
     self.x_train = None
     self.y_train = None
     self.x_test = None
     self.y_test = None
示例#6
0
    def __init__(
        self,
        max_length: int = 34,
        min_overlap: float = 0,
        max_overlap: float = 0.33,
        num_train: int = 100000,
        num_test: int = 10000,
        categorical_format: bool = False,
        with_start_and_end_labels: bool = False,
    ):
        """Configure the dataset; no samples are created or loaded here.

        Args:
            max_length: Stored as ``self.max_length``; scales the second
                input dimension.
            min_overlap: Stored as ``self.min_overlap``.
            max_overlap: Stored as ``self.max_overlap``.
            num_train: Stored as ``self.num_train``.
            num_test: Stored as ``self.num_test``.
            categorical_format: Stored as ``self.categorical_format``.
            with_start_and_end_labels: Stored on the instance; when true,
                ``self.max_output_length`` is ``max_length + 2``.
        """
        self.categorical_format = categorical_format
        self.with_start_and_end_labels = with_start_and_end_labels

        self.emnist = EmnistDataset()

        # The EMNIST mapping is extended by _augment_mapping; the special
        # labels below are looked up in the inverse of that extended mapping.
        self.mapping = _augment_mapping(self.emnist.mapping)
        self.inverse_mapping = {
            value: key for key, value in self.mapping.items()
        }
        self.padding_label = self.inverse_mapping["_"]
        self.start_label = self.inverse_mapping["<s>"]
        self.end_label = self.inverse_mapping["<e>"]

        # Two extra output positions are reserved when start/end labels are
        # requested.
        self.max_length = max_length
        extra_positions = 2 if with_start_and_end_labels else 0
        self.max_output_length = max_length + extra_positions

        self.min_overlap = min_overlap
        self.max_overlap = max_overlap
        self.num_classes = len(self.mapping)

        # The input keeps EMNIST's first dimension and repeats the second one
        # max_length times.
        emnist_shape = self.emnist.input_shape
        self.input_shape = (emnist_shape[0], emnist_shape[1] * max_length)
        self.output_shape = (self.max_output_length, self.num_classes)

        self.num_train = num_train
        self.num_test = num_test

        # Data placeholders.
        self.x_train = None
        self.y_train_int = None
        self.x_test = None
        self.y_test_int = None