def __init__(self, subsample_fraction: float = None):
    """Record dataset metadata; the data arrays stay empty until loaded.

    Args:
        subsample_fraction: Stored unchanged on the instance; ``None`` by
            default.
    """
    self.mapping = EmnistDataset().mapping
    # Reverse lookup: each value of ``mapping`` points back to its key.
    self.inverse_mapping = {
        value: key for key, value in self.mapping.items()
    }
    self.num_classes = len(self.mapping)

    # Fixed shapes for this dataset.
    # NOTE(review): 952 == 28 * 34; presumably line width — confirm.
    self.input_shape = (28, 952)
    self.output_shape = (97, self.num_classes)

    self.subsample_fraction = subsample_fraction

    # Placeholders for the arrays filled in when the data is loaded.
    self.x_train = self.x_test = None
    self.y_train_int = self.y_test_int = None
def __init__(
    self,
    max_length: int = 34,
    min_overlap: float = 0,
    max_overlap: float = 0.33,
    num_train: int = 10000,
    num_test: int = 1000,
):
    """Configure the dataset from an ``EmnistDataset`` instance.

    Args:
        max_length: Stored as ``max_length``; also scales the input width
            and is the first dimension of ``output_shape``.
        min_overlap: Stored as ``min_overlap``.
        max_overlap: Stored as ``max_overlap``.
        num_train: Used for ``num_train`` only when the ``COLAB_GPU``
            environment variable is set; otherwise 500 is used.
        num_test: Used for ``num_test`` only when the ``COLAB_GPU``
            environment variable is set; otherwise 100 is used.
    """
    self.emnist = EmnistDataset()
    self.mapping = self.emnist.mapping

    self.max_length = max_length
    self.min_overlap = min_overlap
    self.max_overlap = max_overlap
    self.num_classes = len(self.mapping)

    # Input keeps the first EMNIST dimension and repeats the second one
    # ``max_length`` times.
    first_dim = self.emnist.input_shape[0]
    second_dim = self.emnist.input_shape[1]
    self.input_shape = (first_dim, second_dim * self.max_length)
    self.output_shape = (self.max_length, self.num_classes)

    # Outside Colab the requested sizes are replaced by small fixed ones.
    if 'COLAB_GPU' in os.environ:
        self.num_train, self.num_test = num_train, num_test
    else:
        self.num_train, self.num_test = 500, 100

    # Data arrays are populated later.
    self.x_train = self.y_train = None
    self.x_test = self.y_test = None
class IamLinesDataset(Dataset):
    """
    IAM lines dataset read from a processed HDF5 file, which is downloaded
    first if it is not already on disk.

    Note that we use cachedproperty because data takes time to load.
    """

    def __init__(self, subsample_fraction: float = None):
        """Record dataset metadata; arrays stay ``None`` until loaded.

        Args:
            subsample_fraction: If not ``None``, only this leading fraction
                of the train and test arrays is kept after loading.
        """
        self.mapping = EmnistDataset().mapping
        self.inverse_mapping = {v: k for k, v in self.mapping.items()}
        self.num_classes = len(self.mapping)
        self.input_shape = (28, 952)
        self.output_shape = (97, self.num_classes)

        self.subsample_fraction = subsample_fraction
        self.x_train = None
        self.x_test = None
        self.y_train_int = None
        self.y_test_int = None

    def load_or_generate_data(self):
        """Load or generate dataset data."""
        if not PROCESSED_DATA_FILENAME.exists():
            PROCESSED_DATA_DIRNAME.mkdir(parents=True, exist_ok=True)
            print("Downloading IAM lines...")
            util.download_url(PROCESSED_DATA_URL, PROCESSED_DATA_FILENAME)
        with h5py.File(PROCESSED_DATA_FILENAME, "r") as f:
            # ``[:]`` copies each dataset into memory so the arrays remain
            # usable after the file is closed.
            self.x_train = f["x_train"][:]
            self.y_train_int = f["y_train"][:]
            self.x_test = f["x_test"][:]
            self.y_test_int = f["y_test"][:]
        self._subsample()

    def _subsample(self):
        """Only this fraction of data will be loaded."""
        if self.subsample_fraction is None:
            return
        num_train = int(self.x_train.shape[0] * self.subsample_fraction)
        num_test = int(self.x_test.shape[0] * self.subsample_fraction)
        self.x_train = self.x_train[:num_train]
        self.y_train_int = self.y_train_int[:num_train]
        self.x_test = self.x_test[:num_test]
        self.y_test_int = self.y_test_int[:num_test]

    @cachedproperty
    def y_train(self):
        """Return y_train: ``y_train_int`` passed through ``to_categorical``."""
        return to_categorical(self.y_train_int, self.num_classes)

    @cachedproperty
    def y_test(self):
        """Return y_test: ``y_test_int`` passed through ``to_categorical``."""
        return to_categorical(self.y_test_int, self.num_classes)

    def __repr__(self):
        """Print info about the dataset.

        Before ``load_or_generate_data`` has run the arrays are ``None``,
        so the shape lines are replaced by a hint instead of raising.
        """
        header = (
            "IAM Lines Dataset\n"
            f"Num classes: {self.num_classes}\n"
            f"Mapping: {self.mapping}\n"
        )
        if self.x_train is None or self.x_test is None:
            return header + "Data not loaded; call load_or_generate_data() first.\n"
        return header + (
            f"Train: {self.x_train.shape} {self.y_train.shape}\n"  # pylint: disable=no-member
            f"Test: {self.x_test.shape} {self.y_test.shape}\n"  # pylint: disable=no-member
        )
def _generate_data(self) -> None:
    """Generates a dataset with the Brown corpus and Emnist characters.

    The images and their labels are built fully in memory first and only
    then written to ``self.data_filename`` as the HDF5 datasets ``data``
    and ``targets``. Nothing is returned.
    """
    logger.debug("Generating data...")
    sentence_generator = SentenceGenerator(self.max_length)

    # Load emnist dataset.
    emnist = EmnistDataset(
        train=self.train, sample_to_balance=True, pad_token=self.pad_token
    )
    emnist.load_or_generate_data()

    samples_by_character = get_samples_by_character(
        emnist.data.numpy(),
        emnist.targets.numpy(),
        self.mapper.mapping,
    )

    # Generate before touching the output file: if generation fails, no
    # stub HDF5 file is left on disk.
    data, targets = create_dataset_of_images(
        self.num_samples,
        samples_by_character,
        sentence_generator,
        self.min_overlap,
        self.max_overlap,
    )
    targets = convert_strings_to_categorical_labels(
        targets, emnist.inverse_mapping
    )

    DATA_DIRNAME.mkdir(parents=True, exist_ok=True)
    with h5py.File(self.data_filename, "a") as f:
        f.create_dataset("data", data=data, dtype="u1", compression="lzf")
        f.create_dataset("targets", data=targets, dtype="u1", compression="lzf")
def __init__(
    self,
    max_length: int = 34,
    max_overlap: float = 0.33,
    num_train: int = 10000,
    num_test: int = 1000,
):
    """Configure the dataset from an ``EmnistDataset`` instance.

    Args:
        max_length: Stored as ``max_length``; also scales the input width
            and is the first dimension of ``output_shape``.
        max_overlap: Stored as ``max_overlap``.
        num_train: Stored as ``num_train``.
        num_test: Stored as ``num_test``.
    """
    self.emnist = EmnistDataset()
    self.mapping = self.emnist.mapping

    self.max_length = max_length
    self.max_overlap = max_overlap
    self.num_classes = len(self.mapping)

    # Input keeps the first EMNIST dimension and repeats the second one
    # ``max_length`` times.
    first_dim = self.emnist.input_shape[0]
    second_dim = self.emnist.input_shape[1]
    self.input_shape = (first_dim, second_dim * self.max_length)
    self.output_shape = (self.max_length, self.num_classes)

    self.num_train, self.num_test = num_train, num_test

    # Data arrays are populated later.
    self.x_train = self.y_train = None
    self.x_test = self.y_test = None
def __init__(
    self,
    max_length: int = 34,
    min_overlap: float = 0,
    max_overlap: float = 0.33,
    num_train: int = 100000,
    num_test: int = 10000,
    categorical_format: bool = False,
    with_start_and_end_labels: bool = False,
):
    """Configure the dataset and its augmented label mapping.

    Args:
        max_length: Stored as ``max_length``; also scales the input width.
        min_overlap: Stored as ``min_overlap``.
        max_overlap: Stored as ``max_overlap``.
        num_train: Stored as ``num_train``.
        num_test: Stored as ``num_test``.
        categorical_format: Stored as ``categorical_format``.
        with_start_and_end_labels: Stored on the instance; when truthy,
            ``max_output_length`` is ``max_length + 2``.
    """
    self.categorical_format = categorical_format
    self.with_start_and_end_labels = with_start_and_end_labels

    self.emnist = EmnistDataset()
    self.mapping = _augment_mapping(self.emnist.mapping)
    self.inverse_mapping = {
        value: key for key, value in self.mapping.items()
    }

    # Labels of the special tokens, looked up in the augmented mapping.
    self.padding_label = self.inverse_mapping["_"]
    self.start_label = self.inverse_mapping["<s>"]
    self.end_label = self.inverse_mapping["<e>"]

    self.max_length = max_length
    # Two extra output positions when start/end labels are requested.
    extra_positions = 2 if self.with_start_and_end_labels else 0
    self.max_output_length = self.max_length + extra_positions

    self.min_overlap = min_overlap
    self.max_overlap = max_overlap
    self.num_classes = len(self.mapping)

    # Input keeps the first EMNIST dimension and repeats the second one
    # ``max_length`` times.
    first_dim = self.emnist.input_shape[0]
    second_dim = self.emnist.input_shape[1]
    self.input_shape = (first_dim, second_dim * self.max_length)
    self.output_shape = (self.max_output_length, self.num_classes)

    self.num_train, self.num_test = num_train, num_test

    # Data arrays are populated later.
    self.x_train = self.y_train_int = None
    self.x_test = self.y_test_int = None