def load_data(self):
    data = GFile(self.file_path, 'rb').read().decode(encoding='UTF-8')

    # Get a list of the unique characters in the text
    vocab = sorted(set(data))
    vocab_size = len(vocab)
    chars_to_ids = StringLookup(vocabulary=vocab)
    self.ids_to_chars_layer = StringLookup(
        vocabulary=chars_to_ids.get_vocabulary(), invert=True)

    # Split the entire text by character
    chars = unicode_split(data, 'UTF-8')
    ids_of_chars = chars_to_ids(chars)

    # Group characters to form sequences (+1 since the targets are shifted by one)
    sequences_ds = Dataset.from_tensor_slices(ids_of_chars)
    sequences_ds = sequences_ds.batch(C.SEQUENCE_LENGTH + 1)

    # Batch the sequences
    ds = sequences_ds.padded_batch(C.BATCH_SIZE)
    ds = ds.map(self._to_inputs_and_targets,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds = ds.shuffle(C.BUFFER_SIZE)
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
    return ds
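
# The `_to_inputs_and_targets` mapper used above is not shown in this snippet.
# A minimal sketch, assuming the usual next-character objective hinted at by
# the "+1" comment (the body below is an assumption, not the original
# implementation):
def _to_inputs_and_targets(self, sequences):
    inputs = sequences[:, :-1]   # every character except the last
    targets = sequences[:, 1:]   # the same sequence shifted left by one
    return inputs, targets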
def get_svg_ds(self):
    data = GFile('datasets/svgs/simpleline.svg',
                 'rb').read().decode(encoding='UTF-8')

    # Get the list of the unique characters in the text
    vocab = ['e', 'g', 'n', 'r', '\n']
    vocab_size = len(vocab)

    # Build the id to char lookup table
    chars_to_ids = StringLookup(vocabulary=vocab)
    self.ids_to_chars_layer = StringLookup(
        vocabulary=chars_to_ids.get_vocabulary(), invert=True)

    # Split the entire text by character
    chars = unicode_split(data, 'UTF-8')
    ids_of_chars = chars_to_ids(chars)

    # Group characters to form sequences
    svg_ds = Dataset.from_tensor_slices(ids_of_chars)
    svg_ds = svg_ds.batch(C.SEQUENCE_LENGTH)
    svg_ds = svg_ds.batch(C.BATCH_SIZE)
    return svg_ds
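
# A quick standalone sanity check of the forward/inverse lookup pair built
# above: map a few characters to ids and invert them back. This is purely
# illustrative and not part of the pipeline itself (import path assumes the
# TF >= 2.6 location of StringLookup).
import tensorflow as tf
from tensorflow.keras.layers import StringLookup

chars_to_ids = StringLookup(vocabulary=['e', 'g', 'n', 'r', '\n'])
ids_to_chars = StringLookup(
    vocabulary=chars_to_ids.get_vocabulary(), invert=True)

ids = chars_to_ids(['g', 'r', 'e', 'e', 'n'])
round_trip = tf.strings.reduce_join(ids_to_chars(ids)).numpy().decode('utf-8')
print(round_trip)  # -> 'green'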
class OCR:
    def __init__(self, model_weight='mn_model_weight.h5', scale_ratio=1):
        self.scale_ratio = scale_ratio
        self.characters = sorted([
            *set("".join(
                sum(ArtsInfo.ArtNames, []) + ArtsInfo.TypeNames +
                list(ArtsInfo.MainAttrNames.values()) +
                list(ArtsInfo.SubAttrNames.values()) +
                list(".,+%0123456789")))
        ])
        # Mapping characters to integers
        self.char_to_num = StringLookup(vocabulary=list(self.characters),
                                        num_oov_indices=0,
                                        mask_token="")
        # Mapping integers back to original characters
        self.num_to_char = StringLookup(
            vocabulary=self.char_to_num.get_vocabulary(),
            oov_token="",
            mask_token="",
            invert=True)
        self.width = 240
        self.height = 16
        self.max_length = 15
        self.build_model(input_shape=(self.width, self.height))
        self.model.load_weights(model_weight)

    def detect_info(self, art_img):
        info = self.extract_art_info(art_img)
        x = np.concatenate([
            self.preprocess(info[key]).T[None, :, :, None]
            for key in sorted(info.keys())
        ], axis=0)
        y = self.model.predict(x)
        y = self.decode(y)
        return {
            **{key: v for key, v in zip(sorted(info.keys()), y)},
            **{'star': self.detect_star(art_img)}
        }

    def extract_art_info(self, art_img):
        name = art_img.crop([i * self.scale_ratio for i in Config.name_coords])
        type = art_img.crop([i * self.scale_ratio for i in Config.type_coords])
        main_attr_name = art_img.crop(
            [i * self.scale_ratio for i in Config.main_attr_name_coords])
        main_attr_value = art_img.crop(
            [i * self.scale_ratio for i in Config.main_attr_value_coords])
        level = art_img.crop(
            [i * self.scale_ratio for i in Config.level_coords])
        subattr_1 = art_img.crop([
            i * self.scale_ratio for i in Config.subattr_1_coords
        ])  # [73, 83, 102]
        subattr_2 = art_img.crop(
            [i * self.scale_ratio for i in Config.subattr_2_coords])
        subattr_3 = art_img.crop(
            [i * self.scale_ratio for i in Config.subattr_3_coords])
        subattr_4 = art_img.crop(
            [i * self.scale_ratio for i in Config.subattr_4_coords])
        # A sub-attribute slot is treated as empty when no pixel is close to
        # the text colour [73, 83, 102]; empty slots are deleted from
        # locals() so they are excluded from the returned dict below.
        if np.all(
                np.abs(np.array(subattr_1, float) -
                       [[[73, 83, 102]]]).max(axis=-1) > 25):
            del subattr_1
            del subattr_2
            del subattr_3
            del subattr_4
        elif np.all(
                np.abs(np.array(subattr_2, float) -
                       [[[73, 83, 102]]]).max(axis=-1) > 25):
            del subattr_2
            del subattr_3
            del subattr_4
        elif np.all(
                np.abs(np.array(subattr_3, float) -
                       [[[73, 83, 102]]]).max(axis=-1) > 25):
            del subattr_3
            del subattr_4
        elif np.all(
                np.abs(np.array(subattr_4, float) -
                       [[[73, 83, 102]]]).max(axis=-1) > 25):
            del subattr_4
        return {
            key: value
            for key, value in locals().items()
            if key not in ['art_img', 'self']
        }

    def detect_star(self, art_img):
        star = art_img.crop([i * self.scale_ratio for i in Config.star_coords])
        cropped_star = self.crop(self.normalize(self.to_gray(star)))
        coef = cropped_star.shape[1] / cropped_star.shape[0]
        coef = coef / 1.30882352 + 0.21568627
        return int(round(coef))

    def to_gray(self, text_img):
        text_img = np.array(text_img)
        if len(text_img.shape) > 2:
            text_img = (text_img[..., :3]
                        @ [[[0.299], [0.587], [0.114]]])[:, :, 0]
        return np.array(text_img, np.float32)

    def normalize(self, img, auto_inverse=True):
        img -= img.min()
        img /= img.max()
        if auto_inverse and img[-1, -1] > 0.5:
            img = 1 - img
        return img

    def crop(self, img, tol=0.7):
        # img is 2D image data; tol is the tolerance
        mask = img > tol
        m, n = img.shape
        mask0, mask1 = mask.any(0), mask.any(1)
        col_start, col_end = mask0.argmax(), n - mask0[::-1].argmax()
        row_start, row_end = mask1.argmax(), m - mask1[::-1].argmax()
        return img[row_start:row_end, col_start:col_end]

    def resize_to_height(self, img):
        height = self.height
        return (np.array(
            Image.fromarray(np.uint8(img * 255)).resize(
                (int(img.shape[1] * height / img.shape[0]), height),
                Image.BILINEAR,
            )) / 255)

    def pad_to_width(self, img):
        width = self.width
        if img.shape[1] >= width:
            return img[:, :width]
        return np.pad(img, [[0, 0], [0, width - img.shape[1]]],
                      mode="constant",
                      constant_values=0)

    def preprocess(self, text_img):
        result = self.to_gray(text_img)
        result = self.normalize(result, True)
        result = self.crop(result)
        result = self.normalize(result, False)
        result = self.resize_to_height(result)
        result = self.pad_to_width(result)
        return result

    def decode(self, pred):
        input_len = np.ones(pred.shape[0]) * pred.shape[1]
        # Use greedy search. For complex tasks, you can use beam search
        results = ctc_decode(pred, input_length=input_len,
                             greedy=True)[0][0][:, :self.max_length]
        # Iterate over the results and get back the text
        output_text = []
        for res in results:
            res = self.num_to_char(res)
            res = reduce_join(res)
            res = res.numpy().decode("utf-8")
            output_text.append(res)
        return output_text

    def build_model(self, input_shape):
        input_img = Input(shape=(input_shape[0], input_shape[1], 1),
                          name="image",
                          dtype="float32")
        mobilenet = MobileNetV3_Small((input_shape[0], input_shape[1], 1),
                                      0,
                                      alpha=1.0,
                                      include_top=False).build()
        x = mobilenet(input_img)
        new_shape = ((input_shape[0] // 8), (input_shape[1] // 8) * 576)
        x = Reshape(target_shape=new_shape, name="reshape")(x)
        x = Dense(64, activation="relu", name="dense1")(x)
        x = Dropout(0.2)(x)

        # RNNs
        x = Bidirectional(LSTM(128, return_sequences=True, dropout=0.25))(x)
        x = Bidirectional(LSTM(64, return_sequences=True, dropout=0.25))(x)

        # Output layer
        output = Dense(len(self.characters) + 2,
                       activation="softmax",
                       name="dense2")(x)

        # Define the model
        self.model = Model(inputs=[input_img],
                           outputs=output,
                           name="ocr_model_v1")
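
# The `decode` method above hinges on Keras' CTC decoder. A minimal
# standalone illustration of what it does, using random dummy scores in the
# same (batch, timesteps, n_classes) softmax layout the OCR model emits
# (shapes are arbitrary; assumes the TF 2.x tf.keras.backend API):
import numpy as np
import tensorflow as tf
from tensorflow.keras.backend import ctc_decode

batch, timesteps, n_classes = 2, 30, 10
pred = tf.nn.softmax(tf.random.uniform((batch, timesteps, n_classes)), axis=-1)

# Every sample uses the full number of timesteps.
input_len = np.full(batch, timesteps)

# Greedy search collapses repeated labels and strips the CTC blank; pass
# greedy=False to switch to beam search for harder tasks.
decoded = ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
print(decoded.numpy())  # label ids per sample, padded with -1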
Keras provides different preprocessing layers to deal with different modalities of
data. [This guide](https://keras.io/guides/preprocessing_layers/) provides a
comprehensive introduction.

Our example involves preprocessing labels at the character level. This means that
if there are two labels, e.g. "cat" and "dog", then our character vocabulary should
be {a, c, d, g, o, t} (without any special tokens). We use the
[`StringLookup`](https://keras.io/api/layers/preprocessing_layers/categorical/string_lookup/)
layer for this purpose.
"""

AUTOTUNE = tf.data.AUTOTUNE

# Mapping characters to integers.
char_to_num = StringLookup(vocabulary=list(characters), mask_token=None)

# Mapping integers back to original characters.
num_to_char = StringLookup(
    vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True
)

"""
### Resizing images without distortion

Instead of square images, many OCR models work with rectangular images. This will
become clearer in a moment when we visualize a few samples from the dataset. While
aspect-unaware resizing of square images does not introduce a significant amount of
distortion, this is not the case for rectangular images. But resizing images to a
uniform size is a requirement for mini-batching. So we need to perform our resizing
such that the following criteria are met:

* Aspect ratio is preserved.
* Content of the images is not affected.
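
One way to meet both criteria is to resize with the aspect ratio preserved and
then pad out the shortfall. Below is a minimal sketch of that idea; the helper
name `distortion_free_resize` and the symmetric padding scheme are illustrative
assumptions, not necessarily the exact implementation this example uses.
"""

def distortion_free_resize(image, img_size):
    # Illustrative sketch; assumes `image` is a rank-3 float tensor (H, W, C).
    w, h = img_size

    # Resize while preserving the aspect ratio: the result fits inside
    # (h, w) but may fall short along one dimension.
    image = tf.image.resize(image, size=(h, w), preserve_aspect_ratio=True)

    # Pad the shortfall evenly on both sides so every image in a batch
    # ends up with exactly the same shape, leaving the content undistorted.
    pad_height = h - tf.shape(image)[0]
    pad_width = w - tf.shape(image)[1]
    image = tf.pad(
        image,
        paddings=[
            [pad_height // 2, pad_height - pad_height // 2],
            [pad_width // 2, pad_width - pad_width // 2],
            [0, 0],
        ],
    )
    return image

"""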