예제 #1
0
    def recognize(self,
                  images,
                  detection_kwargs=None,
                  recognition_kwargs=None):
        """Run the detection + recognition pipeline on one or more images.

        Args:
            images: The images to parse (either an array / list of actual
                images or a list of filepaths).
            detection_kwargs: Arguments to pass to the detector call
            recognition_kwargs: Arguments to pass to the recognizer call

        Returns:
            A list of lists of (text, box) tuples, one inner list per input
            image. An empty input produces an empty list.
        """
        # Make sure we have actual images (not filepaths) to start with.
        if not isinstance(images, np.ndarray):
            images = [tools.read(image) for image in images]
        # An empty batch has nothing to detect; without this guard the
        # max-shape reduction below fails on a zero-length array.
        if len(images) == 0:
            return []
        if detection_kwargs is None:
            detection_kwargs = {}
        if recognition_kwargs is None:
            recognition_kwargs = {}

        # resize_image returns (image, scale) pairs; keep the scales so the
        # detected boxes can be mapped back to the original resolution.
        resized = [
            tools.resize_image(image,
                               max_scale=self.scale,
                               max_size=self.max_size) for image in images
        ]
        scales = [scale for _, scale in resized]
        max_height, max_width = np.array(
            [image.shape[:2] for image, _ in resized]).max(axis=0)
        # Pad every image to the largest height/width so they stack into a
        # single array.
        images = np.array([
            tools.pad(image, width=max_width, height=max_height)
            for image, _ in resized
        ])

        box_groups = self.detector.detect(images=images, **detection_kwargs)
        prediction_groups = self.recognizer.recognize_from_boxes(
            images=images, box_groups=box_groups, **recognition_kwargs)
        # Undo the resize on the box coordinates (skipped when the image was
        # not rescaled).
        box_groups = [
            tools.adjust_boxes(
                boxes=boxes, boxes_format='boxes', scale=1 /
                scale) if scale != 1 else boxes
            for boxes, scale in zip(box_groups, scales)
        ]
        return [
            list(zip(predictions, boxes))
            for predictions, boxes in zip(prediction_groups, box_groups)
        ]
예제 #2
0
def get_recognizer_image_generator(labels,
                                   height,
                                   width,
                                   alphabet,
                                   augmenter=None,
                                   shuffle=True):
    """Generate augmented (image, text) tuples from a list
    of (filepath, box, label) tuples.

    Args:
        labels: A list of (filepath, box, label) tuples
        height: The height of the images to return
        width: The width of the images to return
        alphabet: The alphabet which limits the characters returned
        augmenter: The augmenter to apply to images
        shuffle: Whether to shuffle the dataset on each iteration

    Yields:
        (image, text) tuples, cycling over the labels indefinitely. Characters
        that are not in the alphabet are removed from the text and labels
        with no remaining characters are skipped.

    Raises:
        ValueError: If labels is non-empty but no label contains any
            character from the alphabet (iteration could never yield).
    """
    n_with_illegal_characters = sum(
        any(c not in alphabet for c in text) for _, _, text in labels)
    if n_with_illegal_characters > 0:
        print(
            f'{n_with_illegal_characters} / {len(labels)} instances have illegal characters.'
        )
    # Every sample would be skipped below, so the endless cycle would spin
    # forever without yielding. Fail loudly instead of hanging.
    if len(labels) > 0 and not any(
            any(c in alphabet for c in text) for _, _, text in labels):
        raise ValueError(
            'None of the labels contain characters from the alphabet.')
    # Copy so that shuffling does not reorder the caller's list.
    labels = labels.copy()
    for index in itertools.cycle(range(len(labels))):
        if index == 0 and shuffle:
            random.shuffle(labels)
        filepath, box, text = labels[index]
        # Filter the text first so unusable samples are skipped before any
        # image is read from disk.
        text = ''.join([c for c in text if c in alphabet])
        if not text:
            continue
        # Random fill color for regions outside the source image.
        cval = np.random.randint(low=0, high=255, size=3).astype('uint8')
        if box is not None:
            image = tools.warpBox(image=tools.read(filepath),
                                  box=box.astype('float32'),
                                  target_height=height,
                                  target_width=width,
                                  cval=cval)
        else:
            image = tools.read_and_fit(filepath_or_array=filepath,
                                       width=width,
                                       height=height,
                                       cval=cval)
        if augmenter:
            image = augmenter.augment_image(image)
        yield (image, text)
    def read_images(self,
                    plates_and_positions=None,
                    image_paths=None,
                    index_maximum=None,
                    crop_plate=True):
        """Read images, optionally crop each one, and collect them on the instance.

        Args:
            plates_and_positions: Per-image data handed to
                `self.crop_plate_func`; defaults to
                `self.get_plates_and_positions()`.
            image_paths: Paths to read; defaults to `self.image_paths`.
            index_maximum: Upper slice bound applied to both sequences;
                defaults to `self.index_maximum`.
            crop_plate: Whether to pass each image through
                `self.crop_plate_func` before storing it.

        Returns:
            `self.images`, after the newly read images were appended to it.
        """
        paths = self.image_paths if image_paths is None else image_paths
        positions = (self.get_plates_and_positions()
                     if plates_and_positions is None else plates_and_positions)
        limit = self.index_maximum if index_maximum is None else index_maximum

        # Paths and positions are consumed in lockstep, both cut at `limit`.
        for path, position in zip(paths[:limit], positions[:limit]):
            loaded = read(path)
            self.images.append(
                self.crop_plate_func(loaded, position) if crop_plate else loaded)

        # The accumulated collection is returned as-is (not stacked).
        return self.images
예제 #4
0
    def recognize_from_boxes(self, images, box_groups,
                             **kwargs) -> typing.List[typing.List[str]]:
        """Recognize text from images using lists of bounding boxes.

        Args:
            images: A list of input images, supplied as numpy arrays with shape
                (H, W, 3).
            box_groups: A list of groups of boxes, one for each image
            **kwargs: Forwarded to the prediction model's `predict` call.

        Returns:
            One list of strings per image, holding one decoded string per
            box in that image's group. If there are no boxes at all, each
            image gets an empty list.
        """
        assert len(box_groups) == len(images), \
            'You must provide the same number of box groups as images.'
        crops = []
        # (start, end) slice into the flat crop list for each image.
        start_end = []
        offset = 0
        for image, boxes in zip(images, box_groups):
            image = tools.read(image)
            if self.prediction_model.input_shape[-1] == 1 and image.shape[
                    -1] == 3:
                # Convert color to grayscale
                image = cv2.cvtColor(image, code=cv2.COLOR_RGB2GRAY)
            for box in boxes:
                crops.append(
                    tools.warpBox(image=image,
                                  box=box,
                                  target_height=self.model.input_shape[1],
                                  target_width=self.model.input_shape[2]))
            start_end.append((offset, offset + len(boxes)))
            offset += len(boxes)
        if not crops:
            return [[] for _ in images]
        # All crops are predicted as one batch, scaled to [0, 1].
        X = np.float32(crops) / 255
        if len(X.shape) == 3:
            # Grayscale crops lack a channel axis; add one.
            X = X[..., np.newaxis]
        # Indices that do not map to a character in the alphabet.
        skipped = (self.blank_label_idx, -1)
        predictions = [
            ''.join([self.alphabet[idx] for idx in row if idx not in skipped])
            for row in self.prediction_model.predict(X, **kwargs)
        ]
        # Regroup the flat predictions per image.
        return [predictions[start:end] for start, end in start_end]
예제 #5
0
    def detect(self,
               images: typing.List[typing.Union[np.ndarray, str]],
               detection_threshold=0.7,
               text_threshold=0.4,
               link_threshold=0.4,
               size_threshold=10,
               **kwargs):
        """Detect text boxes in a set of images.

        Each image is read, converted with `compute_input`, and the whole
        batch is run through the model; the model output is then turned into
        boxes by `getBoxes` using the thresholds below.

        Args:
            images: Can be a list of numpy arrays of shape HxWx3 or a list of
                filepaths.
            detection_threshold: We want to avoid including boxes that may have
                represented large regions of low confidence text predictions.
                To do this, a final check is made for each word box to make
                sure the maximum confidence value exceeds this threshold.
            text_threshold: When the text map is processed, it is converted
                from confidence values (float from zero to one) to
                classification (0 for not text, 1 for text) using binary
                thresholding. For example, with a threshold of 0.4 a text map
                value of 0.5 becomes 1. The higher this value is, the less
                likely it is that characters will be merged together into a
                single word. The lower this value is, the more likely it is
                that non-text will be detected.
            link_threshold: This is the same as `text_threshold`, but is
                applied to the link map instead of the text map.
            size_threshold: The minimum area for a word.
            **kwargs: Forwarded to the model's `predict` call.

        Returns:
            The result of `getBoxes` for the batch (presumably one group of
            boxes per image; confirm against `getBoxes`).
        """
        model_inputs = np.array(
            [compute_input(tools.read(image)) for image in images])
        raw_predictions = self.model.predict(model_inputs, **kwargs)
        return getBoxes(raw_predictions,
                        detection_threshold=detection_threshold,
                        text_threshold=text_threshold,
                        link_threshold=link_threshold,
                        size_threshold=size_threshold)
예제 #6
0
    def recognize(self,
                  images,
                  detection_kwargs=None,
                  recognition_kwargs=None):
        """Run detection, crop the detected boxes and inspect the raw
        recognizer output for the first crop.

        NOTE: this is an instrumented (debugging) variant of the pipeline.
        It prints intermediate values and does not decode the predictions
        into text, so it does not return (text, box) tuples.

        Args:
            images: The images to parse (can be a list of actual images or a list of filepaths)
            detection_kwargs: Arguments to pass to the detector call
            recognition_kwargs: Accepted for interface compatibility; it is
                not forwarded because only the first crop is predicted with
                default arguments.

        Returns:
            An empty list when no images are given, a list with one empty
            list per image when no boxes are detected, and otherwise the raw
            output of `self.prediction_model.predict` for the first crop.
        """
        # Make sure we have actual images (not filepaths) to start with.
        if not isinstance(images, np.ndarray):
            images = [tools.read(image) for image in images]
        # An empty batch has nothing to detect; without this guard the
        # max-shape reduction below fails on a zero-length array.
        if len(images) == 0:
            return []
        # resize_image returns (image, scale) pairs; only the images are
        # needed here.
        resized = [
            tools.resize_image(image,
                               max_scale=self.scale,
                               max_size=self.max_size) for image in images
        ]
        max_height, max_width = np.array(
            [image.shape[:2] for image, _ in resized]).max(axis=0)
        # Pad every image to the largest height/width so they stack into a
        # single array.
        images = np.array([
            tools.pad(image, width=max_width, height=max_height)
            for image, _ in resized
        ])
        if detection_kwargs is None:
            detection_kwargs = {}
        box_groups = self.detector.detect(images=images, **detection_kwargs)

        crops = []
        for image, boxes in zip(images, box_groups):
            image = tools.read(image)
            if self.prediction_model.input_shape[-1] == 1 and image.shape[
                    -1] == 3:
                # Convert color to grayscale
                image = cv2.cvtColor(image, code=cv2.COLOR_RGB2GRAY)
            print("This is prediction model input shape",
                  self.prediction_model.input_shape)
            for box in boxes:
                crops.append(
                    tools.warpBox(image=image,
                                  box=box,
                                  target_height=self.model.input_shape[1],
                                  target_width=self.model.input_shape[2]))
        if not crops:
            return [[] for _ in images]
        print("this is crops", crops, np.asarray(crops).shape)
        X = np.float32(crops) / 255
        if len(X.shape) == 3:
            # Grayscale crops lack a channel axis; add one.
            X = X[..., np.newaxis]
        # Only the first crop is predicted; decoding the predictions into
        # per-image text groups is intentionally not done in this variant.
        rows = self.prediction_model.predict(np.asarray([X[0]]))
        for r in rows[0]:
            # First position holding the largest value in this row.
            maxrid, maxr = max(enumerate(r), key=lambda pair: pair[1])
            print("max row value", maxr, "max r id", maxrid)
        return rows
예제 #7
0
def get_detector_image_generator(labels,
                                 width,
                                 height,
                                 augmenter=None,
                                 area_threshold=0.5,
                                 focused=False,
                                 min_area=None,
                                 shuffle=True):
    """Yield augmented (image, lines, confidence) tuples built from a list
    of (filepath, lines, confidence) tuples. The confidence value is passed
    through unchanged.

    Args:
        labels: A list of (image, lines, confidence) tuples.
        augmenter: An augmenter to apply to the images.
        width: The width to use for output images
        height: The height to use for output images
        area_threshold: The area threshold to use to keep
            characters in augmented images.
        min_area: The minimum area for a character to be
            included.
        focused: Whether to pre-crop images to width/height containing
            a region containing text.
        shuffle: Whether to shuffle the data on each iteration.

    Yields:
        (image, lines, confidence) tuples, cycling over the labels
        indefinitely; the image is letterboxed to width x height and the
        lines are rescaled to match.
    """
    # Copy so that shuffling does not reorder the caller's list.
    samples = labels.copy()
    for position in itertools.cycle(range(len(samples))):
        # Reshuffle at the start of every pass over the data.
        if shuffle and position == 0:
            random.shuffle(samples)
        image_filepath, lines, confidence = samples[position]
        image = tools.read(image_filepath)
        if augmenter is not None:
            image, lines = tools.augment(boxes=lines,
                                         boxes_format='lines',
                                         image=image,
                                         area_threshold=area_threshold,
                                         min_area=min_area,
                                         augmenter=augmenter)
        # In focused mode, crop around one randomly selected line box.
        line_boxes = ([tools.combine_line(line)[0] for line in lines]
                      if focused else [])
        if line_boxes:
            chosen = np.array(line_boxes[np.random.choice(len(line_boxes))])
            left, top = chosen.min(axis=0).clip(0, np.inf).astype('int')
            # Randomly move the crop origin up/left so the chosen box is not
            # always flush with the crop corner.
            if left > 0:
                left -= np.random.randint(0, min(left, width / 2))
            if top > 0:
                top -= np.random.randint(0, min(top, height / 2))
            focus_crop = imgaug.augmenters.Sequential([
                imgaug.augmenters.Crop(px=(int(top), 0, 0, int(left))),
                imgaug.augmenters.CropToFixedSize(width=width,
                                                  height=height,
                                                  position='right-bottom')
            ])
            image, lines = tools.augment(boxes=lines,
                                         augmenter=focus_crop,
                                         boxes_format='lines',
                                         image=image,
                                         min_area=min_area,
                                         area_threshold=area_threshold)
        image, scale = tools.fit(image,
                                 width=width,
                                 height=height,
                                 mode='letterbox',
                                 return_scale=True)
        # Keep the line coordinates in sync with the resized image.
        lines = tools.adjust_boxes(boxes=lines,
                                   boxes_format='lines',
                                   scale=scale)
        yield image, lines, confidence
예제 #8
0
def get_image_generator(
        height,
        width,
        font_groups,
        text_generator,
        font_size: typing.Union[int, typing.Tuple[int, int]] = 18,
        backgrounds: typing.Optional[typing.List[typing.Union[
            str, np.ndarray]]] = None,
        background_crop_mode='crop',
        rotationX: typing.Union[int, typing.Tuple[int, int]] = 0,
        rotationY: typing.Union[int, typing.Tuple[int, int]] = 0,
        rotationZ: typing.Union[int, typing.Tuple[int, int]] = 0,
        margin=0,
        use_ligatures=False,
        augmenter=None,
        draw_contour=False,
        draw_contour_text=False):
    """Create a generator for images containing text.

    Args:
        height: The height of the generated image
        width: The width of the generated image.
        font_groups: A dict mapping of { subalphabet: [path_to_font1, path_to_font2] }.
        text_generator: See get_text_generator
        font_size: The font size to use. Alternative, supply a tuple
            and the font size will be randomly selected between
            the two values.
        backgrounds: A list of paths to image backgrounds or actual images
            as numpy arrays with channels in RGB order. Defaults to a single
            all-zero background. The list passed in is not modified.
        background_crop_mode: One of letterbox or crop, indicates
            how backgrounds will be resized to fit on the canvas.
        rotationX: The X-axis text rotation to use. Alternative, supply a tuple
            and the rotation will be randomly selected between
            the two values.
        rotationY: The Y-axis text rotation to use. Alternative, supply a tuple
            and the rotation will be randomly selected between
            the two values.
        rotationZ: The Z-axis text rotation to use. Alternative, supply a tuple
            and the rotation will be randomly selected between
            the two values.
        margin: The minimum margin around the edge of the image.
        use_ligatures: Whether to render ligatures (see `draw_text_image`)
        augmenter: An image augmenter to be applied to backgrounds
        draw_contour: Draw the permitted contour onto images (debugging only)
        draw_contour_text: Draw the permitted contour inside the text
            drawing function.

    Yields:
        Tuples of (image, lines) where image is the rendered text
        alpha-blended onto the background and lines is a list of lines
        where each line itself is a list of (box, character) tuples and
        box is an array of points with shape (4, 2) providing the coordinates
        of the character box in clockwise order starting from the top left.
    """
    if backgrounds is None:
        backgrounds = [np.zeros((height, width, 3), dtype='uint8')]
    else:
        # The backgrounds are shuffled in place below; work on a private
        # copy so the caller's list keeps its order.
        backgrounds = list(backgrounds)
    alphabet = ''.join(font_groups.keys())
    assert len(set(alphabet)) == len(
        alphabet
    ), 'Each character can appear in the subalphabet for only one font group.'
    # For every subalphabet, cycle endlessly through its fonts; zipping the
    # cycles gives one (subalphabet, font) pair per group for each sample.
    font_cycles = zip(*[
        itertools.cycle([(subalphabet, font_filepath)
                         for font_filepath in font_group_filepaths])
        for subalphabet, font_group_filepaths in font_groups.items()
    ])
    for text, background_index, current_font_groups in zip(
            text_generator, itertools.cycle(range(len(backgrounds))),
            font_cycles):
        # Reshuffle at the start of every pass over the backgrounds.
        if background_index == 0:
            random.shuffle(backgrounds)
        current_font_groups = dict(current_font_groups)
        if isinstance(font_size, tuple):
            current_font_size = np.random.randint(low=font_size[0],
                                                  high=font_size[1])
        else:
            current_font_size = font_size
        # Rotations are given in degrees and converted to radians here.
        current_rotation_X, current_rotation_Y, current_rotation_Z = [
            (np.random.uniform(low=rotation[0], high=rotation[1])
             if isinstance(rotation, tuple) else rotation) * np.pi / 180
            for rotation in [rotationX, rotationY, rotationZ]
        ]
        current_background_filepath_or_array = backgrounds[background_index]
        if isinstance(current_background_filepath_or_array, str):
            current_background = tools.read(
                current_background_filepath_or_array)
        else:
            current_background = current_background_filepath_or_array
        if augmenter is not None:
            current_background = augmenter(images=[current_background])[0]
        if current_background.shape[0] != height or current_background.shape[
                1] != width:
            current_background = tools.fit(current_background,
                                           width=width,
                                           height=height,
                                           mode=background_crop_mode)
        permitted_contour, isDark = get_maximum_uniform_contour(
            image=current_background,
            fontsize=current_font_size,
            margin=margin)
        if permitted_contour is None:
            # We can't draw on this background. Boo!
            continue
        # Near-white text on dark backgrounds, near-black text otherwise.
        random_color_values = np.random.randint(low=0, high=50, size=3)
        text_color = tuple(np.array([255, 255, 255]) -
                           random_color_values) if isDark else tuple(
                               random_color_values)
        text_image, lines = draw_text_image(
            text=text,
            width=width,
            height=height,
            fontsize=current_font_size,
            fonts=current_font_groups,
            thetaX=current_rotation_X,
            thetaY=current_rotation_Y,
            thetaZ=current_rotation_Z,
            use_ligatures=use_ligatures,
            permitted_contour=permitted_contour,
            color=text_color,
            draw_contour=draw_contour_text)
        # Blend the text onto the background using the text image's last
        # channel as alpha.
        alpha = text_image[..., -1:].astype('float32') / 255
        image = (alpha * text_image[..., :3] +
                 (1 - alpha) * current_background).astype('uint8')
        if draw_contour:
            image = cv2.drawContours(image,
                                     contours=[
                                         permitted_contour.reshape(
                                             (-1, 1, 2)).astype('int32')
                                     ],
                                     contourIdx=0,
                                     color=(255, 0, 0),
                                     thickness=int(width / 100))
        yield image, lines