Example #1
    def __init__(self, image, out_dir, out_size, **kwargs):
        super(BBOXPlotter, self).__init__()
        self.image = image
        self.reference_image = kwargs.pop("reference_image", None)
        self.reference_features = None
        self.render_extracted_rois = kwargs.pop("render_extracted_rois", True)
        self.image_size = Size(height=image.shape[1], width=image.shape[2])
        self.out_dir = out_dir
        os.makedirs(self.out_dir, exist_ok=True)
        self.out_size = out_size
        self.colours = get_next_color
        self.send_bboxes = kwargs.pop("send_bboxes", False)
        self.upstream_ip = kwargs.pop("upstream_ip", '127.0.0.1')
        self.upstream_port = kwargs.pop("upstream_port", 1337)
        self.font = ImageFont.truetype("train_utils/DejaVuSans.ttf", 14)
        self.visualization_anchors = kwargs.pop("visualization_anchors", [])
        self.visual_backprop = VisualBackprop()
        self.vis_features = kwargs.pop("feature_anchors", [])
        self.plot_objectness_classification_result = kwargs.pop('plot_objectness_classification_result', False)
        self.show_visual_backprop_overlay = kwargs.pop('show_visual_backprop_overlay', False)
        # index of the visual backprop prediction that is to be shown in the overlay
        self.visual_backprop_index = kwargs.pop('visual_backprop_index', 0)
        self.show_backprop_and_feature_vis = kwargs.pop('show_backprop_and_feature_vis', False)
        self.get_discriminator_output_function = kwargs.pop('discriminator_output_function', self.get_discriminator_output)
        self.render_pca = kwargs.pop('render_pca', False)
        self.gt_bbox = kwargs.pop('gt_bbox', None)
        self.xp = np
        self.devices = kwargs.pop('devices', None)
        self.log_name = kwargs.pop('log_name', 'training')
        self.max_num_rois_to_render = kwargs.pop('num_rois_to_render', None)
        self.sort_rois = kwargs.pop('sort_rois', False)

        self.init_predictors(kwargs.pop("predictors", {}))
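
A hedged usage sketch (not part of the listing): BBOXPlotter is a Chainer training extension, so it would typically be constructed with a sample image and registered on a trainer. Size, my_localizer, and trainer below are assumptions introduced for illustration.

import numpy as np

# hypothetical setup, sketch only: render bounding boxes every 100 iterations
demo_image = np.random.rand(3, 64, 200).astype(np.float32)  # CHW, values in [0, 1]
plotter = BBOXPlotter(
    demo_image,
    out_dir="bbox_plots",
    out_size=Size(height=32, width=100),             # Size: assumed namedtuple(height, width)
    visualization_anchors=[("feature_extractor",)],  # assumed attribute path into the predictor
    predictors={"localizer": my_localizer},          # my_localizer: hypothetical localizer instance
)
trainer.extend(plotter, trigger=(100, 'iteration'))  # trainer assumed to exist
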
Example #2
    def __init__(self, dropout_factor, num_timesteps, zoom=0.9):
        super(FSNSMultipleSTNLocalizationNet, self).__init__()
        with self.init_scope():
            self.conv0 = L.Convolution2D(None, 32, 3, pad=1)
            self.bn0 = L.BatchNormalization(32)
            self.rs1 = ResnetBlock(32)
            self.rs2 = ResnetBlock(48, filter_increase=True)
            self.rs3 = ResnetBlock(48)
            self.lstm = L.LSTM(None, 256)
            self.translation_transform = L.Linear(256, 6)
            self.rotation_transform = L.Linear(256, 6)
            self.transform_2 = L.LSTM(256, 6)

        self.dropout_factor = dropout_factor
        self._train = True
        self.num_timesteps = num_timesteps

        for transform in [self.translation_transform, self.rotation_transform]:
            transform_bias = transform.b.data
            transform_bias[[0, 4]] = zoom
            transform_bias[[2, 5]] = 0
            transform.W.data[...] = 0

        # self.transform_2.upward.b.data[...] = 0
        # self.transform_2.upward.W.data[...] = 0
        # self.transform_2.lateral.W.data[...] = 0

        # self.transform.W.data[...] = 0

        self.visual_backprop = VisualBackprop()
        self.vis_anchor = None
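
The bias initialization above encodes a pure zoom: indices 0 and 4 are the diagonal of the flattened 2x3 affine matrix [[a, b, tx], [c, d, ty]], and indices 2 and 5 are the translations. With W zeroed, the localization net initially outputs theta = b for every input. A minimal numpy sketch of that starting transform:

import numpy as np

zoom = 0.9
b = np.zeros(6)
b[[0, 4]] = zoom   # scale entries of the flattened 2x3 affine matrix
b[[2, 5]] = 0      # translation entries

theta = b.reshape(2, 3)             # [[0.9, 0., 0.], [0., 0.9, 0.]]
corner = np.array([1.0, 1.0, 1.0])  # bottom-right corner in normalized [-1, 1] coords
print(theta @ corner)               # [0.9 0.9]: the sampling grid starts as a centred, slightly zoomed-in crop
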
Example #3
    def __init__(self, dropout_ratio, num_timesteps, zoom=0.9, use_dropout=False):
        super(FSNSSingleSTNLocalizationNet, self).__init__()
        with self.init_scope():
            self.conv0 = L.Convolution2D(None, 32, 3, pad=1)
            self.bn0 = L.BatchNormalization(32)
            self.rs1 = ResnetBlock(32, use_dropout=use_dropout, dropout_ratio=dropout_ratio)
            self.rs2 = ResnetBlock(48, filter_increase=True, use_dropout=use_dropout, dropout_ratio=dropout_ratio)
            self.rs3 = ResnetBlock(48)
            # self.rs4 = ResnetBlock(16, filter_increase=True)
            self.lstm = L.LSTM(None, 256)
            self.transform_2 = L.LSTM(256, 6)

        self.dropout_ratio = dropout_ratio
        self.use_dropout = use_dropout
        self._train = True
        self.num_timesteps = num_timesteps

        # initialize transform
        # self.transform_2.W.data[...] = 0
        #
        # transform_bias = self.transform_2.b.data
        # transform_bias[[0, 4]] = zoom
        # transform_bias[[2, 5]] = 0

        self.visual_backprop = VisualBackprop()
        self.vis_anchor = None

        self.width_encoding = None
        self.height_encoding = None
Example #4
    def __init__(self, image, out_dir, out_size, loss_metrics, **kwargs):
        super(BBOXPlotter, self).__init__()
        self.image = image
        self.render_extracted_rois = kwargs.pop("render_extracted_rois", True)
        self.image_size = Size(height=image.shape[1], width=image.shape[2])
        self.out_dir = out_dir
        os.makedirs(self.out_dir, exist_ok=True)
        self.out_size = out_size
        self.colours = COLOR_MAP
        self.send_bboxes = kwargs.pop("send_bboxes", False)
        self.upstream_ip = kwargs.pop("upstream_ip", '127.0.0.1')
        self.upstream_port = kwargs.pop("upstream_port", 1337)
        self.loss_metrics = loss_metrics
        self.font = ImageFont.truetype("utils/DejaVuSans.ttf", 20)
        self.visualization_anchors = kwargs.pop("visualization_anchors", [])
        self.visual_backprop = VisualBackprop()
        self.xp = np
Example #5
    def __init__(self, out_size, **kwargs):
        self.transform_rois_to_grayscale = kwargs.pop('transform_rois_to_grayscale', False)
        self.num_bboxes_to_localize = kwargs.pop('num_bboxes_to_localize', 1)
        self.dropout_ratio = kwargs.pop('dropout_ratio', 0)
        self.box_offset_side_length = kwargs.pop('box_offset_side_length', 3)
        self.box_offset_factor = kwargs.pop('box_offset_factor', 20)
        self.features_per_timestep = kwargs.pop('features_per_timestep', 256)

        super().__init__()

        with self.init_scope():
            self.feature_extractor = ResNet(kwargs.pop('num_layers', 18))

        self.visual_backprop = VisualBackprop()
        self.visual_backprop_anchors = []
        self.out_size = out_size

        self.rotation_dropout_params = [1, 0, 1, 0, 1, 1]
        self.translation_dropout_params = [1, 1, 0, 1, 1, 0]
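
The two mask lists above select entries of the flattened 2x3 affine matrix [[a, b, tx], [c, d, ty]] (that layout is an assumption based on the index pattern): rotation_dropout_params zeroes the rotation/shear entries b and c, while translation_dropout_params zeroes tx and ty. A small sketch:

import numpy as np

params = np.array([0.9, 0.1, 0.2, -0.1, 0.9, 0.3])  # example affine parameters a, b, tx, c, d, ty
rotation_mask = np.array([1, 0, 1, 0, 1, 1])
translation_mask = np.array([1, 1, 0, 1, 1, 0])
print(params * rotation_mask)     # rotation/shear entries (indices 1 and 3) zeroed
print(params * translation_mask)  # translation entries (indices 2 and 5) zeroed
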
Example #6
    def predict(self, images, return_visual_backprop=False):
        with cuda.Device(self._device_id):
            if isinstance(images, list):
                images = [self.xp.array(image) for image in images]
                images = self.xp.stack(images, axis=0)

            visual_backprop = None
            with chainer.using_config('train', False):
                roi, bbox = self(images)
                rois = [roi]
                bboxes = [bbox]
                if return_visual_backprop:
                    if not hasattr(self, 'visual_backprop'):
                        self.visual_backprop = VisualBackprop()
                    visual_backprop = self.visual_backprop.perform_visual_backprop(self.visual_backprop_anchors[0])

        bboxes = F.stack(bboxes, axis=1)
        bboxes = F.reshape(bboxes, (-1,) + bboxes.shape[2:])
        rois = F.stack(rois, axis=1)
        rois = F.reshape(rois, (-1,) + rois.shape[2:])

        return rois, bboxes, visual_backprop
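
The stack/reshape pair at the end of predict folds the per-step axis into the batch axis. A numpy-only sketch of the same shape manipulation, with hypothetical sizes:

import numpy as np

steps = [np.zeros((4, 2, 8, 8)), np.zeros((4, 2, 8, 8))]  # two per-step outputs, batch size 4
stacked = np.stack(steps, axis=1)                  # (4, 2, 2, 8, 8)
flat = stacked.reshape((-1,) + stacked.shape[2:])  # (8, 2, 8, 8): batch and step axes merged
print(flat.shape)
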
Example #7
    def predict(self, images, return_visual_backprop=False):
        with cuda.Device(self._device_id):
            images = [self.xp.array(image) for image in images]
            images = self.xp.stack(images, axis=0)
            with chainer.using_config('train', False):
                rois, bboxes = self(images)
                if return_visual_backprop:
                    if not hasattr(self, 'visual_backprop'):
                        self.visual_backprop = VisualBackprop()
                    visual_backprop = cuda.to_cpu(
                        self.visual_backprop.perform_visual_backprop(
                            self.visual_backprop_anchors[0]))
                else:
                    visual_backprop = None

                bboxes = self.extract_corners(bboxes)
                bboxes = self.scale_bboxes(bboxes,
                                           Size._make(images.shape[-2:]))

        bboxes = [cuda.to_cpu(bbox).reshape(1, -1) for bbox in bboxes.data]

        return bboxes, rois, np.ones((len(bboxes), 1)), visual_backprop
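
extract_corners and scale_bboxes (defined in Example #12 below) convert the [-1, 1] grid coordinates produced by spatial_transformer_grid into pixel units. A sketch of just the scaling arithmetic:

import numpy as np

corners = np.array([[-1.0, -1.0, 1.0, 1.0]])  # top, left, bottom, right in [-1, 1]
height, width = 224, 224
scaled = (corners + 1) / 2  # map to [0, 1]
scaled[:, ::2] *= height    # top and bottom
scaled[:, 1::2] *= width    # left and right
print(scaled)               # [[  0.   0. 224. 224.]]
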
Example #8
    def __init__(self,
                 out_size,
                 transform_rois_to_grayscale=False,
                 train_imagenet=False):
        super().__init__()
        with self.init_scope():
            self.feature_extractor = ResNet(
                18, class_labels=1000 if train_imagenet else None)

            if not train_imagenet:
                self.res6 = BasicBlock(2, 512)
                self.res7 = BasicBlock(2, 512)
                self.param_predictor = L.Linear(512, 6)

                transform_bias = self.param_predictor.b.data
                transform_bias[[0, 4]] = 0.8
                transform_bias[[2, 5]] = 0
                self.param_predictor.W.data[...] = 0

        self.visual_backprop_anchors = []
        self.out_size = out_size
        self.transform_rois_to_grayscale = transform_rois_to_grayscale
        self.visual_backprop = VisualBackprop()
        self.train_imagenet = train_imagenet
Example #9
class BBOXPlotter(Extension):

    def __init__(self, image, out_dir, out_size, loss_metrics, **kwargs):
        super(BBOXPlotter, self).__init__()
        self.image = image
        self.render_extracted_rois = kwargs.pop("render_extracted_rois", True)
        self.image_size = Size(height=image.shape[1], width=image.shape[2])
        self.out_dir = out_dir
        os.makedirs(self.out_dir, exist_ok=True)
        self.out_size = out_size
        self.colours = COLOR_MAP
        self.send_bboxes = kwargs.pop("send_bboxes", False)
        self.upstream_ip = kwargs.pop("upstream_ip", '127.0.0.1')
        self.upstream_port = kwargs.pop("upstream_port", 1337)
        self.loss_metrics = loss_metrics
        self.font = ImageFont.truetype("utils/DejaVuSans.ttf", 20)
        self.visualization_anchors = kwargs.pop("visualization_anchors", [])
        self.visual_backprop = VisualBackprop()
        self.xp = np

    def send_image(self, data):
        height = data.height
        width = data.width
        channels = len(data.getbands())

        # convert image to png in order to save network bandwidth
        png_stream = BytesIO()
        data.save(png_stream, format="PNG")
        png_stream = png_stream.getvalue()

        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            try:
                sock.connect((self.upstream_ip, self.upstream_port))
            except Exception as e:
                print(e)
                print("could not connect to display server, disabling image rendering")
                self.send_bboxes = False
                return
            data = {
                'width': width,
                'height': height,
                'channels': channels,
                'image': base64.b64encode(png_stream).decode('utf-8'),
            }
            sock.send(bytes(json.dumps(data), 'utf-8'))

    def array_to_image(self, array):
        if array.shape[0] == 1:
            # image is black and white; tile it so downstream code sees an RGB image
            array = self.xp.tile(array, (3, 1, 1))
        return Image.fromarray(cuda.to_cpu(array.transpose(1, 2, 0) * 255).astype(np.uint8), "RGB").convert("RGBA")

    def variable_to_image(self, variable):
        return self.array_to_image(variable.data)

    def __call__(self, trainer):
        iteration = trainer.updater.iteration

        with cuda.get_device_from_id(trainer.updater.get_optimizer('main').target._device_id), chainer.using_config('train', False):
            self.xp = np if trainer.updater.get_optimizer('main').target._device_id < 0 else cuda.cupy
            image = self.xp.asarray(self.image)
            predictor = trainer.updater.get_optimizer('main').target.predictor
            predictions, rois, bboxes = predictor(image[self.xp.newaxis, ...])

            backprop_visualizations = []
            for visanchor in self.visualization_anchors:
                vis_target = predictor
                for target in visanchor:
                    vis_target = getattr(vis_target, target)
                backprop_visualizations.append(self.visual_backprop.perform_visual_backprop(vis_target))

            self.render_rois(predictions, rois, bboxes, iteration, self.image.copy(), backprop_vis=backprop_visualizations)

    @property
    def original_image_paste_location(self):
        return 0, 0

    def render_rois(self, predictions, rois, bboxes, iteration, image, backprop_vis=()):
        # get the predicted text
        text = self.decode_predictions(predictions)

        image = self.array_to_image(image)

        num_timesteps = self.get_num_timesteps(bboxes)
        bboxes, dest_image = self.set_output_sizes(backprop_vis, bboxes, image, num_timesteps)
        if self.render_extracted_rois:
            self.render_extracted_regions(dest_image, image, rois, num_timesteps)

        if len(backprop_vis) != 0:
            # if we have a backprop visualization we can show it now
            self.show_backprop_vis(backprop_vis, dest_image, image, num_timesteps)

        self.draw_bboxes(bboxes, image)
        dest_image.paste(image, self.original_image_paste_location)
        if len(text) > 0:
            dest_image = self.render_text(dest_image, text)
        dest_image.save("{}.png".format(os.path.join(self.out_dir, str(iteration))), 'png')
        if self.send_bboxes:
            self.send_image(dest_image)

    def get_num_timesteps(self, bboxes):
        return bboxes.shape[0]

    def set_output_sizes(self, backprop_vis, bboxes, image, num_timesteps):
        _, num_channels, height, width = bboxes.shape

        image_height = image.height if len(backprop_vis) == 0 else image.height + self.image_size.height
        image_width = image.width + image.width * num_timesteps if self.render_extracted_rois else image.width

        dest_image = Image.new("RGBA", (image_width, image_height), color='black')
        bboxes = F.reshape(bboxes, (num_timesteps, 1, num_channels, height, width))

        return bboxes, dest_image

    def show_backprop_vis(self, backprop_vis, dest_image, image, num_timesteps):
        count = 0
        for visualization in backprop_vis:
            for vis in visualization:
                backprop_image = self.array_to_image(self.xp.tile(vis[0], (3, 1, 1))).resize(
                    (self.image_size.width, self.image_size.height))
                dest_image.paste(backprop_image, (count * backprop_image.width, image.height))
                count += 1

    def decode_predictions(self, predictions):
        words = []
        for prediction in predictions:
            if isinstance(prediction, list):
                prediction = F.concat([F.expand_dims(p, axis=0) for p in prediction], axis=0)

            prediction = self.xp.transpose(prediction.data, (1, 0, 2))
            prediction = self.xp.squeeze(prediction, axis=0)
            prediction = self.xp.argmax(prediction, axis=1)
            word = self.loss_metrics.strip_prediction(prediction[self.xp.newaxis, ...])[0]
            if len(word) == 1 and word[0] == 0:
                continue
            word = "".join(map(self.loss_metrics.label_to_char, word))
            word = word.replace(chr(self.loss_metrics.char_map[str(self.loss_metrics.blank_symbol)]), '')
            if len(word) > 0:
                words.append(word)
        text = " ".join(words)
        return text

    def render_extracted_regions(self, dest_image, image, rois, num_timesteps):
        _, num_channels, height, width = rois.shape
        rois = self.xp.reshape(rois, (num_timesteps, -1, num_channels, height, width))

        for i, roi in enumerate(rois, start=1):
            roi_image = self.variable_to_image(roi[0])
            paste_location = i * image.width, 0
            dest_image.paste(roi_image.resize((self.image_size.width, self.image_size.height)), paste_location)

    def render_text(self, dest_image, text):
        label_image = Image.new(dest_image.mode, dest_image.size)
        # only keep ascii characters
        # labels = ''.join(filter(lambda x: len(x) == len(x.encode()), labels))
        draw = ImageDraw.Draw(label_image)
        text_width, text_height = draw.textsize(text, font=self.font)
        draw.rectangle([dest_image.width - text_width - 1, 0, dest_image.width, text_height],
                       fill=(255, 255, 255, 160))
        draw.text((dest_image.width - text_width - 1, 0), text, fill='green', font=self.font)
        dest_image = Image.alpha_composite(dest_image, label_image)
        return dest_image

    def draw_bboxes(self, bboxes, image):
        draw = ImageDraw.Draw(image)
        for i, sub_box in enumerate(F.separate(bboxes, axis=1)):
            for bbox, colour in zip(F.separate(sub_box, axis=0), self.colours):
                bbox.data[...] = (bbox.data[...] + 1) / 2
                bbox.data[0, :] *= self.image_size.width
                bbox.data[1, :] *= self.image_size.height

                x = self.xp.clip(bbox.data[0, :].reshape(self.out_size), 0, self.image_size.width) + i * self.image_size.width
                y = self.xp.clip(bbox.data[1, :].reshape(self.out_size), 0, self.image_size.height)

                top_left = (x[0, 0], y[0, 0])
                top_right = (x[0, -1], y[0, -1])
                bottom_left = (x[-1, 0], y[-1, 0])
                bottom_right = (x[-1, -1], y[-1, -1])

                corners = [top_left, top_right, bottom_right, bottom_left]
                next_corners = corners[1:] + [corners[0]]

                for first_corner, next_corner in zip(corners, next_corners):
                    draw.line([first_corner, next_corner], fill=colour, width=3)
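
send_image pushes one JSON payload per TCP connection. For completeness, a hypothetical receiving end matching that wire format (a sketch only; the actual display server is not part of the listing):

import base64
import json
import socket
from io import BytesIO
from PIL import Image

# sketch of a display server: accept one connection, read one JSON payload,
# then decode the base64-encoded PNG it carries
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as server:
    server.bind(('127.0.0.1', 1337))
    server.listen(1)
    conn, _ = server.accept()
    with conn:
        payload = b""
        while chunk := conn.recv(4096):
            payload += chunk
        message = json.loads(payload.decode('utf-8'))
        image = Image.open(BytesIO(base64.b64decode(message['image'])))
        print(message['width'], message['height'], image.size)
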
Example #10
class TextLocalizer(Chain):

    def __init__(self, out_size, **kwargs):
        self.transform_rois_to_grayscale = kwargs.pop('transform_rois_to_grayscale', False)
        self.num_bboxes_to_localize = kwargs.pop('num_bboxes_to_localize', 1)
        self.dropout_ratio = kwargs.pop('dropout_ratio', 0)
        self.box_offset_side_length = kwargs.pop('box_offset_side_length', 3)
        self.box_offset_factor = kwargs.pop('box_offset_factor', 20)
        self.features_per_timestep = kwargs.pop('features_per_timestep', 256)

        super().__init__()

        with self.init_scope():
            self.feature_extractor = ResNet(kwargs.pop('num_layers', 18))

        self.visual_backprop = VisualBackprop()
        self.visual_backprop_anchors = []
        self.out_size = out_size

        self.rotation_dropout_params = [1, 0, 1, 0, 1, 1]
        self.translation_dropout_params = [1, 1, 0, 1, 1, 0]

    @maybe_copy
    def __call__(self, images):
        self.visual_backprop_anchors.clear()
        h = self.feature_extractor(images)
        self.visual_backprop_anchors.append(h)

        batch_size = len(h)
        transform_params = self.get_transform_params(h)

        boxes = F.spatial_transformer_grid(transform_params, self.out_size)

        expanded_images = F.broadcast_to(F.expand_dims(images, axis=1), (batch_size, self.num_bboxes_to_localize) + images.shape[1:])
        expanded_images = F.reshape(expanded_images, (-1,) + expanded_images.shape[2:])
        rois = F.spatial_transformer_sampler(expanded_images, boxes)

        rois = F.reshape(rois, (batch_size, self.num_bboxes_to_localize, images.shape[1], self.out_size.height, self.out_size.width))
        boxes = F.reshape(boxes, (batch_size, self.num_bboxes_to_localize, 2, self.out_size.height, self.out_size.width))

        # return shapes:
        # 1. batch_size, num_bboxes, num_channels, (out-)height, (out-)width
        # 2. batch_size, num_bboxes, 2, (out-)height, (out-)width
        return rois, boxes

    def get_transform_params(self, features):
        raise NotImplementedError

    @maybe_copy
    def predict(self, images, return_visual_backprop=False):
        with cuda.Device(self._device_id):
            if isinstance(images, list):
                images = [self.xp.array(image) for image in images]
                images = self.xp.stack(images, axis=0)

            visual_backprop = None
            with chainer.using_config('train', False):
                roi, bbox = self(images)
                rois = [roi]
                bboxes = [bbox]
                if return_visual_backprop:
                    if not hasattr(self, 'visual_backprop'):
                        self.visual_backprop = VisualBackprop()
                    visual_backprop = self.visual_backprop.perform_visual_backprop(self.visual_backprop_anchors[0])

        bboxes = F.stack(bboxes, axis=1)
        bboxes = F.reshape(bboxes, (-1,) + bboxes.shape[2:])
        rois = F.stack(rois, axis=1)
        rois = F.reshape(rois, (-1,) + rois.shape[2:])

        return rois, bboxes, visual_backprop

    def virtual_box_number_increase(self, boxes, image_shape):
        image_shape = Size(*image_shape)
        offset_boxes = []
        box_offset_bounds = self.box_offset_side_length // 2
        x_box_shifts = self.xp.random.randint(1, 20, size=(self.box_offset_side_length, self.box_offset_side_length))
        y_box_shifts = self.xp.random.randint(1, 20, size=(self.box_offset_side_length, self.box_offset_side_length))
        # walk the full (side_length x side_length) grid of offsets around each box
        for i in range(-box_offset_bounds, box_offset_bounds + 1):
            for j in range(-box_offset_bounds, box_offset_bounds + 1):
                x_shift = boxes[:, 0, :, :] + j * (x_box_shifts[i, j] / image_shape.width)
                y_shift = boxes[:, 1, :, :] + i * (y_box_shifts[i, j] / image_shape.height)
                offset_boxes.append(F.stack([x_shift, y_shift], axis=1))
        return F.stack(offset_boxes, axis=1)
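
With the default box_offset_side_length of 3, the loops in virtual_box_number_increase walk a 3x3 grid of offsets, so every localized box yields nine jittered copies. A quick sketch of the grid being enumerated:

side_length = 3
bound = side_length // 2
offsets = [(i, j) for i in range(-bound, bound + 1)
                  for j in range(-bound, bound + 1)]
print(len(offsets), offsets)  # 9 offsets, from (-1, -1) to (1, 1)
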
Example #11
class BBOXPlotter(Extension):

    def __init__(self, image, out_dir, out_size, **kwargs):
        super(BBOXPlotter, self).__init__()
        self.image = image
        self.reference_image = kwargs.pop("reference_image", None)
        self.reference_features = None
        self.render_extracted_rois = kwargs.pop("render_extracted_rois", True)
        self.image_size = Size(height=image.shape[1], width=image.shape[2])
        self.out_dir = out_dir
        os.makedirs(self.out_dir, exist_ok=True)
        self.out_size = out_size
        self.colours = get_next_color
        self.send_bboxes = kwargs.pop("send_bboxes", False)
        self.upstream_ip = kwargs.pop("upstream_ip", '127.0.0.1')
        self.upstream_port = kwargs.pop("upstream_port", 1337)
        self.font = ImageFont.truetype("train_utils/DejaVuSans.ttf", 14)
        self.visualization_anchors = kwargs.pop("visualization_anchors", [])
        self.visual_backprop = VisualBackprop()
        self.vis_features = kwargs.pop("feature_anchors", [])
        self.plot_objectness_classification_result = kwargs.pop('plot_objectness_classification_result', False)
        self.show_visual_backprop_overlay = kwargs.pop('show_visual_backprop_overlay', False)
        # index of the visual backprop prediction that is to be shown in the overlay
        self.visual_backprop_index = kwargs.pop('visual_backprop_index', 0)
        self.show_backprop_and_feature_vis = kwargs.pop('show_backprop_and_feature_vis', False)
        self.get_discriminator_output_function = kwargs.pop('discriminator_output_function', self.get_discriminator_output)
        self.render_pca = kwargs.pop('render_pca', False)
        self.gt_bbox = kwargs.pop('gt_bbox', None)
        self.xp = np
        self.devices = kwargs.pop('devices', None)
        self.log_name = kwargs.pop('log_name', 'training')
        self.max_num_rois_to_render = kwargs.pop('num_rois_to_render', None)
        self.sort_rois = kwargs.pop('sort_rois', False)

        self.init_predictors(kwargs.pop("predictors", {}))

    def init_predictors(self, predictors):
        self.localizer = predictors['localizer']
        # self.assessor = predictors['assessor']

    def initialize(self, trainer):
        # run the network once with the randomly initialized weights we start from
        self(trainer)

    def send_image(self, data):
        height = data.height
        width = data.width
        channels = len(data.getbands())

        # convert image to png in order to save network bandwidth
        png_stream = BytesIO()
        data.save(png_stream, format="PNG")
        png_stream = png_stream.getvalue()

        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            try:
                sock.connect((self.upstream_ip, self.upstream_port))
            except Exception as e:
                print(e)
                print("could not connect to display server, disabling image rendering")
                self.send_bboxes = False
                return
            data = {
                'width': width,
                'height': height,
                'channels': channels,
                'title': self.log_name,
                'image': base64.b64encode(png_stream).decode('utf-8'),
            }
            sock.send(bytes(json.dumps(data), 'utf-8'))

    def array_to_image(self, array):
        if array.shape[0] == 1:
            # image is black and white; tile it so downstream code sees an RGB image
            array = self.xp.tile(array, (3, 1, 1))
        array = array.copy() * 255
        return Image.fromarray(cuda.to_cpu(array.transpose(1, 2, 0)).astype(np.uint8), "RGB").convert("RGBA")

    def variable_to_image(self, data):
        if isinstance(data, chainer.Variable):
            data = data.data
        return self.array_to_image(data)

    def get_predictions(self, image):
        rois, bboxes, localizer_visual_backprop = self.localizer.predict(
            image[self.xp.newaxis, ...],
            return_visual_backprop=True
        )
        # assessor_prediction, assessor_visual_backprop = self.assessor.predict(rois, return_visual_backprop=True)

        return {
            "rois": rois,
            "bboxes": bboxes,
            # "assessor_prediction": assessor_prediction,
            "visual_backprop": {
                "localizer": getattr(localizer_visual_backprop, 'array', localizer_visual_backprop),
                # "assessor": getattr(assessor_visual_backprop, 'array', assessor_visual_backprop),
            }
        }

    def sort_predictions(self, predictions):
        rois, assessor_output, assessor_visual_backprop = self.sort_rois_and_scores(
            predictions["rois"],
            predictions["assessor_prediction"],
            self.assessor,
            roi_visual_backprop=predictions["visual_backprop"]["assessor"]
        )
        predictions["rois"] = rois
        predictions["assessor_prediction"] = assessor_output
        predictions["visual_backprop"]["assessor"] = assessor_visual_backprop
        return predictions

    def filter_predictions(self, predictions):
        predictions["rois"] = predictions["rois"][:self.max_num_rois_to_render]
        # predictions["assessor_prediction"] = predictions["assessor_prediction"][:self.max_num_rois_to_render]
        # predictions["visual_backprop"]["assessor"] = predictions["visual_backprop"]["assessor"][:self.max_num_rois_to_render]
        return predictions

    def render_predictions(self, dest_image, predictions):
        return self.render_discriminator_result(
            dest_image,
            self.array_to_image(self.image.copy()),
            self.get_discriminator_output_function(predictions["assessor_prediction"])
        )

    def __call__(self, trainer):
        iteration = trainer.updater.iteration

        with chainer.using_device(trainer.updater.get_optimizer('opt_gen').target.device), chainer.using_config('train', False):
            self.xp = trainer.updater.get_optimizer('opt_gen').target.device.xp
            image = self.xp.asarray(self.image)
            predictions = self.get_predictions(image)

            if self.sort_rois:
                predictions = self.sort_predictions(predictions)

            if self.render_extracted_rois and self.max_num_rois_to_render is not None:
                predictions = self.filter_predictions(predictions)

            dest_image = self.render_rois(
                predictions["rois"],
                predictions["bboxes"],
                iteration,
                self.image.copy(),
                backprop_vis=predictions["visual_backprop"]['localizer'],
            )

            # self.show_backprop_vis(
            #     predictions["visual_backprop"]["assessor"],
            #     dest_image,
            #     self.array_to_image(self.image.copy()),
            #     count=1
            # )

            dest_image = self.render_predictions(dest_image, predictions)

            if self.gt_bbox is not None:
                dest_image = self.draw_gt_bbox(dest_image)
            if self.render_pca and self.render_extracted_rois:
                dest_image = self.show_pca(dest_image, trainer.updater)
            self.save_image(dest_image, iteration)

    def get_feature_maps(self, predictor):
        feature_visualizations = []
        for feature_anchor in self.vis_features:
            targets = predictor
            for attr in feature_anchor:
                targets = getattr(targets, attr, None)
            if targets is not None:
                for target in targets:
                    feature_visualizations.append(self.show_feature_map(target))
        return feature_visualizations

    def get_backprop_visualization(self, predictor):
        backprop_visualizations = []
        for visanchor in self.visualization_anchors:
            vis_targets = predictor
            for target in visanchor:
                vis_targets = getattr(vis_targets, target)
            if vis_targets is not None:
                if not hasattr(vis_targets, '__iter__'):
                    vis_targets = [vis_targets]
                for vis_target in vis_targets:
                    backprop_visualizations.append(self.visual_backprop.perform_visual_backprop(vis_target))
        return backprop_visualizations

    @property
    def original_image_paste_location(self):
        return 0, 0

    def compose_image_and_visual_backprop(self, original_image, backprop_image):
        backprop_image = self.array_to_image(
            self.xp.tile(backprop_image, (3, 1, 1))
        ).resize(
            (self.image_size.width, self.image_size.height)
        )
        original_image = original_image.convert("RGBA")
        backprop_image = backprop_image.convert("RGBA")

        resulting_image = Image.blend(original_image, backprop_image, 0.6)
        return resulting_image

    def render_rois(self, rois, bboxes, iteration, image, backprop_vis=(), feature_vis=()):
        image = self.array_to_image(image)

        num_timesteps = self.get_num_timesteps(bboxes)
        bboxes, dest_image = self.set_output_sizes(backprop_vis, feature_vis, bboxes, image, num_timesteps, len(rois))

        if self.render_extracted_rois:
            self.render_extracted_regions(dest_image, image, rois)

        if len(backprop_vis) != 0 and self.show_backprop_and_feature_vis:
            # if we have a backprop visualization we can show it now
            self.show_backprop_vis(backprop_vis, dest_image, image)

        if self.show_visual_backprop_overlay and len(backprop_vis) != 0:
            backprop_image_to_show = backprop_vis[self.visual_backprop_index][0]
            image = self.compose_image_and_visual_backprop(image, backprop_image_to_show)

        if len(feature_vis) != 0 and self.show_backprop_and_feature_vis:
            self.show_backprop_vis(feature_vis, dest_image, image, image.height)

        self.draw_bboxes(bboxes, image)
        dest_image.paste(image, self.original_image_paste_location)
        return dest_image

    def sort_rois_and_scores(self, rois, scores, assessor, roi_visual_backprop=None):
        # sort rois based on scores obtained by assessor, but only sort using first prediction
        sort_scores = assessor.extract_iou_prediction(scores)
        sort_scores = sort_scores.data.copy()
        score_indices = sort_scores.argsort()[::-1]
        rois = rois[score_indices]
        scores = scores[score_indices]
        if roi_visual_backprop is not None:
            roi_visual_backprop = roi_visual_backprop[score_indices]
            return rois, scores, roi_visual_backprop
        return rois, scores

    def save_image(self, dest_image, iteration):
        dest_image.save("{}.png".format(os.path.join(self.out_dir, str(iteration))), 'png')
        if self.send_bboxes:
            self.send_image(dest_image)

    def get_num_timesteps(self, bboxes):
        return bboxes.shape[0]

    def set_output_sizes(self, backprop_vis, feature_vis, bboxes, image, num_timesteps, num_rois_to_render):
        _, num_channels, height, width = bboxes.shape

        image_height = image.height if (len(backprop_vis) == 0 or not self.show_backprop_and_feature_vis) and not self.render_pca else image.height + self.image_size.height
        image_height = image_height + self.image_size.height if len(feature_vis) > 0 and self.show_backprop_and_feature_vis else image_height
        image_width = image.width + image.width * num_rois_to_render if self.render_extracted_rois else image.width

        dest_image = Image.new("RGBA", (image_width, image_height), color='black')
        bboxes = F.reshape(bboxes, (num_timesteps, 1, num_channels, height, width))

        return bboxes, dest_image

    def show_backprop_vis(self, visualizations, dest_image, image, height_offset=0, count=0):
        for visualization in visualizations:
            for vis in visualization:
                backprop_image = self.array_to_image(self.xp.tile(vis, (3, 1, 1))).resize(
                    (self.image_size.width, self.image_size.height))
                dest_image.paste(backprop_image, (count * backprop_image.width, height_offset + image.height))
                count += 1

    def show_feature_map(self, feature_map):
        with chainer.no_backprop_mode():
            averaged_feature_map = F.average(feature_map, axis=1, keepdims=True)[0]
            averaged_feature_map -= averaged_feature_map.data.min()
            max_value = averaged_feature_map.data.max()
            if max_value > 0:
                averaged_feature_map /= max_value
        return averaged_feature_map[None, ...].data

    def show_pca(self, dest_image, updater):
        colors = ['navy', 'turquoise', 'darkorange']
        if getattr(updater, 'pca', None) is None:
            return dest_image
        pca_discriminator = updater.pca.reshape(3, -1, updater.n_components_pca)

        plt.figure()
        for i, color in enumerate(colors):
            plt.scatter(pca_discriminator[i, :, 0], pca_discriminator[i, :, 1], color=color, lw=2)
        plt.legend(['fake', 'real', 'anchor'])

        canvas = plt.get_current_fig_manager().canvas
        canvas.draw()
        image = Image.frombytes('RGB', canvas.get_width_height(), canvas.tostring_rgb())
        image = image.resize((self.image_size.width, self.image_size.height), Image.LANCZOS)
        dest_image.paste(image, (self.image_size.width, self.image_size.height))
        plt.close()
        return dest_image

    def render_extracted_regions(self, dest_image, image, rois):
        num_rois, num_channels, height, width = rois.shape
        if num_rois == 0:
            return
        rois = rois.reshape(len(rois), -1, num_channels, height, width)

        for i, roi in enumerate(rois, start=1):
            roi_image = self.variable_to_image(roi[0])
            paste_location = i * image.width, 0
            dest_image.paste(roi_image.resize((self.image_size.width, self.image_size.height)), paste_location)

    def draw_bboxes(self, bboxes, image):
        if len(bboxes) == 0:
            return
        draw = ImageDraw.Draw(image)
        for i, sub_box in enumerate(F.separate(bboxes, axis=1)):
            for bbox, colour in zip(F.separate(sub_box, axis=0), self.colours()):
                bbox.data[...] = (bbox.data[...] + 1) / 2
                bbox.data[0, :] *= self.image_size.width
                bbox.data[1, :] *= self.image_size.height

                x = self.xp.clip(bbox.data[0, :].reshape(self.out_size), 0, self.image_size.width) + i * self.image_size.width
                y = self.xp.clip(bbox.data[1, :].reshape(self.out_size), 0, self.image_size.height)

                top_left = (x[0, 0], y[0, 0])
                top_right = (x[0, -1], y[0, -1])
                bottom_left = (x[-1, 0], y[-1, 0])
                bottom_right = (x[-1, -1], y[-1, -1])

                corners = [top_left, top_right, bottom_right, bottom_left]
                self.draw_bbox(colour, corners, draw)

    def draw_bbox(self, colour, corners, draw):
        next_corners = corners[1:] + [corners[0]]
        for first_corner, next_corner in zip(corners, next_corners):
            draw.line([first_corner, next_corner], fill=colour, width=3)

    def get_discriminator_output(self, discriminator_result):
        if discriminator_result.shape[1] > 1:
            discriminator_result = F.softmax(discriminator_result, axis=1)
        results = []
        for result in discriminator_result:
            if result.shape[0] == 1:
                result = format(float(result.data), ".3f")
            else:
                result = str(int(result.data.argmax()))
            results.append(result)
        return results

    def render_discriminator_result(self, dest_image, source_image, discriminator_result):
        for i, result in enumerate(discriminator_result, start=1):
            dest_image = self.render_text(dest_image, source_image, result, i)
        return dest_image

    def render_text(self, dest_image, source_image, text, i, bottom=False):
        label_image = Image.new(dest_image.mode, dest_image.size)
        draw = ImageDraw.Draw(label_image)
        paste_width = (i + 1) * source_image.width
        text_width, text_height = draw.textsize(text, self.font)
        insert_height = source_image.height - text_height - 1 if bottom else 0
        draw.rectangle([paste_width - text_width - 1, insert_height, paste_width, insert_height + text_height],
                       fill=(255, 255, 255, 160))
        draw.text((paste_width - text_width - 1, insert_height), text, fill='green', font=self.font)
        dest_image = Image.alpha_composite(dest_image, label_image)
        return dest_image

    def draw_gt_bbox(self, image):
        draw = ImageDraw.Draw(image)
        for bbox in self.gt_bbox:
            top_left = bbox[1], bbox[0]
            top_right = bbox[3], bbox[0]
            bottom_left = bbox[1], bbox[2]
            bottom_right = bbox[3], bbox[2]

            colour = COLOR_MAP[-1]
            self.draw_bbox(colour, [top_left, top_right, bottom_right, bottom_left], draw)
        return image
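
sort_rois_and_scores above ranks rois best-first: numpy's argsort is ascending, so the [::-1] reversal puts the highest assessor score at index 0. A minimal sketch:

import numpy as np

scores = np.array([0.2, 0.9, 0.5])
order = scores.argsort()[::-1]  # descending: best score first
print(order)                    # [1 2 0]
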
Example #12
class SheepLocalizer(Chain):
    def __init__(self,
                 out_size,
                 transform_rois_to_grayscale=False,
                 train_imagenet=False):
        super().__init__()
        with self.init_scope():
            self.feature_extractor = ResNet(
                18, class_labels=1000 if train_imagenet else None)

            if not train_imagenet:
                self.res6 = BasicBlock(2, 512)
                self.res7 = BasicBlock(2, 512)
                self.param_predictor = L.Linear(512, 6)

                transform_bias = self.param_predictor.b.data
                transform_bias[[0, 4]] = 0.8
                transform_bias[[2, 5]] = 0
                self.param_predictor.W.data[...] = 0

        self.visual_backprop_anchors = []
        self.out_size = out_size
        self.transform_rois_to_grayscale = transform_rois_to_grayscale
        self.visual_backprop = VisualBackprop()
        self.train_imagenet = train_imagenet

    def __call__(self, images):
        self.visual_backprop_anchors.clear()

        with cuda.Device(images.data.device):
            input_images = self.prepare_images(images.copy() * 255)
        h = self.feature_extractor(input_images)

        if self.train_imagenet:
            return h

        if images.shape[-2] > 224:
            h = self.res6(h)

            if images.shape[-2] > 300:
                h = self.res7(h)

        self.visual_backprop_anchors.append(h)
        h = _global_average_pooling_2d(h)

        transform_params = self.param_predictor(h)
        transform_params = rotation_dropout(F.reshape(transform_params,
                                                      (-1, 2, 3)),
                                            ratio=0.0)
        points = F.spatial_transformer_grid(transform_params, self.out_size)
        rois = F.spatial_transformer_sampler(images, points)

        if self.transform_rois_to_grayscale:
            assert rois.shape[1] == 3, "rois are not in RGB, cannot convert them to grayscale"
            b, g, r = F.split_axis(rois, 3, axis=1)
            rois = 0.299 * r + 0.587 * g + 0.114 * b

        return rois, points

    def prepare_images(self, images):
        if self.xp != np:
            device = images.data.device
            images = F.copy(images, -1)

        converted_images = [
            resnet.prepare(image.data, size=None)
            for image in F.separate(images, axis=0)
        ]
        converted_images = F.stack(converted_images, axis=0)

        if self.xp != np:
            converted_images = F.copy(converted_images, device.id)
        return converted_images

    def extract_corners(self, bboxes):
        top = bboxes[:, 1, 0, 0]
        left = bboxes[:, 0, 0, 0]
        bottom = bboxes[:, 1, -1, -1]
        right = bboxes[:, 0, -1, -1]

        corners = F.stack([top, left, bottom, right], axis=1)
        return corners

    def scale_bboxes(self, bboxes, image_size):
        bboxes = (bboxes + 1) / 2
        bboxes.data[:, ::2] *= image_size.height
        bboxes.data[:, 1::2] *= image_size.width
        return bboxes

    def predict(self, images, return_visual_backprop=False):
        with cuda.Device(self._device_id):
            images = [self.xp.array(image) for image in images]
            images = self.xp.stack(images, axis=0)
            with chainer.using_config('train', False):
                rois, bboxes = self(images)
                if return_visual_backprop:
                    if not hasattr(self, 'visual_backprop'):
                        self.visual_backprop = VisualBackprop()
                    visual_backprop = cuda.to_cpu(
                        self.visual_backprop.perform_visual_backprop(
                            self.visual_backprop_anchors[0]))
                else:
                    visual_backprop = None

                bboxes = self.extract_corners(bboxes)
                bboxes = self.scale_bboxes(bboxes,
                                           Size._make(images.shape[-2:]))

        bboxes = [cuda.to_cpu(bbox).reshape(1, -1) for bbox in bboxes.data]

        return bboxes, rois, np.ones((len(bboxes), 1)), visual_backprop
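
The grayscale conversion in Example #12 uses the ITU-R BT.601 luma weights, with the channels unpacked in BGR order. The same computation with plain numpy, on a dummy roi batch:

import numpy as np

rois = np.random.rand(2, 3, 32, 32).astype(np.float32)  # NCHW, BGR channel order as above
b, g, r = rois[:, 0:1], rois[:, 1:2], rois[:, 2:3]
gray = 0.299 * r + 0.587 * g + 0.114 * b  # BT.601 luma
print(gray.shape)  # (2, 1, 32, 32)
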