def __init__(self, dropout_factor, num_timesteps, zoom=0.9):
    super(FSNSMultipleSTNLocalizationNet, self).__init__()
    with self.init_scope():
        self.conv0 = L.Convolution2D(None, 32, 3, pad=1)
        self.bn0 = L.BatchNormalization(32)
        self.rs1 = ResnetBlock(32)
        self.rs2 = ResnetBlock(48, filter_increase=True)
        self.rs3 = ResnetBlock(48)
        self.lstm = L.LSTM(None, 256)
        self.translation_transform = L.Linear(256, 6)
        self.rotation_transform = L.Linear(256, 6)
        self.transform_2 = L.LSTM(256, 6)

    self.dropout_factor = dropout_factor
    self._train = True
    self.num_timesteps = num_timesteps

    for transform in [self.translation_transform, self.rotation_transform]:
        transform_bias = transform.b.data
        transform_bias[[0, 4]] = zoom
        transform_bias[[2, 5]] = 0
        transform.W.data[...] = 0

    # self.transform_2.upward.b.data[...] = 0
    # self.transform_2.upward.W.data[...] = 0
    # self.transform_2.lateral.W.data[...] = 0
    # self.transform.W.data[...] = 0

    self.visual_backprop = VisualBackprop()
    self.vis_anchor = None
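# Illustrative sketch (not from the original source) of why the bias
# initialization above writes `zoom` into indices [0, 4] and 0 into [2, 5]:
# the 6-vector predicted per transform is the flattened 2x3 affine matrix
# [[s_x, 0, t_x], [0, s_y, t_y]] consumed by F.spatial_transformer_grid,
# so indices 0 and 4 hold the scale terms and 2 and 5 the translations.
import numpy as np
import chainer.functions as F

zoom = 0.9
theta = np.zeros((1, 6), dtype=np.float32)
theta[:, [0, 4]] = zoom  # start with a slightly zoomed-in identity transform
theta[:, [2, 5]] = 0     # no initial translation
grid = F.spatial_transformer_grid(theta.reshape(-1, 2, 3), (24, 24))
print(grid.shape)  # (1, 2, 24, 24): x/y sampling coordinates in [-1, 1]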
def __init__(self, dropout_ratio, num_timesteps, zoom=0.9, use_dropout=False):
    super(FSNSSingleSTNLocalizationNet, self).__init__()
    with self.init_scope():
        self.conv0 = L.Convolution2D(None, 32, 3, pad=1)
        self.bn0 = L.BatchNormalization(32)
        self.rs1 = ResnetBlock(32, use_dropout=use_dropout, dropout_ratio=dropout_ratio)
        self.rs2 = ResnetBlock(48, filter_increase=True, use_dropout=use_dropout, dropout_ratio=dropout_ratio)
        self.rs3 = ResnetBlock(48)
        # self.rs4 = ResnetBlock(16, filter_increase=True)
        self.lstm = L.LSTM(None, 256)
        self.transform_2 = L.LSTM(256, 6)

    self.dropout_ratio = dropout_ratio
    self.use_dropout = use_dropout
    self._train = True
    self.num_timesteps = num_timesteps

    # initialize transform
    # self.transform_2.W.data[...] = 0
    #
    # transform_bias = self.transform_2.b.data
    # transform_bias[[0, 4]] = zoom
    # transform_bias[[2, 5]] = 0

    self.visual_backprop = VisualBackprop()
    self.vis_anchor = None

    self.width_encoding = None
    self.height_encoding = None
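# Minimal sketch (shapes assumed, not from the original source) of the
# per-timestep prediction pattern used by the localization nets above: the
# same feature vector is fed to an LSTM once per timestep, and each step
# emits one flattened 2x3 affine transform, as transform_2 does above.
import numpy as np
import chainer
import chainer.links as L

lstm = L.LSTM(None, 6)  # stands in for transform_2 above
features = chainer.Variable(np.zeros((1, 256), dtype=np.float32))
transforms = [lstm(features) for _ in range(3)]  # three timesteps -> three transforms
print([t.shape for t in transforms])  # [(1, 6), (1, 6), (1, 6)]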
class BBOXPlotter(Extension):

    def __init__(self, image, out_dir, out_size, loss_metrics, **kwargs):
        super(BBOXPlotter, self).__init__()
        self.image = image
        self.render_extracted_rois = kwargs.pop("render_extracted_rois", True)
        self.image_size = Size(height=image.shape[1], width=image.shape[2])
        self.out_dir = out_dir
        os.makedirs(self.out_dir, exist_ok=True)
        self.out_size = out_size
        self.colours = COLOR_MAP
        self.send_bboxes = kwargs.pop("send_bboxes", False)
        self.upstream_ip = kwargs.pop("upstream_ip", '127.0.0.1')
        self.upstream_port = kwargs.pop("upstream_port", 1337)
        self.loss_metrics = loss_metrics
        self.font = ImageFont.truetype("utils/DejaVuSans.ttf", 20)
        self.visualization_anchors = kwargs.pop("visualization_anchors", [])
        self.visual_backprop = VisualBackprop()
        self.xp = np

    def send_image(self, data):
        height = data.height
        width = data.width
        channels = len(data.getbands())

        # convert the image to PNG in order to save network bandwidth
        png_stream = BytesIO()
        data.save(png_stream, format="PNG")
        png_stream = png_stream.getvalue()

        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            try:
                sock.connect((self.upstream_ip, self.upstream_port))
            except Exception as e:
                print(e)
                print("could not connect to display server, disabling image rendering")
                self.send_bboxes = False
                return
            data = {
                'width': width,
                'height': height,
                'channels': channels,
                'image': base64.b64encode(png_stream).decode('utf-8'),
            }
            sock.send(bytes(json.dumps(data), 'utf-8'))

    def array_to_image(self, array):
        if array.shape[0] == 1:
            # the image is black and white; tile it so downstream code sees an RGB image
            array = self.xp.tile(array, (3, 1, 1))
        return Image.fromarray(cuda.to_cpu(array.transpose(1, 2, 0) * 255).astype(np.uint8), "RGB").convert("RGBA")

    def variable_to_image(self, variable):
        return self.array_to_image(variable.data)

    def __call__(self, trainer):
        iteration = trainer.updater.iteration

        with cuda.get_device_from_id(trainer.updater.get_optimizer('main').target._device_id), chainer.using_config('train', False):
            self.xp = np if trainer.updater.get_optimizer('main').target._device_id < 0 else cuda.cupy
            image = self.xp.asarray(self.image)
            predictor = trainer.updater.get_optimizer('main').target.predictor
            predictions, rois, bboxes = predictor(image[self.xp.newaxis, ...])

            backprop_visualizations = []
            for visanchor in self.visualization_anchors:
                vis_target = predictor
                for target in visanchor:
                    vis_target = getattr(vis_target, target)
                backprop_visualizations.append(self.visual_backprop.perform_visual_backprop(vis_target))

            self.render_rois(predictions, rois, bboxes, iteration, self.image.copy(), backprop_vis=backprop_visualizations)

    @property
    def original_image_paste_location(self):
        return 0, 0

    def render_rois(self, predictions, rois, bboxes, iteration, image, backprop_vis=()):
        # get the predicted text
        text = self.decode_predictions(predictions)

        image = self.array_to_image(image)

        num_timesteps = self.get_num_timesteps(bboxes)
        bboxes, dest_image = self.set_output_sizes(backprop_vis, bboxes, image, num_timesteps)

        if self.render_extracted_rois:
            self.render_extracted_regions(dest_image, image, rois, num_timesteps)

        if len(backprop_vis) != 0:
            # if we have a backprop visualization we can show it now
            self.show_backprop_vis(backprop_vis, dest_image, image, num_timesteps)

        self.draw_bboxes(bboxes, image)
        dest_image.paste(image, self.original_image_paste_location)
        if len(text) > 0:
            dest_image = self.render_text(dest_image, text)
        dest_image.save("{}.png".format(os.path.join(self.out_dir, str(iteration))), 'png')
        if self.send_bboxes:
            self.send_image(dest_image)

    def get_num_timesteps(self, bboxes):
        return bboxes.shape[0]

    def set_output_sizes(self, backprop_vis, bboxes, image, num_timesteps):
        _, num_channels, height, width = bboxes.shape
        image_height = image.height if len(backprop_vis) == 0 else image.height + self.image_size.height
        image_width = image.width + image.width * num_timesteps if self.render_extracted_rois else image.width
        dest_image = Image.new("RGBA", (image_width, image_height), color='black')
        bboxes = F.reshape(bboxes, (num_timesteps, 1, num_channels, height, width))
        return bboxes, dest_image

    def show_backprop_vis(self, backprop_vis, dest_image, image, num_timesteps):
        count = 0
        for visualization in backprop_vis:
            for vis in visualization:
                backprop_image = self.array_to_image(self.xp.tile(vis[0], (3, 1, 1))).resize(
                    (self.image_size.width, self.image_size.height))
                dest_image.paste(backprop_image, (count * backprop_image.width, image.height))
                count += 1

    def decode_predictions(self, predictions):
        words = []
        for prediction in predictions:
            if isinstance(prediction, list):
                prediction = F.concat([F.expand_dims(p, axis=0) for p in prediction], axis=0)

            prediction = self.xp.transpose(prediction.data, (1, 0, 2))
            prediction = self.xp.squeeze(prediction, axis=0)
            prediction = self.xp.argmax(prediction, axis=1)
            word = self.loss_metrics.strip_prediction(prediction[self.xp.newaxis, ...])[0]
            if len(word) == 1 and word[0] == 0:
                continue
            word = "".join(map(self.loss_metrics.label_to_char, word))
            word = word.replace(chr(self.loss_metrics.char_map[str(self.loss_metrics.blank_symbol)]), '')
            if len(word) > 0:
                words.append(word)

        text = " ".join(words)
        return text

    def render_extracted_regions(self, dest_image, image, rois, num_timesteps):
        _, num_channels, height, width = rois.shape
        rois = self.xp.reshape(rois, (num_timesteps, -1, num_channels, height, width))

        for i, roi in enumerate(rois, start=1):
            roi_image = self.variable_to_image(roi[0])
            paste_location = i * image.width, 0
            dest_image.paste(roi_image.resize((self.image_size.width, self.image_size.height)), paste_location)

    def render_text(self, dest_image, text):
        label_image = Image.new(dest_image.mode, dest_image.size)
        # only keep ascii characters
        # labels = ''.join(filter(lambda x: len(x) == len(x.encode()), labels))
        draw = ImageDraw.Draw(label_image)
        text_width, text_height = draw.textsize(text, font=self.font)
        draw.rectangle([dest_image.width - text_width - 1, 0, dest_image.width, text_height], fill=(255, 255, 255, 160))
        draw.text((dest_image.width - text_width - 1, 0), text, fill='green', font=self.font)
        dest_image = Image.alpha_composite(dest_image, label_image)
        return dest_image

    def draw_bboxes(self, bboxes, image):
        draw = ImageDraw.Draw(image)
        for i, sub_box in enumerate(F.separate(bboxes, axis=1)):
            for bbox, colour in zip(F.separate(sub_box, axis=0), self.colours):
                # map the sampling grid from [-1, 1] to pixel coordinates
                bbox.data[...] = (bbox.data[...] + 1) / 2
                bbox.data[0, :] *= self.image_size.width
                bbox.data[1, :] *= self.image_size.height

                x = self.xp.clip(bbox.data[0, :].reshape(self.out_size), 0, self.image_size.width) + i * self.image_size.width
                y = self.xp.clip(bbox.data[1, :].reshape(self.out_size), 0, self.image_size.height)

                top_left = (x[0, 0], y[0, 0])
                top_right = (x[0, -1], y[0, -1])
                bottom_left = (x[-1, 0], y[-1, 0])
                bottom_right = (x[-1, -1], y[-1, -1])

                corners = [top_left, top_right, bottom_right, bottom_left]
                next_corners = corners[1:] + [corners[0]]

                for first_corner, next_corner in zip(corners, next_corners):
                    draw.line([first_corner, next_corner], fill=colour, width=3)
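# Standalone numpy sketch (sizes assumed, not from the original source) of the
# coordinate handling in draw_bboxes above: the sampling grid produced by
# F.spatial_transformer_grid lives in [-1, 1], so the plotter first shifts it
# into [0, 1], scales by the input image size, and clips to the image bounds.
import numpy as np

image_width, image_height = 150, 48    # hypothetical input image size
grid_x = np.array([[-1.0, 0.0, 1.0]])  # hypothetical x coordinates in [-1, 1]
grid_y = np.array([[-0.5, 0.0, 0.5]])  # hypothetical y coordinates in [-1, 1]
x = np.clip((grid_x + 1) / 2 * image_width, 0, image_width)
y = np.clip((grid_y + 1) / 2 * image_height, 0, image_height)
print(x)  # [[  0.  75. 150.]]
print(y)  # [[12. 24. 36.]]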
class TextLocalizer(Chain):

    def __init__(self, out_size, **kwargs):
        self.transform_rois_to_grayscale = kwargs.pop('transform_rois_to_grayscale', False)
        self.num_bboxes_to_localize = kwargs.pop('num_bboxes_to_localize', 1)
        self.dropout_ratio = kwargs.pop('dropout_ratio', 0)
        self.box_offset_side_length = kwargs.pop('box_offset_side_length', 3)
        self.box_offset_factor = kwargs.pop('box_offset_factor', 20)
        self.features_per_timestep = kwargs.pop('features_per_timestep', 256)
        super().__init__()
        with self.init_scope():
            self.feature_extractor = ResNet(kwargs.pop('num_layers', 18))
            self.visual_backprop = VisualBackprop()

        self.visual_backprop_anchors = []
        self.out_size = out_size
        self.rotation_dropout_params = [1, 0, 1, 0, 1, 1]
        self.translation_dropout_params = [1, 1, 0, 1, 1, 0]

    @maybe_copy
    def __call__(self, images):
        self.visual_backprop_anchors.clear()

        h = self.feature_extractor(images)
        self.visual_backprop_anchors.append(h)
        batch_size = len(h)

        transform_params = self.get_transform_params(h)
        boxes = F.spatial_transformer_grid(transform_params, self.out_size)

        expanded_images = F.broadcast_to(F.expand_dims(images, axis=1), (batch_size, self.num_bboxes_to_localize) + images.shape[1:])
        expanded_images = F.reshape(expanded_images, (-1,) + expanded_images.shape[2:])
        rois = F.spatial_transformer_sampler(expanded_images, boxes)

        rois = F.reshape(rois, (batch_size, self.num_bboxes_to_localize, images.shape[1], self.out_size.height, self.out_size.width))
        boxes = F.reshape(boxes, (batch_size, self.num_bboxes_to_localize, 2, self.out_size.height, self.out_size.width))

        # return shapes:
        # 1. batch_size, num_bboxes, num_channels, (out-)height, (out-)width
        # 2. batch_size, num_bboxes, 2, (out-)height, (out-)width
        return rois, boxes

    def get_transform_params(self, features):
        raise NotImplementedError

    @maybe_copy
    def predict(self, images, return_visual_backprop=False):
        with cuda.Device(self._device_id):
            if isinstance(images, list):
                images = [self.xp.array(image) for image in images]
                images = self.xp.stack(images, axis=0)

            visual_backprop = None
            with chainer.using_config('train', False):
                roi, bbox = self(images)
                rois = [roi]
                bboxes = [bbox]

                if return_visual_backprop:
                    if not hasattr(self, 'visual_backprop'):
                        self.visual_backprop = VisualBackprop()
                    visual_backprop = self.visual_backprop.perform_visual_backprop(self.visual_backprop_anchors[0])

            bboxes = F.stack(bboxes, axis=1)
            bboxes = F.reshape(bboxes, (-1,) + bboxes.shape[2:])
            rois = F.stack(rois, axis=1)
            rois = F.reshape(rois, (-1,) + rois.shape[2:])

        return rois, bboxes, visual_backprop

    def virtual_box_number_increase(self, boxes, image_shape):
        image_shape = Size(*image_shape)
        offset_boxes = []
        box_offset_bounds = self.box_offset_side_length // 2
        x_box_shifts = self.xp.random.randint(1, 20, size=(self.box_offset_side_length, self.box_offset_side_length))
        y_box_shifts = self.xp.random.randint(1, 20, size=(self.box_offset_side_length, self.box_offset_side_length))
        # shift each predicted box across a (side_length x side_length) grid of random offsets
        for i in range(-box_offset_bounds, box_offset_bounds + 1):
            for j in range(-box_offset_bounds, box_offset_bounds + 1):
                x_shift = boxes[:, 0, :, :] + j * (x_box_shifts[i, j] / image_shape.width)
                y_shift = boxes[:, 1, :, :] + i * (y_box_shifts[i, j] / image_shape.height)
                offset_boxes.append(F.stack([x_shift, y_shift], axis=1))

        return F.stack(offset_boxes, axis=1)
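# Hedged sketch of a concrete TextLocalizer subclass (illustrative only; the
# real subclasses are not shown here). The contract of get_transform_params is:
# take the (batch, channels, h, w) feature map and return one 2x3 affine
# matrix per box, i.e. a (batch * num_bboxes_to_localize, 2, 3) tensor that
# F.spatial_transformer_grid can consume in __call__ above. The
# `param_predictor` head below is a hypothetical name, not part of the source.
import chainer.functions as F
import chainer.links as L


class LinearHeadTextLocalizer(TextLocalizer):

    def __init__(self, out_size, **kwargs):
        super().__init__(out_size, **kwargs)
        with self.init_scope():
            self.param_predictor = L.Linear(None, 6 * self.num_bboxes_to_localize)

    def get_transform_params(self, features):
        h = F.average(features, axis=(2, 3))  # global average pooling over h and w
        params = self.param_predictor(h)
        return F.reshape(params, (-1, 2, 3))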
class BBOXPlotter(Extension):

    def __init__(self, image, out_dir, out_size, **kwargs):
        super(BBOXPlotter, self).__init__()
        self.image = image
        self.reference_image = kwargs.pop("reference_image", None)
        self.reference_features = None
        self.render_extracted_rois = kwargs.pop("render_extracted_rois", True)
        self.image_size = Size(height=image.shape[1], width=image.shape[2])
        self.out_dir = out_dir
        os.makedirs(self.out_dir, exist_ok=True)
        self.out_size = out_size
        self.colours = get_next_color
        self.send_bboxes = kwargs.pop("send_bboxes", False)
        self.upstream_ip = kwargs.pop("upstream_ip", '127.0.0.1')
        self.upstream_port = kwargs.pop("upstream_port", 1337)
        self.font = ImageFont.truetype("train_utils/DejaVuSans.ttf", 14)
        self.visualization_anchors = kwargs.pop("visualization_anchors", [])
        self.visual_backprop = VisualBackprop()
        self.vis_features = kwargs.pop("feature_anchors", [])
        self.plot_objectness_classification_result = kwargs.pop('plot_objectness_classification_result', False)
        self.show_visual_backprop_overlay = kwargs.pop('show_visual_backprop_overlay', False)
        # index of the visual backprop prediction that is to be shown in the overlay
        self.visual_backprop_index = kwargs.pop('visual_backprop_index', 0)
        self.show_backprop_and_feature_vis = kwargs.pop('show_backprop_and_feature_vis', False)
        self.get_discriminator_output_function = kwargs.pop('discriminator_output_function', self.get_discriminator_output)
        self.render_pca = kwargs.pop('render_pca', False)
        self.gt_bbox = kwargs.pop('gt_bbox', None)
        self.xp = np
        self.devices = kwargs.pop('devices', None)
        self.log_name = kwargs.pop('log_name', 'training')
        self.max_num_rois_to_render = kwargs.pop('num_rois_to_render', None)
        self.sort_rois = kwargs.pop('sort_rois', False)
        self.init_predictors(kwargs.pop("predictors", {}))

    def init_predictors(self, predictors):
        self.localizer = predictors['localizer']
        # self.assessor = predictors['assessor']

    def initialize(self, trainer):
        # run the network with the completely randomized state we start with
        self(trainer)

    def send_image(self, data):
        height = data.height
        width = data.width
        channels = len(data.getbands())

        # convert the image to PNG in order to save network bandwidth
        png_stream = BytesIO()
        data.save(png_stream, format="PNG")
        png_stream = png_stream.getvalue()

        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            try:
                sock.connect((self.upstream_ip, self.upstream_port))
            except Exception as e:
                print(e)
                print("could not connect to display server, disabling image rendering")
                self.send_bboxes = False
                return
            data = {
                'width': width,
                'height': height,
                'channels': channels,
                'title': self.log_name,
                'image': base64.b64encode(png_stream).decode('utf-8'),
            }
            sock.send(bytes(json.dumps(data), 'utf-8'))

    def array_to_image(self, array):
        if array.shape[0] == 1:
            # the image is black and white; tile it so downstream code sees an RGB image
            array = self.xp.tile(array, (3, 1, 1))
        array = array.copy() * 255
        return Image.fromarray(cuda.to_cpu(array.transpose(1, 2, 0)).astype(np.uint8), "RGB").convert("RGBA")

    def variable_to_image(self, data):
        if isinstance(data, chainer.Variable):
            data = data.data
        return self.array_to_image(data)

    def get_predictions(self, image):
        rois, bboxes, localizer_visual_backprop = self.localizer.predict(
            image[self.xp.newaxis, ...],
            return_visual_backprop=True
        )
        # assessor_prediction, assessor_visual_backprop = self.assessor.predict(rois, return_visual_backprop=True)
        return {
            "rois": rois,
            "bboxes": bboxes,
            # "assessor_prediction": assessor_prediction,
            "visual_backprop": {
                "localizer": getattr(localizer_visual_backprop, 'array', localizer_visual_backprop),
                # "assessor": getattr(assessor_visual_backprop, 'array', assessor_visual_backprop),
            }
        }

    def sort_predictions(self, predictions):
        rois, assessor_output, assessor_visual_backprop = self.sort_rois_and_scores(
            predictions["rois"],
            predictions["assessor_prediction"],
            self.assessor,
            roi_visual_backprop=predictions["visual_backprop"]["assessor"]
        )
        predictions["rois"] = rois
        predictions["assessor_prediction"] = assessor_output
        predictions["visual_backprop"]["assessor"] = assessor_visual_backprop
        return predictions

    def filter_predictions(self, predictions):
        predictions["rois"] = predictions["rois"][:self.max_num_rois_to_render]
        # predictions["assessor_prediction"] = predictions["assessor_prediction"][:self.max_num_rois_to_render]
        # predictions["visual_backprop"]["assessor"] = predictions["visual_backprop"]["assessor"][:self.max_num_rois_to_render]
        return predictions

    def render_predictions(self, dest_image, predictions):
        return self.render_discriminator_result(
            dest_image,
            self.array_to_image(self.image.copy()),
            self.get_discriminator_output_function(predictions["assessor_prediction"])
        )

    def __call__(self, trainer):
        iteration = trainer.updater.iteration

        with chainer.using_device(trainer.updater.get_optimizer('opt_gen').target.device), chainer.using_config('train', False):
            self.xp = trainer.updater.get_optimizer('opt_gen').target.device.xp
            image = self.xp.asarray(self.image)
            predictions = self.get_predictions(image)

            if self.sort_rois:
                predictions = self.sort_predictions(predictions)

            if self.render_extracted_rois and self.max_num_rois_to_render is not None:
                predictions = self.filter_predictions(predictions)

            dest_image = self.render_rois(
                predictions["rois"],
                predictions["bboxes"],
                iteration,
                self.image.copy(),
                backprop_vis=predictions["visual_backprop"]['localizer'],
            )
            # self.show_backprop_vis(
            #     predictions["visual_backprop"]["assessor"],
            #     dest_image,
            #     self.array_to_image(self.image.copy()),
            #     count=1
            # )
            dest_image = self.render_predictions(dest_image, predictions)

            if self.gt_bbox is not None:
                dest_image = self.draw_gt_bbox(dest_image)

            if self.render_pca and self.render_extracted_rois:
                dest_image = self.show_pca(dest_image, trainer.updater)

            self.save_image(dest_image, iteration)

    def get_feature_maps(self, predictor):
        feature_visualizations = []
        for feature_anchor in self.vis_features:
            targets = predictor
            for attr in feature_anchor:
                targets = getattr(targets, attr, None)
            if targets is not None:
                for target in targets:
                    feature_visualizations.append(self.show_feature_map(target))
        return feature_visualizations

    def get_backprop_visualization(self, predictor):
        backprop_visualizations = []
        for visanchor in self.visualization_anchors:
            vis_targets = predictor
            for target in visanchor:
                vis_targets = getattr(vis_targets, target)
            if vis_targets is not None:
                if not hasattr(vis_targets, '__iter__'):
                    vis_targets = [vis_targets]
                for vis_target in vis_targets:
                    backprop_visualizations.append(self.visual_backprop.perform_visual_backprop(vis_target))
        return backprop_visualizations

    @property
    def original_image_paste_location(self):
        return 0, 0

    def compose_image_and_visual_backprop(self, original_image, backprop_image):
        backprop_image = self.array_to_image(
            self.xp.tile(backprop_image, (3, 1, 1))
        ).resize(
            (self.image_size.width, self.image_size.height)
        )
        original_image = original_image.convert("RGBA")
        backprop_image = backprop_image.convert("RGBA")

        resulting_image = Image.blend(original_image, backprop_image, 0.6)
        return resulting_image

    def render_rois(self, rois, bboxes, iteration, image, backprop_vis=(), feature_vis=()):
        image = self.array_to_image(image)

        num_timesteps = self.get_num_timesteps(bboxes)
        bboxes, dest_image = self.set_output_sizes(backprop_vis, feature_vis, bboxes, image, num_timesteps, len(rois))

        if self.render_extracted_rois:
            self.render_extracted_regions(dest_image, image, rois)

        if len(backprop_vis) != 0 and self.show_backprop_and_feature_vis:
            # if we have a backprop visualization we can show it now
            self.show_backprop_vis(backprop_vis, dest_image, image)

        if self.show_visual_backprop_overlay and len(backprop_vis) != 0:
            backprop_image_to_show = backprop_vis[self.visual_backprop_index][0]
            image = self.compose_image_and_visual_backprop(image, backprop_image_to_show)

        if len(feature_vis) != 0 and self.show_backprop_and_feature_vis:
            self.show_backprop_vis(feature_vis, dest_image, image, image.height)

        self.draw_bboxes(bboxes, image)
        dest_image.paste(image, self.original_image_paste_location)

        return dest_image

    def sort_rois_and_scores(self, rois, scores, assessor, roi_visual_backprop=None):
        # sort rois based on the scores obtained from the assessor, but only sort using the first prediction
        sort_scores = assessor.extract_iou_prediction(scores)
        sort_scores = sort_scores.data.copy()
        score_indices = sort_scores.argsort()[::-1]
        rois = rois[score_indices]
        scores = scores[score_indices]

        if roi_visual_backprop is not None:
            roi_visual_backprop = roi_visual_backprop[score_indices]
            return rois, scores, roi_visual_backprop

        return rois, scores

    def save_image(self, dest_image, iteration):
        dest_image.save("{}.png".format(os.path.join(self.out_dir, str(iteration))), 'png')
        if self.send_bboxes:
            self.send_image(dest_image)

    def get_num_timesteps(self, bboxes):
        return bboxes.shape[0]

    def set_output_sizes(self, backprop_vis, feature_vis, bboxes, image, num_timesteps, num_rois_to_render):
        _, num_channels, height, width = bboxes.shape
        image_height = image.height if (len(backprop_vis) == 0 or not self.show_backprop_and_feature_vis) and not self.render_pca else image.height + self.image_size.height
        image_height = image_height + self.image_size.height if len(feature_vis) > 0 and self.show_backprop_and_feature_vis else image_height
        image_width = image.width + image.width * num_rois_to_render if self.render_extracted_rois else image.width
        dest_image = Image.new("RGBA", (image_width, image_height), color='black')
        bboxes = F.reshape(bboxes, (num_timesteps, 1, num_channels, height, width))
        return bboxes, dest_image

    def show_backprop_vis(self, visualizations, dest_image, image, height_offset=0, count=0):
        for visualization in visualizations:
            for vis in visualization:
                backprop_image = self.array_to_image(self.xp.tile(vis, (3, 1, 1))).resize(
                    (self.image_size.width, self.image_size.height))
                dest_image.paste(backprop_image, (count * backprop_image.width, height_offset + image.height))
                count += 1

    def show_feature_map(self, feature_map):
        with chainer.no_backprop_mode():
            averaged_feature_map = F.average(feature_map, axis=1, keepdims=True)[0]
            averaged_feature_map -= averaged_feature_map.data.min()
            max_value = averaged_feature_map.data.max()
            if max_value > 0:
                averaged_feature_map /= max_value
        return averaged_feature_map[None, ...].data

    def show_pca(self, dest_image, updater):
        colors = ['navy', 'turquoise', 'darkorange']
        if getattr(updater, 'pca', None) is None:
            return dest_image
        pca_discriminator = updater.pca.reshape(3, -1, updater.n_components_pca)

        plt.figure()
        for i, color in enumerate(colors):
            plt.scatter(pca_discriminator[i, :, 0], pca_discriminator[i, :, 1], color=color, lw=2)
        plt.legend(['fake', 'real', 'anchor'])

        canvas = plt.get_current_fig_manager().canvas
        canvas.draw()
        image = Image.frombytes('RGB', canvas.get_width_height(), canvas.tostring_rgb())
        image = image.resize((self.image_size.width, self.image_size.height), Image.LANCZOS)
        dest_image.paste(image, (self.image_size.width, self.image_size.height))
        plt.close()
        return dest_image

    def render_extracted_regions(self, dest_image, image, rois):
        num_rois, num_channels, height, width = rois.shape
        if num_rois == 0:
            return
        rois = rois.reshape(len(rois), -1, num_channels, height, width)

        for i, roi in enumerate(rois, start=1):
            roi_image = self.variable_to_image(roi[0])
            paste_location = i * image.width, 0
            dest_image.paste(roi_image.resize((self.image_size.width, self.image_size.height)), paste_location)

    def draw_bboxes(self, bboxes, image):
        if len(bboxes) == 0:
            return
        draw = ImageDraw.Draw(image)
        for i, sub_box in enumerate(F.separate(bboxes, axis=1)):
            for bbox, colour in zip(F.separate(sub_box, axis=0), self.colours()):
                # map the sampling grid from [-1, 1] to pixel coordinates
                bbox.data[...] = (bbox.data[...] + 1) / 2
                bbox.data[0, :] *= self.image_size.width
                bbox.data[1, :] *= self.image_size.height

                x = self.xp.clip(bbox.data[0, :].reshape(self.out_size), 0, self.image_size.width) + i * self.image_size.width
                y = self.xp.clip(bbox.data[1, :].reshape(self.out_size), 0, self.image_size.height)

                top_left = (x[0, 0], y[0, 0])
                top_right = (x[0, -1], y[0, -1])
                bottom_left = (x[-1, 0], y[-1, 0])
                bottom_right = (x[-1, -1], y[-1, -1])

                corners = [top_left, top_right, bottom_right, bottom_left]
                self.draw_bbox(colour, corners, draw)

    def draw_bbox(self, colour, corners, draw):
        next_corners = corners[1:] + [corners[0]]
        for first_corner, next_corner in zip(corners, next_corners):
            draw.line([first_corner, next_corner], fill=colour, width=3)

    def get_discriminator_output(self, discriminator_result):
        if discriminator_result.shape[1] > 1:
            discriminator_result = F.softmax(discriminator_result, axis=1)

        results = []
        for result in discriminator_result:
            if result.shape[0] == 1:
                result = format(float(result.data), ".3f")
            else:
                result = str(int(result.data.argmax()))
            results.append(result)

        return results

    def render_discriminator_result(self, dest_image, source_image, discriminator_result):
        for i, result in enumerate(discriminator_result, start=1):
            dest_image = self.render_text(dest_image, source_image, result, i)
        return dest_image

    def render_text(self, dest_image, source_image, text, i, bottom=False):
        label_image = Image.new(dest_image.mode, dest_image.size)
        draw = ImageDraw.Draw(label_image)
        paste_width = (i + 1) * source_image.width
        text_width, text_height = draw.textsize(text, self.font)
        insert_height = source_image.height - text_height - 1 if bottom else 0
        draw.rectangle([paste_width - text_width - 1, insert_height, paste_width, insert_height + text_height], fill=(255, 255, 255, 160))
        draw.text((paste_width - text_width - 1, insert_height), text, fill='green', font=self.font)
        dest_image = Image.alpha_composite(dest_image, label_image)
        return dest_image

    def draw_gt_bbox(self, image):
        draw = ImageDraw.Draw(image)
        for bbox in self.gt_bbox:
            top_left = bbox[1], bbox[0]
            top_right = bbox[3], bbox[0]
            bottom_left = bbox[1], bbox[2]
            bottom_right = bbox[3], bbox[2]
            colour = COLOR_MAP[-1]
            self.draw_bbox(colour, [top_left, top_right, bottom_right, bottom_left], draw)
        return image
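# Runnable mini-example (inputs assumed, not from the original source) of
# get_discriminator_output above: a (N, 1) regression-style result is
# formatted as a score string, while a (N, C) result is softmax-ed and
# rendered as the argmax class index.
import numpy as np
import chainer
import chainer.functions as F

regression_result = chainer.Variable(np.array([[0.731]], dtype=np.float32))
class_result = chainer.Variable(np.array([[0.1, 2.4, 0.3]], dtype=np.float32))

for result in regression_result:
    print(format(float(result.data), ".3f"))  # -> 0.731
for result in F.softmax(class_result, axis=1):
    print(str(int(result.data.argmax())))     # -> 1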
class SheepLocalizer(Chain):

    def __init__(self, out_size, transform_rois_to_grayscale=False, train_imagenet=False):
        super().__init__()
        with self.init_scope():
            self.feature_extractor = ResNet(18, class_labels=1000 if train_imagenet else None)
            if not train_imagenet:
                self.res6 = BasicBlock(2, 512)
                self.res7 = BasicBlock(2, 512)
                self.param_predictor = L.Linear(512, 6)

                # initialize the affine transform to a slightly zoomed-in identity
                transform_bias = self.param_predictor.b.data
                transform_bias[[0, 4]] = 0.8
                transform_bias[[2, 5]] = 0
                self.param_predictor.W.data[...] = 0

        self.visual_backprop_anchors = []
        self.out_size = out_size
        self.transform_rois_to_grayscale = transform_rois_to_grayscale
        self.visual_backprop = VisualBackprop()
        self.train_imagenet = train_imagenet

    def __call__(self, images):
        self.visual_backprop_anchors.clear()
        with cuda.Device(images.data.device):
            input_images = self.prepare_images(images.copy() * 255)
            h = self.feature_extractor(input_images)

            if self.train_imagenet:
                return h

            if images.shape[-2] > 224:
                h = self.res6(h)
            if images.shape[-2] > 300:
                h = self.res7(h)

            self.visual_backprop_anchors.append(h)
            h = _global_average_pooling_2d(h)

            transform_params = self.param_predictor(h)
            transform_params = rotation_dropout(F.reshape(transform_params, (-1, 2, 3)), ratio=0.0)
            points = F.spatial_transformer_grid(transform_params, self.out_size)
            rois = F.spatial_transformer_sampler(images, points)

            if self.transform_rois_to_grayscale:
                assert rois.shape[1] == 3, "rois are not in RGB, can not convert them to grayscale"
                b, g, r = F.split_axis(rois, 3, axis=1)
                rois = 0.299 * r + 0.587 * g + 0.114 * b

            return rois, points

    def prepare_images(self, images):
        if self.xp != np:
            device = images.data.device
            images = F.copy(images, -1)

        converted_images = [resnet.prepare(image.data, size=None) for image in F.separate(images, axis=0)]
        converted_images = F.stack(converted_images, axis=0)

        if self.xp != np:
            converted_images = F.copy(converted_images, device.id)
        return converted_images

    def extract_corners(self, bboxes):
        top = bboxes[:, 1, 0, 0]
        left = bboxes[:, 0, 0, 0]
        bottom = bboxes[:, 1, -1, -1]
        right = bboxes[:, 0, -1, -1]
        corners = F.stack([top, left, bottom, right], axis=1)
        return corners

    def scale_bboxes(self, bboxes, image_size):
        bboxes = (bboxes + 1) / 2
        bboxes.data[:, ::2] *= image_size.height
        bboxes.data[:, 1::2] *= image_size.width
        return bboxes

    def predict(self, images, return_visual_backprop=False):
        with cuda.Device(self._device_id):
            images = [self.xp.array(image) for image in images]
            images = self.xp.stack(images, axis=0)

            with chainer.using_config('train', False):
                rois, bboxes = self(images)

            if return_visual_backprop:
                if not hasattr(self, 'visual_backprop'):
                    self.visual_backprop = VisualBackprop()
                visual_backprop = cuda.to_cpu(
                    self.visual_backprop.perform_visual_backprop(self.visual_backprop_anchors[0])
                )
            else:
                visual_backprop = None

            bboxes = self.extract_corners(bboxes)
            bboxes = self.scale_bboxes(bboxes, Size._make(images.shape[-2:]))
            bboxes = [cuda.to_cpu(bbox).reshape(1, -1) for bbox in bboxes.data]

        return bboxes, rois, np.ones((len(bboxes), 1)), visual_backprop
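# Standalone numpy sketch (values assumed, not from the original source) of
# extract_corners / scale_bboxes above: the sampling grid is (batch, 2, H, W)
# with x/y in [-1, 1]; the box is read off as (top, left, bottom, right) from
# the first and last grid points and then rescaled to pixel coordinates.
import numpy as np

grid = np.random.uniform(-1, 1, size=(1, 2, 8, 8)).astype(np.float32)
top, left = grid[:, 1, 0, 0], grid[:, 0, 0, 0]
bottom, right = grid[:, 1, -1, -1], grid[:, 0, -1, -1]
corners = np.stack([top, left, bottom, right], axis=1)

corners = (corners + 1) / 2
corners[:, ::2] *= 100   # top/bottom scaled by a hypothetical image height
corners[:, 1::2] *= 150  # left/right scaled by a hypothetical image width
print(corners.reshape(1, -1))  # one (top, left, bottom, right) box in pixels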