def __call__(self, images):
    self.visual_backprop_anchors.clear()

    h = self.feature_extractor(images)
    self.visual_backprop_anchors.append(h)
    batch_size = len(h)

    transform_params = self.get_transform_params(h)
    boxes = F.spatial_transformer_grid(transform_params, self.out_size)

    # tile each input image once per predicted bbox so that every bbox can be
    # sampled from its own copy of the image
    expanded_images = F.broadcast_to(
        F.expand_dims(images, axis=1),
        (batch_size, self.num_bboxes_to_localize) + images.shape[1:]
    )
    expanded_images = F.reshape(expanded_images, (-1,) + expanded_images.shape[2:])
    rois = F.spatial_transformer_sampler(expanded_images, boxes)

    rois = F.reshape(rois, (batch_size, self.num_bboxes_to_localize, images.shape[1], self.out_size.height, self.out_size.width))
    boxes = F.reshape(boxes, (batch_size, self.num_bboxes_to_localize, 2, self.out_size.height, self.out_size.width))

    # return shapes:
    # 1. batch_size, num_bboxes, num_channels, (out-)height, (out-)width
    # 2. batch_size, num_bboxes, 2, (out-)height, (out-)width
    return rois, boxes
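# A minimal, self-contained sketch (not from the original source) of the
# F.spatial_transformer_grid / F.spatial_transformer_sampler pair used above:
# with an identity 2x3 affine theta, the sampler reproduces the input image,
# which also documents the expected shapes of the grid and the output.
import numpy as np
import chainer.functions as F

def identity_sampling_demo():
    batch_size, channels, height, width = 2, 3, 8, 8
    images = np.random.rand(batch_size, channels, height, width).astype(np.float32)

    # one identity affine matrix per batch element
    theta = np.tile(np.array([[1., 0., 0.], [0., 1., 0.]], dtype=np.float32),
                    (batch_size, 1, 1))

    grid = F.spatial_transformer_grid(theta, (height, width))  # (B, 2, H, W)
    sampled = F.spatial_transformer_sampler(images, grid)      # (B, C, H, W)
    assert sampled.shape == images.shape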
def __call__(self, images, localizations):
    points = F.spatial_transformer_grid(localizations, self.target_shape)
    rois = F.spatial_transformer_sampler(images, points)

    # h = self.data_bn(rois)
    h = F.relu(self.bn0(self.conv0(rois)))
    h = F.average_pooling_2d(h, 2, stride=2)

    h = self.rs1(h)
    h = self.rs2(h)
    h = F.max_pooling_2d(h, 2, stride=2)

    h = self.rs3(h)
    self.vis_anchor = h

    h = F.average_pooling_2d(h, 5, stride=1)
    h = F.relu(self.fc1(h))

    # for each timestep of the localization net do the 'classification'
    h = F.reshape(h, (self.num_timesteps * 2 + 1, -1, self.fc1.out_size))
    overall_predictions = []
    for timestep in F.separate(h, axis=0):
        # go 2x num_labels plus 1 timesteps because of ctc loss
        lstm_predictions = []
        self.lstm.reset_state()
        for _ in range(self.num_labels):
            lstm_prediction = self.lstm(timestep)
            classified = self.classifier(lstm_prediction)
            lstm_predictions.append(classified)
        overall_predictions.append(lstm_predictions)

    return overall_predictions, rois, points
def crop_from_image(self, image, output_size=None, use_spatial_transformer=True):
    if output_size is None:
        output_size = (self.width, self.height)

    if use_spatial_transformer:
        image_array = np.asarray(image).transpose(2, 0, 1).astype(np.float32)
        crop_transform = self.get_affine_transform_params(image.size).astype(np.float32)
        # output_size is (width, height), but spatial_transformer_grid expects (height, width)
        transform_grid = spatial_transformer_grid(crop_transform[np.newaxis, ...], (output_size[1], output_size[0]))
        cropped_image = spatial_transformer_sampler(image_array[np.newaxis, ...], transform_grid).data[0]
        cropped_image = cropped_image.astype(np.uint8)
        cropped_image = Image.fromarray(cropped_image.transpose(1, 2, 0))
    else:
        cropped_image = image.crop(self.to_aabb())
        cropped_image = cropped_image.resize(output_size, Image.BILINEAR)
    return cropped_image
def __call__(self, images):
    self.visual_backprop_anchors.clear()

    with cuda.Device(images.data.device):
        input_images = self.prepare_images(images.copy() * 255)
    h = self.feature_extractor(input_images)
    if self.train_imagenet:
        return h

    # extra residual stages for larger input resolutions
    if images.shape[-2] > 224:
        h = self.res6(h)
    if images.shape[-2] > 300:
        h = self.res7(h)
    self.visual_backprop_anchors.append(h)

    h = _global_average_pooling_2d(h)
    transform_params = self.param_predictor(h)
    transform_params = rotation_dropout(F.reshape(transform_params, (-1, 2, 3)), ratio=0.0)
    points = F.spatial_transformer_grid(transform_params, self.out_size)
    rois = F.spatial_transformer_sampler(images, points)

    if self.transform_rois_to_grayscale:
        assert rois.shape[1] == 3, "rois are not in RGB, cannot convert them to grayscale"
        b, g, r = F.split_axis(rois, 3, axis=1)
        rois = 0.299 * r + 0.587 * g + 0.114 * b

    return rois, points
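# Hedged note on the grayscale branch above: 0.299 / 0.587 / 0.114 are the
# ITU-R BT.601 luma coefficients. A standalone sketch of the same conversion
# (random data; the b/g/r names mirror the channel order assumed above):
import numpy as np
import chainer.functions as F

rois = np.random.rand(2, 3, 16, 16).astype(np.float32)
b, g, r = F.split_axis(rois, 3, axis=1)
gray = 0.299 * r + 0.587 * g + 0.114 * b  # shape (2, 1, 16, 16)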
def apply_transform_params(self, image, transform_params):
    # tile the single image so that every transform gets its own copy
    image = self.xp.tile(image[np.newaxis, ...], (len(transform_params), 1, 1, 1))
    transform_grid = spatial_transformer_grid(transform_params, self.image_size)
    cropped_image = spatial_transformer_sampler(image, transform_grid).array
    return cropped_image
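# Hedged usage sketch for apply_transform_params (standalone and CPU-only;
# the scale-0.5 theta is an illustrative choice, not from the source): an
# affine matrix with diagonal 0.5 samples the central half of the image, so
# one sampler call performs a center crop plus resize for all four transforms.
import numpy as np
from chainer.functions import spatial_transformer_grid, spatial_transformer_sampler

image = np.random.rand(3, 32, 32).astype(np.float32)  # C, H, W
transform_params = np.tile(
    np.array([[0.5, 0., 0.], [0., 0.5, 0.]], dtype=np.float32), (4, 1, 1))

tiled = np.tile(image[np.newaxis, ...], (len(transform_params), 1, 1, 1))
grid = spatial_transformer_grid(transform_params, (16, 16))
crops = spatial_transformer_sampler(tiled, grid).array  # (4, 3, 16, 16)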
def __call__(self, images, localizations):
    points = F.spatial_transformer_grid(localizations, self.target_shape)
    rois = F.spatial_transformer_sampler(images, points)

    h = self.bn0(self.conv0(rois))
    h = F.average_pooling_2d(F.relu(h), 2, stride=2)

    h = self.rs1(h)
    h = self.rs2(h)
    h = F.max_pooling_2d(h, 2, stride=2)

    h = self.rs3(h)
    self.vis_anchor = h

    h = F.average_pooling_2d(h, 5, stride=1)

    if self.uses_original_data:
        # merge data of all 4 individual images in channel dimension
        batch_size, num_channels, height, width = h.shape
        h = F.reshape(h, (batch_size // 4, 4 * num_channels, height, width))

    h = F.relu(self.fc1(h))

    # for each timestep of the localization net do the 'classification'
    h = F.reshape(h, (self.num_timesteps, -1, self.fc1.out_size))
    overall_predictions = []
    for timestep in F.separate(h, axis=0):
        # go 2x num_labels plus 1 timesteps because of ctc loss
        lstm_predictions = []
        self.lstm.reset_state()
        if self.use_blstm:
            self.blstm.reset_state()

        for _ in range(self.num_labels):
            lstm_prediction = self.lstm(timestep)
            lstm_predictions.append(lstm_prediction)

        if self.use_blstm:
            blstm_predictions = []
            for lstm_prediction in reversed(lstm_predictions):
                blstm_prediction = self.blstm(lstm_prediction)
                blstm_predictions.append(blstm_prediction)
            lstm_predictions = reversed(blstm_predictions)

        final_lstm_predictions = []
        for lstm_prediction in lstm_predictions:
            classified = self.classifier(lstm_prediction)
            final_lstm_predictions.append(F.expand_dims(classified, axis=0))

        final_lstm_predictions = F.concat(final_lstm_predictions, axis=0)
        overall_predictions.append(final_lstm_predictions)

    return overall_predictions, rois, points
def __call__(self, images, localizations):
    points = F.spatial_transformer_grid(localizations, self.target_shape)
    rois = F.spatial_transformer_sampler(images, points)

    h = self.data_bn(rois)
    h = F.relu(self.bn0(self.conv0(h)))
    h = F.average_pooling_2d(h, 2, stride=2)

    h = self.rs1(h)
    h = self.rs2(h)
    h = F.max_pooling_2d(h, 2, stride=2)

    h = self.rs3(h)
    self.vis_anchor = h

    h = F.average_pooling_2d(h, 5, stride=1)
    h = F.relu(self.fc1(h))

    # for each timestep of the localization net do the 'classification'
    h = F.reshape(h, (self.num_timesteps, -1, self.fc1.out_size))
    overall_predictions = []
    for timestep in F.separate(h, axis=0):
        lstm_predictions = []
        self.lstm.reset_state()
        if self.use_blstm:
            self.blstm.reset_state()

        for _ in range(self.num_labels):
            lstm_prediction = self.lstm(timestep)
            lstm_predictions.append(lstm_prediction)

        if self.use_blstm:
            blstm_predictions = []
            for lstm_prediction in reversed(lstm_predictions):
                blstm_prediction = self.blstm(lstm_prediction)
                blstm_predictions.append(blstm_prediction)
            lstm_predictions = reversed(blstm_predictions)

        final_lstm_predictions = []
        for lstm_prediction in lstm_predictions:
            classified = self.classifier(lstm_prediction)
            final_lstm_predictions.append(F.expand_dims(classified, axis=1))

        final_lstm_predictions = F.concat(final_lstm_predictions, axis=1)
        overall_predictions.append(final_lstm_predictions)

    return overall_predictions, rois, points
def check_forward(self, theta, output_shape):
    grid = functions.spatial_transformer_grid(theta, output_shape).data
    theta = cuda.to_cpu(theta)

    B = theta.shape[0]
    H, W = output_shape

    expected = []
    for b in range(B):
        for i in numpy.linspace(-1., 1., H):
            for j in numpy.linspace(-1., 1., W):
                coord = numpy.array([j, i, 1])
                expected.append(self.theta[b].dot(coord))
    expected = numpy.array(expected).reshape(B, H, W, 2).transpose(0, 3, 1, 2)
    testing.assert_allclose(grid, expected)
    self.assertEqual(grid.dtype, theta.dtype)
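# Worked example of the convention the test above verifies (a sketch using an
# identity theta): grid[b, :, i, j] == theta[b] @ [x_j, y_i, 1], where x_j and
# y_i run over numpy.linspace(-1, 1, W) and numpy.linspace(-1, 1, H), and grid
# channel 0 holds x while channel 1 holds y.
import numpy
from chainer import functions

B, H, W = 1, 2, 3
theta = numpy.tile(numpy.array([[1., 0., 0.], [0., 1., 0.]], dtype=numpy.float32), (B, 1, 1))
grid = functions.spatial_transformer_grid(theta, (H, W)).data

ys, xs = numpy.meshgrid(numpy.linspace(-1., 1., H), numpy.linspace(-1., 1., W), indexing='ij')
expected = numpy.stack([xs, ys]).astype(numpy.float32)[numpy.newaxis]  # identity warp
numpy.testing.assert_allclose(grid, expected, rtol=1e-5)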
def do_transformation_param_refinement_step(self, images, transformation_params):
    transformation_params = self.remove_homogeneous_coordinates(transformation_params)
    points = F.spatial_transformer_grid(transformation_params, self.target_shape)
    rois = F.spatial_transformer_sampler(images, points)

    # rerun parts of the feature extraction to produce a refined version of the transformation params
    h = self.bn0_1(self.conv0_1(rois))
    h = F.average_pooling_2d(F.relu(h), 2, stride=2)

    h = self.rs4(h)
    h = F.max_pooling_2d(h, 2, stride=2)

    h = self.rs5(h)
    h = F.max_pooling_2d(h, 2, stride=2)

    transformation_params = self.refinement_transform(h)
    transformation_params = F.reshape(transformation_params, (-1, 2, 3))
    transformation_params = rotation_dropout(transformation_params, ratio=self.dropout_ratio)
    return transformation_params
def __call__(self, encs, hiddens, batch_size, prev_image, num_masks, color_channels):
    """Learn through StatelessSTP.

    Args:
        encs: An array of computed transformations
        hiddens: An array of hidden layers
        batch_size: Size of mini batches
        prev_image: The image to transform
        num_masks: Number of masks to apply
        color_channels: Output color channels
    Returns:
        transformed: A list of masks to apply to the previous image
    """
    logger = logging.getLogger(__name__)
    enc0, enc1, enc2, enc3, enc4, enc5, enc6 = encs
    hidden1, hidden2, hidden3, hidden4, hidden5, hidden6, hidden7 = hiddens
    xp = chainer.cuda.get_array_module(enc6.data)

    # STP specific
    enc7 = self.enc7(enc6)
    transformed = [F.sigmoid(enc7)]

    stp_input0 = F.reshape(hidden5, (int(batch_size), -1))
    stp_input1 = F.relu(self.stp_input(stp_input0))

    # flat 2x3 identity affine matrix, repeated once per batch element
    identity_params = np.array([[1.0, 0.0, 0.0, 0.0, 1.0, 0.0]], dtype=np.float32)
    identity_params = np.repeat(identity_params, int(batch_size), axis=0)
    identity_params = variable.Variable(xp.array(identity_params))

    stp_transformations = []
    for _ in range(num_masks - 1):
        # the network predicts a residual on top of the identity transform
        params = self.identity_params(stp_input1) + identity_params
        params = F.reshape(params, (int(params.shape[0]), 2, 3))
        grid = F.spatial_transformer_grid(params, (prev_image.shape[2], prev_image.shape[3]))
        trans = F.spatial_transformer_sampler(prev_image, grid)
        stp_transformations.append(trans)

    transformed += stp_transformations
    return transformed, enc7
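# Sketch of the residual-on-identity parameterization used above (the zeros
# stand in for the network output self.identity_params(stp_input1); shapes are
# illustrative): adding the flat identity [1, 0, 0, 0, 1, 0] means an untrained
# predictor starts as the identity warp and samples prev_image unchanged.
import numpy as np

predicted = np.zeros((4, 6), dtype=np.float32)  # stand-in for the net output
identity = np.tile(np.array([[1., 0., 0., 0., 1., 0.]], dtype=np.float32), (4, 1))
params = (predicted + identity).reshape(-1, 2, 3)  # (B, 2, 3) affine matrices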
def __call__(self, images, localizations):
    self.lstm.reset_state()
    if self.use_blstm:
        self.blstm.reset_state()

    points = [
        F.spatial_transformer_grid(localization, self.target_shape)
        for localization in localizations
    ]
    rois = [
        F.spatial_transformer_sampler(images, point)
        for point in points
    ]

    h = F.relu(self.bn0(self.conv0(rois[-1])))
    h = F.average_pooling_2d(h, 2, stride=2)

    h = self.rs1(h)
    h = self.rs2(h)
    h = F.max_pooling_2d(h, 2, stride=2)

    h = self.rs3(h)
    self.vis_anchor = h

    h = F.average_pooling_2d(h, 5, stride=1)
    h = F.relu(self.fc1(h))

    # each timestep of the localization contains one character prediction that needs to be classified
    overall_predictions = []
    h = F.reshape(h, (self.num_rois, -1, self.fc1.out_size))
    for timestep in F.separate(h, axis=0):
        lstm_state = self.lstm(timestep)
        prediction = self.classifier(lstm_state)
        overall_predictions.append(prediction)

    return overall_predictions, rois, points
def f(theta):
    return functions.spatial_transformer_grid(theta, output_shape)
def __call__(self, x):
    theta = self.affine_matrix(x)
    self.grid = F.spatial_transformer_grid(theta, x.shape[2:])
    return F.spatial_transformer_sampler(x, self.grid)
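# A minimal, self-contained spatial-transformer module in the same spirit as
# the method above (the class name SimpleSTN and the zero/identity
# initialization are assumptions, not from the original source): a single
# linear layer predicts theta, initialized so the module starts as a no-op warp.
import numpy as np
import chainer
import chainer.functions as F
import chainer.links as L

class SimpleSTN(chainer.Chain):
    def __init__(self):
        super().__init__()
        with self.init_scope():
            # weights start at zero and the bias at the flat identity affine
            # matrix, so the initial grid is the identity mapping
            self.affine_matrix = L.Linear(
                None, 6, initialW=0,
                initial_bias=np.array([1, 0, 0, 0, 1, 0], dtype=np.float32))

    def __call__(self, x):
        theta = F.reshape(self.affine_matrix(x), (-1, 2, 3))
        grid = F.spatial_transformer_grid(theta, x.shape[2:])
        return F.spatial_transformer_sampler(x, grid)

x = np.random.rand(2, 1, 28, 28).astype(np.float32)
y = SimpleSTN()(x)  # same shape as x: (2, 1, 28, 28)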