def __call__(self, images):
    self.visual_backprop_anchors.clear()
    h = self.feature_extractor(images)
    self.visual_backprop_anchors.append(h)
    batch_size = len(h)

    transform_params = self.get_transform_params(h)
    boxes = F.spatial_transformer_grid(transform_params, self.out_size)

    # tile each input image once per predicted box so that all crops can be
    # sampled in a single sampler call
    expanded_images = F.broadcast_to(
        F.expand_dims(images, axis=1),
        (batch_size, self.num_bboxes_to_localize) + images.shape[1:])
    expanded_images = F.reshape(expanded_images, (-1,) + expanded_images.shape[2:])
    rois = F.spatial_transformer_sampler(expanded_images, boxes)

    rois = F.reshape(rois, (batch_size, self.num_bboxes_to_localize,
                            images.shape[1], self.out_size.height, self.out_size.width))
    boxes = F.reshape(boxes, (batch_size, self.num_bboxes_to_localize,
                              2, self.out_size.height, self.out_size.width))

    # return shapes:
    # 1. batch_size, num_bboxes, num_channels, (out-)height, (out-)width
    # 2. batch_size, num_bboxes, 2, (out-)height, (out-)width
    return rois, boxes
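# A minimal sketch (shapes are made up) of the grid/sampler contract used
# above: F.spatial_transformer_grid turns (n, 2, 3) affine parameters into an
# (n, 2, out_h, out_w) sampling grid in normalized [-1, 1] coordinates, and
# F.spatial_transformer_sampler bilinearly samples the input at that grid.
import numpy as np
import chainer.functions as F

n, c, in_h, in_w = 2, 3, 32, 32
out_h, out_w = 8, 8
images = np.random.rand(n, c, in_h, in_w).astype(np.float32)
theta = np.tile(np.array([[1, 0, 0], [0, 1, 0]], dtype=np.float32), (n, 1, 1))
grid = F.spatial_transformer_grid(theta, (out_h, out_w))  # (n, 2, out_h, out_w)
rois = F.spatial_transformer_sampler(images, grid)        # (n, c, out_h, out_w)
assert rois.shape == (n, c, out_h, out_w)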
def __call__(self, images, localizations):
    points = F.spatial_transformer_grid(localizations, self.target_shape)
    rois = F.spatial_transformer_sampler(images, points)

    # h = self.data_bn(rois)
    h = F.relu(self.bn0(self.conv0(rois)))
    h = F.average_pooling_2d(h, 2, stride=2)

    h = self.rs1(h)
    h = self.rs2(h)
    h = F.max_pooling_2d(h, 2, stride=2)

    h = self.rs3(h)
    self.vis_anchor = h

    h = F.average_pooling_2d(h, 5, stride=1)
    h = F.relu(self.fc1(h))

    # for each timestep of the localization net do the 'classification';
    # go over 2 * num_timesteps + 1 timesteps, as required by the CTC loss
    h = F.reshape(h, (self.num_timesteps * 2 + 1, -1, self.fc1.out_size))

    overall_predictions = []
    for timestep in F.separate(h, axis=0):
        lstm_predictions = []
        self.lstm.reset_state()
        for _ in range(self.num_labels):
            lstm_prediction = self.lstm(timestep)
            classified = self.classifier(lstm_prediction)
            lstm_predictions.append(classified)
        overall_predictions.append(lstm_predictions)

    return overall_predictions, rois, points
def crop_from_image(self, image, output_size=None, use_spatial_transformer=True):
    if output_size is None:
        output_size = (self.width, self.height)

    if use_spatial_transformer:
        image_array = np.asarray(image).transpose(2, 0, 1).astype(np.float32)
        crop_transform = self.get_affine_transform_params(image.size).astype(np.float32)
        transform_grid = spatial_transformer_grid(
            crop_transform[np.newaxis, ...], (output_size[1], output_size[0]))
        cropped_image = spatial_transformer_sampler(
            image_array[np.newaxis, ...], transform_grid).data[0]
        cropped_image = cropped_image.astype(np.uint8)
        cropped_image = Image.fromarray(cropped_image.transpose(1, 2, 0))
    else:
        cropped_image = image.crop(self.to_aabb())
        cropped_image = cropped_image.resize(output_size, Image.BILINEAR)

    return cropped_image
def __call__(self, images):
    self.visual_backprop_anchors.clear()

    with cuda.Device(images.data.device):
        input_images = self.prepare_images(images.copy() * 255)
    h = self.feature_extractor(input_images)
    if self.train_imagenet:
        return h

    if images.shape[-2] > 224:
        h = self.res6(h)
    if images.shape[-2] > 300:
        h = self.res7(h)
    self.visual_backprop_anchors.append(h)

    h = _global_average_pooling_2d(h)
    transform_params = self.param_predictor(h)
    transform_params = rotation_dropout(F.reshape(transform_params, (-1, 2, 3)), ratio=0.0)
    points = F.spatial_transformer_grid(transform_params, self.out_size)
    rois = F.spatial_transformer_sampler(images, points)

    if self.transform_rois_to_grayscale:
        assert rois.shape[1] == 3, "rois do not have three channels, can not convert them to grayscale"
        # note: the split order assumes the channels are laid out as BGR
        b, g, r = F.split_axis(rois, 3, axis=1)
        rois = 0.299 * r + 0.587 * g + 0.114 * b

    return rois, points
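# A quick sanity sketch (dummy data, my own) for the luminance conversion
# above; the split order b, g, r only gives the usual 0.299R + 0.587G + 0.114B
# weighting if the ROI channels are indeed BGR:
import numpy as np
import chainer.functions as F

rois = np.random.rand(1, 3, 4, 4).astype(np.float32)
b, g, r = F.split_axis(rois, 3, axis=1)
gray = 0.299 * r + 0.587 * g + 0.114 * b
assert gray.shape == (1, 1, 4, 4)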
def _apply_backward(self, x, grid, grads, use_cudnn):
    x = Variable(x)
    grid = Variable(grid)
    y = functions.spatial_transformer_sampler(x, grid, use_cudnn=use_cudnn)
    x.zerograd()
    grid.zerograd()
    y.grad = grads
    y.backward()
    return x, grid, y
def apply_transform_params(self, image, transform_params):
    # tile the single image once per transform, then sample all crops at once
    image = self.xp.tile(image[np.newaxis, ...], (len(transform_params), 1, 1, 1))
    transform_grid = spatial_transformer_grid(transform_params, self.image_size)
    cropped_image = spatial_transformer_sampler(image, transform_grid).array
    return cropped_image
def _apply_backward(self, x, grid, grads):
    x = Variable(x)
    grid = Variable(grid)
    y = functions.spatial_transformer_sampler(x, grid)
    x.cleargrad()
    grid.cleargrad()
    y.grad = grads
    y.backward()
    return x, grid, y
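# The helpers above wrap the same pattern; a self-contained sketch (with
# made-up shapes) of pushing a gradient back through the sampler:
import numpy as np
from chainer import Variable, functions

x = Variable(np.random.rand(1, 3, 8, 8).astype(np.float32))
grid = Variable(np.random.uniform(-1, 1, (1, 2, 5, 5)).astype(np.float32))
y = functions.spatial_transformer_sampler(x, grid)
y.grad = np.ones(y.shape, dtype=np.float32)
y.backward()
assert x.grad.shape == x.shape and grid.grad.shape == grid.shape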
def __call__(self, images, localizations):
    points = F.spatial_transformer_grid(localizations, self.target_shape)
    rois = F.spatial_transformer_sampler(images, points)

    h = self.bn0(self.conv0(rois))
    h = F.average_pooling_2d(F.relu(h), 2, stride=2)

    h = self.rs1(h)
    h = self.rs2(h)
    h = F.max_pooling_2d(h, 2, stride=2)

    h = self.rs3(h)
    self.vis_anchor = h
    h = F.average_pooling_2d(h, 5, stride=1)

    if self.uses_original_data:
        # merge the data of all 4 individual images in the channel dimension
        batch_size, num_channels, height, width = h.shape
        h = F.reshape(h, (batch_size // 4, 4 * num_channels, height, width))

    h = F.relu(self.fc1(h))

    # for each timestep of the localization net do the 'classification'
    h = F.reshape(h, (self.num_timesteps, -1, self.fc1.out_size))

    overall_predictions = []
    for timestep in F.separate(h, axis=0):
        # go 2 * num_labels + 1 timesteps because of the CTC loss
        lstm_predictions = []
        self.lstm.reset_state()
        if self.use_blstm:
            self.blstm.reset_state()

        for _ in range(self.num_labels):
            lstm_prediction = self.lstm(timestep)
            lstm_predictions.append(lstm_prediction)

        if self.use_blstm:
            blstm_predictions = []
            for lstm_prediction in reversed(lstm_predictions):
                blstm_prediction = self.blstm(lstm_prediction)
                blstm_predictions.append(blstm_prediction)
            lstm_predictions = reversed(blstm_predictions)

        final_lstm_predictions = []
        for lstm_prediction in lstm_predictions:
            classified = self.classifier(lstm_prediction)
            final_lstm_predictions.append(F.expand_dims(classified, axis=0))

        final_lstm_predictions = F.concat(final_lstm_predictions, axis=0)
        overall_predictions.append(final_lstm_predictions)

    return overall_predictions, rois, points
def __call__(self, images, localizations):
    points = F.spatial_transformer_grid(localizations, self.target_shape)
    rois = F.spatial_transformer_sampler(images, points)

    h = self.data_bn(rois)
    h = F.relu(self.bn0(self.conv0(h)))
    h = F.average_pooling_2d(h, 2, stride=2)

    h = self.rs1(h)
    h = self.rs2(h)
    h = F.max_pooling_2d(h, 2, stride=2)

    h = self.rs3(h)
    self.vis_anchor = h

    h = F.average_pooling_2d(h, 5, stride=1)
    h = F.relu(self.fc1(h))

    # for each timestep of the localization net do the 'classification'
    h = F.reshape(h, (self.num_timesteps, -1, self.fc1.out_size))

    overall_predictions = []
    for timestep in F.separate(h, axis=0):
        lstm_predictions = []
        self.lstm.reset_state()
        if self.use_blstm:
            self.blstm.reset_state()

        for _ in range(self.num_labels):
            lstm_prediction = self.lstm(timestep)
            lstm_predictions.append(lstm_prediction)

        if self.use_blstm:
            blstm_predictions = []
            for lstm_prediction in reversed(lstm_predictions):
                blstm_prediction = self.blstm(lstm_prediction)
                blstm_predictions.append(blstm_prediction)
            lstm_predictions = reversed(blstm_predictions)

        final_lstm_predictions = []
        for lstm_prediction in lstm_predictions:
            classified = self.classifier(lstm_prediction)
            final_lstm_predictions.append(F.expand_dims(classified, axis=1))

        final_lstm_predictions = F.concat(final_lstm_predictions, axis=1)
        overall_predictions.append(final_lstm_predictions)

    return overall_predictions, rois, points
def projective_inverse_warp(imgs, depthes, poses, K):
    """
    Args:
        imgs (Variable): Source images. Shape is [N, 3, H, W].
        depthes (Variable): Predicted depth maps. Shape is [N, 3, H*W].
        poses (Variable): Predicted poses. Shape is [N, 6].
        K (array): Camera intrinsics. Shape is [N, 3, 3].

    Returns:
        Transformed images of shape [N, 3, H, W].
    """
    xp = cuda.get_array_module(imgs)
    im_shape = imgs.shape
    N, _, H, W = im_shape

    # project target-frame camera coordinates into source-frame pixels, then
    # sample the source images at those (normalized) pixel coordinates
    proj_tgt_cam_to_src_pixel = proj_tgt_to_src(poses, K, N, xp=xp, use_cpu=False)
    pixel_coords = generate_2dmeshgrid(H, W, N, xp)
    cam_coords = pixel2cam(depthes, pixel_coords, K, im_shape, xp=xp)
    src_pixel_coords = cam2pixel(cam_coords, proj_tgt_cam_to_src_pixel, im_shape, xp=xp)
    transformed_img = F.spatial_transformer_sampler(imgs, src_pixel_coords)
    return transformed_img
def do_transformation_param_refinement_step(self, images, transformation_params):
    transformation_params = self.remove_homogeneous_coordinates(transformation_params)
    points = F.spatial_transformer_grid(transformation_params, self.target_shape)
    rois = F.spatial_transformer_sampler(images, points)

    # rerun parts of the feature extraction to produce a refined version of
    # the transformation params
    h = self.bn0_1(self.conv0_1(rois))
    h = F.average_pooling_2d(F.relu(h), 2, stride=2)

    h = self.rs4(h)
    h = F.max_pooling_2d(h, 2, stride=2)

    h = self.rs5(h)
    h = F.max_pooling_2d(h, 2, stride=2)

    transformation_params = self.refinement_transform(h)
    transformation_params = F.reshape(transformation_params, (-1, 2, 3))
    transformation_params = rotation_dropout(transformation_params, ratio=self.dropout_ratio)
    return transformation_params
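# remove_homogeneous_coordinates is presumably dropping the fixed [0, 0, 1]
# row of a 3x3 homogeneous transform to obtain the (n, 2, 3) parameters that
# F.spatial_transformer_grid expects; a small sketch of that assumption:
import numpy as np

params_3x3 = np.tile(np.eye(3, dtype=np.float32), (4, 1, 1))  # (n, 3, 3)
params_2x3 = params_3x3[:, :2, :]                             # (n, 2, 3)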
def __call__(self, encs, hiddens, batch_size, prev_image, num_masks, color_channels):
    """
    Learn through StatelessSTP.

    Args:
        encs: An array of computed encoder outputs
        hiddens: An array of hidden layers
        batch_size: Size of mini batches
        prev_image: The image to transform
        num_masks: Number of masks to apply
        color_channels: Output color channels

    Returns:
        transformed: A list of transformed versions of the previous image
    """
    logger = logging.getLogger(__name__)
    enc0, enc1, enc2, enc3, enc4, enc5, enc6 = encs
    hidden1, hidden2, hidden3, hidden4, hidden5, hidden6, hidden7 = hiddens
    xp = chainer.cuda.get_array_module(enc6.data)

    # STP specific
    enc7 = self.enc7(enc6)
    transformed = [F.sigmoid(enc7)]

    stp_input0 = F.reshape(hidden5, (int(batch_size), -1))
    stp_input1 = self.stp_input(stp_input0)
    stp_input1 = F.relu(stp_input1)

    # the network predicts residuals on top of the identity transform
    identity_params = np.array([[1.0, 0.0, 0.0, 0.0, 1.0, 0.0]], dtype=np.float32)
    identity_params = np.repeat(identity_params, int(batch_size), axis=0)
    identity_params = variable.Variable(xp.array(identity_params))

    stp_transformations = []
    for i in range(num_masks - 1):
        params = self.identity_params(stp_input1)
        params = params + identity_params
        params = F.reshape(params, (int(params.shape[0]), 2, 3))
        grid = F.spatial_transformer_grid(params, (prev_image.shape[2], prev_image.shape[3]))
        trans = F.spatial_transformer_sampler(prev_image, grid)
        stp_transformations.append(trans)
    transformed += stp_transformations

    return transformed, enc7
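# The residual parameterization above can be checked in isolation: with a
# zero residual (standing in for self.identity_params(...)), adding the
# identity transform reproduces the input exactly. A minimal sketch:
import numpy as np
import chainer.functions as F

prev_image = np.random.rand(4, 3, 16, 16).astype(np.float32)
identity = np.tile(np.array([[1, 0, 0], [0, 1, 0]], np.float32), (4, 1, 1))
params = np.zeros_like(identity) + identity   # zero residual + identity
grid = F.spatial_transformer_grid(params, prev_image.shape[2:])
out = F.spatial_transformer_sampler(prev_image, grid)
np.testing.assert_allclose(out.array, prev_image, atol=1e-5)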
def warp_dense(image, ps):
    """Dense image warping.

    Args:
        image (:class:`~chainer.Variable` or :ref:`ndarray`):
            A 4-D array of shape `(B, C, H, W)`.
        ps (:class:`~chainer.Variable` or :ref:`ndarray`):
            A 4-D array of shape `(B, 2, H, W)`.
            Pixel coordinates in the source images.

    Returns:
        ~chainer.Variable: Warped image. A 4-D array of shape `(B, C, H, W)`.
    """
    xp = backend.get_array_module(image)
    B, _, H, W = image.shape
    # normalize pixel coordinates to the [-1, 1] range the sampler expects;
    # keep the divisor in the image dtype so the grid stays float32
    scale = xp.array([W - 1, H - 1], dtype=image.dtype).reshape(-1, 2, 1, 1)
    ps = 2 * ps / scale - 1
    ps = ps.reshape(B, 2, H, W)
    return F.spatial_transformer_sampler(image, ps)
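# Sanity sketch for warp_dense (my own test, not from the source): feeding the
# identity pixel grid should return the input image, since warp_dense
# normalizes pixel coordinates to [-1, 1] with channel 0 = x and channel 1 = y.
import numpy as np

B, C, H, W = 1, 3, 4, 5
image = np.random.rand(B, C, H, W).astype(np.float32)
xs, ys = np.meshgrid(np.arange(W, dtype=np.float32), np.arange(H, dtype=np.float32))
ps = np.stack([xs, ys])[np.newaxis]  # (B, 2, H, W) pixel coordinates
warped = warp_dense(image, ps)
np.testing.assert_allclose(warped.array, image, atol=1e-5)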
def check_forward(self, x, grid):
    y = functions.spatial_transformer_sampler(x, grid)
    self.assertEqual(y.shape, self.out_shape)
def check_forward(self, x, grid, expected):
    y = functions.spatial_transformer_sampler(x, grid)
    testing.assert_allclose(y.data, expected)
def check_forward(self, x, grid):
    y = functions.spatial_transformer_sampler(x, grid)
    testing.assert_allclose(y.data, self.operator(self.x))
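# A concrete instance of the convention these tests exercise (my own example):
# the normalized coordinate (-1, -1) samples the top-left input pixel and
# (1, 1) the bottom-right one, with grid channel 0 holding x and channel 1 y.
import numpy as np
import chainer.functions as F

x = np.arange(16, dtype=np.float32).reshape(1, 1, 4, 4)
grid = np.empty((1, 2, 1, 2), dtype=np.float32)
grid[0, :, 0, 0] = [-1, -1]  # top-left pixel
grid[0, :, 0, 1] = [1, 1]    # bottom-right pixel
y = F.spatial_transformer_sampler(x, grid)
np.testing.assert_allclose(y.array[0, 0, 0], [0.0, 15.0])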
def callback(msg_1):
    global i
    global j
    global xcgg
    global lcg, lhg
    global vel_res, wvel_res  # linear and angular velocity
    global img_nn_c
    global out_gonet
    global vres, wres
    global xc, yc, xyoffset

    j = j + 1
    if j == 1:
        xkg = xcgg  # previous image
        xd = np.zeros((3, 3, 128, 128), dtype=np.float32)

        # resize and crop the incoming image
        cv2_msg_img = bridge.imgmsg_to_cv2(msg_1)
        cv_imgc = bridge.cv2_to_imgmsg(cv2_msg_img, 'rgb8')
        pil_img = encode(cv_imgc)
        # mask everything outside the fisheye circle
        fg_img = PILImage.new('RGBA', pil_img.size, (0, 0, 0, 255))
        draw = ImageDraw.Draw(fg_img)
        draw.ellipse(XYc, fill=(0, 0, 0, 0))
        pil_img.paste(fg_img, (0, 0), fg_img.split()[3])
        img_msg = decode(pil_img)
        cv2_imgd = bridge.imgmsg_to_cv2(img_msg, 'rgb8')

        # crop the image
        cv_cutimg = cv2_imgd[yc - xyoffset:yc + xyoffset,
                             xc - xyoffset:xc + xyoffset]
        cv_cutimg = cv2.transpose(cv_cutimg)
        cv_cutimg = cv2.flip(cv_cutimg, 1)

        # resize the image to 3x128x128
        cv_resize1 = cv2.resize(cv_cutimg, (rsizex, rsizey))
        cv_resizex = cv_resize1.transpose(2, 0, 1)
        in_imgcc1 = np.array([cv_resizex], dtype=np.float32)

        # normalization
        in_img1 = (in_imgcc1 - 128) / 128

        for i in range(batchsize):  # usually batchsize = 1
            img_nn_c[i] = in_img1
            vresc[i][0][0][0] = vel_res
            wresc[i][0][0][0] = wvel_res

        xcg = Variable(cuda.to_gpu(img_nn_c))
        vrescg = Variable(cuda.to_gpu(vresc))
        wrescg = Variable(cuda.to_gpu(wresc))

        # design the virtual velocity from the joypad input vel_ref (linear
        # velocity) and wvel_ref (angular velocity)
        vres[0][0][0] = 0.5
        if wvel_ref == 0.0:
            wres[0][0][0] = 0.0
        elif wvel_ref != 0.0 and vel_ref == 0.0:
            wres[0][0][0] = 1.0 if wvel_ref > 0.0 else -1.0
        elif vel_ref < 0.0:
            wres[0][0][0] = 0.0
        else:
            rd = 2.5 * vel_ref / wvel_ref
            wres[0][0][0] = 0.5 / rd
            if wres[0][0][0] > 1.0:
                wres[0][0][0] = 1.0
            elif wres[0][0][0] < -1.0:
                wres[0][0][0] = -1.0

        vresg = Variable(cuda.to_gpu(vres))  # virtual linear velocity
        wresg = Variable(cuda.to_gpu(wres))  # virtual angular velocity
        fl.l_LSTM.h = Variable(lhg)  # internal state of the LSTM
        fl.l_LSTM.c = Variable(lcg)  # internal state of the LSTM

        # GONet traversability estimate for the current image
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            img_gen = gen(invg(xcg))
            dis_real = dis(xcg)
            dis_gen = dis(img_gen)
            output = fl(xcg - img_gen, dis_real - dis_gen, dis_real)
        out_gonet[0] = np.reshape(cuda.to_cpu(output.data), (batchsize))
        lhg = fl.l_LSTM.h.data
        lcg = fl.l_LSTM.c.data
        xcgg = xcg  # keep the current image for the next step

        font = ImageFont.truetype(
            "/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf", 25)
        stext = 35

        def annotate(img_chw, score):
            # append a text strip below the image showing the GONet
            # traversability probability, colored red (low) to green (high)
            imgb = np.fmin(255.0, np.fmax(0.0, img_chw * 128 + 128))
            imgc = np.reshape(imgb, (3, 128, 128))
            imgg = PILImage.new('RGB', (128, 30))
            d = ImageDraw.Draw(imgg)
            d.text((stext, 0), str(np.round(score, 2)),
                   fill=(int(255 * (1.0 - score)), int(255 * score), 0),
                   font=font)
            imgtext = decode(imgg)
            imgtextcv = bridge.imgmsg_to_cv2(imgtext, 'bgr8')
            imgtextcvx = imgtextcv.transpose(2, 0, 1)
            return np.concatenate((imgc, imgtextcvx), axis=1)

        # current input image with its traversability score
        annotated = [annotate(img_nn_c, out_gonet[0][0])]

        for k in range(4):  # 4-step prediction
            if k != 0:
                xkg = xcg
                xcg = xap

            # future prediction: warp the previous image one step forward
            with chainer.using_config('train', False), chainer.no_backprop_mode():
                if k == 0:
                    x = dec(enc(xkg), vrescg, wrescg)
                else:
                    x = dec(enc(xkg), vresg, wresg)
                genk = F.spatial_transformer_sampler(xkg, x)
            xkp = genk

            with chainer.using_config('train', False), chainer.no_backprop_mode():
                xcf = dec(enc(xcg), vresg, wresg)
                xkf = dec(enc(xkp), vresg, wresg)
                gencf = F.spatial_transformer_sampler(xcg, xcf)
                genkf = F.spatial_transformer_sampler(xkp, xkf)

            indata = F.concat((genkf, gencf), axis=1)
            maskg = Variable(cuda.to_gpu(mask_batch))

            # refine both warped images and merge them with predicted masks
            with chainer.using_config('train', False), chainer.no_backprop_mode():
                zL = encD(indata)
                xfLs = decD(zL)
            xfL = F.separate(xfLs, axis=1)
            apf0 = F.reshape(xfL[0], (batchsize, 1, 128, 128))
            apf1 = F.reshape(xfL[1], (batchsize, 1, 128, 128))
            apf2 = F.reshape(xfL[2], (batchsize, 1, 128, 128))
            apf3 = F.reshape(xfL[3], (batchsize, 1, 128, 128))
            mask_kL = F.reshape(xfL[4], (batchsize, 1, 128, 128))
            mask_cL = F.reshape(xfL[5], (batchsize, 1, 128, 128))
            apfLc = F.concat((apf0, apf1), axis=1)
            apfLk = F.concat((apf2, apf3), axis=1)
            genLcx = F.spatial_transformer_sampler(gencf, apfLc + maskg)
            genLkx = F.spatial_transformer_sampler(genkf, apfLk + maskg)
            maskL = F.concat((mask_kL, mask_cL), axis=1)
            mask_softL = F.softmax(maskL, axis=1)
            mask_sepL = F.separate(mask_softL, axis=1)
            mask_knL = F.reshape(mask_sepL[0], (batchsize, 1, 128, 128))
            mask_cnL = F.reshape(mask_sepL[1], (batchsize, 1, 128, 128))
            xap = F.scale(genLcx, mask_cnL, axis=0) + F.scale(
                genLkx, mask_knL, axis=0)  # predicted image

            # GONet traversability estimate for the predicted image
            with chainer.using_config('train', False), chainer.no_backprop_mode():
                img_gen = gen(invg(xap))
                dis_real = dis(xap)
                dis_gen = dis(img_gen)
                output = fl(xap - img_gen, dis_real - dis_gen, dis_real)
            out_gonet[k + 1] = np.reshape(cuda.to_cpu(output.data), (batchsize))

            annotated.append(annotate(cuda.to_cpu(xap.data[0]),
                                      out_gonet[k + 1][0]))

        # publish the current image and the 4 predicted images side by side,
        # each with its traversability probability
        imgcc = np.concatenate(annotated, axis=2)
        imgdd = imgcc.transpose(1, 2, 0)
        imgee = imgdd.astype(np.uint8)
        imgmm = bridge.cv2_to_imgmsg(imgee)
        image_all.publish(imgmm)
        j = 0
def __call__(self, x):
    theta = self.affine_matrix(x)
    self.grid = F.spatial_transformer_grid(theta, x.shape[2:])
    return F.spatial_transformer_sampler(x, self.grid)
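# A self-contained minimal version of the same pattern (names are my own, not
# from the original): a linear layer predicts the 2x3 affine parameters and is
# initialized to the identity transform, so training starts from a no-op warp.
import numpy as np
import chainer
import chainer.functions as F
import chainer.links as L

class MinimalSTN(chainer.Chain):
    def __init__(self, in_size):
        super().__init__()
        with self.init_scope():
            # zero weights + identity bias => identity transform at init
            self.affine_matrix = L.Linear(
                in_size, 6,
                initialW=np.zeros((6, in_size), dtype=np.float32),
                initial_bias=np.array([1, 0, 0, 0, 1, 0], dtype=np.float32))

    def __call__(self, x):
        theta = F.reshape(self.affine_matrix(x), (-1, 2, 3))
        self.grid = F.spatial_transformer_grid(theta, x.shape[2:])
        return F.spatial_transformer_sampler(x, self.grid)

x = np.random.rand(2, 1, 28, 28).astype(np.float32)
y = MinimalSTN(1 * 28 * 28)(x)  # identity at init: y equals x up to interpolation error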