def forward(self, x, targets):
    """Applies network layers and ops on input image(s) x.

    Args:
        x: input image or batch of images. Shape: [batch, 3, 300, 300].

    Return:
        Depending on phase:
        test:
            Variable(tensor) of output class label predictions,
            confidence score, and corresponding location predictions for
            each object detected. Shape: [batch, topk, 7]
        train:
            list of concat outputs from:
                1: confidence layers, Shape: [batch*num_priors, num_classes]
                2: localization layers, Shape: [batch, num_priors*4]
                3: priorbox layers, Shape: [2, num_priors*4]
    """
    sources = list()
    loc = list()
    conf = list()
    has_lp = list()
    size_lp = list()
    offset = list()

    sources_2 = list()
    loc_2 = list()
    conf_2 = list()
    four_corners_2 = list()

    # apply vgg up to conv1_1 relu
    for k in range(2):
        x = self.vgg[k](x)
        if k == 1:
            # conv1_1 feature relu
            conv1_1_feat = x

    # apply vgg up to conv4_3 relu
    for k in range(2, 23):
        x = self.vgg[k](x)
    s = self.L2Norm(x)
    sources.append(s)

    # apply vgg up to fc7
    for k in range(23, len(self.vgg)):
        x = self.vgg[k](x)
    sources.append(x)

    # apply extra layers and cache source layer outputs
    for k, v in enumerate(self.extras):
        x = F.relu(v(x), inplace=True)
        if k % 2 == 1:
            sources.append(x)

    # apply multibox head to source layers
    for (x, l, c, h, s, o) in zip(sources, self.loc, self.conf,
                                  self.has_lp, self.size_lp, self.offset):
        loc.append(l(x).permute(0, 2, 3, 1).contiguous())
        conf.append(c(x).permute(0, 2, 3, 1).contiguous())
        has_lp.append(h(x).permute(0, 2, 3, 1).contiguous())
        size_lp.append(s(x).permute(0, 2, 3, 1).contiguous())
        offset.append(o(x).permute(0, 2, 3, 1).contiguous())

    loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
    conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)
    has_lp = torch.cat([o.view(o.size(0), -1) for o in has_lp], 1)
    size_lp = torch.cat([o.view(o.size(0), -1) for o in size_lp], 1)
    offset = torch.cat([o.view(o.size(0), -1) for o in offset], 1)

    # [num, num_classes, top_k, 10]
    rpn_rois = self.detect(
        loc.view(loc.size(0), -1, 4),                                 # loc preds
        self.softmax(conf.view(conf.size(0), -1, self.num_classes)),  # conf preds
        self.priors.cuda(),                                           # default boxes
        self.sigmoid(has_lp.view(has_lp.size(0), -1, 1)),
        size_lp.view(size_lp.size(0), -1, 2),
        offset.view(offset.size(0), -1, 2))
    rpn_rois = rpn_rois.detach()

    # roi align or roi warping
    crop_height = self.size_2
    crop_width = self.size_2
    is_cuda = torch.cuda.is_available()

    if self.phase == 'test':
        has_lp_th = 0.5
        th = 0.6
        output = torch.zeros(1, 3, 200, 13)
        output[0, 1, :, :5] = rpn_rois[0, 1, :, :5]

        rois_idx = (rpn_rois[0, 1, :, 0] > th) & (rpn_rois[0, 1, :, 5] > has_lp_th)
        matches = rpn_rois[0, 1, rois_idx, :]
        if matches.shape[0] == 0:
            return output

        # expand the plate region inside the car from the predicted
        # offset, size and the expansion factor
        car_center = (matches[:, [1, 2]] + matches[:, [3, 4]]) / 2
        lp_center = car_center + matches[:, [8, 9]]
        lp_bbox_top_left = lp_center - matches[:, [6, 7]] / 2 * self.expand_num
        lp_bbox_bottom_right = lp_center + matches[:, [6, 7]] / 2 * self.expand_num
        lp_bbox = torch.cat((lp_bbox_top_left, lp_bbox_bottom_right), 1)

        # clamp the expanded plate region to the image ...
        lp_bbox = torch.max(lp_bbox, torch.zeros(lp_bbox.shape))
        lp_bbox = torch.min(lp_bbox, torch.ones(lp_bbox.shape))
        # ... and to the detected car box
        lp_bbox = torch.max(lp_bbox, matches[:, 1:3].repeat(1, 2))
        lp_bbox = torch.min(lp_bbox, matches[:, 3:5].repeat(1, 2))

        # [num_car, 4]
        rois_squeeze = lp_bbox

        # Define the boxes (crops)
        # box = [y1/height, x1/width, y2/height, x2/width]
        boxes_data = torch.zeros(rois_squeeze.shape)
        boxes_data[:, 0] = rois_squeeze[:, 1]
        boxes_data[:, 1] = rois_squeeze[:, 0]
        boxes_data[:, 2] = rois_squeeze[:, 3]
        boxes_data[:, 3] = rois_squeeze[:, 2]
        # Create an index to indicate which box crops which image
        box_index_data = torch.IntTensor(range(boxes_data.shape[0]))
        image_data = conv1_1_feat.repeat(rois_squeeze.shape[0], 1, 1, 1)

        # Convert to Variables
        image_torch = to_varabile(image_data, is_cuda=is_cuda, requires_grad=False)
        boxes = to_varabile(boxes_data, is_cuda=is_cuda, requires_grad=False)
        box_index = to_varabile(box_index_data, is_cuda=is_cuda, requires_grad=False)

        # Crops and resize bbox1 from img1 and bbox2 from img2
        # n*64*crop_height*crop_width
        crops_torch = CropAndResizeFunction.apply(image_torch, boxes, box_index,
                                                  crop_height, crop_width, 0)

        # second network
        x_2 = crops_torch
        for k in range(4):
            x_2 = self.vgg_2[k](x_2)
        sources_2.append(x_2)
        for k in range(4, 9):
            x_2 = self.vgg_2[k](x_2)
        sources_2.append(x_2)
        for k in range(9, 14):
            x_2 = self.vgg_2[k](x_2)
        sources_2.append(x_2)

        # apply multibox head to source layers
        for (x_2, l_2, c_2, f_2) in zip(sources_2, self.loc_2, self.conf_2,
                                        self.four_corners_2):
            loc_2.append(l_2(x_2).permute(0, 2, 3, 1).contiguous())
            conf_2.append(c_2(x_2).permute(0, 2, 3, 1).contiguous())
            four_corners_2.append(f_2(x_2).permute(0, 2, 3, 1).contiguous())

        loc_2 = torch.cat([o.view(o.size(0), -1) for o in loc_2], 1)
        conf_2 = torch.cat([o.view(o.size(0), -1) for o in conf_2], 1)
        four_corners_2 = torch.cat([o.view(o.size(0), -1) for o in four_corners_2], 1)

        output_2 = self.detect_2(
            loc_2.view(loc_2.size(0), -1, 4),
            self.softmax_2(conf_2.view(conf_2.size(0), -1, self.num_classes)),
            self.priors_2.cuda(),
            four_corners_2.view(four_corners_2.size(0), -1, 8))

        output_2_pos = output_2[:, 1, 0, :]

        # map the plate predictions from roi coordinates back to the image
        rois_size = rois_squeeze[:, 2:4] - rois_squeeze[:, :2]
        rois_top_left = rois_squeeze[:, :2]
        rois_size_expand = rois_size.repeat(1, 6)
        rois_top_left_expand = rois_top_left.repeat(1, 6)
        output_2_pos[:, 1:] = output_2_pos[:, 1:] * rois_size_expand + rois_top_left_expand

        num_car = output_2_pos.shape[0]
        output[0, 2, :num_car, :] = output_2_pos
        output[0, 1, :num_car, 5:9] = lp_bbox
        output[0, 1, :num_car, 9] = 1
        return output
    else:
        print("ERROR: Phase: " + self.phase + " not recognized")
        return

    return output
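# NOTE: the snippets in this section all call a `to_varabile` helper (the
# misspelling is inherited from the RoIAlign.pytorch examples) that is never
# defined here. A minimal sketch consistent with how it is called:
import torch
from torch.autograd import Variable


def to_varabile(data, requires_grad=False, is_cuda=True):
    # move the tensor to the GPU if requested, then wrap it in a Variable
    if is_cuda:
        data = data.cuda()
    return Variable(data, requires_grad=requires_grad)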
import numpy as np
import torch.nn as nn
import tensorflow as tf

# CropAndResizeFunction, to_varabile and generate_data come from the
# RoIAlign.pytorch test utilities.


def compare_with_tf(crop_height, crop_width, is_cuda=True):
    # generate data
    image_data, boxes_data, box_index_data = generate_data(
        batch_size=2,
        depth=128,
        im_height=200,
        im_width=200,
        n_boxes=10,
        xyxy=False,
        box_normalize=True)
    # boxes_tf_data = np.stack((boxes_data[:, 1], boxes_data[:, 0],
    #                           boxes_data[:, 3], boxes_data[:, 2]), axis=1)
    # boxes_tf_data[:, 0::2] /= (image_data.shape[2] - 1.)
    # boxes_tf_data[:, 1::2] /= (image_data.shape[3] - 1.)

    # random conv layer
    conv_torch = nn.Conv2d(image_data.shape[1], 64, 3, padding=1, bias=False)
    if is_cuda:
        conv_torch = conv_torch.cuda()

    # pytorch forward
    image_torch = to_varabile(image_data, requires_grad=True, is_cuda=is_cuda)
    boxes = to_varabile(boxes_data, requires_grad=False, is_cuda=is_cuda)
    box_index = to_varabile(box_index_data, requires_grad=False, is_cuda=is_cuda)

    print('pytorch forward and backward start')
    crops_torch = CropAndResizeFunction(crop_height, crop_width, 0)(
        image_torch, boxes, box_index)
    crops_torch = conv_torch(crops_torch)
    crops_torch_data = crops_torch.data.cpu().numpy()

    # pytorch backward
    loss_torch = crops_torch.sum()
    loss_torch.backward()
    grad_torch_data = image_torch.grad.data.cpu().numpy()
    print('pytorch forward and backward end')

    # tf forward & backward
    image_tf = tf.placeholder(tf.float32, (None, None, None, None), name='image')
    boxes = tf.placeholder(tf.float32, (None, 4), name='boxes')
    box_index = tf.placeholder(tf.int32, (None,), name='box_index')

    image_t = tf.transpose(image_tf, (0, 2, 3, 1))
    crops_tf = tf.image.crop_and_resize(image_t, boxes, box_index,
                                        (crop_height, crop_width))
    conv_tf = tf.nn.conv2d(
        crops_tf,
        np.transpose(conv_torch.weight.data.cpu().numpy(), (2, 3, 1, 0)),
        [1, 1, 1, 1], padding='SAME')
    trans_tf = tf.transpose(conv_tf, (0, 3, 1, 2))
    loss_tf = tf.reduce_sum(trans_tf)
    grad_tf = tf.gradients(loss_tf, image_tf)[0]

    with tf.Session() as sess:
        crops_tf_data, grad_tf_data = sess.run(
            (trans_tf, grad_tf),
            feed_dict={image_tf: image_data,
                       boxes: boxes_data,
                       box_index: box_index_data})

    crops_diff = np.abs(crops_tf_data - crops_torch_data)
    print('forward (maxval, min_err, max_err, mean_err):',
          crops_tf_data.max(), crops_diff.min(), crops_diff.max(),
          crops_diff.mean())

    grad_diff = np.abs(grad_tf_data - grad_torch_data)
    print('backward (maxval, min_err, max_err, mean_err):',
          grad_tf_data.max(), grad_diff.min(), grad_diff.max(),
          grad_diff.mean())
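# NOTE: compare_with_tf assumes a generate_data helper that is not shown in
# this section. A sketch of what it plausibly returns, matching the keyword
# arguments used above: random NCHW images, normalized [y1, x1, y2, x2]
# boxes, and random box-to-image indices. The CPU torch tensors also feed
# TensorFlow's feed_dict via numpy conversion.
import numpy as np
import torch


def generate_data(batch_size, depth, im_height, im_width, n_boxes,
                  xyxy=False, box_normalize=True):
    # random input images, NCHW
    image_data = torch.randn(batch_size, depth, im_height, im_width)

    # random boxes with y1 < y2 and x1 < x2, normalized to [0, 1]
    ys = np.sort(np.random.rand(n_boxes, 2), axis=1)
    xs = np.sort(np.random.rand(n_boxes, 2), axis=1)
    boxes = np.stack([ys[:, 0], xs[:, 0], ys[:, 1], xs[:, 1]], axis=1)
    boxes_data = torch.from_numpy(boxes.astype(np.float32))

    # each box crops from a random image of the batch
    box_index_data = torch.from_numpy(
        np.random.randint(0, batch_size, size=n_boxes).astype(np.int32))
    return image_data, boxes_data, box_index_data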
import torch
from PIL import Image
from torchvision import transforms
import matplotlib.pyplot as plt

# crop_height, crop_width, img_path1, img_path2 and is_cuda are assumed to
# be defined earlier in the script; CropAndResizeFunction and to_varabile
# come from the RoIAlign.pytorch examples.

# box = [y1/height, x1/width, y2/height, x2/width]
boxes_data = torch.FloatTensor([[0, 0, 1, 1], [0, 0, 0.5, 0.5]])

# Create an index to say which box crops which image
box_index_data = torch.IntTensor([0, 1])

# Import the images from file
image_data1 = transforms.ToTensor()(Image.open(img_path1)).unsqueeze(0)
image_data2 = transforms.ToTensor()(Image.open(img_path2)).unsqueeze(0)

# Create a batch of 2 images
image_data = torch.cat((image_data1, image_data2), 0)

# Convert to Variables
image_torch = to_varabile(image_data, is_cuda=is_cuda)
boxes = to_varabile(boxes_data, is_cuda=is_cuda)
box_index = to_varabile(box_index_data, is_cuda=is_cuda)

# Crops and resize bbox1 from img1 and bbox2 from img2
crops_torch = CropAndResizeFunction(crop_height, crop_width, 0)(
    image_torch, boxes, box_index)

# Visualize the crops
print(crops_torch.data.size())
crops_torch_data = crops_torch.data.cpu().numpy().transpose(0, 2, 3, 1)
fig = plt.figure()
plt.subplot(121)
plt.imshow(crops_torch_data[0])
plt.subplot(122)
plt.imshow(crops_torch_data[1])
plt.show()
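# NOTE: several blocks in this section build CropAndResizeFunction boxes by
# hand-swapping columns from [x1, y1, x2, y2] order into the normalized
# [y1, x1, y2, x2] order the op expects. A small hypothetical helper (not in
# the original code) makes the convention explicit:
import torch


def xyxy_to_yxyx(boxes):
    """Swap already-normalized [x1, y1, x2, y2] boxes into the
    [y1, x1, y2, x2] order expected by CropAndResizeFunction."""
    return boxes[:, [1, 0, 3, 2]]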
def train(**kwargs):
    # Retrieve training configuration
    data_loader = kwargs['data_loader']
    net = kwargs['net']
    loss = kwargs['loss']
    optimizer = kwargs['optimizer']
    feature_center = kwargs['feature_center']
    epoch = kwargs['epoch']
    save_freq = kwargs['save_freq']
    save_dir = kwargs['save_dir']
    verbose = kwargs['verbose']
    writer = kwargs['writer']

    # Attention Regularization: LA Loss
    l2_loss = nn.MSELoss()

    # Default Parameters
    beta = 0.05
    theta_c = 0.5
    theta_d = 0.5
    crop_size = (448, 448)  # size of cropped images for 'See Better'

    # metrics initialization
    batches = 0
    epoch_loss = np.array([0, 0, 0], dtype='float')  # Loss on Raw/Crop/Drop Images
    epoch_acc = np.array([[0, 0, 0], [0, 0, 0], [0, 0, 0]],
                         dtype='float')  # Top-1/3/5 Accuracy for Raw/Crop/Drop Images

    # begin training
    start_time = time.time()
    logging.info('Epoch %03d, Learning Rate %g' %
                 (epoch + 1, optimizer.param_groups[0]['lr']))
    net.train()
    for i, (X, y) in enumerate(data_loader):
        batch_start = time.time()

        # obtain data for training
        X = X.to(torch.device("cuda"))
        y = y.to(torch.device("cuda"))

        ##################################
        # Raw Image
        ##################################
        y_pred, feature_matrix, attention_maps = net(X)

        # loss
        batch_loss_1 = loss(y_pred, y)
        epoch_loss[0] += batch_loss_1.item()

        # metrics: top-1, top-3, top-5 error
        with torch.no_grad():
            epoch_acc[0] += accuracy(y_pred, y, topk=(1, 3, 5))

        ##################################
        # Reshape center and bap
        ##################################
        feature_center = feature_center.reshape((feature_center.shape[0], -1))
        feature_matrix = feature_matrix.reshape((feature_matrix.shape[0], -1))

        # get this batch's batch_center
        batch_center = feature_center[y]
        # normalize batch_center
        batch_center = nn.functional.normalize(batch_center, 2, -1)

        # Update Feature Center
        feature_center[y] += beta * (feature_matrix.detach() - batch_center)

        # loss_center = l2_loss(feature_matrix, batch_center)
        distance = torch.pow(feature_matrix - batch_center, 2)
        distance = torch.sum(distance, -1)
        loss_center = torch.mean(distance)

        ##################################
        # Attention Cropping
        ##################################
        with torch.no_grad():
            crop_masks = F.upsample_bilinear(attention_maps,
                                             size=(X.size(2), X.size(3)))
            bboxes = attention_crop(crop_masks.cpu().detach().numpy())
            bboxes = torch.from_numpy(bboxes).cuda()
            box_index = torch.IntTensor(range(crop_masks.size(0))).cuda()

        crop_images = CropAndResizeFunction(crop_size[0], crop_size[1], 0)(
            to_varabile(X), to_varabile(bboxes), to_varabile(box_index))

        # loss
        y_pred, _, _ = net(crop_images)
        batch_loss_2 = loss(y_pred, y)
        epoch_loss[1] += batch_loss_2.item()

        with torch.no_grad():
            epoch_acc[1] += accuracy(y_pred, y, topk=(1, 3, 5))

        ##################################
        # Attention Dropping
        ##################################
        with torch.no_grad():
            crop_masks = F.upsample_bilinear(attention_maps,
                                             size=(X.size(2), X.size(3)))
            mask = attention_drop(crop_masks.cpu().detach().numpy())
            mask = torch.from_numpy(mask).cuda()
            drop_images = X * mask

        # loss
        y_pred, _, _ = net(drop_images)
        batch_loss_3 = loss(y_pred, y)
        epoch_loss[2] += batch_loss_3.item()

        with torch.no_grad():
            epoch_acc[2] += accuracy(y_pred, y, topk=(1, 3, 5))

        total_loss = (1 / 3.0 * batch_loss_1 + 1 / 3.0 * batch_loss_2 +
                      1 / 3.0 * batch_loss_3 + loss_center)
        # total_loss = 1 / 2.0 * batch_loss_1 + 1 / 2.0 * batch_loss_2 + loss_center

        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        # end of this batch
        batches += 1
        batch_end = time.time()
        if (i + 1) % verbose == 0:
            logging.info(
                '\tBatch %d: (Raw) Loss %.4f, Accuracy: (%.2f, %.2f, %.2f), '
                '(Crop) Loss %.4f, Accuracy: (%.2f, %.2f, %.2f), '
                '(Drop) Loss %.4f, Accuracy: (%.2f, %.2f, %.2f), Time %3.2f' %
                (i + 1,
                 epoch_loss[0] / batches, epoch_acc[0, 0] / batches,
                 epoch_acc[0, 1] / batches, epoch_acc[0, 2] / batches,
                 epoch_loss[1] / batches, epoch_acc[1, 0] / batches,
                 epoch_acc[1, 1] / batches, epoch_acc[1, 2] / batches,
                 epoch_loss[2] / batches, epoch_acc[2, 0] / batches,
                 epoch_acc[2, 1] / batches, epoch_acc[2, 2] / batches,
                 batch_end - batch_start))
            writer.add_image('raw_img', X[0], (epoch + 1) * 100 + (i + 1) / verbose)
            # writer.add_image('crop_mask', crop_masks[0], (epoch + 1) * 100 + (i + 1) / verbose)
            # writer.add_image('crop_img', crop_images[0], (epoch + 1) * 100 + (i + 1) / verbose)
            # writer.add_image('drop_img', drop_images[0], (epoch + 1) * 100 + (i + 1) / verbose)

    # save checkpoint model
    if epoch % save_freq == 0:
        state_dict = net.module.state_dict()
        for key in state_dict.keys():
            state_dict[key] = state_dict[key].cpu()

        torch.save({
            'epoch': epoch,
            'save_dir': save_dir,
            'state_dict': state_dict,
            'feature_center': feature_center.cpu()
        }, os.path.join(save_dir, '%03d.ckpt' % (epoch + 1)))

    # end of this epoch
    end_time = time.time()

    # metrics for average
    epoch_loss /= batches
    epoch_acc /= batches

    # show information for this epoch
    logging.info(
        'Train: (Raw) Loss %.4f, Accuracy: (%.2f, %.2f, %.2f), '
        '(Crop) Loss %.4f, Accuracy: (%.2f, %.2f, %.2f), '
        '(Drop) Loss %.4f, Accuracy: (%.2f, %.2f, %.2f), Time %3.2f' %
        (epoch_loss[0], epoch_acc[0, 0], epoch_acc[0, 1], epoch_acc[0, 2],
         epoch_loss[1], epoch_acc[1, 0], epoch_acc[1, 1], epoch_acc[1, 2],
         epoch_loss[2], epoch_acc[2, 0], epoch_acc[2, 1], epoch_acc[2, 2],
         end_time - start_time))

    writer.add_scalars('scalar/train', {
        'acc_raw': epoch_acc[0, 0],
        'acc_crop': epoch_acc[1, 0],
        'acc_drop': epoch_acc[2, 0]
    }, epoch)
    writer.add_scalars('scalar/train', {
        'loss_raw': epoch_loss[0],
        'loss_crop': epoch_loss[1],
        'loss_drop': epoch_loss[2]
    }, epoch)
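# NOTE: train() and validate() accumulate the result of an accuracy helper
# into numpy arrays. Its definition is not part of this excerpt; a
# conventional top-k implementation consistent with that usage, assuming it
# returns percentages as a numpy array:
import numpy as np
import torch


def accuracy(output, target, topk=(1,)):
    """Top-k accuracy in percent, returned as a numpy array."""
    maxk = max(topk)
    batch_size = target.size(0)

    # indices of the k highest-scoring classes per sample
    _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size).item())
    return np.array(res)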
def validate(**kwargs):
    # Retrieve training configuration
    data_loader = kwargs['data_loader']
    net = kwargs['net']
    loss = kwargs['loss']
    verbose = kwargs['verbose']

    # metrics initialization
    batches = 0
    epoch_loss = 0
    epoch_acc = np.array([0, 0, 0], dtype='float')  # top-1, 3, 5

    # begin validation
    start_time = time.time()
    net.eval()
    with torch.no_grad():
        for i, (X, y) in enumerate(data_loader):
            batch_start = time.time()

            # obtain data
            X = X.to(torch.device("cuda"))
            y = y.to(torch.device("cuda"))

            ##################################
            # Raw Image
            ##################################
            y_pred_raw, feature_matrix, attention_maps = net(X)

            ##################################
            # Object Localization and Refinement
            ##################################
            attention_maps = torch.mean(attention_maps, dim=1, keepdim=True)
            attention_maps = F.upsample_bilinear(attention_maps,
                                                 size=(X.size(2), X.size(3)))
            bboxes = mask2bbox(attention_maps.cpu().detach().numpy())
            bboxes = torch.from_numpy(bboxes).cuda()
            box_index = torch.IntTensor(range(attention_maps.size(0))).cuda()
            crop_images = CropAndResizeFunction(X.size(2), X.size(3), 0)(
                to_varabile(X), to_varabile(bboxes), to_varabile(box_index))
            y_pred_crop, _, _ = net(crop_images)

            # Alternative: crop by thresholding the attention map directly
            # crop_mask = F.upsample_bilinear(attention_map, size=(X.size(2), X.size(3))) > theta_c
            # crop_images = []
            # for batch_index in range(crop_mask.size(0)):
            #     nonzero_indices = torch.nonzero(crop_mask[batch_index, 0, ...])
            #     height_min = nonzero_indices[:, 0].min()
            #     height_max = nonzero_indices[:, 0].max()
            #     width_min = nonzero_indices[:, 1].min()
            #     width_max = nonzero_indices[:, 1].max()
            #     crop_images.append(F.upsample_bilinear(
            #         X[batch_index:batch_index + 1, :, height_min:height_max, width_min:width_max],
            #         size=crop_size))
            # crop_images = torch.cat(crop_images, dim=0)
            # y_pred_crop, _, _ = net(crop_images)

            # final prediction: average the raw and refined softmax scores
            # y_pred = (y_pred_raw + y_pred_crop) / 2.0
            y_pred = torch.log(F.softmax(y_pred_raw) * 0.5 +
                               F.softmax(y_pred_crop) * 0.5)
            # y_pred = y_pred_raw

            # loss
            batch_loss = loss(y_pred, y)
            epoch_loss += batch_loss.item()

            # metrics: top-1, top-3, top-5 error
            epoch_acc += accuracy(y_pred, y, topk=(1, 3, 5))

            # end of this batch
            batches += 1
            batch_end = time.time()
            if (i + 1) % verbose == 0:
                logging.info(
                    '\tBatch %d: Loss %.5f, Accuracy: Top-1 %.2f, Top-3 %.2f, '
                    'Top-5 %.2f, Time %3.2f' %
                    (i + 1, epoch_loss / batches, epoch_acc[0] / batches,
                     epoch_acc[1] / batches, epoch_acc[2] / batches,
                     batch_end - batch_start))

    # end of validation
    end_time = time.time()

    # metrics for average
    epoch_loss /= batches
    epoch_acc /= batches

    # show information for this epoch
    logging.info(
        'Valid: Loss %.5f, Accuracy: Top-1 %.2f, Top-3 %.2f, Top-5 %.2f, Time %3.2f' %
        (epoch_loss, epoch_acc[0], epoch_acc[1], epoch_acc[2],
         end_time - start_time))
    logging.info('')

    return epoch_loss
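# NOTE: validate() relies on a mask2bbox helper that turns each (upsampled)
# attention map into one normalized [y1, x1, y2, x2] crop box. Its
# implementation is not shown; a plausible thresholding sketch (theta is an
# assumed parameter):
import numpy as np


def mask2bbox(attention_maps, theta=0.5):
    """One normalized [y1, x1, y2, x2] box per attention map, covering
    the region whose activation exceeds theta * max."""
    n, _, h, w = attention_maps.shape
    bboxes = np.zeros((n, 4), dtype=np.float32)
    for i in range(n):
        m = attention_maps[i, 0]
        ys, xs = np.nonzero(m > m.max() * theta)
        if len(ys) == 0:
            bboxes[i] = [0.0, 0.0, 1.0, 1.0]  # fall back to the whole image
        else:
            bboxes[i] = [ys.min() / h, xs.min() / w,
                         (ys.max() + 1) / h, (xs.max() + 1) / w]
    return bboxes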
def forward(self, x, mask=None):
    assert len(x) == 2

    total_bins = 1
    # x[0] is the image with shape (rows, cols, channels)
    img = x[0]
    # x[1] is the roi with shape (num_rois, 4), ordered (x, y, w, h)
    rois = x[1]

    # because crop_size of tf.crop_and_resize requires a 1-D tensor,
    # we use a uniform bin size
    bin_crop_size = []
    for num_bins, crop_dim in zip((7, 7), (14, 14)):
        assert num_bins >= 1
        assert crop_dim % num_bins == 0
        total_bins *= num_bins
        bin_crop_size.append(crop_dim // num_bins)

    xmin, ymin, xmax, ymax = torch.unbind(rois[0], dim=1)
    # e.g. torch.unbind(torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), dim=1)
    # -> (tensor([1, 4, 7]), tensor([2, 5, 8]), tensor([3, 6, 9]))
    spatial_bins_y = spatial_bins_x = 7
    step_y = (ymax - ymin) / spatial_bins_y
    step_x = (xmax - xmin) / spatial_bins_x

    # generate the position-sensitive bins
    position_sensitive_boxes = []
    for bin_x in range(self.pool_size):
        for bin_y in range(self.pool_size):
            box_coordinates = [
                ymin + bin_y * step_y,
                xmin + bin_x * step_x,
                ymin + (bin_y + 1) * step_y,
                xmin + (bin_x + 1) * step_x,
            ]
            position_sensitive_boxes.append(torch.stack(box_coordinates, dim=1))

    img_splits = torch.split(img, total_bins, dim=3)
    box_image_indices = torch.zeros(self.num_rois, dtype=torch.int)

    feature_crops = []
    for split, box in zip(img_splits, position_sensitive_boxes):
        # assert box.shape[0] == box_image_indices.shape[0], "Psroi box number doesn't match roi box indices!"
        # the tf original:
        # crop = tf.image.crop_and_resize(split, box, box_image_indices,
        #                                 bin_crop_size, method='bilinear')
        crop = CropAndResizeFunction.apply(split, box, box_image_indices,
                                           bin_crop_size[0], bin_crop_size[1], 0)
        # crop shape: [num_boxes, depth/total_bins, bin_crop_h, bin_crop_w]

        # max-pool over the spatial positions within the bin; the NCHW
        # equivalent of tf.reduce_max(crop, axis=[1, 2])
        crop = crop.max(dim=3)[0].max(dim=2)[0]
        crop = crop.unsqueeze(1)  # tf.expand_dims(crop, 1)
        # shape [num_boxes, 1, depth/total_bins]
        feature_crops.append(crop)

    final_output = torch.cat(feature_crops, dim=1)

    # Reshape to (1, num_rois, pool_size, pool_size, nb_channels)
    # Might be (1, 4, 7, 7, 5)
    final_output = final_output.reshape(1, self.num_rois, self.pool_size,
                                        self.pool_size, self.alpha_channels)
    return final_output
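# NOTE: the original code tried to port tf.reduce_max(crop, axis=[1, 2]) by
# stacking two independent torch.max calls, which is not equivalent (and
# torch.max returns a (values, indices) tuple). A small self-check of the
# sequential-max fix used above, on hypothetical shapes:
import torch

# NCHW feature crop: 4 boxes, 5 channels, 2x2 spatial bin
crop = torch.randn(4, 5, 2, 2)

# torch equivalent of tf.reduce_max(crop_nhwc, axis=[1, 2]):
pooled = crop.max(dim=3)[0].max(dim=2)[0]   # -> [4, 5]
assert pooled.shape == (4, 5)
assert torch.equal(pooled, torch.amax(crop, dim=(2, 3)))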
def forward(self, X):
    N = X.size()[0]
    crop_height = 7
    crop_width = 7
    boxes_data = torch.FloatTensor([[0, 0, 1, 1]])
    box_index_data = torch.IntTensor([0])
    boxes = Variable(boxes_data, requires_grad=False)
    box_index = Variable(box_index_data, requires_grad=False)

    assert X.size() == (N, 3, 448, 448)

    X_conv4_3 = self.features_conv4_3(X)
    X_conv4_3_down = self.resize_halve(X_conv4_3)
    X_conv5_1 = self.features_conv5_1(X)
    X_conv5_2 = self.features_conv5_2(X_conv5_1)
    X_conv5_3 = self.features_conv5_3(X_conv5_2)
    X_conv4_add_5_1 = X_conv5_1.add(X_conv4_3_down)
    X_conv4_add_5_2 = X_conv5_2.add(X_conv4_3_down)
    X_conv4_add_5_3 = X_conv5_3.add(X_conv4_3_down)

    X_conv451_torch = Variable(X_conv4_add_5_1, requires_grad=False)
    X_conv451_crop = CropAndResizeFunction(crop_height, crop_width, 0)(
        X_conv451_torch, boxes, box_index)
    X_conv452_torch = Variable(X_conv4_add_5_2, requires_grad=False)
    X_conv452_crop = CropAndResizeFunction(crop_height, crop_width, 0)(
        X_conv452_torch, boxes, box_index)
    X_conv453_torch = Variable(X_conv4_add_5_3, requires_grad=False)
    X_conv453_crop = CropAndResizeFunction(crop_height, crop_width, 0)(
        X_conv453_torch, boxes, box_index)

    X_branch_1 = self.hbp(X_conv451_crop, X_conv452_crop)
    X_branch_2 = self.hbp(X_conv452_crop, X_conv453_crop)
    X_branch_3 = self.hbp(X_conv451_crop, X_conv453_crop)
    # X_branch_1 = self.hbp(X_conv4_add_5_1, X_conv4_add_5_2)
    # X_branch_2 = self.hbp(X_conv4_add_5_2, X_conv4_add_5_3)
    # X_branch_3 = self.hbp(X_conv4_add_5_1, X_conv4_add_5_3)

    X_branch = torch.cat([X_branch_1, X_branch_2, X_branch_3], dim=1)
    assert X_branch.size() == (N, 1024 * 3)

    X = self.fc(X_branch)
    assert X.size() == (N, 200)
    return X
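# NOTE: the hbp call above is the Hierarchical Bilinear Pooling step, whose
# definition is not part of this excerpt. A hedged sketch of the usual
# formulation; proj_1 and proj_2 are hypothetical 1x1-conv projection layers
# and the real module may differ:
import torch
import torch.nn.functional as F


def hbp(self, conv_a, conv_b):
    # project both feature maps into a joint embedding, multiply
    # element-wise, sum-pool over spatial positions, then apply a
    # signed square-root and l2 normalization
    N = conv_a.size(0)
    joint = self.proj_1(conv_a) * self.proj_2(conv_b)  # hypothetical 1x1 convs
    pooled = joint.view(N, joint.size(1), -1).sum(dim=2)
    pooled = torch.sign(pooled) * torch.sqrt(torch.abs(pooled) + 1e-5)
    return F.normalize(pooled)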
def pyramid_roi_align(inputs, pool_size=[14, 14], image_shape=[416, 416, 3]):
    """Implements ROI Pooling on multiple levels of the feature pyramid.

    Params:
    - pool_size: [height, width] of the output pooled regions. Usually [7, 7]
    - image_shape: [height, width, channels]. Shape of input image in pixels

    Inputs:
    - boxes: [batch, num_boxes, (y1, x1, y2, x2)] in normalized coordinates.
    - feature maps: list of feature maps from different levels of the
      pyramid. Each is [batch, channels, height, width]

    Output:
    Pooled regions in the shape: [num_boxes, channels, height, width].
    The width and height are those specified in pool_size.
    """
    # Currently only supports batch size 1
    for i in range(len(inputs)):
        inputs[i] = inputs[i].squeeze(0)

    # Crop boxes [batch, num_boxes, (y1, x1, y2, x2)] in normalized coords
    boxes = inputs[0]

    # Feature maps: list of feature maps from different levels of the
    # feature pyramid. Each is [batch, channels, height, width]
    feature_maps = inputs[1:]

    # Assign each ROI to a level in the pyramid based on the ROI area.
    y1, x1, y2, x2 = boxes.chunk(4, dim=1)
    h = y2 - y1
    w = x2 - x1

    # Equation 1 in the Feature Pyramid Networks paper. Account for
    # the fact that our coordinates are normalized here.
    # e.g. a 224x224 ROI (in pixels) maps to level 3 here
    image_area = torch.tensor([float(image_shape[0] * image_shape[1])],
                              requires_grad=False)
    if boxes.is_cuda:
        image_area = image_area.cuda()
    roi_level = 3 + torch.log2(torch.sqrt(h * w) / (224.0 / torch.sqrt(image_area)))
    roi_level = roi_level.round().int()
    roi_level = roi_level.clamp(1, 3)

    # Loop through levels and apply ROI pooling to each. P1 to P3.
    pooled = []
    box_to_level = []
    for i, level in enumerate(range(1, 4)):
        ix = roi_level == level
        if not ix.any():
            continue
        ix = torch.nonzero(ix)[:, 0]
        level_boxes = boxes[ix.data, :]

        # Keep track of which box is mapped to which level
        box_to_level.append(ix.data)

        # Stop gradient propagation to ROI proposals
        level_boxes = level_boxes.detach()

        # Crop and Resize
        # From Mask R-CNN paper: "We sample four regular locations, so
        # that we can evaluate either max or average pooling. In fact,
        # interpolating only a single value at each bin center (without
        # pooling) is nearly as effective."
        #
        # Here we use the simplified approach of a single value per bin,
        # which is how it's done in tf.crop_and_resize()
        # Result: [batch * num_boxes, pool_height, pool_width, channels]
        ind = torch.zeros(level_boxes.size()[0], requires_grad=False).int()
        if level_boxes.is_cuda:
            ind = ind.cuda()
        # CropAndResizeFunction needs a batch dimension
        feature_maps[i] = feature_maps[i].unsqueeze(0)
        pooled_features = CropAndResizeFunction(pool_size[0], pool_size[1], 0)(
            feature_maps[i], level_boxes, ind)
        pooled.append(pooled_features)

    # Pack pooled features into one tensor
    pooled = torch.cat(pooled, dim=0)

    # Pack box_to_level mapping into one array and add another
    # column representing the order of pooled boxes
    box_to_level = torch.cat(box_to_level, dim=0)

    # Rearrange pooled features to match the order of the original boxes
    _, box_to_level = torch.sort(box_to_level)
    pooled = pooled[box_to_level, :, :]

    return pooled
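# NOTE: a quick check of the level-assignment formula above, for a
# hypothetical 224x224-pixel ROI in a 416x416 image. With the 3 + log2(...)
# variant used here (clamped to levels 1..3), such an ROI lands on level 3:
import torch

image_area = torch.tensor([416. * 416.])
h = w = torch.tensor([224. / 416.])  # 224x224 pixels, normalized coords
roi_level = 3 + torch.log2(torch.sqrt(h * w) / (224.0 / torch.sqrt(image_area)))
print(roi_level.round().int().clamp(1, 3))  # -> tensor([3], dtype=torch.int32)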
def forward(self, x, targets):
    """Applies network layers and ops on input image(s) x.

    Args:
        x: input image or batch of images. Shape: [batch, 3, 300, 300].

    Return:
        Depending on phase:
        test:
            Variable(tensor) of output class label predictions,
            confidence score, and corresponding location predictions for
            each object detected. Shape: [batch, topk, 7]
        train:
            list of concat outputs from:
                1: confidence layers, Shape: [batch*num_priors, num_classes]
                2: localization layers, Shape: [batch, num_priors*4]
                3: priorbox layers, Shape: [2, num_priors*4]
    """
    sources = list()
    loc = list()
    conf = list()
    has_lp = list()
    size_lp = list()
    offset = list()

    sources_2 = list()
    loc_2 = list()
    conf_2 = list()
    four_corners_2 = list()

    # apply vgg up to conv1_1 relu
    # TODO: maybe use conv1_1 features
    for k in range(2):
        x = self.vgg[k](x)
        if k == 1:
            # conv1_1 feature relu
            conv1_1_feat = x

    # apply vgg up to conv4_3 relu
    for k in range(2, 23):
        x = self.vgg[k](x)
    s = self.L2Norm(x)
    sources.append(s)

    # apply vgg up to fc7
    for k in range(23, len(self.vgg)):
        x = self.vgg[k](x)
    sources.append(x)

    # apply extra layers and cache source layer outputs
    for k, v in enumerate(self.extras):
        x = F.relu(v(x), inplace=True)
        if k % 2 == 1:
            sources.append(x)

    # apply multibox head to source layers
    for (x, l, c, h, s, o) in zip(sources, self.loc, self.conf,
                                  self.has_lp, self.size_lp, self.offset):
        loc.append(l(x).permute(0, 2, 3, 1).contiguous())
        conf.append(c(x).permute(0, 2, 3, 1).contiguous())
        has_lp.append(h(x).permute(0, 2, 3, 1).contiguous())
        size_lp.append(s(x).permute(0, 2, 3, 1).contiguous())
        offset.append(o(x).permute(0, 2, 3, 1).contiguous())

    loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
    conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)
    has_lp = torch.cat([o.view(o.size(0), -1) for o in has_lp], 1)
    size_lp = torch.cat([o.view(o.size(0), -1) for o in size_lp], 1)
    offset = torch.cat([o.view(o.size(0), -1) for o in offset], 1)

    # [num, num_classes, top_k, 10]
    rpn_rois = self.detect(
        loc.view(loc.size(0), -1, 4),                                 # loc preds
        self.softmax(conf.view(conf.size(0), -1, self.num_classes)),  # conf preds
        # default boxes; keep them on the current GPU -- the previous code
        # had a serious bug here with parameters spread across different GPUs
        self.priors.cuda(),
        self.sigmoid(has_lp.view(has_lp.size(0), -1, 1)),
        size_lp.view(size_lp.size(0), -1, 2),
        offset.view(offset.size(0), -1, 2)
    )
    # detach so no gradient flows back through the proposals
    rpn_rois = rpn_rois.detach()

    # roi align or roi warping
    crop_height = self.size_2
    crop_width = self.size_2
    is_cuda = torch.cuda.is_available()

    if self.phase == 'train':
        # rpn_rois: [num, num_classes, top_k, 10]
        # rois: [num, num_gt, 6], 6: IOU with GT, bbox(4), max iou with GT or not
        # targets: [num, num_gt, 22]: bbox(4), has_lp(1), size(2), offset(2),
        #          lp_bbox(4), lp_four_points(8), label(1)
        # rois and targets are lists of tensors at the outermost level, so
        # the per-image tensors are allowed to have different sizes
        proposal_target_offset = ProposalTargetLayer_offset()
        rois = proposal_target_offset(rpn_rois, targets, self.expand_num)

        gt_new = torch.empty(0)
        boxes_data_list = []
        box_index_data_list = []
        for idx in range(len(rois)):
            num_gt = targets[idx].shape[0]

            # positions of all ground-truth license plates
            targets_tensor = targets[idx]
            # car_center_x = (targets_tensor[:, 0].unsqueeze(1) + targets_tensor[:, 2].unsqueeze(1)) / 2.0
            # car_center_y = (targets_tensor[:, 1].unsqueeze(1) + targets_tensor[:, 3].unsqueeze(1)) / 2.0
            # car_center = torch.cat((car_center_x, car_center_y), 1)
            # lp_center = car_center + targets_tensor[:, 7:9]
            # lp_bbox = torch.cat((lp_center - targets_tensor[:, 5:7]/2, lp_center + targets_tensor[:, 5:7]/2), 1)
            lp_bbox = targets_tensor[:, 9:13]

            # four corner points of each plate
            lp_four_points = targets_tensor[:, 13:21]
            # collect the plate GTs that fall inside the rois, and re-express
            # them relative to each roi's top-left corner
            # (a sketch of a_include_b follows this function)
            rois_squeeze = rois[idx][:num_gt, 1:-1]

            a_include_b_list = []
            for i in range(num_gt):
                a_include_b_list.append(a_include_b(rois_squeeze[i, :], lp_bbox[i, :]))
            has_lp_list = []
            for i in range(num_gt):
                has_lp_list.append(targets_tensor[i, 4].cpu().numpy() > 0)
            gt_in_rois_list = (np.array(a_include_b_list) + 0) & (np.array(has_lp_list) + 0)
            gt_in_rois_tensor = torch.tensor(gt_in_rois_list).type(torch.uint8).bool()

            rois_squeeze = rois_squeeze[gt_in_rois_tensor, :]
            lp_bbox = lp_bbox[gt_in_rois_tensor, :]
            lp_four_points = lp_four_points[gt_in_rois_tensor, :]

            if rois_squeeze.shape[0] > 0:
                # rescale the plate GT bbox into roi coordinates
                rois_top_left = rois_squeeze[:, :2].repeat(1, 2)
                rois_width = rois_squeeze[:, 2] - rois_squeeze[:, 0]
                rois_height = rois_squeeze[:, 3] - rois_squeeze[:, 1]
                rois_size = torch.cat((rois_width.unsqueeze(1),
                                       rois_height.unsqueeze(1)), 1).repeat(1, 2)
                gt_bbox = (lp_bbox - rois_top_left) / rois_size

                # new plate corner points
                rois_top_left_2 = rois_squeeze[:, :2].repeat(1, 4)
                rois_size_2 = torch.cat((rois_width.unsqueeze(1),
                                         rois_height.unsqueeze(1)), 1).repeat(1, 4)
                gt_four_points = (lp_four_points - rois_top_left_2) / rois_size_2

                # GT label
                gt_label = torch.zeros((gt_bbox.shape[0], 1))

                # is-valid flag: marks this GT as real. Because multi-GPU
                # gathering requires every replica to return something, some
                # "not valid" entries are fabricated further down.
                # TODO: this is not a friendly way to do it
                gt_valid = torch.ones((gt_bbox.shape[0], 1))

                # concat
                gt_cur = torch.cat((gt_bbox, gt_four_points, gt_label, gt_valid), 1)
                gt_new = torch.cat((gt_new, gt_cur), 0)

                # build the second network's GT; the gt_2 list has to line up
                # with the n dimension of crops_torch below, hence the loop
                for gt_idx in range(gt_cur.shape[0]):
                    box_index_data_list.append(idx)  # index of the current image
                    boxes_data = torch.zeros(rois_squeeze.shape)
                    boxes_data[:, 0] = rois_squeeze[:, 1]
                    boxes_data[:, 1] = rois_squeeze[:, 0]
                    boxes_data[:, 2] = rois_squeeze[:, 3]
                    boxes_data[:, 3] = rois_squeeze[:, 2]
                    boxes_data_list.append(boxes_data[gt_idx, :].cpu().numpy())  # current region

        if gt_new.shape[0] > 0:
            # here the car regions serve as the rois
            # Define the boxes (crops)
            # box = [y1/height, x1/width, y2/height, x2/width]
            boxes_data = torch.FloatTensor(boxes_data_list)

            # Create an index to say which box crops which image
            box_index_data = torch.IntTensor(box_index_data_list)

            # Create batch of images
            image_data = conv1_1_feat

            # Convert to Variables. The image features must stay
            # differentiable here (see the ROIAlign source): gradients are
            # needed at training time but not at test time.
            image_torch = to_varabile(image_data, is_cuda=is_cuda, requires_grad=True)
            boxes = to_varabile(boxes_data, is_cuda=is_cuda, requires_grad=False)
            box_index = to_varabile(box_index_data, is_cuda=is_cuda, requires_grad=False)

            # Crops and resize bbox1 from img1 and bbox2 from img2
            # n*64*crop_height*crop_width
            crops_torch = CropAndResizeFunction.apply(image_torch, boxes, box_index,
                                                      crop_height, crop_width, 0)

            # the second network
            x_2 = crops_torch
            for k in range(4):
                x_2 = self.vgg_2[k](x_2)
            sources_2.append(x_2)
            for k in range(4, 9):
                x_2 = self.vgg_2[k](x_2)
            sources_2.append(x_2)
            for k in range(9, 14):
                x_2 = self.vgg_2[k](x_2)
            sources_2.append(x_2)

            # apply multibox head to source layers
            for (x_2, l_2, c_2, f_2) in zip(sources_2, self.loc_2, self.conf_2,
                                            self.four_corners_2):
                loc_2.append(l_2(x_2).permute(0, 2, 3, 1).contiguous())
                conf_2.append(c_2(x_2).permute(0, 2, 3, 1).contiguous())
                four_corners_2.append(f_2(x_2).permute(0, 2, 3, 1).contiguous())

            loc_2 = torch.cat([o.view(o.size(0), -1) for o in loc_2], 1)
            conf_2 = torch.cat([o.view(o.size(0), -1) for o in conf_2], 1)
            four_corners_2 = torch.cat([o.view(o.size(0), -1) for o in four_corners_2], 1)

        # if loc_2 is still a list, there was no gt_new, and both the second
        # network's predictions and its GT are empty
        if isinstance(loc_2, list):
            output = (
                loc.view(loc.size(0), -1, 4),
                conf.view(conf.size(0), -1, self.num_classes),
                self.priors,
                has_lp.view(has_lp.size(0), -1, 1),
                size_lp.view(size_lp.size(0), -1, 2),
                offset.view(offset.size(0), -1, 2),
                # second network. TODO: this is a very unfriendly workaround
                torch.zeros(1, self.priors_2.shape[0], 4),
                torch.zeros(1, self.priors_2.shape[0], 2),
                self.priors_2,
                torch.zeros(1, self.priors_2.shape[0], 8),
                torch.zeros(1, 14)  # a trailing 0 marks this GT as not valid
            )
        else:
            output = (
                loc.view(loc.size(0), -1, 4),
                conf.view(conf.size(0), -1, self.num_classes),
                self.priors,
                has_lp.view(has_lp.size(0), -1, 1),
                size_lp.view(size_lp.size(0), -1, 2),
                offset.view(offset.size(0), -1, 2),
                # second network
                loc_2.view(loc_2.size(0), -1, 4),
                conf_2.view(conf_2.size(0), -1, self.num_classes),
                self.priors_2,
                four_corners_2.view(four_corners_2.size(0), -1, 8),
                gt_new
            )

    elif self.phase == 'test':
        has_lp_th = 0.5
        th = 0.6

        # holds both the car and the plate detections
        output = torch.zeros(1, 3, 200, 13)

        # store the car detections
        output[0, 1, :, :5] = rpn_rois[0, 1, :, :5]

        # also take "has plate" into account: only cars that both exist and
        # carry a plate go on to plate detection
        rois_idx = (rpn_rois[0, 1, :, 0] > th) & (rpn_rois[0, 1, :, 5] > has_lp_th)
        matches = rpn_rois[0, 1, rois_idx, :]
        if matches.shape[0] == 0:
            return output

        # expand the plate region inside the car from the predicted offset,
        # size and the expansion factor
        car_center = (matches[:, [1, 2]] + matches[:, [3, 4]]) / 2
        lp_center = car_center + matches[:, [8, 9]]
        lp_bbox_top_left = lp_center - matches[:, [6, 7]] / 2 * self.expand_num
        lp_bbox_bottom_right = lp_center + matches[:, [6, 7]] / 2 * self.expand_num
        lp_bbox = torch.cat((lp_bbox_top_left, lp_bbox_bottom_right), 1)

        # clamp the expanded plate region to the image ...
        lp_bbox = torch.max(lp_bbox, torch.zeros(lp_bbox.shape))
        lp_bbox = torch.min(lp_bbox, torch.ones(lp_bbox.shape))
        # ... and to the detected car
        lp_bbox = torch.max(lp_bbox, matches[:, 1:3].repeat(1, 2))
        lp_bbox = torch.min(lp_bbox, matches[:, 3:5].repeat(1, 2))

        # [num_car, 4]
        rois_squeeze = lp_bbox

        # here the car regions serve as the rois
        # Define the boxes (crops)
        # box = [y1/height, x1/width, y2/height, x2/width]
        boxes_data = torch.zeros(rois_squeeze.shape)
        boxes_data[:, 0] = rois_squeeze[:, 1]
        boxes_data[:, 1] = rois_squeeze[:, 0]
        boxes_data[:, 2] = rois_squeeze[:, 3]
        boxes_data[:, 3] = rois_squeeze[:, 2]

        # Create an index to indicate which box crops which image
        box_index_data = torch.IntTensor(range(boxes_data.shape[0]))

        # Create a batch of images. This repeat is critical: without it the
        # cropped features downstream are all zeros!
        image_data = conv1_1_feat.repeat(rois_squeeze.shape[0], 1, 1, 1)

        # Convert to Variables (no gradients needed at test time)
        image_torch = to_varabile(image_data, is_cuda=is_cuda, requires_grad=False)
        boxes = to_varabile(boxes_data, is_cuda=is_cuda, requires_grad=False)
        box_index = to_varabile(box_index_data, is_cuda=is_cuda, requires_grad=False)

        # Crops and resize bbox1 from img1 and bbox2 from img2
        # n*64*crop_height*crop_width
        crops_torch = CropAndResizeFunction.apply(image_torch, boxes, box_index,
                                                  crop_height, crop_width, 0)

        # Visualize the crops
        # print(crops_torch.data.size())
        # crops_torch_data = crops_torch.data.cpu().numpy().transpose(0, 2, 3, 1)
        # import matplotlib.pyplot as plt
        # for m in range(rois_squeeze.shape[0]):
        #     fig = plt.figure()
        #     currentAxis = plt.gca()
        #     # pt = gt_2[m][0, :4].cpu().numpy() * self.size_2
        #     # coords = (pt[0], pt[1]), pt[2] - pt[0] + 1, pt[3] - pt[1] + 1
        #     # currentAxis.add_patch(plt.Rectangle(*coords, fill=False))
        #     plt.imshow(crops_torch_data[m, :, :, 33])
        #     plt.show()

        # the second network
        x_2 = crops_torch
        for k in range(4):
            x_2 = self.vgg_2[k](x_2)
        sources_2.append(x_2)
        for k in range(4, 9):
            x_2 = self.vgg_2[k](x_2)
        sources_2.append(x_2)
        for k in range(9, 14):
            x_2 = self.vgg_2[k](x_2)
        sources_2.append(x_2)

        # apply multibox head to source layers
        for (x_2, l_2, c_2, f_2) in zip(sources_2, self.loc_2, self.conf_2,
                                        self.four_corners_2):
            loc_2.append(l_2(x_2).permute(0, 2, 3, 1).contiguous())
            conf_2.append(c_2(x_2).permute(0, 2, 3, 1).contiguous())
            four_corners_2.append(f_2(x_2).permute(0, 2, 3, 1).contiguous())

        loc_2 = torch.cat([o.view(o.size(0), -1) for o in loc_2], 1)
        conf_2 = torch.cat([o.view(o.size(0), -1) for o in conf_2], 1)
        four_corners_2 = torch.cat([o.view(o.size(0), -1) for o in four_corners_2], 1)

        output_2 = self.detect_2(
            loc_2.view(loc_2.size(0), -1, 4),
            self.softmax_2(conf_2.view(conf_2.size(0), -1, self.num_classes)),
            self.priors_2.cuda(),
            four_corners_2.view(four_corners_2.size(0), -1, 8)
        )

        # This variant merges the plate detections from all cars, then keeps
        # only the top 200 results overall:
        # # (num_car, 200, 13)
        # output_2_pos = output_2[:, 1, :, :]
        # # (num_car, 2)
        # rois_size = rois_squeeze[:, 2:4] - rois_squeeze[:, :2]
        # rois_top_left = rois_squeeze[:, :2]
        # # (num_car, 200, 12)
        # rois_size_expand = rois_size.repeat(1, 6).unsqueeze(1).repeat(1, 200, 1)
        # # (num_car, 200, 12)
        # rois_top_left_expand = rois_top_left.repeat(1, 6).unsqueeze(1).repeat(1, 200, 1)
        # # (num_car, 200, 12)
        # output_2_pos[:, :, 1:] = output_2_pos[:, :, 1:] * rois_size_expand + rois_top_left_expand
        # # (num_car*200, 13)
        # output_2_pos_squeeze = output_2_pos.reshape(-1, output_2_pos.shape[2])
        # _, indices = output_2_pos_squeeze[:, 0].sort(descending=True)
        # output_2_pos_squeeze_sorted = output_2_pos_squeeze[indices, :]
        # # (1, 2, 200, 13)
        # results_2 = output_2_pos_squeeze_sorted[:200, :].unsqueeze(0).unsqueeze(1).repeat(1, 2, 1, 1)

        # This variant instead keeps only the highest-confidence plate per car
        # (num_car, 13)
        output_2_pos = output_2[:, 1, 0, :]

        # (num_car, 2)
        rois_size = rois_squeeze[:, 2:4] - rois_squeeze[:, :2]
        rois_top_left = rois_squeeze[:, :2]
        # (num_car, 12)
        rois_size_expand = rois_size.repeat(1, 6)
        # (num_car, 12)
        rois_top_left_expand = rois_top_left.repeat(1, 6)
        # (num_car, 12): map plate predictions from roi coords back to the image
        output_2_pos[:, 1:] = output_2_pos[:, 1:] * rois_size_expand + rois_top_left_expand

        # store the plate detections
        num_car = output_2_pos.shape[0]
        output[0, 2, :num_car, :] = output_2_pos

        # store the expanded regions after the cars, and set a flag
        output[0, 1, :num_car, 5:9] = lp_bbox
        output[0, 1, :num_car, 9] = 1

        return output
    else:
        print("ERROR: Phase: " + self.phase + " not recognized")
        return

    return output
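# NOTE: the training branch above calls an a_include_b helper that is not
# shown in this excerpt. A minimal sketch consistent with its use, where both
# arguments are [x1, y1, x2, y2] boxes and the test is full containment:
def a_include_b(a, b):
    """True if box b lies entirely inside box a (both [x1, y1, x2, y2])."""
    return bool((a[0] <= b[0]) and (a[1] <= b[1]) and
                (a[2] >= b[2]) and (a[3] >= b[3]))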
def _crop_rois(self, bottom, rois):
    pre_pool_size = 7
    # every box crops from image 0 of the batch
    crops = CropAndResizeFunction(pre_pool_size, pre_pool_size)(
        bottom, Variable(rois),
        Variable(torch.zeros(rois.size(0), 1).cuda().int()))
    return crops
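# NOTE: a hedged usage sketch of _crop_rois; the shapes and the
# normalized-box convention here are assumptions, not taken from the source
# repo. `bottom` is a [1, C, H, W] feature map and `rois` are normalized
# [y1, x1, y2, x2] boxes, all cropping from the single image in the batch:
#
# bottom = torch.randn(1, 512, 38, 50).cuda()   # backbone feature map
# rois = torch.tensor([[0.1, 0.2, 0.6, 0.9],    # normalized [y1, x1, y2, x2]
#                      [0.0, 0.0, 1.0, 1.0]]).cuda()
# crops = net._crop_rois(bottom, rois)          # -> [2, 512, 7, 7]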