def iou_matrix(preds):
    '''
    Build the pairwise intersection-over-union table for a set of boxes.

    Parameters
    ----------
    preds : theano.tensor
        Two-dimensional tensor with one box per row. Only the first four
        columns are read and they are interpreted as the corner
        coordinates :math:`(x_i, y_i, x_f, y_f)`; extra columns are ignored.

    Returns
    -------
    theano.tensor
        Matrix of IOU values, one entry per pair of rows of `preds`
        (presumably :math:`N \\times N`; the exact layout is fixed by the
        project's `meshgrid` helper -- confirm there).

    Notes
    -----
    The union is not guarded against zero, so two zero-area boxes produce
    a 0 / 0 entry.
    '''
    count = preds.shape[0]
    first_idx, second_idx = meshgrid(T.arange(count), T.arange(count))
    boxes_a = preds[first_idx, :]
    boxes_b = preds[second_idx, :]

    # Corners of the overlapping rectangle of every pair.
    left = T.maximum(boxes_a[:, :, 0], boxes_b[:, :, 0])
    top = T.maximum(boxes_a[:, :, 1], boxes_b[:, :, 1])
    right = T.minimum(boxes_a[:, :, 2], boxes_b[:, :, 2])
    bottom = T.minimum(boxes_a[:, :, 3], boxes_b[:, :, 3])

    # Disjoint pairs give a negative extent; clamp it so the area is zero.
    overlap_w = T.maximum(right - left, 0.)
    overlap_h = T.maximum(bottom - top, 0.)
    intersection = overlap_w * overlap_h

    area_a = (boxes_a[:, :, 2] - boxes_a[:, :, 0]) * (boxes_a[:, :, 3] - boxes_a[:, :, 1])
    area_b = (boxes_b[:, :, 2] - boxes_b[:, :, 0]) * (boxes_b[:, :, 3] - boxes_b[:, :, 1])
    union = area_a + area_b - intersection
    return intersection / union
def _get_cost(self, input, truth, alpha=1., min_iou=0.5):
    '''
    Build the symbolic loss summed over every predictive feature map.

    For each feature map the default boxes are compared with the ground
    truth through `self._get_iou`. Default boxes whose best IoU is at
    least `min_iou` are positives and contribute a squared coordinate
    error and a cross-entropy class error. A random sample of boxes whose
    best IoU lies in [0.1, min_iou) is used as negatives and penalised
    against the "no object" class (the last class score).

    Parameters
    ----------
    input :
        Unused; kept for interface compatibility.
    truth : theano.tensor
        Ground-truth tensor, indexed as (example, object, value) --
        the first four values are box parameters and the last
        `num_classes + 1` values are the class target (layout inferred
        from the indexing below; confirm against the data pipeline).
    alpha : float
        Weight applied to the class and no-object terms.
    min_iou : float
        Minimum IoU for a default box to count as a positive match.

    Returns
    -------
    tuple
        `(cost, [cost_coord, cost_class, cost_noobj])`.
    '''
    num_scores = self.num_classes + 1

    # One-hot target for the non-object class, which is stored last.
    background = theano.shared(
        np.zeros(num_scores, dtype=theano.config.floatX))
    background = T.set_subtensor(background[-1], 1.)
    background = background.dimshuffle('x', 'x', 0, 'x', 'x')

    num_ratios = len(self.ratios)
    cost_coord, cost_class, cost_noobj = 0., 0., 0.

    for i, fmap in enumerate(self._predictive_maps):
        dmap = self._default_maps[i]
        shape = layers.get_output_shape(self.network['detection'][i])[2:]

        # IoU between every default box and every ground-truth object.
        iou_default = self._get_iou(
            dmap.dimshuffle('x', 'x', 0, 1, 2, 3),
            truth.dimshuffle(0, 1, 'x', 2, 'x', 'x'))

        # Best-overlapping object for each default box.
        idx_match = T.argmax(iou_default, axis=1)

        # Broadcast the truth over boxes and cells, then keep, for each
        # default box, only the entry of its best-matching object.
        truth_extended = truth.dimshuffle(0, 1, 'x', 2, 'x', 'x')
        truth_extended = T.repeat(truth_extended, num_ratios, axis=2)
        truth_extended = T.repeat(truth_extended, shape[0], axis=4)
        truth_extended = T.repeat(truth_extended, shape[1], axis=5)
        idx1, idx2, idx3, idx4 = meshgrid(
            T.arange(truth.shape[0]),
            T.arange(num_ratios),
            T.arange(shape[0]),
            T.arange(shape[1]))
        truth_extended = truth_extended[
            idx1, idx_match, idx2, :, idx3, idx4].dimshuffle(0, 1, 4, 2, 3)

        iou_default = iou_default.max(axis=1)
        iou_gt_min = iou_default >= min_iou
        positive = iou_gt_min.nonzero()
        dmap_extended = dmap.dimshuffle('x', 0, 1, 2, 3)

        # Regression targets: offsets are normalised by the default box
        # size, sizes are log-ratios to the default box size.
        targets = (
            (truth_extended[:, :, 0] - dmap_extended[:, :, 0]) / dmap_extended[:, :, 2],
            (truth_extended[:, :, 1] - dmap_extended[:, :, 1]) / dmap_extended[:, :, 3],
            T.log(truth_extended[:, :, 2] / dmap_extended[:, :, 2]),
            T.log(truth_extended[:, :, 3] / dmap_extended[:, :, 3]),
        )
        cost_coord_fmap = 0.
        for k, target in enumerate(targets):
            residual = (fmap[:, :, k] - target)[positive]
            cost_coord_fmap += (residual ** 2).sum()

        # Cross-entropy of the class scores on the positive boxes.
        class_scores = fmap[:, :, -num_scores:]
        cost_class_fmap = -(
            truth_extended[:, :, -num_scores:] * T.log(class_scores)).sum(axis=2)
        cost_class_fmap = cost_class_fmap[positive].sum()

        # Negative candidates: some overlap, but below the match threshold.
        iou_default = iou_default.reshape((-1, ))
        iou_st_min = T.bitwise_and(iou_default >= 0.1, iou_default < min_iou)

        pos_size = iou_gt_min[positive].size
        neg_size = pos_size * 3  # ratio of 3 to 1
        idx_neg = T.arange(iou_default.shape[0])[iou_st_min.nonzero()]
        # Sample with replacement only when there are too few candidates.
        replace = T.le(idx_neg.shape[0], neg_size)
        idx_neg = theano.ifelse.ifelse(
            idx_neg.shape[0] > 0,
            self._random_stream.choice((neg_size, ), a=idx_neg, replace=replace),
            T.arange(0))
        # Avoid dividing by zero when a map has no positives/negatives.
        neg_size, pos_size = T.maximum(1., neg_size), T.maximum(1., pos_size)

        # Cross-entropy against the non-object class on sampled negatives.
        cost_noobj_fmap = -(background * T.log(class_scores)).sum(
            axis=2).reshape((-1, ))
        cost_noobj_fmap = cost_noobj_fmap[idx_neg].sum()

        cost_coord += cost_coord_fmap / pos_size
        cost_class += alpha * cost_class_fmap / pos_size
        cost_noobj += alpha * cost_noobj_fmap / neg_size

    cost = cost_coord + cost_class + cost_noobj
    return cost, [cost_coord, cost_class, cost_noobj]
def _get_cost(self, output, truth, rescore=True):
    '''
    Build the symbolic anchor-based detection loss.

    Every ground-truth object is assigned to one grid cell (through
    `argmin_unique` on the squared distance between the object centre and
    the cell centres) and, within that cell, to the anchor whose IoU with
    the object is largest. The loss is the sum of five terms: a no-object
    penalty on the objectness score, a penalty pulling unmatched
    coordinates towards the anchors, and coordinate, class and objectness
    errors on the matched predictions.

    Parameters
    ----------
    output : theano.tensor
        Network output indexed as (example, anchor, value, row, col);
        values 0-3 are box parameters, value 4 is the objectness score and
        the last `num_classes` values are class scores.
    truth : theano.tensor
        Ground truth indexed as (example, object, value); values 0-3 are
        treated as (x, y, w, h) with (x, y) the box origin, and the last
        `num_classes` values as the class target.
    rescore : bool
        If True the objectness target is the IoU between the decoded
        prediction and the object, otherwise it is 1.

    Returns
    -------
    tuple
        `(cost, [iou], extras)` where `extras` is
        `[row_idx, col_idx, acr_idx, cost_noobject, cost_anchor,
        cost_coord, cost_class, cost_obj]`.

    Notes
    -----
    NOTE(review): the anchors used for matching are placed with their
    origin at the cell centre (not centred on it), whereas decoded
    predictions are shifted by half their size -- confirm this asymmetry
    is intended.
    '''
    # Create every loss weight on its own: sibling cost builders also set
    # `_lambda_obj` but not `_lambda_anchor`, so a single hasattr check on
    # `_lambda_obj` could lead to an AttributeError here.
    for weight_name in ('lambda_obj', 'lambda_noobj', 'lambda_anchor'):
        if not hasattr(self, '_' + weight_name):
            setattr(self, '_' + weight_name, T.scalar(weight_name))
    lambda_obj = self._lambda_obj
    lambda_noobj = self._lambda_noobj
    lambda_anchor = self._lambda_anchor

    def _iou_xywh(boxes, reference):
        # IoU along axis 2 of two (x, y, w, h) tensors.
        xi = T.maximum(boxes[:, :, 0], reference[:, :, 0])
        yi = T.maximum(boxes[:, :, 1], reference[:, :, 1])
        xf = T.minimum(boxes[:, :, [0, 2]].sum(axis=2), reference[:, :, [0, 2]].sum(axis=2))
        yf = T.minimum(boxes[:, :, [1, 3]].sum(axis=2), reference[:, :, [1, 3]].sum(axis=2))
        w, h = T.maximum(xf - xi, 0), T.maximum(yf - yi, 0)
        isec = w * h
        union = T.prod(boxes[:, :, [2, 3]], axis=2) + T.prod(reference[:, :, [2, 3]], axis=2) - isec
        return isec / union

    # Cell centres in normalised image coordinates.
    w_cell, h_cell = 1. / self.output_shape[1], 1. / self.output_shape[0]
    x, y = T.arange(w_cell / 2, 1., w_cell), T.arange(h_cell / 2, 1., h_cell)
    y, x = meshgrid(x, y)
    x, y = x.dimshuffle('x', 'x', 'x', 0, 1), y.dimshuffle('x', 'x', 'x', 0, 1)

    # Anchor sizes, shared by the anchor grid and the decoding step below.
    w_shared = theano.shared(np.asarray([b[0] for b in self.boxes]), name='w_acr')
    h_shared = theano.shared(np.asarray([b[1] for b in self.boxes]), name='h_acr')

    # One (x, y, w, h) anchor per anchor shape and cell, for every example.
    w_grid = w_shared.dimshuffle('x', 0, 'x', 'x', 'x') * T.ones_like(x)
    h_grid = h_shared.dimshuffle('x', 0, 'x', 'x', 'x') * T.ones_like(y)
    anchors = T.concatenate(
        (x * T.ones_like(w_grid), y * T.ones_like(h_grid), w_grid, h_grid), axis=2)
    anchors = T.repeat(anchors, truth.shape[0], axis=0)

    # Squared distance from each object centre to each cell centre.
    cell_coord = T.concatenate((x, y), axis=2)
    gt_coord = (truth[:, :, :2] + truth[:, :, 2:4] / 2).dimshuffle(0, 1, 2, 'x', 'x')
    gt_dist = T.sum((gt_coord - cell_coord) ** 2, axis=2).reshape(
        (truth.shape[0], truth.shape[1], -1))

    # Assign a unique cell to each object of each example.
    cell_idx = argmin_unique(gt_dist, 1, 2).reshape((-1,))
    row_idx = T.cast(cell_idx // self.output_shape[1], 'int64')
    col_idx = cell_idx - row_idx * self.output_shape[1]
    num_idx = T.repeat(
        T.arange(truth.shape[0]).reshape((-1, 1)), truth.shape[1], axis=1).reshape((-1,))
    obj_idx = T.repeat(
        T.arange(truth.shape[1]).reshape((1, -1)), truth.shape[0], axis=0).reshape((-1,))

    # An object further than 1 away from its cell is a garbage example.
    valid_example = gt_dist[num_idx, obj_idx, cell_idx] < 1
    keep = valid_example.nonzero()
    num_idx, obj_idx = num_idx[keep], obj_idx[keep]
    row_idx, col_idx = row_idx[keep], col_idx[keep]

    truth_flat = truth[num_idx, obj_idx, :].dimshuffle(0, 'x', 1)
    pred_matched = output[num_idx, :, :, row_idx, col_idx]
    x = x[:, 0, 0, row_idx, col_idx].dimshuffle(1, 0)
    y = y[:, 0, 0, row_idx, col_idx].dimshuffle(1, 0)
    w_acr = w_shared.dimshuffle('x', 0)
    h_acr = h_shared.dimshuffle('x', 0)

    # Decode the matched predictions to absolute (x, y, w, h) boxes.
    # Sizes are decoded first because the origin shift depends on them.
    pred_shift = pred_matched
    pred_shift = T.set_subtensor(pred_shift[:, :, 2], w_acr * T.exp(pred_shift[:, :, 2]))
    pred_shift = T.set_subtensor(pred_shift[:, :, 3], h_acr * T.exp(pred_shift[:, :, 3]))
    pred_shift = T.set_subtensor(
        pred_shift[:, :, 0],
        pred_shift[:, :, 0] + T.repeat(x, pred_shift.shape[1], axis=1) - pred_shift[:, :, 2] / 2)
    pred_shift = T.set_subtensor(
        pred_shift[:, :, 1],
        pred_shift[:, :, 1] + T.repeat(y, pred_shift.shape[1], axis=1) - pred_shift[:, :, 3] / 2)

    # IoU of the decoded predictions (objectness target when rescoring).
    iou = _iou_xywh(pred_shift, truth_flat)

    # IoU of the raw anchors, used only to pick the responsible anchor.
    anchors_matched = anchors[num_idx, :, :, row_idx, col_idx]
    iou_acr = _iou_xywh(anchors_matched, truth_flat)
    acr_idx = T.argmax(iou_acr, axis=1)

    # Encode the truth in the network's parameterisation: centre offset
    # from the cell centre and log size relative to each anchor.
    truth_formatted = T.repeat(truth_flat, len(self.boxes), axis=1)
    truth_formatted = T.set_subtensor(
        truth_formatted[:, :, 0],
        truth_formatted[:, :, 0] + truth_formatted[:, :, 2] / 2
        - T.repeat(x, truth_formatted.shape[1], axis=1))
    truth_formatted = T.set_subtensor(
        truth_formatted[:, :, 1],
        truth_formatted[:, :, 1] + truth_formatted[:, :, 3] / 2
        - T.repeat(y, truth_formatted.shape[1], axis=1))
    truth_formatted = T.set_subtensor(
        truth_formatted[:, :, 2], T.log(truth_formatted[:, :, 2] / w_acr))
    truth_formatted = T.set_subtensor(
        truth_formatted[:, :, 3], T.log(truth_formatted[:, :, 3] / h_acr))
    truth_formatted = truth_formatted[T.arange(truth_formatted.shape[0]), acr_idx, :]

    #
    # calculate cost
    #
    item_idx = T.arange(pred_matched.shape[0])
    # Prediction of the responsible anchor for every matched object.
    best = pred_matched[item_idx, acr_idx]

    # Penalise every objectness / coordinate value, then subtract the
    # share of the matched predictions so only unmatched ones are penalised.
    cost_noobject = lambda_noobj * (
        T.mean(output[:, :, 4] ** 2)
        - T.sum(best[:, 4] ** 2) / output[:, :, 4].size)
    cost_anchor = lambda_anchor * (
        T.mean(T.sum(output[:, :, :4] ** 2, axis=2))
        - T.sum(T.sum(best[:, :4] ** 2, axis=1)) / output[:, :, 0].size)
    cost_coord = lambda_obj * T.mean(
        T.sum((best[:, :4] - truth_formatted[:, :4]) ** 2, axis=1))
    cost_class = lambda_obj * T.mean(
        T.sum(-truth_formatted[:, -self.num_classes:]
              * T.log(best[:, -self.num_classes:]), axis=1))
    if rescore:
        cost_obj = lambda_obj * T.mean((best[:, 4] - iou[item_idx, acr_idx]) ** 2)
    else:
        cost_obj = lambda_obj * T.mean((best[:, 4] - 1) ** 2)

    cost = cost_noobject + cost_obj + cost_anchor + cost_coord + cost_class
    return cost, [iou], [row_idx, col_idx, acr_idx, cost_noobject,
                         cost_anchor, cost_coord, cost_class, cost_obj]
def _get_cost3(self, output, truth, rescore=True):
    '''
    Build a symbolic anchor-based detection loss (cell chosen by flooring
    the object centre).

    The objectness score of every prediction is penalised, each
    ground-truth object is assigned to the grid cell containing its
    centre and to the anchor whose decoded prediction overlaps it most,
    and coordinate, objectness and class errors are added for the
    matched predictions.

    Parameters
    ----------
    output : theano.tensor
        Network output indexed as (example, anchor, value, row, col);
        values 0-3 are box parameters, value 4 the objectness score and
        the last `num_classes` values the class scores.
    truth : theano.tensor
        Ground truth indexed as (example, object, value) with values 0-3
        treated as (x, y, w, h). Objects whose computed cell index is
        negative are ignored (presumably the padding convention --
        confirm against the data pipeline).
    rescore : bool
        If True the objectness target is the IoU of the prediction with
        the object, otherwise it is 1.

    Returns
    -------
    tuple
        `(cost, [iou])`.

    Notes
    -----
    NOTE(review): `[:, match_idx]` gathers, for every matched object, the
    best-anchor column of *every* object (an M x M result) rather than
    only its own; the sibling `_get_cost` uses `[item_idx, acr_idx]`.
    Left unchanged here because it alters the loss scale -- confirm.

    NOTE(review): `row_idx` is derived from the x centre and
    `output_shape[1]` but indexes axis 4 of the tensors; whether that is
    the right axis depends on the layout produced by `meshgrid` and the
    network -- confirm for non-square grids.
    '''
    # Only the two weights this loss uses are read; the previous version
    # also read `self._thresh`, which this function never creates.
    if not hasattr(self, '_lambda_obj'):
        lambda_obj, lambda_noobj = T.scalar('lambda_obj'), T.scalar('lambda_noobj')
        self._lambda_obj, self._lambda_noobj = lambda_obj, lambda_noobj
    else:
        lambda_obj, lambda_noobj = self._lambda_obj, self._lambda_noobj

    # penalize everything, this will be undone if box matches ground truth
    cost = lambda_noobj * T.mean(output[:, :, 4] ** 2)

    # Grid cell holding the centre of each ground-truth box.
    row_idx = T.cast(
        T.floor((truth[:, :, 0] + 0.5 * truth[:, :, 2]) * self.output_shape[1]), 'int32')
    col_idx = T.cast(
        T.floor((truth[:, :, 1] + 0.5 * truth[:, :, 3]) * self.output_shape[0]), 'int32')
    # Image and object index of every (image, object) pair. Both are laid
    # out image-major so that, once flattened, they line up with
    # row_idx / col_idx. (Repeating a 1-D arange would instead give an
    # object-major order and pair objects with the wrong image.)
    img_idx = T.repeat(
        T.arange(truth.shape[0]).dimshuffle(0, 'x'), truth.shape[1], axis=1)
    obj_idx = T.repeat(
        T.arange(truth.shape[1]).reshape((1, -1)), truth.shape[0], axis=0)

    row_idx = row_idx.reshape((-1,))
    col_idx = col_idx.reshape((-1,))
    img_idx = img_idx.reshape((-1,))
    obj_idx = obj_idx.reshape((-1,))

    # use only valid indices (i.e. greater or equal to zero)
    valid_idx = T.bitwise_and(row_idx >= 0, col_idx >= 0).reshape((-1,))
    keep = valid_idx.nonzero()
    row_idx = row_idx[keep]
    col_idx = col_idx[keep]
    img_idx = img_idx[keep]
    obj_idx = obj_idx[keep]

    # Broadcast output over objects and truth over anchors and cells so
    # both are indexed as (example, object, anchor, value, row, col).
    output = output.dimshuffle(0, 'x', 1, 2, 3, 4)
    truth = truth.dimshuffle(0, 1, 'x', 2, 'x', 'x')
    output = T.repeat(output, truth.shape[1], axis=1)
    truth = T.repeat(truth, len(self.boxes), axis=2)
    truth = T.repeat(
        T.repeat(truth, self.output_shape[0], axis=4), self.output_shape[1], axis=5)

    # Cell offsets added to the predicted x, y coordinates.
    x_diff, y_diff = 1. / self.output_shape[0], 1. / self.output_shape[1]
    y, x = meshgrid(T.arange(0 + x_diff / 2, 1, x_diff), T.arange(0 + y_diff / 2, 1, y_diff))
    x, y = x.dimshuffle('x', 'x', 0, 1), y.dimshuffle('x', 'x', 0, 1)

    # scaling from each anchor box
    x_scale = theano.shared(
        np.asarray([b[0] for b in self.boxes]), name='x_scale',
        borrow=True).dimshuffle('x', 0, 'x', 'x')
    y_scale = theano.shared(
        np.asarray([b[1] for b in self.boxes]), name='y_scale',
        borrow=True).dimshuffle('x', 0, 'x', 'x')

    # change predicted output to proper scale
    pred = T.set_subtensor(output[:, :, :, 0], output[:, :, :, 0] + x)
    pred = T.set_subtensor(pred[:, :, :, 1], pred[:, :, :, 1] + y)
    pred = T.set_subtensor(pred[:, :, :, 2], x_scale * T.exp(pred[:, :, :, 2]))
    pred = T.set_subtensor(pred[:, :, :, 3], y_scale * T.exp(pred[:, :, :, 3]))

    def _at_cells(tensor, channel):
        # Value(s) `channel` of every anchor at each object's cell.
        return tensor[img_idx, obj_idx, :, channel, row_idx, col_idx]

    # determine iou of chosen boxes
    xi = T.maximum(_at_cells(pred, 0), _at_cells(truth, 0))
    yi = T.maximum(_at_cells(pred, 1), _at_cells(truth, 1))
    xf = T.minimum(
        _at_cells(pred, 0) + _at_cells(pred, 2),
        _at_cells(truth, 0) + _at_cells(truth, 2))
    yf = T.minimum(
        _at_cells(pred, 1) + _at_cells(pred, 3),
        _at_cells(truth, 1) + _at_cells(truth, 3))
    w, h = T.maximum(xf - xi, 0.), T.maximum(yf - yi, 0.)
    isec = w * h
    iou = isec / (
        _at_cells(pred, 2) * _at_cells(pred, 3)
        + _at_cells(truth, 2) * _at_cells(truth, 3)
        - isec)

    # get index for matched boxes
    match_idx = T.argmax(iou, axis=1)

    # Re-express the truth relative to the cell offsets, with width and
    # height on a log scale relative to the anchor size.
    truth = T.set_subtensor(truth[:, :, :, 0, :, :], truth[:, :, :, 0, :, :] - x)
    truth = T.set_subtensor(truth[:, :, :, 1, :, :], truth[:, :, :, 1, :, :] - y)
    truth = T.set_subtensor(truth[:, :, :, 2, :, :], T.log(truth[:, :, :, 2, :, :] / x_scale))
    truth = T.set_subtensor(truth[:, :, :, 3, :, :], T.log(truth[:, :, :, 3, :, :] / y_scale))

    # correct for matched boxes
    cost -= lambda_noobj * T.mean(_at_cells(output, 4)[:, match_idx] ** 2)

    # coordinate errors
    for channel in range(4):
        cost += lambda_obj * T.mean(
            (_at_cells(output, channel)[:, match_idx]
             - _at_cells(truth, channel)[:, match_idx]) ** 2)

    # objectness error
    if rescore:
        cost += lambda_obj * T.mean(
            (_at_cells(output, 4)[:, match_idx] - iou[:, match_idx]) ** 2)
    else:
        cost += lambda_obj * T.mean(
            (_at_cells(output, 4)[:, match_idx] - 1) ** 2)

    # class error
    class_slice = slice(-self.num_classes, None)
    cost += lambda_obj * T.mean(
        -_at_cells(truth, class_slice)[:, match_idx]
        * T.log(_at_cells(output, class_slice)[:, match_idx]))

    return cost, [iou]
def _get_cost2(self, output, truth, rescore=True):
    '''
    Build a symbolic anchor-based detection loss with dense matching.

    An (object, anchor, cell) triple is a match when the anchor is the
    best-IoU anchor for the object in that cell, the object is the
    best-IoU object for that anchor, and the object covers at least
    `thresh` of the cell area. Matched triples contribute coordinate,
    class and objectness errors; anchors matched to no object have their
    objectness score penalised.

    Parameters
    ----------
    output : theano.tensor
        Network output indexed as (example, anchor, value, row, col);
        values 0-3 are box parameters, value 4 the objectness score and
        the last `num_classes` values the class scores.
    truth : theano.tensor
        Ground truth indexed as (example, object, value) with values 0-3
        treated as (x, y, w, h).
    rescore : bool
        If True the objectness target is the IoU of the decoded
        prediction with the object, otherwise it is 1.

    Returns
    -------
    tuple
        `(cost, constants)`; `constants` holds the rescoring IoU (to be
        treated as constant when differentiating) or is empty.

    Notes
    -----
    Each term is a mean over the selected entries, so a batch with no
    match yields a mean over an empty selection.
    '''
    # Create each symbolic hyper-parameter on its own: sibling cost
    # builders also set `_lambda_obj` but not `_thresh`, so a single
    # hasattr check on `_lambda_obj` could lead to an AttributeError.
    for param_name in ('lambda_obj', 'lambda_noobj', 'thresh'):
        if not hasattr(self, '_' + param_name):
            setattr(self, '_' + param_name, T.scalar(param_name))
    lambda_obj, lambda_noobj, thresh = self._lambda_obj, self._lambda_noobj, self._thresh

    num_boxes = len(self.boxes)

    def _iou_xywh(first, second):
        # IoU along axis 3 of two broadcastable (x, y, w, h) tensors.
        xi = T.maximum(first[:, :, :, 0], second[:, :, :, 0])
        yi = T.maximum(first[:, :, :, 1], second[:, :, :, 1])
        xf = T.minimum(first[:, :, :, [0, 2]].sum(axis=3), second[:, :, :, [0, 2]].sum(axis=3))
        yf = T.minimum(first[:, :, :, [1, 3]].sum(axis=3), second[:, :, :, [1, 3]].sum(axis=3))
        w, h = T.maximum(xf - xi, 0.), T.maximum(yf - yi, 0.)
        isec = w * h
        return isec / (
            T.prod(first[:, :, :, [2, 3]], axis=3)
            + T.prod(second[:, :, :, [2, 3]], axis=3)
            - isec)

    # create grid for cells (cell centres in normalised coordinates)
    w_cell, h_cell = 1. / self.output_shape[1], 1. / self.output_shape[0]
    x, y = T.arange(w_cell / 2, 1., w_cell), T.arange(h_cell / 2, 1., h_cell)
    y, x = meshgrid(x, y)

    # reshape truth to match with cell
    truth_cell = truth.dimshuffle(0, 1, 2, 'x', 'x')
    x, y = x.dimshuffle('x', 'x', 0, 1), y.dimshuffle('x', 'x', 0, 1)

    # Fraction of every cell covered by every ground-truth box.
    xi = T.maximum(truth_cell[:, :, 0], x - w_cell / 2)
    yi = T.maximum(truth_cell[:, :, 1], y - h_cell / 2)
    xf = T.minimum(truth_cell[:, :, [0, 2]].sum(axis=2), x + w_cell / 2)
    yf = T.minimum(truth_cell[:, :, [1, 3]].sum(axis=2), y + h_cell / 2)
    w, h = T.maximum(xf - xi, 0), T.maximum(yf - yi, 0)
    overlap = (w * h) / (w_cell * h_cell)

    truth_boxes = truth.dimshuffle(0, 1, 'x', 2, 'x', 'x')

    # Anchor grid: origin at each cell's corner, size from self.boxes.
    anchors = T.concatenate(
        (x.dimshuffle(0, 1, 'x', 'x', 2, 3) - w_cell / 2,
         y.dimshuffle(0, 1, 'x', 'x', 2, 3) - h_cell / 2), axis=3)
    anchors = T.concatenate((anchors, T.ones_like(anchors)), axis=3)
    anchors = T.repeat(anchors, num_boxes, axis=2)
    w_acr = theano.shared(
        np.asarray([b[0] for b in self.boxes]), name='w_acr',
        borrow=True).dimshuffle('x', 'x', 0, 'x', 'x')
    h_acr = theano.shared(
        np.asarray([b[1] for b in self.boxes]), name='h_acr',
        borrow=True).dimshuffle('x', 'x', 0, 'x', 'x')
    anchors = T.set_subtensor(anchors[:, :, :, 2], anchors[:, :, :, 2] * w_acr)
    anchors = T.set_subtensor(anchors[:, :, :, 3], anchors[:, :, :, 3] * h_acr)

    # find iou between anchors and ground truths
    iou = _iou_xywh(truth_boxes, anchors)

    overlap = overlap.dimshuffle(0, 1, 'x', 2, 3)
    best_iou_obj_idx = T.argmax(iou, axis=1).dimshuffle(0, 'x', 1, 2, 3)
    best_iou_box_idx = T.argmax(iou, axis=2).dimshuffle(0, 1, 'x', 2, 3)

    _, obj_idx, box_idx, _, _ = meshgrid(
        T.arange(truth.shape[0]),
        T.arange(truth.shape[1]),
        T.arange(num_boxes),
        T.arange(self.output_shape[0]),
        T.arange(self.output_shape[1])
    )

    # define logical matrix assigning object to correct anchor box and cell.
    best_iou_idx = T.bitwise_and(
        T.bitwise_and(
            T.eq(best_iou_box_idx, box_idx),
            T.eq(best_iou_obj_idx, obj_idx)
        ),
        overlap >= thresh
    )

    constants = []
    if rescore:
        # Decode the predictions so their IoU with the (still absolute)
        # truth boxes can serve as the objectness target.
        pred = output.dimshuffle(0, 'x', 1, 2, 3, 4)
        pred = T.set_subtensor(pred[:, :, :, 0], pred[:, :, :, 0] + x.dimshuffle(0, 1, 'x', 2, 3))
        pred = T.set_subtensor(pred[:, :, :, 1], pred[:, :, :, 1] + y.dimshuffle(0, 1, 'x', 2, 3))
        pred = T.set_subtensor(pred[:, :, :, 2], w_acr * T.exp(pred[:, :, :, 2]))
        pred = T.set_subtensor(pred[:, :, :, 3], h_acr * T.exp(pred[:, :, :, 3]))
        iou = _iou_xywh(pred, truth_boxes)

        # make sure iou is considered constant when taking gradient
        constants.append(iou)

    # Encode the ground truth relative to the anchors: offset from the
    # anchor origin and log size ratio.
    truth_boxes = T.repeat(
        T.repeat(
            T.repeat(truth_boxes, num_boxes, axis=2),
            self.output_shape[0], axis=4
        ),
        self.output_shape[1], axis=5
    )
    truth_boxes = T.set_subtensor(
        truth_boxes[:, :, :, 0], truth_boxes[:, :, :, 0] - anchors[:, :, :, 0])
    truth_boxes = T.set_subtensor(
        truth_boxes[:, :, :, 1], truth_boxes[:, :, :, 1] - anchors[:, :, :, 1])
    truth_boxes = T.set_subtensor(
        truth_boxes[:, :, :, 2], T.log(truth_boxes[:, :, :, 2] / anchors[:, :, :, 2]))
    truth_boxes = T.set_subtensor(
        truth_boxes[:, :, :, 3], T.log(truth_boxes[:, :, :, 3] / anchors[:, :, :, 3]))

    # add dimension for objects per image
    pred = T.repeat(output.dimshuffle(0, 'x', 1, 2, 3, 4), truth.shape[1], axis=1)
    matched = best_iou_idx.nonzero()

    # penalize coordinates
    cost = lambda_obj * T.mean(
        ((pred[:, :, :, :4] - truth_boxes[:, :, :, :4]) ** 2).sum(axis=3)[matched])

    # penalize class scores
    cost += lambda_obj * T.mean(
        (-truth_boxes[:, :, :, -self.num_classes:]
         * T.log(pred[:, :, :, -self.num_classes:])).sum(axis=3)[matched])

    # penalize objectness score
    if rescore:
        cost += lambda_obj * T.mean(((pred[:, :, :, 4] - iou) ** 2)[matched])
    else:
        cost += lambda_obj * T.mean(((pred[:, :, :, 4] - 1.) ** 2)[matched])

    # Anchors matched to no object at all. An explicit comparison with
    # zero is used instead of a bitwise NOT, which on a non-boolean 0/1
    # mask would produce -1/-2 and therefore select every entry.
    not_matched_idx = T.eq(best_iou_idx.sum(axis=1), 0)

    # penalize objectness score for non-matched boxes
    cost += lambda_noobj * T.mean((pred[:, 0, :, 4] ** 2)[not_matched_idx.nonzero()])

    return cost, constants
def detect(self, im, thresh=0.75, overlap=0.5, num_to_label=None, return_iou=False):
    '''
    Run the detector on one image and return the boxes above `thresh`.

    The image is resized to `self.input_shape`, passed through a Theano
    function that is compiled on first use and cached in
    `self._detect_fn`, and every prediction whose confidence
    (objectness times best class score) exceeds `thresh` is turned into
    a `utils.BoundingBox` scaled by the original image size.

    Parameters
    ----------
    im :
        Input image; converted with `format_image`.
    thresh : float
        Minimum confidence for a prediction to be kept.
    overlap : float
        Unused; kept for interface compatibility.
    num_to_label : mapping or sequence, optional
        If given, maps the predicted class index to a label.
    return_iou : bool
        If True also return the pairwise IoU matrix of the kept boxes.

    Returns
    -------
    list or tuple
        The list of boxes, or `(boxes, iou_matrix)` when `return_iou`
        is True.
    '''
    im = format_image(im, dtype=theano.config.floatX)
    old_size = im.shape[:2]
    im = cv2.resize(im, self.input_shape[::-1], interpolation=cv2.INTER_LINEAR)
    # Height-width-channel -> a single-image channel-first batch.
    im = im.swapaxes(2, 1).swapaxes(1, 0).reshape((1, 3) + self.input_shape)

    if not hasattr(self, '_detect_fn'):
        # Make theano do all the heavy lifting for detection, this should
        # speed up the process marginally.
        output = self.output_test
        if self.use_custom_cost:
            # Split the flat channel axis into one group of
            # (5 + num_classes) values per anchor box.
            stride = 5 + self.num_classes
            output = T.concatenate(
                [
                    output[:, T.arange(i * stride, (i + 1) * stride), :, :].dimshuffle(0, 'x', 1, 2, 3)
                    for i in range(len(self.boxes))
                ],
                axis=1
            )

        thresh_var = T.scalar(name='thresh')

        # define offsets to predictions
        w_cell, h_cell = 1. / self.output_shape[1], 1. / self.output_shape[0]
        x, y = T.arange(w_cell / 2, 1., w_cell), T.arange(h_cell / 2, 1., h_cell)
        y, x = meshgrid(x, y)
        x, y = x.dimshuffle('x', 'x', 0, 1), y.dimshuffle('x', 'x', 0, 1)

        # define scale
        w_acr = theano.shared(
            np.asarray([b[0] for b in self.boxes]), name='w_acr',
            borrow=True).dimshuffle('x', 0, 'x', 'x')
        h_acr = theano.shared(
            np.asarray([b[1] for b in self.boxes]), name='h_acr',
            borrow=True).dimshuffle('x', 0, 'x', 'x')

        # Decode to absolute boxes: sizes first, then the origin (which
        # needs the sizes), and finally sizes -> far corners so that
        # columns 0:4 are (x_i, y_i, x_f, y_f).
        output = T.set_subtensor(output[:, :, 2], w_acr * T.exp(output[:, :, 2]))
        output = T.set_subtensor(output[:, :, 3], h_acr * T.exp(output[:, :, 3]))
        output = T.set_subtensor(output[:, :, 0], output[:, :, 0] + x - output[:, :, 2] / 2)
        output = T.set_subtensor(output[:, :, 1], output[:, :, 1] + y - output[:, :, 3] / 2)
        output = T.set_subtensor(output[:, :, 2:4], output[:, :, 2:4] + output[:, :, :2])

        # define confidence in prediction
        conf = output[:, :, 4] * T.max(output[:, :, -self.num_classes:], axis=2)
        cls = T.argmax(output[:, :, -self.num_classes:], axis=2)

        # filter out all below thresh
        above_thresh_idx = conf > thresh_var
        keep = above_thresh_idx.nonzero()
        columns = (
            output[:, :, 0], output[:, :, 1], output[:, :, 2], output[:, :, 3],
            conf, cls,
        )
        pred = T.concatenate(
            [column[keep].dimshuffle(0, 'x') for column in columns], axis=1)

        iou_matrix = utils.iou_matrix(pred)

        self._detect_fn = theano.function([self.input, thresh_var], [pred, iou_matrix])

    output, iou_matrix = self._detect_fn(im, thresh)

    boxes = []
    for row in output:
        # The compiled graph already stores far corners in columns 2:4,
        # so the coordinates are used as-is (adding the origin again
        # here would shift the far corner a second time).
        coord, conf, cls = row[:4], row[4], row[5]
        if num_to_label is not None:
            # The class index comes back as a float column of `output`.
            cls = num_to_label[int(cls)]
        boxes.append(utils.BoundingBox(*coord.tolist(), confidence=conf, cls=cls))

    boxes = [b * old_size for b in boxes]

    if return_iou:
        return boxes, iou_matrix
    return boxes