def __init__(self, net, checkpoint, cfg):
    super().__init__("TrackerDefault")
    self.cfg = cfg
    self.net = net
    if checkpoint is not None:
        utils.load_checkpoint(checkpoint, self.net)
    self.net.eval()
    self.anchors = utils.generate_anchors(cfg)
    if torch.cuda.is_available():
        self.net.cuda()
        self.anchors = self.anchors.cuda()
    self.z_transform = Compose([
        ToAbsoluteCoords(),
        Crop(context_amount=cfg.TRAIN.CROP_CONTEXT_AMOUNT_Z, make_square=False),
        ToPercentCoords(),
        Resize(cfg.MODEL.Z_SIZE),
    ])
    self.x_crop = Crop(context_amount=cfg.TRAIN.CROP_CONTEXT_AMOUNT_X,
                       return_rect=True, make_square=True)
    self.x_resize = Resize(size=cfg.MODEL.X_SIZE)
    self.z_crop = Crop(context_amount=cfg.TRAIN.CROP_CONTEXT_AMOUNT_Z,
                       return_rect=True, make_square=False)
    self.z_resize = Resize(size=cfg.MODEL.Z_SIZE)
    self.criterion = MultiBoxLoss(self.anchors, self.cfg)
def __init__(self, resized_image_size, test_images_list=cfg.TEST_IMAGE_LIST.copy(),
             anchor_box_file=cfg.ANCHOR_BOXES_STORE,
             subsampled_ratio=cfg.SUBSAMPLED_RATIO, transform=None):
    self.anchor_box_file = anchor_box_file
    self.subsampled_ratio = subsampled_ratio
    self.resized_image_size = resized_image_size
    self.test_images_list = test_images_list
    self.transform = transform
    with open(self.anchor_box_file, 'r') as anchor_file:
        anchor_tuples = anchor_file.read().replace('\n', '')
    # NOTE: the original `assert (condition, message)` asserted a non-empty
    # tuple, which is always truthy; the parentheses must be dropped.
    assert len(anchor_tuples) > 0, "Anchor data is empty!"
    self.all_anchor_sizes = dict(eval(anchor_tuples))
    self.anchor_sizes = np.asarray(eval(self.all_anchor_sizes[str(self.resized_image_size)]),
                                   dtype=np.float32)
    self.anchors_list = generate_anchors(anchor_sizes=self.anchor_sizes,
                                         subsampled_ratio=self.subsampled_ratio,
                                         resized_image_size=self.resized_image_size)
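
# --- Sketch (illustration, not part of the snippet above) ---------------------
# The two `eval` calls above execute whatever text happens to be in the anchor
# file. A minimal sketch of a safer parse, assuming the file stores repr'd
# (image_size, anchor_sizes) pairs as implied by the constructor above;
# ast.literal_eval only accepts Python literals, so arbitrary code cannot run.
import ast

import numpy as np


def load_anchor_sizes(anchor_box_file, resized_image_size):
    """Parse the anchor store without eval; the file layout is assumed."""
    with open(anchor_box_file, 'r') as f:
        text = f.read().replace('\n', '')
    assert len(text) > 0, "Anchor data is empty!"
    all_anchor_sizes = dict(ast.literal_eval(text))
    # values may themselves be repr'd lists, mirroring the double eval above
    sizes = all_anchor_sizes[str(resized_image_size)]
    if isinstance(sizes, str):
        sizes = ast.literal_eval(sizes)
    return np.asarray(sizes, dtype=np.float32)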
def initialize(self, feat, label_map, gt_bbox):
    # build initial target classifier
    init_params, init_lrs = self.meta_init.initialize()
    self.meta_opti.initialize(init_lrs)
    target_cell_sz = torch.ceil(gt_bbox[:, 0, 2:] / config.cell_sz)
    filter_size = compute_filter_size(target_cell_sz)
    base_anchor_sizes = filter_size / config.filter_scale * config.cell_sz
    self.map_size = feat.shape[3]
    self.anchors = generate_anchors(self.map_size, base_anchor_sizes)

    # calculate initial update loss
    pred_map, pred_bbox = adaptable_conv2d(feat, init_params, filter_size)
    map_loss = self.l2loss(pred_map, label_map)
    bbox_loss = self.smoothl1loss(pred_bbox, gt_bbox, self.anchors)
    loss = map_loss + bbox_loss

    # meta update
    grads = torch.autograd.grad(loss, init_params)
    self.updated_params = init_params - init_lrs * grads[0]
    pred_map, pred_bbox = adaptable_conv2d(feat, self.updated_params, filter_size)
    lh_map_loss = self.l2loss(pred_map, label_map)
    lh_bbox_loss = self.smoothl1loss(pred_bbox, gt_bbox, self.anchors)
    lh_loss = lh_map_loss + lh_bbox_loss
    print(map_loss.data.item(), lh_map_loss.data.item())
    print(bbox_loss.data.item(), lh_bbox_loss.data.item())
    print(loss.data.item(), lh_loss.data.item())

    self.updating_feats = []
    self.updating_maps = []
    self.updating_bboxes = []
    self.filter_size = filter_size
    self.ref_score = np.max(pred_map.data.cpu().numpy())
def anchorboxes(self):
    from utils import generate_anchors
    layer_anchors = []
    for feat_shape, step, scale, ratio in zip(self.params.feat_shapes,
                                              self.params.anchor_steps,
                                              self.params.anchor_scales,
                                              self.params.anchor_aspectratios):
        layer_anchors.append(generate_anchors(feat_shape, step, scale, ratio))
    return layer_anchors
class FaceDetector:
    # load the model
    ort_session = onnxruntime.InferenceSession(
        "./face_recognition/data/ssd_mini_w360.onnx")

    # anchor configuration
    feature_map_sizes = [[45, 45], [23, 23], [12, 12], [6, 6], [4, 4]]
    anchor_sizes = [[0.04, 0.056], [0.08, 0.11], [0.16, 0.22],
                    [0.32, 0.45], [0.64, 0.72]]
    anchor_ratios = [[1, 0.62, 0.42]] * 5

    # generate anchors
    anchors = generate_anchors(feature_map_sizes, anchor_sizes, anchor_ratios)
    # for inference the batch size is 1 and the model output shape is [1, N, 4],
    # so we expand a dim for the anchors to [1, anchor_num, 4]
    anchors_exp = np.expand_dims(anchors, axis=0)
    id2class = {0: 'Mask', 1: 'NoMask'}

    def detect(self, image, conf_thresh=0.6, iou_thresh=0.4, target_shape=(360, 360)):
        height, width, _ = image.shape
        image_resized = cv2.resize(image, target_shape)
        image_np = image_resized / 255.0
        image_exp = np.expand_dims(image_np, axis=0)
        image_transposed = image_exp.transpose((0, 3, 1, 2)).astype(np.float32)
        ort_inputs = {self.ort_session.get_inputs()[0].name: image_transposed}
        y_bboxes_output, y_cls_output = self.ort_session.run(None, ort_inputs)
        # remove the batch dimension, since the batch is always 1 for inference
        y_bboxes = decode_bbox(self.anchors_exp, y_bboxes_output)[0]
        y_cls = y_cls_output[0]
        # to speed up, do single-class NMS instead of multi-class NMS
        bbox_max_scores = np.max(y_cls, axis=1)
        bbox_max_score_classes = np.argmax(y_cls, axis=1)
        # keep_idxs are the surviving bounding boxes after NMS
        keep_idxs = single_class_non_max_suppression(y_bboxes, bbox_max_scores,
                                                     conf_thresh, iou_thresh)
        max_area, r_item = -1, None
        for idx in keep_idxs:
            # conf = float(bbox_max_scores[idx])
            class_id = bbox_max_score_classes[idx]
            bbox = y_bboxes[idx]
            # clip the coordinates so the values do not exceed the image boundary
            xmin = max(0, int(bbox[0] * width))
            ymin = max(0, int(bbox[1] * height))
            xmax = min(int(bbox[2] * width), width)
            ymax = min(int(bbox[3] * height), height)
            item = (xmin, ymin, xmax, ymax), class_id
            area = (xmax - xmin) * (ymax - ymin)
            if max_area < area:
                max_area, r_item = area, item
        return r_item
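
# --- Usage sketch (illustration, not part of the snippet above) ---------------
# A minimal usage sketch for the detector above, assuming an OpenCV BGR image
# on disk (the model path baked into the class must exist). `detect` returns
# the largest surviving box or None. "sample.jpg" is a hypothetical input.
import cv2

detector = FaceDetector()
frame = cv2.imread("sample.jpg")  # hypothetical input image
result = detector.detect(frame, conf_thresh=0.6, iou_thresh=0.4)
if result is not None:
    (xmin, ymin, xmax, ymax), class_id = result
    label = FaceDetector.id2class[class_id]
    cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)
    cv2.putText(frame, label, (xmin, ymin - 4),
                cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 1)
    cv2.imwrite("sample_out.jpg", frame)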
def __init__(self, args):
    super().__init__()
    self.args = args
    self.dropout = args.dropout
    self.max_num_frames = args.max_num_frames
    self.anchors = generate_anchors(dataset=args.dataset)
    self.num_anchors = self.anchors.shape[0]
    widths = (self.anchors[:, 1] - self.anchors[:, 0] + 1)  # [num_anchors]
    centers = np.arange(0, args.max_num_frames)  # [video_len]
    start = np.expand_dims(centers, 1) - 0.5 * (np.expand_dims(widths, 0) - 1)
    end = np.expand_dims(centers, 1) + 0.5 * (np.expand_dims(widths, 0) - 1)
    self.proposals = np.stack([start, end], -1)  # [video_len, num_anchors, 2]
    # VideoEncoder
    self.video_encoder = VideoEncoder(args)
    # SentenceEncoder
    self.sentence_encoder = SentenceEncoder(args)
    # attentive graph
    self.atten = CoAttention(args.d_model, args.d_model)
    self.intra_v = CoAttention_intra(args.max_num_frames, args.d_model)
    self.intra_s = CoAttention_intra(args.max_num_words, args.d_model)
    self.update_v = ConvGRUCell(args.d_model, args.d_model)
    self.update_s = ConvGRUCell(args.d_model, args.d_model)
    self.update_v_intra = ConvGRUCell(args.d_model, args.d_model)
    self.update_s_intra = ConvGRUCell(args.d_model, args.d_model)
    self.v2s = TanhAttention(args.d_model)
    self.rnn = DynamicGRU(args.d_model << 1, args.d_model >> 1,
                          bidirectional=True, batch_first=True)
    self.fc_score = nn.Conv1d(args.d_model, self.num_anchors,
                              kernel_size=1, padding=0, stride=1)
    self.fc_reg = nn.Conv1d(args.d_model, self.num_anchors << 1,
                            kernel_size=1, padding=0, stride=1)
    # loss function
    self.criterion1 = nn.BCELoss()
    self.criterion2 = nn.SmoothL1Loss()
def rpn_model_fn(features, labels, mode):
    feature_maps = ResNet_w_FPN.forward(features['img'])
    pred_rpn_anchor_logits, pred_rpn_anchor_probs, pred_rpn_anchor_deltas = \
        RPN.forward(feature_maps)
    class_loss, bbox_loss = tf.map_fn(
        RPN.rpn_loss,
        (pred_rpn_anchor_logits, pred_rpn_anchor_deltas, labels['rpn_labels'],
         labels['rpn_deltas'], labels['rpn_mask'], labels['rpn_positive_range'],
         labels['rpn_positive_mask']),
        dtype=(tf.float32, tf.float32))
    class_loss = tf.reduce_sum(class_loss)
    # shape = tf.shape(bbox_loss, name='shape')
    bbox_loss = tf.reduce_sum(bbox_loss)
    loss = tf.identity(class_loss + bbox_loss, name='loss')
    all_anchors = tf.convert_to_tensor(
        utils.generate_anchors(format=utils.BBOX_FORMAT.YXYX), dtype=tf.float32)
    all_anchors = tf.tile(tf.expand_dims(all_anchors, 0),
                          [tf.shape(pred_rpn_anchor_logits)[0], 1, 1])
    proposals = tf.map_fn(
        RPN.inference,
        (pred_rpn_anchor_probs, pred_rpn_anchor_deltas, all_anchors),
        dtype=tf.float32)  # get proposals in YXYX format
    proposals = tf.identity(proposals, name='proposals')
    rois, target_class, target_deltas, target_mask = tf.map_fn(
        utils.generate_mask_rcnn_x_y_tf,
        (proposals, labels['bboxs'], labels['cls'], labels['masks'],
         labels['valid_label_ranges']),
        dtype=(tf.float32, tf.int32, tf.float32, tf.float32))
    # rois = tf.Print(rois, [tf.shape(rois), tf.shape(target_class), tf.shape(target_deltas), tf.shape(target_mask)])
    mrcnn_cls_bbox_in = RoiAlign.forward(rois, feature_maps[:-1],
                                         config.CLS_BBOX_ROI_POOL_SIZE)
    mrcnn_mask_in = RoiAlign.forward(rois, feature_maps[:-1],
                                     config.MASK_ROI_POOL_SIZE)
    mrcnn_mask_out_logits, _ = rcnn_head.forward_mask(mrcnn_mask_in)
    mrcnn_cls_logits, _, mrcnn_deltas = rcnn_head.forward_cls_bbox(mrcnn_cls_bbox_in)
    cls_loss, bbox_loss, mask_loss = tf.map_fn(
        rcnn_head.mrcnn_loss,
        (mrcnn_cls_logits, mrcnn_deltas, mrcnn_mask_out_logits, target_class,
         target_deltas, target_mask, rois),
        dtype=(tf.float32, tf.float32, tf.float32))
    # return proposals, rois, target_class, target_mask, mrcnn_cls_bbox_in, mrcnn_deltas, cls_loss, bbox_loss, mask_loss
    global_step = tf.train.get_global_step()
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer()
        train_op = optimizer.minimize(loss, global_step=global_step)
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
def __init__(self, model_path, gpu_id):
    self.gpu_id = gpu_id
    with torch.cuda.device(gpu_id):
        self.model = SiamRPN()
        self.model.load_model(model_path)
        self.model = self.model.cuda()
        self.model.eval()
    self.response_sz = config.response_sz
    self.anchors = generate_anchors(config.total_stride, config.anchor_base_size,
                                    config.anchor_scales, config.anchor_ratios,
                                    self.response_sz)
    self.transforms = transforms.Compose([ToTensor()])
def __init__(self, feature_path, data_path, word2vec, max_num_frames,
             max_num_words, max_num_nodes, is_training=True):
    data = load_json(data_path)
    super().__init__(feature_path, data, word2vec, is_training)
    self.max_num_frames = max_num_frames
    self.max_num_words = max_num_words
    self.max_num_nodes = max_num_nodes
    self.anchors = generate_anchors(dataset='ActivityNet')
    widths = (self.anchors[:, 1] - self.anchors[:, 0] + 1)  # [num_anchors]
    centers = np.arange(0, max_num_frames)  # [video_len]
    start = np.expand_dims(centers, 1) - 0.5 * (np.expand_dims(widths, 0) - 1)
    end = np.expand_dims(centers, 1) + 0.5 * (np.expand_dims(widths, 0) - 1)
    self.proposals = np.stack([start, end], -1)  # [video_len, num_anchors, 2]
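
# --- Sketch (illustration, not part of the snippet above) ---------------------
# The proposal construction above (also used in the earlier model constructor)
# centers every anchor width at every frame index. A self-contained sketch
# with toy anchors, assuming nothing beyond numpy, shows the resulting shapes.
import numpy as np

# hypothetical 1-D temporal anchors as (start, end) frame offsets
anchors = np.array([[0, 7], [0, 15], [0, 31]])
max_num_frames = 4

widths = anchors[:, 1] - anchors[:, 0] + 1          # [num_anchors] -> [8, 16, 32]
centers = np.arange(0, max_num_frames)              # [video_len]
start = np.expand_dims(centers, 1) - 0.5 * (np.expand_dims(widths, 0) - 1)
end = np.expand_dims(centers, 1) + 0.5 * (np.expand_dims(widths, 0) - 1)
proposals = np.stack([start, end], -1)              # [video_len, num_anchors, 2]

print(proposals.shape)  # (4, 3, 2)
print(proposals[0, 0])  # [-3.5, 3.5]: an 8-frame window centered on frame 0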
def get_anchors(self, image_shape):
    """Returns anchor pyramid for the given image size."""
    feature_map_size = image_shape[0] // config.RPN_DOWNSCALE
    # Cache anchors and reuse if image shape is the same
    if tuple(image_shape) not in self._anchor_cache:
        # Generate Anchors
        a = utils.generate_anchors(config.RPN_ANCHOR_HEIGHTS,
                                   config.RPN_ANCHOR_WIDTHS,
                                   feature_map_size,
                                   config.RPN_DOWNSCALE,
                                   config.RPN_ANCHOR_STRIDE)
        # Normalize coordinates
        self._anchor_cache[tuple(image_shape)] = utils.norm_boxes(a, image_shape[:2])
    return self._anchor_cache[tuple(image_shape)]
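
# --- Sketch (illustration, not part of the snippet above) ---------------------
# `utils.generate_anchors` is not shown in this snippet. A minimal sketch of
# what a heights/widths anchor grid with this signature could look like; this
# is an assumption, not the repo's actual implementation. It pairs each
# (height, width) via zip, centers the pair on every feature-map cell, and
# returns (y1, x1, y2, x2) boxes in image coordinates.
import numpy as np


def generate_anchors_sketch(heights, widths, feature_map_size,
                            downscale, anchor_stride=1):
    """Assumed behavior: one anchor per (height, width) pair per cell."""
    heights = np.asarray(heights, dtype=np.float32)
    widths = np.asarray(widths, dtype=np.float32)
    # centers of the feature-map cells, scaled back to image coordinates
    coords = (np.arange(0, feature_map_size, anchor_stride) + 0.5) * downscale
    cy, cx = np.meshgrid(coords, coords, indexing='ij')
    cy, cx = cy.ravel(), cx.ravel()
    boxes = []
    for h, w in zip(heights, widths):
        boxes.append(np.stack([cy - h / 2, cx - w / 2,
                               cy + h / 2, cx + w / 2], axis=1))
    return np.concatenate(boxes, axis=0)  # [cells * num_sizes, (y1, x1, y2, x2)]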
def __init__(self, resized_image_size, k=cfg.K, classes=cfg.CLASSES.copy(),
             list_images=cfg.LIST_IMAGES.copy(),
             list_annotations=cfg.LIST_ANNOTATIONS.copy(),
             total_images=cfg.TOTAL_IMAGES, subsampled_ratio=cfg.SUBSAMPLED_RATIO,
             detection_conv_size=cfg.DETECTION_CONV_SIZE,
             excluded_classes=cfg.EXCLUDED_CLASSES.copy(),
             anchor_box_write=cfg.ANCHOR_BOXES_STORE, transform=None):
    '''
    Initialize parameters and anchors using KMeans.
    '''
    self.resized_image_size = resized_image_size
    self.classes = classes
    self.list_images = list_images
    self.list_annotations = list_annotations
    self.total_images = total_images
    self.k = k
    self.subsampled_ratio = subsampled_ratio
    self.detection_conv_size = detection_conv_size
    self.excluded_classes = excluded_classes
    self.transform = transform
    self.anchor_boxes_write = anchor_box_write

    # get the top-k anchor sizes using modified K-Means clustering
    self.anchor_sizes = cluster_bounding_boxes(k=self.k,
                                               total_images=self.total_images,
                                               resized_image_size=self.resized_image_size,
                                               list_annotations=cfg.LIST_ANNOTATIONS,
                                               classes=cfg.CLASSES,
                                               excluded_classes=cfg.EXCLUDED_CLASSES)

    # Python dbm to store the anchor sizes of a specific training set for every
    # image size. The anchor sizes are needed later for evaluation, when there
    # is no training data left to cluster.
    yolo_db = dbm.open(cfg.YOLO_DB, 'c')
    # each image size gets its own anchor set; every set is stored in the
    # database for use during evaluation later
    yolo_db[str(resized_image_size)] = str(self.anchor_sizes.tolist())
    yolo_db.close()

    self.anchors_list = generate_anchors(anchor_sizes=self.anchor_sizes,
                                         subsampled_ratio=self.subsampled_ratio,
                                         resized_image_size=self.resized_image_size)
def generate_proposals(data):
    # Extract feature map
    feature_map = CNN_model_cut.predict(
        data.reshape(-1, data.shape[0], data.shape[1], data.shape[2]))
    padded_fcmap = np.pad(feature_map, ((0, 0), (1, 1), (1, 1), (0, 0)),
                          mode='constant')
    # Extract RPN results
    RPN_results = RPN_model.predict(padded_fcmap)
    anchor_probs = RPN_results[0].reshape((-1, 1))
    anchor_targets = RPN_results[1].reshape((-1, 4))
    # Original anchors
    feature_size = feature_map.shape[1]
    number_feature_points = feature_size * feature_size
    feature_stride = int(image_size / feature_size)
    base_anchors = generate_anchors(feature_stride, feature_stride,
                                    ratios=ANCHOR_RATIOS, scales=ANCHOR_SCALES)
    shift = np.arange(0, feature_size) * feature_stride
    shift_x, shift_y = np.meshgrid(shift, shift)
    shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                        shift_x.ravel(), shift_y.ravel())).transpose()
    original_anchors = (base_anchors.reshape((1, anchor_number, 4)) +
                        shifts.reshape((1, number_feature_points, 4)).transpose((1, 0, 2)))
    original_anchors = original_anchors.reshape((-1, 4))
    # Proposals by the RPN
    proposals = bbox_transform_inv(original_anchors, anchor_targets)
    proposals = clip_boxes(proposals, (data.shape[0], data.shape[1]))  # clip to image
    high_to_low_scores = anchor_probs.ravel().argsort()[::-1]  # highest scores first
    high_to_low_scores = high_to_low_scores[0:N]
    proposals = proposals[high_to_low_scores, :]
    anchor_probs = anchor_probs[high_to_low_scores]
    del original_anchors
    del RPN_results
    del feature_map
    del padded_fcmap
    return proposals, anchor_probs
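
# --- Sketch (illustration, not part of the snippet above) ---------------------
# `bbox_transform_inv` and `clip_boxes` are imported elsewhere in this repo;
# below is a sketch of the standard Faster R-CNN decoding they conventionally
# implement (the repo's helpers may differ). Boxes are (x1, y1, x2, y2) and
# deltas are (dx, dy, dw, dh).
import numpy as np


def bbox_transform_inv(boxes, deltas):
    """Decode (dx, dy, dw, dh) deltas against anchor boxes (x1, y1, x2, y2)."""
    widths = boxes[:, 2] - boxes[:, 0] + 1.0
    heights = boxes[:, 3] - boxes[:, 1] + 1.0
    ctr_x = boxes[:, 0] + 0.5 * widths
    ctr_y = boxes[:, 1] + 0.5 * heights

    pred_ctr_x = deltas[:, 0] * widths + ctr_x
    pred_ctr_y = deltas[:, 1] * heights + ctr_y
    pred_w = np.exp(deltas[:, 2]) * widths
    pred_h = np.exp(deltas[:, 3]) * heights

    return np.stack([pred_ctr_x - 0.5 * pred_w, pred_ctr_y - 0.5 * pred_h,
                     pred_ctr_x + 0.5 * pred_w, pred_ctr_y + 0.5 * pred_h], axis=1)


def clip_boxes(boxes, image_shape):
    """Clip boxes to lie inside an image of (height, width)."""
    h, w = image_shape
    boxes[:, [0, 2]] = np.clip(boxes[:, [0, 2]], 0, w - 1)
    boxes[:, [1, 3]] = np.clip(boxes[:, [1, 3]], 0, h - 1)
    return boxes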
def __init__(self, in_channel=512, out_channel=512, feature_stride=16):
    super().__init__()
    self.in_channel = in_channel
    self.out_channel = out_channel
    base_anchors = utils.generate_base_anchors(feature_stride=feature_stride)
    self.anchors = utils.generate_anchors(base_anchors)
    self.num_anchors = self.anchors.size(0)
    self.proposal_layer = Proposal()
    self.rpn_conv = nn.Sequential(
        nn.Conv2d(in_channel, out_channel, kernel_size=3, stride=1, padding=1),
        nn.LeakyReLU(0.2, inplace=True))
    self.cls_conv = nn.Conv2d(in_channel, self.num_anchors,
                              kernel_size=1, stride=1, padding=0)
    self.reg_conv = nn.Conv2d(in_channel, self.num_anchors * 4,
                              kernel_size=1, stride=1, padding=0)
    # NOTE: nn.Sigmoid takes no dim argument and would raise a TypeError here;
    # given the attribute name and the dim, nn.Softmax(dim=2) was presumably meant.
    self.softmax = nn.Softmax(dim=2)
from networks import backbone
import tensorflow as tf
import numpy as np
from utils import generate_anchors, read_batch_data
from ops import smooth_l1, focal_loss
from config import BATCH_SIZE, IMG_H, IMG_W, K, WEIGHT_DECAY, LEARNING_RATE

anchors_p3 = generate_anchors(area=32, stride=8)
anchors_p4 = generate_anchors(area=64, stride=16)
anchors_p5 = generate_anchors(area=128, stride=32)
anchors_p6 = generate_anchors(area=256, stride=64)
anchors_p7 = generate_anchors(area=512, stride=128)
anchors = np.concatenate(
    (anchors_p3, anchors_p4, anchors_p5, anchors_p6, anchors_p7), axis=0)


def train():
    inputs = tf.placeholder(tf.float32, [BATCH_SIZE, IMG_H, IMG_W, 3])
    labels = tf.placeholder(tf.float32, [BATCH_SIZE, None, K])
    target_bbox = tf.placeholder(tf.float32, [BATCH_SIZE, None, 4])
    foreground_mask = tf.placeholder(tf.float32, [BATCH_SIZE, None])
    # valid_mask = foreground_mask + background_mask; it removes the influence
    # of boxes whose IoU falls in [0.4, 0.5]
    valid_mask = tf.placeholder(tf.float32, [BATCH_SIZE, None])
    is_training = tf.placeholder(tf.bool)
    learning_rate = tf.placeholder(tf.float32)
    class_logits, box_logits, _, _ = backbone(inputs, is_training)
    class_loss = tf.reduce_sum(focal_loss(class_logits, labels) *
                               valid_mask) / tf.reduce_sum(foreground_mask)
    # the original snippet is truncated after `box_loss = tf.reduce_sum(`;
    # a plausible completion (an assumption) mirrors the class loss, applying
    # smooth L1 on foreground anchors only
    box_loss = tf.reduce_sum(
        tf.reduce_sum(smooth_l1(box_logits, target_bbox), axis=-1) *
        foreground_mask) / tf.reduce_sum(foreground_mask)
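
# --- Sketch (illustration, not part of the snippet above) ---------------------
# `smooth_l1` and `focal_loss` live in `ops`; for reference, a sketch of the
# standard definitions they are commonly given (the repo's versions may differ
# in signature or reduction). Written against the TF1 API used above.
import tensorflow as tf


def smooth_l1(pred, target, delta=1.0):
    """Elementwise smooth L1 (Huber): 0.5*x^2 for |x| < delta, else |x| - 0.5*delta."""
    diff = tf.abs(pred - target)
    return tf.where(diff < delta,
                    0.5 * tf.square(diff),
                    delta * (diff - 0.5 * delta))


def focal_loss(logits, labels, alpha=0.25, gamma=2.0):
    """Focal loss on sigmoid logits, summed over classes per anchor."""
    probs = tf.sigmoid(logits)
    pt = tf.where(tf.equal(labels, 1.0), probs, 1.0 - probs)
    alpha_t = tf.where(tf.equal(labels, 1.0),
                       alpha * tf.ones_like(labels),
                       (1.0 - alpha) * tf.ones_like(labels))
    ce = -tf.log(tf.clip_by_value(pt, 1e-8, 1.0))
    return tf.reduce_sum(alpha_t * tf.pow(1.0 - pt, gamma) * ce, axis=-1)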
def data_generator(dataset, config, shuffle=True, augment=True, random_rois=0,
                   batch_size=1, detection_targets=False):
    b = 0  # batch item index
    image_index = -1
    image_ids = np.copy(dataset.image_ids)
    error_count = 0

    anchors = []
    for i, scale in enumerate(config.ANCHOR_SCALES):
        anchors.append(
            utils.generate_anchors(scales=scale,
                                   ratios=config.ANCHOR_RATIOS,
                                   shape=[8 * 2**i, 8 * 2**i],
                                   feature_stride=config.FEATURE_STRIDE / (2**i),
                                   anchor_stride=1))
    anchors = np.concatenate(anchors, axis=0)

    while True:
        try:
            # Increment index to pick next image. Shuffle at the start of an epoch.
            image_index = (image_index + 1) % len(image_ids)
            if shuffle and image_index == 0:
                np.random.shuffle(image_ids)

            # Get GT bounding boxes and masks for image.
            image_id = image_ids[image_index]
            image, image_shape = dataset.load_image(image_id)
            image_bbox, image_class_id, attribute, class_attribute = \
                dataset.load_bbox_class_attr(image_id)
            image_bbox = translate_bbox(image_bbox,
                                        input_shape=image_shape[:2],
                                        output_shape=config.IMAGE_SHAPE[:2])
            image = cv2.resize(image, tuple(config.IMAGE_SHAPE[:2]))
            gt_target_object, gt_target_bbox, gt_target_bbox_object = \
                assign_bbox_to_anchors(config, image_bbox, anchors, image_id)

            # Skip images that have no instances. This can happen in cases
            # where we train on a subset of classes and the image doesn't
            # have any of the classes we care about.
            # if not np.any(gt_class_ids > 0):
            #     continue

            # Init batch arrays
            if b == 0:
                batch_images = np.zeros((batch_size,) + image.shape, dtype=np.float32)
                batch_gt_class_ids = np.zeros((batch_size, config.MAX_GT_INSTANCES),
                                              dtype=np.int32)
                batch_gt_object = np.zeros((batch_size, anchors.shape[0]),
                                           dtype=np.int32)
                batch_gt_boxes = np.zeros((batch_size, anchors.shape[0], 4),
                                          dtype=np.float32)
                batch_gt_boxes_object = np.zeros((batch_size, anchors.shape[0]),
                                                 dtype=np.int32)
                batch_gt_attribute = np.zeros((batch_size, config.NUM_ATTRIBUTE))

            # Add to batch
            batch_images[b] = image.astype(np.float32)  # mold_image(image.astype(np.float32), config)
            batch_gt_class_ids[b, 0] = image_class_id
            batch_gt_object[b] = gt_target_object
            batch_gt_boxes[b, :gt_target_bbox.shape[0]] = gt_target_bbox
            batch_gt_boxes_object[b] = gt_target_bbox_object
            batch_gt_attribute[b] = attribute
            b += 1

            # Batch full?
            if b >= batch_size:
                inputs = [batch_images, batch_gt_attribute, batch_gt_object,
                          batch_gt_boxes, batch_gt_boxes_object]
                outputs = []
                yield inputs, outputs
                # start a new batch
                b = 0
        except (GeneratorExit, KeyboardInterrupt):
            raise
        except:
            # Log it and skip the image
            logging.exception("Error processing image {}".format(
                dataset.print_image_info[image_id]))
            error_count += 1
            if error_count > 5:
                raise
def forward(self, patches, label_maps, gt_bboxes, iter_step=0):
    # ----------------- feature extraction -----------------------------------------
    with torch.no_grad():
        feats = self.feat_extractor(
            patches.contiguous().view([-1] + list(patches.shape[2:])))
    win_cell_sz = label_maps.size(3)
    feats = F.interpolate(feats, (win_cell_sz, win_cell_sz),
                          mode='bilinear', align_corners=True)
    feats = feats.view(list(patches.shape[0:2]) + list(feats.shape[1:]))
    map_loss_total, bbox_loss_total, meta_loss_total = 0, 0, 0

    # --------------- initial frame meta updating ----------------------------------
    init_params, init_lrs = self.meta_init.initialize()
    self.meta_opti.initialize(init_lrs)
    # random scale params to prevent overfitting
    offset = (config.feat_channels + 1) * config.cf_channels + \
        config.base_filter_size[0] * config.base_filter_size[1] * config.cf_channels + 1
    rand_scales_cf = torch.exp(config.rand_scale_radius_cf * (2 * torch.rand(offset) - 1))
    rand_scales_reg = torch.exp(config.rand_scale_radius_reg *
                                (2 * torch.rand(len(init_params) - offset) - 1))
    rand_scales = torch.cat([rand_scales_cf, rand_scales_reg], 0)
    if torch.cuda.is_available():
        rand_scales = rand_scales.cuda()
    init_params = init_params / rand_scales
    filter_size = compute_filter_size(gt_bboxes[:, 0, 2:])
    base_anchor_sizes = filter_size / config.filter_scale * config.cell_sz
    anchors = generate_anchors(win_cell_sz, base_anchor_sizes)

    # calculate initial update loss
    n_init_aug = len(config.aug_init_scales) * len(config.aug_init_ratios)
    pred_map, pred_bbox = adaptable_conv2d(feats[:, 0:n_init_aug],
                                           init_params * rand_scales, filter_size)
    map_loss = self.l2loss(pred_map, label_maps[:, 0:n_init_aug])
    bbox_loss = self.smoothl1loss(pred_bbox, gt_bboxes[:, 0:n_init_aug], anchors)
    loss = map_loss + bbox_loss
    grads = torch.autograd.grad(loss, init_params, create_graph=True)
    updated_params = init_params - init_lrs * grads[0]

    # calculate initial meta loss
    if self.training:
        lh_idx = np.random.randint(config.look_ahead) + n_init_aug
        lh_pred_map, lh_pred_bbox = adaptable_conv2d(feats[:, lh_idx:lh_idx + 1],
                                                     updated_params * rand_scales,
                                                     filter_size)
        lh_map_loss = self.l2loss(lh_pred_map, label_maps[:, lh_idx:lh_idx + 1])
        lh_bbox_loss = self.smoothl1loss(lh_pred_bbox, gt_bboxes[:, lh_idx:lh_idx + 1],
                                         anchors)
    else:
        lh_pred_map, lh_pred_bbox = adaptable_conv2d(
            feats[:, n_init_aug:config.look_ahead + n_init_aug],
            updated_params * rand_scales, filter_size)
        lh_map_loss = self.l2loss(
            lh_pred_map, label_maps[:, n_init_aug:config.look_ahead + n_init_aug])
        lh_bbox_loss = self.smoothl1loss(
            lh_pred_bbox, gt_bboxes[:, n_init_aug:config.look_ahead + n_init_aug],
            anchors)
    meta_loss_init = lh_map_loss + lh_bbox_loss
    if self.training and iter_step % config.disp_inter == 0 and torch.cuda.current_device() == 0:
        self.writer.add_scalar('roam_training/init_update_loss', loss.data.item(), iter_step)
        self.writer.add_scalar('roam_training/init_meta_loss', meta_loss_init.data.item(), iter_step)
        self.writer.add_histogram('roam_training/init_cf_params', init_params[:offset], iter_step)
        self.writer.add_histogram('roam_training/init_cf_lrs', init_lrs[:offset], iter_step)
        self.writer.add_histogram('roam_training/init_reg_params', init_params[offset:], iter_step)
        self.writer.add_histogram('roam_training/init_reg_lrs', init_lrs[offset:], iter_step)
        print(bbox_loss.data.item(), lh_bbox_loss.data.item())
    map_loss_total += lh_map_loss
    bbox_loss_total += lh_bbox_loss
    meta_loss_total += meta_loss_init

    # --------------- subsequent frames meta updating ------------------------------
    for k in range(1, config.time_step):
        # adapt to new size
        filter_size = compute_filter_size(gt_bboxes[:, k * config.look_ahead + n_init_aug, 2:])
        base_anchor_sizes = filter_size / config.filter_scale * config.cell_sz
        anchors = generate_anchors(win_cell_sz, base_anchor_sizes)

        # calculate update loss
        training_idxes = range((k - 1) * config.look_ahead + n_init_aug,
                               k * config.look_ahead + n_init_aug)
        pred_map, pred_bbox = adaptable_conv2d(feats[:, training_idxes],
                                               updated_params * rand_scales, filter_size)
        map_loss = self.l2loss(pred_map, label_maps[:, training_idxes])
        bbox_loss = self.smoothl1loss(pred_bbox, gt_bboxes[:, training_idxes], anchors)
        loss = map_loss + bbox_loss

        # meta update
        grads = torch.autograd.grad(loss, updated_params, retain_graph=True)
        updated_params = self.meta_opti.meta_update(updated_params, loss, grads[0],
                                                    self.writer, iter_step)

        # calculate meta loss
        if self.training:
            delta_lh_idx = np.random.randint(config.look_ahead)
            lh_idx = k * config.look_ahead + n_init_aug + delta_lh_idx
            lh_pred_map, lh_pred_bbox = adaptable_conv2d(feats[:, lh_idx:lh_idx + 1],
                                                         updated_params * rand_scales,
                                                         filter_size)
            lh_map_loss = self.l2loss(lh_pred_map, label_maps[:, lh_idx:lh_idx + 1])
            lh_bbox_loss = self.smoothl1loss(lh_pred_bbox,
                                             gt_bboxes[:, lh_idx:lh_idx + 1], anchors)
        else:
            chosen_idxes = range(k * config.look_ahead + n_init_aug,
                                 (k + 1) * config.look_ahead + n_init_aug)
            lh_pred_map, lh_pred_bbox = adaptable_conv2d(feats[:, chosen_idxes],
                                                         updated_params * rand_scales,
                                                         filter_size)
            lh_map_loss = self.l2loss(lh_pred_map, label_maps[:, chosen_idxes])
            lh_bbox_loss = self.smoothl1loss(lh_pred_bbox, gt_bboxes[:, chosen_idxes],
                                             anchors)
        meta_loss_update = lh_map_loss + lh_bbox_loss
        map_loss_total += lh_map_loss
        bbox_loss_total += lh_bbox_loss
        meta_loss_total += meta_loss_update

    map_loss_avg = map_loss_total / config.time_step
    bbox_loss_avg = bbox_loss_total / config.time_step
    meta_loss_avg = meta_loss_total / config.time_step
    return map_loss_avg, bbox_loss_avg, meta_loss_avg
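
# --- Sketch (illustration, not part of the snippet above) ---------------------
# The meta update in `initialize`/`forward` is a single hand-rolled SGD step
# through torch.autograd.grad. A self-contained sketch of the pattern on a toy
# parameter vector (all names here are stand-ins, not the repo's objects):
import torch

init_params = torch.randn(4, requires_grad=True)  # toy stand-in for init_params
init_lrs = torch.full((4,), 0.1)                  # toy stand-in for init_lrs
target = torch.zeros(4)

loss = ((init_params - target) ** 2).sum()
# create_graph=True keeps the update differentiable, so the look-ahead (meta)
# loss can backpropagate into init_params through the update itself
grads = torch.autograd.grad(loss, init_params, create_graph=True)
updated_params = init_params - init_lrs * grads[0]

lh_loss = ((updated_params - target) ** 2).sum()
lh_loss.backward()  # gradients now flow to init_params through the update
print(init_params.grad.shape)  # torch.Size([4])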
nms_thresh = 0.4
box_num = 5
use_fpn = 1
box_thresh = 0.6
cls_num = len(classes)
(input_w, input_h) = input_size = (672, 224)
# integer division keeps the output grid size an int under Python 3
(out_w, out_h) = output_size = (input_w // 16, input_h // 16)
biases = [8.00, 10.27, 15.51, 26.77, 35.73,
          44.89, 59.50, 103.39, 160.69, 162.92]
biases = np.array(biases, dtype=np.float32)
if use_fpn == 1:
    biases = 1.0 * biases * out_w / input_w
box_num = len(biases) // 2  # two entries (w, h) per box
anchors = generate_anchors(output_size, box_num, biases)

def_img_path = '../assets/000000.png'
def_model = '../models_maskyolo/mb_v2_t4_cls5_yolo/mb_v2_t4_cls5_deploy.prototxt'
def_weights = '../models_maskyolo/pretrained_models/mb_v2_t4_cls5.caffemodel'


def run(img_path=def_img_path, model=def_model, weights=def_weights):
    net = caffe.Net(model, weights, caffe.TEST)
    img = cv2.imread(img_path)
    (height, width, channel) = img.shape
    img_org = img.copy()
    # img = cv2.resize(img, (480, 480))  # , interpolation=cv2.INTER_AREA)
    img = cv2.resize(img, input_size, interpolation=cv2.INTER_AREA)
    (height, width, channel) = img.shape
from data import create_data
from svhn_dataset import SVHN
from train import RetinaTrainer, parse_args, output_predictions
import efficientdet
import utils
from coco_eval import CocoEvaluation

if __name__ == '__main__':
    args, argstr = parse_args(skip_name=True)

    # Prepare data
    num_classes = SVHN.LABELS
    pyramid_levels = args.pyramid_levels
    anchors = utils.generate_anchors(pyramid_levels, args.image_size,
                                     num_scales=args.num_scales,
                                     aspect_ratios=args.aspect_ratios)
    train_dataset, dev_dataset, _ = create_data(args.batch_size, anchors,
                                                image_size=args.image_size,
                                                test=args.test,
                                                augmentation=args.augmentation)

    # Prepare network and trainer
    anchors_per_level = args.num_scales * len(args.aspect_ratios)
    network = efficientdet.EfficientDet(num_classes, anchors_per_level,
                                        input_size=args.image_size,
                                        pyramid_levels=pyramid_levels,
                                        filters=args.efficientdet_filters,
                                        num_layers=args.efficientdet_layers)
    model = RetinaTrainer(network, anchors, train_dataset, dev_dataset, args)

    # Load weights
    model.model.load_weights('model.h5')
def view(dataset, config, shuffle=True, augment=True, batch_size=1):
    """
    shuffle: shuffle image every epoch
    return:
    - input_image
    - input_image_meta
    - input_rpn_match
    - input_rpn_bbox
    - input_gt_class_ids
    - input_gt_boxes
    """
    anchors = utils.generate_anchors(config.ANCHOR_SCALES, config.ANCHOR_RATIOS,
                                     config.ANCHOR_STRIDE, config.BACKBONE_SHAPES,
                                     config.BACKBONE_STRIDES)
    b = 0  # batch index
    image_ids = np.copy(dataset.image_ids)
    print(len(image_ids))
    error_count = 0
    index = 1723
    while True:
        image_id = image_ids[index]
        input_image, input_image_meta, input_gt_class_ids, input_gt_boxes = \
            load_image_gt(dataset, config, image_id)
        print(input_image.shape)
        print(input_image_meta)
        print(input_gt_class_ids)
        print(input_gt_boxes)
        if not np.any(input_gt_class_ids > 0):
            continue

        # RPN targets
        rpn_match, rpn_bbox = RPN.build_targets(input_image.shape, anchors,
                                                input_gt_class_ids,
                                                input_gt_boxes, config)
        # print(input_gt_boxes)
        # draw ground-truth boxes in red
        for gt_box in input_gt_boxes:
            y1, x1, y2, x2 = gt_box
            print(y1, x1, y2, x2)
            y1, x1, y2, x2 = int(y1), int(x1), int(y2), int(x2)
            for y in range(y1, y2 + 1):
                input_image[y][x1][0] = 255.0
                input_image[y][x1][1] = 0.0
                input_image[y][x1][2] = 0.0
            for y in range(y1, y2 + 1):
                input_image[y][x2][0] = 255.0
                input_image[y][x2][1] = 0.0
                input_image[y][x2][2] = 0.0
            for x in range(x1, x2 + 1):
                input_image[y1][x][0] = 255.0
                input_image[y1][x][1] = 0.0
                input_image[y1][x][2] = 0.0
            for x in range(x1, x2 + 1):
                input_image[y2][x][0] = 255.0
                input_image[y2][x][1] = 0.0
                input_image[y2][x][2] = 0.0

        # draw positive anchors in green and negative anchors in blue
        for i, anchor in enumerate(anchors):
            # anchor = utils.clip_boxes(anchor, np.array([0, 0, 832, 832]))
            if rpn_match[i] == 0:
                continue
            y1, x1, y2, x2 = anchor
            y1 = max(min(y1, 512 - 1), 0)
            x1 = max(min(x1, 512 - 1), 0)
            y2 = max(min(y2, 512 - 1), 0)
            x2 = max(min(x2, 512 - 1), 0)
            y1, x1, y2, x2 = int(y1), int(x1), int(y2), int(x2)
            # print(y1, x1, y2, x2)
            if rpn_match[i] == 1:
                for y in range(y1, y2 + 1):
                    input_image[y][x1][0] = 0.0
                    input_image[y][x1][1] = 255.0
                    input_image[y][x1][2] = 0.0
                    input_image[y][x2][0] = 0.0
                    input_image[y][x2][1] = 255.0
                    input_image[y][x2][2] = 0.0
                for x in range(x1, x2 + 1):
                    input_image[y1][x][0] = 0.0
                    input_image[y1][x][1] = 255.0
                    input_image[y1][x][2] = 0.0
                    input_image[y2][x][0] = 0.0
                    input_image[y2][x][1] = 255.0
                    input_image[y2][x][2] = 0.0
            else:
                for y in range(y1, y2 + 1):
                    input_image[y][x1][0] = 0.0
                    input_image[y][x1][1] = 0.0
                    input_image[y][x1][2] = 255.0
                    input_image[y][x2][0] = 0.0
                    input_image[y][x2][1] = 0.0
                    input_image[y][x2][2] = 255.0
                for x in range(x1, x2 + 1):
                    input_image[y1][x][0] = 0.0
                    input_image[y1][x][1] = 0.0
                    input_image[y1][x][2] = 255.0
                    input_image[y2][x][0] = 0.0
                    input_image[y2][x][1] = 0.0
                    input_image[y2][x][2] = 255.0
        """
        f = open("todo.txt", "w")
        for x in rpn_match:
            f.write(str(x))
        for x in rpn_bbox:
            f.write(str(x))
        f.close()
        """
        from skimage.io import imsave
        imsave('GT_RPN_input_val.png', input_image)
        break
def produce_batch(image_file, true_boxes):
    image_name = image_file.replace('.jpg', '').replace(trainDIR, '')
    image = Image.open(image_file).resize((image_size, image_size), Image.NEAREST)
    data = asarray(image) / 255.0
    del image
    feature_map = pretrained_model.predict(
        data.reshape(-1, data.shape[0], data.shape[1], data.shape[2]))
    del data
    feature_size = feature_map.shape[1]
    feature_stride = int(image_size / feature_size)
    number_feature_points = feature_size * feature_size
    shift = np.arange(0, feature_size) * feature_stride
    shift_x, shift_y = np.meshgrid(shift, shift)
    shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                        shift_x.ravel(), shift_y.ravel())).transpose()
    base_anchors = generate_anchors(feature_stride, feature_stride,
                                    ratios=ANCHOR_RATIOS, scales=ANCHOR_SCALES)
    all_anchors = (base_anchors.reshape((1, anchor_number, 4)) +
                   shifts.reshape((1, number_feature_points, 4)).transpose((1, 0, 2)))
    total_anchor_number = anchor_number * number_feature_points
    all_anchors = all_anchors.reshape((total_anchor_number, 4))
    # only keep anchors inside image + border
    border = 0  # could also be FILTER_SIZE x feature_stride
    inds_inside = np.where(
        (all_anchors[:, 0] >= -border) &
        (all_anchors[:, 1] >= -border) &
        (all_anchors[:, 2] < image_size + border) &
        (all_anchors[:, 3] < image_size + border))[0]
    anchors = all_anchors[inds_inside]
    useful_anchor_number = len(inds_inside)

    overlaps = bbox_overlaps(anchors, true_boxes)
    which_box = overlaps.argmax(axis=1)  # which true box has most overlap with each anchor?
    anchor_max_overlaps = overlaps[np.arange(overlaps.shape[0]), which_box]
    which_anchor = overlaps.argmax(axis=0)  # which anchor has most overlap for each true box?
    box_max_overlaps = overlaps[which_anchor, np.arange(overlaps.shape[1])]
    which_anchor_v2 = np.where(overlaps == box_max_overlaps)[0]

    labels = np.empty((useful_anchor_number,), dtype=np.float32)
    labels.fill(-1)
    labels[which_anchor_v2] = 1
    labels[anchor_max_overlaps >= FG_THRESHOLD] = 1
    labels[anchor_max_overlaps <= BG_THRESHOLD] = 0

    fg_inds = np.where(labels == 1)[0]
    bg_inds = np.where(labels == 0)[0]
    num_fg = int(BATCH_SIZE / (1 + BG_FG_FRAC))
    if len(fg_inds) > num_fg:
        disable_inds = np.random.choice(fg_inds, size=(len(fg_inds) - num_fg),
                                        replace=False)
        labels[disable_inds] = -1
    fg_inds = np.where(labels == 1)[0]
    num_bg = int(len(fg_inds) * BG_FG_FRAC)
    if len(bg_inds) > num_bg:
        disable_inds = np.random.choice(bg_inds, size=(len(bg_inds) - num_bg),
                                        replace=False)
        labels[disable_inds] = -1
    bg_inds = np.where(labels == 0)[0]

    anchor_batch_inds = inds_inside[labels != -1]
    np.random.shuffle(anchor_batch_inds)
    # np.int is just an alias of the builtin int (and is removed in new NumPy)
    feature_batch_inds = (anchor_batch_inds / anchor_number).astype(int)

    pad_size = int((FILTER_SIZE - 1) / 2)
    padded_fcmap = np.pad(feature_map,
                          ((0, 0), (pad_size, pad_size), (pad_size, pad_size), (0, 0)),
                          mode='constant')
    padded_fcmap = np.squeeze(padded_fcmap)
    batch_tiles = []
    for ind in feature_batch_inds:
        # x, y are the point in the feature map pointed at by feature_batch_inds
        x = ind % feature_size
        y = int(ind / feature_size)
        fc_snip = padded_fcmap[y:y + FILTER_SIZE, x:x + FILTER_SIZE, :]
        batch_tiles.append(fc_snip)

    # unmap creates another array of labels that includes a -1 for the anchors
    # originally dropped for being out of bounds
    full_labels = unmap(labels, total_anchor_number, inds_inside, fill=-1)
    batch_labels = full_labels.reshape(-1, 1, 1, 1 * anchor_number)[feature_batch_inds]

    pos_anchors = all_anchors[inds_inside[labels == 1]]  # positive anchors
    targets = bbox_transform(pos_anchors, true_boxes[which_box, :][labels == 1])
    targets = unmap(targets, total_anchor_number, inds_inside[labels == 1], fill=0)
    batch_targets = targets.reshape(-1, 1, 1, 4 * anchor_number)[feature_batch_inds]
    return np.asarray(batch_tiles), batch_labels.tolist(), batch_targets.tolist()
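
# --- Sketch (illustration, not part of the snippet above) ---------------------
# `unmap` is the standard Faster R-CNN utility for scattering a subsampled
# array back onto the full anchor set; a sketch of its conventional definition
# (the repo's version may differ):
import numpy as np


def unmap(data, count, inds, fill=0):
    """Scatter `data` (kept items only) into an array of size `count`,
    placing rows at `inds` and `fill` everywhere else."""
    if data.ndim == 1:
        ret = np.full((count,), fill, dtype=np.float32)
        ret[inds] = data
    else:
        ret = np.full((count,) + data.shape[1:], fill, dtype=np.float32)
        ret[inds, :] = data
    return ret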
def main():
    train_z_transforms = transforms.Compose([
        # RandomStretch(),
        # CenterCrop((config.exemplar_size, config.exemplar_size)),
        ToTensor()
    ])
    train_x_transforms = transforms.Compose([
        # RandomStretch(),
        # RandomCrop((config.instance_size, config.instance_size),
        #            config.max_translate),
        # ColorAug(config.color_ratio),
        ToTensor()
    ])
    val_z_transforms = transforms.Compose([
        # CenterCrop((config.exemplar_size, config.exemplar_size)),
        ToTensor()
    ])
    val_x_transforms = transforms.Compose([
        ToTensor()
    ])
    score_size = int((config.instance_size - config.exemplar_size) /
                     config.total_stride + 1)
    anchors = generate_anchors(config.total_stride, config.anchor_base_size,
                               config.anchor_scales, config.anchor_ratios,
                               score_size)

    # create datasets
    train_dataset = GOT_10KDataset(train_data_dir, train_z_transforms,
                                   train_x_transforms, anchors)
    valid_dataset = GOT_10KDataset(val_data_dir, val_z_transforms,
                                   val_x_transforms, anchors)
    trainloader = DataLoader(train_dataset, batch_size=config.train_batch_size,
                             shuffle=True, pin_memory=True,
                             num_workers=config.train_num_workers, drop_last=True)
    validloader = DataLoader(valid_dataset, batch_size=config.valid_batch_size,
                             shuffle=False, pin_memory=True,
                             num_workers=config.valid_num_workers, drop_last=True)

    # create summary writer
    if not os.path.exists(config.log_dir):
        os.mkdir(config.log_dir)
    summary_writer = SummaryWriter(config.log_dir)

    # start training
    with torch.cuda.device(config.gpu_id):
        model = SiamRPN()
        model.load_pretrain(pretrain_model_dir)
        model.freeze_layers()
        model = model.cuda()
        optimizer = torch.optim.SGD(model.parameters(), lr=config.lr,
                                    momentum=config.momentum,
                                    weight_decay=config.weight_decay)
        # scheduler = StepLR(optimizer, step_size=config.step_size, gamma=config.gamma)
        scheduler = np.logspace(math.log10(config.lr), math.log10(config.end_lr),
                                config.epoch)
        for epoch in range(config.epoch):
            train_loss = []
            model.train()
            curlr = scheduler[epoch]
            for param_group in optimizer.param_groups:
                param_group['lr'] = curlr
            for i, data in enumerate(tqdm(trainloader)):
                z, x, reg_label, cls_label = data
                z, x = Variable(z.cuda()), Variable(x.cuda())
                reg_label, cls_label = Variable(reg_label.cuda()), Variable(cls_label.cuda())
                pred_cls, pred_reg = model(z, x)
                optimizer.zero_grad()
                # permute to [batch, anchors * score_size^2, channels]
                pred_cls = pred_cls.reshape(-1, 1, config.anchor_num *
                                            score_size * score_size).permute(0, 2, 1)
                pred_reg = pred_reg.reshape(-1, 4, config.anchor_num *
                                            score_size * score_size).permute(0, 2, 1)
                cls_loss = rpn_cross_entropy_balance(pred_cls, cls_label,
                                                     config.num_pos, config.num_neg)
                reg_loss = rpn_smoothL1(pred_reg, reg_label, cls_label, config.num_pos)
                loss = cls_loss + config.lamb * reg_loss
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), config.clip)
                optimizer.step()
                step = epoch * len(trainloader) + i
                summary_writer.add_scalar('train/loss', loss.data, step)
                train_loss.append(loss.data.cpu().numpy())
            train_loss = np.mean(train_loss)

            valid_loss = []
            model.eval()
            for i, data in enumerate(tqdm(validloader)):
                z, x, reg_label, cls_label = data
                z, x = Variable(z.cuda()), Variable(x.cuda())
                reg_label, cls_label = Variable(reg_label.cuda()), Variable(cls_label.cuda())
                pred_cls, pred_reg = model(z, x)
                pred_cls = pred_cls.reshape(-1, 1, config.anchor_num *
                                            score_size * score_size).permute(0, 2, 1)
                pred_reg = pred_reg.reshape(-1, 4, config.anchor_num *
                                            score_size * score_size).permute(0, 2, 1)
                cls_loss = rpn_cross_entropy_balance(pred_cls, cls_label,
                                                     config.num_pos, config.num_neg)
                reg_loss = rpn_smoothL1(pred_reg, reg_label, cls_label, config.num_pos)
                loss = cls_loss + config.lamb * reg_loss
                valid_loss.append(loss.data.cpu().numpy())
            valid_loss = np.mean(valid_loss)
            print("EPOCH %d valid_loss: %.4f, train_loss: %.4f, learning_rate: %.4f" %
                  (epoch, valid_loss, train_loss, optimizer.param_groups[0]["lr"]))
            summary_writer.add_scalar('valid/loss', valid_loss, epoch + 1)
            torch.save(model.cpu().state_dict(),
                       "./models/siamrpn_{}.pth".format(epoch + 1))
            model.cuda()
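
# --- Sketch (illustration, not part of the snippet above) ---------------------
# The schedule above replaces a StepLR with a log-spaced decay from config.lr
# to config.end_lr. A tiny sketch of the values it produces, with assumed
# endpoints (not the repo's config values):
import math

import numpy as np

lr, end_lr, epochs = 1e-2, 1e-5, 4  # assumed endpoints for illustration
scheduler = np.logspace(math.log10(lr), math.log10(end_lr), epochs)
print(scheduler)  # [1.e-02 1.e-03 1.e-04 1.e-05]: one geometric step per epoch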
def verify_label(h5_path):
    hdf5_dataset = h5py.File(h5_path, 'r')
    hdf5_gt_boxes = hdf5_dataset['gt_boxes']
    hdf5_num_gt_boxes = hdf5_dataset['num_gt_boxes']
    for i in range(len(hdf5_num_gt_boxes)):
        num_gt_boxes = hdf5_num_gt_boxes[i]
        gt_boxes = hdf5_gt_boxes[i].reshape((num_gt_boxes, 4))
        w = gt_boxes[:, 3] - gt_boxes[:, 1]
        print(np.min(w))


if __name__ == '__main__':
    generator = data_generator('ctpn/train_art_0722_4482_1121.h5')
    anchors = utils.generate_anchors(config.RPN_ANCHOR_HEIGHTS,
                                     config.RPN_ANCHOR_WIDTHS,
                                     config.RPN_INPUT_SIZE // config.RPN_DOWNSCALE,
                                     config.RPN_DOWNSCALE,
                                     config.RPN_ANCHOR_STRIDE)
    while True:
        # next(generator)
        batch_images, batch_rpn_match, batch_rpn_bbox, batch_gt_boxes = next(generator)[0]
        for i in range(len(batch_images)):
            image = batch_images[i]
            image = utils.unmold_image(image)[:, :, ::-1].astype(np.uint8)
            rpn_match = batch_rpn_match[i]
            rpn_bbox = batch_rpn_bbox[i]
            gt_boxes = batch_gt_boxes[i]
            # anchors = utils.generate_anchors_2(config.RPN_ANCHOR_HEIGHTS,
            #                                    config.RPN_ANCHOR_WIDTHS,
            #                                    image.shape[0] // config.RPN_DOWNSCALE,
            #                                    image.shape[1] // config.RPN_DOWNSCALE,
def data_generator(h5_path, batch_size=config.BATCH_SIZE,
                   input_size=config.RPN_INPUT_SIZE, is_training=False):
    """
    A generator that returns images and corresponding target class ids and
    bounding box deltas.

    Args:
        h5_path:
        batch_size: How many images to return in each call
        input_size:
        is_training:

    Returns:
        Returns a Python generator. Upon calling next() on it, the generator
        returns two lists, inputs and outputs. The contents of the lists differ
        depending on the received arguments:
        inputs list:
        - images: [batch, H, W, C]
        - rpn_match: [batch, N] Integer (1=positive anchor, -1=negative, 0=neutral)
        - rpn_bbox: [batch, N, (dy, dx, log(dh), log(dw))] Anchor bbox deltas.
        - gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs
        - gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)]
        outputs list: Usually empty in regular training. But if detection_targets
        is True then the outputs list contains target class_ids, bbox deltas,
        and masks.
    """
    # Anchors
    # num_anchors = feature_map_size * feature_map_size * num_anchor_scales * num_anchor_ratios
    # [num_anchors, (y1, x1, y2, x2)]
    feature_map_size = input_size // config.RPN_DOWNSCALE
    anchors = utils.generate_anchors(config.RPN_ANCHOR_HEIGHTS,
                                     config.RPN_ANCHOR_WIDTHS,
                                     feature_map_size,
                                     config.RPN_DOWNSCALE,
                                     config.RPN_ANCHOR_STRIDE)
    current_idx = 0
    hdf5_dataset = h5py.File(h5_path, 'r')
    hdf5_images = hdf5_dataset['images']
    dataset_size = len(hdf5_images)
    # hdf5_image_shapes = hdf5_dataset['image_shapes']
    hdf5_gt_boxes = hdf5_dataset['gt_boxes']
    hdf5_num_gt_boxes = hdf5_dataset['num_gt_boxes']
    indicies = np.arange(dataset_size)
    if is_training:
        np.random.shuffle(indicies)
    # batch item index
    b = 0
    # Keras requires a generator to run indefinitely.
    while True:
        if current_idx >= dataset_size:
            if is_training:
                np.random.shuffle(indicies)
            current_idx = 0
        if b == 0:
            # Init batch arrays
            batch_rpn_match = np.zeros([batch_size, anchors.shape[0], 1],
                                       dtype=np.int32)
            batch_rpn_bbox = np.zeros([batch_size, config.RPN_TRAIN_ANCHORS_PER_IMAGE, 4],
                                      dtype=np.float32)
            batch_images = np.zeros((batch_size, input_size, input_size, 3),
                                    dtype=np.float32)
            # batch_gt_boxes = np.zeros((batch_size, config.MAX_GT_INSTANCES, 4), dtype=np.int32)
        i = indicies[current_idx]
        image = hdf5_images[i]
        # image_shape = hdf5_image_shapes[i]
        # image = image.reshape(image_shape)
        num_gt_boxes = hdf5_num_gt_boxes[i]
        gt_boxes = hdf5_gt_boxes[i].reshape((num_gt_boxes, 4))
        # image, scale, pad, window = utils.resize_and_pad_image(image, input_size)
        # gt_boxes = np.round(gt_boxes * scale).astype(np.int32)  # (y1, x1, y2, x2)
        # gt_boxes = gt_boxes[:, [1, 0, 3, 2]]
        # gt_boxes[:, [1, 3]] += pad[1][0]  # + pad left
        # gt_boxes[:, [0, 2]] += pad[0][0]  # + pad top
        # gt_boxes[:, [0, 2]] = np.clip(gt_boxes[:, [0, 2]], window[0], window[2])
        # gt_boxes[:, [1, 3]] = np.clip(gt_boxes[:, [1, 3]], window[1], window[3])
        # for gt_box in gt_boxes:
        #     cv2.rectangle(image, (gt_box[1], gt_box[0]), (gt_box[3], gt_box[2]), (0, 255, 0), 1)
        # cv2.namedWindow('image', cv2.WINDOW_NORMAL)
        # cv2.imshow('image', image)
        # cv2.waitKey(0)

        # RPN Targets
        rpn_match, rpn_bbox = build_rpn_targets(anchors, gt_boxes)

        # Add to batch
        batch_rpn_match[b] = rpn_match[:, np.newaxis]
        batch_rpn_bbox[b] = rpn_bbox
        batch_images[b] = image
        # batch_gt_boxes[b, :gt_boxes.shape[0]] = gt_boxes
        b += 1
        if b == batch_size:
            # inputs = [batch_images, batch_rpn_match, batch_rpn_bbox, batch_gt_boxes]
            inputs = [batch_images, batch_rpn_match, batch_rpn_bbox]
            outputs = []
            yield inputs, outputs
            b = 0
        current_idx += 1
def __init__(self, net, cfg):
    self.cfg = cfg
    self.net = net
    self.anchors = generate_anchors(cfg)
    if torch.cuda.is_available():
        self.net.cuda()
        self.anchors = self.anchors.cuda()

    # Dataset transform
    transform = [
        Transform(context_amount=cfg.TRAIN.CROP_CONTEXT_AMOUNT_Z,
                  size=cfg.MODEL.Z_SIZE),
        Transform(context_amount=cfg.TRAIN.CROP_CONTEXT_AMOUNT_X,
                  size=cfg.MODEL.X_SIZE,
                  random_translate=True,
                  random_resize=True,
                  motion_blur=True,
                  random_translate_range=cfg.TRAIN.DATA_AUG_TRANSLATE_RANGE,
                  random_resize_scale_min=cfg.TRAIN.DATA_AUG_RESIZE_SCALE_MIN,
                  random_resize_scale_max=cfg.TRAIN.DATA_AUG_RESIZE_SCALE_MAX)
    ]

    # Training dataset
    trackingnet = TrackingNet(cfg.PATH.TRACKINGNET, subset="train",
                              debug_seq=cfg.TRAIN.DEBUG_SEQ)
    imagenet = ImageNetVID(cfg.PATH.ILSVRC, subset="train")
    sampler = PairSampler([trackingnet, imagenet], cfg=cfg, transform=transform,
                          pairs_per_video=cfg.TRAIN.PAIRS_PER_VIDEO,
                          frame_range=cfg.TRAIN.FRAME_RANGE)
    # Distractor dataset
    coco = CocoDetection(cfg.PATH.COCO, cfg.PATH.COCO_ANN_FILE)
    # coco_distractor = COCODistractor(coco, 4000)
    coco_positive = COCOPositivePair(coco, 4000, cfg=cfg, transform=transform)
    coco_negative = COCONegativePair(coco, 12000, cfg=cfg, transform=transform)
    dataset = ConcatDataset([sampler, coco_positive, coco_negative])
    self.dataloader = DataLoader(dataset, batch_size=cfg.TRAIN.BATCH_SIZE,
                                 num_workers=4, shuffle=True, pin_memory=True,
                                 drop_last=True)

    # Validation dataset
    val_trackingnet = TrackingNet(cfg.PATH.TRACKINGNET, subset="val")
    val_imagenet = ImageNetVID(cfg.PATH.ILSVRC, subset="val")
    validation_sampler = PairSampler([val_trackingnet, val_imagenet], cfg=cfg,
                                     transform=transform, pairs_per_video=1,
                                     frame_range=cfg.TRAIN.FRAME_RANGE)
    val_coco_positive = COCOPositivePair(coco, 100, cfg=cfg, transform=transform)
    val_dataset = ConcatDataset([validation_sampler, val_coco_positive])
    if cfg.TRAIN.DEBUG_SEQ >= 0:
        # When debugging on a single sequence, validation is performed on the same one
        val_dataset = PairSampler([trackingnet], cfg=cfg, transform=transform,
                                  pairs_per_video=200)
    self.validation_dataloader = DataLoader(val_dataset,
                                            batch_size=min(cfg.TRAIN.BATCH_SIZE, 20),
                                            num_workers=4, shuffle=True,
                                            pin_memory=True, drop_last=False)

    # Loss
    self.criterion = MultiBoxLoss(self.anchors, cfg)
    self.optimizer = optim.Adam(self.net.parameters(), lr=cfg.TRAIN.LR,
                                weight_decay=cfg.TRAIN.WEIGHT_DECAY)
    self.scheduler = optim.lr_scheduler.StepLR(self.optimizer,
                                               step_size=cfg.TRAIN.SCHEDULER_STEP_SIZE,
                                               gamma=cfg.TRAIN.SCHEDULER_GAMMA)

    # Summary Writer
    self.run_id = datetime.now().strftime('%b%d_%H-%M-%S')
    if not cfg.DEBUG:
        self.save_config()
        self.save_code()
        self.writer = SummaryWriter(log_dir=os.path.join(cfg.PATH.DATA_DIR,
                                                         "runs", self.run_id))
    self.start_epoch = 0
    if cfg.TRAIN.RESUME_CHECKPOINT:
        self.start_epoch = utils.load_checkpoint(cfg.TRAIN.RESUME_CHECKPOINT,
                                                 self.net, self.optimizer)
    if torch.cuda.is_available():
        self.net = nn.DataParallel(self.net)
    self.best_IOU = 0.
def produce_batch(filepath, gt_boxes, w_h):
    # first, load the feature map
    feature_map = np.load(filepath)["fc"]
    # print("load feature map done.")
    # height times width of the feature map, i.e. the number of feature points
    height = np.shape(feature_map)[1]
    width = np.shape(feature_map)[2]
    num_feature_map = width * height
    # divide the image dims by the feature-map dims to get the strides
    img_width = w_h[0]
    img_height = w_h[1]
    w_stride = img_width / width
    h_stride = img_height / height
    # print("w_stride, h_stride", w_stride, h_stride)
    # compute anchors from the strides
    # base anchors are 9 anchors wrt a tile (0, 0, w_stride-1, h_stride-1)
    # base_anchors = generate_anchors(w_stride, h_stride, scales=np.asarray([1, 2, 4]))
    base_anchors = generate_anchors(16, 16, ratios=[0.5, 1],
                                    scales=np.asarray([1, 2, 8, 16]))
    # slice tiles according to image size and stride;
    # each 1x1x1532 feature map is mapped to a tile
    shift_x = np.arange(0, width) * w_stride
    shift_y = np.arange(0, height) * h_stride
    # this yields all x and y coordinates of the grid points
    shift_x, shift_y = np.meshgrid(shift_x, shift_y)
    # all (x, y, x, y) shift values; the coordinates repeat because base_anchors
    # come in (0, 0, w_stride-1, h_stride-1) form, so both corners shift together
    shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                        shift_x.ravel(), shift_y.ravel())).transpose()
    # adding a (1, k, 4) array to a (num_feature_map, 1, 4) array broadcasts
    # to shape (num_feature_map, k, 4)
    all_anchors = (base_anchors.reshape((1, k, 4)) +
                   shifts.reshape((1, num_feature_map, 4)).transpose((1, 0, 2)))
    total_anchors = num_feature_map * k
    all_anchors = all_anchors.reshape((total_anchors, 4))
    # only keep anchors inside image + border
    border = 0
    inds_inside = np.where(
        (all_anchors[:, 0] >= -border) &
        (all_anchors[:, 1] >= -border) &
        (all_anchors[:, 2] < img_width + border) &   # width
        (all_anchors[:, 3] < img_height + border)    # height
    )[0]
    anchors = all_anchors[inds_inside]
    if len(anchors) == 0:
        return None, None, None
    # calculate overlaps of each anchor to each gt box:
    # a matrix with shape [len(anchors) x len(gt_boxes)]
    overlaps = bbox_overlaps(anchors, gt_boxes)
    # find the gt box with the biggest overlap for each anchor,
    # and the overlap ratio; result (len(anchors),)
    argmax_overlaps = overlaps.argmax(axis=1)  # row-wise argmax: the gt box each anchor overlaps most
    max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]  # the corresponding overlap
    # find the anchor with the biggest overlap for each gt box,
    # and the overlap ratio; result (len(gt_boxes),)
    gt_argmax_overlaps = overlaps.argmax(axis=0)  # column-wise argmax: the anchor closest to each gt
    gt_max_overlaps = overlaps[gt_argmax_overlaps, np.arange(overlaps.shape[1])]  # column-wise max
    gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]  # rows matching those maxima
    # labels for anchors inside the image: 1=fg / 0=bg / -1=ignore
    labels = np.empty((len(inds_inside),), dtype=np.float32)
    labels.fill(-1)
    # per the paper, assign positive labels to two kinds of anchors:
    # (1) the anchor(s) with the highest IoU with each gt box
    # (2) anchors whose best IoU over all gt boxes is at least 0.7
    labels[gt_argmax_overlaps] = 1
    labels[max_overlaps >= .7] = 1
    # assign negative labels
    labels[max_overlaps <= .3] = 0
    # subsample positive labels if we have too many
    # num_fg = int(RPN_FG_FRACTION * RPN_BATCHSIZE)
    fg_inds = np.where(labels == 1)[0]
    # if len(fg_inds) > num_fg:
    #     disable_inds = npr.choice(
    #         fg_inds, size=(len(fg_inds) - num_fg), replace=False)
    #     labels[disable_inds] = -1
    # subsample negative labels if we have too many
    num_bg = int(len(fg_inds) * BG_FG_FRAC)
    bg_inds = np.where(labels == 0)[0]
    if len(bg_inds) > num_bg:
        # there are too many background anchors: randomly mark the excess as ignored
        disable_inds = npr.choice(bg_inds, size=(len(bg_inds) - num_bg),
                                  replace=False)
        labels[disable_inds] = -1
    # from here on, build the batch; batch_inds are all non-ignored points
    batch_inds = inds_inside[labels != -1]
    # anchors are laid out k per feature-map cell, so integer-dividing by k
    # recovers the index of the cell each anchor belongs to
    batch_inds = (batch_inds / k).astype(int)
    # map the labels back onto the full anchor set
    full_labels = unmap(labels, total_anchors, inds_inside, fill=-1)
    # batch_label_targets: n tiles of shape 1 x 1 x k
    batch_label_targets = full_labels.reshape(-1, 1, 1, 1 * k)[batch_inds]
    bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32)
    # bbox_targets = bbox_transform(anchors, gt_boxes[argmax_overlaps, :])
    # anchors labeled fg
    pos_anchors = all_anchors[inds_inside[labels == 1]]
    # encode gt boxes relative to the positive anchors
    bbox_targets = bbox_transform(pos_anchors, gt_boxes[argmax_overlaps, :][labels == 1])
    bbox_targets = unmap(bbox_targets, total_anchors, inds_inside[labels == 1], fill=0)
    batch_bbox_targets = bbox_targets.reshape(-1, 1, 1, 4 * k)[batch_inds]
    # pad one value on each side of the feature map's 2nd and 3rd axes
    padded_fcmap = np.pad(feature_map, ((0, 0), (1, 1), (1, 1), (0, 0)),
                          mode='constant')
    # squeeze out the size-1 axes of padded_fcmap; the result should be 3-D
    padded_fcmap = np.squeeze(padded_fcmap)
    batch_tiles = []
    for ind in batch_inds:
        x = ind % width
        y = int(ind / width)
        fc_3x3 = padded_fcmap[y:y + 3, x:x + 3, :]
        batch_tiles.append(fc_3x3)
    # print("produce batch done.")
    return np.asarray(batch_tiles), batch_label_targets.tolist(), batch_bbox_targets.tolist()
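
# --- Sketch (illustration, not part of the snippet above) ---------------------
# `bbox_transform` is the inverse of the decoding shown earlier: it encodes gt
# boxes as (dx, dy, dw, dh) deltas relative to the anchors. A sketch of the
# standard definition (assumed; the repo's helper may differ):
import numpy as np


def bbox_transform(anchors, gt_boxes):
    """Encode gt boxes as (dx, dy, dw, dh) deltas w.r.t. anchors (x1, y1, x2, y2)."""
    aw = anchors[:, 2] - anchors[:, 0] + 1.0
    ah = anchors[:, 3] - anchors[:, 1] + 1.0
    ax = anchors[:, 0] + 0.5 * aw
    ay = anchors[:, 1] + 0.5 * ah

    gw = gt_boxes[:, 2] - gt_boxes[:, 0] + 1.0
    gh = gt_boxes[:, 3] - gt_boxes[:, 1] + 1.0
    gx = gt_boxes[:, 0] + 0.5 * gw
    gy = gt_boxes[:, 1] + 0.5 * gh

    return np.stack([(gx - ax) / aw, (gy - ay) / ah,
                     np.log(gw / aw), np.log(gh / ah)], axis=1)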
def produce_batch(filepath, gt_boxes, scale):
    img = load_img(filepath)
    img_width = np.shape(img)[1] * scale[1]
    img_height = np.shape(img)[0] * scale[0]
    img = img.resize((int(img_width), int(img_height)))
    # feed image to pretrained model and get feature map
    img = img_to_array(img)
    img = np.expand_dims(img, axis=0)
    feature_map = pretrained_model.predict(img)
    height = np.shape(feature_map)[1]
    width = np.shape(feature_map)[2]
    num_feature_map = width * height
    # calculate output w, h stride
    w_stride = img_width / width
    h_stride = img_height / height
    # generate base anchors according to output stride.
    # base anchors are 9 anchors wrt a tile (0, 0, w_stride-1, h_stride-1)
    base_anchors = generate_anchors(w_stride, h_stride)
    # slice tiles according to image size and stride;
    # each 1x1x1532 feature map is mapped to a tile
    shift_x = np.arange(0, width) * w_stride
    shift_y = np.arange(0, height) * h_stride
    shift_x, shift_y = np.meshgrid(shift_x, shift_y)
    shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                        shift_x.ravel(), shift_y.ravel())).transpose()
    # apply base anchors to all tiles to get num_feature_map*9 anchors
    all_anchors = (base_anchors.reshape((1, 9, 4)) +
                   shifts.reshape((1, num_feature_map, 4)).transpose((1, 0, 2)))
    total_anchors = num_feature_map * 9
    all_anchors = all_anchors.reshape((total_anchors, 4))
    # only keep anchors inside image + border
    border = 0
    inds_inside = np.where(
        (all_anchors[:, 0] >= -border) &
        (all_anchors[:, 1] >= -border) &
        (all_anchors[:, 2] < img_width + border) &   # width
        (all_anchors[:, 3] < img_height + border)    # height
    )[0]
    anchors = all_anchors[inds_inside]
    # calculate overlaps of each anchor to each gt box:
    # a matrix with shape [len(anchors) x len(gt_boxes)]
    overlaps = bbox_overlaps(anchors, gt_boxes)
    # find the gt box with the biggest overlap for each anchor,
    # and the overlap ratio; result (len(anchors),)
    argmax_overlaps = overlaps.argmax(axis=1)
    max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]
    # find the anchor with the biggest overlap for each gt box,
    # and the overlap ratio; result (len(gt_boxes),)
    gt_argmax_overlaps = overlaps.argmax(axis=0)
    gt_max_overlaps = overlaps[gt_argmax_overlaps, np.arange(overlaps.shape[1])]
    gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]
    # labels, 1=fg/0=bg/-1=ignore
    labels = np.empty((len(inds_inside),), dtype=np.float32)
    labels.fill(-1)
    # set positive labels, defined in the paper, section 3.1.2:
    # We assign a positive label to two kinds of anchors: (i) the
    # anchor/anchors with the highest Intersection-over-Union (IoU)
    # overlap with a ground-truth box, or (ii) an anchor that has an
    # IoU overlap higher than 0.7 with any gt box
    labels[gt_argmax_overlaps] = 1
    labels[max_overlaps >= .7] = 1
    # set negative labels
    labels[max_overlaps <= .3] = 0
    # subsample positive labels if we have too many
    # num_fg = int(RPN_FG_FRACTION * RPN_BATCHSIZE)
    fg_inds = np.where(labels == 1)[0]
    # if len(fg_inds) > num_fg:
    #     disable_inds = npr.choice(
    #         fg_inds, size=(len(fg_inds) - num_fg), replace=False)
    #     labels[disable_inds] = -1
    # subsample negative labels if we have too many
    num_bg = int(len(fg_inds) * BG_FG_FRAC)
    bg_inds = np.where(labels == 0)[0]
    if len(bg_inds) > num_bg:
        disable_inds = npr.choice(bg_inds, size=(len(bg_inds) - num_bg),
                                  replace=False)
        labels[disable_inds] = -1
    # NOTE: this assignment was commented out in the garbled original, which
    # would leave batch_inds undefined; it must run.
    batch_inds = inds_inside[labels != -1]
    batch_inds = (batch_inds / k).astype(int)
    full_labels = unmap(labels, total_anchors, inds_inside, fill=-1)
    batch_label_targets = full_labels.reshape(-1, 1, 1, 1 * k)[batch_inds]
    bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32)
    # bbox_targets = bbox_transform(anchors, gt_boxes[argmax_overlaps, :])
    pos_anchors = all_anchors[inds_inside[labels == 1]]
    bbox_targets = bbox_transform(pos_anchors, gt_boxes[argmax_overlaps, :][labels == 1])
    bbox_targets = unmap(bbox_targets, total_anchors, inds_inside[labels == 1], fill=0)
    batch_bbox_targets = bbox_targets.reshape(-1, 1, 1, 4 * k)[batch_inds]
    padded_fcmap = np.pad(feature_map, ((0, 0), (1, 1), (1, 1), (0, 0)),
                          mode='constant')
    padded_fcmap = np.squeeze(padded_fcmap)
    batch_tiles = []
    for ind in batch_inds:
        x = ind % width
        y = int(ind / width)
        fc_3x3 = padded_fcmap[y:y + 3, x:x + 3, :]
        batch_tiles.append(fc_3x3)
    return np.asarray(batch_tiles), batch_label_targets.tolist(), batch_bbox_targets.tolist()
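
# --- Sketch (illustration, not part of the snippet above) ---------------------
# `bbox_overlaps` computes the pairwise IoU matrix used by both labeling passes
# above; a minimal vectorized sketch, assuming the conventional signature
# ([N, 4] anchors x [K, 4] gt boxes -> [N, K] IoU):
import numpy as np


def bbox_overlaps(anchors, gt_boxes):
    """Pairwise IoU between anchors [N, 4] and gt boxes [K, 4] in (x1, y1, x2, y2)."""
    ix1 = np.maximum(anchors[:, None, 0], gt_boxes[None, :, 0])
    iy1 = np.maximum(anchors[:, None, 1], gt_boxes[None, :, 1])
    ix2 = np.minimum(anchors[:, None, 2], gt_boxes[None, :, 2])
    iy2 = np.minimum(anchors[:, None, 3], gt_boxes[None, :, 3])
    iw = np.maximum(ix2 - ix1 + 1, 0)
    ih = np.maximum(iy2 - iy1 + 1, 0)
    inter = iw * ih
    area_a = ((anchors[:, 2] - anchors[:, 0] + 1) *
              (anchors[:, 3] - anchors[:, 1] + 1))[:, None]
    area_g = ((gt_boxes[:, 2] - gt_boxes[:, 0] + 1) *
              (gt_boxes[:, 3] - gt_boxes[:, 1] + 1))[None, :]
    return inter / (area_a + area_g - inter)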
def gen(dataset, config, shuffle=True, batch_size=1):
    """
    shuffle: shuffle image every epoch
    return:
    - input_image
    - input_image_meta
    - input_rpn_match
    - input_rpn_bbox
    - input_gt_class_ids
    - input_gt_boxes
    """
    anchors = utils.generate_anchors(config.ANCHOR_SCALES, config.ANCHOR_RATIOS,
                                     config.ANCHOR_STRIDE, config.BACKBONE_SHAPES,
                                     config.BACKBONE_STRIDES)
    b = 0  # batch index
    image_ids = np.copy(dataset.image_ids)
    # print(image_ids)
    error_count = 0
    index = -1
    while True:
        try:
            index = (index + 1) % len(image_ids)
            if shuffle and index == 0:
                np.random.shuffle(image_ids)
            image_id = image_ids[index]
            input_image, input_image_meta, input_gt_class_ids, input_gt_boxes = \
                load_image_gt(dataset, config, image_id)
            # print(input_image_meta)
            # print(input_gt_class_ids)
            if not np.any(input_gt_class_ids > 0):
                continue

            # RPN targets
            rpn_match, rpn_bbox = RPN.build_targets(input_image.shape, anchors,
                                                    input_gt_class_ids,
                                                    input_gt_boxes, config)
            if b == 0:  # init batch arrays
                batch_image = np.zeros((batch_size,) + input_image.shape,
                                       dtype=np.float32)
                batch_image_meta = np.zeros((batch_size,) + input_image_meta.shape,
                                            dtype=input_image_meta.dtype)
                batch_gt_class_ids = np.zeros((batch_size, config.MAX_GT_INSTANCES),
                                              dtype=np.int32)
                batch_gt_boxes = np.zeros((batch_size, config.MAX_GT_INSTANCES, 4),
                                          dtype=np.float32)
                batch_rpn_match = np.zeros([batch_size, anchors.shape[0], 1],
                                           dtype=rpn_match.dtype)
                batch_rpn_bbox = np.zeros([batch_size, config.RPN_TRAIN_ANCHORS_PER_IMAGE, 4],
                                          dtype=rpn_bbox.dtype)
            if input_gt_boxes.shape[0] > config.MAX_GT_INSTANCES:
                ids = np.random.choice(np.arange(input_gt_boxes.shape[0]),
                                       config.MAX_GT_INSTANCES, replace=False)
                input_gt_class_ids = input_gt_class_ids[ids]
                input_gt_boxes = input_gt_boxes[ids]
            batch_image[b] = utils.mold_image(input_image.astype(np.float32), config)
            batch_image_meta[b] = input_image_meta
            batch_gt_class_ids[b, :input_gt_class_ids.shape[0]] = input_gt_class_ids
            batch_gt_boxes[b, :input_gt_boxes.shape[0]] = input_gt_boxes
            batch_rpn_match[b] = rpn_match[:, np.newaxis]
            batch_rpn_bbox[b] = rpn_bbox
            b += 1
            if b >= batch_size:
                """
                inputs = [batch_image, batch_image_meta, batch_rpn_match,
                          batch_gt_boxes, batch_gt_class_ids, batch_gt_boxes,
                          batch_gt_masks]
                outputs = []
                """
                inputs = [batch_image, batch_image_meta, batch_rpn_match,
                          batch_rpn_bbox, batch_gt_class_ids, batch_gt_boxes]
                outputs = []
                yield inputs, outputs
                b = 0
        except (GeneratorExit, KeyboardInterrupt):
            raise
        except:
            # Log it and skip the image
            print("Error processing image: ", image_id)
            error_count += 1
            if error_count > 5:
                raise
def build(self, mode, config):
    # Build anchors, one set per FPN level.
    anchors = []
    for i, scale in enumerate(config.ANCHOR_SCALES):
        anchors.append(
            utils.generate_anchors(scales=scale,
                                   ratios=config.ANCHOR_RATIOS,
                                   shape=[8 * 2 ** i, 8 * 2 ** i],
                                   feature_stride=config.FEATURE_STRIDE / (2 ** i),
                                   anchor_stride=1))
    self.anchors = np.concatenate(anchors, axis=0)

    # Define the input layers.
    input_image = KL.Input(shape=config.IMAGE_SHAPE, name="input_image")
    if mode != 'detection':
        input_attribute = KL.Input(shape=[config.NUM_ATTRIBUTE],
                                   name="input_features")

    # Backbone: darknet53.
    S1, S2, S3, S4, S5 = darknet_graph(input_image, architecture='darknet53')

    def output_detection_layers(x, num_filters, out_filters, name):
        # Batch normalization is deliberately left out of these heads.
        for i in range(2):
            conv_name_base = "last_conv_" + name + "_" + str(i)
            x = KL.Conv2D(num_filters, (1, 1), padding="SAME",
                          name=conv_name_base + '_a', use_bias=True)(x)
            x = KL.LeakyReLU(alpha=0.1)(x)
            x = KL.Conv2D(num_filters * 2, (3, 3), padding="SAME",
                          name=conv_name_base + '_b', use_bias=True)(x)
            x = KL.LeakyReLU(alpha=0.1)(x)
        x = KL.Conv2D(num_filters, (1, 1), padding="SAME",
                      name=conv_name_base + "_out", use_bias=True)(x)
        x = KL.LeakyReLU(alpha=0.1)(x)
        conv_name_base = "detection_head_" + name
        y = KL.Conv2D(num_filters * 2, (3, 3), padding="SAME",
                      name=conv_name_base + 'a', use_bias=True)(x)
        y = KL.LeakyReLU(alpha=0.1)(y)
        y = KL.Conv2D(out_filters, (1, 1), padding="SAME",
                      name=conv_name_base + 'b', use_bias=True)(y)
        y = KL.LeakyReLU(alpha=0.1)(y)
        return x, y

    # FPN: top-down pathway.
    # stage 1
    x, y1 = output_detection_layers(
        S5, num_filters=512,
        out_filters=len(config.ANCHOR_SCALES[0]) * 3 * (4 + 1), name="fpn1")
    # stage 2
    x = KL.UpSampling2D(2)(x)
    x = KL.Concatenate()([x, S4])
    x, y2 = output_detection_layers(
        x, num_filters=256,
        out_filters=len(config.ANCHOR_SCALES[1]) * 3 * (4 + 1), name="fpn2")
    # stage 3
    x = KL.UpSampling2D(2)(x)
    x = KL.Concatenate()([x, S3])
    x, y3 = output_detection_layers(
        x, num_filters=128,
        out_filters=len(config.ANCHOR_SCALES[2]) * 3 * (4 + 1), name="fpn3")

    # Detection outputs: 3 * (4 + 1) values per anchor, flattened per level.
    y1 = KL.Reshape((-1, 5))(y1)
    y2 = KL.Reshape((-1, 5))(y2)
    y3 = KL.Reshape((-1, 5))(y3)
    detection = KL.Concatenate(name="final_detection", axis=1)([y1, y2, y3])

    # Image feature from the last FPN stage.
    image_feature = x

    if mode == 'training':
        # Ground-truth inputs. Axis 0 of `detection` is the batch dimension,
        # so the anchor count lives on axis 1.
        num_anchors = K.int_shape(detection)[1]
        gt_bbox = KL.Input(shape=[num_anchors, 4], name="ground_truth_bbox")
        gt_bbox_object = KL.Input(shape=[num_anchors],
                                  name="ground_truth_bbox_object")
        gt_object = KL.Input(shape=[num_anchors], name="ground_truth_object")

        # Define the losses.
        object_loss = KL.Lambda(lambda x: yolo_object_loss(*x),
                                name='yolo_object_loss')([detection, gt_object])
        bbox_loss = KL.Lambda(lambda x: yolo_bbox_loss(*x),
                              name='yolo_bbox_loss')(
                                  [detection, gt_bbox, gt_bbox_object])
        bbox_object_loss = KL.Lambda(lambda x: yolo_bbox_object_loss(*x),
                                     name='yolo_bbox_object_loss')(
                                         [detection, gt_bbox_object])
        return KM.Model(
            [input_image, input_attribute, gt_object, gt_bbox, gt_bbox_object],
            [image_feature, detection, object_loss, bbox_loss, bbox_object_loss])

    if mode == 'detection':

        def transform_detection(detection):
            # Split (to, tx, ty, tw, th) and squash the objectness score and
            # the center offsets with a sigmoid.
            to = K.expand_dims(detection[..., 0], axis=-1)
            to = KL.Activation('sigmoid')(to)
            tx = K.expand_dims(detection[..., 1], axis=-1)
            ty = K.expand_dims(detection[..., 2], axis=-1)
            tw = K.expand_dims(detection[..., 3], axis=-1)
            th = K.expand_dims(detection[..., 4], axis=-1)
            tx = KL.Activation('sigmoid')(tx)
            ty = KL.Activation('sigmoid')(ty)
            return KL.Concatenate(name="final_transformed_detection",
                                  axis=-1)([to, tx, ty, tw, th])

        detection = KL.Lambda(transform_detection,
                              name="transform_final_detection")(detection)
        return KM.Model([input_image], [detection])
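A hedged back-of-the-envelope check, not original code: each level reshapes its head to (-1, 5), so the detection tensor carries side * side * len(scales) * 3 anchor rows per level, which should agree with `self.anchors` if `utils.generate_anchors` emits side * side * len(scales) * len(ratios) anchors per level and len(config.ANCHOR_RATIOS) == 3, as the out_filters expression implies.

# Hypothetical count check; `detector` stands for the instance whose build()
# ran above, and the per-level anchor formula is an assumption.
expected = sum((8 * 2 ** i) ** 2 * len(scales) * 3
               for i, scales in enumerate(config.ANCHOR_SCALES))
assert expected == detector.anchors.shape[0]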
def data_generator():
    ann_file = '{}/annotations/instances_{}.json'.format(config.DATASET_DIR,
                                                         config.DATASET_TYPE)
    coco = COCO(ann_file)
    categories = coco.loadCats(coco.getCatIds())
    nms = [cat['name'] for cat in categories]
    print('COCO categories: \n{}\n'.format(' '.join(nms)))
    img_ids = coco.getImgIds()
    all_anchors = utils.generate_anchors()
    while True:
        rand = np.random.randint(0, len(img_ids))
        img_info = coco.loadImgs(img_ids[rand])[0]
        # scipy.ndimage.imread was removed in SciPy 1.2; imageio.imread is a
        # drop-in replacement here (assumes `import imageio` and `import os`;
        # os.path.join replaces the hard-coded Windows separators).
        img = imageio.imread(os.path.join(config.DATASET_DIR,
                                          config.DATASET_TYPE,
                                          img_info['file_name']))
        img = img.astype(np.float32) / 255.
        ratio, img, offset = utils.resize_keep_ratio(img, (1024, 1024))
        ann_ids = coco.getAnnIds(imgIds=img_info['id'], iscrowd=0)
        anns = coco.loadAnns(ann_ids)
        bboxs = [ann['bbox'] for ann in anns]
        bboxs = np.vstack(bboxs)
        # Offset category ids by one to reserve class 0 for the background.
        cls = np.array([ann['category_id'] + 1 for ann in anns])
        masks = np.array([
            utils.annToMask(ann, img_info['height'], img_info['width'])
            for ann in anns
        ])
        # Crop each mask to its box and resize it to the desired output shape.
        bboxs_ind = bboxs.astype(int)  # np.int is removed in NumPy >= 1.24
        masks = np.array([
            cv2.resize(
                mask[bboxs_ind[i, 1]:bboxs_ind[i, 1] + bboxs_ind[i, 3],
                     bboxs_ind[i, 0]:bboxs_ind[i, 0] + bboxs_ind[i, 2]],
                (config.MASK_OUTPUT_SHAPE, config.MASK_OUTPUT_SHAPE))
            for i, mask in enumerate(masks)
        ])
        bboxs = bboxs * ratio
        bboxs[:, :2] += offset
        bboxs_rpn = bboxs
        # Pad or trim all labels to MAX_GT_TRAIN_INSTANCES so they can be batched.
        if bboxs.shape[0] > config.MAX_GT_TRAIN_INSTANCES:
            valid_label_range = config.MAX_GT_TRAIN_INSTANCES
            bboxs = bboxs[:config.MAX_GT_TRAIN_INSTANCES, :]
            cls = cls[:config.MAX_GT_TRAIN_INSTANCES]
            masks = masks[:config.MAX_GT_TRAIN_INSTANCES, :, :]
        else:
            valid_label_range = bboxs.shape[0]
            bboxs = np.pad(bboxs,
                           ((0, config.MAX_GT_TRAIN_INSTANCES - bboxs.shape[0]),
                            (0, 0)),
                           mode='constant')
            cls = np.pad(cls, (0, config.MAX_GT_TRAIN_INSTANCES - cls.shape[0]),
                         mode='constant')
            masks = np.pad(masks,
                           ((0, config.MAX_GT_TRAIN_INSTANCES - masks.shape[0]),
                            (0, 0), (0, 0)),
                           mode='constant')
        # Precompute the RPN targets.
        anchor_types, matches = utils.generate_anchor_types(all_anchors, bboxs_rpn)
        rpn_positive_mask, rpn_mask = utils.get_mask(anchor_types)
        rpn_labels = utils.generate_rpn_labels(anchor_types, rpn_mask)
        rpn_deltas = utils.generate_rpn_deltas(all_anchors, bboxs_rpn,
                                               rpn_positive_mask, matches)
        rpn_positive_range = rpn_deltas.shape[0]
        # Pad the positive targets up to the fixed per-image anchor count.
        rpn_deltas = np.pad(
            rpn_deltas,
            ((0, config.RPN_ANCHORS_TRAIN_PER_IMAGE - rpn_positive_range), (0, 0)),
            'constant')
        rpn_positive_mask = np.pad(
            rpn_positive_mask,
            (0, config.RPN_ANCHORS_TRAIN_PER_IMAGE - rpn_positive_range),
            'constant', constant_values=-1)
        if config.DEBUG:
            fig = plt.figure()
            ax = fig.add_subplot(111)
            plt.imshow(img)
            for bbox in bboxs:
                ax.add_patch(patches.Rectangle((bbox[0], bbox[1]), bbox[2],
                                               bbox[3], edgecolor="red",
                                               fill=False))
            for m in matches:
                ax.add_patch(patches.Rectangle(
                    (all_anchors[m][0], all_anchors[m][1]),
                    all_anchors[m][2], all_anchors[m][3],
                    edgecolor="blue", fill=False))
            plt.show()
        # RPN masks are precomputed here so they can be built on the CPU side
        # in the data-loading threads.
        yield (img, bboxs, rpn_labels, rpn_deltas, rpn_mask,
               rpn_positive_range, rpn_positive_mask, cls, masks,
               valid_label_range)
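A minimal smoke test, again an assumption rather than original code; it relies on the same module-level `config` the generator itself reads.

# Hypothetical smoke test for data_generator().
g = data_generator()
(img, bboxs, rpn_labels, rpn_deltas, rpn_mask, rpn_positive_range,
 rpn_positive_mask, cls, masks, valid_label_range) = next(g)
print(img.shape, bboxs.shape, masks.shape, valid_label_range)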
def produce_batch(feature_map, gt_boxes, h_w=None, category=None):
    height = np.shape(feature_map)[1]
    width = np.shape(feature_map)[2]
    num_feature_map = width * height
    w_stride = h_w[1] / width
    h_stride = h_w[0] / height
    # Base anchors (typically 9) are defined w.r.t. a single tile
    # (0, 0, w_stride - 1, h_stride - 1).
    base_anchors = generate_anchors(w_stride, h_stride)
    anchors_num = base_anchors.shape[0]
    shift_x = np.arange(0, width) * w_stride
    shift_y = np.arange(0, height) * h_stride
    shift_x, shift_y = np.meshgrid(shift_x, shift_y)
    shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), shift_x.ravel(),
                        shift_y.ravel())).transpose()
    all_anchors = (base_anchors.reshape((1, anchors_num, 4)) +
                   shifts.reshape((1, num_feature_map, 4)).transpose((1, 0, 2)))
    total_anchors = num_feature_map * anchors_num
    all_anchors = all_anchors.reshape((total_anchors, 4))
    # Run the trained RPN to get objectness scores and box deltas.
    res = rpn_model.query_cnn(feature_map)
    scores = res[0].reshape(-1, 1)
    deltas = np.reshape(res[1], (-1, 4))
    # Convert the deltas into absolute box coordinates and clip the boxes
    # to the image extent.
    proposals = bbox_transform_inv(all_anchors, deltas)
    proposals = clip_boxes(proposals, (h_w[0], h_w[1]))
    # Remove small boxes (the threshold here is 40 pixels).
    keep = filter_boxes(proposals, small_box_threshold)
    proposals = proposals[keep, :]
    scores = scores[keep]
    # Sort by score and keep only the top 6000.
    pre_nms_topN = 6000
    order = scores.ravel().argsort()[::-1]
    if pre_nms_topN > 0:
        order = order[:pre_nms_topN]
    proposals = proposals[order, :]
    scores = scores[order]
    # Apply NMS to the top 6000, then keep the top 300.
    post_nms_topN = 300
    keep = py_cpu_nms(np.hstack((proposals, scores)), 0.7)
    if post_nms_topN > 0:
        keep = keep[:post_nms_topN]
    proposals = proposals[keep, :]
    scores = scores[keep]
    # Add the ground-truth boxes to the proposals as well.
    proposals = np.vstack((proposals, gt_boxes))
    # Compute overlaps between the proposals and the ground-truth boxes.
    overlaps = bbox_overlaps(proposals, gt_boxes)
    gt_assignment = overlaps.argmax(axis=1)
    max_overlaps = overlaps.max(axis=1)
    # labels = gt_labels[gt_assignment]  # would assign per-proposal class labels
    # Subsample foreground and background regions.
    fg_inds = np.where(max_overlaps >= FG_THRESH)[0]
    fg_rois_per_this_image = min(int(BATCH * FG_FRAC), fg_inds.size)
    # Sample foreground regions without replacement.
    if fg_inds.size > 0:
        fg_inds = npr.choice(fg_inds, size=fg_rois_per_this_image, replace=False)
    bg_inds = np.where((max_overlaps < BG_THRESH_HI) &
                       (max_overlaps >= BG_THRESH_LO))[0]
    bg_rois_per_this_image = min(BATCH - fg_rois_per_this_image, bg_inds.size)
    # Sample background regions without replacement.
    if bg_inds.size > 0:
        bg_inds = npr.choice(bg_inds, size=bg_rois_per_this_image, replace=False)
    # The indices we are keeping (both fg and bg).
    keep_inds = np.append(fg_inds, bg_inds)
    # Select the sampled values from the various arrays.
    rois = proposals[keep_inds]
    gt_rois = gt_boxes[gt_assignment[keep_inds]]
    targets = bbox_transform(rois, gt_rois)
    # Scatter the regression targets into the 200-way category slots.
    rois_num = targets.shape[0]
    batch_box = np.zeros((rois_num, 200, 4))
    for i in range(rois_num):
        batch_box[i, category] = targets[i]
    batch_box = np.reshape(batch_box, (rois_num, -1))
    # One-hot ground-truth category for each sampled RoI.
    batch_categories = np.zeros((rois_num, 200, 1))
    for i in range(rois_num):
        batch_categories[i, category] = 1
    batch_categories = np.reshape(batch_categories, (rois_num, -1))
    return rois, batch_box, batch_categories
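The meshgrid shift trick above is the core of dense anchor generation; the following self-contained sketch (an illustration on toy data, not the project's generate_anchors) shows the same broadcast in isolation.

import numpy as np

# Tile K base anchors across an H x W feature map via broadcast addition.
def tile_anchors(base_anchors, height, width, w_stride, h_stride):
    shift_x, shift_y = np.meshgrid(np.arange(width) * w_stride,
                                   np.arange(height) * h_stride)
    shifts = np.stack([shift_x.ravel(), shift_y.ravel(),
                       shift_x.ravel(), shift_y.ravel()], axis=1)
    # (H*W, 1, 4) + (1, K, 4) -> (H*W, K, 4) -> (H*W*K, 4)
    return (shifts[:, None, :] + base_anchors[None, :, :]).reshape(-1, 4)

base = np.array([[-8., -8., 8., 8.], [-16., -8., 16., 8.]])  # two toy anchors
print(tile_anchors(base, height=2, width=3, w_stride=16, h_stride=16).shape)
# -> (12, 4)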