def _get_surpressed_boxes(self, img_batch, regression, classification, anchors): self.regressBoxes = BBoxTransform() self.clipBoxes = ClipBoxes() transformed_anchors = self.regressBoxes.forward(anchors, regression) transformed_anchors = self.clipBoxes.forward(transformed_anchors, img_batch) scores = torch.max(classification, dim=2, keepdim=True)[0] scores_over_thresh = (scores>0.05)[0, :, 0] if scores_over_thresh.sum() == 0: # no boxes to NMS, just return return [torch.zeros(0), torch.zeros(0), torch.zeros(0, 4)] classification = classification[:, scores_over_thresh, :] transformed_anchors = transformed_anchors[:, scores_over_thresh, :] scores = scores[:, scores_over_thresh, :] anchors_nms_idx = nms(transformed_anchors, scores, overlap=0.5) nms_scores, nms_class = classification[0, anchors_nms_idx, :].max(dim=1) outputs = { 'bboxes': transformed_anchors[0, anchors_nms_idx, :], 'pred_class': nms_class, 'prob': nms_scores } return outputs
def __init__(self, num_classes, block, layers): self.inplanes = 64 super(ResNet, self).__init__() self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) self.bn1 = nn.BatchNorm2d(64) self.relu = nn.ReLU(inplace=True) self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) self.layer1 = self._make_layer(block, 64, layers[0]) self.layer2 = self._make_layer(block, 128, layers[1], stride=2) self.layer3 = self._make_layer(block, 256, layers[2], stride=2) self.layer4 = self._make_layer(block, 512, layers[3], stride=2) if block == BasicBlock: fpn_sizes = [self.layer2[layers[1] - 1].conv2.out_channels, self.layer3[layers[2] - 1].conv2.out_channels, self.layer4[layers[3] - 1].conv2.out_channels] elif block == Bottleneck: fpn_sizes = [self.layer2[layers[1] - 1].conv3.out_channels, self.layer3[layers[2] - 1].conv3.out_channels, self.layer4[layers[3] - 1].conv3.out_channels] else: raise ValueError(f"Block type {block} not understood") self.fpn = PyramidFeatures(fpn_sizes[0], fpn_sizes[1], fpn_sizes[2]) self.regressionModel = RegressionModel(256) self.classificationModel = ClassificationModel(256, num_classes=num_classes) self.anchors = Anchors() self.regressBoxes = BBoxTransform() self.clipBoxes = ClipBoxes() self.focalLoss = losses.FocalLoss() for m in self.modules(): if isinstance(m, nn.Conv2d): n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels m.weight.data.normal_(0, math.sqrt(2. / n)) elif isinstance(m, nn.BatchNorm2d): m.weight.data.fill_(1) m.bias.data.zero_() prior = 0.01 self.classificationModel.output.weight.data.fill_(0) self.classificationModel.output.bias.data.fill_(-math.log((1.0 - prior) / prior)) self.regressionModel.output.weight.data.fill_(0) self.regressionModel.output.bias.data.fill_(0) self.freeze_bn()
def __init__(self, num_classes, block, layers, n_head=1, attention_type='concat', shot_mode='mean', num_way=2, num_shot=5, pos_encoding=True, pretrained=False): super(ceaa_retinanet, self).__init__() self.model_path = 'data/pretrained_model/resnet50_caffe.pth' self.pretrained = pretrained self.inplanes = 64 self.n_head = n_head self.attention_type = attention_type self.shot_mode = shot_mode self.num_shot = num_shot self.pos_encoding = pos_encoding self.support_im_size = 320 self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) self.bn1 = nn.BatchNorm2d(64) self.relu = nn.ReLU(inplace=True) self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) self.layer1 = self._make_layer(block, 64, layers[0]) self.layer2 = self._make_layer(block, 128, layers[1], stride=2) self.layer3 = self._make_layer(block, 256, layers[2], stride=2) self.layer4 = self._make_layer(block, 512, layers[3], stride=2) if self.pretrained == True: print("Loading pretrained weights from %s" % (self.model_path)) state_dict = torch.load(self.model_path) self.load_state_dict({ k: v for k, v in state_dict.items() if k in self.state_dict() }) def set_bn_fix(m): classname = m.__class__.__name__ if classname.find('BatchNorm') != -1: for p in m.parameters(): p.requires_grad = False self.apply(set_bn_fix) if block == BasicBlock: fpn_sizes = [ self.layer2[layers[1] - 1].conv2.out_channels, self.layer3[layers[2] - 1].conv2.out_channels, self.layer4[layers[3] - 1].conv2.out_channels ] elif block == Bottleneck: fpn_sizes = [ self.layer2[layers[1] - 1].conv3.out_channels, self.layer3[layers[2] - 1].conv3.out_channels, self.layer4[layers[3] - 1].conv3.out_channels ] else: raise ValueError(f"Block type {block} not understood") self.fpn = PyramidFeatures(fpn_sizes[0], fpn_sizes[1], fpn_sizes[2]) # [512, 1024, 2048] self.fpn_dim = 256 attention_output_dim = 256 if attention_type == 'product' else 512 self.regressionModel = RegressionModel(attention_output_dim) self.classificationModel = ClassificationModel(attention_output_dim, num_classes=num_classes) self.anchors = Anchors([4, 5, 6, 7]) self.regressBoxes = BBoxTransform() self.clipBoxes = ClipBoxes() self.focalLoss = losses.FocalLoss() # weights initialization for m in self.modules(): if isinstance(m, nn.Conv2d): n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels m.weight.data.normal_(0, math.sqrt(2. / n)) elif isinstance(m, nn.BatchNorm2d): m.weight.data.fill_(1) m.bias.data.zero_() prior = 0.01 self.classificationModel.output.weight.data.fill_(0) self.classificationModel.output.bias.data.fill_(-math.log( (1.0 - prior) / prior)) self.regressionModel.output.weight.data.fill_(0) self.regressionModel.output.bias.data.fill_(0) self.freeze_bn() self.resnet_base = nn.Sequential(self.conv1, self.bn1, self.relu, self.maxpool) # querys, keys Q_list = [] K_list = [] self.d_k = 64 for i in range(self.n_head): Q_weight = nn.Linear(self.fpn_dim, self.d_k) K_weight = nn.Linear(self.fpn_dim, self.d_k) init.normal_(Q_weight.weight, std=0.01) init.constant_(Q_weight.bias, 0) init.normal_(K_weight.weight, std=0.01) init.constant_(K_weight.bias, 0) Q_list.append(Q_weight) K_list.append(K_weight) self.pyramid_Q_layers = nn.ModuleList(Q_list) self.pyramid_K_layers = nn.ModuleList(K_list) if self.pos_encoding: pel_4 = PositionalEncoding(d_model=256, max_len=20 * 20) pel_5 = PositionalEncoding(d_model=256, max_len=10 * 10) pel_6 = PositionalEncoding(d_model=256, max_len=5 * 5) pel_7 = PositionalEncoding(d_model=256, max_len=3 * 3) self.pos_encoding_layers = nn.ModuleList([pel_4, pel_5, pel_6, pel_7]) if n_head != 1: self.multihead_layer = nn.Linear(n_head * feature_size, feature_size)
class ObjectDetection(ImageAnalysis): def __init__(self, model, classes, means, sdevs, train_loader=None, val_loader=None, line_colors=None): super().__init__(model, classes, train_loader, val_loader, means, sdevs) self.line_col = line_colors self.epoch_now = 1 def run_singletask_model(self, settings, split, loader, optimizer=False): loss = 0 accuracies = [] for i, batch in enumerate(loader): if optimizer: optimizer.zero_grad() images = Variable(batch[0]) labels = Variable(batch[1]) if torch.cuda.is_available: images = images.cuda() labels = labels.cuda() losses, preds = self._get_batch_loss_and_preds(images, labels, settings['criterion']) batch_loss = losses[0] + losses[1] if optimizer: batch_loss.backward() optimizer.step() loss += batch_loss # accuracies.append(analysis_utils.get_mean_acc(preds, labels)) if (i+1) % settings['report_interval'][split] == 0: print(f"{split}: [{i} out of {len(loader)}]\nClassification: {losses[0]/(i+1):.4f}\nRegression: {losses[1]/(i+1):.4f}") if self.visualiser and i+1==len(loader): visualise.draw_rectangles(images, preds['bboxes'], preds['pred_class'], preds['prob'], gt=labels) self._imgs_to_tensorboard(images, split) # Memory management - must be cleared else the output between train/val phase are both stored # Which leads to 2x memory use images = labels = batch_loss = None loss = loss/(i+1) return loss def _get_batch_loss_and_preds(self, images, labels, criterion): regression, classification, anchors = self.model([images, labels]) criterion = FocalLoss() loss = criterion.forward(regression, classification, anchors, labels) preds = self._get_surpressed_boxes(images, regression, classification, anchors) return loss, preds def _get_surpressed_boxes(self, img_batch, regression, classification, anchors): self.regressBoxes = BBoxTransform() self.clipBoxes = ClipBoxes() transformed_anchors = self.regressBoxes.forward(anchors, regression) transformed_anchors = self.clipBoxes.forward(transformed_anchors, img_batch) scores = torch.max(classification, dim=2, keepdim=True)[0] scores_over_thresh = (scores>0.05)[0, :, 0] if scores_over_thresh.sum() == 0: # no boxes to NMS, just return return [torch.zeros(0), torch.zeros(0), torch.zeros(0, 4)] classification = classification[:, scores_over_thresh, :] transformed_anchors = transformed_anchors[:, scores_over_thresh, :] scores = scores[:, scores_over_thresh, :] anchors_nms_idx = nms(transformed_anchors, scores, overlap=0.5) nms_scores, nms_class = classification[0, anchors_nms_idx, :].max(dim=1) outputs = { 'bboxes': transformed_anchors[0, anchors_nms_idx, :], 'pred_class': nms_class, 'prob': nms_scores } return outputs def _imgs_to_tensorboard(self, imgs, split): img = visualise.decode_image(imgs, self.means, self.sdevs) imag = torch.Tensor(img.permute(0,3,1,2)) row_views = imag[:,:3,:,:] # Grab only colour bands on image side_view = vutils.make_grid(row_views, nrow=len(img), normalize=True, scale_each=True) self.writer.add_image(f'{split}_Predicted-from-Image', side_view, self.epoch_now, dataformats='CHW') def train(self, settings): """Performs model training""" if self.loss_tracker.store_loss is True: self.loss_tracker.set_loss_file('train') if 'lr_decay_patience' in settings: lr_scheduler = ReduceLROnPlateau(settings['optimizer'], 'min', factor=settings['lr_decay'], patience=settings['lr_decay_patience'], verbose=True) for epoch in range(settings['n_epochs']): print(f"\n======= Epoch {self.epoch_now} =======\n") self.model = self.model.train() epoch_train_loss = self.run_singletask_model(settings, 'train', self.train_loader, optimizer=settings['optimizer']) # self.loss_tracker.store_epoch_loss('train', self.epoch_now, epoch_train_loss, epoch_train_accuracy) if self.val_loader is not None: self.validate(settings) if self.epoch_now % settings['save_interval'] == 0 and self.loss_tracker.store_models is True: print("Checkpoint-saving model") self.loss_tracker.save_model(self.model, epoch) # self._visualise_loss(settings, self.epoch_now, epoch_train_accuracy, 'train') # self._print_results(self.epoch_now, epoch_train_loss, epoch_train_accuracy, 'train') if 'lr_decay_epoch' in settings: if epoch in settings['lr_decay_epoch']: analysis_utils.decay_optimizer_lr(settings['optimizer'], settings['lr_decay']) print(f"\nlr decayed by {settings['lr_decay']}\n") elif 'lr_decay_patience' in settings: lr_scheduler.step(epoch_train_loss) self.epoch_now = len(self.loss_tracker.all_loss['train'])+1 if settings['shutdown'] is True: os.system("shutdown") def validate(self, settings): """For a given model, evaluation criterion, and validation loader, performs a single evaluation pass.""" self.model = self.model.eval() with torch.no_grad(): if self.loss_tracker.store_loss is True: self.loss_tracker.set_loss_file('val') epoch_val_loss, epoch_val_accuracy = self.run_singletask_model(settings, 'val', self.val_loader) self.loss_tracker.store_epoch_loss('val', self.epoch_now, epoch_val_loss, epoch_val_accuracy) self._visualise_loss(settings, self.epoch_now, epoch_val_accuracy, 'val') self._save_if_best(epoch_val_loss, self.model, settings['run_name']+'_best') self._print_results(self.epoch_now, epoch_val_loss, epoch_val_accuracy, 'val')
def __init__(self, num_classes, block, layers, pretrained=False): super(retina, self).__init__() self.model_path = 'data/pretrained_model/resnet50_caffe.pth' self.pretrained = pretrained self.inplanes = 64 self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) self.bn1 = nn.BatchNorm2d(64) self.relu = nn.ReLU(inplace=True) self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) self.layer1 = self._make_layer(block, 64, layers[0]) self.layer2 = self._make_layer(block, 128, layers[1], stride=2) self.layer3 = self._make_layer(block, 256, layers[2], stride=2) self.layer4 = self._make_layer(block, 512, layers[3], stride=2) if self.pretrained == True: print("Loading pretrained weights from %s" %(self.model_path)) state_dict = torch.load(self.model_path) self.load_state_dict({k:v for k,v in state_dict.items() if k in self.state_dict()}) def set_bn_fix(m): classname = m.__class__.__name__ if classname.find('BatchNorm') != -1: for p in m.parameters(): p.requires_grad=False self.apply(set_bn_fix) if block == BasicBlock: fpn_sizes = [self.layer2[layers[1] - 1].conv2.out_channels, self.layer3[layers[2] - 1].conv2.out_channels, self.layer4[layers[3] - 1].conv2.out_channels] elif block == Bottleneck: fpn_sizes = [self.layer2[layers[1] - 1].conv3.out_channels, self.layer3[layers[2] - 1].conv3.out_channels, self.layer4[layers[3] - 1].conv3.out_channels] else: raise ValueError(f"Block type {block} not understood") self.fpn = PyramidFeatures(fpn_sizes[0], fpn_sizes[1], fpn_sizes[2]) self.regressionModel = RegressionModel(256) self.classificationModel = ClassificationModel(256, num_classes=num_classes) self.anchors = Anchors() self.regressBoxes = BBoxTransform() self.clipBoxes = ClipBoxes() self.focalLoss = losses.FocalLoss() # weights initialization for m in self.modules(): if isinstance(m, nn.Conv2d): n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels m.weight.data.normal_(0, math.sqrt(2. / n)) elif isinstance(m, nn.BatchNorm2d): m.weight.data.fill_(1) m.bias.data.zero_() prior = 0.01 self.classificationModel.output.weight.data.fill_(0) self.classificationModel.output.bias.data.fill_(-math.log((1.0 - prior) / prior)) self.regressionModel.output.weight.data.fill_(0) self.regressionModel.output.bias.data.fill_(0) self.freeze_bn() self.resnet_base = nn.Sequential( self.conv1, self.bn1, self.relu, self.maxpool )
def __init__(self, num_classes, block, layers, attention_type, reduce_dim, beta, num_way=2, num_shot=5, pos_encoding=True, pretrained=False): super(SEPAA_retinanet, self).__init__() self.model_path = 'data/pretrained_model/resnet50_caffe.pth' self.pretrained = pretrained self.inplanes = 64 self.attention_type = attention_type self.num_shot = num_shot self.pos_encoding = pos_encoding self.support_im_size = 320 self.reduce_dim = reduce_dim self.beta = beta self.unary_gamma = 0.1 self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) self.bn1 = nn.BatchNorm2d(64) self.relu = nn.ReLU(inplace=True) self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) self.layer1 = self._make_layer(block, 64, layers[0]) self.layer2 = self._make_layer(block, 128, layers[1], stride=2) self.layer3 = self._make_layer(block, 256, layers[2], stride=2) self.layer4 = self._make_layer(block, 512, layers[3], stride=2) if self.pretrained == True: print("Loading pretrained weights from %s" % (self.model_path)) state_dict = torch.load(self.model_path) self.load_state_dict({ k: v for k, v in state_dict.items() if k in self.state_dict() }) def set_bn_fix(m): classname = m.__class__.__name__ if classname.find('BatchNorm') != -1: for p in m.parameters(): p.requires_grad = False self.apply(set_bn_fix) if block == BasicBlock: fpn_sizes = [ self.layer2[layers[1] - 1].conv2.out_channels, self.layer3[layers[2] - 1].conv2.out_channels, self.layer4[layers[3] - 1].conv2.out_channels ] elif block == Bottleneck: fpn_sizes = [ self.layer2[layers[1] - 1].conv3.out_channels, self.layer3[layers[2] - 1].conv3.out_channels, self.layer4[layers[3] - 1].conv3.out_channels ] else: raise ValueError(f"Block type {block} not understood") attention_output_dim = 256 if self.attention_type == 'product' else 512 if self.attention_type == 'product': self.fpn = PyramidFeatures( fpn_sizes[0], fpn_sizes[1], fpn_sizes[2], feature_size=attention_output_dim) # [512, 1024, 2048] else: self.fpn = PyramidFeatures(fpn_sizes[0] * 2, fpn_sizes[1] * 2, fpn_sizes[2] * 2, feature_size=attention_output_dim) self.regressionModel = RegressionModel(attention_output_dim) self.classificationModel = ClassificationModel(attention_output_dim, num_classes=num_classes) self.anchors = Anchors([4, 5, 6, 7]) self.regressBoxes = BBoxTransform() self.clipBoxes = ClipBoxes() self.focalLoss = losses.FocalLoss() # weights initialization for m in self.modules(): if isinstance(m, nn.Conv2d): n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels m.weight.data.normal_(0, math.sqrt(2. / n)) elif isinstance(m, nn.BatchNorm2d): m.weight.data.fill_(1) m.bias.data.zero_() prior = 0.01 self.classificationModel.output.weight.data.fill_(0) self.classificationModel.output.bias.data.fill_(-math.log( (1.0 - prior) / prior)) self.regressionModel.output.weight.data.fill_(0) self.regressionModel.output.bias.data.fill_(0) self.freeze_bn() self.resnet_base = nn.Sequential(self.conv1, self.bn1, self.relu, self.maxpool) # querys, keys unary_list = [] adapt_q_list = [] adapt_k_list = [] channel_k_list = [] self.fpn_dims = [512, 1024, 2048] for fpn_dim in self.fpn_dims: unary_layer = nn.Linear(fpn_dim, 1) init.normal_(unary_layer.weight, std=0.01) init.constant_(unary_layer.bias, 0) adapt_q_layer = nn.Linear(fpn_dim, reduce_dim) init.normal_(adapt_q_layer.weight, std=0.01) init.constant_(adapt_q_layer.bias, 0) adapt_k_layer = nn.Linear(fpn_dim, reduce_dim) init.normal_(adapt_k_layer.weight, std=0.01) init.constant_(adapt_k_layer.bias, 0) channel_k_layer = nn.Linear(fpn_dim, 1) init.normal_(channel_k_layer.weight, std=0.01) init.constant_(channel_k_layer.bias, 0) unary_list.append(unary_layer) adapt_q_list.append(adapt_q_layer) adapt_k_list.append(adapt_k_layer) channel_k_list.append(channel_k_layer) self.unary_layers = nn.ModuleList(unary_list) self.adapt_Q_layers = nn.ModuleList(adapt_q_list) self.adapt_K_layers = nn.ModuleList(adapt_k_list) self.channel_K_layers = nn.ModuleList(channel_k_list) if self.pos_encoding: pel_3 = PositionalEncoding(d_model=512, max_len=40 * 40) pel_4 = PositionalEncoding(d_model=1024, max_len=20 * 20) pel_5 = PositionalEncoding(d_model=2048, max_len=10 * 10) self.pos_encoding_layers = nn.ModuleList([pel_3, pel_4, pel_5])