def print_loss_log(self, start_time, iters_per_epoch, e, i,
                   class_loss, loc_loss, loss):
    """
    Prints the loss and elapsed time for each epoch
    """
    total_iter = self.num_epochs * iters_per_epoch
    cur_iter = e * iters_per_epoch + i

    elapsed = time.time() - start_time
    total_time = (total_iter - cur_iter) * elapsed / (cur_iter + 1)
    epoch_time = (iters_per_epoch - i) * elapsed / (cur_iter + 1)

    epoch_time = str(datetime.timedelta(seconds=epoch_time))
    total_time = str(datetime.timedelta(seconds=total_time))
    elapsed = str(datetime.timedelta(seconds=elapsed))

    log = "Elapsed {}/{} -- {}, Epoch [{}/{}], Iter [{}/{}], " \
          "class_loss: {:.4f}, loc_loss: {:.4f}, " \
          "loss: {:.4f}".format(elapsed, epoch_time, total_time,
                                e + 1, self.num_epochs,
                                i + 1, iters_per_epoch,
                                class_loss.item(), loc_loss.item(),
                                loss.item())

    write_print(self.output_txt, log)
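# write_print is used throughout this listing but not defined in it. A
# minimal sketch of what it is assumed to do (append a line to the log file
# and echo it to stdout); the actual helper may differ:
def write_print(path, text):
    """Append `text` to the file at `path` and print it."""
    with open(path, 'a') as f:
        f.write(text + '\n')
    print(text)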
def voc_save(all_boxes, dataset, results_path, output_txt):
    # for each class
    for class_i, class_name in enumerate(VOC_CLASSES):
        text = 'Writing {:s} VOC results file'.format(class_name)
        write_print(output_txt, text)
        filename = osp.join(results_path, class_name + '.txt')

        with open(filename, 'wt') as f:
            # get the detections for the class in each image
            for image_i, image_id in enumerate(dataset.ids):
                detections = all_boxes[class_i + 1][image_i]

                # if there are detections for the class in the image
                if len(detections) != 0:
                    for k in range(detections.shape[0]):
                        output = '{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'
                        # the VOCdevkit expects 1-based indices
                        output = output.format(image_id[2],
                                               detections[k, -1],
                                               detections[k, 0] + 1,
                                               detections[k, 1] + 1,
                                               detections[k, 2] + 1,
                                               detections[k, 3] + 1)
                        f.write(output)
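# A minimal check of the per-class detection line format written above;
# the identifier and box values here are made up for illustration:
line = '{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'.format(
    '000001', 0.912, 48.0, 240.0, 195.0, 371.0)
assert line == '000001 0.912 48.0 240.0 195.0 371.0\n'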
def load_pretrained_model(self):
    """
    loads a pre-trained model from a .pth file
    """
    self.model.load_state_dict(torch.load(
        os.path.join(self.model_save_path,
                     '{}.pth'.format(self.pretrained_model))))
    write_print(self.output_txt,
                'loaded trained model {}'.format(self.pretrained_model))
def print_network(self, model):
    """
    Prints the structure of the network and the total number of parameters
    """
    num_params = 0
    for p in model.parameters():
        num_params += p.numel()
    write_print(self.output_txt, str(model))
    write_print(self.output_txt,
                'The number of parameters: {}'.format(num_params))
def do_python_eval(results_path, dataset, output_txt, mode, use_07_metric):
    # annotation cache directory
    cache_dir = osp.join(results_path, 'annotations_cache')

    # path to VOC + year
    path = osp.join(dataset.data_path,
                    'VOC{}'.format(dataset.image_sets[0][0]))

    # path to the XML annotation folder
    annotation_path = dataset.annotation_path

    # text file containing the list of (test) images
    list_path = dataset.text_path.format(path, mode, mode)

    # the PASCAL VOC metric changed in 2010
    write_print(output_txt,
                '\nVOC07 metric? ' + ('Yes\n' if use_07_metric else 'No\n'))

    # for each class, compute the recall, precision, and AP
    aps = []
    for class_name in VOC_CLASSES:
        detection_path = osp.join(results_path, class_name + '.txt')
        recall, precision, ap = voc_eval(detection_path=detection_path,
                                         path=path,
                                         annotation_path=annotation_path,
                                         list_path=list_path,
                                         class_name=class_name,
                                         cache_dir=cache_dir,
                                         output_txt=output_txt,
                                         overlap_threshold=0.5,
                                         use_07_metric=use_07_metric)
        aps += [ap]
        write_print(output_txt, 'AP for {} = {:.4f}'.format(class_name, ap))
        pickle_file = osp.join(results_path, class_name + '_pr.pkl')
        with open(pickle_file, 'wb') as f:
            pickle.dump({'rec': recall, 'prec': precision, 'ap': ap}, f)

    write_print(output_txt, 'Mean AP = {:.4f}'.format(np.mean(aps)))
    return aps, np.mean(aps)
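# The per-class precision-recall curves are pickled above, one file per
# class. A quick way to inspect one afterwards; the results directory and
# class file name here are hypothetical:
import pickle

with open('results/aeroplane_pr.pkl', 'rb') as f:
    pr = pickle.load(f)
print('AP: {:.4f}, curve points: {}'.format(pr['ap'], len(pr['rec'])))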
def eval(self, dataset, max_per_image, score_threshold):
    num_images = len(dataset)
    all_boxes = [[[] for _ in range(num_images)]
                 for _ in range(self.class_count)]

    # prepare timers, paths, and files
    timer = {'detection': Timer(), 'nms': Timer()}
    results_path = osp.join(self.model_test_path, self.pretrained_model)
    detection_file = osp.join(results_path, 'detections.pkl')
    detect_times = []
    nms_times = []

    with torch.no_grad():
        # for each image
        for i in range(num_images):
            # get the image
            image, target, h, w = dataset.pull_item(i)
            image = to_var(image.unsqueeze(0), self.use_gpu)

            # get and time the detection
            timer['detection'].tic()
            bboxes, scores = self.model(image)
            detect_time = timer['detection'].toc(average=False)
            detect_times.append(detect_time)

            # convert to CPU arrays
            bboxes = bboxes[0].cpu().numpy()
            scores = scores[0].cpu().numpy()

            # scale each detection back up to the image
            scale = torch.Tensor([w, h, w, h]).cpu().numpy()
            bboxes *= scale

            # perform and time NMS
            timer['nms'].tic()
            for j in range(1, self.class_count):
                # get indices of scores greater than score_threshold
                selected_i = np.where(scores[:, j] > score_threshold)[0]

                # if there are scores greater than score_threshold
                if len(selected_i) > 0:
                    bboxes_i = bboxes[selected_i]
                    scores_i = scores[selected_i, j]
                    detections_i = np.hstack((bboxes_i,
                                              scores_i[:, np.newaxis]))
                    detections_i = detections_i.astype(np.float32,
                                                       copy=False)

                    keep = nms(detections=detections_i,
                               threshold=0.45,
                               force_cpu=True)
                    keep = keep[:50]
                    all_boxes[j][i] = detections_i[keep, :]
                else:
                    all_boxes[j][i] = np.empty([0, 5], dtype=np.float32)

            # if we need to limit the maximum detections per image
            if max_per_image > 0:
                # get all the scores for the image across all classes
                scores_i = np.hstack([all_boxes[j][i][:, -1]
                                      for j in range(1, self.class_count)])

                # if the number of detections is greater than max_per_image
                if len(scores_i) > max_per_image:
                    # get the score of the max_per_image-th detection
                    threshold_i = np.sort(scores_i)[-max_per_image]

                    # keep detections with score at least threshold_i
                    for j in range(1, self.class_count):
                        keep = np.where(
                            all_boxes[j][i][:, -1] >= threshold_i)[0]
                        all_boxes[j][i] = all_boxes[j][i][keep, :]

            nms_time = timer['nms'].toc(average=False)
            nms_times.append(nms_time)

            temp_string = 'detection: {:d}/{:d} {:.4f}s {:.4f}s'
            temp_string = temp_string.format(i + 1, num_images,
                                             detect_time, nms_time)
            write_print(self.output_txt, temp_string)

    with open(detection_file, 'wb') as f:
        pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL)

    write_print(self.output_txt, '\nEvaluating detections')

    # perform the evaluation
    if self.dataset == 'voc':
        voc_save(all_boxes=all_boxes,
                 dataset=dataset,
                 results_path=results_path,
                 output_txt=self.output_txt)
        aps, mAP = do_python_eval(results_path=results_path,
                                  dataset=dataset,
                                  output_txt=self.output_txt,
                                  mode='test',
                                  use_07_metric=self.use_07_metric)

    detect_times = np.asarray(detect_times)
    nms_times = np.asarray(nms_times)
    total_times = np.add(detect_times, nms_times)

    # report fps, skipping the first image
    write_print(self.output_txt,
                '\nfps[detect]: ' + str(1 / np.mean(detect_times[1:])))
    write_print(self.output_txt,
                'fps[nms]: ' + str(1 / np.mean(nms_times[1:])))
    write_print(self.output_txt,
                'fps[total]: ' + str(1 / np.mean(total_times[1:])))

    write_print(self.output_txt, '\nResults:')
    for ap in aps:
        write_print(self.output_txt, '{:.4f}'.format(ap))
    write_print(self.output_txt, '{:.4f}'.format(np.mean(aps)))
    write_print(self.output_txt, str(1 / np.mean(detect_times[1:])))
    write_print(self.output_txt, str(1 / np.mean(nms_times[1:])))
    write_print(self.output_txt, str(1 / np.mean(total_times[1:])))
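# Timer is assumed to come from the detection utilities; a minimal sketch
# of the tic/toc interface used above (the real class may differ):
import time

class Timer(object):
    """Simple tic/toc timer that can report per-call or average time."""
    def __init__(self):
        self.total_time = 0.
        self.calls = 0
        self.start_time = 0.

    def tic(self):
        self.start_time = time.time()

    def toc(self, average=True):
        diff = time.time() - self.start_time
        self.total_time += diff
        self.calls += 1
        if average:
            return self.total_time / self.calls
        return diff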
def train(self):
    """
    training process
    """
    # set the model in training mode
    self.model.train()
    self.losses = []
    iters_per_epoch = len(self.data_loader)

    # start with a trained model if one exists
    if self.pretrained_model:
        start = int(self.pretrained_model.split('/')[-1])
    else:
        start = 0

    sched = 0

    # start training
    start_time = time.time()
    for e in range(start, self.num_epochs):
        for i, (images, targets) in enumerate(tqdm(self.data_loader)):
            images = to_var(images, self.use_gpu)
            targets = [to_var(target, self.use_gpu) for target in targets]
            class_loss, loc_loss, loss = self.model_step(images, targets)

            # print out the loss log
            if (e + 1) % self.loss_log_step == 0:
                self.print_loss_log(start_time=start_time,
                                    iters_per_epoch=iters_per_epoch,
                                    e=e,
                                    i=i,
                                    class_loss=class_loss,
                                    loc_loss=loc_loss,
                                    loss=loss)
                # store scalar values rather than tensors
                self.losses.append([e, class_loss.item(),
                                    loc_loss.item(), loss.item()])

        # save the model
        if (e + 1) % self.model_save_step == 0:
            self.save_model(e)

        # reduce the learning rate according to the schedule
        num_sched = len(self.learning_sched)
        if num_sched != 0 and sched < num_sched:
            if (e + 1) == self.learning_sched[sched]:
                self.lr /= 10
                write_print(self.output_txt,
                            'Learning rate reduced to ' + str(self.lr))
                sched += 1
                self.adjust_learning_rate(optimizer=self.optimizer,
                                          gamma=self.sched_gamma,
                                          step=sched)

    # print the losses
    write_print(self.output_txt, '\n--Losses--')
    for e, class_loss, loc_loss, loss in self.losses:
        loss_string = ' {:.4f} {:.4f} {:.4f}'.format(class_loss,
                                                     loc_loss, loss)
        write_print(self.output_txt, str(e) + loss_string)
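# adjust_learning_rate is called above but not shown in this listing. A
# minimal sketch assuming it applies a gamma**step decay to the optimizer's
# learning rate, as in common SSD training code; the actual helper may
# differ, especially given the explicit self.lr /= 10 above:
def adjust_learning_rate(self, optimizer, gamma, step):
    """Scale the optimizer's learning rate by gamma**step."""
    lr = self.lr * (gamma ** step)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr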
                    help='Number of steps between model saves')

config = parser.parse_args()
args = vars(config)

output_txt = ''
if args['mode'] == 'train':
    version = str(datetime.now()).replace(':', '_')
    version = '{}_train'.format(version)
    path = args['model_save_path']
    path = osp.join(path, version)
    output_txt = osp.join(path, '{}.txt'.format(version))
elif args['mode'] == 'test':
    model = args['pretrained_model'].split('/')
    version = '{}_test_{}'.format(model[0], model[1])
    path = args['model_test_path']
    path = osp.join(path, model[0])
    output_txt = osp.join(path, '{}.txt'.format(version))

mkdir(path)
save_config(path, version, args)

write_print(output_txt, '------------ Options -------------')
for k, v in args.items():
    write_print(output_txt, '{}: {}'.format(str(k), str(v)))
write_print(output_txt, '-------------- End ----------------')

main(version, config, output_txt)
def voc_eval(detection_path, path, annotation_path, list_path, class_name,
             cache_dir, output_txt, overlap_threshold=0.5,
             use_07_metric=True):
    # create or get the cache file
    if not osp.isdir(cache_dir):
        os.mkdir(cache_dir)
    cache_file = osp.join(cache_dir, 'annotations.pkl')

    # read the list of images
    with open(list_path, 'r') as f:
        lines = f.readlines()
    image_names = [x.strip() for x in lines]

    # if cache_file does not exist
    if not osp.isfile(cache_file):
        targets = {}

        # per image, read the annotations from the XML file
        write_print(output_txt, 'Reading annotations')
        for i, image_name in enumerate(image_names):
            temp_path = annotation_path.format(path, 'test', image_name)
            targets[image_name] = parse_annotation(temp_path)

        # save the annotations to cache_file
        temp_string = 'Saving cached annotations to {:s}\n'
        write_print(output_txt, temp_string.format(cache_file))
        with open(cache_file, 'wb') as f:
            pickle.dump(targets, f)

    # else, load the cached annotations
    else:
        with open(cache_file, 'rb') as f:
            targets = pickle.load(f)

    class_targets = {}
    n_positive = 0

    # get the targets whose class is class_name in each image
    for image_name in image_names:
        target = [x for x in targets[image_name] if x['name'] == class_name]
        bbox = np.array([x['bbox'] for x in target])
        difficult = np.array([x['difficult'] for x in target]).astype(bool)
        det = [False] * len(target)
        n_positive += sum(~difficult)
        class_targets[image_name] = {'bbox': bbox,
                                     'difficult': difficult,
                                     'det': det}

    # read the detections from class_name.txt
    detection_file = detection_path.format(class_name)
    with open(detection_file, 'r') as f:
        lines = f.readlines()

    # if there are detections
    if any(lines):
        # get the ids, confidences, and bounding boxes
        values = [x.strip().split(' ') for x in lines]
        image_ids = [x[0] for x in values]
        confidences = np.array([float(x[1]) for x in values])
        bboxes = np.array([[float(z) for z in x[2:]] for x in values])

        # sort by descending confidence
        sorted_index = np.argsort(-confidences)
        bboxes = bboxes[sorted_index, :]
        image_ids = [image_ids[x] for x in sorted_index]

        num_detections = len(image_ids)
        tp = np.zeros(num_detections)
        fp = np.zeros(num_detections)

        # go through the detections and mark TPs and FPs
        for i in range(num_detections):
            # get the target bounding boxes
            image_target = class_targets[image_ids[i]]
            bbox_target = image_target['bbox'].astype(float)

            # get the detected bounding box
            bbox = bboxes[i, :].astype(float)
            overlap_max = -np.inf

            if bbox_target.size > 0:
                # get the overlapping region and
                # compute the area of intersection
                x_min = np.maximum(bbox_target[:, 0], bbox[0])
                y_min = np.maximum(bbox_target[:, 1], bbox[1])
                x_max = np.minimum(bbox_target[:, 2], bbox[2])
                y_max = np.minimum(bbox_target[:, 3], bbox[3])
                width = np.maximum(x_max - x_min, 0.)
                height = np.maximum(y_max - y_min, 0.)
                intersection = width * height

                # get the areas of the ground truth and the detection,
                # then compute the union
                area_bbox = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
                area_bbox_target = ((bbox_target[:, 2] - bbox_target[:, 0]) *
                                    (bbox_target[:, 3] - bbox_target[:, 1]))
                union = area_bbox + area_bbox_target - intersection

                # compute the IoU
                iou = intersection / union
                overlap_max = np.max(iou)
                j_max = np.argmax(iou)

            # if the maximum overlap is over the overlap threshold
            if overlap_max > overlap_threshold:
                # if the matched ground truth is not difficult
                if not image_target['difficult'][j_max]:
                    # if it has not been matched yet, count a true positive
                    if not image_target['det'][j_max]:
                        tp[i] = 1.
                        image_target['det'][j_max] = 1
                    # else, it is a duplicate detection:
                    # count a false positive
                    else:
                        fp[i] = 1.
            # else, the overlap is below the threshold:
            # count a false positive
            else:
                fp[i] = 1.

        # compute the precision and recall; the eps term avoids a divide
        # by zero when the first detection matches a difficult ground truth
        tp = np.cumsum(tp)
        fp = np.cumsum(fp)
        recall = tp / float(n_positive)
        precision = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
        ap = voc_ap(recall=recall,
                    precision=precision,
                    use_07_metric=use_07_metric)

    else:
        recall = -1.
        precision = -1.
        ap = -1.

    return recall, precision, ap
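# voc_ap is called above but not shown in this listing. A sketch of the
# standard PASCAL VOC average-precision computation it is assumed to
# implement: the 11-point interpolation for the VOC07 metric, and the
# every-point (area-under-curve) form otherwise.
def voc_ap(recall, precision, use_07_metric=True):
    if use_07_metric:
        # 11-point metric: average the maximum precision at
        # recall levels 0.0, 0.1, ..., 1.0
        ap = 0.
        for t in np.arange(0., 1.1, 0.1):
            if np.sum(recall >= t) == 0:
                p = 0.
            else:
                p = np.max(precision[recall >= t])
            ap += p / 11.
    else:
        # append sentinel values at both ends
        mrec = np.concatenate(([0.], recall, [1.]))
        mpre = np.concatenate(([0.], precision, [0.]))

        # make the precision envelope monotonically decreasing
        for i in range(mpre.size - 1, 0, -1):
            mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])

        # sum the areas under the piecewise-constant curve
        i = np.where(mrec[1:] != mrec[:-1])[0]
        ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
    return ap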