def parse_annotation(self, annotation, mAP=False):
    """Load one annotation entry and return the (optionally augmented) image and boxes."""
    if TRAIN_LOAD_IMAGES_TO_RAM:
        # images were pre-loaded; annotation = (image_path, box_strings, image)
        image_path = annotation[0]
        image = annotation[2]
    else:
        image_path = annotation[0]
        image = cv2.imread(image_path)

    # each box is stored as "xmin,ymin,xmax,ymax,class_id"
    bboxes = np.array(
        [list(map(int, box.split(','))) for box in annotation[1]])

    if self.data_aug:
        image, bboxes = self.random_horizontal_flip(
            np.copy(image), np.copy(bboxes))
        image, bboxes = self.random_crop(np.copy(image), np.copy(bboxes))
        image, bboxes = self.random_translate(np.copy(image), np.copy(bboxes))

    # image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    if mAP:
        # mAP evaluation needs the raw image and pixel-space boxes
        return image, bboxes

    image, bboxes = image_preprocess(np.copy(image),
                                     [self.input_sizes, self.input_sizes],
                                     np.copy(bboxes))
    return image, bboxes
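# --- Usage sketch (editor addition, illustrative only) -----------------------
# parse_annotation is normally driven by the surrounding Dataset class. The
# helper below is a minimal sketch assuming a Dataset instance with an
# `annotations` list, as used elsewhere in this file; it is not part of the
# original code.
def _example_parse_annotation(dataset):
    ann = dataset.annotations[0]  # (image_path, box_strings, [image if in RAM])
    # Training mode: image resized/padded to input_sizes, boxes rescaled.
    image, bboxes = dataset.parse_annotation(ann)
    # Evaluation mode: raw image and pixel-space boxes for mAP computation.
    raw_image, raw_boxes = dataset.parse_annotation(ann, mAP=True)
    return image, bboxes, raw_image, raw_boxes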
def detect_video_bgs(Yolo, video_path, output_path, log_path, input_size=416,
                     show=False, CLASSES=YOLO_COCO_CLASSES, score_threshold=0.3,
                     iou_threshold=0.45, rectangle_colors='', draw_roi=False,
                     zoom=0, show_diver=True):
    times, times_2 = [], []
    vid = cv2.VideoCapture(video_path)

    # by default VideoCapture returns float instead of int
    width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(vid.get(cv2.CAP_PROP_FPS))
    codec = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter(output_path, codec, fps, (width, height))  # output_path must be .mp4

    # HSV range used to mask near-white (splash) pixels
    LOW = np.array([80, 0, 200])
    HIGH = np.array([255, 110, 255])

    log = pd.DataFrame(columns=[
        "vis_px", "vis_px_pc", "total_px", "total_px_pc", "diff", "diff_pc"
    ])

    while True:
        ret, img = vid.read()
        if not ret:  # end of stream
            break
        original_image = img  # frame stays in BGR, which VideoWriter expects

        image_data = image_preprocess(np.copy(original_image),
                                      [input_size, input_size])
        image_data = image_data[np.newaxis, ...].astype(np.float32)

        t1 = time.time()
        pred_bbox = Yolo.predict(image_data)
        t2 = time.time()

        pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox]
        pred_bbox = tf.concat(pred_bbox, axis=0)

        bboxes = postprocess_boxes(pred_bbox, original_image, input_size,
                                   score_threshold)
        bboxes = nms(bboxes, iou_threshold, method='nms')

        # Colour-threshold background subtraction: mask splash pixels in HSV space
        hsv = cv2.cvtColor(original_image, cv2.COLOR_BGR2HSV)
        fgMask = cv2.inRange(hsv, LOW, HIGH)

        splash_boxes = [
            i for i in bboxes if CLASS_INDECES[int(i[5])] == "splash"
        ]
        if splash_boxes:
            splash_x_min, splash_y_min, splash_x_max, splash_y_max = splash_bbox_roi(
                splash_boxes=splash_boxes, zoom=zoom)

            # whole frame:
            number_of_white_pix = np.sum(fgMask == 255)
            number_total_pix = fgMask.shape[0] * fgMask.shape[1]
            print("Normal_image: Number of white pixels: {} ({}%)".format(
                number_of_white_pix,
                round((number_of_white_pix / number_total_pix) * 100, 2)))

            # splash ROI:
            splash_roi = fgMask[splash_y_min:splash_y_max,
                                splash_x_min:splash_x_max]
            roi_number_of_white_pix = np.sum(splash_roi == 255)
            print("Roi: Number of white pixels: {} ({}%)".format(
                roi_number_of_white_pix,
                round((roi_number_of_white_pix / number_total_pix) * 100, 2)))

            pixel_diff = abs(roi_number_of_white_pix - number_of_white_pix)

            image = cv2.cvtColor(fgMask, cv2.COLOR_GRAY2RGB)
            if draw_roi:
                image = cv2.rectangle(image, (splash_x_min, splash_y_min),
                                      (splash_x_max, splash_y_max),
                                      (255, 0, 0), 2)
            else:
                # keep only the ROI: create a filled rectangle mask and apply it
                mask = np.zeros(image.shape[:2], dtype="uint8")
                cv2.rectangle(mask, (splash_x_min, splash_y_min),
                              (splash_x_max, splash_y_max), 255, -1)
                image = cv2.bitwise_and(image, image, mask=mask)

            # recolor
            image = recolor_bw(image, splash_red=True)

            # percentages for the overlay and the log
            vis_px_pc = round(
                (roi_number_of_white_pix / number_total_pix) * 100, 2)
            total_px_pc = round(
                (number_of_white_pix / number_total_pix) * 100, 2)
            diff_pc = (round(
                (roi_number_of_white_pix / number_of_white_pix) * 100, 2)
                if number_of_white_pix else 0.0)  # avoid division by zero

            image = cv2.putText(
                image,
                "Vis. PXs (roi): {} ({}%) Total wPXs: {} ({}%) Diff: {} ({}%)"
                .format(roi_number_of_white_pix, vis_px_pc,
                        number_of_white_pix, total_px_pc, pixel_diff,
                        diff_pc), (0, 30), cv2.FONT_HERSHEY_COMPLEX_SMALL,
                0.7, (0, 0, 255), 1)

            # append one row to the log (DataFrame.append was removed in
            # pandas 2.x; pd.concat is the supported replacement)
            log = pd.concat([log, pd.DataFrame([{
                "vis_px": roi_number_of_white_pix,
                "vis_px_pc": vis_px_pc,
                "total_px": number_of_white_pix,
                "total_px_pc": total_px_pc,
                "diff": pixel_diff,
                "diff_pc": diff_pc
            }])], ignore_index=True)
        else:
            if not show_diver:
                # no splash, and the diver should not be shown: emit a blank frame
                image = np.zeros(original_image.shape[:2], dtype="uint8")
                image = recolor_bw(image, splash_red=False)
            else:
                image = draw_bbox(original_image, bboxes, CLASSES=CLASSES,
                                  rectangle_colors=rectangle_colors)

        t3 = time.time()
        times.append(t2 - t1)
        times_2.append(t3 - t1)

        # moving average over the last 20 frames
        times = times[-20:]
        times_2 = times_2[-20:]
        ms = sum(times) / len(times) * 1000
        fps = 1000 / ms
        fps2 = 1000 / (sum(times_2) / len(times_2) * 1000)

        print("Time: {:.2f}ms, Detection FPS: {:.1f}, total FPS: {:.1f}".format(
            ms, fps, fps2))

        if output_path != '':
            out.write(image)
        if show:
            cv2.imshow('output', image)
            if cv2.waitKey(25) & 0xFF == ord("q"):
                cv2.destroyAllWindows()
                break

    vid.release()
    out.release()
    log.to_csv(log_path)
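# --- Usage sketch (editor addition, illustrative only) -----------------------
# A minimal sketch of driving detect_video_bgs; all file paths below are
# hypothetical placeholders, and `yolo_model` is assumed to be a loaded model
# compatible with Yolo.predict as used above.
def _example_detect_video_bgs(yolo_model):
    detect_video_bgs(yolo_model,
                     video_path="videos/dive.mp4",       # hypothetical input
                     output_path="videos/dive_bgs.mp4",  # hypothetical output
                     log_path="logs/dive_bgs.csv",       # per-frame pixel stats
                     draw_roi=True, zoom=10, show=False)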
def detect_video(Yolo, video_path, output_path, input_size=416, show=False,
                 CLASSES=YOLO_COCO_CLASSES, score_threshold=0.3,
                 iou_threshold=0.45, rectangle_colors=''):
    times, times_2 = [], []
    vid = cv2.VideoCapture(video_path)

    # by default VideoCapture returns float instead of int
    width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(vid.get(cv2.CAP_PROP_FPS))
    codec = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter(output_path, codec, fps, (width, height))  # output_path must be .mp4

    while True:
        ret, img = vid.read()
        if not ret:  # end of stream
            break
        original_image = img  # frame stays in BGR, which VideoWriter expects

        image_data = image_preprocess(np.copy(original_image),
                                      [input_size, input_size])
        image_data = image_data[np.newaxis, ...].astype(np.float32)

        t1 = time.time()
        if YOLO_FRAMEWORK == "tf":
            pred_bbox = Yolo.predict(image_data)
        elif YOLO_FRAMEWORK == "trt":
            batched_input = tf.constant(image_data)
            result = Yolo(batched_input)
            pred_bbox = []
            for key, value in result.items():
                value = value.numpy()
                pred_bbox.append(value)

        t2 = time.time()

        pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox]
        pred_bbox = tf.concat(pred_bbox, axis=0)

        bboxes = postprocess_boxes(pred_bbox, original_image, input_size,
                                   score_threshold)
        bboxes = nms(bboxes, iou_threshold, method='nms')

        image = draw_bbox(original_image, bboxes, CLASSES=CLASSES,
                          rectangle_colors=rectangle_colors)

        t3 = time.time()
        times.append(t2 - t1)
        times_2.append(t3 - t1)

        # moving average over the last 20 frames
        times = times[-20:]
        times_2 = times_2[-20:]
        ms = sum(times) / len(times) * 1000
        fps = 1000 / ms
        fps2 = 1000 / (sum(times_2) / len(times_2) * 1000)

        image = cv2.putText(image, "FPS: {:.1f}".format(fps), (0, 30),
                            cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 0, 255), 2)

        print("Time: {:.2f}ms, Detection FPS: {:.1f}, total FPS: {:.1f}".format(
            ms, fps, fps2))
        if output_path != '':
            out.write(image)
        if show:
            cv2.imshow('output', image)
            if cv2.waitKey(25) & 0xFF == ord("q"):
                cv2.destroyAllWindows()
                break

    vid.release()
    out.release()
    cv2.destroyAllWindows()
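# --- Usage sketch (editor addition, illustrative only) -----------------------
# Plain detection without background subtraction; paths are hypothetical.
def _example_detect_video(yolo_model):
    detect_video(yolo_model, "videos/dive.mp4", "videos/dive_out.mp4",
                 input_size=416, show=True, score_threshold=0.3)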
def detect_video_knn(Yolo, video_path, output_path, input_size=416, show=False,
                     CLASSES=YOLO_COCO_CLASSES, score_threshold=0.3,
                     iou_threshold=0.45, rectangle_colors='', draw_roi=False,
                     zoom=0):
    # different background-subtraction methods:
    # backSub = cv2.createBackgroundSubtractorMOG2(history=500, varThreshold=40, detectShadows=False)
    backSub = cv2.createBackgroundSubtractorKNN()  # KNN
    backSub.setDetectShadows(False)
    backSub.setDist2Threshold(13000)
    backSub.setkNNSamples(6)
    backSub.setNSamples(30)

    times, times_2 = [], []
    vid = cv2.VideoCapture(video_path)

    # by default VideoCapture returns float instead of int
    width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(vid.get(cv2.CAP_PROP_FPS))
    codec = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter(output_path, codec, fps, (width, height))  # output_path must be .mp4

    while True:
        ret, img = vid.read()
        if not ret:  # end of stream
            break
        original_image = img  # frame stays in BGR, which VideoWriter expects

        image_data = image_preprocess(np.copy(original_image),
                                      [input_size, input_size])
        image_data = image_data[np.newaxis, ...].astype(np.float32)

        t1 = time.time()
        pred_bbox = Yolo.predict(image_data)
        t2 = time.time()

        pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox]
        pred_bbox = tf.concat(pred_bbox, axis=0)

        bboxes = postprocess_boxes(pred_bbox, original_image, input_size,
                                   score_threshold)
        bboxes = nms(bboxes, iou_threshold, method='nms')

        # KNN background subtraction; a high learning rate adapts the model quickly
        fgMask = backSub.apply(original_image, learningRate=0.9)

        splash_boxes = [
            i for i in bboxes if CLASS_INDECES[int(i[5])] == "splash"
        ]
        if splash_boxes:
            splash_x_min, splash_y_min, splash_x_max, splash_y_max = splash_bbox_roi(
                splash_boxes=splash_boxes, zoom=zoom)

            # whole frame:
            number_of_white_pix = np.sum(fgMask == 255)
            number_total_pix = fgMask.shape[0] * fgMask.shape[1]
            print("Normal_image: Number of white pixels: {} ({}%)".format(
                number_of_white_pix,
                round((number_of_white_pix / number_total_pix) * 100, 2)))

            # splash ROI:
            splash_roi = fgMask[splash_y_min:splash_y_max,
                                splash_x_min:splash_x_max]
            roi_number_of_white_pix = np.sum(splash_roi == 255)
            print("Roi: Number of white pixels: {} ({}%)".format(
                roi_number_of_white_pix,
                round((roi_number_of_white_pix / number_total_pix) * 100, 2)))

            pixel_diff = abs(roi_number_of_white_pix - number_of_white_pix)

            image = cv2.cvtColor(fgMask, cv2.COLOR_GRAY2RGB)
            if draw_roi:
                image = cv2.rectangle(image, (splash_x_min, splash_y_min),
                                      (splash_x_max, splash_y_max),
                                      (255, 0, 0), 2)
            else:
                # keep only the ROI: create a filled rectangle mask and apply it
                mask = np.zeros(image.shape[:2], dtype="uint8")
                cv2.rectangle(mask, (splash_x_min, splash_y_min),
                              (splash_x_max, splash_y_max), 255, -1)
                image = cv2.bitwise_and(image, image, mask=mask)

            vis_px_pc = round(
                (roi_number_of_white_pix / number_total_pix) * 100, 2)
            total_px_pc = round(
                (number_of_white_pix / number_total_pix) * 100, 2)
            diff_pc = (round(
                (roi_number_of_white_pix / number_of_white_pix) * 100, 2)
                if number_of_white_pix else 0.0)  # avoid division by zero

            image = cv2.putText(
                image,
                "Vis. PXs (roi): {} ({}%) Total wPXs: {} ({}%) Diff: {} ({}%)"
                .format(roi_number_of_white_pix, vis_px_pc,
                        number_of_white_pix, total_px_pc, pixel_diff,
                        diff_pc), (0, 30), cv2.FONT_HERSHEY_COMPLEX_SMALL,
                0.7, (0, 0, 255), 1)
        else:
            # TODO: decide how to handle frames without a splash detection
            image = draw_bbox(original_image, bboxes, CLASSES=CLASSES,
                              rectangle_colors=rectangle_colors)

        t3 = time.time()
        times.append(t2 - t1)
        times_2.append(t3 - t1)

        # moving average over the last 20 frames
        times = times[-20:]
        times_2 = times_2[-20:]
        ms = sum(times) / len(times) * 1000
        fps = 1000 / ms
        fps2 = 1000 / (sum(times_2) / len(times_2) * 1000)

        print("Time: {:.2f}ms, Detection FPS: {:.1f}, total FPS: {:.1f}".format(
            ms, fps, fps2))
        if output_path != '':
            out.write(image)
        if show:
            cv2.imshow('output', image)
            if cv2.waitKey(25) & 0xFF == ord("q"):
                cv2.destroyAllWindows()
                break

    vid.release()
    out.release()
    cv2.destroyAllWindows()
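# --- Usage sketch (editor addition, illustrative only) -----------------------
# Unlike the fixed HSV threshold in detect_video_bgs, the KNN subtractor learns
# the background from the video itself; with learningRate=0.9 the model updates
# aggressively, so mostly the newest frames define "background". Paths are
# hypothetical placeholders.
def _example_detect_video_knn(yolo_model):
    detect_video_knn(yolo_model, "videos/dive.mp4", "videos/dive_knn.mp4",
                     draw_roi=False, zoom=0, show=False)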
def get_mAP(Yolo, dataset, score_threshold=0.25, iou_threshold=0.50,
            TEST_INPUT_SIZE=TEST_INPUT_SIZE):
    MINOVERLAP = 0.5  # default value (defined in the PASCAL VOC2012 challenge)
    NUM_CLASS = read_class_names(TRAIN_CLASSES)

    ground_truth_dir_path = 'mAP/ground-truth'
    if os.path.exists(ground_truth_dir_path):
        shutil.rmtree(ground_truth_dir_path)
    if not os.path.exists('mAP'):
        os.mkdir('mAP')
    os.mkdir(ground_truth_dir_path)

    print(f'\ncalculating mAP{int(iou_threshold * 100)}...\n')

    # First pass: dump the ground-truth boxes of every sample to JSON and
    # count objects per class.
    gt_counter_per_class = {}
    for index in range(dataset.num_samples):
        ann_dataset = dataset.annotations[index]

        original_image, bbox_data_gt = dataset.parse_annotation(
            ann_dataset, True)

        if len(bbox_data_gt) == 0:
            bboxes_gt = []
            classes_gt = []
        else:
            bboxes_gt, classes_gt = bbox_data_gt[:, :4], bbox_data_gt[:, 4]

        num_bbox_gt = len(bboxes_gt)
        bounding_boxes = []
        for i in range(num_bbox_gt):
            class_name = NUM_CLASS[classes_gt[i]]
            xmin, ymin, xmax, ymax = list(map(str, bboxes_gt[i]))
            bbox = xmin + " " + ymin + " " + xmax + " " + ymax
            bounding_boxes.append({
                "class_name": class_name,
                "bbox": bbox,
                "used": False
            })

            # count that object
            if class_name in gt_counter_per_class:
                gt_counter_per_class[class_name] += 1
            else:
                # if class didn't exist yet
                gt_counter_per_class[class_name] = 1

        with open(f'{ground_truth_dir_path}/{str(index)}_ground_truth.json',
                  'w') as outfile:
            json.dump(bounding_boxes, outfile)

    gt_classes = list(gt_counter_per_class.keys())
    # sort the classes alphabetically
    gt_classes = sorted(gt_classes)
    n_classes = len(gt_classes)

    # Second pass: run the detector on every sample and collect predictions
    # per class.
    times = []
    json_pred = [[] for i in range(n_classes)]
    for index in range(dataset.num_samples):
        ann_dataset = dataset.annotations[index]

        original_image, bbox_data_gt = dataset.parse_annotation(
            ann_dataset, True)

        image = image_preprocess(np.copy(original_image),
                                 [TEST_INPUT_SIZE, TEST_INPUT_SIZE])
        image_data = image[np.newaxis, ...].astype(np.float32)

        t1 = time.time()
        if YOLO_FRAMEWORK == "tf":
            pred_bbox = Yolo.predict(image_data)
        elif YOLO_FRAMEWORK == "trt":
            batched_input = tf.constant(image_data)
            result = Yolo(batched_input)
            pred_bbox = []
            for key, value in result.items():
                value = value.numpy()
                pred_bbox.append(value)

        t2 = time.time()
        times.append(t2 - t1)

        pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox]
        pred_bbox = tf.concat(pred_bbox, axis=0)

        bboxes = postprocess_boxes(pred_bbox, original_image, TEST_INPUT_SIZE,
                                   score_threshold)
        bboxes = nms(bboxes, iou_threshold, method='nms')

        for bbox in bboxes:
            coor = np.array(bbox[:4], dtype=np.int32)
            score = bbox[4]
            class_ind = int(bbox[5])
            class_name = NUM_CLASS[class_ind]
            score = '%.4f' % score
            xmin, ymin, xmax, ymax = list(map(str, coor))
            bbox = xmin + " " + ymin + " " + xmax + " " + ymax
            json_pred[gt_classes.index(class_name)].append({
                "confidence": str(score),
                "file_id": str(index),
                "bbox": str(bbox)
            })

    ms = sum(times) / len(times) * 1000
    fps = 1000 / ms

    for class_name in gt_classes:
        json_pred[gt_classes.index(class_name)].sort(
            key=lambda x: float(x['confidence']), reverse=True)
        with open(f'{ground_truth_dir_path}/{class_name}_predictions.json',
                  'w') as outfile:
            json.dump(json_pred[gt_classes.index(class_name)], outfile)

    # Calculate the AP for each class
    sum_AP = 0.0
    ap_dictionary = {}
    # open file to store the results
    with open("mAP/results.txt", 'w') as results_file:
        results_file.write("# AP and precision/recall per class\n")
        count_true_positives = {}
        for class_index, class_name in enumerate(gt_classes):
            count_true_positives[class_name] = 0
            # Load predictions of that class
            predictions_file = f'{ground_truth_dir_path}/{class_name}_predictions.json'
            with open(predictions_file) as f:
                predictions_data = json.load(f)

            # Assign predictions to ground-truth objects
            nd = len(predictions_data)
            tp = [0] * nd  # creates an array of zeros of size nd
            fp = [0] * nd
            for idx, prediction in enumerate(predictions_data):
                file_id = prediction["file_id"]
                # assign prediction to ground truth object if any:
                # open ground-truth with that file_id
                gt_file = f'{ground_truth_dir_path}/{str(file_id)}_ground_truth.json'
                with open(gt_file) as f:
                    ground_truth_data = json.load(f)
                ovmax = -1
                gt_match = -1
                # load prediction bounding-box
                bb = [float(x) for x in prediction["bbox"].split()]
                for obj in ground_truth_data:
                    # look for a class_name match
                    if obj["class_name"] == class_name:
                        bbgt = [float(x) for x in obj["bbox"].split()]
                        bi = [
                            max(bb[0], bbgt[0]),
                            max(bb[1], bbgt[1]),
                            min(bb[2], bbgt[2]),
                            min(bb[3], bbgt[3])
                        ]
                        iw = bi[2] - bi[0] + 1
                        ih = bi[3] - bi[1] + 1
                        if iw > 0 and ih > 0:
                            # compute overlap (IoU) = area of intersection / area of union
                            ua = ((bb[2] - bb[0] + 1) * (bb[3] - bb[1] + 1) +
                                  (bbgt[2] - bbgt[0] + 1) *
                                  (bbgt[3] - bbgt[1] + 1) - iw * ih)
                            ov = iw * ih / ua
                            if ov > ovmax:
                                ovmax = ov
                                gt_match = obj

                # assign prediction as true positive / don't care / false positive
                if ovmax >= MINOVERLAP:
                    if not bool(gt_match["used"]):
                        # true positive
                        tp[idx] = 1
                        gt_match["used"] = True
                        count_true_positives[class_name] += 1
                        # update the ".json" file
                        with open(gt_file, 'w') as f:
                            f.write(json.dumps(ground_truth_data))
                    else:
                        # false positive (multiple detection)
                        fp[idx] = 1
                else:
                    # false positive
                    fp[idx] = 1

            # compute precision/recall from the cumulative tp/fp counts
            cumsum = 0
            for idx, val in enumerate(fp):
                fp[idx] += cumsum
                cumsum += val

            cumsum = 0
            for idx, val in enumerate(tp):
                tp[idx] += cumsum
                cumsum += val

            rec = tp[:]
            for idx, val in enumerate(tp):
                rec[idx] = float(tp[idx]) / gt_counter_per_class[class_name]

            prec = tp[:]
            for idx, val in enumerate(tp):
                prec[idx] = float(tp[idx]) / (fp[idx] + tp[idx])

            ap, mrec, mprec = voc_ap(rec, prec)
            sum_AP += ap

            text = "{0:.3f}%".format(ap * 100) + " = " + class_name + " AP "

            rounded_prec = ['%.3f' % elem for elem in prec]
            rounded_rec = ['%.3f' % elem for elem in rec]
            # Write to results.txt
            results_file.write(text + "\n Precision: " + str(rounded_prec) +
                               "\n Recall :" + str(rounded_rec) + "\n\n")

            print(text)
            ap_dictionary[class_name] = ap

        results_file.write("\n# mAP of all classes\n")
        mAP = sum_AP / n_classes
        text = "mAP = {:.3f}%, {:.2f} FPS".format(mAP * 100, fps)
        results_file.write(text + "\n")
        print(text)

    return mAP * 100
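# --- Usage sketch (editor addition, illustrative only) -----------------------
# A minimal sketch of evaluating a trained model; it assumes a Dataset class
# constructed as Dataset('test'), mirroring the convention assumed earlier in
# this file, and a loaded `yolo_model`. Both names are hypothetical here.
def _example_get_map(yolo_model):
    test_set = Dataset('test')
    map_pc = get_mAP(yolo_model, test_set,
                     score_threshold=0.25, iou_threshold=0.50)
    print("mAP50: {:.2f}%".format(map_pc))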