def main():
    webapi = WebAPI(IP_ADDR, PORT, ACCOUNT, PASSWORD)
    cameras = webapi.list_cameras()
    camera_ids = [camera['id'] for camera in cameras]
    # The most recently added camera in Surveillance Station
    camera_id = camera_ids[-1]
    rtsp = webapi.get_liveview_rtsp(camera_id)
    fall_label = webapi.create_recording_label('fall_event')

    # Initialize the fall-detection model
    yolo = Create_Yolo(input_size=YOLO_INPUT_SIZE)
    load_yolo_weights(yolo, YOLO_V3_WEIGHTS)  # use Darknet weights
    classes = read_class_names(YOLO_COCO_CLASSES)

    q = queue.Queue()
    p1 = threading.Thread(target=read_frame, args=(q, rtsp))
    p2 = threading.Thread(
        target=process_frame,
        args=(q, webapi, camera_id, yolo, classes, fall_label))
    p1.start()
    p2.start()
    p1.join()
    p2.join()
    webapi.logout()
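A minimal sketch (not part of the source) of the two worker functions main() hands to its threads: read_frame would pull frames from the RTSP stream into the queue and process_frame would run YOLO inference and react to a detected fall. The sentinel convention, the detect_fall helper, and the start_recording method on WebAPI are assumptions.

# Illustrative sketch only -- the real read_frame/process_frame are not shown in these listings.
def read_frame(q, rtsp):
    vid = cv2.VideoCapture(rtsp)
    while True:
        ok, frame = vid.read()
        if not ok:
            break
        q.put(frame)
    q.put(None)  # sentinel: tell the consumer the stream ended (assumed convention)

def process_frame(q, webapi, camera_id, yolo, classes, fall_label):
    while True:
        frame = q.get()
        if frame is None:
            break
        # detect_fall() is a hypothetical helper wrapping YOLO inference plus fall heuristics
        if detect_fall(yolo, frame, classes):
            webapi.start_recording(camera_id, label=fall_label)  # assumed WebAPI method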
def Create_Yolo(input_size=416, channels=3, training=False, CLASSES=YOLO_COCO_CLASSES):
    NUM_CLASS = len(read_class_names(CLASSES))
    input_layer = Input([input_size, input_size, channels])

    if TRAIN_YOLO_TINY:
        if YOLO_TYPE == "yolov4":
            conv_tensors = YOLOv4_tiny(input_layer, NUM_CLASS)
        if YOLO_TYPE == "yolov3":
            conv_tensors = YOLOv3_tiny(input_layer, NUM_CLASS)
    else:
        if YOLO_TYPE == "yolov4":
            conv_tensors = YOLOv4(input_layer, NUM_CLASS)
        if YOLO_TYPE == "yolov3":
            conv_tensors = YOLOv3(input_layer, NUM_CLASS)

    output_tensors = []
    for i, conv_tensor in enumerate(conv_tensors):
        pred_tensor = decode(conv_tensor, NUM_CLASS, i)
        if training:
            output_tensors.append(conv_tensor)
        output_tensors.append(pred_tensor)

    Yolo = tf.keras.Model(input_layer, output_tensors)
    return Yolo
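A short usage sketch for the builder above, reusing the weight-loading helper and config names seen in the main() listing; treat the dummy forward pass as illustrative only.

# Sketch: build the detector and load pretrained Darknet weights before inference.
yolo = Create_Yolo(input_size=YOLO_INPUT_SIZE, training=False, CLASSES=YOLO_COCO_CLASSES)
load_yolo_weights(yolo, YOLO_V3_WEIGHTS)   # or yolo.load_weights(...) for a Keras checkpoint
pred = yolo.predict(tf.zeros((1, YOLO_INPUT_SIZE, YOLO_INPUT_SIZE, 3)))  # dummy forward pass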
def compute_loss(pred, conv, label, bboxes, i=0, CLASSES=YOLO_COCO_CLASSES):
    NUM_CLASS = len(read_class_names(CLASSES))
    conv_shape = tf.shape(conv)
    batch_size = conv_shape[0]
    output_size = conv_shape[1]
    input_size = STRIDES[i] * output_size
    conv = tf.reshape(conv, (batch_size, output_size, output_size, 3, 5 + NUM_CLASS))

    conv_raw_conf = conv[:, :, :, :, 4:5]
    conv_raw_prob = conv[:, :, :, :, 5:]

    pred_xywh = pred[:, :, :, :, 0:4]
    pred_conf = pred[:, :, :, :, 4:5]

    label_xywh = label[:, :, :, :, 0:4]
    respond_bbox = label[:, :, :, :, 4:5]
    label_prob = label[:, :, :, :, 5:]

    giou = tf.expand_dims(bbox_giou(pred_xywh, label_xywh), axis=-1)
    input_size = tf.cast(input_size, tf.float32)

    bbox_loss_scale = 2.0 - 1.0 * label_xywh[:, :, :, :, 2:3] * label_xywh[:, :, :, :, 3:4] / (input_size ** 2)
    giou_loss = respond_bbox * bbox_loss_scale * (1 - giou)

    iou = bbox_iou(pred_xywh[:, :, :, :, np.newaxis, :],
                   bboxes[:, np.newaxis, np.newaxis, np.newaxis, :, :])
    # For each prediction box, take the largest IoU with any ground-truth box.
    max_iou = tf.expand_dims(tf.reduce_max(iou, axis=-1), axis=-1)

    # If the largest IoU is below the threshold, the prediction box is treated as background.
    respond_bgd = (1.0 - respond_bbox) * tf.cast(max_iou < YOLO_IOU_LOSS_THRESH, tf.float32)

    conf_focal = tf.pow(respond_bbox - pred_conf, 2)

    # Confidence loss: the network should predict confidence 1 where the grid cell
    # contains an object and 0 where it does not.
    conf_loss = conf_focal * (
        respond_bbox * tf.nn.sigmoid_cross_entropy_with_logits(labels=respond_bbox, logits=conv_raw_conf)
        +
        respond_bgd * tf.nn.sigmoid_cross_entropy_with_logits(labels=respond_bbox, logits=conv_raw_conf))

    prob_loss = respond_bbox * tf.nn.sigmoid_cross_entropy_with_logits(labels=label_prob, logits=conv_raw_prob)

    giou_loss = tf.reduce_mean(tf.reduce_sum(giou_loss, axis=[1, 2, 3, 4]))
    conf_loss = tf.reduce_mean(tf.reduce_sum(conf_loss, axis=[1, 2, 3, 4]))
    prob_loss = tf.reduce_mean(tf.reduce_sum(prob_loss, axis=[1, 2, 3, 4]))

    return giou_loss, conf_loss, prob_loss
def compute_loss(pred, conv, label, bboxes, i=0, CLASSES=YOLO_COCO_CLASSES):
    NUM_CLASS = len(read_class_names(CLASSES))
    conv_shape = tf.shape(conv)
    batch_size = conv_shape[0]
    output_size = conv_shape[1]
    input_size = STRIDES[i] * output_size
    conv = tf.reshape(conv, (batch_size, output_size, output_size, 3, 5 + NUM_CLASS))

    conv_raw_conf = conv[:, :, :, :, 4:5]
    conv_raw_prob = conv[:, :, :, :, 5:]

    pred_xywh = pred[:, :, :, :, 0:4]
    pred_conf = pred[:, :, :, :, 4:5]

    label_xywh = label[:, :, :, :, 0:4]
    respond_bbox = label[:, :, :, :, 4:5]
    label_prob = label[:, :, :, :, 5:]

    giou = tf.expand_dims(bbox_giou(pred_xywh, label_xywh), axis=-1)
    input_size = tf.cast(input_size, tf.float32)

    bbox_loss_scale = 2.0 - 1.0 * label_xywh[:, :, :, :, 2:3] * label_xywh[:, :, :, :, 3:4] / (input_size ** 2)
    giou_loss = respond_bbox * bbox_loss_scale * (1 - giou)

    iou = bbox_iou(pred_xywh[:, :, :, :, np.newaxis, :],
                   bboxes[:, np.newaxis, np.newaxis, np.newaxis, :, :])
    max_iou = tf.expand_dims(tf.reduce_max(iou, axis=-1), axis=-1)

    respond_bgd = (1.0 - respond_bbox) * tf.cast(max_iou < IOU_LOSS_THRESH, tf.float32)

    conf_focal = tf.pow(respond_bbox - pred_conf, 2)

    conf_loss = conf_focal * (
        respond_bbox * tf.nn.sigmoid_cross_entropy_with_logits(labels=respond_bbox, logits=conv_raw_conf)
        +
        respond_bgd * tf.nn.sigmoid_cross_entropy_with_logits(labels=respond_bbox, logits=conv_raw_conf))

    prob_loss = respond_bbox * tf.nn.sigmoid_cross_entropy_with_logits(labels=label_prob, logits=conv_raw_prob)

    giou_loss = tf.reduce_mean(tf.reduce_sum(giou_loss, axis=[1, 2, 3, 4]))
    conf_loss = tf.reduce_mean(tf.reduce_sum(conf_loss, axis=[1, 2, 3, 4]))
    prob_loss = tf.reduce_mean(tf.reduce_sum(prob_loss, axis=[1, 2, 3, 4]))

    return giou_loss, conf_loss, prob_loss
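A hedged sketch of how compute_loss is typically driven inside a custom training step. The optimizer choice, the per-scale target format (label tensor plus raw boxes), and the assumption that the model built with training=True returns alternating raw/decoded outputs per scale are inferred from the surrounding listings, not confirmed by them.

# Sketch of a training step using compute_loss (variable names and batching format are assumptions).
optimizer = tf.keras.optimizers.Adam()

def train_step(image_data, target):
    with tf.GradientTape() as tape:
        pred_result = yolo(image_data, training=True)   # assumed: alternating raw conv / decoded outputs
        total_loss = 0.0
        for i in range(3):                               # one loss term per output scale
            conv, pred = pred_result[i * 2], pred_result[i * 2 + 1]
            giou_loss, conf_loss, prob_loss = compute_loss(pred, conv, *target[i], i)
            total_loss += giou_loss + conf_loss + prob_loss
    grads = tape.gradient(total_loss, yolo.trainable_variables)
    optimizer.apply_gradients(zip(grads, yolo.trainable_variables))
    return total_loss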
def __init__(self):
    # Initialize the first object ID
    self.nextObjectID = 0
    # Class names and number of classes from the model
    self.classes = read_class_names(YOLO_COCO_CLASSES)
    self.num_classes = len(self.classes)
    # Ordered dicts keep track of the insertion order of the inputs to each dict
    self.objects = OrderedDict()
    self.disappeared = OrderedDict()
    self.MaxMissedFrames = 50
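The state initialized above looks like a centroid-style tracker. A hedged sketch of the register/deregister bookkeeping such a class usually carries; these methods are assumptions and are not taken from the source.

# Sketch: typical companion methods for the tracker state initialized above (assumed, not from source).
def register(self, centroid):
    self.objects[self.nextObjectID] = centroid
    self.disappeared[self.nextObjectID] = 0
    self.nextObjectID += 1

def deregister(self, objectID):
    # drop an object once it has been missing for more than MaxMissedFrames frames
    del self.objects[objectID]
    del self.disappeared[objectID]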
def Create_Yolov3(input_size=416, channels=3, training=False, CLASSES=YOLO_COCO_CLASSES):
    NUM_CLASS = len(read_class_names(CLASSES))
    input_layer = Input([input_size, input_size, channels])
    conv_tensors = YOLOv3(input_layer, NUM_CLASS)

    output_tensors = []
    for i, conv_tensor in enumerate(conv_tensors):
        pred_tensor = decode(conv_tensor, NUM_CLASS, i)
        if training:
            output_tensors.append(conv_tensor)
        output_tensors.append(pred_tensor)

    YoloV3 = tf.keras.Model(input_layer, output_tensors)
    return YoloV3
def __init__(self, dataset_type, TEST_INPUT_SIZE=TEST_INPUT_SIZE):
    self.annot_path = TRAIN_ANNOT_PATH if dataset_type == 'train' else TEST_ANNOT_PATH
    self.input_sizes = TRAIN_INPUT_SIZE if dataset_type == 'train' else TEST_INPUT_SIZE
    self.batch_size = TRAIN_BATCH_SIZE if dataset_type == 'train' else TEST_BATCH_SIZE
    self.data_aug = TRAIN_DATA_AUG if dataset_type == 'train' else TEST_DATA_AUG
    self.train_input_sizes = TRAIN_INPUT_SIZE
    self.strides = np.array(YOLO_STRIDES)
    self.classes = read_class_names(TRAIN_CLASSES)
    self.num_classes = len(self.classes)
    self.anchors = (np.array(YOLO_ANCHORS).T / self.strides).T
    self.anchor_per_scale = YOLO_ANCHOR_PER_SCALE
    self.max_bbox_per_scale = YOLO_MAX_BBOX_PER_SCALE
    self.annotations = self.load_annotations(dataset_type)
    self.num_samples = len(self.annotations)
    self.num_batchs = int(np.ceil(self.num_samples / self.batch_size))
    self.batch_count = 0
def Create_Yolov3(input_size=416, channels=3, training=False, CLASSES=YOLO_COCO_CLASSES):
    NUM_CLASS = len(read_class_names(CLASSES))
    input_layer = Input([input_size, input_size, channels])

    if TRAIN_YOLO_TINY:
        conv_tensors = YOLOv3_tiny(input_layer, NUM_CLASS)  # tiny variant
    else:
        conv_tensors = YOLOv3(input_layer, NUM_CLASS)       # full YOLOv3

    output_tensors = []
    for i, conv_tensor in enumerate(conv_tensors):
        pred_tensor = decode(conv_tensor, NUM_CLASS, i)
        if training:
            output_tensors.append(conv_tensor)
        output_tensors.append(pred_tensor)

    YoloV3 = tf.keras.Model(input_layer, output_tensors)
    return YoloV3
def Object_tracking(YoloV3, webapi, recording_id, video_path, model, cate_predictor,
                    landmark_tensor, input_size=416, CLASSES=YOLO_COCO_CLASSES,
                    score_threshold=0.3, iou_threshold=0.45, rectangle_colors='',
                    Track_only=[]):
    # Definition of the parameters
    max_cosine_distance = 0.7
    nn_budget = None

    # Initialize the Deep SORT objects
    encoder = gdet.create_box_encoder(DEEP_SORT_MODEL_FILE, batch_size=1)
    metric = nn_matching.NearestNeighborDistanceMetric("cosine", max_cosine_distance, nn_budget)
    tracker = Tracker(metric)

    times = []

    if video_path:
        vid = cv2.VideoCapture(video_path)  # detect on video
    else:
        vid = cv2.VideoCapture(0)           # detect from webcam

    # By default VideoCapture returns float instead of int
    width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(vid.get(cv2.CAP_PROP_FPS))

    NUM_CLASS = read_class_names(CLASSES)
    key_list = list(NUM_CLASS.keys())
    val_list = list(NUM_CLASS.values())

    bookmarks = {}
    while True:
        _, img = vid.read()
        print(vid.get(cv2.CAP_PROP_POS_MSEC))
        try:
            original_image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        except:
            break

        image_data = image_preprocess(np.copy(original_image), [input_size, input_size])
        image_data = tf.expand_dims(image_data, 0)

        t1 = time.time()
        pred_bbox = YoloV3.predict(image_data)
        t2 = time.time()
        times.append(t2 - t1)
        times = times[-20:]

        pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox]
        pred_bbox = tf.concat(pred_bbox, axis=0)

        bboxes = postprocess_boxes(pred_bbox, original_image, input_size, score_threshold)
        bboxes = nms(bboxes, iou_threshold, method='nms')

        # Extract bboxes to boxes (x, y, width, height), scores and names
        boxes, scores, names = [], [], []
        for bbox in bboxes:
            if (len(Track_only) != 0 and NUM_CLASS[int(bbox[5])] in Track_only) or len(Track_only) == 0:
                boxes.append([bbox[0].astype(int), bbox[1].astype(int),
                              bbox[2].astype(int) - bbox[0].astype(int),
                              bbox[3].astype(int) - bbox[1].astype(int)])
                scores.append(bbox[4])
                names.append(NUM_CLASS[int(bbox[5])])

        # Obtain all the detections for the given frame.
        boxes = np.array(boxes)
        names = np.array(names)
        scores = np.array(scores)
        features = np.array(encoder(original_image, boxes))
        detections = [Detection(bbox, score, class_name, feature)
                      for bbox, score, class_name, feature in zip(boxes, scores, names, features)]

        # Pass detections to the Deep SORT object and obtain the track information.
        tracker.predict()
        deleted_tracks = tracker.update(detections, vid.get(cv2.CAP_PROP_POS_MSEC), original_image)

        # Send a track's frames to the classifier once the person is removed from the tracker
        marks = predict_tracks_cate(model, cate_predictor, deleted_tracks, landmark_tensor, video_path)
        add_text_to_bookmarks(bookmarks, marks)

        ms = sum(times) / len(times) * 1000
        fps = 1000 / ms
        print("Time: {:.2f}ms, {:.1f} FPS".format(ms, fps))

    # Classify the tracks that are still active when the video ends
    marks = predict_tracks_cate(model, cate_predictor, tracker.tracks, landmark_tensor, video_path)
    add_text_to_bookmarks(bookmarks, marks)

    timestamp = int(os.path.splitext(video_path)[0].split('-')[-1])
    for sec_since_start, texts in bookmarks.items():
        webapi.add_bookmark(recording_id, ' | '.join(texts), '', timestamp + sec_since_start)
def Object_tracking(Yolo, video_path, output_path, input_size=416, show=False,
                    CLASSES=YOLO_COCO_CLASSES, score_threshold=0.3, iou_threshold=0.45,
                    rectangle_colors='', Track_only=[]):
    # Definition of the parameters
    max_cosine_distance = 0.7
    nn_budget = None

    # Initialize the Deep SORT objects
    model_filename = 'model_data/mars-small128.pb'
    encoder = gdet.create_box_encoder(model_filename, batch_size=1)
    metric = nn_matching.NearestNeighborDistanceMetric("cosine", max_cosine_distance, nn_budget)
    tracker = Tracker(metric)

    times, times_2 = [], []

    if video_path:
        vid = cv2.VideoCapture(video_path)  # detect on video
    else:
        vid = cv2.VideoCapture(0)           # detect from webcam

    # By default VideoCapture returns float instead of int
    length = int(vid.get(cv2.CAP_PROP_FRAME_COUNT))
    width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(vid.get(cv2.CAP_PROP_FPS))
    print("VIDEO PROPERTIES: FrameCount:{}\tWidth:{}\tHeight:{}\tFps:{}".format(length, width, height, fps))

    codec = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter(output_path, codec, fps, (width, height))  # output_path must be .mp4

    NUM_CLASS = read_class_names(CLASSES)
    key_list = list(NUM_CLASS.keys())
    val_list = list(NUM_CLASS.values())

    # 1. Background detection
    backSub = cv2.createBackgroundSubtractorMOG2(history=400, varThreshold=16, detectShadows=False)
    bgMask = None
    frame_no = 0

    while True:
        _, frame = vid.read()
        frame_no += 1
        try:
            original_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        except:
            break

        # 1.1 Background update
        fgMask = backSub.apply(original_frame)
        bgMask = backSub.getBackgroundImage()
        if frame_no % 100 == 0:
            print(frame_no)

        image_data = image_preprocess(np.copy(original_frame), [input_size, input_size])
        image_data = image_data[np.newaxis, ...].astype(np.float32)

        t1 = time.time()
        if YOLO_FRAMEWORK == "tf":
            pred_bbox = Yolo.predict(image_data)
        elif YOLO_FRAMEWORK == "trt":
            batched_input = tf.constant(image_data)
            result = Yolo(batched_input)
            pred_bbox = []
            for key, value in result.items():
                value = value.numpy()
                pred_bbox.append(value)
        t2 = time.time()

        pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox]
        pred_bbox = tf.concat(pred_bbox, axis=0)

        bboxes = postprocess_boxes(pred_bbox, original_frame, input_size, score_threshold)
        bboxes = nms(bboxes, iou_threshold, method='nms')

        # Extract bboxes to boxes (x, y, width, height), scores and names
        boxes, scores, names = [], [], []
        for bbox in bboxes:
            if (len(Track_only) != 0 and NUM_CLASS[int(bbox[5])] in Track_only) or len(Track_only) == 0:
                boxes.append([bbox[0].astype(int), bbox[1].astype(int),
                              bbox[2].astype(int) - bbox[0].astype(int),
                              bbox[3].astype(int) - bbox[1].astype(int)])
                scores.append(bbox[4])
                names.append(NUM_CLASS[int(bbox[5])])

        # Obtain all the detections for the given frame.
        boxes = np.array(boxes)
        names = np.array(names)
        scores = np.array(scores)
        features = np.array(encoder(original_frame, boxes))
        detections = [Detection(bbox, score, class_name, feature)
                      for bbox, score, class_name, feature in zip(boxes, scores, names, features)]

        # Pass detections to the Deep SORT object and obtain the track information.
        tracker.predict()
        tracker.update(detections)

        # Obtain info from the tracks
        tracked_bboxes = []
        for track in tracker.tracks:
            if not track.is_confirmed() or track.time_since_update > 5:
                continue
            bbox = track.to_tlbr()          # the corrected/predicted bounding box
            class_name = track.get_class()  # the class name of this object
            tracking_id = track.track_id    # the ID of this track
            index = key_list[val_list.index(class_name)]  # predicted object index by object name
            # Structure the data so it can be used with the draw_bbox function
            tracked_bboxes.append(bbox.tolist() + [tracking_id, index])

            # Save to file
            box_item = bbox.tolist() + [tracking_id, index, frame_no]
            ts.save(box_item)

        # Draw detections on the frame
        image = draw_bbox(original_frame, tracked_bboxes, CLASSES=CLASSES, tracking=True)

        t3 = time.time()
        times.append(t2 - t1)
        times_2.append(t3 - t1)
        times = times[-20:]
        times_2 = times_2[-20:]

        ms = sum(times) / len(times) * 1000
        fps = 1000 / ms
        fps2 = 1000 / (sum(times_2) / len(times_2) * 1000)

        image = cv2.putText(image, "Time: {:.1f} FPS".format(fps), (0, 30),
                            cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 0, 255), 2)
        print("Time: {:.2f}ms, Detection FPS: {:.1f}, total FPS: {:.1f}".format(ms, fps, fps2))

        if output_path != '':
            out.write(image)
        if show:
            cv2.imshow('output', image)
            if cv2.waitKey(25) & 0xFF == ord("q"):
                cv2.destroyAllWindows()
                break

    cv2.imwrite(ts.out_bg_img, bgMask)
    cv2.destroyAllWindows()
def Object_tracking(Yolo, video_path, output_path, input_size=416, show=False,
                    CLASSES=YOLO_COCO_CLASSES, score_threshold=0.3, iou_threshold=0.45,
                    rectangle_colors='', Track_only=[], custom_yolo=None,
                    custom_classes=YOLO_CUSTOM_CLASSES, Custom_track_only=[]):
    # Definition of the parameters
    max_cosine_distance = 0.7
    nn_budget = None

    # Initialize the Deep SORT objects
    model_filename = 'model_data/mars-small128.pb'
    encoder = gdet.create_box_encoder(model_filename, batch_size=1)
    metric = nn_matching.NearestNeighborDistanceMetric("cosine", max_cosine_distance, nn_budget)
    tracker = Tracker(metric)

    times, times_2 = [], []

    if video_path:
        vid = cv2.VideoCapture(video_path)  # detect on video
    else:
        vid = cv2.VideoCapture(0)           # detect from webcam

    # By default VideoCapture returns float instead of int
    width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(vid.get(cv2.CAP_PROP_FPS))
    codec = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter(output_path, codec, fps, (width, height))  # output_path must be .mp4

    NUM_CLASS = read_class_names(CLASSES)
    key_list = list(NUM_CLASS.keys())
    val_list = list(NUM_CLASS.values())

    # Flags and variables for made baskets and possession
    possession = None
    possession_list = []
    combined_possession_avg = 0.5
    total_basket_count = 0
    basket_frame_list = []
    baskets_dict = {"Dark": 0, "Light": 0}
    made_basket_first_frame = 0
    made_basket_frames = 0
    basket_marked = False

    if custom_yolo:
        NUM_CUSTOM_CLASS = read_class_names(custom_classes)
        custom_key_list = list(NUM_CUSTOM_CLASS.keys())
        custom_val_list = list(NUM_CUSTOM_CLASS.values())

    frame_counter = 0
    # Loop through each frame in the video
    while True:
        _, frame = vid.read()
        try:
            first_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            original_frame = cv2.cvtColor(first_frame, cv2.COLOR_BGR2RGB)
            frame_counter += 1
        except:
            break

        image_data = image_preprocess(np.copy(first_frame), [input_size, input_size])
        image_data = image_data[np.newaxis, ...].astype(np.float32)

        t1 = time.time()

        # CUSTOM BLOCK FOR BASKETBALL
        if custom_yolo:
            if YOLO_FRAMEWORK == "tf":
                # Use the custom YOLO model to make a prediction on the image data
                custom_pred_bbox = custom_yolo.predict(image_data)
                # Reshape the outputs into the form expected by post-processing
                custom_pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in custom_pred_bbox]
                custom_pred_bbox = tf.concat(custom_pred_bbox, axis=0)
                # Keep boxes above the threshold
                custom_bboxes = postprocess_boxes(custom_pred_bbox, original_frame, input_size, 0.3)
                # custom_bboxes = nms(custom_bboxes, iou_threshold, method='nms')

                # Extract bboxes to boxes (x, y, width, height), scores and names
                custom_boxes, custom_scores, custom_names = [], [], []
                for bbox in custom_bboxes:
                    if (len(Custom_track_only) != 0 and NUM_CUSTOM_CLASS[int(bbox[5])] in Custom_track_only) or len(Custom_track_only) == 0:
                        custom_boxes.append([bbox[0].astype(int), bbox[1].astype(int),
                                             bbox[2].astype(int) - bbox[0].astype(int),
                                             bbox[3].astype(int) - bbox[1].astype(int)])
                        custom_scores.append(bbox[4])
                        custom_names.append(NUM_CUSTOM_CLASS[int(bbox[5])])

                # Obtain all the detections for the given frame.
                custom_boxes = np.array(custom_boxes)
                custom_names = np.array(custom_names)
                custom_scores = np.array(custom_scores)

                # Take note of the highest-scoring made basket and basketball in each frame
                highest_scoring_basketball = 0
                basketball_box = None
                basketball_center = None
                highest_scoring_made_basket = 0
                made_basket_box = None

                # Keep only the "best" box of each kind; the model sometimes detects two,
                # but there can only be one basketball and one made basket.
                for i, bbox in enumerate(custom_bboxes):
                    name = custom_names[i]
                    score = round(custom_scores[i], 3)
                    if name == 'basketball':
                        if score > highest_scoring_basketball:
                            highest_scoring_basketball = score
                            basketball_box = bbox
                    if name == 'made-basket':
                        if score > .85 and score > highest_scoring_made_basket:
                            highest_scoring_made_basket = score
                            made_basket_box = bbox

                # If a basketball is seen, draw a box on it and note its center (for possession)
                if basketball_box is not None:
                    cv2.rectangle(original_frame, (int(basketball_box[0]), int(basketball_box[1])),
                                  (int(basketball_box[2]), int(basketball_box[3])), (0, 0, 255), 1)
                    cv2.rectangle(original_frame, (int(basketball_box[0]), int(basketball_box[1] - 30)),
                                  (int(basketball_box[0]) + (10) * 17, int(basketball_box[1])), (0, 0, 255), -1)
                    cv2.putText(original_frame, "basketball" + "-" + str(highest_scoring_basketball),
                                (int(basketball_box[0]), int(basketball_box[1] - 10)), 0, 0.5, (255, 255, 255), 1)
                    basketball_center = ((basketball_box[2] + basketball_box[0]) / 2,
                                         (basketball_box[3] + basketball_box[1]) / 2)

                if made_basket_box is not None:
                    # If there is a made basket, draw the box on it
                    cv2.rectangle(original_frame, (int(made_basket_box[0]), int(made_basket_box[1])),
                                  (int(made_basket_box[2]), int(made_basket_box[3])), (0, 255, 0), 1)
                    cv2.rectangle(original_frame, (int(made_basket_box[0]), int(made_basket_box[1] - 30)),
                                  (int(made_basket_box[0]) + (15) * 17, int(made_basket_box[1])), (0, 255, 0), -1)
                    cv2.putText(original_frame, "made-basket" + " " + str(highest_scoring_made_basket),
                                (int(made_basket_box[0]), int(made_basket_box[1] - 10)), 0, 0.6, (0, 0, 0), 1)
                    if made_basket_frames == 0:
                        # First frame of this made-basket sequence
                        made_basket_first_frame = frame_counter
                    # Increment the counter of consecutive made-basket frames
                    made_basket_frames += 1
                    # After 3 consecutive frames, if the basket has not been marked yet, count it
                    if made_basket_frames >= 3 and not basket_marked:
                        basket_marked = True
                        basket_frame_list.append(made_basket_first_frame)
                        if possession:
                            # Record which "team" scored the basket
                            baskets_dict[possession] += 1
                else:
                    # No made basket: keep the made-basket counter at zero
                    made_basket_frames = 0

                # 60 frames after a made basket, reset the "marked basket" flag to False,
                # i.e. start looking for made baskets again
                if basket_marked and frame_counter > basket_frame_list[-1] + 60:
                    basket_marked = False
        # END CUSTOM BLOCK

        # PERSON PREDICTION AND TRACKING BLOCK
        if YOLO_FRAMEWORK == "tf":
            pred_bbox = Yolo.predict(image_data)
        elif YOLO_FRAMEWORK == "trt":
            batched_input = tf.constant(image_data)
            result = Yolo(batched_input)
            pred_bbox = []
            for key, value in result.items():
                value = value.numpy()
                pred_bbox.append(value)
        t2 = time.time()

        pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox]
        pred_bbox = tf.concat(pred_bbox, axis=0)

        bboxes = postprocess_boxes(pred_bbox, original_frame, input_size, score_threshold)
        bboxes = nms(bboxes, iou_threshold, method='nms')

        # Extract bboxes to boxes (x, y, width, height), scores and names
        boxes, scores, names = [], [], []
        for bbox in bboxes:
            if (len(Track_only) != 0 and NUM_CLASS[int(bbox[5])] in Track_only) or len(Track_only) == 0:
                w = bbox[2].astype(int) - bbox[0].astype(int)
                h = bbox[3].astype(int) - bbox[1].astype(int)
                if h < height / 3 and w < width / 4:
                    if h > 120:
                        boxes.append([bbox[0].astype(int), bbox[1].astype(int), w, h])
                        scores.append(bbox[4])
                        names.append(NUM_CLASS[int(bbox[5])])

        # Obtain all the detections for the given frame.
        boxes = np.array(boxes)
        names = np.array(names)
        scores = np.array(scores)

        # Detect jersey color using the tracked person's bounding box
        patches = [gdet.extract_image_patch(frame, box, [box[3], box[2]]) for box in boxes]
        color_ratios = [find_color(patch) for patch in patches]

        features = np.array(encoder(original_frame, boxes))
        # Mark the detections
        detections = [Detection(bbox, score, class_name, feature, color_ratio)
                      for bbox, score, class_name, feature, color_ratio
                      in zip(boxes, scores, names, features, color_ratios)]

        # Pass detections to the Deep SORT object and obtain the track information.
        tracker.predict()
        tracker.update(detections)

        # Obtain info from the tracks
        tracked_bboxes = []
        color_ratio_list = []
        check_possession = False
        for track in tracker.tracks:
            if not track.is_confirmed() or track.time_since_update > 5:
                continue
            color_ratio = track.get_color_ratio()
            color_ratio_list.append(color_ratio)
            bbox = track.to_tlbr()          # the corrected/predicted bounding box
            class_name = track.get_class()  # the class name of this object
            tracking_id = track.track_id    # the ID of this track
            index = key_list[val_list.index(class_name)]  # predicted object index by object name
            # Structure the data so it can be used with the draw_bbox function
            tracked_bboxes.append(bbox.tolist() + [tracking_id, index])

            # If there is a basketball in the frame and it lies inside a person's
            # bounding box, note which team's box it is in for possession
            if basketball_center:
                if basketball_center[0] >= bbox[0] and basketball_center[0] <= bbox[2]:
                    if basketball_center[1] >= bbox[1] and basketball_center[1] <= bbox[3]:
                        check_possession = True
                        if color_ratio <= .2:
                            # light team
                            possession_list.append(0)
                        else:
                            # dark team
                            possession_list.append(1)
            else:
                # No basketball in the frame
                # possession_list.append(-1)
                pass

        # If the ball is inside a bounding box, update the possession tracker
        if check_possession:
            if len(possession_list) > 60:
                # Average the last 60 possession marks to estimate current possession,
                # weighting the most recent detections more (this heuristic is a WIP)
                possession_list = possession_list[-60:]
                last_60_avg = sum(possession_list[-60:]) / 60
                last_30_avg = sum(possession_list[-30:]) / 30
                last_15_avg = sum(possession_list[-15:]) / 15
                last_5_avg = sum(possession_list[-5:]) / 5
                combined_possession_avg = round((last_60_avg + last_30_avg + last_15_avg + last_5_avg) / 4, 3)
            else:
                combined_possession_avg = round(sum(possession_list) / len(possession_list), 3)

            # Use the possession average to determine who has the ball right now
            if combined_possession_avg < 0.5:
                possession = "Light"
            elif combined_possession_avg > 0.5:
                possession = "Dark"

        # Draw detections on the frame
        image = draw_bbox(original_frame, tracked_bboxes, color_ratios=color_ratio_list,
                          CLASSES=CLASSES, tracking=True)

        t3 = time.time()
        times.append(t2 - t1)
        times_2.append(t3 - t1)
        times = times[-20:]
        times_2 = times_2[-20:]

        ms = sum(times) / len(times) * 1000
        fps = 1000 / ms
        fps2 = 1000 / (sum(times_2) / len(times_2) * 1000)

        if possession == "Light":
            image = cv2.putText(image, "Possession: {}".format(possession), (width - 400, 30),
                                cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (50, 255, 255), 2)
        else:
            image = cv2.putText(image, "Possession: {}".format(possession), (width - 400, 30),
                                cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 0, 255), 2)
        image = cv2.putText(image, "Possession Avg: {}".format(combined_possession_avg), (400, 30),
                            cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 0, 255), 2)
        image = cv2.putText(image, "Time: {:.1f} FPS".format(fps), (0, 30),
                            cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 0, 255), 2)
        print("Time: {:.2f}ms, Detection FPS: {:.1f}, total FPS: {:.1f}".format(ms, fps, fps2))

        if output_path != '':
            out.write(image)
        if show:
            cv2.imshow('output', image)
            if cv2.waitKey(25) & 0xFF == ord("q"):
                cv2.destroyAllWindows()
                break

    cv2.destroyAllWindows()
    return_data = {"baskets_dict": baskets_dict, "basket_frame_list": basket_frame_list}
    print("video saved to {}".format(output_path))
    return return_data
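A hedged invocation sketch for the basketball variant above, reusing Create_Yolo and load_yolo_weights from the earlier listings; the checkpoint path, video filenames, and custom class names are assumptions.

# Sketch: how the basketball variant might be invoked (paths and class names are assumptions).
yolo = Create_Yolo(input_size=YOLO_INPUT_SIZE)
load_yolo_weights(yolo, YOLO_V3_WEIGHTS)                    # COCO person detector
custom_yolo = Create_Yolo(input_size=YOLO_INPUT_SIZE, CLASSES=YOLO_CUSTOM_CLASSES)
custom_yolo.load_weights("./checkpoints/yolov3_basketball")  # basketball / made-basket detector
stats = Object_tracking(yolo, "game.mp4", "game_tracked.mp4", input_size=YOLO_INPUT_SIZE,
                        show=False, Track_only=["person"], custom_yolo=custom_yolo,
                        Custom_track_only=["basketball", "made-basket"])
print(stats["baskets_dict"], stats["basket_frame_list"])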
def Object_tracking(Yolo, video_path, input_size=416, CLASSES=YOLO_COCO_CLASSES,
                    score_threshold=0.3, iou_threshold=0.45, Track_only=["person"]):
    # Definition of the parameters
    max_cosine_distance = 0.7
    nn_budget = None

    # Initialize the Deep SORT objects
    model_filename = 'data/model_data/mars-small128.pb'
    encoder = gdet.create_box_encoder(model_filename, batch_size=1)
    metric = nn_matching.NearestNeighborDistanceMetric("cosine", max_cosine_distance, nn_budget)
    tracker = Tracker(metric)

    times, times_2 = [], []

    vid = cv2.VideoCapture(video_path)  # detect on video

    # By default VideoCapture returns float instead of int
    width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(vid.get(cv2.CAP_PROP_FPS))
    codec = cv2.VideoWriter_fourcc(*'XVID')

    NUM_CLASS = read_class_names(CLASSES)
    key_list = list(NUM_CLASS.keys())
    val_list = list(NUM_CLASS.values())

    # Process the first 120 frames
    for x in range(120):
        _, frame = vid.read()
        try:
            original_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        except:
            break

        image_data = image_preprocess(np.copy(original_frame), [input_size, input_size])
        image_data = image_data[np.newaxis, ...].astype(np.float32)

        t1 = time.time()
        if YOLO_FRAMEWORK == "tf":
            pred_bbox = Yolo.predict(image_data)
        elif YOLO_FRAMEWORK == "trt":
            batched_input = tf.constant(image_data)
            result = Yolo(batched_input)
            pred_bbox = []
            for key, value in result.items():
                value = value.numpy()
                pred_bbox.append(value)
        t2 = time.time()

        pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox]
        pred_bbox = tf.concat(pred_bbox, axis=0)

        bboxes = postprocess_boxes(pred_bbox, original_frame, input_size, score_threshold)
        bboxes = nms(bboxes, iou_threshold, method='nms')

        # Extract bboxes to boxes (x, y, width, height), scores and names
        boxes, scores, names = [], [], []
        for bbox in bboxes:
            if (len(Track_only) != 0 and NUM_CLASS[int(bbox[5])] in Track_only) or len(Track_only) == 0:
                boxes.append([bbox[0].astype(int), bbox[1].astype(int),
                              bbox[2].astype(int) - bbox[0].astype(int),
                              bbox[3].astype(int) - bbox[1].astype(int)])
                scores.append(bbox[4])
                names.append(NUM_CLASS[int(bbox[5])])

        # Obtain all the detections for the given frame.
        boxes = np.array(boxes)
        names = np.array(names)
        scores = np.array(scores)
        features = np.array(encoder(original_frame, boxes))
        detections = [Detection(bbox, score, class_name, feature)
                      for bbox, score, class_name, feature in zip(boxes, scores, names, features)]

        # Pass detections to the Deep SORT object and obtain the track information.
        tracker.predict()
        tracker.update(detections)

        # Obtain info from the tracks
        tracked_bboxes = []
        for track in tracker.tracks:
            if not track.is_confirmed() or track.time_since_update > 5:
                continue
            bbox = track.to_tlbr()          # the corrected/predicted bounding box
            class_name = track.get_class()  # the class name of this object
            tracking_id = track.track_id    # the ID of this track
            index = key_list[val_list.index(class_name)]  # predicted object index by object name
            # Structure the data so it can be used with the draw_bbox function
            tracked_bboxes.append(bbox.tolist() + [tracking_id, index])

        # Draw detections on the frame
        image = draw_bbox(original_frame, tracked_bboxes, CLASSES=CLASSES, tracking=True)

        t3 = time.time()
        times.append(t2 - t1)
        times_2.append(t3 - t1)
        times = times[-20:]
        times_2 = times_2[-20:]

        ms = sum(times) / len(times) * 1000
        fps = 1000 / ms
        fps2 = 1000 / (sum(times_2) / len(times_2) * 1000)

        image = cv2.putText(image, "Time: {:.1f} FPS".format(fps), (0, 30),
                            cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 0, 255), 2)
        out_tracked_bboxes.append(tracked_bboxes)  # out_tracked_bboxes must be defined by the enclosing scope
        print("Time: {:.2f}ms, Detection FPS: {:.1f}, total FPS: {:.1f}".format(ms, fps, fps2))
def get_mAP(model, dataset, score_threshold=0.25, iou_threshold=0.50, TEST_INPUT_SIZE=TEST_INPUT_SIZE):
    MINOVERLAP = 0.5  # default value (defined in the PASCAL VOC 2012 challenge)
    NUM_CLASS = read_class_names(TRAIN_CLASSES)

    ground_truth_dir_path = 'mAP/ground-truth'
    if os.path.exists(ground_truth_dir_path):
        shutil.rmtree(ground_truth_dir_path)
    if not os.path.exists('mAP'):
        os.mkdir('mAP')
    os.mkdir(ground_truth_dir_path)

    print(f'\ncalculating mAP{int(iou_threshold*100)}...\n')

    gt_counter_per_class = {}
    for index in range(dataset.num_samples):
        ann_dataset = dataset.annotations[index]
        original_image, bbox_data_gt = dataset.parse_annotation(ann_dataset, True)

        if len(bbox_data_gt) == 0:
            bboxes_gt = []
            classes_gt = []
        else:
            bboxes_gt, classes_gt = bbox_data_gt[:, :4], bbox_data_gt[:, 4]
        ground_truth_path = os.path.join(ground_truth_dir_path, str(index) + '.txt')
        num_bbox_gt = len(bboxes_gt)

        bounding_boxes = []
        for i in range(num_bbox_gt):
            class_name = NUM_CLASS[classes_gt[i]]
            xmin, ymin, xmax, ymax = list(map(str, bboxes_gt[i]))
            bbox = xmin + " " + ymin + " " + xmax + " " + ymax
            bounding_boxes.append({"class_name": class_name, "bbox": bbox, "used": False})

            # Count that object
            if class_name in gt_counter_per_class:
                gt_counter_per_class[class_name] += 1
            else:
                # The class did not exist yet
                gt_counter_per_class[class_name] = 1
            bbox_mess = ' '.join([class_name, xmin, ymin, xmax, ymax]) + '\n'

        with open(f'{ground_truth_dir_path}/{str(index)}_ground_truth.json', 'w') as outfile:
            json.dump(bounding_boxes, outfile)

    gt_classes = list(gt_counter_per_class.keys())
    # Sort the classes alphabetically
    gt_classes = sorted(gt_classes)
    n_classes = len(gt_classes)

    times = []
    json_pred = [[] for i in range(n_classes)]
    for index in range(dataset.num_samples):
        ann_dataset = dataset.annotations[index]
        image_name = ann_dataset[0].split('/')[-1]
        original_image, bbox_data_gt = dataset.parse_annotation(ann_dataset, True)

        image = image_preprocess(np.copy(original_image), [TEST_INPUT_SIZE, TEST_INPUT_SIZE])
        image_data = tf.expand_dims(image, 0)

        t1 = time.time()
        pred_bbox = model.predict(image_data)
        t2 = time.time()
        times.append(t2 - t1)

        pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox]
        pred_bbox = tf.concat(pred_bbox, axis=0)

        bboxes = postprocess_boxes(pred_bbox, original_image, TEST_INPUT_SIZE, score_threshold)
        bboxes = nms(bboxes, iou_threshold, method='nms')

        for bbox in bboxes:
            coor = np.array(bbox[:4], dtype=np.int32)
            score = bbox[4]
            class_ind = int(bbox[5])
            class_name = NUM_CLASS[class_ind]
            score = '%.4f' % score
            xmin, ymin, xmax, ymax = list(map(str, coor))
            bbox = xmin + " " + ymin + " " + xmax + " " + ymax
            json_pred[gt_classes.index(class_name)].append(
                {"confidence": str(score), "file_id": str(index), "bbox": str(bbox)})

    ms = sum(times) / len(times) * 1000
    fps = 1000 / ms

    for class_name in gt_classes:
        json_pred[gt_classes.index(class_name)].sort(key=lambda x: float(x['confidence']), reverse=True)
        with open(f'{ground_truth_dir_path}/{class_name}_predictions.json', 'w') as outfile:
            json.dump(json_pred[gt_classes.index(class_name)], outfile)

    # Calculate the AP for each class
    sum_AP = 0.0
    ap_dictionary = {}
    # Open the file that will store the results
    with open("mAP/results.txt", 'w') as results_file:
        results_file.write("# AP and precision/recall per class\n")
        count_true_positives = {}
        for class_index, class_name in enumerate(gt_classes):
            count_true_positives[class_name] = 0
            # Load the predictions of that class
            predictions_file = f'{ground_truth_dir_path}/{class_name}_predictions.json'
            predictions_data = json.load(open(predictions_file))

            # Assign predictions to ground-truth objects
            nd = len(predictions_data)
            tp = [0] * nd  # array of zeros of size nd
            fp = [0] * nd
            for idx, prediction in enumerate(predictions_data):
                file_id = prediction["file_id"]
                # Assign the prediction to a ground-truth object, if any:
                # open the ground truth with that file_id
                gt_file = f'{ground_truth_dir_path}/{str(file_id)}_ground_truth.json'
                ground_truth_data = json.load(open(gt_file))
                ovmax = -1
                gt_match = -1
                # Load the predicted bounding box
                bb = [float(x) for x in prediction["bbox"].split()]
                for obj in ground_truth_data:
                    # Look for a class_name match
                    if obj["class_name"] == class_name:
                        bbgt = [float(x) for x in obj["bbox"].split()]  # ground-truth bounding box
                        bi = [max(bb[0], bbgt[0]), max(bb[1], bbgt[1]),
                              min(bb[2], bbgt[2]), min(bb[3], bbgt[3])]
                        iw = bi[2] - bi[0] + 1
                        ih = bi[3] - bi[1] + 1
                        if iw > 0 and ih > 0:
                            # Compute overlap (IoU) = area of intersection / area of union
                            ua = (bb[2] - bb[0] + 1) * (bb[3] - bb[1] + 1) \
                                 + (bbgt[2] - bbgt[0] + 1) * (bbgt[3] - bbgt[1] + 1) - iw * ih
                            ov = iw * ih / ua
                            if ov > ovmax:
                                ovmax = ov
                                gt_match = obj

                # Assign the prediction as true positive / don't care / false positive
                if ovmax >= MINOVERLAP:  # if ovmax exceeds the minimum overlap
                    if not bool(gt_match["used"]):
                        # True positive
                        tp[idx] = 1
                        gt_match["used"] = True
                        count_true_positives[class_name] += 1
                        # Update the ".json" file
                        with open(gt_file, 'w') as f:
                            f.write(json.dumps(ground_truth_data))
                    else:
                        # False positive (multiple detection)
                        fp[idx] = 1
                else:
                    # False positive
                    fp[idx] = 1

            # Compute precision/recall
            cumsum = 0
            for idx, val in enumerate(fp):
                fp[idx] += cumsum
                cumsum += val
            cumsum = 0
            for idx, val in enumerate(tp):
                tp[idx] += cumsum
                cumsum += val

            rec = tp[:]
            for idx, val in enumerate(tp):
                rec[idx] = float(tp[idx]) / gt_counter_per_class[class_name]
            prec = tp[:]
            for idx, val in enumerate(tp):
                prec[idx] = float(tp[idx]) / (fp[idx] + tp[idx])

            ap, mrec, mprec = voc_ap(rec, prec)
            sum_AP += ap
            text = "{0:.3f}%".format(ap * 100) + " = " + class_name + " AP "

            rounded_prec = ['%.3f' % elem for elem in prec]
            rounded_rec = ['%.3f' % elem for elem in rec]
            # Write to results.txt
            results_file.write(text + "\n Precision: " + str(rounded_prec)
                               + "\n Recall :" + str(rounded_rec) + "\n\n")
            print(text)
            ap_dictionary[class_name] = ap

        results_file.write("\n# mAP of all classes\n")
        mAP = sum_AP / n_classes
        text = "mAP = {:.3f}%, {:.2f} FPS".format(mAP * 100, fps)
        results_file.write(text + "\n")
        print(text)
        return mAP * 100
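A hedged sketch of how get_mAP would typically be called, assuming the __init__ shown earlier belongs to a class named Dataset and that a trained checkpoint exists at the path used here; both are assumptions.

# Sketch: evaluate a trained model on the test split (class name and checkpoint path are assumptions).
yolo = Create_Yolo(input_size=YOLO_INPUT_SIZE, CLASSES=TRAIN_CLASSES)
yolo.load_weights("./checkpoints/yolov3_custom")
testset = Dataset('test')
mAP = get_mAP(yolo, testset, score_threshold=0.05, iou_threshold=0.50)
print(f"test mAP50: {mAP:.2f}")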
def Object_tracking(YoloV3, video_path, output_path, input_size=416, show=False,
                    CLASSES=YOLO_COCO_CLASSES, score_threshold=0.3, iou_threshold=0.45,
                    rectangle_colors='', Track_only=[]):
    # Definition of the parameters
    max_cosine_distance = 0.7
    nn_budget = None

    # Initialize the Deep SORT objects
    model_filename = 'model_data/mars-small128.pb'
    encoder = gdet.create_box_encoder(model_filename, batch_size=1)
    metric = nn_matching.NearestNeighborDistanceMetric("cosine", max_cosine_distance, nn_budget)
    tracker = Tracker(metric)

    times = []

    if video_path:
        vid = cv2.VideoCapture(video_path)  # detect on video
    else:
        vid = cv2.VideoCapture(0)           # detect from webcam

    # By default VideoCapture returns float instead of int
    width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(vid.get(cv2.CAP_PROP_FPS))
    codec = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter(output_path, codec, fps, (width, height))  # output_path must be .mp4

    NUM_CLASS = read_class_names(CLASSES)
    key_list = list(NUM_CLASS.keys())
    val_list = list(NUM_CLASS.values())

    while True:
        _, img = vid.read()
        try:
            original_image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        except:
            break

        image_data = image_preprocess(np.copy(original_image), [input_size, input_size])
        image_data = tf.expand_dims(image_data, 0)

        t1 = time.time()
        pred_bbox = YoloV3.predict(image_data)
        t2 = time.time()
        times.append(t2 - t1)
        times = times[-20:]

        pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox]
        pred_bbox = tf.concat(pred_bbox, axis=0)

        bboxes = postprocess_boxes(pred_bbox, original_image, input_size, score_threshold)
        bboxes = nms(bboxes, iou_threshold, method='nms')

        # Extract bboxes to boxes (x, y, width, height), scores and names
        boxes, scores, names = [], [], []
        for bbox in bboxes:
            if (len(Track_only) != 0 and NUM_CLASS[int(bbox[5])] in Track_only) or len(Track_only) == 0:
                boxes.append([bbox[0].astype(int), bbox[1].astype(int),
                              bbox[2].astype(int) - bbox[0].astype(int),
                              bbox[3].astype(int) - bbox[1].astype(int)])
                scores.append(bbox[4])
                names.append(NUM_CLASS[int(bbox[5])])

        # Obtain all the detections for the given frame.
        boxes = np.array(boxes)
        names = np.array(names)
        scores = np.array(scores)
        features = np.array(encoder(original_image, boxes))
        detections = [Detection(bbox, score, class_name, feature)
                      for bbox, score, class_name, feature in zip(boxes, scores, names, features)]

        # Pass detections to the Deep SORT object and obtain the track information.
        tracker.predict()
        tracker.update(detections)

        # Obtain info from the tracks
        tracked_bboxes = []
        for track in tracker.tracks:
            if not track.is_confirmed() or track.time_since_update > 5:
                continue
            bbox = track.to_tlbr()          # the corrected/predicted bounding box
            class_name = track.get_class()  # the class name of this object
            tracking_id = track.track_id    # the ID of this track
            index = key_list[val_list.index(class_name)]  # predicted object index by object name
            # Structure the data so it can be used with the draw_bbox function
            tracked_bboxes.append(bbox.tolist() + [tracking_id, index])

        ms = sum(times) / len(times) * 1000
        fps = 1000 / ms

        # Draw detections on the frame
        image = draw_bbox(original_image, tracked_bboxes, CLASSES=CLASSES, tracking=True)
        image = cv2.putText(image, "Time: {:.1f} FPS".format(fps), (0, 30),
                            cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 0, 255), 2)

        if output_path != '':
            out.write(image)
        if show:
            cv2.imshow('output', image)
            if cv2.waitKey(25) & 0xFF == ord("q"):
                cv2.destroyAllWindows()
                break

    cv2.destroyAllWindows()
print("Output - ", output_video_filepath) input_video = cv2.VideoCapture(input_video_filepath) fps = input_video.get(cv2.CAP_PROP_FPS) Width = int(input_video.get(cv2.CAP_PROP_FRAME_WIDTH)) # float Height = int(input_video.get(cv2.CAP_PROP_FRAME_HEIGHT)) # float total_frames = int(input_video.get(cv2.CAP_PROP_FRAME_COUNT)) fourcc = cv2.VideoWriter_fourcc(*'XVID') output_video = cv2.VideoWriter(output_video_filepath, fourcc, fps, (Width, Height)) times = [] CLASSES = "model_data/coco.names" NUM_CLASS = read_class_names(CLASSES) key_list = list(NUM_CLASS.keys()) val_list = list(NUM_CLASS.values()) classes = None with open('coco.names.txt', 'r') as f: classes = [line.strip() for line in f.readlines()] # read pre-trained model and config file net = cv2.dnn.readNet( './model_data/yolov3.weights', 'yolov3.cfg') # or ./model_data/yolov4.weights and yolov4.cfg max_cosine_distance = 1 nn_budget = None #initialize deep sort object
def track_object(Yolo, video_path, vid_output_path, text_output_path, input_size=416,
                 show=False, CLASSES=YOLO_COCO_CLASSES, score_threshold=0.3,
                 iou_threshold=0.45, rectangle_colors='', tracking=True, track_only=[],
                 tracker_max_age=30, passenger_det=False, face_score_threshold=0.3,
                 color="bincount"):
    """
    Run detection and tracking on a video.

    :param Yolo: <model_obj> YOLO model for vehicle detection
    :param video_path: <str> path to the video file; leave empty to use the camera
    :param vid_output_path: <str> path to save the processed video; leave empty to not save
    :param text_output_path: <str> path to save per-frame detections as CSV; leave empty to not save
    :param input_size: <int> YOLO model input size
    :param show: <bool> True to watch the processing live
    :param CLASSES: <obj> YOLO model classes; by default taken from the config file
    :param score_threshold: <float> minimum confidence for vehicle detection
    :param iou_threshold: <float> minimum bounding-box overlap for two boxes to count as the same object
    :param rectangle_colors: bounding-box colors; currently unused
    :param tracking: <bool> whether to use vehicle tracking
    :param track_only: <list> classes to track if the detector detects more
    :param tracker_max_age: <int> number of missed frames before a track is deleted
    :param passenger_det: <bool> whether to initialize face (passenger) detection
    :param face_score_threshold: <float> minimum confidence for face detection
    :param color: <str> color-detection method to use; None to disable
    :return:
    """
    if not Yolo:
        Yolo = load_yolo_model()
    if passenger_det:
        passenger_det = FaceDetector()
    else:
        passenger_det = None

    if text_output_path:
        write_csv([["x1", "y1", "x2", "y2", "id", "class", "probability",
                    "color" if color else None,
                    "passengers" if passenger_det else None]], text_output_path)

    # Definition of the Deep SORT parameters
    max_cosine_distance = 0.7
    nn_budget = None

    # Initialize the Deep SORT objects
    model_filename = 'model_data/mars-small128.pb'
    encoder = gdet.create_box_encoder(model_filename, batch_size=1)
    metric = nn_matching.NearestNeighborDistanceMetric("cosine", max_cosine_distance, nn_budget)
    tracker = Tracker(metric, max_age=tracker_max_age)

    times, times_2 = [], []

    if video_path:
        vid = cv2.VideoCapture(video_path)  # detect on video
    else:
        vid = cv2.VideoCapture(0)           # detect from webcam

    # By default VideoCapture returns float instead of int
    width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(vid.get(cv2.CAP_PROP_FPS))
    codec = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter(vid_output_path, codec, fps, (width, height))  # vid_output_path must be .mp4

    NUM_CLASS = read_class_names(CLASSES)
    key_list = list(NUM_CLASS.keys())
    val_list = list(NUM_CLASS.values())

    while True:
        _, frame = vid.read()
        try:
            original_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        except:
            break

        image_data = image_preprocess(np.copy(original_frame), [input_size, input_size])
        image_data = image_data[np.newaxis, ...].astype(np.float32)

        t1 = time.time()
        if YOLO_FRAMEWORK == "tf":
            pred_bbox = Yolo.predict(image_data)
        elif YOLO_FRAMEWORK == "trt":
            batched_input = tf.constant(image_data)
            result = Yolo(batched_input)
            pred_bbox = []
            for key, value in result.items():
                value = value.numpy()
                pred_bbox.append(value)
        t2 = time.time()

        pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox]
        pred_bbox = tf.concat(pred_bbox, axis=0)

        bboxes = postprocess_boxes(pred_bbox, original_frame, input_size, score_threshold)
        bboxes = nms(bboxes, iou_threshold, method='nms')

        # Extract bboxes to boxes (x, y, width, height), scores and names
        boxes, scores, names = [], [], []
        for bbox in bboxes:
            if (len(track_only) != 0 and NUM_CLASS[int(bbox[5])] in track_only) or len(track_only) == 0:
                boxes.append([bbox[0].astype(int), bbox[1].astype(int),
                              bbox[2].astype(int) - bbox[0].astype(int),
                              bbox[3].astype(int) - bbox[1].astype(int)])
                scores.append(bbox[4])
                names.append(NUM_CLASS[int(bbox[5])])

        # Obtain all the detections for the given frame.
        boxes = np.array(boxes)
        names = np.array(names)
        scores = np.array(scores)
        features = np.array(encoder(original_frame, boxes))
        detections = [Detection(bbox, score, class_name, feature)
                      for bbox, score, class_name, feature in zip(boxes, scores, names, features)]
        # if score >= confidence_threshold]

        # Pass detections to the Deep SORT object and obtain the track information.
        tracker.predict()
        tracker.update(detections)

        # Obtain info from the tracks
        tracked_bboxes = []
        for track in tracker.tracks:
            if not track.is_confirmed() or track.time_since_update > 5:
                continue
            bbox = track.to_tlbr()          # the corrected/predicted bounding box
            class_name = track.get_class()  # the class name of this object
            tracking_id = track.track_id    # the ID of this track
            index = key_list[val_list.index(class_name)]  # predicted object index by object name
            # Structure the data so it can be used with the draw_bbox function
            tracked_bboxes.append(bbox.tolist() + [tracking_id, index, track.class_confidence])

        # Draw detections on the frame
        image = draw_bbox(original_frame, tracked_bboxes, CLASSES=CLASSES, tracking=True,
                          color=color, text_output_path=text_output_path,
                          passenger_detector=passenger_det,
                          passenger_threshold=face_score_threshold)

        t3 = time.time()
        times.append(t2 - t1)
        times_2.append(t3 - t1)
        times = times[-20:]
        times_2 = times_2[-20:]

        ms = sum(times) / len(times) * 1000
        fps = 1000 / ms
        fps2 = 1000 / (sum(times_2) / len(times_2) * 1000)

        image = cv2.putText(image, "Time: {:.1f} FPS".format(fps), (0, 30),
                            cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 0, 255), 2)

        if vid_output_path != '':
            out.write(image)
        if show:
            cv2.imshow('output', image)
            if cv2.waitKey(25) & 0xFF == ord("q"):
                cv2.destroyAllWindows()
                break

    cv2.destroyAllWindows()
def Object_tracking(Yolo, video_path, output_path, input_size=416, show=True,
                    CLASSES=YOLO_COCO_CLASSES, score_threshold=0.3, iou_threshold=0.45,
                    rectangle_colors='', Track_only=[]):
    output_file = "D:/PELUSO/ITSligo/lectures_MENG/4-Symulation and Testing/assignments/assignment 2 - group/simV5_anotation.CSV"
    csv_file = open(output_file, mode='a')
    results_csv = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    results_csv.writerow(['Frame_index', 'Score', 'Confidence', 'Pixel_Area',
                          'X1', 'Y1', 'X2', 'Y2', 'ClassID'])

    # Definition of the parameters
    max_cosine_distance = 0.9
    nn_budget = None

    # Initialize the Deep SORT objects
    model_filename = "D:/PELUSO/ITSligo/lectures_MENG/4-Symulation and Testing/assignments/assignment 2 - group/TensorFlow-2.x-YOLOv3-master/model_data/mars-small128.pb"
    encoder = gdet.create_box_encoder(model_filename, batch_size=1)
    metric = nn_matching.NearestNeighborDistanceMetric("cosine", max_cosine_distance, nn_budget)
    tracker = Tracker(metric)

    times, times_2 = [], []

    if video_path:
        vid = cv2.VideoCapture(video_path)  # detect on video
    else:
        vid = cv2.VideoCapture(0)           # detect from webcam

    # By default VideoCapture returns float instead of int
    width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(vid.get(cv2.CAP_PROP_FPS))
    codec = cv2.VideoWriter_fourcc(*'mp4v')  # or (*'XVID')
    out = cv2.VideoWriter(output_path, codec, fps, (width, height))  # output_path must be .mp4
    print("FPS:::::" + str(fps))

    NUM_CLASS = read_class_names(CLASSES)
    key_list = list(NUM_CLASS.keys())
    val_list = list(NUM_CLASS.values())

    frame_idx = 0
    while True:
        _, frame = vid.read()
        try:
            original_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        except:
            break

        image_data = image_preprocess(np.copy(original_frame), [input_size, input_size])
        image_data = image_data[np.newaxis, ...].astype(np.float32)

        t1 = time.time()
        if YOLO_FRAMEWORK == "tf":
            pred_bbox = Yolo.predict(image_data)
        elif YOLO_FRAMEWORK == "trt":
            batched_input = tf.constant(image_data)
            result = Yolo(batched_input)
            pred_bbox = []
            for key, value in result.items():
                value = value.numpy()
                pred_bbox.append(value)
        t2 = time.time()

        pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox]
        pred_bbox = tf.concat(pred_bbox, axis=0)

        bboxes = postprocess_boxes(pred_bbox, original_frame, input_size, score_threshold)
        bboxes = nms(bboxes, iou_threshold, method='nms', sigma=0.4)
        print(np.argmax(pred_bbox[:, 5:], axis=-1))

        # Extract bboxes to boxes (x, y, width, height), scores and names
        boxes, scores, names = [], [], []
        for bbox in bboxes:
            if (len(Track_only) != 0 and NUM_CLASS[int(bbox[5])] in Track_only) or len(Track_only) == 0:
                boxes.append([bbox[0].astype(int), bbox[1].astype(int),
                              bbox[2].astype(int) - bbox[0].astype(int),
                              bbox[3].astype(int) - bbox[1].astype(int)])
                scores.append(bbox[4])
                names.append(NUM_CLASS[int(bbox[5])])

        # Obtain all the detections for the given frame.
        boxes = np.array(boxes)
        names = np.array(names)
        scores = np.array(scores)
        features = np.array(encoder(original_frame, boxes))
        detections = [Detection(bbox, score, class_name, feature)
                      for bbox, score, class_name, feature in zip(boxes, scores, names, features)]

        # Pass detections to the Deep SORT object and obtain the track information.
        tracker.predict()
        tracker.update(detections)

        # Obtain info from the tracks
        tracked_bboxes = []
        tracked_scores = []
        for i, track in enumerate(tracker.tracks):
            if not track.is_confirmed() or track.time_since_update > 5:
                continue
            bbox = track.to_tlbr()          # the corrected/predicted bounding box
            class_name = track.get_class()  # the class name of this object
            tracking_id = track.track_id    # the ID of this track
            index = key_list[val_list.index(class_name)]  # predicted object index by object name
            # Structure the data so it can be used with the draw_bbox function
            tracked_bboxes.append(bbox.tolist() + [tracking_id, index])
            try:
                tracked_scores.append(track.Confidence)
            except:
                print("skip")

        frame_idx += 1
        print("------frame_idx-------" + str(frame_idx))

        # Draw detections on the frame and append them to the CSV file
        image = draw_bbox(original_frame, results_csv, tracked_bboxes, frame_idx, tracked_scores,
                          show_label=True, show_confidence=True, CLASSES=CLASSES, tracking=True)

        t3 = time.time()
        times.append(t2 - t1)
        times_2.append(t3 - t1)
        times = times[-20:]
        times_2 = times_2[-20:]

        ms = sum(times) / len(times) * 1000
        fps = 1000 / ms
        fps2 = 1000 / (sum(times_2) / len(times_2) * 1000)
        print("Time: {:.2f}ms, Detection FPS: {:.1f}, total FPS: {:.1f}".format(ms, fps, fps2))

        if output_path != '':
            out.write(image)
        if show:
            cv2.imshow('output', image)
            if cv2.waitKey(25) & 0xFF == ord("q"):
                cv2.destroyAllWindows()
                break

    csv_file.close()
    cv2.destroyAllWindows()
times, times_2 = [], []

vid = cv2.VideoCapture(video_path)
prop = cv2.cv.CV_CAP_PROP_FRAME_COUNT if imutils.is_cv2() else cv2.CAP_PROP_FRAME_COUNT
totalFrames = int(vid.get(prop))
print(f"info - Total frames are {totalFrames}")

width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = int(vid.get(cv2.CAP_PROP_FPS))
codec = cv2.VideoWriter_fourcc(*'MJPG')
out = cv2.VideoWriter(output_path, codec, fps, (width, height))

NUM_CLASS = read_class_names(YOLO_COCO_CLASSES)
key_list = list(NUM_CLASS.keys())
val_list = list(NUM_CLASS.values())

while True:
    _, frame = vid.read()
    try:
        original_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    except:
        break

    image_data = image_preprocess(np.copy(original_frame), [input_size, input_size])
    image_data = image_data[np.newaxis, ...].astype(np.float32)