Exemplo n.º 1
0
Arquivo: demo.py Projeto: SpecDI/cs407
def main(yolo, sequence_file, fps_render_rate, writeVideo_flag, labels_file,
         hide_window):
    """Detect and track people in a video, saving crops or an annotated video.

    Every frame is passed to ``yolo.detect_image`` and the resulting boxes are
    fed to a deep_sort ``Tracker``. What is written depends on
    ``writeVideo_flag``:

    * falsy: one JPEG crop per confirmed track and frame is stored under
      ``output/action_tubes/<name>/<track_id>/<frame_number>.jpg``;
    * truthy: an annotated video is written to
      ``output/annotated_videos/<name>.avi`` and the raw detector boxes of
      every frame are logged to ``detection.txt``.

    In both modes the per-track boxes ``(frame_number, [x1, y1, x2, y2])`` are
    dumped as JSON to ``output/tracked_bounding_boxes/<name>.json`` once the
    loop ends (also when it ends because of an exception).

    Args:
        yolo: Detector object exposing ``detect_image(image)``; it is called
            with a PIL image built from the frame with channels reversed.
        sequence_file: Path of the input video. The literal string ``'0'``
            is replaced by the integer ``0`` before being handed to
            ``cv2.VideoCapture`` (i.e. a capture device index).
        fps_render_rate: Frame rate passed to ``cv2.VideoWriter``.
        writeVideo_flag: Selects annotated-video mode (truthy) or crop mode.
        labels_file: Optional labels file; when not ``None`` it is parsed by
            ``parse_labels_file`` and the result is looked up by track id to
            extend the caption drawn on each track.
        hide_window: When falsy, each processed frame is shown with
            ``cv2.imshow``.
    """
    # The string '0' stands for capture device 0 rather than a file path.
    if sequence_file == '0':
        file_name = '0'
        sequence_file = 0
    else:
        file_name = os.path.splitext(os.path.basename(sequence_file))[0]

    # Compute the action map if labels provided
    action_map = dict()
    if labels_file is not None:
        action_map = parse_labels_file(labels_file)
    print(action_map)

    # Start from an empty crop directory; makedirs also creates missing
    # parents, which a plain mkdir would fail on.
    tubes_root = "output/action_tubes/" + file_name
    if not writeVideo_flag:
        if os.path.exists(tubes_root):
            shutil.rmtree(tubes_root)
        os.makedirs(tubes_root)

    # The JSON is written only once at the end, so make sure its directory
    # exists up front instead of failing after the whole video was processed.
    coords_path = 'output/tracked_bounding_boxes/' + file_name + '.json'
    os.makedirs(os.path.dirname(coords_path), exist_ok=True)

    output_seq = 'output/annotated_videos/' + file_name + '.avi'

    # Dict of coordinates for each tracked individual
    track_map = dict()

    # Definition of the parameters
    max_cosine_distance = 0.3
    nn_budget = None
    nms_max_overlap = 1.0

    # deep_sort
    model_filename = 'object_detection/model_data/mars-small128.pb'
    encoder = gdet.create_box_encoder(model_filename, batch_size=1)

    metric = nn_matching.NearestNeighborDistanceMetric("cosine",
                                                       max_cosine_distance,
                                                       nn_budget)
    tracker = Tracker(metric)

    video_capture = cv2.VideoCapture(sequence_file)
    out = None
    list_file = None
    frame_index = -1
    fps = 0.0
    frame_number = 0

    try:
        if writeVideo_flag:
            # Define the codec and create VideoWriter object
            w = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
            h = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
            fourcc = cv2.VideoWriter_fourcc(*'XVID')
            out = cv2.VideoWriter(output_seq, fourcc, fps_render_rate, (w, h))
            list_file = open('detection.txt', 'w')

        while video_capture.isOpened():
            frame_number += 1
            ret, frame = video_capture.read()
            if not ret:
                break
            t1 = time.time()

            # OpenCV delivers BGR; the detector is given an RGB PIL image.
            image = Image.fromarray(frame[..., ::-1])
            boxs = yolo.detect_image(image)
            features = encoder(frame, boxs)

            # Every detection gets a fixed confidence of 1.0.
            detections = [
                Detection(bbox, 1.0, feature)
                for bbox, feature in zip(boxs, features)
            ]

            # Run non-maxima suppression.
            boxes = np.array([d.tlwh for d in detections])
            scores = np.array([d.confidence for d in detections])
            indices = preprocessing.non_max_suppression(
                boxes, nms_max_overlap, scores)
            detections = [detections[i] for i in indices]

            # Call the tracker
            tracker.predict()
            tracker.update(detections)
            for track in tracker.tracks:
                if not track.is_confirmed() or track.time_since_update > 1:
                    continue
                bbox = track.to_tlbr()
                left, top = int(bbox[0]), int(bbox[1])
                right, bottom = int(bbox[2]), int(bbox[3])

                # Record the raw (unclamped) tracker box for this frame.
                track_map.setdefault(track.track_id, []).append(
                    (frame_number, [left, top, right, bottom]))

                if not writeVideo_flag:
                    # Tracker boxes may extend past the frame. A negative
                    # slice bound would be read as "from the end" by numpy,
                    # so clamp at 0 (bounds beyond the frame are already
                    # clipped by slicing).
                    crop_img = frame[max(top, 0):max(bottom, 0),
                                     max(left, 0):max(right, 0)].copy()
                    # A box entirely outside the frame yields an empty crop,
                    # which cv2.imwrite cannot encode; skip it.
                    if crop_img.size != 0:
                        track_dir = ("output/action_tubes/" + file_name +
                                     '/' + str(track.track_id))
                        os.makedirs(track_dir, exist_ok=True)
                        cv2.imwrite(
                            track_dir + "/" + str(frame_number) + ".jpg",
                            crop_img)
                else:
                    cv2.rectangle(frame, (left, top), (right, bottom),
                                  (255, 255, 255), 2)

                    append_str = str(track.track_id) + ": Person"
                    if track.track_id in action_map:
                        append_str += ' ' + action_map[track.track_id]
                    cv2.putText(frame, append_str, (left, top), 0,
                                5e-3 * 200, (0, 255, 0), 2)

            if writeVideo_flag:
                # Overlay the raw detections in a second colour.
                for det in detections:
                    bbox = det.to_tlbr()
                    cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])),
                                  (int(bbox[2]), int(bbox[3])), (255, 0, 0),
                                  2)

            if not hide_window:
                cv2.imshow('', cv2.resize(frame, (1200, 675)))

            if writeVideo_flag:
                # save a frame
                out.write(frame)
                frame_index = frame_index + 1
                # One line per frame: "<index> x y w h x y w h ... \n"
                fields = [str(frame_index)]
                for box in boxs:
                    fields.extend(str(box[k]) for k in range(4))
                list_file.write(' '.join(fields) + ' \n')

            # Running average of the instantaneous frame rate; a zero
            # interval (coarse clock) leaves the estimate unchanged.
            elapsed = time.time() - t1
            if elapsed > 0:
                fps = (fps + (1. / elapsed)) / 2
            print("fps= %f" % (fps))

            # Press Q to stop!
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        try:
            # Written once instead of once per frame; doing it here keeps
            # partial results when the loop is aborted by an exception.
            with open(coords_path, 'w') as fp:
                json.dump(track_map, fp)
        finally:
            video_capture.release()
            if out is not None:
                out.release()
            if list_file is not None:
                list_file.close()
            cv2.destroyAllWindows()
    def detect_n_track(self, origimg, img):
        """Detect people in one image and optionally track them.

        The pre-processed ``img`` is run through ``self.detect_model``; the
        surviving boxes are filtered by ``boxes_filtering`` and then, depending
        on ``self.tracking_type``, passed through a SORT tracker
        (``'sort'``), a deep_sort tracker (``'deep_sort'``) or used as they
        are. Tracked boxes and their ids are drawn onto ``origimg`` in place
        and each tracked region is handed to ``self.save_probe_dir`` together
        with an unannotated copy of the image.

        Returns:
            A tuple ``(vis_box, _conf, cls)`` where ``vis_box`` is the result
            of ``visualize_box`` and the other two come from
            ``boxes_filtering``; three empty lists when nothing is detected.
        """
        input_imgs = torch.from_numpy(img).float().unsqueeze(0).to(self.device)
        # Untouched copy, taken before anything is drawn on origimg.
        clean_img = origimg.copy()

        # Run the detector without building an autograd graph.
        with torch.no_grad():
            raw_output, _ = self.detect_model(input_imgs)
            kept = non_max_suppression(raw_output, self.conf_th,
                                       self.nms_thres)[0]

        if kept is None:
            return [], [], []

        _box, _conf, cls = boxes_filtering(origimg, kept, self.img_size,
                                           cls_out=[0], mode='square')
        if len(_box) == 0:
            return [], [], []

        if self.tracking_type == 'sort':
            # SORT is fed rows of [x1, y1, x2, y2, confidence].
            sort_input = np.asarray([
                [x, y, x + w, y + h, _conf[i]]
                for i, (x, y, w, h) in enumerate(_box)
            ])

            shown = []
            for trk in self.tracker.update(sort_input):
                corners = np.array(trk[:4]).astype(int)
                ident = int(trk[4])

                self.save_probe_dir(ident, clean_img, corners)
                cv2.rectangle(origimg, (corners[0], corners[1]),
                              (corners[2], corners[3]), (255, 255, 255), 2)
                cv2.putText(origimg, str(ident), (corners[0], corners[3]), 0,
                            5e-3 * 200, (0, 255, 0), 2)

                # Convert corner form to top-left / width / height.
                tlwh = corners.copy()
                tlwh[2:] -= tlwh[:2]
                shown.append(tlwh)

            vis_box = visualize_box(shown)
        elif self.tracking_type == "deep_sort":
            feats = self.encoder(origimg, _box)
            candidates = [
                Detection(one_box, 1.0, one_feat)
                for one_box, one_feat in zip(_box, feats)
            ]
            keep = preprocessing.non_max_suppression(
                np.array([c.tlwh for c in candidates]),
                1.0,
                np.array([c.confidence for c in candidates]),
            )
            candidates = [candidates[k] for k in keep]

            self.tracker.predict()
            self.tracker.update(candidates)

            shown = []
            for trk in self.tracker.tracks:
                # Only tracks updated within the last frame are drawn.
                if trk.time_since_update <= 1:
                    corners = trk.to_tlbr().astype(int)
                    self.save_probe_dir(trk.track_id, clean_img, corners)
                    cv2.rectangle(origimg, (corners[0], corners[1]),
                                  (corners[2], corners[3]), (255, 255, 255),
                                  2)
                    cv2.putText(origimg, str(trk.track_id),
                                (corners[0], corners[1]), 0, 5e-3 * 200,
                                (0, 255, 0), 2)
                    shown.append(trk.to_tlwh().astype(int))

            vis_box = visualize_box(shown)
        else:
            # No tracking configured: visualise the filtered detections.
            vis_box = visualize_box(_box)

        return vis_box, _conf, cls
Exemplo n.º 3
0
def main(yolo, hide_window, weights_file, test_mode, test_output, bayesian,
         batch_factor, input_file, progress_recorder):
    """Run detection, tracking and action recognition over ``input_file``.

    Frames are read through a ``FileVideoStream``, down-scaled to 640x360 for
    detection, and the resulting detections are rescaled and fed to a
    deep_sort ``Tracker``. Frames and their track boxes are queued in
    ``processedFrames`` / ``processedTracks`` and handed to ``processFrame``
    together with the action-recognition model; the frame it returns is
    passed to ``writeFrame``. Progress is reported every 10 frames through
    ``progress_recorder.set_progress``.

    Args:
        yolo: Detector exposing ``detect_image(image)``.
        hide_window: Forwarded to ``writeFrame``.
        weights_file: Base name (without ``.hdf5``) of the weights file under
            ``action_recognition/architectures/weights/``.
        test_mode: When truthy, ``initialiseTestMode(test_output)`` is called,
            no video writer is created and every detection box is logged to
            the module-level ``object_detection_file``.
        test_output: Argument for ``initialiseTestMode``.
        bayesian: Forwarded to ``processFrame``.
        batch_factor: Forwarded to ``processFrame``.
        input_file: Path of the input video.
        progress_recorder: Object exposing ``set_progress(current, total)``.

    Note:
        The output size is fixed at 3840x2160 and the scale factors are
        derived from it, so the code presumably expects 4K input — TODO
        confirm against callers.
    """
    # A global statement applies to the whole function wherever it appears,
    # so declare these up front rather than inside the `if`.
    global object_detection_file
    global object_tracking_directory
    global action_recognition_directory
    if test_mode:
        initialiseTestMode(test_output)

    print('Starting pipeline...')

    output_file = './gui/webapp/static/output.mp4'

    # Definition of the parameters
    max_cosine_distance = 0.3
    nn_budget = None
    nms_max_overlap = 1.0

    model = TS_CNN_LSTM(INPUT_SHAPE, CLASSES)
    model.load_weights('action_recognition/architectures/weights/' +
                       weights_file + '.hdf5')

    # Track id frame batch
    track_tubeMap = {}

    # Track id action
    track_actionMap = {}

    # deep_sort
    model_filename = 'object_detection/model_data/mars-small128.pb'
    encoder = gdet.create_box_encoder(model_filename, batch_size=1)

    metric = nn_matching.NearestNeighborDistanceMetric("cosine",
                                                       max_cosine_distance,
                                                       nn_budget)
    tracker = Tracker(metric)

    # Count the total number of frames for progress reporting.
    video_count = cv2.VideoCapture(input_file)
    frame_total = int(video_count.get(cv2.CAP_PROP_FRAME_COUNT))
    print("FRAME COUNT: ", frame_total)
    video_count.release()

    video_capture = FileVideoStream(input_file).start()
    stream_stopped = False
    out = None

    try:
        # Let input stream load some frames
        time.sleep(5)

        # Define the codec and create VideoWriter object
        w = 3840
        h = 2160
        fourcc = cv2.VideoWriter_fourcc(*'H264')
        if not test_mode:
            out = cv2.VideoWriter(output_file, fourcc, 11, (w, h))

        # Detection runs on a down-scaled frame; these factors are constant
        # for the whole run, so compute them once.
        scaledX = 640
        scaledY = 360
        xScale = w / scaledX
        yScale = h / scaledY

        fps = 0.0
        location = (0, 0)
        frame_number = 0
        track_buffer = []
        unprocessedFrames = []
        processedFrames = []
        processedTracks = []
        locations = []

        # Detection runs on every `skip`-th frame; frames in between are
        # buffered and receive interpolated boxes.
        skip = 1
        while video_capture.more():
            frame = video_capture.read()
            if not isinstance(frame, np.ndarray):
                break
            t1 = time.time()

            if frame_number % skip == 0:
                # Every 5th frame the search region is reset to the whole
                # down-scaled frame; otherwise the previous region is
                # rescaled.
                if (not location) or frame_number % 5 == 0:
                    location = [0, 0, scaledX, scaledY]
                else:
                    location = rescale(location, xScale, yScale, 0, 0, False)

                frameCopy = frame.copy()

                frame = cv2.resize(frame, (scaledX, scaledY),
                                   interpolation=cv2.INTER_AREA)
                image = Image.fromarray(frame[..., ::-1])  # bgr to rgb

                diffx = location[0]
                diffy = location[1]

                image = image.crop(
                    (location[0], location[1], location[2], location[3]))

                boxs = yolo.detect_image(image)

                # NOTE(review): boxes come from the cropped image while the
                # features are taken from the uncropped down-scaled frame —
                # confirm this offset is intended.
                features = encoder(frame, boxs)

                # Every detection gets a fixed confidence of 1.0.
                detections = [
                    Detection(bbox, 1.0, feature).rescale(
                        xScale, yScale, diffx, diffy)
                    for bbox, feature in zip(boxs, features)
                ]

                # Run non-maxima suppression.
                boxes = np.array([d.tlwh for d in detections])
                scores = np.array([d.confidence for d in detections])
                indices = preprocessing.non_max_suppression(
                    boxes, nms_max_overlap, scores)
                detections = [detections[i] for i in indices]

                if test_mode:
                    for det in detections:
                        bbox = det.to_tlbr()
                        object_detection_file.write(
                            f"{frame_number} {int(bbox[0])} {int(bbox[1])} "
                            f"{int(bbox[2])} {int(bbox[3])}\n")

                # Call the tracker
                tracker.predict()
                tracker.update(detections)

                # Continue with the full-resolution frame.
                frame = frameCopy

                tracks = {
                    track.track_id: track.to_tlbr()
                    for track in tracker.tracks
                    if track.is_confirmed() and track.time_since_update <= 1
                }

                if frame_number == 0:
                    track_buffer.append(tracks)

                    processedFrames.append(frame)
                    processedTracks.append(tracks)
                else:
                    firstTrack = track_buffer.pop(0)
                    secondTrack = tracks.copy()

                    # Give every buffered in-between frame boxes interpolated
                    # between the previous and the current detection frame.
                    for i, oldFrame in enumerate(unprocessedFrames, 1):
                        interpolated = dict()
                        for trackId in secondTrack:
                            if trackId in firstTrack:
                                min_x, min_y, max_x, max_y = interpolateBbox(
                                    firstTrack[trackId], secondTrack[trackId],
                                    i / skip)
                                interpolated[trackId] = [
                                    delta + base for delta, base in zip(
                                        [min_x, min_y, max_x, max_y],
                                        firstTrack[trackId])
                                ]

                        processedFrames.append(oldFrame)
                        processedTracks.append(interpolated)
                    unprocessedFrames = []

                    processedFrames.append(frame)
                    processedTracks.append(secondTrack)
                    track_buffer.append(secondTrack)

                location = rescale(location, xScale, yScale, 0, 0, True)
                locations.append(location)

                if frame_number >= skip:
                    frame = processFrame(locations, processedFrames,
                                         processedTracks, track_tubeMap,
                                         track_actionMap, model, test_mode,
                                         bayesian, batch_factor)

                # Corner coordinates of the current detections; used below to
                # derive the next search region.
                currentXs = []
                currentYs = []
                for det in detections:
                    bbox = det.to_tlbr()

                    currentXs.extend([int(bbox[0]), int(bbox[2])])
                    currentYs.extend([int(bbox[1]), int(bbox[3])])

            else:
                unprocessedFrames.append(frame)
                locations.append([0, 0, 0, 0])
                if frame_number >= skip:
                    frame = processFrame(locations, processedFrames,
                                         processedTracks, track_tubeMap,
                                         track_actionMap, model, test_mode,
                                         bayesian, batch_factor)

            if frame_number >= skip:
                writeFrame(frame, out, hide_window, test_mode)

            frame_number += 1

            # Updates progress bar every 10 frames
            if frame_number % 10 == 0:
                progress_recorder.set_progress(frame_number, frame_total)

            if frame_number % 5 != 0:
                location = calculateLocation(currentXs, currentYs)

            # Running average of the instantaneous frame rate; a zero
            # interval (coarse clock) leaves the estimate unchanged.
            elapsed = time.time() - t1
            if elapsed > 0:
                fps = (fps + (1. / elapsed)) / 2
            print("fps= %f" % (fps / skip))

            # Press Q to stop!
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

        video_capture.stop()
        stream_stopped = True

        # Drain the frames still queued for action recognition. This relies
        # on processFrame consuming entries of processedFrames — presumably
        # it pops one per call; verify against its definition.
        while processedFrames:
            frame_number += 1
            frame = processFrame(locations, processedFrames, processedTracks,
                                 track_tubeMap, track_actionMap, model,
                                 test_mode, bayesian, batch_factor)
            writeFrame(frame, out, hide_window, test_mode)

        # Write any frames that never received detections, in the order they
        # were read (popping from the end would emit them reversed).
        for frame in unprocessedFrames:
            frame_number += 1
            writeFrame(frame, out, hide_window, test_mode)
    finally:
        # Release everything even when processing fails half-way.
        if not stream_stopped:
            video_capture.stop()
        if out is not None:
            out.release()

        cv2.destroyAllWindows()

        if test_mode:
            object_detection_file.close()
Exemplo n.º 4
0
def main(yolo, hide_window, weights_file):
    """Annotate ``./web_server/input.mp4`` with tracked people and actions.

    Each frame is run through ``yolo.detect_image`` and a deep_sort
    ``Tracker``. For every confirmed track the cropped region is scaled to
    ``[0, 1]`` and collected; once ``FRAME_NUM`` crops are available they are
    turned into a batch by ``process_batch`` and classified by the CNN-LSTM
    model, and the highest-scoring entry of ``actions_header`` becomes the
    track's label. Annotated frames are written to
    ``./web_server/output.avi`` and the raw detector boxes of every frame are
    logged to ``detection.txt``.

    Args:
        yolo: Detector exposing ``detect_image(image)``.
        hide_window: When falsy, each annotated frame is shown with
            ``cv2.imshow``.
        weights_file: Base name (without ``.hdf5``) of the weights file under
            ``action_recognition/architectures/weights/``.
    """
    print('Starting pipeline...')

    input_file = './web_server/input.mp4'
    output_file = './web_server/output.avi'

    # Definition of the parameters
    max_cosine_distance = 0.3
    nn_budget = None
    nms_max_overlap = 1.0

    model = cnn_lstm(INPUT_SHAPE, KERNEL_SHAPE, POOL_SHAPE, CLASSES)
    model.load_weights('action_recognition/architectures/weights/' +
                       weights_file + '.hdf5')

    # Built once; they are only needed to compile the model.
    metrics = MetricsAtTopK(k=2)
    losses = LossFunctions()
    model.compile(loss=losses.weighted_binary_crossentropy,
                  optimizer='adam',
                  metrics=['accuracy',
                           metrics.recall_at_k,
                           metrics.precision_at_k,
                           metrics.f1_at_k,
                           losses.hamming_loss])

    # Track id frame batch
    track_tubeMap = {}

    # Track id action
    track_actionMap = {}

    # deep_sort
    model_filename = 'object_detection/model_data/mars-small128.pb'
    encoder = gdet.create_box_encoder(model_filename, batch_size=1)

    metric = nn_matching.NearestNeighborDistanceMetric(
        "cosine", max_cosine_distance, nn_budget)
    tracker = Tracker(metric)

    video_capture = cv2.VideoCapture(input_file)
    out = None
    list_file = None

    try:
        # Define the codec and create VideoWriter object
        w = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        h = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fourcc = cv2.VideoWriter_fourcc(*'MJPG')
        out = cv2.VideoWriter(output_file, fourcc, 11, (w, h))
        list_file = open('detection.txt', 'w')
        frame_index = -1

        fps = 0.0
        frame_number = 0
        while video_capture.isOpened():
            frame_number += 1
            ret, frame = video_capture.read()
            if not ret:
                break
            t1 = time.time()

            # OpenCV delivers BGR; the detector is given an RGB PIL image.
            image = Image.fromarray(frame[..., ::-1])
            boxs = yolo.detect_image(image)
            features = encoder(frame, boxs)

            # Every detection gets a fixed confidence of 1.0.
            detections = [
                Detection(bbox, 1.0, feature)
                for bbox, feature in zip(boxs, features)
            ]

            # Run non-maxima suppression.
            boxes = np.array([d.tlwh for d in detections])
            scores = np.array([d.confidence for d in detections])
            indices = preprocessing.non_max_suppression(
                boxes, nms_max_overlap, scores)
            detections = [detections[i] for i in indices]

            # Call the tracker
            tracker.predict()
            tracker.update(detections)
            for track in tracker.tracks:
                if not track.is_confirmed() or track.time_since_update > 1:
                    continue
                bbox = track.to_tlbr()
                left, top = int(bbox[0]), int(bbox[1])
                right, bottom = int(bbox[2]), int(bbox[3])
                track_id = track.track_id

                track_actionMap.setdefault(track_id, 'Unknown')
                tube = track_tubeMap.setdefault(track_id, [])

                # Tracker boxes may extend past the frame. A negative slice
                # bound would be read as "from the end" by numpy, so clamp at
                # 0 (bounds beyond the frame are already clipped by slicing).
                block = frame[max(top, 0):max(bottom, 0),
                              max(left, 0):max(right, 0)]
                # A box entirely outside the frame yields an empty crop that
                # carries no image content; leave it out of the action tube.
                if block.size != 0:
                    tube.append(block / 255.0)

                # Classify as soon as the tube holds FRAME_NUM crops.
                if len(tube) == FRAME_NUM:
                    batch = process_batch(tube)
                    batch = batch.reshape(1, FRAME_NUM, FRAME_LENGTH,
                                          FRAME_WIDTH, 3)

                    # Generate predictions
                    preds = model.predict(batch)[0].tolist()
                    print(preds)

                    # Start a fresh tube for this track.
                    track_tubeMap[track_id] = []
                    # The label is the class with the highest score.
                    action_label = actions_header[preds.index(max(preds))]
                    print(f"Person {track_id} is {action_label}")

                    track_actionMap[track_id] = action_label

                # Caption: "<track id> <current action label>"
                append_str = str(track_id) + ' ' + track_actionMap[track_id]
                cv2.rectangle(frame, (left, top), (right, bottom),
                              (255, 255, 255), 2)
                cv2.putText(frame, append_str, (left, top), 0, 5e-3 * 200,
                            (0, 255, 0), 2)

            # Overlay the raw detections in a second colour.
            for det in detections:
                bbox = det.to_tlbr()
                cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])),
                              (int(bbox[2]), int(bbox[3])), (255, 0, 0), 2)

            # Display video as processed if necessary
            if not hide_window:
                cv2.imshow('', cv2.resize(frame, (1200, 675)))

            # save a frame
            out.write(frame)
            frame_index = frame_index + 1
            # One line per frame: "<index> x y w h x y w h ... \n"
            fields = [str(frame_index)]
            for box in boxs:
                fields.extend(str(box[k]) for k in range(4))
            list_file.write(' '.join(fields) + ' \n')

            # Running average of the instantaneous frame rate; a zero
            # interval (coarse clock) leaves the estimate unchanged.
            elapsed = time.time() - t1
            if elapsed > 0:
                fps = (fps + (1. / elapsed)) / 2
            print("fps= %f" % (fps))

            # Press Q to stop!
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Release everything even when processing fails half-way.
        video_capture.release()
        if out is not None:
            out.release()
        if list_file is not None:
            list_file.close()
        cv2.destroyAllWindows()
    def detect_n_counting(self, origimg, img, loader=None, out=None):
        """Do object detection over 1 image.

        Runs the configured detector on ``img``, groups the resulting boxes
        by class and tracks each class: classes present in ``self.tracker``
        go through a deep_sort-style tracker with appearance features, all
        other classes through ``self.other_trackers``. Boxes and ids of
        accepted tracks are drawn onto ``origimg`` in place and reported via
        ``self.write_coordinates``.

        Args:
            origimg: Image that annotations are drawn on; a copy taken before
                drawing is used for feature extraction and probe saving.
            img: Pre-processed input for the detector (converted with
                ``torch.from_numpy``).
            loader: Object providing ``frame`` and ``path``.
                NOTE(review): defaults to ``None`` but ``loader.frame`` is
                read unconditionally below — callers apparently always pass
                one; confirm.
            out: Forwarded as ``vid_writer`` to ``self.detect_model.run`` in
                the non-YOLO branch.

        Returns:
            ``self.null_values`` when nothing is detected. No other return
            statement is visible in this view of the method.
        """
        input_imgs = torch.from_numpy(img).float().unsqueeze(0).to(self.device)
        # Unannotated copy, taken before anything is drawn on origimg.
        raw_img = origimg.copy()

        # Apply Object Detection models
        with torch.no_grad():
            if self.od_model == 'yolo':
                detections, _ = self.detect_model(input_imgs)
                detections = non_max_suppression(detections, self.conf_th,
                                                 self.nms_thres)[0]

                if detections is None:
                    return self.null_values

                box, conf, cls = boxes_filtering(raw_img,
                                                 detections,
                                                 self.img_size,
                                                 cls_out=self.cls_out,
                                                 mode=self.resize_mode)
            else:
                # Any other detector is driven through its run() method.
                detections = self.detect_model.run(img,
                                                   loader.frame,
                                                   vid_writer=out)
                if not bool(detections):
                    return self.null_values

                box, conf, cls = ct_boxes_filer(detections['results'],
                                                self.cls_out, self.conf_th)

            # Identify shake point
            # A jump of at least min_shake_point in the number of boxes
            # between consecutive frames flags camera shake; otherwise the
            # frame counts towards the stable streak.
            if loader.frame > 1:
                if abs(len(box) - self.prev_bboxes) >= self.min_shake_point:
                    self.shake_camera = True
                    self.cons_frames.clear()
                else:
                    self.cons_frames.append(True)

            # Enough consecutive stable frames clear the shake flag.
            if len(self.cons_frames) >= self.stable_point:
                self.shake_camera = False

            self.prev_bboxes = len(box)

        if len(box) == 0:
            return self.null_values

        # Group boxes and confidences by class id:
        # {class: [[boxes...], [confidences...]]}
        cls_out_dict = {}
        for i in range(len(box)):
            if cls[i] not in cls_out_dict:
                cls_out_dict[cls[i]] = [[box[i]], [conf[i]]]
            else:
                cls_out_dict[cls[i]][0].append(box[i])
                cls_out_dict[cls[i]][1].append(conf[i])

        # NOTE: from here on `cls` is rebound to a single class key and no
        # longer refers to the per-box class sequence used above.
        for cls in cls_out_dict:
            cls_boxes = cls_out_dict[cls][0]
            cls_conf = cls_out_dict[cls][1]

            if cls in self.tracker:
                # People and Vehicle Tracking
                # Class 0 uses the p_encoder, every other class in
                # self.tracker the v_encoder.
                if cls == 0:
                    features = self.p_encoder(raw_img, cls_boxes)
                else:
                    features = self.v_encoder(raw_img, cls_boxes)

                # Every detection gets a fixed confidence of 1.0.
                detections = [
                    Detection(bbox, 1.0, feature)
                    for bbox, feature in zip(cls_boxes, features)
                ]
                boxes = np.array([d.tlwh for d in detections])
                scores = np.array([d.confidence for d in detections])
                indices = preprocessing.non_max_suppression(boxes, 1.0, scores)
                detections = [detections[i] for i in indices]

                self.tracker[cls].predict()
                # The shake flag is passed on to the tracker update.
                self.tracker[cls].update(detections, self.shake_camera)

                for track in self.tracker[cls].tracks:
                    bbox = track.to_tlbr().astype(int)

                    # save tracked list
                    # Done before the confirmation check below, so
                    # unconfirmed and stale tracks are saved as well.
                    if self.save_probe:
                        save_probe_dir(video_id=os.path.basename(
                            loader.path).split('.')[0][1:],
                                       track_id=track.track_id,
                                       raw_img=raw_img,
                                       bbox=bbox)

                    if not track.is_confirmed() or track.time_since_update > 1:
                        continue

                    cv2.rectangle(origimg, (bbox[0], bbox[1]),
                                  (bbox[2], bbox[3]), self.colors[cls], 2)
                    cv2.putText(origimg, str(track.track_id),
                                (bbox[0], bbox[1]), 0, 5e-3 * 200, (0, 255, 0),
                                2)

                    # write coordinates
                    # bbox holds corners, so width/height are differences.
                    self.write_coordinates(loader=loader,
                                           x=bbox[0],
                                           y=bbox[1],
                                           w=bbox[2] - bbox[0],
                                           h=bbox[3] - bbox[1],
                                           cls=cls,
                                           track_id=track.track_id)
            else:
                # Other objects Tracking
                # This tracker is fed rows of [x1, y1, x2, y2, confidence].
                dets = []
                for i in range(len(cls_boxes)):
                    x, y, w, h = cls_boxes[i]
                    dets.append([x, y, x + w, y + h, cls_conf[i]])

                dets = np.asarray(dets)
                self.other_trackers[cls].update(dets)

                for track in self.other_trackers[cls].trackers:
                    bbox = np.array(track.get_state()[0]).astype(int)
                    # Skip tracks not updated within the last frame or with
                    # a hit streak shorter than 3.
                    if (track.time_since_update > 1) or \
                            (track.hit_streak < 3):
                        continue

                    cv2.rectangle(origimg, (bbox[0], bbox[1]),
                                  (bbox[2], bbox[3]), self.colors[cls], 2)
                    cv2.putText(origimg, str(int(track.id)),
                                (bbox[0], bbox[3]), 0, 5e-3 * 200, (0, 255, 0),
                                2)

                    # write coordinates
                    self.write_coordinates(loader=loader,
                                           x=bbox[0],
                                           y=bbox[1],
                                           w=bbox[2] - bbox[0],
                                           h=bbox[3] - bbox[1],
                                           cls=cls,
                                           track_id=int(track.id))