class MPT():
    def __init__(
        self,
        device=None,
        batch_size=12,
        display=False,
        detection_threshold=0.7,
        detector_type='yolo',
        yolo_img_size=608,
        output_format='list',
    ):
        '''
        Multi Person Tracker for streaming (frame-batch) input.

        :param device (str, 'cuda' or 'cpu'): torch device for model and inputs
        :param batch_size (int): batch size for detection model
        :param display (bool): display the results of multi person tracking
        :param detection_threshold (float): threshold to filter detector predictions
        :param detector_type (str, 'maskrcnn' or 'yolo'): detector architecture
        :param yolo_img_size (int): yolo detector input image size
        :param output_format (str, 'dict' or 'list'): result output format
        '''
        if device is not None:
            self.device = device
        else:
            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.batch_size = batch_size
        self.display = display
        self.detection_threshold = detection_threshold
        self.output_format = output_format

        if detector_type == 'maskrcnn':
            self.detector = keypointrcnn_resnet50_fpn(
                pretrained=True).to(self.device).eval()
        elif detector_type == 'yolo':
            self.detector = YOLOv3(
                device=self.device,
                img_size=yolo_img_size,
                person_detector=True,
                video=True,
                return_dict=True,
            )
        else:
            # Exception type kept for backward compatibility; a message is
            # added so the failure is diagnosable.
            raise ModuleNotFoundError(
                f'Undefined detector type: {detector_type}')

        # A single persistent SORT instance: track ids must stay consistent
        # across successive run_tracker() calls on a frame stream.
        self.tracker = Sort()

    @torch.no_grad()
    def run_tracker(self, frame):
        '''
        Run the detector and SORT tracker on a batch of frames.

        :param frame (torch.Tensor): input frames, already preprocessed for
            the detector (assumed NxCxHxW -- TODO confirm against caller).
        :return: list of ndarray: one Nx5 array [x1, y1, x2, y2, track_id]
            per frame.
        '''
        start = time.time()
        print('Running Multi-Person-Tracker')

        trackers = []
        frame = frame.to(self.device)
        predictions = self.detector(frame)

        for pred in predictions:
            bb = pred['boxes'].cpu().numpy()
            sc = pred['scores'].cpu().numpy()[..., None]
            dets = np.hstack([bb, sc])
            dets = dets[sc[:, 0] > self.detection_threshold]

            # If nothing was detected do not update the tracker.
            if dets.shape[0] > 0:
                track_bbs_ids = self.tracker.update(dets)
            else:
                track_bbs_ids = np.empty((0, 5))
            trackers.append(track_bbs_ids)

        runtime = time.time() - start
        # Guard against a zero measured runtime on coarse clocks, which
        # previously raised ZeroDivisionError.
        fps = 1 / runtime if runtime > 0 else float('inf')
        print(f'Finished. Detection + Tracking FPS {fps:.2f}')
        return trackers

    def prepare_output_tracks(self, trackers):
        '''
        Re-index per-frame tracklets by person id.

        :param trackers (list of ndarray): per-frame Nx5 tracklets
            [x1, y1, x2, y2, track_id].
        :return: dict mapping person_id -> {'bbox': ndarray of shape (T, 4)
            holding [c_x, c_y, w, h] square boxes, 'frames': ndarray of
            frame indices the person appears in}.
        '''
        people = dict()

        for frame_idx, tracks in enumerate(trackers):
            for d in tracks:
                person_id = int(d[4])
                w, h = d[2] - d[0], d[3] - d[1]
                c_x, c_y = d[0] + w / 2, d[1] + h / 2
                # Square box: both sides become max(w, h).
                w = h = np.where(w / h > 1, w, h)
                bbox = np.array([c_x, c_y, w, h])

                entry = people.setdefault(person_id,
                                          {'bbox': [], 'frames': []})
                entry['bbox'].append(bbox)
                entry['frames'].append(frame_idx)

        for person in people.values():
            person['bbox'] = np.array(person['bbox']).reshape(
                (len(person['bbox']), 4))
            person['frames'] = np.array(person['frames'])

        return people

    def __call__(self, frame, output_file=None):
        '''
        Execute MPT on a batch of frames and return the results.

        :param frame (torch.Tensor): input frames for the detector.
        :param output_file (str or None): optional path for the rendered
            output when display is enabled.
        :return: tracklet list (output_format='list') or a dict of person
            instances (output_format='dict').
        :raises ValueError: for an unrecognized output_format (previously
            this path crashed with UnboundLocalError).
        '''
        trackers = self.run_tracker(frame)

        if self.display:
            self.display_results(frame, trackers, output_file)

        if self.output_format == 'dict':
            return self.prepare_output_tracks(trackers)
        elif self.output_format == 'list':
            return trackers
        raise ValueError(f'Unknown output format: {self.output_format}')
class MPT():
    def __init__(
        self,
        device=None,
        batch_size=12,
        display=False,
        detection_threshold=0.7,
        detector_type='yolo',
        yolo_img_size=608,
        output_format='list',
    ):
        '''
        Multi Person Tracker operating on a folder of video frames.

        :param device (str, 'cuda' or 'cpu'): torch device for model and inputs
        :param batch_size (int): batch size for detection model
        :param display (bool): display the results of multi person tracking
        :param detection_threshold (float): threshold to filter detector predictions
        :param detector_type (str, 'maskrcnn' or 'yolo'): detector architecture
        :param yolo_img_size (int): yolo detector input image size
        :param output_format (str, 'dict' or 'list'): result output format
        '''
        if device is not None:
            self.device = device
        else:
            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.batch_size = batch_size
        self.display = display
        self.detection_threshold = detection_threshold
        self.output_format = output_format

        if detector_type == 'maskrcnn':
            self.detector = keypointrcnn_resnet50_fpn(
                pretrained=True).to(self.device).eval()
        elif detector_type == 'yolo':
            self.detector = YOLOv3(
                device=self.device,
                img_size=yolo_img_size,
                person_detector=True,
                video=True,
                return_dict=True,
            )
        else:
            # Exception type kept for backward compatibility; a message is
            # added so the failure is diagnosable.
            raise ModuleNotFoundError(
                f'Undefined detector type: {detector_type}')

        self.tracker = Sort()

    @torch.no_grad()
    def run_tracker(self, dataloader):
        '''
        Run the detector and SORT tracker over an entire video.

        :param dataloader (torch.utils.data.DataLoader): yields batches of
            preprocessed frames for the detector.
        :return: list of ndarray: one Nx5 array [x1, y1, x2, y2, track_id]
            per frame.
        '''
        # A fresh SORT instance per video so track ids restart for each run.
        self.tracker = Sort()

        trackers = []
        for batch in dataloader:
            batch = batch.to(self.device)
            predictions = self.detector(batch)

            for pred in predictions:
                bb = pred['boxes'].cpu().numpy()
                sc = pred['scores'].cpu().numpy()[..., None]
                dets = np.hstack([bb, sc])
                dets = dets[sc[:, 0] > self.detection_threshold]

                # If nothing was detected do not update the tracker.
                if dets.shape[0] > 0:
                    track_bbs_ids = self.tracker.update(dets)
                else:
                    track_bbs_ids = np.empty((0, 5))
                trackers.append(track_bbs_ids)

        return trackers

    def prepare_output_tracks(self, trackers):
        '''
        Re-index per-frame tracklets by person id.

        :param trackers (list of ndarray): per-frame Nx5 tracklets
            [x1, y1, x2, y2, track_id].
        :return: dict mapping person_id -> {'bbox': ndarray of shape (T, 4)
            holding [c_x, c_y, w, h] square boxes, 'frames': ndarray of
            frame indices the person appears in}.
        '''
        people = dict()

        for frame_idx, tracks in enumerate(trackers):
            for d in tracks:
                person_id = int(d[4])
                w, h = d[2] - d[0], d[3] - d[1]
                c_x, c_y = d[0] + w / 2, d[1] + h / 2
                # Square box: both sides become max(w, h).
                w = h = np.where(w / h > 1, w, h)
                bbox = np.array([c_x, c_y, w, h])

                entry = people.setdefault(person_id,
                                          {'bbox': [], 'frames': []})
                entry['bbox'].append(bbox)
                entry['frames'].append(frame_idx)

        for person in people.values():
            person['bbox'] = np.array(person['bbox']).reshape(
                (len(person['bbox']), 4))
            person['frames'] = np.array(person['frames'])

        return people

    def display_results(self, image_folder, trackers, output_file=None):
        '''
        Draw tracked boxes over every frame, show them in a window, and
        optionally render the annotated frames into a video file.

        :param image_folder (str): folder holding the input .png/.jpg frames.
        :param trackers (list of ndarray): per-frame Nx5 tracklets
            [x1, y1, x2, y2, track_id].
        :param output_file (str or None): if given, save the rendered video here.
        :return: None
        '''
        save = bool(output_file)
        tmp_write_folder = osp.join(
            '/tmp', f'{osp.basename(image_folder)}_mpt_results')
        os.makedirs(tmp_write_folder, exist_ok=True)

        colours = np.random.rand(32, 3)
        image_file_names = sorted([
            osp.join(image_folder, x) for x in os.listdir(image_folder)
            if x.endswith('.png') or x.endswith('.jpg')
        ])

        for idx, (img_fname,
                  tracker) in enumerate(zip(image_file_names, trackers)):
            img = cv2.imread(img_fname)
            for d in tracker:
                d = d.astype(np.int32)
                # Stable per-id colour drawn from a fixed 32-colour palette.
                c = (colours[d[4] % 32, :] * 255).astype(np.uint8).tolist()
                cv2.rectangle(img, (d[0], d[1]), (d[2], d[3]),
                              color=c,
                              thickness=int(round(img.shape[0] / 256)))
                # Id is drawn twice (dark shadow + light text) for legibility.
                cv2.putText(img, f'{d[4]}', (d[0] - 9, d[1] - 9),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))
                cv2.putText(img, f'{d[4]}', (d[0] - 8, d[1] - 8),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255))

            cv2.imshow('result video', img)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

            if save:
                cv2.imwrite(osp.join(tmp_write_folder, f'{idx:06d}.png'), img)

        cv2.destroyAllWindows()

        if save:
            images_to_video(img_folder=tmp_write_folder,
                            output_vid_file=output_file)
            shutil.rmtree(tmp_write_folder)

    def __call__(self, image_folder, output_file=None):
        '''
        Execute MPT on a folder of frames and return the results.

        :param image_folder (str): folder with the input video frames.
        :param output_file (str or None): optional path for the rendered
            output when display is enabled.
        :return: tracklet list (output_format='list') or a dict of person
            instances (output_format='dict').
        :raises ValueError: for an unrecognized output_format (previously
            this path crashed with UnboundLocalError).
        '''
        image_dataset = ImageFolder(image_folder)
        dataloader = DataLoader(image_dataset,
                                batch_size=self.batch_size,
                                num_workers=8)

        trackers = self.run_tracker(dataloader)

        if self.display:
            self.display_results(image_folder, trackers, output_file)

        if self.output_format == 'dict':
            return self.prepare_output_tracks(trackers)
        elif self.output_format == 'list':
            return trackers
        raise ValueError(f'Unknown output format: {self.output_format}')
class VideoTracker:
    def __init__(self, device='cpu', img_size=608, thresh=0.7):
        '''
        Lightweight YOLOv3 + SORT person tracker for in-memory frame tensors.

        :param device (str): torch device for the detector and inputs.
        :param thresh (float): detection score threshold.
        :param img_size (int): yolo detector input image size.
        '''
        self.detector = YOLOv3(device=device,
                               img_size=img_size,
                               person_detector=True,
                               video=True,
                               return_dict=True)
        self.tracker = Sort()
        # NOTE: the misspelled attribute name 'tresh' is kept on purpose for
        # backward compatibility with any external code that reads it.
        self.tresh = thresh
        self.device = device

    def track(self, frames: torch.Tensor):
        """
        Detect and track people across a clip of frames.

        frames: torch.Tensor with dimensions NxCxHxW
        output: dict, ordered:
            {
                person_id: {
                    'bbox': np.array [[x0, x1, y0, y1], ...] - bounding box
                        for each frame that the person appeared in.
                    'frames': np.array [i1, i2, ...] - the frame indices this
                        person appeared in.
                }
                ...
            }
        """
        with torch.no_grad():
            frames = frames.to(self.device)
            detector_res = self.detector(frames)

            tracks = []
            for res in detector_res:
                bbox = res['boxes'].cpu().numpy()
                scores = res['scores'].cpu().numpy()[..., None]
                detections = np.hstack((bbox, scores))
                detections = detections[scores[:, 0] > self.tresh]

                # If nothing was detected, do not update the tracker.
                if detections.shape[0] > 0:
                    detections = self.tracker.update(detections)
                else:
                    detections = np.empty((0, 5))
                tracks.append(detections)

        return self.tracks2dict(tracks)

    @staticmethod
    def tracks2dict(trackers):
        """
        Group per-frame tracklets by person id.

        :param trackers: list of Nx5 ndarrays [x1, y1, x2, y2, track_id]
            as produced by SORT.
        :return: dict person_id -> {'bbox': ndarray (T, 4),
            'frames': ndarray of frame indices}.
        """
        people = {}
        for frame_idx, tracks in enumerate(trackers):
            for d in tracks:
                # NOTE(review): SORT emits [x1, y1, x2, y2, id], but the
                # original code labelled the columns y0, x0, y1, x1 and
                # stored [d[1], d[3], d[0], d[2]]. That ordering is preserved
                # byte-for-byte here -- confirm against downstream consumers.
                y0, x0, y1, x1, person_id = d
                person_id = int(person_id)
                bbox = np.array([x0, x1, y0, y1])

                entry = people.setdefault(person_id,
                                          {'bbox': [], 'frames': []})
                entry['bbox'].append(bbox)
                entry['frames'].append(frame_idx)

        for person in people.values():
            person['bbox'] = np.array(person['bbox']).reshape(
                (len(person['bbox']), 4))
            person['frames'] = np.array(person['frames'])

        return people