def __init__(self, model_path):
    # Detection confidence threshold: detections scoring below it are ignored.
    # Bboxes with confidence below self.min_confidence are filtered out when
    # building the detections list.
    self.min_confidence = 0.25
    # Non-maximum suppression threshold (original value 1.0).
    # With self.nms_max_overlap = 1.0, NMS keeps all detections.
    self.nms_max_overlap = 1.0
    self.extractor = Extractor(model_path, use_cuda=True)
    # Cosine distance gating threshold (0.2); tuning it can reduce ID switches.
    max_cosine_distance = 0.2
    # Size cap of the per-track feature gallery: each track keeps a list of
    # features from its past appearances, and nn_budget bounds that list.
    # For example, with a budget of 10 only the features from the track's
    # last 10 appearances are stored.
    nn_budget = 100
    metric = NearestNeighborDistanceMetric("cosine", max_cosine_distance, nn_budget)
    self.tracker = Tracker(metric)
def __init__(self, model_path):
    self.min_confidence = 0.3
    self.nms_max_overlap = 1.0
    self.extractor = Extractor(model_path, use_cuda=True)
    max_cosine_distance = 0.2
    nn_budget = 100
    n_init = 0
    max_age = 30
    metric = NearestNeighborDistanceMetric("cosine", max_cosine_distance, nn_budget)
    self.tracker = Tracker(metric, max_age=max_age, n_init=n_init)
def __init__(self, model_path='yolov3/of_model/yolov3_model_python/', gpu_ids='0',
             model_name='resid', confidence_l=0.2, confidence_h=0.4,
             max_cosine_distance=0.2, max_iou_distance=0.7, save_feature=False,
             use_filter=False, init_extractor=True, max_age=30,
             std_Q_w=1e-1, std_Q_wv=1e-3, std_R_w=5e-2, cls_=0):
    self.confidence_l = confidence_l
    self.confidence_h = confidence_h
    self.iou_thresh_l = 0.24
    self.iou_thresh = 0.5
    self.nms_max_overlap = 1.0
    self.extractor = None
    self.height, self.width = None, None
    if init_extractor:
        self.extractor = Extractor(model_name=model_name, load_path=model_path,
                                   gpu_ids=gpu_ids, cls=cls_)
    max_iou = max_iou_distance
    nn_budget = 100
    metric = NearestNeighborDistanceMetric("cosine", max_cosine_distance, nn_budget)
    self.tracker = Tracker(metric, max_iou_distance=max_iou, max_age=max_age,
                           std_Q_w=std_Q_w, std_Q_wv=std_Q_wv, std_R_w=std_R_w)
    self.all_feature = None
    self.save_feature = save_feature
    self.count = 1
    self.result = []
    self.use_filter = use_filter
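# The constructors above only wire up the appearance metric and the Tracker.
# Below is a minimal, hedged sketch of how such a wrapper's per-frame update()
# step typically uses those fields, modelled on the inline pipeline in
# recognize_from_video() further down. It is an illustrative assumption, not
# any of these projects' actual implementations; the helpers _get_features,
# _xywh_to_tlwh and _tlwh_to_xyxy are hypothetical, and it assumes the same
# numpy / Detection / non_max_suppression imports used elsewhere in this file.
def update(self, bbox_xywh, confidences, ori_img):
    self.height, self.width = ori_img.shape[:2]

    # appearance features for each candidate box (hypothetical helper)
    features = self._get_features(bbox_xywh, ori_img)
    bbox_tlwh = self._xywh_to_tlwh(bbox_xywh)

    # keep only detections above the confidence threshold
    detections = [
        Detection(bbox_tlwh[i], conf, features[i])
        for i, conf in enumerate(confidences) if conf > self.min_confidence
    ]

    # optional non-maximum suppression (a no-op when nms_max_overlap == 1.0)
    boxes = np.array([d.tlwh for d in detections])
    scores = np.array([d.confidence for d in detections])
    indices = non_max_suppression(boxes, self.nms_max_overlap, scores)
    detections = [detections[i] for i in indices]

    # Kalman prediction followed by data association / measurement update
    self.tracker.predict()
    self.tracker.update(detections)

    # collect confirmed, recently updated tracks as (x1, y1, x2, y2, track_id)
    outputs = []
    for track in self.tracker.tracks:
        if not track.is_confirmed() or track.time_since_update > 1:
            continue
        x1, y1, x2, y2 = self._tlwh_to_xyxy(track.to_tlwh())
        outputs.append(np.array([x1, y1, x2, y2, track.track_id], dtype=int))
    return np.stack(outputs, axis=0) if outputs else np.empty((0, 5), dtype=int)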
def recognize_from_video():
    results = []
    idx_frame = 0

    # net initialize
    detector = init_detector(args.env_id)
    extractor = ailia.Net(EX_MODEL_PATH, EX_WEIGHT_PATH, env_id=args.env_id)

    # tracker class instance
    metric = NearestNeighborDistanceMetric(
        "cosine", MAX_COSINE_DISTANCE, NN_BUDGET
    )
    tracker = Tracker(
        metric,
        max_iou_distance=0.7,
        max_age=70,
        n_init=3
    )

    capture = webcamera_utils.get_capture(args.video)

    # create video writer
    if args.savepath is not None:
        writer = webcamera_utils.get_writer(
            args.savepath,
            int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)),
            int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)),
        )
    else:
        writer = None

    print('Start Inference...')
    while True:
        idx_frame += 1
        ret, frame = capture.read()
        if (cv2.waitKey(1) & 0xFF == ord('q')) or not ret:
            break

        # In order to use ailia.Detector, the input should have 4 channels.
        input_img = cv2.cvtColor(frame, cv2.COLOR_BGR2BGRA)
        h, w = frame.shape[0], frame.shape[1]

        # do detection
        detector.compute(input_img, THRESHOLD, IOU)
        bbox_xywh, cls_conf, cls_ids = get_detector_result(detector, h, w)

        # select person class
        mask = cls_ids == 0
        bbox_xywh = bbox_xywh[mask]

        # bbox dilation just in case bbox too small,
        # delete this line if using a better pedestrian detector
        bbox_xywh[:, 3:] *= 1.2
        cls_conf = cls_conf[mask]

        # do tracking
        img_crops = []
        for box in bbox_xywh:
            x1, y1, x2, y2 = xywh_to_xyxy(box, h, w)
            img_crops.append(frame[y1:y2, x1:x2])

        if img_crops:
            # preprocess
            img_batch = np.concatenate([
                normalize_image(resize(img), 'ImageNet')[np.newaxis, :, :, :]
                for img in img_crops
            ], axis=0).transpose(0, 3, 1, 2)

            # TODO better to pass a batch at once
            # features = extractor.predict(img_batch)
            features = []
            for img in img_batch:
                features.append(extractor.predict(img[np.newaxis, :, :, :])[0])
            features = np.array(features)
        else:
            features = np.array([])

        bbox_tlwh = xywh_to_tlwh(bbox_xywh)
        detections = [
            Detection(bbox_tlwh[i], conf, features[i])
            for i, conf in enumerate(cls_conf) if conf > MIN_CONFIDENCE
        ]

        # run non-maximum suppression
        boxes = np.array([d.tlwh for d in detections])
        scores = np.array([d.confidence for d in detections])
        nms_max_overlap = 1.0
        indices = non_max_suppression(boxes, nms_max_overlap, scores)
        detections = [detections[i] for i in indices]

        # update tracker
        tracker.predict()
        tracker.update(detections)

        # update bbox identities
        outputs = []
        for track in tracker.tracks:
            if not track.is_confirmed() or track.time_since_update > 1:
                continue
            box = track.to_tlwh()
            x1, y1, x2, y2 = tlwh_to_xyxy(box, h, w)
            track_id = track.track_id
            # dtype=int (np.int was removed in NumPy 1.24)
            outputs.append(np.array([x1, y1, x2, y2, track_id], dtype=int))
        if len(outputs) > 0:
            outputs = np.stack(outputs, axis=0)

        # draw box for visualization
        if len(outputs) > 0:
            bbox_tlwh = []
            bbox_xyxy = outputs[:, :4]
            identities = outputs[:, -1]
            frame = draw_boxes(frame, bbox_xyxy, identities)
            for bb_xyxy in bbox_xyxy:
                bbox_tlwh.append(xyxy_to_tlwh(bb_xyxy))
            results.append((idx_frame - 1, bbox_tlwh, identities))

        cv2.imshow('frame', frame)
        if writer is not None:
            writer.write(frame)

    if args.savepath is not None:
        write_results(args.savepath.split('.')[0] + '.txt', results, 'mot')
    else:
        write_results('result.txt', results, 'mot')

    capture.release()
    cv2.destroyAllWindows()
    if writer is not None:
        writer.release()
    print(f'Save results to {args.savepath}')
    print('Script finished successfully.')
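# recognize_from_video() above hands its per-frame results to write_results()
# with the 'mot' data type. A minimal sketch of such a writer is shown below,
# assuming the standard MOT Challenge text format
# (frame, id, x, y, w, h, conf, -1, -1, -1) and the (frame_idx, tlwh_list,
# identities) tuples collected in the loop. This is an illustrative
# assumption, not the sample's actual implementation.
def write_results(filename, results, data_type='mot'):
    if data_type != 'mot':
        raise ValueError(f'unsupported data type: {data_type}')
    with open(filename, 'w') as f:
        for frame_id, tlwhs, track_ids in results:
            for tlwh, track_id in zip(tlwhs, track_ids):
                x, y, w, h = tlwh
                # MOT expects 1-based frame indices; confidence is fixed to 1
                f.write(f'{frame_id + 1},{track_id},{x},{y},{w},{h},1,-1,-1,-1\n')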
def recognize_from_video():
    try:
        print('[INFO] Webcam mode is activated')
        RECORD_TIME = 80
        capture = cv2.VideoCapture(int(args.video))
        if not capture.isOpened():
            print("[ERROR] webcamera not found")
            sys.exit(1)
    except ValueError:
        if check_file_existance(args.video):
            capture = cv2.VideoCapture(args.video)

    frame_rate = capture.get(cv2.CAP_PROP_FPS)
    if FRAME_SKIP:
        action_recognize_fps = int(args.fps)
    else:
        action_recognize_fps = frame_rate

    if args.savepath != "":
        size = (int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)),
                int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)))
        fmt = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
        writer = cv2.VideoWriter(args.savepath, fmt, action_recognize_fps, size)
    else:
        writer = None

    # pose estimation
    env_id = ailia.get_gpu_environment_id()
    print(f'env_id: {env_id}')
    if args.arch == "lw_human_pose":
        pose = ailia.PoseEstimator(
            MODEL_PATH, WEIGHT_PATH, env_id=env_id, algorithm=ALGORITHM
        )
        detector = None
    else:
        detector = ailia.Detector(
            DETECTOR_MODEL_PATH,
            DETECTOR_WEIGHT_PATH,
            len(COCO_CATEGORY),
            format=ailia.NETWORK_IMAGE_FORMAT_RGB,
            channel=ailia.NETWORK_IMAGE_CHANNEL_FIRST,
            range=ailia.NETWORK_IMAGE_RANGE_U_FP32,
            algorithm=ailia.DETECTOR_ALGORITHM_YOLOV3,
            env_id=env_id
        )
        pose = ailia.Net(POSE_MODEL_PATH, POSE_WEIGHT_PATH, env_id=env_id)

    # tracker class instance
    extractor = ailia.Net(EX_MODEL_PATH, EX_WEIGHT_PATH, env_id=env_id)
    metric = NearestNeighborDistanceMetric("cosine", MAX_COSINE_DISTANCE, NN_BUDGET)
    tracker = Tracker(metric, max_iou_distance=0.7, max_age=70, n_init=3)

    # action recognition
    env_id = ailia.get_gpu_environment_id()
    print(f'env_id: {env_id}')
    model = ailia.Net(ACTION_MODEL_PATH, ACTION_WEIGHT_PATH, env_id=env_id)

    action_data = {}

    frame_nb = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
    idx_frame = 0

    time_start = time.time()
    while True:
        time_curr = time.time()
        if args.video == '0' and time_curr - time_start > RECORD_TIME:
            break

        ret, frame = capture.read()

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
        if (not ret) or (frame_nb >= 1 and idx_frame >= frame_nb):
            break

        if FRAME_SKIP:
            mod = round(frame_rate / action_recognize_fps)
            if mod >= 1:
                if idx_frame % mod != 0:
                    idx_frame = idx_frame + 1
                    continue

        input_image, input_data = adjust_frame_size(
            frame, frame.shape[0], frame.shape[1],
        )
        input_data = cv2.cvtColor(input_data, cv2.COLOR_BGR2BGRA)

        # inference
        if args.arch == "lw_human_pose":
            _ = pose.compute(input_data)
        else:
            detector.compute(input_data, THRESHOLD, IOU)

        # deepsort format
        h, w = input_image.shape[0], input_image.shape[1]
        if args.arch == "lw_human_pose":
            bbox_xywh, cls_conf, cls_ids = get_detector_result_lw_human_pose(
                pose, h, w)
        else:
            bbox_xywh, cls_conf, cls_ids = get_detector_result(detector, h, w)

        mask = cls_ids == 0
        bbox_xywh = bbox_xywh[mask]

        # bbox dilation just in case bbox too small,
        # delete this line if using a better pedestrian detector
        if args.arch == "pose_resnet":
            bbox_xywh[:, 3:] *= 1.2  # may need to be removed in the future
        cls_conf = cls_conf[mask]

        # do tracking
        img_crops = []
        for box in bbox_xywh:
            x1, y1, x2, y2 = xywh_to_xyxy(box, h, w)
            img_crops.append(input_image[y1:y2, x1:x2])

        if img_crops:
            # preprocess
            img_batch = np.concatenate([
                normalize_image(resize(img), 'ImageNet')[np.newaxis, :, :, :]
                for img in img_crops
            ], axis=0).transpose(0, 3, 1, 2)

            # TODO better to pass a batch at once
            # features = extractor.predict(img_batch)
            features = []
            for img in img_batch:
                features.append(extractor.predict(img[np.newaxis, :, :, :])[0])
            features = np.array(features)
        else:
            features = np.array([])

        bbox_tlwh = xywh_to_tlwh(bbox_xywh)
        detections = [
            Detection(bbox_tlwh[i], conf, features[i])
            for i, conf in enumerate(cls_conf) if conf > MIN_CONFIDENCE
        ]

        # run non-maximum suppression
        boxes = np.array([d.tlwh for d in detections])
        scores = np.array([d.confidence for d in detections])
        nms_max_overlap = 1.0
        indices = non_max_suppression(boxes, nms_max_overlap, scores)
        detections = [detections[i] for i in indices]

        # update tracker
        tracker.predict()
        tracker.update(detections)

        # update bbox identities
        outputs = []
        for track in tracker.tracks:
            if not track.is_confirmed() or track.time_since_update > 1:
                continue
            box = track.to_tlwh()
            x1, y1, x2, y2 = tlwh_to_xyxy(box, h, w)
            track_id = track.track_id
            # dtype=int (np.int was removed in NumPy 1.24)
            outputs.append(np.array([x1, y1, x2, y2, track_id], dtype=int))
        if len(outputs) > 0:
            outputs = np.stack(outputs, axis=0)

        # action detection
        actions = []
        persons = []
        if len(outputs) > 0:
            bbox_xyxy = outputs[:, :4]
            identities = outputs[:, -1]
            for i, box in enumerate(bbox_xyxy):
                id = identities[i]
                if not (id in action_data):
                    action_data[id] = np.zeros(
                        (ailia.POSE_KEYPOINT_CNT - 1, TIME_RANGE, 3))
                # action recognition
                action, person = action_recognition(box, input_image, pose,
                                                    detector, model,
                                                    action_data[id])
                actions.append(action)
                persons.append(person)

        # draw box for visualization
        if len(outputs) > 0:
            bbox_tlwh = []
            bbox_xyxy = outputs[:, :4]
            identities = outputs[:, -1]
            frame = draw_boxes(input_image, bbox_xyxy, identities, actions,
                               action_data, (0, 0))
            for bb_xyxy in bbox_xyxy:
                bbox_tlwh.append(xyxy_to_tlwh(bb_xyxy))

        # draw skeleton
        for person in persons:
            if person is not None:
                display_result(input_image, person)

        if writer is not None:
            writer.write(input_image)

        # show progress
        if idx_frame == 0:
            print()
        print("\r" + str(idx_frame + 1) + " / " + str(frame_nb), end="")
        if idx_frame == frame_nb - 1:
            print()
        cv2.imshow('frame', input_image)
        idx_frame = idx_frame + 1

    if writer is not None:
        writer.release()
    capture.release()
    cv2.destroyAllWindows()
    print('Script finished successfully.')
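# Both pipelines above visualise tracks with draw_boxes(); the pose/action
# variant additionally passes actions and the per-ID keypoint buffers. Below
# is a minimal, hedged sketch of the basic behaviour, assuming an OpenCV BGR
# image and xyxy boxes. The optional actions argument, the trailing catch-all
# parameters, and the colour scheme are illustrative assumptions, not the
# samples' actual implementation.
import cv2


def draw_boxes(img, bbox_xyxy, identities, actions=None, *_):
    for i, (x1, y1, x2, y2) in enumerate(bbox_xyxy):
        track_id = int(identities[i])
        # derive a stable pseudo-random colour from the track id
        color = ((37 * track_id) % 255, (17 * track_id) % 255,
                 (29 * track_id) % 255)
        label = f'id:{track_id}'
        if actions is not None and i < len(actions):
            label += f' {actions[i]}'
        cv2.rectangle(img, (int(x1), int(y1)), (int(x2), int(y2)), color, 2)
        cv2.putText(img, label, (int(x1), int(y1) - 4),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)
    return img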