import json
import logging
import sys
import time

import cv2


def infer_on_stream(args, client):
    """
    Initialize the inference network, stream video to network, and output stats
    and video.

    :param args: Command line arguments parsed by `build_argparser()`
    :param client: MQTT client
    :return: None
    """
    # Initialise the class
    infer_network = Network(args.model, args.device)
    # Set probability threshold for detections
    prob_threshold = args.prob_threshold

    ### Load the model through `infer_network` ###
    infer_network.load_model()

    ### Handle the input stream ###
    if args.webcam is not None:
        vc = cv2.VideoCapture(args.webcam)
    else:
        # Handle a video or image passed with -i,
        # e.g. -i resources/image_0100.jpeg or a .mp4 file.
        vc = cv2.VideoCapture(args.input)
    if not vc.isOpened():
        logging.error(f"Error opening input file (video or image {args.input})")
        exit(1)

    got_frame, frame = vc.read()

    person_in_frame = False
    miss_count = 0  # consecutive frames without a detection (debounce)
    real_count = 0
    total_count = 0
    input_shape = infer_network.get_input_shape()

    while got_frame:
        image = preprocess_image(frame, input_shape[3], input_shape[2])
        infer_request_handle = infer_network.async_exec_net(image)
        detections = infer_network.wait(infer_request_handle)
        detections = infer_network.get_output(detections)
        current_count = detections['num_detections']

        if current_count == 0:
            # No detection this frame: only declare the person gone after
            # more than 20 consecutive misses, so short detector dropouts
            # do not end the visit early.
            if person_in_frame:
                miss_count += 1
                if miss_count > 20:
                    real_count -= 1
                    miss_count = 0
                    duration = int(time.time() - start_time)
                    client.publish("person/duration",
                                   json.dumps({"duration": duration}))
                    person_in_frame = False
        else:
            miss_count = 0
            if real_count == 0:
                real_count += 1
                total_count += 1
                start_time = time.time()
                person_in_frame = True
                client.publish("person", json.dumps({"total": total_count}))

        client.publish("person", json.dumps({"count": real_count}))

        ### Draw bounding boxes to provide intuition ###
        img = draw_bboxes(frame, detections)
        cv2.putText(img, f'current: {real_count} total: {total_count}',
                    (0, 100), cv2.FONT_HERSHEY_SIMPLEX, .5,
                    (255, 255, 255), 2, cv2.LINE_AA)

        ### Send the frame to the FFMPEG server ###
        sys.stdout.buffer.write(img)
        sys.stdout.flush()

        ### Write an output image if `single_image_mode` ###
        if vc.get(cv2.CAP_PROP_FRAME_COUNT) == 1.0:
            cv2.imwrite('detected.png', img)

        ### Read from the video capture ###
        got_frame, frame = vc.read()

    vc.release()
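# ---------------------------------------------------------------------------
# Sketch: the `preprocess_image` helper called above is not defined in this
# file. A minimal version matching the call signature
# `preprocess_image(frame, input_shape[3], input_shape[2])` might look like
# the following (hypothetical; the repo's own helper may also normalize
# pixel values or preserve the aspect ratio):
# ---------------------------------------------------------------------------
import numpy as np


def preprocess_image(frame, width, height):
    """Resize a BGR frame to the network input size and reorder to NCHW."""
    image = cv2.resize(frame, (width, height))  # (H, W, C) as OpenCV reads it
    image = image.transpose((2, 0, 1))          # HWC -> CHW
    return image[np.newaxis, ...]               # add the batch dimension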
import json
import mimetypes
import sys
import time
from pathlib import Path

import cv2


def infer_on_stream(args, client):
    """
    Initialize the inference network, stream video to network, and output stats
    and video.

    :param args: Command line arguments parsed by `build_argparser()`
    :param client: MQTT client
    :return: None
    """
    # Initialise the class
    infer_network = Network()
    # Set probability threshold for detections
    prob_threshold = args.prob_threshold
    model_path = args.model
    input_path = args.input
    device = args.device

    ### Load the model through `infer_network` ###
    input_dict = infer_network.load_model(model_path, device)

    ### Handle the input stream ###
    input_type = None
    if input_path == "cam":
        input_type = "cam"
    elif isinstance(input_path, str) and Path(input_path).is_file():
        mimetype = mimetypes.guess_type(input_path)[0]
        if mimetype:
            mimetype = mimetype.split('/')[0]
            if mimetype == 'video':
                input_type = "video"
            elif mimetype == "image":
                input_type = "image"
    if input_type not in ["cam", "video", "image"]:
        raise Exception("Invalid input parameter")

    # Open the capture: device index 0 for the webcam, the file path otherwise.
    cap = cv2.VideoCapture(0 if input_type == "cam" else input_path)
    w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    request_id = 0

    # Counters
    people_total_counter = 0
    people_last_counter = 0
    frames_counter = 0
    # This value can be adjusted according to the model accuracy.
    frames_interval_baseline = 50
    start_time = 0
    # Labels map
    labels_map = COCO_LABELS

    ### Loop until stream is over ###
    while cap.isOpened():
        ### Read from the video capture ###
        flag, frame = cap.read()
        if not flag:
            break

        ### Pre-process the image as needed ###
        net_input = infer_network.process_input(frame)
        net_input_dict = {'image_tensor': net_input}
        if "image_info" in input_dict:
            net_input_dict['image_info'] = net_input.shape[1:]

        ### Start asynchronous inference for the specified request ###
        infer_network.async_exec_net(net_input_dict, request_id)

        ### Wait for the result ###
        if infer_network.wait() == 0:
            ### Get the results of the inference request ###
            output_dict = infer_network.get_output()

            ### Extract any desired stats from the results ###
            predictions = infer_network.process_output(
                output_dict, target_size=(h, w),
                boxes_threshold=prob_threshold)
            people_curr_count = 0
            for box in predictions:
                label = labels_map[box.label_id]
                # Count the number of people in the frame.
                if label == "person":
                    people_curr_count += 1
                    box.draw(frame)

            ### Calculate and send relevant information on ###
            ### current_count, total_count and duration to the MQTT server ###
            ### Topic "person": keys of "count" and "total" ###
            ### Topic "person/duration": key of "duration" ###
            client.publish("person", json.dumps({"count": people_curr_count}))
            frames_counter += 1
            # Compare counts every `frames_interval_baseline` frames so that
            # brief detector flicker does not register as an entry or exit.
            if frames_counter == frames_interval_baseline:
                # A new person entered the scene.
                if people_curr_count > people_last_counter:
                    start_time = time.time()
                    # Increase the total people counter and publish the new
                    # value to the GUI.
                    people_total_counter += people_curr_count - people_last_counter
                    client.publish("person",
                                   json.dumps({"total": people_total_counter}))
                # A person left: publish how long they were in the video.
                if people_curr_count < people_last_counter:
                    time_delta = int(time.time() - start_time)
                    client.publish("person/duration",
                                   json.dumps({"duration": time_delta}))
                # Update the last count and reset the frame counter.
                people_last_counter = people_curr_count
                frames_counter = 0

        ### Send the frame to the FFMPEG server ###
        sys.stdout.buffer.write(frame)
        sys.stdout.flush()
        cv2.imshow("Frame", frame)
        key = cv2.waitKey(1) & 0xFF
        if key == ord("q"):
            break

        ### Write an output image if `single_image_mode` ###
        if input_type == "image":
            cv2.imwrite("out.jpg", frame)

    cap.release()
    cv2.destroyAllWindows()
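# ---------------------------------------------------------------------------
# Sketch: the `Network` wrapper used in the variant above (load_model /
# process_input / async_exec_net / wait / get_output) is defined elsewhere.
# A minimal version, assuming the legacy openvino.inference_engine API
# (2021.x) and a single 'image_tensor' input, might look like this;
# `process_output` and the COCO_LABELS map are omitted:
# ---------------------------------------------------------------------------
from openvino.inference_engine import IECore


class Network:
    def load_model(self, model_xml, device):
        self.ie = IECore()
        self.net = self.ie.read_network(
            model=model_xml, weights=model_xml.replace('.xml', '.bin'))
        self.exec_net = self.ie.load_network(
            network=self.net, device_name=device, num_requests=1)
        return self.net.input_info  # dict: input name -> InputInfoPtr

    def process_input(self, frame):
        # Resize to the 'image_tensor' shape and reorder to NCHW.
        n, c, h, w = self.net.input_info['image_tensor'].input_data.shape
        image = cv2.resize(frame, (w, h)).transpose((2, 0, 1))
        return image.reshape((n, c, h, w))

    def async_exec_net(self, inputs, request_id=0):
        self.exec_net.start_async(request_id=request_id, inputs=inputs)

    def wait(self, request_id=0):
        # Returns 0 (OK) once the request has finished.
        return self.exec_net.requests[request_id].wait(-1)

    def get_output(self, request_id=0):
        blobs = self.exec_net.requests[request_id].output_blobs
        return {name: blob.buffer for name, blob in blobs.items()}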
import json
import logging
import sys

import cv2
import numpy as np


def infer_on_stream(args, client):
    """
    Initialize the inference network, stream video to network, and output stats
    and video.

    :param args: Command line arguments parsed by `build_argparser()`
    :param client: MQTT client
    :return: None
    """
    # Initialise the class (strip the '.xml' extension from the model path).
    infer_network = Network(args.model[:-4], args.device)
    # Set probability threshold for detections
    prob_threshold = args.prob_threshold

    ### Load the model through `infer_network` ###
    infer_network.load_model()

    ### Handle the input stream ###
    vc = cv2.VideoCapture(args.input)
    if not vc.isOpened():
        logging.error(
            f"Error opening input file (video or image {args.input})")
        exit(1)

    ### Read from the video capture ###
    got_frame, frame = vc.read()

    ### Initialize for stats calculation ###
    last_count = 0
    total_count = 0

    ### Loop until stream is over ###
    while got_frame:
        ### Pre-process the image as needed ###
        image, normalization_consts = preprocess_image(
            frame, width=640, height=640, preserve_aspect_ratio=True)
        batch = image[np.newaxis, :, :, :]

        ### Start asynchronous inference for specified request ###
        infer_request_handle = infer_network.async_exec_net(batch)

        ### Wait for the result ###
        detections_arr = infer_network.async_wait(infer_request_handle)

        ### Get the results of the inference request ###
        detections = infer_network.get_output(
            detections_arr, threshold=prob_threshold,
            whitelist_filter=[1],
            normalization_consts=normalization_consts)

        ### Extract any desired stats from the results ###
        # TODO improve: use the bounding boxes to identify whether it is the
        # same person and to support multiple people; currently this works
        # for the assignment video.
        current_count = detections['num_detections']
        if current_count > last_count and last_count == 0:
            start_time = vc.get(cv2.CAP_PROP_POS_MSEC)
            total_count = total_count + current_count - last_count
        if current_count < last_count and current_count == 0:
            # The person's duration in the video is calculated.
            duration = int(
                (vc.get(cv2.CAP_PROP_POS_MSEC) - start_time) / 1000.0)
            # Publish messages to the MQTT server.
            client.publish("person/duration",
                           json.dumps({"duration": duration}))
        last_count = current_count

        ### Send current_count and total_count to the MQTT server ###
        ### Topic "person": keys of "count" and "total" ###
        client.publish(
            "person",
            json.dumps({
                "count": current_count,
                "total": total_count
            }))

        ### Draw bounding boxes to provide intuition ###
        img = draw_bboxes(frame, detections)
        cv2.putText(img, f'current: {current_count} total: {total_count}',
                    (0, 100), cv2.FONT_HERSHEY_SIMPLEX, .5,
                    (255, 255, 255), 2, cv2.LINE_AA)

        ### Send the frame to the FFMPEG server ###
        sys.stdout.buffer.write(img)
        sys.stdout.flush()

        ### Write an output image if `single_image_mode` ###
        if vc.get(cv2.CAP_PROP_FRAME_COUNT) == 1.0:
            cv2.imwrite('ov_od.png', img)

        ### Read from the video capture ###
        got_frame, frame = vc.read()

    vc.release()
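# ---------------------------------------------------------------------------
# Sketch: the first and third variants call a `draw_bboxes` helper that is
# not shown. A minimal version could be the following (hypothetical; it
# assumes `detections` carries pixel-space boxes under a 'boxes' key next to
# 'num_detections'):
# ---------------------------------------------------------------------------
def draw_bboxes(frame, detections):
    """Draw one green rectangle per detected box and return the frame."""
    for xmin, ymin, xmax, ymax in detections.get('boxes', []):
        cv2.rectangle(frame, (int(xmin), int(ymin)),
                      (int(xmax), int(ymax)), (0, 255, 0), 2)
    return frame

# Note that every variant streams raw BGR frames over stdout: the script is
# meant to be piped into an FFmpeg rawvideo input whose video size matches
# the source frames, while the count/duration stats travel separately over
# MQTT.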