def infer_on_stream():
    # Initialise the class
    infer_network = Network()
    infer_network.load_model(
        "./models/mobilenet_ssd_pedestrian_detection/MobileNetSSD_deploy10695.xml",
        "CPU", CPU_EXTENSION)
    net_input_shape = infer_network.get_input_shape()

    # Handle the input stream: read in colour (the SSD expects 3 channels),
    # resize to the network input, then apply the MobileNet-SSD mean/scale
    img = cv2.imread('./frame1.jpg', cv2.IMREAD_COLOR)
    img = cv2.resize(img, (net_input_shape[3], net_input_shape[2]))
    imgProcessed = (img - 127.5) * 0.007843
    imgProcessed = imgProcessed.astype(np.float32)
    # Change data layout from HWC to CHW and add the batch dimension
    imgProcessed = imgProcessed.transpose((2, 0, 1)).reshape(1, *net_input_shape[1:])

    infer_network.exec_net(imgProcessed)
    if infer_network.wait() == 0:
        # Get the results of the inference request
        result = infer_network.get_output()
        h, w = img.shape[:2]
        box = result[0, 0, :, 3:7] * np.array([w, h, w, h])
        conf = result[0, 0, :, 2]
        for i in range(len(box)):
            # Filter out low-confidence detections and implausibly large boxes
            aR = abs(box[i][2] - box[i][0]) * (box[i][3] - box[i][1])
            if conf[i] > 0.25 and aR < 30000:
                cv2.rectangle(img, (int(box[i][0]), int(box[i][1])),
                              (int(box[i][2]), int(box[i][3])), (0, 255, 0))
        cv2.imwrite("frameProcessed.jpg", img)
def infer_on_stream(cap, out, width, height, args):
    infer_network = Network()
    infer_network.load_model(args.model, args.device, args.cpu_extension)
    net_input_shape = infer_network.get_input_shape()
    while cap.isOpened():
        flag, frame = cap.read()
        if not flag:
            break
        p_frame = cv2.resize(frame, (net_input_shape[3], net_input_shape[2]))
        p_frame = p_frame.transpose((2, 0, 1))
        p_frame = p_frame.reshape(1, *p_frame.shape)
        infer_network.exec_net(p_frame)
        if infer_network.wait() == 0:
            result = infer_network.get_output()
            frame, current_count = draw_boxes(frame, result, width, height,
                                              float(args.prob_threshold))
            send_update = update_count(current_count,
                                       cap.get(cv2.CAP_PROP_POS_MSEC) / 1000)
            if send_update:
                # total_count and duration are module-level stats maintained
                # by update_count (see the sketch below)
                print(current_count, total_count, duration)
            out.write(frame)
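# The snippet above relies on `draw_boxes` and `update_count` helpers that it
# does not define. A minimal sketch of what `update_count` might look like,
# using module-level state; the names and the exact bookkeeping here are
# assumptions, not the original code:

total_count = 0
duration = 0.0
_last_count = 0
_entry_time = 0.0

def update_count(current_count, timestamp):
    """Track total entries and dwell time; return True when the stats changed."""
    global total_count, duration, _last_count, _entry_time
    changed = False
    if current_count > _last_count:      # someone entered the frame
        total_count += current_count - _last_count
        _entry_time = timestamp
        changed = True
    elif current_count < _last_count:    # someone left the frame
        duration = timestamp - _entry_time
        changed = True
    _last_count = current_count
    return changed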
class FacialLandmarkDetection:
    """Facial Landmark Detection Class"""

    def __init__(self, model, device="CPU", extensions=None):
        """set instance variables"""
        self.model_xml = model
        self.device = device
        self.extensions = extensions
        self.infer_network = Network()

    def load_model(self):
        """load the model specified by the user"""
        self.infer_network.load_model(self.model_xml, self.device,
                                      self.extensions)

    def predict(self, image):
        """run predictions on the input image"""
        self.infer_network.exec_net(image)
        return (self.infer_network.get_output()[self.infer_network.output_blob]
                if self.infer_network.wait() == 0 else None)

    def preprocess_input(self, image):
        """preprocess input image"""
        input_shape = self.infer_network.get_input_shape()
        frame = np.copy(image)
        frame = cv2.resize(frame, (input_shape[3], input_shape[2])).transpose(
            (2, 0, 1))
        return frame.reshape(1, *frame.shape)

    def preprocess_output(self, outputs, box, img, overlay_inference):
        """preprocess output image"""
        landmarks = outputs.reshape(1, 10)[0]
        h, w = (box[3] - box[1], box[2] - box[0])
        if overlay_inference:
            for e in range(2):
                # The landmarks are normalised to [0, 1]: convert to int only
                # *after* scaling, otherwise int() truncates them to 0 (the bug
                # that originally forced this overlay to be disabled).
                x, y = (int(w * landmarks[e * 2]), int(h * landmarks[e * 2 + 1]))
                cv2.circle(img, (box[0] + x, box[1] + y), 30, (0, 255, e * 255), 2)
        return (
            img,
            [w * landmarks[0], h * landmarks[1]],
            [w * landmarks[2], h * landmarks[3]],
        )
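# A minimal usage sketch for the class above. Hedged: the model path is a
# placeholder, and `frame` / `face_box` are assumed to come from an upstream
# face detector; `predict` can return None if the request did not complete.

def overlay_landmarks(frame, face_box,
                      model_xml="models/landmarks-regression-retail-0009.xml"):
    """Crop the face, run the landmark model, and draw the eye circles."""
    detector = FacialLandmarkDetection(model_xml)
    detector.load_model()
    xmin, ymin, xmax, ymax = face_box
    face_crop = frame[ymin:ymax, xmin:xmax]
    outputs = detector.predict(detector.preprocess_input(face_crop))
    frame, left_eye, right_eye = detector.preprocess_output(
        outputs, face_box, frame, overlay_inference=True)
    return frame, left_eye, right_eye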
class GazeEstimator:
    def __init__(self, model_name, device='CPU', extensions=None):
        self.network = Network(model_name, device, extensions)

    def load_model(self):
        self.network.load_model()

    def predict(self, right_eye_image, head_pose_angles, left_eye_image):
        _, _, roll = head_pose_angles
        right_eye_image, head_pose_angles, left_eye_image, preprocess_input_time = self._preprocess_input(
            right_eye_image, head_pose_angles, left_eye_image)
        input_dict = {
            "left_eye_image": left_eye_image,
            "right_eye_image": right_eye_image,
            "head_pose_angles": head_pose_angles
        }
        self.network.exec_net(0, input_dict)
        if self.network.wait(0) == 0:
            outputs = self.network.get_output(0)
            gaze_vector, preprocess_output_time = self._preprocess_output(outputs, roll)
            self.preprocess_time = preprocess_input_time + preprocess_output_time
            return gaze_vector

    def _preprocess_input(self, right_eye_image, head_pose_angles, left_eye_image):
        start_preprocess_time = time.time()
        left_eye_image = self._preprocess_eye_image(left_eye_image)
        right_eye_image = self._preprocess_eye_image(right_eye_image)
        head_pose_angles = self._preprocess_angles(head_pose_angles)
        total_preprocess_time = time.time() - start_preprocess_time
        return right_eye_image, head_pose_angles, left_eye_image, total_preprocess_time

    def _preprocess_angles(self, head_pose_angles):
        input_shape = self.network.get_input_shape("head_pose_angles")
        return np.reshape(head_pose_angles, input_shape)

    def _preprocess_eye_image(self, image):
        n, c, h, w = self.network.get_input_shape("left_eye_image")
        input_image = cv2.resize(image, (w, h), interpolation=cv2.INTER_AREA)
        input_image = input_image.transpose((2, 0, 1))
        return input_image.reshape((n, c, h, w))

    def _preprocess_output(self, outputs, roll):
        start_preprocess_time = time.time()
        gaze_vector = outputs[0]
        # The raw gaze vector has non-unit length; normalise it, then rotate
        # it back by the roll angle to align the vector with the camera axes
        gaze_vector_n = gaze_vector / np.linalg.norm(gaze_vector)
        vcos = math.cos(math.radians(roll))
        vsin = math.sin(math.radians(roll))
        x = gaze_vector_n[0] * vcos + gaze_vector_n[1] * vsin
        y = -gaze_vector_n[0] * vsin + gaze_vector_n[1] * vcos
        total_preprocess_time = time.time() - start_preprocess_time
        return [x, y], total_preprocess_time
class FaceDetector:
    '''
    Class for the Face Detection Model.
    '''

    def __init__(self, model_name, device='CPU', extensions=None, threshold=0.60):
        self.threshold = threshold
        self.network = Network(model_name, device, extensions)

    def load_model(self):
        self.network.load_model()

    def predict(self, image):
        input_image, preprocess_input_time = self._preprocess_input(image)
        self.network.exec_net(0, input_image)
        if self.network.wait(0) == 0:
            outputs = self.network.get_output(0)
            face_boxes, preprocess_output_time = self._preprocess_output(outputs, image)
            self.preprocess_time = preprocess_input_time + preprocess_output_time
            return face_boxes

    def _preprocess_input(self, image):
        start_preprocess_time = time.time()
        n, c, h, w = self.network.get_input_shape()
        input_image = cv2.resize(image, (w, h), interpolation=cv2.INTER_AREA)
        input_image = input_image.transpose((2, 0, 1))
        input_image = input_image.reshape((n, c, h, w))
        total_preprocess_time = time.time() - start_preprocess_time
        return input_image, total_preprocess_time

    def _preprocess_output(self, outputs, image):
        start_preprocess_time = time.time()
        face_boxes = []
        h, w, _ = image.shape
        color = (255, 0, 0)
        for obj in outputs[0][0]:
            if obj[2] > self.threshold:
                xmin = int(obj[3] * w)
                ymin = int(obj[4] * h)
                xmax = int(obj[5] * w)
                ymax = int(obj[6] * h)
                face_boxes.append([xmin, ymin, xmax, ymax])
                cv2.rectangle(image, (xmin, ymin), (xmax, ymax), color, 1)
        total_preprocess_time = time.time() - start_preprocess_time
        return face_boxes, total_preprocess_time
def post_conversion(frame, model, cpu_extension, device):
    network = Network()
    network.load_model(model, cpu_extension, device)
    processed_frame = pre_process(frame, net_input_shape=network.get_input_shape())
    inference_start_time = time.time()
    network.exec_net(processed_frame)
    if network.wait() == 0:
        total_inference_time = time.time() - inference_start_time
        result = network.get_all_output()
        output = result['DetectionOutput']
        detection = output[0][0][0]
        image_id, label, conf, x_min, y_min, x_max, y_max = detection
        return str(round(total_inference_time * 1000, 3)) + "ms", conf
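# `post_conversion` relies on a `pre_process` helper that is not shown here.
# A plausible implementation, given how the other snippets in this collection
# prepare frames (treat this as an assumption about the missing code):

import cv2

def pre_process(frame, net_input_shape):
    """Resize to the network input, move channels first, add a batch dim."""
    p_frame = cv2.resize(frame, (net_input_shape[3], net_input_shape[2]))
    p_frame = p_frame.transpose((2, 0, 1))        # HWC -> CHW
    return p_frame.reshape(1, *p_frame.shape)     # add the batch dimension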
def infer_on_image(args):
    client = mqtt.Client()
    client.connect(MQTT_HOST, MQTT_PORT, MQTT_KEEPALIVE_INTERVAL)
    # Initialize the Inference Engine
    plugin = Network()
    # Load the network model into the IE
    plugin.load_model(args.m, args.d, CPU_EXTENSION)
    net_input_shape = plugin.get_input_shape()

    # Read the input image
    img = cv2.imread(args.i)
    height, width, _ = img.shape

    # Preprocess the image
    p_frame = cv2.resize(img, (net_input_shape[3], net_input_shape[2]))
    p_frame = p_frame.transpose((2, 0, 1))
    p_frame = p_frame.reshape(1, *p_frame.shape)

    # Execute the network
    plugin.exec_net(p_frame)
    # Extract the result
    result = plugin.get_output()

    # Statistics on the image
    ppl = 0
    times = []
    counter_frame = 10
    iflag = False
    iflag, ppl, times = count_ppl(result, counter_frame, iflag, ppl, times)

    # Draw the bounding boxes
    out_img = draw_bb(result, width, height, img)
    cv2.imwrite('file.jpg', out_img)
    client.publish('person', json.dumps({'count': ppl}))
    # client.publish('Duration', json.dumps({'duration': times}))

    # Publish the image
    sys.stdout.buffer.write(out_img)
    sys.stdout.flush()
    client.disconnect()
    return ppl
class FacialLandmarksDetector:
    def __init__(self, model_name, device='CPU', extensions=None):
        self.network = Network(model_name, device, extensions)

    def load_model(self):
        self.network.load_model()

    def predict(self, face_image):
        input_image, preprocess_input_time = self._preprocess_input(face_image)
        self.network.exec_net(0, input_image)
        if self.network.wait(0) == 0:
            outputs = self.network.get_output(0)
            eye_boxes, eye_centers, preprocess_output_time = self._preprocess_output(outputs, face_image)
            self.preprocess_time = preprocess_input_time + preprocess_output_time
            return eye_boxes, eye_centers

    def _preprocess_input(self, image):
        start_preprocess_time = time.time()
        n, c, h, w = self.network.get_input_shape()
        input_image = cv2.resize(image, (w, h), interpolation=cv2.INTER_AREA)
        input_image = input_image.transpose((2, 0, 1))
        input_image = input_image.reshape((n, c, h, w))
        total_preprocess_time = time.time() - start_preprocess_time
        return input_image, total_preprocess_time

    def _preprocess_output(self, outputs, image):
        start_preprocess_time = time.time()
        normalized_landmarks = np.squeeze(outputs).reshape((5, 2))
        h, w, _ = image.shape
        color = (255, 255, 255)
        length_offset = int(w * 0.15)
        eye_boxes, eye_centers = [], []
        # The first two of the five landmarks are the eye centres
        for i in range(2):
            normalized_x, normalized_y = normalized_landmarks[i]
            x = int(normalized_x * w)
            y = int(normalized_y * h)
            eye_centers.append([x, y])
            xmin, xmax = max(0, x - length_offset), min(w, x + length_offset)
            ymin, ymax = max(0, y - length_offset), min(h, y + length_offset)
            eye_boxes.append([xmin, ymin, xmax, ymax])
            cv2.rectangle(image, (xmin, ymin), (xmax, ymax), color, 1)
        total_preprocess_time = time.time() - start_preprocess_time
        return eye_boxes, eye_centers, total_preprocess_time
class HeadPoseEstimator:
    '''
    Class for the Head Pose Estimation Model.
    '''

    def __init__(self, model_name, device='CPU', extensions=None):
        self.network = Network(model_name, device, extensions)

    def load_model(self):
        self.network.load_model()

    def predict(self, image):
        input_image, preprocess_input_time = self._preprocess_input(image)
        self.network.exec_net(0, input_image)
        if self.network.wait(0) == 0:
            outputs = self.network.get_output(0)
            head_pose_angles, preprocess_output_time = self._preprocess_output(outputs, image)
            self.preprocess_time = preprocess_input_time + preprocess_output_time
            return head_pose_angles

    def _preprocess_input(self, image):
        start_preprocess_time = time.time()
        n, c, h, w = self.network.get_input_shape()
        input_image = cv2.resize(image, (w, h), interpolation=cv2.INTER_AREA)
        input_image = input_image.transpose((2, 0, 1))
        input_image = input_image.reshape((n, c, h, w))
        total_preprocess_time = time.time() - start_preprocess_time
        return input_image, total_preprocess_time

    def _preprocess_output(self, outputs, image):
        start_preprocess_time = time.time()
        yaw = outputs['angle_y_fc'][0][0]
        pitch = outputs['angle_p_fc'][0][0]
        roll = outputs['angle_r_fc'][0][0]
        total_preprocess_time = time.time() - start_preprocess_time
        return [yaw, pitch, roll], total_preprocess_time
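# How the four classes above are typically chained in a gaze-estimation
# pipeline. A hedged sketch: which of the two eye boxes is the "left" eye
# depends on the landmark model's point order, so the crop ordering passed to
# GazeEstimator.predict is an assumption and may need to be swapped.

def estimate_gaze(frame, face_detector, landmarks_detector,
                  head_pose_estimator, gaze_estimator):
    """Run face -> landmarks + head pose -> gaze for every detected face."""
    gaze_vectors = []
    for xmin, ymin, xmax, ymax in face_detector.predict(frame):
        face = frame[ymin:ymax, xmin:xmax]
        eye_boxes, _ = landmarks_detector.predict(face)
        crops = [face[y0:y1, x0:x1] for x0, y0, x1, y1 in eye_boxes]
        angles = head_pose_estimator.predict(face)  # [yaw, pitch, roll]
        # predict(right_eye_image, head_pose_angles, left_eye_image)
        gaze_vectors.append(gaze_estimator.predict(crops[0], angles, crops[1]))
    return gaze_vectors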
def main():
    """
    Load the network and parse the SSD output.
    :return: None
    """
    # Connect to the MQTT server
    client = mqtt.Client()
    client.connect(MQTT_HOST, MQTT_PORT, MQTT_KEEPALIVE_INTERVAL)
    log.basicConfig(format="[ %(levelname)s ] %(message)s",
                    level=log.INFO,
                    stream=sys.stdout)

    # Flag for the input image
    single_image_mode = False
    cur_request_id = 0
    last_count = 0
    total_count = 0
    start_time = 0

    model = os.environ['MODEL']
    device = os.environ.get('DEVICE', 'CPU')
    cpu_extension = os.environ.get('CPU_EXTENSION')

    # Checks for live feed
    if os.environ['INPUT'] == 'CAM':
        input_stream = 0
    # Checks for input image
    elif os.environ['INPUT'].endswith('.jpg') or os.environ['INPUT'].endswith('.bmp'):
        single_image_mode = True
        input_stream = os.environ['INPUT']
    # Checks for video file
    else:
        input_stream = os.environ['INPUT']
        assert os.path.isfile(os.environ['INPUT']), "Specified input file doesn't exist"

    cap = cv2.VideoCapture(input_stream)
    if input_stream:
        cap.open(os.environ['INPUT'])
    if not cap.isOpened():
        log.error("ERROR! Unable to open video source")

    # Initialise the class
    infer_network = Network()
    # Load the network to the IE plugin to get the shape of the input layer
    n, c, h, w = infer_network.load_model(model, device, 1, 1,
                                          cur_request_id, cpu_extension)[1]

    global initial_w, initial_h
    initial_w = cap.get(3)
    initial_h = cap.get(4)
    fps = cap.get(cv2.CAP_PROP_FPS)

    cmdstring = (
        'ffmpeg',
        '-y',                                    # overwrite output
        '-r', '%d' % fps,                        # frame rate of the source
        '-s', '%dx%d' % (initial_w, initial_h),  # size of the image string
        '-pixel_format', 'bgr24',                # pixel format
        '-f', 'rawvideo',
        '-i', '-',                               # expect raw video on stdin
        'http://localhost:8090/fac.ffm')         # output encoding
    p = subprocess.Popen(cmdstring, stdin=subprocess.PIPE)

    while cap.isOpened():
        flag, frame = cap.read()
        if not flag:
            break
        key_pressed = cv2.waitKey(60)

        # Start async inference
        inf_start = time.time()
        image = cv2.resize(frame, (w, h))
        # Change data layout from HWC to CHW
        image = image.transpose((2, 0, 1))
        image = image.reshape((n, c, h, w))
        # Start asynchronous inference for the specified request
        infer_network.exec_net(cur_request_id, image)

        # Wait for the result
        if infer_network.wait(cur_request_id) == 0:
            det_time = time.time() - inf_start
            # Results of the output layer of the network
            result = infer_network.get_output(cur_request_id)
            if int(os.environ.get('PERF_COUNTS', 0)) > 0:
                perf_count = infer_network.performance_counter(cur_request_id)
                performance_counts(perf_count)
            frame, current_count = ssd_parser(frame, result)

            inf_time_message = "Inference time: {:.3f}ms".format(det_time * 1000)
            cv2.putText(frame, inf_time_message, (15, 15),
                        cv2.FONT_HERSHEY_COMPLEX, 0.5, (200, 10, 10), 1)

            # When a new person enters the video
            if current_count > last_count:
                start_time = time.time()
                total_count = total_count + current_count - last_count
                client.publish("person", json.dumps({"total": total_count}))

            # Person duration in the video is calculated
            if current_count < last_count:
                duration = int(time.time() - start_time)
                # Publish messages to the MQTT server
                client.publish("person/duration", json.dumps({"duration": duration}))

            client.publish("person", json.dumps({"count": current_count}))
            last_count = current_count

            if key_pressed == 27:
                break

        p.stdin.write(frame.tobytes())

    if single_image_mode:
        cv2.imwrite('output_image.jpg', frame)

    cap.release()
    cv2.destroyAllWindows()
    client.disconnect()
    infer_network.clean()
def intruder_detector():
    """
    Process the input source frame by frame and detect intruders, if any.
    :return status: 0 on success, negative value on failure
    """
    global CONF_CANDIDATE_CONFIDENCE
    global LOG_WIN_HEIGHT
    global LOG_WIN_WIDTH
    global CONF_FILE
    global video_caps
    global conf_labels_file_path

    parse_args()
    if not os.path.isfile(CONF_FILE):
        return -12, ""
    if not os.path.isfile(conf_labels_file_path):
        return -13, ""

    # Create a subdirectory to save output snapshots
    pathlib.Path(os.getcwd() + '/output/').mkdir(parents=True, exist_ok=True)

    # Read the configuration file
    ret, req_labels = get_input()
    if ret != 0:
        return ret, req_labels[0]
    if not video_caps:
        return -14, ''

    # Get the labels that are used in the application
    ret, label_names, used_labels = get_used_labels(req_labels)
    if ret != 0:
        return ret, ''
    if True not in used_labels:
        return -15, ''

    # Init a rolling log to store events
    rolling_log_size = int((LOG_WIN_HEIGHT - 15) / 20)
    log_list = collections.deque(maxlen=rolling_log_size)

    # Open a file for intruder logs
    log_file = open(LOG_FILE_PATH, 'w')
    if not log_file:
        return -16, ''

    # Initialize a VideoWriter for each source
    for video_cap in video_caps:
        ret, ret_value = video_cap.init_vw(int(video_cap.input_height),
                                           int(video_cap.input_width))
        if ret != 0:
            return ret, ret_value

    # Initialise the class
    infer_network = Network()
    # Load the network to the IE plugin to get the shape of the input layer
    n, c, h, w = infer_network.load_model(model_xml, TARGET_DEVICE, 1, 1, 0, CPU_EXTENSION)

    min_fps = min([i.vc.get(cv2.CAP_PROP_FPS) for i in video_caps])
    no_more_data = [False] * len(video_caps)
    start_time = time.time()
    inf_time = 0
    fourcc = cv2.VideoWriter_fourcc(*'avc1')
    statsVideo = cv2.VideoWriter(os.path.join(output_dir, 'Statistics.mp4'),
                                 fourcc, min_fps,
                                 (LOG_WIN_WIDTH, LOG_WIN_HEIGHT), True)
    job_id = os.environ['PBS_JOBID']
    progress_file_path = os.path.join(output_dir, 'i_progress_' + str(job_id) + '.txt')
    infer_start_time = time.time()

    # Main loop starts here: loop over all the video captures
    while True:
        for idx, video_cap in enumerate(video_caps):
            # Get a new frame
            vfps = int(round(video_cap.vc.get(cv2.CAP_PROP_FPS)))
            for i in range(0, int(round(vfps / min_fps))):
                ret, video_cap.frame = video_cap.vc.read()
                video_cap.loop_frames += 1
                # If no new frame or an error in reading a frame, exit the loop
                if not ret:
                    no_more_data[idx] = True
                    break
            if no_more_data[idx]:
                stream_end_frame = numpy.zeros(
                    (int(video_cap.input_height), int(video_cap.input_width), 1),
                    dtype='uint8')
                stream_end_message = "Stream from {} has ended.".format(video_cap.cam_name)
                cv2.putText(stream_end_frame, stream_end_message,
                            (int(video_cap.input_width / 2) - 30,
                             int(video_cap.input_height / 2) - 30),
                            cv2.FONT_HERSHEY_COMPLEX, 0.5, (255, 255, 255), 1)
                continue
            for i in range(video_cap.no_of_labels):
                video_cap.current_count[i] = 0
                video_cap.changed_count[i] = False

            # PRE-PROCESS STAGE:
            # Resize the frame to the inference resolution expected by the
            # model (from the model .xml file); the IE expects planar input,
            # so change the data layout from HWC (packed) to CHW
            in_frame = cv2.resize(video_cap.frame, (w, h))
            in_frame = in_frame.transpose((2, 0, 1))
            in_frame = in_frame.reshape((n, c, h, w))

            # Start asynchronous inference for the specified request
            inf_start = time.time()
            infer_network.exec_net(0, in_frame)
            # Wait for the result
            if infer_network.wait(0) == 0:
                inf_time = time.time() - inf_start
                # Results of the output layer of the network
                res = infer_network.get_output(0)
                for obj in res[0][0]:
                    label = int(obj[1]) - 1
                    # Draw the bounding box around the object when the
                    # probability is above the specified threshold
                    if obj[2] > CONF_THRESHOLD_VALUE and used_labels[label]:
                        video_cap.current_count[label] += 1
                        xmin = int(obj[3] * video_cap.input_width)
                        ymin = int(obj[4] * video_cap.input_height)
                        xmax = int(obj[5] * video_cap.input_width)
                        ymax = int(obj[6] * video_cap.input_height)
                        # Draw a bounding box around the detected intruder
                        cv2.rectangle(video_cap.frame, (xmin, ymin),
                                      (xmax, ymax), (0, 255, 0), 4, 16)

                for i in range(video_cap.no_of_labels):
                    if video_cap.candidate_count[i] == video_cap.current_count[i]:
                        video_cap.candidate_confidence[i] += 1
                    else:
                        video_cap.candidate_confidence[i] = 0
                        video_cap.candidate_count[i] = video_cap.current_count[i]

                    if video_cap.candidate_confidence[i] == CONF_CANDIDATE_CONFIDENCE:
                        video_cap.candidate_confidence[i] = 0
                        video_cap.changed_count[i] = True
                    else:
                        continue

                    if video_cap.current_count[i] > video_cap.last_correct_count[i]:
                        video_cap.total_count[i] += video_cap.current_count[i] - video_cap.last_correct_count[i]
                        det_objs = video_cap.current_count[i] - video_cap.last_correct_count[i]
                        total_count = sum(video_cap.total_count)
                        for det_obj in range(det_objs):
                            current_time = time.strftime("%H:%M:%S")
                            log = "{} - Intruder {} detected on {}".format(
                                current_time, label_names[i], video_cap.cam_name)
                            print(log)
                            log_list.append(log)
                            log_file.write(log + "\n")
                            event = Event(event_time=current_time,
                                          intruder=label_names[i],
                                          count=total_count,
                                          frame=video_cap.frame_count)
                            video_cap.events.append(event)
                            snapshot_name = "output/intruder_{}.png".format(total_count)
                            cv2.imwrite(snapshot_name, video_cap.frame)
                    video_cap.last_correct_count[i] = video_cap.current_count[i]

            # Create the intruder log window, add logs to the frame and display it
            log_window = numpy.zeros((LOG_WIN_HEIGHT, LOG_WIN_WIDTH, 1), dtype='uint8')
            for i, log in enumerate(log_list):
                cv2.putText(log_window, log, (10, 20 * i + 15),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
            log_window = cv2.cvtColor(log_window, cv2.COLOR_GRAY2BGR)
            statsVideo.write(log_window)
            video_cap.frame_count += 1

            # Video output
            inf_time_message = "Inference time: {:.3f} ms".format(inf_time * 1000)
            cv2.putText(video_cap.frame, inf_time_message,
                        (10, int(video_cap.input_height) - 30),
                        cv2.FONT_HERSHEY_COMPLEX, 0.5, (200, 10, 10), 1)
            fps_time = time.time() - start_time
            fps_message = "FPS: {:.3f} fps".format(1 / fps_time)
            cv2.putText(video_cap.frame, fps_message,
                        (10, int(video_cap.input_height) - 10),
                        cv2.FONT_HERSHEY_COMPLEX, 0.5, (200, 10, 10), 1)

            # Write the video output
            video_cap.vw.write(video_cap.frame)
            if video_cap.frame_count % 10 == 0:
                progressUpdate(progress_file_path,
                               time.time() - infer_start_time,
                               video_cap.frame_count,
                               int(video_cap.vc.get(cv2.CAP_PROP_FRAME_COUNT)))
            start_time = time.time()

            # Loop the video to mimic continuous input if LOOP_VIDEO is True
            if LOOP_VIDEO and not video_cap.is_cam:
                vfps = int(round(video_cap.vc.get(cv2.CAP_PROP_FPS)))
                # If a video capture has ended, restart it
                if video_cap.loop_frames > video_cap.vc.get(cv2.CAP_PROP_FRAME_COUNT) - int(round(vfps / min_fps)):
                    video_cap.loop_frames = 0
                    video_cap.vc.set(cv2.CAP_PROP_POS_FRAMES, 0)

        if False not in no_more_data:
            progressUpdate(progress_file_path,
                           time.time() - infer_start_time,
                           int(video_cap.vc.get(cv2.CAP_PROP_FRAME_COUNT)),
                           int(video_cap.vc.get(cv2.CAP_PROP_FRAME_COUNT)))
            break

    t2 = time.time() - infer_start_time
    for videos in video_caps:
        with open(os.path.join(output_dir, 'stats.txt'), 'w') as f:
            f.write('{} \n'.format(round(t2)))
            f.write('{} \n'.format(videos.frame_count))

    infer_network.clean()
    log_file.close()
    return 0, ''
def infer_on_stream(args, client):
    """
    Initialize the inference network, stream video to network, and output stats
    and video.

    :param args: Command line arguments parsed by `build_argparser()`
    :param client: MQTT client
    :return: None
    """
    single_image_mode = False  # flag for the input images
    cur_request_id = 0
    last_count = 0
    total_count = 0
    start_time = 0
    track_threshold = 0.1
    max_len = 30
    track = deque(maxlen=max_len)

    # Initialise the class
    infer_network = Network()
    # Set Probability threshold for detections
    prob_threshold = args.prob_threshold

    ### TODO: Load the model through `infer_network` ###
    n, c, h, w = infer_network.load_model(args.model, args.device,
                                          cur_request_id, args.cpu_extension)[1]

    ### TODO: Handle the input stream ###
    # Checks for live feed
    if args.input == 'CAM':
        input_stream = 0
    # Checks for input image
    elif args.input.endswith('.jpg') or args.input.endswith('.bmp'):
        single_image_mode = True
        input_stream = args.input
    # Checks for video file
    else:
        input_stream = args.input
        assert os.path.isfile(args.input), "Specified input file doesn't exist"

    cap = cv2.VideoCapture(input_stream)
    if input_stream:
        cap.open(args.input)

    initial_w = cap.get(3)
    initial_h = cap.get(4)

    ### TODO: Loop until stream is over ###
    while cap.isOpened():
        ### TODO: Read from the video capture ###
        flag, frame = cap.read()
        if not flag:
            break
        key_pressed = cv2.waitKey(60)

        ### TODO: Pre-process the image as needed ###
        image = cv2.resize(frame, (w, h))
        image = image.transpose((2, 0, 1))
        image = image.reshape((n, c, h, w))

        ### TODO: Start asynchronous inference for specified request ###
        inf_start = time.time()
        infer_network.exec_net(cur_request_id, image)

        ### TODO: Wait for the result ###
        if infer_network.wait(cur_request_id) == 0:
            det_time = time.time() - inf_start

            ### TODO: Get the results of the inference request ###
            result = infer_network.get_output(cur_request_id)

            ### TODO: Extract any desired stats from the results ###
            inf_time_message = "Inference time: {:.3f}ms".format(det_time * 1000)
            cv2.putText(frame, inf_time_message, (25, 25),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 150), 1)

            ### TODO: Calculate and send relevant information on ###
            ### current_count, total_count and duration to the MQTT server ###
            ### Topic "person": keys of "count" and "total" ###
            ### Topic "person/duration": key of "duration" ###
            current_count = 0
            for obj in result[0][0]:
                # Draw a bounding box for the object when its probability
                # is above the specified threshold
                if obj[2] > prob_threshold:
                    xmin = int(obj[3] * initial_w)
                    ymin = int(obj[4] * initial_h)
                    xmax = int(obj[5] * initial_w)
                    ymax = int(obj[6] * initial_h)
                    cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (255, 0, 150), 1)
                    current_count = current_count + 1

            # Proportion of recent frames with a positive detection;
            # this smooths out flickering detections
            track.append(current_count)
            num_tracked = 0
            if np.sum(track) / max_len > track_threshold:
                num_tracked = 1

            if num_tracked > last_count:
                start_time = time.time()
                total_count = total_count + num_tracked - last_count
                client.publish("person", json.dumps({"total": total_count}))

            # Person duration in the video is calculated
            if num_tracked < last_count:
                duration = int(time.time() - start_time)
                # Publish messages to the MQTT server
                client.publish("person/duration", json.dumps({"duration": duration}))

            client.publish("person", json.dumps({"count": current_count}))
            last_count = num_tracked

            if key_pressed == 27:
                cap.release()
                cv2.destroyAllWindows()
                client.disconnect()
                break

        ### TODO: Send the frame to the FFMPEG server ###
        frame = cv2.resize(frame, (768, 432))
        sys.stdout.buffer.write(frame)
        sys.stdout.flush()

    ### TODO: Write an output image if `single_image_mode` ###
    if single_image_mode:
        cv2.imwrite('output_image.jpg', frame)

    cap.release()
    cv2.destroyAllWindows()
    client.disconnect()
def infer_on_stream(args, client):
    """
    Initialize the inference network, stream video to network, and output stats
    and video.

    :param args: Command line arguments parsed by `build_argparser()`
    :param client: MQTT client
    :return: None
    """
    global initial_w, initial_h, prob_threshold

    # Flag for the input image
    single_image_mode = False
    cur_request_id = 0
    last_count = 0
    total_count = 0
    start_time = 0

    # Initialize the Inference Engine
    infer_network = Network()
    # Set Probability threshold for detections
    prob_threshold = args.prob_threshold

    # Load the network model into the IE
    n, c, h, w = infer_network.load_model(args.model, args.device, 1, 1,
                                          cur_request_id, args.cpu_extension)[1]

    # Checks for live feed
    if args.input == 'CAM':
        input_stream = 0
    # Checks for input image
    elif args.input.endswith('.jpg') or args.input.endswith('.bmp'):
        single_image_mode = True
        input_stream = args.input
    # Checks for video file
    else:
        input_stream = args.input
        assert os.path.isfile(args.input), "Specified input file doesn't exist"

    cap = cv2.VideoCapture(input_stream)
    if input_stream:
        cap.open(args.input)
    if not cap.isOpened():
        log.error("ERROR! Unable to open video source")

    initial_w = cap.get(3)
    initial_h = cap.get(4)

    while cap.isOpened():
        # Read the next frame
        flag, frame = cap.read()
        if not flag:
            break
        key_pressed = cv2.waitKey(60)

        # Pre-process the frame: change data layout from HWC to CHW
        image = cv2.resize(frame, (w, h))
        image = image.transpose((2, 0, 1))
        image = image.reshape((n, c, h, w))

        inf_start = time.time()
        # Perform inference on the frame
        infer_network.exec_net(cur_request_id, image)
        if infer_network.wait(cur_request_id) == 0:
            det_time = time.time() - inf_start
            result = infer_network.get_output(cur_request_id)
            # if args.perf_counts:
            #     perf_count = infer_network.performance_counter(cur_request_id)
            #     performance_counts(perf_count)
            frame, current_count = ssd_out(frame, result)

            inf_time_message = "Inference time: {:.3f}ms".format(det_time * 1000)
            cv2.putText(frame, inf_time_message, (15, 15),
                        cv2.FONT_HERSHEY_COMPLEX, 0.5, (200, 10, 10), 1)

            # Calculate and send current_count, total_count and duration to the
            # MQTT server: topic "person" with keys "count" and "total",
            # topic "person/duration" with key "duration"

            # When a new person enters the video
            if current_count > last_count:
                start_time = time.time()
                total_count = total_count + current_count - last_count
                client.publish("person", json.dumps({"total": total_count}))

            # Person duration in the video is calculated
            if current_count < last_count:
                duration = int(time.time() - start_time)
                # Publish messages to the MQTT server
                client.publish("person/duration", json.dumps({"duration": duration}))

            client.publish("person", json.dumps({"count": current_count}))
            last_count = current_count

        sys.stdout.buffer.write(frame)
        sys.stdout.flush()

    if single_image_mode:
        cv2.imwrite('output_image.jpg', frame)
class GazeEstimationModel:
    '''
    Class for the Gaze Estimation Model.
    '''

    def __init__(self, model_name, device='CPU', extensions=None):
        '''
        Set the instance variables.
        '''
        self.model_xml = model_name
        self.device = device
        self.extensions = extensions
        # Initialise the class
        self.infer_network = Network()

    def load_model(self):
        '''
        Load the model to the device specified by the user.
        If the model requires any plugins, this is where they are loaded.
        '''
        self.infer_network.load_model(self.model_xml, self.device, self.extensions)

    def predict(self, left_eye_image, right_eye_image, headpose_angles):
        '''
        Run predictions on the input images.
        '''
        self.infer_network.exec_net(headpose_angles, left_eye_image, right_eye_image)
        # Wait for the result
        if self.infer_network.wait() == 0:
            result = self.infer_network.get_output()[self.infer_network.output_blob]
            return result

    def check_model(self):
        raise NotImplementedError

    def preprocess_input(self, frame, face, left_eye_point, right_eye_point, print_flag=True):
        '''
        Preprocess the data before feeding it into the model for inference.

        The model expects:
        - left_eye_image: blob in the format [BxCxHxW] (batch size, channels,
          height, width) with shape [1x3x60x60]
        - right_eye_image: blob in the format [BxCxHxW] with shape [1x3x60x60]
        - head_pose_angles: blob in the format [BxC] with shape [1x3]
        '''
        lefteye_input_shape = [1, 3, 60, 60]
        righteye_input_shape = [1, 3, 60, 60]

        # Crop the left eye
        x_center, y_center = left_eye_point
        width = lefteye_input_shape[3]
        height = lefteye_input_shape[2]
        facewidthedge = face.shape[1]
        faceheightedge = face.shape[0]
        # Clamp the crop (ymin:ymax, xmin:xmax) to the face edges
        ymin = max(0, int(y_center - height // 2))
        ymax = min(faceheightedge, int(y_center + height // 2))
        xmin = max(0, int(x_center - width // 2))
        xmax = min(facewidthedge, int(x_center + width // 2))
        left_eye_image = face[ymin:ymax, xmin:xmax]

        # Print the left eye to the frame
        if print_flag:
            frame[150:150 + left_eye_image.shape[0],
                  20:20 + left_eye_image.shape[1]] = left_eye_image

        # Left eye [1x3x60x60]
        p_frame_left = cv2.resize(left_eye_image,
                                  (lefteye_input_shape[3], lefteye_input_shape[2]))
        p_frame_left = p_frame_left.transpose((2, 0, 1))
        p_frame_left = p_frame_left.reshape(1, *p_frame_left.shape)

        # Crop the right eye
        x_center, y_center = right_eye_point
        width = righteye_input_shape[3]
        height = righteye_input_shape[2]
        # Clamp the crop (ymin:ymax, xmin:xmax) to the face edges
        ymin = max(0, int(y_center - height // 2))
        ymax = min(faceheightedge, int(y_center + height // 2))
        xmin = max(0, int(x_center - width // 2))
        xmax = min(facewidthedge, int(x_center + width // 2))
        right_eye_image = face[ymin:ymax, xmin:xmax]

        # Print the right eye to the frame
        if print_flag:
            frame[150:150 + right_eye_image.shape[0],
                  100:100 + right_eye_image.shape[1]] = right_eye_image

        # Right eye [1x3x60x60]
        p_frame_right = cv2.resize(right_eye_image,
                                   (righteye_input_shape[3], righteye_input_shape[2]))
        p_frame_right = p_frame_right.transpose((2, 0, 1))
        p_frame_right = p_frame_right.reshape(1, *p_frame_right.shape)

        return frame, p_frame_left, p_frame_right

    def preprocess_output(self, outputs, image, facebox, left_eye_point,
                          right_eye_point, print_flag=True, threshold=0.5):
        '''
        Preprocess the output of this model before feeding it to the next one.

        The net outputs a blob with the shape [1, 3], containing the Cartesian
        coordinates of the gaze direction vector. Note that the output vector
        is not normalized and has non-unit length.

        Output layer name in Inference Engine format: gaze_vector
        '''
        x = outputs[0][0]
        y = outputs[0][1]
        z = outputs[0][2]

        # Draw the output
        if print_flag:
            cv2.putText(
                image,
                "x:{:.1f}, y:{:.1f}, z:{:.1f}".format(x * 100, y * 100, z),
                (20, 100), 0, 0.6, (0, 0, 255), 1)

        xmin, ymin, _, _ = facebox
        # Left eye centre in image coordinates
        left_eye_center_x = int(xmin + left_eye_point[0])
        left_eye_center_y = int(ymin + left_eye_point[1])
        # Right eye centre in image coordinates
        right_eye_center_x = int(xmin + right_eye_point[0])
        right_eye_center_y = int(ymin + right_eye_point[1])

        cv2.arrowedLine(image, (left_eye_center_x, left_eye_center_y),
                        (left_eye_center_x + int(x * 100),
                         left_eye_center_y + int(-y * 100)),
                        (255, 100, 100), 5)
        cv2.arrowedLine(image, (right_eye_center_x, right_eye_center_y),
                        (right_eye_center_x + int(x * 100),
                         right_eye_center_y + int(-y * 100)),
                        (255, 100, 100), 5)

        return image, [x, y, z]
def infer_on_stream(args, client):
    isImage = False
    # Handle the input stream
    if args.input != 'CAM':
        assert os.path.isfile(args.input)
    if args.input == 'CAM':
        args.input = 0
    elif args.input.endswith(('.jpg', '.bmp', '.png')):
        isImage = True

    last_count = 0
    total_count = 0
    durationList = []
    inferenceList = []
    f_n = 0  # false negatives, for analysis purposes

    # Initialise the class
    infer_network = Network()
    # Set Probability threshold for detections
    prob_threshold = args.prob_threshold

    # Load the model through `infer_network`
    mdl_start = cv2.getTickCount()
    infer_network.load_model(args.model, args.device, args.cpu_extension)
    load_time = utils.timeLapse(mdl_start)

    cap = cv2.VideoCapture(args.input)
    cap.open(args.input)
    if not cap.isOpened():
        log.error("ERROR! Unable to open video source")
        exit(1)

    w, h = utils.getSrcDim(cap)  # dimensions from the captured source
    if not isImage:
        out = cv2.VideoWriter('out.mp4', utils.getCODEC(),
                              cap.get(cv2.CAP_PROP_FPS), (w, h))
    else:
        out = None

    # Loop until the stream is over
    while cap.isOpened():
        # Read from the video capture
        flag, frame = cap.read()
        if not flag:
            break
        key_pressed = cv2.waitKey(60)

        # Pre-process the image as needed
        p_frame = utils.preprocessed_input(infer_network, frame)

        # Start asynchronous inference for the specified request
        inf_start = time.time()
        infer_network.exec_net(p_frame, request_id=0)

        # Wait for the result
        if infer_network.wait(request_id=0) == 0:
            det_time = time.time() - inf_start
            inferenceList.append(det_time * 1000)

            # Get the results of the inference request
            result = infer_network.get_output(request_id=0)

            # Extract any desired stats from the results
            frame, count, f_n = drawBBoxes(frame, result, prob_threshold, w, h,
                                           last_count, f_n)
            inf_time_message = "Inference time: {:.3f}ms".format(det_time * 1000)
            cv2.putText(frame, inf_time_message, (15, 15),
                        cv2.FONT_HERSHEY_COMPLEX, 0.5, (200, 10, 10), 1)

            # When a new person enters the video
            if count > last_count:
                frameProcessor.to_PersonOut()      # mark the previous person as having moved out
                frameProcessor.newPersonEntered()  # record the new entry and start its timer
                total_count = total_count + count - last_count
                client.publish("person", json.dumps({"total": total_count}))
                # Note: duration based on cv2.CAP_PROP_POS_MSEC /
                # CAP_PROP_POS_FRAMES worked fine for video files, but would
                # not work in CAM mode, hence the generic timer-based logic.

            # Calculate and send current_count, total_count and duration to the
            # MQTT server: topic "person" with keys "count" and "total",
            # topic "person/duration" with key "duration"
            if count < last_count:
                duration = float(time.time() - frameProcessor.getPersonEntrytime())
                frameProcessor.to_PersonOut()
                durationList.append(duration)
                # Publish the average duration spent by people to the MQTT server
                client.publish("person/duration",
                               json.dumps({"duration": round(np.mean(durationList))}))

            client.publish("person", json.dumps({"count": count}))
            last_count = count
            if key_pressed == 27:
                break

        # Send the frame to the FFMPEG server
        sys.stdout.buffer.write(frame)
        sys.stdout.flush()

        # Write an output image if `single_image_mode`
        if isImage:
            cv2.imwrite('output/output_image.jpg', frame)
        else:
            out.write(frame)

    log.info('######################################################')
    log.info('# Average Inference Time :: {:.3f} ms'.format(np.mean(inferenceList)))
    log.info('# (IR) Model Size (XML) :: {}'.format(
        metrics.getSize(utils.getMOFiles(args.model)['model'])))
    log.info('# (IR) Model Weight (BIN) :: {}'.format(
        metrics.getSize(utils.getMOFiles(args.model)['weights'])))
    log.info('# Total Model Load Time :: {:.3f} ms'.format(load_time))
    log.info('# Set Probability Threshold :: {}'.format(prob_threshold))
    log.info('# No. of False Negatives @ 0.75 & 0.5 times of the set threshold :: {}'.format(f_n))
    log.info('# Error_percent in detecting Total ppl :: {}'.format(
        metrics.getErrorPercent(total_count, "people")))
    log.info('# Error_percent in average duration :: {}'.format(
        metrics.getErrorPercent(round(np.mean(durationList)), "duration")))
    log.info('######################################################')

    release(out, cap, client)
def infer_on_stream(args, client):
    """
    Initialize the inference network, stream video to network, and output stats
    and video.

    :param args: Command line arguments parsed by `build_argparser()`
    :param client: MQTT client
    :return: None
    """
    # Initialise the class
    infer_network = Network()
    # Set Probability threshold for detections
    prob_threshold = args.prob_threshold

    ### TODO: Load the model through `infer_network` ###
    infer_network.load_model(model=args.model, cpu_extension=args.cpu_extension)

    ### TODO: Handle the input stream ###
    cap = cv2.VideoCapture(args.input)
    cap.open(args.input)
    width = int(cap.get(3))
    height = int(cap.get(4))
    # out = cv2.VideoWriter('out2.mp4', 0x00000021, 30, (width, height))  # used to create an output video file

    counter = 0
    time_start = 0
    count_person = 0
    total_count_person = 0
    last_count = 0
    elapsed = 0
    elapsed_prom = 0
    frame_out = 0
    time_counter = 0
    conf_prom = 0
    single_image_mode = 0
    count_frame_person_total = 0

    ### TODO: Loop until stream is over ###
    while cap.isOpened():
        counter += 1
        time_counter += 1

        ### TODO: Read from the video capture ###
        frame_prev_out = frame_out
        flag, frame = cap.read()
        if not flag:
            if counter == 2:
                single_image_mode = 1
            break

        ### TODO: Pre-process the image as needed ###
        shape_input = infer_network.get_input_shape()
        frame_proc = cv2.resize(frame, (shape_input[3], shape_input[2]))
        frame_proc = np.transpose(frame_proc, (2, 0, 1))
        frame_proc = np.reshape(frame_proc, (1, 3, shape_input[2], shape_input[3]))

        ### TODO: Start asynchronous inference for specified request ###
        start = timer()  # used for measuring the inference time
        infer_network.exec_net(frame_proc)

        ### TODO: Wait for the result ###
        if infer_network.wait() == 0:
            elapsed = timer() - start
            elapsed_prom = elapsed_prom + elapsed

            ### TODO: Get the results of the inference request ###
            output_boxes = infer_network.get_output()

            ### TODO: Extract any desired stats from the results ###
            # This part has been adapted from: https://knowledge.udacity.com/questions/139281
            frame_out, count_person, conf = draw_boxes(frame, output_boxes, args, width, height)
            if count_person > 0:
                conf_prom += conf
                count_frame_person_total += count_person

            ### TODO: Calculate and send relevant information on ###
            ### current_count, total_count and duration to the MQTT server ###
            ### Topic "person": keys of "count" and "total" ###
            ### Topic "person/duration": key of "duration" ###
            client.publish("person", json.dumps({"count": count_person}))
            if count_person > last_count:
                time_start = counter / 10
                total_count_person = total_count_person + count_person - last_count
                client.publish("person", json.dumps({"total": total_count_person}))

            # Person duration in the video is calculated
            if count_person < last_count:
                duration = int(counter / 10 - time_start)
                counter = 0
                # Publish messages to the MQTT server
                client.publish("person/duration", json.dumps({"duration": duration}))

            last_count = count_person

        # out.write(frame)  # used to create an output video file

        ### TODO: Send the frame to the FFMPEG server ###
        sys.stdout.buffer.write(frame)
        sys.stdout.flush()

    ### TODO: Write an output image if `single_image_mode` ###
    if single_image_mode == 1:
        cv2.imwrite("/home/workspace/resources/out.png", frame_prev_out)

    # print(elapsed_prom / (time_counter - 1))
    # print(conf_prom / count_frame_person_total)
    # out.release()
    cap.release()
    cv2.destroyAllWindows()
    client.disconnect()
def infer_on_stream(args, client):
    """
    Initialize the inference network, stream video to network, and output stats
    and video.

    :param args: Command line arguments parsed by `build_argparser()`
    :param client: MQTT client
    :return: None
    """
    # Initialise the class
    infer_network = Network()
    # Set Probability threshold for detections
    prob_threshold = args.prob_threshold

    ### TODO: Load the model through `infer_network` ###
    model = args.model
    DEVICE = args.device
    CPU_EXTENSION = args.cpu_extension
    infer_network.load_model(model, CPU_EXTENSION, DEVICE)
    network_shape = infer_network.get_input_shape()

    ### TODO: Handle the input stream ###
    # Checks for live feed
    if args.input == 'CAM':
        input_validated = 0
    # Checks for input image
    elif args.input.endswith('.jpg') or args.input.endswith('.bmp'):
        single_image_mode = True
        input_validated = args.input
    # Checks for video file
    else:
        input_validated = args.input
        assert os.path.isfile(args.input), "file doesn't exist"

    cap = cv2.VideoCapture(input_validated)
    cap.open(input_validated)
    w = int(cap.get(3))
    h = int(cap.get(4))
    in_shape = network_shape['image_tensor']

    # Initialize variables
    report = 0
    counter = 0
    counter_prev = 0
    duration_prev = 0
    counter_total = 0
    dur = 0
    request_id = 0

    ### TODO: Loop until stream is over ###
    while cap.isOpened():
        ### TODO: Read from the video capture ###
        flag, frame = cap.read()
        if not flag:
            break

        ### TODO: Pre-process the image as needed ###
        image = cv2.resize(frame, (in_shape[3], in_shape[2]))
        image_p = image.transpose((2, 0, 1))
        image_p = image_p.reshape(1, *image_p.shape)

        ### TODO: Start asynchronous inference for specified request ###
        net_input = {'image_tensor': image_p, 'image_info': image_p.shape[1:]}
        duration_report = None
        infer_network.exec_net(net_input, request_id)

        ### TODO: Wait for the result ###
        if infer_network.wait() == 0:
            ### TODO: Get the results of the inference request ###
            net_output = infer_network.get_output()

            ### TODO: Extract any desired stats from the results ###
            pointer = 0
            probs = net_output[0, 0, :, 2]
            for i, p in enumerate(probs):
                if p > prob_threshold:
                    pointer += 1
                    box = net_output[0, 0, i, 3:]
                    p1 = (int(box[0] * w), int(box[1] * h))
                    p2 = (int(box[2] * w), int(box[3] * h))
                    frame = cv2.rectangle(frame, p1, p2, (0, 255, 0), 3)

            # Debounce the detection count: only report a change after it
            # has been stable for at least 3 frames
            if pointer != counter:
                counter_prev = counter
                counter = pointer
                if dur >= 3:
                    duration_prev = dur
                    dur = 0
                else:
                    dur = duration_prev + dur
                    duration_prev = 0  # unknown, not needed in this case
            else:
                dur += 1
                if dur >= 3:
                    report = counter
                    if dur == 3 and counter > counter_prev:
                        counter_total += counter - counter_prev
                    elif dur == 3 and counter < counter_prev:
                        duration_report = int((duration_prev / 10.0) * 1000)

            ### TODO: Calculate and send relevant information on ###
            ### current_count, total_count and duration to the MQTT server ###
            ### Topic "person": keys of "count" and "total" ###
            ### Topic "person/duration": key of "duration" ###
            client.publish('person',
                           payload=json.dumps({'count': report, 'total': counter_total}),
                           qos=0, retain=False)
            if duration_report is not None:
                client.publish('person/duration',
                               payload=json.dumps({'duration': duration_report}),
                               qos=0, retain=False)

        ### TODO: Send the frame to the FFMPEG server ###
        ### TODO: Write an output image if `single_image_mode` ###
        frame = cv2.resize(frame, (768, 432))
        sys.stdout.buffer.write(frame)
        sys.stdout.flush()

    cap.release()
    cv2.destroyAllWindows()
def infer_on_stream(args, client):
    """
    Initialize the inference network, stream video to network, and output stats
    and video.

    :param args: Command line arguments parsed by `build_argparser()`
    :param client: MQTT client
    :return: None
    """
    modelPath = args.model
    deviceType = args.device
    cpuExt = args.cpu_extension
    probThresh = args.prob_threshold
    filePath = args.input

    # Initialise the class
    infer_network = Network()
    # Set Probability threshold for detections
    prob_threshold = probThresh

    ### TODO: Load the model through `infer_network` ###
    if filePath.lower() == "cam":
        camera = cv2.VideoCapture(0)
    elif filePath.split(".")[-1].lower() in ['jpg', 'jpeg', 'png', 'bmp']:
        # Single-image mode: run one synchronous inference and return
        infer_network.load_model(modelPath, 1, deviceType, cpuExt)
        image_input_shape = infer_network.get_input_shape()
        img = cv2.imread(filePath, cv2.IMREAD_COLOR)
        resized_frame = cv2.resize(img, (image_input_shape[3], image_input_shape[2]))
        frame_preproc = np.transpose(np.expand_dims(resized_frame.copy(), axis=0),
                                     (0, 3, 1, 2))
        infer_network.exec_net(frame_preproc)
        if infer_network.wait() == 0:
            outputs = infer_network.get_output()
            box_frame, count, bbox = extract_box(img, outputs, prob_threshold)
            cv2.putText(box_frame, "Count:" + str(count), (20, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 3)
            cv2.imwrite('output.jpg', box_frame)
        return
    else:
        if not os.path.isfile(filePath):
            # The given input file is not present
            exit(1)
        camera = cv2.VideoCapture(filePath)

    ### TODO: Handle the input stream ###
    client.connect(MQTT_HOST, MQTT_PORT, MQTT_KEEPALIVE_INTERVAL)
    if not camera.isOpened():
        # Error opening the video stream or file
        exit(1)

    # Two inference requests so the next frame can be submitted while the
    # current one is being post-processed
    cur_req_id = 0
    next_req_id = 1
    num_requests = 2
    infer_network.load_model(modelPath, num_requests, deviceType, cpuExt)
    image_input_shape = infer_network.get_input_shape()
    ret, frame = camera.read()

    ### TODO: Loop until stream is over ###
    total_count = 0
    pres_count = 0
    prev_count = 0
    start_time = 0
    no_bbox = 0
    duration = 0
    prev_bbox_x = 0

    while camera.isOpened():
        ### TODO: Read from the video capture ###
        ret, next_frame = camera.read()
        if not ret:
            break
        key = cv2.waitKey(60)

        ### TODO: Pre-process the image as needed ###
        resized_frame = cv2.resize(next_frame.copy(),
                                   (image_input_shape[3], image_input_shape[2]))
        frame_preproc = np.transpose(np.expand_dims(resized_frame.copy(), axis=0),
                                     (0, 3, 1, 2))

        ### TODO: Start asynchronous inference for specified request ###
        infer_network.exec_net(frame_preproc.copy(), req_id=next_req_id)

        ### TODO: Wait for the result ###
        if infer_network.wait(cur_req_id) == 0:
            ### TODO: Get the results of the inference request ###
            outputs = infer_network.get_output(cur_req_id)

            ### TODO: Extract any desired stats from the results ###
            frame, pres_count, bbox = extract_box(frame.copy(), outputs[0], prob_threshold)
            box_w = frame.shape[1]
            tl, br = bbox  # top_left, bottom_right

            ### TODO: Calculate and send relevant information on ###
            ### current_count, total_count and duration to the MQTT server ###
            ### Topic "person": keys of "count" and "total" ###
            ### Topic "person/duration": key of "duration" ###
            if pres_count > prev_count:
                start_time = time.time()
                total_count += pres_count - prev_count
                no_bbox = 0
                client.publish("person", json.dumps({"total": total_count}))
            elif pres_count < prev_count:
                if no_bbox <= 20:
                    pres_count = prev_count
                    no_bbox += 1
                elif prev_bbox_x < box_w - 200:
                    pres_count = prev_count
                    no_bbox = 0
                else:
                    duration = int(time.time() - start_time)
                    client.publish("person/duration", json.dumps({"duration": duration}))
            if not (tl is None and br is None):
                prev_bbox_x = int((tl[0] + br[0]) / 2)
            prev_count = pres_count
            client.publish("person", json.dumps({"count": pres_count}))

        ### TODO: Send the frame to the FFMPEG server ###
        sys.stdout.buffer.write(frame)
        sys.stdout.flush()

        ### TODO: Write an output image if `single_image_mode` ###
        cur_req_id, next_req_id = next_req_id, cur_req_id
        frame = next_frame
        if key == 27:
            break

    # output_video.release()
    camera.release()
    client.disconnect()
class HeadPoseEstimationModel:
    '''
    Class for the Head Pose Estimation Model.
    '''

    def __init__(self, model_name, device='CPU', extensions=None):
        '''
        Set the instance variables.
        '''
        self.model_xml = model_name
        self.device = device
        self.extensions = extensions
        # Initialise the class
        self.infer_network = Network()

    def load_model(self):
        '''
        Load the model to the device specified by the user.
        If the model requires any plugins, this is where they are loaded.
        '''
        self.infer_network.load_model(self.model_xml, self.device, self.extensions)

    def predict(self, image):
        '''
        Run predictions on the input image.
        '''
        self.infer_network.exec_net(image)
        # Wait for the result
        if self.infer_network.wait() == 0:
            result = self.infer_network.get_output()
            return result

    def check_model(self):
        raise NotImplementedError

    def preprocess_input(self, image):
        '''
        Preprocess the data before feeding it into the model for inference.
        '''
        # Expected input shape: [1x3x60x60]
        net_input_shape = self.infer_network.get_input_shape()
        p_frame = np.copy(image)
        p_frame = cv2.resize(p_frame, (net_input_shape[3], net_input_shape[2]))
        p_frame = p_frame.transpose((2, 0, 1))
        p_frame = p_frame.reshape(1, *p_frame.shape)
        return p_frame

    def preprocess_output(self, outputs, image, face, facebox, print_flag=True, threshold=0.5):
        '''
        Preprocess the output of this model before feeding it to the next one.

        Output layer names in Inference Engine format:
        name: "angle_y_fc", shape: [1, 1] - Estimated yaw (in degrees).
        name: "angle_p_fc", shape: [1, 1] - Estimated pitch (in degrees).
        name: "angle_r_fc", shape: [1, 1] - Estimated roll (in degrees).
        Each output contains one float value (yaw, pitch, roll).
        '''
        yaw = outputs['angle_y_fc'][0][0]
        pitch = outputs['angle_p_fc'][0][0]
        roll = outputs['angle_r_fc'][0][0]

        # Draw the output
        if print_flag:
            cv2.putText(image, "y:{:.1f}".format(yaw), (20, 20), 0, 0.6, (255, 255, 0))
            cv2.putText(image, "p:{:.1f}".format(pitch), (20, 40), 0, 0.6, (255, 255, 0))
            cv2.putText(image, "r:{:.1f}".format(roll), (20, 60), 0, 0.6, (255, 255, 0))

        xmin, ymin, _, _ = facebox
        face_center = (xmin + face.shape[1] / 2, ymin + face.shape[0] / 2, 0)
        self.draw_axes(image, face_center, yaw, pitch, roll)

        return image, [yaw, pitch, roll]

    # code source: https://knowledge.udacity.com/questions/171017
    def draw_axes(self, frame, center_of_face, yaw, pitch, roll):
        focal_length = 950.0
        scale = 50
        yaw *= np.pi / 180.0
        pitch *= np.pi / 180.0
        roll *= np.pi / 180.0
        cx = int(center_of_face[0])
        cy = int(center_of_face[1])
        Rx = np.array([[1, 0, 0],
                       [0, math.cos(pitch), -math.sin(pitch)],
                       [0, math.sin(pitch), math.cos(pitch)]])
        Ry = np.array([[math.cos(yaw), 0, -math.sin(yaw)],
                       [0, 1, 0],
                       [math.sin(yaw), 0, math.cos(yaw)]])
        Rz = np.array([[math.cos(roll), -math.sin(roll), 0],
                       [math.sin(roll), math.cos(roll), 0],
                       [0, 0, 1]])
        # ref: https://www.learnopencv.com/rotation-matrix-to-euler-angles/
        R = Rz @ Ry @ Rx
        camera_matrix = self.build_camera_matrix(center_of_face, focal_length)
        xaxis = np.array(([1 * scale, 0, 0]), dtype='float32').reshape(3, 1)
        yaxis = np.array(([0, -1 * scale, 0]), dtype='float32').reshape(3, 1)
        zaxis = np.array(([0, 0, -1 * scale]), dtype='float32').reshape(3, 1)
        zaxis1 = np.array(([0, 0, 1 * scale]), dtype='float32').reshape(3, 1)
        o = np.array(([0, 0, 0]), dtype='float32').reshape(3, 1)
        o[2] = camera_matrix[0][0]
        xaxis = np.dot(R, xaxis) + o
        yaxis = np.dot(R, yaxis) + o
        zaxis = np.dot(R, zaxis) + o
        zaxis1 = np.dot(R, zaxis1) + o
        xp2 = (xaxis[0] / xaxis[2] * camera_matrix[0][0]) + cx
        yp2 = (xaxis[1] / xaxis[2] * camera_matrix[1][1]) + cy
        p2 = (int(xp2), int(yp2))
        cv2.line(frame, (cx, cy), p2, (0, 0, 255), 2)
        xp2 = (yaxis[0] / yaxis[2] * camera_matrix[0][0]) + cx
        yp2 = (yaxis[1] / yaxis[2] * camera_matrix[1][1]) + cy
        p2 = (int(xp2), int(yp2))
        cv2.line(frame, (cx, cy), p2, (0, 255, 0), 2)
        xp1 = (zaxis1[0] / zaxis1[2] * camera_matrix[0][0]) + cx
        yp1 = (zaxis1[1] / zaxis1[2] * camera_matrix[1][1]) + cy
        p1 = (int(xp1), int(yp1))
        xp2 = (zaxis[0] / zaxis[2] * camera_matrix[0][0]) + cx
        yp2 = (zaxis[1] / zaxis[2] * camera_matrix[1][1]) + cy
        p2 = (int(xp2), int(yp2))
        cv2.line(frame, p1, p2, (255, 0, 0), 2)
        cv2.circle(frame, p2, 3, (255, 0, 0), 2)
        return frame

    # code source: https://knowledge.udacity.com/questions/171017
    def build_camera_matrix(self, center_of_face, focal_length):
        cx = int(center_of_face[0])
        cy = int(center_of_face[1])
        camera_matrix = np.zeros((3, 3), dtype='float32')
        camera_matrix[0][0] = focal_length
        camera_matrix[0][2] = cx
        camera_matrix[1][1] = focal_length
        camera_matrix[1][2] = cy
        camera_matrix[2][2] = 1
        return camera_matrix
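# A quick sanity check for the rotation convention used in draw_axes above:
# composing the three elementary rotations with zero angles must give the
# identity, and a pure roll (rotation about the camera axis) should only mix
# the x/y axes. This is a self-contained sketch, not part of the original code.

import numpy as np

def _rotation(yaw, pitch, roll):
    """Same Rz @ Ry @ Rx composition as draw_axes (angles in radians)."""
    Rx = np.array([[1, 0, 0],
                   [0, np.cos(pitch), -np.sin(pitch)],
                   [0, np.sin(pitch), np.cos(pitch)]])
    Ry = np.array([[np.cos(yaw), 0, -np.sin(yaw)],
                   [0, 1, 0],
                   [np.sin(yaw), 0, np.cos(yaw)]])
    Rz = np.array([[np.cos(roll), -np.sin(roll), 0],
                   [np.sin(roll), np.cos(roll), 0],
                   [0, 0, 1]])
    return Rz @ Ry @ Rx

assert np.allclose(_rotation(0, 0, 0), np.eye(3))
assert np.allclose(_rotation(0, 0, np.pi / 2) @ [1, 0, 0], [0, 1, 0])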
def infer_on_stream(args, client): """ Initialize the inference network, stream video to network, and output stats and video. :param args: Command line arguments parsed by `build_argparser()` :param client: MQTT client :return: None """ global i_w, i_h, prob_threshold current_request_num = 0 total_count = 0 latest_count = 0 previous_count = 0 duration_sum = 0 duration_in_frame = 0.0 frame_count = 0 infer_frame_count = 0 single_image_mode = False # Initialise the class infer_network = Network() # Set Probability threshold for detections prob_threshold = args.prob_threshold client.connect(HOSTNAME, port=MQTT_PORT, keepalive=60, bind_address=IPADDRESS) ### Load the model through `infer_network` ### n, c, h, w = infer_network.load_model(args.model, args.device, 1, 1, current_request_num, args.cpu_extension) ### Handle the input stream ### if args.input.endswith('.jpg') or args.input.endswith('.bmp'): single_image_mode = True input_stream = args.input else: input_stream = args.input capture_frames = cv2.VideoCapture(input_stream) length_of_video = int(capture_frames.get(cv2.CAP_PROP_FRAME_COUNT)) frame_rate = int(capture_frames.get(cv2.CAP_PROP_FPS)) ### Read from the video capture ### infer_time_start = time.time() if input_stream: capture_frames.open(args.input) if not capture_frames.isOpened(): log.error("Unable to Open the Video File.") i_w = capture_frames.get(3) i_h = capture_frames.get(4) out = cv2.VideoWriter(os.path.join("people_counter.mp4"), 0x00000021, frame_rate, (int(i_w), int(i_h)), True) while capture_frames.isOpened(): isEnd, frame = capture_frames.read() frame_count += 1 current_count = 0 if not isEnd: break cv2.waitKey(10) ### Pre-process the image as needed ### inf_image = cv2.resize(frame, (w, h)) inf_image = inf_image.transpose((2, 0, 1)) inf_image = inf_image.reshape((n, c, h, w)) # Starting the Asynchronous Inference: inf_start = time.time() infer_network.exec_net(current_request_num, inf_image) ### Waiting for the result ### if infer_network.wait(current_request_num) == 0: duration = (time.time() - inf_start) results = infer_network.get_output(current_request_num) out_frame, current_count = draw_frame_on_inference(frame, results) duration_message = "Inference Time Per Frame: {:.3f}ms".format( duration * 1000) if current_count > 0: infer_frame_count += 1 duration_sum += float(infer_frame_count) / frame_rate if current_count > 0 and infer_frame_count > args.frames_ignore and previous_count > 0: ''' If the Count of People Goes up and keeps like that for more than ''' previous_count = max(previous_count, current_count) if previous_count == 0 and infer_frame_count > args.frames_ignore: total_count += current_count # infer_frame_count = 0 previous_count = max(previous_count, current_count) client.publish("person", json.dumps({"count": current_count})) client.publish("person", json.dumps({"total": total_count})) if args.enable_alert_limit is not None and current_count >= args.enable_alert_limit: client.publish( "alert", json.dumps({ "alert_msg": "Stampede", "count": current_count })) intruder_msg = "STAMPEDE ALERT, CURRENT COUNT {} IS SAME OR EXCEEDED SAFE LIMIT {}".format( current_count, args.enable_alert_limit) cv2.putText(out_frame, intruder_msg, (15, 45), cv2.FONT_HERSHEY_DUPLEX, 0.5, (10, 10, 210), 1) if previous_count != 0 and current_count == 0: duration_in_frame = infer_frame_count / frame_rate for i in range(previous_count): client.publish("person/duration", json.dumps({"duration": duration_in_frame})) if current_count == 0: infer_frame_count = 0 previous_count = 
current_count
                duration_sum = 0.0
                client.publish("person", json.dumps({"count": current_count}))

            cv2.putText(out_frame, duration_message, (15, 15),
                        cv2.FONT_HERSHEY_DUPLEX, 0.5, (210, 10, 10), 1)
            people_count_msg = "People count -- current frame: {}; total: {}".format(
                current_count, total_count)
            cv2.putText(out_frame, people_count_msg, (15, 30),
                        cv2.FONT_HERSHEY_DUPLEX, 0.5, (210, 10, 10), 1)
            out.write(out_frame)
            client.publish("person", json.dumps({"count": current_count}))

        ### Send the frame to the FFMPEG server ###
        sys.stdout.buffer.write(out_frame)
        sys.stdout.flush()

    ### Write an output image if `single_image_mode` ###
    if single_image_mode:
        cv2.imwrite('infer_out.jpg', frame)

    capture_frames.release()
    client.disconnect()
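# `draw_frame_on_inference`, used above, is defined elsewhere in that project.
# A minimal sketch of what it plausibly does for an SSD-style [1, 1, N, 7]
# output ([image_id, label, conf, xmin, ymin, xmax, ymax]); the 0.6 default
# threshold is an assumption, not taken from the original code.
import cv2


def draw_frame_on_inference(frame, results, threshold=0.6):
    h, w = frame.shape[:2]
    count = 0
    for det in results[0][0]:
        if det[2] > threshold:  # confidence gate
            xmin, ymin = int(det[3] * w), int(det[4] * h)
            xmax, ymax = int(det[5] * w), int(det[6] * h)
            cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (0, 255, 0), 1)
            count += 1
    return frame, count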
def infer_on_stream(args, client):
    """
    Initialize the inference network, stream video to network,
    and output stats and video.

    :param args: Command line arguments parsed by `build_argparser()`
    :param client: MQTT client
    :return: None
    """
    frame_count = 0
    frame_time = 0
    duration_prev = 0
    total_count = 0
    time_thresh = 0
    person_count_in_each_frame = 0
    last_count = 0
    previous_last_count = 0
    font_scale = 0.5
    font = cv2.FONT_HERSHEY_SIMPLEX
    # Flag for the input image
    single_image_mode = False
    # Initialise the class
    infer_network = Network()
    # Set Probability threshold for detections
    prob_threshold = args.prob_threshold
    model = args.model
    device = args.device

    ### TODO: Load the model through `infer_network` ###
    infer_network.load_model(model, device, CPU_EXTENSION)
    infer_network_input_shape = infer_network.get_input_shape()

    # Check if the input is a webcam
    if args.input == 'CAM':
        input_type = 0
    # Checks for input image
    elif args.input.endswith('.jpg') or args.input.endswith('.bmp'):
        single_image_mode = True
        input_type = args.input
    # Checks for video file
    else:
        input_type = args.input
        assert os.path.isfile(args.input), "Specified input file doesn't exist"

    ### TODO: Handle the input stream ###
    input_stream = cv2.VideoCapture(input_type)
    if input_type:
        input_stream.open(args.input)
    if not input_stream.isOpened():
        log.error("ERROR! Unable to open video source")

    # Grab the shape of the input
    width = int(input_stream.get(3))
    height = int(input_stream.get(4))

    if not single_image_mode:
        # The fourcc argument should be cv2.VideoWriter_fourcc('M','J','P','G')
        # on Mac, and 0x00000021 on Linux
        out = cv2.VideoWriter('output_video.mp4', 0x00000021, 30,
                              (width, height))
    else:
        out = None

    ### TODO: Loop until stream is over ###
    while input_stream.isOpened():
        ### TODO: Read from the video capture ###
        flag, frame = input_stream.read()
        if not flag:
            break
        frame_count += 1
        t = time.time()
        key_pressed = cv2.waitKey(60)

        ### TODO: Pre-process the image to the model's [n, c, h, w] layout ###
        preProcessed_frame = cv2.resize(
            frame, (infer_network_input_shape[3], infer_network_input_shape[2]))
        preProcessed_frame = preProcessed_frame.transpose((2, 0, 1))
        preProcessed_frame = preProcessed_frame.reshape(
            1, *preProcessed_frame.shape)

        ### TODO: Start asynchronous inference for specified request ###
        inferencing_start = time.time()
        total_time_spent = None
        infer_network.exec_net(preProcessed_frame)

        ### TODO: Wait for the result ###
        if infer_network.wait() == 0:
            detection_time = time.time() - inferencing_start
            ### TODO: Get the results of the inference request ###
            result = infer_network.get_output()
            frame, current_count = draw_bounding_boxes(
                frame, result, prob_threshold, width, height)
            inference_time_message = "Inference time: {:.3f}ms".format(
                detection_time * 1000)
            cv2.putText(frame, inference_time_message, (25, 25),
                        cv2.FONT_HERSHEY_COMPLEX, font_scale, (0, 10, 250), 1)

            ### TODO: Extract any desired stats from the results ###
            if current_count == last_count:
                time_thresh += 1
                if time_thresh >= 10:
                    person_count_in_each_frame = last_count
                if time_thresh == 10 and last_count > previous_last_count:
                    total_count += last_count - previous_last_count
                elif time_thresh == 10 and last_count < previous_last_count:
                    total_time_spent = int((duration_prev / 10.0) * 1000)  # in ms
            else:
                previous_last_count = last_count
                last_count = current_count
                if time_thresh >= 10:
                    duration_prev = time_thresh
                    time_thresh = 0
                else:
                    time_thresh = duration_prev + time_thresh
            current_count_label = "No. of persons: {}".format(current_count)
            cv2.putText(frame, current_count_label, (25, 50), font,
                        font_scale, (255, 0, 0), 1)
            total_count_label = "Total persons detected: {}".format(total_count)
            cv2.putText(frame, total_count_label, (25, 75), font,
                        font_scale, (255, 0, 0), 1)

            alert_flag = False
            alert_msg = None
            if current_count > 5:
                alert_msg = "ALERT!!! {} persons are in the same place".format(
                    current_count)
                alert_flag = True
            if total_time_spent is not None and total_time_spent > 3000000:
                # long-stay threshold, in ms
                alert_msg = ("ALERT!!! {} persons have been in the store "
                             "for a long time.".format(current_count))
                alert_flag = True
            if alert_flag:
                # Draw the alert on a filled red banner
                rectangle_bgr = (0, 0, 255)
                # Get the width and height of the text box
                (text_width, text_height) = cv2.getTextSize(
                    alert_msg, font, fontScale=font_scale, thickness=1)[0]
                # Set the text start position
                text_offset_x = 0
                text_offset_y = frame.shape[0] - 15
                # Make the coords of the box with a small padding
                box_coords = ((text_offset_x, text_offset_y),
                              (text_offset_x + text_width + 5,
                               text_offset_y - text_height - 5))
                cv2.rectangle(frame, box_coords[0], box_coords[1],
                              rectangle_bgr, cv2.FILLED)
                cv2.putText(frame, alert_msg, (text_offset_x, text_offset_y),
                            font, 0.45, color=(255, 255, 255), thickness=1)

            frame_time += time.time() - t
            fps = frame_count / float(frame_time)
            fps_label = "FPS : {:.2f}".format(fps)
            cv2.putText(frame, fps_label, (25, 100), font, font_scale,
                        (255, 0, 0), 1)

            ### TODO: Calculate and send relevant information on ###
            ### current_count, total_count and duration to the MQTT server ###
            ### Topic "person": keys of "count" and "total" ###
            ### Topic "person/duration": key of "duration" ###
            client.publish(
                "person",
                json.dumps({"count": current_count, "total": total_count}))
            if total_time_spent is not None:
                client.publish("person/duration",
                               json.dumps({"duration": total_time_spent}))

        ### TODO: Write an output image if `single_image_mode`, else append the
        ### full-size frame to the output video (before resizing for FFMPEG) ###
        if single_image_mode:
            cv2.imwrite('output_image.jpg', cv2.resize(frame, (1980, 1080)))
        else:
            out.write(frame)

        ### TODO: Send the frame to the FFMPEG server ###
        frame = cv2.resize(frame, (768, 432))
        sys.stdout.buffer.write(frame)
        sys.stdout.flush()

        # Break if escape key pressed
        if key_pressed == 27:
            break

    # Release the capture and destroy any OpenCV windows
    if not single_image_mode:
        out.release()
    input_stream.release()
    cv2.destroyAllWindows()
    ### TODO: Disconnect from MQTT
    client.disconnect()
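# The variants in this file consume several project-specific CLI flags
# (`frames_ignore`, `enable_alert_limit`, `frame_skip_rate`, `perf_counts`)
# whose parser is not shown. A hedged sketch of how `build_argparser` would
# wire them up; all defaults and help strings are guesses, not the originals.
from argparse import ArgumentParser


def build_argparser():
    parser = ArgumentParser()
    parser.add_argument("-m", "--model", required=True,
                        help="Path to the model XML file")
    parser.add_argument("-i", "--input", required=True,
                        help="'CAM', an image, or a video path")
    parser.add_argument("-d", "--device", default="CPU",
                        help="Target device for inference")
    parser.add_argument("-l", "--cpu_extension", default=None,
                        help="Path to a CPU extension library, if any")
    parser.add_argument("-pt", "--prob_threshold", type=float, default=0.5,
                        help="Detection confidence threshold")
    parser.add_argument("--frames_ignore", type=int, default=10,
                        help="Frames to wait before trusting a count change")
    parser.add_argument("--enable_alert_limit", type=int, default=None,
                        help="Publish a stampede alert at or above this count")
    parser.add_argument("-s", "--frame_skip_rate", type=int, default=0,
                        help="Skip this many frames between inferences")
    parser.add_argument("--perf_counts", action="store_true",
                        help="Print per-layer performance counters")
    return parser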
def infer_on_stream(args, client): """ Initialize the inference network, stream video to network, and output stats and video. :param args: Command line arguments parsed by `build_argparser()` :param client: MQTT client :return: None """ global single_image_mode # Initialise the class infer_network = Network() # Set Probability threshold for detections prob_threshold = args.prob_threshold ### TODO: Load the model through `infer_network` ### plugin = Network() plugin.load_model(args.model, args.device, args.cpu_extension) net_input_shape = plugin.get_input_shape() ### TODO: Handle the input stream ### if args.input == 'CAM': args.input = 0 elif args.input.endswith('.jpg') or args.input.endswith('.bmp'): single_image_mode = True cap = cv2.VideoCapture(args.input) cap.open(args.input) width = int(cap.get(3)) height = int(cap.get(4)) fps = cap.get(cv2.CAP_PROP_FPS) # storing the fps of the video ### TODO: Loop until stream is over ### frame_no = 0 conf_arr = [] while cap.isOpened(): ### TODO: Read from the video capture ### flag, frame = cap.read() frame_no = frame_no + 1 start_infr = time.time() if frame_no % (args.frame_skip_rate + 1) == 0 or single_image_mode == True: # frame will be skipped based on the -s argument (frame_skip_rate) to decrease the inference time. if not flag: break ### TODO: Pre-process the image as needed ### key_pressed = cv2.waitKey(60) new_frame = np.copy(frame) p_frame = cv2.resize(frame, (net_input_shape[3], net_input_shape[2])) p_frame = p_frame.transpose((2, 0, 1)) p_frame = p_frame.reshape(1, *p_frame.shape) ### TODO: Start asynchronous inference for specified request ### plugin.exec_net(p_frame) ### TODO: Wait for the result ### if plugin.wait() == 0: ### TODO: Get the results of the inference request ### result = plugin.get_output() ### TODO: Extract any desired stats from the results ### out_frame, conf = draw_boxes(new_frame, result, prob_threshold, width, height) conf_arr.append(conf) ### TODO: Calculate and send relevant information on ### ### current_count, total_count and duration to the MQTT server ### ### Topic "person": keys of "count" and "total" ### ### Topic "person/duration": key of "duration" ### postprocess(conf, frame_no, conf_arr, client, fps) infr_arr.append((time.time() - start_infr)) ### TODO: Send the frame to the FFMPEG server ### sys.stdout.buffer.write(out_frame) sys.stdout.flush() if key_pressed == 27: break ### TODO: Write an output image if `single_image_mode` ### if single_image_mode: cv2.imwrite('output_image.jpg', out_frame) cap.release() cv2.destroyAllWindows() client.disconnect()
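# `postprocess(conf, frame_no, conf_arr, client, fps)` above is not shown.
# One plausible shape, sketched purely as an assumption: debounce the
# per-frame detection flags over a short window, then publish count/total on
# "person" and the visit length on "person/duration". State handling, window
# size, and the majority vote are all illustrative choices.
import json

total_count = 0
entry_frame = None


def postprocess(conf, frame_no, conf_arr, client, fps, window=5):
    global total_count, entry_frame
    recent = conf_arr[-window:]                           # last few frames
    present = sum(1 for c in recent if c) > window // 2   # majority vote
    if present and entry_frame is None:                   # person entered
        entry_frame = frame_no
        total_count += 1
        client.publish("person", json.dumps({"total": total_count}))
    elif not present and entry_frame is not None:         # person left
        duration = (frame_no - entry_frame) / fps
        client.publish("person/duration", json.dumps({"duration": duration}))
        entry_frame = None
    client.publish("person", json.dumps({"count": int(present)}))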
def infer_on_stream(args, client): """ Initialize the inference network, stream video to network, and output stats and video. :param args: Command line arguments parsed by `build_argparser()` :param client: MQTT client :return: None """ # Initialise the class infer_network = Network() # Set Probability threshold for detections # prob_threshold = args.prob_threshold cur_request_id = 0 last_count = 0 total_count = 0 start_time = 0 time_on_video = 0 time_not_on_video = 0 image_mode = False positive_count = 0 ### TODO: Load the model through `infer_network` ### n, c, h, w = infer_network.load_model(args.model, args.device, 1, 1, cur_request_id, args.cpu_extension)[1] ### TODO: Handle the input stream ### # Checks for image input if args.input.endswith('.jpg') or args.input.endswith('.png') or \ args.input.endswith('.bmp'): image_mode = True media_stream = args.input # Checks for webcam input elif args.input == 'CAM': media_stream = 0 # Check for video input else: media_stream = args.input assert os.path.isfile(args.input) ### TODO: Loop until stream is over ### capture = cv2.VideoCapture(media_stream) if media_stream: capture.open(args.input) if not capture.isOpened(): log.error("Not able to open the video file!") ### TODO: Read from the video capture ### # global width, height, prob_threshold prob_threshold = args.prob_threshold width = capture.get(3) height = capture.get(4) while capture.isOpened(): check, frame = capture.read() if not check: break ### TODO: Pre-process the image as needed ### image = cv2.resize(frame, (w, h)) image = image.transpose(2, 0, 1) image = image.reshape(n, c, h, w) ### TODO: Start asynchronous inference for specified request ### inference_start = time.time() infer_network.exec_net(cur_request_id, image) ### TODO: Wait for the result ### if infer_network.wait(cur_request_id) == 0: inference_time = time.time() - inference_start ### TODO: Get the results of the inference request ### result = infer_network.get_output(cur_request_id) # if perf_counts: # perf_count = infer_network.exec_net(cur_request_id) # performance_counts(perf_count) ### TODO: Extract any desired stats from the results ### current_count = 0 track_frames = {} track_person = {positive_count: 0} frame_count = 0 for character in result[0][0]: if character[2] > prob_threshold: frame_count += 1 track_frames[frame_count] = character[2] start_time_not_on_video = time.time() positive_count += 1 track_person[positive_count] = time_on_video xmin = int(character[3] * width) ymin = int(character[4] * height) xmax = int(character[5] * width) ymax = int(character[6] * height) frame = cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (0, 55, 255), 1) time_on_video = start_time_not_on_video - start_time if time_on_video > 3: if current_count > 1: current_count = last_count else: current_count += 1 else: current_count = last_count ### TODO: Calculate and send relevant information on ### ### current_count, total_count and duration to the MQTT server ### ### Topic "person": keys of "count" and "total" ### ### Topic "person/duration": key of "duration" ### if current_count > last_count: start_time = time.time() time_not_on_video = time.time() - start_time_not_on_video if current_count == 1 and last_count == 0: if time_on_video > 2: total_count = total_count + current_count - last_count client.publish("person", json.dumps({"total": total_count})) if current_count < last_count: if current_count == 0: start_time_not_on_video = time.time() time_on_video = int(time.time() - start_time) if last_count == 0 and time_not_on_video 
< 0.005:
                    time_on_video = track_person[positive_count] + time_on_video
                client.publish("person/duration",
                               json.dumps({"duration": time_on_video}))

            client.publish("person", json.dumps({"count": current_count}))
            last_count = current_count

            cv2.putText(frame,
                        "Inference time = {:.2f} ms".format(inference_time * 1000),
                        (15, 15), cv2.FONT_HERSHEY_COMPLEX, 0.5, (200, 10, 10), 1)
            cv2.putText(frame, "Persons in video frame = {}".format(last_count),
                        (15, 30), cv2.FONT_HERSHEY_COMPLEX, 0.5, (200, 10, 10), 1)
            cv2.putText(frame, "Total count = {}".format(total_count),
                        (15, 45), cv2.FONT_HERSHEY_COMPLEX, 0.5, (200, 10, 10), 1)
            cv2.putText(frame, "Time on video = {:.2f} s".format(time_on_video),
                        (15, 60), cv2.FONT_HERSHEY_COMPLEX, 0.5, (200, 10, 10), 1)
            cv2.putText(frame,
                        "Time not on video = {:.3f} ms".format(
                            time_not_on_video * 1000),
                        (15, 75), cv2.FONT_HERSHEY_COMPLEX, 0.5, (200, 10, 10), 1)

            key = cv2.waitKey(15)
            if key == ord('q'):
                break

        ### TODO: Send the frame to the FFMPEG server ###
        sys.stdout.buffer.write(frame)
        sys.stdout.flush()

        ### TODO: Write an output image if `single_image_mode` ###
        if image_mode:
            cv2.imwrite('output.jpg', frame)

    capture.release()
    cv2.destroyAllWindows()
    client.disconnect()
    infer_network.clean()
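# The entry/exit timing above threads its state through several loose
# variables, which is easy to get wrong. A small debounced timer object keeps
# the same idea in one place; this is an illustrative alternative, not code
# from the original project.
import time


class PresenceTimer:
    def __init__(self, debounce_s=0.5):
        self.debounce_s = debounce_s
        self.entered_at = None
        self.last_seen = None

    def update(self, detected, now=None):
        """Return the visit duration in seconds when a visit ends, else None."""
        now = time.time() if now is None else now
        if detected:
            self.last_seen = now
            if self.entered_at is None:  # a new visit begins
                self.entered_at = now
        elif (self.entered_at is not None
              and now - self.last_seen > self.debounce_s):
            # No detection for longer than the debounce window: visit over
            duration = self.last_seen - self.entered_at
            self.entered_at = None
            return duration
        return None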
def infer_on_stream(args, client): """ Initialize the inference network, stream video to network, and output stats and video. :param args: Command line arguments parsed by `build_argparser()` :param client: MQTT client :return: None """ # my) init parameters current_count = 0 total_count = 0 duration = 0 last_count = 0 start_time = 0 isFirst = True single_image_mode = False # Initialise the class (ok) infer_network = Network() # Set Probability threshold for detections (ok) prob_threshold = args.prob_threshold ### TODO: Load the model through `infer_network` ### (ok) infer_network.load_model(args.model, device="CPU", cpu_extension=args.cpu_extension) n, c, h, w = infer_network.get_input_shape() ### TODO: Handle the input stream ### (ok) if args.input == 'CAM': input_stream = 0 elif args.input.endswith('.jpg') or args.input.endswith('.bmp'): single_image_mode = True input_stream = args.input else: input_stream = args.input #assert os.path.isfile(args.input), "Specified input file doesn't exist" cap = cv2.VideoCapture(input_stream) cap.open(input_stream) ### TODO: Loop until stream is over ###(ok) while cap.isOpened(): ### TODO: Read from the video capture ###(ok) ret, frame = cap.read() key_pressed = cv2.waitKey(60) if not ret: break ### TODO: Pre-process the image as needed ###(ok) image = cv2.resize(frame, (w, h)) image = image.transpose((2, 0, 1)) image = image.reshape((n, c, h, w)) ### TODO: Start asynchronous inference for specified request ###(ok) infer_network.exec_net(image) ### TODO: Wait for the result ###(ok) if infer_network.wait() == 0: ### TODO: Get the results of the inference request ###(ok) result = infer_network.get_output() ### TODO: Extract any desired stats from the results ###(ok) boxes, score = post_detection(result, frame.shape, prob_threshold) for box in boxes: xmin, ymin, xmax, ymax = box cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (0, 238, 255), 1) ### TODO: Calculate and send relevant information on ###(ok) ### current_count, total_count and duration to the MQTT server ### if len(boxes) != current_count: if isFirst: ts1 = time.time() isFirst = False if time.time() - ts1 > 0.5: current_count = len(boxes) isFirst = True ### Topic "person": keys of "count" and "total" ###(ok) if current_count > last_count: start_time = time.time() total_count = total_count + current_count - last_count client.publish("person", json.dumps({"total": total_count})) ### Topic "person/duration": key of "duration" ###(ok) if current_count < last_count: duration = int(time.time() - start_time) # Publish messages to the MQTT server client.publish("person/duration", json.dumps({"duration": duration})) client.publish("person", json.dumps({"count": current_count})) last_count = current_count if key_pressed == ord('q'): break ### TODO: Send the frame to the FFMPEG server ###(ok) sys.stdout.buffer.write(frame) sys.stdout.flush() ### TODO: Write an output image if `single_image_mode` ###(ok) if single_image_mode: cv2.imwrite('output_image.jpg', frame) cap.release() cv2.destroyAllWindows() client.disconnect() infer_network.clean()
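# All of these variants lean on a thin `Network` wrapper whose definition is
# not shown here. A plausible minimal sketch against the legacy (pre-2022)
# OpenVINO Inference Engine Python API follows; it matches the variants that
# call exec_net(image) without a request id (the multi-request versions put
# the request id first), and the exact API surface varies by OpenVINO release.
from openvino.inference_engine import IECore


class Network:
    """Thin wrapper over the legacy Inference Engine API (sketch)."""

    def __init__(self):
        self.ie = IECore()
        self.net = None
        self.exec_network = None
        self.input_blob = None
        self.output_blob = None

    def load_model(self, model_xml, device="CPU", cpu_extension=None):
        if cpu_extension and "CPU" in device:
            self.ie.add_extension(cpu_extension, "CPU")
        self.net = self.ie.read_network(
            model=model_xml, weights=model_xml.replace(".xml", ".bin"))
        self.input_blob = next(iter(self.net.input_info))
        self.output_blob = next(iter(self.net.outputs))
        self.exec_network = self.ie.load_network(self.net, device)

    def get_input_shape(self):
        return self.net.input_info[self.input_blob].input_data.shape

    def exec_net(self, image, request_id=0):
        self.exec_network.start_async(request_id=request_id,
                                      inputs={self.input_blob: image})

    def wait(self, request_id=0):
        return self.exec_network.requests[request_id].wait(-1)

    def get_output(self, request_id=0):
        return self.exec_network.requests[request_id].output_blobs[
            self.output_blob].buffer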
def infer_on_stream(args, client): """ Initialize the inference network, stream video to network, and output stats and video. :param args: Command line arguments parsed by `build_argparser()` :param client: MQTT client :return: None """ # declaring variables to count the people and duration total_number = 0 last_number = 0 missed_number = 0 start = 0 duration = 0 frame_number = 0 # Initialise the class infer_network = Network() # Set Probability threshold for detections prob_threshold = args.prob_threshold ### TODO: Load the model through `infer_network` ### infer_network.load_model(args.model, args.device, args.cpu_extension) net_input_shape = infer_network.get_input_shape() # This applies only for faster rcnn since it outputs two things for # input shape: image: [1, 3] and image tensor: [1, 3, 600, 600] # We need image tensor # input_shape = net_input_shape['image_tensor'] ### TODO: Handle the input stream ### single_image_mode = False if args.input == 'CAM': args.input = 0 elif args.input.endswith('.jpg') or args.input.endswith('.bmp'): single_image_mode = True else: # Check the input value assert os.path.isfile(args.input), "Input file doesn't exist..." captured = cv2.VideoCapture(args.input) captured.open(args.input) # Grab the shape of the input width = int(captured.get(3)) height = int(captured.get(4)) # Processing the video # Create a video writer for the output video # if not single_image_mode: # # out = cv2.VideoWriter('out.mp4', 0x00000021, 30, (width, height)) # for linux # out = cv2.VideoWriter('out_frcnn.mp4', cv2.VideoWriter_fourcc( # 'M', 'J', 'P', 'G'), 30, (width, height)) # for Mac # else: # out = None ### TODO: Loop until stream is over ### while captured.isOpened(): ### TODO: Read from the video capture ### flag, frame = captured.read() frame_number += 1 if not flag: break key_pressed = cv2.waitKey(60) ### TODO: Pre-process the image as needed ### p_frame = cv2.resize( frame, (net_input_shape[3], net_input_shape[2])) # for SSD model # p_frame = cv2.resize( # frame, (input_shape[3], input_shape[2])) # for faster rcnn p_frame = p_frame.transpose((2, 0, 1)) p_frame = p_frame.reshape(1, *p_frame.shape) # Input to the network (only required for faster rcnn) # network_input_data = {'image_tensor': p_frame, # 'image_info': p_frame.shape[1:]} # request id for making inferences request_id = 0 ### TODO: Start asynchronous inference for specified request ### # Start asynchronous inference for specified request. 
infer_start = time.time()
        infer_network.exec_net(request_id, p_frame)  # for SSD
        # infer_network.exec_net(request_id, network_input_data)  # for faster rcnn

        ### TODO: Wait for the result ###
        if infer_network.wait(request_id) == 0:
            ### TODO: Get the results of the inference request ###
            det_time = time.time() - infer_start
            result = infer_network.get_output()

            ### TODO: Extract any desired stats from the results ###
            # Draw bounding boxes
            frame, current_number = draw_bbox(prob_threshold, result, frame,
                                              width, height)
            inf_time_message = "Inference time: {:.3f}ms".format(det_time * 1000)
            cv2.putText(frame, inf_time_message, (15, 15),
                        cv2.FONT_HERSHEY_COMPLEX, 0.5, (200, 10, 10), 1)

            ### TODO: Calculate and send relevant information on ###
            ### current_count, total_count and duration to the MQTT server ###
            ### Topic "person": keys of "count" and "total" ###
            ### Topic "person/duration": key of "duration" ###
            # When a person enters the frame
            if current_number > last_number:
                start = time.time()
                total_number += current_number - last_number
                client.publish("person", json.dumps({"total": total_number}))
            # When a person leaves the frame
            if current_number == 0 and last_number != 0:
                missed_number += 1
                # Wait a few frames to make sure the person has actually left;
                # the threshold should be larger for SSD because of its higher
                # false-negative rate
                if missed_number >= 30:  # use 30 for SSD and 5 for faster rcnn
                    duration = int(time.time() - start)
                    client.publish("person/duration",
                                   json.dumps({"duration": duration}))
                    # Reset the dropped-frame counter and update the last number
                    missed_number = 0
                    last_number = current_number
            else:
                # Publish the current count and update the last number
                client.publish("person", json.dumps({"count": current_number}))
                last_number = current_number

            # Write out the frame
            # out.write(frame)

            # Break if escape key pressed
            if key_pressed == 27:
                break

        ### TODO: Send the frame to the FFMPEG server ###
        # Resize the frame according to the video
        frame = cv2.resize(frame, (768, 432))
        sys.stdout.buffer.write(frame)
        sys.stdout.flush()

    ### TODO: Write an output image if `single_image_mode` ###
    if single_image_mode:
        # Write the original frame, not the transposed network input
        cv2.imwrite("output_image.jpg", frame)

    # Release the capture and destroy any OpenCV windows
    captured.release()
    cv2.destroyAllWindows()
    # TODO: Disconnect from MQTT
    client.disconnect()
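# The commented-out Faster R-CNN path above feeds two named inputs. For the
# record, `image_info` is typically [H, W, scale] (a [1, 3] float tensor),
# not the tensor's shape tuple that the comment suggests. A hedged sketch of
# preparing that feed, with shapes taken from the comments above:
import numpy as np


def make_frcnn_inputs(p_frame):
    """p_frame: preprocessed tensor of shape [1, C, H, W]."""
    _, _, h, w = p_frame.shape
    image_info = np.asarray([[h, w, 1.0]], dtype=np.float32)  # [1, 3]
    return {"image_tensor": p_frame, "image_info": image_info}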
def infer_on_stream(args, client): """ Initialize the inference network, stream video to network, and output stats and video. :param args: Command line arguments parsed by `build_argparser()` :param client: MQTT client :return: None """ # Input arguments modelArgs = args.model deviceArgs = args.device cpuExtensionArgs = args.cpu_extension propThresholdArgs = args.prob_threshold filePathArgs = args.input # Initialise the class infer_network = Network() #Load the model through `infer_network` infer_network.load_model(modelArgs, deviceArgs, cpuExtensionArgs) net_input_shape = infer_network.get_input_shape() # Set Probability threshold for detections prob_threshold = propThresholdArgs # Handle image, video or webcam # Create a flag for single images # Flag for the input image single_image_mode = False # Check if the input is a webcam if filePathArgs == 'CAM': filePathArgs = 0 elif filePathArgs.endswith('.jpg') or filePathArgs.endswith('.bmp'): single_image_mode = True # Handle the input stream # Get and open video capture capture = cv2.VideoCapture(filePathArgs) capture.open(filePathArgs) # Grab the shape of the input width = int(capture.get(3)) height = int(capture.get(4)) # initlise some variable report = 0 counter = 0 counter_prev = 0 duration_prev = 0 counter_total = 0 dur = 0 request_id = 0 # Process frames until the video ends, or process is exited while capture.isOpened(): # Read the next frame flag, frame = capture.read() if not flag: break key_pressed = cv2.waitKey(60) # Pre-process the frame #Re-size the frame to inputshape_width x inputshape_height p_frame = cv2.resize(frame, (net_input_shape[3], net_input_shape[2])) p_frame = p_frame.transpose((2, 0, 1)) p_frame = p_frame.reshape(1, *p_frame.shape) #Start asynchronous inference for specified request #Perform inference on the frame duration_report = None inf_start = time.time() infer_network.exec_net(p_frame) # Get the output of inference if infer_network.wait() == 0: det_time = time.time() - inf_start # Results of the output layer of the network output_results = infer_network.get_output() #Extract any desired stats from the results #Update the frame to include detected bounding boxes frame_with_box, pointer = draw_boxes(frame, output_results, prob_threshold, width, height) #Display inference time inf_time_message = "Manasse_Ngudia | Inference time: {:.3f}ms"\ .format(det_time * 1000) cv2.putText(frame_with_box, inf_time_message, (15, 15), cv2.FONT_HERSHEY_COMPLEX, 0.45, (200, 10, 10), 1) #Calculate and send relevant information on ### current_count, total_count and duration to the MQTT server ### ### Topic "person": keys of "count" and "total" ### ### Topic "person/duration": key of "duration" ### if pointer != counter: counter_prev = counter counter = pointer if dur >= 3: duration_prev = dur dur = 0 else: dur = duration_prev + dur duration_prev = 0 # unknown, not needed in this case else: dur += 1 if dur >= 3: report = counter if dur == 3 and counter > counter_prev: counter_total += counter - counter_prev elif dur == 3 and counter < counter_prev: duration_report = int((duration_prev / 10.0) * 1000) client.publish('person', payload=json.dumps({ 'count': report, 'total': counter_total }), qos=0, retain=False) if duration_report is not None: client.publish('person/duration', payload=json.dumps( {'duration': duration_report}), qos=0, retain=False) #Send frame to the ffmpeg server # Resize the frame #frame = cv2.resize(frame, (768, 432)) sys.stdout.buffer.write(frame_with_box) sys.stdout.flush() if single_image_mode: 
cv2.imwrite('output_image.jpg', frame_with_box)

        # Break if escape key pressed
        if key_pressed == 27:
            break

    # Release the capture and destroy any OpenCV windows
    capture.release()
    cv2.destroyAllWindows()
    client.disconnect()
def infer_on_stream(args, client):
    """
    Initialize the inference network, stream video to network,
    and output stats and video.

    :param args: Command line arguments parsed by `build_argparser()`
    :param client: MQTT client
    :return: None
    """
    # Initialise the class
    infer_network = Network()
    # Set Probability threshold for detections
    prob_threshold = args.prob_threshold

    # Counters must be initialised once, before the frame loop
    single_image_mode = False
    cur_request_id = 0
    last_count = 0
    total_count = 0
    start_time = 0

    ### TODO: Load the model through `infer_network` ###
    infer_network.load_model(args.model, args.device, args.cpu_extension)
    net_input_shape = infer_network.get_input_shape()

    ### TODO: Handle the input stream ###
    if args.input == 'CAM':
        input_stream = 0
    # Checks for input image
    elif args.input.endswith('.jpg') or args.input.endswith('.bmp'):
        single_image_mode = True
        input_stream = args.input
    # Checks for video file
    else:
        input_stream = args.input
        assert os.path.isfile(args.input), "Specified input file doesn't exist"

    cap = cv2.VideoCapture(input_stream)
    if input_stream:
        cap.open(args.input)
    width = int(cap.get(3))
    height = int(cap.get(4))

    ### TODO: Loop until stream is over ###
    while cap.isOpened():
        ### TODO: Read from the video capture ###
        flag, frame = cap.read()
        if not flag:
            break
        key_pressed = cv2.waitKey(60)

        ### TODO: Pre-process the image as needed ###
        p_frame = cv2.resize(frame, (net_input_shape[3], net_input_shape[2]))
        p_frame = p_frame.transpose((2, 0, 1))
        p_frame = p_frame.reshape(1, *p_frame.shape)

        ### TODO: Start asynchronous inference for specified request ###
        time_start = time.time()
        infer_network.exec_net(cur_request_id, p_frame)

        ### TODO: Wait for the result ###
        if infer_network.wait(cur_request_id) == 0:
            total_time = time.time() - time_start
            ### TODO: Get the results of the inference request ###
            result = infer_network.get_output(cur_request_id)
            if args.perf_counts:
                perf_count = infer_network.performance_counter(cur_request_id)
                performance_counts(perf_count)

            ### TODO: Extract any desired stats from the results ###
            # Parse the detections against the original frame, not the
            # transposed network input
            frame, current_count = ssd_out(frame, result, width, height)
            inf_time_message = "Inference time: {:.3f}ms".format(
                total_time * 1000)
            cv2.putText(frame, inf_time_message, (15, 15),
                        cv2.FONT_HERSHEY_COMPLEX, 0.5, (200, 10, 10), 1)

            ### TODO: Calculate and send relevant information on ###
            ### current_count, total_count and duration to the MQTT server ###
            ### Topic "person": keys of "count" and "total" ###
            ### Topic "person/duration": key of "duration" ###
            if current_count > last_count:
                start_time = time.time()
                total_count = total_count + current_count - last_count
                client.publish("person", json.dumps({"total": total_count}))
            # Person duration in the video is calculated
            if current_count < last_count:
                duration = int(time.time() - start_time)
                # Publish messages to the MQTT server
                client.publish("person/duration",
                               json.dumps({"duration": duration}))
            client.publish("person", json.dumps({"count": current_count}))
            last_count = current_count
            if key_pressed == 27:
                break

        ### TODO: Send the frame to the FFMPEG server ###
        sys.stdout.buffer.write(frame)
        sys.stdout.flush()

    ### TODO: Write an output image if `single_image_mode` ###
    if single_image_mode:
        cv2.imwrite('output_image.jpg', frame)

    cap.release()
    cv2.destroyAllWindows()
    client.disconnect()
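# `performance_counts`, used above, is not shown. It presumably pretty-prints
# the per-layer profile dict that the legacy IE API returns from
# get_perf_counts(); a minimal sketch (the column choice is ours):
def performance_counts(perf_count):
    print("{:<40} {:<15} {:<15} {:<10}".format(
        "layer name", "layer type", "exec type", "real time (us)"))
    for name, stats in perf_count.items():
        print("{:<40} {:<15} {:<15} {:<10}".format(
            name, stats["layer_type"], stats["exec_type"], stats["real_time"]))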
def main(): """ Load the network and parse the output. :return: None """ get_args() global is_async_mode nextReq = 1 currReq = 0 nextReq_s = 1 currReq_s = 0 prevVideo = None vid_finished = [False] * len(videos) min_FPS = min( [videos[i][1].video.get(cv2.CAP_PROP_FPS) for i in range(len(videos))]) # Initialise the class infer_network = Network() infer_network_safety = Network() # Load the network to IE plugin to get shape of input layer plugin, (batch_size, channels, model_height, model_width) = \ infer_network.load_model(conf_modelLayers, targetDevice, 1, 1, 2, cpu_extension) if use_safety_model: batch_size_sm, channels_sm, model_height_sm, model_width_sm = \ infer_network_safety.load_model(conf_safety_modelLayers, targetDevice, 1, 1, 2, cpu_extension, plugin)[1] while True: for index, currVideo in videos: # Read image from video/cam vfps = int(round(currVideo.video.get(cv2.CAP_PROP_FPS))) for i in range(0, int(round(vfps / min_FPS))): ret, current_img = currVideo.video.read() if not ret: vid_finished[index] = True break if vid_finished[index]: stream_end_frame = np.zeros( (int(currVideo.height), int(currVideo.width), 1), dtype='uint8') cv2.putText( stream_end_frame, "Input file {} has ended".format( name_of_videos[index][1].split('/')[-1]), (10, int(currVideo.height / 2)), cv2.FONT_HERSHEY_COMPLEX, 1, (255, 255, 255), 2) cv2.imshow(currVideo.name, stream_end_frame) continue # Transform image to person detection model input rsImg = cv2.resize(current_img, (model_width, model_height)) rsImg = rsImg.transpose((2, 0, 1)) rsImg = rsImg.reshape( (batch_size, channels, model_height, model_width)) infer_start_time = datetime.datetime.now() # Infer current image if is_async_mode: infer_network.exec_net(nextReq, rsImg) else: infer_network.exec_net(currReq, rsImg) prevVideo = currVideo previous_img = current_img # Wait for previous request to end if infer_network.wait(currReq) == 0: infer_end_time = (datetime.datetime.now() - infer_start_time) * 1000 in_frame_workers = [] people = 0 violations = 0 hard_hat_detection = False vest_detection = False result = infer_network.get_output(currReq) # Filter output for obj in result[0][0]: if obj[2] > conf_inferConfidenceThreshold: xmin = int(obj[3] * prevVideo.width) ymin = int(obj[4] * prevVideo.height) xmax = int(obj[5] * prevVideo.width) ymax = int(obj[6] * prevVideo.height) xmin = int(xmin - padding) if (xmin - padding) > 0 else 0 ymin = int(ymin - padding) if (ymin - padding) > 0 else 0 xmax = int(xmax + padding) if ( xmax + padding) < prevVideo.width else prevVideo.width ymax = int(ymax + padding) if ( ymax + padding) < prevVideo.height else prevVideo.height cv2.rectangle(previous_img, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2) people += 1 in_frame_workers.append((xmin, ymin, xmax, ymax)) new_frame = previous_img[ymin:ymax, xmin:xmax] if use_safety_model: # Transform image to safety model input in_frame_sm = cv2.resize( new_frame, (model_width_sm, model_height_sm)) in_frame_sm = in_frame_sm.transpose((2, 0, 1)) in_frame_sm = in_frame_sm.reshape( (batch_size_sm, channels_sm, model_height_sm, model_width_sm)) infer_start_time_sm = datetime.datetime.now() if is_async_mode: infer_network_safety.exec_net( nextReq_s, in_frame_sm) else: infer_network_safety.exec_net( currReq_s, in_frame_sm) # Wait for the result infer_network_safety.wait(currReq_s) infer_end_time_sm = (datetime.datetime.now() - infer_start_time_sm) * 1000 result_sm = infer_network_safety.get_output( currReq_s) # Filter output hard_hat_detection = False vest_detection = False detection_list = 
[] for obj_sm in result_sm[0][0]: if (obj_sm[2] > 0.4): # Detect safety vest if (int(obj_sm[1])) == 2: xmin_sm = int(obj_sm[3] * (xmax - xmin)) ymin_sm = int(obj_sm[4] * (ymax - ymin)) xmax_sm = int(obj_sm[5] * (xmax - xmin)) ymax_sm = int(obj_sm[6] * (ymax - ymin)) if vest_detection == False: detection_list.append([ xmin_sm + xmin, ymin_sm + ymin, xmax_sm + xmin, ymax_sm + ymin ]) vest_detection = True # Detect hard-hat if int(obj_sm[1]) == 4: xmin_sm_v = int(obj_sm[3] * (xmax - xmin)) ymin_sm_v = int(obj_sm[4] * (ymax - ymin)) xmax_sm_v = int(obj_sm[5] * (xmax - xmin)) ymax_sm_v = int(obj_sm[6] * (ymax - ymin)) if hard_hat_detection == False: detection_list.append([ xmin_sm_v + xmin, ymin_sm_v + ymin, xmax_sm_v + xmin, ymax_sm_v + ymin ]) hard_hat_detection = True if hard_hat_detection is False or vest_detection is False: violations += 1 for _rect in detection_list: cv2.rectangle(current_img, (_rect[0], _rect[1]), (_rect[2], _rect[3]), (0, 255, 0), 2) if is_async_mode: currReq_s, nextReq_s = nextReq_s, currReq_s # Use OpenCV if worker-safety-model is not provided else: violations = detect_workers( in_frame_workers, previous_img) # Check if detected violations equals previous frames if violations == prevVideo.currentViolationCount: prevVideo.currentViolationCountConfidence += 1 # If frame threshold is reached, change validated count if prevVideo.currentViolationCountConfidence == conf_inFrameViolationsThreshold: # If another violation occurred, save image if prevVideo.currentViolationCount > prevVideo.prevViolationCount: prevVideo.totalViolations += ( prevVideo.currentViolationCount - prevVideo.prevViolationCount) prevVideo.prevViolationCount = prevVideo.currentViolationCount else: prevVideo.currentViolationCountConfidence = 0 prevVideo.currentViolationCount = violations # Check if detected people count equals previous frames if people == prevVideo.currentPeopleCount: prevVideo.currentPeopleCountConfidence += 1 # If frame threshold is reached, change validated count if prevVideo.currentPeopleCountConfidence == conf_inFrameViolationsThreshold: prevVideo.currentTotalPeopleCount += ( prevVideo.currentPeopleCount - prevVideo.prevPeopleCount) if prevVideo.currentTotalPeopleCount > prevVideo.prevPeopleCount: prevVideo.totalPeopleCount += prevVideo.currentTotalPeopleCount - prevVideo.prevPeopleCount prevVideo.prevPeopleCount = prevVideo.currentPeopleCount else: prevVideo.currentPeopleCountConfidence = 0 prevVideo.currentPeopleCount = people frame_end_time = datetime.datetime.now() cv2.putText( previous_img, 'Total people count: ' + str(prevVideo.totalPeopleCount), (10, prevVideo.height - 10), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2) cv2.putText( previous_img, 'Current people count: ' + str(prevVideo.currentTotalPeopleCount), (10, prevVideo.height - 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2) cv2.putText( previous_img, 'Total violation count: ' + str(prevVideo.totalViolations), (10, prevVideo.height - 70), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2) cv2.putText( previous_img, 'FPS: %0.2fs' % (1 / (frame_end_time - prevVideo.frame_start_time).total_seconds()), (10, prevVideo.height - 100), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2) cv2.putText(previous_img, "Inference time: N\A for async mode" if is_async_mode else\ "Inference time: {:.3f} ms".format((infer_end_time).total_seconds()), (10, prevVideo.height - 130), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2) cv2.imshow(prevVideo.name, previous_img) prevVideo.frame_start_time = datetime.datetime.now() # Swap if 
is_async_mode: currReq, nextReq = nextReq, currReq previous_img = current_img prevVideo = currVideo if cv2.waitKey(1) == 27: print("Attempting to stop input files") infer_network.clean() infer_network_safety.clean() cv2.destroyAllWindows() return if False not in vid_finished: infer_network.clean() infer_network_safety.clean() cv2.destroyAllWindows() break
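# `detect_workers` (the OpenCV fallback used when no safety model is given)
# is not shown above. A purely illustrative sketch using an HSV colour mask
# for hi-vis clothing; the real heuristic, colour band, and 5% cutoff may
# differ entirely.
import cv2
import numpy as np


def detect_workers(worker_boxes, image):
    """Count boxes whose crop lacks enough hi-vis yellow pixels."""
    violations = 0
    for xmin, ymin, xmax, ymax in worker_boxes:
        crop = image[ymin:ymax, xmin:xmax]
        if crop.size == 0:
            continue
        hsv = cv2.cvtColor(crop, cv2.COLOR_BGR2HSV)
        mask = cv2.inRange(hsv, (20, 100, 100), (35, 255, 255))  # yellow band
        if cv2.countNonZero(mask) / mask.size < 0.05:  # under 5% hi-vis pixels
            violations += 1
    return violations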
def main(): # Plugin initialization for specified device and load extensions library global rolling_log global TARGET_DEVICE global videoCapsJson env_parser() check_args() parse_conf_file() if TARGET_DEVICE not in acceptedDevices: print("Unsupporterd device " + TARGET_DEVICE + ". Defaulting to CPU") TARGET_DEVICE = 'CPU' # Initialize the class infer_network = Network() # Load the network to IE Plugin n, c, h, w = infer_network.load_model(model_xml, TARGET_DEVICE, 1, 1, 2, CPU_EXTENSION)[1] minFPS = min([i.cap.get(cv2.CAP_PROP_FPS) for i in videoCaps]) waitTime = int( round(1000 / minFPS / len(videoCaps))) # wait time in ms between showing frames for vc in videoCaps: vc.init_vw(h, w, minFPS) statsWidth = w if w > 345 else 345 statsHeight = h if h > (len(videoCaps) * 20 + 15) else ( len(videoCaps) * 20 + 15) statsVideo = cv2.VideoWriter(os.path.join('resources', 'Statistics.mp4'), 0x00000021, minFPS, (statsWidth, statsHeight), True) if not statsVideo.isOpened(): print("Couldn't open stats video for writing") sys.exit(4) # Read the labels file if labels_file: with open(labels_file, 'r') as f: labels_map = [x.strip() for x in f] else: labels_map = None # Init a rolling log to store events rolling_log_size = int((h - 15) / 20) rolling_log = collections.deque(maxlen=rolling_log_size) # Init inference request IDs cur_request_id = 0 next_request_id = 1 # Start with async mode enabled is_async_mode = True if not UI_OUTPUT: # Arrange windows so they are not overlapping arrange_windows(w, h) print("To stop the execution press Esc button") for idx, vc in enumerate(videoCaps): vc.start_time = datetime.datetime.now() vc.pos = idx if UI_OUTPUT: videoCapsJson = videoCaps.copy() while True: # If all video captures are closed stop the loop no_more_data = [videoCap.closed for videoCap in videoCaps] # loop over all video captures for idx, videoCapInfer in enumerate(videoCaps): # read the next frame vfps = int(round(videoCapInfer.cap.get(cv2.CAP_PROP_FPS))) for i in range(0, int(round(vfps / minFPS))): ret, frame = videoCapInfer.cap.read() videoCapInfer.cur_frame_count += 1 # If the read failed close the program if not ret: no_more_data[idx] = True break if no_more_data[idx]: if UI_OUTPUT: videoCaps.pop(idx) continue else: stream_end_frame = np.zeros((h, w, 1), dtype='uint8') cv2.putText( stream_end_frame, "Input file {} has ended".format( videoCapInfer.cap_name), (20, 150), cv2.FONT_HERSHEY_COMPLEX, 0.5, (255, 255, 255), 1) cv2.imshow(videoCapInfer.cap_name, stream_end_frame) cv2.waitKey(waitTime) videoCaps.pop(idx) continue # Copy the current frame for later use videoCapInfer.cur_frame = frame.copy() videoCapInfer.initial_w = videoCapInfer.cap.get(3) videoCapInfer.initial_h = videoCapInfer.cap.get(4) # Resize and change the data layout so it is compatible in_frame = cv2.resize(videoCapInfer.cur_frame, (w, h)) in_frame = in_frame.transpose( (2, 0, 1)) # Change data layout from HWC to CHW in_frame = in_frame.reshape((n, c, h, w)) infer_start = datetime.datetime.now() if is_async_mode: # Async enabled and only one video capture infer_network.exec_net(next_request_id, in_frame) if (len(videoCaps) == 1): videoCapResult = videoCapInfer # Async enabled and more than one video capture else: # Get previous index videoCapResult = videoCaps[idx - 1 if idx - 1 >= 0 else len(videoCaps) - 1] else: # Async disabled infer_network.exec_net(next_request_id, in_frame) videoCapResult = videoCapInfer if infer_network.wait(cur_request_id) == 0: infer_end = datetime.datetime.now() res = infer_network.get_output(cur_request_id) 
infer_duration = infer_end - infer_start
                current_count = 0
                # Parse detection results of the current request
                for obj in res[0][0]:
                    class_id = int(obj[1])
                    # Draw only objects whose probability exceeds the threshold
                    if (obj[2] > PROB_THRESHOLD
                            and videoCapResult.req_label in labels_map
                            and labels_map.index(videoCapResult.req_label) ==
                            class_id - 1):
                        current_count += 1
                        xmin = int(obj[3] * videoCapResult.initial_w)
                        ymin = int(obj[4] * videoCapResult.initial_h)
                        xmax = int(obj[5] * videoCapResult.initial_w)
                        ymax = int(obj[6] * videoCapResult.initial_h)
                        # Draw box
                        cv2.rectangle(videoCapResult.cur_frame, (xmin, ymin),
                                      (xmax, ymax), (0, 255, 0), 4, 16)

                if videoCapResult.candidate_count == current_count:
                    videoCapResult.candidate_confidence += 1
                else:
                    videoCapResult.candidate_confidence = 0
                    videoCapResult.candidate_count = current_count

                if videoCapResult.candidate_confidence == FRAME_THRESHOLD:
                    videoCapResult.candidate_confidence = 0
                    if current_count > videoCapResult.last_correct_count:
                        videoCapResult.total_count += (
                            current_count - videoCapResult.last_correct_count)
                    if current_count != videoCapResult.last_correct_count:
                        if UI_OUTPUT:
                            currtime = datetime.datetime.now().strftime(
                                "%H:%M:%S")
                            fr = FrameInfo(videoCapResult.frames,
                                           current_count, currtime)
                            videoCapResult.countAtFrame.append(fr)
                        new_objects = (current_count -
                                       videoCapResult.last_correct_count)
                        for _ in range(new_objects):
                            strng = "{} - {} detected on {}".format(
                                time.strftime("%H:%M:%S"),
                                videoCapResult.req_label,
                                videoCapResult.cap_name)
                            rolling_log.append(strng)
                    videoCapResult.frames += 1
                    videoCapResult.last_correct_count = current_count
                else:
                    videoCapResult.frames += 1

                videoCapResult.cur_frame = cv2.resize(
                    videoCapResult.cur_frame, (w, h))

                if UI_OUTPUT:
                    imgName = videoCapResult.cap_name
                    imgName = imgName.split()[0] + "_" + chr(
                        ord(imgName.split()[1]) + 1)
                    imgName += "_" + str(videoCapResult.frames)
                    frameNames.append(imgName)
                    imgName = CONF_VIDEODIR + imgName + ".jpg"
                    cv2.imwrite(imgName, videoCapResult.cur_frame)
                    videoCapsJson[videoCapResult.pos].countAtFrame = \
                        videoCapResult.countAtFrame
                    a = saveJSON()
                    if a:
                        return a

                if not UI_OUTPUT:
                    # Add log text to each frame
                    log_message = "Async mode is on." if is_async_mode else \
                                  "Async mode is off."
cv2.putText(videoCapResult.cur_frame, log_message, (15, 15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) log_message = "Total {} count: {}" \ .format(videoCapResult.req_label, videoCapResult.total_count) cv2.putText(videoCapResult.cur_frame, log_message, (10, h - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) log_message = "Current {} count: {}" \ .format(videoCapResult.req_label, videoCapResult.last_correct_count) cv2.putText(videoCapResult.cur_frame, log_message, (10, h - 30), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) cv2.putText( videoCapResult.cur_frame, 'Infer wait: %0.3fs' % (infer_duration.total_seconds()), (10, h - 70), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) # Display inferred frame and stats stats = numpy.zeros((statsHeight, statsWidth, 1), dtype='uint8') for i, log in enumerate(rolling_log): cv2.putText(stats, log, (10, i * 20 + 15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) cv2.imshow(STATS_WINDOW_NAME, stats) if idx == 0: stats = cv2.cvtColor(stats, cv2.COLOR_GRAY2BGR) statsVideo.write(stats) end_time = datetime.datetime.now() cv2.putText( videoCapResult.cur_frame, 'FPS: %0.2fs' % (1 / (end_time - videoCapResult.start_time).total_seconds()), (10, h - 50), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) cv2.imshow(videoCapResult.cap_name, videoCapResult.cur_frame) videoCapResult.start_time = datetime.datetime.now() videoCapResult.video.write(videoCapResult.cur_frame) # Wait if necessary for the required time key = cv2.waitKey(waitTime) # Esc key pressed if key == 27: cv2.destroyAllWindows() infer_network.clean() print("Finished") return # Tab key pressed if key == 9: is_async_mode = not is_async_mode print("Switched to {} mode".format( "async" if is_async_mode else "sync")) if is_async_mode: # Swap infer request IDs cur_request_id, next_request_id = next_request_id, cur_request_id # Loop video if LOOP_VIDEO = True and input isn't live from USB camera if LOOP_VIDEO and not videoCapInfer.is_cam: vfps = int(round(videoCapInfer.cap.get(cv2.CAP_PROP_FPS))) # If a video capture has ended restart it if (videoCapInfer.cur_frame_count > videoCapInfer.cap.get(cv2.CAP_PROP_FRAME_COUNT) - int(round(vfps / minFPS))): videoCapInfer.cur_frame_count = 0 videoCapInfer.cap.set(cv2.CAP_PROP_POS_FRAMES, 0) if False not in no_more_data: break infer_network.clean() cv2.destroyAllWindows()
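# The request-ID ping-pong used above is the heart of async mode: issue frame
# N on one request while reading frame N-1's result from the other. Distilled
# into a standalone helper (variable and function names are illustrative):
cur_request_id, next_request_id = 0, 1


def process_async(infer_network, in_frame):
    global cur_request_id, next_request_id
    infer_network.exec_net(next_request_id, in_frame)   # start frame N
    result = None
    if infer_network.wait(cur_request_id) == 0:         # finish frame N-1
        result = infer_network.get_output(cur_request_id)
    # Swap the two request IDs for the next frame
    cur_request_id, next_request_id = next_request_id, cur_request_id
    return result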
class FaceDetectionModel:
    '''
    Class for the Face Detection Model.
    '''

    def __init__(self, model_name, device='CPU', extensions=None):
        '''
        Set the instance variables.
        '''
        self.model_xml = model_name
        self.device = device
        self.extensions = extensions
        # Initialise the inference wrapper
        self.infer_network = Network()

    def load_model(self):
        '''
        Load the model to the device specified by the user.
        If the model requires any plugins, this is where they are loaded.
        '''
        self.infer_network.load_model(self.model_xml, self.device,
                                      self.extensions)

    def predict(self, image):
        '''
        Run a prediction on the (already preprocessed) input image.
        '''
        self.infer_network.exec_net(image)
        # Wait for the result
        if self.infer_network.wait() == 0:
            return self.infer_network.get_output()[
                self.infer_network.output_blob]
        return None

    def check_model(self):
        raise NotImplementedError

    def preprocess_input(self, image):
        '''
        Resize and reorder the frame to the model's expected input layout,
        e.g. [1x3x384x672] for face-detection-adas-0001.
        '''
        net_input_shape = self.infer_network.get_input_shape()
        p_frame = np.copy(image)
        p_frame = cv2.resize(p_frame, (net_input_shape[3], net_input_shape[2]))
        p_frame = p_frame.transpose((2, 0, 1))
        p_frame = p_frame.reshape(1, *p_frame.shape)
        return p_frame

    def preprocess_output(self, outputs, image, print_flag=True, threshold=0.5):
        '''
        Parse the [1, 1, N, 7] detection output
        ([image_id, label, conf, x_min, y_min, x_max, y_max]) into pixel-space
        face boxes, optionally drawing them on the image.
        '''
        height = image.shape[0]
        width = image.shape[1]
        faceboxes = []
        for i in range(len(outputs[0][0])):
            box = outputs[0][0][i]  # i-th detection
            confidence = box[2]
            if confidence > threshold:
                xmin = int(box[3] * width)
                ymin = int(box[4] * height)
                xmax = int(box[5] * width)
                ymax = int(box[6] * height)
                # Draw the box on the image
                if print_flag:
                    cv2.rectangle(image, (xmin, ymin), (xmax, ymax),
                                  (255, 0, 0), 1)
                faceboxes.append([xmin, ymin, xmax, ymax])
        return image, faceboxes
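# End-to-end usage of FaceDetectionModel as defined above; the model and
# image paths are placeholders, not files from the original project.
import cv2

if __name__ == "__main__":
    model = FaceDetectionModel("models/face-detection-adas-0001.xml")
    model.load_model()
    image = cv2.imread("sample.jpg")
    p_image = model.preprocess_input(image)   # resize + HWC -> [1, C, H, W]
    outputs = model.predict(p_image)          # raw [1, 1, N, 7] detections
    image, faceboxes = model.preprocess_output(outputs, image)
    print("Detected face boxes:", faceboxes)
    cv2.imwrite("faces_out.jpg", image)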