def image_object_detection(in_image, out_image):
    """Detect objects in one image file and write an annotated copy.

    The image at ``in_image`` is read with OpenCV, passed through
    ``resize_input`` and given a leading axis before being fed to a
    YOLO_V2_TINY model built for a ``[1, 416, 416, 3]`` input with weights
    from ``./y2t_weights.pickle``. The squeezed network output goes through
    ``yolov2tiny.postprocessing``; every returned box is drawn on a
    416x416 (cubic-interpolated) copy of the source image, which is saved
    to ``out_image``.

    Each box is indexed as ``box[0]`` = label text, ``box[1]`` / ``box[2]`` =
    the two rectangle corner points, ``box[3]`` = colour.

    Two timings are printed: the inference call alone, and the span from
    preprocessing through drawing (model construction, image loading and
    the final file write are outside that span).
    """
    source = cv2.imread(in_image)
    detector = yolov2tiny.YOLO_V2_TINY([1, 416, 416, 3], "./y2t_weights.pickle")

    # The end-to-end clock starts only after the model has been built.
    e2e_started = time.time()
    batch = np.expand_dims(resize_input(source), axis=0)

    infer_started = time.time()
    raw_output = detector.inference(batch)
    t_inference = time.time() - infer_started

    detections = yolov2tiny.postprocessing(np.squeeze(raw_output))

    # Boxes are drawn on a 416x416 copy, the same size the model is built for.
    canvas = cv2.resize(source, (416, 416), interpolation=cv2.INTER_CUBIC)
    for box in detections:
        label, corner_a, corner_b, colour = box[0], box[1], box[2], box[3]
        canvas = cv2.rectangle(canvas, corner_a, corner_b, colour)

        # Anchor the label just above/left of the box's top-left corner.
        text_x = int(min(corner_a[0], corner_b[0]) - 1)
        text_y = int(min(corner_a[1], corner_b[1])) - 5
        cv2.putText(
            canvas,
            label,
            (text_x, text_y),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            colour,
            1)

    t_end2end = time.time() - e2e_started

    cv2.imwrite(out_image, canvas)
    print('DNN inference elapsed time: %.3f' % t_inference)
    print('End-to-end elapsed time   : %.3f' % t_end2end)
def photo_write(in_video_path, out_photo_path, tensor_path='./intermediate/layer_39.npy'):
    """Annotate the first frame of a video using a saved prediction tensor.

    Reads one frame from ``in_video_path``, loads the tensor stored at
    ``tensor_path`` with ``np.load``, converts it to label boxes through
    ``yolov2tiny.postprocessing``, draws them with ``draw_output_frame`` and
    writes the result to ``out_photo_path``.

    Raises:
        IOError: if no frame could be read from the video.
    """
    in_video = cv2.VideoCapture(in_video_path)
    try:
        ret, frame = in_video.read()
    finally:
        # Only a single frame is needed, so release the capture right away;
        # this also guarantees it is released if anything below fails.
        in_video.release()

    if not ret:
        # Without this check a failed read would hand ``None`` to the
        # drawing code and fail there with a far less helpful error.
        raise IOError(
            "could not read a frame from video: {}".format(in_video_path))

    prediction = np.load(tensor_path)
    label_boxes = yolov2tiny.postprocessing(prediction)
    frame = draw_output_frame(frame, label_boxes)
    cv2.imwrite(out_photo_path, frame)
def video_object_detection(in_video_path: str, out_video_path: str, proc="cpu"):
    """
    Read a video file, run the pretrained yolo_v2_tiny model on each frame,
    draw the detected objects and store the drawn frames in 'out_video_path'.

    For every frame the inference time and the end-to-end time (resize,
    inference, postprocessing, drawing and writing) are measured and printed.
    The tensors produced for the first frame are handed to ``store_tensors``.
    After the last frame, totals, averages and throughput are printed.

    :param in_video_path: path of the video to read.
    :param out_video_path: path of the annotated video to write.
    :param proc: device selector forwarded to ``YOLO_V2_TINY``.

    If no frame can be read, a short notice is printed instead of the
    statistics. The reader and writer are always released, even on error.
    """
    reader, writer = open_video_with_opencv(in_video_path, out_video_path)
    acc = []  # one (inference_seconds, end_to_end_seconds) pair per frame

    try:
        yolo = yolov2tiny.YOLO_V2_TINY((416, 416, 3), "./y2t_weights.pickle", proc)

        width = int(reader.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(reader.get(cv2.CAP_PROP_FRAME_HEIGHT))

        while reader.isOpened():
            okay, original_image = reader.read()
            if not okay:
                break

            beg_start = datetime.now()
            image = resize_input(original_image)

            beg_infer = datetime.now()
            batched_tensors_list = yolo.inference(image)
            inference_time = (datetime.now() - beg_infer).total_seconds()

            # Last tensor in the list is used for detection; index 0 takes its
            # first item (presumably the batch axis, matching the call below).
            tensor = batched_tensors_list[-1][0]
            proposals = yolov2tiny.postprocessing(tensor)
            # NOTE(review): restore_shape presumably maps box coordinates back
            # to the original frame size -- confirm against its definition.
            proposals = restore_shape(proposals, width, height)
            out_image = draw(original_image, proposals)
            writer.write(out_image)

            end_to_end_time = (datetime.now() - beg_start).total_seconds()
            acc.append((inference_time, end_to_end_time))
            print("#{} inference: {:.3f}\tend-to-end: {:.3f}".format(
                len(acc), inference_time, end_to_end_time))

            if len(acc) == 1:
                # Intermediate tensors are stored for the first frame only,
                # after timing so the dump does not skew the measurements.
                store_tensors(t[0] for t in batched_tensors_list)
    finally:
        reader.release()
        writer.release()

    if not acc:
        # Previously reduce() on an empty list raised TypeError here.
        print("No frames were processed.")
        return

    inference_sum = sum(pair[0] for pair in acc)
    end_to_end_sum = sum(pair[1] for pair in acc)
    size = len(acc)

    print("Total inference: {:.3f}s\ttotal end-to-end: {:.3f}s".format(
        inference_sum, end_to_end_sum))
    print("Average inference: {:.3f}s\taverage end-to-end: {:.3f}s".format(
        inference_sum / size, end_to_end_sum / size))
    if end_to_end_sum > 0:
        print("Throughput: {:.3f}fps".format(size / end_to_end_sum))
    return
def photo_object_detection(in_photo_path, out_photo_path, proc="cpu"):
    """Run object detection on a single photo and save the annotated image.

    The photo is read with OpenCV, resized via ``resize_input`` and given a
    leading axis before inference. All tensors returned by the model are
    passed to ``save_tensors``; the last one is postprocessed into label
    boxes, which ``draw_output_frame`` draws on the original photo.

    ``proc`` is forwarded to ``YOLO_V2_TINY`` as the device selector. The
    weight file is looked up at ``../test-proj3/y2t_weights.pickle`` relative
    to the current working directory.
    """
    photo = cv2.imread(in_photo_path)

    weights_file = os.path.join(os.getcwd(), '../test-proj3/y2t_weights.pickle')
    input_shape = [1, k_input_height, k_input_width, 3]
    detector = yolov2tiny.YOLO_V2_TINY(input_shape, weights_file, proc)

    batch = np.expand_dims(resize_input(photo), 0)
    layer_outputs = detector.inference(batch)
    save_tensors(layer_outputs)

    # Only the final tensor feeds the detection postprocessing.
    boxes = yolov2tiny.postprocessing(layer_outputs[-1])
    annotated = draw_output_frame(photo, boxes)
    cv2.imwrite(out_photo_path, annotated)
def video_object_detection(in_video_path, out_video_path, proc="cpu"):
    """Run object detection on every frame of a video and write the result.

    Each frame is resized with ``resize_input``, given a leading axis, run
    through the YOLO_V2_TINY model, postprocessed into label boxes and drawn
    with ``draw_output_frame`` before being written to the output video.
    The tensors from the first frame are passed to ``save_tensors``.

    Accumulated inference time, end-to-end time and the matching
    frames-per-second figures are printed at the end. If no frame could be
    read, a notice is printed instead of the frame rates.

    Args:
        in_video_path: path of the video to read.
        out_video_path: path of the annotated video to write.
        proc: device selector forwarded to ``YOLO_V2_TINY``.
    """
    in_video, out_video = open_video_with_opencv(in_video_path, out_video_path)

    e2e_time = 0
    inference_time = 0
    frame_count = 0

    try:
        # Build the model from the input dimensions, weight file and device.
        weight_pickle_path = os.path.join(
            os.getcwd(), '../test-proj3/y2t_weights.pickle')
        model = yolov2tiny.YOLO_V2_TINY(
            [1, k_input_height, k_input_width, 3], weight_pickle_path, proc)

        while True:
            # The end-to-end clock includes reading the frame.
            e2e_time_start = time.time()
            ret, frame = in_video.read()
            if not ret:
                break
            frame_count += 1

            input_img = resize_input(frame)
            input_img = np.expand_dims(input_img, 0)

            inference_time_start = time.time()
            predictions = model.inference(input_img)
            inference_time += time.time() - inference_time_start

            label_boxes = yolov2tiny.postprocessing(predictions[-1])
            frame = draw_output_frame(frame, label_boxes)
            out_video.write(frame)
            e2e_time += time.time() - e2e_time_start

            # Saving happens after the clock stops so that the tensor dump
            # is excluded from the end-to-end time.
            if frame_count == 1:
                save_tensors(predictions)
    finally:
        # Release the opened videos even if a frame fails to process.
        in_video.release()
        out_video.release()

    if frame_count == 0:
        # Avoids the ZeroDivisionError the fps computation would hit.
        print("No frames were processed.")
        return

    # Report inference performance: elapsed times and frames per second.
    inference_fps = frame_count / inference_time
    e2e_fps = frame_count / e2e_time
    print("Inference time: {}".format(inference_time))
    print("End-to-end time: {}".format(e2e_time))
    print("Inference fps: {}".format(inference_fps))
    print("End-to-end fps: {}".format(e2e_fps))
def video_object_detection(in_video_path, out_video_path, proc="cpu"):
    """Run object detection on every frame of a video and write the result.

    Each frame is passed through ``resize_input`` and the YOLO_V2_TINY model;
    the last output tensor is postprocessed into label boxes. The frame is
    then passed through ``recover_input``, annotated with a rectangle and a
    class label per box, resized to ``dim`` and written to the output video.
    For the first frame, every output tensor is saved as
    ``./intermediate/layer_<n>.npy`` (n starting at 1).

    At the end the accumulated inference time and the number of frames
    processed per second of inference are printed.

    Args:
        in_video_path: path of the video to read.
        out_video_path: path of the annotated video to write.
        proc: device selector forwarded to ``YOLO_V2_TINY``.

    Exits the process via ``sys.exit()`` if the input video is not opened.
    """
    input_video, output_video, dim = open_video_with_opencv(
        in_video_path, out_video_path)

    in_shape = (1, 416, 416, 3)
    pickle_path = "./y2t_weights.pickle"
    total_elapsed_time = 0
    frame_count = 0

    try:
        # Check if the video is opened. Otherwise, exit; the finally clause
        # still releases both video handles on the way out.
        if not input_video.isOpened():
            print('video is not opened')
            sys.exit()

        model = YOLO_V2_TINY(in_shape, pickle_path, proc)

        while True:
            ret, img = input_video.read()
            if not ret:
                break

            img = resize_input(img)

            # Time only the inference call itself.
            start = time.time()
            output_tensors = model.inference(img)
            total_elapsed_time += time.time() - start
            frame_count += 1

            # Dump intermediates for the first frame only, outside the timed
            # window so disk writes do not inflate the inference time.
            if frame_count == 1:
                for layer_no, tensor in enumerate(output_tensors, start=1):
                    np.save(
                        "./intermediate/layer_{}.npy".format(layer_no), tensor)

            label_boxes = postprocessing(output_tensors[-1])

            # NOTE(review): recover_input presumably undoes resize_input's
            # preprocessing so the frame can be drawn on -- confirm.
            img = recover_input(img, dim)
            # NOTE(review): the colour is unpacked as (b, g, r) but passed to
            # OpenCV as (r, g, b), i.e. reversed -- confirm which channel
            # order postprocessing emits.
            for cl, (x1, y1), (x2, y2), (b, g, r) in label_boxes:
                cv2.rectangle(img, (x1, y1), (x2, y2), (r, g, b), 3)
                cv2.putText(img, cl, (x1, y1), cv2.FONT_HERSHEY_COMPLEX,
                            0.5, (0, 0, 0), 1)

            img = cv2.resize(img, dim)
            output_video.write(img)
    finally:
        # Release the opened videos.
        input_video.release()
        output_video.release()

    # Throughput is processed frames over inference time. The previous code
    # divided the source video's FPS property by the elapsed time, which is
    # not a frame rate, and could divide by zero on an empty video.
    if total_elapsed_time > 0:
        performance = frame_count / total_elapsed_time
    else:
        performance = 0.0

    print("Total elapsed time for running inference: {}".format(
        total_elapsed_time))
    print("FPS processed per second: {}".format(performance))