import argparse
import time

import cv2
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA as sklearnPCA

# Detector classes and drawing helpers from the
# Chainer_Realtime_Multi-Person_Pose_Estimation project.
from pose_detector import PoseDetector, draw_person_pose
from hand_detector import HandDetector, draw_hand_keypoints


def convertData(gesture):
    parser = argparse.ArgumentParser(description='Pose detector')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    args = parser.parse_args()

    # load models
    pose_detector = PoseDetector("posenet", "models/coco_posenet.npz", device=args.gpu)
    hand_detector = HandDetector("handnet", "models/handnet.npz", device=args.gpu)

    dataset = buildGestureDict("dataset/")
    gesturedf = pd.read_csv("sample.csv")  # template CSV providing the output columns

    for video in dataset[gesture]["videos"]:
        print("Currently processing the video for " + video["filename"])
        startvideo = time.time()

        cap = cv2.VideoCapture(video["filepath"])
        cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
        cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)
        amount_of_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
        print("Amount of Frames:", amount_of_frames)
        cap.set(cv2.CAP_PROP_FPS, 5)

        ret, img = cap.read()
        counter = 1
        df = pd.DataFrame(columns=["Head", "Left", "Right"])
        newdf = pd.DataFrame()
        # sample roughly 12 evenly spaced frames per video; max() guards
        # against a zero divisor on very short clips
        frame_tracker = max(1, int(amount_of_frames / 12))
        framecounter = 0
        # print(frame_tracker)
        left = 0
        right = 0

        while ret:
            ret, img = cap.read()  # get video frame
            if not ret:
                print("Failed to capture image")
                break

            person_pose_array, _ = pose_detector(img)
            res_img = cv2.addWeighted(img, 0.6,
                                      draw_person_pose(img, person_pose_array), 0.4, 0)

            if counter % frame_tracker == 0:
                # only the first detected person in each sampled frame is processed
                firstPerson = True
                for person_pose in person_pose_array:
                    if not firstPerson:
                        continue

                    unit_length = pose_detector.get_unit_length(person_pose)

                    # hands estimation
                    # print("Estimating hands keypoints...")
                    hands = pose_detector.crop_hands(img, person_pose, unit_length)
                    if hands["left"] is not None:
                        hand_img = hands["left"]["img"]
                        bbox = hands["left"]["bbox"]
                        hand_keypoints = hand_detector(hand_img, hand_type="left")
                        # drop the confidence score, keep integer (x, y) pairs
                        for x in range(len(hand_keypoints)):
                            if hand_keypoints[x] is not None:
                                hand_keypoints[x] = list(np.delete(hand_keypoints[x], 2))
                                hand_keypoints[x] = [int(y) for y in hand_keypoints[x]]
                        res_img = draw_hand_keypoints(res_img, hand_keypoints,
                                                      (bbox[0], bbox[1]))
                        left = hand_keypoints
                        cv2.rectangle(res_img, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
                                      (255, 255, 255), 1)
                    else:
                        # sentinel coordinates for a missing hand (21 keypoints)
                        left = [[1000, 1000] for _ in range(21)]

                    if hands["right"] is not None:
                        hand_img = hands["right"]["img"]
                        bbox = hands["right"]["bbox"]
                        hand_keypoints = hand_detector(hand_img, hand_type="right")
                        for x in range(len(hand_keypoints)):
                            if hand_keypoints[x] is not None:
                                hand_keypoints[x] = list(np.delete(hand_keypoints[x], 2))
                                hand_keypoints[x] = [int(y) for y in hand_keypoints[x]]
                        res_img = draw_hand_keypoints(res_img, hand_keypoints,
                                                      (bbox[0], bbox[1]))
                        right = hand_keypoints
                        cv2.rectangle(res_img, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
                                      (255, 255, 255), 1)
                    else:
                        right = [[1000, 1000] for _ in range(21)]

                    print("Body Pose")
                    # drop four lower-body keypoints, keeping 14 upper-body points
                    person_pose = np.delete(person_pose, 9, 0)
                    person_pose = np.delete(person_pose, 9, 0)
                    person_pose = np.delete(person_pose, 10, 0)
                    person_pose = np.delete(person_pose, 10, 0)
                    person_pose = person_pose.tolist()
                    for z in range(len(person_pose)):
                        if person_pose[z] is not None:
                            person_pose[z] = list(np.delete(person_pose[z], 2))
                            person_pose[z] = [int(a) for a in person_pose[z]]
                    print(person_pose)
                    print("Left")
                    print(left)
                    print("Right")
                    print(right)
                    cv2.imshow("result", res_img)

                    head = person_pose
                    for x in range(len(head)):
                        if head[x] is None:
                            head[x] = [1000, 1000]
                    # collapse each (x, y) pair to a single value via PCA
                    pca = sklearnPCA(n_components=1)
                    head = pca.fit_transform(head)
                    dfhead = pd.DataFrame(data=head).T
                    dfhead = dfhead.rename(columns={i: f"head_{i + 1}" for i in range(14)})

                    for x in range(len(left)):
                        if left[x] is None:
                            left[x] = [1000, 1000]
                    pca = sklearnPCA(n_components=1)
                    left = pca.fit_transform(left)
                    dfleft = pd.DataFrame(data=left).T
                    dfleft = dfleft.rename(columns={i: f"left_{i + 1}" for i in range(21)})

                    for x in range(len(right)):
                        if right[x] is None:
                            right[x] = [1000, 1000]
                    pca = sklearnPCA(n_components=1)
                    right = pca.fit_transform(right)
                    dfright = pd.DataFrame(data=right).T
                    dfright = dfright.rename(columns={i: f"right_{i + 1}" for i in range(21)})

                    df2 = pd.concat([dfhead, dfleft, dfright], axis=1)
                    df2["frame"] = framecounter
                    df2["gesture"] = video["gesture"]
                    df2["speaker"] = video["actor"]
                    framecounter = framecounter + 1
                    df2["frame"] = df2["frame"].astype(int)
                    newdf = pd.concat([newdf, df2], sort=False)
                    gesturedf = pd.concat([gesturedf, df2], sort=False)

                    firstPerson = False  # skip any further people in this frame
            else:
                cv2.imshow("result", img)

            counter = counter + 1
            # print("Frame", counter)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

        # print(df)
        cap.release()
        cv2.destroyAllWindows()
        gesturedf.to_csv("dataset720new/" + gesture + ".csv", index=False)
        print("Done Recording for: " + gesture)
        print("Took " + str(time.time() - startvideo) + " seconds")
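convertData relies on a buildGestureDict helper that is not defined in this section. Below is a minimal reconstruction, assuming a dataset/<gesture>/<actor>/<video file> directory layout; the layout itself is an assumption, and only the keys convertData actually reads (filename, filepath, gesture, actor) are taken from the code above.

import os

def buildGestureDict(root):
    # Hypothetical reconstruction of the missing helper: maps each gesture
    # name to {"videos": [...]} records with the fields convertData reads.
    dataset = {}
    for gesture in sorted(os.listdir(root)):
        gesture_dir = os.path.join(root, gesture)
        if not os.path.isdir(gesture_dir):
            continue
        videos = []
        for actor in sorted(os.listdir(gesture_dir)):
            actor_dir = os.path.join(gesture_dir, actor)
            if not os.path.isdir(actor_dir):
                continue
            for fname in sorted(os.listdir(actor_dir)):
                videos.append({
                    "filename": fname,
                    "filepath": os.path.join(actor_dir, fname),
                    "gesture": gesture,
                    "actor": actor,
                })
        dataset[gesture] = {"videos": videos}
    return dataset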
# each person detected
for person_pose in person_pose_array:
    unit_length = pose_detector.get_unit_length(person_pose)

    # face estimation
    print("Estimating face keypoints...")
    cropped_face_img, bbox = pose_detector.crop_face(img, person_pose, unit_length)
    if cropped_face_img is not None:
        face_keypoints = face_detector(cropped_face_img)
        res_img = draw_face_keypoints(res_img, face_keypoints, (bbox[0], bbox[1]))
        cv2.rectangle(res_img, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
                      (255, 255, 255), 1)

    # hands estimation
    print("Estimating hands keypoints...")
    hands = pose_detector.crop_hands(img, person_pose, unit_length)
    if hands["left"] is not None:
        hand_img = hands["left"]["img"]
        bbox = hands["left"]["bbox"]
        hand_keypoints = hand_detector(hand_img, hand_type="left")
        res_img = draw_hand_keypoints(res_img, hand_keypoints, (bbox[0], bbox[1]))
        cv2.rectangle(res_img, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
                      (255, 255, 255), 1)

    if hands["right"] is not None:
        hand_img = hands["right"]["img"]
        bbox = hands["right"]["bbox"]
        hand_keypoints = hand_detector(hand_img, hand_type="right")
        res_img = draw_hand_keypoints(res_img, hand_keypoints, (bbox[0], bbox[1]))
        cv2.rectangle(res_img, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
                      (255, 255, 255), 1)

print('Saving result into result.png...')
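One detail worth noting in the loop above: the hand detector runs on the cropped hand image, so its keypoints are relative to the crop, and draw_hand_keypoints is handed the crop's top-left corner (bbox[0], bbox[1]) as the offset back into the full frame. The helper below is purely illustrative (the function name is ours, not the project's); main() further down applies the same shift before scaling.

import numpy as np

def to_frame_coords(hand_keypoints, bbox):
    # Illustrative only: shift crop-relative (x, y) keypoints by the crop's
    # top-left corner so they land in full-frame coordinates.
    offset = np.array([bbox[0], bbox[1]])
    return [None if p is None else (np.asarray(p[:2]) + offset).tolist()
            for p in hand_keypoints]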
import sys
import time
from collections import defaultdict

import cv2
import numpy as np

# PoseDetector / HandDetector and the drawing helpers come from the
# Chainer_Realtime_Multi-Person_Pose_Estimation project; VideoLabelFile is a
# project-local helper whose import is not shown in this snippet.
from pose_detector import PoseDetector, draw_person_pose
from hand_detector import HandDetector, draw_hand_keypoints


def main(cap, im_scale=2, view_results=False):
    debug_i = 0
    fps_timer_arr = [0] * 16
    fps = 0

    # load model
    pose_device = 0
    pose_model_dir = '../../Chainer_Realtime_Multi-Person_Pose_Estimation/models'
    pose_detector = PoseDetector("posenet", f"{pose_model_dir}/coco_posenet.npz",
                                 device=pose_device)
    hand_detector = HandDetector("handnet", f"{pose_model_dir}/handnet.npz",
                                 device=pose_device)

    # cv2.namedWindow('display', flags=(cv2.WINDOW_GUI_NORMAL | cv2.WINDOW_AUTOSIZE))
    if view_results:
        cv2.namedWindow('display')

    video_label_file = VideoLabelFile(cap.video_fname, fname_add='pre_points_pose')
    labels_current = defaultdict(lambda: [])
    labels_all_previous = video_label_file.load_previous()

    im_input = cap.read()
    im_input_shape = im_input.shape[0:2]
    first_run = True

    while not cap.eof:
        fps_time_begin = time.perf_counter()
        debug_i += 1
        im_input = cap.read()
        current_frame_id = cap.frame_idx()
        # print(cap.info())

        im_pose = cv2.resize(im_input, (round(im_input_shape[1] / im_scale),
                                        round(im_input_shape[0] / im_scale)))
        if first_run:
            print(f"Video size {im_input.shape} -> Model input size {im_pose.shape}")
            first_run = False

        ##########################################
        person_pose_array, _ = pose_detector(im_pose)
        im_display = cv2.addWeighted(im_pose, 0.6,
                                     draw_person_pose(im_pose, person_pose_array), 0.4, 0)

        for person_pose in person_pose_array:
            unit_length = pose_detector.get_unit_length(person_pose)
            # arr = np.array([a for a in person_pose if a is not None])
            # if arr.any():
            #     arr[:, 0:2] *= im_scale
            #     labels_current[current_frame_id].append(['pre_person_pose', arr.tolist()])

            # hands estimation
            hands = pose_detector.crop_hands(im_pose, person_pose, unit_length)
            if hands["left"] is not None:
                hand_img = hands["left"]["img"]
                bbox = hands["left"]["bbox"]
                hand_keypoints = hand_detector(hand_img, hand_type="left")
                im_display = draw_hand_keypoints(im_display, hand_keypoints,
                                                 (bbox[0], bbox[1]))
                cv2.rectangle(im_display, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
                              (255, 255, 255), 1)
                # keypoints 5 and 8 are the index-finger base and tip; store
                # them in full-frame coordinates
                if hand_keypoints[5] and hand_keypoints[8]:
                    f_points = np.array([hand_keypoints[5][:2], hand_keypoints[8][:2]])
                    f_points = (f_points + np.array([bbox[0], bbox[1]])) * im_scale
                    # f_points = tuple(map(tuple, f_points.astype(int)))
                    f_points = f_points.astype(int).tolist()
                    labels_current[current_frame_id].append(f_points)

            if hands["right"] is not None:
                hand_img = hands["right"]["img"]
                bbox = hands["right"]["bbox"]
                hand_keypoints = hand_detector(hand_img, hand_type="right")
                im_display = draw_hand_keypoints(im_display, hand_keypoints,
                                                 (bbox[0], bbox[1]))
                cv2.rectangle(im_display, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
                              (255, 255, 255), 1)
                if hand_keypoints[5] and hand_keypoints[8]:
                    f_points = np.array([hand_keypoints[5][:2], hand_keypoints[8][:2]])
                    f_points = (f_points + np.array([bbox[0], bbox[1]])) * im_scale
                    # f_points = tuple(map(tuple, f_points.astype(int)))
                    f_points = f_points.astype(int).tolist()
                    labels_current[current_frame_id].append(f_points)

        #############################################
        for l in labels_current[current_frame_id]:
            cv2.circle(im_display, (round(l[0][0] / im_scale), round(l[0][1] / im_scale)),
                       10, (255, 0, 0), 2)
            cv2.circle(im_display, (round(l[1][0] / im_scale), round(l[1][1] / im_scale)),
                       10, (0, 255, 0), 2)

        cv2.putText(im_display, f"frame {int(current_frame_id)}, fps: {int(fps)}.",
                    (10, im_display.shape[0] - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2)

        if view_results:
            # cv2.imshow('display', im_display)
            cv2.imshow('display', im_pose)
        else:
            print(".", end="")
            sys.stdout.flush()

        # labels_current[current_frame_id].append

        #############################################
        ## KEYBOARD
        k = cv2.waitKey(5)
        if k == 27:  # esc
            break
        elif k == ord('c'):
            import ipdb
            ipdb.set_trace()
        # ipdb.set_trace()
        # pdb.set_trace()

        fps_timer_arr[debug_i % 16] = time.perf_counter() - fps_time_begin
        fps = int(len(fps_timer_arr) * 1 / sum(fps_timer_arr))

    print(". ")
    # cap.release()
    video_label_file.save_current_labels(labels_current, append_previous=False,
                                         custom_lists=True)
    if view_results:
        cv2.destroyAllWindows()
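main() expects cap to be a wrapper object exposing read(), eof, frame_idx(), and video_fname rather than a raw cv2.VideoCapture. The project's actual wrapper is not shown in this section; a minimal stand-in (class name and details are assumptions) could look like this:

import cv2

class SimpleVideoReader:
    # Hypothetical stand-in for the capture wrapper main() expects.
    def __init__(self, video_fname):
        self.video_fname = video_fname
        self._cap = cv2.VideoCapture(video_fname)
        self.eof = False

    def read(self):
        ret, frame = self._cap.read()
        if not ret:
            self.eof = True
        return frame

    def frame_idx(self):
        return int(self._cap.get(cv2.CAP_PROP_POS_FRAMES))

# usage sketch:
# main(SimpleVideoReader("input.mp4"), im_scale=2, view_results=True)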
def estimate_pose(img_path, gpu=-1):
    # parser = argparse.ArgumentParser(description='Pose detector')
    # parser.add_argument('--img', help='image file path')
    # parser.add_argument('--gpu', '-g', type=int, default=-1, help='GPU ID (negative value indicates CPU)')
    # args = parser.parse_args()

    # load model
    print("Loading pose detection model...")
    pose_detector = PoseDetector("posenet", "models/coco_posenet.npz", device=gpu)
    print("Loading hand detection model...")
    hand_detector = HandDetector("handnet", "models/handnet.npz", device=gpu)
    # face_detector = FaceDetector("facenet", "models/facenet.npz", device=args.gpu)

    # read image
    img = cv2.imread(img_path)

    # inference
    print("Estimating pose...")
    person_pose_array, _ = pose_detector(img)
    res_img = cv2.addWeighted(img, 0.6,
                              draw_person_pose(img, person_pose_array), 0.4, 0)

    # Makes the loop below do real work for at most one iteration, so only
    # the first detected person is processed.
    has_detected = False

    # each person detected
    for person_pose in person_pose_array:
        if has_detected:
            continue
        has_detected = True

        print("Body:", person_pose)
        unit_length = pose_detector.get_unit_length(person_pose)

        # face estimation
        # print("Estimating face keypoints...")
        # cropped_face_img, bbox = pose_detector.crop_face(img, person_pose, unit_length)
        # if cropped_face_img is not None:
        #     face_keypoints = face_detector(cropped_face_img)
        #     res_img = draw_face_keypoints(res_img, face_keypoints, (bbox[0], bbox[1]))
        #     cv2.rectangle(res_img, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (255, 255, 255), 1)

        # hands estimation
        print("Estimating hands keypoints...")
        hands = pose_detector.crop_hands(img, person_pose, unit_length)
        if hands["left"] is not None:
            hand_img = hands["left"]["img"]
            bbox = hands["left"]["bbox"]
            hand_keypoints = hand_detector(hand_img, hand_type="left")
            print("Left hand: ", print_arr(hand_keypoints))
            res_img = draw_hand_keypoints(res_img, hand_keypoints, (bbox[0], bbox[1]))
            cv2.rectangle(res_img, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
                          (255, 255, 255), 1)

        if hands["right"] is not None:
            hand_img = hands["right"]["img"]
            bbox = hands["right"]["bbox"]
            hand_keypoints = hand_detector(hand_img, hand_type="right")
            print("Right hand: ", print_arr(hand_keypoints))
            res_img = draw_hand_keypoints(res_img, hand_keypoints, (bbox[0], bbox[1]))
            cv2.rectangle(res_img, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
                          (255, 255, 255), 1)

    print('Saving result into result.png...')
    cv2.imwrite('result.png', res_img)
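estimate_pose prints hand keypoints through a print_arr helper that is not shown here. A minimal sketch, assuming it only compacts the keypoints for readable output (the real formatting may differ), followed by a usage example:

def print_arr(keypoints):
    # Hypothetical reconstruction: keep None entries as-is and round
    # coordinates to ints for compact printing.
    return [None if p is None else [int(v) for v in p] for p in keypoints]

# usage sketch (the image path is illustrative):
# estimate_pose("sample.jpg", gpu=-1)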