# Common imports shared by the script excerpts below. The repo-local module
# paths (object_detection.object_detector, action_detection.action_detector)
# follow the project layout these scripts come from; module-level constants
# such as SHOW_CAMS, T, ACTION_FREQ, OBJ_DETECTION_MODEL, and the
# visualize_* helpers are defined elsewhere in the original source files.
import argparse
import csv
import json
import os
import time

import cv2
import imageio
import numpy as np

import action_detection.action_detector as act
import object_detection.object_detector as obj


def test_on_local_segment():
    actors = [0, 1, 2]
    size = [400, 400]
    timesteps = 32
    batch_np = np.zeros([len(actors), timesteps] + size + [3])
    rois_np = np.zeros([len(actors), 4])
    batch_indices_np = np.array(range(len(actors)))
    for bb, actor_id in enumerate(actors):
        vid_path = 'person_%i.mp4' % actor_id
        reader = imageio.get_reader(vid_path, 'ffmpeg')
        for tt, frame in enumerate(reader):
            if tt >= timesteps:  # guard against clips longer than the model window
                break
            batch_np[bb, tt, :] = frame
        roi_path = 'person_%i_roi.json' % actor_id
        with open(roi_path) as fp:
            rois_np[bb] = json.load(fp)

    # act_detector = act.Action_Detector('i3d_tail')
    # ckpt_name = 'model_ckpt_RGB_i3d_pooled_tail-4'
    act_detector = act.Action_Detector('soft_attn')
    ckpt_name = 'model_ckpt_RGB_soft_attn-9'

    input_seq, rois, roi_batch_indices, pred_probs = \
        act_detector.define_inference_with_placeholders()
    sess = act_detector.session

    main_folder = "../"
    ckpt_path = os.path.join(main_folder, 'action_detection', 'weights', ckpt_name)
    act_detector.restore_model(ckpt_path)

    feed_dict = {
        input_seq: batch_np,
        rois: rois_np,
        roi_batch_indices: batch_indices_np,
    }
    probs = sess.run(pred_probs, feed_dict=feed_dict)

    # Report the top-k most confident actions per person.
    print_top_k = 5
    for ii in range(len(actors)):
        act_probs = probs[ii]
        order = np.argsort(act_probs)[::-1]
        print("Person %i" % actors[ii])
        for pp in range(print_top_k):
            print('\t %s: %.3f' % (act.ACTION_STRINGS[order[pp]],
                                   act_probs[order[pp]]))
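
# The test above expects one 'person_%i_roi.json' file per clip, holding a
# single 4-number box. A minimal sketch of producing such a file; the exact
# box convention (normalized [top, left, bottom, right]) is an assumption.
def write_example_roi(actor_id, box=(0.1, 0.2, 0.9, 0.8)):
    with open('person_%i_roi.json' % actor_id, 'w') as fp:
        json.dump(list(box), fp)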
def __init__(self):
    # Initializer for the combined detector class (the class definition is not
    # part of this excerpt). Builds the object detector, the tube-cropping
    # graph, and the action-inference head, then restores the action weights.
    obj_detection_graph = os.path.join("object_detection", "weights",
                                       OBJ_DETECTION_MODEL,
                                       "frozen_inference_graph.pb")
    self.obj_detector = obj.Object_Detector(obj_detection_graph)
    self.act_detector = act.Action_Detector('soft_attn',
                                            timesteps=NUM_INPUT_FRAMES)

    crop_in_tubes = self.act_detector.crop_tubes_in_tf(
        [NUM_INPUT_FRAMES, HEIGHT, WIDTH, 3])
    (self.input_frames, self.temporal_rois,
     self.temporal_roi_batch_indices, self.cropped_frames) = crop_in_tubes
    self.rois, self.roi_batch_indices, self.pred_probs = (
        self.act_detector.define_inference_with_placeholders_noinput(
            self.cropped_frames))

    ckpt_path = os.path.join(MAIN_FOLDER, 'action_detection', 'weights',
                             CKPT_NAME)
    self.act_detector.restore_model(ckpt_path)
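
# A minimal sketch (hypothetical method, not in the source) of how the tensors
# wired up in __init__ would be fed at inference time; it mirrors the
# feed_dict pattern used in the demo scripts below.
def predict_actions(self, frame_seq, rois_np, temporal_rois_np):
    no_actors = rois_np.shape[0]
    feed_dict = {
        self.input_frames: frame_seq,  # [1, NUM_INPUT_FRAMES, H, W, 3]
        self.temporal_rois: temporal_rois_np,
        self.temporal_roi_batch_indices: np.zeros(no_actors),
        self.rois: rois_np,
        self.roi_batch_indices: np.arange(no_actors),
    }
    return self.act_detector.session.run(self.pred_probs, feed_dict=feed_dict)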
def set_up_detector():
    """Builds a soft-attention action detector and returns its tensors in a dict."""
    act_detector = act.Action_Detector('soft_attn')
    # ckpt_name = 'model_ckpt_RGB_soft_attn-16'
    # ckpt_name = 'model_ckpt_soft_attn_ava-23'
    ckpt_name = 'model_ckpt_soft_attn_pooled_cosine_drop_ava-130'
    input_seq, rois, roi_batch_indices, pred_probs = \
        act_detector.define_inference_with_placeholders()
    ckpt_path = os.path.join('action_detection', 'weights', ckpt_name)
    act_detector.restore_model(ckpt_path)

    detector_dict = {
        'detector': act_detector,
        'input_seq': input_seq,
        'rois': rois,
        'roi_batch_indices': roi_batch_indices,
        'pred_probs': pred_probs,
    }
    return detector_dict
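
# Minimal usage sketch for the dict returned above, following the same feed
# pattern as test_on_local_segment(); batch_np, rois_np, and batch_indices_np
# stand for caller-supplied numpy arrays.
def run_detector(detector_dict, batch_np, rois_np, batch_indices_np):
    feed_dict = {
        detector_dict['input_seq']: batch_np,
        detector_dict['rois']: rois_np,
        detector_dict['roi_batch_indices']: batch_indices_np,
    }
    return detector_dict['detector'].session.run(
        detector_dict['pred_probs'], feed_dict=feed_dict)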
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-v', '--video_path', type=str, required=False, default="")
    parser.add_argument('-d', '--display', type=str, required=False, default="True")
    args = parser.parse_args()
    display = (args.display == "True" or args.display == "true")

    actor_to_display = 6  # only used when SHOW_CAMS is enabled
    video_path = args.video_path
    basename = os.path.basename(video_path).split('.')[0]
    out_vid_path = "./output_videos/%s_output.mp4" % (
        basename if not SHOW_CAMS
        else basename + '_cams_actor_%.2d' % actor_to_display)
    # video_path = "./tests/chase1Person1View3Point0.mp4"
    # out_vid_path = 'output.mp4'

    main_folder = './'

    obj_detection_model = 'ssd_mobilenet_v2_coco_2018_03_29'
    obj_detection_graph = os.path.join("object_detection", "weights",
                                       obj_detection_model,
                                       "frozen_inference_graph.pb")
    print("Loading object detection model at %s" % obj_detection_graph)
    obj_detector = obj.Object_Detector(obj_detection_graph)
    tracker = obj.Tracker()

    print("Reading video file %s" % video_path)
    reader = imageio.get_reader(video_path, 'ffmpeg')
    action_freq = 8
    print('Running actions every %i frames' % action_freq)
    fps = reader.get_meta_data()['fps']
    W, H = reader.get_meta_data()['size']
    T = tracker.timesteps
    if not display:
        writer = imageio.get_writer(out_vid_path, fps=fps)
        print("Writing output to %s" % out_vid_path)

    # act_detector = act.Action_Detector('i3d_tail')
    # ckpt_name = 'model_ckpt_RGB_i3d_pooled_tail-4'
    act_detector = act.Action_Detector('soft_attn')
    # ckpt_name = 'model_ckpt_RGB_soft_attn-16'
    # ckpt_name = 'model_ckpt_soft_attn_ava-23'
    ckpt_name = 'model_ckpt_soft_attn_pooled_cosine_drop_ava-130'

    # The memory variant caches the older frames inside the graph; each run
    # only feeds the newest action_freq frames.
    memory_size = act_detector.timesteps - action_freq
    updated_frames, temporal_rois, temporal_roi_batch_indices, cropped_frames = \
        act_detector.crop_tubes_in_tf_with_memory([T, H, W, 3], memory_size)
    rois, roi_batch_indices, pred_probs = \
        act_detector.define_inference_with_placeholders_noinput(cropped_frames)

    ckpt_path = os.path.join(main_folder, 'action_detection', 'weights', ckpt_name)
    act_detector.restore_model(ckpt_path)

    prob_dict = {}
    frame_cnt = 0
    for cur_img in reader:
        frame_cnt += 1
        print("frame_cnt: %i" % frame_cnt)

        # Object detection.
        expanded_img = np.expand_dims(cur_img, axis=0)
        t1 = time.time()
        detection_list = obj_detector.detect_objects_in_np(expanded_img)
        detection_info = [info[0] for info in detection_list]
        t2 = time.time()
        print('obj det %.2f seconds' % (t2 - t1))

        tracker.update_tracker(detection_info, cur_img)
        t3 = time.time()
        print('tracker %.2f seconds' % (t3 - t2))

        no_actors = len(tracker.active_actors)
        if tracker.active_actors and frame_cnt % action_freq == 0:
            probs = []
            cur_input_sequence = np.expand_dims(
                np.stack(tracker.frame_history[-action_freq:], axis=0), axis=0)
            rois_np, temporal_rois_np = tracker.generate_all_rois()
            if no_actors > 14:  # cap the batch to avoid running out of memory
                no_actors = 14
                rois_np = rois_np[:14]
                temporal_rois_np = temporal_rois_np[:14]

            feed_dict = {
                updated_frames: cur_input_sequence,  # only the last action_freq frames are new
                temporal_rois: temporal_rois_np,
                temporal_roi_batch_indices: np.zeros(no_actors),
                rois: rois_np,
                roi_batch_indices: np.arange(no_actors),
            }
            run_dict = {'pred_probs': pred_probs}
            if SHOW_CAMS:
                run_dict['cropped_frames'] = cropped_frames
                run_dict['final_i3d_feats'] = \
                    act_detector.act_graph.get_collection('final_i3d_feats')[0]
                # The last kernel variable in the graph is the classifier weights.
                run_dict['cls_weights'] = \
                    act_detector.act_graph.get_collection('variables')[-2]
            out_dict = act_detector.session.run(run_dict, feed_dict=feed_dict)
            probs = out_dict['pred_probs']

            # Associate probabilities with actor ids and report the top-k actions.
            print_top_k = 5
            for bb in range(no_actors):
                act_probs = probs[bb]
                order = np.argsort(act_probs)[::-1]
                cur_actor_id = tracker.active_actors[bb]['actor_id']
                print("Person %i" % cur_actor_id)
                cur_results = []
                for pp in range(print_top_k):
                    print('\t %s: %.3f' % (act.ACTION_STRINGS[order[pp]],
                                           act_probs[order[pp]]))
                    cur_results.append((act.ACTION_STRINGS[order[pp]],
                                        act_probs[order[pp]]))
                prob_dict[cur_actor_id] = cur_results
            t5 = time.time()
            print('action %.2f seconds' % (t5 - t3))

        # Legacy per-actor tube-cropping path, kept for reference:
        # no_actors = len(tracker.active_actors)
        # batch_list = []
        # rois_np = np.zeros([no_actors, 4])
        # batch_indices_np = np.array(range(no_actors))
        # for bb, actor_info in enumerate(tracker.active_actors):
        #     actor_no = actor_info['actor_id']
        #     tube, roi = tracker.crop_person_tube(actor_no)
        #     batch_list.append(tube)
        #     rois_np[bb] = roi
        # if tracker.active_actors:
        #     batch_np = np.stack(batch_list, axis=0)
        #     max_batch_size = 10
        #     prob_list = []
        #     cur_index = 0
        #     while cur_index < no_actors:
        #         cur_batch = batch_np[cur_index:cur_index + max_batch_size]
        #         cur_roi = rois_np[cur_index:cur_index + max_batch_size]
        #         cur_indices = batch_indices_np[cur_index:cur_index + max_batch_size] - cur_index
        #         feed_dict = {input_seq: cur_batch, rois: cur_roi,
        #                      roi_batch_indices: cur_indices}
        #         cur_probs = act_detector.session.run(pred_probs, feed_dict=feed_dict)
        #         prob_list.append(cur_probs)
        #         cur_index += max_batch_size
        #     probs = np.concatenate(prob_list, axis=0)

        if frame_cnt > 16:
            out_img = visualize_detection_results(
                tracker.frame_history[-16], tracker.active_actors, prob_dict)
            if SHOW_CAMS:
                if tracker.active_actors:
                    actor_indices = [
                        ii for ii in range(no_actors)
                        if tracker.active_actors[ii]['actor_id'] == actor_to_display
                    ]
                    if actor_indices:
                        out_img = visualize_cams(out_img, cur_input_sequence,
                                                 out_dict, actor_indices[0])
                    else:
                        continue
                else:
                    continue
            if display:
                cv2.imshow('results', out_img[:, :, ::-1])
                cv2.waitKey(10)
            else:
                writer.append_data(out_img)

    if not display:
        writer.close()
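
# Entry-point guard for this demo script (assumed; not shown in the excerpt):
if __name__ == '__main__':
    main()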
def run_act_detector(shape, detection_q, actions_q, act_gpu):
    # Known issue: TensorFlow must be imported inside this process; otherwise
    # the same GPUs cannot be used by both processes.
    import tensorflow as tf
    os.environ['CUDA_VISIBLE_DEVICES'] = act_gpu

    # act_detector = act.Action_Detector('i3d_tail')
    # ckpt_name = 'model_ckpt_RGB_i3d_pooled_tail-4'
    act_detector = act.Action_Detector('soft_attn', timesteps=T)
    # ckpt_name = 'model_ckpt_RGB_soft_attn-16'
    # ckpt_name = 'model_ckpt_soft_attn_ava-23'
    # ckpt_name = 'model_ckpt_soft_attn_pooled_ava-52'
    ckpt_name = 'model_ckpt_soft_attn_pooled_cosine_drop_ava-130'
    main_folder = "./"
    ckpt_path = os.path.join(main_folder, 'action_detection', 'weights', ckpt_name)

    memory_size = act_detector.timesteps - ACTION_FREQ
    updated_frames, temporal_rois, temporal_roi_batch_indices, cropped_frames = \
        act_detector.crop_tubes_in_tf_with_memory(shape, memory_size)
    rois, roi_batch_indices, pred_probs = \
        act_detector.define_inference_with_placeholders_noinput(cropped_frames)
    act_detector.restore_model(ckpt_path)

    processed_frames_cnt = 0
    while True:
        images = []
        for _ in range(ACTION_FREQ):
            cur_img, active_actors, rois_np, temporal_rois_np = detection_q.get()
            images.append(cur_img)

        if not active_actors:
            prob_dict = {}
            if SHOW_CAMS:
                prob_dict = {"cams": visualize_cams({})}
        else:
            # Use the most recent active actors and ROI vectors.
            no_actors = len(active_actors)
            cur_input_sequence = np.expand_dims(np.stack(images, axis=0), axis=0)
            if no_actors > 14:  # cap the batch to avoid running out of memory
                no_actors = 14
                rois_np = rois_np[:14]
                temporal_rois_np = temporal_rois_np[:14]
                active_actors = active_actors[:14]

            feed_dict = {
                updated_frames: cur_input_sequence,  # only the last ACTION_FREQ frames are new
                temporal_rois: temporal_rois_np,
                temporal_roi_batch_indices: np.zeros(no_actors),
                rois: rois_np,
                roi_batch_indices: np.arange(no_actors),
            }
            run_dict = {'pred_probs': pred_probs}
            if SHOW_CAMS:
                run_dict['cropped_frames'] = cropped_frames
                run_dict['final_i3d_feats'] = \
                    act_detector.act_graph.get_collection('final_i3d_feats')[0]
                # The last kernel variable in the graph is the classifier weights.
                run_dict['cls_weights'] = \
                    act_detector.act_graph.get_collection('variables')[-2]
            out_dict = act_detector.session.run(run_dict, feed_dict=feed_dict)
            probs = out_dict['pred_probs']

            if not SHOW_CAMS:
                # Associate probabilities with actor ids.
                print_top_k = 5
                prob_dict = {}
                for bb in range(no_actors):
                    act_probs = probs[bb]
                    order = np.argsort(act_probs)[::-1]
                    cur_actor_id = active_actors[bb]['actor_id']
                    print("Person %i" % cur_actor_id)
                    cur_results = []
                    for pp in range(print_top_k):
                        print('\t %s: %.3f' % (act.ACTION_STRINGS[order[pp]],
                                               act_probs[order[pp]]))
                        cur_results.append((act.ACTION_STRINGS[order[pp]],
                                            act_probs[order[pp]]))
                    prob_dict[cur_actor_id] = cur_results
            else:
                # Generate the CAM visualization here so it does not slow down
                # the visualization process.
                prob_dict = {"cams": visualize_cams(out_dict)}

        # Skip the initialization period: the first prediction needs
        # timesteps / 2 frames to be processed before visualizing.
        processed_frames_cnt += ACTION_FREQ  # frames processed per iteration
        if processed_frames_cnt >= act_detector.timesteps / 2:
            actions_q.put(prob_dict)
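
# A minimal sketch of launching the worker above from a parent process; the
# queue sizes, helper name, and the companion object-detection worker are
# assumptions, and shape/act_gpu mirror the worker's parameters.
from multiprocessing import Process, Queue

def start_action_process(shape, act_gpu='0'):
    detection_q = Queue(maxsize=4)  # (frame, active_actors, rois, temporal_rois) tuples
    actions_q = Queue(maxsize=4)    # per-actor {actor_id: [(action, prob), ...]} dicts
    worker = Process(target=run_act_detector,
                     args=(shape, detection_q, actions_q, act_gpu))
    worker.daemon = True
    worker.start()
    return detection_q, actions_q, worker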
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-v', '--video_path', type=str, required=False, default="")
    parser.add_argument('-d', '--display', type=str, required=False, default="True")
    args = parser.parse_args()
    display = (args.display == "True" or args.display == "true")

    actor_to_display = 6  # only used when SHOW_CAMS is enabled
    video_path = args.video_path
    basename = os.path.basename(video_path).split('.')[0]
    out_vid_path = "./output_videos/%s_output.mp4" % (
        basename if not SHOW_CAMS
        else basename + '_cams_actor_%.2d' % actor_to_display)
    clf_out_path = "./clf_output/{}_output.csv".format(
        basename if not SHOW_CAMS
        else basename + '_cams_actor_{}'.format(actor_to_display))

    main_folder = './'

    obj_detection_model = 'ssd_mobilenet_v1_fpn_shared_box_predictor_640x640_coco14_sync_2018_07_03'
    obj_detection_graph = os.path.join("object_detection", "weights",
                                       obj_detection_model,
                                       "frozen_inference_graph.pb")
    print("Loading object detection model at %s" % obj_detection_graph)
    obj_detector = obj.Object_Detector(obj_detection_graph)
    tracker = obj.Tracker()

    print("Reading video file %s" % video_path)
    reader = imageio.get_reader(video_path, 'ffmpeg')
    action_freq = 8
    print('Running actions every %i frames' % action_freq)
    fps = reader.get_meta_data()['fps']
    print("FPS: {}".format(fps))
    W, H = reader.get_meta_data()['size']
    T = tracker.timesteps

    writer = imageio.get_writer(out_vid_path, fps=fps)
    csv_file = open(clf_out_path, 'w', newline='')
    csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerow(['Time', 'Person', 'Action', 'Probability'])
    print("Writing output to %s" % out_vid_path)

    # act_detector = act.Action_Detector('i3d_tail')
    # ckpt_name = 'model_ckpt_RGB_i3d_pooled_tail-4'
    act_detector = act.Action_Detector('soft_attn')
    # ckpt_name = 'model_ckpt_RGB_soft_attn-16'
    # ckpt_name = 'model_ckpt_soft_attn_ava-23'
    ckpt_name = 'model_ckpt_soft_attn_pooled_cosine_drop_ava-130'
    memory_size = act_detector.timesteps - action_freq
    updated_frames, temporal_rois, temporal_roi_batch_indices, cropped_frames = \
        act_detector.crop_tubes_in_tf_with_memory([T, H, W, 3], memory_size)
    rois, roi_batch_indices, pred_probs = \
        act_detector.define_inference_with_placeholders_noinput(cropped_frames)
    ckpt_path = os.path.join(main_folder, 'action_detection', 'weights', ckpt_name)
    act_detector.restore_model(ckpt_path)

    prob_dict = {}
    frame_cnt = 0

    # Tewan: teacher-identification state.
    min_teacher_features = 3
    teacher_identified = 0
    # missed_frame_cnt = 0
    # max_age = 120
    # frame_skips = 60
    # next_frame = 0
    teacher_ids = []
    matched_id = None
    # Tewan

    for cur_img in reader:
        frame_cnt += 1

        # Detect objects and make predictions every 8 frames (~0.3 seconds).
        expanded_img = np.expand_dims(cur_img, axis=0)
        detection_list = obj_detector.detect_objects_in_np(expanded_img)
        detection_info = [info[0] for info in detection_list]

        # Update active actors in the tracker.
        tracker.update_tracker(detection_info, cur_img)
        no_actors = len(tracker.active_actors)

        # Disabled experiment: reset tracks after too many actor-less frames.
        """
        if no_actors == 0:
            missed_frame_cnt += 1
            if missed_frame_cnt >= max_age:
                tracker.update_tracker(detection_info, cur_img)
                no_actors = len(tracker.active_actors)
                teacher_identified = False
                tracker.set_invalid_track()
                missed_frame_cnt = 0
                print("Reset active actors. Current number: {}".format(no_actors))
        """

        if frame_cnt % action_freq == 0 and frame_cnt > 16:
            if no_actors == 0:
                print("No actor found.")
                continue

            video_time = round(frame_cnt / fps, 1)
            valid_actor_ids = [actor["actor_id"] for actor in tracker.active_actors]
            print("frame count: {}, video time: {}s".format(frame_cnt, video_time))

            probs = []
            cur_input_sequence = np.expand_dims(
                np.stack(tracker.frame_history[-action_freq:], axis=0), axis=0)
            rois_np, temporal_rois_np = tracker.generate_all_rois()

            if teacher_identified < min_teacher_features:
                # Prompt the user to identify the teacher until enough
                # confirmations have been collected.
                prompt_img = visualize_detection_results(
                    img_np=tracker.frame_history[-16],
                    active_actors=tracker.active_actors,
                    prob_dict=None)
                cv2.imshow('prompt_img', prompt_img[:, :, ::-1])
                cv2.waitKey(500)
                teacher_present = False
                teacher_id = _prompt_user_input()
                if not _check_teacher_in_frame(teacher_id=teacher_id):
                    print("Teacher not in this frame. Continuing.")
                    cv2.destroyWindow("prompt_img")
                else:
                    if _check_valid_teacher_id(teacher_id=teacher_id,
                                               valid_actor_ids=valid_actor_ids):
                        teacher_id = int(teacher_id)
                        teacher_identified += 1
                        teacher_present = True
                    else:
                        while not teacher_present:
                            print("Invalid ID.")
                            teacher_id = _prompt_user_input()
                            if not _check_teacher_in_frame(teacher_id=teacher_id):
                                print("Teacher not in this frame. Continuing.")
                                cv2.destroyWindow("prompt_img")
                                break
                            if _check_valid_teacher_id(teacher_id=teacher_id,
                                                       valid_actor_ids=valid_actor_ids):
                                teacher_id = int(teacher_id)
                                teacher_identified += 1
                                teacher_present = True

                # Move on to the next frame if the teacher is not in this one.
                if not teacher_present:
                    continue

                cv2.destroyWindow("prompt_img")
                if teacher_id not in teacher_ids:
                    teacher_ids.append(teacher_id)
                    tracker.update_teacher_candidate_ids(teacher_candidate_id=teacher_id)
            else:
                tracker.set_valid_track()

            # Identify the index of the teacher for ROI selection.
            roi_idx = None
            found_id = False
            for idx, actor_info in enumerate(tracker.active_actors):
                actor_id = actor_info["actor_id"]
                for i in range(len(teacher_ids) - 1, -1, -1):
                    if actor_id == teacher_ids[i]:
                        roi_idx = idx
                        matched_id = actor_info["actor_id"]
                        found_id = True
                        break
                if found_id:
                    break

            # Select the ROI and temporal ROI via the teacher's index. If the
            # teacher was not found (roi_idx is None), move on to the next frame.
            if roi_idx is not None:
                rois_np = rois_np[roi_idx]
                temporal_rois_np = temporal_rois_np[roi_idx]
                rois_np = np.expand_dims(rois_np, axis=0)
                temporal_rois_np = np.expand_dims(temporal_rois_np, axis=0)
                no_actors = 1
            else:
                continue

            # Note: predicting for only one actor might not exercise the
            # attention map fully (memory constraint).
            feed_dict = {
                updated_frames: cur_input_sequence,  # only the last action_freq frames are new
                temporal_rois: temporal_rois_np,
                temporal_roi_batch_indices: np.zeros(no_actors),
                rois: rois_np,
                roi_batch_indices: np.arange(no_actors),
            }
            run_dict = {'pred_probs': pred_probs}
            if SHOW_CAMS:
                run_dict['cropped_frames'] = cropped_frames
                run_dict['final_i3d_feats'] = \
                    act_detector.act_graph.get_collection('final_i3d_feats')[0]
                # The last kernel variable in the graph is the classifier weights.
                run_dict['cls_weights'] = \
                    act_detector.act_graph.get_collection('variables')[-2]
            out_dict = act_detector.session.run(run_dict, feed_dict=feed_dict)
            probs = out_dict['pred_probs']

            # Associate probabilities with actor ids and log the top-k actions.
            print_top_k = 5
            for bb in range(no_actors):
                act_probs = probs[bb]
                order = np.argsort(act_probs)[::-1]
                cur_actor_id = tracker.active_actors[roi_idx]["actor_id"]
                cur_results = []
                for pp in range(print_top_k):
                    cur_results.append((act.ACTION_STRINGS[order[pp]],
                                        act_probs[order[pp]]))
                    csv_writer.writerow([video_time, cur_actor_id,
                                         act.ACTION_STRINGS[order[pp]],
                                         act_probs[order[pp]]])
                prob_dict[cur_actor_id] = cur_results

        if frame_cnt > 16:
            out_img = visualize_detection_results(
                tracker.frame_history[-16], tracker.active_actors,
                prob_dict=prob_dict, teacher_id=matched_id)
            if SHOW_CAMS:
                if tracker.active_actors:
                    actor_indices = [
                        ii for ii in range(no_actors)
                        if tracker.active_actors[ii]['actor_id'] == actor_to_display
                    ]
                    if actor_indices:
                        out_img = visualize_cams(out_img, cur_input_sequence,
                                                 out_dict, actor_indices[0])
                    else:
                        continue
                else:
                    continue
            if display:
                cv2.imshow('results', out_img[:, :, ::-1])
                cv2.waitKey(10)
            writer.append_data(out_img)

    writer.close()
    csv_file.close()
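
# The teacher-identification helpers called in main() are not part of this
# excerpt. A minimal sketch of plausible implementations, inferred from how
# they are used above (their exact behavior is an assumption): an empty or
# 'n' answer means "teacher not in frame", and a valid ID must parse as an
# integer matching a currently tracked actor.
def _prompt_user_input():
    return input("Enter the teacher's actor ID (or 'n' if not in frame): ").strip()

def _check_teacher_in_frame(teacher_id):
    return teacher_id not in ("", "n", "N")

def _check_valid_teacher_id(teacher_id, valid_actor_ids):
    try:
        return int(teacher_id) in valid_actor_ids
    except ValueError:
        return False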