def main():
    args = parser.parse_args()
    output_path = logging(args)

    model_path = args.model_path
    data_dir = args.data_dir
    start_frame = 0
    num_samples = 25 if args.modality[:3] == 'rgb' else 1
    ext = ".png" if args.modality == "rhythm" and args.dataset == "hmdb51" else ".jpg"
    num_categories = 51 if args.dataset == 'hmdb51' else 101
    new_size = 224

    # Load the trained spatial network for the selected architecture/modality.
    model_start_time = time.time()
    params = torch.load(model_path)
    if args.architecture == "inception_v3":
        new_size = 299
        if args.modality == "rhythm":
            spatial_net = models.flow_inception_v3(pretrained=False, channels=1,
                                                   num_classes=num_categories)
        else:
            spatial_net = models.rgb_inception_v3(pretrained=False, channels=3,
                                                  num_classes=num_categories)
    else:
        if args.modality == "rhythm":
            spatial_net = models.flow_resnet152(pretrained=False, channels=1,
                                                num_classes=num_categories)
        else:
            spatial_net = models.rgb_resnet152(pretrained=False, channels=3,
                                               num_classes=num_categories)
    spatial_net.load_state_dict(params['state_dict'])
    spatial_net.cuda()
    spatial_net.eval()
    model_time = time.time() - model_start_time
    print("Action recognition spatial model is loaded in %4.4f seconds." % model_time)

    # Evaluate on the whole dataset list or on the chosen test split.
    test_path = os.path.join(args.settings, args.dataset)
    test_file = (os.path.join(test_path, "dataset_list.txt") if args.w
                 else os.path.join(test_path, "test_split%d.txt" % args.split))
    print(test_file)
    with open(test_file, "r") as f_test:
        test_list = f_test.readlines()
    print("we got %d videos" % len(test_list))

    line_id = 1
    match_count = 0
    result_list = []

    # Visual-rhythm direction: per-class (approach 3) or per-video (approach 4).
    lines = []
    if args.vr_approach == 3:
        direction_path = os.path.join(args.settings, args.dataset, "direction.txt")
        lines = [int(line.rstrip('\n')) for line in open(direction_path)]
    elif args.vr_approach == 4:
        direction_path = os.path.join(args.settings, args.dataset, "direction_video.txt")
        lines = {line.split()[0]: int(line.split()[1]) for line in open(direction_path)}

    for line in test_list:
        line_info = line.split(" ")
        clip_path = os.path.join(data_dir, line_info[0])
        num_frames = int(line_info[1])
        input_video_label = int(line_info[2])
        video_name = clip_path.split("/")[-1]

        # Pick the rhythm direction for this sample; otherwise pass the
        # approach id through unchanged.
        index = (lines[input_video_label] if args.vr_approach == 3 else
                 lines[video_name] if args.vr_approach == 4 else args.vr_approach)

        spatial_prediction = VideoSpatialPrediction(args.modality, clip_path,
                                                    spatial_net, num_categories,
                                                    start_frame, num_frames,
                                                    num_samples, index,
                                                    new_size=new_size, ext=ext)

        # Average the per-sample class scores and take the top class.
        avg_spatial_pred_fc8 = np.mean(spatial_prediction, axis=1)
        result_list.append(avg_spatial_pred_fc8)
        pred_index = np.argmax(avg_spatial_pred_fc8)

        if pred_index == input_video_label:
            match_count += 1
        print(args.modality + " split " + str(args.split) +
              ", sample %d/%d: GT: %d, Prediction: %d ==> correct: %d" %
              (line_id, len(test_list), input_video_label, pred_index, match_count))
        line_id += 1

    print(match_count)
    print(len(test_list))
    print("Accuracy is: %4.4f" % (float(match_count) / len(test_list)))

    # Save the averaged class scores for later fusion.
    npy_name = (args.dataset + "_" + args.modality + "_" + args.architecture +
                "_s" + str(args.split) + ".npy")
    npy_path = os.path.join(output_path, npy_name)
    np.save(npy_path, np.array(result_list))
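# These scripts assume a module-level argparse `parser`. Its definition is not
# part of this excerpt; the sketch below is a hypothetical reconstruction
# inferred only from the attributes accessed on `args`, not the authors'
# actual argument list.
import argparse

parser = argparse.ArgumentParser(description="Evaluate a trained action-recognition model")
parser.add_argument('--model_path', type=str, help="path to the trained .pth.tar checkpoint")
parser.add_argument('--data_dir', type=str, help="root directory of the extracted frames")
parser.add_argument('--dataset', type=str, choices=['hmdb51', 'ucf101'])
parser.add_argument('--modality', type=str, help="e.g. 'rgb' or 'rhythm'")
parser.add_argument('--architecture', type=str, choices=['inception_v3', 'resnet152'])
parser.add_argument('--split', type=int, default=1, help="train/test split number")
parser.add_argument('--settings', type=str, help="directory with split and direction files")
parser.add_argument('--vr_approach', type=int, default=3,
                    help="visual-rhythm direction: fixed value, 3 = per class, 4 = per video")
parser.add_argument('-w', action='store_true',
                    help="evaluate on dataset_list.txt instead of a test split")
parser.add_argument('--batch_size', type=int, default=32)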
def main():
    args = parser.parse_args()
    # Previous hard-coded defaults, kept for reference:
    # model_path = '../../parameters/' + args.architecture + "/" + args.modality + '_s' + str(args.split) + '.pth.tar'
    # data_dir = '../datasets/' + args.dataset + '_frames'
    # data_dir = '/home/Datasets/UCF-101-OF_CPU'
    model_path = args.model_path
    data_dir = args.data_dir
    start_frame = 0
    num_samples = 25 if args.modality[:3] == 'rgb' else 1
    num_categories = 51 if args.dataset == 'hmdb51' else 101

    # Load the trained spatial network for the selected architecture/modality.
    model_start_time = time.time()
    params = torch.load(model_path)
    new_size = 224
    if args.architecture == "inception_v3":
        new_size = 299
        if args.modality == "rhythm":
            spatial_net = models.flow_inception_v3(pretrained=False, channels=1,
                                                   num_classes=num_categories)
        else:
            spatial_net = models.rgb_inception_v3(pretrained=False, channels=3,
                                                  num_classes=num_categories)
    else:
        if args.modality == "rhythm":
            spatial_net = models.flow_resnet152(pretrained=False, channels=1,
                                                num_classes=num_categories)
        else:
            spatial_net = models.rgb_resnet152(pretrained=False, channels=3,
                                               num_classes=num_categories)
    spatial_net.load_state_dict(params['state_dict'])
    spatial_net.cuda()
    spatial_net.eval()
    model_time = time.time() - model_start_time
    print("Action recognition model is loaded in %4.4f seconds." % model_time)

    val_file = "./splits/" + args.dataset + "/val_split%d.txt" % args.split
    # val_file = 'spatial_testlist01_with_labels.txt'
    with open(val_file, "r") as f_val:
        val_list = f_val.readlines()
    print("we got %d test videos" % len(val_list))

    line_id = 1
    match_count = 0
    result = []

    # Per-class visual-rhythm direction, used when vr_approach == 3.
    lines = [int(line.rstrip('\n'))
             for line in open('../datasets/settings/' + args.dataset + '/direction.txt')]

    for line in val_list:
        line_info = line.split(" ")
        clip_path = os.path.join(data_dir, line_info[0])
        num_frames = int(line_info[1])
        input_video_label = int(line_info[2])

        spatial_prediction = VideoSpatialPrediction(
            args.modality, clip_path, spatial_net, num_categories,
            start_frame, num_frames, num_samples,
            lines[input_video_label] if args.vr_approach == 3 else args.vr_approach,
            new_size)

        # Average the per-sample class scores and take the top class.
        avg_spatial_pred_fc8 = np.mean(spatial_prediction, axis=1)
        result.append(avg_spatial_pred_fc8)
        # avg_spatial_pred = softmax(avg_spatial_pred_fc8)
        pred_index = np.argmax(avg_spatial_pred_fc8)

        if pred_index == input_video_label:
            match_count += 1
        print(args.modality + " split " + str(args.split) +
              ", sample %d/%d: GT: %d, Prediction: %d ==> correct: %d" %
              (line_id, len(val_list), input_video_label, pred_index, match_count))
        line_id += 1

    print(match_count)
    print(len(val_list))
    print("Accuracy is: %4.4f" % (float(match_count) / len(val_list)))

    np.save(args.dataset + "_" + args.modality + "_" + args.architecture +
            "_s" + str(args.split) + ".npy", np.array(result))
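# The commented-out `softmax(...)` call above refers to a helper that is not
# shown in this excerpt. A minimal NumPy sketch (an assumption, not the
# authors' implementation) would be:
import numpy as np

def softmax(x):
    # Subtract the max before exponentiating for numerical stability.
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

# Note that argmax over the raw scores equals argmax over their softmax, which
# is why the normalization can be skipped when only accuracy is reported.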
def main():
    args = parser.parse_args()
    output_path = logging(args)

    model_path = args.model_path
    data_dir = args.data_dir
    start_frame = 0
    num_samples = 10
    num_categories = 51 if args.dataset == 'hmdb51' else 101

    # Load the trained visual-rhythm (single-channel) network.
    model_start_time = time.time()
    new_size = 224
    if args.architecture == "inception_v3":
        new_size = 299
        spatial_net = models.flow_inception_v3(pretrained=False, channels=1,
                                               num_classes=num_categories)
    else:
        spatial_net = models.flow_resnet152(pretrained=False, channels=1,
                                            num_classes=num_categories)
    params = torch.load(model_path)
    spatial_net.load_state_dict(params['state_dict'])
    spatial_net.cuda()
    spatial_net.eval()
    model_time = time.time() - model_start_time
    print("Action recognition model is loaded in %4.4f seconds." % model_time)

    # Evaluate on the whole dataset list or on the chosen test split.
    test_path = os.path.join(args.settings, args.dataset)
    test_file = (os.path.join(test_path, "dataset_list.txt") if args.w
                 else os.path.join(test_path, "test_split%d.txt" % args.split))
    with open(test_file, "r") as f_test:
        test_list = f_test.readlines()
    print("we got %d test videos" % len(test_list))

    line_id = 1
    match_count = 0
    result = []
    for line in test_list:
        line_info = line.split(" ")
        clip_path = os.path.join(data_dir, line_info[0])
        num_frames = int(line_info[1])
        input_video_label = int(line_info[2])

        spatial_prediction = VideoSpatialPrediction(clip_path, spatial_net,
                                                    num_categories, num_samples,
                                                    new_size, args.batch_size)

        # Average the per-sample class scores and take the top class.
        avg_spatial_pred_fc8 = np.mean(spatial_prediction, axis=1)
        result.append(avg_spatial_pred_fc8)
        # avg_spatial_pred = softmax(avg_spatial_pred_fc8)
        pred_index = np.argmax(avg_spatial_pred_fc8)

        if pred_index == input_video_label:
            match_count += 1
        print("Rhythm split " + str(args.split) +
              ", sample %d/%d: GT: %d, Prediction: %d ==> correct: %d" %
              (line_id, len(test_list), input_video_label, pred_index, match_count))
        line_id += 1

    print(match_count)
    print(len(test_list))
    print("Accuracy is: %4.4f" % (float(match_count) / len(test_list)))

    npy_name = (args.dataset + "_rhythm_" + args.architecture +
                "_s" + str(args.split) + ".npy")
    npy_path = os.path.join(output_path, npy_name)
    np.save(npy_path, np.array(result))
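# `logging(args)` above is assumed to create and return an output directory
# for the saved .npy score files; its definition is not part of this excerpt.
# A hypothetical minimal version, purely for illustration:
import os

def logging(args):
    # Group results by dataset and architecture so repeated runs do not collide.
    output_path = os.path.join("results", args.dataset, args.architecture)
    os.makedirs(output_path, exist_ok=True)
    return output_path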
def main():
    args = parser.parse_args()
    # Previous hard-coded defaults, kept for reference:
    # model_path = '../../parameters/' + args.architecture + '/' + args.modality + '_s' + str(args.split) + '.pth.tar'
    # data_path = '/home/Datasets/UCF-101-OF_CPU'
    model_path = args.model_path
    data_path = args.data_dir
    start_frame = 0
    num_categories = 51 if args.dataset == 'hmdb51' else 101
    new_size = 224

    # Load the trained temporal network (20-channel stacked optical flow).
    model_start_time = time.time()
    params = torch.load(model_path)
    if args.architecture == "inception_v3":
        new_size = 299
        temporal_net = models.flow_inception_v3(pretrained=False, channels=20,
                                                num_classes=num_categories)
    else:
        temporal_net = models.flow_resnet152(pretrained=False, channels=20,
                                             num_classes=num_categories)
    temporal_net.load_state_dict(params['state_dict'])
    temporal_net.cuda()
    temporal_net.eval()
    model_time = time.time() - model_start_time
    print("Action recognition temporal model is loaded in %4.4f seconds." % model_time)

    val_file = "./splits/" + args.dataset + "/val_split%d.txt" % args.split
    # val_file = 'spatial_testlist01_with_labels.txt'
    with open(val_file, "r") as f_val:
        val_list = f_val.readlines()
    print("we got %d test videos" % len(val_list))

    line_id = 1
    match_count = 0
    result_list = []
    for line in val_list:
        line_info = line.split(" ")
        clip_path = os.path.join(data_path, line_info[0])
        num_frames = int(line_info[1])
        input_video_label = int(line_info[2])

        temporal_prediction = VideoTemporalPrediction(args.modality, clip_path,
                                                      temporal_net, num_categories,
                                                      start_frame, num_frames,
                                                      new_size=new_size)

        # Average the per-sample class scores and take the top class.
        avg_temporal_pred_fc8 = np.mean(temporal_prediction, axis=1)
        result_list.append(avg_temporal_pred_fc8)
        pred_index = np.argmax(avg_temporal_pred_fc8)

        print(args.modality + " split " + str(args.split) +
              ", sample %d/%d: GT: %d, Prediction: %d" %
              (line_id, len(val_list), input_video_label, pred_index))
        if pred_index == input_video_label:
            match_count += 1
        line_id += 1

    print(match_count)
    print(len(val_list))
    print("Accuracy is %4.4f" % (float(match_count) / len(val_list)))

    # Name the output after the architecture that was actually evaluated.
    np.save(args.dataset + "_" + args.modality + "_" + args.architecture +
            "_s" + str(args.split) + ".npy", np.array(result_list))
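# Each of these scripts is meant to be run directly, e.g. (file name and flag
# values illustrative, matching the hypothetical parser sketched earlier):
#   python temporal_test.py --dataset ucf101 --modality flow --split 1 \
#       --model_path flow_s1.pth.tar --data_dir /path/to/frames
# so each file ends with the standard entry-point guard:
if __name__ == '__main__':
    main()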