def main():

    # model_path = '../../checkpoints/rgb_model_best.pth.tar'
    model_path = '../../checkpoints/225_rgb_checkpoint.pth.tar'
    # data_dir = "~/basedata/expression_data/ck+_pre"
    start_frame = 0
    num_categories = 8

    model_start_time = time.time()
    params = torch.load(model_path)

    spatial_net = models.rgb_resnet152(pretrained=False, num_classes=8)
    spatial_net.load_state_dict(params['state_dict'])
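    # move the network to the GPU and switch to inference mode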
    spatial_net.cuda()
    spatial_net.eval()
    model_end_time = time.time()
    model_time = model_end_time - model_start_time
    print("Action recognition model is loaded in %4.4f seconds." % (model_time))

    val_file = "/home/wxc/.pyenv/versions/3.5.5/envs/two_stream/datasets/settings/ck/val_rgb_split.txt"
    f_val = open(val_file, "r")
    val_list = f_val.readlines()
    print("we got %d test videos" % len(val_list))

    line_id = 1
    match_count = 0
    result_list = []
    for line in val_list:
        line_info = line.split(" ")
        clip_path = line_info[0]
        input_video_frames = int(line_info[1])
        input_video_label = int(line_info[2])

        spatial_prediction = VideoSpatialPrediction(
                clip_path,
                spatial_net,
                num_categories,
                start_frame)

        avg_spatial_pred_fc8 = np.mean(spatial_prediction, axis=1)
        # print(avg_spatial_pred_fc8.shape)
        result_list.append(avg_spatial_pred_fc8)
        avg_spatial_pred = softmax(avg_spatial_pred_fc8)

        save_fig(avg_spatial_pred, clip_path)

        pred_index = np.argmax(avg_spatial_pred_fc8)
        print("Sample %d/%d: number of frames:%d, GT: %d, Prediction: %d, " % (line_id, len(val_list),input_video_frames, input_video_label, pred_index)+clip_path)

        if pred_index == input_video_label:
            match_count += 1
        line_id += 1

    print(match_count)
    print(len(val_list))
    print("Accuracy is %4.4f" % (float(match_count)/len(val_list)))
    np.save("ucf101_s1_rgb_resnet152.npy", np.array(result_list))
Example No. 2
def main():

    #model_path = '../../checkpoints/model_best.pth.tar'
    #model_path = 'D:/deep-learning/proposal-defence/ucf101_s1_rgb_resnet152.pth.tar'
    model_path = '../../models/ucf101_s1_rgb_resnet152.pth.tar'
    data_dir = "~/UCF101/frames"
    start_frame = 0
    num_categories = 101

    model_start_time = time.time()
    #params = torch.load(model_path, map_location={'cuda:0': 'cpu'})
    params = torch.load(model_path)

    spatial_net = models.rgb_resnet152(pretrained=False, num_classes=101)
    spatial_net.load_state_dict(params['state_dict'])
    spatial_net.cuda()
    spatial_net.eval()
    model_end_time = time.time()
    model_time = model_end_time - model_start_time
    print("Action recognition model is loaded in %4.4f seconds." % (model_time))

    #val_file = "./spatial_testlist01_with_labels.txt"
    val_file = "./out2.txt"
    f_val = open(val_file, "r")
    val_list = f_val.readlines()
    print("we got %d test videos" % len(val_list))

    line_id = 1
    match_count = 0
    result_list = []
    for line in val_list:
        line_info = line.split(" ")
        clip_path = line_info[0]
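        # labels in the list file are 1-indexed; shift to 0-indexed class ids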
        input_video_label = int(line_info[1]) - 1

        spatial_prediction = VideoSpatialPrediction(
                clip_path,
                spatial_net,
                num_categories,
                start_frame)

        avg_spatial_pred_fc8 = np.mean(spatial_prediction, axis=1)
        # print(avg_spatial_pred_fc8.shape)
        result_list.append(avg_spatial_pred_fc8)
        # avg_spatial_pred = softmax(avg_spatial_pred_fc8)

        pred_index = np.argmax(avg_spatial_pred_fc8)
        print("Sample %d/%d: GT: %d, Prediction: %d" % (line_id, len(val_list), input_video_label, pred_index))

        if pred_index == input_video_label:
            match_count += 1
        line_id += 1

    print(match_count)
    print(len(val_list))
    print("Accuracy is %4.4f" % (float(match_count)/len(val_list)))
    np.save("ucf101_s1_rgb_resnet152.npy", np.array(result_list))
Example No. 3
def main():

    model_path = './checkpoints/model_best.pth.tar'
    data_dir = "./"
    start_frame = 0
    num_categories = 2

    model_start_time = time.time()
    params = torch.load(model_path)
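    # note: pretrained=True below fetches stock weights that load_state_dict immediately overwrites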

    spatial_net = models.rgb_resnet101(pretrained=True, num_classes=2)
    spatial_net.load_state_dict(params['state_dict'])
    spatial_net.cuda()
    spatial_net.eval()
    model_end_time = time.time()
    model_time = model_end_time - model_start_time
    print("Action recognition model is loaded in %4.4f seconds." % (model_time))

    val_file = "./test_rgb.txt"
    f_val = open(val_file, "r")
    val_list = f_val.readlines()
    print("we got %d test videos" % len(val_list))

    line_id = 1
    match_count = 0
    result_list = []
    for line in val_list:
        line_info = line.split()
        clip_path = data_dir + line_info[0]
        input_video_label = int(line_info[2])

        spatial_prediction = VideoSpatialPrediction(
                clip_path,
                spatial_net,
                num_categories,
                start_frame)

        avg_spatial_pred_fc8 = np.mean(spatial_prediction, axis=1)
        # print(avg_spatial_pred_fc8.shape)
        result_list.append(avg_spatial_pred_fc8)
        avg_spatial_pred = softmax(avg_spatial_pred_fc8)

        pred_index = np.argmax(avg_spatial_pred_fc8)
        print("Sample %d/%d: GT: %d, Prediction: %d, Probability: %.3f" % (line_id, len(val_list), input_video_label, pred_index,np.amax(avg_spatial_pred)))

        if pred_index == input_video_label:
            match_count += 1
        line_id += 1

    print(match_count)
    print(len(val_list))
    print("Accuracy is %4.4f" % (float(match_count)/len(val_list)))
    np.save("ucf101_s1_rgb_resnet101.npy", np.array(result_list))
Example No. 4
def main():

    # caffe init
    gpu_id = 0
    caffe.set_device(gpu_id)
    caffe.set_mode_gpu()

    # spatial prediction
    model_def_file = '../models/action_recognition/dextro_spatial.prototxt'
    model_file = '../dextro_benchmark_rgb_iter_48000.caffemodel'
    spatial_net = caffe.Net(model_def_file, model_file, caffe.TEST)

    # temporal prediction
    model_def_file = '../models/action_recognition/dextro_temporal.prototxt'
    model_file = '../dextro_benchmark_flow_iter_39000.caffemodel'
    temporal_net = caffe.Net(model_def_file, model_file, caffe.TEST)

    # input video (containing image_*.jpg and flow_*.jpg) and some settings
    input_video_dir = 'video/'
    start_frame = 0
    num_categories = 131
    feature_layer = 'fc8-2'

    # temporal net prediction
    temporal_mean_file = 'flow_mean.mat'
    temporal_prediction = VideoTemporalPrediction(input_video_dir,
                                                  temporal_mean_file,
                                                  temporal_net, num_categories,
                                                  feature_layer, start_frame)
    avg_temporal_pred_fc8 = np.mean(temporal_prediction, axis=1)
    avg_temporal_pred = softmax(avg_temporal_pred_fc8)

    # spatial net prediction
    spatial_mean_file = 'rgb_mean.mat'
    spatial_prediction = VideoSpatialPrediction(input_video_dir,
                                                spatial_mean_file, spatial_net,
                                                num_categories, feature_layer,
                                                start_frame)
    avg_spatial_pred_fc8 = np.mean(spatial_prediction, axis=1)
    avg_spatial_pred = softmax(avg_spatial_pred_fc8)

    # fused prediction (temporal:spatial = 2:1)
    fused_pred = np.array(avg_temporal_pred) * 2./3 + \
                 np.array(avg_spatial_pred) * 1./3
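
The snippet ends with the fused scores; a natural final step (not part of the original) is an arg-max over classes:

fused_pred_index = np.argmax(fused_pred)  # predicted class from the 2:1 temporal/spatial fusion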
Example No. 5
def main():
    args = parser.parse_args()
    output_path = logging(args)
    model_path = args.model_path
    data_dir = args.data_dir

    start_frame = 0
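    # RGB modalities average scores over 25 sampled frames; other modalities use a single sample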
    num_samples = 25 if args.modality[:3] == 'rgb' else 1
    ext = ".png" if args.modality == "rhythm" and args.dataset == "hmdb51" else ".jpg"
    num_categories = 51 if args.dataset == 'hmdb51' else 101
    new_size = 224

    model_start_time = time.time()
    params = torch.load(model_path)

    if args.architecture == "inception_v3":
        new_size = 299
        if args.modality == "rhythm":
            spatial_net = models.flow_inception_v3(pretrained=False,
                                                   channels=1,
                                                   num_classes=num_categories)
        else:
            spatial_net = models.rgb_inception_v3(pretrained=False,
                                                  channels=3,
                                                  num_classes=num_categories)
    else:
        if args.modality == "rhythm":
            spatial_net = models.flow_resnet152(pretrained=False,
                                                channels=1,
                                                num_classes=num_categories)
        else:
            spatial_net = models.rgb_resnet152(pretrained=False,
                                               channels=3,
                                               num_classes=num_categories)

    spatial_net.load_state_dict(params['state_dict'])
    spatial_net.cuda()
    spatial_net.eval()
    model_end_time = time.time()
    model_time = model_end_time - model_start_time
    print("Action recognition spatial model is loaded in %4.4f seconds." %
          (model_time))

    test_path = os.path.join(args.settings, args.dataset)
    test_file = os.path.join(test_path,
                             "dataset_list.txt") if args.w else os.path.join(
                                 test_path, "test_split%d.txt" % (args.split))
    print(test_file)
    f_test = open(test_file, "r")
    test_list = f_test.readlines()
    print("we got %d videos" % len(test_list))

    line_id = 1
    match_count = 0
    result_list = []

    lines = []
    if args.vr_approach == 3:
        direction_path = os.path.join(args.settings, args.dataset,
                                      "direction.txt")
        lines = [int(line.rstrip('\n')) for line in open(direction_path)]
    elif args.vr_approach == 4:
        direction_path = os.path.join(args.settings, args.dataset,
                                      "direction_video.txt")
        lines = {
            line.split()[0]: int(line.split()[1])
            for line in open(direction_path)
        }

    for line in test_list:
        line_info = line.split(" ")
        clip_path = os.path.join(data_dir, line_info[0])
        num_frames = int(line_info[1])
        input_video_label = int(line_info[2])

        video_name = clip_path.split("/")[-1]
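        # pick the visual-rhythm direction per class (approach 3), per video (approach 4),
        # or use the vr_approach value itself as a fixed direction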
        index = lines[input_video_label] if args.vr_approach == 3 else (
            lines[video_name] if args.vr_approach == 4 else args.vr_approach)

        spatial_prediction = VideoSpatialPrediction(args.modality,
                                                    clip_path,
                                                    spatial_net,
                                                    num_categories,
                                                    start_frame,
                                                    num_frames,
                                                    num_samples,
                                                    index,
                                                    new_size=new_size,
                                                    ext=ext)
        avg_spatial_pred_fc8 = np.mean(spatial_prediction, axis=1)
        result_list.append(avg_spatial_pred_fc8)

        pred_index = np.argmax(avg_spatial_pred_fc8)
        print(args.modality + " split " + str(args.split) +
              ", sample %d/%d: GT: %d, Prediction: %d ==> correct: %d" %
              (line_id, len(test_list), input_video_label, pred_index,
               match_count))

        if pred_index == input_video_label:
            match_count += 1
        line_id += 1

    print(match_count)
    print(len(test_list))
    print("Accuracy is: %4.4f" % (float(match_count) / len(test_list)))

    npy_name = args.dataset + "_" + args.modality + "_" + args.architecture + "_s" + str(
        args.split) + ".npy"
    npy_path = os.path.join(output_path, npy_name)
    np.save(npy_path, np.array(result_list))
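
The per-video score arrays saved by these scripts are typically combined later by late fusion. A minimal sketch of that step (file names are hypothetical):

import numpy as np

rgb = np.load("ucf101_rgb_resnet152_s1.npy")        # (num_videos, num_classes)
rhythm = np.load("ucf101_rhythm_resnet152_s1.npy")
pred = np.argmax(rgb + rhythm, axis=1)              # equal-weight late fusion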
Example No. 6
def main():
    args = parser.parse_args()
    output_path = logging(args)
    model_path = args.model_path
    data_dir = args.data_dir

    start_frame = 0
    num_samples = 10
    num_categories = 51 if args.dataset == 'hmdb51' else 101

    model_start_time = time.time()

    new_size = 224
    if args.architecture == "inception_v3":
        new_size = 299
        spatial_net = models.flow_inception_v3(pretrained=False,
                                               channels=1,
                                               num_classes=num_categories)
    else:
        spatial_net = models.flow_resnet152(pretrained=False,
                                            channels=1,
                                            num_classes=num_categories)

    params = torch.load(model_path)
    spatial_net.load_state_dict(params['state_dict'])
    spatial_net.cuda()
    spatial_net.eval()
    model_end_time = time.time()
    model_time = model_end_time - model_start_time
    print("Action recognition model is loaded in %4.4f seconds." %
          (model_time))

    test_path = os.path.join(args.settings, args.dataset)
    test_file = os.path.join(test_path,
                             "dataset_list.txt") if args.w else os.path.join(
                                 test_path, "test_split%d.txt" % (args.split))

    f_test = open(test_file, "r")
    test_list = f_test.readlines()
    print("we got %d test videos" % len(test_list))

    line_id = 1
    match_count = 0

    result = []
    for line in test_list:
        line_info = line.split(" ")
        clip_path = os.path.join(data_dir, line_info[0])
        num_frames = int(line_info[1])
        input_video_label = int(line_info[2])
        spatial_prediction = VideoSpatialPrediction(clip_path, spatial_net,
                                                    num_categories,
                                                    num_samples, new_size,
                                                    args.batch_size)
        avg_spatial_pred_fc8 = np.mean(spatial_prediction, axis=1)
        result.append(avg_spatial_pred_fc8)
        # avg_spatial_pred = softmax(avg_spatial_pred_fc8)

        pred_index = np.argmax(avg_spatial_pred_fc8)

        print("Rhythm split " + str(args.split) +
              ", sample %d/%d: GT: %d, Prediction: %d ==> correct: %d" %
              (line_id, len(test_list), input_video_label, pred_index,
               match_count))

        if pred_index == input_video_label:
            match_count += 1
        line_id += 1

    print(match_count)
    print(len(test_list))
    print("Accuracy is : %4.4f" % ((float(match_count) / len(test_list))))

    npy_name = args.dataset + "_rhythm_" + args.architecture + "_s" + str(
        args.split) + ".npy"
    npy_path = os.path.join(output_path, npy_name)
    np.save(npy_path, np.array(result))
Example No. 7
def main():
    idx2class = {}
    with open('../../datasets/ucf101_splits/classInd.txt', 'r') as f:
        for line in f.readlines():
            classNo, className = line.strip('\n').split(' ')
            idx2class[classNo] = className

    amountList = [0] * len(idx2class)
    correctList = [0] * len(idx2class)

    report_file = "./fusion_model_LK256_t2/fusion_validation_report.txt"
    f_re = open(report_file, 'w', buffering=1)

    Tmodel_path = './fusion_model_LK256_t2/temporal.pth.tar'
    Smodel_path = './fusion_model_LK256_t2/spatial.pth.tar'
    print('temporal_model_path {}'.format(os.path.abspath(Tmodel_path)),
          file=f_re)
    print('spatial_model_path {}'.format(os.path.abspath(Smodel_path)),
          file=f_re)
    Tmodel_use = 'flow_mobilenet'  #flow_mobilenet, flow_vgg16
    Smodel_use = 'rgb_mobilenet'  #rgb_mobilenet, rgb_vgg16
    print('Temporal model backbone: {}'.format(Tmodel_use), file=f_re)
    print('Spatial model backbone: {}'.format(Smodel_use), file=f_re)

    start_frame = 0
    num_categories = 101

    model_start_time = time.time()
    temporal_net = models.__dict__[Tmodel_use](pretrained=False,
                                               num_classes=101)
    temporal_net.load_state_dict(torch.load(Tmodel_path)['state_dict'])
    temporal_net.cuda()
    temporal_net.eval()
    model_end_time = time.time()
    model_time = model_end_time - model_start_time
    print("Action recognition temporal model is loaded in %4.4f seconds." %
          (model_time))
    print("Action recognition temporal model is loaded in %4.4f seconds." %
          (model_time),
          file=f_re)

    model_start_time = time.time()
    spatial_net = models.__dict__[Smodel_use](pretrained=False,
                                              num_classes=101)
    spatial_net.load_state_dict(torch.load(Smodel_path)['state_dict'])
    spatial_net.cuda()
    spatial_net.eval()
    model_end_time = time.time()
    model_time = model_end_time - model_start_time
    print("Action recognition spatial model is loaded in %4.4f seconds." %
          (model_time))
    print("Action recognition spatial model is loaded in %4.4f seconds." %
          (model_time),
          file=f_re)

    Tval_file = "../../datasets/settings/ucf101/val_lk_flow_split1.txt"
    Sval_file = "../../datasets/settings/ucf101/val_rgb_split1.txt"
    print('temporal validation file = {}'.format(os.path.abspath(Tval_file)),
          file=f_re)
    print('spatial validation file = {}'.format(os.path.abspath(Sval_file)),
          file=f_re)
    f_Tval = open(Tval_file, "r")
    f_Sval = open(Sval_file, "r")
    val_Tlist = f_Tval.readlines()
    val_Slist = f_Sval.readlines()
    print("we got %d test videos" % len(val_Tlist))
    print("we got %d test videos" % len(val_Tlist), file=f_re)

    line_id = 1
    match_count = 0
    spatial_result_list = []
    temporal_result_list = []
    fusion_result_list = []
    ground_truth_list = []

    print("\nDetail Prediction:", file=f_re)
    for lineT, lineS in zip(val_Tlist, val_Slist):
        line_info = lineT.split(" ")
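        # flow_clip_root / rgb_clip_root are assumed to be module-level constants
        # holding the optical-flow and RGB frame directories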
        clip_Tpath = os.path.join(flow_clip_root, line_info[0])
        clip_Spath = os.path.join(rgb_clip_root, lineS.split(" ")[0])

        input_video_label = int(line_info[2])

        temporal_prediction = VideoTemporalPrediction(clip_Tpath, temporal_net,
                                                      num_categories,
                                                      start_frame)

        spatial_prediction = VideoSpatialPrediction(clip_Spath, spatial_net,
                                                    num_categories,
                                                    start_frame)

        avg_temporal_pred_fc8 = np.mean(temporal_prediction, axis=1)
        avg_spatial_pred_fc8 = np.mean(spatial_prediction, axis=1)
        avg_fusionByMean_pred = (avg_temporal_pred_fc8 +
                                 avg_spatial_pred_fc8) / 2
        temporal_result_list.append(avg_temporal_pred_fc8)
        spatial_result_list.append(avg_spatial_pred_fc8)
        fusion_result_list.append(avg_fusionByMean_pred)
        ground_truth_list.append(input_video_label)
        # avg_spatial_pred = softmax(avg_spatial_pred_fc8)

        spatial_pred_index = np.argmax(avg_spatial_pred_fc8)
        temporal_pred_index = np.argmax(avg_temporal_pred_fc8)
        fusion_pred_index = np.argmax(avg_fusionByMean_pred)

        print(
            "Sample %d/%d: GT: %d, Spatial Prediction: %d, Temporal Prediction: %d, Fusion Prediction: %d"
            % (line_id, len(val_Tlist), input_video_label, spatial_pred_index,
               temporal_pred_index, fusion_pred_index))
        print(
            "Sample %d/%d: GT: %d, Spatial Prediction: %d, Temporal Prediction: %d, Fusion Prediction: %d"
            % (line_id, len(val_Tlist), input_video_label, spatial_pred_index,
               temporal_pred_index, fusion_pred_index),
            file=f_re)
        amountList[input_video_label] += 1
        if fusion_pred_index == input_video_label:
            match_count += 1
            correctList[input_video_label] += 1
        line_id += 1

    print(match_count)
    print(len(val_Tlist))
    print("Accuracy {}/{}".format(match_count, len(val_Tlist)), file=f_re)
    print("Accuracy is %4.4f" % (float(match_count) / len(val_Tlist)))
    print("Accuracy is %4.4f" % (float(match_count) / len(val_Tlist)),
          file=f_re)

    print("\nPrediction Distribution:", file=f_re)
    for idx in range(len(idx2class)):
        if amountList[idx] != 0:
            print('{:<5} {:<20} {:>8} / {:<8} = {:4.4f}'.format(
                idx, idx2class[str(idx + 1)], correctList[idx],
                amountList[idx],
                float(correctList[idx]) / amountList[idx]))
            print('{:<5} {:<20} {:>8} / {:<8} = {:4.4f}'.format(
                idx, idx2class[str(idx + 1)], correctList[idx],
                amountList[idx],
                float(correctList[idx]) / amountList[idx]),
                  file=f_re)
        else:
            print('{:<5} {:<20} {:>8} / {:<8} = {:4.4f}'.format(
                idx, idx2class[str(idx + 1)], correctList[idx],
                amountList[idx], 0))
            print('{:<5} {:<20} {:>8} / {:<8} = {:4.4f}'.format(
                idx, idx2class[str(idx + 1)], correctList[idx],
                amountList[idx], 0),
                  file=f_re)

    np.savez('{}/ucf101_S{}T{}_fusion_result.npz'.format(
        os.path.abspath(report_file + '/../'), Smodel_use, Tmodel_use),
             correctList=np.array(correctList),
             amountList=np.array(amountList),
             spatialresultList=np.array(spatial_result_list),
             temporalresultList=np.array(temporal_result_list),
             fusionresultList=np.array(fusion_result_list),
             groundtruthList=np.array(ground_truth_list))

    f_re.close()
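
The saved .npz bundles every per-video score array together with the ground truth, so the fusion weights can be tuned offline. A quick inspection sketch (the file name follows the format string above for these two MobileNet backbones):

import numpy as np

data = np.load('ucf101_Srgb_mobilenetTflow_mobilenet_fusion_result.npz')
print(data['fusionresultList'].shape)   # (num_videos, num_classes)
print(data['groundtruthList'].shape)    # (num_videos,)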
Example No. 8
def main():
    args = parser.parse_args()

    data_dir = args.data_dir
    val_file = args.list_files
    ext_batch_sz = int(args.ext_batch_sz)
    int_batch_sz = int(args.int_batch_sz)
    start_instance = int(args.start_instance)
    end_instance = int(args.end_instance)
    checkpoint = args.checkpoint_path

    model_start_time = time.time()
    if args.architecture == "inception_v3":
        new_size = 299
        num_categories = (3528, 3468, 2048)
        spatial_net = models.inception_v3(pretrained=(checkpoint == ""),
                                          num_outputs=len(num_categories))
    else:  #resnet
        new_size = 224
        num_categories = (8192, 4096, 2048)
        spatial_net = models.resnet152(pretrained=(checkpoint == ""),
                                       num_outputs=len(num_categories))

    if os.path.isfile(checkpoint):
        print('loading checkpoint {} ...'.format(checkpoint))
        params = torch.load(checkpoint)
        model_dict = spatial_net.state_dict()

        # 1. filter out unnecessary keys
        pretrained_dict = {
            k: v
            for k, v in params['state_dict'].items() if k in model_dict
        }

        # 2. overwrite entries in the existing state dict
        model_dict.update(pretrained_dict)
        # 3. load the new state dict
        spatial_net.load_state_dict(model_dict)
        print('loaded')
    else:
        print(checkpoint)
        print('ERROR: No checkpoint found')

    spatial_net.cuda()
    spatial_net.eval()
    model_end_time = time.time()
    model_time = model_end_time - model_start_time
    print("Action recognition model is loaded in %4.4f seconds." %
          (model_time))

    f_val = open(val_file, "r")
    val_list = f_val.readlines()[start_instance:end_instance]
    print("we got %d test videos" % len(val_list))


    for line_id, line in enumerate(val_list):
        print("sample %d/%d" % (line_id + 1, len(val_list)))
        line_info = line.split(" ")
        clip_path = os.path.join(data_dir, line_info[0])
        num_frames = int(line_info[1])
        input_video_label = int(line_info[2])

        spatial_prediction = VideoSpatialPrediction(clip_path, spatial_net,
                                                    num_categories, num_frames,
                                                    ext_batch_sz, int_batch_sz,
                                                    new_size)

        for ii in range(len(spatial_prediction)):
            for vr_ind, vr in enumerate(spatial_prediction[ii]):
                folder_name = args.architecture + "_" + args.dataset + "_VR" + str(
                    ii)
                if not os.path.isdir(folder_name + '/' + line_info[0]):
                    print("creating folder: " + folder_name + "/" +
                          line_info[0])
                    os.makedirs(folder_name + "/" + line_info[0])
                vr_name = folder_name + '/' + line_info[
                    0] + '/vr_{0:02d}.png'.format(vr_ind)
                vr_gray = normalize_maxmin(vr.transpose()).transpose() * 255.
                cv2.imwrite(vr_name, vr_gray)
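
normalize_maxmin is not defined in this snippet; the transpose sandwich around it suggests it scales along one axis. A per-row min-max sketch, under that assumption:

import numpy as np

def normalize_maxmin(x):
    # scale each row of x into [0, 1]; the epsilon guards constant rows (assumed behavior)
    mn = x.min(axis=1, keepdims=True)
    mx = x.max(axis=1, keepdims=True)
    return (x - mn) / np.maximum(mx - mn, 1e-12)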
Example No. 9
def main():
    args = parser.parse_args()

    #model_path = '../../parameters/'+args.architecture+"/"+args.modality+'_s'+str(args.split)+'.pth.tar'
    #data_dir = '../datasets/'+args.dataset+'_frames'
    #data_dir = '/home/Datasets/UCF-101-OF_CPU'
    model_path = args.model_path
    data_dir = args.data_dir

    start_frame = 0
    if args.modality[:3] == 'rgb':
        num_samples = 25
    else:
        num_samples = 1
    num_categories = 51 if args.dataset == 'hmdb51' else 101

    model_start_time = time.time()
    params = torch.load(model_path)

    new_size = 224
    if args.architecture == "inception_v3":
        new_size = 299
        if args.modality == "rhythm":
            spatial_net = models.flow_inception_v3(pretrained=False,
                                                   channels=1,
                                                   num_classes=num_categories)
        else:
            spatial_net = models.rgb_inception_v3(pretrained=False,
                                                  channels=3,
                                                  num_classes=num_categories)
    else:
        if args.modality == "rhythm":
            spatial_net = models.flow_resnet152(pretrained=False,
                                                channels=1,
                                                num_classes=num_categories)
        else:
            spatial_net = models.rgb_resnet152(pretrained=False,
                                               channels=3,
                                               num_classes=num_categories)
    spatial_net.load_state_dict(params['state_dict'])
    spatial_net.cuda()
    spatial_net.eval()
    model_end_time = time.time()
    model_time = model_end_time - model_start_time
    print("Action recognition model is loaded in %4.4f seconds." %
          (model_time))

    val_file = "./splits/" + args.dataset + "/val_split%d.txt" % (args.split)
    #val_file = 'spatial_testlist01_with_labels.txt'
    f_val = open(val_file, "r")
    val_list = f_val.readlines()
    print("we got %d test videos" % len(val_list))

    line_id = 1
    match_count = 0

    result = []
    lines = [
        int(line.rstrip('\n'))
        for line in open('../datasets/settings/' + args.dataset +
                         '/direction.txt')
    ]
    for line in val_list:
        line_info = line.split(" ")
        clip_path = os.path.join(data_dir, line_info[0])
        num_frames = int(line_info[1])
        input_video_label = int(line_info[2])
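        # approach 3 looks the rhythm direction up per class label; any other value is used directly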
        spatial_prediction = VideoSpatialPrediction(
            args.modality, clip_path, spatial_net, num_categories, start_frame,
            num_frames, num_samples, args.vr_approach
            if args.vr_approach != 3 else lines[input_video_label], new_size)
        avg_spatial_pred_fc8 = np.mean(spatial_prediction, axis=1)
        result.append(avg_spatial_pred_fc8)
        # avg_spatial_pred = softmax(avg_spatial_pred_fc8)

        pred_index = np.argmax(avg_spatial_pred_fc8)

        print(args.modality + " split " + str(args.split) +
              ", sample %d/%d: GT: %d, Prediction: %d ==> correct: %d" %
              (line_id, len(val_list), input_video_label, pred_index,
               match_count))

        if pred_index == input_video_label:
            match_count += 1
        line_id += 1

    print(match_count)
    print(len(val_list))
    print("Accuracy is : %4.4f" % ((float(match_count) / len(val_list))))
    np.save(
        args.dataset + "_" + args.modality + "_" + args.architecture + "_s" +
        str(args.split) + ".npy", np.array(result))
Example No. 10
def main():
    idx2class = {}
    with open('../../datasets/ucf101_splits/classInd.txt', 'r') as f:
        for line in f.readlines():
            classNo, className = line.strip('\n').split(' ')
            idx2class[classNo] = className

    amountList = [0] * len(idx2class)
    correctList = [0] * len(idx2class)

    model_path = '../bestcase/SpatialModel/rgbM.pth.tar'

    report_file = os.path.abspath(model_path +
                                  '/../') + '/spatial_validation_report.txt'
    f_re = open(report_file, 'w', buffering=1)

    print('model_path {}'.format(os.path.abspath(model_path)), file=f_re)
    model_use = 'rgb_mobilenet'  #rgb_mobilenet, rgb_vgg16
    print('Model backbone: {}'.format(model_use), file=f_re)
    data_dir = "~/UCF101/frames"
    start_frame = 0
    num_categories = 101

    model_start_time = time.time()
    params = torch.load(model_path)

    #spatial_net = torchvision.models.mobilenet_v2(pretrained=False, num_classes=101)
    spatial_net = models.__dict__[model_use](pretrained=False, num_classes=101)
    spatial_net.load_state_dict(params['state_dict'])
    spatial_net.cuda()
    spatial_net.eval()
    model_end_time = time.time()
    model_time = model_end_time - model_start_time
    print("Action recognition model is loaded in %4.4f seconds." %
          (model_time))
    print("Action recognition temporal model is loaded in %4.4f seconds." %
          (model_time),
          file=f_re)

    val_file = "./spatial_testlist01_with_labels.txt"
    print('validation file = {}'.format(os.path.abspath(val_file)), file=f_re)
    f_val = open(val_file, "r")
    val_list = f_val.readlines()
    print("we got %d test videos" % len(val_list))
    print("we got %d test videos" % len(val_list), file=f_re)

    line_id = 1
    match_count = 0
    result_list = []

    print("\nDetail Prediction:\n", file=f_re)

    for line in val_list:
        line_info = line.split(" ")
        clip_path = line_info[0]
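        # list-file labels are 1-indexed (matching classInd.txt); shift to 0-indexed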
        input_video_label = int(line_info[1]) - 1

        spatial_prediction = VideoSpatialPrediction(clip_path, spatial_net,
                                                    num_categories,
                                                    start_frame)

        avg_spatial_pred_fc8 = np.mean(spatial_prediction, axis=1)
        # print(avg_spatial_pred_fc8.shape)
        result_list.append(avg_spatial_pred_fc8)
        # avg_spatial_pred = softmax(avg_spatial_pred_fc8)

        pred_index = np.argmax(avg_spatial_pred_fc8)
        print("Sample %d/%d: GT: %d, Prediction: %d" %
              (line_id, len(val_list), input_video_label, pred_index))
        print("Sample %d/%d: GT: %d, Prediction: %d" %
              (line_id, len(val_list), input_video_label, pred_index),
              file=f_re)
        amountList[input_video_label] += 1

        if pred_index == input_video_label:
            match_count += 1
            correctList[input_video_label] += 1
        line_id += 1

    print(match_count)
    print(len(val_list))
    print("Accuracy {}/{}".format(match_count, len(val_list)), file=f_re)
    print("Accuracy is %4.4f" % (float(match_count) / len(val_list)))
    print("Accuracy is %4.4f" % (float(match_count) / len(val_list)),
          file=f_re)

    print("\nPrediction Distribution:\n", file=f_re)
    for idx in range(len(idx2class)):
        if amountList[idx] != 0:
            print('{:<5} {:<20} {:>8} / {:<8} = {:4.4f}'.format(
                idx, idx2class[str(idx + 1)], correctList[idx],
                amountList[idx],
                float(correctList[idx]) / amountList[idx]))
            print('{:<5} {:<20} {:>8} / {:<8} = {:4.4f}'.format(
                idx, idx2class[str(idx + 1)], correctList[idx],
                amountList[idx],
                float(correctList[idx]) / amountList[idx]),
                  file=f_re)
        else:
            print('{:<5} {:<20} {:>8} / {:<8} = {:4.4f}'.format(
                idx, idx2class[str(idx + 1)], correctList[idx],
                amountList[idx], 0))
            print('{:<5} {:<20} {:>8} / {:<8} = {:4.4f}'.format(
                idx, idx2class[str(idx + 1)], correctList[idx],
                amountList[idx], 0),
                  file=f_re)
    np.savez(os.path.abspath(report_file + '/../') +
             '/ucf101_{}_model_result.npz'.format(model_use),
             correctList=np.array(correctList),
             amountList=np.array(amountList),
             resultList=np.array(result_list))
    #np.save("ucf101_s1_mobilenet_rgb.npy", np.array(result_list))
    f_re.close()
Example No. 11
def main():

    model_path = '/home/thl/Desktop/challeng/checkpoints/675_checkpoint.pth.tar'
    class_name_file = '/home/thl/Desktop/challeng/datasets/settings/class_name.txt'
    class_list = []
    for line in open(class_name_file, "r"):
        class_list.append(line)

    start_frame = 0
    num_categories = 90

    model_start_time = time.time()
    params = torch.load(model_path)

    spatial_net = models.rgb_vgg16(pretrained=False, num_classes=90)
    if torch.cuda.is_available():
        spatial_net = torch.nn.DataParallel(spatial_net)
    spatial_net.load_state_dict(params['state_dict'])
    spatial_net.cuda()
    spatial_net.eval()
    model_end_time = time.time()
    model_time = model_end_time - model_start_time
    print("Action recognition model is loaded in %4.4f seconds." % (model_time))

    val_file_dir = '/home/thl/Desktop/challeng/datasets/settings/test_set.txt'
    val_list = []
    for line in open(val_file_dir, "r"):
        val_list.append(line)

    print("we got %d test videos" % len(val_list))

    line_id = 1

    result_list = []
    for line in val_list:
        clip_path = '/home/thl/Desktop/challeng/datasets/frame_and_flow/test/' + line[:-1]
        spatial_prediction = VideoSpatialPrediction(
                clip_path,
                spatial_net,
                num_categories,
                start_frame)

        final_lab, final_num = def_my_result(spatial_prediction, layers=1)
        # avg_spatial_pred_fc8 = np.mean(spatial_prediction, axis=1)
        # final_softmax = softmax(final_num/sum(final_num))
        final_softmax = final_num / sum(final_num)
        write_json(line[:-1], final_lab, final_softmax,class_list)
        # result_list.append(avg_spatial_pred_fc8)

        # pred_index = np.argmax(avg_spatial_pred_fc8)

        # print(final_lab,"   ",final_softmax)
        print_score = [float('%.2f' % final_softmax[i]) for i in range(5)]

        print(final_lab, print_score, ' ', line_id, ' / ', len(val_list), '  video ')
        line_id += 1
    print(len(val_list))
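    # final_result is assumed to be a module-level dict that write_json() fills in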
    with open("./result.json", "w") as file:
        json.dump(final_result, file)
        file.close()
Example No. 12

def main():
    args = parser.parse_args()

    model_path = '../../checkpoints/'+args.modality+'_s'+str(args.split)+'.pth.tar'
    data_dir = '../../datasets/ucf101_frames'        
    
    start_frame = 0
    if args.modality[:3] == 'rgb':
        num_samples = 25
    else:
        num_samples = 1
    num_categories = 101

    model_start_time = time.time()
    params = torch.load(model_path)

    spatial_net = models.rgb_resnet152(pretrained=False, num_classes=101)
    spatial_net.load_state_dict(params['state_dict'])
    spatial_net.cuda()
    spatial_net.eval()
    model_end_time = time.time()
    model_time = model_end_time - model_start_time
    print("Action recognition model is loaded in %4.4f seconds." % (model_time))


    val_file = "./splits/val_split%d.txt"%(args.split)
    f_val = open(val_file, "r")
    val_list = f_val.readlines()
    print("we got %d test videos" % len(val_list))

    line_id = 1
    match_count = 0

    result = []

    for line in val_list:
        line_info = line.split(" ")
        clip_path = os.path.join(data_dir, line_info[0])
        num_frames = int(line_info[1])
        input_video_label = int(line_info[2])
        spatial_prediction = VideoSpatialPrediction(
                "rgb" if args.modality=='rgb2' else args.modality,
                clip_path,
                spatial_net,
                num_categories,
                start_frame,
                num_frames,
                num_samples
                )
        avg_spatial_pred_fc8 = np.mean(spatial_prediction, axis=1)
        result.append(avg_spatial_pred_fc8)
        # avg_spatial_pred = softmax(avg_spatial_pred_fc8)

        pred_index = np.argmax(avg_spatial_pred_fc8)
        
        print(args.modality+" split "+str(args.split)+", sample %d/%d: GT: %d, Prediction: %d" % (line_id, len(val_list), input_video_label, pred_index))

        if pred_index == input_video_label:
            match_count += 1
        line_id += 1

    print(match_count)
    print(len(val_list))
    print("Accuracy is : %4.4f" % ((float(match_count)/len(val_list))))
    np.save("ucf101_"+args.modality+"_resnet152_s"+str(args.split)+".npy", np.array(result))
Example No. 13
def main():

    # model_path = '../../checkpoints/model_best.pth.tar'
    model_path = '/home/thl/Desktop/challeng/checkpoints/40batch/model_best.pth.tar'
    # data_dir = "~/UCF101/frames"
    data_dir = '/home/thl/Desktop/challeng/datasets/frame_and_flow'

    start_frame = 0
    num_categories = 90

    model_start_time = time.time()
    params = torch.load(model_path)

    # spatial_net = models.rgb_resnet152(pretrained=False, num_classes=101)

    spatial_net = models.rgb_vgg16(pretrained=False, num_classes=90)
    if torch.cuda.is_available():
        spatial_net = torch.nn.DataParallel(spatial_net)
    spatial_net.load_state_dict(params['state_dict'])
    spatial_net.cuda()
    spatial_net.eval()
    model_end_time = time.time()
    model_time = model_end_time - model_start_time
    print("Action recognition model is loaded in %4.4f seconds." %
          (model_time))

    # val_file = "./testlist01_with_labels.txt"

    # val_file_dir ='./spatial_testlist01_with_labels.txt'
    val_file_dir = '/home/thl/Desktop/challeng/datasets/settings/val_set_detail.csv'

    # frame_base_dir = '/home/thl/Desktop/smart_city/datasets/ucf101_frames_flow/'
    frame_base_dir = '/home/thl/Desktop/challeng/datasets/frame_and_flow/val/'

    val_list = return_val_list(val_file_dir, frame_base_dir)
    print("we got %d test videos" % len(val_list))

    line_id = 1
    match_count = 0
    result_list = []
    for line in val_list:
        line_info = line.split(" ")
        clip_path = line_info[0]
        input_video_label = int(line_info[1])

        spatial_prediction = VideoSpatialPrediction(clip_path, spatial_net,
                                                    num_categories,
                                                    start_frame)

        final_lab, final_num = def_my_result(spatial_prediction, layers=2)
        avg_spatial_pred_fc8 = np.mean(spatial_prediction, axis=1)

        # print(avg_spatial_pred_fc8.shape)
        result_list.append(avg_spatial_pred_fc8)
        # avg_spatial_pred = softmax(avg_spatial_pred_fc8)

        pred_index = np.argmax(avg_spatial_pred_fc8)
        print("Sample %d/%d: GT: %d, Prediction: %d" %
              (line_id, len(val_list), input_video_label, pred_index),
              final_lab)

        if pred_index == input_video_label:
            match_count += 1
        line_id += 1

    print(match_count)
    print(len(val_list))
    print("Accuracy is %4.4f" % (float(match_count) / len(val_list)))
    np.save("ucf101_s1_rgb_resnet152.npy", np.array(result_list))