Пример #1
0
    def run_model(self, V, gpu_devices):
        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            gpu_vecs_gathered = []
            gpu_vecs = []
            for num, vec in enumerate(self.vecs):
                gpu_vec = model.param_init_net.CopyCPUToGPU(
                    vec,
                    'gpuvec_{}'.format(num),
                )
                if num != 2:
                    model.params.append(gpu_vec)
                gpu_vecs.append(gpu_vec)
            for num, gpu_vec in enumerate(gpu_vecs):
                gpu_vec_gathered = model.net.Gather(
                    [gpu_vec, 'indices'], ['gpu_vec_gathered_{}'.format(num)])
                gpu_vecs_gathered.append(gpu_vec_gathered)

            assert len(gpu_vecs_gathered) == 3

            fc = model.net.FC(
                [
                    gpu_vecs_gathered[2],
                    gpu_vecs_gathered[0],
                    gpu_vecs_gathered[1],
                ],
                ['fc'],
            )
            _, loss = model.net.SoftmaxWithLoss(
                [fc, 'label'],
                ['ce_loss', 'avg_loss'],
                only_loss=True,
            )
            loss = model.Scale(loss, scale=loss_scale)
            model.net.Print(loss, [], limit=10)
            return [loss]

        def param_update_fun(model):
            ONE = model.param_init_net.ConstantFill(
                [],
                "ONE",
                shape=[1],
                value=1.0,
            )
            LR = model.CopyCPUToGPU(self.LR, "LR")
            for param in model.GetParams():
                param_grad = model.param_to_grad[param]
                if not isinstance(param_grad, core.GradientSlice):
                    model.WeightedSum([param, ONE, param_grad, LR], param)
                else:
                    model.net.ScatterWeightedSum(
                        [
                            param,
                            ONE,
                            param_grad.indices,
                            param_grad.values,
                            ONE,
                        ],
                        param,
                    )

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            order="NHWC",
            name="sparse_test{}".format(gpu_devices),
        )
        batch_size = 32
        batch_per_device = batch_size // len(gpu_devices)

        with core.NameScope("cpu"):
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                self.ITER = model.Iter("ITER")
                self.LR = model.net.LearningRate(
                    [self.ITER],
                    "LR",
                    base_lr=(-0.1),
                    policy="fixed",
                )
                '''
                self.vecs consists of 3 big blobs on which we call Gather:
                1) FC weights, shape=(V, 16)
                2) FC bias, shape=(V)
                3) FC input, shape=(batch_per_device, 16)
                '''
                self.vecs = [
                    model.param_init_net.UniformFill([],
                                                     "vec_{}".format(num),
                                                     shape=[V, 16])
                    for num in range(2)
                ]
                self.vecs.append(
                    model.param_init_net.UniformFill(
                        [], "vec_2", shape=[batch_per_device, 16]))
                self.ONE_CPU = model.param_init_net.ConstantFill(
                    [],
                    "ONE_CPU",
                    shape=[1],
                    value=1.0,
                )

        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            param_update_builder_fun=param_update_fun,
            devices=gpu_devices,
        )

        # Update the vecs
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
            for num, vec in enumerate(self.vecs[:-1]):
                model.CopyGPUToCPU("gpu_0/gpuvec_{}".format(num), vec)

        # Each run has same input, independent of number of gpus
        for i in range(0, 10):
            np.random.seed(2603)
            full_indices = np.random.permutation(V)[:batch_size].reshape(
                batch_size)
            full_labels = full_indices[:] % batch_per_device

            for (j, g) in enumerate(gpu_devices):
                st = j * batch_per_device
                en = st + batch_per_device
                indices = full_indices[st:en].astype(np.int32)
                labels = full_labels[st:en].astype(np.int32)

                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                    workspace.FeedBlob("gpu_{}/indices".format(g), indices)
                    workspace.FeedBlob("gpu_{}/label".format(g), labels)

            if i == 0:
                workspace.RunNetOnce(model.param_init_net)
                # Force vecs to be same on all runs
                orig_vecs = [
                    np.random.rand(V, 16).astype(np.float32),
                    np.random.rand(V).astype(np.float32),
                    np.random.rand(V, 16).astype(np.float32),
                ]
                for vec, orig_vec in zip(self.vecs, orig_vecs):
                    workspace.FeedBlob(vec, orig_vec)
                for g in gpu_devices:
                    for num, orig_vec in enumerate(orig_vecs):
                        workspace.FeedBlob(
                            "gpu_{}/gpuvec_{}".format(g, num),
                            orig_vec,
                            device_option=core.DeviceOption(
                                caffe2_pb2.CUDA, g),
                        )
                workspace.CreateNet(model.net)

            workspace.RunNet(model.net.Proto().name)

            idx = workspace.FetchBlob('gpu_0/indices')
            grad_slices = [
                workspace.FetchBlob('gpu_{}/gpu_vec_gathered_{}_grad'.format(
                    g, num)) for g in gpu_devices for num in range(2)
            ]
            for grad_slice in grad_slices:
                # print (len(idx), len(grad_slice))
                assert len(idx) == len(grad_slice), (
                    'Number of indices {} is not same as number of gradient '
                    'slices {}. This might lead to illegal memory access'.
                    format(len(idx), len(grad_slice)))
Пример #2
0
def Test(args):
    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))
    #========================================================
    for db_name, db_size in data_base.items():
        workspace.ResetWorkspace()
        print(workspace.Blobs())

        def create_mobilenet_model_ops(model, loss_scale):
            [softmax, loss] = mobilenet.create_mobilenet(
                model,
                "data",
                num_input_channels=args.num_channels,
                num_labels=args.num_labels,
                label="label")
            # loss = model.Scale(loss, scale=loss_scale)
            brew.accuracy(model, [softmax, "label"], "accuracy")
            # return [loss]

        log.info("----- Create test net ----")
        test_arg_scope = {
            'order': "NCHW",
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
        }
        test_model = model_helper.ModelHelper(
            name="mobilenet_test",
            arg_scope=test_arg_scope  #, init_params=False
        )

        test_data_db = os.path.join(data_folder, db_name)
        print(test_data_db, db_size)

        test_reader = test_model.CreateDB(
            "test_reader",
            db=test_data_db,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=args.batch_size,
                img_size=args.image_size,
            )

        data_parallel_model.Parallelize_GPU(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_mobilenet_model_ops,
            param_update_builder_fun=None,
            devices=gpus,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)
        # load the pre-trained model and mobilenet epoch

        LoadModel(args.load_model_path, test_model)
        data_parallel_model.FinalizeAfterCheckpoint(test_model)

        (test_max_softmax, test_max_index,
         test_label) = RunEpoch(args, db_size, test_model, args.batch_size)
        # flag_accuracy = (test_label == test_max_index).astype(np.int)
        # print(flag_accuracy.mean())

        save_path = os.path.join(root_folder, db_name[:-5] + ".npz")
        print(save_path)
        np.savez(save_path,
                 max_softmax=test_max_softmax,
                 max_index=test_max_index,
                 label=test_label)
        if os.path.exists(save_path):
            print("OK")
Пример #3
0
 def test_device_scope_check(self):
     with self.assertRaises(AssertionError):
         with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
             data_parallel_model.Parallelize_GPU(None, None, None)
Пример #4
0
    def run_model(self, V, gpu_devices, cpu_indices):
        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            if cpu_indices:
                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                    gathered_cpu = model.net.Gather([self.vecs, 'indices'],
                                                    'gathered_cpu')

                gathered = model.CopyCPUToGPU(gathered_cpu, "gathered")
            else:
                gpu_vecs = model.param_init_net.CopyCPUToGPU(
                    self.vecs,
                    "gpuvecs",
                )
                model.params.append(gpu_vecs)
                gathered = model.net.Gather([gpu_vecs, 'indices'], 'gathered')
            flattened = model.Flatten(gathered, "flattened")
            fc = model.FC(flattened, "fc", 16 * 16, 1, ("ConstantFill", {}),
                          ("ConstantFill", {}))
            fc_fl = model.FlattenToVec(fc, "fc_fl")
            sigm = model.Sigmoid(fc_fl, "sigm")
            sq = model.SquaredL2Distance([sigm, "label"], "sq")
            loss = model.AveragedLoss(sq, "loss")
            loss = model.Scale(loss, scale=loss_scale)
            return [loss]

        def param_update_fun(model):
            ONE = model.param_init_net.ConstantFill(
                [],
                "ONE",
                shape=[1],
                value=1.0,
            )
            LR = model.CopyCPUToGPU(self.LR, "LR")
            for param in model.GetParams():
                param_grad = model.param_to_grad[param]
                if not isinstance(param_grad, core.GradientSlice):
                    model.WeightedSum([param, ONE, param_grad, LR], param)
                else:
                    param_momentum = model.param_init_net.ConstantFill(
                        [param],
                        param + '_momentum',
                        value=0.0,
                    )
                    model.net.SparseMomentumSGDUpdate(
                        [
                            param_grad.values,
                            param_momentum,
                            LR,
                            param,
                            param_grad.indices,
                        ],
                        [param_grad.values, param_momentum, param],
                        momentum=0.1,
                        nesterov=0,
                    )

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            order="NHWC",
            name="sparse_test{}".format(gpu_devices),
        )

        with core.NameScope("cpu"):
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                self.ITER = model.Iter("ITER")
                self.LR = model.net.LearningRate(
                    [self.ITER],
                    "LR",
                    base_lr=(-0.1),
                    policy="fixed",
                )
                self.vecs = model.param_init_net.UniformFill([],
                                                             "vecs",
                                                             shape=[V, 16])
                if cpu_indices:
                    model.params.append(self.vecs)
                self.ONE_CPU = model.param_init_net.ConstantFill(
                    [],
                    "ONE_CPU",
                    shape=[1],
                    value=1.0,
                )

        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            param_update_builder_fun=param_update_fun,
            devices=gpu_devices,
        )

        # Update the vecs
        if cpu_indices:
            with core.NameScope("cpu"):
                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                    for param in model.GetParams():
                        param_grad = model.param_to_grad[param]
                        model.ScatterWeightedSum([
                            param, self.ONE_CPU, param_grad.indices,
                            param_grad.values, self.LR
                        ], self.vecs)
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
                model.CopyGPUToCPU("gpu_0/gpuvecs", self.vecs)

        np.random.seed(2603)

        # Each run has same input, independent of number of gpus
        batch_size = 64
        for i in range(0, 10):
            full_indices = np.random.permutation(V)[:batch_size * 16].reshape(
                batch_size, 16)
            full_labels = full_indices[:, 0] % 2
            batch_per_device = batch_size // len(gpu_devices)

            for (j, g) in enumerate(gpu_devices):
                st = j * batch_per_device
                en = st + batch_per_device
                indices = full_indices[st:en, :].astype(np.int32)
                labels = full_labels[st:en].astype(np.float32)

                device_for_indices = core.DeviceOption(caffe2_pb2.CPU)
                if not cpu_indices:
                    device_for_indices = core.DeviceOption(caffe2_pb2.CUDA, g)

                with core.DeviceScope(device_for_indices):
                    workspace.FeedBlob("gpu_{}/indices".format(g), indices)

                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                    workspace.FeedBlob("gpu_{}/label".format(g), labels)

            if i == 0:
                workspace.RunNetOnce(model.param_init_net)
                # Force vecs to be same on all runs
                orig_vecs = np.random.rand(V, 16).astype(np.float32)
                workspace.FeedBlob(self.vecs, orig_vecs)
                if not cpu_indices:
                    for g in gpu_devices:
                        workspace.FeedBlob(
                            "gpu_{}/gpuvecs".format(g),
                            orig_vecs,
                            device_option=core.DeviceOption(
                                caffe2_pb2.CUDA, g),
                        )
                workspace.CreateNet(model.net)

            workspace.RunNet(model.net.Proto().name)
            if len(gpu_devices) == 2:
                if not cpu_indices:
                    idx = workspace.FetchBlob("gpu_0/indices")
                    idx = list(idx.flatten())
                    n = len(idx)
                    nu = len(set(idx))
                    assert n == nu, "We cannot have duplicate indices"

        # Sanity check to see the vecs were updated
        self.assertFalse(np.allclose(workspace.FetchBlob(self.vecs),
                                     orig_vecs))
        return [
            workspace.FetchBlob(self.vecs if cpu_indices else "gpu_0/gpuvecs"),
            workspace.FetchBlob("gpu_0/fc_w")
        ]
Пример #5
0
def Train(args):
    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Round down epoch size to closest multiple of batch size across machines
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)
    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create ModelHelper object
    train_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustice_search': True,
        'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
    }
    train_model = model_helper.ModelHelper(
        name="resnet50", arg_scope=train_arg_scope
    )

    num_shards = args.num_shards
    shard_id = args.shard_id
    if num_shards > 1:
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate", [], [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                )
            )
        else:
            # Use filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate", [], [store_handler],
                    path=args.file_store_path,
                )
            )
        rendezvous = dict(
            kv_handler=store_handler,
            shard_id=shard_id,
            num_shards=num_shards,
            engine="GLOO",
            exit_nets=None)
    else:
        rendezvous = None

    # Model building functions
    def create_resnet50_model_ops(model, loss_scale):
        [softmax, loss] = resnet.create_resnet50(
            model,
            "data",
            num_input_channels=args.num_channels,
            num_labels=args.num_labels,
            label="label",
            no_bias=True,
        )
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy")
        return [loss]

    def add_optimizer(model):
        stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)
        optimizer.add_weight_decay(model, args.weight_decay)
        optimizer.build_sgd(
            model,
            args.base_learning_rate,
            momentum=0.9,
            nesterov=1,
            policy="step",
            stepsize=stepsz,
            gamma=0.1
        )

    # Input. Note that the reader must be shared with all GPUS.
    reader = train_model.CreateDB(
        "reader",
        db=args.train_data,
        db_type=args.db_type,
        num_shards=num_shards,
        shard_id=shard_id,
    )

    def add_image_input(model):
        AddImageInput(
            model,
            reader,
            batch_size=batch_per_device,
            img_size=args.image_size,
        )

    # Create parallelized model
    data_parallel_model.Parallelize_GPU(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_resnet50_model_ops,
        optimizer_builder_fun=add_optimizer,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=True,
    )

    # Add test model, if specified
    test_model = None
    if (args.test_data is not None):
        log.info("----- Create test net ----")
        test_arg_scope = {
            'order': "NCHW",
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
        }
        test_model = model_helper.ModelHelper(
            name="resnet50_test", arg_scope=test_arg_scope
        )

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
            )

        data_parallel_model.Parallelize_GPU(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_resnet50_model_ops,
            param_update_builder_fun=None,
            devices=gpus,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model)

        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        # reset epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "resnet50_gpu%d_b%d_L%d_lr%.2f_v2" % (
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )
    explog = experiment_util.ModelTrainerLog(expname, args)

    # Run the training one epoch a time
    while epoch < args.num_epochs:
        epoch = RunEpoch(
            args,
            epoch,
            train_model,
            test_model,
            total_batch_size,
            num_shards,
            expname,
            explog
        )

        # Save the model for each epoch
        SaveModel(args, train_model, epoch)

        model_path = "%s/%s_" % (
            args.file_store_path,
            args.save_model_name
        )
        # remove the saved model from the previous epoch if it exists
        if os.path.isfile(model_path + str(epoch - 1) + ".mdl"):
            os.remove(model_path + str(epoch - 1) + ".mdl")
Пример #6
0
def Train(args):
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = range(args.num_gpus)
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Modify to make it consistent with the distributed trainer
    total_batch_size = args.batch_size * num_gpus
    batch_per_device = args.batch_size

    # Round down epoch size to closest multiple of batch size across machines
    epoch_iters = int(args.epoch_size / total_batch_size)
    args.epoch_size = epoch_iters * total_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create CNNModeLhelper object
    train_model = cnn.CNNModelHelper(
        order="NCHW",
        name='{}_train'.format(args.model_name),
        use_cudnn=(True if args.use_cudnn == 1 else False),
        cudnn_exhaustive_search=True,
        ws_nbytes_limit=(args.cudnn_workspace_limit_mb * 1024 * 1024),
    )

    # Model building functions
    def create_model_ops(model, loss_scale):
        return model_builder.build_model(
            model=model,
            model_name=args.model_name,
            model_depth=args.model_depth,
            num_labels=args.num_labels,
            batch_size=args.batch_size,
            num_channels=args.num_channels,
            crop_size=args.crop_size,
            clip_length=(
                args.clip_length_of if args.input_type
                else args.clip_length_rgb
            ),
            loss_scale=loss_scale,
            pred_layer_name=args.pred_layer_name,
            multi_label=args.multi_label,
            channel_multiplier=args.channel_multiplier,
            bottleneck_multiplier=args.bottleneck_multiplier,
            use_dropout=args.use_dropout,
            conv1_temporal_stride=args.conv1_temporal_stride,
            conv1_temporal_kernel=args.conv1_temporal_kernel,
            use_pool1=args.use_pool1,
        )

    # SGD
    def add_parameter_update_ops(model):
        model.AddWeightDecay(args.weight_decay)
        ITER = model.Iter("ITER")
        stepsz = args.step_epoch * args.epoch_size / args.batch_size / num_gpus
        LR = model.net.LearningRate(
            [ITER],
            "LR",
            base_lr=args.base_learning_rate * num_gpus,
            policy="step",
            stepsize=int(stepsz),
            gamma=args.gamma,
        )
        AddMomentumParameterUpdate(model, LR)

    # Input. Note that the reader must be shared with all GPUS.
    train_reader, train_examples = reader_utils.create_data_reader(
        train_model,
        name="train_reader",
        input_data=args.train_data,
    )
    log.info("Training set has {} examples".format(train_examples))

    def add_video_input(model):
        model_helper.AddVideoInput(
            model,
            train_reader,
            batch_size=batch_per_device,
            length_rgb=args.clip_length_rgb,
            clip_per_video=1,
            random_mirror=True,
            decode_type=0,
            sampling_rate_rgb=args.sampling_rate_rgb,
            scale_h=args.scale_h,
            scale_w=args.scale_w,
            crop_size=args.crop_size,
            video_res_type=args.video_res_type,
            short_edge=min(args.scale_h, args.scale_w),
            num_decode_threads=args.num_decode_threads,
            do_multi_label=args.multi_label,
            num_of_class=args.num_labels,
            random_crop=True,
            input_type=args.input_type,
            length_of=args.clip_length_of,
            sampling_rate_of=args.sampling_rate_of,
            frame_gap_of=args.frame_gap_of,
            do_flow_aggregation=args.do_flow_aggregation,
            flow_data_type=args.flow_data_type,
            get_rgb=(args.input_type == 0),
            get_optical_flow=(args.input_type == 1),
            get_video_id=args.get_video_id,
            jitter_scales=[int(n) for n in args.jitter_scales.split(',')],
            use_local_file=args.use_local_file,
        )

    # Create parallelized model
    data_parallel_model.Parallelize_GPU(
        train_model,
        input_builder_fun=add_video_input,
        forward_pass_builder_fun=create_model_ops,
        param_update_builder_fun=add_parameter_update_ops,
        devices=gpus,
        rendezvous=None,
        net_type=('prof_dag' if args.profiling == 1 else 'dag'),
        optimize_gradient_memory=True,
    )

    # Add test model, if specified
    test_model = None
    if args.test_data is not None:
        log.info("----- Create test net ----")
        test_model = cnn.CNNModelHelper(
            order="NCHW",
            name='{}_test'.format(args.model_name),
            use_cudnn=(True if args.use_cudnn == 1 else False),
            cudnn_exhaustive_search=True
        )

        test_reader, test_examples = reader_utils.create_data_reader(
            test_model,
            name="test_reader",
            input_data=args.test_data,
        )

        log.info("Testing set has {} examples".format(test_examples))

        def test_input_fn(model):
            model_helper.AddVideoInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                length_rgb=args.clip_length_rgb,
                clip_per_video=1,
                decode_type=0,
                random_mirror=False,
                random_crop=False,
                sampling_rate_rgb=args.sampling_rate_rgb,
                scale_h=args.scale_h,
                scale_w=args.scale_w,
                crop_size=args.crop_size,
                video_res_type=args.video_res_type,
                short_edge=min(args.scale_h, args.scale_w),
                num_decode_threads=args.num_decode_threads,
                do_multi_label=args.multi_label,
                num_of_class=args.num_labels,
                input_type=args.input_type,
                length_of=args.clip_length_of,
                sampling_rate_of=args.sampling_rate_of,
                frame_gap_of=args.frame_gap_of,
                do_flow_aggregation=args.do_flow_aggregation,
                flow_data_type=args.flow_data_type,
                get_rgb=(args.input_type == 0),
                get_optical_flow=(args.input_type == 1),
                get_video_id=args.get_video_id,
                use_local_file=args.use_local_file,
            )

        data_parallel_model.Parallelize_GPU(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_model_ops,
            param_update_builder_fun=None,
            devices=gpus,
            optimize_gradient_memory=True,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        if args.db_type == 'pickle':
            model_loader.LoadModelFromPickleFile(
                train_model,
                args.load_model_path,
                use_gpu=True,
                root_gpu_id=gpus[0]
            )
        else:
            model_helper.LoadModel(
                args.load_model_path, args.db_type
            )
        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(
            train_model,
            GetCheckpointParams(train_model),
        )

        if args.is_checkpoint:
            # reset epoch. load_model_path should end with *_X.mdl,
            # where X is the epoch number
            last_str = args.load_model_path.split('_')[-1]
            if last_str.endswith('.mdl'):
                epoch = int(last_str[:-4])
                log.info("Reset epoch to {}".format(epoch))
            else:
                log.warning("The format of load_model_path doesn't match!")

    expname = "%s_gpu%d_b%d_L%d_lr%.2f" % (
        args.model_name,
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )
    explog = experiment_util.ModelTrainerLog(expname, args)

    # Run the training one epoch a time
    while epoch < args.num_epochs:
        epoch = RunEpoch(
            args,
            epoch,
            train_model,
            test_model,
            total_batch_size,
            1,
            expname,
            explog
        )

        # Save the model for each epoch
        SaveModel(args, train_model, epoch)
Пример #7
0
    train_x, test_x = normalization(train_x, test_x)

    devices = list(range(0, args.gpus))
    print(devices)
    arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
    }

    train_model = model_helper.ModelHelper(name="train_net",
                                           arg_scope=arg_scope)

    data_parallel_model.Parallelize_GPU(
        train_model,
        input_builder_fun=input_builder_fun,
        forward_pass_builder_fun=model_build_fun_train,
        optimizer_builder_fun=add_optimizer,
        devices=devices,
    )

    test_model = model_helper.ModelHelper(name="test_net",
                                          arg_scope=arg_scope,
                                          init_params=False)

    data_parallel_model.Parallelize_GPU(
        test_model,
        input_builder_fun=input_builder_fun,
        forward_pass_builder_fun=model_build_fun_test,
        optimizer_builder_fun=None,
        devices=devices,
    )
Пример #8
0
def Test(args):
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = range(args.num_gpus)
        num_gpus = args.num_gpus

    if num_gpus > 0:
        total_batch_size = args.batch_size * num_gpus
        log.info("Running on GPUs: {}".format(gpus))
        log.info("total_batch_size: {}".format(total_batch_size))
    else:
        total_batch_size = args.batch_size
        log.info("Running on CPU")
        log.info("total_batch_size: {}".format(total_batch_size))

    video_input_args = dict(
        batch_size=args.batch_size,
        clip_per_video=args.clip_per_video,
        decode_type=1,
        length_rgb=args.clip_length_rgb,
        sampling_rate_rgb=args.sampling_rate_rgb,
        scale_h=args.scale_h,
        scale_w=args.scale_w,
        crop_size=args.crop_size,
        video_res_type=args.video_res_type,
        short_edge=min(args.scale_h, args.scale_w),
        num_decode_threads=args.num_decode_threads,
        do_multi_label=args.multi_label,
        num_of_class=args.num_labels,
        random_mirror=False,
        random_crop=False,
        input_type=args.input_type,
        length_of=args.clip_length_of,
        sampling_rate_of=args.sampling_rate_of,
        frame_gap_of=args.frame_gap_of,
        do_flow_aggregation=args.do_flow_aggregation,
        flow_data_type=args.flow_data_type,
        get_rgb=(args.input_type == 0 or args.input_type >= 3),
        get_optical_flow=(args.input_type == 1 or args.input_type >= 4),
        get_logmels=(args.input_type >= 2),
        use_local_file=args.use_local_file,
        crop_per_clip=args.crop_per_clip,
    )

    reader_args = dict(
        name="test_reader",
        input_data=args.test_data,
    )

    # Model building functions
    def create_model_ops(model, loss_scale):
        return model_builder.build_model(
            model=model,
            model_name=args.model_name,
            model_depth=args.model_depth,
            num_labels=args.num_labels,
            batch_size=args.batch_size * args.clip_per_video *
            args.crop_per_clip,
            num_channels=args.num_channels,
            crop_size=args.crop_size,
            clip_length=(args.clip_length_of
                         if args.input_type == 1 else args.clip_length_rgb),
            loss_scale=loss_scale,
            is_test=1,
            pred_layer_name=args.pred_layer_name,
            multi_label=args.multi_label,
            channel_multiplier=args.channel_multiplier,
            bottleneck_multiplier=args.bottleneck_multiplier,
            use_dropout=args.use_dropout,
            conv1_temporal_stride=args.conv1_temporal_stride,
            conv1_temporal_kernel=args.conv1_temporal_kernel,
            use_convolutional_pred=args.use_convolutional_pred,
            use_pool1=args.use_pool1,
        )

    test_model = cnn.CNNModelHelper(
        order="NCHW",
        name="video_model_test",
        use_cudnn=(True if args.use_cudnn == 1 else False),
        cudnn_exhaustive_search=True,
    )

    test_reader, number_of_examples = reader_utils.create_data_reader(
        test_model, **reader_args)

    if args.num_iter <= 0:
        num_iter = int(math.ceil(number_of_examples / total_batch_size))
    else:
        num_iter = args.num_iter

    def test_input_fn(model):
        model_helper.AddVideoInput(test_model, test_reader, **video_input_args)

    if num_gpus > 0:
        data_parallel_model.Parallelize_GPU(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_model_ops,
            param_update_builder_fun=None,
            devices=gpus,
            optimize_gradient_memory=True,
        )
    else:
        test_model._device_type = caffe2_pb2.CPU
        test_model._devices = [0]
        device_opt = core.DeviceOption(test_model._device_type, 0)
        with core.DeviceScope(device_opt):
            # Because our loaded models are named with "gpu_x", keep the naming for now.
            # TODO: Save model using `data_parallel_model.ExtractPredictorNet`
            # to extract the model for "gpu_0". It also renames
            # the input and output blobs by stripping the "gpu_x/" prefix
            with core.NameScope("{}_{}".format("gpu", 0)):
                test_input_fn(test_model)
                create_model_ops(test_model, 1.0)

    workspace.RunNetOnce(test_model.param_init_net)
    workspace.CreateNet(test_model.net)

    if args.db_type == 'minidb':
        if num_gpus > 0:
            model_helper.LoadModel(args.load_model_path, args.db_type)
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
                model_helper.LoadModel(args.load_model_path, args.db_type)
    elif args.db_type == 'pickle':
        if num_gpus > 0:
            model_loader.LoadModelFromPickleFile(test_model,
                                                 args.load_model_path,
                                                 use_gpu=True,
                                                 root_gpu_id=gpus[0])
        else:
            model_loader.LoadModelFromPickleFile(test_model,
                                                 args.load_model_path,
                                                 use_gpu=False)
    else:
        log.warning("Unsupported db_type: {}".format(args.db_type))

    data_parallel_model.FinalizeAfterCheckpoint(test_model)

    # metric couters for multilabel
    all_prob_for_map = np.empty(shape=[0, args.num_labels], dtype=np.float)
    all_label_for_map = np.empty(shape=[0, args.num_labels], dtype=np.int32)

    # metric counters for closed-world classification
    clip_acc = 0
    video_top1 = 0
    video_topk = 0
    video_count = 0
    clip_count = 0
    crop_per_video = args.clip_per_video * args.crop_per_clip
    for i in range(num_iter):
        workspace.RunNet(test_model.net.Proto().name)

        num_devices = 1  # default for cpu
        if num_gpus > 0:
            num_devices = num_gpus

        for g in range(num_devices):
            # get labels
            label = workspace.FetchBlob("gpu_{}".format(g) + '/label')
            # get predictions
            if args.multi_label:
                predicts = workspace.FetchBlob("gpu_{}".format(g) + '/prob')
            else:
                predicts = workspace.FetchBlob("gpu_{}".format(g) + '/softmax')
            assert predicts.shape[0] == args.batch_size * crop_per_video

            for j in range(args.batch_size):
                # get label for one video
                if args.multi_label:
                    sample_label = label[j * crop_per_video, :]
                else:
                    sample_label = label[j * crop_per_video]
                    # get clip accuracy
                    for k in range(crop_per_video):
                        sorted_preds = \
                            np.argsort(predicts[j * crop_per_video + k, :])
                        sorted_preds[:] = sorted_preds[::-1]
                        if sorted_preds[0] == label[j * crop_per_video + k]:
                            clip_acc = clip_acc + 1
                # get all clip predictions for one video
                all_clips = \
                    predicts[
                        j * crop_per_video:(j + 1) * crop_per_video, :
                    ]
                # aggregate predictions into one
                video_pred = PredictionAggregation(all_clips, args.aggregation)

                if args.multi_label:
                    video_pred = np.expand_dims(video_pred, axis=0)
                    sample_label = np.expand_dims(sample_label, axis=0)
                    all_prob_for_map = np.concatenate(
                        (all_prob_for_map, video_pred), axis=0)
                    all_label_for_map = np.concatenate(
                        (all_label_for_map, sample_label), axis=0)
                else:
                    sorted_video_pred = np.argsort(video_pred)
                    sorted_video_pred[:] = sorted_video_pred[::-1]
                    if sorted_video_pred[0] == sample_label:
                        video_top1 = video_top1 + 1
                    if sample_label in sorted_video_pred[0:args.top_k]:
                        video_topk = video_topk + 1

            video_count = video_count + args.batch_size
            clip_count = clip_count + label.shape[0]

        if i > 0 and i % args.display_iter == 0:
            if args.multi_label:
                # mAP
                auc, ap, wap, aps = metric.mean_ap_metric(
                    all_prob_for_map, all_label_for_map)
                log.info(
                    'Iter {}/{}: mAUC: {}, mAP: {}, mWAP: {}, mAP_all: {}'.
                    format(i, num_iter, auc, ap, wap, np.mean(aps)))
            else:
                # accuracy
                log.info('Iter {}/{}: clip: {}, top1: {}, top 5: {}'.format(
                    i, num_iter, clip_acc / clip_count,
                    video_top1 / video_count, video_topk / video_count))

    if args.multi_label:
        # mAP
        auc, ap, wap, aps = metric.mean_ap_metric(all_prob_for_map,
                                                  all_label_for_map)
        log.info("Test mAUC: {}, mAP: {}, mWAP: {}, mAP_all: {}".format(
            auc, ap, wap, np.mean(aps)))
        if args.print_per_class_metrics:
            log.info("Test mAP per class: {}".format(aps))
    else:
        # accuracy
        log.info("Test accuracy: clip: {}, top 1: {}, top{}: {}".format(
            clip_acc / clip_count, video_top1 / video_count, args.top_k,
            video_topk / video_count))

    if num_gpus > 0:
        flops, params, inters = model_helper.GetFlopsAndParams(
            test_model, gpus[0])
    else:
        flops, params, inters = model_helper.GetFlopsAndParams(test_model)
    log.info('FLOPs: {}, params: {}, inters: {}'.format(flops, params, inters))
Пример #9
0
def ExtractFeatures(args):
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = range(args.num_gpus)
        num_gpus = args.num_gpus

    if num_gpus > 0:
        log.info("Running on GPUs: {}".format(gpus))
    else:
        log.info("Running on CPU")

    my_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustive_search': True
    }

    model = cnn.CNNModelHelper(
        name="Extract Features",
        **my_arg_scope
    )

    video_input_args = dict(
        batch_size=args.batch_size,
        clip_per_video=args.clip_per_video,
        decode_type=args.decode_type,
        length_rgb=args.clip_length_rgb,
        sampling_rate_rgb=args.sampling_rate_rgb,
        scale_h=args.scale_h,
        scale_w=args.scale_w,
        crop_size=args.crop_size,
        video_res_type=args.video_res_type,
        short_edge=min(args.scale_h, args.scale_w),
        num_decode_threads=args.num_decode_threads,
        do_multi_label=args.multi_label,
        num_of_class=args.num_labels,
        random_mirror=False,
        random_crop=False,
        input_type=args.input_type,
        length_of=args.clip_length_of,
        sampling_rate_of=args.sampling_rate_of,
        frame_gap_of=args.frame_gap_of,
        do_flow_aggregation=args.do_flow_aggregation,
        flow_data_type=args.flow_data_type,
        get_rgb=(args.input_type == 0 or args.input_type >= 3),
        get_optical_flow=(args.input_type == 1 or args.input_type >= 4),
        get_logmels=(args.input_type >= 2),
        get_video_id=args.get_video_id,
        get_start_frame=args.get_start_frame,
        use_local_file=args.use_local_file,
        crop_per_clip=args.crop_per_clip,
    )

    reader_args = dict(
        name="extract_features" + '_reader',
        input_data=args.test_data,
    )

    reader, num_examples = reader_utils.create_data_reader(
        model,
        **reader_args
    )

    def input_fn(model):
        model_helper.AddVideoInput(
            model,
            reader,
            **video_input_args)

    def create_model_ops(model, loss_scale):
        return model_builder.build_model(
            model=model,
            model_name=args.model_name,
            model_depth=args.model_depth,
            num_labels=args.num_labels,
            batch_size=args.batch_size,
            num_channels=args.num_channels,
            crop_size=args.crop_size,
            clip_length=(
                args.clip_length_of if args.input_type == 1
                else args.clip_length_rgb
            ),
            loss_scale=loss_scale,
            is_test=1,
            multi_label=args.multi_label,
            channel_multiplier=args.channel_multiplier,
            bottleneck_multiplier=args.bottleneck_multiplier,
            use_dropout=args.use_dropout,
            use_convolutional_pred=args.use_convolutional_pred,
            use_pool1=args.use_pool1,
        )

    if num_gpus > 0:
        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=input_fn,
            forward_pass_builder_fun=create_model_ops,
            param_update_builder_fun=None,   # 'None' since we aren't training
            devices=gpus,
            optimize_gradient_memory=True,
        )
    else:
        model._device_type = caffe2_pb2.CPU
        model._devices = [0]
        device_opt = core.DeviceOption(model._device_type, 0)
        with core.DeviceScope(device_opt):
            # Because our loaded models are named with "gpu_x", keep the naming for now.
            # TODO: Save model using `data_parallel_model.ExtractPredictorNet`
            # to extract the model for "gpu_0". It also renames
            # the input and output blobs by stripping the "gpu_x/" prefix
            with core.NameScope("{}_{}".format("gpu", 0)):
                input_fn(model)
                create_model_ops(model, 1.0)

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)

    if args.db_type == 'pickle':
        model_loader.LoadModelFromPickleFile(model, args.load_model_path)
    elif args.db_type == 'minidb':
        if num_gpus > 0:
            model_helper.LoadModel(args.load_model_path, args.db_type)
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
                model_helper.LoadModel(args.load_model_path, args.db_type)
    else:
        log.warning("Unsupported db_type: {}".format(args.db_type))

    data_parallel_model.FinalizeAfterCheckpoint(model)

    def fetchActivations(model, outputs, num_iterations):

        all_activations = {}
        for counter in range(num_iterations):
            workspace.RunNet(model.net.Proto().name)

            num_devices = 1  # default for cpu
            if num_gpus > 0:
                num_devices = num_gpus
            for g in range(num_devices):
                for output_name in outputs:
                    blob_name = 'gpu_{}/'.format(g) + output_name
                    activations = workspace.FetchBlob(blob_name)
                    if output_name not in all_activations:
                        all_activations[output_name] = []
                    all_activations[output_name].append(activations)

            if counter % 20 == 0:
                log.info('{}/{} iterations'.format(counter, num_iterations))

        # each key holds a list of activations obtained from each minibatch.
        # we now concatenate these lists to get the final arrays.
        # concatenating during the loop requires a realloc and can get slow.
        for key in all_activations:
            all_activations[key] = np.concatenate(all_activations[key])

        return all_activations

    outputs = [name.strip() for name in args.features.split(',')]
    assert len(outputs) > 0

    if args.num_iterations > 0:
        num_iterations = args.num_iterations
    else:
        if num_gpus > 0:
            examples_per_iteration = args.batch_size * num_gpus
        else:
            examples_per_iteration = args.batch_size
        num_iterations = int(num_examples / examples_per_iteration)

    activations = fetchActivations(model, outputs, num_iterations)

    # saving extracted features
    for index in range(len(outputs)):
        log.info(
            "Read '{}' with shape {}".format(
                outputs[index],
                activations[outputs[index]].shape
            )
        )

    if args.output_path:
        output_path = args.output_path
    else:
        output_path = os.path.dirname(args.test_data) + '/features.pickle'

    log.info('Writing to {}'.format(output_path))
    if args.save_h5:
        with h5py.File(output_path, 'w') as handle:
            for name, activation in activations.items():
                handle.create_dataset(name, data=activation)
    else:
        with open(output_path, 'wb') as handle:
            pickle.dump(activations, handle)

    # perform sanity check
    if args.sanity_check == 1:  # check clip accuracy
        assert args.multi_label == 0
        clip_acc = 0
        softmax = activations['softmax']
        label = activations['label']
        for i in range(len(softmax)):
            sorted_preds = \
                np.argsort(softmax[i])
            sorted_preds[:] = sorted_preds[::-1]
            if sorted_preds[0] == label[i]:
                clip_acc += 1
        log.info('Sanity check --- clip accuracy: {}'.format(
            clip_acc / len(softmax))
        )
    elif args.sanity_check == 2:  # check auc
        assert args.multi_label == 1
        prob = activations['prob']
        label = activations['label']
        mean_auc, mean_ap, mean_wap, _ = metric.mean_ap_metric(prob, label)
        log.info('Sanity check --- AUC: {}, mAP: {}, mWAP: {}'.format(
            mean_auc, mean_ap, mean_wap)
        )