Example #1
def Train(args):
    if args.model == "resnext":
        model_name = "resnext" + str(args.num_layers)
    elif args.model == "shufflenet":
        model_name = "shufflenet"

    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Verify valid image mean/std per channel
    if args.image_mean_per_channel:
        assert \
            len(args.image_mean_per_channel) == args.num_channels, \
            "The number of channels of image mean doesn't match input"

    if args.image_std_per_channel:
        assert \
            len(args.image_std_per_channel) == args.num_channels, \
            "The number of channels of image std doesn't match input"

    # Round down epoch size to the nearest multiple of the global batch size
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)
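    # E.g. (illustrative numbers): epoch_size=1,000,000, total_batch_size=256
    # and num_shards=2 give global_batch_size=512, epoch_iters=1953, and an
    # epoch size rounded down to 999,936.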

    assert \
        epoch_iters > 0, \
        "Epoch size must be larger than batch size times shard count"

    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create ModelHelper object
    if args.use_ideep:
        train_arg_scope = {
            'use_cudnn': False,
            'cudnn_exhaustive_search': False,
            'training_mode': 1
        }
    else:
        train_arg_scope = {
            'order': 'NCHW',
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
            'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
        }
    train_model = model_helper.ModelHelper(
        name=model_name, arg_scope=train_arg_scope
    )

    num_shards = args.num_shards
    shard_id = args.shard_id

    # Expect interfaces to be comma separated.
    # Use of multiple network interfaces is not yet complete,
    # so simply use the first one in the list.
    interfaces = args.distributed_interfaces.split(",")

    # Rendezvous using MPI when run with mpirun
    if os.getenv("OMPI_COMM_WORLD_SIZE") is not None:
        num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1))
        shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0))
        if num_shards > 1:
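            # Under mpirun, Gloo rendezvous goes through the MPI context
            # (mpi_rendezvous=True), so no key/value store handler is needed.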
            rendezvous = dict(
                kv_handler=None,
                num_shards=num_shards,
                shard_id=shard_id,
                engine="GLOO",
                transport=args.distributed_transport,
                interface=interfaces[0],
                mpi_rendezvous=True,
                exit_nets=None)

    elif num_shards > 1:
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate", [], [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                )
            )
        else:
            # Use filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate", [], [store_handler],
                    path=args.file_store_path,
                    prefix=args.run_id,
                )
            )

        rendezvous = dict(
            kv_handler=store_handler,
            shard_id=shard_id,
            num_shards=num_shards,
            engine="GLOO",
            transport=args.distributed_transport,
            interface=interfaces[0],
            exit_nets=None)

    else:
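        # Single-shard run: no distributed rendezvous is required.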
        rendezvous = None

    # Model building functions
    def create_resnext_model_ops(model, loss_scale):
        initializer = (PseudoFP16Initializer if args.dtype == 'float16'
                       else Initializer)

        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=args.enable_tensor_core,
                            float16_compute=args.float16_compute):
            pred = resnet.create_resnext(
                model,
                "data",
                num_input_channels=args.num_channels,
                num_labels=args.num_labels,
                num_layers=args.num_layers,
                num_groups=args.resnext_num_groups,
                num_width_per_group=args.resnext_width_per_group,
                no_bias=True,
                no_loss=True,
            )

        if args.dtype == 'float16':
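            # SoftmaxWithLoss runs in fp32, so cast fp16 predictions back up.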
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy", top_k=1)
        brew.accuracy(model, [softmax, "label"], "accuracy_top5", top_k=5)
        return [loss]

    def create_shufflenet_model_ops(model, loss_scale):
        initializer = (PseudoFP16Initializer if args.dtype == 'float16'
                       else Initializer)

        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=args.enable_tensor_core,
                            float16_compute=args.float16_compute):
            pred = shufflenet.create_shufflenet(
                model,
                "data",
                num_input_channels=args.num_channels,
                num_labels=args.num_labels,
                no_loss=True,
            )

        if args.dtype == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy", top_k=1)
        brew.accuracy(model, [softmax, "label"], "accuracy_top5", top_k=5)
        return [loss]

    def add_optimizer(model):
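        # Decay the learning rate every 30 epochs; stepsize is measured in
        # iterations of the global batch.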
        stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)

        if args.float16_compute:
            # TODO: merge with multi-precision optimizer
            opt = optimizer.build_fp16_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                weight_decay=args.weight_decay,   # weight decay included
                policy="step",
                stepsize=stepsz,
                gamma=0.1
            )
        else:
            optimizer.add_weight_decay(model, args.weight_decay)
            opt = optimizer.build_multi_precision_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                policy="step",
                stepsize=stepsz,
                gamma=0.1
            )
        return opt

    # Define add_image_input function.
    # Depends on the "train_data" argument.
    # Note that the reader will be shared between all GPUs.
    if args.train_data == "null":
        def add_image_input(model):
            AddNullInput(
                model,
                None,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
            )
    else:
        reader = train_model.CreateDB(
            "reader",
            db=args.train_data,
            db_type=args.db_type,
            num_shards=num_shards,
            shard_id=shard_id,
        )

        def add_image_input(model):
            AddImageInput(
                model,
                reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=False,
                mean_per_channel=args.image_mean_per_channel,
                std_per_channel=args.image_std_per_channel,
            )

    def add_post_sync_ops(model):
        """Add ops applied after initial parameter sync."""
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob,
                    param_info.blob_copy[core.DataType.FLOAT]
                )

    data_parallel_model.Parallelize(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_resnext_model_ops
        if args.model == "resnext" else create_shufflenet_model_ops,
        optimizer_builder_fun=add_optimizer,
        post_sync_builder_fun=add_post_sync_ops,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=False,
        cpu_device=args.use_cpu,
        ideep=args.use_ideep,
        shared_model=args.use_cpu,
        combine_spatial_bn=args.use_cpu,
    )

    # Blob-recycling pass over the gradient ops (empty input_shapes, no
    # excluded blobs, recycle_activations=False).
    data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    # Add test model, if specified
    test_model = None
    if args.test_data is not None:
        log.info("----- Create test net -----")
        if args.use_ideep:
            test_arg_scope = {
                'use_cudnn': False,
                'cudnn_exhaustive_search': False,
            }
        else:
            test_arg_scope = {
                'order': "NCHW",
                'use_cudnn': True,
                'cudnn_exhaustive_search': True,
            }
        test_model = model_helper.ModelHelper(
            name=model_name + "_test",
            arg_scope=test_arg_scope,
            init_params=False,
        )

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=True,
                mean_per_channel=args.image_mean_per_channel,
                std_per_channel=args.image_std_per_channel,
            )

        data_parallel_model.Parallelize(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_resnext_model_ops
            if args.model == "resnext" else create_shufflenet_model_ops,
            post_sync_builder_fun=add_post_sync_ops,
            param_update_builder_fun=None,
            devices=gpus,
            cpu_device=args.use_cpu,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model, args.use_ideep)

        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        # reset epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "%s_gpu%d_b%d_L%d_lr%.2f_v2" % (
        model_name,
        num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )

    explog = experiment_util.ModelTrainerLog(expname, args)

    # Run the training one epoch a time
    while epoch < args.num_epochs:
        epoch = RunEpoch(
            args,
            epoch,
            train_model,
            test_model,
            total_batch_size,
            num_shards,
            expname,
            explog
        )

        # Save the model for each epoch
        SaveModel(args, train_model, epoch, args.use_ideep)

        model_path = "%s/%s_" % (
            args.file_store_path,
            args.save_model_name
        )
        # remove the saved model from the previous epoch if it exists
        if os.path.isfile(model_path + str(epoch - 1) + ".mdl"):
            os.remove(model_path + str(epoch - 1) + ".mdl")
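
A minimal, standalone sketch of the checkpoint-name convention Train() relies
on (paths ending in _<epoch>.mdl); the path below is hypothetical:

# Illustrative only: mirrors the epoch-parsing logic in Train() above.
load_model_path = "/tmp/resnext50_3.mdl"  # hypothetical checkpoint path
last_str = load_model_path.split('_')[-1]
epoch = int(last_str[:-4]) if last_str.endswith('.mdl') else 0
assert epoch == 3
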
Example #2
    def test_sum_reduce(self, gc, dc):
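        # SumReduceLike sums X over the dimensions that would be broadcast
        # against Y, producing an output with Y's shape.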
        # Set broadcast and no axis, i.e. broadcasting last dimensions.
        X = np.random.rand(2, 3, 4, 5).astype(np.float32)
        Y = np.random.rand(4, 5).astype(np.float32)
        op = core.CreateOperator("SumReduceLike", ["X", "Y"],
                                 "out",
                                 broadcast=1)
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("Y", Y)
        workspace.RunOperatorOnce(op)
        out = workspace.FetchBlob("out")
        res = np.sum(X, axis=0)
        res = np.sum(res, axis=0)
        np.testing.assert_array_almost_equal(out, res)
        self.assertDeviceChecks(dc, op, [X, Y], [0])

        # Set broadcast with axis=0, i.e. broadcasting the first dimensions.
        X = np.random.rand(2, 3, 4, 5).astype(np.float32)
        Y = np.random.rand(2, 3).astype(np.float32)
        op = core.CreateOperator("SumReduceLike", ["X", "Y"],
                                 "out",
                                 broadcast=1,
                                 axis=0)
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("Y", Y)
        workspace.RunOperatorOnce(op)
        out = workspace.FetchBlob("out")
        res = np.sum(X, axis=3)
        res = np.sum(res, axis=2)
        np.testing.assert_array_almost_equal(out, res, decimal=3)
        self.assertDeviceChecks(dc, op, [X, Y], [0])

        # broadcasting intermediate dimensions
        X = np.random.rand(2, 3, 4, 5).astype(np.float32)
        Y = np.random.rand(3, 4).astype(np.float32)
        op = core.CreateOperator("SumReduceLike", ["X", "Y"],
                                 "out",
                                 broadcast=1,
                                 axis=1)
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("Y", Y)
        workspace.RunOperatorOnce(op)
        out = workspace.FetchBlob("out")
        res = np.sum(X, axis=0)
        res = np.sum(res, axis=2)
        np.testing.assert_array_almost_equal(out, res)
        self.assertDeviceChecks(dc, op, [X, Y], [0])

        # broadcasting with a scalar, i.e. summing over all dimensions
        X = np.random.rand(2, 3, 4, 500).astype(np.float64)
        Y = np.random.rand(1).astype(np.float64)
        op = core.CreateOperator("SumReduceLike", ["X", "Y"],
                                 "out",
                                 broadcast=1)
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("Y", Y)
        workspace.RunOperatorOnce(op)
        out = workspace.FetchBlob("out")
        res = np.array(np.sum(X))
        np.testing.assert_array_almost_equal(out, res, decimal=0)
        # fp64 is not supported by the CUDA op, so run the device check on
        # CPU device options only.
        dc_cpu_only = [d for d in dc if d.device_type != caffe2_pb2.CUDA]
        self.assertDeviceChecks(dc_cpu_only, op, [X, Y], [0])

        # broadcasting with single elem dimensions at both ends
        X = np.random.rand(2, 3, 4, 5).astype(np.float32)
        Y = np.random.rand(1, 3, 4, 1).astype(np.float32)
        op = core.CreateOperator("SumReduceLike", ["X", "Y"],
                                 "out",
                                 broadcast=1)
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("Y", Y)
        workspace.RunOperatorOnce(op)
        out = workspace.FetchBlob("out")
        res = np.sum(X, axis=0)
        res = np.sum(res, axis=2).reshape(Y.shape)
        np.testing.assert_array_almost_equal(out, res)
        self.assertDeviceChecks(dc, op, [X, Y], [0])

Example #3
    def test_int8_fc_4_dims(self, n, m, k, gc, dc):
        X = np.random.rand(m, k, m, m).astype(np.float32) - 0.5
        w = np.random.rand(n, k, m, m).astype(np.float32) - 0.5
        b = np.random.rand(n).astype(np.float32) - 0.5

        fc_fp32 = core.CreateOperator('FC', ['X', 'w', 'b'], ["Y"])

        old_ws_name = workspace.CurrentWorkspace()
        workspace.SwitchWorkspace("_device_check_", True)

        workspace.FeedBlob('X', X, dc[0])
        workspace.FeedBlob('w', w, dc[0])
        workspace.FeedBlob('b', b, dc[0])
        workspace.RunOperatorOnce(fc_fp32)
        Y = workspace.FetchBlob('Y')

        workspace.ResetWorkspace()

        # Non-negative tensors use the full uint8 range (absmax / 255,
        # zero_point 0); signed tensors use a symmetric range (absmax / 127,
        # zero_point 128).
        Y_absmax = np.array([np.absolute(Y).max()]).astype(np.float32)
        if Y.min() >= 0:
            Y_scale = Y_absmax / 0xFF
            Y_zero_point = 0
        else:
            Y_scale = Y_absmax / 0x7F
            Y_zero_point = 128

        X_absmax = np.array([np.absolute(X).max()]).astype(np.float32)
        if X.min() >= 0:
            X_scale = X_absmax / 0xFF
            X_zero_point = 0
        else:
            X_scale = X_absmax / 0x7F
            X_zero_point = 128

        # Weights are quantized per output channel, each with its own
        # symmetric int8 scale.
        w_absmax = np.array([
            np.absolute(w[i, ...]).max() for i in range(w.shape[0])
        ]).astype(np.float32)
        w_scale = w_absmax / 0x7F
        w_zero_point = 128
        w = np.transpose(w, (0, 2, 3, 1)).astype(np.float32)
        w_bytes = np.rint([w[i, ...] / w_scale[i] for i in range(w.shape[0])
                           ]).astype(np.int8) + w_zero_point

        w_filler = core.CreateOperator(
            "Int8GivenTensorFill",
            [],
            ["wi"],
            shape=w.shape,
            values=w_bytes.astype(np.uint8).tobytes(),
            Y_zero_point=w_zero_point,
            Y_scales=w_scale,
            device_option=dc[1],
        )

        # Bias is stored as int32 with scale = w_scale * X_scale, matching
        # the int8 GEMM accumulator.
        b_scale = w_scale * X_scale
        b_zero_point = 0
        b_bytes = np.rint([b[i] / b_scale[i]
                           for i in range(b.shape[0])]).astype(np.int32)
        b_filler = core.CreateOperator(
            "Int8GivenIntTensorFill",
            [],
            ["bi"],
            shape=b.shape,
            values=b_bytes,
            Y_zero_point=b_zero_point,
            Y_scales=b_scale,
            device_option=dc[1],
        )

        # DNNLOWP int8 operators run in NHWC layout, so transpose before
        # quantizing.
        sw2nhwc = core.CreateOperator("NCHW2NHWC", ["Xi"], ["Xi_nhwc"],
                                      device_option=dc[1])

        quantize_X = core.CreateOperator(
            "Int8Quantize",
            ["Xi_nhwc"],
            ["Xi_quantized"],
            engine="DNNLOWP",
            device_option=dc[1],
            Y_zero_point=X_zero_point,
            Y_scale=X_scale[0],
        )

        fc = core.CreateOperator(
            'Int8FC',
            ['Xi_quantized', 'wi', 'bi'],
            ["Y_out"],
            engine="DNNLOWP",
            device_option=dc[1],
            Y_zero_point=Y_zero_point,
            Y_scale=Y_scale[0],
        )

        net = caffe2_pb2.NetDef()
        net.op.extend([w_filler, b_filler, sw2nhwc, quantize_X, fc])

        workspace.FeedBlob("Xi", X, dc[1])
        workspace.RunNetOnce(net)
        Y_out = workspace.FetchBlob("Y_out")

        MSE = np.square(np.subtract(Y, Y_out)).mean()
        if MSE > 0.005:
            print(Y.flatten())
            print(Y_out.flatten())
            print(np.max(np.abs(Y_out - Y)))
            print("MSE", MSE)
            self.assertTrue(False)

        workspace.SwitchWorkspace(old_ws_name)
Example #4
    def test_int8_pooling(self, stride, pad, kernel, size, input_channels,
                          batch_size, method, gc, dc):
        assume(pad < kernel)
        pool_fp32 = core.CreateOperator(method, ["X"], ["Y"],
                                        stride=stride,
                                        pad=pad,
                                        kernel=kernel,
                                        device_option=dc[0])
        X = np.random.rand(batch_size, input_channels, size,
                           size).astype(np.float32)

        if X.min() >= 0:
            scale = np.absolute(X).max() / 0xFF
            zero_point = 0
        else:
            scale = np.absolute(X).max() / 0x7F
            zero_point = 128

        old_ws_name = workspace.CurrentWorkspace()
        workspace.SwitchWorkspace("_device_check_", True)

        workspace.FeedBlob("X", X, dc[0])
        workspace.RunOperatorOnce(pool_fp32)
        Y = workspace.FetchBlob("Y")

        workspace.ResetWorkspace()

        sw2nhwc = core.CreateOperator("NCHW2NHWC", ["Xi"], ["Xi_nhwc"],
                                      device_option=dc[1])

        quantize = core.CreateOperator(
            "Int8Quantize",
            ["Xi_nhwc"],
            ["Xi_quantized"],
            engine="DNNLOWP",
            device_option=dc[1],
            Y_zero_point=zero_point,
            Y_scale=scale,
        )

        pool = core.CreateOperator(
            "Int8{}".format(method),
            ["Xi_quantized"],
            ["Y_quantized"],
            stride=stride,
            pad=pad,
            kernel=kernel,
            engine="DNNLOWP",
            device_option=dc[1],
        )

        dequantize = core.CreateOperator(
            "Int8Dequantize",
            ["Y_quantized"],
            ["Y_nhwc"],
            engine="DNNLOWP",
            device_option=dc[1],
        )

        sw2nchw = core.CreateOperator("NHWC2NCHW", ["Y_nhwc"], ["Y_out"],
                                      device_option=dc[1])

        # Full pipeline: NCHW->NHWC, quantize, int8 pooling, dequantize,
        # NHWC->NCHW.
        net = caffe2_pb2.NetDef()
        net.op.extend([sw2nhwc, quantize, pool, dequantize, sw2nchw])

        workspace.FeedBlob("Xi", X, dc[1])
        workspace.RunNetOnce(net)
        Y_out = workspace.FetchBlob("Y_out")

        # Compare the int8 path against the fp32 reference; small
        # quantization error is expected.
        MSE = np.square(np.subtract(Y, Y_out)).mean()
        if MSE > 0.005:
            print(Y.flatten())
            print(Y_out.flatten())
            print(np.max(np.abs(Y_out - Y)))
            print("MSE", MSE)
            self.assertTrue(False)

        workspace.SwitchWorkspace(old_ws_name)
Example #5
    def test_hsm_search(self):
        samples = 10
        dim_in = 5
        X = np.random.rand(samples, dim_in).astype(np.float32) - 0.5
        w = np.random.rand(hierarchy_proto.size, dim_in) \
            .astype(np.float32) - 0.5
        b = np.random.rand(hierarchy_proto.size).astype(np.float32) - 0.5
        labels = np.array([np.random.randint(0, 8) for i in range(samples)]) \
            .astype(np.int32)

        workspace.GlobalInit(['caffe2'])
        workspace.FeedBlob("data", X)
        workspace.FeedBlob("weights", w)
        workspace.FeedBlob("bias", b)
        workspace.FeedBlob("labels", labels)
        op = core.CreateOperator('HSoftmaxSearch', ['data', 'weights', 'bias'],
                                 ['names', 'scores'],
                                 'HSoftmaxSearch',
                                 arg=args_search)
        workspace.RunOperatorOnce(op)
        names = workspace.FetchBlob('names')
        scores = workspace.FetchBlob('scores')

        # Reference beam search: walk the hierarchy, accumulate -log(softmax)
        # scores along each path, and prune paths that fall outside the beam.
        def simulation_hsm_search():
            names = []
            scores = []
            for line in struct:
                s, e = line[0], line[0] + line[1]
                score = np.dot(X, w[s:e].transpose()) + b[s:e]
                score = np.exp(score - np.max(score, axis=1, keepdims=True))
                score /= score.sum(axis=1, keepdims=True)
                score = -np.log(score)

                score = score.transpose()
                idx = -1
                for j, n in enumerate(names):
                    if n == line[3]:
                        idx = j
                        score += scores[j]
                if idx == -1:
                    score[score > beam] = np.inf
                else:
                    score[score - scores[idx] > beam] = np.inf

                for i, name in enumerate(line[2]):
                    scores.append(score[i])
                    names.append(name)
            scores = np.vstack(scores)
            return names, scores.transpose()

        p_names, p_scores = simulation_hsm_search()
        idx = np.argsort(p_scores, axis=1)
        p_scores = np.sort(p_scores, axis=1)
        p_names = np.array(p_names)[idx]
        for i in range(names.shape[0]):
            for j in range(names.shape[1]):
                if names[i][j]:
                    self.assertEqual(names[i][j],
                                     p_names[i][j].item().encode('utf-8'))
                    self.assertAlmostEqual(scores[i][j],
                                           p_scores[i][j],
                                           delta=0.001)
Example #6
def Run(args, extra_args):
    """Main function for running inference."""
    if not m.IsSupported(args.model):
        logging.error("Not supported model: {}".format(args.model))
        m.ShowModels()
        return
    images_path = None
    if args.images_path:
        images_path = os.path.abspath(args.images_path)
    elif "CAFFE2_INF_IMG_PATH" in os.environ:
        images_path = os.path.abspath(os.environ["CAFFE2_INF_IMG_PATH"])
    if not args.dummydata and (images_path is None
                               or not os.path.isdir(images_path)):
        logging.error("Cannot find image path {}.".format(images_path))
        return
    labels = None
    validation = None
    if args.label_file:
        labels = cc2.LoadLabels(args.label_file)
    elif args.validation_file:
        validation = cc2.LoadValidation(args.validation_file)
    elif "CAFFE2_INF_LABEL_FILE" in os.environ:
        labels = cc2.LoadLabels(os.environ["CAFFE2_INF_LABEL_FILE"])
    elif "CAFFE2_INF_VAL_FILE" in os.environ:
        validation = cc2.LoadValidation(os.environ["CAFFE2_INF_VAL_FILE"])
    else:
        logging.warning("No validation or label file!")
    if args.annotations:
        apath = args.annotations
    elif args.model == 'faster-rcnn' or args.model == 'ssd':
        logging.error(
            "Only faster-rcnn and ssd currently support the VOC dataset, and"
            " no annotations were given, so only performance will be collected"
        )
    iterations = args.iterations if args.iterations else sys.maxsize
    warmup_iter = args.warmup_iterations if args.warmup_iterations > 0 else 0
    optimization = []
    if args.optimization:
        optimization = [opt.strip() for opt in args.optimization.split(',')]
    batch_size = 1
    if args.batch_size:
        batch_size = int(args.batch_size)
        if batch_size <= 0:
            logging.error("Invalid batch size {}. Exit!".format(batch_size))
            return
    logging.warning("Run Caffe2 in inference mode with args:\n{}".format(
        vars(args)))
    model_info = m.GetModelInfo(args.model)
    logging.warning("The inference inputs of {0} model:\n{1}".format(
        args.model, {str(k): str(v)
                     for k, v in model_info.items()}))
    crop_size = int(model_info["crop_size"])
    if args.crop_size:
        crop_size = args.crop_size

    need_normalize = False
    if model_info["need_normalize"]:
        need_normalize = True

    mean = 128
    if str(model_info["image_mean"]) != 'None':
        mean_tmp = ((model_info["image_mean"]).split('/')[-1]).split(' ')
        if need_normalize:
            mean = np.zeros([3, crop_size, crop_size], dtype=np.float32)
            mean[0, :, :] = float(mean_tmp[0])  # 104
            mean[1, :, :] = float(mean_tmp[1])  # 117
            mean[2, :, :] = float(mean_tmp[2])  # 124

        else:
            mean = np.zeros([3, crop_size, crop_size], dtype=np.int32)
            mean[0, :, :] = int(mean_tmp[0])  # 104
            mean[1, :, :] = int(mean_tmp[1])  # 117
            mean[2, :, :] = int(mean_tmp[2])  # 124
    scale = [1]
    if str(model_info["scale"]) != '':
        scale = (model_info["scale"]).split(' ')
    rescale_size = 256
    if str(model_info["rescale_size"]) != '':
        rescale_size = int(model_info["rescale_size"])
    color_format = "BGR"
    if str(model_info["color_format"]) != '':
        color_format = model_info["color_format"]

    model_start_time = timeit.default_timer()
    if args.onnx_model:
        init_def, predict_def = cc2.OnnxToCaffe2(model_info["onnx_model"])
    else:
        if args.int8_model or args.int8_cosim:
            init_file = model_info["init_net_int8"]
            predict_file = model_info["predict_net_int8"]
        else:
            init_file = model_info["init_net"]
            predict_file = model_info["predict_net"]
        # Serialized NetDefs must be read in binary mode; pbtxt files are
        # decoded to text before parsing.
        with open(init_file, "rb") as i:
            if model_info["model_type"] == "prototext" or model_info[
                    "init_net"].split('.')[-1] == "pbtxt":
                import google.protobuf.text_format as ptxt
                init_def = ptxt.Parse(i.read().decode("utf-8"),
                                      caffe2_pb2.NetDef())
            else:
                init_def = caffe2_pb2.NetDef()
                init_def.ParseFromString(i.read())
        with open(predict_file, "rb") as p:
            if model_info["model_type"] == "prototext" or predict_file.split(
                    '.')[-1] == "pbtxt":
                import google.protobuf.text_format as ptxt
                predict_def = ptxt.Parse(p.read().decode("utf-8"),
                                         caffe2_pb2.NetDef())
            else:
                predict_def = caffe2_pb2.NetDef()
                predict_def.ParseFromString(p.read())
        if args.int8_cosim:
            with open(model_info["predict_net"], "rb") as p:
                if model_info["model_type"] == "prototext" or model_info[
                        "predict_net"].split('.')[-1] == "pbtxt":
                    import google.protobuf.text_format as ptxt
                    cosim_predict_def = ptxt.Parse(p.read().decode("utf-8"),
                                                   caffe2_pb2.NetDef())
                else:
                    cosim_predict_def = caffe2_pb2.NetDef()
                    cosim_predict_def.ParseFromString(p.read())
    #cc2.SaveAsOnnxModel(init_def, predict_def, (1, 3, crop_size, crop_size),
    #            model_info["model_name"] + "_onnx.pb")
    if model_info["model_type"] == "caffe legacy":
        cc2.MergeScaleBiasInBN(predict_def)
        cc2.RemoveUselessExternalInput(predict_def)
        if args.int8_cosim:
            cc2.MergeScaleBiasInBN(cosim_predict_def)
            cc2.RemoveUselessExternalInput(cosim_predict_def)

    dev_map = {
        "cpu": caffe2_pb2.CPU,
        "gpu": caffe2_pb2.CUDA,
        "cuda": caffe2_pb2.CUDA,
        "mkldnn": caffe2_pb2.MKLDNN,
        "opengl": caffe2_pb2.OPENGL,
        "opencl": caffe2_pb2.OPENCL,
        "ideep": caffe2_pb2.IDEEP,
    }
    device_opts = caffe2_pb2.DeviceOption()
    if args.device.lower() in dev_map:
        device_opts.device_type = dev_map[args.device.lower()]
    else:
        logging.error("Wrong device {}. Exit!".format(args.device))
        return
    device_opts_cpu = caffe2_pb2.DeviceOption()
    device_opts_cpu.device_type = caffe2_pb2.CPU
    if model_info["allow_device_override"]:
        if (args.model == 'faster-rcnn' and args.device.lower() == 'gpu'):
            cc2.UpdateDeviceOption(device_opts_cpu, init_def)
        else:
            cc2.UpdateDeviceOption(device_opts, init_def)
    if model_info["allow_device_override"]:
        cc2.UpdateDeviceOption(device_opts, predict_def)
    # For ideep, scan param shapes and replace any 0 with 1, warning when it happens
    if args.device.lower() == 'ideep':
        cc2.FillZeroParamsWithOne(init_def)

    init_data = np.random.rand(batch_size, 3, crop_size,
                               crop_size).astype(np.float32)
    init_label = np.ones((batch_size), dtype=np.int32)
    if args.cosim:
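        # Cosim mode: run the same nets on CPU in a second workspace so the
        # two executions can be compared operator by operator later.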
        def_ws_name = ws.CurrentWorkspace()
        inf_ws_name = "__inf_ws__"
        ws.SwitchWorkspace(inf_ws_name, True)
        ws.FeedBlob(str(predict_def.op[0].input[0]), init_data, device_opts)
        ws.RunNetOnce(init_def)
        cosim_ws_name = "__cosim_ws__"
        ws.SwitchWorkspace(cosim_ws_name, True)
        device_cosim = caffe2_pb2.DeviceOption()
        device_cosim.device_type = dev_map["cpu"]
        cosim_init_def = copy.deepcopy(init_def)
        cc2.UpdateDeviceOption(device_cosim, cosim_init_def)
        ws.FeedBlob(str(predict_def.op[0].input[0]), init_data, device_cosim)
        ws.RunNetOnce(cosim_init_def)
        cosim_predict_def = copy.deepcopy(predict_def)
        cc2.UpdateDeviceOption(device_cosim, cosim_predict_def)
    elif args.int8_cosim:
        inf_ws_name = "__int8_ws__"
        ws.SwitchWorkspace(inf_ws_name, True)
        ws.FeedBlob(str(predict_def.op[0].input[0]), init_data, device_opts)
        ws.RunNetOnce(init_def)

        net = core.Net(model_info["model_name"])
        net.Proto().CopyFrom(predict_def)
        tf.optimizeForIDEEP(net)
        predict_def = net.Proto()

        cosim_ws_name = "__fp32_ws__"
        ws.SwitchWorkspace(cosim_ws_name, True)
        ws.FeedBlob(str(cosim_predict_def.op[0].input[0]), init_data,
                    device_opts)
        ws.RunNetOnce(init_def)
        cc2.UpdateDeviceOption(device_opts, cosim_predict_def)

        net = core.Net(model_info["model_name"])
        net.Proto().CopyFrom(cosim_predict_def)
        tf.optimizeForIDEEP(net)
        cosim_predict_def = net.Proto()
    else:
        # ApplyOptimizations(init_def, predict_def, model_info, optimization)
        ws.FeedBlob(str(predict_def.op[0].input[0]), init_data, device_opts)

        if os.environ.get('DEBUGMODE') == "1":
            cc2.SetOpName(predict_def)

        ws.RunNetOnce(init_def)
        net = core.Net(model_info["model_name"])
        net.Proto().CopyFrom(predict_def)
        if args.device.lower() == 'ideep' and not args.noptimize:
            logging.warning('Optimizing model {} for ideep...'.format(
                model_info["model_name"]))
            tf.optimizeForIDEEP(net)
        predict_def = net.Proto()

        # ws.CreateNet(predict_def)
        if (args.model == 'faster-rcnn' and args.device.lower() == 'gpu'):
            new_predict_def, _ = core.InjectCrossDeviceCopies(
                core.Net(predict_def))
            net = core.Net(new_predict_def._net)
            #ws.CreateNet(new_predict_def._net)
            predict_def = new_predict_def._net

        if os.environ.get('DEBUGMODE') == "1":
            with open(
                    "{0}_opt_predict_net.pb".format(model_info["model_name"]),
                    "w") as fid:
                fid.write(predict_def.SerializeToString())
            with open(
                    "{}_opt_predict_net.pbtxt".format(
                        model_info["model_name"]), "w") as fid:
                fid.write(str(predict_def))

        if args.profile or predict_def.op[-1].type == 'Accuracy':
            #predict_model = model_helper.ModelHelper("predict")
            #predict_model.net = core.Net(predict_def)
            #predict_model.net.name = predict_def.name
            if predict_def.op[-1].type == 'Accuracy':
                label = net.AddExternalInput('label')
                if args.device.lower() == 'gpu':
                    ws.FeedBlob(label, init_label, device_opts)
                else:
                    ws.FeedBlob(label, init_label, device_opts_cpu)
                for i, op in enumerate(predict_def.op):
                    if op.type == 'Accuracy':
                        if args.device.lower() == 'gpu':
                            print(device_opts.device_type)
                            ws.FeedBlob(str(predict_def.op[i].output[0]),
                                        init_label, device_opts)
                        else:
                            ws.FeedBlob(str(predict_def.op[i].output[0]),
                                        init_label, device_opts_cpu)
            #if (args.model == 'faster-rcnn' and args.device.lower() == 'gpu'):
            #    ws.CreateNet(net, True)
            #else:
            ws.CreateNet(net)
            if args.profile:
                #ob = predict_model.net.AddObserver("TimeObserver")
                ob = net.AddObserver("TimeObserver")
        else:
            #if (args.model == 'faster-rcnn' and args.device.lower() == 'gpu'):
            #    ws.CreateNet(net, True)
            #else:
            ws.CreateNet(net)

    model_elapsed_time = timeit.default_timer() - model_start_time

    outputs = []
    accuracy_top1 = []
    accuracy_top5 = []
    img_time = 0
    comp_time = 0
    processed_images = 0
    images = []
    # Keep dummy labels in their own list so the label mapping loaded from
    # the label file (if any) is not clobbered.
    dummy_labels = []
    fnames = []
    if args.dummydata:
        init_label = np.ones((batch_size), dtype=np.int32)
        imgs = np.random.rand(batch_size, 3, crop_size,
                              crop_size).astype(np.float32)
        for i in range(iterations):
            dummy_labels.append(init_label)
            images.append(imgs)
    else:
        process_data_start_time = timeit.default_timer()
        images, fnames = cc2.ImageProc.BatchImages(images_path, batch_size,
                                                   iterations)
        process_data_elapsed_time = timeit.default_timer(
        ) - process_data_start_time
        logging.warning(
            "Data preprocessing time = {}".format(process_data_elapsed_time))
    logging.warning("Start warmup {} iterations...".format(warmup_iter))
    forchw = 1
    if 'style-transfer' in args.model:
        forchw = 0
    wi = warmup_iter - 1
    while warmup_iter and not args.cosim:
        warmup_iter -= 1
        if args.dummydata:
            imgs = images[wi - warmup_iter]
            oshape = (crop_size, crop_size, 3)
        else:
            r = randint(0, len(images) - 1)
            imgs, oshape = cc2.ImageProc.PreprocessImages(
                images[r], crop_size, rescale_size, mean, scale, forchw,
                need_normalize, color_format)
            #imgs, oshape = cc2.ImageProc.PreprocessImagesByThreading(
            #    images[r], crop_size, rescale_size, mean, scale, forchw)
        if args.model == 'faster-rcnn':
            # init_def_update=copy.deepcopy(init_def)
            # cc2.UpdateImgInfo(oshape, init_def_update, predict_def, crop_size)
            # ws.RunNetOnce(init_def_update)
            im_info_name, blob = cc2.CreateIMBlob(oshape, predict_def,
                                                  crop_size)
            if args.device.lower() == 'gpu':
                ws.FeedBlob(im_info_name, blob, device_opts_cpu)
            else:
                ws.FeedBlob(im_info_name, blob, device_opts)
        if 'style-transfer' in args.model or (args.model == 'faster-rcnn' and
                                              args.device.lower() == 'gpu'):
            ws.FeedBlob(str(predict_def.op[0].input[0]), imgs)
        else:
            ws.FeedBlob(str(predict_def.op[0].input[0]), imgs, device_opts)
        if predict_def.op[-1].type == 'Accuracy' and validation:
            batch_fname = fnames[r]
            init_label = np.ones((len(fnames[r])), dtype=np.int32)
            for j in range(len(fnames[r])):
                init_label[j] = validation[batch_fname[j]]

            if args.device.lower() == 'gpu':
                ws.FeedBlob(str(predict_def.op[-1].input[1]), init_label,
                            device_opts)
                ws.FeedBlob(str(predict_def.op[-2].input[1]), init_label,
                            device_opts)
            else:
                ws.FeedBlob(str(predict_def.op[-1].input[1]), init_label,
                            device_opts_cpu)
                ws.FeedBlob(str(predict_def.op[-2].input[1]), init_label,
                            device_opts_cpu)

        #if args.profile or predict_def.op[-1].type == 'Accuracy':
        #    ws.RunNet(net)
        #else:
        ws.RunNet(net)
    logging.warning("Start running performance")
    for k, raw in enumerate(images):
        processed_images += len(raw)
        img_start_time = timeit.default_timer()
        if args.dummydata:
            imgs = raw
            oshape = (crop_size, crop_size)
        else:
            imgs, oshape = cc2.ImageProc.PreprocessImages(
                raw, crop_size, rescale_size, mean, scale, forchw,
                need_normalize, color_format)
            #imgs, oshape = cc2.ImageProc.PreprocessImagesByThreading(raw, crop_size, rescale_size, mean, scale, forchw)
        # im_info_name, blob = cc2.CreateIMBlob(oshape, predict_def, crop_size)
        # ws.FeedBlob(im_info_name, blob, device_opts)
        # x = ws.FetchBlob(im_info_name)
        init_label = None
        if predict_def.op[-1].type == 'Accuracy' and args.dummydata:
            init_label = dummy_labels[k]
        elif predict_def.op[-1].type == 'Accuracy' and validation:
            batch_fname = fnames[k]
            init_label = np.ones((len(fnames[k])), dtype=np.int32)
            for j in range(len(fnames[k])):
                init_label[j] = validation[batch_fname[j]]

        if args.model == 'faster-rcnn':
            # init_def_update=copy.deepcopy(init_def)
            # cc2.UpdateImgInfo(oshape, init_def_update, predict_def, crop_size)
            im_info_name, blob = cc2.CreateIMBlob(oshape, predict_def,
                                                  crop_size)

            if args.cosim:
                ws.SwitchWorkspace(inf_ws_name, True)
                # ws.RunNetOnce(init_def_update)
                ws.FeedBlob(im_info_name, blob, device_opts)
                ws.SwitchWorkspace(cosim_ws_name, True)
                # cosim_init_def_update=copy.deepcopy(cosim_init_def)
                # cc2.UpdateImgInfo(oshape, cosim_init_def_update, cosim_predict_def, crop_size)
                # ws.RunNetOnce(cosim_init_def_update)
                ws.FeedBlob(im_info_name, blob, device_cosim)
            else:
                # ws.RunNetOnce(init_def_update)
                if args.device.lower() == 'gpu':
                    ws.FeedBlob(im_info_name, blob, device_opts_cpu)
                else:
                    ws.FeedBlob(im_info_name, blob, device_opts)
        # logging.info("output blob is: {}".format(x))
        # imgs = ImageProc.PreprocessImages(raw, crop_size, mean)
        img_elapsed_time = timeit.default_timer() - img_start_time
        img_time += img_elapsed_time
        if args.cosim or args.int8_cosim:
            ws.SwitchWorkspace(cosim_ws_name)
            if args.cosim:
                ws.FeedBlob(str(cosim_predict_def.op[0].input[0]), imgs,
                            device_cosim)
            else:
                ws.FeedBlob(str(cosim_predict_def.op[0].input[0]), imgs,
                            device_opts)
            ws.SwitchWorkspace(inf_ws_name)
            ws.FeedBlob(str(predict_def.op[0].input[0]), imgs, device_opts)
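            # Run both nets one operator at a time, fetching inputs and
            # outputs from each workspace so they can be compared.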
            for i in range(len(predict_def.op)):
                ws.SwitchWorkspace(inf_ws_name)
                inf_inputs = []
                for inp in predict_def.op[i].input:
                    inf_inputs.append(ws.FetchBlob(str(inp)))
                ws.RunOperatorOnce(predict_def.op[i])
                inf_results = []
                for res in predict_def.op[i].output:
                    inf_results.append(ws.FetchBlob(str(res)))
                ws.SwitchWorkspace(cosim_ws_name)
                cosim_inputs = []
                for inp in cosim_predict_def.op[i].input:
                    cosim_inputs.append(ws.FetchBlob(str(inp)))
                ws.RunOperatorOnce(cosim_predict_def.op[i])
                cosim_results = []
                for res in cosim_predict_def.op[i].output:
                    cosim_results.append(ws.FetchBlob(str(res)))
                if len(inf_inputs) != len(cosim_inputs):
                    logging.error("Wrong number of inputs")
                if len(inf_results) != len(cosim_results):
                    logging.error("Wrong number of outputs")
                    return
                if args.cosim:
                    tol = {'atol': 1e-02, 'rtol': 1e-03}
                else:
                    tol = {'atol': 5, 'rtol': 1e-01}
                logging.warning("begin to check op[{}] {} input".format(
                    i, predict_def.op[i].type))
                # Use a distinct loop variable so the outer image-batch index
                # k is not clobbered (it is checked at the end of the loop).
                for n in range(len(inf_inputs)):
                    if predict_def.op[i].input[n][0] == '_':
                        continue
                    #cc2.assert_allclose(inf_inputs[k], cosim_inputs[k], **tol)
                    #if not np.allclose(inf_inputs[k], cosim_inputs[k], **tol):
                    #    logging.error("Failure in cosim {} op {} input {}"
                    #        .format(
                    #        i,
                    #        predict_def.op[i].type,
                    #        predict_def.op[i].input[k]))
                    #    logging.error(inf_inputs[k].flatten())
                    #    logging.error(cosim_inputs[k].flatten())
                    #    logging.error("Max error: {}"
                    #        .format(
                    #        np.max(np.abs(
                    #            inf_inputs[k] - cosim_inputs[k]))))
                    #    return
                logging.warning("pass checking op[{0}] {1} input".format(
                    i, predict_def.op[i].type))
                logging.warning("begin to check op[{0}] {1} output".format(
                    i, predict_def.op[i].type))
                for j, _ in enumerate(inf_results):
                    if predict_def.op[i].output[j][0] == '_':
                        continue
                    if args.cosim:
                        if not cc2.assert_allclose(inf_results[j],
                                                   cosim_results[j], **tol):
                            logging.error(
                                "failed checking op[{0}] {1} output".format(
                                    i, predict_def.op[i].type))
                            exit()
                    if args.int8_cosim:
                        cc2.assert_allclose(inf_results[j], cosim_results[j],
                                            **tol)
                        cc2.assert_compare(inf_results[j], cosim_results[j],
                                           1e-01, 'ALL')
                    #if not np.allclose(inf_results[j], cosim_results[j], **tol):
                    # logging.error("Failure in cosim {} op {} output {}"
                    #     .format(
                    #     i,
                    #     predict_def.op[i].type,
                    #     predict_def.op[i].output[j]))
                    # logging.error(inf_results[j].flatten())
                    # logging.error(cosim_results[j].flatten())
                    # logging.error("Max error: {}"
                    #     .format(
                    #     np.max(np.abs(
                    #         inf_results[j] - cosim_results[j]))))
                    # return
                logging.warning("pass checking op[{0}] {1} output".format(
                    i, predict_def.op[i].type))
        else:
            if 'style-transfer' in args.model or (args.model == 'faster-rcnn'
                                                  and args.device.lower()
                                                  == 'gpu'):
                ws.FeedBlob(str(predict_def.op[0].input[0]), imgs)
            else:
                ws.FeedBlob(str(predict_def.op[0].input[0]), imgs, device_opts)
            if predict_def.op[-1].type == 'Accuracy':
                if args.device.lower() == 'gpu':
                    ws.FeedBlob(str(predict_def.op[-1].input[1]), init_label,
                                device_opts)
                    if predict_def.op[-2].type == 'Accuracy':
                        ws.FeedBlob(str(predict_def.op[-2].input[1]),
                                    init_label, device_opts)
                    elif predict_def.op[-3].type == 'Accuracy':
                        ws.FeedBlob(str(predict_def.op[-3].input[1]),
                                    init_label, device_opts)
                else:
                    ws.FeedBlob(str(predict_def.op[-1].input[1]), init_label,
                                device_opts_cpu)
                    if predict_def.op[-2].type == 'Accuracy':
                        ws.FeedBlob(str(predict_def.op[-2].input[1]),
                                    init_label, device_opts_cpu)
                    elif predict_def.op[-3].type == 'Accuracy':
                        ws.FeedBlob(str(predict_def.op[-3].input[1]),
                                    init_label, device_opts_cpu)

            comp_start_time = timeit.default_timer()
            #if args.profile or predict_def.op[-1].type == 'Accuracy':
            #    ws.RunNet(net)
            #else:
            ws.RunNet(net)
            comp_elapsed_time = timeit.default_timer() - comp_start_time
            comp_time += comp_elapsed_time
            output = ws.FetchBlob(str(predict_def.op[-1].output[0]))
            if predict_def.op[-2].type == 'Accuracy':
                output2 = ws.FetchBlob(str(predict_def.op[-2].output[0]))
            elif predict_def.op[-3].type == 'Accuracy':
                output2 = ws.FetchBlob(str(predict_def.op[-3].output[0]))
            elif predict_def.op[-1].type == 'BoxWithNMSLimit':
                output2 = ws.FetchBlob(str(predict_def.op[-1].output[1]))
                output3 = ws.FetchBlob(str(predict_def.op[-1].output[2]))
            logging.warning(
                "[{0:.2%}] Output shape: {1}, computing in {2:.10f}"
                " seconds, processing {3} images in {4:.10f} seconds.".format(
                    ((k + 1) / len(images)), output.shape, comp_elapsed_time,
                    len(raw), img_elapsed_time))
            if predict_def.op[-1].type == 'BoxWithNMSLimit':
                outputs.append([output, output2, output3])
            elif predict_def.op[-1].type != 'Accuracy':
                outputs.append(output)
            else:
                accuracy_top1.append(output2)
                accuracy_top5.append(output)
            if args.profile:
                logging.warning("observer time = {}".format(ob.average_time()))
                logging.warning("observer time = {}".format(
                    ob.average_time_children()))

        del imgs
        if k >= (iterations - 1):
            logging.warning(
                "Exit after running {} iterations".format(iterations))
            break
    if args.profile:
        net.RemoveObserver(ob)

    if args.cosim:
        ws.SwitchWorkspace(def_ws_name)
        logging.info("Cosim passed")
        return
    if comp_time <= 0:
        logging.error("The total time is invalid!")
        return
    info_str = ""
    if len(accuracy_top1) > 0:
        mean_accuracy_top1 = 0
        mean_accuracy_top5 = 0
        for i, _ in enumerate(accuracy_top1):
            mean_accuracy_top1 += accuracy_top1[i] * batch_size
            mean_accuracy_top5 += accuracy_top5[i] * batch_size
        mean_accuracy_top1 /= batch_size * len(accuracy_top1)
        mean_accuracy_top5 /= batch_size * len(accuracy_top5)
        info_str += "\nAccuracy: {:.5%}".format(mean_accuracy_top1)
        info_str += "\nTop5Accuracy: {:.5%}".format(mean_accuracy_top5)
        total_image = processed_images
        logging.critical(
            "\nImages per second: {0:.10f}\nTotal computing time:"
            " {1:.10f} seconds\nTotal image processing time: {2:.10f} seconds\n"
            "Total model loading time: {3:.10f} seconds\nTotal images: {4}{5}".
            format(total_image / comp_time, comp_time, img_time,
                   model_elapsed_time, total_image, info_str))
        return
    if args.annotations:
        logging.info(" the total length of outputs is {}".format(len(outputs)))
        logging.critical("result is ={}".format(
            cc2.prepare_and_compute_map_data(outputs, fnames, apath)))
    info_str = ""
    accuracy = None
    top5accuracy = None
    summary = None
    if model_info["output_type"] == "segmentation" or args.dummydata:
        total_image = processed_images
    elif model_info["output_type"] == "possibility":
        results, total_image = cc2.ParsePossOutputs(outputs)
        summary = cc2.ParsePossResults(results, labels, validation, fnames)
        if not summary:
            logging.error("Failed to parse the results!")
            return
        elif total_image <= 0 or len(summary) != total_image:
            logging.error("No available results!")
            return
        if validation:
            accuracy = 0
            top5accuracy = 0
            for res in summary:
                if res[1] == "Pass":
                    accuracy += 1
                    top5accuracy += 1
                elif res[1] == "Top5Pass":
                    top5accuracy += 1
            accuracy = accuracy / total_image
            top5accuracy = top5accuracy / total_image
            info_str += "\nAccuracy: {:.5%}".format(accuracy)
            info_str += "\nTop5Accuracy: {:.5%}".format(top5accuracy)
    elif model_info["output_type"] == "post image":
        results, total_image = cc2.ParsePostOutputs(outputs)
        if args.post_images_path:
            cc2.SavePostImages(results, args.post_images_path, fnames)
    logging.critical(
        "\nImages per second: {0:.10f}\nTotal computing time:"
        " {1:.10f} seconds\nTotal image processing time: {2:.10f} seconds\n"
        "Total model loading time: {3:.10f} seconds\nTotal images: {4}{5}".
        format(total_image / comp_time, comp_time, img_time,
               model_elapsed_time, total_image, info_str))
    cc2.SaveOutput(args, summary, accuracy, top5accuracy, comp_time,
                   total_image, img_time, model_elapsed_time)
Example #7
def bmuf_process(filestore_dir, process_id, shared_results, nesterov=False):
    # We need to import caffe2 in every process to initialize CUDA independently.
    from caffe2.python import core, cnn, data_parallel_model, workspace, dyndep
    from caffe2.proto import caffe2_pb2
    dyndep.InitOpsLibrary("@/caffe2/caffe2/distributed:file_store_handler_ops")

    if not workspace.has_gpu_support:
        log.info('No GPU support; test ignored.')
        return

    if workspace.NumCudaDevices() < 4:
        log.info('Fewer than 4 GPUs available; test ignored.')
        return

    model = cnn.CNNModelHelper(order="NHWC", name="test")

    gpu_ids = [0, 1] if process_id == 0 else [2, 3]

    def _model_build_fun(model, loss_scale):
        fc = model.FC("data", "fc", 16, 1, ("ConstantFill", {}),
                      ("ConstantFill", {}))
        fc_fl = model.FlattenToVec(fc, "fc_fl")
        sigm = model.Sigmoid(fc_fl, "sigm")
        sq = model.SquaredL2Distance([sigm, "label"], "sq")
        loss = model.AveragedLoss(sq, "loss")
        loss = model.Scale(loss, scale=loss_scale)

        # For testing explicit sync
        model.param_init_net.UniformFill([], ["sync_num"], shape=[1])
        return [loss]

    def _input_builder_fun(model):
        return None

    def _param_update_fun(model):
        ITER = model.Iter("ITER")
        LR = model.net.LearningRate(
            [ITER],
            "LR",
            base_lr=(-0.1),
            policy="fixed",
        )
        ONE = model.param_init_net.ConstantFill(
            [],
            "ONE",
            shape=[1],
            value=1.0,
        )
        for param in model.GetParams():
            grad = model.param_to_grad[param]
            model.WeightedSum([param, ONE, grad, LR], param)

    def _generate_data(gpu_devices, process_id):
        np.random.seed(26 + process_id * 10)
        # Each run has same input, independent of number of gpus
        batch_size = 64
        for _ in range(0, 10):
            full_data = np.random.rand(batch_size, 16)
            full_labels = np.round(full_data[:, 0])
            batch_per_device = batch_size // len(gpu_devices)

            for (j, g) in enumerate(gpu_devices):
                st = j * batch_per_device
                en = st + batch_per_device
                data = full_data[st:en, :].astype(np.float32)
                labels = full_labels[st:en].astype(np.float32)
                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                    workspace.FeedBlob("gpu_{}/data".format(g), data)
                    workspace.FeedBlob("gpu_{}/label".format(g), labels)

    _generate_data(gpu_ids, process_id)

    workspace.RunOperatorOnce(
        core.CreateOperator("FileStoreHandlerCreate", [], ["store_handler"],
                            path=filestore_dir))
    rendezvous = dict(kv_handler="store_handler",
                      shard_id=process_id,
                      num_shards=2,
                      engine="GLOO",
                      exit_nets=None)

    data_parallel_model.Parallelize_GPU_BMUF(
        model,
        _input_builder_fun,
        _model_build_fun,
        _param_update_fun,
        devices=gpu_ids,
        rendezvous=rendezvous,
        nesterov=nesterov,
        add_blobs_to_sync=["sync_num"],
    )

    data_parallel_model.RunInitNet(model)

    def _gpu_pid(gpu_id, pid):
        if pid == 1:
            return gpu_id + 2
        return gpu_id

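    # Before the first iteration runs, the BMUF momentum blob is initialized to zeros.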
    np.testing.assert_equal(
        workspace.FetchBlob("gpu_{}/fc_w_v".format(_gpu_pid(0, process_id))),
        np.zeros(16).astype(np.float32).reshape(1, 16))

    # Run the algorithm for one iteration to have non-zero params.
    data_parallel_model.RunNet(model, 1)

    # Save iteration momentum and post local update params
    results = {}
    v_b_ = workspace.FetchBlob("gpu_{}/fc_b_v".format(_gpu_pid(0, process_id)))
    v_w_ = workspace.FetchBlob("gpu_{}/fc_w_v".format(_gpu_pid(0, process_id)))

    results['v_b_'] = v_b_
    results['v_w_'] = v_w_

    workspace.RunNetOnce(model.net)

    b_0_ = workspace.FetchBlob("gpu_{}/fc_b".format(_gpu_pid(0, process_id)))
    w_0_ = workspace.FetchBlob("gpu_{}/fc_w".format(_gpu_pid(0, process_id)))
    b_1_ = workspace.FetchBlob("gpu_{}/fc_b".format(_gpu_pid(1, process_id)))
    w_1_ = workspace.FetchBlob("gpu_{}/fc_w".format(_gpu_pid(1, process_id)))

    results['b_0_'] = b_0_
    results['w_0_'] = w_0_
    results['b_1_'] = b_1_
    results['w_1_'] = w_1_

    # Test sync
    if process_id == 0:
        workspace.FeedBlob(model._device_prefix + "_0/sync_num",
                           np.array([2603]).astype(np.float32),
                           device_option=core.DeviceOption(
                               model._device_type, 0))

    # Compute block gradients.
    b_g_ = workspace.FetchBlob("gpu_{}/fc_b_g".format(_gpu_pid(0, process_id)))
    w_g_ = workspace.FetchBlob("gpu_{}/fc_w_g".format(_gpu_pid(0, process_id)))
    results['b_g_'] = b_g_
    results['w_g_'] = w_g_
    workspace.RunNetOnce(model._global_model_param_updates_net)

    #  g_b = (b_0_ + b_1_) / 2 - b_g_
    #  g_w = (w_0_ + w_1_) / 2 - w_g_
    v_b = workspace.FetchBlob("gpu_{}/fc_b_v".format(_gpu_pid(0, process_id)))
    v_w = workspace.FetchBlob("gpu_{}/fc_w_v".format(_gpu_pid(0, process_id)))
    w_g = workspace.FetchBlob("gpu_{}/fc_w_g".format(_gpu_pid(0, process_id)))
    b_g = workspace.FetchBlob("gpu_{}/fc_b_g".format(_gpu_pid(0, process_id)))
    w_0 = workspace.FetchBlob("gpu_{}/fc_w".format(_gpu_pid(0, process_id)))
    b_0 = workspace.FetchBlob("gpu_{}/fc_b".format(_gpu_pid(0, process_id)))
    w_1 = workspace.FetchBlob("gpu_{}/fc_w".format(_gpu_pid(1, process_id)))
    b_1 = workspace.FetchBlob("gpu_{}/fc_b".format(_gpu_pid(1, process_id)))
    results['v_b'] = v_b
    results['v_w'] = v_w
    results['w_g'] = w_g
    results['b_g'] = b_g
    results['w_0'] = w_0
    results['b_0'] = b_0
    results['w_1'] = w_1
    results['b_1'] = b_1

    # Test add_blobs_to_sync
    for j in model._devices:
        sync = workspace.FetchBlob(model._device_prefix +
                                   "_{}/sync_num".format(j))[0]
        results['sync_{}'.format(j)] = sync

    shared_results[process_id] = results
Example #8
    def test_convolution_sum_relu_fusion(self, stride, pad, kernel, size,
                                         input_channels, output_channels,
                                         batch_size, use_bias, group, gc, dc):
        conv = core.CreateOperator(
            "Conv",
            ["X0", "w0", "b0"] if use_bias else ["X0", "w0"],
            ["Y0"],
            stride=stride,
            pad=pad,
            kernel=kernel,
            group=group,
            device_option=dc[0]
        )
        sum = core.CreateOperator(
            "Sum",
            ["S0", "Y0"],
            ["S0"],
            device_option=dc[0]
        )
        relu = core.CreateOperator(
            "Relu",
            ["S0"],
            ["S0"],
            device_option=dc[0]
        )
        conv_fusion = core.CreateOperator(
            "ConvFusion",
            ["X1", "w1", "b1", "S1"] if use_bias else ["X1", "w1", "S1"],
            ["S1"],
            stride=stride,
            pad=pad,
            kernel=kernel,
            group=group,
            fusion_type=3,
            device_option=dc[1]
        )
        X = np.random.rand(
            batch_size, input_channels * group, size, size).astype(np.float32) - 0.5
        w = np.random.rand(
                output_channels * group, input_channels, kernel, kernel) \
            .astype(np.float32) - 0.5
        b = np.random.rand(output_channels * group).astype(np.float32) - 0.5

        old_ws_name = workspace.CurrentWorkspace()
        workspace.SwitchWorkspace("_device_check_", True)
        workspace.FeedBlob('X0', X, dc[0])
        workspace.FeedBlob('w0', w, dc[0])
        workspace.FeedBlob('b0', b, dc[0])
        workspace.RunOperatorOnce(conv)
        Y0 = workspace.FetchBlob('Y0')
        S = np.random.rand(*Y0.shape).astype(np.float32) - 0.5
        workspace.FeedBlob('S0', S, dc[0])
        workspace.RunOperatorOnce(sum)
        workspace.RunOperatorOnce(relu)
        S0 = workspace.FetchBlob('S0')

        workspace.ResetWorkspace()
        workspace.FeedBlob('X1', X, dc[1])
        workspace.FeedBlob('w1', w, dc[1])
        workspace.FeedBlob('b1', b, dc[1])
        workspace.FeedBlob('S1', S, dc[1])
        workspace.RunOperatorOnce(conv_fusion)
        S1 = workspace.FetchBlob('S1')

        if not np.allclose(S0, S1, atol=0.01, rtol=0.01):
            print(S1.flatten())
            print(S0.flatten())
            print(np.max(np.abs(S1 - S0)))
            self.assertTrue(False)
        workspace.SwitchWorkspace(old_ws_name)
Example #9
    def test_convolution_relu_fusion(self, stride, pad, kernel, size,
                                     input_channels, output_channels,
                                     batch_size, use_bias, group, gc, dc):
        conv = core.CreateOperator(
            "Conv",
            ["X0", "w0", "b0"] if use_bias else ["X0", "w0"],
            ["Y0"],
            stride=stride,
            pad=pad,
            kernel=kernel,
            group=group,
            device_option=dc[0]
        )
        relu = core.CreateOperator(
            "Relu",
            ["Y0"],
            ["Y0"],
            device_option=dc[0]
        )

        # Manual fusion
        conv_fusion = core.CreateOperator(
            "ConvFusion",
            ["X1", "w1", "b1"] if use_bias else ["X1", "w1"],
            ["Y1"],
            stride=stride,
            pad=pad,
            kernel=kernel,
            group=group,
            fusion_type=1,
            device_option=dc[1]
        )

        # Auto fusion
        old_net = caffe2_pb2.NetDef()
        conv_old = caffe2_pb2.OperatorDef()
        conv_old.CopyFrom(conv)
        conv_old.device_option.CopyFrom(dc[1])
        relu_old = caffe2_pb2.OperatorDef()
        relu_old.CopyFrom(relu)
        relu_old.device_option.CopyFrom(dc[1])
        old_net.op.extend([conv_old, relu_old])
        net = core.Net("net")
        net.Proto().CopyFrom(old_net)
        optimizeForIDEEP(net)
        self.assertTrue(len(net.Proto().op) == 1)
        self.assertTrue(net.Proto().op[0].type == "ConvFusion")

        X = np.random.rand(
            batch_size, input_channels * group, size, size).astype(np.float32) - 0.5
        w = np.random.rand(
                output_channels * group, input_channels, kernel, kernel) \
            .astype(np.float32) - 0.5
        b = np.random.rand(output_channels * group).astype(np.float32) - 0.5

        old_ws_name = workspace.CurrentWorkspace()
        workspace.SwitchWorkspace("_device_check_", True)
        workspace.FeedBlob('X0', X, dc[0])
        workspace.FeedBlob('w0', w, dc[0])
        workspace.FeedBlob('b0', b, dc[0])
        workspace.RunOperatorOnce(conv)
        workspace.RunOperatorOnce(relu)
        Y0 = workspace.FetchBlob('Y0')

        workspace.ResetWorkspace()
        workspace.FeedBlob('X1', X, dc[1])
        workspace.FeedBlob('w1', w, dc[1])
        workspace.FeedBlob('b1', b, dc[1])
        workspace.RunOperatorOnce(conv_fusion)
        Y1 = workspace.FetchBlob('Y1')
        if not np.allclose(Y0, Y1, atol=0.01, rtol=0.01):
            print(Y1.flatten())
            print(Y0.flatten())
            print(np.max(np.abs(Y1 - Y0)))
            self.assertTrue(False)

        workspace.ResetWorkspace()
        workspace.FeedBlob('X0', X, dc[1])
        workspace.FeedBlob('w0', w, dc[1])
        workspace.FeedBlob('b0', b, dc[1])
        workspace.RunOperatorOnce(net.Proto().op[0])
        Y2 = workspace.FetchBlob('Y0')
        if not np.allclose(Y0, Y2, atol=0.01, rtol=0.01):
            print(Y2.flatten())
            print(Y0.flatten())
            print(np.max(np.abs(Y2 - Y0)))
            self.assertTrue(False)

        workspace.SwitchWorkspace(old_ws_name)
Example #10
# ------------------------------------------------------------------------------------

# Create an operator.
op = core.CreateOperator(
    "Relu",  # The type of operator that we want to run
    ["X"],  # A list of input blobs by their names
    ["Y"],  # A list of output blobs by their names
)
# and we are done!

print("Type of the created op is: {}".format(type(op)))
print("Content:\n")
print(str(op))

workspace.FeedBlob("X", np.random.randn(2, 3).astype(np.float32))
workspace.RunOperatorOnce(op)

print("Current blobs in the workspace: {}\n".format(workspace.Blobs()))
print("X:\n{}\n".format(workspace.FetchBlob("X")))
print("Y:\n{}\n".format(workspace.FetchBlob("Y")))
print("Expected:\n{}\n".format(np.maximum(workspace.FetchBlob("X"), 0)))

op = core.CreateOperator(
    "GaussianFill",
    [],  # GaussianFill does not take any input blobs.
    ["Z"],
    shape=[100, 100],  # shape argument as a list of ints.
    mean=1.0,  # mean as a single float
    std=1.0,  # std as a single float
)
print("Content of op:\n")
print(str(op))
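
To sanity-check the filler, run it and inspect the output statistics (a minimal sketch that reuses the `workspace` and `np` already imported by these examples):

workspace.RunOperatorOnce(op)
Z = workspace.FetchBlob("Z")
# With mean=1.0 and std=1.0 over 100x100 samples, both statistics should land near 1.0.
print("Z mean: {:.4f}, std: {:.4f}".format(Z.mean(), Z.std()))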
Example #11
    def test_swish_int8(self):
        np.random.seed(0)
        workspace.ResetWorkspace()
        n = 256

        X_fp32 = np.linspace(-20.5, 8., num=n).astype(np.float32).reshape(1, n)
        Y_fp32 = self._swish(X_fp32)
        X_scale, X_zero_point = self._get_scale_zp(X_fp32)
        Y_scale, Y_zero_point = self._get_scale_zp(Y_fp32)
        W_fp32 = np.identity(n, dtype=np.float32)
        b_fp32 = np.zeros((n,), dtype=np.float32)

        workspace.FeedBlob("X", X_fp32)
        workspace.FeedBlob("W", W_fp32)
        workspace.FeedBlob("b", b_fp32)

        workspace.RunOperatorOnce(
            core.CreateOperator(
                "Int8FCPackWeight",
                ["W"],
                ["W_int8"],
                engine="DNNLOWP",
                save_unpacked_weights=True,
                in_scale=X_scale,
            )
        )

        ref_net1 = core.Net("net")
        ref_net1.Int8QuantizeNNPI(
            ["X"],
            ["X_int8"],
            Y_scale=X_scale,
            Y_zero_point=X_zero_point
        )
        ref_net1.Int8FCFakeAcc32NNPI(
            ["X_int8", "W_int8", "b"],
            ["U_int8"],
            Y_scale=X_scale,
            Y_zero_point=X_zero_point,
        )
        ref_net1.SwishFakeInt8NNPI(
            ["U_int8"],
            ["Y"],
            X_scale=X_scale,
            X_zero_point=X_zero_point,
            Y_scale=Y_scale,
            Y_zero_point=Y_zero_point
        )
        ref_net1.Proto().external_output.append("Y")

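        # A second reference net that decomposes Swish into Dequantize -> Swish(fp16) -> Quantize;
        # this decomposed form is what gets rewritten and lowered through ONNXIFI below.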
        ref_net = core.Net("net")
        ref_net.Int8QuantizeNNPI(
            ["X"],
            ["X_int8"],
            Y_scale=X_scale,
            Y_zero_point=X_zero_point
        )
        ref_net.Int8FCFakeAcc32NNPI(
            ["X_int8", "W_int8", "b"],
            ["U_int8"],
            Y_scale=X_scale,
            Y_zero_point=X_zero_point,
        )
        ref_net.Int8DequantizeNNPI(
            ["U_int8"],
            ["U_fp16"],
            UsingOneOverScale=False
        )
        ref_net.SwishFakeFp16NNPI(
            ["U_fp16"],
            ["Y_fp16"]
        )
        ref_net.Int8QuantizeNNPI(
            ["Y_fp16"],
            ["Y"],
            Y_scale=Y_scale,
            Y_zero_point=Y_zero_point
        )
        ref_net.Proto().external_output.append("Y")

        # run ref_net
        workspace.RunNetOnce(ref_net1)
        Y_fbgemm = workspace.FetchInt8Blob("Y")

        # run onnxifi net
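        # Rewrite the NNPI-specific op types to the generic Int8/Swish types that the
        # ONNXIFI transform recognizes.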
        ref_net.Proto().op[0].type = "Int8Quantize"
        ref_net.Proto().op[1].type = "Int8FC"
        ref_net.Proto().op[2].type = "Int8Dequantize"
        ref_net.Proto().op[3].type = "Swish"
        ref_net.Proto().op[4].type = "Int8Quantize"
        net_onnxified = onnxifi_caffe2_net(
            ref_net.Proto(),
            {},
            debug=True,
            adjust_batch=False,
            use_onnx=False,
            weight_names=["W_int8", "b"],
        )
        num_onnxified_ops = sum(
            1 if o.type == "Onnxifi" else 0 for o in net_onnxified.op
        )
        np.testing.assert_equal(num_onnxified_ops, 1)
        # TODO: add an assertion to check the optimized net
        # fused Dequantize->Swish->Quantize to QuantizedSwish
        workspace.CreateNet(net_onnxified)
        workspace.RunNet(net_onnxified.name)
        Y_glow = workspace.FetchInt8Blob("Y")
        U_int8 = workspace.FetchInt8Blob("U_int8")

        diff_Y = np.abs(Y_glow.data - Y_fbgemm.data)

        num_mismatches = np.count_nonzero(diff_Y)
        max_diff = np.max(diff_Y)
        if max_diff > 0 or Y_glow.scale != Y_fbgemm.scale or \
           Y_glow.zero_point != Y_fbgemm.zero_point:
            print_test_debug_info(
                "QuantizedSwish",
                {
                    "X": X_fp32,
                    "X_scale": X_scale,
                    "X_zero_point": X_zero_point,
                    "Y_scale": Y_scale,
                    "Y_zero_point": Y_zero_point,
                    "U_int8": U_int8,
                    "Y_fbgemm": Y_fbgemm,
                    "Y_glow": Y_glow,
                    "diff": diff_Y,
                    "max_diff": max_diff,
                    "num_mismatches": num_mismatches,
                },
            )
            assert 0
Example #12
 def _run_zero_even_op(self, X):
     op = core.CreateOperator('ZeroEven', ['X'], ['Y'])
     workspace.FeedBlob('X', X)
     workspace.RunOperatorOnce(op)
     Y = workspace.FetchBlob('Y')
     return Y
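
A hypothetical call from within the same test class (assuming, as in the Caffe2 custom-operator tutorial, that ZeroEven zeroes the even-indexed elements of a 1-D float tensor):

X = np.arange(8).astype(np.float32)
Y = self._run_zero_even_op(X)
# Expected under that assumption: [0., 1., 0., 3., 0., 5., 0., 7.]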
Example #13
    def test_int8_quantize(self, n, rand_seed, non_zero_offset):
        print("n={}, rand_seed={}".format(n, rand_seed))
        np.random.seed(rand_seed)
        workspace.ResetWorkspace()

        if non_zero_offset:
            X_fp32 = np.random.uniform(-1, 1, size=(n, n)).astype(np.float16) \
                .astype(np.float32)
        else:
            X_fp32 = np.random.rand(n, n).astype(np.float16).astype(np.float32)

        W_fp32 = np.identity(n, dtype=np.float32)
        b_fp32 = np.zeros((n, ), dtype=np.float32)

        X_scale, X_zero_point = self._get_scale_zp(X_fp32)

        workspace.FeedBlob("X", X_fp32)
        workspace.FeedBlob("W", W_fp32)
        workspace.FeedBlob("b", b_fp32)

        workspace.RunOperatorOnce(
            core.CreateOperator(
                "Int8FCPackWeight",
                ["W"],
                ["W_int8"],
                engine="DNNLOWP",
                save_unpacked_weights=True,
                in_scale=X_scale,
            ))

        ref_net = core.Net("net")
        ref_net.Int8QuantizeNNPI(["X"], ["X_int8"],
                                 Y_scale=X_scale,
                                 Y_zero_point=X_zero_point)
        ref_net.Int8FCFakeAcc32NNPI(
            ["X_int8", "W_int8", "b"],
            ["Y_int8"],
            Y_scale=X_scale,
            Y_zero_point=X_zero_point,
        )
        ref_net.Int8DequantizeNNPI(["Y_int8"], ["Y"])
        ref_net.Proto().external_output.append("Y")

        # run ref_net
        workspace.RunNetOnce(ref_net)
        Y_fbgemm = workspace.FetchBlob("Y")

        # run onnxifi net
        ref_net.Proto().op[0].type = "Int8Quantize"
        ref_net.Proto().op[1].type = "Int8FC"
        ref_net.Proto().op[2].type = "Int8Dequantize"
        net_onnxified = onnxifi_caffe2_net(
            ref_net.Proto(),
            {},
            debug=True,
            adjust_batch=False,
            use_onnx=False,
            weight_names=["W_int8", "b"],
        )
        num_onnxified_ops = sum(1 if o.type == "Onnxifi" else 0
                                for o in net_onnxified.op)
        np.testing.assert_equal(num_onnxified_ops, 1)
        workspace.CreateNet(net_onnxified)
        workspace.RunNet(net_onnxified.name)
        Y_glow = workspace.FetchBlob("Y")

        if not np.allclose(Y_glow, Y_fbgemm):
            diff_Y = np.abs(Y_glow - Y_fbgemm)
            print_test_debug_info(
                "int8_fc",
                {
                    "seed": rand_seed,
                    "n": n,
                    "X": X_fp32,
                    "W": W_fp32,
                    "b": b_fp32,
                    "Y_fbgemm": Y_fbgemm,
                    "Y_glow": Y_glow,
                    "diff": diff_Y,
                    "maxdiff": diff_Y.max(axis=1),
                },
            )
            assert 0
Example #14
    def test_int8_fc(self, n, m, k, rand_seed, quantize_bias, f):
        print(
            f"n={n}, m={m}, k={k}, rand_seed={rand_seed}, quantize_bias={quantize_bias}"
        )
        np.random.seed(rand_seed)
        workspace.ResetWorkspace()

        ff = float(f)
        X_fp32 = np.random.uniform(-ff, ff, size=(m, k)).astype(np.float32)
        W_fp32 = np.random.uniform(-ff, ff, size=(n, k)).astype(np.float32)
        b_fp32 = np.random.uniform(-ff, ff, size=(n)).astype(np.float32)

        X_scale, X_zero_point = self._get_scale_zp(X_fp32)
        Y_fp32 = np.dot(X_fp32, W_fp32.T) + b_fp32
        Y_scale, Y_zero_point = self._get_scale_zp(Y_fp32)

        workspace.FeedBlob("X", X_fp32)
        workspace.FeedBlob("W", W_fp32)
        workspace.FeedBlob("b", b_fp32)

        workspace.RunOperatorOnce(
            core.CreateOperator(
                "Int8FCPackWeight",
                ["W", "b"] if quantize_bias else ["W"],
                ["W_int8", "b_int32"] if quantize_bias else ["W_int8"],
                engine="DNNLOWP",
                save_unpacked_weights=True,
                in_scale=X_scale,
            ))

        ref_net = core.Net("net")
        ref_net.Int8QuantizeNNPI(["X"], ["X_int8"],
                                 Y_scale=X_scale,
                                 Y_zero_point=X_zero_point)
        ref_net.Int8FCFakeAcc32NNPI(
            ["X_int8", "W_int8", "b_int32" if quantize_bias else "b"],
            ["Y_int8"],
            Y_scale=Y_scale,
            Y_zero_point=Y_zero_point,
        )
        ref_net.Int8DequantizeNNPI(["Y_int8"], ["Y"])
        ref_net.Proto().external_output.append("Y")

        # run ref_net
        workspace.RunNetOnce(ref_net)
        Y_fbgemm = workspace.FetchBlob("Y")

        # run onnxifi net
        ref_net.Proto().op[0].type = "Int8Quantize"
        ref_net.Proto().op[1].type = "Int8FC"
        ref_net.Proto().op[2].type = "Int8Dequantize"
        net_onnxified = onnxifi_caffe2_net(
            ref_net.Proto(),
            {},
            debug=True,
            adjust_batch=False,
            use_onnx=False,
            weight_names=["W_int8", "b_int32"]
            if quantize_bias else ["W_int8", "b"],
        )
        num_onnxified_ops = sum(1 if o.type == "Onnxifi" else 0
                                for o in net_onnxified.op)
        np.testing.assert_equal(num_onnxified_ops, 1)
        workspace.CreateNet(net_onnxified)
        workspace.RunNet(net_onnxified.name)
        Y_glow = workspace.FetchBlob("Y")

        if not np.allclose(Y_glow, Y_fbgemm):
            diff_Y = np.abs(Y_glow - Y_fbgemm)
            print_test_debug_info(
                "int8_fc",
                {
                    "seed": rand_seed,
                    "n": n,
                    "m": m,
                    "k": k,
                    "X": X_fp32,
                    "W": W_fp32,
                    "b": b_fp32,
                    "Y_fbgemm": Y_fbgemm,
                    "Y_glow": Y_glow,
                    "diff": diff_Y,
                    "maxdiff": diff_Y.max(axis=1),
                },
            )
            assert 0
Example #15
 def create_queue(queue_name, num_blobs, capacity):
     # Create a named blobs queue and return a scoped reference to it.
     workspace.RunOperatorOnce(
         core.CreateOperator("CreateBlobsQueue", [], [queue_name],
                             num_blobs=num_blobs,
                             capacity=capacity))
     return core.ScopedBlobReference(queue_name)
Example #16
    def _test_index_ops(self, entries, dtype, index_create_op):
        workspace.RunOperatorOnce(
            core.CreateOperator(index_create_op, [], ['index'],
                                max_elements=10))
        my_entries = np.array([entries[0], entries[1], entries[2]],
                              dtype=dtype)

        workspace.FeedBlob('entries', my_entries)
        workspace.RunOperatorOnce(
            core.CreateOperator('IndexLoad', ['index', 'entries'], ['index']))
        query1 = np.array([entries[0], entries[3], entries[0], entries[4]],
                          dtype=dtype)

        workspace.FeedBlob('query1', query1)
        workspace.RunOperatorOnce(
            core.CreateOperator('IndexGet', ['index', 'query1'], ['result1']))
        result1 = workspace.FetchBlob('result1')
        np.testing.assert_array_equal([1, 4, 1, 5], result1)

        workspace.RunOperatorOnce(
            core.CreateOperator('IndexFreeze', ['index'], ['index']))

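        # After freezing, lookups of entries not already in the index return the default id 0.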
        query2 = np.array(
            [entries[5], entries[4], entries[0], entries[6], entries[7]],
            dtype=dtype)
        workspace.FeedBlob('query2', query2)
        workspace.RunOperatorOnce(
            core.CreateOperator('IndexGet', ['index', 'query2'], ['result2']))
        result2 = workspace.FetchBlob('result2')
        np.testing.assert_array_equal([0, 5, 1, 0, 0], result2)

        workspace.RunOperatorOnce(
            core.CreateOperator('IndexSize', ['index'], ['index_size']))
        size = workspace.FetchBlob('index_size')
        self.assertEqual(size, 6)

        workspace.RunOperatorOnce(
            core.CreateOperator('IndexStore', ['index'], ['stored_entries']))
        stored_actual = workspace.FetchBlob('stored_entries')
        new_entries = np.array([entries[3], entries[4]], dtype=dtype)
        np.testing.assert_array_equal(
            np.concatenate((my_entries, new_entries)), stored_actual)

        workspace.RunOperatorOnce(
            core.CreateOperator(index_create_op, [], ['index2']))

        workspace.RunOperatorOnce(
            core.CreateOperator('IndexLoad', ['index2', 'stored_entries'],
                                ['index2'],
                                skip_first_entry=1))

        workspace.RunOperatorOnce(
            core.CreateOperator('IndexSize', ['index2'], ['index2_size']))
        index2_size = workspace.FetchBlob('index2_size')
        self.assertEqual(index2_size, 5)

        # test serde
        with tempfile.NamedTemporaryFile() as tmp:
            workspace.RunOperatorOnce(
                core.CreateOperator('Save', ['index'], [],
                                    absolute_path=1,
                                    db_type='minidb',
                                    db=tmp.name))
            # frees up the blob
            workspace.FeedBlob('index', np.array([]))
            # reloads the index
            workspace.RunOperatorOnce(
                core.CreateOperator('Load', [], ['index'],
                                    absolute_path=1,
                                    db_type='minidb',
                                    db=tmp.name))
            query3 = np.array(
                [entries[0], entries[3], entries[0], entries[4], entries[4]],
                dtype=dtype)

            workspace.FeedBlob('query3', query3)
            workspace.RunOperatorOnce(
                core.CreateOperator('IndexGet', ['index', 'query3'],
                                    ['result3']))
            result3 = workspace.FetchBlob('result3')
            np.testing.assert_array_equal([1, 4, 1, 5, 5], result3)
Example #17
def Train(args):
    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus

    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)
    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    train_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustive_search': True,
        'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
    }
    train_model = model_helper.ModelHelper(name="resnet152",
                                           arg_scope=train_arg_scope)

    num_shards = args.num_shards
    shard_id = args.shard_id
    interfaces = args.distributed_interfaces.split(",")

    if os.getenv("OMPI_COMM_WORLD_SIZE") is not None:
        num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1))
        shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0))
        if num_shards > 1:
            rendezvous = dict(kv_handler=None,
                              num_shards=num_shards,
                              shard_id=shard_id,
                              engine="GLOO",
                              transport=args.distributed_transport,
                              interface=interfaces[0],
                              mpi_rendezvous=True,
                              exit_nets=None)

    elif num_shards > 1:
        store_handler = "store_handler"
        if args.redis_host is not None:
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate",
                    [],
                    [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                ))
        else:
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate",
                    [],
                    [store_handler],
                    path=args.file_store_path,
                    prefix=args.run_id,
                ))

        rendezvous = dict(kv_handler=store_handler,
                          shard_id=shard_id,
                          num_shards=num_shards,
                          engine="GLOO",
                          transport=args.distributed_transport,
                          interface=interfaces[0],
                          exit_nets=None)

    else:
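        # Single-machine, single-shard run: no rendezvous is needed.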
        rendezvous = None

    def create_resnet152_model_ops(model, loss_scale):
        initializer = (pFP16Initializer
                       if args.dtype == 'float16' else Initializer)

        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=args.enable_tensor_core,
                            float16_compute=args.float16_compute):
            pred = resnet.create_resnet152(
                model,
                "data",
                num_input_channels=args.num_channels,
                num_labels=args.num_labels,
                no_bias=True,
                no_loss=True,
            )

        if args.dtype == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy")
        return [loss]

    def add_optimizer(model):
        stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)

        if args.float16_compute:
            opt = optimizer.build_fp16_sgd(model,
                                           args.base_learning_rate,
                                           momentum=0.9,
                                           nesterov=1,
                                           weight_decay=args.weight_decay,
                                           policy="step",
                                           stepsize=stepsz,
                                           gamma=0.1)
        else:
            optimizer.add_weight_decay(model, args.weight_decay)
            opt = optimizer.build_multi_precision_sgd(model,
                                                      args.base_learning_rate,
                                                      momentum=0.9,
                                                      nesterov=1,
                                                      policy="step",
                                                      stepsize=stepsz,
                                                      gamma=0.1)
        return opt

    if args.train_data == "null":

        def add_image_input(model):
            AddNullInput(
                model,
                None,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
            )
    else:
        reader = train_model.CreateDB(
            "reader",
            db=args.train_data,
            db_type=args.db_type,
            num_shards=num_shards,
            shard_id=shard_id,
        )

        def add_image_input(model):
            AddImageInput(
                model,
                reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=False,
            )

    def add_post_sync_ops(model):
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob, param_info.blob_copy[core.DataType.FLOAT])

    data_parallel_model.Parallelize(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_resnet152_model_ops,
        optimizer_builder_fun=add_optimizer,
        post_sync_builder_fun=add_post_sync_ops,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=False,
        cpu_device=args.use_cpu,
        shared_model=args.use_cpu,
    )

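    # Optionally shift the upper half of the activations onto a second set of GPU ids
    # (model parallelism on top of the data-parallel setup).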
    if args.model_parallel:
        activations = data_parallel_model_utils.GetActivationBlobs(train_model)
        data_parallel_model_utils.ShiftActivationDevices(
            train_model,
            activations=activations[len(activations) // 2:],
            shifts={g: args.num_gpus + g
                    for g in range(args.num_gpus)},
        )

    data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    test_model = None
    if (args.test_data is not None):
        log.info("----- Create test net ----")
        test_arg_scope = {
            'order': "NCHW",
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
        }
        test_model = model_helper.ModelHelper(name="resnet152_test",
                                              arg_scope=test_arg_scope,
                                              init_params=False)

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=True,
            )

        data_parallel_model.Parallelize(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_resnet152_model_ops,
            post_sync_builder_fun=add_post_sync_ops,
            param_update_builder_fun=None,
            devices=gpus,
            cpu_device=args.use_cpu,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    epoch = 0
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model)
        data_parallel_model.FinalizeAfterCheckpoint(train_model)
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "resnet152_gpu%d_b%d_L%d_lr%.2f_v2" % (
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )
    explog = experiment_util.ModelTrainerLog(expname, args)

    while epoch < args.num_epochs:
        epoch = RunEpoch(args, epoch, train_model, test_model,
                         total_batch_size, num_shards, expname, explog)
    # final save
    SaveModel(workspace, train_model)
Example #18
 def testEnforce(self):
     op = core.CreateOperator("Relu", ["X"], ["Y"])
     with self.assertRaises(RuntimeError):
         workspace.RunOperatorOnce(op)
Example #19
    def test_sum_reduce(self, gc, dc):
        # Set broadcast and no axis, i.e. broadcasting last dimensions.
        X = np.random.rand(2, 3, 4, 5).astype(np.float32)
        Y = np.random.rand(4, 5).astype(np.float32)
        op = core.CreateOperator("SumReduceLike", ["X", "Y"],
                                 "out",
                                 broadcast=1)
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("Y", Y)
        workspace.RunOperatorOnce(op)
        out = workspace.FetchBlob("out")
        res = np.sum(X, axis=0)
        res = np.sum(res, axis=0)
        np.testing.assert_array_almost_equal(out, res)
        self.assertDeviceChecks(dc, op, [X, Y], [0])

        # Set broadcast with axis=0, i.e. broadcasting the first dimensions.
        X = np.random.rand(2, 3, 4, 5).astype(np.float32)
        Y = np.random.rand(2, 3).astype(np.float32)
        op = core.CreateOperator("SumReduceLike", ["X", "Y"],
                                 "out",
                                 broadcast=1,
                                 axis=0)
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("Y", Y)
        workspace.RunOperatorOnce(op)
        out = workspace.FetchBlob("out")
        res = np.sum(X, axis=3)
        res = np.sum(res, axis=2)
        np.testing.assert_array_almost_equal(out, res, decimal=3)
        self.assertDeviceChecks(dc, op, [X, Y], [0])

        # broadcasting intermediate dimensions
        X = np.random.rand(2, 3, 4, 5).astype(np.float32)
        Y = np.random.rand(3, 4).astype(np.float32)
        op = core.CreateOperator("SumReduceLike", ["X", "Y"],
                                 "out",
                                 broadcast=1,
                                 axis=1)
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("Y", Y)
        workspace.RunOperatorOnce(op)
        out = workspace.FetchBlob("out")
        res = np.sum(X, axis=0)
        res = np.sum(res, axis=2)
        np.testing.assert_array_almost_equal(out, res)
        self.assertDeviceChecks(dc, op, [X, Y], [0])

        # broadcasting against a scalar, reducing all dimensions
        X = np.random.rand(2, 3, 4, 500).astype(np.float64)
        Y = np.random.rand(1).astype(np.float64)
        op = core.CreateOperator("SumReduceLike", ["X", "Y"],
                                 "out",
                                 broadcast=1)
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("Y", Y)
        workspace.RunOperatorOnce(op)
        out = workspace.FetchBlob("out")
        res = np.array(np.sum(X))
        np.testing.assert_array_almost_equal(out, res, decimal=0)
        self.assertDeviceChecks(dc, op, [X, Y], [0])
Example #20
    def test_lambda_rank_loss(self, n, k, m):
        y = np.random.rand(n * m).astype(np.float32)
        r = np.random.randint(k, size=n * m).astype(np.float32)
        # m sessions of length n
        session_lengths = np.repeat(n, m).astype(np.int32)
        ref_loss = np.empty(0)
        ref_ndcg_loss = np.empty(0)
        ref_ndcg_loss_no_exp = np.empty(0)
        ref_dcg_loss = np.empty(0)
        ref_dcg_loss_no_exp = np.empty(0)
        ref_dy = np.empty(0)
        ref_dy_no_exp = np.empty(0)
        ref_dcg_dy = np.empty(0)
        ref_dcg_dy_no_exp = np.empty(0)
        for i in range(m):
            r_loss, r_dy = self.ref_lambda_rank_loss(y[(i) * n:(i + 1) * n],
                                                     r[(i) * n:(i + 1) * n],
                                                     False, True, False)
            r_ndcg_loss, _ = self.ref_lambda_rank_loss(y[(i) * n:(i + 1) * n],
                                                       r[(i) * n:(i + 1) * n],
                                                       True, True, True)
            r_ndcg_loss_no_exp, r_dy_no_exp = self.ref_lambda_rank_loss(
                y[(i) * n:(i + 1) * n], r[(i) * n:(i + 1) * n], True, True,
                False)
            r_dcg_loss, r_dcg_dy = self.ref_lambda_rank_loss(
                y[(i) * n:(i + 1) * n], r[(i) * n:(i + 1) * n], True, False,
                True)
            r_dcg_loss_no_exp, r_dcg_dy_no_exp = self.ref_lambda_rank_loss(
                y[(i) * n:(i + 1) * n], r[(i) * n:(i + 1) * n], True, False,
                False)
            ref_loss = np.append(ref_loss, r_loss)
            ref_dy = np.append(ref_dy, r_dy)
            ref_ndcg_loss = np.append(ref_ndcg_loss, r_ndcg_loss)

            ref_ndcg_loss_no_exp = np.append(ref_ndcg_loss_no_exp,
                                             r_ndcg_loss_no_exp)
            ref_dy_no_exp = np.append(ref_dy_no_exp, r_dy_no_exp)

            ref_dcg_loss = np.append(ref_dcg_loss, r_dcg_loss)
            ref_dcg_dy = np.append(ref_dcg_dy, r_dcg_dy)

            ref_dcg_loss_no_exp = np.append(ref_dcg_loss_no_exp,
                                            r_dcg_loss_no_exp)
            ref_dcg_dy_no_exp = np.append(ref_dcg_dy_no_exp, r_dcg_dy_no_exp)

        dloss = np.random.random(m).astype(np.float32)

        workspace.blobs["y"] = y
        workspace.blobs["r"] = r
        workspace.blobs["session_lengths"] = session_lengths
        workspace.blobs["dloss"] = dloss

        op = core.CreateOperator(
            "LambdaRankNdcg",
            ["y", "r", "session_lengths"],
            ["loss", "dy"],
            use_ndcg_as_loss=False,
            use_idcg_normalization=True,
            use_exp_gain=False,
        )
        workspace.RunOperatorOnce(op)
        loss = workspace.blobs["loss"]
        dy = workspace.blobs["dy"]
        np.testing.assert_allclose(loss, ref_loss, rtol=1e-5, atol=1e-6)
        np.testing.assert_allclose(dy, ref_dy, rtol=1e-5, atol=1e-6)

        op = core.CreateOperator(
            "LambdaRankNdcg",
            ["y", "r", "session_lengths"],
            ["loss", "dy"],
            use_ndcg_as_loss=True,
            use_idcg_normalization=True,
            use_exp_gain=True,
        )
        workspace.RunOperatorOnce(op)
        loss = workspace.blobs["loss"]
        dy = workspace.blobs["dy"]
        np.testing.assert_allclose(loss, ref_ndcg_loss, rtol=1e-5, atol=1e-6)
        np.testing.assert_allclose(dy, ref_dy, rtol=1e-5, atol=1e-6)

        op = core.CreateOperator(
            "LambdaRankNdcgGradient",
            ["y", "session_lengths", "dy", "dloss"],
            ["dy_back"],
        )
        workspace.RunOperatorOnce(op)
        dy_back = workspace.blobs["dy_back"]
        for i in range(m):
            np.testing.assert_allclose(
                dy_back[i * n:(i + 1) * n],
                dloss[i] * ref_dy[i * n:(i + 1) * n],
                rtol=1e-5,
                atol=1e-6,
            )

        op = core.CreateOperator(
            "LambdaRankNdcg",
            ["y", "r", "session_lengths"],
            ["loss", "dy"],
            use_ndcg_as_loss=True,
            use_idcg_normalization=True,
            use_exp_gain=False,
        )
        workspace.RunOperatorOnce(op)
        loss = workspace.blobs["loss"]
        dy = workspace.blobs["dy"]
        np.testing.assert_allclose(loss,
                                   ref_ndcg_loss_no_exp,
                                   rtol=1e-5,
                                   atol=1e-6)
        np.testing.assert_allclose(dy, ref_dy_no_exp, rtol=1e-5, atol=1e-6)

        op = core.CreateOperator(
            "LambdaRankNdcgGradient",
            ["y", "session_lengths", "dy", "dloss"],
            ["dy_back"],
        )
        workspace.RunOperatorOnce(op)
        dy_back = workspace.blobs["dy_back"]
        for i in range(m):
            np.testing.assert_allclose(
                dy_back[i * n:(i + 1) * n],
                dloss[i] * ref_dy_no_exp[i * n:(i + 1) * n],
                rtol=1e-5,
                atol=1e-6,
            )

        op = core.CreateOperator(
            "LambdaRankNdcg",
            ["y", "r", "session_lengths"],
            ["loss", "dy"],
            use_ndcg_as_loss=True,
            use_idcg_normalization=False,
            use_exp_gain=True,
        )
        workspace.RunOperatorOnce(op)
        loss = workspace.blobs["loss"]
        dy = workspace.blobs["dy"]
        np.testing.assert_allclose(loss, ref_dcg_loss, rtol=1e-5, atol=1e-6)
        np.testing.assert_allclose(dy, ref_dcg_dy, rtol=1e-5, atol=1e-6)

        op = core.CreateOperator(
            "LambdaRankNdcgGradient",
            ["y", "session_lengths", "dy", "dloss"],
            ["dy_back"],
        )
        workspace.RunOperatorOnce(op)
        dy_back = workspace.blobs["dy_back"]
        for i in range(m):
            np.testing.assert_allclose(
                dy_back[i * n:(i + 1) * n],
                dloss[i] * ref_dcg_dy[i * n:(i + 1) * n],
                rtol=1e-5,
                atol=1e-6,
            )

        op = core.CreateOperator(
            "LambdaRankNdcg",
            ["y", "r", "session_lengths"],
            ["loss", "dy"],
            use_ndcg_as_loss=True,
            use_idcg_normalization=False,
            use_exp_gain=False,
        )
        workspace.RunOperatorOnce(op)
        loss = workspace.blobs["loss"]
        dy = workspace.blobs["dy"]
        np.testing.assert_allclose(loss,
                                   ref_dcg_loss_no_exp,
                                   rtol=1e-5,
                                   atol=1e-6)
        np.testing.assert_allclose(dy, ref_dcg_dy_no_exp, rtol=1e-5, atol=1e-6)

        op = core.CreateOperator(
            "LambdaRankNdcgGradient",
            ["y", "session_lengths", "dy", "dloss"],
            ["dy_back"],
        )
        workspace.RunOperatorOnce(op)
        dy_back = workspace.blobs["dy_back"]
        for i in range(m):
            np.testing.assert_allclose(
                dy_back[i * n:(i + 1) * n],
                dloss[i] * ref_dcg_dy_no_exp[i * n:(i + 1) * n],
                rtol=1e-5,
                atol=1e-6,
            )
Example #21
    def test_int8_elementwise_sum(self,
                                 size,
                                 input_channels,
                                 batch_size,
                                 inputs,
                                 inplace,
                                 gc,
                                 dc):
        sum_fp32 = core.CreateOperator(
            "Sum",
            ["X_{}".format(i) for i in range(inputs)],
            ["X_0" if inplace else "Y"],
        )
        Xs = [np.random.rand(batch_size, input_channels, size, size).astype(
            np.float32) for _ in range(inputs)]

        old_ws_name = workspace.CurrentWorkspace()
        workspace.SwitchWorkspace("_device_check_", True)

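        # Pick per-input uint8 quantization parameters: the full 0..255 range for
        # non-negative tensors, otherwise zero_point 128 with a half-range scale.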
        Xi_scales = []
        Xi_zero_points = []
        for i, X in enumerate(Xs):
            workspace.FeedBlob("X_{}".format(i), X, dc[0])
            if X.min() >= 0:
                Xi_scales.append(np.absolute(X).max() / 0xFF)
                Xi_zero_points.append(0)
            else:
                Xi_scales.append(np.absolute(X).max() / 0x7F)
                Xi_zero_points.append(128)

        workspace.RunOperatorOnce(sum_fp32)
        Y = workspace.FetchBlob("X_0" if inplace else "Y")

        if Y.min() >= 0:
            Y_scale = np.absolute(Y).max() / 0xFF
            Y_zero_point = 0
        else:
            Y_scale = np.absolute(Y).max() / 0x7F
            Y_zero_point = 128

        workspace.ResetWorkspace()

        net = caffe2_pb2.NetDef()
        for i, Xi in enumerate(Xs):
            workspace.FeedBlob("Xi_{}".format(i), Xi, dc[1])
            sw2nhwc = core.CreateOperator(
                "NCHW2NHWC",
                ["Xi_{}".format(i)],
                ["Xi_{}_nhwc".format(i)],
                device_option=dc[1]
            )
            quantize = core.CreateOperator(
                "Int8Quantize",
                ["Xi_{}_nhwc".format(i)],
                ["Xi_{}_quantized".format(i)],
                engine="DNNLOWP",
                device_option=dc[1],
                Y_zero_point=Xi_zero_points[i],
                Y_scale=Xi_scales[i],
            )
            net.op.extend([sw2nhwc, quantize])

        sum = core.CreateOperator(
            "Int8Sum",
            ["Xi_{}_quantized".format(i) for i in range(inputs)],
            ["Xi_0_quantized" if inplace else "Y_quantized"],
            engine="DNNLOWP",
            device_option=dc[1],
            Y_zero_point=Y_zero_point,
            Y_scale=Y_scale,
        )

        dequantize = core.CreateOperator(
            "Int8Dequantize",
            ["Xi_0_quantized" if inplace else "Y_quantized"],
            ["Y_nhwc"],
            engine="DNNLOWP",
            device_option=dc[1],
        )

        sw2nchw = core.CreateOperator(
            "NHWC2NCHW",
            ["Y_nhwc"],
            ["Y_out"],
            device_option=dc[1]
        )

        net.op.extend([sum, dequantize, sw2nchw])
        workspace.RunNetOnce(net)
        Y_out = workspace.FetchBlob("Y_out")

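        # Compare the dequantized int8 result against the fp32 reference; a small MSE
        # from quantization error is expected.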
        MSE = np.square(np.subtract(Y, Y_out)).mean()
        if MSE > 0.005:
            print(Y.flatten())
            print(Y_out.flatten())
            print(np.max(np.abs(Y_out - Y)))
            print("MSE", MSE)
            self.assertTrue(False)

        workspace.SwitchWorkspace(old_ws_name)
Example #22
def Train(args):
    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Round down epoch size to closest multiple of batch size across machines
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)
    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create CNNModelHelper object
    train_model = cnn.CNNModelHelper(
        order="NCHW",
        name="resnet50",
        use_cudnn=True,
        cudnn_exhaustive_search=True,
        ws_nbytes_limit=(args.cudnn_workspace_limit_mb * 1024 * 1024),
    )

    num_shards = args.num_shards
    shard_id = args.shard_id
    if num_shards > 1:
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate",
                    [],
                    [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                ))
        else:
            # Use filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate",
                    [],
                    [store_handler],
                    path=args.file_store_path,
                ))
        rendezvous = dict(kv_handler=store_handler,
                          shard_id=shard_id,
                          num_shards=num_shards,
                          engine="GLOO",
                          exit_nets=None)
    else:
        rendezvous = None

    # Model building functions
    def create_resnet50_model_ops(model, loss_scale):
        [softmax, loss] = resnet.create_resnet50(
            model,
            "data",
            num_input_channels=args.num_channels,
            num_labels=args.num_labels,
            label="label",
            no_bias=True,
        )
        loss = model.Scale(loss, scale=loss_scale)
        model.Accuracy([softmax, "label"], "accuracy")
        return [loss]

    # SGD
    def add_parameter_update_ops(model):
        model.AddWeightDecay(args.weight_decay)
        ITER = model.Iter("ITER")
        stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)
        LR = model.net.LearningRate(
            [ITER],
            "LR",
            base_lr=float(job.get_parameter('base_learning_rate')),
            policy="step",
            stepsize=stepsz,
            gamma=0.1,
        )
        AddMomentumParameterUpdate(model, LR)

    # Input. Note that the reader must be shared with all GPUs.
    reader = train_model.CreateDB(
        "reader",
        db=args.train_data,
        db_type=args.db_type,
        num_shards=num_shards,
        shard_id=shard_id,
    )

    def add_image_input(model):
        AddImageInput(
            model,
            reader,
            batch_size=batch_per_device,
            img_size=args.image_size,
        )

    # Create parallelized model
    data_parallel_model.Parallelize_GPU(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_resnet50_model_ops,
        param_update_builder_fun=add_parameter_update_ops,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=True,
    )

    # Add test model, if specified
    test_model = None
    if (args.test_data is not None):
        log.info("----- Create test net ----")
        test_model = cnn.CNNModelHelper(order="NCHW",
                                        name="resnet50_test",
                                        use_cudnn=True,
                                        cudnn_exhaustive_search=True)

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
            )

        data_parallel_model.Parallelize_GPU(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_resnet50_model_ops,
            param_update_builder_fun=None,
            devices=gpus,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    expname = "resnet50_gpu%d_b%d_L%d_lr%.2f_v2" % (
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )
    explog = experiment_util.ModelTrainerLog(expname, args)

    # Run the training one epoch at a time
    epoch = 0
    while epoch < args.num_epochs:
        epoch, test_accuracy = RunEpoch(args, epoch, train_model, test_model,
                                        total_batch_size, num_shards, expname,
                                        explog)

        # send metric
        accuracy_channel.send(epoch, test_accuracy)
        job.progress(epoch, total=args.num_epochs)
Example #23
    def test_int8_relu(self, size, input_channels, batch_size, inplace, gc, dc):
        relu_fp32 = core.CreateOperator(
            "Relu",
            ["X"],
            ["Y"] if not inplace else ["X"],
            device_option=dc[0]
        )

        X = np.random.rand(
            batch_size, input_channels, size, size).astype(np.float32) - 0.5
        # Move values away from zero to avoid numerical issues at the Relu kink
        X += 0.02 * np.sign(X)
        X[X == 0.0] += 0.02

        if X.min() >= 0:
            scale = np.absolute(X).max() / 0xFF
            zero_point = 0
        else:
            scale = np.absolute(X).max() / 0x7F
            zero_point = 128

        old_ws_name = workspace.CurrentWorkspace()
        workspace.SwitchWorkspace("_device_check_", True)

        workspace.FeedBlob("X", X, dc[0])
        workspace.RunOperatorOnce(relu_fp32)
        Y = workspace.FetchBlob("X" if inplace else "Y")

        workspace.ResetWorkspace()

        sw2nhwc = core.CreateOperator(
            "NCHW2NHWC",
            ["Xi"],
            ["Xi_nhwc"],
            device_option=dc[1]
        )

        quantize = core.CreateOperator(
            "Int8Quantize",
            ["Xi_nhwc"],
            ["Xi_quantized"],
            engine="DNNLOWP",
            device_option=dc[1],
            Y_zero_point=zero_point,
            Y_scale=scale,
        )

        relu = core.CreateOperator(
            "Int8Relu",
            ["Xi_quantized"],
            ["Y_quantized"] if not inplace else ["Xi_quantized"],
            engine="DNNLOWP",
            device_option=dc[1],
        )

        dequantize = core.CreateOperator(
            "Int8Dequantize",
            ["Y_quantized"] if not inplace else ["Xi_quantized"],
            ["Y_nhwc"],
            engine="DNNLOWP",
            device_option=dc[1],
        )

        sw2nchw = core.CreateOperator(
            "NHWC2NCHW",
            ["Y_nhwc"],
            ["Y_out"],
            device_option=dc[1]
        )

        net = caffe2_pb2.NetDef()
        net.op.extend([sw2nhwc, quantize, relu, dequantize, sw2nchw])

        workspace.FeedBlob("Xi", X, dc[1])
        workspace.RunNetOnce(net)
        Y_out = workspace.FetchBlob("Y_out")

        MSE = np.square(np.subtract(Y, Y_out)).mean()
        if MSE > 0.005:
            print(Y.flatten())
            print(Y_out.flatten())
            print(np.max(np.abs(Y_out - Y)))
            print("MSE", MSE)
            self.assertTrue(False)

        workspace.SwitchWorkspace(old_ws_name)
Example #24
def Train(args):
    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Round down epoch size to closest multiple of batch size across machines
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)
    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create ModelHelper object
    train_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustive_search': True,
        'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
    }
    train_model = model_helper.ModelHelper(name="resnet50",
                                           arg_scope=train_arg_scope)

    num_shards = args.num_shards
    shard_id = args.shard_id
    if num_shards > 1:
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate",
                    [],
                    [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                ))
        else:
            # Use filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate",
                    [],
                    [store_handler],
                    path=args.file_store_path,
                ))
        rendezvous = dict(kv_handler=store_handler,
                          shard_id=shard_id,
                          num_shards=num_shards,
                          engine="GLOO",
                          exit_nets=None)
    else:
        rendezvous = None

    # Model building functions
    def create_resnet50_model_ops(model, loss_scale):
        [softmax, loss] = resnet.create_resnet50(
            model,
            "data",
            num_input_channels=args.num_channels,
            num_labels=args.num_labels,
            label="label",
            no_bias=True,
        )
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy")
        return [loss]

    def add_optimizer(model):
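        # stepsz is the number of iterations in 30 epochs for this shard, so
        # the "step" policy decays the learning rate 10x every 30 epochs.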
        stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)
        optimizer.add_weight_decay(model, args.weight_decay)
        optimizer.build_sgd(model,
                            args.base_learning_rate,
                            momentum=0.9,
                            nesterov=1,
                            policy="step",
                            stepsize=stepsz,
                            gamma=0.1)

    # Input. Note that the reader must be shared with all GPUs.
    reader = train_model.CreateDB(
        "reader",
        db=args.train_data,
        db_type=args.db_type,
        num_shards=num_shards,
        shard_id=shard_id,
    )

    def add_image_input(model):
        AddImageInput(
            model,
            reader,
            batch_size=batch_per_device,
            img_size=args.image_size,
        )

    # Create parallelized model
    data_parallel_model.Parallelize_GPU(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_resnet50_model_ops,
        optimizer_builder_fun=add_optimizer,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=True,
    )

    # Add test model, if specified
    test_model = None
    if args.test_data is not None:
        log.info("----- Create test net -----")
        test_arg_scope = {
            'order': "NCHW",
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
        }
        test_model = model_helper.ModelHelper(name="resnet50_test",
                                              arg_scope=test_arg_scope)

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
            )

        data_parallel_model.Parallelize_GPU(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_resnet50_model_ops,
            param_update_builder_fun=None,
            devices=gpus,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model)

        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        # reset epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "resnet50_gpu%d_b%d_L%d_lr%.2f_v2" % (
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )
    explog = experiment_util.ModelTrainerLog(expname, args)

    # Run the training one epoch a time
    while epoch < args.num_epochs:
        epoch = RunEpoch(args, epoch, train_model, test_model,
                         total_batch_size, num_shards, expname, explog)

        # Save the model for each epoch
        SaveModel(args, train_model, epoch)

        model_path = "%s/%s_" % (args.file_store_path, args.save_model_name)
        # remove the saved model from the previous epoch if it exists
        if os.path.isfile(model_path + str(epoch - 1) + ".mdl"):
            os.remove(model_path + str(epoch - 1) + ".mdl")
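
For completeness, the following is a minimal sketch (not from the original source; flag defaults are illustrative) of an argparse setup that supplies every attribute the Train(args) above reads:

import argparse

parser = argparse.ArgumentParser("Caffe2 ResNet-50 trainer")
parser.add_argument("--train_data", required=True, help="path to training DB")
parser.add_argument("--test_data", default=None)
parser.add_argument("--db_type", default="lmdb")
parser.add_argument("--gpus", default=None, help="e.g. '0,1,2,3'")
parser.add_argument("--num_gpus", type=int, default=1)
parser.add_argument("--batch_size", type=int, default=32,
                    help="total batch size across all GPUs")
parser.add_argument("--epoch_size", type=int, default=1500000)
parser.add_argument("--num_epochs", type=int, default=100)
parser.add_argument("--num_shards", type=int, default=1)
parser.add_argument("--shard_id", type=int, default=0)
parser.add_argument("--redis_host", default=None)
parser.add_argument("--redis_port", type=int, default=6379)
parser.add_argument("--run_id", default=None)
parser.add_argument("--file_store_path", default="/tmp")
parser.add_argument("--image_size", type=int, default=224)
parser.add_argument("--num_channels", type=int, default=3)
parser.add_argument("--num_labels", type=int, default=1000)
parser.add_argument("--base_learning_rate", type=float, default=0.1)
parser.add_argument("--weight_decay", type=float, default=1e-4)
parser.add_argument("--cudnn_workspace_limit_mb", type=int, default=64)
parser.add_argument("--load_model_path", default=None)
parser.add_argument("--save_model_name", default="resnet50_model")

Train(parser.parse_args())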
Example #25
    def test_fc_num0(self, seed, m, k, n, use_packed):
        """ Test numerics, fix a dimension and determine the ranges of error.
            Use Fp16FCAcc16 as a reference.
        """
        W = "W_packed" if use_packed else "W0"
        dtype = np.float32
        pred_net = caffe2_pb2.NetDef()
        pred_net.name = "pred"
        pred_net.external_input.extend(["X", W, "b0"])
        pred_net.external_output.append("Y")
        pred_net.op.add().CopyFrom(
            core.CreateOperator(
                "FbFCPacked" if use_packed else "FC",
                ["X", W, "b0"],
                ["Y"],
            )
        )
        pred_net_ref = caffe2_pb2.NetDef()
        pred_net_ref.name = "pred"
        pred_net_ref.external_input.extend(["X", W, "b0"])
        pred_net_ref.external_output.append("Y")
        pred_net_ref.op.add().CopyFrom(
            core.CreateOperator(
                "Fp16FCAcc32NNPI",
                ["X", W, "b0"],
                ["Y"],
            )
        )

        workspace.SwitchWorkspace("glow_test_ws", True)
        workspace.ResetWorkspace()
        W0 = 10 * (np.random.rand(n, k) - 0.5).astype(np.float16).astype(np.float32)
        b0 = 1 * (np.random.rand(n) - 0.5).astype(np.float16).astype(np.float32)

        workspace.FeedBlob("W0", W0)
        workspace.FeedBlob("b0", b0)
        workspace.RunOperatorOnce(
            core.CreateOperator(
                "FbGemmPack",
                ['W0'],
                ['W_packed'],
                no_packing=True,
            )
        )

        pred_net_onnxified = onnxifi_caffe2_net(pred_net,
                                                {"X": (m, k)},
                                                debug=True,
                                                adjust_batch=False,
                                                use_onnx=False)
        num_onnxified_ops = sum(
            1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
        np.testing.assert_equal(num_onnxified_ops, 1)

        X0 = np.random.rand(m, k).astype(dtype) - 0.5
        workspace.FeedBlob("X", X0)
        workspace.CreateNet(pred_net_onnxified)
        workspace.CreateNet(pred_net_ref)

        workspace.RunNet(pred_net_onnxified.name)
        Y_glow = workspace.FetchBlob('Y')

        # Run caffe2 net
        workspace.RunNet(pred_net_ref.name)
        Y_c2 = workspace.FetchBlob('Y')

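        # Element-wise relative error against the reference net; the 1e-8
        # epsilon guards against division by zero.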
        diff = np.abs((Y_c2 - Y_glow) / (Y_c2 + 1e-8))
        rowdiff = np.max(diff, axis=1)

        n_offenders = np.count_nonzero(rowdiff > GLOW_MATMUL_RTOL)
        if n_offenders > 0:
            print_test_debug_info("fc", {
                "seed": seed,
                "use_packed": use_packed,
                "m": m,
                "k": k,
                "n": n,
                "X": X0.shape,
                "W0": W0.shape,
                "b0": b0.shape,
                "Y_glow": Y_glow,
                "Y_c2": Y_c2,
                "diff": diff,
                "rowdiff": rowdiff})
            assert 0
Example #26
    def test_exception(self):
        op = CreatePythonOperator(MainOpFunctionThatThrowsRuntimeError, [], [])
        with self.assertRaises(RuntimeError):
            workspace.RunOperatorOnce(op)
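
The callback referenced above is defined elsewhere in the test module; a minimal sketch, assuming the standard (inputs, outputs) signature that CreatePythonOperator expects, would be:

def MainOpFunctionThatThrowsRuntimeError(inputs, outputs):
    # Raising here surfaces through workspace.RunOperatorOnce as a Python
    # RuntimeError, which the test asserts on.
    raise RuntimeError("exception raised from inside a Python op")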
Example #27
    def _lengths_ref(X, Y):
        ref_op = core.CreateOperator(ref_op_name, ["X", "Y"], ["out"])
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("Y", Y)
        workspace.RunOperatorOnce(ref_op)
        return workspace.FetchBlob("out")
Example #28
def Train(args):
    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Round down epoch size to closest multiple of batch size across machines
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)

    assert \
        epoch_iters > 0, \
        "Epoch size must be larger than batch size times shard count"

    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create ModelHelper object
    train_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustive_search': True,
        'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
    }
    train_model = model_helper.ModelHelper(name="ban-pc-resnet50",
                                           arg_scope=train_arg_scope)

    num_shards = args.num_shards
    shard_id = args.shard_id

    # Expect interfaces to be comma separated.
    # Use of multiple network interfaces is not yet complete,
    # so simply use the first one in the list.
    interfaces = args.distributed_interfaces.split(",")

    # Rendezvous using MPI when run with mpirun
    if os.getenv("OMPI_COMM_WORLD_SIZE") is not None:
        num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1))
        shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0))
        if num_shards > 1:
            rendezvous = dict(kv_handler=None,
                              num_shards=num_shards,
                              shard_id=shard_id,
                              engine="GLOO",
                              transport=args.distributed_transport,
                              interface=interfaces[0],
                              mpi_rendezvous=True,
                              exit_nets=None)
        else:
            # No rendezvous is needed when MPI launches a single shard.
            rendezvous = None

    elif num_shards > 1:
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate",
                    [],
                    [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                ))
        else:
            # Use filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate",
                    [],
                    [store_handler],
                    path=args.file_store_path,
                    prefix=args.run_id,
                ))

        rendezvous = dict(kv_handler=store_handler,
                          shard_id=shard_id,
                          num_shards=num_shards,
                          engine="GLOO",
                          transport=args.distributed_transport,
                          interface=interfaces[0],
                          exit_nets=None)

    else:
        rendezvous = None

    # Model configs for constructing model
    with open(args.model_config) as f:
        model_config = yaml.safe_load(f)

    # Model building functions
    def create_target_model_ops(model, loss_scale):
        initializer = (PseudoFP16Initializer
                       if args.dtype == 'float16' else Initializer)
        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=args.enable_tensor_core,
                            float16_compute=args.float16_compute):
            pred = add_se_model(model, model_config, "data", is_test=False)

        if args.dtype == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        loss = add_pc_loss(model, model_config, pred, 'label')
        brew.accuracy(model, ['softmax', 'label'], 'accuracy')
        return [loss]

    def add_optimizer(model):
        optimizer.add_weight_decay(model, args.weight_decay)
        opt = optimizer.build_multi_precision_sgd(
            model,
            base_learning_rate=args.base_learning_rate,
            momentum=model_config['solver']['momentum'],
            nesterov=model_config['solver']['nesterov'],
            policy=model_config['solver']['lr_policy'],
            power=model_config['solver']['power'],
            max_iter=model_config['solver']['max_iter'],
        )
        return opt

    # Define add_image_input function.
    # Depends on the "train_data" argument.
    # Note that the reader is shared between all GPUs.
    reader = train_model.CreateDB(
        "reader",
        db=args.train_data,
        db_type=args.db_type,
        num_shards=num_shards,
        shard_id=shard_id,
    )

    def add_image_input(model):
        AddImageInput(
            model,
            reader,
            batch_size=batch_per_device,
            img_size=args.image_size,
            dtype=args.dtype,
            is_test=False,
        )

    def add_post_sync_ops(model):
        """Add ops applied after initial parameter sync."""
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob, param_info.blob_copy[core.DataType.FLOAT])

    # Create parallelized model
    data_parallel_model.Parallelize(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_target_model_ops,
        optimizer_builder_fun=add_optimizer,
        post_sync_builder_fun=add_post_sync_ops,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=False,
        cpu_device=args.use_cpu,
        shared_model=args.use_cpu,
        combine_spatial_bn=args.use_cpu,
    )

    if args.model_parallel:
        # Shift half of the activations to another GPU
        assert workspace.NumCudaDevices() >= 2 * args.num_gpus
        activations = data_parallel_model_utils.GetActivationBlobs(train_model)
        data_parallel_model_utils.ShiftActivationDevices(
            train_model,
            activations=activations[len(activations) // 2:],
            shifts={g: args.num_gpus + g
                    for g in range(args.num_gpus)},
        )

    data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    # Add test model, if specified
    test_model = None
    if args.test_data is not None:
        log.info("----- Create test net -----")
        test_arg_scope = {
            'order': "NCHW",
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
        }
        test_model = model_helper.ModelHelper(name="ban-pc-resnet50_test",
                                              arg_scope=test_arg_scope,
                                              init_params=False)

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=True,
            )

        data_parallel_model.Parallelize(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_target_model_ops,
            post_sync_builder_fun=add_post_sync_ops,
            param_update_builder_fun=None,
            devices=gpus,
            cpu_device=args.use_cpu,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model)

        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        # reset epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "log/{}/resnet50_gpu{}_b{}_L{}_lr{:.2f}_v2".format(
        args.dataset_name,
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )
    explog = experiment_util.ModelTrainerLog(expname, args)

    # Load pretrained param_init_net
    load_init_net_multigpu(args)

    # Run the training one epoch a time
    best_accuracy = 0
    while epoch < args.num_epochs:
        epoch, best_accuracy = RunEpoch(
            args,
            epoch,
            train_model,
            test_model,
            total_batch_size,
            num_shards,
            expname,
            explog,
            best_accuracy,
        )

        # Save the model for each epoch
        SaveModel(args, train_model, epoch)

        model_path = "%s/%s_" % (args.file_store_path, args.save_model_name)
        # remove the saved model from the previous epoch if it exists
        if os.path.isfile(model_path + str(epoch - 1) + ".mdl"):
            os.remove(model_path + str(epoch - 1) + ".mdl")
Example #29
    def test_fc_with_axis(self, n, m, c, h, w, axis, gc, dc):
        X = np.random.rand(n, c, h, w).astype(np.float32) - 0.5
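        # FC's axis argument: dims before axis form the effective batch (nn);
        # dims from axis onward are flattened into the input size (k).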
        k = reduce((lambda x, y: x * y), [n, c, h, w][axis - 4:])
        nn = reduce((lambda x, y: x * y), [n, c, h, w][:axis])
        W = np.random.rand(m, k).astype(np.float32) - 0.5
        b = np.random.rand(m).astype(np.float32) - 0.5
        dY = np.random.rand(nn, m).astype(np.float32) - 0.5

        op0 = core.CreateOperator('FC', ['X', 'W', 'b'], ["Y"],
                                  axis=axis,
                                  device_option=dc[0])

        op0_bw = core.CreateOperator('FCGradient', ['X', 'W', 'dY'],
                                     ["dW", "db"],
                                     axis=axis,
                                     device_option=dc[0])

        workspace.ResetWorkspace()
        workspace.FeedBlob('X', X, dc[0])
        workspace.FeedBlob('W', W, dc[0])
        workspace.FeedBlob('b', b, dc[0])
        workspace.RunOperatorOnce(op0)
        Y0 = workspace.FetchBlob('Y')

        workspace.FeedBlob('dY', dY, dc[0])
        workspace.RunOperatorOnce(op0_bw)
        dW0 = workspace.FetchBlob('dW')
        db0 = workspace.FetchBlob('db')

        op1 = core.CreateOperator('FC', ['X', 'W', 'b'], ["Y"],
                                  axis=axis,
                                  device_option=dc[1])

        op1_bw = core.CreateOperator('FCGradient', ['X', 'W', 'dY'],
                                     ["dW", "db"],
                                     axis=axis,
                                     device_option=dc[1])

        workspace.SwitchWorkspace("_device_check_", True)
        workspace.FeedBlob('X', X, dc[1])
        workspace.FeedBlob('W', W, dc[1])
        workspace.FeedBlob('b', b, dc[1])
        workspace.RunOperatorOnce(op1)
        Y1 = workspace.FetchBlob('Y')

        workspace.FeedBlob('dY', dY, dc[1])
        workspace.RunOperatorOnce(op1_bw)
        dW1 = workspace.FetchBlob('dW')
        db1 = workspace.FetchBlob('db')

        Y0 = Y0.flatten()
        Y1 = Y1.flatten()
        if not np.allclose(Y0, Y1, atol=0.01, rtol=0.01):
            print(Y1)
            print(Y0)
            print(np.max(np.abs(Y1 - Y0)))
            self.assertTrue(False)

        dW0 = dW0.flatten()
        dW1 = dW1.flatten()
        if not np.allclose(dW0, dW1, atol=0.01, rtol=0.01):
            print(dW1)
            print(dW0)
            print(np.max(np.abs(dW1 - dW0)))
            self.assertTrue(False)

        db0 = db0.flatten()
        db1 = db1.flatten()
        if not np.allclose(db0, db1, atol=0.01, rtol=0.01):
            print(db1)
            print(db0)
            print(np.max(np.abs(db1 - db0)))
            self.assertTrue(False)
Example #30
    def test_merge_multi_map_feature_tensors(self):
        op = core.CreateOperator("MergeMultiMapFeatureTensors", [
            "in1_lengths",
            "in1_keys",
            "in1_values_lengths",
            "in1_values_keys",
            "in1_values_values",
            "in2_lengths",
            "in2_keys",
            "in2_values_lengths",
            "in2_values_keys",
            "in2_values_values",
        ], [
            "out_lengths", "out_keys", "out_values_lengths", "out_values_keys",
            "out_values_values"
        ])
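        # The merged output interleaves the inputs example by example: output
        # example i holds input 1's features for example i, then input 2's.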

        # Input 1.
        workspace.FeedBlob("in1_lengths", np.array([1, 2], dtype=np.int32))
        workspace.FeedBlob("in1_keys", np.array([11, 12, 13], dtype=np.int64))
        workspace.FeedBlob("in1_values_lengths",
                           np.array([2, 2, 2], dtype=np.int32))
        workspace.FeedBlob(
            "in1_values_keys",
            np.array([111, 112, 121, 122, 131, 132], dtype=np.int64))
        workspace.FeedBlob(
            "in1_values_values",
            np.array([11.1, 11.2, 12.1, 12.2, 13.1, 13.2], dtype=np.float64))
        # Input 2.
        workspace.FeedBlob("in2_lengths", np.array([2, 1], dtype=np.int32))
        workspace.FeedBlob("in2_keys", np.array([14, 15, 16], dtype=np.int64))
        workspace.FeedBlob("in2_values_lengths",
                           np.array([2, 2, 2], dtype=np.int32))
        workspace.FeedBlob(
            "in2_values_keys",
            np.array([141, 142, 151, 152, 161, 162], dtype=np.int64))
        workspace.FeedBlob(
            "in2_values_values",
            np.array([14.1, 14.2, 15.1, 15.2, 16.1, 16.2], dtype=np.float64))

        workspace.RunOperatorOnce(op)

        np.testing.assert_array_equal(workspace.FetchBlob("out_lengths"),
                                      np.array([3, 3], dtype=np.int32))
        np.testing.assert_array_equal(
            workspace.FetchBlob("out_keys"),
            np.array([11, 14, 15, 12, 13, 16], dtype=np.int64))
        np.testing.assert_array_equal(
            workspace.FetchBlob("out_values_lengths"),
            np.array([2, 2, 2, 2, 2, 2], dtype=np.int32))
        np.testing.assert_array_equal(
            workspace.FetchBlob("out_values_keys"),
            np.array(
                [111, 112, 141, 142, 151, 152, 121, 122, 131, 132, 161, 162],
                dtype=np.int64))
        np.testing.assert_array_equal(
            workspace.FetchBlob("out_values_values"),
            np.array([
                11.1, 11.2, 14.1, 14.2, 15.1, 15.2, 12.1, 12.2, 13.1, 13.2,
                16.1, 16.2
            ],
                     dtype=np.float64))