def test_auto_parallel_arithmetic_broadcast_both():
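    """Broadcast-on-both-operands case: MatMul yields a [64, 1] output that
    FloorDiv combines with a [1, 64] input, so both operands must be
    broadcast; the searched strategies for both operators are expected to be
    the data-parallel [[8, 1], [1, 1]]."""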
    class Net(nn.Cell):
        def __init__(self):
            super().__init__()
            self.matmul = P.MatMul()
            self.floordiv = P.FloorDiv()

        def construct(self, x, y, b):
            out = self.matmul(x, y)
            out = self.floordiv(out, b)
            return out

    context.set_auto_parallel_context(device_num=8, global_rank=0)
    net = NetWithLoss(Net())
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    reset_op_id()

    x = Tensor(np.ones([64, 32]), dtype=ms.float32)
    y = Tensor(np.ones([32, 1]), dtype=ms.float32)
    b = Tensor(np.ones([1, 64]), dtype=ms.float32)
    compile_net(net, x, y, b, phase='train')
    strategies = _executor._get_shard_strategy(net)
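    # A strategy maps each operator to one shard vector per input; e.g.
    # [[8, 1], [1, 1]] slices the first input 8 ways along dim 0 and
    # leaves the second input unsplit.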
    for (k, v) in strategies.items():
        if re.search('FloorDiv-op', k) is not None:
            assert v == [[8, 1], [1, 1]]
        elif re.search('MatMul-op', k) is not None:
            assert v == [[8, 1], [1, 1]]
Example #2
def test_train_4k_8p_gpu(batch_size=32, num_classes=4096):
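    """Train ResNet-50 with a 4096-way classifier on 8 simulated GPU devices
    in AUTO_PARALLEL mode, then check the searched strategies for Conv2D,
    MatMul and ReduceSum."""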
    dev_num = 8
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL, device_num=dev_num)
    set_algo_parameters(elementwise_op_strategy_follow=True)
    reset_op_id()
    np.random.seed(6)
    input_np = np.ones([batch_size, 3, 224, 224]).astype(np.float32)
    label_np = np.zeros([batch_size]).astype(np.int32)
    for i in range(0, batch_size):
        label_np[i] = i % num_classes
    dataset = DatasetLenet(Tensor(input_np), Tensor(label_np), 1)
    net = resnet50(num_classes)
    loss = SoftmaxCrossEntropyExpand(sparse=True)
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), 0.01, 0.9)
    model = Model(net, loss_fn=loss, optimizer=opt)
    model.train(5, dataset, dataset_sink_mode=False)
    strategies = _executor._get_shard_strategy(model._train_network)
    for (k, v) in strategies.items():
        if re.search('Conv2D-op', k) is not None:
            assert v[0][0] == dev_num
        elif re.search('MatMul-op', k) is not None:
            assert v == [[dev_num, 1], [1, 1]]
        elif re.search('ReduceSum-op', k) is not None:
            assert v == [[dev_num, 1]]
def test_auto_parallel_arithmetic_broadcast_both():
    class Net(nn.Cell):
        def __init__(self):
            super().__init__()
            self.matmul = P.MatMul()
            self.floordiv = P.FloorDiv()

        def construct(self, x, y, b):
            out = self.matmul(x, y)
            out = self.floordiv(out, b)
            return out

    context.set_auto_parallel_context(device_num=8, global_rank=0)
    net = NetWithLoss(Net())
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    reset_op_id()

    x = Tensor(np.ones([64, 32]), dtype=ms.float32)
    y = Tensor(np.ones([32, 1]), dtype=ms.float32)
    b = Tensor(np.ones([1, 64]), dtype=ms.float32)
    compile_net(net, x, y, b, phase='train')
    strategies = _executor._get_shard_strategy(net)
    expected_strategies = {
        'Default/network-Net/FloorDiv-op1': [[8, 1], [1, 1]],
        'Default/network-Net/MatMul-op0': [[8, 1], [1, 1]]
    }
    assert strategies == expected_strategies
Example #4
def test_two_matmul_transpose():
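    """Two chained MatMuls followed by two Transposes on 16 devices; the
    searched strategies are compared against an exact expected dict."""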
    class Net(nn.Cell):
        def __init__(self):
            super().__init__()
            self.matmul1 = P.MatMul()
            self.matmul2 = P.MatMul()
            self.transpose1 = P.Transpose()
            self.transpose2 = P.Transpose()

        def construct(self, x, y, b):
            out = self.matmul1(x, y)
            out = self.matmul2(out, b)
            out = self.transpose1(out, (1, 0))
            out = self.transpose2(out, (1, 0))
            return out

    size = 16
    context.set_auto_parallel_context(device_num=size, global_rank=0)
    x = Tensor(np.ones([128, 32]), dtype=ms.float32)
    y = Tensor(np.ones([32, 64]), dtype=ms.float32)
    b = Tensor(np.ones([64, 64]), dtype=ms.float32)

    net = NetWithLoss(Net())
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    net.set_auto_parallel()
    reset_op_id()

    net.set_train()
    _executor.compile(net, x, y, b, phase='train')
    strategies = _executor._get_shard_strategy(net)
    expected_strategies = {'Default/network-Net/Transpose-op4': [[1, 16]],
                           'Default/network-Net/Transpose-op5': [[16, 1]],
                           'Default/network-Net/MatMul-op7': [[16, 1], [1, 1]],
                           'Default/network-Net/MatMul-op6': [[16, 1], [1, 1]]}
    assert strategies == expected_strategies
def test_matmul_prelu():
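    """Elementwise Mul followed by PReLU on 16 devices: the activations are
    expected to be split along the batch dimension while the PReLU weight
    stays replicated ([1])."""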
    class Net(nn.Cell):
        def __init__(self):
            super().__init__()
            self.mul1 = P.Mul()
            self.prelu = P.PReLU()

        def construct(self, x, y, b):
            out = self.mul1(x, y)
            out = self.prelu(out, b)
            return out

    size = 16
    context.set_auto_parallel_context(device_num=size, global_rank=0)
    x = Tensor(np.ones([16, 3, 128, 32]), dtype=ms.float32)
    y = Tensor(np.ones([16, 3, 128, 32]), dtype=ms.float32)
    b = Tensor(np.array([0.01, 0.02, 0.03]), dtype=ms.float32)

    net = NetWithLoss(Net())
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    net.set_auto_parallel()
    reset_op_id()

    net.set_train()
    _executor.compile(net, x, y, b, phase='train')
    strategies = _executor._get_shard_strategy(net)
    for (k, v) in strategies.items():
        if re.search('PReLU-op', k) is not None:
            assert v == [[16, 1, 1, 1], [1]]
        elif re.search('Mul-op', k) is not None:
            assert v == [[16, 1, 1, 1], [16, 1, 1, 1]]
def test_auto_parallel_assign_sub_with_ref_key():
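    """nn.PReLU owns its weight as a Parameter (a ref key), so the searched
    strategy must keep the weight replicated while the input is split along
    its last dimension."""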
    size = 8
    context.set_auto_parallel_context(device_num=size, global_rank=0)

    x = Tensor(np.random.rand(4, 4, 32, 64), dtype=ms.float32)

    net = NetWithLoss(nn.PReLU(4))
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    net.set_auto_parallel()
    reset_op_id()

    _executor.compile(net, x, phase="train")
    strategies = _executor._get_shard_strategy(net)
    for (k, v) in strategies.items():
        if re.search('PReLU-op', k) is not None:
            assert v == [[1, 1, 1, 8], [1]]
        elif re.search('ReLU-op', k) is not None:
            assert v == [[1]]
Example #7
def test_double_subgraphs_train():
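    """Train the double-subgraph network for one epoch on a single device;
    every searched strategy should degenerate to no splitting."""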
    context.set_context(save_graphs=True)
    context.set_auto_parallel_context(device_num=1, global_rank=0)
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    net = TrainStepWarp(NetWithLoss(Net()))

    batch_ids = np.ones([8, 8, 8, 8]).astype(np.int32)
    ds_train = DatasetLenet(Tensor(batch_ids), None)
    model = Model(net)
    model.train(1, ds_train, dataset_sink_mode=False)
    strategies = _executor._get_shard_strategy(net)
    expected_strategies = {'Default/network-NetWithLoss/ReduceMean-op3': [[1, 1, 1, 1]],
                           'Default/network-NetWithLoss/net-Net/ReLU-op4': [[1, 1, 1, 1]],
                           'Default/network-NetWithLoss/net-Net/Mul-op5': [[1, 1, 1, 1], [1, 1, 1, 1]],
                           'Default/network-NetWithLoss/net-Net/Mul-op6': [[1, 1, 1, 1], [1, 1, 1, 1]],
                           'Default/network-NetWithLoss/net-Net/Cast-op1': [[1, 1, 1, 1]],
                           'Default/network-NetWithLoss/ReduceSum-op7': [[1, 1, 1, 1]]}
    assert strategies == expected_strategies
Example #8
def test_double_subgraphs():
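    """Compile the double-subgraph network on 8 devices; each operator is
    expected to be split 8 ways along the first input dimension."""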
    _set_multi_subgraphs()
    context.set_context(save_graphs=True)
    context.set_auto_parallel_context(device_num=8, global_rank=0)
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    net = TrainStepWarp(NetWithLoss(Net()))
    net.set_auto_parallel()

    x = Tensor(np.ones([8, 8, 8, 8]), dtype=ms.float32)
    reset_op_id()
    _executor.compile(net, x, phase='train')
    strategies = _executor._get_shard_strategy(net)
    expected_strategies = {'Default/network-NetWithLoss/ReduceMean-op0': [[8, 1, 1, 1]],
                           'Default/network-NetWithLoss/net-Net/ReLU-op1': [[8, 1, 1, 1]],
                           'Default/network-NetWithLoss/net-Net/Mul-op2': [[8, 1, 1, 1], [8, 1, 1, 1]],
                           'Default/network-NetWithLoss/net-Net/Mul-op3': [[8, 1, 1, 1], [8, 1, 1, 1]],
                           'Default/network-NetWithLoss/ReduceSum-op4': [[8, 1, 1, 1]]}
    assert strategies == expected_strategies
Example #9
def all_to_all_common():
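    """Train an all-to-all net for two epochs under AUTO_PARALLEL on a single
    device and return the searched strategies for inspection."""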
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2

    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL, device_num=1, global_rank=0)
    predict = Tensor(np.ones([32, 128]), dtype=ms.float32)
    label = Tensor(np.ones([32]), dtype=ms.int32)
    dataset = Dataset(predict, label, 2)
    net = all_to_all_net()

    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    opt = Momentum(net.trainable_params(), learning_rate, momentum)
    model = Model(net, loss, opt)

    model.train(epoch_size, dataset, dataset_sink_mode=False)
    strategies = _executor._get_shard_strategy(model._train_network)
    return strategies
Example #10
def test_common_parameter():
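    """weight1 is shared by two MatMul branches (through separate Casts)
    whose outputs feed a third MatMul; with elementwise_op_strategy_follow
    enabled, the Casts must stay unsplit and the MatMuls data-parallel."""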
    class Net(nn.Cell):
        def __init__(self):
            super().__init__()
            self.matmul1 = P.MatMul()
            self.matmul2 = P.MatMul()
            self.matmul3 = P.MatMul()
            self.weight1 = Parameter(Tensor(
                np.ones([64, 64]).astype(np.float16) * 0.01),
                                     "w",
                                     requires_grad=True)
            self.cast1 = P.Cast()
            self.cast2 = P.Cast()

        def construct(self, x, y):
            m1_result = self.matmul1(x, self.cast1(self.weight1,
                                                   mstype.float32))
            m2_result = self.matmul2(y, self.cast2(self.weight1,
                                                   mstype.float32))
            m3_result = self.matmul3(m2_result, m1_result)

            return m3_result

    size = 8
    context.set_auto_parallel_context(device_num=size, global_rank=0)

    set_algo_parameters(elementwise_op_strategy_follow=True)
    x = Tensor(np.ones([64, 64]), dtype=ms.float32)
    y = Tensor(np.ones([64, 64]), dtype=ms.float32)

    net = NetWithLoss(Net())
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    net.set_auto_parallel()
    reset_op_id()

    net.set_train()
    _executor.compile(net, x, y, phase='train')
    strategies = _executor._get_shard_strategy(net)
    for (k, v) in strategies.items():
        if re.search('MatMul-op', k) is not None:
            assert v == [[8, 1], [1, 1]]
        elif re.search('Cast-op', k) is not None:
            assert v == [[1, 1]]
Example #11
def test_double_star_graph():
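    """A double-star graph: two independent MatMuls feed a third one through
    Casts; the searched strategies are compared against an exact dict."""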
    class Net(nn.Cell):
        def __init__(self):
            super().__init__()
            self.matmul1 = P.MatMul()
            self.matmul2 = P.MatMul()
            self.matmul3 = P.MatMul()
            self.cast1 = P.Cast()
            self.cast2 = P.Cast()

        def construct(self, x, y, z, w):
            m1_result = self.matmul1(x, y)
            m2_result = self.matmul2(z, w)
            m3_result = self.matmul3(self.cast1(m2_result, mstype.float16),
                                     self.cast2(m1_result, mstype.float16))

            return m3_result

    size = 8
    context.set_auto_parallel_context(device_num=size, global_rank=0)

    x = Tensor(np.ones([32, 8]), dtype=ms.float32)
    y = Tensor(np.ones([8, 16]), dtype=ms.float32)
    z = Tensor(np.ones([8, 16]), dtype=ms.float32)
    w = Tensor(np.ones([16, 32]), dtype=ms.float32)

    net = NetWithLoss(Net())
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    net.set_auto_parallel()
    reset_op_id()

    net.set_train()
    _executor.compile(net, x, y, z, w, phase='train')
    strategies = _executor._get_shard_strategy(net)
    expected_strategies = {
        'Default/network-Net/Cast-op2': [[8, 1]],
        'Default/network-Net/Cast-op4': [[1, 8]],
        'Default/network-Net/MatMul-op3': [[8, 1], [1, 1]],
        'Default/network-Net/MatMul-op5': [[1, 1], [1, 8]],
        'Default/network-Net/MatMul-op1': [[1, 8], [8, 1]]
    }
    assert strategies == expected_strategies
def test_double_subgraphs_train():
    context.set_context(save_graphs=True)
    context.set_auto_parallel_context(device_num=1, global_rank=0)
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    net = TrainStepWarp(NetWithLoss(Net()))

    batch_ids = np.ones([8, 8, 8, 8]).astype(np.int32)
    ds_train = DatasetLenet(Tensor(batch_ids), None)
    model = Model(net)
    model.train(1, ds_train, dataset_sink_mode=False)
    strategies = _executor._get_shard_strategy(net)
    for (k, v) in strategies.items():
        if re.search('ReduceMean-op', k) is not None:
            assert v == [[1, 1, 1, 1]]
        elif re.search('ReLU-op', k) is not None:
            assert v == [[1, 1, 1, 1]]
        elif re.search('Mul-op', k) is not None:
            assert v == [[1, 1, 1, 1], [1, 1, 1, 1]]
        elif re.search('Cast-op', k) is not None:
            assert v == [[1, 1, 1, 1]]
        elif re.search('ReduceSum-op', k) is not None:
            assert v == [[1, 1, 1, 1]]
def test_double_subgraphs():
    _set_multi_subgraphs()
    context.set_context(save_graphs=True)
    context.set_auto_parallel_context(device_num=8, global_rank=0)
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    net = TrainStepWarp(NetWithLoss(Net()))
    net.set_auto_parallel()

    x = Tensor(np.ones([8, 8, 8, 8]), dtype=ms.float32)
    reset_op_id()
    net.set_train()
    _executor.compile(net, x, phase='train')
    strategies = _executor._get_shard_strategy(net)
    for (k, v) in strategies.items():
        if re.search('ReduceMean-op', k) is not None:
            assert v == [[8, 1, 1, 1]]
        elif re.search('ReLU-op', k) is not None:
            assert v == [[8, 1, 1, 1]]
        elif re.search('Mul-op', k) is not None:
            assert v == [[8, 1, 1, 1], [8, 1, 1, 1]]
        elif re.search('ReduceSum-op', k) is not None:
            assert v == [[8, 1, 1, 1]]
Example #14
def all_to_all_common(strategy1):
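    """Variant of all_to_all_common for SEMI_AUTO_PARALLEL on 8 devices: the
    loss is sharded explicitly with shard() and the searched strategies of
    the train network are returned."""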
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2

    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(
        parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=8)
    predict = Tensor(np.ones([32, 128]), dtype=ms.float32)
    label = Tensor(np.ones([32]), dtype=ms.int32)
    dataset = Dataset(predict, label, 2)
    net = all_to_all_net(strategy1)

    loss = SoftmaxCrossEntropyWithLogits(sparse=True)
    loss.softmax_cross_entropy.shard(((8, 1), (8, 1)))
    loss.one_hot.shard(((8, 1), (), ()))
    opt = Momentum(net.trainable_params(), learning_rate, momentum)
    model = Model(net, loss, opt)

    model.train(epoch_size, dataset, dataset_sink_mode=False)
    strategies = _executor._get_shard_strategy(model._train_network)
    return strategies
Example #15
def test_two_bn():
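    """Two blocks built by get_block() joined by a ReLU and an Add; with
    elementwise_op_strategy_follow enabled the compiled graph should expose
    exactly four searched strategies, all split 8 ways on the batch dim."""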
    class Net(nn.Cell):
        def __init__(self):
            super().__init__()
            self.block1 = get_block()
            self.block2 = get_block()
            self.relu = P.ReLU()
            self.add = P.Add()
            self.bias = Tensor(np.ones([64, 64]), dtype=ms.float32)

        def construct(self, x):
            out = self.block1(x)
            out = self.relu(out)
            out = self.add(out, self.bias)
            out = self.block2(out)
            return out

    context.set_context(save_graphs=False)
    context.set_auto_parallel_context(device_num=8, global_rank=0)
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    net = NetWithLoss(Net())
    x = Tensor(np.ones([64, 64]), dtype=ms.float32)
    net.set_auto_parallel()
    net.set_train()
    set_algo_parameters(elementwise_op_strategy_follow=True)
    reset_op_id()

    _executor.compile(net, x, phase='train')
    strategies = _executor._get_shard_strategy(net)
    assert len(strategies) == 4

    for (k, v) in strategies.items():
        if re.search('BatchNorm-op', k) is not None:
            assert v == [[8, 1], [1], [1], [1], [1]]
        elif re.search('TensorAdd-op', k) is not None:
            assert v == [[8, 1], [8, 1]]
        elif re.search('ReLU-op', k) is not None:
            assert v == [[8, 1]]
Example #16
    else:
        ## fp32 training
        opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                       lr, config.momentum, config.weight_decay)
        model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})

    # define callbacks
    time_cb = TimeMonitor(data_size=step_size)
    loss_cb = LossMonitor()
    cb = [time_cb, loss_cb]
    if config.save_checkpoint:
        config_ck = CheckpointConfig(
            save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
            keep_checkpoint_max=config.keep_checkpoint_max)
        ckpt_cb = ModelCheckpoint(prefix="resnet",
                                  directory=ckpt_save_dir,
                                  config=config_ck)
        cb += [ckpt_cb]

    # train model
    if args_opt.net == "se-resnet50":
        config.epoch_size = config.train_epoch_size
    model.train(config.epoch_size - config.pretrain_epoch_size,
                dataset,
                callbacks=cb,
                sink_size=dataset.get_dataset_size(),
                dataset_sink_mode=(not args_opt.parameter_server))

    open("resnet50_partition.txt",
         "w").write(str(_executor._get_shard_strategy(model._train_network)))
Example #17
def test_two_matmul():
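    """Exercise the cost-model context and algorithm-parameter APIs (set,
    read back, reset to defaults), then compile two chained MatMuls on 16
    devices and check that both get the data-parallel strategy."""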
    class Net(nn.Cell):
        def __init__(self):
            super().__init__()
            self.matmul1 = P.MatMul()
            self.matmul2 = P.MatMul()

        def construct(self, x, y, b):
            out = self.matmul1(x, y)
            out = self.matmul2(out, b)
            return out

    size = 16
    context.set_auto_parallel_context(device_num=size, global_rank=0)
    cost_model_context.set_cost_model_context(
        device_memory_capacity=32.0 * 1024.0 * 1024.0 * 1024.0,
        costmodel_alpha=1.0,
        costmodel_beta=60.0,
        costmodel_gamma=0.1,
        costmodel_communi_threshold=1024.0,
        costmodel_communi_const=2222.0,
        costmodel_communi_bias=1111.0)
    dev_mem_cap = cost_model_context.get_cost_model_context(
        "device_memory_capacity")
    assert dev_mem_cap == 32.0 * 1024.0 * 1024.0 * 1024.0
    costmodel_alpha = cost_model_context.get_cost_model_context(
        "costmodel_alpha")
    assert costmodel_alpha == 1.0
    costmodel_beta = cost_model_context.get_cost_model_context(
        "costmodel_beta")
    assert costmodel_beta == 60.0
    costmodel_gamma = cost_model_context.get_cost_model_context(
        "costmodel_gamma")
    assert costmodel_gamma == 0.1
    costmodel_communi_threshold = cost_model_context.get_cost_model_context(
        "costmodel_communi_threshold")
    assert costmodel_communi_threshold == 1024.0
    costmodel_communi_const = cost_model_context.get_cost_model_context(
        "costmodel_communi_const")
    assert costmodel_communi_const == 2222.0
    costmodel_communi_bias = cost_model_context.get_cost_model_context(
        "costmodel_communi_bias")
    assert costmodel_communi_bias == 1111.0

    cost_model_context.reset_cost_model_context()
    dev_mem_cap = cost_model_context.get_cost_model_context(
        "device_memory_capacity")
    assert dev_mem_cap == 16.0 * 1024.0 * 1024.0 * 1024.0
    costmodel_alpha = cost_model_context.get_cost_model_context(
        "costmodel_alpha")
    assert costmodel_alpha == 1.0
    costmodel_beta = cost_model_context.get_cost_model_context(
        "costmodel_beta")
    assert costmodel_beta == 400.0
    costmodel_gamma = cost_model_context.get_cost_model_context(
        "costmodel_gamma")
    assert costmodel_gamma == 0.001
    costmodel_communi_threshold = cost_model_context.get_cost_model_context(
        "costmodel_communi_threshold")
    assert costmodel_communi_threshold == 2048.0
    costmodel_communi_const = cost_model_context.get_cost_model_context(
        "costmodel_communi_const")
    assert costmodel_communi_const == 3072.0
    costmodel_communi_bias = cost_model_context.get_cost_model_context(
        "costmodel_communi_bias")
    assert costmodel_communi_bias == 1024.0

    set_algo_parameters(tensor_slice_align_enable=False,
                        tensor_slice_align_size=32,
                        fully_use_devices=False,
                        elementwise_op_strategy_follow=False,
                        enable_algo_approxi=True,
                        algo_approxi_epsilon=0.001)
    para_slice_align_enable = get_algo_parameters("tensor_slice_align_enable")
    assert not para_slice_align_enable
    para_slice_align_size = get_algo_parameters("tensor_slice_align_size")
    assert para_slice_align_size == 32
    fully_use_devices = get_algo_parameters("fully_use_devices")
    assert not fully_use_devices
    elementwise_op_strategy_follow = get_algo_parameters(
        "elementwise_op_strategy_follow")
    assert not elementwise_op_strategy_follow
    enable_approxi = get_algo_parameters("enable_algo_approxi")
    assert enable_approxi
    algo_epsilon = get_algo_parameters("algo_approxi_epsilon")
    assert algo_epsilon == 0.001

    expected_single_loop = True
    single_loop = _get_algo_single_loop()
    assert expected_single_loop == single_loop
    expected_single_loop = False
    _set_algo_single_loop(expected_single_loop)
    single_loop = _get_algo_single_loop()
    assert expected_single_loop == single_loop

    reset_algo_parameters()
    para_slice_align_enable = get_algo_parameters("tensor_slice_align_enable")
    assert not para_slice_align_enable
    para_slice_align_size = get_algo_parameters("tensor_slice_align_size")
    assert para_slice_align_size == 16
    fully_use_devices = get_algo_parameters("fully_use_devices")
    assert fully_use_devices
    elementwise_op_strategy_follow = get_algo_parameters(
        "elementwise_op_strategy_follow")
    assert not elementwise_op_strategy_follow
    enable_approxi = get_algo_parameters("enable_algo_approxi")
    assert not enable_approxi
    algo_epsilon = get_algo_parameters("algo_approxi_epsilon")
    assert algo_epsilon == 0.1

    x = Tensor(np.ones([128, 32]), dtype=ms.float32)
    y = Tensor(np.ones([32, 64]), dtype=ms.float32)
    b = Tensor(np.ones([64, 64]), dtype=ms.float32)

    net = NetWithLoss(Net())
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    net.set_auto_parallel()
    reset_op_id()

    net.set_train()
    _executor.compile(net, x, y, b, phase='train')
    strategies = _executor._get_shard_strategy(net)
    for (k, v) in strategies.items():
        if re.search('MatMul-op', k) is not None:
            assert v == [[16, 1], [1, 1]]
def test_two_matmul():
    class Net(nn.Cell):
        def __init__(self):
            super().__init__()
            self.matmul1 = P.MatMul()
            self.matmul2 = P.MatMul()

        def construct(self, x, y, b):
            out = self.matmul1(x, y)
            out = self.matmul2(out, b)
            return out

    size = 16
    context.set_auto_parallel_context(device_num=size, global_rank=0)
    cost_model_context.set_cost_model_context(
        device_memory_capacity=32.0 * 1024.0 * 1024.0 * 1024.0,
        costmodel_alpha=1.0,
        costmodel_beta=60.0,
        costmodel_gamma=0.1,
        costmodel_communi_threshold=1024.0,
        costmodel_communi_const=2222.0,
        costmodel_communi_bias=1111.0)
    dev_mem_cap = cost_model_context.get_cost_model_context(
        "device_memory_capacity")
    assert dev_mem_cap == 32.0 * 1024.0 * 1024.0 * 1024.0
    costmodel_alpha = cost_model_context.get_cost_model_context(
        "costmodel_alpha")
    assert costmodel_alpha == 1.0
    costmodel_beta = cost_model_context.get_cost_model_context(
        "costmodel_beta")
    assert costmodel_beta == 60.0
    costmodel_gamma = cost_model_context.get_cost_model_context(
        "costmodel_gamma")
    assert costmodel_gamma == 0.1
    costmodel_communi_threshold = cost_model_context.get_cost_model_context(
        "costmodel_communi_threshold")
    assert costmodel_communi_threshold == 1024.0
    costmodel_communi_const = cost_model_context.get_cost_model_context(
        "costmodel_communi_const")
    assert costmodel_communi_const == 2222.0
    costmodel_communi_bias = cost_model_context.get_cost_model_context(
        "costmodel_communi_bias")
    assert costmodel_communi_bias == 1111.0

    cost_model_context.reset_cost_model_context()
    dev_mem_cap = cost_model_context.get_cost_model_context(
        "device_memory_capacity")
    assert dev_mem_cap == 16.0 * 1024.0 * 1024.0 * 1024.0
    costmodel_alpha = cost_model_context.get_cost_model_context(
        "costmodel_alpha")
    assert costmodel_alpha == 1.0
    costmodel_beta = cost_model_context.get_cost_model_context(
        "costmodel_beta")
    assert costmodel_beta == 400.0
    costmodel_gamma = cost_model_context.get_cost_model_context(
        "costmodel_gamma")
    assert costmodel_gamma == 0.001
    costmodel_communi_threshold = cost_model_context.get_cost_model_context(
        "costmodel_communi_threshold")
    assert costmodel_communi_threshold == 2048.0
    costmodel_communi_const = cost_model_context.get_cost_model_context(
        "costmodel_communi_const")
    assert costmodel_communi_const == 3072.0
    costmodel_communi_bias = cost_model_context.get_cost_model_context(
        "costmodel_communi_bias")
    assert costmodel_communi_bias == 1024.0

    set_algo_parameters(tensor_slice_align_enable=False,
                        tensor_slice_align_size=32,
                        fully_use_devices=False,
                        elementwise_op_strategy_follow=False)
    para_slice_align_enable = get_algo_parameters("tensor_slice_align_enable")
    assert not para_slice_align_enable
    para_slice_align_size = get_algo_parameters("tensor_slice_align_size")
    assert para_slice_align_size == 32
    fully_use_devices = get_algo_parameters("fully_use_devices")
    assert not fully_use_devices
    elementwise_op_strategy_follow = get_algo_parameters(
        "elementwise_op_strategy_follow")
    assert not elementwise_op_strategy_follow

    reset_algo_parameters()
    para_slice_align_enable = get_algo_parameters("tensor_slice_align_enable")
    assert not para_slice_align_enable
    para_slice_align_size = get_algo_parameters("tensor_slice_align_size")
    assert para_slice_align_size == 16
    fully_use_devices = get_algo_parameters("fully_use_devices")
    assert fully_use_devices
    elementwise_op_strategy_follow = get_algo_parameters(
        "elementwise_op_strategy_follow")
    assert not elementwise_op_strategy_follow

    x = Tensor(np.ones([128, 32]), dtype=ms.float32)
    y = Tensor(np.ones([32, 64]), dtype=ms.float32)
    b = Tensor(np.ones([64, 64]), dtype=ms.float32)

    net = NetWithLoss(Net())
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    net.set_auto_parallel()
    reset_op_id()

    _executor.compile(net, x, y, b, phase='train')
    strategies = _executor._get_shard_strategy(net)
    expected_strategies = {
        'Default/network-Net/MatMul-op0': [[16, 1], [1, 1]],
        'Default/network-Net/MatMul-op1': [[16, 1], [1, 1]]
    }
    assert strategies == expected_strategies
Example #19
def run_pretrain():
    """pre-train bert_clue"""
    parser = argparse.ArgumentParser(description='bert pre_training')
    parser.add_argument(
        '--device_target',
        type=str,
        default='Ascend',
        choices=['Ascend', 'GPU'],
        help='device where the code will be implemented. (Default: Ascend)')
    parser.add_argument("--distribute",
                        type=str,
                        default="false",
                        choices=["true", "false"],
                        help="Run distribute, default is false.")
    parser.add_argument("--epoch_size",
                        type=int,
                        default="1",
                        help="Epoch size, default is 1.")
    parser.add_argument("--device_id",
                        type=int,
                        default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--device_num",
                        type=int,
                        default=1,
                        help="Use device nums, default is 1.")
    parser.add_argument("--enable_save_ckpt",
                        type=str,
                        default="true",
                        choices=["true", "false"],
                        help="Enable save checkpoint, default is true.")
    parser.add_argument("--enable_lossscale",
                        type=str,
                        default="true",
                        choices=["true", "false"],
                        help="Use lossscale or not, default is not.")
    parser.add_argument("--do_shuffle",
                        type=str,
                        default="true",
                        choices=["true", "false"],
                        help="Enable shuffle for dataset, default is true.")
    parser.add_argument("--enable_data_sink",
                        type=str,
                        default="true",
                        choices=["true", "false"],
                        help="Enable data sink, default is true.")
    parser.add_argument("--data_sink_steps",
                        type=int,
                        default="1",
                        help="Sink steps for each epoch, default is 1.")
    parser.add_argument(
        "--accumulation_steps",
        type=int,
        default=1,
        help="Accumulate gradients N times before the weight update, default is 1.")
    parser.add_argument("--save_checkpoint_path",
                        type=str,
                        default="",
                        help="Save checkpoint path")
    parser.add_argument("--load_checkpoint_path",
                        type=str,
                        default="",
                        help="Load checkpoint file path")
    parser.add_argument("--save_checkpoint_steps",
                        type=int,
                        default=1000,
                        help="Save checkpoint steps, "
                        "default is 1000.")
    parser.add_argument("--train_steps",
                        type=int,
                        default=-1,
                        help="Training Steps, default is -1, "
                        "meaning run all steps according to epoch number.")
    parser.add_argument("--save_checkpoint_num",
                        type=int,
                        default=1,
                        help="Save checkpoint numbers, default is 1.")
    parser.add_argument("--data_dir",
                        type=str,
                        default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_dir",
                        type=str,
                        default="",
                        help="Schema path, it is better to use absolute path")

    args_opt = parser.parse_args()
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args_opt.device_target,
                        device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)
    ckpt_save_dir = args_opt.save_checkpoint_path
    if args_opt.distribute == "true":
        if args_opt.device_target == 'Ascend':
            D.init()
            device_num = args_opt.device_num
            rank = args_opt.device_id % device_num
        else:
            D.init()
            device_num = D.get_group_size()
            rank = D.get_rank()
        ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(get_rank()) + '/'

        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.AUTO_PARALLEL,
            gradients_mean=True,
            device_num=device_num,
            auto_parallel_search_mode="recursive_programming")
        _set_bert_all_reduce_split()
    else:
        rank = 0
        device_num = 1

    if args_opt.device_target == 'GPU' and bert_net_cfg.compute_type != mstype.float32:
        logger.warning('GPU only supports fp32 temporarily; running with fp32.')
        bert_net_cfg.compute_type = mstype.float32

    if args_opt.accumulation_steps > 1:
        logger.info("accumulation steps: {}".format(
            args_opt.accumulation_steps))
        logger.info("global batch size: {}".format(
            cfg.batch_size * args_opt.accumulation_steps))
        if args_opt.enable_data_sink == "true":
            args_opt.data_sink_steps *= args_opt.accumulation_steps
            logger.info("data sink steps: {}".format(args_opt.data_sink_steps))
        if args_opt.enable_save_ckpt == "true":
            args_opt.save_checkpoint_steps *= args_opt.accumulation_steps
            logger.info("save checkpoint steps: {}".format(
                args_opt.save_checkpoint_steps))

    ds = create_bert_dataset(device_num, rank, args_opt.do_shuffle,
                             args_opt.data_dir, args_opt.schema_dir)
    net_with_loss = BertNetworkWithLoss(bert_net_cfg, True)

    new_repeat_count = args_opt.epoch_size * ds.get_dataset_size() // args_opt.data_sink_steps
    if args_opt.train_steps > 0:
        train_steps = args_opt.train_steps * args_opt.accumulation_steps
        new_repeat_count = min(new_repeat_count,
                               train_steps // args_opt.data_sink_steps)
    else:
        args_opt.train_steps = args_opt.epoch_size * ds.get_dataset_size() // args_opt.accumulation_steps
        logger.info("train steps: {}".format(args_opt.train_steps))

    optimizer = _get_optimizer(args_opt, net_with_loss)
    callback = [
        TimeMonitor(args_opt.data_sink_steps),
        LossCallBack(ds.get_dataset_size())
    ]
    if args_opt.enable_save_ckpt == "true" and args_opt.device_id % min(
            8, device_num) == 0:
        config_ck = CheckpointConfig(
            save_checkpoint_steps=args_opt.save_checkpoint_steps,
            keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(
            prefix='checkpoint_bert',
            directory=None if ckpt_save_dir == "" else ckpt_save_dir,
            config=config_ck)
        callback.append(ckpoint_cb)

    if args_opt.load_checkpoint_path:
        param_dict = load_checkpoint(args_opt.load_checkpoint_path)
        load_param_into_net(net_with_loss, param_dict)

    if args_opt.enable_lossscale == "true":
        update_cell = DynamicLossScaleUpdateCell(
            loss_scale_value=cfg.loss_scale_value,
            scale_factor=cfg.scale_factor,
            scale_window=cfg.scale_window)

        if args_opt.accumulation_steps <= 1:
            net_with_grads = BertTrainOneStepWithLossScaleCell(
                net_with_loss,
                optimizer=optimizer,
                scale_update_cell=update_cell)
        else:
            accumulation_steps = args_opt.accumulation_steps
            net_with_grads = BertTrainAccumulateStepsWithLossScaleCell(
                net_with_loss,
                optimizer=optimizer,
                scale_update_cell=update_cell,
                accumulation_steps=accumulation_steps,
                enable_global_norm=cfg.enable_global_norm)
    else:
        net_with_grads = BertTrainOneStepCell(net_with_loss,
                                              optimizer=optimizer)

    model = Model(net_with_grads)
    model.train(new_repeat_count,
                ds,
                callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"),
                sink_size=args_opt.data_sink_steps)

    open("bert_4gpu.txt",
         "w").write(str(_executor._get_shard_strategy(model._train_network)))
Example #20
def compile_graph_two_input(x, y, net):
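    """Compile a two-input net in inference mode (set_train(False)) under
    auto-parallel and return the searched shard strategies."""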
    net.set_auto_parallel()
    net.set_train(False)
    _executor.compile(net, x, y, auto_parallel_mode=True)
    strategies = _executor._get_shard_strategy(net)
    return strategies
Example #21
def train(cloud_args=None):
    """training process"""
    args = parse_args(cloud_args)
    context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True,
                        device_target=args.platform, save_graphs=False)
    if os.getenv('DEVICE_ID', "not_set").isdigit():
        context.set_context(device_id=int(os.getenv('DEVICE_ID')))

    # init distributed
    if args.is_distributed:
        # parallel_mode = ParallelMode.DATA_PARALLEL
        parallel_mode = ParallelMode.AUTO_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=args.group_size,
                                          gradients_mean=True)
    # dataloader
    de_dataset = classification_dataset(args.data_dir, args.image_size,
                                        args.per_batch_size, 1,
                                        args.rank, args.group_size, num_parallel_workers=8)
    de_dataset.map_model = 4  # !!!important
    args.steps_per_epoch = de_dataset.get_dataset_size()

    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    # get network and init
    network = get_network(args.backbone, num_classes=args.num_classes, platform=args.platform)
    if network is None:
        raise NotImplementedError('not implement {}'.format(args.backbone))

    load_pretrain_model(args.pretrained, network, args)

    # lr scheduler
    lr = get_lr(args)

    # optimizer
    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)


    # loss
    if not args.label_smooth:
        args.label_smooth_factor = 0.0
    loss = CrossEntropy(smooth_factor=args.label_smooth_factor, num_classes=args.num_classes)

    if args.is_dynamic_loss_scale == 1:
        loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536, scale_factor=2, scale_window=2000)
    else:
        loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)

    model = Model(network, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale_manager,
                  metrics={'acc'}, amp_level="O3")

    # checkpoint save
    progress_cb = ProgressMonitor(args)
    callbacks = [progress_cb,]
    if args.rank_save_ckpt_flag:
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval * args.steps_per_epoch,
                                       keep_checkpoint_max=args.ckpt_save_max)
        save_ckpt_path = os.path.join(args.outputs_dir, 'ckpt_' + str(args.rank) + '/')
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=save_ckpt_path,
                                  prefix='{}'.format(args.rank))
        callbacks.append(ckpt_cb)

    model.train(args.max_epoch, de_dataset, callbacks=callbacks, dataset_sink_mode=True)
    
    open("resnext50_partition_4gpu.txt", "w").write(str(_executor._get_shard_strategy(model._train_network)))
    print("successfully exported partition!")