Example #1
import numpy as np
from mindspore.communication import init, get_group_size, get_rank


class FakeData:  # class name assumed; the source shows only this __init__
    def __init__(self,
                 size=256,
                 batch_size=16,
                 image_size=(96,),
                 num_classes=16,
                 random_offset=0):
        """Initialize a synthetic dataset whose total batch size scales with the device group."""
        self.size = size
        self.rank_batch_size = batch_size
        self.total_batch_size = self.rank_batch_size
        self.random_offset = random_offset
        self.image_size = image_size
        self.num_classes = num_classes
        self.num_epochs = -1
        self.rank_size = 1
        self.rank_id = 0
        self.batch_index = 0
        self.image_data_type = np.float32
        self.label_data_type = np.float32
        self.is_onehot = True
        # Initialize HCCL collectives, then query the group for this process.
        init(backend_name='hccl')
        self.rank_size = get_group_size()
        self.rank_id = get_rank()
        # Global batch = per-rank batch * number of ranks.
        self.total_batch_size = self.rank_batch_size * self.rank_size
        self.total_batch_data_size = (self.rank_size,
                                      self.rank_batch_size) + image_size
        self.do_copy = False
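With the defaults above and an 8-device group (the device count is an assumption for illustration), the derived sizes work out as follows:

# Assuming get_group_size() returns 8 with the default arguments:
# rank_batch_size       = 16
# total_batch_size      = 16 * 8 = 128
# total_batch_data_size = (8, 16) + (96,) = (8, 16, 96)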
Example #2
def test_inference():
    """distributed inference after distributed training"""
    context.set_context(mode=context.GRAPH_MODE)
    init(backend_name="hccl")
    context.set_auto_parallel_context(full_batch=True, parallel_mode="semi_auto_parallel",
                                      strategy_ckpt_load_file="./train_strategy.ckpt", device_num=8)

    predict_data = create_predict_data()
    network = Net(matmul_size=(96, 16))
    model = Model(network)
    # Derive the tensor layout required for prediction, then load the
    # checkpoint slices saved by the distributed training job.
    predict_layout = model.infer_predict_layout(Tensor(predict_data))
    ckpt_file_list = create_ckpt_file_list()
    load_distributed_checkpoint(network, ckpt_file_list, predict_layout)
    predict_result = model.predict(predict_data)
    print(predict_result)
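For reference, `load_distributed_checkpoint` expects `ckpt_file_list` to hold one checkpoint path per training rank, ordered by rank id. A minimal sketch of `create_ckpt_file_list` (the directory layout and file names are assumptions, not part of the original example):

def create_ckpt_file_list(num_ranks=8):
    # Hypothetical layout: one checkpoint slice per rank, as saved by
    # ModelCheckpoint during the distributed training run.
    return ["./ckpt_rank_{}/net.ckpt".format(rank) for rank in range(num_ranks)]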
Example #3
    def __run_standalone(self):
        # import
        from mindspore import context
        from mindspore.communication import init
        from mindspore.context import ParallelMode

        # set context: device_target
        context.set_context(device_target=self.__device_target)

        # set context: mode
        if self.__graph_mode:
            context.set_context(mode=context.GRAPH_MODE)

        # set context: save_graphs
        context.set_context(save_graphs=self.__save_graphs)

        # set context: device_id
        device_id = int(os.environ.get("DEVICE_ID", 0))
        context.set_context(device_id=device_id)

        # init
        device_num = int(os.environ.get("DEVICE_NUM", 1))
        if device_num > 1 and "win32" not in sys.platform:
            context.reset_auto_parallel_context()
            context.set_auto_parallel_context(
                device_num=device_num,
                parallel_mode=ParallelMode.DATA_PARALLEL,
                gradients_mean=True)
            init()

        if self.__dataset is None:
            print(
                "Warning: `dataset` is None. Please call func: `set_dataset($dataset)`."
            )

        if self.__network is None:
            print(
                "Warning: `network` is None. Please call func: `set_network($network)`."
            )

        if self.__dataset is None or self.__network is None:
            return

        if self.__do_eval:
            self.__eval()
        else:
            self.__train()
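The runner reads DEVICE_ID and DEVICE_NUM from the environment (`os` and `sys` are assumed to be imported at module level). When starting a single process by hand rather than through a multi-device launcher, the variables can be seeded first; a minimal sketch with assumed values:

import os

# Standalone run: one device, so the `device_num > 1` branch (and init()) is skipped.
os.environ.setdefault("DEVICE_ID", "0")
os.environ.setdefault("DEVICE_NUM", "1")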
Example #4
def inception_v4_train():
    """
    Train Inceptionv4 in data parallelism
    """
    print('epoch_size: {} batch_size: {} class_num {}'.format(config.epoch_size, config.batch_size, config.num_classes))

    context.set_context(mode=context.GRAPH_MODE, device_target=args.platform)
    if args.platform == "Ascend":
        context.set_context(device_id=args.device_id)
        context.set_context(enable_graph_kernel=False)

    rank = 0
    if device_num > 1:  # device_num is defined outside this snippet
        if args.platform == "Ascend":
            init(backend_name='hccl')
        elif args.platform == "GPU":
            init()
        else:
            raise ValueError("Unsupported device target.")

        rank = get_rank()
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True,
                                          all_reduce_fusion_config=[200, 400])

    # create dataset
    train_dataset = create_dataset(dataset_path=args.dataset_path, do_train=True,
                                   repeat_num=1, batch_size=config.batch_size, shard_id=rank)
    train_step_size = train_dataset.get_dataset_size()

    # create model
    net = Inceptionv4(classes=config.num_classes)
    # loss
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    # learning rate
    lr = Tensor(generate_cosine_lr(steps_per_epoch=train_step_size, total_epochs=config.epoch_size))

    # Split parameters into weight-decayed and non-decayed groups, and
    # re-initialize the decayed (weight) parameters with Xavier uniform.
    decayed_params = []
    no_decayed_params = []
    for param in net.trainable_params():
        if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
            param.set_data(initializer(XavierUniform(), param.data.shape, param.data.dtype))
            decayed_params.append(param)
        else:
            no_decayed_params.append(param)
    group_params = [{'params': decayed_params, 'weight_decay': config.weight_decay},
                    {'params': no_decayed_params},
                    {'order_params': net.trainable_params()}]

    opt = RMSProp(group_params, lr, decay=config.decay, epsilon=config.epsilon, weight_decay=config.weight_decay,
                  momentum=config.momentum, loss_scale=config.loss_scale)

    if args.device_id == 0:
        print(lr)
        print(train_step_size)
    if args.resume:
        ckpt = load_checkpoint(args.resume)
        load_param_into_net(net, ckpt)

    loss_scale_manager = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)

    if args.platform == "Ascend":
        model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc', 'top_1_accuracy', 'top_5_accuracy'},
                      loss_scale_manager=loss_scale_manager, amp_level=config.amp_level)
    elif args.platform == "GPU":
        model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc', 'top_1_accuracy', 'top_5_accuracy'},
                      loss_scale_manager=loss_scale_manager, amp_level='O0')
    else:
        raise ValueError("Unsupported device target.")

    # define callbacks
    performance_cb = TimeMonitor(data_size=train_step_size)
    loss_cb = LossMonitor(per_print_times=train_step_size)
    ckp_save_step = config.save_checkpoint_epochs * train_step_size
    config_ck = CheckpointConfig(save_checkpoint_steps=ckp_save_step, keep_checkpoint_max=config.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix=f"inceptionV4-train-rank{rank}",
                                 directory='ckpts_rank_' + str(rank), config=config_ck)
    callbacks = [performance_cb, loss_cb]
    if device_num > 1 and config.is_save_on_master:
        if args.device_id == 0:
            callbacks.append(ckpoint_cb)
    else:
        callbacks.append(ckpoint_cb)

    # train model
    model.train(config.epoch_size, train_dataset, callbacks=callbacks, dataset_sink_mode=True)
Example #5
import os
from mindspore import dtype as mstype
import mindspore.ops as ops
import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as vision
import mindspore.dataset.transforms.c_transforms as C
from mindspore.communication import init, get_rank, get_group_size
from mindspore import Tensor, Model, context
from mindspore.nn import Momentum
from mindspore.context import ParallelMode
from mindspore.train.callback import LossMonitor
from resnet import resnet50

device_id = int(os.getenv('DEVICE_ID', '0'))  # default to device 0 if unset
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
context.set_context(device_id=device_id)  # set device_id
init()


def create_dataset(data_path,
                   repeat_num=1,
                   batch_size=32,
                   rank_id=0,
                   rank_size=1):  # pylint: disable=missing-docstring
    resize_height = 224
    resize_width = 224
    rescale = 1.0 / 255.0
    shift = 0.0

    # get rank_id and rank_size
    rank_id = get_rank()
    rank_size = get_group_size()
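The listing is cut off at this point. A minimal sketch of how `create_dataset` might continue, reusing the variables defined above; the CIFAR-10 source and the exact transform pipeline are assumptions:

    # Shard the source dataset across devices (dataset class is an assumption).
    data_set = ds.Cifar10Dataset(data_path, num_shards=rank_size, shard_id=rank_id)

    # Image transforms built from the variables defined above.
    resize_op = vision.Resize((resize_height, resize_width))
    rescale_op = vision.Rescale(rescale, shift)
    changeswap_op = vision.HWC2CHW()
    type_cast_op = C.TypeCast(mstype.int32)

    data_set = data_set.map(operations=type_cast_op, input_columns="label")
    data_set = data_set.map(operations=[resize_op, rescale_op, changeswap_op],
                            input_columns="image")

    # Batch per device, then repeat for the requested number of epochs.
    data_set = data_set.batch(batch_size=batch_size, drop_remainder=True)
    data_set = data_set.repeat(repeat_num)
    return data_set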
Example #6
import mindspore.nn as nn
from mindspore import dtype as mstype
import mindspore.ops as ops
import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as vision
import mindspore.dataset.transforms.c_transforms as C
from mindspore.communication import init, get_rank, get_group_size
from mindspore import Tensor, Model
from mindspore.nn import Momentum
from mindspore.context import ParallelMode
from mindspore import context
from mindspore.train.callback import LossMonitor
from resnet import resnet50

context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
init("nccl")


def create_dataset(data_path,
                   repeat_num=1,
                   batch_size=32,
                   rank_id=0,
                   rank_size=1):  # pylint: disable=missing-docstring
    resize_height = 224
    resize_width = 224
    rescale = 1.0 / 255.0
    shift = 0.0

    # get rank_id and rank_size
    rank_id = get_rank()
    rank_size = get_group_size()
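Example #6 is truncated at the same point; the continuation sketched after Example #5 applies here unchanged, since the GPU variant above differs only in the device target and the NCCL `init` call.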