def __init__(self, size=256, batch_size=16, image_size=(96,), num_classes=16, random_offset=0):
    """Initialize the fake dataset and query distributed rank information."""
    self.size = size
    self.rank_batch_size = batch_size
    self.total_batch_size = self.rank_batch_size
    self.random_offset = random_offset
    self.image_size = image_size
    self.num_classes = num_classes
    self.num_epochs = -1
    self.rank_size = 1
    self.rank_id = 0
    self.batch_index = 0
    self.image_data_type = np.float32
    self.label_data_type = np.float32
    self.is_onehot = True

    # Initialize HCCL communication, then derive the global batch layout
    # from the number of participating devices.
    init(backend_name='hccl')
    self.rank_size = get_group_size()
    self.rank_id = get_rank()
    self.total_batch_size = self.rank_batch_size * self.rank_size
    self.total_batch_data_size = (self.rank_size, self.rank_batch_size) + image_size
    self.do_copy = False
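# --- Hedged sketch (assumption, not from the original source): a minimal,
# self-contained fake-data source implementing the __getitem__/__len__ protocol
# that mindspore.dataset.GeneratorDataset expects. The class name, column names,
# and shapes below are hypothetical; the original only shows the __init__ above.
import numpy as np
import mindspore.dataset as ds


class _FakeDataSketch:
    """Generate random images and one-hot labels for pipeline smoke tests."""

    def __init__(self, size=256, image_size=(96,), num_classes=16):
        self.size = size
        self.image_size = image_size
        self.num_classes = num_classes

    def __getitem__(self, index):
        np.random.seed(index)
        image = np.random.randn(*self.image_size).astype(np.float32)
        label = np.zeros(self.num_classes, dtype=np.float32)
        label[index % self.num_classes] = 1.0  # one-hot label
        return image, label

    def __len__(self):
        return self.size


# Wrap the generator so it can feed Model.train / Model.predict.
sketch_dataset = ds.GeneratorDataset(_FakeDataSketch(), column_names=["image", "label"])
sketch_dataset = sketch_dataset.batch(16)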
def test_inference():
    """distributed inference after distributed training"""
    context.set_context(mode=context.GRAPH_MODE)
    init(backend_name="hccl")
    context.set_auto_parallel_context(full_batch=True, parallel_mode="semi_auto_parallel",
                                      strategy_ckpt_load_file="./train_strategy.ckpt", device_num=8)

    predict_data = create_predict_data()
    network = Net(matmul_size=(96, 16))
    model = Model(network)
    predict_layout = model.infer_predict_layout(Tensor(predict_data))
    ckpt_file_list = create_ckpt_file_list()
    load_distributed_checkpoint(network, ckpt_file_list, predict_layout)
    predict_result = model.predict(predict_data)
    print(predict_result)
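# --- Hedged sketch (assumption): plausible bodies for the helpers that
# test_inference above relies on. The real create_predict_data and
# create_ckpt_file_list are not shown in this snippet; the data shape,
# checkpoint directory layout, and file naming below are illustrative only.
import os

import numpy as np
from mindspore import Tensor


def create_predict_data():
    """Return a dummy inference batch; the (128, 96) shape is assumed."""
    inputs_np = np.random.randn(128, 96).astype(np.float32)
    return Tensor(inputs_np)


def create_ckpt_file_list(checkpoint_dir="./src_checkpoints", rank_size=8):
    """Collect one checkpoint path per training rank (naming pattern is assumed)."""
    return [os.path.join(checkpoint_dir, f"rank_{rank}", "checkpoint-10_32.ckpt")
            for rank in range(rank_size)]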
def __run_standalone(self):
    # import
    from mindspore import context
    from mindspore.communication import init
    from mindspore.context import ParallelMode

    # set context: device_target
    context.set_context(device_target=self.__device_target)

    # set context: mode
    if self.__graph_mode:
        context.set_context(mode=context.GRAPH_MODE)

    # set context: save_graphs
    context.set_context(save_graphs=self.__save_graphs)

    # set context: device_id
    device_id = int(os.environ.get("DEVICE_ID", 0))
    context.set_context(device_id=device_id)

    # init
    device_num = int(os.environ.get("DEVICE_NUM", 1))
    if device_num > 1 and "win32" not in sys.platform:
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            device_num=device_num,
            parallel_mode=ParallelMode.DATA_PARALLEL,
            gradients_mean=True)
        init()

    if self.__dataset is None:
        print("Warning: `dataset` is None. Please call func: `set_dataset($dataset)`.")
    if self.__network is None:
        print("Warning: `network` is None. Please call func: `set_network($network)`.")
    if self.__dataset is None or self.__network is None:
        return

    if self.__do_eval:
        self.__eval()
    else:
        self.__train()
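# --- Hedged sketch (assumption): a quick check of the communication state the
# runner above sets up. get_rank/get_group_size are only meaningful after init()
# has been called in a multi-device launch; the printout format is illustrative.
from mindspore.communication import get_rank, get_group_size


def report_parallel_context():
    """Print this process's rank within the initialized communication group."""
    print(f"rank {get_rank()} of {get_group_size()} devices")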
def inception_v4_train():
    """
    Train Inceptionv4 in data parallelism
    """
    print('epoch_size: {} batch_size: {} class_num {}'.format(
        config.epoch_size, config.batch_size, config.num_classes))

    context.set_context(mode=context.GRAPH_MODE, device_target=args.platform)
    if args.platform == "Ascend":
        context.set_context(device_id=args.device_id)
        context.set_context(enable_graph_kernel=False)

    rank = 0
    if device_num > 1:
        if args.platform == "Ascend":
            init(backend_name='hccl')
        elif args.platform == "GPU":
            init()
        else:
            raise ValueError("Unsupported device target.")

        rank = get_rank()
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True,
                                          all_reduce_fusion_config=[200, 400])

    # create dataset
    train_dataset = create_dataset(dataset_path=args.dataset_path, do_train=True,
                                   repeat_num=1, batch_size=config.batch_size, shard_id=rank)
    train_step_size = train_dataset.get_dataset_size()

    # create model
    net = Inceptionv4(classes=config.num_classes)

    # loss
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")

    # learning rate
    lr = Tensor(generate_cosine_lr(steps_per_epoch=train_step_size, total_epochs=config.epoch_size))

    # split parameters into weight-decayed and non-decayed groups
    decayed_params = []
    no_decayed_params = []
    for param in net.trainable_params():
        if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
            decayed_params.append(param)
        else:
            no_decayed_params.append(param)

    # re-initialize weights (excluding biases and BN parameters) with Xavier uniform
    for param in net.trainable_params():
        if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
            param.set_data(initializer(XavierUniform(), param.data.shape, param.data.dtype))

    group_params = [{'params': decayed_params, 'weight_decay': config.weight_decay},
                    {'params': no_decayed_params},
                    {'order_params': net.trainable_params()}]

    opt = RMSProp(group_params, lr, decay=config.decay, epsilon=config.epsilon,
                  weight_decay=config.weight_decay, momentum=config.momentum,
                  loss_scale=config.loss_scale)

    if args.device_id == 0:
        print(lr)
        print(train_step_size)

    if args.resume:
        ckpt = load_checkpoint(args.resume)
        load_param_into_net(net, ckpt)

    loss_scale_manager = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)

    if args.platform == "Ascend":
        model = Model(net, loss_fn=loss, optimizer=opt,
                      metrics={'acc', 'top_1_accuracy', 'top_5_accuracy'},
                      loss_scale_manager=loss_scale_manager, amp_level=config.amp_level)
    elif args.platform == "GPU":
        model = Model(net, loss_fn=loss, optimizer=opt,
                      metrics={'acc', 'top_1_accuracy', 'top_5_accuracy'},
                      loss_scale_manager=loss_scale_manager, amp_level='O0')
    else:
        raise ValueError("Unsupported device target.")

    # define callbacks
    performance_cb = TimeMonitor(data_size=train_step_size)
    loss_cb = LossMonitor(per_print_times=train_step_size)
    ckp_save_step = config.save_checkpoint_epochs * train_step_size
    config_ck = CheckpointConfig(save_checkpoint_steps=ckp_save_step,
                                 keep_checkpoint_max=config.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix=f"inceptionV4-train-rank{rank}",
                                 directory='ckpts_rank_' + str(rank), config=config_ck)

    callbacks = [performance_cb, loss_cb]
    if device_num > 1 and config.is_save_on_master:
        if args.device_id == 0:
            callbacks.append(ckpoint_cb)
    else:
        callbacks.append(ckpoint_cb)

    # train model
    model.train(config.epoch_size, train_dataset, callbacks=callbacks, dataset_sink_mode=True)
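# --- Hedged sketch (assumption): a plausible implementation of the
# generate_cosine_lr helper referenced above. The default peak rate and the
# absence of warmup are illustrative choices; the original helper is not shown.
import numpy as np


def generate_cosine_lr(steps_per_epoch, total_epochs, lr_max=0.045, lr_end=0.0):
    """Return one learning-rate value per global step, decayed along a cosine curve."""
    total_steps = steps_per_epoch * total_epochs
    steps = np.arange(total_steps)
    decayed = lr_end + 0.5 * (lr_max - lr_end) * (1 + np.cos(np.pi * steps / total_steps))
    return decayed.astype(np.float32)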
import os

from mindspore import dtype as mstype
import mindspore.ops as ops
import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as vision
import mindspore.dataset.transforms.c_transforms as C
from mindspore.communication import init, get_rank, get_group_size
from mindspore import Tensor, Model, context
from mindspore.nn import Momentum
from mindspore.context import ParallelMode
from mindspore.train.callback import LossMonitor
from resnet import resnet50

device_id = int(os.getenv('DEVICE_ID'))
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
context.set_context(device_id=device_id)  # set device_id
init()


def create_dataset(data_path, repeat_num=1, batch_size=32, rank_id=0, rank_size=1):
    # pylint: disable=missing-docstring
    resize_height = 224
    resize_width = 224
    rescale = 1.0 / 255.0
    shift = 0.0

    # get rank_id and rank_size
    rank_id = get_rank()
    rank_size = get_group_size()
import mindspore.nn as nn
from mindspore import dtype as mstype
import mindspore.ops as ops
import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as vision
import mindspore.dataset.transforms.c_transforms as C
from mindspore.communication import init, get_rank, get_group_size
from mindspore import Tensor, Model
from mindspore.nn import Momentum
from mindspore.context import ParallelMode
from mindspore import context
from mindspore.train.callback import LossMonitor
from resnet import resnet50

context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
init("nccl")


def create_dataset(data_path, repeat_num=1, batch_size=32, rank_id=0, rank_size=1):
    # pylint: disable=missing-docstring
    resize_height = 224
    resize_width = 224
    rescale = 1.0 / 255.0
    shift = 0.0

    # get rank_id and rank_size
    rank_id = get_rank()
    rank_size = get_group_size()
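# --- Hedged sketch (assumption): one way the truncated create_dataset above
# (both the Ascend and GPU variants) could continue. The CIFAR-10 reader, the
# transform list, and the shuffle/batch settings are illustrative only and
# reuse the module aliases imported above (ds, vision, C, mstype).
def _create_dataset_sketch(data_path, repeat_num=1, batch_size=32):
    rank_id = get_rank()
    rank_size = get_group_size()

    # Shard the dataset so each device reads a distinct slice of the data.
    data_set = ds.Cifar10Dataset(data_path, num_shards=rank_size, shard_id=rank_id)

    # Typical image pipeline: resize, rescale to [0, 1], normalize, HWC -> CHW.
    trans = [
        vision.Resize((224, 224)),
        vision.Rescale(1.0 / 255.0, 0.0),
        vision.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]),
        vision.HWC2CHW(),
    ]
    type_cast_op = C.TypeCast(mstype.int32)

    data_set = data_set.map(operations=type_cast_op, input_columns="label")
    data_set = data_set.map(operations=trans, input_columns="image")
    data_set = data_set.shuffle(buffer_size=10)
    data_set = data_set.batch(batch_size=batch_size, drop_remainder=True)
    data_set = data_set.repeat(repeat_num)
    return data_set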