def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend"): """ create a train or eval imagenet2012 dataset for resnet50 Args: dataset_path(string): the path of dataset. do_train(bool): whether dataset is used for train or eval. repeat_num(int): the repeat times of dataset. Default: 1 batch_size(int): the batch size of dataset. Default: 32 target(str): the device target. Default: Ascend Returns: dataset """ if target == "Ascend": device_num = int(os.getenv("DEVICE_NUM")) rank_id = int(os.getenv("RANK_ID")) else: init("nccl") rank_id = get_rank() device_num = get_group_size() if device_num == 1: ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True) else: ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True, num_shards=device_num, shard_id=rank_id) image_size = 224 mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] std = [0.229 * 255, 0.224 * 255, 0.225 * 255] # define map operations if do_train: trans = [ C.RandomCropDecodeResize(image_size, scale=(0.08, 1.0), ratio=(0.75, 1.333)), C.RandomHorizontalFlip(prob=0.5), C.Normalize(mean=mean, std=std), C.HWC2CHW() ] else: trans = [ C.Decode(), C.Resize(256), C.CenterCrop(image_size), C.Normalize(mean=mean, std=std), C.HWC2CHW() ] type_cast_op = C2.TypeCast(mstype.int32) ds = ds.map(input_columns="image", num_parallel_workers=8, operations=trans) ds = ds.map(input_columns="label", num_parallel_workers=8, operations=type_cast_op) # apply batch operations ds = ds.batch(batch_size, drop_remainder=True) # apply dataset repeat operation ds = ds.repeat(repeat_num) return ds
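# A minimal usage sketch for create_dataset2 above, assuming a local
# ImageNet-2012 train folder (the path is a placeholder) and single-device
# training; DEVICE_NUM and RANK_ID are read inside the function.
import os

os.environ["DEVICE_NUM"] = "1"
os.environ["RANK_ID"] = "0"
train_ds = create_dataset2("/path/to/imagenet2012/train", do_train=True,
                           repeat_num=1, batch_size=32, target="Ascend")
print("batches per epoch:", train_ds.get_dataset_size())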
def train_process(q, device_id, epoch_size, device_num, enable_hccl): os.system("mkdir " + str(device_id)) os.chdir(str(device_id)) context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False) context.set_context(device_id=device_id) os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH os.environ['RANK_ID'] = str(device_id) os.environ['RANK_SIZE'] = str(device_num) if enable_hccl: context.set_auto_parallel_context( device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True, parameter_broadcast=True) auto_parallel_context().set_all_reduce_fusion_split_indices([107, 160]) init() # network net = resnet50(class_num=config.class_num) # evaluation network dist_eval_network = ClassifyCorrectCell(net) if not config.use_label_smooth: config.label_smooth_factor = 0.0 # loss loss = nn.SoftmaxCrossEntropyWithLogits( sparse=True, reduction="mean", smooth_factor=config.label_smooth_factor, num_classes=config.class_num) # train dataset dataset = create_dataset(dataset_path=dataset_path, do_train=True, repeat_num=epoch_size, batch_size=config.batch_size) step_size = dataset.get_dataset_size() eval_interval = config.eval_interval dataset.__loop_size__ = step_size * eval_interval # evaluation dataset eval_dataset = create_dataset(dataset_path=eval_path, do_train=False, repeat_num=epoch_size, batch_size=config.eval_batch_size) # loss scale loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) # learning rate lr = Tensor( get_learning_rate(lr_init=config.lr_init, lr_end=0.0, lr_max=config.lr_max, warmup_epochs=config.warmup_epochs, total_epochs=config.epoch_size, steps_per_epoch=step_size, lr_decay_mode=config.lr_decay_mode)) # optimizer decayed_params = list( filter( lambda x: 'beta' not in x.name and 'gamma' not in x.name and 'bias' not in x.name, net.trainable_params())) no_decayed_params = [ param for param in net.trainable_params() if param not in decayed_params ] group_params = [{ 'params': decayed_params, 'weight_decay': config.weight_decay }, { 'params': no_decayed_params }, { 'order_params': net.trainable_params() }] if config.use_lars: momentum = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum, use_nesterov=config.use_nesterov) opt = nn.LARS(momentum, epsilon=config.lars_epsilon, hyperpara=config.lars_coefficient, weight_decay=config.weight_decay, decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name and 'bias' not in x.name, lars_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name and 'bias' not in x.name, loss_scale=config.loss_scale) else: opt = nn.Momentum(group_params, lr, config.momentum, weight_decay=config.weight_decay, loss_scale=config.loss_scale, use_nesterov=config.use_nesterov) # model model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, amp_level="O2", keep_batchnorm_fp32=False, metrics={ 'acc': DistAccuracy(batch_size=config.eval_batch_size, device_num=device_num) }, eval_network=dist_eval_network) # model init print("init_start", device_id) model.init(dataset, eval_dataset) print("init_stop", device_id) # callbacks loss_cb = LossGet(1, step_size) # train and eval print("run_start", device_id) acc = 0.0 time_cost = 0.0 for epoch_idx in range(0, int(epoch_size / eval_interval)): model.train(1, dataset, callbacks=loss_cb) eval_start = time.time() output = model.eval(eval_dataset) eval_cost = (time.time() - eval_start) * 1000 acc = float(output["acc"]) time_cost = loss_cb.get_per_step_time() loss = loss_cb.get_loss() print("the {} epoch's resnet result:\n " "device{}, training loss {}, acc {}, " "training per step cost {:.2f} ms, eval cost {:.2f} ms, total_cost {:.2f} ms" .format(epoch_idx, device_id, loss, acc, time_cost, eval_cost, time_cost * step_size + eval_cost)) q.put({'acc': acc, 'cost': time_cost})
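# A hedged sketch of how train_process above might be launched, one process
# per device, matching its (q, device_id, epoch_size, device_num, enable_hccl)
# signature; the device count and epoch number here are placeholders.
from multiprocessing import Process, Queue

if __name__ == "__main__":
    device_num = 8
    q = Queue()
    workers = [Process(target=train_process, args=(q, dev_id, 90, device_num, True))
               for dev_id in range(device_num)]
    for p in workers:
        p.start()
    for p in workers:
        p.join()
    results = [q.get() for _ in range(device_num)]
    print("mean acc:", sum(r['acc'] for r in results) / device_num)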
def parse_args(cloud_args=None): """parameters""" parser = argparse.ArgumentParser('mindspore classification training') parser.add_argument('--platform', type=str, default='Ascend', choices=('Ascend', 'GPU'), help='run platform') # dataset related parser.add_argument('--data_dir', type=str, default='', help='train data dir') parser.add_argument('--per_batch_size', default=128, type=int, help='batch size per gpu') # network related parser.add_argument('--pretrained', default='', type=str, help='model_path, local pretrained model to load') # distributed related parser.add_argument('--is_distributed', type=int, default=1, help='if multi device') # roma obs parser.add_argument('--train_url', type=str, default="", help='train url') args, _ = parser.parse_known_args() args = merge_args(args, cloud_args) args.image_size = config.image_size args.num_classes = config.num_classes args.lr = config.lr args.lr_scheduler = config.lr_scheduler args.lr_epochs = config.lr_epochs args.lr_gamma = config.lr_gamma args.eta_min = config.eta_min args.T_max = config.T_max args.max_epoch = config.max_epoch args.backbone = config.backbone args.warmup_epochs = config.warmup_epochs args.weight_decay = config.weight_decay args.momentum = config.momentum args.is_dynamic_loss_scale = config.is_dynamic_loss_scale args.loss_scale = config.loss_scale args.label_smooth = config.label_smooth args.label_smooth_factor = config.label_smooth_factor args.ckpt_interval = config.ckpt_interval args.ckpt_save_max = config.ckpt_save_max args.ckpt_path = config.ckpt_path args.is_save_on_master = config.is_save_on_master args.rank = config.rank args.group_size = config.group_size args.lr_epochs = list(map(int, args.lr_epochs.split(','))) args.image_size = list(map(int, args.image_size.split(','))) # init distributed if args.is_distributed: init() args.rank = get_rank() args.group_size = get_group_size() else: args.rank = 0 args.group_size = 1 if args.is_dynamic_loss_scale == 1: args.loss_scale = 1 # with dynamic loss scale, do not set a static loss scale in the momentum optimizer # select for master rank save ckpt or all rank save, compatible for model parallel args.rank_save_ckpt_flag = 0 if args.is_save_on_master: if args.rank == 0: args.rank_save_ckpt_flag = 1 else: args.rank_save_ckpt_flag = 1 # logger args.outputs_dir = os.path.join( args.ckpt_path, datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S')) args.logger = get_logger(args.outputs_dir, args.rank) return args
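# merge_args is referenced above but not shown; a plausible minimal
# implementation (an assumption, not the repo's actual helper) would overlay
# any non-empty cloud_args fields onto the parsed args:
def merge_args(args, cloud_args):
    """Hypothetical helper: copy non-empty attributes of cloud_args onto args."""
    if cloud_args is None:
        return args
    for key, value in vars(cloud_args).items():
        if value not in (None, ''):
            setattr(args, key, value)
    return args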
args.lr_epochs = list(map(int, args.lr_epochs.split(','))) args.data_root = os.path.join(args.data_dir, 'train2017') args.annFile = os.path.join(args.data_dir, 'annotations/instances_train2017.json') device_id = int(os.getenv('DEVICE_ID', '0')) context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True, device_target=args.device_target, save_graphs=False, device_id=device_id) # init distributed if args.is_distributed: if args.device_target == "Ascend": init() else: init("nccl") args.rank = get_rank() args.group_size = get_group_size() # select for master rank save ckpt or all rank save, compatible for model parallel args.rank_save_ckpt_flag = 0 if args.is_save_on_master: if args.rank == 0: args.rank_save_ckpt_flag = 1 else: args.rank_save_ckpt_flag = 1 # logger args.outputs_dir = os.path.join(
ckpoint_cb = ModelCheckpoint(prefix='widedeep_train_' + str(get_rank()), directory=config.ckpt_path, config=ckptconfig) out = model.eval(ds_eval) print("=====" * 5 + "model.eval() initialized: {}".format(out)) model.train(epochs, ds_train, callbacks=[ TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback, ckpoint_cb ]) if __name__ == "__main__": wide_deep_config = WideDeepConfig() wide_deep_config.argparse_init() context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target, save_graphs=True) if wide_deep_config.device_target == "Ascend": init("hccl") elif wide_deep_config.device_target == "GPU": init("nccl") context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True, device_num=get_group_size()) train_and_eval(wide_deep_config)
def parse_args(): """Parse train arguments.""" parser = argparse.ArgumentParser('mindspore coco training') # device related parser.add_argument( '--device_target', type=str, default='GPU', choices=['Ascend', 'GPU'], help='device where the code will be implemented. (Default: GPU)') # dataset related parser.add_argument('--data_dir', required=True, type=str, help='Train dataset directory.') parser.add_argument('--per_batch_size', default=4, type=int, help='Batch size for Training. Default: 4.') parser.add_argument('--max_epoch', required=True, type=int, default=320, help='max epoch num to train the model. Default: 320.') parser.add_argument('--warmup_epochs', default=0, type=float, help='Warmup epochs. Default: 0') # network related parser.add_argument('--pretrained_backbone', default='', type=str, help='The ckpt file of DarkNet53. Default: "".') parser.add_argument( '--resume_yolov3', default='', type=str, help='The ckpt file of YOLOv3, which used to fine tune. Default: ""') # optimizer and lr related parser.add_argument( '--lr_scheduler', default='exponential', type=str, help= 'Learning rate scheduler, options: exponential, cosine_annealing. Default: exponential' ) parser.add_argument('--lr', default=0.001, type=float, help='Learning rate. Default: 0.001') parser.add_argument( '--lr_epochs', type=str, default='220,250', help= 'Epoch of changing of lr changing, split with ",". Default: 220,250') parser.add_argument( '--lr_gamma', type=float, default=0.1, help='Decrease lr by a factor of exponential lr_scheduler. Default: 0.1' ) parser.add_argument( '--eta_min', type=float, default=0., help='Eta_min in cosine_annealing scheduler. Default: 0') parser.add_argument( '--T_max', type=int, default=320, help='T-max in cosine_annealing scheduler. Default: 320') parser.add_argument('--weight_decay', type=float, default=0.0005, help='Weight decay factor. Default: 0.0005') parser.add_argument('--momentum', type=float, default=0.9, help='Momentum. Default: 0.9') # loss related parser.add_argument('--loss_scale', type=int, default=1024, help='Static loss scale. Default: 1024') parser.add_argument('--label_smooth', type=int, default=0, help='Whether to use label smooth in CE. Default:0') parser.add_argument( '--label_smooth_factor', type=float, default=0.1, help='Smooth strength of original one-hot. Default: 0.1') # logging related parser.add_argument('--log_interval', type=int, default=100, help='Logging interval steps. Default: 100') parser.add_argument('--ckpt_path', type=str, default='outputs/', help='Checkpoint save location. Default: outputs/') parser.add_argument('--ckpt_interval', type=int, default=None, help='Save checkpoint interval. Default: None') # distributed related parser.add_argument( '--is_distributed', type=int, default=0, help='Distribute train or not, 1 for yes, 0 for no. Default: 0') parser.add_argument('--rank', type=int, default=0, help='Local rank of distributed. Default: 0') parser.add_argument('--group_size', type=int, default=1, help='World size of device. Default: 1') # reset default config parser.add_argument('--training_shape', type=str, default="", help='Fix training shape. Default: ""') parser.add_argument( '--resize_rate', type=int, default=None, help='Resize rate for multi-scale training. 
Default: None') args, _ = parser.parse_known_args() args.lr_epochs = list(map(int, args.lr_epochs.split(','))) args.data_root = os.path.join(args.data_dir, 'images') args.ann_file = os.path.join(args.data_dir, 'annotation.json') # init distributed if args.is_distributed: if args.device_target == "Ascend": init() else: init("nccl") args.rank = get_rank() args.group_size = get_group_size() # logger args.outputs_dir = os.path.join( args.ckpt_path, datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S')) args.logger = get_logger(args.outputs_dir, args.rank) args.logger.save_args(args) return args
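# The exponential lr_scheduler described by --lr, --lr_epochs and --lr_gamma
# is a step decay: the rate is multiplied by lr_gamma at each boundary epoch.
# A small self-contained sketch of that rule (not the repo's warmup_step_lr):
def step_decay_lr(base_lr, lr_epochs, gamma, epoch):
    """Multiply base_lr by gamma once for every boundary already passed."""
    lr = base_lr
    for boundary in lr_epochs:
        if epoch >= boundary:
            lr *= gamma
    return lr

assert step_decay_lr(0.001, [220, 250], 0.1, 100) == 0.001
assert abs(step_decay_lr(0.001, [220, 250], 0.1, 230) - 1e-4) < 1e-12
assert abs(step_decay_lr(0.001, [220, 250], 0.1, 300) - 1e-5) < 1e-12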
def train(): """Train function.""" args.outputs_dir = params['save_model_path'] if args.group_size > 1: init() context.set_auto_parallel_context( device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) args.outputs_dir = os.path.join(args.outputs_dir, "ckpt_{}/".format(str(get_rank()))) args.rank = get_rank() else: args.outputs_dir = os.path.join(args.outputs_dir, "ckpt_0/") args.rank = 0 if args.group_size > 1: args.max_epoch = params["max_epoch_train_NP"] args.loss_scale = params['loss_scale'] / 2 args.lr_steps = list(map(int, params["lr_steps_NP"].split(','))) params['train_type'] = params['train_type_NP'] params['optimizer'] = params['optimizer_NP'] params['group_params'] = params['group_params_NP'] else: args.max_epoch = params["max_epoch_train"] args.loss_scale = params['loss_scale'] args.lr_steps = list(map(int, params["lr_steps"].split(','))) # create network print('start create network') criterion = openpose_loss() criterion.add_flags_recursive(fp32=True) network = OpenPoseNet(vggpath=params['vgg_path'], vgg_with_bn=params['vgg_with_bn']) if params["load_pretrain"]: print("load pretrain model:", params["pretrained_model_path"]) load_model(network, params["pretrained_model_path"]) train_net = BuildTrainNetwork(network, criterion) # create dataset if os.path.exists(args.jsonpath_train) and os.path.exists(args.imgpath_train) \ and os.path.exists(args.maskpath_train): print('start create dataset') else: print('Error: wrong data path') return 0 num_worker = 20 if args.group_size > 1 else 48 de_dataset_train = create_dataset(args.jsonpath_train, args.imgpath_train, args.maskpath_train, batch_size=params['batch_size'], rank=args.rank, group_size=args.group_size, num_worker=num_worker, multiprocessing=True, shuffle=True, repeat_num=1) steps_per_epoch = de_dataset_train.get_dataset_size() print("steps_per_epoch: ", steps_per_epoch) # lr scheduler lr_stage, lr_base, lr_vgg = get_lr(params['lr'] * args.group_size, params['lr_gamma'], steps_per_epoch, args.max_epoch, args.lr_steps, args.group_size, lr_type=params['lr_type'], warmup_epoch=params['warmup_epoch']) # optimizer if params['group_params']: vgg19_base_params = list( filter(lambda x: 'base.vgg_base' in x.name, train_net.trainable_params())) base_params = list( filter(lambda x: 'base.conv' in x.name, train_net.trainable_params())) stages_params = list( filter(lambda x: 'base' not in x.name, train_net.trainable_params())) group_params = [{ 'params': vgg19_base_params, 'lr': lr_vgg }, { 'params': base_params, 'lr': lr_base }, { 'params': stages_params, 'lr': lr_stage }] if params['optimizer'] == "Momentum": opt = Momentum(group_params, learning_rate=lr_stage, momentum=0.9) elif params['optimizer'] == "Adam": opt = Adam(group_params) else: raise ValueError("optimizer not support.") else: if params['optimizer'] == "Momentum": opt = Momentum(train_net.trainable_params(), learning_rate=lr_stage, momentum=0.9) elif params['optimizer'] == "Adam": opt = Adam(train_net.trainable_params(), learning_rate=lr_stage) else: raise ValueError("optimizer not support.") # callback config_ck = CheckpointConfig( save_checkpoint_steps=params['ckpt_interval'], keep_checkpoint_max=params["keep_checkpoint_max"]) ckpoint_cb = ModelCheckpoint(prefix='{}'.format(args.rank), directory=args.outputs_dir, config=config_ck) time_cb = TimeMonitor(data_size=de_dataset_train.get_dataset_size()) if args.rank == 0: callback_list = [MyLossMonitor(), time_cb, ckpoint_cb] else: callback_list = [MyLossMonitor(), time_cb] # train if 
params['train_type'] == 'clip_grad': train_net = TrainOneStepWithClipGradientCell(train_net, opt, sens=args.loss_scale) train_net.set_train() model = Model(train_net) elif params['train_type'] == 'fix_loss_scale': loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False) train_net.set_train() model = Model(train_net, optimizer=opt, loss_scale_manager=loss_scale_manager) else: raise ValueError("Type {} is not support.".format( params['train_type'])) print("============== Starting Training ==============") model.train(args.max_epoch, de_dataset_train, callbacks=callback_list, dataset_sink_mode=False) return 0
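# TrainOneStepWithClipGradientCell is repo-local and not shown here. The usual
# clip-by-global-norm rule such a cell applies looks like the NumPy sketch
# below (an assumption about its behavior, with a hypothetical clip_norm):
import numpy as np

def clip_by_global_norm(grads, clip_norm=1.0):
    """Scale every gradient so their joint L2 norm is at most clip_norm."""
    global_norm = np.sqrt(sum(np.sum(np.square(g)) for g in grads))
    scale = min(1.0, clip_norm / (global_norm + 1e-8))
    return [g * scale for g in grads]

grads = [np.array([3.0, 4.0]), np.array([0.0])]  # global norm = 5
print(clip_by_global_norm(grads))                # every entry scaled by 1/5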
def train_on_ascend(): config = config_ascend_quant print("training args: {}".format(args_opt)) print("training configure: {}".format(config)) print("parallel args: rank_id {}, device_id {}, rank_size {}".format( rank_id, device_id, rank_size)) epoch_size = config.epoch_size # distribute init if run_distribute: context.set_auto_parallel_context( device_num=rank_size, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) init() # define network network = mobilenetV2(num_classes=config.num_classes) # define loss if config.label_smooth > 0: loss = CrossEntropyWithLabelSmooth(smooth_factor=config.label_smooth, num_classes=config.num_classes) else: loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') # define dataset dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, config=config, device_target=args_opt.device_target, repeat_num=1, batch_size=config.batch_size) step_size = dataset.get_dataset_size() # load pre trained ckpt if args_opt.pre_trained: param_dict = load_checkpoint(args_opt.pre_trained) load_nonquant_param_into_quant_net(network, param_dict) # convert fusion network to quantization aware network quantizer = QuantizationAwareTraining(bn_fold=True, per_channel=[True, False], symmetric=[True, False], one_conv_fold=False) network = quantizer.quantize(network) # get learning rate lr = Tensor( get_lr(global_step=config.start_epoch * step_size, lr_init=0, lr_end=0, lr_max=config.lr, warmup_epochs=config.warmup_epochs, total_epochs=epoch_size + config.start_epoch, steps_per_epoch=step_size)) # define optimization opt = nn.Momentum( filter(lambda x: x.requires_grad, network.get_parameters()), lr, config.momentum, config.weight_decay) # define model model = Model(network, loss_fn=loss, optimizer=opt) print("============== Starting Training ==============") callback = None if rank_id == 0: callback = [Monitor(lr_init=lr.asnumpy())] if config.save_checkpoint: config_ck = CheckpointConfig( save_checkpoint_steps=config.save_checkpoint_epochs * step_size, keep_checkpoint_max=config.keep_checkpoint_max) ckpt_cb = ModelCheckpoint(prefix="mobilenetV2", directory=config.save_checkpoint_path, config=config_ck) callback += [ckpt_cb] model.train(epoch_size, dataset, callbacks=callback) print("============== End Training ==============")
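# CrossEntropyWithLabelSmooth above blends the one-hot target with a uniform
# off-value; a NumPy check of the smoothing rule it is expected to implement,
# with smooth_factor and num_classes standing in for the config values:
import numpy as np

def smooth_one_hot(label, num_classes, smooth_factor):
    """(1 - smooth_factor) on the true class, smooth_factor/(K-1) elsewhere."""
    target = np.full(num_classes, smooth_factor / (num_classes - 1), np.float32)
    target[label] = 1.0 - smooth_factor
    return target

print(smooth_one_hot(2, 5, 0.1))  # [0.025 0.025 0.9 0.025 0.025]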
def run_pretrain(): """pre-train bert_clue""" parser = argparse_init() args_opt = parser.parse_args() context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target, device_id=args_opt.device_id) context.set_context(reserve_class_name_in_scope=False) is_auto_enable_graph_kernel = _auto_enable_graph_kernel( args_opt.device_target, args_opt.enable_graph_kernel) _set_graph_kernel_context(args_opt.device_target, args_opt.enable_graph_kernel, is_auto_enable_graph_kernel) ckpt_save_dir = args_opt.save_checkpoint_path if args_opt.distribute == "true": if args_opt.device_target == 'Ascend': D.init() device_num = args_opt.device_num rank = args_opt.device_id % device_num else: D.init() device_num = D.get_group_size() rank = D.get_rank() ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str( get_rank()) + '/' context.reset_auto_parallel_context() context.set_auto_parallel_context( parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=device_num) _set_bert_all_reduce_split() else: rank = 0 device_num = 1 _check_compute_type(args_opt, is_auto_enable_graph_kernel) if args_opt.accumulation_steps > 1: logger.info("accumulation steps: {}".format( args_opt.accumulation_steps)) logger.info("global batch size: {}".format( cfg.batch_size * args_opt.accumulation_steps)) if args_opt.enable_data_sink == "true": args_opt.data_sink_steps *= args_opt.accumulation_steps logger.info("data sink steps: {}".format(args_opt.data_sink_steps)) if args_opt.enable_save_ckpt == "true": args_opt.save_checkpoint_steps *= args_opt.accumulation_steps logger.info("save checkpoint steps: {}".format( args_opt.save_checkpoint_steps)) ds = create_bert_dataset(device_num, rank, args_opt.do_shuffle, args_opt.data_dir, args_opt.schema_dir) net_with_loss = BertNetworkWithLoss(bert_net_cfg, True) new_repeat_count = args_opt.epoch_size * ds.get_dataset_size( ) // args_opt.data_sink_steps if args_opt.train_steps > 0: train_steps = args_opt.train_steps * args_opt.accumulation_steps new_repeat_count = min(new_repeat_count, train_steps // args_opt.data_sink_steps) else: args_opt.train_steps = args_opt.epoch_size * ds.get_dataset_size( ) // args_opt.accumulation_steps logger.info("train steps: {}".format(args_opt.train_steps)) optimizer = _get_optimizer(args_opt, net_with_loss) callback = [ TimeMonitor(args_opt.data_sink_steps), LossCallBack(ds.get_dataset_size()) ] if args_opt.enable_save_ckpt == "true" and args_opt.device_id % min( 8, device_num) == 0: config_ck = CheckpointConfig( save_checkpoint_steps=args_opt.save_checkpoint_steps, keep_checkpoint_max=args_opt.save_checkpoint_num) ckpoint_cb = ModelCheckpoint( prefix='checkpoint_bert', directory=None if ckpt_save_dir == "" else ckpt_save_dir, config=config_ck) callback.append(ckpoint_cb) if args_opt.load_checkpoint_path: param_dict = load_checkpoint(args_opt.load_checkpoint_path) load_param_into_net(net_with_loss, param_dict) if args_opt.enable_lossscale == "true": update_cell = DynamicLossScaleUpdateCell( loss_scale_value=cfg.loss_scale_value, scale_factor=cfg.scale_factor, scale_window=cfg.scale_window) accumulation_steps = args_opt.accumulation_steps enable_global_norm = cfg.enable_global_norm if accumulation_steps <= 1: if cfg.optimizer == 'AdamWeightDecay' and args_opt.device_target == 'GPU': net_with_grads = BertTrainOneStepWithLossScaleCellForAdam( net_with_loss, optimizer=optimizer, scale_update_cell=update_cell) else: net_with_grads = BertTrainOneStepWithLossScaleCell( net_with_loss, optimizer=optimizer, scale_update_cell=update_cell) 
else: allreduce_post = args_opt.distribute == "false" or args_opt.allreduce_post_accumulation == "true" net_with_accumulation = ( BertTrainAccumulationAllReducePostWithLossScaleCell if allreduce_post else BertTrainAccumulationAllReduceEachWithLossScaleCell) net_with_grads = net_with_accumulation( net_with_loss, optimizer=optimizer, scale_update_cell=update_cell, accumulation_steps=accumulation_steps, enable_global_norm=enable_global_norm) else: net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer) model = Model(net_with_grads) model = ConvertModelUtils().convert_to_thor_model( model, network=net_with_grads, optimizer=optimizer, frequency=cfg.Thor.frequency) model.train(new_repeat_count, ds, callbacks=callback, dataset_sink_mode=(args_opt.enable_data_sink == "true"), sink_size=args_opt.data_sink_steps)
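# The accumulation cells above update weights only every accumulation_steps
# micro-batches, so the effective global batch is batch_size * accumulation_steps.
# A framework-agnostic toy loop showing that schedule (not the Bert cell itself):
def accumulate_updates(total_steps, accumulation_steps):
    """Return the micro-batch indices at which a weight update fires."""
    fired = []
    for step in range(1, total_steps + 1):
        if step % accumulation_steps == 0:
            fired.append(step)
    return fired

print(accumulate_updates(10, 4))  # [4, 8]: two updates from ten micro-batches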
def train(): args = parse_args() # backend assert args.device_target == 'GPU' context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) if args.distributed: init("nccl") args.rank = get_rank() args.group_size = get_group_size() context.set_auto_parallel_context(parallel_mode=context.ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=args.group_size) # experiments directory args.train_dir = os.path.join(env_dir, args.train_dir, 'ckpt') if args.rank == 0: if os.path.exists(args.train_dir): shutil.rmtree(args.train_dir, ignore_errors=True) # rm existing dir makedir_p(args.train_dir) args.data_file = os.path.join(env_dir, args.data_file) # dataset dataset = TransformSegDataset(data_file=args.data_file, batch_size=args.batch_size, crop_size=args.crop_size, min_scale=args.min_scale, max_scale=args.max_scale, ignore_label=args.ignore_label, num_classes=args.num_classes, shard_id=args.rank, shard_num=args.group_size) dataset = dataset.get_transformed_dataset(repeat=1) # network network = get_model_by_name(args.model, nclass=args.num_classes, phase='train') loss = SoftmaxCrossEntropyLoss(args.num_classes, ignore_label=args.ignore_label) loss.add_flags_recursive(fp32=True) train_net = BuildTrainNetwork(network, loss) # optimizer iters_per_epoch = dataset.get_dataset_size() total_train_steps = iters_per_epoch * args.epochs lr_iter = lr_scheduler(lr_type=args.lr_type, base_lr=args.base_lr, total_train_steps=total_train_steps, lr_decay_step=args.lr_decay_step, lr_decay_rate=args.lr_decay_rate) opt = nn.Momentum(params=train_net.trainable_params(), learning_rate=lr_iter, momentum=args.momentum, weight_decay=args.wd, loss_scale=args.loss_scale) # loss scale manager_loss_scale = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False) model = Model(train_net, optimizer=opt, amp_level='O0', loss_scale_manager=manager_loss_scale) # callback for saving ckpts time_cb = TimeMonitor(data_size=iters_per_epoch) loss_cb = LossMonitor() cbs = [time_cb, loss_cb] if args.rank == 0: config_ck = CheckpointConfig(save_checkpoint_steps=args.save_steps, keep_checkpoint_max=args.keep_checkpoint_max) ckpoint_cb = ModelCheckpoint(prefix=args.model, directory=args.train_dir, config=config_ck) cbs.append(ckpoint_cb) model.train(args.epochs, dataset, callbacks=cbs, dataset_sink_mode=(args.device_target != "CPU"))
def train(): """Train function.""" args = parse_args() context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=args.device_id) if args.is_distributed: rank = args.rank_id device_num = args.device_num context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True) init() else: rank = 0 device_num = 1 # Logger args.logger = get_logger(args.outputs_dir, rank) args.rank_save_ckpt_flag = 0 if args.is_save_on_master: if rank == 0: args.rank_save_ckpt_flag = 1 else: args.rank_save_ckpt_flag = 1 # DATASET dataset = create_ocr_train_dataset(args.mindrecord_file, config.batch_size, rank_size=device_num, rank_id=rank) args.steps_per_epoch = dataset.get_dataset_size() args.logger.info('Finish loading dataset') if not args.ckpt_interval: args.ckpt_interval = args.steps_per_epoch args.logger.save_args(args) network = AttentionOCR(config.batch_size, int(config.img_width / 4), config.encoder_hidden_size, config.decoder_hidden_size, config.decoder_output_size, config.max_length, config.dropout_p) if args.pre_checkpoint_path: param_dict = load_checkpoint(args.pre_checkpoint_path) load_param_into_net(network, param_dict) network = AttentionOCRWithLossCell(network, config.max_length) lr = Tensor(config.lr, mstype.float32) opt = nn.Adam(network.trainable_params(), lr, beta1=config.adam_beta1, beta2=config.adam_beta2, loss_scale=config.loss_scale) network = TrainingWrapper(network, opt, sens=config.loss_scale) args.logger.info('Finished get network') callback = [TimeMonitor(data_size=1), LossMonitor()] if args.rank_save_ckpt_flag: ckpt_config = CheckpointConfig(save_checkpoint_steps=args.steps_per_epoch, keep_checkpoint_max=config.keep_checkpoint_max) save_ckpt_path = os.path.join(args.outputs_dir, 'ckpt_' + str(rank) + '/') ckpt_cb = ModelCheckpoint(config=ckpt_config, directory=save_ckpt_path, prefix="crnn_seq2seq_ocr") callback.append(ckpt_cb) model = Model(network) model.train(config.num_epochs, dataset, callbacks=callback, dataset_sink_mode=False) args.logger.info('==========Training Done===============')
def test(): """test method""" # init distributed if args.is_distributed: init() args.rank = get_rank() args.group_size = get_group_size() # logger args.outputs_dir = os.path.join( args.log_path, datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S')) args.logger = get_logger(args.outputs_dir, args.rank) context.reset_auto_parallel_context() if args.is_distributed: parallel_mode = ParallelMode.DATA_PARALLEL else: parallel_mode = ParallelMode.STAND_ALONE context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=1) args.logger.info('Creating Network....') network = YOLOV4CspDarkNet53(is_training=False) args.logger.info(args.pretrained) if os.path.isfile(args.pretrained): param_dict = load_checkpoint(args.pretrained) param_dict_new = {} for key, values in param_dict.items(): if key.startswith('moments.'): continue elif key.startswith('yolo_network.'): param_dict_new[key[13:]] = values else: param_dict_new[key] = values load_param_into_net(network, param_dict_new) args.logger.info('load_model {} success'.format(args.pretrained)) else: args.logger.info('{} not exists or not a pre-trained file'.format( args.pretrained)) assert FileNotFoundError( '{} not exists or not a pre-trained file'.format(args.pretrained)) exit(1) data_root = args.data_root # annFile = args.annFile config = ConfigYOLOV4CspDarkNet53() if args.testing_shape: config.test_img_shape = convert_testing_shape(args) data_txt = os.path.join(args.data_dir, 'testdev2017.txt') ds, data_size = create_yolo_datasetv2(data_root, data_txt=data_txt, batch_size=args.per_batch_size, max_epoch=1, device_num=args.group_size, rank=args.rank, shuffle=False, config=config) args.logger.info('testing shape : {}'.format(config.test_img_shape)) args.logger.info('totol {} images to eval'.format(data_size)) network.set_train(False) # init detection engine detection = DetectionEngine(args) input_shape = Tensor(tuple(config.test_img_shape), ms.float32) args.logger.info('Start inference....') for i, data in enumerate(ds.create_dict_iterator()): image = Tensor(data["image"]) image_shape = Tensor(data["image_shape"]) image_id = Tensor(data["img_id"]) prediction = network(image, input_shape) output_big, output_me, output_small = prediction output_big = output_big.asnumpy() output_me = output_me.asnumpy() output_small = output_small.asnumpy() image_id = image_id.asnumpy() image_shape = image_shape.asnumpy() detection.detect([output_small, output_me, output_big], args.per_batch_size, image_shape, image_id) if i % 1000 == 0: args.logger.info('Processing... {:.2f}% '.format( i * args.per_batch_size / data_size * 100)) args.logger.info('Calculating mAP...') detection.do_nms_for_results() result_file_path = detection.write_result() args.logger.info('result file path: {}'.format(result_file_path))
def run_pretrain(): """pre-train bert_clue""" parser = argparse.ArgumentParser(description='bert pre_training') parser.add_argument( '--device_target', type=str, default='Ascend', choices=['Ascend', 'GPU'], help='device where the code will be implemented. (Default: Ascend)') parser.add_argument("--distribute", type=str, default="false", help="Run distribute, default is false.") parser.add_argument("--epoch_size", type=int, default="1", help="Epoch size, default is 1.") parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.") parser.add_argument("--device_num", type=int, default=1, help="Use device nums, default is 1.") parser.add_argument("--enable_save_ckpt", type=str, default="true", help="Enable save checkpoint, default is true.") parser.add_argument("--enable_lossscale", type=str, default="true", help="Use lossscale or not, default is not.") parser.add_argument("--do_shuffle", type=str, default="true", help="Enable shuffle for dataset, default is true.") parser.add_argument("--enable_data_sink", type=str, default="true", help="Enable data sink, default is true.") parser.add_argument("--data_sink_steps", type=int, default="1", help="Sink steps for each epoch, default is 1.") parser.add_argument( "--accumulation_steps", type=int, default="1", help= "Accumulating gradients N times before weight update, default is 1.") parser.add_argument("--save_checkpoint_path", type=str, default="", help="Save checkpoint path") parser.add_argument("--load_checkpoint_path", type=str, default="", help="Load checkpoint file path") parser.add_argument("--save_checkpoint_steps", type=int, default=1000, help="Save checkpoint steps, " "default is 1000.") parser.add_argument("--train_steps", type=int, default=-1, help="Training Steps, default is -1, " "meaning run all steps according to epoch number.") parser.add_argument("--save_checkpoint_num", type=int, default=1, help="Save checkpoint numbers, default is 1.") parser.add_argument("--data_dir", type=str, default="", help="Data path, it is better to use absolute path") parser.add_argument("--schema_dir", type=str, default="", help="Schema path, it is better to use absolute path") args_opt = parser.parse_args() context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target, device_id=args_opt.device_id) context.set_context(reserve_class_name_in_scope=False) ckpt_save_dir = args_opt.save_checkpoint_path if args_opt.distribute == "true": if args_opt.device_target == 'Ascend': D.init('hccl') device_num = args_opt.device_num rank = args_opt.device_id % device_num else: D.init('nccl') device_num = D.get_group_size() rank = D.get_rank() ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str( rank) + '/' context.reset_auto_parallel_context() context.set_auto_parallel_context( parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True, device_num=device_num) from mindspore.parallel._auto_parallel_context import auto_parallel_context if bert_net_cfg.num_hidden_layers == 12: if bert_net_cfg.use_relative_positions: auto_parallel_context().set_all_reduce_fusion_split_indices( [29, 58, 87, 116, 145, 174, 203, 217]) else: auto_parallel_context().set_all_reduce_fusion_split_indices( [28, 55, 82, 109, 136, 163, 190, 205]) elif bert_net_cfg.num_hidden_layers == 24: if bert_net_cfg.use_relative_positions: auto_parallel_context().set_all_reduce_fusion_split_indices( [30, 90, 150, 210, 270, 330, 390, 421]) else: auto_parallel_context().set_all_reduce_fusion_split_indices( [38, 93, 148, 203, 258, 313, 368, 397]) else: rank = 
0 device_num = 1 if args_opt.device_target == 'GPU' and bert_net_cfg.compute_type != mstype.float32: logger.warning('Gpu only support fp32 temporarily, run with fp32.') bert_net_cfg.compute_type = mstype.float32 if args_opt.accumulation_steps > 1: logger.info("accumulation steps: {}".format( args_opt.accumulation_steps)) logger.info("global batch size: {}".format( bert_net_cfg.batch_size * args_opt.accumulation_steps)) if args_opt.enable_data_sink == "true": args_opt.data_sink_steps *= args_opt.accumulation_steps logger.info("data sink steps: {}".format(args_opt.data_sink_steps)) if args_opt.enable_save_ckpt == "true": args_opt.save_checkpoint_steps *= args_opt.accumulation_steps logger.info("save checkpoint steps: {}".format( args_opt.save_checkpoint_steps)) ds = create_bert_dataset(device_num, rank, args_opt.do_shuffle, args_opt.data_dir, args_opt.schema_dir) net_with_loss = BertNetworkWithLoss(bert_net_cfg, True) new_repeat_count = args_opt.epoch_size * ds.get_dataset_size( ) // args_opt.data_sink_steps if args_opt.train_steps > 0: new_repeat_count = min( new_repeat_count, args_opt.train_steps // args_opt.data_sink_steps) else: args_opt.train_steps = args_opt.epoch_size * ds.get_dataset_size() logger.info("train steps: {}".format(args_opt.train_steps)) if cfg.optimizer == 'Lamb': lr_schedule = BertLearningRate( learning_rate=cfg.Lamb.learning_rate, end_learning_rate=cfg.Lamb.end_learning_rate, warmup_steps=cfg.Lamb.warmup_steps, decay_steps=args_opt.train_steps, power=cfg.Lamb.power) params = net_with_loss.trainable_params() decay_params = list(filter(cfg.Lamb.decay_filter, params)) other_params = list( filter(lambda x: not cfg.Lamb.decay_filter(x), params)) group_params = [{ 'params': decay_params, 'weight_decay': cfg.Lamb.weight_decay }, { 'params': other_params }, { 'order_params': params }] optimizer = Lamb(group_params, learning_rate=lr_schedule, eps=cfg.Lamb.eps) elif cfg.optimizer == 'Momentum': optimizer = Momentum(net_with_loss.trainable_params(), learning_rate=cfg.Momentum.learning_rate, momentum=cfg.Momentum.momentum) elif cfg.optimizer == 'AdamWeightDecay': lr_schedule = BertLearningRate( learning_rate=cfg.AdamWeightDecay.learning_rate, end_learning_rate=cfg.AdamWeightDecay.end_learning_rate, warmup_steps=cfg.AdamWeightDecay.warmup_steps, decay_steps=args_opt.train_steps, power=cfg.AdamWeightDecay.power) params = net_with_loss.trainable_params() decay_params = list(filter(cfg.AdamWeightDecay.decay_filter, params)) other_params = list( filter(lambda x: not cfg.AdamWeightDecay.decay_filter(x), params)) group_params = [{ 'params': decay_params, 'weight_decay': cfg.AdamWeightDecay.weight_decay }, { 'params': other_params, 'weight_decay': 0.0 }, { 'order_params': params }] optimizer = AdamWeightDecay(group_params, learning_rate=lr_schedule, eps=cfg.AdamWeightDecay.eps) else: raise ValueError( "Don't support optimizer {}, only support [Lamb, Momentum, AdamWeightDecay]" .format(cfg.optimizer)) callback = [ TimeMonitor(args_opt.data_sink_steps), LossCallBack(ds.get_dataset_size()) ] if args_opt.enable_save_ckpt == "true" and args_opt.device_id % min( 8, device_num) == 0: config_ck = CheckpointConfig( save_checkpoint_steps=args_opt.save_checkpoint_steps, keep_checkpoint_max=args_opt.save_checkpoint_num) ckpoint_cb = ModelCheckpoint( prefix='checkpoint_bert', directory=None if ckpt_save_dir == "" else ckpt_save_dir, config=config_ck) callback.append(ckpoint_cb) if args_opt.load_checkpoint_path: param_dict = load_checkpoint(args_opt.load_checkpoint_path) 
load_param_into_net(net_with_loss, param_dict) if args_opt.enable_lossscale == "true": update_cell = DynamicLossScaleUpdateCell( loss_scale_value=cfg.loss_scale_value, scale_factor=cfg.scale_factor, scale_window=cfg.scale_window) if args_opt.accumulation_steps <= 1: net_with_grads = BertTrainOneStepWithLossScaleCell( net_with_loss, optimizer=optimizer, scale_update_cell=update_cell) else: accumulation_steps = args_opt.accumulation_steps net_with_grads = BertTrainAccumulateStepsWithLossScaleCell( net_with_loss, optimizer=optimizer, scale_update_cell=update_cell, accumulation_steps=accumulation_steps) else: net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer) model = Model(net_with_grads) model.train(new_repeat_count, ds, callbacks=callback, dataset_sink_mode=(args_opt.enable_data_sink == "true"), sink_size=args_opt.data_sink_steps)
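# With dataset sinking, model.train counts sink cycles of data_sink_steps steps
# rather than epochs. A quick check of the repeat-count arithmetic used above,
# with toy numbers in place of the real dataset size and step counts:
epoch_size, dataset_size, data_sink_steps, train_steps = 40, 10000, 100, 300000
new_repeat_count = epoch_size * dataset_size // data_sink_steps      # 4000
if train_steps > 0:
    new_repeat_count = min(new_repeat_count, train_steps // data_sink_steps)
print(new_repeat_count)  # 3000 sink cycles of 100 steps each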
def run_pretrain(): """pre-train bert_clue""" parser = argparse.ArgumentParser(description='bert pre_training') parser.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'GPU'], help='device where the code will be implemented. (Default: Ascend)') parser.add_argument("--distribute", type=str, default="false", help="Run distribute, default is false.") parser.add_argument("--epoch_size", type=int, default="1", help="Epoch size, default is 1.") parser.add_argument("--device_id", type=int, default=4, help="Device id, default is 0.") parser.add_argument("--device_num", type=int, default=1, help="Use device nums, default is 1.") parser.add_argument("--enable_save_ckpt", type=str, default="true", help="Enable save checkpoint, default is true.") parser.add_argument("--enable_lossscale", type=str, default="false", help="Use lossscale or not, default is not.") parser.add_argument("--do_shuffle", type=str, default="false", help="Enable shuffle for dataset, default is true.") parser.add_argument("--enable_data_sink", type=str, default="true", help="Enable data sink, default is true.") parser.add_argument("--data_sink_steps", type=int, default="100", help="Sink steps for each epoch, default is 1.") parser.add_argument("--save_checkpoint_path", type=str, default="", help="Save checkpoint path") parser.add_argument("--load_checkpoint_path", type=str, default="", help="Load checkpoint file path") parser.add_argument("--save_checkpoint_steps", type=int, default=1000, help="Save checkpoint steps, " "default is 1000.") parser.add_argument("--train_steps", type=int, default=-1, help="Training Steps, default is -1, " "meaning run all steps according to epoch number.") parser.add_argument("--save_checkpoint_num", type=int, default=1, help="Save checkpoint numbers, default is 1.") parser.add_argument("--data_dir", type=str, default="", help="Data path, it is better to use absolute path") parser.add_argument("--schema_dir", type=str, default="", help="Schema path, it is better to use absolute path") args_opt = parser.parse_args() context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target, device_id=args_opt.device_id, save_graphs=False) context.set_context(reserve_class_name_in_scope=False) context.set_context(max_call_depth=3000) ckpt_save_dir = args_opt.save_checkpoint_path if args_opt.distribute == "true": D.init() device_num = D.get_group_size() rank = D.get_rank() ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(rank) + '/' context.reset_auto_parallel_context() _set_bert_all_reduce_split() context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=device_num) else: rank = 0 device_num = 1 if args_opt.device_target == 'GPU' and bert_net_cfg.compute_type != mstype.float32: logger.warning('Gpu only support fp32 temporarily, run with fp32.') bert_net_cfg.compute_type = mstype.float32 ds = create_bert_dataset(device_num, rank, args_opt.do_shuffle, args_opt.data_dir, args_opt.schema_dir) net_with_loss = BertNetworkWithLoss(bert_net_cfg, True) new_repeat_count = args_opt.epoch_size * ds.get_dataset_size() // args_opt.data_sink_steps if args_opt.train_steps > 0: new_repeat_count = min(new_repeat_count, args_opt.train_steps // args_opt.data_sink_steps) else: args_opt.train_steps = args_opt.epoch_size * ds.get_dataset_size() logger.info("train steps: {}".format(args_opt.train_steps)) optimizer = _get_optimizer(args_opt, net_with_loss) callback = [TimeMonitor(args_opt.data_sink_steps), LossCallBack()] if 
args_opt.enable_save_ckpt == "true" and rank == 0: config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_checkpoint_steps, keep_checkpoint_max=args_opt.save_checkpoint_num) ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert', directory=ckpt_save_dir, config=config_ck) callback.append(ckpoint_cb) if args_opt.load_checkpoint_path: param_dict = load_checkpoint(args_opt.load_checkpoint_path) load_param_into_net(net_with_loss, param_dict) if args_opt.enable_lossscale == "true": update_cell = DynamicLossScaleUpdateCell(loss_scale_value=cfg.loss_scale_value, scale_factor=cfg.scale_factor, scale_window=cfg.scale_window) net_with_grads = BertTrainOneStepWithLossScaleCell(net_with_loss, optimizer=optimizer, scale_update_cell=update_cell) else: net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer) model = Model(net_with_grads, frequency=cfg.Thor.frequency) model.train(new_repeat_count, ds, callbacks=callback, dataset_sink_mode=(args_opt.enable_data_sink == "true"), sink_size=args_opt.data_sink_steps)
def create_dataset_cifar(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend"): """ create a train or evaluate cifar10 dataset Args: dataset_path(string): the path of dataset. do_train(bool): whether dataset is used for train or eval. repeat_num(int): the repeat times of dataset. Default: 1 batch_size(int): the batch size of dataset. Default: 32 target(str): the device target. Default: Ascend Returns: dataset """ if target == "Ascend": device_num, rank_id = _get_rank_info() else: init() rank_id = get_rank() device_num = get_group_size() if device_num == 1: ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True) else: ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True, num_shards=device_num, shard_id=rank_id) # define map operations if do_train: trans = [ C.RandomCrop((32, 32), (4, 4, 4, 4)), C.RandomHorizontalFlip(prob=0.5), C.RandomColorAdjust(brightness=0.4, contrast=0.4, saturation=0.4), C.Resize((227, 227)), C.Rescale(1.0 / 255.0, 0.0), C.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]), C.CutOut(112), C.HWC2CHW() ] else: trans = [ C.Resize((227, 227)), C.Rescale(1.0 / 255.0, 0.0), C.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]), C.HWC2CHW() ] type_cast_op = C2.TypeCast(mstype.int32) ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8) # apply batch operations ds = ds.batch(batch_size, drop_remainder=True) # apply dataset repeat operation ds = ds.repeat(repeat_num) return ds
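# A minimal usage sketch for create_dataset_cifar, assuming a local CIFAR-10
# binary folder (the path is a placeholder) and a single-device environment
# (however the repo-local _get_rank_info resolves it):
cifar_train = create_dataset_cifar("/path/to/cifar-10-batches-bin",
                                   do_train=True, batch_size=32)
for batch in cifar_train.create_dict_iterator():
    # images come out CHW after HWC2CHW, resized to 227x227
    print(batch["image"].shape, batch["label"].shape)
    break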
def train(): # set args dev = "GPU" epoch_size = int(args_opt.epoch_size) total_batch = int(args_opt.batch_size) print_per_steps = int(args_opt.print_per_steps) compute_type = str(args_opt.dtype).lower() ckpt_save_dir = str(args_opt.ckpt_path) save_ckpt = bool(args_opt.save_ckpt) device_num = 1 # init context if args_opt.mode == "GRAPH": mode = context.GRAPH_MODE else: mode = context.PYNATIVE_MODE context.set_context(mode=mode, device_target=dev, save_graphs=False) if args_opt.run_distribute: init() device_num = get_group_size() context.set_auto_parallel_context( device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, all_reduce_fusion_config=[85, 160]) ckpt_save_dir = ckpt_save_dir + "ckpt_" + str(get_rank()) + "/" # create dataset dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, repeat_num=1, batch_size=total_batch, target=dev, dtype=compute_type, device_num=device_num) step_size = dataset.get_dataset_size() if print_per_steps > step_size or print_per_steps < 1: print("Arg: print_per_steps should be less than or equal to dataset_size ", step_size) print("Change to default: 20") print_per_steps = 20 # define net net = resnet(class_num=1001, dtype=compute_type) # init weight for _, cell in net.cells_and_names(): if isinstance(cell, nn.Conv2d): cell.weight.set_data( weight_init.initializer(weight_init.XavierUniform(), cell.weight.shape, cell.weight.dtype)) if isinstance(cell, nn.Dense): cell.weight.set_data( weight_init.initializer(weight_init.TruncatedNormal(), cell.weight.shape, cell.weight.dtype)) # init lr lr = get_liner_lr(lr_init=0, lr_end=0, lr_max=0.8, warmup_epochs=0, total_epochs=epoch_size, steps_per_epoch=step_size) lr = Tensor(lr) # define opt decayed_params = [] no_decayed_params = [] for param in net.trainable_params(): if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name: decayed_params.append(param) else: no_decayed_params.append(param) # define loss, model loss = CrossEntropySmooth(sparse=True, reduction='mean', smooth_factor=0.1, num_classes=1001) opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, 0.9, 1e-4) loss_scale = FixedLossScaleManager(1024, drop_overflow_update=False) model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'}) # Mixed precision if compute_type == "fp16": opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, 0.9, 1e-4, 1024) model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}, amp_level="O2", keep_batchnorm_fp32=False) # define callbacks if mode == context.PYNATIVE_MODE: print_per_steps = 1 time_cb = MyTimeMonitor(total_batch, print_per_steps, step_size, mode) cb = [time_cb] if save_ckpt: config_ck = CheckpointConfig(save_checkpoint_steps=5 * step_size, keep_checkpoint_max=5) ckpt_cb = ModelCheckpoint(prefix="resnet_benchmark", directory=ckpt_save_dir, config=config_ck) cb += [ckpt_cb] # train model print("========START RESNET50 GPU BENCHMARK========") if mode == context.GRAPH_MODE: model.train(int(epoch_size * step_size / print_per_steps), dataset, callbacks=cb, sink_size=print_per_steps) else: model.train(epoch_size, dataset, callbacks=cb)
def create_dataset_imagenet(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend"): """ create a train or eval imagenet dataset Args: dataset_path(string): the path of dataset. do_train(bool): whether dataset is used for train or eval. repeat_num(int): the repeat times of dataset. Default: 1 batch_size(int): the batch size of dataset. Default: 32 target(str): the device target. Default: Ascend Returns: dataset """ if target == "Ascend": device_num, rank_id = _get_rank_info() else: init() rank_id = get_rank() device_num = get_group_size() if device_num == 1: ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) else: ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, num_shards=device_num, shard_id=rank_id) image_size = 227 mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] std = [0.229 * 255, 0.224 * 255, 0.225 * 255] # define map operations if do_train: trans = [ C.RandomCropDecodeResize(image_size, scale=(0.08, 1.0), ratio=(0.75, 1.333)), C.RandomHorizontalFlip(prob=0.5), C.RandomColorAdjust(brightness=0.4, contrast=0.4, saturation=0.4), C.Normalize(mean=mean, std=std), C.CutOut(112), C.HWC2CHW() ] else: trans = [ C.Decode(), C.Resize((256, 256)), C.CenterCrop(image_size), C.Normalize(mean=mean, std=std), C.HWC2CHW() ] type_cast_op = C2.TypeCast(mstype.int32) ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8) # apply batch operations ds = ds.batch(batch_size, drop_remainder=True) # apply dataset repeat operation ds = ds.repeat(repeat_num) return ds
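# Note the two normalization conventions in these pipelines: the ImageNet
# helpers keep pixels in [0, 255] and scale mean/std by 255, while the CIFAR
# helper rescales to [0, 1] first. Both yield the same standardized values:
import numpy as np

pixel, mean, std = np.float32(128.0), 0.485, 0.229
a = (pixel - mean * 255) / (std * 255)   # [0, 255] input, scaled statistics
b = (pixel / 255.0 - mean) / std         # rescaled input, unit statistics
assert np.isclose(a, b)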
def train(): """Train function.""" args = parse_args() # init distributed if args.is_distributed: init() args.rank = get_rank() args.group_size = get_group_size() # select for master rank save ckpt or all rank save, compatiable for model parallel args.rank_save_ckpt_flag = 0 if args.is_save_on_master: if args.rank == 0: args.rank_save_ckpt_flag = 1 else: args.rank_save_ckpt_flag = 1 # logger args.outputs_dir = os.path.join( args.ckpt_path, datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S')) args.logger = get_logger(args.outputs_dir, args.rank) args.logger.save_args(args) if args.need_profiler: from mindinsight.profiler.profiling import Profiler profiler = Profiler(output_path=args.outputs_dir, is_detail=True, is_show_op_path=True) loss_meter = AverageMeter('loss') context.reset_auto_parallel_context() if args.is_distributed: parallel_mode = ParallelMode.DATA_PARALLEL degree = get_group_size() else: parallel_mode = ParallelMode.STAND_ALONE degree = 1 context.set_auto_parallel_context(parallel_mode=parallel_mode, mirror_mean=True, device_num=degree) network = YOLOV3DarkNet53(is_training=True) # default is kaiming-normal default_recurisive_init(network) if args.pretrained_backbone: network = load_backbone(network, args.pretrained_backbone, args) args.logger.info('load pre-trained backbone {} into network'.format( args.pretrained_backbone)) else: args.logger.info('Not load pre-trained backbone, please be careful') if args.resume_yolov3: param_dict = load_checkpoint(args.resume_yolov3) param_dict_new = {} for key, values in param_dict.items(): if key.startswith('moments.'): continue elif key.startswith('yolo_network.'): param_dict_new[key[13:]] = values args.logger.info('in resume {}'.format(key)) else: param_dict_new[key] = values args.logger.info('in resume {}'.format(key)) args.logger.info('resume finished') load_param_into_net(network, param_dict_new) args.logger.info('load_model {} success'.format(args.resume_yolov3)) network = YoloWithLossCell(network) args.logger.info('finish get network') config = ConfigYOLOV3DarkNet53() config.label_smooth = args.label_smooth config.label_smooth_factor = args.label_smooth_factor if args.training_shape: config.multi_scale = [conver_training_shape(args)] if args.resize_rate: config.resize_rate = args.resize_rate ds, data_size = create_yolo_dataset(image_dir=args.data_root, anno_path=args.annFile, is_training=True, batch_size=args.per_batch_size, max_epoch=args.max_epoch, device_num=args.group_size, rank=args.rank, config=config) args.logger.info('Finish loading dataset') args.steps_per_epoch = int(data_size / args.per_batch_size / args.group_size) if not args.ckpt_interval: args.ckpt_interval = args.steps_per_epoch # lr scheduler if args.lr_scheduler == 'exponential': lr = warmup_step_lr( args.lr, args.lr_epochs, args.steps_per_epoch, args.warmup_epochs, args.max_epoch, gamma=args.lr_gamma, ) elif args.lr_scheduler == 'cosine_annealing': lr = warmup_cosine_annealing_lr(args.lr, args.steps_per_epoch, args.warmup_epochs, args.max_epoch, args.T_max, args.eta_min) elif args.lr_scheduler == 'cosine_annealing_V2': lr = warmup_cosine_annealing_lr_V2(args.lr, args.steps_per_epoch, args.warmup_epochs, args.max_epoch, args.T_max, args.eta_min) elif args.lr_scheduler == 'cosine_annealing_sample': lr = warmup_cosine_annealing_lr_sample(args.lr, args.steps_per_epoch, args.warmup_epochs, args.max_epoch, args.T_max, args.eta_min) else: raise NotImplementedError(args.lr_scheduler) opt = Momentum(params=get_param_groups(network), learning_rate=Tensor(lr), 
momentum=args.momentum, weight_decay=args.weight_decay, loss_scale=args.loss_scale) network = TrainingWrapper(network, opt) network.set_train() if args.rank_save_ckpt_flag: # checkpoint save ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval ckpt_config = CheckpointConfig( save_checkpoint_steps=args.ckpt_interval, keep_checkpoint_max=ckpt_max_num) ckpt_cb = ModelCheckpoint(config=ckpt_config, directory=args.outputs_dir, prefix='{}'.format(args.rank)) cb_params = _InternalCallbackParam() cb_params.train_network = network cb_params.epoch_num = ckpt_max_num cb_params.cur_epoch_num = 1 run_context = RunContext(cb_params) ckpt_cb.begin(run_context) old_progress = -1 t_end = time.time() data_loader = ds.create_dict_iterator() shape_record = ShapeRecord() for i, data in enumerate(data_loader): images = data["image"] input_shape = images.shape[2:4] args.logger.info('iter[{}], shape{}'.format(i, input_shape[0])) shape_record.set(input_shape) images = Tensor(images) annos = data["annotation"] if args.group_size == 1: batch_y_true_0, batch_y_true_1, batch_y_true_2, batch_gt_box0, batch_gt_box1, batch_gt_box2 = \ batch_preprocess_true_box(annos, config, input_shape) else: batch_y_true_0, batch_y_true_1, batch_y_true_2, batch_gt_box0, batch_gt_box1, batch_gt_box2 = \ batch_preprocess_true_box_single(annos, config, input_shape) batch_y_true_0 = Tensor(batch_y_true_0) batch_y_true_1 = Tensor(batch_y_true_1) batch_y_true_2 = Tensor(batch_y_true_2) batch_gt_box0 = Tensor(batch_gt_box0) batch_gt_box1 = Tensor(batch_gt_box1) batch_gt_box2 = Tensor(batch_gt_box2) input_shape = Tensor(tuple(input_shape[::-1]), ms.float32) loss = network(images, batch_y_true_0, batch_y_true_1, batch_y_true_2, batch_gt_box0, batch_gt_box1, batch_gt_box2, input_shape) loss_meter.update(loss.asnumpy()) if args.rank_save_ckpt_flag: # ckpt progress cb_params.cur_step_num = i + 1 # current step number cb_params.batch_num = i + 2 ckpt_cb.step_end(run_context) if i % args.log_interval == 0: time_used = time.time() - t_end epoch = int(i / args.steps_per_epoch) fps = args.per_batch_size * ( i - old_progress) * args.group_size / time_used if args.rank == 0: args.logger.info( 'epoch[{}], iter[{}], {}, {:.2f} imgs/sec, lr:{}'.format( epoch, i, loss_meter, fps, lr[i])) t_end = time.time() loss_meter.reset() old_progress = i if (i + 1) % args.steps_per_epoch == 0 and args.rank_save_ckpt_flag: cb_params.cur_epoch_num += 1 if args.need_profiler: if i == 10: profiler.analyse() break args.logger.info('==========end training===============')
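# The manual loop above logs throughput as images per second aggregated
# across devices; the formula isolated, with placeholder numbers:
def imgs_per_sec(per_batch_size, steps_in_window, group_size, elapsed_s):
    """Total images processed by all devices in the window, per second."""
    return per_batch_size * steps_in_window * group_size / elapsed_s

print(imgs_per_sec(32, 100, 8, 20.0))  # 1280.0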
def train(): args = parse_args() if args.device_target == "CPU": context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="CPU") else: context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True, save_graphs=False, device_target="Ascend", device_id=int(os.getenv('DEVICE_ID'))) # init multicards training if args.is_distributed: init() args.rank = get_rank() args.group_size = get_group_size() parallel_mode = ParallelMode.DATA_PARALLEL context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=args.group_size) # dataset dataset = data_generator.SegDataset(image_mean=args.image_mean, image_std=args.image_std, data_file=args.data_file, batch_size=args.batch_size, crop_size=args.crop_size, max_scale=args.max_scale, min_scale=args.min_scale, ignore_label=args.ignore_label, num_classes=args.num_classes, num_readers=2, num_parallel_calls=4, shard_id=args.rank, shard_num=args.group_size) dataset = dataset.get_dataset(repeat=1) # network if args.model == 'deeplab_v3_s16': network = net_factory.nets_map[args.model]('train', args.num_classes, 16, args.freeze_bn) elif args.model == 'deeplab_v3_s8': network = net_factory.nets_map[args.model]('train', args.num_classes, 8, args.freeze_bn) else: raise NotImplementedError('model [{:s}] not recognized'.format(args.model)) # loss loss_ = loss.SoftmaxCrossEntropyLoss(args.num_classes, args.ignore_label) loss_.add_flags_recursive(fp32=True) train_net = BuildTrainNetwork(network, loss_) # load pretrained model if args.ckpt_pre_trained: param_dict = load_checkpoint(args.ckpt_pre_trained) if args.filter_weight: filter_list = ["network.aspp.conv2.weight", "network.aspp.conv2.bias"] for key in list(param_dict.keys()): for filter_key in filter_list: if filter_key not in key: continue print('filter {}'.format(key)) del param_dict[key] load_param_into_net(train_net, param_dict) print('load_model {} success'.format(args.ckpt_pre_trained)) # optimizer iters_per_epoch = dataset.get_dataset_size() total_train_steps = iters_per_epoch * args.train_epochs if args.lr_type == 'cos': lr_iter = learning_rates.cosine_lr(args.base_lr, total_train_steps, total_train_steps) elif args.lr_type == 'poly': lr_iter = learning_rates.poly_lr(args.base_lr, total_train_steps, total_train_steps, end_lr=0.0, power=0.9) elif args.lr_type == 'exp': lr_iter = learning_rates.exponential_lr(args.base_lr, args.lr_decay_step, args.lr_decay_rate, total_train_steps, staircase=True) else: raise ValueError('unknown learning rate type') opt = nn.Momentum(params=train_net.trainable_params(), learning_rate=lr_iter, momentum=0.9, weight_decay=0.0001, loss_scale=args.loss_scale) # loss scale manager_loss_scale = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False) amp_level = "O0" if args.device_target == "CPU" else "O3" model = Model(train_net, optimizer=opt, amp_level=amp_level, loss_scale_manager=manager_loss_scale) # callback for saving ckpts time_cb = TimeMonitor(data_size=iters_per_epoch) loss_cb = LossMonitor() cbs = [time_cb, loss_cb] if args.rank == 0: config_ck = CheckpointConfig(save_checkpoint_steps=args.save_steps, keep_checkpoint_max=args.keep_checkpoint_max) ckpoint_cb = ModelCheckpoint(prefix=args.model, directory=args.train_dir, config=config_ck) cbs.append(ckpoint_cb) model.train(args.train_epochs, dataset, callbacks=cbs, dataset_sink_mode=(args.device_target != "CPU"))
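# The 'poly' option above decays the rate polynomially from base_lr toward
# end_lr; a NumPy sketch of that schedule under the usual definition (power
# 0.9, matching the call above, not the repo's learning_rates module itself):
import numpy as np

def poly_lr(base_lr, total_steps, power=0.9, end_lr=0.0):
    """Polynomial decay over total_steps."""
    steps = np.arange(total_steps)
    return (base_lr - end_lr) * (1.0 - steps / total_steps) ** power + end_lr

schedule = poly_lr(0.01, 1000)
print(schedule[0], schedule[-1])  # starts at base_lr, decays toward end_lr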
def run_transformer_train(): """ Transformer training. """ parser = argparse_init() args, _ = parser.parse_known_args() context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args.device_id) context.set_context(reserve_class_name_in_scope=False, enable_auto_mixed_precision=False) if args.distribute == "true": device_num = args.device_num context.reset_auto_parallel_context() context.set_auto_parallel_context( parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, device_num=device_num) D.init() rank_id = args.device_id % device_num save_ckpt_path = os.path.join(args.save_checkpoint_path, 'ckpt_' + str(get_rank()) + '/') else: device_num = 1 rank_id = 0 save_ckpt_path = os.path.join(args.save_checkpoint_path, 'ckpt_0/') dataset = create_transformer_dataset( epoch_count=1, rank_size=device_num, rank_id=rank_id, do_shuffle=args.do_shuffle, dataset_path=args.data_path, bucket_boundaries=args.bucket_boundaries) netwithloss = TransformerNetworkWithLoss(transformer_net_cfg, True) if args.checkpoint_path: parameter_dict = load_checkpoint(args.checkpoint_path) load_param_into_net(netwithloss, parameter_dict) lr = Tensor( create_dynamic_lr( schedule="constant*rsqrt_hidden*linear_warmup*rsqrt_decay", training_steps=dataset.get_dataset_size() * args.epoch_size, learning_rate=cfg.lr_schedule.learning_rate, warmup_steps=cfg.lr_schedule.warmup_steps, hidden_size=transformer_net_cfg.hidden_size, start_decay_step=cfg.lr_schedule.start_decay_step, min_lr=cfg.lr_schedule.min_lr), mstype.float32) optimizer = Adam(netwithloss.trainable_params(), lr) callbacks = [ TimeMonitor(dataset.get_dataset_size()), LossCallBack(rank_id=rank_id) ] if args.enable_save_ckpt == "true": if device_num == 1 or (device_num > 1 and rank_id == 0): ckpt_config = CheckpointConfig( save_checkpoint_steps=args.save_checkpoint_steps, keep_checkpoint_max=args.save_checkpoint_num) ckpoint_cb = ModelCheckpoint(prefix='transformer', directory=save_ckpt_path, config=ckpt_config) callbacks.append(ckpoint_cb) if args.enable_lossscale == "true": scale_manager = DynamicLossScaleManager( init_loss_scale=cfg.init_loss_scale_value, scale_factor=cfg.scale_factor, scale_window=cfg.scale_window) update_cell = scale_manager.get_update_cell() netwithgrads = TransformerTrainOneStepWithLossScaleCell( netwithloss, optimizer=optimizer, scale_update_cell=update_cell) else: netwithgrads = TransformerTrainOneStepCell(netwithloss, optimizer=optimizer) netwithgrads.set_train(True) model = Model(netwithgrads) model.train(args.epoch_size, dataset, callbacks=callbacks, dataset_sink_mode=False)
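# --- Illustration: the schedule string passed to create_dynamic_lr above ---
# "constant*rsqrt_hidden*linear_warmup*rsqrt_decay" reads as a product of
# per-step factors (the classic Transformer "Noam" shape). This is a hedged
# sketch of that composition, not the repository's implementation.
def noam_like_lr(step, learning_rate, hidden_size, warmup_steps):
    constant = learning_rate
    rsqrt_hidden = hidden_size ** -0.5
    linear_warmup = min(1.0, step / warmup_steps)
    rsqrt_decay = max(step, warmup_steps) ** -0.5
    return constant * rsqrt_hidden * linear_warmup * rsqrt_decay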
def get_args(phase):
    """Define the common options that are used in both training and test."""
    parser = argparse.ArgumentParser(description='Configuration')

    # Hardware specifications
    parser.add_argument('--seed', type=int, default=1, help='random seed')
    parser.add_argument('--device_id', type=int, default=0, help='device id, default is 0.')
    parser.add_argument('--device_num', type=int, default=1, help='device num, default is 1.')
    parser.add_argument('--platform', type=str, default="Ascend",
                        help='run platform, only Ascend is supported')
    parser.add_argument('--save_graphs', type=ast.literal_eval, default=False,
                        help='whether to save graphs, default is False.')
    parser.add_argument('--dataset', type=str, default="large", choices=("large", "small", "demo"),
                        help='MIND dataset, supports large, small and demo.')
    parser.add_argument('--dataset_path', type=str, default=None, help='MIND dataset path.')

    # Model specifications
    parser.add_argument('--n_browsed_news', type=int, default=50, help='number of browsed news per user')
    parser.add_argument('--n_words_title', type=int, default=16, help='number of words per title')
    parser.add_argument('--n_words_abstract', type=int, default=48, help='number of words per abstract')
    parser.add_argument('--word_embedding_dim', type=int, default=304, help='dimension of word embedding vector')
    parser.add_argument('--category_embedding_dim', type=int, default=112,
                        help='dimension of category embedding vector')
    parser.add_argument('--query_vector_dim', type=int, default=208,
                        help='dimension of the query vector in attention')
    parser.add_argument('--n_filters', type=int, default=400, help='number of filters in CNN')
    parser.add_argument('--window_size', type=int, default=3, help='size of filter in CNN')
    parser.add_argument('--checkpoint_path', type=str, default=None,
                        help='Pretrained checkpoint path, default is None.')
    parser.add_argument('--batch_size', type=int, default=64, help='size of each batch')

    # Training specifications
    if phase == "train":
        parser.add_argument('--epochs', type=int, default=None, help='number of epochs for training')
        parser.add_argument('--lr', type=float, default=None, help='learning rate')
        parser.add_argument('--beta1', type=float, default=0.9, help='ADAM beta1')
        parser.add_argument('--beta2', type=float, default=0.999, help='ADAM beta2')
        parser.add_argument('--epsilon', type=float, default=1e-8, help='ADAM epsilon for numerical stability')
        parser.add_argument('--neg_sample', type=int, default=4,
                            help='number of negative samples in negative sampling')
        parser.add_argument('--mixed', type=ast.literal_eval, default=True,
                            help='whether to use mixed precision, default is True.')
        parser.add_argument('--sink_mode', type=ast.literal_eval, default=True,
                            help='whether to use dataset sink, default is True.')
        parser.add_argument('--print_times', type=int, default=None,
                            help='number of print times, default is None')
        parser.add_argument('--weight_decay', type=ast.literal_eval, default=True,
                            help='whether to use weight decay, default is True.')
        parser.add_argument('--save_checkpoint', type=ast.literal_eval, default=True,
                            help='whether to save checkpoint, default is True.')
        parser.add_argument('--save_checkpoint_path', type=str, default="./checkpoint",
                            help='Save checkpoint path, default is ./checkpoint.')
        parser.add_argument('--dropout_ratio', type=float, default=0.2, help='ratio of dropout')
    if phase == "eval":
        parser.add_argument('--neg_sample', type=int, default=-1,
                            help='number of negative samples in negative sampling')
    if phase == "export":
        parser.add_argument('--file_format', type=str, choices=["AIR", "ONNX", "MINDIR"], default='AIR',
                            help='file format')
        parser.add_argument('--neg_sample', type=int, default=-1,
                            help='number of negative samples in negative sampling')
    args = parser.parse_args()

    if args.device_num > 1:
        context.set_context(mode=context.GRAPH_MODE,
                            device_target=args.platform,
                            save_graphs=args.save_graphs)
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True,
                                          device_num=args.device_num)
        init()
        args.rank = get_rank()
        args.save_checkpoint_path = os.path.join(args.save_checkpoint_path, "ckpt_" + str(args.rank))
    else:
        context.set_context(mode=context.GRAPH_MODE,
                            device_target=args.platform,
                            device_id=args.device_id,
                            save_graphs=args.save_graphs,
                            save_graphs_path="naml_ir")
        args.rank = 0
        args.device_num = 1

    args.phase = phase
    cfg = get_dataset_config(args.dataset)
    args.n_categories = cfg.n_categories
    args.n_sub_categories = cfg.n_sub_categories
    args.n_words = cfg.n_words
    if phase == "train":
        args.epochs = cfg.epochs * math.ceil(args.device_num ** 0.5) if args.epochs is None else args.epochs
        args.lr = cfg.lr if args.lr is None else args.lr
        args.print_times = cfg.print_times if args.print_times is None else args.print_times
    args.embedding_file = cfg.embedding_file.format(args.dataset_path)
    args.word_dict_path = cfg.word_dict_path.format(args.dataset_path)
    args.category_dict_path = cfg.category_dict_path.format(args.dataset_path)
    args.subcategory_dict_path = cfg.subcategory_dict_path.format(args.dataset_path)
    args.uid2index_path = cfg.uid2index_path.format(args.dataset_path)
    args.train_dataset_path = cfg.train_dataset_path.format(args.dataset_path)
    args.eval_dataset_path = cfg.eval_dataset_path.format(args.dataset_path)
    args_dict = vars(args)
    for key in args_dict.keys():
        print('--> {}:{}'.format(key, args_dict[key]), flush=True)
    return args
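# --- Illustration: the default-epoch rule in get_args ---
# When --epochs is not given, training length scales with the square root of
# the card count: epochs = cfg.epochs * ceil(sqrt(device_num)). A quick check
# of the arithmetic (assuming cfg.epochs == 1):
import math

for device_num in (1, 2, 8):
    print(device_num, "cards ->", 1 * math.ceil(device_num ** 0.5), "epochs")
# 1 card -> 1 epoch, 2 cards -> 2 epochs, 8 cards -> 3 epochs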
def _init_parallel(self): self._init_parallel_flag = False init(backend_name='hccl') self._init_parallel_flag = True
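# --- Illustration: the guard flag set by _init_parallel ---
# The flag makes teardown safe when init never completed. A hedged sketch of
# the matching release step (release() is the mindspore.communication
# counterpart of init(); this pairing is an assumption, not shown above):
def _release_parallel(self):
    if self._init_parallel_flag:
        release()  # from mindspore.communication.management import release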
def main():
    parser = argparse.ArgumentParser(description="YOLOv3 train")
    parser.add_argument("--only_create_dataset", type=bool, default=False,
                        help="If set to true, only create Mindrecord, default is false.")
    parser.add_argument("--distribute", type=bool, default=False,
                        help="Run distribute, default is false.")
    parser.add_argument("--device_id", type=int, default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--device_num", type=int, default=1,
                        help="Use device nums, default is 1.")
    parser.add_argument("--lr", type=float, default=0.001,
                        help="Learning rate, default is 0.001.")
    parser.add_argument("--mode", type=str, default="sink",
                        help="Run sink mode or not, default is sink.")
    parser.add_argument("--epoch_size", type=int, default=10,
                        help="Epoch size, default is 10.")
    parser.add_argument("--batch_size", type=int, default=32,
                        help="Batch size, default is 32.")
    parser.add_argument("--pre_trained", type=str, default=None,
                        help="Pretrained checkpoint file path.")
    parser.add_argument("--pre_trained_epoch_size", type=int, default=0,
                        help="Pretrained epoch size.")
    parser.add_argument("--save_checkpoint_epochs", type=int, default=5,
                        help="Save checkpoint epochs, default is 5.")
    parser.add_argument("--loss_scale", type=int, default=1024,
                        help="Loss scale, default is 1024.")
    parser.add_argument("--mindrecord_dir", type=str, default="./Mindrecord_train",
                        help="Mindrecord directory. If mindrecord_dir is empty, mindrecord files will be "
                             "generated from image_dir and anno_path. Note that if mindrecord_dir is not "
                             "empty, it will be used instead of image_dir and anno_path. "
                             "Default is ./Mindrecord_train.")
    parser.add_argument("--image_dir", type=str, default="",
                        help="Dataset directory; the absolute image path is joined from image_dir "
                             "and the relative path in anno_path.")
    parser.add_argument("--anno_path", type=str, default="", help="Annotation path.")
    args_opt = parser.parse_args()

    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id)

    if args_opt.distribute:
        device_num = args_opt.device_num
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                          mirror_mean=True,
                                          device_num=device_num)
        init()
        rank = args_opt.device_id % device_num
    else:
        rank = 0
        device_num = 1

    print("Start create dataset!")

    # Mindrecord files are generated in args_opt.mindrecord_dir,
    # named yolo.mindrecord0, 1, ... file_num.
    if not os.path.isdir(args_opt.mindrecord_dir):
        os.makedirs(args_opt.mindrecord_dir)

    prefix = "yolo.mindrecord"
    mindrecord_file = os.path.join(args_opt.mindrecord_dir, prefix + "0")
    if not os.path.exists(mindrecord_file):
        if os.path.isdir(args_opt.image_dir) and os.path.exists(args_opt.anno_path):
            print("Create Mindrecord.")
            data_to_mindrecord_byte_image(args_opt.image_dir,
                                          args_opt.anno_path,
                                          args_opt.mindrecord_dir,
                                          prefix,
                                          8)
            print("Create Mindrecord Done, at {}".format(args_opt.mindrecord_dir))
        else:
            print("image_dir or anno_path does not exist.")

    if not args_opt.only_create_dataset:
        loss_scale = float(args_opt.loss_scale)

        # When creating the MindDataset, use the first mindrecord file, e.g. yolo.mindrecord0.
        dataset = create_yolo_dataset(mindrecord_file,
                                      batch_size=args_opt.batch_size,
                                      device_num=device_num,
                                      rank=rank)
        dataset_size = dataset.get_dataset_size()
        print("Create dataset done!")

        net = yolov3_resnet18(ConfigYOLOV3ResNet18())
        net = YoloWithLossCell(net, ConfigYOLOV3ResNet18())
        init_net_param(net, "XavierUniform")

        # checkpoint
        ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * args_opt.save_checkpoint_epochs)
        ckpoint_cb = ModelCheckpoint(prefix="yolov3", directory=None, config=ckpt_config)

        if args_opt.pre_trained:
            if args_opt.pre_trained_epoch_size <= 0:
                raise ValueError("pre_trained_epoch_size must be greater than 0.")
            param_dict = load_checkpoint(args_opt.pre_trained)
            load_param_into_net(net, param_dict)

        total_epoch_size = 60
        if args_opt.distribute:
            total_epoch_size = 160
        lr = Tensor(get_lr(learning_rate=args_opt.lr,
                           start_step=args_opt.pre_trained_epoch_size * dataset_size,
                           global_step=total_epoch_size * dataset_size,
                           decay_step=1000,
                           decay_rate=0.95,
                           steps=True))
        opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()), lr, loss_scale=loss_scale)
        net = TrainingWrapper(net, opt, loss_scale)

        callback = [TimeMonitor(data_size=dataset_size), LossMonitor(), ckpoint_cb]

        model = Model(net)
        dataset_sink_mode = False
        if args_opt.mode == "sink":
            print("In sink mode, one epoch returns one loss value.")
            dataset_sink_mode = True
        print("Start train YOLOv3, the first epoch will be slower because of the graph compilation.")
        model.train(args_opt.epoch_size,
                    dataset,
                    callbacks=callback,
                    dataset_sink_mode=dataset_sink_mode)
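# --- Illustration: the decayed schedule requested via get_lr above ---
# A hedged sketch assuming get_lr implements staircase exponential decay with
# the arguments shown (decay_rate=0.95 every decay_step=1000 steps); the real
# array comes from the repository's lr module.
import numpy as np

def staircase_exp_lr(lr0, total_steps, decay_step=1000, decay_rate=0.95):
    steps = np.arange(total_steps)
    return lr0 * np.power(decay_rate, steps // decay_step)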
def _setup_parallel_env(): context.reset_auto_parallel_context() MultiAscend.init() context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, device_num=MultiAscend.get_group_size(), gradients_mean=True)
def run_translation():
    """run the translation task"""
    parser = argparse.ArgumentParser(description="Finetune and Evaluate Translation")
    parser.add_argument("--device_target", type=str, default="Ascend",
                        help="Device type. Default: Ascend.")
    parser.add_argument("--device_id", type=int, default=0,
                        help="ID of the target device.")
    parser.add_argument("--metric_method", type=str, default="BLEU",
                        help="The eval method including [BLEU]. Default: BLEU.")
    parser.add_argument("--do_train", type=str, default="true",
                        help="Enable train. Default: true.")
    parser.add_argument("--do_eval", type=str, default="false",
                        help="Enable evaluation. Default: false.")
    parser.add_argument("--epoch_num", type=int, default=5,
                        help="Epoch number. Default: 5.")
    parser.add_argument("--train_data_shuffle", type=str, default="true",
                        help="Enable train data shuffle. Default: true.")
    parser.add_argument("--eval_data_shuffle", type=str, default="false",
                        help="Enable eval data shuffle. Default: false.")
    parser.add_argument("--save_finetune_ckpt_path", type=str,
                        default="/home/tju/gpt2/MindSpore-GPT2/pretrained-weight/saved/",
                        help="Save the checkpoint path.")
    parser.add_argument("--load_pretrain_ckpt_path", type=str,
                        default="/home/tju/gpt2/MindSpore-GPT2/pretrained-weight/mindspore_model_small.ckpt",
                        help="Load the checkpoint file path.")
    parser.add_argument("--load_finetune_ckpt_path", type=str,
                        default="/home/tju/gpt2/MindSpore-GPT2/pretrained-weight/mindspore_model_small.ckpt",
                        help="Load the checkpoint file path.")
    parser.add_argument("--train_data_file_path", type=str,
                        default="/home/tju/gpt2/MindSpore-GPT2/mindspore-dataset/en-fr-train-mindrecord",
                        help="Data path; it is better to use an absolute path.")
    parser.add_argument("--eval_data_file_path", type=str,
                        default="/home/tju/gpt2/MindSpore-GPT2/mindspore-dataset/en-fr-test-mindrecord",
                        help="Data path; it is better to use an absolute path.")
    parser.add_argument("--translate_direction", type=str, default="en-fr",
                        help="translate from Language_A to Language_B: ['en-fr', 'fr-en']")
    parser.add_argument("--device_num", type=int, default=1, help="device number")
    args_opt = parser.parse_args()

    epoch_num = args_opt.epoch_num
    metric = args_opt.metric_method
    save_finetune_ckpt_path = args_opt.save_finetune_ckpt_path
    load_finetune_ckpt_path = args_opt.load_finetune_ckpt_path
    load_pretrain_ckpt_path = args_opt.load_pretrain_ckpt_path

    if args_opt.do_train.lower() == "false" and args_opt.do_eval.lower() == "false":
        raise ValueError("At least one of 'do_train' or 'do_eval' must be true")
    if args_opt.do_train.lower() == "true" and args_opt.train_data_file_path == "":
        raise ValueError("'train_data_file_path' must be set when doing the finetune task")
    if args_opt.do_eval.lower() == "true" and args_opt.eval_data_file_path == "":
        raise ValueError("'eval_data_file_path' must be set when doing the evaluation task")

    translate_direction = args_opt.translate_direction
    if translate_direction not in ['en-fr', 'fr-en']:
        raise ValueError("--translate_direction should be in the set: ['en-fr', 'fr-en']")

    device_target = args_opt.device_target
    if device_target == "GPU":
        device_id = args_opt.device_id
        context.set_context(mode=context.GRAPH_MODE,
                            device_target="GPU",
                            device_id=device_id,
                            max_call_depth=3000)
        context.set_auto_parallel_context(parallel_mode="stand_alone")
    elif device_target == "Ascend":
        device_id = int(os.getenv('DEVICE_ID'))
        print("-------| This is {} device, {} target, {} device numbers |------".
              format(device_id, device_target, args_opt.device_num))
        context.set_context(mode=context.GRAPH_MODE,
                            device_target=device_target,
                            device_id=device_id)
        context.set_auto_parallel_context(device_num=args_opt.device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
        init()
        print("-------| HCCL init finished |-------")
        save_finetune_ckpt_path = save_finetune_ckpt_path + 'ckpt_' + str(get_rank()) + "/"
    else:
        raise ValueError("Device target error; only Ascend and GPU are supported.")

    if args_opt.do_train.lower() == "true":
        gpt2_loss = GPT2Translation(config=gpt2_net_cfg,
                                    is_training=True,
                                    use_one_hot_embeddings=False)
        print("============== Start Loading Train Dataset ==============")
        train_dataset = create_translation_dataset(dataset_path=args_opt.train_data_file_path)
        do_train(train_dataset, gpt2_loss, load_pretrain_ckpt_path,
                 save_finetune_ckpt_path, epoch_num)

    if args_opt.do_eval.lower() == "true":
        print("============ Start Loading Evaluation Dataset ============")
        # Use the --eval_data_file_path argument instead of the previous
        # hard-coded "/home/tju/gpt2/<direction>-test-mindrecord" path.
        eval_dataset = create_translation_dataset(dataset_path=args_opt.eval_data_file_path,
                                                  rank_id=device_id,
                                                  device_num=args_opt.device_num)
        do_eval(eval_dataset, GPT2TranslationModel, metric,
                load_finetune_ckpt_path, translate_direction)
from mindspore import Tensor
from mindspore.ops import operations as P
import mindspore.nn as nn
import numpy as np
import mindspore.context as context
from mindspore.common.initializer import initializer
from mindspore.common.parameter import Parameter
from mindspore.communication.management import init, NCCL_WORLD_COMM_GROUP, get_rank, get_group_size

context.set_context(mode=context.GRAPH_MODE, device_target='GPU')
init('nccl')
rank = get_rank()
size = get_group_size()
# Give each rank a distinct constant input so reduce results are easy to verify.
x = np.ones([size, 1, 3, 3]).astype(np.float32) * 0.01 * (rank + 1)

class Net(nn.Cell):
    def __init__(self):
        super(Net, self).__init__()
        self.x = Parameter(initializer(Tensor(x), x.shape), name='x')
        self.op0 = "sum"
        self.op1 = "max"
        self.op2 = "min"
        self.op3 = "prod"
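# --- Illustration: how the op strings above are typically consumed ---
# "sum"/"max"/"min"/"prod" are the ReduceOp values accepted by AllReduce. A
# hedged standalone sketch (this mirrors, rather than reproduces, the
# truncated test cell):
class AllReduceNet(nn.Cell):
    def __init__(self, op):
        super(AllReduceNet, self).__init__()
        self.param = Parameter(initializer(Tensor(x), x.shape), name='param')
        self.all_reduce = P.AllReduce(op, group=NCCL_WORLD_COMM_GROUP)

    def construct(self):
        # Reduce the per-rank parameter across the NCCL world group.
        return self.all_reduce(self.param)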
def train_process_thor(q, device_id, epoch_size, device_num, enable_hccl):
    os.system("mkdir " + str(device_id))
    os.chdir(str(device_id))
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False)
    context.set_context(device_id=device_id)
    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH_2
    os.environ['RANK_ID'] = str(device_id - 4)
    os.environ['RANK_SIZE'] = str(device_num)
    if enable_hccl:
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          mirror_mean=True,
                                          parameter_broadcast=True)
        auto_parallel_context().set_all_reduce_fusion_split_indices([107], "hccl_world_groupsum1")
        auto_parallel_context().set_all_reduce_fusion_split_indices([27], "hccl_world_groupsum2")
        auto_parallel_context().set_all_reduce_fusion_split_indices([27], "hccl_world_groupsum3")
        auto_parallel_context().set_all_reduce_fusion_split_indices([27], "hccl_world_groupsum4")
        auto_parallel_context().set_all_reduce_fusion_split_indices([27], "hccl_world_groupsum5")
        init()

    # network
    damping = get_model_damping(0, 0.03, 0.87, 50, 5004)
    net = resnet50_thor(class_num=thor_config.class_num,
                        damping=damping,
                        loss_scale=thor_config.loss_scale,
                        frequency=thor_config.frequency)

    # evaluation network
    dist_eval_network = ClassifyCorrectCell(net)

    if not thor_config.label_smooth:
        thor_config.label_smooth_factor = 0.0

    # loss
    loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True,
                                            reduction="mean",
                                            smooth_factor=thor_config.label_smooth_factor,
                                            num_classes=thor_config.class_num)

    # train dataset
    dataset = create_dataset(dataset_path=dataset_path,
                             do_train=True,
                             repeat_num=epoch_size,
                             batch_size=thor_config.batch_size)
    step_size = dataset.get_dataset_size()
    eval_interval = thor_config.eval_interval

    # evaluation dataset
    eval_dataset = create_dataset(dataset_path=eval_path,
                                  do_train=False,
                                  repeat_num=epoch_size,
                                  batch_size=thor_config.eval_batch_size)

    # loss scale
    loss_scale = FixedLossScaleManager(thor_config.loss_scale, drop_overflow_update=False)

    # learning rate
    lr = Tensor(get_model_lr(0, 0.045, 6, 70, 5004))

    # optimizer
    opt = THOR(filter(lambda x: x.requires_grad, net.get_parameters()),
               lr,
               thor_config.momentum,
               filter(lambda x: 'matrix_A' in x.name, net.get_parameters()),
               filter(lambda x: 'matrix_G' in x.name, net.get_parameters()),
               filter(lambda x: 'A_inv_max' in x.name, net.get_parameters()),
               filter(lambda x: 'G_inv_max' in x.name, net.get_parameters()),
               thor_config.weight_decay,
               thor_config.loss_scale)

    # model
    model = THOR_Model(net,
                       loss_fn=loss,
                       optimizer=opt,
                       loss_scale_manager=loss_scale,
                       amp_level="O2",
                       keep_batchnorm_fp32=False,
                       metrics={'acc': DistAccuracy(batch_size=thor_config.eval_batch_size,
                                                    device_num=device_num)},
                       eval_network=dist_eval_network,
                       frequency=thor_config.frequency)

    # model init
    print("init_start", device_id)
    model.init(dataset, eval_dataset)
    print("init_stop", device_id)

    # callbacks
    loss_cb = LossGet(1, step_size)

    # train and eval
    acc = 0.0
    time_cost = 0.0
    print("run_start", device_id)
    for epoch_idx in range(0, int(epoch_size / eval_interval)):
        model.train(eval_interval, dataset, callbacks=loss_cb)
        eval_start = time.time()
        output = model.eval(eval_dataset)
        eval_cost = (time.time() - eval_start) * 1000
        acc = float(output["acc"])
        time_cost = loss_cb.get_per_step_time()
        loss = loss_cb.get_loss()
        print("the {} epoch's resnet result:\n "
              "device{}, training loss {}, acc {}, "
              "training per step cost {:.2f} ms, eval cost {:.2f} ms, total_cost {:.2f} ms"
              .format(epoch_idx, device_id, loss, acc, time_cost, eval_cost,
                      time_cost * step_size + eval_cost))
    q.put({'acc': acc, 'cost': time_cost})
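# --- Illustration: consuming the result queue of train_process_thor ---
# A hedged sketch of the typical driver: one process per device (ids 4..7
# match the RANK_ID = device_id - 4 offset used above), then one result dict
# per rank is read back from the queue. launch_thor is illustrative, not part
# of the test script.
from multiprocessing import Process, Queue

def launch_thor(epoch_size=1, device_num=4, enable_hccl=True):
    q = Queue()
    procs = [Process(target=train_process_thor,
                     args=(q, dev_id, epoch_size, device_num, enable_hccl))
             for dev_id in range(4, 4 + device_num)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    return [q.get() for _ in procs]  # [{'acc': ..., 'cost': ...}, ...]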
def main():
    parser = argparse.ArgumentParser(description="SSD training")
    parser.add_argument("--only_create_dataset", type=bool, default=False,
                        help="If set to true, only create Mindrecord, default is False.")
    parser.add_argument("--distribute", type=bool, default=False,
                        help="Run distribute, default is False.")
    parser.add_argument("--device_id", type=int, default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--device_num", type=int, default=1,
                        help="Use device nums, default is 1.")
    parser.add_argument("--lr", type=float, default=0.05,
                        help="Learning rate, default is 0.05.")
    parser.add_argument("--mode", type=str, default="sink",
                        help="Run sink mode or not, default is sink.")
    parser.add_argument("--dataset", type=str, default="coco",
                        help="Dataset, default is coco.")
    parser.add_argument("--epoch_size", type=int, default=250,
                        help="Epoch size, default is 250.")
    parser.add_argument("--batch_size", type=int, default=32,
                        help="Batch size, default is 32.")
    parser.add_argument("--pre_trained", type=str, default=None,
                        help="Pretrained checkpoint file path.")
    parser.add_argument("--pre_trained_epoch_size", type=int, default=0,
                        help="Pretrained epoch size.")
    parser.add_argument("--save_checkpoint_epochs", type=int, default=10,
                        help="Save checkpoint epochs, default is 10.")
    parser.add_argument("--loss_scale", type=int, default=1024,
                        help="Loss scale, default is 1024.")
    args_opt = parser.parse_args()

    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id)

    if args_opt.distribute:
        device_num = args_opt.device_num
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                          mirror_mean=True,
                                          device_num=device_num)
        init()
        rank = args_opt.device_id % device_num
    else:
        rank = 0
        device_num = 1

    print("Start create dataset!")

    # Mindrecord files are generated in config.mindrecord_dir,
    # named ssd.mindrecord0, 1, ... file_num.
    prefix = "ssd.mindrecord"
    mindrecord_dir = config.mindrecord_dir
    mindrecord_file = os.path.join(mindrecord_dir, prefix + "0")
    if not os.path.exists(mindrecord_file):
        if not os.path.isdir(mindrecord_dir):
            os.makedirs(mindrecord_dir)
        if args_opt.dataset == "coco":
            if os.path.isdir(config.coco_root):
                print("Create Mindrecord.")
                data_to_mindrecord_byte_image("coco", True, prefix)
                print("Create Mindrecord Done, at {}".format(mindrecord_dir))
            else:
                print("coco_root does not exist.")
        else:
            if os.path.isdir(config.image_dir) and os.path.exists(config.anno_path):
                print("Create Mindrecord.")
                data_to_mindrecord_byte_image("other", True, prefix)
                print("Create Mindrecord Done, at {}".format(mindrecord_dir))
            else:
                print("image_dir or anno_path does not exist.")

    if not args_opt.only_create_dataset:
        loss_scale = float(args_opt.loss_scale)

        # When creating the MindDataset, use the first mindrecord file, e.g. ssd.mindrecord0.
        dataset = create_ssd_dataset(mindrecord_file,
                                     repeat_num=1,
                                     batch_size=args_opt.batch_size,
                                     device_num=device_num,
                                     rank=rank)
        dataset_size = dataset.get_dataset_size()
        print("Create dataset done!")

        backbone = ssd_mobilenet_v2()
        ssd = SSD300(backbone=backbone, config=config)
        net = SSDWithLossCell(ssd, config)
        init_net_param(net)

        # checkpoint
        ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * args_opt.save_checkpoint_epochs)
        ckpoint_cb = ModelCheckpoint(prefix="ssd", directory=None, config=ckpt_config)

        if args_opt.pre_trained:
            if args_opt.pre_trained_epoch_size <= 0:
                raise ValueError("pre_trained_epoch_size must be greater than 0.")
            param_dict = load_checkpoint(args_opt.pre_trained)
            load_param_into_net(net, param_dict)

        lr = Tensor(get_lr(global_step=config.global_step,
                           lr_init=config.lr_init,
                           lr_end=config.lr_end_rate * args_opt.lr,
                           lr_max=args_opt.lr,
                           warmup_epochs=config.warmup_epochs,
                           total_epochs=args_opt.epoch_size,
                           steps_per_epoch=dataset_size))
        opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                          lr, config.momentum, config.weight_decay, loss_scale)
        net = TrainingWrapper(net, opt, loss_scale)

        callback = [TimeMonitor(data_size=dataset_size), LossMonitor(), ckpoint_cb]

        model = Model(net)
        dataset_sink_mode = False
        if args_opt.mode == "sink":
            print("In sink mode, one epoch returns one loss value.")
            dataset_sink_mode = True
        print("Start train SSD, the first epoch will be slower because of the graph compilation.")
        model.train(args_opt.epoch_size,
                    dataset,
                    callbacks=callback,
                    dataset_sink_mode=dataset_sink_mode)
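# --- Illustration: the warmup segment of the SSD schedule above ---
# Only the warmup is fully pinned down by the arguments (lr_init to lr_max
# over warmup_epochs * steps_per_epoch steps, assuming the usual linear ramp);
# the post-warmup decay toward lr_end_rate * lr is whatever the repository's
# get_lr implements. Hedged sketch of the warmup segment only:
def warmup_lr(step, lr_init, lr_max, warmup_steps):
    return lr_init + (lr_max - lr_init) * step / warmup_steps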
import os
from mindspore.common.initializer import One
from mindspore.train.model import Model, ParallelMode
from mindspore import context
import mindspore.nn as nn  # needed for nn.Cell / nn.Conv2d below
from mindspore.communication.management import init
import mindspore.ops.functional as F
from mindspore.nn.loss.loss import _Loss
from mindspore.train.callback import Callback
from mindspore.parallel import set_algo_parameters

context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
context.set_context(enable_hccl=True)
context.set_context(enable_task_sink=True, device_id=int(os.getenv('DEVICE_ID')))
context.set_context(enable_ir_fusion=True)
context.set_context(enable_loop_sink=False)
init()
context.set_auto_parallel_context(mirror_mean=True, parallel_mode=ParallelMode.AUTO_PARALLEL)

def weight_variable(shape, factor=0.1):
    # Test stub: every weight is initialized to ones, regardless of shape or factor.
    return One()

def _conv3x3(in_channels, out_channels, stride=1, padding=0, pad_mode='same'):
    init_value = weight_variable((out_channels, in_channels, 3, 3))
    return nn.Conv2d(in_channels,
                     out_channels,
                     kernel_size=3,
                     stride=stride,
                     padding=padding,
                     pad_mode=pad_mode,
                     weight_init=init_value)
def train_net(args_opt, cross_valid_ind=1, epochs=400, batch_size=16, lr=0.0001, cfg=None):
    rank = 0
    group_size = 1
    data_dir = args_opt.data_url
    run_distribute = args_opt.run_distribute
    # Assumption: checkpoints are grouped per device; DEVICE_ID falls back to 0
    # on a single card (the original left device_id undefined in this scope).
    device_id = int(os.getenv('DEVICE_ID', '0'))
    if run_distribute:
        init()
        group_size = get_group_size()
        rank = get_rank()
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                          device_num=group_size,
                                          gradients_mean=False)
    need_slice = False
    if cfg['model'] == 'unet_medical':
        net = UNetMedical(n_channels=cfg['num_channels'], n_classes=cfg['num_classes'])
    elif cfg['model'] == 'unet_nested':
        net = NestedUNet(in_channel=cfg['num_channels'],
                         n_class=cfg['num_classes'],
                         use_deconv=cfg['use_deconv'],
                         use_bn=cfg['use_bn'],
                         use_ds=cfg['use_ds'])
        need_slice = cfg['use_ds']
    elif cfg['model'] == 'unet_simple':
        net = UNet(in_channel=cfg['num_channels'], n_class=cfg['num_classes'])
    else:
        raise ValueError("Unsupported model: {}".format(cfg['model']))

    if cfg['resume']:
        param_dict = load_checkpoint(cfg['resume_ckpt'])
        if cfg['transfer_training']:
            filter_checkpoint_parameter_by_list(param_dict, cfg['filter_weight'])
        load_param_into_net(net, param_dict)

    if 'use_ds' in cfg and cfg['use_ds']:
        criterion = MultiCrossEntropyWithLogits()
    else:
        criterion = CrossEntropyWithLogits()

    if 'dataset' in cfg and cfg['dataset'] == "Cell_nuclei":
        repeat = 10
        dataset_sink_mode = True
        per_print_times = 0
        train_dataset = create_cell_nuclei_dataset(data_dir,
                                                   cfg['img_size'],
                                                   repeat,
                                                   batch_size,
                                                   is_train=True,
                                                   augment=True,
                                                   split=0.8,
                                                   rank=rank,
                                                   group_size=group_size)
        valid_dataset = create_cell_nuclei_dataset(data_dir,
                                                   cfg['img_size'],
                                                   1,
                                                   1,
                                                   is_train=False,
                                                   eval_resize=cfg["eval_resize"],
                                                   split=0.8,
                                                   python_multiprocessing=False)
    else:
        repeat = epochs
        dataset_sink_mode = False
        per_print_times = 1
        train_dataset, valid_dataset = create_dataset(data_dir, repeat, batch_size, True,
                                                      cross_valid_ind, run_distribute,
                                                      cfg["crop"], cfg['img_size'])
    train_data_size = train_dataset.get_dataset_size()
    print("dataset length is:", train_data_size)

    ckpt_config = CheckpointConfig(save_checkpoint_steps=train_data_size,
                                   keep_checkpoint_max=cfg['keep_checkpoint_max'])
    ckpoint_cb = ModelCheckpoint(prefix='ckpt_{}_adam'.format(cfg['model']),
                                 directory='./ckpt_{}/'.format(device_id),
                                 config=ckpt_config)

    optimizer = nn.Adam(params=net.trainable_params(),
                        learning_rate=lr,
                        weight_decay=cfg['weight_decay'],
                        loss_scale=cfg['loss_scale'])
    loss_scale_manager = mindspore.train.loss_scale_manager.FixedLossScaleManager(
        cfg['FixedLossScaleManager'], False)
    model = Model(net,
                  loss_fn=criterion,
                  loss_scale_manager=loss_scale_manager,
                  optimizer=optimizer,
                  amp_level="O3")

    print("============== Starting Training ==============")
    callbacks = [StepLossTimeMonitor(batch_size=batch_size, per_print_times=per_print_times),
                 ckpoint_cb]
    if args_opt.run_eval:
        eval_model = Model(UnetEval(net, need_slice=need_slice),
                           loss_fn=TempLoss(),
                           metrics={"dice_coeff": dice_coeff(cfg_unet, False)})
        eval_param_dict = {"model": eval_model,
                           "dataset": valid_dataset,
                           "metrics_name": args_opt.eval_metrics}
        eval_cb = EvalCallBack(apply_eval,
                               eval_param_dict,
                               interval=args_opt.eval_interval,
                               eval_start_epoch=args_opt.eval_start_epoch,
                               save_best_ckpt=True,
                               ckpt_directory='./ckpt_{}/'.format(device_id),
                               besk_ckpt_name="best.ckpt",
                               metrics_name=args_opt.eval_metrics)
        callbacks.append(eval_cb)
    model.train(int(epochs / repeat),
                train_dataset,
                callbacks=callbacks,
                dataset_sink_mode=dataset_sink_mode)
    print("============== End Training ==============")
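# --- Illustration: why model.train(int(epochs / repeat), ...) above ---
# When the dataset pipeline is built with repeat=10 (the Cell_nuclei branch),
# each "epoch" seen by Model spans 10 passes over the data, so the epoch count
# is divided accordingly. Quick arithmetic check:
epochs, repeat = 400, 10
sink_epochs = int(epochs / repeat)          # 40 Model-level epochs
assert sink_epochs * repeat == epochs       # 40 x 10 = 400 data passes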