repeat_num=epoch_size, batch_size=config.batch_size) step_size = dataset.get_dataset_size() loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) if args_opt.pre_trained: param_dict = load_checkpoint(args_opt.pre_trained) load_param_into_net(net, param_dict) # learning rate strategy with cosine lr = Tensor( warmup_cosine_annealing_lr(config.lr, step_size, config.warmup_epochs, 120, config.pretrain_epoch_size * step_size)) opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum, config.weight_decay, config.loss_scale) model = Model(net, loss_fn=loss, optimizer=opt, amp_level='O2', keep_batchnorm_fp32=False, loss_scale_manager=loss_scale, metrics={'acc'}) time_cb = TimeMonitor(data_size=step_size) loss_cb = LossMonitor() cb = [time_cb, loss_cb] if config.save_checkpoint: config_ck = CheckpointConfig( save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
def test_train_32k_8p(epoch_size=3, batch_size=32, num_classes=32768):  # 1048576 #131072 #32768 #8192
    """Check auto-parallel strategies and allreduce fusion for ResNet-50 on 8 devices.

    Trains ``resnet50(num_classes)`` on constant synthetic data under
    ``ParallelMode.AUTO_PARALLEL``, then asserts on the strategies reported
    for Conv2D / MatMul / ReduceSum operators and on the per-parameter
    allreduce fusion mapping returned by the executor.

    Args:
        epoch_size: Accepted for signature compatibility; the body does not
            read it (``model.train`` is called with a literal 5).
        batch_size: Number of synthetic samples in the input batch.
        num_classes: Classifier width passed to ``resnet50``; labels are
            ``i % num_classes``.

    Raises:
        AssertionError: If a strategy or the fusion mapping differs from the
            expected values.
    """
    dev_num = 8
    context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL, device_num=dev_num)
    try:
        cost_model_context.set_cost_model_context(costmodel_gamma=0.001, costmodel_beta=260.0)
        cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_algorithm=1)
        cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_times=2)
        cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_tail_percent=0.5)
        set_algo_parameters(elementwise_op_strategy_follow=True)
        resset_op_id()
        np.random.seed(6)

        # Synthetic batch: all-ones images, labels cycling through the classes.
        input_np = np.ones([batch_size, 3, 224, 224]).astype(np.float32)
        label_np = (np.arange(batch_size) % num_classes).astype(np.int32)
        dataset = DatasetLenet(Tensor(input_np), Tensor(label_np), 1)

        net = resnet50(num_classes)
        loss = SoftmaxCrossEntropyExpand(sparse=True)
        opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), 0.01, 0.9)
        model = Model(net, loss_fn=loss, optimizer=opt)
        model.train(5, dataset, dataset_sink_mode=False)

        strategies = _executor._get_strategy(model._train_network)
        for (k, v) in strategies.items():
            if re.search('Conv2D-op', k) is not None:
                assert v[0][0] == dev_num
            elif re.search('MatMul-op', k) is not None:
                assert v == [[dev_num, 1], [1, 1]]
            elif re.search('ReduceSum-op', k) is not None:
                assert v == [[dev_num, 1]]

        allreduce_fusion_dict = _executor._get_allreduce_fusion(model._train_network)
        print(allreduce_fusion_dict)
        expect_dict = {
            'end_point.bias': 2, 'end_point.weight': 2,
            'layer4.2.bn3.beta': 2, 'layer4.2.bn3.gamma': 2, 'layer4.2.conv3.weight': 2,
            'layer4.2.bn2.beta': 2, 'layer4.2.bn2.gamma': 2, 'layer4.2.conv2.weight': 2,
            'layer4.2.bn1.beta': 2, 'layer4.2.bn1.gamma': 2, 'layer4.2.conv1.weight': 2,
            'layer4.1.bn3.beta': 2, 'layer4.1.bn3.gamma': 2, 'layer4.1.conv3.weight': 2,
            'layer4.1.bn2.beta': 2, 'layer4.1.bn2.gamma': 2, 'layer4.1.conv2.weight': 2,
            'layer4.1.bn1.beta': 2, 'layer4.1.bn1.gamma': 2, 'layer4.1.conv1.weight': 2,
            'layer4.0.bn_down_sample.beta': 2, 'layer4.0.bn_down_sample.gamma': 2,
            'layer4.0.conv_down_sample.weight': 2,
            'layer4.0.bn3.beta': 2, 'layer4.0.bn3.gamma': 2, 'layer4.0.conv3.weight': 2,
            'layer4.0.bn2.beta': 2, 'layer4.0.bn2.gamma': 2, 'layer4.0.conv2.weight': 2,
            'layer4.0.bn1.beta': 2, 'layer4.0.bn1.gamma': 2, 'layer4.0.conv1.weight': 2,
            'layer3.5.bn3.beta': 2, 'layer3.5.bn3.gamma': 2, 'layer3.5.conv3.weight': 2,
            'layer3.5.bn2.beta': 2, 'layer3.5.bn2.gamma': 2, 'layer3.5.conv2.weight': 2,
            'layer3.5.bn1.beta': 2, 'layer3.5.bn1.gamma': 2, 'layer3.5.conv1.weight': 2,
            'layer3.4.bn3.beta': 2, 'layer3.4.bn3.gamma': 2, 'layer3.4.conv3.weight': 2,
            'layer3.4.bn2.beta': 2, 'layer3.4.bn2.gamma': 2, 'layer3.4.conv2.weight': 2,
            'layer3.4.bn1.beta': 2, 'layer3.4.bn1.gamma': 2, 'layer3.4.conv1.weight': 2,
            'layer3.3.bn3.beta': 2, 'layer3.3.bn3.gamma': 2, 'layer3.3.conv3.weight': 2,
            'layer3.3.bn2.beta': 2, 'layer3.3.bn2.gamma': 2, 'layer3.3.conv2.weight': 2,
            'layer3.3.bn1.beta': 2, 'layer3.3.bn1.gamma': 2, 'layer3.3.conv1.weight': 2,
            'layer3.2.bn3.beta': 2, 'layer3.2.bn3.gamma': 2, 'layer3.2.conv3.weight': 2,
            'layer3.2.bn2.beta': 2, 'layer3.2.bn2.gamma': 2, 'layer3.2.conv2.weight': 2,
            'layer3.2.bn1.beta': 2, 'layer3.2.bn1.gamma': 2, 'layer3.2.conv1.weight': 2,
            'layer3.1.bn3.beta': 2, 'layer3.1.bn3.gamma': 2, 'layer3.1.conv3.weight': 2,
            'layer3.1.bn2.beta': 2, 'layer3.1.bn2.gamma': 2, 'layer3.1.conv2.weight': 2,
            'layer3.1.bn1.beta': 2, 'layer3.1.bn1.gamma': 2, 'layer3.1.conv1.weight': 2,
            'layer3.0.bn_down_sample.beta': 1, 'layer3.0.bn_down_sample.gamma': 1,
            'layer3.0.conv_down_sample.weight': 2,
            'layer3.0.bn3.beta': 1, 'layer3.0.bn3.gamma': 1, 'layer3.0.conv3.weight': 2,
            'layer3.0.bn2.beta': 2, 'layer3.0.bn2.gamma': 2, 'layer3.0.conv2.weight': 2,
            'layer3.0.bn1.beta': 2, 'layer3.0.bn1.gamma': 2, 'layer3.0.conv1.weight': 2,
            'layer2.3.bn3.beta': 2, 'layer2.3.bn3.gamma': 2, 'layer2.3.conv3.weight': 2,
            'layer2.3.bn2.beta': 2, 'layer2.3.bn2.gamma': 2, 'layer2.3.conv2.weight': 2,
            'layer2.3.bn1.beta': 2, 'layer2.3.bn1.gamma': 2, 'layer2.3.conv1.weight': 2,
            'layer2.2.bn3.beta': 2, 'layer2.2.bn3.gamma': 2, 'layer2.2.conv3.weight': 2,
            'layer2.2.bn2.beta': 2, 'layer2.2.bn2.gamma': 2, 'layer2.2.conv2.weight': 2,
            'layer2.2.bn1.beta': 2, 'layer2.2.bn1.gamma': 2, 'layer2.2.conv1.weight': 2,
            'layer2.1.bn3.beta': 1, 'layer2.1.bn3.gamma': 1, 'layer2.1.conv3.weight': 2,
            'layer2.1.bn2.beta': 2, 'layer2.1.bn2.gamma': 2, 'layer2.1.conv2.weight': 2,
            'layer2.1.bn1.beta': 2, 'layer2.1.bn1.gamma': 2, 'layer2.1.conv1.weight': 2,
            'layer2.0.bn_down_sample.beta': 1, 'layer2.0.bn_down_sample.gamma': 1,
            'layer2.0.conv_down_sample.weight': 2,
            'layer2.0.bn3.beta': 1, 'layer2.0.bn3.gamma': 1, 'layer2.0.conv3.weight': 2,
            'layer2.0.bn2.beta': 2, 'layer2.0.bn2.gamma': 2, 'layer2.0.conv2.weight': 2,
            'layer2.0.bn1.beta': 2, 'layer2.0.bn1.gamma': 2, 'layer2.0.conv1.weight': 2,
            'layer1.2.bn3.beta': 2, 'layer1.2.bn3.gamma': 2, 'layer1.2.conv3.weight': 2,
            'layer1.2.bn2.beta': 2, 'layer1.2.bn2.gamma': 2, 'layer1.2.conv2.weight': 2,
            'layer1.2.bn1.beta': 2, 'layer1.2.bn1.gamma': 2, 'layer1.2.conv1.weight': 2,
            'layer1.1.bn3.beta': 1, 'layer1.1.bn3.gamma': 1, 'layer1.1.conv3.weight': 2,
            'layer1.1.bn2.beta': 2, 'layer1.1.bn2.gamma': 2, 'layer1.1.conv2.weight': 2,
            'layer1.1.bn1.beta': 2, 'layer1.1.bn1.gamma': 2, 'layer1.1.conv1.weight': 2,
            'layer1.0.bn_down_sample.beta': 1, 'layer1.0.bn_down_sample.gamma': 1,
            'layer1.0.conv_down_sample.weight': 2,
            'layer1.0.bn3.beta': 1, 'layer1.0.bn3.gamma': 1, 'layer1.0.conv3.weight': 2,
            'layer1.0.bn2.beta': 2, 'layer1.0.bn2.gamma': 2, 'layer1.0.conv2.weight': 2,
            'layer1.0.bn1.beta': 2, 'layer1.0.bn1.gamma': 2, 'layer1.0.conv1.weight': 2,
            'bn1.beta': 1, 'bn1.gamma': 1, 'conv1.weight': 2,
        }
        assert allreduce_fusion_dict == expect_dict
    finally:
        # Always restore the cost-model settings, even when an assertion above
        # fails, so they do not leak into whatever runs next in this process.
        cost_model_context.reset_cost_model_context()
steps_per_epoch=batches_per_epoch, is_stair=True) lr = Tensor(lr) # optimizer decayed_params = [] no_decayed_params = [] for param in net.trainable_params(): if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name: decayed_params.append(param) else: no_decayed_params.append(param) group_params = [{'params': decayed_params, 'weight_decay': cfg.weight_decay}, {'params': no_decayed_params}, {'order_params': net.trainable_params()}] optimizer = Momentum(params=net.trainable_params(), learning_rate=Tensor(lr), momentum=cfg.momentum, weight_decay=cfg.weight_decay) eval_metrics = {'Loss': nn.Loss(), 'Top1-Acc': nn.Top1CategoricalAccuracy(), 'Top5-Acc': nn.Top5CategoricalAccuracy()} if args_opt.resume: ckpt = load_checkpoint(args_opt.resume) load_param_into_net(net, ckpt) model = Model(net, loss_fn=loss, optimizer=optimizer, metrics={'acc'}) print("============== Starting Training ==============") loss_cb = LossMonitor(per_print_times=batches_per_epoch) time_cb = TimeMonitor(data_size=batches_per_epoch) callbacks = [loss_cb, time_cb] config_ck = CheckpointConfig(save_checkpoint_steps=batches_per_epoch, keep_checkpoint_max=cfg.keep_checkpoint_max) ckpoint_cb = ModelCheckpoint(prefix=f"shufflenet-rank{cfg.rank}", directory=cfg.ckpt_path, config=config_ck)
def test(cloud_args=None):
    """Evaluate VGG16 and report its accuracy.

    When ``args.dataset`` is ``"cifar10"`` a single checkpoint
    (``args.checkpoint_path``) is evaluated through ``Model.eval`` and the
    result is printed. Otherwise every checkpoint resolved from
    ``args.pretrained`` (a directory of ``*.ckpt`` files or a single path) is
    loaded in turn and top-1 / top-5 accuracy is computed batch by batch and
    written to ``args.logger``.

    Args:
        cloud_args: Forwarded unchanged to ``parse_args``.
    """

    def _graph_ckpt_order(path):
        # Sort key for "<...>-<N>_<...>.ckpt" names: negative N, so larger N sorts first.
        stem = os.path.splitext(os.path.split(path)[-1])[0]
        return -1 * int(stem.split('-')[-1].split('_')[0])

    def _plain_ckpt_order(path):
        # Sort key for "<...>_<N>.ckpt" names: negative N, so larger N sorts first.
        stem = os.path.splitext(os.path.split(path)[-1])[0]
        return -1 * int(stem.split('_')[-1])

    args = parse_args(cloud_args)
    context.set_context(mode=context.GRAPH_MODE,
                        enable_auto_mixed_precision=True,
                        device_target=args.device_target,
                        save_graphs=False)
    env_device_id = os.getenv('DEVICE_ID', "not_set")
    if env_device_id.isdigit():
        context.set_context(device_id=int(env_device_id))

    # Each run logs into its own timestamped directory.
    run_stamp = datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S')
    args.outputs_dir = os.path.join(args.log_path, run_stamp)
    args.logger = get_logger(args.outputs_dir, args.rank)
    args.logger.save_args(args)

    if args.dataset == "cifar10":
        net = vgg16(num_classes=args.num_classes)
        opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                       0.01,
                       cfg.momentum,
                       weight_decay=args.weight_decay)
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean', is_grad=False)
        model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})

        param_dict = load_checkpoint(args.checkpoint_path)
        load_param_into_net(net, param_dict)
        net.set_train(False)
        dataset = vgg_create_dataset(args.data_path, 1, False)
        res = model.eval(dataset)
        print("result: ", res)
        return

    # network
    args.logger.important_info('start create network')
    if os.path.isdir(args.pretrained):
        models = list(glob.glob(os.path.join(args.pretrained, '*.ckpt')))
        print(models)
        order_key = _graph_ckpt_order if args.graph_ckpt else _plain_ckpt_order
        args.models = sorted(models, key=order_key)
    else:
        args.models = [args.pretrained]

    for model in args.models:
        if args.dataset == "cifar10":
            dataset = vgg_create_dataset(args.data_path, args.image_size,
                                         args.per_batch_size, training=False)
        else:
            dataset = classification_dataset(args.data_path, args.image_size,
                                             args.per_batch_size)
        eval_dataloader = dataset.create_tuple_iterator()
        network = vgg16(args.num_classes, args, phase="test")

        # pre_trained
        load_param_into_net(network, load_checkpoint(model))
        network.add_flags_recursive(fp16=True)

        img_tot = 0
        top1_correct = 0
        top5_correct = 0

        network.set_train(False)
        t_end = time.time()
        it = 0
        for data, gt_classes in eval_dataloader:
            logits = network(Tensor(data, mstype.float32)).asnumpy()

            top1_output = np.argmax(logits, (-1))
            top5_output = np.argsort(logits)[:, -5:]

            top1_correct += np.equal(top1_output, gt_classes).sum()
            top5_correct += get_top5_acc(top5_output, gt_classes)
            img_tot += args.per_batch_size

            # Restart the clock after the first batch on rank 0; the fps
            # figure below excludes that batch from both time and image count.
            if args.rank == 0 and it == 0:
                t_end = time.time()
                it = 1

        if args.rank == 0:
            time_used = time.time() - t_end
            fps = (img_tot - args.per_batch_size) * args.group_size / time_used
            args.logger.info('Inference Performance: {:.2f} img/sec'.format(fps))

        results = [[top1_correct], [top5_correct], [img_tot]]
        args.logger.info('before results={}'.format(results))
        results = np.array(results)
        args.logger.info('after results={}'.format(results))

        top1_correct = results[0, 0]
        top5_correct = results[1, 0]
        img_tot = results[2, 0]
        acc1 = 100.0 * top1_correct / img_tot
        acc5 = 100.0 * top5_correct / img_tot
        args.logger.info('after allreduce eval: top1_correct={}, tot={},'
                         'acc={:.2f}%(TOP1)'.format(top1_correct, img_tot, acc1))
        args.logger.info('after allreduce eval: top5_correct={}, tot={},'
                         'acc={:.2f}%(TOP5)'.format(top5_correct, img_tot, acc5))
parser.add_argument('--checkpoint_path', type=str, default=None, help='checkpoint file path.')
parser.add_argument('--device_id', type=int, default=None, help='device id of GPU or Ascend. (Default: None)')
args_opt = parser.parse_args()

context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target)
# --device_id defaults to None; only forward it when the user supplied one so
# that None is never handed to set_context as a device id.
# NOTE(review): assumes the framework picks its own default device when
# device_id is left unset - confirm for the targeted MindSpore version.
if args_opt.device_id is not None:
    context.set_context(device_id=args_opt.device_id)

# Build the evaluation model. The optimizer is constructed and handed to
# Model together with the loss and the 'acc' metric; only model.eval is called.
net = GooGLeNet(num_classes=cfg.num_classes)
opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), 0.01, cfg.momentum,
               weight_decay=cfg.weight_decay)
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean', is_grad=False)
model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})

# Restore the trained weights and switch the network to inference mode.
param_dict = load_checkpoint(args_opt.checkpoint_path)
load_param_into_net(net, param_dict)
net.set_train(False)

# NOTE: this rebinds the name `dataset` from the object providing
# create_dataset to its return value.
dataset = dataset.create_dataset(args_opt.data_path, 1, False)
res = model.eval(dataset)
print("result: ", res)
def train():
    """Train YOLOv3-DarkNet53, optionally converted to a quantization-aware network.

    Parses the arguments, optionally initialises distributed training, sets up
    logging, optionally resumes from ``args.resume_yolov3`` (renaming
    checkpoint keys to this network's parameter names), builds the dataset,
    learning-rate schedule and Momentum optimizer, then runs a manual step
    loop that drives the checkpoint callback and logs throughput.
    """
    args = parse_args()

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()

    # Either only the master rank (rank 0) saves checkpoints, or every rank does.
    only_master_saves = args.is_save_on_master
    args.rank_save_ckpt_flag = 1 if (not only_master_saves or args.rank == 0) else 0

    # logger
    run_stamp = datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S')
    args.outputs_dir = os.path.join(args.ckpt_path, run_stamp)
    args.logger = get_logger(args.outputs_dir, args.rank)
    args.logger.save_args(args)

    if args.need_profiler:
        from mindspore.profiler.profiling import Profiler
        profiler = Profiler(output_path=args.outputs_dir,
                            is_detail=True,
                            is_show_op_path=True)

    loss_meter = AverageMeter('loss')

    context.reset_auto_parallel_context()
    if args.is_distributed:
        parallel_mode, degree = ParallelMode.DATA_PARALLEL, get_group_size()
    else:
        parallel_mode, degree = ParallelMode.STAND_ALONE, 1
    context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                      mirror_mean=True,
                                      device_num=degree)

    network = YOLOV3DarkNet53(is_training=True)
    # default is kaiming-normal
    default_recurisive_init(network)

    if args.resume_yolov3:
        # Keys with these prefixes are dropped from the resumed checkpoint.
        skipped_prefixes = ('moments.', 'global_', 'learning_rate', 'momentum')
        # BatchNorm suffix renames, applied in this order.
        bn_suffix_renames = (
            ('1.beta', 'batchnorm.beta'),
            ('1.gamma', 'batchnorm.gamma'),
            ('1.moving_mean', 'batchnorm.moving_mean'),
            ('1.moving_variance', 'batchnorm.moving_variance'),
        )

        ckpt_params = load_checkpoint(args.resume_yolov3)
        param_dict_new = {}
        for key, values in ckpt_params.items():
            args.logger.info('ckpt param name = {}'.format(key))
            if key.startswith(skipped_prefixes):
                continue

            if not key.startswith('yolo_network.'):
                param_dict_new[key] = values
                args.logger.info('in resume {}'.format(key))
                continue

            # Strip the 13-character 'yolo_network.' prefix, then rename.
            key_new = key[13:]
            for old_suffix, new_suffix in bn_suffix_renames:
                if key_new.endswith(old_suffix):
                    key_new = key_new.replace(old_suffix, new_suffix)

            if key_new.endswith('.weight'):
                if key_new.endswith('0.weight'):
                    key_new = key_new.replace('0.weight', 'conv.weight')
                else:
                    key_new = key_new.replace('.weight', '.conv.weight')

            if key_new.endswith('.bias'):
                key_new = key_new.replace('.bias', '.conv.bias')

            param_dict_new[key_new] = values
            args.logger.info('in resume {}'.format(key_new))

        args.logger.info('resume finished')
        # Log every network parameter and flag those absent from the checkpoint.
        for _, param in network.parameters_and_names():
            args.logger.info('network param name = {}'.format(param.name))
            if param.name not in param_dict_new:
                args.logger.info('not match param name = {}'.format(param.name))
        load_param_into_net(network, param_dict_new)
        args.logger.info('load_model {} success'.format(args.resume_yolov3))

    config = ConfigYOLOV3DarkNet53()
    # convert fusion network to quantization aware network
    if config.quantization_aware:
        network = quant.convert_quant_network(network,
                                              bn_fold=True,
                                              per_channel=[True, False],
                                              symmetric=[True, False])

    network = YoloWithLossCell(network)
    args.logger.info('finish get network')

    config.label_smooth = args.label_smooth
    config.label_smooth_factor = args.label_smooth_factor

    if args.training_shape:
        config.multi_scale = [conver_training_shape(args)]
    if args.resize_rate:
        config.resize_rate = args.resize_rate

    ds, data_size = create_yolo_dataset(image_dir=args.data_root,
                                        anno_path=args.annFile,
                                        is_training=True,
                                        batch_size=args.per_batch_size,
                                        max_epoch=args.max_epoch,
                                        device_num=args.group_size,
                                        rank=args.rank,
                                        config=config)
    args.logger.info('Finish loading dataset')

    args.steps_per_epoch = int(data_size / args.per_batch_size / args.group_size)

    if not args.ckpt_interval:
        args.ckpt_interval = args.steps_per_epoch

    # lr scheduler: the three cosine variants are called with the same arguments.
    cosine_schedulers = {
        'cosine_annealing': warmup_cosine_annealing_lr,
        'cosine_annealing_V2': warmup_cosine_annealing_lr_V2,
        'cosine_annealing_sample': warmup_cosine_annealing_lr_sample,
    }
    if args.lr_scheduler == 'exponential':
        lr = warmup_step_lr(
            args.lr,
            args.lr_epochs,
            args.steps_per_epoch,
            args.warmup_epochs,
            args.max_epoch,
            gamma=args.lr_gamma,
        )
    elif args.lr_scheduler in cosine_schedulers:
        lr = cosine_schedulers[args.lr_scheduler](args.lr,
                                                  args.steps_per_epoch,
                                                  args.warmup_epochs,
                                                  args.max_epoch,
                                                  args.T_max,
                                                  args.eta_min)
    else:
        raise NotImplementedError(args.lr_scheduler)

    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)

    network = TrainingWrapper(network, opt)
    network.set_train()

    if args.rank_save_ckpt_flag:
        # checkpoint save
        ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval,
                                       keep_checkpoint_max=ckpt_max_num)
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=args.outputs_dir,
                                  prefix='{}'.format(args.rank))
        cb_params = _InternalCallbackParam()
        cb_params.train_network = network
        cb_params.epoch_num = ckpt_max_num
        cb_params.cur_epoch_num = 1
        run_context = RunContext(cb_params)
        ckpt_cb.begin(run_context)

    old_progress = -1
    t_end = time.time()
    data_loader = ds.create_dict_iterator()

    shape_record = ShapeRecord()
    for i, data in enumerate(data_loader):
        images = data["image"]
        input_shape = images.shape[2:4]
        args.logger.info('iter[{}], shape{}'.format(i, input_shape[0]))
        shape_record.set(input_shape)

        images = Tensor(images)
        annos = data["annotation"]
        # A different ground-truth preprocessing helper is used when group_size is 1.
        if args.group_size == 1:
            preprocess_boxes = batch_preprocess_true_box
        else:
            preprocess_boxes = batch_preprocess_true_box_single
        batch_y_true_0, batch_y_true_1, batch_y_true_2, batch_gt_box0, batch_gt_box1, batch_gt_box2 = \
            preprocess_boxes(annos, config, input_shape)

        batch_y_true_0 = Tensor(batch_y_true_0)
        batch_y_true_1 = Tensor(batch_y_true_1)
        batch_y_true_2 = Tensor(batch_y_true_2)
        batch_gt_box0 = Tensor(batch_gt_box0)
        batch_gt_box1 = Tensor(batch_gt_box1)
        batch_gt_box2 = Tensor(batch_gt_box2)

        # The network receives the two spatial dims in reversed order.
        input_shape = Tensor(tuple(input_shape[::-1]), ms.float32)
        loss = network(images, batch_y_true_0, batch_y_true_1, batch_y_true_2,
                       batch_gt_box0, batch_gt_box1, batch_gt_box2, input_shape)
        loss_meter.update(loss.asnumpy())

        if args.rank_save_ckpt_flag:
            # ckpt progress
            cb_params.cur_step_num = i + 1  # current step number
            cb_params.batch_num = i + 2
            ckpt_cb.step_end(run_context)

        if i % args.log_interval == 0:
            time_used = time.time() - t_end
            epoch = int(i / args.steps_per_epoch)
            steps_done = i - old_progress
            fps = args.per_batch_size * steps_done * args.group_size / time_used
            if args.rank == 0:
                args.logger.info(
                    'epoch[{}], iter[{}], {}, {:.2f} imgs/sec, lr:{}'.format(
                        epoch, i, loss_meter, fps, lr[i]))
            t_end = time.time()
            loss_meter.reset()
            old_progress = i

        if (i + 1) % args.steps_per_epoch == 0 and args.rank_save_ckpt_flag:
            cb_params.cur_epoch_num += 1

        # When profiling, stop after step index 10 and analyse.
        if args.need_profiler and i == 10:
            profiler.analyse()
            break

    args.logger.info('==========end training===============')
def resnet50_train(args_opt):
    """Train ResNet-50 and upload the resulting checkpoints.

    The training data is copied from ``args_opt.data_url`` to ``/cache/data``
    with ``mox.file.copy_parallel``, the model is trained for
    ``args_opt.epoch_size`` epochs, and the checkpoint directory
    ``/cache/ckpt_file`` is copied to ``args_opt.train_url`` afterwards.

    Args:
        args_opt: Parsed options; the attributes read here are ``epoch_size``,
            ``device_target``, ``data_url`` and ``train_url``.
    """
    local_data_path = '/cache/data'
    local_ckpt_path = '/cache/ckpt_file'
    epoch_size = args_opt.epoch_size
    batch_size = cfg.batch_size
    class_num = cfg.class_num
    loss_scale_num = cfg.loss_scale

    # set graph mode and parallel mode
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args_opt.device_target,
                        save_graphs=False)

    # data download
    print('Download data.')
    mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path)

    # create dataset
    print('Create train and evaluate dataset.')
    train_dataset = create_dataset(dataset_path=local_data_path,
                                   do_train=True,
                                   repeat_num=epoch_size,
                                   batch_size=batch_size)
    train_step_size = train_dataset.get_dataset_size()
    print('Create dataset success.')

    # create model
    net = resnet50(class_num=class_num)
    # sparse labels, loss reduced with 'mean'
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    lr_schedule = get_lr(global_step=0,
                         total_epochs=epoch_size,
                         steps_per_epoch=train_step_size)
    lr = Tensor(lr_schedule)
    opt = Momentum(net.trainable_params(),
                   lr,
                   momentum=0.9,
                   weight_decay=1e-4,
                   loss_scale=loss_scale_num)
    # The same loss-scale value goes to the optimizer and to a fixed manager.
    loss_scale = FixedLossScaleManager(loss_scale_num, False)

    # Mixed precision at level "O2" with keep_batchnorm_fp32 disabled.
    model = Model(net,
                  amp_level="O2",
                  keep_batchnorm_fp32=False,
                  loss_fn=loss,
                  optimizer=opt,
                  loss_scale_manager=loss_scale,
                  metrics={'acc'})

    # callbacks: timing, performance, loss, and checkpointing
    time_cb = TimeMonitor(data_size=train_step_size)
    performance_cb = PerformanceCallback(batch_size)
    loss_cb = LossMonitor()
    config_ck = CheckpointConfig(
        save_checkpoint_steps=cfg.save_checkpoint_epochs * train_step_size,
        keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpt_cb = ModelCheckpoint(prefix="resnet",
                              directory=local_ckpt_path,
                              config=config_ck)
    cb = [time_cb, performance_cb, loss_cb, ckpt_cb]

    print(f'Start run training, total epoch: {epoch_size}.')
    model.train(epoch_size, train_dataset, callbacks=cb)

    # upload checkpoint files
    print('Upload checkpoint.')
    mox.file.copy_parallel(src_url=local_ckpt_path, dst_url=args_opt.train_url)
def train():
    """Train YOLOv3-DarkNet53 on Ascend or GPU.

    Sets the execution context (device id taken from the ``DEVICE_ID``
    environment variable, 0 when unset or empty), optionally initialises
    distributed training, loads a pre-trained backbone and/or a resume
    checkpoint, builds the dataset, learning-rate schedule and Momentum
    optimizer, and runs a manual step loop that drives the checkpoint
    callback and logs throughput. When the device target is "GPU" the train
    network is built through ``amp.build_train_network`` at level "O2";
    otherwise ``TrainingWrapper`` is used.
    """
    args = parse_args()

    env_device_id = os.getenv('DEVICE_ID')
    devid = int(env_device_id) if env_device_id else 0
    context.set_context(mode=context.GRAPH_MODE,
                        enable_auto_mixed_precision=True,
                        device_target=args.device_target,
                        save_graphs=True,
                        device_id=devid)

    # init distributed
    if args.is_distributed:
        if args.device_target == "Ascend":
            init()
        else:
            init("nccl")
        args.rank = get_rank()
        args.group_size = get_group_size()

    # Either only the master rank (rank 0) saves checkpoints, or every rank does.
    only_master_saves = args.is_save_on_master
    args.rank_save_ckpt_flag = 1 if (not only_master_saves or args.rank == 0) else 0

    # logger
    run_stamp = datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S')
    args.outputs_dir = os.path.join(args.ckpt_path, run_stamp)
    args.logger = get_logger(args.outputs_dir, args.rank)
    args.logger.save_args(args)

    if args.need_profiler:
        from mindspore.profiler.profiling import Profiler
        profiler = Profiler(output_path=args.outputs_dir,
                            is_detail=True,
                            is_show_op_path=True)

    loss_meter = AverageMeter('loss')

    context.reset_auto_parallel_context()
    if args.is_distributed:
        parallel_mode, degree = ParallelMode.DATA_PARALLEL, get_group_size()
    else:
        parallel_mode, degree = ParallelMode.STAND_ALONE, 1
    context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                      mirror_mean=True,
                                      device_num=degree)

    network = YOLOV3DarkNet53(is_training=True)
    # default is kaiming-normal
    default_recurisive_init(network)

    if args.pretrained_backbone:
        network = load_backbone(network, args.pretrained_backbone, args)
        args.logger.info('load pre-trained backbone {} into network'.format(args.pretrained_backbone))
    else:
        args.logger.info('Not load pre-trained backbone, please be careful')

    if args.resume_yolov3:
        ckpt_params = load_checkpoint(args.resume_yolov3)
        param_dict_new = {}
        for key, values in ckpt_params.items():
            # 'moments.*' entries are not loaded into the network.
            if key.startswith('moments.'):
                continue
            # Strip the 13-character 'yolo_network.' prefix when present.
            target_key = key[13:] if key.startswith('yolo_network.') else key
            param_dict_new[target_key] = values
            args.logger.info('in resume {}'.format(key))

        args.logger.info('resume finished')
        load_param_into_net(network, param_dict_new)
        args.logger.info('load_model {} success'.format(args.resume_yolov3))

    network = YoloWithLossCell(network)
    args.logger.info('finish get network')

    config = ConfigYOLOV3DarkNet53()
    config.label_smooth = args.label_smooth
    config.label_smooth_factor = args.label_smooth_factor

    if args.training_shape:
        config.multi_scale = [conver_training_shape(args)]
    if args.resize_rate:
        config.resize_rate = args.resize_rate

    ds, data_size = create_yolo_dataset(image_dir=args.data_root,
                                        anno_path=args.annFile,
                                        is_training=True,
                                        batch_size=args.per_batch_size,
                                        max_epoch=args.max_epoch,
                                        device_num=args.group_size,
                                        rank=args.rank,
                                        config=config)
    args.logger.info('Finish loading dataset')

    args.steps_per_epoch = int(data_size / args.per_batch_size / args.group_size)

    if not args.ckpt_interval:
        args.ckpt_interval = args.steps_per_epoch

    # lr scheduler: the three cosine variants are called with the same arguments.
    cosine_schedulers = {
        'cosine_annealing': warmup_cosine_annealing_lr,
        'cosine_annealing_V2': warmup_cosine_annealing_lr_V2,
        'cosine_annealing_sample': warmup_cosine_annealing_lr_sample,
    }
    if args.lr_scheduler == 'exponential':
        lr = warmup_step_lr(args.lr,
                            args.lr_epochs,
                            args.steps_per_epoch,
                            args.warmup_epochs,
                            args.max_epoch,
                            gamma=args.lr_gamma,
                            )
    elif args.lr_scheduler in cosine_schedulers:
        lr = cosine_schedulers[args.lr_scheduler](args.lr,
                                                  args.steps_per_epoch,
                                                  args.warmup_epochs,
                                                  args.max_epoch,
                                                  args.T_max,
                                                  args.eta_min)
    else:
        raise NotImplementedError(args.lr_scheduler)

    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)

    # Mixed-precision train network only when the device target is GPU.
    if context.get_context("device_target") == "GPU":
        loss_scale_value = 1.0
        loss_scale = FixedLossScaleManager(loss_scale_value, drop_overflow_update=False)
        network = amp.build_train_network(network,
                                          optimizer=opt,
                                          loss_scale_manager=loss_scale,
                                          level="O2",
                                          keep_batchnorm_fp32=True)
        keep_loss_fp32(network)
    else:
        network = TrainingWrapper(network, opt)
        network.set_train()

    if args.rank_save_ckpt_flag:
        # checkpoint save
        ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval,
                                       keep_checkpoint_max=ckpt_max_num)
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=args.outputs_dir,
                                  prefix='{}'.format(args.rank))
        cb_params = _InternalCallbackParam()
        cb_params.train_network = network
        cb_params.epoch_num = ckpt_max_num
        cb_params.cur_epoch_num = 1
        run_context = RunContext(cb_params)
        ckpt_cb.begin(run_context)

    old_progress = -1
    t_end = time.time()
    data_loader = ds.create_dict_iterator()

    for i, data in enumerate(data_loader):
        images = data["image"]
        input_shape = images.shape[2:4]
        args.logger.info('iter[{}], shape{}'.format(i, input_shape[0]))

        images = Tensor(images)
        batch_y_true_0 = Tensor(data['bbox1'])
        batch_y_true_1 = Tensor(data['bbox2'])
        batch_y_true_2 = Tensor(data['bbox3'])
        batch_gt_box0 = Tensor(data['gt_box1'])
        batch_gt_box1 = Tensor(data['gt_box2'])
        batch_gt_box2 = Tensor(data['gt_box3'])

        # The network receives the two spatial dims in reversed order.
        input_shape = Tensor(tuple(input_shape[::-1]), ms.float32)
        loss = network(images, batch_y_true_0, batch_y_true_1, batch_y_true_2,
                       batch_gt_box0, batch_gt_box1, batch_gt_box2, input_shape)
        loss_meter.update(loss.asnumpy())

        if args.rank_save_ckpt_flag:
            # ckpt progress
            cb_params.cur_step_num = i + 1  # current step number
            cb_params.batch_num = i + 2
            ckpt_cb.step_end(run_context)

        if i % args.log_interval == 0:
            time_used = time.time() - t_end
            epoch = int(i / args.steps_per_epoch)
            steps_done = i - old_progress
            fps = args.per_batch_size * steps_done * args.group_size / time_used
            if args.rank == 0:
                args.logger.info(
                    'epoch[{}], iter[{}], {}, {:.2f} imgs/sec, lr:{}'.format(epoch, i, loss_meter, fps, lr[i]))
            t_end = time.time()
            loss_meter.reset()
            old_progress = i

        if (i + 1) % args.steps_per_epoch == 0 and args.rank_save_ckpt_flag:
            cb_params.cur_epoch_num += 1

        # When profiling, stop after step index 10 and analyse.
        if args.need_profiler and i == 10:
            profiler.analyse()
            break

    args.logger.info('==========end training===============')
lambda x: 'beta' not in x.name and 'gamma' not in x.name and 'bias' not in x.name, net.trainable_params())) no_decayed_params = [ param for param in net.trainable_params() if param not in decayed_params ] group_params = [{ 'params': decayed_params, 'weight_decay': config.weight_decay }, { 'params': no_decayed_params }, { 'order_params': net.trainable_params() }] opt = Momentum(group_params, lr, config.momentum, loss_scale=config.loss_scale) # define loss, model if target == "Ascend": if args_opt.dataset == "imagenet2012": if not config.use_label_smooth: config.label_smooth_factor = 0.0 loss = SoftmaxCrossEntropyWithLogits( sparse=True, reduction="mean", smooth_factor=config.label_smooth_factor, num_classes=config.class_num) else: loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
def train(net, ckpt_save_dir, target):  # pylint: disable=too-many-locals
    """Train ``net`` with Momentum and, outside distributed runs, evaluate it every epoch.

    Args:
        net: Network to train; its trainable parameters are split into a
            weight-decayed group and a non-decayed group.
        ckpt_save_dir: Directory handed to ``ModelCheckpoint`` when
            ``config.save_checkpoint`` is set.
        target: Forwarded to ``create_dataset`` as ``target``.
    """

    def _uses_weight_decay(param):
        # Parameters whose name contains 'beta', 'gamma' or 'bias' are not decayed.
        return ('beta' not in param.name and 'gamma' not in param.name
                and 'bias' not in param.name)

    # create dataset
    train_dataset = create_dataset(
        dataset_path=ARGS_OPT.train_dataset,
        do_train=True,
        repeat_num=1,
        batch_size=config.batch_size,  # pylint: disable=no-member
        target=target)
    step_size = train_dataset.get_dataset_size()

    # init lr
    lr_schedule = get_lr(
        lr_init=config.lr_init,  # pylint: disable=no-member
        lr_end=config.lr_end,  # pylint: disable=no-member
        lr_max=config.lr_max,  # pylint: disable=no-member
        warmup_epochs=config.warmup_epochs,  # pylint: disable=no-member
        total_epochs=config.epoch_size,  # pylint: disable=no-member
        steps_per_epoch=step_size,
        lr_decay_mode=config.lr_decay_mode)  # pylint: disable=no-member
    learning_rate = Tensor(lr_schedule)

    # define opt
    trainable = net.trainable_params()
    decayed_params = [p for p in trainable if _uses_weight_decay(p)]
    no_decayed_params = [p for p in trainable if not _uses_weight_decay(p)]
    group_params = [
        {'params': decayed_params,
         'weight_decay': config.weight_decay},  # pylint: disable=no-member
        {'params': no_decayed_params},
        {'order_params': net.trainable_params()},
    ]
    opt = Momentum(group_params,
                   learning_rate,
                   config.momentum,  # pylint: disable=no-member
                   loss_scale=config.loss_scale)  # pylint: disable=no-member

    # define loss, model
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    loss_scale = FixedLossScaleManager(
        config.loss_scale,  # pylint: disable=no-member
        drop_overflow_update=False)
    model = Model(net,
                  loss_fn=loss,
                  optimizer=opt,
                  loss_scale_manager=loss_scale,
                  metrics={'top_1_accuracy', 'top_5_accuracy'},
                  amp_level="O2",
                  keep_batchnorm_fp32=False)

    # define callbacks
    callbacks = [TimeMonitor(data_size=step_size), LossMonitor()]
    if config.save_checkpoint:  # pylint: disable=no-member
        config_ck = CheckpointConfig(
            save_checkpoint_steps=config.save_checkpoint_epochs * step_size,  # pylint: disable=no-member
            keep_checkpoint_max=config.keep_checkpoint_max)  # pylint: disable=no-member
        ckpt_cb = ModelCheckpoint(prefix="resnet",
                                  directory=ckpt_save_dir,
                                  config=config_ck)
        callbacks.append(ckpt_cb)

    # define the eval callback; it is only attached outside distributed runs
    epochs_per_eval = {"epoch": [], "acc": []}
    if not ARGS_OPT.run_distribute:
        eval_dataset = create_dataset(
            dataset_path=ARGS_OPT.eval_dataset,
            do_train=False,
            batch_size=config.batch_size,  # pylint: disable=no-member
            target=target)
        eval_cb = EvalCallBack(model, eval_dataset, 1, epochs_per_eval)
        callbacks.append(eval_cb)

    # start training
    model.train(
        config.epoch_size,  # pylint: disable=no-member
        train_dataset,
        callbacks=callbacks,
        sink_size=train_dataset.get_dataset_size(),
        dataset_sink_mode=False)

    if not ARGS_OPT.run_distribute:
        print(
            "***************** evaluation results of training process ***************** "
        )
        print(epochs_per_eval)
else: decay_params.append(x) return [{ 'params': no_decay_params, 'weight_decay': 0.0 }, { 'params': decay_params }] if cfg.is_dynamic_loss_scale: cfg.loss_scale = 1 opt = Momentum(params=get_param_groups(net), learning_rate=Tensor(lr), momentum=cfg.momentum, weight_decay=cfg.weight_decay, loss_scale=cfg.loss_scale) if not cfg.use_label_smooth: cfg.label_smooth_factor = 0.0 loss = CrossEntropySmooth(sparse=True, reduction="mean", smooth_factor=cfg.label_smooth_factor, num_classes=cfg.num_classes) if cfg.is_dynamic_loss_scale: loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536, scale_factor=2, scale_window=2000) else: loss_scale_manager = FixedLossScaleManager(
def train():
    """Train YOLOv3-DarkNet53 with data and checkpoints staged under ``/cache``.

    The function parses the command-line arguments, builds the network,
    optionally loads a pre-trained backbone and/or resumes from a YOLOv3
    checkpoint, copies the dataset from ``args.data_url`` to local storage,
    runs the training loop while driving ``ModelCheckpoint`` by hand, and
    finally copies the checkpoint directory to ``args.train_url``.

    Raises:
        NotImplementedError: if ``args.lr_scheduler`` is not one of the
            scheduler names handled below.
    """
    args = parse_args()

    # logger
    args.outputs_dir = os.path.join(
        args.ckpt_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)
    args.logger.save_args(args)

    loss_meter = AverageMeter('loss')

    network = YOLOV3DarkNet53(is_training=True)
    # default is kaiming-normal
    default_recursive_init(network)

    if args.pretrained_backbone:
        # Download the backbone checkpoint only when one was requested, so an
        # empty setting falls through to the "not loaded" branch instead of
        # attempting a copy from an empty source URL.
        backbone_ckpt_file = args.pretrained_backbone.rsplit('/', 1)[-1]
        local_backbone_ckpt_path = '/cache/' + backbone_ckpt_file
        mox.file.copy_parallel(src_url=args.pretrained_backbone,
                               dst_url=local_backbone_ckpt_path)
        network = load_backbone(network, local_backbone_ckpt_path, args)
        args.logger.info('load pre-trained backbone {} into network'.format(
            args.pretrained_backbone))
    else:
        args.logger.info('Not load pre-trained backbone, please be careful')

    if args.resume_yolov3:
        param_dict = load_checkpoint(args.resume_yolov3)
        param_dict_new = {}
        for key, values in param_dict.items():
            # Entries prefixed with 'moments.' are not restored.
            if key.startswith('moments.'):
                continue
            if key.startswith('yolo_network.'):
                # Drop the 13-character 'yolo_network.' prefix.
                param_dict_new[key[13:]] = values
            else:
                param_dict_new[key] = values
            args.logger.info('in resume {}'.format(key))

        args.logger.info('resume finished')
        load_param_into_net(network, param_dict_new)
        args.logger.info('load_model {} success'.format(args.resume_yolov3))

    network = YoloWithLossCell(network)
    args.logger.info('finish get network')

    config = ConfigYOLOV3DarkNet53()
    config.label_smooth = args.label_smooth
    config.label_smooth_factor = args.label_smooth_factor
    if args.training_shape:
        config.multi_scale = [convert_training_shape(args)]
    if args.resize_rate:
        config.resize_rate = args.resize_rate

    # data download
    local_data_path = '/cache/data'
    local_ckpt_path = '/cache/ckpt_file'
    print('Download data.')
    mox.file.copy_parallel(src_url=args.data_url, dst_url=local_data_path)

    ds, data_size = create_yolo_dataset(
        image_dir=os.path.join(local_data_path, 'images'),
        anno_path=os.path.join(local_data_path, 'annotation.json'),
        is_training=True,
        batch_size=args.per_batch_size,
        max_epoch=args.epoch_size,
        device_num=args.group_size,
        rank=args.rank,
        config=config)
    args.logger.info('Finish loading dataset')

    args.steps_per_epoch = int(data_size / args.per_batch_size /
                               args.group_size)

    if not args.ckpt_interval:
        # Default checkpoint interval: every 10 epochs' worth of steps.
        args.ckpt_interval = args.steps_per_epoch * 10

    # lr scheduler
    if args.lr_scheduler == 'exponential':
        lr = warmup_step_lr(
            args.lr,
            args.lr_epochs,
            args.steps_per_epoch,
            args.warmup_epochs,
            args.epoch_size,
            gamma=args.lr_gamma,
        )
    elif args.lr_scheduler == 'cosine_annealing':
        lr = warmup_cosine_annealing_lr(args.lr, args.steps_per_epoch,
                                        args.warmup_epochs, args.max_epoch,
                                        args.T_max, args.eta_min)
    elif args.lr_scheduler == 'cosine_annealing_V2':
        lr = warmup_cosine_annealing_lr_V2(args.lr, args.steps_per_epoch,
                                           args.warmup_epochs, args.max_epoch,
                                           args.T_max, args.eta_min)
    elif args.lr_scheduler == 'cosine_annealing_sample':
        lr = warmup_cosine_annealing_lr_sample(args.lr, args.steps_per_epoch,
                                               args.warmup_epochs,
                                               args.max_epoch, args.T_max,
                                               args.eta_min)
    else:
        raise NotImplementedError(args.lr_scheduler)

    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)

    network = TrainingWrapper(network, opt)
    network.set_train()

    # checkpoint save: the callback is driven manually from the loop below
    ckpt_max_num = 10
    ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval,
                                   keep_checkpoint_max=ckpt_max_num)
    ckpt_cb = ModelCheckpoint(config=ckpt_config,
                              directory=local_ckpt_path,
                              prefix='yolov3')
    cb_params = _InternalCallbackParam()
    cb_params.train_network = network
    cb_params.epoch_num = ckpt_max_num
    cb_params.cur_epoch_num = 1
    run_context = RunContext(cb_params)
    ckpt_cb.begin(run_context)

    old_progress = -1
    t_end = time.time()
    data_loader = ds.create_dict_iterator()

    shape_record = ShapeRecord()
    for i, data in enumerate(data_loader):
        images = data["image"]
        input_shape = images.shape[2:4]
        shape_record.set(input_shape)

        images = Tensor(images)
        annos = data["annotation"]
        if args.group_size == 1:
            batch_y_true_0, batch_y_true_1, batch_y_true_2, batch_gt_box0, batch_gt_box1, batch_gt_box2 = \
                batch_preprocess_true_box(annos, config, input_shape)
        else:
            batch_y_true_0, batch_y_true_1, batch_y_true_2, batch_gt_box0, batch_gt_box1, batch_gt_box2 = \
                batch_preprocess_true_box_single(annos, config, input_shape)

        batch_y_true_0 = Tensor(batch_y_true_0)
        batch_y_true_1 = Tensor(batch_y_true_1)
        batch_y_true_2 = Tensor(batch_y_true_2)
        batch_gt_box0 = Tensor(batch_gt_box0)
        batch_gt_box1 = Tensor(batch_gt_box1)
        batch_gt_box2 = Tensor(batch_gt_box2)

        # The two spatial dimensions are passed to the network in reversed order.
        input_shape = Tensor(tuple(input_shape[::-1]), ms.float32)
        loss = network(images, batch_y_true_0, batch_y_true_1, batch_y_true_2,
                       batch_gt_box0, batch_gt_box1, batch_gt_box2,
                       input_shape)
        loss_meter.update(loss.asnumpy())

        # ckpt progress
        cb_params.cur_step_num = i + 1  # current step number
        cb_params.batch_num = i + 2
        ckpt_cb.step_end(run_context)

        if i % args.log_interval == 0:
            time_used = time.time() - t_end
            epoch = int(i / args.steps_per_epoch)
            # Throughput over the steps run since the previous log line.
            fps = args.per_batch_size * (
                i - old_progress) * args.group_size / time_used
            if args.rank == 0:
                args.logger.info(
                    'epoch[{}], iter[{}], {}, {:.2f} imgs/sec, lr:{}'.format(
                        epoch, i, loss_meter, fps, lr[i]))
            t_end = time.time()
            loss_meter.reset()
            old_progress = i

        if (i + 1) % args.steps_per_epoch == 0:
            cb_params.cur_epoch_num += 1

    args.logger.info('==========end training===============')

    # upload checkpoint files
    print('Upload checkpoint.')
    mox.file.copy_parallel(src_url=local_ckpt_path, dst_url=args.train_url)
def example_lenet_mnist_fuzzing():
    """
    An example of fuzz testing and then enhance the non-robustness model.

    Steps performed:
      1. Load a trained LeNet5 and record coverage metrics on MNIST test data.
      2. Fuzz the model to generate mutated samples and their labels.
      3. Split the generated samples into disjoint train and test parts.
      4. Measure a second LeNet5 checkpoint on the generated test part,
         run ``AdversarialDefense.batch_defense`` on the generated train
         part, then measure the same network again.
    """
    # upload trained network
    ckpt_path = '../common/networks/lenet5/trained_ckpt_file/lenet_m1-10_1250.ckpt'
    net = LeNet5()
    load_dict = load_checkpoint(ckpt_path)
    load_param_into_net(net, load_dict)
    model = Model(net)
    mutate_config = [{'method': 'Blur',
                      'params': {'auto_param': [True]}},
                     {'method': 'Contrast',
                      'params': {'auto_param': [True]}},
                     {'method': 'Translate',
                      'params': {'auto_param': [True]}},
                     {'method': 'Brightness',
                      'params': {'auto_param': [True]}},
                     {'method': 'Noise',
                      'params': {'auto_param': [True]}},
                     {'method': 'Scale',
                      'params': {'auto_param': [True]}},
                     {'method': 'Shear',
                      'params': {'auto_param': [True]}},
                     {'method': 'FGSM',
                      'params': {'eps': [0.3, 0.2, 0.4], 'alpha': [0.1]}}
                     ]

    # get training data
    data_list = "../common/dataset/MNIST/train"
    batch_size = 32
    ds = generate_mnist_dataset(data_list, batch_size, sparse=False)
    train_images = np.concatenate(
        [data[0].astype(np.float32)
         for data in ds.create_tuple_iterator(output_numpy=True)],
        axis=0)

    # initialize fuzz test with training dataset
    model_coverage_test = ModelCoverageMetrics(model, 10, 1000, train_images)

    # fuzz test with original test data
    # get test data
    data_list = "../common/dataset/MNIST/test"
    batch_size = 32
    init_samples = 5000
    max_iters = 50000
    mutate_num_per_seed = 10
    ds = generate_mnist_dataset(data_list, batch_size, num_samples=init_samples,
                                sparse=False)
    test_images = []
    test_labels = []
    for data in ds.create_tuple_iterator(output_numpy=True):
        test_images.append(data[0].astype(np.float32))
        test_labels.append(data[1])
    test_images = np.concatenate(test_images, axis=0)
    test_labels = np.concatenate(test_labels, axis=0)

    # make initial seeds: one [image, label] pair per test sample
    initial_seeds = [[img, label]
                     for img, label in zip(test_images, test_labels)]

    # Coverage before fuzzing is computed on the first 100 test images.
    model_coverage_test.calculate_coverage(
        np.array(test_images[:100]).astype(np.float32))
    LOGGER.info(TAG, 'KMNC of test dataset before fuzzing is : %s',
                model_coverage_test.get_kmnc())
    LOGGER.info(TAG, 'NBC of test dataset before fuzzing is : %s',
                model_coverage_test.get_nbc())
    LOGGER.info(TAG, 'SNAC of test dataset before fuzzing is : %s',
                model_coverage_test.get_snac())

    model_fuzz_test = Fuzzer(model, train_images, 10, 1000)
    gen_samples, gt, _, _, metrics = model_fuzz_test.fuzzing(mutate_config,
                                                             initial_seeds,
                                                             eval_metrics='auto',
                                                             max_iters=max_iters,
                                                             mutate_num_per_seed=mutate_num_per_seed)

    if metrics:
        for key in metrics:
            LOGGER.info(TAG, key + ': %s', metrics[key])

    def split_dataset(image, label, proportion):
        """
        Split the generated fuzz data into train and test set.

        Args:
            image: sequence of generated samples.
            label: sequence of labels aligned with ``image``.
            proportion: fraction of the shuffled samples assigned to the
                train set; the remainder forms the test set.

        Returns:
            tuple of four lists: train_image, train_label, test_image,
            test_label.
        """
        indices = np.arange(len(image))
        random.shuffle(indices)
        train_length = int(len(image) * proportion)
        train_indices = indices[:train_length]
        # The test split must be the remainder; reusing the train slice
        # would evaluate on exactly the samples used for enhancement.
        test_indices = indices[train_length:]
        train_image = [image[i] for i in train_indices]
        train_label = [label[i] for i in train_indices]
        test_image = [image[i] for i in test_indices]
        test_label = [label[i] for i in test_indices]
        return train_image, train_label, test_image, test_label

    train_image, train_label, test_image, test_label = split_dataset(
        gen_samples, gt, 0.7)

    # load model B and test it on the test set
    ckpt_path = '../common/networks/lenet5/trained_ckpt_file/lenet_m2-10_1250.ckpt'
    net = LeNet5()
    load_dict = load_checkpoint(ckpt_path)
    load_param_into_net(net, load_dict)
    model_b = Model(net)
    pred_b = model_b.predict(Tensor(test_image, dtype=mindspore.float32)).asnumpy()
    acc_b = np.sum(np.argmax(pred_b, axis=1) == np.argmax(test_label, axis=1)) / len(test_label)
    print('Accuracy of model B on test set is ', acc_b)

    # enhance model robustness
    lr = 0.001
    momentum = 0.9
    # The keyword is lower-case ``sparse``; the labels passed to
    # ``batch_defense`` below are class indices obtained via argmax.
    loss_fn = SoftmaxCrossEntropyWithLogits(sparse=True)
    optimizer = Momentum(net.trainable_params(), lr, momentum)

    adv_defense = AdversarialDefense(net, loss_fn, optimizer)
    adv_defense.batch_defense(np.array(train_image).astype(np.float32),
                              np.argmax(train_label, axis=1).astype(np.int32))
    preds_en = net(Tensor(test_image, dtype=mindspore.float32)).asnumpy()
    acc_en = np.sum(np.argmax(preds_en, axis=1) == np.argmax(test_label, axis=1)) / len(test_label)
    print('Accuracy of enhensed model on test set is ', acc_en)
def resnet50_train(args_opt):
    """Train ResNet-50 on Ascend with mixed precision, then evaluate it.

    Args:
        args_opt: Parsed options; ``epoch_size`` and ``data_url`` are read.

    The module-level ``device_id`` and ``device_num`` select the device and
    decide whether data-parallel mode is configured. Evaluation runs when
    there is a single device or on device 0.
    """
    epoch_size = args_opt.epoch_size
    batch_size = 32
    class_num = 10
    loss_scale_num = 1024
    local_data_path = '/cache/data'

    # Graph mode on Ascend; data-parallel setup only for multi-device runs,
    # where each device also gets its own data sub-directory.
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        save_graphs=False)
    context.set_context(device_id=device_id)
    if device_num > 1:
        context.set_auto_parallel_context(
            device_num=device_num,
            parallel_mode=ParallelMode.DATA_PARALLEL,
            mirror_mean=True)
        init()
        local_data_path = os.path.join(local_data_path, str(device_id))

    # Copy the dataset from the remote URL to local storage.
    print('Download data.')
    mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path)

    # Build the train and evaluation datasets from the same local path.
    print('Create train and evaluate dataset.')

    def _load_split(is_train):
        return create_dataset(dataset_path=local_data_path,
                              do_train=is_train,
                              repeat_num=1,
                              batch_size=batch_size)

    train_dataset = _load_split(True)
    eval_dataset = _load_split(False)
    train_step_size = train_dataset.get_dataset_size()
    print('Create dataset success.')

    # Network, mean-reduced sparse cross-entropy, schedule and optimizer.
    net = resnet50(class_num=class_num)
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    schedule = get_lr(global_step=0,
                      total_epochs=epoch_size,
                      steps_per_epoch=train_step_size)
    lr = Tensor(schedule)
    opt = Momentum(net.trainable_params(),
                   lr,
                   momentum=0.9,
                   weight_decay=1e-4,
                   loss_scale=loss_scale_num)
    loss_scale = FixedLossScaleManager(loss_scale_num, False)

    # amp_level "O2" with keep_batchnorm_fp32=False, using the fixed loss
    # scale manager created above.
    model = Model(net,
                  amp_level="O2",
                  keep_batchnorm_fp32=False,
                  loss_fn=loss,
                  optimizer=opt,
                  loss_scale_manager=loss_scale,
                  metrics={'acc'})

    # Performance and loss callbacks.
    cb = [PerformanceCallback(batch_size), LossMonitor()]

    print(f'Start run training, total epoch: {epoch_size}.')
    model.train(epoch_size, train_dataset, callbacks=cb)

    # Only a single-device run, or device 0 of a multi-device run, evaluates.
    if device_num != 1 and device_id != 0:
        return
    print(f'Start run evaluation.')
    output = model.eval(eval_dataset)
    print(f'Evaluation result: {output}.')
batch_num = dataset.get_dataset_size() net = GoogleNet(num_classes=cfg.num_classes) # Continue training if set pre_trained to be True if cfg.pre_trained: param_dict = load_checkpoint(cfg.checkpoint_path) load_param_into_net(net, param_dict) loss_scale_manager = None if args_opt.dataset_name == 'cifar10': lr = lr_steps_cifar10(0, lr_max=cfg.lr_init, total_epochs=cfg.epoch_size, steps_per_epoch=batch_num) opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=Tensor(lr), momentum=cfg.momentum, weight_decay=cfg.weight_decay) loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') elif args_opt.dataset_name == 'imagenet': lr = lr_steps_imagenet(cfg, batch_num) def get_param_groups(network): """ get param groups """ decay_params = [] no_decay_params = [] for x in network.trainable_params(): parameter_name = x.name if parameter_name.endswith('.bias'): # all bias not using weight decay # print('no decay:{}'.format(parameter_name))
def resnet50_train(args_opt):
    """Train ResNet-50 in graph mode and evaluate the result.

    Args:
        args_opt: Parsed options; ``epoch_size`` and ``data_url`` are read.

    The module-level ``device_num`` and ``device_id`` decide whether
    data-parallel mode is configured and which device runs evaluation.
    """
    epoch_size = args_opt.epoch_size
    batch_size = 32
    class_num = 10
    loss_scale_num = 1024
    local_data_path = '/cache/data'

    # Graph mode; multi-device runs use data-parallel mode and a per-device
    # data sub-directory.
    context.set_context(mode=context.GRAPH_MODE)
    if device_num > 1:
        context.set_auto_parallel_context(
            device_num=device_num,
            parallel_mode=ParallelMode.DATA_PARALLEL,
            mirror_mean=True)
        local_data_path = os.path.join(local_data_path, str(device_id))

    # Copy the dataset from the remote URL to local storage.
    print('Download data.')
    mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path)

    # The training dataset is repeated ``epoch_size`` times; the evaluation
    # dataset is used once.
    print('Create train and evaluate dataset.')

    def _load_split(is_train, repeats):
        return create_dataset(dataset_path=local_data_path,
                              do_train=is_train,
                              repeat_num=repeats,
                              batch_size=batch_size)

    train_dataset = _load_split(True, epoch_size)
    eval_dataset = _load_split(False, 1)
    train_step_size = train_dataset.get_dataset_size()
    print('Create dataset success.')

    # Network, loss, learning-rate schedule, optimizer and fixed loss scale.
    net = resnet50(class_num=class_num)
    loss = SoftmaxCrossEntropyWithLogits(sparse=True)
    schedule = get_lr(global_step=0,
                      total_epochs=epoch_size,
                      steps_per_epoch=train_step_size)
    lr = Tensor(schedule)
    opt = Momentum(net.trainable_params(),
                   lr,
                   momentum=0.9,
                   weight_decay=1e-4,
                   loss_scale=loss_scale_num)
    loss_scale = FixedLossScaleManager(loss_scale_num, False)

    model = Model(net,
                  loss_fn=loss,
                  optimizer=opt,
                  loss_scale_manager=loss_scale,
                  metrics={'acc'})

    # Performance and loss callbacks.
    cb = [PerformanceCallback(batch_size), LossMonitor()]

    print(f'Start run training, total epoch: {epoch_size}.')
    model.train(epoch_size, train_dataset, callbacks=cb)

    # Only a single-device run, or device 0 of a multi-device run, evaluates.
    if device_num != 1 and device_id != 0:
        return
    print(f'Start run evaluation.')
    output = model.eval(eval_dataset)
    print(f'Evaluation result: {output}.')
def train():
    """Train YOLOv3-DarkNet53 with a hand-driven checkpoint callback.

    Arguments come from ``parse_args()``. The network is built, wrapped with
    its loss cell, and trained by iterating the dataset directly. On GPU the
    train network is built through ``amp.build_train_network`` with a fixed
    loss scale of 1.0; otherwise ``TrainingWrapper`` is used with
    ``args.loss_scale`` as ``sens``. When ``args.need_profiler`` is set the
    loop stops after iteration 10 and the profiler results are analysed.
    """
    args = parse_args()
    profiler = network_init(args)

    loss_meter = AverageMeter('loss')
    parallel_init(args)

    network = YOLOV3DarkNet53(is_training=True)
    # default is kaiming-normal
    default_recurisive_init(network)
    load_yolov3_params(args, network)

    network = YoloWithLossCell(network)
    args.logger.info('finish get network')

    config = ConfigYOLOV3DarkNet53()
    config.label_smooth = args.label_smooth
    config.label_smooth_factor = args.label_smooth_factor

    if args.training_shape:
        config.multi_scale = [conver_training_shape(args)]
    if args.resize_rate:
        config.resize_rate = args.resize_rate

    ds, data_size = create_yolo_dataset(image_dir=args.data_root,
                                        anno_path=args.annFile,
                                        is_training=True,
                                        batch_size=args.per_batch_size,
                                        max_epoch=args.max_epoch,
                                        device_num=args.group_size,
                                        rank=args.rank,
                                        config=config)
    args.logger.info('Finish loading dataset')

    args.steps_per_epoch = int(data_size / args.per_batch_size /
                               args.group_size)

    if not args.ckpt_interval:
        # Default: checkpoint once per epoch.
        args.ckpt_interval = args.steps_per_epoch

    lr = get_lr(args)

    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)
    is_gpu = context.get_context("device_target") == "GPU"
    if is_gpu:
        loss_scale_value = 1.0
        loss_scale = FixedLossScaleManager(loss_scale_value,
                                           drop_overflow_update=False)
        network = amp.build_train_network(network,
                                          optimizer=opt,
                                          loss_scale_manager=loss_scale,
                                          level="O2",
                                          keep_batchnorm_fp32=False)
        keep_loss_fp32(network)
    else:
        network = TrainingWrapper(network, opt, sens=args.loss_scale)
        network.set_train()

    if args.rank_save_ckpt_flag:
        # checkpoint save
        ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
        ckpt_config = CheckpointConfig(
            save_checkpoint_steps=args.ckpt_interval,
            keep_checkpoint_max=ckpt_max_num)
        save_ckpt_path = os.path.join(args.outputs_dir,
                                      'ckpt_' + str(args.rank) + '/')
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=save_ckpt_path,
                                  prefix='{}'.format(args.rank))
        cb_params = _InternalCallbackParam()
        cb_params.train_network = network
        cb_params.epoch_num = ckpt_max_num
        cb_params.cur_epoch_num = 1
        run_context = RunContext(cb_params)
        ckpt_cb.begin(run_context)

    old_progress = -1
    t_end = time.time()
    data_loader = ds.create_dict_iterator(output_numpy=True, num_epochs=1)

    for i, data in enumerate(data_loader):
        images = data["image"]
        input_shape = images.shape[2:4]
        args.logger.info('iter[{}], shape{}'.format(i, input_shape[0]))

        images = Tensor.from_numpy(images)

        batch_y_true_0 = Tensor.from_numpy(data['bbox1'])
        batch_y_true_1 = Tensor.from_numpy(data['bbox2'])
        batch_y_true_2 = Tensor.from_numpy(data['bbox3'])
        batch_gt_box0 = Tensor.from_numpy(data['gt_box1'])
        batch_gt_box1 = Tensor.from_numpy(data['gt_box2'])
        batch_gt_box2 = Tensor.from_numpy(data['gt_box3'])

        loss = network(images, batch_y_true_0, batch_y_true_1, batch_y_true_2,
                       batch_gt_box0, batch_gt_box1, batch_gt_box2)
        loss_meter.update(loss.asnumpy())

        if args.rank_save_ckpt_flag:
            # ckpt progress
            cb_params.cur_step_num = i + 1  # current step number
            cb_params.batch_num = i + 2
            ckpt_cb.step_end(run_context)

        if i % args.log_interval == 0:
            time_used = time.time() - t_end
            epoch = int(i / args.steps_per_epoch)
            # Number of steps actually covered by ``time_used``. This is 1
            # for the first log line and ``args.log_interval`` afterwards, so
            # the per-step time and the throughput use the same step count.
            steps_since_log = i - old_progress
            per_step_time = time_used / steps_since_log
            fps = args.per_batch_size * steps_since_log * args.group_size / time_used
            if args.rank == 0:
                args.logger.info(
                    'epoch[{}], iter[{}], {}, {:.2f} imgs/sec, lr:{},'
                    ' per_step_time:{}'.format(epoch, i, loss_meter, fps,
                                               lr[i], per_step_time))
            t_end = time.time()
            loss_meter.reset()
            old_progress = i

        if (i + 1) % args.steps_per_epoch == 0 and args.rank_save_ckpt_flag:
            cb_params.cur_epoch_num += 1

        if args.need_profiler:
            # Profile only the first iterations, then stop training.
            if i == 10:
                profiler.analyse()
                break

    args.logger.info('==========end training===============')
def train():
    """Train the (optionally quantization-aware) YOLOv3-DarkNet53 network.

    Arguments come from ``parse_args()``. When ``config.quantization_aware``
    is set, the network is converted with ``quant.convert_quant_network``
    before being wrapped in the loss cell. The dataset is iterated directly
    and, on ranks flagged by ``args.rank_save_ckpt_flag``, the checkpoint
    callback is stepped by hand. With ``args.need_profiler`` the loop stops
    after iteration 10.
    """
    args = parse_args()
    args.logger.save_args(args)

    if args.need_profiler:
        from mindspore.profiler.profiling import Profiler
        profiler = Profiler(output_path=args.outputs_dir,
                            is_detail=True,
                            is_show_op_path=True)

    loss_meter = AverageMeter('loss')

    # Parallel context: data-parallel across the group when distributed,
    # stand-alone on a single device otherwise.
    context.reset_auto_parallel_context()
    if args.is_distributed:
        parallel_mode = ParallelMode.DATA_PARALLEL
        degree = get_group_size()
    else:
        parallel_mode = ParallelMode.STAND_ALONE
        degree = 1
    context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                      gradients_mean=True,
                                      device_num=degree)

    network = YOLOV3DarkNet53(is_training=True)
    # default is kaiming-normal
    default_recurisive_init(network)
    load_yolov3_quant_params(args, network)

    config = ConfigYOLOV3DarkNet53()
    # convert fusion network to quantization aware network
    if config.quantization_aware:
        network = quant.convert_quant_network(network,
                                              bn_fold=True,
                                              per_channel=[True, False],
                                              symmetric=[True, False])

    network = YoloWithLossCell(network)
    args.logger.info('finish get network')

    config.label_smooth = args.label_smooth
    config.label_smooth_factor = args.label_smooth_factor
    if args.training_shape:
        config.multi_scale = [conver_training_shape(args)]
    if args.resize_rate:
        config.resize_rate = args.resize_rate

    ds, data_size = create_yolo_dataset(image_dir=args.data_root,
                                        anno_path=args.annFile,
                                        is_training=True,
                                        batch_size=args.per_batch_size,
                                        max_epoch=args.max_epoch,
                                        device_num=args.group_size,
                                        rank=args.rank,
                                        config=config)
    args.logger.info('Finish loading dataset')

    args.steps_per_epoch = int(data_size / args.per_batch_size /
                               args.group_size)
    if not args.ckpt_interval:
        args.ckpt_interval = args.steps_per_epoch

    lr = get_lr(args)
    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)

    network = TrainingWrapper(network, opt)
    network.set_train()

    if args.rank_save_ckpt_flag:
        # checkpoint save
        ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
        ckpt_config = CheckpointConfig(
            save_checkpoint_steps=args.ckpt_interval,
            keep_checkpoint_max=ckpt_max_num)
        save_ckpt_path = os.path.join(args.outputs_dir,
                                      'ckpt_' + str(args.rank) + '/')
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=save_ckpt_path,
                                  prefix='{}'.format(args.rank))
        cb_params = _InternalCallbackParam()
        cb_params.train_network = network
        cb_params.epoch_num = ckpt_max_num
        cb_params.cur_epoch_num = 1
        run_context = RunContext(cb_params)
        ckpt_cb.begin(run_context)

    last_logged_step = -1
    window_start = time.time()
    data_loader = ds.create_dict_iterator(output_numpy=True, num_epochs=1)

    shape_record = ShapeRecord()
    for step, batch in enumerate(data_loader):
        images = batch["image"]
        input_shape = images.shape[2:4]
        args.logger.info('iter[{}], shape{}'.format(step, input_shape[0]))
        shape_record.set(input_shape)

        images = Tensor.from_numpy(images)
        annos = batch["annotation"]

        # Pick the target-building routine by group size, then convert the
        # six resulting arrays to tensors in order.
        if args.group_size == 1:
            build_targets = batch_preprocess_true_box
        else:
            build_targets = batch_preprocess_true_box_single
        y_true_0, y_true_1, y_true_2, gt_box0, gt_box1, gt_box2 = \
            build_targets(annos, config, input_shape)
        targets = [
            Tensor.from_numpy(array)
            for array in (y_true_0, y_true_1, y_true_2,
                          gt_box0, gt_box1, gt_box2)
        ]

        # The two spatial dimensions are passed to the network reversed.
        input_shape = Tensor(tuple(input_shape[::-1]), ms.float32)
        loss = network(images, *targets, input_shape)
        loss_meter.update(loss.asnumpy())

        if args.rank_save_ckpt_flag:
            # ckpt progress
            cb_params.cur_step_num = step + 1  # current step number
            cb_params.batch_num = step + 2
            ckpt_cb.step_end(run_context)

        if step % args.log_interval == 0:
            time_used = time.time() - window_start
            epoch = int(step / args.steps_per_epoch)
            fps = args.per_batch_size * (
                step - last_logged_step) * args.group_size / time_used
            if args.rank == 0:
                args.logger.info(
                    'epoch[{}], iter[{}], {}, {:.2f} imgs/sec, lr:{}'.format(
                        epoch, step, loss_meter, fps, lr[step]))
            window_start = time.time()
            loss_meter.reset()
            last_logged_step = step

        if (step + 1) % args.steps_per_epoch == 0 and args.rank_save_ckpt_flag:
            cb_params.cur_epoch_num += 1

        if args.need_profiler:
            if step == 10:
                profiler.analyse()
                break

    args.logger.info('==========end training===============')
if args_opt.resume: ckpt = load_checkpoint(args_opt.resume) load_param_into_net(net, ckpt) # get learning rate loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) lr = Tensor(get_lr(lr_init=config.lr_init, lr_end=config.lr_end, lr_max=config.lr_max, warmup_epochs=config.warmup_epochs, total_epochs=config.epoch_size, steps_per_epoch=step_size, lr_decay_mode=config.lr_decay_mode)) # define optimization opt = Momentum(net.trainable_params(), lr, config.momentum, config.weight_decay, config.loss_scale) # define model model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}, amp_level='O3', keep_batchnorm_fp32=True) # define callbacks cb = [Monitor(lr_init=lr.asnumpy())] if config.save_checkpoint: save_ckpt_path = os.path.join(config.save_checkpoint_path, 'model_' + str(rank) + '/') config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * step_size, keep_checkpoint_max=config.keep_checkpoint_max) ckpt_cb = ModelCheckpoint(f"Xception-rank{rank}", directory=save_ckpt_path, config=config_ck) # begin train if args_opt.is_distributed:
def train():
    """Set up the device context and train YOLOv3-DarkNet53.

    Arguments come from ``parse_args()``. The function configures graph mode
    for ``args.device_target``, initialises distributed communication when
    requested, decides which ranks save checkpoints, builds the dataset and
    optimizer, and iterates the dataset directly. With ``args.need_profiler``
    the loop stops after iteration 10.
    """
    args = parse_args()

    devid = int(os.getenv('DEVICE_ID', '0'))
    context.set_context(mode=context.GRAPH_MODE,
                        enable_auto_mixed_precision=True,
                        device_target=args.device_target,
                        save_graphs=True,
                        device_id=devid)

    # init distributed
    if args.is_distributed:
        if args.device_target == "Ascend":
            init()
        else:
            init("nccl")
        args.rank = get_rank()
        args.group_size = get_group_size()

    # Every rank saves checkpoints unless saving is restricted to the
    # master, in which case only rank 0 does.
    args.rank_save_ckpt_flag = int(not args.is_save_on_master
                                   or args.rank == 0)

    # logger
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S')
    args.outputs_dir = os.path.join(args.ckpt_path, timestamp)
    args.logger = get_logger(args.outputs_dir, args.rank)
    args.logger.save_args(args)

    if args.need_profiler:
        from mindspore.profiler.profiling import Profiler
        profiler = Profiler(output_path=args.outputs_dir,
                            is_detail=True,
                            is_show_op_path=True)

    loss_meter = AverageMeter('loss')

    # Parallel context: data-parallel across the group when distributed,
    # stand-alone on a single device otherwise.
    context.reset_auto_parallel_context()
    if args.is_distributed:
        parallel_mode = ParallelMode.DATA_PARALLEL
        degree = get_group_size()
    else:
        parallel_mode = ParallelMode.STAND_ALONE
        degree = 1
    context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                      gradients_mean=True,
                                      device_num=degree)

    network = YOLOV3DarkNet53(is_training=True)
    # default is kaiming-normal
    default_recurisive_init(network)
    load_yolov3_params(args, network)

    network = YoloWithLossCell(network)
    args.logger.info('finish get network')

    config = ConfigYOLOV3DarkNet53()
    config.label_smooth = args.label_smooth
    config.label_smooth_factor = args.label_smooth_factor
    if args.training_shape:
        config.multi_scale = [conver_training_shape(args)]
    if args.resize_rate:
        config.resize_rate = args.resize_rate

    ds, data_size = create_yolo_dataset(image_dir=args.data_root,
                                        anno_path=args.annFile,
                                        is_training=True,
                                        batch_size=args.per_batch_size,
                                        max_epoch=args.max_epoch,
                                        device_num=args.group_size,
                                        rank=args.rank,
                                        config=config)
    args.logger.info('Finish loading dataset')

    args.steps_per_epoch = int(data_size / args.per_batch_size /
                               args.group_size)
    if not args.ckpt_interval:
        args.ckpt_interval = args.steps_per_epoch

    lr = get_lr(args)
    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)

    # GPU builds the train network through amp with a fixed loss scale of
    # 1.0; other targets use TrainingWrapper with the configured scale.
    is_gpu = context.get_context("device_target") == "GPU"
    if is_gpu:
        loss_scale_value = 1.0
        loss_scale = FixedLossScaleManager(loss_scale_value,
                                           drop_overflow_update=False)
        network = amp.build_train_network(network,
                                          optimizer=opt,
                                          loss_scale_manager=loss_scale,
                                          level="O2",
                                          keep_batchnorm_fp32=False)
        keep_loss_fp32(network)
    else:
        network = TrainingWrapper(network, opt, sens=args.loss_scale)
        network.set_train()

    if args.rank_save_ckpt_flag:
        # checkpoint save
        ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
        ckpt_config = CheckpointConfig(
            save_checkpoint_steps=args.ckpt_interval,
            keep_checkpoint_max=ckpt_max_num)
        save_ckpt_path = os.path.join(args.outputs_dir,
                                      'ckpt_' + str(args.rank) + '/')
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=save_ckpt_path,
                                  prefix='{}'.format(args.rank))
        cb_params = _InternalCallbackParam()
        cb_params.train_network = network
        cb_params.epoch_num = ckpt_max_num
        cb_params.cur_epoch_num = 1
        run_context = RunContext(cb_params)
        ckpt_cb.begin(run_context)

    last_logged_step = -1
    window_start = time.time()
    data_loader = ds.create_dict_iterator(output_numpy=True, num_epochs=1)

    target_keys = ('bbox1', 'bbox2', 'bbox3', 'gt_box1', 'gt_box2', 'gt_box3')
    for step, batch in enumerate(data_loader):
        images = batch["image"]
        input_shape = images.shape[2:4]
        args.logger.info('iter[{}], shape{}'.format(step, input_shape[0]))

        images = Tensor.from_numpy(images)
        # Convert the six target arrays to tensors in the order the network
        # expects them.
        targets = [Tensor.from_numpy(batch[key]) for key in target_keys]

        loss = network(images, *targets)
        loss_meter.update(loss.asnumpy())

        if args.rank_save_ckpt_flag:
            # ckpt progress
            cb_params.cur_step_num = step + 1  # current step number
            cb_params.batch_num = step + 2
            ckpt_cb.step_end(run_context)

        if step % args.log_interval == 0:
            time_used = time.time() - window_start
            epoch = int(step / args.steps_per_epoch)
            fps = args.per_batch_size * (
                step - last_logged_step) * args.group_size / time_used
            if args.rank == 0:
                args.logger.info(
                    'epoch[{}], iter[{}], {}, {:.2f} imgs/sec, lr:{}'.format(
                        epoch, step, loss_meter, fps, lr[step]))
            window_start = time.time()
            loss_meter.reset()
            last_logged_step = step

        if (step + 1) % args.steps_per_epoch == 0 and args.rank_save_ckpt_flag:
            cb_params.cur_epoch_num += 1

        if args.need_profiler:
            if step == 10:
                profiler.analyse()
                break

    args.logger.info('==========end training===============')
for para in train_net.trainable_params(): if fix_weight_layer in para.name: para.requires_grad = False if __name__ == "__main__": start_time = time.time() epoch_size = 3 args_opt.base_size = config.crop_size args_opt.crop_size = config.crop_size train_dataset = create_dataset(args_opt, args_opt.data_url, epoch_size, config.batch_size, usage="train", shuffle=False) dataset_size = train_dataset.get_dataset_size() callback = LossCallBack(dataset_size) net = deeplabv3_resnet50(config.seg_num_classes, [config.batch_size, 3, args_opt.crop_size, args_opt.crop_size], infer_scale_sizes=config.eval_scales, atrous_rates=config.atrous_rates, decoder_output_stride=config.decoder_output_stride, output_stride=config.output_stride, fine_tune_batch_norm=config.fine_tune_batch_norm, image_pyramid=config.image_pyramid) net.set_train() model_fine_tune(args_opt, net, 'layer') loss = OhemLoss(config.seg_num_classes, config.ignore_label) opt = Momentum(filter(lambda x: 'beta' not in x.name and 'gamma' not in x.name and 'depth' not in x.name and 'bias' not in x.name, net.trainable_params()), learning_rate=config.learning_rate, momentum=config.momentum, weight_decay=config.weight_decay) model = Model(net, loss, opt) model.train(epoch_size, train_dataset, callback) print(time.time() - start_time) print("expect loss: ", callback.loss / 3) print("expect time: ", callback.time) expect_loss = 0.5 expect_time = 35 assert callback.loss.asnumpy() / 3 <= expect_loss assert callback.time <= expect_time
def test_yolov3_darknet53():
    """Train YOLOv3-DarkNet53 for three epochs on Ascend and check loss and speed.

    The dataset is read from ``DATA_DIR`` (the ``train2014`` image folder plus
    ``annotations/instances_train2014.json``) and limited to 256 samples.
    After training, the first number parsed from ``str(loss_meter)`` must be
    below 3210.0 and the duration of the last completed epoch below 20.0
    seconds.

    Raises:
        KeyError: if ``DATA_DIR`` is not an existing directory.
        AssertionError: if the annotation file is missing or one of the two
            thresholds is not met.
    """
    # Fixed single-device training settings.
    rank = 0
    device_num = 1
    group_size = 1
    lr_init = 0.001
    epoch_size = 3
    batch_size = 32
    loss_scale = 1024
    num_samples = 256
    label_keys = ('bbox1', 'bbox2', 'bbox3', 'gt_box1', 'gt_box2', 'gt_box3')

    device_id_env = os.getenv('DEVICE_ID')
    context.set_context(mode=context.GRAPH_MODE,
                        enable_auto_mixed_precision=True,
                        device_target="Ascend",
                        device_id=int(device_id_env) if device_id_env else 0)

    # Locate the image folder and the annotation file below DATA_DIR.
    if not os.path.isdir(DATA_DIR):
        raise KeyError("mindrecord path is not exist.")
    image_dir = os.path.join(DATA_DIR, 'train2014')
    anno_file = os.path.join(DATA_DIR, 'annotations/instances_train2014.json')
    if not os.path.exists(anno_file):
        print("instances_train2014 file is not exist.")
        assert False

    loss_meter = AverageMeter('loss')
    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(parallel_mode=ParallelMode.STAND_ALONE,
                                      gradients_mean=True,
                                      device_num=1)

    # Build the network; the default initialisation is kaiming-normal.
    network = YOLOV3DarkNet53(is_training=True)
    default_recurisive_init(network)
    network = YoloWithLossCell(network)
    print('finish get network')

    config = ConfigYOLOV3DarkNet53()
    config.label_smooth = 0
    config.label_smooth_factor = 0.1

    print("Create dataset begin!")
    # Train on a single fixed 416x416 input shape.
    config.multi_scale = [[416, 416]]
    dataset, data_size = create_yolo_dataset(image_dir=image_dir,
                                             anno_path=anno_file,
                                             is_training=True,
                                             batch_size=batch_size,
                                             max_epoch=epoch_size,
                                             device_num=device_num,
                                             rank=rank,
                                             config=config,
                                             num_samples=num_samples)
    print("Create dataset done!")
    print("data_size:", data_size)
    steps_per_epoch = int(data_size / batch_size / group_size)
    print("steps_per_epoch:", steps_per_epoch)

    # Cosine-annealed learning rate without warm-up.
    lr = warmup_cosine_annealing_lr(lr_init, steps_per_epoch, 0., epoch_size, 1, 0)
    optimizer = Momentum(params=get_param_groups(network),
                         learning_rate=Tensor(lr),
                         momentum=0.9,
                         weight_decay=0.0005,
                         loss_scale=loss_scale)
    network = TrainingWrapper(network, optimizer)
    network.set_train()

    last_logged_step = -1
    t_end = time.time()
    data_loader = dataset.create_dict_iterator(output_numpy=True)
    train_starttime = time.time()
    time_used_per_epoch = 0
    print("time:", time.time())

    for step, batch in enumerate(data_loader):
        images = batch["image"]
        input_shape = images.shape[2:4]
        print('iter[{}], shape{}'.format(step, input_shape[0]))

        images = Tensor.from_numpy(images)
        targets = [Tensor.from_numpy(batch[key]) for key in label_keys]
        # The network receives the two spatial dims in reversed order.
        input_shape = Tensor(tuple(input_shape[::-1]), ms.float32)
        loss = network(images, *targets, input_shape)
        loss_meter.update(loss.asnumpy())

        # Everything below runs only on the last step of an epoch.
        if (step + 1) % steps_per_epoch != 0:
            continue

        time_used = time.time() - t_end
        epoch = int(step / steps_per_epoch)
        fps = batch_size * (step - last_logged_step) * group_size / time_used
        if rank == 0:
            print('epoch[{}], iter[{}], {}, {:.2f} imgs/sec, lr:{}, time_used:{}'
                  .format(epoch, step, loss_meter, fps, lr[step], time_used))
        t_end = time.time()
        # NOTE(review): the meter is reset here and parsed again after the
        # loop; if the iterator ends exactly on an epoch boundary the loss
        # check below reads a freshly reset meter - confirm this is intended.
        loss_meter.reset()
        last_logged_step = step
        time_used_per_epoch = time_used

    train_time_used = time.time() - train_starttime
    print('train_time_used:{}'.format(train_time_used))

    # Loss check: first number found in the meter's string representation.
    expect_loss_value = 3210.0
    loss_value = re.findall(r"\d+\.?\d*", str(loss_meter))
    print('loss_value:{}'.format(loss_value[0]))
    assert float(loss_value[0]) < expect_loss_value

    # Speed check: duration of the last completed epoch.
    expect_time_used = 20.0
    print('time_used_per_epoch:{}'.format(time_used_per_epoch))
    assert time_used_per_epoch < expect_time_used
    print('==========test case passed===========')
if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name: decayed_params.append(param) else: no_decayed_params.append(param) group_params = [{ 'params': decayed_params, 'weight_decay': 1e-4 }, { 'params': no_decayed_params }, { 'order_params': net.trainable_params() }] # define loss, model loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, 0.9, 1e-4, 1024) loss_scale = FixedLossScaleManager(1024, drop_overflow_update=False) model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}) # Mixed precision if compute_type == "fp16": model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'}, amp_level="O2", keep_batchnorm_fp32=False)
return loss_func if __name__ == '__main__': if not args_opt.do_eval and args_opt.run_distribute: context.set_auto_parallel_context( device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL, all_reduce_fusion_config=[140]) init() context.set_context(mode=context.GRAPH_MODE) epoch_size = args_opt.epoch_size net = resnet50(args_opt.batch_size, args_opt.num_classes) loss = CrossEntropyLoss() opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), 0.01, 0.9) model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'}) if args_opt.do_train: dataset = create_dataset(1) batch_num = dataset.get_dataset_size() config_ck = CheckpointConfig(save_checkpoint_steps=batch_num * 5, keep_checkpoint_max=10) ckpoint_cb = ModelCheckpoint(prefix="train_resnet_cifar10", directory="./", config=config_ck) time_cb = TimeMonitor(data_size=batch_num) loss_cb = LossMonitor() model.train(epoch_size, dataset,
def test_resnet50_quant():
    """Run quantization-aware training of ResNet-50 on Ascend and check the loss.

    Settings come from the module-level ``config_quant`` and the data from the
    module-level ``dataset_path``. The test passes when the mean of the step
    losses collected by ``Monitor`` is below 2.40.

    Note:
        ``config.label_smooth_factor`` is overwritten with 0.0 on the shared
        config object when ``config.use_label_smooth`` is falsy.
    """
    set_seed(1)
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")

    config = config_quant
    print("training configure: {}".format(config))
    epoch_size = config.epoch_size

    # Network under test, switched to training mode.
    network = resnet50_quant(class_num=config.class_num)
    network.set_train(True)

    # Loss; label smoothing is disabled through a zero factor.
    if not config.use_label_smooth:
        config.label_smooth_factor = 0.0
    loss_fn = CrossEntropy(smooth_factor=config.label_smooth_factor,
                           num_classes=config.class_num)

    # Dataset.
    train_data = create_dataset(dataset_path=dataset_path,
                                config=config,
                                repeat_num=1,
                                batch_size=config.batch_size)
    steps_per_epoch = train_data.get_dataset_size()

    # Convert the fusion network into a quantization-aware network.
    quantizer = QuantizationAwareTraining(bn_fold=True,
                                          per_channel=[True, False],
                                          symmetric=[True, False])
    network = quantizer.quantize(network)

    # Cosine learning-rate schedule.
    lr_schedule = get_lr(lr_init=config.lr_init,
                         lr_end=0.0,
                         lr_max=config.lr_max,
                         warmup_epochs=config.warmup_epochs,
                         total_epochs=config.epoch_size,
                         steps_per_epoch=steps_per_epoch,
                         lr_decay_mode='cosine')
    lr = Tensor(lr_schedule)

    # Optimizer over every parameter that requires a gradient.
    # NOTE(review): config.loss_scale is handed to the optimizer while the
    # Model below is built without a loss scale manager - confirm this
    # pairing is intended.
    trainable = filter(lambda param: param.requires_grad, network.get_parameters())
    optimizer = Momentum(trainable, lr, config.momentum,
                         config.weight_decay, config.loss_scale)

    model = Model(network, loss_fn=loss_fn, optimizer=optimizer)

    print("============== Starting Training ==============")
    monitor = Monitor(lr_init=lr.asnumpy(),
                      step_threshold=config.step_threshold)
    model.train(epoch_size, train_data, callbacks=[monitor],
                dataset_sink_mode=False)
    print("============== End Training ==============")

    expect_avg_step_loss = 2.40
    avg_step_loss = np.mean(np.array(monitor.losses))
    print("average step loss:{}".format(avg_step_loss))
    assert avg_step_loss < expect_avg_step_loss
device_num=args.group_size, rank=args.rank, config=config) args.logger.info('Finish loading dataset') args.steps_per_epoch = int(data_size / args.per_batch_size / args.group_size) if not args.ckpt_interval: args.ckpt_interval = args.steps_per_epoch lr = get_lr(args) opt = Momentum(params=get_param_groups(network), learning_rate=Tensor(lr), momentum=args.momentum, weight_decay=args.weight_decay, loss_scale=args.loss_scale) is_gpu = context.get_context("device_target") == "GPU" if is_gpu: loss_scale_value = 1.0 loss_scale = FixedLossScaleManager(loss_scale_value, drop_overflow_update=False) network = amp.build_train_network(network, optimizer=opt, loss_scale_manager=loss_scale, level="O2", keep_batchnorm_fp32=False) keep_loss_fp32(network) else: network = TrainingWrapper(network, opt)
ckpt = load_checkpoint(args_opt.resume) load_param_into_net(net, ckpt) # get learning rate lr = get_lr(lr_init=config.lr_init, lr_end=config.lr_end, lr_max=config.lr_max, warmup_epochs=config.warmup_epochs, total_epochs=config.epoch_size, steps_per_epoch=batches_per_epoch, lr_decay_mode=config.decay_method) lr = Tensor(lr) # define optimization optimizer = Momentum(params=net.trainable_params(), learning_rate=lr, momentum=config.momentum, weight_decay=config.weight_decay, loss_scale=config.loss_scale) # model loss_scale_manager = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) model = Model(net, loss_fn=loss, optimizer=optimizer, amp_level=config.amp_level, loss_scale_manager=loss_scale_manager) # define callbacks cb = [TimeMonitor(), LossMonitor()] if config.save_checkpoint:
def train():
    """Run the ResNet GPU benchmark training configured by ``args_opt``.

    Steps: read the command-line options, set up the MindSpore context
    (optionally data-parallel), build the dataset, network, learning-rate
    schedule, optimizer and model, then train with a timing callback and an
    optional checkpoint callback.

    With ``args_opt.dtype == "fp16"`` the model uses ``amp_level="O2"`` with a
    fixed loss scale of 1024, and the optimizer receives the same loss scale.
    Otherwise a plain model and optimizer are built.

    In graph mode training runs with ``sink_size=print_per_steps``; the number
    of "epochs" passed to ``model.train`` is scaled so that the total step
    count stays ``epoch_size * step_size``. In PyNative mode
    ``print_per_steps`` is forced to 1.
    """
    # set args
    dev = "GPU"
    epoch_size = int(args_opt.epoch_size)
    total_batch = int(args_opt.batch_size)
    print_per_steps = int(args_opt.print_per_steps)
    compute_type = str(args_opt.dtype).lower()
    ckpt_save_dir = str(args_opt.ckpt_path)
    save_ckpt = bool(args_opt.save_ckpt)
    device_num = 1

    # init context
    if args_opt.mode == "GRAPH":
        mode = context.GRAPH_MODE
    else:
        mode = context.PYNATIVE_MODE
    context.set_context(mode=mode, device_target=dev, save_graphs=False)
    if args_opt.run_distribute:
        init()
        device_num = get_group_size()
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True,
                                          all_reduce_fusion_config=[85, 160])
        # Each rank writes its checkpoints into its own sub-directory.
        ckpt_save_dir = ckpt_save_dir + "ckpt_" + str(get_rank()) + "/"

    # create dataset
    dataset = create_dataset(dataset_path=args_opt.dataset_path,
                             do_train=True,
                             repeat_num=1,
                             batch_size=total_batch,
                             target=dev,
                             dtype=compute_type,
                             device_num=device_num)
    step_size = dataset.get_dataset_size()
    if print_per_steps > step_size or print_per_steps < 1:
        print("Arg: print_per_steps should lessequal to dataset_size ", step_size)
        print("Change to default: 20")
        print_per_steps = 20

    # define net
    net = resnet(class_num=1001, dtype=compute_type)

    # init weight
    for _, cell in net.cells_and_names():
        if isinstance(cell, nn.Conv2d):
            cell.weight.set_data(
                weight_init.initializer(weight_init.XavierUniform(),
                                        cell.weight.shape,
                                        cell.weight.dtype))
        if isinstance(cell, nn.Dense):
            cell.weight.set_data(
                weight_init.initializer(weight_init.TruncatedNormal(),
                                        cell.weight.shape,
                                        cell.weight.dtype))

    # init lr
    lr = get_liner_lr(lr_init=0,
                      lr_end=0,
                      lr_max=0.8,
                      warmup_epochs=0,
                      total_epochs=epoch_size,
                      steps_per_epoch=step_size)
    lr = Tensor(lr)

    # define loss, opt, model
    # The optimizer covers every parameter that requires a gradient, so the
    # weight decay of 1e-4 applies to all of them. The former
    # decayed / non-decayed parameter lists were never passed to the
    # optimizer and have been removed.
    loss = CrossEntropySmooth(sparse=True,
                              reduction='mean',
                              smooth_factor=0.1,
                              num_classes=1001)
    trainable_params = filter(lambda x: x.requires_grad, net.get_parameters())
    if compute_type == "fp16":
        # Mixed precision: optimizer and model share the loss scale of 1024.
        loss_scale = FixedLossScaleManager(1024, drop_overflow_update=False)
        opt = Momentum(trainable_params, lr, 0.9, 1e-4, 1024)
        model = Model(net,
                      loss_fn=loss,
                      optimizer=opt,
                      loss_scale_manager=loss_scale,
                      metrics={'acc'},
                      amp_level="O2",
                      keep_batchnorm_fp32=False)
    else:
        opt = Momentum(trainable_params, lr, 0.9, 1e-4)
        model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})

    # define callbacks
    if mode == context.PYNATIVE_MODE:
        print_per_steps = 1
    time_cb = MyTimeMonitor(total_batch, print_per_steps)
    cb = [time_cb]
    if save_ckpt:
        config_ck = CheckpointConfig(save_checkpoint_steps=5 * step_size,
                                     keep_checkpoint_max=5)
        ckpt_cb = ModelCheckpoint(prefix="resnet_benchmark",
                                  directory=ckpt_save_dir,
                                  config=config_ck)
        cb += [ckpt_cb]

    # train model
    print("========START RESNET50 GPU BENCHMARK========")
    if mode == context.GRAPH_MODE:
        # One "epoch" here is one sink of print_per_steps steps.
        model.train(int(epoch_size * step_size / print_per_steps),
                    dataset,
                    callbacks=cb,
                    sink_size=print_per_steps)
    else:
        model.train(epoch_size, dataset, callbacks=cb)
def train():
    """Train function.

    Trains YOLOv4-CSPDarkNet53 in a job that stages its files under
    ``/cache``: the optional pretrained backbone and the training data are
    copied in with ``mox.file.copy_parallel``, the network is trained with a
    hand-written step loop that drives the checkpoint callback manually, and
    the checkpoint directory is copied to ``args.train_url`` at the end.
    """
    args = parse_args()
    device_id = int(os.getenv('DEVICE_ID', '0'))
    context.set_context(mode=context.GRAPH_MODE,
                        enable_auto_mixed_precision=True,
                        device_target=args.device_target,
                        save_graphs=False,
                        device_id=device_id)
    loss_meter = AverageMeter('loss')

    # Build the network; the default initialisation is kaiming-normal.
    network = YOLOV4CspDarkNet53(is_training=True)
    default_recursive_init(network)

    if args.pretrained_backbone:
        # Download the backbone checkpoint and point the args at the local copy.
        backbone_ckpt_file = args.pretrained_backbone.split('/')[-1]
        local_backbone_ckpt_path = '/cache/' + backbone_ckpt_file
        mox.file.copy_parallel(src_url=args.pretrained_backbone,
                               dst_url=local_backbone_ckpt_path)
        args.pretrained_backbone = local_backbone_ckpt_path

    load_yolov4_params(args, network)
    network = YoloWithLossCell(network)
    args.logger.info('finish get network')

    config = ConfigYOLOV4CspDarkNet53()
    config.label_smooth = args.label_smooth
    config.label_smooth_factor = args.label_smooth_factor
    if args.training_shape:
        config.multi_scale = [convert_training_shape(args)]
    if args.resize_rate:
        config.resize_rate = args.resize_rate

    # Download the training data.
    local_data_path = '/cache/data'
    local_ckpt_path = '/cache/ckpt_file'
    print('Download data.')
    mox.file.copy_parallel(src_url=args.data_url, dst_url=local_data_path)

    dataset, data_size = create_yolo_dataset(
        image_dir=os.path.join(local_data_path, 'images'),
        anno_path=os.path.join(local_data_path, 'annotation.json'),
        is_training=True,
        batch_size=args.per_batch_size,
        max_epoch=args.max_epoch,
        device_num=args.group_size,
        rank=args.rank,
        config=config)
    args.logger.info('Finish loading dataset')

    args.steps_per_epoch = int(data_size / args.per_batch_size / args.group_size)
    if not args.ckpt_interval:
        # Default: save a checkpoint every ten epochs.
        args.ckpt_interval = args.steps_per_epoch * 10

    lr = get_lr(args)
    optimizer = Momentum(params=get_param_groups(network),
                         learning_rate=Tensor(lr),
                         momentum=args.momentum,
                         weight_decay=args.weight_decay,
                         loss_scale=args.loss_scale)

    if context.get_context("device_target") == "GPU":
        # GPU: O2 mixed precision with a fixed loss scale of 1.0.
        loss_scale = FixedLossScaleManager(1.0, drop_overflow_update=False)
        network = amp.build_train_network(network,
                                          optimizer=optimizer,
                                          loss_scale_manager=loss_scale,
                                          level="O2",
                                          keep_batchnorm_fp32=False)
        keep_loss_fp32(network)
    else:
        network = TrainingWrapper(network, optimizer)
        network.set_train()

    # Checkpoint callback, driven manually from the loop below.
    ckpt_max_num = 10
    ckpt_cb = ModelCheckpoint(
        config=CheckpointConfig(save_checkpoint_steps=args.ckpt_interval,
                                keep_checkpoint_max=ckpt_max_num),
        directory=local_ckpt_path,
        prefix='yolov4')
    cb_params = _InternalCallbackParam()
    cb_params.train_network = network
    cb_params.epoch_num = ckpt_max_num
    cb_params.cur_epoch_num = 1
    run_context = RunContext(cb_params)
    ckpt_cb.begin(run_context)

    label_keys = ('bbox1', 'bbox2', 'bbox3', 'gt_box1', 'gt_box2', 'gt_box3')
    last_logged_step = -1
    t_end = time.time()
    data_loader = dataset.create_dict_iterator(output_numpy=True, num_epochs=1)

    for step, batch in enumerate(data_loader):
        images = batch["image"]
        input_shape = images.shape[2:4]
        images = Tensor.from_numpy(images)
        targets = [Tensor.from_numpy(batch[key]) for key in label_keys]
        # The network receives the two spatial dims in reversed order.
        input_shape = Tensor(tuple(input_shape[::-1]), ms.float32)
        loss = network(images, *targets, input_shape)
        loss_meter.update(loss.asnumpy())

        # ckpt progress
        cb_params.cur_step_num = step + 1  # current step number
        cb_params.batch_num = step + 2
        ckpt_cb.step_end(run_context)

        if step % args.log_interval == 0:
            time_used = time.time() - t_end
            epoch = int(step / args.steps_per_epoch)
            fps = (args.per_batch_size * (step - last_logged_step)
                   * args.group_size / time_used)
            if args.rank == 0:
                args.logger.info(
                    'epoch[{}], iter[{}], {}, {:.2f} imgs/sec, lr:{}'.format(
                        epoch, step, loss_meter, fps, lr[step]))
            t_end = time.time()
            loss_meter.reset()
            last_logged_step = step

        if (step + 1) % args.steps_per_epoch == 0:
            cb_params.cur_epoch_num += 1

    args.logger.info('==========end training===============')

    # Upload checkpoint files.
    print('Upload checkpoint.')
    mox.file.copy_parallel(src_url=local_ckpt_path, dst_url=args.train_url)