Example #1
def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend"):
    """
    create a train or eval imagenet2012 dataset for resnet50

    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): whether dataset is used for train or eval.
        repeat_num(int): the repeat times of dataset. Default: 1
        batch_size(int): the batch size of dataset. Default: 32
        target(str): the device target. Default: Ascend

    Returns:
        dataset
    """
    if target == "Ascend":
        device_num = int(os.getenv("DEVICE_NUM"))
        rank_id = int(os.getenv("RANK_ID"))
    else:
        init("nccl")
        rank_id = get_rank()
        device_num = get_group_size()

    if device_num == 1:
        ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True)
    else:
        ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True,
                                     num_shards=device_num, shard_id=rank_id)

    image_size = 224
    mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
    std = [0.229 * 255, 0.224 * 255, 0.225 * 255]

    # define map operations
    if do_train:
        trans = [
            C.RandomCropDecodeResize(image_size, scale=(0.08, 1.0), ratio=(0.75, 1.333)),
            C.RandomHorizontalFlip(prob=0.5),
            C.Normalize(mean=mean, std=std),
            C.HWC2CHW()
        ]
    else:
        trans = [
            C.Decode(),
            C.Resize(256),
            C.CenterCrop(image_size),
            C.Normalize(mean=mean, std=std),
            C.HWC2CHW()
        ]

    type_cast_op = C2.TypeCast(mstype.int32)

    ds = ds.map(input_columns="image", num_parallel_workers=8, operations=trans)
    ds = ds.map(input_columns="label", num_parallel_workers=8, operations=type_cast_op)

    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)

    # apply dataset repeat operation
    ds = ds.repeat(repeat_num)

    return ds
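
A minimal usage sketch (the dataset path is hypothetical; with target="Ascend" the DEVICE_NUM and RANK_ID environment variables must already be set):

import os

os.environ.setdefault("DEVICE_NUM", "1")
os.environ.setdefault("RANK_ID", "0")
ds = create_dataset2("/path/to/imagenet2012/train", do_train=True, batch_size=32)
for batch in ds.create_dict_iterator():
    print(batch["image"].shape, batch["label"].shape)  # e.g. (32, 3, 224, 224) (32,)
    break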
Example #2
def train_process(q, device_id, epoch_size, device_num, enable_hccl):
    os.system("mkdir " + str(device_id))
    os.chdir(str(device_id))
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        save_graphs=False)
    context.set_context(device_id=device_id)
    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH
    os.environ['RANK_ID'] = str(device_id)
    os.environ['RANK_SIZE'] = str(device_num)
    if enable_hccl:
        context.set_auto_parallel_context(
            device_num=device_num,
            parallel_mode=ParallelMode.DATA_PARALLEL,
            mirror_mean=True,
            parameter_broadcast=True)
        auto_parallel_context().set_all_reduce_fusion_split_indices([107, 160])
        init()

    # network
    net = resnet50(class_num=config.class_num)

    # evaluation network
    dist_eval_network = ClassifyCorrectCell(net)

    if not config.use_label_smooth:
        config.label_smooth_factor = 0.0

    # loss
    loss = nn.SoftmaxCrossEntropyWithLogits(
        sparse=True,
        reduction="mean",
        smooth_factor=config.label_smooth_factor,
        num_classes=config.class_num)

    # train dataset
    dataset = create_dataset(dataset_path=dataset_path,
                             do_train=True,
                             repeat_num=epoch_size,
                             batch_size=config.batch_size)

    step_size = dataset.get_dataset_size()
    eval_interval = config.eval_interval
    dataset.__loop_size__ = step_size * eval_interval  # sink eval_interval epochs per model.train(1, ...) call

    # evaluation dataset
    eval_dataset = create_dataset(dataset_path=eval_path,
                                  do_train=False,
                                  repeat_num=epoch_size,
                                  batch_size=config.eval_batch_size)

    # loss scale
    loss_scale = FixedLossScaleManager(config.loss_scale,
                                       drop_overflow_update=False)

    # learning rate
    lr = Tensor(
        get_learning_rate(lr_init=config.lr_init,
                          lr_end=0.0,
                          lr_max=config.lr_max,
                          warmup_epochs=config.warmup_epochs,
                          total_epochs=config.epoch_size,
                          steps_per_epoch=step_size,
                          lr_decay_mode=config.lr_decay_mode))

    # optimizer
    decayed_params = list(
        filter(
            lambda x: 'beta' not in x.name and 'gamma' not in x.name and 'bias'
            not in x.name, net.trainable_params()))
    no_decayed_params = [
        param for param in net.trainable_params()
        if param not in decayed_params
    ]
    group_params = [{
        'params': decayed_params,
        'weight_decay': config.weight_decay
    }, {
        'params': no_decayed_params
    }, {
        'order_params': net.trainable_params()
    }]

    if config.use_lars:
        momentum = nn.Momentum(filter(lambda x: x.requires_grad,
                                      net.get_parameters()),
                               lr,
                               config.momentum,
                               use_nesterov=config.use_nesterov)
        opt = nn.LARS(momentum,
                      epsilon=config.lars_epsilon,
                      hyperpara=config.lars_coefficient,
                      weight_decay=config.weight_decay,
                      decay_filter=lambda x: 'beta' not in x.name and 'gamma'
                      not in x.name and 'bias' not in x.name,
                      lars_filter=lambda x: 'beta' not in x.name and 'gamma'
                      not in x.name and 'bias' not in x.name,
                      loss_scale=config.loss_scale)

    else:
        opt = nn.Momentum(group_params,
                          lr,
                          config.momentum,
                          weight_decay=config.weight_decay,
                          loss_scale=config.loss_scale,
                          use_nesterov=config.use_nesterov)

    # model
    model = Model(net,
                  loss_fn=loss,
                  optimizer=opt,
                  loss_scale_manager=loss_scale,
                  amp_level="O2",
                  keep_batchnorm_fp32=False,
                  metrics={
                      'acc':
                      DistAccuracy(batch_size=config.eval_batch_size,
                                   device_num=device_num)
                  },
                  eval_network=dist_eval_network)

    # model init
    print("init_start", device_id)
    model.init(dataset, eval_dataset)
    print("init_stop", device_id)

    # callbacks
    loss_cb = LossGet(1, step_size)

    # train and eval
    print("run_start", device_id)
    acc = 0.0
    time_cost = 0.0
    for epoch_idx in range(0, int(epoch_size / eval_interval)):
        model.train(1, dataset, callbacks=loss_cb)
        eval_start = time.time()
        output = model.eval(eval_dataset)
        eval_cost = (time.time() - eval_start) * 1000
        acc = float(output["acc"])
        time_cost = loss_cb.get_per_step_time()
        loss = loss_cb.get_loss()
        print(
            "the {} epoch's resnet result:\n "
            "device{}, training loss {}, acc {}, "
            "training per step cost {:.2f} ms, eval cost {:.2f} ms, total_cost {:.2f} ms"
            .format(epoch_idx, device_id, loss, acc, time_cost, eval_cost,
                    time_cost * step_size + eval_cost))
    q.put({'acc': acc, 'cost': time_cost})
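
Since the worker reports its result through a queue, it is typically launched once per device; a sketch assuming an 8-device host with MINDSPORE_HCCL_CONFIG_PATH already configured (the epoch count is hypothetical):

from multiprocessing import Process, Queue

q = Queue()
device_num = 8
procs = [Process(target=train_process, args=(q, dev_id, 90, device_num, True))
         for dev_id in range(device_num)]
for p in procs:
    p.start()
for p in procs:
    p.join()
results = [q.get() for _ in range(device_num)]  # each item: {'acc': ..., 'cost': ...}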
Example #3
def parse_args(cloud_args=None):
    """parameters"""
    parser = argparse.ArgumentParser('mindspore classification training')
    parser.add_argument('--platform',
                        type=str,
                        default='Ascend',
                        choices=('Ascend', 'GPU'),
                        help='run platform')

    # dataset related
    parser.add_argument('--data_dir',
                        type=str,
                        default='',
                        help='train data dir')
    parser.add_argument('--per_batch_size',
                        default=128,
                        type=int,
                        help='batch size per GPU')
    # network related
    parser.add_argument('--pretrained',
                        default='',
                        type=str,
                        help='model_path, local pretrained model to load')

    # distributed related
    parser.add_argument('--is_distributed',
                        type=int,
                        default=1,
                        help='whether to run on multiple devices')
    # roma obs
    parser.add_argument('--train_url', type=str, default="", help='train url')

    args, _ = parser.parse_known_args()
    args = merge_args(args, cloud_args)
    args.image_size = config.image_size
    args.num_classes = config.num_classes
    args.lr = config.lr
    args.lr_scheduler = config.lr_scheduler
    args.lr_epochs = config.lr_epochs
    args.lr_gamma = config.lr_gamma
    args.eta_min = config.eta_min
    args.T_max = config.T_max
    args.max_epoch = config.max_epoch
    args.backbone = config.backbone
    args.warmup_epochs = config.warmup_epochs
    args.weight_decay = config.weight_decay
    args.momentum = config.momentum
    args.is_dynamic_loss_scale = config.is_dynamic_loss_scale
    args.loss_scale = config.loss_scale
    args.label_smooth = config.label_smooth
    args.label_smooth_factor = config.label_smooth_factor
    args.ckpt_interval = config.ckpt_interval
    args.ckpt_save_max = config.ckpt_save_max
    args.ckpt_path = config.ckpt_path
    args.is_save_on_master = config.is_save_on_master
    args.rank = config.rank
    args.group_size = config.group_size
    args.lr_epochs = list(map(int, args.lr_epochs.split(',')))
    args.image_size = list(map(int, args.image_size.split(',')))

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()
    else:
        args.rank = 0
        args.group_size = 1

    if args.is_dynamic_loss_scale == 1:
        args.loss_scale = 1  # with dynamic loss scale, a static loss scale must not be set in the Momentum optimizer

    # select whether only the master rank saves ckpt or all ranks save, compatible with model parallel
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1

    # logger
    args.outputs_dir = os.path.join(
        args.ckpt_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)
    return args
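
merge_args is not shown in this example; a plausible implementation (this sketch is an assumption, not the original helper) copies any non-empty cloud-side value onto the parsed namespace:

def merge_args(args, cloud_args):
    # hypothetical sketch: cloud_args values override locally parsed ones
    if cloud_args:
        for key, value in vars(cloud_args).items():
            if value not in (None, ''):
                setattr(args, key, value)
    return args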
Example #4
args.lr_epochs = list(map(int, args.lr_epochs.split(',')))
args.data_root = os.path.join(args.data_dir, 'train2017')
args.annFile = os.path.join(args.data_dir,
                            'annotations/instances_train2017.json')

device_id = int(os.getenv('DEVICE_ID', '0'))
context.set_context(mode=context.GRAPH_MODE,
                    enable_auto_mixed_precision=True,
                    device_target=args.device_target,
                    save_graphs=False,
                    device_id=device_id)

# init distributed
if args.is_distributed:
    if args.device_target == "Ascend":
        init()
    else:
        init("nccl")
    args.rank = get_rank()
    args.group_size = get_group_size()

# select whether only the master rank saves ckpt or all ranks save, compatible with model parallel
args.rank_save_ckpt_flag = 0
if args.is_save_on_master:
    if args.rank == 0:
        args.rank_save_ckpt_flag = 1
else:
    args.rank_save_ckpt_flag = 1

# logger
args.outputs_dir = os.path.join(
    args.ckpt_path,
    datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
Example #5
        ckpoint_cb = ModelCheckpoint(prefix='widedeep_train_' +
                                     str(get_rank()),
                                     directory=config.ckpt_path,
                                     config=ckptconfig)
    out = model.eval(ds_eval)
    print("=====" * 5 + "model.eval() initialized: {}".format(out))
    model.train(epochs,
                ds_train,
                callbacks=[
                    TimeMonitor(ds_train.get_dataset_size()), eval_callback,
                    callback, ckpoint_cb
                ])


if __name__ == "__main__":
    wide_deep_config = WideDeepConfig()
    wide_deep_config.argparse_init()

    context.set_context(mode=context.GRAPH_MODE,
                        device_target=wide_deep_config.device_target,
                        save_graphs=True)
    if wide_deep_config.device_target == "Ascend":
        init("hccl")
    elif wide_deep_config.device_target == "GPU":
        init("nccl")
    context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                      mirror_mean=True,
                                      device_num=get_group_size())

    train_and_eval(wide_deep_config)
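
The backend selection above ("hccl" for Ascend, "nccl" for GPU) recurs across these examples; a small shared helper could look like this (the helper name is hypothetical):

def init_distributed(device_target):
    # hypothetical helper: pick the collective backend by device target
    if device_target == "Ascend":
        init("hccl")
    elif device_target == "GPU":
        init("nccl")
    else:
        raise ValueError("unsupported device target: " + device_target)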
Example #6
def parse_args():
    """Parse train arguments."""
    parser = argparse.ArgumentParser('mindspore coco training')

    # device related
    parser.add_argument(
        '--device_target',
        type=str,
        default='GPU',
        choices=['Ascend', 'GPU'],
        help='device where the code will be implemented. (Default: GPU)')

    # dataset related
    parser.add_argument('--data_dir',
                        required=True,
                        type=str,
                        help='Train dataset directory.')
    parser.add_argument('--per_batch_size',
                        default=4,
                        type=int,
                        help='Batch size for Training. Default: 4.')
    parser.add_argument('--max_epoch',
                        type=int,
                        default=320,
                        help='Max epoch num to train the model. Default: 320.')
    parser.add_argument('--warmup_epochs',
                        default=0,
                        type=float,
                        help='Warmup epochs. Default: 0')

    # network related
    parser.add_argument('--pretrained_backbone',
                        default='',
                        type=str,
                        help='The ckpt file of DarkNet53. Default: "".')
    parser.add_argument(
        '--resume_yolov3',
        default='',
        type=str,
        help='The ckpt file of YOLOv3, used for fine-tuning. Default: ""')

    # optimizer and lr related
    parser.add_argument(
        '--lr_scheduler',
        default='exponential',
        type=str,
        help=
        'Learning rate scheduler, options: exponential, cosine_annealing. Default: exponential'
    )
    parser.add_argument('--lr',
                        default=0.001,
                        type=float,
                        help='Learning rate. Default: 0.001')
    parser.add_argument(
        '--lr_epochs',
        type=str,
        default='220,250',
        help=
        'Epochs at which lr changes, comma-separated. Default: 220,250')
    parser.add_argument(
        '--lr_gamma',
        type=float,
        default=0.1,
        help='Decay factor of the exponential lr_scheduler. Default: 0.1'
    )
    parser.add_argument(
        '--eta_min',
        type=float,
        default=0.,
        help='Eta_min in cosine_annealing scheduler. Default: 0')
    parser.add_argument(
        '--T_max',
        type=int,
        default=320,
        help='T-max in cosine_annealing scheduler. Default: 320')
    parser.add_argument('--weight_decay',
                        type=float,
                        default=0.0005,
                        help='Weight decay factor. Default: 0.0005')
    parser.add_argument('--momentum',
                        type=float,
                        default=0.9,
                        help='Momentum. Default: 0.9')

    # loss related
    parser.add_argument('--loss_scale',
                        type=int,
                        default=1024,
                        help='Static loss scale. Default: 1024')
    parser.add_argument('--label_smooth',
                        type=int,
                        default=0,
                        help='Whether to use label smoothing in CE. Default: 0')
    parser.add_argument(
        '--label_smooth_factor',
        type=float,
        default=0.1,
        help='Smoothing strength applied to the one-hot labels. Default: 0.1')

    # logging related
    parser.add_argument('--log_interval',
                        type=int,
                        default=100,
                        help='Logging interval steps. Default: 100')
    parser.add_argument('--ckpt_path',
                        type=str,
                        default='outputs/',
                        help='Checkpoint save location. Default: outputs/')
    parser.add_argument('--ckpt_interval',
                        type=int,
                        default=None,
                        help='Save checkpoint interval. Default: None')

    # distributed related
    parser.add_argument(
        '--is_distributed',
        type=int,
        default=0,
        help='Distribute train or not, 1 for yes, 0 for no. Default: 0')
    parser.add_argument('--rank',
                        type=int,
                        default=0,
                        help='Local rank of distributed. Default: 0')
    parser.add_argument('--group_size',
                        type=int,
                        default=1,
                        help='World size (total number of devices). Default: 1')

    # reset default config
    parser.add_argument('--training_shape',
                        type=str,
                        default="",
                        help='Fix training shape. Default: ""')
    parser.add_argument(
        '--resize_rate',
        type=int,
        default=None,
        help='Resize rate for multi-scale training. Default: None')

    args, _ = parser.parse_known_args()

    args.lr_epochs = list(map(int, args.lr_epochs.split(',')))
    args.data_root = os.path.join(args.data_dir, 'images')
    args.ann_file = os.path.join(args.data_dir, 'annotation.json')

    # init distributed
    if args.is_distributed:
        if args.device_target == "Ascend":
            init()
        else:
            init("nccl")
        args.rank = get_rank()
        args.group_size = get_group_size()

    # logger
    args.outputs_dir = os.path.join(
        args.ckpt_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)
    args.logger.save_args(args)

    return args
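
The parser can be exercised without a shell by overriding sys.argv before the call (a sketch; the dataset directory is hypothetical):

import sys

sys.argv = ['train.py', '--data_dir', '/path/to/coco', '--max_epoch', '320']
args = parse_args()
print(args.lr_epochs)  # [220, 250], parsed from the default '220,250'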
Example #7
def train():
    """Train function."""

    args.outputs_dir = params['save_model_path']

    if args.group_size > 1:
        init()
        context.set_auto_parallel_context(
            device_num=get_group_size(),
            parallel_mode=ParallelMode.DATA_PARALLEL,
            gradients_mean=True)
        args.outputs_dir = os.path.join(args.outputs_dir,
                                        "ckpt_{}/".format(str(get_rank())))
        args.rank = get_rank()
    else:
        args.outputs_dir = os.path.join(args.outputs_dir, "ckpt_0/")
        args.rank = 0

    if args.group_size > 1:
        args.max_epoch = params["max_epoch_train_NP"]
        args.loss_scale = params['loss_scale'] / 2
        args.lr_steps = list(map(int, params["lr_steps_NP"].split(',')))
        params['train_type'] = params['train_type_NP']
        params['optimizer'] = params['optimizer_NP']
        params['group_params'] = params['group_params_NP']
    else:
        args.max_epoch = params["max_epoch_train"]
        args.loss_scale = params['loss_scale']
        args.lr_steps = list(map(int, params["lr_steps"].split(',')))

    # create network
    print('start create network')
    criterion = openpose_loss()
    criterion.add_flags_recursive(fp32=True)
    network = OpenPoseNet(vggpath=params['vgg_path'],
                          vgg_with_bn=params['vgg_with_bn'])
    if params["load_pretrain"]:
        print("load pretrain model:", params["pretrained_model_path"])
        load_model(network, params["pretrained_model_path"])
    train_net = BuildTrainNetwork(network, criterion)

    # create dataset
    if os.path.exists(args.jsonpath_train) and os.path.exists(args.imgpath_train) \
            and os.path.exists(args.maskpath_train):
        print('start create dataset')
    else:
        print('Error: wrong data path')
        return 0

    num_worker = 20 if args.group_size > 1 else 48
    de_dataset_train = create_dataset(args.jsonpath_train,
                                      args.imgpath_train,
                                      args.maskpath_train,
                                      batch_size=params['batch_size'],
                                      rank=args.rank,
                                      group_size=args.group_size,
                                      num_worker=num_worker,
                                      multiprocessing=True,
                                      shuffle=True,
                                      repeat_num=1)
    steps_per_epoch = de_dataset_train.get_dataset_size()
    print("steps_per_epoch: ", steps_per_epoch)

    # lr scheduler
    lr_stage, lr_base, lr_vgg = get_lr(params['lr'] * args.group_size,
                                       params['lr_gamma'],
                                       steps_per_epoch,
                                       args.max_epoch,
                                       args.lr_steps,
                                       args.group_size,
                                       lr_type=params['lr_type'],
                                       warmup_epoch=params['warmup_epoch'])

    # optimizer
    if params['group_params']:
        vgg19_base_params = list(
            filter(lambda x: 'base.vgg_base' in x.name,
                   train_net.trainable_params()))
        base_params = list(
            filter(lambda x: 'base.conv' in x.name,
                   train_net.trainable_params()))
        stages_params = list(
            filter(lambda x: 'base' not in x.name,
                   train_net.trainable_params()))

        group_params = [{
            'params': vgg19_base_params,
            'lr': lr_vgg
        }, {
            'params': base_params,
            'lr': lr_base
        }, {
            'params': stages_params,
            'lr': lr_stage
        }]

        if params['optimizer'] == "Momentum":
            opt = Momentum(group_params, learning_rate=lr_stage, momentum=0.9)
        elif params['optimizer'] == "Adam":
            opt = Adam(group_params)
        else:
            raise ValueError("optimizer not support.")
    else:
        if params['optimizer'] == "Momentum":
            opt = Momentum(train_net.trainable_params(),
                           learning_rate=lr_stage,
                           momentum=0.9)
        elif params['optimizer'] == "Adam":
            opt = Adam(train_net.trainable_params(), learning_rate=lr_stage)
        else:
            raise ValueError("optimizer not support.")

    # callback
    config_ck = CheckpointConfig(
        save_checkpoint_steps=params['ckpt_interval'],
        keep_checkpoint_max=params["keep_checkpoint_max"])
    ckpoint_cb = ModelCheckpoint(prefix='{}'.format(args.rank),
                                 directory=args.outputs_dir,
                                 config=config_ck)
    time_cb = TimeMonitor(data_size=de_dataset_train.get_dataset_size())
    if args.rank == 0:
        callback_list = [MyLossMonitor(), time_cb, ckpoint_cb]
    else:
        callback_list = [MyLossMonitor(), time_cb]

    # train
    if params['train_type'] == 'clip_grad':
        train_net = TrainOneStepWithClipGradientCell(train_net,
                                                     opt,
                                                     sens=args.loss_scale)
        train_net.set_train()
        model = Model(train_net)
    elif params['train_type'] == 'fix_loss_scale':
        loss_scale_manager = FixedLossScaleManager(args.loss_scale,
                                                   drop_overflow_update=False)
        train_net.set_train()
        model = Model(train_net,
                      optimizer=opt,
                      loss_scale_manager=loss_scale_manager)
    else:
        raise ValueError("Type {} is not support.".format(
            params['train_type']))

    print("============== Starting Training ==============")
    model.train(args.max_epoch,
                de_dataset_train,
                callbacks=callback_list,
                dataset_sink_mode=False)
    return 0
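
The optimizer groups above are split purely by parameter name, so a filter typo silently drops parameters; a sanity-check sketch, assuming train_net and the three lists are built as above:

all_params = train_net.trainable_params()
grouped = vgg19_base_params + base_params + stages_params
assert len(grouped) == len(all_params), "some parameters fell through the name filters"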
Example #8
def train_on_ascend():
    config = config_ascend_quant
    print("training args: {}".format(args_opt))
    print("training configure: {}".format(config))
    print("parallel args: rank_id {}, device_id {}, rank_size {}".format(
        rank_id, device_id, rank_size))
    epoch_size = config.epoch_size

    # distribute init
    if run_distribute:
        context.set_auto_parallel_context(
            device_num=rank_size,
            parallel_mode=ParallelMode.DATA_PARALLEL,
            gradients_mean=True)
        init()

    # define network
    network = mobilenetV2(num_classes=config.num_classes)
    # define loss
    if config.label_smooth > 0:
        loss = CrossEntropyWithLabelSmooth(smooth_factor=config.label_smooth,
                                           num_classes=config.num_classes)
    else:
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    # define dataset
    dataset = create_dataset(dataset_path=args_opt.dataset_path,
                             do_train=True,
                             config=config,
                             device_target=args_opt.device_target,
                             repeat_num=1,
                             batch_size=config.batch_size)
    step_size = dataset.get_dataset_size()
    # load pre trained ckpt
    if args_opt.pre_trained:
        param_dict = load_checkpoint(args_opt.pre_trained)
        load_nonquant_param_into_quant_net(network, param_dict)
    # convert fusion network to quantization aware network
    quantizer = QuantizationAwareTraining(bn_fold=True,
                                          per_channel=[True, False],
                                          symmetric=[True, False],
                                          one_conv_fold=False)
    network = quantizer.quantize(network)

    # get learning rate
    lr = Tensor(
        get_lr(global_step=config.start_epoch * step_size,
               lr_init=0,
               lr_end=0,
               lr_max=config.lr,
               warmup_epochs=config.warmup_epochs,
               total_epochs=epoch_size + config.start_epoch,
               steps_per_epoch=step_size))

    # define optimization
    opt = nn.Momentum(
        filter(lambda x: x.requires_grad, network.get_parameters()), lr,
        config.momentum, config.weight_decay)
    # define model
    model = Model(network, loss_fn=loss, optimizer=opt)

    print("============== Starting Training ==============")
    callback = None
    if rank_id == 0:
        callback = [Monitor(lr_init=lr.asnumpy())]
        if config.save_checkpoint:
            config_ck = CheckpointConfig(
                save_checkpoint_steps=config.save_checkpoint_epochs *
                step_size,
                keep_checkpoint_max=config.keep_checkpoint_max)
            ckpt_cb = ModelCheckpoint(prefix="mobilenetV2",
                                      directory=config.save_checkpoint_path,
                                      config=config_ck)
            callback += [ckpt_cb]
    model.train(epoch_size, dataset, callbacks=callback)
    print("============== End Training ==============")
Example #9
def run_pretrain():
    """pre-train bert_clue"""
    parser = argparse_init()
    args_opt = parser.parse_args()
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args_opt.device_target,
                        device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)
    is_auto_enable_graph_kernel = _auto_enable_graph_kernel(
        args_opt.device_target, args_opt.enable_graph_kernel)
    _set_graph_kernel_context(args_opt.device_target,
                              args_opt.enable_graph_kernel,
                              is_auto_enable_graph_kernel)
    ckpt_save_dir = args_opt.save_checkpoint_path
    if args_opt.distribute == "true":
        if args_opt.device_target == 'Ascend':
            D.init()
            device_num = args_opt.device_num
            rank = args_opt.device_id % device_num
        else:
            D.init()
            device_num = D.get_group_size()
            rank = D.get_rank()
        ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(get_rank()) + '/'

        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL,
            gradients_mean=True,
            device_num=device_num)
        _set_bert_all_reduce_split()
    else:
        rank = 0
        device_num = 1

    _check_compute_type(args_opt, is_auto_enable_graph_kernel)

    if args_opt.accumulation_steps > 1:
        logger.info("accumulation steps: {}".format(
            args_opt.accumulation_steps))
        logger.info("global batch size: {}".format(
            cfg.batch_size * args_opt.accumulation_steps))
        if args_opt.enable_data_sink == "true":
            args_opt.data_sink_steps *= args_opt.accumulation_steps
            logger.info("data sink steps: {}".format(args_opt.data_sink_steps))
        if args_opt.enable_save_ckpt == "true":
            args_opt.save_checkpoint_steps *= args_opt.accumulation_steps
            logger.info("save checkpoint steps: {}".format(
                args_opt.save_checkpoint_steps))

    ds = create_bert_dataset(device_num, rank, args_opt.do_shuffle,
                             args_opt.data_dir, args_opt.schema_dir)
    net_with_loss = BertNetworkWithLoss(bert_net_cfg, True)

    new_repeat_count = (args_opt.epoch_size * ds.get_dataset_size()
                        // args_opt.data_sink_steps)
    if args_opt.train_steps > 0:
        train_steps = args_opt.train_steps * args_opt.accumulation_steps
        new_repeat_count = min(new_repeat_count,
                               train_steps // args_opt.data_sink_steps)
    else:
        args_opt.train_steps = (args_opt.epoch_size * ds.get_dataset_size()
                                // args_opt.accumulation_steps)
        logger.info("train steps: {}".format(args_opt.train_steps))

    optimizer = _get_optimizer(args_opt, net_with_loss)
    callback = [
        TimeMonitor(args_opt.data_sink_steps),
        LossCallBack(ds.get_dataset_size())
    ]
    if args_opt.enable_save_ckpt == "true" and args_opt.device_id % min(
            8, device_num) == 0:
        config_ck = CheckpointConfig(
            save_checkpoint_steps=args_opt.save_checkpoint_steps,
            keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(
            prefix='checkpoint_bert',
            directory=None if ckpt_save_dir == "" else ckpt_save_dir,
            config=config_ck)
        callback.append(ckpoint_cb)

    if args_opt.load_checkpoint_path:
        param_dict = load_checkpoint(args_opt.load_checkpoint_path)
        load_param_into_net(net_with_loss, param_dict)

    if args_opt.enable_lossscale == "true":
        update_cell = DynamicLossScaleUpdateCell(
            loss_scale_value=cfg.loss_scale_value,
            scale_factor=cfg.scale_factor,
            scale_window=cfg.scale_window)
        accumulation_steps = args_opt.accumulation_steps
        enable_global_norm = cfg.enable_global_norm
        if accumulation_steps <= 1:
            if cfg.optimizer == 'AdamWeightDecay' and args_opt.device_target == 'GPU':
                net_with_grads = BertTrainOneStepWithLossScaleCellForAdam(
                    net_with_loss,
                    optimizer=optimizer,
                    scale_update_cell=update_cell)
            else:
                net_with_grads = BertTrainOneStepWithLossScaleCell(
                    net_with_loss,
                    optimizer=optimizer,
                    scale_update_cell=update_cell)
        else:
            allreduce_post = args_opt.distribute == "false" or args_opt.allreduce_post_accumulation == "true"
            net_with_accumulation = (
                BertTrainAccumulationAllReducePostWithLossScaleCell
                if allreduce_post else
                BertTrainAccumulationAllReduceEachWithLossScaleCell)
            net_with_grads = net_with_accumulation(
                net_with_loss,
                optimizer=optimizer,
                scale_update_cell=update_cell,
                accumulation_steps=accumulation_steps,
                enable_global_norm=enable_global_norm)
    else:
        net_with_grads = BertTrainOneStepCell(net_with_loss,
                                              optimizer=optimizer)

    model = Model(net_with_grads)
    model = ConvertModelUtils().convert_to_thor_model(
        model,
        network=net_with_grads,
        optimizer=optimizer,
        frequency=cfg.Thor.frequency)
    model.train(new_repeat_count,
                ds,
                callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"),
                sink_size=args_opt.data_sink_steps)
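
The accumulation bookkeeping above rescales the sink and checkpoint intervals so they keep counting forward passes while the logs report optimizer-level quantities; a worked sketch with hypothetical numbers:

batch_size = 32                                  # cfg.batch_size
accumulation_steps = 4
global_batch = batch_size * accumulation_steps   # 128, as logged above
data_sink_steps = 100 * accumulation_steps       # 400 forward passes per sink cycle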
Example #10
def train():
    args = parse_args()

    # backend
    assert args.device_target == 'GPU'
    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
    if args.distributed:
        init("nccl")
        args.rank = get_rank()
        args.group_size = get_group_size()
        context.set_auto_parallel_context(parallel_mode=context.ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True,
                                          device_num=args.group_size)
    # experiments directory
    args.train_dir = os.path.join(env_dir, args.train_dir, 'ckpt')
    if args.rank == 0:
        if os.path.exists(args.train_dir):
            shutil.rmtree(args.train_dir, ignore_errors=True)  # rm existing dir
        makedir_p(args.train_dir)
    args.data_file = os.path.join(env_dir, args.data_file)

    # dataset
    dataset = TransformSegDataset(data_file=args.data_file,
                                  batch_size=args.batch_size,
                                  crop_size=args.crop_size,
                                  min_scale=args.min_scale,
                                  max_scale=args.max_scale,
                                  ignore_label=args.ignore_label,
                                  num_classes=args.num_classes,
                                  shard_id=args.rank,
                                  shard_num=args.group_size)
    dataset = dataset.get_transformed_dataset(repeat=1)

    # network
    network = get_model_by_name(args.model, nclass=args.num_classes, phase='train')
    loss = SoftmaxCrossEntropyLoss(args.num_classes, ignore_label=args.ignore_label)
    loss.add_flags_recursive(fp32=True)
    train_net = BuildTrainNetwork(network, loss)

    # optimizer
    iters_per_epoch = dataset.get_dataset_size()
    total_train_steps = iters_per_epoch * args.epochs
    lr_iter = lr_scheduler(lr_type=args.lr_type,
                           base_lr=args.base_lr,
                           total_train_steps=total_train_steps,
                           lr_decay_step=args.lr_decay_step,
                           lr_decay_rate=args.lr_decay_rate)
    opt = nn.Momentum(params=train_net.trainable_params(),
                      learning_rate=lr_iter,
                      momentum=args.momentum,
                      weight_decay=args.wd,
                      loss_scale=args.loss_scale)

    # loss scale
    manager_loss_scale = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)
    model = Model(train_net, optimizer=opt, amp_level='O0', loss_scale_manager=manager_loss_scale)

    # callback for saving ckpts
    time_cb = TimeMonitor(data_size=iters_per_epoch)
    loss_cb = LossMonitor()
    cbs = [time_cb, loss_cb]

    if args.rank == 0:
        config_ck = CheckpointConfig(save_checkpoint_steps=args.save_steps,
                                     keep_checkpoint_max=args.keep_checkpoint_max)
        ckpoint_cb = ModelCheckpoint(prefix=args.model, directory=args.train_dir, config=config_ck)
        cbs.append(ckpoint_cb)

    model.train(args.epochs, dataset, callbacks=cbs,
                dataset_sink_mode=(args.device_target != "CPU"))
Example #11
def train():
    """Train function."""
    args = parse_args()
    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=args.device_id)

    if args.is_distributed:
        rank = args.rank_id
        device_num = args.device_num
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
        init()
    else:
        rank = 0
        device_num = 1

    # Logger
    args.logger = get_logger(args.outputs_dir, rank)
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1

    # DATASET
    dataset = create_ocr_train_dataset(args.mindrecord_file,
                                       config.batch_size,
                                       rank_size=device_num,
                                       rank_id=rank)
    args.steps_per_epoch = dataset.get_dataset_size()
    args.logger.info('Finish loading dataset')

    if not args.ckpt_interval:
        args.ckpt_interval = args.steps_per_epoch
    args.logger.save_args(args)

    network = AttentionOCR(config.batch_size,
                           int(config.img_width / 4),
                           config.encoder_hidden_size,
                           config.decoder_hidden_size,
                           config.decoder_output_size,
                           config.max_length,
                           config.dropout_p)

    if args.pre_checkpoint_path:
        param_dict = load_checkpoint(args.pre_checkpoint_path)
        load_param_into_net(network, param_dict)

    network = AttentionOCRWithLossCell(network, config.max_length)

    lr = Tensor(config.lr, mstype.float32)
    opt = nn.Adam(network.trainable_params(), lr, beta1=config.adam_beta1, beta2=config.adam_beta2,
                  loss_scale=config.loss_scale)

    network = TrainingWrapper(network, opt, sens=config.loss_scale)

    args.logger.info('Finished get network')

    callback = [TimeMonitor(data_size=1), LossMonitor()]
    if args.rank_save_ckpt_flag:
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args.steps_per_epoch,
                                       keep_checkpoint_max=config.keep_checkpoint_max)
        save_ckpt_path = os.path.join(args.outputs_dir, 'ckpt_' + str(rank) + '/')
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=save_ckpt_path,
                                  prefix="crnn_seq2seq_ocr")
        callback.append(ckpt_cb)

    model = Model(network)
    model.train(config.num_epochs, dataset, callbacks=callback, dataset_sink_mode=False)

    args.logger.info('==========Training Done===============')
Example #12
def test():
    """test method"""

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()

    # logger
    args.outputs_dir = os.path.join(
        args.log_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))

    args.logger = get_logger(args.outputs_dir, args.rank)

    context.reset_auto_parallel_context()
    if args.is_distributed:
        parallel_mode = ParallelMode.DATA_PARALLEL
    else:
        parallel_mode = ParallelMode.STAND_ALONE
    context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                      gradients_mean=True,
                                      device_num=1)

    args.logger.info('Creating Network....')
    network = YOLOV4CspDarkNet53(is_training=False)

    args.logger.info(args.pretrained)
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('yolo_network.'):
                param_dict_new[key[13:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load_model {} success'.format(args.pretrained))
    else:
        args.logger.info('{} does not exist or is not a pre-trained file'.format(
            args.pretrained))
        raise FileNotFoundError(
            '{} does not exist or is not a pre-trained file'.format(args.pretrained))

    data_root = args.data_root
    # annFile = args.annFile

    config = ConfigYOLOV4CspDarkNet53()
    if args.testing_shape:
        config.test_img_shape = convert_testing_shape(args)

    data_txt = os.path.join(args.data_dir, 'testdev2017.txt')
    ds, data_size = create_yolo_datasetv2(data_root,
                                          data_txt=data_txt,
                                          batch_size=args.per_batch_size,
                                          max_epoch=1,
                                          device_num=args.group_size,
                                          rank=args.rank,
                                          shuffle=False,
                                          config=config)

    args.logger.info('testing shape : {}'.format(config.test_img_shape))
    args.logger.info('total {} images to eval'.format(data_size))

    network.set_train(False)

    # init detection engine
    detection = DetectionEngine(args)

    input_shape = Tensor(tuple(config.test_img_shape), ms.float32)
    args.logger.info('Start inference....')
    for i, data in enumerate(ds.create_dict_iterator()):
        image = Tensor(data["image"])

        image_shape = Tensor(data["image_shape"])
        image_id = Tensor(data["img_id"])

        prediction = network(image, input_shape)
        output_big, output_me, output_small = prediction
        output_big = output_big.asnumpy()
        output_me = output_me.asnumpy()
        output_small = output_small.asnumpy()
        image_id = image_id.asnumpy()
        image_shape = image_shape.asnumpy()

        detection.detect([output_small, output_me, output_big],
                         args.per_batch_size, image_shape, image_id)
        if i % 1000 == 0:
            args.logger.info('Processing... {:.2f}% '.format(
                i * args.per_batch_size / data_size * 100))

    args.logger.info('Calculating mAP...')
    detection.do_nms_for_results()
    result_file_path = detection.write_result()
    args.logger.info('result file path: {}'.format(result_file_path))
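
The key-remapping loop above (drop optimizer moments, strip the training-wrapper prefix) recurs in most eval scripts; a reusable sketch with a hypothetical helper name:

def strip_training_prefix(param_dict, prefix='yolo_network.', drop='moments.'):
    # hypothetical helper mirroring the loop above
    new_dict = {}
    for key, value in param_dict.items():
        if key.startswith(drop):
            continue  # skip optimizer state
        new_dict[key[len(prefix):] if key.startswith(prefix) else key] = value
    return new_dict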
Example #13
def run_pretrain():
    """pre-train bert_clue"""
    parser = argparse.ArgumentParser(description='bert pre_training')
    parser.add_argument(
        '--device_target',
        type=str,
        default='Ascend',
        choices=['Ascend', 'GPU'],
        help='device where the code will be implemented. (Default: Ascend)')
    parser.add_argument("--distribute",
                        type=str,
                        default="false",
                        help="Run distribute, default is false.")
    parser.add_argument("--epoch_size",
                        type=int,
                        default="1",
                        help="Epoch size, default is 1.")
    parser.add_argument("--device_id",
                        type=int,
                        default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--device_num",
                        type=int,
                        default=1,
                        help="Use device nums, default is 1.")
    parser.add_argument("--enable_save_ckpt",
                        type=str,
                        default="true",
                        help="Enable save checkpoint, default is true.")
    parser.add_argument("--enable_lossscale",
                        type=str,
                        default="true",
                        help="Use lossscale or not, default is not.")
    parser.add_argument("--do_shuffle",
                        type=str,
                        default="true",
                        help="Enable shuffle for dataset, default is true.")
    parser.add_argument("--enable_data_sink",
                        type=str,
                        default="true",
                        help="Enable data sink, default is true.")
    parser.add_argument("--data_sink_steps",
                        type=int,
                        default="1",
                        help="Sink steps for each epoch, default is 1.")
    parser.add_argument(
        "--accumulation_steps",
        type=int,
        default="1",
        help=
        "Accumulating gradients N times before weight update, default is 1.")
    parser.add_argument("--save_checkpoint_path",
                        type=str,
                        default="",
                        help="Save checkpoint path")
    parser.add_argument("--load_checkpoint_path",
                        type=str,
                        default="",
                        help="Load checkpoint file path")
    parser.add_argument("--save_checkpoint_steps",
                        type=int,
                        default=1000,
                        help="Save checkpoint steps, "
                        "default is 1000.")
    parser.add_argument("--train_steps",
                        type=int,
                        default=-1,
                        help="Training Steps, default is -1, "
                        "meaning run all steps according to epoch number.")
    parser.add_argument("--save_checkpoint_num",
                        type=int,
                        default=1,
                        help="Save checkpoint numbers, default is 1.")
    parser.add_argument("--data_dir",
                        type=str,
                        default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_dir",
                        type=str,
                        default="",
                        help="Schema path, it is better to use absolute path")

    args_opt = parser.parse_args()
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args_opt.device_target,
                        device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)
    ckpt_save_dir = args_opt.save_checkpoint_path
    if args_opt.distribute == "true":
        if args_opt.device_target == 'Ascend':
            D.init('hccl')
            device_num = args_opt.device_num
            rank = args_opt.device_id % device_num
        else:
            D.init('nccl')
            device_num = D.get_group_size()
            rank = D.get_rank()
            ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(rank) + '/'

        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL,
            mirror_mean=True,
            device_num=device_num)
        from mindspore.parallel._auto_parallel_context import auto_parallel_context
        if bert_net_cfg.num_hidden_layers == 12:
            if bert_net_cfg.use_relative_positions:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [29, 58, 87, 116, 145, 174, 203, 217])
            else:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [28, 55, 82, 109, 136, 163, 190, 205])
        elif bert_net_cfg.num_hidden_layers == 24:
            if bert_net_cfg.use_relative_positions:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [30, 90, 150, 210, 270, 330, 390, 421])
            else:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [38, 93, 148, 203, 258, 313, 368, 397])
    else:
        rank = 0
        device_num = 1

    if args_opt.device_target == 'GPU' and bert_net_cfg.compute_type != mstype.float32:
        logger.warning('GPU only supports fp32 temporarily, running with fp32.')
        bert_net_cfg.compute_type = mstype.float32

    if args_opt.accumulation_steps > 1:
        logger.info("accumulation steps: {}".format(
            args_opt.accumulation_steps))
        logger.info("global batch size: {}".format(
            bert_net_cfg.batch_size * args_opt.accumulation_steps))
        if args_opt.enable_data_sink == "true":
            args_opt.data_sink_steps *= args_opt.accumulation_steps
            logger.info("data sink steps: {}".format(args_opt.data_sink_steps))
        if args_opt.enable_save_ckpt == "true":
            args_opt.save_checkpoint_steps *= args_opt.accumulation_steps
            logger.info("save checkpoint steps: {}".format(
                args_opt.save_checkpoint_steps))

    ds = create_bert_dataset(device_num, rank, args_opt.do_shuffle,
                             args_opt.data_dir, args_opt.schema_dir)
    net_with_loss = BertNetworkWithLoss(bert_net_cfg, True)

    new_repeat_count = (args_opt.epoch_size * ds.get_dataset_size()
                        // args_opt.data_sink_steps)
    if args_opt.train_steps > 0:
        new_repeat_count = min(
            new_repeat_count, args_opt.train_steps // args_opt.data_sink_steps)
    else:
        args_opt.train_steps = args_opt.epoch_size * ds.get_dataset_size()
        logger.info("train steps: {}".format(args_opt.train_steps))

    if cfg.optimizer == 'Lamb':
        lr_schedule = BertLearningRate(
            learning_rate=cfg.Lamb.learning_rate,
            end_learning_rate=cfg.Lamb.end_learning_rate,
            warmup_steps=cfg.Lamb.warmup_steps,
            decay_steps=args_opt.train_steps,
            power=cfg.Lamb.power)
        params = net_with_loss.trainable_params()
        decay_params = list(filter(cfg.Lamb.decay_filter, params))
        other_params = list(
            filter(lambda x: not cfg.Lamb.decay_filter(x), params))
        group_params = [{
            'params': decay_params,
            'weight_decay': cfg.Lamb.weight_decay
        }, {
            'params': other_params
        }, {
            'order_params': params
        }]
        optimizer = Lamb(group_params,
                         learning_rate=lr_schedule,
                         eps=cfg.Lamb.eps)
    elif cfg.optimizer == 'Momentum':
        optimizer = Momentum(net_with_loss.trainable_params(),
                             learning_rate=cfg.Momentum.learning_rate,
                             momentum=cfg.Momentum.momentum)
    elif cfg.optimizer == 'AdamWeightDecay':
        lr_schedule = BertLearningRate(
            learning_rate=cfg.AdamWeightDecay.learning_rate,
            end_learning_rate=cfg.AdamWeightDecay.end_learning_rate,
            warmup_steps=cfg.AdamWeightDecay.warmup_steps,
            decay_steps=args_opt.train_steps,
            power=cfg.AdamWeightDecay.power)
        params = net_with_loss.trainable_params()
        decay_params = list(filter(cfg.AdamWeightDecay.decay_filter, params))
        other_params = list(
            filter(lambda x: not cfg.AdamWeightDecay.decay_filter(x), params))
        group_params = [{
            'params': decay_params,
            'weight_decay': cfg.AdamWeightDecay.weight_decay
        }, {
            'params': other_params,
            'weight_decay': 0.0
        }, {
            'order_params': params
        }]

        optimizer = AdamWeightDecay(group_params,
                                    learning_rate=lr_schedule,
                                    eps=cfg.AdamWeightDecay.eps)
    else:
        raise ValueError(
            "Don't support optimizer {}, only support [Lamb, Momentum, AdamWeightDecay]"
            .format(cfg.optimizer))
    callback = [
        TimeMonitor(args_opt.data_sink_steps),
        LossCallBack(ds.get_dataset_size())
    ]
    if args_opt.enable_save_ckpt == "true" and args_opt.device_id % min(
            8, device_num) == 0:
        config_ck = CheckpointConfig(
            save_checkpoint_steps=args_opt.save_checkpoint_steps,
            keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(
            prefix='checkpoint_bert',
            directory=None if ckpt_save_dir == "" else ckpt_save_dir,
            config=config_ck)
        callback.append(ckpoint_cb)

    if args_opt.load_checkpoint_path:
        param_dict = load_checkpoint(args_opt.load_checkpoint_path)
        load_param_into_net(net_with_loss, param_dict)

    if args_opt.enable_lossscale == "true":
        update_cell = DynamicLossScaleUpdateCell(
            loss_scale_value=cfg.loss_scale_value,
            scale_factor=cfg.scale_factor,
            scale_window=cfg.scale_window)

        if args_opt.accumulation_steps <= 1:
            net_with_grads = BertTrainOneStepWithLossScaleCell(
                net_with_loss,
                optimizer=optimizer,
                scale_update_cell=update_cell)
        else:
            accumulation_steps = args_opt.accumulation_steps
            net_with_grads = BertTrainAccumulateStepsWithLossScaleCell(
                net_with_loss,
                optimizer=optimizer,
                scale_update_cell=update_cell,
                accumulation_steps=accumulation_steps)
    else:
        net_with_grads = BertTrainOneStepCell(net_with_loss,
                                              optimizer=optimizer)

    model = Model(net_with_grads)
    model.train(new_repeat_count,
                ds,
                callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"),
                sink_size=args_opt.data_sink_steps)
Example #14
def run_pretrain():
    """pre-train bert_clue"""
    parser = argparse.ArgumentParser(description='bert pre_training')
    parser.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'GPU'],
                        help='device where the code will be implemented. (Default: Ascend)')
    parser.add_argument("--distribute", type=str, default="false", help="Run distribute, default is false.")
    parser.add_argument("--epoch_size", type=int, default="1", help="Epoch size, default is 1.")
    parser.add_argument("--device_id", type=int, default=4, help="Device id, default is 0.")
    parser.add_argument("--device_num", type=int, default=1, help="Use device nums, default is 1.")
    parser.add_argument("--enable_save_ckpt", type=str, default="true", help="Enable save checkpoint, default is true.")
    parser.add_argument("--enable_lossscale", type=str, default="false", help="Use lossscale or not, default is not.")
    parser.add_argument("--do_shuffle", type=str, default="false", help="Enable shuffle for dataset, default is true.")
    parser.add_argument("--enable_data_sink", type=str, default="true", help="Enable data sink, default is true.")
    parser.add_argument("--data_sink_steps", type=int, default="100", help="Sink steps for each epoch, default is 1.")
    parser.add_argument("--save_checkpoint_path", type=str, default="", help="Save checkpoint path")
    parser.add_argument("--load_checkpoint_path", type=str, default="", help="Load checkpoint file path")
    parser.add_argument("--save_checkpoint_steps", type=int, default=1000, help="Save checkpoint steps, "
                                                                                "default is 1000.")
    parser.add_argument("--train_steps", type=int, default=-1, help="Training Steps, default is -1, "
                                                                    "meaning run all steps according to epoch number.")
    parser.add_argument("--save_checkpoint_num", type=int, default=1, help="Save checkpoint numbers, default is 1.")
    parser.add_argument("--data_dir", type=str, default="", help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_dir", type=str, default="", help="Schema path, it is better to use absolute path")

    args_opt = parser.parse_args()
    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target,
                        device_id=args_opt.device_id, save_graphs=False)
    context.set_context(reserve_class_name_in_scope=False)
    context.set_context(max_call_depth=3000)
    ckpt_save_dir = args_opt.save_checkpoint_path
    if args_opt.distribute == "true":
        D.init()
        device_num = D.get_group_size()
        rank = D.get_rank()
        ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(rank) + '/'
        context.reset_auto_parallel_context()
        _set_bert_all_reduce_split()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
                                          device_num=device_num)

    else:
        rank = 0
        device_num = 1

    if args_opt.device_target == 'GPU' and bert_net_cfg.compute_type != mstype.float32:
        logger.warning('GPU only supports fp32 temporarily, running with fp32.')
        bert_net_cfg.compute_type = mstype.float32

    ds = create_bert_dataset(device_num, rank, args_opt.do_shuffle, args_opt.data_dir, args_opt.schema_dir)
    net_with_loss = BertNetworkWithLoss(bert_net_cfg, True)

    new_repeat_count = args_opt.epoch_size * ds.get_dataset_size() // args_opt.data_sink_steps
    if args_opt.train_steps > 0:
        new_repeat_count = min(new_repeat_count, args_opt.train_steps // args_opt.data_sink_steps)
    else:
        args_opt.train_steps = args_opt.epoch_size * ds.get_dataset_size()
        logger.info("train steps: {}".format(args_opt.train_steps))

    optimizer = _get_optimizer(args_opt, net_with_loss)
    callback = [TimeMonitor(args_opt.data_sink_steps), LossCallBack()]
    if args_opt.enable_save_ckpt == "true" and rank == 0:
        config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_checkpoint_steps,
                                     keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert', directory=ckpt_save_dir, config=config_ck)
        callback.append(ckpoint_cb)

    if args_opt.load_checkpoint_path:
        param_dict = load_checkpoint(args_opt.load_checkpoint_path)
        load_param_into_net(net_with_loss, param_dict)

    if args_opt.enable_lossscale == "true":
        update_cell = DynamicLossScaleUpdateCell(loss_scale_value=cfg.loss_scale_value,
                                                 scale_factor=cfg.scale_factor,
                                                 scale_window=cfg.scale_window)
        net_with_grads = BertTrainOneStepWithLossScaleCell(net_with_loss, optimizer=optimizer,
                                                           scale_update_cell=update_cell)
    else:
        net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer)

    model = Model(net_with_grads, frequency=cfg.Thor.frequency)
    model.train(new_repeat_count, ds, callbacks=callback, dataset_sink_mode=(args_opt.enable_data_sink == "true"),
                sink_size=args_opt.data_sink_steps)
Example No. 15
def create_dataset_cifar(dataset_path,
                         do_train,
                         repeat_num=1,
                         batch_size=32,
                         target="Ascend"):
    """
    create a train or eval cifar10 dataset

    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): whether dataset is used for train or eval.
        repeat_num(int): the repeat times of dataset. Default: 1
        batch_size(int): the batch size of dataset. Default: 32
        target(str): the device target. Default: Ascend

    Returns:
        dataset
    """
    if target == "Ascend":
        device_num, rank_id = _get_rank_info()
    else:
        init()
        rank_id = get_rank()
        device_num = get_group_size()

    if device_num == 1:
        ds = de.Cifar10Dataset(dataset_path,
                               num_parallel_workers=8,
                               shuffle=True)
    else:
        ds = de.Cifar10Dataset(dataset_path,
                               num_parallel_workers=8,
                               shuffle=True,
                               num_shards=device_num,
                               shard_id=rank_id)

    # define map operations
    if do_train:
        trans = [
            C.RandomCrop((32, 32), (4, 4, 4, 4)),
            C.RandomHorizontalFlip(prob=0.5),
            C.RandomColorAdjust(brightness=0.4, contrast=0.4, saturation=0.4),
            C.Resize((227, 227)),
            C.Rescale(1.0 / 255.0, 0.0),
            C.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]),
            C.CutOut(112),
            C.HWC2CHW()
        ]
    else:
        trans = [
            C.Resize((227, 227)),
            C.Rescale(1.0 / 255.0, 0.0),
            C.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]),
            C.HWC2CHW()
        ]

    type_cast_op = C2.TypeCast(mstype.int32)

    ds = ds.map(operations=type_cast_op,
                input_columns="label",
                num_parallel_workers=8)
    ds = ds.map(operations=trans,
                input_columns="image",
                num_parallel_workers=8)

    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)

    # apply dataset repeat operation
    ds = ds.repeat(repeat_num)

    return ds
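A minimal invocation of create_dataset_cifar, assuming the surrounding helpers (such as _get_rank_info) are importable and a local CIFAR-10 binary directory exists at a hypothetical ./cifar-10-batches-bin:

# Hypothetical usage; "./cifar-10-batches-bin" is an assumed local path.
train_ds = create_dataset_cifar("./cifar-10-batches-bin",
                                do_train=True,
                                batch_size=32,
                                target="Ascend")
for batch in train_ds.create_dict_iterator():
    # images: [32, 3, 227, 227] after HWC2CHW and batching
    images, labels = batch["image"], batch["label"]
    break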
Example No. 16
def train():
    # set args
    dev = "GPU"
    epoch_size = int(args_opt.epoch_size)
    total_batch = int(args_opt.batch_size)
    print_per_steps = int(args_opt.print_per_steps)
    compute_type = str(args_opt.dtype).lower()
    ckpt_save_dir = str(args_opt.ckpt_path)
    save_ckpt = bool(args_opt.save_ckpt)
    device_num = 1
    # init context
    if args_opt.mode == "GRAPH":
        mode = context.GRAPH_MODE
    else:
        mode = context.PYNATIVE_MODE
    context.set_context(mode=mode, device_target=dev, save_graphs=False)
    if args_opt.run_distribute:
        init()
        device_num = get_group_size()
        context.set_auto_parallel_context(
            device_num=device_num,
            parallel_mode=ParallelMode.DATA_PARALLEL,
            gradients_mean=True,
            all_reduce_fusion_config=[85, 160])
        ckpt_save_dir = ckpt_save_dir + "ckpt_" + str(get_rank()) + "/"

    # create dataset
    dataset = create_dataset(dataset_path=args_opt.dataset_path,
                             do_train=True,
                             repeat_num=1,
                             batch_size=total_batch,
                             target=dev,
                             dtype=compute_type,
                             device_num=device_num)
    step_size = dataset.get_dataset_size()
    if print_per_steps > step_size or print_per_steps < 1:
        print("Arg: print_per_steps should be between 1 and the dataset size ",
              step_size)
        print("Change to default: 20")
        print_per_steps = 20
    # define net
    net = resnet(class_num=1001, dtype=compute_type)

    # init weight
    for _, cell in net.cells_and_names():
        if isinstance(cell, nn.Conv2d):
            cell.weight.set_data(
                weight_init.initializer(weight_init.XavierUniform(),
                                        cell.weight.shape, cell.weight.dtype))
        if isinstance(cell, nn.Dense):
            cell.weight.set_data(
                weight_init.initializer(weight_init.TruncatedNormal(),
                                        cell.weight.shape, cell.weight.dtype))

    # init lr
    lr = get_liner_lr(lr_init=0,
                      lr_end=0,
                      lr_max=0.8,
                      warmup_epochs=0,
                      total_epochs=epoch_size,
                      steps_per_epoch=step_size)
    lr = Tensor(lr)

    # define opt
    decayed_params = []
    no_decayed_params = []
    for param in net.trainable_params():
        if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
            decayed_params.append(param)
        else:
            no_decayed_params.append(param)

    # define loss, model
    loss = CrossEntropySmooth(sparse=True,
                              reduction='mean',
                              smooth_factor=0.1,
                              num_classes=1001)
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr,
                   0.9, 1e-4)
    loss_scale = FixedLossScaleManager(1024, drop_overflow_update=False)
    model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})
    # Mixed precision
    if compute_type == "fp16":
        opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                       lr, 0.9, 1e-4, 1024)
        model = Model(net,
                      loss_fn=loss,
                      optimizer=opt,
                      loss_scale_manager=loss_scale,
                      metrics={'acc'},
                      amp_level="O2",
                      keep_batchnorm_fp32=False)
    # define callbacks
    if mode == context.PYNATIVE_MODE:
        print_per_steps = 1
    time_cb = MyTimeMonitor(total_batch, print_per_steps, step_size, mode)
    cb = [time_cb]
    if save_ckpt:
        config_ck = CheckpointConfig(save_checkpoint_steps=5 * step_size,
                                     keep_checkpoint_max=5)
        ckpt_cb = ModelCheckpoint(prefix="resnet_benchmark",
                                  directory=ckpt_save_dir,
                                  config=config_ck)
        cb += [ckpt_cb]
    # train model
    print("========START RESNET50 GPU BENCHMARK========")
    if mode == context.GRAPH_MODE:
        model.train(int(epoch_size * step_size / print_per_steps),
                    dataset,
                    callbacks=cb,
                    sink_size=print_per_steps)
    else:
        model.train(epoch_size, dataset, callbacks=cb)
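get_liner_lr is not shown in this snippet; a plausible linear warmup-then-decay schedule matching its signature would be the following sketch (an assumption about its shape, not the benchmark's actual helper):

import numpy as np

# Hypothetical linear schedule: ramp from lr_init to lr_max over the warmup
# steps, then decay linearly to lr_end over the remaining steps.
def liner_lr_sketch(lr_init, lr_end, lr_max, warmup_epochs,
                    total_epochs, steps_per_epoch):
    total_steps = total_epochs * steps_per_epoch
    warmup_steps = warmup_epochs * steps_per_epoch
    lr = []
    for step in range(total_steps):
        if warmup_steps and step < warmup_steps:
            lr.append(lr_init + (lr_max - lr_init) * step / warmup_steps)
        else:
            decay_steps = max(total_steps - warmup_steps, 1)
            frac = (step - warmup_steps) / decay_steps
            lr.append(lr_max - (lr_max - lr_end) * frac)
    return np.array(lr, dtype=np.float32)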
Example No. 17
def create_dataset_imagenet(dataset_path,
                            do_train,
                            repeat_num=1,
                            batch_size=32,
                            target="Ascend"):
    """
    create a train or eval imagenet dataset

    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): whether dataset is used for train or eval.
        repeat_num(int): the repeat times of dataset. Default: 1
        batch_size(int): the batch size of dataset. Default: 32
        target(str): the device target. Default: Ascend

    Returns:
        dataset
    """
    if target == "Ascend":
        device_num, rank_id = _get_rank_info()
    else:
        init()
        rank_id = get_rank()
        device_num = get_group_size()

    if device_num == 1:
        ds = de.ImageFolderDataset(dataset_path,
                                   num_parallel_workers=8,
                                   shuffle=True)
    else:
        ds = de.ImageFolderDataset(dataset_path,
                                   num_parallel_workers=8,
                                   shuffle=True,
                                   num_shards=device_num,
                                   shard_id=rank_id)

    image_size = 227
    mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
    std = [0.229 * 255, 0.224 * 255, 0.225 * 255]

    # define map operations
    if do_train:
        trans = [
            C.RandomCropDecodeResize(image_size,
                                     scale=(0.08, 1.0),
                                     ratio=(0.75, 1.333)),
            C.RandomHorizontalFlip(prob=0.5),
            C.RandomColorAdjust(brightness=0.4, contrast=0.4, saturation=0.4),
            C.Normalize(mean=mean, std=std),
            C.CutOut(112),
            C.HWC2CHW()
        ]
    else:
        trans = [
            C.Decode(),
            C.Resize((256, 256)),
            C.CenterCrop(image_size),
            C.Normalize(mean=mean, std=std),
            C.HWC2CHW()
        ]

    type_cast_op = C2.TypeCast(mstype.int32)

    ds = ds.map(operations=type_cast_op,
                input_columns="label",
                num_parallel_workers=8)
    ds = ds.map(operations=trans,
                input_columns="image",
                num_parallel_workers=8)

    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)

    # apply dataset repeat operation
    ds = ds.repeat(repeat_num)

    return ds
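The mean/std constants above are simply the standard ImageNet statistics rescaled from the [0, 1] range to raw pixel units, since Normalize here runs on images still in [0, 255]:

# Standard ImageNet statistics in [0, 1] ...
imagenet_mean = [0.485, 0.456, 0.406]
imagenet_std = [0.229, 0.224, 0.225]

# ... expressed in raw pixel units for images still in [0, 255]:
mean = [m * 255 for m in imagenet_mean]  # [123.675, 116.28, 103.53]
std = [s * 255 for s in imagenet_std]    # [58.395, 57.12, 57.375]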
Example No. 18
def train():
    """Train function."""
    args = parse_args()

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()

    # select whether only the master rank saves ckpt or all ranks save; compatible with model parallel
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1

    # logger
    args.outputs_dir = os.path.join(
        args.ckpt_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)
    args.logger.save_args(args)

    if args.need_profiler:
        from mindinsight.profiler.profiling import Profiler
        profiler = Profiler(output_path=args.outputs_dir,
                            is_detail=True,
                            is_show_op_path=True)

    loss_meter = AverageMeter('loss')

    context.reset_auto_parallel_context()
    if args.is_distributed:
        parallel_mode = ParallelMode.DATA_PARALLEL
        degree = get_group_size()
    else:
        parallel_mode = ParallelMode.STAND_ALONE
        degree = 1
    context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                      mirror_mean=True,
                                      device_num=degree)

    network = YOLOV3DarkNet53(is_training=True)
    # default is kaiming-normal
    default_recurisive_init(network)

    if args.pretrained_backbone:
        network = load_backbone(network, args.pretrained_backbone, args)
        args.logger.info('load pre-trained backbone {} into network'.format(
            args.pretrained_backbone))
    else:
        args.logger.info('No pre-trained backbone loaded, please be careful')

    if args.resume_yolov3:
        param_dict = load_checkpoint(args.resume_yolov3)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('yolo_network.'):
                param_dict_new[key[13:]] = values
                args.logger.info('in resume {}'.format(key))
            else:
                param_dict_new[key] = values
                args.logger.info('in resume {}'.format(key))

        args.logger.info('resume finished')
        load_param_into_net(network, param_dict_new)
        args.logger.info('load_model {} success'.format(args.resume_yolov3))

    network = YoloWithLossCell(network)
    args.logger.info('finish get network')

    config = ConfigYOLOV3DarkNet53()

    config.label_smooth = args.label_smooth
    config.label_smooth_factor = args.label_smooth_factor

    if args.training_shape:
        config.multi_scale = [conver_training_shape(args)]
    if args.resize_rate:
        config.resize_rate = args.resize_rate

    ds, data_size = create_yolo_dataset(image_dir=args.data_root,
                                        anno_path=args.annFile,
                                        is_training=True,
                                        batch_size=args.per_batch_size,
                                        max_epoch=args.max_epoch,
                                        device_num=args.group_size,
                                        rank=args.rank,
                                        config=config)
    args.logger.info('Finish loading dataset')

    args.steps_per_epoch = int(data_size / args.per_batch_size /
                               args.group_size)

    if not args.ckpt_interval:
        args.ckpt_interval = args.steps_per_epoch

    # lr scheduler
    if args.lr_scheduler == 'exponential':
        lr = warmup_step_lr(
            args.lr,
            args.lr_epochs,
            args.steps_per_epoch,
            args.warmup_epochs,
            args.max_epoch,
            gamma=args.lr_gamma,
        )
    elif args.lr_scheduler == 'cosine_annealing':
        lr = warmup_cosine_annealing_lr(args.lr, args.steps_per_epoch,
                                        args.warmup_epochs, args.max_epoch,
                                        args.T_max, args.eta_min)
    elif args.lr_scheduler == 'cosine_annealing_V2':
        lr = warmup_cosine_annealing_lr_V2(args.lr, args.steps_per_epoch,
                                           args.warmup_epochs, args.max_epoch,
                                           args.T_max, args.eta_min)
    elif args.lr_scheduler == 'cosine_annealing_sample':
        lr = warmup_cosine_annealing_lr_sample(args.lr, args.steps_per_epoch,
                                               args.warmup_epochs,
                                               args.max_epoch, args.T_max,
                                               args.eta_min)
    else:
        raise NotImplementedError(args.lr_scheduler)

    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)

    network = TrainingWrapper(network, opt)
    network.set_train()

    if args.rank_save_ckpt_flag:
        # checkpoint save
        ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
        ckpt_config = CheckpointConfig(
            save_checkpoint_steps=args.ckpt_interval,
            keep_checkpoint_max=ckpt_max_num)
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=args.outputs_dir,
                                  prefix='{}'.format(args.rank))
        cb_params = _InternalCallbackParam()
        cb_params.train_network = network
        cb_params.epoch_num = ckpt_max_num
        cb_params.cur_epoch_num = 1
        run_context = RunContext(cb_params)
        ckpt_cb.begin(run_context)

    old_progress = -1
    t_end = time.time()
    data_loader = ds.create_dict_iterator()

    shape_record = ShapeRecord()
    for i, data in enumerate(data_loader):
        images = data["image"]
        input_shape = images.shape[2:4]
        args.logger.info('iter[{}], shape{}'.format(i, input_shape[0]))
        shape_record.set(input_shape)

        images = Tensor(images)
        annos = data["annotation"]
        if args.group_size == 1:
            batch_y_true_0, batch_y_true_1, batch_y_true_2, batch_gt_box0, batch_gt_box1, batch_gt_box2 = \
                batch_preprocess_true_box(annos, config, input_shape)
        else:
            batch_y_true_0, batch_y_true_1, batch_y_true_2, batch_gt_box0, batch_gt_box1, batch_gt_box2 = \
                batch_preprocess_true_box_single(annos, config, input_shape)

        batch_y_true_0 = Tensor(batch_y_true_0)
        batch_y_true_1 = Tensor(batch_y_true_1)
        batch_y_true_2 = Tensor(batch_y_true_2)
        batch_gt_box0 = Tensor(batch_gt_box0)
        batch_gt_box1 = Tensor(batch_gt_box1)
        batch_gt_box2 = Tensor(batch_gt_box2)

        input_shape = Tensor(tuple(input_shape[::-1]), ms.float32)
        loss = network(images, batch_y_true_0, batch_y_true_1, batch_y_true_2,
                       batch_gt_box0, batch_gt_box1, batch_gt_box2,
                       input_shape)
        loss_meter.update(loss.asnumpy())

        if args.rank_save_ckpt_flag:
            # ckpt progress
            cb_params.cur_step_num = i + 1  # current step number
            cb_params.batch_num = i + 2
            ckpt_cb.step_end(run_context)

        if i % args.log_interval == 0:
            time_used = time.time() - t_end
            epoch = int(i / args.steps_per_epoch)
            fps = args.per_batch_size * (
                i - old_progress) * args.group_size / time_used
            if args.rank == 0:
                args.logger.info(
                    'epoch[{}], iter[{}], {}, {:.2f} imgs/sec, lr:{}'.format(
                        epoch, i, loss_meter, fps, lr[i]))
            t_end = time.time()
            loss_meter.reset()
            old_progress = i

        if (i + 1) % args.steps_per_epoch == 0 and args.rank_save_ckpt_flag:
            cb_params.cur_epoch_num += 1

        if args.need_profiler:
            if i == 10:
                profiler.analyse()
                break

    args.logger.info('==========end training===============')
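AverageMeter is referenced above but not defined in this snippet; a minimal version with the usual update/reset contract (an assumed interface, the real helper may track more fields) is:

# Minimal AverageMeter sketch matching how the training loop uses it:
# update(loss), reset(), and a printable running average.
class AverageMeter:
    def __init__(self, name):
        self.name = name
        self.reset()

    def reset(self):
        self.sum = 0.0
        self.count = 0

    def update(self, value):
        self.sum += float(value)
        self.count += 1

    def __str__(self):
        avg = self.sum / max(self.count, 1)
        return '{}: {:.4f}'.format(self.name, avg)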
Example No. 19
def train():
    args = parse_args()

    if args.device_target == "CPU":
        context.set_context(mode=context.GRAPH_MODE, save_graphs=False, device_target="CPU")
    else:
        context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True, save_graphs=False,
                            device_target="Ascend", device_id=int(os.getenv('DEVICE_ID')))

    # init multi-card training
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()

        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=args.group_size)

    # dataset
    dataset = data_generator.SegDataset(image_mean=args.image_mean,
                                        image_std=args.image_std,
                                        data_file=args.data_file,
                                        batch_size=args.batch_size,
                                        crop_size=args.crop_size,
                                        max_scale=args.max_scale,
                                        min_scale=args.min_scale,
                                        ignore_label=args.ignore_label,
                                        num_classes=args.num_classes,
                                        num_readers=2,
                                        num_parallel_calls=4,
                                        shard_id=args.rank,
                                        shard_num=args.group_size)
    dataset = dataset.get_dataset(repeat=1)

    # network
    if args.model == 'deeplab_v3_s16':
        network = net_factory.nets_map[args.model]('train', args.num_classes, 16, args.freeze_bn)
    elif args.model == 'deeplab_v3_s8':
        network = net_factory.nets_map[args.model]('train', args.num_classes, 8, args.freeze_bn)
    else:
        raise NotImplementedError('model [{:s}] not recognized'.format(args.model))

    # loss
    loss_ = loss.SoftmaxCrossEntropyLoss(args.num_classes, args.ignore_label)
    loss_.add_flags_recursive(fp32=True)
    train_net = BuildTrainNetwork(network, loss_)

    # load pretrained model
    if args.ckpt_pre_trained:
        param_dict = load_checkpoint(args.ckpt_pre_trained)
        if args.filter_weight:
            filter_list = ["network.aspp.conv2.weight", "network.aspp.conv2.bias"]
            for key in list(param_dict.keys()):
                for filter_key in filter_list:
                    if filter_key not in key:
                        continue
                    print('filter {}'.format(key))
                    del param_dict[key]
        load_param_into_net(train_net, param_dict)
        print('load_model {} success'.format(args.ckpt_pre_trained))

    # optimizer
    iters_per_epoch = dataset.get_dataset_size()
    total_train_steps = iters_per_epoch * args.train_epochs
    if args.lr_type == 'cos':
        lr_iter = learning_rates.cosine_lr(args.base_lr, total_train_steps, total_train_steps)
    elif args.lr_type == 'poly':
        lr_iter = learning_rates.poly_lr(args.base_lr, total_train_steps, total_train_steps, end_lr=0.0, power=0.9)
    elif args.lr_type == 'exp':
        lr_iter = learning_rates.exponential_lr(args.base_lr, args.lr_decay_step, args.lr_decay_rate,
                                                total_train_steps, staircase=True)
    else:
        raise ValueError('unknown learning rate type')
    opt = nn.Momentum(params=train_net.trainable_params(), learning_rate=lr_iter, momentum=0.9, weight_decay=0.0001,
                      loss_scale=args.loss_scale)

    # loss scale
    manager_loss_scale = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)
    amp_level = "O0" if args.device_target == "CPU" else "O3"
    model = Model(train_net, optimizer=opt, amp_level=amp_level, loss_scale_manager=manager_loss_scale)

    # callback for saving ckpts
    time_cb = TimeMonitor(data_size=iters_per_epoch)
    loss_cb = LossMonitor()
    cbs = [time_cb, loss_cb]

    if args.rank == 0:
        config_ck = CheckpointConfig(save_checkpoint_steps=args.save_steps,
                                     keep_checkpoint_max=args.keep_checkpoint_max)
        ckpoint_cb = ModelCheckpoint(prefix=args.model, directory=args.train_dir, config=config_ck)
        cbs.append(ckpoint_cb)

    model.train(args.train_epochs, dataset, callbacks=cbs, dataset_sink_mode=(args.device_target != "CPU"))
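The 'poly' branch above uses the standard polynomial decay common in DeepLab training; a sketch consistent with the call signature (assumed, not the repository's learning_rates module):

import numpy as np

# Polynomial decay:
#   lr(step) = (base_lr - end_lr) * (1 - step / decay_steps) ** power + end_lr
def poly_lr_sketch(base_lr, decay_steps, total_steps, end_lr=0.0, power=0.9):
    steps = np.arange(total_steps)
    frac = np.minimum(steps / decay_steps, 1.0)
    return ((base_lr - end_lr) * (1.0 - frac) ** power + end_lr).astype(np.float32)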
Example No. 20
def run_transformer_train():
    """
    Transformer training.
    """
    parser = argparse_init()
    args, _ = parser.parse_known_args()
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        device_id=args.device_id)
    context.set_context(reserve_class_name_in_scope=False,
                        enable_auto_mixed_precision=False)

    if args.distribute == "true":
        device_num = args.device_num
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL,
            gradients_mean=True,
            device_num=device_num)
        D.init()
        rank_id = args.device_id % device_num
        save_ckpt_path = os.path.join(args.save_checkpoint_path,
                                      'ckpt_' + str(get_rank()) + '/')
    else:
        device_num = 1
        rank_id = 0
        save_ckpt_path = os.path.join(args.save_checkpoint_path, 'ckpt_0/')
    dataset = create_transformer_dataset(
        epoch_count=1,
        rank_size=device_num,
        rank_id=rank_id,
        do_shuffle=args.do_shuffle,
        dataset_path=args.data_path,
        bucket_boundaries=args.bucket_boundaries)

    netwithloss = TransformerNetworkWithLoss(transformer_net_cfg, True)

    if args.checkpoint_path:
        parameter_dict = load_checkpoint(args.checkpoint_path)
        load_param_into_net(netwithloss, parameter_dict)

    lr = Tensor(
        create_dynamic_lr(
            schedule="constant*rsqrt_hidden*linear_warmup*rsqrt_decay",
            training_steps=dataset.get_dataset_size() * args.epoch_size,
            learning_rate=cfg.lr_schedule.learning_rate,
            warmup_steps=cfg.lr_schedule.warmup_steps,
            hidden_size=transformer_net_cfg.hidden_size,
            start_decay_step=cfg.lr_schedule.start_decay_step,
            min_lr=cfg.lr_schedule.min_lr), mstype.float32)
    optimizer = Adam(netwithloss.trainable_params(), lr)

    callbacks = [
        TimeMonitor(dataset.get_dataset_size()),
        LossCallBack(rank_id=rank_id)
    ]
    if args.enable_save_ckpt == "true":
        if device_num == 1 or (device_num > 1 and rank_id == 0):
            ckpt_config = CheckpointConfig(
                save_checkpoint_steps=args.save_checkpoint_steps,
                keep_checkpoint_max=args.save_checkpoint_num)
            ckpoint_cb = ModelCheckpoint(prefix='transformer',
                                         directory=save_ckpt_path,
                                         config=ckpt_config)
            callbacks.append(ckpoint_cb)

    if args.enable_lossscale == "true":
        scale_manager = DynamicLossScaleManager(
            init_loss_scale=cfg.init_loss_scale_value,
            scale_factor=cfg.scale_factor,
            scale_window=cfg.scale_window)
        update_cell = scale_manager.get_update_cell()
        netwithgrads = TransformerTrainOneStepWithLossScaleCell(
            netwithloss, optimizer=optimizer, scale_update_cell=update_cell)
    else:
        netwithgrads = TransformerTrainOneStepCell(netwithloss,
                                                   optimizer=optimizer)

    netwithgrads.set_train(True)
    model = Model(netwithgrads)

    model.train(args.epoch_size,
                dataset,
                callbacks=callbacks,
                dataset_sink_mode=False)
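The schedule string "constant*rsqrt_hidden*linear_warmup*rsqrt_decay" names the classic Transformer learning-rate recipe; a sketch of the formula under assumed semantics (delayed decay via start_decay_step and a floor at min_lr):

import numpy as np

# Classic Transformer schedule (Vaswani et al.):
#   lr(step) = base_lr * hidden_size**-0.5 * min(step * warmup**-1.5, step**-0.5)
# with decay held off until start_decay_step and clipped below at min_lr.
def transformer_lr_sketch(base_lr, hidden_size, warmup_steps,
                          training_steps, start_decay_step, min_lr):
    lrs = []
    for step in range(1, training_steps + 1):
        scale = hidden_size ** -0.5
        warmup = step * warmup_steps ** -1.5
        decay = max(step, start_decay_step) ** -0.5
        lrs.append(max(base_lr * scale * min(warmup, decay), min_lr))
    return np.array(lrs, dtype=np.float32)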
Example No. 21
def get_args(phase):
    """Define the common options that are used in both training and test."""
    parser = argparse.ArgumentParser(description='Configuration')

    # Hardware specifications
    parser.add_argument('--seed', type=int, default=1, help='random seed')
    parser.add_argument("--device_id",
                        type=int,
                        default=0,
                        help="device id, default is 0.")
    parser.add_argument('--device_num',
                        type=int,
                        default=1,
                        help='device num, default is 1.')
    parser.add_argument('--platform', type=str, default="Ascend", \
                        help='run platform, only support Ascend')
    parser.add_argument('--save_graphs', type=ast.literal_eval, default=False, \
                        help='whether save graphs, default is False.')
    parser.add_argument('--dataset', type=str, default="large", choices=("large", "small", "demo"), \
                        help='MIND dataset, support large, small and demo.')
    parser.add_argument('--dataset_path',
                        type=str,
                        default=None,
                        help='MIND dataset path.')

    # Model specifications
    parser.add_argument('--n_browsed_news',
                        type=int,
                        default=50,
                        help='number of browsed news per user')
    parser.add_argument('--n_words_title',
                        type=int,
                        default=16,
                        help='number of words per title')
    parser.add_argument('--n_words_abstract',
                        type=int,
                        default=48,
                        help='number of words per abstract')
    parser.add_argument('--word_embedding_dim',
                        type=int,
                        default=304,
                        help='dimension of word embedding vector')
    parser.add_argument('--category_embedding_dim', type=int, default=112, \
                        help='dimension of category embedding vector')
    parser.add_argument('--query_vector_dim',
                        type=int,
                        default=208,
                        help='dimension of the query vector in attention')
    parser.add_argument('--n_filters',
                        type=int,
                        default=400,
                        help='number of filters in CNN')
    parser.add_argument('--window_size',
                        type=int,
                        default=3,
                        help='size of filter in CNN')
    parser.add_argument("--checkpoint_path", type=str, default=None, \
                        help="Pre trained checkpoint path, default is None.")
    parser.add_argument('--batch_size',
                        type=int,
                        default=64,
                        help='size of each batch')
    # Training specifications
    if phase == "train":
        parser.add_argument('--epochs',
                            type=int,
                            default=None,
                            help='number of epochs for training')
        parser.add_argument('--lr',
                            type=float,
                            default=None,
                            help='learning rate')
        parser.add_argument('--beta1',
                            type=float,
                            default=0.9,
                            help='ADAM beta1')
        parser.add_argument('--beta2',
                            type=float,
                            default=0.999,
                            help='ADAM beta2')
        parser.add_argument('--epsilon',
                            type=float,
                            default=1e-8,
                            help='ADAM epsilon for numerical stability')
        parser.add_argument(
            '--neg_sample',
            type=int,
            default=4,
            help='number of negative samples in negative sampling')
        parser.add_argument("--mixed", type=ast.literal_eval, default=True, \
                            help="whether use mixed precision, default is True.")
        parser.add_argument("--sink_mode", type=ast.literal_eval, default=True, \
                            help="whether use dataset sink, default is True.")
        parser.add_argument('--print_times',
                            type=int,
                            default=None,
                            help='number of print times, default is None')
        parser.add_argument("--weight_decay", type=ast.literal_eval, default=True, \
                            help="whether use weight decay, default is True.")
        parser.add_argument('--save_checkpoint', type=ast.literal_eval, default=True, \
                            help='whether save checkpoint, default is True.')
        parser.add_argument("--save_checkpoint_path", type=str, default="./checkpoint", \
                            help="Save checkpoint path, default is checkpoint.")
        parser.add_argument('--dropout_ratio',
                            type=float,
                            default=0.2,
                            help='ratio of dropout')
    if phase == "eval":
        parser.add_argument('--neg_sample', type=int, default=-1, \
                            help='number of negative samples in negative sampling')
    if phase == "export":
        parser.add_argument('--file_format', type=str, choices=["AIR", "ONNX", "MINDIR"], default='AIR', \
                            help='file format')
        parser.add_argument('--neg_sample', type=int, default=-1, \
                            help='number of negative samples in negative sampling')
    args = parser.parse_args()
    if args.device_num > 1:
        context.set_context(mode=context.GRAPH_MODE,
                            device_target=args.platform,
                            save_graphs=args.save_graphs)
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL,
            gradients_mean=True,
            device_num=args.device_num)
        init()
        args.rank = get_rank()
        args.save_checkpoint_path = os.path.join(args.save_checkpoint_path,
                                                 "ckpt_" + str(args.rank))
    else:
        context.set_context(mode=context.GRAPH_MODE,
                            device_target=args.platform,
                            device_id=args.device_id,
                            save_graphs=args.save_graphs,
                            save_graphs_path="naml_ir")
        args.rank = 0
        args.device_num = 1
    args.phase = phase
    cfg = get_dataset_config(args.dataset)
    args.n_categories = cfg.n_categories
    args.n_sub_categories = cfg.n_sub_categories
    args.n_words = cfg.n_words
    if phase == "train":
        args.epochs = cfg.epochs * math.ceil(
            args.device_num**0.5) if args.epochs is None else args.epochs
        args.lr = cfg.lr if args.lr is None else args.lr
        args.print_times = cfg.print_times if args.print_times is None else args.print_times
    args.embedding_file = cfg.embedding_file.format(args.dataset_path)
    args.word_dict_path = cfg.word_dict_path.format(args.dataset_path)
    args.category_dict_path = cfg.category_dict_path.format(args.dataset_path)
    args.subcategory_dict_path = cfg.subcategory_dict_path.format(
        args.dataset_path)
    args.uid2index_path = cfg.uid2index_path.format(args.dataset_path)
    args.train_dataset_path = cfg.train_dataset_path.format(args.dataset_path)
    args.eval_dataset_path = cfg.eval_dataset_path.format(args.dataset_path)
    args_dict = vars(args)
    for key in args_dict.keys():
        print('--> {}:{}'.format(key, args_dict[key]), flush=True)
    return args
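Typical use of get_args for the training phase, assuming a MindSpore device is available and the MIND files exist under a hypothetical /data/MIND path:

import sys

# Hypothetical invocation; the dataset path is an assumption.
sys.argv = ["train.py", "--dataset", "small",
            "--dataset_path", "/data/MIND", "--batch_size", "64"]
args = get_args("train")
print(args.n_categories, args.epochs, args.lr)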
Example No. 22
def _init_parallel(self):
    self._init_parallel_flag = False
    init(backend_name='hccl')
    self._init_parallel_flag = True
Example No. 23
def main():
    parser = argparse.ArgumentParser(description="YOLOv3 train")
    parser.add_argument("--only_create_dataset",
                        type=bool,
                        default=False,
                        help="If set it true, only create "
                        "Mindrecord, default is false.")
    parser.add_argument("--distribute",
                        type=bool,
                        default=False,
                        help="Run distribute, default is false.")
    parser.add_argument("--device_id",
                        type=int,
                        default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--device_num",
                        type=int,
                        default=1,
                        help="Use device nums, default is 1.")
    parser.add_argument("--lr",
                        type=float,
                        default=0.001,
                        help="Learning rate, default is 0.001.")
    parser.add_argument("--mode",
                        type=str,
                        default="sink",
                        help="Run sink mode or not, default is sink")
    parser.add_argument("--epoch_size",
                        type=int,
                        default=10,
                        help="Epoch size, default is 10")
    parser.add_argument("--batch_size",
                        type=int,
                        default=32,
                        help="Batch size, default is 32.")
    parser.add_argument("--pre_trained",
                        type=str,
                        default=None,
                        help="Pretrained checkpoint file path")
    parser.add_argument("--pre_trained_epoch_size",
                        type=int,
                        default=0,
                        help="Pretrained epoch size")
    parser.add_argument("--save_checkpoint_epochs",
                        type=int,
                        default=5,
                        help="Save checkpoint epochs, default is 5.")
    parser.add_argument("--loss_scale",
                        type=int,
                        default=1024,
                        help="Loss scale, default is 1024.")
    parser.add_argument(
        "--mindrecord_dir",
        type=str,
        default="./Mindrecord_train",
        help=
        "Mindrecord directory. If the mindrecord_dir is empty, it will generate mindrecord files by "
        "image_dir and anno_path. Note if mindrecord_dir isn't empty, it will use mindrecord_dir "
        "rather than image_dir and anno_path. Default is ./Mindrecord_train")
    parser.add_argument("--image_dir",
                        type=str,
                        default="",
                        help="Dataset directory, "
                        "the absolute image path is joined by the image_dir "
                        "and the relative path in anno_path")
    parser.add_argument("--anno_path",
                        type=str,
                        default="",
                        help="Annotation path.")
    args_opt = parser.parse_args()

    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        device_id=args_opt.device_id)
    if args_opt.distribute:
        device_num = args_opt.device_num
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL,
            mirror_mean=True,
            device_num=device_num)
        init()
        rank = args_opt.device_id % device_num
    else:
        rank = 0
        device_num = 1

    print("Start create dataset!")

    # It will generate mindrecord file in args_opt.mindrecord_dir,
    # and the file name is yolo.mindrecord0, 1, ... file_num.
    if not os.path.isdir(args_opt.mindrecord_dir):
        os.makedirs(args_opt.mindrecord_dir)

    prefix = "yolo.mindrecord"
    mindrecord_file = os.path.join(args_opt.mindrecord_dir, prefix + "0")
    if not os.path.exists(mindrecord_file):
        if os.path.isdir(args_opt.image_dir) and os.path.exists(
                args_opt.anno_path):
            print("Create Mindrecord.")
            data_to_mindrecord_byte_image(args_opt.image_dir,
                                          args_opt.anno_path,
                                          args_opt.mindrecord_dir, prefix, 8)
            print("Create Mindrecord Done, at {}".format(
                args_opt.mindrecord_dir))
        else:
            print("image_dir or anno_path not exits.")

    if not args_opt.only_create_dataset:
        loss_scale = float(args_opt.loss_scale)

        # When creating MindDataset, use the first mindrecord file, such as yolo.mindrecord0.
        dataset = create_yolo_dataset(mindrecord_file,
                                      batch_size=args_opt.batch_size,
                                      device_num=device_num,
                                      rank=rank)
        dataset_size = dataset.get_dataset_size()
        print("Create dataset done!")

        net = yolov3_resnet18(ConfigYOLOV3ResNet18())
        net = YoloWithLossCell(net, ConfigYOLOV3ResNet18())
        init_net_param(net, "XavierUniform")

        # checkpoint
        ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size *
                                       args_opt.save_checkpoint_epochs)
        ckpoint_cb = ModelCheckpoint(prefix="yolov3",
                                     directory=None,
                                     config=ckpt_config)

        if args_opt.pre_trained:
            if args_opt.pre_trained_epoch_size <= 0:
                raise ValueError(
                    "pre_trained_epoch_size must be greater than 0.")
            param_dict = load_checkpoint(args_opt.pre_trained)
            load_param_into_net(net, param_dict)
        total_epoch_size = 60
        if args_opt.distribute:
            total_epoch_size = 160
        lr = Tensor(
            get_lr(learning_rate=args_opt.lr,
                   start_step=args_opt.pre_trained_epoch_size * dataset_size,
                   global_step=total_epoch_size * dataset_size,
                   decay_step=1000,
                   decay_rate=0.95,
                   steps=True))
        opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()),
                      lr,
                      loss_scale=loss_scale)
        net = TrainingWrapper(net, opt, loss_scale)

        callback = [
            TimeMonitor(data_size=dataset_size),
            LossMonitor(), ckpoint_cb
        ]

        model = Model(net)
        dataset_sink_mode = False
        if args_opt.mode == "sink":
            print("In sink mode, one epoch return a loss.")
            dataset_sink_mode = True
        print(
            "Start train YOLOv3, the first epoch will be slower because of the graph compilation."
        )
        model.train(args_opt.epoch_size,
                    dataset,
                    callbacks=callback,
                    dataset_sink_mode=dataset_sink_mode)
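get_lr above is configured for staircase exponential decay (decay_rate 0.95 every 1000 steps); the formula it parameterizes can be sketched as follows, under assumed semantics:

import numpy as np

# Staircase exponential decay:
#   lr(step) = learning_rate * decay_rate ** floor(step / decay_step)
def exponential_lr_sketch(learning_rate, start_step, global_step,
                          decay_step, decay_rate):
    steps = np.arange(start_step, global_step)
    return (learning_rate *
            decay_rate ** np.floor(steps / decay_step)).astype(np.float32)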
Example No. 24
def _setup_parallel_env():
    context.reset_auto_parallel_context()
    MultiAscend.init()
    context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                      device_num=MultiAscend.get_group_size(),
                                      gradients_mean=True)
Example No. 25
def run_translation():
    '''
    run translation task
    '''
    parser = argparse.ArgumentParser(
        description="Finetune and Evaluate Translation")
    parser.add_argument("--device_target",
                        type=str,
                        default="Ascend",
                        help="Device type. Default: Ascend.")
    # parser.add_argument("--device_id", type=int, default=0,
    #                     help="ID of target device. ")
    parser.add_argument(
        "--metric_method",
        type=str,
        default="BLEU",
        help="The eval method including [BLEU]. Default: BLEU.")
    parser.add_argument("--do_train",
                        type=str,
                        default="true",
                        help="Enable train. Default: false.")
    parser.add_argument("--do_eval",
                        type=str,
                        default="false",
                        help="Enable evaluation. Default: false.")
    parser.add_argument("--epoch_num",
                        type=int,
                        default=5,
                        help="Epoch number. Default: 5.")
    parser.add_argument("--train_data_shuffle",
                        type=str,
                        default="true",
                        help="Enable train data shuffle. Default: true.")
    parser.add_argument("--eval_data_shuffle",
                        type=str,
                        default="false",
                        help="Enable eval data shuffle. Default: false.")
    parser.add_argument(
        "--save_finetune_ckpt_path",
        type=str,
        default="/home/tju/gpt2/MindSpore-GPT2/pretrained-weight/saved/",
        help="Save the checkpoint path.")
    parser.add_argument(
        "--load_pretrain_ckpt_path",
        type=str,
        default=
        "/home/tju/gpt2/MindSpore-GPT2/pretrained-weight/mindspore_model_small.ckpt",
        help="Load the checkpoint file path.")
    parser.add_argument(
        "--load_finetune_ckpt_path",
        type=str,
        default=
        "/home/tju/gpt2/MindSpore-GPT2/pretrained-weight/mindspore_model_small.ckpt",
        help="Load the checkpoint file path.")
    parser.add_argument(
        "--train_data_file_path",
        type=str,
        default=
        "/home/tju/gpt2/MindSpore-GPT2/mindspore-dataset/en-fr-train-mindrecord",
        help="Data path, it is better to use absolute path")
    parser.add_argument(
        "--eval_data_file_path",
        type=str,
        default=
        "/home/tju/gpt2/MindSpore-GPT2/mindspore-dataset/en-fr-test-mindrecord",
        help="Data path, it is better to use absolute path")
    # parser.add_argument("--translate_direction", type=str, default="en-fr",
    #                     help="translate from Language_A to Language_B: ['en-fr','fr-en']")
    parser.add_argument("--device_num",
                        type=int,
                        default=1,
                        help="device number")
    args_opt = parser.parse_args()

    epoch_num = args_opt.epoch_num
    metric = args_opt.metric_method
    save_finetune_ckpt_path = args_opt.save_finetune_ckpt_path
    load_finetune_ckpt_path = args_opt.load_finetune_ckpt_path
    load_pretrain_ckpt_path = args_opt.load_pretrain_ckpt_path

    if args_opt.do_train.lower() == "false" and args_opt.do_eval.lower(
    ) == "false":
        raise ValueError(
            "At least one of 'do_train' or 'do_eval' must be true")
    if args_opt.do_train.lower(
    ) == "true" and args_opt.train_data_file_path == "":
        raise ValueError(
            "'train_data_file_path' must be set when do finetune task")
    if args_opt.do_eval.lower(
    ) == "true" and args_opt.eval_data_file_path == "":
        raise ValueError(
            "'eval_data_file_path' must be set when do evaluation task")

    translate_direction = args_opt.translate_direction
    if translate_direction not in ['en-fr', 'fr-en']:
        raise ValueError("--translate_direction should be in set: ['en-fr', 'fr-en']")

    device_target = args_opt.device_target

    if device_target == "GPU":
        context.set_context(mode=context.GRAPH_MODE,
                            device_target="GPU",
                            device_id=args_opt.device_id,
                            max_call_depth=3000)
        context.set_auto_parallel_context(parallel_mode="stand_alone")

    elif device_target == "Ascend":
        device_id = int(os.getenv('DEVICE_ID'))
        # device_id = args_opt.device_id
        print(
            "-------| device {}, target {}, device num {} |-------".format(
                device_id, device_target, args_opt.device_num))
        context.set_context(mode=context.GRAPH_MODE,
                            device_target=device_target,
                            device_id=device_id)
        context.set_auto_parallel_context(
            device_num=args_opt.device_num,
            parallel_mode=ParallelMode.DATA_PARALLEL,
            gradients_mean=True)
        # context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,device_num = args_opt.device_num,gradients_mean=True)
        init()
        print("-------|   HCCL init finished    |-------")

        save_finetune_ckpt_path = save_finetune_ckpt_path + 'ckpt_' + str(
            get_rank()) + "/"
    else:
        raise Exception(
            "Device target error, only Ascend and Nvidia GPU are supported.")

    # if device == "Ascend":
    #     device_num_,rank_id = _get_rank_info()

    if args_opt.do_train.lower() == "true":
        gpt2_loss = GPT2Translation(config=gpt2_net_cfg,
                                    is_training=True,
                                    use_one_hot_embeddings=False)
        print("============== Start Loading Train Dataset ==============")
        train_dataset = create_translation_dataset(
            dataset_path=args_opt.train_data_file_path)
        do_train(train_dataset, gpt2_loss, load_pretrain_ckpt_path,
                 save_finetune_ckpt_path, epoch_num)

    if args_opt.do_eval.lower() == "true":
        print("============ Start Loading Evaluation Dataset ============")
        eval_dataset = create_translation_dataset(
            dataset_path="/home/tju/gpt2/" + translate_direction +
            "-test-mindrecord",
            rank_id=device_id,
            device_num=args_opt.device_num)
        do_eval(eval_dataset, GPT2TranslationModel, metric,
                load_finetune_ckpt_path, translate_direction)
Example No. 26
from mindspore import Tensor
from mindspore.ops import operations as P
import mindspore.nn as nn
import numpy as np
import mindspore.context as context
from mindspore.common.initializer import initializer
from mindspore.common.parameter import Parameter
from mindspore.communication.management import init, NCCL_WORLD_COMM_GROUP, get_rank, get_group_size
context.set_context(mode=context.GRAPH_MODE, device_target='GPU')

init('nccl')
rank = get_rank()
size = get_group_size()
x = np.ones([size, 1, 3, 3]).astype(np.float32) * 0.01 * (rank + 1)


class Net(nn.Cell):
    def __init__(self):
        super(Net, self).__init__()
        self.x = Parameter(initializer(Tensor(x), x.shape), name='x')

        self.op0 = "sum"
        self.op1 = "max"
        self.op2 = "min"
        self.op3 = "prod"
Example No. 27
def train_process_thor(q, device_id, epoch_size, device_num, enable_hccl):
    os.system("mkdir " + str(device_id))
    os.chdir(str(device_id))
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        save_graphs=False)
    context.set_context(device_id=device_id)
    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH_2
    os.environ['RANK_ID'] = str(device_id - 4)
    os.environ['RANK_SIZE'] = str(device_num)
    if enable_hccl:
        context.set_auto_parallel_context(
            device_num=device_num,
            parallel_mode=ParallelMode.DATA_PARALLEL,
            mirror_mean=True,
            parameter_broadcast=True)
        auto_parallel_context().set_all_reduce_fusion_split_indices(
            [107], "hccl_world_groupsum1")
        auto_parallel_context().set_all_reduce_fusion_split_indices(
            [27], "hccl_world_groupsum2")
        auto_parallel_context().set_all_reduce_fusion_split_indices(
            [27], "hccl_world_groupsum3")
        auto_parallel_context().set_all_reduce_fusion_split_indices(
            [27], "hccl_world_groupsum4")
        auto_parallel_context().set_all_reduce_fusion_split_indices(
            [27], "hccl_world_groupsum5")
        init()

    # network
    damping = get_model_damping(0, 0.03, 0.87, 50, 5004)
    net = resnet50_thor(class_num=thor_config.class_num,
                        damping=damping,
                        loss_scale=thor_config.loss_scale,
                        frequency=thor_config.frequency)

    # evaluation network
    dist_eval_network = ClassifyCorrectCell(net)

    if not thor_config.label_smooth:
        thor_config.label_smooth_factor = 0.0

    # loss
    loss = nn.SoftmaxCrossEntropyWithLogits(
        sparse=True,
        reduction="mean",
        smooth_factor=thor_config.label_smooth_factor,
        num_classes=thor_config.class_num)

    # train dataset
    dataset = create_dataset(dataset_path=dataset_path,
                             do_train=True,
                             repeat_num=epoch_size,
                             batch_size=thor_config.batch_size)

    step_size = dataset.get_dataset_size()
    eval_interval = thor_config.eval_interval

    # evaluation dataset
    eval_dataset = create_dataset(dataset_path=eval_path,
                                  do_train=False,
                                  repeat_num=epoch_size,
                                  batch_size=thor_config.eval_batch_size)

    # loss scale
    loss_scale = FixedLossScaleManager(thor_config.loss_scale,
                                       drop_overflow_update=False)

    # learning rate
    lr = Tensor(get_model_lr(0, 0.045, 6, 70, 5004))

    # optimizer
    opt = THOR(filter(lambda x: x.requires_grad,
                      net.get_parameters()), lr, thor_config.momentum,
               filter(lambda x: 'matrix_A' in x.name, net.get_parameters()),
               filter(lambda x: 'matrix_G' in x.name, net.get_parameters()),
               filter(lambda x: 'A_inv_max' in x.name, net.get_parameters()),
               filter(lambda x: 'G_inv_max' in x.name, net.get_parameters()),
               thor_config.weight_decay, thor_config.loss_scale)

    # model
    model = THOR_Model(net,
                       loss_fn=loss,
                       optimizer=opt,
                       loss_scale_manager=loss_scale,
                       amp_level="O2",
                       keep_batchnorm_fp32=False,
                       metrics={
                           'acc':
                           DistAccuracy(batch_size=thor_config.eval_batch_size,
                                        device_num=device_num)
                       },
                       eval_network=dist_eval_network,
                       frequency=thor_config.frequency)

    # model init
    print("init_start", device_id)
    model.init(dataset, eval_dataset)
    print("init_stop", device_id)

    # callbacks
    loss_cb = LossGet(1, step_size)

    # train and eval
    acc = 0.0
    time_cost = 0.0
    print("run_start", device_id)
    for epoch_idx in range(0, int(epoch_size / eval_interval)):
        model.train(eval_interval, dataset, callbacks=loss_cb)
        eval_start = time.time()
        output = model.eval(eval_dataset)
        eval_cost = (time.time() - eval_start) * 1000
        acc = float(output["acc"])
        time_cost = loss_cb.get_per_step_time()
        loss = loss_cb.get_loss()
        print(
            "the {} epoch's resnet result:\n "
            "device{}, training loss {}, acc {}, "
            "training per step cost {:.2f} ms, eval cost {:.2f} ms, total_cost {:.2f} ms"
            .format(epoch_idx, device_id, loss, acc, time_cost, eval_cost,
                    time_cost * step_size + eval_cost))
    q.put({'acc': acc, 'cost': time_cost})
Example No. 28
def main():
    parser = argparse.ArgumentParser(description="SSD training")
    parser.add_argument("--only_create_dataset",
                        type=bool,
                        default=False,
                        help="If set it true, only create "
                        "Mindrecord, default is False.")
    parser.add_argument("--distribute",
                        type=bool,
                        default=False,
                        help="Run distribute, default is False.")
    parser.add_argument("--device_id",
                        type=int,
                        default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--device_num",
                        type=int,
                        default=1,
                        help="Use device nums, default is 1.")
    parser.add_argument("--lr",
                        type=float,
                        default=0.05,
                        help="Learning rate, default is 0.05.")
    parser.add_argument("--mode",
                        type=str,
                        default="sink",
                        help="Run sink mode or not, default is sink.")
    parser.add_argument("--dataset",
                        type=str,
                        default="coco",
                        help="Dataset, defalut is coco.")
    parser.add_argument("--epoch_size",
                        type=int,
                        default=250,
                        help="Epoch size, default is 250.")
    parser.add_argument("--batch_size",
                        type=int,
                        default=32,
                        help="Batch size, default is 32.")
    parser.add_argument("--pre_trained",
                        type=str,
                        default=None,
                        help="Pretrained Checkpoint file path.")
    parser.add_argument("--pre_trained_epoch_size",
                        type=int,
                        default=0,
                        help="Pretrained epoch size.")
    parser.add_argument("--save_checkpoint_epochs",
                        type=int,
                        default=10,
                        help="Save checkpoint epochs, default is 5.")
    parser.add_argument("--loss_scale",
                        type=int,
                        default=1024,
                        help="Loss scale, default is 1024.")
    args_opt = parser.parse_args()
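
    # Caveat: argparse's type=bool does not parse strings -- bool("False") is
    # True, so any non-empty value on the command line enables these flags.
    # A converter is the usual fix (hypothetical helper, not part of the
    # original script), e.g.:
    #     def str2bool(v):
    #         return str(v).lower() in ("yes", "true", "t", "1")
    # and then type=str2bool on the boolean arguments above.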

    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        device_id=args_opt.device_id)

    if args_opt.distribute:
        device_num = args_opt.device_num
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL,
            mirror_mean=True,
            device_num=device_num)
        init()
        rank = args_opt.device_id % device_num
    else:
        rank = 0
        device_num = 1

    print("Start create dataset!")

    # It will generate mindrecord files in config.mindrecord_dir,
    # with file names ssd.mindrecord0, 1, ... file_num.

    prefix = "ssd.mindrecord"
    mindrecord_dir = config.mindrecord_dir
    mindrecord_file = os.path.join(mindrecord_dir, prefix + "0")
    if not os.path.exists(mindrecord_file):
        if not os.path.isdir(mindrecord_dir):
            os.makedirs(mindrecord_dir)
        if args_opt.dataset == "coco":
            if os.path.isdir(config.coco_root):
                print("Create Mindrecord.")
                data_to_mindrecord_byte_image("coco", True, prefix)
                print("Create Mindrecord Done, at {}".format(mindrecord_dir))
            else:
                print("coco_root not exits.")
        else:
            if os.path.isdir(config.image_dir) and os.path.exists(
                    config.anno_path):
                print("Create Mindrecord.")
                data_to_mindrecord_byte_image("other", True, prefix)
                print("Create Mindrecord Done, at {}".format(mindrecord_dir))
            else:
                print("image_dir or anno_path not exits.")

    if not args_opt.only_create_dataset:
        loss_scale = float(args_opt.loss_scale)

        # When creating MindDataset, use the first mindrecord file, such as ssd.mindrecord0.
        dataset = create_ssd_dataset(mindrecord_file,
                                     repeat_num=1,
                                     batch_size=args_opt.batch_size,
                                     device_num=device_num,
                                     rank=rank)

        dataset_size = dataset.get_dataset_size()
        print("Create dataset done!")

        backbone = ssd_mobilenet_v2()
        ssd = SSD300(backbone=backbone, config=config)
        net = SSDWithLossCell(ssd, config)
        init_net_param(net)

        # checkpoint
        ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size *
                                       args_opt.save_checkpoint_epochs)
        ckpoint_cb = ModelCheckpoint(prefix="ssd",
                                     directory=None,
                                     config=ckpt_config)

        if args_opt.pre_trained:
            if args_opt.pre_trained_epoch_size <= 0:
                raise KeyError(
                    "pre_trained_epoch_size must be greater than 0.")
            param_dict = load_checkpoint(args_opt.pre_trained)
            load_param_into_net(net, param_dict)

        lr = Tensor(
            get_lr(global_step=config.global_step,
                   lr_init=config.lr_init,
                   lr_end=config.lr_end_rate * args_opt.lr,
                   lr_max=args_opt.lr,
                   warmup_epochs=config.warmup_epochs,
                   total_epochs=args_opt.epoch_size,
                   steps_per_epoch=dataset_size))
        opt = nn.Momentum(
            filter(lambda x: x.requires_grad, net.get_parameters()), lr,
            config.momentum, config.weight_decay, loss_scale)
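        # TrainingWrapper composes the loss cell, the optimizer and the
        # static loss scale into a single train-one-step cell.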
        net = TrainingWrapper(net, opt, loss_scale)

        callback = [
            TimeMonitor(data_size=dataset_size),
            LossMonitor(), ckpoint_cb
        ]

        model = Model(net)
        dataset_sink_mode = False
        if args_opt.mode == "sink":
            print("In sink mode, one epoch return a loss.")
            dataset_sink_mode = True
        print(
            "Start train SSD, the first epoch will be slower because of the graph compilation."
        )
        model.train(args_opt.epoch_size,
                    dataset,
                    callbacks=callback,
                    dataset_sink_mode=dataset_sink_mode)
Example no. 29
0
from mindspore.common.initializer import One
from mindspore.train.model import Model, ParallelMode
from mindspore import context
import os
from mindspore.communication.management import init
import mindspore.ops.functional as F
import mindspore.nn as nn
from mindspore.nn.loss.loss import _Loss
from mindspore.train.callback import Callback
from mindspore.parallel import set_algo_parameters
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
context.set_context(enable_hccl=True)
context.set_context(enable_task_sink=True,
                    device_id=int(os.getenv('DEVICE_ID')))
context.set_context(enable_ir_fusion=True)
context.set_context(enable_loop_sink=False)
init()
context.set_auto_parallel_context(mirror_mean=True,
                                  parallel_mode=ParallelMode.AUTO_PARALLEL)
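# Note: enable_hccl, enable_task_sink, enable_ir_fusion and enable_loop_sink
# are legacy context flags from early MindSpore releases; later versions
# removed them and configure Ascend execution automatically after init().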


def weight_variable(shape, factor=0.1):
    return One()


def _conv3x3(in_channels, out_channels, stride=1, padding=0, pad_mode='same'):
    init_value = weight_variable((out_channels, in_channels, 3, 3))
    return nn.Conv2d(in_channels,
                     out_channels,
                     kernel_size=3,
                     stride=stride,
                     padding=padding,
                     pad_mode=pad_mode,
                     weight_init=init_value)
Example no. 30
0
def train_net(args_opt,
              cross_valid_ind=1,
              epochs=400,
              batch_size=16,
              lr=0.0001,
              cfg=None):
    rank = 0
    group_size = 1
    # the checkpoint directory below uses the device id; default to 0 when unset
    device_id = int(os.getenv('DEVICE_ID', '0'))
    data_dir = args_opt.data_url
    run_distribute = args_opt.run_distribute
    if run_distribute:
        init()
        group_size = get_group_size()
        rank = get_rank()
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                          device_num=group_size,
                                          gradients_mean=False)
    need_slice = False
    if cfg['model'] == 'unet_medical':
        net = UNetMedical(n_channels=cfg['num_channels'],
                          n_classes=cfg['num_classes'])
    elif cfg['model'] == 'unet_nested':
        net = NestedUNet(in_channel=cfg['num_channels'],
                         n_class=cfg['num_classes'],
                         use_deconv=cfg['use_deconv'],
                         use_bn=cfg['use_bn'],
                         use_ds=cfg['use_ds'])
        need_slice = cfg['use_ds']
    elif cfg['model'] == 'unet_simple':
        net = UNet(in_channel=cfg['num_channels'], n_class=cfg['num_classes'])
    else:
        raise ValueError("Unsupported model: {}".format(cfg['model']))

    if cfg['resume']:
        param_dict = load_checkpoint(cfg['resume_ckpt'])
        if cfg['transfer_training']:
            filter_checkpoint_parameter_by_list(param_dict,
                                                cfg['filter_weight'])
        load_param_into_net(net, param_dict)

    if 'use_ds' in cfg and cfg['use_ds']:
        criterion = MultiCrossEntropyWithLogits()
    else:
        criterion = CrossEntropyWithLogits()
    if 'dataset' in cfg and cfg['dataset'] == "Cell_nuclei":
        repeat = 10
        dataset_sink_mode = True
        per_print_times = 0
        train_dataset = create_cell_nuclei_dataset(data_dir,
                                                   cfg['img_size'],
                                                   repeat,
                                                   batch_size,
                                                   is_train=True,
                                                   augment=True,
                                                   split=0.8,
                                                   rank=rank,
                                                   group_size=group_size)
        valid_dataset = create_cell_nuclei_dataset(
            data_dir,
            cfg['img_size'],
            1,
            1,
            is_train=False,
            eval_resize=cfg["eval_resize"],
            split=0.8,
            python_multiprocessing=False)
    else:
        repeat = epochs
        dataset_sink_mode = False
        per_print_times = 1
        train_dataset, valid_dataset = create_dataset(
            data_dir, repeat, batch_size, True, cross_valid_ind,
            run_distribute, cfg["crop"], cfg['img_size'])
    train_data_size = train_dataset.get_dataset_size()
    print("dataset length is:", train_data_size)
    ckpt_config = CheckpointConfig(
        save_checkpoint_steps=train_data_size,
        keep_checkpoint_max=cfg['keep_checkpoint_max'])
    ckpoint_cb = ModelCheckpoint(prefix='ckpt_{}_adam'.format(cfg['model']),
                                 directory='./ckpt_{}/'.format(device_id),
                                 config=ckpt_config)

    optimizer = nn.Adam(params=net.trainable_params(),
                        learning_rate=lr,
                        weight_decay=cfg['weight_decay'],
                        loss_scale=cfg['loss_scale'])

    loss_scale_manager = mindspore.train.loss_scale_manager.FixedLossScaleManager(
        cfg['FixedLossScaleManager'], False)

    model = Model(net,
                  loss_fn=criterion,
                  loss_scale_manager=loss_scale_manager,
                  optimizer=optimizer,
                  amp_level="O3")

    print("============== Starting Training ==============")
    callbacks = [
        StepLossTimeMonitor(batch_size=batch_size,
                            per_print_times=per_print_times), ckpoint_cb
    ]
    if args_opt.run_eval:
        eval_model = Model(UnetEval(net, need_slice=need_slice),
                           loss_fn=TempLoss(),
                           metrics={"dice_coeff": dice_coeff(cfg_unet, False)})
        eval_param_dict = {
            "model": eval_model,
            "dataset": valid_dataset,
            "metrics_name": args_opt.eval_metrics
        }
        eval_cb = EvalCallBack(apply_eval,
                               eval_param_dict,
                               interval=args_opt.eval_interval,
                               eval_start_epoch=args_opt.eval_start_epoch,
                               save_best_ckpt=True,
                               ckpt_directory='./ckpt_{}/'.format(device_id),
                               besk_ckpt_name="best.ckpt",
                               metrics_name=args_opt.eval_metrics)
        callbacks.append(eval_cb)
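    # Epochs are folded into dataset repeats above: in the Cell_nuclei branch
    # repeat=10, so model.train makes epochs/repeat passes over the
    # pre-repeated dataset; otherwise repeat=epochs and it makes a single pass.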
    model.train(int(epochs / repeat),
                train_dataset,
                callbacks=callbacks,
                dataset_sink_mode=dataset_sink_mode)
    print("============== End Training ==============")