Example #1
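# Context: this __init__ belongs to a training-wrapper cell; the class header
# below is reconstructed from the super() call (nn.Cell base assumed).
class TrainStepWrap(nn.Cell):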
    def __init__(self, network, total_steps=1, sens=16384.0):
        super(TrainStepWrap, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_train()
        self.network.add_flags(defer_inline=True)
        self.weights = ParameterTuple(network.trainable_params())

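        # warmup-then-decay LR schedule; Adam descales gradients by
        # `sens` (loss_scale) for mixed-precision training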
        lr = dynamic_lr(0.01, total_steps, 5000)
        self.optimizer = nn.Adam(self.weights,
                                 learning_rate=lr,
                                 beta1=0.9,
                                 beta2=0.999,
                                 eps=1e-8,
                                 loss_scale=sens)

        self.hyper_map = C.HyperMap()
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.sens = sens

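        # under data/hybrid parallelism, gradients must be all-reduced
        # across devices before the optimizer step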
        self.reducer_flag = False
        self.grad_reducer = None
        parallel_mode = _get_parallel_mode()
        if parallel_mode in (ParallelMode.DATA_PARALLEL,
                             ParallelMode.HYBRID_PARALLEL):
            self.reducer_flag = True
        if self.reducer_flag:
            mean = _get_gradients_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(
                self.optimizer.parameters, mean, degree)
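
The snippet ends before the wrapper's construct; the usual matching step (a minimal sketch, assuming `from mindspore.ops import operations as P, functional as F`) computes the loss, seeds the backward pass with the loss-scale value, optionally all-reduces the gradients, and applies the optimizer:

    def construct(self, *inputs):
        # sketch only; not part of the original snippet
        weights = self.weights
        loss = self.network(*inputs)
        # fill a sensitivity tensor with the loss-scale value
        sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens)
        grads = self.grad(self.network, weights)(*inputs, sens)
        if self.reducer_flag:
            # all-reduce gradients across devices in data/hybrid parallel mode
            grads = self.grad_reducer(grads)
        return F.depend(loss, self.optimizer(grads))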
Example #2
def train():
    rank_id = 0
    if args.run_distribute:
        context.set_auto_parallel_context(
            device_num=args.device_num,
            parallel_mode=ParallelMode.DATA_PARALLEL,
            gradients_mean=True)
        init()
        rank_id = get_rank()

    # dataset/network/criterion/optim
    ds = train_dataset_creator(args.device_id, args.device_num)
    step_size = ds.get_dataset_size()
    print('Create dataset done!')

    config.INFERENCE = False
    net = ETSNet(config)
    net = net.set_train()
    param_dict = load_checkpoint(args.pre_trained)
    load_param_into_net(net, param_dict)
    print('Load Pretrained parameters done!')

    criterion = DiceLoss(batch_size=config.TRAIN_BATCH_SIZE)

    lrs = dynamic_lr(config.BASE_LR, config.TRAIN_TOTAL_ITER,
                     config.WARMUP_STEP, config.WARMUP_RATIO)
    opt = nn.SGD(params=net.trainable_params(),
                 learning_rate=lrs,
                 momentum=0.99,
                 weight_decay=5e-4)

    # wrap the network with the loss cell and the one-step training cell
    net = WithLossCell(net, criterion)
    if args.run_distribute:
        net = TrainOneStepCell(net,
                               opt,
                               reduce_flag=True,
                               mean=True,
                               degree=args.device_num)
    else:
        net = TrainOneStepCell(net, opt)

    time_cb = TimeMonitor(data_size=step_size)
    loss_cb = LossCallBack(per_print_times=10)
    # checkpoint config: save every 1875 steps, keep at most two checkpoints
    ckpoint_cf = CheckpointConfig(save_checkpoint_steps=1875,
                                  keep_checkpoint_max=2)
    ckpoint_cb = ModelCheckpoint(prefix="ETSNet",
                                 config=ckpoint_cf,
                                 directory="./ckpt_{}".format(rank_id))

    model = Model(net)
    model.train(config.TRAIN_REPEAT_NUM,
                ds,
                dataset_sink_mode=True,
                callbacks=[time_cb, loss_cb, ckpoint_cb])
Example #3
def train_net(data_dir, seg_dir, run_distribute, config=None):

    network = UNet3d(config=config)

    lr = Tensor(dynamic_lr(config, 877), mstype.float32)
    print(lr)
    # loss = SoftmaxCrossEntropyWithLogits()
    loss = nn.DiceLoss()
    network.set_train()
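    # single synthetic volume (NCDHW) to smoke-test the forward pass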
    inputs = mindspore.Tensor(np.ones((1, 1, 144, 144, 144), np.float32))
    output = network(inputs)
Example #4
def train_net__(data_dir, seg_dir, run_distribute, config=None):

    train_data_size = 5
    print("train dataset length is:", train_data_size)

    network = UNet3d(config=config)

    loss = SoftmaxCrossEntropyWithLogits()
    # loss = nn.DiceLoss()
    lr = Tensor(dynamic_lr(config, train_data_size), mstype.float32)
    optimizer = nn.Adam(params=network.trainable_params(), learning_rate=lr)
    scale_manager = FixedLossScaleManager(config.loss_scale,
                                          drop_overflow_update=False)
    network.set_train()
    network.to_float(mstype.float16)
    _do_keep_batchnorm_fp32(network)
    network = _add_loss_network(network, loss, mstype.float16)
    loss_scale = scale_manager.get_loss_scale()
    update_cell = scale_manager.get_update_cell()
    if update_cell is not None:
        model = nn.TrainOneStepWithLossScaleCell(
            network, optimizer, scale_sense=update_cell).set_train()
    else:
        model = nn.TrainOneStepCell(network, optimizer, loss_scale).set_train()

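    # synthetic NCDHW inputs/labels for a smoke-test training loop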
    inputs = mindspore.Tensor(np.random.rand(1, 1, 224, 224, 96),
                              mstype.float32)
    labels = mindspore.Tensor(np.random.rand(1, 4, 224, 224, 96),
                              mstype.float32)

    step_per_epoch = train_data_size
    print("============== Starting Training ==============")
    for epoch_id in range(config.epoch_size):
        time_epoch = 0.0
        for step_id in range(step_per_epoch):
            time_start = time.time()
            loss = model(inputs, labels)
            loss = loss.asnumpy()
            time_end = time.time()
            time_step = time_end - time_start
            time_epoch = time_epoch + time_step
            print(
                'Epoch: [%3d/%3d], step: [%5d/%5d], loss: [%6.4f], time: [%.4f]'
                % (epoch_id, config.epoch_size, step_id, step_per_epoch, loss,
                   time_step))
        print('Epoch time: %10.4f, per step time: %7.4f' %
              (time_epoch, time_epoch / step_per_epoch))

    print("============== End Training ==============")
Example #5
def train_net(data_dir, seg_dir, run_distribute, config=None):
    if run_distribute:
        init()
        rank_id = get_rank()
        rank_size = get_group_size()
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                          device_num=rank_size,
                                          gradients_mean=True)
    else:
        rank_id = 0
        rank_size = 1
    # train_dataset = create_dataset(data_path=data_dir, seg_path=seg_dir, config=config, \
    #                                 rank_size=rank_size, rank_id=rank_id, is_training=True)
    train_dataset = create_dataset_diy()

    train_data_size = train_dataset.get_dataset_size()
    print("train dataset length is:", train_data_size)

    network = UNet3d(config=config)

    loss = SoftmaxCrossEntropyWithLogits()
    # loss = nn.DiceLoss()
    lr = Tensor(dynamic_lr(config, train_data_size), mstype.float32)
    optimizer = nn.Adam(params=network.trainable_params(), learning_rate=lr)
    scale_manager = FixedLossScaleManager(config.loss_scale,
                                          drop_overflow_update=False)
    network.set_train()

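    # amp_level='O3': run the whole network in float16 with the fixed loss scale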
    model = Model(network,
                  loss_fn=loss,
                  optimizer=optimizer,
                  loss_scale_manager=scale_manager,
                  amp_level='O3')

    time_cb = TimeMonitor(data_size=train_data_size)
    loss_cb = LossMonitor(per_print_times=2)
    ckpt_config = CheckpointConfig(
        save_checkpoint_steps=train_data_size,
        keep_checkpoint_max=config.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix='{}'.format(config.model),
                                 directory='./ckpt_{}/'.format(rank_id),
                                 config=ckpt_config)
    callbacks_list = [loss_cb, time_cb, ckpoint_cb]
    print("============== Starting Training ==============")
    model.train(config.epoch_size,
                train_dataset,
                callbacks=callbacks_list,
                dataset_sink_mode=False)
    print("============== End Training ==============")
Example #6
    load_path = args_opt.pre_trained
    if args_opt.task_type == "Pretraining":
        print("load backbone vgg16 ckpt {}".format(args_opt.pre_trained))
        param_dict = load_checkpoint(load_path)
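        # keep only the vgg16 backbone weights from the checkpoint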
        for item in list(param_dict.keys()):
            if not item.startswith('vgg16_feature_extractor'):
                param_dict.pop(item)
        load_param_into_net(net, param_dict)
    else:
        if load_path != "":
            print("load pretrain ckpt {}".format(args_opt.pre_trained))
            param_dict = load_checkpoint(load_path)
            load_param_into_net(net, param_dict)
    loss = LossNet()
    lr = Tensor(dynamic_lr(training_cfg, dataset_size), mstype.float32)
    opt = Momentum(params=net.trainable_params(),
                   learning_rate=lr,
                   momentum=config.momentum,
                   weight_decay=config.weight_decay,
                   loss_scale=config.loss_scale)
    net_with_loss = WithLossCell(net, loss)
    if args_opt.run_distribute:
        net = TrainOneStepCell(net_with_loss,
                               opt,
                               sens=config.loss_scale,
                               reduce_flag=True,
                               mean=True,
                               degree=device_num)
    else:
        net = TrainOneStepCell(net_with_loss, opt, sens=config.loss_scale)

    time_cb = TimeMonitor(data_size=dataset_size)
    loss_cb = LossCallBack(rank_id=rank)
Example #7
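    # (snippet begins mid-call: the tail of a create-dataset invocation follows)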
                                      batch_size=config.batch_size,
                                      device_num=device_num,
                                      rank_id=rank)

    dataset_size = dataset.get_dataset_size()
    print("Create dataset done! dataset_size = ", dataset_size)
    net = Deeptext_VGG16(config=config)
    net = net.set_train()

    load_path = args_opt.pre_trained
    if load_path != "":
        param_dict = load_checkpoint(load_path)
        load_param_into_net(net, param_dict)

    loss = LossNet()
    lr = Tensor(dynamic_lr(config, rank_size=device_num), mstype.float32)

    opt = Momentum(params=net.trainable_params(),
                   learning_rate=lr,
                   momentum=config.momentum,
                   weight_decay=config.weight_decay,
                   loss_scale=config.loss_scale)
    net_with_loss = WithLossCell(net, loss)
    if args_opt.run_distribute:
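        # this repo's custom TrainOneStepCell evidently also takes the full
        # network as a second argument (cf. Example #8), unlike the
        # two-argument variant in Examples #2 and #6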
        net = TrainOneStepCell(net_with_loss,
                               net,
                               opt,
                               sens=config.loss_scale,
                               reduce_flag=True,
                               mean=True,
                               degree=device_num)
Example #8
        net = net.set_train()

        load_path = args_opt.pre_trained
        if load_path != "":
            param_dict = load_checkpoint(load_path)
            if config.pretrain_epoch_size == 0:
                for item in list(param_dict.keys()):
                    if not (item.startswith('backbone')
                            or item.startswith('rcnn_mask')):
                        param_dict.pop(item)
            load_param_into_net(net, param_dict)

        loss = LossNet()
        lr = Tensor(
            dynamic_lr(config,
                       rank_size=device_num,
                       start_steps=config.pretrain_epoch_size * dataset_size),
            mstype.float32)
        opt = Momentum(params=net.trainable_params(),
                       learning_rate=lr,
                       momentum=config.momentum,
                       weight_decay=config.weight_decay,
                       loss_scale=config.loss_scale)

        net_with_loss = WithLossCell(net, loss)
        if args_opt.run_distribute:
            net = TrainOneStepCell(net_with_loss,
                                   net,
                                   opt,
                                   sens=config.loss_scale,
                                   reduce_flag=True,
                                   mean=True,
                                   degree=device_num)  # trailing args assumed (cf. Example #7); snippet was truncated here
Example #9
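        # (snippet begins mid-else: the non-distributed branch sets device_num = 1)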
        device_num = 1
    mindrecord_file = args.dataset_path
    if not os.path.exists(mindrecord_file):
        print("dataset file {} not exists, please check!".format(
            mindrecord_file))
        raise ValueError(mindrecord_file)
    dataset = create_gru_dataset(epoch_count=config.num_epochs,
                                 batch_size=config.batch_size,
                                 dataset_path=mindrecord_file,
                                 rank_size=device_num,
                                 rank_id=rank)
    dataset_size = dataset.get_dataset_size()
    print("dataset size is {}".format(dataset_size))
    network = Seq2Seq(config)
    network = GRUWithLossCell(network)
    lr = dynamic_lr(config, dataset_size)
    opt = Adam(network.trainable_params(), learning_rate=lr)
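    # dynamic loss scaling: grow the scale after `scale_window` overflow-free
    # steps, shrink it by `scale_factor` on overflow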
    scale_manager = DynamicLossScaleManager(
        init_loss_scale=config.init_loss_scale_value,
        scale_factor=config.scale_factor,
        scale_window=config.scale_window)
    update_cell = scale_manager.get_update_cell()
    netwithgrads = GRUTrainOneStepWithLossScaleCell(network, opt, update_cell)

    time_cb = TimeMonitor(data_size=dataset_size)
    loss_cb = LossCallBack(rank_id=rank)
    cb = [time_cb, loss_cb]
    # Save checkpoint
    if config.save_checkpoint:
        ckpt_config = CheckpointConfig(
            save_checkpoint_steps=config.ckpt_epoch * dataset_size,
            keep_checkpoint_max=config.keep_checkpoint_max)  # closing kwarg assumed; snippet was truncated here
Example #10
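        # (snippet begins mid-loop: renaming checkpoint keys via an old-to-new mapping)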
                if k in oldkey:
                    newkey = oldkey.replace(k, v)
                    param_dict[newkey] = param_dict.pop(oldkey)
                    break

        for item in list(param_dict.keys()):
            if not item.startswith('backbone'):
                param_dict.pop(item)

        for key, value in param_dict.items():
            tensor = value.asnumpy().astype(np.float32)
            param_dict[key] = Parameter(tensor, key)
        load_param_into_net(net, param_dict)

    loss = LossNet()
    lr = Tensor(dynamic_lr(config, dataset_size), mstype.float32)

    opt = SGD(params=net.trainable_params(),
              learning_rate=lr,
              momentum=config.momentum,
              weight_decay=config.weight_decay,
              loss_scale=config.loss_scale)
    net_with_loss = WithLossCell(net, loss)
    if args_opt.run_distribute:
        net = TrainOneStepCell(net_with_loss,
                               opt,
                               sens=config.loss_scale,
                               reduce_flag=True,
                               mean=True,
                               degree=device_num)
    else:
        # non-distributed branch assumed from the pattern in Example #6;
        # the original snippet was truncated after `else:`
        net = TrainOneStepCell(net_with_loss, opt, sens=config.loss_scale)