Example #1
    train_data_path = "./datasets/MNIST_Data/train"
    eval_data_path = "./datasets/MNIST_Data/test"
    model_path = "./models/ckpt/custom_debugging_info/"

    net_loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    repeat_size = 1
    network = LeNet5()

    metrics = {
        'accuracy': nn.Accuracy(),
        'loss': nn.Loss(),
        'precision': nn.Precision(),
        'recall': nn.Recall(),
        'f1_score': nn.F1()
        }
    net_opt = nn.Momentum(network.trainable_params(), lr, momentum)
    config_ck = CheckpointConfig(save_checkpoint_steps=1875, keep_checkpoint_max=10)
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", directory=model_path, config=config_ck)

    model = Model(network, net_loss, net_opt, metrics=metrics)

    print("============== Starting Training ==============")
    ds_train = create_dataset(train_data_path, repeat_size=repeat_size)
    stop_cb = StopAtTime(run_time=0.6)
    model.train(epoch_size, ds_train, callbacks=[ckpoint_cb, LossMonitor(375), stop_cb], dataset_sink_mode=False)

    print("============== Starting Testing ==============")
    ds_eval = create_dataset(eval_data_path, repeat_size=repeat_size)
    acc = model.eval(ds_eval, dataset_sink_mode=False)
    print("============== Accuracy:{} ==============".format(acc))
Example #2
"""
import mindspore.nn as nn
from mindspore import context, Model
from mindspore.train.callback import LossMonitor
from mindspore.nn.metrics import Accuracy
from src.lenet import LeNet5
from src.datasets import create_dataset


if __name__ == "__main__":

    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")

    ds_train = create_dataset("./datasets/MNIST_Data/train", 32)
    ds_eval = create_dataset("./datasets/MNIST_Data/test", 32)
    # Initialize network
    network = LeNet5(10)

    # Define Loss and Optimizer
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), learning_rate=0.01, momentum=0.9)
    # amp_level=O2 on GPU, amp_level=O3 on Ascend, O0 means no mixed precision
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}, amp_level="O2")

    # Run training
    model.train(epoch=1, callbacks=[LossMonitor()], train_dataset=ds_train)

    # Run evaluation
    acc = model.eval(ds_eval, dataset_sink_mode=False)
    print("====Accuracy====:", acc)
Example #3
                   warmup_epochs=cfg.warmup_epochs,
                   total_epochs=cfg.num_epochs,
                   steps_per_epoch=ds_train.get_dataset_size(),
                   lr_adjust_epoch=cfg.lr_adjust_epoch))
    else:
        lr = cfg.learning_rate

    opt = nn.Momentum(network.trainable_params(), lr, cfg.momentum)
    loss_cb = LossMonitor()

    model = Model(network, loss, opt, {'acc': Accuracy()})

    print("============== Starting Training ==============")
    config_ck = CheckpointConfig(
        save_checkpoint_steps=cfg.save_checkpoint_steps,
        keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix="lstm",
                                 directory=args.ckpt_path,
                                 config=config_ck)
    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
    if args.device_target == "CPU":
        model.train(cfg.num_epochs,
                    ds_train,
                    callbacks=[time_cb, ckpoint_cb, loss_cb],
                    dataset_sink_mode=False)
    else:
        model.train(cfg.num_epochs,
                    ds_train,
                    callbacks=[time_cb, ckpoint_cb, loss_cb])
    print("============== Training Success ==============")
Example #4
def test_train():
    '''
    finetune function
    '''
    target = args_opt.device_target
    if target == "Ascend":
        devid = int(os.getenv('DEVICE_ID'))
        context.set_context(mode=context.GRAPH_MODE,
                            device_target="Ascend",
                            device_id=devid)

    poetry, tokenizer, keep_words = create_tokenizer()
    print(len(keep_words))

    dataset = create_poetry_dataset(bert_net_cfg.batch_size, poetry, tokenizer)

    num_tokens = 3191
    poetrymodel = BertPoetryModel(bert_net_cfg,
                                  True,
                                  num_tokens,
                                  dropout_prob=0.1)
    netwithloss = BertPoetry(poetrymodel, bert_net_cfg, True, dropout_prob=0.1)
    callback = LossCallBack(poetrymodel)

    # optimizer
    steps_per_epoch = dataset.get_dataset_size()
    print("============ steps_per_epoch is {}".format(steps_per_epoch))
    lr_schedule = BertLearningRate(
        learning_rate=cfg.AdamWeightDecay.learning_rate,
        end_learning_rate=cfg.AdamWeightDecay.end_learning_rate,
        warmup_steps=1000,
        decay_steps=cfg.epoch_num * steps_per_epoch,
        power=cfg.AdamWeightDecay.power)
    optimizer = AdamWeightDecay(netwithloss.trainable_params(), lr_schedule)
    # checkpoint saving configuration
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch,
                                   keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix=cfg.ckpt_prefix,
                                 directory=cfg.ckpt_dir,
                                 config=ckpt_config)

    param_dict = load_checkpoint(cfg.pre_training_ckpt)
    new_dict = {}

    # load corresponding rows of embedding_lookup
    for key in param_dict:
        if "bert_embedding_lookup" not in key:
            new_dict[key] = param_dict[key]
        else:
            value = param_dict[key]
            np_value = value.data.asnumpy()
            np_value = np_value[keep_words]
            tensor_value = Tensor(np_value, mstype.float32)
            parameter_value = Parameter(tensor_value, name=key)
            new_dict[key] = parameter_value

    load_param_into_net(netwithloss, new_dict)
    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32,
                                             scale_factor=2,
                                             scale_window=1000)
    netwithgrads = BertPoetryCell(netwithloss,
                                  optimizer=optimizer,
                                  scale_update_cell=update_cell)

    model = Model(netwithgrads)
    model.train(cfg.epoch_num,
                dataset,
                callbacks=[callback, ckpoint_cb],
                dataset_sink_mode=True)
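
LossCallBack here is a project-specific callback (it receives poetrymodel), not the stock LossMonitor; a simplified sketch that just logs the loss each step, with the model argument kept only for illustration:

from mindspore.train.callback import Callback

class LossCallBack(Callback):
    """Hypothetical sketch: print the loss at the end of every step."""
    def __init__(self, model, per_print_times=1):
        super(LossCallBack, self).__init__()
        self.model = model
        self.per_print_times = per_print_times

    def step_end(self, run_context):
        cb_params = run_context.original_args()
        if cb_params.cur_step_num % self.per_print_times == 0:
            print("epoch: {}, step: {}, loss: {}".format(
                cb_params.cur_epoch_num, cb_params.cur_step_num, cb_params.net_outputs))
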
Example #5
    args = parser.parse_args()

    context.set_context(mode=context.GRAPH_MODE,
                        save_graphs=False,
                        device_target='Ascend')

    ds_train = create_dataset(args.dataset_path, cfg.batch_size)

    network = Seq2Seq(cfg)
    network = WithLossCell(network, cfg)
    optimizer = nn.Adam(network.trainable_params(),
                        learning_rate=cfg.learning_rate,
                        beta1=0.9,
                        beta2=0.98)
    model = Model(network, optimizer=optimizer)

    loss_cb = LossMonitor()
    config_ck = CheckpointConfig(
        save_checkpoint_steps=cfg.save_checkpoint_steps,
        keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix="gru",
                                 directory=args.ckpt_save_path,
                                 config=config_ck)
    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
    callbacks = [time_cb, ckpoint_cb, loss_cb]

    model.train(cfg.num_epochs,
                ds_train,
                callbacks=callbacks,
                dataset_sink_mode=False)
Example #6
def train_and_eval(config):
    """
    test_train_eval
    """
    set_seed(1000)
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    parameter_server = bool(config.parameter_server)
    if cache_enable:
        config.full_batch = True
    print("epochs is {}".format(epochs))
    if config.full_batch:
        context.set_auto_parallel_context(full_batch=True)
        ds.config.set_seed(1)
        ds_train = create_dataset(data_path,
                                  train_mode=True,
                                  epochs=1,
                                  batch_size=batch_size * get_group_size(),
                                  data_type=dataset_type)
        ds_eval = create_dataset(data_path,
                                 train_mode=False,
                                 epochs=1,
                                 batch_size=batch_size * get_group_size(),
                                 data_type=dataset_type)
    else:
        ds_train = create_dataset(data_path,
                                  train_mode=True,
                                  epochs=1,
                                  batch_size=batch_size,
                                  rank_id=get_rank(),
                                  rank_size=get_group_size(),
                                  data_type=dataset_type)
        ds_eval = create_dataset(data_path,
                                 train_mode=False,
                                 epochs=1,
                                 batch_size=batch_size,
                                 rank_id=get_rank(),
                                 rank_size=get_group_size(),
                                 data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net,
                  eval_network=eval_net,
                  metrics={"auc": auc_metric})

    if cache_enable:
        config.stra_ckpt = os.path.join(
            config.stra_ckpt + "-{}".format(get_rank()), "strategy.ckpt")
        context.set_auto_parallel_context(
            strategy_ckpt_save_file=config.stra_ckpt)

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

    callback = LossCallBack(config=config)
    if _is_role_worker():
        if cache_enable:
            ckptconfig = CheckpointConfig(
                save_checkpoint_steps=ds_train.get_dataset_size() * epochs,
                keep_checkpoint_max=1,
                integrated_save=False)
        else:
            ckptconfig = CheckpointConfig(
                save_checkpoint_steps=ds_train.get_dataset_size(),
                keep_checkpoint_max=5)
    else:
        ckptconfig = CheckpointConfig(save_checkpoint_steps=1,
                                      keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path + '/ckpt_' +
                                 str(get_rank()) + '/',
                                 config=ckptconfig)
    callback_list = [
        TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback
    ]
    if get_rank() == 0:
        callback_list.append(ckpoint_cb)
    model.train(epochs,
                ds_train,
                callbacks=callback_list,
                dataset_sink_mode=bool(parameter_server and cache_enable))
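
EvalCallBack and AUCMetric come from the Wide&Deep scripts and are not shown; a minimal sketch of an evaluation callback that recomputes AUC on ds_eval at the end of every epoch, assuming this constructor interface:

from mindspore.train.callback import Callback

class EvalCallBack(Callback):
    """Hypothetical sketch: run model.eval on the eval dataset after each epoch."""
    def __init__(self, model, eval_dataset, auc_metric, config):
        super(EvalCallBack, self).__init__()
        self.model = model
        self.eval_dataset = eval_dataset
        self.auc_metric = auc_metric
        self.config = config
        self.eval_values = []

    def epoch_end(self, run_context):
        cb_params = run_context.original_args()
        self.auc_metric.clear()
        out = self.model.eval(self.eval_dataset, dataset_sink_mode=False)
        self.eval_values.append(out)
        print("epoch {} eval result: {}".format(cb_params.cur_epoch_num, out))
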
Example #7
    charge = charge.reshape((-1, 129)).astype(np.float32)
    # define the model
    net = Mdnn()
    lr = 0.0001
    decay_rate = 0.8
    epoch_size = 1000
    batch_size = 500
    total_step = epoch_size * batch_size
    step_per_epoch = 100
    decay_epoch = epoch_size
    lr_rate = nn.exponential_decay_lr(lr, decay_rate, total_step,
                                      step_per_epoch, decay_epoch)
    net_loss = nn.loss.MSELoss(reduction='mean')
    net_opt = nn.Adam(net.trainable_params(), learning_rate=lr_rate)
    model = Model(net, net_loss, net_opt)
    ds_train = create_dataset(radial_angular, charge, batchsize=batch_size)
    model_params = net.trainable_params()
    net.set_train()
    init_weight(net)
    # config files
    path = './params/'
    config_ck = CheckpointConfig(save_checkpoint_steps=100,
                                 keep_checkpoint_max=10)
    ckpoint_cb = ModelCheckpoint(prefix="mdnn_best",
                                 directory=path,
                                 config=config_ck)
    steps_loss = {"step": [], "loss_value": []}
    step_loss_acc_info = StepLossAccInfo(model, ds_train, steps_loss)
    # train the model
    model.train(epoch_size, ds_train, callbacks=[ckpoint_cb, LossMonitor(100)])
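
StepLossAccInfo is a custom callback (see also Example #26); note that in this snippet it is created but never added to the callbacks list passed to model.train, so it only takes effect if appended there. A sketch of the three-argument variant used here, recording just the loss per global step:

from mindspore.train.callback import Callback

class StepLossAccInfo(Callback):
    """Hypothetical sketch: record the global step and loss value after every step."""
    def __init__(self, model, eval_dataset, steps_loss):
        super(StepLossAccInfo, self).__init__()
        self.model = model
        self.eval_dataset = eval_dataset
        self.steps_loss = steps_loss

    def step_end(self, run_context):
        cb_params = run_context.original_args()
        cur_step = (cb_params.cur_epoch_num - 1) * cb_params.batch_num + cb_params.cur_step_num
        self.steps_loss["step"].append(str(cur_step))
        self.steps_loss["loss_value"].append(str(cb_params.net_outputs))
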
Example #8
    dataset_size = train_dataset.get_dataset_size()
    time_cb = TimeMonitor(data_size=dataset_size)
    callback = [time_cb, LossCallBack()]
    if config.enable_save_ckpt:
        config_ck = CheckpointConfig(
            save_checkpoint_steps=config.save_checkpoint_steps,
            keep_checkpoint_max=config.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(prefix='checkpoint_deeplabv3',
                                     config=config_ck)
        callback.append(ckpoint_cb)
    net = deeplabv3_resnet50(
        config.seg_num_classes,
        [config.batch_size, 3, args_opt.crop_size, args_opt.crop_size],
        infer_scale_sizes=config.eval_scales,
        atrous_rates=config.atrous_rates,
        decoder_output_stride=config.decoder_output_stride,
        output_stride=config.output_stride,
        fine_tune_batch_norm=config.fine_tune_batch_norm,
        image_pyramid=config.image_pyramid)
    net.set_train()
    model_fine_tune(args_opt, net, 'layer')
    loss = OhemLoss(config.seg_num_classes, config.ignore_label)
    opt = Momentum(filter(
        lambda x: 'beta' not in x.name and 'gamma' not in x.name and 'depth'
        not in x.name and 'bias' not in x.name, net.trainable_params()),
                   learning_rate=config.learning_rate,
                   momentum=config.momentum,
                   weight_decay=config.weight_decay)
    model = Model(net, loss, opt)
    model.train(config.epoch_size, train_dataset, callback)
Example #9
def train_eval(config):
    """
    test evaluate
    """
    data_path = config.data_path + config.dataset_type
    ckpt_path = config.ckpt_path
    epochs = config.epochs
    batch_size = config.batch_size
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5

    ds_train = create_dataset(data_path,
                              train_mode=True,
                              epochs=1,
                              batch_size=batch_size,
                              data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    ds_eval = create_dataset(data_path,
                             train_mode=False,
                             epochs=1,
                             batch_size=batch_size,
                             data_type=dataset_type)
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()

    train_model = Model(train_net)
    train_callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(
        save_checkpoint_steps=ds_train.get_dataset_size(),
        keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path,
                                 config=ckptconfig)
    train_model.train(epochs,
                      ds_train,
                      callbacks=[
                          TimeMonitor(ds_train.get_dataset_size()),
                          train_callback, ckpoint_cb
                      ])

    # copy training outputs to OBS
    print('Copy checkpoint files from the ModelArts server to OBS.')
    mox.file.copy_parallel(src_url=config.ckpt_path, dst_url=config.train_url)

    param_dict = load_checkpoint(find_ckpt(ckpt_path))
    load_param_into_net(eval_net, param_dict)

    auc_metric = AUCMetric()
    eval_model = Model(train_net,
                       eval_network=eval_net,
                       metrics={"auc": auc_metric})
    eval_callback = EvalCallBack(eval_model, ds_eval, auc_metric, config)

    eval_model.eval(ds_eval, callbacks=eval_callback)
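
find_ckpt is not defined in the snippet; a plausible (hypothetical) helper that returns the newest checkpoint file in a directory:

import os

def find_ckpt(ckpt_dir):
    # hypothetical helper: pick the most recently written .ckpt file in ckpt_dir
    ckpts = [f for f in os.listdir(ckpt_dir) if f.endswith(".ckpt")]
    if not ckpts:
        raise FileNotFoundError("no .ckpt file found in {}".format(ckpt_dir))
    ckpts.sort(key=lambda name: os.path.getmtime(os.path.join(ckpt_dir, name)))
    return os.path.join(ckpt_dir, ckpts[-1])
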
Example #10
    self.fc3 = nn.Dense(84, 10, weight_init='TruncatedNormal', bias_init='TruncatedNormal')

  def construct(self, x):
    x = self.conv1(x)
    x = self.relu(x)
    x = self.maxpool(x)
    x = self.conv2(x)
    x = self.relu(x)
    x = self.maxpool(x)
    x = self.reshape(x, (32, 400))
    x = self.fc1(x)
    x = self.relu(x)
    x = self.fc2(x)
    x = self.relu(x)
    x = self.fc3(x)
    return x

if __name__ == '__main__':
  import numpy as np
  context.set_context(mode=context.GRAPH_MODE, save_graphs=True)
  dataset = create_dataset('/fzl/mnist/train')
  net = LeNet()
  data = Tensor(np.ones((32, 1, 32, 32)), mindspore.float32)
  y = net(data)
  net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction='mean')
  lr = 0.01
  momentum = 0.9
  opt = nn.Momentum(net.trainable_params(), lr, momentum)
  mod = Model(net, loss_fn=net_loss, optimizer=opt)
  mod.train(10, dataset, callbacks=[LossMonitor(),], dataset_sink_mode=False)
Example #11
def train_net(args_opt,
              cross_valid_ind=1,
              epochs=400,
              batch_size=16,
              lr=0.0001,
              cfg=None):
    rank = 0
    group_size = 1
    data_dir = args_opt.data_url
    run_distribute = args_opt.run_distribute
    if run_distribute:
        init()
        group_size = get_group_size()
        rank = get_rank()
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                          device_num=group_size,
                                          gradients_mean=False)
    need_slice = False
    if cfg['model'] == 'unet_medical':
        net = UNetMedical(n_channels=cfg['num_channels'],
                          n_classes=cfg['num_classes'])
    elif cfg['model'] == 'unet_nested':
        net = NestedUNet(in_channel=cfg['num_channels'],
                         n_class=cfg['num_classes'],
                         use_deconv=cfg['use_deconv'],
                         use_bn=cfg['use_bn'],
                         use_ds=cfg['use_ds'])
        need_slice = cfg['use_ds']
    elif cfg['model'] == 'unet_simple':
        net = UNet(in_channel=cfg['num_channels'], n_class=cfg['num_classes'])
    else:
        raise ValueError("Unsupported model: {}".format(cfg['model']))

    if cfg['resume']:
        param_dict = load_checkpoint(cfg['resume_ckpt'])
        if cfg['transfer_training']:
            filter_checkpoint_parameter_by_list(param_dict,
                                                cfg['filter_weight'])
        load_param_into_net(net, param_dict)

    if 'use_ds' in cfg and cfg['use_ds']:
        criterion = MultiCrossEntropyWithLogits()
    else:
        criterion = CrossEntropyWithLogits()
    if 'dataset' in cfg and cfg['dataset'] == "Cell_nuclei":
        repeat = cfg['repeat']
        dataset_sink_mode = True
        per_print_times = 0
        train_dataset = create_cell_nuclei_dataset(data_dir,
                                                   cfg['img_size'],
                                                   repeat,
                                                   batch_size,
                                                   is_train=True,
                                                   augment=True,
                                                   split=0.8,
                                                   rank=rank,
                                                   group_size=group_size)
        valid_dataset = create_cell_nuclei_dataset(
            data_dir,
            cfg['img_size'],
            1,
            1,
            is_train=False,
            eval_resize=cfg["eval_resize"],
            split=0.8,
            python_multiprocessing=False)
    else:
        repeat = cfg['repeat']
        dataset_sink_mode = False
        per_print_times = 1
        train_dataset, valid_dataset = create_dataset(
            data_dir, repeat, batch_size, True, cross_valid_ind,
            run_distribute, cfg["crop"], cfg['img_size'])
    train_data_size = train_dataset.get_dataset_size()
    print("dataset length is:", train_data_size)
    ckpt_config = CheckpointConfig(
        save_checkpoint_steps=train_data_size,
        keep_checkpoint_max=cfg['keep_checkpoint_max'])
    ckpoint_cb = ModelCheckpoint(prefix='ckpt_{}_adam'.format(cfg['model']),
                                 directory='./ckpt_{}/'.format(device_id),
                                 config=ckpt_config)

    optimizer = nn.Adam(params=net.trainable_params(),
                        learning_rate=lr,
                        weight_decay=cfg['weight_decay'],
                        loss_scale=cfg['loss_scale'])

    loss_scale_manager = mindspore.train.loss_scale_manager.FixedLossScaleManager(
        cfg['FixedLossScaleManager'], False)

    model = Model(net,
                  loss_fn=criterion,
                  loss_scale_manager=loss_scale_manager,
                  optimizer=optimizer,
                  amp_level="O3")

    print("============== Starting Training ==============")
    callbacks = [
        StepLossTimeMonitor(batch_size=batch_size,
                            per_print_times=per_print_times), ckpoint_cb
    ]
    if args_opt.run_eval:
        eval_model = Model(UnetEval(net, need_slice=need_slice),
                           loss_fn=TempLoss(),
                           metrics={"dice_coeff": dice_coeff(cfg_unet, False)})
        eval_param_dict = {
            "model": eval_model,
            "dataset": valid_dataset,
            "metrics_name": args_opt.eval_metrics
        }
        eval_cb = EvalCallBack(apply_eval,
                               eval_param_dict,
                               interval=args_opt.eval_interval,
                               eval_start_epoch=args_opt.eval_start_epoch,
                               save_best_ckpt=True,
                               ckpt_directory='./ckpt_{}/'.format(device_id),
                               besk_ckpt_name="best.ckpt",
                               metrics_name=args_opt.eval_metrics)
        callbacks.append(eval_cb)
    model.train(int(epochs / repeat),
                train_dataset,
                callbacks=callbacks,
                dataset_sink_mode=dataset_sink_mode)
    print("============== End Training ==============")
Example #12
def train_and_eval(config):
    """
    test_train_eval
    """
    np.random.seed(1000)
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    parameter_server = bool(config.parameter_server)
    print("epochs is {}".format(epochs))
    ds_train = create_dataset(data_path,
                              train_mode=True,
                              epochs=1,
                              batch_size=batch_size,
                              rank_id=get_rank(),
                              rank_size=get_group_size(),
                              data_type=dataset_type)
    ds_eval = create_dataset(data_path,
                             train_mode=False,
                             epochs=1,
                             batch_size=batch_size,
                             rank_id=get_rank(),
                             rank_size=get_group_size(),
                             data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net,
                  eval_network=eval_net,
                  metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(
        save_checkpoint_steps=ds_train.get_dataset_size(),
        keep_checkpoint_max=5)
    if config.device_target == "Ascend":
        ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                     directory=config.ckpt_path,
                                     config=ckptconfig)
    elif config.device_target == "GPU":
        ckpoint_cb = ModelCheckpoint(prefix='widedeep_train_' +
                                     str(get_rank()),
                                     directory=config.ckpt_path,
                                     config=ckptconfig)
    model.train(epochs,
                ds_train,
                callbacks=[
                    TimeMonitor(ds_train.get_dataset_size()), eval_callback,
                    callback, ckpoint_cb
                ],
                dataset_sink_mode=(not parameter_server))
Example #13
    input_data = input_data.batch(batch_size)
    input_data = input_data.repeat(repeat_size)
    return input_data


class LinearNet(nn.Cell):
    def __init__(self):
        super(LinearNet, self).__init__()
        self.fc = nn.Dense(1, 1, Normal(0.02), Normal(0.02))

    def construct(self, x):
        x = self.fc(x)
        return x


if __name__ == "__main__":

    data_number = 1600
    batch_number = 16
    repeat_number = 1
    lr = 0.005
    momentum = 0.9
    net = LinearNet()
    net_loss = nn.loss.MSELoss()
    opt = nn.Momentum(net.trainable_params(), lr, momentum)
    model = Model(net, net_loss, opt)
    ds_train = create_dataset(data_number, batch_size=batch_number, repeat_size=repeat_number)
    model.train(1, ds_train, callbacks=LossMonitor(), dataset_sink_mode=False)
    for param in net.trainable_params():
        print(param, param.asnumpy())
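
Only the tail of create_dataset is visible at the top of this example; one common implementation for this linear-regression demo generates noisy y = w*x + b samples and wraps them in a GeneratorDataset, for instance:

import numpy as np
import mindspore.dataset as ds

def get_data(num, w=2.0, b=3.0):
    # yield noisy samples of the line y = w*x + b
    for _ in range(num):
        x = np.random.uniform(-10.0, 10.0)
        noise = np.random.normal(0, 1)
        yield np.array([x], dtype=np.float32), np.array([x * w + b + noise], dtype=np.float32)

def create_dataset(num_data, batch_size=16, repeat_size=1):
    input_data = ds.GeneratorDataset(list(get_data(num_data)), column_names=["data", "label"])
    input_data = input_data.batch(batch_size)
    input_data = input_data.repeat(repeat_size)
    return input_data
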
Example #14
    dataset_size = train_dataset.get_dataset_size()
    time_cb = TimeMonitor(data_size=dataset_size)
    callback = [time_cb, LossCallBack()]
    if args_opt.enable_save_ckpt == "true":
        config_ck = CheckpointConfig(
            save_checkpoint_steps=args_opt.save_checkpoint_steps,
            keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(prefix='checkpoint_deeplabv3',
                                     config=config_ck)
        callback.append(ckpoint_cb)
    net = deeplabv3_resnet50(
        config.seg_num_classes,
        [args_opt.batch_size, 3, args_opt.crop_size, args_opt.crop_size],
        infer_scale_sizes=config.eval_scales,
        atrous_rates=config.atrous_rates,
        decoder_output_stride=config.decoder_output_stride,
        output_stride=config.output_stride,
        fine_tune_batch_norm=config.fine_tune_batch_norm,
        image_pyramid=config.image_pyramid)
    net.set_train()
    model_fine_tune(args_opt, net, 'layer')
    loss = OhemLoss(config.seg_num_classes, config.ignore_label)
    opt = Momentum(filter(
        lambda x: 'beta' not in x.name and 'gamma' not in x.name and 'depth'
        not in x.name and 'bias' not in x.name, net.trainable_params()),
                   learning_rate=config.learning_rate,
                   momentum=config.momentum,
                   weight_decay=config.weight_decay)
    model = Model(net, loss, opt)
    model.train(args_opt.epoch_size, train_dataset, callback)
Example #15
def test_train():
    """train entry method"""
    if args.is_distributed:
        if args.device_target == "Ascend":
            init()
            context.set_context(device_id=args.device_id)
        elif args.device_target == "GPU":
            init()

        args.rank = get_rank()
        args.group_size = get_group_size()
        device_num = args.group_size
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            device_num=device_num,
            parallel_mode=ParallelMode.DATA_PARALLEL,
            parameter_broadcast=True,
            gradients_mean=True)
    else:
        context.set_context(device_id=args.device_id)
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args.device_target)

    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    layers = cfg.layers
    num_factors = cfg.num_factors
    epochs = args.train_epochs

    ds_train, num_train_users, num_train_items = create_dataset(
        test_train=True,
        data_dir=args.data_path,
        dataset=args.dataset,
        train_epochs=1,
        batch_size=args.batch_size,
        num_neg=args.num_neg)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))

    ncf_net = NCFModel(num_users=num_train_users,
                       num_items=num_train_items,
                       num_factors=num_factors,
                       model_layers=layers,
                       mf_regularization=0,
                       mlp_reg_layers=[0.0, 0.0, 0.0, 0.0],
                       mf_dim=16)
    loss_net = NetWithLossClass(ncf_net)
    train_net = TrainStepWrap(loss_net,
                              ds_train.get_dataset_size() * (epochs + 1))

    train_net.set_train()

    model = Model(train_net)
    callback = LossMonitor(per_print_times=ds_train.get_dataset_size())
    ckpt_config = CheckpointConfig(
        save_checkpoint_steps=(4970845 + args.batch_size - 1) //
        (args.batch_size),
        keep_checkpoint_max=100)
    ckpoint_cb = ModelCheckpoint(prefix='NCF',
                                 directory=args.checkpoint_path,
                                 config=ckpt_config)
    model.train(epochs,
                ds_train,
                callbacks=[
                    TimeMonitor(ds_train.get_dataset_size()), callback,
                    ckpoint_cb
                ],
                dataset_sink_mode=True)
Example #16
def inception_v4_train():
    """
    Train Inceptionv4 in data parallelism
    """
    print('epoch_size: {} batch_size: {} class_num {}'.format(
        config.epoch_size, config.batch_size, config.num_classes))

    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    context.set_context(device_id=args.device_id)
    context.set_context(enable_graph_kernel=False)
    rank = 0
    if device_num > 1:
        init(backend_name='hccl')
        rank = get_rank()
        context.set_auto_parallel_context(
            device_num=device_num,
            parallel_mode=ParallelMode.DATA_PARALLEL,
            gradients_mean=True,
            all_reduce_fusion_config=[200, 400])

    # create dataset
    train_dataset = create_dataset(dataset_path=args.dataset_path,
                                   do_train=True,
                                   repeat_num=1,
                                   batch_size=config.batch_size)
    train_step_size = train_dataset.get_dataset_size()

    # create model
    net = Inceptionv4(classes=config.num_classes)
    # loss
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    # learning rate
    lr = Tensor(
        generate_cosine_lr(steps_per_epoch=train_step_size,
                           total_epochs=config.epoch_size))

    decayed_params = []
    no_decayed_params = []
    for param in net.trainable_params():
        if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
            decayed_params.append(param)
        else:
            no_decayed_params.append(param)
    for param in net.trainable_params():
        if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
            param.set_data(
                initializer(XavierUniform(), param.data.shape,
                            param.data.dtype))
    group_params = [{
        'params': decayed_params,
        'weight_decay': config.weight_decay
    }, {
        'params': no_decayed_params
    }, {
        'order_params': net.trainable_params()
    }]

    opt = RMSProp(group_params,
                  lr,
                  decay=config.decay,
                  epsilon=config.epsilon,
                  weight_decay=config.weight_decay,
                  momentum=config.momentum,
                  loss_scale=config.loss_scale)

    if args.device_id == 0:
        print(lr)
        print(train_step_size)
    if args.resume:
        ckpt = load_checkpoint(args.resume)
        load_param_into_net(net, ckpt)

    loss_scale_manager = FixedLossScaleManager(config.loss_scale,
                                               drop_overflow_update=False)
    model = Model(net,
                  loss_fn=loss,
                  optimizer=opt,
                  metrics={'acc', 'top_1_accuracy', 'top_5_accuracy'},
                  loss_scale_manager=loss_scale_manager,
                  amp_level=config.amp_level)

    # define callbacks
    performance_cb = TimeMonitor(data_size=train_step_size)
    loss_cb = LossMonitor(per_print_times=train_step_size)
    ckp_save_step = config.save_checkpoint_epochs * train_step_size
    config_ck = CheckpointConfig(
        save_checkpoint_steps=ckp_save_step,
        keep_checkpoint_max=config.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix=f"inceptionV4-train-rank{rank}",
                                 directory='ckpts_rank_' + str(rank),
                                 config=config_ck)
    callbacks = [performance_cb, loss_cb]
    if device_num > 1 and config.is_save_on_master:
        if args.device_id == 0:
            callbacks.append(ckpoint_cb)
    else:
        callbacks.append(ckpoint_cb)

    # train model
    model.train(config.epoch_size,
                train_dataset,
                callbacks=callbacks,
                dataset_sink_mode=True)
Example #17
def train_and_eval(config):
    """
    test_train_eval
    """
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    print("epochs is {}".format(epochs))
    if config.full_batch:
        context.set_auto_parallel_context(full_batch=True)
        de.config.set_seed(1)
        ds_train = create_dataset(data_path,
                                  train_mode=True,
                                  epochs=epochs,
                                  batch_size=batch_size * get_group_size())
        ds_eval = create_dataset(data_path,
                                 train_mode=False,
                                 epochs=epochs + 1,
                                 batch_size=batch_size * get_group_size())
    else:
        ds_train = create_dataset(data_path,
                                  train_mode=True,
                                  epochs=epochs,
                                  batch_size=batch_size,
                                  rank_id=get_rank(),
                                  rank_size=get_group_size())
        ds_eval = create_dataset(data_path,
                                 train_mode=False,
                                 epochs=epochs + 1,
                                 batch_size=batch_size,
                                 rank_id=get_rank(),
                                 rank_size=get_group_size())
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net,
                  eval_network=eval_net,
                  metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(
        save_checkpoint_steps=ds_train.get_dataset_size(),
        keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path,
                                 config=ckptconfig)
    context.set_auto_parallel_context(
        strategy_ckpt_save_file="./strategy_train.ckpt")
    model.train(epochs,
                ds_train,
                callbacks=[
                    TimeMonitor(ds_train.get_dataset_size()), eval_callback,
                    callback, ckpoint_cb
                ])
Example #18
    train_dataset = create_dataset("./datasets/MNIST_Data/train")
    eval_dataset = create_dataset("./datasets/MNIST_Data/test")

    print("========== The Training Model is Defined. ==========")

    # train the model and export the encrypted CheckPoint file through Callback
    config_ck = CheckpointConfig(save_checkpoint_steps=1875,
                                 keep_checkpoint_max=10,
                                 enc_key=b'0123456789ABCDEF',
                                 enc_mode='AES-GCM')
    ckpoint_cb = ModelCheckpoint(prefix='lenet_enc',
                                 directory=None,
                                 config=config_ck)
    model.train(10,
                train_dataset,
                dataset_sink_mode=False,
                callbacks=[ckpoint_cb, LossMonitor(1875)])
    acc = model.eval(eval_dataset, dataset_sink_mode=False)
    print("Accuracy: {}".format(acc["Accuracy"]))

    # export the encrypted CheckPoint file through save_checkpoint
    save_checkpoint(network,
                    'lenet_enc.ckpt',
                    enc_key=b'0123456789ABCDEF',
                    enc_mode='AES-GCM')

    # load encrypted CheckPoint file and eval
    param_dict = load_checkpoint('lenet_enc-10_1875.ckpt',
                                 dec_key=b'0123456789ABCDEF',
                                 dec_mode='AES-GCM')
    load_param_into_net(network, param_dict)
Example #19
        "e2e_dump_settings": {
            "enable": True,
            "trans_flag": False
        }
    }
    with open("./data_dump.json", "w", encoding="GBK") as f:
        json.dump(data_dump, f)
    os.environ['MINDSPORE_DUMP_CONFIG'] = abspath + "/data_dump.json"

def set_log_info():
    os.environ['GLOG_v'] = '1'
    os.environ['GLOG_logtostderr'] = '1'
    os.environ['logger_maxBytes'] = '5242880'
    os.environ['GLOG_log_dir'] = 'D:/' if os.name == "nt" else '/var/log/mindspore'
    os.environ['logger_backupCount'] = '10'
    print(logger.get_log_config())

if __name__ == "__main__":
    set_dump_info()
    set_log_info()
    context.set_context(mode=context.GRAPH_MODE)
    train_dataset = create_train_dataset()
    eval_dataset = create_eval_dataset()
    net = Net()
    net_opt = Momentum(net.trainable_params(), 0.01, 0.9)
    net_loss = SoftmaxCrossEntropyWithLogits(reduction='mean')
    model = Model(network=net, loss_fn=net_loss, optimizer=net_opt, metrics={'Accuracy': nn.Accuracy()})
    model.train(epoch=100,
                train_dataset=train_dataset,
                callbacks=[LossMonitor(), StopAtTime(3), SaveCallback(model, eval_dataset)])
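
SaveCallback is not shown (StopAtTime is sketched under Example #1); a hypothetical sketch that evaluates after every epoch and keeps the checkpoint with the best accuracy:

from mindspore import save_checkpoint
from mindspore.train.callback import Callback

class SaveCallback(Callback):
    """Hypothetical sketch: keep the checkpoint with the best eval accuracy."""
    def __init__(self, eval_model, eval_dataset):
        super(SaveCallback, self).__init__()
        self.model = eval_model
        self.eval_dataset = eval_dataset
        self.best_acc = 0.0

    def epoch_end(self, run_context):
        cb_params = run_context.original_args()
        result = self.model.eval(self.eval_dataset, dataset_sink_mode=False)
        if result["Accuracy"] > self.best_acc:
            self.best_acc = result["Accuracy"]
            save_checkpoint(cb_params.train_network, "best_acc.ckpt")
        print("best accuracy so far: {}".format(self.best_acc))
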
Example #20
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    repeat_size = 1
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    if args.mode == 'train':  # train
        ds_train = create_dataset(os.path.join(args.data_path, args.mode),
                                  batch_size=cfg.batch_size,
                                  repeat_size=repeat_size)
        print("============== Starting Training ==============")
        config_ck = CheckpointConfig(
            save_checkpoint_steps=cfg.save_checkpoint_steps,
            keep_checkpoint_max=cfg.keep_checkpoint_max)
        ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                                     config=config_ck,
                                     directory=args.ckpt_path)
        model.train(cfg['epoch_size'],
                    ds_train,
                    callbacks=[ckpoint_cb, LossMonitor()],
                    dataset_sink_mode=args.dataset_sink_mode)
    elif args.mode == 'test':  # test
        print("============== Starting Testing ==============")
        param_dict = load_checkpoint(args.ckpt_path)
        load_param_into_net(network, param_dict)
        ds_eval = create_dataset(os.path.join(args.data_path, "test"), 32, 1)
        acc = model.eval(ds_eval, dataset_sink_mode=args.dataset_sink_mode)
        print("============== Accuracy:{} ==============".format(acc))
    else:
        raise RuntimeError(
            'mode should be train or test, rather than {}'.format(args.mode))
Example #21
        init()

    epoch_size = args_opt.epoch_size
    net = resnet50(args_opt.batch_size, args_opt.num_classes)
    ls = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   0.01, 0.9)

    model = Model(net, loss_fn=ls, optimizer=opt, metrics={'acc'})

    # for training, use model.train
    if args_opt.do_train:
        dataset = create_dataset()
        batch_num = dataset.get_dataset_size()
        config_ck = CheckpointConfig(save_checkpoint_steps=batch_num,
                                     keep_checkpoint_max=35)
        ckpoint_cb = ModelCheckpoint(prefix="train_resnet_cifar10",
                                     directory="./",
                                     config=config_ck)
        loss_cb = LossMonitor()
        model.train(epoch_size, dataset, callbacks=[ckpoint_cb, loss_cb])

    # for evaluation, use model.eval
    if args_opt.do_eval:
        if args_opt.checkpoint_path:
            param_dict = load_checkpoint(args_opt.checkpoint_path)
            load_param_into_net(net, param_dict)
        eval_dataset = create_dataset(training=False)
        res = model.eval(eval_dataset)
        print("result: ", res)
Example #22
context.set_context(
    mode=context.PYNATIVE_MODE,
    device_target="GPU",
    enable_mem_reuse=False)
    # save_graphs=True, save_graphs_path="./graph/")
    # save_ms_model=True)
network = LeNet5(cfg.num_classes)
net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction="mean")
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
# config_ck = CheckpointConfig(save_checkpoint_steps=1,
#                              keep_checkpoint_max=cfg.keep_checkpoint_max)
# ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", directory="./ckpt", config=config_ck)
model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
#
#
# summary_writer = SummaryRecord(log_dir='./summary13', network=network)
# summary_callback = SummaryStep(summary_writer, flush_step=1)
#
# train_callback = TrainLineage(summary_writer)
saver_callback = DataSaverCallback()
ds_train = create_dataset(os.path.join(args.data_path, "train"),
                          cfg.batch_size,
                          cfg.epoch_size)
print("============== Starting Training ==============")
model.train(cfg['epoch_size'], ds_train,
            # callbacks=[LossMonitor()],
            callbacks=[saver_callback],
            dataset_sink_mode=args.dataset_sink_mode)
# summary_writer.close()
Example #23
    for para in train_net.trainable_params():
        if fix_weight_layer in para.name:
            para.requires_grad = False

if __name__ == "__main__":
    start_time = time.time()
    epoch_size = 3
    args_opt.base_size = config.crop_size
    args_opt.crop_size = config.crop_size
    train_dataset = create_dataset(args_opt, args_opt.data_url, 1, config.batch_size,
                                   usage="train", shuffle=False)
    dataset_size = train_dataset.get_dataset_size()
    callback = LossCallBack(dataset_size)
    net = deeplabv3_resnet50(config.seg_num_classes, [config.batch_size, 3, args_opt.crop_size, args_opt.crop_size],
                             infer_scale_sizes=config.eval_scales, atrous_rates=config.atrous_rates,
                             decoder_output_stride=config.decoder_output_stride, output_stride=config.output_stride,
                             fine_tune_batch_norm=config.fine_tune_batch_norm, image_pyramid=config.image_pyramid)
    net.set_train()
    model_fine_tune(args_opt, net, 'layer')
    loss = OhemLoss(config.seg_num_classes, config.ignore_label)
    opt = Momentum(filter(lambda x: 'beta' not in x.name and 'gamma' not in x.name and 'depth' not in x.name and 'bias' not in x.name, net.trainable_params()), learning_rate=config.learning_rate, momentum=config.momentum, weight_decay=config.weight_decay)
    model = Model(net, loss, opt)
    model.train(epoch_size, train_dataset, callback)
    print(time.time() - start_time)
    print("expect loss: ", callback.loss / 3)
    print("expect time: ", callback.time)
    expect_loss = 0.5
    expect_time = 35
    assert callback.loss.asnumpy() / 3 <= expect_loss
    assert callback.time <= expect_time
Example #24
"""
import mindspore.nn as nn
from mindspore.nn import Momentum, SoftmaxCrossEntropyWithLogits
from mindspore import Model, context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor

from src.dataset import create_train_dataset, create_eval_dataset
from src.net import Net


if __name__ == "__main__":
    context.set_context(mode=context.GRAPH_MODE)
    ds_train = create_train_dataset()
    ds_eval = create_eval_dataset()
    net = Net()
    net_opt = Momentum(net.trainable_params(), 0.01, 0.9)
    net_loss = SoftmaxCrossEntropyWithLogits(reduction='mean')
    metrics = {
        'Accuracy': nn.Accuracy(),
        'Loss': nn.Loss(),
        'Precision': nn.Precision(),
        'Recall': nn.Recall(),
        'F1_score': nn.F1()
    }
    config_ck = CheckpointConfig(save_checkpoint_steps=1000, keep_checkpoint_max=10)
    ckpoint = ModelCheckpoint(prefix="CKPT", config=config_ck)
    model = Model(network=net, loss_fn=net_loss, optimizer=net_opt, metrics=metrics)
    model.train(epoch=2, train_dataset=ds_train, callbacks=[ckpoint, LossMonitor()])
    result = model.eval(ds_eval)
    print(result)
Example #25
    # set args, train it
    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    train_data_path = "./datasets/MNIST_Data/train"
    eval_data_path = "./datasets/MNIST_Data/test"
    ckpt_save_dir = "./lenet_ckpt"
    epoch_size = 10
    eval_per_epoch = 2
    repeat = 1
    train_data = create_dataset(train_data_path, repeat_size=repeat)
    eval_data = create_dataset(eval_data_path, repeat_size=repeat)
    # define the net
    network = LeNet5()
    # define the loss function
    net_loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    # define the optimizer
    net_opt = nn.Momentum(network.trainable_params(),
                          learning_rate=0.01,
                          momentum=0.9)
    config_ck = CheckpointConfig(save_checkpoint_steps=eval_per_epoch * 1875,
                                 keep_checkpoint_max=15)
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                                 directory=ckpt_save_dir,
                                 config=config_ck)
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
    epoch_per_eval = {"epoch": [], "acc": []}
    eval_cb = EvalCallBack(model, eval_data, eval_per_epoch, epoch_per_eval)
    model.train(epoch_size,
                train_data,
                callbacks=[ckpoint_cb, LossMonitor(375), eval_cb],
                dataset_sink_mode=False)
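
This EvalCallBack differs from the Wide&Deep one: it evaluates every eval_per_epoch epochs and appends the results to epoch_per_eval. A sketch consistent with that usage:

from mindspore.train.callback import Callback

class EvalCallBack(Callback):
    """Sketch: evaluate every eval_per_epoch epochs and record the accuracy."""
    def __init__(self, model, eval_dataset, eval_per_epoch, epoch_per_eval):
        super(EvalCallBack, self).__init__()
        self.model = model
        self.eval_dataset = eval_dataset
        self.eval_per_epoch = eval_per_epoch
        self.epoch_per_eval = epoch_per_eval

    def epoch_end(self, run_context):
        cb_params = run_context.original_args()
        cur_epoch = cb_params.cur_epoch_num
        if cur_epoch % self.eval_per_epoch == 0:
            acc = self.model.eval(self.eval_dataset, dataset_sink_mode=False)
            self.epoch_per_eval["epoch"].append(cur_epoch)
            self.epoch_per_eval["acc"].append(acc["Accuracy"])
            print(acc)
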
Example #26
    # clean up files left over from previous runs (Linux)
    os.system('rm -rf {0}*.ckpt {0}*.meta {0}*.pb'.format(model_path))

    # define the model
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    # save the network model and parameters for subsequent fine-tuning
    config_ck = CheckpointConfig(save_checkpoint_steps=375,
                                 keep_checkpoint_max=16)

    # group layers into an object with training and evaluation features
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                                 directory=model_path,
                                 config=config_ck)

    steps_loss = {"step": [], "loss_value": []}
    steps_eval = {"step": [], "acc": []}

    # collect the step, loss and accuracy information
    step_loss_acc_info = StepLossAccInfo(model, ds_eval, steps_loss,
                                         steps_eval)

    model.train(epoch_size,
                ds_train,
                callbacks=[ckpoint_cb,
                           LossMonitor(125), step_loss_acc_info],
                dataset_sink_mode=False)

    loss_show(steps_loss)
    eval_show(steps_eval)
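
loss_show and eval_show are plotting helpers that are not included; a hypothetical matplotlib version, assuming steps_loss and steps_eval have the structure built above:

import matplotlib.pyplot as plt

def loss_show(steps_loss):
    # plot the recorded loss value against the global training step
    steps = list(map(int, steps_loss["step"]))
    loss_value = list(map(float, steps_loss["loss_value"]))
    plt.plot(steps, loss_value, color="red")
    plt.xlabel("step")
    plt.ylabel("loss value")
    plt.title("Loss per training step")
    plt.show()

def eval_show(steps_eval):
    # plot the recorded accuracy against the global training step
    plt.plot(steps_eval["step"], steps_eval["acc"], "o-")
    plt.xlabel("step")
    plt.ylabel("accuracy")
    plt.title("Accuracy per evaluation point")
    plt.show()
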
Example #27
def train_and_eval(config):
    """
    test_train_eval
    """
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    host_device_mix = bool(config.host_device_mix)
    print("epochs is {}".format(epochs))
    if config.full_batch:
        context.set_auto_parallel_context(full_batch=True)
        de.config.set_seed(1)
        if config.field_slice:
            compute_manual_shape(config, get_group_size())
            ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                                      batch_size=batch_size*get_group_size(), data_type=dataset_type,
                                      manual_shape=config.manual_shape, target_column=config.field_size)
            ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                                     batch_size=batch_size*get_group_size(), data_type=dataset_type,
                                     manual_shape=config.manual_shape, target_column=config.field_size)
        else:
            ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                                      batch_size=batch_size*get_group_size(), data_type=dataset_type)
            ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                                     batch_size=batch_size*get_group_size(), data_type=dataset_type)
    else:
        ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                                  batch_size=batch_size, rank_id=get_rank(),
                                  rank_size=get_group_size(), data_type=dataset_type)
        ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                                 batch_size=batch_size, rank_id=get_rank(),
                                 rank_size=get_group_size(), data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net, eval_network=eval_net,
                  metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(
        model, ds_eval, auc_metric, config, host_device_mix=host_device_mix)

    callback = LossCallBack(config=config, per_print_times=20)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size()*epochs,
                                  keep_checkpoint_max=5, integrated_save=False)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/', config=ckptconfig)
    context.set_auto_parallel_context(strategy_ckpt_save_file=config.stra_ckpt)
    callback_list = [TimeMonitor(
        ds_train.get_dataset_size()), eval_callback, callback]
    if not host_device_mix:
        callback_list.append(ckpoint_cb)
    model.train(epochs, ds_train, callbacks=callback_list,
                dataset_sink_mode=(not host_device_mix))
Example #28
def train_and_eval(config):
    """
    test_train_eval
    """
    set_seed(1000)
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    parameter_server = bool(config.parameter_server)
    cache_enable = config.vocab_cache_size > 0
    print("epochs is {}".format(epochs))
    ds_train = create_dataset(data_path,
                              train_mode=True,
                              epochs=1,
                              batch_size=batch_size,
                              data_type=dataset_type)
    ds_eval = create_dataset(data_path,
                             train_mode=False,
                             epochs=1,
                             batch_size=batch_size,
                             data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net,
                  eval_network=eval_net,
                  metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
    callback = LossCallBack(config=config)
    if _is_role_worker():
        if cache_enable:
            ckptconfig = CheckpointConfig(
                save_checkpoint_steps=ds_train.get_dataset_size() * epochs,
                keep_checkpoint_max=1)
        else:
            ckptconfig = CheckpointConfig(
                save_checkpoint_steps=ds_train.get_dataset_size(),
                keep_checkpoint_max=5)
    else:
        ckptconfig = CheckpointConfig(save_checkpoint_steps=1,
                                      keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path,
                                 config=ckptconfig)
    callback_list = [
        TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback,
        ckpoint_cb
    ]

    model.train(epochs,
                ds_train,
                callbacks=callback_list,
                dataset_sink_mode=(parameter_server and cache_enable))
Example #29
def train_and_eval(config):
    """
    test_train_eval
    """
    set_seed(1000)
    data_path = config.data_path
    batch_size = config.batch_size
    sparse = config.sparse
    epochs = config.epochs
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    print("epochs is {}".format(epochs))
    ds_train = create_dataset(data_path,
                              train_mode=True,
                              epochs=1,
                              batch_size=batch_size,
                              rank_id=get_rank(),
                              rank_size=get_group_size(),
                              data_type=dataset_type)
    ds_eval = create_dataset(data_path,
                             train_mode=False,
                             epochs=1,
                             batch_size=batch_size,
                             rank_id=get_rank(),
                             rank_size=get_group_size(),
                             data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net,
                  eval_network=eval_net,
                  metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(
        save_checkpoint_steps=ds_train.get_dataset_size(),
        keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path + '/ckpt_' +
                                 str(get_rank()) + '/',
                                 config=ckptconfig)
    out = model.eval(ds_eval)
    print("=====" * 5 + "model.eval() initialized: {}".format(out))
    callback_list = [
        TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback
    ]
    if get_rank() == 0:
        callback_list.append(ckpoint_cb)
    model.train(epochs,
                ds_train,
                callbacks=callback_list,
                sink_size=ds_train.get_dataset_size(),
                dataset_sink_mode=(not sparse))
Example #30
        self.fc2 = nn.Dense(hidden_size, 1)
        self.sig = ops.Sigmoid()

    def construct(self, x):
        x = self.fc1(x)
        x = self.sig(x)
        x = self.fc2(x)
        return x


m = Net(HIDDEN_SIZE)
optim = nn.Momentum(m.trainable_params(), 0.05, 0.9)

loss = nn.MSELoss()

loss_cb = LossMonitor()

model = Model(m, loss, optim, {'acc': Accuracy()})

time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())

model.train(ITERATIONS,
            ds_train,
            callbacks=[time_cb, loss_cb],
            dataset_sink_mode=False)

print("TF", model.predict(Tensor([[1, 0]], mindspore.float32)).asnumpy())
print("FF", model.predict(Tensor([[0, 0]], mindspore.float32)).asnumpy())
print("TT", model.predict(Tensor([[1, 1]], mindspore.float32)).asnumpy())
print("FT", model.predict(Tensor([[0, 1]], mindspore.float32)).asnumpy())