Example #1
def test_deepfm():
    data_config = DataConfig()
    train_config = TrainConfig()
    device_id = int(os.getenv('DEVICE_ID'))
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        device_id=device_id)
    rank_size = None
    rank_id = None

    dataset_path = "/home/workspace/mindspore_dataset/criteo_data/criteo_h5/"
    print("dataset_path:", dataset_path)
    ds_train = create_dataset(dataset_path,
                              train_mode=True,
                              epochs=1,
                              batch_size=train_config.batch_size,
                              data_type=DataType(data_config.data_format),
                              rank_size=rank_size,
                              rank_id=rank_id)

    model_builder = ModelBuilder(ModelConfig, TrainConfig)
    train_net, eval_net = model_builder.get_train_eval_net()
    auc_metric = AUCMetric()
    model = Model(train_net,
                  eval_network=eval_net,
                  metrics={"auc": auc_metric})

    loss_file_name = './loss.log'
    time_callback = TimeMonitor(data_size=ds_train.get_dataset_size())
    loss_callback = LossCallBack(loss_file_path=loss_file_name)
    callback_list = [time_callback, loss_callback]

    eval_file_name = './auc.log'
    ds_eval = create_dataset(dataset_path,
                             train_mode=False,
                             epochs=1,
                             batch_size=train_config.batch_size,
                             data_type=DataType(data_config.data_format))
    eval_callback = EvalCallBack(model,
                                 ds_eval,
                                 auc_metric,
                                 eval_file_path=eval_file_name)
    callback_list.append(eval_callback)

    print("train_config.train_epochs:", train_config.train_epochs)
    model.train(train_config.train_epochs, ds_train, callbacks=callback_list)

    export_loss_value = 0.51
    print("loss_callback.loss:", loss_callback.loss)
    assert loss_callback.loss < export_loss_value
    export_per_step_time = 40.0
    print("time_callback:", time_callback.per_step_time)
    assert time_callback.per_step_time < export_per_step_time
    print("*******test case pass!********")
Example #2
def train_on_gpu():
    config = config_gpu_quant if args_opt.quantization_aware else config_gpu
    print("training args: {}".format(args_opt))
    print("training configure: {}".format(config))

    # define network
    network = mobilenetV2(num_classes=config.num_classes)
    # define loss
    if config.label_smooth > 0:
        loss = CrossEntropyWithLabelSmooth(smooth_factor=config.label_smooth,
                                           num_classes=config.num_classes)
    else:
        loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction='mean')
    # define dataset
    epoch_size = config.epoch_size
    dataset = create_dataset(dataset_path=args_opt.dataset_path,
                             do_train=True,
                             config=config,
                             device_target=args_opt.device_target,
                             repeat_num=1,
                             batch_size=config.batch_size)
    step_size = dataset.get_dataset_size()
    # resume
    if args_opt.pre_trained:
        param_dict = load_checkpoint(args_opt.pre_trained)
        load_param_into_net(network, param_dict)

    # convert fusion network to quantization aware network
    if config.quantization_aware:
        network = quant.convert_quant_network(network,
                                              bn_fold=True,
                                              per_channel=[True, False],
                                              symmetric=[True, True])

    # get learning rate
    loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
    lr = Tensor(get_lr(global_step=config.start_epoch * step_size,
                       lr_init=0,
                       lr_end=0,
                       lr_max=config.lr,
                       warmup_epochs=config.warmup_epochs,
                       total_epochs=epoch_size + config.start_epoch,
                       steps_per_epoch=step_size))

    # define optimization
    opt = nn.Momentum(filter(lambda x: x.requires_grad, network.get_parameters()), lr, config.momentum,
                      config.weight_decay, config.loss_scale)
    # define model
    model = Model(network, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale)

    print("============== Starting Training ==============")
    callback = [Monitor(lr_init=lr.asnumpy())]
    ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/"
    if config.save_checkpoint:
        config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
                                     keep_checkpoint_max=config.keep_checkpoint_max)
        ckpt_cb = ModelCheckpoint(prefix="mobilenetV2", directory=ckpt_save_dir, config=config_ck)
        callback += [ckpt_cb]
    model.train(epoch_size, dataset, callbacks=callback)
    print("============== End Training ==============")
Example #3
def val(net, data_dir, filename, num_consumer=4, batch=32):
    """
    Validation function, estimate the performance of trained model

    Input:
            net:        the trained neural network
            data_dir:       path to the validation dataset
            filename:       name of the validation dataset
            num_consumer:   split number of validation dataset
            batch:          validation batch size
    Outputs
            Float, AUC
    """
    data_train = create_dataset(data_dir, filename, batch, ['feature', 'label'],
                                num_consumer)
    data_train = data_train.create_tuple_iterator()
    res_pred = []
    res_true = []
    for data, label in data_train:
        x = net(Tensor(data, dtype=mstype.float32))
        res_pred.append(x.asnumpy())
        res_true.append(label.asnumpy())
    res_pred = np.concatenate(res_pred, axis=0)
    res_true = np.concatenate(res_true, axis=0)
    auc = calculate_auc(res_true, res_pred)
    return auc
Example #4
def train(model,
          dataset_direct,
          filename,
          columns_list,
          num_consumer=4,
          batch=16,
          epoch=50,
          save_checkpoint_steps=2172,
          keep_checkpoint_max=50,
          prefix="model",
          directory='./'):
    """
    train network
    """
    config_ck = CheckpointConfig(save_checkpoint_steps=save_checkpoint_steps,
                                 keep_checkpoint_max=keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix=prefix,
                                 directory=directory,
                                 config=config_ck)
    data_train = create_dataset(dataset_direct, filename, batch, columns_list,
                                num_consumer)

    model.train(epoch,
                data_train,
                callbacks=[
                    ckpoint_cb,
                    LossMonitor(per_print_times=181),
                    TimeMonitor()
                ],
                dataset_sink_mode=True)
Example #5
def do_eval_standalone(args_opt):
    """
    do eval standalone
    """
    ckpt_file = os.path.join(args_opt.model_dir, args_opt.task_name)
    ckpt_file = get_ckpt(ckpt_file)
    print('ckpt file:', ckpt_file)
    task = task_cfg[args_opt.task_name]
    student_net_cfg.seq_length = task.seq_length
    eval_cfg.batch_size = args_opt.batch_size
    eval_data_dir = os.path.join(args_opt.data_dir, args_opt.task_name,
                                 DATA_NAME)

    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args_opt.device_target,
                        device_id=args_opt.device_id)

    eval_dataset = create_dataset(batch_size=eval_cfg.batch_size,
                                  device_num=1,
                                  rank=0,
                                  do_shuffle='false',
                                  data_dir=eval_data_dir,
                                  data_type=args_opt.dataset_type,
                                  seq_length=task.seq_length,
                                  task_type=task.task_type,
                                  drop_remainder=False)
    print('eval dataset size:', eval_dataset.get_dataset_size())
    print('eval dataset batch size:', eval_dataset.get_batch_size())

    eval_model = BertModelCLS(student_net_cfg,
                              False,
                              task.num_labels,
                              0.0,
                              phase_type='student')
    param_dict = load_checkpoint(ckpt_file)
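    # rename checkpoint keys (drop the 'tinybert_'/'bert.' prefixes) so they match the eval model's parameter names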
    new_param_dict = {}
    for key, value in param_dict.items():
        new_key = re.sub('tinybert_', 'bert_', key)
        new_key = re.sub('^bert.', '', new_key)
        new_param_dict[new_key] = value
    load_param_into_net(eval_model, new_param_dict)
    eval_model.set_train(False)

    columns_list = ["input_ids", "input_mask", "segment_ids", "label_ids"]
    callback = task.metrics()
    for step, data in enumerate(eval_dataset.create_dict_iterator()):
        input_data = []
        for i in columns_list:
            input_data.append(data[i])
        input_ids, input_mask, token_type_id, label_ids = input_data
        _, _, logits, _ = eval_model(input_ids, token_type_id, input_mask)
        callback.update(logits, label_ids)
        print('eval step: {}, {}: {}'.format(step, callback.name,
                                             callback.get_metrics()))
    metrics = callback.get_metrics()
    print('The best {}: {}'.format(callback.name, metrics))
Example #6
def train_net(data_dir,
              seg_dir,
              config=None):
    train_dataset = create_dataset(data_path=data_dir, seg_path=seg_dir, config=config, is_training=True)
    # for item in train_dataset:
    #     print(item)
    # exit(0)

    train_data_size = len(train_dataset)
    print("train dataset length is:", train_data_size)

    network = UNet3d(config=config)
    criterion = DiceLoss()
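    # lr=1 below is presumably a placeholder that the per-step scheduler (dynamic_lr_scheduler) rescales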
    optimizer = torch.optim.Adam(params=network.parameters(), lr=1)
    scheduler = dynamic_lr_scheduler(config, train_data_size, optimizer)
    device = torch.device('cuda:0')
    network.to(device)

    print("============== Starting Training ==============")
    network.train()
    step_per_epoch = train_data_size
    for epoch_id in range(cfg.epoch_size):
        time_epoch = 0.0
        torch.cuda.synchronize(0)
        time_start = time.time()
        for batch_idx, data in enumerate(train_dataset, 0):
            inputs, labels = data
            inputs.squeeze_(0)
            labels.squeeze_(0)
            inputs = inputs.to(device)
            labels = labels.to(device)
            # zeros the parameter gradients
            optimizer.zero_grad()
            outputs = network(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()
            # print statistics
            running_loss = loss.item()
            torch.cuda.synchronize(0)
            time_end = time.time()
            time_step = time_end - time_start
            time_epoch = time_epoch + time_step
            print('Epoch: [%3d/%3d], step: [%5d/%5d], loss: [%6.4f], time: [%.4f]' %
                        (epoch_id, cfg.epoch_size, batch_idx + 1, step_per_epoch, 
                        running_loss, time_step))
            time_start = time.time()
        print('Epoch time: %10.4f, per step time: %7.4f' % (time_epoch, time_epoch / step_per_epoch))

        ckpt_path = "./ckpt_0/Unet3d"
        ckpt_file = ('%s-%d_%d.ckpt' % (ckpt_path, epoch_id + 1, step_per_epoch))
        torch.save(network, ckpt_file)

    print("============== End Training ==============")
Example #7
def predict():
    """Predict function."""
    args = get_args("predict")
    G_A = get_generator(args)
    G_B = get_generator(args)
    # Use BatchNorm2d with batch_size=1, affine=False, training=True instead of InstanceNorm2d
    # Use the real mean and variance rather than moving_mean and moving_variance in BatchNorm2d
    G_A.set_train(True)
    G_B.set_train(True)
    load_ckpt(args, G_A, G_B)

    imgs_out = os.path.join(args.outputs_dir, "predict")
    if not os.path.exists(imgs_out):
        os.makedirs(imgs_out)
    if not os.path.exists(os.path.join(imgs_out, "fake_A")):
        os.makedirs(os.path.join(imgs_out, "fake_A"))
    if not os.path.exists(os.path.join(imgs_out, "fake_B")):
        os.makedirs(os.path.join(imgs_out, "fake_B"))
    args.data_dir = 'testA'
    ds = create_dataset(args)
    reporter = Reporter(args)
    reporter.start_predict("A to B")
    for data in ds.create_dict_iterator(output_numpy=True):
        img_A = Tensor(data["image"])
        path_A = str(data["image_name"][0], encoding="utf-8")
        fake_B = G_A(img_A)
        save_image(fake_B, os.path.join(imgs_out, "fake_B", path_A))
    reporter.info('save fake_B at %s', os.path.join(imgs_out, "fake_B",
                                                    path_A))
    reporter.end_predict()
    args.data_dir = 'testB'
    ds = create_dataset(args)
    reporter.dataset_size = args.dataset_size
    reporter.start_predict("B to A")
    for data in ds.create_dict_iterator(output_numpy=True):
        img_B = Tensor(data["image"])
        path_B = str(data["image_name"][0], encoding="utf-8")
        fake_A = G_B(img_B)
        save_image(fake_A, os.path.join(imgs_out, "fake_A", path_B))
    reporter.info('save fake_A at %s', os.path.join(imgs_out, "fake_A",
                                                    path_B))
    reporter.end_predict()
Example #8
def run_eval():
    """eval method"""
    if not os.path.exists(config.output_path):
        os.makedirs(config.output_path)

    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Davinci",
                        save_graphs=False,
                        device_id=get_device_id())

    layers = config.layers
    num_factors = config.num_factors
    topk = rconst.TOP_K
    num_eval_neg = rconst.NUM_EVAL_NEGATIVES

    ds_eval, num_eval_users, num_eval_items = create_dataset(
        test_train=False,
        data_dir=config.data_path,
        dataset=config.dataset,
        train_epochs=0,
        eval_batch_size=config.eval_batch_size)
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    ncf_net = NCFModel(num_users=num_eval_users,
                       num_items=num_eval_items,
                       num_factors=num_factors,
                       model_layers=layers,
                       mf_regularization=0,
                       mlp_reg_layers=[0.0, 0.0, 0.0, 0.0],
                       mf_dim=16)
    param_dict = load_checkpoint(config.checkpoint_file_path)
    load_param_into_net(ncf_net, param_dict)

    loss_net = NetWithLossClass(ncf_net)
    train_net = TrainStepWrap(loss_net)
    eval_net = PredictWithSigmoid(ncf_net, topk, num_eval_neg)

    ncf_metric = NCFMetric()
    model = Model(train_net,
                  eval_network=eval_net,
                  metrics={"ncf": ncf_metric})

    ncf_metric.clear()
    out = model.eval(ds_eval)

    eval_file_path = os.path.join(config.output_path, config.eval_file_name)
    with open(eval_file_path, "a+") as eval_file:
        eval_file.write("EvalCallBack: HR = {}, NDCG = {}\n".format(
            out['ncf'][0], out['ncf'][1]))
    print("EvalCallBack: HR = {}, NDCG = {}".format(out['ncf'][0],
                                                    out['ncf'][1]))
    print("=" * 100 + "Eval Finish!" + "=" * 100)
Example #9
def resnet50_train(args_opt):
    epoch_size = args_opt.epoch_size
    batch_size = cfg.batch_size
    class_num = cfg.class_num
    loss_scale_num = cfg.loss_scale
    local_data_path = '/cache/data'
    local_ckpt_path = '/cache/ckpt_file'

    # set graph mode and parallel mode
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False)

    # data download
    print('Download data.')
    mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path)

    # create dataset
    print('Create train and evaluate dataset.')
    train_dataset = create_dataset(dataset_path=local_data_path, do_train=True,
                                   repeat_num=epoch_size, batch_size=batch_size)
    train_step_size = train_dataset.get_dataset_size()
    print('Create dataset success.')

    # create model
    net = resnet50(class_num=class_num)
    # reduction='mean' means that apply reduction of mean to loss
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    lr = Tensor(get_lr(global_step=0, total_epochs=epoch_size, steps_per_epoch=train_step_size))
    opt = Momentum(net.trainable_params(), lr, momentum=0.9, weight_decay=1e-4, loss_scale=loss_scale_num)
    loss_scale = FixedLossScaleManager(loss_scale_num, False)

    # amp_level="O2" means that the hybrid precision of O2 mode is used for training
    # the whole network except that batchnorm will be cast into float16 format and dynamic loss scale will be used
    # 'keep_batchnorm_fp32 = False' means that use the float16 format
    model = Model(net, amp_level="O2", keep_batchnorm_fp32=False, loss_fn=loss,
                  optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'})

    # define performance callback to show ips and loss callback to show loss for every epoch
    time_cb = TimeMonitor(data_size=train_step_size)
    performance_cb = PerformanceCallback(batch_size)
    loss_cb = LossMonitor()
    cb = [time_cb, performance_cb, loss_cb]
    config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_epochs * train_step_size,
                                 keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpt_cb = ModelCheckpoint(prefix="resnet", directory=local_ckpt_path, config=config_ck)
    cb += [ckpt_cb]

    print(f'Start run training, total epoch: {epoch_size}.')
    model.train(epoch_size, train_dataset, callbacks=cb)

    # upload checkpoint files
    print('Upload checkpoint.')
    mox.file.copy_parallel(src_url=local_ckpt_path, dst_url=args_opt.train_url)
Example #10
File: train.py  Project: yrpang/mindspore
def train():
    """Train function."""
    args = get_args("train")
    if args.need_profiler:
        from mindspore.profiler.profiling import Profiler
        profiler = Profiler(output_path=args.outputs_dir,
                            is_detail=True,
                            is_show_op_path=True)
    ds = create_dataset(args)
    G_A = get_generator(args)
    G_B = get_generator(args)
    D_A = get_discriminator(args)
    D_B = get_discriminator(args)
    load_ckpt(args, G_A, G_B, D_A, D_B)
    image_pool_A = ImagePool(args.pool_size)
    image_pool_B = ImagePool(args.pool_size)
    generator = Generator(G_A, G_B, args.lambda_idt > 0)

    loss_D = DiscriminatorLoss(args, D_A, D_B)
    loss_G = GeneratorLoss(args, generator, D_A, D_B)
    optimizer_G = nn.Adam(generator.trainable_params(),
                          get_lr(args),
                          beta1=args.beta1)
    optimizer_D = nn.Adam(loss_D.trainable_params(),
                          get_lr(args),
                          beta1=args.beta1)

    net_G = TrainOneStepG(loss_G, generator, optimizer_G)
    net_D = TrainOneStepD(loss_D, optimizer_D)

    data_loader = ds.create_dict_iterator()
    reporter = Reporter(args)
    reporter.info('==========start training===============')
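    # training loop: update the generators first, then the discriminators on fakes drawn from the image pools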
    for _ in range(args.max_epoch):
        reporter.epoch_start()
        for data in data_loader:
            img_A = data["image_A"]
            img_B = data["image_B"]
            res_G = net_G(img_A, img_B)
            fake_A = res_G[0]
            fake_B = res_G[1]
            res_D = net_D(img_A, img_B, image_pool_A.query(fake_A),
                          image_pool_B.query(fake_B))
            reporter.step_end(res_G, res_D)
            reporter.visualizer(img_A, img_B, fake_A, fake_B)
        reporter.epoch_end(net_G)
        if args.need_profiler:
            profiler.analyse()
            break

    reporter.info('==========end training===============')
Example #11
def train_net(data_dir, seg_dir, run_distribute, config=None):
    if run_distribute:
        init()
        rank_id = get_rank()
        rank_size = get_group_size()
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                          device_num=rank_size,
                                          gradients_mean=True)
    else:
        rank_id = 0
        rank_size = 1
    train_dataset = create_dataset(data_path=data_dir, seg_path=seg_dir, config=config, \
                                    rank_size=rank_size, rank_id=rank_id, is_training=True)
    # for item in train_dataset:
    #     print(item)
    # exit(0)

    train_data_size = train_dataset.get_dataset_size()
    print("train dataset length is:", train_data_size)

    network = UNet3d(config=config)

    loss = SoftmaxCrossEntropyWithLogits()
    lr = Tensor(dynamic_lr(config, train_data_size), mstype.float32)
    optimizer = nn.Adam(params=network.trainable_params(), learning_rate=lr)
    scale_manager = FixedLossScaleManager(config.loss_scale,
                                          drop_overflow_update=False)
    network.set_train()

    model = Model(network,
                  loss_fn=loss,
                  optimizer=optimizer,
                  loss_scale_manager=scale_manager,
                  amp_level='O2')

    time_cb = TimeMonitor(data_size=train_data_size)
    loss_cb = LossMonitor()
    ckpt_config = CheckpointConfig(
        save_checkpoint_steps=train_data_size,
        keep_checkpoint_max=config.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix='{}'.format(config.model),
                                 directory='./ckpt_{}/'.format(rank_size),
                                 config=ckpt_config)
    callbacks_list = [loss_cb, time_cb, ckpoint_cb]
    print("============== Starting Training ==============")
    model.train(config.epoch_size, train_dataset, callbacks=callbacks_list)
    # model.train(config.epoch_size, train_dataset, callbacks=callbacks_list, dataset_sink_mode=False)
    print("============== End Training ==============")
Example #12
def resnet50_eval(args_opt):
    class_num = cfg.class_num
    local_data_path = '/cache/data'
    ckpt_file_slice = args_opt.checkpoint_path.split('/')
    ckpt_file = ckpt_file_slice[len(ckpt_file_slice) - 1]
    local_ckpt_path = '/cache/' + ckpt_file

    # set graph mode and parallel mode
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        save_graphs=False)

    # data download
    print('Download data.')
    mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path)
    mox.file.copy_parallel(src_url=args_opt.checkpoint_path,
                           dst_url=local_ckpt_path)

    # create dataset
    dataset = create_dataset(dataset_path=local_data_path,
                             do_train=False,
                             batch_size=cfg.batch_size)

    # load checkpoint into net
    net = resnet50(class_num=class_num)
    param_dict = load_checkpoint(local_ckpt_path)
    load_param_into_net(net, param_dict)
    net.set_train(False)

    # define loss and model
    if not cfg.use_label_smooth:
        cfg.label_smooth_factor = 0.0
    loss = CrossEntropySmooth(sparse=True,
                              reduction='mean',
                              smooth_factor=cfg.label_smooth_factor,
                              num_classes=cfg.class_num)
    model = Model(net,
                  loss_fn=loss,
                  metrics={'top_1_accuracy', 'top_5_accuracy'})

    # eval model
    res = model.eval(dataset)
    print("result:", res, "ckpt=", args_opt.checkpoint_path)
Example #13
def test_net(data_dir, seg_dir, ckpt_path, config=None):
    eval_dataset = create_dataset(data_path=data_dir,
                                  seg_path=seg_dir,
                                  config=config,
                                  is_training=False)
    eval_data_size = eval_dataset.get_dataset_size()
    print("train dataset length is:", eval_data_size)

    network = UNet3d(config=config)
    network.set_train(False)
    param_dict = load_checkpoint(ckpt_path)
    load_param_into_net(network, param_dict)
    model = Model(network)
    index = 0
    total_dice = 0
    for batch in eval_dataset.create_dict_iterator(num_epochs=1,
                                                   output_numpy=True):
        image = batch["image"]
        seg = batch["seg"]
        print("current image shape is {}".format(image.shape), flush=True)
        sliding_window_list, slice_list = create_sliding_window(
            image, config.roi_size, config.overlap)
        image_size = (config.batch_size, config.num_classes) + image.shape[2:]
        output_image = np.zeros(image_size, np.float32)
        count_map = np.zeros(image_size, np.float32)
        importance_map = np.ones(config.roi_size, np.float32)
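        # accumulate window predictions and per-voxel coverage, then normalize to get the full-volume prediction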
        for window, slice_ in zip(sliding_window_list, slice_list):
            window_image = Tensor(window, mstype.float32)
            pred_probs = model.predict(window_image)
            output_image[slice_] += pred_probs.asnumpy()
            count_map[slice_] += importance_map
        output_image = output_image / count_map
        dice, _ = CalculateDice(output_image, seg)
        print("The {} batch dice is {}".format(index, dice), flush=True)
        total_dice += dice
        index = index + 1
    avg_dice = total_dice / eval_data_size
    print(
        "**********************End Eval***************************************"
    )
    print("eval average dice is {}".format(avg_dice))
Example #14
def eval_lenet():

    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)

    network = LeNet5(config.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    # repeat_size = config.epoch_size
    net_opt = nn.Momentum(network.trainable_params(), config.lr, config.momentum)
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    print("============== Starting Testing ==============")
    param_dict = load_checkpoint(ckpt_path)
    load_param_into_net(network, param_dict)
    ds_eval = create_dataset(os.path.join(config.data_path, "test"),
                             config.batch_size,
                             1)
    if ds_eval.get_dataset_size() == 0:
        raise ValueError("Please check dataset size > 0 and batch_size <= dataset size")

    acc = model.eval(ds_eval)
    print("============== {} ==============".format(acc))
Example #15
def train_lenet():

    context.set_context(mode=context.GRAPH_MODE,
                        device_target=config.device_target)
    ds_train = create_dataset(os.path.join(config.data_path, "train"),
                              config.batch_size)
    if ds_train.get_dataset_size() == 0:
        raise ValueError(
            "Please check dataset size > 0 and batch_size <= dataset size")

    network = LeNet5(config.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), config.lr,
                          config.momentum)
    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
    config_ck = CheckpointConfig(
        save_checkpoint_steps=config.save_checkpoint_steps,
        keep_checkpoint_max=config.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                                 directory=config.checkpoint_path,
                                 config=config_ck)

    if config.device_target != "Ascend":
        model = Model(network,
                      net_loss,
                      net_opt,
                      metrics={"Accuracy": Accuracy()})
    else:
        model = Model(network,
                      net_loss,
                      net_opt,
                      metrics={"Accuracy": Accuracy()},
                      amp_level="O2")

    print("============== Starting Training ==============")
    model.train(config.epoch_size,
                ds_train,
                callbacks=[time_cb, ckpoint_cb,
                           LossMonitor()])
Example #16
File: train.py  Project: yrpang/mindspore
def inception_v4_train():
    """
    Train Inceptionv4 in data parallelism
    """
    print('epoch_size: {} batch_size: {} class_num {}'.format(config.epoch_size, config.batch_size, config.num_classes))

    context.set_context(mode=context.GRAPH_MODE, device_target=args.platform)
    if args.platform == "Ascend":
        context.set_context(device_id=args.device_id)
        context.set_context(enable_graph_kernel=False)

    rank = 0
    if device_num > 1:
        if args.platform == "Ascend":
            init(backend_name='hccl')
        elif args.platform == "GPU":
            init()
        else:
            raise ValueError("Unsupported device target.")

        rank = get_rank()
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True,
                                          all_reduce_fusion_config=[200, 400])

    # create dataset
    train_dataset = create_dataset(dataset_path=args.dataset_path, do_train=True,
                                   repeat_num=1, batch_size=config.batch_size, shard_id=rank)
    train_step_size = train_dataset.get_dataset_size()

    # create model
    net = Inceptionv4(classes=config.num_classes)
    # loss
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    # learning rate
    lr = Tensor(generate_cosine_lr(steps_per_epoch=train_step_size, total_epochs=config.epoch_size))

    decayed_params = []
    no_decayed_params = []
    for param in net.trainable_params():
        if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
            decayed_params.append(param)
        else:
            no_decayed_params.append(param)
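    # re-initialize weight parameters (excluding BN beta/gamma and biases) with Xavier uniform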
    for param in net.trainable_params():
        if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
            param.set_data(initializer(XavierUniform(), param.data.shape, param.data.dtype))
    group_params = [{'params': decayed_params, 'weight_decay': config.weight_decay},
                    {'params': no_decayed_params},
                    {'order_params': net.trainable_params()}]

    opt = RMSProp(group_params, lr, decay=config.decay, epsilon=config.epsilon, weight_decay=config.weight_decay,
                  momentum=config.momentum, loss_scale=config.loss_scale)

    if args.device_id == 0:
        print(lr)
        print(train_step_size)
    if args.resume:
        ckpt = load_checkpoint(args.resume)
        load_param_into_net(net, ckpt)

    loss_scale_manager = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)


    if args.platform == "Ascend":
        model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc', 'top_1_accuracy', 'top_5_accuracy'},
                      loss_scale_manager=loss_scale_manager, amp_level=config.amp_level)
    elif args.platform == "GPU":
        model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc', 'top_1_accuracy', 'top_5_accuracy'},
                      loss_scale_manager=loss_scale_manager, amp_level='O0')
    else:
        raise ValueError("Unsupported device target.")

    # define callbacks
    performance_cb = TimeMonitor(data_size=train_step_size)
    loss_cb = LossMonitor(per_print_times=train_step_size)
    ckp_save_step = config.save_checkpoint_epochs * train_step_size
    config_ck = CheckpointConfig(save_checkpoint_steps=ckp_save_step, keep_checkpoint_max=config.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix=f"inceptionV4-train-rank{rank}",
                                 directory='ckpts_rank_' + str(rank), config=config_ck)
    callbacks = [performance_cb, loss_cb]
    if device_num > 1 and config.is_save_on_master:
        if args.device_id == 0:
            callbacks.append(ckpoint_cb)
    else:
        callbacks.append(ckpoint_cb)

    # train model
    model.train(config.epoch_size, train_dataset, callbacks=callbacks, dataset_sink_mode=True)
Example #17
def train():
    """Train function."""

    args.outputs_dir = params['save_model_path']

    if args.group_size > 1:
        init()
        context.set_auto_parallel_context(
            device_num=get_group_size(),
            parallel_mode=ParallelMode.DATA_PARALLEL,
            gradients_mean=True)
        args.outputs_dir = os.path.join(args.outputs_dir,
                                        "ckpt_{}/".format(str(get_rank())))
        args.rank = get_rank()
    else:
        args.outputs_dir = os.path.join(args.outputs_dir, "ckpt_0/")
        args.rank = 0

    if args.group_size > 1:
        args.max_epoch = params["max_epoch_train_NP"]
        args.loss_scale = params['loss_scale'] / 2
        args.lr_steps = list(map(int, params["lr_steps_NP"].split(',')))
        params['train_type'] = params['train_type_NP']
        params['optimizer'] = params['optimizer_NP']
        params['group_params'] = params['group_params_NP']
    else:
        args.max_epoch = params["max_epoch_train"]
        args.loss_scale = params['loss_scale']
        args.lr_steps = list(map(int, params["lr_steps"].split(',')))

    # create network
    print('start create network')
    criterion = openpose_loss()
    criterion.add_flags_recursive(fp32=True)
    network = OpenPoseNet(vggpath=params['vgg_path'],
                          vgg_with_bn=params['vgg_with_bn'])
    if params["load_pretrain"]:
        print("load pretrain model:", params["pretrained_model_path"])
        load_model(network, params["pretrained_model_path"])
    train_net = BuildTrainNetwork(network, criterion)

    # create dataset
    if os.path.exists(args.jsonpath_train) and os.path.exists(args.imgpath_train) \
            and os.path.exists(args.maskpath_train):
        print('start create dataset')
    else:
        print('Error: wrong data path')
        return 0

    num_worker = 20 if args.group_size > 1 else 48
    de_dataset_train = create_dataset(args.jsonpath_train,
                                      args.imgpath_train,
                                      args.maskpath_train,
                                      batch_size=params['batch_size'],
                                      rank=args.rank,
                                      group_size=args.group_size,
                                      num_worker=num_worker,
                                      multiprocessing=True,
                                      shuffle=True,
                                      repeat_num=1)
    steps_per_epoch = de_dataset_train.get_dataset_size()
    print("steps_per_epoch: ", steps_per_epoch)

    # lr scheduler
    lr_stage, lr_base, lr_vgg = get_lr(params['lr'] * args.group_size,
                                       params['lr_gamma'],
                                       steps_per_epoch,
                                       args.max_epoch,
                                       args.lr_steps,
                                       args.group_size,
                                       lr_type=params['lr_type'],
                                       warmup_epoch=params['warmup_epoch'])

    # optimizer
    if params['group_params']:
        vgg19_base_params = list(
            filter(lambda x: 'base.vgg_base' in x.name,
                   train_net.trainable_params()))
        base_params = list(
            filter(lambda x: 'base.conv' in x.name,
                   train_net.trainable_params()))
        stages_params = list(
            filter(lambda x: 'base' not in x.name,
                   train_net.trainable_params()))

        group_params = [{
            'params': vgg19_base_params,
            'lr': lr_vgg
        }, {
            'params': base_params,
            'lr': lr_base
        }, {
            'params': stages_params,
            'lr': lr_stage
        }]

        if params['optimizer'] == "Momentum":
            opt = Momentum(group_params, learning_rate=lr_stage, momentum=0.9)
        elif params['optimizer'] == "Adam":
            opt = Adam(group_params)
        else:
            raise ValueError("optimizer not support.")
    else:
        if params['optimizer'] == "Momentum":
            opt = Momentum(train_net.trainable_params(),
                           learning_rate=lr_stage,
                           momentum=0.9)
        elif params['optimizer'] == "Adam":
            opt = Adam(train_net.trainable_params(), learning_rate=lr_stage)
        else:
            raise ValueError("optimizer not support.")

    # callback
    config_ck = CheckpointConfig(
        save_checkpoint_steps=params['ckpt_interval'],
        keep_checkpoint_max=params["keep_checkpoint_max"])
    ckpoint_cb = ModelCheckpoint(prefix='{}'.format(args.rank),
                                 directory=args.outputs_dir,
                                 config=config_ck)
    time_cb = TimeMonitor(data_size=de_dataset_train.get_dataset_size())
    if args.rank == 0:
        callback_list = [MyLossMonitor(), time_cb, ckpoint_cb]
    else:
        callback_list = [MyLossMonitor(), time_cb]

    # train
    if params['train_type'] == 'clip_grad':
        train_net = TrainOneStepWithClipGradientCell(train_net,
                                                     opt,
                                                     sens=args.loss_scale)
        train_net.set_train()
        model = Model(train_net)
    elif params['train_type'] == 'fix_loss_scale':
        loss_scale_manager = FixedLossScaleManager(args.loss_scale,
                                                   drop_overflow_update=False)
        train_net.set_train()
        model = Model(train_net,
                      optimizer=opt,
                      loss_scale_manager=loss_scale_manager)
    else:
        raise ValueError("Type {} is not support.".format(
            params['train_type']))

    print("============== Starting Training ==============")
    model.train(args.max_epoch,
                de_dataset_train,
                callbacks=callback_list,
                dataset_sink_mode=False)
    return 0
Example #18
 # define net
 net = mobilenet_v2(num_classes=config_gpu.num_classes, platform="GPU")
 # define loss
 if config_gpu.label_smooth > 0:
     loss = CrossEntropyWithLabelSmooth(
         smooth_factor=config_gpu.label_smooth,
         num_classes=config_gpu.num_classes)
 else:
     loss = SoftmaxCrossEntropyWithLogits(is_grad=False,
                                          sparse=True,
                                          reduction='mean')
 # define dataset
 epoch_size = config_gpu.epoch_size
 dataset = create_dataset(dataset_path=args_opt.dataset_path,
                          do_train=True,
                          config=config_gpu,
                          platform=args_opt.platform,
                          repeat_num=1,
                          batch_size=config_gpu.batch_size)
 step_size = dataset.get_dataset_size()
 # resume
 if args_opt.pre_trained:
     param_dict = load_checkpoint(args_opt.pre_trained)
     load_param_into_net(net, param_dict)
 # define optimizer
 loss_scale = FixedLossScaleManager(config_gpu.loss_scale,
                                    drop_overflow_update=False)
 lr = Tensor(
     get_lr(global_step=0,
            lr_init=0,
            lr_end=0,
            lr_max=config_gpu.lr,
Example #19
def train_on_ascend():
    config = config_ascend_quant
    print("training args: {}".format(args_opt))
    print("training configure: {}".format(config))
    print("parallel args: rank_id {}, device_id {}, rank_size {}".format(
        rank_id, device_id, rank_size))
    epoch_size = config.epoch_size

    # distribute init
    if run_distribute:
        context.set_auto_parallel_context(
            device_num=rank_size,
            parallel_mode=ParallelMode.DATA_PARALLEL,
            gradients_mean=True)
        init()

    # define network
    network = mobilenetV2(num_classes=config.num_classes)
    # define loss
    if config.label_smooth > 0:
        loss = CrossEntropyWithLabelSmooth(smooth_factor=config.label_smooth,
                                           num_classes=config.num_classes)
    else:
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    # define dataset
    dataset = create_dataset(dataset_path=args_opt.dataset_path,
                             do_train=True,
                             config=config,
                             device_target=args_opt.device_target,
                             repeat_num=1,
                             batch_size=config.batch_size)
    step_size = dataset.get_dataset_size()
    # load pre trained ckpt
    if args_opt.pre_trained:
        param_dict = load_checkpoint(args_opt.pre_trained)
        load_nonquant_param_into_quant_net(network, param_dict)
    # convert fusion network to quantization aware network
    network = quant.convert_quant_network(network,
                                          bn_fold=True,
                                          per_channel=[True, False],
                                          symmetric=[True, False])

    # get learning rate
    lr = Tensor(
        get_lr(global_step=config.start_epoch * step_size,
               lr_init=0,
               lr_end=0,
               lr_max=config.lr,
               warmup_epochs=config.warmup_epochs,
               total_epochs=epoch_size + config.start_epoch,
               steps_per_epoch=step_size))

    # define optimization
    opt = nn.Momentum(
        filter(lambda x: x.requires_grad, network.get_parameters()), lr,
        config.momentum, config.weight_decay)
    # define model
    model = Model(network, loss_fn=loss, optimizer=opt)

    print("============== Starting Training ==============")
    callback = None
    if rank_id == 0:
        callback = [Monitor(lr_init=lr.asnumpy())]
        if config.save_checkpoint:
            config_ck = CheckpointConfig(
                save_checkpoint_steps=config.save_checkpoint_epochs *
                step_size,
                keep_checkpoint_max=config.keep_checkpoint_max)
            ckpt_cb = ModelCheckpoint(prefix="mobilenetV2",
                                      directory=config.save_checkpoint_path,
                                      config=config_ck)
            callback += [ckpt_cb]
    model.train(epoch_size, dataset, callbacks=callback)
    print("============== End Training ==============")
Example #20
                        help='path where the dataset is saved')
    parser.add_argument('--summary_path', type=str, default="./summary",
                        help='path where the summary to be saved')
    parser.add_argument('--dataset_sink_mode', type=bool, default=True, help='dataset_sink_mode is False or True')
    parser.add_argument('--device_id', type=int, default=0, help='device id of GPU. (Default: 0)')
    args = parser.parse_args()

    if args.device_target == "CPU":
        args.dataset_sink_mode = False

    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=args.device_id)

    network = Inceptionv3(cfg.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, 
                reduction="mean", smooth_factor=cfg.label_smoothing_eps)
    ds_train = create_dataset(args.data_path, cfg.batch_size, cfg.epoch_size)
    step_per_epoch = ds_train.get_dataset_size()
    total_step = step_per_epoch * cfg.epoch_size
    lr = exponential_decay_lr(learning_rate=cfg.lr_init, 
            decay_rate=cfg.lr_decay_rate, total_step=total_step, 
            step_per_epoch=step_per_epoch, decay_epoch=cfg.lr_decay_epoch)
    net_opt = nn.RMSProp(network.trainable_params(), learning_rate=lr, 
                decay=cfg.rmsprop_decay, momentum=cfg.rmsprop_momentum, 
                epsilon=cfg.rmsprop_epsilon)
    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
    config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps,
                                 keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_inceptionv3", config=config_ck)
    # summary_cb = SummaryCollector(args.summary_path,
    #                             collect_freq=1,
    #                             keep_default_action=False,
Example #21
            init()
        else:
            init("nccl")
        cfg.rank = get_rank()
        cfg.group_size = get_group_size()
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                          device_num=cfg.group_size,
                                          parameter_broadcast=True,
                                          mirror_mean=True)
    else:
        cfg.rank = 0
        cfg.group_size = 1

    # dataloader
    dataset = create_dataset(args_opt.dataset_path, True, cfg.rank,
                             cfg.group_size)
    batches_per_epoch = dataset.get_dataset_size()

    # network
    net = InceptionV3(num_classes=cfg.num_classes)

    # loss
    loss = CrossEntropy(smooth_factor=cfg.smooth_factor,
                        num_classes=cfg.num_classes,
                        factor=cfg.aux_factor)

    # learning rate schedule
    lr = get_lr(lr_init=cfg.lr_init,
                lr_end=cfg.lr_end,
                lr_max=cfg.lr_max,
                warmup_epochs=cfg.warmup_epochs,
Example #22
                    device_id=device_id)


def add_write(file_path, print_str):
    with open(file_path, 'a+', encoding='utf-8') as file_out:
        file_out.write(print_str + '\n')


if __name__ == '__main__':
    data_config = DataConfig()
    model_config = ModelConfig()
    train_config = TrainConfig()

    ds_eval = create_dataset(args_opt.dataset_path,
                             train_mode=False,
                             epochs=1,
                             batch_size=train_config.batch_size,
                             data_type=DataType(data_config.data_format))
    model_builder = ModelBuilder(ModelConfig, TrainConfig)
    train_net, eval_net = model_builder.get_train_eval_net()
    train_net.set_train()
    eval_net.set_train(False)
    auc_metric = AUCMetric()
    model = Model(train_net,
                  eval_network=eval_net,
                  metrics={"auc": auc_metric})
    param_dict = load_checkpoint(args_opt.checkpoint_path)
    load_param_into_net(eval_net, param_dict)

    start = time.time()
    res = model.eval(ds_eval)
Example #23
    # init distributed
    if args_opt.is_distributed:
        init("nccl")
        cfg.rank = get_rank()
        cfg.group_size = get_group_size()
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                          device_num=cfg.group_size,
                                          gradients_mean=True)
    else:
        cfg.rank = 0
        cfg.group_size = 1

    # dataloader
    dataset = create_dataset(args_opt.dataset_path, cfg, True)
    batches_per_epoch = dataset.get_dataset_size()

    # network
    net_with_loss = NASNetAMobileWithLoss(cfg)
    if args_opt.resume:
        ckpt = load_checkpoint(args_opt.resume)
        load_param_into_net(net_with_loss, ckpt)

    # learning rate schedule
    lr = get_lr(lr_init=cfg.lr_init,
                lr_decay_rate=cfg.lr_decay_rate,
                num_epoch_per_decay=cfg.num_epoch_per_decay,
                total_epochs=cfg.epoch_size,
                steps_per_epoch=batches_per_epoch,
                is_stair=True)
Example #24
            rank = get_rank()
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            device_num=device_num,
            parallel_mode=ParallelMode.DATA_PARALLEL,
            gradients_mean=True)
    else:
        device_num = 1
        rank = 0

    max_captcha_digits = cf.max_captcha_digits
    input_size = m.ceil(cf.captcha_height / 64) * 64 * 3
    # create dataset
    dataset = create_dataset(dataset_path=args_opt.dataset_path,
                             batch_size=cf.batch_size,
                             num_shards=device_num,
                             shard_id=rank,
                             device_target=args_opt.platform)
    step_size = dataset.get_dataset_size()
    # define lr
    lr_init = cf.learning_rate if not args_opt.run_distribute else cf.learning_rate * device_num * lr_scale
    lr = get_lr(cf.epoch_size, step_size, lr_init)
    loss = CTCLoss(max_sequence_length=cf.captcha_width,
                   max_label_length=max_captcha_digits,
                   batch_size=cf.batch_size)
    if args_opt.platform == 'Ascend':
        net = StackedRNN(input_size=input_size,
                         batch_size=cf.batch_size,
                         hidden_size=cf.hidden_size)
    else:
        net = StackedRNNForGPU(input_size=input_size,
Example #25
                        path where the trained ckpt file')
    parser.add_argument('--dataset_sink_mode',
                        type=bool,
                        default=False,
                        help='dataset_sink_mode is False or True')

    args = parser.parse_args()

    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args.device_target)

    network = LeNet5(cfg.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False,
                                                sparse=True,
                                                reduction="mean")
    repeat_size = cfg.epoch_size
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
    config_ck = CheckpointConfig(
        save_checkpoint_steps=cfg.save_checkpoint_steps,
        keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", config=config_ck)
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    print("============== Starting Testing ==============")
    param_dict = load_checkpoint(args.ckpt_path)
    load_param_into_net(network, param_dict)
    ds_eval = create_dataset(os.path.join(args.data_path, "test"),
                             cfg.batch_size, 1)
    acc = model.eval(ds_eval, dataset_sink_mode=args.dataset_sink_mode)
    print("============== {} ==============".format(acc))
Example #26
                        default="./ckpt",
                        help='if is test, must provide\
                        path where the trained ckpt file')
    parser.add_argument('--dataset_sink_mode',
                        type=bool,
                        default=True,
                        help='dataset_sink_mode is False or True')

    args = parser.parse_args()

    if args.device_target == "CPU":
        args.dataset_sink_mode = False

    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args.device_target)
    ds_train = create_dataset(os.path.join(args.data_path, "train"),
                              cfg.batch_size)

    network = LeNet5(cfg.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False,
                                                sparse=True,
                                                reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
    config_ck = CheckpointConfig(
        save_checkpoint_steps=cfg.save_checkpoint_steps,
        keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                                 directory=args.ckpt_path,
                                 config=config_ck)
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
Example #27
                parallel_mode=ParallelMode.DATA_PARALLEL,
                mirror_mean=True)
            init()
    elif device_target == "GPU":
        init("nccl")

        if device_num > 1:
            context.reset_auto_parallel_context()
            context.set_auto_parallel_context(
                device_num=device_num,
                parallel_mode=ParallelMode.DATA_PARALLEL,
                mirror_mean=True)
    else:
        raise ValueError("Unsupport platform.")

    dataset = create_dataset(cfg.data_path, 1)
    batch_num = dataset.get_dataset_size()

    net = GoogleNet(num_classes=cfg.num_classes)
    # Continue training if set pre_trained to be True
    if cfg.pre_trained:
        param_dict = load_checkpoint(cfg.checkpoint_path)
        load_param_into_net(net, param_dict)
    lr = lr_steps(0,
                  lr_max=cfg.lr_init,
                  total_epochs=cfg.epoch_size,
                  steps_per_epoch=batch_num)
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   Tensor(lr),
                   cfg.momentum,
                   weight_decay=cfg.weight_decay)
Example #28
                        save_graphs=False)
    # define network
    net = xception(class_num=config.class_num)
    if args_opt.device_target == "Ascend":
        net.to_float(mstype.float16)

    # define loss
    if not config.use_label_smooth:
        config.label_smooth_factor = 0.0
    loss = CrossEntropySmooth(smooth_factor=config.label_smooth_factor,
                              num_classes=config.class_num)

    # define dataset
    dataset = create_dataset(args_opt.dataset_path,
                             do_train=True,
                             batch_size=config.batch_size,
                             device_num=group_size,
                             rank=rank)
    step_size = dataset.get_dataset_size()

    # resume
    if args_opt.resume:
        ckpt = load_checkpoint(args_opt.resume)
        load_param_into_net(net, ckpt)

    # get learning rate
    loss_scale = FixedLossScaleManager(config.loss_scale,
                                       drop_overflow_update=False)
    lr = Tensor(
        get_lr(lr_init=config.lr_init,
               lr_end=config.lr_end,
Example #29
set_seed(1)

if __name__ == '__main__':
    args_opt = train_parse_args()
    args_opt.dataset_path = os.path.abspath(args_opt.dataset_path)
    config = set_config(args_opt)
    start = time.time()

    print(f"train args: {args_opt}\ncfg: {config}")

    #set context and device init
    context_device_init(config)

    # define network
    backbone_net, head_net, net = define_net(config, args_opt.is_training)
    dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, config=config)
    step_size = dataset.get_dataset_size()
    if args_opt.pretrain_ckpt:
        if args_opt.freeze_layer == "backbone":
            load_ckpt(backbone_net, args_opt.pretrain_ckpt, trainable=False)
            step_size = extract_features(backbone_net, args_opt.dataset_path, config)
        else:
            load_ckpt(net, args_opt.pretrain_ckpt)
    if step_size == 0:
        raise ValueError("The step_size of dataset is zero. Check if the images' count of train dataset is more \
            than batch_size in config.py")

    # Currently, only Ascend support switch precision.
    switch_precision(net, mstype.float16, config)

    # define loss
Example #30
        else:
            init()
            context.set_auto_parallel_context(
                device_num=get_group_size(),
                parallel_mode=ParallelMode.DATA_PARALLEL,
                gradients_mean=True)
            if args.net == "resnet50":
                context.set_auto_parallel_context(
                    all_reduce_fusion_config=[85, 160])
        ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(
            get_rank()) + "/"

    # create dataset
    dataset = create_dataset(dataset_path=os.path.join(args.dataset_path,
                                                       "train"),
                             do_train=True,
                             repeat_num=1,
                             batch_size=config.batch_size,
                             target=target)
    eval_dataset = create_dataset(dataset_path=os.path.join(
        args.dataset_path, "val"),
                                  do_train=False,
                                  batch_size=config.batch_size,
                                  target=target)
    step_size = dataset.get_dataset_size()

    # define net
    net = resnet(class_num=config.class_num)
    if args.parameter_server:
        net.set_param_ps()

    # init weight