def train_process(device_id, epoch_size, num_classes, device_num, batch_size):
    os.system("mkdir " + str(device_id))
    os.chdir(str(device_id))
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    context.set_context(device_id=device_id)
    context.set_context(enable_loop_sink=True)
    context.set_context(enable_mem_reuse=True)
    net = resnet50(batch_size, num_classes)
    loss = CrossEntropyLoss()
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   0.01, 0.9)

    model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})

    dataset = create_dataset(epoch_size, training=True, batch_size=batch_size)
    batch_num = dataset.get_dataset_size()
    config_ck = CheckpointConfig(save_checkpoint_steps=batch_num,
                                 keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix="train_resnet_cifar10_device_id_" +
                                 str(device_id),
                                 directory="./",
                                 config=config_ck)
    loss_cb = LossGet()
    model.train(epoch_size, dataset, callbacks=[ckpoint_cb, loss_cb])
Example #2
def test_batchnorm_batch_parallel():
    num_classes = 1001
    batch_size = 32
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2
    rank_size = 0

    predict = Tensor(np.ones([batch_size, 3, 224, 224]), dtype=ms.float32)
    label = Tensor(np.ones([batch_size]), dtype=ms.int32)

    dataset = DatasetLenet(predict, label, 2)
    net = batchnorm_net(num_classes)

    loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
    loss.softmax_cross_entropy.set_strategy(((dev_num, 1), (dev_num, 1)))
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   learning_rate, momentum)

    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(
        parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=dev_num)
    context.set_context(mode=context.GRAPH_MODE)
    model = Model(net, loss, opt)
    model.train(epoch_size, dataset, dataset_sink_mode=False)
Example #3
def loss_scale_manager_common(strategy1):
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2

    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL,
                                      device_num=8)
    predict = Tensor(np.ones([32, 128]), dtype=ms.float32)
    label = Tensor(np.ones([32]), dtype=ms.int32)
    dataset = Dataset(predict, label, 2)
    net = all_to_all_net(strategy1)

    loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
    loss.softmax_cross_entropy.set_strategy(((8, 1), (8, 1)))
    opt = Momentum(net.trainable_params(), learning_rate, momentum)
    scale_manager = DynamicLossScaleManager(32, 2, 2000)
    model = Model(net, loss, opt, loss_scale_manager=scale_manager)
    # When GE is not available, outputs = self._train_network(*next_element) returns the
    # input tensors, so model.train is expected to raise TypeError here.
    try:
        model.train(epoch_size, dataset, dataset_sink_mode=False)
    except TypeError:
        pass
    else:
        assert False
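DynamicLossScaleManager(32, 2, 2000) starts the loss scale at 32, divides it by the factor 2 whenever an overflow occurs, and multiplies it by 2 after 2000 consecutive overflow-free steps. A minimal standalone sketch of that behavior, assuming the public get_loss_scale/update_loss_scale API:

from mindspore.train.loss_scale_manager import DynamicLossScaleManager

scale_manager = DynamicLossScaleManager(init_loss_scale=32, scale_factor=2, scale_window=2000)
print(scale_manager.get_loss_scale())            # 32
scale_manager.update_loss_scale(overflow=True)   # overflow: the scale is divided by 2
print(scale_manager.get_loss_scale())            # 16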
Example #4
def test_nad():
    """UT for natural adversarial defense."""
    num_classes = 10
    batch_size = 32

    sparse = False
    context.set_context(mode=context.GRAPH_MODE)
    context.set_context(device_target='Ascend')

    # create test data
    inputs = np.random.rand(batch_size, 1, 32, 32).astype(np.float32)
    labels = np.random.randint(num_classes, size=batch_size).astype(np.int32)
    if not sparse:
        labels = np.eye(num_classes)[labels].astype(np.float32)

    net = Net()
    loss_fn = nn.SoftmaxCrossEntropyWithLogits(sparse=sparse)
    optimizer = Momentum(net.trainable_params(), 0.001, 0.9)

    # defense
    nad = NaturalAdversarialDefense(net, loss_fn=loss_fn, optimizer=optimizer)
    LOGGER.set_level(logging.DEBUG)
    LOGGER.debug(TAG, '---start natural adversarial defense--')
    loss = nad.defense(inputs, labels)
    LOGGER.debug(TAG, '---end natural adversarial defense--')
    assert np.any(loss >= 0.0)
Example #5
def train_common(net):
    batch_size = 32
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2
    device_num = 4
    context.reset_auto_parallel_context()
    auto_parallel_context().set_enable_all_reduce_fusion(
        enable_all_reduce_fusion=True)
    context.set_auto_parallel_context(
        parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL,
        device_num=device_num,
        parameter_broadcast=False)
    context.set_context(mode=context.GRAPH_MODE)

    predict = Tensor(np.ones([batch_size, 128]), dtype=ms.float32)
    label = Tensor(np.ones([batch_size]), dtype=ms.int32)
    dataset = Dataset(predict, label, 2)

    loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
    opt = Momentum(net.trainable_params(), learning_rate, momentum)
    model = Model(net, loss, opt)

    model.train(epoch_size, dataset, dataset_sink_mode=False)
    allreduce_fusion_dict = _executor._get_allreduce_fusion(
        model._train_network)

    print(allreduce_fusion_dict)
    return allreduce_fusion_dict
Example #6
def test_train_64k_8p(epoch_size=3,
                      batch_size=32,
                      num_classes=65536):  # 1048576, 131072, 32768, 8192
    dev_num = 8
    context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL,
                                      device_num=dev_num)
    cost_model_context.set_cost_model_context(costmodel_gamma=0.001,
                                              costmodel_beta=260.0)
    set_algo_parameters(elementwise_op_strategy_follow=True)
    resset_op_id()
    np.random.seed(6)
    input_np = np.ones([batch_size, 3, 224, 224]).astype(np.float32)
    label_np = np.zeros([batch_size]).astype(np.int32)
    for i in range(0, batch_size):
        label_np[i] = i % num_classes
    dataset = DatasetLenet(Tensor(input_np), Tensor(label_np), 1)
    net = resnet50(num_classes)
    loss = SoftmaxCrossEntropyExpand(sparse=True)
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   0.01, 0.9)
    model = Model(net, loss_fn=loss, optimizer=opt)
    model.train(5, dataset, dataset_sink_mode=False)
    strategies = _executor._get_strategy(model._train_network)
    for (k, v) in strategies.items():
        if re.search('Conv2D-op', k) is not None:
            assert v[0][0] == dev_num
        elif re.search('MatMul-op', k) is not None:
            assert v == [[1, 1], [dev_num, 1]]
        elif re.search('ReduceSum-op', k) is not None:
            assert v == [[1, dev_num]]
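The dict returned by _executor._get_strategy maps each operator instance in the compiled graph to the shard strategy chosen for each of its inputs; [[1, 1], [dev_num, 1]] above means the first MatMul input is replicated while the second is split into dev_num slices along its first axis. A hypothetical entry, for illustration only (the key and numbers are made up):

# {'Default/network/end_point/MatMul-op54': [[1, 1], [8, 1]]}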
Example #7
def train_process(q, device_id, epoch_size, num_classes, device_num,
                  batch_size, enable_hccl):
    os.system("mkdir " + str(device_id))
    os.chdir(str(device_id))
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        save_graphs=False)
    context.set_context(device_id=device_id)
    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH
    os.environ['RANK_ID'] = str(device_id)
    os.environ['RANK_SIZE'] = str(device_num)
    if enable_hccl:
        context.set_auto_parallel_context(
            device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL)
        auto_parallel_context().set_all_reduce_fusion_split_indices([140])
        init()
    net = resnet50(batch_size, num_classes)
    loss = CrossEntropyLoss()
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   0.01, 0.9)

    model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})

    dataset = create_dataset(1,
                             training=True,
                             batch_size=batch_size,
                             rank_id=device_id,
                             rank_size=device_num,
                             enable_hccl=enable_hccl)

    loss_cb = LossGet()
    model.train(epoch_size, dataset, callbacks=[loss_cb])
    q.put(loss_cb.get_loss())
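This variant of train_process takes a multiprocessing queue q and is meant to run one copy per device, each in its own process. A launcher sketch (hypothetical; it assumes the surrounding module defines MINDSPORE_HCCL_CONFIG_PATH and the other globals the worker reads):

from multiprocessing import Process, Queue

def launch_train(device_num=8, epoch_size=1, num_classes=10,
                 batch_size=32, enable_hccl=True):
    q = Queue()
    processes = [Process(target=train_process,
                         args=(q, device_id, epoch_size, num_classes,
                               device_num, batch_size, enable_hccl))
                 for device_id in range(device_num)]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
    # each worker pushes its final loss via q.put
    return [q.get() for _ in range(device_num)]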
Example #8
def test_deeplabv3_1p():
    start_time = time.time()
    epoch_size = 100
    args_opt = argparse.Namespace(base_size=513, crop_size=513, batch_size=2)
    args_opt.base_size = config.crop_size
    args_opt.crop_size = config.crop_size
    args_opt.batch_size = config.batch_size
    train_dataset = create_dataset(args_opt, data_url, 1, config.batch_size,
                                   usage="eval")
    dataset_size = train_dataset.get_dataset_size()
    callback = LossCallBack(dataset_size)
    net = deeplabv3_resnet50(config.seg_num_classes, [config.batch_size, 3, args_opt.crop_size, args_opt.crop_size],
                             infer_scale_sizes=config.eval_scales, atrous_rates=config.atrous_rates,
                             decoder_output_stride=config.decoder_output_stride, output_stride=config.output_stride,
                             fine_tune_batch_norm=config.fine_tune_batch_norm, image_pyramid=config.image_pyramid)
    net.set_train()
    model_fine_tune(net, 'layer')
    loss = OhemLoss(config.seg_num_classes, config.ignore_label)
    opt = Momentum(filter(lambda x: 'beta' not in x.name and 'gamma' not in x.name
                          and 'depth' not in x.name and 'bias' not in x.name,
                          net.trainable_params()),
                   learning_rate=config.learning_rate,
                   momentum=config.momentum,
                   weight_decay=config.weight_decay)
    model = Model(net, loss, opt)
    model.train(epoch_size, train_dataset, callback)
    print(time.time() - start_time)
    print("expect loss: ", callback.loss)
    print("expect time: ", callback.time)
    expect_loss = 0.92
    expect_time = 40
    assert callback.loss.asnumpy() <= expect_loss
    assert callback.time <= expect_time
Example #9
def test_pynative_resnet50():
    context.set_context(mode=context.PYNATIVE_MODE, device_target="Ascend")

    batch_size = 32
    num_classes = 10
    net = resnet50(batch_size, num_classes)
    criterion = CrossEntropyLoss()
    optimizer = Momentum(learning_rate=0.01, momentum=0.9,
                         params=filter(lambda x: x.requires_grad, net.get_parameters()))

    net_with_criterion = WithLossCell(net, criterion)
    net_with_criterion.set_grad()
    train_network = GradWrap(net_with_criterion)
    train_network.set_train()

    step = 0
    max_step = 21
    exceed_num = 0
    data_set = create_dataset(repeat_num=1, training=True, batch_size=batch_size)
    for element in data_set.create_dict_iterator(num_epochs=1):
        step = step + 1
        if step > max_step:
            break
        start_time = time.time()
        input_data = element["image"]
        input_label = element["label"]
        loss_output = net_with_criterion(input_data, input_label)
        grads = train_network(input_data, input_label)
        optimizer(grads)
        end_time = time.time()
        cost_time = end_time - start_time
        print("======step: ", step, " loss: ", loss_output.asnumpy(), " cost time: ", cost_time)
        if step > 1 and cost_time > 0.25:
            exceed_num = exceed_num + 1
    assert exceed_num < 20
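WithLossCell and GradWrap are helpers defined elsewhere in this test module. A minimal GradWrap consistent with how it is called above (it returns the gradients of the wrapped loss cell with respect to the trainable parameters) might look like this sketch:

import mindspore.nn as nn
import mindspore.ops as ops
from mindspore import ParameterTuple

class GradWrap(nn.Cell):
    # Returns gradients of `network` w.r.t. its trainable parameters.
    def __init__(self, network):
        super(GradWrap, self).__init__(auto_prefix=False)
        self.network = network
        self.weights = ParameterTuple(network.trainable_params())
        self.grad = ops.GradOperation(get_by_list=True)

    def construct(self, x, label):
        return self.grad(self.network, self.weights)(x, label)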
Example #10
def test(cloud_args=None):
    """test"""
    args = parse_args(cloud_args)
    context.set_context(mode=context.GRAPH_MODE,
                        enable_auto_mixed_precision=True,
                        device_target=args.device_target,
                        save_graphs=False)
    if os.getenv('DEVICE_ID', "not_set").isdigit():
        context.set_context(device_id=int(os.getenv('DEVICE_ID')))

    args.outputs_dir = os.path.join(
        args.log_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))

    args.logger = LogUtil.get_instance()
    args.logger.set_level(20)

    net = vgg16(num_classes=args.num_classes, args=args)
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   0.01,
                   args.momentum,
                   weight_decay=args.weight_decay)
    loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})

    param_dict = load_checkpoint(args.pre_trained)
    load_param_into_net(net, param_dict)
    net.set_train(False)

    dataset_test = vgg_create_dataset100(args.data_path,
                                         args.image_size,
                                         args.per_batch_size,
                                         training=False)
    res = model.eval(dataset_test)
    print("result: ", res)
Example #11
def test_ead():
    """UT for ensemble adversarial defense."""
    num_classes = 10
    batch_size = 64

    sparse = False
    context.set_context(mode=context.GRAPH_MODE)
    context.set_context(device_target='Ascend')

    # create test data
    inputs = np.random.rand(batch_size, 1, 32, 32).astype(np.float32)
    labels = np.random.randint(num_classes, size=batch_size).astype(np.int32)
    if not sparse:
        labels = np.eye(num_classes)[labels].astype(np.float32)

    net = Net()
    loss_fn = nn.SoftmaxCrossEntropyWithLogits(sparse=sparse)
    optimizer = Momentum(net.trainable_params(), 0.001, 0.9)

    net = Net()
    fgsm = FastGradientSignMethod(net, loss_fn=loss_fn)
    pgd = ProjectedGradientDescent(net, loss_fn=loss_fn)
    ead = EnsembleAdversarialDefense(net, [fgsm, pgd],
                                     loss_fn=loss_fn,
                                     optimizer=optimizer)
    LOGGER.set_level(logging.DEBUG)
    LOGGER.debug(TAG, '---start ensemble adversarial defense--')
    loss = ead.defense(inputs, labels)
    LOGGER.debug(TAG, '---end ensemble adversarial defense--')
    assert np.any(loss >= 0.0)
Example #12
def test_resnet_model_parallel():
    num_classes = 1024
    batch_size = 32
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2

    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL,
                                      device_num=dev_num, global_rank=0)
    context.set_context(mode=context.GRAPH_MODE)
    predict = Tensor(np.ones([batch_size, 64, 112, 112]), dtype=ms.float32)
    label = Tensor(np.ones([batch_size]), dtype=ms.int32)

    dataset = DatasetLenet(predict, label, 2)
    net = resnet_model_parallel_net(num_classes)

    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    loss.softmax_cross_entropy.shard(((dev_num, 1), (dev_num, 1)))
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   learning_rate, momentum)

    model = Model(net, loss, opt)
    model.train(epoch_size, dataset, dataset_sink_mode=False)
Example #13
    def mix_parallel_matmul_trains(self):
        parallel_callback = ModelCallback()
        matmul_stra = ((device_num, 1), (1, 1))
        reduce_max_stra = ((1, device_num),)
        sub_stra = ((device_num, 1), (device_num, 1))
        exp_stra = ((1, device_num),)
        reduce_sum_stra = ((1, device_num),)
        div_stra = ((1, device_num), (1, 1))
        log_stra = ((1, device_num),)
        mul_stra = ((1, device_num), (1, device_num))
        sum_cross_entropy_stra = ((1, device_num),)
        mul2_stra = ((), (device_num,))
        reduce_mean_stra = ((device_num,),)
        onehot_stra = ((1, device_num), (), ())
        loss_stra_list = [
            exp_stra, reduce_sum_stra, onehot_stra, div_stra, log_stra,
            sum_cross_entropy_stra, mul_stra, mul2_stra, reduce_mean_stra,
            reduce_max_stra, sub_stra
        ]
        context.set_auto_parallel_context(parallel_mode="auto_parallel")
        net = MatmulNet(matmul_stra=matmul_stra, loss_stra_list=loss_stra_list)
        optimizer = Momentum(net.trainable_params(),
                             learning_rate=0.1,
                             momentum=0.9)
        model = Model(net, optimizer=optimizer)
        epoch_size = 6
        dataset = Dataset(self.input_part, self.label_part)
        model.train(epoch_size,
                    dataset,
                    callbacks=parallel_callback,
                    dataset_sink_mode=False)
        loss_value = np.array(parallel_callback.loss_list)
        return loss_value
Example #14
def test_train_4k_8p_gpu(batch_size=32, num_classes=4096):
    dev_num = 8
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL, device_num=dev_num)
    set_algo_parameters(elementwise_op_strategy_follow=True)
    resset_op_id()
    np.random.seed(6)
    input_np = np.ones([batch_size, 3, 224, 224]).astype(np.float32)
    label_np = np.zeros([batch_size]).astype(np.int32)
    for i in range(0, batch_size):
        label_np[i] = i % num_classes
    dataset = DatasetLenet(Tensor(input_np), Tensor(label_np), 1)
    net = resnet50(num_classes)
    loss = SoftmaxCrossEntropyExpand(sparse=True)
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), 0.01, 0.9)
    model = Model(net, loss_fn=loss, optimizer=opt)
    model.train(5, dataset, dataset_sink_mode=False)
    strategies = _executor._get_shard_strategy(model._train_network)
    for (k, v) in strategies.items():
        if re.search('Conv2D-op', k) is not None:
            assert v[0][0] == dev_num
        elif re.search('MatMul-op', k) is not None:
            assert v == [[dev_num, 1], [1, 1]]
        elif re.search('ReduceSum-op', k) is not None:
            assert v == [[dev_num, 1]]
Example #15
def test_multi_grads():
    context.set_context(mode=context.PYNATIVE_MODE, device_target="Ascend")
    sparse = False
    inputs_np = np.random.rand(32, 1, 32, 32).astype(np.float32)
    labels_np = np.random.randint(10, size=32).astype(np.int32)
    inputs_np_2 = np.random.rand(64, 1, 32, 32).astype(np.float32)
    labels_np_2 = np.random.randint(10, size=64).astype(np.int32)
    if not sparse:
        labels_np = np.eye(10)[labels_np].astype(np.float32)
        labels_np_2 = np.eye(10)[labels_np_2].astype(np.float32)

    net = LeNet()

    # grad operation
    loss_fn = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=sparse)
    with_loss_cell = WithLossCell(net, loss_fn)
    grad_all = GradWrapWithLoss(with_loss_cell)
    grad_out = grad_all(Tensor(inputs_np), Tensor(labels_np)).asnumpy()
    assert np.any(grad_out != 0), 'grad result can not be all zeros'

    # train-one-step operation
    loss_fn = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=sparse)
    optimizer = Momentum(
        filter(lambda x: x.requires_grad, net.get_parameters()), 0.01, 0.9)
    loss_net = WithLossCell(net, loss_fn)
    train_net = TrainOneStepCell(loss_net, optimizer)
    train_net.set_train()
    train_net(Tensor(inputs_np_2), Tensor(labels_np_2))
Example #16
def bn_common(parallel_mode, train_flag, strategy_loss=None):
    context.set_context(mode=context.GRAPH_MODE)
    context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                      device_num=8)
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2
    rank_size = 8

    predict = Tensor(np.ones([32, 512]), dtype=ms.float32)
    label = Tensor(np.ones([32]), dtype=ms.int32)
    dataset = Dataset(predict, label, 2)
    net = bn_net()

    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    loss.softmax_cross_entropy.shard(strategy_loss)
    opt = Momentum(net.trainable_params(), learning_rate, momentum, 0.0001,
                   1024 * rank_size)

    if not train_flag:
        net = WithLossCell(net, loss)
        net.set_train()

    if parallel_mode == ParallelMode.DATA_PARALLEL:
        context.set_auto_parallel_context(parameter_broadcast=True)
    model = Model(net, loss, opt)
    if train_flag:
        model.train(epoch_size, dataset, dataset_sink_mode=False)
    else:
        model._predict(predict, label)
Example #17
def test_ad():
    """UT for adversarial defense."""
    num_classes = 10
    batch_size = 32

    sparse = False
    context.set_context(mode=context.GRAPH_MODE)
    context.set_context(device_target='Ascend')

    # create test data
    inputs = np.random.rand(batch_size, 1, 32, 32).astype(np.float32)
    labels = np.random.randint(num_classes, size=batch_size).astype(np.int32)
    if not sparse:
        labels = np.eye(num_classes)[labels].astype(np.float32)

    net = Net()
    loss_fn = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=sparse)
    optimizer = Momentum(learning_rate=Tensor(np.array([0.001], np.float32)),
                         momentum=0.9,
                         params=net.trainable_params())

    ad_defense = AdversarialDefense(net, loss_fn=loss_fn, optimizer=optimizer)
    LOGGER.set_level(logging.DEBUG)
    LOGGER.debug(TAG, '--start adversarial defense--')
    loss = ad_defense.defense(inputs, labels)
    LOGGER.debug(TAG, '--end adversarial defense--')
    assert np.any(loss >= 0.0)
Example #18
def test_auto_parallel_arithmetic_model():
    class NetOneHot(nn.Cell):
        def __init__(self):
            super().__init__()
            self.matmul = P.MatMul()
            self.one_hot = P.OneHot().shard(((1, 8), (), ()))
            self.on_value = Tensor(1.0, ms.float32)
            self.off_value = Tensor(0.0, ms.float32)
            self.matmul2 = P.MatMul()
            self.w = Parameter(Tensor(np.zeros([32, 64]).astype(np.float32)),
                               "weight",
                               requires_grad=True)

        def construct(self, x, b):
            out = self.matmul(x, self.w)
            out1 = self.one_hot(b, 64, self.on_value, self.off_value)
            out2 = self.matmul2(out, out1)
            return out2

    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(device_num=8,
                                      global_rank=0,
                                      parallel_mode=ParallelMode.AUTO_PARALLEL)
    net = NetOneHot()

    x = Tensor(np.ones([8, 32]), dtype=ms.float32)
    b = Tensor(np.ones([8]), dtype=ms.int32)
    dataset = Dataset(x, b, 2)

    opt = Momentum(net.trainable_params(), 0.1, 0.9)
    model = Model(net, optimizer=opt)

    model.train(2, dataset, dataset_sink_mode=False)
Example #19
def resnet50_train(args_opt):
    device_id = 0
    device_num = 1
    epoch_size = args_opt.epoch_size
    batch_size = 32
    class_num = 10
    loss_scale_num = 1024
    local_data_path = '/home/share/dataset/cifar-10-batches-bin/' # your cifar10 path

    # set graph mode and parallel mode
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False)
    context.set_context(device_id=device_id)
    if device_num > 1:
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
        init()
        local_data_path = os.path.join(local_data_path, str(device_id))

    # data download
    print('Download data.')
    mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path)

    # create dataset
    print('Create train and evaluate dataset.')
    train_dataset = create_dataset(dataset_path=local_data_path, do_train=True,
                                   repeat_num=1, batch_size=batch_size)
    eval_dataset = create_dataset(dataset_path=local_data_path, do_train=False,
                                   repeat_num=1, batch_size=batch_size)
    train_step_size = train_dataset.get_dataset_size()
    print('Create dataset success.')

    # create model
    net = resnet50(class_num=class_num)
    # reduction='mean' averages the loss over the batch
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    lr = Tensor(get_lr(global_step=0, total_epochs=epoch_size, steps_per_epoch=train_step_size))
    opt = Momentum(net.trainable_params(), lr, momentum=0.9, weight_decay=1e-4, loss_scale=loss_scale_num)
    loss_scale = FixedLossScaleManager(loss_scale_num, False)

    # amp_level="O2" means that the hybrid precision of O2 mode is used for training
    # the whole network except that batchnoram will be cast into float16 format and dynamic loss scale will be used
    # 'keep_batchnorm_fp32 = False' means that use the float16 format
    model = Model(net, amp_level="O2", keep_batchnorm_fp32=False, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'})

    # define performance callback to show ips and loss callback to show loss for every epoch
    performance_cb = PerformanceCallback(batch_size)
    loss_cb = LossMonitor()
    cb = [performance_cb, loss_cb]

    print(f'Start run training, total epoch: {epoch_size}.')
    model.train(epoch_size, train_dataset, callbacks=cb)
    if device_num == 1 or device_id == 0:
        print('=================================Start run evaluation.=================================')
        output = model.eval(eval_dataset)
        print(f'Evaluation result: {output}.')
Example #20
def test_exec_save_checkpoint():
    net = Net()
    loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
    opt = Momentum(net.trainable_params(), 0.0, 0.9, 0.0001, 1024)

    loss_net = WithLossCell(net, loss)
    train_network = TrainOneStepCell(loss_net, opt)
    _exec_save_checkpoint(train_network, ckpoint_file_name="./new_ckpt.ckpt")

    load_checkpoint("new_ckpt.ckpt")
Example #21
    def single_matmul_trains(self):
        single_callback = ModelCallback()
        net = MatmulNet()
        optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
        model = Model(net, optimizer=optimizer)
        epoch_size = 6
        dataset = Dataset(self.input_full, self.label_full)
        model.train(epoch_size, dataset, callbacks=single_callback, dataset_sink_mode=False)
        loss_value = np.array(single_callback.loss_list)
        return loss_value
Example #22
    def data_parallel_matmul_trains(self):
        parallel_callback = ModelCallback()
        context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
        net = MatmulNet()
        optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
        model = Model(net, optimizer=optimizer)
        epoch_size = 6
        dataset = Dataset(self.input_part, self.label_part)
        model.train(epoch_size, dataset, callbacks=parallel_callback, dataset_sink_mode=False)
        loss_value = np.array(parallel_callback.loss_list)
        return loss_value
Example #23
    def __init__(self):
        context.set_context(reserve_class_name_in_scope=False)
        net = resnet50(batch_size, num_classes)
        ls = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
        opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                       0.01, 0.9)
        model = Model(net, loss_fn=ls, optimizer=opt, metrics={'acc'})
        self.model = model
        self.model.train(1,
                         create_dataset(list(range(32))),
                         dataset_sink_mode=False)
Example #24
def test_save_checkpoint_for_network():
    """ test save_checkpoint for network"""
    net = Net()
    loss = SoftmaxCrossEntropyWithLogits(sparse=True)
    opt = Momentum(net.trainable_params(), 0.0, 0.9, 0.0001, 1024)

    loss_net = WithLossCell(net, loss)
    train_network = TrainOneStepCell(loss_net, opt)
    save_checkpoint(train_network, ckpt_file_name="./new_ckpt.ckpt")

    load_checkpoint("new_ckpt.ckpt")
Example #25
def resnet50_train(args_opt):
    epoch_size = args_opt.epoch_size
    batch_size = cfg.batch_size
    class_num = cfg.class_num
    loss_scale_num = cfg.loss_scale
    local_data_path = '/cache/data'
    local_ckpt_path = '/cache/ckpt_file'

    # set graph mode and parallel mode
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False)

    # data download
    print('Download data.')
    mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path)

    # create dataset
    print('Create train and evaluate dataset.')
    train_dataset = create_dataset(dataset_path=local_data_path, do_train=True,
                                   repeat_num=epoch_size, batch_size=batch_size)
    train_step_size = train_dataset.get_dataset_size()
    print('Create dataset success.')

    # create model
    net = resnet50(class_num=class_num)
    # reduction='mean' averages the loss over the batch
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    lr = Tensor(get_lr(global_step=0, total_epochs=epoch_size, steps_per_epoch=train_step_size))
    opt = Momentum(net.trainable_params(), lr, momentum=0.9, weight_decay=1e-4, loss_scale=loss_scale_num)
    loss_scale = FixedLossScaleManager(loss_scale_num, False)

    # amp_level="O2" means that the hybrid precision of O2 mode is used for training
    # the whole network except that batchnorm will be cast into float16 format and dynamic loss scale will be used
    # 'keep_batchnorm_fp32 = False' means that use the float16 format
    model = Model(net, amp_level="O2", keep_batchnorm_fp32=False, loss_fn=loss,
                  optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'})

    # define performance callback to show ips and loss callback to show loss for every epoch
    time_cb = TimeMonitor(data_size=train_step_size)
    performance_cb = PerformanceCallback(batch_size)
    loss_cb = LossMonitor()
    cb = [time_cb, performance_cb, loss_cb]
    config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_epochs * train_step_size,
                                 keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpt_cb = ModelCheckpoint(prefix="resnet", directory=local_ckpt_path, config=config_ck)
    cb += [ckpt_cb]

    print(f'Start run training, total epoch: {epoch_size}.')
    model.train(epoch_size, train_dataset, callbacks=cb)

    # upload checkpoint files
    print('Upload checkpoint.')
    mox.file.copy_parallel(src_url=local_ckpt_path, dst_url=args_opt.train_url)
Example #26
def loss_scale_manager_sens(strategy1, sens):
    learning_rate = 0.1
    momentum = 0.9
    device_num = 8
    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(
        parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=device_num)
    predict = Tensor(np.ones([32 * device_num, 128]), dtype=ms.float32)
    net = all_to_all_net(strategy1)
    opt = Momentum(net.trainable_params(), learning_rate, momentum)
    train_net = TrainOneStepCell(net, opt)
    train_net.set_train()
    train_net(predict, sens)
Example #27
def resnet50_train(args_opt):
    epoch_size = args_opt.epoch_size
    batch_size = 32
    class_num = 10
    loss_scale_num = 1024
    local_data_path = '/cache/data'

    # set graph mode and parallel mode
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False)
    context.set_context(enable_task_sink=True, device_id=device_id)
    context.set_context(enable_loop_sink=True)
    context.set_context(enable_mem_reuse=True)
    if device_num > 1:
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          mirror_mean=True)
        local_data_path = os.path.join(local_data_path, str(device_id))

    # data download
    print('Download data.')
    mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path)

    # create dataset
    print('Create train and evaluate dataset.')
    train_dataset = create_dataset(dataset_path=local_data_path, do_train=True,
                                   repeat_num=epoch_size, batch_size=batch_size)
    eval_dataset = create_dataset(dataset_path=local_data_path, do_train=False,
                                   repeat_num=1, batch_size=batch_size)
    train_step_size = train_dataset.get_dataset_size()
    print('Create dataset success.')

    # create model
    net = resnet50(class_num=class_num)
    loss = SoftmaxCrossEntropyWithLogits(sparse=True)
    lr = Tensor(get_lr(global_step=0, total_epochs=epoch_size, steps_per_epoch=train_step_size))
    opt = Momentum(net.trainable_params(), lr, momentum=0.9, weight_decay=1e-4, loss_scale=loss_scale_num)
    loss_scale = FixedLossScaleManager(loss_scale_num, False)

    model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'})

    # define performance callback to show ips and loss callback to show loss for every epoch
    performance_cb = PerformanceCallback(batch_size)
    loss_cb = LossMonitor()
    cb = [performance_cb, loss_cb]

    print(f'Start run training, total epoch: {epoch_size}.')
    model.train(epoch_size, train_dataset, callbacks=cb)
    if device_num == 1 or device_id == 0:
        print('Start run evaluation.')
        output = model.eval(eval_dataset)
        print(f'Evaluation result: {output}.')
Example #28
    def __init__(self, network, loss_fn=None, optimizer=None):
        super(AdversarialDefense, self).__init__(network)
        network = check_model('network', network, Cell)
        if loss_fn is None:
            loss_fn = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)

        if optimizer is None:
            optimizer = Momentum(params=network.trainable_params(),
                                 learning_rate=0.01,
                                 momentum=0.9)

        loss_net = WithLossCell(network, loss_fn)
        self._train_net = TrainOneStepCell(loss_net, optimizer)
        self._train_net.set_train()
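The defense method exercised by the UTs above is not part of this snippet; a plausible sketch, assuming it simply runs one training step on the given batch, is:

    def defense(self, inputs, labels):
        # Hypothetical sketch: one optimization step; returns the loss as a NumPy array.
        loss = self._train_net(Tensor(inputs), Tensor(labels))
        return loss.asnumpy()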
Example #29
def test_train_cifar(num_classes=10, epoch_size=10):
    context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL,
                                      mirror_mean=True)
    loss_cb = LossMonitor()
    dataset = create_dataset(epoch_size)
    net = resnet50(32, num_classes)
    loss = SoftmaxCrossEntropyExpand(sparse=True)
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   0.01, 0.9)
    model = Model(net, loss_fn=loss, optimizer=opt)
    model.train(epoch_size,
                dataset,
                callbacks=[loss_cb],
                dataset_sink_mode=False)
Example #30
def test_loss_scale2():
    context.set_context(mode=context.GRAPH_MODE, save_graphs=True)
    context.set_auto_parallel_context(
        parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=8)
    predict = Tensor(np.ones([64, 64]), dtype=ms.float32)
    label = Tensor(np.ones([64]), dtype=ms.int32)
    dataset = DatasetLenet(predict, label)
    net = Net2()
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   0.01, 0.9)
    net = nn.TrainOneStepWithLossScaleCell(net, opt, update_cell)
    model = Model(network=net)
    model.train(2, dataset, dataset_sink_mode=False)
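update_cell is not defined in this snippet; it is presumably the update cell of a loss-scale manager created at module level, for example (an assumption, using the standard get_update_cell API):

from mindspore.train.loss_scale_manager import DynamicLossScaleManager

# Assumed definition of the `update_cell` passed to TrainOneStepWithLossScaleCell above
update_cell = DynamicLossScaleManager().get_update_cell()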