Example #1
import numpy as np
import nnabla as nn
import nnabla.functions as F


def test_clip_by_norm_forward(seed, shape, clip_norm, axis):
    rng = np.random.RandomState(seed)
    x_data = rng.randn(*shape)
    x = nn.Variable.from_numpy_array(x_data)
    with nn.auto_forward(True):
        y = F.clip_by_norm(x, clip_norm, axis)
    y_ref = ref_clip_by_norm(x_data, clip_norm, axis=axis)
    assert np.allclose(y.d, y_ref)
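
The helper ref_clip_by_norm is defined elsewhere in the test file. A minimal NumPy sketch of such a reference, assuming the usual clip-by-norm semantics (rescale x so that its L2 norm along axis becomes clip_norm whenever that norm exceeds clip_norm, otherwise pass x through unchanged):

import numpy as np

def ref_clip_by_norm(x, clip_norm, axis=None):
    # L2 norm along the given axis, kept broadcastable against x.
    norm = np.sqrt(np.sum(x ** 2, axis=axis, keepdims=True))
    # Rescale only entries whose norm exceeds the threshold; the maximum()
    # guard avoids a division-by-zero warning for all-zero vectors.
    scale = np.where(norm > clip_norm, clip_norm / np.maximum(norm, 1e-12), 1.0)
    return x * scale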
Example #2
import pytest
import nnabla as nn
import nnabla.functions as F
from nnabla.testing import assert_allclose


def execute_clip_by_norm(x, x_data, clip_norm, clip_norm_value, axis):
    if isinstance(clip_norm, (nn.Variable, nn.NdArray)):
        if clip_norm_value <= 0:
            # A non-positive clip_norm is not validated when it is given as a
            # Variable/NdArray, so this case is skipped.
            pytest.skip()
        else:
            with nn.auto_forward(True):
                y = F.clip_by_norm(x, clip_norm, axis)
            y_ref = ref_clip_by_norm(x_data, clip_norm_value, axis=axis)
            assert_allclose(y.d, y_ref)
    else:
        if clip_norm_value > 0:
            with nn.auto_forward(True):
                y = F.clip_by_norm(x, clip_norm, axis)
            y_ref = ref_clip_by_norm(x_data, clip_norm_value, axis=axis)
            assert_allclose(y.d, y_ref)
        else:
            # A non-positive scalar clip_norm is rejected at call time.
            with pytest.raises(ValueError):
                y = F.clip_by_norm(x, clip_norm, axis)
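
A hedged sketch of how such a helper is typically driven from a pytest parametrization; the parameter values below are illustrative, not taken from the original test file:

import numpy as np
import pytest
import nnabla as nn

@pytest.mark.parametrize("seed", [313])
@pytest.mark.parametrize("shape", [(4, 8)])
@pytest.mark.parametrize("clip_norm_value", [2.5, -1.0])
@pytest.mark.parametrize("axis", [None, 1])
def test_clip_by_norm_scalar(seed, shape, clip_norm_value, axis):
    rng = np.random.RandomState(seed)
    x_data = rng.randn(*shape)
    x = nn.Variable.from_numpy_array(x_data)
    # Scalar path: positive values are checked against the reference,
    # non-positive values are expected to raise ValueError.
    execute_clip_by_norm(x, x_data, clip_norm_value, clip_norm_value, axis)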
Example #3
def CNN_run(args, ops, alphas_dict):
    """
        Based on the given model architecture,
        construct CNN and execute training.
        input:
            args: arguments set by user.
            ops: operations used in the network.
            arch_dict: a dictionary containing architecture information.
    """

    data_iterator = data_iterator_cifar10
    all_data = data_iterator(args.batch_size, True)
    tdata = all_data.slice(rng=None, slice_start=0, slice_end=25000)
    vdata = all_data.slice(rng=None, slice_start=25000, slice_end=50000)

    # CIFAR10 statistics, mean and standard deviation
    CIFAR_MEAN = np.reshape([0.49139968, 0.48215827, 0.44653124], (1, 3, 1, 1))
    CIFAR_STD = np.reshape([0.24703233, 0.24348505, 0.26158768], (1, 3, 1, 1))

    channels, image_height, image_width = 3, 32, 32
    batch_size = args.batch_size
    initial_model_lr = args.model_lr

    one_epoch = tdata.size // batch_size
    max_iter = args.epoch * one_epoch

    # Create monitor.
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=100)
    monitor_err = MonitorSeries("Training error", monitor, interval=100)
    monitor_vloss = MonitorSeries("Validation loss", monitor, interval=100)
    monitor_verr = MonitorSeries("Validation error", monitor, interval=100)

    # prepare variables and graph used for training
    image_train = nn.Variable(
        (batch_size, channels, image_height, image_width))
    label_train = nn.Variable((batch_size, 1))
    input_image_train = {"image": image_train, "label": label_train}
    pred_train = construct_networks(args, ops, image_train, test=False)
    loss_train = loss_function(pred_train, label_train)

    # prepare solvers for model parameters
    model_params_dict = \
        {k: v for k, v in nn.get_parameters().items() if "alpha_" not in k}
    solver_model = S.Momentum(initial_model_lr)
    solver_model.set_parameters(
        {
            k: v
            for k, v in nn.get_parameters().items()
            if k in model_params_dict.keys()
        },
        reset=False,
        retain_state=True)

    # prepare solvers for architecture parameters
    solver_archs = S.Adam(alpha=args.arch_lr, beta1=0.5, beta2=0.999)
    solver_archs.set_parameters(
        {
            k: v
            for k, v in nn.get_parameters().items() if k in alphas_dict.keys()
        },
        reset=False,
        retain_state=True)

    # Training-loop
    for i in range(max_iter):

        # Update Model Parameters.

        if args.second_order:
            # store the weights before update.
            original_weights = {
                k: nn.Variable(v.shape, need_grad=True).apply(
                    data=nn.NdArray(v.shape).copy_from(v.data))
                for k, v in nn.get_parameters().items() if "alpha_" not in k
            }

            # refuge for accumulating architecture (alpha) gradients
            accumulated_gradient = \
                {k: nn.Variable(v.shape).apply(d=0)
                 for k, v in alphas_dict.items()}

        image, label = tdata.next()
        image = image / 255.0
        image = (image - CIFAR_MEAN) / CIFAR_STD
        input_image_train["image"].d = image
        input_image_train["label"].d = label
        loss_train.forward()

        e = categorical_error(pred_train.d, input_image_train["label"].d)
        monitor_loss.add(i, loss_train.d.copy())
        monitor_err.add(i, e)

        if args.lr_control_model:
            new_lr = learning_rate_scheduler(i, max_iter, initial_model_lr, 0)
            solver_model.set_learning_rate(new_lr)

        solver_model.zero_grad()
        loss_train.backward(clear_buffer=True)

        if args.with_grad_clip_model:
            for k, v in model_params_dict.items():
                v.grad.copy_from(
                    F.clip_by_norm(v.grad, args.grad_clip_value_model))

        solver_model.weight_decay(args.weight_decay_model)
        solver_model.update()  # weights update ( w -> w')

        if args.second_order:
            updated_weights = {
                k: nn.Variable(v.shape, need_grad=True).apply(
                    data=nn.NdArray(v.shape).copy_from(v.data))
                for k, v in nn.get_parameters().items() if "alpha_" not in k
            }

        # Update Architecture Parameters.

        v_image, v_label = vdata.next()
        v_image = v_image / 255.0
        v_image = (v_image - CIFAR_MEAN) / CIFAR_STD
        input_image_train["image"].d = v_image
        input_image_train["label"].d = v_label
        # compute Loss_on_valid(w', alpha)
        loss_train.forward(clear_no_need_grad=True)

        ve = categorical_error(pred_train.d, input_image_train["label"].d)
        monitor_vloss.add(i, loss_train.d.copy())
        monitor_verr.add(i, ve)

        solver_archs.zero_grad()
        solver_model.zero_grad()
        loss_train.backward(clear_buffer=True)  # its gradient is stored

        if args.second_order:
            accumulated_gradient = store_gradient(accumulated_gradient,
                                                  alphas_dict,
                                                  coeff=1.)

            # grad_w L_val(w', alpha). Note that the gradient is stored into .data
            delta_gradient_w = {
                k: nn.Variable(v.shape).apply(data=nn.NdArray(
                    v.shape).copy_from(v.grad),
                                              need_grad=True)
                for k, v in nn.get_parameters().items() if "alpha_" not in k
            }

            epsilon = 0.01 / np.sum(
                [np.linalg.norm(v.d) for v in delta_gradient_w.values()])

            coeff = 1.0 * epsilon
            # w -> w+ (= w + epsilon*grad_Loss_on_val(w', alpha))
            weight_modify(original_weights, delta_gradient_w,
                          model_params_dict, coeff)

            input_image_train["image"].d = image  # reuse the same data
            input_image_train["label"].d = label

            # compute Loss_on_train(w+, alpha)
            loss_train.forward()
            solver_archs.zero_grad()
            solver_model.zero_grad()
            loss_train.backward(clear_buffer=True)  # its gradient is stored

            # accumulate currently registered gradient
            coeff = (-1.) * args.eta / 2. * epsilon
            accumulated_gradient = store_gradient(accumulated_gradient,
                                                  alphas_dict, coeff)

            coeff = -1.0 * epsilon
            # w -> w- (= w - epsilon*grad_Loss_on_val(w', alpha))
            weight_modify(original_weights, delta_gradient_w,
                          model_params_dict, coeff)

            # compute Loss_on_train(w-, alpha)
            loss_train.forward()
            solver_archs.zero_grad()
            solver_model.zero_grad()
            loss_train.backward(clear_buffer=True)  # its gradient is stored

            # accumulate currently registered gradient again
            coeff = (+1.) * args.eta / 2. * epsilon
            accumulated_gradient = store_gradient(accumulated_gradient,
                                                  alphas_dict, coeff)

            # write back the accumulated alpha gradients and restore the updated weights
            for k, v in alphas_dict.items():
                nn.parameter.set_parameter(
                    k,
                    nn.Variable(v.shape).apply(data=v.data,
                                               grad=accumulated_gradient[k],
                                               need_grad=True))
            for k, v in model_params_dict.items():
                nn.parameter.set_parameter(
                    k,
                    nn.Variable(v.shape).apply(data=updated_weights[k].data,
                                               need_grad=True))

        solver_archs.weight_decay(args.weight_decay_archs)
        solver_archs.update()

        if i % 1000 == 0:
            for k, v in alphas_dict.items():
                keynames = k.split("_")
                print("\nParameters for {} cell, node {} to {};".format(
                    keynames[1], keynames[2], keynames[3]))
                show_ops_and_prob(v.d, ops)

    return alphas_dict
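
The helpers store_gradient and weight_modify are defined elsewhere. Judging only from the call sites above, store_gradient adds coeff times the gradient currently registered on each alpha into the refuge; a hypothetical sketch:

def store_gradient(accumulated_gradient, alphas_dict, coeff):
    # Hypothetical sketch inferred from the call sites above: accumulate
    # coeff * (gradient currently stored on each alpha) into the refuge.
    for k, v in alphas_dict.items():
        accumulated_gradient[k].d += coeff * v.g
    return accumulated_gradient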
Example #4
def projection(x: nn.NdArray, eps: float = 1e-5) -> nn.NdArray:
    # Project each row of x onto the (open) unit ball: rows whose L2 norm
    # reaches 1 are rescaled to norm 1 - eps, others are left unchanged.
    norm = F.pow_scalar(F.sum(x**2, axis=1), val=0.5)
    return F.where(condition=F.greater_equal_scalar(norm, val=1.),
                   x_true=F.clip_by_norm(x, clip_norm=1 - eps, axis=1),
                   x_false=x)
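
A small usage sketch with illustrative values: a row whose norm reaches 1 is pulled just inside the unit ball, while a shorter row passes through unchanged. Because the inputs are nn.NdArray, the functions execute eagerly:

import numpy as np
import nnabla as nn

x = nn.NdArray.from_numpy_array(
    np.array([[3.0, 4.0],    # norm 5  -> rescaled to norm 1 - eps
              [0.1, 0.2]]))  # norm < 1 -> returned unchanged
y = projection(x)
print(np.linalg.norm(y.data, axis=1))  # approx. [0.99999, 0.2236]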
Example #5
def CNN_run(args, both_archs, data_dict, with_train=False, after_search=False):
    """
    """

    num_cells = args.num_cells
    num_nodes = args.num_nodes

    if after_search:
        assert with_train is True, "when you train the network after architecture search, set with_train=True"
    tdata, mean_val_train, std_val_train = data_dict["train_data"]
    vdata, mean_val_valid, std_val_valid = data_dict["valid_data"]
    channels, image_height, image_width, num_class = data_dict["basic_info"]
    batch_size = args.batch_size

    output_filter = args.output_filter

    if with_train:
        if after_search:
            num_epoch = args.epoch_on_retrain
            if args.additional_filters_on_retrain > 0:
                output_filter += args.additional_filters_on_retrain
        else:
            num_epoch = args.epoch_per_search

        one_epoch = tdata.size // batch_size
        max_iter = num_epoch * one_epoch

    val_iter = args.val_iter

    monitor_path = args.monitor_path
    model_save_path = args.monitor_path
    decay_rate = args.weight_decay
    initial_lr = args.child_lr

    model_save_interval = args.model_save_interval

    image_valid = nn.Variable(
        (batch_size, channels, image_height, image_width))
    input_image_valid = {"image": image_valid}

    vdata._reset()  # rewind data

    test = True
    pred_valid, _, _ = construct_architecture(image_valid, num_class, num_cells, num_nodes,
                                              both_archs, output_filter, test)

    if with_train:
        if after_search:
            # setting for training after architecture search
            with_grad_clip = args.with_grad_clip_on_retrain
            grad_clip = args.grad_clip_value
            lr_control = args.lr_control_on_retrain
        else:
            with_grad_clip = args.with_grad_clip_on_search
            grad_clip = args.grad_clip_value
            lr_control = args.lr_control_on_search

        # prepare variables used for training
        image_train = nn.Variable(
            (batch_size, channels, image_height, image_width))
        label_train = nn.Variable((batch_size, 1))
        input_image_train = {"image": image_train, "label": label_train}

        tdata._reset()  # rewind data

        test = False
        pred_train, aux_logits, used_weights = construct_architecture(image_train, num_class, num_cells, num_nodes,
                                                                      both_archs, output_filter, test)
        loss_train = loss_function(pred_train, aux_logits, label_train)

        used_weights_dict = {key_name: nn.get_parameters()[key_name]
                             for key_name in used_weights}

        # Create monitor.
        monitor = Monitor(monitor_path)
        monitor_loss = MonitorSeries("Training loss", monitor, interval=100)
        # modified to display accuracy.
        monitor_err = MonitorSeries("Training accuracy", monitor, interval=100)
        # modified to display accuracy.
        monitor_verr = MonitorSeries("Test accuracy", monitor, interval=1)

        # Solvers
        solver = S.Momentum(initial_lr)
        solver.set_parameters(
            used_weights_dict, reset=False, retain_state=True)

        # Training-loop
        for i in range(max_iter):
            if i > 0 and i % one_epoch == 0:
                # Validation during training.
                ve = 0.
                for j in range(val_iter):
                    image, label = vdata.next()
                    image = image / 255.0
                    image = (image - mean_val_valid) / std_val_valid
                    input_image_valid["image"].d = image
                    pred_valid.forward()
                    ve += categorical_error(pred_valid.d, label)
                ve /= val_iter
                monitor_verr.add(i, 1.0 - ve)  # modified to display accuracy.

            if after_search and i % model_save_interval == 0:
                nn.save_parameters(os.path.join(
                    args.model_save_path, 'params_%06d.h5' % i))

            # Forward/Zerograd/Backward
            image, label = tdata.next()
            image = image / 255.0
            image = (image - mean_val_train) / std_val_train
            input_image_train["image"].d = image
            input_image_train["label"].d = label
            loss_train.forward()

            if lr_control:
                new_lr = learning_rate_scheduler(i, max_iter, initial_lr, 0)
                solver.set_learning_rate(new_lr)

            solver.zero_grad()
            loss_train.backward()

            if with_grad_clip:
                for k, v in used_weights_dict.items():
                    if np.linalg.norm(v.g) > grad_clip:
                        v.grad.copy_from(F.clip_by_norm(v.grad, grad_clip))

            # Solvers update
            solver.weight_decay(decay_rate)
            solver.update()
            e = categorical_error(pred_train.d, input_image_train["label"].d)
            monitor_loss.add(i, loss_train.d.copy())
            monitor_err.add(i, 1.0 - e)  # modified to display accuracy.

    # Validation (After training or when called for evaluation only)
    ve = 0.
    for j in range(val_iter):
        image, label = vdata.next()
        image = image / 255.0
        image = (image - mean_val_valid) / std_val_valid
        input_image_valid["image"].d = image
        pred_valid.forward()
        ve += categorical_error(pred_valid.d, label)
    ve /= val_iter

    if with_train:
        print("Validation Accuracy on Trained CNN:",
              '{:.2f}'.format(100*(1.0 - ve)), "%\n")

    if after_search:
        nn.save_parameters(os.path.join(
            args.model_save_path, 'params_%06d.h5' % (max_iter)))

    return 1.0 - ve
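
learning_rate_scheduler(i, max_iter, initial_lr, 0) appears in several of these training loops. Its body is not shown on this page; a plausible sketch, assuming the cosine-annealing schedule commonly paired with these search methods and taking the final argument to be the terminal learning rate:

import numpy as np

def learning_rate_scheduler(curr_iter, max_iter, initial_lr, end_lr=0.0):
    # Hypothetical cosine annealing from initial_lr down to end_lr.
    progress = curr_iter / max_iter
    return end_lr + 0.5 * (initial_lr - end_lr) * (1 + np.cos(np.pi * progress))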
Example #6
def CNN_run(args, ops, arch_dict):
    """
        Based on the given model architecture,
        construct CNN and execute training.
        input:
            args: arguments set by user.
            ops: operations used in the network.
            arch_dict: a dictionary containing architecture information.
    """

    data_iterator = data_iterator_cifar10
    tdata = data_iterator(args.batch_size, True)
    vdata = data_iterator(args.batch_size, False)

    # CIFAR10 statistics, mean and standard deviation
    CIFAR_MEAN = np.reshape([0.49139968, 0.48215827, 0.44653124], (1, 3, 1, 1))
    CIFAR_STD = np.reshape([0.24703233, 0.24348505, 0.26158768], (1, 3, 1, 1))

    channels, image_height, image_width = 3, 32, 32
    batch_size = args.batch_size
    initial_model_lr = args.model_lr

    one_epoch = tdata.size // batch_size
    max_iter = args.epoch * one_epoch
    val_iter = 10000 // batch_size

    # Create monitor.
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=100)
    monitor_err = MonitorSeries("Training error", monitor, interval=100)
    monitor_vloss = MonitorSeries("Test loss", monitor, interval=100)
    monitor_verr = MonitorSeries("Test error", monitor, interval=100)

    # prepare variables and graph used for test
    image_valid = nn.Variable(
        (batch_size, channels, image_height, image_width))
    label_valid = nn.Variable((batch_size, 1))
    input_image_valid = {"image": image_valid, "label": label_valid}
    pred_valid, _ = construct_networks(args,
                                       ops,
                                       arch_dict,
                                       image_valid,
                                       test=True)
    loss_valid = loss_function(pred_valid, label_valid)

    # set dropout rate in advance
    nn.parameter.get_parameter_or_create("drop_rate",
                                         shape=(1, 1, 1, 1),
                                         need_grad=False)
    initial_drop_rate = nn.Variable((1, 1, 1, 1)).apply(d=args.dropout_rate)
    nn.parameter.set_parameter("drop_rate", initial_drop_rate)

    # prepare variables and graph used for training
    image_train = nn.Variable(
        (batch_size, channels, image_height, image_width))
    label_train = nn.Variable((batch_size, 1))
    input_image_train = {"image": image_train, "label": label_train}
    pred_train, aux_logits = construct_networks(args,
                                                ops,
                                                arch_dict,
                                                image_train,
                                                test=False)
    loss_train = loss_function(pred_train, label_train, aux_logits,
                               args.auxiliary_weight)

    # prepare solvers
    model_params_dict = nn.get_parameters()
    solver_model = S.Momentum(initial_model_lr)
    solver_model.set_parameters(model_params_dict,
                                reset=False,
                                retain_state=True)

    # Training-loop
    for curr_epoch in range(args.epoch):
        print("epoch {}".format(curr_epoch))

        curr_dropout_rate = F.add_scalar(
            F.mul_scalar(initial_drop_rate, (curr_epoch / args.epoch)), 1e-8)
        nn.parameter.set_parameter("drop_rate", curr_dropout_rate)

        for i in range(one_epoch):
            image, label = tdata.next()
            image = image / 255.0
            image = (image - CIFAR_MEAN) / CIFAR_STD
            if args.cutout:
                image = cutout(image, args)
            input_image_train["image"].d = image
            input_image_train["label"].d = label
            loss_train.forward(clear_no_need_grad=True)

            e = categorical_error(pred_train.d, input_image_train["label"].d)
            monitor_loss.add(one_epoch * curr_epoch + i, loss_train.d.copy())
            monitor_err.add(one_epoch * curr_epoch + i, e)

            if args.lr_control_model:
                new_lr = learning_rate_scheduler(one_epoch * curr_epoch + i,
                                                 max_iter, initial_model_lr, 0)
                solver_model.set_learning_rate(new_lr)

            solver_model.zero_grad()
            loss_train.backward(clear_buffer=True)

            if args.with_grad_clip_model:
                for k, v in model_params_dict.items():
                    v.grad.copy_from(
                        F.clip_by_norm(v.grad, args.grad_clip_value_model))

            # update parameters
            solver_model.weight_decay(args.weight_decay_model)
            solver_model.update()

            if (one_epoch * curr_epoch + i) % args.model_save_interval == 0:
                nn.save_parameters(
                    os.path.join(
                        args.model_save_path,
                        'params_{}.h5'.format(one_epoch * curr_epoch + i)))

        # Validation during training.
        ve = 0.
        vloss = 0.
        for j in range(val_iter):
            image, label = vdata.next()
            image = image / 255.0
            image = (image - CIFAR_MEAN) / CIFAR_STD
            input_image_valid["image"].d = image
            input_image_valid["label"].d = label
            loss_valid.forward(clear_no_need_grad=True)
            vloss += loss_valid.d.copy()
            ve += categorical_error(pred_valid.d.copy(), label)
        ve /= val_iter
        vloss /= val_iter
        monitor_vloss.add(one_epoch * curr_epoch + i, vloss)
        monitor_verr.add(one_epoch * curr_epoch + i, ve)

    return
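
cutout(image, args) used in the loop above is defined elsewhere. Cutout augmentation zeroes out a random square patch in each image; a hedged sketch, where args.cutout_size is an assumed attribute name:

import numpy as np

def cutout(image, args):
    # Hypothetical sketch: zero one random square patch per image.
    # image is an NCHW batch; args.cutout_size is the patch side length.
    out = image.copy()
    n, _, h, w = out.shape
    half = args.cutout_size // 2
    for idx in range(n):
        cy, cx = np.random.randint(h), np.random.randint(w)
        out[idx, :, max(0, cy - half):min(h, cy + half),
            max(0, cx - half):min(w, cx + half)] = 0.0
    return out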
Example #7
def CNN_run(args, model):

    data_iterator_train, data_iterator_valid, num_class = \
        get_data_iterator_and_num_class(args)

    channels, image_height, image_width = 3, args.height, args.width
    batch_size = args.batch_size
    initial_model_lr = args.model_lr

    one_epoch = data_iterator_train.size // batch_size
    max_iter = args.epoch * one_epoch
    val_iter = data_iterator_valid.size // batch_size

    # Create monitor.
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=100)
    monitor_err = MonitorSeries("Training error", monitor, interval=100)
    monitor_vloss = MonitorSeries("Test loss", monitor, interval=100)
    monitor_verr = MonitorSeries("Test error", monitor, interval=100)

    # prepare variables and graph used for test
    image_valid = nn.Variable(
        (batch_size, channels, image_height, image_width))
    label_valid = nn.Variable((batch_size, 1))
    input_image_valid = {"image": image_valid, "label": label_valid}

    pred_valid = construct_networks(args,
                                    image_valid,
                                    model,
                                    num_class,
                                    test=True)
    pred_valid.persistent = True
    loss_valid = loss_function(pred_valid, label_valid)
    top_1e_valid = F.mean(F.top_n_error(pred_valid, label_valid))

    # prepare variables and graph used for training
    image_train = nn.Variable(
        (batch_size, channels, image_height, image_width))
    label_train = nn.Variable((batch_size, 1))
    input_image_train = {"image": image_train, "label": label_train}

    pred_train = construct_networks(args,
                                    image_train,
                                    model,
                                    num_class,
                                    test=False)
    loss_train = loss_function(pred_train, label_train)
    top_1e_train = F.mean(F.top_n_error(pred_train, label_train))

    # prepare solvers
    solver = S.Momentum(initial_model_lr)
    solver.set_parameters(nn.get_parameters())

    # Training-loop
    for i in range(max_iter):
        image, label = data_iterator_train.next()
        input_image_train["image"].d = image
        input_image_train["label"].d = label
        nn.forward_all([loss_train, top_1e_train], clear_no_need_grad=True)

        monitor_loss.add(i, loss_train.d.copy())
        monitor_err.add(i, top_1e_train.d.copy())

        if args.lr_control_model:
            new_lr = learning_rate_scheduler(i, max_iter, initial_model_lr, 0)
            solver.set_learning_rate(new_lr)

        solver.zero_grad()
        loss_train.backward(clear_buffer=True)

        if args.with_grad_clip_model:
            for k, v in nn.get_parameters().items():
                v.grad.copy_from(
                    F.clip_by_norm(v.grad, args.grad_clip_value_model))

        # update parameters
        solver.weight_decay(args.weight_decay_model)
        solver.update()

        if i % args.model_save_interval == 0:
            # Validation during training.
            ve = 0.
            vloss = 0.
            for j in range(val_iter):
                v_image, v_label = data_iterator_valid.next()
                input_image_valid["image"].d = v_image
                input_image_valid["label"].d = v_label
                nn.forward_all([loss_valid, top_1e_valid], clear_buffer=True)
                vloss += loss_valid.d.copy()
                ve += top_1e_valid.d.copy()

            ve /= val_iter
            vloss /= val_iter
            monitor_vloss.add(i, vloss)
            monitor_verr.add(i, ve)

            nn.save_parameters(
                os.path.join(args.model_save_path, 'params_{}.h5'.format(i)))

    ve = 0.
    vloss = 0.
    for j in range(val_iter):
        v_image, v_label = data_iterator_valid.next()
        input_image_valid["image"].d = v_image
        input_image_valid["label"].d = v_label
        nn.forward_all([loss_valid, top_1e_valid], clear_buffer=True)
        vloss += loss_valid.d.copy()
        ve += top_1e_valid.d.copy()

    ve /= val_iter
    vloss /= val_iter
    monitor_vloss.add(i, vloss)
    monitor_verr.add(i, ve)

    nn.save_parameters(
        os.path.join(args.model_save_path, 'params_{}.h5'.format(i)))

    return
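
For reference, categorical_error, used throughout Examples #3, #5 and #6, is conventionally the fraction of misclassified samples; a minimal sketch under that assumption:

import numpy as np

def categorical_error(pred, label):
    # Fraction of samples whose argmax prediction differs from the label.
    pred_label = pred.argmax(axis=1)
    return (pred_label != label.flat).mean()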