Example #1
def run():
    RS = RandomState((seed, "top_rs"))
    all_data = mnist.load_data_as_dict()
    train_data, tests_data = random_partition(all_data, RS, [N_train, N_tests])
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size

    def transform_weights(z_vect, transform):
        return z_vect * np.exp(transform)

    def regularization(z_vect):
        return np.dot(z_vect, z_vect) * np.exp(log_L2)

    def constrain_transform(t_vect, name):
        all_t = w_parser.new_vect(t_vect)
        for i in range(N_layers):
            all_t[('biases', i)] = 0.0
        if name == 'universal':
            t_mean = np.mean([np.mean(all_t[('weights', i)])
                              for i in range(N_layers)])
            for i in range(N_layers):
                all_t[('weights', i)] = t_mean
        elif name == 'layers':
            for i in range(N_layers):
                all_t[('weights', i)] = np.mean(all_t[('weights', i)])
        elif name == 'units':
            for i in range(N_layers):
                all_t[('weights', i)] = np.mean(all_t[('weights', i)], axis=1, keepdims=True)
        else:
            raise Exception
        return all_t.vect

    def process_transform(t_vect):
        # Remove the redundancy due to sharing transformations within units
        all_t = w_parser.new_vect(t_vect)
        new_t = np.zeros((0,))
        for i in range(N_layers):
            layer = all_t[('weights', i)]
            assert np.all(layer[:, 0] == layer[:, 1])
            cur_t = log_L2 - 2 * layer[:, 0]
            new_t = np.concatenate((new_t, cur_t))
        return new_t

    def train_z(data, z_vect_0, transform):
        N_data = data['X'].shape[0]
        def primal_loss(z_vect, transform, i_primal, record_results=False):
            RS = RandomState((seed, i_primal, "primal"))
            idxs = RS.randint(N_data, size=batch_size)
            minibatch = dictslice(data, idxs)
            w_vect = transform_weights(z_vect, transform)
            loss = loss_fun(w_vect, **minibatch)
            reg = regularization(z_vect)
            if record_results and i_primal % N_thin == 0:
                print "Iter {0}: train: {1}".format(i_primal, getval(loss))
            return loss + reg
        return sgd(grad(primal_loss), transform, z_vect_0, alpha, beta, N_iters)

    all_transforms, all_tests_loss = [], []
    def train_reg(transform_0, constraint, N_meta_iter, i_top):
        def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform)
            w_vect_final = transform_weights(z_vect_final, transform)
            return loss_fun(w_vect_final, **cur_valid_data)
        hypergrad = grad(hyperloss)
        cur_transform = transform_0
        for i_hyper in range(N_meta_iter):
            if i_hyper % N_meta_thin == 0:
                tests_loss = hyperloss(cur_transform, i_hyper, train_data, tests_data)
                all_tests_loss.append(tests_loss)
                all_transforms.append(cur_transform.copy())
                print "Hyper iter {0}, test loss {1}".format(i_hyper, all_tests_loss[-1])
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])
            raw_grad = hypergrad(cur_transform, i_hyper, *cur_split)
            constrained_grad = constrain_transform(raw_grad, constraint)
            cur_transform -= constrained_grad * meta_alpha
        return cur_transform

    transform = np.zeros(N_weights)
    constraints = ['universal', 'layers', 'units']
    for i_top, (N_meta_iter, constraint) in enumerate(zip(all_N_meta_iter, constraints)):
        print "Top level iter {0}".format(i_top)
        transform = train_reg(transform, constraint, N_meta_iter, i_top)

    all_L2_regs = np.array(zip(*map(process_transform, all_transforms)))
    return all_L2_regs, all_tests_loss
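
Example #1 never regularizes w directly: it trains z under a fixed penalty exp(log_L2) * ||z||^2 and maps to weights with w = z * exp(t). Because w = z * exp(t), this is equivalent to penalizing w itself with a per-weight coefficient exp(log_L2 - 2t), which is exactly the quantity process_transform recovers. A small NumPy check of that identity (values are illustrative, independent of the helpers above):

import numpy as np

rng = np.random.RandomState(0)
z = rng.randn(5)          # latent weights, as in train_z
t = rng.randn(5)          # per-weight log-scale transform
log_L2 = -2.0             # shared log penalty

w = z * np.exp(t)                                   # transform_weights
penalty_z = np.exp(log_L2) * np.dot(z, z)           # regularization(z_vect)
penalty_w = np.dot(np.exp(log_L2 - 2 * t) * w, w)   # equivalent penalty on w

print(np.allclose(penalty_z, penalty_w))            # True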
Example #2
def run():
    RS = RandomState((seed, "top_rs"))
    all_data = mnist.load_data_as_dict()
    train_data, tests_data = random_partition(all_data, RS, [N_train, N_tests])
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size

    def transform_weights(z_vect, transform):
        return z_vect * np.exp(transform)

    def regularization(z_vect):
        return np.dot(z_vect, z_vect) * np.exp(log_L2)

    def constrain_transform(t_vect, name):
        all_t = w_parser.new_vect(t_vect)
        for i in range(N_layers):
            all_t[("biases", i)] = 0.0
        if name == "universal":
            t_mean = np.mean([np.mean(all_t[("weights", i)]) for i in range(N_layers)])
            for i in range(N_layers):
                all_t[("weights", i)] = t_mean
        elif name == "layers":
            for i in range(N_layers):
                all_t[("weights", i)] = np.mean(all_t[("weights", i)])
        elif name == "units":
            for i in range(N_layers):
                all_t[("weights", i)] = np.mean(all_t[("weights", i)], axis=1, keepdims=True)
        else:
            raise Exception
        return all_t.vect

    def process_transform(t_vect):
        # Remove the redundancy due to sharing transformations within units
        all_t = w_parser.new_vect(t_vect)
        new_t = np.zeros((0,))
        for i in range(N_layers):
            layer = all_t[("weights", i)]
            assert np.all(layer[:, 0] == layer[:, 1])
            cur_t = log_L2 - 2 * layer[:, 0]
            new_t = np.concatenate((new_t, cur_t))
        return new_t

    def train_z(data, z_vect_0, transform):
        N_data = data["X"].shape[0]

        def primal_loss(z_vect, transform, i_primal, record_results=False):
            RS = RandomState((seed, i_primal, "primal"))
            idxs = RS.randint(N_data, size=batch_size)
            minibatch = dictslice(data, idxs)
            w_vect = transform_weights(z_vect, transform)
            loss = loss_fun(w_vect, **minibatch)
            reg = regularization(z_vect)
            if record_results and i_primal % N_thin == 0:
                print "Iter {0}: train: {1}".format(i_primal, getval(loss))
            return loss + reg

        return sgd(grad(primal_loss), transform, z_vect_0, alpha, beta, N_iters)

    all_transforms, all_tests_loss = [], []

    def train_reg(transform_0, constraint, N_meta_iter, i_top):
        def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform)
            w_vect_final = transform_weights(z_vect_final, transform)
            return loss_fun(w_vect_final, **cur_valid_data)

        hypergrad = grad(hyperloss)
        cur_transform = transform_0
        for i_hyper in range(N_meta_iter):
            if i_hyper % N_meta_thin == 0:
                tests_loss = hyperloss(cur_transform, i_hyper, train_data, tests_data)
                all_tests_loss.append(tests_loss)
                all_transforms.append(cur_transform.copy())
                print "Hyper iter {0}, test loss {1}".format(i_hyper, all_tests_loss[-1])
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])
            raw_grad = hypergrad(cur_transform, i_hyper, *cur_split)
            constrained_grad = constrain_transform(raw_grad, constraint)
            cur_transform -= constrained_grad * meta_alpha
        return cur_transform

    transform = np.zeros(N_weights)
    constraints = ["universal", "layers", "units"]
    for i_top, (N_meta_iter, constraint) in enumerate(zip(all_N_meta_iter, constraints)):
        print "Top level iter {0}".format(i_top)
        transform = train_reg(transform, constraint, N_meta_iter, i_top)

    all_L2_regs = np.array(zip(*map(process_transform, all_transforms)))
    return all_L2_regs, all_tests_loss
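
Examples #1 and #2 hand grad(primal_loss) to an sgd(gradfun, transform, z_vect_0, alpha, beta, N_iters) helper defined elsewhere in the repository. The real routine must be differentiable with respect to the transform argument so that grad(hyperloss) can propagate through the entire training trajectory; the sketch below is only an assumption about its forward calling convention (name and momentum convention are illustrative, not the library's implementation):

import numpy as np

def sgd_sketch(gradfun, meta, x0, alpha, beta, n_iters):
    # Hypothetical stand-in for the sgd() helper used above.
    # gradfun(x, meta, i) returns the primal-loss gradient at iteration i;
    # alpha is the learning rate and beta the momentum coefficient.
    x, v = x0.copy(), np.zeros_like(x0)
    for i in range(n_iters):
        g = gradfun(x, meta, i)
        v = beta * v - (1.0 - beta) * g   # assumed momentum convention
        x = x + alpha * v
    return x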
Example #3
def run():
    RS = RandomState((seed, "top_rs"))
    all_data = mnist.load_data_as_dict()
    train_data, tests_data = random_partition(all_data, RS, [N_train, N_tests])
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size

    def transform_weights(
            z_vect, transform):  #TODO: isn't this a scale transformation?
        return z_vect * np.exp(transform)

    def regularization(z_vect):
        return np.dot(z_vect, z_vect) * np.exp(log_L2)

    def constrain_reg(t_vect, name):
        all_t = w_parser.new_vect(t_vect)
        for i in range(N_layers):  #Don't regularize biases
            all_t[('biases', i)] = 0.0
        if name == 'universal':  #One regularization hyperparameter for all weights
            #TODO: does computing means of means make sense? Not the same as just the mean of all.
            t_mean = np.mean(
                [np.mean(all_t[('weights', i)]) for i in range(N_layers)])
            for i in range(N_layers):
                all_t[('weights', i)] = t_mean
        elif name == 'layers':  #One regularization hyperparameter for each layer
            #TODO: changes the exact hypergradient norm, but not the DrMAD norm. Why??? DrMAD is already constrained?
            #print t_vect.shape
            for i in range(N_layers):
                #print "diff after contraining" + str(np.linalg.norm(all_t[('weights', i)] - np.mean(all_t[('weights', i)])))
                all_t[('weights', i)] = np.mean(all_t[('weights', i)])
        elif name == 'units':
            print t_vect.shape  #44860; this is correct
            #for i in range(N_layers):
            #print "weights "+ str(i) + ": " + str(np.linalg.norm(np.mean(all_t[('weights', i)], axis=1, keepdims=True) - np.mean(all_t[('weights', i)], axis=1, keepdims=True)))
            #for i in range(N_layers):
            #TODO: This was the same as layer-wise
            #all_t[('weights', i)] = np.mean(all_t[('weights', i)], axis=1, keepdims=True)
        else:
            raise Exception
        return all_t.vect

    def process_transform(t_vect):
        # Remove the redundancy due to sharing transformations within units
        all_t = w_parser.new_vect(t_vect)
        new_t = np.zeros((0, ))
        for i in range(N_layers):
            layer = all_t[('weights', i)]
            assert np.all(layer[:, 0] == layer[:, 1])
            cur_t = log_L2 - 2 * layer[:, 0]  # TODO: equivalent regularization weights
            new_t = np.concatenate((new_t, cur_t))
        return new_t

    def train_z(data, z_vect_0, transform):
        N_data = data['X'].shape[0]

        def primal_loss(z_vect, transform, i_primal, record_results=False):
            RS = RandomState((seed, i_primal, "primal"))
            idxs = RS.randint(N_data, size=batch_size)
            minibatch = dictslice(data, idxs)
            # TODO: this is a scale transformation, not regularization!
            w_vect = transform_weights(z_vect, transform)
            loss = loss_fun(w_vect, **minibatch)  #use new scale for prediction
            reg = regularization(z_vect)  #regularize original scale
            #TODO: should be equivalent: w = z*e^transform, so
            # f(z*e^transform) + e^\lambda||z||^2 = f(w) + e^\lambda||z||^2 = f(w) + e^(\lambda)||e^-2transform w||^2
            # see process_transform

            #if record_results and i_primal % N_thin == 0:
            #print "Iter {0}: train: {1}".format(i_primal, getval(loss))
            return loss + reg

        return sgd_meta_only_mad(grad(primal_loss), transform, z_vect_0, alpha,
                                 beta, N_iters)

    def train_z_exact(data, z_vect_0, transform, meta_iteration=0):
        N_data = data['X'].shape[0]

        def primal_loss(z_vect, transform, i_primal, record_results=False):
            RS = RandomState((seed, i_primal, "primal"))
            idxs = RS.randint(N_data, size=batch_size)
            minibatch = dictslice(data, idxs)
            w_vect = transform_weights(z_vect, transform)
            loss = loss_fun(w_vect, **minibatch)
            reg = regularization(z_vect)
            #if record_results and i_primal % N_thin == 0:
            #    print "Iter {0}: train: {1}".format(i_primal, getval(loss))
            return loss + reg

        return sgd_meta_only(grad(primal_loss),
                             transform,
                             z_vect_0,
                             alpha,
                             beta,
                             N_iters,
                             meta_iteration=meta_iteration)

    def train_z2(data, z_vect_0, transform):
        N_data = data['X'].shape[0]

        def primal_loss(z_vect, transform, i_primal, record_results=False):
            RS = RandomState((seed, i_primal, "primal"))
            idxs = RS.randint(N_data, size=batch_size)
            minibatch = dictslice(data, idxs)
            w_vect = transform_weights(z_vect, transform)
            loss = loss_fun(w_vect, **minibatch)
            reg = regularization(z_vect)
            return loss + reg

        return sgd_meta_only_mad2(grad(primal_loss), transform, z_vect_0,
                                  alpha, beta, N_iters)


    all_transforms, all_train_loss, all_valid_loss, all_tests_loss = [], [], [], []
    all_train_rates, all_valid_rates, all_tests_rates, all_avg_regs = [], [], [], []
    hypergrad_angles, hypergrad_angles2 = [], []
    hypergrad_signs_angles, hypergrad_signs_angles2 = [], []
    hypergrad_norms, hypergrad_norms2, exact_hypergrad_norms = [], [], []

    def train_reg(reg_0, constraint, N_meta_iter, i_top):
        def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data,
                      cur_tests_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform)
            w_vect_final = transform_weights(
                z_vect_final,
                transform)  #TODO: initial scale AND regularization

            train_loss = getval(loss_fun(w_vect_final, **cur_train_data))
            print "Training loss (unregularized) = " + str(train_loss)
            all_train_loss.append(train_loss)
            valid_loss = getval(loss_fun(w_vect_final, **cur_valid_data))
            print "Validation loss = " + str(valid_loss)
            all_valid_loss.append(valid_loss)
            tests_loss = getval(loss_fun(w_vect_final, **cur_tests_data))
            print "Test loss = " + str(tests_loss)
            all_tests_loss.append(tests_loss)
            """plt.plot(all_train_loss, label="training loss (unregularized)")
            plt.plot(all_valid_loss, label="validation loss")
            plt.plot(all_tests_loss, label="test loss")
            plt.title("loss vs meta iteration")
            plt.xlabel("meta iteration")
            plt.ylabel("loss")
            plt.legend()
            plt.savefig("loss"+str(N_iters)+"_corrected.png")
            plt.clf()"""

            train_rate = getval(frac_err(w_vect_final, **cur_train_data))
            print "Training error rate = " + str(train_rate)
            all_train_rates.append(train_rate)
            valid_rate = getval(frac_err(w_vect_final, **cur_valid_data))
            print "Validation error rate = " + str(valid_rate)
            all_valid_rates.append(valid_rate)
            tests_rate = getval(frac_err(w_vect_final, **cur_tests_data))
            print "Test error rate = " + str(tests_rate)
            all_tests_rates.append(tests_rate)
            """plt.plot(all_train_rates, label="training error rate")
            plt.plot(all_valid_rates, label="validation error rate")
            plt.plot(all_tests_rates, label="test error rate")
            plt.title("error rate vs meta iteration")
            plt.xlabel("meta iteration")
            plt.ylabel("error rate")
            plt.legend()
            plt.savefig("error"+str(N_iters)+"_corrected.png")
            plt.clf()"""

            return loss_fun(w_vect_final, **cur_valid_data)

        hypergrad = grad(hyperloss)  #No chain rule here

        def hyperloss_exact(transform,
                            i_hyper,
                            cur_train_data,
                            cur_valid_data,
                            cur_tests_data,
                            meta_it=0):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z_exact(cur_train_data,
                                         z_vect_0,
                                         transform,
                                         meta_iteration=meta_it)
            w_vect_final = transform_weights(z_vect_final, transform)
            return loss_fun(w_vect_final, **cur_valid_data)

        hypergrad_exact = grad(hyperloss_exact)  #No chain rule here

        def hyperloss2(transform, i_hyper, cur_train_data, cur_valid_data,
                       cur_tests_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z2(cur_train_data, z_vect_0, transform)
            w_vect_final = transform_weights(z_vect_final, transform)
            return loss_fun(w_vect_final, **cur_valid_data)

        hypergrad2 = grad(hyperloss2)
        '''def error_rate(transform, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform) #TODO: recomputing path?
            w_vect_final = transform_weights(z_vect_final, transform)
            return frac_err(w_vect_final, **cur_valid_data)'''

        cur_reg = reg_0  #initial regularization, besides regularization() function
        for i_hyper in range(N_meta_iter):
            print "Hyper iter " + str(i_hyper)
            """if i_hyper % N_meta_thin == 0:
                test_rate = error_rate(cur_reg, i_hyper, train_data, tests_data)
                all_tests_rates.append(test_rate)
                all_transforms.append(cur_reg.copy())
                all_avg_regs.append(np.mean(cur_reg))
                print "Hyper iter {0}, error rate {1}".format(i_hyper, all_tests_rates[-1])
                print "Cur_transform", np.mean(cur_reg)"""
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            #cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid]) #cur_train_data, cur_valid_data
            #raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
            cur_train_data, cur_valid_data = random_partition(
                train_data, RS, [N_train - N_valid, N_valid])
            raw_grad = hypergrad(cur_reg, i_hyper, cur_train_data,
                                 cur_valid_data, tests_data)
            raw_grad2 = hypergrad2(cur_reg, i_hyper, cur_train_data,
                                   cur_valid_data, tests_data)
            raw_grad_exact = hypergrad_exact(cur_reg,
                                             i_hyper,
                                             cur_train_data,
                                             cur_valid_data,
                                             tests_data,
                                             meta_it=i_hyper)
            #print "before constraining grad"
            constrained_grad = constrain_reg(raw_grad, constraint)
            constrained_grad2 = constrain_reg(raw_grad2, constraint)
            constrained_grad_exact = constrain_reg(raw_grad_exact, constraint)
            print(np.linalg.norm(raw_grad))
            print(np.linalg.norm(raw_grad2))
            #TODO: #Exploding DrMAD gradient; ~10^10x larger than exact gradient with N_safe_sampling = N_iters
            print(np.linalg.norm(raw_grad_exact))
            # TODO: sometimes negative???

            hypergrad_angle = np.dot(
                constrained_grad, constrained_grad_exact) / (
                    np.linalg.norm(constrained_grad) *
                    np.linalg.norm(constrained_grad_exact))
            hypergrad_angles.append(hypergrad_angle)
            hypergrad_angle2 = np.dot(
                constrained_grad2, constrained_grad_exact) / (
                    np.linalg.norm(constrained_grad2) *
                    np.linalg.norm(constrained_grad_exact))
            hypergrad_angles2.append(hypergrad_angle2)
            print("cosine of angle between DrMAD and exact = " +
                  str(hypergrad_angle))
            print("cosine of angle between DrMAD2 and exact = " +
                  str(hypergrad_angle2))

            hypergrad_signs_angle = np.dot(
                np.sign(constrained_grad),
                np.sign(constrained_grad_exact)) / len(constrained_grad)
            hypergrad_signs_angles.append(hypergrad_signs_angle)
            print("cosine of angle between signs of DrMAD and exact = " +
                  str(hypergrad_signs_angle))
            hypergrad_signs_angle2 = np.dot(
                np.sign(constrained_grad2),
                np.sign(constrained_grad_exact)) / len(constrained_grad2)
            hypergrad_signs_angles2.append(hypergrad_signs_angle2)
            print("cosine of angle between signs of DrMAD2 and exact = " +
                  str(hypergrad_signs_angle2))
            """plt.plot(hypergrad_angles, label="exact vs DrMAD")
            plt.plot(hypergrad_signs_angles, label="signs exact vs signs DrMAD")
            plt.plot(hypergrad_angles2, label="exact vs DrMAD2")
            plt.plot(hypergrad_signs_angles2, label="signs exact vs signs DrMAD2")
            plt.title("Cosine of angle between hypergradients vs meta iteration")
            plt.xlabel("meta iteration")
            plt.ylabel("cosine of angle")
            plt.legend()
            plt.savefig("angle"+str(N_iters)+"_corrected2.png")
            plt.clf()"""

            hypergrad_norm = np.linalg.norm(constrained_grad)
            hypergrad_norms.append(hypergrad_norm)
            print("DrMAD norm = " + str(hypergrad_norm))
            hypergrad_norm2 = np.linalg.norm(constrained_grad2)
            hypergrad_norms2.append(hypergrad_norm2)
            print("DrMAD2 norm = " + str(hypergrad_norm2))
            exact_hypergrad_norm = np.linalg.norm(constrained_grad_exact)
            exact_hypergrad_norms.append(exact_hypergrad_norm)
            print("Exact norm = " + str(exact_hypergrad_norm))
            """plt.plot(hypergrad_norms, label="DrMAD hypergradient")
            plt.plot(hypergrad_norms2, label="DrMAD2 hypergradient")
            plt.plot(exact_hypergrad_norms, label="Exact hypergradient")
            plt.title("Norms of hypergradients vs meta iteration")
            plt.xlabel("meta iteration")
            plt.ylabel("norm")
            plt.legend()
            plt.savefig("norms"+str(N_iters)+"_corrected2.png")
            plt.clf()"""

            cur_reg -= np.sign(
                constrained_grad) * meta_alpha  #TODO: signs of gradient...
            #TODO: momentum
        return cur_reg

    reg = np.zeros(N_weights) + 0.2  # TODO: initial -log regularization; not in log scale?
    constraints = ['universal', 'layers', 'units']
    # TODO: uses multiple kinds of hyperparameter sharing, but in order
    for i_top, (N_meta_iter,
                constraint) in enumerate(zip(all_N_meta_iter, constraints)):
        print "Top level iter {0}".format(i_top), constraint
        reg = train_reg(reg, constraint, N_meta_iter, i_top)

    all_L2_regs = np.array(zip(*map(process_transform, all_transforms)))
    #return all_L2_regs, all_tests_rates, all_avg_regs
    return (all_L2_regs, all_train_loss, all_valid_loss, all_tests_loss,
            all_train_rates, all_valid_rates, all_tests_rates, all_avg_regs,
            hypergrad_angles, hypergrad_signs_angles, hypergrad_norms,
            exact_hypergrad_norms)
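
The long diagnostic block above compares the DrMAD estimates with the exact hypergradient by printing cosine similarities of the raw vectors and of their signs. The same quantities can be factored into two small helpers (plain NumPy; the names are illustrative, not part of the original code):

import numpy as np

def cosine(a, b):
    # Cosine of the angle between two hypergradient vectors.
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def sign_agreement(a, b):
    # Average agreement between gradient signs, in [-1, 1].
    return np.dot(np.sign(a), np.sign(b)) / len(a)

With these, the loop body would reduce to hypergrad_angles.append(cosine(constrained_grad, constrained_grad_exact)) and hypergrad_signs_angles.append(sign_agreement(constrained_grad, constrained_grad_exact)), and likewise for the DrMAD2 variant.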
Example #4
def run():
    RS = RandomState((seed, "top_rs"))
    all_data = mnist.load_data_as_dict()
    train_data, tests_data = random_partition(all_data, RS, [N_train, N_tests])
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size
    init_scales = w_parser.new_vect(np.zeros(N_weights))
    for i in range(N_layers):
        init_scales[('weights', i)] = 1 / np.sqrt(layer_sizes[i])
        init_scales[('biases',  i)] = 1.0
    init_scales = init_scales.vect

    def regularization(w_vect, reg):
        return np.dot(w_vect, w_vect * np.exp(reg))

    def constrain_reg(t_vect, name):
        all_r = w_parser.new_vect(t_vect)
        for i in range(N_layers):
            all_r[('biases', i)] = 0.0
        if name == 'universal':
            r_mean = np.mean([np.mean(all_r[('weights', i)]) for i in range(N_layers)])
            for i in range(N_layers):
                all_r[('weights', i)] = r_mean
        elif name == 'layers':
            for i in range(N_layers):
                all_r[('weights', i)] = np.mean(all_r[('weights', i)])
        elif name == 'units':
            for i in range(N_layers):
                all_r[('weights', i)] = np.mean(all_r[('weights', i)], axis=1, keepdims=True)
        else:
            raise Exception
        return all_r.vect

    def process_reg(t_vect):
        # Remove the redundancy due to sharing regularization within units
        all_r = w_parser.new_vect(t_vect)
        new_r = np.zeros((0,))
        for i in range(N_layers):
            layer = all_r[('weights', i)]
            assert np.all(layer[:, 0] == layer[:, 1])
            cur_r = layer[:, 0]
            new_r = np.concatenate((new_r, cur_r))
        return new_r

    def train_z(data, w_vect_0, reg):
        N_data = data['X'].shape[0]
        def primal_loss(w_vect, reg, i_primal, record_results=False):
            RS = RandomState((seed, i_primal, "primal"))
            idxs = RS.randint(N_data, size=batch_size)
            minibatch = dictslice(data, idxs)
            loss = loss_fun(w_vect, **minibatch)
            reg = regularization(w_vect, reg)
            if record_results and i_primal % N_thin == 0:
                print "Iter {0}: train: {1}".format(i_primal, getval(loss))
            return loss + reg
        return sgd(grad(primal_loss), reg, w_vect_0, alpha, beta, N_iters)

    all_regs, all_tests_loss = [], []
    def train_reg(reg_0, constraint, N_meta_iter, i_top):
        def hyperloss(reg, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            w_vect_0 = RS.randn(N_weights) * init_scales
            w_vect_final = train_z(cur_train_data, w_vect_0, reg)
            return loss_fun(w_vect_final, **cur_valid_data)
        hypergrad = grad(hyperloss)
        cur_reg = reg_0
        for i_hyper in range(N_meta_iter):
            if i_hyper % N_meta_thin == 0:
                tests_loss = hyperloss(cur_reg, i_hyper, train_data, tests_data)
                all_tests_loss.append(tests_loss)
                all_regs.append(cur_reg.copy())
                print "Hyper iter {0}, test loss {1}".format(i_hyper, all_tests_loss[-1])
                print "Cur_reg", np.mean(cur_reg)
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])
            raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
            constrained_grad = constrain_reg(raw_grad, constraint)
            print constrained_grad
            # cur_reg -= constrained_grad / np.abs(constrained_grad + 1e-8) * meta_alpha
            cur_reg -= constrained_grad * meta_alpha

        return cur_reg


    def new_hyperloss(reg, i_hyper, cur_train_data, cur_valid_data):
        RS = RandomState((seed, i_hyper, "hyperloss"))
        w_vect_0 = RS.randn(N_weights) * init_scales
        w_vect_final = train_z(cur_train_data, w_vect_0, reg)
        return loss_fun(w_vect_final, **cur_valid_data)

    # t_scale = [-1, 0, 1]
    # cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])
    # for s in t_scale:
    #     reg = np.ones(N_weights) * log_L2_init + s
    #     loss = new_hyperloss(reg, 0, *cur_split)
    #     print "Results: s= {0}, loss = {1}".format(s, loss)

    reg = np.ones(N_weights) * log_L2_init
    constraints = ['universal', 'layers', 'units']
    for i_top, (N_meta_iter, constraint) in enumerate(zip(all_N_meta_iter, constraints)):
        print "Top level iter {0}".format(i_top)
        reg = train_reg(reg, constraint, N_meta_iter, i_top)

    all_L2_regs = np.array(zip(*map(process_reg, all_regs)))
    return all_L2_regs, all_tests_loss
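
In Example #4 the constrain_reg projection is what enforces hyperparameter sharing: the raw per-weight hypergradient is averaged within each sharing group ('universal' over everything, 'layers' per layer, 'units' one value per row of the parsed weight block) and the bias entries are zeroed. A stand-alone NumPy illustration of the 'units' and 'layers' cases on one hypothetical 3x4 gradient block:

import numpy as np

# Hypothetical gradient block for one layer: 3 rows, 4 columns.
raw_grad_layer = np.arange(12.0).reshape(3, 4)

# 'units': one shared value per row, broadcast back over the row.
units_grad = np.mean(raw_grad_layer, axis=1, keepdims=True)       # shape (3, 1)
shared_units = np.broadcast_to(units_grad, raw_grad_layer.shape)

# 'layers': a single shared value for the whole block.
shared_layer = np.full_like(raw_grad_layer, np.mean(raw_grad_layer))

print(units_grad.ravel())     # [1.5 5.5 9.5]
print(shared_layer[0, 0])     # 5.5

After this projection every column within a row is identical, which is exactly the redundancy that process_reg strips by keeping only layer[:, 0].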
Example #5
def run():
    RS = RandomState((seed, "top_rs"))
    all_data = mnist.load_data_as_dict()
    train_data, tests_data = random_partition(all_data, RS, [N_train, N_tests])
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size
    init_scales = w_parser.new_vect(np.zeros(N_weights))
    for i in range(N_layers):
        init_scales[('weights', i)] = 1 / np.sqrt(layer_sizes[i])
        init_scales[('biases', i)] = 1.0
    init_scales = init_scales.vect

    def regularization(w_vect, reg):
        return np.dot(w_vect, w_vect * np.exp(reg))

    def constrain_reg(t_vect, name):
        all_r = w_parser.new_vect(t_vect)
        for i in range(N_layers):
            all_r[('biases', i)] = 0.0
        if name == 'universal':
            r_mean = np.mean(
                [np.mean(all_r[('weights', i)]) for i in range(N_layers)])
            for i in range(N_layers):
                all_r[('weights', i)] = r_mean
        elif name == 'layers':
            for i in range(N_layers):
                all_r[('weights', i)] = np.mean(all_r[('weights', i)])
        elif name == 'units':
            for i in range(N_layers):
                all_r[('weights', i)] = np.mean(all_r[('weights', i)],
                                                axis=1,
                                                keepdims=True)
        else:
            raise Exception
        return all_r.vect

    def process_reg(t_vect):
        # Remove the redundancy due to sharing regularization within units
        all_r = w_parser.new_vect(t_vect)
        new_r = np.zeros((0, ))
        for i in range(N_layers):
            layer = all_r[('weights', i)]
            assert np.all(layer[:, 0] == layer[:, 1])
            cur_r = layer[:, 0]
            new_r = np.concatenate((new_r, cur_r))
        return new_r

    def train_z(data, w_vect_0, reg):
        N_data = data['X'].shape[0]

        def primal_loss(w_vect, reg, i_primal, record_results=False):
            RS = RandomState((seed, i_primal, "primal"))
            idxs = RS.randint(N_data, size=batch_size)
            minibatch = dictslice(data, idxs)
            loss = loss_fun(w_vect, **minibatch)
            reg = regularization(w_vect, reg)
            if record_results and i_primal % N_thin == 0:
                print "Iter {0}: train: {1}".format(i_primal, getval(loss))
            return loss + reg

        return sgd(grad(primal_loss), reg, w_vect_0, alpha, beta, N_iters)

    all_regs, all_tests_loss = [], []

    def train_reg(reg_0, constraint, N_meta_iter, i_top):
        def hyperloss(reg, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            w_vect_0 = RS.randn(N_weights) * init_scales
            w_vect_final = train_z(cur_train_data, w_vect_0, reg)
            return loss_fun(w_vect_final, **cur_valid_data)

        hypergrad = grad(hyperloss)
        cur_reg = reg_0
        for i_hyper in range(N_meta_iter):
            if i_hyper % N_meta_thin == 0:
                tests_loss = hyperloss(cur_reg, i_hyper, train_data,
                                       tests_data)
                all_tests_loss.append(tests_loss)
                all_regs.append(cur_reg.copy())
                print "Hyper iter {0}, test loss {1}".format(
                    i_hyper, all_tests_loss[-1])
                print "Cur_reg", np.mean(cur_reg)
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            cur_split = random_partition(train_data, RS,
                                         [N_train - N_valid, N_valid])
            raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
            constrained_grad = constrain_reg(raw_grad, constraint)
            print constrained_grad
            # cur_reg -= constrained_grad / np.abs(constrained_grad + 1e-8) * meta_alpha
            cur_reg -= constrained_grad * meta_alpha

        return cur_reg

    def new_hyperloss(reg, i_hyper, cur_train_data, cur_valid_data):
        RS = RandomState((seed, i_hyper, "hyperloss"))
        w_vect_0 = RS.randn(N_weights) * init_scales
        w_vect_final = train_z(cur_train_data, w_vect_0, reg)
        return loss_fun(w_vect_final, **cur_valid_data)

    # t_scale = [-1, 0, 1]
    # cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])
    # for s in t_scale:
    #     reg = np.ones(N_weights) * log_L2_init + s
    #     loss = new_hyperloss(reg, 0, *cur_split)
    #     print "Results: s= {0}, loss = {1}".format(s, loss)

    reg = np.ones(N_weights) * log_L2_init
    constraints = ['universal', 'layers', 'units']
    for i_top, (N_meta_iter,
                constraint) in enumerate(zip(all_N_meta_iter, constraints)):
        print "Top level iter {0}".format(i_top)
        reg = train_reg(reg, constraint, N_meta_iter, i_top)

    all_L2_regs = np.array(zip(*map(process_reg, all_regs)))
    return all_L2_regs, all_tests_loss
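
All of these examples rely on data helpers random_partition and dictslice that are imported from elsewhere. Assuming the data is a dict of arrays keyed like data['X'] and sliced along the first axis, and that RS behaves like numpy.random.RandomState, minimal stand-ins could be sketched as follows (illustrative only, not the library's implementation):

import numpy as np

def dictslice_sketch(data, idxs):
    # Slice every array in the data dict along its first axis.
    return {k: v[idxs] for k, v in data.items()}

def random_partition_sketch(data, rs, sizes):
    # Shuffle all indices once, then cut consecutive chunks of the requested sizes.
    perm = rs.permutation(data['X'].shape[0])
    out, start = [], 0
    for size in sizes:
        out.append(dictslice_sketch(data, perm[start:start + size]))
        start += size
    return out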
Example #6
def run():
    RS = RandomState((seed, "top_rs"))
    all_data = mnist.load_data_as_dict()
    train_data, tests_data = random_partition(all_data, RS, [N_train, N_tests])
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size
    exact_metagrad = [np.array([0])] #just a placeholder

    def transform_weights(z_vect, transform):
        return z_vect * np.exp(transform)

    def regularization(z_vect):
        return np.dot(z_vect, z_vect) * np.exp(log_L2)

    def constrain_reg(t_vect, name):
        all_t = w_parser.new_vect(t_vect)
        for i in range(N_layers): #Don't regularize biases
            all_t[('biases', i)] = 0.0
        if name == 'universal': #One regularization hyperparameter for all weights
            #TODO: does computing means of means make sense? Not the same as just the mean of all.
            t_mean = np.mean([np.mean(all_t[('weights', i)])
                              for i in range(N_layers)])
            for i in range(N_layers):
                all_t[('weights', i)] = t_mean
        elif name == 'layers': #One regularization hyperparameter for each layer
            #TODO: changes the exact hypergradient norm, but not the DrMAD norm. Why??? DrMAD is already constrained?
            print t_vect.shape
            for i in range(N_layers):
                print "diff after contraining" + str(np.linalg.norm(all_t[('weights', i)] - np.mean(all_t[('weights', i)])))
                all_t[('weights', i)] = np.mean(all_t[('weights', i)])
        elif name == 'units':
            print t_vect.shape #44860; this is correct
            for i in range(N_layers):
                print "weights "+ str(i) + ": " + str(np.linalg.norm(np.mean(all_t[('weights', i)], axis=1, keepdims=True) - np.mean(all_t[('weights', i)], axis=1, keepdims=True)))
            #for i in range(N_layers):
                #TODO: This was the same as layer-wise
                #all_t[('weights', i)] = np.mean(all_t[('weights', i)], axis=1, keepdims=True)
        else:
            raise Exception
        return all_t.vect

    def process_transform(t_vect):
        # Remove the redundancy due to sharing transformations within units
        all_t = w_parser.new_vect(t_vect)
        new_t = np.zeros((0,))
        for i in range(N_layers):
            layer = all_t[('weights', i)]
            assert np.all(layer[:, 0] == layer[:, 1])
            cur_t = log_L2 - 2 * layer[:, 0]
            new_t = np.concatenate((new_t, cur_t))
        return new_t
        
    #TODO: make sure the exact_metagrad gets passed by reference
    def train_z(data, z_vect_0, transform, exact_metagrad):
        N_data = data['X'].shape[0]
        
        def primal_loss(z_vect, transform, i_primal, record_results=False): #exact_metagrad=exact_metagrad2, record_results=False):
            RS = RandomState((seed, i_primal, "primal"))
            idxs = RS.randint(N_data, size=batch_size)
            minibatch = dictslice(data, idxs)
            w_vect = transform_weights(z_vect, transform)
            loss = loss_fun(w_vect, **minibatch)
            reg = regularization(z_vect)
            if record_results and i_primal % N_thin == 0:
                print "Iter {0}: train: {1}".format(i_primal, getval(loss))
            return loss + reg
        return sgd(grad(primal_loss), transform, z_vect_0, exact_metagrad, alpha, beta, N_iters)

    all_transforms, all_tests_loss, all_tests_rates, all_avg_regs = [], [], [], []
    def train_reg(reg_0, constraint, N_meta_iter, i_top, exact_metagrad):
        def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data, cur_tests_data, exact_metagrad):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform, exact_metagrad)
            w_vect_final = transform_weights(z_vect_final, transform)
            #TODO: print/store losses and error rates here
            print "Training loss (unregularized) = " +str(getval(loss_fun(w_vect_final, **cur_train_data)))
            print "Validation loss = " +str(getval(loss_fun(w_vect_final, **cur_valid_data)))
            print "Test loss = " +str(getval(loss_fun(w_vect_final, **tests_data)))
            print "Training error = "+ str(getval(frac_err(w_vect_final, **cur_train_data)))
            print "Validation error = "+ str(getval(frac_err(w_vect_final, **cur_valid_data)))
            print "Test error = "+ str(getval(frac_err(w_vect_final, **tests_data)))
            return loss_fun(w_vect_final, **cur_valid_data)
        hypergrad = grad(hyperloss) #No chain rule here

            
        '''def error_rate(transform, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform) #TODO: recomputing path?
            w_vect_final = transform_weights(z_vect_final, transform)
            return frac_err(w_vect_final, **cur_valid_data)'''

        cur_reg = reg_0
        for i_hyper in range(N_meta_iter):
            print "Hyper iter "+ str(i_hyper)
            """if i_hyper % N_meta_thin == 0:
                test_rate = error_rate(cur_reg, i_hyper, train_data, tests_data)
                all_tests_rates.append(test_rate)
                all_transforms.append(cur_reg.copy())
                all_avg_regs.append(np.mean(cur_reg))
                print "Hyper iter {0}, error rate {1}".format(i_hyper, all_tests_rates[-1])
                print "Cur_transform", np.mean(cur_reg)"""
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            #cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid]) #cur_train_data, cur_valid_data
            #raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
            cur_train_data, cur_valid_data = random_partition(train_data, RS, [N_train - N_valid, N_valid])
            raw_grad = hypergrad(cur_reg, i_hyper, cur_train_data, cur_valid_data, tests_data, exact_metagrad)
            #print "before constraining grad"
            constrained_grad = constrain_reg(raw_grad, constraint)
            # TODO: can put exact hypergradient here, using constraint
            #print "after constraining grad, before constraining exact"
            # TODO: DrMAD norm matches after constraining, but not exact norm?? Why???
            # This one is about 4x larger than constrained one
            print np.linalg.norm(raw_grad)
            print np.linalg.norm(exact_metagrad[0])
            constrained_exact_grad = constrain_reg(exact_metagrad[0], constraint)
            #print "after constraining exact"
            # TODO: compute statistics
            # TODO: sometimes negative???
            print("cosine of angle between DrMAD and exact = "
                +str(np.dot(constrained_grad, constrained_exact_grad)/(np.linalg.norm(constrained_grad)*np.linalg.norm(constrained_exact_grad))))
            print("cosine of angle between signs of DrMAD and exact = "
                +str(np.dot(np.sign(constrained_grad), np.sign(constrained_exact_grad))/len(constrained_grad)))
            print("DrMAD norm = "+ str(np.linalg.norm(constrained_grad)))
            print("Exact norm = "+ str(np.linalg.norm(constrained_exact_grad)))
            cur_reg -= np.sign(constrained_grad) * meta_alpha #TODO: signs of gradient...
            #TODO: momentum
        return cur_reg

    reg = np.zeros(N_weights)+0.2
    constraints = ['universal', 'layers', 'units']
    for i_top, (N_meta_iter, constraint) in enumerate(zip(all_N_meta_iter, constraints)):
        print "Top level iter {0}".format(i_top), constraint
        reg = train_reg(reg, constraint, N_meta_iter, i_top, exact_metagrad)

    all_L2_regs = np.array(zip(*map(process_transform, all_transforms)))
    return all_L2_regs, all_tests_rates, all_avg_regs
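
Examples #3 and #6 update the hyperparameters with the sign of the constrained hypergradient rather than its raw value, a crude normalization for the badly scaled DrMAD estimate flagged in the TODOs, and both leave momentum as a TODO. A one-line version of that rule, plus a hypothetical momentum variant (function names and meta_beta are illustrative):

import numpy as np

def sign_meta_step(cur_reg, constrained_grad, meta_alpha):
    # Sign-based hyperparameter update used in Examples #3 and #6.
    return cur_reg - np.sign(constrained_grad) * meta_alpha

def sign_meta_step_momentum(cur_reg, velocity, constrained_grad,
                            meta_alpha, meta_beta=0.9):
    # Hypothetical momentum variant; the originals only mention it in a TODO.
    velocity = meta_beta * velocity + np.sign(constrained_grad)
    return cur_reg - meta_alpha * velocity, velocity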