Example No. 1
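# Meta-optimizes the initial weight distribution: fixed uniform samples are mapped through a
# piecewise-linear inverse CDF defined by `bins`; each meta-iteration runs SGD, backpropagates
# the final training loss to the bin locations, and takes a gradient step on them.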
def run():
    train_images, train_labels, _, _, _ = load_data()
    train_images = train_images[:N_data, :]
    train_labels = train_labels[:N_data, :]
    batch_idxs = BatchList(N_data, batch_size)
    iter_per_epoch = len(batch_idxs)
    N_weights, _, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg)
    def indexed_loss_fun(w, idxs):
        return loss_fun(w, X=train_images[idxs], T=train_labels[idxs])

    log_alphas = np.full(N_iters, log_alpha_0)
    betas      = np.full(N_iters, beta_0)
    npr.seed(2)
    V0 = npr.randn(N_weights) * velocity_scale
    #W0 = npr.randn(N_weights) * np.exp(log_param_scale)
    bins = np.linspace(-1,1,N_bins) * np.exp(log_param_scale)
    W_uniform = npr.rand(N_weights)
    output = []
    for i in range(N_meta_iter):
        print "Meta iteration {0}".format(i)
        W0, dW_dbins = bininvcdf(W_uniform, bins)
        results = sgd(indexed_loss_fun, batch_idxs, N_iters,
                      W0, V0, np.exp(log_alphas), betas, record_learning_curve=True)
        dL_dx = results['d_x']
        dL_dbins = np.dot(dL_dx, dW_dbins)
        learning_curve = results['learning_curve']
        output.append((learning_curve, bins))
        bins = bins - dL_dbins * bin_stepsize
        bins[[0,-1]] = bins[[0,-1]] - dL_dbins[[0,1]] * bin_stepsize
        bins.sort()  # Sort in place.

    return output
Example No. 2
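# One job of a cluster sweep: picks its initial log step size from all_log_alpha_0 by job index,
# then for each training length in all_N_iters runs SGD and records the final loss and the
# corresponding step-size hypergradient.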
def run(oiter):
    # ----- Variable for this run -----
    log_alpha_0 = all_log_alpha_0[oiter]

    print "Running job {0} on {1}".format(oiter + 1, socket.gethostname())
    train_images, train_labels, _, _, _ = load_data()
    train_images = train_images[:N_data, :]
    train_labels = train_labels[:N_data, :]
    batch_idxs = BatchList(N_data, batch_size)
    iter_per_epoch = len(batch_idxs)
    N_weights, _, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg)
    def indexed_loss_fun(w, idxs):
        return loss_fun(w, X=train_images[idxs], T=train_labels[idxs])

    V0 = npr.randn(N_weights) * velocity_scale
    losses = []
    d_losses = []
    alpha_0 = np.exp(log_alpha_0)
    for N_iters in all_N_iters:
        alphas = np.full(N_iters, alpha_0)
        betas = np.full(N_iters, beta_0)
        npr.seed(1)
        W0 = npr.randn(N_weights) * np.exp(log_param_scale)
        results = sgd(indexed_loss_fun, batch_idxs, N_iters, W0, V0, alphas, betas)
        losses.append(results['loss_final'])
        d_losses.append(d_log_loss(alpha_0, results['d_alphas']))

    return losses, d_losses
Example No. 3
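# Sweeps over the fixed step sizes in log_stepsizes; for each one runs SGD from a fresh random
# initialization and records the final loss and the corresponding step-size hypergradient.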
def run():
    train_images, train_labels, _, _, _ = load_data()
    train_images = train_images[:N_data, :]
    train_labels = train_labels[:N_data, :]
    batch_idxs = BatchList(N_data, batch_size)

    parser, _, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = len(parser.vect)
    def indexed_loss_fun(w, idxs):
        return loss_fun(w, X=train_images[idxs], T=train_labels[idxs], L2_reg=L2_reg)

    losses = []
    d_losses = []
    for log_alpha_0 in log_stepsizes:
        npr.seed(0)
        V0 = npr.randn(N_weights) * velocity_scale
        alpha_0 = np.exp(log_alpha_0)
        alphas = np.full(N_iters, alpha_0)
        betas = np.full(N_iters, beta_0)
        W0 = npr.randn(N_weights) * np.exp(log_param_scale)
        results = sgd(indexed_loss_fun, batch_idxs, N_iters, W0, V0, alphas, betas)
        losses.append(results['loss_final'])
        d_losses.append(d_log_loss(alpha_0, results['d_alphas']))

    return losses, d_losses
Example No. 4
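# Grid over training length (all_N_iters) and initial parameter scale (all_log_param_scale):
# runs SGD for each combination, records the final loss and a gradient with respect to the
# initial weights, and pickles everything to results.pkl.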
def run():
    train_images, train_labels, _, _, _ = load_data()
    train_images = train_images[:N_data, :]
    train_labels = train_labels[:N_data, :]
    batch_idxs = BatchList(N_data, batch_size)
    iter_per_epoch = len(batch_idxs)
    N_weights, _, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg)
    def indexed_loss_fun(w, idxs):
        return loss_fun(w, X=train_images[idxs], T=train_labels[idxs])

    V0 = npr.randn(N_weights) * velocity_scale
    losses = []
    d_losses = []
    for N_iters in all_N_iters:
        alphas = np.full(N_iters, alpha_0)
        betas = np.full(N_iters, beta_0)
        loss_curve = []
        d_loss_curve = []
        for log_param_scale in all_log_param_scale:
            print "log_param_scale {0}, N_iters {1}".format(log_param_scale, N_iters)
            npr.seed(1)
            W0 = npr.randn(N_weights) * np.exp(log_param_scale)
            results = sgd(indexed_loss_fun, batch_idxs, N_iters, W0, V0, alphas, betas)
            loss_curve.append(results['loss_final'])
            d_loss_curve.append(d_log_loss(W0, results['d_x']))
        losses.append(loss_curve)
        d_losses.append(d_loss_curve)

    with open('results.pkl', 'wb') as f:
        pickle.dump((losses, d_losses), f)
Example No. 5
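# Tunes the per-iteration learning-rate schedule by gradient descent: each meta-iteration runs
# SGD, converts the returned step-size hypergradient to log-space, and takes a meta step on
# log_alphas.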
def run():
    train_images, train_labels, _, _, _ = load_data()
    train_images = train_images[:N_data, :]
    train_labels = train_labels[:N_data, :]
    batch_idxs = BatchList(N_data, batch_size)
    iter_per_epoch = len(batch_idxs)
    N_weights, _, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg)
    def indexed_loss_fun(w, idxs):
        return loss_fun(w, X=train_images[idxs], T=train_labels[idxs])

    log_alphas = np.full(N_iters, log_alpha_0)
    betas      = np.full(N_iters, beta_0)
    npr.seed(1)
    V0 = npr.randn(N_weights) * velocity_scale
    W0 = npr.randn(N_weights) * np.exp(log_param_scale)
    output = []
    for i in range(N_meta_iter):
        print "Meta iteration {0}".format(i)
        results = sgd(indexed_loss_fun, batch_idxs, N_iters,
                      W0, V0, np.exp(log_alphas), betas, record_learning_curve=True)
        learning_curve = results['learning_curve']
        d_log_alphas = np.exp(log_alphas) * results['d_alphas']
        output.append((learning_curve, log_alphas, d_log_alphas))
        log_alphas = log_alphas - meta_alpha * d_log_alphas

    return output
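The example above tunes the whole learning-rate schedule by differentiating the final training loss through the unrolled SGD run. The snippet below is a minimal, self-contained sketch of that idea and is not part of the repository these examples come from: it uses a toy quadratic loss and a finite-difference estimate in place of the repository's reverse-mode hypergradient.

import numpy as np

def final_loss(log_alpha, w0=3.0, n_iters=50):
    # Unroll plain gradient descent on the toy loss L(w) = 0.5 * w**2 (gradient = w).
    w = w0
    for _ in range(n_iters):
        w = w - np.exp(log_alpha) * w
    return 0.5 * w ** 2

def hypergrad(log_alpha, eps=1e-5):
    # Finite-difference stand-in for d(final loss) / d(log step size).
    return (final_loss(log_alpha + eps) - final_loss(log_alpha - eps)) / (2 * eps)

log_alpha, meta_alpha = np.log(0.01), 0.5
for i in range(20):
    log_alpha -= meta_alpha * hypergrad(log_alpha)   # Meta step on the log step size.
print "Learned step size:", np.exp(log_alpha)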
Example No. 6
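# Learns a synthetic training set: the inner SGD fits the network to fake_data, the outer
# objective is the loss on real training images, and each meta-iteration moves the fake data
# one step along the meta-gradient.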
def run():
    train_images, train_labels, _, _, _ = load_data(normalize=True)
    train_images = train_images[:N_real_data, :]
    train_labels = train_labels[:N_real_data, :]
    batch_idxs = BatchList(N_fake_data, batch_size)
    parser, _, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg, return_parser=True)
    N_weights = parser.N

    fake_data = npr.randn(*(train_images[:N_fake_data, :].shape)) * init_fake_data_scale
    fake_labels = one_hot(np.array(range(N_fake_data)) % N_classes, N_classes)  # One of each.

    def indexed_loss_fun(x, meta_params, idxs):   # To be optimized by SGD.
        return loss_fun(x, X=meta_params[idxs], T=fake_labels[idxs])
    def meta_loss_fun(x):                         # To be optimized in the outer loop.
        return loss_fun(x, X=train_images, T=train_labels)
    log_alphas = np.full(N_iters, log_alpha_0)
    betas      = np.full(N_iters, beta_0)
    npr.seed(0)
    v0 = npr.randn(N_weights) * velocity_scale
    x0 = npr.randn(N_weights) * np.exp(log_param_scale)

    output = []
    for i in range(N_meta_iter):
        results = sgd2(indexed_loss_fun, meta_loss_fun, batch_idxs, N_iters,
                       x0, v0, np.exp(log_alphas), betas, fake_data)
        learning_curve = results['learning_curve']
        validation_loss = results['M_final']
        output.append((learning_curve, validation_loss, fake_data))
        fake_data -= results['dMd_meta'] * data_stepsize   # Update data with one gradient step.
        print "Meta iteration {0} Valiation loss {1}".format(i, validation_loss)
    return output
Example No. 7
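# Omniglot setup: per-weight log-scale and offset hyperparameters transform the weights before
# the loss; hyperloss trains on one alphabet and evaluates on held-out points, and this run only
# records the train/validation meta-losses at the initial hyperparameters.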
def run(superparams):
    alpha, log_scale_init, offset_init_std = superparams
    RS = RandomState((seed, "top_rs"))
    all_alphabets = omniglot.load_data()
    RS.shuffle(all_alphabets)
    train_alphabets = all_alphabets[:-N_test_alphabets]
    tests_alphabets = all_alphabets[-N_test_alphabets:]
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size
    hyperparams_0 = VectorParser()
    hyperparams_0['log_scale']  = log_scale_init * np.ones(N_weights)
    hyperparams_0['offset'] = offset_init_std * RS.randn(N_weights)

    def reg_loss_fun(W, data, hyperparam_vect, reg_penalty):
        hyperparams = hyperparams_0.new_vect(hyperparam_vect)
        Z = np.exp(hyperparams['log_scale']) * W + hyperparams['offset']
        return loss_fun(Z, **data) + np.dot(W, W) * reg_penalty

    def hyperloss(hyperparam_vect, i_hyper, alphabets, verbose=True, report_train_loss=False):
        RS = RandomState((seed, i_hyper, "hyperloss"))        
        alphabet = shuffle_alphabet(RS.choice(alphabets), RS)
        N_train = alphabet['X'].shape[0] - N_valid_dpts
        train_data = dictslice(alphabet, slice(None, N_train))
        if report_train_loss:
            valid_data = dictslice(alphabet, slice(None, N_valid_dpts))
        else:
            valid_data = dictslice(alphabet, slice(N_train, None))
        def primal_loss(W, hyperparam_vect, i_primal, reg_penalty=True):
            RS = RandomState((seed, i_hyper, i_primal))
            idxs = RS.permutation(N_train)[:batch_size]
            minibatch = dictslice(train_data, idxs)
            loss = reg_loss_fun(W, minibatch, hyperparam_vect, reg_penalty)
            if verbose and i_primal % 30 == 0:
                print "Iter {0}, loss, {1}".format(i_primal, getval(loss))
                
            return loss

        W0 = np.zeros(N_weights)
        W_final = sgd(grad(primal_loss), hyperparam_vect, W0, alpha, beta, N_iters, callback=None)
        return reg_loss_fun(W_final, valid_data, hyperparam_vect, reg_penalty=False)

    results = defaultdict(list)
    def record_results(hyperparam_vect, i_hyper, g):
        # print "Meta iter {0}. Recording results".format(i_hyper)
        RS = RandomState((seed, i_hyper, "evaluation"))
        new_seed = RS.int32()
        def loss_fun(alphabets, report_train_loss):
            return np.mean([hyperloss(hyperparam_vect, new_seed, alphabets=alphabets,
                                      verbose=False, report_train_loss=report_train_loss)
                            for i in range(N_alphabets_eval)])
        cur_hyperparams = hyperparams_0.new_vect(hyperparam_vect.copy())
        if i_hyper % N_hyper_thin == 0:
            # Storing O(N_weights) is a bit expensive so we thin it out and store in low precision
            for field in cur_hyperparams.names:
                results[field].append(cur_hyperparams[field].astype(np.float16))
        results['train_loss'].append(loss_fun(train_alphabets, report_train_loss=True))
        results['valid_loss'].append(loss_fun(train_alphabets, report_train_loss=False))

    record_results(hyperparams_0.vect, 0, None)
    return [results['train_loss'][0], results['valid_loss'][0]]
Example No. 8
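# Meta-optimizes log L2 penalties, initial parameter scales, and per-iteration step sizes and
# momenta with RMSprop on the hypergradient; note the meta-objective here is the training loss
# rather than the validation loss.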
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg']      = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas']      = np.full(N_iters, init_log_alphas)

    hyperparams['invlogit_betas']  = np.full(N_iters, init_invlogit_betas)
    #fixed_hyperparams = VectorParser()
    #fixed_hyperparams['invlogit_betas']  = np.full(N_iters, init_invlogit_betas)

    # TODO: memoize
    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            seed = i_hyper * 10**6 + i_iter   # Deterministic seed needed for backwards pass.
            rs = npr.RandomState(seed)
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve = []
        def callback(x, i_iter):
            if i_iter % N_batches == 0:
                learning_curve.append(loss_fun(x, **train_data))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= npr.RandomState(global_seed + i_hyper).randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas  = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg']))
        V0 = np.zeros(W0.size)
        W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), callback)
        return W_opt, learning_curve

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        # return loss_fun(W_opt, **valid_data)
        return loss_fun(W_opt, **train_data)

    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    def meta_callback(hyperparam_vect, i_hyper):
        print "Meta Epoch {0}".format(i_hyper)
        x, learning_curve = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve)

    final_result = rms_prop(hyperloss_grad, hyperparams.vect,
                            meta_callback, N_meta_iter, meta_alpha, gamma=0.0)
    parser.vect = None # No need to pickle zeros
    return meta_results, parser
Example No. 9
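# Learns one L2 penalty per weight: the outer objective is the validation loss plus a Gaussian
# hyperprior on the penalties, and each meta-iteration takes a step on the log penalties while
# also reporting test loss.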
def run():
    (train_images, train_labels), (val_images, val_labels), (test_images, test_labels) = load_data_subset(
        N_train_data, N_val_data, N_test_data
    )

    batch_idxs = BatchList(N_train_data, batch_size)
    parser, _, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = parser.N

    hyperparser = WeightsParser()
    hyperparser.add_weights("log_L2_reg", (N_weights,))
    metas = np.zeros(hyperparser.N)
    print "Number of hyperparameters to be trained:", hyperparser.N

    npr.seed(0)
    hyperparser.set(metas, "log_L2_reg", log_L2_reg_scale + np.ones(N_weights))

    def indexed_loss_fun(x, meta_params, idxs):  # To be optimized by SGD.
        L2_reg = np.exp(hyperparser.get(meta_params, "log_L2_reg"))
        return loss_fun(x, X=train_images[idxs], T=train_labels[idxs], L2_reg=L2_reg)

    def meta_loss_fun(x, meta_params):  # To be optimized in the outer loop.
        L2_reg = np.exp(hyperparser.get(meta_params, "log_L2_reg"))
        log_prior = -meta_L2_reg * np.dot(L2_reg.ravel(), L2_reg.ravel())
        return loss_fun(x, X=val_images, T=val_labels) - log_prior

    def test_loss_fun(x):  # To measure actual performance.
        return loss_fun(x, X=test_images, T=test_labels)

    log_alphas = np.full(N_iters, log_alpha_0)
    betas = np.full(N_iters, beta_0)

    v0 = npr.randn(N_weights) * velocity_scale
    x0 = npr.randn(N_weights) * np.exp(log_param_scale)

    output = []
    for i in range(N_meta_iter):
        results = sgd2(indexed_loss_fun, meta_loss_fun, batch_idxs, N_iters, x0, v0, np.exp(log_alphas), betas, metas)

        learning_curve = results["learning_curve"]
        validation_loss = results["M_final"]
        test_loss = test_loss_fun(results["x_final"])
        output.append(
            (
                learning_curve,
                validation_loss,
                test_loss,
                parser.get(results["x_final"], (("weights", 0))),
                parser.get(np.exp(hyperparser.get(metas, "log_L2_reg")), (("weights", 0))),
            )
        )
        metas -= results["dMd_meta"] * meta_stepsize
        print "Meta iteration {0} Valiation loss {1} Test loss {2}".format(i, validation_loss, test_loss)
    return output
Example No. 10
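# No meta-updates here: computes the hypergradient of the training meta-loss at the initial
# hyperparameters and accumulates it over N_meta_iter random restarts, returning both the
# initial and the accumulated hypergradient.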
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas']      = np.full((N_iters, N_weight_types), init_log_alphas)
    hyperparams['invlogit_betas']  = np.full((N_iters, N_weight_types), init_invlogit_betas)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas  = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                           parser, callback=callback)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **train_data)
    hyperloss_grad = grad(hyperloss)

    initial_hypergrad = hyperloss_grad( hyperparams.vect, 0)
    parsed_init_hypergrad = hyperparams.new_vect(initial_hypergrad.copy())
    avg_hypergrad = initial_hypergrad.copy()
    for i in xrange(1, N_meta_iter):
        avg_hypergrad += hyperloss_grad( hyperparams.vect, i)
        print i
    parsed_avg_hypergrad = hyperparams.new_vect(avg_hypergrad)

    parser.vect = None # No need to pickle zeros
    return parser, parsed_init_hypergrad, parsed_avg_hypergrad
Example No. 11
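# Variant of the inverse-CDF experiment with a separate set of bins per parameter block, so each
# layer (and bias vector) learns its own initial weight distribution.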
def run():
    train_images, train_labels, _, _, _ = load_data(normalize=True)
    train_images = train_images[:N_data, :]
    train_labels = train_labels[:N_data, :]
    batch_idxs = BatchList(N_data, batch_size)
    iter_per_epoch = len(batch_idxs)
    parser, _, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg, return_parser=True)
    N_weights = parser.N
    def indexed_loss_fun(w, idxs):
        return loss_fun(w, X=train_images[idxs], T=train_labels[idxs])
    log_alphas = np.full(N_iters, log_alpha_0)
    betas      = np.full(N_iters, beta_0)
    npr.seed(2)
    V0 = npr.randn(N_weights) * velocity_scale
    #W0 = npr.randn(N_weights) * np.exp(log_param_scale)

    bindict = {k : np.linspace(-1,1,N_bins) * np.exp(log_param_scale)  # Different cdf per layer.
                   for k, v in parser.idxs_and_shapes.iteritems()}
    output = []
    for i in range(N_meta_iter):
        print "Meta iteration {0}".format(i)
        #X0, dX_dbins = bininvcdf(W_uniform, bins)
        X_uniform = npr.rand(N_weights)  # Weights are uniform passed through an inverse cdf.
        X0 = np.zeros(N_weights)
        dX_dbins = {}
        for k, cur_bins in bindict.iteritems():
            cur_slice, cur_shape = parser.idxs_and_shapes[k]
            cur_xs = X_uniform[cur_slice]
            cur_X0, cur_dX_dbins = bininvcdf(cur_xs, cur_bins)
            X0[cur_slice] = cur_X0
            dX_dbins[k] = cur_dX_dbins
        results = sgd(indexed_loss_fun, batch_idxs, N_iters,
                      X0, V0, np.exp(log_alphas), betas, record_learning_curve=True)
        dL_dx = results['d_x']

        learning_curve = results['learning_curve']
        output.append((learning_curve, bindict))

        # Update bins with one gradient step.
        for k, bins in bindict.iteritems():
            dL_dbins = np.dot(parser.get(dL_dx, k).flatten(), dX_dbins[k])
            bins = bins - dL_dbins * bin_stepsize
            bins[[0,-1]] = bins[[0,-1]] - dL_dbins[[0,1]] * bin_stepsize
            bindict[k] = np.sort(bins)
        bindict = bindict.copy()

    return output
Example No. 12
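# Learns ten synthetic training examples (one per class) starting from all-zero images; each
# meta-iteration moves the fake data one step along the meta-gradient of the real-data loss.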
def run():
    train_images, train_labels, _, _, _ = load_data(normalize=True)
    train_images = train_images[:N_real_data, :]
    train_labels = train_labels[:N_real_data, :]
    batch_idxs = BatchList(N_fake_data, batch_size)
    parser, _, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg, return_parser=True)
    N_weights = parser.N

    # fake_data = npr.randn(*(train_images[:N_fake_data, :].shape))
    fake_data = np.zeros(train_images[:N_fake_data, :].shape)
    one_hot = lambda x, K: np.array(x[:, None] == np.arange(K)[None, :], dtype=int)
    fake_labels = one_hot(np.array(range(0, 10)), 10)  # One of each label.

    def indexed_loss_fun(x, meta_params, idxs):  # To be optimized by SGD.
        return loss_fun(x, X=meta_params[idxs], T=fake_labels[idxs])

    def meta_loss_fun(x):  # To be optimized in the outer loop.
        return loss_fun(x, X=train_images, T=train_labels)

    log_alphas = np.full(N_iters, log_alpha_0)
    betas = np.full(N_iters, beta_0)
    npr.seed(0)
    v0 = npr.randn(N_weights) * velocity_scale
    x0 = npr.randn(N_weights) * np.exp(log_param_scale)

    output = []
    for i in range(N_meta_iter):
        print "Meta iteration {0}".format(i)
        results = sgd2(
            indexed_loss_fun, meta_loss_fun, batch_idxs, N_iters, x0, v0, np.exp(log_alphas), betas, fake_data
        )

        learning_curve = results["learning_curve"]
        output.append((learning_curve, fake_data))
        fake_data -= results["dMd_meta"] * data_stepsize  # Update data with one gradient step.

    return output
Example No. 13
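# Meta-optimizes initial parameter scales, per-iteration step sizes and momenta, and per-layer
# rescaling schedules with Adam; the meta-objective is the training loss, and the callback logs
# losses, error rates, and meta-gradient statistics.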
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams["log_param_scale"] = np.full(N_weight_types, init_log_param_scale)
    hyperparams["log_alphas"] = np.full((N_iters, N_weight_types), init_log_alphas)
    hyperparams["invlogit_betas"] = np.full((N_iters, N_weight_types), init_invlogit_betas)
    for name in parser.names:
        hyperparams[("rescale", name)] = np.full(N_iters, init_rescales)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams["log_L2_reg"] = np.full(N_weight_types, init_log_L2_reg)

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data["X"][idxs], train_data["T"][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)

        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict["learning_curve"].append(loss_fun(x, **train_data))
                learning_curve_dict["grad_norm"].append(np.linalg.norm(g))
                learning_curve_dict["weight_norm"].append(np.linalg.norm(x))
                learning_curve_dict["velocity_norm"].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams["log_param_scale"]))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams["log_alphas"])
        betas = logit(cur_hyperparams["invlogit_betas"])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams["log_L2_reg"]))
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), parser, callback=callback)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **train_data)

    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]

    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results["train_loss"].append(loss_fun(x, **train_data))
        meta_results["valid_loss"].append(loss_fun(x, **valid_data))
        meta_results["tests_loss"].append(loss_fun(x, **tests_data))
        meta_results["test_err"].append(frac_err(x, **tests_data))
        meta_results["learning_curves"].append(learning_curve_dict)
        if metagrad is not None:
            meta_results["meta_grad_magnitude"].append(np.linalg.norm(metagrad))
            meta_results["meta_grad_angle"].append(
                np.dot(old_metagrad[0], metagrad) / (np.linalg.norm(metagrad) * np.linalg.norm(old_metagrad[0]))
            )
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper,
            meta_results["train_loss"][-1],
            meta_results["valid_loss"][-1],
            meta_results["train_loss"][-1],
            meta_results["test_err"][-1],
        )

    final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback, N_meta_iter, meta_alpha)
    meta_callback(final_result, N_meta_iter)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser
Example No. 14
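# Learns per-weight L2 penalties in three stages of increasingly fine sharing ('universal',
# 'layers', 'units'); each stage descends the hypergradient of the loss on a held-out split of
# the training data.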
def run():
    RS = RandomState((seed, "top_rs"))
    data = loadData.loadMnist()
    train_data, tests_data = loadData.load_data_as_dict(data, classNum)
    train_data = random_partition(train_data, RS, [N_train])[0]
    tests_data = random_partition(tests_data, RS, [N_tests])[0]

    print "training samples: {0}, testing samples: {1}".format(N_train, N_tests)


    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size
    init_scales = w_parser.new_vect(np.zeros(N_weights))
    for i in range(N_layers):
        init_scales[('weights', i)] = 1 / np.sqrt(layer_sizes[i])
        init_scales[('biases',  i)] = 1.0
    init_scales = init_scales.vect

    def regularization(w_vect, reg):
        return np.dot(w_vect, w_vect * np.exp(reg))



    def constrain_reg(t_vect, name):
        all_r = w_parser.new_vect(t_vect)
        for i in range(N_layers):
            all_r[('biases', i)] = 0.0
        if name == 'universal':
            r_mean = np.mean([np.mean(all_r[('weights', i)]) for i in range(N_layers)])
            for i in range(N_layers):
                all_r[('weights', i)] = r_mean
        elif name == 'layers':
            for i in range(N_layers):
                all_r[('weights', i)] = np.mean(all_r[('weights', i)])
        elif name == 'units':
            for i in range(N_layers):
                all_r[('weights', i)] = np.mean(all_r[('weights', i)], axis=1, keepdims=True)
        else:
            raise Exception
        return all_r.vect

    def process_reg(t_vect):
        # Remove the redundancy due to sharing regularization within units
        all_r = w_parser.new_vect(t_vect)
        new_r = np.zeros((0,))
        for i in range(N_layers):
            layer = all_r[('weights', i)]
            assert np.all(layer[:, 0] == layer[:, 1])
            cur_r = layer[:, 0]
            new_r = np.concatenate((new_r, cur_r))
        return new_r

    def train_z(data, w_vect_0, reg):
        N_data = data['X'].shape[0]
        def primal_loss(w_vect, reg, i_primal, record_results=False):
            RS = RandomState((seed, i_primal, "primal"))
            idxs = RS.randint(N_data, size=batch_size)
            minibatch = dictslice(data, idxs)
            loss = loss_fun(w_vect, **minibatch)
            reg = regularization(w_vect, reg)
            if record_results and i_primal % N_thin == 0:
                print "Iter {0}: train: {1}".format(i_primal, getval(loss))
            return loss + reg
        return sgd(grad(primal_loss), reg, w_vect_0, alpha, beta, N_iters)

    all_regs, all_tests_loss = [], []
    def train_reg(reg_0, constraint, N_meta_iter, i_top):
        def hyperloss(reg, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            w_vect_0 = RS.randn(N_weights) * init_scales
            w_vect_final = train_z(cur_train_data, w_vect_0, reg)
            return loss_fun(w_vect_final, **cur_valid_data)
        hypergrad = grad(hyperloss)
        cur_reg = reg_0
        for i_hyper in range(N_meta_iter):
            if i_hyper % N_meta_thin == 0:
                tests_loss = hyperloss(cur_reg, i_hyper, train_data, tests_data)
                all_tests_loss.append(tests_loss)
                all_regs.append(cur_reg.copy())
                print "Hyper iter {0}, test loss {1}".format(i_hyper, all_tests_loss[-1])
                print "Cur_reg", cur_reg
                # print "Cur_reg", np.mean(cur_reg)
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])
            # print("calculate hypergradients")
            raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
            constrained_grad = constrain_reg(raw_grad, constraint)
            # print "constrained_grad",constrained_grad
            print "\n"
            # cur_reg -= constrained_grad / np.abs(constrained_grad + 1e-8) * meta_alpha
            cur_reg -= constrained_grad * meta_alpha
            # cur_reg -= np.sign(constrained_grad) * meta_alpha

        return cur_reg


    def new_hyperloss(reg, i_hyper, cur_train_data, cur_valid_data):
        RS = RandomState((seed, i_hyper, "hyperloss"))
        w_vect_0 = RS.randn(N_weights) * init_scales
        w_vect_final = train_z(cur_train_data, w_vect_0, reg)
        return loss_fun(w_vect_final, **cur_valid_data)

    # t_scale = [-1, 0, 1]
    # cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])
    # for s in t_scale:
    #     reg = np.ones(N_weights) * log_L2_init + s
    #     loss = new_hyperloss(reg, 0, *cur_split)
    #     print "Results: s= {0}, loss = {1}".format(s, loss)

    reg = np.ones(N_weights) * log_L2_init

    constraints = ['universal', 'layers', 'units']
    for i_top, (N_meta_iter, constraint) in enumerate(zip(all_N_meta_iter, constraints)):
        print "Top level iter {0}".format(i_top)
        reg = train_reg(reg, constraint, N_meta_iter, i_top)

    all_L2_regs = np.array(zip(*map(process_reg, all_regs)))
    return all_L2_regs, all_tests_loss
Example No. 15
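# DrMAD experiment: compares approximate hypergradients from sgd_meta_only_mad against exact
# ones from sgd_meta_only on a per-weight regularization problem, plotting losses, error rates,
# gradient angles, and norms at every meta-iteration.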
def run():
    RS = RandomState((seed, "top_rs"))
    all_data = mnist.load_data_as_dict()
    train_data, tests_data = random_partition(all_data, RS, [N_train, N_tests])
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size

    def transform_weights(z_vect, transform): #TODO: isn't this a scale transformation?
        return z_vect * np.exp(transform)

    def regularization(z_vect):
        return np.dot(z_vect, z_vect) * np.exp(log_L2)

    def constrain_reg(t_vect, name):
        all_t = w_parser.new_vect(t_vect)
        for i in range(N_layers): #Don't regularize biases
            all_t[('biases', i)] = 0.0
        if name == 'universal': #One regularization hyperparameter for all weights
            #TODO: does computing means of means make sense? Not the same as just the mean of all.
            t_mean = np.mean([np.mean(all_t[('weights', i)])
                              for i in range(N_layers)])
            for i in range(N_layers):
                all_t[('weights', i)] = t_mean
        elif name == 'layers': #One regularization hyperparameter for each layer
            #TODO: changes the exact hypergradient norm, but not the DrMAD norm. Why??? DrMAD is already constrained?
            #print t_vect.shape
            for i in range(N_layers):
                #print "diff after contraining" + str(np.linalg.norm(all_t[('weights', i)] - np.mean(all_t[('weights', i)])))
                all_t[('weights', i)] = np.mean(all_t[('weights', i)])
        elif name == 'units':
            print t_vect.shape #44860; this is correct
            #for i in range(N_layers):
                #print "weights "+ str(i) + ": " + str(np.linalg.norm(np.mean(all_t[('weights', i)], axis=1, keepdims=True) - np.mean(all_t[('weights', i)], axis=1, keepdims=True)))
            #for i in range(N_layers):
                #TODO: This was the same as layer-wise
                #all_t[('weights', i)] = np.mean(all_t[('weights', i)], axis=1, keepdims=True)
        else:
            raise Exception
        return all_t.vect

    def process_transform(t_vect):
        # Remove the redundancy due to sharing transformations within units
        all_t = w_parser.new_vect(t_vect)
        new_t = np.zeros((0,))
        for i in range(N_layers):
            layer = all_t[('weights', i)]
            assert np.all(layer[:, 0] == layer[:, 1])
            cur_t = log_L2 - 2 * layer[:, 0] #TODO: equivalent regularization weights
            new_t = np.concatenate((new_t, cur_t))
        return new_t
        
    def train_z(data, z_vect_0, transform):
        N_data = data['X'].shape[0]
        def primal_loss(z_vect, transform, i_primal, record_results=False):
            RS = RandomState((seed, i_primal, "primal"))
            idxs = RS.randint(N_data, size=batch_size)
            minibatch = dictslice(data, idxs)
            w_vect = transform_weights(z_vect, transform) #TODO: this is a scale transformation, not regularization!
            loss = loss_fun(w_vect, **minibatch) #use new scale for prediction
            reg = regularization(z_vect) #regularize original scale
            #TODO: should be equivalent: w = z*e^transform, so 
            # f(z*e^transform) + e^\lambda||z||^2 = f(w) + e^\lambda||z||^2 = f(w) + e^(\lambda)||e^-2transform w||^2
            # see process_transform
            
            #if record_results and i_primal % N_thin == 0:
                #print "Iter {0}: train: {1}".format(i_primal, getval(loss))
            return loss + reg
        return sgd_meta_only_mad(grad(primal_loss), transform, z_vect_0, alpha, beta, N_iters)
    
    def train_z_exact(data, z_vect_0, transform, meta_iteration=0):
        N_data = data['X'].shape[0]
        def primal_loss(z_vect, transform, i_primal, record_results=False):
            RS = RandomState((seed, i_primal, "primal"))
            idxs = RS.randint(N_data, size=batch_size)
            minibatch = dictslice(data, idxs)
            w_vect = transform_weights(z_vect, transform)
            loss = loss_fun(w_vect, **minibatch)
            reg = regularization(z_vect)
            #if record_results and i_primal % N_thin == 0:
            #    print "Iter {0}: train: {1}".format(i_primal, getval(loss))
            return loss + reg
        return sgd_meta_only(grad(primal_loss), transform, z_vect_0, alpha, beta, N_iters, meta_iteration=meta_iteration)

    (all_transforms, all_train_loss, all_valid_loss, all_tests_loss,
     all_train_rates, all_valid_rates, all_tests_rates, all_avg_regs,
     hypergrad_angles, hypergrad_signs_angles, hypergrad_norms,
     exact_hypergrad_norms) = [], [], [], [], [], [], [], [], [], [], [], []
    
    def train_reg(reg_0, constraint, N_meta_iter, i_top):
        def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data, cur_tests_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform)
            w_vect_final = transform_weights(z_vect_final, transform) #TODO: initial scale AND regularization
            
         
            train_loss = getval(loss_fun(w_vect_final, **cur_train_data))
            print "Training loss (unregularized) = " +str(train_loss)
            all_train_loss.append(train_loss)
            valid_loss = getval(loss_fun(w_vect_final, **cur_valid_data))
            print "Validation loss = " +str(valid_loss)
            all_valid_loss.append(valid_loss)
            tests_loss = getval(loss_fun(w_vect_final, **cur_tests_data))
            print "Test loss = " +str(tests_loss)
            all_tests_loss.append(tests_loss)
            
            plt.plot(all_train_loss, label="training loss (unregularized)")
            plt.plot(all_valid_loss, label="validation loss")
            plt.plot(all_tests_loss, label="test loss")
            plt.title("loss vs meta iteration")
            plt.xlabel("meta iteration")
            plt.ylabel("loss")
            plt.legend()
            plt.savefig("loss2000_corrected.png")
            plt.clf()
            
            
            train_rate = getval(frac_err(w_vect_final, **cur_train_data))
            print "Training error rate = " +str(train_rate)
            all_train_rates.append(train_rate)
            valid_rate = getval(frac_err(w_vect_final, **cur_valid_data))
            print "Validation error rate = " +str(valid_rate)
            all_valid_rates.append(valid_rate)
            tests_rate = getval(frac_err(w_vect_final, **cur_tests_data))
            print "Test error rate = " +str(tests_rate)
            all_tests_rates.append(tests_rate)
            
            plt.plot(all_train_rates, label="training error rate")
            plt.plot(all_valid_rates, label="validation error rate")
            plt.plot(all_tests_rates, label="test error rate")
            plt.title("error rate vs meta iteration")
            plt.xlabel("meta iteration")
            plt.ylabel("error rate")
            plt.legend()
            plt.savefig("error2000_corrected.png")
            plt.clf()

            
            return loss_fun(w_vect_final, **cur_valid_data)
        hypergrad = grad(hyperloss) #No chain rule here

        def hyperloss_exact(transform, i_hyper, cur_train_data, cur_valid_data, cur_tests_data, meta_it=0):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z_exact(cur_train_data, z_vect_0, transform, meta_iteration=meta_it)
            w_vect_final = transform_weights(z_vect_final, transform)
            return loss_fun(w_vect_final, **cur_valid_data)
        hypergrad_exact = grad(hyperloss_exact) #No chain rule here
            
        '''def error_rate(transform, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform) #TODO: recomputing path?
            w_vect_final = transform_weights(z_vect_final, transform)
            return frac_err(w_vect_final, **cur_valid_data)'''

        cur_reg = reg_0 #initial regularization, besides regularization() function
        for i_hyper in range(N_meta_iter):
            print "Hyper iter "+ str(i_hyper)
            """if i_hyper % N_meta_thin == 0:
                test_rate = error_rate(cur_reg, i_hyper, train_data, tests_data)
                all_tests_rates.append(test_rate)
                all_transforms.append(cur_reg.copy())
                all_avg_regs.append(np.mean(cur_reg))
                print "Hyper iter {0}, error rate {1}".format(i_hyper, all_tests_rates[-1])
                print "Cur_transform", np.mean(cur_reg)"""
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            #cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid]) #cur_train_data, cur_valid_data
            #raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
            cur_train_data, cur_valid_data = random_partition(train_data, RS, [N_train - N_valid, N_valid])
            raw_grad = hypergrad(cur_reg, i_hyper, cur_train_data, cur_valid_data, tests_data)
            raw_grad_exact = hypergrad_exact(cur_reg, i_hyper, cur_train_data, cur_valid_data, tests_data, meta_it=i_hyper)
            #print "before constraining grad"
            constrained_grad = constrain_reg(raw_grad, constraint)
            constrained_grad_exact = constrain_reg(raw_grad_exact, constraint)
            print(np.linalg.norm(raw_grad))
            #TODO: #Exploding DrMAD gradient; ~10^10x larger than exact gradient with N_safe_sampling = N_iters
            print(np.linalg.norm(raw_grad_exact))
            # TODO: sometimes negative???
            
            
            hypergrad_angle = np.dot(constrained_grad, constrained_grad_exact)/(np.linalg.norm(constrained_grad)*np.linalg.norm(constrained_grad_exact))
            hypergrad_angles.append(hypergrad_angle)
            print("cosine of angle between DrMAD and exact = " +str(hypergrad_angle))
            
            hypergrad_signs_angle = np.dot(np.sign(constrained_grad), np.sign(constrained_grad_exact))/len(constrained_grad)
            hypergrad_signs_angles.append(hypergrad_signs_angle)
            print("cosine of angle between signs of DrMAD and exact = " +str(hypergrad_signs_angle))
            
            plt.plot(hypergrad_angles, label="between exact and DrMAD hypergradients")
            plt.plot(hypergrad_signs_angles, label="between signs of DrMAD and exact")
            plt.title("Cosine of angle vs meta iteration")
            plt.xlabel("meta iteration")
            plt.ylabel("cosine of angle")
            plt.legend()
            plt.savefig("angle2000_corrected.png")
            plt.clf()
            
            hypergrad_norm = np.linalg.norm(constrained_grad)
            hypergrad_norms.append(hypergrad_norm)
            print("DrMAD norm = "+ str(hypergrad_norm))
            exact_hypergrad_norm = np.linalg.norm(constrained_grad_exact)
            exact_hypergrad_norms.append(exact_hypergrad_norm)
            print("Exact norm = "+ str(exact_hypergrad_norm))
            
            plt.plot(hypergrad_norms, label="DrMAD hypergradient")
            plt.plot(exact_hypergrad_norms, label="Exact hypergradient")
            plt.title("Norms of hypergradients vs meta iteration")
            plt.xlabel("meta iteration")
            plt.ylabel("norm")
            plt.legend()
            plt.savefig("norms2000_corrected.png")
            plt.clf()
            
            cur_reg -= np.sign(constrained_grad) * meta_alpha #TODO: signs of gradient...
            #TODO: momentum
        return cur_reg

    reg = np.zeros(N_weights)+0.2 #TODO: initial -log regularization; not in log scale?
    constraints = ['universal', 'layers', 'units']
    # TODO: uses multiple kinds of hyperparameter sharing, but in order
    for i_top, (N_meta_iter, constraint) in enumerate(zip(all_N_meta_iter, constraints)):
        print "Top level iter {0}".format(i_top), constraint
        reg = train_reg(reg, constraint, N_meta_iter, i_top)

    all_L2_regs = np.array(zip(*map(process_transform, all_transforms)))
    #return all_L2_regs, all_tests_rates, all_avg_regs
    return (all_L2_regs, all_train_loss, all_valid_loss, all_tests_loss,
            all_train_rates, all_valid_rates, all_tests_rates, all_avg_regs,
            hypergrad_angles, hypergrad_signs_angles, hypergrad_norms,
            exact_hypergrad_norms)
Example No. 16
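# Plotting helper: loads results.pkl and draws the meta learning curves plus the learned
# per-weight scales and offsets, rendered as first-layer filter images.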
def plot():
    import matplotlib.pyplot as plt
    import matplotlib as mpl

    def plot_filters(ax, parser, lims, N_cols=10, L_img=28, padding=2):
        bg_val = 0
        filters = parser[('weights', 0)]
        output_weights = parser[('weights', 1)]
        N_outputs = output_weights.shape[1]
        N_filters = filters.shape[1]
        N_rows = ceil_div(N_filters, N_cols)
        L_extra = ceil_div(N_outputs, L_img)
        output_weights_padded = np.full((N_filters, L_img * L_extra), bg_val)
        output_weights_padded[:, :N_outputs] = output_weights
        output_weights_padded = output_weights_padded.reshape(
            (N_filters, L_extra, L_img))
        filters = filters.reshape((L_img, L_img, N_filters))
        row_height = L_img + L_extra + padding * 2
        col_width = L_img + padding
        image = np.full((row_height * N_rows, col_width * N_cols), bg_val)

        def pix_range_x(i):
            offset = i * col_width
            return slice(offset, offset + L_img)

        def pix_range_y(i):
            offset = i * row_height
            return slice(offset, offset + L_img + L_extra + padding)

        for i_x, i_y in it.product(range(N_rows), range(N_cols)):
            i_filter = i_x + i_y * N_cols
            if i_filter < N_filters:
                cur_frame = np.concatenate(
                    (filters[:, :, i_filter], np.full(
                        (padding, L_img),
                        bg_val), output_weights_padded[i_filter, :, :]),
                    axis=0)
                image[pix_range_y(i_y), pix_range_x(i_x)] = cur_frame

        img_min, img_max = lims
        image = (image - img_min) / (img_max - img_min)
        image = np.minimum(np.maximum(image, 0.0), 1.0)
        ax.imshow(image, cmap=mpl.cm.binary)
        ax.set_xticks([])
        ax.set_yticks([])

    with open('results.pkl') as f:
        results = pickle.load(f)

    fig = plt.figure(0)
    fig.set_size_inches((6, 4))
    ax = fig.add_subplot(111)
    ax.set_title('Meta learning curves')
    losses = ['train_loss', 'valid_loss', 'tests_loss']
    for loss_type in losses:
        ax.plot(results[loss_type], 'o-', label=loss_type)
    ax.set_xlabel('Meta iter number')
    ax.set_ylabel('Negative log prob')
    ax.legend(loc=1, frameon=False)
    plt.savefig('learning_curves.png')

    fig.clf()
    fig.set_size_inches((6, 8))
    ax = fig.add_subplot(211)
    ax.set_title('Parameter scale')
    for i, log_scale in enumerate(results['log_scale']):
        ax.plot(np.sort(log_scale),
                label="Meta iter {0}".format(i * N_hyper_thin))
    ax.legend(loc=2, frameon=False)

    ax = fig.add_subplot(212)
    ax.set_title('Parameter offset')
    for i, offset in enumerate(results['offset']):
        ax.plot(np.sort(offset),
                label="Meta iter {0}".format(i * N_hyper_thin))
    plt.savefig('Learned regularization.png')

    w_parser, _, _, _ = make_nn_funs(layer_sizes)
    log_scales = w_parser.new_vect(np.exp(results['log_scale'])[-1])
    offset = w_parser.new_vect(results['offset'][-1])
    fig.clf()
    fig.set_size_inches((6, 6))
    ax = fig.add_subplot(111)
    plot_filters(ax, log_scales, [5, 10])
    ax.set_title("Scales")
    plt.savefig("L2_scale_filters.png")
    plt.savefig("L2_scale_filters.pdf")

    fig.clf()
    fig.set_size_inches((6, 6))
    ax = fig.add_subplot(111)
    plot_filters(ax, offset, [-1, 1])
    ax.set_title("Offsets")
    plt.savefig("L2_mean_filters.png")
    plt.savefig("L2_mean_filters.pdf")
Example No. 17
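# Meta-optimizes log L2 penalties, initial parameter scales, and per-iteration step sizes and
# momenta with RMSprop; the inner run uses sgd5 and the meta-objective is the validation loss.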
def run():
    (train_images, train_labels),\
    (valid_images, valid_labels),\
    (tests_images, tests_labels) = load_data_subset(N_train, N_valid, N_tests)
    batch_idxs = BatchList(N_train, batch_size)
    N_iters = N_epochs * len(batch_idxs)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg']      = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas']      = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas']  = np.full(N_iters, init_invlogit_betas)

    def indexed_loss_fun(w, log_L2_reg, i):
        idxs = batch_idxs[i % len(batch_idxs)]
        partial_vects = [np.full(parser[name].size, np.exp(log_L2_reg[i]))
                         for i, name in enumerate(parser.names)]
        L2_reg_vect = np.concatenate(partial_vects, axis=0)
        return loss_fun(w, X=train_images[idxs], T=train_labels[idxs], L2_reg=L2_reg_vect)

    def train_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=train_images, T=train_labels)

    def valid_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=valid_images, T=valid_labels)

    def tests_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=tests_images, T=tests_labels)

    all_learning_curves = []
    all_x = []

    def hyperloss(hyperparam_vect, i):
        learning_curve = []
        def callback(x, i):
            if i % len(batch_idxs) == 0:
                learning_curve.append(loss_fun(x, X=train_images, T=train_labels))

        npr.seed(i)
        N_weights = parser.vect.size
        V0 = np.zeros(N_weights)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        layer_param_scale = [np.full(parser[name].size, 
                                     np.exp(cur_hyperparams['log_param_scale'][i]))
                             for i, name in enumerate(parser.names)]
        W0 = npr.randn(N_weights) * np.concatenate(layer_param_scale, axis=0)
        alphas     = np.exp(cur_hyperparams['log_alphas'])
        betas      =  logit(cur_hyperparams['invlogit_betas'])
        log_L2_reg =        cur_hyperparams['log_L2_reg']
        W_opt = sgd5(grad(indexed_loss_fun), kylist(W0, alphas, betas, log_L2_reg), callback)
        all_x.append(getval(W_opt))
        all_learning_curves.append(learning_curve)
        return valid_loss_fun(W_opt)

    hyperloss_grad = grad(hyperloss)

    add_fields = ['train_loss', 'valid_loss', 'tests_loss']
    meta_results = {field : [] for field in add_fields + hyperparams.names}
    def meta_callback(hyperparam_vect, i):
        x = all_x[-1]
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        log_L2_reg = cur_hyperparams['log_L2_reg']
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])

        meta_results['train_loss'].append(train_loss_fun(x))
        meta_results['valid_loss'].append(valid_loss_fun(x))
        meta_results['tests_loss'].append(tests_loss_fun(x))

    final_result = rms_prop(hyperloss_grad, hyperparams.vect, meta_callback, N_meta_iter, meta_alpha)
    meta_results['all_learning_curves'] = all_learning_curves
    parser.vect = None # No need to pickle zeros
    return meta_results, parser
Example No. 18
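# Same measurement as Example No. 10: computes the hypergradient of the training meta-loss at
# the initial hyperparameters and accumulates it over several random restarts, with no
# meta-updates.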
def run():
    train_data, valid_data, tests_data = load_data_dicts(
        N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_param_scale'] = np.full(N_weight_types,
                                             init_log_param_scale)
    hyperparams['log_alphas'] = np.full((N_iters, N_weight_types),
                                        init_log_alphas)
    hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types),
                                            init_invlogit_betas)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState(
                (seed, i_hyper,
                 i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs],
                            L2_vect)

        learning_curve_dict = defaultdict(list)

        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(
                    loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        W_opt = sgd_parsed(grad(indexed_loss_fun),
                           kylist(W0, alphas, betas, L2_reg),
                           parser,
                           callback=callback)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **train_data)

    hyperloss_grad = grad(hyperloss)

    initial_hypergrad = hyperloss_grad(hyperparams.vect, 0)
    parsed_init_hypergrad = hyperparams.new_vect(initial_hypergrad.copy())
    avg_hypergrad = initial_hypergrad.copy()
    for i in xrange(1, N_meta_iter):
        avg_hypergrad += hyperloss_grad(hyperparams.vect, i)
        print i
    parsed_avg_hypergrad = hyperparams.new_vect(avg_hypergrad)

    parser.vect = None  # No need to pickle zeros
    return parser, parsed_init_hypergrad, parsed_avg_hypergrad
Example No. 19
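# Treats an entire fake training set as the hyperparameter vector: the inner SGD trains on the
# fake data, the outer Adam loop minimizes the validation loss of the resulting weights, while
# schedules and L2 penalties stay fixed.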
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)

    rs = RandomState((seed))
    init_fake_data = rs.randn(*(train_data['X'].shape)) * init_fake_data_scale
    one_hot = lambda x, K : np.array(x[:,None] == np.arange(K)[None, :], dtype=int)
    fake_labels = one_hot(np.array(range(N_train)) % N_classes, N_classes)  # One of each.

    hyperparams = VectorParser()
    hyperparams['fake_data']  = init_fake_data
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    fixed_hyperparams['log_alphas']      = np.full((N_iters, N_weight_types), init_log_alphas)
    fixed_hyperparams['invlogit_betas']  = np.full((N_iters, N_weight_types), init_invlogit_betas)
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    cur_primal_results = {}

    loss_meta_parser = VectorParser()
    loss_meta_parser['']

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, meta_vect, i_iter):
            (train_data, train_labels, L2_vect) = meta
            return loss_fun(w, train_data, train_labels, L2_vect)
            #return loss_fun(w, train_data['X'], train_data['T'], L2_vect + np.sum(fake_data.ravel()))

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
        #        learning_curve_dict['learning_curve'].append(loss_fun(x, getval(cur_hyperparams['fake_data']), fake_labels))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))


        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        fake_data = cur_hyperparams['fake_data']
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(fixed_hyperparams['log_alphas'])
        betas  = logit(fixed_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        meta = kylist(fake_data, fake_labels, L2_reg)
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, meta),
                           parser, callback=callback)
        cur_primal_results['weights'] = getval(W_opt).copy()
        cur_primal_results['learning_curve'] = getval(learning_curve_dict)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)
    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]
    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        x, learning_curve_dict = cur_primal_results['weights'], cur_primal_results['learning_curve']
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        #meta_results['train_loss'].append(loss_fun(x, getval(cur_hyperparams['fake_data']), fake_labels))
        meta_results['train_loss'].append(0)
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        meta_results['example_weights'] = x
        if metagrad is not None:
            print metagrad
            meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \
                                                   / (np.linalg.norm(metagrad)*
                                                      np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
            meta_results['tests_loss'][-1], meta_results['test_err'][-1])

    final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback, N_meta_iter, meta_alpha)
    meta_callback(final_result, N_meta_iter)
    parser.vect = None # No need to pickle zeros
    return meta_results, parser
Exemplo n.º 20
0
def run():
    (train_images, train_labels),\
    (valid_images, valid_labels),\
    (tests_images, tests_labels) = load_data_subset(N_train, N_valid, N_tests)
    batch_idxs = BatchList(N_train, batch_size)
    N_iters = N_epochs * len(batch_idxs)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types,
                                             init_log_param_scale)
    hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)

    def train_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=train_images, T=train_labels)

    def valid_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=valid_images, T=valid_labels)

    def tests_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=tests_images, T=tests_labels)

    all_learning_curves = []
    all_x = []

    def hyperloss_grad(hyperparam_vect, ii):
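        # Run the inner SGD with sgd3 and repack its gradients of the validation
        # objective (w.r.t. L2 penalties, init scales, learning-rate and momentum
        # schedules) into a flat hypergradient vector.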
        learning_curve = []

        def callback(x, i):
            if i % len(batch_idxs) == 0:
                learning_curve.append(
                    loss_fun(x, X=train_images, T=train_labels))

        def indexed_loss_fun(w, log_L2_reg, j):
            # idxs = batch_idxs[i % len(batch_idxs)]
            npr.seed(1000 * ii + j)
            idxs = npr.randint(N_train, size=len(batch_idxs))
            partial_vects = [
                np.full(parser[name].size, np.exp(log_L2_reg[i]))
                for i, name in enumerate(parser.names)
            ]
            L2_reg_vect = np.concatenate(partial_vects, axis=0)
            return loss_fun(w,
                            X=train_images[idxs],
                            T=train_labels[idxs],
                            L2_reg=L2_reg_vect)

        npr.seed(ii)
        N_weights = parser.vect.size
        V0 = np.zeros(N_weights)

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        layer_param_scale = [
            np.full(parser[name].size,
                    np.exp(cur_hyperparams['log_param_scale'][i]))
            for i, name in enumerate(parser.names)
        ]
        W0 = npr.randn(N_weights) * np.concatenate(layer_param_scale, axis=0)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        log_L2_reg = cur_hyperparams['log_L2_reg']
        results = sgd3(indexed_loss_fun,
                       valid_loss_fun,
                       W0,
                       V0,
                       alphas,
                       betas,
                       log_L2_reg,
                       callback=callback)
        hypergrads = hyperparams.copy()
        hypergrads['log_L2_reg'] = results['dMd_meta']
        weights_grad = parser.new_vect(W0 * results['dMd_x'])
        hypergrads['log_param_scale'] = [
            np.sum(weights_grad[name]) for name in parser.names
        ]
        hypergrads['log_alphas'] = results['dMd_alphas'] * alphas
        hypergrads['invlogit_betas'] = (
            results['dMd_betas'] * d_logit(cur_hyperparams['invlogit_betas']))
        all_x.append(results['x_final'])
        all_learning_curves.append(learning_curve)
        return hypergrads.vect

    add_fields = ['train_loss', 'valid_loss', 'tests_loss', 'iter_num']
    meta_results = {field: [] for field in add_fields + hyperparams.names}

    def meta_callback(hyperparam_vect, i):
        if i % N_meta_thin == 0:
            print "Meta iter {0}".format(i)
            x = all_x[-1]
            cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
            log_L2_reg = cur_hyperparams['log_L2_reg']
            for field in cur_hyperparams.names:
                meta_results[field].append(cur_hyperparams[field])

            meta_results['train_loss'].append(train_loss_fun(x))
            meta_results['valid_loss'].append(valid_loss_fun(x))
            meta_results['tests_loss'].append(tests_loss_fun(x))
            meta_results['iter_num'].append(i)

    final_result = rms_prop(hyperloss_grad, hyperparams.vect, meta_callback,
                            N_meta_iter, meta_alpha, meta_gamma)
    meta_results['all_learning_curves'] = all_learning_curves
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser
Exemplo n.º 21
0
def run():
    RS = RandomState((seed, "top_rs"))
    all_alphabets = omniglot.load_data()
    RS.shuffle(all_alphabets)
    train_alphabets = all_alphabets[:-N_test_alphabets]
    tests_alphabets = all_alphabets[-N_test_alphabets:]
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size
    hyperparams_0 = VectorParser()
    hyperparams_0['log_scale']  = log_scale_init * np.ones(N_weights)
    hyperparams_0['offset'] = offset_init_std * RS.randn(N_weights)

    def reg_loss_fun(W, data, hyperparam_vect, reg_penalty):
        hyperparams = hyperparams_0.new_vect(hyperparam_vect)
        Z = np.exp(hyperparams['log_scale']) * W + hyperparams['offset']
        return loss_fun(Z, **data) + np.dot(W, W) * reg_penalty

    def hyperloss(hyperparam_vect, i_hyper, alphabets, verbose=True, report_train_loss=False):
        RS = RandomState((seed, i_hyper, "hyperloss"))        
        alphabet = shuffle_alphabet(RS.choice(alphabets), RS)
        N_train = alphabet['X'].shape[0] - N_valid_dpts
        train_data = dictslice(alphabet, slice(None, N_train))
        if report_train_loss:
            valid_data = dictslice(alphabet, slice(None, N_valid_dpts))
        else:
            valid_data = dictslice(alphabet, slice(N_train, None))
        def primal_loss(W, hyperparam_vect, i_primal, reg_penalty=True):
            RS = RandomState((seed, i_hyper, i_primal))
            idxs = RS.permutation(N_train)[:batch_size]
            minibatch = dictslice(train_data, idxs)
            loss = reg_loss_fun(W, minibatch, hyperparam_vect, reg_penalty)
            if verbose and i_primal % 10 == 0: print "Iter {0}, loss, {1}".format(i_primal, getval(loss))
            return loss

        W0 = np.zeros(N_weights)
        W_final = sgd(grad(primal_loss), hyperparam_vect, W0, alpha, beta, N_iters, callback=None)
        return reg_loss_fun(W_final, valid_data, hyperparam_vect, reg_penalty=False)

    results = defaultdict(list)
    def record_results(hyperparam_vect, i_hyper, g):
        print "Meta iter {0}. Recording results".format(i_hyper)
        RS = RandomState((seed, i_hyper, "evaluation"))
        def loss_fun(alphabets, report_train_loss):
            return np.mean([hyperloss(hyperparam_vect, RS.int32(), alphabets=alphabets,
                                      verbose=False, report_train_loss=report_train_loss)
                            for i in range(N_alphabets_eval)])
        cur_hyperparams = hyperparams_0.new_vect(hyperparam_vect.copy())
        if i_hyper % N_hyper_thin == 0:
            # Storing O(N_weights) is a bit expensive so we thin it out and store in low precision
            for field in cur_hyperparams.names:
                results[field].append(cur_hyperparams[field].astype(np.float16))
        results['train_loss'].append(loss_fun(train_alphabets, report_train_loss=True))
        results['valid_loss'].append(loss_fun(train_alphabets, report_train_loss=False))
        results['tests_loss'].append(loss_fun(tests_alphabets, report_train_loss=False))
        print "Train:", results['train_loss']
        print "Valid:", results['valid_loss']
        print "Tests:", results['tests_loss']

    train_hyperloss = partial(hyperloss, alphabets=train_alphabets)
    rms_prop(grad(train_hyperloss), hyperparams_0.vect, record_results, N_meta_iter, meta_alpha, gamma=0)
    return results
Exemplo n.º 22
0
def run():
    train_data, valid_data, tests_data = load_data_dicts(
        N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types,
                                             init_log_param_scale)
    hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas)

    hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)

    #fixed_hyperparams = VectorParser()
    #fixed_hyperparams['invlogit_betas']  = np.full(N_iters, init_invlogit_betas)

    # TODO: memoize
    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            # Deterministic, iteration-dependent seed (needed so the backwards pass
            # replays the same minibatches).
            rs = npr.RandomState(global_seed + i_hyper * 10**6 + i_iter)
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs],
                            L2_vect)

        learning_curve = []

        def callback(x, i_iter):
            if i_iter % N_batches == 0:
                learning_curve.append(loss_fun(x, **train_data))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= npr.RandomState(global_seed + i_hyper).randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg']))
        V0 = np.zeros(W0.size)
        W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                     callback)
        return W_opt, learning_curve

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)

    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)

    def meta_callback(hyperparam_vect, i_hyper):
        print "Meta Epoch {0}".format(i_hyper)
        x, learning_curve = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve)

    final_result = rms_prop(hyperloss_grad,
                            hyperparams.vect,
                            meta_callback,
                            N_meta_iter,
                            meta_alpha,
                            gamma=0.0)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser
Exemplo n.º 23
0
def plot():
    import matplotlib.pyplot as plt
    import matplotlib as mpl
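    # Lay out each hidden unit's input filter as an image tile, with that unit's
    # outgoing weights appended as a thin strip underneath.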
    def plot_filters(ax, parser, lims, N_cols=10, L_img=28, padding=2):
        bg_val = 0
        filters = parser[('weights', 0)]
        output_weights = parser[('weights', 1)]
        N_outputs = output_weights.shape[1]
        N_filters = filters.shape[1]
        N_rows = ceil_div(N_filters, N_cols)
        L_extra = ceil_div(N_outputs, L_img)
        output_weights_padded = np.full((N_filters, L_img * L_extra), bg_val)
        output_weights_padded[:, :N_outputs] = output_weights
        output_weights_padded = output_weights_padded.reshape((N_filters, L_extra, L_img))
        filters = filters.reshape((L_img, L_img, N_filters))
        row_height = L_img + L_extra + padding * 2
        col_width  = L_img + padding
        image = np.full((row_height * N_rows, col_width * N_cols), bg_val)
        def pix_range_x(i):
            offset = i * col_width
            return slice(offset, offset + L_img)
        def pix_range_y(i):
            offset = i * row_height
            return slice(offset, offset + L_img + L_extra + padding)
        for i_y, i_x in it.product(range(N_rows), range(N_cols)):
            i_filter = i_x + i_y * N_cols
            if i_filter < N_filters:
                cur_frame = np.concatenate((filters[:, :, i_filter],
                                            np.full((padding, L_img), bg_val),
                                            output_weights_padded[i_filter, :, :]), axis=0)
                image[pix_range_y(i_y), pix_range_x(i_x)] = cur_frame

        img_min, img_max = lims
        image = (image - img_min) / (img_max - img_min)
        image = np.minimum(np.maximum(image, 0.0), 1.0)
        ax.imshow(image, cmap = mpl.cm.binary)
        ax.set_xticks([])
        ax.set_yticks([])

    with open('results.pkl') as f:
        results = pickle.load(f)

    fig = plt.figure(0)
    fig.set_size_inches((6,4))
    ax = fig.add_subplot(111)
    ax.set_title('Meta learning curves')
    losses = ['train_loss', 'valid_loss', 'tests_loss']
    for loss_type in losses:
        ax.plot(results[loss_type], 'o-', label=loss_type)
    ax.set_xlabel('Meta iter number')
    ax.set_ylabel('Negative log prob')
    ax.legend(loc=1, frameon=False)
    plt.savefig('learning_curves.png')

    fig.clf()
    fig.set_size_inches((6,8))
    ax = fig.add_subplot(211)
    ax.set_title('Parameter scale')
    for i, log_scale in enumerate(results['log_scale']):
        ax.plot(np.sort(log_scale), label = "Meta iter {0}".format(i * N_hyper_thin))
    ax.legend(loc=2, frameon=False)

    ax = fig.add_subplot(212)
    ax.set_title('Parameter offset')
    for i, offset in enumerate(results['offset']):
        ax.plot(np.sort(offset), label = "Meta iter {0}".format(i * N_hyper_thin))
    plt.savefig('Learned regularization.png')

    w_parser, _, _, _ = make_nn_funs(layer_sizes)
    log_scales = w_parser.new_vect(np.exp(results['log_scale'])[-1])
    offset     = w_parser.new_vect(results['offset'][-1])
    fig.clf()
    fig.set_size_inches((6,6))
    ax = fig.add_subplot(111)
    plot_filters(ax, log_scales, [5, 10])
    ax.set_title("Scales")
    plt.savefig("L2_scale_filters.png")
    plt.savefig("L2_scale_filters.pdf")

    fig.clf()
    fig.set_size_inches((6,6))
    ax = fig.add_subplot(111)
    plot_filters(ax, offset, [-1, 1])
    ax.set_title("Offsets")
    plt.savefig("L2_mean_filters.png")
    plt.savefig("L2_mean_filters.pdf")
Exemplo n.º 24
0
def run():
    (train_images, train_labels),\
    (valid_images, valid_labels),\
    (tests_images, tests_labels) = load_data_subset(N_train, N_valid, N_tests)
    batch_idxs = BatchList(N_train, batch_size)
    N_iters = N_epochs * len(batch_idxs)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types,
                                             init_log_param_scale)
    hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)

    def indexed_loss_fun(w, log_L2_reg, i):
        idxs = batch_idxs[i % len(batch_idxs)]
        partial_vects = [
            np.full(parser[name].size, np.exp(log_L2_reg[i]))
            for i, name in enumerate(parser.names)
        ]
        L2_reg_vect = np.concatenate(partial_vects, axis=0)
        return loss_fun(w,
                        X=train_images[idxs],
                        T=train_labels[idxs],
                        L2_reg=L2_reg_vect)

    def train_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=train_images, T=train_labels)

    def valid_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=valid_images, T=valid_labels)

    def tests_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=tests_images, T=tests_labels)

    all_learning_curves = []
    all_x = []

    def hyperloss(hyperparam_vect, i):
        learning_curve = []

        def callback(x, i):
            if i % len(batch_idxs) == 0:
                learning_curve.append(
                    loss_fun(x, X=train_images, T=train_labels))

        npr.seed(i)
        N_weights = parser.vect.size
        V0 = np.zeros(N_weights)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        layer_param_scale = [
            np.full(parser[name].size,
                    np.exp(cur_hyperparams['log_param_scale'][i]))
            for i, name in enumerate(parser.names)
        ]
        W0 = npr.randn(N_weights) * np.concatenate(layer_param_scale, axis=0)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        log_L2_reg = cur_hyperparams['log_L2_reg']
        W_opt = sgd5(grad(indexed_loss_fun),
                     kylist(W0, alphas, betas, log_L2_reg), callback)
        all_x.append(getval(W_opt))
        all_learning_curves.append(learning_curve)
        return valid_loss_fun(W_opt)

    hyperloss_grad = grad(hyperloss)

    add_fields = ['train_loss', 'valid_loss', 'tests_loss']
    meta_results = {field: [] for field in add_fields + hyperparams.names}

    def meta_callback(hyperparam_vect, i):
        x = all_x[-1]
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        log_L2_reg = cur_hyperparams['log_L2_reg']
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])

        meta_results['train_loss'].append(train_loss_fun(x))
        meta_results['valid_loss'].append(valid_loss_fun(x))
        meta_results['tests_loss'].append(tests_loss_fun(x))

    final_result = rms_prop(hyperloss_grad, hyperparams.vect, meta_callback,
                            N_meta_iter, meta_alpha)
    meta_results['all_learning_curves'] = all_learning_curves
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser
Exemplo n.º 25
0
def run():
    val_images, val_labels, test_images, test_labels, _ = load_data(normalize=True)
    val_images = val_images[:N_val_data, :]
    val_labels = val_labels[:N_val_data, :]
    true_data_scale = np.std(val_images)

    test_images = test_images[:N_test_data, :]
    test_labels = test_labels[:N_test_data, :]
    batch_idxs = BatchList(N_fake_data, batch_size)
    parser, _, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = len(parser.vect)

    npr.seed(0)
    init_fake_data = npr.randn(*(val_images[:N_fake_data, :].shape)) * init_fake_data_scale
    one_hot = lambda x, K : np.array(x[:,None] == np.arange(K)[None, :], dtype=int)
    fake_labels = one_hot(np.array(range(N_fake_data)) % N_classes, N_classes)  # One of each.

    hyperparser = WeightsParser()
    hyperparser.add_weights('log_L2_reg', (1,))
    hyperparser.add_weights('fake_data', init_fake_data.shape)
    metas = np.zeros(hyperparser.N)
    print "Number of hyperparameters to be trained:", hyperparser.N
    hyperparser.set(metas, 'log_L2_reg', init_log_L2_reg)
    hyperparser.set(metas, 'fake_data', init_fake_data)

    def indexed_loss_fun(x, meta_params, idxs):   # To be optimized by SGD.
        L2_reg=np.exp(hyperparser.get(meta_params, 'log_L2_reg')[0])
        fake_data=hyperparser.get(meta_params, 'fake_data')
        return loss_fun(x, X=fake_data[idxs], T=fake_labels[idxs], L2_reg=L2_reg)
    def meta_loss_fun(x, meta_params):            # To be optimized in the outer loop.
        fake_data=hyperparser.get(meta_params, 'fake_data')
        log_prior = -fake_data_L2_reg * np.dot(fake_data.ravel(), fake_data.ravel())
        return loss_fun(x, X=val_images, T=val_labels) - log_prior
    def test_loss_fun(x):                         # To measure actual performance.
        return loss_fun(x, X=test_images, T=test_labels)

    log_alphas = np.full(N_iters, log_alpha_0)
    betas      = np.full(N_iters, beta_0)

    output = []
    for i in range(N_meta_iter):
        print "L2 reg is ", np.exp(hyperparser.get(metas, 'log_L2_reg')[0]), "| ",

        npr.seed(0)
        v0 = npr.randn(N_weights) * velocity_scale
        x0 = npr.randn(N_weights) * np.exp(log_param_scale)

        results = sgd2(indexed_loss_fun, meta_loss_fun, batch_idxs, N_iters,
                       x0, v0, np.exp(log_alphas), betas, metas)

        learning_curve = results['learning_curve']
        validation_loss = results['M_final']
        test_err = frac_err(results['x_final'], test_images, test_labels)
        fake_data_scale = np.std(hyperparser.get(metas, 'fake_data')) / true_data_scale
        test_loss = test_loss_fun(results['x_final'])
        output.append((learning_curve, validation_loss, test_loss, fake_data_scale,
                       np.exp(hyperparser.get(metas, 'log_L2_reg')[0]), test_err))

        metas -= results['dMd_meta'] * meta_stepsize
        print "Meta iteration {0} Validation loss {1} Test loss {2} Test err {3}"\
            .format(i, validation_loss, test_loss, test_err)
    return output, hyperparser.get(metas, 'fake_data')
Exemplo n.º 26
0
def run():
    train_data, valid_data, tests_data = load_data_dicts(
        N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types,
                                             init_log_param_scale)
    hyperparams['log_alphas'] = np.full(N_iters, init_log_alphas)

    hyperparams['invlogit_betas'] = np.full(N_iters, init_invlogit_betas)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_param_scale'] = np.full(N_iters,
                                                   init_log_param_scale)

    # TODO: memoize
    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = npr.RandomState(
                npr.RandomState(global_seed + i_hyper +
                                i_iter * 10000).randint(1000))
            seed = i_hyper * 10**6 + i_iter  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs],
                            L2_vect)

        learning_curve_dict = defaultdict(list)

        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(
                    loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))
        W0 *= npr.RandomState(global_seed + i_hyper).randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg']))
        W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                     callback)
        #callback(W_opt, N_iters)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)

    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]

    def meta_callback(hyperparam_vect, i_hyper, metagrad):
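        # Record hyperparameters and losses, plus the magnitude of each meta-gradient
        # and its cosine similarity with the previous one, as a rough diagnostic of
        # hypergradient noise.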
        x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
        meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \
                                               / (np.linalg.norm(metagrad)*
                                                  np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
            meta_results['tests_loss'][-1], meta_results['test_err'][-1]) #Michael: train->tests

    final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback,
                        N_meta_iter, meta_alpha)
    #meta_callback(final_result, N_meta_iter)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser
Exemplo n.º 27
0
def run():
    RS = RandomState((seed, "top_rs"))
    data = loadData.loadMnist()

    train_data_subclass = []

    train_data, tests_data = loadData.load_data_as_dict(data, classNum)
    train_data = random_partition(train_data, RS, [N_train_Full])[0]
    tests_data = random_partition(tests_data, RS, [N_tests])[0]

    train_data_subclass = loadData.loadSubsetData(train_data, RS, N_train, clientNum)

    print "training samples {0}: testing samples: {1}".format(N_train,N_tests)

    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size

    def transform_weights(z_vect, transform):
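        # z holds the raw weights and exp(transform) is a per-weight scale; with the
        # fixed L2 penalty on z below, learning the transform amounts to learning a
        # per-weight L2 regularizer (see process_transform).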
        return z_vect * np.exp(transform)

    def regularization(z_vect):
        return np.dot(z_vect, z_vect) * np.exp(log_L2)

    def constrain_reg(t_vect, name):
        all_t = w_parser.new_vect(t_vect)
        for i in range(N_layers):
            all_t[('biases', i)] = 0.0
        if name == 'universal':
            t_mean = np.mean([np.mean(all_t[('weights', i)])
                              for i in range(N_layers)])
            for i in range(N_layers):
                all_t[('weights', i)] = t_mean
        elif name == 'layers':
            for i in range(N_layers):
                all_t[('weights', i)] = np.mean(all_t[('weights', i)])
        elif name == 'units':
            for i in range(N_layers):
                all_t[('weights', i)] = np.mean(all_t[('weights', i)], axis=1, keepdims=True)
        else:
            raise Exception
        return all_t.vect

    def process_transform(t_vect):
        # Remove the redundancy due to sharing transformations within units
        all_t = w_parser.new_vect(t_vect)
        new_t = np.zeros((0,))
        for i in range(N_layers):
            layer = all_t[('weights', i)]
            assert np.all(layer[:, 0] == layer[:, 1])
            cur_t = log_L2 - 2 * layer[:, 0]
            new_t = np.concatenate((new_t, cur_t))
        return new_t

    def train_z(data, z_vect_0, transform):
        N_data = data['X'].shape[0]
        def primal_loss(z_vect, transform, i_primal, record_results=False):
            RS = RandomState((seed, i_primal, "primal"))
            idxs = RS.randint(N_data, size=batch_size)
            minibatch = dictslice(data, idxs)
            w_vect = transform_weights(z_vect, transform)
            loss = loss_fun(w_vect, **minibatch)
            reg = regularization(z_vect)
            if record_results and i_primal % N_thin == 0:
                print "Iter {0}: train: {1}".format(i_primal, getval(loss))
            return loss + reg
        return sgd(grad(primal_loss), transform, z_vect_0, alpha, beta, N_iters)

    all_transforms, all_tests_loss, all_tests_rates, all_avg_regs = [], [], [], []
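    # Learn the regularization transform under a given sharing constraint: each
    # client computes a hypergradient on its own train/validation split, the
    # constrained hypergradients are averaged, and a sign-SGD step updates the
    # shared transform.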
    def train_reg(reg_0, constraint, N_meta_iter, i_top):
        def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform)
            w_vect_final = transform_weights(z_vect_final, transform)
            return loss_fun(w_vect_final, **cur_valid_data)
        hypergrad = grad(hyperloss)

        def error_rate(transform, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform)
            w_vect_final = transform_weights(z_vect_final, transform)
            return frac_err(w_vect_final, **cur_valid_data)

        cur_reg = reg_0
        for i_hyper in range(N_meta_iter):
            if i_hyper % N_meta_thin == 0:
                test_rate = error_rate(cur_reg, i_hyper, train_data, tests_data)
                all_tests_rates.append(test_rate)
                all_transforms.append(cur_reg.copy())
                all_avg_regs.append(np.mean(cur_reg))
                print "Hyper iter {0}, error rate {1}".format(i_hyper, all_tests_rates[-1])
                print "Cur_transform", np.mean(cur_reg)
            tempConstrained_grad = np.zeros(N_weights)
            for client_i in range(clientNum):
                RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
                cur_split = random_partition(train_data_subclass[client_i], RS,
                                             [N_train - N_valid, N_valid])
                print("calculate hypergradients")
                raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
                print("calculate hypergradients end")
                constrained_grad = constrain_reg(raw_grad, constraint)
                tempConstrained_grad += constrained_grad / clientNum

            cur_reg -= np.sign(tempConstrained_grad) * meta_alpha

        return cur_reg

    reg = np.zeros(N_weights)+0.2
    constraints = ['universal', 'layers', 'units']
    for i_top, (N_meta_iter, constraint) in enumerate(zip(all_N_meta_iter, constraints)):
        print "Top level iter {0}".format(i_top)
        reg = train_reg(reg, constraint, N_meta_iter, i_top)

    all_L2_regs = np.array(zip(*map(process_transform, all_transforms)))
    return all_L2_regs, all_tests_rates, all_avg_regs
Exemplo n.º 28
0
def run():
    RS = RandomState((seed, "top_rs"))
    all_data = mnist.load_data_as_dict()
    train_data, tests_data = random_partition(all_data, RS, [N_train, N_tests])
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size

    def transform_weights(z_vect, transform):
        return z_vect * np.exp(transform)

    def regularization(z_vect):
        return np.dot(z_vect, z_vect) * np.exp(log_L2)

    def constrain_reg(t_vect, name):
        all_t = w_parser.new_vect(t_vect)
        for i in range(N_layers):
            all_t[('biases', i)] = 0.0
        if name == 'universal':
            t_mean = np.mean(
                [np.mean(all_t[('weights', i)]) for i in range(N_layers)])
            for i in range(N_layers):
                all_t[('weights', i)] = t_mean
        elif name == 'layers':
            for i in range(N_layers):
                all_t[('weights', i)] = np.mean(all_t[('weights', i)])
        elif name == 'units':
            for i in range(N_layers):
                all_t[('weights', i)] = np.mean(all_t[('weights', i)],
                                                axis=1,
                                                keepdims=True)
        else:
            raise Exception
        return all_t.vect

    def process_transform(t_vect):
        # Remove the redundancy due to sharing transformations within units
        all_t = w_parser.new_vect(t_vect)
        new_t = np.zeros((0, ))
        for i in range(N_layers):
            layer = all_t[('weights', i)]
            assert np.all(layer[:, 0] == layer[:, 1])
            cur_t = log_L2 - 2 * layer[:, 0]
            new_t = np.concatenate((new_t, cur_t))
        return new_t

    def train_z(data, z_vect_0, transform):
        N_data = data['X'].shape[0]

        def primal_loss(z_vect, transform, i_primal, record_results=False):
            RS = RandomState((seed, i_primal, "primal"))
            idxs = RS.randint(N_data, size=batch_size)
            minibatch = dictslice(data, idxs)
            w_vect = transform_weights(z_vect, transform)
            loss = loss_fun(w_vect, **minibatch)
            reg = regularization(z_vect)
            if record_results and i_primal % N_thin == 0:
                print "Iter {0}: train: {1}".format(i_primal, getval(loss))
            return loss + reg

        return sgd(grad(primal_loss), transform, z_vect_0, alpha, beta,
                   N_iters)

    all_transforms, all_tests_loss, all_tests_rates, all_avg_regs = [], [], [], []

    def train_reg(reg_0, constraint, N_meta_iter, i_top):
        def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform)
            w_vect_final = transform_weights(z_vect_final, transform)
            return loss_fun(w_vect_final, **cur_valid_data)

        hypergrad = grad(hyperloss)

        def error_rate(transform, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform)
            w_vect_final = transform_weights(z_vect_final, transform)
            return frac_err(w_vect_final, **cur_valid_data)

        cur_reg = reg_0
        for i_hyper in range(N_meta_iter):
            if i_hyper % N_meta_thin == 0:
                test_rate = error_rate(cur_reg, i_hyper, train_data,
                                       tests_data)
                all_tests_rates.append(test_rate)
                all_transforms.append(cur_reg.copy())
                all_avg_regs.append(np.mean(cur_reg))
                print "Hyper iter {0}, error rate {1}".format(
                    i_hyper, all_tests_rates[-1])
                print "Cur_transform", np.mean(cur_reg)
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            cur_split = random_partition(train_data, RS,
                                         [N_train - N_valid, N_valid])
            raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
            constrained_grad = constrain_reg(raw_grad, constraint)
            cur_reg -= np.sign(constrained_grad) * meta_alpha
        return cur_reg

    reg = np.zeros(N_weights) + 0.2
    constraints = ['universal', 'layers', 'units']
    for i_top, (N_meta_iter,
                constraint) in enumerate(zip(all_N_meta_iter, constraints)):
        print "Top level iter {0}".format(i_top)
        reg = train_reg(reg, constraint, N_meta_iter, i_top)

    all_L2_regs = np.array(zip(*map(process_transform, all_transforms)))
    return all_L2_regs, all_tests_rates, all_avg_regs
Exemplo n.º 29
0
def run():
    (train_images, train_labels),\
    (valid_images, valid_labels),\
    (tests_images, tests_labels) = load_data_subset(N_train, N_valid, N_tests)
    batch_idxs = BatchList(N_train, batch_size)
    N_iters = N_epochs * len(batch_idxs)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg']      = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas']      = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas']  = np.full(N_iters, init_invlogit_betas)

    def train_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=train_images, T=train_labels)

    def valid_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=valid_images, T=valid_labels)

    def tests_loss_fun(w, log_L2_reg=0.0):
        return loss_fun(w, X=tests_images, T=tests_labels)

    all_learning_curves = []
    all_x = []
    def hyperloss_grad(hyperparam_vect, ii):
        learning_curve = []
        def callback(x, i):
            if i % len(batch_idxs) == 0:
                learning_curve.append(loss_fun(x, X=train_images, T=train_labels))

        def indexed_loss_fun(w, log_L2_reg, j):
            # idxs = batch_idxs[i % len(batch_idxs)]
            npr.seed(1000 * ii + j)
            idxs = npr.randint(N_train, size=len(batch_idxs))
            partial_vects = [np.full(parser[name].size, np.exp(log_L2_reg[i]))
                             for i, name in enumerate(parser.names)]
            L2_reg_vect = np.concatenate(partial_vects, axis=0)
            return loss_fun(w, X=train_images[idxs], T=train_labels[idxs], L2_reg=L2_reg_vect)

        npr.seed(ii)
        N_weights = parser.vect.size
        V0 = np.zeros(N_weights)

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        layer_param_scale = [np.full(parser[name].size, 
                                     np.exp(cur_hyperparams['log_param_scale'][i]))
                             for i, name in enumerate(parser.names)]
        W0 = npr.randn(N_weights) * np.concatenate(layer_param_scale, axis=0)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        log_L2_reg = cur_hyperparams['log_L2_reg']
        results = sgd3(indexed_loss_fun, valid_loss_fun, W0, V0,
                       alphas, betas, log_L2_reg, callback=callback)
        hypergrads = hyperparams.copy()
        hypergrads['log_L2_reg']      = results['dMd_meta']
        weights_grad = parser.new_vect(W0 * results['dMd_x'])
        hypergrads['log_param_scale'] = [np.sum(weights_grad[name])
                                         for name in parser.names]
        hypergrads['log_alphas']      = results['dMd_alphas'] * alphas
        hypergrads['invlogit_betas']  = (results['dMd_betas'] *
                                         d_logit(cur_hyperparams['invlogit_betas']))
        all_x.append(results['x_final'])
        all_learning_curves.append(learning_curve)
        return hypergrads.vect

    add_fields = ['train_loss', 'valid_loss', 'tests_loss', 'iter_num']
    meta_results = {field : [] for field in add_fields + hyperparams.names}
    def meta_callback(hyperparam_vect, i):
        if i % N_meta_thin == 0:
            print "Meta iter {0}".format(i)
            x = all_x[-1]
            cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
            log_L2_reg = cur_hyperparams['log_L2_reg']
            for field in cur_hyperparams.names:
                meta_results[field].append(cur_hyperparams[field])

            meta_results['train_loss'].append(train_loss_fun(x))
            meta_results['valid_loss'].append(valid_loss_fun(x))
            meta_results['tests_loss'].append(tests_loss_fun(x))
            meta_results['iter_num'].append(i)

    final_result = rms_prop(hyperloss_grad, hyperparams.vect,
                            meta_callback, N_meta_iter, meta_alpha, meta_gamma)
    meta_results['all_learning_curves'] = all_learning_curves
    parser.vect = None # No need to pickle zeros
    return meta_results, parser
Exemplo n.º 30
0
def run():
    RS = RandomState((seed, "top_rs"))
    all_data = omniglot.load_rotated_alphabets(RS)
    train_data, tests_data = random_partition(all_data, RS, [12, 3])
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size
    script_parser = VectorParser()
    for i_script in range(N_scripts):
        script_parser[i_script] = np.zeros(N_weights)
    transform_parser = make_transform([0] * N_layers)
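    # Each script (alphabet) gets its own copy of the weights; a per-layer transform
    # matrix mixes the per-script copies, so the degree of cross-script weight
    # sharing is itself learned.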

    def get_layers(vect):
        layers = []
        for i_layer in range(N_layers):
            weights_by_scripts = vect.reshape((N_scripts, N_weights))
            weights_idxs, _ = w_parser.idxs_and_shapes[('weights', i_layer)]
            biases_idxs, _ = w_parser.idxs_and_shapes[('biases', i_layer)]
            assert weights_idxs.stop == biases_idxs.start
            layer_idxs = slice(weights_idxs.start, biases_idxs.stop)
            layers.append(weights_by_scripts[:, layer_idxs])
        return layers

    def transform_weights(z_vect, transform_vect):
        z_layers = get_layers(z_vect)
        transform = transform_parser.new_vect(transform_vect)
        w_layers = [np.dot(transform[i], z) for i, z in enumerate(z_layers)]
        return np.concatenate(w_layers, axis=1).ravel()

    def likelihood_loss(w_vect, data):
        w = script_parser.new_vect(w_vect)
        return sum([
            loss_fun(w[i], **script_data) for i, script_data in enumerate(data)
        ])

    def regularization(z_vect):
        return np.dot(z_vect, z_vect) * np.exp(log_L2)

    def train_z(data, transform_vect, RS):
        def primal_loss(z_vect,
                        transform_vect,
                        i_primal,
                        record_results=False):
            w_vect = transform_weights(z_vect, transform_vect)
            loss = likelihood_loss(w_vect, data)
            reg = regularization(z_vect)
            if record_results and i_primal % N_thin == 0:
                print "Iter {0}: train: {1}".format(i_primal,
                                                    getval(loss) / N_scripts)
            return loss + reg

        z_vect_0 = RS.randn(script_parser.vect.size) * np.exp(log_init_scale)
        return sgd(grad(primal_loss), transform_vect, z_vect_0, alpha, beta,
                   N_iters)

    def train_sharing():
        def hyperloss(transform_vect, i_hyper):
            RS = RandomState((seed, i_hyper, "hyperloss"))
            cur_train_data, cur_valid_data = random_partition(
                train_data, RS, [10, 2])
            z_vect_final = train_z(cur_train_data, transform_vect, RS)
            w_vect_final = transform_weights(z_vect_final, transform_vect)
            return likelihood_loss(w_vect_final, cur_valid_data) / N_scripts

        hypergrad = grad(hyperloss)
        cur_transform_vect = make_transform([init_script_corr] * N_layers).vect
        for i_hyper in range(N_meta_iter):
            print "Hyper iter {0}".format(i_hyper)
            grad_transform = hypergrad(cur_transform_vect, i_hyper)
            cur_transform_vect = cur_transform_vect - grad_transform * meta_alpha
        return cur_transform_vect

    transform_vects, train_losses, tests_losses = {}, {}, {}
    transform_vects['no_sharing'] = make_transform([0, 0, 0]).vect
    transform_vects['full_sharing'] = make_transform([1, 0, 0]).vect
    transform_vects['learned_sharing'] = train_sharing()
    for name in transform_vects.keys():
        RS = RandomState("final_training")
        tv = transform_vects[name]
        trained_z = train_z(train_data, tv, RS)
        trained_w = transform_weights(trained_z, tv)
        train_losses[name] = likelihood_loss(trained_w, train_data) / N_scripts
        tests_losses[name] = likelihood_loss(trained_w, tests_data) / N_scripts
        print "{0} : train: {1}, test: {2}".format(name, train_losses[name],
                                                   tests_losses[name])
    return transform_parser, transform_vects, train_losses, tests_losses
Exemplo n.º 31
0
def run():
    train_data, valid_data, tests_data = load_data_dicts(
        N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)

    def build_hypervect(init_log_alphas, init_invlogit_betas,
                        init_log_param_scale):
        hyperparams = VectorParser()
        hyperparams['log_param_scale'] = np.full(N_weight_types,
                                                 init_log_param_scale)
        hyperparams['log_alphas'] = np.full((N_iters, N_weight_types),
                                            init_log_alphas)
        hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types),
                                                init_invlogit_betas)
        return hyperparams

    hyperparams = build_hypervect(
        init_log_alphas, init_invlogit_betas,
        init_log_param_scale)  # Build just for parser.
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    def whetlab_optimize(loss, max_iters, callback):
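        # Instead of following hypergradients, query a module-level Whetlab
        # `scientist` object (defined outside this function) for suggested
        # hyperparameters and report back the negated loss.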
        for i in xrange(max_iters):
            params = scientist.suggest()
            hyperparams = build_hypervect(**params)
            cur_loss = loss(hyperparams.vect, i)
            scientist.update(params, -cur_loss)
            if callback: callback(hyperparams.vect, i)

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState(
                (seed, i_hyper,
                 i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs],
                            L2_vect)

        learning_curve_dict = defaultdict(list)

        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(
                    loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        W_opt = sgd_parsed(grad(indexed_loss_fun),
                           kylist(W0, alphas, betas, L2_reg),
                           parser,
                           callback=callback)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **train_data)

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]

    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        if metagrad is not None:
            meta_results['meta_grad_magnitude'].append(
                np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \
                                                   / (np.linalg.norm(metagrad)*
                                                      np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
            meta_results['tests_loss'][-1], meta_results['test_err'][-1])

    whetlab_optimize(hyperloss, N_meta_iter, meta_callback)
    best_params = scientist.best()
    print "best params:", best_params

    parser.vect = None  # No need to pickle zeros
    return meta_results, parser, best_params
Exemplo n.º 32
0
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas']      = np.full((N_iters, N_weight_types), init_log_alphas)
    hyperparams['invlogit_betas']  = np.full((N_iters, N_weight_types), init_invlogit_betas)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0 or i_iter == N_iters or i_iter == 0:
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))
                learning_curve_dict['iteration'].append(i_iter + 1)
                print "iteration", i_iter

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas  = logit(cur_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                           parser, callback=callback)
        return W_opt, learning_curve_dict

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]
    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        meta_results['example_weights'] = x
        if metagrad is not None:
            meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \
                                                   / (np.linalg.norm(metagrad)*
                                                      np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
            meta_results['tests_loss'][-1], meta_results['test_err'][-1])

    meta_callback(hyperparams.vect, N_meta_iter)
    parser.vect = None # No need to pickle zeros
    return meta_results, parser
Exemplo n.º 33
0
def run(params):

    medianLayer0 = params['ml1'][0]
    medianLayer1 = params['ml2'][0]
    medianLayer2 = params['ml3'][0]
    medianLayer3 = params['ml4'][0]

    # medianLayer0= 0.3
    # medianLayer1= 1.3
    # medianLayer2= 2.3
    # medianLayer3= 3.3


    RS = RandomState((seed, "top_rs"))
    data = loadData.loadMnist()

    train_data_subclass = []

    train_data, tests_data = loadData.load_data_as_dict(data, classNum)


    train_data_subclass = loadSubsetData(train_data, RS, N_train, clientNum)

    print "training samples {0}: testing samples: {1}".format(N_train,N_tests)


    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size
    init_scales = w_parser.new_vect(np.zeros(N_weights))
    for i in range(N_layers):
        init_scales[('weights', i)] = 1 / np.sqrt(layer_sizes[i])
        init_scales[('biases',  i)] = 1.0
    init_scales = init_scales.vect

    def process_reg(t_vect):
        # Remove the redundancy due to sharing regularization within units
        all_r = w_parser.new_vect(t_vect)
        new_r = np.zeros((0,))
        for i in range(N_layers):
            layer = all_r[('weights', i)]
            assert np.all(layer[:, 0] == layer[:, 1])
            cur_r = layer[:, 0]
            new_r = np.concatenate((new_r, cur_r))
        return new_r

    fraction_error = 0.00
    all_regs, all_tests_loss = [], []
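    # Hyper-optimization over the per-weight regularizer `reg`: each client's
    # constrained hypergradient, computed on its own train/validation split, is
    # applied to reg with step size meta_alpha / clientNum.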
    def train_reg(reg_0, constraint, N_meta_iter, i_top):
        def hyperloss(reg, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            w_vect_0 = RS.randn(N_weights) * init_scales
            w_vect_final = train_z(loss_fun, cur_train_data, w_vect_0, reg)
            # fraction_error = frac_err(w_vect_final,**cur_valid_data)
            return loss_fun(w_vect_final, **cur_valid_data)
        hypergrad = grad(hyperloss)

        #reg is the list of hyperparameters
        cur_reg = reg_0
        for i_hyper in range(N_meta_iter):
            if i_hyper % N_meta_thin == 0:
                tests_loss = hyperloss(cur_reg, i_hyper, train_data, tests_data)
                all_tests_loss.append(tests_loss)
                all_regs.append(cur_reg.copy())
                print "Hyper iter {0}, test loss {1}".format(i_hyper, all_tests_loss[-1])
                # print "Cur_reg", np.mean(cur_reg)
                print "Cur_reg", cur_reg

            for client_i in range(clientNum):
                RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
                cur_split = random_partition(train_data_subclass[client_i], RS,
                                             [N_train - N_valid, N_valid])
                # print("calculate hypergradients")
                raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
                constrained_grad = constrain_reg(w_parser, raw_grad, constraint)


                # cur_reg -= constrained_grad / np.abs(constrained_grad + 1e-8) * meta_alpha
                cur_reg -= constrained_grad * meta_alpha/clientNum

            print "\n"
            # print "constrained_grad",constrained_grad
        return cur_reg


    def new_hyperloss(reg, i_hyper, cur_train_data, cur_valid_data):
        RS = RandomState((seed, i_hyper, "hyperloss"))
        w_vect_0 = RS.randn(N_weights) * init_scales
        w_vect_final = train_z(loss_fun, cur_train_data, w_vect_0, reg)
        return loss_fun(w_vect_final, **cur_valid_data)

    # t_scale = [-1, 0, 1]
    # cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])
    # for s in t_scale:
    #     reg = np.ones(N_weights) * log_L2_init + s
    #     loss = new_hyperloss(reg, 0, *cur_split)
    #     print "Results: s= {0}, loss = {1}".format(s, loss)

    # reg = np.ones(N_weights) * log_L2_init
    shape0 = layer_sizes[0]
    shape1 = layer_sizes[1]
    shape2 = layer_sizes[2]
    shape3 = layer_sizes[3]

    l1 = np.ones(shape0 * shape1) * medianLayer0
    l2 = np.ones(shape1 * shape2 + shape1) * medianLayer1
    l3 = np.ones(shape2 * shape3 + shape2) * medianLayer2
    l4 = np.ones(shape3) * medianLayer3
    reg = np.concatenate([l1, l2, l3, l4])

    constraints = ['universal', 'layers', 'units']
    for i_top, (N_meta_iter, constraint) in enumerate(zip(all_N_meta_iter, constraints)):
        print "Top level iter {0}".format(i_top)
        reg = train_reg(reg, constraint, N_meta_iter, i_top)

    all_L2_regs = np.array(zip(*map(process_reg, all_regs)))
    # return all_L2_regs, all_tests_loss
    return all_tests_loss[-1]
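
A hedged sketch of the loadSubsetData helper used above, which is not shown in these listings. The assumption is that it simply shards the training dictionary into clientNum random, equally sized subsets; the name and signature come from the call site, everything else is guessed:

import numpy as np

def loadSubsetData(train_data, RS, N_train, clientNum):
    # Assumed behavior: shuffle the N_train examples once and give each
    # client an equal, disjoint slice of the data dict.
    perm = RS.permutation(N_train)
    per_client = N_train // clientNum
    subsets = []
    for c in range(clientNum):
        idxs = perm[c * per_client:(c + 1) * per_client]
        subsets.append({k: v[idxs] for k, v in train_data.items()})
    return subsets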
Exemplo n.º 34
0
def run():
    train_images, train_labels, _, _, _ = load_data(normalize=True)
    train_images = train_images[:N_data, :]
    train_labels = train_labels[:N_data, :]
    batch_idxs = BatchList(N_data, batch_size)
    iter_per_epoch = len(batch_idxs)
    parser, _, loss_fun, frac_err = make_nn_funs(layer_sizes,
                                                 L2_reg,
                                                 return_parser=True)
    N_weights = parser.N

    def indexed_loss_fun(w, idxs):
        return loss_fun(w, X=train_images[idxs], T=train_labels[idxs])

    log_alphas = np.full(N_iters, log_alpha_0)
    betas = np.full(N_iters, beta_0)
    npr.seed(2)
    V0 = npr.randn(N_weights) * velocity_scale
    #W0 = npr.randn(N_weights) * np.exp(log_param_scale)

    bindict = {
        k: np.linspace(-1, 1, N_bins) *
        np.exp(log_param_scale)  # Different cdf per layer.
        for k, v in parser.idxs_and_shapes.iteritems()
    }
    output = []
    for i in range(N_meta_iter):
        print "Meta iteration {0}".format(i)
        #X0, dX_dbins = bininvcdf(W_uniform, bins)
        # Weights are uniform samples passed through an inverse CDF.
        X_uniform = npr.rand(N_weights)
        X0 = np.zeros(N_weights)
        dX_dbins = {}
        for k, cur_bins in bindict.iteritems():
            cur_slice, cur_shape = parser.idxs_and_shapes[k]
            cur_xs = X_uniform[cur_slice]
            cur_X0, cur_dX_dbins = bininvcdf(cur_xs, cur_bins)
            X0[cur_slice] = cur_X0
            dX_dbins[k] = cur_dX_dbins
        results = sgd(indexed_loss_fun,
                      batch_idxs,
                      N_iters,
                      X0,
                      V0,
                      np.exp(log_alphas),
                      betas,
                      record_learning_curve=True)
        dL_dx = results['d_x']

        learning_curve = results['learning_curve']
        output.append((learning_curve, bindict))

        # Update bins with one gradient step.
        for k, bins in bindict.iteritems():
            dL_dbins = np.dot(parser.get(dL_dx, k).flatten(), dX_dbins[k])
            bins = bins - dL_dbins * bin_stepsize
            bins[[0, -1]] = bins[[0, -1]] - dL_dbins[[0, 1]] * bin_stepsize
            bindict[k] = np.sort(bins)
        bindict = bindict.copy()

    return output
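
bininvcdf is called above but never defined in these listings. A rough sketch of a compatible piecewise-linear inverse CDF is given below: it maps uniform samples onto the range spanned by the (sorted) bin edges and also returns the Jacobian of the samples with respect to those edges, which the meta-update contracts with dL_dx. The interface is inferred from the call sites; the real implementation may differ:

import numpy as np

def bininvcdf(u, bins):
    # Piecewise-linear inverse CDF: equal probability mass in each of the
    # len(bins) - 1 intervals between consecutive bin edges.
    n_bins = len(bins)
    scaled = u * (n_bins - 1)
    idx = np.clip(np.floor(scaled).astype(int), 0, n_bins - 2)
    t = scaled - idx                       # position within the interval
    x = bins[idx] * (1 - t) + bins[idx + 1] * t
    dx_dbins = np.zeros((len(u), n_bins))  # Jacobian d(samples)/d(bin edges)
    dx_dbins[np.arange(len(u)), idx] = 1 - t
    dx_dbins[np.arange(len(u)), idx + 1] = t
    return x, dx_dbins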
Exemplo n.º 35
0
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas']      = np.full((N_iters, N_weight_types), init_log_alphas)
    hyperparams['invlogit_betas']  = np.full((N_iters, N_weight_types), init_invlogit_betas)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState((seed, i_hyper, i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        init_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(init_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(init_hyperparams['log_alphas'])
        betas  = logit(init_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        W_opt = sgd_parsed(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg),
                           parser, callback=callback)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **train_data)
    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]
    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        if metagrad is not None:
            meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \
                                                   / (np.linalg.norm(metagrad)*
                                                      np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
            meta_results['tests_loss'][-1], meta_results['test_err'][-1])

    # Average many gradient evaluations at the initial point.
    hypergrads = np.zeros((N_gradients_in_average, hyperparams.vect.size))
    for i in xrange(N_gradients_in_average):
        hypergrads[i] = hyperloss_grad(hyperparams.vect, i)
        print i
    first_gradient = hypergrads[0]
    avg_gradient = np.mean(hypergrads, axis=0)

    # Now do a line search along that direction.
    parsed_avg_grad = hyperparams.new_vect(avg_gradient)
    stepsize_scale = stepsize_search_rescale/np.max(np.exp(parsed_avg_grad['log_alphas'].ravel()))
    stepsizes = np.linspace(-stepsize_scale, stepsize_scale, N_points_in_line_search)
    for i, stepsize in enumerate(stepsizes):
        cur_hypervect = hyperparams.vect - stepsize * avg_gradient
        meta_callback(cur_hypervect, 0)   # Use the same random seed every time.

    parser.vect = None # No need to pickle zeros
    return meta_results, parser, first_gradient, parsed_avg_grad, stepsizes
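
fill_parser appears throughout these examples to broadcast one value per named weight group (for example a per-layer scale or L2 strength) into a vector of length N_weights. A hedged sketch of that behavior, assuming parser.idxs_and_shapes maps each name to a (slice, shape) pair as in the earlier snippets:

import numpy as np

def fill_parser(parser, values):
    # Assumed behavior: values[i] is repeated over the i-th named block
    # (e.g. the weights or biases of one layer) of the flat weight vector.
    vect = np.zeros(parser.vect.size)
    for i, name in enumerate(parser.names):
        idxs, _ = parser.idxs_and_shapes[name]
        vect[idxs] = values[i]
    return vect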
Exemplo n.º 36
0
def run():
    train_data, valid_data, tests_data = load_data_dicts(
        N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)

    rs = RandomState((seed))
    init_fake_data = rs.randn(*(train_data['X'].shape)) * init_fake_data_scale
    one_hot = lambda x, K: np.array(x[:, None] == np.arange(K)[None, :],
                                    dtype=int)
    fake_labels = one_hot(np.array(range(N_train)) % N_classes,
                          N_classes)  # One of each.

    hyperparams = VectorParser()
    hyperparams['fake_data'] = init_fake_data
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_param_scale'] = np.full(N_weight_types,
                                                   init_log_param_scale)
    fixed_hyperparams['log_alphas'] = np.full((N_iters, N_weight_types),
                                              init_log_alphas)
    fixed_hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types),
                                                  init_invlogit_betas)
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    cur_primal_results = {}

    # loss_meta_parser = VectorParser()  # unused and incomplete; commented out
    # loss_meta_parser['']

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, meta_vect, i_iter):
            # `meta` (the kylist built below) holds the fake data, fake labels and L2 vector.
            (fake_X, fake_T, L2_vect) = meta
            return loss_fun(w, fake_X, fake_T, L2_vect)
            #return loss_fun(w, train_data['X'], train_data['T'], L2_vect + np.sum(fake_data.ravel()))

        learning_curve_dict = defaultdict(list)

        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                #        learning_curve_dict['learning_curve'].append(loss_fun(x, getval(cur_hyperparams['fake_data']), fake_labels))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        fake_data = cur_hyperparams['fake_data']
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(fixed_hyperparams['log_alphas'])
        betas = logit(fixed_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        meta = kylist(fake_data, fake_labels, L2_reg)
        W_opt = sgd_parsed(grad(indexed_loss_fun),
                           kylist(W0, alphas, betas, meta),
                           parser,
                           callback=callback)
        cur_primal_results['weights'] = getval(W_opt).copy()
        cur_primal_results['learning_curve'] = getval(learning_curve_dict)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)

    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]

    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        x, learning_curve_dict = cur_primal_results[
            'weights'], cur_primal_results['learning_curve']
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        #meta_results['train_loss'].append(loss_fun(x, getval(cur_hyperparams['fake_data']), fake_labels))
        meta_results['train_loss'].append(0)
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        meta_results['example_weights'] = x
        if metagrad is not None:
            print metagrad
            meta_results['meta_grad_magnitude'].append(
                np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \
                                                   / (np.linalg.norm(metagrad)*
                                                      np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
            meta_results['tests_loss'][-1], meta_results['test_err'][-1])

    final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback,
                        N_meta_iter, meta_alpha)
    meta_callback(final_result, N_meta_iter)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser
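
The outer optimization above hands hyperloss_grad to an adam routine that is not shown. A minimal sketch consistent with the call adam(hyperloss_grad, hyperparams.vect, meta_callback, N_meta_iter, meta_alpha) and with callbacks of the form callback(x, i, g); the b1/b2/eps defaults are assumptions:

import numpy as np

def adam(grad_fun, x0, callback=None, num_iters=100, step_size=0.1,
         b1=0.9, b2=0.999, eps=1e-8):
    # Standard Adam loop; returns the final iterate, as the snippet expects.
    x = x0.copy()
    m = np.zeros_like(x)
    v = np.zeros_like(x)
    for i in range(num_iters):
        g = grad_fun(x, i)
        if callback is not None:
            callback(x, i, g)
        m = b1 * m + (1 - b1) * g
        v = b2 * v + (1 - b2) * g ** 2
        mhat = m / (1 - b1 ** (i + 1))
        vhat = v / (1 - b2 ** (i + 1))
        x = x - step_size * mhat / (np.sqrt(vhat) + eps)
    return x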
Exemplo n.º 37
0
def run():
    RS = RandomState((seed, "top_rs"))
    all_data = mnist.load_data_as_dict()
    train_data, tests_data = random_partition(all_data, RS, [N_train, N_tests])
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size
    init_scales = w_parser.new_vect(np.zeros(N_weights))
    for i in range(N_layers):
        init_scales[('weights', i)] = 1 / np.sqrt(layer_sizes[i])
        init_scales[('biases', i)] = 1.0
    init_scales = init_scales.vect

    def regularization(w_vect, reg):
        return np.dot(w_vect, w_vect * np.exp(reg))

    def constrain_reg(t_vect, name):
        all_r = w_parser.new_vect(t_vect)
        for i in range(N_layers):
            all_r[('biases', i)] = 0.0
        if name == 'universal':
            r_mean = np.mean(
                [np.mean(all_r[('weights', i)]) for i in range(N_layers)])
            for i in range(N_layers):
                all_r[('weights', i)] = r_mean
        elif name == 'layers':
            for i in range(N_layers):
                all_r[('weights', i)] = np.mean(all_r[('weights', i)])
        elif name == 'units':
            for i in range(N_layers):
                all_r[('weights', i)] = np.mean(all_r[('weights', i)],
                                                axis=1,
                                                keepdims=True)
        else:
            raise Exception("Unknown constraint: {0}".format(name))
        return all_r.vect

    def process_reg(t_vect):
        # Remove the redundancy due to sharing regularization within units
        all_r = w_parser.new_vect(t_vect)
        new_r = np.zeros((0, ))
        for i in range(N_layers):
            layer = all_r[('weights', i)]
            assert np.all(layer[:, 0] == layer[:, 1])
            cur_r = layer[:, 0]
            new_r = np.concatenate((new_r, cur_r))
        return new_r

    def train_z(data, w_vect_0, reg):
        N_data = data['X'].shape[0]

        def primal_loss(w_vect, reg, i_primal, record_results=False):
            RS = RandomState((seed, i_primal, "primal"))
            idxs = RS.randint(N_data, size=batch_size)
            minibatch = dictslice(data, idxs)
            loss = loss_fun(w_vect, **minibatch)
            reg = regularization(w_vect, reg)
            if record_results and i_primal % N_thin == 0:
                print "Iter {0}: train: {1}".format(i_primal, getval(loss))
            return loss + reg

        return sgd(grad(primal_loss), reg, w_vect_0, alpha, beta, N_iters)

    all_regs, all_tests_loss = [], []

    def train_reg(reg_0, constraint, N_meta_iter, i_top):
        def hyperloss(reg, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            w_vect_0 = RS.randn(N_weights) * init_scales
            w_vect_final = train_z(cur_train_data, w_vect_0, reg)
            return loss_fun(w_vect_final, **cur_valid_data)

        hypergrad = grad(hyperloss)
        cur_reg = reg_0
        for i_hyper in range(N_meta_iter):
            if i_hyper % N_meta_thin == 0:
                tests_loss = hyperloss(cur_reg, i_hyper, train_data,
                                       tests_data)
                all_tests_loss.append(tests_loss)
                all_regs.append(cur_reg.copy())
                print "Hyper iter {0}, test loss {1}".format(
                    i_hyper, all_tests_loss[-1])
                print "Cur_reg", np.mean(cur_reg)
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            cur_split = random_partition(train_data, RS,
                                         [N_train - N_valid, N_valid])
            raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
            constrained_grad = constrain_reg(raw_grad, constraint)
            print constrained_grad
            # cur_reg -= constrained_grad / np.abs(constrained_grad + 1e-8) * meta_alpha
            cur_reg -= constrained_grad * meta_alpha

        return cur_reg

    def new_hyperloss(reg, i_hyper, cur_train_data, cur_valid_data):
        RS = RandomState((seed, i_hyper, "hyperloss"))
        w_vect_0 = RS.randn(N_weights) * init_scales
        w_vect_final = train_z(cur_train_data, w_vect_0, reg)
        return loss_fun(w_vect_final, **cur_valid_data)

    # t_scale = [-1, 0, 1]
    # cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])
    # for s in t_scale:
    #     reg = np.ones(N_weights) * log_L2_init + s
    #     loss = new_hyperloss(reg, 0, *cur_split)
    #     print "Results: s= {0}, loss = {1}".format(s, loss)

    reg = np.ones(N_weights) * log_L2_init
    constraints = ['universal', 'layers', 'units']
    for i_top, (N_meta_iter,
                constraint) in enumerate(zip(all_N_meta_iter, constraints)):
        print "Top level iter {0}".format(i_top)
        reg = train_reg(reg, constraint, N_meta_iter, i_top)

    all_L2_regs = np.array(zip(*map(process_reg, all_regs)))
    return all_L2_regs, all_tests_loss
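
Both random_partition and dictslice are used repeatedly but not defined in these listings. Plausible implementations are sketched below under the assumption that a data dict maps names such as 'X' and 'T' to arrays with the examples along the first axis; the real helpers may differ in detail:

import numpy as np

def dictslice(data, idxs):
    # Index every array in a data dict with the same indices.
    return {k: v[idxs] for k, v in data.items()}

def random_partition(data, RS, sizes):
    # Shuffle the examples once, then cut them into consecutive chunks
    # of the requested sizes (e.g. [N_train - N_valid, N_valid]).
    N = data['X'].shape[0]
    perm = RS.permutation(N)
    splits, start = [], 0
    for size in sizes:
        splits.append(dictslice(data, perm[start:start + size]))
        start += size
    return splits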
Exemplo n.º 38
0
def run():
    train_data, valid_data, tests_data = load_data_dicts(
        N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_param_scale'] = np.full(N_weight_types,
                                             init_log_param_scale)
    hyperparams['log_alphas'] = np.full((N_iters, N_weight_types),
                                        init_log_alphas)
    hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types),
                                            init_invlogit_betas)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_L2_reg'] = np.full(N_weight_types, init_log_L2_reg)

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState(
                (seed, i_hyper,
                 i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs],
                            L2_vect)

        learning_curve_dict = defaultdict(list)

        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(
                    loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        init_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(init_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(init_hyperparams['log_alphas'])
        betas = logit(init_hyperparams['invlogit_betas'])
        L2_reg = fill_parser(parser, np.exp(fixed_hyperparams['log_L2_reg']))
        W_opt = sgd_parsed(grad(indexed_loss_fun),
                           kylist(W0, alphas, betas, L2_reg),
                           parser,
                           callback=callback)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **train_data)

    hyperloss_grad = grad(hyperloss)

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]

    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        if metagrad is not None:
            meta_results['meta_grad_magnitude'].append(
                np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \
                                                   / (np.linalg.norm(metagrad)*
                                                      np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
            meta_results['tests_loss'][-1], meta_results['test_err'][-1])

    # Average many gradient evaluations at the initial point.
    hypergrads = np.zeros((N_gradients_in_average, hyperparams.vect.size))
    for i in xrange(N_gradients_in_average):
        hypergrads[i] = hyperloss_grad(hyperparams.vect, i)
        print i
    first_gradient = hypergrads[0]
    avg_gradient = np.mean(hypergrads, axis=0)

    # Now do a line search along that direction.
    parsed_avg_grad = hyperparams.new_vect(avg_gradient)
    stepsize_scale = 1000. / np.max(
        np.exp(parsed_avg_grad['log_alphas'].ravel()))
    stepsizes = np.linspace(-stepsize_scale, stepsize_scale,
                            N_points_in_line_search)
    for i, stepsize in enumerate(stepsizes):
        cur_hypervect = hyperparams.vect + stepsize * avg_gradient
        meta_callback(cur_hypervect, 0)  # Use the same random seed every time.

    parser.vect = None  # No need to pickle zeros
    return meta_results, parser, first_gradient, avg_gradient, stepsizes
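
All of these examples lean on VectorParser: named blocks stored contiguously in one flat vector, re-interpretable over any vector of the same length via new_vect. The sketch below approximates just the interface the snippets rely on (.vect, .names, .idxs_and_shapes, item get/set, new_vect); it is a simplification, not the library's actual implementation:

import numpy as np
from collections import OrderedDict

class VectorParser(object):
    def __init__(self):
        self.idxs_and_shapes = OrderedDict()
        self.vect = np.zeros((0,))

    @property
    def names(self):
        return self.idxs_and_shapes.keys()

    def __setitem__(self, name, value):
        value = np.atleast_1d(np.asarray(value, dtype=float))
        if name in self.idxs_and_shapes:
            # Assigning to an existing block broadcasts into its stored shape.
            idxs, shape = self.idxs_and_shapes[name]
            self.vect[idxs] = (np.zeros(shape) + value).ravel()
        else:
            # New blocks are appended to the flat vector.
            start = self.vect.size
            self.idxs_and_shapes[name] = (slice(start, start + value.size), value.shape)
            self.vect = np.concatenate([self.vect, value.ravel()])

    def __getitem__(self, name):
        idxs, shape = self.idxs_and_shapes[name]
        return self.vect[idxs].reshape(shape)

    def new_vect(self, vect):
        # Re-interpret a raw vector using this parser's layout.
        new = VectorParser.__new__(VectorParser)
        new.idxs_and_shapes = self.idxs_and_shapes
        new.vect = vect
        return new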
Exemplo n.º 39
0
def run():
    RS = RandomState((seed, "top_rs"))
    all_data = omniglot.load_flipped_alphabets()
    train_data, tests_data = random_partition(all_data, RS, [12, 3])
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size
    script_parser = VectorParser()
    for i_script in range(N_scripts):
        script_parser[i_script] = np.zeros(N_weights)
    transform_parser = make_transform([0] * N_layers)

    def get_layers(vect):
        layers = []
        for i_layer in range(N_layers):
            weights_by_scripts = vect.reshape((N_scripts, N_weights))
            weights_idxs, _ = w_parser.idxs_and_shapes[('weights', i_layer)]
            biases_idxs, _  = w_parser.idxs_and_shapes[('biases',  i_layer)]
            assert weights_idxs.stop == biases_idxs.start
            layer_idxs = slice(weights_idxs.start, biases_idxs.stop)
            layers.append(weights_by_scripts[:, layer_idxs])
        return layers

    def transform_weights(z_vect, transform_vect):
        z_layers = get_layers(z_vect)
        transform = transform_parser.new_vect(transform_vect)
        w_layers = [np.dot(transform[i], z) for i, z in enumerate(z_layers)]
        return np.concatenate(w_layers, axis=1).ravel()

    def likelihood_loss(w_vect, data):
        w = script_parser.new_vect(w_vect)
        return sum([loss_fun(w[i], **script_data) for i, script_data in enumerate(data)])

    def regularization(z_vect):
        return np.dot(z_vect, z_vect) * np.exp(log_L2)

    def train_z(data, transform_vect, RS):
        def primal_loss(z_vect, transform_vect, i_primal, record_results=False):
            w_vect = transform_weights(z_vect, transform_vect)
            loss = likelihood_loss(w_vect, data)
            reg = regularization(z_vect)
            if record_results and i_primal % N_thin == 0:
                print "Iter {0}: train: {1}".format(i_primal, getval(loss) / N_scripts)
            return loss + reg
        z_vect_0 = RS.randn(script_parser.vect.size) * np.exp(log_init_scale)
        return sgd(grad(primal_loss), transform_vect, z_vect_0, alpha, beta, N_iters)

    def train_sharing():
        def hyperloss(transform_vect, i_hyper):
            RS = RandomState((seed, i_hyper, "hyperloss"))
            cur_train_data, cur_valid_data = random_partition(train_data, RS, [10, 2])
            z_vect_final = train_z(cur_train_data, transform_vect, RS)
            w_vect_final = transform_weights(z_vect_final, transform_vect)
            return likelihood_loss(w_vect_final, cur_valid_data) / N_scripts
        hypergrad = grad(hyperloss)
        cur_transform_vect = make_transform([init_script_corr] * N_layers).vect
        for i_hyper in range(N_meta_iter):
            print "Hyper iter {0}".format(i_hyper)
            grad_transform = hypergrad(cur_transform_vect, i_hyper)
            cur_transform_vect = cur_transform_vect - grad_transform * meta_alpha
        return cur_transform_vect

    transform_vects, train_losses, tests_losses = {}, {}, {}
    transform_vects['no_sharing']      = make_transform([0, 0, 0]).vect
    transform_vects['full_sharing']    = make_transform([1, 0, 0]).vect
    transform_vects['learned_sharing'] = train_sharing()
    for name in transform_vects.keys():
        RS = RandomState("final_training")
        tv = transform_vects[name]
        trained_z = train_z(train_data, tv, RS)
        trained_w = transform_weights(trained_z, tv)
        train_losses[name] = likelihood_loss(trained_w, train_data) / N_scripts
        tests_losses[name] = likelihood_loss(trained_w, tests_data) / N_scripts
        print "{0} : train: {1}, test: {2}".format(name, train_losses[name], tests_losses[name])
    return transform_parser, transform_vects, train_losses, tests_losses
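
make_transform is not defined in these listings. Judging from the call sites in this example, it takes one sharing coefficient per layer and returns a VectorParser holding one scripts-by-scripts mixing matrix per layer. A hedged sketch, reusing the interpolation between an identity matrix (independent scripts) and a uniform averaging matrix (fully tied scripts) that later examples write out explicitly; N_scripts and VectorParser are assumed to be in scope as in the snippet above:

import numpy as np

def make_transform(layer_corrs):
    uncorrelated = np.eye(N_scripts)
    fully_correlated = np.full((N_scripts, N_scripts), 1.0 / N_scripts)
    parser = VectorParser()
    for i_layer, corr in enumerate(layer_corrs):
        # corr = 0 leaves scripts independent; corr = 1 ties them completely.
        parser[i_layer] = (1 - corr) * uncorrelated + corr * fully_correlated
    return parser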
Exemplo n.º 40
0
def run():
    """Three different parsers:
    w_parser[('biases', i_layer)] : neural net weights/biases per layer for a single  script
    script_parser[i_script]       : weights vector for each script
    transform_parser[i_layer]     : transform matrix (scripts x scripts) for each alphabet"""
    RS = RandomState((seed, "top_rs"))
    train_data, valid_data, tests_data = omniglot.load_data_split(
        [11, 2, 2], RS, num_alphabets=N_scripts)
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size
    transform_parser = make_transform(N_scripts, script_corr_init)
    script_parser = VectorParser()
    for i_script in range(N_scripts):
        script_parser[i_script] = np.zeros(N_weights)

    def get_layers(vect):
        layers = []
        for i_layer in range(N_layers):
            weights_by_scripts = vect.reshape((N_scripts, N_weights))
            weights_idxs, _ = w_parser.idxs_and_shapes[('weights', i_layer)]
            biases_idxs, _  = w_parser.idxs_and_shapes[('biases',  i_layer)]
            assert weights_idxs.stop == biases_idxs.start
            layer_idxs = slice(weights_idxs.start, biases_idxs.stop)
            layers.append(weights_by_scripts[:, layer_idxs])
        return layers

    def transform_weights(z_vect, transform_vect):
        z_layers = get_layers(z_vect)
        transform = transform_parser.new_vect(transform_vect)
        w_layers = [np.dot(transform[i], z) for i, z in enumerate(z_layers)]
        return np.concatenate(w_layers, axis=1).ravel()

    def total_loss(w_vect, data):
        w = script_parser.new_vect(w_vect)
        return sum([loss_fun(w[i], **script_data) for i, script_data in enumerate(data)])

    def regularization(z_vect):
        return np.dot(z_vect, z_vect) * np.exp(log_L2_init)

    results = defaultdict(list)
    def hyperloss(transform_vect, i_hyper, record_results=True):
        RS = RandomState((seed, i_hyper, "hyperloss"))
        def primal_loss(z_vect, transform_vect, i_primal, record_results=False):
            w_vect = transform_weights(z_vect, transform_vect)
            loss = total_loss(w_vect, train_data)
            reg = regularization(z_vect)
            if VERBOSE and record_results and i_primal % N_thin == 0:
                print "Iter {0}: train: {1}, valid: {2}, reg: {3}".format(
                    i_primal,
                    getval(loss) / N_scripts,
                    total_loss(getval(w_vect), valid_data) / N_scripts,
                    getval(reg))
            return loss + reg

        z_vect_0 = RS.randn(script_parser.vect.size) * np.exp(log_initialization_scale)
        z_vect_final = sgd(grad(primal_loss), transform_vect, z_vect_0,
                           alpha, beta, N_iters, callback=None)
        w_vect_final = transform_weights(z_vect_final, transform_vect)
        valid_loss = total_loss(w_vect_final, valid_data)
        if record_results:
            results['valid_loss'].append(getval(valid_loss) / N_scripts) 
            results['train_loss'].append(total_loss(w_vect_final, train_data) / N_scripts)
            results['tests_loss'].append(total_loss(w_vect_final, tests_data) / N_scripts)
        return valid_loss

    grad_transform = 0.0
    for i_hyper in range(N_grad_averages):
        grad_transform += grad(hyperloss)(transform_parser.vect, i_hyper, record_results=False)
    grad_transform /= N_grad_averages

    i_hyper = N_grad_averages
    for i, d in enumerate(line_search_dists):
        new_transform_vect = transform_parser.vect - d * grad_transform
        hyperloss(new_transform_vect, i_hyper, record_results=True)
        print "Hyper iter {0}".format(i)
        print "Results", {k : v[-1] for k, v in results.iteritems()}
        
    grad_transform_dict = transform_parser.new_vect(grad_transform).as_dict()
    return results, grad_transform_dict
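
In this example sgd receives the transform vector as an extra argument so that the outer grad() can differentiate through the entire training run with respect to it. A minimal forward-pass sketch of such a loop is shown below; the momentum form and the handling of the reverse (hypergradient) pass are assumptions, and the real routine is more involved:

import numpy as np

def sgd(grad_fun, meta, x0, alpha, beta, N_iters, callback=None):
    # grad_fun(x, meta, i) is the gradient of the primal loss with respect
    # to x; `meta` (here the transform vector) is threaded through unchanged
    # so a gradient can be taken with respect to it as well.
    x, v = x0.copy(), np.zeros(x0.size)
    for i in range(N_iters):
        g = grad_fun(x, meta, i)
        if callback is not None:
            callback(x, v, g, i)
        v = beta * v - (1.0 - beta) * g
        x = x + alpha * v
    return x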
Exemplo n.º 41
0
def run(script_corr):
    """Three different parsers:
    w_parser[('biases', i_layer)] : neural net weights/biases per layer for a single  script
    script_parser[i_script]       : weights vector for each script
    transform_parser[i_layer]     : transform matrix (scripts x scripts) for each alphabet"""
    RS = RandomState((seed, "top_rs"))
    train_data, valid_data, tests_data = omniglot.load_data_split([11, 2, 2], RS, num_alphabets=N_scripts)
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size

    uncorrelated_mat = np.eye(N_scripts)
    fully_correlated_mat = np.full((N_scripts, N_scripts), 1.0 / N_scripts)
    transform_mat = (1 - script_corr) * uncorrelated_mat + script_corr * fully_correlated_mat
    transform_parser = VectorParser()
    for i_layer in range(N_layers):
        if i_layer > 0:
            transform_parser[i_layer] = uncorrelated_mat
        else:
            transform_parser[i_layer] = transform_mat

    script_parser = VectorParser()
    for i_script in range(N_scripts):
        script_parser[i_script] = np.zeros(N_weights)

    def transform_weights(all_z_vect, transform_vect, i_script_out):
        all_z     =    script_parser.new_vect(    all_z_vect)
        transform = transform_parser.new_vect(transform_vect)
        W = OrderedDict() # Can't use parser because setting plain array ranges with funkyyak nodes not yet supported
        for k in w_parser.idxs_and_shapes.keys():
            W[k] = 0.0
        for i_layer in range(N_layers):
            script_weightings = transform[i_layer][i_script_out, :]
            for i_script in range(N_scripts):
                z_i_script = w_parser.new_vect(all_z[i_script])
                script_weighting = script_weightings[i_script]
                W[('biases', i_layer)]  += z_i_script[('biases',  i_layer)] * script_weighting
                W[('weights', i_layer)] += z_i_script[('weights', i_layer)] * script_weighting
        return np.concatenate([v.ravel() for v in W.values()])

    def loss_from_latents(z_vect, transform_vect, i_script, data):
        w_vect = transform_weights(z_vect, transform_vect, i_script)
        return loss_fun(w_vect, **data)

    def regularization(z_vect):
        return np.dot(z_vect, z_vect) * np.exp(log_L2_init)

    results = defaultdict(list)
    def hyperloss(transform_vect, i_hyper, record_results=False):
        def sub_primal_stochastic_loss(z_vect, transform_vect, i_primal, i_script):
            RS = RandomState((seed, i_hyper, i_primal, i_script))
            N_train = train_data[i_script]['X'].shape[0]
            idxs = RS.permutation(N_train)[:batch_size]
            minibatch = dictslice(train_data[i_script], idxs)
            loss = loss_from_latents(z_vect, transform_vect, i_script, minibatch)
            if i_primal % N_thin == 0 and i_script == 0:
                print "Iter {0}, full losses: train: {1}, valid: {2}".format(
                    i_primal,
                    total_loss(train_data, getval(z_vect)),
                    total_loss(valid_data, getval(z_vect)))
            if i_script == 0: # Only add regularization once
                loss += regularization(z_vect)

            return loss

        def total_loss(data, z_vect):
            return np.mean([loss_from_latents(z_vect, transform_vect, i_script, data[i_script])
                            for i_script in range(N_scripts)])

        z_vect_0 = RS.randn(script_parser.vect.size) * np.exp(log_initialization_scale)
        z_vect_final = sgd(grad(sub_primal_stochastic_loss), transform_vect, z_vect_0,
                           alpha, beta, N_iters, N_scripts_per_iter, callback=None)
        valid_loss = total_loss(valid_data, z_vect_final)
        if record_results:
            results['valid_loss'].append(valid_loss)
            results['train_loss'].append(total_loss(train_data, z_vect_final))
            # results['tests_loss'].append(total_loss(tests_data, z_vect_final))
        return valid_loss

    hyperloss(transform_parser.vect, 0, record_results=True)
    return results['train_loss'][-1], results['valid_loss'][-1]
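
For context, a hypothetical driver for this example: sweep run(script_corr) over a few sharing strengths and print the resulting losses. The grid and output format are illustrative only:

import numpy as np

if __name__ == '__main__':
    for script_corr in np.linspace(0.0, 1.0, 5):
        train_loss, valid_loss = run(script_corr)
        print "corr {0:.2f}: train {1:.4f}, valid {2:.4f}".format(
            script_corr, train_loss, valid_loss)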
Exemplo n.º 42
0
def run(script_corr):
    """Three different parsers:
    w_parser[('biases', i_layer)] : neural net weights/biases per layer for a single  script
    script_parser[i_script]       : weights vector for each script
    transform_parser[i_layer]     : transform matrix (scripts x scripts) for each alphabet"""
    RS = RandomState((seed, "top_rs"))
    train_data, valid_data, tests_data = omniglot.load_data_split(
        [11, 2, 2], RS, num_alphabets=N_scripts)
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size

    uncorrelated_mat = np.eye(N_scripts)
    fully_correlated_mat = np.full((N_scripts, N_scripts), 1.0 / N_scripts)
    transform_mat = (1 - script_corr
                     ) * uncorrelated_mat + script_corr * fully_correlated_mat
    transform_parser = VectorParser()
    for i_layer in range(N_layers):
        if i_layer > 0:
            transform_parser[i_layer] = uncorrelated_mat
        else:
            transform_parser[i_layer] = transform_mat

    script_parser = VectorParser()
    for i_script in range(N_scripts):
        script_parser[i_script] = np.zeros(N_weights)

    def transform_weights(all_z_vect, transform_vect, i_script_out):
        all_z = script_parser.new_vect(all_z_vect)
        transform = transform_parser.new_vect(transform_vect)
        W = OrderedDict(
        )  # Can't use parser because setting plain array ranges with funkyyak nodes not yet supported
        for k in w_parser.idxs_and_shapes.keys():
            W[k] = 0.0
        for i_layer in range(N_layers):
            script_weightings = transform[i_layer][i_script_out, :]
            for i_script in range(N_scripts):
                z_i_script = w_parser.new_vect(all_z[i_script])
                script_weighting = script_weightings[i_script]
                W[('biases',
                   i_layer)] += z_i_script[('biases',
                                            i_layer)] * script_weighting
                W[('weights',
                   i_layer)] += z_i_script[('weights',
                                            i_layer)] * script_weighting
        return np.concatenate([v.ravel() for v in W.values()])

    def loss_from_latents(z_vect, transform_vect, i_script, data):
        w_vect = transform_weights(z_vect, transform_vect, i_script)
        return loss_fun(w_vect, **data)

    def regularization(z_vect):
        return np.dot(z_vect, z_vect) * np.exp(log_L2_init)

    results = defaultdict(list)

    def hyperloss(transform_vect, i_hyper, record_results=False):
        def sub_primal_stochastic_loss(z_vect, transform_vect, i_primal,
                                       i_script):
            RS = RandomState((seed, i_hyper, i_primal, i_script))
            N_train = train_data[i_script]['X'].shape[0]
            idxs = RS.permutation(N_train)[:batch_size]
            minibatch = dictslice(train_data[i_script], idxs)
            loss = loss_from_latents(z_vect, transform_vect, i_script,
                                     minibatch)
            if i_primal % N_thin == 0 and i_script == 0:
                print "Iter {0}, full losses: train: {1}, valid: {2}".format(
                    i_primal, total_loss(train_data, getval(z_vect)),
                    total_loss(valid_data, getval(z_vect)))
            if i_script == 0:  # Only add regularization once
                loss += regularization(z_vect)

            return loss

        def total_loss(data, z_vect):
            return np.mean([
                loss_from_latents(z_vect, transform_vect, i_script,
                                  data[i_script])
                for i_script in range(N_scripts)
            ])

        z_vect_0 = RS.randn(
            script_parser.vect.size) * np.exp(log_initialization_scale)
        z_vect_final = sgd(grad(sub_primal_stochastic_loss),
                           transform_vect,
                           z_vect_0,
                           alpha,
                           beta,
                           N_iters,
                           N_scripts_per_iter,
                           callback=None)
        valid_loss = total_loss(valid_data, z_vect_final)
        if record_results:
            results['valid_loss'].append(valid_loss)
            results['train_loss'].append(total_loss(train_data, z_vect_final))
            # results['tests_loss'].append(total_loss(tests_data, z_vect_final))
        return valid_loss

    hyperloss(transform_parser.vect, 0, record_results=True)
    return results['train_loss'][-1], results['valid_loss'][-1]
Exemplo n.º 43
0
def run():
    RS = RandomState((seed, "top_rs"))
    all_data = mnist.load_data_as_dict()
    train_data, tests_data = random_partition(all_data, RS, [N_train, N_tests])
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size
    exact_metagrad = [np.array([0])] #just a placeholder

    def transform_weights(z_vect, transform):
        return z_vect * np.exp(transform)

    def regularization(z_vect):
        return np.dot(z_vect, z_vect) * np.exp(log_L2)

    def constrain_reg(t_vect, name):
        all_t = w_parser.new_vect(t_vect)
        for i in range(N_layers): #Don't regularize biases
            all_t[('biases', i)] = 0.0
        if name == 'universal': #One regularization hyperparameter for all weights
            #TODO: does computing means of means make sense? Not the same as just the mean of all.
            t_mean = np.mean([np.mean(all_t[('weights', i)])
                              for i in range(N_layers)])
            for i in range(N_layers):
                all_t[('weights', i)] = t_mean
        elif name == 'layers': #One regularization hyperparameter for each layer
            #TODO: changes the exact hypergradient norm, but not the DrMAD norm. Why??? DrMAD is already constrained?
            print t_vect.shape
            for i in range(N_layers):
                print "diff after contraining" + str(np.linalg.norm(all_t[('weights', i)] - np.mean(all_t[('weights', i)])))
                all_t[('weights', i)] = np.mean(all_t[('weights', i)])
        elif name == 'units':
            print t_vect.shape #44860; this is correct
            for i in range(N_layers):
                print "weights "+ str(i) + ": " + str(np.linalg.norm(np.mean(all_t[('weights', i)], axis=1, keepdims=True) - np.mean(all_t[('weights', i)], axis=1, keepdims=True)))
            #for i in range(N_layers):
                #TODO: This was the same as layer-wise
                #all_t[('weights', i)] = np.mean(all_t[('weights', i)], axis=1, keepdims=True)
        else:
            raise Exception("Unknown constraint: {0}".format(name))
        return all_t.vect

    def process_transform(t_vect):
        # Remove the redundancy due to sharing transformations within units
        all_t = w_parser.new_vect(t_vect)
        new_t = np.zeros((0,))
        for i in range(N_layers):
            layer = all_t[('weights', i)]
            assert np.all(layer[:, 0] == layer[:, 1])
            cur_t = log_L2 - 2 * layer[:, 0]
            new_t = np.concatenate((new_t, cur_t))
        return new_t
        
    #TODO: make sure the exact_metagrad gets passed by reference
    def train_z(data, z_vect_0, transform, exact_metagrad):
        N_data = data['X'].shape[0]
        
        def primal_loss(z_vect, transform, i_primal, record_results=False): #exact_metagrad=exact_metagrad2, record_results=False):
            RS = RandomState((seed, i_primal, "primal"))
            idxs = RS.randint(N_data, size=batch_size)
            minibatch = dictslice(data, idxs)
            w_vect = transform_weights(z_vect, transform)
            loss = loss_fun(w_vect, **minibatch)
            reg = regularization(z_vect)
            if record_results and i_primal % N_thin == 0:
                print "Iter {0}: train: {1}".format(i_primal, getval(loss))
            return loss + reg
        return sgd(grad(primal_loss), transform, z_vect_0, exact_metagrad, alpha, beta, N_iters)

    all_transforms, all_tests_loss, all_tests_rates, all_avg_regs = [], [], [], []
    def train_reg(reg_0, constraint, N_meta_iter, i_top, exact_metagrad):
        def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data, cur_tests_data, exact_metagrad):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform, exact_metagrad)
            w_vect_final = transform_weights(z_vect_final, transform)
            #TODO: print/store losses and error rates here
            print "Training loss (unregularized) = " +str(getval(loss_fun(w_vect_final, **cur_train_data)))
            print "Validation loss = " +str(getval(loss_fun(w_vect_final, **cur_valid_data)))
            print "Test loss = " +str(getval(loss_fun(w_vect_final, **tests_data)))
            print "Training error = "+ str(getval(frac_err(w_vect_final, **cur_train_data)))
            print "Validation error = "+ str(getval(frac_err(w_vect_final, **cur_valid_data)))
            print "Test error = "+ str(getval(frac_err(w_vect_final, **tests_data)))
            return loss_fun(w_vect_final, **cur_valid_data)
        hypergrad = grad(hyperloss) #No chain rule here

            
        '''def error_rate(transform, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform) #TODO: recomputing path?
            w_vect_final = transform_weights(z_vect_final, transform)
            return frac_err(w_vect_final, **cur_valid_data)'''

        cur_reg = reg_0
        for i_hyper in range(N_meta_iter):
            print "Hyper iter "+ str(i_hyper)
            """if i_hyper % N_meta_thin == 0:
                test_rate = error_rate(cur_reg, i_hyper, train_data, tests_data)
                all_tests_rates.append(test_rate)
                all_transforms.append(cur_reg.copy())
                all_avg_regs.append(np.mean(cur_reg))
                print "Hyper iter {0}, error rate {1}".format(i_hyper, all_tests_rates[-1])
                print "Cur_transform", np.mean(cur_reg)"""
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            #cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid]) #cur_train_data, cur_valid_data
            #raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
            cur_train_data, cur_valid_data = random_partition(train_data, RS, [N_train - N_valid, N_valid])
            raw_grad = hypergrad(cur_reg, i_hyper, cur_train_data, cur_valid_data, tests_data, exact_metagrad)
            #print "before constraining grad"
            constrained_grad = constrain_reg(raw_grad, constraint)
            # TODO: can put exact hypergradient here, using constraint
            #print "after constraining grad, before constraining exact"
            # TODO: DrMAD norm matches after constraining, but not exact norm?? Why???
            # This one is about 4x larger than constrained one
            print np.linalg.norm(raw_grad)
            print np.linalg.norm(exact_metagrad[0])
            constrained_exact_grad = constrain_reg(exact_metagrad[0], constraint)
            #print "after constraining exact"
            # TODO: compute statistics
            # TODO: sometimes negative???
            print("cosine of angle between DrMAD and exact = "
                +str(np.dot(constrained_grad, constrained_exact_grad)/(np.linalg.norm(constrained_grad)*np.linalg.norm(constrained_exact_grad))))
            print("cosine of angle between signs of DrMAD and exact = "
                +str(np.dot(np.sign(constrained_grad), np.sign(constrained_exact_grad))/len(constrained_grad)))
            print("DrMAD norm = "+ str(np.linalg.norm(constrained_grad)))
            print("Exact norm = "+ str(np.linalg.norm(constrained_exact_grad)))
            cur_reg -= np.sign(constrained_grad) * meta_alpha #TODO: signs of gradient...
            #TODO: momentum
        return cur_reg

    reg = np.zeros(N_weights) + 0.2
    constraints = ['universal', 'layers', 'units']
    for i_top, (N_meta_iter, constraint) in enumerate(zip(all_N_meta_iter, constraints)):
        print "Top level iter {0}".format(i_top), constraint
        reg = train_reg(reg, constraint, N_meta_iter, i_top, exact_metagrad)

    all_L2_regs = np.array(zip(*map(process_transform, all_transforms)))
    return all_L2_regs, all_tests_rates, all_avg_regs
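
The diagnostics printed inside train_reg above compare the DrMAD hypergradient with the exact one. The same checks, gathered into a small helper for reuse, are sketched below; this is a hedged refactoring suggestion, not part of the original code:

import numpy as np

def gradient_agreement(approx_grad, exact_grad):
    # Cosine of the angle between the two hypergradients, agreement of their
    # signs, and their norms, mirroring the print statements above.
    cosine = np.dot(approx_grad, exact_grad) / (
        np.linalg.norm(approx_grad) * np.linalg.norm(exact_grad))
    sign_agreement = np.dot(np.sign(approx_grad),
                            np.sign(exact_grad)) / len(approx_grad)
    return {'cosine': cosine,
            'sign_agreement': sign_agreement,
            'approx_norm': np.linalg.norm(approx_grad),
            'exact_norm': np.linalg.norm(exact_grad)}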
Exemplo n.º 44
0
def run():
    train_data, valid_data, tests_data = load_data_dicts(N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes) #only uses two different regularization hyperparameters, one for each layer?
    N_weight_types = len(parser.names) # = 2
    print(parser.names)
    hyperparams = VectorParser()
    hyperparams['log_L2_reg']      = np.full(N_weight_types, init_log_L2_reg)
    hyperparams['log_param_scale'] = np.full(N_weight_types, init_log_param_scale)
    hyperparams['log_alphas']      = np.full(N_iters, init_log_alphas)
    hyperparams['invlogit_betas']  = np.full(N_iters, init_invlogit_betas)
    fixed_hyperparams = VectorParser()
    fixed_hyperparams['log_param_scale']  = np.full(N_iters, init_log_param_scale) #don't update scale
    #TODO: remove scale from gradient, then?
    
    exact_metagrad = VectorParser()
    exact_metagrad['log_L2_reg']      = fill_parser(parser, hyperparams['log_L2_reg']) #np.zeros(N_weight_types)
    exact_metagrad['log_param_scale'] = fill_parser(parser, fixed_hyperparams['log_param_scale']) #np.zeros(N_weight_types)
    exact_metagrad['log_alphas']      = np.zeros(N_iters)
    exact_metagrad['invlogit_betas']  = np.zeros(N_iters)
    
    exact_metagrad2 = VectorParser()
    exact_metagrad2['log_L2_reg']      = np.zeros(N_weight_types)
    exact_metagrad2['log_param_scale'] = np.zeros(N_weight_types)
    exact_metagrad2['log_alphas']      = np.zeros(N_iters)
    exact_metagrad2['invlogit_betas']  = np.zeros(N_iters)
    
    #exact_metagrad = exact_metagradV.vect
    #print(hyperparams.vect)
    #exact_metagrad = [np.zeros(N_weight_types), np.zeros(N_weight_types), np.zeros(N_iters), np.zeros(N_iters)] #initialize

    # TODO: memoize
    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = npr.RandomState(npr.RandomState(global_seed + i_hyper + i_iter * 10000).randint(1000))
            seed = i_hyper * 10**6 + i_iter   # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs], L2_vect)

        learning_curve_dict = defaultdict(list)
        def callback(x, v, g, i_iter):
            if i_iter % thin == 0: # N_batches=10 times
                learning_curve_dict['learning_curve'].append(loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        # TODO: why doesn't the following line work with N_iter=1?
        W0 = fill_parser(parser, np.exp(fixed_hyperparams['log_param_scale'])) #don't update scale
        W0 *= npr.RandomState(global_seed + i_hyper).randn(W0.size)
        # TODO: Put on proper scale; no SGD on log/invlogit scale
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas  = logit(cur_hyperparams['invlogit_betas'])
        
        # TODO: check this
        L2_reg = fill_parser(parser, np.exp(cur_hyperparams['log_L2_reg']))
        
        W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), exact_metagrad, callback)
        #W_opt = sgd4(grad(indexed_loss_fun), kylist(W0, alphas, betas, L2_reg), callback)
        #callback(W_opt, N_iters)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)
    hyperloss_grad = grad(hyperloss)
    # TODO: This is where the chain rule happens, dhyperloss/dW_opt x dW_opt/dhyperparam_vect; first term is SGD

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]
    #def meta_callback(hyperparam_vect, i_hyper, metagrad):
    def meta_callback(hyperparam_vect, i_hyper, metagrad, exact_metagrad=exact_metagrad):
        x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field].append(cur_hyperparams[field])
        # these are the unregularized losses below; default sets L2_reg=0.0
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['train_err'].append(frac_err(x, **train_data))
        meta_results['valid_err'].append(frac_err(x, **valid_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        print("metagrad", len(metagrad))
        meta_results['meta_grad_magnitude'].append(np.linalg.norm(metagrad))
        meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \
                                               / (np.linalg.norm(metagrad)*
                                                  np.linalg.norm(old_metagrad[0])))
        #Michael: added comparisons with exact metagrad here
        #(2) Angle condition:  More strongly, is the cosine of the angle between the two strictly bounded away from 0?
        #(3) Length: Since hypergradient optimization procedures do not necessarily use a proper line search, it may also be important for the approximate hypergradient to have a length comparable to the true hypergradient.
        
        
        exact_metagrad2['log_L2_reg']      = [sum(exact_metagrad['log_L2_reg'][0:7840]), sum(exact_metagrad['log_L2_reg'][7840:7850])]
        exact_metagrad2['log_param_scale'] = [sum(exact_metagrad['log_param_scale'][0:7840]), sum(exact_metagrad['log_param_scale'][7840:7850])]
        exact_metagrad2['log_alphas']      = exact_metagrad['log_alphas']
        exact_metagrad2['invlogit_betas']  = exact_metagrad['invlogit_betas']
    
        meta_results['exact_meta_grad_magnitude'].append(np.linalg.norm(exact_metagrad2.vect))
        meta_results['DrMAD_exact_angle'].append(np.dot(exact_metagrad2.vect, metagrad) \
                                               / (np.linalg.norm(metagrad)*
                                                  np.linalg.norm(exact_metagrad2.vect)))
    
        #TODO: do the above for parameters separately? E.g. check log_alphas separately
                                                  
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
            meta_results['tests_loss'][-1], meta_results['test_err'][-1])  #Michael: train->tests
#    final_result = adam(hyperloss_grad, hyperparams.vect,
#                            meta_callback, N_meta_iter, meta_alpha)
    final_result = adam(hyperloss_grad, hyperparams.vect, exact_metagrad,
                            meta_callback, N_meta_iter, meta_alpha)
    #write modified adam to ignore exact hypergrad in sgd4_mad_with_exact
    #meta_callback(final_result, N_meta_iter)
    parser.vect = None # No need to pickle zeros
    return meta_results, parser
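
Throughout these examples RandomState is seeded with tuples such as (seed, i_hyper, "hyperloss") to obtain streams that are deterministic yet distinct per meta-iteration. A rough sketch of how such tuple seeding could work is given below as a factory function; the real class may hash the tuple differently:

import hashlib
import numpy.random as npr

def RandomState(seed_input):
    # Assumed behavior: hash an arbitrary (possibly nested) tuple of ints and
    # strings down to a 32-bit integer seed.
    digest = hashlib.sha256(repr(seed_input).encode('utf-8')).hexdigest()
    return npr.RandomState(int(digest, 16) % (2 ** 32))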
Exemplo n.º 45
0
def run():
    RS = RandomState((seed, "top_rs"))
    all_data = mnist.load_data_as_dict()
    train_data, tests_data = random_partition(all_data, RS, [N_train, N_tests])
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size

    def transform_weights(z_vect, transform):
        return z_vect * np.exp(transform)

    def regularization(z_vect):
        return np.dot(z_vect, z_vect) * np.exp(log_L2)

    def constrain_transform(t_vect, name):
        all_t = w_parser.new_vect(t_vect)
        for i in range(N_layers):
            all_t[("biases", i)] = 0.0
        if name == "universal":
            t_mean = np.mean([np.mean(all_t[("weights", i)]) for i in range(N_layers)])
            for i in range(N_layers):
                all_t[("weights", i)] = t_mean
        elif name == "layers":
            for i in range(N_layers):
                all_t[("weights", i)] = np.mean(all_t[("weights", i)])
        elif name == "units":
            for i in range(N_layers):
                all_t[("weights", i)] = np.mean(all_t[("weights", i)], axis=1, keepdims=True)
        else:
            raise Exception("Unknown transform constraint: {0}".format(name))
        return all_t.vect

    def process_transform(t_vect):
        # Remove the redundancy due to sharing transformations within units
        all_t = w_parser.new_vect(t_vect)
        new_t = np.zeros((0,))
        for i in range(N_layers):
            layer = all_t[("weights", i)]
            assert np.all(layer[:, 0] == layer[:, 1])
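            # With w = z * exp(t), the penalty exp(log_L2) * z**2 on z equals
            # exp(log_L2 - 2*t) * w**2 on w, so this is the per-unit log-L2 strength.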
            cur_t = log_L2 - 2 * layer[:, 0]
            new_t = np.concatenate((new_t, cur_t))
        return new_t

    def train_z(data, z_vect_0, transform):
        N_data = data["X"].shape[0]

        def primal_loss(z_vect, transform, i_primal, record_results=False):
            RS = RandomState((seed, i_primal, "primal"))
            idxs = RS.randint(N_data, size=batch_size)
            minibatch = dictslice(data, idxs)
            w_vect = transform_weights(z_vect, transform)
            loss = loss_fun(w_vect, **minibatch)
            reg = regularization(z_vect)
            if record_results and i_primal % N_thin == 0:
                print "Iter {0}: train: {1}".format(i_primal, getval(loss))
            return loss + reg

        return sgd(grad(primal_loss), transform, z_vect_0, alpha, beta, N_iters)

    all_transforms, all_tests_loss = [], []

    def train_reg(transform_0, constraint, N_meta_iter, i_top):
        def hyperloss(transform, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            z_vect_0 = RS.randn(N_weights) * np.exp(log_init_scale)
            z_vect_final = train_z(cur_train_data, z_vect_0, transform)
            w_vect_final = transform_weights(z_vect_final, transform)
            return loss_fun(w_vect_final, **cur_valid_data)

        hypergrad = grad(hyperloss)
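        # grad(hyperloss) differentiates the validation loss with respect to the
        # transform by backpropagating through the entire inner SGD run in train_z.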
        cur_transform = transform_0
        for i_hyper in range(N_meta_iter):
            if i_hyper % N_meta_thin == 0:
                tests_loss = hyperloss(cur_transform, i_hyper, train_data, tests_data)
                all_tests_loss.append(tests_loss)
                all_transforms.append(cur_transform.copy())
                print "Hyper iter {0}, test loss {1}".format(i_hyper, all_tests_loss[-1])
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])
            raw_grad = hypergrad(cur_transform, i_hyper, *cur_split)
            constrained_grad = constrain_transform(raw_grad, constraint)
            cur_transform -= constrained_grad * meta_alpha
        return cur_transform

    transform = np.zeros(N_weights)
    constraints = ["universal", "layers", "units"]
    for i_top, (N_meta_iter, constraint) in enumerate(zip(all_N_meta_iter, constraints)):
        print "Top level iter {0}".format(i_top)
        transform = train_reg(transform, constraint, N_meta_iter, i_top)

    all_L2_regs = np.array(zip(*map(process_transform, all_transforms)))
    return all_L2_regs, all_tests_loss
Exemplo n.º 46
0
def run(subClassIndexList):
    RS = RandomState((seed, "to p_rs"))
    data = loadData.loadMnist()


    train_data, tests_data = loadData.load_data_as_dict(data, classNum, subClassIndexList[0])

    # Split the training set into one shard per client.
    train_data_subclass = loadSubsetData(train_data, RS, N_train, clientNum)

    print "training samples: {0}, testing samples: {1}".format(N_train, N_tests)


    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size
    init_scales = w_parser.new_vect(np.zeros(N_weights))
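    # Fill per-layer scales below: 1/sqrt(layer_sizes[i]) for weights (presumably the
    # fan-in of layer i) and 1.0 for biases, then flatten to a full N_weights vector.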
    for i in range(N_layers):
        init_scales[('weights', i)] = 1 / np.sqrt(layer_sizes[i])
        init_scales[('biases',  i)] = 1.0
    init_scales = init_scales.vect


    all_regs, all_tests_loss = [], []
    def train_reg(reg_0, constraint, N_meta_iter, i_top):
        def hyperloss(reg, i_hyper, cur_train_data, cur_valid_data):
            RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
            w_vect_0 = RS.randn(N_weights) * init_scales
            w_vect_final = train_z(loss_fun, cur_train_data, w_vect_0, reg)
            return loss_fun(w_vect_final, **cur_valid_data)
        hypergrad = grad(hyperloss)

        #reg is the list of hyperparameters
        cur_reg = reg_0
        for i_hyper in range(N_meta_iter):
            if i_hyper % N_meta_thin == 0:
                tests_loss = hyperloss(cur_reg, i_hyper, train_data, tests_data)
                all_tests_loss.append(tests_loss)
                all_regs.append(cur_reg.copy())
                print "Hyper iter {0}, test loss {1}".format(i_hyper, all_tests_loss[-1])
                # print "Cur_reg", np.mean(cur_reg)
                print "Cur_reg", cur_reg

            # One hypergradient step per client; dividing the step by clientNum averages
            # the per-client updates into a single meta-update.
            for client_i in range(clientNum):
                RS = RandomState((seed, i_top, i_hyper, "hyperloss"))
                cur_split = random_partition(train_data_subclass[client_i], RS, [N_train - N_valid, N_valid])
                # print("calculate hypergradients")
                raw_grad = hypergrad(cur_reg, i_hyper, *cur_split)
                constrained_grad = constrain_reg(w_parser, raw_grad, constraint)

                # cur_reg -= constrained_grad / np.abs(constrained_grad + 1e-8) * meta_alpha/clientNum
                cur_reg -= constrained_grad * meta_alpha / clientNum

            print "\n"


        return cur_reg


    def new_hyperloss(reg, i_hyper, cur_train_data, cur_valid_data):
        RS = RandomState((seed, i_hyper, "hyperloss"))
        w_vect_0 = RS.randn(N_weights) * init_scales
        w_vect_final = train_z(loss_fun, cur_train_data, w_vect_0, reg)
        return loss_fun(w_vect_final, **cur_valid_data)

    # t_scale = [-1, 0, 1]
    # cur_split = random_partition(train_data, RS, [N_train - N_valid, N_valid])
    # for s in t_scale:
    #     reg = np.ones(N_weights) * log_L2_init + s
    #     loss = new_hyperloss(reg, 0, *cur_split)
    #     print "Results: s= {0}, loss = {1}".format(s, loss)

    reg = np.ones(N_weights) * log_L2_init

    constraints = ['universal', 'layers', 'units']
    for i_top, (N_meta_iter, constraint) in enumerate(zip(all_N_meta_iter, constraints)):
        print "Top level iter {0}".format(i_top)
        reg = train_reg(reg, constraint, N_meta_iter, i_top)

    # NOTE: assumes process_reg takes the weight parser and a regularization vector.
    all_L2_regs = np.array(zip(*[process_reg(w_parser, r) for r in all_regs]))
    return all_L2_regs, all_tests_loss
Exemplo n.º 47
0
def run(script_corr_init):
    """Three different parsers:
    w_parser[('biases', i_layer)] : neural net weights/biases per layer for a single  script
    script_parser[i_script]       : weights vector for each script
    transform_parser[i_layer]     : transform matrix (scripts x scripts) for each alphabet"""
    RS = RandomState((seed, "top_rs"))
    train_data, valid_data, tests_data = omniglot.load_data_split([11, 2, 2], RS, num_alphabets=N_scripts)
    w_parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weights = w_parser.vect.size
    transform_parser = make_transform(N_scripts, script_corr_init)
    script_parser = VectorParser()
    for i_script in range(N_scripts):
        script_parser[i_script] = np.zeros(N_weights)

    def get_layers(vect):
        layers = []
        for i_layer in range(N_layers):
            weights_by_scripts = vect.reshape((N_scripts, N_weights))
            weights_idxs, _ = w_parser.idxs_and_shapes[("weights", i_layer)]
            biases_idxs, _ = w_parser.idxs_and_shapes[("biases", i_layer)]
            assert weights_idxs.stop == biases_idxs.start
            layer_idxs = slice(weights_idxs.start, biases_idxs.stop)
            layers.append(weights_by_scripts[:, layer_idxs])
        return layers

    def transform_weights(z_vect, transform_vect):
        z_layers = get_layers(z_vect)
        transform = transform_parser.new_vect(transform_vect)
        w_layers = [np.dot(transform[i], z) for i, z in enumerate(z_layers)]
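        # transform[i] is a (scripts x scripts) mixing matrix for layer i: each script's
        # weights become a weighted combination of every script's z parameters.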
        return np.concatenate(w_layers, axis=1).ravel()

    def total_loss(w_vect, data):
        w = script_parser.new_vect(w_vect)
        return sum([loss_fun(w[i], **script_data) for i, script_data in enumerate(data)])

    def regularization(z_vect):
        return np.dot(z_vect, z_vect) * np.exp(log_L2_init)

    results = defaultdict(list)

    def hyperloss(transform_vect, i_hyper, record_results=True):
        RS = RandomState((seed, i_hyper, "hyperloss"))

        def primal_loss(z_vect, transform_vect, i_primal, record_results=False):
            RS = RandomState((seed, i_hyper, i_primal, i_script))
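            # Note: RS is created but not used in this loss, and i_script here is the
            # leftover loop variable from the script_parser loop above (N_scripts - 1).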
            w_vect = transform_weights(z_vect, transform_vect)
            loss = total_loss(w_vect, train_data)
            reg = regularization(z_vect)
            if VERBOSE and record_results and i_primal % N_thin == 0:
                print "Iter {0}: train: {1}, valid: {2}, reg: {3}".format(
                    i_primal, getval(loss) / N_scripts, total_loss(getval(w_vect), valid_data) / N_scripts, getval(reg)
                )
            return loss + reg

        z_vect_0 = RS.randn(script_parser.vect.size) * np.exp(log_initialization_scale)
        z_vect_final = sgd(grad(primal_loss), transform_vect, z_vect_0, alpha, beta, N_iters, callback=None)
        w_vect_final = transform_weights(z_vect_final, transform_vect)
        valid_loss = total_loss(w_vect_final, valid_data)
        if record_results:
            results["valid_loss"].append(getval(valid_loss) / N_scripts)
            results["train_loss"].append(total_loss(w_vect_final, train_data) / N_scripts)
        return valid_loss

    hyperloss(transform_parser.vect, 0)
    return results["train_loss"][-1], results["valid_loss"][-1]
Exemplo n.º 48
0
def run():
    train_data, valid_data, tests_data = load_data_dicts(
        N_train, N_valid, N_tests)
    parser, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes)
    N_weight_types = len(parser.names)
    N_weights = len(parser.vect)
    hyperparams = VectorParser()
    rs = RandomState(seed)
    hyperparams['log_L2_reg'] = np.full(N_weights, init_log_L2_reg)\
                              + rs.randn(N_weights) * init_log_L2_reg_noise
    hyperparams['log_param_scale'] = np.full(N_weight_types,
                                             init_log_param_scale)
    hyperparams['log_alphas'] = np.full((N_iters, N_weight_types),
                                        init_log_alphas)
    hyperparams['invlogit_betas'] = np.full((N_iters, N_weight_types),
                                            init_invlogit_betas)
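    # log_alphas and invlogit_betas hold a separate learning-rate / momentum schedule for
    # every iteration and every weight type; log_L2_reg is per individual weight.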

    cur_primal_results = {}

    def primal_optimizer(hyperparam_vect, i_hyper):
        def indexed_loss_fun(w, L2_vect, i_iter):
            rs = RandomState(
                (seed, i_hyper,
                 i_iter))  # Deterministic seed needed for backwards pass.
            idxs = rs.randint(N_train, size=batch_size)
            return loss_fun(w, train_data['X'][idxs], train_data['T'][idxs],
                            L2_vect)

        learning_curve_dict = defaultdict(list)

        def callback(x, v, g, i_iter):
            if i_iter % thin == 0:
                learning_curve_dict['learning_curve'].append(
                    loss_fun(x, **train_data))
                learning_curve_dict['grad_norm'].append(np.linalg.norm(g))
                learning_curve_dict['weight_norm'].append(np.linalg.norm(x))
                learning_curve_dict['velocity_norm'].append(np.linalg.norm(v))

        cur_hyperparams = hyperparams.new_vect(hyperparam_vect)
        rs = RandomState((seed, i_hyper))
        W0 = fill_parser(parser, np.exp(cur_hyperparams['log_param_scale']))
        W0 *= rs.randn(W0.size)
        alphas = np.exp(cur_hyperparams['log_alphas'])
        betas = logit(cur_hyperparams['invlogit_betas'])
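        # 'invlogit_betas' is the unconstrained parameterization of the momentum; logit()
        # here is presumably this codebase's logistic sigmoid, mapping it back into (0, 1).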
        L2_reg = np.exp(cur_hyperparams['log_L2_reg'])
        W_opt = sgd_parsed(grad(indexed_loss_fun),
                           kylist(W0, alphas, betas, L2_reg),
                           parser,
                           callback=callback)
        cur_primal_results['weights'] = getval(W_opt).copy()
        cur_primal_results['learning_curve'] = getval(learning_curve_dict)
        return W_opt, learning_curve_dict

    def hyperloss(hyperparam_vect, i_hyper):
        W_opt, _ = primal_optimizer(hyperparam_vect, i_hyper)
        return loss_fun(W_opt, **valid_data)

    hyperloss_grad = grad(hyperloss)
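    # hyperloss_grad gives the hypergradient: the derivative of the validation loss with
    # respect to every hyperparameter (init scales, per-weight L2, and the per-iteration
    # alpha/beta schedules), obtained by differentiating through sgd_parsed.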

    meta_results = defaultdict(list)
    old_metagrad = [np.ones(hyperparams.vect.size)]
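    # Single-element list used as a mutable cell so meta_callback can update the previous
    # metagradient between calls.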

    def meta_callback(hyperparam_vect, i_hyper, metagrad=None):
        #x, learning_curve_dict = primal_optimizer(hyperparam_vect, i_hyper)
        x, learning_curve_dict = cur_primal_results[
            'weights'], cur_primal_results['learning_curve']
        cur_hyperparams = hyperparams.new_vect(hyperparam_vect.copy())
        for field in cur_hyperparams.names:
            meta_results[field] = cur_hyperparams[field]
        meta_results['train_loss'].append(loss_fun(x, **train_data))
        meta_results['valid_loss'].append(loss_fun(x, **valid_data))
        meta_results['tests_loss'].append(loss_fun(x, **tests_data))
        meta_results['test_err'].append(frac_err(x, **tests_data))
        meta_results['learning_curves'].append(learning_curve_dict)
        meta_results['example_weights'] = x
        if metagrad is not None:
            meta_results['meta_grad_magnitude'].append(
                np.linalg.norm(metagrad))
            meta_results['meta_grad_angle'].append(np.dot(old_metagrad[0], metagrad) \
                                                   / (np.linalg.norm(metagrad)*
                                                      np.linalg.norm(old_metagrad[0])))
        old_metagrad[0] = metagrad
        print "Meta Epoch {0} Train loss {1:2.4f} Valid Loss {2:2.4f}" \
              " Test Loss {3:2.4f} Test Err {4:2.4f}".format(
            i_hyper, meta_results['train_loss'][-1], meta_results['valid_loss'][-1],
            meta_results['tests_loss'][-1], meta_results['test_err'][-1])

    initial_hypergrad = hyperloss_grad(hyperparams.vect, 0)
    parsed_init_hypergrad = hyperparams.new_vect(initial_hypergrad.copy())
    final_result = adam(hyperloss_grad, hyperparams.vect, meta_callback,
                        N_meta_iter, meta_alpha)
    meta_callback(final_result, N_meta_iter)
    parser.vect = None  # No need to pickle zeros
    return meta_results, parser, parsed_init_hypergrad