Example #1
def test_vae():
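    # Small VAE: relu encoder on [X, y], linear mu / log_sigma, Gaussian
    # log-sample, relu decoder, sigmoid reconstruction; the cost is
    # binary crossentropy plus the KL term.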
    minibatch_size = 100
    random_state = np.random.RandomState(1999)
    graph = OrderedDict()

    X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)

    l1_enc = relu_layer([X_sym, y_sym], graph, 'l1_enc', proj_dim=20,
                        random_state=random_state)
    mu = linear_layer([l1_enc], graph, 'mu', proj_dim=10,
                      random_state=random_state)
    log_sigma = linear_layer([l1_enc], graph, 'log_sigma', proj_dim=10,
                             random_state=random_state)
    samp = gaussian_log_sample_layer([mu], [log_sigma], graph,
                                     'gaussian_log_sample',
                                     random_state=random_state)
    l1_dec = relu_layer([samp], graph, 'l1_dec', proj_dim=20,
                        random_state=random_state)
    out = sigmoid_layer([l1_dec], graph, 'out', proj_dim=X.shape[1],
                        random_state=random_state)

    kl = gaussian_log_kl([mu], [log_sigma], graph, 'gaussian_kl').mean()
    cost = binary_crossentropy(out, X_sym).mean() + kl
    params, grads = get_params_and_grads(graph, cost)
    learning_rate = 0.001
    opt = sgd(params)
    updates = opt.updates(params, grads, learning_rate)

    train_function = theano.function([X_sym, y_sym], [cost], updates=updates,
                                     mode="FAST_COMPILE")

    iterate_function(train_function, [X, y], minibatch_size,
                     list_of_output_names=["cost"], n_epochs=1)
Example #2
def test_gaussian_kl():
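    # KL divergence between two identical Gaussians built from X; only
    # checks that the expression compiles.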
    graph = OrderedDict()
    X_sym = add_datasets_to_graph([X], ["X"], graph)
    fake_sigma = (.5 * X_sym + .001) ** 2
    kl = gaussian_kl([X_sym, X_sym], [fake_sigma, fake_sigma], graph,
                     'gaussian_kl')
    theano.function([X_sym], [kl], mode="FAST_COMPILE")
Example #3
def test_feedforward_theano_mix():
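    # Mix a graph-built linear layer with a raw Theano expression
    # (rescaling l1_o) before the softmax classifier.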
    minibatch_size = 100
    random_state = np.random.RandomState(1999)
    graph = OrderedDict()

    X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)

    l1_o = linear_layer([X_sym], graph, 'l1', proj_dim=20,
                        random_state=random_state)
    l1_o = .999 * l1_o
    y_pred = softmax_layer([l1_o], graph, 'pred', n_classes,
                           random_state=random_state)

    cost = categorical_crossentropy(y_pred, y_sym).mean()
    params, grads = get_params_and_grads(graph, cost)
    learning_rate = 0.001
    opt = sgd(params)
    updates = opt.updates(params, grads, learning_rate)

    fit_function = theano.function([X_sym, y_sym], [cost], updates=updates,
                                   mode="FAST_COMPILE")

    cost_function = theano.function([X_sym, y_sym], [cost],
                                    mode="FAST_COMPILE")

    checkpoint_dict = {}
    train_indices = np.arange(len(X))
    valid_indices = np.arange(len(X))
    early_stopping_trainer(fit_function, cost_function, checkpoint_dict, [X, y],
                           minibatch_size,
                           train_indices, valid_indices,
                           fit_function_output_names=["cost"],
                           cost_function_output_name="valid_cost",
                           n_epochs=1)
Example #4
def test_batch_normalization():
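    # Two batch-normalized relu layers; the iscalar on_off switches between
    # the training pass (0 here, with updates applied) and the validation
    # pass (1, using stored statistics).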
    random_state = np.random.RandomState(1999)
    graph = OrderedDict()
    X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph,
                                         list_of_test_values=[X, y])
    on_off = tensor.iscalar()
    on_off.tag.test_value = 1
    l1 = relu_layer([X_sym], graph, "proj", proj_dim=5,
                    batch_normalize=True, mode_switch=on_off,
                    random_state=random_state)
    l2 = relu_layer([l1], graph, "proj2", proj_dim=5,
                    batch_normalize=True, mode_switch=on_off,
                    random_state=random_state)
    f = theano.function([X_sym, on_off], [l2], mode="FAST_COMPILE")
    params, grads = get_params_and_grads(graph, l2.mean())
    opt = sgd(params, .1)
    updates = opt.updates(params, grads)
    train_f = theano.function([X_sym, on_off], [l2], mode="FAST_COMPILE",
                              updates=updates)
    valid_f = theano.function([X_sym, on_off], [l2], mode="FAST_COMPILE")
    X1 = random_state.rand(*X.shape)
    X2 = np.vstack([X1, .5 * X1])
    t1 = train_f(X1, 0)[0]
    t2 = valid_f(X1, 1)[0]
    t3 = train_f(X2, 0)[0]
    t4 = valid_f(X1, 1)[0]
    t5 = valid_f(X1, 1)[0]
    assert_almost_equal(t4, t5)
    assert_raises(AssertionError, assert_almost_equal, t2, t4)
Example #5
def test_tanh_rnn():
    # random state so script is deterministic
    random_state = np.random.RandomState(1999)
    # home of the computational graph
    graph = OrderedDict()

    # number of hidden features
    n_hid = 10
    # number of output_features = input_features
    n_out = X.shape[-1]

    # input (where first dimension is time)
    datasets_list = [X, X_mask, y, y_mask]
    names_list = ["X", "X_mask", "y", "y_mask"]
    test_values_list = [X, X_mask, y, y_mask]
    X_sym, X_mask_sym, y_sym, y_mask_sym = add_datasets_to_graph(
        datasets_list, names_list, graph, list_of_test_values=test_values_list)

    # Setup weights
    l1 = linear_layer([X_sym], graph, 'l1_proj', n_hid, random_state)

    h = tanh_recurrent_layer([l1], X_mask_sym, n_hid, graph, 'l1_rec',
                             random_state)

    # linear output activation
    y_hat = linear_layer([h], graph, 'l2_proj', n_out, random_state)

    # error between output and target
    cost = squared_error(y_hat, y_sym)
    cost = masked_cost(cost, y_mask_sym).mean()
    # Parameters of the model
    params, grads = get_params_and_grads(graph, cost)

    # Use stochastic gradient descent to optimize
    opt = sgd(params)
    learning_rate = 0.001
    updates = opt.updates(params, grads, learning_rate)

    fit_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                   [cost],
                                   updates=updates,
                                   mode="FAST_COMPILE")

    cost_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                    [cost],
                                    mode="FAST_COMPILE")
    checkpoint_dict = {}
    train_indices = np.arange(X.shape[1])
    valid_indices = np.arange(X.shape[1])
    early_stopping_trainer(fit_function,
                           cost_function,
                           checkpoint_dict, [X, y],
                           minibatch_size,
                           train_indices,
                           valid_indices,
                           fit_function_output_names=["cost"],
                           cost_function_output_name="valid_cost",
                           n_epochs=1)
Example #6
def test_conditional_gru_recurrent():
    random_state = np.random.RandomState(1999)
    graph = OrderedDict()
    n_hid = 5
    n_out = n_chars

    # input (where first dimension is time)
    datasets_list = [X_mb, X_mask, y_mb, y_mask]
    names_list = ["X", "X_mask", "y", "y_mask"]
    X_sym, X_mask_sym, y_sym, y_mask_sym = add_datasets_to_graph(
        datasets_list, names_list, graph)

    h = gru_recurrent_layer([X_sym], X_mask_sym, n_hid, graph, 'l1_end',
                            random_state)

    shifted_y_sym = shift_layer([y_sym], graph, 'shift')

    h_dec, context = conditional_gru_recurrent_layer([y_sym], [h], y_mask_sym,
                                                     n_hid, graph, 'l2_dec',
                                                     random_state)

    # linear output activation
    y_hat = softmax_layer([h_dec, context, shifted_y_sym], graph, 'l2_proj',
                          n_out, random_state=random_state)

    # error between output and target
    cost = categorical_crossentropy(y_hat, y_sym)
    cost = masked_cost(cost, y_mask_sym).mean()
    # Parameters of the model
    """
    params, grads = get_params_and_grads(graph, cost)

    # Use stochastic gradient descent to optimize
    opt = sgd(params)
    learning_rate = 0.00000
    updates = opt.updates(params, grads, learning_rate)


    fit_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                   [cost], updates=updates,
                                   mode="FAST_COMPILE")
    """

    cost_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                    [cost], mode="FAST_COMPILE")

    checkpoint_dict = {}
    train_indices = np.arange(len(X))
    valid_indices = np.arange(len(X))
    early_stopping_trainer(cost_function, cost_function,
                           train_indices, valid_indices,
                           checkpoint_dict,
                           [X, y],
                           minibatch_size,
                           list_of_minibatch_functions=[text_minibatch_func],
                           list_of_train_output_names=["cost"],
                           valid_output_name="valid_cost",
                           n_epochs=1)
Example #7
def test_conditional_gru_recurrent():
    random_state = np.random.RandomState(1999)
    graph = OrderedDict()
    n_hid = 5
    n_out = n_chars

    # input (where first dimension is time)
    datasets_list = [X_mb, X_mask, y_mb, y_mask]
    names_list = ["X", "X_mask", "y", "y_mask"]
    X_sym, X_mask_sym, y_sym, y_mask_sym = add_datasets_to_graph(
        datasets_list, names_list, graph)

    h = gru_recurrent_layer([X_sym], X_mask_sym, n_hid, graph, 'l1_end',
                            random_state)

    shifted_y_sym = shift_layer([y_sym], graph, 'shift')

    h_dec, context = conditional_gru_recurrent_layer([y_sym], [h], y_mask_sym,
                                                     n_hid, graph, 'l2_dec',
                                                     random_state)

    # linear output activation
    y_hat = softmax_layer([h_dec, context, shifted_y_sym], graph, 'l2_proj',
                          n_out, random_state)

    # error between output and target
    cost = categorical_crossentropy(y_hat, y_sym)
    cost = masked_cost(cost, y_mask_sym).mean()
    # Parameters of the model
    """
    params, grads = get_params_and_grads(graph, cost)

    # Use stochastic gradient descent to optimize
    opt = sgd(params)
    learning_rate = 0.00000
    updates = opt.updates(params, grads, learning_rate)


    fit_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                   [cost], updates=updates,
                                   mode="FAST_COMPILE")
    """

    cost_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                    [cost], mode="FAST_COMPILE")

    checkpoint_dict = {}
    train_indices = np.arange(len(X))
    valid_indices = np.arange(len(X))
    early_stopping_trainer(cost_function, cost_function, checkpoint_dict,
                           [X, y],
                           minibatch_size, train_indices, valid_indices,
                           list_of_minibatch_functions=[text_minibatch_func],
                           fit_function_output_names=["cost"],
                           cost_function_output_name="valid_cost",
                           n_epochs=1)
Example #8
def test_tanh_rnn():
    # random state so script is deterministic
    random_state = np.random.RandomState(1999)
    # home of the computational graph
    graph = OrderedDict()

    # number of hidden features
    n_hid = 10
    # number of output_features = input_features
    n_out = X.shape[-1]

    # input (where first dimension is time)
    datasets_list = [X, X_mask, y, y_mask]
    names_list = ["X", "X_mask", "y", "y_mask"]
    test_values_list = [X, X_mask, y, y_mask]
    X_sym, X_mask_sym, y_sym, y_mask_sym = add_datasets_to_graph(
        datasets_list, names_list, graph, list_of_test_values=test_values_list)

    # Setup weights
    l1 = linear_layer([X_sym], graph, 'l1_proj', proj_dim=n_hid,
                      random_state=random_state)

    h = tanh_recurrent_layer([l1], X_mask_sym, n_hid, graph, 'l1_rec',
                             random_state)

    # linear output activation
    y_hat = linear_layer([h], graph, 'l2_proj', proj_dim=n_out,
                         random_state=random_state)

    # error between output and target
    cost = squared_error(y_hat, y_sym)
    cost = masked_cost(cost, y_mask_sym).mean()
    # Parameters of the model
    params, grads = get_params_and_grads(graph, cost)

    # Use stochastic gradient descent to optimize
    learning_rate = 0.001
    opt = sgd(params, learning_rate)
    updates = opt.updates(params, grads)

    fit_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                   [cost], updates=updates, mode="FAST_COMPILE")

    cost_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                    [cost], mode="FAST_COMPILE")
    checkpoint_dict = {}
    train_indices = np.arange(X.shape[1])
    valid_indices = np.arange(X.shape[1])
    early_stopping_trainer(fit_function, cost_function,
                           train_indices, valid_indices,
                           checkpoint_dict,
                           [X, y], minibatch_size,
                           list_of_train_output_names=["cost"],
                           valid_output_name="valid_cost",
                           n_epochs=1)
Example #9
def test_conv2d_layer():
    random_state = np.random.RandomState(42)
    graph = OrderedDict()
    # 3 channel mnist
    X_r = np.random.randn(10, 3, 28, 28).astype(theano.config.floatX)
    X_sym = add_datasets_to_graph([X_r], ["X"], graph)
    l1 = conv2d_layer([X_sym], graph, 'l1', 5, random_state=random_state)
    # test that they can stack as well
    l2 = conv2d_layer([l1], graph, 'l2', 6, random_state=random_state)
    f = theano.function([X_sym], [l1, l2], mode="FAST_COMPILE")
    l1, l2 = f(X_r)
Example #10
def test_softmax_sample_layer():
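    # Draw samples from a softmax distribution and project them with a
    # linear layer; only checks compilation.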
    random_state = np.random.RandomState(42)
    graph = OrderedDict()
    X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)
    softmax = softmax_layer([X_sym], graph, 'softmax', proj_dim=20,
                            random_state=random_state)
    samp = softmax_sample_layer([softmax], graph, 'softmax_sample',
                                random_state=random_state)
    out = linear_layer([samp], graph, 'out', proj_dim=10,
                       random_state=random_state)
    f = theano.function([X_sym], [out], mode="FAST_COMPILE")
Example #11
def test_gaussian_sample_layer():
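    # Sample from a Gaussian with learned mu (linear) and sigma (softplus),
    # then project the sample; only checks compilation.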
    random_state = np.random.RandomState(42)
    graph = OrderedDict()
    X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)
    mu = linear_layer([X_sym], graph, 'mu', proj_dim=20,
                      random_state=random_state)
    sigma = softplus_layer([X_sym], graph, 'sigma', proj_dim=20,
                           random_state=random_state)
    samp = gaussian_sample_layer([mu], [sigma], graph, 'gaussian_sample',
                                 random_state=random_state)
    out = linear_layer([samp], graph, 'out', proj_dim=10,
                       random_state=random_state)
    f = theano.function([X_sym], [out], mode="FAST_COMPILE")
Example #12
def test_dropout_layer():
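    # Dropout controlled by the iscalar on_off; the assertion below checks
    # that roughly half of the summed activation is dropped when on_off is 1.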
    random_state = np.random.RandomState(42)
    graph = OrderedDict()
    X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)
    on_off = tensor.iscalar()
    dropped = dropout_layer([X_sym], graph, 'dropout', on_off,
                            random_state=random_state)

    f = theano.function([X_sym, on_off], [dropped], mode="FAST_COMPILE")
    drop = f(np.ones_like(X), 1)[0]
    full = f(np.ones_like(X), 0)[0]
    # Make sure drop switch works
    assert_almost_equal((full.sum() / 2) / drop.sum(), 1., decimal=2)
Example #13
def test_maxout_layer():
    random_state = np.random.RandomState(42)
    graph = OrderedDict()
    X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)
    single_o = maxout_layer([X_sym], graph, 'single', proj_dim=5,
                            random_state=random_state)
    concat_o = maxout_layer([X_sym, y_sym], graph, 'concat', proj_dim=5,
                            random_state=random_state)
    # Check that strict mode raises an error if repeated
    assert_raises(AssertionError, maxout_layer, [X_sym], graph, 'concat')

    f = theano.function([X_sym, y_sym], [single_o, concat_o],
                        mode="FAST_COMPILE")
    single, concat = f(X, y)
Example #14
def test_softmax_zeros_layer():
    graph = OrderedDict()
    X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)
    single_o = softmax_zeros_layer([X_sym], graph, 'single', proj_dim=5)
    concat_o = softmax_zeros_layer([X_sym, y_sym], graph, 'concat', proj_dim=5)
    # Check that things can be reused
    repeated_o = softmax_layer([X_sym], graph, 'single', strict=False)

    # Check that strict mode raises an error if repeated
    assert_raises(AttributeError, softmax_layer, [X_sym], graph, 'concat')

    f = theano.function([X_sym, y_sym], [single_o, concat_o, repeated_o],
                        mode="FAST_COMPILE")
    single, concat, repeat = f(X, y)
    assert_almost_equal(single, repeat)
Example #15
def test_dropout_layer():
    random_state = np.random.RandomState(42)
    graph = OrderedDict()
    X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)
    on_off = tensor.iscalar()
    dropped = dropout_layer([X_sym],
                            graph,
                            'dropout',
                            on_off,
                            random_state=random_state)

    f = theano.function([X_sym, on_off], [dropped], mode="FAST_COMPILE")
    drop = f(np.ones_like(X), 1)[0]
    full = f(np.ones_like(X), 0)[0]
    # Make sure drop switch works
    assert_almost_equal((full.sum() / 2) / drop.sum(), 1., decimal=2)
Example #16
def run_common_layer(layer):
    random_state = np.random.RandomState(42)
    graph = OrderedDict()
    X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)
    single_o = layer([X_sym], graph, 'single', proj_dim=5,
                     random_state=random_state)
    concat_o = layer([X_sym, y_sym], graph, 'concat', proj_dim=5,
                     random_state=random_state)
    # Check that things can be reused
    repeated_o = layer([X_sym], graph, 'single', strict=False)

    # Check that strict mode raises an error if repeated
    assert_raises(AttributeError, layer, [X_sym], graph, 'concat')

    f = theano.function([X_sym, y_sym], [single_o, concat_o, repeated_o],
                        mode="FAST_COMPILE")
    single, concat, repeat = f(X, y)
    assert_almost_equal(single, repeat)
Example #17
def test_vae():
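    # Unconditioned VAE with softplus encoder/decoder layers, trained for
    # one epoch with a zero learning rate.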
    minibatch_size = 10
    random_state = np.random.RandomState(1999)
    graph = OrderedDict()

    X_sym = add_datasets_to_graph([X], ["X"], graph)

    l1_enc = softplus_layer([X_sym], graph, 'l1_enc', proj_dim=100,
                            random_state=random_state)
    mu = linear_layer([l1_enc], graph, 'mu', proj_dim=50,
                      random_state=random_state)
    log_sigma = linear_layer([l1_enc], graph, 'log_sigma', proj_dim=50,
                             random_state=random_state)
    samp = gaussian_log_sample_layer([mu], [log_sigma], graph,
                                     'gaussian_log_sample',
                                     random_state=random_state)
    l1_dec = softplus_layer([samp], graph, 'l1_dec', proj_dim=100,
                            random_state=random_state)
    out = sigmoid_layer([l1_dec], graph, 'out', proj_dim=X.shape[1],
                        random_state=random_state)

    kl = gaussian_log_kl([mu], [log_sigma], graph, 'gaussian_kl').mean()
    cost = binary_crossentropy(out, X_sym).mean() + kl
    params, grads = get_params_and_grads(graph, cost)
    learning_rate = 0.00000
    opt = sgd(params, learning_rate)
    updates = opt.updates(params, grads)

    fit_function = theano.function([X_sym], [cost], updates=updates,
                                   mode="FAST_COMPILE")

    cost_function = theano.function([X_sym], [cost],
                                    mode="FAST_COMPILE")

    checkpoint_dict = {}
    train_indices = np.arange(len(X))
    valid_indices = np.arange(len(X))
    early_stopping_trainer(fit_function, cost_function,
                           train_indices, valid_indices,
                           checkpoint_dict, [X],
                           minibatch_size,
                           list_of_train_output_names=["cost"],
                           valid_output_name="valid_cost",
                           n_epochs=1)
Example #18
def test_feedforward_classifier():
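    # Linear projection followed by a softmax output, trained with SGD for
    # one epoch.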
    minibatch_size = 100
    random_state = np.random.RandomState(1999)
    graph = OrderedDict()

    X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)

    l1_o = linear_layer([X_sym], graph, "l1", proj_dim=20, random_state=random_state)
    y_pred = softmax_layer([l1_o], graph, "pred", n_classes, random_state=random_state)

    cost = categorical_crossentropy(y_pred, y_sym).mean()
    params, grads = get_params_and_grads(graph, cost)
    learning_rate = 0.001
    opt = sgd(params)
    updates = opt.updates(params, grads, learning_rate)

    train_function = theano.function([X_sym, y_sym], [cost], updates=updates, mode="FAST_COMPILE")

    iterate_function(train_function, [X, y], minibatch_size, list_of_output_names=["cost"], n_epochs=1)
Example #19
def test_rnn_correlated_mixture_density():
    # graph holds information necessary to build layers from parents
    random_state = np.random.RandomState(1999)
    graph = OrderedDict()
    minibatch_size = 5
    X_seq = np.array([bernoulli_X for i in range(minibatch_size)])
    y_seq = np.array([bernoulli_y for i in range(minibatch_size)])
    X_mb, X_mb_mask = make_masked_minibatch(X_seq, slice(0, minibatch_size))
    y_mb, y_mb_mask = make_masked_minibatch(y_seq, slice(0, minibatch_size))
    datasets_list = [X_mb, X_mb_mask, y_mb, y_mb_mask]
    names_list = ["X", "X_mask", "y", "y_mask"]
    X_sym, X_mask_sym, y_sym, y_mask_sym = add_datasets_to_graph(
        datasets_list, names_list, graph)
    n_hid = 5
    train_indices = np.arange(len(X_seq))
    valid_indices = np.arange(len(X_seq))

    l1 = tanh_layer([X_sym], graph, 'l1', proj_dim=n_hid,
                    random_state=random_state)
    h = gru_recurrent_layer([l1], X_mask_sym, n_hid, graph, 'l1_rec',
                            random_state=random_state)
    rval = bernoulli_and_correlated_log_gaussian_mixture_layer(
        [h], graph, 'hw', proj_dim=2, n_components=3,
        random_state=random_state)
    binary, coeffs, mus, log_sigmas, corr = rval
    cost = bernoulli_and_correlated_log_gaussian_mixture_cost(
        binary, coeffs, mus, log_sigmas, corr, y_sym)
    cost = masked_cost(cost, y_mask_sym).mean()
    cost_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                    [cost],
                                    mode="FAST_COMPILE")

    checkpoint_dict = create_checkpoint_dict(locals())

    epoch_results = fixed_n_epochs_trainer(
        cost_function, cost_function, train_indices, valid_indices,
        checkpoint_dict, [X_seq, y_seq],
        minibatch_size,
        list_of_minibatch_functions=[make_masked_minibatch,
                                     make_masked_minibatch],
        list_of_train_output_names=["train_cost"],
        valid_output_name="valid_cost",
        n_epochs=1)
Example #20
def test_softmax_sample_layer():
    random_state = np.random.RandomState(42)
    graph = OrderedDict()
    X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)
    softmax = softmax_layer([X_sym],
                            graph,
                            'softmax',
                            proj_dim=20,
                            random_state=random_state)
    samp = softmax_sample_layer([softmax],
                                graph,
                                'softmax_sample',
                                random_state=random_state)
    out = linear_layer([samp],
                       graph,
                       'out',
                       proj_dim=10,
                       random_state=random_state)
    f = theano.function([X_sym], [out], mode="FAST_COMPILE")
Example #21
def test_fixed_projection_layer():
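    # Project X through a fixed (non-learned) random basis, with optional
    # pre/post terms; updates should change only the trailing linear layer.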
    random_state = np.random.RandomState(1999)
    rand_projection = random_state.randn(64, 12)

    graph = OrderedDict()
    X_sym = add_datasets_to_graph([X], ["X"], graph)
    out = fixed_projection_layer([X_sym], rand_projection, graph, 'proj')
    out2 = fixed_projection_layer([X_sym],
                                  rand_projection,
                                  graph,
                                  'proj',
                                  pre=rand_projection[:, 0])
    out3 = fixed_projection_layer([X_sym],
                                  rand_projection,
                                  graph,
                                  'proj',
                                  post=rand_projection[0])
    final = linear_layer([out2],
                         graph,
                         'linear',
                         17,
                         random_state=random_state)
    # Test that it compiles with and without bias
    f = theano.function([X_sym], [out, out2, out3, final], mode="FAST_COMPILE")

    # Test updates
    params, grads = get_params_and_grads(graph, final.mean())
    opt = sgd(params)
    updates = opt.updates(params, grads, .1)
    f2 = theano.function([X_sym], [out2, final], updates=updates)
    ret = f(np.ones_like(X))[0]
    assert ret.shape[1] != X.shape[1]
    ret2 = f(np.ones_like(X))[1]
    assert ret2.shape[1] != X.shape[1]
    out1, final1 = f2(X)
    out2, final2 = f2(X)

    # Make sure fixed basis is unchanged
    assert_almost_equal(out1, out2)

    # Make sure linear layer is updated
    assert_raises(AssertionError, assert_almost_equal, final1, final2)
Example #22
def test_correlated_mixture_density():
    # graph holds information necessary to build layers from parents
    random_state = np.random.RandomState(1999)
    graph = OrderedDict()
    X_sym, y_sym = add_datasets_to_graph([bernoulli_X, bernoulli_y], ["X", "y"],
                                         graph)
    n_hid = 20
    minibatch_size = len(bernoulli_X)
    train_indices = np.arange(len(bernoulli_X))
    valid_indices = np.arange(len(bernoulli_X))

    l1 = tanh_layer([X_sym], graph, 'l1', proj_dim=n_hid,
                    random_state=random_state)
    rval = bernoulli_and_correlated_log_gaussian_mixture_layer(
        [l1], graph, 'hw', proj_dim=2, n_components=3,
        random_state=random_state)
    binary, coeffs, mus, log_sigmas, corr = rval
    cost = bernoulli_and_correlated_log_gaussian_mixture_cost(
        binary, coeffs, mus, log_sigmas, corr, y_sym).mean()
    params, grads = get_params_and_grads(graph, cost)

    learning_rate = 1E-6
    opt = sgd(params, learning_rate)
    updates = opt.updates(params, grads)

    fit_function = theano.function([X_sym, y_sym], [cost], updates=updates,
                                   mode="FAST_COMPILE")
    cost_function = theano.function([X_sym, y_sym], [cost],
                                    mode="FAST_COMPILE")

    checkpoint_dict = create_checkpoint_dict(locals())

    epoch_results = fixed_n_epochs_trainer(
        fit_function, cost_function, train_indices, valid_indices,
        checkpoint_dict, [bernoulli_X, bernoulli_y],
        minibatch_size,
        list_of_train_output_names=["train_cost"],
        valid_output_name="valid_cost",
        n_epochs=1)
Example #23
def test_fixed_projection_layer():
    random_state = np.random.RandomState(1999)
    rand_projection = random_state.randn(64, 12)

    graph = OrderedDict()
    X_sym = add_datasets_to_graph([X], ["X"], graph)
    out = fixed_projection_layer([X_sym], rand_projection,
                                 graph, 'proj')
    out2 = fixed_projection_layer([X_sym], rand_projection,
                                  graph, 'proj',
                                  pre=rand_projection[:, 0])
    out3 = fixed_projection_layer([X_sym], rand_projection,
                                  graph, 'proj',
                                  post=rand_projection[0])
    final = linear_layer([out2], graph, 'linear', 17,
                         random_state=random_state)
    # Test that it compiles with and without bias
    f = theano.function([X_sym], [out, out2, out3, final],
                        mode="FAST_COMPILE")

    # Test updates
    params, grads = get_params_and_grads(
        graph, final.mean())
    opt = sgd(params, .1)
    updates = opt.updates(params, grads)
    f2 = theano.function([X_sym], [out2, final],
                         updates=updates)
    ret = f(np.ones_like(X))[0]
    assert ret.shape[1] != X.shape[1]
    ret2 = f(np.ones_like(X))[1]
    assert ret2.shape[1] != X.shape[1]
    out1, final1 = f2(X)
    out2, final2 = f2(X)

    # Make sure fixed basis is unchanged
    assert_almost_equal(out1, out2)

    # Make sure linear layer is updated
    assert_raises(AssertionError, assert_almost_equal, final1, final2)
Example #24
def run_common_layer(layer):
    random_state = np.random.RandomState(42)
    graph = OrderedDict()
    X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)
    single_o = layer([X_sym],
                     graph,
                     'single',
                     proj_dim=5,
                     random_state=random_state)
    concat_o = layer([X_sym, y_sym],
                     graph,
                     'concat',
                     proj_dim=5,
                     random_state=random_state)
    # Check that things can be reused
    repeated_o = layer([X_sym], graph, 'single', strict=False)

    # Check that strict mode raises an error if repeated
    assert_raises(AttributeError, layer, [X_sym], graph, 'concat')

    f = theano.function([X_sym, y_sym], [single_o, concat_o, repeated_o],
                        mode="FAST_COMPILE")
    single, concat, repeat = f(X, y)
    assert_almost_equal(single, repeat)
Example #25
def test_gaussian_log_sample_layer():
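    # Sample from a Gaussian parameterized by mu and log_sigma, then project
    # the sample; only checks compilation.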
    random_state = np.random.RandomState(42)
    graph = OrderedDict()
    X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)
    mu = linear_layer([X_sym],
                      graph,
                      'mu',
                      proj_dim=20,
                      random_state=random_state)
    log_sigma = linear_layer([X_sym],
                             graph,
                             'log_sigma',
                             proj_dim=20,
                             random_state=random_state)
    samp = gaussian_log_sample_layer([mu], [log_sigma],
                                     graph,
                                     'gaussian_sample',
                                     random_state=random_state)
    out = linear_layer([samp],
                       graph,
                       'out',
                       proj_dim=10,
                       random_state=random_state)
    f = theano.function([X_sym], [out], mode="FAST_COMPILE")
Example #26
minibatch_size = 32
n_emb = 50
n_hid = 100
X_story_mb, X_story_mask = make_embedding_minibatch(
    X_story, slice(0, minibatch_size))
X_query_mb, X_query_mask = make_embedding_minibatch(
    X_query, slice(0, minibatch_size))

embedding_datasets = [X_story_mb, X_query_mb]
masks = [X_story_mask, X_query_mask]
r = add_embedding_datasets_to_graph(embedding_datasets, masks, "babi_data",
                                    graph)
(X_story_syms, X_query_syms), (X_story_mask_sym, X_query_mask_sym) = r

y_sym = add_datasets_to_graph([y_answer], ["y"], graph)


l1_story = embedding_layer(X_story_syms, vocab_size, n_emb, graph, 'l1_story',
                           random_state=random_state)
masked_story = X_story_mask_sym.dimshuffle(0, 1, 'x') * l1_story
h_story = gru_recurrent_layer([masked_story], X_story_mask_sym, n_hid, graph,
                              'story_rec', random_state)

l1_query = embedding_layer(X_query_syms, vocab_size, n_emb, graph, 'l1_query',
                           random_state)
h_query = gru_recurrent_layer([l1_query], X_query_mask_sym, n_hid, graph,
                              'query_rec', random_state)
y_pred = softmax_layer([h_query[-1], h_story[-1]], graph, 'y_pred',
                       y_answer.shape[1], random_state=random_state)
cost = categorical_crossentropy(y_pred, y_sym).mean()
Example #27
minibatch_size = 32
n_emb = 50
n_hid = 100
X_story_mb, X_story_mask = make_embedding_minibatch(
    X_story, slice(0, minibatch_size))
X_query_mb, X_query_mask = make_embedding_minibatch(
    X_query, slice(0, minibatch_size))

embedding_datasets = [X_story_mb, X_query_mb]
masks = [X_story_mask, X_query_mask]
r = add_embedding_datasets_to_graph(embedding_datasets, masks, "babi_data",
                                    graph)
(X_story_syms, X_query_syms), (X_story_mask_sym, X_query_mask_sym) = r

y_sym = add_datasets_to_graph([y_answer], ["y"], graph)


l1_story = embedding_layer(X_story_syms, vocab_size, n_emb, graph, 'l1_story',
                           random_state)
masked_story = X_story_mask_sym.dimshuffle(0, 1, 'x') * l1_story
h_story = gru_recurrent_layer([masked_story], X_story_mask_sym, n_hid, graph,
                              'story_rec', random_state)

l1_query = embedding_layer(X_query_syms, vocab_size, n_emb, graph, 'l1_query',
                           random_state)
h_query = gru_recurrent_layer([l1_query], X_query_mask_sym, n_hid, graph,
                              'query_rec', random_state)
y_pred = softmax_layer([h_query[-1], h_story[-1]], graph, 'y_pred',
                       y_answer.shape[1], random_state)
cost = categorical_crossentropy(y_pred, y_sym).mean()
Example #28
from dagbldr.utils import TrainingLoop
from dagbldr.nodes import tanh_layer, softmax_zeros_layer
from dagbldr.nodes import categorical_crossentropy

mnist = fetch_mnist()
train_indices = mnist["train_indices"]
valid_indices = mnist["valid_indices"]
X = mnist["data"]
y = mnist["target"]
n_targets = 10
y = convert_to_one_hot(y, n_targets)

# graph holds information necessary to build layers from parents
graph = OrderedDict()
X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"],
                                     graph,
                                     list_of_test_values=[X[:10], y[:10]])
# random state so script is deterministic
random_state = np.random.RandomState(1999)

minibatch_size = 128
n_hid = 1000

on_off = tensor.iscalar()
on_off.tag.test_value = 0
l1 = tanh_layer([X_sym],
                graph,
                'l1',
                proj_dim=n_hid,
                batch_normalize=True,
                mode_switch=on_off,
                random_state=random_state)
Example #29
from dagbldr.nodes import gaussian_log_sample_layer, gaussian_log_kl
from dagbldr.nodes import squared_error


fer = fetch_fer()
data = fer["data"]
mean_norm = fer["mean0"]
train_indices = fer["train_indices"]
valid_indices = fer["valid_indices"]
X = data - mean_norm
pca_tf = fer["pca_matrix"]
X = np.dot(X, pca_tf.T)

# graph holds information necessary to build layers from parents
graph = OrderedDict()
X_sym = add_datasets_to_graph([X], ["X"], graph)
# random state so script is deterministic
random_state = np.random.RandomState(1999)

minibatch_size = 100
n_code = 400
n_enc_layer = [600, 600]
n_dec_layer = [600, 600]
width = 48
height = 48
n_input = width * height

# encode path aka q
l1_enc = softplus_layer([X_sym], graph, 'l1_enc', n_enc_layer[0], random_state)
l2_enc = softplus_layer([l1_enc], graph, 'l2_enc',  n_enc_layer[1],
                        random_state)
Example #30
base_string = "cat"
true_strings = sorted(list(set(["".join(i) for i in [
    s for s in itertools.permutations(base_string)]])))
ocr = make_ocr(true_strings)
X = ocr["data"]
vocab = ocr["vocabulary"]
y = convert_to_one_hot(ocr["target"], n_classes=len(vocab)).astype(
    theano.config.floatX)
minibatch_size = mbs = 2
train_itr = minibatch_iterator([X, y], minibatch_size, make_mask=True, axis=1)
X_mb, X_mb_mask, y_mb, y_mb_mask = next(train_itr)
train_itr.reset()
valid_itr = minibatch_iterator([X, y], minibatch_size, make_mask=True, axis=1)
datasets_list = [X_mb, X_mb_mask, y_mb, y_mb_mask]
names_list = ["X", "X_mask", "y", "y_mask"]
X_sym, X_mask_sym, y_sym, y_mask_sym = add_datasets_to_graph(
    datasets_list, names_list, graph, list_of_test_values=datasets_list)

n_hid = 256
n_out = 8

h = location_attention_tanh_recurrent_layer(
    [X_sym], [y_sym], X_mask_sym, y_mask_sym, n_hid, graph, 'l1_att_rec',
    random_state=random_state)

X_hat = sigmoid_layer([h], graph, 'output', proj_dim=n_out,
                      random_state=random_state)
cost = binary_crossentropy(X_hat, X_sym).mean()
cost = masked_cost(cost, X_mask_sym).mean()
params, grads = get_params_and_grads(graph, cost)
opt = adadelta(params)
updates = opt.updates(params, grads)
Example #31
def test_gaussian_log_kl():
    graph = OrderedDict()
    X_sym = add_datasets_to_graph([X], ["X"], graph)
    kl = gaussian_log_kl([X_sym, X_sym], [X_sym, X_sym], graph,
                         'gaussian_log_kl')
    theano.function([X_sym], [kl], mode="FAST_COMPILE")
Example #32
from dagbldr.utils import early_stopping_trainer
from dagbldr.nodes import relu_layer, softmax_zeros_layer
from dagbldr.nodes import categorical_crossentropy


mnist = fetch_mnist()
train_indices = mnist["train_indices"]
valid_indices = mnist["valid_indices"]
X = mnist["data"]
y = mnist["target"]
n_targets = 10
y = convert_to_one_hot(y, n_targets)

# graph holds information necessary to build layers from parents
graph = OrderedDict()
X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph,
                                     list_of_test_values=[X[:10], y[:10]])
# random state so script is deterministic
random_state = np.random.RandomState(1999)

minibatch_size = 128
n_hid = 1000

on_off = tensor.iscalar()
on_off.tag.test_value = 0
l1 = relu_layer([X_sym], graph, 'l1', proj_dim=n_hid,
                batch_normalize=True, mode_switch=on_off,
                random_state=random_state)
y_pred = softmax_zeros_layer([l1], graph, 'y_pred',  proj_dim=n_targets)
nll = categorical_crossentropy(y_pred, y_sym).mean()
weights = get_weights_from_graph(graph)
L2 = sum([(w ** 2).sum() for w in weights])
Example #33

sine_x, sine_y = make_noisy_sinusoid(n_samples=10000)
# Swap X and Y to create a one to many relationship

sine_x, sine_y = sine_y, sine_x
# Make 1 minibatch with feature dimension 1
sine_x = sine_x[:, None]
sine_y = sine_y[:, None]

X = sine_x
y = sine_y

# graph holds information necessary to build layers from parents
graph = OrderedDict()
X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph, list_of_test_values=[sine_x, sine_y])
# random state so script is deterministic
random_state = np.random.RandomState(1999)

minibatch_size = len(sine_y) // 20
n_hid = 20
n_out = 1

l1 = tanh_layer([X_sym], graph, "l1", proj_dim=n_hid, random_state=random_state)
coeffs, mus, log_sigmas = log_gaussian_mixture_layer(
    [l1], graph, "mdn", proj_dim=1, n_components=24, random_state=random_state
)
cost = log_gaussian_mixture_cost(coeffs, mus, log_sigmas, y_sym).mean()
params, grads = get_params_and_grads(graph, cost)

opt = adadelta(params)
Example #34
def test_log_gaussian_error():
    graph = OrderedDict()
    X_sym = add_datasets_to_graph([X], ["X"], graph)
    cost = log_gaussian_error(.5 * X_sym, .5 * X_sym, X_sym)
    theano.function([X_sym], cost, mode="FAST_COMPILE")
Example #35
def test_categorical_crossentropy():
    graph = OrderedDict()
    y_sym = add_datasets_to_graph([y], ["y"], graph)
    cost = categorical_crossentropy(.99 * y_sym + .001, y_sym)
    theano.function([y_sym], cost, mode="FAST_COMPILE")
Example #36
def test_binary_entropy():
    graph = OrderedDict()
    X_sym = add_datasets_to_graph([X], ["X"], graph)
    cost = binary_entropy(X_sym)
    theano.function([X_sym], cost, mode="FAST_COMPILE")
Example #37
sine_x, sine_y = make_noisy_sinusoid(n_samples=10000)
# Swap X and Y to create a one to many relationship

sine_x, sine_y = sine_y, sine_x
# Make 1 minibatch with feature dimension 1
sine_x = sine_x[:, None]
sine_y = sine_y[:, None]

X = sine_x
y = sine_y

# graph holds information necessary to build layers from parents
graph = OrderedDict()
X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"],
                                     graph,
                                     list_of_test_values=[sine_x, sine_y])
# random state so script is deterministic
random_state = np.random.RandomState(1999)

minibatch_size = len(sine_y) // 20
n_hid = 20
n_out = 1

l1 = tanh_layer([X_sym],
                graph,
                'l1',
                proj_dim=n_hid,
                random_state=random_state)
coeffs, mus, log_sigmas = log_gaussian_mixture_layer([l1],
                                                     graph,
                                                     'mdn',
                                                     proj_dim=1,
                                                     n_components=24,
                                                     random_state=random_state)
Example #38
from dagbldr.utils import convert_to_one_hot, early_stopping_trainer
from dagbldr.nodes import conv2d_layer, pool2d_layer
from dagbldr.nodes import softmax_layer, categorical_crossentropy

mnist = fetch_mnist()
train_indices = mnist["train_indices"]
valid_indices = mnist["valid_indices"]
X = mnist["images"]
y = mnist["target"]
n_targets = 10
y = convert_to_one_hot(y, n_targets)
minibatch_size = 128

# graph holds information necessary to build layers from parents
graph = OrderedDict()
X_sym, y_sym = add_datasets_to_graph([X[:minibatch_size], y[:minibatch_size]],
                                     ["X", "y"], graph)
# random state so script is deterministic
random_state = np.random.RandomState(1999)

l1 = conv2d_layer([X_sym], graph, 'conv1', 8, random_state=random_state)
l2 = pool2d_layer([l1], graph, 'pool1')
l3 = conv2d_layer([l2], graph, 'conv2', 16, random_state=random_state)
l4 = pool2d_layer([l3], graph, 'pool2')
l5 = l4.reshape((l4.shape[0], -1))
y_pred = softmax_layer([l5],
                       graph,
                       'y_pred',
                       n_targets,
                       random_state=random_state)
nll = categorical_crossentropy(y_pred, y_sym).mean()
cost = nll
Example #39
def test_masked_cost():
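    # Build a Gaussian error cost and a masked cost (y_sym as the mask) and
    # check that both expressions compile.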
    graph = OrderedDict()
    X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)
    cost = gaussian_error(.5 * X_sym, .5 * X_sym, X_sym)
    masked = masked_cost(X_sym, y_sym)
    theano.function([X_sym, y_sym], [cost, masked], mode="FAST_COMPILE")
Example #40
def test_vae():
    minibatch_size = 10
    random_state = np.random.RandomState(1999)
    graph = OrderedDict()

    X_sym = add_datasets_to_graph([X], ["X"], graph)

    l1_enc = softplus_layer([X_sym],
                            graph,
                            'l1_enc',
                            proj_dim=100,
                            random_state=random_state)
    mu = linear_layer([l1_enc],
                      graph,
                      'mu',
                      proj_dim=50,
                      random_state=random_state)
    log_sigma = linear_layer([l1_enc],
                             graph,
                             'log_sigma',
                             proj_dim=50,
                             random_state=random_state)
    samp = gaussian_log_sample_layer([mu], [log_sigma],
                                     graph,
                                     'gaussian_log_sample',
                                     random_state=random_state)
    l1_dec = softplus_layer([samp],
                            graph,
                            'l1_dec',
                            proj_dim=100,
                            random_state=random_state)
    out = sigmoid_layer([l1_dec],
                        graph,
                        'out',
                        proj_dim=X.shape[1],
                        random_state=random_state)

    kl = gaussian_log_kl([mu], [log_sigma], graph, 'gaussian_kl').mean()
    cost = binary_crossentropy(out, X_sym).mean() + kl
    params, grads = get_params_and_grads(graph, cost)
    learning_rate = 0.00000
    opt = sgd(params)
    updates = opt.updates(params, grads, learning_rate)

    fit_function = theano.function([X_sym], [cost],
                                   updates=updates,
                                   mode="FAST_COMPILE")

    cost_function = theano.function([X_sym], [cost], mode="FAST_COMPILE")

    checkpoint_dict = {}
    train_indices = np.arange(len(X))
    valid_indices = np.arange(len(X))
    early_stopping_trainer(fit_function,
                           cost_function,
                           checkpoint_dict, [X],
                           minibatch_size,
                           train_indices,
                           valid_indices,
                           fit_function_output_names=["cost"],
                           cost_function_output_name="valid_cost",
                           n_epochs=1)
Example #41
X = data["data"]
y = data["target"]
vocab_size = data["vocabulary_size"]
vocab = data["vocabulary"]
train_indices = data["train_indices"]
valid_indices = train_indices

X_mb, X_mb_mask = make_masked_minibatch(X, slice(0, len(X)))
y_mb, y_mb_mask = make_masked_minibatch(y, slice(0, len(y)))

n_hid = 256
n_out = vocab_size + 1

datasets_list = [X_mb, X_mb_mask, y_mb, y_mb_mask]
names_list = ["X", "X_mask", "y", "y_mask"]
X_sym, X_mask_sym, y_sym, y_mask_sym = add_datasets_to_graph(
    datasets_list, names_list, graph)

h = gru_recurrent_layer([X_sym],
                        X_mask_sym,
                        n_hid,
                        graph,
                        'l1_rec',
                        random_state=random_state)
y_pred = softmax_layer([h], graph, 'l2_proj', n_out, random_state=random_state)

cost = log_ctc_cost(y_sym, y_mask_sym, y_pred, X_mask_sym).mean()
params, grads = get_params_and_grads(graph, cost)

opt = adadelta(params)
updates = opt.updates(params, grads)
Example #42
X = data["data"]
y = data["target"]
vocab_size = data["vocabulary_size"]
vocab = data["vocabulary"]
train_indices = data["train_indices"]
valid_indices = train_indices

X_mb, X_mb_mask = make_masked_minibatch(X, slice(0, len(X)))
y_mb, y_mb_mask = make_masked_minibatch(y, slice(0, len(y)))

n_hid = 256
n_out = vocab_size + 1

datasets_list = [X_mb, X_mb_mask, y_mb, y_mb_mask]
names_list = ["X", "X_mask", "y", "y_mask"]
X_sym, X_mask_sym, y_sym, y_mask_sym = add_datasets_to_graph(datasets_list, names_list, graph)

h = gru_recurrent_layer([X_sym], X_mask_sym, n_hid, graph, "l1_rec", random_state=random_state)
y_pred = softmax_layer([h], graph, "l2_proj", n_out, random_state=random_state)

cost = log_ctc_cost(y_sym, y_mask_sym, y_pred, X_mask_sym).mean()
params, grads = get_params_and_grads(graph, cost)

opt = adadelta(params)
updates = opt.updates(params, grads)

checkpoint_dict = {}

fit_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym], [cost], updates=updates)
cost_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym], [cost])
predict_function = theano.function([X_sym, X_mask_sym], [y_pred])
Example #43
from dagbldr.utils import TrainingLoop
from dagbldr.utils import create_checkpoint_dict
from dagbldr.nodes import relu_layer, softmax_zeros_layer
from dagbldr.nodes import categorical_crossentropy

mnist = fetch_mnist()
train_indices = mnist["train_indices"]
valid_indices = mnist["valid_indices"]
X = mnist["data"]
y = mnist["target"]
n_targets = 10
y = convert_to_one_hot(y, n_targets)

# graph holds information necessary to build layers from parents
graph = OrderedDict()
X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)
# random state so script is deterministic
random_state = np.random.RandomState(1999)

minibatch_size = 20
n_hid = 1000

l1 = relu_layer([X_sym],
                graph,
                'l1',
                proj_dim=n_hid,
                random_state=random_state)
y_pred = softmax_zeros_layer([l1], graph, 'y_pred', proj_dim=n_targets)
nll = categorical_crossentropy(y_pred, y_sym).mean()
weights = get_weights_from_graph(graph)
L2 = sum([(w**2).sum() for w in weights])
Example #44
from dagbldr.utils import TrainingLoop
from dagbldr.nodes import tanh_layer, softmax_zeros_layer
from dagbldr.nodes import categorical_crossentropy


mnist = fetch_mnist()
train_indices = mnist["train_indices"]
valid_indices = mnist["valid_indices"]
X = mnist["data"]
y = mnist["target"]
n_targets = 10
y = convert_to_one_hot(y, n_targets)

# graph holds information necessary to build layers from parents
graph = OrderedDict()
X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)
# random state so script is deterministic
random_state = np.random.RandomState(1999)

minibatch_size = 20
n_hid = 1000

l1 = tanh_layer([X_sym], graph, 'l1', proj_dim=n_hid, random_state=random_state)
y_pred = softmax_zeros_layer([l1], graph, 'y_pred',  proj_dim=n_targets)
nll = categorical_crossentropy(y_pred, y_sym).mean()
weights = get_weights_from_graph(graph)
L2 = sum([(w ** 2).sum() for w in weights])
cost = nll + .0001 * L2


params, grads = get_params_and_grads(graph, cost)