Example #1
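All of the snippets below are excerpted from larger files, so a short note on what they assume: numpy, numpy.random and kayak imported at module scope (one file also uses the alias ky), plus test constants such as NUM_TRIALS and MAX_GRAD_DIFF defined elsewhere. A minimal sketch of that module-level context:

# Assumed module-level context for these excerpts (a sketch, not part of
# any one original file):
import numpy as np
import numpy.random as npr
import kayak
import kayak as ky            # Example #2 below uses the "ky" alias
NUM_TRIALS    = 10            # assumed trial count for the test loops
MAX_GRAD_DIFF = 1e-7          # assumed tolerance for kayak.util.checkgrad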
def test_graph_diamond():
    npr.seed(2)

    N  = 10
    D  = 5
    H1 = 6
    H2 = 7

    X   = kayak.Inputs(npr.randn(N,D))
    W1  = kayak.Parameter(npr.randn(D,H1))
    W2a = kayak.Parameter(npr.randn(H1,H2))
    W2b = kayak.Parameter(npr.randn(H1,H2))
    W3  = kayak.Parameter(npr.randn(H2,1))

    U1 = kayak.SoftReLU(kayak.MatMult(X, W1))
    U2a = kayak.SoftReLU(kayak.MatMult(U1, W2a))
    U2b = kayak.SoftReLU(kayak.MatMult(U1, W2b))
    U3a = kayak.SoftReLU(kayak.MatMult(U2a, W3))
    U3b = kayak.SoftReLU(kayak.MatMult(U2b, W3))
    
    out = kayak.MatSum(kayak.MatAdd(U3a, U3b))

    out.value
    print kayak.util.checkgrad(W1, out)
    print kayak.util.checkgrad(W2a, out)
    print kayak.util.checkgrad(W2b, out)
    print kayak.util.checkgrad(W3, out)
    assert kayak.util.checkgrad(W1, out) < MAX_GRAD_DIFF
    assert kayak.util.checkgrad(W2a, out) < MAX_GRAD_DIFF
    assert kayak.util.checkgrad(W2b, out) < MAX_GRAD_DIFF
    assert kayak.util.checkgrad(W3, out) < MAX_GRAD_DIFF
Example #2
def __init__(self, maxnum, reduced_dims):
    self.threshold = 1e-2
    dummyword = np.zeros((maxnum, 1))
    W1 = np.random.randn(reduced_dims, maxnum) * 0.1
    W2 = np.random.randn(maxnum, reduced_dims) * 0.1
    self.input = ky.Parameter(dummyword)
    self.W1 = ky.Parameter(W1)
    self.W2 = ky.Parameter(W2)
    self.output = ky.MatMult(self.W1, self.input)
    self.recons = ky.MatMult(self.W2, self.output)
    self.loss = ky.MatSum(ky.L2Loss(self.recons, self.input))
    #self.totloss = ky.MatAdd(self.loss, ky.L2Norm(self.W2, weight=1e-2), ky.L2Norm(self.W1, weight=1e-2))
    self.totloss = self.loss
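A hedged sketch of how this autoencoder might be driven, reusing the parameter-update pattern from the training loops later in this list; the class name, the input vector and the step size below are assumptions, not part of the original:

# Sketch only: one plausible training loop for the autoencoder above.
# WordAutoencoder is a hypothetical name for the class that owns __init__.
ae = WordAutoencoder(maxnum=50, reduced_dims=10)
word = np.random.randn(50, 1)          # hypothetical input column vector
learn = 0.01                           # assumed step size
ae.input.value = word
for step in xrange(100):
    loss = ae.totloss.value
    for prm in (ae.W1, ae.W2):
        prm.value -= learn * ae.totloss.grad(prm)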
Example #3
def test_cache_utility():
    npr.seed(3)

    num_layers = 17
    num_dims   = 3
    
    X = kayak.Inputs(npr.randn(10, num_dims))
    W1 = kayak.Parameter(npr.randn(num_dims, num_dims))
    W2 = kayak.Parameter(npr.randn(num_dims, num_dims))

    Z = kayak.MatMult(X, W1)

    for jj in xrange(num_layers):
        Z = kayak.SoftReLU(kayak.MatAdd(kayak.MatMult(Z, W2),
                                        kayak.MatMult(Z, W2)))

    out = kayak.MatSum(Z)
    assert kayak.util.checkgrad(W1, out) < 1e-4
Example #4
def test_matmult_values_2():
    npr.seed(2)

    for ii in xrange(NUM_TRIALS):

        np_A = npr.randn(5, 5)
        A = kayak.Parameter(np_A)
        C = kayak.MatMult(A, A)

        assert C.value.shape == (5, 5)
        assert np.all(close_float(C.value, np.dot(np_A, np_A)))
Example #5
def test_reshape_2():
    npr.seed(2)

    np_A = npr.randn(5,10)
    A    = kayak.Parameter(np_A)
    B    = kayak.Reshape(A, (2,25))
    C    = kayak.Parameter(npr.randn(25,5))
    D    = kayak.MatMult(B, C)
    out  = kayak.MatSum(D)

    out.value
    assert out.grad(A).shape == np_A.shape
    assert kayak.util.checkgrad(A, out) < MAX_GRAD_DIFF
Example #6
def test_matmult_grad_2():
    npr.seed(4)

    for ii in xrange(NUM_TRIALS):

        np_A = npr.randn(5, 5)
        A = kayak.Parameter(np_A)
        C = kayak.MatMult(A, A)
        D = kayak.MatSum(C)

        D.value
        assert D.grad(A).shape == (5, 5)
        assert kayak.util.checkgrad(A, D) < MAX_GRAD_DIFF
Example #7
def test_transpose_3():
    npr.seed(3)

    np_A = npr.randn(5, 10)
    A = kayak.Parameter(np_A)
    B = kayak.Transpose(A)
    C = kayak.Parameter(npr.randn(5, 5))
    D = kayak.MatMult(B, C)
    out = kayak.MatSum(D)

    out.value
    assert out.grad(A).shape == np_A.shape
    assert kayak.util.checkgrad(A, out) < MAX_GRAD_DIFF
Example #8
def test_matmult_values_1():
    npr.seed(1)

    for ii in xrange(NUM_TRIALS):

        np_A = npr.randn(5, 6)
        np_B = npr.randn(6, 7)
        A = kayak.Parameter(np_A)
        B = kayak.Parameter(np_B)
        C = kayak.MatMult(A, B)

        assert C.value.shape == (5, 7)
        assert np.all(close_float(C.value, np.dot(np_A, np_B)))
Example #9
def test_graph_chain():
    npr.seed(1)

    N  = 10
    D  = 5
    H1 = 6
    H2 = 7

    X  = kayak.Inputs(npr.randn(N,D))
    W1 = kayak.Parameter(npr.randn(D,H1))
    W2 = kayak.Parameter(npr.randn(H1,H2))
    W3 = kayak.Parameter(npr.randn(H2,1))

    U1 = kayak.SoftReLU(kayak.MatMult(X, W1))
    U2 = kayak.SoftReLU(kayak.MatMult(U1, W2))
    U3 = kayak.SoftReLU(kayak.MatMult(U2, W3))
    
    out = kayak.MatSum(U3)

    out.value
    assert kayak.util.checkgrad(W1, out) < MAX_GRAD_DIFF
    assert kayak.util.checkgrad(W2, out) < MAX_GRAD_DIFF
    assert kayak.util.checkgrad(W3, out) < MAX_GRAD_DIFF
Example #10
def test_graph_dag():
    npr.seed(3)

    num_layers = 7
    num_dims   = 5
    
    for ii in xrange(NUM_TRIALS):
        probs = npr.rand()

        X = kayak.Inputs(npr.randn(25,num_dims))

        wts    = []
        layers = []
        for jj in xrange(num_layers):

            U = kayak.Constant(np.zeros((25,num_dims)))

            if npr.rand() < probs:
                W = kayak.Parameter(0.1*npr.randn(num_dims, num_dims))
                wts.append(W)
                U = kayak.MatAdd( U, kayak.SoftReLU(kayak.MatMult(X, W)) )

            for kk in xrange(jj):
                if npr.rand() < probs:
                    W = kayak.Parameter(0.1*npr.randn(num_dims, num_dims))
                    wts.append(W)
                    U = kayak.MatAdd( U, kayak.SoftReLU(kayak.MatMult(layers[kk], W)) )
            
            layers.append(U)
            
        out = kayak.MatSum(layers[-1])

        out.value
        for jj, wt in enumerate(wts):
            diff = kayak.util.checkgrad(wt, out, 1e-4)
            print diff
            assert diff < 1e-4
Example #11
def test_matmult_values_3():
    npr.seed(3)

    for ii in xrange(NUM_TRIALS):

        np_A = npr.randn(5, 6)
        np_B = npr.randn(6, 7)
        np_C = npr.randn(7, 8)
        A = kayak.Parameter(np_A)
        B = kayak.Parameter(np_B)
        C = kayak.Parameter(np_C)
        D = kayak.MatMult(A, B, C)

        assert D.value.shape == (5, 8)
        assert np.all(close_float(D.value, np.dot(np_A, np.dot(np_B, np_C))))
Example #12
def test_graph_simple():
    npr.seed(1)

    N  = 1
    D  = 1
    H1 = 1

    X  = kayak.Inputs(npr.randn(N,D))
    W1 = kayak.Parameter(npr.randn(D,H1))
    U3 = kayak.MatMult(W1, X)

    out = U3

    print "Value: ", out.value
    print "Gradient: ", out.grad(W1)
    print "Grad error: ", kayak.util.checkgrad(W1, out)
    assert kayak.util.checkgrad(W1, out) < MAX_GRAD_DIFF
Example #13
def test_matmult_grad_1():
    npr.seed(3)

    for ii in xrange(NUM_TRIALS):

        np_A = npr.randn(5, 6)
        np_B = npr.randn(6, 7)
        A = kayak.Parameter(np_A)
        B = kayak.Parameter(np_B)
        C = kayak.MatMult(A, B)
        D = kayak.MatSum(C)

        D.value
        assert D.grad(A).shape == (5, 6)
        assert D.grad(B).shape == (6, 7)
        assert kayak.util.checkgrad(A, D) < MAX_GRAD_DIFF
        assert kayak.util.checkgrad(B, D) < MAX_GRAD_DIFF
Example #14
def test_matmult_grad_vect_mat():
    npr.seed(5)

    for ii in xrange(NUM_TRIALS):

        np_A = npr.randn(6, )
        np_B = npr.randn(6, 7)
        np_C = npr.randn(7, )
        A = kayak.Parameter(np_A)
        B = kayak.Parameter(np_B)
        C = kayak.Parameter(np_C)
        D = kayak.MatMult(A, B)
        E = kayak.MatSum(kayak.ElemMult(C, D))

        assert E.grad(A).shape == (6, )
        assert E.grad(B).shape == (6, 7)
        assert kayak.util.checkgrad(A, E) < MAX_GRAD_DIFF
        assert kayak.util.checkgrad(B, E) < MAX_GRAD_DIFF
Example #15
def test_batchnorm_values_1():
    npr.seed(1)

    for ii in xrange(NUM_TRIALS):

        np_X = npr.randn(5, 4)
        np_A = npr.randn(4, 2)
        A = kayak.Parameter(np_A)
        X = kayak.Parameter(np_X)
        Y = kayak.BatchNormalize(X)
        J = kayak.TanH(kayak.MatMult(Y, A))
        Z = kayak.MatSum(J)

        mu = np.mean(np_X, axis=0, keepdims=True)
        sig = np.mean((np_X - mu)**2, axis=0, keepdims=True) + 1e-6
        np_Y = (np_X - mu) / np.sqrt(sig)

        assert np.all(close_float(Y.value, np_Y))
        assert kayak.util.checkgrad(X, Z, verbose=True) < MAX_GRAD_DIFF
Example #16
def test_matmult_grad_3():
    npr.seed(5)

    for ii in xrange(NUM_TRIALS):

        np_A = npr.randn(5, 6)
        np_B = npr.randn(6, 7)
        np_C = npr.randn(7, 8)
        A = kayak.Parameter(np_A)
        B = kayak.Parameter(np_B)
        C = kayak.Parameter(np_C)
        D = kayak.MatMult(A, B, C)
        E = kayak.MatSum(kayak.SoftReLU(D))

        assert E.grad(A).shape == (5, 6)
        assert E.grad(B).shape == (6, 7)
        assert E.grad(C).shape == (7, 8)
        assert kayak.util.checkgrad(A, E) < MAX_GRAD_DIFF
        assert kayak.util.checkgrad(B, E) < MAX_GRAD_DIFF
        assert kayak.util.checkgrad(C, E) < MAX_GRAD_DIFF
Example #17
def train(inputs, targets):
    # Create a batcher object.
    batcher = kayak.Batcher(batch_size, inputs.shape[0])

    # Inputs and targets need access to the batcher.
    X = kayak.Inputs(inputs, batcher)
    T = kayak.Targets(targets, batcher)

    # First-layer weights and biases, with random initializations.
    W1 = kayak.Parameter(0.1 * npr.randn(inputs.shape[1], layer1_sz))
    B1 = kayak.Parameter(0.1 * npr.randn(1, layer1_sz))

    # First hidden layer: ReLU + Dropout
    H1 = kayak.Dropout(kayak.HardReLU(kayak.ElemAdd(kayak.MatMult(X, W1), B1)),
                       layer1_dropout,
                       batcher=batcher)

    # Second-layer weights and biases, with random initializations.
    W2 = kayak.Parameter(0.1 * npr.randn(layer1_sz, layer2_sz))
    B2 = kayak.Parameter(0.1 * npr.randn(1, layer2_sz))

    # Second hidden layer: ReLU + Dropout
    H2 = kayak.Dropout(kayak.HardReLU(kayak.ElemAdd(kayak.MatMult(H1, W2),
                                                    B2)),
                       layer2_dropout,
                       batcher=batcher)

    # Output layer weights and biases, with random initializations.
    W3 = kayak.Parameter(0.1 * npr.randn(layer2_sz, 10))
    B3 = kayak.Parameter(0.1 * npr.randn(1, 10))

    # Output layer.
    Y = kayak.LogSoftMax(kayak.ElemAdd(kayak.MatMult(H2, W3), B3))

    # The training loss is negative multinomial log likelihood.
    loss = kayak.MatSum(kayak.LogMultinomialLoss(Y, T))

    # Use momentum for the gradient-based optimization.
    mom_grad_W1 = np.zeros(W1.shape)
    mom_grad_W2 = np.zeros(W2.shape)
    mom_grad_W3 = np.zeros(W3.shape)

    # Loop over epochs.
    for epoch in xrange(10):

        # Track the total loss.
        total_loss = 0.0

        # Loop over batches -- using batcher as iterator.
        for batch in batcher:
            # Compute the loss of this minibatch by asking the Kayak
            # object for its value.
            total_loss += loss.value

            # Now ask the loss for its gradients with respect to the
            # weights and the biases -- the quantities we're trying to
            # learn here.
            grad_W1 = loss.grad(W1)
            grad_B1 = loss.grad(B1)
            grad_W2 = loss.grad(W2)
            grad_B2 = loss.grad(B2)
            grad_W3 = loss.grad(W3)
            grad_B3 = loss.grad(B3)

            # Use momentum on the weight gradients.
            mom_grad_W1 = momentum * mom_grad_W1 + (1.0 - momentum) * grad_W1
            mom_grad_W2 = momentum * mom_grad_W2 + (1.0 - momentum) * grad_W2
            mom_grad_W3 = momentum * mom_grad_W3 + (1.0 - momentum) * grad_W3

            # Now make the actual parameter updates.
            W1.value -= learn_rate * mom_grad_W1
            B1.value -= learn_rate * grad_B1
            W2.value -= learn_rate * mom_grad_W2
            B2.value -= learn_rate * grad_B2
            W3.value -= learn_rate * mom_grad_W3
            B3.value -= learn_rate * grad_B3

        print epoch, total_loss

    # After we've trained, we return a sugary little function handle
    # that makes things easy.  Basically, what we're doing here is
    # pointing the Kayak input object 'X' (that is, the features) at
    # the new data, putting the batcher into test mode, and then asking
    # the output object (not the loss!) for its value.  The point here
    # is that we wind up with a function handle that can be called with
    # a numpy object and it produces the target values for novel data,
    # using the parameters we just learned.

    def compute_predictions(x):
        X.data = x
        batcher.test_mode()
        return Y.value

    return compute_predictions
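A hypothetical usage sketch for the function above; the arrays here are random stand-ins, and batch_size, layer1_sz, layer2_sz, the dropout rates, learn_rate and momentum are assumed to be module-level globals, as in the original script:

# Sketch only: train on random 784-dimensional inputs with one-hot labels.
inputs  = npr.randn(1000, 784)
targets = np.eye(10)[npr.randint(0, 10, 1000)]
predict = train(inputs, targets)
print predict(npr.randn(5, 784)).shape      # expected: (5, 10)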
Example #18
batcher = kayak.Batcher(batch_size, N)

# Build network.
kyk_inputs = kayak.Inputs(X, batcher)

# Labels.
kyk_targets = kayak.Targets(Y, batcher)

# First layer weights and biases.
kyk_W1 = kayak.Parameter(npr.randn(D, H1))
kyk_B1 = kayak.Parameter(npr.randn(1, H1))

# First layer weight mult plus biases, then nonlinearity.
kyk_H1 = kayak.Dropout(kayak.HardReLU(
    kayak.ElemAdd(kayak.MatMult(kyk_inputs, kyk_W1), kyk_B1)),
                       drop_prob=0.5,
                       batcher=batcher)

# Second layer weights and bias.
kyk_W2 = kayak.Parameter(npr.randn(H1, P))
kyk_B2 = kayak.Parameter(npr.randn(1, P))

# Second layer multiplication.
kyk_out = kayak.Dropout(kayak.HardReLU(
    kayak.ElemAdd(kayak.MatMult(kyk_H1, kyk_W2), kyk_B2)),
                        drop_prob=0.5,
                        batcher=batcher)

# Elementwise Loss.
kyk_el_loss = kayak.L2Loss(kyk_out, kyk_targets)
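This excerpt stops at the elementwise loss; a hedged continuation, mirroring the other training loops in this list (learn is an assumed step size), would sum the losses and take plain gradient steps:

# Sketch only: sum the elementwise losses and run simple gradient descent.
kyk_loss = kayak.MatSum(kyk_el_loss)
learn = 0.001                               # assumed step size
for batch in batcher:
    print kyk_loss.value
    for prm in (kyk_W1, kyk_B1, kyk_W2, kyk_B2):
        prm.value -= learn * kyk_loss.grad(prm)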
Example #19
Y = npr.poisson(lam)

kyk_batcher = kayak.Batcher(batch_size, N)

# Build network.
kyk_inputs = kayak.Inputs(X, kyk_batcher)

# Labels.
kyk_targets = kayak.Targets(Y, kyk_batcher)

# Weights.
W = 0.01 * npr.randn(D, P)
kyk_W = kayak.Parameter(W)

# Linear layer.
kyk_activation = kayak.MatMult(kyk_inputs, kyk_W)

# Exponential inverse-link function.
kyk_lam = kayak.ElemExp(kyk_activation)

# Poisson negative log likelihood.
kyk_nll = kyk_lam - kayak.ElemLog(kyk_lam) * kyk_targets

# Sum the losses.
kyk_loss = kayak.MatSum(kyk_nll)

for ii in xrange(100):

    for batch in kyk_batcher:
        loss = kyk_loss.value
        print loss, np.sum((kyk_W.value - true_W)**2)
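As excerpted, the loop above only evaluates the loss; a hedged sketch of the corresponding update step, in the style of the linear-regression example later in this list (learn is an assumed step size):

# Sketch only: gradient step on the Poisson GLM weights.
learn = 0.0001                              # assumed step size
for batch in kyk_batcher:
    loss = kyk_loss.value
    grad = kyk_loss.grad(kyk_W)
    kyk_W.value -= learn * grad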
Example #20
def train(inputs, targets, batch_size, learn_rate, momentum, l1_weight, l2_weight, dropout):

    # Create a batcher object.
    batcher = kayak.Batcher(batch_size, inputs.shape[0])

    # Inputs and targets need access to the batcher.
    X    = kayak.Inputs(inputs, batcher)
    T    = kayak.Targets(targets, batcher)

    # Weights and biases, with random initializations.
    W    = kayak.Parameter( 0.1*npr.randn( inputs.shape[1], 10 ))
    B    = kayak.Parameter( 0.1*npr.randn(1,10) )

    # Nothing fancy here: inputs times weights, plus bias, then softmax.
    dropout_layer = kayak.Dropout(X, dropout, batcher=batcher)
    Y    = kayak.LogSoftMax( kayak.ElemAdd( kayak.MatMult(dropout_layer, W), B ) )

    # The training loss is negative multinomial log likelihood.
    loss = kayak.MatAdd(kayak.MatSum(kayak.LogMultinomialLoss(Y, T)),
                        kayak.L2Norm(W, l2_weight),
                        kayak.L1Norm(W, l1_weight))

    # Use momentum for the gradient-based optimization.
    mom_grad_W = np.zeros(W.shape)

    # Loop over epochs.
    for epoch in xrange(10):

        # Track the total loss and the overall gradient.
        total_loss   = 0.0
        total_grad_W = np.zeros(W.shape)

        # Loop over batches -- using batcher as iterator.
        for batch in batcher:
            # Compute the loss of this minibatch by asking the Kayak
            # object for its value.
            total_loss += loss.value

            # Now ask the loss for its gradient in terms of the
            # weights and the biases -- the two things we're trying to
            # learn here.
            grad_W = loss.grad(W)
            grad_B = loss.grad(B)
            
            # Use momentum on the weight gradient.
            mom_grad_W = momentum*mom_grad_W + (1.0-momentum)*grad_W

            # Now make the actual parameter updates.
            W.value -= learn_rate * mom_grad_W
            B.value -= learn_rate * grad_B

            # Keep track of the gradient to see if we're converging.
            total_grad_W += grad_W

        #print epoch, total_loss, np.sum(total_grad_W**2)

    # After we've trained, we return a sugary little function handle
    # that makes things easy.  Basically, what we're doing here is
    # pointing the Kayak input object 'X' (that is, the features being
    # used here for logistic regression) at the new data, putting the
    # batcher into test mode, and then asking the output object (not
    # the loss!) for its value.  The point here is that we wind up with
    # a function handle that can be called with a numpy object and it
    # produces the target values for novel data, using the parameters
    # we just learned.
    
    def compute_predictions(x):
        X.data = x
        batcher.test_mode()
        return Y.value

    return compute_predictions
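A hypothetical call to the function above, spelling out the regularization and optimization arguments; inputs, targets and new_inputs are assumed numpy arrays and the values are illustrative only:

# Sketch only: train an L1/L2-regularized softmax classifier and predict.
predict = train(inputs, targets,
                batch_size=256, learn_rate=0.01, momentum=0.9,
                l1_weight=1.0, l2_weight=1.0, dropout=0.2)
log_probs = predict(new_inputs)    # one row of log-probabilities per example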
Example #21
Y = np.dot(X, true_W) + 0.1 * npr.randn(N, P)

kyk_batcher = kayak.Batcher(batch_size, N)

# Build network.
kyk_inputs = kayak.Inputs(X, kyk_batcher)

# Labels.
kyk_targets = kayak.Targets(Y, kyk_batcher)

# Weights.
W = 0.01 * npr.randn(D, P)
kyk_W = kayak.Parameter(W)

# Linear layer.
kyk_out = kayak.MatMult(kyk_inputs, kyk_W)

# Elementwise Loss.
kyk_el_loss = kayak.L2Loss(kyk_out, kyk_targets)

# Sum the losses.
kyk_loss = kayak.MatSum(kyk_el_loss)

for ii in xrange(100):

    for batch in kyk_batcher:
        loss = kyk_loss.value
        print loss, np.sum((kyk_W.value - true_W)**2)
        grad = kyk_loss.grad(kyk_W)
        kyk_W.value -= learn * grad
Example #22
def initial_latent_trace(body, inpt, voltage, t):
    I_true = np.diff(voltage) * body.C
    T = I_true.shape[0]
    gs = np.diag([c.g for c in body.children])
    D = int(sum([c.D for c in body.children]))

    driving_voltage = np.dot(np.ones((len(body.children), 1)),
                             np.array([voltage]))[:, :T]

    child_i = 0
    for i in range(D):
        driving_voltage[i, :] = voltage[:T] - body.children[child_i].E

    K = np.array([[max(i - j, 0) for i in range(T)] for j in range(T)])
    K = K.T + K
    K = -1 * (K**2)
    K = np.exp(K / 2)

    L = np.linalg.cholesky(K + (1e-7) * np.eye(K.shape[0]))
    Linv = scipy.linalg.solve_triangular(L.transpose(),
                                         np.identity(K.shape[0]))

    N = 1
    batch_size = 5000
    learn = .0000001
    runs = 10000

    batcher = kayak.Batcher(batch_size, N)

    inputs = kayak.Parameter(driving_voltage)
    targets = kayak.Targets(np.array([I_true]), batcher)

    g_params = kayak.Parameter(gs)
    I_input = kayak.Parameter(inpt.T[:, :T])
    Kinv = kayak.Parameter(np.dot(Linv.transpose(), Linv))

    initial_latent = np.random.randn(D, T)
    latent_trace = kayak.Parameter(initial_latent)
    sigmoid = kayak.Logistic(latent_trace)

    quadratic = kayak.ElemMult(
        sigmoid,
        kayak.MatMult(
            kayak.Parameter(np.array([[0, 1, 0], [0, 0, 0], [0, 0, 0]])),
            sigmoid))
    three_quadratic = kayak.MatMult(
        kayak.Parameter(np.array([[0, 0, 0], [1, 0, 0], [0, 0, 0]])),
        quadratic)
    linear = kayak.MatMult(
        kayak.Parameter(np.array([[0, 0, 0], [0, 0, 0], [0, 0, 1]])), sigmoid)

    leak_open = kayak.Parameter(np.vstack((np.ones((1, T)), np.ones((2, T)))))
    open_fractions = kayak.ElemAdd(leak_open,
                                   kayak.ElemAdd(three_quadratic, linear))

    I_channels = kayak.ElemMult(kayak.MatMult(g_params, inputs),
                                open_fractions)

    I_ionic = kayak.MatMult(kayak.Parameter(np.array([[1, 1, 1]])), I_channels)

    predicted = kayak.MatAdd(I_ionic, I_input)

    nll = kayak.ElemPower(predicted - targets, 2)

    hack_vec = kayak.Parameter(np.array([1, 0, 0, 0, 1, 0, 0, 0, 1]))
    kyk_loss = kayak.MatSum(nll) + kayak.MatMult(
        kayak.Reshape(
            kayak.MatMult(kayak.MatMult(latent_trace, Kinv),
                          kayak.Transpose(latent_trace)),
            (9, )), hack_vec) + kayak.MatSum(kayak.ElemPower(I_channels, 2))

    grad = kyk_loss.grad(latent_trace)
    for ii in xrange(runs):
        for batch in batcher:
            loss = kyk_loss.value
            if ii % 100 == 0:
                print ii, loss, np.sum(np.power(predicted.value - I_true,
                                                2)) / T
            grad = kyk_loss.grad(latent_trace) + .5 * grad
            latent_trace.value -= learn * grad

    return sigmoid.value
Example #23
def train(inputs, targets, batch_size, learn_rate, momentum, l1_weight,
          l2_weight, dropout, improvement_thresh):

    # Create a batcher object.
    batcher = kayak.Batcher(batch_size, inputs.shape[0])

    # Inputs and targets need access to the batcher.
    X = kayak.Inputs(inputs, batcher)
    T = kayak.Targets(targets, batcher)

    # Put some dropout regularization on the inputs
    H = kayak.Dropout(X, dropout)

    # Weights and biases, with random initializations.
    W = kayak.Parameter(0.1 * npr.randn(inputs.shape[1], 10))
    B = kayak.Parameter(0.1 * npr.randn(1, 10))

    # Nothing fancy here: inputs times weights, plus bias, then softmax.
    Y = kayak.LogSoftMax(kayak.ElemAdd(kayak.MatMult(H, W), B))

    # The training loss is negative multinomial log likelihood.
    loss = kayak.MatAdd(kayak.MatSum(kayak.LogMultinomialLoss(Y, T)),
                        kayak.L2Norm(W, l2_weight), kayak.L1Norm(W, l1_weight))

    # Use momentum for the gradient-based optimization.
    mom_grad_W = np.zeros(W.shape)

    best_loss = np.inf
    best_epoch = -1

    # Loop over epochs.
    for epoch in range(100):

        # Track the total loss.
        total_loss = 0.0

        # Loop over batches -- using batcher as iterator.
        for batch in batcher:

            # Draw new random dropouts
            H.draw_new_mask()

            # Compute the loss of this minibatch by asking the Kayak
            # object for its value.
            total_loss += loss.value

            # Now ask the loss for its gradient in terms of the
            # weights and the biases -- the two things we're trying to
            # learn here.
            grad_W = loss.grad(W)
            grad_B = loss.grad(B)

            # Use momentum on the weight gradient.
            mom_grad_W *= momentum
            mom_grad_W += (1.0 - momentum) * grad_W

            # Now make the actual parameter updates.
            W.value -= learn_rate * mom_grad_W
            B.value -= learn_rate * grad_B

        print("Epoch: %d, total loss: %f" % (epoch, total_loss))

        if not np.isfinite(total_loss):
            print("Training diverged. Returning constraint violation.")
            break

        if total_loss < best_loss:
            best_loss = total_loss
            best_epoch = epoch
        else:
            if (epoch - best_epoch) > improvement_thresh:
                print("Has been %d epochs without improvement. Aborting." %
                      (epoch - best_epoch))
                break

    # After we've trained, we return a sugary little function handle
    # that makes things easy.  Basically, what we're doing here is
    # simply replacing the inputs in the above defined graph and then
    # running through it to produce the outputs.
    # The point here is that we wind up with a function
    # handle that can be called with a numpy object and it produces the
    # target values for novel data, using the parameters we just learned.
    def predict(x):
        X.value = x
        H.reinstate_units()
        return Y.value

    return predict
def kayak_mlp(X, y):
    """
    Kayak implementation of an MLP with ReLU hidden layers and dropout.
    """
    # Create a batcher object.
    batcher = kayak.Batcher(batch_size, X.shape[0])

    # count number of rows and columns
    num_examples, num_features = np.shape(X)

    X = kayak.Inputs(X, batcher)
    T = kayak.Targets(y, batcher)

    # ----------------------------- first hidden layer -------------------------------

    # set up weights for our input layer
    # use the same scheme as our numpy mlp
    input_range = 1.0 / num_features ** 0.5
    weights_1 = kayak.Parameter(0.1 * np.random.randn(X.shape[1], layer1_size))
    bias_1 = kayak.Parameter(0.1 * np.random.randn(1, layer1_size))

    # linear combination of weights and inputs
    hidden_1_input = kayak.ElemAdd(kayak.MatMult(X, weights_1), bias_1)

    # apply activation function to hidden layer
    hidden_1_activation = kayak.HardReLU(hidden_1_input)

    # apply a dropout for regularization
    hidden_1_out = kayak.Dropout(hidden_1_activation,
                                 layer1_dropout,
                                 batcher=batcher)

    # ----------------------------- output layer -----------------------------------

    weights_out = kayak.Parameter(0.1 * np.random.randn(layer1_size, 9))
    bias_out = kayak.Parameter(0.1 * np.random.randn(1, 9))

    # linear combination of layer2 output and output weights
    out = kayak.ElemAdd(kayak.MatMult(hidden_1_out, weights_out), bias_out)

    # apply activation function to output
    yhat = kayak.SoftMax(out)

    # ----------------------------- loss function -----------------------------------

    loss = kayak.MatAdd(kayak.MatSum(kayak.L2Loss(yhat, T)),
                        kayak.L2Norm(weights_1, layer1_l2))

    # Use momentum for the gradient-based optimization.
    mom_grad_W1 = np.zeros(weights_1.shape)
    mom_grad_W2 = np.zeros(weights_out.shape)

    # Loop over epochs.
    plot_loss = np.ones((iterations, 2))
    for epoch in xrange(iterations):

        # Track the total loss.
        total_loss = 0.0

        for batch in batcher:
            # Compute the loss of this minibatch by asking the Kayak
            # object for its value.
            total_loss += loss.value

            # Now ask the loss for its gradients with respect to the
            # weights and the biases -- the quantities we're trying to
            # learn here.
            grad_W1 = loss.grad(weights_1)
            grad_B1 = loss.grad(bias_1)
            grad_W2 = loss.grad(weights_out)
            grad_B2 = loss.grad(bias_out)

            # Use momentum on the weight gradients.
            mom_grad_W1 = momentum * mom_grad_W1 + (1.0 - momentum) * grad_W1
            mom_grad_W2 = momentum * mom_grad_W2 + (1.0 - momentum) * grad_W2

            # Now make the actual parameter updates.
            weights_1.value -= learn_rate * mom_grad_W1
            bias_1.value -= learn_rate * grad_B1
            weights_out.value -= learn_rate * mom_grad_W2
            bias_out.value -= learn_rate * grad_B2

        # save values into table to print learning curve at the end of training
        plot_loss[epoch, 0] = epoch
        plot_loss[epoch, 1] = total_loss
        print epoch, total_loss

    #pyplot.plot(plot_loss[:,0], plot_loss[:,1], linewidth=2.0)
    #pyplot.show()

    def compute_predictions(x):
        X.data = x
        batcher.test_mode()
        return yhat.value

    return compute_predictions
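A hypothetical usage sketch for kayak_mlp; the arrays are random stand-ins with 9 one-hot classes to match the output layer above, and batch_size, layer1_size, layer1_dropout, layer1_l2, learn_rate, momentum and iterations are assumed module-level globals:

# Sketch only: fit the one-hidden-layer MLP and score a few held-out rows.
X_train = np.random.randn(500, 20)
y_train = np.eye(9)[np.random.randint(0, 9, 500)]
predict = kayak_mlp(X_train, y_train)
print predict(np.random.randn(5, 20)).shape     # softmax outputs, expected (5, 9)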