def test_hardrelu_values():
    """Check that HardReLU is non-negative and equals elementwise max(x, 0)."""
    npr.seed(1)

    for _ in range(NUM_TRIALS):
        raw = npr.randn(6, 5)
        relu = kayak.HardReLU(kayak.Parameter(raw))

        # No entry of the rectified output may be negative.
        assert np.all(relu.value >= 0.0)

        # The output must match the NumPy reference exactly.
        assert np.all(np.maximum(raw, 0.0) == relu.value)
def test_hardrelu_grad():
    """Check the HardReLU gradient against a finite-difference estimate.

    For each trial a random 6x5 parameter is pushed through HardReLU and
    summed with MatSum.  The gradient of the sum with respect to the
    parameter must be non-negative everywhere, and the discrepancy reported
    by ``kayak.util.checkgrad`` must stay below ``MAX_GRAD_DIFF``.
    """
    npr.seed(2)

    # Needs to be small due to non-differentiability.
    epsilon = 1e-6

    for ii in range(NUM_TRIALS):
        np_X = npr.randn(6, 5)
        X = kayak.Parameter(np_X)
        Y = kayak.HardReLU(X)
        Z = kayak.MatSum(Y)

        # Evaluate the output before asking for gradients, as the original
        # test did (presumably to force the forward pass -- TODO confirm).
        Z.value

        assert np.all(Z.grad(X) >= 0.0)

        # Run the finite-difference check once and reuse the result, so the
        # value that is printed is exactly the value that is asserted on.
        grad_diff = kayak.util.checkgrad(X, Z, epsilon)
        print(" ".join(("CHECKGRAD: ", str(ii), str(grad_diff))))
        assert grad_diff < MAX_GRAD_DIFF
def kayak_mlp(X, y):
    """
    Train a Kayak MLP with one HardReLU hidden layer, dropout and a softmax
    output, and return a prediction function.

    The loss is the summed L2 loss between the softmax output and the
    targets, plus an L2 penalty on the first-layer weights only.  Training
    runs for ``iterations`` epochs of minibatch gradient descent; momentum is
    applied to the two weight matrices, while the biases take plain gradient
    steps.

    Hyperparameters are read from module-level names: ``batch_size``,
    ``layer1_size``, ``layer1_dropout``, ``layer1_l2``, ``iterations``,
    ``momentum`` and ``learn_rate``.

    Parameters
    ----------
    X : two-dimensional array
        Training inputs, one example per row and one feature per column.
    y : array
        Training targets, passed unchanged to ``kayak.Targets``.  The output
        layer has 9 units, so this presumably has 9 columns -- confirm
        against the caller.

    Returns
    -------
    callable
        ``compute_predictions(x)``: replaces the network input data with
        ``x``, switches the batcher to test mode and returns the softmax
        output value.
    """
    # Number of output units; hard-coded in the original network definition.
    num_classes = 9

    # Count rows and columns of the raw input data.  Unpacking also rejects
    # input that is not two-dimensional.
    num_examples, num_features = np.shape(X)

    # Create a batcher object.
    batcher = kayak.Batcher(batch_size, num_examples)

    X = kayak.Inputs(X, batcher)
    T = kayak.Targets(y, batcher)

    # ----------------------------- first hidden layer -----------------------
    # Small random initial weights and biases.
    weights_1 = kayak.Parameter(
        0.1 * np.random.randn(num_features, layer1_size))
    bias_1 = kayak.Parameter(0.1 * np.random.randn(1, layer1_size))

    # Linear combination of weights and inputs.
    hidden_1_input = kayak.ElemAdd(kayak.MatMult(X, weights_1), bias_1)

    # Apply the activation function to the hidden layer.
    hidden_1_activation = kayak.HardReLU(hidden_1_input)

    # Apply dropout for regularization.
    hidden_1_out = kayak.Dropout(hidden_1_activation, layer1_dropout,
                                 batcher=batcher)

    # ----------------------------- output layer -----------------------------
    weights_out = kayak.Parameter(
        0.1 * np.random.randn(layer1_size, num_classes))
    bias_out = kayak.Parameter(0.1 * np.random.randn(1, num_classes))

    # Linear combination of the hidden-layer output and the output weights.
    out = kayak.ElemAdd(kayak.MatMult(hidden_1_out, weights_out), bias_out)

    # Apply the activation function to the output.
    yhat = kayak.SoftMax(out)

    # ----------------------------- loss function ----------------------------
    # Only the first-layer weights are penalized.
    loss = kayak.MatAdd(kayak.MatSum(kayak.L2Loss(yhat, T)),
                        kayak.L2Norm(weights_1, layer1_l2))

    # Use momentum for the gradient-based optimization.
    mom_grad_W1 = np.zeros(weights_1.shape)
    mom_grad_W2 = np.zeros(weights_out.shape)

    # Per-epoch (epoch index, total loss) table for a learning curve.
    plot_loss = np.ones((iterations, 2))

    # Loop over epochs.
    for epoch in range(iterations):

        # Track the total loss.
        total_loss = 0.0

        for _ in batcher:
            # Compute the loss of this minibatch by asking the Kayak
            # object for its value.
            total_loss += loss.value

            # Now ask the loss for its gradient in terms of the weights and
            # the biases -- the things we're trying to learn here.  All
            # gradients are taken before any parameter is modified.
            grad_W1 = loss.grad(weights_1)
            grad_B1 = loss.grad(bias_1)
            grad_W2 = loss.grad(weights_out)
            grad_B2 = loss.grad(bias_out)

            # Use momentum on the weight gradients.
            mom_grad_W1 = momentum * mom_grad_W1 + (1.0 - momentum) * grad_W1
            mom_grad_W2 = momentum * mom_grad_W2 + (1.0 - momentum) * grad_W2

            # Now make the actual parameter updates.
            weights_1.value -= learn_rate * mom_grad_W1
            bias_1.value -= learn_rate * grad_B1
            weights_out.value -= learn_rate * mom_grad_W2
            bias_out.value -= learn_rate * grad_B2

        # Save values into the table so the learning curve can be plotted
        # at the end of training.
        plot_loss[epoch, 0] = epoch
        plot_loss[epoch, 1] = total_loss

        print("%s %s" % (epoch, total_loss))

    # NOTE: plotting of the learning curve is disabled.  To re-enable it,
    # plot plot_loss[:, 0] against plot_loss[:, 1] with pyplot here.

    def compute_predictions(x):
        """Return the softmax output for ``x`` with the batcher in test mode."""
        X.data = x
        batcher.test_mode()
        return yhat.value

    return compute_predictions
# Random regression targets and a batcher over the N examples.
Y = npr.randn(N, P)
batcher = kayak.Batcher(batch_size, N)

# Network inputs and labels, both driven by the batcher.
kyk_inputs = kayak.Inputs(X, batcher)
kyk_targets = kayak.Targets(Y, batcher)

# First layer: random weights and biases, affine map, HardReLU, then
# dropout with probability 0.5.
kyk_W1 = kayak.Parameter(npr.randn(D, H1))
kyk_B1 = kayak.Parameter(npr.randn(1, H1))
kyk_A1 = kayak.ElemAdd(kayak.MatMult(kyk_inputs, kyk_W1), kyk_B1)
kyk_H1 = kayak.Dropout(kayak.HardReLU(kyk_A1),
                       drop_prob=0.5,
                       batcher=batcher)

# Second layer: same structure, mapping the H1 hidden units to P outputs.
kyk_W2 = kayak.Parameter(npr.randn(H1, P))
kyk_B2 = kayak.Parameter(npr.randn(1, P))
kyk_A2 = kayak.ElemAdd(kayak.MatMult(kyk_H1, kyk_W2), kyk_B2)
kyk_out = kayak.Dropout(kayak.HardReLU(kyk_A2),
                        drop_prob=0.5,
                        batcher=batcher)

# Elementwise L2 loss between the network output and the targets.
kyk_el_loss = kayak.L2Loss(kyk_out, kyk_targets)
def train(inputs, targets):
    """Train a two-hidden-layer ReLU/dropout network and return a predictor.

    The network is: affine -> HardReLU -> Dropout (twice), followed by an
    affine output layer with 10 units and a LogSoftMax.  The training loss
    is the summed ``LogMultinomialLoss`` between that output and the
    targets.  Training runs for 10 epochs of minibatch gradient descent,
    with momentum on the three weight matrices and plain gradient steps on
    the biases.

    Hyperparameters come from module-level names: ``batch_size``,
    ``layer1_sz``, ``layer2_sz``, ``layer1_dropout``, ``layer2_dropout``,
    ``momentum`` and ``learn_rate``.

    Returns a function ``compute_predictions(x)`` that evaluates the
    network output on ``x`` with the batcher in test mode.
    """
    # Create a batcher object; inputs and targets need access to it.
    batcher = kayak.Batcher(batch_size, inputs.shape[0])
    X = kayak.Inputs(inputs, batcher)
    T = kayak.Targets(targets, batcher)

    def affine(node, fan_in, fan_out):
        # Randomly initialised weights, then biases, then node * W + B.
        weights = kayak.Parameter(0.1 * npr.randn(fan_in, fan_out))
        biases = kayak.Parameter(0.1 * npr.randn(1, fan_out))
        return kayak.ElemAdd(kayak.MatMult(node, weights), biases), weights, biases

    # First hidden layer: ReLU + Dropout.
    pre1, W1, B1 = affine(X, inputs.shape[1], layer1_sz)
    H1 = kayak.Dropout(kayak.HardReLU(pre1), layer1_dropout, batcher=batcher)

    # Second hidden layer: ReLU + Dropout.
    pre2, W2, B2 = affine(H1, layer1_sz, layer2_sz)
    H2 = kayak.Dropout(kayak.HardReLU(pre2), layer2_dropout, batcher=batcher)

    # Output layer.
    logits, W3, B3 = affine(H2, layer2_sz, 10)
    Y = kayak.LogSoftMax(logits)

    # The training loss is negative multinomial log likelihood.
    loss = kayak.MatSum(kayak.LogMultinomialLoss(Y, T))

    weight_params = (W1, W2, W3)
    bias_params = (B1, B2, B3)

    # One momentum accumulator per weight matrix, starting at zero.
    velocities = [np.zeros(w.shape) for w in weight_params]

    # Loop over epochs.
    for epoch in range(10):

        # Track the total loss.
        total_loss = 0.0

        # Loop over batches -- using batcher as iterator.
        for _ in batcher:
            # Loss of this minibatch.
            total_loss += loss.value

            # Gradients for every layer (weights, then biases), all taken
            # before any parameter is modified.
            layer_grads = [(loss.grad(w), loss.grad(b))
                           for w, b in zip(weight_params, bias_params)]

            # Use momentum on the weight gradients.
            velocities = [momentum * v + (1.0 - momentum) * g_w
                          for v, (g_w, _g_b) in zip(velocities, layer_grads)]

            # Now make the actual parameter updates, layer by layer.
            for w, b, v, (_g_w, g_b) in zip(weight_params, bias_params,
                                            velocities, layer_grads):
                w.value -= learn_rate * v
                b.value -= learn_rate * g_b

        print("%s %s" % (epoch, total_loss))

    # After training, hand back a small closure over the network: it swaps
    # new data into the input node, puts the batcher in test mode, and
    # returns the output node's value (not the loss) computed with the
    # parameters we just learned.
    def compute_predictions(x):
        X.data = x
        batcher.test_mode()
        return Y.value

    return compute_predictions