def test_scalar_value():
    npr.seed(1)

    for ii in xrange(NUM_TRIALS):
        np_X = npr.randn()
        X = kayak.Parameter(np_X)
        out = kayak.L1Norm(X)

        assert close_float(out.value, np.abs(np_X))

def test_matrix_value():
    npr.seed(7)

    for ii in xrange(NUM_TRIALS):
        np_X = npr.randn(10, 20)
        wt = np.exp(npr.randn())
        X = kayak.Parameter(np_X)
        out = kayak.L1Norm(X, weight=wt)

        assert close_float(out.value, wt * np.sum(np.abs(np_X)))

def test_scalar_value_2():
    npr.seed(3)

    for ii in xrange(NUM_TRIALS):
        np_X = npr.randn()
        wt = np.exp(npr.randn())
        X = kayak.Parameter(np_X)
        out = kayak.L1Norm(X, weight=wt)

        assert close_float(out.value, wt * np.abs(np_X))

def test_scalar_grad():
    npr.seed(2)

    for ii in xrange(NUM_TRIALS):
        # Resample until we are safely away from zero, where the L1
        # gradient is not defined.
        while True:
            np_X = npr.randn()
            if np.abs(np_X) > 0.1:
                break

        X = kayak.Parameter(np_X)
        out = kayak.L1Norm(X)

        assert close_float(out.grad(X), np.sign(np_X))
        assert kayak.util.checkgrad(X, out) < MAX_GRAD_DIFF

def test_vector_grad():
    npr.seed(6)

    for ii in xrange(NUM_TRIALS):
        # Resample until every entry is safely away from zero, where
        # the L1 gradient is not defined.
        while True:
            np_X = npr.randn(10)
            if np.all(np.abs(np_X) > 0.1):
                break

        wt = np.exp(npr.randn())
        X = kayak.Parameter(np_X)
        out = kayak.L1Norm(X, weight=wt)

        assert np.all(close_float(out.grad(X), wt * np.sign(np_X)))
        assert kayak.util.checkgrad(X, out) < MAX_GRAD_DIFF
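
# The tests above assume a few fixtures that are not shown in this
# excerpt (the imports, NUM_TRIALS, MAX_GRAD_DIFF, and close_float).
# The block below is a minimal sketch of what those fixtures might
# look like; the constants and the tolerance are assumptions, not the
# library's definitions.
import numpy as np
import numpy.random as npr

import kayak

NUM_TRIALS    = 250    # assumed number of randomized repetitions per test
MAX_GRAD_DIFF = 1e-7   # assumed tolerance for kayak.util.checkgrad

def close_float(a, b, tol=1e-6):
    # Elementwise closeness check used by the value and gradient asserts.
    return np.abs(a - b) < tol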
def train(inputs, targets, batch_size, learn_rate, momentum,
          l1_weight, l2_weight, dropout, improvement_thresh):

    # Create a batcher object.
    batcher = kayak.Batcher(batch_size, inputs.shape[0])

    # Inputs and targets need access to the batcher.
    X = kayak.Inputs(inputs, batcher)
    T = kayak.Targets(targets, batcher)

    # Put some dropout regularization on the inputs.
    H = kayak.Dropout(X, dropout)

    # Weights and biases, with random initializations.
    W = kayak.Parameter(0.1 * npr.randn(inputs.shape[1], 10))
    B = kayak.Parameter(0.1 * npr.randn(1, 10))

    # Nothing fancy here: inputs times weights, plus bias, then softmax.
    Y = kayak.LogSoftMax(kayak.ElemAdd(kayak.MatMult(H, W), B))

    # The training loss is the negative multinomial log likelihood.
    loss = kayak.MatAdd(kayak.MatSum(kayak.LogMultinomialLoss(Y, T)),
                        kayak.L2Norm(W, l2_weight),
                        kayak.L1Norm(W, l1_weight))

    # Use momentum for the gradient-based optimization.
    mom_grad_W = np.zeros(W.shape)

    best_loss  = np.inf
    best_epoch = -1

    # Loop over epochs.
    for epoch in range(100):

        # Track the total loss.
        total_loss = 0.0

        # Loop over batches -- using the batcher as an iterator.
        for batch in batcher:

            # Draw new random dropout masks.
            H.draw_new_mask()

            # Compute the loss of this minibatch by asking the Kayak
            # loss object for its value.
            total_loss += loss.value

            # Now ask the loss for its gradient in terms of the
            # weights and the biases -- the two things we're trying to
            # learn here.
            grad_W = loss.grad(W)
            grad_B = loss.grad(B)

            # Use momentum on the weight gradient.
            mom_grad_W *= momentum
            mom_grad_W += (1.0 - momentum) * grad_W

            # Now make the actual parameter updates.
            W.value -= learn_rate * mom_grad_W
            B.value -= learn_rate * grad_B

        print("Epoch: %d, total loss: %f" % (epoch, total_loss))

        if not np.isfinite(total_loss):
            print("Training diverged. Returning constraint violation.")
            break

        if total_loss < best_loss:
            best_loss  = total_loss
            best_epoch = epoch
        else:
            if (epoch - best_epoch) > improvement_thresh:
                print("Has been %d epochs without improvement. Aborting."
                      % (epoch - best_epoch))
                break

    # After we've trained, we return a sugary little function handle
    # that makes things easy.  Basically, what we're doing here is
    # simply replacing the inputs in the above defined graph and then
    # running through it to produce the outputs.  The point here is
    # that we wind up with a function handle that can be called with a
    # numpy object and produces the target values for novel data,
    # using the parameters we just learned.
    def predict(x):
        X.value = x
        H.reinstate_units()
        return Y.value

    return predict
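
# A rough sketch of how the train() function above might be driven.
# The data loading and the train/validation split are placeholders
# (load_data is a hypothetical helper, not part of Kayak), and the
# hyperparameter values are illustrative; the point is only that
# train() returns a prediction closure that can be applied to
# held-out inputs.
if __name__ == '__main__':
    train_X, train_T, valid_X, valid_T = load_data()  # hypothetical loader

    predict = train(train_X, train_T,
                    batch_size=256, learn_rate=0.001, momentum=0.9,
                    l1_weight=1.0, l2_weight=1.0, dropout=0.25,
                    improvement_thresh=5)

    # Compare the argmax of the predicted log-probabilities against the
    # one-hot targets to get a validation error rate.
    valid_err = np.mean(np.argmax(predict(valid_X), axis=1)
                        != np.argmax(valid_T, axis=1))
    print("Validation error rate: %f" % valid_err)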
kyk_B2 = kayak.Parameter(npr.randn(1, P))

# Second layer multiplication.
kyk_out = kayak.Dropout(kayak.HardReLU(kayak.ElemAdd(kayak.MatMult(kyk_H1, kyk_W2),
                                                     kyk_B2)),
                        drop_prob=0.5, batcher=batcher)

# Elementwise loss.
kyk_el_loss = kayak.L2Loss(kyk_out, kyk_targets)

# Sum the losses.
kyk_loss = kayak.MatSum(kyk_el_loss)

# Roll in the weight regularization.
kyk_obj = kayak.ElemAdd(kyk_loss,
                        kayak.L1Norm(kyk_W1, weight=100.0),
                        kayak.L1Norm(kyk_W2, weight=100.0))

print "W2:", kayak.util.checkgrad(kyk_W2, kyk_obj)
print "B2:", kayak.util.checkgrad(kyk_B2, kyk_obj)
print "W1:", kayak.util.checkgrad(kyk_W1, kyk_obj)
print "B1:", kayak.util.checkgrad(kyk_B1, kyk_obj)

t0 = time.time()
for ii in xrange(10):
    for batch in batcher:
        val = kyk_obj.value
        grad_W1 = kyk_obj.grad(kyk_W1)
        grad_B1 = kyk_obj.grad(kyk_B1)
        grad_W2 = kyk_obj.grad(kyk_W2)
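
# For reference, kayak.util.checkgrad compares the objective's own
# gradient against a numerical estimate and reports the discrepancy,
# which is what the print statements above are inspecting.  The helper
# below is a simplified central-difference sketch of that idea, written
# against the Parameter/value/grad interface used in this script; it is
# an illustration, not Kayak's actual implementation (and with dropout
# in the graph the mask would need to be held fixed between evaluations
# for the comparison to be meaningful).
def fd_checkgrad(param, objective, eps=1e-5):
    analytic = objective.grad(param)
    numeric = np.zeros(param.shape)
    base = param.value.copy()
    for idx in np.ndindex(*param.shape):
        # Perturb one entry up and down, re-evaluating the objective.
        up = base.copy()
        up[idx] = base[idx] + eps
        param.value = up
        f_plus = objective.value

        down = base.copy()
        down[idx] = base[idx] - eps
        param.value = down
        f_minus = objective.value

        numeric[idx] = (f_plus - f_minus) / (2.0 * eps)
    param.value = base
    return np.max(np.abs(analytic - numeric))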
def train(inputs, targets, batch_size, learn_rate, momentum,
          l1_weight, l2_weight, dropout):

    # Create a batcher object.
    batcher = kayak.Batcher(batch_size, inputs.shape[0])

    # Inputs and targets need access to the batcher.
    X = kayak.Inputs(inputs, batcher)
    T = kayak.Targets(targets, batcher)

    # Weights and biases, with random initializations.
    W = kayak.Parameter(0.1 * npr.randn(inputs.shape[1], 10))
    B = kayak.Parameter(0.1 * npr.randn(1, 10))

    # Nothing fancy here: inputs times weights, plus bias, then softmax.
    dropout_layer = kayak.Dropout(X, dropout, batcher=batcher)
    Y = kayak.LogSoftMax(kayak.ElemAdd(kayak.MatMult(dropout_layer, W), B))

    # The training loss is the negative multinomial log likelihood.
    loss = kayak.MatAdd(kayak.MatSum(kayak.LogMultinomialLoss(Y, T)),
                        kayak.L2Norm(W, l2_weight),
                        kayak.L1Norm(W, l1_weight))

    # Use momentum for the gradient-based optimization.
    mom_grad_W = np.zeros(W.shape)

    # Loop over epochs.
    for epoch in xrange(10):

        # Track the total loss and the overall gradient.
        total_loss = 0.0
        total_grad_W = np.zeros(W.shape)

        # Loop over batches -- using the batcher as an iterator.
        for batch in batcher:

            # Compute the loss of this minibatch by asking the Kayak
            # loss object for its value.
            total_loss += loss.value

            # Now ask the loss for its gradient in terms of the
            # weights and the biases -- the two things we're trying to
            # learn here.
            grad_W = loss.grad(W)
            grad_B = loss.grad(B)

            # Use momentum on the weight gradient.
            mom_grad_W = momentum * mom_grad_W + (1.0 - momentum) * grad_W

            # Now make the actual parameter updates.
            W.value -= learn_rate * mom_grad_W
            B.value -= learn_rate * grad_B

            # Keep track of the gradient to see if we're converging.
            total_grad_W += grad_W

        #print epoch, total_loss, np.sum(total_grad_W**2)

    # After we've trained, we return a sugary little function handle
    # that makes things easy.  Basically, what we're doing here is
    # replacing the data behind the Kayak input object 'X' (the
    # features used for logistic regression) with the novel inputs,
    # putting the batcher into test mode so dropout is disabled, and
    # then running forward through the graph to produce the outputs.
    # The point here is that we wind up with a function handle that
    # can be called with a numpy object and produces the target values
    # for novel data, using the parameters we just learned.
    def compute_predictions(x):
        X.data = x
        batcher.test_mode()
        return Y.value

    return compute_predictions
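
# Because this version of train() exposes the regularization strengths
# as arguments, a small sweep over them is just a loop.  This sketch
# reuses the hypothetical validation data from the earlier example and
# keeps the l1_weight with the lowest validation error; the candidate
# values and the other hyperparameters are illustrative assumptions.
def sweep_l1(train_X, train_T, valid_X, valid_T):
    best_err = np.inf
    best_l1 = None
    for l1_weight in [0.1, 1.0, 10.0]:
        pred = train(train_X, train_T,
                     batch_size=256, learn_rate=0.001, momentum=0.9,
                     l1_weight=l1_weight, l2_weight=1.0, dropout=0.25)
        err = np.mean(np.argmax(pred(valid_X), axis=1)
                      != np.argmax(valid_T, axis=1))
        if err < best_err:
            best_err = err
            best_l1 = l1_weight
    return best_l1, best_err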