def initialize_parameters(input_size, hidden_sizes, output_size):
    """
    Initializes the learnable parameters (connection weights and biases) for a
    neural network. He initialization is used so the weights work well with
    ReLU activations.

    :param input_size: The size of the input layer.
    :param hidden_sizes: A list of hidden layer sizes.
    :param output_size: The size of the output layer.
    :return: The connection weights and biases for the neural network.
    """
    wh = []
    bh = []
    for i in range(len(hidden_sizes)):
        # He initialization: scale each layer by sqrt(2 / fan_in), where
        # fan_in is the size of *that* layer's input (not the network input).
        fan_in = input_size if i == 0 else hidden_sizes[i - 1]
        wh.append(np.random.randn(fan_in, hidden_sizes[i]) * sqrt(2.0 / fan_in))
        bh.append(np.zeros((1, hidden_sizes[i])))
    w_out = np.random.randn(hidden_sizes[-1], output_size) * sqrt(
        2.0 / hidden_sizes[-1])
    b_out = np.zeros((1, output_size))
    return wh, bh, w_out, b_out
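# Usage sketch (assumes `import numpy as np` and `from math import sqrt`, as
# the function above does): build parameters for a hypothetical 784-64-32-10
# network and check the resulting shapes.
wh, bh, w_out, b_out = initialize_parameters(784, [64, 32], 10)
assert [w.shape for w in wh] == [(784, 64), (64, 32)]
assert [b.shape for b in bh] == [(1, 64), (1, 32)]
assert w_out.shape == (32, 10) and b_out.shape == (1, 10)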
def main(args):
    # Create model.
    model = TwoLayerNet(args)
    for k, v in model.param_configs.items():
        model.params[k] = np.zeros(v['shape'])
    img = np.zeros((args.batch_size, 784))
    label = np.zeros((args.batch_size,))
    start = time.time()
    for l in range(num_loops):

        def loss_func(*params):
            f = model.forward(img, 'train')
            return model.loss(f, label)

        if args.only_forward:
            loss = loss_func()
            loss.asnumpy()
        else:
            param_arrays = list(model.params.values())
            param_keys = list(model.params.keys())
            grad_and_loss_func = minpy.core.grad_and_loss(
                loss_func, argnum=range(len(param_arrays)))
            grad_arrays, loss = grad_and_loss_func(*param_arrays)
            for g in grad_arrays:
                g.get_data(minpy.array_variants.ArrayType.MXNET).wait_to_read()
    dur = time.time() - start
    print('Per Loop Time: %.6f' % (dur / num_loops))
def lstm_temporal(x, h0, Wx, Wh, b):
    """
    Forward pass for an LSTM over an entire sequence of data.

    We assume an input sequence composed of T vectors, each of dimension D.
    The LSTM uses a hidden size of H, and we work over a minibatch containing
    N sequences. After running the LSTM forward, we return the hidden states
    for all timesteps.

    Note that the initial hidden state is passed as input, while the initial
    cell state is set to zero. Also note that the cell state is not returned;
    it is an internal variable to the LSTM and is not accessed from outside.

    Inputs:
    - x: Input data of shape (N, T, D)
    - h0: Initial hidden state of shape (N, H)
    - Wx: Weights for input-to-hidden connections, of shape (D, 4H)
    - Wh: Weights for hidden-to-hidden connections, of shape (H, 4H)
    - b: Biases of shape (4H,)

    Returns:
    - h: Hidden states for all timesteps of all sequences, of shape (N, T, H)
    """
    N, T, D = x.shape
    _, H = h0.shape
    c = np.zeros([N, 0, H])
    h = np.zeros([N, 0, H])
    for t in xrange(T):
        h_step, c_step = lstm_step(x[:, t, :],
                                   h[:, t - 1, :] if t > 0 else h0,
                                   c[:, t - 1, :] if t > 0 else np.zeros((N, H)),
                                   Wx, Wh, b)
        h_step = h_step.reshape(N, 1, H)
        c_step = c_step.reshape(N, 1, H)
        h = np.append(h, h_step, axis=1)
        c = np.append(c, c_step, axis=1)
    return h
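# Shape-check sketch (hypothetical sizes; assumes an
# `lstm_step(x_t, prev_h, prev_c, Wx, Wh, b) -> (next_h, next_c)` like the
# one used above is in scope):
N, T, D, H = 4, 5, 6, 3
x = np.random.randn(N, T, D)
h0 = np.random.randn(N, H)
Wx = np.random.randn(D, 4 * H)
Wh = np.random.randn(H, 4 * H)
b = np.zeros(4 * H)
h = lstm_temporal(x, h0, Wx, Wh, b)
assert h.shape == (N, T, H)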
def main(args):
    # Create model.
    model = RNNNet(args)
    for k, v in model.param_configs.items():
        model.params[k] = np.zeros(v['shape'])
    data = np.zeros((args.batch_size,
                     args.input_size))  # Data of only one time step.
    label = np.zeros((args.batch_size,))
    for l in range(args.num_loops):
        if l == num_cold:
            start = time.time()

        def loss_func(*params):
            f = model.forward(data, 'train')
            return model.loss(f, label)

        if args.only_forward:
            loss = loss_func()
            loss.wait_to_read()
        else:
            param_arrays = list(model.params.values())
            param_keys = list(model.params.keys())
            grad_and_loss_func = core.grad_and_loss(
                loss_func, argnum=range(len(param_arrays)))
            grad_arrays, loss = grad_and_loss_func(*param_arrays)
            for g in grad_arrays:
                g.wait_to_read()
    dur = time.time() - start
    print('Per Loop Time: %.6f' % (dur / (args.num_loops - num_cold)))
def __init__(self,
             input_dim=3 * 32 * 32,
             hidden_dim=100,
             num_classes=10,
             weight_scale=1e-3,
             reg=0.0,
             conv_mode='lazy',
             dtype=py_np.float64):
    """
    Initialize a new network.

    Inputs:
    - input_dim: An integer giving the size of the input
    - hidden_dim: An integer giving the size of the hidden layer
    - num_classes: An integer giving the number of classes to classify
    - weight_scale: Scalar giving the standard deviation for random
      initialization of the weights.
    - reg: Scalar giving L2 regularization strength.
    """
    super(TwoLayerNet, self).__init__(conv_mode)
    self.params = {}
    self.reg = reg
    self.params['W1'] = random.randn(input_dim, hidden_dim) * weight_scale
    self.params['b1'] = np.zeros((hidden_dim,))
    self.params['W2'] = random.randn(hidden_dim, num_classes) * weight_scale
    self.params['b2'] = np.zeros((num_classes,))
def main(args):
    # Create model.
    model = RNNNet(args)
    for k, v in model.param_configs.items():
        model.params[k] = np.zeros(v['shape'])
    data = np.zeros((args.batch_size,
                     args.input_size))  # Data of only one time step.
    label = np.zeros((args.batch_size,), dtype=np.int)
    for l in range(args.num_loops):
        if l == num_cold:
            start = time.time()

        def loss_func(*params):
            f = model.forward(data, 'train')
            return model.loss(f, label)

        if args.only_forward:
            loss = loss_func()
            loss.asnumpy()
        else:
            param_arrays = list(model.params.values())
            param_keys = list(model.params.keys())
            grad_and_loss_func = core.grad_and_loss(
                loss_func, argnum=range(len(param_arrays)))
            grad_arrays, loss = grad_and_loss_func(*param_arrays)
    dur = time.time() - start
    print('Per Loop Time: %.6f' % (dur / (args.num_loops - num_cold)))
def getLaplacian(self, W):
    # D is the diagonal degree matrix; L = D - W is the graph Laplacian.
    D = np.zeros((W.shape[0], W.shape[1]))
    for i in range(W.shape[1]):
        D[i][i] = np.sum(W[:, i])
    L = D - W
    return [D, L]
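# Sanity-check sketch for getLaplacian (called here on a hypothetical
# instance `model` of the owning class): for a symmetric affinity matrix W,
# every row of the Laplacian sums to zero, i.e. L @ ones == 0.
W = np.array([[0., 1., 1.], [1., 0., 0.], [1., 0., 0.]])
D, L = model.getLaplacian(W)
assert np.allclose(np.dot(L, np.ones(3)), 0)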
def main(args):
    # Create model.
    model = TwoLayerNet(args)
    for k, v in model.param_configs.items():
        model.params[k] = np.zeros(v['shape'])
    img = np.zeros((args.batch_size, 784))
    label = np.zeros((args.batch_size,), dtype=np.int)
    for l in range(args.num_loops):
        if l == num_cold:
            start = time.time()

        def loss_func(*params):
            f = model.forward(img, 'train')
            return model.loss(f, label)

        if args.only_forward:
            loss = loss_func()
            loss.asnumpy()
        else:
            param_arrays = list(model.params.values())
            param_keys = list(model.params.keys())
            grad_and_loss_func = core.grad_and_loss(
                loss_func, argnum=range(len(param_arrays)))
            grad_arrays, loss = grad_and_loss_func(*param_arrays)
    dur = time.time() - start
    print('Per Loop Time: %.6f' % (dur / (args.num_loops - num_cold)))
def gaussian_cluster_generator(num_samples=10000, num_features=500,
                               num_classes=5):
    mu = np.random.rand(num_classes, num_features)
    sigma = np.ones((num_classes, num_features)) * 0.1
    num_cls_samples = int(num_samples / num_classes)
    x = np.zeros((num_samples, num_features))
    y = np.zeros((num_samples, num_classes))
    for i in range(num_classes):
        cls_samples = np.random.normal(mu[i, :], sigma[i, :],
                                       (num_cls_samples, num_features))
        x[i * num_cls_samples:(i + 1) * num_cls_samples] = cls_samples
        y[i * num_cls_samples:(i + 1) * num_cls_samples, i] = 1
    return x, y
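# Usage sketch: draw a small synthetic dataset and check its shapes and
# one-hot labels (assumes `import numpy as np`).
x, y = gaussian_cluster_generator(num_samples=100, num_features=10,
                                  num_classes=5)
assert x.shape == (100, 10) and y.shape == (100, 5)
assert np.all(np.sum(y, axis=1) == 1)  # exactly one hot bit per sample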
def getP(self, Xmask, F, X_big):
    size = F.shape[0]
    Q = np.zeros((size, size))
    for i in range(size):
        for j in range(size):
            Q[i, j] = self.norm(F[i] - F[j])
    P = (2 * Xmask * X_big - self.rho * Q) / (2 * Xmask + np.full(
        (size, size), self.alpha))
    return P
def forward(self, X, mode):
    seq_len = X.shape[1]
    batch_size = X.shape[0]
    hidden_size = self.params['Wh'].shape[0]
    h = np.zeros((batch_size, hidden_size))
    c = np.zeros((batch_size, hidden_size))
    for t in xrange(seq_len):
        h, c = layers.lstm_step(X[:, t, :], h, c, self.params['Wx'],
                                self.params['Wh'], self.params['b'])
    y = layers.affine(h, self.params['Wa'], self.params['ba'])
    return y
def statistics(data, length):
    xax = np.zeros(length)
    yax = np.zeros(length)
    fixed_2daxis_slice(data, xax, length, axis=1)
    fixed_2daxis_slice(data, yax, length, axis=2)
    print(xax)
    print(yax)
    # Convert to plain NumPy arrays before handing them to matplotlib.
    xax = xax.asnumpy()
    yax = yax.asnumpy()
    plt.plot(xax, yax)
    plt.grid()
    plt.show()
def __init__(self, word_to_idx, input_dim=512, wordvec_dim=128,
             hidden_dim=128, cell_type='rnn', dtype=None):
    """
    Construct a new CaptioningRNN instance.

    Inputs:
    - word_to_idx: A dictionary giving the vocabulary. It contains V entries,
      and maps each string to a unique integer in the range [0, V).
    - input_dim: Dimension D of input image feature vectors.
    - wordvec_dim: Dimension W of word vectors.
    - hidden_dim: Dimension H for the hidden state of the RNN.
    - cell_type: What type of RNN to use; either 'rnn' or 'lstm'.
    - dtype: numpy datatype to use; use float32 for training and float64 for
      numeric gradient checking.
    """
    if cell_type not in {'rnn', 'lstm'}:
        raise ValueError('Invalid cell_type "%s"' % cell_type)

    self.cell_type = cell_type
    self.dtype = dtype
    self.word_to_idx = word_to_idx
    self.idx_to_word = {i: w for w, i in word_to_idx.iteritems()}
    self.params = {}

    vocab_size = len(word_to_idx)

    self._null = word_to_idx['<NULL>']
    self._start = word_to_idx.get('<START>', None)
    self._end = word_to_idx.get('<END>', None)

    # Initialize word vectors
    self.params['W_embed'] = np.random.randn(vocab_size, wordvec_dim)
    self.params['W_embed'] /= 100

    # Initialize CNN -> hidden state projection parameters
    self.params['W_proj'] = np.random.randn(input_dim, hidden_dim)
    self.params['W_proj'] /= np.sqrt(input_dim)
    self.params['b_proj'] = np.zeros(hidden_dim)

    # Initialize parameters for the RNN
    dim_mul = {'lstm': 4, 'rnn': 1}[cell_type]
    self.params['Wx'] = np.random.randn(wordvec_dim, dim_mul * hidden_dim)
    self.params['Wx'] /= np.sqrt(wordvec_dim)
    self.params['Wh'] = np.random.randn(hidden_dim, dim_mul * hidden_dim)
    self.params['Wh'] /= np.sqrt(hidden_dim)
    self.params['b'] = np.zeros(dim_mul * hidden_dim)

    # Initialize output to vocab weights
    self.params['W_vocab'] = np.random.randn(hidden_dim, vocab_size)
    self.params['W_vocab'] /= np.sqrt(hidden_dim)
    self.params['b_vocab'] = np.zeros(vocab_size)
def lr_hmc(y, X, epsilon, L, alpha, n_iter):
    def U(beta):
        return (mp.sum(mp.log(1 + mp.exp(mp.dot(X, beta))))
                - mp.dot(y.T, mp.dot(X, beta))
                + (0.5 / alpha) * mp.sum(beta ** 2))

    def dU(beta):
        return (mp.dot(X.T, mp.exp(mp.dot(X, beta)) /
                       (1 + mp.exp(mp.dot(X, beta))) - y)
                + beta / alpha)

    D = X.shape[1]
    q = mp.zeros((D, 1), dtype=mp.float32)
    out = mp.zeros((n_iter, D), dtype=mp.float32)
    for i in range(n_iter):
        q = hmc(U, dU, epsilon, L, q)
        out[i, :] = mp.ravel(q)
    return out
def test_lr_grad():
    def sigmoid(x):
        return 0.5 * (np.tanh(x / 2) + 1)

    def predict(weights, inputs):
        return sigmoid(np.dot(inputs, weights))

    def training_loss(inputs):
        preds = predict(weights, inputs)
        label_probabilities = preds * targets + (1 - preds) * (1 - targets)
        l = -np.sum(np.log(label_probabilities))
        return l

    def training_accuracy(weights, inputs):
        preds = predict(weights, inputs)
        error = np.count_nonzero(
            np.argmax(preds, axis=1) - np.argmax(targets, axis=1))
        return (256 - error) * 100 / 256.0

    wshape = (500, 250)
    weights = random.rand(*wshape) - 0.5

    xshape = (256, 500)
    tshape = (256, 250)
    inputs = random.rand(*xshape) - 0.5
    targets = np.zeros(tshape)
    truth = random.randint(0, 250, 256)
    targets[np.arange(256), truth] = 1

    gradient_checker.quick_grad_check(training_loss, inputs)
def getMask(self, M):
    mask = np.zeros((M.shape[0], M.shape[1]))
    (index_i, index_j) = np.nonzero(M)
    for i in range(index_i.shape[0]):
        mask[index_i[i]][index_j[i]] = 1
    # print mask
    return mask
def softmax_cross_entropy(prob, label):
    """
    Computes the cross entropy for softmax activation.

    Inputs:
    - prob: Probability, of shape (N, C) where prob[i, j] is the probability
      for the jth class for the ith input.
    - label: One of the following:
      - One-hot encoding of labels, of shape (N, C)
      - Label indices of shape (N,), where each label[i] is the label of the
        i-th example (0 <= label[i] < C)

    Returns a Value:
    - cross_entropy
    """
    N = prob.shape[0]
    C = prob.shape[1]
    if len(label.shape) == 1:
        # Convert to one-hot encoding.
        onehot_label = np.zeros([N, C])
        np.onehot_encode(label, onehot_label)
    else:
        onehot_label = label
    return -np.sum(np.log(prob) * onehot_label) / N
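# Usage sketch (note np.onehot_encode is MinPy-specific; with plain NumPy the
# one-hot matrix is built by fancy indexing instead). The loss is the mean
# negative log probability assigned to the true class:
prob = np.array([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1]])
onehot = np.array([[1., 0., 0.], [0., 1., 0.]])
expected = -(np.log(0.7) + np.log(0.8)) / 2
assert np.allclose(softmax_cross_entropy(prob, onehot), expected)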
def softmax_loss(x, label):
    """
    Computes the loss for softmax classification.

    Inputs:
    - x: Input data, of shape (N, C) where x[i, j] is the score for the jth
      class for the ith input.
    - label: One of the following:
      - One-hot encoding of labels, of shape (N, C)
      - Label indices of shape (N,), where each label[i] is the label of the
        i-th example (0 <= label[i] < C)

    Returns:
    - loss: Scalar giving the loss
    """
    N = x.shape[0]
    C = x.shape[1]
    if len(label.shape) == 1:
        # Convert to one-hot encoding.
        onehot_label = np.zeros([N, C])
        np.onehot_encode(label, onehot_label)
    else:
        onehot_label = label
    prob = np.softmax_output(x, onehot_label)
    return softmax_cross_entropy(prob, onehot_label)
def softmax_loss(x, y):
    """
    Computes the loss for softmax classification.

    Inputs:
    - x: Input data, of shape (N, C) where x[i, j] is the score for the jth
      class for the ith input.
    - y: One of the following:
      - One-hot encoding of labels, of shape (N, C)
      - Label indices of shape (N,), where each y[i] is the label of the
        i-th example (0 <= y[i] < C)

    Returns:
    - loss: Scalar giving the loss
    """
    N = x.shape[0]
    C = x.shape[1]
    if len(y.shape) == 1:
        # Convert to one-hot encoding.
        onehot_y = np.zeros([N, C])
        np.onehot_encode(y, onehot_y)
    else:
        onehot_y = y
    # Shift scores for numerical stability, then compute the cross entropy
    # directly from the unnormalized log probabilities.
    probs = x - np.max(x, axis=1, keepdims=True)
    loss = -np.sum(probs * onehot_y) / N
    loss += np.sum(np.log(np.sum(np.exp(probs), axis=1, keepdims=True))) / N
    return loss
def rnn_temporal(x, h0, Wx, Wh, b):
    """
    Run a vanilla RNN forward on an entire sequence of data. We assume an
    input sequence composed of T vectors, each of dimension D. The RNN uses a
    hidden size of H, and we work over a minibatch containing N sequences.
    After running the RNN forward, we return the hidden states for all
    timesteps.

    Inputs:
    - x: Input data for the entire timeseries, of shape (N, T, D).
    - h0: Initial hidden state, of shape (N, H)
    - Wx: Weight matrix for input-to-hidden connections, of shape (D, H)
    - Wh: Weight matrix for hidden-to-hidden connections, of shape (H, H)
    - b: Biases of shape (H,)

    Returns:
    - h: Hidden states for the entire timeseries, of shape (N, T, H).
    """
    N, T, _ = x.shape
    H = h0.shape[1]
    h = np.zeros([N, 0, H])
    for t in range(T):
        h_step = rnn_step(x[:, t, :], h0 if t == 0 else h[:, t - 1, :],
                          Wx, Wh, b).reshape(N, 1, H)
        h = np.append(h, h_step, axis=1)
    return h
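# For reference, a minimal vanilla rnn_step consistent with the shapes above
# (a sketch only; the actual rnn_step lives elsewhere in the codebase):
def rnn_step_sketch(x_t, prev_h, Wx, Wh, b):
    # next_h = tanh(x_t @ Wx + prev_h @ Wh + b)
    return np.tanh(np.dot(x_t, Wx) + np.dot(prev_h, Wh) + b)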
def forward(self, X, mode):
    h = np.zeros(self.hshape)  # init hidden state
    for t in range(self.num_unroll_steps):
        h = layers.rnn_step(X, h, self.params['Wx'], self.params['Wh'],
                            self.params['b'])
    y = layers.affine(h, self.params['Wa'], self.params['ba'])
    return y
def forward(self, X, mode):
    N, sequence_length, D = X.shape
    WX = self.params['WX']
    Wh = self.params['Wh']
    bias_h = self.params['bias_h']
    WY = self.params['WY']
    bias_Y = self.params['bias_Y']
    WY0 = self.params['WY0']
    bias_Y0 = self.params['bias_Y0']
    h = np.zeros((N, self._n_hidden))
    self.previous_h = [h]
    for t in xrange(sequence_length):
        X_t = X[:, t, :]
        h0 = self._update_h(X_t, h, WX, Wh, bias_h)
        # Attend over all previous hidden states, weighting each by its
        # similarity to the candidate state h0.
        projected_h = sum(
            batch_scalar_product(prev_h, h0) * prev_h
            for prev_h in self.previous_h)
        h = np.dot(X_t, WX) + np.dot(h, Wh) + projected_h
        h = self._nonlinear(h)
        self.previous_h.append(h)
    Y0 = layers.relu(layers.affine(h, WY0, bias_Y0))
    Y = layers.affine(Y0, WY, bias_Y)
    return Y
def test_context():
    set_context(gpu(1))  # set the global context as gpu(1)

    def sigmoid(x):
        return 0.5 * (np.tanh(x / 2) + 1)

    def predict(weights, inputs):
        return sigmoid(np.dot(inputs, weights))

    def training_loss(weights, inputs):
        preds = predict(weights, inputs)
        label_probabilities = preds * targets + (1 - preds) * (1 - targets)
        l = -np.sum(np.log(label_probabilities))
        return l

    def training_accuracy(weights, inputs):
        preds = predict(weights, inputs)
        error = np.count_nonzero(
            np.argmax(preds, axis=1) - np.argmax(targets, axis=1))
        return (256 - error) * 100 / 256.0

    with gpu(0):
        xshape = (256, 500)
        wshape = (500, 250)
        tshape = (256, 250)
        inputs = random.rand(*xshape) - 0.5
        targets = np.zeros(tshape)
        truth = random.randint(0, 250, 256)
        targets[np.arange(256), truth] = 1
        weights = random.rand(*wshape) - 0.5

        training_gradient_fun = grad(training_loss)

        for i in range(20):
            print('Training accuracy #{}: {}%'.format(
                i, training_accuracy(weights, inputs)))
            gr = training_gradient_fun(weights, inputs)
            weights -= gr * 0.01

        print("\nff and bp on {0}".format(weights.context))

    print("\nexecute on cpu")
    with cpu():
        x_cpu = random.rand(32, 64) - 0.5
        y_cpu = random.rand(64, 32) - 0.5
        z_cpu = np.dot(x_cpu, y_cpu)
        print('z_cpu.context = {0}'.format(z_cpu.context))

    print("\nexecute on gpu(0)")
    with gpu(0):
        x_gpu0 = random.rand(32, 64) - 0.5
        y_gpu0 = random.rand(64, 32) - 0.5
        z_gpu0 = np.dot(x_gpu0, y_gpu0)
        z_gpu0.asnumpy()
        print('z_gpu0.context = {0}'.format(z_gpu0.context))

    print("\n[use global context] execute on gpu(1)")
    x_gpu1 = random.rand(32, 64) - 0.5
    y_gpu1 = random.rand(64, 32) - 0.5
    z_gpu1 = np.dot(x_gpu1, y_gpu1)
    z_gpu1.asnumpy()
    print('z_gpu1.context = {0}'.format(z_gpu1.context))
def blob_normalization(X,
                       settings,
                       gamma,
                       beta,
                       mode='train',
                       epsilon=1e-5,
                       momentum=0.9,
                       running_mean=None,
                       running_variance=None):
    N, D = map(int, X.shape)
    size = N * D
    if running_mean is None:
        running_mean = np.zeros(1)
    if running_variance is None:
        running_variance = np.zeros(1)
    if mode == 'train':
        if 'shared_mean' in settings:
            mean = np.sum(X) / size
        else:
            mean = np.sum(X, axis=0) / N
            mean = np.reshape(mean, (1, D))
        centered_X = X - mean
        if 'shared_deviation' in settings:
            variance = np.sum(centered_X ** 2) / size
        else:
            variance = np.sum(centered_X ** 2, axis=0) / N
            variance = np.reshape(variance, (1, D))
        # Epsilon keeps the square root numerically stable, as in the test
        # branch below.
        deviation = (variance + epsilon) ** 0.5
        rescaled_X = centered_X / deviation
        out = gamma * rescaled_X + beta
        running_mean = momentum * running_mean + (1.0 - momentum) * mean
        running_variance = (momentum * running_variance +
                            (1.0 - momentum) * variance)
    elif mode == 'test':
        X_hat = (X - running_mean) / np.sqrt(running_variance + epsilon)
        out = gamma * X_hat + beta
    return out, running_mean, running_variance
def forward(self, X, mode):
    out = self.conv(X=X, **self.params)
    out = layers.affine(out, self.params['w1'], self.params['b1'])
    out = layers.relu(out)
    out = layers.affine(out, self.params['w2'], self.params['b2'])
    # This verifies whether symbols can be reused.
    trash = self.conv(X=np.zeros(X.shape), **self.params)
    return out
def forward(self, X, mode):
    N, sequence_length, D = X.shape
    h = np.zeros((N, self._n_hidden))
    c = np.zeros((N, self._n_hidden))
    WX = self.params['WX']
    Wh = self.params['Wh']
    bias = self.params['bias']
    WY = self.params['WY']
    bias_Y = self.params['bias_Y']
    for t in range(sequence_length):
        X_t = X[:, t, :]
        h, c = layers.lstm_step(X_t, h, c, WX, Wh, bias)
    Y = layers.affine(h, WY, bias_Y)
    return Y
def softmax_probability(p, channel):
    N, C = p.shape
    p -= np.max(p, axis=1).reshape((N, 1))
    code = np.zeros((N, C))
    np.onehot_encode(channel, code)
    p = np.exp(p)
    selected_p = p * code
    total_p = np.sum(p, axis=1).reshape((N, 1))
    return np.sum(selected_p / total_p, axis=1)
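# Usage sketch (np.onehot_encode is MinPy-specific): returns, for each row,
# the softmax probability of the class index given in `channel`.
scores = np.array([[1.0, 2.0, 3.0], [3.0, 1.0, 1.0]])
chosen = np.array([2, 0])
probs = softmax_probability(scores, chosen)
# probs[0] == exp(3) / (exp(1) + exp(2) + exp(3)); the max-shift inside the
# function does not change this ratio.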
def set_param(self):
    self.params = {}

    c_cnt, height, width = self.input_dim
    f_cnt = self.num_filters
    f_h, f_w = self.filter_size, self.filter_size
    self.params['conv1_weight'] = random.randn(f_cnt, c_cnt, f_h,
                                               f_w) * self.weight_scale
    self.params['conv1_bias'] = np.zeros(f_cnt)

    # TODO(Haoran): whole stuff about all dimension calculations
    # should be substituted by querying symbol.arg_list
    conv_stride = 1
    conv_pad = (f_h - 1) / 2
    Hc, Wc = 1 + (height + 2 * conv_pad - f_h) / conv_stride, 1 + (
        width + 2 * conv_pad - f_w) / conv_stride
    pool_height, pool_width = 2, 2
    pool_stride = 2
    Hp, Wp = (Hc - pool_height) / pool_stride + 1, (
        Wc - pool_width) / pool_stride + 1

    # Weight has to be transposed to fit mxnet's symbol.
    self.params['fc1_weight'] = np.transpose(
        random.randn(5408, self.hidden_dim) * self.weight_scale)
    self.params['fc1_bias'] = np.zeros((self.hidden_dim))

    # Weight has to be transposed to fit mxnet's symbol.
    self.params['fc2_weight'] = np.transpose(
        random.randn(self.hidden_dim, self.num_classes) * self.weight_scale)
    self.params['fc2_bias'] = np.zeros((self.num_classes))

    # TODO(Haoran): move following into parent structured model class
    self.param_keys = self.params.keys()

    # Build key's index in loss func's arglist.
    self.key_args_index = {}
    for i, key in enumerate(self.param_keys):
        # data, targets would be the first two elements in arglist
        self.key_args_index[key] = self.data_target_cnt + i
def build_dataset(filename, sequence_length, alphabet_size, max_lines=-1):
    """Loads a text file, and turns each line into an encoded sequence."""
    with open(filename) as f:
        content = f.readlines()
    if max_lines >= 0:  # max_lines=-1 means "use every line".
        content = content[:max_lines]
    content = [line for line in content if len(line) > 2]  # Remove blank lines
    seqs = np.zeros((sequence_length, len(content), alphabet_size))
    for ix, line in enumerate(content):
        padded_line = (line + " " * sequence_length)[:sequence_length]
        seqs[:, ix, :] = string_to_one_hot(padded_line, alphabet_size)
    return seqs
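# For context, a minimal string_to_one_hot consistent with the usage above
# (a sketch only; the real helper is defined elsewhere and may differ):
def string_to_one_hot_sketch(string, alphabet_size):
    # Map each character's code point (mod alphabet_size) to a one-hot row.
    codes = np.array([ord(c) % alphabet_size for c in string])
    one_hot = np.zeros((len(string), alphabet_size))
    one_hot[np.arange(len(string)), codes] = 1
    return one_hot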
def main():
    X = np.random.randn(10, 3, 256, 256)
    w = np.random.randn(8, 3, 3, 3)
    b = np.zeros((8,))
    params = {'pad': 1, 'stride': 2}
    start = time.time()
    conv_forward_naive(X, w, b, params)
    print(time.time() - start)
def forward(self, X, mode):
    batch_size = X.shape[0]
    seq_len = X.shape[1]
    X_emb = self.params['W_Emb'][X]
    hm1 = np.zeros((batch_size, self.HID_DIM))
    hs = []
    for t in xrange(seq_len):
        hm1 = self.one_step(X_emb[:, t, :], hm1)
        hs.append(hm1)
    hs = np.stack(hs, axis=1).reshape((batch_size * seq_len, self.HID_DIM))
    pred_out = layers.affine(hs, self.params['W_Softmax'],
                             self.params['b_Softmax'])
    return pred_out.reshape((batch_size, seq_len, self.WORD_DIM))
def l2_loss(x, label):
    """
    The mean squared error loss for regression.
    """
    N = x.shape[0]
    C = x.shape[1]
    if len(label.shape) == 1:
        # Convert to one-hot encoding.
        onehot_label = np.zeros([N, C])
        np.onehot_encode(label, onehot_label)
    else:
        onehot_label = label
    return np.sum((x - onehot_label) ** 2) / N
def preprocess(self, img):
    """Preprocess a 210x160x3 uint8 frame into a 6400 (80x80)
    (1 x input_size) float vector."""
    # Crop, down-sample, erase background and set foreground to 1.
    # Ref: https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5
    img = img[35:195]
    img = img[::2, ::2, 0]
    img[img == 144] = 0
    img[img == 109] = 0
    img[img != 0] = 1
    curr = np.expand_dims(img.astype(numpy.float).ravel(), axis=0)
    # Subtract the last preprocessed image.
    diff = (curr - self.prev
            if self.prev is not None else np.zeros((1, curr.shape[1])))
    self.prev = curr
    return diff
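# Behavior sketch (`agent` stands for a hypothetical instance of the owning
# class with `self.prev` initialized to None): the first call returns all
# zeros, and later calls return the difference between consecutive
# preprocessed frames, which encodes motion.
frame = numpy.zeros((210, 160, 3), dtype=numpy.uint8)
first = agent.preprocess(frame)   # zeros, shape (1, 6400)
second = agent.preprocess(frame)  # curr - prev; also zeros for this input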
def train_loss(*args):
    inputs = args[0]
    softmax_label = args[1]
    probs = self.symbol_func(**self.make_mxnet_weight_dict(
        inputs, softmax_label, args[self.data_target_cnt:]))
    if softmax_label is None:
        return probs
    samples_num = inputs.shape[0]
    targets = np.zeros((samples_num, self.num_classes))
    targets[np.arange(samples_num), softmax_label] = 1
    loss = -np.sum(targets * np.log(probs)) / samples_num
    for i in self.get_index_reg_weight():
        loss = loss + np.sum(0.5 * args[i] ** 2 * self.reg)
    return loss
def test_op_statistics():
    def sigmoid(x):
        return 0.5 * (np.tanh(x / 2) + 1)

    def predict(weights, inputs):
        return sigmoid(np.dot(inputs, weights))

    def training_loss(weights, inputs):
        preds = predict(weights, inputs)
        label_probabilities = preds * targets + (1 - preds) * (1 - targets)
        l = -np.sum(np.log(label_probabilities))
        return l

    def training_accuracy(weights, inputs):
        preds = predict(weights, inputs)
        error = np.count_nonzero(
            np.argmax(preds, axis=1) - np.argmax(targets, axis=1))
        return (256 - error) * 100 / 256.0

    np.record_op_stat()

    xshape = (256, 500)
    wshape = (500, 250)
    tshape = (256, 250)
    inputs = random.rand(*xshape) - 0.5
    targets = np.zeros(tshape)
    truth = random.randint(0, 250, 256)
    targets[np.arange(256), truth] = 1
    weights = random.rand(*wshape) - 0.5

    training_gradient_fun = grad(training_loss)

    for i in range(30):
        print('Trained accuracy #{}: {}%'.format(
            i, training_accuracy(weights, inputs)))
        gr = training_gradient_fun(weights, inputs)
        weights -= gr * 0.01

    # Print op statistics info.
    np.show_op_stat()
def test_lr_grad():
    inputs = rng.rand(32, 64) * 0.1
    targets = np.zeros((32, 10))
    truth = rng.randint(0, 10, 32)
    targets[np.arange(32), truth] = 1

    def sigmoid(x):
        return 0.5 * (np.tanh(x / 2) + 1)

    def training_loss(weights):
        preds = sigmoid(np.dot(inputs, weights))
        label_probabilities = preds * targets + (1 - preds) * (1 - targets)
        l = -np.sum(np.log(label_probabilities))
        return l

    weights = rng.rand(64, 10) * 0.01
    return gradient_checker.quick_grad_check(training_loss, weights, rs=rng)
def test_mxnet_logistic():
    def sigmoid(x):
        return np.multiply(0.5, np.add(np.tanh(x), 1))

    xshape = (256, 500)
    # Shape is reversed because of mxnet's FullyConnected weight layout.
    wshape = (250, 500)
    tshape = (256, 250)
    inputs = random.rand(*xshape) - 0.5
    targets = np.zeros(tshape)
    truth = random.randint(0, 250, 256)
    targets[np.arange(256), truth] = 1
    weights = np.random.rand(*wshape) - 0.5

    x = mx.sym.Variable(name='x')
    fc = mx.sym.FullyConnected(name='fc', data=x, num_hidden=250)
    act = mx.sym.Activation(data=fc, act_type='sigmoid')
    f = core.Function(act, {'x': xshape})

    def predict(weights, inputs):
        #return f(data=[('x', inputs)], weight=[('fc_weight', weights)], ctx=mx.cpu())
        return f(x=inputs, fc_weight=weights)

    def training_loss(weights, inputs):
        preds = predict(weights, inputs)
        label_probabilities = preds * targets + (1 - preds) * (1 - targets)
        return -np.sum(np.log(label_probabilities))

    training_gradient_fun = core.grad(training_loss)

    print('Initial loss: {}'.format(training_loss(weights, inputs)))
    for i in range(100):
        gr = training_gradient_fun(weights, inputs)
        #print('Training gradient: {}'.format(gr))
        weights -= gr * 0.1
        if i % 10 == 0:
            print('Trained loss: {}'.format(training_loss(weights, inputs)))

    # The training loss should be around 300 in a bug-free MinPy.
    assert training_loss(weights, inputs)[0] <= 600
def test_logistic():
    def sigmoid(x):
        return 0.5 * (np.tanh(x / 2) + 1)

    def predict(weights, inputs):
        return sigmoid(np.dot(inputs, weights))

    def training_loss(weights, inputs):
        preds = predict(weights, inputs)
        label_probabilities = preds * targets + (1 - preds) * (1 - targets)
        l = -np.sum(np.log(label_probabilities))
        return l

    def training_accuracy(weights, inputs):
        preds = predict(weights, inputs)
        error = np.count_nonzero(
            np.argmax(preds, axis=1) - np.argmax(targets, axis=1))
        return (256 - error) * 100 / 256.0

    xshape = (256, 500)
    wshape = (500, 250)
    tshape = (256, 250)
    inputs = random.rand(*xshape) - 0.5
    targets = np.zeros(tshape)
    truth = random.randint(0, 250, 256)
    targets[np.arange(256), truth] = 1
    weights = random.rand(*wshape) - 0.5

    training_gradient_fun = grad(training_loss)

    for i in range(200):
        print('Training accuracy #{}: {}%'.format(
            i, training_accuracy(weights, inputs)))
        gr = training_gradient_fun(weights, inputs)
        weights -= gr * 0.01

    # The accuracy should be 100 in a bug-free MinPy.
    assert training_accuracy(weights, inputs) >= 95
def test_slice():
    def sigmoid(x):
        return 0.5 * (np.tanh(x / 2) + 1)

    def predict(weights, inputs):
        # Test slicing: use every other column of the weights.
        sliced_weights = weights[:, ::2]
        y = sigmoid(np.dot(inputs, sliced_weights))
        return y

    def training_loss(weights, inputs):
        preds = predict(weights, inputs)
        label_probabilities = preds * targets + (1 - preds) * (1 - targets)
        l = -np.sum(np.log(label_probabilities))
        return l

    def training_accuracy(weights, inputs):
        preds = predict(weights, inputs)
        error = np.count_nonzero(
            np.argmax(preds, axis=1) - np.argmax(targets, axis=1))
        return (256 - error) * 100 / 256.0

    xshape = (256, 500)
    # wshape = (500, 250)
    wshape = (500, 500)
    tshape = (256, 250)
    inputs = random.rand(*xshape) - 0.5
    targets = np.zeros(tshape)
    truth = random.randint(0, 250, 256)
    targets[np.arange(256), truth] = 1
    weights = random.rand(*wshape) - 0.5

    training_gradient_fun = grad(training_loss)

    for i in range(20):
        print('Training accuracy #{}: {}%'.format(
            i, training_accuracy(weights, inputs)))
        gr = training_gradient_fun(weights, inputs)
        print('Gradient Size', gr.shape)
        print('Gradient example', gr[0, :10].asnumpy())
        weights -= gr * 0.01
def __init__(self,
             hidden_dims,
             input_dim=3 * 32 * 32,
             num_classes=10,
             dropout=0,
             use_batchnorm=False,
             reg=0.0,
             weight_scale=1e-2,
             seed=None,
             dtype=py_np.float64,
             conv_mode='lazy'):
    """
    Initialize a new FullyConnectedNet.

    Inputs:
    - hidden_dims: A list of integers giving the size of each hidden layer.
    - input_dim: An integer giving the size of the input.
    - num_classes: An integer giving the number of classes to classify.
    - dropout: Scalar between 0 and 1 giving dropout strength. If dropout=0
      then the network should not use dropout at all.
    - use_batchnorm: Whether or not the network should use batch
      normalization.
    - reg: Scalar giving L2 regularization strength.
    - weight_scale: Scalar giving the standard deviation for random
      initialization of the weights.
    - seed: If not None, then pass this random seed to the dropout layers.
      This will make the dropout layers deterministic so we can gradient
      check the model.
    """
    super(FullyConnectedNet, self).__init__(conv_mode)
    self.use_batchnorm = use_batchnorm
    self.use_dropout = dropout > 0
    self.reg = reg
    self.num_layers = 1 + len(hidden_dims)
    self.params = {}

    # Define parameter name given layer number.
    self.w_name = lambda l: 'W' + str(l)
    self.b_name = lambda l: 'b' + str(l)
    self.bn_ga_name = lambda l: 'bn_ga' + str(l)
    self.bn_bt_name = lambda l: 'bn_bt' + str(l)

    for l in range(self.num_layers):
        if l == 0:
            input_d = input_dim
        else:
            input_d = hidden_dims[l - 1]

        if l < self.num_layers - 1:
            out_d = hidden_dims[l]
        else:
            out_d = num_classes

        self.params[self.w_name(l)] = random.randn(input_d,
                                                   out_d) * weight_scale
        self.params[self.b_name(l)] = np.zeros((out_d))
        # Batch normalization parameters are only needed for the hidden
        # layers, not the output layer.
        if l < self.num_layers - 1 and self.use_batchnorm:
            self.params[self.bn_ga_name(l)] = np.ones((out_d))
            self.params[self.bn_bt_name(l)] = np.zeros((out_d))

    self.param_keys = self.params.keys()

    # When using dropout we need to pass a dropout_param dictionary to each
    # dropout layer so that the layer knows the dropout probability and the
    # mode (train / test). You can pass the same dropout_param to each
    # dropout layer.
    self.dropout_param = {}
    if self.use_dropout:
        self.dropout_param = {'mode': 'train', 'p': dropout}
        if seed is not None:
            self.dropout_param['seed'] = seed

    # With batch normalization we need to keep track of running means and
    # variances, so we need to pass a special bn_param object to each batch
    # normalization layer.
    self.bn_params = []
    if self.use_batchnorm:
        self.bn_params = [{'mode': 'train'}
                          for i in xrange(self.num_layers - 1)]

    # Build key's index in loss func's arglist.
    self.key_args_index = {}
    for i, key in enumerate(self.param_keys):
        # data, targets would be the first two elements in arglist
        self.key_args_index[key] = self.data_target_cnt + i

    # Init key to index in loss_function args.
    self.w_idx = self.wrap_param_idx(self.w_name)
    self.b_idx = self.wrap_param_idx(self.b_name)
    self.bn_ga_idx = self.wrap_param_idx(self.bn_ga_name)
    self.bn_bt_idx = self.wrap_param_idx(self.bn_bt_name)
def training_loss(weights, inputs):
    preds = predict(weights, inputs)
    label_probabilities = preds * targets + (1 - preds) * (1 - targets)
    l = -np.sum(np.log(label_probabilities))
    return l

def training_accuracy(weights, inputs):
    preds = predict(weights, inputs)
    error = np.count_nonzero(
        np.argmax(preds, axis=1) - np.argmax(targets, axis=1))
    return (256 - error) * 100 / 256.0

xshape = (256, 500)
# wshape = (500, 250)
wshape = (500, 500)
tshape = (256, 250)
inputs = random.rand(*xshape) - 0.5
targets = np.zeros(tshape)
truth = random.randint(0, 250, 256)
targets[np.arange(256), truth] = 1
weights = random.rand(*wshape) - 0.5

training_gradient_fun = grad(training_loss)

def NumpyVarToMinpy(var):
    return minpy.array.Value.wrap(var)

def MinpyVarToNumpy(var):
    return minpy.array.Value.wrap(var).get_data(ArrayType.NUMPY)

for i in range(20):
    print('Training accuracy #{}: {}%'.format(
        i, training_accuracy(weights, inputs)))
    gr = training_gradient_fun(weights, inputs)
def batchnorm_forward(x, gamma, beta, bn_param):
    """
    Forward pass for batch normalization.

    During training the sample mean and (uncorrected) sample variance are
    computed from minibatch statistics and used to normalize the incoming
    data. During training we also keep an exponentially decaying running mean
    of the mean and variance of each feature, and these averages are used to
    normalize data at test-time.

    At each timestep we update the running averages for mean and variance
    using an exponential decay based on the momentum parameter:

    running_mean = momentum * running_mean + (1 - momentum) * sample_mean
    running_var = momentum * running_var + (1 - momentum) * sample_var

    Note that the batch normalization paper suggests a different test-time
    behavior: they compute sample mean and variance for each feature using a
    large number of training images rather than using a running average. For
    this implementation we have chosen to use running averages instead since
    they do not require an additional estimation step; the torch7
    implementation of batch normalization also uses running averages.

    Input:
    - x: Data of shape (N, D)
    - gamma: Scale parameter of shape (D,)
    - beta: Shift parameter of shape (D,)
    - bn_param: Dictionary with the following keys:
      - mode: 'train' or 'test'; required
      - eps: Constant for numeric stability
      - momentum: Constant for running mean / variance.
      - running_mean: Array of shape (D,) giving running mean of features
      - running_var: Array of shape (D,) giving running variance of features

    Returns a tuple of:
    - out: of shape (N, D)
    - cache: A tuple of values needed in the backward pass
    """
    mode = bn_param['mode']
    eps = bn_param.get('eps', 1e-5)
    momentum = bn_param.get('momentum', 0.9)

    N, D = x.shape
    running_mean = bn_param.get('running_mean', np.zeros(D, dtype=x.dtype))
    running_var = bn_param.get('running_var', np.zeros(D, dtype=x.dtype))

    out, cache = None, None
    if mode == 'train':
        #######################################################################
        # TODO: Implement the training-time forward pass for batch           #
        # normalization. Use minibatch statistics to compute the mean and    #
        # variance, use these statistics to normalize the incoming data, and #
        # scale and shift the normalized data using gamma and beta.          #
        #                                                                     #
        # You should store the output in the variable out. Any intermediates #
        # that you need for the backward pass should be stored in the cache  #
        # variable.                                                           #
        #                                                                     #
        # You should also use your computed sample mean and variance         #
        # together with the momentum variable to update the running mean and #
        # running variance, storing your result in the running_mean and      #
        # running_var variables.                                              #
        #######################################################################
        pass
        #######################################################################
        #                          END OF YOUR CODE                           #
        #######################################################################
    elif mode == 'test':
        #######################################################################
        # TODO: Implement the test-time forward pass for batch normalization.#
        # Use the running mean and variance to normalize the incoming data,  #
        # then scale and shift the normalized data using gamma and beta.     #
        # Store the result in the out variable.                               #
        #######################################################################
        pass
        #######################################################################
        #                          END OF YOUR CODE                           #
        #######################################################################
    else:
        raise ValueError('Invalid forward batchnorm mode "%s"' % mode)

    # Store the updated running means back into bn_param
    bn_param['running_mean'] = running_mean
    bn_param['running_var'] = running_var

    return out, cache
def batchnorm_forward(x, gamma, beta, bn_param):
    """
    Forward pass for batch normalization.

    During training the sample mean and (uncorrected) sample variance are
    computed from minibatch statistics and used to normalize the incoming
    data. During training we also keep an exponentially decaying running mean
    of the mean and variance of each feature, and these averages are used to
    normalize data at test-time.

    At each timestep we update the running averages for mean and variance
    using an exponential decay based on the momentum parameter:

    running_mean = momentum * running_mean + (1 - momentum) * sample_mean
    running_var = momentum * running_var + (1 - momentum) * sample_var

    Note that the batch normalization paper suggests a different test-time
    behavior: they compute sample mean and variance for each feature using a
    large number of training images rather than using a running average. For
    this implementation we have chosen to use running averages instead since
    they do not require an additional estimation step; the torch7
    implementation of batch normalization also uses running averages.

    Input:
    - x: Data of shape (N, D)
    - gamma: Scale parameter of shape (D,)
    - beta: Shift parameter of shape (D,)
    - bn_param: Dictionary with the following keys:
      - mode: 'train' or 'test'; required
      - eps: Constant for numeric stability
      - momentum: Constant for running mean / variance.
      - running_mean: Array of shape (D,) giving running mean of features
      - running_var: Array of shape (D,) giving running variance of features

    Returns:
    - out: of shape (N, D)
    """
    mode = bn_param['mode']
    eps = bn_param.get('eps', 1e-5)
    momentum = bn_param.get('momentum', 0.9)

    N, D = x.shape
    running_mean = bn_param.get('running_mean', np.zeros(D))
    running_var = bn_param.get('running_var', np.zeros(D))

    out = None
    if mode == 'train':
        mean = np.sum(x, axis=0) / float(N)
        x_mean = x - mean
        sqr_x_mean = x_mean ** 2
        var = np.sum(sqr_x_mean, axis=0) / float(N)
        sqrt_var = np.sqrt(var + eps)
        inv_sqrt_var = 1.0 / sqrt_var
        x_hat = x_mean * inv_sqrt_var
        out = gamma * x_hat + beta
        running_mean = momentum * running_mean + (1.0 - momentum) * mean
        running_var = momentum * running_var + (1.0 - momentum) * var
    elif mode == 'test':
        x_hat = (x - running_mean) / np.sqrt(running_var + eps)
        out = gamma * x_hat + beta
    else:
        raise ValueError('Invalid forward batchnorm mode "%s"' % mode)

    # Store the updated running means back into bn_param
    bn_param['running_mean'] = running_mean
    bn_param['running_var'] = running_var

    return out
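# Sanity-check sketch: in train mode with gamma=1 and beta=0, the output has
# (approximately) zero mean and unit variance per feature.
x = np.random.randn(50, 4) * 3.0 + 7.0
bn_param = {'mode': 'train'}
out = batchnorm_forward(x, np.ones(4), np.zeros(4), bn_param)
assert np.allclose(np.mean(out, axis=0), 0, atol=1e-7)
assert np.allclose(np.var(out, axis=0), 1, atol=1e-2)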
def training_loss(weights, bias, inputs):
    preds = predict(weights, bias, inputs)
    label_probabilities = preds * targets + (1 - preds) * (1 - targets)
    l = -np.sum(np.log(label_probabilities))
    return l

def training_accuracy(weights, bias, inputs):
    preds = predict(weights, bias, inputs)
    error = np.count_nonzero(
        np.argmax(preds, axis=1) - np.argmax(targets, axis=1))
    return (256 - error) * 100 / 256.0

xshape = (256, 500)
wshape = (500, 250)
bshape = (250,)
tshape = (256, 250)
inputs = random.rand(*xshape) - 0.5
targets = np.zeros(tshape)
truth = random.randint(0, 250, 256)
targets[np.arange(256), truth] = 1
weights = random.rand(*wshape) - 0.5
#bias = random.rand(*bshape) - 0.5
bias = np.zeros(bshape)
print(bias.shape)

training_gradient_fun = grad(training_loss)

for i in range(20):
    print('Training accuracy #{}: {}%'.format(
        i, training_accuracy(weights, bias, inputs)))
    gr = training_gradient_fun(weights, bias, inputs)
    weights -= gr * 0.01