def generate_2D(N, corners):
    """Build a dense banded system ``(A, b)`` for an N x N grid.

    ``A`` is an ``(N**2, N**2)`` float matrix. Its main diagonal holds 8 when
    ``corners`` is truthy and 4 otherwise; the neighbour bands hold -1.
    Bands sit at offsets +/-1 and +/-N, plus +/-(N + 1) and +/-(N - 1) when
    ``corners`` is set. Every band is filled along its whole length; no
    entries are cleared at grid-row boundaries.

    Returns:
        Tuple ``(A, b)`` where ``b`` is a vector of length ``N**2`` drawn
        with ``np.random.rand``.
    """
    size = N**2
    if corners:
        print(
            "Generating %dx%d 2-D adjacency system with corners..."
            % (size, size)
        )
        center_weight = 8
    else:
        print(
            "Generating %dx%d 2-D adjacency system without corners..."
            % (size, size)
        )
        center_weight = 4
    A = np.zeros((size, size)) + center_weight * np.eye(size)

    # (diagonal offset, band length) pairs; the first two are shared by
    # both variants, the last two only exist with corners.
    bands = [(1, size - 1), (N, N * (N - 1))]
    if corners:
        bands += [(N + 1, N * (N - 1) - 1), (N - 1, N * (N - 1) + 1)]

    for offset, length in bands:
        band = np.full(length, -1, dtype=np.float64)
        A += np.diag(band, k=offset)
        A += np.diag(band, k=-offset)

    # Random right-hand side
    b = np.random.rand(size)
    return A, b
def cross_correlate(x, y, C, K, R, S, B, H, W):
    """Cross-correlate ``x`` with ``y`` to produce weight gradients.

    ``y`` is zero-padded so that it can be shifted over an ``R x S`` window;
    for every window offset ``(r, s)`` the element-wise product of each input
    channel with each shifted output channel is summed over the remaining
    three axes.

    Args:
        x: Array indexed as ``(C, B, H, W)``.
        y: Array assigned into a ``(K, B, H, W)`` region of the padded buffer.
        C, K: Sizes of the leading axis of ``x`` and ``y``.
        R, S: Window height and width.
        B, H, W: Sizes of the trailing three axes shared by ``x`` and ``y``.

    Returns:
        Array ``dw`` of shape ``(R, S, C, K)``.
    """
    dw = np.zeros(shape=(R, S, C, K))
    # cross-correlate images to compute weight gradients
    y_pad = np.zeros(shape=(K, B, H + R - 1, W + S - 1))
    # Integer offsets with explicit end indices: float indices (``R / 2``)
    # are rejected by NumPy, and a ``-(R // 2)`` end index collapses to an
    # empty slice when the window size is 1.
    top, left = R // 2, S // 2
    y_pad[:, :, top:top + H, left:left + W] = y
    for r in range(R):
        for s in range(S):
            y_shift = y_pad[:, :, r:r + H, s:s + W]
            # Contract the (B, H, W) axes for all (c, k) pairs at once.
            dw[r, s] = np.tensordot(
                x, y_shift, axes=([1, 2, 3], [1, 2, 3])
            )
    return dw
def solve(A, b, conv_iters, max_iters, verbose):
    """Solve ``A x = b`` with the conjugate gradient iteration.

    Args:
        A: Matrix supporting ``A.dot(vector)`` and ``A.shape``.
        b: Right-hand-side vector.
        conv_iters: The residual norm is tested against 1e-10 every
            ``conv_iters`` iterations and on the last iteration.
        max_iters: Iteration cap; clamped to ``b.shape[0]``. ``None`` means
            ``b.shape[0]``.
        verbose: When truthy, print the squared residual after each
            iteration that did not terminate the loop.

    Returns:
        The solution estimate ``x``, starting from the zero vector.
    """
    print("Solving system...")
    x = np.zeros(A.shape[1])
    r = b - A.dot(x)
    p = r
    rsold = r.dot(r)
    # Should always converge in fewer iterations than this
    max_iters = (
        min(max_iters, b.shape[0]) if max_iters is not None else b.shape[0]
    )
    # An exactly zero residual means x already solves the system. Iterating
    # anyway would evaluate alpha as 0 / 0 and fill x with NaN.
    if rsold == 0:
        print("Converged in %d iterations" % (0))
        return x
    converged = -1
    for i in range(max_iters):
        Ap = A.dot(p)
        alpha = rsold / (p.dot(Ap))
        x = x + alpha * p
        r = r - alpha * Ap
        rsnew = r.dot(r)
        # We only do the convergence test every conv_iters or on the last
        # iteration. An exactly zero residual always stops, for the same
        # 0 / 0 reason as above.
        at_checkpoint = i % conv_iters == 0 or i == (max_iters - 1)
        if rsnew == 0 or (at_checkpoint and np.sqrt(rsnew) < 1e-10):
            converged = i
            break
        if verbose:
            print("Residual: " + str(rsnew))
        beta = rsnew / rsold
        p = r + beta * p
        rsold = rsnew
    if converged < 0:
        print("Convergence FAILURE!")
    else:
        print("Converged in %d iterations" % (converged))
    return x
def logistic_regression(
    T, features, target, steps, learning_rate, sample, add_intercept=False
):
    """Fit logistic-regression weights by repeated gradient steps.

    Args:
        T: dtype used for the intercept column and the weight vector.
        features: 2-D feature matrix (one row per sample).
        target: Target vector compared against the sigmoid predictions.
        steps: Number of update steps.
        learning_rate: Multiplier applied to the gradient on each step.
        sample: The log likelihood is printed on steps divisible by this.
        add_intercept: When true, prepend a column of ones to ``features``.

    Returns:
        The weight vector after ``steps`` updates.
    """
    if add_intercept:
        ones_column = np.ones((features.shape[0], 1), dtype=T)
        features = np.hstack((ones_column, features))

    weights = np.zeros(features.shape[1], dtype=T)
    for step in range(steps):
        predictions = sigmoid(np.dot(features, weights))
        gradient = np.dot(target - predictions, features)
        weights += learning_rate * gradient

        if step % sample != 0:
            continue
        print(
            "Log Likelihood of step "
            + str(step)
            + ": "
            + str(log_likelihood(features, target, weights))
        )
    return weights
def forward(X, WLSTM, c0=None, h0=None):
    """
    Run the LSTM forward pass over a whole sequence.

    X should be of shape (n,b,input_size), where n = length of sequence,
    b = batch size. WLSTM has one row per entry of [bias, x, h] and four
    column groups of width d (input, forget, output, gate). c0 / h0 are the
    initial cell and hidden state and default to zeros of shape (b, d).

    Returns (Hout, last cell state, last hidden state, cache), where cache
    holds every intermediate array keyed by name.
    """
    n, b, input_size = X.shape
    d = WLSTM.shape[1] // 4  # hidden size
    if c0 is None:
        c0 = np.zeros((b, d))
    if h0 is None:
        h0 = np.zeros((b, d))

    # Column layout of the activations: [input | forget | output | gate]
    in_cols = slice(None, d)
    forget_cols = slice(d, 2 * d)
    out_cols = slice(2 * d, 3 * d)
    gate_cols = slice(3 * d, None)
    sigmoid_cols = slice(None, 3 * d)

    xphpb = WLSTM.shape[0]  # x plus h plus bias
    Hin = np.zeros((n, b, xphpb))  # input [1, xt, ht-1] to each tick
    Hout = np.zeros((n, b, d))  # hidden representation (gated cell content)
    IFOG = np.zeros((n, b, d * 4))  # pre-activation values
    IFOGf = np.zeros((n, b, d * 4))  # after nonlinearity
    C = np.zeros((n, b, d))  # cell content
    Ct = np.zeros((n, b, d))  # tanh of cell content

    h_prev, c_prev = h0, c0
    for t in range(n):
        # assemble [1, x, h] as the input row for this tick
        Hin[t, :, 0] = 1
        Hin[t, :, 1:input_size + 1] = X[t]
        Hin[t, :, input_size + 1:] = h_prev
        # all gate pre-activations in a single matrix product
        IFOG[t] = Hin[t].dot(WLSTM)
        # sigmoid for the three gates, tanh for the candidate
        IFOGf[t, :, sigmoid_cols] = 1.0 / (
            1.0 + np.exp(-IFOG[t, :, sigmoid_cols])
        )
        IFOGf[t, :, gate_cols] = np.tanh(IFOG[t, :, gate_cols])
        # cell update and gated output
        C[t] = (
            IFOGf[t, :, in_cols] * IFOGf[t, :, gate_cols]
            + IFOGf[t, :, forget_cols] * c_prev
        )
        Ct[t] = np.tanh(C[t])
        Hout[t] = IFOGf[t, :, out_cols] * Ct[t]
        h_prev, c_prev = Hout[t], C[t]

    cache = {
        "WLSTM": WLSTM,
        "Hout": Hout,
        "IFOGf": IFOGf,
        "IFOG": IFOG,
        "C": C,
        "Ct": Ct,
        "Hin": Hin,
        "c0": c0,
        "h0": h0,
    }

    # the final cell and hidden state are returned too so a caller can
    # continue the LSTM from where this call stopped
    return Hout, C[t], Hout[t], cache
def initialize(N):
    """Create the ``(N + 2, N + 2)`` stencil grid with fixed boundary values.

    The interior starts at zero. The left column, right column and bottom
    row are set to -273.15, and the whole top row (corners included) is then
    set to 40.0.
    """
    print("Initializing stencil grid...")
    cold, hot = -273.15, 40.0
    grid = np.zeros((N + 2, N + 2))
    for edge in (np.s_[:, 0], np.s_[:, -1], np.s_[-1, :]):
        grid[edge] = cold
    # Written last so the top corners end up hot rather than cold.
    grid[0, :] = hot
    return grid
def solve(A, b, iters, verbose):
    """Run ``iters`` Jacobi sweeps on ``A x = b`` starting from zero.

    Each sweep computes ``x = (b - R x) / d`` where ``d`` is the diagonal of
    ``A`` and ``R`` is ``A`` with that diagonal subtracted. ``verbose`` is
    accepted but not used by this function.

    Returns:
        The iterate after ``iters`` sweeps (the zero vector for ``iters=0``).
    """
    print("Solving system...")
    diagonal = np.diag(A)
    remainder = A - np.diag(diagonal)
    estimate = np.zeros(A.shape[1])
    for _ in range(iters):
        estimate = (b - np.dot(remainder, estimate)) / diagonal
    return estimate
def __init__(self, H_size, X_size, z_size, weight_sd):
    """Create the named weight and bias parameters.

    The four ``(H_size, z_size)`` weight matrices are drawn from
    ``np.random.randn`` scaled by ``weight_sd``; ``W_f``, ``W_i`` and ``W_o``
    are additionally shifted by 0.5, ``W_C`` is not. ``W_v`` is
    ``(X_size, H_size)``. All biases are zero column vectors.
    """

    def scaled_normal(rows, cols):
        # Random matrix with standard deviation weight_sd.
        return np.random.randn(rows, cols) * weight_sd

    def zero_column(rows):
        return np.zeros((rows, 1))

    shift = 0.5

    self.W_f = Param("W_f", scaled_normal(H_size, z_size) + shift)
    self.b_f = Param("b_f", zero_column(H_size))

    self.W_i = Param("W_i", scaled_normal(H_size, z_size) + shift)
    self.b_i = Param("b_i", zero_column(H_size))

    self.W_C = Param("W_C", scaled_normal(H_size, z_size))
    self.b_C = Param("b_C", zero_column(H_size))

    self.W_o = Param("W_o", scaled_normal(H_size, z_size) + shift)
    self.b_o = Param("b_o", zero_column(H_size))

    # For final layer to predict the next character
    self.W_v = Param("W_v", scaled_normal(X_size, H_size))
    self.b_v = Param("b_v", zero_column(X_size))
def run_kmeans(C, D, T, I, N, S, benchmarking):  # noqa: E741
    """Run k-means and return the elapsed wall-clock time in milliseconds.

    Args:
        C: Number of centroids.
        D: Number of dimensions.
        T: Passed through to ``initialize`` as its last argument.
        I: Maximum number of iterations.
        N: Number of data points.
        S: Convergence is tested on iterations that are multiples of ``S``
            (never on iteration 0).
        benchmarking: When truthy the early-exit threshold is ignored so a
            fixed number of iterations is measured.
    """
    print("Running kmeans...")
    for caption, value in (
        ("Number of data points: ", N),
        ("Number of dimensions: ", D),
        ("Number of centroids: ", C),
        ("Max iterations: ", I),
    ):
        print(caption + str(value))

    start = datetime.datetime.now()
    data, centroids = initialize(N, D, C, T)
    data_dots = np.square(np.linalg.norm(data, ord=2, axis=1))
    zero_point = np.zeros((1, data.shape[1]), dtype=data.dtype)

    threshold = 1 - 0.000001
    labels = None
    prior_distance_sum = None
    iteration = 0
    # Iterate until the cap is hit or the distance sum stops shrinking.
    while iteration < I:
        pairwise_distances = calculate_distances(data, centroids, data_dots)
        new_labels = relabel(pairwise_distances)
        distance_sum = find_centroids(
            centroids,
            data,
            new_labels,
            pairwise_distances,
            zero_point,
            C,
            D,
        )

        if iteration > 0 and iteration % S == 0:
            total_changes = np.sum(np.not_equal(labels, new_labels))
            delta = distance_sum / prior_distance_sum
            print(
                "Iteration "
                + str(iteration)
                + " produced "
                + str(total_changes)
                + " changes, and total distance is "
                + str(distance_sum)
            )
            # The threshold result is ignored while benchmarking.
            if delta > threshold and not benchmarking:
                print("Threshold triggered, terminating iterations early")
                break

        prior_distance_sum = distance_sum
        labels = new_labels
        iteration += 1

    # This final distance sum also synchronizes the results
    print(
        "Final distance sum at iteration "
        + str(iteration)
        + ": "
        + str(prior_distance_sum)
    )
    elapsed = datetime.datetime.now() - start
    total = elapsed.total_seconds() * 1000.0
    print("Elapsed Time: " + str(total) + " ms")
    return total
def run_lstm(batch_size, hidden_size, sentence_length, word_size, timing):
    """Time one LSTM forward pass over random data.

    Args:
        batch_size: Number of sequences processed together.
        hidden_size: Width of the hidden and cell state.
        sentence_length: Number of time steps.
        word_size: Width of each input vector.
        timing: When truthy, print the elapsed time.

    Returns:
        Elapsed wall-clock time in milliseconds.
    """
    start = datetime.datetime.now()

    # The input is written into the first word_size columns of Hin, so its
    # last dimension must be word_size (not hidden_size), otherwise the
    # assignment fails whenever the two sizes differ.
    X = np.random.randn(sentence_length, batch_size, word_size)
    h0 = np.random.randn(1, hidden_size)
    WLSTM = np.random.randn(
        word_size + hidden_size, 4 * hidden_size
    ) / np.sqrt(word_size + hidden_size)

    xphpb = WLSTM.shape[0]
    d = hidden_size
    n = sentence_length
    b = batch_size

    Hin = np.zeros((n, b, xphpb))
    Hout = np.zeros((n, b, d))
    IFOG = np.zeros((n, b, d * 4))
    IFOGf = np.zeros((n, b, d * 4))
    C = np.zeros((n, b, d))
    Ct = np.zeros((n, b, d))

    for t in range(n):
        # The first step uses h0 replicated across the batch.
        prev = np.tile(h0, (b, 1)) if t == 0 else Hout[t - 1]

        Hin[t, :, :word_size] = X[t]
        Hin[t, :, word_size:] = prev
        # compute all gate activations. dots:
        IFOG[t] = Hin[t].dot(WLSTM)
        # non-linearities: sigmoids for the gates, tanh for the candidate
        IFOGf[t, :, : 3 * d] = 1.0 / (1.0 + np.exp(-IFOG[t, :, : 3 * d]))
        IFOGf[t, :, 3 * d :] = np.tanh(IFOG[t, :, 3 * d :])
        # compute the cell activation (no previous cell state at t == 0)
        C[t] = IFOGf[t, :, :d] * IFOGf[t, :, 3 * d :]
        if t > 0:
            C[t] += IFOGf[t, :, d : 2 * d] * C[t - 1]

        Ct[t] = np.tanh(C[t])
        Hout[t] = IFOGf[t, :, 2 * d : 3 * d] * Ct[t]

    # Do a little sum of the outputs to synchronize and check for NaNs
    total = np.sum(Hout)
    assert not math.isnan(total)

    stop = datetime.datetime.now()
    delta = stop - start
    total = delta.total_seconds() * 1000.0
    if timing:
        print("Elapsed Time: " + str(total) + " ms")
    return total
def test():
    """Check that nearest-centroid labels from lg match NumPy.

    Four rows of a seeded random data set are chosen as centroids, every
    point is assigned to the centroid with the smallest Euclidean norm of
    the difference, and the indices of the points in cluster 0 are compared.
    """
    num_centroids = 4
    np.random.seed(50)
    datanp = np.random.randn(2000000, 3)
    data = lg.array(datanp)

    pointsnp = np.random.choice(
        lg.arange(len(data)), size=num_centroids, replace=False
    )
    points = lg.array(pointsnp)

    centroids = data[points]
    centroidsnp = datanp[pointsnp]

    dists = lg.zeros((num_centroids, len(data)))
    distsnp = np.zeros((num_centroids, len(datanp)))
    for k in range(num_centroids):
        offset = data - centroids[k]
        offsetnp = datanp - centroidsnp[k]
        dists[k] = lg.linalg.norm(offset, axis=1)
        distsnp[k] = np.linalg.norm(offsetnp, axis=1)

    clusters = lg.argmin(dists, axis=0)
    clustersnp = np.argmin(distsnp, axis=0)
    assert lg.array_equal(lg.where(clusters == 0), np.where(clustersnp == 0))
def test():
    """Exercise integer-array and boolean-mask indexing on lg arrays."""
    same = lg.array_equal

    # Paired integer index lists select individual elements.
    pairs = lg.array([[1, 2], [3, 4], [5, 6]])
    assert same(pairs[[0, 1, 2], [0, 1, 0]], [1, 4, 5])

    # A column of row indices combined with column indices selects a block.
    table = lg.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]])
    rows = lg.array([0, 3])
    columns = lg.array([0, 2])
    assert same(table[rows[:, np.newaxis], columns], [[0, 2], [9, 11]])

    # Boolean mask on a complex array.
    zg = lg.array([[-1.2 + 0.5j, 1.2 - 2j], [-2.2 + 3.5j, 4.2 - 6.2j]])
    mask = lg.array([[True, False], [False, True]])
    assert same(zg[mask], [-1.2 + 0.5j, 4.2 - 6.2j])

    # Mask produced by a comparison, checked against NumPy.
    anp = np.array([[[2, 1], [3, 2]], [[2, 4], [4, 1]]])
    a = lg.array(anp)
    small_np = anp < 3
    small_lg = a < 3
    assert same(anp[small_np], a[small_lg])

    # Index tuple produced by nonzero.
    flags = lg.array(
        [[[True, True], [False, True]], [[True, False], [False, True]]]
    )
    where_set = lg.nonzero(flags)
    assert same(a[where_set], lg.array([2, 1, 2, 2, 1]))

    # One and two integer index arrays on a 3-D array.
    np.random.seed(42)
    anp = np.random.randn(10, 10, 4)
    a = lg.array(anp)
    bnp = np.array([3, 4, 6])
    cnp = np.array([1, 4, 5])
    b = lg.array(bnp)
    c = lg.array(cnp)
    assert same(a[b], anp[bnp])
    assert same(a[b, c], anp[b, c])

    # Indexing with the result of where() on an all-zero array.
    onesnp = np.zeros(10, int)
    ones = lg.zeros(10, int)
    dnp = np.random.randn(20, 4)
    d = lg.array(dnp)
    assert same(dnp[np.where(onesnp)], d[lg.where(ones)])
def test():
    """Check slice assignment on lg arrays, including overlapping writes."""
    # 1-D: three writes, the last overlapping both earlier ones.
    x = lg.array(list(range(1, 11)))
    for window, values in (
        (slice(0, 5), [11, 12, 13, 14, 15]),
        (slice(5, 10), [16, 17, 18, 19, 20]),
        (slice(4, 8), [21, 22, 23, 24]),
    ):
        x[window] = lg.array(values)
    assert np.array_equal(x[5:10], [22, 23, 24, 19, 20])
    assert np.array_equal(x, [11, 12, 13, 14, 21, 22, 23, 24, 19, 20])

    # 2-D: the same column writes applied to an lg array and a NumPy array.
    anp = np.zeros((5, 6))
    bnp = np.random.random((5, 4))
    cnp = np.random.random((5, 2))
    a = lg.zeros((5, 6))
    b = lg.array(bnp)
    c = lg.array(cnp)
    for target, left, right in ((a, b, c), (anp, bnp, cnp)):
        target[:, :4] = left
        target[:, 0] = 1
        target[:, 3:5] = right
    assert np.array_equal(a, anp)

    # 3-D: scalar fill, sub-block write and whole-plane copy.
    dnp = np.random.random((2, 3, 4))
    enp = np.random.random((2, 3, 4))
    fnp = np.random.random((3, 2))
    d = lg.array(dnp)
    e = lg.array(enp)
    f = lg.array(fnp)
    for target, source, patch in ((d, e, f), (dnp, enp, fnp)):
        target[1, :, 0] = 1
        target[1, :, 1:3] = patch
        target[0] = source[1]
    assert np.array_equal(d, dnp)
def test():
    """Run two 5-point stencil sweeps with lg and NumPy and compare deltas."""
    height = 10
    width = 10

    def sweep_delta(xp):
        # Build the bordered float32 grid, relax the interior twice and
        # return the summed absolute change of the last sweep.
        grid = xp.zeros((height + 2, width + 2), np.float32)
        grid[:, 0] = -273.15
        grid[:, -1] = -273.15
        grid[-1, :] = -273.15
        grid[0, :] = 40.0

        center = grid[1:-1, 1:-1]
        north = grid[0:-2, 1:-1]
        east = grid[1:-1, 2:]
        west = grid[1:-1, 0:-2]
        south = grid[2:, 1:-1]

        for _ in range(2):
            average = center + north + east + west + south
            work = 0.2 * average
            change = xp.sum(xp.absolute(work - center))
            center[:] = work
        return change

    delta = sweep_delta(lg)
    npdelta = sweep_delta(np)
    assert np.allclose(delta, npdelta)
def test():
    """Smoke-test an LSTM forward pass on random data.

    Nothing is asserted; the test passes when every array operation
    completes without raising.
    """
    word_size = 10
    hidden_size = 10
    sentence_length = 2
    batch_size = 3

    X = np.random.randn(sentence_length, batch_size, hidden_size)
    h0 = np.random.randn(1, hidden_size)
    fan_in = word_size + hidden_size
    WLSTM = np.random.randn(fan_in, 4 * hidden_size) / np.sqrt(fan_in)

    xphpb = WLSTM.shape[0]
    d, n, b = hidden_size, sentence_length, batch_size

    # Column layout of the activations: [input | forget | output | gate]
    in_cols = slice(None, d)
    forget_cols = slice(d, 2 * d)
    out_cols = slice(2 * d, 3 * d)
    gate_cols = slice(3 * d, None)
    sigmoid_cols = slice(None, 3 * d)

    Hin = np.zeros((n, b, xphpb))
    Hout = np.zeros((n, b, d))
    IFOG = np.zeros((n, b, d * 4))
    IFOGf = np.zeros((n, b, d * 4))
    C = np.zeros((n, b, d))
    Ct = np.zeros((n, b, d))

    for t in range(n):
        prev = np.tile(h0, (b, 1)) if t == 0 else Hout[t - 1]

        Hin[t, :, :word_size] = X[t]
        Hin[t, :, word_size:] = prev
        # all gate pre-activations in one matrix product
        IFOG[t] = Hin[t].dot(WLSTM)
        # sigmoids for the gates, tanh for the candidate
        IFOGf[t, :, sigmoid_cols] = 1.0 / (
            1.0 + np.exp(-IFOG[t, :, sigmoid_cols])
        )
        IFOGf[t, :, gate_cols] = np.tanh(IFOG[t, :, gate_cols])
        # cell activation; the forget path only exists after the first step
        C[t] = IFOGf[t, :, in_cols] * IFOGf[t, :, gate_cols]
        if t > 0:
            C[t] += IFOGf[t, :, forget_cols] * C[t - 1]

        Ct[t] = np.tanh(C[t])
        Hout[t] = IFOGf[t, :, out_cols] * Ct[t]
def linear_regression(T, features, target, steps, learning_rate, sample,
                      add_intercept=False):
    """Fit linear-regression weights by repeated gradient steps.

    Args:
        T: dtype used for the intercept column and the weight vector.
        features: 2-D feature matrix (one row per sample).
        target: Target vector subtracted from the predictions.
        steps: Number of update steps.
        learning_rate: Multiplier applied to the gradient on each step.
        sample: The summed squared error is printed on steps divisible by
            this value.
        add_intercept: When true, prepend a column of ones to ``features``.

    Returns:
        The weight vector after ``steps`` updates.
    """
    if add_intercept:
        ones_column = np.ones((features.shape[0], 1), dtype=T)
        features = np.hstack((ones_column, features))

    weights = np.zeros(features.shape[1], dtype=T)
    num_samples = len(features)

    for step in range(steps):
        residual = np.dot(features, weights) - target
        gradient = -(1.0 / num_samples) * residual.dot(features)
        weights += learning_rate * gradient

        if step % sample != 0:
            continue
        print(
            "Error of step "
            + str(step)
            + ": "
            + str(np.sum(np.power(residual, 2)))
        )

    return weights
def testtion():
    """Compare an LSTM backward pass run with lg against the same with NumPy.

    Both runs seed their own random module with 42, draw the same sequence
    of random arrays, backpropagate through every time step and accumulate
    the gradient for the initial hidden state. The two gradients must agree.
    """
    word_size = 10
    hidden_size = 10
    sentence_length = 5
    batch_size = 3

    def initial_hidden_gradient(xp):
        # Full backward pass using array module xp; returns dh0.
        xp.random.seed(42)

        fan_in = word_size + hidden_size
        WLSTM = xp.random.randn(fan_in, 4 * hidden_size) / xp.sqrt(fan_in)

        xphpb = WLSTM.shape[0]
        d, n, b = hidden_size, sentence_length, batch_size

        # Column layout of the activations: [input | forget | output | gate]
        in_cols = slice(None, d)
        forget_cols = slice(d, 2 * d)
        out_cols = slice(2 * d, 3 * d)
        gate_cols = slice(3 * d, None)
        sigmoid_cols = slice(None, 3 * d)
        hidden_cols = slice(word_size, None)

        dHout = xp.random.randn(n, b, d)
        IFOGf = xp.random.randn(n, b, d * 4)
        C = xp.random.randn(n, b, d)
        Ct = xp.random.randn(n, b, d)
        Hin = xp.random.randn(n, b, xphpb)

        dIFOG = xp.zeros((n, b, d * 4))
        dIFOGf = xp.zeros(IFOGf.shape)
        dHin = xp.zeros(Hin.shape)
        dC = xp.zeros(C.shape)
        dh0 = xp.zeros((1, d))

        for t in range(n - 1, -1, -1):
            tanhCt = Ct[t]
            dIFOGf[t, :, out_cols] = tanhCt * dHout[t]
            # backprop tanh non-linearity first then continue backprop
            dC[t] += (1 - tanhCt**2) * (IFOGf[t, :, out_cols] * dHout[t])

            if t > 0:
                dIFOGf[t, :, forget_cols] = C[t - 1] * dC[t]
                dC[t - 1] += IFOGf[t, :, forget_cols] * dC[t]

            dIFOGf[t, :, in_cols] = IFOGf[t, :, gate_cols] * dC[t]
            dIFOGf[t, :, gate_cols] = IFOGf[t, :, in_cols] * dC[t]

            # backprop activation functions
            dIFOG[t, :, gate_cols] = (
                1 - IFOGf[t, :, gate_cols] ** 2
            ) * dIFOGf[t, :, gate_cols]
            y = IFOGf[t, :, sigmoid_cols]
            dIFOG[t, :, sigmoid_cols] = (y * (1.0 - y)) * dIFOGf[
                t, :, sigmoid_cols
            ]

            # backprop matrix multiply
            dHin[t] = dIFOG[t].dot(WLSTM.transpose())

            # backprop the identity transforms into Hin
            if t > 0:
                dHout[t - 1, :] += dHin[t, :, hidden_cols]
            else:
                dh0[0] += xp.sum(dHin[t, :, hidden_cols], 0)
        return dh0

    dh0 = initial_hidden_gradient(lg)
    dhnp0 = initial_hidden_gradient(np)
    assert np.allclose(dh0[0], dhnp0[0])
def backward(dHout_in, cache, dcn=None, dhn=None):
    """Backpropagate through the LSTM using the arrays stored in ``cache``.

    Args:
        dHout_in: Gradient with respect to every hidden output; it is copied
            before being modified.
        cache: Dict with the keys "WLSTM", "Hout", "IFOGf", "IFOG", "C",
            "Ct", "Hin" and "c0".
        dcn: Optional gradient added to the last step's cell state.
        dhn: Optional gradient added to the last step's hidden output.

    Returns:
        Tuple ``(dX, dWLSTM, dc0, dh0)``: gradients for the inputs, the
        weights, the initial cell state and the initial hidden state.
    """
    WLSTM, Hout, Hin = cache["WLSTM"], cache["Hout"], cache["Hin"]
    IFOGf, IFOG = cache["IFOGf"], cache["IFOG"]
    C, Ct, c0 = cache["C"], cache["Ct"], cache["c0"]

    n, b, d = Hout.shape
    input_size = WLSTM.shape[0] - d - 1  # -1 due to bias

    # Column layout of the activations: [input | forget | output | gate]
    in_cols = slice(None, d)
    forget_cols = slice(d, 2 * d)
    out_cols = slice(2 * d, 3 * d)
    gate_cols = slice(3 * d, None)
    sigmoid_cols = slice(None, 3 * d)
    # Column layout of Hin: [bias | x | h]
    x_cols = slice(1, input_size + 1)
    h_cols = slice(input_size + 1, None)

    dIFOG = np.zeros(IFOG.shape)
    dIFOGf = np.zeros(IFOGf.shape)
    dWLSTM = np.zeros(WLSTM.shape)
    dHin = np.zeros(Hin.shape)
    dC = np.zeros(C.shape)
    dX = np.zeros((n, b, input_size))
    dh0 = np.zeros((b, d))
    dc0 = np.zeros((b, d))

    # work on a copy so the caller's array is left untouched
    dHout = dHout_in.copy()
    if dcn is not None:
        dC[n - 1] += dcn.copy()  # carry over gradients from later
    if dhn is not None:
        dHout[n - 1] += dhn.copy()

    for t in range(n - 1, -1, -1):
        tanhCt = Ct[t]
        dIFOGf[t, :, out_cols] = tanhCt * dHout[t]
        # backprop tanh non-linearity first then continue backprop
        dC[t] += (1 - tanhCt**2) * (IFOGf[t, :, out_cols] * dHout[t])

        if t == 0:
            # the first step's previous cell state is c0
            dIFOGf[t, :, forget_cols] = c0 * dC[t]
            dc0 = IFOGf[t, :, forget_cols] * dC[t]
        else:
            dIFOGf[t, :, forget_cols] = C[t - 1] * dC[t]
            dC[t - 1] += IFOGf[t, :, forget_cols] * dC[t]

        dIFOGf[t, :, in_cols] = IFOGf[t, :, gate_cols] * dC[t]
        dIFOGf[t, :, gate_cols] = IFOGf[t, :, in_cols] * dC[t]

        # backprop activation functions
        dIFOG[t, :, gate_cols] = (
            1 - IFOGf[t, :, gate_cols] ** 2
        ) * dIFOGf[t, :, gate_cols]
        y = IFOGf[t, :, sigmoid_cols]
        dIFOG[t, :, sigmoid_cols] = (y * (1.0 - y)) * dIFOGf[
            t, :, sigmoid_cols
        ]

        # backprop matrix multiply
        dWLSTM += np.dot(Hin[t].transpose(), dIFOG[t])
        dHin[t] = dIFOG[t].dot(WLSTM.transpose())

        # backprop the identity transforms into Hin
        dX[t] = dHin[t, :, x_cols]
        if t == 0:
            dh0 += dHin[t, :, h_cols]
        else:
            dHout[t - 1, :] += dHin[t, :, h_cols]

    return dX, dWLSTM, dc0, dh0
def checkSequentialMatchesBatch():
    """Check that stepping the LSTM one tick at a time matches a batched run.

    The forward outputs are asserted to be equal. For the backward pass the
    four gradient comparisons are printed rather than asserted.
    """
    n, b, d = (5, 3, 4)  # sequence length, batch size, hidden size
    input_size = 10
    WLSTM = LSTM.init(input_size, d)  # input size, hidden size
    X = np.random.randn(n, b, input_size)
    h0 = np.random.randn(b, d)
    c0 = np.random.randn(b, d)

    # sequential forward: feed one time step per call, carrying the state
    caches = []
    Hcat = np.zeros((n, b, d))
    c_state, h_state = c0, h0
    for t in range(n):
        _, c_state, h_state, step_cache = LSTM.forward(
            X[t:t + 1], WLSTM, c_state, h_state
        )
        caches.append(step_cache)
        Hcat[t] = h_state

    # sanity check: the batched forward must give the same hidden outputs
    H, _, _, batch_cache = LSTM.forward(X, WLSTM, c0, h0)
    assert np.allclose(H, Hcat), "Sequential and Batch forward dont match!"

    # random upstream gradient, i.e. the gradient of sum(Hcat * wrand)
    wrand = np.random.randn(*Hcat.shape)
    dH = wrand

    # batched gradients
    BdX, BdWLSTM, Bdc0, Bdh0 = LSTM.backward(dH, batch_cache)

    # sequential backward: walk the steps in reverse, passing the state
    # gradients from each later step into the step before it
    dX = np.zeros_like(X)
    dWLSTM = np.zeros_like(WLSTM)
    dcnext = dhnext = None
    for t in range(n - 1, -1, -1):
        dht = dH[t].reshape((1, b, d))
        dx, dWLSTMt, dcnext, dhnext = LSTM.backward(
            dht, caches[t], dcnext, dhnext
        )
        dWLSTM += dWLSTMt  # accumulate LSTM gradient
        dX[t] = dx[0]
    # the loop ends on t == 0, whose outputs are the initial-state gradients
    dc0, dh0 = dcnext, dhnext

    # and make sure the gradients match
    print(
        "Making sure batched version agrees with sequential version: (should "
        "all be True)"
    )
    for batched, sequential in (
        (BdX, dX),
        (BdWLSTM, dWLSTM),
        (Bdc0, dc0),
        (Bdh0, dh0),
    ):
        print(np.allclose(batched, sequential))
def run_lstm(batch_size, hidden_size, sentence_length, word_size, timing):
    """Time one LSTM backward pass over random data.

    All forward-pass quantities are replaced by random arrays; only the
    gradient for the initial hidden state is accumulated and checked for
    NaN. Returns the elapsed wall-clock time in milliseconds, and prints it
    when ``timing`` is truthy.
    """
    start = datetime.datetime.now()

    fan_in = word_size + hidden_size
    WLSTM = np.random.randn(fan_in, 4 * hidden_size) / np.sqrt(fan_in)

    xphpb = WLSTM.shape[0]
    d, n, b = hidden_size, sentence_length, batch_size

    # Column layout of the activations: [input | forget | output | gate]
    in_cols = slice(None, d)
    forget_cols = slice(d, 2 * d)
    out_cols = slice(2 * d, 3 * d)
    gate_cols = slice(3 * d, None)
    sigmoid_cols = slice(None, 3 * d)
    hidden_cols = slice(word_size, None)

    dHout = np.random.randn(n, b, d)
    IFOGf = np.random.randn(n, b, d * 4)
    C = np.random.randn(n, b, d)
    Ct = np.random.randn(n, b, d)
    Hin = np.random.randn(n, b, xphpb)

    dIFOG = np.zeros((n, b, d * 4))
    dIFOGf = np.zeros(IFOGf.shape)
    dHin = np.zeros(Hin.shape)
    dC = np.zeros(C.shape)
    dh0 = np.zeros((1, d))

    for t in range(n - 1, -1, -1):
        tanhCt = Ct[t]
        dIFOGf[t, :, out_cols] = tanhCt * dHout[t]
        # backprop tanh non-linearity first then continue backprop
        dC[t] += (1 - tanhCt**2) * (IFOGf[t, :, out_cols] * dHout[t])

        if t > 0:
            dIFOGf[t, :, forget_cols] = C[t - 1] * dC[t]
            dC[t - 1] += IFOGf[t, :, forget_cols] * dC[t]

        dIFOGf[t, :, in_cols] = IFOGf[t, :, gate_cols] * dC[t]
        dIFOGf[t, :, gate_cols] = IFOGf[t, :, in_cols] * dC[t]

        # backprop activation functions
        dIFOG[t, :, gate_cols] = (
            1 - IFOGf[t, :, gate_cols] ** 2
        ) * dIFOGf[t, :, gate_cols]
        y = IFOGf[t, :, sigmoid_cols]
        dIFOG[t, :, sigmoid_cols] = (y * (1.0 - y)) * dIFOGf[
            t, :, sigmoid_cols
        ]

        # backprop matrix multiply
        dHin[t] = dIFOG[t].dot(WLSTM.transpose())

        # backprop the identity transforms into Hin
        if t > 0:
            dHout[t - 1, :] += dHin[t, :, hidden_cols]
        else:
            dh0[0] += np.sum(dHin[t, :, hidden_cols], 0)

    # Do a little sum to synchronize and check for NaNs
    total = np.sum(dh0)
    assert not math.isnan(total)

    elapsed = datetime.datetime.now() - start
    total = elapsed.total_seconds() * 1000.0
    if timing:
        print("Elapsed Time: " + str(total) + " ms")
    return total
def testtion():
    """Run an LSTM backward pass in lockstep with NumPy and lg arrays.

    The random inputs are drawn once with NumPy (seed 42) and converted with
    ``lg.array``. Every backward statement is then executed first on the
    NumPy arrays and next on the lg arrays, and the accumulated gradient for
    the initial hidden state is compared at the end.
    """
    word_size = 10
    hidden_size = 10
    sentence_length = 5
    batch_size = 3

    np.random.seed(42)

    fan_in = word_size + hidden_size
    WLSTM_np = np.random.randn(fan_in, 4 * hidden_size) / np.sqrt(fan_in)

    xphpb = WLSTM_np.shape[0]
    d, n, b = hidden_size, sentence_length, batch_size

    # Column layout of the activations: [input | forget | output | gate]
    in_cols = slice(None, d)
    forget_cols = slice(d, 2 * d)
    out_cols = slice(2 * d, 3 * d)
    gate_cols = slice(3 * d, None)
    sigmoid_cols = slice(None, 3 * d)
    hidden_cols = slice(word_size, None)

    WLSTM_lg = lg.array(WLSTM_np)

    # NumPy side: random inputs and zeroed gradient buffers
    dHout_np = np.random.randn(n, b, d)
    IFOGf_np = np.random.randn(n, b, d * 4)
    C_np = np.random.randn(n, b, d)
    Ct_np = np.random.randn(n, b, d)
    Hin_np = np.random.randn(n, b, xphpb)

    dIFOG_np = np.zeros((n, b, d * 4))
    dIFOGf_np = np.zeros(IFOGf_np.shape)
    dHin_np = np.zeros(Hin_np.shape)
    dC_np = np.zeros(C_np.shape)
    dh0_np = np.zeros((1, d))

    # lg side: converted from the NumPy inputs before anything is mutated
    dHout_lg = lg.array(dHout_np)
    IFOGf_lg = lg.array(IFOGf_np)
    C_lg = lg.array(C_np)
    Ct_lg = lg.array(Ct_np)
    Hin_lg = lg.array(Hin_np)

    dIFOG_lg = lg.zeros((n, b, d * 4))
    dIFOGf_lg = lg.zeros(IFOGf_lg.shape)
    dHin_lg = lg.zeros(Hin_lg.shape)
    dC_lg = lg.zeros(C_lg.shape)
    dh0_lg = lg.zeros((1, d))

    for t in range(n - 1, -1, -1):
        tanhCt_np = Ct_np[t]
        tanhCt_lg = Ct_lg[t]

        dIFOGf_np[t, :, out_cols] = tanhCt_np * dHout_np[t]
        dIFOGf_lg[t, :, out_cols] = tanhCt_lg * dHout_lg[t]

        # backprop tanh non-linearity first then continue backprop
        dC_np[t] += (1 - tanhCt_np**2) * (
            IFOGf_np[t, :, out_cols] * dHout_np[t]
        )
        dC_lg[t] += (1 - tanhCt_lg**2) * (
            IFOGf_lg[t, :, out_cols] * dHout_lg[t]
        )

        if t > 0:
            dIFOGf_np[t, :, forget_cols] = C_np[t - 1] * dC_np[t]
            dIFOGf_lg[t, :, forget_cols] = C_lg[t - 1] * dC_lg[t]

            dC_np[t - 1] += IFOGf_np[t, :, forget_cols] * dC_np[t]
            dC_lg[t - 1] += IFOGf_lg[t, :, forget_cols] * dC_lg[t]

        dIFOGf_np[t, :, in_cols] = IFOGf_np[t, :, gate_cols] * dC_np[t]
        dIFOGf_lg[t, :, in_cols] = IFOGf_lg[t, :, gate_cols] * dC_lg[t]

        dIFOGf_np[t, :, gate_cols] = IFOGf_np[t, :, in_cols] * dC_np[t]
        dIFOGf_lg[t, :, gate_cols] = IFOGf_lg[t, :, in_cols] * dC_lg[t]

        # backprop activation functions
        dIFOG_np[t, :, gate_cols] = (
            1 - IFOGf_np[t, :, gate_cols] ** 2
        ) * dIFOGf_np[t, :, gate_cols]
        dIFOG_lg[t, :, gate_cols] = (
            1 - IFOGf_lg[t, :, gate_cols] ** 2
        ) * dIFOGf_lg[t, :, gate_cols]

        y_np = IFOGf_np[t, :, sigmoid_cols]
        y_lg = IFOGf_lg[t, :, sigmoid_cols]

        dIFOG_np[t, :, sigmoid_cols] = (y_np * (1.0 - y_np)) * dIFOGf_np[
            t, :, sigmoid_cols
        ]
        dIFOG_lg[t, :, sigmoid_cols] = (y_lg * (1.0 - y_lg)) * dIFOGf_lg[
            t, :, sigmoid_cols
        ]

        # backprop matrix multiply
        dHin_np[t] = dIFOG_np[t].dot(WLSTM_np.transpose())
        dHin_lg[t] = dIFOG_lg[t].dot(WLSTM_lg.transpose())

        # backprop the identity transforms into Hin
        if t > 0:
            dHout_np[t - 1, :] += dHin_np[t, :, hidden_cols]
            dHout_lg[t - 1, :] += dHin_lg[t, :, hidden_cols]
        else:
            dh0_np[0] += np.sum(dHin_np[t, :, hidden_cols], 0)
            dh0_lg[0] += lg.sum(dHin_lg[t, :, hidden_cols], 0)

    # Only the final accumulated gradient is compared.
    assert np.allclose(dh0_np[0], dh0_lg[0])
def run_lstm(
    file_name,
    H_size,
    T_steps,
    max_iters,
    learning_rate,
    weight_sd,
    dump,
    timing,
):
    """Train a character-level LSTM on a text file and time the training loop.

    Args:
        file_name: Path of the text file to read.
        H_size: Hidden-state size.
        T_steps: Number of characters consumed per iteration.
        max_iters: Number of training iterations.
        learning_rate: Passed to ``update_parameters``.
        weight_sd: Passed to ``Parameters``.
        dump: Status is reported on iterations divisible by this value.
        timing: When truthy, print the elapsed time.

    Returns:
        Elapsed wall-clock time of the training loop in milliseconds.
    """
    with open(file_name, "r") as f:
        data = f.read()

    chars = list(set(data))
    data_size, X_size = len(data), len(chars)
    print("data has %d characters, %d unique" % (data_size, X_size))
    char_to_idx = {ch: i for i, ch in enumerate(chars)}

    def encode(text):
        # Map every character of text to its integer index.
        return [char_to_idx[ch] for ch in text]

    z_size = H_size + X_size  # Size of concatenate(H, X) vector
    parameters = Parameters(H_size, X_size, z_size, weight_sd)

    # Exponential average of the loss, initialized to the loss of a model
    # that assigns every character the same probability.
    smooth_loss = -np.log(1.0 / X_size) * T_steps

    pointer = 0
    start = datetime.datetime.now()
    for iteration in range(max_iters):
        # Start over (with zeroed state) on the first iteration and whenever
        # the next window would run past the end of the data.
        if pointer + T_steps >= len(data) or iteration == 0:
            g_h_prev = np.zeros((H_size, 1))
            g_C_prev = np.zeros((H_size, 1))
            pointer = 0

        window_end = pointer + T_steps
        inputs = encode(data[pointer:window_end])
        # targets are the inputs shifted one character to the right
        targets = encode(data[pointer + 1 : window_end + 1])

        loss, g_h_prev, g_C_prev = forward_backward(
            inputs,
            targets,
            g_h_prev,
            g_C_prev,
            T_steps,
            H_size,
            X_size,
            parameters,
        )
        smooth_loss = smooth_loss * 0.999 + loss * 0.001

        # Report status every `dump` iterations
        if iteration % dump == 0:
            update_status(iteration, smooth_loss)

        update_parameters(learning_rate, parameters)
        pointer += T_steps

    update_status(max_iters, smooth_loss)

    elapsed = datetime.datetime.now() - start
    total = elapsed.total_seconds() * 1000.0
    if timing:
        print("Elapsed Time: " + str(total) + " ms")
    return total
def forward_backward(
    inputs, targets, h_prev, C_prev, T_steps, H_size, X_size, parameters
):
    """Run one forward and backward sweep over a window of characters.

    Args:
        inputs: Character indices fed to the network; must have length
            ``T_steps``.
        targets: Character indices expected at each step.
        h_prev, C_prev: Hidden and cell state entering the window (copied).
        T_steps: Expected window length.
        H_size, X_size: Sizes forwarded to ``forward`` and ``backward``.
        parameters: Parameter container forwarded to the helpers.

    Returns:
        Tuple ``(loss, h, C)`` with the summed negative log-probability of
        the targets and the hidden and cell state after the last step.
    """
    # Per-time-step values, keyed by step index (key -1 holds the state
    # entering the window).
    x_s, z_s, f_s, i_s = {}, {}, {}, {}
    C_bar_s, C_s, o_s, h_s = {}, {}, {}, {}
    v_s, y_s = {}, {}

    h_s[-1] = np.copy(h_prev)
    C_s[-1] = np.copy(C_prev)

    loss = 0
    assert len(inputs) == T_steps
    num_steps = len(inputs)

    # Forward sweep
    for t in range(num_steps):
        one_hot = np.zeros((X_size, 1))
        one_hot[inputs[t]] = 1  # Input character
        x_s[t] = one_hot

        step_outputs = forward(
            x_s[t], h_s[t - 1], C_s[t - 1], H_size, X_size, parameters
        )
        (
            z_s[t],
            f_s[t],
            i_s[t],
            C_bar_s[t],
            C_s[t],
            o_s[t],
            h_s[t],
            v_s[t],
            y_s[t],
        ) = step_outputs

        loss -= np.log(y_s[t][targets[t], 0])  # Loss at step t

    clear_gradients(parameters)

    dh_next = np.zeros_like(h_s[0])  # dh from the next character
    dC_next = np.zeros_like(C_s[0])  # dC from the next character

    # Backward sweep, last step first
    for t in range(num_steps - 1, -1, -1):
        dh_next, dC_next = backward(
            target=targets[t],
            dh_next=dh_next,
            dC_next=dC_next,
            C_prev=C_s[t - 1],
            H_size=H_size,
            X_size=X_size,
            z=z_s[t],
            f=f_s[t],
            i=i_s[t],
            C_bar=C_bar_s[t],
            C=C_s[t],
            o=o_s[t],
            h=h_s[t],
            v=v_s[t],
            y=y_s[t],
            p=parameters,
        )

    clip_gradients(parameters)

    last = num_steps - 1
    return loss, h_s[last], C_s[last]
def test():
    """Compare lg array-creation routines with their NumPy counterparts."""

    def check_values(ours, theirs):
        # Same contents and same dtype.
        assert np.array_equal(ours, theirs)
        assert ours.dtype == theirs.dtype

    def check_layout(ours, theirs):
        # Same shape and dtype only (used where contents are uninitialized).
        assert lg.shape(ours) == np.shape(theirs)
        assert ours.dtype == theirs.dtype

    shape = (2, 3)

    # Construction from a list and from a NumPy array
    x = lg.array([1, 2, 3])
    y = np.array([1, 2, 3])
    z = lg.array(y)
    check_values(x, z)

    # Shape-based constructors
    check_layout(lg.empty(shape), np.empty(shape))
    check_values(lg.zeros(shape), np.zeros(shape))
    check_values(lg.ones(shape), np.ones(shape))
    check_values(lg.full(shape, 3), np.full(shape, 3))

    # *_like constructors
    check_layout(lg.empty_like(x), np.empty_like(y))
    check_values(lg.zeros_like(x), np.zeros_like(y))
    check_values(lg.ones_like(x), np.ones_like(y))
    check_values(lg.full_like(x, 3), np.full_like(y, 3))

    # arange: stop only, explicit dtype, float bounds, start/stop/step
    check_values(lg.arange(10), np.arange(10))
    check_values(
        lg.arange(10, dtype=np.int32), np.arange(10, dtype=np.int32)
    )
    check_values(lg.arange(2.0, 10.0), np.arange(2.0, 10.0))
    check_values(lg.arange(2, 30, 3), np.arange(2, 30, 3))

    # NOTE: full_like with a string dtype is not exercised here.
def initialize(M, N, K, ft):
    """Create the operand and result matrices.

    ``A`` and ``B`` are ``(N, N)`` matrices of uniform random values cast to
    ``ft``; ``C`` is an ``(N, N)`` zero matrix of dtype ``ft``. ``M`` and
    ``K`` are accepted but not used by this function.

    Returns:
        Tuple ``(A, B, C)``.
    """
    shape = (N, N)
    A, B = (np.random.rand(*shape).astype(ft) for _ in range(2))
    C = np.zeros(shape, dtype=ft)
    return A, B, C