def test_bprop(self):
    """
    Compare `bprop` results for the cpu and gpu backends.
    """
    r = []
    for i in xrange(self.N):
        batch_size, x_dim, output_dim = self.rng.random_integers(2000, size=3)
        x = self.rng.rand(batch_size, x_dim).astype(np.float32)
        W = self.get_orthogonal_matrix(x_dim, output_dim)
        b = self.rng.rand(1, output_dim).astype(np.float32) if self.rng.randint(2) else None
        device_id = 0

        # gpu backend
        state = self.rng.get_state()
        quagga.processor_type = 'gpu'
        context = Context()
        x_gpu = Connector(Matrix.from_npa(x), device_id)
        W_gpu = Connector(Matrix.from_npa(W), device_id)
        b_gpu = Connector(Matrix.from_npa(b), device_id) if b is not None else b
        dot_block_gpu = DotBlock(W_gpu, b_gpu, x_gpu)
        x_gpu.fprop()
        W_gpu.fprop()
        if b_gpu:
            b_gpu.fprop()
        dot_block_gpu.fprop()
        _, dL_doutput = dot_block_gpu.output.register_usage(device_id, device_id)
        random_matrix = self.rng.rand(dL_doutput.nrows, dL_doutput.ncols)
        dL_doutput.assign(context, Matrix.from_npa(random_matrix, 'float'))
        dot_block_gpu.bprop()
        if b is not None:
            dL_db_gpu = b_gpu.backward_matrix.to_host()
        dL_dW_gpu = W_gpu.backward_matrix.to_host()
        dL_dx_gpu = x_gpu.backward_matrix.to_host()

        # cpu backend
        self.rng.set_state(state)
        quagga.processor_type = 'cpu'
        context = Context()
        x_cpu = Connector(Matrix.from_npa(x), device_id)
        W_cpu = Connector(Matrix.from_npa(W), device_id)
        b_cpu = Connector(Matrix.from_npa(b), device_id) if b is not None else b
        dot_block_cpu = DotBlock(W_cpu, b_cpu, x_cpu)
        x_cpu.fprop()
        W_cpu.fprop()
        if b_cpu:
            b_cpu.fprop()
        dot_block_cpu.fprop()
        _, dL_doutput = dot_block_cpu.output.register_usage(device_id, device_id)
        random_matrix = self.rng.rand(dL_doutput.nrows, dL_doutput.ncols)
        dL_doutput.assign(context, Matrix.from_npa(random_matrix, 'float'))
        dot_block_cpu.bprop()
        if b is not None:
            dL_db_cpu = b_cpu.backward_matrix.to_host()
        dL_dW_cpu = W_cpu.backward_matrix.to_host()
        dL_dx_cpu = x_cpu.backward_matrix.to_host()

        r.append(np.allclose(dL_dx_gpu, dL_dx_cpu, atol=1e-5))
        r.append(np.allclose(dL_dW_gpu, dL_dW_cpu, atol=1e-5))
        if b is not None:
            r.append(np.allclose(dL_db_gpu, dL_db_cpu, atol=1e-5))

    self.assertEqual(sum(r), len(r))

def test_fprop(self):
    """
    Compare `fprop` results for the cpu and gpu backends.
    """
    r = []
    for i in xrange(self.N):
        batch_size, x_dim, output_dim = self.rng.random_integers(2000, size=3)
        x = self.rng.rand(batch_size, x_dim).astype(np.float32)
        W = self.get_orthogonal_matrix(x_dim, output_dim)
        b = self.rng.rand(1, output_dim).astype(np.float32) if self.rng.randint(2) else None

        # gpu backend
        quagga.processor_type = 'gpu'
        x_gpu = Connector(Matrix.from_npa(x))
        W_gpu = Connector(Matrix.from_npa(W))
        b_gpu = Connector(Matrix.from_npa(b)) if b is not None else b
        dot_block_gpu = DotBlock(W_gpu, b_gpu, x_gpu)
        x_gpu.fprop()
        W_gpu.fprop()
        if b_gpu:
            b_gpu.fprop()
        dot_block_gpu.fprop()
        output_gpu = dot_block_gpu.output.to_host()

        # cpu backend
        quagga.processor_type = 'cpu'
        x_cpu = Connector(Matrix.from_npa(x))
        W_cpu = Connector(Matrix.from_npa(W))
        b_cpu = Connector(Matrix.from_npa(b)) if b is not None else b
        dot_block_cpu = DotBlock(W_cpu, b_cpu, x_cpu)
        x_cpu.fprop()
        W_cpu.fprop()
        if b_cpu:
            b_cpu.fprop()
        dot_block_cpu.fprop()
        output_cpu = dot_block_cpu.output.to_host()

        r.append(np.allclose(output_gpu, output_cpu, atol=1e-5))

    self.assertEqual(sum(r), self.N)

def test_theano_grad(self):
    class LogisticRegressionLayer(object):
        def __init__(self, W, b):
            self.W = theano.shared(value=W)
            self.b = theano.shared(value=b[0])

        def get_output_expr(self, input_expr):
            return T.nnet.sigmoid(T.dot(input_expr, self.W) + self.b)

    quagga.processor_type = 'gpu'
    r = []
    for i in xrange(self.N):
        batch_size, x_dim = self.rng.random_integers(3000, size=2)
        x = self.rng.rand(batch_size, x_dim).astype(np.float32)
        lr_dot_W = self.rng.rand(x_dim, 1).astype(np.float32)
        lr_dot_b = self.rng.rand(1, 1).astype(np.float32)
        true_labels = self.rng.randint(2, size=(batch_size, 1)).astype(np.float32)
        dropout_prob = self.rng.uniform()
        seed = self.rng.randint(1000)
        device_id = 0

        # quagga model
        state = self.rng.get_state()
        x_gpu = Connector(Matrix.from_npa(x), device_id)
        true_labels_gpu = Connector(Matrix.from_npa(true_labels))
        lr_dot_W_gpu = Connector(Matrix.from_npa(lr_dot_W), device_id)
        lr_dot_b_gpu = Connector(Matrix.from_npa(lr_dot_b), device_id)
        dropout_block = DropoutBlock(x_gpu, dropout_prob, seed)
        lrdot_block = DotBlock(lr_dot_W_gpu, lr_dot_b_gpu, dropout_block.output)
        sce_block = SigmoidCeBlock(lrdot_block.output, true_labels_gpu)
        x_gpu.fprop()
        true_labels_gpu.fprop()
        lr_dot_W_gpu.fprop()
        lr_dot_b_gpu.fprop()
        dropout_block.fprop()
        lrdot_block.fprop()
        sce_block.fprop()
        sce_block.bprop()
        lrdot_block.bprop()
        dropout_block.bprop()
        q_grads = [lr_dot_W_gpu.backward_matrix.to_host(),
                   lr_dot_b_gpu.backward_matrix.to_host(),
                   x_gpu.backward_matrix.to_host()]
        mask = (dropout_block.output.to_host() != 0).astype(np.float32)

        # Theano model
        self.rng.set_state(state)
        th_x = T.fmatrix()
        th_true_labels = T.fmatrix()
        lr_layer = LogisticRegressionLayer(lr_dot_W, lr_dot_b)
        probs = lr_layer.get_output_expr(th_x * mask)
        loss = T.mean(T.nnet.binary_crossentropy(probs, th_true_labels))
        th_grads = T.grad(loss, wrt=[lr_layer.W, lr_layer.b, th_x])
        get_theano_grads = theano.function([th_x, th_true_labels], th_grads)
        th_grads = get_theano_grads(x, true_labels)

        for i, (q_grad, th_grad) in enumerate(izip(q_grads, th_grads)):
            r.append(np.allclose(q_grad, th_grad))

    self.assertEqual(sum(r), len(r))

def test_theano_grad(self):
    class AttentionLayer(object):
        def __init__(self, u, mask=None):
            self.u = theano.shared(value=u)
            self.mask = mask

        def get_output_expr(self, input_expr):
            input_expr = input_expr.dimshuffle(0, 2, 1)
            pre_a = T.dot(input_expr, self.u)[:, :, 0]
            if self.mask:
                pre_a = self.mask * pre_a - (1 - self.mask) * 3.402823466e+38
            a = T.nnet.softmax(pre_a)[:, :, np.newaxis]
            return T.sum(a * input_expr, axis=1)

    class LogisticRegressionLayer(object):
        def __init__(self, W, b):
            self.W = theano.shared(value=W)
            if b is not None:
                self.b = theano.shared(value=b[0])

        def get_output_expr(self, input_expr):
            if hasattr(self, 'b'):
                return T.nnet.sigmoid(T.dot(input_expr, self.W) + self.b)
            else:
                return T.nnet.sigmoid(T.dot(input_expr, self.W))

    r = []
    for i in xrange(self.N):
        batch_size = self.rng.random_integers(500)
        x_dim = self.rng.random_integers(3000)
        n_ts = self.rng.random_integers(100)
        x = [self.rng.rand(batch_size, x_dim).astype(np.float32) for _ in xrange(n_ts)]
        u = self.get_orthogonal_matrix(x_dim, 1)
        lr_dot_W = self.get_orthogonal_matrix(x_dim, 1)
        lr_dot_b = self.rng.rand(1, 1).astype(np.float32) if self.rng.randint(2) else None
        true_labels = self.rng.randint(2, size=(batch_size, 1)).astype(np.float32)
        mask = self.rng.randint(2, size=(batch_size, n_ts)).astype(np.float32) if self.rng.randint(2) else None
        device_id = 0

        # Theano model
        state = self.rng.get_state()
        th_x = T.ftensor3()
        th_mask = T.fmatrix() if mask is not None else None
        th_true_labels = T.fmatrix()
        attnt_layer = AttentionLayer(u, th_mask)
        lr_layer = LogisticRegressionLayer(lr_dot_W, lr_dot_b)
        probs = th_x
        for layer in [attnt_layer, lr_layer]:
            probs = layer.get_output_expr(probs)
        loss = T.mean(T.nnet.binary_crossentropy(probs, th_true_labels))
        params = [lr_layer.W, attnt_layer.u, th_x]
        if hasattr(lr_layer, 'b'):
            params.append(lr_layer.b)
        th_grads = T.grad(loss, wrt=params)
        get_theano_grads = theano.function([th_x, th_true_labels] + ([th_mask] if mask is not None else []), th_grads)
        th_grads = get_theano_grads(*([np.dstack(x), true_labels] + ([mask] if mask is not None else [])))

        # quagga model
        self.rng.set_state(state)
        x = List([Connector(Matrix.from_npa(e), device_id) for e in x])
        u = Connector(Matrix.from_npa(u), device_id)
        lr_dot_W = Connector(Matrix.from_npa(lr_dot_W), device_id)
        lr_dot_b = Connector(Matrix.from_npa(lr_dot_b), device_id) if lr_dot_b is not None else lr_dot_b
        true_labels = Connector(Matrix.from_npa(true_labels))
        if mask is not None:
            mask = Connector(Matrix.from_npa(mask))
        attnt_block = AttentionBlock(x, u, mask)
        lrdot_block = DotBlock(lr_dot_W, lr_dot_b, attnt_block.output)
        sce_block = SigmoidCeBlock(lrdot_block.output, true_labels)
        x.fprop()
        true_labels.fprop()
        u.fprop()
        lr_dot_W.fprop()
        if lr_dot_b:
            lr_dot_b.fprop()
        attnt_block.fprop()
        lrdot_block.fprop()
        sce_block.fprop()
        sce_block.bprop()
        lrdot_block.bprop()
        attnt_block.bprop()
        q_grads = [lr_dot_W.backward_matrix.to_host(),
                   u.backward_matrix.to_host(),
                   np.dstack([e.backward_matrix.to_host() for e in x])]
        if lr_dot_b:
            q_grads.append(lr_dot_b.backward_matrix.to_host())

        for th_grad, q_grad in izip(th_grads, q_grads):
            r.append(np.allclose(th_grad, q_grad, atol=1.e-7))
            print r[-1]

    self.assertEqual(sum(r), len(r))

def test_theano_grad(self):
    class LogisticRegressionLayer(object):
        def __init__(self, W, b):
            self.W = theano.shared(value=W)
            self.b = theano.shared(value=b[0])

        def get_output_expr(self, input_expr):
            return T.nnet.sigmoid(T.dot(input_expr, self.W) + self.b)

    quagga.processor_type = 'gpu'
    r = []
    for i in xrange(self.N):
        batch_size, x_dim = self.rng.random_integers(3000, size=2)
        x = self.rng.rand(batch_size, x_dim).astype(np.float32)
        lrdot_W = self.rng.rand(x_dim, 1).astype(np.float32)
        lrdot_b = self.rng.rand(1, 1).astype(np.float32)
        true_labels = self.rng.randint(2, size=(batch_size, 1)).astype(np.float32)
        device_id = 0

        for nonlinearity in ['sigmoid', 'tanh', 'relu']:
            # Theano model
            state = self.rng.get_state()
            th_x = T.fmatrix()
            th_true_labels = T.fmatrix()
            lr_layer = LogisticRegressionLayer(lrdot_W, lrdot_b)
            if nonlinearity == 'sigmoid':
                f = T.nnet.sigmoid
            elif nonlinearity == 'tanh':
                f = T.tanh
            elif nonlinearity == 'relu':
                f = T.nnet.relu
            probs = lr_layer.get_output_expr(f(th_x))
            loss = T.mean(T.nnet.binary_crossentropy(probs, th_true_labels))
            th_grads = T.grad(loss, wrt=[lr_layer.W, lr_layer.b, th_x])
            get_theano_grads = theano.function([th_x, th_true_labels], th_grads)
            th_grads = get_theano_grads(x, true_labels)

            # quagga model
            self.rng.set_state(state)
            x_gpu = Connector(Matrix.from_npa(x), device_id)
            true_labels_gpu = Connector(Matrix.from_npa(true_labels))
            lrdot_W_gpu = Connector(Matrix.from_npa(lrdot_W), device_id)
            lrdot_b_gpu = Connector(Matrix.from_npa(lrdot_b), device_id)
            nonlinearity_block = NonlinearityBlock(x_gpu, nonlinearity)
            lrdot_block = DotBlock(lrdot_W_gpu, lrdot_b_gpu, nonlinearity_block.output)
            sce_block = SigmoidCeBlock(lrdot_block.output, true_labels_gpu)
            x_gpu.fprop()
            true_labels_gpu.fprop()
            lrdot_W_gpu.fprop()
            lrdot_b_gpu.fprop()
            nonlinearity_block.fprop()
            lrdot_block.fprop()
            sce_block.fprop()
            sce_block.bprop()
            lrdot_block.bprop()
            nonlinearity_block.bprop()
            q_grads = [lrdot_W_gpu.backward_matrix.to_host(),
                       lrdot_b_gpu.backward_matrix.to_host(),
                       x_gpu.backward_matrix.to_host()]

            for q_grad, th_grad in izip(q_grads, th_grads):
                r.append(np.allclose(q_grad, th_grad, atol=1e-5))

    self.assertEqual(sum(r), len(r))

def test_theano_grad(self):
    class DotLayer(object):
        def __init__(self, W, b):
            self.W = theano.shared(value=W)
            if b is not None:
                self.b = theano.shared(value=b[0])

        def get_output_expr(self, input_expr):
            if hasattr(self, 'b'):
                return T.dot(input_expr, self.W) + self.b
            else:
                return T.dot(input_expr, self.W)

    class LogisticRegressionLayer(object):
        def __init__(self, W, b):
            self.W = theano.shared(value=W)
            if b is not None:
                self.b = theano.shared(value=b[0])

        def get_output_expr(self, input_expr):
            if hasattr(self, 'b'):
                return T.nnet.sigmoid(T.dot(input_expr, self.W) + self.b)
            else:
                return T.nnet.sigmoid(T.dot(input_expr, self.W))

    quagga.processor_type = 'gpu'
    r = []
    for i in xrange(self.N):
        batch_size, x_dim, output_dim = self.rng.random_integers(2000, size=3)
        x = self.rng.rand(batch_size, x_dim).astype(np.float32)
        dot_W = self.get_orthogonal_matrix(x_dim, output_dim)
        dot_b = self.rng.rand(1, output_dim).astype(np.float32) if self.rng.randint(2) else None
        lr_dot_W = self.get_orthogonal_matrix(output_dim, 1)
        lr_dot_b = self.rng.rand(1, 1).astype(np.float32) if self.rng.randint(2) else None
        true_labels = self.rng.randint(2, size=(batch_size, 1)).astype(np.float32)
        device_id = 0

        # Theano model
        state = self.rng.get_state()
        th_x = T.fmatrix()
        th_true_labels = T.fmatrix()
        dot_layer = DotLayer(dot_W, dot_b)
        lr_layer = LogisticRegressionLayer(lr_dot_W, lr_dot_b)
        probs = th_x
        for layer in [dot_layer, lr_layer]:
            probs = layer.get_output_expr(probs)
        loss = T.mean(T.nnet.binary_crossentropy(probs, th_true_labels))
        params = [lr_layer.W, dot_layer.W, th_x]
        if hasattr(lr_layer, 'b'):
            params.append(lr_layer.b)
        if hasattr(dot_layer, 'b'):
            params.append(dot_layer.b)
        th_grads = T.grad(loss, wrt=params)
        get_theano_grads = theano.function([th_x, th_true_labels], th_grads)
        th_grads = get_theano_grads(x, true_labels)

        # quagga model
        self.rng.set_state(state)
        x = Connector(Matrix.from_npa(x), device_id)
        true_labels = Connector(Matrix.from_npa(true_labels))
        dot_W = Connector(Matrix.from_npa(dot_W), device_id)
        dot_b = Connector(Matrix.from_npa(dot_b), device_id) if dot_b is not None else dot_b
        lr_dot_W = Connector(Matrix.from_npa(lr_dot_W), device_id)
        lr_dot_b = Connector(Matrix.from_npa(lr_dot_b), device_id) if lr_dot_b is not None else lr_dot_b
        dot_block = DotBlock(dot_W, dot_b, x)
        lrdot_block = DotBlock(lr_dot_W, lr_dot_b, dot_block.output)
        sce_block = SigmoidCeBlock(lrdot_block.output, true_labels)
        x.fprop()
        true_labels.fprop()
        dot_W.fprop()
        if dot_b:
            dot_b.fprop()
        lr_dot_W.fprop()
        if lr_dot_b:
            lr_dot_b.fprop()
        dot_block.fprop()
        lrdot_block.fprop()
        sce_block.fprop()
        sce_block.bprop()
        lrdot_block.bprop()
        dot_block.bprop()
        q_grads = [lr_dot_W.backward_matrix.to_host(),
                   dot_W.backward_matrix.to_host(),
                   x.backward_matrix.to_host()]
        if lr_dot_b:
            q_grads.append(lr_dot_b.backward_matrix.to_host())
        if dot_b:
            q_grads.append(dot_b.backward_matrix.to_host())

        for th_grad, q_grad in izip(th_grads, q_grads):
            r.append(np.allclose(th_grad, q_grad, atol=1e-7))

    self.assertEqual(sum(r), len(r))

def test_theano_grad(self):
    class SequentialMeanPoolingLayer(object):
        def get_output_expr(self, input_sequence):
            return T.mean(input_sequence, axis=2)

    class LogisticRegressionLayer(object):
        def __init__(self, W_init, b_init):
            self.W = theano.shared(value=W_init())
            self.b = theano.shared(value=b_init())

        def get_output_expr(self, input_expr):
            return T.nnet.sigmoid(T.dot(input_expr, self.W) + self.b)

    quagga.processor_type = 'gpu'
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(512)
        dim = self.rng.random_integers(1500)
        x = [self.rng.rand(batch_size, dim).astype(dtype=np.float32) for _ in xrange(max_input_sequence_len)]
        true_labels = self.rng.randint(1, size=(batch_size, 1)).astype(dtype=np.float32)
        W_init = self.get_orthogonal_initializer(dim, 1)
        b_init = lambda: self.rng.rand(1, 1).astype(dtype=np.float32)

        # Theano model
        state = self.rng.get_state()
        th_x = T.ftensor3()
        th_true_labels = T.fmatrix()
        smp_layer = SequentialMeanPoolingLayer()
        lr_layer = LogisticRegressionLayer(W_init, lambda: b_init()[0])
        probs = lr_layer.get_output_expr(smp_layer.get_output_expr(th_x))
        loss = T.mean(T.nnet.binary_crossentropy(probs, th_true_labels))
        grad_x = T.grad(loss, wrt=th_x)
        get_grad_x = theano.function([th_x, th_true_labels], grad_x)

        # quagga model
        self.rng.set_state(state)
        context = Context()
        x = List([Connector(Matrix.from_npa(e), context, context) for e in x])
        true_labels = Connector(Matrix.from_npa(true_labels))
        smp_block = SequentialMeanPoolingBlock(x)
        dot_block = DotBlock(W_init, b_init, smp_block.output)
        sce_block = SigmoidCeBlock(dot_block.output, true_labels)
        x.set_length(sequence_len)
        smp_block.fprop()
        dot_block.fprop()
        sce_block.fprop()
        sce_block.bprop()
        dot_block.bprop()
        smp_block.bprop()
        dL_dx = [e.backward_matrix.to_host() for e in x]

        dL_dx_th = get_grad_x(np.dstack([e.to_host() for e in x]), true_labels.to_host())
        for i in xrange(dL_dx_th.shape[-1]):
            if not np.allclose(dL_dx[i], dL_dx_th[..., i]):
                r.append(False)
                break
        else:
            r.append(True)

    self.assertEqual(sum(r), self.N)

def test_theano_grad(self):
    device_id = 0

    class SequentialHorizontalStackLayer(object):
        def get_output_expr(self, x_sequence, y_sequence):
            return T.concatenate((x_sequence, y_sequence), axis=1)

    class SequentialMeanPoolingLayer(object):
        def get_output_expr(self, input_sequence):
            return T.mean(input_sequence, axis=2)

    class LogisticRegressionLayer(object):
        def __init__(self, W_init, b_init):
            self.W = theano.shared(value=W_init())
            self.b = theano.shared(value=b_init())

        def get_output_expr(self, input_expr):
            return T.nnet.sigmoid(T.dot(input_expr, self.W) + self.b)

    quagga.processor_type = 'gpu'
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        dim_x, dim_y = self.rng.random_integers(1280, size=2)
        x = [self.rng.rand(batch_size, dim_x).astype(dtype=np.float32) for _ in xrange(max_input_sequence_len)]
        y = [self.rng.rand(batch_size, dim_y).astype(dtype=np.float32) for _ in xrange(max_input_sequence_len)]
        true_labels = self.rng.randint(1, size=(batch_size, 1)).astype(dtype=np.float32)
        W_init = self.get_orthogonal_initializer(dim_x + dim_y, 1)
        b_init = lambda: self.rng.rand(1, 1).astype(dtype=np.float32)

        # Theano model
        state = self.rng.get_state()
        th_x = T.ftensor3()
        th_y = T.ftensor3()
        th_true_labels = T.fmatrix()
        shs_layer = SequentialHorizontalStackLayer()
        smp_layer = SequentialMeanPoolingLayer()
        lr_layer = LogisticRegressionLayer(W_init, lambda: b_init()[0])
        probs = shs_layer.get_output_expr(th_x, th_y)
        probs = lr_layer.get_output_expr(smp_layer.get_output_expr(probs))
        loss = T.mean(T.nnet.binary_crossentropy(probs, th_true_labels))
        grads = T.grad(loss, wrt=[th_x, th_y])
        get_grads = theano.function([th_x, th_y, th_true_labels], grads)
        dL_dx_sequence_th, dL_dy_sequence_th = get_grads(np.dstack(x[:sequence_len]),
                                                         np.dstack(y[:sequence_len]),
                                                         true_labels)

        # quagga model
        self.rng.set_state(state)
        W = Connector(Matrix.from_npa(W_init(), device_id=device_id), device_id)
        b = Connector(Matrix.from_npa(b_init(), device_id=device_id), device_id)
        x = List([Connector(Matrix.from_npa(e), device_id) for e in x])
        y = List([Connector(Matrix.from_npa(e), device_id) for e in y])
        true_labels = Connector(Matrix.from_npa(true_labels))
        shs_block = SequentialHorizontalStackBlock(x, y)
        smp_block = SequentialMeanPoolingBlock(shs_block.output)
        dot_block = DotBlock(W, b, smp_block.output)
        sce_block = SigmoidCeBlock(dot_block.output, true_labels)
        x.length = sequence_len
        y.length = sequence_len
        shs_block.fprop()
        smp_block.fprop()
        dot_block.fprop()
        sce_block.fprop()
        sce_block.bprop()
        dot_block.bprop()
        smp_block.bprop()
        shs_block.bprop()
        dL_dx_sequence = [e.backward_matrix.to_host() for e in x]
        dL_dy_sequence = [e.backward_matrix.to_host() for e in y]

        for i in xrange(dL_dx_sequence_th.shape[-1]):
            if not np.allclose(dL_dx_sequence[i], dL_dx_sequence_th[..., i], atol=1.e-6):
                r.append(False)
                break
        else:
            r.append(True)
        for i in xrange(dL_dy_sequence_th.shape[-1]):
            if not np.allclose(dL_dy_sequence[i], dL_dy_sequence_th[..., i], atol=1.e-6):
                r.append(False)
                break
        else:
            r.append(True)

    self.assertEqual(sum(r), self.N * 2)