def test_fprop(self):
    """
    Check that `NonlinearityBlock.fprop` gives matching outputs on the gpu
    and cpu backends for the 'sigmoid', 'tanh' and 'relu' nonlinearities.
    """
    r = []
    for _ in xrange(self.N):
        n_rows, n_cols = self.rng.random_integers(3000, size=2)
        x = self.rng.rand(n_rows, n_cols).astype(np.float32)
        for nonlinearity in ['sigmoid', 'tanh', 'relu']:
            rng_state = self.rng.get_state()
            outputs = {}
            for processor_type in ['gpu', 'cpu']:
                # every backend starts from the same generator state
                self.rng.set_state(rng_state)
                quagga.processor_type = processor_type
                qx = Connector(Matrix.from_npa(x))
                block = NonlinearityBlock(qx, nonlinearity)
                for node in (qx, block):
                    node.fprop()
                outputs[processor_type] = block.output.to_host()
            r.append(np.allclose(outputs['gpu'], outputs['cpu']))
    self.assertEqual(sum(r), len(r))
def test_fprop_matrix(self):
    """
    Check that `RowSlicingBlock.fprop`, fed a matrix of row indexes, gives
    matching outputs on the gpu and cpu backends.
    """
    r = []
    for iteration in xrange(self.N):
        max_len = self.rng.random_integers(300)
        # the first iteration always uses the full sequence length
        if iteration == 0:
            sequence_len = max_len
        else:
            sequence_len = self.rng.random_integers(max_len)
        embd_dim = self.rng.random_integers(10000)
        batch_size, output_dim = self.rng.random_integers(2000, size=2)
        W = self.get_orthogonal_matrix(embd_dim, output_dim)
        idxs_shape = (batch_size, max_len)
        row_idxs = self.rng.randint(embd_dim, size=idxs_shape).astype(np.int32)

        output = {}
        for processor_type in ['gpu', 'cpu']:
            quagga.processor_type = processor_type
            qrow_idxs = Connector(Matrix.from_npa(row_idxs))
            qW = Connector(Matrix.from_npa(W))
            row_slicing_block = RowSlicingBlock(qW, qrow_idxs)
            qW.fprop()
            # only the first `sequence_len` columns of indexes are used
            qrow_idxs.ncols = sequence_len
            qrow_idxs.fprop()
            row_slicing_block.fprop()
            output[processor_type] = row_slicing_block.output.to_host()

        r.extend(np.allclose(out_gpu, out_cpu)
                 for out_gpu, out_cpu in izip(output['gpu'], output['cpu']))
    self.assertEqual(sum(r), len(r))
def test_theano_fprop_matrix(self):
    """
    Check gpu `RowSlicingBlock.fprop`, fed a matrix of row indexes, against
    the output expression of the Theano `RowSlicingLayer`.
    """
    r = []
    for iteration in xrange(self.N):
        max_len = self.rng.random_integers(300)
        # the first iteration always uses the full sequence length
        if iteration == 0:
            sequence_len = max_len
        else:
            sequence_len = self.rng.random_integers(max_len)
        embd_dim = self.rng.random_integers(10000)
        batch_size = self.rng.random_integers(500)
        output_dim = self.rng.random_integers(2000)
        W = self.get_orthogonal_matrix(embd_dim, output_dim)
        idxs_shape = (batch_size, max_len)
        row_idxs = self.rng.randint(embd_dim, size=idxs_shape).astype(np.int32)

        # quagga model
        quagga.processor_type = 'gpu'
        qrow_idxs = Connector(Matrix.from_npa(row_idxs))
        qW = Connector(Matrix.from_npa(W))
        row_slicing_block = RowSlicingBlock(qW, qrow_idxs)
        qW.fprop()
        qrow_idxs.ncols = sequence_len
        qrow_idxs.fprop()
        row_slicing_block.fprop()
        q_output = row_slicing_block.output.to_host()

        # Theano model
        th_row_idxs = T.imatrix()
        th_expr = RowSlicingLayer(W).get_output_expr(th_row_idxs)
        th_fprop = theano.function([th_row_idxs], th_expr)
        th_output = th_fprop(row_idxs)

        # only the first `sequence_len` entries are compared
        r.extend(np.allclose(q_output[t], th_output[t])
                 for t in xrange(sequence_len))
    self.assertEqual(sum(r), len(r))
def test_bprop_vector(self):
    """
    Check that back-propagating a `SoftmaxCeBlock` loss through a
    `RowSlicingBlock` fed a single column of row indexes leaves matching
    values in `W` on the gpu and cpu backends, once `qW.add` has been
    called with `qW.backward_matrix`.
    """
    r = []
    for _ in xrange(self.N):
        embd_dim = self.rng.random_integers(10000)
        batch_size, output_dim = self.rng.random_integers(2000, size=2)
        W = self.get_orthogonal_matrix(embd_dim, output_dim)
        column_shape = (batch_size, 1)
        row_idxs = self.rng.randint(embd_dim, size=column_shape).astype(np.int32)
        true_labels = self.rng.randint(output_dim, size=column_shape).astype(np.int32)
        device_id = 0

        updated_W = {}
        for processor_type in ['gpu', 'cpu']:
            quagga.processor_type = processor_type
            qrow_idxs = Connector(Matrix.from_npa(row_idxs))
            qtrue_labels = Connector(Matrix.from_npa(true_labels))
            qW = Connector(Matrix.from_npa(W), device_id)
            row_slicing_block = RowSlicingBlock(qW, qrow_idxs)
            sce_block = SoftmaxCeBlock(row_slicing_block.output, qtrue_labels)
            for node in (qW, qrow_idxs, row_slicing_block, sce_block):
                node.fprop()
            for node in (sce_block, row_slicing_block):
                node.bprop()
            qW.add(Context(), qW.backward_matrix)
            updated_W[processor_type] = qW.to_host()

        r.append(np.allclose(updated_W['gpu'], updated_W['cpu']))
    self.assertEqual(sum(r), len(r))
def test_fprop(self):
    """
    Check that `SequentialMeanPoolingBlock.fprop` gives matching outputs on
    the gpu and cpu backends.
    """
    r = []
    for iteration in xrange(self.N):
        max_len = self.rng.random_integers(500)
        # the first iteration always uses the full sequence length
        if iteration == 0:
            sequence_len = max_len
        else:
            sequence_len = self.rng.random_integers(max_len)
        batch_size = self.rng.random_integers(512)
        dim = self.rng.random_integers(1500)
        x = []
        for _ in xrange(max_len):
            x.append(self.rng.rand(batch_size, dim).astype(dtype=np.float32))

        rng_state = self.rng.get_state()
        output = {}
        for processor_type in ['gpu', 'cpu']:
            # every backend starts from the same generator state
            self.rng.set_state(rng_state)
            quagga.processor_type = processor_type
            qx = List([Connector(Matrix.from_npa(e)) for e in x])
            pooling_block = SequentialMeanPoolingBlock(qx)
            qx.set_length(sequence_len)
            pooling_block.fprop()
            output[processor_type] = pooling_block.output.to_host()

        r.append(np.allclose(output['gpu'], output['cpu']))
    self.assertEqual(sum(r), self.N)
def test_fprop(self):
    """
    Check that `SequentialMeanPoolingBlock.fprop` gives matching outputs on
    the gpu and cpu backends.
    """
    r = []
    for iteration in xrange(self.N):
        max_len = self.rng.random_integers(500)
        # the first iteration always uses the full sequence length
        if iteration == 0:
            sequence_len = max_len
        else:
            sequence_len = self.rng.random_integers(max_len)
        batch_size = self.rng.random_integers(512)
        dim = self.rng.random_integers(1500)
        x = []
        for _ in xrange(max_len):
            x.append(self.rng.rand(batch_size, dim).astype(dtype=np.float32))

        rng_state = self.rng.get_state()
        output = {}
        for processor_type in ['gpu', 'cpu']:
            # every backend starts from the same generator state
            self.rng.set_state(rng_state)
            quagga.processor_type = processor_type
            qx = List([Connector(Matrix.from_npa(e)) for e in x])
            pooling_block = SequentialMeanPoolingBlock(qx)
            qx.set_length(sequence_len)
            pooling_block.fprop()
            output[processor_type] = pooling_block.output.to_host()

        r.append(np.allclose(output['gpu'], output['cpu']))
    self.assertEqual(sum(r), self.N)
def test_theano_grad(self):
    """
    Check the gradient that `SseBlock.bprop` writes for `y_hat` on the gpu
    backend against the gradient Theano derives for the loss
    mean(sum((y_hat - y) ** 2, axis=1)).
    """
    quagga.processor_type = 'gpu'

    # Theano model: the symbolic graph does not depend on the sampled data,
    # so it is built and compiled once instead of once per iteration.
    th_y_hat, th_y = T.fmatrix(), T.fmatrix()
    loss = T.mean(T.sum((th_y_hat - th_y) ** 2, axis=1))
    get_theano_grads = theano.function([th_y_hat, th_y],
                                       T.grad(loss, wrt=th_y_hat))

    r = []
    for _ in xrange(self.N):
        batch_size, dim = self.rng.random_integers(2000, size=2)
        y_hat = self.rng.randn(batch_size, dim).astype(dtype=np.float32)
        y = self.rng.randn(batch_size, dim).astype(dtype=np.float32)

        th_dL_dy_hat = get_theano_grads(y_hat, y)

        # quagga model
        context = Context()
        y_hat_gpu = Connector(Matrix.from_npa(y_hat), context, context)
        y_gpu = Connector(Matrix.from_npa(y))
        sse_block = SseBlock(y_hat_gpu, y_gpu)
        sse_block.fprop()
        sse_block.bprop()
        q_dL_dy_hat = y_hat_gpu.backward_matrix.to_host()

        r.append(np.allclose(th_dL_dy_hat, q_dL_dy_hat))
    self.assertEqual(sum(r), self.N)
def test_bprop(self):
    """
    Check that `SseBlock.bprop` writes matching gradients for `y_hat` on
    the gpu and cpu backends.
    """
    r = []
    for _ in xrange(self.N):
        batch_size, dim = self.rng.random_integers(2000, size=2)
        y_hat = self.rng.randn(batch_size, dim).astype(dtype=np.float32)
        y = self.rng.randn(batch_size, dim).astype(dtype=np.float32)

        dL_dy_hat = {}
        for processor_type in ['gpu', 'cpu']:
            quagga.processor_type = processor_type
            context = Context()
            qy_hat = Connector(Matrix.from_npa(y_hat), context, context)
            qy = Connector(Matrix.from_npa(y))
            sse_block = SseBlock(qy_hat, qy)
            sse_block.fprop()
            sse_block.bprop()
            dL_dy_hat[processor_type] = qy_hat.backward_matrix.to_host()

        r.append(np.allclose(dL_dy_hat['gpu'], dL_dy_hat['cpu']))
    self.assertEqual(sum(r), self.N)
def test_bprop(self):
    """
    Check that `RepeatBlock.bprop` writes matching gradients for its input
    on the gpu and cpu backends. The repeated output feeds a
    `SoftmaxCeBlock`, whose `bprop` runs first.
    """
    r = []
    for _ in xrange(self.N):
        repeats = self.rng.random_integers(42)
        axis = self.rng.randint(2)
        input_dim, output_dim = self.rng.random_integers(2000, size=2)
        x = self.get_normal_matrix(input_dim, output_dim)
        # the number of label rows grows with `repeats` only when axis == 0
        if axis:
            n_label_rows = input_dim
        else:
            n_label_rows = input_dim * repeats
        true_labels = self.rng.randint(output_dim, size=(n_label_rows, 1)).astype(np.int32)
        device_id = 0

        dL_dx = {}
        for processor_type in ['gpu', 'cpu']:
            quagga.processor_type = processor_type
            qx = Connector(Matrix.from_npa(x), device_id)
            qtrue_labels = Connector(Matrix.from_npa(true_labels))
            repeat_block = RepeatBlock(qx, repeats, axis)
            sce_block = SoftmaxCeBlock(repeat_block.output, qtrue_labels)
            for node in (qx, qtrue_labels, repeat_block, sce_block):
                node.fprop()
            for node in (sce_block, repeat_block):
                node.bprop()
            dL_dx[processor_type] = qx.backward_matrix.to_host()

        r.append(np.allclose(dL_dx['gpu'], dL_dx['cpu']))
    self.assertEqual(sum(r), len(r))
def test_theano_fprop_vector(self):
    """
    Check gpu `RowSlicingBlock.fprop`, fed a single column of row indexes,
    against the output expression of the Theano `RowSlicingLayer`
    evaluated on the same indexes as a vector.
    """
    r = []
    for _ in xrange(self.N):
        embd_dim = self.rng.random_integers(10000)
        batch_size, output_dim = self.rng.random_integers(2000, size=2)
        W = self.get_orthogonal_matrix(embd_dim, output_dim)
        row_idxs = self.rng.randint(embd_dim, size=(batch_size, 1)).astype(np.int32)

        # quagga model
        quagga.processor_type = 'gpu'
        qrow_idxs = Connector(Matrix.from_npa(row_idxs))
        qW = Connector(Matrix.from_npa(W))
        row_slicing_block = RowSlicingBlock(qW, qrow_idxs)
        for node in (qW, qrow_idxs, row_slicing_block):
            node.fprop()
        q_output = row_slicing_block.output.to_host()

        # Theano model
        trow_idxs = T.ivector()
        t_expr = RowSlicingLayer(W).get_output_expr(trow_idxs)
        t_fprop = theano.function([trow_idxs], t_expr)
        t_output = t_fprop(row_idxs[:, 0])

        r.append(np.allclose(q_output, t_output))
    self.assertEqual(sum(r), len(r))
def test_bprop(self):
    """
    Check that `VerticalStackBlock.bprop` writes matching gradients on the
    gpu and cpu backends for every stacked input that is `bpropagable`.
    The stacked output feeds a `SoftmaxCeBlock`, whose `bprop` runs first.
    """
    r = []
    for trial in xrange(self.N):
        ncols = self.rng.random_integers(1, 3000)
        matrices, device_ids, row_slices = [], [], []
        total_rows = 0
        for _ in xrange(self.rng.random_integers(1, 10)):
            height = self.rng.random_integers(1, 2000)
            first_row, total_rows = total_rows, total_rows + height
            # a matrix is created with a device id (0) or without one (None)
            if self.rng.choice([True, False]):
                device_ids.append(0)
                row_slices.append((first_row, total_rows))
            else:
                device_ids.append(None)
            matrices.append(self.rng.rand(height, ncols).astype(np.float32))
        true_labels = self.rng.randint(ncols, size=(total_rows, 1)).astype(np.int32)

        # no matrix received a device id: nothing to compare
        if not row_slices:
            r.append(True)
            continue

        output = {}
        for processor_type in ['gpu', 'cpu']:
            quagga.processor_type = processor_type
            qmatrices = [Connector(Matrix.from_npa(m), d_id)
                         for m, d_id in izip(matrices, device_ids)]
            qtrue_labels = Connector(Matrix.from_npa(true_labels))
            vstack_block = VerticalStackBlock(*qmatrices)
            sce_block = SoftmaxCeBlock(vstack_block.output, qtrue_labels)

            for node in qmatrices + [qtrue_labels, vstack_block, sce_block]:
                node.fprop()
            for node in (sce_block, vstack_block):
                node.bprop()

            output[processor_type] = [m.backward_matrix.to_host()
                                      for m in qmatrices if m.bpropagable]

        # one verdict per iteration; stops at the first mismatch
        r.append(all(np.allclose(dL_dm_gpu, dL_dm_cpu)
                     for dL_dm_gpu, dL_dm_cpu in izip(output['gpu'], output['cpu'])))
    self.assertEqual(sum(r), self.N)
def test_bprop(self):
    """
    Check that back-propagation through a `SequencerBlock` of `DotBlock`
    feeding a `SequencerBlock` of `SoftmaxCeBlock` writes matching
    gradients on the gpu and cpu backends, for every combination of
    `reverse` and `with_bias`.
    """
    r = []
    for iteration in xrange(self.N):
        max_len = self.rng.random_integers(500)
        # the first iteration always uses the full sequence length
        if iteration == 0:
            sequence_len = max_len
        else:
            sequence_len = self.rng.random_integers(max_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32)
             for _ in xrange(max_len)]
        true_labels = [self.rng.randint(hidden_dim, size=(batch_size, 1)).astype(np.int32)
                       for _ in xrange(max_len)]
        W = self.get_orthogonal_matrix(input_dim, hidden_dim)
        b = self.rng.rand(1, hidden_dim).astype(np.float32)
        device_id = 0

        quagga_grads = {}
        for reverse in [False, True]:
            for with_bias in [False, True]:
                for processor_type in ['gpu', 'cpu']:
                    quagga.processor_type = processor_type
                    qx = List([Connector(Matrix.from_npa(e), device_id) for e in x])
                    qtrue_labels = List([Connector(Matrix.from_npa(e)) for e in true_labels],
                                        len(qx))
                    qW = Connector(Matrix.from_npa(W), device_id)
                    qb = Connector(Matrix.from_npa(b), device_id) if with_bias else None
                    seq_dot_block = SequencerBlock(block_class=DotBlock,
                                                   params=[qW, qb],
                                                   sequences=[qx],
                                                   output_names=['output'],
                                                   reverse=reverse)
                    seq_sce_block = SequencerBlock(block_class=SoftmaxCeBlock,
                                                   params=[],
                                                   sequences=[seq_dot_block.output, qtrue_labels],
                                                   reverse=reverse)
                    qx.length = sequence_len
                    qx.fprop()
                    qtrue_labels.fprop()
                    qW.fprop()
                    if qb:
                        qb.fprop()
                    seq_dot_block.fprop()
                    seq_sce_block.fprop()
                    seq_sce_block.bprop()
                    seq_dot_block.bprop()

                    # gradient order: W, then b (if any), then each input
                    grads = [qW.backward_matrix.to_host()]
                    if with_bias:
                        grads.append(qb.backward_matrix.to_host())
                    grads.extend(e.backward_matrix.to_host() for e in qx)
                    quagga_grads[processor_type] = grads

                r.extend(np.allclose(grad_gpu, grad_cpu, atol=1e-5)
                         for grad_gpu, grad_cpu in izip(quagga_grads['gpu'], quagga_grads['cpu']))
    self.assertEqual(sum(r), len(r))
def test_fprop(self):
    """
    Check that a `SequencerBlock` of `DotBlock` gives matching `fprop`
    output sequences on the gpu and cpu backends, for every combination
    of `reverse` and `with_bias`.
    """
    r = []
    for iteration in xrange(self.N):
        max_len = self.rng.random_integers(500)
        # the first iteration always uses the full sequence length
        if iteration == 0:
            sequence_len = max_len
        else:
            sequence_len = self.rng.random_integers(max_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32)
             for _ in xrange(max_len)]
        W = self.get_orthogonal_matrix(input_dim, hidden_dim)
        b = self.rng.rand(1, hidden_dim).astype(np.float32)

        # NOTE(review): the CUDA device is hard-coded to 1 here -- confirm
        # this is intended and not a debugging leftover.
        from quagga.cuda import cudart
        cudart.cuda_set_device(1)

        qoutput = {}
        for reverse in [False, True]:
            for with_bias in [False, True]:
                for processor_type in ['gpu', 'cpu']:
                    quagga.processor_type = processor_type
                    qx = List([Connector(Matrix.from_npa(e)) for e in x])
                    qW = Connector(Matrix.from_npa(W))
                    qb = Connector(Matrix.from_npa(b)) if with_bias else None
                    seq_dot_block = SequencerBlock(block_class=DotBlock,
                                                   params=[qW, qb],
                                                   sequences=[qx],
                                                   output_names=['output'],
                                                   reverse=reverse)
                    qx.length = sequence_len
                    qx.fprop()
                    qW.fprop()
                    if qb:
                        qb.fprop()
                    seq_dot_block.fprop()
                    qoutput[processor_type] = seq_dot_block.output.to_host()

                # one verdict per configuration; stops at the first mismatch
                r.append(all(np.allclose(output_gpu, output_cpu, atol=1e-5)
                             for output_gpu, output_cpu in izip(qoutput['gpu'], qoutput['cpu'])))
    self.assertEqual(sum(r), len(r))
def test_fprop(self):
    """
    Check that `SequentialHorizontalStackBlock.fprop` gives matching output
    sequences on the gpu and cpu backends.
    """
    r = []
    for iteration in xrange(self.N):
        max_len = self.rng.random_integers(500)
        # the first iteration always uses the full sequence length;
        # `random_integers(max_len)` draws from [1, max_len], so the length
        # is never 0 and no empty-sequence special case is needed
        if iteration == 0:
            sequence_len = max_len
        else:
            sequence_len = self.rng.random_integers(max_len)
        batch_size = self.rng.random_integers(512)
        dim_x, dim_y = self.rng.random_integers(1500, size=2)
        x = [self.rng.rand(batch_size, dim_x).astype(dtype=np.float32)
             for _ in xrange(max_len)]
        y = [self.rng.rand(batch_size, dim_y).astype(dtype=np.float32)
             for _ in xrange(max_len)]

        rng_state = self.rng.get_state()
        output_sequence = {}
        for processor_type in ['gpu', 'cpu']:
            # every backend starts from the same generator state
            self.rng.set_state(rng_state)
            quagga.processor_type = processor_type
            qx = List([Connector(Matrix.from_npa(e)) for e in x])
            qy = List([Connector(Matrix.from_npa(e)) for e in y])
            seq_hstack_block = SequentialHorizontalStackBlock(qx, qy)
            qx.length = sequence_len
            qy.length = sequence_len
            seq_hstack_block.fprop()
            output_sequence[processor_type] = seq_hstack_block.output.to_host()

        # one verdict per iteration; stops at the first mismatch
        r.append(all(np.allclose(out_gpu, out_cpu)
                     for out_gpu, out_cpu in izip(output_sequence['gpu'],
                                                  output_sequence['cpu'])))
    self.assertEqual(sum(r), self.N)
def test_theano_grad(self):
    """
    Check the gradient that `SigmoidCeBlock.bprop` writes for a
    single-column input on the gpu backend against the gradient Theano
    derives for the mean binary cross-entropy of sigmoid(x), with and
    without a mask.
    """
    quagga.processor_type = 'gpu'
    r = []
    for _ in xrange(self.N):
        batch_size = self.rng.random_integers(2000)
        column_shape = (batch_size, 1)
        true_labels = self.rng.randint(2, size=column_shape).astype(dtype=np.float32)
        mask = (self.rng.rand(batch_size, 1) < 0.8).astype(np.float32)
        x = self.rng.randn(batch_size, 1).astype(dtype=np.float32)
        device_id = 0

        for with_mask in [False, True]:
            # Theano model
            th_x = T.fmatrix()
            th_mask = T.fmatrix()
            th_true_labels = T.fmatrix()
            th_inputs = [th_x, th_true_labels]
            th_values = [x, true_labels]
            th_pre_activation = th_x
            if with_mask:
                th_pre_activation = th_mask * th_x
                th_inputs.append(th_mask)
                th_values.append(mask)
            probs = T.nnet.sigmoid(th_pre_activation)
            loss = T.mean(T.nnet.binary_crossentropy(probs, th_true_labels))
            get_theano_grads = theano.function(th_inputs, T.grad(loss, wrt=th_x))
            th_dL_dx = get_theano_grads(*th_values)

            # quagga model
            x_gpu = Connector(Matrix.from_npa(x), device_id)
            true_labels_gpu = Connector(Matrix.from_npa(true_labels))
            mask_gpu = Connector(Matrix.from_npa(mask)) if with_mask else None
            sigmoid_ce_block = SigmoidCeBlock(x_gpu, true_labels_gpu, mask_gpu)
            x_gpu.fprop()
            true_labels_gpu.fprop()
            if with_mask:
                mask_gpu.fprop()
            sigmoid_ce_block.fprop()
            sigmoid_ce_block.bprop()
            q_dL_dx = x_gpu.backward_matrix.to_host()

            r.append(np.allclose(th_dL_dx, q_dL_dx))
    self.assertEqual(sum(r), len(r))
def test_bprop(self):
    """
    Check that `DotBlock.bprop` writes matching gradients for `x`, `W` and
    (when present) `b` on the gpu and cpu backends. Both backends are fed
    the same random `dL_doutput`.
    """
    r = []
    for trial in xrange(self.N):
        batch_size, x_dim, output_dim = self.rng.random_integers(2000, size=3)
        x = self.rng.rand(batch_size, x_dim).astype(np.float32)
        W = self.get_orthogonal_matrix(x_dim, output_dim)
        # the bias is present in a random subset of the iterations
        if self.rng.randint(2):
            b = self.rng.rand(1, output_dim).astype(np.float32)
        else:
            b = None
        device_id = 0

        rng_state = self.rng.get_state()
        grads = {}
        for processor_type in ['gpu', 'cpu']:
            # rewind the generator so that both backends draw the same
            # random `dL_doutput`
            self.rng.set_state(rng_state)
            quagga.processor_type = processor_type
            context = Context()
            qx = Connector(Matrix.from_npa(x), device_id)
            qW = Connector(Matrix.from_npa(W), device_id)
            qb = Connector(Matrix.from_npa(b), device_id) if b is not None else b
            dot_block = DotBlock(qW, qb, qx)
            qx.fprop()
            qW.fprop()
            if qb:
                qb.fprop()
            dot_block.fprop()

            _, dL_doutput = dot_block.output.register_usage(device_id, device_id)
            random_matrix = self.rng.rand(dL_doutput.nrows, dL_doutput.ncols)
            dL_doutput.assign(context, Matrix.from_npa(random_matrix, 'float'))
            dot_block.bprop()

            backend_grads = {}
            if b is not None:
                backend_grads['b'] = qb.backward_matrix.to_host()
            backend_grads['W'] = qW.backward_matrix.to_host()
            backend_grads['x'] = qx.backward_matrix.to_host()
            grads[processor_type] = backend_grads

        compared = ['x', 'W']
        if b is not None:
            compared.append('b')
        for name in compared:
            r.append(np.allclose(grads['gpu'][name], grads['cpu'][name], atol=1e-5))
    self.assertEqual(sum(r), len(r))
def test_theano_fprop(self):
    """
    Check the gpu `fprop` output of a `SequencerBlock` of `DotBlock`
    against the output expression of the Theano `SequentialDotLayer`, for
    every combination of `reverse` and `with_bias`.
    """
    quagga.processor_type = 'gpu'
    r = []
    for iteration in xrange(self.N):
        max_len = self.rng.random_integers(500)
        # the first iteration always uses the full sequence length
        if iteration == 0:
            sequence_len = max_len
        else:
            sequence_len = self.rng.random_integers(max_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32)
             for _ in xrange(max_len)]
        W = self.get_orthogonal_matrix(input_dim, hidden_dim)
        b = self.rng.rand(1, hidden_dim).astype(np.float32)

        for reverse in [False, True]:
            for with_bias in [False, True]:
                # quagga model
                qx = List([Connector(Matrix.from_npa(e)) for e in x])
                qW = Connector(Matrix.from_npa(W))
                qb = Connector(Matrix.from_npa(b)) if with_bias else None
                seq_dot_block = SequencerBlock(block_class=DotBlock,
                                               params=[qW, qb],
                                               sequences=[qx],
                                               output_names=['output'],
                                               reverse=reverse)
                qx.length = sequence_len
                qx.fprop()
                qW.fprop()
                if qb:
                    qb.fprop()
                seq_dot_block.fprop()
                qoutput = seq_dot_block.output.to_host()

                # Theano model
                seq_dot_layer = SequentialDotLayer(W, b if with_bias else None, reverse)
                th_x = T.ftensor3()
                get_th_output = theano.function([th_x], seq_dot_layer.get_output_expr(th_x))
                th_output = get_th_output(np.dstack(x[:sequence_len]))

                # one verdict per configuration; stops at the first mismatch
                r.append(all(np.allclose(qoutput[t], th_output[t])
                             for t in xrange(th_output.shape[0])))
    self.assertEqual(sum(r), len(r))
def test_theano_grad(self):
    """
    Check the gradient that `SoftmaxCeBlock.bprop` writes for its input on
    the gpu backend against the gradient Theano derives for the mean
    categorical cross-entropy of softmax(x). Covers one-hot float labels
    (`sparse` is True) and integer class labels, with and without a mask.
    """
    quagga.processor_type = 'gpu'
    r = []
    for _ in xrange(self.N):
        for sparse in [True, False]:
            batch_size, dim = self.rng.random_integers(2000, size=2)
            if sparse:
                # one-hot rows: a single 1.0 per row at a random column
                true_labels = np.zeros((batch_size, dim), np.float32)
                hot_columns = self.rng.randint(dim, size=batch_size)
                true_labels[np.arange(batch_size), hot_columns] = 1.0
            else:
                true_labels = self.rng.randint(dim, size=(batch_size, 1)).astype(np.int32)
            x = self.rng.randn(batch_size, dim).astype(np.float32)
            mask = (self.rng.rand(batch_size, 1) < 0.8).astype(np.float32)
            device_id = 0

            # Theano takes the integer labels as a vector
            th_labels_value = true_labels if sparse else true_labels[:, 0]

            for with_mask in [False, True]:
                # Theano model
                th_x = T.fmatrix()
                th_mask = T.fcol()
                th_true_labels = T.fmatrix() if sparse else T.ivector()
                th_inputs = [th_x, th_true_labels]
                th_values = [x, th_labels_value]
                th_pre_activation = th_x
                if with_mask:
                    th_pre_activation = th_mask * th_x
                    th_inputs.append(th_mask)
                    th_values.append(mask)
                probs = T.nnet.softmax(th_pre_activation)
                loss = T.mean(T.nnet.categorical_crossentropy(probs, th_true_labels))
                get_theano_grads = theano.function(th_inputs, T.grad(loss, wrt=th_x))
                th_dL_dx = get_theano_grads(*th_values)

                # quagga model
                x_gpu = Connector(Matrix.from_npa(x), device_id)
                true_labels_gpu = Connector(Matrix.from_npa(true_labels))
                mask_gpu = Connector(Matrix.from_npa(mask)) if with_mask else None
                softmax_ce_block = SoftmaxCeBlock(x_gpu, true_labels_gpu, mask_gpu)
                x_gpu.fprop()
                true_labels_gpu.fprop()
                if with_mask:
                    mask_gpu.fprop()
                softmax_ce_block.fprop()
                softmax_ce_block.bprop()
                q_dL_dx = x_gpu.backward_matrix.to_host()

                r.append(np.allclose(th_dL_dx, q_dL_dx))
    self.assertEqual(sum(r), len(r))
def test_bprop(self):
    """
    Check that `HorizontalStackBlock.bprop` writes matching gradients on
    the gpu and cpu backends for every stacked input that is
    `bpropagable`. The stacked output feeds a `SoftmaxCeBlock`, whose
    `bprop` runs first.
    """
    r = []
    for trial in xrange(self.N):
        nrows = self.rng.random_integers(1, 3000)
        matrices, device_ids, col_slices = [], [], []
        total_cols = 0
        for _ in xrange(self.rng.random_integers(1, 10)):
            width = self.rng.random_integers(1, 2000)
            first_col, total_cols = total_cols, total_cols + width
            # a matrix is created with a device id (0) or without one (None)
            if self.rng.choice([True, False]):
                device_ids.append(0)
                col_slices.append((first_col, total_cols))
            else:
                device_ids.append(None)
            matrices.append(self.rng.rand(nrows, width).astype(np.float32))
        true_labels = self.rng.randint(total_cols, size=(nrows, 1)).astype(np.int32)

        # no matrix received a device id: nothing to compare
        if not col_slices:
            r.append(True)
            continue

        output = {}
        for processor_type in ['gpu', 'cpu']:
            quagga.processor_type = processor_type
            qmatrices = [Connector(Matrix.from_npa(m), d_id)
                         for m, d_id in izip(matrices, device_ids)]
            qtrue_labels = Connector(Matrix.from_npa(true_labels))
            hstack_block = HorizontalStackBlock(*qmatrices)
            sce_block = SoftmaxCeBlock(hstack_block.output, qtrue_labels)

            for node in qmatrices + [qtrue_labels, hstack_block, sce_block]:
                node.fprop()
            for node in (sce_block, hstack_block):
                node.bprop()

            output[processor_type] = [m.backward_matrix.to_host()
                                      for m in qmatrices if m.bpropagable]

        # one verdict per iteration; stops at the first mismatch
        r.append(all(np.allclose(dL_dm_gpu, dL_dm_cpu)
                     for dL_dm_gpu, dL_dm_cpu in izip(output['gpu'], output['cpu'])))
    self.assertEqual(sum(r), self.N)
def test_fprop(self):
    """
    Check that a `SequencerBlock` of `DotBlock` gives matching `fprop`
    output sequences on the gpu and cpu backends, for every combination
    of `reverse` and `with_bias`.
    """
    r = []
    for iteration in xrange(self.N):
        max_len = self.rng.random_integers(500)
        # the first iteration always uses the full sequence length
        if iteration == 0:
            sequence_len = max_len
        else:
            sequence_len = self.rng.random_integers(max_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32)
             for _ in xrange(max_len)]
        W = self.get_orthogonal_matrix(input_dim, hidden_dim)
        b = self.rng.rand(1, hidden_dim).astype(np.float32)

        # NOTE(review): the CUDA device is hard-coded to 1 here -- confirm
        # this is intended and not a debugging leftover.
        from quagga.cuda import cudart
        cudart.cuda_set_device(1)

        qoutput = {}
        for reverse in [False, True]:
            for with_bias in [False, True]:
                for processor_type in ['gpu', 'cpu']:
                    quagga.processor_type = processor_type
                    qx = List([Connector(Matrix.from_npa(e)) for e in x])
                    qW = Connector(Matrix.from_npa(W))
                    qb = Connector(Matrix.from_npa(b)) if with_bias else None
                    seq_dot_block = SequencerBlock(block_class=DotBlock,
                                                   params=[qW, qb],
                                                   sequences=[qx],
                                                   output_names=['output'],
                                                   reverse=reverse)
                    qx.length = sequence_len
                    qx.fprop()
                    qW.fprop()
                    if qb:
                        qb.fprop()
                    seq_dot_block.fprop()
                    qoutput[processor_type] = seq_dot_block.output.to_host()

                # one verdict per configuration; stops at the first mismatch
                r.append(all(np.allclose(output_gpu, output_cpu, atol=1e-5)
                             for output_gpu, output_cpu in izip(qoutput['gpu'], qoutput['cpu'])))
    self.assertEqual(sum(r), len(r))
def test_theano_grad(self):
    """
    Check the gradient that `SigmoidCeBlock.bprop` writes for a matrix
    input on the gpu backend against the gradient Theano derives for the
    sigmoid cross-entropy summed over columns and averaged over rows,
    with and without a mask.
    """
    quagga.processor_type = 'gpu'
    r = []
    for _ in xrange(self.N):
        batch_size, dim = self.rng.random_integers(2000, size=2)
        true_labels = self.rng.randint(2, size=(batch_size, dim)).astype(dtype=np.float32)
        mask = (self.rng.rand(batch_size, 1) < 0.8).astype(np.float32)
        x = self.rng.randn(batch_size, dim).astype(dtype=np.float32)
        device_id = 0

        for with_mask in [False, True]:
            # Theano model
            th_x = T.fmatrix()
            th_mask = T.fmatrix()
            th_true_labels = T.fmatrix()
            th_inputs = [th_x, th_true_labels]
            th_values = [x, true_labels]
            th_pre_activation = th_x
            if with_mask:
                # mark the mask's second axis as broadcastable before the
                # element-wise product with `th_x`
                rebroadcast = theano.compile.ops.Rebroadcast((0, False), (1, True))
                th_pre_activation = rebroadcast(th_mask) * th_x
                th_inputs.append(th_mask)
                th_values.append(mask)
            probs = T.nnet.sigmoid(th_pre_activation)
            loss = - th_true_labels * T.log(probs) - \
                (1.0 - th_true_labels) * T.log(1.0 - probs)
            loss = T.sum(loss, axis=1).mean()
            get_theano_grads = theano.function(th_inputs, T.grad(loss, wrt=th_x))
            th_dL_dx = get_theano_grads(*th_values)

            # quagga model
            x_gpu = Connector(Matrix.from_npa(x), device_id)
            true_labels_gpu = Connector(Matrix.from_npa(true_labels))
            mask_gpu = Connector(Matrix.from_npa(mask)) if with_mask else None
            sigmoid_ce_block = SigmoidCeBlock(x_gpu, true_labels_gpu, mask_gpu)
            x_gpu.fprop()
            true_labels_gpu.fprop()
            if with_mask:
                mask_gpu.fprop()
            sigmoid_ce_block.fprop()
            sigmoid_ce_block.bprop()
            q_dL_dx = x_gpu.backward_matrix.to_host()

            r.append(np.allclose(th_dL_dx, q_dL_dx))
    self.assertEqual(sum(r), len(r))
def test_bprop(self):
    """
    Check that `NonlinearityBlock.bprop` writes matching gradients for its
    input on the gpu and cpu backends for the 'sigmoid', 'tanh' and 'relu'
    nonlinearities. Both backends are fed the same random `dL_doutput`.
    """
    r = []
    for trial in xrange(self.N):
        n_rows, n_cols = self.rng.random_integers(3000, size=2)
        x = self.rng.rand(n_rows, n_cols).astype(np.float32)
        device_id = 0
        for nonlinearity in ['sigmoid', 'tanh', 'relu']:
            rng_state = self.rng.get_state()
            dL_dx = {}
            for processor_type in ['gpu', 'cpu']:
                # rewind the generator so that both backends draw the same
                # random `dL_doutput`
                self.rng.set_state(rng_state)
                quagga.processor_type = processor_type
                qx = Connector(Matrix.from_npa(x), device_id)
                block = NonlinearityBlock(qx, nonlinearity)
                qx.fprop()
                block.fprop()
                _, dL_doutput = block.output.register_usage(device_id, device_id)
                random_matrix = self.rng.rand(dL_doutput.nrows, dL_doutput.ncols)
                dL_doutput.assign(Context(), Matrix.from_npa(random_matrix, 'float'))
                block.bprop()
                dL_dx[processor_type] = qx.backward_matrix.to_host()
            r.append(np.allclose(dL_dx['gpu'], dL_dx['cpu']))
    self.assertEqual(sum(r), len(r))
def test_fprop(self):
    """
    Check that `SequentialHorizontalStackBlock.fprop` gives matching output
    sequences on the gpu and cpu backends.
    """
    r = []
    for iteration in xrange(self.N):
        max_len = self.rng.random_integers(500)
        # the first iteration always uses the full sequence length;
        # `random_integers(max_len)` draws from [1, max_len], so the length
        # is never 0 and no empty-sequence special case is needed
        if iteration == 0:
            sequence_len = max_len
        else:
            sequence_len = self.rng.random_integers(max_len)
        batch_size = self.rng.random_integers(512)
        dim_x, dim_y = self.rng.random_integers(1500, size=2)
        x = [self.rng.rand(batch_size, dim_x).astype(dtype=np.float32)
             for _ in xrange(max_len)]
        y = [self.rng.rand(batch_size, dim_y).astype(dtype=np.float32)
             for _ in xrange(max_len)]

        rng_state = self.rng.get_state()
        output_sequence = {}
        for processor_type in ['gpu', 'cpu']:
            # every backend starts from the same generator state
            self.rng.set_state(rng_state)
            quagga.processor_type = processor_type
            qx = List([Connector(Matrix.from_npa(e)) for e in x])
            qy = List([Connector(Matrix.from_npa(e)) for e in y])
            seq_hstack_block = SequentialHorizontalStackBlock(qx, qy)
            qx.length = sequence_len
            qy.length = sequence_len
            seq_hstack_block.fprop()
            output_sequence[processor_type] = seq_hstack_block.output.to_host()

        # one verdict per iteration; stops at the first mismatch
        r.append(all(np.allclose(out_gpu, out_cpu)
                     for out_gpu, out_cpu in izip(output_sequence['gpu'],
                                                  output_sequence['cpu'])))
    self.assertEqual(sum(r), self.N)
def test_theano_bprop_matrix(self):
    """
    Check the value of `W` after back-propagating a sequence of
    `SoftmaxCeBlock` losses through a gpu `RowSlicingBlock` and calling
    `qW.add` with `qW.backward_matrix`, against the Theano
    `RowSlicingLayer` whose `W` is updated with `W + dL_dW`.
    """
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(300)
        if i == 0:
            # the first iteration always uses the full sequence length
            sequence_len = max_input_sequence_len
        else:
            # `random_integers(low, high)` raises ValueError when
            # low > high, which happened whenever the maximum drawn above
            # was 1; clamp the lower bound so that case yields length 1.
            # NOTE(review): the reason for the lower bound of 2 is not
            # visible here -- confirm length 1 is acceptable.
            min_sequence_len = min(2, max_input_sequence_len)
            sequence_len = self.rng.random_integers(min_sequence_len,
                                                    max_input_sequence_len)
        embd_dim = self.rng.random_integers(10000)
        batch_size = self.rng.random_integers(500)
        output_dim = self.rng.random_integers(2000)
        W = self.get_orthogonal_matrix(embd_dim, output_dim)
        row_idxs = self.rng.randint(embd_dim, size=(batch_size, max_input_sequence_len)).astype(np.int32)
        true_labels = [self.rng.randint(output_dim, size=(batch_size, 1)).astype(np.int32)
                       for _ in xrange(max_input_sequence_len)]
        device_id = 0

        # quagga model
        quagga.processor_type = 'gpu'
        qrow_idxs = Connector(Matrix.from_npa(row_idxs))
        qtrue_labels = List([Connector(Matrix.from_npa(e)) for e in true_labels],
                            qrow_idxs.ncols)
        qW = Connector(Matrix.from_npa(W), device_id)
        row_slicing_block = RowSlicingBlock(qW, qrow_idxs)
        seq_sce_block = SequencerBlock(block_class=SoftmaxCeBlock,
                                       params=[],
                                       sequences=[row_slicing_block.output, qtrue_labels])
        qW.fprop()
        # only the first `sequence_len` columns of indexes are used
        qrow_idxs.ncols = sequence_len
        qrow_idxs.fprop()
        row_slicing_block.fprop()
        seq_sce_block.fprop()
        seq_sce_block.bprop()
        row_slicing_block.bprop()
        qW.add(Context(), qW.backward_matrix)

        # Theano model
        th_row_idxs = T.imatrix()
        th_true_labels = T.imatrix()
        row_slicing_layer = RowSlicingLayer(W)
        toutput = row_slicing_layer.get_output_expr(th_row_idxs)
        loss = SequentialSoftmaxLayer.get_loss(toutput, th_true_labels)
        dL_dW = T.grad(loss, row_slicing_layer.W)
        fun = theano.function([th_row_idxs, th_true_labels],
                              updates=[(row_slicing_layer.W, row_slicing_layer.W + dL_dW)])
        fun(row_idxs, np.hstack(true_labels[:sequence_len]))

        r.append(np.allclose(qW.to_host(), row_slicing_layer.W.get_value(), atol=1e-5))
    self.assertEqual(sum(r), len(r))
def test_theano_fprop(self):
    """
    Check the gpu `fprop` output of a `SequencerBlock` of `DotBlock`
    against the output expression of the Theano `SequentialDotLayer`, for
    every combination of `reverse` and `with_bias`.
    """
    quagga.processor_type = 'gpu'
    r = []
    for iteration in xrange(self.N):
        max_len = self.rng.random_integers(500)
        # the first iteration always uses the full sequence length
        if iteration == 0:
            sequence_len = max_len
        else:
            sequence_len = self.rng.random_integers(max_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32)
             for _ in xrange(max_len)]
        W = self.get_orthogonal_matrix(input_dim, hidden_dim)
        b = self.rng.rand(1, hidden_dim).astype(np.float32)

        for reverse in [False, True]:
            for with_bias in [False, True]:
                # quagga model
                qx = List([Connector(Matrix.from_npa(e)) for e in x])
                qW = Connector(Matrix.from_npa(W))
                qb = Connector(Matrix.from_npa(b)) if with_bias else None
                seq_dot_block = SequencerBlock(block_class=DotBlock,
                                               params=[qW, qb],
                                               sequences=[qx],
                                               output_names=['output'],
                                               reverse=reverse)
                qx.length = sequence_len
                qx.fprop()
                qW.fprop()
                if qb:
                    qb.fprop()
                seq_dot_block.fprop()
                qoutput = seq_dot_block.output.to_host()

                # Theano model
                seq_dot_layer = SequentialDotLayer(W, b if with_bias else None, reverse)
                th_x = T.ftensor3()
                get_th_output = theano.function([th_x], seq_dot_layer.get_output_expr(th_x))
                th_output = get_th_output(np.dstack(x[:sequence_len]))

                # one verdict per configuration; stops at the first mismatch
                r.append(all(np.allclose(qoutput[t], th_output[t])
                             for t in xrange(th_output.shape[0])))
    self.assertEqual(sum(r), len(r))
def __init__(self, **kwargs):
    """
    build one `Connector` per keyword argument

    each value is a definition indexed by 'device_id' and 'init'
    (a callable whose result is passed to `Matrix.from_npa`) and, optionally,
    'trainable'. A definition counts as trainable unless it carries
    a falsy 'trainable' entry.

    every connector is stored in `self.parameters`; trainable ones are
    created with the device id and also stored in
    `self.trainable_parameters`
    """
    self.parameters = {}
    self.trainable_parameters = {}
    for name, definition in kwargs.iteritems():
        device_id = definition['device_id']
        initial_value = definition['init']()
        matrix = Matrix.from_npa(initial_value, device_id=device_id)
        if 'trainable' in definition and not definition['trainable']:
            # explicitly frozen parameter: connector without a device id
            param = Connector(matrix)
        else:
            param = Connector(matrix, device_id)
            self.trainable_parameters[name] = param
        self.parameters[name] = param
def test_bprop(self):
    """
    compare `bprop` results for cpu and gpu backends
    """
    r = []
    for _ in xrange(self.N):
        batch_size, x_dim = self.rng.random_integers(3000, size=2)
        x = self.rng.rand(batch_size, x_dim).astype(np.float32)
        device_id = 0

        for nonlinearity in ['sigmoid', 'tanh', 'relu']:
            state = self.rng.get_state()
            dL_dx = {}
            for processor_type in ['gpu', 'cpu']:
                # rewind the generator so both backends receive the same
                # random dL/doutput
                self.rng.set_state(state)
                quagga.processor_type = processor_type
                qx = Connector(Matrix.from_npa(x), device_id)
                nonlinearity_block = NonlinearityBlock(qx, nonlinearity)
                qx.fprop()
                nonlinearity_block.fprop()
                _, dL_doutput = nonlinearity_block.output.register_usage(device_id, device_id)
                random_matrix = self.rng.rand(dL_doutput.nrows, dL_doutput.ncols)
                dL_doutput.assign(Context(), Matrix.from_npa(random_matrix, 'float'))
                nonlinearity_block.bprop()
                dL_dx[processor_type] = qx.backward_matrix.to_host()

            r.append(np.allclose(dL_dx['gpu'], dL_dx['cpu']))

    self.assertEqual(sum(r), len(r))
def __init__(self, ptb_train, ptb_valid, batch_size, sentence_max_len, device_id):
    """
    create the offset generators, the device matrices and the connectors
    used to serve train/validation sentence batches

    :param ptb_train: training data handed to `HomogeneousDataGenerator`
    :param ptb_valid: validation data handed to `HomogeneousDataGenerator`
    :param batch_size: number of rows of the sentence batch
    :param sentence_max_len: number of columns of the sentence batch
    :param device_id: passed to `Context`; afterwards the id reported by
                      the created context is used for all allocations
    """
    self.blocking_contexts = None
    self.training_mode = True
    self.context = Context(device_id)
    device_id = self.context.device_id

    # the train generator is randomized and infinite, the valid one is not
    self.train_offsets = HomogeneousDataGenerator(ptb_train, batch_size, sentence_max_len,
                                                  randomize=True, infinite=True)
    self.valid_offsets = HomogeneousDataGenerator(ptb_valid, batch_size, sentence_max_len)

    # flattened sentences are wrapped into a list, giving a 1 x n array
    self.train_sents = Matrix.from_npa(np.array([self.train_offsets.flatten_sentences]),
                                       'int', device_id)
    self.valid_sents = Matrix.from_npa(np.array([self.valid_offsets.flatten_sentences]),
                                       'int', device_id)

    self._sent_lengths = np.empty((batch_size, 1), dtype=np.int32, order='F')[...]
    self.sent_lengths = Matrix.from_npa(self._sent_lengths, device_id=device_id)

    batch_matrix = Matrix.empty(batch_size, sentence_max_len, 'int', device_id)
    self.sentence_batch = Connector(batch_matrix, self.context)
    self.sentence_batch.sync_fill(0)

    # one connector per column of the mask matrix
    self._mask = Matrix.empty(batch_matrix.nrows, self.sentence_batch.ncols, 'float', device_id)
    mask_columns = []
    for column in xrange(sentence_max_len):
        mask_columns.append(Connector(self._mask[:, column]))
    self.mask = List(mask_columns, self.sentence_batch.ncols)

    self.train_offsets_iterator = iter(self.train_offsets)
    self.valid_offsets_iterator = iter(self.valid_offsets)
def test_bprop(self):
    """
    compare `bprop` results for cpu and gpu backends
    """
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        if i == 0:
            sequence_len = max_input_sequence_len
        else:
            sequence_len = self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(512)
        dim = self.rng.random_integers(1500)
        x = [self.rng.rand(batch_size, dim).astype(dtype=np.float32)
             for _ in xrange(max_input_sequence_len)]

        state = self.rng.get_state()
        dL_dmatrices = {}
        for processor_type in ['gpu', 'cpu']:
            # rewind the generator so both backends receive the same
            # random dL/doutput
            self.rng.set_state(state)
            quagga.processor_type = processor_type
            context = Context()
            qx = List([Connector(Matrix.from_npa(e), context, context) for e in x])
            smean_pooling_block = SequentialMeanPoolingBlock(qx)
            qx.set_length(sequence_len)
            _, dL_doutput = smean_pooling_block.output.register_usage(context, context)
            smean_pooling_block.fprop()
            random_matrix = self.rng.rand(dL_doutput.nrows, dL_doutput.ncols)
            Matrix.from_npa(random_matrix, 'float').copy_to(context, dL_doutput)
            smean_pooling_block.bprop()
            dL_dmatrices[processor_type] = [e.backward_matrix.to_host() for e in qx]

        # one verdict per iteration; stops at the first mismatching pair
        r.append(all(np.allclose(dL_dmatrix_gpu, dL_dmatrix_cpu)
                     for dL_dmatrix_gpu, dL_dmatrix_cpu
                     in izip(dL_dmatrices['gpu'], dL_dmatrices['cpu'])))

    self.assertEqual(sum(r), self.N)
def test_theano_bprop_vector(self):
    """
    compare the weight update produced by `bprop` of the gpu backend
    (`RowSlicingBlock` followed by `SoftmaxCeBlock`) with the update
    theano computes for `RowSlicingLayer` + `SoftmaxLayer`, using a single
    column of row indexes
    """
    r = []
    for _ in xrange(self.N):
        embd_dim = self.rng.random_integers(10000)
        batch_size, output_dim = self.rng.random_integers(2000, size=2)
        W = self.get_orthogonal_matrix(embd_dim, output_dim)
        row_idxs = self.rng.randint(embd_dim, size=(batch_size, 1)).astype(np.int32)
        true_labels = self.rng.randint(output_dim, size=(batch_size, 1)).astype(np.int32)
        device_id = 0

        # quagga side
        quagga.processor_type = 'gpu'
        qrow_idxs = Connector(Matrix.from_npa(row_idxs))
        qW = Connector(Matrix.from_npa(W), device_id)
        qtrue_labels = Connector(Matrix.from_npa(true_labels))
        row_slicing_block = RowSlicingBlock(qW, qrow_idxs)
        sce_block = SoftmaxCeBlock(row_slicing_block.output, qtrue_labels)
        for connector in (qtrue_labels, qW, qrow_idxs):
            connector.fprop()
        for block in (row_slicing_block, sce_block):
            block.fprop()
        for block in (sce_block, row_slicing_block):
            block.bprop()
        # W <- W + dL/dW, mirrors the theano `updates` below
        qW.add(Context(), qW.backward_matrix)

        # theano side
        th_row_idxs = T.ivector()
        th_true_labels = T.ivector()
        row_slicing_layer = RowSlicingLayer(W)
        th_W = row_slicing_layer.W
        toutput = row_slicing_layer.get_output_expr(th_row_idxs)
        loss = SoftmaxLayer.get_loss(toutput, th_true_labels)
        dL_dW = T.grad(loss, th_W)
        fun = theano.function([th_row_idxs, th_true_labels],
                              updates=[(th_W, th_W + dL_dW)])
        fun(row_idxs[:, 0], true_labels[:, 0])

        r.append(np.allclose(qW.to_host(), row_slicing_layer.W.get_value()))

    self.assertEqual(sum(r), len(r))
def __init__(self, ptb_train, ptb_valid, batch_size, sentence_max_len, device_id):
    """
    create the offset generators, the device matrices and the connectors
    used to serve train/validation sentence batches

    :param ptb_train: training data handed to `HomogeneousDataGenerator`
    :param ptb_valid: validation data handed to `HomogeneousDataGenerator`
    :param batch_size: number of rows of the sentence batch
    :param sentence_max_len: number of columns of the sentence batch
    :param device_id: passed to `Context`; afterwards the id reported by
                      the created context is used for all allocations
    """
    self.blocking_contexts = None
    self.training_mode = True
    self.context = Context(device_id)
    device_id = self.context.device_id

    # the train generator is randomized and infinite, the valid one is not
    self.train_offsets = HomogeneousDataGenerator(ptb_train, batch_size, sentence_max_len,
                                                  randomize=True, infinite=True)
    self.valid_offsets = HomogeneousDataGenerator(ptb_valid, batch_size, sentence_max_len)

    # flattened sentences are wrapped into a list, giving a 1 x n array
    self.train_sents = Matrix.from_npa(np.array([self.train_offsets.flatten_sentences]),
                                       'int', device_id)
    self.valid_sents = Matrix.from_npa(np.array([self.valid_offsets.flatten_sentences]),
                                       'int', device_id)

    self._sent_lengths = np.empty((batch_size, 1), dtype=np.int32, order='F')[...]
    self.sent_lengths = Matrix.from_npa(self._sent_lengths, device_id=device_id)

    batch_matrix = Matrix.empty(batch_size, sentence_max_len, 'int', device_id)
    self.sentence_batch = Connector(batch_matrix, self.context)
    self.sentence_batch.sync_fill(0)

    # one connector per column of the mask matrix
    self._mask = Matrix.empty(batch_matrix.nrows, self.sentence_batch.ncols, 'float', device_id)
    mask_columns = []
    for column in xrange(sentence_max_len):
        mask_columns.append(Connector(self._mask[:, column]))
    self.mask = List(mask_columns, self.sentence_batch.ncols)

    self.train_offsets_iterator = iter(self.train_offsets)
    self.valid_offsets_iterator = iter(self.valid_offsets)
def test_theano_bprop(self):
    """
    compare dL/dx obtained from `bprop` of the gpu `RepeatBlock` followed by
    `SoftmaxCeBlock` with the gradient theano computes for
    `T.tile` + softmax + mean categorical cross-entropy
    """
    r = []
    for _ in xrange(self.N):
        repeats = self.rng.random_integers(42)
        axis = self.rng.randint(2)
        input_dim, output_dim = self.rng.random_integers(2000, size=2)
        x = self.get_normal_matrix(input_dim, output_dim)
        # repeating along axis 0 multiplies the number of rows to label
        if axis:
            n_labels = input_dim
        else:
            n_labels = input_dim * repeats
        true_labels = self.rng.randint(output_dim, size=(n_labels, 1)).astype(np.int32)
        device_id = 0

        # quagga side
        quagga.processor_type = 'gpu'
        qx = Connector(Matrix.from_npa(x), device_id)
        qtrue_labels = Connector(Matrix.from_npa(true_labels))
        repeat_block = RepeatBlock(qx, repeats, axis)
        sce_block = SoftmaxCeBlock(repeat_block.output, qtrue_labels)
        qx.fprop()
        qtrue_labels.fprop()
        repeat_block.fprop()
        sce_block.fprop()
        sce_block.bprop()
        repeat_block.bprop()
        q_dL_dx = qx.backward_matrix.to_host()

        # theano side
        th_x = T.fmatrix()
        th_true_labels = T.ivector()
        reps = [1, repeats] if axis else [repeats, 1]
        th_probs = T.nnet.softmax(T.tile(th_x, reps))
        loss = T.mean(T.nnet.categorical_crossentropy(th_probs, th_true_labels))
        get_grads = theano.function([th_x, th_true_labels], T.grad(loss, th_x))
        th_dL_dx = get_grads(x, true_labels[:, 0])

        r.append(np.allclose(q_dL_dx, th_dL_dx))

    self.assertEqual(sum(r), len(r))
def __init__(self, train_x, train_y, valid_x, valid_y, batch_size, device_id):
    """
    upload the train/validation data to the device and allocate the
    batch buffers together with the bookkeeping state

    :param train_x: training inputs; stored transposed as float32
    :param train_y: training targets; stored as an 'int' column
    :param valid_x: validation inputs; stored transposed as float32
    :param valid_y: validation targets; stored as an 'int' column
    :param batch_size: number of rows of the `x`, `y` and `indices` buffers
    :param device_id: passed to `Context`; afterwards the id reported by
                      the created context is used for all allocations
    """
    # bookkeeping state
    self.train_i = 0
    self.valid_i = 0
    self.training_mode = True
    self.blocking_contexts = None

    self.context = Context(device_id)
    device_id = self.context.device_id

    # whole data sets; inputs are transposed, so `ncols` indexes the
    # samples and `nrows` the features
    self.train_x = Matrix.from_npa(train_x.T.astype(np.float32), device_id=device_id)
    self.valid_x = Matrix.from_npa(valid_x.T.astype(np.float32), device_id=device_id)
    self.train_y = Matrix.from_npa(train_y[:, np.newaxis], 'int', device_id=device_id)
    self.valid_y = Matrix.from_npa(valid_y[:, np.newaxis], 'int', device_id=device_id)

    # batch buffers exposed through connectors
    self.batch_size = batch_size
    x_batch = Matrix.empty(self.batch_size, self.train_x.nrows, device_id=device_id)
    y_batch = Matrix.empty(self.batch_size, 1, 'int', device_id)
    self.x = Connector(x_batch)
    self.y = Connector(y_batch)

    # sample indexes; only the training ones are shuffled (fixed seed)
    n_train = int(self.train_x.ncols)
    n_valid = int(self.valid_x.ncols)
    self.train_indices = np.arange(n_train, dtype=np.int32)
    self.valid_indices = np.arange(n_valid, dtype=np.int32)
    self.indices = Matrix.empty(self.batch_size, 1, 'int', device_id)
    self.rng = np.random.RandomState(42)
    self.rng.shuffle(self.train_indices)
def test_bprop_matrix(self):
    """
    compare the weight matrix after `bprop` and the `W + dL/dW` update for
    cpu and gpu backends (`RowSlicingBlock` followed by a sequenced
    `SoftmaxCeBlock`)
    """
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        if i == 0:
            sequence_len = max_input_sequence_len
        else:
            sequence_len = self.rng.random_integers(max_input_sequence_len)
        embd_dim = self.rng.random_integers(10000)
        batch_size = self.rng.random_integers(500)
        output_dim = self.rng.random_integers(2000)
        W = self.get_orthogonal_matrix(embd_dim, output_dim)
        row_idxs = self.rng.randint(embd_dim, size=(batch_size, max_input_sequence_len)).astype(np.int32)
        true_labels = []
        for _ in xrange(max_input_sequence_len):
            labels = self.rng.randint(output_dim, size=(batch_size, 1))
            true_labels.append(labels.astype(np.int32))
        device_id = 0

        updated_W = {}
        for processor_type in ['gpu', 'cpu']:
            quagga.processor_type = processor_type
            qrow_idxs = Connector(Matrix.from_npa(row_idxs))
            qtrue_labels = List([Connector(Matrix.from_npa(e)) for e in true_labels],
                                qrow_idxs.ncols)
            qW = Connector(Matrix.from_npa(W), device_id)
            row_slicing_block = RowSlicingBlock(qW, qrow_idxs)
            seq_sce_block = SequencerBlock(block_class=SoftmaxCeBlock,
                                           params=[],
                                           sequences=[row_slicing_block.output, qtrue_labels])
            qW.fprop()
            # shrink the visible number of columns to the sampled length
            qrow_idxs.ncols = sequence_len
            qrow_idxs.fprop()
            row_slicing_block.fprop()
            seq_sce_block.fprop()
            seq_sce_block.bprop()
            row_slicing_block.bprop()
            qW.add(Context(), qW.backward_matrix)
            updated_W[processor_type] = qW.to_host()

        r.append(np.allclose(updated_W['gpu'], updated_W['cpu']))

    self.assertEqual(sum(r), len(r))
def test_numpy_fprop(self):
    """
    compare `fprop` output of the gpu `HorizontalStackBlock` with `np.hstack`
    """
    r = []
    quagga.processor_type = 'gpu'
    for _ in xrange(self.N):
        # all matrices share the number of rows and differ in columns
        nrows = self.rng.random_integers(1, 5000)
        n_matrices = self.rng.random_integers(1, 10)
        matrices = []
        for _ in xrange(n_matrices):
            ncols = self.rng.random_integers(1, 5000)
            matrices.append(self.rng.rand(nrows, ncols).astype(np.float32))
        numpy_output = np.hstack(matrices)

        qmatrices = [Connector(Matrix.from_npa(m)) for m in matrices]
        hstack_block = HorizontalStackBlock(*qmatrices)
        for qmatrix in qmatrices:
            qmatrix.fprop()
        hstack_block.fprop()
        quagga_output = hstack_block.output.to_host()

        r.append(np.allclose(numpy_output, quagga_output))

    self.assertEqual(sum(r), self.N)
def test_numpy_fprop(self):
    """
    compare `fprop` output of the gpu `VerticalStackBlock` with `np.vstack`
    """
    r = []
    quagga.processor_type = 'gpu'
    for _ in xrange(self.N):
        # all matrices share the number of columns and differ in rows
        ncols = self.rng.random_integers(1, 5000)
        n_matrices = self.rng.random_integers(1, 10)
        matrices = []
        for _ in xrange(n_matrices):
            nrows = self.rng.random_integers(1, 5000)
            matrices.append(self.rng.rand(nrows, ncols).astype(np.float32))
        numpy_output = np.vstack(matrices)

        qmatrices = [Connector(Matrix.from_npa(m)) for m in matrices]
        vstack_block = VerticalStackBlock(*qmatrices)
        for qmatrix in qmatrices:
            qmatrix.fprop()
        vstack_block.fprop()
        quagga_output = vstack_block.output.to_host()

        r.append(np.allclose(numpy_output, quagga_output))

    self.assertEqual(sum(r), self.N)
def test_fprop(self):
    """
    compare `fprop` results of `RepeatBlock` for cpu and gpu backends
    """
    r = []
    for _ in xrange(self.N):
        repeats = self.rng.random_integers(42)
        axis = self.rng.randint(2)
        input_dim, output_dim = self.rng.random_integers(2000, size=2)
        x = self.get_normal_matrix(input_dim, output_dim)

        repeated = {}
        for processor_type in ['gpu', 'cpu']:
            quagga.processor_type = processor_type
            qx = Connector(Matrix.from_npa(x))
            repeat_block = RepeatBlock(qx, repeats, axis)
            qx.fprop()
            repeat_block.fprop()
            repeated[processor_type] = repeat_block.output.to_host()

        r.append(np.allclose(repeated['gpu'], repeated['cpu']))

    self.assertEqual(sum(r), len(r))
def test_bprop(self):
    """
    compare `bprop` results for cpu and gpu backends
    """
    r = []
    for _ in xrange(self.N):
        for sparse in [True, False]:
            batch_size, dim = self.rng.random_integers(2000, size=2)
            if sparse:
                # float32 one-hot rows: a single 1.0 per sample
                hot_columns = self.rng.randint(dim, size=batch_size)
                true_labels = np.zeros((batch_size, dim), np.float32)
                true_labels[np.arange(batch_size), hot_columns] = 1.0
            else:
                # int32 column of class indexes
                true_labels = self.rng.randint(dim, size=(batch_size, 1)).astype(np.int32)
            x = self.rng.randn(batch_size, dim).astype(np.float32)
            mask = (self.rng.rand(batch_size, 1) < 0.8).astype(np.float32)
            device_id = 0

            for with_mask in [False, True]:
                dL_dx = {}
                for processor_type in ['gpu', 'cpu']:
                    quagga.processor_type = processor_type
                    qx = Connector(Matrix.from_npa(x), device_id)
                    qtrue_labels = Connector(Matrix.from_npa(true_labels))
                    if with_mask:
                        qmask = Connector(Matrix.from_npa(mask))
                    else:
                        qmask = None
                    softmax_ce_block = SoftmaxCeBlock(qx, qtrue_labels, qmask)
                    qx.fprop()
                    qtrue_labels.fprop()
                    if with_mask:
                        qmask.fprop()
                    softmax_ce_block.fprop()
                    softmax_ce_block.bprop()
                    dL_dx[processor_type] = qx.backward_matrix.to_host()

                r.append(np.allclose(dL_dx['gpu'], dL_dx['cpu']))

    self.assertEqual(sum(r), len(r))
def test_fprop(self):
    """
    compare `fprop` results of `HorizontalStackBlock` for cpu and gpu
    backends
    """
    r = []
    for _ in xrange(self.N):
        # all matrices share the number of rows and differ in columns
        nrows = self.rng.random_integers(1, 5000)
        n_matrices = self.rng.random_integers(1, 10)
        matrices = []
        for _ in xrange(n_matrices):
            ncols = self.rng.random_integers(1, 5000)
            matrices.append(self.rng.rand(nrows, ncols).astype(np.float32))

        stacked = {}
        for processor_type in ['gpu', 'cpu']:
            quagga.processor_type = processor_type
            qmatrices = [Connector(Matrix.from_npa(m)) for m in matrices]
            for qmatrix in qmatrices:
                qmatrix.fprop()
            hstack_block = HorizontalStackBlock(*qmatrices)
            hstack_block.fprop()
            stacked[processor_type] = hstack_block.output.to_host()

        r.append(np.allclose(stacked['gpu'], stacked['cpu']))

    self.assertEqual(sum(r), self.N)
def test_fprop(self):
    """
    compare `fprop` results of `VerticalStackBlock` for cpu and gpu backends
    """
    r = []
    for _ in xrange(self.N):
        # all matrices share the number of columns and differ in rows
        ncols = self.rng.random_integers(1, 5000)
        n_matrices = self.rng.random_integers(1, 10)
        matrices = []
        for _ in xrange(n_matrices):
            nrows = self.rng.random_integers(1, 5000)
            matrices.append(self.rng.rand(nrows, ncols).astype(np.float32))

        stacked = {}
        for processor_type in ['gpu', 'cpu']:
            quagga.processor_type = processor_type
            qmatrices = [Connector(Matrix.from_npa(m)) for m in matrices]
            for qmatrix in qmatrices:
                qmatrix.fprop()
            vstack_block = VerticalStackBlock(*qmatrices)
            vstack_block.fprop()
            stacked[processor_type] = vstack_block.output.to_host()

        r.append(np.allclose(stacked['gpu'], stacked['cpu']))

    self.assertEqual(sum(r), self.N)
def test_bprop(self):
    """
    compare `bprop` results for cpu and gpu backends
    """
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        if i == 0:
            sequence_len = max_input_sequence_len
        else:
            sequence_len = self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(512)
        dim = self.rng.random_integers(1500)
        x = [self.rng.rand(batch_size, dim).astype(dtype=np.float32)
             for _ in xrange(max_input_sequence_len)]

        state = self.rng.get_state()
        dL_dmatrices = {}
        for processor_type in ['gpu', 'cpu']:
            # rewind the generator so both backends receive the same
            # random dL/doutput
            self.rng.set_state(state)
            quagga.processor_type = processor_type
            context = Context()
            qx = List([Connector(Matrix.from_npa(e), context, context) for e in x])
            smean_pooling_block = SequentialMeanPoolingBlock(qx)
            qx.set_length(sequence_len)
            _, dL_doutput = smean_pooling_block.output.register_usage(context, context)
            smean_pooling_block.fprop()
            random_matrix = self.rng.rand(dL_doutput.nrows, dL_doutput.ncols)
            Matrix.from_npa(random_matrix, 'float').copy_to(context, dL_doutput)
            smean_pooling_block.bprop()
            dL_dmatrices[processor_type] = [e.backward_matrix.to_host() for e in qx]

        # one verdict per iteration; stops at the first mismatching pair
        r.append(all(np.allclose(dL_dmatrix_gpu, dL_dmatrix_cpu)
                     for dL_dmatrix_gpu, dL_dmatrix_cpu
                     in izip(dL_dmatrices['gpu'], dL_dmatrices['cpu'])))

    self.assertEqual(sum(r), self.N)
def test_bprop(self):
    """
    compare `bprop` results for cpu and gpu backends
    """
    r = []
    for _ in xrange(self.N):
        batch_size = self.rng.random_integers(2000)
        true_labels = self.rng.randint(2, size=(batch_size, 1)).astype(np.float32)
        mask = (self.rng.rand(batch_size, 1) < 0.8).astype(np.float32)
        x = self.rng.randn(batch_size, 1).astype(np.float32)
        device_id = 0

        for with_mask in [False, True]:
            dL_dx = {}
            # the backend has to be switched explicitly for each run:
            # previously `processor_type` stayed 'gpu' for the second run,
            # so the test compared the gpu backend with itself
            for processor_type in ['gpu', 'cpu']:
                quagga.processor_type = processor_type
                qx = Connector(Matrix.from_npa(x), device_id)
                qtrue_labels = Connector(Matrix.from_npa(true_labels))
                qmask = Connector(Matrix.from_npa(mask)) if with_mask else None
                sigmoid_ce_block = SigmoidCeBlock(qx, qtrue_labels, qmask)
                qx.fprop()
                qtrue_labels.fprop()
                if with_mask:
                    qmask.fprop()
                sigmoid_ce_block.fprop()
                sigmoid_ce_block.bprop()
                dL_dx[processor_type] = qx.backward_matrix.to_host()

            r.append(np.allclose(dL_dx['gpu'], dL_dx['cpu']))

    self.assertEqual(sum(r), len(r))
def test_theano_fprop(self):
    """
    compare `fprop` output of the gpu `RepeatBlock` with theano's `T.tile`
    """
    r = []
    for _ in xrange(self.N):
        repeats = self.rng.random_integers(42)
        axis = self.rng.randint(2)
        input_dim, output_dim = self.rng.random_integers(2000, size=2)
        x = self.get_normal_matrix(input_dim, output_dim)

        # quagga side
        quagga.processor_type = 'gpu'
        qx = Connector(Matrix.from_npa(x))
        repeat_block = RepeatBlock(qx, repeats, axis)
        qx.fprop()
        repeat_block.fprop()
        qoutput = repeat_block.output.to_host()

        # theano side: tile only along the chosen axis
        th_x = T.fmatrix()
        reps = [1, repeats] if axis else [repeats, 1]
        tile = theano.function([th_x], T.tile(th_x, reps))
        th_output = tile(x)

        r.append(np.allclose(qoutput, th_output))

    self.assertEqual(sum(r), len(r))