def test_fprop(self):
    """
    compare `fprop` results for cpu and gpu backends
    """
    r = []
    for i in xrange(self.N):
        # Random problem sizes; iteration 0 pins the sequence to its maximal
        # length so the full-length code path is always exercised.
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32) for _ in xrange(max_input_sequence_len)]
        W = self.get_orthogonal_matrix(input_dim, hidden_dim)
        b = self.rng.rand(1, hidden_dim).astype(np.float32)
        # NOTE(review): hard-coded device pin looks like debugging residue —
        # confirm device 1 exists on every host this test runs on, or drop it.
        from quagga.cuda import cudart
        cudart.cuda_set_device(1)
        qoutput = {}
        for reverse in [False, True]:
            for with_bias in [False, True]:
                # Run the identical graph on both backends, then compare.
                for processor_type in ['gpu', 'cpu']:
                    quagga.processor_type = processor_type
                    qx = List([Connector(Matrix.from_npa(e)) for e in x])
                    qW = Connector(Matrix.from_npa(W))
                    qb = Connector(Matrix.from_npa(b)) if with_bias else None
                    seq_dot_block = SequencerBlock(block_class=DotBlock,
                                                   params=[qW, qb],
                                                   sequences=[qx],
                                                   output_names=['output'],
                                                   reverse=reverse)
                    qx.length = sequence_len
                    qx.fprop()
                    qW.fprop()
                    if qb:
                        qb.fprop()
                    seq_dot_block.fprop()
                    qoutput[processor_type] = seq_dot_block.output.to_host()
                # for/else: append True only if every step matched (no break).
                for output_gpu, output_cpu in izip(qoutput['gpu'], qoutput['cpu']):
                    if not np.allclose(output_gpu, output_cpu, atol=1e-5):
                        r.append(False)
                        break
                else:
                    r.append(True)
    self.assertEqual(sum(r), len(r))
def test_bprop(self):
    """
    compare `bprop` results for cpu and gpu backends
    """
    # BUG FIX: the docstring previously said "fprop" (copy-paste from
    # test_fprop) although this test compares backward-pass gradients.
    r = []
    for i in xrange(self.N):
        # Random problem sizes; iteration 0 pins the sequence to its maximal
        # length so the full-length code path is always exercised.
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32) for _ in xrange(max_input_sequence_len)]
        true_labels = [self.rng.randint(hidden_dim, size=(batch_size, 1)).astype(np.int32) for _ in xrange(max_input_sequence_len)]
        W = self.get_orthogonal_matrix(input_dim, hidden_dim)
        b = self.rng.rand(1, hidden_dim).astype(np.float32)
        device_id = 0
        quagga_grads = {}
        for reverse in [False, True]:
            for with_bias in [False, True]:
                # Run the identical graph on both backends, then compare grads.
                for processor_type in ['gpu', 'cpu']:
                    quagga.processor_type = processor_type
                    qx = List([Connector(Matrix.from_npa(e), device_id) for e in x])
                    qtrue_labels = List([Connector(Matrix.from_npa(e)) for e in true_labels], len(qx))
                    qW = Connector(Matrix.from_npa(W), device_id)
                    qb = Connector(Matrix.from_npa(b), device_id) if with_bias else None
                    seq_dot_block = SequencerBlock(block_class=DotBlock,
                                                   params=[qW, qb],
                                                   sequences=[qx],
                                                   output_names=['output'],
                                                   reverse=reverse)
                    seq_sce_block = SequencerBlock(block_class=SoftmaxCeBlock,
                                                   params=[],
                                                   sequences=[seq_dot_block.output, qtrue_labels],
                                                   reverse=reverse)
                    qx.length = sequence_len
                    qx.fprop()
                    qtrue_labels.fprop()
                    qW.fprop()
                    if qb:
                        qb.fprop()
                    seq_dot_block.fprop()
                    seq_sce_block.fprop()
                    seq_sce_block.bprop()
                    seq_dot_block.bprop()
                    # Collect all gradients produced by the backward pass.
                    quagga_grads[processor_type] = [qW.backward_matrix.to_host()]
                    if with_bias:
                        quagga_grads[processor_type].append(qb.backward_matrix.to_host())
                    quagga_grads[processor_type].extend(e.backward_matrix.to_host() for e in qx)
                for grad_gpu, grad_cpu in izip(quagga_grads['gpu'], quagga_grads['cpu']):
                    r.append(np.allclose(grad_gpu, grad_cpu, atol=1e-5))
    self.assertEqual(sum(r), len(r))
def test_theano_fprop(self):
    """
    compare gpu-backend `fprop` results against a Theano reference
    """
    quagga.processor_type = 'gpu'
    r = []
    for i in xrange(self.N):
        # Random problem sizes; iteration 0 pins the sequence to its maximum.
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32) for _ in xrange(max_input_sequence_len)]
        W = self.get_orthogonal_matrix(input_dim, hidden_dim)
        b = self.rng.rand(1, hidden_dim).astype(np.float32)
        for reverse in [False, True]:
            for with_bias in [False, True]:
                qx = List([Connector(Matrix.from_npa(e)) for e in x])
                qW = Connector(Matrix.from_npa(W))
                qb = Connector(Matrix.from_npa(b)) if with_bias else None
                seq_dot_block = SequencerBlock(block_class=DotBlock,
                                               params=[qW, qb],
                                               sequences=[qx],
                                               output_names=['output'],
                                               reverse=reverse)
                qx.length = sequence_len
                qx.fprop()
                qW.fprop()
                if qb:
                    qb.fprop()
                seq_dot_block.fprop()
                qoutput = seq_dot_block.output.to_host()
                # Theano reference implementation of the same computation.
                seq_dot_layer = SequentialDotLayer(W, b if with_bias else None, reverse)
                th_x = T.ftensor3()
                get_th_output = theano.function([th_x], seq_dot_layer.get_output_expr(th_x))
                th_output = get_th_output(np.dstack(x[:sequence_len]))
                # FIX: the inner time-step loop previously reused `i`,
                # shadowing the outer iteration index; renamed to `t`.
                for t in xrange(th_output.shape[0]):
                    if not np.allclose(qoutput[t], th_output[t]):
                        r.append(False)
                        break
                else:
                    r.append(True)
    self.assertEqual(sum(r), len(r))
def test_fprop(self):
    """
    compare `fprop` results for cpu and gpu backends
    """
    r = []
    for i in xrange(self.N):
        # Random problem sizes; iteration 0 pins the sequence to its maximal
        # length so the full-length code path is always exercised.
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32) for _ in xrange(max_input_sequence_len)]
        W = self.get_orthogonal_matrix(input_dim, hidden_dim)
        b = self.rng.rand(1, hidden_dim).astype(np.float32)
        # NOTE(review): hard-coded device pin looks like debugging residue —
        # confirm device 1 exists on every host this test runs on, or drop it.
        from quagga.cuda import cudart
        cudart.cuda_set_device(1)
        qoutput = {}
        for reverse in [False, True]:
            for with_bias in [False, True]:
                # Run the identical graph on both backends, then compare.
                for processor_type in ['gpu', 'cpu']:
                    quagga.processor_type = processor_type
                    qx = List([Connector(Matrix.from_npa(e)) for e in x])
                    qW = Connector(Matrix.from_npa(W))
                    qb = Connector(Matrix.from_npa(b)) if with_bias else None
                    seq_dot_block = SequencerBlock(block_class=DotBlock,
                                                   params=[qW, qb],
                                                   sequences=[qx],
                                                   output_names=['output'],
                                                   reverse=reverse)
                    qx.length = sequence_len
                    qx.fprop()
                    qW.fprop()
                    if qb:
                        qb.fprop()
                    seq_dot_block.fprop()
                    qoutput[processor_type] = seq_dot_block.output.to_host()
                # for/else: append True only if every step matched (no break).
                for output_gpu, output_cpu in izip(qoutput['gpu'], qoutput['cpu']):
                    if not np.allclose(output_gpu, output_cpu, atol=1e-5):
                        r.append(False)
                        break
                else:
                    r.append(True)
    self.assertEqual(sum(r), len(r))
def test_theano_bprop_matrix(self):
    """
    compare embedding-matrix gradient of the gpu backend against a Theano
    reference (both sides apply the gradient as a W += dL/dW update)
    """
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(300)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(2, max_input_sequence_len)
        embd_dim = self.rng.random_integers(10000)
        batch_size = self.rng.random_integers(500)
        output_dim = self.rng.random_integers(2000)
        W = self.get_orthogonal_matrix(embd_dim, output_dim)
        row_idxs = self.rng.randint(embd_dim, size=(batch_size, max_input_sequence_len)).astype(np.int32)
        true_labels = [self.rng.randint(output_dim, size=(batch_size, 1)).astype(np.int32) for _ in xrange(max_input_sequence_len)]
        device_id = 0
        # --- quagga model (gpu) ---
        quagga.processor_type = 'gpu'
        qrow_idxs = Connector(Matrix.from_npa(row_idxs))
        qtrue_labels = List([Connector(Matrix.from_npa(e)) for e in true_labels], qrow_idxs.ncols)
        qW = Connector(Matrix.from_npa(W), device_id)
        row_slicing_block = RowSlicingBlock(qW, qrow_idxs)
        seq_sce_block = SequencerBlock(block_class=SoftmaxCeBlock,
                                       params=[],
                                       sequences=[row_slicing_block.output, qtrue_labels])
        qW.fprop()
        # Truncate the index matrix to the sampled sequence length.
        qrow_idxs.ncols = sequence_len
        qrow_idxs.fprop()
        row_slicing_block.fprop()
        seq_sce_block.fprop()
        seq_sce_block.bprop()
        row_slicing_block.bprop()
        # Apply the accumulated gradient in place: qW += dL/dW.
        qW.add(Context(), qW.backward_matrix)
        # --- Theano reference ---
        th_row_idxs = T.imatrix()
        th_true_labels = T.imatrix()
        row_slicing_layer = RowSlicingLayer(W)
        toutput = row_slicing_layer.get_output_expr(th_row_idxs)
        loss = SequentialSoftmaxLayer.get_loss(toutput, th_true_labels)
        dL_dW = T.grad(loss, row_slicing_layer.W)
        fun = theano.function([th_row_idxs, th_true_labels],
                              updates=[(row_slicing_layer.W, row_slicing_layer.W + dL_dW)])
        # NOTE(review): the full `row_idxs` matrix is passed while the labels
        # are truncated to `sequence_len` — presumably the layer only consumes
        # as many columns as there are label steps; confirm against
        # RowSlicingLayer, otherwise this should be row_idxs[:, :sequence_len].
        fun(row_idxs, np.hstack(true_labels[:sequence_len]))
        r.append(np.allclose(qW.to_host(), row_slicing_layer.W.get_value(), atol=1e-5))
    self.assertEqual(sum(r), len(r))
def test_theano_fprop(self):
    """
    compare gpu-backend `fprop` results against a Theano reference
    """
    quagga.processor_type = 'gpu'
    r = []
    for i in xrange(self.N):
        # Random problem sizes; iteration 0 pins the sequence to its maximum.
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32) for _ in xrange(max_input_sequence_len)]
        W = self.get_orthogonal_matrix(input_dim, hidden_dim)
        b = self.rng.rand(1, hidden_dim).astype(np.float32)
        for reverse in [False, True]:
            for with_bias in [False, True]:
                qx = List([Connector(Matrix.from_npa(e)) for e in x])
                qW = Connector(Matrix.from_npa(W))
                qb = Connector(Matrix.from_npa(b)) if with_bias else None
                seq_dot_block = SequencerBlock(block_class=DotBlock,
                                               params=[qW, qb],
                                               sequences=[qx],
                                               output_names=['output'],
                                               reverse=reverse)
                qx.length = sequence_len
                qx.fprop()
                qW.fprop()
                if qb:
                    qb.fprop()
                seq_dot_block.fprop()
                qoutput = seq_dot_block.output.to_host()
                # Theano reference implementation of the same computation.
                seq_dot_layer = SequentialDotLayer(W, b if with_bias else None, reverse)
                th_x = T.ftensor3()
                get_th_output = theano.function([th_x], seq_dot_layer.get_output_expr(th_x))
                th_output = get_th_output(np.dstack(x[:sequence_len]))
                # FIX: the inner time-step loop previously reused `i`,
                # shadowing the outer iteration index; renamed to `t`.
                for t in xrange(th_output.shape[0]):
                    if not np.allclose(qoutput[t], th_output[t]):
                        r.append(False)
                        break
                else:
                    r.append(True)
    self.assertEqual(sum(r), len(r))
def test_bprop_matrix(self):
    """
    compare the embedding-matrix gradient update for cpu and gpu backends
    """
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        embd_dim = self.rng.random_integers(10000)
        batch_size = self.rng.random_integers(500)
        output_dim = self.rng.random_integers(2000)
        W = self.get_orthogonal_matrix(embd_dim, output_dim)
        row_idxs = self.rng.randint(embd_dim, size=(batch_size, max_input_sequence_len)).astype(np.int32)
        true_labels = [self.rng.randint(output_dim, size=(batch_size, 1)).astype(np.int32) for _ in xrange(max_input_sequence_len)]
        device_id = 0
        output = {}
        # Run the identical graph on both backends and compare the updated W.
        for processor_type in ['gpu', 'cpu']:
            quagga.processor_type = processor_type
            qrow_idxs = Connector(Matrix.from_npa(row_idxs))
            qtrue_labels = List([Connector(Matrix.from_npa(e)) for e in true_labels], qrow_idxs.ncols)
            qW = Connector(Matrix.from_npa(W), device_id)
            row_slicing_block = RowSlicingBlock(qW, qrow_idxs)
            seq_sce_block = SequencerBlock(block_class=SoftmaxCeBlock,
                                           params=[],
                                           sequences=[row_slicing_block.output, qtrue_labels])
            qW.fprop()
            # Truncate the index matrix to the sampled sequence length.
            qrow_idxs.ncols = sequence_len
            qrow_idxs.fprop()
            row_slicing_block.fprop()
            seq_sce_block.fprop()
            seq_sce_block.bprop()
            row_slicing_block.bprop()
            # Apply the accumulated gradient in place: qW += dL/dW.
            qW.add(Context(), qW.backward_matrix)
            output[processor_type] = qW.to_host()
        r.append(np.allclose(output['gpu'], output['cpu']))
    self.assertEqual(sum(r), len(r))
def test_theano_bprop(self):
    """
    compare gpu-backend gradients against a Theano reference
    """
    quagga.processor_type = 'gpu'
    r = []
    for i in xrange(self.N):
        # Random problem sizes; iteration 0 pins the sequence to its maximum.
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32) for _ in xrange(max_input_sequence_len)]
        true_labels = [self.rng.randint(hidden_dim, size=(batch_size, 1)).astype(np.int32) for _ in xrange(max_input_sequence_len)]
        W = self.get_orthogonal_matrix(input_dim, hidden_dim)
        b = self.rng.rand(1, hidden_dim).astype(np.float32)
        device_id = 0
        for reverse in [False, True]:
            for with_bias in [False, True]:
                # --- quagga model ---
                qx = List([Connector(Matrix.from_npa(e), device_id) for e in x])
                qtrue_labels = List([Connector(Matrix.from_npa(e)) for e in true_labels], len(qx))
                qW = Connector(Matrix.from_npa(W), device_id)
                qb = Connector(Matrix.from_npa(b), device_id) if with_bias else None
                seq_dot_block = SequencerBlock(block_class=DotBlock,
                                               params=[qW, qb],
                                               sequences=[qx],
                                               output_names=['output'],
                                               reverse=reverse)
                seq_sce_block = SequencerBlock(block_class=SoftmaxCeBlock,
                                               params=[],
                                               sequences=[seq_dot_block.output, qtrue_labels],
                                               reverse=reverse)
                qx.length = sequence_len
                qx.fprop()
                qtrue_labels.fprop()
                qW.fprop()
                if qb:
                    qb.fprop()
                seq_dot_block.fprop()
                seq_sce_block.fprop()
                seq_sce_block.bprop()
                seq_dot_block.bprop()
                # Last element is the list of per-step input gradients.
                quagga_grads = [qW.backward_matrix.to_host()]
                if with_bias:
                    quagga_grads.append(qb.backward_matrix.to_host())
                quagga_grads.append([e.backward_matrix.to_host() for e in qx])
                # --- Theano reference ---
                seq_dot_layer = SequentialDotLayer(W, b if with_bias else None, reverse)
                seq_sce_layer = SequentialSoftmaxLayer()
                th_x = T.ftensor3()
                th_true_labels = T.imatrix()
                loss = seq_sce_layer.get_loss(seq_dot_layer.get_output_expr(th_x), th_true_labels)
                wrt = [seq_dot_layer.W]
                if with_bias:
                    wrt.append(seq_dot_layer.b)
                wrt.append(th_x)
                grads = T.grad(loss, wrt)
                get_theano_grads = theano.function([th_x, th_true_labels], grads)
                theano_grads = get_theano_grads(np.dstack(x[:sequence_len]),
                                                np.hstack(true_labels[:sequence_len]))
                # Parameter gradients compare directly ...
                for quagga_grad, theano_grad in izip(quagga_grads[:-1], theano_grads[:-1]):
                    r.append(np.allclose(quagga_grad, theano_grad, atol=1e-5))
                # ... input gradients compare per time step. FIX: renamed the
                # loop variable from `i` to `t` to stop shadowing the outer
                # iteration index.
                for t in xrange(theano_grads[-1].shape[-1]):
                    if not np.allclose(quagga_grads[-1][t], theano_grads[-1][..., t], atol=1e-5):
                        r.append(False)
                        break
                else:
                    r.append(True)
    self.assertEqual(sum(r), len(r))
def test_bprop(self):
    """
    compare `bprop` results for cpu and gpu backends
    """
    # BUG FIX: the docstring previously said "fprop" (copy-paste from
    # test_fprop) although this test compares backward-pass gradients.
    r = []
    for i in xrange(self.N):
        # Random problem sizes; iteration 0 pins the sequence to its maximal
        # length so the full-length code path is always exercised.
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32) for _ in xrange(max_input_sequence_len)]
        true_labels = [self.rng.randint(hidden_dim, size=(batch_size, 1)).astype(np.int32) for _ in xrange(max_input_sequence_len)]
        W = self.get_orthogonal_matrix(input_dim, hidden_dim)
        b = self.rng.rand(1, hidden_dim).astype(np.float32)
        device_id = 0
        quagga_grads = {}
        for reverse in [False, True]:
            for with_bias in [False, True]:
                # Run the identical graph on both backends, then compare grads.
                for processor_type in ['gpu', 'cpu']:
                    quagga.processor_type = processor_type
                    qx = List([Connector(Matrix.from_npa(e), device_id) for e in x])
                    qtrue_labels = List([Connector(Matrix.from_npa(e)) for e in true_labels], len(qx))
                    qW = Connector(Matrix.from_npa(W), device_id)
                    qb = Connector(Matrix.from_npa(b), device_id) if with_bias else None
                    seq_dot_block = SequencerBlock(block_class=DotBlock,
                                                   params=[qW, qb],
                                                   sequences=[qx],
                                                   output_names=['output'],
                                                   reverse=reverse)
                    seq_sce_block = SequencerBlock(block_class=SoftmaxCeBlock,
                                                   params=[],
                                                   sequences=[seq_dot_block.output, qtrue_labels],
                                                   reverse=reverse)
                    qx.length = sequence_len
                    qx.fprop()
                    qtrue_labels.fprop()
                    qW.fprop()
                    if qb:
                        qb.fprop()
                    seq_dot_block.fprop()
                    seq_sce_block.fprop()
                    seq_sce_block.bprop()
                    seq_dot_block.bprop()
                    # Collect all gradients produced by the backward pass.
                    quagga_grads[processor_type] = [qW.backward_matrix.to_host()]
                    if with_bias:
                        quagga_grads[processor_type].append(qb.backward_matrix.to_host())
                    quagga_grads[processor_type].extend(e.backward_matrix.to_host() for e in qx)
                for grad_gpu, grad_cpu in izip(quagga_grads['gpu'], quagga_grads['cpu']):
                    r.append(np.allclose(grad_gpu, grad_cpu, atol=1e-5))
    self.assertEqual(sum(r), len(r))
def test_fprop(self):
    """
    compare `fprop` results for cpu and gpu backends
    """
    r = []
    for i in xrange(self.N):
        # Random problem sizes; iteration 0 pins the sequence to its maximum.
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32) for _ in xrange(max_input_sequence_len)]
        mask = (self.rng.rand(batch_size, sequence_len) < 0.8).astype(np.float32)
        h_0 = self.rng.randn(batch_size, hidden_dim).astype(np.float32)
        c_0 = self.rng.randn(batch_size, hidden_dim).astype(np.float32)
        # LSTM gate weights, concatenated column-wise in z, i, f, o order.
        W_z = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_i = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_f = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_o = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W = np.hstack((W_z, W_i, W_f, W_o))
        R_z = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_i = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_f = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_o = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R = np.hstack((R_z, R_i, R_f, R_o))
        qh = {}
        for reverse in [False, True]:
            for with_mask in [False, True]:
                # Run the identical LSTM on both backends, then compare h.
                for processor_type in ['gpu', 'cpu']:
                    quagga.processor_type = processor_type
                    context = Context()
                    qx = List([Connector(Matrix.from_npa(e)) for e in x])
                    qmask = Matrix.empty(batch_size, len(qx), 'float')
                    qh_0 = Connector(Matrix.from_npa(h_0))
                    qc_0 = Connector(Matrix.from_npa(c_0))
                    qW = Connector(Matrix.from_npa(W))
                    qR = Connector(Matrix.from_npa(R))
                    sequences = [qx]
                    if with_mask:
                        # Per-step mask columns wrapped as a sequence; `qmask`
                        # is then rebound to that List for the fprop below.
                        sequences.append(List([Connector(qmask[:, i]) for i in xrange(len(qx))], len(qx)))
                        qmask.assign_npa(context, mask)
                        qmask = sequences[-1]
                    else:
                        sequences.append([None] * len(qx))
                    lstm = SequencerBlock(block_class=LstmBlock,
                                          params=[qW, qR],
                                          sequences=sequences,
                                          output_names=['h'],
                                          prev_names=['c', 'h'],
                                          paddings=[qc_0, qh_0],
                                          reverse=reverse)
                    qx.length = sequence_len
                    if with_mask:
                        qmask.fprop()
                    qx.fprop()
                    qh_0.fprop()
                    qc_0.fprop()
                    qW.fprop()
                    qR.fprop()
                    lstm.fprop()
                    qh[processor_type] = lstm.h.to_host()
                # for/else: append True only if every step matched (no break).
                for h_gpu, h_cpu in izip(qh['gpu'], qh['cpu']):
                    if not np.allclose(h_gpu, h_cpu, rtol=1e-7, atol=1e-3):
                        r.append(False)
                        break
                else:
                    r.append(True)
    self.assertEqual(sum(r), len(r))
def test_theano_grad(self):
    """
    compare gpu-backend gradients of an LSTM + softmax pipeline against a
    Theano reference
    """
    quagga.processor_type = 'gpu'
    r = []
    for i in xrange(self.N):
        # Random problem sizes; iteration 0 pins the sequence to its maximum.
        max_input_sequence_len = self.rng.random_integers(300)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(128)
        input_dim, hidden_dim, class_num = self.rng.random_integers(1500, size=3)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32) for _ in xrange(max_input_sequence_len)]
        true_labels = [self.rng.randint(class_num, size=(batch_size, 1)).astype(np.int32) for _ in xrange(max_input_sequence_len)]
        mask = (self.rng.rand(batch_size, sequence_len) < 0.8).astype(np.float32)
        h_0 = self.rng.randn(batch_size, hidden_dim).astype(np.float32)
        c_0 = self.rng.randn(batch_size, hidden_dim).astype(np.float32)
        # LSTM gate weights, concatenated column-wise in z, i, f, o order.
        W_z = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_i = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_f = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_o = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W = np.hstack((W_z, W_i, W_f, W_o))
        R_z = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_i = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_f = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_o = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R = np.hstack((R_z, R_i, R_f, R_o))
        lr_W = self.get_orthogonal_matrix(hidden_dim, class_num)
        lr_b = self.rng.rand(1, class_num).astype(dtype=np.float32)
        device_id = 0
        for reverse in [False, True]:
            for with_mask in [False, True]:
                for learn_inital_states in [False, True]:
                    # --- quagga model ---
                    context = Context()
                    qx = List([Connector(Matrix.from_npa(e), device_id) for e in x])
                    qtrue_labels = List([Connector(Matrix.from_npa(e)) for e in true_labels], qx.length)
                    qmask = Matrix.empty(batch_size, qx.length, 'float')
                    # FIX: comprehension variable renamed from `i` to `col` —
                    # Python 2 list comprehensions leak their variable, which
                    # shadowed the outer iteration index.
                    qmask_list = [Connector(qmask[:, col]) for col in xrange(qmask.ncols)]
                    qmask = Connector(qmask)
                    qh_0 = Connector(Matrix.from_npa(h_0), device_id if learn_inital_states else None)
                    qc_0 = Connector(Matrix.from_npa(c_0), device_id if learn_inital_states else None)
                    qW = Connector(Matrix.from_npa(W), device_id)
                    qR = Connector(Matrix.from_npa(R), device_id)
                    qlr_W = Connector(Matrix.from_npa(lr_W), device_id)
                    qlr_b = Connector(Matrix.from_npa(lr_b), device_id)
                    lstm = SequencerBlock(block_class=LstmBlock,
                                          params=[qW, qR],
                                          sequences=[qx, qmask_list if with_mask else [None] * len(qx)],
                                          output_names=['h'],
                                          prev_names=['c', 'h'],
                                          paddings=[qc_0, qh_0],
                                          reverse=reverse)
                    seq_dot_block = SequencerBlock(block_class=DotBlock,
                                                   params=[qlr_W, qlr_b],
                                                   sequences=[lstm.h],
                                                   output_names=['output'])
                    seq_sce_block = SequencerBlock(block_class=SoftmaxCeBlock,
                                                   params=[],
                                                   sequences=[seq_dot_block.output, qtrue_labels,
                                                              qmask_list if with_mask else [None] * len(qx)])
                    qx.length = sequence_len
                    for e in qx:
                        e.fprop()
                    for e in qtrue_labels:
                        e.fprop()
                    qmask.assign_npa(context, mask)
                    qmask.fprop()
                    qlr_W.fprop()
                    qlr_b.fprop()
                    qh_0.fprop()
                    qc_0.fprop()
                    qW.fprop()
                    qR.fprop()
                    lstm.fprop()
                    seq_dot_block.fprop()
                    seq_sce_block.fprop()
                    seq_sce_block.bprop()
                    seq_dot_block.bprop()
                    lstm.bprop()
                    # Last element is the list of per-step input gradients.
                    quagga_grads = [qlr_b.backward_matrix.to_host(),
                                    qlr_W.backward_matrix.to_host(),
                                    qW.backward_matrix.to_host(),
                                    qR.backward_matrix.to_host()]
                    if learn_inital_states:
                        quagga_grads.append(qc_0.backward_matrix.to_host())
                        quagga_grads.append(qh_0.backward_matrix.to_host())
                    quagga_grads.append([e.backward_matrix.to_host() for e in qx])
                    # Free device memory before building the Theano graph.
                    del qx
                    del qlr_b
                    del qlr_W
                    del qW
                    del qR
                    del qmask
                    del lstm
                    del seq_dot_block
                    del seq_sce_block
                    # --- Theano reference ---
                    th_x = T.ftensor3()
                    th_true_labels = T.imatrix()
                    th_mask = T.fmatrix()
                    lstm_layer = LstmLayer(W, R, c_0, h_0, reverse=reverse)
                    th_h = lstm_layer.get_output_expr(th_x, th_mask if with_mask else None)
                    seq_softmax_layer = SequentialSoftmaxLayer(lr_W, lr_b, reverse)
                    loss = seq_softmax_layer.get_loss(th_h, th_true_labels, th_mask if with_mask else None)
                    wrt = [seq_softmax_layer.b, seq_softmax_layer.W, lstm_layer.W, lstm_layer.R]
                    if learn_inital_states:
                        wrt.append(lstm_layer.c0)
                        wrt.append(lstm_layer.h0)
                    wrt.append(th_x)
                    grads = T.grad(loss, wrt)
                    if with_mask:
                        get_theano_grads = theano.function([th_x, th_true_labels, th_mask], grads)
                        theano_grads = get_theano_grads(np.dstack(x[:sequence_len]),
                                                        np.hstack(true_labels[:sequence_len]),
                                                        mask[:, :sequence_len])
                    else:
                        get_theano_grads = theano.function([th_x, th_true_labels], grads)
                        theano_grads = get_theano_grads(np.dstack(x[:sequence_len]),
                                                        np.hstack(true_labels[:sequence_len]))
                    # Parameter gradients compare directly ...
                    for quagga_grad, theano_grad in izip(quagga_grads[:-1], theano_grads[:-1]):
                        r.append(np.allclose(quagga_grad, theano_grad, atol=1e-6))
                    # ... input gradients compare per time step. FIX: renamed
                    # the loop variable from `i` to `t` (was shadowing the
                    # outer iteration index).
                    for t in xrange(theano_grads[-1].shape[-1]):
                        if not np.allclose(quagga_grads[-1][t], theano_grads[-1][..., t], atol=1e-6):
                            r.append(False)
                            break
                    else:
                        r.append(True)
    self.assertEqual(sum(r), len(r))
def test_theano_fprop(self):
    """
    compare gpu-backend LSTM `fprop` results against a Theano reference
    """
    quagga.processor_type = 'gpu'
    r = []
    for i in xrange(self.N):
        # Random problem sizes; iteration 0 pins the sequence to its maximum.
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32) for _ in xrange(max_input_sequence_len)]
        mask = (self.rng.rand(batch_size, sequence_len) < 0.8).astype(np.float32)
        h_0 = self.rng.randn(batch_size, hidden_dim).astype(np.float32)
        c_0 = self.rng.randn(batch_size, hidden_dim).astype(np.float32)
        # LSTM gate weights, concatenated column-wise in z, i, f, o order.
        W_z = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_i = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_f = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_o = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W = np.hstack((W_z, W_i, W_f, W_o))
        R_z = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_i = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_f = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_o = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R = np.hstack((R_z, R_i, R_f, R_o))
        for reverse in [False, True]:
            for with_mask in [False, True]:
                # --- quagga model ---
                context = Context()
                qx = List([Connector(Matrix.from_npa(e)) for e in x])
                qmask = Connector(Matrix.empty(batch_size, len(qx), 'float'))
                qh_0 = Connector(Matrix.from_npa(h_0))
                qc_0 = Connector(Matrix.from_npa(c_0))
                qW = Connector(Matrix.from_npa(W))
                qR = Connector(Matrix.from_npa(R))
                lstm = SequencerBlock(block_class=LstmBlock,
                                      params=[qW, qR],
                                      sequences=[qx] + ([qmask] if with_mask else []),
                                      output_names=['h'],
                                      prev_names=['c', 'h'],
                                      paddings=[qc_0, qh_0],
                                      reverse=reverse)
                qx.length = sequence_len
                for e in qx:
                    e.fprop()
                qmask.assign_npa(context, mask)
                qmask.fprop()
                qh_0.fprop()
                qc_0.fprop()
                qW.fprop()
                qR.fprop()
                lstm.fprop()
                q_h = lstm.h.to_host()
                # --- Theano reference ---
                th_x = T.ftensor3()
                lstm_layer = LstmLayer(W, R, c_0, h_0, reverse)
                if with_mask:
                    th_mask = T.fmatrix()
                    get_th_h = theano.function([th_x, th_mask],
                                               lstm_layer.get_output_expr(th_x, th_mask))
                    th_h = get_th_h(np.dstack(x[:sequence_len]), mask[:, :sequence_len])
                else:
                    get_th_h = theano.function([th_x], lstm_layer.get_output_expr(th_x))
                    th_h = get_th_h(np.dstack(x[:sequence_len]))
                # FIX: the inner time-step loop previously reused `i`,
                # shadowing the outer iteration index; renamed to `t`.
                for t in xrange(th_h.shape[0]):
                    if not np.allclose(q_h[t], th_h[t]):
                        r.append(False)
                        break
                else:
                    r.append(True)
    self.assertEqual(sum(r), len(r))
def test_bprop(self):
    """
    compare `bprop` results for cpu and gpu backends
    """
    r = []
    for i in xrange(self.N):
        # Random problem sizes; iteration 0 pins the sequence to its maximum.
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32) for _ in xrange(max_input_sequence_len)]
        # Binary targets (float32) for the sigmoid cross-entropy head.
        true_labels = [self.rng.randint(2, size=(batch_size, 1)).astype(np.float32) for _ in xrange(max_input_sequence_len)]
        mask = (self.rng.rand(batch_size, sequence_len) < 0.8).astype(np.float32)
        h_0 = self.rng.randn(batch_size, hidden_dim).astype(np.float32)
        c_0 = self.rng.randn(batch_size, hidden_dim).astype(np.float32)
        # LSTM gate weights, concatenated column-wise in z, i, f, o order.
        W_z = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_i = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_f = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_o = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W = np.hstack((W_z, W_i, W_f, W_o))
        R_z = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_i = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_f = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_o = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R = np.hstack((R_z, R_i, R_f, R_o))
        lr_W = self.get_orthogonal_matrix(hidden_dim, 1)
        lr_b = self.rng.rand(1, 1).astype(dtype=np.float32)
        device_id = 0
        quagga_grads = {}
        for reverse in [False, True]:
            for with_mask in [False, True]:
                for learn_inital_states in [False, True]:
                    # Run the identical graph on both backends; compare grads.
                    for processor_type in ['gpu', 'cpu']:
                        quagga.processor_type = processor_type
                        context = Context()
                        qx = List([Connector(Matrix.from_npa(e), device_id) for e in x])
                        qtrue_labels = List([Connector(Matrix.from_npa(e)) for e in true_labels], len(qx))
                        qmask = Matrix.empty(batch_size, len(qx))
                        qh_0 = Connector(Matrix.from_npa(h_0), device_id if learn_inital_states else None)
                        qc_0 = Connector(Matrix.from_npa(c_0), device_id if learn_inital_states else None)
                        qW = Connector(Matrix.from_npa(W), device_id)
                        qR = Connector(Matrix.from_npa(R), device_id)
                        qlr_W = Connector(Matrix.from_npa(lr_W), device_id)
                        qlr_b = Connector(Matrix.from_npa(lr_b), device_id)
                        sequences = [qx]
                        if with_mask:
                            # Per-step mask columns wrapped as a sequence;
                            # `qmask` is rebound to that List afterwards.
                            sequences.append(List([Connector(qmask[:, i]) for i in xrange(len(qx))], len(qx)))
                            qmask.assign_npa(context, mask)
                            qmask = sequences[-1]
                        else:
                            sequences.append([None] * len(qx))
                        lstm = SequencerBlock(block_class=LstmBlock,
                                              params=[qW, qR],
                                              sequences=sequences,
                                              output_names=['h'],
                                              prev_names=['c', 'h'],
                                              paddings=[qc_0, qh_0],
                                              reverse=reverse)
                        seq_dot_block = SequencerBlock(block_class=DotBlock,
                                                       params=[qlr_W, qlr_b],
                                                       sequences=[lstm.h],
                                                       output_names=['output'])
                        seq_sce_block = SequencerBlock(block_class=SigmoidCeBlock,
                                                       params=[],
                                                       sequences=[seq_dot_block.output, qtrue_labels] + ([qmask] if with_mask else []))
                        qx.length = sequence_len
                        qx.fprop()
                        qtrue_labels.fprop()
                        if with_mask:
                            qmask.fprop()
                        qlr_W.fprop()
                        qlr_b.fprop()
                        qh_0.fprop()
                        qc_0.fprop()
                        qW.fprop()
                        qR.fprop()
                        lstm.fprop()
                        seq_dot_block.fprop()
                        seq_sce_block.fprop()
                        seq_sce_block.bprop()
                        seq_dot_block.bprop()
                        lstm.bprop()
                        # Collect all gradients produced by the backward pass.
                        quagga_grads[processor_type] = [qlr_b.backward_matrix.to_host(),
                                                        qlr_W.backward_matrix.to_host(),
                                                        qW.backward_matrix.to_host(),
                                                        qR.backward_matrix.to_host()]
                        if learn_inital_states:
                            quagga_grads[processor_type].append(qc_0.backward_matrix.to_host())
                            quagga_grads[processor_type].append(qh_0.backward_matrix.to_host())
                        quagga_grads[processor_type].extend(e.backward_matrix.to_host() for e in qx)
                    for grad_gpu, grad_cpu in izip(quagga_grads['gpu'], quagga_grads['cpu']):
                        r.append(np.allclose(grad_gpu, grad_cpu, atol=1e-6))
    self.assertEqual(sum(r), len(r))
def test_theano_bprop(self):
    """
    compare gpu-backend gradients against a Theano reference
    """
    quagga.processor_type = 'gpu'
    r = []
    for i in xrange(self.N):
        # Random problem sizes; iteration 0 pins the sequence to its maximum.
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32) for _ in xrange(max_input_sequence_len)]
        true_labels = [self.rng.randint(hidden_dim, size=(batch_size, 1)).astype(np.int32) for _ in xrange(max_input_sequence_len)]
        W = self.get_orthogonal_matrix(input_dim, hidden_dim)
        b = self.rng.rand(1, hidden_dim).astype(np.float32)
        device_id = 0
        for reverse in [False, True]:
            for with_bias in [False, True]:
                # --- quagga model ---
                qx = List([Connector(Matrix.from_npa(e), device_id) for e in x])
                qtrue_labels = List([Connector(Matrix.from_npa(e)) for e in true_labels], len(qx))
                qW = Connector(Matrix.from_npa(W), device_id)
                qb = Connector(Matrix.from_npa(b), device_id) if with_bias else None
                seq_dot_block = SequencerBlock(block_class=DotBlock,
                                               params=[qW, qb],
                                               sequences=[qx],
                                               output_names=['output'],
                                               reverse=reverse)
                seq_sce_block = SequencerBlock(block_class=SoftmaxCeBlock,
                                               params=[],
                                               sequences=[seq_dot_block.output, qtrue_labels],
                                               reverse=reverse)
                qx.length = sequence_len
                qx.fprop()
                qtrue_labels.fprop()
                qW.fprop()
                if qb:
                    qb.fprop()
                seq_dot_block.fprop()
                seq_sce_block.fprop()
                seq_sce_block.bprop()
                seq_dot_block.bprop()
                # Last element is the list of per-step input gradients.
                quagga_grads = [qW.backward_matrix.to_host()]
                if with_bias:
                    quagga_grads.append(qb.backward_matrix.to_host())
                quagga_grads.append([e.backward_matrix.to_host() for e in qx])
                # --- Theano reference ---
                seq_dot_layer = SequentialDotLayer(W, b if with_bias else None, reverse)
                seq_sce_layer = SequentialSoftmaxLayer()
                th_x = T.ftensor3()
                th_true_labels = T.imatrix()
                loss = seq_sce_layer.get_loss(seq_dot_layer.get_output_expr(th_x), th_true_labels)
                wrt = [seq_dot_layer.W]
                if with_bias:
                    wrt.append(seq_dot_layer.b)
                wrt.append(th_x)
                grads = T.grad(loss, wrt)
                get_theano_grads = theano.function([th_x, th_true_labels], grads)
                theano_grads = get_theano_grads(np.dstack(x[:sequence_len]),
                                                np.hstack(true_labels[:sequence_len]))
                # Parameter gradients compare directly ...
                for quagga_grad, theano_grad in izip(quagga_grads[:-1], theano_grads[:-1]):
                    r.append(np.allclose(quagga_grad, theano_grad, atol=1e-5))
                # ... input gradients compare per time step. FIX: renamed the
                # loop variable from `i` to `t` to stop shadowing the outer
                # iteration index.
                for t in xrange(theano_grads[-1].shape[-1]):
                    if not np.allclose(quagga_grads[-1][t], theano_grads[-1][..., t], atol=1e-5):
                        r.append(False)
                        break
                else:
                    r.append(True)
    self.assertEqual(sum(r), len(r))