def test_fprop(self):
    """
    Compare `fprop` results for cpu and gpu backends.
    """
    r = []  # one boolean per (iteration, reverse, with_bias) combination
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        # first iteration exercises the full length; later ones truncate
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32)
             for _ in xrange(max_input_sequence_len)]
        W = self.get_orthogonal_matrix(input_dim, hidden_dim)
        b = self.rng.rand(1, hidden_dim).astype(np.float32)

        # NOTE(review): hard-coded GPU device 1 (other tests in this file
        # use device 0) -- confirm the test machine has a second device.
        from quagga.cuda import cudart
        cudart.cuda_set_device(1)
        qoutput = {}
        for reverse in [False, True]:
            for with_bias in [False, True]:
                for processor_type in ['gpu', 'cpu']:
                    quagga.processor_type = processor_type
                    qx = List([Connector(Matrix.from_npa(e)) for e in x])
                    qW = Connector(Matrix.from_npa(W))
                    qb = Connector(Matrix.from_npa(b)) if with_bias else None
                    seq_dot_block = SequencerBlock(block_class=DotBlock,
                                                   params=[qW, qb],
                                                   sequences=[qx],
                                                   output_names=['output'],
                                                   reverse=reverse)
                    # truncate the sequence, then push data through the graph
                    qx.length = sequence_len
                    qx.fprop()
                    qW.fprop()
                    if qb:
                        qb.fprop()
                    seq_dot_block.fprop()
                    qoutput[processor_type] = seq_dot_block.output.to_host()
                # compare per-timestep outputs of the two backends
                for output_gpu, output_cpu in izip(qoutput['gpu'], qoutput['cpu']):
                    if not np.allclose(output_gpu, output_cpu, atol=1e-5):
                        r.append(False)
                        break
                else:
                    r.append(True)
    self.assertEqual(sum(r), len(r))
def test_bprop(self):
    """
    Compare `bprop` results for cpu and gpu backends.

    Checks the gradients w.r.t. W, (optionally) b and every input
    timestep for a SequencerBlock(DotBlock)+SoftmaxCe chain.
    """
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        # first iteration exercises the full length; later ones truncate
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32)
             for _ in xrange(max_input_sequence_len)]
        true_labels = [self.rng.randint(hidden_dim, size=(batch_size, 1)).astype(np.int32)
                       for _ in xrange(max_input_sequence_len)]
        W = self.get_orthogonal_matrix(input_dim, hidden_dim)
        b = self.rng.rand(1, hidden_dim).astype(np.float32)
        device_id = 0
        quagga_grads = {}
        for reverse in [False, True]:
            for with_bias in [False, True]:
                for processor_type in ['gpu', 'cpu']:
                    quagga.processor_type = processor_type
                    qx = List([Connector(Matrix.from_npa(e), device_id) for e in x])
                    qtrue_labels = List([Connector(Matrix.from_npa(e)) for e in true_labels], len(qx))
                    qW = Connector(Matrix.from_npa(W), device_id)
                    qb = Connector(Matrix.from_npa(b), device_id) if with_bias else None
                    seq_dot_block = SequencerBlock(block_class=DotBlock,
                                                   params=[qW, qb],
                                                   sequences=[qx],
                                                   output_names=['output'],
                                                   reverse=reverse)
                    seq_sce_block = SequencerBlock(block_class=SoftmaxCeBlock,
                                                   params=[],
                                                   sequences=[seq_dot_block.output, qtrue_labels],
                                                   reverse=reverse)
                    qx.length = sequence_len
                    qx.fprop()
                    qtrue_labels.fprop()
                    qW.fprop()
                    if qb:
                        qb.fprop()
                    seq_dot_block.fprop()
                    seq_sce_block.fprop()
                    # backward pass in reverse topological order
                    seq_sce_block.bprop()
                    seq_dot_block.bprop()
                    # collect gradients in a fixed order: dW, (db), dx_t...
                    quagga_grads[processor_type] = [qW.backward_matrix.to_host()]
                    if with_bias:
                        quagga_grads[processor_type].append(qb.backward_matrix.to_host())
                    quagga_grads[processor_type].extend(e.backward_matrix.to_host() for e in qx)
                for grad_gpu, grad_cpu in izip(quagga_grads['gpu'], quagga_grads['cpu']):
                    r.append(np.allclose(grad_gpu, grad_cpu, atol=1e-5))
    self.assertEqual(sum(r), len(r))
def test_theano_fprop(self):
    """
    Compare `fprop` results of SequencerBlock(DotBlock) against a
    reference Theano implementation (SequentialDotLayer).
    """
    quagga.processor_type = 'gpu'
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32)
             for _ in xrange(max_input_sequence_len)]
        W = self.get_orthogonal_matrix(input_dim, hidden_dim)
        b = self.rng.rand(1, hidden_dim).astype(np.float32)
        for reverse in [False, True]:
            for with_bias in [False, True]:
                qx = List([Connector(Matrix.from_npa(e)) for e in x])
                qW = Connector(Matrix.from_npa(W))
                qb = Connector(Matrix.from_npa(b)) if with_bias else None
                seq_dot_block = SequencerBlock(block_class=DotBlock,
                                               params=[qW, qb],
                                               sequences=[qx],
                                               output_names=['output'],
                                               reverse=reverse)
                qx.length = sequence_len
                qx.fprop()
                qW.fprop()
                if qb:
                    qb.fprop()
                seq_dot_block.fprop()
                qoutput = seq_dot_block.output.to_host()
                # reference implementation on the same weights
                seq_dot_layer = SequentialDotLayer(W, b if with_bias else None, reverse)
                th_x = T.ftensor3()
                get_th_output = theano.function([th_x], seq_dot_layer.get_output_expr(th_x))
                th_output = get_th_output(np.dstack(x[:sequence_len]))
                # NOTE: inner `i` shadows the outer loop index; harmless,
                # the outer `for` rebinds `i` each iteration.
                for i in xrange(th_output.shape[0]):
                    if not np.allclose(qoutput[i], th_output[i]):
                        r.append(False)
                        break
                else:
                    r.append(True)
    self.assertEqual(sum(r), len(r))
def test_fprop(self):
    """
    Compare `fprop` results for cpu and gpu backends.
    """
    r = []  # one boolean per (iteration, reverse, with_bias) combination
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        # first iteration exercises the full length; later ones truncate
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32)
             for _ in xrange(max_input_sequence_len)]
        W = self.get_orthogonal_matrix(input_dim, hidden_dim)
        b = self.rng.rand(1, hidden_dim).astype(np.float32)

        # NOTE(review): hard-coded GPU device 1 (other tests in this file
        # use device 0) -- confirm the test machine has a second device.
        from quagga.cuda import cudart
        cudart.cuda_set_device(1)
        qoutput = {}
        for reverse in [False, True]:
            for with_bias in [False, True]:
                for processor_type in ['gpu', 'cpu']:
                    quagga.processor_type = processor_type
                    qx = List([Connector(Matrix.from_npa(e)) for e in x])
                    qW = Connector(Matrix.from_npa(W))
                    qb = Connector(Matrix.from_npa(b)) if with_bias else None
                    seq_dot_block = SequencerBlock(block_class=DotBlock,
                                                   params=[qW, qb],
                                                   sequences=[qx],
                                                   output_names=['output'],
                                                   reverse=reverse)
                    qx.length = sequence_len
                    qx.fprop()
                    qW.fprop()
                    if qb:
                        qb.fprop()
                    seq_dot_block.fprop()
                    qoutput[processor_type] = seq_dot_block.output.to_host()
                # compare per-timestep outputs of the two backends
                for output_gpu, output_cpu in izip(qoutput['gpu'], qoutput['cpu']):
                    if not np.allclose(output_gpu, output_cpu, atol=1e-5):
                        r.append(False)
                        break
                else:
                    r.append(True)
    self.assertEqual(sum(r), len(r))
def test_theano_fprop(self):
    """
    Compare `fprop` results of SequencerBlock(DotBlock) against a
    reference Theano implementation (SequentialDotLayer).
    """
    quagga.processor_type = 'gpu'
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32)
             for _ in xrange(max_input_sequence_len)]
        W = self.get_orthogonal_matrix(input_dim, hidden_dim)
        b = self.rng.rand(1, hidden_dim).astype(np.float32)
        for reverse in [False, True]:
            for with_bias in [False, True]:
                qx = List([Connector(Matrix.from_npa(e)) for e in x])
                qW = Connector(Matrix.from_npa(W))
                qb = Connector(Matrix.from_npa(b)) if with_bias else None
                seq_dot_block = SequencerBlock(block_class=DotBlock,
                                               params=[qW, qb],
                                               sequences=[qx],
                                               output_names=['output'],
                                               reverse=reverse)
                qx.length = sequence_len
                qx.fprop()
                qW.fprop()
                if qb:
                    qb.fprop()
                seq_dot_block.fprop()
                qoutput = seq_dot_block.output.to_host()
                # reference implementation on the same weights
                seq_dot_layer = SequentialDotLayer(W, b if with_bias else None, reverse)
                th_x = T.ftensor3()
                get_th_output = theano.function([th_x], seq_dot_layer.get_output_expr(th_x))
                th_output = get_th_output(np.dstack(x[:sequence_len]))
                # NOTE: inner `i` shadows the outer loop index; harmless,
                # the outer `for` rebinds `i` each iteration.
                for i in xrange(th_output.shape[0]):
                    if not np.allclose(qoutput[i], th_output[i]):
                        r.append(False)
                        break
                else:
                    r.append(True)
    self.assertEqual(sum(r), len(r))
def test_theano_bprop_matrix(self):
    """
    Compare the embedding-matrix gradient of RowSlicingBlock+SoftmaxCe
    against a reference Theano implementation.

    Both sides apply one `W += dL/dW` update to their own copy of W and
    the resulting matrices are compared.
    """
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(300)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(2, max_input_sequence_len)
        embd_dim = self.rng.random_integers(10000)
        batch_size = self.rng.random_integers(500)
        output_dim = self.rng.random_integers(2000)
        W = self.get_orthogonal_matrix(embd_dim, output_dim)
        row_idxs = self.rng.randint(embd_dim, size=(batch_size, max_input_sequence_len)).astype(np.int32)
        true_labels = [self.rng.randint(output_dim, size=(batch_size, 1)).astype(np.int32)
                       for _ in xrange(max_input_sequence_len)]
        device_id = 0

        # --- quagga model ---
        quagga.processor_type = 'gpu'
        qrow_idxs = Connector(Matrix.from_npa(row_idxs))
        qtrue_labels = List([Connector(Matrix.from_npa(e)) for e in true_labels], qrow_idxs.ncols)
        qW = Connector(Matrix.from_npa(W), device_id)
        row_slicing_block = RowSlicingBlock(qW, qrow_idxs)
        seq_sce_block = SequencerBlock(block_class=SoftmaxCeBlock,
                                       params=[],
                                       sequences=[row_slicing_block.output, qtrue_labels])
        qW.fprop()
        qrow_idxs.ncols = sequence_len  # truncate to the sampled length
        qrow_idxs.fprop()
        row_slicing_block.fprop()
        seq_sce_block.fprop()
        seq_sce_block.bprop()
        row_slicing_block.bprop()
        # apply the gradient update in place: qW += dL/dW
        qW.add(Context(), qW.backward_matrix)

        # --- theano reference performing the same update ---
        th_row_idxs = T.imatrix()
        th_true_labels = T.imatrix()
        row_slicing_layer = RowSlicingLayer(W)
        toutput = row_slicing_layer.get_output_expr(th_row_idxs)
        loss = SequentialSoftmaxLayer.get_loss(toutput, th_true_labels)
        dL_dW = T.grad(loss, row_slicing_layer.W)
        fun = theano.function([th_row_idxs, th_true_labels],
                              updates=[(row_slicing_layer.W, row_slicing_layer.W + dL_dW)])
        # NOTE(review): the full `row_idxs` matrix is passed while the
        # quagga side was truncated to `sequence_len` columns -- presumably
        # the loss only consumes the labelled prefix; confirm.
        fun(row_idxs, np.hstack(true_labels[:sequence_len]))
        r.append(np.allclose(qW.to_host(), row_slicing_layer.W.get_value(), atol=1e-5))
    self.assertEqual(sum(r), len(r))
def test_bprop_matrix(self):
    """
    Compare the embedding-matrix update of RowSlicingBlock+SoftmaxCe
    for cpu and gpu backends.
    """
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        embd_dim = self.rng.random_integers(10000)
        batch_size = self.rng.random_integers(500)
        output_dim = self.rng.random_integers(2000)
        W = self.get_orthogonal_matrix(embd_dim, output_dim)
        row_idxs = self.rng.randint(embd_dim, size=(batch_size, max_input_sequence_len)).astype(np.int32)
        true_labels = [self.rng.randint(output_dim, size=(batch_size, 1)).astype(np.int32)
                       for _ in xrange(max_input_sequence_len)]
        device_id = 0
        output = {}
        for processor_type in ['gpu', 'cpu']:
            quagga.processor_type = processor_type
            qrow_idxs = Connector(Matrix.from_npa(row_idxs))
            qtrue_labels = List([Connector(Matrix.from_npa(e)) for e in true_labels], qrow_idxs.ncols)
            qW = Connector(Matrix.from_npa(W), device_id)
            row_slicing_block = RowSlicingBlock(qW, qrow_idxs)
            seq_sce_block = SequencerBlock(block_class=SoftmaxCeBlock,
                                           params=[],
                                           sequences=[row_slicing_block.output, qtrue_labels])
            qW.fprop()
            qrow_idxs.ncols = sequence_len  # truncate to the sampled length
            qrow_idxs.fprop()
            row_slicing_block.fprop()
            seq_sce_block.fprop()
            seq_sce_block.bprop()
            row_slicing_block.bprop()
            # apply the gradient update in place: qW += dL/dW
            qW.add(Context(), qW.backward_matrix)
            output[processor_type] = qW.to_host()
        r.append(np.allclose(output['gpu'], output['cpu']))
    self.assertEqual(sum(r), len(r))
x_device_id=1, y_device_id=0) embd_block = RowSlicingBlock(W=p['embd_W'], row_indexes=data_block.x) f_c_repeat_block = RepeatBlock(p['f_lstm_c0'], data_block.x.nrows, axis=0, device_id=1) f_h_repeat_block = RepeatBlock(p['f_lstm_h0'], data_block.x.nrows, axis=0, device_id=1) f_lstm_rnn_block = SequencerBlock( block_class=LstmBlock, params=[p['f_lstm_W'], p['f_lstm_R'], None], sequences=[embd_block.output, data_block.mask], output_names=['h'], prev_names=['c', 'h'], paddings=[f_c_repeat_block.output, f_h_repeat_block.output], reverse=False, device_id=1) s_c_repeat_block = RepeatBlock(p['s_lstm_c0'], data_block.x.nrows, axis=0, device_id=1) s_h_repeat_block = RepeatBlock(p['s_lstm_h0'], data_block.x.nrows, axis=0, device_id=1) s_lstm_rnn_block = SequencerBlock( block_class=LstmBlock, params=[p['s_lstm_W'], p['s_lstm_R'], None],
def test_bprop(self):
    """
    Compare `bprop` results for cpu and gpu backends.

    Builds an LSTM -> Dot -> SigmoidCe sequencer chain and checks the
    gradients w.r.t. every learnable parameter (and optionally the
    initial states and the inputs) for all combinations of direction,
    masking and learnable-initial-state flags.
    """
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32)
             for _ in xrange(max_input_sequence_len)]
        # binary float targets for the sigmoid cross-entropy loss
        true_labels = [self.rng.randint(2, size=(batch_size, 1)).astype(np.float32)
                       for _ in xrange(max_input_sequence_len)]
        mask = (self.rng.rand(batch_size, sequence_len) < 0.8).astype(np.float32)
        h_0 = self.rng.randn(batch_size, hidden_dim).astype(np.float32)
        c_0 = self.rng.randn(batch_size, hidden_dim).astype(np.float32)
        W_z = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_i = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_f = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_o = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W = np.hstack((W_z, W_i, W_f, W_o))  # stacked per-gate input weights
        R_z = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_i = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_f = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_o = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R = np.hstack((R_z, R_i, R_f, R_o))  # stacked per-gate recurrent weights
        lr_W = self.get_orthogonal_matrix(hidden_dim, 1)
        lr_b = self.rng.rand(1, 1).astype(dtype=np.float32)
        device_id = 0
        quagga_grads = {}
        for reverse in [False, True]:
            for with_mask in [False, True]:
                for learn_inital_states in [False, True]:
                    for processor_type in ['gpu', 'cpu']:
                        quagga.processor_type = processor_type
                        context = Context()
                        qx = List([Connector(Matrix.from_npa(e), device_id) for e in x])
                        qtrue_labels = List([Connector(Matrix.from_npa(e)) for e in true_labels], len(qx))
                        qmask = Matrix.empty(batch_size, len(qx))
                        # initial states only get a backward matrix when
                        # marked learnable (device_id vs None)
                        qh_0 = Connector(Matrix.from_npa(h_0), device_id if learn_inital_states else None)
                        qc_0 = Connector(Matrix.from_npa(c_0), device_id if learn_inital_states else None)
                        qW = Connector(Matrix.from_npa(W), device_id)
                        qR = Connector(Matrix.from_npa(R), device_id)
                        qlr_W = Connector(Matrix.from_npa(lr_W), device_id)
                        qlr_b = Connector(Matrix.from_npa(lr_b), device_id)
                        sequences = [qx]
                        if with_mask:
                            # NOTE: the comprehension's `i` shadows the outer
                            # loop index; harmless, rebound next iteration.
                            sequences.append(List([Connector(qmask[:, i]) for i in xrange(len(qx))], len(qx)))
                            # mask has sequence_len columns vs qmask's len(qx);
                            # presumably assign_npa fills the prefix -- TODO confirm
                            qmask.assign_npa(context, mask)
                            qmask = sequences[-1]
                        else:
                            sequences.append([None] * len(qx))
                        lstm = SequencerBlock(block_class=LstmBlock,
                                              params=[qW, qR],
                                              sequences=sequences,
                                              output_names=['h'],
                                              prev_names=['c', 'h'],
                                              paddings=[qc_0, qh_0],
                                              reverse=reverse)
                        seq_dot_block = SequencerBlock(block_class=DotBlock,
                                                       params=[qlr_W, qlr_b],
                                                       sequences=[lstm.h],
                                                       output_names=['output'])
                        seq_sce_block = SequencerBlock(block_class=SigmoidCeBlock,
                                                       params=[],
                                                       sequences=[seq_dot_block.output, qtrue_labels] + ([qmask] if with_mask else []))
                        qx.length = sequence_len
                        qx.fprop()
                        qtrue_labels.fprop()
                        if with_mask:
                            qmask.fprop()
                        qlr_W.fprop()
                        qlr_b.fprop()
                        qh_0.fprop()
                        qc_0.fprop()
                        qW.fprop()
                        qR.fprop()
                        lstm.fprop()
                        seq_dot_block.fprop()
                        seq_sce_block.fprop()
                        # backward pass in reverse topological order
                        seq_sce_block.bprop()
                        seq_dot_block.bprop()
                        lstm.bprop()
                        # fixed gradient order: d_lr_b, d_lr_W, dW, dR,
                        # (dc_0, dh_0), dx_t...
                        quagga_grads[processor_type] = [qlr_b.backward_matrix.to_host(),
                                                        qlr_W.backward_matrix.to_host(),
                                                        qW.backward_matrix.to_host(),
                                                        qR.backward_matrix.to_host()]
                        if learn_inital_states:
                            quagga_grads[processor_type].append(qc_0.backward_matrix.to_host())
                            quagga_grads[processor_type].append(qh_0.backward_matrix.to_host())
                        quagga_grads[processor_type].extend(e.backward_matrix.to_host() for e in qx)
                    for grad_gpu, grad_cpu in izip(quagga_grads['gpu'], quagga_grads['cpu']):
                        r.append(np.allclose(grad_gpu, grad_cpu, atol=1e-6))
    self.assertEqual(sum(r), len(r))
seq_embd_block = RowSlicingBlock(p['embd_W'], data_block.sentence_batch) # remove last in the list output = List(seq_embd_block.output[:-1], seq_embd_block.output.length - 1) c_fwd_repeat_block = RepeatBlock(p['lstm_fwd_c0'], data_block.sentence_batch.nrows, axis=0, device_id=0) h_fwd_repeat_block = RepeatBlock(p['lstm_fwd_h0'], data_block.sentence_batch.nrows, axis=0, device_id=0) fwd_lstm_block = SequencerBlock( block_class=LstmBlock, params=[p['lstm_fwd_W'], p['lstm_fwd_R'], 0.5], sequences=[output, data_block.mask], output_names=['h'], prev_names=['c', 'h'], paddings=[c_fwd_repeat_block.output, h_fwd_repeat_block.output], reverse=False, device_id=0) # remove first in the list output = List(seq_embd_block.output[1:], seq_embd_block.output.length - 1) c_bwd_repeat_block = RepeatBlock(p['lstm_bwd_c0'], data_block.sentence_batch.nrows, axis=0, device_id=0) h_bwd_repeat_block = RepeatBlock(p['lstm_bwd_h0'], data_block.sentence_batch.nrows, axis=0, device_id=0) bwd_lstm_block = SequencerBlock(
def test_theano_fprop(self):
    """
    Compare LSTM `fprop` results against a reference Theano LstmLayer,
    with and without a sequence mask, in both directions.
    """
    quagga.processor_type = 'gpu'
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32)
             for _ in xrange(max_input_sequence_len)]
        mask = (self.rng.rand(batch_size, sequence_len) < 0.8).astype(np.float32)
        h_0 = self.rng.randn(batch_size, hidden_dim).astype(np.float32)
        c_0 = self.rng.randn(batch_size, hidden_dim).astype(np.float32)
        W_z = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_i = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_f = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_o = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W = np.hstack((W_z, W_i, W_f, W_o))  # stacked per-gate input weights
        R_z = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_i = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_f = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_o = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R = np.hstack((R_z, R_i, R_f, R_o))  # stacked per-gate recurrent weights
        for reverse in [False, True]:
            for with_mask in [False, True]:
                context = Context()
                qx = List([Connector(Matrix.from_npa(e)) for e in x])
                qmask = Connector(Matrix.empty(batch_size, len(qx), 'float'))
                qh_0 = Connector(Matrix.from_npa(h_0))
                qc_0 = Connector(Matrix.from_npa(c_0))
                qW = Connector(Matrix.from_npa(W))
                qR = Connector(Matrix.from_npa(R))
                lstm = SequencerBlock(block_class=LstmBlock,
                                      params=[qW, qR],
                                      sequences=[qx] + ([qmask] if with_mask else []),
                                      output_names=['h'],
                                      prev_names=['c', 'h'],
                                      paddings=[qc_0, qh_0],
                                      reverse=reverse)
                qx.length = sequence_len
                for e in qx:
                    e.fprop()
                # mask has sequence_len columns vs qmask's len(qx);
                # presumably assign_npa fills the prefix -- TODO confirm
                qmask.assign_npa(context, mask)
                qmask.fprop()
                qh_0.fprop()
                qc_0.fprop()
                qW.fprop()
                qR.fprop()
                lstm.fprop()
                q_h = lstm.h.to_host()
                # reference Theano LSTM on the same weights
                th_x = T.ftensor3()
                lstm_layer = LstmLayer(W, R, c_0, h_0, reverse)
                if with_mask:
                    th_mask = T.fmatrix()
                    get_th_h = theano.function([th_x, th_mask], lstm_layer.get_output_expr(th_x, th_mask))
                    th_h = get_th_h(np.dstack(x[:sequence_len]), mask[:, :sequence_len])
                else:
                    get_th_h = theano.function([th_x], lstm_layer.get_output_expr(th_x))
                    th_h = get_th_h(np.dstack(x[:sequence_len]))
                # NOTE: inner `i` shadows the outer loop index; harmless.
                for i in xrange(th_h.shape[0]):
                    if not np.allclose(q_h[i], th_h[i]):
                        r.append(False)
                        break
                else:
                    r.append(True)
    self.assertEqual(sum(r), len(r))
def test_theano_bprop(self):
    """
    Compare `bprop` gradients of SequencerBlock(DotBlock)+SoftmaxCe
    against a reference Theano implementation.
    """
    quagga.processor_type = 'gpu'
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32)
             for _ in xrange(max_input_sequence_len)]
        true_labels = [self.rng.randint(hidden_dim, size=(batch_size, 1)).astype(np.int32)
                       for _ in xrange(max_input_sequence_len)]
        W = self.get_orthogonal_matrix(input_dim, hidden_dim)
        b = self.rng.rand(1, hidden_dim).astype(np.float32)
        device_id = 0
        for reverse in [False, True]:
            for with_bias in [False, True]:
                qx = List([Connector(Matrix.from_npa(e), device_id) for e in x])
                qtrue_labels = List([Connector(Matrix.from_npa(e)) for e in true_labels], len(qx))
                qW = Connector(Matrix.from_npa(W), device_id)
                qb = Connector(Matrix.from_npa(b), device_id) if with_bias else None
                seq_dot_block = SequencerBlock(block_class=DotBlock,
                                               params=[qW, qb],
                                               sequences=[qx],
                                               output_names=['output'],
                                               reverse=reverse)
                seq_sce_block = SequencerBlock(block_class=SoftmaxCeBlock,
                                               params=[],
                                               sequences=[seq_dot_block.output, qtrue_labels],
                                               reverse=reverse)
                qx.length = sequence_len
                qx.fprop()
                qtrue_labels.fprop()
                qW.fprop()
                if qb:
                    qb.fprop()
                seq_dot_block.fprop()
                seq_sce_block.fprop()
                seq_sce_block.bprop()
                seq_dot_block.bprop()
                # quagga gradients: dW, (db), then the list of dx_t
                quagga_grads = [qW.backward_matrix.to_host()]
                if with_bias:
                    quagga_grads.append(qb.backward_matrix.to_host())
                quagga_grads.append([e.backward_matrix.to_host() for e in qx])
                # theano reference gradients in the same order
                seq_dot_layer = SequentialDotLayer(W, b if with_bias else None, reverse)
                seq_sce_layer = SequentialSoftmaxLayer()
                th_x = T.ftensor3()
                th_true_labels = T.imatrix()
                loss = seq_sce_layer.get_loss(seq_dot_layer.get_output_expr(th_x), th_true_labels)
                wrt = [seq_dot_layer.W]
                if with_bias:
                    wrt.append(seq_dot_layer.b)
                wrt.append(th_x)
                grads = T.grad(loss, wrt)
                get_theano_grads = theano.function([th_x, th_true_labels], grads)
                theano_grads = get_theano_grads(np.dstack(x[:sequence_len]),
                                                np.hstack(true_labels[:sequence_len]))
                # parameter gradients compared directly
                for quagga_grad, theano_grad in izip(quagga_grads[:-1], theano_grads[:-1]):
                    r.append(np.allclose(quagga_grad, theano_grad, atol=1e-5))
                # input gradients: theano stacks timesteps along the last axis
                for i in xrange(theano_grads[-1].shape[-1]):
                    if not np.allclose(quagga_grads[-1][i], theano_grads[-1][..., i], atol=1e-5):
                        r.append(False)
                        break
                else:
                    r.append(True)
    self.assertEqual(sum(r), len(r))
def test_bprop(self):
    """
    Compare `bprop` results for cpu and gpu backends.

    Checks the gradients w.r.t. W, (optionally) b and every input
    timestep for a SequencerBlock(DotBlock)+SoftmaxCe chain.
    """
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32)
             for _ in xrange(max_input_sequence_len)]
        true_labels = [self.rng.randint(hidden_dim, size=(batch_size, 1)).astype(np.int32)
                       for _ in xrange(max_input_sequence_len)]
        W = self.get_orthogonal_matrix(input_dim, hidden_dim)
        b = self.rng.rand(1, hidden_dim).astype(np.float32)
        device_id = 0
        quagga_grads = {}
        for reverse in [False, True]:
            for with_bias in [False, True]:
                for processor_type in ['gpu', 'cpu']:
                    quagga.processor_type = processor_type
                    qx = List([Connector(Matrix.from_npa(e), device_id) for e in x])
                    qtrue_labels = List([Connector(Matrix.from_npa(e)) for e in true_labels], len(qx))
                    qW = Connector(Matrix.from_npa(W), device_id)
                    qb = Connector(Matrix.from_npa(b), device_id) if with_bias else None
                    seq_dot_block = SequencerBlock(block_class=DotBlock,
                                                   params=[qW, qb],
                                                   sequences=[qx],
                                                   output_names=['output'],
                                                   reverse=reverse)
                    seq_sce_block = SequencerBlock(block_class=SoftmaxCeBlock,
                                                   params=[],
                                                   sequences=[seq_dot_block.output, qtrue_labels],
                                                   reverse=reverse)
                    qx.length = sequence_len
                    qx.fprop()
                    qtrue_labels.fprop()
                    qW.fprop()
                    if qb:
                        qb.fprop()
                    seq_dot_block.fprop()
                    seq_sce_block.fprop()
                    # backward pass in reverse topological order
                    seq_sce_block.bprop()
                    seq_dot_block.bprop()
                    # collect gradients in a fixed order: dW, (db), dx_t...
                    quagga_grads[processor_type] = [qW.backward_matrix.to_host()]
                    if with_bias:
                        quagga_grads[processor_type].append(qb.backward_matrix.to_host())
                    quagga_grads[processor_type].extend(e.backward_matrix.to_host() for e in qx)
                for grad_gpu, grad_cpu in izip(quagga_grads['gpu'], quagga_grads['cpu']):
                    r.append(np.allclose(grad_gpu, grad_cpu, atol=1e-5))
    self.assertEqual(sum(r), len(r))
dec_lstm_W={'init': H5pyInitializer(model_file_name, 'dec_lstm_W'), 'device_id': 0}, dec_lstm_R={'init': H5pyInitializer(model_file_name, 'dec_lstm_R'), 'device_id': 0}, sce_dot_block_W={'init': H5pyInitializer(model_file_name, 'sce_dot_block_W'), 'device_id': 0}, sce_dot_block_b={'init': H5pyInitializer(model_file_name, 'sce_dot_block_b'), 'device_id': 0}) data_block = DataBlock(train_data, valid_data, 64, word_dropout_prob=0.99, device_id=0) enc_embd_block = RowSlicingBlock(p['embd_W'], data_block.enc_x) enc_c_repeat_block = RepeatBlock(p['enc_lstm_c0'], data_block.enc_x.nrows, axis=0, device_id=0) enc_h_repeat_block = RepeatBlock(p['enc_lstm_h0'], data_block.enc_x.nrows, axis=0, device_id=0) enc_lstm_block = SequencerBlock(block_class=LstmBlock, params=[p['enc_lstm_W'], p['enc_lstm_R'], 0.25], sequences=[enc_embd_block.output, data_block.enc_mask], output_names=['h'], prev_names=['c', 'h'], paddings=[enc_c_repeat_block.output, enc_h_repeat_block.output], reverse=False, device_id=0) dec_embd_block = RowSlicingBlock(p['embd_W'], data_block.dec_x) dec_c_repeat_block = RepeatBlock(p['dec_lstm_c0'], data_block.enc_x.nrows, axis=0, device_id=0) last_selector_block = LastSelectorBlock(enc_lstm_block.h) l2_reg_block = L2RegularizationBlock(last_selector_block.output, 0.001) dec_lstm_block = SequencerBlock(block_class=LstmBlock, params=[p['dec_lstm_W'], p['dec_lstm_R'], 0.25], sequences=[dec_embd_block.output, data_block.dec_mask], output_names=['h'], prev_names=['c', 'h'], paddings=[dec_c_repeat_block.output, last_selector_block.output], reverse=False, device_id=0)
def test_theano_bprop(self):
    """
    Compare `bprop` gradients of SequencerBlock(DotBlock)+SoftmaxCe
    against a reference Theano implementation.
    """
    quagga.processor_type = 'gpu'
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32)
             for _ in xrange(max_input_sequence_len)]
        true_labels = [self.rng.randint(hidden_dim, size=(batch_size, 1)).astype(np.int32)
                       for _ in xrange(max_input_sequence_len)]
        W = self.get_orthogonal_matrix(input_dim, hidden_dim)
        b = self.rng.rand(1, hidden_dim).astype(np.float32)
        device_id = 0
        for reverse in [False, True]:
            for with_bias in [False, True]:
                qx = List([Connector(Matrix.from_npa(e), device_id) for e in x])
                qtrue_labels = List([Connector(Matrix.from_npa(e)) for e in true_labels], len(qx))
                qW = Connector(Matrix.from_npa(W), device_id)
                qb = Connector(Matrix.from_npa(b), device_id) if with_bias else None
                seq_dot_block = SequencerBlock(block_class=DotBlock,
                                               params=[qW, qb],
                                               sequences=[qx],
                                               output_names=['output'],
                                               reverse=reverse)
                seq_sce_block = SequencerBlock(block_class=SoftmaxCeBlock,
                                               params=[],
                                               sequences=[seq_dot_block.output, qtrue_labels],
                                               reverse=reverse)
                qx.length = sequence_len
                qx.fprop()
                qtrue_labels.fprop()
                qW.fprop()
                if qb:
                    qb.fprop()
                seq_dot_block.fprop()
                seq_sce_block.fprop()
                seq_sce_block.bprop()
                seq_dot_block.bprop()
                # quagga gradients: dW, (db), then the list of dx_t
                quagga_grads = [qW.backward_matrix.to_host()]
                if with_bias:
                    quagga_grads.append(qb.backward_matrix.to_host())
                quagga_grads.append([e.backward_matrix.to_host() for e in qx])
                # theano reference gradients in the same order
                seq_dot_layer = SequentialDotLayer(W, b if with_bias else None, reverse)
                seq_sce_layer = SequentialSoftmaxLayer()
                th_x = T.ftensor3()
                th_true_labels = T.imatrix()
                loss = seq_sce_layer.get_loss(seq_dot_layer.get_output_expr(th_x), th_true_labels)
                wrt = [seq_dot_layer.W]
                if with_bias:
                    wrt.append(seq_dot_layer.b)
                wrt.append(th_x)
                grads = T.grad(loss, wrt)
                get_theano_grads = theano.function([th_x, th_true_labels], grads)
                theano_grads = get_theano_grads(np.dstack(x[:sequence_len]),
                                                np.hstack(true_labels[:sequence_len]))
                # parameter gradients compared directly
                for quagga_grad, theano_grad in izip(quagga_grads[:-1], theano_grads[:-1]):
                    r.append(np.allclose(quagga_grad, theano_grad, atol=1e-5))
                # input gradients: theano stacks timesteps along the last axis
                for i in xrange(theano_grads[-1].shape[-1]):
                    if not np.allclose(quagga_grads[-1][i], theano_grads[-1][..., i], atol=1e-5):
                        r.append(False)
                        break
                else:
                    r.append(True)
    self.assertEqual(sum(r), len(r))
def test_fprop(self):
    """
    Compare LSTM `fprop` results for cpu and gpu backends, with and
    without a sequence mask, in both directions.
    """
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32)
             for _ in xrange(max_input_sequence_len)]
        mask = (self.rng.rand(batch_size, sequence_len) < 0.8).astype(np.float32)
        h_0 = self.rng.randn(batch_size, hidden_dim).astype(np.float32)
        c_0 = self.rng.randn(batch_size, hidden_dim).astype(np.float32)
        W_z = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_i = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_f = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_o = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W = np.hstack((W_z, W_i, W_f, W_o))  # stacked per-gate input weights
        R_z = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_i = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_f = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_o = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R = np.hstack((R_z, R_i, R_f, R_o))  # stacked per-gate recurrent weights
        qh = {}
        for reverse in [False, True]:
            for with_mask in [False, True]:
                for processor_type in ['gpu', 'cpu']:
                    quagga.processor_type = processor_type
                    context = Context()
                    qx = List([Connector(Matrix.from_npa(e)) for e in x])
                    qmask = Matrix.empty(batch_size, len(qx), 'float')
                    qh_0 = Connector(Matrix.from_npa(h_0))
                    qc_0 = Connector(Matrix.from_npa(c_0))
                    qW = Connector(Matrix.from_npa(W))
                    qR = Connector(Matrix.from_npa(R))
                    sequences = [qx]
                    if with_mask:
                        # NOTE: the comprehension's `i` shadows the outer
                        # loop index; harmless, rebound next iteration.
                        sequences.append(List([Connector(qmask[:, i]) for i in xrange(len(qx))], len(qx)))
                        # mask has sequence_len columns vs qmask's len(qx);
                        # presumably assign_npa fills the prefix -- TODO confirm
                        qmask.assign_npa(context, mask)
                        qmask = sequences[-1]
                    else:
                        sequences.append([None] * len(qx))
                    lstm = SequencerBlock(block_class=LstmBlock,
                                          params=[qW, qR],
                                          sequences=sequences,
                                          output_names=['h'],
                                          prev_names=['c', 'h'],
                                          paddings=[qc_0, qh_0],
                                          reverse=reverse)
                    qx.length = sequence_len
                    if with_mask:
                        qmask.fprop()
                    qx.fprop()
                    qh_0.fprop()
                    qc_0.fprop()
                    qW.fprop()
                    qR.fprop()
                    lstm.fprop()
                    qh[processor_type] = lstm.h.to_host()
                # compare per-timestep hidden states of the two backends
                for h_gpu, h_cpu in izip(qh['gpu'], qh['cpu']):
                    if not np.allclose(h_gpu, h_cpu, rtol=1e-7, atol=1e-3):
                        r.append(False)
                        break
                else:
                    r.append(True)
    self.assertEqual(sum(r), len(r))
def test_theano_grad(self):
    """
    Compare full-model gradients (LSTM -> Dot -> SoftmaxCe) against a
    reference Theano implementation, for all combinations of direction,
    masking and learnable-initial-state flags.
    """
    quagga.processor_type = 'gpu'
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(300)
        sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(128)
        input_dim, hidden_dim, class_num = self.rng.random_integers(1500, size=3)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32)
             for _ in xrange(max_input_sequence_len)]
        true_labels = [self.rng.randint(class_num, size=(batch_size, 1)).astype(np.int32)
                       for _ in xrange(max_input_sequence_len)]
        mask = (self.rng.rand(batch_size, sequence_len) < 0.8).astype(np.float32)
        h_0 = self.rng.randn(batch_size, hidden_dim).astype(np.float32)
        c_0 = self.rng.randn(batch_size, hidden_dim).astype(np.float32)
        W_z = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_i = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_f = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_o = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W = np.hstack((W_z, W_i, W_f, W_o))  # stacked per-gate input weights
        R_z = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_i = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_f = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_o = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R = np.hstack((R_z, R_i, R_f, R_o))  # stacked per-gate recurrent weights
        lr_W = self.get_orthogonal_matrix(hidden_dim, class_num)
        lr_b = self.rng.rand(1, class_num).astype(dtype=np.float32)
        device_id = 0
        for reverse in [False, True]:
            for with_mask in [False, True]:
                for learn_inital_states in [False, True]:
                    # quagga model
                    context = Context()
                    qx = List([Connector(Matrix.from_npa(e), device_id) for e in x])
                    qtrue_labels = List([Connector(Matrix.from_npa(e)) for e in true_labels], qx.length)
                    qmask = Matrix.empty(batch_size, qx.length, 'float')
                    # per-column views of the mask feed the sequencers;
                    # the wrapping Connector is used for host->device upload
                    qmask_list = [Connector(qmask[:, i]) for i in xrange(qmask.ncols)]
                    qmask = Connector(qmask)
                    # initial states only get a backward matrix when
                    # marked learnable (device_id vs None)
                    qh_0 = Connector(Matrix.from_npa(h_0), device_id if learn_inital_states else None)
                    qc_0 = Connector(Matrix.from_npa(c_0), device_id if learn_inital_states else None)
                    qW = Connector(Matrix.from_npa(W), device_id)
                    qR = Connector(Matrix.from_npa(R), device_id)
                    qlr_W = Connector(Matrix.from_npa(lr_W), device_id)
                    qlr_b = Connector(Matrix.from_npa(lr_b), device_id)
                    lstm = SequencerBlock(block_class=LstmBlock,
                                          params=[qW, qR],
                                          sequences=[qx, qmask_list if with_mask else [None] * len(qx)],
                                          output_names=['h'],
                                          prev_names=['c', 'h'],
                                          paddings=[qc_0, qh_0],
                                          reverse=reverse)
                    seq_dot_block = SequencerBlock(block_class=DotBlock,
                                                   params=[qlr_W, qlr_b],
                                                   sequences=[lstm.h],
                                                   output_names=['output'])
                    seq_sce_block = SequencerBlock(block_class=SoftmaxCeBlock,
                                                   params=[],
                                                   sequences=[seq_dot_block.output, qtrue_labels, qmask_list if with_mask else [None] * len(qx)])
                    qx.length = sequence_len
                    for e in qx:
                        e.fprop()
                    for e in qtrue_labels:
                        e.fprop()
                    # mask has sequence_len columns vs qmask's qx.length;
                    # presumably assign_npa fills the prefix -- TODO confirm
                    qmask.assign_npa(context, mask)
                    qmask.fprop()
                    qlr_W.fprop()
                    qlr_b.fprop()
                    qh_0.fprop()
                    qc_0.fprop()
                    qW.fprop()
                    qR.fprop()
                    lstm.fprop()
                    seq_dot_block.fprop()
                    seq_sce_block.fprop()
                    # backward pass in reverse topological order
                    seq_sce_block.bprop()
                    seq_dot_block.bprop()
                    lstm.bprop()
                    # fixed gradient order: d_lr_b, d_lr_W, dW, dR,
                    # (dc_0, dh_0), then the list of dx_t
                    quagga_grads = [qlr_b.backward_matrix.to_host(),
                                    qlr_W.backward_matrix.to_host(),
                                    qW.backward_matrix.to_host(),
                                    qR.backward_matrix.to_host()]
                    if learn_inital_states:
                        quagga_grads.append(qc_0.backward_matrix.to_host())
                        quagga_grads.append(qh_0.backward_matrix.to_host())
                    quagga_grads.append([e.backward_matrix.to_host() for e in qx])
                    # drop references to free device memory before the
                    # theano graph is compiled
                    del qx
                    del qlr_b
                    del qlr_W
                    del qW
                    del qR
                    del qmask
                    del lstm
                    del seq_dot_block
                    del seq_sce_block
                    # theano model with the same weights
                    th_x = T.ftensor3()
                    th_true_labels = T.imatrix()
                    th_mask = T.fmatrix()
                    lstm_layer = LstmLayer(W, R, c_0, h_0, reverse=reverse)
                    th_h = lstm_layer.get_output_expr(th_x, th_mask if with_mask else None)
                    seq_softmax_layer = SequentialSoftmaxLayer(lr_W, lr_b, reverse)
                    loss = seq_softmax_layer.get_loss(th_h, th_true_labels, th_mask if with_mask else None)
                    # wrt order mirrors quagga_grads above
                    wrt = [seq_softmax_layer.b, seq_softmax_layer.W, lstm_layer.W, lstm_layer.R]
                    if learn_inital_states:
                        wrt.append(lstm_layer.c0)
                        wrt.append(lstm_layer.h0)
                    wrt.append(th_x)
                    grads = T.grad(loss, wrt)
                    if with_mask:
                        get_theano_grads = theano.function([th_x, th_true_labels, th_mask], grads)
                        theano_grads = get_theano_grads(np.dstack(x[:sequence_len]),
                                                        np.hstack(true_labels[:sequence_len]),
                                                        mask[:, :sequence_len])
                    else:
                        get_theano_grads = theano.function([th_x, th_true_labels], grads)
                        theano_grads = get_theano_grads(np.dstack(x[:sequence_len]),
                                                        np.hstack(true_labels[:sequence_len]))
                    # parameter gradients compared directly
                    for quagga_grad, theano_grad in izip(quagga_grads[:-1], theano_grads[:-1]):
                        r.append(np.allclose(quagga_grad, theano_grad, atol=1e-6))
                    # input gradients: theano stacks timesteps along the last axis
                    for i in xrange(theano_grads[-1].shape[-1]):
                        if not np.allclose(quagga_grads[-1][i], theano_grads[-1][..., i], atol=1e-6):
                            r.append(False)
                            break
                    else:
                        r.append(True)
    self.assertEqual(sum(r), len(r))