예제 #1
0
    def test_fprop(self):
        """
        Check that NonlinearityBlock forward propagation gives matching
        outputs on the gpu and cpu backends for each tested nonlinearity.
        """
        checks = []
        for _ in xrange(self.N):
            batch_size, x_dim = self.rng.random_integers(3000, size=2)
            x = self.rng.rand(batch_size, x_dim).astype(np.float32)

            for nonlinearity in ['sigmoid', 'tanh', 'relu']:
                outputs = {}
                rng_state = self.rng.get_state()
                for processor_type in ['gpu', 'cpu']:
                    # each backend starts from the same generator state
                    self.rng.set_state(rng_state)
                    quagga.processor_type = processor_type
                    qx = Connector(Matrix.from_npa(x))
                    block = NonlinearityBlock(qx, nonlinearity)
                    qx.fprop()
                    block.fprop()
                    outputs[processor_type] = block.output.to_host()

                checks.append(np.allclose(outputs['gpu'], outputs['cpu']))

        self.assertEqual(sum(checks), len(checks))
예제 #2
0
    def test_fprop_matrix(self):
        """
        Check that RowSlicingBlock forward propagation over a matrix of row
        indexes gives matching outputs on the gpu and cpu backends.
        """
        checks = []
        for n in xrange(self.N):
            max_input_sequence_len = self.rng.random_integers(300)
            # the first iteration always uses the full sequence length
            if n == 0:
                sequence_len = max_input_sequence_len
            else:
                sequence_len = self.rng.random_integers(max_input_sequence_len)
            embd_dim = self.rng.random_integers(10000)
            batch_size, output_dim = self.rng.random_integers(2000, size=2)
            W = self.get_orthogonal_matrix(embd_dim, output_dim)
            idxs_shape = (batch_size, max_input_sequence_len)
            row_idxs = self.rng.randint(embd_dim, size=idxs_shape).astype(np.int32)

            outputs = {}
            for processor_type in ['gpu', 'cpu']:
                quagga.processor_type = processor_type
                qrow_idxs = Connector(Matrix.from_npa(row_idxs))
                qW = Connector(Matrix.from_npa(W))
                slicing_block = RowSlicingBlock(qW, qrow_idxs)
                qW.fprop()
                # set the column count of the index connector to sequence_len
                qrow_idxs.ncols = sequence_len
                qrow_idxs.fprop()
                slicing_block.fprop()
                outputs[processor_type] = slicing_block.output.to_host()

            # one check per pair of corresponding output elements
            checks.extend(np.allclose(out_gpu, out_cpu)
                          for out_gpu, out_cpu in izip(outputs['gpu'], outputs['cpu']))

        self.assertEqual(sum(checks), len(checks))
예제 #3
0
    def test_theano_fprop_matrix(self):
        """
        Compare the gpu forward pass of RowSlicingBlock over a matrix of row
        indexes with the output of the Theano RowSlicingLayer.
        """
        checks = []
        for n in xrange(self.N):
            max_input_sequence_len = self.rng.random_integers(300)
            # the first iteration always uses the full sequence length
            if n == 0:
                sequence_len = max_input_sequence_len
            else:
                sequence_len = self.rng.random_integers(max_input_sequence_len)
            embd_dim = self.rng.random_integers(10000)
            batch_size = self.rng.random_integers(500)
            output_dim = self.rng.random_integers(2000)
            W = self.get_orthogonal_matrix(embd_dim, output_dim)
            idxs_shape = (batch_size, max_input_sequence_len)
            row_idxs = self.rng.randint(embd_dim, size=idxs_shape).astype(np.int32)

            # quagga model
            quagga.processor_type = 'gpu'
            qrow_idxs = Connector(Matrix.from_npa(row_idxs))
            qW = Connector(Matrix.from_npa(W))
            slicing_block = RowSlicingBlock(qW, qrow_idxs)
            qW.fprop()
            qrow_idxs.ncols = sequence_len
            qrow_idxs.fprop()
            slicing_block.fprop()
            q_output = slicing_block.output.to_host()

            # Theano model
            th_row_idxs = T.imatrix()
            th_expr = RowSlicingLayer(W).get_output_expr(th_row_idxs)
            th_output = theano.function([th_row_idxs], th_expr)(row_idxs)

            # only the first `sequence_len` entries are compared
            checks.extend(np.allclose(q_output[t], th_output[t])
                          for t in xrange(sequence_len))

        self.assertEqual(sum(checks), len(checks))
예제 #4
0
    def test_bprop_vector(self):
        """
        Check that backpropagation through RowSlicingBlock with a single
        column of row indexes gives matching results on gpu and cpu.

        The weight connector has its backward matrix added to it and the
        resulting host copies are compared between the two backends.
        """
        checks = []
        for _ in xrange(self.N):
            embd_dim = self.rng.random_integers(10000)
            batch_size, output_dim = self.rng.random_integers(2000, size=2)
            W = self.get_orthogonal_matrix(embd_dim, output_dim)
            row_idxs = self.rng.randint(embd_dim, size=(batch_size, 1)).astype(np.int32)
            true_labels = self.rng.randint(output_dim, size=(batch_size, 1)).astype(np.int32)
            device_id = 0

            updated_W = {}
            for processor_type in ['gpu', 'cpu']:
                quagga.processor_type = processor_type
                qrow_idxs = Connector(Matrix.from_npa(row_idxs))
                qtrue_labels = Connector(Matrix.from_npa(true_labels))
                qW = Connector(Matrix.from_npa(W), device_id)
                slicing_block = RowSlicingBlock(qW, qrow_idxs)
                sce_block = SoftmaxCeBlock(slicing_block.output, qtrue_labels)

                # forward pass
                for node in (qW, qrow_idxs, slicing_block, sce_block):
                    node.fprop()
                # backward pass, in the opposite order
                for node in (sce_block, slicing_block):
                    node.bprop()

                qW.add(Context(), qW.backward_matrix)
                updated_W[processor_type] = qW.to_host()

            checks.append(np.allclose(updated_W['gpu'], updated_W['cpu']))

        self.assertEqual(sum(checks), len(checks))
    def test_fprop(self):
        """
        Check that SequentialMeanPoolingBlock forward propagation gives the
        same output on the gpu and cpu backends.
        """
        checks = []
        for n in xrange(self.N):
            max_input_sequence_len = self.rng.random_integers(500)
            # the first iteration always uses the full sequence length
            if n == 0:
                sequence_len = max_input_sequence_len
            else:
                sequence_len = self.rng.random_integers(max_input_sequence_len)
            batch_size = self.rng.random_integers(512)
            dim = self.rng.random_integers(1500)
            x = []
            for _ in xrange(max_input_sequence_len):
                x.append(self.rng.rand(batch_size, dim).astype(dtype=np.float32))

            outputs = {}
            rng_state = self.rng.get_state()
            for processor_type in ['gpu', 'cpu']:
                # each backend starts from the same generator state
                self.rng.set_state(rng_state)
                quagga.processor_type = processor_type
                qx = List([Connector(Matrix.from_npa(e)) for e in x])
                pooling_block = SequentialMeanPoolingBlock(qx)
                qx.set_length(sequence_len)
                pooling_block.fprop()
                outputs[processor_type] = pooling_block.output.to_host()

            checks.append(np.allclose(outputs['gpu'], outputs['cpu']))

        self.assertEqual(sum(checks), self.N)
    def test_fprop(self):
        """
        Run SequentialMeanPoolingBlock forward propagation on the gpu and
        cpu backends and require the two outputs to be close.
        """
        verdicts = []
        for step in xrange(self.N):
            max_input_sequence_len = self.rng.random_integers(500)
            # the first step always uses the full sequence length
            if step == 0:
                sequence_len = max_input_sequence_len
            else:
                sequence_len = self.rng.random_integers(max_input_sequence_len)
            batch_size = self.rng.random_integers(512)
            dim = self.rng.random_integers(1500)
            x = [self.rng.rand(batch_size, dim).astype(dtype=np.float32)
                 for _ in xrange(max_input_sequence_len)]

            def pooled_output(processor_type):
                # build the block on the requested backend and run it forward
                quagga.processor_type = processor_type
                qx = List([Connector(Matrix.from_npa(e)) for e in x])
                pooling_block = SequentialMeanPoolingBlock(qx)
                qx.set_length(sequence_len)
                pooling_block.fprop()
                return pooling_block.output.to_host()

            rng_state = self.rng.get_state()
            output_gpu = pooled_output('gpu')
            self.rng.set_state(rng_state)
            output_cpu = pooled_output('cpu')

            verdicts.append(np.allclose(output_gpu, output_cpu))

        self.assertEqual(sum(verdicts), self.N)
예제 #7
0
    def test_theano_grad(self):
        """
        Compare the gradient SseBlock produces for its prediction input on
        the gpu backend with the gradient Theano derives for the same loss.

        The Theano reference loss is the batch mean of the per-row sum of
        squared differences between the prediction and the target.
        """
        quagga.processor_type = 'gpu'

        # The symbolic graph does not depend on the sampled data, so the
        # gradient function is compiled once rather than on every iteration.
        th_y_hat, th_y = T.fmatrix(), T.fmatrix()
        loss = T.mean(T.sum((th_y_hat - th_y) ** 2, axis=1))
        get_theano_grads = theano.function([th_y_hat, th_y], T.grad(loss, wrt=th_y_hat))

        r = []
        for _ in xrange(self.N):
            batch_size, dim = self.rng.random_integers(2000, size=2)
            y_hat = self.rng.randn(batch_size, dim).astype(dtype=np.float32)
            y = self.rng.randn(batch_size, dim).astype(dtype=np.float32)

            # Theano model
            th_dL_dy_hat = get_theano_grads(y_hat, y)

            # quagga model
            context = Context()
            y_hat_gpu = Connector(Matrix.from_npa(y_hat), context, context)
            y_gpu = Connector(Matrix.from_npa(y))
            sse_block = SseBlock(y_hat_gpu, y_gpu)
            sse_block.fprop()
            sse_block.bprop()
            q_dL_dy_hat = y_hat_gpu.backward_matrix.to_host()

            r.append(np.allclose(th_dL_dy_hat, q_dL_dy_hat))

        self.assertEqual(sum(r), self.N)
예제 #8
0
    def test_bprop(self):
        """
        Check that SseBlock backpropagation gives the same gradient for the
        prediction input on the gpu and cpu backends.
        """
        checks = []
        for _ in xrange(self.N):
            batch_size, dim = self.rng.random_integers(2000, size=2)
            y_hat = self.rng.randn(batch_size, dim).astype(dtype=np.float32)
            y = self.rng.randn(batch_size, dim).astype(dtype=np.float32)

            dL_dy_hat = {}
            for processor_type in ['gpu', 'cpu']:
                quagga.processor_type = processor_type
                context = Context()
                qy_hat = Connector(Matrix.from_npa(y_hat), context, context)
                qy = Connector(Matrix.from_npa(y))
                sse_block = SseBlock(qy_hat, qy)
                sse_block.fprop()
                sse_block.bprop()
                dL_dy_hat[processor_type] = qy_hat.backward_matrix.to_host()

            checks.append(np.allclose(dL_dy_hat['gpu'], dL_dy_hat['cpu']))

        self.assertEqual(sum(checks), self.N)
예제 #9
0
    def test_fprop(self):
        """
        Run NonlinearityBlock forward propagation on the gpu and cpu
        backends for every tested nonlinearity and require close outputs.
        """
        verdicts = []
        for _ in xrange(self.N):
            batch_size, x_dim = self.rng.random_integers(3000, size=2)
            x = self.rng.rand(batch_size, x_dim).astype(np.float32)

            def activation_output(processor_type, nonlinearity):
                # build the block on the requested backend and run it forward
                quagga.processor_type = processor_type
                qx = Connector(Matrix.from_npa(x))
                block = NonlinearityBlock(qx, nonlinearity)
                qx.fprop()
                block.fprop()
                return block.output.to_host()

            for nonlinearity in ['sigmoid', 'tanh', 'relu']:
                rng_state = self.rng.get_state()
                output_gpu = activation_output('gpu', nonlinearity)
                self.rng.set_state(rng_state)
                output_cpu = activation_output('cpu', nonlinearity)

                verdicts.append(np.allclose(output_gpu, output_cpu))

        self.assertEqual(sum(verdicts), len(verdicts))
예제 #10
0
    def test_bprop(self):
        """
        Check that the gradient RepeatBlock propagates back to its input is
        the same on the gpu and cpu backends.
        """
        checks = []
        for _ in xrange(self.N):
            repeats = self.rng.random_integers(42)
            axis = self.rng.randint(2)
            input_dim, output_dim = self.rng.random_integers(2000, size=2)
            x = self.get_normal_matrix(input_dim, output_dim)
            # when axis is 0 the number of label rows is scaled by `repeats`
            if not axis:
                input_dim = input_dim * repeats
            true_labels = self.rng.randint(output_dim, size=(input_dim, 1)).astype(np.int32)
            device_id = 0

            dL_dx = {}
            for processor_type in ['gpu', 'cpu']:
                quagga.processor_type = processor_type
                qx = Connector(Matrix.from_npa(x), device_id)
                qtrue_labels = Connector(Matrix.from_npa(true_labels))
                repeat_block = RepeatBlock(qx, repeats, axis)
                sce_block = SoftmaxCeBlock(repeat_block.output, qtrue_labels)

                # forward pass
                for node in (qx, qtrue_labels, repeat_block, sce_block):
                    node.fprop()
                # backward pass, in the opposite order
                for node in (sce_block, repeat_block):
                    node.bprop()

                dL_dx[processor_type] = qx.backward_matrix.to_host()

            checks.append(np.allclose(dL_dx['gpu'], dL_dx['cpu']))

        self.assertEqual(sum(checks), len(checks))
예제 #11
0
    def test_theano_fprop_vector(self):
        """
        Compare the gpu forward pass of RowSlicingBlock over a single column
        of row indexes with the output of the Theano RowSlicingLayer.
        """
        checks = []
        for _ in xrange(self.N):
            embd_dim = self.rng.random_integers(10000)
            batch_size, output_dim = self.rng.random_integers(2000, size=2)
            W = self.get_orthogonal_matrix(embd_dim, output_dim)
            row_idxs = self.rng.randint(embd_dim, size=(batch_size, 1)).astype(np.int32)

            # quagga model
            quagga.processor_type = 'gpu'
            qrow_idxs = Connector(Matrix.from_npa(row_idxs))
            qW = Connector(Matrix.from_npa(W))
            slicing_block = RowSlicingBlock(qW, qrow_idxs)
            for node in (qW, qrow_idxs, slicing_block):
                node.fprop()
            q_output = slicing_block.output.to_host()

            # Theano model; it takes the indexes as a flat vector
            th_row_idxs = T.ivector()
            th_expr = RowSlicingLayer(W).get_output_expr(th_row_idxs)
            get_th_output = theano.function([th_row_idxs], th_expr)
            th_output = get_th_output(row_idxs[:, 0])

            checks.append(np.allclose(q_output, th_output))

        self.assertEqual(sum(checks), len(checks))
예제 #12
0
    def test_bprop(self):
        """
        Check that VerticalStackBlock backpropagation gives matching
        gradients on the gpu and cpu backends.

        Each input matrix is randomly wrapped with device id 0 or with None.
        Gradients are collected only from connectors that report
        `bpropagable`. An iteration in which no matrix received device id 0
        is recorded as passing without running the blocks.
        """
        verdicts = []
        for _ in xrange(self.N):
            ncols = self.rng.random_integers(1, 3000)
            row_offsets = [0]
            row_slices = []
            device_ids = []
            matrices = []
            for _ in xrange(self.rng.random_integers(1, 10)):
                height = self.rng.random_integers(1, 2000)
                start = row_offsets[-1]
                row_offsets.append(start + height)
                if self.rng.choice([True, False]):
                    device_ids.append(0)
                    row_slices.append((start, row_offsets[-1]))
                else:
                    device_ids.append(None)
                matrices.append(self.rng.rand(height, ncols).astype(np.float32))
            total_rows = row_offsets[-1]
            true_labels = self.rng.randint(ncols, size=(total_rows, 1)).astype(np.int32)
            if not row_slices:
                verdicts.append(True)
                continue

            grads = {}
            for processor_type in ['gpu', 'cpu']:
                quagga.processor_type = processor_type
                qmatrices = []
                for matrix, d_id in izip(matrices, device_ids):
                    qmatrices.append(Connector(Matrix.from_npa(matrix), d_id))
                qtrue_labels = Connector(Matrix.from_npa(true_labels))
                vstack_block = VerticalStackBlock(*qmatrices)
                sce_block = SoftmaxCeBlock(vstack_block.output, qtrue_labels)

                # forward pass
                for node in qmatrices + [qtrue_labels, vstack_block, sce_block]:
                    node.fprop()
                # backward pass, in the opposite order
                for node in (sce_block, vstack_block):
                    node.bprop()

                grads[processor_type] = [qm.backward_matrix.to_host()
                                         for qm in qmatrices if qm.bpropagable]

            # the iteration passes only if every gradient pair is close
            verdicts.append(all(np.allclose(dL_dm_gpu, dL_dm_cpu)
                                for dL_dm_gpu, dL_dm_cpu in izip(grads['gpu'], grads['cpu'])))
        self.assertEqual(sum(verdicts), self.N)
예제 #13
0
    def test_bprop(self):
        """
        Check that backpropagation through a SequencerBlock of DotBlocks
        gives matching gradients on the gpu and cpu backends.

        A SoftmaxCeBlock sequence on top of the dot sequence drives the
        backward pass. Gradients of W, of b (when the bias is used) and of
        every input element are compared, for both sequence directions.
        """
        checks = []
        # same order as nested loops over `reverse`, then `with_bias`
        configurations = [(reverse, with_bias)
                          for reverse in [False, True]
                          for with_bias in [False, True]]
        for n in xrange(self.N):
            max_input_sequence_len = self.rng.random_integers(500)
            # the first iteration always uses the full sequence length
            if n == 0:
                sequence_len = max_input_sequence_len
            else:
                sequence_len = self.rng.random_integers(max_input_sequence_len)
            batch_size = self.rng.random_integers(256)
            input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
            x = [self.rng.randn(batch_size, input_dim).astype(np.float32)
                 for _ in xrange(max_input_sequence_len)]
            true_labels = [self.rng.randint(hidden_dim, size=(batch_size, 1)).astype(np.int32)
                           for _ in xrange(max_input_sequence_len)]
            W = self.get_orthogonal_matrix(input_dim, hidden_dim)
            b = self.rng.rand(1, hidden_dim).astype(np.float32)
            device_id = 0

            for reverse, with_bias in configurations:
                quagga_grads = {}
                for processor_type in ['gpu', 'cpu']:
                    quagga.processor_type = processor_type
                    qx = List([Connector(Matrix.from_npa(e), device_id) for e in x])
                    qtrue_labels = List([Connector(Matrix.from_npa(e)) for e in true_labels], len(qx))
                    qW = Connector(Matrix.from_npa(W), device_id)
                    if with_bias:
                        qb = Connector(Matrix.from_npa(b), device_id)
                    else:
                        qb = None
                    seq_dot_block = SequencerBlock(block_class=DotBlock,
                                                   params=[qW, qb],
                                                   sequences=[qx],
                                                   output_names=['output'],
                                                   reverse=reverse)
                    seq_sce_block = SequencerBlock(block_class=SoftmaxCeBlock,
                                                   params=[],
                                                   sequences=[seq_dot_block.output, qtrue_labels],
                                                   reverse=reverse)
                    qx.length = sequence_len

                    # forward pass
                    qx.fprop()
                    qtrue_labels.fprop()
                    qW.fprop()
                    if qb:
                        qb.fprop()
                    seq_dot_block.fprop()
                    seq_sce_block.fprop()
                    # backward pass
                    seq_sce_block.bprop()
                    seq_dot_block.bprop()

                    grads = [qW.backward_matrix.to_host()]
                    if with_bias:
                        grads.append(qb.backward_matrix.to_host())
                    grads.extend(e.backward_matrix.to_host() for e in qx)
                    quagga_grads[processor_type] = grads

                checks.extend(np.allclose(grad_gpu, grad_cpu, atol=1e-5)
                              for grad_gpu, grad_cpu in izip(quagga_grads['gpu'], quagga_grads['cpu']))

        self.assertEqual(sum(checks), len(checks))
예제 #14
0
    def test_fprop(self):
        """
        Check that forward propagation through a SequencerBlock of DotBlocks
        gives matching outputs on the gpu and cpu backends, with and without
        bias and for both sequence directions.
        """
        verdicts = []
        # same order as nested loops over `reverse`, then `with_bias`
        configurations = [(reverse, with_bias)
                          for reverse in [False, True]
                          for with_bias in [False, True]]
        for n in xrange(self.N):
            max_input_sequence_len = self.rng.random_integers(500)
            # the first iteration always uses the full sequence length
            if n == 0:
                sequence_len = max_input_sequence_len
            else:
                sequence_len = self.rng.random_integers(max_input_sequence_len)
            batch_size = self.rng.random_integers(256)
            input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
            x = []
            for _ in xrange(max_input_sequence_len):
                x.append(self.rng.randn(batch_size, input_dim).astype(np.float32))
            W = self.get_orthogonal_matrix(input_dim, hidden_dim)
            b = self.rng.rand(1, hidden_dim).astype(np.float32)

            # NOTE(review): device index 1 is hard-coded and re-selected on
            # every iteration; presumably this needs a second GPU - confirm
            # that it is intentional.
            from quagga.cuda import cudart
            cudart.cuda_set_device(1)

            for reverse, with_bias in configurations:
                qoutput = {}
                for processor_type in ['gpu', 'cpu']:
                    quagga.processor_type = processor_type
                    qx = List([Connector(Matrix.from_npa(e)) for e in x])
                    qW = Connector(Matrix.from_npa(W))
                    if with_bias:
                        qb = Connector(Matrix.from_npa(b))
                    else:
                        qb = None
                    seq_dot_block = SequencerBlock(block_class=DotBlock,
                                                   params=[qW, qb],
                                                   sequences=[qx],
                                                   output_names=['output'],
                                                   reverse=reverse)
                    qx.length = sequence_len
                    qx.fprop()
                    qW.fprop()
                    if qb:
                        qb.fprop()
                    seq_dot_block.fprop()
                    qoutput[processor_type] = seq_dot_block.output.to_host()

                # one verdict per configuration: every output pair is close
                verdicts.append(all(np.allclose(output_gpu, output_cpu, atol=1e-5)
                                    for output_gpu, output_cpu in izip(qoutput['gpu'], qoutput['cpu'])))

        self.assertEqual(sum(verdicts), len(verdicts))
예제 #15
0
    def test_fprop(self):
        """
        Check that SequentialHorizontalStackBlock forward propagation gives
        matching output sequences on the gpu and cpu backends.

        Two input sequences with independent feature sizes are built and
        both are given the same sequence length. An iteration passes only if
        every pair of corresponding output elements is close.
        """
        r = []
        for i in xrange(self.N):
            max_input_sequence_len = self.rng.random_integers(500)
            # the first iteration always uses the full sequence length
            if i == 0:
                sequence_len = max_input_sequence_len
            else:
                sequence_len = self.rng.random_integers(max_input_sequence_len)
            batch_size = self.rng.random_integers(512)
            dim_x, dim_y = self.rng.random_integers(1500, size=2)
            x = [self.rng.rand(batch_size, dim_x).astype(dtype=np.float32)
                 for _ in xrange(max_input_sequence_len)]
            y = [self.rng.rand(batch_size, dim_y).astype(dtype=np.float32)
                 for _ in xrange(max_input_sequence_len)]

            output_sequences = {}
            state = self.rng.get_state()
            for processor_type in ['gpu', 'cpu']:
                # each backend starts from the same generator state
                self.rng.set_state(state)
                quagga.processor_type = processor_type
                qx = List([Connector(Matrix.from_npa(e)) for e in x])
                qy = List([Connector(Matrix.from_npa(e)) for e in y])
                seq_hstack_block = SequentialHorizontalStackBlock(qx, qy)
                qx.length = sequence_len
                qy.length = sequence_len
                seq_hstack_block.fprop()
                output_sequences[processor_type] = seq_hstack_block.output.to_host()

            r.append(all(np.allclose(out_gpu, out_cpu)
                         for out_gpu, out_cpu in izip(output_sequences['gpu'],
                                                      output_sequences['cpu'])))

        self.assertEqual(sum(r), self.N)
예제 #16
0
    def test_theano_grad(self):
        """
        Compare the gradient SigmoidCeBlock produces for its input on the
        gpu backend with the gradient Theano derives for a sigmoid followed
        by mean binary cross-entropy, with and without a mask.
        """
        quagga.processor_type = 'gpu'
        checks = []
        for _ in xrange(self.N):
            batch_size = self.rng.random_integers(2000)
            true_labels = self.rng.randint(2, size=(batch_size, 1)).astype(dtype=np.float32)
            mask = (self.rng.rand(batch_size, 1) < 0.8).astype(np.float32)
            x = self.rng.randn(batch_size, 1).astype(dtype=np.float32)
            device_id = 0

            for with_mask in [False, True]:
                # Theano model
                th_x = T.fmatrix()
                th_mask = T.fmatrix()
                th_true_labels = T.fmatrix()
                if with_mask:
                    probs = T.nnet.sigmoid(th_mask * th_x)
                    th_inputs = [th_x, th_true_labels, th_mask]
                    th_args = (x, true_labels, mask)
                else:
                    probs = T.nnet.sigmoid(th_x)
                    th_inputs = [th_x, th_true_labels]
                    th_args = (x, true_labels)
                loss = T.mean(T.nnet.binary_crossentropy(probs, th_true_labels))
                get_theano_grads = theano.function(th_inputs, T.grad(loss, wrt=th_x))
                th_dL_dx = get_theano_grads(*th_args)

                # quagga model
                x_gpu = Connector(Matrix.from_npa(x), device_id)
                true_labels_gpu = Connector(Matrix.from_npa(true_labels))
                if with_mask:
                    mask_gpu = Connector(Matrix.from_npa(mask))
                else:
                    mask_gpu = None
                sigmoid_ce_block = SigmoidCeBlock(x_gpu, true_labels_gpu, mask_gpu)
                x_gpu.fprop()
                true_labels_gpu.fprop()
                if with_mask:
                    mask_gpu.fprop()
                sigmoid_ce_block.fprop()
                sigmoid_ce_block.bprop()
                q_dL_dx = x_gpu.backward_matrix.to_host()

                checks.append(np.allclose(th_dL_dx, q_dL_dx))

        self.assertEqual(sum(checks), len(checks))
예제 #17
0
    def test_bprop(self):
        """
        Check that DotBlock backpropagation gives matching gradients for x,
        W and the optional bias on the gpu and cpu backends.

        The gradient w.r.t. the block output is a random matrix. The
        generator state is restored before each backend so that both
        backends are fed the same matrix.
        """
        checks = []
        for _ in xrange(self.N):
            batch_size, x_dim, output_dim = self.rng.random_integers(2000, size=3)
            x = self.rng.rand(batch_size, x_dim).astype(np.float32)
            W = self.get_orthogonal_matrix(x_dim, output_dim)
            # the bias is present only in randomly chosen iterations
            if self.rng.randint(2):
                b = self.rng.rand(1, output_dim).astype(np.float32)
            else:
                b = None
            device_id = 0

            grads = {}
            rng_state = self.rng.get_state()
            for processor_type in ['gpu', 'cpu']:
                self.rng.set_state(rng_state)
                quagga.processor_type = processor_type
                context = Context()
                qx = Connector(Matrix.from_npa(x), device_id)
                qW = Connector(Matrix.from_npa(W), device_id)
                qb = None if b is None else Connector(Matrix.from_npa(b), device_id)
                dot_block = DotBlock(qW, qb, qx)
                qx.fprop()
                qW.fprop()
                if qb:
                    qb.fprop()
                dot_block.fprop()

                # inject a random gradient for the block output
                _, dL_doutput = dot_block.output.register_usage(device_id, device_id)
                random_matrix = self.rng.rand(dL_doutput.nrows, dL_doutput.ncols)
                dL_doutput.assign(context, Matrix.from_npa(random_matrix, 'float'))
                dot_block.bprop()

                backend_grads = {}
                if b is not None:
                    backend_grads['b'] = qb.backward_matrix.to_host()
                backend_grads['W'] = qW.backward_matrix.to_host()
                backend_grads['x'] = qx.backward_matrix.to_host()
                grads[processor_type] = backend_grads

            checks.append(np.allclose(grads['gpu']['x'], grads['cpu']['x'], atol=1e-5))
            checks.append(np.allclose(grads['gpu']['W'], grads['cpu']['W'], atol=1e-5))
            if b is not None:
                checks.append(np.allclose(grads['gpu']['b'], grads['cpu']['b'], atol=1e-5))

        self.assertEqual(sum(checks), len(checks))
예제 #18
0
    def test_theano_fprop(self):
        """
        Compare the gpu forward pass of a SequencerBlock of DotBlocks with
        the output of the Theano SequentialDotLayer, with and without bias
        and for both sequence directions.
        """
        quagga.processor_type = 'gpu'
        verdicts = []
        # same order as nested loops over `reverse`, then `with_bias`
        configurations = [(reverse, with_bias)
                          for reverse in [False, True]
                          for with_bias in [False, True]]
        for n in xrange(self.N):
            max_input_sequence_len = self.rng.random_integers(500)
            # the first iteration always uses the full sequence length
            if n == 0:
                sequence_len = max_input_sequence_len
            else:
                sequence_len = self.rng.random_integers(max_input_sequence_len)
            batch_size = self.rng.random_integers(256)
            input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
            x = []
            for _ in xrange(max_input_sequence_len):
                x.append(self.rng.randn(batch_size, input_dim).astype(np.float32))
            W = self.get_orthogonal_matrix(input_dim, hidden_dim)
            b = self.rng.rand(1, hidden_dim).astype(np.float32)

            for reverse, with_bias in configurations:
                # quagga model
                qx = List([Connector(Matrix.from_npa(e)) for e in x])
                qW = Connector(Matrix.from_npa(W))
                if with_bias:
                    qb = Connector(Matrix.from_npa(b))
                else:
                    qb = None
                seq_dot_block = SequencerBlock(block_class=DotBlock,
                                               params=[qW, qb],
                                               sequences=[qx],
                                               output_names=['output'],
                                               reverse=reverse)
                qx.length = sequence_len
                qx.fprop()
                qW.fprop()
                if qb:
                    qb.fprop()
                seq_dot_block.fprop()
                qoutput = seq_dot_block.output.to_host()

                # Theano model, fed the active part of the sequence stacked
                # along the third axis
                th_bias = b if with_bias else None
                seq_dot_layer = SequentialDotLayer(W, th_bias, reverse)
                th_x = T.ftensor3()
                get_th_output = theano.function([th_x], seq_dot_layer.get_output_expr(th_x))
                th_output = get_th_output(np.dstack(x[:sequence_len]))

                # one verdict per configuration: every output pair is close
                verdicts.append(all(np.allclose(qoutput[t], th_output[t])
                                    for t in xrange(th_output.shape[0])))

        self.assertEqual(sum(verdicts), len(verdicts))
예제 #19
0
    def test_theano_grad(self):
        """
        compare the gradient of the softmax cross-entropy loss w.r.t. `x`
        computed by theano with the one left in `x_gpu.backward_matrix`
        after `SoftmaxCeBlock.bprop` on the gpu backend
        """
        quagga.processor_type = 'gpu'
        r = []
        for _ in xrange(self.N):
            for sparse in [True, False]:
                batch_size, dim = self.rng.random_integers(2000, size=2)
                if sparse:
                    # float matrix holding exactly one 1.0 per row
                    true_labels = np.zeros((batch_size, dim), np.float32)
                    hot_columns = self.rng.randint(dim, size=batch_size)
                    true_labels[np.arange(batch_size), hot_columns] = 1.0
                else:
                    # int32 column of class indices
                    true_labels = self.rng.randint(dim, size=(batch_size, 1)).astype(np.int32)
                x = self.rng.randn(batch_size, dim).astype(np.float32)
                mask = (self.rng.rand(batch_size, 1) < 0.8).astype(np.float32)
                device_id = 0
                # theano takes the index labels as a vector, not as a column
                th_labels_value = true_labels if sparse else true_labels[:, 0]

                for with_mask in [False, True]:
                    # Theano model
                    th_x = T.fmatrix()
                    th_mask = T.fcol()
                    th_true_labels = T.fmatrix() if sparse else T.ivector()
                    if with_mask:
                        probs = T.nnet.softmax(th_mask * th_x)
                    else:
                        probs = T.nnet.softmax(th_x)
                    loss = T.mean(T.nnet.categorical_crossentropy(probs, th_true_labels))
                    th_inputs = [th_x, th_true_labels]
                    th_values = [x, th_labels_value]
                    if with_mask:
                        th_inputs.append(th_mask)
                        th_values.append(mask)
                    get_theano_grads = theano.function(th_inputs, T.grad(loss, wrt=th_x))
                    th_dL_dx = get_theano_grads(*th_values)

                    # quagga model
                    x_gpu = Connector(Matrix.from_npa(x), device_id)
                    true_labels_gpu = Connector(Matrix.from_npa(true_labels))
                    mask_gpu = Connector(Matrix.from_npa(mask)) if with_mask else None
                    softmax_ce_block = SoftmaxCeBlock(x_gpu, true_labels_gpu, mask_gpu)
                    x_gpu.fprop()
                    true_labels_gpu.fprop()
                    if mask_gpu is not None:
                        mask_gpu.fprop()
                    softmax_ce_block.fprop()
                    softmax_ce_block.bprop()
                    q_dL_dx = x_gpu.backward_matrix.to_host()

                    r.append(np.allclose(th_dL_dx, q_dL_dx))

        # every single comparison has to succeed
        self.assertEqual(sum(r), len(r))
예제 #20
0
    def test_bprop(self):
        """
        compare `bprop` results of a `HorizontalStackBlock` feeding a
        `SoftmaxCeBlock` for cpu and gpu backends
        """
        r = []
        for trial in xrange(self.N):
            nrows = self.rng.random_integers(1, 3000)
            matrices, device_ids, col_slices = [], [], []
            total_ncols = 0
            for _ in xrange(self.rng.random_integers(1, 10)):
                width = self.rng.random_integers(1, 2000)
                first_col = total_ncols
                total_ncols = total_ncols + width
                if self.rng.choice([True, False]):
                    # randomly chosen inputs are created with a device id
                    device_ids.append(0)
                    col_slices.append((first_col, total_ncols))
                else:
                    device_ids.append(None)
                matrices.append(self.rng.rand(nrows, width).astype(np.float32))
            true_labels = self.rng.randint(total_ncols, size=(nrows, 1)).astype(np.int32)
            if not col_slices:
                # every input was created without a device id: the trial is
                # counted as passed without running the blocks
                r.append(True)
                continue

            output = {}
            for processor_type in ['gpu', 'cpu']:
                quagga.processor_type = processor_type
                qmatrices = [Connector(Matrix.from_npa(matrix), d_id)
                             for matrix, d_id in izip(matrices, device_ids)]
                qtrue_labels = Connector(Matrix.from_npa(true_labels))
                hstack_block = HorizontalStackBlock(*qmatrices)
                sce_block = SoftmaxCeBlock(hstack_block.output, qtrue_labels)

                for qmatrix in qmatrices:
                    qmatrix.fprop()
                qtrue_labels.fprop()
                hstack_block.fprop()
                sce_block.fprop()
                sce_block.bprop()
                hstack_block.bprop()

                # only the bpropagable inputs carry a gradient to fetch
                output[processor_type] = [qmatrix.backward_matrix.to_host()
                                          for qmatrix in qmatrices
                                          if qmatrix.bpropagable]

            r.append(all(np.allclose(dL_dm_gpu, dL_dm_cpu)
                         for dL_dm_gpu, dL_dm_cpu in izip(output['gpu'], output['cpu'])))
        self.assertEqual(sum(r), self.N)
예제 #21
0
    def test_fprop(self):
        """
        compare `fprop` results of a `DotBlock` driven by `SequencerBlock`
        for cpu and gpu backends, for both directions and with/without bias
        """

        r = []
        for i in xrange(self.N):
            max_input_sequence_len = self.rng.random_integers(500)
            # the first trial always runs over the full sequence
            if i == 0:
                sequence_len = max_input_sequence_len
            else:
                sequence_len = self.rng.random_integers(max_input_sequence_len)
            batch_size = self.rng.random_integers(256)
            input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
            x = [self.rng.randn(batch_size, input_dim).astype(np.float32)
                 for _ in xrange(max_input_sequence_len)]
            W = self.get_orthogonal_matrix(input_dim, hidden_dim)
            b = self.rng.rand(1, hidden_dim).astype(np.float32)

            from quagga.cuda import cudart
            # NOTE(review): device 1 is hard-coded here -- confirm this is intended
            cudart.cuda_set_device(1)

            for reverse in [False, True]:
                for with_bias in [False, True]:
                    qoutput = {}
                    for processor_type in ['gpu', 'cpu']:
                        quagga.processor_type = processor_type
                        qx = List([Connector(Matrix.from_npa(e)) for e in x])
                        qW = Connector(Matrix.from_npa(W))
                        qb = Connector(Matrix.from_npa(b)) if with_bias else None
                        seq_dot_block = SequencerBlock(block_class=DotBlock,
                                                       params=[qW, qb],
                                                       sequences=[qx],
                                                       output_names=['output'],
                                                       reverse=reverse)
                        qx.length = sequence_len
                        qx.fprop()
                        qW.fprop()
                        if qb:
                            qb.fprop()
                        seq_dot_block.fprop()
                        qoutput[processor_type] = seq_dot_block.output.to_host()

                    # one verdict per (reverse, with_bias) combination
                    r.append(all(np.allclose(output_gpu, output_cpu, atol=1e-5)
                                 for output_gpu, output_cpu in izip(qoutput['gpu'], qoutput['cpu'])))

        self.assertEqual(sum(r), len(r))
예제 #22
0
    def test_theano_grad(self):
        """
        compare the gradient of the sigmoid cross-entropy loss w.r.t. `x`
        computed by theano with the one left in `x_gpu.backward_matrix`
        after `SigmoidCeBlock.bprop` on the gpu backend
        """
        quagga.processor_type = 'gpu'
        r = []
        for _ in xrange(self.N):
            batch_size, dim = self.rng.random_integers(2000, size=2)
            true_labels = self.rng.randint(2, size=(batch_size, dim)).astype(dtype=np.float32)
            mask = (self.rng.rand(batch_size, 1) < 0.8).astype(np.float32)
            x = self.rng.randn(batch_size, dim).astype(dtype=np.float32)
            device_id = 0

            for with_mask in [False, True]:
                # Theano model
                th_x = T.fmatrix()
                th_mask = T.fmatrix()
                th_true_labels = T.fmatrix()
                if with_mask:
                    # mark the second axis of the mask as broadcastable
                    probs = T.nnet.sigmoid(theano.compile.ops.Rebroadcast((0, False), (1, True))(th_mask) * th_x)
                else:
                    probs = T.nnet.sigmoid(th_x)
                loss = (- th_true_labels * T.log(probs) -
                        (1.0 - th_true_labels) * T.log(1.0 - probs))
                loss = T.sum(loss, axis=1).mean()

                th_inputs = [th_x, th_true_labels]
                th_values = [x, true_labels]
                if with_mask:
                    th_inputs.append(th_mask)
                    th_values.append(mask)
                get_theano_grads = theano.function(th_inputs, T.grad(loss, wrt=th_x))
                th_dL_dx = get_theano_grads(*th_values)

                # quagga model
                x_gpu = Connector(Matrix.from_npa(x), device_id)
                true_labels_gpu = Connector(Matrix.from_npa(true_labels))
                mask_gpu = Connector(Matrix.from_npa(mask)) if with_mask else None
                sigmoid_ce_block = SigmoidCeBlock(x_gpu, true_labels_gpu, mask_gpu)
                x_gpu.fprop()
                true_labels_gpu.fprop()
                if mask_gpu is not None:
                    mask_gpu.fprop()
                sigmoid_ce_block.fprop()
                sigmoid_ce_block.bprop()
                q_dL_dx = x_gpu.backward_matrix.to_host()

                r.append(np.allclose(th_dL_dx, q_dL_dx))

        # every single comparison has to succeed
        self.assertEqual(sum(r), len(r))
예제 #23
0
    def test_bprop(self):
        """
        compare `bprop` results of `NonlinearityBlock` for cpu and gpu
        backends, feeding both with the same random output gradient
        """
        r = []
        for _ in xrange(self.N):
            batch_size, x_dim = self.rng.random_integers(3000, size=2)
            x = self.rng.rand(batch_size, x_dim).astype(np.float32)
            device_id = 0

            for nonlinearity in ['sigmoid', 'tanh', 'relu']:
                state = self.rng.get_state()
                dL_dx = {}
                for processor_type in ['gpu', 'cpu']:
                    # rewind the generator so that both backends draw the
                    # same random output gradient
                    self.rng.set_state(state)
                    quagga.processor_type = processor_type

                    qx = Connector(Matrix.from_npa(x), device_id)
                    nonlinearity_block = NonlinearityBlock(qx, nonlinearity)
                    qx.fprop()
                    nonlinearity_block.fprop()
                    _, dL_doutput = nonlinearity_block.output.register_usage(
                        device_id, device_id)
                    random_matrix = self.rng.rand(dL_doutput.nrows,
                                                  dL_doutput.ncols)
                    dL_doutput.assign(Context(),
                                      Matrix.from_npa(random_matrix, 'float'))
                    nonlinearity_block.bprop()
                    dL_dx[processor_type] = qx.backward_matrix.to_host()

                r.append(np.allclose(dL_dx['gpu'], dL_dx['cpu']))

        self.assertEqual(sum(r), len(r))
    def test_fprop(self):
        """
        compare `fprop` results of `SequentialHorizontalStackBlock` for cpu
        and gpu backends

        For every trial two lists of random matrices are stacked on both
        backends with the same sequence length and the resulting output
        sequences are compared element by element.
        """
        r = []
        for i in xrange(self.N):
            max_input_sequence_len = self.rng.random_integers(500)
            # the first trial always runs over the full sequence
            sequence_len = max_input_sequence_len if i == 0 else self.rng.random_integers(max_input_sequence_len)
            batch_size = self.rng.random_integers(512)
            dim_x, dim_y = self.rng.random_integers(1500, size=2)
            x = [self.rng.rand(batch_size, dim_x).astype(dtype=np.float32) for _ in xrange(max_input_sequence_len)]
            y = [self.rng.rand(batch_size, dim_y).astype(dtype=np.float32) for _ in xrange(max_input_sequence_len)]

            state = self.rng.get_state()
            output_sequence = {}
            for processor_type in ['gpu', 'cpu']:
                # both backends start from the same generator state
                self.rng.set_state(state)
                quagga.processor_type = processor_type
                qx = List([Connector(Matrix.from_npa(e)) for e in x])
                qy = List([Connector(Matrix.from_npa(e)) for e in y])
                seq_hstack_block = SequentialHorizontalStackBlock(qx, qy)
                qx.length = sequence_len
                qy.length = sequence_len
                seq_hstack_block.fprop()
                output_sequence[processor_type] = seq_hstack_block.output.to_host()

            # one verdict per trial: True only if every stacked matrix matches
            r.append(all(np.allclose(out_gpu, out_cpu)
                         for out_gpu, out_cpu in izip(output_sequence['gpu'], output_sequence['cpu'])))

        self.assertEqual(sum(r), self.N)
예제 #25
0
    def test_theano_bprop_matrix(self):
        """
        compare the embedding matrix after adding its gradient to it:
        `RowSlicingBlock` + sequenced `SoftmaxCeBlock` on the gpu backend
        against the theano `RowSlicingLayer` + `SequentialSoftmaxLayer` loss
        """
        r = []
        for i in xrange(self.N):
            max_input_sequence_len = self.rng.random_integers(300)
            # the first trial always runs over the full sequence
            if i == 0:
                sequence_len = max_input_sequence_len
            else:
                sequence_len = self.rng.random_integers(2, max_input_sequence_len)
            embd_dim = self.rng.random_integers(10000)
            batch_size = self.rng.random_integers(500)
            output_dim = self.rng.random_integers(2000)
            W = self.get_orthogonal_matrix(embd_dim, output_dim)
            row_idxs = self.rng.randint(embd_dim, size=(batch_size, max_input_sequence_len)).astype(np.int32)
            true_labels = [self.rng.randint(output_dim, size=(batch_size, 1)).astype(np.int32)
                           for _ in xrange(max_input_sequence_len)]
            device_id = 0

            # quagga model
            quagga.processor_type = 'gpu'
            qrow_idxs = Connector(Matrix.from_npa(row_idxs))
            qtrue_labels = List([Connector(Matrix.from_npa(e)) for e in true_labels],
                                qrow_idxs.ncols)
            qW = Connector(Matrix.from_npa(W), device_id)
            row_slicing_block = RowSlicingBlock(qW, qrow_idxs)
            seq_sce_block = SequencerBlock(
                block_class=SoftmaxCeBlock,
                params=[],
                sequences=[row_slicing_block.output, qtrue_labels])
            qW.fprop()
            # the sequence length is set only after the blocks have been built
            qrow_idxs.ncols = sequence_len
            qrow_idxs.fprop()
            row_slicing_block.fprop()
            seq_sce_block.fprop()
            seq_sce_block.bprop()
            row_slicing_block.bprop()
            qW.add(Context(), qW.backward_matrix)

            # Theano model: the same update is expressed as `W <- W + dL/dW`
            th_row_idxs = T.imatrix()
            th_true_labels = T.imatrix()
            row_slicing_layer = RowSlicingLayer(W)
            toutput = row_slicing_layer.get_output_expr(th_row_idxs)
            loss = SequentialSoftmaxLayer.get_loss(toutput, th_true_labels)
            dL_dW = T.grad(loss, row_slicing_layer.W)
            update_W = theano.function(
                [th_row_idxs, th_true_labels],
                updates=[(row_slicing_layer.W, row_slicing_layer.W + dL_dW)])
            update_W(row_idxs, np.hstack(true_labels[:sequence_len]))

            r.append(np.allclose(qW.to_host(), row_slicing_layer.W.get_value(), atol=1e-5))

        self.assertEqual(sum(r), len(r))
예제 #26
0
    def test_theano_fprop(self):
        """
        compare `fprop` results of a `DotBlock` driven by `SequencerBlock`
        on the gpu backend with the output of the theano `SequentialDotLayer`
        """
        quagga.processor_type = 'gpu'
        r = []
        for i in xrange(self.N):
            max_input_sequence_len = self.rng.random_integers(500)
            # the first trial always runs over the full sequence
            if i == 0:
                sequence_len = max_input_sequence_len
            else:
                sequence_len = self.rng.random_integers(max_input_sequence_len)
            batch_size = self.rng.random_integers(256)
            input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
            x = [self.rng.randn(batch_size, input_dim).astype(np.float32)
                 for _ in xrange(max_input_sequence_len)]
            W = self.get_orthogonal_matrix(input_dim, hidden_dim)
            b = self.rng.rand(1, hidden_dim).astype(np.float32)

            for reverse in [False, True]:
                for with_bias in [False, True]:
                    # quagga model
                    qx = List([Connector(Matrix.from_npa(e)) for e in x])
                    qW = Connector(Matrix.from_npa(W))
                    qb = Connector(Matrix.from_npa(b)) if with_bias else None
                    seq_dot_block = SequencerBlock(block_class=DotBlock,
                                                   params=[qW, qb],
                                                   sequences=[qx],
                                                   output_names=['output'],
                                                   reverse=reverse)
                    qx.length = sequence_len
                    qx.fprop()
                    qW.fprop()
                    if qb:
                        qb.fprop()
                    seq_dot_block.fprop()
                    qoutput = seq_dot_block.output.to_host()

                    # Theano model, fed with the first `sequence_len` inputs
                    # stacked along a third axis
                    seq_dot_layer = SequentialDotLayer(W, b if with_bias else None, reverse)
                    th_x = T.ftensor3()
                    get_th_output = theano.function([th_x], seq_dot_layer.get_output_expr(th_x))
                    th_output = get_th_output(np.dstack(x[:sequence_len]))

                    # one verdict per (reverse, with_bias) combination
                    r.append(all(np.allclose(qoutput[step], th_output[step])
                                 for step in xrange(th_output.shape[0])))

        self.assertEqual(sum(r), len(r))
예제 #27
0
 def __init__(self, **kwargs):
     """
     build one `Connector` per parameter definition

     Each keyword argument maps a parameter name to a definition that is
     read through the keys 'device_id', 'init' (a callable returning the
     initial value) and, optionally, 'trainable' (treated as true when
     absent). Every parameter is stored in `self.parameters`; trainable
     ones are additionally stored in `self.trainable_parameters`.
     """
     self.parameters = {}
     self.trainable_parameters = {}
     for name, definition in kwargs.iteritems():
         device_id = definition['device_id']
         matrix = Matrix.from_npa(definition['init'](), device_id=device_id)
         is_trainable = definition['trainable'] if 'trainable' in definition else True
         if not is_trainable:
             # non-trainable parameters get a connector without a device id
             self.parameters[name] = Connector(matrix)
             continue
         param = Connector(matrix, device_id)
         self.trainable_parameters[name] = param
         self.parameters[name] = param
예제 #28
0
    def test_bprop(self):
        """
        compare `bprop` results of `NonlinearityBlock` for cpu and gpu
        backends, feeding both with the same random output gradient
        """
        device_id = 0

        def input_gradient(processor_type, x, nonlinearity):
            # run fprop/bprop on the given backend and return dL/dx on the host
            quagga.processor_type = processor_type
            qx = Connector(Matrix.from_npa(x), device_id)
            nonlinearity_block = NonlinearityBlock(qx, nonlinearity)
            qx.fprop()
            nonlinearity_block.fprop()
            _, dL_doutput = nonlinearity_block.output.register_usage(device_id, device_id)
            random_matrix = self.rng.rand(dL_doutput.nrows, dL_doutput.ncols)
            dL_doutput.assign(Context(), Matrix.from_npa(random_matrix, 'float'))
            nonlinearity_block.bprop()
            return qx.backward_matrix.to_host()

        r = []
        for _ in xrange(self.N):
            batch_size, x_dim = self.rng.random_integers(3000, size=2)
            x = self.rng.rand(batch_size, x_dim).astype(np.float32)

            for nonlinearity in ['sigmoid', 'tanh', 'relu']:
                state = self.rng.get_state()
                dL_dx_gpu = input_gradient('gpu', x, nonlinearity)
                # rewind the generator so that the cpu run draws the same
                # random output gradient as the gpu run
                self.rng.set_state(state)
                dL_dx_cpu = input_gradient('cpu', x, nonlinearity)

                r.append(np.allclose(dL_dx_gpu, dL_dx_cpu))

        self.assertEqual(sum(r), len(r))
예제 #29
0
    def __init__(self, ptb_train, ptb_valid, batch_size, sentence_max_len, device_id):
        """
        set up the offset generators, the flattened train/valid sentence
        matrices and the per-batch buffers (sentence batch, sentence lengths
        and one mask connector per sentence position)
        """
        self.blocking_contexts = None
        self.context = Context(device_id)
        # from here on use the device id reported by the context
        device_id = self.context.device_id
        self.train_offsets = HomogeneousDataGenerator(
            ptb_train, batch_size, sentence_max_len, randomize=True, infinite=True)
        self.valid_offsets = HomogeneousDataGenerator(
            ptb_valid, batch_size, sentence_max_len)

        # the flattened sentences are wrapped into one extra outer dimension
        flat_train = np.array([self.train_offsets.flatten_sentences])
        flat_valid = np.array([self.valid_offsets.flatten_sentences])
        self.train_sents = Matrix.from_npa(flat_train, 'int', device_id)
        self.valid_sents = Matrix.from_npa(flat_valid, 'int', device_id)
        lengths_buffer = np.empty((batch_size, 1), dtype=np.int32, order='F')
        self._sent_lengths = lengths_buffer[...]
        self.sent_lengths = Matrix.from_npa(self._sent_lengths, device_id=device_id)

        sentence_batch = Matrix.empty(batch_size, sentence_max_len, 'int', device_id)
        self.sentence_batch = Connector(sentence_batch, self.context)
        self.sentence_batch.sync_fill(0)

        self._mask = Matrix.empty(sentence_batch.nrows, self.sentence_batch.ncols, 'float', device_id)
        # one connector per column of the mask matrix
        mask_columns = [Connector(self._mask[:, i]) for i in xrange(sentence_max_len)]
        self.mask = List(mask_columns, self.sentence_batch.ncols)
        self.train_offsets_iterator = iter(self.train_offsets)
        self.valid_offsets_iterator = iter(self.valid_offsets)
        self.training_mode = True
    def test_bprop(self):
        """
        compare `bprop` results of `SequentialMeanPoolingBlock` for cpu and
        gpu backends, feeding both with the same random output gradient
        """
        r = []
        for i in xrange(self.N):
            max_input_sequence_len = self.rng.random_integers(500)
            # the first trial always runs over the full sequence
            if i == 0:
                sequence_len = max_input_sequence_len
            else:
                sequence_len = self.rng.random_integers(max_input_sequence_len)
            batch_size = self.rng.random_integers(512)
            dim = self.rng.random_integers(1500)
            x = [self.rng.rand(batch_size, dim).astype(dtype=np.float32)
                 for _ in xrange(max_input_sequence_len)]

            state = self.rng.get_state()
            dL_dmatrices = {}
            for processor_type in ['gpu', 'cpu']:
                # rewind the generator so that both backends draw the same
                # random output gradient
                self.rng.set_state(state)
                quagga.processor_type = processor_type
                context = Context()
                qx = List([Connector(Matrix.from_npa(e), context, context) for e in x])
                smean_pooling_block = SequentialMeanPoolingBlock(qx)
                qx.set_length(sequence_len)
                _, dL_doutput = smean_pooling_block.output.register_usage(context, context)
                smean_pooling_block.fprop()
                random_matrix = self.rng.rand(dL_doutput.nrows, dL_doutput.ncols)
                Matrix.from_npa(random_matrix, 'float').copy_to(context, dL_doutput)
                smean_pooling_block.bprop()
                dL_dmatrices[processor_type] = [e.backward_matrix.to_host() for e in qx]

            # one verdict per trial: True only if every input gradient matches
            r.append(all(np.allclose(dL_dmatrix_gpu, dL_dmatrix_cpu)
                         for dL_dmatrix_gpu, dL_dmatrix_cpu
                         in izip(dL_dmatrices['gpu'], dL_dmatrices['cpu'])))

        self.assertEqual(sum(r), self.N)
예제 #31
0
    def test_theano_bprop_vector(self):
        """
        compare the embedding matrix after adding its gradient to it:
        `RowSlicingBlock` + `SoftmaxCeBlock` on the gpu backend against the
        theano `RowSlicingLayer` + `SoftmaxLayer` loss
        """
        r = []
        for _ in xrange(self.N):
            embd_dim = self.rng.random_integers(10000)
            batch_size, output_dim = self.rng.random_integers(2000, size=2)
            W = self.get_orthogonal_matrix(embd_dim, output_dim)
            row_idxs = self.rng.randint(embd_dim, size=(batch_size, 1)).astype(np.int32)
            true_labels = self.rng.randint(output_dim, size=(batch_size, 1)).astype(np.int32)
            device_id = 0

            # quagga model
            quagga.processor_type = 'gpu'
            qrow_idxs = Connector(Matrix.from_npa(row_idxs))
            qW = Connector(Matrix.from_npa(W), device_id)
            qtrue_labels = Connector(Matrix.from_npa(true_labels))
            row_slicing_block = RowSlicingBlock(qW, qrow_idxs)
            sce_block = SoftmaxCeBlock(row_slicing_block.output, qtrue_labels)
            for connector in (qtrue_labels, qW, qrow_idxs):
                connector.fprop()
            row_slicing_block.fprop()
            sce_block.fprop()
            sce_block.bprop()
            row_slicing_block.bprop()
            qW.add(Context(), qW.backward_matrix)

            # Theano model: the same update is expressed as `W <- W + dL/dW`
            th_row_idxs = T.ivector()
            th_true_labels = T.ivector()
            row_slicing_layer = RowSlicingLayer(W)
            toutput = row_slicing_layer.get_output_expr(th_row_idxs)
            loss = SoftmaxLayer.get_loss(toutput, th_true_labels)
            dL_dW = T.grad(loss, row_slicing_layer.W)
            update_W = theano.function(
                [th_row_idxs, th_true_labels],
                updates=[(row_slicing_layer.W, row_slicing_layer.W + dL_dW)])
            # theano takes indices and labels as vectors, not as columns
            update_W(row_idxs[:, 0], true_labels[:, 0])
            r.append(np.allclose(qW.to_host(), row_slicing_layer.W.get_value()))

        self.assertEqual(sum(r), len(r))
예제 #32
0
    def __init__(self, ptb_train, ptb_valid, batch_size, sentence_max_len,
                 device_id):
        """
        create the train/valid offset generators, load the flattened
        sentences into matrices and allocate the buffers that hold one
        batch: the sentences, their lengths and the per-position masks
        """
        self.blocking_contexts = None
        self.context = Context(device_id)
        # the context decides which device id is actually used below
        device_id = self.context.device_id
        self.train_offsets = HomogeneousDataGenerator(
            ptb_train, batch_size, sentence_max_len,
            randomize=True, infinite=True)
        self.valid_offsets = HomogeneousDataGenerator(
            ptb_valid, batch_size, sentence_max_len)

        train_row = np.array([self.train_offsets.flatten_sentences])
        valid_row = np.array([self.valid_offsets.flatten_sentences])
        self.train_sents = Matrix.from_npa(train_row, 'int', device_id)
        self.valid_sents = Matrix.from_npa(valid_row, 'int', device_id)
        # column-major int32 column, kept as a view of the fresh array
        self._sent_lengths = np.empty(
            (batch_size, 1), dtype=np.int32, order='F')[...]
        self.sent_lengths = Matrix.from_npa(
            self._sent_lengths, device_id=device_id)

        sentence_batch = Matrix.empty(
            batch_size, sentence_max_len, 'int', device_id)
        self.sentence_batch = Connector(sentence_batch, self.context)
        self.sentence_batch.sync_fill(0)

        self._mask = Matrix.empty(
            sentence_batch.nrows, self.sentence_batch.ncols, 'float',
            device_id)
        # each mask column gets its own connector
        per_position_masks = [Connector(self._mask[:, i])
                              for i in xrange(sentence_max_len)]
        self.mask = List(per_position_masks, self.sentence_batch.ncols)
        self.train_offsets_iterator = iter(self.train_offsets)
        self.valid_offsets_iterator = iter(self.valid_offsets)
        self.training_mode = True
예제 #33
0
    def test_theano_bprop(self):
        """
        compare the gradient w.r.t. `x` of `RepeatBlock` followed by
        `SoftmaxCeBlock` on the gpu backend with the gradient theano
        computes for `T.tile` followed by softmax cross-entropy
        """
        r = []
        for _ in xrange(self.N):
            repeats = self.rng.random_integers(42)
            axis = self.rng.randint(2)
            input_dim, output_dim = self.rng.random_integers(2000, size=2)
            x = self.get_normal_matrix(input_dim, output_dim)
            # repeating along axis 0 multiplies the number of rows to label
            if not axis:
                input_dim = input_dim * repeats
            true_labels = self.rng.randint(output_dim, size=(input_dim, 1)).astype(np.int32)
            device_id = 0

            # quagga model
            quagga.processor_type = 'gpu'
            qx = Connector(Matrix.from_npa(x), device_id)
            qtrue_labels = Connector(Matrix.from_npa(true_labels))
            repeat_block = RepeatBlock(qx, repeats, axis)
            sce_block = SoftmaxCeBlock(repeat_block.output, qtrue_labels)
            qx.fprop()
            qtrue_labels.fprop()
            repeat_block.fprop()
            sce_block.fprop()
            sce_block.bprop()
            repeat_block.bprop()
            q_dL_dx = qx.backward_matrix.to_host()

            # Theano model
            th_x = T.fmatrix()
            th_true_labels = T.ivector()
            reps = [1, repeats] if axis else [repeats, 1]
            th_probs = T.nnet.softmax(T.tile(th_x, reps))
            loss = T.mean(T.nnet.categorical_crossentropy(th_probs, th_true_labels))
            get_grads = theano.function([th_x, th_true_labels], T.grad(loss, th_x))
            th_dL_dx = get_grads(x, true_labels[:, 0])

            r.append(np.allclose(q_dL_dx, th_dL_dx))

        self.assertEqual(sum(r), len(r))
예제 #34
0
    def __init__(self, train_x, train_y, valid_x, valid_y, batch_size, device_id):
        """
        load the train/valid data into matrices and allocate the batch buffers

        The inputs are stored transposed and cast to float32, the targets as
        'int' columns. `self.x` / `self.y` wrap the buffers for one batch,
        and the train indices are shuffled once with a generator seeded
        with 42.
        """
        self.context = Context(device_id)
        # from here on use the device id reported by the context
        device_id = self.context.device_id
        self.train_x = Matrix.from_npa(train_x.T.astype(np.float32), device_id=device_id)
        self.valid_x = Matrix.from_npa(valid_x.T.astype(np.float32), device_id=device_id)
        self.train_y = Matrix.from_npa(train_y[:, np.newaxis], 'int', device_id=device_id)
        self.valid_y = Matrix.from_npa(valid_y[:, np.newaxis], 'int', device_id=device_id)
        self.batch_size = batch_size

        # buffers for one batch of inputs and targets
        x_batch = Matrix.empty(self.batch_size, self.train_x.nrows, device_id=device_id)
        y_batch = Matrix.empty(self.batch_size, 1, 'int', device_id)
        self.x = Connector(x_batch)
        self.y = Connector(y_batch)

        n_train = int(self.train_x.ncols)
        n_valid = int(self.valid_x.ncols)
        self.train_indices = np.arange(n_train, dtype=np.int32)
        self.valid_indices = np.arange(n_valid, dtype=np.int32)
        self.indices = Matrix.empty(self.batch_size, 1, 'int', device_id)
        self.rng = np.random.RandomState(42)
        self.rng.shuffle(self.train_indices)
        self.train_i = 0
        self.valid_i = 0
        self.training_mode = True

        self.blocking_contexts = None
예제 #35
0
    def test_bprop_matrix(self):
        """
        compare the embedding matrix after adding its gradient to it
        (`RowSlicingBlock` + sequenced `SoftmaxCeBlock`) for cpu and gpu
        backends
        """
        r = []
        for i in xrange(self.N):
            max_input_sequence_len = self.rng.random_integers(500)
            # the first trial always runs over the full sequence
            if i == 0:
                sequence_len = max_input_sequence_len
            else:
                sequence_len = self.rng.random_integers(max_input_sequence_len)
            embd_dim = self.rng.random_integers(10000)
            batch_size = self.rng.random_integers(500)
            output_dim = self.rng.random_integers(2000)
            W = self.get_orthogonal_matrix(embd_dim, output_dim)
            row_idxs = self.rng.randint(embd_dim, size=(batch_size, max_input_sequence_len)).astype(np.int32)
            true_labels = [self.rng.randint(output_dim, size=(batch_size, 1)).astype(np.int32)
                           for _ in xrange(max_input_sequence_len)]
            device_id = 0

            updated_W = {}
            for processor_type in ['gpu', 'cpu']:
                quagga.processor_type = processor_type
                qrow_idxs = Connector(Matrix.from_npa(row_idxs))
                qtrue_labels = List([Connector(Matrix.from_npa(e)) for e in true_labels],
                                    qrow_idxs.ncols)
                qW = Connector(Matrix.from_npa(W), device_id)
                row_slicing_block = RowSlicingBlock(qW, qrow_idxs)
                seq_sce_block = SequencerBlock(
                    block_class=SoftmaxCeBlock,
                    params=[],
                    sequences=[row_slicing_block.output, qtrue_labels])
                qW.fprop()
                # the sequence length is set only after the blocks have been built
                qrow_idxs.ncols = sequence_len
                qrow_idxs.fprop()
                row_slicing_block.fprop()
                seq_sce_block.fprop()
                seq_sce_block.bprop()
                row_slicing_block.bprop()
                qW.add(Context(), qW.backward_matrix)
                updated_W[processor_type] = qW.to_host()

            r.append(np.allclose(updated_W['gpu'], updated_W['cpu']))

        self.assertEqual(sum(r), len(r))
예제 #36
0
    def test_numpy_fprop(self):
        """
        compare gpu `fprop` output of `HorizontalStackBlock` with `np.hstack`
        """
        quagga.processor_type = 'gpu'
        checks = []
        for _ in xrange(self.N):
            # all matrices share the row count; each gets its own column count
            nrows = self.rng.random_integers(1, 5000)
            n_matrices = self.rng.random_integers(1, 10)
            arrays = [self.rng.rand(nrows, self.rng.random_integers(1, 5000)).astype(np.float32)
                      for _ in xrange(n_matrices)]

            expected = np.hstack(arrays)
            connectors = [Connector(Matrix.from_npa(a)) for a in arrays]
            stack_block = HorizontalStackBlock(*connectors)
            for connector in connectors:
                connector.fprop()
            stack_block.fprop()
            actual = stack_block.output.to_host()

            checks.append(np.allclose(expected, actual))
        self.assertEqual(sum(checks), self.N)
예제 #37
0
    def test_numpy_fprop(self):
        """
        compare gpu `fprop` output of `VerticalStackBlock` with `np.vstack`
        """
        quagga.processor_type = 'gpu'
        checks = []
        for _ in xrange(self.N):
            # all matrices share the column count; each gets its own row count
            ncols = self.rng.random_integers(1, 5000)
            n_matrices = self.rng.random_integers(1, 10)
            arrays = [self.rng.rand(self.rng.random_integers(1, 5000), ncols).astype(np.float32)
                      for _ in xrange(n_matrices)]

            expected = np.vstack(arrays)
            connectors = [Connector(Matrix.from_npa(a)) for a in arrays]
            stack_block = VerticalStackBlock(*connectors)
            for connector in connectors:
                connector.fprop()
            stack_block.fprop()
            actual = stack_block.output.to_host()

            checks.append(np.allclose(expected, actual))
        self.assertEqual(sum(checks), self.N)
예제 #38
0
    def test_fprop(self):
        """
        compare `fprop` results of `RepeatBlock` for cpu and gpu backends
        """
        outcomes = []
        for _ in xrange(self.N):
            repeats = self.rng.random_integers(42)
            axis = self.rng.randint(2)
            shape = self.rng.random_integers(2000, size=2)
            x = self.get_normal_matrix(shape[0], shape[1])

            host_output = {}
            for backend in ('gpu', 'cpu'):
                quagga.processor_type = backend
                x_conn = Connector(Matrix.from_npa(x))
                block = RepeatBlock(x_conn, repeats, axis)
                x_conn.fprop()
                block.fprop()
                host_output[backend] = block.output.to_host()

            outcomes.append(np.allclose(host_output['gpu'], host_output['cpu']))

        self.assertEqual(sum(outcomes), len(outcomes))
예제 #39
0
    def test_bprop(self):
        """
        compare `bprop` results of `SoftmaxCeBlock` for cpu and gpu backends,
        for both label layouts and with and without a mask
        """
        matches = []
        device_id = 0
        for _ in xrange(self.N):
            for sparse in (True, False):
                batch_size, dim = self.rng.random_integers(2000, size=2)
                if sparse:
                    # float32 matrix holding a single 1.0 per row at a random column
                    true_labels = np.zeros((batch_size, dim), np.float32)
                    hot_columns = self.rng.randint(dim, size=batch_size)
                    true_labels[np.arange(batch_size), hot_columns] = 1.0
                else:
                    # one random int32 label index per row
                    true_labels = self.rng.randint(dim, size=(batch_size, 1)).astype(np.int32)
                x = self.rng.randn(batch_size, dim).astype(np.float32)
                mask = (self.rng.rand(batch_size, 1) < 0.8).astype(np.float32)

                for with_mask in (False, True):
                    dL_dx = {}
                    # gpu first, then cpu, on identical host-side inputs
                    for backend in ('gpu', 'cpu'):
                        quagga.processor_type = backend
                        x_conn = Connector(Matrix.from_npa(x), device_id)
                        labels_conn = Connector(Matrix.from_npa(true_labels))
                        if with_mask:
                            mask_conn = Connector(Matrix.from_npa(mask))
                        else:
                            mask_conn = None
                        softmax_ce_block = SoftmaxCeBlock(x_conn, labels_conn, mask_conn)
                        x_conn.fprop()
                        labels_conn.fprop()
                        if with_mask:
                            mask_conn.fprop()
                        softmax_ce_block.fprop()
                        softmax_ce_block.bprop()
                        dL_dx[backend] = x_conn.backward_matrix.to_host()

                    matches.append(np.allclose(dL_dx['gpu'], dL_dx['cpu']))

        self.assertEqual(sum(matches), len(matches))
예제 #40
0
    def test_fprop(self):
        """
        compare `fprop` results of `HorizontalStackBlock` for cpu and gpu backends
        """
        outcomes = []
        for _ in xrange(self.N):
            # all matrices share the row count; each gets its own column count
            nrows = self.rng.random_integers(1, 5000)
            n_matrices = self.rng.random_integers(1, 10)
            arrays = [self.rng.rand(nrows, self.rng.random_integers(1, 5000)).astype(np.float32)
                      for _ in xrange(n_matrices)]

            stacked = {}
            for backend in ('gpu', 'cpu'):
                quagga.processor_type = backend
                connectors = []
                for array in arrays:
                    connector = Connector(Matrix.from_npa(array))
                    connectors.append(connector)
                # every input is propagated before the block is constructed
                for connector in connectors:
                    connector.fprop()
                stack_block = HorizontalStackBlock(*connectors)
                stack_block.fprop()
                stacked[backend] = stack_block.output.to_host()

            outcomes.append(np.allclose(stacked['gpu'], stacked['cpu']))

        self.assertEqual(sum(outcomes), self.N)
예제 #41
0
    def test_fprop(self):
        """
        compare `fprop` results of `VerticalStackBlock` for cpu and gpu backends
        """
        outcomes = []
        for _ in xrange(self.N):
            # all matrices share the column count; each gets its own row count
            ncols = self.rng.random_integers(1, 5000)
            n_matrices = self.rng.random_integers(1, 10)
            arrays = [self.rng.rand(self.rng.random_integers(1, 5000), ncols).astype(np.float32)
                      for _ in xrange(n_matrices)]

            stacked = {}
            for backend in ('gpu', 'cpu'):
                quagga.processor_type = backend
                connectors = []
                for array in arrays:
                    connector = Connector(Matrix.from_npa(array))
                    connectors.append(connector)
                # every input is propagated before the block is constructed
                for connector in connectors:
                    connector.fprop()
                stack_block = VerticalStackBlock(*connectors)
                stack_block.fprop()
                stacked[backend] = stack_block.output.to_host()

            outcomes.append(np.allclose(stacked['gpu'], stacked['cpu']))

        self.assertEqual(sum(outcomes), self.N)
    def test_bprop(self):
        """
        compare `bprop` results of `SequentialMeanPoolingBlock` for cpu and gpu backends
        """
        outcomes = []
        for trial in xrange(self.N):
            max_len = self.rng.random_integers(500)
            if trial == 0:
                # the first trial always uses the full sequence length
                seq_len = max_len
            else:
                seq_len = self.rng.random_integers(max_len)
            batch_size = self.rng.random_integers(512)
            dim = self.rng.random_integers(1500)
            x = [self.rng.rand(batch_size, dim).astype(dtype=np.float32) for _ in xrange(max_len)]

            # each backend restarts from this state, so both draw the same
            # random matrix for the output gradient
            rng_state = self.rng.get_state()
            grads = {}
            for backend in ('gpu', 'cpu'):
                self.rng.set_state(rng_state)
                quagga.processor_type = backend
                context = Context()
                inputs = List([Connector(Matrix.from_npa(e), context, context) for e in x])
                pooling_block = SequentialMeanPoolingBlock(inputs)
                inputs.set_length(seq_len)
                _, dL_doutput = pooling_block.output.register_usage(context, context)
                pooling_block.fprop()
                random_matrix = self.rng.rand(dL_doutput.nrows, dL_doutput.ncols)
                Matrix.from_npa(random_matrix, 'float').copy_to(context, dL_doutput)
                pooling_block.bprop()
                grads[backend] = [e.backward_matrix.to_host() for e in inputs]

            # a trial passes only if every per-step gradient matches
            outcomes.append(all(np.allclose(gpu_grad, cpu_grad)
                                for gpu_grad, cpu_grad in izip(grads['gpu'], grads['cpu'])))

        self.assertEqual(sum(outcomes), self.N)
예제 #43
0
    def test_bprop(self):
        """
        compare `bprop` results of `SigmoidCeBlock` for cpu and gpu backends,
        with and without a mask
        """
        r = []
        for i in xrange(self.N):
            batch_size = self.rng.random_integers(2000)
            true_labels = self.rng.randint(2, size=(batch_size,
                                                    1)).astype(np.float32)
            mask = (self.rng.rand(batch_size, 1) < 0.8).astype(np.float32)
            x = self.rng.randn(batch_size, 1).astype(np.float32)
            device_id = 0

            for with_mask in [False, True]:
                dL_dx = {}
                for processor_type in ['gpu', 'cpu']:
                    # the backend must be selected before the connectors are
                    # built; without switching to 'cpu' here the test would
                    # compare the gpu result with itself
                    quagga.processor_type = processor_type
                    qx = Connector(Matrix.from_npa(x), device_id)
                    qtrue_labels = Connector(Matrix.from_npa(true_labels))
                    qmask = Connector(
                        Matrix.from_npa(mask)) if with_mask else None
                    sigmoid_ce_block = SigmoidCeBlock(qx, qtrue_labels, qmask)
                    qx.fprop()
                    qtrue_labels.fprop()
                    if with_mask:
                        qmask.fprop()
                    sigmoid_ce_block.fprop()
                    sigmoid_ce_block.bprop()
                    dL_dx[processor_type] = qx.backward_matrix.to_host()

                r.append(np.allclose(dL_dx['gpu'], dL_dx['cpu']))

        self.assertEqual(sum(r), len(r))
예제 #44
0
    def test_theano_fprop(self):
        """
        compare gpu `fprop` output of `RepeatBlock` with theano `T.tile`
        """
        outcomes = []
        for _ in xrange(self.N):
            repeats = self.rng.random_integers(42)
            axis = self.rng.randint(2)
            shape = self.rng.random_integers(2000, size=2)
            x = self.get_normal_matrix(shape[0], shape[1])

            quagga.processor_type = 'gpu'
            x_conn = Connector(Matrix.from_npa(x))
            block = RepeatBlock(x_conn, repeats, axis)
            x_conn.fprop()
            block.fprop()
            quagga_result = block.output.to_host()

            # tile `repeats` times along `axis`, once along the other axis
            reps = [repeats, 1] if axis == 0 else [1, repeats]
            th_x = T.fmatrix()
            tile_fn = theano.function([th_x], T.tile(th_x, reps))
            theano_result = tile_fn(x)

            outcomes.append(np.allclose(quagga_result, theano_result))

        self.assertEqual(sum(outcomes), len(outcomes))