Example #1
    def test_gpu3_mixture_dtype_outputs(self):
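        # Like the simple RNN tests below, but the inner function also emits
        # an int64 output, so scan has to handle a mixture of output dtypes
        # on the GPU.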
        def f_rnn(u_t, x_tm1, W_in, W):
            return (u_t * W_in + x_tm1 * W, theano.tensor.cast(u_t + x_tm1, "int64"))

        u = theano.tensor.fvector("u")
        x0 = theano.tensor.fscalar("x0")
        W_in = theano.tensor.fscalar("win")
        W = theano.tensor.fscalar("w")
        output, updates = scan(
            f_rnn,
            u,
            [x0, None],
            [W_in, W],
            n_steps=None,
            truncate_gradient=-1,
            go_backwards=False,
            mode=mode_with_gpu,
        )

        f2 = theano.function(
            [u, x0, W_in, W],
            output,
            updates=updates,
            allow_input_downcast=True,
            mode=mode_with_gpu,
        )

        # get random initial values
        rng = np.random.RandomState(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5.0, high=5.0)
        v_x0 = rng.uniform()
        v_W = rng.uniform()
        v_W_in = rng.uniform()

        # compute the expected output in numpy
        v_out1 = np.zeros((4,))
        v_out2 = np.zeros((4,), dtype="int64")
        v_out1[0] = v_u[0] * v_W_in + v_x0 * v_W
        v_out2[0] = v_u[0] + v_x0
        for step in range(1, 4):
            v_out1[step] = v_u[step] * v_W_in + v_out1[step - 1] * v_W
            v_out2[step] = np.int64(v_u[step] + v_out1[step - 1])

        theano_out1, theano_out2 = f2(v_u, v_x0, v_W_in, v_W)
        utt.assert_allclose(theano_out1, v_out1)
        utt.assert_allclose(theano_out2, v_out2)

        topo = f2.maker.fgraph.toposort()
        scan_node = [node for node in topo if isinstance(node.op, scan.op.Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        assert scan_node.op.gpua

        scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

        # check that there is no gpu transfer in the inner loop.
        assert not any(isinstance(node.op, HostFromGpu) for node in scan_node_topo)
        assert not any(isinstance(node.op, GpuFromHost) for node in scan_node_topo)
Example #2
    def test_one_sequence_one_output_weights_gpu2(self):
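        # Simple RNN, x_t = u_t * W_in + x_{t-1} * W, run through scan on the
        # GPU; the transfer counts below check that the loop itself stays on
        # the GPU, with host<->GPU copies only at the graph boundaries.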
        def f_rnn(u_t, x_tm1, W_in, W):
            return u_t * W_in + x_tm1 * W

        u = theano.tensor.fvector("u")
        x0 = theano.tensor.fscalar("x0")
        W_in = theano.tensor.fscalar("win")
        W = theano.tensor.fscalar("w")
        output, updates = scan(
            f_rnn,
            u,
            x0,
            [W_in, W],
            n_steps=None,
            truncate_gradient=-1,
            go_backwards=False,
            mode=mode_with_gpu,
        )

        f2 = theano.function(
            [u, x0, W_in, W],
            output,
            updates=updates,
            allow_input_downcast=True,
            mode=mode_with_gpu,
        )

        # get random initial values
        rng = np.random.RandomState(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5.0, high=5.0)
        v_x0 = rng.uniform()
        v_W = rng.uniform()
        v_W_in = rng.uniform()

        # compute the expected output in numpy
        v_out = np.zeros((4,))
        v_out[0] = v_u[0] * v_W_in + v_x0 * v_W
        for step in range(1, 4):
            v_out[step] = v_u[step] * v_W_in + v_out[step - 1] * v_W

        theano_values = f2(v_u, v_x0, v_W_in, v_W)
        utt.assert_allclose(theano_values, v_out)

        topo = f2.maker.fgraph.toposort()
        assert sum(isinstance(node.op, HostFromGpu) for node in topo) == 1
        assert sum(isinstance(node.op, GpuFromHost) for node in topo) == 4

        scan_node = [node for node in topo if isinstance(node.op, scan.op.Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

        # check that there is no gpu transfer in the inner loop.
        assert any(isinstance(node.op, GpuElemwise) for node in scan_node_topo)
        assert not any(isinstance(node.op, HostFromGpu) for node in scan_node_topo)
        assert not any(isinstance(node.op, GpuFromHost) for node in scan_node_topo)
Example #3
    def setup_method(self):
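        # Build the same elementwise power computation, A**k, twice: once
        # with a regular scan and once with scan_checkpoints, so the tests
        # can compare their outputs and gradients.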
        self.k = tensor.iscalar("k")
        self.A = tensor.vector("A")
        result, _ = scan(
            fn=lambda prior_result, A: prior_result * A,
            outputs_info=tensor.ones_like(self.A),
            non_sequences=self.A,
            n_steps=self.k,
        )
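        # Checkpointed variant: scan_checkpoints keeps only every
        # save_every_N-th intermediate result during the forward pass and
        # recomputes the rest when the gradient is evaluated, trading
        # computation for memory.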
        result_check, _ = scan_checkpoints(
            fn=lambda prior_result, A: prior_result * A,
            outputs_info=tensor.ones_like(self.A),
            non_sequences=self.A,
            n_steps=self.k,
            save_every_N=100,
        )
        self.result = result[-1]
        self.result_check = result_check[-1]
        self.grad_A = tensor.grad(self.result.sum(), self.A)
        self.grad_A_check = tensor.grad(self.result_check.sum(), self.A)
Example #4
    def test_gibbs_chain(self):
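        # Run a 10-step Gibbs-style chain on the GPU: at each step, every
        # unit of the previous sample is kept alive with probability 0.3.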
        rng = np.random.RandomState(utt.fetch_seed())
        v_vsample = np.array(
            rng.binomial(
                1,
                0.5,
                size=(3, 20),
            ),
            dtype="float32",
        )
        vsample = theano.shared(v_vsample)
        trng = theano.sandbox.rng_mrg.MRG_RandomStreams(utt.fetch_seed())

        def f(vsample_tm1):
            return (
                trng.binomial(vsample_tm1.shape, n=1, p=0.3, dtype="float32")
                * vsample_tm1
            )

        theano_vsamples, updates = scan(
            f,
            [],
            vsample,
            [],
            n_steps=10,
            truncate_gradient=-1,
            go_backwards=False,
            mode=self.mode_with_gpu,
        )
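        # The updates returned by scan carry the state of the MRG random
        # streams used inside the loop; they must be passed to
        # theano.function so the random state advances on every call.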
        my_f = theano.function(
            [],
            theano_vsamples[-1],
            updates=updates,
            allow_input_downcast=True,
            mode=self.mode_with_gpu,
        )

        # We leave the numerical verification to DebugMode; this was always
        # more of a "does the graph compile and run" kind of test.
        my_f()
Example #5
    def test_memory_reuse_gpudimshuffle(self):
        # Test the memory pre-allocation feature in scan when one output is
        # the result of a GpuDimshuffle (because an optimization in
        # GpuDimshuffle can cause issues with the memory pre-allocation
        # where it falsely thinks that a pre-allocated memory region has
        # been used when it hasn't).
        def inner_fn(seq1, recurrent_out):
            temp = seq1 + recurrent_out.sum()
            output1 = temp.dimshuffle(1, 0)
            output2 = temp.sum() + recurrent_out
            return output1, output2

        input1 = theano.tensor.ftensor3()
        init = theano.tensor.ftensor3()
        outputs_info = [None, init]

        out, _ = scan(
            inner_fn,
            sequences=[input1],
            outputs_info=outputs_info,
            mode=self.mode_with_gpu,
        )

        out1 = out[0].flatten()
        out2 = out[1].flatten()

        fct = theano.function([input1, init], [out1, out2],
                              mode=self.mode_with_gpu)

        output = fct(np.ones((2, 1, 1), dtype="float32"),
                     np.ones((1, 1, 1), dtype="float32"))

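        # With all-ones inputs, step 1 gives temp = 2 and recurrent output 3,
        # and step 2 gives temp = 4 and recurrent output 7.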
        expected_output = (
            np.array([2, 4], dtype="float32"),
            np.array([3, 7], dtype="float32"),
        )
        utt.assert_allclose(output, expected_output)
Example #6
    def test_gpu_memory_usage(self):
        # This test validates that the memory usage of the defined theano
        # function is reasonable when executed on the GPU. It checks for
        # a bug in which one of scan's optimizations was not applied, which
        # made the scan node compute large and unnecessary outputs and
        # brought memory usage on the GPU up to ~12G.

        # Dimensionality of input and output data (not one-hot coded)
        n_in = 100
        n_out = 100
        # Number of neurons in hidden layer
        n_hid = 4000

        # Number of minibatches
        mb_size = 2
        # Time steps in minibatch
        mb_length = 200

        # Define input variables
        xin = tensor.ftensor3(name="xin")
        yout = tensor.ftensor3(name="yout")

        # Initialize the network parameters
        U = theano.shared(np.zeros((n_in, n_hid), dtype="float32"),
                          name="W_xin_to_l1")
        V = theano.shared(np.zeros((n_hid, n_hid), dtype="float32"),
                          name="W_l1_to_l1")
        W = theano.shared(np.zeros((n_hid, n_out), dtype="float32"),
                          name="W_l1_to_l2")
        nparams = [U, V, W]

        # Build the forward pass
        l1_base = tensor.dot(xin, U)

        def scan_l(baseline, last_step):
            return baseline + tensor.dot(last_step, V)

        zero_output = tensor.alloc(np.asarray(0.0, dtype="float32"), mb_size,
                                   n_hid)

        l1_out, _ = scan(
            scan_l,
            sequences=[l1_base],
            outputs_info=[zero_output],
            mode=self.mode_with_gpu_nodebug,
        )

        l2_out = tensor.dot(l1_out, W)

        # Compute the cost and take the gradient wrt params
        cost = tensor.sum((l2_out - yout)**2)
        grads = tensor.grad(cost, nparams)
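        # Plain gradient-descent update with an implicit learning rate of 1:
        # p <- p - grad(cost, p)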
        updates = list(zip(nparams, (n - g for n, g in zip(nparams, grads))))

        # Compile the theano function
        feval_backprop = theano.function([xin, yout],
                                         cost,
                                         updates=updates,
                                         mode=self.mode_with_gpu_nodebug)

        # Validate that the PushOutScanOutput optimization has been applied
        # by checking the number of outputs of the grad Scan node in the
        # compiled function.
        nodes = feval_backprop.maker.fgraph.toposort()
        scan_nodes = [n for n in nodes if isinstance(n.op, Scan)]

        # The grad scan is always the 2nd one according to toposort. If the
        # optimization has been applied, it has 2 outputs, otherwise 3.
        grad_scan_node = scan_nodes[1]
        assert len(grad_scan_node.outputs) == 2, len(grad_scan_node.outputs)

        # Call the theano function to ensure the absence of a memory error
        feval_backprop(
            np.zeros((mb_length, mb_size, n_in), dtype="float32"),
            np.zeros((mb_length, mb_size, n_out), dtype="float32"),
        )
Example #7
    def test_one_sequence_one_output_weights_gpu1(self):
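        # Same simple RNN as the gpu2 test, but the output is explicitly
        # moved to the GPU with gpu_from_host, so the compiled function is
        # expected to contain no HostFromGpu node at all.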
        def f_rnn(u_t, x_tm1, W_in, W):
            return u_t * W_in + x_tm1 * W

        u = theano.tensor.fvector("u")
        x0 = theano.tensor.fscalar("x0")
        W_in = theano.tensor.fscalar("win")
        W = theano.tensor.fscalar("w")

        # The following line is needed to exercise the first case; without
        # it, it is the second case that gets tested.
        mode = self.mode_with_gpu.excluding("InputToGpuOptimizer")
        output, updates = scan(
            f_rnn,
            u,
            x0,
            [W_in, W],
            n_steps=None,
            truncate_gradient=-1,
            go_backwards=False,
            mode=mode,
        )

        output = self.gpu_backend.gpu_from_host(output)
        f2 = theano.function(
            [u, x0, W_in, W],
            output,
            updates=updates,
            allow_input_downcast=True,
            mode=self.mode_with_gpu,
        )

        # get random initial values
        rng = np.random.RandomState(utt.fetch_seed())
        v_u = rng.uniform(size=(4,), low=-5.0, high=5.0)
        v_x0 = rng.uniform()
        v_W = rng.uniform()
        v_W_in = rng.uniform()

        v_u = np.asarray(v_u, dtype="float32")
        v_x0 = np.asarray(v_x0, dtype="float32")
        v_W = np.asarray(v_W, dtype="float32")
        v_W_in = np.asarray(v_W_in, dtype="float32")

        # compute the expected output in numpy
        v_out = np.zeros((4,))
        v_out[0] = v_u[0] * v_W_in + v_x0 * v_W
        for step in range(1, 4):
            v_out[step] = v_u[step] * v_W_in + v_out[step - 1] * v_W
        theano_values = f2(v_u, v_x0, v_W_in, v_W)
        utt.assert_allclose(theano_values, v_out)

        topo = f2.maker.fgraph.toposort()
        assert (
            sum(isinstance(node.op, self.gpu_backend.HostFromGpu) for node in topo)
            == 0
        )
        assert (
            sum(isinstance(node.op, self.gpu_backend.GpuFromHost) for node in topo)
            == 4
        )

        scan_node = [node for node in topo if isinstance(node.op, Scan)]
        assert len(scan_node) == 1
        scan_node = scan_node[0]
        scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

        # check that there is no gpu transfer in the inner loop.
        assert any(
            isinstance(node.op, self.gpu_backend.GpuElemwise)
            for node in scan_node_topo
        )
        assert not any(
            isinstance(node.op, self.gpu_backend.HostFromGpu)
            for node in scan_node_topo
        )
        assert not any(
            isinstance(node.op, self.gpu_backend.GpuFromHost)
            for node in scan_node_topo
        )