def filter_variable(self, other, allow_convert=True):
    """
    Convert ``other`` into a variable of this type, inserting a GpuFromHost
    transfer if ``other`` is a compatible host TensorType variable.
    """
    from theano.gpuarray.basic_ops import GpuFromHost

    if hasattr(other, '_as_GpuArrayVariable'):
        other = other._as_GpuArrayVariable(self.context_name)

    if not isinstance(other, Variable):
        other = self.Constant(type=self, data=other)

    if other.type == self:
        return other

    if not isinstance(other.type, tensor.TensorType):
        raise TypeError('Incompatible type', (self, other.type))
    if other.type.dtype != self.dtype:
        raise TypeError('Incompatible dtype',
                        (self.dtype, other.type.dtype))
    if other.type.ndim != self.ndim:
        raise TypeError('Incompatible number of dimensions.'
                        ' Expected %d, got %d.' % (self.ndim, other.ndim))
    if other.type.broadcastable != self.broadcastable:
        if allow_convert:
            type2 = other.type.clone(broadcastable=self.broadcastable)
            other2 = type2.convert_variable(other)
        else:
            other2 = None
        if other2 is None:
            raise TypeError('Incompatible broadcastable dimensions.'
                            ' Expected %s, got %s.' %
                            (str(other.type.broadcastable),
                             str(self.broadcastable)))
        other = other2

    return GpuFromHost(self.context_name)(other)
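# --- Hedged usage sketch (not part of the original source) ---
# filter_variable is what allows a host TensorType variable to be accepted
# where a GpuArrayType is expected: the host variable is implicitly wrapped
# in GpuFromHost.  The snippet below is illustrative only; the helper name
# ``_sketch_filter_variable`` is invented here, and the code assumes pygpu is
# available and a default GPU context has been initialized.

def _sketch_filter_variable():
    import theano.tensor as tt
    from theano.gpuarray.type import GpuArrayType

    gpu_type = GpuArrayType(dtype="float32", broadcastable=(False, False))
    a = tt.fmatrix("a")

    # The returned variable is the output of GpuFromHost applied to ``a``,
    # so it lives on the context named by ``gpu_type.context_name``.
    a_on_gpu = gpu_type.filter_variable(a)
    assert isinstance(a_on_gpu.type, GpuArrayType)
    return a_on_gpu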
def test_transfer_cpu_gpu():
    a = tt.fmatrix("a")
    g = GpuArrayType(dtype="float32", broadcastable=(False, False))("g")

    av = np.asarray(rng.rand(5, 4), dtype="float32")
    gv = gpuarray.array(av, context=get_context(test_ctx_name))

    # Host -> GPU transfer.
    f = theano.function([a], GpuFromHost(test_ctx_name)(a))
    fv = f(av)
    assert GpuArrayType.values_eq(fv, gv)

    # GPU -> host transfer.
    f = theano.function([g], host_from_gpu(g))
    fv = f(gv)
    assert np.all(fv == av)
def traverse(out, x, x_copy, d, visited=None):
    """
    Function used by scan to parse the tree and figure out which nodes
    it needs to replace.

    There are two options:
        1) x and x_copy are both on the host, in which case x is replaced
           with x_copy
        2) x is on the GPU and x_copy is on the host, in which case
           host_from_gpu(x) is replaced with x_copy

    This happens because initially shared variables are on the GPU, which is
    fine for the main computational graph but confuses things a bit for the
    inner graph of scan.
    """
    # ``visited`` is a set of nodes that are already known and don't need to
    # be checked again, speeding up the traversal of multiply-connected
    # graphs.  If a ``visited`` set is given, it will be updated in-place so
    # the caller knows which nodes we have seen.
    if visited is None:
        visited = set()
    if out in visited:
        return d
    visited.add(out)

    from theano.sandbox import cuda
    from theano.gpuarray.basic_ops import GpuFromHost, host_from_gpu
    from theano.gpuarray import pygpu_activated
    from theano.gpuarray.type import GpuArrayType

    if out == x:
        if isinstance(x.type, cuda.CudaNdarrayType):
            d[out] = cuda.gpu_from_host(x_copy)
        else:
            assert isinstance(x.type, GpuArrayType)
            d[out] = GpuFromHost(x.type.context_name)(x_copy)
        return d
    elif out.owner is None:
        return d
    elif (cuda.cuda_available and
          out.owner.op == cuda.host_from_gpu and
          out.owner.inputs == [x]):
        d[out] = tensor.as_tensor_variable(x_copy)
        return d
    elif (pygpu_activated and
          out.owner.op == host_from_gpu and
          out.owner.inputs == [x]):
        d[out] = tensor.as_tensor_variable(x_copy)
        return d
    else:
        for inp in out.owner.inputs:
            d = traverse(inp, x, x_copy, d, visited)
        return d
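# --- Hedged usage sketch (not part of the original source) ---
# traverse is used by scan to build a replacement dictionary: every place the
# shared variable ``x`` (or ``host_from_gpu(x)``) appears in ``out`` is mapped
# to the scan-internal copy ``x_copy``.  The helper name below is invented for
# illustration; how scan actually applies the replacements is only sketched in
# the comment.

def _sketch_traverse(out, x, x_copy):
    # Start from an empty dict; traverse fills it in and returns it.
    replacements = traverse(out, x, x_copy, {})
    # A caller would then clone the graph with these replacements, e.g.
    #   out_new = theano.clone(out, replace=replacements)
    return replacements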
def test_transfer_strided():
    # This is just to ensure that it works in Theano.
    # libgpuarray has a much more comprehensive suite of tests to
    # ensure correctness.
    a = tt.fmatrix("a")
    g = GpuArrayType(dtype="float32", broadcastable=(False, False))("g")

    av = np.asarray(rng.rand(5, 8), dtype="float32")
    gv = gpuarray.array(av, context=get_context(test_ctx_name))

    av = av[:, ::2]
    gv = gv[:, ::2]

    # Host -> GPU transfer of a non-contiguous (strided) array.
    f = theano.function([a], GpuFromHost(test_ctx_name)(a))
    fv = f(av)
    assert GpuArrayType.values_eq(fv, gv)

    # GPU -> host transfer of a non-contiguous (strided) array.
    f = theano.function([g], host_from_gpu(g))
    fv = f(gv)
    assert np.all(fv == av)
def test_one_sequence_one_output_weights_gpu1(self):
    def f_rnn(u_t, x_tm1, W_in, W):
        return u_t * W_in + x_tm1 * W

    u = theano.tensor.fvector("u")
    x0 = theano.tensor.fscalar("x0")
    W_in = theano.tensor.fscalar("win")
    W = theano.tensor.fscalar("w")

    mode = mode_with_gpu.excluding("InputToGpuOptimizer")
    output, updates = scan(
        f_rnn,
        u,
        x0,
        [W_in, W],
        n_steps=None,
        truncate_gradient=-1,
        go_backwards=False,
        mode=mode,
    )

    output = GpuFromHost(test_ctx_name)(output)
    f2 = theano.function(
        [u, x0, W_in, W],
        output,
        updates=updates,
        allow_input_downcast=True,
        mode=mode,
    )

    rng = np.random.RandomState(utt.fetch_seed())
    v_u = rng.uniform(size=(4,), low=-5.0, high=5.0)
    v_x0 = rng.uniform()
    W = rng.uniform()
    W_in = rng.uniform()

    v_u = np.asarray(v_u, dtype="float32")
    v_x0 = np.asarray(v_x0, dtype="float32")
    W = np.asarray(W, dtype="float32")
    W_in = np.asarray(W_in, dtype="float32")

    # Compute the expected output in numpy.
    v_out = np.zeros((4,))
    v_out[0] = v_u[0] * W_in + v_x0 * W
    for step in range(1, 4):
        v_out[step] = v_u[step] * W_in + v_out[step - 1] * W

    theano_values = f2(v_u, v_x0, W_in, W)
    utt.assert_allclose(theano_values, v_out)

    # TO DEL
    topo = f2.maker.fgraph.toposort()
    scan_node = [node for node in topo if isinstance(node.op, scan.op.Scan)]
    assert len(scan_node) == 1
    scan_node = scan_node[0]

    topo = f2.maker.fgraph.toposort()
    assert sum([isinstance(node.op, HostFromGpu) for node in topo]) == 0
    assert sum([isinstance(node.op, GpuFromHost) for node in topo]) == 4

    scan_node = [node for node in topo if isinstance(node.op, scan.op.Scan)]
    assert len(scan_node) == 1
    scan_node = scan_node[0]
    scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

    # Check that there is no GPU transfer in the inner loop.
    assert any(
        [isinstance(node.op, GpuElemwise) for node in scan_node_topo])
    assert not any(
        [isinstance(node.op, HostFromGpu) for node in scan_node_topo])
    assert not any(
        [isinstance(node.op, GpuFromHost) for node in scan_node_topo])
def safe_to_gpu(x, ctx_name):
    # Only host TensorType variables need an explicit transfer; anything else
    # (e.g. variables already on the GPU, or non-tensor types) is returned
    # unchanged.
    if isinstance(x.type, tensor.TensorType):
        return GpuFromHost(ctx_name)(x)
    else:
        return x
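# --- Hedged usage sketch (not part of the original source) ---
# safe_to_gpu is typically mapped over a heterogeneous list of variables so
# that host tensors are transferred with GpuFromHost while everything else is
# passed through untouched.  The helper name below is invented for
# illustration.

def _sketch_safe_to_gpu(variables, ctx_name):
    return [safe_to_gpu(v, ctx_name) for v in variables]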