Example No. 1
    def todense(self, out=None, allocator=mem_alloc, stream=None):
        if out is None:
            out = gpuarray.empty(self.shape, allocator=allocator, dtype=self.dtype, order="C")

        if self.nnz == 0:  # weird but happens
            out.fill(0.0, stream=stream)
            return out

        # we need to out-of-place transpose if we want rowmajor outputs
        # thus we need a temporary to store our results
        if out.flags.c_contiguous:
            tmp = gpuarray.empty(self.shape, allocator=allocator, dtype=self.dtype, order="C")
        else:
            tmp = out

        if stream is not None:
            cusparse.cusparseSetStream(cusparse_handle, stream.handle)
            cublas.cublasSetStream(cublas_handle, stream.handle)

        cusparse.cusparseScsr2dense(cusparse_handle, self.shape[0],
            self.shape[1], self.descr, self.data.gpudata, self.indptr.gpudata,
            self.indices.gpudata, tmp.gpudata, tmp.shape[0])

        if out.flags.c_contiguous:
            cublas.cublasSgeam(cublas_handle, 1, 1, tmp.shape[1], tmp.shape[0],
                           1.0, tmp.gpudata, tmp.shape[0],
                           0.0, 0, tmp.shape[0], out.gpudata, out.shape[1])
        if stream is not None:
            cusparse.cusparseSetStream(cusparse_handle, 0)
            cublas.cublasSetStream(cublas_handle, 0)

        return out
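cusparse<t>csr2dense writes the dense result in column-major order, which is why the snippet above detours through a temporary buffer and a cublasSgeam transpose whenever a C-contiguous output is requested. A CPU-side sketch of the layout issue using NumPy (names are illustrative only):

import numpy as np

m, n = 2, 3
colmajor = np.asfortranarray(np.arange(m * n, dtype=np.float32).reshape(m, n))
# the raw column-major buffer, reread as a row-major (n, m) array, is the transpose
reread = colmajor.ravel(order='F').reshape(n, m)
assert np.array_equal(reread, colmajor.T)
# hence the explicit out-of-place transpose (cublasSgeam above) when C order is wanted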
Example No. 2
def cgemm(A, B, transa=False, transb=False, alpha=1, beta=1):
    """Batched complex matrix multiplication (GEMM) through the cuBLAS C wrapper."""
    CUBLAS_OP_N = cublas._CUBLAS_OP['n']
    CUBLAS_OP_T = cublas._CUBLAS_OP['t']

    m, n, k = A.size(1), B.size(2), A.size(2)
    batchCount = A.size(0)

    C = A.new(batchCount,m,n)

    lda = m
    ldb = k
    ldc = m


    trans_a_ptr = CUBLAS_OP_N if not transa else CUBLAS_OP_T
    trans_b_ptr = CUBLAS_OP_N if not transb else CUBLAS_OP_T

    # one-element CUDA tensors holding the scalar coefficients
    alpha_tensor = torch.cuda.FloatTensor(1).fill_(alpha)
    beta_tensor = torch.cuda.FloatTensor(1).fill_(beta)

    handle = torch.cuda.current_blas_handle()
    stream = torch.cuda.current_stream()._as_parameter_
    cublas.cublasSetStream(handle, stream)
    cublas.cublasCgemmBatched(handle, trans_a_ptr, trans_b_ptr, m, n, k,
                              alpha_tensor.data_ptr(), A.data_ptr(), lda,
                              B.data_ptr(), ldb, beta_tensor.data_ptr(),
                              C.data_ptr(), ldc, batchCount)
    return C
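Note that the batched cuBLAS entry points (cublasCgemmBatched and friends) take arrays of device pointers, one pointer per matrix in the batch, rather than a single base pointer. Below is a sketch of a helper that builds such a pointer array for a contiguous batched tensor; the helper name and the exact convention expected by the wrapper above are assumptions:

import torch

def batch_ptr_array(t):
    # one device pointer per batch element, stored as int64 so cuBLAS can read the array on-device
    stride_bytes = t[0].nelement() * t.element_size()
    ptrs = t.data_ptr() + stride_bytes * torch.arange(t.size(0), dtype=torch.int64)
    return ptrs.cuda()

# e.g. batch_ptr_array(A).data_ptr() could then be handed to a *Batched call as the Aarray argument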
    def backward(ctx, grad_output):
        A, B = ctx.saved_tensors
        conjA = A.clone()
        conjB = B.clone()
        conjA[..., 1] = -A[..., 1]
        conjB[..., 1] = -B[..., 1]
        #conjA[:,:,:,:,1] = -A[:,:,:,:,1]
        #conjB[:,:,1] = -B[:,:,1]
        m, n = conjB.nelement() // 2, conjA.nelement() // conjB.nelement()
        # m is M*N (the filter size), n is B*C (the batch size)
        gradA = conjA.new(conjA.size())  # (n,m), col-major
        gradC = grad_output.contiguous()  # (n,m), col-major
        # grad_A = grad_C * conj(B)
        lda = m
        ldc = m
        incx = 1
        handle = torch.cuda.current_blas_handle()
        stream = torch.cuda.current_stream()._as_parameter_
        cublas.cublasSetStream(handle, stream)
        cublas.cublasCdgmm(handle, 'l', m, n, gradC.data_ptr(), lda,
                           conjB.data_ptr(), incx, gradA.data_ptr(), ldc)

        # grad_B = sum_n grad_C * conj(A)
        # view grad_C and conjA as one vector of size n*m
        gradB_ = gradC.new(gradC.size())  # mul(gradC,conjA) # (B,C,M,N,2)
        lda = m * n
        ldc = m * n
        incx = 1
        cublas.cublasSetStream(handle, stream)
        cublas.cublasCdgmm(handle, 'l', m * n, 1, gradC.data_ptr(), lda,
                           conjA.data_ptr(), incx, gradB_.data_ptr(), ldc)
        gradB = torch.sum(torch.sum(gradB_, 0), 0)  # (m)

        return gradA, gradB
    def forward(ctx, A, B):
        # assume A and B have the same size, with last dim = 2 (interleaved complex)
        A, B = A.contiguous(), B.contiguous()
        ctx.save_for_backward(A, B)

        if not iscomplex(A) or not iscomplex(B):
            raise TypeError('The input, filter and output should be complex')

        if A.nelement() != B.nelement():
            raise TypeError('The input and filter should have same size')

        if type(A) is not type(B):
            raise RuntimeError('A and B should be same type!')

        if not A.is_cuda:
            raise RuntimeError('Use the torch backend for cpu tensors!')

        C = A.new(A.size())
        m, n = B.nelement() // 2, A.nelement() // B.nelement()
        lda = m
        ldc = m
        incx = 1
        handle = torch.cuda.current_blas_handle()
        stream = torch.cuda.current_stream()._as_parameter_
        cublas.cublasSetStream(handle, stream)
        cublas.cublasCdgmm(handle, 'l', m, n, A.data_ptr(), lda, B.data_ptr(),
                           incx, C.data_ptr(), ldc)
        return C
Example No. 5
def cdgmm(A, B, inplace=False):
    """
        Complex pointwise multiplication between (batched) tensor A and tensor B.

        Parameters
        ----------
        A : tensor
            A is a complex tensor of size (B, C, M, N, 2)
        B : tensor
            B is a complex tensor of size (M, N, 2) or real tensor of (M, N, 1)
        inplace : boolean, optional
            if set to True, all the operations are performed inplace

        Returns
        -------
        C : tensor
            output tensor of size (B, C, M, N, 2) such that:
            C[b, c, m, n, :] = A[b, c, m, n, :] * B[m, n, :]
    """
    if not iscomplex(A):
        raise TypeError('The input must be complex, indicated by a last '
                        'dimension of size 2')

    if B.ndimension() != 3:
        raise RuntimeError('The filter must be a 3-tensor, with a last '
                           'dimension of size 1 or 2 to indicate it is real '
                           'or complex, respectively')

    if not iscomplex(B) and not isreal(B):
        raise TypeError('The filter must be complex or real, indicated by a '
                        'last dimension of size 2 or 1, respectively')

    if A.size()[-3:-1] != B.size()[-3:-1]:
        raise RuntimeError('The filters are not compatible for multiplication!')

    if A.dtype is not B.dtype:
        raise RuntimeError('A and B must be of the same dtype')

    if A.device != B.device:
        raise RuntimeError('A and B must be on the same device')

    if isreal(B):
        if inplace:
            return A.mul_(B)
        else:
            return A * B
    else:
        A, B = A.contiguous(), B.contiguous()
        C = A.new(A.size()) if not inplace else A
        m, n = B.nelement() // 2, A.nelement() // B.nelement()
        lda = m
        ldc = m
        incx = 1
        handle = torch.cuda.current_blas_handle()
        stream = torch.cuda.current_stream()._as_parameter_
        cublas.cublasSetStream(handle, stream)
        cublas.cublasCdgmm(handle, 'l', m, n, A.data_ptr(), lda, B.data_ptr(), incx, C.data_ptr(), ldc)
        return C
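A usage sketch for the cdgmm above, assuming a CUDA device is available; the shapes follow the docstring and are purely illustrative:

import torch

A = torch.randn(8, 3, 32, 32, 2, device='cuda')  # batched complex input (B, C, M, N, 2)
B = torch.randn(32, 32, 2, device='cuda')         # complex filter (M, N, 2)
C = cdgmm(A, B)  # C[b, c, m, n, :] = A[b, c, m, n, :] * B[m, n, :]
assert C.shape == A.shape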
Example No. 6
def cublas_dot(a,
               b,
               out=None,
               transpose_a=False,
               transpose_b=False,
               increment=False,
               stream=None):
    """Matrix multiplication using CUBLAS."""

    assert not increment or out is not None

    dtype = a.dtype

    if transpose_a:
        a1, a0 = a.shape
    else:
        a0, a1 = a.shape

    if transpose_b:
        b1, b0 = b.shape
    else:
        b0, b1 = b.shape

    assert a1 == b0

    if out is None:
        out = gpuarray.zeros((a0, b1), dtype=dtype)

    assert a.dtype == b.dtype == out.dtype

    # note: we swap the order of a and b and swap the transposes because
    # cublas assumes column-major ordering
    transa = "t" if transpose_a else "n"
    transb = "t" if transpose_b else "n"
    beta = dtype.type(1.0) if increment else dtype.type(0.0)
    lda = a0 if transpose_a else a1
    ldb = b0 if transpose_b else b1
    ldout = b1

    if stream is not None:
        # note: this assumes that the stream is set to the default stream to
        # start
        cublas.cublasSetStream(misc._global_cublas_handle, stream.handle)

    if dtype == np.float32:
        gemm = cublas.cublasSgemm
    else:
        gemm = cublas.cublasDgemm

    gemm(misc._global_cublas_handle, transb, transa, b1, a0, a1,
         dtype.type(1.0), b.gpudata, ldb, a.gpudata, lda, beta, out.gpudata,
         ldout)

    if stream is not None:
        cublas.cublasSetStream(misc._global_cublas_handle, 0)

    return out
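A usage sketch for cublas_dot, assuming skcuda's global handle has been set up with skcuda.misc.init(); the comparison against NumPy is only a sanity check:

import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates the CUDA context
import pycuda.gpuarray as gpuarray
from skcuda import misc

misc.init()  # initialises misc._global_cublas_handle used above
a = gpuarray.to_gpu(np.random.rand(4, 5).astype(np.float32))
b = gpuarray.to_gpu(np.random.rand(5, 3).astype(np.float32))
c = cublas_dot(a, b)
assert np.allclose(c.get(), a.get().dot(b.get()), atol=1e-5)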
Example No. 7
    def todense(self, out=None, allocator=mem_alloc, stream=None):
        if out is None:
            out = gpuarray.empty(self.shape, allocator=allocator, dtype=self.dtype, order="C")

        if self.nnz == 0:  # weird but happens
            out.fill(0.0, stream=stream)
            return out

        # we need to out-of-place transpose if we want rowmajor outputs
        # thus we need a temporary to store our results
        if out.flags.c_contiguous:
            tmp = gpuarray.empty(self.shape, allocator=allocator, dtype=self.dtype, order="C")
        else:
            tmp = out

        if stream is not None:
            cusparse.cusparseSetStream(cusparse_handle, stream.handle)
            cublas.cublasSetStream(cublas_handle, stream.handle)

        cusparse.cusparseScsr2dense(
            cusparse_handle,
            self.shape[0],
            self.shape[1],
            self.descr,
            self.data.gpudata,
            self.indptr.gpudata,
            self.indices.gpudata,
            tmp.gpudata,
            tmp.shape[0],
        )

        if out.flags.c_contiguous:
            cublas.cublasSgeam(
                cublas_handle,
                1,
                1,
                tmp.shape[1],
                tmp.shape[0],
                1.0,
                tmp.gpudata,
                tmp.shape[0],
                0.0,
                0,
                tmp.shape[0],
                out.gpudata,
                out.shape[1],
            )
        if stream is not None:
            cusparse.cusparseSetStream(cusparse_handle, 0)
            cublas.cublasSetStream(cublas_handle, 0)

        return out
Example No. 8
def cublas_dot(a, b, out=None, transpose_a=False, transpose_b=False,
               increment=False, stream=None):
    """Matrix multiplication using CUBLAS."""

    assert not increment or out is not None

    dtype = a.dtype

    if transpose_a:
        a1, a0 = a.shape
    else:
        a0, a1 = a.shape

    if transpose_b:
        b1, b0 = b.shape
    else:
        b0, b1 = b.shape

    assert a1 == b0

    if out is None:
        out = gpuarray.zeros((a0, b1), dtype=dtype)

    assert a.dtype == b.dtype == out.dtype

    # note: we swap the order of a and b and swap the transposes because
    # cublas assumes column-major ordering
    transa = "t" if transpose_a else "n"
    transb = "t" if transpose_b else "n"
    beta = dtype.type(1.0) if increment else dtype.type(0.0)
    lda = a0 if transpose_a else a1
    ldb = b0 if transpose_b else b1
    ldout = b1

    if stream is not None:
        # note: this assumes that the stream is set to the default stream to
        # start
        cublas.cublasSetStream(misc._global_cublas_handle, stream.handle)

    if dtype == np.float32:
        gemm = cublas.cublasSgemm
    else:
        gemm = cublas.cublasDgemm

    gemm(misc._global_cublas_handle, transb, transa, b1, a0, a1,
         dtype.type(1.0), b.gpudata, ldb, a.gpudata, lda, beta, out.gpudata,
         ldout)

    if stream is not None:
        cublas.cublasSetStream(misc._global_cublas_handle, 0)

    return out
    def forward(ctx, A, B):
        """
        Complex pointwise multiplication between (batched) tensor A and tensor B.

        Parameters
        ----------
        A : tensor
            input tensor with size (B, C, M, N, 2)
        B : tensor
            B is a complex tensor of size (M, N, 2)

        Returns
        -------
        C : tensor
            output tensor of size (B, C, M, N, 2) such that:
            C[b, c, m, n, :] = A[b, c, m, n, :] * B[m, n, :]
        """
        A, B = A.contiguous(), B.contiguous()
        ctx.save_for_backward(A, B)

        if A.size()[-3:] != B.size():
            raise RuntimeError(
                'The filters are not compatible for multiplication!')

        if not iscomplex(A) or not iscomplex(B):
            raise TypeError('The input, filter and output should be complex')

        if B.ndimension() != 3:
            raise RuntimeError('The filters must be simply a complex array!')

        if type(A) is not type(B):
            raise RuntimeError('A and B should be same type!')

        if not A.is_cuda:
            raise RuntimeError('Use the torch backend for cpu tensors!')

        C = A.new(A.size())
        m, n = B.nelement() // 2, A.nelement() // B.nelement()
        lda = m
        ldc = m
        incx = 1
        handle = torch.cuda.current_blas_handle()
        stream = torch.cuda.current_stream()._as_parameter_
        cublas.cublasSetStream(handle, stream)
        cublas.cublasCdgmm(handle, 'l', m, n, A.data_ptr(), lda, B.data_ptr(),
                           incx, C.data_ptr(), ldc)
        return C
Example No. 10
def cdgmm(A, B, jit=True, inplace=False):
    """This function uses the C-wrapper to use cuBLAS.
        """
    A, B = A.contiguous(), B.contiguous()

    if A.size()[-3:] != B.size():
        raise RuntimeError(
            'The filters are not compatible for multiplication!')

    if not iscomplex(A) or not iscomplex(B):
        raise TypeError('The input, filter and output should be complex')

    if B.ndimension() != 3:
        raise RuntimeError('The filters must be simply a complex array!')

    if type(A) is not type(B):
        raise RuntimeError('A and B should be same type!')

    if not jit or isinstance(A, (torch.FloatTensor, torch.DoubleTensor)):
        C = A.new(A.size())

        A_r = A[..., 0].contiguous().view(-1, A.size(-2) * A.size(-3))
        A_i = A[..., 1].contiguous().view(-1, A.size(-2) * A.size(-3))

        B_r = B[..., 0].contiguous().view(
            B.size(-2) * B.size(-3)).unsqueeze(0).expand_as(A_i)
        B_i = B[..., 1].contiguous().view(
            B.size(-2) * B.size(-3)).unsqueeze(0).expand_as(A_r)

        C[..., 0].copy_(A_r * B_r - A_i * B_i)
        C[..., 1].copy_(A_r * B_i + A_i * B_r)

        # faster if B is actually real
        #B[...,1] = B[...,0]
        #C = A * B.unsqueeze(0).expand_as(A)
        return C if not inplace else A.copy_(C)
    else:
        C = A.new(A.size()) if not inplace else A
        m, n = B.nelement() // 2, A.nelement() // B.nelement()
        lda = m
        ldc = m
        incx = 1
        handle = torch.cuda.current_blas_handle()
        stream = torch.cuda.current_stream()._as_parameter_
        cublas.cublasSetStream(handle, stream)
        cublas.cublasCdgmm(handle, 'l', m, n, A.data_ptr(), lda, B.data_ptr(),
                           incx, C.data_ptr(), ldc)
        return C
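The non-jit branch above spells out the interleaved complex product by hand. Below is a minimal check of that formula against PyTorch's native complex multiply, assuming a PyTorch version that provides view_as_complex:

import torch

a, b = torch.randn(4, 2), torch.randn(4, 2)
manual = torch.stack([a[:, 0] * b[:, 0] - a[:, 1] * b[:, 1],
                      a[:, 0] * b[:, 1] + a[:, 1] * b[:, 0]], dim=-1)
native = torch.view_as_real(torch.view_as_complex(a) * torch.view_as_complex(b))
assert torch.allclose(manual, native)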
Example No. 11
def cublas_dgmm(A, x, out=None):
    if out is not None:
        assert out.is_contiguous() and out.size() == A.size()
    else:
        out = A.new(A.size())
    assert x.dim() == 1
    assert x.numel() == A.size(-1) or x.numel() == A.size(0)
    assert A.type() == x.type() == out.type()
    assert A.is_contiguous()

    if not isinstance(A, (torch.cuda.FloatTensor, torch.cuda.DoubleTensor)):
        if x.numel() == A.size(-1):
            return A.mm(torch.diag(x), out=out.view_as(A))
        else:
            return torch.diag(x).mm(A, out=out.view_as(A))
    else:
        if x.numel() == A.size(-1):
            m, n = A.size(-1), A.numel() // A.size(-1)
            mode = 'l'
            # A.mm(x.diag(), out=out)
            # return out
        elif x.numel() == A.size(0):
            n, m = A.size(0), A.numel() // A.size(0)
            mode = 'r'
            # if A.stride(0) == 1:
            #     mode = 'l'
            #     n, m = m, n
            # x.diag().mm(A, out=out)
            # return out
        lda, ldc = m, m
        incx = 1
        handle = torch.cuda.current_blas_handle()
        stream = torch.cuda.current_stream()._as_parameter_
        from skcuda import cublas
        cublas.cublasSetStream(handle, stream)
        args = [
            handle, mode, m, n,
            A.data_ptr(), lda,
            x.data_ptr(), incx,
            out.data_ptr(), ldc
        ]
        if isinstance(A, torch.cuda.FloatTensor):
            cublas.cublasSdgmm(*args)
        elif isinstance(A, torch.cuda.DoubleTensor):
            cublas.cublasDdgmm(*args)
        return out
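A usage sketch for cublas_dgmm with x matching A.size(-1), in which case the result should agree with right-multiplication by diag(x); the shapes are illustrative and a CUDA device is assumed:

import torch

A = torch.randn(6, 4).cuda()
x = torch.randn(4).cuda()
out = cublas_dgmm(A, x)  # scales column j of A by x[j]
assert torch.allclose(out, A.mm(torch.diag(x)), atol=1e-5)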
Example No. 12
    def forward(self, vector1, vector2):
        with torch.cuda.device_of(vector1):
            output = vector1.new(1)
            handle = torch.cuda.current_blas_handle()
            # skcuda expects the raw cudaStream_t, hence _as_parameter_
            stream = torch.cuda.current_stream()._as_parameter_
            cublas.cublasSetStream(handle, stream)
            if isinstance(vector1, torch.cuda.FloatTensor):
                result = cublas.cublasSdot(handle, vector1.numel(),
                                           vector1.data_ptr(), 1,
                                           vector2.data_ptr(), 1)
            elif isinstance(vector1, torch.cuda.DoubleTensor):
                result = cublas.cublasDdot(handle, vector1.numel(),
                                           vector1.data_ptr(), 1,
                                           vector2.data_ptr(), 1)
            output = output.fill_(float(result))
        self.save_for_backward(vector1, vector2)
        return output
Example No. 13
def cdgmm3d(A, B, inplace=False):
    """
    Pointwise multiplication of complex tensors.

    Parameters
    ----------
    A: complex tensor
    B: complex tensor of the same size as A

    Returns
    -------
    output : tensor of the same size as A containing the result of the
             elementwise complex multiplication of  A with B
    """
    if not A.is_contiguous():
        warnings.warn("cdgmm3d: tensor A is converted to a contiguous array")
        A = A.contiguous()
    if not B.is_contiguous():
        warnings.warn("cdgmm3d: tensor B is converted to a contiguous array")
        B = B.contiguous()

    if A.size()[-4:] != B.size():
        raise RuntimeError('The filters are not compatible for multiplication.')

    if not iscomplex(A) or not iscomplex(B):
        raise TypeError('The input, filter and output should be complex.')

    if B.ndimension() != 4:
        raise RuntimeError('The filters must be simply a complex array.')

    if type(A) is not type(B):
        raise RuntimeError('A and B should be same type.')

    if not A.is_cuda:
        raise RuntimeError('Use the torch backend for cpu tensors.')

    C = A.new(A.size()) if not inplace else A
    m, n = B.nelement() // 2, A.nelement() // B.nelement()
    lda = m
    ldc = m
    incx = 1
    handle = torch.cuda.current_blas_handle()
    stream = torch.cuda.current_stream()._as_parameter_
    cublas.cublasSetStream(handle, stream)
    cublas.cublasCdgmm(handle, 'l', m, n, A.data_ptr(), lda, B.data_ptr(), incx, C.data_ptr(), ldc)
    return C
def cdgmm(A, B, jit=True, inplace=False):
    """This function uses the C-wrapper to use cuBLAS.
        """
    A, B = A.contiguous(), B.contiguous()

    if A.size()[-3:] != B.size():
        raise RuntimeError('The filters are not compatible for multiplication!')

    if not iscomplex(A) or not iscomplex(B):
        raise TypeError('The input, filter and output should be complex')

    if B.ndimension() != 3:
        raise RuntimeError('The filters must be simply a complex array!')

    if type(A) is not type(B):
        raise RuntimeError('A and B should be same type!')

    if not jit or isinstance(A, (torch.FloatTensor, torch.DoubleTensor)):
        C = A.new(A.size())

        A_r = A[..., 0].contiguous().view(-1, A.size(-2)*A.size(-3))
        A_i = A[..., 1].contiguous().view(-1, A.size(-2)*A.size(-3))

        B_r = B[..., 0].contiguous().view(B.size(-2)*B.size(-3)).unsqueeze(0).expand_as(A_i)
        B_i = B[..., 1].contiguous().view(B.size(-2)*B.size(-3)).unsqueeze(0).expand_as(A_r)

        C[..., 0].copy_(A_r * B_r - A_i * B_i)
        C[..., 1].copy_(A_r * B_i + A_i * B_r)

        # faster if B is actually real
        #B[...,1] = B[...,0]
        #C = A * B.unsqueeze(0).expand_as(A)
        return C if not inplace else A.copy_(C)
    else:
        C = A.new(A.size()) if not inplace else A
        m, n = B.nelement() // 2, A.nelement() // B.nelement()
        lda = m
        ldc = m
        incx = 1
        handle = torch.cuda.current_blas_handle()
        stream = torch.cuda.current_stream()._as_parameter_
        cublas.cublasSetStream(handle, stream)
        cublas.cublasCdgmm(handle, 'l', m, n, A.data_ptr(), lda, B.data_ptr(), incx, C.data_ptr(), ldc)
        return C
Example No. 15
    def forward(self, matrix1, matrix2):
        with torch.cuda.device_of(matrix1):
            dim1, dim2 = matrix1.size()
            dim2, dim3 = matrix2.size()
            output = matrix1.new(dim1, dim3)
            handle = torch.cuda.current_blas_handle()
            # skcuda expects the raw cudaStream_t, hence _as_parameter_
            stream = torch.cuda.current_stream()._as_parameter_
            cublas.cublasSetStream(handle, stream)
            if isinstance(matrix1, torch.cuda.FloatTensor):
                cublas.cublasSgemm(handle, 'n', 'n', dim3, dim1, dim2, 1,
                                   matrix2.data_ptr(), dim3,
                                   matrix1.data_ptr(), dim2, 0,
                                   output.data_ptr(), dim3)
            elif isinstance(matrix1, torch.cuda.DoubleTensor):
                cublas.cublasDgemm(handle, 'n', 'n', dim3, dim1, dim2, 1,
                                   matrix2.data_ptr(), dim3,
                                   matrix1.data_ptr(), dim2, 0,
                                   output.data_ptr(), dim3)
        self.save_for_backward(matrix1, matrix2)
        return output
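The operand swap in the gemm calls above is the standard trick for getting a row-major product out of a column-major BLAS: reading the row-major buffers as column-major transposes them, and (A B)^T = B^T A^T, so asking cuBLAS for "matrix2 times matrix1" leaves A B in the output buffer. A NumPy illustration of the identity:

import numpy as np

A = np.random.rand(3, 4).astype(np.float32)  # dim1 x dim2
B = np.random.rand(4, 5).astype(np.float32)  # dim2 x dim3
assert np.allclose((B.T @ A.T).T, A @ B)     # why matrix2 is passed before matrix1 above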
Example No. 16
def cublas_cdgmm(A, x, out=None):
    if out is not None:
        assert out.is_contiguous() and out.size() == A.size()
    else:
        out = A.new(A.size())
    assert x.dim() == 2 and x.size(-1) == 2 and A.size(-1) == 2
    assert A.dim() == 3
    assert x.size(0) == A.size(1) or x.size(0) == A.size(0)
    assert A.type() == x.type() == out.type()
    assert A.is_contiguous()

    if not isinstance(A, (torch.cuda.FloatTensor, torch.cuda.DoubleTensor)):
        raise NotImplementedError
    else:
        m, n = A.size(1), A.size(0)
        if x.size(0) == A.size(1):
            mode = 'l'
        elif x.size(0) == A.size(0):
            mode = 'r'
        lda, ldc = m, m
        incx = 1
        handle = torch.cuda.current_blas_handle()
        stream = torch.cuda.current_stream()._as_parameter_
        from skcuda import cublas
        cublas.cublasSetStream(handle, stream)
        args = [
            handle, mode, m, n,
            A.data_ptr(), lda,
            x.data_ptr(), incx,
            out.data_ptr(), ldc
        ]
        if isinstance(A, torch.cuda.FloatTensor):
            cublas.cublasCdgmm(*args)
        elif isinstance(A, torch.cuda.DoubleTensor):
            cublas.cublasZdgmm(*args)
        return out
Example No. 17
def cublas_handle():
    cublas_handle = cublas.cublasCreate()
    cublas.cublasSetStream(cublas_handle, stream().handle)
    return cublas_handle
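A small sketch of the same pattern with an explicit pycuda stream; the stream() helper above is project-specific, so a freshly created driver.Stream() stands in for it here:

import pycuda.autoinit  # noqa: F401
import pycuda.driver as driver
from skcuda import cublas

s = driver.Stream()
h = cublas.cublasCreate()
cublas.cublasSetStream(h, s.handle)  # cuBLAS calls made with h now run on stream s
# ... enqueue cuBLAS work here ...
cublas.cublasDestroy(h)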
Example No. 18
def cdgmm3d(A, B, inplace=False):
    """Complex pointwise multiplication.

        Complex pointwise multiplication between (batched) tensor A and tensor B.

        Parameters
        ----------
        A : torch tensor
            Complex torch tensor.
        B : torch tensor
            Complex of the same size as A.
        inplace : boolean, optional
            If set True, all the operations are performed inplace.

        Raises
        ------
        RuntimeError
            In the event that the tensors are not compatible for multiplication
            (i.e. the final four dimensions of A do not match the dimensions
            of B), or in the event that B is not complex, or in the event that
            the types of A and B are not the same.
        TypeError
            In the event that A is not complex, i.e. does not have a final
            dimension of 2, or in the event that both tensors are not on the
            same device.

        Returns
        -------
        output : torch tensor
            Torch tensor of the same size as A containing the result of the
            elementwise complex multiplication of A with B.

    """
    if not A.is_contiguous():
        warnings.warn("cdgmm3d: tensor A is converted to a contiguous array")
        A = A.contiguous()
    if not B.is_contiguous():
        warnings.warn("cdgmm3d: tensor B is converted to a contiguous array")
        B = B.contiguous()

    if A.shape[-4:] != B.shape:
        raise RuntimeError(
            'The filters are not compatible for multiplication.')

    if not _is_complex(A) or not _is_complex(B):
        raise TypeError('The input, filter and output should be complex.')

    if B.ndimension() != 4:
        raise RuntimeError('The filters must be simply a complex array.')

    if type(A) is not type(B):
        raise RuntimeError('A and B should be same type.')

    if not A.is_cuda:
        raise RuntimeError('Use the torch backend for CPU tensors.')

    C = A.new(A.shape) if not inplace else A
    m, n = B.nelement() // 2, A.nelement() // B.nelement()
    lda = m
    ldc = m
    incx = 1
    handle = torch.cuda.current_blas_handle()
    stream = torch.cuda.current_stream()._as_parameter_
    cublas.cublasSetStream(handle, stream)
    cublas.cublasCdgmm(handle, 'l', m, n, A.data_ptr(), lda, B.data_ptr(),
                       incx, C.data_ptr(), ldc)
    return C
Example No. 19
def vis_gpu(antpos,
            freq,
            eq2tops,
            crd_eq,
            I_sky,
            bm_cube,
            nthreads=NTHREADS,
            max_memory=MAX_MEMORY,
            real_dtype=np.float32,
            complex_dtype=np.complex64,
            verbose=False):
    # ensure shapes
    nant = antpos.shape[0]
    assert (antpos.shape == (nant, 3))
    npix = crd_eq.shape[1]
    assert (crd_eq.shape == (3, npix))
    assert (I_sky.shape == (npix, ))
    beam_px = bm_cube.shape[1]
    assert (bm_cube.shape == (nant, beam_px, beam_px))
    ntimes = eq2tops.shape[0]
    assert (eq2tops.shape == (ntimes, 3, 3))
    # ensure data types
    antpos = antpos.astype(real_dtype)
    eq2tops = eq2tops.astype(real_dtype)
    crd_eq = crd_eq.astype(real_dtype)
    Isqrt = np.sqrt(I_sky).astype(real_dtype)
    bm_cube = bm_cube.astype(real_dtype)  # XXX complex?
    chunk = max(min(npix, MIN_CHUNK),
                2**int(ceil(np.log2(float(nant * npix) / max_memory / 2))))
    npixc = npix // chunk
    # blocks of threads are mapped to (pixels,ants,freqs)
    block = (max(1, nthreads // nant), min(nthreads, nant), 1)
    grid = (int(ceil(npixc / float(block[0]))),
            int(ceil(nant / float(block[1]))))
    gpu_code = GPU_TEMPLATE % {
        'NANT': nant,
        'NPIX': npixc,
        'BEAM_PX': beam_px,
        'BLOCK_PX': block[0],
    }
    gpu_module = compiler.SourceModule(gpu_code)
    bm_interp = gpu_module.get_function("InterpolateBeam")
    meas_eq = gpu_module.get_function("MeasEq")
    bm_texref = gpu_module.get_texref("bm_tex")
    import pycuda.autoinit
    h = cublasCreate()  # handle for managing cublas
    # define GPU buffers and transfer initial values
    bm_texref.set_array(
        numpy3d_to_array(bm_cube)
    )  # never changes, transpose happens in copy so cuda bm_tex is (BEAM_PX,BEAM_PX,NANT)
    antpos_gpu = gpuarray.to_gpu(
        antpos)  # never changes, set to -2*pi*antpos/c
    Isqrt_gpu = gpuarray.empty(shape=(npixc, ), dtype=real_dtype)
    A_gpu = gpuarray.empty(shape=(nant, npixc),
                           dtype=real_dtype)  # will be set on GPU by bm_interp
    crd_eq_gpu = gpuarray.empty(shape=(3, npixc), dtype=real_dtype)
    eq2top_gpu = gpuarray.empty(shape=(3, 3),
                                dtype=real_dtype)  # sent from CPU each time
    crdtop_gpu = gpuarray.empty(shape=(3, npixc),
                                dtype=real_dtype)  # will be set on GPU
    tau_gpu = gpuarray.empty(shape=(nant, npixc),
                             dtype=real_dtype)  # will be set on GPU
    v_gpu = gpuarray.empty(shape=(nant, npixc),
                           dtype=complex_dtype)  # will be set on GPU
    vis_gpus = [
        gpuarray.empty(shape=(nant, nant), dtype=complex_dtype)
        for i in range(chunk)
    ]
    # output CPU buffers for downloading answers
    vis_cpus = [
        np.empty(shape=(nant, nant), dtype=complex_dtype)
        for i in range(chunk)
    ]
    streams = [driver.Stream() for i in range(chunk)]
    event_order = ('start', 'upload', 'eq2top', 'tau', 'interpolate',
                   'meas_eq', 'vis', 'end')
    vis = np.empty((ntimes, nant, nant), dtype=complex_dtype)
    for t in range(ntimes):
        if verbose:
            print('%d/%d' % (t + 1, ntimes))
        eq2top_gpu.set(
            eq2tops[t])  # defines sky orientation for this time step
        events = [{e: driver.Event()
                   for e in event_order} for i in range(chunk)]
        for c in range(chunk + 2):
            cc = c - 1
            ccc = c - 2
            if 0 <= ccc < chunk:
                stream = streams[ccc]
                vis_gpus[ccc].get_async(ary=vis_cpus[ccc], stream=stream)
                events[ccc]['end'].record(stream)
            if 0 <= cc < chunk:
                stream = streams[cc]
                cublasSetStream(h, stream.handle)
                ## compute crdtop = dot(eq2top,crd_eq)
                # cublas arrays are in Fortran order, so P=M*N is actually
                # peformed as P.T = N.T * M.T
                cublasSgemm(h, 'n', 'n', npixc, 3, 3, 1., crd_eq_gpu.gpudata,
                            npixc, eq2top_gpu.gpudata, 3, 0.,
                            crdtop_gpu.gpudata, npixc)
                events[cc]['eq2top'].record(stream)
                ## compute tau = dot(antpos,crdtop)
                cublasSgemm(h, 'n', 'n', npixc, nant, 3, 1.,
                            crdtop_gpu.gpudata, npixc, antpos_gpu.gpudata, 3,
                            0., tau_gpu.gpudata, npixc)
                events[cc]['tau'].record(stream)
                ## interpolate bm_tex at specified topocentric coords, store interpolation in A
                ## threads are parallelized across pixel axis
                bm_interp(crdtop_gpu,
                          A_gpu,
                          grid=grid,
                          block=block,
                          stream=stream)
                events[cc]['interpolate'].record(stream)
                # compute v = A * I * exp(1j*tau*freq)
                meas_eq(A_gpu,
                        Isqrt_gpu,
                        tau_gpu,
                        real_dtype(freq),
                        v_gpu,
                        grid=grid,
                        block=block,
                        stream=stream)
                events[cc]['meas_eq'].record(stream)
                # compute vis = dot(v, v.T)
                # transpose below incurs about 20% overhead
                cublasCgemm(h, 'c', 'n', nant, nant, npixc, 1., v_gpu.gpudata,
                            npixc, v_gpu.gpudata, npixc, 0.,
                            vis_gpus[cc].gpudata, nant)
                events[cc]['vis'].record(stream)
            if c < chunk:
                stream = streams[c]
                events[c]['start'].record(stream)
                crd_eq_gpu.set_async(crd_eq[:, c * npixc:(c + 1) * npixc],
                                     stream=stream)
                Isqrt_gpu.set_async(Isqrt[c * npixc:(c + 1) * npixc],
                                    stream=stream)
                events[c]['upload'].record(stream)
        events[chunk - 1]['end'].synchronize()
        vis[t] = sum(vis_cpus)
        if verbose:
            for c in range(chunk):
                print('%d:%d START->END:' % (c, chunk),
                      events[c]['start'].time_till(events[c]['end']) * 1e-3)
                #for i, e in enumerate(event_order[:-1]):
                #    print(c, e, '->', event_order[i+1], ':', events[c][e].time_till(events[c][event_order[i+1]]) * 1e-3)
            print('TOTAL:', events[0]['start'].time_till(
                events[chunk - 1]['end']) * 1e-3)
    # teardown GPU configuration
    cublasDestroy(h)
    return vis
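The chunk loop above is a three-stage software pipeline: while chunk c is being uploaded, chunk c-1 is being computed and chunk c-2 is being downloaded, each on its own stream. Below is a stripped-down sketch of just the control flow, with upload/compute/download as hypothetical stand-ins for the transfers and cuBLAS/kernel calls above:

chunk = 4
streams = [None] * chunk                         # stand-ins for the per-chunk pycuda Streams

def upload(c, s):    print('upload   chunk', c)  # stage 1: async host-to-device copies
def compute(c, s):   print('compute  chunk', c)  # stage 2: cuBLAS gemms + interpolation kernels
def download(c, s):  print('download chunk', c)  # stage 3: async device-to-host copies

for c in range(chunk + 2):
    if 0 <= c - 2 < chunk:
        download(c - 2, streams[c - 2])
    if 0 <= c - 1 < chunk:
        compute(c - 1, streams[c - 1])
    if c < chunk:
        upload(c, streams[c])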
Example No. 20
def cdgmm(A, B, inplace=False):
    """Complex pointwise multiplication.

        Complex pointwise multiplication between (batched) tensor A and tensor
        B.

        Parameters
        ----------
        A : tensor
            A is a complex tensor of size (B, C, M, N, 2).
        B : tensor
            B is a complex tensor of size (M, N, 2) or real tensor of (M, N,
            1).
        inplace : boolean, optional
            If set to True, all the operations are performed in place.

        Raises
        ------
        RuntimeError
            In the event that the filter B is not a 3-tensor with a last
            dimension of size 1 or 2, or A and B are not compatible for
            multiplication, or if A or B are not contiguous.
        TypeError
            In the event that A is not complex, or B does not have a final
            dimension of 1 or 2, or A and B are not of the same dtype, or
            if A or B are not cuda tensors, or if A and B are not on the same
            device.

        Returns
        -------
        C : tensor
            Output tensor of size (B, C, M, N, 2) such that:
            C[b, c, m, n, :] = A[b, c, m, n, :] * B[m, n, :].

    """
    if not _is_complex(A):
        raise TypeError(
            'The input should be complex (i.e. last dimension is 2).')

    if not _is_complex(B) and not _is_real(B):
        raise TypeError('The filter should be complex or real, indicated by a '
                        'last dimension of size 2 or 1, respectively.')

    if A.shape[-len(B.shape):-1] != B.shape[:-1]:
        raise RuntimeError(
            'The filters are not compatible for multiplication.')

    if A.dtype is not B.dtype:
        raise TypeError('Input and filter must be of the same dtype.')

    if not A.is_cuda or not B.is_cuda:
        raise TypeError('Input and filter must be CUDA tensors.')

    if A.device.index != B.device.index:
        raise TypeError('Input and filter must be on the same GPU.')

    if _is_real(B):
        if inplace:
            return A.mul_(B)
        else:
            return A * B
    else:
        if not A.is_contiguous() or not B.is_contiguous():
            raise RuntimeError('Tensors must be contiguous.')

        C = A.new(A.shape) if not inplace else A
        m, n = B.nelement() // 2, A.nelement() // B.nelement()
        lda = m
        ldc = m
        incx = 1
        handle = torch.cuda.current_blas_handle()
        stream = torch.cuda.current_stream()._as_parameter_
        cublas.cublasSetStream(handle, stream)
        cublas.cublasCdgmm(handle, 'l', m, n, A.data_ptr(), lda, B.data_ptr(),
                           incx, C.data_ptr(), ldc)
        return C
Example No. 21
x_pin = [
    cuda.register_host_memory(x[i * N // nStreams:(i + 1) * N // nStreams])
    for i in range(nStreams)
]
y_pin = [
    cuda.register_host_memory(y[i * N // nStreams:(i + 1) * N // nStreams])
    for i in range(nStreams)
]

h = cublas.cublasCreate()

x_gpu = np.empty(nStreams, dtype=object)
y_gpu = np.empty(nStreams, dtype=object)
ans = np.empty(nStreams, dtype=object)

for i in range(nStreams):
    cublas.cublasSetStream(h, streams[i].handle)

    x_gpu[i] = gpuarray.to_gpu_async(x_pin[i], stream=streams[i])
    y_gpu[i] = gpuarray.to_gpu_async(y_pin[i], stream=streams[i])

    cublas.cublasSaxpy(h, x_gpu[i].size, a, x_gpu[i].gpudata, 1,
                       y_gpu[i].gpudata, 1)
    ans[i] = y_gpu[i].get_async(stream=streams[i])

cublas.cublasDestroy(h)

# Uncomment to check for errors in the calculation
#y_gpu = np.array([yg.get() for yg in y_gpu])
#y_gpu = np.array(y_gpu).reshape(y.shape)
#print(np.allclose(y_gpu, a*x + y))