Example #1
def _load_torch_gpu_allocator_cpp_extension(verbosity, is_rocm_pytorch):
    gpu_identifier = "hip" if is_rocm_pytorch else "cuda"
    gpu_allocator_header = "HIPCachingAllocator" if is_rocm_pytorch else "CUDACachingAllocator"
    torch_gpu_allocator_addresses_cpp_source = f'''
        #include <torch/extension.h>
        #include <c10/{gpu_identifier}/{gpu_allocator_header}.h>

        size_t gpu_caching_allocator_raw_alloc_address() {{
            return reinterpret_cast<size_t>(&c10::{gpu_identifier}::{gpu_allocator_header}::raw_alloc);
        }}

        size_t gpu_caching_allocator_raw_delete_address() {{
            return reinterpret_cast<size_t>(&c10::{gpu_identifier}::{gpu_allocator_header}::raw_delete);
        }}
    '''

    return load_inline(
        name='inline_extension',
        cpp_sources=[torch_gpu_allocator_addresses_cpp_source],
        extra_cflags=['-D__HIP_PLATFORM_HCC__=1' if is_rocm_pytorch else ''],
        functions=[
            'gpu_caching_allocator_raw_alloc_address',
            'gpu_caching_allocator_raw_delete_address'
        ],
        verbose=verbosity,
        with_cuda=True)
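
# Hedged usage sketch (not part of the original snippet): build the extension on a
# CUDA build of PyTorch and read back the raw allocator function addresses.
_ext = _load_torch_gpu_allocator_cpp_extension(verbosity=True, is_rocm_pytorch=False)
_alloc_addr = _ext.gpu_caching_allocator_raw_alloc_address()
_delete_addr = _ext.gpu_caching_allocator_raw_delete_address()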
Example #2
 def re_build_func_torch():
     mod = cpp_extension.load_inline(
         name=self.module_name,
         cpp_sources=self.source,
         functions=self.functions,
         verbose=True,
         build_directory=self.path,
         is_python_module=True,
     )
     return mod
Example #3
def _load_torch_allocator_cpp_extension(verbosity):
    torch_cuda_allocator_addresses_cpp_source = """
    #include <torch/extension.h>
    #include <c10/cuda/CUDACachingAllocator.h>
    size_t cuda_caching_allocator_raw_alloc_address() {
        return reinterpret_cast<size_t>(&c10::cuda::CUDACachingAllocator::raw_alloc);
    }
    size_t cuda_caching_allocator_raw_delete_address() {
        return reinterpret_cast<size_t>(&c10::cuda::CUDACachingAllocator::raw_delete);
    }
    """

    return load_inline(name='inline_extension', cpp_sources=[torch_cuda_allocator_addresses_cpp_source],
                       functions=['cuda_caching_allocator_raw_alloc_address',
                                  'cuda_caching_allocator_raw_delete_address'],
                       verbose=verbosity < Verbosity.WARNING, with_cuda=True)
Example #4
        scratch[current][0] = (int)h;
        scratch[current][1] = (int)mm;
        scratch[current][2] = (int)nn;
        current++;
      }
      m = firstm;
      n = firstn;
    }
  }
  ret.push_back({max_width, scratch.slice(0, 0, current).clone()});
}

ret_t load_balance(at::Tensor layout) {
  ret_t ret;
  at::Tensor scratch = at::empty({layout.sum().item<int>(), 3}, layout.dtype());
  for(int max_width = 4; max_width > 0; max_width /= 2)
    segment_blocks(layout, scratch, max_width, ret);
  return ret;
}
'''

block = 16
L = 256
stride = 128
layout = torch_blocksparse.MultiheadAttention._make_layout(1, L // block, 'fixed', stride // block, True, 1, 1)
module = load_inline(name='load_balance',
                     cpp_sources=[source],
                     functions=['load_balance'])
balanced = module.load_balance(layout)
print(layout)
print(balanced)
Example #5
torch::Tensor batched_dot_mul_sum_v1(
    const torch::Tensor& a,
    const torch::Tensor& b) {
  return a.mul(b).sum(-1);
}
"""

# PyTorch makes it easy to test our C++ implementations by providing a utility
# to JIT compile C++ source into Python extensions:
import os
from torch.utils import cpp_extension
cpp_lib = cpp_extension.load_inline(
    name='cpp_lib',
    cpp_sources=batched_dot_src,
    extra_cflags=['-O3'],
    extra_include_paths=[
        # `load_inline` needs to know where to find Pybind11 headers.
        os.path.join(os.getenv('CONDA_PREFIX'), 'include')
    ],
    functions=['batched_dot_mul_sum_v0', 'batched_dot_mul_sum_v1'])

# `load_inline` will create a shared object that is loaded into Python. When we collect
# instruction counts Timer will create a subprocess, so we need to re-import it. The
# import process is slightly more complicated for C extensions, but that's all we're
# doing here.
module_import_str = f"""\
# https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path
import importlib.util
spec = importlib.util.spec_from_file_location("cpp_lib", {repr(cpp_lib.__file__)})
cpp_lib = importlib.util.module_from_spec(spec)
spec.loader.exec_module(cpp_lib)"""
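
# A hedged sketch (assumed, not shown in the original excerpt): the import string
# above can be passed as the `setup` of a torch.utils.benchmark.Timer so that the
# subprocess used for instruction counting can re-import the compiled extension.
from torch.utils.benchmark import Timer

timer = Timer(
    stmt="cpp_lib.batched_dot_mul_sum_v1(x, x)",
    setup=module_import_str + "\nimport torch\nx = torch.ones((8, 64))",
)
print(timer.blocked_autorange())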
Example #6
import torch
from torch.utils.cpp_extension import load_inline
from .pair_wise_distance_cuda_source import source
import pair_wise_distance

print("compile cuda source of 'pair_wise_distance' function...")
print(
    "NOTE: if you avoid this process, you make .cu file and compile it following https://pytorch.org/tutorials/advanced/cpp_extension.html"
)
pair_wise_distance_cuda = load_inline("pair_wise_distance",
                                      cpp_sources="",
                                      cuda_sources=source)
print("done")


class PairwiseDistFunction(torch.autograd.Function):
    @staticmethod
    def forward(self, pixel_features, spixel_features, init_spixel_indices,
                num_spixels_width, num_spixels_height):
        self.num_spixels_width = num_spixels_width
        self.num_spixels_height = num_spixels_height
        output = pixel_features.new(pixel_features.shape[0], 9,
                                    pixel_features.shape[-1]).zero_()
        self.save_for_backward(pixel_features, spixel_features,
                               init_spixel_indices)

        return pair_wise_distance_cuda.forward(
            pixel_features.contiguous(), spixel_features.contiguous(),
            init_spixel_indices.contiguous(), output, self.num_spixels_width,
            self.num_spixels_height)
Example #7
def _load_aten_op_executor_cpp_extension(verbosity, is_rocm_pytorch):
    aten_op_executor_cpp_source = """
#include <torch/torch.h>
#include <ATen/DLConvertor.h>
#include <unordered_map>
#include <tuple>
#include <vector>

class ATenOperatorCache {
 public:
  static ATenOperatorCache& Instance() {
    static ATenOperatorCache instance;
    return instance;
  }

  std::shared_ptr<torch::jit::Operator> GetOperator(const std::string& op_name) {
    if (ops_.find(op_name) == ops_.end()) {
      auto& ops = torch::jit::getAllOperatorsFor(torch::jit::Symbol::fromQualString(op_name));
      TORCH_INTERNAL_ASSERT(ops.size() == 1);
      ops_[op_name] = ops.front();
    }

    return ops_.at(op_name);
  }

 private:
  ATenOperatorCache() = default;
  std::unordered_map<std::string, std::shared_ptr<torch::jit::Operator>> ops_;
};

// Some arguments of backward operator are not from forward operator's input or output,
// but need some processing. Since we cannot build such processing to ONNX graph for now,
// we are putting such processing code here if needed.
// Take embedding_backward as example:
//   weight: embedding_backward(grad, indices, weight.size(0), padding_idx, scale_grad_by_freq, sparse)
// the 3rd argument (index 2) is weight.size(0), we add this processing here.
using TensorTransformFunc = std::function<c10::IValue(const at::Tensor&)>;
static const TensorTransformFunc embedding_num_weights = [](const at::Tensor& tensor) {
  return c10::IValue(tensor.size(0));
};

static const std::unordered_map<std::string, std::unordered_map<size_t, TensorTransformFunc>> TENSOR_TRANSFORM_FUNCS = {
    {"aten::embedding_backward", {{2, embedding_num_weights}}},
};

template <typename T>
void SetIValueArguments(const std::vector<std::tuple<size_t, T>>& raw_arguments,
                        std::vector<c10::IValue>& ivalue_arguments) {
  for (size_t i = 0; i < raw_arguments.size(); i++) {
    size_t index = std::get<0>(raw_arguments[i]);
    TORCH_INTERNAL_ASSERT(index < ivalue_arguments.size());
    ivalue_arguments[index] = c10::IValue(std::get<1>(raw_arguments[i]));
  }
}

// TODO: Add more argument types, such as list type.
std::vector<DLManagedTensor*> ExecuteATenOperator(
    const char* op_name, const std::vector<std::tuple<size_t, DLManagedTensor*>>& tensor_arguments,
    const std::vector<std::tuple<size_t, int64_t>>& int_arguments,
    const std::vector<std::tuple<size_t, float>>& float_arguments,
    const std::vector<std::tuple<size_t, bool>>& bool_arguments) {
  std::string op_name_str(op_name);
  std::shared_ptr<torch::jit::Operator> op = ATenOperatorCache::Instance().GetOperator(op_name_str);

  // TODO: need to handle optional argument and arguments with default values.
  std::vector<c10::IValue> arguments;
  arguments.resize(op->schema().arguments().size());
  for (size_t i = 0; i < tensor_arguments.size(); i++) {
    size_t index = std::get<0>(tensor_arguments[i]);
    at::Tensor tensor = at::fromDLPack(std::get<1>(tensor_arguments[i]));
    bool has_transform_func = false;
    if (TENSOR_TRANSFORM_FUNCS.find(op_name_str) != TENSOR_TRANSFORM_FUNCS.end()) {
      const auto& transform_funcs = TENSOR_TRANSFORM_FUNCS.at(op_name_str);
      if (transform_funcs.find(index) != transform_funcs.end()) {
        arguments[index] = transform_funcs.at(index)(tensor);
        has_transform_func = true;
      }
    }

    if (!has_transform_func) {
      arguments[index] = c10::IValue(tensor);
    }
  }

  SetIValueArguments<int64_t>(int_arguments, arguments);
  SetIValueArguments<float>(float_arguments, arguments);
  SetIValueArguments<bool>(bool_arguments, arguments);

  torch::jit::Stack stack;
  for (size_t i = 0; i < arguments.size(); i++) {
    torch::jit::push(stack, arguments[i]);
  }

  op->getOperation()(&stack);
  // TODO: need to handle multiple-tensor outputs.
  at::Tensor output;
  torch::jit::pop(stack, output);
  std::vector<DLManagedTensor*> result;
  result.emplace_back(at::toDLPack(output));
  return result;
}

size_t execute_aten_operator_address() { return reinterpret_cast<size_t>(&ExecuteATenOperator); }
    """

    aten_op_executor_cpp_extension = load_inline(
        name='inline_extension_aten_op_executor',
        cpp_sources=[aten_op_executor_cpp_source],
        extra_cflags=['-D__HIP_PLATFORM_HCC__=1' if is_rocm_pytorch else ''],
        functions=['execute_aten_operator_address'],
        verbose=verbosity,
        with_cuda=True)

    C.register_aten_op_executor(
        str(aten_op_executor_cpp_extension.execute_aten_operator_address()))
Example #8
# NOTE: inspect the current parallel backend with
# print(th.__config__.parallel_info())
# ref: https://github.com/suphoff/pytorch_parallel_extension_cpp/blob/master/setup.py

flags = ['-DAT_PARALLEL_OPENMP', '-fopenmp']
# flags = ['-DAT_PARALLEL_NATIVE_TBB']
# flags = ['-DAT_PARALLEL_NATIVE']

__srcpath = os.path.join(os.path.dirname(__file__), '_srckernel.cc')
with open(__srcpath, 'rt') as f:
    srckernel = f.read()
SRC = load_inline('SRC', srckernel, functions=[
    'ShortRangeRankingCorrelation',
    'BatchShortRangeRankingCorrelation',
    ],
    extra_cflags=flags + ['-O2'],
    extra_ldflags=flags,
    verbose=True)

def BatchNearsightRankCorr(X, y, r):
    X = X.cpu()
    y = y.cpu()
    r = r.cpu()
    scores = SRC.BatchShortRangeRankingCorrelation(X, y, r)
    return scores.cpu().numpy().astype(np.float64)  # np.float alias was removed in NumPy 1.24

def NearsightRankCorr(x, y, r):
    x = x.cpu()
    y = y.cpu()
    r = r.cpu()
Example #9
    import cupy
except BaseException as e:
    logging.info(f'spikingjelly.activation_based.spike_op: {e}')
    cupy = None

try:
    logging.warning(
        'spikingjelly.activation_based.spike_op: try to use `torch.utils.cpp_extension.load_inline` to load cudnn functions.'
    )
    logging.warning(
        f'If it hangs, please try deleting the torch_extensions cache directory. (In most cases, the directory is {torch.utils.cpp_extension._get_build_directory("", False)}.)'
    )
    cpp_wrapper = load_inline(name='cpp_wrapper',
                              cpp_sources='using namespace at;',
                              functions=[
                                  'cudnn_convolution_backward',
                                  'cudnn_convolution_backward_input',
                                  'cudnn_convolution_backward_weight'
                              ],
                              with_cuda=True)
except BaseException as e:
    logging.info(f'spikingjelly.activation_based.spike_op: {e}')
    cpp_wrapper = None
'''
aten/src/ATen/native/cudnn/ConvPlaceholders.cpp

at::Tensor cudnn_convolution(
    const at::Tensor& input, const at::Tensor& weight,
    IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation,
    int64_t groups, bool benchmark, bool deterministic, bool allow_tf32)

There are two overloaded C++ functions named `cudnn_convolution`, so we need an explicit function-pointer cast to select the desired overload when binding it.
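
A minimal illustration of such a cast (an assumption added here for clarity, not
code from the repository; the exact overload set depends on the PyTorch version):

  using conv_fn = at::Tensor (*)(
      const at::Tensor&, const at::Tensor&,
      IntArrayRef, IntArrayRef, IntArrayRef,
      int64_t, bool, bool, bool);
  auto fn = static_cast<conv_fn>(&cudnn_convolution);  // selects the overload above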
Example #10
#!/usr/bin/env python

# Verify that PyTorch can JIT compile C++ extensions
# This requires at least Ninja and a working C++ compiler, preferably GCC
#
# Heavily based on the PyTorch tutorial for C++ extensions
# Author: Alexander Grund (TU Dresden)

from torch.utils.cpp_extension import load_inline

cpp_source = "torch::Tensor test_func(torch::Tensor x) { return x; }"

module = load_inline(name='inline_extension',
                     cpp_sources=cpp_source,
                     functions=['test_func'])
assert module
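
# Optional smoke test (not in the original): test_func is an identity function,
# so the input tensor should come back unchanged.
import torch
x = torch.arange(4.0)
assert torch.equal(module.test_func(x), x)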
Example #11
#torch::Tensor myfunc(torch::Tensor z) {
#	auto s = torch::ones_like(z);
#	return s + z;
#}
#'''
#
#mod = load_inline('mymod', source, functions=['myfunc'], verbose=True)
#x = th.rand(10)
#print(x)
#print(mod.myfunc(x))



with open("_srckernel.cc", 'rt') as f:
    srckernel = f.read()
SRC = load_inline('SRC', srckernel, functions=['ShortRangeRankingCorrelation'], verbose=True)

def BatchShortRangeRankingCorrelation(X, y, r):
    scores = np.zeros(X.shape[0])
    for (i, srt) in enumerate(X):
        scores[i] = SRC.ShortRangeRankingCorrelation(srt, y, r)
    return scores


import time
import rich
c = rich.get_console()

for i in range(100):
    for cansee in (5, 50, 1000):
        for k in (5, 10, 25):
Example #12
# load_inline will automatically search /usr/include, but not conda include.
extra_include_paths: List[str] = []
conda_prefix = os.getenv("CONDA_PREFIX")
if conda_prefix is not None:
    extra_include_paths = [os.path.join(conda_prefix, "include")]

bindings = load_inline(
    name="callgrind_bindings",
    cpp_sources=textwrap.dedent("""
    #include <valgrind/callgrind.h>

    bool _valgrind_supported_platform() {
        #if defined(NVALGRIND)
        return false;
        #else
        return true;
        #endif
    }

    void _valgrind_toggle() {
        #if defined(NVALGRIND)
        TORCH_CHECK(false, "Valgrind is not supported.");
        #else
        CALLGRIND_TOGGLE_COLLECT;
        #endif
    }
    """),
    extra_include_paths=extra_include_paths,
    functions=["_valgrind_supported_platform", "_valgrind_toggle"],
)
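
# Hedged usage sketch (not in the original excerpt): guard the Callgrind toggle with
# the platform check, since _valgrind_toggle raises a TORCH_CHECK error when the
# extension was built with NVALGRIND defined.
if bindings._valgrind_supported_platform():
    bindings._valgrind_toggle()
    # ... run the region of interest under callgrind here ...
    bindings._valgrind_toggle()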