예제 #1
0
    def ctor(self):
        code = pccm.FunctionCode()
        code.arg("is_cpu", "bool")
        code.arg("key_itemsize, value_itemsize", "int")
        code.arg("keys_data", "tv::Tensor")
        code.arg("values_data", "tv::Tensor")
        code.arg("stream", "std::uintptr_t", "0")

        code.ctor_init("is_cpu", "is_cpu")
        code.ctor_init("keys_data", "keys_data")
        code.ctor_init("values_data", "values_data")
        code.ctor_init("key_itemsize_", "key_itemsize")
        code.ctor_init("value_itemsize_", "value_itemsize")

        code.ctor_init("insert_count_", "0")

        code.raw(f"""
        TV_ASSERT_RT_ERR(key_itemsize == 4 || key_itemsize == 8, "key_itemsize must be 4 or 8");
        TV_ASSERT_RT_ERR(value_itemsize == 4 || value_itemsize == 8, "value_itemsize must be 4 or 8");

        if (!is_cpu){{
            TV_ASSERT_RT_ERR(!keys_data.empty() && !values_data.empty(), "key and value must not empty");
            TV_ASSERT_RT_ERR(keys_data.dim(0) == values_data.dim(0), "key and value must have same size");
            TV_ASSERT_RT_ERR(key_itemsize == keys_data.itemsize(), "key_itemsize must equal to key_data");
            TV_ASSERT_RT_ERR(value_itemsize == values_data.itemsize(), "value_itemsize must equal to values_data");
            // clear cuda table here.
            clear(stream);
        }}
        """)
        if CUMM_CPU_ONLY_BUILD:
            code.raw(
                f"TV_ASSERT_RT_ERR(is_cpu, \"spconv not built with CUDA\");")
        return code
예제 #2
0
파일: pointops.py 프로젝트: xmyqsh/spconv
    def voxel_empty_fill_mean(self):
        code = pccm.FunctionCode()
        code.arg("voxels", f"{self.dtype} *")

        code.arg("num_per_voxel", f"int *")
        code.arg("num_voxels", f"int")
        code.arg("num_points_per_voxel", f"int")
        code.arg("num_voxel_features", f"int")

        code.raw(f"""
        int voxel_stride = num_points_per_voxel * num_voxel_features;
        for (int i : tv::KernelLoopX<int>(num_voxels)){{
            int count = min(num_points_per_voxel, num_per_voxel[i]);
            num_per_voxel[i] = count;
            for (int j = 0; j < num_voxel_features; ++j){{
                auto voxel_ptr = voxels + i * voxel_stride + j;
                {self.dtype} sum_val = 0;
                for (int k = 0; k < count; ++k){{
                    sum_val += voxel_ptr[0];
                    voxel_ptr += num_voxel_features;
                }}
                sum_val = count == 0 ? 0 : sum_val / count;
                for (int k = count; k < num_points_per_voxel; ++k){{
                    voxel_ptr[0] = sum_val;
                    voxel_ptr += num_voxel_features;
                }}
            }}
        }}
        """)
        return code
예제 #3
0
파일: all.py 프로젝트: xmyqsh/spconv
    def generate_subm_conv_inds_cpu(self):
        code = pccm.FunctionCode()
        code.arg("indices", "tv::Tensor")
        code.arg("indice_pairs, out_inds, indice_num_per_loc", "tv::Tensor")
        code.arg("batch_size", "int")
        code.arg("input_dims", f"std::vector<int>")
        code.arg("ksize, dilation", f"std::vector<int>")

        code.raw(f"""
        int ndim = indices.dim(1) - 1;
        TV_ASSERT_RT_ERR(input_dims.size() == ndim &&
            ksize.size() == ndim && dilation.size() == ndim, "your params size not equal to ndim", ndim);
        """)
        for ndim in self.ndims:
            code.raw(f"""
            if (ndim == {ndim}){{
                tv::array<int, {ndim}> input_dims_;
                tv::array<int, {ndim}> ksize_, dilation_;
                for (int i = 0; i < {ndim}; ++i){{
                    input_dims_[i] = input_dims[i];
                    ksize_[i] = ksize[i];
                    dilation_[i] = dilation[i];
                }}
                return SpconvIndicesCPU{ndim}D::generate_subm_conv_inds(indices,
                    indice_pairs, out_inds, indice_num_per_loc,
                    batch_size, input_dims_, 
                    ksize_, dilation_);
            }}
            """)
        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
        return code.ret("int")
예제 #4
0
파일: gather.py 프로젝트: xmyqsh/spconv
 def scatter_add(self):
     code = pccm.FunctionCode()
     code.arg("out", "tv::Tensor")
     code.arg("in", "tv::Tensor")
     code.arg("inds", "tv::Tensor")
     code.raw(f"""
     // tv::check_shape(inds, {{in.dim(0)}});
     auto nhot = inds.dim(0);
     int channel = in.dim(1);
     tv::dispatch<float, double>(out.dtype(), [&](auto I){{
         using T = TV_DECLTYPE(I);
         auto indices_data = inds.data_ptr<const int>();
         const T *buffer_data = in.data_ptr<const T>();
         T *features_data = out.data_ptr<T>();
         const T *buf = in.data_ptr<const T>();
         T *out_ptr = out.data_ptr<T>();
         tv::kernel_1d(out.device(), nhot, [&](int begin, int end, int step){{
             for (int i = begin; i < end; i += step) {{
                 buf = buffer_data + i * channel;
                 out_ptr = features_data + indices_data[i] * channel;
                 for (int j = 0; j < channel; ++j) {{
                     out_ptr[j] = out_ptr[j] + buf[j];
                 }}
             }}
         }});
     }});
     """)
     return code
예제 #5
0
    def assign_arange_(self):
        """ this function assign "arange(NumItem)" to table values.
        useful in "unique-like" operations.
        unlike insert/query, this method only support i32/i64/u32/u64 for value.
        count must be u32/u64.
        """
        code = pccm.FunctionCode()
        if not CUMM_CPU_ONLY_BUILD:
            code.add_dependency(TensorViewHashKernel)
        code.arg("count", "tv::Tensor")

        code.arg("stream", "std::uintptr_t", "0")
        with code.if_("is_cpu"):
            map_name = "cpu_map"
            for k_type, v_type in self.cpu_map_storage_select(
                    "key_itemsize_", "value_itemsize_", map_name, code):
                code.raw(f"""
                {v_type} index = 0;
                for (auto it = {map_name}.begin(); it != {map_name}.end(); ++it){{
                    it.value() = index;
                    ++index;
                }}

                """)
        if not CUMM_CPU_ONLY_BUILD:
            with code.else_():
                code.raw(f"""
                TV_ASSERT_RT_ERR(count.device() == 0, "count must be cuda");
                auto custream = reinterpret_cast<cudaStream_t>(stream);
                """)
                for k_items in _dispatch_ints(code, [4, 8],
                                              "keys_data.itemsize()"):
                    code.raw(f"""
                    using K = tv::hash::itemsize_to_unsigned_t<{k_items}>;
                    constexpr K kEmptyKey = std::numeric_limits<K>::max();
                    auto count_ptr = count.data_ptr<K>();

                    K* key_data_ptr = reinterpret_cast<K*>(keys_data.raw_data());
                    """)
                    val_dtypes = [
                        dtypes.int32, dtypes.int64, dtypes.uint32,
                        dtypes.uint64
                    ]
                    for v_dtype in _dispatch(code, val_dtypes,
                                             "values_data.dtype()"):
                        code.raw(f"""
                        using V = {v_dtype};
                        V* value_data_ptr = reinterpret_cast<V*>(values_data.raw_data());
                        using table_t =
                            tv::hash::LinearHashTableSplit<K, V, tv::hash::Murmur3Hash<K>,
                                                        kEmptyKey, false>;
                        table_t table(key_data_ptr, value_data_ptr, keys_data.dim(0));
                        tv::cuda::Launch launcher(table.size(), custream);
                        launcher(tv::hash::assign_arange_split<table_t, K>, table, count_ptr);
                        """)
        else:
            code.raw(f"""
            TV_THROW_RT_ERR("spconv not compiled with cuda, don't support cuda");
            """)
        return code
예제 #6
0
파일: gather.py 프로젝트: xmyqsh/spconv
    def gather(self):
        code = pccm.FunctionCode()
        code.arg("out", "tv::Tensor")
        code.arg("in", "tv::Tensor")
        code.arg("inds", "tv::Tensor")

        code.raw(f"""
        // tv::check_shape(inds, {{out.dim(0)}});

        auto nhot = inds.dim(0);
        int channel = in.dim(1);
        tv::dispatch<float, double>(out.dtype(), [&](auto I){{
            auto indices_data = inds.data_ptr<const int>();
            using T = TV_DECLTYPE(I);
            T *buffer_data = out.data_ptr<T>();
            const T *features_data = in.data_ptr<const T>();
            tv::kernel_1d(out.device(), nhot, [&](int begin, int end, int step){{
                for (int i = begin; i < end; i += step) {{
                    std::memcpy(buffer_data + i * channel,
                                features_data + indices_data[i] * channel,
                                sizeof(T) * channel);
                }}
            }});
        }});
        """)
        return code
예제 #7
0
파일: maxpool.py 프로젝트: xmyqsh/spconv
    def forward_kernel(self):
        code = pccm.FunctionCode()
        code.targ("T")

        code.arg("out_features", f"T*")
        code.arg("in_features", f"const T*")
        code.arg("out_indices", "const int*")
        code.arg("in_indices", "const int*")
        code.arg("size", "int")
        code.arg("num_features", "int")

        code.raw(f"""
        for (int i : tv::KernelLoopY<int>(size)) {{
            int in_idx = in_indices[i];
            int out_idx = out_indices[i];
            auto in_ptr = in_features + in_idx * num_features;
            auto out_ptr = out_features + out_idx * num_features;
            for (int j : tv::KernelLoopX<int>(num_features)) {{
                auto in = in_ptr[j];
                auto out = out_ptr[j];
                if (in > out){{
                    out_ptr[j] = in;
                }}
            }}
        }}
        """)
        return code
예제 #8
0
파일: maxpool.py 프로젝트: xmyqsh/spconv
    def backward_kernel(self):
        code = pccm.FunctionCode()
        code.targ("T")
        code.arg("out_features", f"const T*")
        code.arg("in_features", f"const T*")
        code.arg("dout_features", f"const T*")
        code.arg("din_features", f"T*")
        code.arg("out_indices", "const int*")
        code.arg("in_indices", "const int*")
        code.arg("size", "int")
        code.arg("num_features", "int")

        code.raw(f"""
        for (int i : tv::KernelLoopY<int>(size)) {{
            int in_idx_offset = in_indices[i] * num_features;
            int out_idx_offset = out_indices[i] * num_features;
            auto in_ptr = in_features + in_idx_offset;
            auto out_ptr = out_features + out_idx_offset;
            auto din_ptr = din_features + in_idx_offset;
            auto dout_ptr = dout_features + out_idx_offset;
            for (int j : tv::KernelLoopX<int>(num_features)) {{
                auto in = in_ptr[j];
                auto out = out_ptr[j];
                if (in == out){{
                    din_ptr[j] = din_ptr[j] + dout_ptr[j];
                }}
            }}
        }}
        """)
        return code
예제 #9
0
파일: pointops.py 프로젝트: xmyqsh/spconv
 def calc_meta_data(self):
     code = pccm.FunctionCode()
     code.arg("vsize_xyz", f"std::array<float, {self.ndim}>")
     code.arg("coors_range_xyz", f"std::array<float, {self.ndim * 2}>")
     code.raw(f"""
     return Point2VoxelCommon::calc_meta_data(vsize_xyz, coors_range_xyz);
     """)
     return code.ret(self.p2v_c.calc_meta_ret)
예제 #10
0
파일: all.py 프로젝트: xmyqsh/spconv
    def sort_1d_by_key_split_allocator(self):
        code = pccm.FunctionCode()
        if CUMM_CPU_ONLY_BUILD:
            return code.make_invalid()
        code.arg("data", "tv::Tensor")
        code.arg("alloc_func", "std::function<std::uintptr_t(std::size_t)>")

        code.arg("mask", "tv::Tensor")

        code.arg("indices",
                 "tv::Tensor",
                 "tv::Tensor()",
                 pyanno="cumm.tensorview.Tensor = Tensor()")
        code.arg("stream", "std::uintptr_t", "0", pyanno="int")
        code.arg("mask_output", "bool", "false")

        code.code_after_include = f"""
        template <typename T> struct MaskedElementComp {{
            T mask_;
            TV_HOST_DEVICE_INLINE T operator()(const T &x, const T &y) const {{
                return (x & mask_) < (y & mask_);
            }}
        }};
        template <typename T> __global__ void mask_input(T* inp, T mask, int size){{
            for (int i : tv::KernelLoopX<int>(size)){{
                inp[i] &= mask;
            }}
        }}
        """
        code.add_dependency(CustomThrustLib, TensorViewKernel)
        code.add_param_class("cudakers", CudaCommonKernel())
        code.raw(f"""
        ThrustCustomAllocatorV2 allocator{{alloc_func}};

        cudaStream_t stream_cu = reinterpret_cast<cudaStream_t>(stream);
        // auto timer = tv::CudaContextTimer<>();
        if (indices.empty()){{
            indices = tv::empty({{data.dim(0)}}, tv::int32, 0);
        }}
        tv::cuda::Launch launcher(data.dim(0), stream_cu);
        launcher(cudakers::arange_kernel<int32_t>, indices.data_ptr<int32_t>(), indices.dim(0));
        tv::dispatch<int32_t, uint32_t, int64_t, uint64_t>(data.dtype(), [&](auto I){{
            using T = TV_DECLTYPE(I);
            auto masks_ptr = mask.data_ptr<T>();
            MaskedElementComp<T> op_comp{{masks_ptr[0]}};
            thrust::device_ptr<T> ptr_tr(data.data_ptr<T>());
            thrust::device_ptr<int32_t> ptr_k(indices.data_ptr<int32_t>());
            // auto thrust_ctx = thrust::cuda::par.on(stream_cu);
            auto ctx2 = thrust::cuda::par(allocator).on(stream_cu);
            thrust::sort_by_key(ctx2, ptr_tr, ptr_tr + data.dim(0), ptr_k, op_comp);
            if (mask_output){{
                launcher(mask_input<T>, data.data_ptr<T>(), masks_ptr[0], data.dim(0));
            }}
        }});
        // tv::ssprint("SORT_BY_KEY_MASKED", timer.report() / 1000.0);
        return indices;
        """)
        return code.ret("tv::Tensor")
예제 #11
0
파일: all.py 프로젝트: xmyqsh/spconv
    def count_bits(self):
        code = pccm.FunctionCode()
        if CUMM_CPU_ONLY_BUILD:
            return code.make_invalid()

        code.add_dependency(TensorViewKernel)
        code.arg("a", "tv::Tensor")
        code.code_after_include = f"""
        __global__ void count_bits_kernel_64(const uint64_t* data, int32_t* out, int size){{
            for (int i : tv::KernelLoopX<int>(size)){{
                out[i] = __popcll(reinterpret_cast<const unsigned long long*>(data)[i]);
            }}
        }}
        __global__ void count_bits_kernel(const uint32_t* data, int32_t* out, int size){{
            for (int i : tv::KernelLoopX<int>(size)){{
                out[i] = __popc(data[i]);
            }}
        }}

        int numberOfSetBits(uint32_t i)
        {{
            // https://stackoverflow.com/questions/109023/how-to-count-the-number-of-set-bits-in-a-32-bit-integer
            // Java: use int, and use >>> instead of >>. Or use Integer.bitCount()
            // C or C++: use uint32_t
            i = i - ((i >> 1) & 0x55555555);        // add pairs of bits
            i = (i & 0x33333333) + ((i >> 2) & 0x33333333);  // quads
            i = (i + (i >> 4)) & 0x0F0F0F0F;        // groups of 8
            return (i * 0x01010101) >> 24;          // horizontal sum of bytes
        }}

        int numberOfSetBits(uint64_t i)
        {{
            return numberOfSetBits(uint32_t(i)) + numberOfSetBits(uint32_t(i >> 32));
        }}
        """
        code.raw(f"""
        tv::Tensor res(a.shape(), tv::int32, a.device());
        tv::dispatch<uint32_t, uint64_t>(a.dtype(), [&](auto I){{
            auto res_ptr = res.data_ptr<int>();
            using T = TV_DECLTYPE(I);
            auto a_ptr = a.data_ptr<const T>();
            if (a.device() == -1){{
                for (int i = 0; i < a.size(); ++i){{
                    res_ptr[i] = numberOfSetBits(a_ptr[i]);
                }}
            }}else{{
                tv::cuda::Launch launcher(a.size());
                tv::if_constexpr<std::is_same<T, uint64_t>::value>([=](auto _)mutable{{
                    launcher(_(count_bits_kernel_64), a_ptr, res_ptr, int(a.size()));
                }}, [=](auto _)mutable{{
                    launcher(_(count_bits_kernel), a_ptr, res_ptr, int(a.size()));
                }});
            }}
        }});
        return res;
        """)
        return code.ret("tv::Tensor")
예제 #12
0
    def non_max_suppression_cpu(self):
        code = pccm.FunctionCode()
        code.arg("boxes, order", "tv::Tensor")
        code.arg("thresh", "float")
        code.arg("eps", "float", "0")

        code.raw(f"""
        auto ndets = boxes.dim(0);
        std::vector<int> keep(ndets);

        tv::dispatch<float, double>(boxes.dtype(), [&](auto I1){{
            using DType = TV_DECLTYPE(I1);
            auto boxes_r = boxes.tview<const DType, 2>();
            tv::dispatch<int, int64_t, uint32_t, uint64_t>(order.dtype(), [&](auto I2){{
                using T2 = TV_DECLTYPE(I2);
                auto order_r = order.tview<const T2, 1>();
                std::vector<DType> areas;
                for (int i = 0; i < ndets; ++i){{
                    areas[i] = (boxes_r(i, 2) - boxes_r(i, 0) + eps) * 
                               (boxes_r(i, 3) - boxes_r(i, 1) + eps);
                }}
                std::vector<int> suppressed(ndets, 0);

                int i, j;
                DType xx1, xx2, w, h, inter, ovr;
                for (int _i = 0; _i < ndets; ++_i) {{
                    i = order_r(_i);
                    if (suppressed[i] == 1)
                        continue;
                    keep.push_back(i);
                    for (int _j = _i + 1; _j < ndets; ++_j) {{
                        j = order_r(_j);
                        if (suppressed[j] == 1)
                            continue;
                        xx2 = std::min(boxes_r(i, 2), boxes_r(j, 2));
                        xx1 = std::max(boxes_r(i, 0), boxes_r(j, 0));
                        w = xx2 - xx1 + eps;
                        if (w > 0) {{
                            xx2 = std::min(boxes_r(i, 3), boxes_r(j, 3));
                            xx1 = std::max(boxes_r(i, 1), boxes_r(j, 1));
                            h = xx2 - xx1 + eps;
                            if (h > 0) {{
                            inter = w * h;
                            ovr = inter / (areas[i] + areas[j] - inter);
                            if (ovr >= thresh)
                                suppressed[j] = 1;
                            }}
                        }}
                    }}
                }}
            }});

        }});
        return keep;
        """)
        return code.ret("std::vector<int>")
예제 #13
0
파일: pointops.py 프로젝트: xmyqsh/spconv
 def get_grid_size(self):
     code = pccm.FunctionCode()
     code.raw(f"""
     std::array<int, {self.ndim}> res;
     for (int i = 0; i < {self.ndim}; ++i){{
         res[i] = grid_size[i];
     }}
     return res;
     """)
     return code.ret(f"std::array<int, {self.ndim}>")
예제 #14
0
파일: all.py 프로젝트: xmyqsh/spconv
 def scatter_add_cpu(self):
     code = pccm.FunctionCode()
     code.arg("out", "tv::Tensor")
     code.arg("inp", "tv::Tensor")
     code.arg("inds", "tv::Tensor")
     code.add_dependency(GatherCPU)
     code.raw(f"""
     return GatherCPU::scatter_add(out, inp, inds);
     """)
     return code
예제 #15
0
파일: pointops.py 프로젝트: xmyqsh/spconv
 def limit_num_per_voxel_value(self):
     code = pccm.FunctionCode()
     code.arg("num_per_voxel", f"int *")
     code.arg("num_voxels, num_points_per_voxel", f"int")
     code.raw(f"""
     for (int i : tv::KernelLoopX<int>(num_voxels)){{
         int count = min(num_points_per_voxel, num_per_voxel[i]);
         num_per_voxel[i] = count;
     }}
     """)
     return code
예제 #16
0
파일: all.py 프로젝트: xmyqsh/spconv
 def maxpool_forward_cpu(self):
     code = pccm.FunctionCode()
     code.arg("out", "tv::Tensor")
     code.arg("inp", "tv::Tensor")
     code.arg("out_inds", "tv::Tensor")
     code.arg("in_inds", "tv::Tensor")
     code.add_dependency(IndiceMaxPoolCPU)
     code.raw(f"""
     return IndiceMaxPoolCPU::forward(out, inp, out_inds, in_inds);
     """)
     return code
예제 #17
0
파일: all.py 프로젝트: xmyqsh/spconv
 def maxpool_implicit_gemm_forward(self):
     code = pccm.FunctionCode()
     if CUMM_CPU_ONLY_BUILD:
         return code.make_invalid()
     code.arg("out", "tv::Tensor")
     code.arg("inp", "tv::Tensor")
     code.arg("inds", "tv::Tensor")
     code.arg("stream", "std::uintptr_t", "0", pyanno="int")
     code.add_dependency(IndiceMaxPool)
     code.raw(f"""
     return IndiceMaxPool::forward_implicit_gemm(out, inp, inds, stream);
     """)
     return code
예제 #18
0
파일: pointops.py 프로젝트: xmyqsh/spconv
 def array2tvarray(self):
     code = pccm.FunctionCode()
     code.targ("T")
     code.nontype_targ("N", "size_t")
     code.arg("arr", "std::array<T, N>")
     code.raw(f"""
     tv::array<T, N> tarr;
     for (int i = 0; i < N; ++i){{
         tarr[i] = arr[i];
     }}
     return tarr;
     """)
     return code.ret("tv::array<T, N>")
예제 #19
0
파일: pointops.py 프로젝트: xmyqsh/spconv
    def point_to_voxel_empty_mean(self):
        code = pccm.FunctionCode()
        code.arg("points", "tv::Tensor")
        code.arg("clear_voxels", "bool", "true")
        code.raw(f"""
        tv::Tensor points_voxel_id = tv::empty({{points.dim(0)}}, tv::int64, -1);

        return point_to_voxel_empty_mean_static(points, voxels, indices, num_per_voxel, 
            densehashdata, points_voxel_id, tvarray2array(vsize), 
            tvarray2array(grid_size), tvarray2array(grid_stride), 
            tvarray2array(coors_range), clear_voxels);
        """)
        return code.ret("std::tuple<tv::Tensor, tv::Tensor, tv::Tensor>")
예제 #20
0
파일: all.py 프로젝트: xmyqsh/spconv
 def allocate(self):
     code = pccm.FunctionCode()
     code.arg("num_bytes", "std::ptrdiff_t")
     code.ret("char*")
     code.raw(f"""
     if (alloc_func){{
         char* result = reinterpret_cast<char*>(alloc_func(num_bytes));
         return result;
     }}
     else{{
         TV_THROW_RT_ERR("set alloc function first.");
     }}
     """)
     return code
예제 #21
0
파일: all.py 프로젝트: xmyqsh/spconv
 def generate_conv_inds_stage1_5(self):
     code = pccm.FunctionCode()
     code.arg("indice_pairs_uniq", "tv::Tensor")
     code.arg("ndim", "int")
     code.arg("uniq_size", "int64_t")
     code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int")
     if CUMM_CPU_ONLY_BUILD:
         return code.make_invalid()
     for ndim in self.ndims:
         code.raw(f"""
         if (ndim == {ndim}){{
             return SpconvIndices{ndim}D::generate_conv_inds_stage1_5(indice_pairs_uniq, uniq_size, stream_int);
         }}
         """)
     code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
     return code.ret("int")
예제 #22
0
파일: pointops.py 프로젝트: xmyqsh/spconv
    def calc_meta_data(self):
        code = pccm.FunctionCode()
        code.arg("vsize_xyz", f"std::array<float, {self.ndim}>")
        code.arg("coors_range_xyz", f"std::array<float, {self.ndim * 2}>")
        code.raw(f"""
        std::array<float, {self.ndim}> vsize;
        std::array<int, {self.ndim}> grid_size, grid_stride;

        std::array<float, {self.ndim * 2}> coors_range;

        """)
        if self.zyx:
            code.raw(f"""
            for (int i = 0; i < {self.ndim}; ++i){{
                vsize[{self.ndim - 1} - i] = vsize_xyz[i];
                coors_range[{self.ndim - 1} - i] = coors_range_xyz[i];
                coors_range[{2 * self.ndim - 1} - i] = coors_range_xyz[i + {self.ndim}];
            }}
            """)
        else:
            code.raw(f"""
            for (int i = 0; i < {self.ndim}; ++i){{
                vsize[i] = vsize_xyz[i];
                coors_range[i] = coors_range_xyz[i];
                coors_range[i + {self.ndim}] = coors_range_xyz[i + {self.ndim}];
            }}
            """)
        code.raw(f"""
        int64_t prod = 1;
        for (size_t i = 0; i < {self.ndim}; ++i) {{
            grid_size[i] =
                std::round((coors_range[{self.ndim} + i] - coors_range[i]) / vsize[i]);
        }}
        for (int i = {self.ndim} - 1; i >= 0; --i) {{
            grid_stride[i] = prod;
            prod *= grid_size[i];
        }}
        return std::make_tuple(vsize, grid_size, grid_stride, coors_range);
        """)
        ret_str = f"std::array<int, {self.ndim}>"
        retf_str = f"std::array<float, {self.ndim}>"
        retf2_str = f"std::array<float, {self.ndim * 2}>"

        return code.ret(
            f"std::tuple<{retf_str}, {ret_str}, {ret_str}, {retf2_str}>")
예제 #23
0
파일: all.py 프로젝트: xmyqsh/spconv
    def point2voxel_cpu(self):
        code = pccm.FunctionCode()
        code.arg("points", "tv::Tensor")
        code.arg("voxels, indices, num_per_voxel, densehashdata, pc_voxel_id",
                 "tv::Tensor")
        code.arg("vsize", f"std::vector<float>")
        code.arg("grid_size, grid_stride", f"std::vector<int>")
        code.arg("coors_range", f"std::vector<float>")

        code.arg("empty_mean", "bool", "false")
        code.arg("clear_voxels", "bool", "true")

        code.raw(f"""
        int ndim = vsize.size();
        TV_ASSERT_RT_ERR(vsize.size() == ndim && grid_stride.size() == ndim && 
            coors_range.size() == ndim * 2 && grid_size.size() == ndim, 
            "your params size not equal to ndim", ndim);
        // voxels: []
        """)
        for ndim in self.ndims:
            code.raw(f"""
            if (ndim == {ndim}){{
                std::array<float, {ndim}> vsize_;
                std::array<int, {ndim}> grid_size_, grid_stride_;
                std::array<float, {ndim * 2}> coors_range_;
                for (int i = 0; i < {ndim}; ++i){{
                    vsize_[i] = vsize[i];
                    grid_size_[i] = grid_size[i];
                    grid_stride_[i] = grid_stride[i];
                    coors_range_[i] = coors_range[i];
                    coors_range_[i + {ndim}] = coors_range[i + {ndim}];
                }}
                if (empty_mean){{
                    return Point2Voxel{ndim}DCPU::point_to_voxel_empty_mean_static(points, voxels, indices, 
                        num_per_voxel, densehashdata, pc_voxel_id,
                        vsize_, grid_size_, grid_stride_, coors_range_, clear_voxels);
                }} else{{
                    return Point2Voxel{ndim}DCPU::point_to_voxel_static(points, voxels, indices, 
                        num_per_voxel, densehashdata, pc_voxel_id,
                        vsize_, grid_size_, grid_stride_, coors_range_, clear_voxels);
                }}
            }}
            """)
        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
        return code.ret("std::tuple<tv::Tensor, tv::Tensor, tv::Tensor>")
예제 #24
0
 def size_cpu(self):
     """ this function can only be used to get cpu hash table size.
     """
     code = pccm.FunctionCode()
     code.raw(f"""
     int64_t res = -1;
     TV_ASSERT_RT_ERR(is_cpu, "size_cpu can only be used in cpu hash table");
     """)
     with code.if_("is_cpu"):
         map_name = "cpu_map"
         for _ in self.cpu_map_storage_select("key_itemsize_",
                                              "value_itemsize_", map_name,
                                              code):
             code.raw(f"""
             res = {map_name}.size();
             """)
     code.raw(f"return res;")
     return code.ret("int64_t")
예제 #25
0
파일: pointops.py 프로젝트: xmyqsh/spconv
    def generate_voxel(self):
        code = pccm.FunctionCode()
        code.targ("TTable")
        code.arg("table", "TTable")
        code.arg("points", f"{self.dtype} const*")

        code.arg("points_indice_data", f"const int64_t*")
        code.arg("voxels", f"{self.dtype} *")
        code.arg("num_per_voxel", f"int *")
        code.arg("points_voxel_id", f"int64_t*")

        code.arg("point_stride", f"int")
        code.arg("max_points_per_voxel", f"int")
        code.arg("max_voxels", f"int")

        code.arg("vsize", f"tv::array<float, {self.ndim}>")
        code.arg("coors_range", f"tv::array<float, {self.ndim * 2}>")
        code.arg("grid_bound", f"tv::array<int, {self.ndim}>")
        code.arg("grid_stride", f"tv::array<int, {self.ndim}>")

        code.arg("num_points", f"int")
        # TODO add backward?
        code.raw(f"""
        int voxel_stride0 = point_stride * max_points_per_voxel;
        for (int i : tv::KernelLoopX<int>(num_points)){{
            int64_t prod = points_indice_data[i];
            int voxel_id = -1;
            if (prod != -1){{
                auto voxel_index_pair = table.lookup(prod);
                if (!voxel_index_pair.empty() &&
                    voxel_index_pair.second < max_voxels) {{
                    voxel_id = voxel_index_pair.second;
                    int old = atomicAdd(num_per_voxel + voxel_index_pair.second, 1);
                    if (old < max_points_per_voxel) {{
                        for (int j = 0; j < point_stride; ++j) {{
                            voxels[voxel_index_pair.second * voxel_stride0 + old * point_stride + j] = points[i * point_stride + j];
                        }}
                    }}
                }}
            }}
            points_voxel_id[i] = voxel_id;
        }}
        """)
        return code
예제 #26
0
파일: maxpool.py 프로젝트: xmyqsh/spconv
    def backward_implicit_gemm_kernel(self):
        code = pccm.FunctionCode()
        code.targ("T")

        code.arg("out_features", f"const T*")
        code.arg("in_features", f"const T*")
        code.arg("dout_features", f"const T*")
        code.arg("din_features", f"T*")
        code.arg("indices_bwd", "const int*")
        code.arg("num_features", "int")
        code.arg("RS", "int")
        code.arg("num_indices", "int")

        code.raw(f"""

        for (int i : tv::KernelLoopY<int>(num_indices)) {{
            auto in_ptr = in_features + i * num_features;
            auto din_ptr = din_features + i * num_features;
            
            for (int j : tv::KernelLoopX<int>(num_features)) {{
                auto indices_ptr = indices_bwd + i;
                int out_idx = indices_ptr[0];
                T in = in_ptr[j];
                T sum_val = T(0);
                // if idx invalid, we only need to ensure in not equal to out.
                T out = out_idx != -1 ? out_features[out_idx * num_features + j] : T(0);
                T dout = out_idx != -1 ? dout_features[out_idx * num_features + j] : T(0);
                bool valid = in == out && out_idx != -1;
                sum_val = valid ? sum_val + dout : sum_val;

                indices_ptr += num_indices;
                for (int k = 1; k < RS; ++k){{
                    out_idx = indices_ptr[0];
                    out = out_idx != -1 ? out_features[out_idx * num_features + j] : T(0);
                    dout = out_idx != -1 ? dout_features[out_idx * num_features + j] : T(0);
                    valid = in == out && out_idx != -1;
                    sum_val = valid ? sum_val + dout : sum_val;
                    indices_ptr += num_indices;
                }}
                din_ptr[j] = sum_val;
            }}
        }}
        """)
        return code
예제 #27
0
파일: all.py 프로젝트: xmyqsh/spconv
    def sort_1d_by_key(self):
        code = pccm.FunctionCode()
        if CUMM_CPU_ONLY_BUILD:
            return code.make_invalid()
        code.arg("data", "tv::Tensor")
        code.arg("indices",
                 "tv::Tensor",
                 "tv::Tensor()",
                 pyanno="cumm.tensorview.Tensor = Tensor()")
        code.arg("stream", "std::uintptr_t", "0", pyanno="int")
        code.code_after_include = f"""
        template <typename T> struct SmallOrEqualTo {{
            TV_HOST_DEVICE_INLINE T operator()(const T &x, const T &y) const {{
                return x < y;
            }}
        }};
        template <typename T> __global__ void mask_input(T* inp, T mask, int size){{
            for (int i : tv::KernelLoopX<int>(size)){{
                inp[i] &= mask;
            }}
        }}

        """
        code.add_dependency(ThrustLib, TensorViewKernel)
        code.add_param_class("cudakers", CudaCommonKernel())
        code.raw(f"""
        cudaStream_t stream_cu = reinterpret_cast<cudaStream_t>(stream);
        if (indices.empty()){{
            indices = tv::empty({{data.dim(0)}}, tv::int32, 0);
        }}
        tv::cuda::Launch launcher(data.dim(0), stream_cu);
        launcher(cudakers::arange_kernel<int32_t>, indices.data_ptr<int32_t>(), indices.dim(0));
        auto timer = tv::CUDATimer();
        tv::dispatch<int32_t, uint32_t, int64_t, uint64_t>(data.dtype(), [&](auto I){{
            using T = TV_DECLTYPE(I);
            thrust::device_ptr<T> ptr_tr(data.data_ptr<T>());
            thrust::device_ptr<int32_t> ptr_k(indices.data_ptr<int32_t>());
            auto thrust_ctx = thrust::cuda::par.on(stream_cu);
            thrust::stable_sort_by_key(thrust_ctx, ptr_tr, ptr_tr + data.dim(0), ptr_k, SmallOrEqualTo<uint32_t>());
        }});
        tv::ssprint("SORT BY KEY TIME", data.dim(0), timer.report() / 1000.0);
        return indices;
        """)
        return code.ret("tv::Tensor")
예제 #28
0
파일: all.py 프로젝트: xmyqsh/spconv
    def generate_conv_inds_mask_stage2(self):
        code = pccm.FunctionCode()
        if CUMM_CPU_ONLY_BUILD:
            return code.make_invalid()
        code.arg("indices, hashdata", "tv::Tensor")
        code.arg(
            "indice_pairs_fwd, indice_pairs_bwd, indice_pairs_uniq, out_inds",
            "tv::Tensor")
        code.arg("mask_fwd, mask_bwd", "tv::Tensor")
        code.arg("num_out_act", "int")
        code.arg("batch_size", "int")
        code.arg("output_dims, input_dims", f"std::vector<int>")
        code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
        code.arg("transposed", f"bool", "false")
        code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int")
        code.raw(f"""
        int ndim = indices.dim(1) - 1;
        TV_ASSERT_RT_ERR(output_dims.size() == ndim && input_dims.size() == ndim &&
            ksize.size() == ndim && stride.size() == ndim && dilation.size() == ndim &&
            padding.size() == ndim, "your params size not equal to ndim", ndim);
        """)

        for ndim in self.ndims:
            code.raw(f"""
            if (ndim == {ndim}){{
                tv::array<int, {ndim}> output_dims_, input_dims_;
                tv::array<int, {ndim}> ksize_, stride_, padding_, dilation_;
                for (int i = 0; i < {ndim}; ++i){{
                    output_dims_[i] = output_dims[i];
                    input_dims_[i] = input_dims[i];
                    ksize_[i] = ksize[i];
                    stride_[i] = stride[i];
                    padding_[i] = padding[i];
                    dilation_[i] = dilation[i];
                }}
                return SpconvIndices{ndim}D::generate_conv_inds_stage2_mask(indices, hashdata,
                    indice_pairs_fwd, indice_pairs_bwd, indice_pairs_uniq, out_inds, mask_fwd, mask_bwd,
                    num_out_act, batch_size, output_dims_, input_dims_, 
                    ksize_, stride_, padding_, dilation_, transposed, stream_int);
            }}
            """)
        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")

        return code.ret("int")
예제 #29
0
    def clear(self):
        """ in this function, if values is empty, it will be assigned to zero.
        """
        code = pccm.FunctionCode()
        if not CUMM_CPU_ONLY_BUILD:
            code.add_dependency(TensorViewHashKernel)
        code.arg("stream", "std::uintptr_t", "0")
        with code.if_("is_cpu"):
            code.raw(f"""
            if (is_cpu){{
                map_4_4.clear();
                map_4_8.clear();
                map_8_4.clear();
                map_8_8.clear();
                return;
            }}
            """)
        if not CUMM_CPU_ONLY_BUILD:
            with code.else_():
                code.raw(f"""
                auto custream = reinterpret_cast<cudaStream_t>(stream);
                """)
                for k_items in _dispatch_ints(code, [4, 8],
                                              "keys_data.itemsize()"):
                    code.raw(f"""
                    using K = tv::hash::itemsize_to_unsigned_t<{k_items}>;
                    constexpr K kEmptyKey = std::numeric_limits<K>::max();

                    K* key_data_ptr = reinterpret_cast<K*>(keys_data.raw_data());
                    """)
                    for v_items in _dispatch_ints(code, [4, 8],
                                                  "values_data.itemsize()"):
                        code.raw(f"""
                        using V = tv::hash::itemsize_to_unsigned_t<{v_items}>;
                        V* value_data_ptr = reinterpret_cast<V*>(values_data.raw_data());
                        using table_t =
                            tv::hash::LinearHashTableSplit<K, V, tv::hash::Murmur3Hash<K>,
                                                        kEmptyKey, false>;
                        table_t table(key_data_ptr, value_data_ptr, keys_data.dim(0));
                        tv::cuda::Launch launcher(table.size(), custream);
                        launcher(tv::hash::clear_table_split<table_t>, table);
                        """)
        return code
예제 #30
0
파일: pointops.py 프로젝트: xmyqsh/spconv
 def ctor(self):
     code = pccm.FunctionCode()
     code.arg("vsize_xyz", f"std::array<float, {self.ndim}>")
     code.arg("coors_range_xyz", f"std::array<float, {self.ndim * 2}>")
     code.arg("num_point_features", f"int")
     code.arg("max_num_voxels, max_num_points_per_voxel", f"int")
     if self.zyx:
         code.raw(f"""
         for (int i = 0; i < {self.ndim}; ++i){{
             vsize[{self.ndim - 1} - i] = vsize_xyz[i];
             coors_range[{self.ndim - 1} - i] = coors_range_xyz[i];
             coors_range[{2 * self.ndim - 1} - i] = coors_range_xyz[i + {self.ndim}];
         }}
         """)
     else:
         code.raw(f"""
         for (int i = 0; i < {self.ndim}; ++i){{
             vsize[i] = vsize_xyz[i];
             coors_range[i] = coors_range_xyz[i];
             coors_range[i + {self.ndim}] = coors_range_xyz[i + {self.ndim}];
         }}
         """)
     code.raw(f"""
     int64_t prod = 1;
     for (size_t i = 0; i < {self.ndim}; ++i) {{
         grid_size[i] =
             std::round((coors_range[{self.ndim} + i] - coors_range[i]) / vsize[i]);
     }}
     for (int i = {self.ndim} - 1; i >= 0; --i) {{
         grid_stride[i] = prod;
         prod *= grid_size[i];
     }}
     voxels = tv::zeros({{max_num_voxels, max_num_points_per_voxel, num_point_features}}, tv::type_v<{self.dtype}>, -1);
     indices = tv::zeros({{max_num_voxels, {self.ndim}}}, tv::int32, -1);
     num_per_voxel = tv::zeros({{max_num_voxels}}, tv::int32, -1);
     tv::TensorShape grid_shape(grid_size.data(), grid_size.data() + {self.ndim});
     densehashdata = tv::zeros(grid_shape, tv::int32, -1);
     auto densehashdata_ptr = densehashdata.data_ptr<int>();
     for (int i= 0; i < densehashdata.size(); ++i){{
         densehashdata_ptr[i] = -1;
     }}
     """)
     return code