def count_features_dense_kernel(float_dtype, int_dtype):
    """Build the per-class feature accumulation kernel for dense input.

    The kernel source handles one (row, col) element per thread: it reads
    the element from ``in`` (indexed as ``row * n_cols + col`` when
    ``rowMajor`` is set, ``col * n_rows + row`` otherwise), skips zeros,
    optionally squares the value, and atomically adds it to
    ``out[col * n_classes + labels[row]]``.

    Parameters
    ----------
    float_dtype
        Type for the ``{0}`` placeholder (elements of ``out`` and ``in``).
    int_dtype
        Type for the ``{1}`` placeholder (elements of ``labels``).

    Returns
    -------
    Whatever ``cuda_kernel_factory`` produces for this source
    (presumably a compiled kernel object -- confirm against the factory).
    """
    # Adjacent literals are concatenated into one single-line source string.
    dense_source = (
        r" ({0} *out, {0} *in, int n_rows, int n_cols, "
        r"{1} *labels, int n_classes, bool square, bool rowMajor) { "
        r"int row = blockIdx.x * blockDim.x + threadIdx.x; "
        r"int col = blockIdx.y * blockDim.y + threadIdx.y; "
        r"if(row >= n_rows || col >= n_cols) return; "
        r"{0} val = !rowMajor ? in[col * n_rows + row] : in[row * n_cols + col]; "
        r"if(val == 0.0) return; "
        r"if(square) val *= val; "
        r"{1} label = labels[row]; "
        r"atomicAdd(out + ((col * n_classes) + label), val); "
        r"}"
    )
    placeholder_dtypes = (float_dtype, int_dtype)
    return cuda_kernel_factory(
        dense_source, placeholder_dtypes, "count_features_dense"
    )
def count_features_coo_kernel(float_dtype, int_dtype):
    """Build the per-class feature accumulation kernel for COO input.

    The kernel source handles one stored entry per thread: entry ``i``
    supplies ``rows[i]``, ``cols[i]`` and ``vals[i]``; the value is
    optionally squared and atomically added to
    ``out[col * n_classes + labels[row]]``, yielding a sum (or sum of
    squares) per feature and class label.

    Parameters
    ----------
    float_dtype
        Type for the ``{0}`` placeholder (elements of ``out`` and ``vals``).
    int_dtype
        Type for the ``{1}`` placeholder (elements of ``labels``).

    Returns
    -------
    Whatever ``cuda_kernel_factory`` produces for this source
    (presumably a compiled kernel object -- confirm against the factory).
    """
    # Adjacent literals are concatenated into one single-line source string.
    coo_source = (
        r"({0} *out, int *rows, int *cols, {0} *vals, int nnz, "
        r"int n_rows, int n_cols, {1} *labels, int n_classes, bool square) { "
        r"int i = blockIdx.x * blockDim.x + threadIdx.x; "
        r"if(i >= nnz) return; "
        r"int row = rows[i]; "
        r"int col = cols[i]; "
        r"{0} val = vals[i]; "
        r"if(square) val *= val; "
        r"{1} label = labels[row]; "
        r"atomicAdd(out + ((col * n_classes) + label), val); "
        r"}"
    )
    placeholder_dtypes = (float_dtype, int_dtype)
    return cuda_kernel_factory(
        coo_source, placeholder_dtypes, "count_features_coo"
    )
def _map_l2_norm_kernel(dtype):
    """Build the in-place row-wise L2 normalisation kernel for CSR data.

    The kernel source handles one row per thread: it sums the squares of
    ``data[indptr[tid] : indptr[tid + 1]]``, returns early when that sum
    is zero, and otherwise divides each of those entries by the square
    root of the sum.

    Parameters
    ----------
    dtype
        Forwarded unchanged to ``cuda_kernel_factory``. The source has
        three placeholders (``{0}`` data, ``{1}`` indices, ``{2}`` indptr),
        so this is presumably a 3-element sequence of dtypes -- confirm
        against the caller.

    Returns
    -------
    Whatever ``cuda_kernel_factory`` produces for this source
    (presumably a compiled kernel object -- confirm against the factory).
    """
    # Adjacent literals are concatenated into one single-line source string;
    # the trailing space after the final brace is part of the original text.
    l2_norm_source = (
        r" ({0} *data, {1} *indices, {2} *indptr, int n_samples) { "
        r"int tid = blockDim.x * blockIdx.x + threadIdx.x; "
        r"if(tid >= n_samples) return; "
        r"{0} sum = 0.0; "
        r"for(int i = indptr[tid]; i < indptr[tid+1]; i++) { "
        r"sum += (data[i] * data[i]); "
        r"} "
        r"if(sum == 0) return; "
        r"sum = sqrt(sum); "
        r"for(int i = indptr[tid]; i < indptr[tid+1]; i++) { "
        r"data[i] /= sum; "
        r"} "
        r"} "
    )
    kernel_name = "map_l2_norm_kernel"
    return cuda_kernel_factory(l2_norm_source, dtype, kernel_name)
def count_classes_kernel(float_dtype, int_dtype):
    """Build the kernel that counts how many rows carry each label.

    The kernel source handles one row per thread: it reads
    ``labels[row]`` and atomically adds 1 to ``out[label]``.

    Parameters
    ----------
    float_dtype
        Type for the ``{0}`` placeholder (elements of ``out``).
    int_dtype
        Type for the ``{1}`` placeholder (elements of ``labels``).

    Returns
    -------
    Whatever ``cuda_kernel_factory`` produces for this source
    (presumably a compiled kernel object -- confirm against the factory).
    """
    # Adjacent literals are concatenated into one single-line source string.
    class_count_source = (
        r" ({0} *out, int n_rows, {1} *labels) { "
        r"int row = blockIdx.x * blockDim.x + threadIdx.x; "
        r"if(row >= n_rows) return; "
        r"{1} label = labels[row]; "
        r"atomicAdd(out + label, 1); "
        r"}"
    )
    placeholder_dtypes = (float_dtype, int_dtype)
    return cuda_kernel_factory(
        class_count_source, placeholder_dtypes, "count_classes"
    )
def _binarize_kernel(x_dtype):
    """Build the in-place thresholding kernel.

    The kernel source handles one element per thread: ``x[tid]`` is
    overwritten with 1 when it is strictly greater than ``threshold``
    and with 0 otherwise. ``threshold`` is declared as ``float`` in the
    kernel signature regardless of ``x_dtype``.

    Parameters
    ----------
    x_dtype
        Type for the ``{0}`` placeholder (elements of ``x``).

    Returns
    -------
    Whatever ``cuda_kernel_factory`` produces for this source
    (presumably a compiled kernel object -- confirm against the factory).
    """
    # Adjacent literals are concatenated into one single-line source string.
    threshold_source = (
        r"({0} *x, float threshold, int x_n) { "
        r"int tid = blockDim.x * blockIdx.x + threadIdx.x; "
        r"if(tid >= x_n) return; "
        r"{0} val = x[tid]; "
        r"if(val > threshold) val = 1; "
        r"else val = 0; "
        r"x[tid] = val; "
        r"}"
    )
    placeholder_dtypes = (x_dtype,)
    return cuda_kernel_factory(
        threshold_source, placeholder_dtypes, "binarize_kernel"
    )
def _validate_kernel(dtype):
    """Build the "validate_labels_kernel" kernel for ``dtype``.

    Passes the module-level ``validate_kernel_str`` source, a one-element
    dtype tuple and the kernel name to ``cuda_kernel_factory`` and returns
    its result. The source string is defined outside this function;
    presumably it checks label values -- confirm against its definition.
    """
    placeholder_dtypes = (dtype,)
    kernel_name = "validate_labels_kernel"
    return cuda_kernel_factory(
        validate_kernel_str, placeholder_dtypes, kernel_name
    )
def _inverse_map_kernel(dtype):
    """Build the "inv_map_labels_kernel" kernel for ``dtype``.

    Passes the module-level ``inverse_map_kernel_str`` source, a
    one-element dtype tuple and the kernel name to ``cuda_kernel_factory``
    and returns its result. The source string is defined outside this
    function; presumably it reverses a label mapping -- confirm against
    its definition.
    """
    placeholder_dtypes = (dtype,)
    kernel_name = "inv_map_labels_kernel"
    return cuda_kernel_factory(
        inverse_map_kernel_str, placeholder_dtypes, kernel_name
    )
def _map_kernel(dtype):
    """Build the "map_labels_kernel" kernel for ``dtype``.

    Passes the module-level ``map_kernel_str`` source, a one-element dtype
    tuple and the kernel name to ``cuda_kernel_factory`` and returns its
    result. The source string is defined outside this function;
    presumably it remaps label values -- confirm against its definition.
    """
    placeholder_dtypes = (dtype,)
    kernel_name = "map_labels_kernel"
    return cuda_kernel_factory(
        map_kernel_str, placeholder_dtypes, kernel_name
    )
def _cov_kernel(dtype):
    """Build the "cov_kernel" kernel for ``dtype``.

    Passes the module-level ``cov_kernel_str`` source, a one-element dtype
    tuple and the kernel name to ``cuda_kernel_factory`` and returns its
    result. The source string is defined outside this function;
    presumably it computes a covariance -- confirm against its definition.
    """
    placeholder_dtypes = (dtype,)
    kernel_name = "cov_kernel"
    return cuda_kernel_factory(
        cov_kernel_str, placeholder_dtypes, kernel_name
    )