def device_var(inputs):
    """Return the variance of ``inputs`` using a divisor of ``size - 1``.

    The mean is obtained from ``device_reduce_sum``. ``hsa_var_diff_kernel``
    is then launched with a scratch buffer, the inputs and the mean; the
    scratch buffer is summed with ``device_reduce_sum`` and that sum is
    divided by ``inputs.size - 1``.

    Parameters
    ----------
    inputs
        Array-like object exposing ``size`` and ``dtype`` and accepted by
        ``device_reduce_sum`` and ``hsa_var_diff_kernel``.

    Returns
    -------
    ``NAN`` when ``inputs`` has fewer than two elements (the divisor would
    be zero or negative); otherwise the reduced sum of the scratch buffer
    divided by ``inputs.size - 1``.
    """
    size = inputs.size
    div = size - 1
    # With zero or one element the (size - 1) divisor is not positive, so
    # there is no meaningful result; this also avoids dividing by a zero
    # size below and launching the kernel with an empty grid.
    if div <= 0:
        return NAN

    mean = device_reduce_sum(inputs) / size

    # The scratch buffer takes its dtype from the input. Non-floating inputs
    # are promoted to float64 so values derived from a fractional mean are
    # not truncated.
    # NOTE(review): presumably the kernel stores (inputs[i] - mean) ** 2 into
    # diff; confirm it accepts this dtype combination.
    if np.issubdtype(inputs.dtype, np.floating):
        diff_dtype = inputs.dtype
    else:
        diff_dtype = np.float64
    diff = np.empty(size, dtype=diff_dtype)

    # Ceiling division: enough groups of `threads` work-items to cover
    # every element.
    threads = 256
    groups = (size + threads - 1) // threads
    hsa_var_diff_kernel[groups, threads](diff, inputs, mean)

    psum = device_reduce_sum(diff)
    return psum / div
def device_mean(inputs):
    """Return ``device_reduce_sum(inputs)`` divided by ``inputs.size``.

    Parameters
    ----------
    inputs
        Array-like object exposing ``size`` and accepted by
        ``device_reduce_sum``.

    Returns
    -------
    The reduced sum of ``inputs`` divided by its element count.
    """
    total = device_reduce_sum(inputs)
    count = inputs.size
    return total / count