def test_scan_int32(self):
    """Check the GPU scan of a random int32 array against a host reference.

    Uploads ``ARRAY_SIZE`` random integers drawn from ``[0, 10000)``, runs
    ``MyScan.scan_gpu`` on the device array and verifies that:

    * element 0 of the returned array equals the sum of the input, and
    * the device array, once copied back, holds the exclusive prefix sum
      of the input (element ``i`` is the sum of ``in_ary[:i]``).
    """
    in_ary = np.random.randint(0, 10000, ARRAY_SIZE).astype(np.int32)
    in_ary_d = cuda.to_device(in_ary)

    output = MyScan.scan_gpu(in_ary_d)
    out_carry = output.getitem(0)
    # The scanned values are read back from the input device buffer.
    out_ary = in_ary_d.copy_to_host()

    # check last carry
    # Integer sums must match exactly; a relative tolerance would let an
    # off-by-a-few result slip through on large totals.
    self.assertEqual(out_carry, in_ary.sum(), 'carry return is not sum')

    # check array
    # Host reference: exclusive prefix sum, accumulated in 64 bits so the
    # reference itself cannot wrap around.
    expected = np.zeros(in_ary.shape, dtype=np.int64)
    expected[1:] = np.cumsum(in_ary[:-1], dtype=np.int64)
    np.testing.assert_array_equal(
        out_ary, expected, err_msg='output array not correct')
def test_scan_fp64(self):
    """Check the GPU scan of a random float64 array against a host reference.

    Uploads ``ARRAY_SIZE`` uniform random doubles, runs ``MyScan.scan_gpu``
    on the device array and verifies that the device array, once copied
    back, holds the exclusive prefix sum of the input (element ``i`` is the
    sum of ``in_ary[:i]``).

    The carry (element 0 of the returned array) is printed next to the
    host-side sum but is not asserted.
    """
    in_ary = np.random.rand(ARRAY_SIZE).astype(np.float64)
    in_ary_d = cuda.to_device(in_ary)

    output = MyScan.scan_gpu(in_ary_d)
    out_carry = output.getitem(0)
    # The scanned values are read back from the input device buffer.
    out_ary = in_ary_d.copy_to_host()

    # check last carry
    # NOTE(review): the carry is only printed; an np.isclose-based assertion
    # used to sit here but was disabled. TODO: confirm why and re-enable it.
    print("%s %s" % (out_carry, in_ary.sum()))

    # check array
    # Host reference: sequential exclusive prefix sum. Compare with a
    # relative tolerance, since the device may add the terms in a different
    # order than this sequential reference and so round differently.
    expected = np.zeros_like(in_ary)
    expected[1:] = np.cumsum(in_ary[:-1])
    np.testing.assert_allclose(
        out_ary, expected, err_msg='output array not correct')
def test_recursive_big_scan():
    """Run the scan on a 2,000,000-element int32 ramp and print a report.

    Computes a reference with ``MyScan.exprefixsumNumba`` (written into
    ``reference``), runs ``MyScan.scan_gpu`` on a device copy of the same
    input, then prints both totals, the wall-clock time of each call in
    milliseconds, and whether the copied-back device array equals the
    reference element-wise.

    Nothing is asserted: a mismatch shows up only as ``False`` on the last
    printed line.
    """
    print("running recursive scan test")

    n = int(2e6)
    # NOTE(review): the prefix sums of 0..n-1 exceed the int32 range for
    # this n, so the comparison presumably relies on both implementations
    # wrapping the same way -- confirm this is intended.
    a = np.arange(n).astype(np.int32)
    reference = np.empty_like(a)

    start = timer()
    sum_ref = MyScan.exprefixsumNumba(a, reference, init=0)
    end = timer()

    dA = cuda.to_device(a)

    # NOTE(review): this is a host-side timer around the call; if scan_gpu
    # returns before the device work has finished, the figure below
    # under-reports the real GPU time -- confirm.
    start2 = timer()
    total_sum = MyScan.scan_gpu(dA)
    end2 = timer()

    # Overwrite the host input with the device result for the comparison.
    dA.copy_to_host(ary=a)
    sum_gpu = total_sum.copy_to_host()

    # Single pre-formatted strings keep the output identical to the former
    # multi-item print statements while staying valid on Python 2 and 3.
    print("sum_ref =  %s" % (sum_ref,))
    print("sum_gpu =  %s" % (sum_gpu,))
    print("CPU took:  %s  ms" % ((end - start) * 1000,))
    print("Kernel took:  %s  ms" % ((end2 - start2) * 1000,))
    print((a == reference).all())