def make_fp_tree():
    #### Allocate host memory
    offsets, transactions, num_transactions, all_items_in_transactions = readFile("data.txt")
    print num_transactions, all_items_in_transactions

    flist = np.zeros(MAX_UNIQUE_ITEMS, dtype=np.uint32)

    #### Allocate and initialize GPU/Device memory
    d_offsets = cuda.to_device(offsets)
    d_transactions = cuda.to_device(transactions)
    d_flist = cuda.to_device(flist)

    threads_per_block = (BLOCK_SIZE, 1)
    number_of_blocks = (int(num_transactions / (1.0 * threads_per_block[0])) + 1, 1)

    t1 = time()
    makeFlistGPU[number_of_blocks, threads_per_block](d_offsets, d_transactions, d_flist,
                                                      num_transactions, all_items_in_transactions)
    cuda.synchronize()
    t2 = time()

    d_flist.copy_to_host(flist)
    cuda.synchronize()

    # for i in range(0, MAX_UNIQUE_ITEMS):
    #     print i, flist[i]

    t3 = time()
    flist_cpu = makeFlist(transactions, all_items_in_transactions)
    t4 = time()

    match = 1
    for i in range(1, MAX_UNIQUE_ITEMS):
        if i not in flist_cpu and flist[i] == 0:
            continue
        #print i, flist[i], flist_cpu[i]
        if flist[i] != flist_cpu[i]:
            match = -1
            break

    if match == 1:
        print "Test Passed"
    else:
        print "Test Failed"

    print "Number of transactions = ", num_transactions
    print "All items in transactions = ", all_items_in_transactions
    print "GPU time = ", t2 - t1
    print "CPU TIME = ", t4 - t3
def time_this(kernel, gridsz, blocksz, args):
    timings = []
    cuda.synchronize()
    try:
        for i in range(10):  # best of 10
            ts = timer()
            kernel[gridsz, blocksz](*args)
            cuda.synchronize()
            te = timer()
            timings.append(te - ts)
    except cudadrv.error.CudaDriverError as e:
        print 'exc suppressed', e
        return -1
    return min(timings)  # "best of 10": report the fastest run
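# Hypothetical usage sketch for time_this(), not part of the original listings:
# it times the histogramGPU kernel with the same launch configuration that
# test_histogram() uses. input_d, bins_d, BLOCK_SIZE, NUM_ELEMENTS and ceil are
# assumed to be set up exactly as in that test.
blocksz = (BLOCK_SIZE, 1)
gridsz = (int(ceil(NUM_ELEMENTS / (1.0 * BLOCK_SIZE))), 1)
best = time_this(histogramGPU, gridsz, blocksz, (input_d, bins_d, NUM_ELEMENTS))
if best >= 0:
    print "best of 10 kernel runs = ", best, "s"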
def test_histogram():
    # Allocate host memory
    input_h = np.empty(NUM_ELEMENTS, dtype=np.uint32)
    bins_h = np.zeros(BIN_SIZE, dtype=np.uint32)
    myprint("Bin Size = " + str(bins_h.size))

    ## Initialize host memory
    for i in range(0, NUM_ELEMENTS):
        input_h[i] = randint(0, BIN_SIZE - 1)

    ## Allocate and initialize GPU/device memory
    input_d = cuda.to_device(input_h)
    bins_d = cuda.to_device(bins_h)

    threads_per_block = (BLOCK_SIZE, 1)
    number_of_blocks = (int(ceil(NUM_ELEMENTS / (1.0 * threads_per_block[0]))), 1)  #((NUM_ELEMENTS / threads_per_block[0]) + 1, 1)

    t1 = time()
    histogramGPU[number_of_blocks, threads_per_block](input_d, bins_d, NUM_ELEMENTS)
    cuda.synchronize()
    t2 = time()

    bins_d.copy_to_host(bins_h)

    t3 = time()
    bins_cpu = makeHist(input_h)
    t4 = time()

    # for i in range(0, BIN_SIZE):
    #     print i, bins_h[i], bins_cpu[i]

    print "GPU time = ", t2 - t1
    print "CPU TIME = ", t4 - t3

    match = 1
    for i in range(0, BIN_SIZE):
        if bins_h[i] != bins_cpu[i]:
            match = -1
            break

    if match == 1:
        print "Test Passed"
    else:
        print "Test Failed"
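# The histogramGPU kernel itself is not shown in these listings. The following
# is only a minimal sketch of what an atomic-add histogram kernel of that shape
# typically looks like in Numba CUDA; the real kernel may differ.
from numba import cuda

@cuda.jit
def histogram_kernel(data, bins, n):
    # One thread per input element; atomics resolve concurrent increments
    # of the same bin. Assumes every data[i] is a valid bin index.
    i = cuda.grid(1)
    if i < n:
        cuda.atomic.add(bins, data[i], 1)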
def test_scan():
    in_h = np.empty(NUM_ELEMENTS, dtype=np.uint32)
    out_h = np.zeros(NUM_ELEMENTS, dtype=np.uint32)

    for i in range(0, NUM_ELEMENTS):
        in_h[i] = NUM_ELEMENTS - i - 1  #randint(0, 100)

    tac1 = time()
    in_d = cuda.to_device(in_h)
    out_d = cuda.to_device(out_h)
    cuda.synchronize()
    tac2 = time()

    tk1 = time()
    for i in range(0, 32):
        tk1 = time()
        preScan(out_d, in_d, NUM_ELEMENTS)
        cuda.synchronize()
        tk2 = time()
        print i, tk2 - tk1
    tk2 = time()

    th1 = time()
    out_d.copy_to_host(out_h)
    cuda.synchronize()
    #print "Last = ", out_h[-1] + in_h[-1]
    th2 = time()
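# Host-side reference check for test_scan(), not in the original listings.
# It assumes preScan produces an exclusive prefix sum; the commented-out
# "Last = out_h[-1] + in_h[-1]" line above suggests exactly that.
import numpy as np

def exclusive_scan_reference(a):
    # Exclusive prefix sum of a 1-D array (fine for modest element counts).
    out = np.zeros_like(a)
    out[1:] = np.cumsum(a)[:-1]
    return out

# e.g. inside test_scan(), after copying out_h back from the device:
# assert np.array_equal(out_h, exclusive_scan_reference(in_h))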
def test_sort():
    in_h = np.empty(NUM_ELEMENTS, dtype=np.uint32)  #4, 7, 2, 6, 3, 5, 1, 0
    out_h = np.empty(NUM_ELEMENTS, dtype=np.uint32)

    for i in range(0, NUM_ELEMENTS):
        in_h[i] = NUM_ELEMENTS - i - 1

    in_d = cuda.to_device(in_h)
    out_d = cuda.device_array(NUM_ELEMENTS, dtype=np.uint32)
    temp_d = cuda.device_array(NUM_ELEMENTS, dtype=np.uint32)

    tkg1 = time()
    for bit_shift in range(0, 32):
        tk1 = time()
        #radix_sort(in_d, out_d, temp_d, in_h[NUM_ELEMENTS - 1], bit_shift)
        preScan(out_d, in_d, NUM_ELEMENTS)
        tk2 = time()
        #print bit_shift, tk2 - tk1

        # Rotate the three buffers so this pass's output feeds the next pass.
        in_d, out_d, temp_d = out_d, temp_d, in_d
    tkg2 = time()

    out_d.copy_to_host(out_h)
    cuda.synchronize()

    # line = ""
    # for i in range(0, NUM_ELEMENTS):
    #     line += " " + str(out_h[i])
    # print line

    in_cpu = [NUM_ELEMENTS - i - 1 for i in range(0, NUM_ELEMENTS)]
    tc1 = time()
    in_cpu.sort()
    tc2 = time()

    print "GPU Time = ", tkg2 - tkg1
    print "CPU Time = ", tc2 - tc1
def radix_sort(in_d, out_d, out_scan_d, last_inp_element, bit_shift=0):
    threads_per_block = (BLOCK_SIZE, 1)
    number_of_blocks = (int(ceil(NUM_ELEMENTS / (1.0 * threads_per_block[0]))), 1)

    ################ Bit flip ########################
    SplitGPU[number_of_blocks, threads_per_block](in_d, out_d, NUM_ELEMENTS, bit_shift)
    cuda.synchronize()
    # out_d.copy_to_host(out_h)
    # cuda.synchronize()
    ##################################################

    t1 = time()
    preScan(out_scan_d, out_d, NUM_ELEMENTS)
    cuda.synchronize()
    t2 = time()
    #print "Time = ", t2 - t1
    # out_scan_d.copy_to_host(out_h)
    # cuda.synchronize()

    ###########################################################
    IndexDefineGPU[number_of_blocks, threads_per_block](out_scan_d, out_d, NUM_ELEMENTS, last_inp_element)
    cuda.synchronize()
    # out_scan_d.copy_to_host(out_h)
    # cuda.synchronize()
    ###########################################################

    ###########################################################
    ScatterElementGPU[number_of_blocks, threads_per_block](in_d, out_scan_d, out_d, NUM_ELEMENTS)
    cuda.synchronize()
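# Sketch of the driver loop that the commented-out radix_sort() call in
# test_sort() hints at, not part of the original listings: one split/scan/
# scatter pass per bit, least significant bit first, rotating the three device
# buffers between passes. Which buffer holds the final result depends on the
# (unshown) contract of SplitGPU/ScatterElementGPU, so the rotated handles are
# simply returned here.
def radix_sort_all_bits(in_d, out_d, temp_d, last_inp_element):
    for bit_shift in range(0, 32):
        radix_sort(in_d, out_d, temp_d, last_inp_element, bit_shift)
        # Rotate buffers so this pass's output becomes the next pass's input.
        in_d, out_d, temp_d = out_d, temp_d, in_d
    return in_d, out_d, temp_d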
def test_sort():
    in_h = np.empty(NUM_ELEMENTS, dtype=np.uint32)  #4, 7, 2, 6, 3, 5, 1, 0
    #in_h = np.array([4, 7, 2, 6, 3, 5, 1, 0], dtype=np.uint32)
    out_h = np.empty(NUM_ELEMENTS, dtype=np.uint32)

    for i in range(0, NUM_ELEMENTS):
        in_h[i] = randint(0, 100)  #NUM_ELEMENTS - i - 1

    #in_h = np.array([6, 44, 71, 79, 94, 92, 12, 56, 47, 17, 81, 98, 84, 9, 85, 99], dtype=np.uint32)
    #in_h = np.array([85, 37, 50, 73, 51, 46, 62, 84, 65, 99, 76, 59, 73, 16, 27, 4, 75, 81, 80, 33, 73, 11, 29, 24, 81, 49, 27, 71, 74, 64, 60, 91], dtype=np.uint32)
    print in_h

    in_d = cuda.to_device(in_h)
    out_d = cuda.device_array(NUM_ELEMENTS, dtype=np.uint32)

    tkg1 = time()
    threads_per_block = (BLOCK_SIZE, 1)
    number_of_blocks = (int(ceil(NUM_ELEMENTS / (2 * 1.0 * threads_per_block[0]))), 1)

    RadixGPU[number_of_blocks, threads_per_block](in_d, out_d, NUM_ELEMENTS)
    out_d.copy_to_host(out_h)
    #print "Rad = ", list(out_h)

    stride = 4
    # while stride < NUM_ELEMENTS:
    #     number_of_blocks = (int(ceil(NUM_ELEMENTS / (stride * 1.0 * threads_per_block[0]))), 1)
    #     bitonicSort[number_of_blocks, threads_per_block](out_d, NUM_ELEMENTS, stride)
    #     stride *= 2
    #
    #     # number_of_blocks = (int(ceil(NUM_ELEMENTS / (2 * 1.0 * threads_per_block[0]))), 1)
    #     # RadixGPU[number_of_blocks, threads_per_block](out_d, in_d, NUM_ELEMENTS)
    #     # out_d = in_d
    #     out_d.copy_to_host(out_h)
    #     print "Str = ", list(out_h)
    #     break
    #
    # stride /= 2
    # while stride >= 4:
    #     number_of_blocks = (int(ceil(NUM_ELEMENTS / (stride * 1.0 * threads_per_block[0]))), 1)
    #     bitonicSort[number_of_blocks, threads_per_block](out_d, NUM_ELEMENTS, stride)
    #     stride /= 2
    #     cuda.synchronize()
    #
    # number_of_blocks = (int(ceil(NUM_ELEMENTS / (2 * 1.0 * threads_per_block[0]))), 1)
    # RadixGPU[number_of_blocks, threads_per_block](out_d, in_d, NUM_ELEMENTS)
    # out_d = in_d
    #
    # out_d.copy_to_host(out_h)
    # cuda.synchronize()
    #
    # line = ""
    # for i in range(0, NUM_ELEMENTS):
    #     line += " " + str(out_h[i])
    # print line
    tkg2 = time()

    out_d.copy_to_host(out_h)
    cuda.synchronize()
    #print "GPU = ", list(out_h)

    # line = ""
    # for i in range(0, NUM_ELEMENTS):
    #     line += " " + str(out_h[i])
    # print line

    in_cpu = list(in_h)  #[NUM_ELEMENTS - i - 1 for i in range(0, NUM_ELEMENTS)]
    tc1 = time()
    in_cpu.sort()
    #print "CPU = ", in_cpu
    tc2 = time()

    print "GPU Time = ", tkg2 - tkg1
    print "CPU Time = ", tc2 - tc1
    print len(in_cpu)
# Prepare data on the GPU
dA = cuda.to_device(A)
dB = cuda.to_device(B)
dC = cuda.to_device(C)  # device_array_like(A)

# Time numpy version
s = timer()
np_ans = np.dot(A, B)
e = timer()
t = e - s

# Time the unoptimized version
s = timer()
cu_matmul[grid_dim, block_dim](dA, dB, dC, n)
cuda.synchronize()
e = timer()
unopt_ans = dC.copy_to_host()
tcuda_unopt = e - s

# Time the shared memory version
s = timer()
cu_matmul_sm[grid_dim, block_dim](dA, dB, dC, n, tpb, bpg)
cuda.synchronize()
e = timer()
opt_ans = dC.copy_to_host()
tcuda_opt = e - s

# Time for CuBLAS version
s = timer()
blas = cublas.Blas()
def test_apriori():
    output_file = open("apriori_out.txt", "w")

    offsets, transactions, num_transactions, num_elements = readFile("syncthetic_data.txt")
    print "Offset = ", offsets[:num_transactions]
    print "transactions = ", transactions[:num_elements]
    print "Num transactions = ", num_transactions
    print "Num elements = ", num_elements

    min_support = MIN_SUPPORT

    # to find number of max digits required to represent that many number of unique items
    power = 1
    while MAX_UNIQUE_ITEMS / (10 ** power) != 0:
        power += 1
    print "Power = ", power

    t = [item for item in transactions.tolist()]
    if num_elements > NUM_ELEMENTS:
        print "Error: Elements exceeding NUM_ELEMENTS. Exiting..."
        sys.exit(12)

    input_h = np.array(t, dtype=np.int32)
    print "Input transactions = ", list(input_h)
    print "Size of transactions = ", input_h.size

    ci_h = np.zeros(MAX_UNIQUE_ITEMS, dtype=np.int32)
    li_h = np.empty(MAX_UNIQUE_ITEMS, dtype=np.int32)

    input_d = cuda.to_device(input_h)
    ci_d = cuda.to_device(ci_h)
    li_d = cuda.device_array(MAX_UNIQUE_ITEMS, dtype=np.int32)

    threads_per_block = (BLOCK_SIZE, 1)
    number_of_blocks = (int(ceil(NUM_ELEMENTS / (1.0 * threads_per_block[0]))), 1)  #((NUM_ELEMENTS / threads_per_block[0]) + 1, 1)

    histogramGPU[number_of_blocks, threads_per_block](input_d, ci_d, num_elements)
    #cuda.synchronize()
    ci_d.copy_to_host(ci_h)
    print "Ci_H Histogram result = ", ci_h  # support count for each item

    number_of_blocks = (int(ceil(MAX_UNIQUE_ITEMS / (1.0 * threads_per_block[0]))), 1)
    pruneGPU[number_of_blocks, threads_per_block](ci_d, MAX_UNIQUE_ITEMS, min_support)
    cuda.synchronize()

    ci_d.copy_to_host(ci_h)
    print "Keys = ", [i for i in range(0, len(ci_h))]
    print "Ci_H Pruning result = ", ci_h  # support count for each item

    # calculate concise list of items satisfying min support
    l1_patterns = {}
    k = 0  # number of items whose sup_count > min_support
    for j in range(0, len(ci_h)):
        if ci_h[j] != 0:
            li_h[k] = j
            l1_patterns[(j, )] = ci_h[j]
            k += 1

    print "\n=======================================================\n"
    print "L1 = ", list(li_h)[:k]  # items whose support_count > min_support
    print "\n=======================================================\n"
    output_file.write(createFormattedPatterns(l1_patterns, 1))

    print "K (num_items_with_good_sup_count) = ", k
    #k = 102

    ci_h = np.array([-1 for i in range(0, k ** 2)], dtype=np.int32)
    ci_d = cuda.to_device(ci_h)
    #li_h = np.array(sorted([randint(10, 99) for i in range(0, k)]), dtype=np.int32)
    #tli_h = np.array([i for i in range(1, k + 1)], dtype=np.int32)

    t1 = time()
    li_d = cuda.to_device(li_h)
    number_of_blocks = (int(ceil(k / (1.0 * MAX_ITEM_PER_SM))), 1)
    print "Self join 2 number of blocks = ", number_of_blocks
    print "K = ", k
    print "Ci_H size = ", ci_h.size
    print "LI_H size = ", li_h.size

    selfJoinGPU[number_of_blocks, threads_per_block](li_d, ci_d, k, power)
    cuda.synchronize()
    li_d.copy_to_host(li_h)
    ci_d.copy_to_host(ci_h)
    t2 = time()
    #sys.exit(0)

    # f = open('join.txt', 'w')
    # for i in range(0, k):
    #     line = ""
    #     for j in range(0, k):
    #         line += str(ci_h[k * i + j]) + " "
    #     f.write(line + "\n")
    # f.close()

    #ci_h = ci_h.reshape(k, k)
    print "Initial Mask = ", ci_h.reshape(k, k)
    print "Self joining time = ", (t2 - t1)

    d_offsets = cuda.to_device(offsets)
    d_transactions = cuda.to_device(transactions)

    #number_of_blocks = (1, 1)  #(int(num_transactions / (1.0 * threads_per_block[0])) + 1, 1)
    number_of_blocks = (int(ceil(num_transactions / (1.0 * MAX_TRANSACTIONS_PER_SM))), 1)
    print "Num blocks for findFrequency = ", number_of_blocks
    print "Num transactions = ", num_transactions
    print "Num patterns = ", k
    print "index = ", list(li_h)[:k]

    findFrequencyGPU[number_of_blocks, threads_per_block](d_transactions, d_offsets, num_transactions,
                                                          num_elements, li_d, ci_d, k)
    cuda.synchronize()
    ci_d.copy_to_host(ci_h)
    print "Final Mask = ", ci_h.reshape(k, k)
    d_transactions.copy_to_host(transactions)

    threads_per_block = (BLOCK_SIZE, BLOCK_SIZE)
    number_of_blocks = ((int(ceil(k / (1.0 * threads_per_block[0])))),
                        (int(ceil(k / (1.0 * threads_per_block[0])))))
    pruneMultipleGPU[number_of_blocks, threads_per_block](ci_d, k, min_support)  # prunes according to min_support
    ci_d.copy_to_host(ci_h)
    print "Outer Mask = ", ci_h.reshape(k, k)

    ci_hn = np.zeros(k, dtype=np.int32)
    ci_dn = cuda.to_device(ci_hn)
    combinationsAvailable[threads_per_block, number_of_blocks](ci_d, ci_dn, k)  # Number of possible patterns in each row
    ci_dn.copy_to_host(ci_hn)
    print "Ci_hn = ", list(ci_hn)

    ci_hnx = np.empty(k, dtype=np.int32)
    ci_dnx = cuda.to_device(ci_hnx)
    preScan(ci_dnx, ci_dn, k)  # Prefix sum on patterns in each row
    ci_dnx.copy_to_host(ci_hnx)
    num_patterns = ci_hnx[-1]
    print "Ci_hnx = ", list(ci_hnx)

    sparseM_h = np.empty(ci_hnx[-1] * 3, dtype=np.uint32)
    sparseM_d = cuda.to_device(sparseM_h)

    threads_per_block = (BLOCK_SIZE, 1)
    number_of_blocks = (int(ceil(k / (1.0 * threads_per_block[0]))), 1)
    convert2Sparse[threads_per_block, number_of_blocks](ci_d, ci_dnx, sparseM_d, num_patterns, k)
    sparseM_d.copy_to_host(sparseM_h)
    # sparseM_h = sparseM_h.reshape(3, num_patterns)
    print sparseM_h.reshape(3, num_patterns)

    patterns = {}
    for i in range(0, num_patterns):
        item1 = sparseM_h[i]
        item2 = sparseM_h[i + num_patterns]
        support = sparseM_h[i + 2 * num_patterns]
        patterns[tuple(sorted([li_h[item1], li_h[item2]]))] = support

    print "\n=======================================================\n"
    print "L2 = ", patterns
    print "\n=======================================================\n"
    output_file.write(createFormattedPatterns(patterns, 2))

    new_modulo_map = {}
    index_id = 1
    actual_pattern_items = []
    index_items_lookup = []
    #patterns = {(2, 3, 5): 1, (2, 3, 6): 1, (2, 3, 7): 1, (2, 4, 5): 1, (2, 4, 7): 1, (3, 5, 7): 1}

    for pattern in sorted(patterns.keys()):
        if pattern[:-1] not in new_modulo_map:
            new_modulo_map[pattern[:-1]] = index_id
            prev_len = len(actual_pattern_items)
            pattern_len = len(pattern[:-1])
            actual_pattern_items += pattern[:-1]
            index_items_lookup += [index_id, prev_len, pattern_len]
            index_id += 1
        if (pattern[-1], ) not in new_modulo_map:
            new_modulo_map[(pattern[-1], )] = index_id
            prev_len = len(actual_pattern_items)
            pattern_len = len([pattern[-1]])
            actual_pattern_items += [pattern[-1]]
            index_items_lookup += [index_id, prev_len, pattern_len]
            index_id += 1

    #print "Actual pattern items = ", actual_pattern_items
    #print "Index lookup = ", index_items_lookup
    print new_modulo_map

    new_patterns = []
    for pattern in patterns:
        new_patterns.append((new_modulo_map[pattern[:-1]], new_modulo_map[(pattern[-1], )]))
    print new_patterns

    new_new_pattern = []
    for pattern in new_patterns:
        new_new_pattern.append(pattern[0] * 10 ** power + pattern[1])
    new_new_pattern.sort()
    print new_new_pattern

    k = len(new_new_pattern)
    li_h = np.array(new_new_pattern, dtype=np.int32)
    ci_h = np.array([-1 for i in range(0, k ** 2)], dtype=np.int32)
    ci_d = cuda.to_device(ci_h)
    #li_h = np.array(sorted([randint(10, 99) for i in range(0, k)]), dtype=np.int32)

    t1 = time()
    li_d = cuda.to_device(li_h)
    number_of_blocks = (int(ceil(k / (1.0 * MAX_ITEM_PER_SM))), 1)
    selfJoinGPU[number_of_blocks, threads_per_block](li_d, ci_d, k, power)
    li_d.copy_to_host(li_h)
    ci_d.copy_to_host(ci_h)

    api_h = np.array(actual_pattern_items, dtype=np.int32)
    iil_h = np.array(index_items_lookup, dtype=np.int32)
    api_d = cuda.to_device(api_h)
    iil_d = cuda.to_device(iil_h)
    print "Api_h = ", list(api_h), " Size = ", api_h.size
    print "IIL_H = ", list(iil_h), " Size = ", iil_h.size
    t2 = time()

    print "LI_H = ", li_h
    print "Initial Mask = ", ci_h.reshape(k, k)

    #number_of_blocks = (1, 1)  #(int(num_transactions / (1.0 * threads_per_block[0])) + 1, 1)
    number_of_blocks = (int(ceil(num_transactions / (1.0 * MAX_TRANSACTIONS_PER_SM))), 1)
    print "Num transactions = ", num_transactions
    print "Num patterns = ", k
    print "index = ", li_h
    print "Size of api_d = ", api_h.size
    print "Size of iil_h = ", iil_h.size

    findHigherPatternFrequencyGPU[number_of_blocks, threads_per_block](d_transactions, d_offsets, num_transactions,
                                                                       num_elements, li_d, ci_d, k, api_d, iil_d,
                                                                       power, api_h.size, iil_h.size)
    cuda.synchronize()
    ci_d.copy_to_host(ci_h)
    print "Final Mask = ", ci_h.reshape(k, k)
    #d_transactions.copy_to_host(transactions)
    #print transactions[:num_elements]

    threads_per_block = (BLOCK_SIZE, BLOCK_SIZE)
    number_of_blocks = ((int(ceil(k / (1.0 * threads_per_block[0])))),
                        (int(ceil(k / (1.0 * threads_per_block[0])))))
    pruneMultipleGPU[number_of_blocks, threads_per_block](ci_d, k, min_support)
    ci_d.copy_to_host(ci_h)
    print "Outer Mask = ", ci_h.reshape(k, k)
    print "K = ", k

    ci_hn = np.zeros(k, dtype=np.int32)
    ci_dn = cuda.to_device(ci_hn)
    combinationsAvailable[threads_per_block, number_of_blocks](ci_d, ci_dn, k)
    ci_dn.copy_to_host(ci_hn)
    print "Ci_hn = ", list(ci_hn)

    ci_hnx = np.empty(k, dtype=np.int32)
    ci_dnx = cuda.to_device(ci_hnx)
    preScan(ci_dnx, ci_dn, k)
    ci_dnx.copy_to_host(ci_hnx)
    num_patterns = ci_hnx[-1]
    print list(ci_hnx)

    sparseM_h = np.empty(ci_hnx[-1] * 3, dtype=np.uint32)
    sparseM_d = cuda.to_device(sparseM_h)

    threads_per_block = (BLOCK_SIZE, 1)
    number_of_blocks = (int(ceil(k / (1.0 * threads_per_block[0]))), 1)
    print "K = ", k
    convert2Sparse[threads_per_block, number_of_blocks](ci_d, ci_dnx, sparseM_d, num_patterns, k)
    sparseM_d.copy_to_host(sparseM_h)
    # sparseM_h = sparseM_h.reshape(3, num_patterns)
    print sparseM_h.reshape(3, num_patterns)

    patterns = {}
    for i in range(0, num_patterns):
        item1 = sparseM_h[i]
        item2 = sparseM_h[i + num_patterns]
        support = sparseM_h[i + 2 * num_patterns]
        patterns[tuple(sorted([li_h[item1], li_h[item2]]))] = support
    print patterns

    actual_patterns = {}
    for pattern in patterns:
        v_common_pat = pattern[0] / (10 ** power)
        vitem1 = pattern[0] % (10 ** power)
        vitem2 = pattern[1] % (10 ** power)

        item1 = actual_pattern_items[index_items_lookup[(vitem1 - 1) * 3 + 1]]
        item2 = actual_pattern_items[index_items_lookup[(vitem2 - 1) * 3 + 1]]

        common_pat_start = index_items_lookup[(v_common_pat - 1) * 3 + 1]
        common_pat_length = index_items_lookup[(v_common_pat - 1) * 3 + 2]
        common_pat_end = common_pat_start + common_pat_length
        common_pattern = actual_pattern_items[common_pat_start:common_pat_end]

        pattern_key = tuple(common_pattern) + tuple(sorted([item1, item2]))
        actual_patterns[pattern_key] = patterns[pattern]

    print "\n=======================================================\n"
    print "L3 = ", actual_patterns
    print "\n=======================================================\n"
    output_file.write(createFormattedPatterns(actual_patterns, 3))

    output_file.close()