def make_fp_tree():
    #### Allocate host memory
    offsets, transactions, num_transactions, all_items_in_transactions = readFile("data.txt")
    print num_transactions, all_items_in_transactions

    flist = np.zeros(MAX_UNIQUE_ITEMS, dtype=np.uint32)

    #### Allocate and initialize GPU/Device memory
    d_offsets = cuda.to_device(offsets)
    d_transactions = cuda.to_device(transactions)
    d_flist = cuda.to_device(flist)

    threads_per_block = (BLOCK_SIZE, 1)
    number_of_blocks = (int(num_transactions / (1.0 * threads_per_block[0])) + 1, 1)

    t1 = time()
    makeFlistGPU[number_of_blocks, threads_per_block](d_offsets, d_transactions, d_flist,
                                                      num_transactions, all_items_in_transactions)
    cuda.synchronize()
    t2 = time()

    d_flist.copy_to_host(flist)
    cuda.synchronize()

    # for i in range(0, MAX_UNIQUE_ITEMS):
    #     print i, flist[i]

    t3 = time()
    flist_cpu = makeFlist(transactions, all_items_in_transactions)
    t4 = time()

    match = 1
    for i in range(1, MAX_UNIQUE_ITEMS):
        if i not in flist_cpu and flist[i] == 0:
            continue
        #print i, flist[i], flist_cpu[i]
        if flist[i] != flist_cpu[i]:
            match = -1
            break

    if match == 1:
        print "Test Passed"
    else:
        print "Test Failed"

    print "Number of transactions = ", num_transactions
    print "All items in transactions = ", all_items_in_transactions
    print "GPU time = ", t2 - t1
    print "CPU TIME = ", t4 - t3
def time_this(kernel, gridsz, blocksz, args):
    timings = []
    cuda.synchronize()
    try:
        for i in range(10):  # best of 10
            ts = timer()
            kernel[gridsz, blocksz](*args)
            cuda.synchronize()
            te = timer()
            timings.append(te - ts)
    except cudadrv.error.CudaDriverError as e:
        print 'exc suppressed', e
        return -1
    return min(timings)  # "best of 10": report the fastest run
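# Hypothetical usage sketch for time_this(), not part of the original listings:
# it times the histogramGPU kernel with the same launch configuration that
# test_histogram() uses. input_d, bins_d, BLOCK_SIZE, NUM_ELEMENTS and ceil are
# assumed to be set up exactly as in that test.
blocksz = (BLOCK_SIZE, 1)
gridsz = (int(ceil(NUM_ELEMENTS / (1.0 * BLOCK_SIZE))), 1)
best = time_this(histogramGPU, gridsz, blocksz, (input_d, bins_d, NUM_ELEMENTS))
if best >= 0:
    print "best of 10 kernel runs = ", best, "s"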
def test_histogram():
    # Allocate host memory
    input_h = np.empty(NUM_ELEMENTS, dtype=np.uint32)
    bins_h = np.zeros(BIN_SIZE, dtype=np.uint32)
    myprint("Bin Size = " + str(bins_h.size))

    ## Initialize host memory
    for i in range(0, NUM_ELEMENTS):
        input_h[i] = randint(0, BIN_SIZE - 1)

    ## Allocate and initialize GPU/device memory
    input_d = cuda.to_device(input_h)
    bins_d = cuda.to_device(bins_h)

    threads_per_block = (BLOCK_SIZE, 1)
    number_of_blocks = (int(ceil(NUM_ELEMENTS / (1.0 * threads_per_block[0]))), 1)  #((NUM_ELEMENTS / threads_per_block[0]) + 1, 1)

    t1 = time()
    histogramGPU[number_of_blocks, threads_per_block](input_d, bins_d, NUM_ELEMENTS)
    cuda.synchronize()
    t2 = time()

    bins_d.copy_to_host(bins_h)

    t3 = time()
    bins_cpu = makeHist(input_h)
    t4 = time()

    # for i in range(0, BIN_SIZE):
    #     print i, bins_h[i], bins_cpu[i]

    print "GPU time = ", t2 - t1
    print "CPU TIME = ", t4 - t3

    match = 1
    for i in range(0, BIN_SIZE):
        if bins_h[i] != bins_cpu[i]:
            match = -1
            break

    if match == 1:
        print "Test Passed"
    else:
        print "Test Failed"
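# The histogramGPU kernel itself is not shown in these listings. The following
# is only a minimal sketch of what an atomic-add histogram kernel of that shape
# typically looks like in Numba CUDA; the real kernel may differ.
from numba import cuda

@cuda.jit
def histogram_kernel(data, bins, n):
    # One thread per input element; atomics resolve concurrent increments
    # of the same bin. Assumes every data[i] is a valid bin index.
    i = cuda.grid(1)
    if i < n:
        cuda.atomic.add(bins, data[i], 1)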
def test_scan():
    in_h = np.empty(NUM_ELEMENTS, dtype=np.uint32)
    out_h = np.zeros(NUM_ELEMENTS, dtype=np.uint32)

    for i in range(0, NUM_ELEMENTS):
        in_h[i] = NUM_ELEMENTS - i - 1  #randint(0, 100)

    tac1 = time()
    in_d = cuda.to_device(in_h)
    out_d = cuda.to_device(out_h)
    cuda.synchronize()
    tac2 = time()

    tk1 = time()
    for i in range(0, 32):
        tk1 = time()
        preScan(out_d, in_d, NUM_ELEMENTS)
        cuda.synchronize()
        tk2 = time()
        print i, tk2 - tk1
    tk2 = time()

    th1 = time()
    out_d.copy_to_host(out_h)
    cuda.synchronize()
    #print "Last = ", out_h[-1] + in_h[-1]
    th2 = time()
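# Host-side reference check for test_scan(), not in the original listings.
# It assumes preScan produces an exclusive prefix sum; the commented-out
# "Last = out_h[-1] + in_h[-1]" line above suggests exactly that.
import numpy as np

def exclusive_scan_reference(a):
    # Exclusive prefix sum of a 1-D array (fine for modest element counts).
    out = np.zeros_like(a)
    out[1:] = np.cumsum(a)[:-1]
    return out

# e.g. inside test_scan(), after copying out_h back from the device:
# assert np.array_equal(out_h, exclusive_scan_reference(in_h))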
def test_sort():
    in_h = np.empty(NUM_ELEMENTS, dtype=np.uint32)  #4, 7, 2, 6, 3, 5, 1, 0
    out_h = np.empty(NUM_ELEMENTS, dtype=np.uint32)

    for i in range(0, NUM_ELEMENTS):
        in_h[i] = NUM_ELEMENTS - i - 1

    in_d = cuda.to_device(in_h)
    out_d = cuda.device_array(NUM_ELEMENTS, dtype=np.uint32)
    temp_d = cuda.device_array(NUM_ELEMENTS, dtype=np.uint32)

    tkg1 = time()
    for bit_shift in range(0, 32):
        tk1 = time()
        #radix_sort(in_d, out_d, temp_d, in_h[NUM_ELEMENTS - 1], bit_shift)
        preScan(out_d, in_d, NUM_ELEMENTS)
        tk2 = time()
        #print bit_shift, tk2 - tk1

        # Rotate the three buffers so this pass's output feeds the next pass.
        in_d, out_d, temp_d = out_d, temp_d, in_d
    tkg2 = time()

    out_d.copy_to_host(out_h)
    cuda.synchronize()

    # line = ""
    # for i in range(0, NUM_ELEMENTS):
    #     line += " " + str(out_h[i])
    # print line

    in_cpu = [NUM_ELEMENTS - i - 1 for i in range(0, NUM_ELEMENTS)]
    tc1 = time()
    in_cpu.sort()
    tc2 = time()

    print "GPU Time = ", tkg2 - tkg1
    print "CPU Time = ", tc2 - tc1
def radix_sort(in_d, out_d, out_scan_d, last_inp_element, bit_shift=0):
    threads_per_block = (BLOCK_SIZE, 1)
    number_of_blocks = (int(ceil(NUM_ELEMENTS / (1.0 * threads_per_block[0]))), 1)

    ################ Bit flip ########################
    SplitGPU[number_of_blocks, threads_per_block](in_d, out_d, NUM_ELEMENTS, bit_shift)
    cuda.synchronize()
    # out_d.copy_to_host(out_h)
    # cuda.synchronize()
    ##################################################

    t1 = time()
    preScan(out_scan_d, out_d, NUM_ELEMENTS)
    cuda.synchronize()
    t2 = time()
    #print "Time = ", t2 - t1
    # out_scan_d.copy_to_host(out_h)
    # cuda.synchronize()

    ###########################################################
    IndexDefineGPU[number_of_blocks, threads_per_block](out_scan_d, out_d, NUM_ELEMENTS, last_inp_element)
    cuda.synchronize()
    # out_scan_d.copy_to_host(out_h)
    # cuda.synchronize()
    ###########################################################

    ###########################################################
    ScatterElementGPU[number_of_blocks, threads_per_block](in_d, out_scan_d, out_d, NUM_ELEMENTS)
    cuda.synchronize()
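# Sketch of the driver loop that the commented-out radix_sort() call in
# test_sort() hints at, not part of the original listings: one split/scan/
# scatter pass per bit, least significant bit first, rotating the three device
# buffers between passes. Which buffer holds the final result depends on the
# (unshown) contract of SplitGPU/ScatterElementGPU, so the rotated handles are
# simply returned here.
def radix_sort_all_bits(in_d, out_d, temp_d, last_inp_element):
    for bit_shift in range(0, 32):
        radix_sort(in_d, out_d, temp_d, last_inp_element, bit_shift)
        # Rotate buffers so this pass's output becomes the next pass's input.
        in_d, out_d, temp_d = out_d, temp_d, in_d
    return in_d, out_d, temp_d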
def test_sort():
    in_h = np.empty(NUM_ELEMENTS, dtype=np.uint32)  #4, 7, 2, 6, 3, 5, 1, 0
    #in_h = np.array([4, 7, 2, 6, 3, 5, 1, 0], dtype=np.uint32)
    out_h = np.empty(NUM_ELEMENTS, dtype=np.uint32)

    for i in range(0, NUM_ELEMENTS):
        in_h[i] = randint(0, 100)  #NUM_ELEMENTS - i - 1

    #in_h = np.array([6, 44, 71, 79, 94, 92, 12, 56, 47, 17, 81, 98, 84, 9, 85, 99], dtype=np.uint32)
    #in_h = np.array([85, 37, 50, 73, 51, 46, 62, 84, 65, 99, 76, 59, 73, 16, 27, 4, 75, 81, 80, 33, 73, 11, 29, 24, 81, 49, 27, 71, 74, 64, 60, 91], dtype=np.uint32)
    print in_h

    in_d = cuda.to_device(in_h)
    out_d = cuda.device_array(NUM_ELEMENTS, dtype=np.uint32)

    tkg1 = time()
    threads_per_block = (BLOCK_SIZE, 1)
    number_of_blocks = (int(ceil(NUM_ELEMENTS / (2 * 1.0 * threads_per_block[0]))), 1)

    RadixGPU[number_of_blocks, threads_per_block](in_d, out_d, NUM_ELEMENTS)
    out_d.copy_to_host(out_h)
    #print "Rad = ", list(out_h)

    stride = 4
    # while stride < NUM_ELEMENTS:
    #     number_of_blocks = (int(ceil(NUM_ELEMENTS / (stride * 1.0 * threads_per_block[0]))), 1)
    #     bitonicSort[number_of_blocks, threads_per_block](out_d, NUM_ELEMENTS, stride)
    #     stride *= 2
    #
    #     # number_of_blocks = (int(ceil(NUM_ELEMENTS / (2 * 1.0 * threads_per_block[0]))), 1)
    #     # RadixGPU[number_of_blocks, threads_per_block](out_d, in_d, NUM_ELEMENTS)
    #     # out_d = in_d
    #     out_d.copy_to_host(out_h)
    #     print "Str = ", list(out_h)
    #     break
    #
    # stride /= 2
    # while stride >= 4:
    #     number_of_blocks = (int(ceil(NUM_ELEMENTS / (stride * 1.0 * threads_per_block[0]))), 1)
    #     bitonicSort[number_of_blocks, threads_per_block](out_d, NUM_ELEMENTS, stride)
    #     stride /= 2
    #     cuda.synchronize()
    #
    # number_of_blocks = (int(ceil(NUM_ELEMENTS / (2 * 1.0 * threads_per_block[0]))), 1)
    # RadixGPU[number_of_blocks, threads_per_block](out_d, in_d, NUM_ELEMENTS)
    # out_d = in_d
    #
    # out_d.copy_to_host(out_h)
    # cuda.synchronize()
    #
    # line = ""
    # for i in range(0, NUM_ELEMENTS):
    #     line += " " + str(out_h[i])
    # print line
    tkg2 = time()

    out_d.copy_to_host(out_h)
    cuda.synchronize()
    #print "GPU = ", list(out_h)

    # line = ""
    # for i in range(0, NUM_ELEMENTS):
    #     line += " " + str(out_h[i])
    # print line

    in_cpu = list(in_h)  #[NUM_ELEMENTS - i - 1 for i in range(0, NUM_ELEMENTS)]
    tc1 = time()
    in_cpu.sort()
    #print "CPU = ", in_cpu
    tc2 = time()

    print "GPU Time = ", tkg2 - tkg1
    print "CPU Time = ", tc2 - tc1
    print len(in_cpu)
# Prepare data on the GPU
dA = cuda.to_device(A)
dB = cuda.to_device(B)
dC = cuda.to_device(C)  # device_array_like(A)

# Time numpy version
s = timer()
np_ans = np.dot(A, B)
e = timer()
t = e - s

# Time the unoptimized version
s = timer()
cu_matmul[grid_dim, block_dim](dA, dB, dC, n)
cuda.synchronize()
e = timer()
unopt_ans = dC.copy_to_host()
tcuda_unopt = e - s

# Time the shared memory version
s = timer()
cu_matmul_sm[grid_dim, block_dim](dA, dB, dC, n, tpb, bpg)
cuda.synchronize()
e = timer()
opt_ans = dC.copy_to_host()
tcuda_opt = e - s

# Time for CuBLAS version
s = timer()
blas = cublas.Blas()
def test_apriori():
    output_file = open("apriori_out.txt", "w")

    offsets, transactions, num_transactions, num_elements = readFile("syncthetic_data.txt")
    print "Offset = ", offsets[:num_transactions]
    print "transactions = ", transactions[:num_elements]
    print "Num transactions = ", num_transactions
    print "Num elements = ", num_elements

    min_support = MIN_SUPPORT

    # to find number of max digits required to represent that many number of unique items
    power = 1
    while MAX_UNIQUE_ITEMS / (10 ** power) != 0:
        power += 1
    print "Power = ", power

    t = [item for item in transactions.tolist()]
    if num_elements > NUM_ELEMENTS:
        print "Error: Elements exceeding NUM_ELEMENTS. Exiting..."
        sys.exit(12)

    input_h = np.array(t, dtype=np.int32)
    print "Input transactions = ", list(input_h)
    print "Size of transactions = ", input_h.size

    ci_h = np.zeros(MAX_UNIQUE_ITEMS, dtype=np.int32)
    li_h = np.empty(MAX_UNIQUE_ITEMS, dtype=np.int32)

    input_d = cuda.to_device(input_h)
    ci_d = cuda.to_device(ci_h)
    li_d = cuda.device_array(MAX_UNIQUE_ITEMS, dtype=np.int32)

    threads_per_block = (BLOCK_SIZE, 1)
    number_of_blocks = (int(ceil(NUM_ELEMENTS / (1.0 * threads_per_block[0]))), 1)  #((NUM_ELEMENTS / threads_per_block[0]) + 1, 1)

    histogramGPU[number_of_blocks, threads_per_block](input_d, ci_d, num_elements)
    #cuda.synchronize()
    ci_d.copy_to_host(ci_h)
    print "Ci_H Histogram result = ", ci_h  # support count for each item

    number_of_blocks = (int(ceil(MAX_UNIQUE_ITEMS / (1.0 * threads_per_block[0]))), 1)
    pruneGPU[number_of_blocks, threads_per_block](ci_d, MAX_UNIQUE_ITEMS, min_support)
    cuda.synchronize()

    ci_d.copy_to_host(ci_h)
    print "Keys = ", [i for i in range(0, len(ci_h))]
    print "Ci_H Pruning result = ", ci_h  # support count for each item

    # calculate concise list of items satisfying min support
    l1_patterns = {}
    k = 0  # number of items whose sup_count > min_support
    for j in range(0, len(ci_h)):
        if ci_h[j] != 0:
            li_h[k] = j
            l1_patterns[(j, )] = ci_h[j]
            k += 1

    print "\n=======================================================\n"
    print "L1 = ", list(li_h)[:k]  # items whose support_count > min_support
    print "\n=======================================================\n"
    output_file.write(createFormattedPatterns(l1_patterns, 1))

    print "K (num_items_with_good_sup_count) = ", k
    #k = 102

    ci_h = np.array([-1 for i in range(0, k ** 2)], dtype=np.int32)
    ci_d = cuda.to_device(ci_h)
    #li_h = np.array(sorted([randint(10, 99) for i in range(0, k)]), dtype=np.int32)
    #tli_h = np.array([i for i in range(1, k + 1)], dtype=np.int32)

    t1 = time()
    li_d = cuda.to_device(li_h)
    number_of_blocks = (int(ceil(k / (1.0 * MAX_ITEM_PER_SM))), 1)
    print "Self join 2 number of blocks = ", number_of_blocks
    print "K = ", k
    print "Ci_H size = ", ci_h.size
    print "LI_H size = ", li_h.size

    selfJoinGPU[number_of_blocks, threads_per_block](li_d, ci_d, k, power)
    cuda.synchronize()
    li_d.copy_to_host(li_h)
    ci_d.copy_to_host(ci_h)
    t2 = time()
    #sys.exit(0)

    # f = open('join.txt', 'w')
    # for i in range(0, k):
    #     line = ""
    #     for j in range(0, k):
    #         line += str(ci_h[k * i + j]) + " "
    #     f.write(line + "\n")
    # f.close()

    #ci_h = ci_h.reshape(k, k)
    print "Initial Mask = ", ci_h.reshape(k, k)
    print "Self joining time = ", (t2 - t1)

    d_offsets = cuda.to_device(offsets)
    d_transactions = cuda.to_device(transactions)

    #number_of_blocks = (1, 1)  #(int(num_transactions / (1.0 * threads_per_block[0])) + 1, 1)
    number_of_blocks = (int(ceil(num_transactions / (1.0 * MAX_TRANSACTIONS_PER_SM))), 1)
    print "Num blocks for findFrequency = ", number_of_blocks
    print "Num transactions = ", num_transactions
    print "Num patterns = ", k
    print "index = ", list(li_h)[:k]

    findFrequencyGPU[number_of_blocks, threads_per_block](d_transactions, d_offsets, num_transactions,
                                                          num_elements, li_d, ci_d, k)
    cuda.synchronize()
    ci_d.copy_to_host(ci_h)
    print "Final Mask = ", ci_h.reshape(k, k)
    d_transactions.copy_to_host(transactions)

    threads_per_block = (BLOCK_SIZE, BLOCK_SIZE)
    number_of_blocks = ((int(ceil(k / (1.0 * threads_per_block[0])))),
                        (int(ceil(k / (1.0 * threads_per_block[0])))))
    pruneMultipleGPU[number_of_blocks, threads_per_block](ci_d, k, min_support)  # prunes according to min_support
    ci_d.copy_to_host(ci_h)
    print "Outer Mask = ", ci_h.reshape(k, k)

    ci_hn = np.zeros(k, dtype=np.int32)
    ci_dn = cuda.to_device(ci_hn)
    combinationsAvailable[threads_per_block, number_of_blocks](ci_d, ci_dn, k)  # Number of possible patterns in each row
    ci_dn.copy_to_host(ci_hn)
    print "Ci_hn = ", list(ci_hn)

    ci_hnx = np.empty(k, dtype=np.int32)
    ci_dnx = cuda.to_device(ci_hnx)
    preScan(ci_dnx, ci_dn, k)  # Prefix sum on patterns in each row
    ci_dnx.copy_to_host(ci_hnx)
    num_patterns = ci_hnx[-1]
    print "Ci_hnx = ", list(ci_hnx)

    sparseM_h = np.empty(ci_hnx[-1] * 3, dtype=np.uint32)
    sparseM_d = cuda.to_device(sparseM_h)

    threads_per_block = (BLOCK_SIZE, 1)
    number_of_blocks = (int(ceil(k / (1.0 * threads_per_block[0]))), 1)
    convert2Sparse[threads_per_block, number_of_blocks](ci_d, ci_dnx, sparseM_d, num_patterns, k)
    sparseM_d.copy_to_host(sparseM_h)
    # sparseM_h = sparseM_h.reshape(3, num_patterns)
    print sparseM_h.reshape(3, num_patterns)

    patterns = {}
    for i in range(0, num_patterns):
        item1 = sparseM_h[i]
        item2 = sparseM_h[i + num_patterns]
        support = sparseM_h[i + 2 * num_patterns]
        patterns[tuple(sorted([li_h[item1], li_h[item2]]))] = support

    print "\n=======================================================\n"
    print "L2 = ", patterns
    print "\n=======================================================\n"
    output_file.write(createFormattedPatterns(patterns, 2))

    new_modulo_map = {}
    index_id = 1
    actual_pattern_items = []
    index_items_lookup = []
    #patterns = {(2, 3, 5): 1, (2, 3, 6): 1, (2, 3, 7): 1, (2, 4, 5): 1, (2, 4, 7): 1, (3, 5, 7): 1}

    for pattern in sorted(patterns.keys()):
        if pattern[:-1] not in new_modulo_map:
            new_modulo_map[pattern[:-1]] = index_id
            prev_len = len(actual_pattern_items)
            pattern_len = len(pattern[:-1])
            actual_pattern_items += pattern[:-1]
            index_items_lookup += [index_id, prev_len, pattern_len]
            index_id += 1
        if (pattern[-1], ) not in new_modulo_map:
            new_modulo_map[(pattern[-1], )] = index_id
            prev_len = len(actual_pattern_items)
            pattern_len = len([pattern[-1]])
            actual_pattern_items += [pattern[-1]]
            index_items_lookup += [index_id, prev_len, pattern_len]
            index_id += 1

    #print "Actual pattern items = ", actual_pattern_items
    #print "Index lookup = ", index_items_lookup
    print new_modulo_map

    new_patterns = []
    for pattern in patterns:
        new_patterns.append((new_modulo_map[pattern[:-1]], new_modulo_map[(pattern[-1], )]))
    print new_patterns

    new_new_pattern = []
    for pattern in new_patterns:
        new_new_pattern.append(pattern[0] * 10 ** power + pattern[1])
    new_new_pattern.sort()
    print new_new_pattern

    k = len(new_new_pattern)
    li_h = np.array(new_new_pattern, dtype=np.int32)
    ci_h = np.array([-1 for i in range(0, k ** 2)], dtype=np.int32)
    ci_d = cuda.to_device(ci_h)
    #li_h = np.array(sorted([randint(10, 99) for i in range(0, k)]), dtype=np.int32)

    t1 = time()
    li_d = cuda.to_device(li_h)
    number_of_blocks = (int(ceil(k / (1.0 * MAX_ITEM_PER_SM))), 1)
    selfJoinGPU[number_of_blocks, threads_per_block](li_d, ci_d, k, power)
    li_d.copy_to_host(li_h)
    ci_d.copy_to_host(ci_h)

    api_h = np.array(actual_pattern_items, dtype=np.int32)
    iil_h = np.array(index_items_lookup, dtype=np.int32)
    api_d = cuda.to_device(api_h)
    iil_d = cuda.to_device(iil_h)
    print "Api_h = ", list(api_h), " Size = ", api_h.size
    print "IIL_H = ", list(iil_h), " Size = ", iil_h.size
    t2 = time()

    print "LI_H = ", li_h
    print "Initial Mask = ", ci_h.reshape(k, k)

    #number_of_blocks = (1, 1)  #(int(num_transactions / (1.0 * threads_per_block[0])) + 1, 1)
    number_of_blocks = (int(ceil(num_transactions / (1.0 * MAX_TRANSACTIONS_PER_SM))), 1)
    print "Num transactions = ", num_transactions
    print "Num patterns = ", k
    print "index = ", li_h
    print "Size of api_d = ", api_h.size
    print "Size of iil_h = ", iil_h.size

    findHigherPatternFrequencyGPU[number_of_blocks, threads_per_block](d_transactions, d_offsets, num_transactions,
                                                                       num_elements, li_d, ci_d, k, api_d, iil_d,
                                                                       power, api_h.size, iil_h.size)
    cuda.synchronize()
    ci_d.copy_to_host(ci_h)
    print "Final Mask = ", ci_h.reshape(k, k)
    #d_transactions.copy_to_host(transactions)
    #print transactions[:num_elements]

    threads_per_block = (BLOCK_SIZE, BLOCK_SIZE)
    number_of_blocks = ((int(ceil(k / (1.0 * threads_per_block[0])))),
                        (int(ceil(k / (1.0 * threads_per_block[0])))))
    pruneMultipleGPU[number_of_blocks, threads_per_block](ci_d, k, min_support)
    ci_d.copy_to_host(ci_h)
    print "Outer Mask = ", ci_h.reshape(k, k)
    print "K = ", k

    ci_hn = np.zeros(k, dtype=np.int32)
    ci_dn = cuda.to_device(ci_hn)
    combinationsAvailable[threads_per_block, number_of_blocks](ci_d, ci_dn, k)
    ci_dn.copy_to_host(ci_hn)
    print "Ci_hn = ", list(ci_hn)

    ci_hnx = np.empty(k, dtype=np.int32)
    ci_dnx = cuda.to_device(ci_hnx)
    preScan(ci_dnx, ci_dn, k)
    ci_dnx.copy_to_host(ci_hnx)
    num_patterns = ci_hnx[-1]
    print list(ci_hnx)

    sparseM_h = np.empty(ci_hnx[-1] * 3, dtype=np.uint32)
    sparseM_d = cuda.to_device(sparseM_h)

    threads_per_block = (BLOCK_SIZE, 1)
    number_of_blocks = (int(ceil(k / (1.0 * threads_per_block[0]))), 1)
    print "K = ", k
    convert2Sparse[threads_per_block, number_of_blocks](ci_d, ci_dnx, sparseM_d, num_patterns, k)
    sparseM_d.copy_to_host(sparseM_h)
    # sparseM_h = sparseM_h.reshape(3, num_patterns)
    print sparseM_h.reshape(3, num_patterns)

    patterns = {}
    for i in range(0, num_patterns):
        item1 = sparseM_h[i]
        item2 = sparseM_h[i + num_patterns]
        support = sparseM_h[i + 2 * num_patterns]
        patterns[tuple(sorted([li_h[item1], li_h[item2]]))] = support
    print patterns

    actual_patterns = {}
    for pattern in patterns:
        v_common_pat = pattern[0] / (10 ** power)
        vitem1 = pattern[0] % (10 ** power)
        vitem2 = pattern[1] % (10 ** power)

        item1 = actual_pattern_items[index_items_lookup[(vitem1 - 1) * 3 + 1]]
        item2 = actual_pattern_items[index_items_lookup[(vitem2 - 1) * 3 + 1]]

        common_pat_start = index_items_lookup[(v_common_pat - 1) * 3 + 1]
        common_pat_length = index_items_lookup[(v_common_pat - 1) * 3 + 2]
        common_pat_end = common_pat_start + common_pat_length
        common_pattern = actual_pattern_items[common_pat_start:common_pat_end]

        pattern_key = tuple(common_pattern) + tuple(sorted([item1, item2]))
        actual_patterns[pattern_key] = patterns[pattern]

    print "\n=======================================================\n"
    print "L3 = ", actual_patterns
    print "\n=======================================================\n"
    output_file.write(createFormattedPatterns(actual_patterns, 3))

    output_file.close()