Пример #1
0
 def scan_block(data, sums):
     sm_data = hsa.shared.array(128, dtype=intp)
     tid = hsa.get_local_id(0)
     gid = hsa.get_global_id(0)
     blkid = hsa.get_group_id(0)
     sm_data[tid] = data[gid]
     prefixsum = device_scan_generic(tid, sm_data)
     data[gid] = sm_data[tid]
     sums[blkid, tid] = prefixsum
Пример #2
0
 def scan_block(data, sums):
     sm_data = hsa.shared.array(128, dtype=intp)
     tid = hsa.get_local_id(0)
     gid = hsa.get_global_id(0)
     blkid = hsa.get_group_id(0)
     sm_data[tid] = data[gid]
     prefixsum = device_scan_generic(tid, sm_data)
     data[gid] = sm_data[tid]
     sums[blkid, tid] = prefixsum
Пример #3
0
        def scan_block(data, sums):
            sm_data = hsa.shared.array(128, dtype=intp)
            tid = hsa.get_local_id(0)
            gid = hsa.get_global_id(0)
            blkid = hsa.get_group_id(0)

            scanval, prefixsum = device_scan(tid, data[gid], sm_data, False)

            data[gid] = scanval
            sums[blkid, tid] = prefixsum
Пример #4
0
        def scan_block(data, sums):
            sm_data = hsa.shared.array(128, dtype=intp)
            tid = hsa.get_local_id(0)
            gid = hsa.get_global_id(0)
            blkid = hsa.get_group_id(0)

            scanval, prefixsum = device_scan(tid, data[gid], sm_data,
                                             False)

            data[gid] = scanval
            sums[blkid, tid] = prefixsum
Пример #5
0
        def udt(output):
            global_id = hsa.get_global_id(0)
            global_size = hsa.get_global_size(0)
            local_id = hsa.get_local_id(0)
            group_id = hsa.get_group_id(0)
            num_groups = hsa.get_num_groups(0)
            workdim = hsa.get_work_dim()
            local_size = hsa.get_local_size(0)

            output[0, group_id, local_id] = global_id
            output[1, group_id, local_id] = global_size
            output[2, group_id, local_id] = local_id
            output[3, group_id, local_id] = local_size
            output[4, group_id, local_id] = group_id
            output[5, group_id, local_id] = num_groups
            output[6, group_id, local_id] = workdim
Пример #6
0
        def kernel_scatter(size, shift, shuffled, scanblocksum, localscan,
                           shuffled_sorted, indices, indices_sorted,
                           store_indices):
            tid = hsa.get_local_id(0)
            blkid = hsa.get_group_id(0)
            gid = hsa.get_global_id(0)

            if gid < size:
                curdata = uintp(shuffled[blkid, tid])
                data_radix = uintp((curdata >> uintp(shift)) &
                                   uintp(RADIX_MINUS_1))
                pos = scanblocksum[data_radix, blkid] + localscan[blkid, tid]
                shuffled_sorted[pos] = curdata

                if store_indices:
                    indices_sorted[pos] = indices[gid]
Пример #7
0
    def kernel_reduce_min(inp, out, nelem):
        tid = hsa.get_local_id(0)
        blkid = hsa.get_group_id(0)
        blksz = hsa.get_local_size(0)
        numgroup = hsa.get_num_groups(0)

        i = blkid * blksz + tid

        accum = dtype(POS_INF)
        while i < nelem:
            accum = min(accum, inp[i])
            i += blksz * numgroup

        accum = group_reducer(accum)
        if tid == 0:
            out[blkid] = accum
Пример #8
0
        def udt(output):
            global_id = hsa.get_global_id(0)
            global_size = hsa.get_global_size(0)
            local_id = hsa.get_local_id(0)
            group_id = hsa.get_group_id(0)
            num_groups = hsa.get_num_groups(0)
            workdim = hsa.get_work_dim()
            local_size = hsa.get_local_size(0)

            output[0, group_id, local_id] = global_id
            output[1, group_id, local_id] = global_size
            output[2, group_id, local_id] = local_id
            output[3, group_id, local_id] = local_size
            output[4, group_id, local_id] = group_id
            output[5, group_id, local_id] = num_groups
            output[6, group_id, local_id] = workdim
Пример #9
0
    def kernel_reduce_min(inp, out, nelem):
        tid = hsa.get_local_id(0)
        blkid = hsa.get_group_id(0)
        blksz = hsa.get_local_size(0)
        numgroup = hsa.get_num_groups(0)

        i = blkid * blksz + tid

        accum = dtype(POS_INF)
        while i < nelem:
            accum = min(accum, inp[i])
            i += blksz * numgroup

        accum = group_reducer(accum)
        if tid == 0:
            out[blkid] = accum
Пример #10
0
    def hsa_uni_kde(support, samples, bandwidth, pdf):
        gid = hsa.get_group_id(0)
        tid = hsa.get_local_id(0)
        tsz = hsa.get_local_size(0)

        supp = support[gid]

        # all local threads cooperatively computes the energy for a support
        energy = 0
        for base in range(0, samples.size, tsz):
            idx = tid + base
            if idx < samples.size:
                energy += kernel((samples[idx] - supp) / bandwidth) / bandwidth

        # reduce energy
        total = group_reduce_sum_float64(energy)
        if tid == 0:
            pdf[gid] = total / samples.size
Пример #11
0
    def hsa_uni_kde(support, samples, bandwidth, pdf):
        gid = hsa.get_group_id(0)
        tid = hsa.get_local_id(0)
        tsz = hsa.get_local_size(0)

        supp = support[gid]

        # all local threads cooperatively computes the energy for a support
        energy = 0
        for base in range(0, samples.size, tsz):
            idx = tid + base
            if idx < samples.size:
                energy += kernel((samples[idx] - supp) / bandwidth) / bandwidth

        # reduce energy
        total = group_reduce_sum_float64(energy)
        if tid == 0:
            pdf[gid] = total / samples.size
Пример #12
0
        def kernel_local_shuffle(data, size, shift, blocksum, localscan,
                                 shuffled, indices, store_indices):
            tid = hsa.get_local_id(0)
            blkid = hsa.get_group_id(0)
            blksz = localscan.shape[1]

            sm_mask = hsa.shared.array(shape=mask_shape, dtype=int32)
            sm_blocksum = hsa.shared.array(shape=4, dtype=int32)
            sm_shuffled = hsa.shared.array(shape=block_size, dtype=uintp)
            sm_indices = hsa.shared.array(shape=block_size, dtype=uintp)
            sm_localscan = hsa.shared.array(shape=block_size, dtype=int32)
            sm_localscan[tid] = -1

            dataid = blkid * blksz + tid
            valid = dataid < size and tid < blksz
            curdata = uintp(data[dataid] if valid else uintp(0))
            processed_data = uintp((curdata >> uintp(shift)) &
                                   uintp(RADIX_MINUS_1))

            chunk_offset, scanval = four_way_scan(processed_data, sm_mask,
                                                  sm_blocksum, blksz, valid)

            if tid < RADIX:
                blocksum[tid, blkid] = sm_blocksum[tid]

            if tid < blksz:
                # Store local scan value
                where = chunk_offset + scanval
                # Store shuffled value and indices
                shuffled[blkid, where] = curdata
                if store_indices and valid:
                    sm_indices[where] = indices[dataid]
                sm_localscan[where] = scanval

            # Cleanup
            hsa.barrier()
            if tid < blksz:
                # shuffled[blkid, tid] = sm_shuffled[tid]
                if store_indices and valid:
                    indices[dataid] = sm_indices[tid]
                localscan[blkid, tid] = sm_localscan[tid]
Пример #13
0
 def udt(output):
     global_id = hsa.get_global_id(0)
     group_id = hsa.get_group_id(0)
     output[global_id] = group_id + 1