def parallel_sort(key_arrs, data, ascending=True):
    n_local = len(key_arrs[0])
    n_total = hpat.distributed_api.dist_reduce(n_local, np.int32(Reduce_Type.Sum.value))

    n_pes = hpat.distributed_api.get_size()
    my_rank = hpat.distributed_api.get_rank()

    # similar to Spark's sample computation Partitioner.scala
    sampleSize = min(samplePointsPerPartitionHint * n_pes, MIN_SAMPLES)

    fraction = min(sampleSize / max(n_total, 1), 1.0)
    n_loc_samples = min(math.ceil(fraction * n_local), n_local)
    inds = np.random.randint(0, n_local, n_loc_samples)
    samples = key_arrs[0][inds]
    # print(sampleSize, fraction, n_local, n_loc_samples, len(samples))

    all_samples = hpat.distributed_api.gatherv(samples)
    all_samples = to_string_list(all_samples)
    bounds = empty_like_type(n_pes - 1, all_samples)

    if my_rank == MPI_ROOT:
        all_samples.sort()
        if not ascending:
            all_samples = all_samples[::-1]
        n_samples = len(all_samples)
        step = math.ceil(n_samples / n_pes)
        for i in range(n_pes - 1):
            bounds[i] = all_samples[min((i + 1) * step, n_samples - 1)]
        # print(bounds)

    bounds = str_list_to_array(bounds)
    bounds = hpat.distributed_api.prealloc_str_for_bcast(bounds)
    hpat.distributed_api.bcast(bounds)

    # calc send/recv counts
    pre_shuffle_meta = alloc_pre_shuffle_metadata(key_arrs, data, n_pes, True)
    node_id = 0
    for i in range(n_local):
        val = key_arrs[0][i]
        # TODO: refactor
        if node_id < (n_pes - 1) and (ascending and val >= bounds[node_id]
                or (not ascending) and val <= bounds[node_id]):
            node_id += 1
        update_shuffle_meta(pre_shuffle_meta, node_id, i, (val,),
                            getitem_arr_tup(data, i), True)

    shuffle_meta = finalize_shuffle_meta(key_arrs, data, pre_shuffle_meta, n_pes, True)

    # shuffle
    recvs = alltoallv_tup(key_arrs + data, shuffle_meta)
    out_key = _get_keys_tup(recvs, key_arrs)
    out_data = _get_data_tup(recvs, key_arrs)

    return out_key, out_data
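# Illustrative sketch (not part of HPAT): how the root rank derives the
# n_pes - 1 splitter bounds from the gathered samples, as in the
# my_rank == MPI_ROOT branch above. Plain Python only; the helper name
# pick_bounds_from_samples is hypothetical.
import math

def pick_bounds_from_samples(all_samples, n_pes, ascending=True):
    srt = sorted(all_samples, reverse=not ascending)
    n_samples = len(srt)
    step = math.ceil(n_samples / n_pes)
    # one splitter between each pair of adjacent destination ranks
    return [srt[min((i + 1) * step, n_samples - 1)] for i in range(n_pes - 1)]

# pick_bounds_from_samples([3, 1, 7, 9, 4, 6, 2, 8], 4) -> [3, 6, 8]
# (rank 0 then receives keys < 3, rank 1 keys in [3, 6), rank 2 [6, 8), rank 3 >= 8)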
def local_merge_new(left_key, right_key, data_left, data_right):
    curr_size = 101 + min(len(left_key), len(right_key)) // 10
    out_left_key = empty_like_type(curr_size, left_key)
    out_data_left = alloc_arr_tup(curr_size, data_left)
    out_data_right = alloc_arr_tup(curr_size, data_right)

    out_ind = 0
    left_ind = 0
    right_ind = 0

    while left_ind < len(left_key) and right_ind < len(right_key):
        if left_key[left_ind] == right_key[right_ind]:
            out_left_key = copy_elem_buff(out_left_key, out_ind, left_key[left_ind])
            l_data_val = getitem_arr_tup(data_left, left_ind)
            out_data_left = copy_elem_buff_tup(out_data_left, out_ind, l_data_val)
            r_data_val = getitem_arr_tup(data_right, right_ind)
            out_data_right = copy_elem_buff_tup(out_data_right, out_ind, r_data_val)
            out_ind += 1

            left_run = left_ind + 1
            while left_run < len(left_key) and left_key[left_run] == right_key[right_ind]:
                out_left_key = copy_elem_buff(out_left_key, out_ind, left_key[left_run])
                l_data_val = getitem_arr_tup(data_left, left_run)
                out_data_left = copy_elem_buff_tup(out_data_left, out_ind, l_data_val)
                r_data_val = getitem_arr_tup(data_right, right_ind)
                out_data_right = copy_elem_buff_tup(out_data_right, out_ind, r_data_val)
                out_ind += 1
                left_run += 1

            right_run = right_ind + 1
            while right_run < len(right_key) and right_key[right_run] == left_key[left_ind]:
                out_left_key = copy_elem_buff(out_left_key, out_ind, left_key[left_ind])
                l_data_val = getitem_arr_tup(data_left, left_ind)
                out_data_left = copy_elem_buff_tup(out_data_left, out_ind, l_data_val)
                r_data_val = getitem_arr_tup(data_right, right_run)
                out_data_right = copy_elem_buff_tup(out_data_right, out_ind, r_data_val)
                out_ind += 1
                right_run += 1

            left_ind += 1
            right_ind += 1
        elif left_key[left_ind] < right_key[right_ind]:
            left_ind += 1
        else:
            right_ind += 1

    # out_left_key = out_left_key[:out_ind]
    out_left_key = trim_arr(out_left_key, out_ind)
    out_right_key = out_left_key.copy()
    out_data_left = trim_arr_tup(out_data_left, out_ind)
    out_data_right = trim_arr_tup(out_data_right, out_ind)
    return out_left_key, out_right_key, out_data_left, out_data_right
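# Illustrative sketch (not part of HPAT): the same sorted-merge inner-join
# pairing as local_merge_new, on plain Python lists and without the buffer
# helpers. merge_sorted_keys is a hypothetical name; it returns
# (left_index, right_index) pairs of matching rows.
def merge_sorted_keys(left_key, right_key):
    out = []
    left_ind = right_ind = 0
    while left_ind < len(left_key) and right_ind < len(right_key):
        if left_key[left_ind] == right_key[right_ind]:
            out.append((left_ind, right_ind))
            # emit the rest of the equal run on the left against the current right row
            left_run = left_ind + 1
            while left_run < len(left_key) and left_key[left_run] == right_key[right_ind]:
                out.append((left_run, right_ind))
                left_run += 1
            # emit the rest of the equal run on the right against the current left row
            right_run = right_ind + 1
            while right_run < len(right_key) and right_key[right_run] == left_key[left_ind]:
                out.append((left_ind, right_run))
                right_run += 1
            left_ind += 1
            right_ind += 1
        elif left_key[left_ind] < right_key[right_ind]:
            left_ind += 1
        else:
            right_ind += 1
    return out

# merge_sorted_keys([1, 2, 2, 5], [2, 2, 3, 5])
# -> [(1, 0), (2, 0), (1, 1), (2, 1), (3, 3)]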
def parallel_sort(key_arr, data):
    n_local = len(key_arr)
    n_total = hpat.distributed_api.dist_reduce(n_local, np.int32(Reduce_Type.Sum.value))

    n_pes = hpat.distributed_api.get_size()
    my_rank = hpat.distributed_api.get_rank()

    # similar to Spark's sample computation Partitioner.scala
    sampleSize = min(samplePointsPerPartitionHint * n_pes, MIN_SAMPLES)

    fraction = min(sampleSize / max(n_total, 1), 1.0)
    n_loc_samples = min(math.ceil(fraction * n_local), n_local)
    inds = np.random.randint(0, n_local, n_loc_samples)
    samples = key_arr[inds]
    # print(sampleSize, fraction, n_local, n_loc_samples, len(samples))

    all_samples = hpat.distributed_api.gatherv(samples)
    all_samples = to_string_list(all_samples)
    bounds = empty_like_type(n_pes - 1, all_samples)

    if my_rank == MPI_ROOT:
        all_samples.sort()
        n_samples = len(all_samples)
        step = math.ceil(n_samples / n_pes)
        for i in range(n_pes - 1):
            bounds[i] = all_samples[min((i + 1) * step, n_samples - 1)]
        # print(bounds)

    bounds = str_list_to_array(bounds)
    bounds = hpat.distributed_api.prealloc_str_for_bcast(bounds)
    hpat.distributed_api.bcast(bounds)

    # calc send/recv counts
    shuffle_meta = alloc_shuffle_metadata(key_arr, n_pes, True)
    data_shuffle_meta = data_alloc_shuffle_metadata(data, n_pes, True)
    node_id = 0
    for i in range(n_local):
        val = key_arr[i]
        if node_id < (n_pes - 1) and val >= bounds[node_id]:
            node_id += 1
        update_shuffle_meta(shuffle_meta, node_id, i, val)
        update_data_shuffle_meta(data_shuffle_meta, node_id, i, data)

    finalize_shuffle_meta(key_arr, shuffle_meta, True)
    finalize_data_shuffle_meta(data, data_shuffle_meta, shuffle_meta, True)

    # shuffle
    alltoallv(key_arr, shuffle_meta)
    out_data = alltoallv_tup(data, data_shuffle_meta, shuffle_meta)

    return shuffle_meta.out_arr, out_data
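# Illustrative sketch (not part of HPAT): how the node_id loop above assigns
# each row to a destination rank by scanning the broadcast bounds. Assumes
# the local keys are already sorted ascending (so node_id only ever advances);
# dest_ranks is a hypothetical helper name.
def dest_ranks(sorted_keys, bounds):
    n_pes = len(bounds) + 1
    node_id = 0
    out = []
    for val in sorted_keys:
        # move to the next rank once the key reaches that rank's lower bound
        if node_id < (n_pes - 1) and val >= bounds[node_id]:
            node_id += 1
        out.append(node_id)
    return out

# dest_ranks([1, 2, 3, 4, 6, 7, 8, 9], [3, 6, 8]) -> [0, 0, 1, 1, 2, 2, 3, 3]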
def gatherv_impl(data):
    rank = hpat.distributed_api.get_rank()
    n_loc = len(data)
    recv_counts = gather_scalar(np.int32(n_loc))
    n_total = recv_counts.sum()
    all_data = empty_like_type(n_total, data)
    # displacements
    displs = np.empty(1, np.int32)
    if rank == MPI_ROOT:
        displs = hpat.hiframes.join.calc_disp(recv_counts)
    c_gatherv(data.ctypes, np.int32(n_loc), all_data.ctypes,
              recv_counts.ctypes, displs.ctypes, np.int32(typ_val))
    return all_data
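# Illustrative sketch (not part of HPAT): the displacements consumed by the
# underlying MPI_Gatherv call are an exclusive prefix sum of recv_counts.
# calc_disp_sketch is a hypothetical stand-in for hpat.hiframes.join.calc_disp.
import numpy as np

def calc_disp_sketch(recv_counts):
    displs = np.zeros(len(recv_counts), np.int32)
    displs[1:] = np.cumsum(recv_counts)[:-1]  # offset of each rank's chunk in all_data
    return displs

# calc_disp_sketch(np.array([4, 2, 3], np.int32)) -> array([0, 4, 6], dtype=int32)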
def local_merge_asof(left_key, right_key, data_left, data_right):
    # adapted from pandas/_libs/join_func_helper.pxi
    l_size = len(left_key)
    r_size = len(right_key)

    out_left_key = empty_like_type(l_size, left_key)
    out_right_key = empty_like_type(l_size, right_key)
    out_data_left = alloc_arr_tup(l_size, data_left)
    out_data_right = alloc_arr_tup(l_size, data_right)

    left_ind = 0
    right_ind = 0

    for left_ind in range(l_size):
        # restart right_ind if it went negative in a previous iteration
        if right_ind < 0:
            right_ind = 0

        # find last position in right whose value is less than left's
        while right_ind < r_size and right_key[right_ind] <= left_key[left_ind]:
            right_ind += 1

        right_ind -= 1

        out_left_key[left_ind] = left_key[left_ind]
        # TODO: copy_tup
        setitem_arr_tup(out_data_left, left_ind, getitem_arr_tup(data_left, left_ind))

        if right_ind >= 0:
            out_right_key[left_ind] = right_key[right_ind]
            setitem_arr_tup(out_data_right, left_ind, getitem_arr_tup(data_right, right_ind))
        else:
            setitem_arr_nan(out_right_key, left_ind)
            setitem_arr_tup_nan(out_data_right, left_ind)

    return out_left_key, out_right_key, out_data_left, out_data_right
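# Illustrative sketch (not part of HPAT): the backward "asof" matching used
# above, on plain Python lists. For each left key it returns the index of the
# last right key that is <= the left key, or -1 when there is none (the NaN
# case). Assumes left_key is sorted ascending; asof_indices is a hypothetical name.
def asof_indices(left_key, right_key):
    out = []
    right_ind = 0
    for lk in left_key:
        if right_ind < 0:
            right_ind = 0
        # advance past every right key that is still <= the current left key
        while right_ind < len(right_key) and right_key[right_ind] <= lk:
            right_ind += 1
        right_ind -= 1
        out.append(right_ind)
    return out

# asof_indices([1, 5, 10], [2, 3, 7]) -> [-1, 1, 2]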
def ensureCapacity(self, minCapacity):
    if self.tmpLength < minCapacity:
        # Compute smallest power of 2 > minCapacity
        newSize = minCapacity
        newSize |= newSize >> 1
        newSize |= newSize >> 2
        newSize |= newSize >> 4
        newSize |= newSize >> 8
        newSize |= newSize >> 16
        newSize += 1

        if newSize < 0:  # Not bloody likely!
            newSize = minCapacity
        else:
            newSize = min(newSize, self.aLength >> 1)

        self.tmp = empty_like_type(newSize, self.key_arr)
        self.tmp_data = alloc_arr_tup(newSize, self.data)
        self.tmpLength = newSize
    return self.tmp
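# Illustrative sketch (not part of HPAT): the bit-smearing sequence above rounds
# minCapacity up to the next power of two. next_pow2 is a hypothetical helper
# shown only to make the sequence of shifts concrete for 32-bit sizes.
def next_pow2(min_capacity):
    n = min_capacity
    n |= n >> 1   # smear the highest set bit into the bit below it
    n |= n >> 2   # ... then into the next 2 bits
    n |= n >> 4
    n |= n >> 8
    n |= n >> 16  # after this, every bit below the highest set bit is 1
    return n + 1  # adding 1 yields the next power of two

# next_pow2(33) -> 64, next_pow2(64) -> 128 (smallest power of 2 > minCapacity)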
def __init__(self, key_arr, aLength, data):
    self.key_arr = key_arr
    self.data = data
    self.aLength = aLength

    # This controls when we get *into* galloping mode. It is initialized
    # to MIN_GALLOP. The mergeLo and mergeHi methods nudge it higher for
    # random data, and lower for highly structured data.
    self.minGallop = MIN_GALLOP

    arr_len = aLength

    # Allocate temp storage (which may be increased later if necessary)
    self.tmpLength = (arr_len >> 1 if arr_len < 2 * INITIAL_TMP_STORAGE_LENGTH
                      else INITIAL_TMP_STORAGE_LENGTH)
    self.tmp = empty_like_type(self.tmpLength, self.key_arr)
    self.tmp_data = alloc_arr_tup(self.tmpLength, data)

    # A stack of pending runs yet to be merged. Run i starts at
    # address base[i] and extends for len[i] elements. It's always
    # true (so long as the indices are in bounds) that:
    #
    #     runBase[i] + runLen[i] == runBase[i + 1]
    #
    # so we could cut the storage for this, but it's a minor amount,
    # and keeping all the info explicit simplifies the code.

    # Allocate runs-to-be-merged stack (which cannot be expanded). The
    # stack length requirements are described in listsort.txt. The C
    # version always uses the same stack length (85), but this was
    # measured to be too expensive when sorting "mid-sized" arrays (e.g.,
    # 100 elements) in Java. Therefore, we use smaller (but sufficiently
    # large) stack lengths for smaller arrays. The "magic numbers" in the
    # computation below must be changed if MIN_MERGE is decreased. See
    # the MIN_MERGE declaration above for more information.
    self.stackSize = 0  # Number of pending runs on stack
    stackLen = (5 if arr_len < 120 else
                (10 if arr_len < 1542 else
                 (19 if arr_len < 119151 else 40)))
    self.runBase = np.empty(stackLen, np.int64)
    self.runLen = np.empty(stackLen, np.int64)
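# Illustrative sketch (not part of HPAT): the run-stack sizing above as a
# standalone helper, showing which stack length each input size maps to;
# pick_stack_len is a hypothetical name.
def pick_stack_len(arr_len):
    if arr_len < 120:
        return 5
    if arr_len < 1542:
        return 10
    if arr_len < 119151:
        return 19
    return 40

# pick_stack_len(100) -> 5, pick_stack_len(10_000) -> 19, pick_stack_len(10**6) -> 40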