def _merge_blocks(self, merge_chunks):
    """
    Merge a group of (_JoinUnit, Block) chunks into a single new Block.

    merge_chunks -> [(_JoinUnit, Block)]
    """
    funit, fblock = merge_chunks[0]
    fidx = funit.indexer

    # Result shape: axis 0 holds all chunks stacked; along the join axis,
    # the indexer length (or the original size when there is no reindex).
    out_shape = list(fblock.values.shape)
    n = len(fidx) if fidx is not None else out_shape[self.axis]
    out_shape[0] = sum(len(blk) for unit, blk in merge_chunks)
    out_shape[self.axis] = n

    # Should use Fortran order??
    block_dtype = _get_block_dtype([x[1] for x in merge_chunks])
    out = np.empty(out_shape, dtype=block_dtype)

    # Copy each chunk's reindexed values into its slice of the output.
    sofar = 0
    for unit, blk in merge_chunks:
        out_chunk = out[sofar: sofar + len(blk)]
        com.take_nd(blk.values, unit.indexer, self.axis, out=out_chunk)
        sofar += len(blk)

    # does not sort
    new_block_items = _concat_indexes([b.items for _, b in merge_chunks])
    return make_block(out, new_block_items, self.result_items)
def _test_dtype(dtype, can_hold_na):
    """Check take_nd against ndarray.take for one dtype, with/without -1s."""
    data = np.random.randint(0, 2, (5, 3)).astype(dtype)

    # plain indexer: output must match ndarray.take along both axes
    indexer = [2, 1, 0, 1]
    out0 = np.empty((4, 3), dtype=dtype)
    out1 = np.empty((5, 4), dtype=dtype)
    com.take_nd(data, indexer, out=out0, axis=0)
    com.take_nd(data, indexer, out=out1, axis=1)
    expected0 = data.take(indexer, axis=0)
    expected1 = data.take(indexer, axis=1)
    tm.assert_almost_equal(out0, expected0)
    tm.assert_almost_equal(out1, expected1)

    # indexer containing -1 (missing entry)
    indexer = [2, 1, 0, -1]
    out0 = np.empty((4, 3), dtype=dtype)
    out1 = np.empty((5, 4), dtype=dtype)
    if can_hold_na:
        # NA-capable dtypes: the missing position is NaN-filled
        com.take_nd(data, indexer, out=out0, axis=0)
        com.take_nd(data, indexer, out=out1, axis=1)
        expected0 = data.take(indexer, axis=0)
        expected1 = data.take(indexer, axis=1)
        expected0[3, :] = np.nan
        expected1[:, 3] = np.nan
        tm.assert_almost_equal(out0, expected0)
        tm.assert_almost_equal(out1, expected1)
    else:
        # dtypes that cannot hold NA must raise TypeError when given an
        # out= buffer (NOTE(review): nested helper — relies on
        # self.fill_error from the enclosing test class)
        for i, out in enumerate([out0, out1]):
            with tm.assertRaisesRegexp(TypeError, self.fill_error):
                com.take_nd(data, indexer, out=out, axis=i)
            # no exception o/w
            data.take(indexer, out=out, axis=i)
def _merge_blocks(self, merge_chunks):
    """
    Merge a group of (_JoinUnit, Block) chunks into a single new Block.

    merge_chunks -> [(_JoinUnit, Block)]
    """
    funit, fblock = merge_chunks[0]
    fidx = funit.indexer

    # Result shape: axis 0 holds all chunks stacked; along the join axis,
    # the indexer length (or the original size when there is no reindex).
    out_shape = list(fblock.values.shape)
    n = len(fidx) if fidx is not None else out_shape[self.axis]
    out_shape[0] = sum(len(blk) for unit, blk in merge_chunks)
    out_shape[self.axis] = n

    # Should use Fortran order??
    block_dtype = _get_block_dtype([x[1] for x in merge_chunks])
    out = np.empty(out_shape, dtype=block_dtype)

    # Copy each chunk's reindexed values into its slice of the output.
    sofar = 0
    for unit, blk in merge_chunks:
        out_chunk = out[sofar:sofar + len(blk)]
        com.take_nd(blk.values, unit.indexer, self.axis, out=out_chunk)
        sofar += len(blk)

    # does not sort
    new_block_items = _concat_indexes([b.items for _, b in merge_chunks])
    return make_block(out, new_block_items, self.result_items)
def test_2d_bool(self):
    """take_nd on 2D bool matches ndarray.take; a -1 upcasts to object."""
    arr = np.array([[0, 1, 0],
                    [1, 0, 1],
                    [0, 1, 1]], dtype=bool)

    # without missing entries, both axes must match plain ndarray.take
    picks = [0, 2, 2, 1]
    for ax in (0, 1):
        got = com.take_nd(arr, picks, axis=ax)
        want = arr.take(picks, axis=ax)
        self.assert_(np.array_equal(got, want))

    # bool cannot represent NA, so a -1 entry forces an object result
    got = com.take_nd(arr, [0, 2, -1])
    self.assert_(got.dtype == np.object_)
def _test_dtype(dtype, can_hold_na):
    """Check take_nd against ndarray.take for one dtype, with/without -1s."""
    data = np.random.randint(0, 2, (5, 3)).astype(dtype)

    # plain indexer: output must match ndarray.take along both axes
    indexer = [2, 1, 0, 1]
    out0 = np.empty((4, 3), dtype=dtype)
    out1 = np.empty((5, 4), dtype=dtype)
    com.take_nd(data, indexer, out=out0, axis=0)
    com.take_nd(data, indexer, out=out1, axis=1)
    expected0 = data.take(indexer, axis=0)
    expected1 = data.take(indexer, axis=1)
    tm.assert_almost_equal(out0, expected0)
    tm.assert_almost_equal(out1, expected1)

    # indexer containing -1 (missing entry)
    indexer = [2, 1, 0, -1]
    out0 = np.empty((4, 3), dtype=dtype)
    out1 = np.empty((5, 4), dtype=dtype)
    if can_hold_na:
        # NA-capable dtypes: the missing position is NaN-filled
        com.take_nd(data, indexer, out=out0, axis=0)
        com.take_nd(data, indexer, out=out1, axis=1)
        expected0 = data.take(indexer, axis=0)
        expected1 = data.take(indexer, axis=1)
        expected0[3, :] = np.nan
        expected1[:, 3] = np.nan
        tm.assert_almost_equal(out0, expected0)
        tm.assert_almost_equal(out1, expected1)
    else:
        # dtypes that cannot hold NA must raise when given an out= buffer
        self.assertRaises(Exception, com.take_nd, data,
                          indexer, out=out0, axis=0)
        self.assertRaises(Exception, com.take_nd, data,
                          indexer, out=out1, axis=1)
        # no exception o/w
        data.take(indexer, out=out0, axis=0)
        data.take(indexer, out=out1, axis=1)
def test_2d_float32(self):
    """take_nd on float32: -1 entries become NaN; out= buffers accepted."""
    arr = np.random.randn(4, 3).astype(np.float32)
    indexer = [0, 2, -1, 1, -1]

    # axis=0: result with and without an out= buffer must agree
    result = com.take_nd(arr, indexer, axis=0)
    result2 = np.empty_like(result)
    com.take_nd(arr, indexer, axis=0, out=result2)
    tm.assert_almost_equal(result, result2)
    expected = arr.take(indexer, axis=0)
    expected[[2, 4], :] = np.nan
    tm.assert_almost_equal(result, expected)

    # a pre-allocated float32 out buffer is also accepted
    out = np.empty((len(indexer), arr.shape[1]), dtype='float32')
    com.take_nd(arr, indexer, out=out)  # it works!

    # axis=1
    result = com.take_nd(arr, indexer, axis=1)
    result2 = np.empty_like(result)
    com.take_nd(arr, indexer, axis=1, out=result2)
    tm.assert_almost_equal(result, result2)
    expected = arr.take(indexer, axis=1)
    expected[:, [2, 4]] = np.nan
    tm.assert_almost_equal(result, expected)
def test_2d_other_dtypes(self):
    """float32 take_nd along both axes; the -1 entry is NaN-filled."""
    data = np.random.randn(10, 5).astype(np.float32)
    picks = [1, 2, 3, -1]

    # axis=0: last requested row is missing -> NaN row
    got = com.take_nd(data, picks, axis=0)
    want = data.take(picks, axis=0)
    want[-1] = np.nan
    tm.assert_almost_equal(got, want)

    # axis=1: last requested column is missing -> NaN column
    got = com.take_nd(data, picks, axis=1)
    want = data.take(picks, axis=1)
    want[:, -1] = np.nan
    tm.assert_almost_equal(got, want)
def get_result(self):
    """Assemble the unstacked DataFrame, dropping never-observed columns."""
    # TODO: find a better way than this masking business
    values, value_mask = self.get_new_values()
    columns = self.get_new_columns()
    index = self.get_new_index()

    # filter out missing levels
    if values.shape[1] > 0:
        col_inds, obs_ids = _compress_group_index(self.sorted_labels[-1])
        # rare case, level values not observed
        if len(obs_ids) < self.full_shape[1]:
            # keep only columns where at least one value is present
            inds = (value_mask.sum(0) > 0).nonzero()[0]
            values = com.take_nd(values, inds, axis=1)
            columns = columns[inds]

    # may need to coerce categoricals here
    if self.is_categorical is not None:
        values = [
            Categorical.from_array(
                values[:, i],
                categories=self.is_categorical.categories,
                ordered=True)
            for i in range(values.shape[-1])
        ]

    return DataFrame(values, index=index, columns=columns)
def get_result(self):
    """Assemble the unstacked DataFrame, patching missing columns/index."""
    # TODO: find a better way than this masking business
    values, value_mask = self.get_new_values()
    columns = self.get_new_columns()
    index = self.get_new_index()

    # filter out missing levels
    if values.shape[1] > 0:
        col_inds, obs_ids = _compress_group_index(self.sorted_labels[-1])
        # rare case, level values not observed
        if len(obs_ids) < self.full_shape[1]:
            inds = (value_mask.sum(0) > 0).nonzero()[0]
            values = com.take_nd(values, inds, axis=1)
            columns = columns[inds]

    # we might have a missing index
    if len(index) != values.shape[0]:
        mask = isnull(index)
        if mask.any():
            # scatter existing rows to the positions of non-null index
            # entries; rows for null entries are left as NaN
            l = np.arange(len(index))
            values, orig_values = np.empty((len(index),
                                            values.shape[1])), values
            values.fill(np.nan)
            values_indexer = com._ensure_int64(l[~mask])
            for i, j in enumerate(values_indexer):
                values[j] = orig_values[i]
        else:
            index = index.take(self.unique_groups)

    return DataFrame(values, index=index, columns=columns)
def get_result(self):
    """Assemble the unstacked DataFrame, patching missing columns/index."""
    # TODO: find a better way than this masking business
    values, value_mask = self.get_new_values()
    columns = self.get_new_columns()
    index = self.get_new_index()

    # filter out missing levels
    if values.shape[1] > 0:
        col_inds, obs_ids = _compress_group_index(self.sorted_labels[-1])
        # rare case, level values not observed
        if len(obs_ids) < self.full_shape[1]:
            inds = (value_mask.sum(0) > 0).nonzero()[0]
            values = com.take_nd(values, inds, axis=1)
            columns = columns[inds]

    # we might have a missing index
    if len(index) != values.shape[0]:
        mask = isnull(index)
        if mask.any():
            # scatter existing rows to the positions of non-null index
            # entries; rows for null entries are left as NaN
            l = np.arange(len(index))
            values, orig_values = (np.empty(
                (len(index), values.shape[1])), values)
            values.fill(np.nan)
            values_indexer = com._ensure_int64(l[~mask])
            for i, j in enumerate(values_indexer):
                values[j] = orig_values[i]
        else:
            index = index.take(self.unique_groups)

    return DataFrame(values, index=index, columns=columns)
def _merge_blocks(self, merge_chunks):
    """
    Merge a group of (_JoinUnit, Block) chunks into a single new Block.

    merge_chunks -> [(_JoinUnit, Block)]

    Returns a Block holding the concatenated, reindexed values; when the
    result items are non-unique a placement is computed as each block's
    existing placement plus its chunk's offset into the result set.
    """
    funit, fblock = merge_chunks[0]
    fidx = funit.indexer

    # Result shape: merge lengths stacked along axis 0; along the join
    # axis, the indexer length (or original size when no reindex).
    out_shape = list(fblock.get_values().shape)
    n = len(fidx) if fidx is not None else out_shape[self.axis]
    out_shape[0] = sum(blk.get_merge_length() for unit, blk in merge_chunks)
    out_shape[self.axis] = n

    # Should use Fortran order??
    block_dtype = _get_block_dtype([x[1] for x in merge_chunks])
    out = np.empty(out_shape, dtype=block_dtype)

    # Copy each chunk's reindexed values into its slice of the output.
    sofar = 0
    for unit, blk in merge_chunks:
        out_chunk = out[sofar:sofar + len(blk)]
        com.take_nd(blk.get_values(), unit.indexer, self.axis,
                    out=out_chunk)
        sofar += len(blk)

    # does not sort
    new_block_items = _concat_indexes([b.items for _, b in merge_chunks])

    # need to set placement if we have a non-unique result
    # calculate by the existing placement plus the offset in the result set
    placement = None
    if not self.result_items.is_unique:
        nchunks = len(merge_chunks)
        # BUGFIX: use floor division — offsets index into result_items and
        # must stay integers (true division yields floats under Python 3)
        offsets = np.array([0] + [len(self.result_items) // nchunks] *
                           (nchunks - 1)).cumsum()
        placement = []
        for (unit, blk), offset in zip(merge_chunks, offsets):
            placement.extend(blk.ref_locs + offset)

    return make_block(out, new_block_items, self.result_items,
                      placement=placement)
def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, fill_value):
    """reindex_indexer must agree with a plain take_nd on the raw matrix."""
    baseline = com.take_nd(mgr.as_matrix(), indexer, axis,
                           fill_value=fill_value)
    reindexed = mgr.reindex_indexer(new_labels, indexer, axis,
                                    fill_value=fill_value)
    assert_almost_equal(baseline, reindexed.as_matrix())
    assert_almost_equal(reindexed.axes[axis], new_labels)
def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None):
    """
    Take values according to indexer and return them as a block.
    """
    if fill_tuple is None:
        # no fill requested: a -1 in the indexer is an error, not NA
        fill_value = self.fill_value
        new_values = com.take_nd(self.get_values(), indexer, axis=axis,
                                 allow_fill=False)
    else:
        fill_value = fill_tuple[0]
        new_values = com.take_nd(self.get_values(), indexer, axis=axis,
                                 allow_fill=True, fill_value=fill_value)

    if new_mgr_locs is None:
        if axis == 0:
            # a contiguous indexer can be expressed as a slice into mgr_locs
            slc = lib.indexer_as_slice(indexer)
            if slc is not None:
                new_mgr_locs = self.mgr_locs[slc]
                # NOTE(review): the function appears truncated here (no
                # return statement visible in this chunk) — confirm against
                # the full source
def assert_reindex_axis_is_ok(mgr, axis, new_labels, fill_value):
    """reindex_axis must agree with take_nd driven by get_indexer_for."""
    indexer = mgr.axes[axis].get_indexer_for(new_labels)
    expected = com.take_nd(mgr.as_matrix(), indexer, axis,
                           fill_value=fill_value)
    reindexed = mgr.reindex_axis(new_labels, axis, fill_value=fill_value)
    assert_almost_equal(expected, reindexed.as_matrix())
    assert_almost_equal(reindexed.axes[axis], new_labels)
def assert_reindex_axis_is_ok(mgr, axis, new_labels, fill_value):
    """reindex_axis must agree with take_nd driven by get_indexer_for."""
    mat = mgr.as_matrix()
    indexer = mgr.axes[axis].get_indexer_for(new_labels)
    reindexed = mgr.reindex_axis(new_labels, axis, fill_value=fill_value)
    # the manager result must match a plain take_nd on the raw matrix
    assert_almost_equal(
        com.take_nd(mat, indexer, axis, fill_value=fill_value),
        reindexed.as_matrix())
    assert_almost_equal(reindexed.axes[axis], new_labels)
def _test_dtype(dtype, fill_value, out_dtype):
    """3-D take_nd: -1s get fill_value (upcast to out_dtype); else dtype kept."""
    data = np.random.randint(0, 2, (5, 4, 3)).astype(dtype)

    # indexer with a missing (-1) entry: position 3 is filled with
    # fill_value and the result dtype is upcast to out_dtype
    indexer = [2, 1, 0, -1]
    result = com.take_nd(data, indexer, axis=0, fill_value=fill_value)
    assert ((result[[0, 1, 2], :, :] == data[[2, 1, 0], :, :]).all())
    assert ((result[3, :, :] == fill_value).all())
    assert (result.dtype == out_dtype)
    result = com.take_nd(data, indexer, axis=1, fill_value=fill_value)
    assert ((result[:, [0, 1, 2], :] == data[:, [2, 1, 0], :]).all())
    assert ((result[:, 3, :] == fill_value).all())
    assert (result.dtype == out_dtype)
    result = com.take_nd(data, indexer, axis=2, fill_value=fill_value)
    assert ((result[:, :, [0, 1, 2]] == data[:, :, [2, 1, 0]]).all())
    assert ((result[:, :, 3] == fill_value).all())
    assert (result.dtype == out_dtype)

    # indexer without missing entries: original dtype is preserved
    indexer = [2, 1, 0, 1]
    result = com.take_nd(data, indexer, axis=0, fill_value=fill_value)
    assert ((result[[0, 1, 2, 3], :, :] == data[indexer, :, :]).all())
    assert (result.dtype == dtype)
    result = com.take_nd(data, indexer, axis=1, fill_value=fill_value)
    assert ((result[:, [0, 1, 2, 3], :] == data[:, indexer, :]).all())
    assert (result.dtype == dtype)
    result = com.take_nd(data, indexer, axis=2, fill_value=fill_value)
    assert ((result[:, :, [0, 1, 2, 3]] == data[:, :, indexer]).all())
    assert (result.dtype == dtype)
def _test_dtype(dtype, fill_value, out_dtype):
    """3-D take_nd: -1s get fill_value (upcast to out_dtype); else dtype kept."""
    data = np.random.randint(0, 2, (5, 4, 3)).astype(dtype)

    # indexer with a missing (-1) entry: position 3 is filled with
    # fill_value and the result dtype is upcast to out_dtype
    indexer = [2, 1, 0, -1]
    result = com.take_nd(data, indexer, axis=0, fill_value=fill_value)
    assert((result[[0, 1, 2], :, :] == data[[2, 1, 0], :, :]).all())
    assert((result[3, :, :] == fill_value).all())
    assert(result.dtype == out_dtype)
    result = com.take_nd(data, indexer, axis=1, fill_value=fill_value)
    assert((result[:, [0, 1, 2], :] == data[:, [2, 1, 0], :]).all())
    assert((result[:, 3, :] == fill_value).all())
    assert(result.dtype == out_dtype)
    result = com.take_nd(data, indexer, axis=2, fill_value=fill_value)
    assert((result[:, :, [0, 1, 2]] == data[:, :, [2, 1, 0]]).all())
    assert((result[:, :, 3] == fill_value).all())
    assert(result.dtype == out_dtype)

    # indexer without missing entries: original dtype is preserved
    indexer = [2, 1, 0, 1]
    result = com.take_nd(data, indexer, axis=0, fill_value=fill_value)
    assert((result[[0, 1, 2, 3], :, :] == data[indexer, :, :]).all())
    assert(result.dtype == dtype)
    result = com.take_nd(data, indexer, axis=1, fill_value=fill_value)
    assert((result[:, [0, 1, 2, 3], :] == data[:, indexer, :]).all())
    assert(result.dtype == dtype)
    result = com.take_nd(data, indexer, axis=2, fill_value=fill_value)
    assert((result[:, :, [0, 1, 2, 3]] == data[:, :, indexer]).all())
    assert(result.dtype == dtype)
def _merge_blocks(self, merge_chunks):
    """
    Merge a group of (_JoinUnit, Block) chunks into a single new Block.

    merge_chunks -> [(_JoinUnit, Block)]
    """
    funit, fblock = merge_chunks[0]
    fidx = funit.indexer

    # Result shape: merge lengths stacked along axis 0; along the join
    # axis, the indexer length (or original size when no reindex).
    out_shape = list(fblock.get_values().shape)
    n = len(fidx) if fidx is not None else out_shape[self.axis]
    merge_lengths = list(blk.get_merge_length()
                         for unit, blk in merge_chunks)
    out_shape[0] = sum(merge_lengths)
    out_shape[self.axis] = n

    # Should use Fortran order??
    block_dtype = _get_block_dtype([x[1] for x in merge_chunks])
    out = np.empty(out_shape, dtype=block_dtype)

    # Copy each chunk's reindexed values into its slice of the output.
    sofar = 0
    for unit, blk in merge_chunks:
        out_chunk = out[sofar : sofar + len(blk)]
        com.take_nd(blk.get_values(), unit.indexer, self.axis,
                    out=out_chunk)
        sofar += len(blk)

    # does not sort
    new_block_items = _concat_indexes([b.items for _, b in merge_chunks])

    # need to set placement if we have a non-unique result
    # calculate by the existing placement plus the offset in the result set
    placement = None
    if not self.result_items.is_unique:
        placement = []
        # per-chunk start offsets: 0 followed by the running totals
        offsets = np.append(np.array([0]), self.offsets.cumsum()[:-1])
        for (unit, blk), offset in zip(merge_chunks, offsets):
            placement.extend(blk.ref_locs + offset)

    return make_block(out, new_block_items, self.result_items,
                      placement=placement)
def reindex_axis(self, indexer, method=None, axis=1, fill_value=None,
                 limit=None, mask_info=None):
    """
    Reindex using pre-computed indexer information.
    """
    # blocks only support reindexing along axis >= 1
    if axis < 1:
        raise AssertionError('axis must be at least 1, got %d' % axis)
    if fill_value is None:
        fill_value = self.fill_value

    taken = com.take_nd(self.values, indexer, axis,
                        fill_value=fill_value, mask_info=mask_info)
    return make_block(taken, ndim=self.ndim, fastpath=True,
                      placement=self.mgr_locs)
def _make_sorted_values_labels(self):
    """Group-sort values/labels so the level being unstacked varies last."""
    v = self.level

    labs = list(self.index.labels)
    levs = list(self.index.levels)
    # move the target level to the end of the sort key
    to_sort = labs[:v] + labs[v + 1:] + [labs[v]]
    sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]]

    comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
    ngroups = len(obs_ids)

    # stable group sort of the compressed keys
    indexer = algos.groupsort_indexer(comp_index, ngroups)[0]
    indexer = _ensure_platform_int(indexer)

    self.sorted_values = com.take_nd(self.values, indexer, axis=0)
    self.sorted_labels = [l.take(indexer) for l in to_sort]
def get_result(self):
    """Assemble the unstacked DataFrame, dropping never-observed columns."""
    # TODO: find a better way than this masking business
    values, value_mask = self.get_new_values()
    columns = self.get_new_columns()
    index = self.get_new_index()

    # filter out level values that never actually occurred
    if values.shape[1] > 0:
        col_inds, obs_ids = _compress_group_index(self.sorted_labels[-1])
        if len(obs_ids) < self.full_shape[1]:  # rare case
            keep = (value_mask.sum(0) > 0).nonzero()[0]
            values = com.take_nd(values, keep, axis=1)
            columns = columns[keep]

    return DataFrame(values, index=index, columns=columns)
def get_result(self):
    """Assemble the unstacked DataFrame, dropping never-observed columns."""
    # TODO: find a better way than this masking business
    values, value_mask = self.get_new_values()
    columns = self.get_new_columns()
    index = self.get_new_index()

    # filter out missing levels
    if values.shape[1] > 0:
        col_inds, obs_ids = _compress_group_index(self.sorted_labels[-1])
        # rare case, level values not observed
        if len(obs_ids) < self.full_shape[1]:
            # keep only columns where at least one value is present
            inds = (value_mask.sum(0) > 0).nonzero()[0]
            values = com.take_nd(values, inds, axis=1)
            columns = columns[inds]

    # may need to coerce categoricals here
    if self.is_categorical is not None:
        values = [Categorical.from_array(
            values[:, i],
            categories=self.is_categorical.categories,
            ordered=True)
            for i in range(values.shape[-1])]

    return DataFrame(values, index=index, columns=columns)
def test_2d_datetime64(self):
    """take_nd on datetime64[ns]: -1s become iNaT or an explicit fill."""
    # 2005/01/01 - 2006/01/01
    arr = np.random.randint(long(11045376), long(11360736),
                            (5,3))*100000000000
    arr = arr.view(dtype='datetime64[ns]')
    indexer = [0, 2, -1, 1, -1]

    # axis=0: missing entries default to iNaT
    result = com.take_nd(arr, indexer, axis=0)
    result2 = np.empty_like(result)
    com.take_nd(arr, indexer, axis=0, out=result2)
    tm.assert_almost_equal(result, result2)
    expected = arr.take(indexer, axis=0)
    expected.view(np.int64)[[2, 4], :] = iNaT
    tm.assert_almost_equal(result, expected)

    # axis=0 with an explicit datetime fill_value
    result = com.take_nd(arr, indexer, axis=0,
                         fill_value=datetime(2007, 1, 1))
    result2 = np.empty_like(result)
    com.take_nd(arr, indexer, out=result2, axis=0,
                fill_value=datetime(2007, 1, 1))
    tm.assert_almost_equal(result, result2)
    expected = arr.take(indexer, axis=0)
    expected[[2, 4], :] = datetime(2007, 1, 1)
    tm.assert_almost_equal(result, expected)

    # axis=1: missing entries default to iNaT
    result = com.take_nd(arr, indexer, axis=1)
    result2 = np.empty_like(result)
    com.take_nd(arr, indexer, axis=1, out=result2)
    tm.assert_almost_equal(result, result2)
    expected = arr.take(indexer, axis=1)
    expected.view(np.int64)[:, [2, 4]] = iNaT
    tm.assert_almost_equal(result, expected)

    # axis=1 with an explicit datetime fill_value
    result = com.take_nd(arr, indexer, axis=1,
                         fill_value=datetime(2007, 1, 1))
    result2 = np.empty_like(result)
    com.take_nd(arr, indexer, out=result2, axis=1,
                fill_value=datetime(2007, 1, 1))
    tm.assert_almost_equal(result, result2)
    expected = arr.take(indexer, axis=1)
    expected[:, [2, 4]] = datetime(2007, 1, 1)
    tm.assert_almost_equal(result, expected)
def test_2d_datetime64(self):
    """take_nd on datetime64[ns]: -1s become iNaT or an explicit fill."""
    # 2005/01/01 - 2006/01/01
    arr = np.random.randint(long(11045376), long(11360736),
                            (5, 3)) * 100000000000
    arr = arr.view(dtype='datetime64[ns]')
    indexer = [0, 2, -1, 1, -1]

    # axis=0: missing entries default to iNaT
    result = com.take_nd(arr, indexer, axis=0)
    result2 = np.empty_like(result)
    com.take_nd(arr, indexer, axis=0, out=result2)
    tm.assert_almost_equal(result, result2)
    expected = arr.take(indexer, axis=0)
    expected.view(np.int64)[[2, 4], :] = iNaT
    tm.assert_almost_equal(result, expected)

    # axis=0 with an explicit datetime fill_value
    result = com.take_nd(arr, indexer, axis=0,
                         fill_value=datetime(2007, 1, 1))
    result2 = np.empty_like(result)
    com.take_nd(arr, indexer, out=result2, axis=0,
                fill_value=datetime(2007, 1, 1))
    tm.assert_almost_equal(result, result2)
    expected = arr.take(indexer, axis=0)
    expected[[2, 4], :] = datetime(2007, 1, 1)
    tm.assert_almost_equal(result, expected)

    # axis=1: missing entries default to iNaT
    result = com.take_nd(arr, indexer, axis=1)
    result2 = np.empty_like(result)
    com.take_nd(arr, indexer, axis=1, out=result2)
    tm.assert_almost_equal(result, result2)
    expected = arr.take(indexer, axis=1)
    expected.view(np.int64)[:, [2, 4]] = iNaT
    tm.assert_almost_equal(result, expected)

    # axis=1 with an explicit datetime fill_value
    result = com.take_nd(arr, indexer, axis=1,
                         fill_value=datetime(2007, 1, 1))
    result2 = np.empty_like(result)
    com.take_nd(arr, indexer, out=result2, axis=1,
                fill_value=datetime(2007, 1, 1))
    tm.assert_almost_equal(result, result2)
    expected = arr.take(indexer, axis=1)
    expected[:, [2, 4]] = datetime(2007, 1, 1)
    tm.assert_almost_equal(result, expected)