def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
    """Fill or insert the join-key columns of a merged frame.

    Rows that exist on only one side of the join carry -1 in the other
    side's indexer; their key values must be taken from the side that
    does have them.
    """
    if not self.drop_keys:
        # do nothing, already found in one of the DataFrames
        return

    # insert group keys
    for i, name in enumerate(self.join_names):
        if name in result:
            key_col = result[name]

            if name in self.left:
                # unmatched left rows: pull key values from the right side
                na_indexer = (left_indexer == -1).nonzero()[0]
                right_na_indexer = right_indexer.take(na_indexer)
                key_col.put(na_indexer, com.take_1d(self.right_join_keys[i],
                                                    right_na_indexer))
            else:
                # unmatched right rows: pull key values from the left side.
                # BUG FIX: map the NA positions through left_indexer (was
                # right_indexer), mirroring the branch above.
                na_indexer = (right_indexer == -1).nonzero()[0]
                left_na_indexer = left_indexer.take(na_indexer)
                key_col.put(na_indexer, com.take_1d(self.left_join_keys[i],
                                                    left_na_indexer))
        else:
            # a faster way?
            key_col = com.take_1d(self.left_join_keys[i], left_indexer)
            na_indexer = (left_indexer == -1).nonzero()[0]
            right_na_indexer = right_indexer.take(na_indexer)
            key_col.put(na_indexer, com.take_1d(self.right_join_keys[i],
                                                right_na_indexer))
            result.insert(i, name, key_col)
def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
    """Fill NA slots in existing key columns, or insert missing key columns,
    after a merge.

    ``left_indexer``/``right_indexer`` map result rows back to each input;
    -1 marks rows with no match on that side.
    """
    # insert group keys
    keys = zip(self.join_names, self.left_on, self.right_on)
    for i, (name, lname, rname) in enumerate(keys):
        # _should_fill decides (elsewhere in the file) whether this key
        # pair needs filling at all
        if not _should_fill(lname, rname):
            continue
        if name in result:
            key_col = result[name]
            if name in self.left and left_indexer is not None:
                # left-side key column: rows unmatched on the left get
                # their key values from the right join keys
                na_indexer = (left_indexer == -1).nonzero()[0]
                if len(na_indexer) == 0:
                    continue
                right_na_indexer = right_indexer.take(na_indexer)
                key_col.put(na_indexer, com.take_1d(self.right_join_keys[i],
                                                    right_na_indexer))
            elif name in self.right and right_indexer is not None:
                # symmetric case: fill from the left join keys
                na_indexer = (right_indexer == -1).nonzero()[0]
                if len(na_indexer) == 0:
                    continue
                left_na_indexer = left_indexer.take(na_indexer)
                key_col.put(na_indexer, com.take_1d(self.left_join_keys[i],
                                                    left_na_indexer))
        elif left_indexer is not None:
            # key column not present in the result: build it from the left
            # keys and patch the unmatched rows from the right keys
            if name is None:
                name = "key_%d" % i
            # a faster way?
            key_col = com.take_1d(self.left_join_keys[i], left_indexer)
            na_indexer = (left_indexer == -1).nonzero()[0]
            # NOTE(review): right_indexer is assumed non-None here whenever
            # left_indexer is — TODO confirm against callers
            right_na_indexer = right_indexer.take(na_indexer)
            key_col.put(na_indexer, com.take_1d(self.right_join_keys[i],
                                                right_na_indexer))
            result.insert(i, name, key_col)
def test_1d_bool(self):
    """take_1d on a bool array: plain take matches numpy; a -1 (NA)
    index forces an object-dtype result."""
    source = np.array([0, 1, 0], dtype=bool)

    plain_idx = [0, 2, 2, 1]
    taken = com.take_1d(source, plain_idx)
    self.assert_numpy_array_equal(taken, source.take(plain_idx))

    taken = com.take_1d(source, [0, 2, -1])
    self.assertEqual(taken.dtype, np.object_)
def wrapper(left, right, name=name, na_op=na_op):
    """Closure implementing a Series arithmetic/comparison operator.

    Handles optional datetime/timedelta conversion, index alignment for
    Series-vs-Series operations, and result wrapping.
    """
    # let DataFrame's reflected op handle Series <op> DataFrame
    if isinstance(right, pd.DataFrame):
        return NotImplemented

    # may rewrite operands/na_op for datetime-like operations
    time_converted = _TimeOp.maybe_convert_for_time_op(left, right, name, na_op)

    if time_converted is None:
        # not a time op: operate on the raw operands, wrap results as-is
        lvalues, rvalues = left, right
        dtype = None
        wrap_results = lambda x: x
    elif time_converted is NotImplemented:
        return NotImplemented
    else:
        # time op: adopt the converted operands, dtype and wrapping logic
        left, right = time_converted.left, time_converted.right
        lvalues, rvalues = time_converted.lvalues, time_converted.rvalues
        dtype = time_converted.dtype
        wrap_results = time_converted.wrap_results
        na_op = time_converted.na_op

    if isinstance(rvalues, pd.Series):
        rindex = getattr(rvalues, 'index', rvalues)
        name = _maybe_match_name(left, rvalues)
        lvalues = getattr(lvalues, 'values', lvalues)
        rvalues = getattr(rvalues, 'values', rvalues)
        if left.index.equals(rindex):
            index = left.index
        else:
            # outer-align the two indexes and reindex both value arrays
            index, lidx, ridx = left.index.join(rindex, how='outer',
                                                return_indexers=True)
            if lidx is not None:
                lvalues = com.take_1d(lvalues, lidx)
            if ridx is not None:
                rvalues = com.take_1d(rvalues, ridx)
        arr = na_op(lvalues, rvalues)
        return left._constructor(wrap_results(arr), index=index,
                                 name=name, dtype=dtype)
    else:
        # scalars
        # DatetimeIndex is deliberately left intact here — presumably so
        # na_op sees the index object itself; TODO confirm
        if (hasattr(lvalues, 'values') and
                not isinstance(lvalues, pd.DatetimeIndex)):
            lvalues = lvalues.values
        return left._constructor(wrap_results(na_op(lvalues, rvalues)),
                                 index=left.index, name=left.name,
                                 dtype=dtype)
def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
    """Fill NA slots in existing key columns (via .iloc), or insert a
    missing key column, after a merge.

    -1 entries in an indexer mark result rows with no match on that side.
    """
    # insert group keys
    keys = zip(self.join_names, self.left_on, self.right_on)
    for i, (name, lname, rname) in enumerate(keys):
        if not _should_fill(lname, rname):
            continue
        if name in result:
            # positional column location for .iloc assignment below
            key_indexer = result.columns.get_loc(name)
            if left_indexer is not None and right_indexer is not None:
                if name in self.left:
                    # empty left frame: nothing to fill
                    if len(self.left) == 0:
                        continue
                    na_indexer = (left_indexer == -1).nonzero()[0]
                    if len(na_indexer) == 0:
                        continue
                    # unmatched-left rows take their keys from the right
                    right_na_indexer = right_indexer.take(na_indexer)
                    result.iloc[na_indexer, key_indexer] = (
                        com.take_1d(self.right_join_keys[i],
                                    right_na_indexer))
                elif name in self.right:
                    if len(self.right) == 0:
                        continue
                    na_indexer = (right_indexer == -1).nonzero()[0]
                    if len(na_indexer) == 0:
                        continue
                    # symmetric: unmatched-right rows filled from the left
                    left_na_indexer = left_indexer.take(na_indexer)
                    result.iloc[na_indexer, key_indexer] = (
                        com.take_1d(self.left_join_keys[i],
                                    left_na_indexer))
        elif left_indexer is not None \
                and isinstance(self.left_join_keys[i], np.ndarray):
            if name is None:
                name = 'key_%d' % i
            # a faster way?
            key_col = com.take_1d(self.left_join_keys[i], left_indexer)
            na_indexer = (left_indexer == -1).nonzero()[0]
            right_na_indexer = right_indexer.take(na_indexer)
            key_col.put(na_indexer, com.take_1d(self.right_join_keys[i],
                                                right_na_indexer))
            result.insert(i, name, key_col)
def _test_dtype(dtype, fill_value, out_dtype):
    """Check take_1d fill semantics for one dtype: a -1 index yields
    fill_value (possibly upcasting to out_dtype); without -1 the input
    dtype is preserved."""
    data = np.random.randint(0, 2, 4).astype(dtype)

    filled = com.take_1d(data, [2, 1, 0, -1], fill_value=fill_value)
    assert (filled[[0, 1, 2]] == data[[2, 1, 0]]).all()
    assert filled[3] == fill_value
    assert filled.dtype == out_dtype

    no_na = [2, 1, 0, 1]
    taken = com.take_1d(data, no_na, fill_value=fill_value)
    assert (taken[[0, 1, 2, 3]] == data[no_na]).all()
    assert taken.dtype == dtype
def _reindex_indexer_items(self, new_items, indexer, fill_value):
    """Reindex the items axis via a precomputed indexer, creating an
    NA-filled block for items found in no existing block."""
    # TODO: less efficient than I'd like
    item_order = com.take_1d(self.items.values, indexer)

    # keep track of what items aren't found anywhere
    mask = np.zeros(len(item_order), dtype=bool)

    new_blocks = []
    for blk in self.blocks:
        blk_indexer = blk.items.get_indexer(item_order)
        selector = blk_indexer != -1

        # update with observed items
        mask |= selector

        if not selector.any():
            continue

        new_block_items = new_items.take(selector.nonzero()[0])
        new_values = com.take_fast(blk.values, blk_indexer[selector],
                                   None, False, axis=0)
        new_blocks.append(make_block(new_values, new_block_items, new_items))

    if not mask.all():
        # FIX: use ~ (bitwise not) to invert the boolean mask; unary
        # minus on bool arrays is rejected by modern numpy
        na_items = new_items[~mask]
        na_block = self._make_na_block(na_items, new_items,
                                       fill_value=fill_value)
        new_blocks.append(na_block)
        new_blocks = _consolidate(new_blocks, new_items)

    return BlockManager(new_blocks, [new_items] + self.axes[1:])
def _wrap_result(self, result, use_codes=True, name=None):
    """Wrap a raw string-method result back into an Index/Series/DataFrame.

    For categorical input the operation ran on the categories only, so
    the result is broadcast back over the codes first — unless the
    caller already worked on the full values (``use_codes=False``).
    """
    # for category, we do the stuff on the categories, so blow it up
    # to the full series again
    # But for some operations, we have to do the stuff on the full values,
    # so make it possible to skip this step as the method already did this
    # before the transformation...
    if use_codes and self._is_categorical:
        result = take_1d(result, self._orig.cat.codes)

    # leave as it is to keep extract and get_dummies results
    # can be merged to _wrap_result_expand in v0.17
    from pandas.core.series import Series
    from pandas.core.frame import DataFrame
    from pandas.core.index import Index

    if not hasattr(result, 'ndim'):
        return result

    name = name or getattr(result, 'name', None) or self._orig.name

    if result.ndim != 1:
        assert result.ndim < 3
        return DataFrame(result, index=self._orig.index)

    if not isinstance(self._orig, Index):
        return Series(result, index=self._orig.index, name=name)

    # if result is a boolean np.array, return the np.array
    # instead of wrapping it into a boolean Index (GH 8875)
    if is_bool_dtype(result):
        return result
    return Index(result, name=name)
def reindex(self, index=None, method=None, copy=True, limit=None):
    """
    Conform SparseSeries to new Index

    See Series.reindex docstring for general behavior

    Returns
    -------
    reindexed : SparseSeries
    """
    new_index = _ensure_index(index)

    # already aligned: return self (or a copy when requested)
    if self.index.equals(new_index):
        return self.copy() if copy else self

    # empty source: there is nothing to take, so produce all-NaN values
    if len(self.index) == 0:
        # FIXME: inelegant / slow
        values = np.empty(len(new_index), dtype=np.float64)
        values.fill(nan)
        return SparseSeries(values, index=new_index,
                            fill_value=self.fill_value)

    new_index, fill_vec = self.index.reindex(index, method=method,
                                             limit=limit)
    new_values = common.take_1d(self.values, fill_vec)
    return SparseSeries(new_values, index=new_index,
                        fill_value=self.fill_value, name=self.name)
def _delegate_property_get(self, name):
    """Fetch datetimelike property ``name`` and wrap it as a read-only
    Series copy."""
    from pandas import Series

    res = getattr(self.values, name)

    # maybe need to upcast (ints)
    if isinstance(res, np.ndarray):
        if is_integer_dtype(res):
            res = res.astype('int64')
    elif not is_list_like(res):
        # scalar: hand it back unwrapped
        return res

    # blow up if we operate on categories
    if self.orig is not None:
        res = take_1d(res, self.orig.cat.codes)

    # return the result as a Series, which is by definition a copy
    res = Series(res, index=self.index, name=self.name)

    # setting this object will show a SettingWithCopyWarning/Error
    res.is_copy = ("modifications to a property of a datetimelike object "
                   "are not supported and are discarded. "
                   "Change values on the original.")

    return res
def _reindex_with_indexers(self, reindexers, method=None, fill_value=None,
                           limit=None, copy=False, allow_dups=False):
    """Reindex a SparseDataFrame along rows/columns from precomputed
    (axis_index, indexer) pairs; method/limit are unsupported for sparse."""
    if method is not None or limit is not None:
        raise NotImplementedError("cannot reindex with a method or limit "
                                  "with sparse")

    if fill_value is None:
        fill_value = np.nan

    index, row_indexer = reindexers.get(0, (None, None))
    columns, col_indexer = reindexers.get(1, (None, None))

    if columns is None:
        columns = self.columns

    def _column_values(col):
        # take along rows when a row indexer was supplied
        if row_indexer is not None:
            return com.take_1d(self[col].get_values(), row_indexer,
                               fill_value=fill_value)
        return self[col]

    # columns absent from self are simply dropped
    new_arrays = {col: _column_values(col) for col in columns
                  if col in self}

    return SparseDataFrame(new_arrays, index=index,
                           columns=columns).__finalize__(self)
def __array__(self, dtype=None):
    """
    The numpy array interface.

    Returns
    -------
    values : numpy array
       A numpy array of the same dtype as categorical.levels.dtype
    """
    # decode the integer codes back into the actual level values
    level_values = self.levels.values
    return com.take_1d(level_values, self._codes)
def take_nd(self, indexer, allow_fill=True, fill_value=None):
    """ Take the values by the indexer, fill with the fill_value.

    Missing positions (-1 in ``indexer``) are encoded as the -1 code
    sentinel, which renders as NaN in the resulting Categorical.
    """
    # FIX: filling the integer codes with np.nan would upcast them to
    # float and corrupt the Categorical; always fill the codes with the
    # -1 "missing" sentinel instead. fill_value is accepted for API
    # compatibility — only NaN-like fills are meaningful here.
    if allow_fill and fill_value is None:
        fill_value = np.nan
    codes = com.take_1d(self._codes, indexer, allow_fill=allow_fill,
                        fill_value=-1)
    result = Categorical(values=codes, levels=self.levels,
                         ordered=self.ordered, name=self.name,
                         fastpath=True)
    return result
def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
    """Fill NA slots in existing key columns (via ndarray.put), or insert
    a missing key column, after a merge.

    -1 entries in an indexer mark result rows with no match on that side.
    """
    # insert group keys
    keys = zip(self.join_names, self.left_on, self.right_on)
    for i, (name, lname, rname) in enumerate(keys):
        if not _should_fill(lname, rname):
            continue
        if name in result:
            key_col = result[name]
            if left_indexer is not None and right_indexer is not None:
                if name in self.left:
                    # unmatched-left rows get their keys from the right
                    na_indexer = (left_indexer == -1).nonzero()[0]
                    if len(na_indexer) == 0:
                        continue
                    right_na_indexer = right_indexer.take(na_indexer)
                    key_col.put(
                        na_indexer, com.take_1d(self.right_join_keys[i],
                                                right_na_indexer))
                elif name in self.right:
                    # symmetric: unmatched-right rows filled from the left
                    na_indexer = (right_indexer == -1).nonzero()[0]
                    if len(na_indexer) == 0:
                        continue
                    left_na_indexer = left_indexer.take(na_indexer)
                    key_col.put(na_indexer,
                                com.take_1d(self.left_join_keys[i],
                                            left_na_indexer))
        elif left_indexer is not None \
                and isinstance(self.left_join_keys[i], np.ndarray):
            # key column missing from result: rebuild it from the left
            # keys and patch unmatched rows from the right keys
            if name is None:
                name = 'key_%d' % i
            # a faster way?
            key_col = com.take_1d(self.left_join_keys[i], left_indexer)
            na_indexer = (left_indexer == -1).nonzero()[0]
            right_na_indexer = right_indexer.take(na_indexer)
            key_col.put(na_indexer, com.take_1d(self.right_join_keys[i],
                                                right_na_indexer))
            result.insert(i, name, key_col)
def take_nd(self, indexer, allow_fill=True, fill_value=None):
    """ Take the codes by the indexer, fill with the fill_value. """
    # filling must always be None/nan here
    # but is passed thru internally
    assert isnull(fill_value)

    # -1 is the sentinel code for "missing", so fill with it directly
    taken_codes = com.take_1d(self._codes, indexer, allow_fill=True,
                              fill_value=-1)
    return Categorical(taken_codes, levels=self.levels,
                       ordered=self.ordered, name=self.name,
                       fastpath=True)
def _take_new_index(obj, indexer, new_index, axis=0):
    """Take rows of a Series/DataFrame by ``indexer`` and attach
    ``new_index`` as the resulting index."""
    from pandas.core.api import Series, DataFrame

    if isinstance(obj, Series):
        return Series(com.take_1d(obj.values, indexer),
                      index=new_index, name=obj.name)
    if isinstance(obj, DataFrame):
        if axis == 1:
            raise NotImplementedError
        return DataFrame(obj._data.take(indexer,
                                        new_index=new_index, axis=1))
    raise NotImplementedError
def _reindex_indexer(self, new_index, indexer, copy):
    """Reindex by a precomputed indexer; with no indexer, return self
    (or a copy when requested)."""
    if indexer is None:
        return self.copy() if copy else self

    new_values = com.take_1d(self.values.values, indexer)

    # be subclass-friendly
    return self._constructor(new_values, new_index, name=self.name)
def _test_dtype(dtype, can_hold_na):
    """Exercise take_1d with a preallocated out= buffer; dtypes that
    cannot hold NaN must raise when the indexer contains -1."""
    data = np.random.randint(0, 2, 4).astype(dtype)

    plain_idx = [2, 1, 0, 1]
    out = np.empty(4, dtype=dtype)
    com.take_1d(data, plain_idx, out=out)
    tm.assert_almost_equal(out, data.take(plain_idx))

    na_idx = [2, 1, 0, -1]
    out = np.empty(4, dtype=dtype)
    if can_hold_na:
        com.take_1d(data, na_idx, out=out)
        expected = data.take(na_idx)
        expected[3] = np.nan
        tm.assert_almost_equal(out, expected)
    else:
        self.assertRaises(Exception, com.take_1d, data, na_idx, out=out)
        # no exception o/w
        data.take(na_idx, out=out)
def __array__(self, dtype=None):
    """
    The numpy array interface.

    Returns
    -------
    values : numpy array
        A numpy array of either the specified dtype or,
        if dtype==None (default), the same dtype as
        categorical.levels.dtype
    """
    values = com.take_1d(self.levels.values, self._codes)
    # only convert when an explicit, different dtype was requested
    if dtype and dtype != self.levels.dtype:
        values = np.asarray(values, dtype)
    return values
def _take_new_index(obj, indexer, new_index, axis=0):
    """Take rows of a Series/DataFrame by ``indexer`` and attach
    ``new_index`` as the resulting index."""
    from pandas.core.api import Series, DataFrame

    if isinstance(obj, Series):
        taken = com.take_1d(obj.values, indexer)
        return Series(taken, index=new_index, name=obj.name)

    if isinstance(obj, DataFrame):
        if axis == 1:
            raise NotImplementedError("axis 1 is not supported")
        new_mgr = obj._data.reindex_indexer(new_axis=new_index,
                                            indexer=indexer, axis=1)
        return DataFrame(new_mgr)

    raise ValueError("'obj' should be either a Series or a DataFrame")
def _reindex_with_indexers(self, index, row_indexer, columns, col_indexer,
                           copy, fill_value):
    """Rebuild the frame column-by-column, taking rows via ``row_indexer``
    when one is supplied; columns absent from self are dropped."""
    if columns is None:
        columns = self.columns

    def _reindexed_column(col):
        if row_indexer is None:
            return self[col]
        return com.take_1d(self[col].values, row_indexer,
                           fill_value=fill_value)

    new_arrays = {col: _reindexed_column(col) for col in columns
                  if col in self}

    return self._constructor(new_arrays, index=index, columns=columns)
def get_values(self):
    """ Return the values.

    For internal compatibility with pandas formatting.

    Returns
    -------
    values : numpy array
        A numpy array of the same dtype as categorical.levels.dtype or
        dtype string if periods
    """
    # if we are a period index, return a string repr
    if isinstance(self.levels, PeriodIndex):
        natives = np.array(self.levels.to_native_types(), dtype=object)
        return com.take_1d(natives, self._codes)

    return np.array(self)
def _take_new_index(obj, indexer, new_index, axis=0):
    """Take rows of a Series/DataFrame by ``indexer`` and attach
    ``new_index``, rebuilding the DataFrame's BlockManager directly."""
    from pandas.core.api import Series, DataFrame
    from pandas.core.internals import BlockManager

    if isinstance(obj, Series):
        return Series(com.take_1d(obj.values, indexer),
                      index=new_index, name=obj.name)

    if isinstance(obj, DataFrame):
        if axis == 1:
            raise NotImplementedError
        mgr = obj._data
        # take within each block, then swap in the new row axis
        taken_blocks = [blk.take(indexer, axis=1) for blk in mgr.blocks]
        axes = list(mgr.axes)
        axes[1] = new_index
        return DataFrame(BlockManager(taken_blocks, axes))

    raise NotImplementedError
def _wrap_result_expand(self, result, expand=False):
    """Wrap a raw string-method result, optionally expanding list-like
    elements into multiple columns (DataFrame) or a MultiIndex.
    """
    if not isinstance(expand, bool):
        raise ValueError("expand must be True or False")

    # for category, we do the stuff on the categories, so blow it up
    # to the full series again
    if self._is_categorical:
        result = take_1d(result, self._orig.cat.codes)

    from pandas.core.index import Index, MultiIndex

    # non-array results (e.g. scalars) pass straight through
    if not hasattr(result, 'ndim'):
        return result

    if isinstance(self._orig, Index):
        name = getattr(result, 'name', None)
        # if result is a boolean np.array, return the np.array
        # instead of wrapping it into a boolean Index (GH 8875)
        if hasattr(result, 'dtype') and is_bool_dtype(result):
            return result

        if expand:
            # each element is assumed tuple-like here — one tuple per
            # MultiIndex entry
            result = list(result)
            return MultiIndex.from_tuples(result, names=name)
        else:
            return Index(result, name=name)
    else:
        index = self._orig.index
        if expand:
            def cons_row(x):
                # scalars become single-element rows so every row is
                # list-like for the expanded constructor
                if is_list_like(x):
                    return x
                else:
                    return [x]
            cons = self._orig._constructor_expanddim
            data = [cons_row(x) for x in result]
            return cons(data, index=index)
        else:
            name = getattr(result, 'name', None)
            cons = self._orig._constructor
            return cons(result, name=name, index=index)
def _delegate_property_get(self, name):
    """Fetch datetimelike property ``name`` from the underlying values
    and wrap array-like results as a read-only Series copy."""
    from pandas import Series

    values = getattr(self.values, name)

    if isinstance(values, np.ndarray):
        # maybe need to upcast (ints)
        if is_integer_dtype(values):
            values = values.astype('int64')
    elif not is_list_like(values):
        # scalar result: return it as-is
        return values

    # blow up if we operate on categories
    if self.orig is not None:
        values = take_1d(values, self.orig.cat.codes)

    # return the result as a Series, which is by definition a copy
    wrapped = Series(values, index=self.index, name=self.name)

    # setting this object will show a SettingWithCopyWarning/Error
    wrapped.is_copy = ("modifications to a property of a datetimelike object are not "
                       "supported and are discarded. Change values on the original.")

    return wrapped
def _reindex_indexer_items(self, new_items, indexer, fill_value):
    """Reindex the items axis via a precomputed indexer, creating an
    NA-filled block for items found in no existing block."""
    # TODO: less efficient than I'd like
    item_order = com.take_1d(self.items.values, indexer)

    # keep track of what items aren't found anywhere
    mask = np.zeros(len(item_order), dtype=bool)

    new_blocks = []
    for blk in self.blocks:
        blk_indexer = blk.items.get_indexer(item_order)
        selector = blk_indexer != -1

        # update with observed items
        mask |= selector

        if not selector.any():
            continue

        new_block_items = new_items.take(selector.nonzero()[0])
        new_values = com.take_fast(blk.values, blk_indexer[selector],
                                   None, False, axis=0)
        new_blocks.append(
            make_block(new_values, new_block_items, new_items))

    if not mask.all():
        # FIX: use ~ (bitwise not) to invert the boolean mask; unary
        # minus on bool arrays is rejected by modern numpy
        na_items = new_items[~mask]
        na_block = self._make_na_block(na_items, new_items,
                                       fill_value=fill_value)
        new_blocks.append(na_block)
        new_blocks = _consolidate(new_blocks, new_items)

    return BlockManager(new_blocks, [new_items] + self.axes[1:])
def _test_dtype(dtype, can_hold_na, writeable=True):
    """Exercise take_1d with an out= buffer on (possibly read-only)
    data; dtypes that cannot hold NaN must raise on a -1 index."""
    data = np.random.randint(0, 2, 4).astype(dtype)
    data.flags.writeable = writeable

    plain_idx = [2, 1, 0, 1]
    out = np.empty(4, dtype=dtype)
    com.take_1d(data, plain_idx, out=out)
    tm.assert_almost_equal(out, data.take(plain_idx))

    na_idx = [2, 1, 0, -1]
    out = np.empty(4, dtype=dtype)
    if can_hold_na:
        com.take_1d(data, na_idx, out=out)
        expected = data.take(na_idx)
        expected[3] = np.nan
        tm.assert_almost_equal(out, expected)
    else:
        with tm.assertRaisesRegexp(TypeError, self.fill_error):
            com.take_1d(data, na_idx, out=out)
        # no exception o/w
        data.take(na_idx, out=out)
def take_1d_pg2_int64(self):
    """Benchmark take_1d over the int64 column.

    FIX: the method referenced ``self.df``/``self.indexer`` but was
    defined without a ``self`` parameter (NameError when called);
    added ``self`` to match the sibling ``take_1d_pg2_float64``.
    """
    com.take_1d(self.df.int64.values, self.indexer)
def take_1d_pg2_float64(self):
    """Benchmark take_1d over the float64 column."""
    values = self.df.float64.values
    com.take_1d(values, self.indexer)
def __array__(self, dtype=None):
    """numpy array interface: decode the integer labels into the
    corresponding level values."""
    level_values = self.levels.values
    return com.take_1d(level_values, self.labels)