def _combine_match_index(self, other, func, level=None, fill_value=None):
    new_data = {}

    if fill_value is not None:
        raise NotImplementedError
    if level is not None:
        raise NotImplementedError

    new_index = self.index.union(other.index)
    this = self
    if self.index is not new_index:
        this = self.reindex(new_index)

    if other.index is not new_index:
        other = other.reindex(new_index)

    for col, series in compat.iteritems(this):
        new_data[col] = func(series.values, other.values)

    # fill_value is a function of our operator
    if isnull(other.fill_value) or isnull(self.default_fill_value):
        fill_value = np.nan
    else:
        fill_value = func(np.float64(self.default_fill_value),
                          np.float64(other.fill_value))

    return self._constructor(new_data,
                             index=new_index,
                             columns=self.columns,
                             default_fill_value=fill_value,
                             fill_value=self.default_fill_value).__finalize__(self)
def assert_almost_equal(a, b):
    if isinstance(a, dict) or isinstance(b, dict):
        return assert_dict_equal(a, b)

    if isinstance(a, basestring):
        assert a == b, (a, b)
        return True

    if isiterable(a):
        np.testing.assert_(isiterable(b))
        np.testing.assert_equal(len(a), len(b))
        if np.array_equal(a, b):
            return True
        else:
            for i in xrange(len(a)):
                assert_almost_equal(a[i], b[i])
        return True

    err_msg = lambda a, b: 'expected %.5f but got %.5f' % (a, b)

    if isnull(a):
        np.testing.assert_(isnull(b))
        return

    if isinstance(a, (bool, float, int)):
        # case for zero
        if abs(a) < 1e-5:
            np.testing.assert_almost_equal(
                a, b, decimal=5, err_msg=err_msg(a, b), verbose=False)
        else:
            np.testing.assert_almost_equal(
                1, a / b, decimal=5, err_msg=err_msg(a, b), verbose=False)
    else:
        assert a == b
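# A minimal usage sketch for the helper above (values are hypothetical): the
# comparison is relative for non-zero numbers (a/b is checked against 1 at
# five decimals), absolute near zero, and recurses element-wise on iterables.
assert_almost_equal(1.000001, 1.0000011)           # passes: ratio is ~1
assert_almost_equal([1.0, 2.0], [1.0, 2.0000001])  # element-wise recursion
assert_almost_equal(0.0, 1e-7)                     # near zero: absolute test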
def _convert_to_array(self, values, name=None, other=None):
    """converts values to ndarray"""
    from pandas.tseries.timedeltas import to_timedelta

    coerce = True
    if not is_list_like(values):
        values = np.array([values])

    inferred_type = lib.infer_dtype(values)

    if inferred_type in ('datetime64', 'datetime', 'date', 'time'):
        # if we have an other of timedelta, but use pd.NaT here,
        # we are in the wrong path
        if (other is not None and other.dtype == 'timedelta64[ns]' and
                all(isnull(v) for v in values)):
            values = np.empty(values.shape, dtype=other.dtype)
            values[:] = iNaT

        # a datelike
        elif isinstance(values, pd.DatetimeIndex):
            values = values.to_series()
        elif not (isinstance(values, (np.ndarray, pd.Series)) and
                  is_datetime64_dtype(values)):
            values = tslib.array_to_datetime(values)
    elif inferred_type in ('timedelta', 'timedelta64'):
        # have a timedelta, convert to ns here
        values = to_timedelta(values, coerce=coerce)
    elif inferred_type == 'integer':
        # py3 compat where dtype is 'm' but is an integer
        if values.dtype.kind == 'm':
            values = values.astype('timedelta64[ns]')
        elif isinstance(values, pd.PeriodIndex):
            values = values.to_timestamp().to_series()
        elif name not in ('__truediv__', '__div__', '__mul__'):
            raise TypeError("incompatible type for a datetime/timedelta "
                            "operation [{0}]".format(name))
    elif isinstance(values[0], pd.DateOffset):
        # handle DateOffsets
        os = np.array([getattr(v, 'delta', None) for v in values])
        mask = isnull(os)
        if mask.any():
            raise TypeError("cannot use a non-absolute DateOffset in "
                            "datetime/timedelta operations [{0}]".format(
                                ', '.join([com.pprint_thing(v)
                                           for v in values[mask]])))
        values = to_timedelta(os, coerce=coerce)
    elif inferred_type == 'floating':
        # all nan, so ok, use the other dtype (e.g. timedelta or datetime)
        if isnull(values).all():
            values = np.empty(values.shape, dtype=other.dtype)
            values[:] = iNaT
        else:
            raise TypeError(
                'incompatible type [{0}] for a datetime/timedelta '
                'operation'.format(np.array(values).dtype))
    else:
        raise TypeError("incompatible type [{0}] for a datetime/timedelta"
                        " operation".format(np.array(values).dtype))

    return values
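# A hedged illustration of the arithmetic the internal helper above backs,
# via the public API with hypothetical data: subtracting datetime-like
# Series routes both operands through _convert_to_array before the op.
import pandas as pd

left = pd.Series(pd.to_datetime(['2014-01-02', '2014-01-03']))
right = pd.Series(pd.to_datetime(['2014-01-01', '2014-01-01']))
delta = left - right   # timedelta64[ns]: 1 day, 2 days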
def test_operators_none_as_na(self):
    df = DataFrame({"col1": [2, 5.0, 123, None],
                    "col2": [1, 2, 3, 4]}, dtype=object)

    ops = [operator.add, operator.sub, operator.mul, operator.truediv]

    # since filling converts dtypes from object, changed expected to be
    # object
    for op in ops:
        filled = df.fillna(np.nan)
        result = op(df, 3)
        expected = op(filled, 3).astype(object)
        expected[com.isnull(expected)] = None
        assert_frame_equal(result, expected)

        result = op(df, df)
        expected = op(filled, filled).astype(object)
        expected[com.isnull(expected)] = None
        assert_frame_equal(result, expected)

        result = op(df, df.fillna(7))
        assert_frame_equal(result, expected)

        result = op(df.fillna(7), df)
        assert_frame_equal(result, expected, check_dtype=False)
def _convert_to_array(self, values, name=None, other=None):
    """converts values to ndarray"""
    from pandas.tseries.timedeltas import to_timedelta

    ovalues = values
    if not is_list_like(values):
        values = np.array([values])

    inferred_type = lib.infer_dtype(values)

    if inferred_type in ('datetime64', 'datetime', 'date', 'time'):
        # if we have an other of timedelta, but use pd.NaT here,
        # we are in the wrong path
        if (other is not None and other.dtype == 'timedelta64[ns]' and
                all(isnull(v) for v in values)):
            values = np.empty(values.shape, dtype=other.dtype)
            values[:] = iNaT

        # a datelike
        elif isinstance(values, pd.DatetimeIndex):
            values = values.to_series()
        # datetime with tz
        elif isinstance(ovalues, datetime.datetime) and hasattr(ovalues, 'tz'):
            values = pd.DatetimeIndex(values)
        # datetime array with tz
        elif com.is_datetimetz(values):
            if isinstance(values, pd.Series):
                values = values._values
        elif not (isinstance(values, (np.ndarray, pd.Series)) and
                  is_datetime64_dtype(values)):
            values = tslib.array_to_datetime(values)
    elif inferred_type in ('timedelta', 'timedelta64'):
        # have a timedelta, convert to ns here
        values = to_timedelta(values, errors='coerce')
    elif inferred_type == 'integer':
        # py3 compat where dtype is 'm' but is an integer
        if values.dtype.kind == 'm':
            values = values.astype('timedelta64[ns]')
        elif isinstance(values, pd.PeriodIndex):
            values = values.to_timestamp().to_series()
        elif name not in ('__truediv__', '__div__', '__mul__'):
            raise TypeError("incompatible type for a datetime/timedelta "
                            "operation [{0}]".format(name))
    elif inferred_type == 'floating':
        # all nan, so ok, use the other dtype (e.g. timedelta or datetime)
        if isnull(values).all():
            values = np.empty(values.shape, dtype=other.dtype)
            values[:] = iNaT
        else:
            raise TypeError(
                'incompatible type [{0}] for a datetime/timedelta '
                'operation'.format(np.array(values).dtype))
    elif self._is_offset(values):
        return values
    else:
        raise TypeError("incompatible type [{0}] for a datetime/timedelta"
                        " operation".format(np.array(values).dtype))

    return values
def test_isnull_lists():
    result = isnull([[False]])
    exp = np.array([[False]])
    assert np.array_equal(result, exp)

    result = isnull([[1], [2]])
    exp = np.array([[False], [False]])
    assert np.array_equal(result, exp)
def test_isnull_nat():
    result = isnull([NaT])
    exp = np.array([True])
    assert np.array_equal(result, exp)

    result = isnull(np.array([NaT], dtype=object))
    exp = np.array([True])
    assert np.array_equal(result, exp)
def _convert_to_array(self, values, name=None, other=None):
    """converts values to ndarray"""
    from pandas.tseries.timedeltas import _possibly_cast_to_timedelta

    coerce = "compat" if pd._np_version_under1p7 else True
    if not is_list_like(values):
        values = np.array([values])

    inferred_type = lib.infer_dtype(values)

    if inferred_type in ("datetime64", "datetime", "date", "time"):
        # if we have an other of timedelta, but use pd.NaT here,
        # we are in the wrong path
        if (other is not None and other.dtype == "timedelta64[ns]" and
                all(isnull(v) for v in values)):
            values = np.empty(values.shape, dtype=other.dtype)
            values[:] = tslib.iNaT

        # a datelike
        elif not (isinstance(values, (pa.Array, pd.Series)) and
                  com.is_datetime64_dtype(values)):
            values = tslib.array_to_datetime(values)
        elif isinstance(values, pd.DatetimeIndex):
            values = values.to_series()
    elif inferred_type in ("timedelta", "timedelta64"):
        # have a timedelta, convert to ns here
        values = _possibly_cast_to_timedelta(values, coerce=coerce)
    elif inferred_type == "integer":
        # py3 compat where dtype is 'm' but is an integer
        if values.dtype.kind == "m":
            values = values.astype("timedelta64[ns]")
        elif isinstance(values, pd.PeriodIndex):
            values = values.to_timestamp().to_series()
        elif name not in ("__truediv__", "__div__", "__mul__"):
            raise TypeError("incompatible type for a datetime/timedelta "
                            "operation [{0}]".format(name))
    elif isinstance(values[0], pd.DateOffset):
        # handle DateOffsets
        os = pa.array([getattr(v, "delta", None) for v in values])
        mask = isnull(os)
        if mask.any():
            raise TypeError(
                "cannot use a non-absolute DateOffset in "
                "datetime/timedelta operations [{0}]".format(
                    ", ".join([com.pprint_thing(v) for v in values[mask]]))
            )
        values = _possibly_cast_to_timedelta(os, coerce=coerce)
    elif inferred_type == "floating":
        # all nan, so ok, use the other dtype (e.g. timedelta or datetime)
        if isnull(values).all():
            values = np.empty(values.shape, dtype=other.dtype)
            values[:] = tslib.iNaT
        else:
            raise TypeError(
                "incompatible type [{0}] for a datetime/timedelta "
                "operation".format(pa.array(values).dtype)
            )
    else:
        raise TypeError(
            "incompatible type [{0}] for a datetime/timedelta"
            " operation".format(pa.array(values).dtype)
        )

    return values
def _possibly_convert_objects(values, convert_dates=True,
                              convert_numeric=True,
                              convert_timedeltas=True, copy=True):
    """ if we have an object dtype, try to coerce dates and/or numbers """

    # if we have passed in a list or scalar
    if isinstance(values, (list, tuple)):
        values = np.array(values, dtype=np.object_)
    if not hasattr(values, "dtype"):
        values = np.array([values], dtype=np.object_)

    # convert dates
    if convert_dates and values.dtype == np.object_:

        # we take an aggressive stance and convert to datetime64[ns]
        if convert_dates == "coerce":
            new_values = _possibly_cast_to_datetime(values, "M8[ns]",
                                                    errors="coerce")

            # if we are all nans then leave me alone
            if not isnull(new_values).all():
                values = new_values
        else:
            values = lib.maybe_convert_objects(values,
                                               convert_datetime=convert_dates)

    # convert timedeltas
    if convert_timedeltas and values.dtype == np.object_:

        if convert_timedeltas == "coerce":
            from pandas.tseries.timedeltas import to_timedelta
            new_values = to_timedelta(values, coerce=True)

            # if we are all nans then leave me alone
            if not isnull(new_values).all():
                values = new_values
        else:
            values = lib.maybe_convert_objects(
                values, convert_timedelta=convert_timedeltas)

    # convert to numeric
    if values.dtype == np.object_:
        if convert_numeric:
            try:
                new_values = lib.maybe_convert_numeric(values, set(),
                                                       coerce_numeric=True)

                # if we are all nans then leave me alone
                if not isnull(new_values).all():
                    values = new_values
            except:
                pass
        else:
            # soft-conversion
            values = lib.maybe_convert_objects(values)

    values = values.copy() if copy else values
    return values
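# A hedged usage sketch for the coercion helper above (assumes it and the old
# pandas lib module are importable in this scope): object arrays of numeric
# strings convert to a numeric dtype, while input whose coercion yields all
# NaNs is left alone, per the "leave me alone" guard above.
import numpy as np

raw = np.array(['1', '2', '3.5'], dtype=object)
converted = _possibly_convert_objects(raw)       # float64 array
junk = np.array(['x', 'y'], dtype=object)
unchanged = _possibly_convert_objects(junk)      # all-NaN result discarded,
                                                 # object dtype kept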
def fillna(self, fill_value=None, method=None, limit=None, **kwargs):
    """ Fill NA/NaN values using the specified method.

    Parameters
    ----------
    method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
        Method to use for filling holes in reindexed Series
        pad / ffill: propagate last valid observation forward to next valid
        backfill / bfill: use NEXT valid observation to fill gap
    value : scalar
        Value to use to fill holes (e.g. 0)
    limit : int, default None
        Maximum size gap to forward or backward fill (not implemented yet!)

    Returns
    -------
    filled : Categorical with NA/NaN filled
    """

    if fill_value is None:
        fill_value = np.nan
    if limit is not None:
        raise NotImplementedError

    values = self._codes

    # Make sure that we also get NA in categories
    if self.categories.dtype.kind in ['S', 'O', 'f']:
        if np.nan in self.categories:
            values = values.copy()
            nan_pos = np.where(isnull(self.categories))[0]
            # we only have one NA in categories
            values[values == nan_pos] = -1

    # pad / bfill
    if method is not None:

        values = self.to_dense().reshape(-1, len(self))
        values = com.interpolate_2d(
            values, method, 0, None, fill_value).astype(
                self.categories.dtype)[0]
        values = _get_codes_for_values(values, self.categories)

    else:

        if not com.isnull(fill_value) and fill_value not in self.categories:
            raise ValueError("fill value must be in categories")

        mask = values == -1
        if mask.any():
            values = values.copy()
            values[mask] = self.categories.get_loc(fill_value)

    return Categorical(values, categories=self.categories,
                       ordered=self.ordered, name=self.name, fastpath=True)
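# A minimal sketch of the value-fill path above, with hypothetical data: under
# this implementation the replacement must be an existing category, otherwise
# a ValueError is raised.
import numpy as np
import pandas as pd

cat = pd.Categorical(['a', 'b', np.nan], categories=['a', 'b'])
filled = cat.fillna('a')    # the -1 (NA) code becomes the code for 'a'
# cat.fillna('z')           # would raise: fill value must be in categories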
def test_isnull():
    assert not isnull(1.)
    assert isnull(None)
    assert isnull(np.NaN)
    assert not isnull(np.inf)
    assert not isnull(-np.inf)

    # series
    for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
              tm.makeObjectSeries(), tm.makeTimeSeries(),
              tm.makePeriodSeries()]:
        assert isinstance(isnull(s), Series)

    # frame
    for df in [tm.makeTimeDataFrame(), tm.makePeriodFrame(),
               tm.makeMixedDataFrame()]:
        result = isnull(df)
        expected = df.apply(isnull)
        tm.assert_frame_equal(result, expected)

    # panel
    for p in [tm.makePanel(), tm.makePeriodPanel(),
              tm.add_nans(tm.makePanel())]:
        result = isnull(p)
        expected = p.apply(isnull)
        tm.assert_panel_equal(result, expected)

    # panel 4d
    for p in [tm.makePanel4D(), tm.add_nans_panel4d(tm.makePanel4D())]:
        result = isnull(p)
        expected = p.apply(isnull)
        tm.assert_panel4d_equal(result, expected)
def str_cat(arr, others=None, sep=None, na_rep=None):
    """
    Concatenate arrays of strings with given separator

    Parameters
    ----------
    arr : list or array-like
    others : list or array, or list of arrays
    sep : string or None, default None
    na_rep : string or None, default None
        If None, an NA in any array will propagate

    Returns
    -------
    concat : array
    """
    if sep is None:
        sep = ''

    if others is not None:
        arrays = _get_array_list(arr, others)

        n = _length_check(arrays)
        masks = np.array([isnull(x) for x in arrays])
        cats = None

        if na_rep is None:
            na_mask = np.logical_or.reduce(masks, axis=0)

            result = np.empty(n, dtype=object)
            np.putmask(result, na_mask, np.nan)

            # use bitwise inversion; unary minus on bool arrays is
            # deprecated / removed in newer numpy
            notmask = ~na_mask

            tuples = zip(*[x[notmask] for x in arrays])
            cats = [sep.join(tup) for tup in tuples]

            result[notmask] = cats
        else:
            for i, x in enumerate(arrays):
                x = np.where(masks[i], na_rep, x)
                if cats is None:
                    cats = x
                else:
                    cats = cats + sep + x

            result = cats

        return result
    else:
        arr = np.asarray(arr, dtype=object)
        mask = isnull(arr)
        if na_rep is None and mask.any():
            return np.nan
        return sep.join(np.where(mask, na_rep, arr))
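# Usage sketch for str_cat (hypothetical arrays; assumes the helper is in
# scope): with no `others` the whole array reduces to one string, and with
# na_rep=None any NA propagates to the result.
import numpy as np

str_cat(np.array(['a', 'b', 'c'], dtype=object), sep='-')            # 'a-b-c'
str_cat(np.array(['a', np.nan], dtype=object), sep='-')              # nan
str_cat(np.array(['a', np.nan], dtype=object), sep='-', na_rep='?')  # 'a-?'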
def test_na_handling(self):
    arr = np.arange(0, 0.75, 0.01)
    arr[::3] = np.nan

    labels = cut(arr, 4)
    ex_labels = np.where(com.isnull(arr), np.nan, labels)
    tm.assert_almost_equal(labels, ex_labels)

    labels = cut(arr, 4, labels=False)
    ex_labels = np.where(com.isnull(arr), np.nan, labels)
    tm.assert_almost_equal(labels, ex_labels)
def na_op(x, y):

    # dispatch to the categorical if we have a categorical
    # in either operand
    if is_categorical_dtype(x):
        return op(x, y)
    elif is_categorical_dtype(y) and not isscalar(y):
        return op(y, x)

    if is_object_dtype(x.dtype):
        result = _comp_method_OBJECT_ARRAY(op, x, y)
    else:

        # we want to compare like types
        # we only want to convert to integer like if
        # we are not NotImplemented, otherwise
        # we would allow datetime64 (but viewed as i8) against
        # integer comparisons
        if is_datetimelike_v_numeric(x, y):
            raise TypeError("invalid type comparison")

        # numpy does not like comparisons vs None
        if isscalar(y) and isnull(y):
            if name == '__ne__':
                return np.ones(len(x), dtype=bool)
            else:
                return np.zeros(len(x), dtype=bool)

        # we have a datetime/timedelta and may need to convert
        mask = None
        if (needs_i8_conversion(x) or
                (not isscalar(y) and needs_i8_conversion(y))):

            if isscalar(y):
                mask = isnull(x)
                y = _index.convert_scalar(x, _values_from_object(y))
            else:
                mask = isnull(x) | isnull(y)
                y = y.view('i8')
            x = x.view('i8')

        try:
            result = getattr(x, name)(y)
            if result is NotImplemented:
                raise TypeError("invalid type comparison")
        except AttributeError:
            result = op(x, y)

        if mask is not None and mask.any():
            result[mask] = masker

    return result
def f(x, y):
    xmask = isnull(x)
    ymask = isnull(y)
    mask = xmask | ymask

    result = op(x, y)

    if mask.any():
        if result.dtype == np.bool_:
            result = result.astype('O')
        np.putmask(result, mask, np.nan)

    return result
def test_isnull_datetime():
    assert not isnull(datetime.now())
    assert notnull(datetime.now())

    idx = date_range('1/1/1990', periods=20)
    assert notnull(idx).all()

    idx = np.asarray(idx)
    idx[0] = iNaT
    idx = DatetimeIndex(idx)
    mask = isnull(idx)
    assert mask[0]
    assert not mask[1:].any()
def f(x, y):
    xmask = isnull(x)
    ymask = isnull(y)
    mask = xmask | ymask

    result = op(x, y)

    if mask.any():
        if is_bool_dtype(result):
            result = result.astype("O")
        np.putmask(result, mask, np.nan)

    return result
def to_dense(self, fill=None):
    """ Convert SparseSeries to (dense) Series """
    values = self.values

    # fill the nans
    if fill is None:
        fill = self.fill_value
    if not com.isnull(fill):
        values[com.isnull(values)] = fill

    return values
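# A hedged usage sketch (old SparseSeries API, hypothetical data): passing an
# explicit `fill` replaces NaNs in the densified values. Note the code above
# writes into `values` in place rather than copying first, so callers may
# want to pass a copy.
import numpy as np
import pandas as pd

sp = pd.Series([1.0, np.nan, 3.0]).to_sparse()
dense = sp.to_dense(fill=0.0)   # 1.0, 0.0, 3.0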
def assert_almost_equal(a, b, check_less_precise=False):
    if isinstance(a, dict) or isinstance(b, dict):
        return assert_dict_equal(a, b)

    if isinstance(a, compat.string_types):
        assert a == b, "%s != %s" % (a, b)
        return True

    if isiterable(a):
        np.testing.assert_(isiterable(b))
        na, nb = len(a), len(b)
        assert na == nb, "%s != %s" % (na, nb)
        # TODO: Figure out why I thought this needed instance checks...
        # if (isinstance(a, np.ndarray) and isinstance(b, np.ndarray) and
        #         np.array_equal(a, b)):
        if np.array_equal(a, b):
            return True
        else:
            for i in range(na):
                assert_almost_equal(a[i], b[i], check_less_precise)
        return True

    err_msg = lambda a, b: 'expected %.5f but got %.5f' % (b, a)

    if isnull(a):
        np.testing.assert_(isnull(b))
        return

    if isinstance(a, (bool, float, int, np.float32)):
        decimal = 5

        # deal with differing dtypes
        if check_less_precise:
            dtype_a = np.dtype(type(a))
            dtype_b = np.dtype(type(b))

            if dtype_a.kind == 'f' and dtype_b.kind == 'f':
                if dtype_a.itemsize <= 4 and dtype_b.itemsize <= 4:
                    decimal = 3

        if np.isinf(a):
            assert np.isinf(b), err_msg(a, b)

        # case for zero
        elif abs(a) < 1e-5:
            np.testing.assert_almost_equal(
                a, b, decimal=decimal, err_msg=err_msg(a, b), verbose=False)
        else:
            np.testing.assert_almost_equal(
                1, a / b, decimal=decimal, err_msg=err_msg(a, b),
                verbose=False)
    else:
        assert a == b, "%s != %s" % (a, b)
def assert_almost_equal(a, b, check_less_precise=False):
    if isinstance(a, dict) or isinstance(b, dict):
        return assert_dict_equal(a, b)

    if isinstance(a, basestring):
        assert a == b, "{0} != {1}".format(a, b)
        return True

    if isiterable(a):
        np.testing.assert_(isiterable(b))
        na, nb = len(a), len(b)
        assert na == nb, "{0} != {1}".format(na, nb)
        if np.array_equal(a, b):
            return True
        else:
            for i in xrange(na):
                assert_almost_equal(a[i], b[i], check_less_precise)
        return True

    err_msg = lambda a, b: 'expected %.5f but got %.5f' % (a, b)

    if isnull(a):
        np.testing.assert_(isnull(b))
        return

    if isinstance(a, (bool, float, int, np.float32)):
        decimal = 5

        # deal with differing dtypes
        if check_less_precise:
            dtype_a = np.dtype(type(a))
            dtype_b = np.dtype(type(b))

            if dtype_a.kind == 'i' and dtype_b.kind == 'i':
                pass

            if dtype_a.kind == 'f' and dtype_b.kind == 'f':
                if dtype_a.itemsize <= 4 and dtype_b.itemsize <= 4:
                    decimal = 3

        if np.isinf(a):
            assert np.isinf(b), err_msg(a, b)

        # case for zero
        elif abs(a) < 1e-5:
            np.testing.assert_almost_equal(
                a, b, decimal=decimal, err_msg=err_msg(a, b), verbose=False)
        else:
            np.testing.assert_almost_equal(
                1, a / b, decimal=decimal, err_msg=err_msg(a, b),
                verbose=False)
    else:
        assert a == b, "%s != %s" % (a, b)
def fillna(self, value=None, method='pad'):
    """
    Fill NaN values using the specified method.

    Parameters
    ----------
    value : any kind (should be same type as array)
        Value to use to fill holes (e.g. 0)

    method : {'backfill', 'bfill', 'pad', 'ffill', None}, default 'pad'
        Method to use for filling holes in reindexed Series
        pad / ffill: propagate last valid observation forward to next valid
        backfill / bfill: use NEXT valid observation to fill gap

    Returns
    -------
    TimeSeries with NaN's filled

    See also
    --------
    reindex, asfreq
    """
    if value is not None:
        newSeries = self.copy()
        newSeries[isnull(newSeries)] = value
        return newSeries
    else:  # Using reindex to pad / backfill
        if method is None:  # pragma: no cover
            raise ValueError('must specify a fill method')

        method = method.lower()

        if method == 'ffill':
            method = 'pad'
        if method == 'bfill':
            method = 'backfill'

        mask = isnull(self.values)

        if _numpy_lt_151():  # pragma: no cover
            mask = mask.astype(np.uint8)

        if method == 'pad':
            indexer = _tseries.get_pad_indexer(mask)
        elif method == 'backfill':
            indexer = _tseries.get_backfill_indexer(mask)

        new_values = self.values.take(indexer)
        return Series(new_values, index=self.index)
def _mask_missing(array, missing_values):
    if np.isscalar(missing_values):
        missing_values = [missing_values]

    # initialize mask so the loop below is well-defined even when no
    # NA values are present in missing_values
    mask = None
    missing_values = np.array(missing_values, dtype=object)
    if com.isnull(missing_values).any():
        mask = com.isnull(array)
        missing_values = missing_values[com.notnull(missing_values)]

    # compare against each remaining value, not the whole array
    for v in missing_values:
        if mask is None:
            mask = array == v
        else:
            mask |= array == v

    return mask
def test_isnull_datetime():
    assert not isnull(datetime.now())
    assert notnull(datetime.now())

    idx = date_range("1/1/1990", periods=20)
    assert notnull(idx).all()

    import pandas.lib as lib
    idx = np.asarray(idx)
    idx[0] = lib.iNaT
    idx = DatetimeIndex(idx)
    mask = isnull(idx)
    assert mask[0]
    assert not mask[1:].any()
def test_isnull_lists():
    result = isnull([[False]])
    exp = np.array([[False]])
    assert np.array_equal(result, exp)

    result = isnull([[1], [2]])
    exp = np.array([[False], [False]])
    assert np.array_equal(result, exp)

    # list of strings / unicode
    result = isnull(["foo", "bar"])
    assert not result.any()

    result = isnull([u("foo"), u("bar")])
    assert not result.any()
def test_isnull_lists():
    result = isnull([[False]])
    exp = np.array([[False]])
    assert np.array_equal(result, exp)

    result = isnull([[1], [2]])
    exp = np.array([[False], [False]])
    assert np.array_equal(result, exp)

    # list of strings / unicode
    result = isnull(['foo', 'bar'])
    assert not result.any()

    result = isnull([u('foo'), u('bar')])
    assert not result.any()
def test_na_handling(self):
    arr = np.arange(0, 0.75, 0.01)
    arr[::3] = np.nan

    result = cut(arr, 4)

    result_arr = np.asarray(result)

    ex_arr = np.where(com.isnull(arr), np.nan, result_arr)

    tm.assert_almost_equal(result_arr, ex_arr)

    result = cut(arr, 4, labels=False)
    ex_result = np.where(com.isnull(arr), np.nan, result)
    tm.assert_almost_equal(result, ex_result)
def test_notnull():
    assert notnull(1.)
    assert not notnull(None)
    assert not notnull(np.NaN)

    with cf.option_context("mode.use_inf_as_null", False):
        assert notnull(np.inf)
        assert notnull(-np.inf)

        arr = np.array([1.5, np.inf, 3.5, -np.inf])
        result = notnull(arr)
        assert result.all()

    with cf.option_context("mode.use_inf_as_null", True):
        assert not notnull(np.inf)
        assert not notnull(-np.inf)

        arr = np.array([1.5, np.inf, 3.5, -np.inf])
        result = notnull(arr)
        assert result.sum() == 2

    with cf.option_context("mode.use_inf_as_null", False):
        for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
                  tm.makeObjectSeries(), tm.makeTimeSeries(),
                  tm.makePeriodSeries()]:
            assert isinstance(isnull(s), Series)
def wrapper(self, other):
    func = getattr(super(TimedeltaIndex, self), opname)
    if _is_convertible_to_td(other):
        other = _to_m8(other)
        result = func(other)
        if com.isnull(other):
            result.fill(nat_result)
    else:
        if not com.is_list_like(other):
            raise TypeError("cannot compare a TimedeltaIndex with type "
                            "{0}".format(type(other)))

        other = TimedeltaIndex(other).values
        result = func(other)
        result = _values_from_object(result)

        if isinstance(other, Index):
            o_mask = other.values.view('i8') == tslib.iNaT
        else:
            o_mask = other.view('i8') == tslib.iNaT

        if o_mask.any():
            result[o_mask] = nat_result

    if self.hasnans:
        result[self._isnan] = nat_result

    # support of bool dtype indexers
    if com.is_bool_dtype(result):
        return result
    return Index(result)
def _mask_missing(array, missing_values):
    if not isinstance(missing_values, (list, np.ndarray)):
        missing_values = [missing_values]

    mask = None
    missing_values = np.array(missing_values, dtype=object)
    if com.isnull(missing_values).any():
        mask = com.isnull(array)
        missing_values = missing_values[com.notnull(missing_values)]

    # compare against each remaining value, not the whole array
    for v in missing_values:
        if mask is None:
            mask = array == v
        else:
            mask |= array == v

    return mask
def get_result(self):
    # TODO: find a better way than this masking business

    values, value_mask = self.get_new_values()
    columns = self.get_new_columns()
    index = self.get_new_index()

    # filter out missing levels
    if values.shape[1] > 0:
        col_inds, obs_ids = _compress_group_index(self.sorted_labels[-1])
        # rare case, level values not observed
        if len(obs_ids) < self.full_shape[1]:
            inds = (value_mask.sum(0) > 0).nonzero()[0]
            values = com.take_nd(values, inds, axis=1)
            columns = columns[inds]

    # we might have a missing index
    if len(index) != values.shape[0]:
        mask = isnull(index)
        if mask.any():
            l = np.arange(len(index))
            values, orig_values = (np.empty((len(index), values.shape[1])),
                                   values)
            values.fill(np.nan)
            values_indexer = com._ensure_int64(l[~mask])
            for i, j in enumerate(values_indexer):
                values[j] = orig_values[i]
        else:
            index = index.take(self.unique_groups)

    return DataFrame(values, index=index, columns=columns)
def test_isnull():
    assert not isnull(1.)
    assert isnull(None)
    assert isnull(np.NaN)
    assert not isnull(np.inf)
    assert not isnull(-np.inf)

    # series
    for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
              tm.makeObjectSeries(), tm.makeTimeSeries(),
              tm.makePeriodSeries()]:
        assert isinstance(isnull(s), Series)

    # frame
    for df in [tm.makeTimeDataFrame(), tm.makePeriodFrame(),
               tm.makeMixedDataFrame()]:
        result = isnull(df)
        expected = df.apply(isnull)
        tm.assert_frame_equal(result, expected)

    # panel
    for p in [tm.makePanel(), tm.makePeriodPanel(),
              tm.add_nans(tm.makePanel())]:
        result = isnull(p)
        expected = p.apply(isnull)
        tm.assert_panel_equal(result, expected)

    # panel 4d
    for p in [tm.makePanel4D(), tm.add_nans_panel4d(tm.makePanel4D())]:
        result = isnull(p)
        expected = p.apply(isnull)
        tm.assert_panel4d_equal(result, expected)
def get_result(self):

    # series only
    if self._is_series:

        # stack blocks
        if self.axis == 0:
            new_data = com._concat_compat([x._values for x in self.objs])
            name = com._consensus_name_attr(self.objs)
            return (Series(new_data, index=self.new_axes[0], name=name)
                    .__finalize__(self, method='concat'))

        # combine as columns in a frame
        else:
            data = dict(zip(range(len(self.objs)), self.objs))
            index, columns = self.new_axes
            tmpdf = DataFrame(data, index=index)
            # checks if the column variable already stores valid column
            # names (because they were set via the 'key' argument in the
            # 'concat' function call). If that's not the case, use the
            # series names as column names
            if (columns.equals(Index(np.arange(len(self.objs)))) and
                    not self.ignore_index):
                columns = np.array([data[i].name
                                    for i in range(len(data))],
                                   dtype='object')
                indexer = isnull(columns)
                if indexer.any():
                    columns[indexer] = np.arange(len(indexer[indexer]))
            tmpdf.columns = columns
            return tmpdf.__finalize__(self, method='concat')

    # combine block managers
    else:
        mgrs_indexers = []
        for obj in self.objs:
            mgr = obj._data
            indexers = {}
            for ax, new_labels in enumerate(self.new_axes):
                if ax == self.axis:
                    # Suppress reindexing on concat axis
                    continue

                obj_labels = mgr.axes[ax]
                if not new_labels.equals(obj_labels):
                    indexers[ax] = obj_labels.reindex(new_labels)[1]

            mgrs_indexers.append((obj._data, indexers))

        new_data = concatenate_block_managers(
            mgrs_indexers, self.new_axes, concat_axis=self.axis,
            copy=self.copy)
        if not self.copy:
            new_data._consolidate_inplace()

        return (self.objs[0]._from_axes(new_data, self.new_axes)
                .__finalize__(self, method='concat'))
def wrapper(self, other):
    if isinstance(other, pd.Series):
        name = _maybe_match_name(self, other)
        if len(self) != len(other):
            raise ValueError('Series lengths must match to compare')
        return self._constructor(na_op(self.values, other.values),
                                 index=self.index, name=name)
    elif isinstance(other, pd.DataFrame):  # pragma: no cover
        return NotImplemented
    elif isinstance(other, (pa.Array, pd.Index)):
        if len(self) != len(other):
            raise ValueError('Lengths must match to compare')
        return self._constructor(na_op(self.values, np.asarray(other)),
                                 index=self.index).__finalize__(self)
    elif isinstance(other, pd.Categorical):
        if not com.is_categorical_dtype(self):
            msg = ("Cannot compare a Categorical for op {op} with Series "
                   "of dtype {typ}.\nIf you want to compare values, use "
                   "'series <op> np.asarray(other)'.")
            raise TypeError(msg.format(op=op, typ=self.dtype))
    else:
        mask = isnull(self)

        values = self.get_values()
        other = _index.convert_scalar(values, _values_from_object(other))

        if issubclass(values.dtype.type, np.datetime64):
            values = values.view('i8')

        # scalars
        res = na_op(values, other)
        if np.isscalar(res):
            raise TypeError('Could not compare %s type with Series'
                            % type(other))

        # always return a full value series here
        res = _values_from_object(res)

        res = pd.Series(res, index=self.index, name=self.name, dtype='bool')

        # mask out the invalids
        if mask.any():
            res[mask] = masker
        return res
def value_counts(values, sort=True, ascending=False, normalize=False):
    """
    Compute a histogram of the counts of non-null values

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize : boolean, default False
        If True then compute a relative histogram

    Returns
    -------
    value_counts : Series
    """
    from pandas.core.series import Series

    values = np.asarray(values)

    if com.is_integer_dtype(values.dtype):
        values = com._ensure_int64(values)
        keys, counts = htable.value_count_int64(values)
    elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
        dtype = values.dtype
        values = values.view(np.int64)
        keys, counts = htable.value_count_int64(values)

        # convert the keys back to the dtype we came in
        keys = Series(keys, dtype=dtype)
    else:
        mask = com.isnull(values)
        values = com._ensure_object(values)
        keys, counts = htable.value_count_object(values, mask)

    result = Series(counts, index=keys)

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    if normalize:
        result = result / float(values.size)

    return result
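# Usage sketch (hypothetical data): counts come back sorted descending by
# default; normalize=True divides each count by the total number of values.
import numpy as np

counts = value_counts(np.array([3, 1, 3, 3, 2, 1]))
# index [3, 1, 2] with counts [3, 2, 1]
freqs = value_counts(np.array([3, 1, 3, 3, 2, 1]), normalize=True)
# same order, counts divided by 6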
def nanvar(values, axis=None, skipna=True, ddof=1):
    if not _is_floating_dtype(values):
        values = values.astype('f8')

    mask = isnull(values)

    count, d = _get_counts_nanvar(mask, axis, ddof)

    if skipna:
        values = values.copy()
        np.putmask(values, mask, 0)

    X = _ensure_numeric(values.sum(axis))
    XX = _ensure_numeric((values ** 2).sum(axis))
    return np.fabs((XX - X ** 2 / count) / d)
def dropna(self, axis=0, inplace=False, **kwargs):
    """
    Analogous to Series.dropna. If fill_value=NaN, returns a dense Series
    """
    # TODO: make more efficient
    axis = self._get_axis_number(axis or 0)
    dense_valid = self.to_dense().valid()
    if inplace:
        raise NotImplementedError("Cannot perform inplace dropna"
                                  " operations on a SparseSeries")
    if isnull(self.fill_value):
        return dense_valid
    else:
        dense_valid = dense_valid[dense_valid != self.fill_value]
        return dense_valid.to_sparse(fill_value=self.fill_value)
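# Hedged sketch of the two branches above (old SparseSeries API, hypothetical
# data): a NaN fill_value yields a dense Series of the valid entries, while a
# non-NaN fill_value also drops entries equal to it and re-sparsifies.
import numpy as np
import pandas as pd

sp = pd.Series([1.0, np.nan, 3.0]).to_sparse()              # fill_value NaN
dense = sp.dropna()                                         # dense [1.0, 3.0]

sp0 = pd.Series([1.0, 0.0, 3.0]).to_sparse(fill_value=0.0)
sparse = sp0.dropna()                                       # sparse [1.0, 3.0]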
def _nanvar(values, axis=None, skipna=True, ddof=1):
    # private nanvar calculator
    mask = isnull(values)
    if is_any_int_dtype(values):
        values = values.astype('f8')

    count, d = _get_counts_nanvar(mask, axis, ddof)

    if skipna:
        values = values.copy()
        np.putmask(values, mask, 0)

    X = _ensure_numeric(values.sum(axis))
    XX = _ensure_numeric((values ** 2).sum(axis))
    return np.fabs((XX - X ** 2 / count) / d)
def _nanvar(values, axis=None, skipna=True, ddof=1):
    mask = isnull(values)

    if axis is not None:
        count = (values.shape[axis] - mask.sum(axis)).astype(float)
    else:
        count = float(values.size - mask.sum())

    if skipna:
        values = values.copy()
        np.putmask(values, mask, 0)

    X = _ensure_numeric(values.sum(axis))
    XX = _ensure_numeric((values ** 2).sum(axis))
    return np.fabs((XX - X ** 2 / count) / (count - ddof))
def _evaluate_with_timedelta_like(self, other, op, opstr):

    # allow division by a timedelta
    if opstr in ['__div__', '__truediv__']:
        if _is_convertible_to_td(other):
            other = Timedelta(other)
            if isnull(other):
                raise NotImplementedError(
                    "division by pd.NaT not implemented")

            i8 = self.asi8
            result = i8 / float(other.value)
            result = self._maybe_mask_results(result, convert='float64')
            return Index(result, name=self.name, copy=False)

    return NotImplemented
def fillna(self, value, downcast=None):
    if downcast is not None:
        raise NotImplementedError

    if issubclass(self.dtype.type, np.floating):
        value = float(value)

    if self._null_fill_value:
        return self._simple_new(self.sp_values, self.sp_index,
                                fill_value=value)
    else:
        new_values = self.sp_values.copy()
        new_values[com.isnull(new_values)] = value
        return self._simple_new(new_values, self.sp_index,
                                fill_value=self.fill_value)
def _check_bool_indexer(ax, key):
    # boolean indexing, need to check that the data are aligned, otherwise
    # disallowed
    result = key
    if _is_series(key) and key.dtype == np.bool_:
        if not key.index.equals(ax):
            result = key.reindex(ax)

    if isinstance(result, np.ndarray) and result.dtype == np.object_:
        mask = com.isnull(result)
        if mask.any():
            raise IndexingError('cannot index with vector containing '
                                'NA / NaN values')

    return result
def _get_values(values, skipna, fill_value=None, fill_value_typ=None,
                isfinite=False, copy=True):
    """ utility to get the values view, mask, dtype
    if necessary copy and mask using the specified fill_value
    copy = True will force the copy """
    values = _values_from_object(values)
    if isfinite:
        mask = _isfinite(values)
    else:
        mask = isnull(values)

    dtype = values.dtype
    dtype_ok = _na_ok_dtype(dtype)

    # get our fill value (in case we need to provide an alternative
    # dtype for it)
    fill_value = _get_fill_value(dtype, fill_value=fill_value,
                                 fill_value_typ=fill_value_typ)

    if skipna:
        if copy:
            values = values.copy()
        if dtype_ok:
            np.putmask(values, mask, fill_value)

        # promote if needed
        else:
            values, changed = _maybe_upcast_putmask(values, mask, fill_value)

    elif copy:
        values = values.copy()

    values = _view_if_needed(values)

    # return a platform independent precision dtype
    dtype_max = dtype
    if is_integer_dtype(dtype) or is_bool_dtype(dtype):
        dtype_max = np.int64
    elif is_float_dtype(dtype):
        dtype_max = np.float64

    return values, mask, dtype, dtype_max
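# A hedged sketch of the contract above (hypothetical input; assumes the
# private helpers are in scope): with skipna=True the NaN slots are replaced
# by the computed fill value, and dtype_max gives the accumulation dtype.
import numpy as np

vals, mask, dtype, dtype_max = _get_values(np.array([1.0, np.nan, 3.0]),
                                           skipna=True, fill_value=0.0)
# vals -> [1.0, 0.0, 3.0]; mask -> [False, True, False]
# dtype -> float64; dtype_max -> np.float64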
def nanvar(values, axis=None, skipna=True, copy=True, ddof=1):
    mask = isnull(values)

    if axis is not None:
        count = (values.shape[axis] - mask.sum(axis)).astype(float)
    else:
        count = float(values.size - mask.sum())

    if skipna:
        if copy:
            values = values.copy()
        np.putmask(values, mask, 0)

    X = values.sum(axis)
    XX = (values ** 2).sum(axis)
    return (XX - X ** 2 / count) / (count - ddof)
def _format_native_types(self, na_rep=u('NaT'), date_format=None, **kwargs):
    values = np.array(list(self), dtype=object)
    mask = isnull(self.values)
    values[mask] = na_rep

    imask = ~mask

    if date_format:
        formatter = lambda dt: dt.strftime(date_format)
    else:
        formatter = lambda dt: u('%s') % dt

    values[imask] = np.array([formatter(dt) for dt in values[imask]])
    return values
def test_isnull():
    assert not isnull(1.)
    assert isnull(None)
    assert isnull(np.NaN)
    assert isnull(np.inf)
    assert isnull(-np.inf)

    float_series = Series(np.random.randn(5))
    obj_series = Series(np.random.randn(5), dtype=object)
    assert isinstance(isnull(float_series), Series)
    assert isinstance(isnull(obj_series), Series)

    # call on DataFrame
    df = DataFrame(np.random.randn(10, 5))
    df['foo'] = 'bar'
    result = isnull(df)
    expected = df.apply(isnull)
    tm.assert_frame_equal(result, expected)
def _validate_levels(cls, levels):
    """ Validates that we have good levels """
    if not isinstance(levels, Index):
        dtype = None
        if not hasattr(levels, "dtype"):
            levels = _convert_to_list_like(levels)
            # on levels with NaNs, int values would be converted to float.
            # Use "object" dtype to prevent this.
            if isnull(levels).any():
                without_na = np.array([x for x in levels
                                       if com.notnull(x)])
                with_na = np.array(levels)
                if with_na.dtype != without_na.dtype:
                    dtype = "object"
        levels = Index(levels, dtype=dtype)
    if not levels.is_unique:
        raise ValueError('Categorical levels must be unique')
    return levels
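# A hedged sketch of the NaN-aware dtype choice above (hypothetical input):
# int levels containing a NaN would upcast to float in a plain np.array, so
# the validator compares dtypes with and without the NA and falls back to
# object dtype when they differ.
import numpy as np

with_na = np.array([1, 2, np.nan])   # dtype float64
without_na = np.array([1, 2])        # dtype int64
# dtypes differ -> levels are built as Index([1, 2, nan], dtype="object")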
def fill(self, value=None, method='pad'):
    """
    Fill NaN values using the specified method.

    Member Series / TimeSeries are filled separately.

    Parameters
    ----------
    value : any kind (should be same type as array)
        Value to use to fill holes (e.g. 0)

    method : {'backfill', 'pad', None}
        Method to use for filling holes in new index

    Returns
    -------
    y : DataMatrix

    See also
    --------
    DataMatrix.reindex, DataMatrix.asfreq
    """
    if value is None:
        result = {}
        series = self._series
        for col, s in series.iteritems():
            result[col] = s.fill(method=method, value=value)

        return DataMatrix(result, index=self.index, objects=self.objects)
    else:
        # Float type values
        if len(self.columns) == 0:
            return self

        vals = self.values.copy()
        vals.flat[common.isnull(vals.ravel())] = value

        objects = None

        if self.objects is not None:
            objects = self.objects.copy()

        return DataMatrix(vals, index=self.index, columns=self.columns,
                          objects=objects)
def take_nd(self, indexer, allow_fill=True, fill_value=None):
    """ Take the codes by the indexer, fill with the fill_value. """

    # filling must always be None/nan here
    # but is passed thru internally
    assert isnull(fill_value)

    codes = com.take_1d(self._codes, indexer, allow_fill=True, fill_value=-1)
    result = Categorical(codes, levels=self.levels, ordered=self.ordered,
                         name=self.name, fastpath=True)
    return result
def _validate_categories(cls, categories):
    """ Validates that we have good categories """
    if not isinstance(categories, Index):
        dtype = None
        if not hasattr(categories, "dtype"):
            categories = _convert_to_list_like(categories)
            # on categories with NaNs, int values would be converted to
            # float. Use "object" dtype to prevent this.
            if isnull(categories).any():
                without_na = np.array([x for x in categories
                                       if com.notnull(x)])
                with_na = np.array(categories)
                if with_na.dtype != without_na.dtype:
                    dtype = "object"
        categories = Index(categories, dtype=dtype)
    if not categories.is_unique:
        raise ValueError('Categorical categories must be unique')
    return categories
def _check_bool_indexer(ax, key):
    # boolean indexing, need to check that the data are aligned, otherwise
    # disallowed

    # this function assumes that com._is_bool_indexer(key) == True

    result = key
    if _is_series(key) and not key.index.equals(ax):
        result = result.reindex(ax)
        mask = com.isnull(result)
        if mask.any():
            raise IndexingError('Unalignable boolean Series key provided')

    # com._is_bool_indexer has already checked for nulls in the case of an
    # object array key, so no check needed here
    result = np.asarray(result, dtype=bool)
    return result
def fillna(self, fill_value=None, method=None, limit=None, **kwargs):
    """ Fill NA/NaN values using the specified method.

    Parameters
    ----------
    method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
        Method to use for filling holes in reindexed Series
        pad / ffill: propagate last valid observation forward to next valid
        backfill / bfill: use NEXT valid observation to fill gap
    value : scalar
        Value to use to fill holes (e.g. 0)
    limit : int, default None
        Maximum size gap to forward or backward fill (not implemented yet!)

    Returns
    -------
    filled : Categorical with NA/NaN filled
    """

    if fill_value is None:
        fill_value = np.nan
    if limit is not None:
        raise NotImplementedError

    values = self._codes

    # pad / bfill
    if method is not None:

        values = self.to_dense().reshape(-1, len(self))
        values = com.interpolate_2d(
            values, method, 0, None, fill_value).astype(
                self.levels.dtype)[0]
        values = _get_codes_for_values(values, self.levels)

    else:

        if not com.isnull(fill_value) and fill_value not in self.levels:
            raise ValueError("fill value must be in levels")

        mask = self._codes == -1
        if mask.any():
            values = self._codes.copy()
            values[mask] = self.levels.get_loc(fill_value)

    return Categorical(values, levels=self.levels, ordered=self.ordered,
                       name=self.name, fastpath=True)
def test_nan_to_nat_conversions():

    df = DataFrame(dict({
        'A': np.asarray(list(range(10)), dtype='float64'),
        'B': Timestamp('20010101')}))
    df.iloc[3:6, :] = np.nan
    result = df.loc[4, 'B'].value
    assert result == iNaT

    values = df['B'].values
    result, changed = com._maybe_upcast_indexer(values,
                                                tuple([slice(8, 9)]),
                                                np.nan)
    assert isnull(result[8])

    # numpy < 1.7.0 is wrong
    from distutils.version import LooseVersion
    if LooseVersion(np.__version__) >= '1.7.0':
        assert result[8] == np.datetime64('NaT')
def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
                  precision=3, name=None, include_lowest=False):
    if name is None and isinstance(x, Series):
        name = x.name
    x = np.asarray(x)

    side = 'left' if right else 'right'
    ids = bins.searchsorted(x, side=side)

    if include_lowest:
        ids[x == bins[0]] = 1

    na_mask = com.isnull(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if labels is None:
            fmt = lambda v: _format_label(v, precision=precision)
            if right:
                levels = ['(%s, %s]' % (fmt(a), fmt(b))
                          for a, b in zip(bins, bins[1:])]
                if include_lowest:
                    levels[0] = '[' + levels[0][1:]
            else:
                levels = ['[%s, %s)' % (fmt(a), fmt(b))
                          for a, b in zip(bins, bins[1:])]
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')
            levels = labels

        levels = np.asarray(levels, dtype=object)
        np.putmask(ids, na_mask, 0)
        fac = Categorical(ids - 1, levels, name=name)
    else:
        fac = ids - 1
        if has_nas:
            fac = fac.astype(np.float64)
            np.putmask(fac, na_mask, np.nan)

    if not retbins:
        return fac

    return fac, bins
def test_nan_to_nat_conversions():

    df = DataFrame(dict({
        'A': np.asarray(lrange(10), dtype='float64'),
        'B': Timestamp('20010101')}))
    df.iloc[3:6, :] = np.nan
    result = df.loc[4, 'B'].value
    assert result == iNaT

    s = df['B'].copy()
    s._data = s._data.setitem(indexer=tuple([slice(8, 9)]), value=np.nan)
    assert isnull(s[8])

    # numpy < 1.7.0 is wrong
    from distutils.version import LooseVersion
    if LooseVersion(np.__version__) >= '1.7.0':
        assert s[8].value == np.datetime64('NaT').astype(np.int64)
def wrapper(self, other):
    if isinstance(other, pd.Series):
        name = _maybe_match_name(self, other)
        if len(self) != len(other):
            raise ValueError('Series lengths must match to compare')
        return self._constructor(na_op(self.values, other.values),
                                 index=self.index, name=name)
    elif isinstance(other, pd.DataFrame):  # pragma: no cover
        return NotImplemented
    elif isinstance(other, (pa.Array, pd.Series)):
        if len(self) != len(other):
            raise ValueError('Lengths must match to compare')
        return self._constructor(na_op(self.values, np.asarray(other)),
                                 index=self.index, name=self.name)
    else:
        mask = isnull(self)

        values = self.values
        other = _index.convert_scalar(values, other)

        if issubclass(values.dtype.type, np.datetime64):
            values = values.view('i8')

        # scalars
        res = na_op(values, other)
        if np.isscalar(res):
            raise TypeError('Could not compare %s type with Series'
                            % type(other))

        # always return a full value series here
        res = _values_from_object(res)

        res = pd.Series(res, index=self.index, name=self.name, dtype='bool')

        # mask out the invalids
        if mask.any():
            res[mask] = masker
        return res
def _nanmean(values, axis=None, skipna=True):
    mask = isnull(values)

    if skipna and not issubclass(values.dtype.type, np.integer):
        values = values.copy()
        np.putmask(values, mask, 0)

    the_sum = _ensure_numeric(values.sum(axis))
    count = _get_counts(mask, axis)

    if axis is not None:
        the_mean = the_sum / count
        ct_mask = count == 0
        if ct_mask.any():
            the_mean[ct_mask] = np.nan
    else:
        the_mean = the_sum / count if count > 0 else np.nan
    return the_mean
def _format_datetime64(x, tz=None):
    if isnull(x):
        return 'NaT'

    stamp = lib.Timestamp(x, tz=tz)
    base = stamp.strftime('%Y-%m-%d %H:%M:%S')

    fraction = stamp.microsecond * 1000 + stamp.nanosecond
    digits = 9

    if fraction == 0:
        return base

    while (fraction % 10) == 0:
        # floor division keeps fraction an int so the '%d' format below
        # still works under Python 3
        fraction //= 10
        digits -= 1

    return base + ('.%%.%id' % digits) % fraction
def _nanmax(values, axis=None, skipna=True):
    mask = isnull(values)

    if skipna and not issubclass(values.dtype.type, np.integer):
        values = values.copy()
        np.putmask(values, mask, -np.inf)

    # numpy 1.6.1 workaround in Python 3.x
    if (values.dtype == np.object_ and
            sys.version_info[0] >= 3):  # pragma: no cover
        # this branch only runs on Python 3, where the module is 'builtins'
        import builtins
        if values.ndim > 1:
            apply_ax = axis if axis is not None else 0
            result = np.apply_along_axis(builtins.max, apply_ax, values)
        else:
            result = builtins.max(values)
    else:
        result = values.max(axis)

    return _maybe_null_out(result, axis, mask)
def test_isnull():
    assert not isnull(1.)
    assert isnull(None)
    assert isnull(np.NaN)
    assert not isnull(np.inf)
    assert not isnull(-np.inf)

    for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
              tm.makeObjectSeries(), tm.makeTimeSeries(),
              tm.makePeriodSeries()]:
        assert isinstance(isnull(s), np.ndarray)

    # call on DataFrame
    df = DataFrame(np.random.randn(10, 5))
    df['foo'] = 'bar'
    result = isnull(df)
    expected = df.apply(isnull)
    tm.assert_frame_equal(result, expected)
def _convert_to_array(self, values, name=None):
    """converts values to ndarray"""
    from pandas.tseries.timedeltas import _possibly_cast_to_timedelta

    coerce = 'compat' if pd._np_version_under1p7 else True
    if not is_list_like(values):
        values = np.array([values])

    inferred_type = lib.infer_dtype(values)

    if inferred_type in ('datetime64', 'datetime', 'date', 'time'):
        # a datelike
        if not (isinstance(values, (pa.Array, pd.Series)) and
                com.is_datetime64_dtype(values)):
            values = tslib.array_to_datetime(values)
        elif isinstance(values, pd.DatetimeIndex):
            values = values.to_series()
    elif inferred_type in ('timedelta', 'timedelta64'):
        # have a timedelta, convert to ns here
        values = _possibly_cast_to_timedelta(values, coerce=coerce)
    elif inferred_type == 'integer':
        # py3 compat where dtype is 'm' but is an integer
        if values.dtype.kind == 'm':
            values = values.astype('timedelta64[ns]')
        elif isinstance(values, pd.PeriodIndex):
            values = values.to_timestamp().to_series()
        elif name not in ('__truediv__', '__div__', '__mul__'):
            raise TypeError("incompatible type for a datetime/timedelta "
                            "operation [{0}]".format(name))
    elif isinstance(values[0], pd.DateOffset):
        # handle DateOffsets
        os = pa.array([getattr(v, 'delta', None) for v in values])
        mask = isnull(os)
        if mask.any():
            raise TypeError(
                "cannot use a non-absolute DateOffset in "
                "datetime/timedelta operations [{0}]".format(','.join(
                    [com.pprint_thing(v) for v in values[mask]])))
        values = _possibly_cast_to_timedelta(os, coerce=coerce)
    else:
        raise TypeError(
            "incompatible type [{0}] for a datetime/timedelta operation".
            format(pa.array(values).dtype))

    return values