def wrapper(self, other): is_self_int_dtype = is_integer_dtype(self.dtype) fill_int = lambda x: x.fillna(0) fill_bool = lambda x: x.fillna(False).astype(bool) if isinstance(other, ABCSeries): name = _maybe_match_name(self, other) other = other.reindex_like(self) is_other_int_dtype = is_integer_dtype(other.dtype) other = fill_int(other) if is_other_int_dtype else fill_bool(other) filler = (fill_int if is_self_int_dtype and is_other_int_dtype else fill_bool) return filler(self._constructor(na_op(self.values, other.values), index=self.index, name=name)) elif isinstance(other, pd.DataFrame): return NotImplemented else: # scalars, list, tuple, np.array filler = (fill_int if is_self_int_dtype and is_integer_dtype(np.asarray(other)) else fill_bool) return filler(self._constructor( na_op(self.values, other), index=self.index)).__finalize__(self)
def coerce_dtypes(df, dtypes): """ Coerce dataframe to dtypes safely Operates in place Parameters ---------- df: Pandas DataFrame dtypes: dict like {'x': float} """ for c in df.columns: if c in dtypes and df.dtypes[c] != dtypes[c]: if is_float_dtype(df.dtypes[c]) and is_integer_dtype(dtypes[c]): # There is a mismatch between floating and integer columns. # Determine all mismatched and error. mismatched = sorted(c for c in df.columns if is_float_dtype(df.dtypes[c]) and is_integer_dtype(dtypes[c])) msg = ("Mismatched dtypes found.\n" "Expected integers, but found floats for columns:\n" "%s\n\n" "To fix, specify dtypes manually by adding:\n\n" "%s\n\n" "to the call to `read_csv`/`read_table`.\n\n" "Alternatively, provide `assume_missing=True` to " "interpret all unspecified integer columns as floats.") missing_list = '\n'.join('- %r' % c for c in mismatched) dtype_list = ('%r: float' % c for c in mismatched) missing_dict = 'dtype={%s}' % ',\n '.join(dtype_list) raise ValueError(msg % (missing_list, missing_dict)) df[c] = df[c].astype(dtypes[c])
def coerce_dtypes(df, dtypes): """ Coerce dataframe to dtypes safely Operates in place Parameters ---------- df: Pandas DataFrame dtypes: dict like {'x': float} """ for c in df.columns: if c in dtypes and df.dtypes[c] != dtypes[c]: if is_float_dtype(df.dtypes[c]) and is_integer_dtype(dtypes[c]): # There is a mismatch between floating and integer columns. # Determine all mismatched and error. mismatched = sorted(c for c in df.columns if is_float_dtype(df.dtypes[c]) and is_integer_dtype(dtypes[c])) msg = ("Mismatched dtypes found.\n" "Expected integers, but found floats for columns:\n" "%s\n\n" "To fix, specify dtypes manually by adding:\n\n" "%s\n\n" "to the call to `read_csv`/`read_table`.\n\n" "Alternatively, provide `assume_missing=True` to " "interpret all unspecified integer columns as floats.") missing_list = '\n'.join('- %r' % c for c in mismatched) dtype_list = ('%r: float' % c for c in mismatched) missing_dict = 'dtype={%s}' % ',\n '.join(dtype_list) raise ValueError(msg % (missing_list, missing_dict)) df[c] = df[c].astype(dtypes[c])
def _sparse_array_op(left, right, op, name, series=False): if series and is_integer_dtype(left) and is_integer_dtype(right): # series coerces to float64 if result should have NaN/inf if name in ('floordiv', 'mod') and (right.values == 0).any(): left = left.astype(np.float64) right = right.astype(np.float64) elif name in ('rfloordiv', 'rmod') and (left.values == 0).any(): left = left.astype(np.float64) right = right.astype(np.float64) dtype = _maybe_match_dtype(left, right) result_dtype = None if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0: with np.errstate(all='ignore'): result = op(left.get_values(), right.get_values()) fill = op(_get_fill(left), _get_fill(right)) if left.sp_index.ngaps == 0: index = left.sp_index else: index = right.sp_index elif left.sp_index.equals(right.sp_index): with np.errstate(all='ignore'): result = op(left.sp_values, right.sp_values) fill = op(_get_fill(left), _get_fill(right)) index = left.sp_index else: if name[0] == 'r': left, right = right, left name = name[1:] if name in ('and', 'or') and dtype == 'bool': opname = 'sparse_{name}_uint8'.format(name=name, dtype=dtype) # to make template simple, cast here left_sp_values = left.sp_values.view(np.uint8) right_sp_values = right.sp_values.view(np.uint8) result_dtype = np.bool else: opname = 'sparse_{name}_{dtype}'.format(name=name, dtype=dtype) left_sp_values = left.sp_values right_sp_values = right.sp_values sparse_op = getattr(splib, opname) with np.errstate(all='ignore'): result, index, fill = sparse_op(left_sp_values, left.sp_index, left.fill_value, right_sp_values, right.sp_index, right.fill_value) if result_dtype is None: result_dtype = result.dtype return _wrap_result(name, result, index, fill, dtype=result_dtype)
def _delegate_property_get(self, name): from pandas import Series result = getattr(self.values, name) # maybe need to upcast (ints) if isinstance(result, np.ndarray): if is_integer_dtype(result): result = result.astype('int64') elif not is_list_like(result): return result result = np.asarray(result) # blow up if we operate on categories if self.orig is not None: result = take_1d(result, self.orig.cat.codes) # return the result as a Series, which is by definition a copy result = Series(result, index=self.index, name=self.name) # setting this object will show a SettingWithCopyWarning/Error result.is_copy = ("modifications to a property of a datetimelike " "object are not supported and are discarded. " "Change values on the original.") return result
def _maybe_convert_timedelta(self, other): if isinstance(other, (timedelta, np.timedelta64, offsets.Tick)): offset = frequencies.to_offset(self.freq.rule_code) if isinstance(offset, offsets.Tick): nanos = tslib._delta_to_nanoseconds(other) offset_nanos = tslib._delta_to_nanoseconds(offset) if nanos % offset_nanos == 0: return nanos // offset_nanos elif isinstance(other, offsets.DateOffset): freqstr = other.rule_code base = frequencies.get_base_alias(freqstr) if base == self.freq.rule_code: return other.n msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) elif isinstance(other, np.ndarray): if is_integer_dtype(other): return other elif is_timedelta64_dtype(other): offset = frequencies.to_offset(self.freq) if isinstance(offset, offsets.Tick): nanos = tslib._delta_to_nanoseconds(other) offset_nanos = tslib._delta_to_nanoseconds(offset) if (nanos % offset_nanos).all() == 0: return nanos // offset_nanos elif is_integer(other): # integer is passed to .shift via # _add_datetimelike_methods basically # but ufunc may pass integer to _add_delta return other # raise when input doesn't have freq msg = "Input has different freq from PeriodIndex(freq={0})" raise IncompatibleFrequency(msg.format(self.freqstr))
def _get_data_algo(values, func_map): f = None if is_float_dtype(values): f = func_map['float64'] values = _ensure_float64(values) elif needs_i8_conversion(values): f = func_map['int64'] values = values.view('i8') elif is_integer_dtype(values): f = func_map['int64'] values = _ensure_int64(values) else: values = _ensure_object(values) # its cheaper to use a String Hash Table than Object if lib.infer_dtype(values) in ['string']: try: f = func_map['string'] except KeyError: pass if f is None: f = func_map['generic'] return f, values
def _convert_listlike(arg, unit='ns', box=True, errors='raise', name=None): """Convert a list of objects to a timedelta index object.""" if isinstance(arg, (list, tuple)) or not hasattr(arg, 'dtype'): arg = np.array(list(arg), dtype='O') # these are shortcut-able if is_timedelta64_dtype(arg): value = arg.astype('timedelta64[ns]') elif is_integer_dtype(arg): value = arg.astype('timedelta64[{0}]'.format(unit)).astype( 'timedelta64[ns]', copy=False) else: try: value = tslib.array_to_timedelta64(_ensure_object(arg), unit=unit, errors=errors) value = value.astype('timedelta64[ns]', copy=False) except ValueError: if errors == 'ignore': return arg else: # This else-block accounts for the cases when errors='raise' # and errors='coerce'. If errors == 'raise', these errors # should be raised. If errors == 'coerce', we shouldn't # expect any errors to be raised, since all parsing errors # cause coercion to pd.NaT. However, if an error / bug is # introduced that causes an Exception to be raised, we would # like to surface it. raise if box: from pandas import TimedeltaIndex value = TimedeltaIndex(value, unit='ns', name=name) return value
def _isfinite(values): if is_datetime_or_timedelta_dtype(values): return isnull(values) if (is_complex_dtype(values) or is_float_dtype(values) or is_integer_dtype(values) or is_bool_dtype(values)): return ~np.isfinite(values) return ~np.isfinite(values.astype('float64'))
def _delegate_property_get(self, name): from pandas import Series result = getattr(self.values, name) # maybe need to upcast (ints) if isinstance(result, np.ndarray): if is_integer_dtype(result): result = result.astype('int64') elif not is_list_like(result): return result # blow up if we operate on categories if self.orig is not None: result = take_1d(result, self.orig.cat.codes) # return the result as a Series, which is by definition a copy result = Series(result, index=self.index, name=self.name) # setting this object will show a SettingWithCopyWarning/Error result.is_copy = ("modifications to a property of a datetimelike " "object are not supported and are discarded. " "Change values on the original.") return result
def _maybe_convert_timedelta(self, other): if isinstance(other, (timedelta, np.timedelta64, offsets.Tick, Timedelta)): offset = frequencies.to_offset(self.freq.rule_code) if isinstance(offset, offsets.Tick): nanos = tslib._delta_to_nanoseconds(other) offset_nanos = tslib._delta_to_nanoseconds(offset) if nanos % offset_nanos == 0: return nanos // offset_nanos elif isinstance(other, offsets.DateOffset): freqstr = other.rule_code base = frequencies.get_base_alias(freqstr) if base == self.freq.rule_code: return other.n msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) elif isinstance(other, np.ndarray): if is_integer_dtype(other): return other elif is_timedelta64_dtype(other): offset = frequencies.to_offset(self.freq) if isinstance(offset, offsets.Tick): nanos = tslib._delta_to_nanoseconds(other) offset_nanos = tslib._delta_to_nanoseconds(offset) if (nanos % offset_nanos).all() == 0: return nanos // offset_nanos # raise when input doesn't have freq msg = "Input has different freq from PeriodIndex(freq={0})" raise IncompatibleFrequency(msg.format(self.freqstr))
def _convert_arr_indexer(self, keyarr): # Cast the indexer to uint64 if possible so # that the values returned from indexing are # also uint64. if is_integer_dtype(keyarr): return _asarray_tuplesafe(keyarr, dtype=np.uint64) return keyarr
def _convert_arr_indexer(self, keyarr): # Cast the indexer to uint64 if possible so # that the values returned from indexing are # also uint64. if is_integer_dtype(keyarr): return _asarray_tuplesafe(keyarr, dtype=np.uint64) return keyarr
def _isfinite(values): if is_datetime_or_timedelta_dtype(values): return isnull(values) if (is_complex_dtype(values) or is_float_dtype(values) or is_integer_dtype(values) or is_bool_dtype(values)): return ~np.isfinite(values) return ~np.isfinite(values.astype('float64'))
def backfill_2d(values, limit=None, mask=None, dtype=None): if dtype is None: dtype = values.dtype _method = None if is_float_dtype(values): _method = getattr(algos, 'backfill_2d_inplace_%s' % dtype.name, None) elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): _method = _backfill_2d_datetime elif is_integer_dtype(values): values = _ensure_float64(values) _method = algos.backfill_2d_inplace_float64 elif values.dtype == np.object_: _method = algos.backfill_2d_inplace_object if _method is None: raise ValueError('Invalid dtype for backfill_2d [%s]' % dtype.name) if mask is None: mask = isnull(values) mask = mask.view(np.uint8) if np.all(values.shape): _method(values, mask, limit=limit) else: # for test coverage pass return values
def mode(values): """Returns the mode or mode(s) of the passed Series or ndarray (sorted)""" # must sort because hash order isn't necessarily defined. from pandas.core.series import Series if isinstance(values, Series): constructor = values._constructor values = values.values else: values = np.asanyarray(values) constructor = Series dtype = values.dtype if is_integer_dtype(values): values = _ensure_int64(values) result = constructor(sorted(htable.mode_int64(values)), dtype=dtype) elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)): dtype = values.dtype values = values.view(np.int64) result = constructor(sorted(htable.mode_int64(values)), dtype=dtype) elif is_categorical_dtype(values): result = constructor(values.mode()) else: mask = isnull(values) values = _ensure_object(values) res = htable.mode_object(values, mask) try: res = sorted(res) except TypeError as e: warn("Unable to sort modes: %s" % e) result = constructor(res, dtype=dtype) return result
def _get_data_algo(values, func_map): f = None if is_float_dtype(values): f = func_map['float64'] values = _ensure_float64(values) elif needs_i8_conversion(values): f = func_map['int64'] values = values.view('i8') elif is_integer_dtype(values): f = func_map['int64'] values = _ensure_int64(values) else: values = _ensure_object(values) # its cheaper to use a String Hash Table than Object if lib.infer_dtype(values) in ['string']: try: f = func_map['string'] except KeyError: pass if f is None: f = func_map['generic'] return f, values
def _simple_new(cls, data, sp_index, fill_value): if not isinstance(sp_index, SparseIndex): # caller must pass SparseIndex raise ValueError('sp_index must be a SparseIndex') if fill_value is None: if sp_index.ngaps > 0: # has missing hole fill_value = np.nan else: fill_value = na_value_for_dtype(data.dtype) if (is_integer_dtype(data) and is_float(fill_value) and sp_index.ngaps > 0): # if float fill_value is being included in dense repr, # convert values to float data = data.astype(float) result = data.view(cls) if not isinstance(sp_index, SparseIndex): # caller must pass SparseIndex raise ValueError('sp_index must be a SparseIndex') result.sp_index = sp_index result._fill_value = fill_value return result
def backfill_2d(values, limit=None, mask=None, dtype=None): if dtype is None: dtype = values.dtype _method = None if is_float_dtype(values): _method = getattr(algos, 'backfill_2d_inplace_%s' % dtype.name, None) elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): _method = _backfill_2d_datetime elif is_integer_dtype(values): values = _ensure_float64(values) _method = algos.backfill_2d_inplace_float64 elif values.dtype == np.object_: _method = algos.backfill_2d_inplace_object if _method is None: raise ValueError('Invalid dtype for backfill_2d [%s]' % dtype.name) if mask is None: mask = isnull(values) mask = mask.view(np.uint8) if np.all(values.shape): _method(values, mask, limit=limit) else: # for test coverage pass return values
def _convert_listlike(arg, unit='ns', box=True, errors='raise', name=None): """Convert a list of objects to a timedelta index object.""" if isinstance(arg, (list, tuple)) or not hasattr(arg, 'dtype'): arg = np.array(list(arg), dtype='O') # these are shortcut-able if is_timedelta64_dtype(arg): value = arg.astype('timedelta64[ns]') elif is_integer_dtype(arg): value = arg.astype('timedelta64[{0}]'.format( unit)).astype('timedelta64[ns]', copy=False) else: try: value = tslib.array_to_timedelta64(_ensure_object(arg), unit=unit, errors=errors) value = value.astype('timedelta64[ns]', copy=False) except ValueError: if errors == 'ignore': return arg else: # This else-block accounts for the cases when errors='raise' # and errors='coerce'. If errors == 'raise', these errors # should be raised. If errors == 'coerce', we shouldn't # expect any errors to be raised, since all parsing errors # cause coercion to pd.NaT. However, if an error / bug is # introduced that causes an Exception to be raised, we would # like to surface it. raise if box: from pandas import TimedeltaIndex value = TimedeltaIndex(value, unit='ns', name=name) return value
def mode(values): """Returns the mode or mode(s) of the passed Series or ndarray (sorted)""" # must sort because hash order isn't necessarily defined. from pandas.core.series import Series if isinstance(values, Series): constructor = values._constructor values = values.values else: values = np.asanyarray(values) constructor = Series dtype = values.dtype if is_integer_dtype(values): values = _ensure_int64(values) result = constructor(sorted(htable.mode_int64(values)), dtype=dtype) elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)): dtype = values.dtype values = values.view(np.int64) result = constructor(sorted(htable.mode_int64(values)), dtype=dtype) elif is_categorical_dtype(values): result = constructor(values.mode()) else: mask = isnull(values) values = _ensure_object(values) res = htable.mode_object(values, mask) try: res = sorted(res) except TypeError as e: warn("Unable to sort modes: %s" % e) result = constructor(res, dtype=dtype) return result
def _simple_new(cls, data, sp_index, fill_value): if not isinstance(sp_index, SparseIndex): # caller must pass SparseIndex raise ValueError('sp_index must be a SparseIndex') if fill_value is None: if sp_index.ngaps > 0: # has missing hole fill_value = np.nan else: fill_value = na_value_for_dtype(data.dtype) if (is_integer_dtype(data) and is_float(fill_value) and sp_index.ngaps > 0): # if float fill_value is being included in dense repr, # convert values to float data = data.astype(float) result = data.view(cls) if not isinstance(sp_index, SparseIndex): # caller must pass SparseIndex raise ValueError('sp_index must be a SparseIndex') result.sp_index = sp_index result._fill_value = fill_value return result
def get_expected(s, name): result = getattr(Index(s._values), prop) if isinstance(result, np.ndarray): if is_integer_dtype(result): result = result.astype('int64') elif not is_list_like(result): return result return Series(result, index=s.index, name=s.name)
def coerce(values): # we allow coercion to if errors allows values = to_numeric(values, errors=errors) # prevent overflow in case of int8 or int16 if is_integer_dtype(values): values = values.astype('int64', copy=False) return values
def get_expected(s, name): result = getattr(Index(s._values), prop) if isinstance(result, np.ndarray): if is_integer_dtype(result): result = result.astype('int64') elif not is_list_like(result): return result return Series(result, index=s.index, name=s.name)
def coerce(values): # we allow coercion to if errors allows values = to_numeric(values, errors=errors) # prevent overflow in case of int8 or int16 if is_integer_dtype(values): values = values.astype('int64', copy=False) return values
def astype(self, dtype, copy=True): dtype = np.dtype(dtype) if is_object_dtype(dtype): return self.asobject elif is_integer_dtype(dtype): return Index(self.values.astype('i8', copy=copy), name=self.name, dtype='i8') raise ValueError('Cannot cast PeriodIndex to dtype %s' % dtype)
def test_setitem_dtype_upcast(self): # GH3216 df = DataFrame([{"a": 1}, {"a": 3, "b": 2}]) df['c'] = np.nan self.assertEqual(df['c'].dtype, np.float64) df.loc[0, 'c'] = 'foo' expected = DataFrame([{ "a": 1, "c": 'foo' }, { "a": 3, "b": 2, "c": np.nan }]) tm.assert_frame_equal(df, expected) # GH10280 df = DataFrame(np.arange(6, dtype='int64').reshape(2, 3), index=list('ab'), columns=['foo', 'bar', 'baz']) for val in [3.14, 'wxyz']: left = df.copy() left.loc['a', 'bar'] = val right = DataFrame([[0, val, 2], [3, 4, 5]], index=list('ab'), columns=['foo', 'bar', 'baz']) tm.assert_frame_equal(left, right) self.assertTrue(is_integer_dtype(left['foo'])) self.assertTrue(is_integer_dtype(left['baz'])) left = DataFrame(np.arange(6, dtype='int64').reshape(2, 3) / 10.0, index=list('ab'), columns=['foo', 'bar', 'baz']) left.loc['a', 'bar'] = 'wxyz' right = DataFrame([[0, 'wxyz', .2], [.3, .4, .5]], index=list('ab'), columns=['foo', 'bar', 'baz']) tm.assert_frame_equal(left, right) self.assertTrue(is_float_dtype(left['foo'])) self.assertTrue(is_float_dtype(left['baz']))
def astype(self, dtype, copy=True): dtype = np.dtype(dtype) if is_object_dtype(dtype): return self.asobject elif is_integer_dtype(dtype): return Index(self.values.astype('i8', copy=copy), name=self.name, dtype='i8') raise ValueError('Cannot cast PeriodIndex to dtype %s' % dtype)
def _value_counts_arraylike(values, dropna=True): is_datetimetz_type = is_datetimetz(values) is_period = (isinstance(values, ABCPeriodIndex) or is_period_arraylike(values)) orig = values from pandas.core.series import Series values = Series(values).values dtype = values.dtype if is_datetime_or_timedelta_dtype(dtype) or is_period: from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex if is_period: values = PeriodIndex(values) freq = values.freq values = values.view(np.int64) keys, counts = htable.value_count_int64(values, dropna) if dropna: msk = keys != iNaT keys, counts = keys[msk], counts[msk] # convert the keys back to the dtype we came in keys = keys.astype(dtype) # dtype handling if is_datetimetz_type: if isinstance(orig, ABCDatetimeIndex): tz = orig.tz else: tz = orig.dt.tz keys = DatetimeIndex._simple_new(keys, tz=tz) if is_period: keys = PeriodIndex._simple_new(keys, freq=freq) elif is_integer_dtype(dtype): values = _ensure_int64(values) keys, counts = htable.value_count_int64(values, dropna) elif is_float_dtype(dtype): values = _ensure_float64(values) keys, counts = htable.value_count_float64(values, dropna) else: values = _ensure_object(values) mask = isnull(values) keys, counts = htable.value_count_object(values, mask) if not dropna and mask.any(): keys = np.insert(keys, 0, np.NaN) counts = np.insert(counts, 0, mask.sum()) return keys, counts
def _value_counts_arraylike(values, dropna=True): is_datetimetz_type = is_datetimetz(values) is_period = (isinstance(values, ABCPeriodIndex) or is_period_arraylike(values)) orig = values from pandas.core.series import Series values = Series(values).values dtype = values.dtype if is_datetime_or_timedelta_dtype(dtype) or is_period: from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex if is_period: values = PeriodIndex(values) freq = values.freq values = values.view(np.int64) keys, counts = htable.value_count_scalar64(values, dropna) if dropna: msk = keys != iNaT keys, counts = keys[msk], counts[msk] # convert the keys back to the dtype we came in keys = keys.astype(dtype) # dtype handling if is_datetimetz_type: if isinstance(orig, ABCDatetimeIndex): tz = orig.tz else: tz = orig.dt.tz keys = DatetimeIndex._simple_new(keys, tz=tz) if is_period: keys = PeriodIndex._simple_new(keys, freq=freq) elif is_integer_dtype(dtype): values = _ensure_int64(values) keys, counts = htable.value_count_scalar64(values, dropna) elif is_float_dtype(dtype): values = _ensure_float64(values) keys, counts = htable.value_count_scalar64(values, dropna) else: values = _ensure_object(values) mask = isnull(values) keys, counts = htable.value_count_object(values, mask) if not dropna and mask.any(): keys = np.insert(keys, 0, np.NaN) counts = np.insert(counts, 0, mask.sum()) return keys, counts
def _simple_new(cls, values, name=None, freq=None, **kwargs): """ Values can be any type that can be coerced to Periods. Ordinals in an ndarray are fastpath-ed to `_from_ordinals` """ if not is_integer_dtype(values): values = np.array(values, copy=False) if len(values) > 0 and is_float_dtype(values): raise TypeError("PeriodIndex can't take floats") return cls(values, name=name, freq=freq, **kwargs) return cls._from_ordinals(values, name, freq, **kwargs)
def fill_zeros(result, x, y, name, fill): """ if this is a reversed op, then flip x,y if we have an integer value (or array in y) and we have 0's, fill them with the fill, return the result mask the nan's from x """ if fill is None or is_float_dtype(result): return result if name.startswith(('r', '__r')): x, y = y, x is_variable_type = (hasattr(y, 'dtype') or hasattr(y, 'type')) is_scalar_type = is_scalar(y) if not is_variable_type and not is_scalar_type: return result if is_scalar_type: y = np.array(y) if is_integer_dtype(y): if (y == 0).any(): # GH 7325, mask and nans must be broadcastable (also: PR 9308) # Raveling and then reshaping makes np.putmask faster mask = ((y == 0) & ~np.isnan(result)).ravel() shape = result.shape result = result.astype('float64', copy=False).ravel() np.putmask(result, mask, fill) # if we have a fill of inf, then sign it correctly # (GH 6178 and PR 9308) if np.isinf(fill): signs = np.sign(y if name.startswith(('r', '__r')) else x) negative_inf_mask = (signs.ravel() < 0) & mask np.putmask(result, negative_inf_mask, -fill) if "floordiv" in name: # (PR 9308) nan_mask = ((y == 0) & (x == 0)).ravel() np.putmask(result, nan_mask, np.nan) result = result.reshape(shape) return result
def fill_zeros(result, x, y, name, fill): """ if this is a reversed op, then flip x,y if we have an integer value (or array in y) and we have 0's, fill them with the fill, return the result mask the nan's from x """ if fill is None or is_float_dtype(result): return result if name.startswith(('r', '__r')): x, y = y, x is_variable_type = (hasattr(y, 'dtype') or hasattr(y, 'type')) is_scalar_type = is_scalar(y) if not is_variable_type and not is_scalar_type: return result if is_scalar_type: y = np.array(y) if is_integer_dtype(y): if (y == 0).any(): # GH 7325, mask and nans must be broadcastable (also: PR 9308) # Raveling and then reshaping makes np.putmask faster mask = ((y == 0) & ~np.isnan(result)).ravel() shape = result.shape result = result.astype('float64', copy=False).ravel() np.putmask(result, mask, fill) # if we have a fill of inf, then sign it correctly # (GH 6178 and PR 9308) if np.isinf(fill): signs = np.sign(y if name.startswith(('r', '__r')) else x) negative_inf_mask = (signs.ravel() < 0) & mask np.putmask(result, negative_inf_mask, -fill) if "floordiv" in name: # (PR 9308) nan_mask = ((y == 0) & (x == 0)).ravel() np.putmask(result, nan_mask, np.nan) result = result.reshape(shape) return result
def _get_prev_label(label): dtype = getattr(label, 'dtype', type(label)) if isinstance(label, (Timestamp, Timedelta)): dtype = 'datetime64' if is_datetime_or_timedelta_dtype(dtype): return label - np.timedelta64(1, 'ns') elif is_integer_dtype(dtype): return label - 1 elif is_float_dtype(dtype): return np.nextafter(label, -np.infty) else: raise TypeError('cannot determine next label for type %r' % type(label))
def test_setitem_dtype_upcast(self): # GH3216 df = DataFrame([{"a": 1}, {"a": 3, "b": 2}]) df['c'] = np.nan self.assertEqual(df['c'].dtype, np.float64) df.loc[0, 'c'] = 'foo' expected = DataFrame([{"a": 1, "c": 'foo'}, {"a": 3, "b": 2, "c": np.nan}]) tm.assert_frame_equal(df, expected) # GH10280 df = DataFrame(np.arange(6, dtype='int64').reshape(2, 3), index=list('ab'), columns=['foo', 'bar', 'baz']) for val in [3.14, 'wxyz']: left = df.copy() left.loc['a', 'bar'] = val right = DataFrame([[0, val, 2], [3, 4, 5]], index=list('ab'), columns=['foo', 'bar', 'baz']) tm.assert_frame_equal(left, right) self.assertTrue(is_integer_dtype(left['foo'])) self.assertTrue(is_integer_dtype(left['baz'])) left = DataFrame(np.arange(6, dtype='int64').reshape(2, 3) / 10.0, index=list('ab'), columns=['foo', 'bar', 'baz']) left.loc['a', 'bar'] = 'wxyz' right = DataFrame([[0, 'wxyz', .2], [.3, .4, .5]], index=list('ab'), columns=['foo', 'bar', 'baz']) tm.assert_frame_equal(left, right) self.assertTrue(is_float_dtype(left['foo'])) self.assertTrue(is_float_dtype(left['baz']))
def astype(self, dtype, copy=True, how='start'): dtype = pandas_dtype(dtype) if is_object_dtype(dtype): return self.asobject elif is_integer_dtype(dtype): return Index(self.values.astype('i8', copy=copy), name=self.name, dtype='i8') elif is_datetime64_dtype(dtype): return self.to_timestamp(how=how) elif is_datetime64tz_dtype(dtype): return self.to_timestamp(how=how).tz_localize(dtype.tz) elif is_period_dtype(dtype): return self.asfreq(freq=dtype.freq) raise ValueError('Cannot cast PeriodIndex to dtype %s' % dtype)
def _sparse_array_op(left, right, op, name, series=False): if series and is_integer_dtype(left) and is_integer_dtype(right): # series coerces to float64 if result should have NaN/inf if name in ('floordiv', 'mod') and (right.values == 0).any(): left = left.astype(np.float64) right = right.astype(np.float64) elif name in ('rfloordiv', 'rmod') and (left.values == 0).any(): left = left.astype(np.float64) right = right.astype(np.float64) dtype = _maybe_match_dtype(left, right) if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0: result = op(left.get_values(), right.get_values()) if left.sp_index.ngaps == 0: index = left.sp_index else: index = right.sp_index fill = op(_get_fill(left), _get_fill(right)) elif left.sp_index.equals(right.sp_index): result = op(left.sp_values, right.sp_values) index = left.sp_index fill = op(_get_fill(left), _get_fill(right)) else: if name[0] == 'r': left, right = right, left name = name[1:] opname = 'sparse_{name}_{dtype}'.format(name=name, dtype=dtype) sparse_op = getattr(splib, opname) result, index, fill = sparse_op(left.sp_values, left.sp_index, left.fill_value, right.sp_values, right.sp_index, right.fill_value) return _wrap_result(name, result, index, fill, dtype=result.dtype)
def _sparse_array_op(left, right, op, name, series=False): if series and is_integer_dtype(left) and is_integer_dtype(right): # series coerces to float64 if result should have NaN/inf if name in ('floordiv', 'mod') and (right.values == 0).any(): left = left.astype(np.float64) right = right.astype(np.float64) elif name in ('rfloordiv', 'rmod') and (left.values == 0).any(): left = left.astype(np.float64) right = right.astype(np.float64) dtype = _maybe_match_dtype(left, right) if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0: result = op(left.get_values(), right.get_values()) if left.sp_index.ngaps == 0: index = left.sp_index else: index = right.sp_index fill = op(_get_fill(left), _get_fill(right)) elif left.sp_index.equals(right.sp_index): result = op(left.sp_values, right.sp_values) index = left.sp_index fill = op(_get_fill(left), _get_fill(right)) else: if name[0] == 'r': left, right = right, left name = name[1:] opname = 'sparse_{name}_{dtype}'.format(name=name, dtype=dtype) sparse_op = getattr(splib, opname) result, index, fill = sparse_op(left.sp_values, left.sp_index, left.fill_value, right.sp_values, right.sp_index, right.fill_value) return _wrap_result(name, result, index, fill, dtype=result.dtype)
def _simple_new(cls, left, right, closed=None, name=None, copy=False, verify_integrity=True): result = IntervalMixin.__new__(cls) if closed is None: closed = 'right' left = _ensure_index(left, copy=copy) right = _ensure_index(right, copy=copy) # coerce dtypes to match if needed if is_float_dtype(left) and is_integer_dtype(right): right = right.astype(left.dtype) if is_float_dtype(right) and is_integer_dtype(left): left = left.astype(right.dtype) if type(left) != type(right): raise ValueError("must not have differing left [{}] " "and right [{}] types".format( type(left), type(right))) if isinstance(left, ABCPeriodIndex): raise ValueError("Period dtypes are not supported, " "use a PeriodIndex instead") result._left = left result._right = right result._closed = closed result.name = name if verify_integrity: result._validate() result._reset_identity() return result
def _simple_new(cls, data, sp_index, fill_value): if is_integer_dtype(data) and is_float(fill_value) and sp_index.ngaps > 0: # if float fill_value is being included in dense repr, # convert values to float data = data.astype(float) result = data.view(cls) if not isinstance(sp_index, SparseIndex): # caller must pass SparseIndex raise ValueError("sp_index must be a SparseIndex") result.sp_index = sp_index result.fill_value = fill_value return result
def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) if is_float_dtype(dtype): values = self._values.astype(dtype, copy=copy) elif is_integer_dtype(dtype): if self.hasnans: raise ValueError('cannot convert float NaN to integer') values = self._values.astype(dtype, copy=copy) elif is_object_dtype(dtype): values = self._values.astype('object', copy=copy) else: raise TypeError('Setting %s dtype to anything other than ' 'float64 or object is not supported' % self.__class__) return Index(values, name=self.name, dtype=dtype)
def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) if is_float_dtype(dtype): values = self._values.astype(dtype, copy=copy) elif is_integer_dtype(dtype): if self.hasnans: raise ValueError('cannot convert float NaN to integer') values = self._values.astype(dtype, copy=copy) elif is_object_dtype(dtype): values = self._values.astype('object', copy=copy) else: raise TypeError('Setting %s dtype to anything other than ' 'float64 or object is not supported' % self.__class__) return Index(values, name=self.name, dtype=dtype)
def _get_values(values, skipna, fill_value=None, fill_value_typ=None, isfinite=False, copy=True): """ utility to get the values view, mask, dtype if necessary copy and mask using the specified fill_value copy = True will force the copy """ values = _values_from_object(values) if isfinite: mask = _isfinite(values) else: mask = isnull(values) dtype = values.dtype dtype_ok = _na_ok_dtype(dtype) # get our fill value (in case we need to provide an alternative # dtype for it) fill_value = _get_fill_value(dtype, fill_value=fill_value, fill_value_typ=fill_value_typ) if skipna: if copy: values = values.copy() if dtype_ok: np.putmask(values, mask, fill_value) # promote if needed else: values, changed = _maybe_upcast_putmask(values, mask, fill_value) elif copy: values = values.copy() values = _view_if_needed(values) # return a platform independent precision dtype dtype_max = dtype if is_integer_dtype(dtype) or is_bool_dtype(dtype): dtype_max = np.int64 elif is_float_dtype(dtype): dtype_max = np.float64 return values, mask, dtype, dtype_max
def astype(self, dtype, copy=True, how='start'): dtype = pandas_dtype(dtype) if is_object_dtype(dtype): return self.asobject elif is_integer_dtype(dtype): if copy: return self._int64index.copy() else: return self._int64index elif is_datetime64_dtype(dtype): return self.to_timestamp(how=how) elif is_datetime64tz_dtype(dtype): return self.to_timestamp(how=how).tz_localize(dtype.tz) elif is_period_dtype(dtype): return self.asfreq(freq=dtype.freq) raise ValueError('Cannot cast PeriodIndex to dtype %s' % dtype)
def astype(self, dtype, copy=True, how='start'): dtype = pandas_dtype(dtype) if is_object_dtype(dtype): return self.asobject elif is_integer_dtype(dtype): if copy: return self._int64index.copy() else: return self._int64index elif is_datetime64_dtype(dtype): return self.to_timestamp(how=how) elif is_datetime64tz_dtype(dtype): return self.to_timestamp(how=how).tz_localize(dtype.tz) elif is_period_dtype(dtype): return self.asfreq(freq=dtype.freq) raise ValueError('Cannot cast PeriodIndex to dtype %s' % dtype)
def _hashtable_algo(f, dtype, return_dtype=None): """ f(HashTable, type_caster) -> result """ if is_float_dtype(dtype): return f(htable.Float64HashTable, _ensure_float64) elif is_integer_dtype(dtype): return f(htable.Int64HashTable, _ensure_int64) elif is_datetime64_dtype(dtype): return_dtype = return_dtype or 'M8[ns]' return f(htable.Int64HashTable, _ensure_int64).view(return_dtype) elif is_timedelta64_dtype(dtype): return_dtype = return_dtype or 'm8[ns]' return f(htable.Int64HashTable, _ensure_int64).view(return_dtype) else: return f(htable.PyObjectHashTable, _ensure_object)
def _get_data_algo(values, func_map): if is_float_dtype(values): f = func_map['float64'] values = _ensure_float64(values) elif needs_i8_conversion(values): f = func_map['int64'] values = values.view('i8') elif is_integer_dtype(values): f = func_map['int64'] values = _ensure_int64(values) else: f = func_map['generic'] values = _ensure_object(values) return f, values
def _get_data_algo(values, func_map): if is_float_dtype(values): f = func_map['float64'] values = _ensure_float64(values) elif needs_i8_conversion(values): f = func_map['int64'] values = values.view('i8') elif is_integer_dtype(values): f = func_map['int64'] values = _ensure_int64(values) else: f = func_map['generic'] values = _ensure_object(values) return f, values
def _hashtable_algo(f, dtype, return_dtype=None): """ f(HashTable, type_caster) -> result """ if is_float_dtype(dtype): return f(htable.Float64HashTable, _ensure_float64) elif is_integer_dtype(dtype): return f(htable.Int64HashTable, _ensure_int64) elif is_datetime64_dtype(dtype): return_dtype = return_dtype or 'M8[ns]' return f(htable.Int64HashTable, _ensure_int64).view(return_dtype) elif is_timedelta64_dtype(dtype): return_dtype = return_dtype or 'm8[ns]' return f(htable.Int64HashTable, _ensure_int64).view(return_dtype) else: return f(htable.PyObjectHashTable, _ensure_object)
def as_json_table_type(x): """ Convert a NumPy / pandas type to its corresponding json_table. Parameters ---------- x : array or dtype Returns ------- t : str the Table Schema data types Notes ----- This table shows the relationship between NumPy / pandas dtypes, and Table Schema dtypes. ============== ================= Pandas type Table Schema type ============== ================= int64 integer float64 number bool boolean datetime64[ns] datetime timedelta64[ns] duration object str categorical any =============== ================= """ if is_integer_dtype(x): return 'integer' elif is_bool_dtype(x): return 'boolean' elif is_numeric_dtype(x): return 'number' elif (is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or is_period_dtype(x)): return 'datetime' elif is_timedelta64_dtype(x): return 'duration' elif is_categorical_dtype(x): return 'any' elif is_string_dtype(x): return 'string' else: return 'any'
def as_json_table_type(x): """ Convert a NumPy / pandas type to its corresponding json_table. Parameters ---------- x : array or dtype Returns ------- t : str the Table Schema data types Notes ----- This table shows the relationship between NumPy / pandas dtypes, and Table Schema dtypes. ============== ================= Pandas type Table Schema type ============== ================= int64 integer float64 number bool boolean datetime64[ns] datetime timedelta64[ns] duration object str categorical any =============== ================= """ if is_integer_dtype(x): return 'integer' elif is_bool_dtype(x): return 'boolean' elif is_numeric_dtype(x): return 'number' elif (is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or is_period_dtype(x)): return 'datetime' elif is_timedelta64_dtype(x): return 'duration' elif is_categorical_dtype(x): return 'any' elif is_string_dtype(x): return 'string' else: return 'any'
def duplicated(values, keep='first'): """ Return boolean ndarray denoting duplicate values .. versionadded:: 0.19.0 Parameters ---------- keep : {'first', 'last', False}, default 'first' - ``first`` : Mark duplicates as ``True`` except for the first occurrence. - ``last`` : Mark duplicates as ``True`` except for the last occurrence. - False : Mark all duplicates as ``True``. Returns ------- duplicated : ndarray """ dtype = values.dtype # no need to revert to original type if is_datetime_or_timedelta_dtype(dtype) or is_datetimetz(dtype): if isinstance(values, (ABCSeries, ABCIndex)): values = values.values.view(np.int64) else: values = values.view(np.int64) elif is_period_arraylike(values): from pandas.tseries.period import PeriodIndex values = PeriodIndex(values).asi8 elif is_categorical_dtype(dtype): values = values.values.codes elif isinstance(values, (ABCSeries, ABCIndex)): values = values.values if is_integer_dtype(dtype): values = _ensure_int64(values) duplicated = htable.duplicated_int64(values, keep=keep) elif is_float_dtype(dtype): values = _ensure_float64(values) duplicated = htable.duplicated_float64(values, keep=keep) else: values = _ensure_object(values) duplicated = htable.duplicated_object(values, keep=keep) return duplicated
def duplicated(values, keep='first'): """ Return boolean ndarray denoting duplicate values .. versionadded:: 0.19.0 Parameters ---------- keep : {'first', 'last', False}, default 'first' - ``first`` : Mark duplicates as ``True`` except for the first occurrence. - ``last`` : Mark duplicates as ``True`` except for the last occurrence. - False : Mark all duplicates as ``True``. Returns ------- duplicated : ndarray """ dtype = values.dtype # no need to revert to original type if is_datetime_or_timedelta_dtype(dtype) or is_datetimetz(dtype): if isinstance(values, (ABCSeries, ABCIndex)): values = values.values.view(np.int64) else: values = values.view(np.int64) elif is_period_arraylike(values): from pandas.tseries.period import PeriodIndex values = PeriodIndex(values).asi8 elif is_categorical_dtype(dtype): values = values.values.codes elif isinstance(values, (ABCSeries, ABCIndex)): values = values.values if is_integer_dtype(dtype): values = _ensure_int64(values) duplicated = htable.duplicated_int64(values, keep=keep) elif is_float_dtype(dtype): values = _ensure_float64(values) duplicated = htable.duplicated_float64(values, keep=keep) else: values = _ensure_object(values) duplicated = htable.duplicated_object(values, keep=keep) return duplicated
def astype(self, dtype, copy=True): dtype = np.dtype(dtype) if is_object_dtype(dtype): return self.asobject elif is_timedelta64_ns_dtype(dtype): if copy is True: return self.copy() return self elif is_timedelta64_dtype(dtype): # return an index (essentially this is division) result = self.values.astype(dtype, copy=copy) if self.hasnans: return Index(self._maybe_mask_results(result, convert="float64"), name=self.name) return Index(result.astype("i8"), name=self.name) elif is_integer_dtype(dtype): return Index(self.values.astype("i8", copy=copy), dtype="i8", name=self.name) raise ValueError("Cannot cast TimedeltaIndex to dtype %s" % dtype)
def _simple_new(cls, values, name=None, freq=None, **kwargs): if not is_integer_dtype(values): values = np.array(values, copy=False) if (len(values) > 0 and is_float_dtype(values)): raise TypeError("PeriodIndex can't take floats") else: return PeriodIndex(values, name=name, freq=freq, **kwargs) values = np.array(values, dtype='int64', copy=False) result = object.__new__(cls) result._data = values result.name = name if freq is None: raise ValueError('freq is not specified') result.freq = Period._maybe_convert_freq(freq) result._reset_identity() return result
def _simple_new(cls, values, name=None, freq=None, **kwargs): if not is_integer_dtype(values): values = np.array(values, copy=False) if (len(values) > 0 and is_float_dtype(values)): raise TypeError("PeriodIndex can't take floats") else: return PeriodIndex(values, name=name, freq=freq, **kwargs) values = np.array(values, dtype='int64', copy=False) result = object.__new__(cls) result._data = values result.name = name if freq is None: raise ValueError('freq is not specified') result.freq = Period._maybe_convert_freq(freq) result._reset_identity() return result
def _get_values(values, skipna, fill_value=None, fill_value_typ=None, isfinite=False, copy=True): """ utility to get the values view, mask, dtype if necessary copy and mask using the specified fill_value copy = True will force the copy """ values = _values_from_object(values) if isfinite: mask = _isfinite(values) else: mask = isnull(values) dtype = values.dtype dtype_ok = _na_ok_dtype(dtype) # get our fill value (in case we need to provide an alternative # dtype for it) fill_value = _get_fill_value(dtype, fill_value=fill_value, fill_value_typ=fill_value_typ) if skipna: if copy: values = values.copy() if dtype_ok: np.putmask(values, mask, fill_value) # promote if needed else: values, changed = _maybe_upcast_putmask(values, mask, fill_value) elif copy: values = values.copy() values = _view_if_needed(values) # return a platform independent precision dtype dtype_max = dtype if is_integer_dtype(dtype) or is_bool_dtype(dtype): dtype_max = np.int64 elif is_float_dtype(dtype): dtype_max = np.float64 return values, mask, dtype, dtype_max