def test_binary_input_aligns_columns(request, dtype_a, dtype_b):
    """Apply ``np.heaviside`` to two frames whose columns only partly overlap.

    ``df1`` has columns ``A``/``B`` and ``df2`` has columns ``A``/``C``.  The
    binary ufunc call is expected to emit a ``FutureWarning`` and to return a
    frame of ones labelled like ``df1``; the same result is expected when the
    second operand is passed as a plain NumPy array.

    Parameters
    ----------
    request
        The pytest ``request`` object, used to attach an ``xfail`` marker.
    dtype_a, dtype_b
        Dtypes (string alias, extension dtype or per-column dict) applied to
        ``df1`` and ``df2`` respectively.
    """
    if (
        is_extension_array_dtype(dtype_a)
        or isinstance(dtype_a, dict)
        or is_extension_array_dtype(dtype_b)
        or isinstance(dtype_b, dict)
    ):
        request.node.add_marker(
            pytest.mark.xfail(
                reason="Extension / mixed with multiple inputs not implemented."
            )
        )

    df1 = pd.DataFrame({"A": [1, 2], "B": [3, 4]}).astype(dtype_a)

    if isinstance(dtype_a, dict) and isinstance(dtype_b, dict):
        # Re-key a copy rather than the incoming dict: the object comes from
        # the test parametrization and may be handed to other tests as well,
        # so mutating it in place could change what they receive.
        dtype_b = dtype_b.copy()
        dtype_b["C"] = dtype_b.pop("B")

    df2 = pd.DataFrame({"A": [1, 2], "C": [3, 4]}).astype(dtype_b)
    with tm.assert_produces_warning(FutureWarning):
        result = np.heaviside(df1, df2)
    # Expected future behaviour:
    # expected = np.heaviside(
    #     np.array([[1, 3, np.nan], [2, 4, np.nan]]),
    #     np.array([[1, np.nan, 3], [2, np.nan, 4]]),
    # )
    # expected = pd.DataFrame(expected, index=[0, 1], columns=["A", "B", "C"])
    expected = pd.DataFrame([[1.0, 1.0], [1.0, 1.0]], columns=["A", "B"])
    tm.assert_frame_equal(result, expected)

    # ensure the expected is the same when applying with numpy array
    result = np.heaviside(df1, df2.values)
    tm.assert_frame_equal(result, expected)
def _check_op(self, s, op, other, op_name, exc=NotImplementedError):
    """Run ``op(s, other)`` and validate either its result or its failure.

    When ``exc`` is not None the operation must raise ``exc``.  When ``exc``
    is None the result is compared, via ``self.assert_equal``, against the
    value produced by ``self._combine(s, other, op)`` after casting that
    expectation to the appropriate dtype.
    """
    if exc is not None:
        with pytest.raises(exc):
            op(s, other)
        return

    sdtype = tm.get_dtype(s)

    cast_other = (
        hasattr(other, "dtype")
        and not is_extension_array_dtype(other.dtype)
        and is_integer_dtype(other.dtype)
        and sdtype.is_unsigned_integer
    )
    if cast_other:
        # TODO: comment below is inaccurate; other can be int8, int16, ...
        #  and the trouble is that e.g. if s is UInt8 and other is int8,
        #  then result is UInt16
        # other is np.int64 and would therefore always result in
        # upcasting, so keeping other as same numpy_dtype
        other = other.astype(sdtype.numpy_dtype)

    result = op(s, other)
    expected = self._combine(s, other, op)

    true_division_ops = ("__rtruediv__", "__truediv__", "__div__")
    if op_name not in true_division_ops:
        # combine method result in 'biggest' (int64) dtype
        expected = expected.astype(sdtype)
    else:
        expected = expected.fillna(np.nan).astype("Float64")

    self.assert_equal(result, expected)
def test_binary_input_aligns_index(request, dtype):
    """Apply ``np.heaviside`` to two frames whose row labels only partly overlap.

    Both frames hold the same data and columns; ``df1`` is indexed by
    ``a``/``b`` and ``df2`` by ``a``/``c``.  The ufunc call is expected to
    emit a ``FutureWarning`` and to return a frame of ones labelled like
    ``df1``, both for a DataFrame and for a NumPy-array second operand.
    """
    not_supported = is_extension_array_dtype(dtype) or isinstance(dtype, dict)
    if not_supported:
        marker = pytest.mark.xfail(
            reason="Extension / mixed with multiple inputs not implemented."
        )
        request.node.add_marker(marker)

    left = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "b"])
    df1 = left.astype(dtype)
    right = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "c"])
    df2 = right.astype(dtype)

    with tm.assert_produces_warning(FutureWarning):
        result = np.heaviside(df1, df2)
    # Expected future behaviour:
    # expected = np.heaviside(
    #     np.array([[1, 3], [3, 4], [np.nan, np.nan]]),
    #     np.array([[1, 3], [np.nan, np.nan], [3, 4]]),
    # )
    # # TODO(FloatArray): this will be Float64Dtype.
    # expected = pd.DataFrame(expected, index=["a", "b", "c"], columns=["A", "B"])
    expected = pd.DataFrame(
        [[1.0, 1.0], [1.0, 1.0]],
        columns=["A", "B"],
        index=["a", "b"],
    )
    tm.assert_frame_equal(result, expected)

    # ensure the expected is the same when applying with numpy array
    result = np.heaviside(df1, df2.values)
    tm.assert_frame_equal(result, expected)
def block_to_header_bytes(block):
    """Turn a block into a ``(header, payload)`` pair.

    The header is the tuple ``(block.mgr_locs.as_array, values.dtype,
    values.shape, extension)`` where ``extension`` tags how the values were
    prepared:

    * ``'categorical_type'`` - the values were a ``pd.Categorical``; the
      codes are stored and ``(ordered, categories)`` is kept in the tag.
    * ``'datetime64_tz_type'`` - tz-aware datetimes; the values are viewed
      as ``'i8'`` and the tzinfo is kept in the tag.
    * ``"other"`` - any other extension dtype; the values are pickled.
    * ``'numpy_type'`` - everything else.

    All payloads except ``"other"`` go through ``pnp.serialize`` followed by
    ``pnp.compress``.
    """
    values = block.values
    try:
        # pandas >= 0.19
        from pandas.api.types import is_datetime64tz_dtype
    except ImportError:
        from pandas.core.common import is_datetime64tz_dtype

    # Only the generic extension-array branch is pickled wholesale.
    use_pickle = False
    if isinstance(values, pd.Categorical):
        extension = ('categorical_type', (values.ordered, values.categories))
        values = values.codes
    elif is_datetime64tz_dtype(block):
        extension = ('datetime64_tz_type', (block.values.tzinfo, ))
        values = values.view('i8')
    elif is_extension_array_dtype(block.dtype):
        extension = ("other", ())
        use_pickle = True
    else:
        extension = ('numpy_type', ())

    header = (block.mgr_locs.as_array, values.dtype, values.shape, extension)

    if use_pickle:
        payload = pickle.dumps(values)
    else:
        serialized = pnp.serialize(values)
        payload = pnp.compress(serialized, values.dtype)
    return header, payload
def dtype(self) -> Optional[str]:
    """Return the string form of ``self._pandas_dtype``.

    ``None`` is passed through unchanged.  Pandas extension dtypes (given as
    an instance or as a class) are rendered with ``str``.  Anything else is
    converted to a ``PandasDtype`` and its ``str_alias`` is returned.

    Raises
    ------
    TypeError
        If an extension dtype class cannot be instantiated without
        arguments, or if the stored value is not a recognised dtype.
    """
    raw = self._pandas_dtype
    if raw is None:
        return None

    if is_extension_array_dtype(raw):
        if not isinstance(raw, type):
            return str(raw)
        try:
            # Convert to str here because some pandas dtypes allow
            # an empty constructor for compatibility but fail on str().
            # e.g: PeriodDtype
            return str(raw())
        except (TypeError, AttributeError) as err:
            raise TypeError(
                f"Pandas dtype {raw} cannot be instantiated: "
                f"{err}\n Usage Tip: Use an instance or a string "
                "representation.") from err

    resolved = raw
    if raw in dtypes.NUMPY_TYPES:
        resolved = PandasDtype.from_numpy_type(raw)
    elif isinstance(raw, str):
        resolved = PandasDtype.from_str_alias(raw)
    elif isinstance(raw, type):
        resolved = PandasDtype.from_python_type(raw)

    if not isinstance(resolved, dtypes.PandasDtype):
        raise TypeError(
            "type of `pandas_dtype` argument not recognized: %s "
            "Please specify a pandera PandasDtype enum, legal pandas data "
            "type, pandas data type string alias, or numpy data type "
            "string alias" % type(self._pandas_dtype))
    return resolved.str_alias
def astype(self, dtype, copy=True):
    """Cast this array to ``dtype``.

    Parameters
    ----------
    dtype
        Target dtype; normalised with ``pandas_dtype`` first.
    copy : bool, default True
        Only consulted when ``dtype`` is a ``RaggedDtype``: a copy of this
        array is returned when True, the array itself when False.  Every
        other conversion builds a new container regardless of this flag.

    Returns
    -------
    This array (or a copy of it) for ``RaggedDtype``, an instance of the
    target dtype's array type for other extension dtypes, and otherwise a
    ``numpy.ndarray`` built from this array's elements.
    """
    dtype = pandas_dtype(dtype)

    if isinstance(dtype, RaggedDtype):
        return self.copy() if copy else self

    if is_extension_array_dtype(dtype):
        return dtype.construct_array_type()._from_sequence(np.asarray(self))

    # The elements are first gathered into a Python list, so NumPy always has
    # to allocate a new array.  Forwarding ``copy=False`` to ``np.array``
    # would therefore raise ValueError on NumPy >= 2 ("unable to avoid
    # copy"), while on older NumPy it had no effect; the flag is not passed.
    return np.array(list(self), dtype=dtype)
def test_unary_binary(request, dtype):
    """Check a one-input, two-output ufunc (``np.modf``) on a DataFrame.

    The call must return a 2-tuple, and each element must equal the
    corresponding NumPy output wrapped in a frame with the input's index and
    columns.
    """
    # unary input, binary output
    not_supported = is_extension_array_dtype(dtype) or isinstance(dtype, dict)
    if not_supported:
        marker = pytest.mark.xfail(
            reason="Extension / mixed with multiple outputs not implemented."
        )
        request.node.add_marker(marker)

    values = np.array([[-1, -1], [1, 1]], dtype="int64")
    frame = pd.DataFrame(values, columns=["A", "B"], index=["a", "b"])
    df = frame.astype(dtype=dtype)

    result_pandas = np.modf(df)
    assert isinstance(result_pandas, tuple)
    assert len(result_pandas) == 2

    expected_numpy = np.modf(values)
    for got, raw in zip(result_pandas, expected_numpy):
        want = pd.DataFrame(raw, index=df.index, columns=df.columns)
        tm.assert_frame_equal(got, want)
def _check_op(self, s, op, other, op_name, exc=NotImplementedError):
    """Run ``op(s, other)`` and validate either its result or its failure.

    When ``exc`` is not None the operation must raise ``exc``.  Otherwise
    the result is compared, via ``self.assert_series_equal``, with
    ``s.combine(other, op)`` after dtype adjustments.  Unsigned
    ``__rsub__`` is skipped.
    """
    if exc is not None:
        with pytest.raises(exc):
            op(s, other)
        return

    if s.dtype.is_unsigned_integer and (op_name == "__rsub__"):
        # TODO see https://github.com/pandas-dev/pandas/issues/22023
        pytest.skip("unsigned subtraction gives negative values")

    is_plain_integer_operand = (
        hasattr(other, "dtype")
        and not is_extension_array_dtype(other.dtype)
        and is_integer_dtype(other.dtype)
    )
    if is_plain_integer_operand:
        # other is np.int64 and would therefore always result in
        # upcasting, so keeping other as same numpy_dtype
        other = other.astype(s.dtype.numpy_dtype)

    result = op(s, other)
    expected = s.combine(other, op)

    true_division_ops = ("__rtruediv__", "__truediv__", "__div__")
    if op_name in true_division_ops:
        expected = expected.fillna(np.nan).astype("Float64")
    else:
        # combine method result in 'biggest' (int64) dtype
        expected = expected.astype(s.dtype)
        if op_name.startswith("__r"):
            # TODO reverse operators result in object dtype
            # see https://github.com/pandas-dev/pandas/issues/22024
            result = result.astype(s.dtype)

    if (op_name == "__rpow__") and isinstance(other, pd.Series):
        # TODO pow on Int arrays gives different result with NA
        # see https://github.com/pandas-dev/pandas/issues/22022
        result = result.fillna(1)

    self.assert_series_equal(result, expected)
def get_str_dtype(cls, pandas_dtype_arg):
    """Get pandas-compatible string representation of dtype.

    ``None`` is passed through unchanged.  Pandas extension dtypes (given as
    an instance or as a class) are rendered with ``str``.  Anything else is
    converted to an instance of ``cls`` and its ``str_alias`` is returned.

    Raises
    ------
    TypeError
        If an extension dtype class cannot be instantiated without
        arguments, or if the argument is not a recognised dtype.
    """
    raw = pandas_dtype_arg
    if raw is None:
        return None

    if is_extension_array_dtype(raw):
        if not isinstance(raw, type):
            return str(raw)
        try:
            # Convert to str here because some pandas dtypes allow
            # an empty constructor for compatibility but fail on
            # str(). e.g: PeriodDtype
            return str(raw())
        except (TypeError, AttributeError) as err:
            raise TypeError(
                f"Pandas dtype {raw} cannot be instantiated: "
                f"{err}\n Usage Tip: Use an instance or a string "
                "representation."
            ) from err

    resolved = raw
    if raw in NUMPY_TYPES:
        resolved = cls.from_numpy_type(raw)
    elif isinstance(raw, str):
        resolved = cls.from_str_alias(raw)
    elif isinstance(raw, type):
        resolved = cls.from_python_type(raw)

    if not isinstance(resolved, cls):
        raise TypeError(
            "type of `pandas_dtype` argument not recognized: "
            f"{type(pandas_dtype_arg)}. Please specify a pandera PandasDtype "
            "enum, legal pandas data type, pandas data type string alias, or "
            "numpy data type string alias"
        )
    return resolved.str_alias
def time_is_extension_array_dtype_false(self):
    """Time ``is_extension_array_dtype`` on ``self.np_dtype``.

    The return value of the check is discarded; only the call is exercised.
    """
    # Kept as a single bare call so the timed work is unchanged.
    is_extension_array_dtype(
        self.np_dtype
    )
def time_is_extension_array_dtype_true(self):
    """Time ``is_extension_array_dtype`` on ``self.ext_dtype``.

    The return value of the check is discarded; only the call is exercised.
    """
    # Kept as a single bare call so the timed work is unchanged.
    is_extension_array_dtype(
        self.ext_dtype
    )
def numeric(self) -> pd.DataFrame:
    """
    Descriptive statistics for numeric data

    The statistics are computed column by column from the columns of
    ``self._data`` selected by ``self._is_numeric``.  Only the statistics
    named in ``self._stats`` are returned.  When ``"percentiles"`` is
    requested, one row per entry of ``self._percentiles`` is appended and the
    combined frame is passed through ``self._reorder``.

    Returns
    -------
    DataFrame
        The statistics of the numeric columns, one row per statistic and
        one column per numeric column.
    """
    df: pd.DataFrame = self._data.loc[:, self._is_numeric]
    cols = df.columns
    _, k = df.shape
    std = df.std()
    count = df.count()
    mean = df.mean()
    mad = (df - mean).abs().mean()
    has_obs = count > 0
    # Standard error of the mean is std / sqrt(n); columns without any
    # observation keep whatever df.std() produced for them.
    std_err = std.copy()
    std_err.loc[has_obs] /= count.loc[has_obs] ** 0.5
    if self._use_t:
        q = stats.t(count - 1).ppf(1.0 - self._alpha / 2)
    else:
        q = stats.norm.ppf(1.0 - self._alpha / 2)

    def _mode(ser):
        # Return [mode, count] for one column, or (nan, nan) when empty.
        mode_res = stats.mode(ser.dropna())
        # Depending on the SciPy version the two fields are 1-element
        # arrays or plain scalars; normalise both to 1-d arrays.
        modes = np.atleast_1d(mode_res[0])
        counts = np.atleast_1d(mode_res[1])
        if modes.shape[0] > 0:
            return [float(modes[0]), float(counts[0])]
        return np.nan, np.nan

    mode_values = df.apply(_mode).T
    if mode_values.size > 0:
        if isinstance(mode_values, pd.DataFrame):
            # pandas 1.0 or later
            mode = np.asarray(mode_values[0], dtype=float)
            mode_counts = np.asarray(mode_values[1], dtype=np.int64)
        else:
            # pandas before 1.0 returns a Series of 2-elem list
            mode = []
            mode_counts = []
            for idx in mode_values.index:
                val = mode_values.loc[idx]
                mode.append(val[0])
                mode_counts.append(val[1])
            mode = np.atleast_1d(mode)
            mode_counts = np.atleast_1d(mode_counts)
    else:
        mode = mode_counts = np.empty(0)
    mode_freq = np.full(mode.shape[0], np.nan)
    mode_freq[has_obs] = mode_counts[has_obs] / count.loc[has_obs]

    # TODO: Workaround for pandas AbstractMethodError in extension
    #  types. Remove when quantile is supported for these
    _df = df
    try:
        from pandas.api.types import is_extension_array_dtype

        _df = df.copy()
        for col in df:
            if is_extension_array_dtype(df[col].dtype):
                _df[col] = _df[col].astype(object).fillna(np.nan)
    except ImportError:
        # Deliberate best effort: without the helper, use df unchanged.
        pass

    if df.shape[1] > 0:
        iqr = _df.quantile(0.75) - _df.quantile(0.25)
    else:
        iqr = mean

    def _safe_jarque_bera(c):
        # jarque_bera is only called with at least two observations.
        a = np.asarray(c)
        if a.shape[0] < 2:
            return (np.nan, ) * 4
        return jarque_bera(a)

    jb = df.apply(
        lambda x: list(_safe_jarque_bera(x.dropna())),
        result_type="expand",
    ).T
    # A zero mean would make the coefficient of variation infinite; use NaN.
    nan_mean = mean.copy()
    nan_mean.loc[nan_mean == 0] = np.nan
    coef_var = std / nan_mean

    results = {
        "nobs": pd.Series(
            np.ones(k, dtype=np.int64) * df.shape[0], index=cols
        ),
        "missing": df.shape[0] - count,
        "mean": mean,
        "std_err": std_err,
        "upper_ci": mean + q * std_err,
        "lower_ci": mean - q * std_err,
        "std": std,
        "iqr": iqr,
        "mad": mad,
        "coef_var": coef_var,
        "range": pd_ptp(df),
        "max": df.max(),
        "min": df.min(),
        "skew": jb[2],
        "kurtosis": jb[3],
        "iqr_normal": iqr / np.diff(stats.norm.ppf([0.25, 0.75])),
        "mad_normal": mad / np.sqrt(2 / np.pi),
        "jarque_bera": jb[0],
        "jarque_bera_pval": jb[1],
        "mode": pd.Series(mode, index=cols),
        "mode_freq": pd.Series(mode_freq, index=cols),
        "median": df.median(),
    }
    final = {k: v for k, v in results.items() if k in self._stats}
    results_df = pd.DataFrame(
        list(final.values()), columns=cols, index=list(final.keys())
    )
    if "percentiles" not in self._stats:
        return results_df

    # Pandas before 1.0 cannot handle empty DF
    if df.shape[1] > 0:
        # TODO: Remove when extension types support quantile
        perc = _df.quantile(self._percentiles / 100).astype(float)
    else:
        perc = pd.DataFrame(index=self._percentiles / 100, dtype=float)

    if np.all(np.floor(100 * perc.index) == (100 * perc.index)):
        # Every percentile is a whole number: label rows "5%", "25%", ...
        perc.index = [f"{int(100 * idx)}%" for idx in perc.index]
    else:
        # Add decimal places until the truncated labels are strictly
        # increasing, i.e. no two percentiles share a label.
        dupe = True
        scale = 100
        index = perc.index
        while dupe:
            scale *= 10
            idx = np.floor(scale * perc.index)
            if np.all(np.diff(idx) > 0):
                dupe = False
        index = np.floor(scale * index) / (scale / 100)
        fmt = f"0.{len(str(scale//100))-1}f"
        output = f"{{0:{fmt}}}%"
        perc.index = [output.format(val) for val in index]

    # ``axis`` is passed by keyword; pandas >= 2 rejects it positionally.
    return self._reorder(pd.concat([results_df, perc], axis=0))