def test_summarizer(config):
    """Smoke-test the summarizer with one sample series per type name.

    Every summary is passed through ``format_summary`` and the result is
    discarded; the test contains no assertions, so it passes as long as
    none of the calls raise.
    """
    summarizer = PandasProfilingSummarizer(typeset=ProfilingTypeSet(config))

    def located(relative):
        # The relative part is appended with plain string concatenation
        # (not a path join) before the result is made absolute.
        return os.path.abspath(base_path + relative)

    # (type name, values for the series), in the order they are summarized.
    cases = [
        ("Unsupported", [1, 2, 3, 4, 5]),
        ("Numeric", [1, 2, 3, 4, 5]),
        ("DateTime", pd.date_range(start="1/1/2018", end="1/08/2018")),
        ("Categorical", ["abc", "abc", "abba"]),
        ("URL", ["https://www.example.com"]),
        (
            "Path",
            [
                located(
                    r"../../../src/pandas_profiling/model/typeset_does_not_exist.py"
                )
            ],
        ),
        ("File", [located(r"../../../src/pandas_profiling/model/typeset.py")]),
        ("Image", [located(r"../../../docsrc/assets/lambda-labs.png")]),
        ("Boolean", [True, False, True, False, False]),
    ]

    for type_name, values in cases:
        format_summary(summarizer.summarize(config, pd.Series(values), type_name))
def test_describe_df(column, describe_data, expected_results, summarizer):
    """Describe a single-column frame and compare it with the expected summary.

    Args:
        column: Name of the column under test; used as a key into both
            ``describe_data`` and ``expected_results``.
        describe_data: Mapping from column name to the raw column values.
        expected_results: Mapping from column name to the expected
            ``{statistic: value}`` pairs for that column.
        summarizer: Summarizer handed through to ``describe``.
    """
    config = Settings()
    config.vars.num.low_categorical_threshold = 0
    typeset = ProfilingTypeSet(config)

    describe_data_frame = pd.DataFrame({column: describe_data[column]})
    if column == "somedate":
        # The "somedate" column is converted with pd.to_datetime first.
        describe_data_frame["somedate"] = pd.to_datetime(
            describe_data_frame["somedate"]
        )

    results = describe(config, describe_data_frame, summarizer, typeset)

    assert {
        "analysis",
        "table",
        "variables",
        "scatter",
        "correlations",
        "missing",
        "messages",
        "package",
        "sample",
        "duplicates",
    } == set(results.keys()), "Not in results"

    summary = results["variables"][column]

    # Loop over variables. Each branch carries its own failure message so
    # the report names the check that actually failed.
    for k, v in expected_results[column].items():
        if v == check_is_NaN:
            # The sentinel means the key must be absent from the summary.
            assert (
                k not in summary
            ), f"Value `{summary[k]}` for key `{k}` in column `{column}` is not NaN"
        elif isinstance(v, float):
            # Floats are compared approximately.
            assert pytest.approx(v) == summary[k], (
                f"Value `{summary[k]}` for key `{k}` in column `{column}` "
                f"is not approximately `{v}`"
            )
        else:
            assert v == summary[k], (
                f"Value `{summary[k]}` for key `{k}` in column `{column}` "
                f"does not equal `{v}`"
            )

    if summary["type"] in ["Numeric", "DateTime"]:
        assert "histogram" in summary, f"Histogram missing for column {column}"
def test_summarizer():
    """Smoke-test the summarizer with one sample series per type.

    Every summary is passed through ``format_summary`` and the result is
    discarded; the test contains no assertions, so it passes as long as
    none of the calls raise.
    """
    summarizer = PandasProfilingSummarizer(typeset=ProfilingTypeSet())

    def located(relative):
        # The relative part is appended with plain string concatenation
        # (not a path join) before the result is made absolute.
        return os.path.abspath(base_path + relative)

    # (type, values for the series), in the order they are summarized.
    cases = [
        (Unsupported, [1, 2, 3, 4, 5]),
        (Numeric, [1, 2, 3, 4, 5]),
        (DateTime, pd.date_range(start="1/1/2018", end="1/08/2018")),
        (Categorical, ["abc", "abc", "abba"]),
        (URL, ["https://www.example.com"]),
        (
            Path,
            [
                located(
                    r"../../../src/pandas_profiling/model/typeset_does_not_exist.py"
                )
            ],
        ),
        (File, [located(r"../../../src/pandas_profiling/model/typeset.py")]),
        (Image, [located(r"../../../docsrc/assets/lambda-labs.png")]),
        (Boolean, [True, False, True, False, False]),
    ]

    for series_type, values in cases:
        format_summary(summarizer.summarize(pd.Series(values), series_type))
infers, ) from pandas_profiling.config import config from pandas_profiling.model.typeset import ( Boolean, Categorical, DateTime, Numeric, ProfilingTypeSet, Unsupported, ) series = get_series() typeset = ProfilingTypeSet() contains_map = { Numeric: { "int_series", "Int64_int_series", "int_range", "Int64_int_nan_series", "int_series_boolean", "np_uint32", "pd_uint32", "float_series", "float_series2", "float_series3", "float_series4", "inf_series",
def typeset(self):
    """Return the stored typeset, creating a ``ProfilingTypeSet`` on first use."""
    if self._typeset is not None:
        return self._typeset

    # Nothing stored yet: build the typeset once and keep it on the instance.
    self._typeset = ProfilingTypeSet()
    return self._typeset
def typeset(self) -> Optional[VisionsTypeset]:
    """Return the stored typeset, creating it from ``self.config`` on first use."""
    if self._typeset is not None:
        return self._typeset

    # Nothing stored yet: build the typeset from this object's config and
    # keep it on the instance for later calls.
    self._typeset = ProfilingTypeSet(self.config)
    return self._typeset
def typeset(config):
    """Build a ``ProfilingTypeSet`` from the supplied ``config``."""
    profiling_typeset = ProfilingTypeSet(config)
    return profiling_typeset
def typeset():
    """Build a ``ProfilingTypeSet`` with no arguments."""
    profiling_typeset = ProfilingTypeSet()
    return profiling_typeset
(15, 16), (17, 18), ], "date_str": ["2018-01-01", "2017-02-01", "2018-04-07"], "nullable_int": pd.Series([1, None], dtype="Int64"), } return {key: pd.Series(values, name=key) for key, values in data.items()} series = get_profiling_series() config = Settings() config.vars.num.low_categorical_threshold = 0 my_typeset = ProfilingTypeSet(config) type_map = {str(k): k for k in my_typeset.types} Numeric = type_map["Numeric"] Categorical = type_map["Categorical"] Boolean = type_map["Boolean"] DateTime = type_map["DateTime"] Unsupported = type_map["Unsupported"] config2 = Settings() config2.vars.num.low_categorical_threshold = 2 typeset2 = ProfilingTypeSet(config2) type_map2 = {str(k): k for k in typeset2.types} Numeric2 = type_map2["Numeric"] Categorical2 = type_map2["Categorical"] Boolean2 = type_map2["Boolean"]
from tests.unit.test_utils import patch_arg if int(pd.__version__.split(".")[0]) < 1: from visions.dtypes.boolean import BoolDtype # noqa: F401 btype = "Bool" else: btype = "boolean" base_path = os.path.abspath(os.path.dirname(__file__)) series = get_series() my_config = Settings() my_config.vars.num.low_categorical_threshold = 0 my_typeset_default = ProfilingTypeSet(my_config) type_map = {str(k): k for k in my_typeset_default.types} Numeric = type_map["Numeric"] Categorical = type_map["Categorical"] Boolean = type_map["Boolean"] DateTime = type_map["DateTime"] Unsupported = type_map["Unsupported"] contains_map = { Numeric: { "int_series", "Int64_int_series", "int_range", "Int64_int_nan_series", "int_series_boolean",