Exemplo n.º 1
0
def test_summarizer(config):
    """Smoke-test the summarizer against one sample series per type name.

    Each sample is summarized and the result is passed through
    ``format_summary``. The outputs are discarded and nothing is asserted,
    so the test only fails if one of those calls raises.
    """
    summarizer = PandasProfilingSummarizer(typeset=ProfilingTypeSet(config))

    def single_path_series(relative):
        # Plain string concatenation (no separator inserted) is kept on
        # purpose so the resolved path is exactly what it was before.
        return pd.Series([os.path.abspath(base_path + relative)])

    samples = [
        (pd.Series([1, 2, 3, 4, 5]), "Unsupported"),
        (pd.Series([1, 2, 3, 4, 5]), "Numeric"),
        (pd.Series(pd.date_range(start="1/1/2018", end="1/08/2018")), "DateTime"),
        (pd.Series(["abc", "abc", "abba"]), "Categorical"),
        (pd.Series(["https://www.example.com"]), "URL"),
        (
            single_path_series(
                r"../../../src/pandas_profiling/model/typeset_does_not_exist.py"
            ),
            "Path",
        ),
        (
            single_path_series(r"../../../src/pandas_profiling/model/typeset.py"),
            "File",
        ),
        (single_path_series(r"../../../docsrc/assets/lambda-labs.png"), "Image"),
        (pd.Series([True, False, True, False, False]), "Boolean"),
    ]

    for sample, type_name in samples:
        format_summary(summarizer.summarize(config, sample, type_name))
Exemplo n.º 2
0
def test_describe_df(column, describe_data, expected_results, summarizer):
    """Describe a single-column frame and compare it with the expected summary.

    Args:
        column: Name of the column under test; used as a key into both
            ``describe_data`` and ``expected_results``.
        describe_data: Mapping from column name to the raw column values.
        expected_results: Mapping from column name to a dict of expected
            summary entries. An entry equal to ``check_is_NaN`` means the
            key must be absent from the computed summary.
        summarizer: Summarizer passed through to ``describe``.

    NOTE(review): the arguments are presumably supplied by pytest fixtures /
    parametrization defined outside this snippet - confirm.
    """
    config = Settings()
    config.vars.num.low_categorical_threshold = 0

    typeset = ProfilingTypeSet(config)

    describe_data_frame = pd.DataFrame({column: describe_data[column]})
    if column == "somedate":
        # The date column is converted to datetimes before being described.
        describe_data_frame["somedate"] = pd.to_datetime(
            describe_data_frame["somedate"]
        )

    results = describe(config, describe_data_frame, summarizer, typeset)

    assert {
        "analysis",
        "table",
        "variables",
        "scatter",
        "correlations",
        "missing",
        "messages",
        "package",
        "sample",
        "duplicates",
    } == set(results.keys()), "Not in results"

    column_summary = results["variables"][column]

    # Each kind of expectation gets its own assertion so that the failure
    # message describes the check that actually failed (previously every
    # failure was reported as "is not NaN").
    for k, v in expected_results[column].items():
        if v == check_is_NaN:
            assert (
                k not in column_summary
            ), f"Value `{column_summary[k]}` for key `{k}` in column `{column}` is not NaN"
        elif isinstance(v, float):
            # Floats are compared with a tolerance.
            assert pytest.approx(v) == column_summary[k], (
                f"Value `{column_summary[k]}` for key `{k}` in column `{column}` "
                f"is not approximately equal to expected `{v}`"
            )
        else:
            assert v == column_summary[k], (
                f"Value `{column_summary[k]}` for key `{k}` in column `{column}` "
                f"does not equal expected `{v}`"
            )

    if column_summary["type"] in ["Numeric", "DateTime"]:
        assert (
            "histogram" in column_summary
        ), f"Histogram missing for column {column}"
Exemplo n.º 3
0
def test_summarizer():
    """Smoke-test the summarizer against one sample series per type.

    Each sample is summarized and the result is passed through
    ``format_summary``. The outputs are discarded and nothing is asserted,
    so the test only fails if one of those calls raises.
    """
    summarizer = PandasProfilingSummarizer(typeset=ProfilingTypeSet())

    def single_path_series(relative):
        # Plain string concatenation (no separator inserted) is kept on
        # purpose so the resolved path is exactly what it was before.
        return pd.Series([os.path.abspath(base_path + relative)])

    samples = [
        (pd.Series([1, 2, 3, 4, 5]), Unsupported),
        (pd.Series([1, 2, 3, 4, 5]), Numeric),
        (pd.Series(pd.date_range(start="1/1/2018", end="1/08/2018")), DateTime),
        (pd.Series(["abc", "abc", "abba"]), Categorical),
        (pd.Series(["https://www.example.com"]), URL),
        (
            single_path_series(
                r"../../../src/pandas_profiling/model/typeset_does_not_exist.py"
            ),
            Path,
        ),
        (
            single_path_series(r"../../../src/pandas_profiling/model/typeset.py"),
            File,
        ),
        (single_path_series(r"../../../docsrc/assets/lambda-labs.png"), Image),
        (pd.Series([True, False, True, False, False]), Boolean),
    ]

    for sample, sample_type in samples:
        format_summary(summarizer.summarize(sample, sample_type))
    infers,
)

from pandas_profiling.config import config
from pandas_profiling.model.typeset import (
    Boolean,
    Categorical,
    DateTime,
    Numeric,
    ProfilingTypeSet,
    Unsupported,
)

# Module-level test data: whatever get_series() returns (defined outside
# this snippet).
series = get_series()

# Typeset instance built with no arguments, shared at module level.
typeset = ProfilingTypeSet()

contains_map = {
    Numeric: {
        "int_series",
        "Int64_int_series",
        "int_range",
        "Int64_int_nan_series",
        "int_series_boolean",
        "np_uint32",
        "pd_uint32",
        "float_series",
        "float_series2",
        "float_series3",
        "float_series4",
        "inf_series",
Exemplo n.º 5
0
 def typeset(self):
     """Return the stored typeset, creating it on first access.

     A ``ProfilingTypeSet`` is built and stored in ``self._typeset`` only
     when that attribute is still ``None``.
     """
     existing = self._typeset
     if existing is not None:
         return existing

     self._typeset = ProfilingTypeSet()
     return self._typeset
Exemplo n.º 6
0
 def typeset(self) -> Optional[VisionsTypeset]:
     """Return the stored typeset, creating it from ``self.config`` if unset.

     A ``ProfilingTypeSet`` is built and stored in ``self._typeset`` only
     when that attribute is still ``None``.
     """
     existing = self._typeset
     if existing is not None:
         return existing

     self._typeset = ProfilingTypeSet(self.config)
     return self._typeset
Exemplo n.º 7
0
def typeset(config):
    """Build and return a ``ProfilingTypeSet`` for the given ``config``."""
    profiling_typeset = ProfilingTypeSet(config)
    return profiling_typeset
Exemplo n.º 8
0
def typeset():
    """Build and return a ``ProfilingTypeSet`` constructed with no arguments."""
    profiling_typeset = ProfilingTypeSet()
    return profiling_typeset
            (15, 16),
            (17, 18),
        ],
        "date_str": ["2018-01-01", "2017-02-01", "2018-04-07"],
        "nullable_int":
        pd.Series([1, None], dtype="Int64"),
    }

    return {key: pd.Series(values, name=key) for key, values in data.items()}


series = get_profiling_series()

# First typeset: low_categorical_threshold set to 0.
config = Settings()
config.vars.num.low_categorical_threshold = 0
my_typeset = ProfilingTypeSet(config)

# Look the type objects up by their string name.
type_map = {str(visions_type): visions_type for visions_type in my_typeset.types}
Numeric, Categorical, Boolean, DateTime, Unsupported = (
    type_map[type_name]
    for type_name in ("Numeric", "Categorical", "Boolean", "DateTime", "Unsupported")
)

# Second typeset: low_categorical_threshold set to 2.
config2 = Settings()
config2.vars.num.low_categorical_threshold = 2
typeset2 = ProfilingTypeSet(config2)
type_map2 = {str(visions_type): visions_type for visions_type in typeset2.types}
Numeric2, Categorical2, Boolean2 = (
    type_map2[type_name] for type_name in ("Numeric", "Categorical", "Boolean")
)
from tests.unit.test_utils import patch_arg

# Name of the boolean dtype; pandas releases with a major version below 1
# use "Bool" and need the visions BoolDtype import.
btype = "boolean"
if int(pd.__version__.split(".")[0]) < 1:
    from visions.dtypes.boolean import BoolDtype  # noqa: F401

    btype = "Bool"

# Directory containing this file.
base_path = os.path.abspath(os.path.dirname(__file__))

series = get_series()

# Typeset with low_categorical_threshold set to 0.
my_config = Settings()
my_config.vars.num.low_categorical_threshold = 0
my_typeset_default = ProfilingTypeSet(my_config)

# Look the type objects up by their string name.
type_map = {
    str(visions_type): visions_type for visions_type in my_typeset_default.types
}
Numeric, Categorical, Boolean, DateTime, Unsupported = (
    type_map[type_name]
    for type_name in ("Numeric", "Categorical", "Boolean", "DateTime", "Unsupported")
)

contains_map = {
    Numeric: {
        "int_series",
        "Int64_int_series",
        "int_range",
        "Int64_int_nan_series",
        "int_series_boolean",