Example #1
def test_describe_unique(data, expected, summarizer, typeset):
    """Test the unique feature of 1D data"""
    config = Settings()
    config.vars.num.low_categorical_threshold = 0

    desc_1d = describe_1d(config, data, summarizer, typeset)
    if expected["is_unique"] is not None:
        assert (
            desc_1d["p_unique"] == expected["p_unique"]
        ), "Describe 1D p_unique incorrect"
        assert (
            desc_1d["p_distinct"] == expected["p_distinct"]
        ), "Describe 1D p_distinct incorrect"
        assert (
            desc_1d["is_unique"] == expected["is_unique"]
        ), "Describe 1D should return unique"
Example #2
def test_describe_df(column, describe_data, expected_results, summarizer):
    config = Settings()
    config.vars.num.low_categorical_threshold = 0

    typeset = ProfilingTypeSet(config)

    describe_data_frame = pd.DataFrame({column: describe_data[column]})
    if column == "somedate":
        describe_data_frame["somedate"] = pd.to_datetime(
            describe_data_frame["somedate"]
        )

    results = describe(config, describe_data_frame, summarizer, typeset)

    assert {
        "analysis",
        "table",
        "variables",
        "scatter",
        "correlations",
        "missing",
        "messages",
        "package",
        "sample",
        "duplicates",
    } == set(results.keys()), "Unexpected top-level keys in describe results"

    # Loop over variables
    for k, v in expected_results[column].items():
        if v == check_is_NaN:
            test_condition = k not in results["variables"][column]
        elif isinstance(v, float):
            test_condition = pytest.approx(v) == results["variables"][column][k]
        else:
            test_condition = v == results["variables"][column][k]

        assert (
            test_condition
        ), f"Value `{results['variables'][column][k]}` for key `{k}` in column `{column}` does not match the expected `{v}`"

    if results["variables"][column]["type"] in ["Numeric", "DateTime"]:
        assert (
            "histogram" in results["variables"][column]
        ), f"Histogram missing for column {column}"
Example #3
def describe(
    config: Settings,
    df: pd.DataFrame,
    summarizer: BaseSummarizer,
    typeset: VisionsTypeset,
    sample: Optional[dict] = None,
) -> dict:
    """Calculate the statistics for each series in this DataFrame.

    Args:
        config: report Settings object
        df: DataFrame.
        summarizer: summarizer object
        typeset: visions typeset
        sample: optional, dict with custom sample

    Returns:
        This function returns a dictionary containing:
            - analysis: analysis metadata (title, start/end timestamps, duration).
            - table: overall statistics.
            - variables: descriptions per series.
            - scatter: bivariate relations (scatter plots).
            - correlations: correlation matrices.
            - missing: missing value diagrams.
            - alerts: patterns in the data that deserve special attention.
            - package: package details.
            - sample: sample of the data.
            - duplicates: duplicated rows.
    """

    if df is None:
        raise ValueError(
            "Cannot describe a `lazy` ProfileReport without a DataFrame."
        )

    check_dataframe(df)
    df = preprocess(config, df)

    number_of_tasks = 5

    with tqdm(
            total=number_of_tasks,
            desc="Summarize dataset",
            disable=not config.progress_bar,
            position=0,
    ) as pbar:
        date_start = datetime.utcnow()

        # Variable-specific
        pbar.total += len(df.columns)
        series_description = get_series_descriptions(config, df, summarizer,
                                                     typeset, pbar)

        pbar.set_postfix_str("Get variable types")
        pbar.total += 1
        variables = {
            column: description["type"]
            for column, description in series_description.items()
        }
        supported_columns = [
            column for column, type_name in variables.items()
            if type_name != "Unsupported"
        ]
        interval_columns = [
            column for column, type_name in variables.items()
            if type_name == "Numeric"
        ]
        pbar.update()

        # Get correlations
        correlation_names = get_active_correlations(config)
        pbar.total += len(correlation_names)

        correlations = {
            correlation_name: progress(
                calculate_correlation,
                pbar,
                f"Calculate {correlation_name} correlation",
            )(config, df, correlation_name, series_description)
            for correlation_name in correlation_names
        }

        # make sure correlations is not None
        correlations = {
            key: value
            for key, value in correlations.items() if value is not None
        }

        # Scatter matrix
        pbar.set_postfix_str("Get scatter matrix")
        scatter_tasks = get_scatter_tasks(config, interval_columns)
        pbar.total += len(scatter_tasks)
        scatter_matrix: Dict[Any, Dict[Any, Any]] = {
            x: {y: None} for x, y in scatter_tasks
        }
        for x, y in scatter_tasks:
            scatter_matrix[x][y] = progress(
                get_scatter_plot, pbar, f"scatter {x}, {y}"
            )(config, df, x, y, interval_columns)

        # Table statistics
        table_stats = progress(get_table_stats, pbar, "Get dataframe statistics")(
            config, df, series_description
        )

        # missing diagrams
        missing_map = get_missing_active(config, table_stats)
        pbar.total += len(missing_map)
        missing = {
            name: progress(get_missing_diagram, pbar, f"Missing diagram {name}")(
                config, df, settings
            )
            for name, settings in missing_map.items()
        }
        missing = {
            name: value
            for name, value in missing.items() if value is not None
        }

        # Sample
        pbar.set_postfix_str("Take sample")
        if sample is None:
            samples = get_sample(config, df)
        else:
            samples = get_custom_sample(sample)
        pbar.update()

        # Duplicates
        metrics, duplicates = progress(get_duplicates, pbar, "Detecting duplicates")(
            config, df, supported_columns
        )
        table_stats.update(metrics)

        alerts = progress(get_alerts, pbar, "Get alerts")(
            config, table_stats, series_description, correlations
        )

        pbar.set_postfix_str("Get reproduction details")
        package = {
            "pandas_profiling_version": __version__,
            "pandas_profiling_config": config.json(),
        }
        pbar.update()

        pbar.set_postfix_str("Completed")

        date_end = datetime.utcnow()

    analysis = {
        "title": config.title,
        "date_start": date_start,
        "date_end": date_end,
        "duration": date_end - date_start,
    }

    return {
        # Analysis metadata
        "analysis": analysis,
        # Overall dataset description
        "table": table_stats,
        # Per variable descriptions
        "variables": series_description,
        # Bivariate relations
        "scatter": scatter_matrix,
        # Correlation matrices
        "correlations": correlations,
        # Missing values
        "missing": missing,
        # Alerts
        "alerts": alerts,
        # Package
        "package": package,
        # Sample
        "sample": samples,
        # Duplicates
        "duplicates": duplicates,
    }
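For context, a direct call might look like the following sketch. The import locations (pandas_profiling.model.describe, pandas_profiling.model.summarizer, pandas_profiling.model.typeset) and the PandasProfilingSummarizer(typeset) constructor reflect the pandas-profiling 3.x layout and are assumptions, not part of the example above; in everyday use the same result is obtained through ProfileReport.

import pandas as pd

from pandas_profiling.config import Settings
from pandas_profiling.model.describe import describe
from pandas_profiling.model.summarizer import PandasProfilingSummarizer
from pandas_profiling.model.typeset import ProfilingTypeSet

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "y"]})

config = Settings()
config.vars.num.low_categorical_threshold = 0  # as in the tests above, keep small int columns Numeric

typeset = ProfilingTypeSet(config)
summarizer = PandasProfilingSummarizer(typeset)  # assumed constructor signature

results = describe(config, df, summarizer, typeset)
print(results["table"]["n"])              # overall statistics, e.g. the row count
print(results["variables"]["a"]["type"])  # inferred type, e.g. "Numeric"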
Example #4
def describe(
    config: Settings,
    df: pd.DataFrame,
    summarizer: BaseSummarizer,
    typeset: VisionsTypeset,
    sample: Optional[dict] = None,
) -> dict:
    """Calculate the statistics for each series in this DataFrame.

    Args:
        config: report Settings object
        df: DataFrame.
        summarizer: summarizer object
        typeset: visions typeset
        sample: optional, dict with custom sample

    Returns:
        This function returns a dictionary containing:
            - analysis: analysis metadata (title, start/end timestamps, duration).
            - table: overall statistics.
            - variables: descriptions per series.
            - scatter: bivariate relations (scatter plots).
            - correlations: correlation matrices.
            - missing: missing value diagrams.
            - messages: patterns in the data that deserve special attention.
            - package: package details.
            - sample: sample of the data.
            - duplicates: duplicated rows.
    """

    if df is None:
        raise ValueError(
            "Cannot describe a `lazy` ProfileReport without a DataFrame."
        )

    if not isinstance(df, pd.DataFrame):
        warnings.warn("df is not of type pandas.DataFrame")

    disable_progress_bar = not config.progress_bar

    date_start = datetime.utcnow()

    correlation_names = [
        correlation_name
        for correlation_name in ["pearson", "spearman", "kendall", "phi_k", "cramers"]
        if config.correlations[correlation_name].calculate
    ]

    number_of_tasks = 8 + len(df.columns) + len(correlation_names)

    with tqdm(total=number_of_tasks,
              desc="Summarize dataset",
              disable=disable_progress_bar) as pbar:
        series_description = get_series_descriptions(config, df, summarizer,
                                                     typeset, pbar)

        pbar.set_postfix_str("Get variable types")
        variables = {
            column: description["type"]
            for column, description in series_description.items()
        }
        supported_columns = [
            column for column, type_name in variables.items()
            if type_name != "Unsupported"
        ]
        interval_columns = [
            column for column, type_name in variables.items()
            if type_name == "Numeric"
        ]
        pbar.update()

        # Get correlations
        correlations = {}
        for correlation_name in correlation_names:
            pbar.set_postfix_str(f"Calculate {correlation_name} correlation")
            correlations[correlation_name] = calculate_correlation(
                config, df, correlation_name, series_description)
            pbar.update()

        # make sure correlations is not None
        correlations = {
            key: value
            for key, value in correlations.items() if value is not None
        }

        # Scatter matrix
        pbar.set_postfix_str("Get scatter matrix")
        scatter_matrix = get_scatter_matrix(config, df, interval_columns)
        pbar.update()

        # Table statistics
        pbar.set_postfix_str("Get table statistics")
        table_stats = get_table_stats(config, df, series_description)
        pbar.update()

        # missing diagrams
        pbar.set_postfix_str("Get missing diagrams")
        missing = get_missing_diagrams(config, df, table_stats)
        pbar.update()

        # Sample
        pbar.set_postfix_str("Take sample")
        if sample is None:
            samples = get_sample(config, df)
        else:
            if "name" not in sample:
                sample["name"] = None
            if "caption" not in sample:
                sample["caption"] = None

            samples = [
                Sample(
                    id="custom",
                    data=sample["data"],
                    name=sample["name"],
                    caption=sample["caption"],
                )
            ]
        pbar.update()

        # Duplicates
        pbar.set_postfix_str("Locating duplicates")
        metrics, duplicates = get_duplicates(config, df, supported_columns)
        table_stats.update(metrics)
        pbar.update()

        # Messages
        pbar.set_postfix_str("Get messages/warnings")
        messages = get_messages(config, table_stats, series_description,
                                correlations)
        pbar.update()

        pbar.set_postfix_str("Get reproduction details")
        package = {
            "pandas_profiling_version": __version__,
            "pandas_profiling_config": config.json(),
        }
        pbar.update()

        pbar.set_postfix_str("Completed")

    date_end = datetime.utcnow()

    analysis = {
        "title": config.title,
        "date_start": date_start,
        "date_end": date_end,
        "duration": date_end - date_start,
    }

    return {
        # Analysis metadata
        "analysis": analysis,
        # Overall dataset description
        "table": table_stats,
        # Per variable descriptions
        "variables": series_description,
        # Bivariate relations
        "scatter": scatter_matrix,
        # Correlation matrices
        "correlations": correlations,
        # Missing values
        "missing": missing,
        # Warnings
        "messages": messages,
        # Package
        "package": package,
        # Sample
        "sample": samples,
        # Duplicates
        "duplicates": duplicates,
    }
Example #5
    def __init__(
        self,
        df: Optional[pd.DataFrame] = None,
        minimal: bool = False,
        explorative: bool = False,
        sensitive: bool = False,
        dark_mode: bool = False,
        orange_mode: bool = False,
        sample: Optional[dict] = None,
        config_file: Optional[Union[Path, str]] = None,
        lazy: bool = True,
        typeset: Optional[VisionsTypeset] = None,
        summarizer: Optional[BaseSummarizer] = None,
        config: Optional[Settings] = None,
        **kwargs,
    ):
        """Generate a ProfileReport based on a pandas DataFrame

        Args:
            df: the pandas DataFrame
            minimal: minimal mode is a default configuration with minimal computation
            explorative: apply the `explorative` configuration preset
            sensitive: apply the `sensitive` configuration preset
            dark_mode: apply the dark-theme configuration preset
            orange_mode: apply the orange-theme configuration preset
            sample: optional dict(name="Sample title", caption="Caption", data=pd.DataFrame())
            config_file: a config file (.yml), mutually exclusive with `minimal`
            lazy: compute when needed
            typeset: optional user typeset to use for type inference
            summarizer: optional user summarizer to generate custom summary output
            config: optional, a prebuilt Settings object used as the base configuration
            **kwargs: other arguments; for valid arguments, check the default configuration file.
        """

        if df is None and not lazy:
            raise ValueError(
                "Cannot initialize a non-lazy ProfileReport without a DataFrame."
            )

        report_config: Settings = Settings() if config is None else config

        if config_file is not None and minimal:
            raise ValueError(
                "Arguments `config_file` and `minimal` are mutually exclusive."
            )

        if config_file or minimal:
            if not config_file:
                config_file = get_config("config_minimal.yaml")

            with open(config_file) as f:
                data = yaml.safe_load(f)

            report_config = report_config.parse_obj(data)

        if explorative:
            report_config = report_config.update(
                Config.get_arg_groups("explorative"))
        if sensitive:
            report_config = report_config.update(
                Config.get_arg_groups("sensitive"))
        if dark_mode:
            report_config = report_config.update(
                Config.get_arg_groups("dark_mode"))
        if orange_mode:
            report_config = report_config.update(
                Config.get_arg_groups("orange_mode"))
        if len(kwargs) > 0:
            report_config = report_config.update(Config.shorthands(kwargs))

        self.df = None
        self.config = report_config
        self._df_hash = None
        self._sample = sample
        self._typeset = typeset
        self._summarizer = summarizer

        if df is not None:
            # preprocess df
            self.df = self.preprocess(df)

        if not lazy:
            # Trigger building the report structure
            _ = self.report
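A typical high-level invocation is sketched below; ProfileReport is the public entry point, extra keyword arguments such as title are merged into the configuration via Config.shorthands(kwargs) as shown in __init__ above, and the column names and output file name are illustrative.

import numpy as np
import pandas as pd

from pandas_profiling import ProfileReport

df = pd.DataFrame(np.random.rand(100, 3), columns=["a", "b", "c"])

# `minimal` loads the minimal configuration preset; combining it with
# `config_file` raises a ValueError, as enforced in __init__ above.
profile = ProfileReport(df, title="Random data", minimal=True, lazy=False)
profile.to_file("report.html")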
Example #6
def config():
    return Settings()
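Example #6 looks like a pytest fixture with its decorator stripped. A plausible conftest.py wiring for the config, typeset and summarizer fixtures used by the tests in this listing might look like the sketch below; the fixture bodies (in particular the PandasProfilingSummarizer(typeset) constructor) are assumptions, not verbatim project code.

import pytest

from pandas_profiling.config import Settings
from pandas_profiling.model.summarizer import PandasProfilingSummarizer
from pandas_profiling.model.typeset import ProfilingTypeSet


@pytest.fixture
def config() -> Settings:
    return Settings()


@pytest.fixture
def typeset(config: Settings) -> ProfilingTypeSet:
    return ProfilingTypeSet(config)


@pytest.fixture
def summarizer(typeset: ProfilingTypeSet) -> PandasProfilingSummarizer:
    return PandasProfilingSummarizer(typeset)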
Example #7
def test_describe_list(summarizer, typeset):
    config = Settings()

    with pytest.raises(AttributeError), pytest.warns(UserWarning):
        describe(config, "", [1, 2, 3], summarizer, typeset)
Example #8
            (11, 12),
            (13, 14),
            (15, 16),
            (17, 18),
        ],
        "date_str": ["2018-01-01", "2017-02-01", "2018-04-07"],
        "nullable_int":
        pd.Series([1, None], dtype="Int64"),
    }

    return {key: pd.Series(values, name=key) for key, values in data.items()}


series = get_profiling_series()

config = Settings()
config.vars.num.low_categorical_threshold = 0
my_typeset = ProfilingTypeSet(config)

type_map = {str(k): k for k in my_typeset.types}
Numeric = type_map["Numeric"]
Categorical = type_map["Categorical"]
Boolean = type_map["Boolean"]
DateTime = type_map["DateTime"]
Unsupported = type_map["Unsupported"]

config2 = Settings()
config2.vars.num.low_categorical_threshold = 2
typeset2 = ProfilingTypeSet(config2)
type_map2 = {str(k): k for k in typeset2.types}
Numeric2 = type_map2["Numeric"]
Example #9
from pandas_profiling.config import Settings
from pandas_profiling.model.typeset import ProfilingTypeSet
from tests.unit.test_utils import patch_arg

if int(pd.__version__.split(".")[0]) < 1:
    from visions.dtypes.boolean import BoolDtype  # noqa: F401

    btype = "Bool"
else:
    btype = "boolean"

base_path = os.path.abspath(os.path.dirname(__file__))

series = get_series()

my_config = Settings()
my_config.vars.num.low_categorical_threshold = 0
my_typeset_default = ProfilingTypeSet(my_config)

type_map = {str(k): k for k in my_typeset_default.types}
Numeric = type_map["Numeric"]
Categorical = type_map["Categorical"]
Boolean = type_map["Boolean"]
DateTime = type_map["DateTime"]
Unsupported = type_map["Unsupported"]

contains_map = {
    Numeric: {
        "int_series",
        "Int64_int_series",
        "int_range",
Example #10
def test_describe_list(summarizer, typeset):
    config = Settings()

    with pytest.raises(NotImplementedError):
        describe(config, "", [1, 2, 3], summarizer, typeset)