示例#1
0
def describe(df: pd.DataFrame) -> dict:
    """Calculate the statistics for each series in this DataFrame.

    Args:
        df: DataFrame.

    Returns:
        This function returns a dictionary containing:
            - table: overall statistics.
            - variables: descriptions per series.
            - correlations: correlation matrices.
            - missing: missing value diagrams.
            - messages: direct special attention to these patterns in your data.
            - package: package version and a dump of the configuration.

    Raises:
        TypeError: if df is not a pandas.DataFrame.
        ValueError: if df is empty.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")

    if df.empty:
        raise ValueError("df can not be empty")

    # Describe 1D for each column; a non-positive pool size falls back to the
    # number of CPUs.
    pool_size = config["pool_size"].get(int)
    if pool_size <= 0:
        pool_size = multiprocessing.cpu_count()

    # `DataFrame.items` is used instead of `iteritems`, which was removed in
    # pandas 2.0; both yield (column name, Series) pairs.
    if pool_size == 1:
        series_description = dict(itertools.starmap(multiprocess_1d, df.items()))
    else:
        with multiprocessing.pool.ThreadPool(pool_size) as executor:
            series_description = dict(
                executor.starmap(multiprocess_1d, df.items())
            )

    # Mapping from column name to variable type
    variables = {
        column: description["type"]
        for column, description in series_description.items()
    }

    # Get correlations
    correlations = calculate_correlations(df, variables)

    # Check correlations between numerical variables
    if (config["check_correlation_pearson"].get(bool) is True
            and "pearson" in correlations):
        # Overwrites the description with "CORR" series
        correlation_threshold = config["correlation_threshold_pearson"].get(
            float)
        update(
            series_description,
            perform_check_correlation(
                correlations["pearson"],
                lambda x: x > correlation_threshold,
                Variable.S_TYPE_CORR,
            ),
        )

    # Check correlations between categorical variables
    if (config["check_correlation_cramers"].get(bool) is True
            and "cramers" in correlations):
        # Overwrites the description with "CORR" series
        correlation_threshold = config["correlation_threshold_cramers"].get(
            float)
        update(
            series_description,
            perform_check_correlation(
                correlations["cramers"],
                lambda x: x > correlation_threshold,
                Variable.S_TYPE_CORR,
            ),
        )

    # Check recoded
    if config["check_recoded"].get(bool) is True and "recoded" in correlations:
        # Overwrites the description with "RECODED" series
        update(
            series_description,
            perform_check_correlation(correlations["recoded"],
                                      lambda x: x == 1,
                                      Variable.S_TYPE_RECODED),
        )

    # Transform the series_description in a DataFrame
    variable_stats = pd.DataFrame(series_description)

    # Table statistics
    table_stats = describe_table(df, variable_stats)

    # Missing diagrams
    missing = get_missing_diagrams(df, table_stats)

    # Messages: table-level first, then one batch per variable
    messages = check_table_messages(table_stats)
    for col, description in series_description.items():
        messages += check_variable_messages(col, description)

    package = {
        "pandas_profiling_version": __version__,
        "pandas_profiling_config": config.dump(),
    }

    return {
        # Overall description
        "table": table_stats,
        # Per variable descriptions
        "variables": series_description,
        # Correlation matrices
        "correlations": correlations,
        # Missing values
        "missing": missing,
        # Warnings
        "messages": messages,
        # Package
        "package": package,
    }
def describe(title: str, df: pd.DataFrame, sample: Optional[dict] = None) -> dict:
    """Calculate the statistics for each series in this DataFrame.

    Args:
        title: report title
        df: DataFrame.
        sample: optional, dict with custom sample. Must contain "data"; "name"
            and "caption" are optional and default to None. The dict is only
            read, never modified.

    Returns:
        This function returns a dictionary containing:
            - analysis: title, start/end timestamps and duration of the run.
            - table: overall statistics.
            - variables: descriptions per series.
            - scatter: scatter matrix.
            - correlations: correlation matrices.
            - missing: missing value diagrams.
            - messages: direct special attention to these patterns in your data.
            - package: package details.
            - sample: samples of the data.
            - duplicates: duplicate rows.

    Raises:
        ValueError: if df is None or empty.
    """

    if df is None:
        raise ValueError("Can not describe a `lazy` ProfileReport without a DataFrame.")

    # Non-DataFrame inputs are only warned about, not rejected
    if not isinstance(df, pd.DataFrame):
        warnings.warn("df is not of type pandas.DataFrame")

    if df.empty:
        raise ValueError("df can not be empty")

    disable_progress_bar = not config["progress_bar"].get(bool)

    date_start = datetime.utcnow()

    # Only the correlations enabled in the configuration are calculated
    correlation_names = [
        correlation_name
        for correlation_name in ["pearson", "spearman", "kendall", "phi_k", "cramers"]
        if config["correlations"][correlation_name]["calculate"].get(bool)
    ]

    # 9 fixed steps below, plus one per column and one per correlation
    number_of_tasks = 9 + len(df.columns) + len(correlation_names)

    with tqdm(
        total=number_of_tasks, desc="Summarize dataset", disable=disable_progress_bar
    ) as pbar:
        series_description = get_series_descriptions(df, pbar)

        pbar.set_postfix_str("Get variable types")
        variables = {
            column: description["type"]
            for column, description in series_description.items()
        }
        pbar.update()

        # Transform the series_description in a DataFrame
        pbar.set_postfix_str("Get variable statistics")
        variable_stats = pd.DataFrame(series_description)
        pbar.update()

        # Get correlations, keeping only those that produced a result
        correlations = {}
        for correlation_name in correlation_names:
            pbar.set_postfix_str(f"Calculate {correlation_name} correlation")
            correlation = calculate_correlation(df, variables, correlation_name)
            if correlation is not None:
                correlations[correlation_name] = correlation
            pbar.update()

        # Scatter matrix
        pbar.set_postfix_str("Get scatter matrix")
        scatter_matrix = get_scatter_matrix(df, variables)
        pbar.update()

        # Table statistics
        pbar.set_postfix_str("Get table statistics")
        table_stats = get_table_stats(df, variable_stats)
        pbar.update()

        # Missing diagrams
        pbar.set_postfix_str("Get missing diagrams")
        missing = get_missing_diagrams(df, table_stats)
        pbar.update()

        # Sample
        pbar.set_postfix_str("Take sample")
        if sample is None:
            samples = get_sample(df)
        else:
            # "name" and "caption" are optional: read them with a None default
            # instead of writing the missing keys back into the caller's dict.
            samples = [
                Sample(
                    "custom",
                    sample["data"],
                    sample.get("name"),
                    sample.get("caption"),
                )
            ]
        pbar.update()

        # Duplicates, looked up on the columns with a supported type only
        pbar.set_postfix_str("Locating duplicates")
        supported_columns = [
            key
            for key, value in series_description.items()
            if value["type"] != Variable.S_TYPE_UNSUPPORTED
        ]

        duplicates = get_duplicates(df, supported_columns)
        pbar.update()

        # Messages
        pbar.set_postfix_str("Get messages/warnings")
        messages = get_messages(table_stats, series_description, correlations)
        pbar.update()

        pbar.set_postfix_str("Get reproduction details")
        package = {
            "pandas_profiling_version": __version__,
            "pandas_profiling_config": config.dump(),
        }
        pbar.update()

        pbar.set_postfix_str("Completed")

    date_end = datetime.utcnow()

    analysis = {
        "title": title,
        "date_start": date_start,
        "date_end": date_end,
        "duration": date_end - date_start,
    }

    return {
        # Analysis metadata
        "analysis": analysis,
        # Overall dataset description
        "table": table_stats,
        # Per variable descriptions
        "variables": series_description,
        # Bivariate relations
        "scatter": scatter_matrix,
        # Correlation matrices
        "correlations": correlations,
        # Missing values
        "missing": missing,
        # Warnings
        "messages": messages,
        # Package
        "package": package,
        # Sample
        "sample": samples,
        # Duplicates
        "duplicates": duplicates,
    }
示例#3
0
def describe(df: pd.DataFrame) -> dict:
    """Calculate the statistics for each series in this DataFrame.

    Args:
        df: DataFrame.

    Returns:
        This function returns a dictionary containing:
            - table: overall statistics.
            - variables: descriptions per series.
            - scatter: scatter matrix.
            - correlations: correlation matrices.
            - missing: missing value diagrams.
            - messages: direct special attention to these patterns in your data.
            - package: package version and a dump of the configuration.

    Raises:
        ValueError: if df is empty.
    """
    # Non-DataFrame inputs are only warned about, not rejected
    if not isinstance(df, pd.DataFrame):
        warnings.warn("df is not of type pandas.DataFrame")

    if df.empty:
        raise ValueError("df can not be empty")

    disable_progress_bar = not config["progress_bar"].get(bool)

    # Describe 1D for each column; a non-positive pool size falls back to the
    # number of CPUs.
    pool_size = config["pool_size"].get(int)
    if pool_size <= 0:
        pool_size = multiprocessing.cpu_count()

    # `DataFrame.items` is used instead of `iteritems`, which was removed in
    # pandas 2.0; both yield (column name, Series) pairs.
    args = list(df.items())
    series_description = {}
    with tqdm(total=len(args), desc="variables", disable=disable_progress_bar) as pbar:
        if pool_size == 1:
            for arg in args:
                column, description = multiprocess_1d(arg)
                series_description[column] = description
                pbar.update()
        else:
            # Store the original order: results arrive in completion order
            original_order = {
                column: position for position, (column, _) in enumerate(args)
            }

            # TODO: use `Pool` for Linux-based systems
            with multiprocessing.pool.ThreadPool(pool_size) as executor:
                for column, description in executor.imap_unordered(
                    multiprocess_1d, args
                ):
                    series_description[column] = description
                    pbar.set_postfix({"feature_name": column})
                    pbar.update()

            # Restore the original order
            series_description = dict(
                sorted(
                    series_description.items(),
                    key=lambda item: original_order.get(item[0]),
                )
            )

    # Apply the configured column ordering
    sort = config["sort"].get(str)
    series_description = sort_column_names(series_description, sort)

    # Mapping from column name to variable type
    variables = {
        column: description["type"]
        for column, description in series_description.items()
    }

    # Transform the series_description in a DataFrame
    variable_stats = pd.DataFrame(series_description)

    # Get correlations
    correlations = calculate_correlations(df, variables)

    # Scatter matrix
    scatter_matrix = get_scatter_matrix(df, variables)

    # Table statistics
    with tqdm(total=1, desc="table", disable=disable_progress_bar) as pbar:
        table_stats = describe_table(df, variable_stats)
        pbar.update(1)

    # Missing diagrams
    missing = get_missing_diagrams(df, table_stats)

    # Messages: table, variables, then correlations; sorted by message type
    with tqdm(total=3, desc="warnings", disable=disable_progress_bar) as pbar:
        pbar.set_description_str("warnings [table]")
        messages = check_table_messages(table_stats)
        pbar.update()
        pbar.set_description_str("warnings [variables]")
        for col, description in series_description.items():
            messages += check_variable_messages(col, description)
        pbar.update()
        pbar.set_description_str("warnings [correlations]")
        messages += check_correlation_messages(correlations)
        messages.sort(key=lambda message: str(message.message_type))
        pbar.update()

    with tqdm(total=1, desc="package", disable=disable_progress_bar) as pbar:
        package = {
            "pandas_profiling_version": __version__,
            "pandas_profiling_config": config.dump(),
        }
        pbar.update()

    return {
        # Overall description
        "table": table_stats,
        # Per variable descriptions
        "variables": series_description,
        # Bivariate relations
        "scatter": scatter_matrix,
        # Correlation matrices
        "correlations": correlations,
        # Missing values
        "missing": missing,
        # Warnings
        "messages": messages,
        # Package
        "package": package,
    }
示例#4
0
def describe(title, df: pd.DataFrame) -> dict:
    """Calculate the statistics for each series in this DataFrame.

    Args:
        title: report title, stored in the analysis metadata.
        df: DataFrame.

    Returns:
        This function returns a dictionary containing:
            - analysis: title, start/end timestamps and duration of the run.
            - table: overall statistics.
            - variables: descriptions per series.
            - scatter: scatter matrix.
            - correlations: correlation matrices.
            - missing: missing value diagrams.
            - messages: direct special attention to these patterns in your data.
            - package: package details.
            - sample: sample of the data.
            - duplicates: duplicate rows.
            - clusters: per configured clustering, the encoded data plus a
              "Cluster" column.
            - outliers: per configured detector, the encoded data plus an
              "Outlier" column.

    Raises:
        ValueError: if df is None or empty.
    """

    if df is None:
        raise ValueError("Can not describe a `lazy` ProfileReport without a DataFrame.")

    # Non-DataFrame inputs are only warned about, not rejected
    if not isinstance(df, pd.DataFrame):
        warnings.warn("df is not of type pandas.DataFrame")

    if df.empty:
        raise ValueError("df can not be empty")

    disable_progress_bar = not config["progress_bar"].get(bool)

    date_start = datetime.utcnow()

    # Only the correlations enabled in the configuration are calculated
    correlation_names = [
        correlation_name
        for correlation_name in ["pearson", "spearman", "kendall", "phi_k", "cramers"]
        if config["correlations"][correlation_name]["calculate"].get(bool)
    ]

    # 11 fixed steps below (clusters and outliers included), plus one per
    # column and one per correlation
    number_of_tasks = 11 + len(df.columns) + len(correlation_names)

    with tqdm(
        total=number_of_tasks, desc="Summarize dataset", disable=disable_progress_bar
    ) as pbar:
        series_description = get_series_descriptions(df, pbar)

        pbar.set_postfix_str("Get variable types")
        variables = {
            column: description["type"]
            for column, description in series_description.items()
        }
        pbar.update()

        # Transform the series_description in a DataFrame
        pbar.set_postfix_str("Get variable statistics")
        variable_stats = pd.DataFrame(series_description)
        pbar.update()

        # Get correlations
        correlations = {}
        for correlation_name in correlation_names:
            pbar.set_postfix_str(f"Calculate {correlation_name} correlation")
            correlations[correlation_name] = calculate_correlation(
                df, variables, correlation_name
            )
            pbar.update()

        # Make sure correlations is not None
        correlations = {
            key: value for key, value in correlations.items() if value is not None
        }

        # Scatter matrix
        pbar.set_postfix_str("Get scatter matrix")
        scatter_matrix = get_scatter_matrix(df, variables)
        pbar.update()

        # Table statistics
        pbar.set_postfix_str("Get table statistics")
        table_stats = get_table_stats(df, variable_stats)
        pbar.update()

        # Missing diagrams
        pbar.set_postfix_str("Get missing diagrams")
        missing = get_missing_diagrams(df, table_stats)
        pbar.update()

        # Sample
        pbar.set_postfix_str("Take sample")
        sample = get_sample(df)
        pbar.update()

        # Duplicates, looked up on the columns with a supported type only
        pbar.set_postfix_str("Locating duplicates")
        supported_columns = [
            key
            for key, value in series_description.items()
            if value["type"] != Variable.S_TYPE_UNSUPPORTED
        ]
        duplicates = get_duplicates(df, supported_columns)
        pbar.update()

        # Clusters
        pbar.set_postfix_str("Searching for clusters")
        categoricals = [
            column_name
            for column_name, variable_type in variables.items()
            if variable_type == Variable.TYPE_CAT
        ]
        # Rows with any missing value are dropped and categorical columns are
        # replaced by their dummy (one-hot) encoding before fitting.
        df_without_missing = df.dropna()
        # NOTE(review): reset_index() without drop=True keeps the previous
        # index as a column, so it is part of the data passed to the
        # estimators below -- confirm this is intended.
        df_ohe = pd.concat(
            [
                df_without_missing.drop(categoricals, axis=1),
                pd.get_dummies(df_without_missing[categoricals]),
            ],
            axis=1,
        ).reset_index()
        # SECURITY: each configured value is passed to eval(); this executes
        # arbitrary code, so the configuration must come from a trusted source.
        # NOTE(review): assumes the config value iterates as (name, expression)
        # pairs -- confirm against the configuration schema.
        clusters = {
            name: pd.concat(
                [
                    df_ohe,
                    pd.DataFrame({"Cluster": eval(clustering).fit(df_ohe).labels_}),
                ],
                axis=1,
            )
            for name, clustering in config["clusters"]["clusterings"].get()
        }
        pbar.update()

        # Outliers
        pbar.set_postfix_str("Detecting outliers")
        # SECURITY: same eval() caveat as for the clusterings above.
        outliers = {
            name: pd.concat(
                [
                    df_ohe,
                    pd.DataFrame({"Outlier": eval(detector).fit_predict(df_ohe)}),
                ],
                axis=1,
            )
            for name, detector in config["outliers"]["detectors"].get()
        }
        pbar.update()

        # Messages
        pbar.set_postfix_str("Get messages/warnings")
        messages = get_messages(table_stats, series_description, correlations)
        pbar.update()

        pbar.set_postfix_str("Get reproduction details")
        package = {
            "pandas_profiling_version": VERSION,
            "pandas_profiling_config": config.dump(),
        }
        pbar.update()

        pbar.set_postfix_str("Completed")

    date_end = datetime.utcnow()

    analysis = {
        "title": title,
        "date_start": date_start,
        "date_end": date_end,
        "duration": date_end - date_start,
    }

    return {
        # Analysis metadata
        "analysis": analysis,
        # Overall dataset description
        "table": table_stats,
        # Per variable descriptions
        "variables": series_description,
        # Bivariate relations
        "scatter": scatter_matrix,
        # Correlation matrices
        "correlations": correlations,
        # Missing values
        "missing": missing,
        # Warnings
        "messages": messages,
        # Package
        "package": package,
        # Sample
        "sample": sample,
        # Duplicates
        "duplicates": duplicates,
        # Clusters
        "clusters": clusters,
        # Outliers
        "outliers": outliers,
    }
示例#5
0
def describe(df: pd.DataFrame) -> dict:
    """Calculate the statistics for each series in this DataFrame.

    Args:
        df: DataFrame.

    Returns:
        This function returns a dictionary containing:
            - table: overall statistics.
            - variables: descriptions per series.
            - scatter: scatter matrix.
            - correlations: correlation matrices.
            - missing: missing value diagrams.
            - messages: direct special attention to these patterns in your data.
            - package: package version and a dump of the configuration.

    Raises:
        TypeError: if df is not a pandas.DataFrame.
        ValueError: if df is empty.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")

    if df.empty:
        raise ValueError("df can not be empty")

    # Describe 1D for each column; a non-positive pool size falls back to the
    # number of CPUs.
    pool_size = config["pool_size"].get(int)
    if pool_size <= 0:
        pool_size = multiprocessing.cpu_count()

    # `DataFrame.items` is used instead of `iteritems`, which was removed in
    # pandas 2.0; both yield (column name, Series) pairs.
    if pool_size == 1:
        series_description = dict(itertools.starmap(multiprocess_1d, df.items()))
    else:
        with multiprocessing.pool.ThreadPool(pool_size) as executor:
            series_description = dict(
                executor.starmap(multiprocess_1d, df.items())
            )

    # Mapping from column name to variable type
    variables = {
        column: description["type"]
        for column, description in series_description.items()
    }

    # Get correlations
    correlations = calculate_correlations(df, variables)

    # Scatter matrix
    scatter_matrix = get_scatter_matrix(df, variables)

    # Transform the series_description in a DataFrame
    variable_stats = pd.DataFrame(series_description)

    # Table statistics
    table_stats = describe_table(df, variable_stats)

    # Missing diagrams
    missing = get_missing_diagrams(df, table_stats)

    # Messages: table-level first, then per variable, then correlations
    messages = check_table_messages(table_stats)
    for col, description in series_description.items():
        messages += check_variable_messages(col, description)

    messages += check_correlation_messages(correlations)

    package = {
        "pandas_profiling_version": __version__,
        "pandas_profiling_config": config.dump(),
    }

    return {
        # Overall description
        "table": table_stats,
        # Per variable descriptions
        "variables": series_description,
        # Bivariate relations
        "scatter": scatter_matrix,
        # Correlation matrices
        "correlations": correlations,
        # Missing values
        "missing": missing,
        # Warnings
        "messages": messages,
        # Package
        "package": package,
    }