def check_correlation_messages(config: Settings, correlations: dict) -> List[Message]:
    """Collect HIGH_CORRELATION messages for every enabled correlation type.

    Args:
        config: settings object; ``config.correlations[name]`` must expose
            ``warn_high_correlations`` and ``threshold``.
        correlations: mapping from correlation name to its matrix.

    Returns:
        A list with one ``Message`` per entry of the mapping returned by
        ``perform_check_correlation`` for each correlation whose warning
        is switched on.
    """
    found = []
    for name, matrix in correlations.items():
        settings = config.correlations[name]
        if not settings.warn_high_correlations:
            continue

        flagged = perform_check_correlation(matrix, settings.threshold)
        # Explicit len() check: the concrete type of the returned mapping is
        # decided by perform_check_correlation, so truthiness is not assumed.
        if len(flagged) == 0:
            continue

        found.extend(
            Message(
                column_name=column,
                message_type=MessageType.HIGH_CORRELATION,
                values={"corr": name, "fields": fields},
            )
            for column, fields in flagged.items()
        )
    return found
def check_correlation_messages(correlations):
    """Collect HIGH_CORRELATION messages using the module-level ``config``.

    Args:
        correlations: mapping from correlation name to its matrix.

    Returns:
        A list with one ``Message`` per entry of the mapping returned by
        ``perform_check_correlation`` for each correlation whose
        ``warn_high_correlations`` option is enabled in ``config``.
    """
    collected = []
    for name, matrix in correlations.items():
        options = config["correlations"][name]
        if not options["warn_high_correlations"].get(bool):
            continue

        limit = options["threshold"].get(float)
        flagged = perform_check_correlation(matrix, limit)
        # Explicit len() check: the concrete type of the returned mapping is
        # decided by perform_check_correlation, so truthiness is not assumed.
        if len(flagged) == 0:
            continue

        collected.extend(
            Message(
                column_name=column,
                message_type=MessageType.HIGH_CORRELATION,
                values={"corr": name, "fields": fields},
            )
            for column, fields in flagged.items()
        )
    return collected
def check_correlation_alerts(config: Settings, correlations: dict) -> List[Alert]:
    """Collect HIGH_CORRELATION alerts for every enabled correlation type.

    Args:
        config: settings object; ``config.correlations[name]`` must expose
            ``warn_high_correlations`` and ``threshold``.
        correlations: mapping from correlation name to its matrix.

    Returns:
        A list with one ``Alert`` per entry of the mapping returned by
        ``perform_check_correlation`` for each correlation whose warning
        is switched on.
    """
    raised = []
    for name, matrix in correlations.items():
        settings = config.correlations[name]
        if not settings.warn_high_correlations:
            continue

        flagged = perform_check_correlation(matrix, settings.threshold)
        # Explicit len() check: the concrete type of the returned mapping is
        # decided by perform_check_correlation, so truthiness is not assumed.
        if len(flagged) == 0:
            continue

        raised.extend(
            Alert(
                column_name=column,
                alert_type=AlertType.HIGH_CORRELATION,
                values={"corr": name, "fields": fields},
            )
            for column, fields in flagged.items()
        )
    return raised
def describe(df: pd.DataFrame) -> dict:
    """Calculate the statistics for each series in this DataFrame.

    Args:
        df: DataFrame.

    Returns:
        This function returns a dictionary containing:
            - table: overall statistics.
            - variables: descriptions per series.
            - correlations: correlation matrices.
            - missing: missing value diagrams.
            - messages: direct special attention to these patterns in your data.
            - package: the package version and a dump of the active configuration.

    Raises:
        TypeError: if ``df`` is not a ``pandas.DataFrame``.
        ValueError: if ``df`` is empty.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")

    if df.empty:
        raise ValueError("df can not be empty")

    # Describe 1D for each column; a non-positive pool size means
    # "use as many workers as there are CPUs".
    pool_size = config["pool_size"].get(int)
    if pool_size <= 0:
        pool_size = multiprocessing.cpu_count()

    # DataFrame.items() is used instead of iteritems(): the latter was
    # deprecated in pandas 1.5 and removed in pandas 2.0.
    # multiprocess_1d yields (column, description) pairs, collected by name.
    if pool_size == 1:
        series_description = dict(itertools.starmap(multiprocess_1d, df.items()))
    else:
        with multiprocessing.pool.ThreadPool(pool_size) as executor:
            series_description = dict(
                executor.starmap(multiprocess_1d, df.items())
            )

    # Mapping from column name to variable type
    variables = {
        column: description["type"]
        for column, description in series_description.items()
    }

    # Get correlations
    correlations = calculate_correlations(df, variables)

    # Check correlations between numerical variables
    if (
        config["check_correlation_pearson"].get(bool) is True
        and "pearson" in correlations
    ):
        # Overwrites the description with "CORR" series.
        # Each threshold gets its own name so the lambda never closes over a
        # variable that is rebound further down.
        pearson_threshold = config["correlation_threshold_pearson"].get(float)
        update(
            series_description,
            perform_check_correlation(
                correlations["pearson"],
                lambda x: x > pearson_threshold,
                Variable.S_TYPE_CORR,
            ),
        )

    # Check correlations between categorical variables
    if (
        config["check_correlation_cramers"].get(bool) is True
        and "cramers" in correlations
    ):
        # Overwrites the description with "CORR" series
        cramers_threshold = config["correlation_threshold_cramers"].get(float)
        update(
            series_description,
            perform_check_correlation(
                correlations["cramers"],
                lambda x: x > cramers_threshold,
                Variable.S_TYPE_CORR,
            ),
        )

    # Check recoded
    if config["check_recoded"].get(bool) is True and "recoded" in correlations:
        # Overwrites the description with "RECODED" series
        update(
            series_description,
            perform_check_correlation(
                correlations["recoded"],
                lambda x: x == 1,
                Variable.S_TYPE_RECODED,
            ),
        )

    # Transform the series_description in a DataFrame
    variable_stats = pd.DataFrame(series_description)

    # Table statistics
    table_stats = describe_table(df, variable_stats)

    # Missing diagrams
    missing = get_missing_diagrams(df, table_stats)

    # Messages: table-level first, then one batch per variable
    messages = check_table_messages(table_stats)
    for col, description in series_description.items():
        messages += check_variable_messages(col, description)

    package = {
        "pandas_profiling_version": __version__,
        "pandas_profiling_config": config.dump(),
    }

    return {
        # Overall description
        "table": table_stats,
        # Per variable descriptions
        "variables": series_description,
        # Correlation matrices
        "correlations": correlations,
        # Missing values
        "missing": missing,
        # Warnings
        "messages": messages,
        # Package
        "package": package,
    }