def describe(df: pd.DataFrame) -> dict:
    """Calculate the statistics for each series in this DataFrame.

    Args:
        df: DataFrame.

    Returns:
        This function returns a dictionary containing:
            - table: overall statistics.
            - variables: descriptions per series.
            - correlations: correlation matrices.
            - missing: missing value diagrams.
            - messages: direct special attention to these patterns in your data.

    Raises:
        TypeError: if ``df`` is not a pandas DataFrame.
        ValueError: if ``df`` is empty.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")
    if df.empty:
        raise ValueError("df can not be empty")

    # Multiprocessing of Describe 1D for each column.
    pool_size = config["pool_size"].get(int)
    if pool_size <= 0:
        pool_size = multiprocessing.cpu_count()

    # DataFrame.items() replaces the deprecated iteritems() (removed in
    # pandas 2.0); both yield (column_name, Series) pairs.
    if pool_size == 1:
        args = [(column, series) for column, series in df.items()]
        series_description = {
            column: description
            for column, description in itertools.starmap(multiprocess_1d, args)
        }
    else:
        # ThreadPool (not Pool): workers share the DataFrame in-process,
        # avoiding the cost of pickling each Series to a subprocess.
        with multiprocessing.pool.ThreadPool(pool_size) as executor:
            series_description = {}
            results = executor.starmap(multiprocess_1d, df.items())
            for col, description in results:
                series_description[col] = description

    # Mapping from column name to variable type.
    variables = {
        column: description["type"]
        for column, description in series_description.items()
    }

    # Get correlations.
    correlations = calculate_correlations(df, variables)

    # Check correlations between numerical variables.
    if config["check_correlation_pearson"].get(bool) and "pearson" in correlations:
        # Overwrites the description with "CORR" series.
        correlation_threshold = config["correlation_threshold_pearson"].get(float)
        update(
            series_description,
            perform_check_correlation(
                correlations["pearson"],
                lambda x: x > correlation_threshold,
                Variable.S_TYPE_CORR,
            ),
        )

    # Check correlations between categorical variables.
    if config["check_correlation_cramers"].get(bool) and "cramers" in correlations:
        # Overwrites the description with "CORR" series.
        correlation_threshold = config["correlation_threshold_cramers"].get(float)
        update(
            series_description,
            perform_check_correlation(
                correlations["cramers"],
                lambda x: x > correlation_threshold,
                Variable.S_TYPE_CORR,
            ),
        )

    # Check recoded.
    if config["check_recoded"].get(bool) and "recoded" in correlations:
        # Overwrites the description with "RECORDED" series.
        update(
            series_description,
            perform_check_correlation(
                correlations["recoded"], lambda x: x == 1, Variable.S_TYPE_RECODED
            ),
        )

    # Transform the series_description into a DataFrame.
    variable_stats = pd.DataFrame(series_description)

    # Table statistics.
    table_stats = describe_table(df, variable_stats)

    # Missing diagrams.
    missing = get_missing_diagrams(df, table_stats)

    # Messages.
    messages = check_table_messages(table_stats)
    for col, description in series_description.items():
        messages += check_variable_messages(col, description)

    package = {
        "pandas_profiling_version": __version__,
        "pandas_profiling_config": config.dump(),
    }

    return {
        # Overall description
        "table": table_stats,
        # Per variable descriptions
        "variables": series_description,
        # Correlation matrices
        "correlations": correlations,
        # Missing values
        "missing": missing,
        # Warnings
        "messages": messages,
        # Package
        "package": package,
    }
def describe(title: str, df: pd.DataFrame, sample: Optional[dict] = None) -> dict:
    """Calculate the statistics for each series in this DataFrame.

    Args:
        title: report title.
        df: DataFrame.
        sample: optional, dict with custom sample; must contain a "data" key
            and may contain "name" and "caption" keys.

    Returns:
        This function returns a dictionary containing:
            - table: overall statistics.
            - variables: descriptions per series.
            - correlations: correlation matrices.
            - missing: missing value diagrams.
            - messages: direct special attention to these patterns in your data.
            - package: package details.

    Raises:
        ValueError: if ``df`` is None, empty, or a custom ``sample`` dict
            lacks the required "data" key.
    """
    if df is None:
        raise ValueError("Can not describe a `lazy` ProfileReport without a DataFrame.")
    if not isinstance(df, pd.DataFrame):
        warnings.warn("df is not of type pandas.DataFrame")
    if df.empty:
        raise ValueError("df can not be empty")

    disable_progress_bar = not config["progress_bar"].get(bool)

    date_start = datetime.utcnow()

    # Only the correlation measures enabled in the config are computed.
    correlation_names = [
        correlation_name
        for correlation_name in ["pearson", "spearman", "kendall", "phi_k", "cramers"]
        if config["correlations"][correlation_name]["calculate"].get(bool)
    ]

    # 9 fixed pipeline stages + one task per column + one per correlation.
    number_of_tasks = 9 + len(df.columns) + len(correlation_names)

    with tqdm(
        total=number_of_tasks, desc="Summarize dataset", disable=disable_progress_bar
    ) as pbar:
        series_description = get_series_descriptions(df, pbar)

        pbar.set_postfix_str("Get variable types")
        variables = {
            column: description["type"]
            for column, description in series_description.items()
        }
        pbar.update()

        # Transform the series_description in a DataFrame
        pbar.set_postfix_str("Get variable statistics")
        variable_stats = pd.DataFrame(series_description)
        pbar.update()

        # Get correlations
        correlations = {}
        for correlation_name in correlation_names:
            pbar.set_postfix_str(f"Calculate {correlation_name} correlation")
            correlations[correlation_name] = calculate_correlation(
                df, variables, correlation_name
            )
            pbar.update()

        # make sure correlations is not None
        correlations = {
            key: value for key, value in correlations.items() if value is not None
        }

        # Scatter matrix
        pbar.set_postfix_str("Get scatter matrix")
        scatter_matrix = get_scatter_matrix(df, variables)
        pbar.update()

        # Table statistics
        pbar.set_postfix_str("Get table statistics")
        table_stats = get_table_stats(df, variable_stats)
        pbar.update()

        # missing diagrams
        pbar.set_postfix_str("Get missing diagrams")
        missing = get_missing_diagrams(df, table_stats)
        pbar.update()

        # Sample
        pbar.set_postfix_str("Take sample")
        if sample is None:
            samples = get_sample(df)
        else:
            # Fail fast with a clear message instead of an opaque KeyError
            # when the caller supplies an incomplete custom sample.
            if "data" not in sample:
                raise ValueError("A custom sample dict must contain a 'data' key.")
            if "name" not in sample:
                sample["name"] = None
            if "caption" not in sample:
                sample["caption"] = None
            samples = [
                Sample("custom", sample["data"], sample["name"], sample["caption"])
            ]
        pbar.update()

        # Duplicates
        pbar.set_postfix_str("Locating duplicates")
        supported_columns = [
            key
            for key, value in series_description.items()
            if value["type"] != Variable.S_TYPE_UNSUPPORTED
        ]
        duplicates = get_duplicates(df, supported_columns)
        pbar.update()

        # Messages
        pbar.set_postfix_str("Get messages/warnings")
        messages = get_messages(table_stats, series_description, correlations)
        pbar.update()

        pbar.set_postfix_str("Get reproduction details")
        package = {
            "pandas_profiling_version": __version__,
            "pandas_profiling_config": config.dump(),
        }
        pbar.update()

        pbar.set_postfix_str("Completed")

    date_end = datetime.utcnow()
    analysis = {
        "title": title,
        "date_start": date_start,
        "date_end": date_end,
        "duration": date_end - date_start,
    }

    return {
        # Analysis metadata
        "analysis": analysis,
        # Overall dataset description
        "table": table_stats,
        # Per variable descriptions
        "variables": series_description,
        # Bivariate relations
        "scatter": scatter_matrix,
        # Correlation matrices
        "correlations": correlations,
        # Missing values
        "missing": missing,
        # Warnings
        "messages": messages,
        # Package
        "package": package,
        # Sample
        "sample": samples,
        # Duplicates
        "duplicates": duplicates,
    }
def describe(df: pd.DataFrame) -> dict:
    """Calculate the statistics for each series in this DataFrame.

    Args:
        df: DataFrame.

    Returns:
        This function returns a dictionary containing:
            - table: overall statistics.
            - variables: descriptions per series.
            - correlations: correlation matrices.
            - missing: missing value diagrams.
            - messages: direct special attention to these patterns in your data.

    Raises:
        ValueError: if ``df`` is empty.
    """
    if not isinstance(df, pd.DataFrame):
        warnings.warn("df is not of type pandas.DataFrame")
    if df.empty:
        raise ValueError("df can not be empty")

    disable_progress_bar = not config["progress_bar"].get(bool)

    # Multiprocessing of Describe 1D for each column.
    pool_size = config["pool_size"].get(int)
    if pool_size <= 0:
        pool_size = multiprocessing.cpu_count()

    # DataFrame.items() replaces the deprecated iteritems()
    # (removed in pandas 2.0); both yield (column_name, Series) pairs.
    args = [(column, series) for column, series in df.items()]
    series_description = {}
    with tqdm(total=len(args), desc="variables", disable=disable_progress_bar) as pbar:
        if pool_size == 1:
            for arg in args:
                column, description = multiprocess_1d(arg)
                series_description[column] = description
                pbar.update()
        else:
            # Store the original order: imap_unordered yields results as they
            # finish, so column order must be restored afterwards.
            original_order = {
                k: v for v, k in enumerate([column for column, _ in args])
            }

            # TODO: use `Pool` for Linux-based systems
            with multiprocessing.pool.ThreadPool(pool_size) as executor:
                for column, description in executor.imap_unordered(
                    multiprocess_1d, args
                ):
                    series_description[column] = description
                    pbar.set_postfix({"feature_name": column})
                    pbar.update()

            # Restore the original order.
            series_description = dict(
                sorted(
                    series_description.items(),
                    key=lambda index: original_order.get(index[0]),
                )
            )

    # Mapping from column name to variable type.
    sort = config["sort"].get(str)
    series_description = sort_column_names(series_description, sort)
    variables = {
        column: description["type"]
        for column, description in series_description.items()
    }

    # Transform the series_description in a DataFrame.
    variable_stats = pd.DataFrame(series_description)

    # Get correlations.
    correlations = calculate_correlations(df, variables)

    # Scatter matrix.
    scatter_matrix = get_scatter_matrix(df, variables)

    # Table statistics.
    with tqdm(total=1, desc="table", disable=disable_progress_bar) as pbar:
        table_stats = describe_table(df, variable_stats)
        pbar.update(1)

    # Missing diagrams.
    missing = get_missing_diagrams(df, table_stats)

    # Messages.
    with tqdm(total=3, desc="warnings", disable=disable_progress_bar) as pbar:
        pbar.set_description_str("warnings [table]")
        messages = check_table_messages(table_stats)
        pbar.update()

        pbar.set_description_str("warnings [variables]")
        for col, description in series_description.items():
            messages += check_variable_messages(col, description)
        pbar.update()

        pbar.set_description_str("warnings [correlations]")
        messages += check_correlation_messages(correlations)
        messages.sort(key=lambda message: str(message.message_type))
        pbar.update()

    with tqdm(total=1, desc="package", disable=disable_progress_bar) as pbar:
        package = {
            "pandas_profiling_version": __version__,
            "pandas_profiling_config": config.dump(),
        }
        pbar.update()

    return {
        # Overall description
        "table": table_stats,
        # Per variable descriptions
        "variables": series_description,
        # Bivariate relations
        "scatter": scatter_matrix,
        # Correlation matrices
        "correlations": correlations,
        # Missing values
        "missing": missing,
        # Warnings
        "messages": messages,
        # Package
        "package": package,
    }
def describe(title, df: pd.DataFrame) -> dict:
    """Calculate the statistics for each series in this DataFrame.

    Args:
        title: report title.
        df: DataFrame.

    Returns:
        This function returns a dictionary containing:
            - table: overall statistics.
            - variables: descriptions per series.
            - correlations: correlation matrices.
            - missing: missing value diagrams.
            - messages: direct special attention to these patterns in your data.
            - package: package details.
            - clusters / outliers: per-configuration model results.

    Raises:
        ValueError: if ``df`` is None or empty.
    """
    if df is None:
        raise ValueError("Can not describe a `lazy` ProfileReport without a DataFrame.")
    if not isinstance(df, pd.DataFrame):
        warnings.warn("df is not of type pandas.DataFrame")
    if df.empty:
        raise ValueError("df can not be empty")

    disable_progress_bar = not config["progress_bar"].get(bool)

    date_start = datetime.utcnow()

    # Only the correlation measures enabled in the config are computed.
    correlation_names = [
        correlation_name
        for correlation_name in ["pearson", "spearman", "kendall", "phi_k", "cramers"]
        if config["correlations"][correlation_name]["calculate"].get(bool)
    ]

    # 9 fixed pipeline stages + one task per column + one per correlation.
    number_of_tasks = 9 + len(df.columns) + len(correlation_names)

    with tqdm(
        total=number_of_tasks, desc="Summarize dataset", disable=disable_progress_bar
    ) as pbar:
        series_description = get_series_descriptions(df, pbar)

        pbar.set_postfix_str("Get variable types")
        variables = {
            column: description["type"]
            for column, description in series_description.items()
        }
        pbar.update()

        # Transform the series_description in a DataFrame
        pbar.set_postfix_str("Get variable statistics")
        variable_stats = pd.DataFrame(series_description)
        pbar.update()

        # Get correlations
        correlations = {}
        for correlation_name in correlation_names:
            pbar.set_postfix_str(f"Calculate {correlation_name} correlation")
            correlations[correlation_name] = calculate_correlation(
                df, variables, correlation_name
            )
            pbar.update()

        # Make sure correlations is not None
        correlations = {
            key: value for key, value in correlations.items() if value is not None
        }

        # Scatter matrix
        pbar.set_postfix_str("Get scatter matrix")
        scatter_matrix = get_scatter_matrix(df, variables)
        pbar.update()

        # Table statistics
        pbar.set_postfix_str("Get table statistics")
        table_stats = get_table_stats(df, variable_stats)
        pbar.update()

        # Missing diagrams
        pbar.set_postfix_str("Get missing diagrams")
        missing = get_missing_diagrams(df, table_stats)
        pbar.update()

        # Sample
        pbar.set_postfix_str("Take sample")
        sample = get_sample(df)
        pbar.update()

        # Duplicates
        pbar.set_postfix_str("Locating duplicates")
        supported_columns = [
            key
            for key, value in series_description.items()
            if value["type"] != Variable.S_TYPE_UNSUPPORTED
        ]
        duplicates = get_duplicates(df, supported_columns)
        pbar.update()

        # Clusters
        pbar.set_postfix_str("Searching for clusters")
        categoricals = [
            column_name
            for column_name, variable_type in variables.items()
            if variable_type == Variable.TYPE_CAT
        ]
        # Drop rows with missing values and one-hot encode categoricals so
        # distance-based estimators can consume the frame.
        df_without_missing = df.dropna()
        df_ohe = pd.concat(
            [
                df_without_missing.drop(categoricals, axis=1),
                pd.get_dummies(df_without_missing[categoricals]),
            ],
            axis=1,
        ).reset_index()
        # SECURITY: `eval` executes arbitrary code taken from the
        # configuration. Only use with fully trusted config files; consider a
        # registry of allowed estimator names instead of eval.
        # NOTE(review): iterating config["clusters"]["clusterings"].get() as
        # (name, clustering) pairs assumes it yields 2-tuples — confirm the
        # config schema.
        clusters = {
            name: pd.concat(
                [
                    df_ohe,
                    pd.DataFrame({"Cluster": eval(clustering).fit(df_ohe).labels_}),
                ],
                axis=1,
            )
            for name, clustering in config["clusters"]["clusterings"].get()
        }

        # Outliers
        pbar.set_postfix_str("Detecting outliers")
        # SECURITY: same `eval` caveat as the clusters block above.
        outliers = {
            name: pd.concat(
                [
                    df_ohe,
                    pd.DataFrame({"Outlier": eval(detector).fit_predict(df_ohe)}),
                ],
                axis=1,
            )
            for name, detector in config["outliers"]["detectors"].get()
        }

        # Messages
        pbar.set_postfix_str("Get messages/warnings")
        messages = get_messages(table_stats, series_description, correlations)
        pbar.update()

        pbar.set_postfix_str("Get reproduction details")
        package = {
            "pandas_profiling_version": VERSION,
            "pandas_profiling_config": config.dump(),
        }
        pbar.update()

        pbar.set_postfix_str("Completed")

    date_end = datetime.utcnow()
    analysis = {
        "title": title,
        "date_start": date_start,
        "date_end": date_end,
        "duration": date_end - date_start,
    }

    return {
        # Analysis metadata
        "analysis": analysis,
        # Overall dataset description
        "table": table_stats,
        # Per variable descriptions
        "variables": series_description,
        # Bivariate relations
        "scatter": scatter_matrix,
        # Correlation matrices
        "correlations": correlations,
        # Missing values
        "missing": missing,
        # Warnings
        "messages": messages,
        # Package
        "package": package,
        # Sample
        "sample": sample,
        # Duplicates
        "duplicates": duplicates,
        # Clusters
        "clusters": clusters,
        # Outliers
        "outliers": outliers,
    }
def describe(df: pd.DataFrame) -> dict:
    """Calculate the statistics for each series in this DataFrame.

    Args:
        df: DataFrame.

    Returns:
        This function returns a dictionary containing:
            - table: overall statistics.
            - variables: descriptions per series.
            - correlations: correlation matrices.
            - missing: missing value diagrams.
            - messages: direct special attention to these patterns in your data.

    Raises:
        TypeError: if ``df`` is not a pandas DataFrame.
        ValueError: if ``df`` is empty.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")
    if df.empty:
        raise ValueError("df can not be empty")

    # Multiprocessing of Describe 1D for each column.
    pool_size = config["pool_size"].get(int)
    if pool_size <= 0:
        pool_size = multiprocessing.cpu_count()

    # DataFrame.items() replaces the deprecated iteritems() (removed in
    # pandas 2.0); both yield (column_name, Series) pairs.
    if pool_size == 1:
        args = [(column, series) for column, series in df.items()]
        series_description = {
            column: description
            for column, description in itertools.starmap(multiprocess_1d, args)
        }
    else:
        # ThreadPool (not Pool): workers share the DataFrame in-process,
        # avoiding the cost of pickling each Series to a subprocess.
        with multiprocessing.pool.ThreadPool(pool_size) as executor:
            series_description = {}
            results = executor.starmap(multiprocess_1d, df.items())
            for col, description in results:
                series_description[col] = description

    # Mapping from column name to variable type.
    variables = {
        column: description["type"]
        for column, description in series_description.items()
    }

    # Get correlations.
    correlations = calculate_correlations(df, variables)

    # Scatter matrix.
    scatter_matrix = get_scatter_matrix(df, variables)

    # Transform the series_description into a DataFrame.
    variable_stats = pd.DataFrame(series_description)

    # Table statistics.
    table_stats = describe_table(df, variable_stats)

    # Missing diagrams.
    missing = get_missing_diagrams(df, table_stats)

    # Messages.
    messages = check_table_messages(table_stats)
    for col, description in series_description.items():
        messages += check_variable_messages(col, description)
    messages += check_correlation_messages(correlations)

    package = {
        "pandas_profiling_version": __version__,
        "pandas_profiling_config": config.dump(),
    }

    return {
        # Overall description
        "table": table_stats,
        # Per variable descriptions
        "variables": series_description,
        # Bivariate relations
        "scatter": scatter_matrix,
        # Correlation matrices
        "correlations": correlations,
        # Missing values
        "missing": missing,
        # Warnings
        "messages": messages,
        # Package
        "package": package,
    }