def test_describe_unique(data, expected, summarizer, typeset):
    """Check the uniqueness statistics that `describe_1d` reports for 1D data."""
    settings = Settings()
    settings.vars.num.low_categorical_threshold = 0

    description = describe_1d(settings, data, summarizer, typeset)

    # Cases whose expected `is_unique` is None make no uniqueness claims.
    if expected["is_unique"] is None:
        return

    # Checked in this order; the first mismatch fails with its own message.
    checks = (
        ("p_unique", "Describe 1D p_unique incorrect"),
        ("p_distinct", "Describe 1D p_distinct incorrect"),
        ("is_unique", "Describe 1D should return unique"),
    )
    for key, failure_message in checks:
        assert description[key] == expected[key], failure_message
def test_describe_df(column, describe_data, expected_results, summarizer):
    """Describe a one-column DataFrame and compare it with the expected values."""
    settings = Settings()
    settings.vars.num.low_categorical_threshold = 0
    typeset = ProfilingTypeSet(settings)

    frame = pd.DataFrame({column: describe_data[column]})
    if column == "somedate":
        frame["somedate"] = pd.to_datetime(frame["somedate"])

    results = describe(settings, frame, summarizer, typeset)

    expected_sections = {
        "analysis",
        "table",
        "variables",
        "scatter",
        "correlations",
        "missing",
        "messages",
        "package",
        "sample",
        "duplicates",
    }
    assert expected_sections == set(results.keys()), "Not in results"

    observed = results["variables"][column]

    # Compare every expected statistic with the computed description.
    for k, v in expected_results[column].items():
        if v == check_is_NaN:
            # The sentinel means the key must be absent from the description.
            test_condition = k not in observed
        elif isinstance(v, float):
            test_condition = pytest.approx(v) == observed[k]
        else:
            test_condition = v == observed[k]

        assert (
            test_condition
        ), f"Value `{results['variables'][column][k]}` for key `{k}` in column `{column}` is not NaN"

    if observed["type"] in ["Numeric", "DateTime"]:
        assert "histogram" in observed, f"Histogram missing for column {column}"
def describe(
    config: Settings,
    df: pd.DataFrame,
    summarizer: BaseSummarizer,
    typeset: VisionsTypeset,
    sample: Optional[dict] = None,
) -> dict:
    """Calculate the statistics for each series in this DataFrame.

    Args:
        config: report Settings object
        df: DataFrame.
        summarizer: summarizer object
        typeset: visions typeset
        sample: optional, dict with custom sample

    Returns:
        This function returns a dictionary containing:
            - analysis: title, start/end timestamps and duration of this run.
            - table: overall statistics.
            - variables: descriptions per series.
            - scatter: scatter plots, keyed by x column and then y column.
            - correlations: correlation matrices.
            - missing: missing value diagrams.
            - alerts: direct special attention to these patterns in your data.
            - package: package details.
            - sample: sample taken from the data, or the custom sample.
            - duplicates: duplicates found in the supported columns.

    Raises:
        ValueError: if `df` is None.
    """
    if df is None:
        raise ValueError(
            "Can not describe a `lazy` ProfileReport without a DataFrame.")

    check_dataframe(df)
    df = preprocess(config, df)

    # Base number of progress steps; each stage below adds its own count to
    # `pbar.total` once that count is known.
    # NOTE(review): 5 presumably covers table statistics, sample, duplicates,
    # alerts and reproduction details, assuming every `progress(...)` call
    # advances the bar once - confirm against the `progress` helper.
    number_of_tasks = 5

    with tqdm(
        total=number_of_tasks,
        desc="Summarize dataset",
        disable=not config.progress_bar,
        position=0,
    ) as pbar:
        date_start = datetime.utcnow()

        # Variable-specific
        pbar.total += len(df.columns)
        series_description = get_series_descriptions(config, df, summarizer,
                                                     typeset, pbar)

        pbar.set_postfix_str("Get variable types")
        pbar.total += 1
        # Map each column name to the type name found in its description.
        variables = {
            column: description["type"]
            for column, description in series_description.items()
        }
        supported_columns = [
            column for column, type_name in variables.items()
            if type_name != "Unsupported"
        ]
        # Only "Numeric" columns are handed to the scatter-matrix stage.
        interval_columns = [
            column for column, type_name in variables.items()
            if type_name == "Numeric"
        ]
        pbar.update()

        # Get correlations
        correlation_names = get_active_correlations(config)
        pbar.total += len(correlation_names)

        correlations = {
            correlation_name: progress(
                calculate_correlation, pbar,
                f"Calculate {correlation_name} correlation")(
                    config, df, correlation_name, series_description)
            for correlation_name in correlation_names
        }

        # make sure correlations is not None
        correlations = {
            key: value
            for key, value in correlations.items() if value is not None
        }

        # Scatter matrix
        pbar.set_postfix_str("Get scatter matrix")
        scatter_tasks = get_scatter_tasks(config, interval_columns)
        pbar.total += len(scatter_tasks)
        # The comprehension keeps only the last `y` seen for each `x` (later
        # pairs overwrite earlier ones); the loop below then fills in every
        # (x, y) pair, so that last `y` ends up first in each inner dict.
        scatter_matrix: Dict[Any, Dict[Any, Any]] = {
            x: {
                y: None
            }
            for x, y in scatter_tasks
        }
        for x, y in scatter_tasks:
            scatter_matrix[x][y] = progress(
                get_scatter_plot, pbar,
                f"scatter {x}, {y}")(config, df, x, y, interval_columns)

        # Table statistics
        table_stats = progress(get_table_stats, pbar,
                               "Get dataframe statistics")(config, df,
                                                           series_description)

        # missing diagrams
        missing_map = get_missing_active(config, table_stats)
        pbar.total += len(missing_map)
        missing = {
            name: progress(get_missing_diagram, pbar,
                           f"Missing diagram {name}")(config, df, settings)
            for name, settings in missing_map.items()
        }
        # Drop diagrams for which get_missing_diagram returned None.
        missing = {
            name: value
            for name, value in missing.items() if value is not None
        }

        # Sample
        pbar.set_postfix_str("Take sample")
        if sample is None:
            samples = get_sample(config, df)
        else:
            samples = get_custom_sample(sample)
        pbar.update()

        # Duplicates
        metrics, duplicates = progress(
            get_duplicates, pbar,
            "Detecting duplicates")(config, df, supported_columns)
        # Duplicate metrics are merged into the table statistics before the
        # alerts are computed from them.
        table_stats.update(metrics)

        alerts = progress(get_alerts, pbar,
                          "Get alerts")(config, table_stats,
                                        series_description, correlations)

        pbar.set_postfix_str("Get reproduction details")
        package = {
            "pandas_profiling_version": __version__,
            "pandas_profiling_config": config.json(),
        }
        pbar.update()

        pbar.set_postfix_str("Completed")

    date_end = datetime.utcnow()

    analysis = {
        "title": config.title,
        "date_start": date_start,
        "date_end": date_end,
        "duration": date_end - date_start,
    }

    return {
        # Analysis metadata
        "analysis": analysis,
        # Overall dataset description
        "table": table_stats,
        # Per variable descriptions
        "variables": series_description,
        # Bivariate relations
        "scatter": scatter_matrix,
        # Correlation matrices
        "correlations": correlations,
        # Missing values
        "missing": missing,
        # Alerts
        "alerts": alerts,
        # Package
        "package": package,
        # Sample
        "sample": samples,
        # Duplicates
        "duplicates": duplicates,
    }
def describe(
    config: Settings,
    df: pd.DataFrame,
    summarizer: BaseSummarizer,
    typeset: VisionsTypeset,
    sample: Optional[dict] = None,
) -> dict:
    """Calculate the statistics for each series in this DataFrame.

    Args:
        config: report Settings object
        df: DataFrame.
        summarizer: summarizer object, passed on to the per-series descriptions
        typeset: visions typeset, passed on to the per-series descriptions
        sample: optional, dict with custom sample. Must contain "data"; the
            "name" and "caption" entries are optional and default to None.
            The dict is only read, never modified.

    Returns:
        This function returns a dictionary containing:
            - analysis: title, start/end timestamps and duration of this run.
            - table: overall statistics.
            - variables: descriptions per series.
            - scatter: scatter matrix of the numeric columns.
            - correlations: correlation matrices.
            - missing: missing value diagrams.
            - messages: direct special attention to these patterns in your data.
            - package: package details.
            - sample: sample taken from the data, or the custom sample.
            - duplicates: duplicates found in the supported columns.

    Raises:
        ValueError: if `df` is None.
    """
    if df is None:
        raise ValueError(
            "Can not describe a `lazy` ProfileReport without a DataFrame.")

    if not isinstance(df, pd.DataFrame):
        warnings.warn("df is not of type pandas.DataFrame")

    disable_progress_bar = not config.progress_bar

    date_start = datetime.utcnow()

    # Only the correlations enabled in the configuration are calculated.
    correlation_names = [
        correlation_name for correlation_name in [
            "pearson",
            "spearman",
            "kendall",
            "phi_k",
            "cramers",
        ] if config.correlations[correlation_name].calculate
    ]

    # Eight fixed steps (variable types, scatter matrix, table statistics,
    # missing diagrams, sample, duplicates, messages, reproduction details)
    # plus one step per column and one per active correlation.
    number_of_tasks = 8 + len(df.columns) + len(correlation_names)

    with tqdm(total=number_of_tasks,
              desc="Summarize dataset",
              disable=disable_progress_bar) as pbar:
        series_description = get_series_descriptions(config, df, summarizer,
                                                     typeset, pbar)

        pbar.set_postfix_str("Get variable types")
        variables = {
            column: description["type"]
            for column, description in series_description.items()
        }
        supported_columns = [
            column for column, type_name in variables.items()
            if type_name != "Unsupported"
        ]
        interval_columns = [
            column for column, type_name in variables.items()
            if type_name == "Numeric"
        ]
        pbar.update()

        # Get correlations
        correlations = {}
        for correlation_name in correlation_names:
            pbar.set_postfix_str(f"Calculate {correlation_name} correlation")
            correlations[correlation_name] = calculate_correlation(
                config, df, correlation_name, series_description)
            pbar.update()

        # make sure correlations is not None
        correlations = {
            key: value
            for key, value in correlations.items() if value is not None
        }

        # Scatter matrix
        pbar.set_postfix_str("Get scatter matrix")
        scatter_matrix = get_scatter_matrix(config, df, interval_columns)
        pbar.update()

        # Table statistics
        pbar.set_postfix_str("Get table statistics")
        table_stats = get_table_stats(config, df, series_description)
        pbar.update()

        # missing diagrams
        pbar.set_postfix_str("Get missing diagrams")
        missing = get_missing_diagrams(config, df, table_stats)
        pbar.update()

        # Sample
        pbar.set_postfix_str("Take sample")
        if sample is None:
            samples = get_sample(config, df)
        else:
            # Read the optional entries with `.get` so the caller's dict is
            # left untouched instead of gaining "name"/"caption" keys.
            samples = [
                Sample(
                    id="custom",
                    data=sample["data"],
                    name=sample.get("name"),
                    caption=sample.get("caption"),
                )
            ]
        pbar.update()

        # Duplicates
        pbar.set_postfix_str("Locating duplicates")
        metrics, duplicates = get_duplicates(config, df, supported_columns)
        # Duplicate metrics are merged into the table statistics before the
        # messages are computed from them.
        table_stats.update(metrics)
        pbar.update()

        # Messages
        pbar.set_postfix_str("Get messages/warnings")
        messages = get_messages(config, table_stats, series_description,
                                correlations)
        pbar.update()

        pbar.set_postfix_str("Get reproduction details")
        package = {
            "pandas_profiling_version": __version__,
            "pandas_profiling_config": config.json(),
        }
        pbar.update()

        pbar.set_postfix_str("Completed")

    date_end = datetime.utcnow()

    analysis = {
        "title": config.title,
        "date_start": date_start,
        "date_end": date_end,
        "duration": date_end - date_start,
    }

    return {
        # Analysis metadata
        "analysis": analysis,
        # Overall dataset description
        "table": table_stats,
        # Per variable descriptions
        "variables": series_description,
        # Bivariate relations
        "scatter": scatter_matrix,
        # Correlation matrices
        "correlations": correlations,
        # Missing values
        "missing": missing,
        # Warnings
        "messages": messages,
        # Package
        "package": package,
        # Sample
        "sample": samples,
        # Duplicates
        "duplicates": duplicates,
    }
def __init__(
    self,
    df: Optional[pd.DataFrame] = None,
    minimal: bool = False,
    explorative: bool = False,
    sensitive: bool = False,
    dark_mode: bool = False,
    orange_mode: bool = False,
    sample: Optional[dict] = None,
    config_file: Optional[Union[Path, str]] = None,
    lazy: bool = True,
    typeset: Optional[VisionsTypeset] = None,
    summarizer: Optional[BaseSummarizer] = None,
    config: Optional[Settings] = None,
    **kwargs,
):
    """Generate a ProfileReport based on a pandas DataFrame

    Args:
        df: the pandas DataFrame
        minimal: minimal mode is a default configuration with minimal computation
        explorative: apply the "explorative" argument group to the configuration
        sensitive: apply the "sensitive" argument group to the configuration
        dark_mode: apply the "dark_mode" argument group to the configuration
        orange_mode: apply the "orange_mode" argument group to the configuration
        sample: optional dict(name="Sample title", caption="Caption", data=pd.DataFrame())
        config_file: a config file (.yml), mutually exclusive with `minimal`
        lazy: compute when needed
        typeset: optional user typeset to use for type inference
        summarizer: optional user summarizer to generate custom summary output
        config: optional Settings object to start from; a new `Settings()` is
            created when omitted
        **kwargs: other arguments, for valid arguments, check the default configuration file.

    Raises:
        ValueError: if `df` is None while `lazy` is False, or if both
            `config_file` and `minimal` are given.
    """
    if df is None and not lazy:
        raise ValueError(
            "Can not init a not-lazy ProfileReport with no DataFrame")

    report_config: Settings = Settings() if config is None else config

    if config_file is not None and minimal:
        raise ValueError(
            "Arguments `config_file` and `minimal` are mutually exclusive."
        )

    if config_file or minimal:
        # `minimal` without an explicit file falls back to the bundled
        # minimal configuration.
        if not config_file:
            config_file = get_config("config_minimal.yaml")

        with open(config_file) as f:
            data = yaml.safe_load(f)

        report_config = report_config.parse_obj(data)

    # Argument groups are applied in a fixed order, so a later group's
    # update is applied on top of an earlier one.
    if explorative:
        report_config = report_config.update(
            Config.get_arg_groups("explorative"))
    if sensitive:
        report_config = report_config.update(
            Config.get_arg_groups("sensitive"))
    if dark_mode:
        report_config = report_config.update(
            Config.get_arg_groups("dark_mode"))
    if orange_mode:
        report_config = report_config.update(
            Config.get_arg_groups("orange_mode"))
    # Explicit keyword arguments are applied last.
    if kwargs:
        report_config = report_config.update(Config.shorthands(kwargs))

    self.df = None
    self.config = report_config
    self._df_hash = None
    self._sample = sample
    self._typeset = typeset
    self._summarizer = summarizer

    if df is not None:
        # preprocess df
        self.df = self.preprocess(df)

    if not lazy:
        # Trigger building the report structure
        _ = self.report
def config():
    """Return a `Settings` object constructed without any arguments."""
    default_settings = Settings()
    return default_settings
def test_describe_list(summarizer, typeset):
    """Describing a plain list instead of a DataFrame warns and then fails."""
    config = Settings()

    # `df` is the second positional parameter of `describe`, so the list goes
    # directly after `config`; summarizer and typeset then land in their own
    # slots instead of being shifted by one.
    with pytest.raises(AttributeError), pytest.warns(UserWarning):
        describe(config, [1, 2, 3], summarizer, typeset)
(11, 12), (13, 14), (15, 16), (17, 18), ], "date_str": ["2018-01-01", "2017-02-01", "2018-04-07"], "nullable_int": pd.Series([1, None], dtype="Int64"), } return {key: pd.Series(values, name=key) for key, values in data.items()} series = get_profiling_series() config = Settings() config.vars.num.low_categorical_threshold = 0 my_typeset = ProfilingTypeSet(config) type_map = {str(k): k for k in my_typeset.types} Numeric = type_map["Numeric"] Categorical = type_map["Categorical"] Boolean = type_map["Boolean"] DateTime = type_map["DateTime"] Unsupported = type_map["Unsupported"] config2 = Settings() config2.vars.num.low_categorical_threshold = 2 typeset2 = ProfilingTypeSet(config2) type_map2 = {str(k): k for k in typeset2.types} Numeric2 = type_map2["Numeric"]
from pandas_profiling.config import Settings from pandas_profiling.model.typeset import ProfilingTypeSet from tests.unit.test_utils import patch_arg if int(pd.__version__.split(".")[0]) < 1: from visions.dtypes.boolean import BoolDtype # noqa: F401 btype = "Bool" else: btype = "boolean" base_path = os.path.abspath(os.path.dirname(__file__)) series = get_series() my_config = Settings() my_config.vars.num.low_categorical_threshold = 0 my_typeset_default = ProfilingTypeSet(my_config) type_map = {str(k): k for k in my_typeset_default.types} Numeric = type_map["Numeric"] Categorical = type_map["Categorical"] Boolean = type_map["Boolean"] DateTime = type_map["DateTime"] Unsupported = type_map["Unsupported"] contains_map = { Numeric: { "int_series", "Int64_int_series", "int_range",
def test_describe_list(summarizer, typeset):
    """Describing a plain list instead of a DataFrame is not implemented."""
    config = Settings()

    # `df` is the second positional parameter of `describe`, so the list goes
    # directly after `config`; summarizer and typeset then land in their own
    # slots instead of being shifted by one.
    # NOTE(review): assumes the DataFrame check rejects a list with
    # NotImplementedError just as it does other non-DataFrame input - confirm.
    with pytest.raises(NotImplementedError):
        describe(config, [1, 2, 3], summarizer, typeset)