示例#1
0
 def description_set(self) -> Dict[str, Any]:
     """Return the dataset description, computing it on first access.

     The value produced by ``describe_df`` is stored in
     ``self._description_set``; while that attribute is not ``None`` it is
     returned as-is and ``describe_df`` is not called again.

     Returns:
         The result of ``describe_df`` for this object's config, DataFrame,
         summarizer, typeset and sample.
     """
     cached = self._description_set
     if cached is not None:
         return cached

     cached = describe_df(
         self.config, self.df, self.summarizer, self.typeset, self._sample
     )
     self._description_set = cached
     return cached
示例#2
0
    def __init__(self, df, config_file: Path = None, **kwargs):
        """Profile ``df`` and render the report HTML.

        Args:
            df: The DataFrame to profile. A reset/renamed/reordered copy is
                used internally; the name ``df`` is rebound, not mutated here.
            config_file: Optional path of a configuration file, loaded before
                ``kwargs`` are applied.
            **kwargs: Configuration overrides forwarded to
                ``config.set_kwargs``.

        Raises:
            ValueError: If the configured ``sort`` value is none of "asc",
                "ascending", "desc", "descending" or the string "None".
        """
        if config_file:
            config.config.set_file(str(config_file))
        config.set_kwargs(kwargs)

        # Treat index as any other column: anything other than a plain
        # 0..n-1 int64 index is moved into the columns.
        if (not pd.Index(np.arange(0, len(df))).equals(df.index)
                or df.index.dtype != np.int64):
            df = df.reset_index()

        # Rename reserved column names
        df = rename_index(df)

        # Remove spaces and colons from column names
        df = clean_column_names(df)

        # Sort column names. The setting is read as a string, so "no sorting"
        # is the literal string "None".
        sort = config["sort"].get(str)
        # Compare (major, minor) as a tuple instead of looking at the minor
        # version alone, so the check stays correct across major versions.
        if sys.version_info < (3, 6) and sort != "None":
            warnings.warn("Sorting is supported from Python 3.6+")

        if sort in ["asc", "ascending", "desc", "descending"]:
            ordered = sorted(df.columns, key=lambda s: s.casefold())
            if sort in ["desc", "descending"]:
                # Reverse the ascending order (rather than sorting with
                # reverse=True) so names that tie under casefold keep the
                # same relative order as before.
                ordered = ordered[::-1]
            df = df.reindex(ordered, axis=1)
        elif sort != "None":
            raise ValueError(
                '"sort" should be "ascending", "descending" or None.')

        # Store column order
        config["column_order"] = df.columns.tolist()

        # Get dataset statistics
        description_set = describe_df(df)

        # Get sample: head/tail rows are included only when the configured
        # count is positive.
        sample = {}
        n_head = config["samples"]["head"].get(int)
        if n_head > 0:
            sample["head"] = df.head(n=n_head)

        n_tail = config["samples"]["tail"].get(int)
        if n_tail > 0:
            sample["tail"] = df.tail(n=n_tail)

        # Render HTML
        self.html = to_html(sample, description_set)
        self.minify_html = config["minify_html"].get(bool)
        self.use_local_assets = config["use_local_assets"].get(bool)
        self.title = config["title"].get(str)
        self.description_set = description_set
        self.sample = sample
示例#3
0
    def __init__(self, df, minimal=False, config_file: Path = None, **kwargs):
        """Profile ``df`` and build the report structure.

        Args:
            df: The DataFrame to profile.
            minimal: When true, the configuration file returned by
                ``get_config_minimal()`` is used.
            config_file: Optional path of a configuration file. Cannot be
                combined with ``minimal``.
            **kwargs: Configuration overrides forwarded to
                ``config.set_kwargs``.

        Raises:
            ValueError: If both ``config_file`` and ``minimal`` are given.
        """
        # ``sys.version_info`` is a five-field tuple, e.g.
        # (3, 5, 2, "final", 0), which compares *greater* than (3, 5); a
        # ``<= (3, 5)`` test therefore never matches a real 3.5.x interpreter.
        # Comparing against (3, 6) covers every 3.5 release and older.
        if sys.version_info < (3, 6):
            warnings.warn(
                "This is the last release to support Python 3.5, please upgrade.",
                category=DeprecationWarning,
            )

        if config_file is not None and minimal:
            raise ValueError(
                "Arguments `config_file` and `minimal` are mutually exclusive."
            )

        if minimal:
            config_file = get_config_minimal()

        if config_file:
            config.set_file(str(config_file))
        config.set_kwargs(kwargs)

        self.date_start = datetime.utcnow()

        # Treat index as any other column: anything other than a plain
        # 0..n-1 int64 index is moved into the columns.
        if (not pd.Index(np.arange(0, len(df))).equals(df.index)
                or df.index.dtype != np.int64):
            df = df.reset_index()

        # Rename reserved column names
        df = rename_index(df)

        # Ensure that columns are strings
        df.columns = df.columns.astype("str")

        # Get dataset statistics
        description_set = describe_df(df)

        # Build report structure. ``date_end`` is taken before the report
        # structure is assembled, so it does not include that step.
        self.sample = self.get_sample(df)
        self.title = config["title"].get(str)
        self.description_set = description_set
        self.date_end = datetime.utcnow()

        disable_progress_bar = not config["progress_bar"].get(bool)

        with tqdm(total=1,
                  desc="build report structure",
                  disable=disable_progress_bar) as pbar:
            self.report = get_report_structure(self.date_start, self.date_end,
                                               self.sample, description_set)
            pbar.update()
示例#4
0
    def __init__(self, df, minimal=False, config_file: Path = None, **kwargs):
        """Profile ``df`` and build the report structure.

        Args:
            df: The DataFrame to profile.
            minimal: When true, the configuration file returned by
                ``get_config_minimal()`` is used.
            config_file: Optional path of a configuration file. Cannot be
                combined with ``minimal``.
            **kwargs: Configuration overrides forwarded to
                ``config.set_kwargs``.

        Raises:
            ValueError: If both ``config_file`` and ``minimal`` are given.
        """
        if minimal:
            if config_file is not None:
                raise ValueError(
                    "Arguments `config_file` and `minimal` are mutually exclusive."
                )
            config_file = get_config_minimal()

        if config_file:
            config.config.set_file(str(config_file))
        config.set_kwargs(kwargs)

        self.date_start = datetime.utcnow()

        # A plain 0..n-1 int64 index carries no information; any other index
        # is moved into the columns so it is profiled like regular data.
        plain_index = pd.Index(np.arange(0, len(df)))
        index_is_plain = (
            plain_index.equals(df.index) and df.index.dtype == np.int64
        )
        if not index_is_plain:
            df = df.reset_index()

        # Reserved names are renamed first, then every label is cast to str.
        df = rename_index(df)
        df.columns = df.columns.astype("str")

        # Apply the configured column ordering and record the outcome.
        df = self.sort_column_names(df)
        config["column_order"] = df.columns.tolist()

        # Dataset statistics
        stats = describe_df(df)

        # Report attributes
        self.sample = self.get_sample(df)
        self.title = config["title"].get(str)
        self.description_set = stats

        self.date_end = datetime.utcnow()
        self.report = get_report_structure(
            self.date_start, self.date_end, self.sample, stats
        )
示例#5
0
from dash.dependencies import Input, Output
import dash_utils as du

import seaborn as sns
import os
import plotly.graph_objs as go

from pandas_profiling.model.describe import describe as describe_df

base_path = os.path.abspath(os.path.dirname(__file__))

# -- data
# df = pd.read_csv('./data/diamonds.csv', sep=',', index_col=0)
df = sns.load_dataset('diamonds')
# using pandas profiling to create the description
desc = describe_df(df)
# Keep only the columns whose detected type is supported. The type is turned
# into its string form *before* the comparison: wrapping the whole comparison
# in str() yields "True"/"False", both truthy, so nothing would be filtered.
variables = {
    name: summary
    for name, summary in desc['variables'].items()
    if str(summary['type']) != 'Variable.TYPE_UNSUPPORTED'
}
cats = {
    col: {
        'CAT':
        variables[col]['type'],
        'n_unique':
        variables[col]['distinct_count']
        if str(variables[col]['type']) == "Variable.TYPE_CAT" else 0
    }
    for col in variables.keys()
 def description_set(self):
     """Return the dataset description, computing it on first access.

     The value produced by ``describe_df`` is stored in
     ``self._description_set``; while that attribute is not ``None`` it is
     returned as-is and ``describe_df`` is not called again.
     """
     cached = self._description_set
     if cached is not None:
         return cached

     cached = describe_df(self.title, self.df, self._sample)
     self._description_set = cached
     return cached
示例#7
0
 def description_set(self):
     """Return the dataset description, computing it on first access.

     The value produced by ``describe_df`` is stored in
     ``self._description_set``; while that attribute is not ``None`` it is
     returned as-is and nothing is recomputed.
     """
     if self._description_set is not None:
         return self._description_set

     # ``df_hash`` is read (and the value discarded) before the description
     # is computed. NOTE(review): presumably the access itself has a side
     # effect such as computing/caching the hash - confirm against the class.
     _ = self.df_hash
     computed = describe_df(self.title, self.df)
     self._description_set = computed
     return self._description_set