Пример #1
0
def test_set_variable():
    """Assert that ``set_variable`` changes are visible through ``config``.

    Three ways of addressing a setting are exercised in turn: a top-level
    key, a dotted path to a nested key, and a nested key supplied as a
    dictionary.
    """
    report = ProfileReport(pool_size=3)

    # State right after construction with ``pool_size=3``.
    initial_pool_size = config["pool_size"].get(int)
    assert initial_pool_size == 3
    initial_minify = config["html"]["minify_html"].get(bool)
    assert initial_minify

    # Top-level key.
    report.set_variable("pool_size", 1)
    updated_pool_size = config["pool_size"].get(int)
    assert updated_pool_size == 1

    # Nested key addressed with a dotted path.
    report.set_variable("html.minify_html", False)
    minify_after_dotted = config["html"]["minify_html"].get(bool)
    assert not minify_after_dotted

    # Nested key addressed by passing a dictionary for the parent section.
    report.set_variable("html", {"minify_html": True})
    minify_after_dict = config["html"]["minify_html"].get(bool)
    assert minify_after_dict
Пример #2
0
def test_config_shorthands():
    """Assert the effect of passing ``None`` for whole config sections.

    The same expectations are checked twice: once with the sections given
    as constructor keyword arguments, and once with them applied through
    ``set_variable`` on a report built without arguments.
    """

    def assert_sections_switched_off():
        # Sample and duplicate row counts are expected to be zero.
        for edge in ("head", "tail"):
            assert config["samples"][edge].get(int) == 0
        assert config["duplicates"]["head"].get(int) == 0
        # One representative flag is checked for each remaining section.
        spearman_enabled = config["correlations"]["spearman"]["calculate"].get(bool)
        assert not spearman_enabled
        bar_enabled = config["missing_diagrams"]["bar"].get(bool)
        assert not bar_enabled

    # Variant 1: sections passed to the constructor.
    r = ProfileReport(
        samples=None,
        correlations=None,
        missing_diagrams=None,
        duplicates=None,
    )
    assert_sections_switched_off()

    # Variant 2: sections applied one at a time after construction.
    r = ProfileReport()
    for section in ("samples", "duplicates", "correlations", "missing_diagrams"):
        r.set_variable(section, None)

    assert_sections_switched_off()
Пример #3
0
            "hours-per-week",
            "native-country",
        ],
    )

    # Prepare missing values
    df = df.replace("\\?", np.nan, regex=True)

    # Initialize the report
    profile = ProfileReport(df, title="Census Dataset", explorative=True)

    # Load the column definitions. The context manager closes the file handle
    # as soon as parsing is done instead of leaving it to garbage collection;
    # the encoding is given explicitly so the read does not depend on the
    # platform default.
    with open("census_column_definition.json", encoding="utf-8") as definitions_file:
        definitions = json.load(definitions_file)

    # Attach dataset-level metadata to the report
    profile.set_variable(
        "dataset",
        {
            "description":
            'Predict whether income exceeds $50K/yr based on census data. Also known as "Census Income" dataset. Extraction was done by Barry Becker from the 1994 Census database. A set of reasonably clean records was extracted using the following conditions: ((AAGE>16) && (AGI>100) && (AFNLWGT>1)&& (HRSWK>0)). Prediction task is to determine whether a person makes over 50K a year.',
            "copyright_year": "1996",
            "author": "Ronny Kohavi and Barry Becker",
            "creator": "Barry Becker",
            "url": "https://archive.ics.uci.edu/ml/datasets/adult",
        },
    )
    # Per-column descriptions come from the JSON file loaded above
    profile.set_variable("variables.descriptions", definitions)

    # Only show the descriptions in the overview
    profile.set_variable("show_variable_description", False)

    profile.to_file(Path("./census_report.html"))
Пример #4
0
# pandas-profiling (PP) only accepts absolute paths, so resolve each entry
# first and then convert it to a plain string.
absolute_paths = series.apply(lambda path: path.absolute())
series = absolute_paths.apply(str)

df = pd.DataFrame(series)

# Build the profile report.
profile = ProfileReport(
    df,
    title="Example showcasing EXIF data (Kaggle 5 Celebrity Faces Dataset)",
    # Sections that are not relevant for this example are switched off.
    duplicates=None,
    correlations=None,
    samples=None,
    missing_diagrams=None,
    # Turns on file and image analysis, which is off by default because the
    # computations are relatively expensive when they are not needed.
    explorative=True,
)

# Settings can also be changed after construction, as shown here.
file_descriptions = {
    "files": "The 5 Celebrity Faces Dataset found on Kaggle (dansbecker/5-celebrity-faces-dataset)."
}
profile.set_variable("variables.descriptions", file_descriptions)

# Write the report to disk.
profile.to_file("celebrity-faces.html")

# The analysis reveals that quite a few photos contain "hidden" EXIF
# information. Depending on the situation, this can be both interesting and
# troublesome.
Пример #5
0
# Build the profile report; the sample and missing-diagram sections are not
# needed for this example.
profile = ProfileReport(
    df,
    title="Example of summarization of an image dataset (Kaggle Cat and Dog dataset)",
    samples=None,
    missing_diagrams=None,
)

# Describe the single variable in the dataset.
file_descriptions = {
    "files": "Paths linking to the cats and dogs found https://www.kaggle.com/tongpython/cat-and-dog."
}
profile.set_variable("variables.descriptions", file_descriptions)

# Above this sample count, scatter plots are replaced with hexbin plots.
# This dataset is just over the threshold of 10,000 samples, so the limit is
# raised.
profile.set_variable("plot.scatter_threshold", 25000)

# Turn on path, file and image analysis (off by default because the
# computations are relatively expensive when they are not needed).
for setting in ("vars.path.active", "vars.file.active", "vars.image.active"):
    profile.set_variable(setting, True)

# No EXIF data was found, so skip that expensive computation.
profile.set_variable("vars.image.exif", False)
Пример #6
0
#
# If a column is not present as specified by the schema, a `SchemaError` is raised.

# %%
# Drop one column and run the cleaning step on the result; the error
# message is printed instead of stopping the notebook.
corrupted_data = fatal_encounters.drop(columns="Subject's age")
try:
    clean_columns(corrupted_data)
except pa.errors.SchemaError as schema_error:
    print(schema_error)

# %% [markdown] slideshow={"slide_type": "slide"}
# ### Explore the Data with [`pandas-profiling`](https://github.com/pandas-profiling/pandas-profiling)

# %% tags=["hide_input"]
# Minimal report, rendered inline with the navigation bar hidden.
profile = ProfileReport(fatal_encounters_clean_columns, minimal=True)
profile.set_variable("html.navbar_show", False)
profile.to_notebook_iframe()

# %% [markdown] slideshow={"slide_type": "slide"}
# ### Declare the Training Data Schema

# %% slideshow={"slide_type": "skip"}
# Category values used below.
genders = [
    "female",
    "male",
    "transgender",
    "transexual",
]
races = [
    "african_american_black",
    "asian_pacific_islander",
    "european_american_white",
    "hispanic_latino",
    "middle_eastern",
    "native_american_alaskan",
    "race_unspecified",
]
causes_of_death = [
    'asphyxiated_restrained', 'beaten_bludgeoned_with_instrument',
    'burned_smoke_inhalation', 'chemical_agent_pepper_spray',