def test_dataset_schema():
    """Check that dataset metadata passed via ``dataset=`` shows up in the HTML.

    The report is built in minimal mode from the Stata ``auto2`` dataset and
    the rendered HTML is expected to contain a "Dataset" section, one header
    cell per plain metadata field, a Copyright row, a URL row and the
    "Reproduction" section.
    """
    file_name = cache_file("auto2.dta", "http://www.stata-press.com/data/r15/auto2.dta")
    df = pd.read_stata(file_name)

    metadata = {
        "creator": "Firstname Lastname",
        "author": "Firstname Lastname",
        "description": "This profiling report was generated using a sample of 5% of the original dataset.",
        "copyright_holder": "RandoCorp LLC",
        "copyright_year": "2020",
        "url": "http://www.dataset-sources.com/data/dataset.dat",
    }

    report = df.profile_report(
        title="Dataset schema",
        dataset=metadata,
        minimal=True,
    )

    html = report.to_html()

    assert "<p class=h4>Dataset</p>" in html
    for key in metadata:
        # The copyright_* fields and the URL are checked by the dedicated
        # assertions below, so they are skipped here.
        if not key.startswith("copyright_") and key != "url":
            assert f"<th>{key.capitalize()}</th>" in html
    # These two checks used to assert a bare string literal, which is always
    # truthy; they now actually look for the rows in the rendered HTML.
    # NOTE(review): if they fail, confirm the exact markup the template emits.
    assert "<tr><th>Copyright</th><td>(c) RandoCorp LLC 2020</td></tr>" in html
    assert (
        '<tr><th>URL</th><td><a href="http://www.dataset-sources.com/data/dataset.dat">http://www.dataset-sources.com/data/dataset.dat</a></td></tr>'
        in html
    )
    assert "<p class=h4>Reproduction</p>" in html
Exemplo n.º 2
0
def test_titanic_default(benchmark):
    """Hand ``func`` (no bound arguments) and the Titanic frame to ``benchmark``."""
    parquet_path = cache_file(
        "titanic.parquet",
        "https://github.com/pandas-profiling/pandas-profiling-data/raw/master/data/titanic.parquet",
    )
    titanic = pd.read_parquet(parquet_path)

    # No keyword overrides are bound for the default run.
    target = partial(func)
    benchmark(target, titanic)
Exemplo n.º 3
0
def test_titanic_explorative(benchmark):
    """Hand ``func`` with ``explorative=True`` bound and the Titanic frame to ``benchmark``."""
    parquet_path = cache_file(
        "titanic.parquet",
        "https://github.com/pandas-profiling/pandas-profiling-data/raw/master/data/titanic.parquet",
    )
    titanic = pd.read_parquet(parquet_path)

    target = partial(func, explorative=True)
    benchmark(target, titanic)
Exemplo n.º 4
0
def test_rdw_minimal(benchmark):
    """Hand ``func`` with ``minimal=True`` bound and the RDW 100k sample to ``benchmark``."""
    parquet_path = cache_file(
        "rdw_sample_100k.parquet",
        "https://github.com/pandas-profiling/pandas-profiling-data/raw/master/data/rdw_sample_100k.parquet",
    )
    rdw_sample = pd.read_parquet(parquet_path)

    target = partial(func, minimal=True)
    benchmark(target, rdw_sample)
Exemplo n.º 5
0
def test_titanic_minimal(benchmark):
    """Hand ``func`` with ``minimal=True`` bound and the Titanic frame to ``benchmark``."""
    parquet_path = cache_file(
        "titanic.parquet",
        "https://github.com/pandas-profiling/pandas-profiling-data/raw/master/data/titanic.parquet",
    )
    titanic = pd.read_parquet(parquet_path)

    target = partial(func, minimal=True)
    benchmark(target, titanic)
def test_dataset_schema_empty():
    """With ``dataset=None`` the HTML has no "Dataset" section but keeps "Reproduction"."""
    stata_path = cache_file("auto2.dta", "http://www.stata-press.com/data/r15/auto2.dta")
    frame = pd.read_stata(stata_path)

    rendered = frame.profile_report(
        title="Dataset schema empty",
        minimal=True,
        dataset=None,
    ).to_html()

    assert "<p class=h4>Dataset</p>" not in rendered
    assert "<div class=col-sm-12><p class=h4>Reproduction</p>" in rendered
Exemplo n.º 7
0
def test_issue416():
    """Regression test named after issue 416, checked against the JSON output.

    Reads the tab-separated products file, derives a ``path`` column by
    removing the host from ``url``, profiles the frame and checks that the
    JSON dump contains ``"PATH": 1`` and a ``common_prefix`` of ``/``.
    """
    file_name = cache_file(
        "products.tsv",
        "https://raw.githubusercontent.com/mrichman/clickstream-pandas/master/products.tsv",
    )

    df = pd.read_csv(file_name, sep="\t")
    # The host is a literal string, not a pattern. Without regex=False,
    # pandas < 2.0 interprets it as a regular expression (each "." would match
    # any character) and warns about the changing default.
    df["path"] = df["url"].str.replace("http://www.acme.com", "", regex=False)

    profile = pandas_profiling.ProfileReport(
        df, title="Pandas Profiling Report", html={"style": {"full_width": True}}
    )
    data = profile.to_json()
    assert '"PATH": 1' in data
    assert '"common_prefix": "/",' in data
def test_issue377():
    """With ``sort="None"`` the described variables keep the frame's column order."""
    csv_path = cache_file(
        "bank-full.csv",
        "https://storage.googleapis.com/erwinh-public-data/bankingdata/bank-full.csv",
    )

    # UCI Bank Marketing dataset; the file is semicolon-separated.
    bank = pd.read_csv(csv_path, sep=";")
    original_order = tuple(bank.columns.values)

    report = pandas_profiling.ProfileReport(
        bank,
        sort="None",
        pool_size=5,
        progress_bar=False,
    )
    described_variables = report.get_description()["variables"]

    assert original_order == tuple(described_variables.keys())
Exemplo n.º 9
0
def test_issue416():
    """Regression test named after issue 416, checked against the description.

    Reads the tab-separated products file, derives a ``path`` column by
    removing the host from ``url``, profiles the frame in explorative mode and
    checks the per-type counts and the common prefix of the ``path`` variable.
    """
    file_name = cache_file(
        "products.tsv",
        "https://raw.githubusercontent.com/mrichman/clickstream-pandas/master/products.tsv",
    )

    df = pd.read_csv(file_name, sep="\t")
    # The host is a literal string, not a pattern. Without regex=False,
    # pandas < 2.0 interprets it as a regular expression (each "." would match
    # any character) and warns about the changing default.
    df["path"] = df["url"].str.replace("http://www.acme.com", "", regex=False)

    profile = pandas_profiling.ProfileReport(
        df,
        title="Pandas Profiling Report",
        html={"style": {"full_width": True}},
        explorative=True,
    )
    data = profile.get_description()

    # Exactly one variable of each of these types is expected.
    assert data["table"]["types"][Categorical] == 1
    assert data["table"]["types"][Path] == 1
    assert data["table"]["types"][URL] == 1
    assert data["variables"]["path"]["common_prefix"] == "/"
Exemplo n.º 10
0
#%%
from pathlib import Path

import requests
import numpy as np
import pandas as pd

import pandas_profiling
from pandas_profiling.utils.cache import cache_file

# %%
# Resolve the mobility-trends CSV through cache_file and load it.
file_name = cache_file(
    "apple.csv",
    "https://raw.githubusercontent.com/anarinsk/adp-apple_mobility_trend/master/data/applemobilitytrends-2020-04-23.csv",
)
df = pd.read_csv(file_name)

#%%
# Prefix every column from the fourth one onwards with "time_" so that
# wide_to_long can treat them as a single "time" stub.
df = df.rename(columns={name: "time_" + name for name in df.columns[3:]})

# Reshape to long form: one row per (geo_type, region, transportation_type, date).
df = pd.wide_to_long(
    df,
    stubnames="time",
    i=["geo_type", "region", "transportation_type"],
    j="date",
    sep="_",
    suffix=".*",
).reset_index()

# The former column suffixes are parsed as ISO dates.
df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d")
df
Exemplo n.º 11
0
from pathlib import Path

import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    # Resolve the coal price CSV through cache_file (local name + source URL).
    csv_path = cache_file(
        "coal_prices.csv",
        r"https://fred.stlouisfed.org/graph/fredgraph.csv?bgcolor=%23e1e9f0&chart_type=line&drp=0&fo=open%20sans&graph_bgcolor=%23ffffff&height=450&mode=fred&recession_bars=off&txtcolor=%23444444&ts=12&tts=12&width=1168&nt=0&thu=0&trc=0&show_legend=yes&show_axis_titles=yes&show_tooltip=yes&id=PCOALAUUSDM&scale=left&cosd=1990-01-01&coed=2020-01-01&line_color=%234572a7&link_values=false&line_style=solid&mark_type=none&mw=3&lw=2&ost=-99999&oet=99999&mma=0&fml=a&fq=Monthly&fam=avg&fgst=lin&fgsnd=2009-06-01&line_index=1&transformation=lin&vintage_date=2020-02-12&revision_date=2020-02-12&nd=1990-01-01",
    )
    prices = pd.read_csv(csv_path, parse_dates=["DATE"])

    # Dark configuration file plus an empty logo in the HTML style settings.
    dark_config = Path("../../src/pandas_profiling/config_dark.yaml")
    html_settings = {"style": {"logo": ""}}

    ProfileReport(
        prices,
        title="Coal Prices (IMF)",
        config_file=dark_config,
        html=html_settings,
    ).to_file(Path("flatly_report.html"))
Exemplo n.º 12
0
from pathlib import Path

import numpy as np
import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    file_name = cache_file(
        "census_train.csv",
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    )

    # Names based on https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names
    df = pd.read_csv(
        file_name,
        header=None,
        index_col=False,
        names=[
            "age",
            "workclass",
            "fnlwgt",
            "education",
            "education-num",
            "marital-status",
            "occupation",
            "relationship",
            "race",
            "sex",
            "capital-gain",
import great_expectations as ge
import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

# Both examples below write to the same expectation suite name.
SUITE_NAME = "titanic_expectations"

file_name = cache_file(
    "titanic.csv",
    "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv",
)
df = pd.read_csv(file_name)
profile = ProfileReport(df, title="Pandas Profiling Report", explorative=True)

# Example 1
# Obtain the expectation suite with default settings. According to the
# original example notes this profiles the dataset, saves the suite,
# validates the dataframe and builds the data docs.
suite = profile.to_expectation_suite(suite_name=SUITE_NAME)

# Example 2
# Same call, but pointing Great Expectations at an existing set-up by
# passing a Data Context rooted at that directory.
data_context = ge.data_context.DataContext(context_root_dir="my_ge_root_directory/")
suite = profile.to_expectation_suite(
    suite_name=SUITE_NAME,
    data_context=data_context,
)

# Example 3
# Just build the suite
Exemplo n.º 14
0
# As featured on this Google Cloud Platform page:
# https://cloud.google.com/solutions/building-a-propensity-model-for-financial-services-on-gcp
from pathlib import Path

import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    # UCI Bank Marketing dataset, resolved through cache_file.
    csv_path = cache_file(
        "bank-full.csv",
        "https://storage.googleapis.com/erwinh-public-data/bankingdata/bank-full.csv",
    )
    # The file is semicolon-separated.
    bank = pd.read_csv(csv_path, sep=";")

    report_path = Path("uci_bank_marketing_report.html")
    ProfileReport(
        bank,
        title="Profile Report of the UCI Bank Marketing Dataset",
    ).to_file(report_path)
Exemplo n.º 15
0
from pathlib import Path

import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    # Resolve the Chicago employees CSV through cache_file and load it.
    csv_path = cache_file(
        "chicago_employees.csv",
        "https://data.cityofchicago.org/api/views/xzkq-xp2w/rows.csv?accessType=DOWNLOAD",
    )
    employees = pd.read_csv(csv_path)

    # Write the report next to the script.
    report_path = Path("./chicago_employees_report.html")
    ProfileReport(employees, title="Chicago Employees").to_file(output_file=report_path)
Exemplo n.º 16
0
import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    # Resolve the RDW parquet file through cache_file and load it.
    parquet_path = cache_file(
        "rdw.parquet",
        "https://raw.githubusercontent.com/pandas-profiling/pandas-profiling-data/master/data/rdw.parquet",
    )
    rdw = pd.read_parquet(parquet_path)

    # The report is generated in minimal mode.
    ProfileReport(rdw, title="RDW Dataset", minimal=True).to_file("rdw.html")
Exemplo n.º 17
0
from pathlib import Path

import numpy as np
import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    file_name = cache_file(
        "meteorites.csv",
        "https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD",
    )

    # Seed NumPy's global RNG so the random example columns below are reproducible.
    np.random.seed(7331)

    df = pd.read_csv(file_name)
    n_rows = len(df)

    # errors="coerce" turns values that cannot be converted into NaT instead of raising.
    # NOTE(review): the earlier comment said pandas cannot handle dates before
    # 1880; the documented lower bound for nanosecond timestamps is 1677-09-21.
    # Confirm which limit is relevant for this data.
    df["year"] = pd.to_datetime(df["year"], errors="coerce")

    # Example columns, added in this order so the RNG draws stay the same:
    # a constant variable ...
    df["source"] = "NASA"
    # ... a boolean variable ...
    df["boolean"] = np.random.choice([True, False], n_rows)
    # ... and a variable mixing base types.
    df["mixed"] = np.random.choice([1, "A"], n_rows)

    # Example: Unhashable
Exemplo n.º 18
0
 def getter(file_name, url):
     """Copy the cached file into ``tmpdir`` and return the copy's path as a string.

     ``tmpdir`` is taken from the enclosing scope, which is not visible here.
     """
     cached = cache_file(file_name, url)
     destination = str(Path(str(tmpdir)) / file_name)
     shutil.copy(str(cached), destination)
     return destination
Exemplo n.º 19
0
import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    # Resolve the colour names CSV through cache_file.
    csv_path = cache_file(
        "colors.csv",
        "https://github.com/codebrainz/color-names/raw/master/output/colors.csv",
    )

    # Column names are supplied explicitly via names=.
    column_names = ["Code", "Name", "Hex", "R", "G", "B"]
    colors = pd.read_csv(csv_path, names=column_names)

    ProfileReport(colors, title="Colors").to_file("colors_report.html")
Exemplo n.º 20
0
from pathlib import Path

import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    # Resolve the gzipped review dump through cache_file.
    archive_path = cache_file(
        "reviews_Musical_Instruments_5.json.gz",
        r"http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Musical_Instruments_5.json.gz",
    )

    # Gzip-compressed, one JSON record per line.
    reviews = pd.read_json(archive_path, compression="gzip", lines=True)

    report_path = Path("./review_report.html")
    ProfileReport(
        reviews,
        title="Amazon Musical Instrument Review | Profile Report",
    ).to_file(report_path)
Exemplo n.º 21
0
import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    # Resolve the Vektis postcode CSV through cache_file.
    csv_path = cache_file(
        "vektis_postcodes.csv",
        "https://www.vektis.nl/uploads/Docs%20per%20pagina/Open%20Data%20Bestanden/2017/Vektis%20Open%20Databestand%20Zorgverzekeringswet%202017%20-%20postcode3.csv",
    )
    postcodes = pd.read_csv(csv_path, sep=";", low_memory=False)

    # Switch off these four correlation calculations and the Bayesian-blocks
    # histogram binning.
    disabled_correlations = {
        name: {"calculate": False}
        for name in ("recoded", "kendall", "phi_k", "cramers")
    }
    plot_settings = {"histogram": {"bayesian_blocks_bins": False}}

    vektis_report = ProfileReport(
        postcodes,
        title="Vektis Postal Code 2017",
        correlations=disabled_correlations,
        plot=plot_settings,
    )
    # The second positional argument is passed through unchanged.
    vektis_report.to_file("vektis_report.html", True)
from pathlib import Path

import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    # Dataset suggested by @adamrossnelson.
    stata_path = cache_file(
        "auto2.dta",
        "http://www.stata-press.com/data/r15/auto2.dta",
    )
    autos = pd.read_stata(stata_path)

    # NOTE(review): an earlier comment here read "Length left out due to
    # correlation with weight", but no column is dropped in this script.
    auto_report = ProfileReport(
        autos,
        title="1978 Automobile dataset",
        explorative=True,
    )
    auto_report.to_file(Path("stata_auto_report.html"))
Exemplo n.º 23
0
from pathlib import Path

import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    # Resolve the URL test list through cache_file.
    csv_path = cache_file(
        "websites.csv",
        "https://raw.githubusercontent.com/berkmancenter/url-lists/master/lists/et.csv",
    )
    # date_added is parsed as a datetime column while reading.
    websites = pd.read_csv(csv_path, parse_dates=["date_added"])

    # Only the "cramers" correlation calculation is switched off.
    correlation_settings = {"cramers": {"calculate": False}}

    ProfileReport(
        websites,
        title="Website Inaccessibility Test Lists",
        correlations=correlation_settings,
    ).to_file(Path("./website_inaccessibility_report.html"))