def test_dataset_schema():
    """Check that user-supplied dataset metadata is rendered in the HTML report.

    The report is built with ``dataset=metadata``; the test then looks for the
    "Dataset" section header, one table header per plain metadata key, the
    combined copyright row, the linked URL row and the "Reproduction" header.
    """
    file_name = cache_file("auto2.dta", "http://www.stata-press.com/data/r15/auto2.dta")
    df = pd.read_stata(file_name)

    metadata = {
        "creator": "Firstname Lastname",
        "author": "Firstname Lastname",
        "description": "This profiling report was generated using a sample of 5% of the original dataset.",
        "copyright_holder": "RandoCorp LLC",
        "copyright_year": "2020",
        "url": "http://www.dataset-sources.com/data/dataset.dat",
    }

    # Length left out due to correlation with weight.
    report = df.profile_report(
        title="Dataset schema",
        dataset=metadata,
        minimal=True,
    )

    html = report.to_html()

    assert "<p class=h4>Dataset</p>" in html

    # The copyright_* keys and the url are rendered as dedicated rows (checked
    # below); every other key is expected as a capitalised table header.
    for key in metadata:
        if not key.startswith("copyright_") and key != "url":
            assert f"<th>{key.capitalize()}</th>" in html

    # These two checks previously asserted a bare (always truthy) string
    # literal; they must test membership in the rendered HTML to be meaningful.
    assert "<tr><th>Copyright</th><td>(c) RandoCorp LLC 2020</td></tr>" in html
    assert (
        '<tr><th>URL</th><td><a href="http://www.dataset-sources.com/data/dataset.dat">'
        "http://www.dataset-sources.com/data/dataset.dat</a></td></tr>"
    ) in html

    assert "<p class=h4>Reproduction</p>" in html
def test_titanic_default(benchmark):
    """Benchmark ``func`` without extra keyword arguments on the Titanic data."""
    titanic_path = cache_file(
        "titanic.parquet",
        "https://github.com/pandas-profiling/pandas-profiling-data/raw/master/data/titanic.parquet",
    )
    titanic = pd.read_parquet(titanic_path)

    # No keyword arguments are bound: the target runs with its own defaults.
    target = partial(func)
    benchmark(target, titanic)
def test_titanic_explorative(benchmark):
    """Benchmark ``func`` with ``explorative=True`` on the Titanic data."""
    titanic_path = cache_file(
        "titanic.parquet",
        "https://github.com/pandas-profiling/pandas-profiling-data/raw/master/data/titanic.parquet",
    )
    titanic = pd.read_parquet(titanic_path)

    target = partial(func, explorative=True)
    benchmark(target, titanic)
def test_rdw_minimal(benchmark):
    """Benchmark ``func`` with ``minimal=True`` on the RDW 100k sample."""
    rdw_path = cache_file(
        "rdw_sample_100k.parquet",
        "https://github.com/pandas-profiling/pandas-profiling-data/raw/master/data/rdw_sample_100k.parquet",
    )
    rdw_sample = pd.read_parquet(rdw_path)

    target = partial(func, minimal=True)
    benchmark(target, rdw_sample)
def test_titanic_minimal(benchmark):
    """Benchmark ``func`` with ``minimal=True`` on the Titanic data."""
    titanic_path = cache_file(
        "titanic.parquet",
        "https://github.com/pandas-profiling/pandas-profiling-data/raw/master/data/titanic.parquet",
    )
    titanic = pd.read_parquet(titanic_path)

    target = partial(func, minimal=True)
    benchmark(target, titanic)
def test_dataset_schema_empty():
    """Check that no "Dataset" section is rendered when ``dataset=None``.

    The "Reproduction" section is still expected, wrapped in a full-width
    column div.
    """
    stata_path = cache_file("auto2.dta", "http://www.stata-press.com/data/r15/auto2.dta")
    frame = pd.read_stata(stata_path)

    # Length left out due to correlation with weight.
    empty_schema_report = frame.profile_report(
        title="Dataset schema empty",
        minimal=True,
        dataset=None,
    )
    rendered = empty_schema_report.to_html()

    assert "<p class=h4>Dataset</p>" not in rendered
    assert "<div class=col-sm-12><p class=h4>Reproduction</p>" in rendered
def test_issue416():
    """Regression test for issue 416, checked against the JSON export.

    A ``path`` column is derived from ``url`` by stripping the host prefix;
    the JSON output is then expected to report one ``PATH`` variable and a
    common prefix of ``/``.
    """
    file_name = cache_file(
        "products.tsv",
        "https://raw.githubusercontent.com/mrichman/clickstream-pandas/master/products.tsv",
    )

    df = pd.read_csv(file_name, sep="\t")
    # regex=False: the host prefix is a literal string. Without it the result
    # depends on the pandas version's default, and in regex mode every "."
    # would match any character.
    df["path"] = df["url"].str.replace("http://www.acme.com", "", regex=False)

    profile = pandas_profiling.ProfileReport(
        df, title="Pandas Profiling Report", html={"style": {"full_width": True}}
    )
    data = profile.to_json()

    assert '"PATH": 1' in data
    assert '"common_prefix": "/",' in data
def test_issue377():
    """Regression test for issue 377: ``sort="None"`` keeps the column order.

    The variables in the profile description must appear in the same order
    as the columns of the input DataFrame.
    """
    # Download the UCI Bank Marketing Dataset
    csv_path = cache_file(
        "bank-full.csv",
        "https://storage.googleapis.com/erwinh-public-data/bankingdata/bank-full.csv",
    )
    frame = pd.read_csv(csv_path, sep=";")
    original_order = tuple(frame.columns.values)

    report = pandas_profiling.ProfileReport(
        frame, sort="None", pool_size=5, progress_bar=False
    )
    described_variables = report.get_description()["variables"]
    new_order = tuple(described_variables.keys())

    assert original_order == new_order
def test_issue416():
    """Regression test for issue 416, checked against the description dict.

    A ``path`` column is derived from ``url`` by stripping the host prefix.
    With ``explorative=True`` the type table is expected to count exactly one
    ``Categorical``, one ``Path`` and one ``URL`` variable, and the ``path``
    variable must report ``/`` as its common prefix.
    """
    file_name = cache_file(
        "products.tsv",
        "https://raw.githubusercontent.com/mrichman/clickstream-pandas/master/products.tsv",
    )

    df = pd.read_csv(file_name, sep="\t")
    # regex=False: the host prefix is a literal string. Without it the result
    # depends on the pandas version's default, and in regex mode every "."
    # would match any character.
    df["path"] = df["url"].str.replace("http://www.acme.com", "", regex=False)

    profile = pandas_profiling.ProfileReport(
        df,
        title="Pandas Profiling Report",
        html={"style": {"full_width": True}},
        explorative=True,
    )
    data = profile.get_description()

    # Categorical, Path and URL are names defined outside this block —
    # presumably the profiler's variable-type classes; verify against imports.
    assert data["table"]["types"][Categorical] == 1
    assert data["table"]["types"][Path] == 1
    assert data["table"]["types"][URL] == 1
    assert data["variables"]["path"]["common_prefix"] == "/"
#%%
from pathlib import Path
import requests
import numpy as np
import pandas as pd
import pandas_profiling
from pandas_profiling.utils.cache import cache_file

# %%
# Fetch the Apple mobility trends export through the cache helper and load it.
file_name = cache_file(
    "apple.csv",
    "https://raw.githubusercontent.com/anarinsk/adp-apple_mobility_trend/master/data/applemobilitytrends-2020-04-23.csv",
)
df = pd.read_csv(file_name)

#%%
# Prefix every column from the fourth onwards with "time_".
trailing_columns = df.columns[3:]
df.rename(
    columns=dict(zip(trailing_columns, "time_" + trailing_columns)),
    inplace=True,
)
# df = pd.wide_to_long(df, stubnames='time', i=['geo_type', 'region', 'transportation_type'], j='date', sep='_', suffix='.*')
df = df.reset_index()
# df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df
from pathlib import Path

import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    # CSV export URL of the FRED graph for series PCOALAUUSDM.
    source_url = r"https://fred.stlouisfed.org/graph/fredgraph.csv?bgcolor=%23e1e9f0&chart_type=line&drp=0&fo=open%20sans&graph_bgcolor=%23ffffff&height=450&mode=fred&recession_bars=off&txtcolor=%23444444&ts=12&tts=12&width=1168&nt=0&thu=0&trc=0&show_legend=yes&show_axis_titles=yes&show_tooltip=yes&id=PCOALAUUSDM&scale=left&cosd=1990-01-01&coed=2020-01-01&line_color=%234572a7&link_values=false&line_style=solid&mark_type=none&mw=3&lw=2&ost=-99999&oet=99999&mma=0&fml=a&fq=Monthly&fam=avg&fgst=lin&fgsnd=2009-06-01&line_index=1&transformation=lin&vintage_date=2020-02-12&revision_date=2020-02-12&nd=1990-01-01"

    # PNG logo embedded as a base64 data URI, passed to the report's HTML style.
    logo_data_uri = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAEAAAAArCAYAAADIWo5HAAAABGdBTUEAALGPC/xhBQAAACBjSFJNAAB6JQAAgIMAAPn/AACA6QAAdTAAAOpgAAA6mAAAF2+SX8VGAAAABmJLR0QAAAAAAAD5Q7t/AAAACXBIWXMAAAsTAAALEwEAmpwYAAAAB3RJTUUH5AIMFC0AShJOrwAACuVJREFUaN7tmWnMpXV5xn//7dnec867DTMvzsAww2xUsYVCYyEYbVUgBSPSRm1iE/nQJk0a28TYxdgmGhJbSRdJl9TGllZR0ZombZq0SRdRC12cqGDJIINI4J31Xc+z/7d+eA6TpuknGHghmevDOck5eZ7nPte5l+u6/3AJl3AJl3AJLz9+5SO/rO69/zfFqy0u/Uo9aG11/W1bZ7fOPPL8E9954BN/MLe9vaXOnDp/TaL1DWVVduPxeCMfzf39lz//UCWE4Lqbb1Rr61vi2SeedC9nXC/bP/LJP/09uWdu+eC/P/rogaqs31KV5Xud8/HcuXP/BnFvhMz7cCCGcHlZljHL0q4o5v7x4NUHzj/1vadCmuflZZfteqqs6y+/+923n3v00f/Sn/7jz/SvCQLe9YGfkbkz781M+rHnnntut7NuHCOUZYn3Hu8dRhuEECilmE5LlpaX6Pse7z1CwFwxR5Im9frGxmMHDuz/j/F4FDtr733wrx48+6ol4Pbb3ibfdNPN4/88/q3DW+vrD9iu/yEQjMcjyqrCO08IAR88RZ4TQkApSQygtGF7ewshBCYZyAk+4LxHK8ni0kKYzE8+eNWxI39038c/ES5WzOpiEnDNtW849vTJ7//l2dNnfr6clocn4zExBJaWltjenkIMIARJmpAXBc55qqomz3O6riNNUwCccwM5UhJ8QClFCFGUZXV9My33Hzp8aP/eq/Z3z5x8+uyrhoD7/vwP5ZlnVj985vTZn9VSLmqt8TFie0td1zRNQ1NXCCFACKSQeO9BQJKmlFVF13UorUiThPFoPCSoEKRZSt/1tG03cs79WAi+S5Lkn3/6595z/hv/8rVXBwGpTN66ubb+8XJ7OlJKoaSknJZorZFSoJUiAtYHpFJ451lYWEApRdPU9H2HFKCMYXlpiaosiUDTNPR9T9t1+BCIIcY0zeyhq6++9vzZc4889u3HtnecgLvf/x65dvrcR6WQN2utqKqKpmkpioIYA0JInHMoozFGI6WkKHJCjDjnEAKSJEEpjVKaqqpxzuG8J4ZIJOKdwyQJ3nsRvF/J8/yo0ebsgaNHv3niu//tX2zs8mIQ4DprXG9HbdMyLStCCCRJQp7nF+raGENqEsajEVII+q6na1vqukZIiQ8R7wPBe+p6uEfwnjQbrplMxgTvEcC0LFl9flWfPnX67q5v515K7BeFgL7qbnHO39J2LVmWoo0hyzKEgDzPgUjbtvR9j3OeECO9tSilGY1GECGEQIwREBTFHEopjEmQQiKVZmFhkSRJyLKcIh8aaFlWC0uThcmOE9D1/ZEQwh4pFUpr8rwgztJ7c3OLEOMgO7Wmt5aiKAgxYp0lEsnynKIocN7TtA0hREKMhBjY2NgkBM90OsU5T17k5HnO9nSKEGK36+2Re37xF+SOEXD7XXekaaL7ELy3fY/rLQJIjGFxYYEQAzEEjNGkaYoxhr7rSZIErTREmL0wHo1QSiGEQAhB0zQkiaG3lo3NLaSUVGWJsxbvHWVZzq+urn5oe3s6vyNe4Efe8iaTm/QjvevuiTEqgBgjIXhC8Kytr5MYQ4wghKCqK5RUSKVItKbtOrTWlFVFmqQkSULTNOzdu4yzPYkxTMuSqiwRArquwxhDMVegpcQ5J43Sdjw3bl5xAq561w28fnLonVvrG79UltWCVhovPMaYC/OdGFGzkSelpG0b+ghLi4tkeYZSkq7vUVLRti2RyGQywdqe6fY2SimUkoxHI+q6xmiNdXYohxAQAlZPnbq8izEH2leUgDTN6a3d3tzcdINIaUCIYdwphZRySOUQaJuWNE3IsgzrPF3X0dkeIQRd2yGVQoihEWZZTts2aK3RxqCDJsZABHwIOOfprWU8GrO5tUmWZ8uL86PLgI1XtAec+OLXqKryuTRNN5IkQUo1zOwYEULgrKNtWuJM7xdFPvsu0tke2/c461haWiLPsmFsZhlKCrRUxBgpy3ImgnqIkel0CgjSJKG3lizLuGLfvt1pYg7vSBPMsvTJYjL+rbqp2xAH7S4FGG2QSpJmCc47vPf0bT+4QedJkwQAa3vWN9apqgopJSZJmE63ccFhTELwHpMkaKlwzpNlGXmezRpkzfxkwhX79mVXXXnlNfp1r3vlCfjKQ1/xSsnpZDLv4szlCSmp6oq6qgapKQfl52NACIGUkrquCT4QQqBrW3rbY61la3MT7wNN3eCcI01TnLWEGNB6yLAXLLTWQ2nUdfPd6XQ6uevtb013ZAzu3rXr2N59+0ZplmGMQSuDVoo0TQcRIwXGJCilSAYpi0AQYsBZhxRDCG3bXHCBaZaBiEg1GCatNUoqTDLcx/vBVpdVtfX9Z3/wYSfD76hE2x0h4Nz5839TVuUj4/FkaGp9N0tng9Z6FqzHOYd3fsgCJQdZKwe3F0NADmMN5yzW9jR1c0EhVlVNkqaE4IkhYK0lMYbEJGmeZ7caJe0XPv1A2BECDt3whqerqvrO2vnzxDg0wQjEMDM6sx+plKJ3Fuc9Pnh8CHjvkWKQvkJIvHd03dAcEaC1IkvTIdWbCmM0bdfivaeuG9q+X02z/F+LYi7siBACeP7kSaQQnTaaGBkcYJjVcfBIMUyAth3GtLMW7wQCgdaatm2p6hqIaG1YXJin63u881RVRfCB0Xg8W5UJrHXkeY73gYWFxY3DR4989fc/+bv2xcb/ku3wY8cfiz/+5pvWBbx5e2tr2VorYoyoWf0O8z3OmtZQ18O4hDxLEUKitCKEgBRyVkY9XddjtMaHoVk67+naFpMks/soLr9897LtO7334P5HTp743s70AIADRw4cT7PsU0op0XUdXd8jtWI8Gg27veCxM/3e9z3GaLRWWOtmG6CUNM1wztFbi5QSKQRSKZaXl4elymyTlKXpIJjSlLW19eb551e/1QeaHcsAgIf/6avhjW+8VnVdf6NUakXMPIGQkr7v6a29IG6882ht6LpusNJdj5By2H69cJ0Qg5v0ntFohPeeLEupqxoYRmkxl/9AGvOrpzbX//r4w9940SVwUTIA4OpDhzaOXXPU7L5sF0oqyrIihllvmnVzYwxFUeC9H8ZZCEitcN7hrMUkyWx7NHgHJSUbGxuUVYmdZUZR5Bw8dPDsaH7+12+56cbPPvXtx9uXEvdFOxmKxPUQwp8VRf6+NM/2pbZfcd4LBEgpUFoxHo+YTsshxaVEKTXTAN3svSXEyGhujhACTdOCANdbNu0W4/HY7lnZ88DiwuTrB19/7Esf/dBvvOT1+EU/GLnjzluv955JVdV3ee9vapr2qFZ6vL62xng8wvtA3/VY5/DeDU1wZoaEkLRdy6iYm2VMArOtUl7krKysPB6VeMdDD37h1Kv2bPDv/vYfjgO8485bH+/qbnnvrstWjFQ/UTf1fiHlbUWa7bHWDvu+NGVpcYnNrS0Ew2TQStF13aAagyfPMubnJxw4cPCRw0cP/drJZ58585o4G/y/+Mk7b5sUUr/vyn1XfOzEE0/OlVVZOOvE/PykbZpGN03rQoyn2rbZmyapHY3Hm3mR73XOrY7Go2/OTSb3PfTg5x9+zRyO/n94+0/dtrB3ZWXf2dPnlibj0R3b29tXBsQDWsocgcjz/MTa+bXfbtt247ofvf4vrv3h6w6XdfP1aV+dWDfb9v4P3htf0wT8b7z/ng+M+r7XX/zs5zZf+OzOu9+pluaXD+ej0XN/8qn7Sy7hEi7hEi7h5cX/AJEc/aDrj11yAAAAJXRFWHRkYXRlOmNyZWF0ZQAyMDIwLTAyLTEyVDIwOjQ1OjAwKzAwOjAwz7DySAAAACV0RVh0ZGF0ZTptb2RpZnkAMjAyMC0wMi0xMlQyMDo0NTowMCswMDowML7tSvQAAAAASUVORK5CYII="

    csv_path = cache_file("coal_prices.csv", source_url)
    prices = pd.read_csv(csv_path, parse_dates=["DATE"])

    html_options = {"style": {"logo": logo_data_uri}}
    coal_report = ProfileReport(
        prices,
        title="Coal Prices (IMF)",
        config_file=Path("../../src/pandas_profiling/config_dark.yaml"),
        html=html_options,
    )
    coal_report.to_file(Path("flatly_report.html"))
from pathlib import Path import numpy as np import pandas as pd from pandas_profiling import ProfileReport from pandas_profiling.utils.cache import cache_file if __name__ == "__main__": file_name = cache_file( "census_train.csv", "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", ) # Names based on https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names df = pd.read_csv( file_name, header=None, index_col=False, names=[ "age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain",
import great_expectations as ge
import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

# Example script: derive Great Expectations suites from a profiling report
# of the Titanic dataset.

# Fetch the Titanic CSV through the cache helper and load it.
file_name = cache_file(
    "titanic.csv",
    "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv",
)

df = pd.read_csv(file_name)

profile = ProfileReport(df, title="Pandas Profiling Report", explorative=True)

# Example 1
# Obtain expectation suite, this includes profiling the dataset, saving the expectation suite, validating the
# dataframe, and building data docs
suite = profile.to_expectation_suite(suite_name="titanic_expectations")

# Example 2
# Run Great Expectations while specifying the directory with an existing Great Expectations set-up by passing in a
# Data Context
# NOTE(review): "my_ge_root_directory/" looks like a placeholder path — confirm
# it is meant to be replaced by the reader.
data_context = ge.data_context.DataContext(
    context_root_dir="my_ge_root_directory/")

# Rebinds `suite`; the value from Example 1 is discarded.
suite = profile.to_expectation_suite(suite_name="titanic_expectations", data_context=data_context)

# Example 3
# Just build the suite
# As featured on this Google Cloud Platform page:
# https://cloud.google.com/solutions/building-a-propensity-model-for-financial-services-on-gcp
from pathlib import Path

import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    # Download the UCI Bank Marketing Dataset
    dataset_url = "https://storage.googleapis.com/erwinh-public-data/bankingdata/bank-full.csv"
    csv_path = cache_file("bank-full.csv", dataset_url)

    # The file is semicolon-separated.
    bank_df = pd.read_csv(csv_path, sep=";")

    report_title = "Profile Report of the UCI Bank Marketing Dataset"
    bank_profile = ProfileReport(bank_df, title=report_title)
    bank_profile.to_file(Path("uci_bank_marketing_report.html"))
from pathlib import Path

import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    # Fetch the Chicago employees CSV through the cache helper.
    dataset_url = "https://data.cityofchicago.org/api/views/xzkq-xp2w/rows.csv?accessType=DOWNLOAD"
    csv_path = cache_file("chicago_employees.csv", dataset_url)

    employees = pd.read_csv(csv_path)

    # Profile the data and write the HTML report to the working directory.
    employees_profile = ProfileReport(employees, title="Chicago Employees")
    destination = Path("./chicago_employees_report.html")
    employees_profile.to_file(output_file=destination)
import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    # Fetch the RDW parquet file through the cache helper.
    dataset_url = "https://raw.githubusercontent.com/pandas-profiling/pandas-profiling-data/master/data/rdw.parquet"
    parquet_path = cache_file("rdw.parquet", dataset_url)

    rdw = pd.read_parquet(parquet_path)

    # Build the report with minimal=True and write it out.
    rdw_profile = ProfileReport(rdw, title="RDW Dataset", minimal=True)
    rdw_profile.to_file("rdw.html")
from pathlib import Path

import numpy as np
import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

# Example script: load the NASA meteorites CSV and add synthetic columns that
# illustrate different variable kinds.
if __name__ == "__main__":
    file_name = cache_file(
        "meteorites.csv",
        "https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD",
    )

    # Set a seed for reproducibility
    np.random.seed(7331)

    df = pd.read_csv(file_name)

    # Note: values that cannot be converted to a pandas timestamp become NaT
    # because of errors="coerce", so they are ignored for this analysis.
    # NOTE(review): the original comment put the cut-off at 1880; pandas'
    # nanosecond timestamps reach back to 1677-09-21 — confirm the intent.
    df["year"] = pd.to_datetime(df["year"], errors="coerce")

    # Example: Constant variable
    df["source"] = "NASA"

    # Example: Boolean variable
    df["boolean"] = np.random.choice([True, False], df.shape[0])

    # Example: Mixed with base types
    # NOTE(review): np.random.choice presumably converts [1, "A"] to a NumPy
    # array first, which would turn both options into strings — verify that
    # this column really ends up with mixed types.
    df["mixed"] = np.random.choice([1, "A"], df.shape[0])

    # Example: Unhashable
def getter(file_name, url):
    """Return the path of a copy of the cached file inside ``tmpdir``.

    The file is obtained through ``cache_file(file_name, url)`` and copied
    into the directory named by ``tmpdir``, a name taken from the surrounding
    scope. The copy's path is returned as a string.
    """
    cached_copy = cache_file(file_name, url)

    # Move to temporary directory
    destination = Path(str(tmpdir)) / file_name
    shutil.copy(str(cached_copy), str(destination))
    return str(destination)
import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    # Fetch the colour-names CSV through the cache helper.
    dataset_url = "https://github.com/codebrainz/color-names/raw/master/output/colors.csv"
    csv_path = cache_file("colors.csv", dataset_url)

    # Column names are supplied explicitly via `names`.
    column_names = ["Code", "Name", "Hex", "R", "G", "B"]
    colors = pd.read_csv(csv_path, names=column_names)

    colors_report = ProfileReport(colors, title="Colors")
    colors_report.to_file("colors_report.html")
from pathlib import Path

import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    # Fetch the gzip-compressed Amazon review file through the cache helper.
    dataset_url = r"http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Musical_Instruments_5.json.gz"
    archive_path = cache_file("reviews_Musical_Instruments_5.json.gz", dataset_url)

    # Read as line-delimited JSON (lines=True) with gzip decompression.
    reviews = pd.read_json(archive_path, compression="gzip", lines=True)

    report_title = "Amazon Musical Instrument Review | Profile Report"
    reviews_profile = ProfileReport(reviews, title=report_title)
    reviews_profile.to_file(Path("./review_report.html"))
import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    # Fetch the Vektis 2017 postal-code CSV through the cache helper.
    dataset_url = "https://www.vektis.nl/uploads/Docs%20per%20pagina/Open%20Data%20Bestanden/2017/Vektis%20Open%20Databestand%20Zorgverzekeringswet%202017%20-%20postcode3.csv"
    csv_path = cache_file("vektis_postcodes.csv", dataset_url)

    postcodes = pd.read_csv(csv_path, sep=";", low_memory=False)

    # Switch off these four correlation measures.
    disabled_correlations = {
        measure: {"calculate": False}
        for measure in ("recoded", "kendall", "phi_k", "cramers")
    }
    plot_options = {"histogram": {"bayesian_blocks_bins": False}}

    vektis_report = ProfileReport(
        postcodes,
        title="Vektis Postal Code 2017",
        correlations=disabled_correlations,
        plot=plot_options,
    )
    vektis_report.to_file("vektis_report.html", True)
from pathlib import Path

import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    dataset_url = "http://www.stata-press.com/data/r15/auto2.dta"
    stata_path = cache_file("auto2.dta", dataset_url)

    # Suggested by @adamrossnelson
    autos = pd.read_stata(stata_path)

    # Length left out due to correlation with weight.
    autos_report = ProfileReport(
        autos,
        title="1978 Automobile dataset",
        explorative=True,
    )
    autos_report.to_file(Path("stata_auto_report.html"))
from pathlib import Path

import pandas as pd

from pandas_profiling import ProfileReport
from pandas_profiling.utils.cache import cache_file

if __name__ == "__main__":
    # Fetch the URL test list through the cache helper.
    dataset_url = "https://raw.githubusercontent.com/berkmancenter/url-lists/master/lists/et.csv"
    csv_path = cache_file("websites.csv", dataset_url)

    # Parse "date_added" as a datetime column while reading.
    websites = pd.read_csv(csv_path, parse_dates=["date_added"])

    # The "cramers" correlation is switched off for this report.
    correlation_options = {"cramers": {"calculate": False}}
    websites_profile = ProfileReport(
        websites,
        title="Website Inaccessibility Test Lists",
        correlations=correlation_options,
    )
    websites_profile.to_file(Path("./website_inaccessibility_report.html"))