Exemplo n.º 1
0
def test_urls(get_data_file):
    """Check that the ``url`` column is reported as a URL variable.

    Args:
        get_data_file: Fixture called with a local file name and a source
            URL; its return value is handed to ``pd.read_csv``.
    """
    file_name = get_data_file(
        "whitelist_urls.csv",
        "https://raw.githubusercontent.com/openeventdata/scraper/master/whitelist_urls.csv",
    )

    df = pd.read_csv(file_name,
                     header=None,
                     names=["source", "url", "reach", "language"])

    # Add ~10% missing values
    df = df.mask(np.random.random(df.shape) < 0.1)

    profile = ProfileReport(
        df,
        title="DataFrame with URL column",
        samples={
            "head": 0,
            "tail": 0
        },
        explorative=True,
    )

    # Render once and reuse the markup for both checks instead of asking
    # the report for its HTML twice.
    html = profile.to_html()
    assert "<small>URL</small>" in html, "URL not detected"
    assert "<th>URL</th>" in html, "URL not detected"
Exemplo n.º 2
0
def test_issue200():
    """A frame indexed by strings should still yield a titled report."""
    frame = pd.DataFrame([0, 1, 2], columns=["a"], index=["0", "1", "2"])
    assert frame.index.dtype == "object", "Index type should be 'object'"

    profile = ProfileReport(frame, title="String indices")
    rendered = profile.to_html()
    assert "<title>String indices</title>" in rendered, "Profile report should be generated."
Exemplo n.º 3
0
def test_load(get_data_file, test_output_dir):
    """Dump a profile to disk, load it back, and compare the JSON output.

    Args:
        get_data_file: Fixture called with a local file name and a source
            URL; its return value is handed to ``pd.read_csv``.
        test_output_dir: Directory-like object supporting ``/`` to build the
            path of the dumped profile.
    """
    file_name = get_data_file(
        "meteorites.csv",
        "https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD",
    )

    # For reproducibility
    np.random.seed(7331)

    df = pd.read_csv(file_name)

    # errors="coerce" turns years that cannot be converted to a timestamp
    # into NaT, so those rows are effectively ignored for this analysis.
    df["year"] = pd.to_datetime(df["year"], errors="coerce")

    # Example: Constant variable
    df["source"] = "NASA"

    # Example: Boolean variable
    df["boolean"] = np.random.choice([True, False], df.shape[0])

    # Example: Mixed with base types
    df["mixed"] = np.random.choice([1, "A"], df.shape[0])

    # Example: Highly correlated variables
    df["reclat_city"] = df["reclat"] + np.random.normal(scale=5, size=(len(df)))

    # Example: Duplicate observations
    duplicates_to_add = pd.DataFrame(df.iloc[0:10].copy())

    df = pd.concat([df, duplicates_to_add], ignore_index=True)

    profile1 = ProfileReport(
        df,
        title="NASA Meteorites",
        samples={"head": 5, "tail": 5},
        duplicates={"head": 10},
        minimal=True,
        progress_bar=False,
    )

    test_output_path = test_output_dir / "NASA-Meteorites.pp"
    # Order matters here: the JSON is produced before the dump, the HTML
    # only afterwards.
    json1 = profile1.to_json()
    profile1.dump(test_output_path)
    _ = profile1.to_html()

    assert test_output_path.exists(), "Output file does not exist"

    profile2 = ProfileReport(df, progress_bar=False).load(test_output_path)
    # json1 was computed before the dump, so the loaded profile is expected
    # to carry a description set already.
    assert isinstance(profile2._description_set, dict)

    # The HTML of profile1 was rendered only after the dump, so the loaded
    # profile is expected to have no report yet.
    assert profile2._report is None

    json2 = profile2.to_json()

    # Both profiles should generate the same output.
    assert json1 == json2
Exemplo n.º 4
0
def test_issue_169_index(issue_169_data):
    """Profile the issue-169 data with its first column used as the index.

    Args:
        issue_169_data: Fixture value passed straight to ``pd.read_csv``.
    """
    df = pd.read_csv(issue_169_data, sep=",", index_col=0)
    report = ProfileReport(df,
                           missing_diagrams={
                               "dendrogram": True,
                               "heatmap": True
                           })
    html = report.to_html()
    # isinstance is the idiomatic type check (and accepts str subclasses).
    assert isinstance(html, str)
    assert "<p class=h4>Dataset statistics</p>" in html
Exemplo n.º 5
0
def test_issue671():
    """Render a small numeric frame for each low-categorical threshold 0..9."""
    frame = pd.DataFrame([0, 5, 22, 32, 65, np.nan], columns=["a"])

    for threshold in range(10):
        num_settings = {"num": {"low_categorical_threshold": threshold}}
        report = ProfileReport(frame, vars=num_settings, progress_bar=False)
        rendered = report.to_html()
        assert len(rendered) > 0
Exemplo n.º 6
0
def make_report(df):
    """Build a non-minimal report for ``df`` and check the rendered HTML.

    Args:
        df: Data handed to ``ProfileReport``.

    Raises:
        AssertionError: If the rendered report is not a string or lacks the
            "Dataset info" heading.
    """
    report = ProfileReport(
        df,
        minimal=False,
        pool_size=0,
        sort="None",
        title="Dataset with <em>Numeric</em> Categories",
    )
    html = report.to_html()
    # Separate asserts so a failure shows which condition broke; isinstance
    # replaces the exact-type comparison.
    assert isinstance(html, str)
    assert '<p class="h2">Dataset info</p>' in html
Exemplo n.º 7
0
def test_issue147(tmpdir):
    """Profile a parquet file that was read through the pyarrow engine.

    Args:
        tmpdir: Temporary directory; the downloaded file is written into it.
    """
    file_name = Path(str(tmpdir)) / "userdata1.parquet"
    data = requests.get(
        "https://github.com/Teradata/kylo/raw/master/samples/sample-data/parquet/userdata2.parquet",
        # Without a timeout a stalled connection would hang the test run.
        timeout=60,
    )
    # Fail here on an HTTP error instead of writing an error page to disk
    # and getting an opaque parquet parsing failure later.
    data.raise_for_status()
    file_name.write_bytes(data.content)

    df = pd.read_parquet(str(file_name), engine="pyarrow")
    report = ProfileReport(df, title="PyArrow with Pandas Parquet Backend")
    html = report.to_html()
    assert isinstance(html, str)
    assert '<p class="h2">Dataset info</p>' in html
Exemplo n.º 8
0
def test_issue_169_column(issue_169_data):
    """Profile the issue-169 data with every column kept as a regular column.

    Args:
        issue_169_data: Fixture value passed straight to ``pd.read_csv``.
    """
    df = pd.read_csv(issue_169_data, sep=",")
    report = ProfileReport(
        df,
        missing_diagrams={"dendrogram": True, "heatmap": True},
        progress_bar=False,
        pool_size=1,
    )
    html = report.to_html()
    # isinstance is the idiomatic type check (and accepts str subclasses).
    assert isinstance(html, str)
    assert "<p class=h4>Dataset statistics</p>" in html
Exemplo n.º 9
0
def test_example_empty():
    """An empty frame yields empty description sections and an 'empty' notice."""
    empty_frame = pd.DataFrame({"A": [], "B": []})
    report = ProfileReport(empty_frame)
    summary = report.get_description()

    # None of these sections should contain anything for an empty dataset.
    for section in ("correlations", "missing", "sample"):
        assert len(summary[section]) == 0

    rendered = report.to_html()
    assert "Dataset is empty" in rendered
Exemplo n.º 10
0
def test_issue147(get_data_file):
    """Profile a parquet file that was read through the pyarrow engine.

    Args:
        get_data_file: Fixture called with a local file name and a source
            URL; its return value is used as the parquet file path.
    """
    file_name = get_data_file(
        "userdata1.parquet",
        "https://github.com/Teradata/kylo/raw/master/samples/sample-data/parquet/userdata2.parquet",
    )

    df = pd.read_parquet(str(file_name), engine="pyarrow")
    report = ProfileReport(df, title="PyArrow with Pandas Parquet Backend")
    html = report.to_html()
    # isinstance is the idiomatic type check (and accepts str subclasses).
    assert isinstance(html, str)
    assert "<p class=h4>Dataset statistics</p>" in html
Exemplo n.º 11
0
def test_issue545(get_data_file):
    """Render an explorative report for the pickled sample frame.

    Args:
        get_data_file: Fixture called with a local file name and a source
            URL; its return value is used as the pickle file path.
    """
    pickle_path = get_data_file(
        "sample_eda_df.pkl",
        "https://github.com/justinsola/justinsola.github.com/raw/master/files/sample_eda_df.pkl",
    )

    # NOTE(review): unpickling executes whatever the file contains; this
    # presumably relies on the remote file being trusted -- confirm.
    frame = pd.read_pickle(str(pickle_path))

    report = ProfileReport(
        frame,
        title="Sample Profiling Report",
        explorative=True,
        pool_size=1,
    )
    rendered = report.to_html()
    assert len(rendered) > 0
Exemplo n.º 12
0
def test_issue353():
    """Render a full-width report for a frame with one categorical column."""
    column_names = ["a", "b", "c", "d", "e"]
    frame = pd.DataFrame(np.random.rand(100, 5), columns=column_names)

    # Turn column "a" into a categorical built from truncated integers.
    scaled = frame["a"].multiply(5)
    frame["a"] = scaled.astype("int").astype("category")

    html_options = {"style": {"full_width": True}}
    report = ProfileReport(frame,
                           title="Pandas Profiling Report",
                           html=html_options)
    rendered = report.to_html()
    assert len(rendered) > 0
def test_custom_sample():
    """A user-supplied sample should replace the real one in description and HTML."""
    real_data = pd.DataFrame({"test": [1, 2, 3, 4, 5]})

    # When showing rows of the real data (cars) could disclose sensitive
    # information, a mock sample can be shown instead. For illustration the
    # mock rows are based on cars from a popular game series.
    mock_columns = {
        "make": ["Blista Kanjo", "Sentinel", "Burrito"],
        "price": [58000, 95000, 65000],
        "mpg": [20, 30, 22],
        "rep78": ["Average", "Excellent", "Fair"],
        "headroom": [2.5, 3.0, 1.5],
        "trunk": [8, 10, 4],
        "weight": [1050, 1600, 2500],
        "length": [165, 170, 180],
        "turn": [40, 50, 32],
        "displacement": [80, 100, 60],
        "gear_ratio": [2.74, 3.51, 2.41],
        "foreign": ["Domestic", "Domestic", "Foreign"],
    }
    mock_data = pd.DataFrame(mock_columns)

    caption = "Disclaimer: this is synthetic data generated based on the format of the data in this table."
    custom_sample = {
        "name": "Mock data sample",
        "data": mock_data,
        "caption": caption,
    }

    # Length left out due to correlation with weight.
    report = ProfileReport(
        real_data,
        title="Test custom sample",
        sample=custom_sample,
        minimal=True,
    )

    # The description must expose exactly the one custom sample.
    samples = report.get_description()["sample"]
    assert len(samples) == 1
    only_sample = samples[0]
    assert only_sample.id == "custom"
    assert hash_dataframe(only_sample.data) == hash_dataframe(mock_data)
    assert only_sample.name == "Mock data sample"
    assert only_sample.caption == caption

    # The rendered page must show the mock rows and the disclaimer text.
    rendered = report.to_html()
    assert "Mock data sample" in rendered
    makes = mock_data["make"].values.tolist()
    assert all(make in rendered for make in makes)
    assert (
        "Disclaimer: this is synthetic data generated based on the format of the data in this table"
        in rendered)
Exemplo n.º 14
0
def test_issue_120():
    """Profile the issue-120 CSV with Cramér's V off and composition checks on.

    The CSV is read directly from its URL by ``pd.read_csv``.
    """
    df = pd.read_csv(
        "https://github.com/pandas-profiling/pandas-profiling/files/2386812/pandas_profiling_bug.txt"
    )

    report = ProfileReport(
        df,
        correlations={"cramers": {"calculate": False}},
        vars={"cat": {"check_composition": True}},
    )
    html = report.to_html()
    # isinstance is the idiomatic type check (and accepts str subclasses).
    assert isinstance(html, str)
    assert "<p class=h2>Dataset info</p>" in html
Exemplo n.º 15
0
def test_issue100():
    """Profile a frame in which two integer columns are cast to categories."""
    df = pd.DataFrame(np.random.randint(0, 1000, size=(1000, 4)),
                      columns=list("ABCD"))
    df[["B", "C"]] = df[["B", "C"]].astype("category")

    report = ProfileReport(
        df,
        pool_size=1,
        title="Dataset with <em>Numeric</em> Categories",
        samples={"head": 20},
    )
    html = report.to_html()
    # Separate asserts so a failure shows which condition broke; isinstance
    # replaces the exact-type comparison.
    assert isinstance(html, str)
    assert '<p class="h2">Dataset info</p>' in html
Exemplo n.º 16
0
def test_decorator():
    """The configured title should appear in the rendered report."""
    source_url = "https://raw.githubusercontent.com/oncletom/coursera-ml/master/week-1/people-example.csv"
    frame = pd.read_csv(source_url)

    # Both missing-value diagrams are switched off for this report.
    diagrams = {"heatmap": False, "dendrogram": False}
    profile = ProfileReport(
        frame,
        title="Coursera Test Report",
        samples={"head": 20},
        missing_diagrams=diagrams,
    )
    rendered = profile.to_html()
    assert "Coursera Test Report" in rendered, "Title is not found"
Exemplo n.º 17
0
def test_example(get_data_file, test_output_dir):
    """Build the meteorites example report, write it to a file, and check it.

    Args:
        get_data_file: Fixture called with a local file name and a source
            URL; its return value is handed to ``pd.read_csv``.
        test_output_dir: Directory-like object supporting ``/`` to build the
            path of the written HTML file.
    """
    file_name = get_data_file(
        "meteorites.csv",
        "https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD",
    )

    # For reproducibility
    np.random.seed(7331)

    df = pd.read_csv(file_name)

    # errors="coerce" turns years that cannot be converted to a timestamp
    # into NaT, so those rows are effectively ignored for this analysis.
    df["year"] = pd.to_datetime(df["year"], errors="coerce")

    # Example: Constant variable
    df["source"] = "NASA"

    # Example: Boolean variable
    df["boolean"] = np.random.choice([True, False], df.shape[0])

    # Example: Mixed with base types
    df["mixed"] = np.random.choice([1, "A"], df.shape[0])

    # Example: Highly correlated variables
    df["reclat_city"] = df["reclat"] + np.random.normal(scale=5,
                                                        size=(len(df)))

    # Example: Duplicate observations
    duplicates_to_add = pd.DataFrame(df.iloc[0:10].copy())

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # supported equivalent for appending rows.
    df = pd.concat([df, duplicates_to_add], ignore_index=True)

    output_file = test_output_dir / "profile.html"
    profile = ProfileReport(
        df,
        title="NASA Meteorites",
        samples={
            "head": 5,
            "tail": 5
        },
        duplicates={"head": 10},
        minimal=True,
    )
    profile.to_file(output_file)
    assert output_file.exists(), "Output file does not exist"

    # Fetch the description once and check both its type and its size.
    description = profile.get_description()
    assert (isinstance(description, dict)
            and len(description.items()) == 10), "Unexpected result"
    assert "<span class=badge>12</span>" in profile.to_html()
Exemplo n.º 18
0
def test_issue51(get_data_file):
    """Profile a pickled frame whose categorical column has an empty ('') value.

    Args:
        get_data_file: Fixture called with a local file name and a source
            URL; its return value is used as the pickle file path.
    """
    pickle_path = get_data_file(
        "buggy1.pkl",
        "https://raw.githubusercontent.com/adamrossnelson/HelloWorld/master/sparefiles/buggy1.pkl",
    )

    # NOTE(review): unpickling executes whatever the file contains; this
    # presumably relies on the remote file being trusted -- confirm.
    frame = pd.read_pickle(str(pickle_path))

    profile = ProfileReport(
        frame,
        title="Pandas Profiling Report",
        progress_bar=False,
        explorative=True,
    )
    rendered = profile.to_html()
    assert "<title>Pandas Profiling Report</title>" in rendered, "Profile report should be generated."
Exemplo n.º 19
0
def test_issue51_similar():
    """Profile a tiny frame mixing empty strings, text, and ``None`` values."""
    columns = {
        "test": ["", "hoi", None],
        "blest": [None, "", "geert"],
        "bert": ["snor", "", None],
    }
    frame = pd.DataFrame(columns)

    profile = ProfileReport(
        frame,
        title="Pandas Profiling Report",
        progress_bar=False,
        explorative=True,
    )
    # The threshold is changed on the existing report before rendering.
    profile.config.vars.num.low_categorical_threshold = 0
    # FIXME: assert correlation values (report.description_set["correlations"])

    rendered = profile.to_html()
    assert "<title>Pandas Profiling Report</title>" in rendered, "Profile report should be generated."
Exemplo n.º 20
0
def test_sensitive():
    """With ``sensitive=True`` none of the name values may appear in the HTML."""
    # One timestamp per row, each one day further in the past.
    timestamps = [datetime.now() - timedelta(days=i) for i in range(4)]
    columns = {
        "name": ["John Doe", "Marco Polo", "Louis Brandeis", "William Douglas"],
        "year": [1965, 1271, 1916, 1975],
        "tf": [True, False, False, True],
        "date": pd.to_datetime(timestamps),
    }
    df = pd.DataFrame(columns)

    profile = ProfileReport(df, sensitive=True, explorative=True)
    rendered = profile.to_html()

    names = df["name"].values.tolist()
    assert all(value not in rendered for value in names)
Exemplo n.º 21
0
def test_modular_present(tdf):
    """With every optional section enabled, each section heading is rendered.

    Args:
        tdf: Data handed to ``ProfileReport``; it must contain the
            "mass (g)" column used as the interaction target.
    """
    # Switch on every correlation measure.
    correlation_names = ("pearson", "spearman", "kendall", "phi_k", "cramers")
    correlations = {name: {"calculate": True} for name in correlation_names}

    # Switch on every missing-value diagram.
    diagram_names = ("matrix", "bar", "dendrogram", "heatmap")
    missing_diagrams = {name: True for name in diagram_names}

    report = ProfileReport(
        tdf,
        title="Modular test",
        duplicates={"head": 10},
        samples={"head": 10, "tail": 10},
        interactions={"targets": ["mass (g)"], "continuous": True},
        correlations=correlations,
        missing_diagrams=missing_diagrams,
        pool_size=1,
    )

    rendered = report.to_html()
    expected_headings = (
        "Correlations</h1>",
        "Duplicate rows</h1>",
        "Sample</h1>",
        "Missing values</h1>",
    )
    for heading in expected_headings:
        assert heading in rendered
Exemplo n.º 22
0
def test_issue_120(get_data_file):
    """Profile the issue-120 CSV with Cramér's V off and composition checks on.

    Args:
        get_data_file: Fixture called with a local file name and a source
            URL; its return value is handed to ``pd.read_csv``.
    """
    file_name = get_data_file(
        "pandas_profiling_bug.txt",
        "https://github.com/pandas-profiling/pandas-profiling/files/2386812/pandas_profiling_bug.txt",
    )
    df = pd.read_csv(file_name)

    report = ProfileReport(
        df,
        correlations={"cramers": {
            "calculate": False
        }},
        vars={"cat": {
            "check_composition": True
        }},
    )
    html = report.to_html()
    # isinstance is the idiomatic type check (and accepts str subclasses).
    assert isinstance(html, str)
    assert "<p class=h4>Dataset statistics</p>" in html
Exemplo n.º 23
0
def test_modular_absent(tdf):
    """With every optional section disabled, no section heading is rendered.

    Args:
        tdf: Data handed to ``ProfileReport``.
    """
    report = ProfileReport(
        tdf,
        title="Modular test",
        duplicates={"head": 0},
        samples={"head": 0, "tail": 0},
        interactions=None,
        correlations=None,
        missing_diagrams=None,
    )

    rendered = report.to_html()
    unexpected_headings = (
        "Correlations</h1>",
        "Duplicate rows</h1>",
        "Sample</h1>",
        "Missing values</h1>",
    )
    for heading in unexpected_headings:
        assert heading not in rendered
Exemplo n.º 24
0
def test_issue523():
    """Render a report for a nullable ``Int64`` column containing ``pd.NA``."""
    # https://github.com/pandas-dev/pandas/issues/33803

    missing = pd.NA
    values = [
        1871248,
        12522551,
        1489260,
        6657093,
        missing,
        missing,
        missing,
        missing,
        missing,
        1489260,
        missing,
        2468576,
    ]
    frame = pd.DataFrame({"col": values}, dtype=pd.Int64Dtype())

    report = ProfileReport(frame, title="Test Report")
    rendered = report.to_html()
    assert len(rendered) > 0
Exemplo n.º 25
0
def get_profile_results(data):
    """Profile a pandas DataFrame and return the report as HTML.

    Args:
        data: The ``pd.DataFrame`` to profile.

    Returns:
        The value of ``ProfileReport.to_html()`` for ``data``.

    Raises:
        TypeError: If ``data`` is not a ``pd.DataFrame``.
    """
    # Reject anything that is not a DataFrame up front.
    if not isinstance(data, pd.DataFrame):
        raise TypeError('This is not a pandas dataframe.')

    # Only the Pearson correlation is computed.
    correlation_settings = {
        "pearson": {"calculate": True},
        "spearman": {"calculate": False},
        "kendall": {"calculate": False},
        "phi_k": {"calculate": False},
        "cramers": {"calculate": False},
    }
    profile = ProfileReport(
        data,
        title='Snowflake Data Profiler from Hashmap',
        progress_bar=False,
        explorative=True,
        correlations=correlation_settings,
    )

    # NOTE (from the original author): this step sometimes fails with
    # matplotlib errors about threads. So far it has only been worked around
    # by pinning specific library versions in requirements.txt; pyarrow
    # seems to have an impact on this.
    return profile.to_html()
Exemplo n.º 26
0
def func(df, **kwargs):
    """Return the HTML report for ``df``, built without a progress bar.

    Any extra keyword arguments are forwarded to ``ProfileReport``.
    """
    return ProfileReport(df, progress_bar=False, **kwargs).to_html()
 def _to_html(profile_report: ProfileReport) -> str:
     """Return an ``<iframe>`` whose ``srcdoc`` is the HTML-escaped report."""
     # Escape the rendered report so it can sit inside the srcdoc attribute.
     escaped = html.escape(profile_report.to_html())
     return f'<iframe srcdoc="{escaped}" style={STYLE} frameborder="0" allowfullscreen></iframe>'
Exemplo n.º 28
0
def test_issue249():
    """A frame with a mixed str/int index gets a string title and renders."""
    df = pd.DataFrame(data=[[1], [2]], index=["foo", 1], columns=["a"])
    report = ProfileReport(df, explorative=True, progress_bar=False)
    # isinstance is the idiomatic type check (and accepts str subclasses).
    assert isinstance(report.config.title, str)
    assert len(report.to_html()) > 0
Exemplo n.º 29
0
def test_issue664():
    """Render a report for a single column made up entirely of NaN values."""
    all_missing = pd.DataFrame([np.nan] * 100, columns=["a"])

    report = ProfileReport(all_missing)
    rendered = report.to_html()
    assert len(rendered) > 0
Exemplo n.º 30
0
 def to_html(self, df: pandas.DataFrame) -> str:
     """Return the HTML profile report for ``df``, titled with ``self._title``."""
     assert isinstance(df, pandas.DataFrame)
     return ProfileReport(df, title=self._title).to_html()