Exemplo n.º 1
0
def test_one_row_dataframe():
    """Summarise a one-row frame and check per-column properties/stats.

    Covers one column of each dtype: int, float, object (str) and
    datetime64[ns].
    """
    data = {
        "a": [1],
        "b": [-0.5],
        "c": ["hello"],
        "d": [datetime.datetime.now()],
    }
    columns = sorted(data)
    report = summarise(pd.DataFrame.from_dict(data))._report
    assert sorted(report["_columns"]) == columns

    column_properties = report["column_properties"]
    # With a single non-null row every column has exactly one distinct value.
    for name in columns:
        properties = column_properties[name]
        assert properties["nulls"] == 0
        assert properties["notnulls"] == 1
        assert properties["unique"] == 1

    expected_dtypes = {
        "a": "int64",
        "b": "float64",
        "c": "object",
        "d": "datetime64[ns]",
    }
    for name, dtype in expected_dtypes.items():
        assert column_properties[name]["dtype"] == dtype

    column_summary = report["column_summary"]
    # For a single value, max == min == mean == median.
    for stat in ("max", "min", "mean", "median"):
        assert column_summary["a"][stat] == 1
        assert column_summary["b"][stat] == -0.5
    assert column_summary["a"]["iqr"] == 0.0
Exemplo n.º 2
0
def test_one_row_dataframe():
    """Summarise a one-row frame and check per-column properties/stats.

    Covers one column of each dtype: int, float, object (str) and
    datetime64[ns].
    """
    items = [
        ('a', [1]),
        ('b', [-0.5]),
        ('c', ['hello']),
        ('d', [datetime.datetime.now()])
    ]
    columns = sorted([item[0] for item in items])
    # pd.DataFrame.from_items was removed in pandas 1.0; from_dict over a
    # dict built from the pairs is the supported, order-preserving equivalent.
    df = pd.DataFrame.from_dict(dict(items))
    report = summarise(df)._report
    assert sorted(report['_columns']) == columns
    column_properties = report['column_properties']
    # A single non-null row: no nulls, one not-null, one unique value.
    for column in columns:
        props = column_properties[column]
        assert props['nulls'] == 0
        assert props['notnulls'] == 1
        assert props['unique'] == 1
    assert column_properties['a']['dtype'] == 'int64'
    assert column_properties['b']['dtype'] == 'float64'
    assert column_properties['c']['dtype'] == 'object'
    assert column_properties['d']['dtype'] == 'datetime64[ns]'
    column_summary = report['column_summary']
    # With one value, max == min == mean == median and the IQR collapses to 0.
    assert column_summary['a']['max'] == 1
    assert column_summary['a']['min'] == 1
    assert column_summary['a']['mean'] == 1.0
    assert column_summary['a']['median'] == 1.0
    assert column_summary['a']['iqr'] == 0.0

    assert column_summary['b']['max'] == -0.5
    assert column_summary['b']['min'] == -0.5
    assert column_summary['b']['median'] == -0.5
    assert column_summary['b']['mean'] == -0.5
Exemplo n.º 3
0
def test_correlation_matrix_one_column():
    """A single column must correlate perfectly with itself (1x1 matrix)."""
    df = pd.DataFrame.from_dict({"a": np.random.ranf(size=200)})
    columns, correlation_matrix = summarise(df).correlation_matrix()
    assert columns == ["a"]
    assert correlation_matrix.shape == (1, 1)
    numpy.testing.assert_approx_equal(correlation_matrix[0, 0], 1.0)
Exemplo n.º 4
0
def test_zero_rows_dataframe():
    """An empty (zero-row) frame still reports every column, all counts 0."""
    columns = sorted(['a', 'b', 'c', 'd'])
    report = summarise(pd.DataFrame(columns=columns))._report
    assert sorted(report['_columns']) == columns
    for name in columns:
        counts = report['column_properties'][name]
        # No rows at all: nothing null, nothing non-null, nothing unique.
        for key in ('nulls', 'notnulls', 'unique'):
            assert counts[key] == 0
Exemplo n.º 5
0
def test_zero_rows_dataframe():
    """An empty (zero-row) frame still reports every column, all counts 0."""
    columns = sorted(["a", "b", "c", "d"])
    frame = pd.DataFrame(columns=columns)
    report = summarise(frame)._report
    assert sorted(report["_columns"]) == columns
    for name in columns:
        counts = report["column_properties"][name]
        # No rows at all: nothing null, nothing non-null, nothing unique.
        for key in ("nulls", "notnulls", "unique"):
            assert counts[key] == 0
Exemplo n.º 6
0
def test_dask_compute_graph(df, scheduler, num_workers, pairdensities):
    """Summarise under the given dask scheduler and serialise the report.

    A JSON artefact is only written for the multiprocessing scheduler with
    the default worker count.
    """
    dreport = summarise(df, scheduler=scheduler, num_workers=num_workers,
                        pairdensities=pairdensities)._report

    fname = None
    if num_workers is None and scheduler == 'multiprocessing':
        fname = '{}/test_results/report_test_data_{}.json'.format(dirname,
                                                                  'mp')

    assert dreport['_lens_version'] == __version__
    if not pairdensities:
        # With pair densities disabled the section is an empty stub.
        assert dreport['pairdensity'] == {'_columns': [], '_run_time': 0.0}

    serialize_full_report(dreport, fname=fname)
Exemplo n.º 7
0
def test_correlation_matrix_two_columns():
    """Off-diagonal entries equal the pairwise Spearman correlation."""
    df = pd.DataFrame.from_dict({
        "a": np.random.ranf(size=200),
        "b": np.random.ranf(size=200),
    })
    columns, matrix = summarise(df).correlation_matrix()
    assert sorted(columns) == ["a", "b"]
    # Diagonal: each column correlates perfectly with itself.
    for i in (0, 1):
        numpy.testing.assert_approx_equal(matrix[i, i], 1.0)
    # Off-diagonal: symmetric, equal to scipy's Spearman coefficient.
    expected = scipy.stats.spearmanr(df["a"], df["b"]).correlation
    numpy.testing.assert_approx_equal(matrix[1, 0], expected)
    numpy.testing.assert_approx_equal(matrix[0, 1], expected)
Exemplo n.º 8
0
def test_summary_regression(input_):
    """Regression test: summarise an S3-hosted CSV and diff the report
    against the stored expected JSON summary, ignoring stochastic keys.

    `input_` is the CSV filename under `s3://BUCKET/input/`; the matching
    expected report lives under `s3://BUCKET/output/` with a .json suffix.
    """
    # load the input into a pandas dataframe
    df = pd.read_csv("s3://{}/input/{}".format(BUCKET, input_))

    # run the lens summarise method
    summary = lens.summarise(df)

    # Save generated report (locally, for manual inspection on failure)
    summary.to_json(os.path.join(result_dir, input_.replace(".csv", ".json")))

    # load the expected output file into a summary object
    output = input_.replace(".csv", ".json")
    s3_summary = read_s3_file(BUCKET, "output/{}".format(output))[
        "Body"
    ].read()

    # S3 bodies are bytes; decode so json.loads/file-write get text.
    if isinstance(s3_summary, bytes):
        s3_summary = s3_summary.decode("utf-8")

    expected_summary = json.loads(s3_summary)

    # list of keys to ignore from the response because they are
    # probabilistically generated (or vary between runs/versions)
    exclude = [
        "_run_time",
        "tdigest",
        "density",
        "bw",
        "logtrans_IQR",
        "kde",
        "_lens_version",
    ]

    # Round-trip the live report through JSON so both sides are compared
    # as plain JSON-compatible structures.
    diffs = find_diff(
        json.loads(json.dumps(summary._report)), expected_summary, exclude
    )

    for diff in diffs:
        print(diff)

    if len(diffs):
        # Save expected report to check the differences manually if needed
        exp_name = os.path.join(
            result_dir, output.replace(".json", "-expected.json")
        )
        with open(exp_name, "w") as f:
            f.write(s3_summary)

    # compare the input and output summary objects
    assert len(diffs) == 0
Exemplo n.º 9
0
def test_correlation_matrix_three_columns():
    """Every matrix entry matches the pairwise Spearman correlation."""
    column_headers = ["a", "b", "c"]
    df = pd.DataFrame.from_dict(
        {name: np.random.ranf(size=200) for name in column_headers}
    )
    columns, correlation_matrix = summarise(df).correlation_matrix()
    assert sorted(columns) == column_headers

    # Check all nine entries, diagonal included, against scipy directly.
    for i, left in enumerate(columns):
        for j, right in enumerate(columns):
            expected = scipy.stats.spearmanr(df[left],
                                             df[right]).correlation
            numpy.testing.assert_approx_equal(expected,
                                              correlation_matrix[i, j])
Exemplo n.º 10
0
def test_correlation_matrix_two_columns():
    """Off-diagonal entries equal the pairwise Spearman correlation."""
    column1_values = np.random.ranf(size=200)
    column2_values = np.random.ranf(size=200)
    # pd.DataFrame.from_items was removed in pandas 1.0; a dict literal
    # preserves insertion order on Py3.7+ and is the supported equivalent.
    df = pd.DataFrame.from_dict({'a': column1_values,
                                 'b': column2_values})
    summary = summarise(df)
    columns, correlation_matrix = summary.correlation_matrix()
    assert sorted(columns) == ['a', 'b']
    # Diagonal: each column correlates perfectly with itself.
    numpy.testing.assert_approx_equal(correlation_matrix[0, 0], 1.0)
    numpy.testing.assert_approx_equal(correlation_matrix[1, 1], 1.0)
    # Off-diagonal: symmetric, equal to scipy's Spearman coefficient.
    off_diagonal_term = scipy.stats.spearmanr(df['a'], df['b']).correlation
    numpy.testing.assert_approx_equal(correlation_matrix[1, 0],
                                      off_diagonal_term)
    numpy.testing.assert_approx_equal(correlation_matrix[0, 1],
                                      off_diagonal_term)
Exemplo n.º 11
0
def test_dask_compute_graph(df, scheduler, num_workers, pairdensities):
    """Summarise under the given dask scheduler and serialise the report.

    A JSON artefact is only written for the multiprocessing scheduler with
    the default worker count.
    """
    summary = summarise(
        df,
        scheduler=scheduler,
        num_workers=num_workers,
        pairdensities=pairdensities,
    )
    dreport = summary._report

    fname = None
    if num_workers is None and scheduler == "multiprocessing":
        fname = "{}/test_results/report_test_data_{}.json".format(
            dirname, "mp"
        )

    assert dreport["_lens_version"] == __version__
    if not pairdensities:
        # With pair densities disabled the section is an empty stub.
        assert dreport["pairdensity"] == {"_columns": [], "_run_time": 0.0}

    serialize_full_report(dreport, fname=fname)
Exemplo n.º 12
0
def test_summary_regression(input_):
    """Regression test: summarise an S3-hosted CSV and diff the report
    against the stored expected JSON summary, ignoring stochastic keys.

    `input_` is the CSV filename under `s3://BUCKET/input/`; the matching
    expected report lives under `s3://BUCKET/output/` with a .json suffix.
    """
    # load the input into a pandas dataframe
    df = pd.read_csv('s3://{}/input/{}'.format(BUCKET, input_))

    # run the lens summarise method
    summary = lens.summarise(df)

    # Save generated report (locally, for manual inspection on failure)
    summary.to_json(os.path.join(result_dir, input_.replace('.csv', '.json')))

    # load the expected output file into a summary object
    output = input_.replace('.csv', '.json')
    s3_summary = (read_s3_file(BUCKET,
                               'output/{}'.format(output))['Body'].read())

    # S3 bodies are bytes; decode so json.loads/file-write get text.
    if isinstance(s3_summary, bytes):
        s3_summary = s3_summary.decode('utf-8')

    expected_summary = json.loads(s3_summary)

    # list of keys to ignore from the response because they are
    # probabilistically generated (or vary between runs/versions)
    exclude = [
        '_run_time', 'tdigest', 'density', 'bw', 'logtrans_IQR', 'kde',
        '_lens_version'
    ]

    # Round-trip the live report through JSON so both sides are compared
    # as plain JSON-compatible structures.
    diffs = find_diff(json.loads(json.dumps(summary._report)),
                      expected_summary, exclude)

    for diff in diffs:
        print(diff)

    if len(diffs):
        # Save expected report to check the differences manually if needed
        exp_name = os.path.join(result_dir,
                                output.replace('.json', '-expected.json'))
        with open(exp_name, 'w') as f:
            f.write(s3_summary)

    # compare the input and output summary objects
    assert len(diffs) == 0
Exemplo n.º 13
0
def test_int_num_cpus_env(small_df, monkeypatch):
    """summarise works when NUM_CPUS is set to a valid integer string."""
    monkeypatch.setenv("NUM_CPUS", str(2))
    summary = summarise(small_df)
    # The report still covers every input column.
    assert set(summary._report["_columns"]) == set(small_df.columns)
Exemplo n.º 14
0
def test_string_num_cpus_env(small_df, monkeypatch):
    """summarise tolerates a non-integer NUM_CPUS environment variable."""
    monkeypatch.setenv("NUM_CPUS", "not-an-int")
    summary = summarise(small_df)
    # The bogus value must not break summarisation of any column.
    assert set(summary._report["_columns"]) == set(small_df.columns)
Exemplo n.º 15
0
def test_string_num_cpus_env(small_df, monkeypatch):
    """summarise tolerates a non-integer NUM_CPUS environment variable."""
    monkeypatch.setenv('NUM_CPUS', 'not-an-int')
    report = summarise(small_df)._report
    # The bogus value must not break summarisation of any column.
    assert set(report['_columns']) == set(small_df.columns)
Exemplo n.º 16
0
def test_empty_df():
    """Summarising a completely empty DataFrame raises EmptyDataFrameError."""
    with pytest.raises(EmptyDataFrameError):
        summarise(pd.DataFrame())
Exemplo n.º 17
0
def artworks_summary(artworks_df):
    """Build and return a lens summary for the artworks DataFrame fixture."""
    return lens.summarise(artworks_df)