Exemplo n.º 1
0
def test_type_check_fail(data, project):
    """Expect ``InvalidType`` when ``integer_col`` no longer holds integers.

    The column is declared as ``IntType`` with ``force_type = True``; the
    fixture column is then cast to float before the data set is run.
    """

    class TestCol(Column):
        column_name = "integer_col"
        column_type = IntType()
        force_type = True

    project.add_column(TestCol())

    # Break the declared type on purpose.
    as_float = data.integer_col.astype(float)
    data.integer_col = as_float

    dataset = DataSet(project=project, batch_name="test")
    dataset.set_dataset(data)

    with pytest.raises(InvalidType):
        dataset.run()
Exemplo n.º 2
0
def test_type_check(data, project):
    """Run a forced ``IntType`` check on ``integer_col`` with unmodified data.

    The test has no meaningful assertion: it passes as long as ``run()``
    does not raise.
    """

    class TestCol(Column):
        column_name = "integer_col"
        column_type = IntType()
        force_type = True

    project.add_column(TestCol())

    dataset = DataSet(project=project, batch_name="test")
    dataset.set_dataset(data)
    dataset.run()

    # Reaching this line means run() completed without an exception.
    assert 1
Exemplo n.º 3
0
def test_project_get_table(data, project):
    """Check that the project table is non-empty after a run."""

    class TestCol(Column):
        column_name = "integer_col"
        column_type = IntType()
        force_type = True

    project.add_column(TestCol())

    dataset = DataSet(project=project, batch_name="test")
    dataset.set_dataset(data)
    dataset.run()

    history = project.get_project_table()
    row_count = history.shape[0]
    assert row_count > 0
Exemplo n.º 4
0
def test_null_check_fail(data, project):
    """Expect ``NullableError`` when a non-nullable column contains a NaN.

    ``float_col`` is declared with ``null = False`` and ``force_null = True``;
    a NaN is then written into its first row before the data set is run.
    """

    class TestCol(Column):
        column_name = "float_col"
        column_type = FloatType()
        null = False
        force_null = True

    project.add_column(TestCol())

    # Use the canonical ``np.nan``: the ``np.NaN`` alias was removed in
    # NumPy 2.0 and raises AttributeError there.
    data.loc[0, "float_col"] = np.nan
    ds = DataSet(project=project, batch_name="test")
    ds.set_dataset(data)
    with pytest.raises(NullableError):
        ds.run()
Exemplo n.º 5
0
def test_multiple_columns(data, project):
    """Check that one ``Column`` declared with a list of names records each name.

    After the run, both ``integer_col`` and ``float_col`` must appear in the
    ``column_name`` values of the project table.
    """

    class TestCol(Column):
        column_name = ["integer_col", "float_col"]
        column_type = NumericType()
        null = False
        force_null = True

    project.add_column(TestCol())

    dataset = DataSet(project=project, batch_name="test")
    dataset.set_dataset(data)
    dataset.run()

    history = project.get_project_table()
    for expected_name in ("integer_col", "float_col"):
        assert expected_name in history.column_name.values
Exemplo n.º 6
0
def test_function_call_without_arguments_dict_style(data, project):
    """Check a dict-style function entry that has no ``"parameters"`` key.

    ``mean`` is registered as ``{"function": mean}``; the first recorded
    ``mean`` metric is expected to equal 2.0.
    """

    class TestCol(Column):
        column_name = "integer_col"
        column_type = IntType()
        null = False
        force_null = True
        functions = [{"function": mean}]

    project.add_column(TestCol())

    dataset = DataSet(project=project, batch_name="test")
    dataset.set_dataset(data)
    dataset.run()

    history = project.get_project_table()
    mean_rows = history[history.metric == "mean"]
    first_mean = mean_rows.value.astype(float).values[0]
    assert first_mean == 2.0
Exemplo n.º 7
0
def test_other_column(data, project):
    """Run a column function that is given a second column by parameter.

    ``correlation_two_columns`` is registered on ``integer_col`` with
    ``column_two`` set to ``"float_col"``. The test has no meaningful
    assertion: it passes as long as ``run()`` does not raise.
    """
    correlation_spec = {
        "function": correlation_two_columns,
        "parameters": {
            "column_two": "float_col"
        },
    }

    class TestCol(Column):
        column_name = ["integer_col"]
        column_type = IntType()
        null = False
        force_null = True
        functions = [correlation_spec]

    project.add_column(TestCol())

    dataset = DataSet(project=project, batch_name="test")
    dataset.set_dataset(data)
    dataset.run()

    # Reaching this line means run() completed without an exception.
    assert 1
Exemplo n.º 8
0
def test_default_checks_done(data, project):
    """Check metrics recorded for a column that declares no functions.

    Expected in the project table: ``perc_missing`` of 0, a ``count`` of 3
    for both ``rows`` and ``columns``, and a ``dtype`` of ``"int64"``.
    """

    class TestCol(Column):
        column_name = "integer_col"
        column_type = IntType()
        force_type = True

    project.add_column(TestCol())

    dataset = DataSet(project=project, batch_name="test")
    dataset.set_dataset(data)
    dataset.run()

    history = project.get_project_table()

    missing_rows = history[history.metric == "perc_missing"]
    assert missing_rows.value.astype(float).values[0] == 0

    row_counts = history[
        (history.metric == "count") & (history.column_name == "rows")]
    assert row_counts.value.astype(int).values[0] == 3

    column_counts = history[
        (history.metric == "count") & (history.column_name == "columns")]
    assert column_counts.value.astype(int).values[0] == 3

    dtype_rows = history[history.metric == "dtype"]
    assert dtype_rows.value.values[0] == "int64"
Exemplo n.º 9
0
def test_table_extra_functions(data, project):
    """Check that a table's ``extra_functions`` entry shows up as a metric.

    A custom function named ``test_fun`` is attached to ``integer_col``
    through ``extra_functions``; after the run, ``"test_fun"`` must be among
    the ``metric`` values of the project table.
    """

    # Constant-valued custom metric; only its name is asserted on below.
    @function(return_format=float)
    def test_fun(data, column):
        return 0

    class Table(PandasTable):
        columns = ["integer_col"]
        infer_schema = False
        table_name = "test"
        types = {"integer_col": IntType()}

        extra_functions = {"integer_col": [{"function": test_fun}]}

    project.add_table(Table())

    dataset = DataSet(project=project, batch_name="test")
    dataset.set_dataset(data)
    dataset.run()

    history = project.get_project_table()
    recorded_metrics = history.metric.values
    assert "test_fun" in recorded_metrics
Exemplo n.º 10
0
def test_function_call_with_arguments(data, project):
    """Check a dict-style function entry that carries parameters.

    ``number_of_outliers`` is registered with ``std_away`` set to 2; the
    first recorded ``number_of_outliers`` metric is expected to be 0.
    """
    outlier_spec = {
        "function": number_of_outliers,
        "parameters": {
            "std_away": 2
        }
    }

    class TestCol(Column):
        column_name = "integer_col"
        column_type = IntType()
        null = False
        force_null = True
        functions = [outlier_spec]

    project.add_column(TestCol())

    dataset = DataSet(project=project, batch_name="test")
    dataset.set_dataset(data)
    dataset.run()

    history = project.get_project_table()
    outlier_rows = history[history.metric == "number_of_outliers"]
    first_outlier_count = outlier_rows.value.astype(int).values[0]
    assert first_outlier_count == 0
Exemplo n.º 11
0
def test_project_file(data, project):
    """Check the ``"test"`` entry of ``projects.json`` through three steps.

    The key must be present after a run, absent after
    ``delete_from_project_config()``, and present again after
    ``add_to_project_list()``.
    """

    def read_project_file():
        # Re-read from disk on every call so each assertion sees the
        # file as it is at that moment.
        path = os.path.join(project.config_dir, "projects.json")
        with open(path, "r") as handle:
            return json.loads(handle.read())

    class TestCol(Column):
        column_name = "integer_col"
        column_type = IntType()
        force_type = True

    project.add_column(TestCol())

    dataset = DataSet(project=project, batch_name="test")
    dataset.set_dataset(data)
    dataset.run()

    assert "test" in read_project_file()

    project.delete_from_project_config()
    assert "test" not in read_project_file()

    schema = {
        "integer_col": {
            "unique": False,
            "dtype": "int32",
            "nullable": True
        }
    }
    project.add_to_project_list(schema=schema)
    assert "test" in read_project_file()
Exemplo n.º 12
0
def test_table_equals_columns(data, project):
    """Check that a table and an explicit column give the same row count.

    Phase one registers a ``PandasTable`` over ``integer_col`` and expects
    7 rows in the project table. The project's columns, config entry and
    data are then cleared. Phase two registers a ``Column`` with ``mean``,
    ``std`` and ``percentage_missing`` and again expects 7 rows.
    """

    # Phase one: table-based declaration.
    class Table(PandasTable):
        columns = ["integer_col"]
        infer_schema = False
        table_name = "test"
        types = {"integer_col": IntType()}

    project.add_table(Table())

    table_run = DataSet(project=project, batch_name="test")
    table_run.set_dataset(data)
    table_run.run()

    table_history = project.get_project_table()
    assert table_history.shape[0] == 7

    # Clear the project before the second declaration.
    project.columns = {}
    project.delete_from_project_config()
    project.delete_data()

    # Phase two: column-based declaration.
    class TestCol(Column):
        column_name = "integer_col"
        column_type = IntType()
        force_type = True
        functions = [mean, std, percentage_missing]

    project.add_column(TestCol())

    column_run = DataSet(project=project, batch_name="test")
    column_run.set_dataset(data)
    column_run.run()

    column_history = project.get_project_table()
    assert column_history.shape[0] == 7

    # The table is fetched once more and must still be non-empty.
    final_history = project.get_project_table()
    assert final_history.shape[0] > 0