Example #1
    def test_yield_correct_results(self, df_with_numeric_values):

        df = df_with_numeric_values

        base_check = Check(CheckLevel.EXCEPTION, description="a description")
        analyzers = [
            Minimum("att1"),
            Maximum("att1"),
            Mean("att1"),
            StandardDeviation("att1"),
            Sum("att1"),
            Quantile("att1", 0.5),
        ]

        engine = PandasEngine(df)
        repo = InMemoryMetadataRepository()
        context_numeric = do_analysis_run(engine, repo, analyzers)

        assert is_success(base_check.has_min("att1", lambda v: v == 1.0),
                          context_numeric)
        assert is_success(base_check.has_max("att1", lambda v: v == 6.0),
                          context_numeric)
        assert is_success(base_check.has_mean("att1", lambda v: v == 3.5),
                          context_numeric)
        assert is_success(
            base_check.has_standard_deviation("att1", lambda v: v == 1.870829),
            context_numeric,
        )
        assert is_success(base_check.has_sum("att1", lambda v: v == 21.0),
                          context_numeric)
        assert is_success(
            base_check.has_approx_quantile("att1", 0.5, lambda v: v == 4.0),
            context_numeric)
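
Note: the asserted values are consistent with att1 holding the six values 1 through 6: sum 21.0, mean 3.5, and a sample standard deviation of sqrt(17.5 / 5) ≈ 1.870829. The 0.5 quantile is asserted as 4.0 rather than the interpolated median 3.5, which points to an approximate (e.g. nearest-rank) quantile implementation, matching the has_approx_quantile name.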
Example #2
    def test_return_result_for_configured_analyzers(self, df_full):
        analyzers = [
            Size(),
            Minimum("item"),
            Completeness("item"),
        ]

        engine = PandasEngine(df_full)
        repo = InMemoryMetadataRepository()

        ac = do_analysis_run(engine, repo, analyzers)

        sm = AnalyzerContext.success_metrics_as_dataframe(ac)

        expected = pd.DataFrame(
            [
                ("DATASET", "*", "Size", 4.0),
                ("COLUMN", "item", "Minimum", 1.0),
                ("COLUMN", "item", "Completeness", 1.0),
            ],
            columns=("entity", "instance", "name", "value"),
        )

        ConnectionHandler.close_connections()

        assert_frame_equal(sm, expected, check_like=True)
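
Note: AnalyzerContext.success_metrics_as_dataframe flattens the successfully computed metrics into one row per metric with the columns entity, instance, name and value; the dataset-level Size metric uses "*" as its instance. Passing check_like=True makes pandas' assert_frame_equal ignore the order of rows and columns.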
Example #3
def run_checks(data, *checks) -> AnalyzerContext:
    analyzers = tuple(
        [a for check in checks for a in check.required_analyzers()])
    engine = PandasEngine(data)
    repo = InMemoryMetadataRepository()
    result = do_analysis_run(engine, repo, analyzers)
    ConnectionHandler.close_connections()
    return result
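
A minimal usage sketch for this helper (assuming the pytest fixtures from the examples above are in scope, and that has_min returns a Check whose required_analyzers() yields the Minimum analyzer; the column name and predicate are illustrative):

    check = Check(CheckLevel.EXCEPTION, description="a description").has_min(
        "att1", lambda v: v >= 0)
    context = run_checks(df_with_numeric_values, check)
    assert is_success(check, context)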
Example #4
def run_checks(data, *checks) -> AnalyzerContext:
    analyzers = tuple(
        [a for check in checks for a in check.required_analyzers()])
    engine = PandasEngine(data)
    repo = SQLMetadataRepositoryFactory.create_sql_metadata_repository(
        "duckdb://:memory:")
    repo.set_dataset("data", "1")
    result = do_analysis_run(engine, repo, analyzers)
    ConnectionHandler.close_connections()
    return result
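
Note: this variant of run_checks differs from Example #3 only in metric storage: instead of an in-memory repository, it persists metrics through a SQL metadata repository backed by an in-memory DuckDB database (duckdb://:memory:), and set_dataset("data", "1") presumably tags the stored metrics with a dataset id and partition id before the run.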
Example #5
    def run(
        self, data: DataFrame, dataset_id: str = None, partition_id: str = None
    ) -> VerificationResult:  # TODO: maybe drop this function
        """
        Runs all check groups and returns the verification result.
        Verification result includes all the metrics computed during the run.

        Parameters
        ----------

        data:
            tabular data on which the checks should be verified
        dataset_id:
            optional identifier of the dataset, recorded in the metadata
            repository together with the computed metrics
        partition_id:
            optional identifier of the partition within the dataset
        """
        engine = PandasEngine(data)
        repo = InMemoryMetadataRepository()
        repo.set_dataset(dataset_id, partition_id)
        return self.do_verification_run(
            engine, repo, self._checks, self._required_analyzers
        )
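
A hypothetical call site for this method (assuming verification_run is an already-configured instance of the surrounding class; the identifiers are illustrative):

    result = verification_run.run(df, dataset_id="orders", partition_id="2021-06")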
Example #6
    def test_return_basic_statistics(self, df_with_numeric_values):
        df = df_with_numeric_values
        analyzers = [
            Mean("att1"),
            StandardDeviation("att1"),
            Minimum("att1"),
            Maximum("att1"),
            ApproxDistinctness("att1"),
            ApproxDistinctness("att2"),
        ]

        engine = PandasEngine(df)
        repo = InMemoryMetadataRepository()

        result_metrics = do_analysis_run(engine, repo, analyzers).all_metrics()

        ConnectionHandler.close_connections()

        assert len(result_metrics) == len(analyzers)

        assert (
            DoubleMetric(Entity.COLUMN, "Mean", "att1", Success(3.5)) in result_metrics
        )
        assert (
            DoubleMetric(Entity.COLUMN, "Minimum", "att1", Success(1.0))
            in result_metrics
        )
        assert (
            DoubleMetric(Entity.COLUMN, "ApproxDistinctness", "att1", Success(1.0))
            in result_metrics
        )
        assert (
            DoubleMetric(
                Entity.COLUMN, "ApproxDistinctness", "att2",
                Success(0.6666666716337205),
            )
            in result_metrics
        )
        assert (
            DoubleMetric(Entity.COLUMN, "Maximum", "att1", Success(6.0))
            in result_metrics
        )
        assert (
            DoubleMetric(
                Entity.COLUMN, "StandardDeviation", "att1", Success(1.870829)
            )
            in result_metrics
        )
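
Note: the att2 distinctness of 0.6666666716337205 is approximately 2/3 (for example, four distinct values across six rows); that it is not exactly 2/3 reflects the sketch-based approximate distinct counting that ApproxDistinctness presumably performs.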
Example #7
    def test_run_individual_analyzer_only_once(self, df_full):

        analyzers = [
            Minimum("item"),
            Minimum("item"),
            Minimum("item"),
        ]
        engine = PandasEngine(df_full)
        repo = InMemoryMetadataRepository()

        ac = do_analysis_run(engine, repo, analyzers)

        ConnectionHandler.close_connections()

        assert len(ac.all_metrics()) == 1
        metric = ac.metric(Minimum("item"))
        assert metric is not None
        assert metric.value.get() == 1
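
Note: because the three Minimum("item") instances are equal, the run computes the analyzer only once and yields a single metric. That ac.metric(Minimum("item")) succeeds with a freshly constructed instance also shows that analyzers compare by value (analyzer type plus parameters), not by identity.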
Example #8
    def test_run_analyzers_with_different_where_conditions_separately(
        self, df_with_numeric_values
    ):
        df = df_with_numeric_values
        analyzers = [
            Maximum("att1"),
            Maximum("att1", where="att1 > att2"),
        ]

        engine = PandasEngine(df)
        repo = InMemoryMetadataRepository()

        ctx = do_analysis_run(engine, repo, analyzers)

        ConnectionHandler.close_connections()

        assert ctx.metric(analyzers[0]) == DoubleMetric(
            Entity.COLUMN, "Maximum", "att1", Success(6.0)
        )

        assert ctx.metric(analyzers[1]) == DoubleMetric(
            Entity.COLUMN, "Maximum", "att1", Success(3.0)
        )
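
Note: the two analyzers share a name and column but have different where conditions, so they are computed separately instead of being deduplicated: the unfiltered maximum of att1 is 6.0, while the maximum over rows satisfying att1 > att2 is 3.0.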
Example #9
    def test_multiple_quantiles_are_computed(self, df_with_numeric_values):
        df = df_with_numeric_values
        analyzers = [
            Quantile("att1", 0.1),
            Quantile("att1", 0.5),
            Quantile("att1", 0.9),
        ]
        engine = PandasEngine(df)
        repo = InMemoryMetadataRepository()
        context_numeric = do_analysis_run(engine, repo, analyzers)
        assert len(context_numeric.metric_map) == 3

        print(context_numeric)
        base_check = Check(CheckLevel.EXCEPTION, description="a description")

        assert is_success(
            base_check.has_approx_quantile("att1", 0.5, lambda v: v == 4.0),
            context_numeric)
        assert is_success(
            base_check.has_approx_quantile("att1", 0.9, lambda v: v == 6.0),
            context_numeric)
        assert is_success(
            base_check.has_approx_quantile("att1", 0.1, lambda v: v == 1.0),
            context_numeric)
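
Note: all three quantile checks succeed against values that occur in att1 (1.0, 4.0 and 6.0), and metric_map contains three entries, so Quantile analyzers that differ only in the requested quantile are treated as distinct analyzers rather than deduplicated.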
Example #10
    def on_data(
        self, data: DataFrame, dataset_id: str = None, partition_id: str = None
    ):
        engine = PandasEngine(data)
        return VerificationRunBuilder(engine, dataset_id, partition_id)
Example #11
    def on_data_no_sharing(
        self, data: DataFrame, dataset_id: str = None, partition_id: str = None
    ):
        engine = PandasEngine(data, no_sharing=True)
        return VerificationRunBuilder(engine, dataset_id, partition_id)
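
Note: the only difference from on_data in Example #10 is the no_sharing=True flag passed to PandasEngine, which presumably gives this run a private engine (for example, its own underlying connection) rather than one shared across runs.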