def test_yield_correct_results(self, df_with_numeric_values):
    """Numeric analyzers produce metrics that satisfy the matching check constraints.

    Runs Minimum/Maximum/Mean/StandardDeviation/Sum/Quantile over ``att1`` and
    verifies each corresponding ``has_*`` constraint succeeds against the
    resulting analysis context.
    """
    df = df_with_numeric_values
    base_check = Check(CheckLevel.EXCEPTION, description="a description")
    analyzers = [
        Minimum("att1"),
        Maximum("att1"),
        Mean("att1"),
        StandardDeviation("att1"),
        Sum("att1"),
        Quantile("att1", 0.5),
    ]
    engine = PandasEngine(df)
    repo = InMemoryMetadataRepository()
    context_numeric = do_analysis_run(engine, repo, analyzers)
    # Release engine connections once metrics are computed, matching the
    # cleanup done by the sibling tests in this module.
    ConnectionHandler.close_connections()

    assert is_success(base_check.has_min("att1", lambda v: v == 1.0), context_numeric)
    assert is_success(base_check.has_max("att1", lambda v: v == 6.0), context_numeric)
    assert is_success(base_check.has_mean("att1", lambda v: v == 3.5), context_numeric)
    # Exact float equality works here because the metric value is presumably
    # rounded to 6 decimals by the engine — TODO confirm.
    assert is_success(
        base_check.has_standard_deviation("att1", lambda v: v == 1.870829),
        context_numeric,
    )
    assert is_success(base_check.has_sum("att1", lambda v: v == 21.0), context_numeric)
    assert is_success(
        base_check.has_approx_quantile("att1", 0.5, lambda v: v == 4.0),
        context_numeric,
    )
def test_return_result_for_configured_analyzers(self, df_full):
    """The analysis run yields exactly one metric row per configured analyzer."""
    configured = [Size(), Minimum("item"), Completeness("item")]

    engine = PandasEngine(df_full)
    repository = InMemoryMetadataRepository()
    context = do_analysis_run(engine, repository, configured)
    metrics_df = AnalyzerContext.success_metrics_as_dataframe(context)

    # Expected rows mirror the analyzer list above, one row per analyzer.
    columns = ("entity", "instance", "name", "value")
    rows = [
        ("DATASET", "*", "Size", 4.0),
        ("COLUMN", "item", "Minimum", 1.0),
        ("COLUMN", "item", "Completeness", 1.0),
    ]
    expected = pd.DataFrame(rows, columns=columns)

    ConnectionHandler.close_connections()
    assert_frame_equal(metrics_df, expected, check_like=True)
def run_checks(data, *checks) -> AnalyzerContext:
    """Run every analyzer required by *checks* over *data* using an in-memory repository.

    Returns the resulting ``AnalyzerContext`` after closing engine connections.
    """
    required = []
    for check in checks:
        required.extend(check.required_analyzers())

    engine = PandasEngine(data)
    repository = InMemoryMetadataRepository()
    context = do_analysis_run(engine, repository, tuple(required))
    ConnectionHandler.close_connections()
    return context
def run_checks(data, *checks) -> AnalyzerContext:
    """Run every analyzer required by *checks* over *data*, storing metadata in DuckDB.

    Uses an in-memory DuckDB-backed metadata repository and returns the
    resulting ``AnalyzerContext`` after closing engine connections.
    """
    required = []
    for check in checks:
        required.extend(check.required_analyzers())

    engine = PandasEngine(data)
    repository = SQLMetadataRepositoryFactory.create_sql_metadata_repository("duckdb://:memory:")
    repository.set_dataset("data", "1")

    context = do_analysis_run(engine, repository, tuple(required))
    ConnectionHandler.close_connections()
    return context
def do_verification_run(
    self,
    engine: Engine,
    repo: MetadataRepository,
    checks: Sequence[Check],
    required_analyzers: Optional[Tuple[Property, ...]] = None,
) -> VerificationResult:
    """
    Runs all check groups and returns the verification result.

    The verification result includes all the metrics computed during the run.

    Parameters
    ----------
    engine:
        Execution engine wrapping the tabular data on which the checks
        should be verified.
    repo:
        Metadata repository where the executed checks are stored.
    checks:
        A sequence of check objects to be executed.
    required_analyzers:
        Can be used to enforce the calculation of some metrics regardless
        of whether there are constraints on them (optional).

    Returns
    -------
    A result for every check, including the overall status, a detailed
    status for each constraint, and all metrics produced.
    """
    required_analyzers = required_analyzers or ()
    # Combine the enforced analyzers with those each check requires.
    analyzers = required_analyzers + tuple(
        a for check in checks for a in check.required_analyzers()
    )

    # do_analysis_run returns an AnalyzerContext holding all computed metrics.
    analysis_result = do_analysis_run(engine, repo, analyzers)
    verification_result = self.evaluate(checks, analysis_result)

    # TODO: Save or append results on the metrics repository
    # TODO: Save JSON output to filesystem if necessary
    # TODO: pull up store_metrics from do_analysis_run() and include precondition metrics
    repo.store_checks(checks)

    ConnectionHandler.close_connections()
    return verification_result
def test_return_basic_statistics(self, df_with_numeric_values):
    """Basic statistics analyzers each yield the expected DoubleMetric.

    Covers Mean, StandardDeviation, Minimum, Maximum on ``att1`` and
    ApproxDistinctness on both ``att1`` and ``att2``.
    """
    df = df_with_numeric_values
    analyzers = [
        Mean("att1"),
        StandardDeviation("att1"),
        Minimum("att1"),
        Maximum("att1"),
        ApproxDistinctness("att1"),
        ApproxDistinctness("att2"),
    ]
    # Fix: use the local alias `df` consistently (it was assigned but the
    # fixture name was passed to the engine directly).
    engine = PandasEngine(df)
    repo = InMemoryMetadataRepository()
    result_metrics = do_analysis_run(engine, repo, analyzers).all_metrics()
    ConnectionHandler.close_connections()

    # One metric per configured analyzer.
    assert len(result_metrics) == len(analyzers)
    assert (
        DoubleMetric(Entity.COLUMN, "Mean", "att1", Success(3.5)) in result_metrics
    )
    assert (
        DoubleMetric(Entity.COLUMN, "Minimum", "att1", Success(1.0)) in result_metrics
    )
    assert (
        DoubleMetric(Entity.COLUMN, "ApproxDistinctness", "att1", Success(1.0))
        in result_metrics
    )
    assert (
        DoubleMetric(
            Entity.COLUMN, "ApproxDistinctness", "att2", Success(0.6666666716337205)
        )
        in result_metrics
    )
    assert (
        DoubleMetric(Entity.COLUMN, "Maximum", "att1", Success(6.0)) in result_metrics
    )
    assert (
        DoubleMetric(Entity.COLUMN, "StandardDeviation", "att1", Success(1.870829))
        in result_metrics
    )
def test_run_individual_analyzer_only_once(self, df_full):
    """Identical analyzers requested several times are computed only once."""
    duplicated_analyzers = [
        Minimum("item"),
        Minimum("item"),
        Minimum("item"),
    ]

    engine = PandasEngine(df_full)
    repository = InMemoryMetadataRepository()
    context = do_analysis_run(engine, repository, duplicated_analyzers)
    ConnectionHandler.close_connections()

    # The three duplicates collapse to a single computed metric.
    assert len(context.all_metrics()) == 1

    minimum_metric = context.metric(Minimum("item"))
    assert minimum_metric is not None
    assert minimum_metric.value.get() == 1
def test_run_analyzers_with_different_where_conditions_separately(
    self, df_with_numeric_values
):
    """Same analyzer with different `where` filters produces distinct metrics."""
    unfiltered_max = Maximum("att1")
    filtered_max = Maximum("att1", where="att1 > att2")

    engine = PandasEngine(df_with_numeric_values)
    repository = InMemoryMetadataRepository()
    context = do_analysis_run(engine, repository, [unfiltered_max, filtered_max])
    ConnectionHandler.close_connections()

    # The unfiltered maximum sees the whole column ...
    assert context.metric(unfiltered_max) == DoubleMetric(
        Entity.COLUMN, "Maximum", "att1", Success(6.0)
    )
    # ... while the filtered one only sees rows where att1 > att2.
    assert context.metric(filtered_max) == DoubleMetric(
        Entity.COLUMN, "Maximum", "att1", Success(3.0)
    )
def test_multiple_quantiles_are_computed(self, df_with_numeric_values):
    """Several Quantile analyzers on the same column produce one metric each.

    Requests the 0.1 / 0.5 / 0.9 quantiles of ``att1`` and verifies each
    ``has_approx_quantile`` constraint succeeds.
    """
    df = df_with_numeric_values
    analyzers = [
        Quantile("att1", 0.1),
        Quantile("att1", 0.5),
        Quantile("att1", 0.9),
    ]
    engine = PandasEngine(df)
    repo = InMemoryMetadataRepository()
    context_numeric = do_analysis_run(engine, repo, analyzers)
    # Release engine connections once metrics are computed, matching the
    # cleanup done by the sibling tests in this module.
    ConnectionHandler.close_connections()

    # One distinct metric per requested quantile.
    assert len(context_numeric.metric_map) == 3

    base_check = Check(CheckLevel.EXCEPTION, description="a description")
    assert is_success(
        base_check.has_approx_quantile("att1", 0.5, lambda v: v == 4.0),
        context_numeric,
    )
    assert is_success(
        base_check.has_approx_quantile("att1", 0.9, lambda v: v == 6.0),
        context_numeric,
    )
    assert is_success(
        base_check.has_approx_quantile("att1", 0.1, lambda v: v == 1.0),
        context_numeric,
    )