示例#1
0
    def test_compute_correct_metrics(self, df_with_numeric_values):
        """Check the Compliance metric for two predicates on ``att1``."""
        # (rule name, predicate, expected ratio)
        cases = (
            ("rule1", "att1 > 3", 3.0 / 6.0),
            ("rule2", "att1 > 2", 4.0 / 6.0),
        )

        for rule, predicate, ratio in cases:
            computed = Compliance(rule, predicate).calculate(
                df_with_numeric_values)
            assert computed == DoubleMetric(
                Entity.COLUMN, "Compliance", rule, Success(ratio))
示例#2
0
    def test_computes_correct_metrics(self, df_missing):
        """Check Completeness preconditions and its metric for two columns."""
        precondition_count = len(
            Completeness("some_missing_column").preconditions())
        assert precondition_count == 1, "should check colunm name"

        # (column, expected completeness ratio)
        for column, ratio in (("att1", 0.5), ("att2", 0.75)):
            measured = Completeness(column).calculate(df_missing)
            assert measured == DoubleMetric(
                Entity.COLUMN, "Completeness", column, Success(ratio))
示例#3
0
def metric_from_value(value: Union[float, Dict[str, str]], name: str,
                      instance: str,
                      entity: Entity) -> Union[DoubleMetric, SchemaMetric]:
    """Wrap a raw value in the metric type that matches it.

    A ``float`` or ``int`` becomes a ``DoubleMetric`` and a ``dict`` becomes
    a ``SchemaMetric``; in both cases the value is wrapped in ``Success``.

    Raises:
        NoMetricForValueException: if ``value`` is neither a number nor a
            dictionary.
    """
    if isinstance(value, (float, int)):
        metric_cls = DoubleMetric
    elif isinstance(value, dict):
        metric_cls = SchemaMetric
    else:
        type_name = value.__class__.__name__
        raise NoMetricForValueException(
            f"Can not create a Metric for value type {type_name}")

    return metric_cls(entity, name, instance, Success(value))
示例#4
0
def test_filtered_uniqueness(sample_data):
    """Check Uniqueness of ``value`` with and without a filter on ``type``."""
    rows = [
        ("1", "unique"),
        ("2", "unique"),
        ("3", "duplicate"),
        ("3", "duplicate"),
        ("4", "unique"),
    ]
    frame = pd.DataFrame(rows, columns=("value", "type"))
    instance = ",".join(["value"])

    unfiltered = Uniqueness(["value"])
    filtered = Uniqueness(["value"], "type=='unique'")

    # Three of the five rows carry a value that occurs only once.
    assert unfiltered.calculate(frame) == DoubleMetric(
        Entity.COLUMN, "Uniqueness", instance, Success(0.6))
    # Every row tagged 'unique' has a distinct value.
    assert filtered.calculate(frame) == DoubleMetric(
        Entity.COLUMN, "Uniqueness", instance, Success(1.0))
示例#5
0
    def test_run_analyzers_with_different_where_conditions_separately(
            self, df_with_numeric_values):
        """Two Maximum analyzers differing only in ``where`` get own metrics."""
        unfiltered = Maximum("att1")
        filtered = Maximum("att1", where="att1 > att2")

        ctx = do_analysis_run(df_with_numeric_values, [unfiltered, filtered])

        # (analyzer, expected maximum)
        for analyzer, maximum in ((unfiltered, 6.0), (filtered, 3.0)):
            assert ctx.metric(analyzer) == DoubleMetric(
                Entity.COLUMN, "Maximum", "att1", Success(maximum))
示例#6
0
    def test_filter_should_accept_custom_exception_and_message(self):
        """A rejected ``filter`` fails with the given exception type and message.

        ``Success(1).filter`` is called with an always-false predicate, a
        custom exception class and a message; the result must raise that
        exception from ``get`` and show it in its ``repr``.
        """
        class DummyException(Exception):
            pass

        failure = Success(1).filter(lambda x: False, DummyException, "dummy")
        self.assertRaises(DummyException, failure.get)

        # Python 3.7 dropped the trailing comma from the repr of an exception
        # with a single argument, so build the expected text from the running
        # interpreter instead of hard-coding the pre-3.7 spelling.
        expected_repr = "Failure({!r})".format(DummyException("dummy"))
        self.assertEqual(repr(failure), expected_repr)
示例#7
0
 def test_equality_of_failure_should_be_based_on_a_type_and_args(self):
     """Failures are equal only when exception type and args both match."""
     self.assertEqual(Failure(Exception("e")), Failure(Exception("e")))

     unequal_pairs = (
         # same type, different args
         (Failure(Exception("foo")), Failure(Exception("bar"))),
         # different types, no args
         (Failure(ZeroDivisionError()), Failure(TypeError())),
         # different types, same args
         (Failure(Exception("e")), Failure(TypeError("e"))),
         # a Failure never equals a Success
         (Failure(Exception()), Success(1)),
     )
     for left, right in unequal_pairs:
         self.assertNotEqual(left, right)
示例#8
0
    def test_computes_max_value_with_predicate_correctly(
            self, df_with_numeric_values):
        """Check Maximum of ``att1`` when a ``where`` predicate is given."""
        analyzer = Maximum("att1", where="item != '6'")
        metric = analyzer.calculate(df_with_numeric_values)

        assert metric.value == Success(5.0)
示例#9
0
def test_uniqueness_should_be_correct_for_multiple_fields(sample_data):
    """Check Uniqueness over two address columns as a multi-column metric."""
    # because "Address Line 1" is unique, all should be
    # this should also work when the columns contain None
    columns = ["Address Line 1", "Address Line 3"]

    measured = Uniqueness(columns).calculate(sample_data)

    assert measured == DoubleMetric(
        Entity.MULTICOLUMN, "Uniqueness", ",".join(columns), Success(1.0))
示例#10
0
    def test_computes_std_value_with_predicate_correctly(
            self, df_with_numeric_values):
        """Check StandardDeviation of ``att1`` with a ``where`` predicate."""
        analyzer = StandardDeviation("att1", where="item != '6'")
        metric = analyzer.calculate(df_with_numeric_values)

        assert metric.value == Success(1.4142135623730951)
示例#11
0
    def test_computes_correct_metrics(self):
        """Check the PatternMatch ratio for a ``^miguel`` prefix pattern."""
        names = ["miguel", "benjamin", "miguelito"]
        frame = pd.DataFrame({"col": names})

        measured = PatternMatch("col", r"^miguel").calculate(frame)

        # Two of the three names start with "miguel".
        expected = DoubleMetric(
            entity=Entity.COLUMN,
            name="PatternMatch",
            instance="col",
            value=Success(0.6666666666666666),
        )
        assert measured == expected
示例#12
0
    def test_generator_with_argument(self):
        """Check ``Try`` on a primed generator with one argument."""
        def echo():
            # Yields back whatever was last sent in.
            received = None
            while True:
                received = yield received

        primed = echo()
        # Advance to the first ``yield`` so the generator can accept values.
        next(primed)

        outcome = Try(primed, 41).map(lambda value: value + 1)
        self.assertEqual(outcome, Success(42))
示例#13
0
    def test_fail_with_unhashable_value(self):
        """Hashing a Success or Failure with an unhashable payload raises."""
        class UnhashableException(Exception):
            def __hash__(self):
                raise TypeError()

        # Factories keep object construction inside the ``raises`` context.
        factories = (
            lambda: Success([1]),
            lambda: Failure(UnhashableException()),
        )
        for make in factories:
            with pytest.raises(TypeError):
                hash(make())
示例#14
0
def test_double_metric_should_flatten():
    """``flatten`` on a DoubleMetric equals a one-tuple of that metric."""
    # Checked for a successful and a failed value alike.
    outcomes = (Success(50), Failure(Exception("sample")))

    for outcome in outcomes:
        metric = DoubleMetric(Entity.COLUMN, "metric-name", "instance-name",
                              outcome)
        assert metric.flatten() == (metric, )
示例#15
0
    def test_return_basic_statistics(self, df_with_numeric_values):
        """One analysis run yields one metric per analyzer for ``att1``."""
        analyzers = [
            Mean("att1"),
            StandardDeviation("att1"),
            Minimum("att1"),
            Maximum("att1"),
            # CountDistinct("att1")
        ]

        result_metrics = do_analysis_run(df_with_numeric_values,
                                         analyzers).all_metrics()

        assert len(result_metrics) == len(analyzers)

        # (metric name, expected value)
        expected_values = (
            ("Mean", 3.5),
            ("Minimum", 1.0),
            ("Maximum", 6.0),
            ("StandardDeviation", 1.707825127659933),
        )
        for metric_name, number in expected_values:
            wanted = DoubleMetric(Entity.COLUMN, metric_name, "att1",
                                  Success(number))
            assert wanted in result_metrics
示例#16
0
    def test_run_analyzers_with_different_where_conditions_separately(
        self, df_with_numeric_values
    ):
        """Two Maximum analyzers differing only in ``where`` get own metrics."""
        unfiltered = Maximum("att1")
        filtered = Maximum("att1", where="att1 > att2")

        ctx = do_analysis_run(
            PandasEngine(df_with_numeric_values),
            InMemoryMetadataRepository(),
            [unfiltered, filtered],
        )

        # Connections are closed before the assertions run.
        ConnectionHandler.close_connections()

        # (analyzer, expected maximum)
        for analyzer, maximum in ((unfiltered, 6.0), (filtered, 3.0)):
            assert ctx.metric(analyzer) == DoubleMetric(
                Entity.COLUMN, "Maximum", "att1", Success(maximum)
            )
示例#17
0
    def test_match_urls(self):
        """Check the PatternMatch ratio of the URL pattern on mixed candidates.

        Ten candidates are expected to match and three are not, so the
        metric value must equal 10/13.
        """
        valid_urls = [
            "http://foo.com/blah_blah",
            "http://foo.com/blah_blah_(wikipedia)",
            "http://foo.bar/?q=Test%20URL-encoded%20stuff",
            "http://➡.ws/䨹",
            "http://⌘.ws/",
            "http://☺.damowmow.com/",
            "http://例子.测试",
            "https://foo_bar.example.com/",
            # URL with a user-info part and an explicit port
            "http://userid@example.com:8080",
            "http://foo.com/blah_(wikipedia)#cite-1",
        ]
        invalid_urls = [
            "http://../",  # not really a valid URL
            "h://test",  # not really a valid URL
            "http://.www.foo.bar/",  # not really a valid URL
        ]
        maybe_urls = valid_urls + invalid_urls

        df = pd.DataFrame({"some": maybe_urls})
        result = PatternMatch("some", hpatterns.URL).calculate(df)

        assert result.value == Success(len(valid_urls) / len(maybe_urls))
示例#18
0
    def test_match_credit_card_numbers(self):
        """Check the PatternMatch ratio of the CREDITCARD pattern."""
        candidates = (
            "378282246310005",  # AMEX
            "6011111111111117",  # Discover
            "6011 1111 1111 1117",  # Discover spaced
            "6011-1111-1111-1117",  # Discover dashed
            "5555555555554444",  # MasterCard
            "5555 5555 5555 4444",  # MasterCard spaced
            "5555-5555-5555-4444",  # MasterCard dashed
            "4111111111111111",  # Visa
            "4111 1111 1111 1111",  # Visa spaced
            "4111-1111-1111-1111",  # Visa dashed
            "0000111122223333",  # not really a CC number
            "000011112222333",  # not really a CC number
            "00001111222233",  # not really a CC number
        )
        frame = pd.DataFrame({"some": list(candidates)})

        metric = PatternMatch("some", hpatterns.CREDITCARD).calculate(frame)

        # Ten of the thirteen candidates are expected to match.
        assert metric.value == Success(10.0 / 13.0)
示例#19
0
    def test_return_basic_statistics(self, df_with_numeric_values):
        """One analysis run yields one metric per analyzer with set values."""
        analyzers = [
            Mean("att1"),
            StandardDeviation("att1"),
            Minimum("att1"),
            Maximum("att1"),
            ApproxDistinctness("att1"),
            ApproxDistinctness("att2"),
        ]

        result_metrics = do_analysis_run(
            PandasEngine(df_with_numeric_values),
            InMemoryMetadataRepository(),
            analyzers,
        ).all_metrics()

        # Connections are closed before the assertions run.
        ConnectionHandler.close_connections()

        assert len(result_metrics) == len(analyzers)

        # (metric name, column, expected value)
        # NOTE(review): 1.870829 is a rounded literal; this presumably relies
        # on the metric comparison tolerating small float differences -
        # confirm.
        expected_values = (
            ("Mean", "att1", 3.5),
            ("Minimum", "att1", 1.0),
            ("ApproxDistinctness", "att1", 1.0),
            ("ApproxDistinctness", "att2", 0.6666666716337205),
            ("Maximum", "att1", 6.0),
            ("StandardDeviation", "att1", 1.870829),
        )
        for metric_name, column, number in expected_values:
            wanted = DoubleMetric(
                Entity.COLUMN, metric_name, column, Success(number)
            )
            assert wanted in result_metrics
示例#20
0
def metric_from_value(value: float, name: str, instance: str,
                      entity: Entity) -> DoubleMetric:
    """Build a ``DoubleMetric`` whose value is ``Success(value)``."""
    wrapped = Success(value)
    return DoubleMetric(entity, name, instance, wrapped)
示例#21
0
 def test_flatmap_should_fail_if_f_doesnt_return_try(self):
     """``flatMap`` raises TypeError when given the identity function."""
     # Look the method up outside the context so only the call is guarded.
     bound_flat_map = Success(1).flatMap
     with self.assertRaises(TypeError):
         bound_flat_map(lambda value: value)
示例#22
0
 def test_match_email_addresses(self):
     """Check the PatternMatch ratio of the EMAIL pattern on two candidates.

     The expected metric value is 0.5, i.e. one of the two rows matches.
     """
     col = "some"
     # One well-formed address and one without a top-level domain; the
     # first entry had been replaced by a masked placeholder.
     candidates = ["someone@somewhere.org", "someone@else"]
     df = pd.DataFrame({col: candidates})

     result = PatternMatch(col, hpatterns.EMAIL).calculate(df)

     assert result.value == Success(0.5)
示例#23
0
 def test_computes_correct_metrics(self, data):
     """Check that Size reports ``len(data)`` as a dataset-level metric."""
     measured = Size().calculate(data)
     expected = DoubleMetric(Entity.DATASET, "Size", "*", Success(len(data)))

     assert measured == expected
示例#24
0
 def test_compute_correct_metric_with_filtering(self, df_with_numeric_values):
     """Check Compliance when a third argument is passed to the analyzer."""
     # The third argument is presumably a row filter - confirm against
     # the Compliance signature.
     analyzer = Compliance("rule1", "att2 == 0", "att1 < 4")
     measured = analyzer.calculate(df_with_numeric_values)

     assert measured == DoubleMetric(
         Entity.COLUMN, "Compliance", "rule1", Success(1.0))
示例#25
0
 def test_map_on_success_should_return_value_depending_on_a_function(self):
     """Mapping a negation over ``Success(1)`` gives a success holding -1."""
     negated = Success(1).map(lambda value: -value)

     self.assertTrue(negated.isSuccess)
     self.assertEqual(negated.get(), -1)
示例#26
0
 def test_filter_on_success_should_return_value_depending_on_a_predicate(
         self):
     """``filter`` keeps a Success when the predicate holds, else fails."""
     def is_positive(number):
         return number > 0

     kept = Success(1).filter(is_positive)
     self.assertTrue(kept.isSuccess)

     rejected = Success(-1).filter(is_positive)
     self.assertTrue(rejected.isFailure)
示例#27
0
    def test_works_with_filtering(self, df_missing):
        """Check Completeness of ``att1`` when a second argument is passed."""
        # The second argument is presumably a row filter - confirm against
        # the Completeness signature.
        analyzer = Completeness("att1", "item==1 or item==2")
        measured = analyzer.calculate(df_missing)

        expected = DoubleMetric(Entity.COLUMN, "Completeness", "att1",
                                Success(1.0))
        assert measured == expected
示例#28
0
 def test_recover_on_success_should_return_identity(self):
     """``recover`` on a Success gives back an equal Success."""
     original = Success(1)
     # The handler would raise ZeroDivisionError if it were ever invoked.
     recovered = original.recover(lambda _error: 1 / 0)

     self.assertEqual(recovered, original)
示例#29
0
 def test_recover_with_on_success_should_return_identity(self):
     """``recoverWith`` on a Success gives back an equal Success."""
     original = Success(1)
     recovered = original.recoverWith(lambda _error: Try(lambda _arg: -1))

     self.assertEqual(recovered, original)
示例#30
0
def test_uniqunes_should_be_correct_for_a_single_column(sample_data):
    """Check Uniqueness of the single column ``Address Line 1``."""
    column = "Address Line 1"

    measured = Uniqueness([column]).calculate(sample_data)
    expected = DoubleMetric(Entity.COLUMN, "Uniqueness", column, Success(1.0))

    assert measured == expected