Exemplo n.º 1
0
    def test_return_basic_statistics(self, df_with_numeric_values):
        """Check the basic numeric statistics computed for column ``att1``.

        Runs Mean, StandardDeviation, Minimum and Maximum through
        ``do_analysis_run`` and asserts that one metric comes back per
        analyzer and that each expected ``DoubleMetric`` is among them.
        """
        requested = [
            Mean("att1"),
            StandardDeviation("att1"),
            Minimum("att1"),
            Maximum("att1"),
            # NOTE(review): CountDistinct("att1") was commented out of this
            # list; the reason is not visible from here.
        ]

        run_result = do_analysis_run(df_with_numeric_values, requested)
        metrics = run_result.all_metrics()

        assert len(metrics) == len(requested)

        # Each expected metric is built right before its membership check,
        # in the order listed.
        expectations = (
            ("Mean", 3.5),
            ("Minimum", 1.0),
            ("Maximum", 6.0),
            ("StandardDeviation", 1.707825127659933),
        )
        for metric_name, value in expectations:
            expected = DoubleMetric(
                Entity.COLUMN, metric_name, "att1", Success(value)
            )
            assert expected in metrics
Exemplo n.º 2
0
    def test_match_urls(self):
        """Check that ``hpatterns.URL`` matches 10 of the 13 candidate strings.

        The last three entries are marked in the list itself as not being
        valid URLs, so the expected match ratio is 10 / 13.
        """
        maybe_urls = [
            "http://foo.com/blah_blah",
            "http://foo.com/blah_blah_(wikipedia)",
            "http://foo.bar/?q=Test%20URL-encoded%20stuff",
            "http://➡.ws/䨹",
            "http://⌘.ws/",
            "http://☺.damowmow.com/",
            "http://例子.测试",
            "https://foo_bar.example.com/",
            # URL with a userinfo component and an explicit port.
            "http://userid@example.com:8080",
            "http://foo.com/blah_(wikipedia)#cite-1",
            "http://../",  # not really a valid URL
            "h://test",  # not really a valid URL
            "http://.www.foo.bar/",  # not really a valid URL
        ]
        df = pd.DataFrame({"some": maybe_urls})
        result = PatternMatch("some", hpatterns.URL).calculate(df)
        assert result.value == Success(10 / 13.0)
Exemplo n.º 3
0
    def test_match_credit_card_numbers(self):
        """Check that ``hpatterns.CREDITCARD`` matches 10 of 13 candidates.

        The list holds ten card-number strings (plain, spaced and dashed
        variants) followed by three strings marked as not being card numbers.
        """
        candidates = [
            "378282246310005",  # AMEX
            "6011111111111117",  # Discover
            "6011 1111 1111 1117",  # Discover spaced
            "6011-1111-1111-1117",  # Discover dashed
            "5555555555554444",  # MasterCard
            "5555 5555 5555 4444",  # MasterCard spaced
            "5555-5555-5555-4444",  # MasterCard dashed
            "4111111111111111",  # Visa
            "4111 1111 1111 1111",  # Visa spaced
            "4111-1111-1111-1111",  # Visa dashed
            "0000111122223333",  # not really a CC number
            "000011112222333",  # not really a CC number
            "00001111222233",  # not really a CC number
        ]
        frame = pd.DataFrame({"some": candidates})

        analyzer = PatternMatch("some", hpatterns.CREDITCARD)
        metric = analyzer.calculate(frame)

        expected_ratio = 10.0 / 13.0
        assert metric.value == Success(expected_ratio)
Exemplo n.º 4
0
    def test_return_basic_statistics(self, df_with_numeric_values):
        """Check basic statistics computed through a ``PandasEngine`` run.

        Builds a ``PandasEngine`` over the fixture frame and an
        ``InMemoryMetadataRepository``, runs the analyzers with
        ``do_analysis_run`` and asserts that one metric comes back per
        analyzer and that each expected ``DoubleMetric`` is among them.
        """
        analyzers = [
            Mean("att1"),
            StandardDeviation("att1"),
            Minimum("att1"),
            Maximum("att1"),
            ApproxDistinctness("att1"),
            ApproxDistinctness("att2"),
        ]

        engine = PandasEngine(df_with_numeric_values)
        repo = InMemoryMetadataRepository()

        # Close connections even when the analysis run raises, so a failing
        # run does not skip the cleanup call.
        try:
            result_metrics = do_analysis_run(
                engine, repo, analyzers
            ).all_metrics()
        finally:
            ConnectionHandler.close_connections()

        assert len(result_metrics) == len(analyzers)

        assert (
            DoubleMetric(Entity.COLUMN, "Mean", "att1", Success(3.5))
            in result_metrics
        )
        assert (
            DoubleMetric(Entity.COLUMN, "Minimum", "att1", Success(1.0))
            in result_metrics
        )
        assert (
            DoubleMetric(
                Entity.COLUMN, "ApproxDistinctness", "att1", Success(1.0)
            )
            in result_metrics
        )
        assert (
            DoubleMetric(
                Entity.COLUMN,
                "ApproxDistinctness",
                "att2",
                Success(0.6666666716337205),
            )
            in result_metrics
        )
        assert (
            DoubleMetric(Entity.COLUMN, "Maximum", "att1", Success(6.0))
            in result_metrics
        )
        assert (
            DoubleMetric(
                Entity.COLUMN, "StandardDeviation", "att1", Success(1.870829)
            )
            in result_metrics
        )
Exemplo n.º 5
0
 def test_recover_with_on_success_should_return_identity(self):
     """``recoverWith`` on a ``Success`` must equal the original."""
     original = Success(1)
     recovered = original.recoverWith(lambda x: Try(lambda x: -1))
     self.assertEqual(recovered, original)
Exemplo n.º 6
0
def test_uniqunes_should_be_correct_for_a_single_column(sample_data):
    """Uniqueness over the single column "Address Line 1" must be 1.0."""
    column = "Address Line 1"

    actual = Uniqueness([column]).calculate(sample_data)
    expected = DoubleMetric(
        Entity.COLUMN, "Uniqueness", column, Success(1.0)
    )

    assert actual == expected
Exemplo n.º 7
0
 def test_or_else_on_success_should_return_identity(self):
     """``orElse`` on a ``Success`` must equal the original ``Success``."""
     original = Success(1)
     result = original.orElse(lambda: 1)
     self.assertEqual(result, original)
Exemplo n.º 8
0
 def test_recover_on_success_should_return_identity(self):
     """``recover`` on a ``Success`` must equal the original ``Success``.

     The recovery function divides by zero, so the assertion can only hold
     if the result is produced without that function raising.
     """
     original = Success(1)
     recovered = original.recover(lambda x: 1 / 0)
     self.assertEqual(recovered, original)
Exemplo n.º 9
0
 def test_generator_without_arguments(self):
     """``Try`` over a generator yielding 1, mapped with +1, is ``Success(2)``."""
     # Calling the generator lambda gives a generator object that yields 1.
     one_shot = (lambda: (yield 1))()
     incremented = Try(one_shot).map(lambda x: x + 1)
     self.assertEqual(incremented, Success(2))
Exemplo n.º 10
0
def metric_from_value(value: float, name: str, instance: str,
                      entity: Entity) -> DoubleMetric:
    """Build a ``DoubleMetric`` whose value is ``Success(value)``.

    Args:
        value: Number to wrap in ``Success``.
        name: Metric name passed to ``DoubleMetric``.
        instance: Instance identifier passed to ``DoubleMetric``.
        entity: Entity passed to ``DoubleMetric``.

    Returns:
        ``DoubleMetric(entity, name, instance, Success(value))``.
    """
    wrapped = Success(value)
    return DoubleMetric(entity, name, instance, wrapped)
Exemplo n.º 11
0
 def test_flatmap_on_failure_should_return_failure(self):
     """``flatMap`` on a ``Failure`` must still report ``isFailure``."""
     failed = Failure(Exception(""))
     outcome = failed.flatMap(lambda x: Success(1))
     self.assertTrue(outcome.isFailure)
Exemplo n.º 12
0
 def test_match_email_addresses(self):
     """Check that ``hpatterns.EMAIL`` matches one of two candidate strings.

     The column holds one well-formed address and one string without a
     dotted domain; the expected match ratio is 0.5.
     """
     col = "some"
     # The first entry must be a real-looking address: a masked placeholder
     # such as asterisks would leave nothing for the pattern to match.
     df = pd.DataFrame({col: ["someone@somewhere.org", "someone@else"]})
     assert PatternMatch(
         col, hpatterns.EMAIL).calculate(df).value == Success(0.5)
Exemplo n.º 13
0
 def test_equality_of_success_should_be_based_on_the_equality_of_values(
         self):
     """Two ``Success`` objects compare equal only when their values do."""
     left, right = Success(1), Success(1)
     self.assertEqual(left, right)

     one, two = Success(1), Success(2)
     self.assertNotEqual(one, two)
Exemplo n.º 14
0
 def test_truthness(self):
     """A ``Failure`` must be falsy and a ``Success`` truthy."""
     failed = Failure(Exception("e"))
     self.assertFalse(failed)

     succeeded = Success(1)
     self.assertTrue(succeeded)
Exemplo n.º 15
0
 def test__try_identity_if_try_or_raise(self):
     """Check ``Try_._identity_if_try_or_raise`` on plain and Try inputs.

     The plain value 1 must raise ``TypeError``; a ``Success`` or a
     ``Failure`` must come back equal to the argument.
     """
     success = Success(1)
     failure = Failure(Exception("e"))
     check = Try_._identity_if_try_or_raise

     with self.assertRaises(TypeError):
         check(1)
     for attempt in (success, failure):
         self.assertEqual(check(attempt), attempt)
Exemplo n.º 16
0
 def test_failed_on_success_should_throw_type_error_exception(self):
     """Calling ``failed`` on a ``Success`` must raise ``TypeError``."""
     # Bind the method first so only the call itself runs inside the
     # assertRaises context.
     failed = Success(1).failed
     with self.assertRaises(TypeError):
         failed()
Exemplo n.º 17
0
 def test_or_else_on_failure_should_return_else(self):
     """``orElse`` on a ``Failure`` must equal the supplied alternative."""
     fallback = Success(1)
     failed = Failure(Exception("e"))
     self.assertEqual(failed.orElse(fallback), fallback)
Exemplo n.º 18
0
    def test_works_with_filtering(self, df_missing):
        """Completeness of ``att1`` under the item filter must be 1.0."""
        analyzer = Completeness("att1", "item==1 or item==2")

        actual = analyzer.calculate(df_missing)
        expected = DoubleMetric(
            Entity.COLUMN, "Completeness", "att1", Success(1.0)
        )

        assert actual == expected
Exemplo n.º 19
0
 def test_flatmap_on_success_should_return_value_depending_on_a_function(
         self):
     """``flatMap`` on a ``Success`` takes its outcome from the function."""
     to_success = Success(1).flatMap(lambda x: Success(1))
     self.assertTrue(to_success.isSuccess)

     to_failure = Success(1).flatMap(lambda x: Failure(Exception()))
     self.assertTrue(to_failure.isFailure)
Exemplo n.º 20
0
 def test_compute_correct_metric_with_filtering(self,
                                                df_with_numeric_values):
     """``Compliance`` for "rule1" with the ``att1 < 4`` filter must be 1.0."""
     analyzer = Compliance("rule1", "att2 == 0", "att1 < 4")

     actual = analyzer.calculate(df_with_numeric_values)
     expected = DoubleMetric(
         Entity.COLUMN, "Compliance", "rule1", Success(1.0))

     assert actual == expected
Exemplo n.º 21
0
 def test_failed_on_success_should_be_a_failure(self):
     """``failed()`` on a ``Success`` must report ``isFailure``."""
     inverted = Success(1).failed()
     self.assertTrue(inverted.isFailure)
Exemplo n.º 22
0
 def test_computes_correct_metrics(self, data):
     """``Size`` must report ``len(data)`` as a dataset-level metric."""
     actual = Size().calculate(data)
     expected = DoubleMetric(
         Entity.DATASET, "Size", "*", Success(len(data)))
     assert actual == expected
Exemplo n.º 23
0
 def test_map_on_success_should_return_value_depending_on_a_function(self):
     """``map`` on ``Success(1)`` with negation is a ``Success`` holding -1."""
     negated = Success(1).map(lambda x: -x)
     self.assertTrue(negated.isSuccess)
     self.assertEqual(negated.get(), -1)
Exemplo n.º 24
0
 def test_hashable(self):
     """Check hashing of ``Success`` and ``Failure``.

     Equal ``Success`` values must hash equally, ``Success(1)`` must hash
     to 1, and two ``Failure`` objects wrapping the same exception instance
     must hash equally.
     """
     # assertEqual reports both hashes on failure, which
     # assertTrue(a == b) cannot do.
     self.assertEqual(hash(Success(1)), hash(Success(1)))
     self.assertEqual(hash(Success(1)), 1)
     e = Exception("e")
     self.assertEqual(hash(Failure(e)), hash(Failure(e)))
Exemplo n.º 25
0
 def test_filter_on_success_should_return_value_depending_on_a_predicate(
         self):
     """``filter`` keeps a ``Success`` only when the predicate holds."""
     def is_positive(x):
         return x > 0

     kept = Success(1).filter(is_positive)
     self.assertTrue(kept.isSuccess)

     rejected = Success(-1).filter(is_positive)
     self.assertTrue(rejected.isFailure)
Exemplo n.º 26
0
 def test_flatmap_should_fail_if_f_doesnt_return_try(self):
     """``flatMap`` with a function returning a plain value raises ``TypeError``."""
     # Bind the method first so only the call itself runs inside the
     # assertRaises context.
     flat_map = Success(1).flatMap
     with self.assertRaises(TypeError):
         flat_map(lambda x: x)
Exemplo n.º 27
0
 def test_get_or_else_with_success_should_return_this_value(self):
     """``getOrElse`` on ``Success(1)`` must return 1, not the alternative."""
     value = Success(1).getOrElse(lambda: -1)
     self.assertEqual(value, 1)