Пример #1
0
    def testDetectDuplicates(self):
        # Runs detection with a NoDuplicates constraint on fields 'i' and 's',
        # first on a frame without repeated values, then on one with repeats.
        constraints = DatasetConstraints([
            FieldConstraints(name, [NoDuplicatesConstraint()])
            for name in ('i', 's')
        ])

        # No value is repeated here, so both constraints should pass and
        # detected() is expected to return None.
        clean = pd.DataFrame({
            'i': [1, 2, 3, 4, np.nan],
            's': ['one', 'two', 'three', 'four', np.nan],
        })
        clean_verifier = pdc.PandasConstraintVerifier(clean)
        clean_result = clean_verifier.detect(
            constraints, VerificationClass=pdc.PandasDetection)
        self.assertEqual(clean_result.passes, 2)
        self.assertEqual(clean_result.failures, 0)
        self.assertIsNone(clean_result.detected())

        # 2 / 'two' appear twice, so both constraints should fail; the
        # detection output is compared with the 'detect_dups.df' reference.
        duped = pd.DataFrame({
            'i': [1, 2, 3, 2, np.nan],
            's': ['one', 'two', 'three', 'two', np.nan],
        })
        duped_verifier = pdc.PandasConstraintVerifier(duped)
        duped_result = duped_verifier.detect(
            constraints,
            VerificationClass=pdc.PandasDetection,
            per_constraint=True,
            output_fields=['i', 's'],
        )
        self.assertEqual(duped_result.passes, 0)
        self.assertEqual(duped_result.failures, 2)
        detected = duped_result.detected()
        self.assertStringCorrect(detected.to_string(), 'detect_dups.df')
Пример #2
0
    def testFieldConstraintsDict(self):
        c = FieldConstraints('one', [
            TypeConstraint('int'),
            MinConstraint(3),
            MaxConstraint(7),
            SignConstraint('positive'),
            MaxNullsConstraint(0),
            NoDuplicatesConstraint()
        ])
        dfc = Fields([c])
        self.assertEqual(
            strip_lines(json.dumps(dfc.to_dict_value(), indent=4)), '''{
    "one": {
        "type": "int",
        "min": 3,
        "max": 7,
        "sign": "positive",
        "max_nulls": 0,
        "no_duplicates": true
    }
}''')

        c = FieldConstraints('one', [
            TypeConstraint('int'),
            MinConstraint(3, precision='closed'),
            MaxConstraint(7, precision='fuzzy'),
            SignConstraint('positive'),
            MaxNullsConstraint(0),
            NoDuplicatesConstraint()
        ])
        dfc = Fields([c])
        self.assertEqual(
            strip_lines(json.dumps(dfc.to_dict_value(), indent=4)), '''{
    "one": {
        "type": "int",
        "min": {
            "value": 3,
            "precision": "closed"
        },
        "max": {
            "value": 7,
            "precision": "fuzzy"
        },
        "sign": "positive",
        "max_nulls": 0,
        "no_duplicates": true
    }
}''')
Пример #3
0
 def test_constraint_repr(self):
     # Each pair is (constraint instance, text its repr() must equal).
     cases = [
         (MinConstraint(7),
          'MinConstraint(value=7, precision=None)'),
         (MinConstraint('a'),
          "MinConstraint(value='a', precision=None)"),
         (MinConstraint('a', precision='closed'),
          "MinConstraint(value='a', precision='closed')"),
         (MinLengthConstraint(3),
          "MinLengthConstraint(value=3)"),
         (MaxConstraint(-3),
          'MaxConstraint(value=-3, precision=None)'),
         (MaxConstraint('KJ'),
          "MaxConstraint(value='KJ', precision=None)"),
         (MaxConstraint(4.2, precision='closed'),
          "MaxConstraint(value=4.2, precision='closed')"),
         (MaxLengthConstraint(0),
          "MaxLengthConstraint(value=0)"),
         (SignConstraint('positive'),
          "SignConstraint(value='positive')"),
         (MaxNullsConstraint(0),
          "MaxNullsConstraint(value=0)"),
         (NoDuplicatesConstraint(),
          "NoDuplicatesConstraint(value=True)"),
         (TypeConstraint('int'),
          "TypeConstraint(value='int')"),
         (TypeConstraint(['int', 'real']),
          "TypeConstraint(value=['int', 'real'])"),
         (AllowedValuesConstraint(['a', 'b']),
          "AllowedValuesConstraint(value=['a', 'b'])"),
     ]
     for constraint, expected_text in cases:
         self.assertEqual(repr(constraint), expected_text)
Пример #4
0
 def test_verify_no_duplicates_constraint(self):
     # The column name encodes the expected outcome: a name whose
     # lower-cased form ends in 'u' must satisfy the no-duplicates
     # constraint; every other column must fail it.

     def day(n):
         # n-th of January 2000
         return datetime.datetime(2000, 1, n)

     frame = pd.DataFrame({
         'bu': [True, False, None, None],  # Note two nulls
         'iu': [1, -1, 0, 2],
         'ru': [1.0, -1.0, 0.0, 3.0],
         'su': ['a', 'b', 'c', ''],
         'du': [day(1), day(2), day(3), day(4)],
         'bd': [True, True, False, None],
         'id': [1, 2, 2, 3],
         'rd': [1.0, 2.0, 2.0, 3.0],
         'sd': ['a', 'a', 'a', 'a'],
         'dd': [day(1)] * 4,
         'IU': [1, -1, None, None],
         'RU': [1.0, -1.0, None, None],
         'SU': [None, None, 'c', ''],
         'DU': [day(1), None, None, day(4)],
         'BD': [True, True, False, None],
         'ID': [1, 2, 2, None],
         'RD': [None, 2.0, 2.0, None],
         'SD': ['a', 'a', None, 'b'],
         'DD': [day(1)] * 2 + [None] * 2,
         'nu': [None, None, None, None],
     })
     tester = ConstraintVerificationTester(self, frame)
     rule = NoDuplicatesConstraint(True)
     for name in frame:
         outcome = tester.verify_no_duplicates_constraint(name, rule)
         if name.lower().endswith('u'):
             outcome.isTrue()
         else:
             outcome.isFalse()
Пример #5
0
    def discover_field_constraints(self, fieldname):
        """
        Discover constraints for a single field.

        Input:

            *fieldname*:
                name of the field; it is passed through to the ``calc_*``
                methods, ``is_null`` and ``find_rexes`` on ``self``.

        Returns:

            - a ``FieldConstraints`` object for *fieldname* holding every
              constraint that was discovered (always including the type
              constraint).
            - ``None`` if ``calc_tdda_type`` reports the type ``'other'``.

        """
        min_constraint = max_constraint = None
        min_length_constraint = max_length_constraint = None
        sign_constraint = no_duplicates_constraint = None
        max_nulls_constraint = allowed_values_constraint = None
        rex_constraint = None

        # Bound up front because the rex discovery at the end of this method
        # reads it even when there are no records; previously that path
        # raised UnboundLocalError.
        uniqs = None

        type_ = self.calc_tdda_type(fieldname)
        if type_ == 'other':
            return None  # Unrecognized or complex
        else:
            type_constraint = TypeConstraint(type_)
        length = self.get_nrecords()

        if length > 0:  # Things are not very interesting when there is no data
            nNull = self.calc_null_count(fieldname)
            nNonNull = self.calc_non_null_count(fieldname)
            assert nNull + nNonNull == length
            # A max-nulls constraint is only generated for 0 or 1 nulls.
            if nNull < 2:
                max_nulls_constraint = MaxNullsConstraint(nNull)

            # Useful info:
            n_unique = -1  # won't equal number of non-nulls later on
            if type_ in ('string', 'int'):
                n_unique = self.calc_nunique(fieldname)
                if type_ == 'string':
                    if n_unique <= MAX_CATEGORIES:
                        uniqs = self.calc_unique_values(fieldname,
                                                        include_nulls=False)
                    if uniqs:
                        avc = AllowedValuesConstraint(uniqs)
                        allowed_values_constraint = avc

            if nNonNull > 0:
                if type_ == 'string':
                    # We don't generate a min, max or sign constraints for
                    # strings. But we do generate min and max length
                    # constraints
                    if (uniqs is None and n_unique > 0):
                        # There were too many for us to have bothered getting
                        # them all before, but we need them now.
                        uniqs = self.calc_unique_values(fieldname,
                                                        include_nulls=False)
                    if uniqs:
                        # Values that are not unicode strings are decoded as
                        # UTF-8 before their length is taken.
                        if type(uniqs[0]) is unicode_string:
                            L = [len(v) for v in uniqs]
                        else:
                            L = [len(v.decode('UTF-8')) for v in uniqs]
                        m = min(L)
                        M = max(L)
                        min_length_constraint = MinLengthConstraint(m)
                        max_length_constraint = MaxLengthConstraint(M)
                else:
                    # Non-string fields all potentially get min and max values
                    m = self.calc_min(fieldname)
                    M = self.calc_max(fieldname)
                    if not self.is_null(m):
                        min_constraint = MinConstraint(m)
                    if not self.is_null(M):
                        max_constraint = MaxConstraint(M)

                    # Non-date fields potentially get a sign constraint too.
                    if min_constraint and max_constraint and type_ != 'date':
                        if m == M == 0:
                            sign_constraint = SignConstraint('zero')
                        elif m >= 0:
                            sign = 'positive' if m > 0 else 'non-negative'
                            sign_constraint = SignConstraint(sign)
                        elif M <= 0:
                            sign = 'negative' if M < 0 else 'non-positive'
                            sign_constraint = SignConstraint(sign)
                        # else:
                        # mixed
                    elif self.is_null(m) and type_ != 'date':
                        sign_constraint = SignConstraint('null')

            # n_unique stays at -1 for types other than string/int, so only
            # those types can get a no-duplicates constraint.
            if n_unique == nNonNull and n_unique > 1 and type_ != 'real':
                no_duplicates_constraint = NoDuplicatesConstraint()

        # Not guarded by the record count: with no records, find_rexes is
        # called with values=None.
        if type_ == 'string' and self.inc_rex:
            rex_constraint = RexConstraint(
                self.find_rexes(fieldname, values=uniqs))

        constraints = [
            c for c in [
                type_constraint, min_constraint, max_constraint,
                min_length_constraint, max_length_constraint, sign_constraint,
                max_nulls_constraint, no_duplicates_constraint,
                allowed_values_constraint, rex_constraint
            ] if c is not None
        ]
        return FieldConstraints(fieldname, constraints)
Пример #6
0
def discover_field_constraints(field):
    """
    Work out which constraints hold for one field (column) of a
    Pandas DataFrame.

    Input:

        *field*:
            a single field (column; Series) object, usually from
            a Pandas DataFrame.

    Returns:

        - ``None`` if ``tdda_type`` classifies the field as ``'other'``.
        - otherwise a :py:class:`tdda.base.FieldConstraints` object for
          ``field.name``; it always contains the type constraint, plus
          whichever other constraints were discovered.

    """
    kind = tdda_type(field)
    if kind == 'other':
        return None         # Unrecognized or complex

    type_rule = TypeConstraint(kind)
    min_rule = max_rule = None
    min_len_rule = max_len_rule = None
    sign_rule = no_dups_rule = None
    max_nulls_rule = allowed_rule = None

    n_rows = len(field)
    if n_rows > 0:  # Nothing beyond the type can be said about no data
        n_null = int(field.isnull().sum().astype(int))
        n_present = int(field.notnull().sum().astype(int))
        assert n_null + n_present == n_rows
        # A max-nulls constraint is only generated for 0 or 1 nulls.
        if n_null < 2:
            max_nulls_rule = MaxNullsConstraint(n_null)

        distinct = None
        n_distinct = -1     # sentinel: can never equal the non-null count
        if kind in ('string', 'int'):
            n_distinct = field.nunique()        # NaN is not counted
            if kind == 'string':
                if n_distinct <= MAX_CATEGORIES:
                    distinct = list(field.dropna().unique())
                if distinct:
                    allowed_rule = AllowedValuesConstraint(distinct)

        if n_present > 0 and kind == 'string':
            # Strings get length limits instead of min, max and sign.
            if distinct is None and n_distinct > 0:
                # Too many values to have been collected above;
                # they are needed now.
                distinct = list(field.dropna().unique())
            if distinct:
                lengths = [len(value) for value in distinct]
                shortest = min(lengths)
                longest = max(lengths)
                min_len_rule = MinLengthConstraint(shortest)
                max_len_rule = MaxLengthConstraint(longest)
        elif n_present > 0:
            # Every non-string type may get min and max values.
            if kind == 'date':
                lowest = field.min()
                highest = field.max()
                if pd.notnull(lowest):
                    lowest = lowest.to_pydatetime()
                if pd.notnull(highest):
                    highest = highest.to_pydatetime()
            else:
                lowest = field.min().item()
                highest = field.max().item()
            if pd.notnull(lowest):
                min_rule = MinConstraint(lowest)
            if pd.notnull(highest):
                max_rule = MaxConstraint(highest)

            # Dates never get a sign constraint; other types may.
            if kind != 'date':
                if min_rule and max_rule:
                    if lowest == highest == 0:
                        sign = 'zero'
                    elif lowest >= 0:
                        sign = 'positive' if lowest > 0 else 'non-negative'
                    elif highest <= 0:
                        sign = 'negative' if highest < 0 else 'non-positive'
                    else:
                        sign = None     # mixed signs: no constraint
                    if sign is not None:
                        sign_rule = SignConstraint(sign)
                elif pd.isnull(lowest):
                    sign_rule = SignConstraint('null')

        if n_distinct == n_present and n_distinct > 1 and kind != 'real':
            no_dups_rule = NoDuplicatesConstraint()

    candidates = [
        type_rule,
        min_rule, max_rule,
        min_len_rule, max_len_rule,
        sign_rule, max_nulls_rule,
        no_dups_rule,
        allowed_rule,
    ]
    found = [rule for rule in candidates if rule is not None]
    return FieldConstraints(field.name, found)
Пример #7
0
    def testFieldVerification(self):
        # Verifies constraint sets for field 'i' against small frames and
        # checks both the text summary (str(results)) and the DataFrame
        # produced by PandasVerification.verification_to_dataframe.

        def check_frame(results, *columns):
            # Compare the verification DataFrame with the expected columns,
            # given as (name, values) pairs in column order.
            wanted = pd.DataFrame(OrderedDict(columns))
            actual = pdc.PandasVerification.verification_to_dataframe(results)
            self.assertTrue(actual.equals(wanted))

        span = range(1, 5)
        good_df = pd.DataFrame({
            'b': [True, False] * 2,
            'i': span,
            'r': [float(x) for x in span],
            's': ['S%s' % x for x in span],
            'd': [datetime.datetime(2016, 1, x) for x in span],
        })
        satisfied = FieldConstraints('i', [
            TypeConstraint('int'),
            MinConstraint(0),
            MaxConstraint(10),
            SignConstraint('positive'),
            MaxNullsConstraint(0),
            NoDuplicatesConstraint(),
        ])
        violated = FieldConstraints('i', [
            TypeConstraint('bool'),
            MinConstraint(2),
            MaxConstraint(3),
            SignConstraint('negative'),
            MaxNullsConstraint(0),
            NoDuplicatesConstraint(),
        ])

        # Case 1: all six constraints hold.
        rules = DatasetConstraints([satisfied])
        verifier = pdc.PandasConstraintVerifier(good_df)
        results = verify(rules, list(good_df), verifier.verifiers())
        self.assertEqual(
            str(results),
            'FIELDS:\n\n'
            'i: 0 failures  6 passes  '
            'type ✓  min ✓  max ✓  sign ✓  '
            'max_nulls ✓  no_duplicates ✓\n\n'
            'SUMMARY:\n\nConstraints passing: 6\nConstraints failing: 0')
        check_frame(
            results,
            ('field', ['i']),
            ('failures', [0]),
            ('passes', [6]),
            ('type', [True]),
            ('min', [True]),
            ('max', [True]),
            ('sign', [True]),
            ('max_nulls', [True]),
            ('no_duplicates', [True]),
        )

        # Case 2: default type checking.
        # expect the boolean->real type constraint to pass with sloppy types
        bad_df = pd.DataFrame({'i': [1, 2, 2, 6, np.nan]})
        bad_rules = DatasetConstraints([violated])
        sloppy_verifier = pdc.PandasConstraintVerifier(bad_df)
        sloppy_results = verify(bad_rules, list(bad_df),
                                sloppy_verifier.verifiers())
        self.assertEqual(
            str(sloppy_results),
            'FIELDS:\n\n'
            'i: 5 failures  1 pass  '
            'type ✓  min ✗  max ✗  sign ✗  '
            'max_nulls ✗  no_duplicates ✗\n\n'
            'SUMMARY:\n\nConstraints passing: 1\nConstraints failing: 5')
        check_frame(
            sloppy_results,
            ('field', ['i']),
            ('failures', [5]),
            ('passes', [1]),
            ('type', [True]),
            ('min', [False]),
            ('max', [False]),
            ('sign', [False]),
            ('max_nulls', [False]),
            ('no_duplicates', [False]),
        )

        # Case 3: same data and constraints with strict type checking.
        # expect the boolean->real type constraint to fail with strict types
        strict_verifier = pdc.PandasConstraintVerifier(bad_df,
                                                       type_checking='strict')
        strict_results = verify(bad_rules, list(bad_df),
                                strict_verifier.verifiers())
        self.assertEqual(
            str(strict_results),
            'FIELDS:\n\n'
            'i: 6 failures  0 passes  '
            'type ✗  min ✗  max ✗  sign ✗  '
            'max_nulls ✗  no_duplicates ✗\n\n'
            'SUMMARY:\n\nConstraints passing: 0\nConstraints failing: 6')
        check_frame(
            strict_results,
            ('field', ['i']),
            ('failures', [6]),
            ('passes', [0]),
            ('type', [False]),
            ('min', [False]),
            ('max', [False]),
            ('sign', [False]),
            ('max_nulls', [False]),
            ('no_duplicates', [False]),
        )

        # Case 4: a lone type constraint on the first frame.
        type_only = DatasetConstraints(
            [FieldConstraints('i', [TypeConstraint('int')])])
        type_verifier = pdc.PandasConstraintVerifier(good_df)
        type_results = verify(type_only, list(good_df),
                              type_verifier.verifiers())
        self.assertEqual(
            str(type_results),
            'FIELDS:\n\n'
            'i: 0 failures  1 pass  type ✓\n\n'
            'SUMMARY:\n\nConstraints passing: 1\nConstraints failing: 0')
        check_frame(
            type_results,
            ('field', ['i']),
            ('failures', [0]),
            ('passes', [1]),
            ('type', [True]),
        )

        # Case 5: as case 4 but with ascii=True, which should report 'OK'
        # in place of the tick mark.
        ascii_verifier = pdc.PandasConstraintVerifier(good_df)
        ascii_results = verify(type_only, list(good_df),
                               ascii_verifier.verifiers(), ascii=True)
        self.assertEqual(
            str(ascii_results),
            'FIELDS:\n\n'
            'i: 0 failures  1 pass  type OK\n\n'
            'SUMMARY:\n\nConstraints passing: 1\nConstraints failing: 0')
Пример #8
0
    def testFieldVerification(self):
        # Verifies constraint sets for field 'i' against small frames and
        # checks both the text summary (str(results)) and the DataFrame
        # produced by pdc.verification_to_dataframe.

        def check_frame(results, *columns):
            # Compare the verification DataFrame with the expected columns,
            # given as (name, values) pairs in column order.
            wanted = pd.DataFrame(OrderedDict(columns))
            self.assertTrue(
                pdc.verification_to_dataframe(results).equals(wanted))

        span = range(1, 5)
        good_df = pd.DataFrame({
            'b': [True, False] * 2,
            'i': span,
            'r': [float(x) for x in span],
            's': ['S%s' % x for x in span],
            'd': [datetime.datetime(2016, 1, x) for x in span],
        })
        satisfied = FieldConstraints('i', [
            TypeConstraint('int'),
            MinConstraint(0),
            MaxConstraint(10),
            SignConstraint('positive'),
            MaxNullsConstraint(0),
            NoDuplicatesConstraint(),
        ])
        violated = FieldConstraints('i', [
            TypeConstraint('bool'),
            MinConstraint(2),
            MaxConstraint(3),
            SignConstraint('negative'),
            MaxNullsConstraint(0),
            NoDuplicatesConstraint(),
        ])

        # Case 1: all six constraints hold.
        rules = DatasetConstraints([satisfied])
        verifier = pdc.PandasConstraintVerifier(good_df)
        results = verify(rules, verifier.verifiers())
        self.assertEqual(
            str(results),
            'FIELDS:\n\n'
            'i: 0 failures  6 passes  '
            'type ✓  min ✓  max ✓  sign ✓  '
            'max_nulls ✓  no_duplicates ✓\n\n'
            'SUMMARY:\n\nPasses: 6\nFailures: 0')
        check_frame(
            results,
            ('field', ['i']),
            ('failures', [0]),
            ('passes', [6]),
            ('type', [True]),
            ('min', [True]),
            ('max', [True]),
            ('sign', [True]),
            ('max_nulls', [True]),
            ('no_duplicates', [True]),
        )

        # Case 2: all six constraints are expected to fail.
        bad_df = pd.DataFrame({'i': [1, 2, 2, 6, np.nan]})
        bad_rules = DatasetConstraints([violated])
        bad_verifier = pdc.PandasConstraintVerifier(bad_df)
        bad_results = verify(bad_rules, bad_verifier.verifiers())
        self.assertEqual(
            str(bad_results),
            'FIELDS:\n\n'
            'i: 6 failures  0 passes  '
            'type ✗  min ✗  max ✗  sign ✗  '
            'max_nulls ✗  no_duplicates ✗\n\n'
            'SUMMARY:\n\nPasses: 0\nFailures: 6')
        check_frame(
            bad_results,
            ('field', ['i']),
            ('failures', [6]),
            ('passes', [0]),
            ('type', [False]),
            ('min', [False]),
            ('max', [False]),
            ('sign', [False]),
            ('max_nulls', [False]),
            ('no_duplicates', [False]),
        )

        # Case 3: a lone type constraint on the first frame.
        type_only = DatasetConstraints(
            [FieldConstraints('i', [TypeConstraint('int')])])
        type_verifier = pdc.PandasConstraintVerifier(good_df)
        type_results = verify(type_only, type_verifier.verifiers())
        self.assertEqual(
            str(type_results),
            'FIELDS:\n\n'
            'i: 0 failures  1 pass  type ✓\n\n'
            'SUMMARY:\n\nPasses: 1\nFailures: 0')
        check_frame(
            type_results,
            ('field', ['i']),
            ('failures', [0]),
            ('passes', [1]),
            ('type', [True]),
        )