def testFieldConstraintsDict(self):
    """Check the JSON rendering of a field's constraints dictionary.

    Two cases are covered: min/max given as bare values, and min/max
    given together with an explicit precision.
    """
    # NOTE(review): the expected literals below are reproduced exactly as
    # found; their internal whitespace may once have been a multi-line
    # layout -- confirm against what strip_lines() produces.

    # Case 1: min and max are bare values.
    plain = FieldConstraints('one', [
        TypeConstraint('int'),
        MinConstraint(3),
        MaxConstraint(7),
        SignConstraint('positive'),
        MaxNullsConstraint(0),
        NoDuplicatesConstraint(),
    ])
    plain_fields = Fields([plain])
    plain_json = json.dumps(plain_fields.to_dict_value(), indent=4)
    self.assertEqual(
        strip_lines(plain_json),
        '''{ "one": { "type": "int", "min": 3, "max": 7, "sign": "positive", "max_nulls": 0, "no_duplicates": true } }''')

    # Case 2: min and max each carry a precision.
    precise = FieldConstraints('one', [
        TypeConstraint('int'),
        MinConstraint(3, precision='closed'),
        MaxConstraint(7, precision='fuzzy'),
        SignConstraint('positive'),
        MaxNullsConstraint(0),
        NoDuplicatesConstraint(),
    ])
    precise_fields = Fields([precise])
    precise_json = json.dumps(precise_fields.to_dict_value(), indent=4)
    self.assertEqual(
        strip_lines(precise_json),
        '''{ "one": { "type": "int", "min": { "value": 3, "precision": "closed" }, "max": { "value": 7, "precision": "fuzzy" }, "sign": "positive", "max_nulls": 0, "no_duplicates": true } }''')
def test_constraint_repr(self):
    """Check ``repr()`` of each constraint kind against its expected text."""

    def expect_repr(constraint, text):
        # One comparison per call, so checks run in the order written.
        self.assertEqual(repr(constraint), text)

    # Minimum value / minimum length.
    expect_repr(MinConstraint(7),
                'MinConstraint(value=7, precision=None)')
    expect_repr(MinConstraint('a'),
                "MinConstraint(value='a', precision=None)")
    expect_repr(MinConstraint('a', precision='closed'),
                "MinConstraint(value='a', precision='closed')")
    expect_repr(MinLengthConstraint(3),
                "MinLengthConstraint(value=3)")

    # Maximum value / maximum length.
    expect_repr(MaxConstraint(-3),
                'MaxConstraint(value=-3, precision=None)')
    expect_repr(MaxConstraint('KJ'),
                "MaxConstraint(value='KJ', precision=None)")
    expect_repr(MaxConstraint(4.2, precision='closed'),
                "MaxConstraint(value=4.2, precision='closed')")
    expect_repr(MaxLengthConstraint(0),
                "MaxLengthConstraint(value=0)")

    # Sign, nulls and duplicates.
    expect_repr(SignConstraint('positive'),
                "SignConstraint(value='positive')")
    expect_repr(MaxNullsConstraint(0),
                "MaxNullsConstraint(value=0)")
    expect_repr(NoDuplicatesConstraint(),
                "NoDuplicatesConstraint(value=True)")

    # Type and allowed values.
    expect_repr(TypeConstraint('int'),
                "TypeConstraint(value='int')")
    expect_repr(TypeConstraint(['int', 'real']),
                "TypeConstraint(value=['int', 'real'])")
    expect_repr(AllowedValuesConstraint(['a', 'b']),
                "AllowedValuesConstraint(value=['a', 'b'])")
def test_verify_tdda_max_nulls_constraint(self):
    """Check ``MaxNullsConstraint(0)`` on columns with and without nulls.

    Every column whose name ends in ``'n'`` holds at least one ``None``
    and is expected to fail; all other columns are expected to pass.
    """
    frame = pd.DataFrame({
        # Fully populated columns.
        'b': [True, False],
        'i': [1, -1],
        'r': [1.0, -1.0],
        's': ['a', 'a'],
        'd': [datetime.datetime.now()] * 2,
        # Columns containing a null (names end in 'n').
        'bn': [True, None],
        'in': [1, None],
        'rn': [None, 1.0],
        'sn': [None, 'a'],
        'dn': [datetime.datetime.now(), None],
        'n': [None, None],
    })
    tester = ConstraintVerificationTester(self, frame)
    no_nulls = MaxNullsConstraint(0)
    for name in frame:
        outcome = tester.verify_max_nulls_constraint(name, no_nulls)
        if name.endswith('n'):
            outcome.isFalse()
        else:
            outcome.isTrue()
def discover_field_constraints(self, fieldname):
    """
    Discover constraints for a single field.

    Input:

        *fieldname*:
            name of the field; it is handed to this object's ``calc_*``
            helpers to obtain the type, counts, unique values, minimum
            and maximum.

    Returns:

        - ``FieldConstraints`` object for the field, holding the type
          constraint plus whichever other constraints were found.
        - ``None`` if the field's type is ``'other'``.
    """
    min_constraint = max_constraint = None
    min_length_constraint = max_length_constraint = None
    sign_constraint = no_duplicates_constraint = None
    max_nulls_constraint = allowed_values_constraint = None
    rex_constraint = None

    type_ = self.calc_tdda_type(fieldname)
    if type_ == 'other':
        return None         # Unrecognized or complex
    type_constraint = TypeConstraint(type_)
    length = self.get_nrecords()

    if length > 0:  # Things are not very interesting when there is no data
        nNull = self.calc_null_count(fieldname)
        nNonNull = self.calc_non_null_count(fieldname)
        assert nNull + nNonNull == length
        if nNull < 2:
            max_nulls_constraint = MaxNullsConstraint(nNull)

        # Useful info:
        uniqs = None
        n_unique = -1   # won't equal number of non-nulls later on
        if type_ in ('string', 'int'):
            n_unique = self.calc_nunique(fieldname)
            if type_ == 'string':
                if n_unique <= MAX_CATEGORIES:
                    uniqs = self.calc_unique_values(fieldname,
                                                    include_nulls=False)
                if uniqs:
                    avc = AllowedValuesConstraint(uniqs)
                    allowed_values_constraint = avc

        if nNonNull > 0:
            if type_ == 'string':
                # We don't generate a min, max or sign constraints for
                # strings. But we do generate min and max length
                # constraints
                if uniqs is None and n_unique > 0:
                    # There were too many for us to have bothered getting
                    # them all before, but we need them now.
                    uniqs = self.calc_unique_values(fieldname,
                                                    include_nulls=False)
                if uniqs:
                    # isinstance (rather than an exact type match) keeps
                    # subclasses of the text type on the plain len() path;
                    # they have no need of decoding.
                    if isinstance(uniqs[0], unicode_string):
                        L = [len(v) for v in uniqs]
                    else:
                        L = [len(v.decode('UTF-8')) for v in uniqs]
                    m = min(L)
                    M = max(L)
                    min_length_constraint = MinLengthConstraint(m)
                    max_length_constraint = MaxLengthConstraint(M)
            else:
                # Non-string fields all potentially get min and max values
                m = self.calc_min(fieldname)
                M = self.calc_max(fieldname)
                if not self.is_null(m):
                    min_constraint = MinConstraint(m)
                if not self.is_null(M):
                    max_constraint = MaxConstraint(M)

                # Non-date fields potentially get a sign constraint too.
                if min_constraint and max_constraint and type_ != 'date':
                    if m == M == 0:
                        sign_constraint = SignConstraint('zero')
                    elif m >= 0:
                        sign = 'positive' if m > 0 else 'non-negative'
                        sign_constraint = SignConstraint(sign)
                    elif M <= 0:
                        sign = 'negative' if M < 0 else 'non-positive'
                        sign_constraint = SignConstraint(sign)
                    # else:
                        # mixed
                elif self.is_null(m) and type_ != 'date':
                    sign_constraint = SignConstraint('null')

        if n_unique == nNonNull and n_unique > 1 and type_ != 'real':
            no_duplicates_constraint = NoDuplicatesConstraint()

        # Kept inside the non-empty branch: uniqs is only assigned here,
        # so the lookup below can never hit an unbound name.
        if type_ == 'string' and self.inc_rex:
            rex_constraint = RexConstraint(
                self.find_rexes(fieldname, values=uniqs))

    # Keep only the constraints that were actually discovered.
    constraints = [c for c in [type_constraint,
                               min_constraint, max_constraint,
                               min_length_constraint, max_length_constraint,
                               sign_constraint, max_nulls_constraint,
                               no_duplicates_constraint,
                               allowed_values_constraint,
                               rex_constraint]
                   if c is not None]
    return FieldConstraints(fieldname, constraints)
def discover_field_constraints(field):
    """
    Work out the constraints satisfied by one field (column) of a
    Pandas DataFrame.

    Input:

        *field*:
            a single field (column; Series) object, usually taken
            from a Pandas DataFrame.

    Returns:

        - :py:class:`tdda.base.FieldConstraints` object, if any
          constraints were found.
        - ``None``, otherwise.
    """
    type_ = tdda_type(field)
    if type_ == 'other':
        return None         # Unrecognized or complex
    type_constraint = TypeConstraint(type_)

    min_constraint = None
    max_constraint = None
    min_length_constraint = None
    max_length_constraint = None
    sign_constraint = None
    no_duplicates_constraint = None
    max_nulls_constraint = None
    allowed_values_constraint = None

    n_rows = len(field)
    if n_rows > 0:  # nothing of interest can be said about an empty field
        n_null = int(field.isnull().sum().astype(int))
        n_present = int(field.notnull().sum().astype(int))
        assert n_null + n_present == n_rows
        if n_null < 2:
            max_nulls_constraint = MaxNullsConstraint(n_null)

        distinct = None
        n_distinct = -1     # sentinel: can never equal the non-null count
        if type_ in ('string', 'int'):
            n_distinct = field.nunique()    # NaN is not counted
            if type_ == 'string':
                if n_distinct <= MAX_CATEGORIES:
                    distinct = list(field.dropna().unique())
                if distinct:
                    allowed_values_constraint = AllowedValuesConstraint(
                        distinct)

        if n_present > 0:
            if type_ == 'string':
                # Strings get length limits rather than min, max or sign.
                if distinct is None and n_distinct > 0:
                    # Too many values to have been collected above,
                    # but they are required for the lengths.
                    distinct = list(field.dropna().unique())
                if distinct:
                    lengths = [len(v) for v in distinct]
                    min_length_constraint = MinLengthConstraint(min(lengths))
                    max_length_constraint = MaxLengthConstraint(max(lengths))
            else:
                # Every non-string type may get a minimum and a maximum.
                if type_ == 'date':
                    lo = field.min()
                    hi = field.max()
                    if pd.notnull(lo):
                        lo = lo.to_pydatetime()
                    if pd.notnull(hi):
                        hi = hi.to_pydatetime()
                else:
                    lo = field.min().item()
                    hi = field.max().item()
                if pd.notnull(lo):
                    min_constraint = MinConstraint(lo)
                if pd.notnull(hi):
                    max_constraint = MaxConstraint(hi)

                # Sign constraints apply to non-date fields only.
                if min_constraint and max_constraint and type_ != 'date':
                    if lo == hi == 0:
                        label = 'zero'
                    elif lo >= 0:
                        label = 'positive' if lo > 0 else 'non-negative'
                    elif hi <= 0:
                        label = 'negative' if hi < 0 else 'non-positive'
                    else:
                        label = None        # mixed signs: no constraint
                    if label is not None:
                        sign_constraint = SignConstraint(label)
                elif pd.isnull(lo) and type_ != 'date':
                    sign_constraint = SignConstraint('null')

        if n_distinct == n_present and n_distinct > 1 and type_ != 'real':
            no_duplicates_constraint = NoDuplicatesConstraint()

    candidates = (
        type_constraint,
        min_constraint,
        max_constraint,
        min_length_constraint,
        max_length_constraint,
        sign_constraint,
        max_nulls_constraint,
        no_duplicates_constraint,
        allowed_values_constraint,
    )
    found = [c for c in candidates if c is not None]
    return FieldConstraints(field.name, found)
def testFieldVerification(self):
    """Verify constraints on field ``i`` and check both report formats.

    Every scenario compares ``str()`` of the verification result, and
    (except the final ASCII one) the DataFrame built from it, with the
    expected values.
    """

    def report_frame(*columns):
        # Expected report frame; columns appear in the order given.
        return pd.DataFrame(OrderedDict(columns))

    clean = pd.DataFrame({
        'b': [True, False] * 2,
        'i': range(1, 5),
        'r': [float(x) for x in range(1, 5)],
        's': ['S%s' % x for x in range(1, 5)],
        'd': [datetime.datetime(2016, 1, x) for x in range(1, 5)]
    })

    satisfied = FieldConstraints('i', [
        TypeConstraint('int'),
        MinConstraint(0),
        MaxConstraint(10),
        SignConstraint('positive'),
        MaxNullsConstraint(0),
        NoDuplicatesConstraint(),
    ])
    violated = FieldConstraints('i', [
        TypeConstraint('bool'),
        MinConstraint(2),
        MaxConstraint(3),
        SignConstraint('negative'),
        MaxNullsConstraint(0),
        NoDuplicatesConstraint(),
    ])

    # Scenario 1: every constraint holds.
    dataset1 = DatasetConstraints([satisfied])
    verifier1 = pdc.PandasConstraintVerifier(clean)
    results1 = verify(dataset1, list(clean), verifier1.verifiers())
    expected_text = (
        'FIELDS:\n\n'
        'i: 0 failures 6 passes '
        'type ✓ min ✓ max ✓ sign ✓ '
        'max_nulls ✓ no_duplicates ✓\n\n'
        'SUMMARY:\n\nConstraints passing: 6\nConstraints failing: 0')
    self.assertEqual(str(results1), expected_text)
    expected_frame = report_frame(
        ('field', ['i']),
        ('failures', [0]),
        ('passes', [6]),
        ('type', [True]),
        ('min', [True]),
        ('max', [True]),
        ('sign', [True]),
        ('max_nulls', [True]),
        ('no_duplicates', [True]),
    )
    actual_frame = pdc.PandasVerification.verification_to_dataframe(results1)
    self.assertTrue(actual_frame.equals(expected_frame))

    # Scenario 2: default type checking.
    with_null = pd.DataFrame({'i': [1, 2, 2, 6, np.nan]})
    dataset2 = DatasetConstraints([violated])
    verifier2 = pdc.PandasConstraintVerifier(with_null)
    results2 = verify(dataset2, list(with_null), verifier2.verifiers())
    # expect the boolean->real type constraint to pass with sloppy types
    expected_text = (
        'FIELDS:\n\n'
        'i: 5 failures 1 pass '
        'type ✓ min ✗ max ✗ sign ✗ '
        'max_nulls ✗ no_duplicates ✗\n\n'
        'SUMMARY:\n\nConstraints passing: 1\nConstraints failing: 5')
    self.assertEqual(str(results2), expected_text)
    expected_frame = report_frame(
        ('field', ['i']),
        ('failures', [5]),
        ('passes', [1]),
        ('type', [True]),
        ('min', [False]),
        ('max', [False]),
        ('sign', [False]),
        ('max_nulls', [False]),
        ('no_duplicates', [False]),
    )
    actual_frame = pdc.PandasVerification.verification_to_dataframe(results2)
    self.assertTrue(actual_frame.equals(expected_frame))

    # Scenario 3: same data and constraints, strict type checking.
    strict_verifier = pdc.PandasConstraintVerifier(with_null,
                                                   type_checking='strict')
    strict_results = verify(dataset2, list(with_null),
                            strict_verifier.verifiers())
    # expect the boolean->real type constraint to fail with strict types
    expected_text = (
        'FIELDS:\n\n'
        'i: 6 failures 0 passes '
        'type ✗ min ✗ max ✗ sign ✗ '
        'max_nulls ✗ no_duplicates ✗\n\n'
        'SUMMARY:\n\nConstraints passing: 0\nConstraints failing: 6')
    self.assertEqual(str(strict_results), expected_text)
    expected_frame = report_frame(
        ('field', ['i']),
        ('failures', [6]),
        ('passes', [0]),
        ('type', [False]),
        ('min', [False]),
        ('max', [False]),
        ('sign', [False]),
        ('max_nulls', [False]),
        ('no_duplicates', [False]),
    )
    actual_frame = pdc.PandasVerification.verification_to_dataframe(
        strict_results)
    self.assertTrue(actual_frame.equals(expected_frame))

    # Scenario 4: a lone type constraint on the first frame.
    type_only = FieldConstraints('i', [TypeConstraint('int')])
    same_frame = clean
    dataset3 = DatasetConstraints([type_only])
    verifier3 = pdc.PandasConstraintVerifier(same_frame)
    results3 = verify(dataset3, list(same_frame), verifier3.verifiers())
    expected_text = (
        'FIELDS:\n\n'
        'i: 0 failures 1 pass type ✓\n\n'
        'SUMMARY:\n\nConstraints passing: 1\nConstraints failing: 0')
    self.assertEqual(str(results3), expected_text)
    expected_frame = report_frame(
        ('field', ['i']),
        ('failures', [0]),
        ('passes', [1]),
        ('type', [True]),
    )
    actual_frame = pdc.PandasVerification.verification_to_dataframe(results3)
    self.assertTrue(actual_frame.equals(expected_frame))

    # Scenario 5: as scenario 4, but with ascii=True passed to verify().
    verifier3 = pdc.PandasConstraintVerifier(same_frame)
    results3 = verify(dataset3, list(same_frame), verifier3.verifiers(),
                      ascii=True)
    expected_text = (
        'FIELDS:\n\n'
        'i: 0 failures 1 pass type OK\n\n'
        'SUMMARY:\n\nConstraints passing: 1\nConstraints failing: 0')
    self.assertEqual(str(results3), expected_text)
def testFieldVerification(self):
    """Verify constraints on field ``i`` and check both report formats.

    Every scenario compares ``str()`` of the verification result, and the
    DataFrame built from it, with the expected values.
    """

    def report_frame(*columns):
        # Expected report frame; columns appear in the order given.
        return pd.DataFrame(OrderedDict(columns))

    clean = pd.DataFrame({
        'b': [True, False] * 2,
        'i': range(1, 5),
        'r': [float(x) for x in range(1, 5)],
        's': ['S%s' % x for x in range(1, 5)],
        'd': [datetime.datetime(2016, 1, x) for x in range(1, 5)]
    })

    satisfied = FieldConstraints('i', [
        TypeConstraint('int'),
        MinConstraint(0),
        MaxConstraint(10),
        SignConstraint('positive'),
        MaxNullsConstraint(0),
        NoDuplicatesConstraint(),
    ])
    violated = FieldConstraints('i', [
        TypeConstraint('bool'),
        MinConstraint(2),
        MaxConstraint(3),
        SignConstraint('negative'),
        MaxNullsConstraint(0),
        NoDuplicatesConstraint(),
    ])

    # Scenario 1: every constraint holds.
    dataset1 = DatasetConstraints([satisfied])
    verifier1 = pdc.PandasConstraintVerifier(clean)
    results1 = verify(dataset1, verifier1.verifiers())
    expected_text = ('FIELDS:\n\n'
                     'i: 0 failures 6 passes '
                     'type ✓ min ✓ max ✓ sign ✓ '
                     'max_nulls ✓ no_duplicates ✓\n\n'
                     'SUMMARY:\n\nPasses: 6\nFailures: 0')
    self.assertEqual(str(results1), expected_text)
    expected_frame = report_frame(
        ('field', ['i']),
        ('failures', [0]),
        ('passes', [6]),
        ('type', [True]),
        ('min', [True]),
        ('max', [True]),
        ('sign', [True]),
        ('max_nulls', [True]),
        ('no_duplicates', [True]),
    )
    actual_frame = pdc.verification_to_dataframe(results1)
    self.assertTrue(actual_frame.equals(expected_frame))

    # Scenario 2: every constraint is expected to fail.
    with_null = pd.DataFrame({'i': [1, 2, 2, 6, np.nan]})
    dataset2 = DatasetConstraints([violated])
    verifier2 = pdc.PandasConstraintVerifier(with_null)
    results2 = verify(dataset2, verifier2.verifiers())
    expected_text = ('FIELDS:\n\n'
                     'i: 6 failures 0 passes '
                     'type ✗ min ✗ max ✗ sign ✗ '
                     'max_nulls ✗ no_duplicates ✗\n\n'
                     'SUMMARY:\n\nPasses: 0\nFailures: 6')
    self.assertEqual(str(results2), expected_text)
    expected_frame = report_frame(
        ('field', ['i']),
        ('failures', [6]),
        ('passes', [0]),
        ('type', [False]),
        ('min', [False]),
        ('max', [False]),
        ('sign', [False]),
        ('max_nulls', [False]),
        ('no_duplicates', [False]),
    )
    actual_frame = pdc.verification_to_dataframe(results2)
    self.assertTrue(actual_frame.equals(expected_frame))

    # Scenario 3: a lone type constraint on the first frame.
    type_only = FieldConstraints('i', [TypeConstraint('int')])
    same_frame = clean
    dataset3 = DatasetConstraints([type_only])
    verifier3 = pdc.PandasConstraintVerifier(same_frame)
    results3 = verify(dataset3, verifier3.verifiers())
    expected_text = ('FIELDS:\n\n'
                     'i: 0 failures 1 pass type ✓\n\n'
                     'SUMMARY:\n\nPasses: 1\nFailures: 0')
    self.assertEqual(str(results3), expected_text)
    expected_frame = report_frame(
        ('field', ['i']),
        ('failures', [0]),
        ('passes', [1]),
        ('type', [True]),
    )
    actual_frame = pdc.verification_to_dataframe(results3)
    self.assertTrue(actual_frame.equals(expected_frame))