def detect(self, constraints, VerificationClass=Verification,
           outpath=None, write_all=False, per_constraint=False,
           output_fields=None, rownumber=False, in_place=False,
           **kwargs):
    """
    Apply verifiers to a set of constraints, for detection.

    This is a thin wrapper around the module-level ``verify`` function,
    called with ``detect=True``.  The column names and verifiers are
    taken from this object (``get_column_names()`` and ``verifiers()``),
    and ``self.write_detected_records`` is supplied as the
    ``detected_records_writer``.

    Inputs:

        *constraints*:
            The constraints to apply; passed through unchanged.

        *VerificationClass*:
            Passed through as ``VerificationClass``.

        *outpath*:
            Passed through as ``detect_outpath``.

        *write_all*:
            Passed through as ``detect_write_all``.

        *per_constraint*:
            Passed through as ``detect_per_constraint``.

        *output_fields*:
            Passed through as ``detect_output_fields``.

        *rownumber*:
            Passed through as ``detect_rownumber``.

        *in_place*:
            Passed through as ``detect_in_place``.

        *kwargs*:
            Any further keyword arguments, forwarded untouched.

    Returns:

        Whatever the module-level ``verify`` function returns.
    """
    column_names = self.get_column_names()
    field_verifiers = self.verifiers()
    # NB: 'verify' here is the module-level function, not a method.
    return verify(
        constraints,
        column_names,
        field_verifiers,
        VerificationClass=VerificationClass,
        detect=True,
        detect_outpath=outpath,
        detect_write_all=write_all,
        detect_per_constraint=per_constraint,
        detect_output_fields=output_fields,
        detect_rownumber=rownumber,
        detect_in_place=in_place,
        detected_records_writer=self.write_detected_records,
        **kwargs
    )
def verify(self, constraints, VerificationClass=Verification, **kwargs):
    """
    Apply verifiers to a set of constraints, for reporting.

    Delegates to the module-level ``verify`` function, supplying this
    object's column names (``get_column_names()``), its verifiers
    (``verifiers()``) and ``self.write_detected_records`` as the
    ``detected_records_writer``.

    Inputs:

        *constraints*:
            The constraints to apply; passed through unchanged.

        *VerificationClass*:
            Passed through as ``VerificationClass``.

        *kwargs*:
            Any further keyword arguments, forwarded untouched.

    Returns:

        Whatever the module-level ``verify`` function returns.
    """
    column_names = self.get_column_names()
    field_verifiers = self.verifiers()
    # Inside a method body the bare name 'verify' resolves to the
    # module-level function of the same name, not to this method.
    return verify(
        constraints,
        column_names,
        field_verifiers,
        VerificationClass=VerificationClass,
        detected_records_writer=self.write_detected_records,
        **kwargs
    )
def verify_df(df, constraints_path, epsilon=None, type_checking=None,
              **kwargs):
    """
    Verify that (i.e. check whether) the Pandas DataFrame provided
    satisfies the constraints in the JSON .tdda file provided.

    Mandatory Inputs:

        *df*:
            A Pandas DataFrame, to be checked.

        *constraints_path*:
            The path to a JSON .tdda file (possibly generated by the
            discover_constraints function) containing the constraints
            to be checked.

    Optional Inputs:

        *epsilon*:
            A tolerance used when checking minimum and maximum values
            for numeric fields.  It is a proportion of the constraint
            value by which the constraint can be exceeded without a
            constraint violation being issued.

            With the default value of epsilon
            (:py:const:`EPSILON_DEFAULT` = 0.01, i.e. 1%), values can
            be up to 1% larger than a max constraint without generating
            a constraint failure, and minimum values can be up to 1%
            smaller than the minimum constraint value without
            generating a constraint failure.  (These are modified, as
            appropriate, for negative values.)

            NOTE: Because the tolerance is proportionate, min/max
            values of zero have no tolerance at all, i.e. the wrong
            sign always generates a failure.

        *type_checking*:
            'strict' or 'sloppy'.  Pandas silently, routinely and
            automatically "promotes" integer and boolean columns to
            reals and objects respectively if they contain nulls, so
            strict type checking can be problematical in Pandas.  For
            this reason, type_checking defaults to 'sloppy', meaning
            that type changes that could plausibly be attributed to
            Pandas type promotion will not generate constraint
            failures.

            If this is set to 'strict', a Pandas "float" column c will
            only be allowed to satisfy an "int" type constraint if:

                `c.dropnulls().astype(int) == c.dropnulls()`

            Similarly, Object fields will satisfy a 'bool' constraint
            only if:

                `c.dropnulls().astype(bool) == c.dropnulls()`

        *report*:
            'all' or 'fields'.  (Not a named parameter of this
            function: it is supplied as a keyword argument and is
            forwarded, with any other keyword arguments, to the
            underlying verify call.)

            This controls the behaviour of the
            :py:meth:`~PandasVerification.__str__` method on the
            resulting :py:class:`~PandasVerification` object (but not
            its content).

            The default is 'all', which means that all fields are
            shown, together with the verification status of each
            constraint for that field.

            If report is set to 'fields', only fields for which at
            least one constraint failed are shown.

            NOTE: Two further settings are accepted to control (not
            yet implemented) behaviour.  'constraints' will be used to
            indicate that only failing constraints for failing fields
            should be shown.  'one_per_line' will indicate that each
            constraint failure should be reported on a separate line.

    Returns:

        :py:class:`~PandasVerification` object.

        This object has attributes:

        - *passes*      --- Number of passing constraints
        - *failures*    --- Number of failing constraints

        It also has a :py:meth:`~PandasVerification.to_frame()` method
        for converting the results of the verification to a Pandas
        DataFrame, and a :py:meth:`~PandasVerification.__str__` method
        to print both the detailed and summary results of the
        verification.

    Example usage::

        import numpy as np
        import pandas as pd
        from tdda.constraints.pdconstraints import verify_df

        df = pd.DataFrame({'a': [0, 1, 2, 10, np.nan],
                           'b': ['one', 'one', 'two', 'three', np.nan]})
        v = verify_df(df, 'example_constraints.tdda')

        print('Passes:', v.passes)
        print('Failures: %d\\n' % v.failures)
        print(str(v))
        print(v.to_frame())

    See *simple_verification.py* in the :ref:`constraint_examples`
    for a slightly fuller example.
    """
    # Build the verifier before loading the constraints file, so that
    # the two steps happen (and can fail) in the established order.
    verifier = PandasConstraintVerifier(
        df,
        epsilon=epsilon,
        type_checking=type_checking,
    )
    loaded_constraints = DatasetConstraints(loadpath=constraints_path)
    return verify(
        loaded_constraints,
        verifier.verifiers(),
        VerificationClass=PandasVerification,
        **kwargs
    )
def testFieldVerification(self):
    """
    Verify single-field constraint sets on column 'i' and check both
    the printed report and the DataFrame form of each result.

    Four scenarios are covered:

      1. six constraints that all pass on clean integer data;
      2. six constraints on data with a null and a duplicate, using
         default type checking (only the type constraint passes);
      3. the same as 2 but with type_checking='strict' (all fail);
      4. a lone type constraint, reported normally and with
         ascii=True.
    """
    # NOTE(review): the expected report strings below are reproduced
    # character-for-character from the existing test; confirm their
    # spacing against the report format if this test ever fails.

    def frame_of(*columns):
        # Expected verification frame; OrderedDict keeps column order.
        return pd.DataFrame(OrderedDict(columns))

    clean = pd.DataFrame({
        'b': [True, False] * 2,
        'i': range(1, 5),
        'r': [float(x) for x in range(1, 5)],
        's': ['S%s' % x for x in range(1, 5)],
        'd': [datetime.datetime(2016, 1, x) for x in range(1, 5)]
    })

    passing_constraints = FieldConstraints('i', [
        TypeConstraint('int'),
        MinConstraint(0),
        MaxConstraint(10),
        SignConstraint('positive'),
        MaxNullsConstraint(0),
        NoDuplicatesConstraint()
    ])
    failing_constraints = FieldConstraints('i', [
        TypeConstraint('bool'),
        MinConstraint(2),
        MaxConstraint(3),
        SignConstraint('negative'),
        MaxNullsConstraint(0),
        NoDuplicatesConstraint()
    ])

    # --- 1. everything passes -------------------------------------
    all_pass_set = DatasetConstraints([passing_constraints])
    clean_verifier = pdc.PandasConstraintVerifier(clean)
    all_pass = verify(all_pass_set, list(clean),
                      clean_verifier.verifiers())
    report = (
        'FIELDS:\n\n'
        'i: 0 failures 6 passes '
        'type ✓ min ✓ max ✓ sign ✓ '
        'max_nulls ✓ no_duplicates ✓\n\n'
        'SUMMARY:\n\nConstraints passing: 6\nConstraints failing: 0')
    self.assertEqual(str(all_pass), report)
    wanted = frame_of(
        ('field', ['i']),
        ('failures', [0]),
        ('passes', [6]),
        ('type', [True]),
        ('min', [True]),
        ('max', [True]),
        ('sign', [True]),
        ('max_nulls', [True]),
        ('no_duplicates', [True]),
    )
    actual = pdc.PandasVerification.verification_to_dataframe(all_pass)
    self.assertTrue(actual.equals(wanted))

    # --- 2. sloppy type checking ----------------------------------
    dirty = pd.DataFrame({'i': [1, 2, 2, 6, np.nan]})
    mostly_fail_set = DatasetConstraints([failing_constraints])
    sloppy_verifier = pdc.PandasConstraintVerifier(dirty)
    sloppy = verify(mostly_fail_set, list(dirty),
                    sloppy_verifier.verifiers())
    # expect the boolean->real type constraint to pass with sloppy types
    report = (
        'FIELDS:\n\n'
        'i: 5 failures 1 pass '
        'type ✓ min ✗ max ✗ sign ✗ '
        'max_nulls ✗ no_duplicates ✗\n\n'
        'SUMMARY:\n\nConstraints passing: 1\nConstraints failing: 5')
    self.assertEqual(str(sloppy), report)
    wanted = frame_of(
        ('field', ['i']),
        ('failures', [5]),
        ('passes', [1]),
        ('type', [True]),
        ('min', [False]),
        ('max', [False]),
        ('sign', [False]),
        ('max_nulls', [False]),
        ('no_duplicates', [False]),
    )
    actual = pdc.PandasVerification.verification_to_dataframe(sloppy)
    self.assertTrue(actual.equals(wanted))

    # --- 3. strict type checking ----------------------------------
    strict_verifier = pdc.PandasConstraintVerifier(
        dirty, type_checking='strict')
    strict = verify(mostly_fail_set, list(dirty),
                    strict_verifier.verifiers())
    # expect the boolean->real type constraint to fail with strict types
    report = (
        'FIELDS:\n\n'
        'i: 6 failures 0 passes '
        'type ✗ min ✗ max ✗ sign ✗ '
        'max_nulls ✗ no_duplicates ✗\n\n'
        'SUMMARY:\n\nConstraints passing: 0\nConstraints failing: 6')
    self.assertEqual(str(strict), report)
    wanted = frame_of(
        ('field', ['i']),
        ('failures', [6]),
        ('passes', [0]),
        ('type', [False]),
        ('min', [False]),
        ('max', [False]),
        ('sign', [False]),
        ('max_nulls', [False]),
        ('no_duplicates', [False]),
    )
    actual = pdc.PandasVerification.verification_to_dataframe(strict)
    self.assertTrue(actual.equals(wanted))

    # --- 4. a lone type constraint, unicode then ascii ------------
    type_only = FieldConstraints('i', [TypeConstraint('int')])
    type_only_set = DatasetConstraints([type_only])
    type_verifier = pdc.PandasConstraintVerifier(clean)
    type_result = verify(type_only_set, list(clean),
                         type_verifier.verifiers())
    report = (
        'FIELDS:\n\n'
        'i: 0 failures 1 pass type ✓\n\n'
        'SUMMARY:\n\nConstraints passing: 1\nConstraints failing: 0')
    self.assertEqual(str(type_result), report)
    wanted = frame_of(
        ('field', ['i']),
        ('failures', [0]),
        ('passes', [1]),
        ('type', [True]),
    )
    actual = pdc.PandasVerification.verification_to_dataframe(
        type_result)
    self.assertTrue(actual.equals(wanted))

    type_verifier = pdc.PandasConstraintVerifier(clean)
    type_result = verify(type_only_set, list(clean),
                         type_verifier.verifiers(), ascii=True)
    report = (
        'FIELDS:\n\n'
        'i: 0 failures 1 pass type OK\n\n'
        'SUMMARY:\n\nConstraints passing: 1\nConstraints failing: 0')
    self.assertEqual(str(type_result), report)
def testFieldVerification(self):
    """
    Verify single-field constraint sets on column 'i' and check both
    the printed report and the DataFrame form of each result.

    Three scenarios are covered:

      1. six constraints that all pass on clean integer data;
      2. six constraints that all fail on data containing a null and
         a duplicate;
      3. a lone type constraint that passes.
    """
    # NOTE(review): the expected report strings below are reproduced
    # character-for-character from the existing test; confirm their
    # spacing against the report format if this test ever fails.

    def check(results, report, columns):
        # Compare the printed report first, then the tabular form.
        self.assertEqual(str(results), report)
        wanted = pd.DataFrame(OrderedDict(columns))
        self.assertTrue(
            pdc.verification_to_dataframe(results).equals(wanted))

    clean = pd.DataFrame({
        'b': [True, False] * 2,
        'i': range(1, 5),
        'r': [float(x) for x in range(1, 5)],
        's': ['S%s' % x for x in range(1, 5)],
        'd': [datetime.datetime(2016, 1, x) for x in range(1, 5)]
    })

    passing_constraints = FieldConstraints('i', [
        TypeConstraint('int'),
        MinConstraint(0),
        MaxConstraint(10),
        SignConstraint('positive'),
        MaxNullsConstraint(0),
        NoDuplicatesConstraint()
    ])
    failing_constraints = FieldConstraints('i', [
        TypeConstraint('bool'),
        MinConstraint(2),
        MaxConstraint(3),
        SignConstraint('negative'),
        MaxNullsConstraint(0),
        NoDuplicatesConstraint()
    ])

    # --- 1. everything passes -------------------------------------
    all_pass_set = DatasetConstraints([passing_constraints])
    clean_verifier = pdc.PandasConstraintVerifier(clean)
    all_pass = verify(all_pass_set, clean_verifier.verifiers())
    check(
        all_pass,
        ('FIELDS:\n\n'
         'i: 0 failures 6 passes '
         'type ✓ min ✓ max ✓ sign ✓ '
         'max_nulls ✓ no_duplicates ✓\n\n'
         'SUMMARY:\n\nPasses: 6\nFailures: 0'),
        (
            ('field', ['i']),
            ('failures', [0]),
            ('passes', [6]),
            ('type', [True]),
            ('min', [True]),
            ('max', [True]),
            ('sign', [True]),
            ('max_nulls', [True]),
            ('no_duplicates', [True]),
        ))

    # --- 2. everything fails --------------------------------------
    dirty = pd.DataFrame({'i': [1, 2, 2, 6, np.nan]})
    all_fail_set = DatasetConstraints([failing_constraints])
    dirty_verifier = pdc.PandasConstraintVerifier(dirty)
    all_fail = verify(all_fail_set, dirty_verifier.verifiers())
    check(
        all_fail,
        ('FIELDS:\n\n'
         'i: 6 failures 0 passes '
         'type ✗ min ✗ max ✗ sign ✗ '
         'max_nulls ✗ no_duplicates ✗\n\n'
         'SUMMARY:\n\nPasses: 0\nFailures: 6'),
        (
            ('field', ['i']),
            ('failures', [6]),
            ('passes', [0]),
            ('type', [False]),
            ('min', [False]),
            ('max', [False]),
            ('sign', [False]),
            ('max_nulls', [False]),
            ('no_duplicates', [False]),
        ))

    # --- 3. a lone type constraint --------------------------------
    type_only = FieldConstraints('i', [TypeConstraint('int')])
    type_only_set = DatasetConstraints([type_only])
    type_verifier = pdc.PandasConstraintVerifier(clean)
    type_result = verify(type_only_set, type_verifier.verifiers())
    check(
        type_result,
        ('FIELDS:\n\n'
         'i: 0 failures 1 pass type ✓\n\n'
         'SUMMARY:\n\nPasses: 1\nFailures: 0'),
        (
            ('field', ['i']),
            ('failures', [0]),
            ('passes', [1]),
            ('type', [True]),
        ))