def test_verify_allowed_values_constraint(self):
    """
    Exercise allowed-values verification over string columns, both
    with and without nulls, against several allowed-value lists.
    """
    digits = list('1234567890')
    primes = list('2357')
    empties = ['', ' ', ' ', ' ']
    andor = ['and', 'or']

    # Columns whose names end in 'n' are variants containing nulls
    # (None and/or NaN); 'null' holds nothing but nulls.
    columns = {
        'digits': list('8275'),
        'primes1': primes,
        'primes2': list('3355'),
        'empties': empties,
        'eitheror': ['and', 'not', 'either', 'or'],
        'digitsn': list('827') + [None],
        'primes1n': primes[:-1] + [np.nan],
        'primes2n': [None, np.nan, '3', '5'],
        'emptiesn': [' ', None, ' ', ' '],
        'eitherorn': ['and', None, 'either', 'or'],
        'null': [None] * 4,
    }
    df = pd.DataFrame(columns)
    cvt = ConstraintVerificationTester(self, df)

    c_digits = AllowedValuesConstraint(digits)
    c_primes = AllowedValuesConstraint(primes)
    c_empties = AllowedValuesConstraint(empties)
    c_andor = AllowedValuesConstraint(andor)
    c_nothing = AllowedValuesConstraint([])

    # (column, constraint) pairs expected to verify successfully.
    satisfied = [
        ('digits', c_digits),
        ('digitsn', c_digits),
        ('primes1', c_primes),
        ('primes1n', c_primes),
        ('primes2', c_primes),
        ('primes2n', c_primes),
        ('primes1', c_digits),
        ('primes1n', c_digits),
        ('primes2', c_digits),
        ('primes2n', c_digits),
        ('empties', c_empties),
        ('emptiesn', c_empties),
    ]
    # (column, constraint) pairs expected to fail verification.
    violated = [
        ('eitheror', c_empties),
        ('eitherorn', c_empties),
        ('digits', c_primes),
        ('digitsn', c_primes),
        ('primes1', c_empties),
        ('primes1n', c_andor),
        ('primes2', c_andor),
        ('primes2n', c_empties),
        ('empties', c_primes),
        ('emptiesn', c_digits),
    ]

    for name, constraint in satisfied:
        cvt.verify_allowed_values_constraint(name, constraint).isTrue()
    for name, constraint in violated:
        cvt.verify_allowed_values_constraint(name, constraint).isFalse()

    # With no values allowed at all, only the all-null column is
    # expected to pass.
    for name in df:
        outcome = cvt.verify_allowed_values_constraint(name, c_nothing)
        if name == 'null':
            outcome.isTrue()
        else:
            outcome.isFalse()
def test_constraint_repr(self):
    """Check the repr() text of each kind of constraint object."""

    def expect(constraint, text):
        # Compare the constraint's repr() with the expected text.
        self.assertEqual(repr(constraint), text)

    expect(MinConstraint(7),
           'MinConstraint(value=7, precision=None)')
    expect(MinConstraint('a'),
           "MinConstraint(value='a', precision=None)")
    expect(MinConstraint('a', precision='closed'),
           "MinConstraint(value='a', precision='closed')")
    expect(MinLengthConstraint(3),
           "MinLengthConstraint(value=3)")
    expect(MaxConstraint(-3),
           'MaxConstraint(value=-3, precision=None)')
    expect(MaxConstraint('KJ'),
           "MaxConstraint(value='KJ', precision=None)")
    expect(MaxConstraint(4.2, precision='closed'),
           "MaxConstraint(value=4.2, precision='closed')")
    expect(MaxLengthConstraint(0),
           "MaxLengthConstraint(value=0)")
    expect(SignConstraint('positive'),
           "SignConstraint(value='positive')")
    expect(MaxNullsConstraint(0),
           "MaxNullsConstraint(value=0)")
    expect(NoDuplicatesConstraint(),
           "NoDuplicatesConstraint(value=True)")
    expect(TypeConstraint('int'),
           "TypeConstraint(value='int')")
    expect(TypeConstraint(['int', 'real']),
           "TypeConstraint(value=['int', 'real'])")
    expect(AllowedValuesConstraint(['a', 'b']),
           "AllowedValuesConstraint(value=['a', 'b'])")
def discover_field_constraints(self, fieldname):
    """
    Discover constraints for a single named field.

    Input:

        *fieldname*:
            name of the field (column) to examine.

    Returns:

        - a FieldConstraints object built from *fieldname* and every
          constraint that was found (a type constraint is always
          included).
        - ``None`` if the field's type is reported as ``'other'``.
    """
    kind = self.calc_tdda_type(fieldname)
    if kind == 'other':
        return None         # Unrecognized or complex
    type_c = TypeConstraint(kind)

    min_c = max_c = None
    min_len_c = max_len_c = None
    sign_c = no_dups_c = None
    max_nulls_c = allowed_c = None
    rex_c = None

    n_records = self.get_nrecords()
    if n_records > 0:
        # Things are not very interesting when there is no data
        n_null = self.calc_null_count(fieldname)
        n_present = self.calc_non_null_count(fieldname)
        assert n_null + n_present == n_records
        if n_null < 2:
            max_nulls_c = MaxNullsConstraint(n_null)

        is_string = kind == 'string'
        values = None
        n_distinct = -1     # won't equal number of non-nulls later on
        if is_string or kind == 'int':
            n_distinct = self.calc_nunique(fieldname)
        if is_string:
            if n_distinct <= MAX_CATEGORIES:
                values = self.calc_unique_values(fieldname,
                                                 include_nulls=False)
            if values:
                allowed_c = AllowedValuesConstraint(values)

        if n_present > 0 and is_string:
            # Strings get no min, max or sign constraints, but they
            # do get min and max length constraints.
            if values is None and n_distinct > 0:
                # Too many distinct values to have fetched them
                # earlier, but they are needed now.
                values = self.calc_unique_values(fieldname,
                                                 include_nulls=False)
            if values:
                if type(values[0]) is unicode_string:
                    lengths = [len(v) for v in values]
                else:
                    lengths = [len(v.decode('UTF-8')) for v in values]
                min_len_c = MinLengthConstraint(min(lengths))
                max_len_c = MaxLengthConstraint(max(lengths))
        elif n_present > 0:
            # Non-string fields all potentially get min and max values
            lo = self.calc_min(fieldname)
            hi = self.calc_max(fieldname)
            lo_missing = self.is_null(lo)
            if not lo_missing:
                min_c = MinConstraint(lo)
            if not self.is_null(hi):
                max_c = MaxConstraint(hi)

            # Non-date fields potentially get a sign constraint too.
            if kind != 'date':
                if min_c and max_c:
                    if lo == hi == 0:
                        sign_c = SignConstraint('zero')
                    elif lo > 0:
                        sign_c = SignConstraint('positive')
                    elif lo >= 0:
                        sign_c = SignConstraint('non-negative')
                    elif hi < 0:
                        sign_c = SignConstraint('negative')
                    elif hi <= 0:
                        sign_c = SignConstraint('non-positive')
                    # otherwise mixed signs: no sign constraint
                elif lo_missing:
                    sign_c = SignConstraint('null')

        if n_distinct == n_present and n_distinct > 1 and kind != 'real':
            no_dups_c = NoDuplicatesConstraint()

        # Regex discovery is handed the unique values gathered above
        # (possibly None), so it sits in the branch where they are bound.
        if is_string and self.inc_rex:
            rex_c = RexConstraint(
                self.find_rexes(fieldname, values=values))

    candidates = [
        type_c,
        min_c, max_c,
        min_len_c, max_len_c,
        sign_c,
        max_nulls_c,
        no_dups_c,
        allowed_c,
        rex_c,
    ]
    return FieldConstraints(fieldname,
                            [c for c in candidates if c is not None])
def discover_field_constraints(field):
    """
    Work out which constraints a single field (column) satisfies.

    Input:

        *field*:
            a single field (column; Series) object, usually from
            a Pandas DataFrame.

    Returns:

        - :py:class:`tdda.base.FieldConstraints` object, named after
          ``field.name``, holding every constraint that was found
          (a type constraint is always included).
        - ``None`` if the field's type is reported as ``'other'``.
    """
    kind = tdda_type(field)
    if kind == 'other':
        return None         # Unrecognized or complex
    type_c = TypeConstraint(kind)

    min_c = max_c = None
    min_len_c = max_len_c = None
    sign_c = no_dups_c = None
    max_nulls_c = allowed_c = None

    n_rows = len(field)
    if n_rows > 0:
        # Things are not very interesting when there is no data
        n_null = int(field.isnull().sum().astype(int))
        n_present = int(field.notnull().sum().astype(int))
        assert n_null + n_present == n_rows
        if n_null < 2:
            max_nulls_c = MaxNullsConstraint(n_null)

        is_string = kind == 'string'
        values = None
        n_distinct = -1     # won't equal number of non-nulls later on
        if is_string or kind == 'int':
            n_distinct = field.nunique()    # excludes NaN
        if is_string:
            if n_distinct <= MAX_CATEGORIES:
                values = list(field.dropna().unique())
            if values:
                allowed_c = AllowedValuesConstraint(values)

        if n_present > 0 and is_string:
            # Strings get no min, max or sign constraints, but they
            # do get min and max length constraints.
            if values is None and n_distinct > 0:
                # Too many distinct values to have fetched them
                # earlier, but they are needed now.
                values = list(field.dropna().unique())
            if values:
                lengths = [len(v) for v in values]
                min_len_c = MinLengthConstraint(min(lengths))
                max_len_c = MaxLengthConstraint(max(lengths))
        elif n_present > 0:
            # Non-string fields all potentially get min and max values
            if kind == 'date':
                lo = field.min()
                hi = field.max()
                if pd.notnull(lo):
                    lo = lo.to_pydatetime()
                if pd.notnull(hi):
                    hi = hi.to_pydatetime()
            else:
                lo = field.min().item()
                hi = field.max().item()
            if pd.notnull(lo):
                min_c = MinConstraint(lo)
            if pd.notnull(hi):
                max_c = MaxConstraint(hi)

            # Non-date fields potentially get a sign constraint too.
            if kind != 'date':
                if min_c and max_c:
                    if lo == hi == 0:
                        sign_c = SignConstraint('zero')
                    elif lo > 0:
                        sign_c = SignConstraint('positive')
                    elif lo >= 0:
                        sign_c = SignConstraint('non-negative')
                    elif hi < 0:
                        sign_c = SignConstraint('negative')
                    elif hi <= 0:
                        sign_c = SignConstraint('non-positive')
                    # otherwise mixed signs: no sign constraint
                elif pd.isnull(lo):
                    sign_c = SignConstraint('null')

        if n_distinct == n_present and n_distinct > 1 and kind != 'real':
            no_dups_c = NoDuplicatesConstraint()

    candidates = [
        type_c,
        min_c, max_c,
        min_len_c, max_len_c,
        sign_c,
        max_nulls_c,
        no_dups_c,
        allowed_c,
    ]
    return FieldConstraints(field.name,
                            [c for c in candidates if c is not None])