import pandas as pd

from tdda.constraints.base import DatasetConstraints


def vectorize_categorical_columns(X, constraints):
    """
    Vectorize string columns by first converting them to categorical
    dtype and then to one-hot encoding, dropping the original
    categorical columns from the output.

    :param X: input data, as a Pandas DataFrame
    :param constraints: path to a JSON file with TDDA constraints
    :return: data with one-hot columns
    """
    cons = DatasetConstraints(loadpath=constraints)
    n_cat_cols = 0
    n_cats = 0
    initial_shape = X.shape[1]

    for key, value in cons.to_dict()['fields'].items():
        if value['type'] == 'string' and 'allowed_values' in value:
            if len(value['allowed_values']) < 20:
                # track column/category counts for the shape check below
                n_cat_cols += 1
                n_cats += len(value['allowed_values'])

                X[key] = pd.Categorical(X[key],
                                        categories=value['allowed_values'])
                X = X.join(pd.get_dummies(X[key], prefix=key))
                X = X.drop(key, axis=1)

    expected_len = initial_shape + n_cats - n_cat_cols
    actual_len = X.shape[1]
    if actual_len != expected_len:
        raise ValueError(
            'Expected shape mismatch after vectorizing: {} != {}'.format(
                expected_len, actual_len))

    return X
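# Usage sketch (an illustration, not part of the original snippet):
# 'orders.tdda' is a hypothetical path to a TDDA constraints file assumed to
# define allowed_values for the low-cardinality string column 'status'.
demo = pd.DataFrame({'status': ['open', 'closed', 'open', 'pending'],
                     'amount': [10.0, 20.0, 15.0, 5.0]})
vectorized = vectorize_categorical_columns(demo, 'orders.tdda')
print(vectorized.columns.tolist())
# e.g. ['amount', 'status_closed', 'status_open', 'status_pending']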
Example #2
 def testload(self):
     path = os.path.join(TESTDATA_DIR, 'ddd.tdda')
     constraints = DatasetConstraints(loadpath=path)
     constraints.sort_fields()
     actual = constraints.to_json()
     with open(path) as f:
         expected = json.dumps(sort_constraint_dict(json.loads(f.read())),
                               indent=4) + '\n'
         self.assertEqual(actual, expected)
Example #3
def get_columns_format_violations(attribute_id, column_values):
    attribute_record = Attribute.objects.get(id=attribute_id)
    constraint_dict = json.loads(attribute_record.format_specification)
    if 'allowed_values' in constraint_dict['fields']['column'].keys():
        constraint_dict['fields']['column']['allowed_values'] = json.loads(
            constraint_dict['fields']['column']['allowed_values'])

    df = pd.DataFrame({'column': column_values})
    if constraint_dict['fields']['column']['type'] == 'int':
        # If even one value in the column is None, pandas converts the whole
        # column to np.float64 instead of np.int64, which causes problems.
        df = df[df['column'].notnull()]
        df = df.astype('int64')

    if constraint_dict['fields']['column']['type'] == 'real':
        # JavaScript has only one numeric type, so floating-point numbers may
        # look just like integers in the JSON; cast such values back to float.
        df[df['column'].apply(lambda x: type(x) == int)] = df[
            df['column'].apply(lambda x: type(x) == int)].astype('float64')

    pdv = PandasConstraintVerifier(df, epsilon=None, type_checking=None)

    # Debug output: show the constraint dictionary being verified.
    print(constraint_dict)

    constraints = DatasetConstraints()
    constraints.initialize_from_dict(constraint_dict)

    pdv.repair_field_types(constraints)
    detection = pdv.detect(constraints,
                           VerificationClass=PandasDetection,
                           outpath=None,
                           write_all=False,
                           per_constraint=False,
                           output_fields=None,
                           index=False,
                           in_place=False,
                           rownumber_is_index=True,
                           boolean_ints=False,
                           report='records')
    violation_df = detection.detected()

    if violation_df is None:
        return []
    else:
        violating_rows = [
            int(row_nb) for row_nb in list(violation_df.index.values)
        ]
        return violating_rows
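# Sketch (with assumptions): the same row-number extraction can be reproduced
# with the public tdda detect_df API, which accepts an in-memory constraints
# dictionary as well as a .tdda path. The constraint dict below is hypothetical.
import pandas as pd
from tdda.constraints import detect_df

constraint_dict = {'fields': {'column': {'type': 'int', 'max': 10}}}
frame = pd.DataFrame({'column': [1, 5, 42, 7]})
detection = detect_df(frame, constraint_dict)
violation_df = detection.detected()          # None when nothing fails
violating_rows = ([] if violation_df is None
                  else [int(i) for i in violation_df.index.values])
print(violating_rows)                        # row numbers that broke a constraint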
Example #4
    def testDetectDuplicates(self):
        iconstraints = FieldConstraints('i', [NoDuplicatesConstraint()])
        sconstraints = FieldConstraints('s', [NoDuplicatesConstraint()])
        constraints = DatasetConstraints([iconstraints, sconstraints])

        df1 = pd.DataFrame({
            'i': [1, 2, 3, 4, np.nan],
            's': ['one', 'two', 'three', 'four', np.nan]
        })
        verifier1 = pdc.PandasConstraintVerifier(df1)
        v1 = verifier1.detect(constraints,
                              VerificationClass=pdc.PandasDetection)
        self.assertEqual(v1.passes, 2)
        self.assertEqual(v1.failures, 0)
        ddf1 = v1.detected()
        self.assertIsNone(ddf1)

        df2 = pd.DataFrame({
            'i': [1, 2, 3, 2, np.nan],
            's': ['one', 'two', 'three', 'two', np.nan]
        })
        verifier2 = pdc.PandasConstraintVerifier(df2)
        v2 = verifier2.detect(constraints,
                              VerificationClass=pdc.PandasDetection,
                              per_constraint=True,
                              output_fields=['i', 's'])
        self.assertEqual(v2.passes, 0)
        self.assertEqual(v2.failures, 2)
        ddf2 = v2.detected()
        self.assertStringCorrect(ddf2.to_string(), 'detect_dups.df')
Example #5
 def testload(self):
     path = os.path.join(TESTDATA_DIR, 'ddd.tdda')
     constraints = DatasetConstraints(loadpath=path)
     fields = ['index', 'evennulls', 'oddnulls', 'evens', 'odds',
               'evenreals', 'oddreals', 'evenstr', 'oddstr',
               'elevens', 'greek', 'binnedindex', 'binnedodds',
               'basedate', 'evendates']
     constraints.sort_fields(fields)
     self.assertStringCorrect(constraints.to_json(), 'ddd.tdda',
                              rstrip=True,
                              ignore_substrings=['"as_at":',
                                                 '"local_time":',
                                                 '"utc_time":',
                                                 '"creator":',
                                                 '"host":',
                                                 '"user":'******'"tddafile":'])
Example #6
 def testload(self):
     path = os.path.join(TESTDATA_DIR, 'ddd.tdda')
     constraints = DatasetConstraints(loadpath=path)
     fields = [
         'index', 'evennulls', 'oddnulls', 'evens', 'odds', 'evenreals',
         'oddreals', 'evenstr', 'oddstr', 'elevens', 'greek', 'binnedindex',
         'binnedodds', 'basedate', 'evendates'
     ]
     constraints.sort_fields(fields)
     self.assertStringCorrect(constraints.to_json(),
                              'ddd.tdda',
                              rstrip=True,
                              ignore_substrings=[
                                  '"as_at":', '"local_time":',
                                  '"utc_time":', '"creator":', '"host":',
                                  '"user":'******'"tddafile":'
                              ])
Example #7
 def discover(self):
     field_constraints = []
     for col in self.get_column_names():
         constraints = self.discover_field_constraints(col)
         if constraints:
             field_constraints.append(constraints)
     if field_constraints:
         return DatasetConstraints(field_constraints)
     else:
         return None
Example #8
def detect_df(df,
              constraints_path,
              epsilon=None,
              type_checking=None,
              outpath=None,
              write_all=False,
              per_constraint=False,
              output_fields=None,
              index=False,
              in_place=False,
              rownumber_is_index=True,
              boolean_ints=False,
              repair=True,
              report='records',
              **kwargs):
    """
    Check the records from the Pandas DataFrame provided, to detect
    records that fail any of the constraints in the JSON ``.tdda`` file
    provided. This is anomaly detection.

    Mandatory Inputs:

        *df*:
                            A Pandas DataFrame, to be checked.

        *constraints_path*:
                            The path to a JSON ``.tdda`` file (possibly
                            generated by the discover_df function, below)
                            containing constraints to be checked.
                            Or, alternatively, an in-memory dictionary
                            containing the structured contents of a ``.tdda``
                            file.

    Optional Inputs:

        *epsilon*:
                            When checking minimum and maximum values
                            for numeric fields, this provides a
                            tolerance. The tolerance is a proportion
                            of the constraint value by which the
                            constraint can be exceeded without causing
                            a constraint violation to be issued.

                            For example, with epsilon set to 0.01 (i.e. 1%),
                            values can be up to 1% larger than a max constraint
                            without generating constraint failure,
                            and minimum values can be up to 1% smaller
                            than the minimum constraint value without
                            generating a constraint failure. (These
                            are modified, as appropriate, for negative
                            values.)

                            If not specified, an *epsilon* of 0 is used,
                            so there is no tolerance.


                            NOTE: A consequence of the fact that these
                            are proportionate is that min/max values
                            of zero do not have any tolerance, i.e.
                            the wrong sign always generates a failure.

        *type_checking*:
                            ``strict`` or ``sloppy``.
                            Because Pandas silently, routinely and
                            automatically "promotes" integer and boolean
                            columns to reals and objects respectively
                            if they contain nulls, strict type checking
                            can be problematical in Pandas. For this reason,
                            ``type_checking`` defaults to ``sloppy``, meaning
                            that type changes that could plausibly be
                            attributed to Pandas type promotion will not
                            generate constraint violations.

                            If this is set to strict, a Pandas ``float``
                            column ``c`` will only be allowed to satisfy
                            an ``int`` type constraint if::

                                c.dropnulls().astype(int) == c.dropnulls()

                            Similarly, Object fields will satisfy a
                            ``bool`` constraint only if::

                                c.dropnulls().astype(bool) == c.dropnulls()

        *outpath*:
                            This specifies that the verification process
                            should detect records that violate any constraints,
                            and write them out to this CSV (or feather) file.

                            By default, only failing records are written out
                            to file, but this can be overridden with the
                            ``write_all`` parameter.

                            By default, the columns in the detection output
                            file will be a boolean ``ok`` field for each
                            constraint on each field, and an ``n_failures``
                            field containing the total number of constraints
                            that failed for each row.  This behaviour can be
                            overridden with the ``per_constraint``,
                            ``output_fields`` and ``index`` parameters.

        *write_all*:
                            Include passing records in the detection output
                            file when detecting.

        *per_constraint*:
                            Write one column per failing constraint, as well
                            as the ``n_failures`` total.

        *output_fields*:
                            Specify original columns to write out when detecting.

                            If passed in as an empty list (rather than None),
                            all original columns will be included.

        *index*:
                            Boolean to specify whether to include a row-number
                            index in the output file when detecting.

                            This is automatically enabled if no output field
                            names are specified.

                            Rows are numbered from 0.

        *in_place*:
                            Detect failing constraints by adding columns to
                            the input DataFrame.

                            If ``outpath`` is also specified, then
                            failing records will also be written to file.

        *rownumber_is_index*:
                            ``False`` if the DataFrame originated from a CSV
                            file (and therefore any detection output file
                            should refer to row numbers from the file, rather
                            than items from the DataFrame index).

        *boolean_ints*:
                            If ``True``, write out all boolean values to
                            CSV file as integers (1 for true, and 0 for
                            false), rather than as ``true`` and ``false``
                            values.

        *repair*:
                            A boolean to specify whether to try to use the
                            information in the constraints to attempt to
                            repair potentially-incorrect type inferences
                            made when constructing the dataframe. When the
                            dataframe has been loaded from a .csv file, this
                            can often be useful (but should not be used with
                            dataframes that have come from a more reliable
                            source).

    The *report* parameter from :py:func:`verify_df` can also be
    used, in which case a verification report will also be produced in
    addition to the detection results.

    Returns:

        :py:class:`~tdda.constraints.pd.constraints.PandasDetection` object.

        This object has a :py:meth:`~PandasDetection.detected()` method
        for obtaining the Pandas DataFrame containing the detection
        results.

    Example usage::

        import numpy as np
        import pandas as pd
        from tdda.constraints import detect_df

        df = pd.DataFrame({'a': [0, 1, 2, 10, np.nan],
                           'b': ['one', 'one', 'two', 'three', np.nan]})
        v = detect_df(df, 'example_constraints.tdda')
        detection_df = v.detected()
        print(detection_df.to_string())

    """
    pdv = PandasConstraintVerifier(df,
                                   epsilon=epsilon,
                                   type_checking=type_checking)
    if isinstance(constraints_path, dict):
        constraints = DatasetConstraints()
        constraints.initialize_from_dict(native_definite(constraints_path))
    else:
        constraints = DatasetConstraints(loadpath=constraints_path)
    if repair:
        pdv.repair_field_types(constraints)
    return pdv.detect(constraints,
                      VerificationClass=PandasDetection,
                      outpath=outpath,
                      write_all=write_all,
                      per_constraint=per_constraint,
                      output_fields=output_fields,
                      index=index,
                      in_place=in_place,
                      rownumber_is_index=rownumber_is_index,
                      boolean_ints=boolean_ints,
                      report=report,
                      **kwargs)
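# Variant sketch of the docstring example above: constraints supplied as an
# in-memory dictionary (supported per the constraints_path description), with
# one output column per constraint and the original field included.
import numpy as np
import pandas as pd

constraints_dict = {'fields': {'a': {'type': 'int', 'min': 0, 'max': 5}}}
df_demo = pd.DataFrame({'a': [0, 1, 2, 10, np.nan]})
v_demo = detect_df(df_demo, constraints_dict,
                   per_constraint=True, output_fields=['a'])
detected = v_demo.detected()
if detected is not None:
    print(detected.to_string())   # expect the row with a == 10 to be flagged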
Example #9
def verify_df(df,
              constraints_path,
              epsilon=None,
              type_checking=None,
              repair=True,
              report='all',
              **kwargs):
    """
    Verify that (i.e. check whether) the Pandas DataFrame provided
    satisfies the constraints in the JSON ``.tdda`` file provided.

    Mandatory Inputs:

        *df*:
                            A Pandas DataFrame, to be checked.

        *constraints_path*:
                            The path to a JSON ``.tdda`` file (possibly
                            generated by the discover_df function, below)
                            containing constraints to be checked.
                            Or, alternatively, an in-memory dictionary
                            containing the structured contents of a ``.tdda``
                            file.

    Optional Inputs:

        *epsilon*:
                            When checking minimum and maximum values
                            for numeric fields, this provides a
                            tolerance. The tolerance is a proportion
                            of the constraint value by which the
                            constraint can be exceeded without causing
                            a constraint violation to be issued.

                            For example, with epsilon set to 0.01 (i.e. 1%),
                            values can be up to 1% larger than a max constraint
                            without generating constraint failure,
                            and minimum values can be up to 1% smaller
                            than the minimum constraint value without
                            generating a constraint failure. (These
                            are modified, as appropriate, for negative
                            values.)

                            If not specified, an *epsilon* of 0 is used,
                            so there is no tolerance.


                            NOTE: A consequence of the fact that these
                            are proportionate is that min/max values
                            of zero do not have any tolerance, i.e.
                            the wrong sign always generates a failure.

        *type_checking*:
                            ``strict`` or ``sloppy``.
                            Because Pandas silently, routinely and
                            automatically "promotes" integer and boolean
                            columns to reals and objects respectively
                            if they contain nulls, strict type checking
                            can be problematical in Pandas. For this reason,
                            ``type_checking`` defaults to ``sloppy``, meaning
                            that type changes that could plausibly be
                            attributed to Pandas type promotion will not
                            generate constraint violations.

                            If this is set to strict, a Pandas ``float``
                            column ``c`` will only be allowed to satisfy
                            an ``int`` type constraint if::

                                c.dropnulls().astype(int) == c.dropnulls()

                            Similarly, Object fields will satisfy a
                            ``bool`` constraint only if::

                                c.dropnulls().astype(bool) == c.dropnulls()

        *repair*:
                            A boolean to specify whether to try to use the
                            information in the constraints to attempt to
                            repair potentially-incorrect type inferences
                            made when constructing the dataframe. When the
                            dataframe has been loaded from a .csv file, this
                            can often be useful (but should not be used with
                            dataframes that have come from a more reliable
                            source).

        *report*:
                            ``all`` or ``fields``.
                            This controls the behaviour of the
                            :py:meth:`~tdda.constraints.pd.constraints.PandasVerification.__str__` method on
                            the resulting :py:class:`~tdda.constraints.pd.constraints.PandasVerification`
                            object (but not its content).

                            The default is ``all``, which means that
                            all fields are shown, together with the
                            verification status of each constraint
                            for that field.

                            If report is set to ``fields``, only fields for
                            which at least one constraint failed are shown.

    Returns:

        :py:class:`~tdda.constraints.pd.constraints.PandasVerification` object.

        This object has attributes:

        - *passes*      --- Number of passing constraints
        - *failures*    --- Number of failing constraints

        It also has a :py:meth:`~tdda.constraints.pd.constraints.PandasVerification.to_frame()` method for
        converting the results of the verification to a Pandas DataFrame,
        and a :py:meth:`~tdda.constraints.pd.constraints.PandasVerification.__str__` method to print
        both the detailed and summary results of the verification.

    Example usage::

        import numpy as np
        import pandas as pd
        from tdda.constraints import verify_df

        df = pd.DataFrame({'a': [0, 1, 2, 10, np.nan],
                           'b': ['one', 'one', 'two', 'three', np.nan]})
        v = verify_df(df, 'example_constraints.tdda')

        print('Constraints passing: %d\\n' % v.passes)
        print('Constraints failing: %d\\n' % v.failures)
        print(str(v))
        print(v.to_frame())

    See *simple_verification.py* in the :ref:`constraint_examples`
    for a slightly fuller example.

    """
    pdv = PandasConstraintVerifier(df,
                                   epsilon=epsilon,
                                   type_checking=type_checking)
    if isinstance(constraints_path, dict):
        constraints = DatasetConstraints()
        constraints.initialize_from_dict(native_definite(constraints_path))
    else:
        constraints = DatasetConstraints(loadpath=constraints_path)
    if repair:
        pdv.repair_field_types(constraints)
    return pdv.verify(constraints,
                      VerificationClass=PandasVerification,
                      report=report,
                      **kwargs)
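# Sketch of the two knobs described above, with illustrative values: epsilon
# gives a 1% tolerance on min/max, and report='fields' trims the report to
# fields that have at least one failing constraint.
import pandas as pd

# Hypothetical constraints: 'x' is real with max 100. With epsilon=0.01,
# values up to 101 still satisfy the max constraint.
tolerant_constraints = {'fields': {'x': {'type': 'real', 'max': 100}}}
df_x = pd.DataFrame({'x': [99.0, 100.5]})
v_x = verify_df(df_x, tolerant_constraints, epsilon=0.01, report='fields')
print(v_x.passes, v_x.failures)   # the max constraint should pass under tolerance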
Example #10
def discover_constraints(df):
    """
    Automatically discover potentially useful constraints that characterize
    the Pandas DataFrame provided.

    Input:

        *df*:
            any Pandas DataFrame.

    Possible return values:

       -  :py:class:`~tdda.constraints.base.DatasetConstraints` object
       -  ``None``    --- (if no constraints were found).

    This function goes through each column in the DataFrame and, where
    appropriate, generates constraints that describe (and are satisfied
    by) this dataframe.

    Assuming it generates at least one constraint for at least one field
    it returns a :py:class:`tdda.constraints.base.DatasetConstraints` object.

    This includes a 'fields' attribute, keyed on the column name.

    The returned :py:class:`~tdda.constraints.base.DatasetConstraints` object
    includes a :py:meth:`~tdda.constraints.base.DatasetConstraints.to_json`
    method, which converts the constraints into JSON for saving as a tdda
    constraints file. By convention, such JSON files use a '.tdda'
    extension.

    The JSON constraints file can be used to check whether other datasets
    also satisfy the constraints.

    The kinds of constraints (potentially) generated for each field (column)
    are:

        *type*:
                the (coarse, TDDA) type of the field. One of
                'bool', 'int', 'real', 'string' or 'date'.


        *min*:
                for non-string fields, the minimum value in the column.
                Not generated for all-null columns.

        *max*:
                for non-string fields, the maximum value in the column.
                Not generated for all-null columns.

        *min_length*:
                For string fields, the length of the shortest string(s)
                in the field. N.B. In Python3, this is, of course,
                a unicode string length; in Python2, it is an encoded
                string length, which may be less meaningful.

        *max_length*:
                For string fields, the length of the longest string(s)
                in the field.  N.B. In Python3, this is, of course,
                a unicode string length; in Python2, it is an encoded
                string length, which may be less meaningful.

        *sign*:
                If all the values in a numeric field have consistent sign,
                a sign constraint will be written with a value chosen from:

                    - positive     --- For all values *v* in field: `v > 0`
                    - non-negative --- For all values *v* in field: `v >= 0`
                    - zero         --- For all values *v* in field: `v == 0`
                    - non-positive --- For all values *v* in field: `v <= 0`
                    - negative     --- For all values *v* in field: `v < 0`
                    - null         --- For all values *v* in field: `v is null`

        *max_nulls*:
                The maximum number of nulls allowed in the field.

                    - If the field has no nulls, a constraint
                      will be written with max_nulls set to zero.
                    - If the field has a single null, a constraint will
                      be written with max_nulls set to one.
                    - If the field has more than 1 null, no constraint
                      will be generated.

        *no_duplicates*:
                For string fields (only, for now), if every
                non-null value in the field is different,
                this constraint will be generated (with value ``True``);
                otherwise no constraint will be generated. So this constraint
                indicates that all the **non-null** values in a string
                field are distinct (unique).

        *allowed_values*:
                 For string fields only, if there are
                 :py:const:`MAX_CATEGORIES` or fewer distinct string
                 values in the dataframe, an AllowedValues constraint
                 listing them will be generated.
                 :py:const:`MAX_CATEGORIES` is currently "hard-wired" to 20.

    Example usage::

        import numpy as np
        import pandas as pd
        from tdda.constraints.pdconstraints import discover_constraints

        df = pd.DataFrame({'a': [1, 2, 3], 'b': ['one', 'two', np.nan]})
        constraints = discover_constraints(df)
        with open('example_constraints.tdda', 'w') as f:
            f.write(constraints.to_json())

    See *simple_generation.py* in the :ref:`constraint_examples`
    for a slightly fuller example.

    """
    field_constraints = []
    for col in df:
        constraints = discover_field_constraints(df[col])
        if constraints:
            field_constraints.append(constraints)
    if field_constraints:
        return DatasetConstraints(field_constraints)
    else:
        return None
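# Round-trip sketch (mirrors the docstring example; verify_df is assumed to be
# importable from the same module): discover constraints on one dataframe,
# save them, then verify a second dataframe against the saved file.
import pandas as pd
from tdda.constraints.pdconstraints import verify_df

df_ref = pd.DataFrame({'a': [1, 2, 3]})
discovered = discover_constraints(df_ref)
with open('example_constraints.tdda', 'w') as f:
    f.write(discovered.to_json())

df_new = pd.DataFrame({'a': [2, 3, 7]})   # 7 exceeds the discovered max of 3
v_rt = verify_df(df_new, 'example_constraints.tdda')
print(v_rt.passes, v_rt.failures)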
Example #11
def verify_df(df, constraints_path, epsilon=None, type_checking=None,
              **kwargs):
    """
    Verify that (i.e. check whether) the Pandas DataFrame provided
    satisfies the constraints in the JSON .tdda file provided.

    Mandatory Inputs:

        *df*:
                            A Pandas DataFrame, to be checked.

        *constraints_path*:
                            The path to a JSON .tdda file (possibly
                            generated by the discover_constraints
                            function, below) containing constraints
                            to be checked.

    Optional Inputs:

        *epsilon*:
                            When checking minimum and maximum values
                            for numeric fields, this provides a
                            tolerance. The tolerance is a proportion
                            of the constraint value by which the
                            constraint can be exceeded without causing
                            a constraint violation to be issued.
                            With the default value of epsilon
                            (:py:const:`EPSILON_DEFAULT` = 0.01, i.e. 1%),
                            values can be up to 1% larger than a max constraint
                            without generating constraint failure,
                            and minimum values can be up to 1% smaller
                            than the minimum constraint value without
                            generating a constraint failure. (These
                            are modified, as appropriate, for negative
                            values.)

                            NOTE: A consequence of the fact that these
                            are proportionate is that min/max values
                            of zero do not have any tolerance, i.e.
                            the wrong sign always generates a failure.

        *type_checking*:
                            'strict' or 'sloppy'.
                            Because Pandas silently, routinely and
                            automatically "promotes" integer and boolean
                            columns to reals and objects respectively
                            if they contain nulls, strict type checking
                            can be problematical in Pandas. For this reason,
                            type_checking defaults to 'sloppy', meaning
                            that type changes that could plausibly be
                            attributed to Pandas type promotion will not
                            generate constraint violations.

                            If this is set to strict, a Pandas "float"
                            column c will only be allowed to satisfy
                            an "int" type constraint if:

                                `c.dropnulls().astype(int) == c.dropnulls()`

                            Similarly, Object fields will satisfy a
                            'bool' constraint only if:

                                `c.dropnulls().astype(bool) == c.dropnulls()`

        *report*:
                            'all' or 'fields'.
                            This controls the behaviour of the
                            :py:meth:`~PandasVerification.__str__` method on
                            the resulting :py:class:`~PandasVerification`
                            object (but not its content).

                            The default is 'all', which means that
                            all fields are shown, together with the
                            verification status of each constraint
                            for that field.

                            If report is set to 'fields', only fields for
                            which at least one constraint failed are shown.

                            NOTE: The method also accepts two further
                            parameters to control (not yet implemented)
                            behaviour. 'constraints' will be used to
                            indicate that only failing constraints for
                            failing fields should be shown.
                            'one_per_line' will indicate that each constraint
                            failure should be reported on a separate line.

    Returns:

        :py:class:`~PandasVerification` object.

        This object has attributes:

            - *passes*      --- Number of passing constraints
            - *failures*    --- Number of failing constraints

        It also has a :py:meth:`~PandasVerification.to_frame()` method for
        converting the results of the verification to a Pandas DataFrame,
        and a :py:meth:`~PandasVerification.__str__` method to print
        both the detailed and summary results of the verification.

    Example usage::

        import numpy as np
        import pandas as pd
        from tdda.constraints.pdconstraints import verify_df

        df = pd.DataFrame({'a': [0, 1, 2, 10, np.nan],
                           'b': ['one', 'one', 'two', 'three', np.nan]})
        v = verify_df(df, 'example_constraints.tdda')

        print('Passes:', v.passes)
        print('Failures: %d\\n' % v.failures)
        print(str(v))
        print(v.to_frame())

    See *simple_verification.py* in the :ref:`constraint_examples`
    for a slightly fuller example.

    """
    pdv = PandasConstraintVerifier(df, epsilon=epsilon,
                                   type_checking=type_checking)
    constraints = DatasetConstraints(loadpath=constraints_path)
    return verify(constraints, pdv.verifiers(),
                  VerificationClass=PandasVerification, **kwargs)
Example #12
def detect_df(df, constraints_path, epsilon=None, type_checking=None,
              outpath=None, write_all=False, per_constraint=False,
              output_fields=None, index=False, in_place=False,
              rownumber_is_index=True, boolean_ints=False,
              repair=True, report='records',
              **kwargs):
    """
    Check the records from the Pandas DataFrame provided, to detect
    records that fail any of the constraints in the JSON ``.tdda`` file
    provided. This is anomaly detection.

    Mandatory Inputs:

        *df*:
                            A Pandas DataFrame, to be checked.

        *constraints_path*:
                            The path to a JSON ``.tdda`` file (possibly
                            generated by the discover_df function, below)
                            containing constraints to be checked.
                            Or, alternatively, an in-memory dictionary
                            containing the structured contents of a ``.tdda``
                            file.

    Optional Inputs:

        *epsilon*:
                            When checking minimum and maximum values
                            for numeric fields, this provides a
                            tolerance. The tolerance is a proportion
                            of the constraint value by which the
                            constraint can be exceeded without causing
                            a constraint violation to be issued.

                            For example, with epsilon set to 0.01 (i.e. 1%),
                            values can be up to 1% larger than a max constraint
                            without generating constraint failure,
                            and minimum values can be up to 1% smaller
                            than the minimum constraint value without
                            generating a constraint failure. (These
                            are modified, as appropriate, for negative
                            values.)

                            If not specified, an *epsilon* of 0 is used,
                            so there is no tolerance.


                            NOTE: A consequence of the fact that these
                            are proportionate is that min/max values
                            of zero do not have any tolerance, i.e.
                            the wrong sign always generates a failure.

        *type_checking*:
                            ``strict`` or ``sloppy``.
                            Because Pandas silently, routinely and
                            automatically "promotes" integer and boolean
                            columns to reals and objects respectively
                            if they contain nulls, strict type checking
                            can be problematical in Pandas. For this reason,
                            ``type_checking`` defaults to ``sloppy``, meaning
                            that type changes that could plausibly be
                            attributed to Pandas type promotion will not
                            generate constraint violations.

                            If this is set to strict, a Pandas ``float``
                            column ``c`` will only be allowed to satisfy
                            an ``int`` type constraint if::

                                c.dropnulls().astype(int) == c.dropnulls()

                            Similarly, Object fields will satisfy a
                            ``bool`` constraint only if::

                                c.dropnulls().astype(bool) == c.dropnulls()

        *outpath*:
                            This specifies that the verification process
                            should detect records that violate any constraints,
                            and write them out to this CSV (or feather) file.

                            By default, only failing records are written out
                            to file, but this can be overridden with the
                            ``write_all`` parameter.

                            By default, the columns in the detection output
                            file will be a boolean ``ok`` field for each
                            constraint on each field, and an ``n_failures``
                            field containing the total number of constraints
                            that failed for each row.  This behaviour can be
                            overridden with the ``per_constraint``,
                            ``output_fields`` and ``index`` parameters.

        *write_all*:
                            Include passing records in the detection output
                            file when detecting.

        *per_constraint*:
                            Write one column per failing constraint, as well
                            as the ``n_failures`` total.

        *output_fields*:
                            Specify original columns to write out when detecting.

                            If passed in as an empty list (rather than None),
                            all original columns will be included.

        *index*:
                            Boolean to specify whether to include a row-number
                            index in the output file when detecting.

                            This is automatically enabled if no output field
                            names are specified.

                            Rows are numbered from 0.

        *in_place*:
                            Detect failing constraints by adding columns to
                            the input DataFrame.

                            If ``outpath`` is also specified, then
                            failing records will also be written to file.

        *rownumber_is_index*:
                            ``False`` if the DataFrame originated from a CSV
                            file (and therefore any detection output file
                            should refer to row numbers from the file, rather
                            than items from the DataFrame index).

        *boolean_ints*:
                            If ``True``, write out all boolean values to
                            CSV file as integers (1 for true, and 0 for
                            false), rather than as ``true`` and ``false``
                            values.

        *repair*:
                            A boolean to specify whether to try to use the
                            information in the constraints to attempt to
                            repair potentially-incorrect type inferences
                            made when constructing the dataframe. When the
                            dataframe has been loaded from a .csv file, this
                            can often be useful (but should not be used with
                            dataframes that have come from a more reliable
                            source).

    The *report* parameter from :py:func:`verify_df` can also be
    used, in which case a verification report will also be produced in
    addition to the detection results.

    Returns:

        :py:class:`~tdda.constraints.pd.constraints.PandasDetection` object.

        This object has a :py:meth:`~PandasDetection.detected()` method
        for obtaining the Pandas DataFrame containing the detection
        results.

    Example usage::

        import numpy as np
        import pandas as pd
        from tdda.constraints import detect_df

        df = pd.DataFrame({'a': [0, 1, 2, 10, np.nan],
                           'b': ['one', 'one', 'two', 'three', np.nan]})
        v = detect_df(df, 'example_constraints.tdda')
        detection_df = v.detected()
        print(detection_df.to_string())

    """
    pdv = PandasConstraintVerifier(df, epsilon=epsilon,
                                   type_checking=type_checking)
    if isinstance(constraints_path, dict):
        constraints = DatasetConstraints()
        constraints.initialize_from_dict(native_definite(constraints_path))
    else:
        constraints = DatasetConstraints(loadpath=constraints_path)
    if repair:
        pdv.repair_field_types(constraints)
    return pdv.detect(constraints, VerificationClass=PandasDetection,
                      outpath=outpath, write_all=write_all,
                      per_constraint=per_constraint,
                      output_fields=output_fields, index=index,
                      in_place=in_place,
                      rownumber_is_index=rownumber_is_index,
                      boolean_ints=boolean_ints,
                      report=report, **kwargs)
Example #13
 def testload(self):
     path = os.path.join(TESTDATA_DIR, 'ddd.tdda')
     constraints = DatasetConstraints(loadpath=path)
Example #14
    def testFieldVerification(self):
        df1 = pd.DataFrame({
            'b': [True, False] * 2,
            'i': range(1, 5),
            'r': [float(x) for x in range(1, 5)],
            's': ['S%s' % x for x in range(1, 5)],
            'd': [datetime.datetime(2016, 1, x) for x in range(1, 5)]
        })
        ic1 = FieldConstraints('i', [
            TypeConstraint('int'),
            MinConstraint(0),
            MaxConstraint(10),
            SignConstraint('positive'),
            MaxNullsConstraint(0),
            NoDuplicatesConstraint()
        ])

        ic2 = FieldConstraints('i', [
            TypeConstraint('bool'),
            MinConstraint(2),
            MaxConstraint(3),
            SignConstraint('negative'),
            MaxNullsConstraint(0),
            NoDuplicatesConstraint()
        ])

        dfc1 = [ic1]
        dsc1 = DatasetConstraints(dfc1)
        pdcv1 = pdc.PandasConstraintVerifier(df1)
        results1 = verify(dsc1, list(df1), pdcv1.verifiers())
        expected = (
            'FIELDS:\n\n'
            'i: 0 failures  6 passes  '
            'type ✓  min ✓  max ✓  sign ✓  '
            'max_nulls ✓  no_duplicates ✓\n\n'
            'SUMMARY:\n\nConstraints passing: 6\nConstraints failing: 0')
        self.assertEqual(str(results1), expected)
        expected = pd.DataFrame(
            OrderedDict((
                ('field', ['i']),
                ('failures', [0]),
                ('passes', [6]),
                ('type', [True]),
                ('min', [True]),
                ('max', [True]),
                ('sign', [True]),
                ('max_nulls', [True]),
                ('no_duplicates', [True]),
            )))
        vdf = pdc.PandasVerification.verification_to_dataframe(results1)
        self.assertTrue(vdf.equals(expected))

        df2 = pd.DataFrame({'i': [1, 2, 2, 6, np.nan]})
        dfc2 = [ic2]
        dsc2 = DatasetConstraints(dfc2)
        pdcv2 = pdc.PandasConstraintVerifier(df2)
        results2 = verify(dsc2, list(df2), pdcv2.verifiers())
        # expect the boolean->real type constraint to pass with sloppy types
        expected = (
            'FIELDS:\n\n'
            'i: 5 failures  1 pass  '
            'type ✓  min ✗  max ✗  sign ✗  '
            'max_nulls ✗  no_duplicates ✗\n\n'
            'SUMMARY:\n\nConstraints passing: 1\nConstraints failing: 5')
        self.assertEqual(str(results2), expected)
        expected = pd.DataFrame(
            OrderedDict((
                ('field', ['i']),
                ('failures', [5]),
                ('passes', [1]),
                ('type', [True]),
                ('min', [False]),
                ('max', [False]),
                ('sign', [False]),
                ('max_nulls', [False]),
                ('no_duplicates', [False]),
            )))
        vdf = pdc.PandasVerification.verification_to_dataframe(results2)
        self.assertTrue(vdf.equals(expected))

        pdcv2strict = pdc.PandasConstraintVerifier(df2, type_checking='strict')
        results2strict = verify(dsc2, list(df2), pdcv2strict.verifiers())
        # expect the boolean->real type constraint to fail with strict types
        expected = (
            'FIELDS:\n\n'
            'i: 6 failures  0 passes  '
            'type ✗  min ✗  max ✗  sign ✗  '
            'max_nulls ✗  no_duplicates ✗\n\n'
            'SUMMARY:\n\nConstraints passing: 0\nConstraints failing: 6')
        self.assertEqual(str(results2strict), expected)
        expected = pd.DataFrame(
            OrderedDict((
                ('field', ['i']),
                ('failures', [6]),
                ('passes', [0]),
                ('type', [False]),
                ('min', [False]),
                ('max', [False]),
                ('sign', [False]),
                ('max_nulls', [False]),
                ('no_duplicates', [False]),
            )))
        vdf = pdc.PandasVerification.verification_to_dataframe(results2strict)
        self.assertTrue(vdf.equals(expected))

        ic3 = FieldConstraints('i', [TypeConstraint('int')])
        df3 = df1
        dfc3 = [ic3]
        dsc3 = DatasetConstraints(dfc3)
        pdcv3 = pdc.PandasConstraintVerifier(df3)
        results3 = verify(dsc3, list(df3), pdcv3.verifiers())
        expected = (
            'FIELDS:\n\n'
            'i: 0 failures  1 pass  type ✓\n\n'
            'SUMMARY:\n\nConstraints passing: 1\nConstraints failing: 0')
        self.assertEqual(str(results3), expected)
        expected = pd.DataFrame(
            OrderedDict((
                ('field', ['i']),
                ('failures', [0]),
                ('passes', [1]),
                ('type', [True]),
            )))
        vdf = pdc.PandasVerification.verification_to_dataframe(results3)
        self.assertTrue(vdf.equals(expected))

        pdcv3 = pdc.PandasConstraintVerifier(df3)
        results3 = verify(dsc3, list(df3), pdcv3.verifiers(), ascii=True)
        expected = (
            'FIELDS:\n\n'
            'i: 0 failures  1 pass  type OK\n\n'
            'SUMMARY:\n\nConstraints passing: 1\nConstraints failing: 0')
        self.assertEqual(str(results3), expected)
Example #15
    def testFieldVerification(self):
        df1 = pd.DataFrame({
            'b': [True, False] * 2,
            'i': range(1, 5),
            'r': [float(x) for x in range(1, 5)],
            's': ['S%s' % x for x in range(1, 5)],
            'd': [datetime.datetime(2016, 1, x) for x in range(1, 5)]
        })
        ic1 = FieldConstraints('i', [
            TypeConstraint('int'),
            MinConstraint(0),
            MaxConstraint(10),
            SignConstraint('positive'),
            MaxNullsConstraint(0),
            NoDuplicatesConstraint()
        ])

        ic2 = FieldConstraints('i', [
            TypeConstraint('bool'),
            MinConstraint(2),
            MaxConstraint(3),
            SignConstraint('negative'),
            MaxNullsConstraint(0),
            NoDuplicatesConstraint()
        ])

        dfc1 = [ic1]
        dsc1 = DatasetConstraints(dfc1)
        pdcv1 = pdc.PandasConstraintVerifier(df1)
        results1 = verify(dsc1, pdcv1.verifiers())
        expected = ('FIELDS:\n\n'
                    'i: 0 failures  6 passes  '
                    'type ✓  min ✓  max ✓  sign ✓  '
                    'max_nulls ✓  no_duplicates ✓\n\n'
                    'SUMMARY:\n\nPasses: 6\nFailures: 0')
        self.assertEqual(str(results1), expected)
        expected = pd.DataFrame(
            OrderedDict((
                ('field', ['i']),
                ('failures', [0]),
                ('passes', [6]),
                ('type', [True]),
                ('min', [True]),
                ('max', [True]),
                ('sign', [True]),
                ('max_nulls', [True]),
                ('no_duplicates', [True]),
            )))
        self.assertTrue(
            pdc.verification_to_dataframe(results1).equals(expected))

        df2 = pd.DataFrame({'i': [1, 2, 2, 6, np.nan]})
        dfc2 = [ic2]
        dsc2 = DatasetConstraints(dfc2)
        pdcv2 = pdc.PandasConstraintVerifier(df2)
        results2 = verify(dsc2, pdcv2.verifiers())
        expected = ('FIELDS:\n\n'
                    'i: 6 failures  0 passes  '
                    'type ✗  min ✗  max ✗  sign ✗  '
                    'max_nulls ✗  no_duplicates ✗\n\n'
                    'SUMMARY:\n\nPasses: 0\nFailures: 6')
        self.assertEqual(str(results2), expected)
        expected = pd.DataFrame(
            OrderedDict((
                ('field', ['i']),
                ('failures', [6]),
                ('passes', [0]),
                ('type', [False]),
                ('min', [False]),
                ('max', [False]),
                ('sign', [False]),
                ('max_nulls', [False]),
                ('no_duplicates', [False]),
            )))
        self.assertTrue(
            pdc.verification_to_dataframe(results2).equals(expected))

        ic3 = FieldConstraints('i', [TypeConstraint('int')])
        df3 = df1
        dfc3 = [ic3]
        dsc3 = DatasetConstraints(dfc3)
        pdcv3 = pdc.PandasConstraintVerifier(df3)
        results3 = verify(dsc3, pdcv3.verifiers())
        expected = ('FIELDS:\n\n'
                    'i: 0 failures  1 pass  type ✓\n\n'
                    'SUMMARY:\n\nPasses: 1\nFailures: 0')
        self.assertEqual(str(results3), expected)
        expected = pd.DataFrame(
            OrderedDict((
                ('field', ['i']),
                ('failures', [0]),
                ('passes', [1]),
                ('type', [True]),
            )))
        self.assertTrue(
            pdc.verification_to_dataframe(results3).equals(expected))
Example #16
def verify_db_table(dbtype, db, tablename, constraints_path, epsilon=None,
                    type_checking='strict', testing=False, report='all',
                    **kwargs):
    """
    Verify that (i.e. check whether) the database table provided
    satisfies the constraints in the JSON .tdda file provided.

    Mandatory Inputs:

        *dbtype*:
                            Type of database.
        *db*:
                            A database object
        *tablename*:
                            A database table name, to be checked.

        *constraints_path*:
                            The path to a JSON .tdda file (possibly
                            generated by the discover_constraints
                            function, below) containing constraints
                            to be checked.

    Optional Inputs:

        *epsilon*:
                            When checking minimum and maximum values
                            for numeric fields, this provides a
                            tolerance. The tolerance is a proportion
                            of the constraint value by which the
                            constraint can be exceeded without causing
                            a constraint violation to be issued.

                            For example, with epsilon set to 0.01 (i.e. 1%),
                            values can be up to 1% larger than a max constraint
                            without generating constraint failure,
                            and minimum values can be up to 1% smaller
                            than the minimum constraint value without
                            generating a constraint failure. (These
                            are modified, as appropriate, for negative
                            values.)

                            If not specified, an *epsilon* of 0 is used,
                            so there is no tolerance.

                            NOTE: A consequence of the fact that these
                            are proportionate is that min/max values
                            of zero do not have any tolerance, i.e.
                            the wrong sign always generates a failure.

        *type_checking*:
                            ``strict`` or ``sloppy``. For databases (unlike
                            Pandas DataFrames), this defaults to 'strict'.

                            If this is set to sloppy, a database "real"
                            column c will also be allowed to satisfy
                            an "int" type constraint.

        *report*:
                            ``all`` or ``fields``.
                            This controls the behaviour of the
                            :py:meth:`~tdda.constraints.db.constraints.DatabaseVerification.__str__`
                            method on the resulting
                            :py:class:`~tdda.constraints.db.constraints.DatabaseVerification`
                            object (but not its content).

                            The default is ``all``, which means that
                            all fields are shown, together with the
                            verification status of each constraint
                            for that field.

                            If report is set to ``fields``, only fields for
                            which at least one constraint failed are shown.

        *testing*:
                            Boolean flag. Should only be set to ``True``
                            when being run as part of an automated test.
                            It suppresses type-compatibility warnings.

    Returns:

        :py:class:`~tdda.constraints.db.constraints.DatabaseVerification` object.

        This object has attributes:

        - *passes*      --- Number of passing constraints
        - *failures*    --- Number of failing constraints

    Example usage::

        import pgdb
        from tdda.constraints import verify_db_table

        dbspec = 'localhost:databasename:username:password'
        tablename = 'schemaname.tablename'
        db = pgdb.connect(dbspec)
        v = verify_db_table('postgres', db, tablename, 'myconstraints.tdda')

        print('Constraints passing:', v.passes)
        print('Constraints failing: %d\\n' % v.failures)
        print(str(v))
    """
    dbv = DatabaseConstraintVerifier(dbtype, db, tablename, epsilon=epsilon,
                                     type_checking=type_checking,
                                     testing=testing)
    if not dbv.check_table_exists(tablename):
        print('No table %s' % tablename, file=sys.stderr)
        sys.exit(1)
    constraints = DatasetConstraints(loadpath=constraints_path)
    return dbv.verify(constraints,
                      VerificationClass=DatabaseVerification,
                      report=report, **kwargs)
Example #17
def verify_directory_from_file(path, constraints_path, **kwargs):
    fv = FilesConstraintVerifier(path, **kwargs)
    constraints = DatasetConstraints(loadpath=constraints_path)
    return fv.verify(constraints)
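# Usage sketch for the wrapper above; both paths are placeholders, and the
# keyword arguments accepted depend on FilesConstraintVerifier.
result = verify_directory_from_file('some_directory', 'dir_constraints.tdda')
print(str(result))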
Example #18
def verify_df(df, constraints_path, epsilon=None, type_checking=None,
              repair=True, report='all', **kwargs):
    """
    Verify that (i.e. check whether) the Pandas DataFrame provided
    satisfies the constraints in the JSON ``.tdda`` file provided.

    Mandatory Inputs:

        *df*:
                            A Pandas DataFrame, to be checked.

        *constraints_path*:
                            The path to a JSON ``.tdda`` file (possibly
                            generated by the discover_df function, below)
                            containing constraints to be checked.
                            Or, alternatively, an in-memory dictionary
                            containing the structured contents of a ``.tdda``
                            file.

    Optional Inputs:

        *epsilon*:
                            When checking minimum and maximum values
                            for numeric fields, this provides a
                            tolerance. The tolerance is a proportion
                            of the constraint value by which the
                            constraint can be exceeded without causing
                            a constraint violation to be issued.

                            For example, with epsilon set to 0.01 (i.e. 1%),
                            values can be up to 1% larger than a max constraint
                            without generating constraint failure,
                            and minimum values can be up to 1% smaller
                            than the minimum constraint value without
                            generating a constraint failure. (These
                            are modified, as appropriate, for negative
                            values.)

                            If not specified, an *epsilon* of 0 is used,
                            so there is no tolerance.


                            NOTE: A consequence of the fact that these
                            are proportionate is that min/max values
                            of zero do not have any tolerance, i.e.
                            the wrong sign always generates a failure.

        *type_checking*:
                            ``strict`` or ``sloppy``.
                            Because Pandas silently, routinely and
                            automatically "promotes" integer and boolean
                            columns to reals and objects respectively
                            if they contain nulls, strict type checking
                            can be problematical in Pandas. For this reason,
                            ``type_checking`` defaults to ``sloppy``, meaning
                            that type changes that could plausibly be
                            attributed to Pandas type promotion will not
                            generate constraint violations.

                            If this is set to strict, a Pandas ``float``
                            column ``c`` will only be allowed to satisfy
                            an ``int`` type constraint if::

                                c.dropnulls().astype(int) == c.dropnulls()

                            Similarly, Object fields will satisfy a
                            ``bool`` constraint only if::

                                c.dropnulls().astype(bool) == c.dropnulls()

        *repair*:
                            A boolean to specify whether to try to use the
                            information in the constraints to attempt to
                            repair potentially-incorrect type inferences
                            made when constructing the dataframe. When the
                            dataframe has been loaded from a .csv file, this
                            can often be useful (but should not be used with
                            dataframes that have come from a more reliable
                            source).

        *report*:
                            ``all`` or ``fields``.
                            This controls the behaviour of the
                            :py:meth:`~tdda.constraints.pd.constraints.PandasVerification.__str__` method on
                            the resulting :py:class:`~tdda.constraints.pd.constraints.PandasVerification`
                            object (but not its content).

                            The default is ``all``, which means that
                            all fields are shown, together with the
                            verification status of each constraint
                            for that field.

                            If report is set to ``fields``, only fields for
                            which at least one constraint failed are shown.

    Returns:

        :py:class:`~tdda.constraints.pd.constraints.PandasVerification` object.

        This object has attributes:

        - *passes*      --- Number of passing constraints
        - *failures*    --- Number of failing constraints

        It also has a :py:meth:`~tdda.constraints.pd.constraints.PandasVerification.to_frame()` method for
        converting the results of the verification to a Pandas DataFrame,
        and a :py:meth:`~tdda.constraints.pd.constraints.PandasVerification.__str__` method to print
        both the detailed and summary results of the verification.

    Example usage::

        import numpy as np
        import pandas as pd
        from tdda.constraints import verify_df

        df = pd.DataFrame({'a': [0, 1, 2, 10, np.nan],
                           'b': ['one', 'one', 'two', 'three', np.nan]})
        v = verify_df(df, 'example_constraints.tdda')

        print('Constraints passing: %d\\n' % v.passes)
        print('Constraints failing: %d\\n' % v.failures)
        print(str(v))
        print(v.to_frame())

    See *simple_verification.py* in the :ref:`constraint_examples`
    for a slightly fuller example.

    """
    pdv = PandasConstraintVerifier(df, epsilon=epsilon,
                                   type_checking=type_checking)
    if isinstance(constraints_path, dict):
        constraints = DatasetConstraints()
        constraints.initialize_from_dict(native_definite(constraints_path))
    else:
        constraints = DatasetConstraints(loadpath=constraints_path)
    if repair:
        pdv.repair_field_types(constraints)
    return pdv.verify(constraints,
                      VerificationClass=PandasVerification,
                      report=report, **kwargs)