Пример #1
0
    def calculate_integrity(self, df_ge: PandasDataset,
                            specs: SchemaParserResult) -> dict:
        """
        Calculate a per-column integrity score from the type definitions and
        the expectations.

        For every column two lists of counts are collected: violations
        ("invalid") and checked elements ("total"). The score of a column is
        ``1 - sum(invalid) / sum(total)``.

        Counts are collected as follows:

        * Column listed in ``specs.type_definitions`` but absent from
          ``df_ge``: if it is also in ``specs.required_types`` the row count
          ``df_ge.shape[0]`` is recorded as both invalid and total; otherwise
          nothing is recorded. No further checks run for that column.
        * Null check: the element count is added to the total; the unexpected
          count is added to the invalid counts only for required columns.
        * Type check: the unexpected count is always added to the invalid
          counts.
        * Columns of ``df_ge`` that have no type definition: 0 is recorded as
          invalid, and the unexpected count of the "values are null"
          expectation is recorded as total.
        * Violations returned by ``self.validate_expectations`` are appended
          to the invalid counts.

        :param df_ge: dataset on which the expectations are evaluated.
        :param specs: parsed schema providing ``type_definitions`` and
            ``required_types``.
        :return: dict mapping column name to its integrity score. A column
            whose total is zero gets 1.0 when it has no violations and 0.0
            otherwise, instead of the nan/inf a division by zero would give.
        """
        def get_unexpected(eg_result):
            # Not every expectation result carries an 'unexpected_count';
            # treat a missing entry as "no violations".
            return eg_result[
                'unexpected_count'] if 'unexpected_count' in eg_result else 0

        def merge_dicts(d1, d2):
            # Append every count of d2 to the matching list in d1 (in place).
            for key, values in d2.items():
                d1[key].extend(values)
            return d1

        all_elements = defaultdict(list)
        invalid_elements = defaultdict(list)
        for definition in specs.type_definitions:
            result = df_ge.expect_column_to_exist(definition)
            if not result.success:
                if definition in specs.required_types:  # does only count as error if required
                    # a missing required column counts every row as invalid
                    invalid_elements[definition].append(df_ge.shape[0])
                    all_elements[definition].append(df_ge.shape[0])
                continue

            # check missing values
            result = df_ge.expect_column_values_to_not_be_null(definition)
            if definition in specs.required_types:  # only count as error if required
                invalid_elements[definition].append(
                    get_unexpected(result.result))
            all_elements[definition].append(result.result['element_count'])

            # check not correct types
            type_specification = TypeSpecification.create(
                specs.type_definitions.get(definition))
            type_list = [t.__name__ for t in type_specification.get_types()]
            # noinspection PyTypeChecker
            result = df_ge.expect_column_values_to_be_in_type_list(
                definition, type_list)
            invalid_elements[definition].append(get_unexpected(result.result))

        # handle attributes that are not specified
        not_specified_fields = set(df_ge.columns) - set(specs.type_definitions)
        for attribute in not_specified_fields:
            result = df_ge.expect_column_values_to_be_null(attribute)
            # integrity of not specified fields has been defined as 1 - so we add 0 to unexpected
            invalid_elements[attribute].append(0)
            # presumably the unexpected count here is the number of non-null
            # values, i.e. the elements actually present - verify against GE
            all_elements[attribute].append(get_unexpected(result.result))

        # check expectations
        expectation_violations = self.validate_expectations(df_ge, specs)
        merge_dicts(invalid_elements, expectation_violations)

        # flatten attribute metrics
        integrity_details = dict()
        for column, invalid_counts in invalid_elements.items():
            invalid_total = np.sum(invalid_counts)
            # .get avoids inserting empty entries into the defaultdict
            element_total = sum(all_elements.get(column, ()))
            if element_total == 0:
                # Nothing was counted for this column (e.g. an all-null
                # unspecified column or an empty dataset): avoid dividing by
                # zero. No violations means full integrity, otherwise none.
                integrity_details[column] = 1.0 if invalid_total == 0 else 0.0
            else:
                integrity_details[column] = 1 - (invalid_total / element_total)

        return integrity_details
Пример #2
0
def _prepare_dataset(dataset: PandasDataset) -> PandasDataset:
    """Return a deep copy of ``dataset`` with selected columns converted.

    Two conversions are applied column by column, always reading from the
    original ``dataset`` and writing into the copy:

    * columns that pass the type-list expectation for
      ``ProfilerTypeMapping.DATETIME_TYPE_NAMES`` are replaced by their
      ``"%Y-%m-%dT%H:%M:%S"`` string representation;
    * ``float32`` columns are widened to ``float64``.

    The input ``dataset`` itself is not assigned to.
    """
    prepared = dataset.copy(deep=True)

    for name in dataset.columns:
        datetime_type_names = sorted(ProfilerTypeMapping.DATETIME_TYPE_NAMES)
        datetime_check = dataset.expect_column_values_to_be_in_type_list(
            name, type_list=datetime_type_names
        )
        if datetime_check.success:
            # GE cannot parse Timestamp or other pandas datetime types,
            # so hand it plain formatted strings instead
            prepared[name] = dataset[name].dt.strftime("%Y-%m-%dT%H:%M:%S")

        is_single_precision = dataset[name].dtype == np.float32
        if is_single_precision:
            # GE converts expectation arguments into native Python floats
            # (double precision); comparing those against float32 values
            # could cause errors, so convert to double up front
            prepared[name] = dataset[name].astype(np.float64)

    return prepared