def __init__(
    self,
    data_context,
    action_list,
    name,
    base_expectation_suite_name=None,
    expectation_suite_name_suffixes=None,
    stop_on_first_error=False,
    slack_webhook=None,
    notify_on="all",
    result_format={"result_format": "SUMMARY"},
):
    super().__init__(data_context, action_list, name)

    if expectation_suite_name_suffixes is None:
        expectation_suite_name_suffixes = [".failure", ".warning"]

    self.stop_on_first_error = stop_on_first_error
    self.base_expectation_suite_name = base_expectation_suite_name

    assert len(expectation_suite_name_suffixes) == 2
    for suffix in expectation_suite_name_suffixes:
        assert isinstance(suffix, str)
    self.expectation_suite_name_suffixes = expectation_suite_name_suffixes

    self.slack_webhook = slack_webhook
    self.notify_on = notify_on

    result_format = parse_result_format(result_format)
    assert result_format["result_format"] in [
        "BOOLEAN_ONLY",
        "BASIC",
        "SUMMARY",
        "COMPLETE",
    ]
    self.result_format = result_format
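# A minimal sketch (not the library implementation) of what `parse_result_format`
# is assumed to do in the constructors above: accept either a bare string like
# "SUMMARY" or a dict, and normalize to a dict with a "result_format" key plus a
# default "partial_unexpected_count". The default of 20 is an assumption for
# illustration only.
def _parse_result_format_sketch(result_format):
    if isinstance(result_format, str):
        result_format = {"result_format": result_format}
    result_format.setdefault("partial_unexpected_count", 20)
    return result_format

# Both call styles normalize to the same shape:
assert _parse_result_format_sketch("SUMMARY") == {
    "result_format": "SUMMARY",
    "partial_unexpected_count": 20,
}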
def __init__(
    self,
    data_context,
    action_list,
    name,
    result_format={"result_format": "SUMMARY"},
):
    super().__init__()
    self.data_context = data_context
    self.name = name

    result_format = parse_result_format(result_format)
    assert result_format["result_format"] in [
        "BOOLEAN_ONLY",
        "BASIC",
        "SUMMARY",
        "COMPLETE",
    ]
    self.result_format = result_format

    self.action_list = action_list
    self.actions = OrderedDict()
    for action_config in action_list:
        assert isinstance(action_config, dict)
        # NOTE: Eugene: 2019-09-23: need a better way to validate an action config:
        if not set(action_config.keys()) == {"name", "action"}:
            raise KeyError(
                'Action config keys must be ("name", "action"). Instead got {}'.format(
                    action_config.keys()
                )
            )

        config = action_config["action"]
        module_name = "great_expectations.validation_operators"
        new_action = instantiate_class_from_config(
            config=config,
            runtime_environment={"data_context": self.data_context},
            config_defaults={"module_name": module_name},
        )
        if not new_action:
            raise ClassInstantiationError(
                module_name=module_name,
                package_name=None,
                class_name=config["class_name"],
            )
        self.actions[action_config["name"]] = new_action
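# Illustrative only: an `action_list` entry that would satisfy the validation
# loop above. Each entry must have exactly the keys "name" and "action"; the
# "action" dict is handed to `instantiate_class_from_config`, which resolves
# `class_name` within great_expectations.validation_operators by default. The
# specific class name shown is an assumption for this sketch.
example_action_list = [
    {
        "name": "store_validation_result",
        "action": {"class_name": "StoreValidationResultAction"},
    },
]
assert set(example_action_list[0].keys()) == {"name", "action"}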
def inner_wrapper(self, column, mostly=None, result_format=None, *args, **kwargs):
    if result_format is None:
        result_format = self.default_expectation_args["result_format"]

    result_format = parse_result_format(result_format)

    if result_format["result_format"] == "COMPLETE":
        warnings.warn(
            "Setting result format to COMPLETE for a SqlAlchemyDataset can be dangerous because it will not limit the number of returned results."
        )
        unexpected_count_limit = None
    else:
        unexpected_count_limit = result_format["partial_unexpected_count"]

    expected_condition = func(self, column, *args, **kwargs)

    # Added to prepare for when an ignore_values argument is added to the expectation
    ignore_values = [None]
    if func.__name__ in [
        "expect_column_values_to_not_be_null",
        "expect_column_values_to_be_null",
    ]:
        ignore_values = []
        # Counting the number of unexpected values can be expensive when there is a large
        # number of np.nan values.
        # This only happens on expect_column_values_to_not_be_null expectations.
        # Since there is no reason to look for most common unexpected values in this case,
        # we will instruct the result formatting method to skip this step.
        result_format["partial_unexpected_count"] = 0

    ignore_values_conditions = []
    if (len(ignore_values) > 0 and None not in ignore_values) or (
        len(ignore_values) > 1 and None in ignore_values
    ):
        ignore_values_conditions += [
            sa.column(column).in_(
                [val for val in ignore_values if val is not None]
            )
        ]
    if None in ignore_values:
        ignore_values_conditions += [sa.column(column).is_(None)]

    if len(ignore_values_conditions) > 1:
        ignore_values_condition = sa.or_(*ignore_values_conditions)
    elif len(ignore_values_conditions) == 1:
        ignore_values_condition = ignore_values_conditions[0]
    else:
        ignore_values_condition = sa.literal(False)

    count_query = sa.select([
        sa.func.count().label("element_count"),
        sa.func.sum(
            sa.case([(ignore_values_condition, 1)], else_=0)
        ).label("null_count"),
        sa.func.sum(
            sa.case(
                [(
                    sa.and_(
                        sa.not_(expected_condition),
                        sa.not_(ignore_values_condition),
                    ),
                    1,
                )],
                else_=0,
            )
        ).label("unexpected_count"),
    ]).select_from(self._table)

    count_results = dict(self.engine.execute(count_query).fetchone())

    # Handle case of empty table gracefully:
    if "element_count" not in count_results or count_results["element_count"] is None:
        count_results["element_count"] = 0
    if "null_count" not in count_results or count_results["null_count"] is None:
        count_results["null_count"] = 0
    if "unexpected_count" not in count_results or count_results["unexpected_count"] is None:
        count_results["unexpected_count"] = 0

    # Retrieve unexpected values
    unexpected_query_results = self.engine.execute(
        sa.select([sa.column(column)])
        .select_from(self._table)
        .where(
            sa.and_(
                sa.not_(expected_condition),
                sa.not_(ignore_values_condition),
            )
        )
        .limit(unexpected_count_limit)
    )

    nonnull_count = count_results["element_count"] - count_results["null_count"]

    if "output_strftime_format" in kwargs:
        output_strftime_format = kwargs["output_strftime_format"]
        maybe_limited_unexpected_list = []
        for x in unexpected_query_results.fetchall():
            if isinstance(x[column], string_types):
                col = parse(x[column])
            else:
                col = x[column]
            maybe_limited_unexpected_list.append(
                datetime.strftime(col, output_strftime_format)
            )
    else:
        maybe_limited_unexpected_list = [
            x[column] for x in unexpected_query_results.fetchall()
        ]

    success_count = nonnull_count - count_results["unexpected_count"]
    success, percent_success = self._calc_map_expectation_success(
        success_count, nonnull_count, mostly
    )

    return_obj = self._format_map_output(
        result_format,
        success,
        count_results["element_count"],
        nonnull_count,
        count_results["unexpected_count"],
        maybe_limited_unexpected_list,
        None,
    )

    if func.__name__ in [
        "expect_column_values_to_not_be_null",
        "expect_column_values_to_be_null",
    ]:
        # These results are unnecessary for the above expectations
        del return_obj["result"]["unexpected_percent_nonmissing"]
        del return_obj["result"]["missing_count"]
        del return_obj["result"]["missing_percent"]
        try:
            del return_obj["result"]["partial_unexpected_counts"]
            del return_obj["result"]["partial_unexpected_list"]
        except KeyError:
            pass

    return return_obj
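# For orientation only: an assumed shape of the object `_format_map_output`
# returns under the "SUMMARY" result format. The field names follow the keys
# the code above deletes (e.g. 'unexpected_percent_nonmissing'); the exact
# contents depend on the library version, so treat this as a sketch.
example_return_obj = {
    "success": False,
    "result": {
        "element_count": 100,
        "missing_count": 10,
        "missing_percent": 10.0,
        "unexpected_count": 3,
        "unexpected_percent": 3.0,
        "unexpected_percent_nonmissing": 3.3333333333333335,
        "partial_unexpected_list": ["a", "b", "c"],
        "partial_unexpected_counts": [{"value": "a", "count": 1}],
    },
}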
def inner_wrapper(self, column, mostly=None, result_format=None, *args, **kwargs):
    """
    This whole decorator is pending a rewrite. Currently there are huge
    performance issues when the number of unexpected elements gets large
    (tens of millions). Additionally, there are likely easy optimization
    opportunities in coupling result_format with how many different
    transformations are done on the dataset, as is done in sqlalchemy_dataset.
    """
    if result_format is None:
        result_format = self.default_expectation_args["result_format"]

    result_format = parse_result_format(result_format)

    # this is a little dangerous: expectations that specify "COMPLETE" result format and have a very
    # large number of unexpected results could hang for a long time. we should either call this out in docs
    # or put a limit on it
    if result_format["result_format"] == "COMPLETE":
        unexpected_count_limit = None
    else:
        unexpected_count_limit = result_format["partial_unexpected_count"]

    col_df = self.spark_df.select(column)  # pyspark.sql.DataFrame

    # a couple of tests indicate that caching here helps performance
    col_df.cache()
    element_count = self.get_row_count()

    # FIXME temporary fix for missing/ignored value
    if func.__name__ not in [
        "expect_column_values_to_not_be_null",
        "expect_column_values_to_be_null",
    ]:
        col_df = col_df.filter("{column} is not null".format(column=column))
        # these nonnull_counts are cached by SparkDFDataset
        nonnull_count = self.get_column_nonnull_count(column)
    else:
        nonnull_count = element_count

    # success_df will have columns [column, '__success']
    # this feels a little hacky, so might want to change
    success_df = func(self, col_df, *args, **kwargs)
    success_count = success_df.filter("__success = True").count()

    unexpected_count = nonnull_count - success_count
    if unexpected_count == 0:
        # save some computation time if no unexpected items
        maybe_limited_unexpected_list = []
    else:
        # here's an example of a place where we could do optimizations if we knew result format: see
        # comment block below
        unexpected_df = success_df.filter("__success = False")
        if unexpected_count_limit:
            unexpected_df = unexpected_df.limit(unexpected_count_limit)
        maybe_limited_unexpected_list = [
            row[column] for row in unexpected_df.collect()
        ]

    success, percent_success = self._calc_map_expectation_success(
        success_count, nonnull_count, mostly
    )

    # Currently the abstraction of "result_format" that _format_column_map_output provides
    # limits some possible optimizations within the column-map decorator. It seems that either
    # this logic should be completely rolled into the processing done in the column_map decorator, or that the decorator
    # should do a minimal amount of computation agnostic of result_format, and then delegate the rest to this method.
    # In the first approach, it could make sense to put all of this decorator logic in Dataset, and then implement
    # properties that require dataset-type-dependent implementations (as is done with SparkDFDataset.row_count currently).
    # Then a new dataset type could just implement these properties/hooks and Dataset could deal with caching these and
    # with the optimizations based on result_format. A side benefit would be implementing an interface for the user
    # to get basic info about a dataset in a standardized way, e.g. my_dataset.row_count, my_dataset.columns (only for
    # tabular datasets maybe). However, it is unclear whether this is worth it or if it would conflict with optimizations
    # being done in other dataset implementations.
    return_obj = self._format_map_output(
        result_format,
        success,
        element_count,
        nonnull_count,
        unexpected_count,
        maybe_limited_unexpected_list,
        unexpected_index_list=None,
    )

    # FIXME Temp fix for result format
    if func.__name__ in [
        "expect_column_values_to_not_be_null",
        "expect_column_values_to_be_null",
    ]:
        del return_obj["result"]["unexpected_percent_nonmissing"]
        try:
            del return_obj["result"]["partial_unexpected_counts"]
        except KeyError:
            pass

    col_df.unpersist()

    return return_obj
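# A hedged sketch of the contract the wrapped `func` must satisfy in the Spark
# wrapper above: given a single-column DataFrame, return a DataFrame that keeps
# the column and adds a boolean '__success' column. The expectation body below
# (non-negativity) is made up for this example, not one of the library's
# expectations.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()

def _example_expectation_body(col_df, column):
    # Mark each row as successful when the value is non-negative.
    return col_df.withColumn("__success", col(column) >= 0)

df = spark.createDataFrame([(1,), (-2,), (3,)], ["x"])
success_df = _example_expectation_body(df.select("x"), "x")
assert success_df.filter("__success = True").count() == 2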
def inner_wrapper(self, skip=None, mostly=None, null_lines_regex=r"^\s*$",
                  result_format=None, *args, **kwargs):
    try:
        f = open(self._path)
    except OSError:
        raise

    if result_format is None:
        result_format = self.default_expectation_args["result_format"]
    result_format = parse_result_format(result_format)

    lines = f.readlines()  # Read in file lines

    # Skip k initial lines designated by the user
    if skip is not None and skip <= len(lines):
        try:
            assert float(skip).is_integer()
            assert float(skip) >= 0
        except (AssertionError, ValueError):
            raise ValueError("skip must be a non-negative integer")

        for i in range(1, skip + 1):
            lines.pop(0)

    if lines:
        if null_lines_regex is not None:
            # Ignore lines that are empty or have only white space
            # ("null values" in the line-map context)
            null_lines = re.compile(null_lines_regex)
            boolean_mapped_null_lines = np.array(
                [bool(null_lines.match(line)) for line in lines]
            )
        else:
            boolean_mapped_null_lines = np.zeros(len(lines), dtype=bool)

        element_count = int(len(lines))

        if element_count > sum(boolean_mapped_null_lines):
            nonnull_lines = list(
                compress(lines, np.invert(boolean_mapped_null_lines))
            )
            nonnull_count = int((boolean_mapped_null_lines == False).sum())
            boolean_mapped_success_lines = np.array(
                func(self, _lines=nonnull_lines, *args, **kwargs)
            )
            success_count = np.count_nonzero(boolean_mapped_success_lines)
            unexpected_list = list(
                compress(nonnull_lines, np.invert(boolean_mapped_success_lines))
            )
            nonnull_lines_index = range(0, len(nonnull_lines) + 1)
            unexpected_index_list = list(
                compress(nonnull_lines_index, np.invert(boolean_mapped_success_lines))
            )
            success, percent_success = self._calc_map_expectation_success(
                success_count, nonnull_count, mostly
            )
            return_obj = self._format_map_output(
                result_format,
                success,
                element_count,
                nonnull_count,
                len(unexpected_list),
                unexpected_list,
                unexpected_index_list,
            )
        else:
            return_obj = self._format_map_output(
                result_format=result_format,
                success=None,
                element_count=element_count,
                nonnull_count=0,
                unexpected_count=0,
                unexpected_list=[],
                unexpected_index_list=[],
            )
    else:
        return_obj = self._format_map_output(
            result_format=result_format,
            success=None,
            element_count=0,
            nonnull_count=0,
            unexpected_count=0,
            unexpected_list=[],
            unexpected_index_list=[],
        )

    f.close()
    return return_obj
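# A minimal sketch, assuming the usual "mostly" semantics used throughout these
# wrappers: with no `mostly`, every non-null element must succeed; with
# `mostly`, the success fraction must meet or exceed it. This mirrors how
# `_calc_map_expectation_success` is called above, but it is not the library's
# implementation.
def _calc_map_expectation_success_sketch(success_count, nonnull_count, mostly):
    percent_success = success_count / nonnull_count if nonnull_count > 0 else None
    if mostly is not None:
        success = percent_success is not None and percent_success >= mostly
    else:
        success = success_count == nonnull_count
    return success, percent_success

# 95% success passes a mostly=0.9 threshold:
assert _calc_map_expectation_success_sketch(95, 100, 0.9) == (True, 0.95)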
def inner_wrapper(self, column_A, column_B, mostly=None,
                  ignore_row_if="both_values_are_missing",
                  result_format=None, *args, **kwargs):
    if result_format is None:
        result_format = self.default_expectation_args["result_format"]

    result_format = parse_result_format(result_format)

    # this is a little dangerous: expectations that specify "COMPLETE" result format and have a very
    # large number of unexpected results could hang for a long time. we should either call this out in docs
    # or put a limit on it
    if result_format["result_format"] == "COMPLETE":
        unexpected_count_limit = None
    else:
        unexpected_count_limit = result_format["partial_unexpected_count"]

    cols_df = self.spark_df.select(column_A, column_B).withColumn(
        "__row", monotonically_increasing_id()
    )  # pyspark.sql.DataFrame

    # a couple of tests indicate that caching here helps performance
    cols_df.cache()
    element_count = self.get_row_count()

    if ignore_row_if == "both_values_are_missing":
        boolean_mapped_null_values = cols_df.selectExpr(
            "`__row`",
            "`{0}` AS `A_{0}`".format(column_A),
            "`{0}` AS `B_{0}`".format(column_B),
            "ISNULL(`{0}`) AND ISNULL(`{1}`) AS `__null_val`".format(
                column_A, column_B
            ),
        )
    elif ignore_row_if == "either_value_is_missing":
        boolean_mapped_null_values = cols_df.selectExpr(
            "`__row`",
            "`{0}` AS `A_{0}`".format(column_A),
            "`{0}` AS `B_{0}`".format(column_B),
            "ISNULL(`{0}`) OR ISNULL(`{1}`) AS `__null_val`".format(
                column_A, column_B
            ),
        )
    elif ignore_row_if == "never":
        # selectExpr accepts only SQL expression strings, so the constant is
        # written as `false` rather than passed as a Column via lit(False)
        boolean_mapped_null_values = cols_df.selectExpr(
            "`__row`",
            "`{0}` AS `A_{0}`".format(column_A),
            "`{0}` AS `B_{0}`".format(column_B),
            "false AS `__null_val`",
        )
    else:
        raise ValueError("Unknown value of ignore_row_if: %s" % (ignore_row_if,))

    # since pyspark guarantees that each selected column has the same number of rows,
    # there is no need for the length assertion used in the pandas implementation:
    # assert series_A.count() == (
    #     series_B.count()), "Series A and B must be the same length"

    nonnull_df = boolean_mapped_null_values.filter("__null_val = False")
    nonnull_count = nonnull_df.count()

    col_A_df = nonnull_df.select("__row", "`A_{0}`".format(column_A))
    col_B_df = nonnull_df.select("__row", "`B_{0}`".format(column_B))

    success_df = func(self, col_A_df, col_B_df, *args, **kwargs)
    success_count = success_df.filter("__success = True").count()

    unexpected_count = nonnull_count - success_count
    if unexpected_count == 0:
        # save some computation time if no unexpected items
        maybe_limited_unexpected_list = []
    else:
        # here's an example of a place where we could do optimizations if we knew result format: see
        # comment block below
        unexpected_df = success_df.filter("__success = False")
        if unexpected_count_limit:
            unexpected_df = unexpected_df.limit(unexpected_count_limit)
        maybe_limited_unexpected_list = [
            (row["A_{0}".format(column_A)], row["B_{0}".format(column_B)])
            for row in unexpected_df.collect()
        ]

    if "output_strftime_format" in kwargs:
        output_strftime_format = kwargs["output_strftime_format"]
        parsed_maybe_limited_unexpected_list = []
        for val in maybe_limited_unexpected_list:
            if val is None or (val[0] is None or val[1] is None):
                parsed_maybe_limited_unexpected_list.append(val)
            else:
                if isinstance(val[0], string_types) and isinstance(val[1], string_types):
                    val = (parse(val[0]), parse(val[1]))
                parsed_maybe_limited_unexpected_list.append(
                    (
                        datetime.strftime(val[0], output_strftime_format),
                        datetime.strftime(val[1], output_strftime_format),
                    )
                )
        maybe_limited_unexpected_list = parsed_maybe_limited_unexpected_list

    success, percent_success = self._calc_map_expectation_success(
        success_count, nonnull_count, mostly
    )

    # Currently the abstraction of "result_format" that _format_column_map_output provides
    # limits some possible optimizations within the column-map decorator. It seems that either
    # this logic should be completely rolled into the processing done in the column_map decorator, or that the decorator
    # should do a minimal amount of computation agnostic of result_format, and then delegate the rest to this method.
    # In the first approach, it could make sense to put all of this decorator logic in Dataset, and then implement
    # properties that require dataset-type-dependent implementations (as is done with SparkDFDataset.row_count currently).
    # Then a new dataset type could just implement these properties/hooks and Dataset could deal with caching these and
    # with the optimizations based on result_format. A side benefit would be implementing an interface for the user
    # to get basic info about a dataset in a standardized way, e.g. my_dataset.row_count, my_dataset.columns (only for
    # tabular datasets maybe). However, it is unclear whether this is worth it or if it would conflict with optimizations
    # being done in other dataset implementations.
    return_obj = self._format_map_output(
        result_format,
        success,
        element_count,
        nonnull_count,
        unexpected_count,
        maybe_limited_unexpected_list,
        unexpected_index_list=None,
    )

    # # FIXME Temp fix for result format
    # if func.__name__ in ['expect_column_values_to_not_be_null', 'expect_column_values_to_be_null']:
    #     del return_obj['result']['unexpected_percent_nonmissing']
    #     del return_obj['result']['missing_count']
    #     del return_obj['result']['missing_percent']
    #     try:
    #         del return_obj['result']['partial_unexpected_counts']
    #     except KeyError:
    #         pass

    cols_df.unpersist()

    return return_obj
def inner_wrapper(self, column, mostly=None, result_format=None, *args, **kwargs):
    if result_format is None:
        result_format = self.default_expectation_args["result_format"]

    result_format = parse_result_format(result_format)

    if result_format["result_format"] == "COMPLETE":
        warnings.warn(
            "Setting result format to COMPLETE for a SqlAlchemyDataset can be dangerous because it will not limit the number of returned results."
        )
        unexpected_count_limit = None
    else:
        unexpected_count_limit = result_format["partial_unexpected_count"]

    expected_condition = func(self, column, *args, **kwargs)

    # FIXME Temporary Fix for counting missing values
    # Added to compensate when an ignore_values argument is added to the expectation
    ignore_values = [None]
    if func.__name__ in [
        "expect_column_values_to_not_be_null",
        "expect_column_values_to_be_null",
    ]:
        ignore_values = []
        # Counting the number of unexpected values can be expensive when there is a large
        # number of np.nan values.
        # This only happens on expect_column_values_to_not_be_null expectations.
        # Since there is no reason to look for most common unexpected values in this case,
        # we will instruct the result formatting method to skip this step.
        result_format["partial_unexpected_count"] = 0

    count_query = sa.select([
        sa.func.count().label("element_count"),
        sa.func.sum(
            sa.case(
                [(
                    sa.or_(
                        sa.column(column).in_(ignore_values),
                        # Below is necessary b/c sa.in_() uses `==` but None != None
                        # But we only consider this if None is actually in the list of ignore values
                        sa.column(column).is_(None) if None in ignore_values else False,
                    ),
                    1,
                )],
                else_=0,
            )
        ).label("null_count"),
        sa.func.sum(
            sa.case(
                [(
                    sa.and_(
                        sa.not_(expected_condition),
                        sa.case([(sa.column(column).is_(None), False)], else_=True)
                        if None in ignore_values
                        else True,
                    ),
                    1,
                )],
                else_=0,
            )
        ).label("unexpected_count"),
    ]).select_from(self._table)

    count_results = dict(self.engine.execute(count_query).fetchone())

    # Handle case of empty table gracefully:
    if "element_count" not in count_results or count_results["element_count"] is None:
        count_results["element_count"] = 0
    if "null_count" not in count_results or count_results["null_count"] is None:
        count_results["null_count"] = 0
    if "unexpected_count" not in count_results or count_results["unexpected_count"] is None:
        count_results["unexpected_count"] = 0

    # Retrieve unexpected values
    unexpected_query_results = self.engine.execute(
        sa.select([sa.column(column)])
        .select_from(self._table)
        .where(
            sa.and_(
                sa.not_(expected_condition),
                sa.or_(
                    # SA normally evaluates `== None` as `IS NULL`. However `sa.in_()`
                    # replaces `None` with `NULL` in the list and incorrectly uses `== NULL`
                    sa.case([(sa.column(column).is_(None), False)], else_=True)
                    if None in ignore_values
                    else False,
                    # Ignore any other values that are in the ignore list
                    sa.column(column).in_(ignore_values) == False,
                ),
            )
        )
        .limit(unexpected_count_limit)
    )

    nonnull_count = count_results["element_count"] - count_results["null_count"]

    if "output_strftime_format" in kwargs:
        output_strftime_format = kwargs["output_strftime_format"]
        maybe_limited_unexpected_list = []
        for x in unexpected_query_results.fetchall():
            if isinstance(x[column], string_types):
                col = parse(x[column])
            else:
                col = x[column]
            maybe_limited_unexpected_list.append(
                datetime.strftime(col, output_strftime_format)
            )
    else:
        maybe_limited_unexpected_list = [
            x[column] for x in unexpected_query_results.fetchall()
        ]

    success_count = nonnull_count - count_results["unexpected_count"]
    success, percent_success = self._calc_map_expectation_success(
        success_count, nonnull_count, mostly
    )

    return_obj = self._format_map_output(
        result_format,
        success,
        count_results["element_count"],
        nonnull_count,
        count_results["unexpected_count"],
        maybe_limited_unexpected_list,
        None,
    )

    if func.__name__ in [
        "expect_column_values_to_not_be_null",
        "expect_column_values_to_be_null",
    ]:
        # These results are unnecessary for the above expectations
        del return_obj["result"]["unexpected_percent_nonmissing"]
        try:
            del return_obj["result"]["partial_unexpected_counts"]
            del return_obj["result"]["partial_unexpected_list"]
        except KeyError:
            pass

    return return_obj
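# For intuition only: the conditional-aggregation pattern above can be built and
# rendered with a stand-in column and condition. This follows the SQLAlchemy 1.x
# list-style `case()` call used in the code above; the table and column names
# are made up for the sketch, and the rendered SQL varies by dialect.
import sqlalchemy as sa

column = sa.column("x")          # hypothetical column
expected_condition = column > 0  # hypothetical expectation condition

count_query_sketch = sa.select([
    sa.func.count().label("element_count"),
    sa.func.sum(sa.case([(column.is_(None), 1)], else_=0)).label("null_count"),
    sa.func.sum(
        sa.case(
            [(sa.and_(sa.not_(expected_condition), column.isnot(None)), 1)],
            else_=0,
        )
    ).label("unexpected_count"),
]).select_from(sa.table("my_table"))

# Prints roughly:
#   SELECT count(*) AS element_count,
#          sum(CASE WHEN (x IS NULL) THEN 1 ELSE 0 END) AS null_count,
#          sum(CASE WHEN (x <= 0 AND x IS NOT NULL) THEN 1 ELSE 0 END) AS unexpected_count
#   FROM my_table
print(count_query_sketch)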
def inner_wrapper(self, column, mostly=None, result_format=None, *args, **kwargs):
    if result_format is None:
        result_format = self.default_expectation_args["result_format"]

    result_format = parse_result_format(result_format)

    # FIXME temporary fix for missing/ignored value
    ignore_values = [None, np.nan]
    if func.__name__ in [
        "expect_column_values_to_not_be_null",
        "expect_column_values_to_be_null",
    ]:
        ignore_values = []
        # Counting the number of unexpected values can be expensive when there is a large
        # number of np.nan values.
        # This only happens on expect_column_values_to_not_be_null expectations.
        # Since there is no reason to look for most common unexpected values in this case,
        # we will instruct the result formatting method to skip this step.
        result_format["partial_unexpected_count"] = 0

    series = self[column]

    # FIXME rename to mapped_ignore_values?
    if len(ignore_values) == 0:
        boolean_mapped_null_values = np.array([False for value in series])
    else:
        boolean_mapped_null_values = np.array(
            [
                True if (value in ignore_values) or (pd.isnull(value)) else False
                for value in series
            ]
        )

    element_count = int(len(series))

    # FIXME rename nonnull to non_ignored?
    nonnull_values = series[boolean_mapped_null_values == False]
    nonnull_count = int((boolean_mapped_null_values == False).sum())

    boolean_mapped_success_values = func(self, nonnull_values, *args, **kwargs)
    success_count = np.count_nonzero(boolean_mapped_success_values)

    unexpected_list = list(nonnull_values[boolean_mapped_success_values == False])
    unexpected_index_list = list(
        nonnull_values[boolean_mapped_success_values == False].index
    )

    if "output_strftime_format" in kwargs:
        output_strftime_format = kwargs["output_strftime_format"]
        parsed_unexpected_list = []
        for val in unexpected_list:
            if val is None:
                parsed_unexpected_list.append(val)
            else:
                if isinstance(val, string_types):
                    val = parse(val)
                parsed_unexpected_list.append(
                    datetime.strftime(val, output_strftime_format)
                )
        unexpected_list = parsed_unexpected_list

    success, percent_success = self._calc_map_expectation_success(
        success_count, nonnull_count, mostly
    )

    return_obj = self._format_map_output(
        result_format,
        success,
        element_count,
        nonnull_count,
        len(unexpected_list),
        unexpected_list,
        unexpected_index_list,
    )

    # FIXME Temp fix for result format
    if func.__name__ in [
        "expect_column_values_to_not_be_null",
        "expect_column_values_to_be_null",
    ]:
        del return_obj["result"]["unexpected_percent_nonmissing"]
        try:
            del return_obj["result"]["partial_unexpected_counts"]
            del return_obj["result"]["partial_unexpected_list"]
        except KeyError:
            pass

    return return_obj
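# A small, self-contained illustration (not library code) of the masking
# pattern above: build a boolean "ignore" mask over the series, evaluate the
# expectation only on the remaining values, and recover both the unexpected
# values and their original indices. The `> 0` check is a stand-in expectation.
import numpy as np
import pandas as pd

series = pd.Series([1, None, -3, 4])
ignore_mask = series.isnull().values          # [False, True, False, False]
nonnull_values = series[~ignore_mask]          # values at indices 0, 2, 3
success_mask = (nonnull_values > 0).values     # [True, False, True]
unexpected_list = list(nonnull_values[~success_mask])               # [-3.0]
unexpected_index_list = list(nonnull_values[~success_mask].index)   # [2]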
def inner_wrapper(self, column_list, mostly=None,
                  ignore_row_if="all_values_are_missing",
                  result_format=None, *args, **kwargs):
    if result_format is None:
        result_format = self.default_expectation_args["result_format"]

    result_format = parse_result_format(result_format)

    # this is a little dangerous: expectations that specify "COMPLETE" result format and have a very
    # large number of unexpected results could hang for a long time. we should either call this out in docs
    # or put a limit on it
    if result_format["result_format"] == "COMPLETE":
        unexpected_count_limit = None
    else:
        unexpected_count_limit = result_format["partial_unexpected_count"]

    temp_df = self.spark_df.select(*column_list)  # pyspark.sql.DataFrame

    # a couple of tests indicate that caching here helps performance
    temp_df.cache()
    element_count = self.get_row_count()

    if ignore_row_if == "all_values_are_missing":
        boolean_mapped_skip_values = temp_df.select([
            *column_list,
            reduce(
                lambda a, b: a & b, [col(c).isNull() for c in column_list]
            ).alias("__null_val"),
        ])
    elif ignore_row_if == "any_value_is_missing":
        boolean_mapped_skip_values = temp_df.select([
            *column_list,
            reduce(
                lambda a, b: a | b, [col(c).isNull() for c in column_list]
            ).alias("__null_val"),
        ])
    elif ignore_row_if == "never":
        boolean_mapped_skip_values = temp_df.select(
            [*column_list, lit(False).alias("__null_val")]
        )
    else:
        raise ValueError("Unknown value of ignore_row_if: %s" % (ignore_row_if,))

    nonnull_df = boolean_mapped_skip_values.filter("__null_val = False")
    nonnull_count = nonnull_df.count()

    cols_df = nonnull_df.select(*column_list)

    success_df = func(self, cols_df, *args, **kwargs)
    success_count = success_df.filter("__success = True").count()

    unexpected_count = nonnull_count - success_count
    if unexpected_count == 0:
        maybe_limited_unexpected_list = []
    else:
        # here's an example of a place where we could do optimizations if we knew result format: see
        # comment block below
        unexpected_df = success_df.filter("__success = False")
        if unexpected_count_limit:
            unexpected_df = unexpected_df.limit(unexpected_count_limit)
        maybe_limited_unexpected_list = [
            OrderedDict((c, row[c]) for c in column_list)
            for row in unexpected_df.collect()
        ]

    if "output_strftime_format" in kwargs:
        output_strftime_format = kwargs["output_strftime_format"]
        parsed_maybe_limited_unexpected_list = []
        for val in maybe_limited_unexpected_list:
            if val is None or not all(v for k, v in val.items()):
                parsed_maybe_limited_unexpected_list.append(val)
            else:
                if all(isinstance(v, str) for k, v in val.items()):
                    val = OrderedDict((k, parse(v)) for k, v in val.items())
                parsed_maybe_limited_unexpected_list.append(
                    OrderedDict(
                        (k, datetime.strftime(v, output_strftime_format))
                        for k, v in val.items()
                    )
                )
        maybe_limited_unexpected_list = parsed_maybe_limited_unexpected_list

    success, percent_success = self._calc_map_expectation_success(
        success_count, nonnull_count, mostly
    )

    # Currently the abstraction of "result_format" that _format_column_map_output provides
    # limits some possible optimizations within the column-map decorator. It seems that either
    # this logic should be completely rolled into the processing done in the column_map decorator, or that the decorator
    # should do a minimal amount of computation agnostic of result_format, and then delegate the rest to this method.
    # In the first approach, it could make sense to put all of this decorator logic in Dataset, and then implement
    # properties that require dataset-type-dependent implementations (as is done with SparkDFDataset.row_count currently).
    # Then a new dataset type could just implement these properties/hooks and Dataset could deal with caching these and
    # with the optimizations based on result_format. A side benefit would be implementing an interface for the user
    # to get basic info about a dataset in a standardized way, e.g. my_dataset.row_count, my_dataset.columns (only for
    # tabular datasets maybe). However, it is unclear whether this is worth it or if it would conflict with optimizations
    # being done in other dataset implementations.
    return_obj = self._format_map_output(
        result_format,
        success,
        element_count,
        nonnull_count,
        unexpected_count,
        maybe_limited_unexpected_list,
        unexpected_index_list=None,
    )

    temp_df.unpersist()

    return return_obj
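# Illustrative sketch of how the `reduce` calls above fold per-column null
# checks into a single row-level mask; the DataFrame and column names are made
# up for this example.
from functools import reduce
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, None), (None, None), (3, 4)], ["a", "b"])

# "all_values_are_missing": a row is skipped only when every column is null.
all_null = reduce(lambda x, y: x & y, [col(c).isNull() for c in ["a", "b"]])
assert df.filter(all_null).count() == 1  # only the (None, None) row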