def __init__(
    self,
    data_context,
    action_list,
    name,
    base_expectation_suite_name=None,
    expectation_suite_name_suffixes=None,
    stop_on_first_error=False,
    slack_webhook=None,
    notify_on="all",
    result_format={"result_format": "SUMMARY"},
):
    super().__init__(data_context, action_list, name)

    if expectation_suite_name_suffixes is None:
        expectation_suite_name_suffixes = [".failure", ".warning"]

    self.stop_on_first_error = stop_on_first_error
    self.base_expectation_suite_name = base_expectation_suite_name

    assert len(expectation_suite_name_suffixes) == 2
    for suffix in expectation_suite_name_suffixes:
        assert isinstance(suffix, str)
    self.expectation_suite_name_suffixes = expectation_suite_name_suffixes

    self.slack_webhook = slack_webhook
    self.notify_on = notify_on

    result_format = parse_result_format(result_format)
    assert result_format["result_format"] in [
        "BOOLEAN_ONLY",
        "BASIC",
        "SUMMARY",
        "COMPLETE",
    ]
    self.result_format = result_format
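# A minimal sketch (not the library implementation) of what `parse_result_format`
# is assumed to do in the constructors above: accept either a bare string like
# "SUMMARY" or a dict, and normalize to a dict with a "result_format" key plus a
# default "partial_unexpected_count". The default of 20 is an assumption for
# illustration only.
def _parse_result_format_sketch(result_format):
    if isinstance(result_format, str):
        result_format = {"result_format": result_format}
    result_format.setdefault("partial_unexpected_count", 20)
    return result_format

# Both call styles normalize to the same shape:
assert _parse_result_format_sketch("SUMMARY") == {
    "result_format": "SUMMARY",
    "partial_unexpected_count": 20,
}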
def __init__(
    self,
    data_context,
    action_list,
    name,
    result_format={"result_format": "SUMMARY"},
):
    super().__init__()
    self.data_context = data_context
    self.name = name

    result_format = parse_result_format(result_format)
    assert result_format["result_format"] in [
        "BOOLEAN_ONLY",
        "BASIC",
        "SUMMARY",
        "COMPLETE",
    ]
    self.result_format = result_format

    self.action_list = action_list
    self.actions = OrderedDict()
    for action_config in action_list:
        assert isinstance(action_config, dict)
        # NOTE: Eugene: 2019-09-23: need a better way to validate an action config:
        if not set(action_config.keys()) == {"name", "action"}:
            raise KeyError(
                'Action config keys must be ("name", "action"). Instead got {}'.format(
                    action_config.keys()
                )
            )

        config = action_config["action"]
        module_name = "great_expectations.validation_operators"
        new_action = instantiate_class_from_config(
            config=config,
            runtime_environment={"data_context": self.data_context},
            config_defaults={"module_name": module_name},
        )
        if not new_action:
            raise ClassInstantiationError(
                module_name=module_name,
                package_name=None,
                class_name=config["class_name"],
            )
        self.actions[action_config["name"]] = new_action
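# Illustrative only: an `action_list` entry that would satisfy the validation
# loop above. Each entry must have exactly the keys "name" and "action"; the
# "action" dict is handed to `instantiate_class_from_config`, which resolves
# `class_name` within great_expectations.validation_operators by default. The
# specific class name shown is an assumption for this sketch.
example_action_list = [
    {
        "name": "store_validation_result",
        "action": {"class_name": "StoreValidationResultAction"},
    },
]
assert set(example_action_list[0].keys()) == {"name", "action"}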
def inner_wrapper(self, column, mostly=None, result_format=None, *args, **kwargs):
    if result_format is None:
        result_format = self.default_expectation_args["result_format"]

    result_format = parse_result_format(result_format)

    if result_format["result_format"] == "COMPLETE":
        warnings.warn(
            "Setting result format to COMPLETE for a SqlAlchemyDataset can be dangerous because it will not limit the number of returned results."
        )
        unexpected_count_limit = None
    else:
        unexpected_count_limit = result_format["partial_unexpected_count"]

    expected_condition = func(self, column, *args, **kwargs)

    # Added to prepare for when an ignore_values argument is added to the expectation
    ignore_values = [None]
    if func.__name__ in [
        "expect_column_values_to_not_be_null",
        "expect_column_values_to_be_null",
    ]:
        ignore_values = []
        # Counting the number of unexpected values can be expensive when there is a large
        # number of np.nan values.
        # This only happens on expect_column_values_to_not_be_null expectations.
        # Since there is no reason to look for most common unexpected values in this case,
        # we will instruct the result formatting method to skip this step.
        result_format["partial_unexpected_count"] = 0

    ignore_values_conditions = []
    if (len(ignore_values) > 0 and None not in ignore_values) or (
        len(ignore_values) > 1 and None in ignore_values
    ):
        ignore_values_conditions += [
            sa.column(column).in_(
                [val for val in ignore_values if val is not None]
            )
        ]
    if None in ignore_values:
        ignore_values_conditions += [sa.column(column).is_(None)]

    if len(ignore_values_conditions) > 1:
        ignore_values_condition = sa.or_(*ignore_values_conditions)
    elif len(ignore_values_conditions) == 1:
        ignore_values_condition = ignore_values_conditions[0]
    else:
        ignore_values_condition = sa.literal(False)

    count_query = sa.select([
        sa.func.count().label("element_count"),
        sa.func.sum(
            sa.case([(ignore_values_condition, 1)], else_=0)
        ).label("null_count"),
        sa.func.sum(
            sa.case(
                [(
                    sa.and_(
                        sa.not_(expected_condition),
                        sa.not_(ignore_values_condition),
                    ),
                    1,
                )],
                else_=0,
            )
        ).label("unexpected_count"),
    ]).select_from(self._table)

    count_results = dict(self.engine.execute(count_query).fetchone())

    # Handle case of empty table gracefully:
    if "element_count" not in count_results or count_results["element_count"] is None:
        count_results["element_count"] = 0
    if "null_count" not in count_results or count_results["null_count"] is None:
        count_results["null_count"] = 0
    if "unexpected_count" not in count_results or count_results["unexpected_count"] is None:
        count_results["unexpected_count"] = 0

    # Retrieve unexpected values
    unexpected_query_results = self.engine.execute(
        sa.select([sa.column(column)])
        .select_from(self._table)
        .where(
            sa.and_(
                sa.not_(expected_condition),
                sa.not_(ignore_values_condition),
            )
        )
        .limit(unexpected_count_limit)
    )

    nonnull_count = count_results["element_count"] - count_results["null_count"]

    if "output_strftime_format" in kwargs:
        output_strftime_format = kwargs["output_strftime_format"]
        maybe_limited_unexpected_list = []
        for x in unexpected_query_results.fetchall():
            if isinstance(x[column], string_types):
                col = parse(x[column])
            else:
                col = x[column]
            maybe_limited_unexpected_list.append(
                datetime.strftime(col, output_strftime_format)
            )
    else:
        maybe_limited_unexpected_list = [
            x[column] for x in unexpected_query_results.fetchall()
        ]

    success_count = nonnull_count - count_results["unexpected_count"]
    success, percent_success = self._calc_map_expectation_success(
        success_count, nonnull_count, mostly
    )

    return_obj = self._format_map_output(
        result_format,
        success,
        count_results["element_count"],
        nonnull_count,
        count_results["unexpected_count"],
        maybe_limited_unexpected_list,
        None,
    )

    if func.__name__ in [
        "expect_column_values_to_not_be_null",
        "expect_column_values_to_be_null",
    ]:
        # These results are unnecessary for the above expectations
        del return_obj["result"]["unexpected_percent_nonmissing"]
        del return_obj["result"]["missing_count"]
        del return_obj["result"]["missing_percent"]
        try:
            del return_obj["result"]["partial_unexpected_counts"]
            del return_obj["result"]["partial_unexpected_list"]
        except KeyError:
            pass

    return return_obj
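# For orientation only: an assumed shape of the object `_format_map_output`
# returns under the "SUMMARY" result format. The field names follow the keys
# the code above deletes (e.g. 'unexpected_percent_nonmissing'); the exact
# contents depend on the library version, so treat this as a sketch.
example_return_obj = {
    "success": False,
    "result": {
        "element_count": 100,
        "missing_count": 10,
        "missing_percent": 10.0,
        "unexpected_count": 3,
        "unexpected_percent": 3.0,
        "unexpected_percent_nonmissing": 3.3333333333333335,
        "partial_unexpected_list": ["a", "b", "c"],
        "partial_unexpected_counts": [{"value": "a", "count": 1}],
    },
}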
def inner_wrapper(self, column, mostly=None, result_format=None, *args, **kwargs):
    """
    This whole decorator is pending a rewrite. Currently there are huge
    performance issues when the number of unexpected elements gets large
    (tens of millions). Additionally, there are likely easy optimization
    opportunities in coupling result_format with how many different
    transformations are done on the dataset, as is done in sqlalchemy_dataset.
    """
    if result_format is None:
        result_format = self.default_expectation_args["result_format"]

    result_format = parse_result_format(result_format)

    # this is a little dangerous: expectations that specify "COMPLETE" result format and have a very
    # large number of unexpected results could hang for a long time. we should either call this out in docs
    # or put a limit on it
    if result_format["result_format"] == "COMPLETE":
        unexpected_count_limit = None
    else:
        unexpected_count_limit = result_format["partial_unexpected_count"]

    col_df = self.spark_df.select(column)  # pyspark.sql.DataFrame

    # a couple of tests indicate that caching here helps performance
    col_df.cache()
    element_count = self.get_row_count()

    # FIXME temporary fix for missing/ignored value
    if func.__name__ not in [
        "expect_column_values_to_not_be_null",
        "expect_column_values_to_be_null",
    ]:
        col_df = col_df.filter("{column} is not null".format(column=column))
        # these nonnull_counts are cached by SparkDFDataset
        nonnull_count = self.get_column_nonnull_count(column)
    else:
        nonnull_count = element_count

    # success_df will have columns [column, '__success']
    # this feels a little hacky, so might want to change
    success_df = func(self, col_df, *args, **kwargs)
    success_count = success_df.filter("__success = True").count()

    unexpected_count = nonnull_count - success_count
    if unexpected_count == 0:
        # save some computation time if no unexpected items
        maybe_limited_unexpected_list = []
    else:
        # here's an example of a place where we could do optimizations if we knew result format: see
        # comment block below
        unexpected_df = success_df.filter("__success = False")
        if unexpected_count_limit:
            unexpected_df = unexpected_df.limit(unexpected_count_limit)
        maybe_limited_unexpected_list = [
            row[column] for row in unexpected_df.collect()
        ]

    success, percent_success = self._calc_map_expectation_success(
        success_count, nonnull_count, mostly
    )

    # Currently the abstraction of "result_format" that _format_column_map_output provides
    # limits some possible optimizations within the column-map decorator. It seems that either
    # this logic should be completely rolled into the processing done in the column_map decorator, or that the decorator
    # should do a minimal amount of computation agnostic of result_format, and then delegate the rest to this method.
    # In the first approach, it could make sense to put all of this decorator logic in Dataset, and then implement
    # properties that require dataset-type-dependent implementations (as is done with SparkDFDataset.row_count currently).
    # Then a new dataset type could just implement these properties/hooks and Dataset could deal with caching these and
    # with the optimizations based on result_format. A side benefit would be implementing an interface for the user
    # to get basic info about a dataset in a standardized way, e.g. my_dataset.row_count, my_dataset.columns (only for
    # tabular datasets maybe). However, it is unclear whether this is worth it or if it would conflict with optimizations
    # being done in other dataset implementations.
    return_obj = self._format_map_output(
        result_format,
        success,
        element_count,
        nonnull_count,
        unexpected_count,
        maybe_limited_unexpected_list,
        unexpected_index_list=None,
    )

    # FIXME Temp fix for result format
    if func.__name__ in [
        "expect_column_values_to_not_be_null",
        "expect_column_values_to_be_null",
    ]:
        del return_obj["result"]["unexpected_percent_nonmissing"]
        try:
            del return_obj["result"]["partial_unexpected_counts"]
        except KeyError:
            pass

    col_df.unpersist()

    return return_obj
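# A hedged sketch of the contract the wrapped `func` must satisfy in the Spark
# wrapper above: given a single-column DataFrame, return a DataFrame that keeps
# the column and adds a boolean '__success' column. The expectation body below
# (non-negativity) is made up for this example, not one of the library's
# expectations.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()

def _example_expectation_body(col_df, column):
    # Mark each row as successful when the value is non-negative.
    return col_df.withColumn("__success", col(column) >= 0)

df = spark.createDataFrame([(1,), (-2,), (3,)], ["x"])
success_df = _example_expectation_body(df.select("x"), "x")
assert success_df.filter("__success = True").count() == 2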
def inner_wrapper(self, skip=None, mostly=None, null_lines_regex=r"^\s*$",
                  result_format=None, *args, **kwargs):
    try:
        f = open(self._path)
    except OSError:
        raise

    if result_format is None:
        result_format = self.default_expectation_args["result_format"]
    result_format = parse_result_format(result_format)

    lines = f.readlines()  # Read in file lines

    # Skip k initial lines designated by the user
    if skip is not None and skip <= len(lines):
        try:
            assert float(skip).is_integer()
            assert float(skip) >= 0
        except (AssertionError, ValueError):
            raise ValueError("skip must be a non-negative integer")

        for i in range(1, skip + 1):
            lines.pop(0)

    if lines:
        if null_lines_regex is not None:
            # Ignore lines that are empty or have only white space
            # ("null values" in the line-map context)
            null_lines = re.compile(null_lines_regex)
            boolean_mapped_null_lines = np.array(
                [bool(null_lines.match(line)) for line in lines]
            )
        else:
            boolean_mapped_null_lines = np.zeros(len(lines), dtype=bool)

        element_count = int(len(lines))

        if element_count > sum(boolean_mapped_null_lines):
            nonnull_lines = list(
                compress(lines, np.invert(boolean_mapped_null_lines))
            )
            nonnull_count = int((boolean_mapped_null_lines == False).sum())
            boolean_mapped_success_lines = np.array(
                func(self, _lines=nonnull_lines, *args, **kwargs)
            )
            success_count = np.count_nonzero(boolean_mapped_success_lines)
            unexpected_list = list(
                compress(nonnull_lines, np.invert(boolean_mapped_success_lines))
            )
            nonnull_lines_index = range(0, len(nonnull_lines) + 1)
            unexpected_index_list = list(
                compress(nonnull_lines_index, np.invert(boolean_mapped_success_lines))
            )
            success, percent_success = self._calc_map_expectation_success(
                success_count, nonnull_count, mostly
            )
            return_obj = self._format_map_output(
                result_format,
                success,
                element_count,
                nonnull_count,
                len(unexpected_list),
                unexpected_list,
                unexpected_index_list,
            )
        else:
            return_obj = self._format_map_output(
                result_format=result_format,
                success=None,
                element_count=element_count,
                nonnull_count=0,
                unexpected_count=0,
                unexpected_list=[],
                unexpected_index_list=[],
            )
    else:
        return_obj = self._format_map_output(
            result_format=result_format,
            success=None,
            element_count=0,
            nonnull_count=0,
            unexpected_count=0,
            unexpected_list=[],
            unexpected_index_list=[],
        )

    f.close()
    return return_obj
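# A minimal sketch, assuming the usual "mostly" semantics used throughout these
# wrappers: with no `mostly`, every non-null element must succeed; with
# `mostly`, the success fraction must meet or exceed it. This mirrors how
# `_calc_map_expectation_success` is called above, but it is not the library's
# implementation.
def _calc_map_expectation_success_sketch(success_count, nonnull_count, mostly):
    percent_success = success_count / nonnull_count if nonnull_count > 0 else None
    if mostly is not None:
        success = percent_success is not None and percent_success >= mostly
    else:
        success = success_count == nonnull_count
    return success, percent_success

# 95% success passes a mostly=0.9 threshold:
assert _calc_map_expectation_success_sketch(95, 100, 0.9) == (True, 0.95)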
def inner_wrapper(self, column_A, column_B, mostly=None,
                  ignore_row_if="both_values_are_missing",
                  result_format=None, *args, **kwargs):
    if result_format is None:
        result_format = self.default_expectation_args["result_format"]

    result_format = parse_result_format(result_format)

    # this is a little dangerous: expectations that specify "COMPLETE" result format and have a very
    # large number of unexpected results could hang for a long time. we should either call this out in docs
    # or put a limit on it
    if result_format["result_format"] == "COMPLETE":
        unexpected_count_limit = None
    else:
        unexpected_count_limit = result_format["partial_unexpected_count"]

    cols_df = self.spark_df.select(column_A, column_B).withColumn(
        "__row", monotonically_increasing_id()
    )  # pyspark.sql.DataFrame

    # a couple of tests indicate that caching here helps performance
    cols_df.cache()
    element_count = self.get_row_count()

    if ignore_row_if == "both_values_are_missing":
        boolean_mapped_null_values = cols_df.selectExpr(
            "`__row`",
            "`{0}` AS `A_{0}`".format(column_A),
            "`{0}` AS `B_{0}`".format(column_B),
            "ISNULL(`{0}`) AND ISNULL(`{1}`) AS `__null_val`".format(
                column_A, column_B
            ),
        )
    elif ignore_row_if == "either_value_is_missing":
        boolean_mapped_null_values = cols_df.selectExpr(
            "`__row`",
            "`{0}` AS `A_{0}`".format(column_A),
            "`{0}` AS `B_{0}`".format(column_B),
            "ISNULL(`{0}`) OR ISNULL(`{1}`) AS `__null_val`".format(
                column_A, column_B
            ),
        )
    elif ignore_row_if == "never":
        # selectExpr accepts only SQL expression strings, so the constant is
        # written as `false` rather than passed as a Column via lit(False)
        boolean_mapped_null_values = cols_df.selectExpr(
            "`__row`",
            "`{0}` AS `A_{0}`".format(column_A),
            "`{0}` AS `B_{0}`".format(column_B),
            "false AS `__null_val`",
        )
    else:
        raise ValueError("Unknown value of ignore_row_if: %s" % (ignore_row_if,))

    # since pyspark guarantees that each selected column has the same number of rows,
    # there is no need for the length assertion used in the pandas implementation:
    # assert series_A.count() == (
    #     series_B.count()), "Series A and B must be the same length"

    nonnull_df = boolean_mapped_null_values.filter("__null_val = False")
    nonnull_count = nonnull_df.count()

    col_A_df = nonnull_df.select("__row", "`A_{0}`".format(column_A))
    col_B_df = nonnull_df.select("__row", "`B_{0}`".format(column_B))

    success_df = func(self, col_A_df, col_B_df, *args, **kwargs)
    success_count = success_df.filter("__success = True").count()

    unexpected_count = nonnull_count - success_count
    if unexpected_count == 0:
        # save some computation time if no unexpected items
        maybe_limited_unexpected_list = []
    else:
        # here's an example of a place where we could do optimizations if we knew result format: see
        # comment block below
        unexpected_df = success_df.filter("__success = False")
        if unexpected_count_limit:
            unexpected_df = unexpected_df.limit(unexpected_count_limit)
        maybe_limited_unexpected_list = [
            (row["A_{0}".format(column_A)], row["B_{0}".format(column_B)])
            for row in unexpected_df.collect()
        ]

    if "output_strftime_format" in kwargs:
        output_strftime_format = kwargs["output_strftime_format"]
        parsed_maybe_limited_unexpected_list = []
        for val in maybe_limited_unexpected_list:
            if val is None or (val[0] is None or val[1] is None):
                parsed_maybe_limited_unexpected_list.append(val)
            else:
                if isinstance(val[0], string_types) and isinstance(val[1], string_types):
                    val = (parse(val[0]), parse(val[1]))
                parsed_maybe_limited_unexpected_list.append(
                    (
                        datetime.strftime(val[0], output_strftime_format),
                        datetime.strftime(val[1], output_strftime_format),
                    )
                )
        maybe_limited_unexpected_list = parsed_maybe_limited_unexpected_list

    success, percent_success = self._calc_map_expectation_success(
        success_count, nonnull_count, mostly
    )

    # Currently the abstraction of "result_format" that _format_column_map_output provides
    # limits some possible optimizations within the column-map decorator. It seems that either
    # this logic should be completely rolled into the processing done in the column_map decorator, or that the decorator
    # should do a minimal amount of computation agnostic of result_format, and then delegate the rest to this method.
    # In the first approach, it could make sense to put all of this decorator logic in Dataset, and then implement
    # properties that require dataset-type-dependent implementations (as is done with SparkDFDataset.row_count currently).
    # Then a new dataset type could just implement these properties/hooks and Dataset could deal with caching these and
    # with the optimizations based on result_format. A side benefit would be implementing an interface for the user
    # to get basic info about a dataset in a standardized way, e.g. my_dataset.row_count, my_dataset.columns (only for
    # tabular datasets maybe). However, it is unclear whether this is worth it or if it would conflict with optimizations
    # being done in other dataset implementations.
    return_obj = self._format_map_output(
        result_format,
        success,
        element_count,
        nonnull_count,
        unexpected_count,
        maybe_limited_unexpected_list,
        unexpected_index_list=None,
    )

    # # FIXME Temp fix for result format
    # if func.__name__ in ['expect_column_values_to_not_be_null', 'expect_column_values_to_be_null']:
    #     del return_obj['result']['unexpected_percent_nonmissing']
    #     del return_obj['result']['missing_count']
    #     del return_obj['result']['missing_percent']
    #     try:
    #         del return_obj['result']['partial_unexpected_counts']
    #     except KeyError:
    #         pass

    cols_df.unpersist()

    return return_obj
def inner_wrapper(self, column, mostly=None, result_format=None, *args, **kwargs):
    if result_format is None:
        result_format = self.default_expectation_args["result_format"]

    result_format = parse_result_format(result_format)

    if result_format["result_format"] == "COMPLETE":
        warnings.warn(
            "Setting result format to COMPLETE for a SqlAlchemyDataset can be dangerous because it will not limit the number of returned results."
        )
        unexpected_count_limit = None
    else:
        unexpected_count_limit = result_format["partial_unexpected_count"]

    expected_condition = func(self, column, *args, **kwargs)

    # FIXME Temporary Fix for counting missing values
    # Added to compensate when an ignore_values argument is added to the expectation
    ignore_values = [None]
    if func.__name__ in [
        "expect_column_values_to_not_be_null",
        "expect_column_values_to_be_null",
    ]:
        ignore_values = []
        # Counting the number of unexpected values can be expensive when there is a large
        # number of np.nan values.
        # This only happens on expect_column_values_to_not_be_null expectations.
        # Since there is no reason to look for most common unexpected values in this case,
        # we will instruct the result formatting method to skip this step.
        result_format["partial_unexpected_count"] = 0

    count_query = sa.select([
        sa.func.count().label("element_count"),
        sa.func.sum(
            sa.case(
                [(
                    sa.or_(
                        sa.column(column).in_(ignore_values),
                        # Below is necessary b/c sa.in_() uses `==` but None != None
                        # But we only consider this if None is actually in the list of ignore values
                        sa.column(column).is_(None) if None in ignore_values else False,
                    ),
                    1,
                )],
                else_=0,
            )
        ).label("null_count"),
        sa.func.sum(
            sa.case(
                [(
                    sa.and_(
                        sa.not_(expected_condition),
                        sa.case([(sa.column(column).is_(None), False)], else_=True)
                        if None in ignore_values
                        else True,
                    ),
                    1,
                )],
                else_=0,
            )
        ).label("unexpected_count"),
    ]).select_from(self._table)

    count_results = dict(self.engine.execute(count_query).fetchone())

    # Handle case of empty table gracefully:
    if "element_count" not in count_results or count_results["element_count"] is None:
        count_results["element_count"] = 0
    if "null_count" not in count_results or count_results["null_count"] is None:
        count_results["null_count"] = 0
    if "unexpected_count" not in count_results or count_results["unexpected_count"] is None:
        count_results["unexpected_count"] = 0

    # Retrieve unexpected values
    unexpected_query_results = self.engine.execute(
        sa.select([sa.column(column)])
        .select_from(self._table)
        .where(
            sa.and_(
                sa.not_(expected_condition),
                sa.or_(
                    # SA normally evaluates `== None` as `IS NULL`. However `sa.in_()`
                    # replaces `None` with `NULL` in the list and incorrectly uses `== NULL`
                    sa.case([(sa.column(column).is_(None), False)], else_=True)
                    if None in ignore_values
                    else False,
                    # Ignore any other values that are in the ignore list
                    sa.column(column).in_(ignore_values) == False,
                ),
            )
        )
        .limit(unexpected_count_limit)
    )

    nonnull_count = count_results["element_count"] - count_results["null_count"]

    if "output_strftime_format" in kwargs:
        output_strftime_format = kwargs["output_strftime_format"]
        maybe_limited_unexpected_list = []
        for x in unexpected_query_results.fetchall():
            if isinstance(x[column], string_types):
                col = parse(x[column])
            else:
                col = x[column]
            maybe_limited_unexpected_list.append(
                datetime.strftime(col, output_strftime_format)
            )
    else:
        maybe_limited_unexpected_list = [
            x[column] for x in unexpected_query_results.fetchall()
        ]

    success_count = nonnull_count - count_results["unexpected_count"]
    success, percent_success = self._calc_map_expectation_success(
        success_count, nonnull_count, mostly
    )

    return_obj = self._format_map_output(
        result_format,
        success,
        count_results["element_count"],
        nonnull_count,
        count_results["unexpected_count"],
        maybe_limited_unexpected_list,
        None,
    )

    if func.__name__ in [
        "expect_column_values_to_not_be_null",
        "expect_column_values_to_be_null",
    ]:
        # These results are unnecessary for the above expectations
        del return_obj["result"]["unexpected_percent_nonmissing"]
        try:
            del return_obj["result"]["partial_unexpected_counts"]
            del return_obj["result"]["partial_unexpected_list"]
        except KeyError:
            pass

    return return_obj
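# For intuition only: the conditional-aggregation pattern above can be built and
# rendered with a stand-in column and condition. This follows the SQLAlchemy 1.x
# list-style `case()` call used in the code above; the table and column names
# are made up for the sketch, and the rendered SQL varies by dialect.
import sqlalchemy as sa

column = sa.column("x")          # hypothetical column
expected_condition = column > 0  # hypothetical expectation condition

count_query_sketch = sa.select([
    sa.func.count().label("element_count"),
    sa.func.sum(sa.case([(column.is_(None), 1)], else_=0)).label("null_count"),
    sa.func.sum(
        sa.case(
            [(sa.and_(sa.not_(expected_condition), column.isnot(None)), 1)],
            else_=0,
        )
    ).label("unexpected_count"),
]).select_from(sa.table("my_table"))

# Prints roughly:
#   SELECT count(*) AS element_count,
#          sum(CASE WHEN (x IS NULL) THEN 1 ELSE 0 END) AS null_count,
#          sum(CASE WHEN (x <= 0 AND x IS NOT NULL) THEN 1 ELSE 0 END) AS unexpected_count
#   FROM my_table
print(count_query_sketch)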
def inner_wrapper(self, column, mostly=None, result_format=None, *args, **kwargs):
    if result_format is None:
        result_format = self.default_expectation_args["result_format"]

    result_format = parse_result_format(result_format)

    # FIXME temporary fix for missing/ignored value
    ignore_values = [None, np.nan]
    if func.__name__ in [
        "expect_column_values_to_not_be_null",
        "expect_column_values_to_be_null",
    ]:
        ignore_values = []
        # Counting the number of unexpected values can be expensive when there is a large
        # number of np.nan values.
        # This only happens on expect_column_values_to_not_be_null expectations.
        # Since there is no reason to look for most common unexpected values in this case,
        # we will instruct the result formatting method to skip this step.
        result_format["partial_unexpected_count"] = 0

    series = self[column]

    # FIXME rename to mapped_ignore_values?
    if len(ignore_values) == 0:
        boolean_mapped_null_values = np.array([False for value in series])
    else:
        boolean_mapped_null_values = np.array(
            [
                True if (value in ignore_values) or (pd.isnull(value)) else False
                for value in series
            ]
        )

    element_count = int(len(series))

    # FIXME rename nonnull to non_ignored?
    nonnull_values = series[boolean_mapped_null_values == False]
    nonnull_count = int((boolean_mapped_null_values == False).sum())

    boolean_mapped_success_values = func(self, nonnull_values, *args, **kwargs)
    success_count = np.count_nonzero(boolean_mapped_success_values)

    unexpected_list = list(nonnull_values[boolean_mapped_success_values == False])
    unexpected_index_list = list(
        nonnull_values[boolean_mapped_success_values == False].index
    )

    if "output_strftime_format" in kwargs:
        output_strftime_format = kwargs["output_strftime_format"]
        parsed_unexpected_list = []
        for val in unexpected_list:
            if val is None:
                parsed_unexpected_list.append(val)
            else:
                if isinstance(val, string_types):
                    val = parse(val)
                parsed_unexpected_list.append(
                    datetime.strftime(val, output_strftime_format)
                )
        unexpected_list = parsed_unexpected_list

    success, percent_success = self._calc_map_expectation_success(
        success_count, nonnull_count, mostly
    )

    return_obj = self._format_map_output(
        result_format,
        success,
        element_count,
        nonnull_count,
        len(unexpected_list),
        unexpected_list,
        unexpected_index_list,
    )

    # FIXME Temp fix for result format
    if func.__name__ in [
        "expect_column_values_to_not_be_null",
        "expect_column_values_to_be_null",
    ]:
        del return_obj["result"]["unexpected_percent_nonmissing"]
        try:
            del return_obj["result"]["partial_unexpected_counts"]
            del return_obj["result"]["partial_unexpected_list"]
        except KeyError:
            pass

    return return_obj
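# A small, self-contained illustration (not library code) of the masking
# pattern above: build a boolean "ignore" mask over the series, evaluate the
# expectation only on the remaining values, and recover both the unexpected
# values and their original indices. The `> 0` check is a stand-in expectation.
import numpy as np
import pandas as pd

series = pd.Series([1, None, -3, 4])
ignore_mask = series.isnull().values          # [False, True, False, False]
nonnull_values = series[~ignore_mask]          # values at indices 0, 2, 3
success_mask = (nonnull_values > 0).values     # [True, False, True]
unexpected_list = list(nonnull_values[~success_mask])               # [-3.0]
unexpected_index_list = list(nonnull_values[~success_mask].index)   # [2]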
def inner_wrapper(self, column_list, mostly=None,
                  ignore_row_if="all_values_are_missing",
                  result_format=None, *args, **kwargs):
    if result_format is None:
        result_format = self.default_expectation_args["result_format"]

    result_format = parse_result_format(result_format)

    # this is a little dangerous: expectations that specify "COMPLETE" result format and have a very
    # large number of unexpected results could hang for a long time. we should either call this out in docs
    # or put a limit on it
    if result_format["result_format"] == "COMPLETE":
        unexpected_count_limit = None
    else:
        unexpected_count_limit = result_format["partial_unexpected_count"]

    temp_df = self.spark_df.select(*column_list)  # pyspark.sql.DataFrame

    # a couple of tests indicate that caching here helps performance
    temp_df.cache()
    element_count = self.get_row_count()

    if ignore_row_if == "all_values_are_missing":
        boolean_mapped_skip_values = temp_df.select([
            *column_list,
            reduce(
                lambda a, b: a & b, [col(c).isNull() for c in column_list]
            ).alias("__null_val"),
        ])
    elif ignore_row_if == "any_value_is_missing":
        boolean_mapped_skip_values = temp_df.select([
            *column_list,
            reduce(
                lambda a, b: a | b, [col(c).isNull() for c in column_list]
            ).alias("__null_val"),
        ])
    elif ignore_row_if == "never":
        boolean_mapped_skip_values = temp_df.select(
            [*column_list, lit(False).alias("__null_val")]
        )
    else:
        raise ValueError("Unknown value of ignore_row_if: %s" % (ignore_row_if,))

    nonnull_df = boolean_mapped_skip_values.filter("__null_val = False")
    nonnull_count = nonnull_df.count()

    cols_df = nonnull_df.select(*column_list)

    success_df = func(self, cols_df, *args, **kwargs)
    success_count = success_df.filter("__success = True").count()

    unexpected_count = nonnull_count - success_count
    if unexpected_count == 0:
        maybe_limited_unexpected_list = []
    else:
        # here's an example of a place where we could do optimizations if we knew result format: see
        # comment block below
        unexpected_df = success_df.filter("__success = False")
        if unexpected_count_limit:
            unexpected_df = unexpected_df.limit(unexpected_count_limit)
        maybe_limited_unexpected_list = [
            OrderedDict((c, row[c]) for c in column_list)
            for row in unexpected_df.collect()
        ]

    if "output_strftime_format" in kwargs:
        output_strftime_format = kwargs["output_strftime_format"]
        parsed_maybe_limited_unexpected_list = []
        for val in maybe_limited_unexpected_list:
            if val is None or not all(v for k, v in val.items()):
                parsed_maybe_limited_unexpected_list.append(val)
            else:
                if all(isinstance(v, str) for k, v in val.items()):
                    val = OrderedDict((k, parse(v)) for k, v in val.items())
                parsed_maybe_limited_unexpected_list.append(
                    OrderedDict(
                        (k, datetime.strftime(v, output_strftime_format))
                        for k, v in val.items()
                    )
                )
        maybe_limited_unexpected_list = parsed_maybe_limited_unexpected_list

    success, percent_success = self._calc_map_expectation_success(
        success_count, nonnull_count, mostly
    )

    # Currently the abstraction of "result_format" that _format_column_map_output provides
    # limits some possible optimizations within the column-map decorator. It seems that either
    # this logic should be completely rolled into the processing done in the column_map decorator, or that the decorator
    # should do a minimal amount of computation agnostic of result_format, and then delegate the rest to this method.
    # In the first approach, it could make sense to put all of this decorator logic in Dataset, and then implement
    # properties that require dataset-type-dependent implementations (as is done with SparkDFDataset.row_count currently).
    # Then a new dataset type could just implement these properties/hooks and Dataset could deal with caching these and
    # with the optimizations based on result_format. A side benefit would be implementing an interface for the user
    # to get basic info about a dataset in a standardized way, e.g. my_dataset.row_count, my_dataset.columns (only for
    # tabular datasets maybe). However, it is unclear whether this is worth it or if it would conflict with optimizations
    # being done in other dataset implementations.
    return_obj = self._format_map_output(
        result_format,
        success,
        element_count,
        nonnull_count,
        unexpected_count,
        maybe_limited_unexpected_list,
        unexpected_index_list=None,
    )

    temp_df.unpersist()

    return return_obj
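# Illustrative sketch of how the `reduce` calls above fold per-column null
# checks into a single row-level mask; the DataFrame and column names are made
# up for this example.
from functools import reduce
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, None), (None, None), (3, 4)], ["a", "b"])

# "all_values_are_missing": a row is skipped only when every column is null.
all_null = reduce(lambda x, y: x & y, [col(c).isNull() for c in ["a", "b"]])
assert df.filter(all_null).count() == 1  # only the (None, None) row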