def test_catch_exceptions_with_bad_expectation_type():
    # We want to catch degenerate cases where an expectation suite is incompatible with the dataset
    my_df = PandasDataset({"x": range(10)})
    my_df._expectation_suite.append_expectation(
        ExpectationConfiguration(expectation_type="foobar", kwargs={})
    )
    result = my_df.validate(catch_exceptions=True)

    # Find the foobar result
    idx = 0
    for idx, val_result in enumerate(result.results):
        if val_result.expectation_config.expectation_type == "foobar":
            break

    assert result.results[idx].success is False
    assert result.results[idx].expectation_config.expectation_type == "foobar"
    assert result.results[idx].expectation_config.kwargs == {}
    assert result.results[idx].exception_info["raised_exception"] is True
    assert (
        "AttributeError: 'PandasDataset' object has no attribute 'foobar'"
        in result.results[idx].exception_info["exception_traceback"]
    )

    with pytest.raises(AttributeError):
        result = my_df.validate(catch_exceptions=False)
def calculate_integrity(self, df_ge: PandasDataset, specs: SchemaParserResult) -> dict:
    """
    Calculates the integrity from the defined types and the expectations.
    """

    def get_unexpected(ge_result):
        return ge_result['unexpected_count'] if 'unexpected_count' in ge_result else 0

    def merge_dicts(d1, d2):
        for key, value in d2.items():
            for inner_value in value:
                d1[key].append(inner_value)
        return d1

    all_elements = defaultdict(list)
    invalid_elements = defaultdict(list)
    for definition in specs.type_definitions:
        result = df_ge.expect_column_to_exist(definition)
        if not result.success:
            if definition in specs.required_types:
                # only counts as an error if the attribute is required
                invalid_elements[definition].append(df_ge.shape[0])
            all_elements[definition].append(df_ge.shape[0])
            continue

        # check missing values
        result = df_ge.expect_column_values_to_not_be_null(definition)
        if definition in specs.required_types:
            # only counts as an error if the attribute is required
            invalid_elements[definition].append(get_unexpected(result.result))
        all_elements[definition].append(result.result['element_count'])

        # check incorrect types
        type_specification = TypeSpecification.create(
            specs.type_definitions.get(definition))
        type_list = [t.__name__ for t in type_specification.get_types()]
        # noinspection PyTypeChecker
        result = df_ge.expect_column_values_to_be_in_type_list(definition, type_list)
        invalid_elements[definition].append(get_unexpected(result.result))

    # handle attributes that are not specified
    not_specified_fields = set(df_ge.columns) - set(specs.type_definitions)
    if len(not_specified_fields) > 0:
        for attribute in not_specified_fields:
            result = df_ge.expect_column_values_to_be_null(attribute)
            # integrity of not specified fields has been defined as 1 - so we add 0 to unexpected
            invalid_elements[attribute].append(0)
            all_elements[attribute].append(get_unexpected(result.result))

    # check expectations
    expectation_violations = self.validate_expectations(df_ge, specs)
    merge_dicts(invalid_elements, expectation_violations)

    # flatten attribute metrics
    integrity_details = dict()
    for k, v in invalid_elements.items():
        integrity_details[k] = 1 - (np.sum(v) / sum(all_elements[k]))
    return integrity_details
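# Hedged sketch (not from the original source): a toy illustration of the per-attribute
# integrity formula used above, i.e. 1 - sum(invalid counts) / sum(total counts).
# The attribute name "age" and the counts are made up for illustration.
import numpy as np
from collections import defaultdict

toy_invalid = defaultdict(list, {"age": [1, 1]})   # e.g. 1 null + 1 type violation
toy_all = defaultdict(list, {"age": [8]})          # 8 observed values in total
toy_integrity = {k: 1 - (np.sum(v) / sum(toy_all[k])) for k, v in toy_invalid.items()}
assert toy_integrity == {"age": 0.75}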
def test_pandas_column_map_decorator_partial_exception_counts(self):
    df = PandasDataset({'a': [0, 1, 2, 3, 4]})
    out = df.expect_column_values_to_be_between(
        'a', 3, 4,
        result_format={
            'result_format': 'COMPLETE',
            'partial_unexpected_count': 1
        })

    self.assertEqual(1, len(out['result']['partial_unexpected_counts']))
    self.assertEqual(3, len(out['result']['unexpected_list']))
def test_pandas_column_map_decorator_partial_exception_counts():
    df = PandasDataset({"a": [0, 1, 2, 3, 4]})
    out = df.expect_column_values_to_be_between(
        "a", 3, 4,
        result_format={
            "result_format": "COMPLETE",
            "partial_unexpected_count": 1
        },
    )

    assert 1 == len(out.result["partial_unexpected_counts"])
    assert 3 == len(out.result["unexpected_list"])
def credit_profiler(ds: PandasDataset) -> ExpectationSuite:
    # simple checks on data consistency
    ds.expect_column_values_to_be_between(
        "credit_card_due",
        min_value=0,
        mostly=0.99,  # allow some outliers
    )

    ds.expect_column_values_to_be_between(
        "missed_payments_1y",
        min_value=0,
        max_value=5,
        mostly=0.99,  # allow some outliers
    )

    return ds.get_expectation_suite()
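# Hedged usage sketch (not part of the original snippet): applying credit_profiler above to a
# small toy DataFrame. The column values are made up; how the resulting suite is used
# afterwards (e.g. attached to a feature view) is left to the caller.
import pandas as pd
from great_expectations.dataset import PandasDataset

toy_df = pd.DataFrame({"credit_card_due": [0, 150, 320], "missed_payments_1y": [0, 1, 2]})
toy_suite = credit_profiler(PandasDataset(toy_df))
print([e.expectation_type for e in toy_suite.expectations])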
def get_dataset(dataset_type, data):
    """For Pandas, data should be either a DataFrame or a dictionary that can
    be instantiated as a DataFrame.

    For SQL, data should have the following shape:
        {
            'table': SqlAlchemy Table object,
            named_column: [list of values]
        }
    """
    if dataset_type == 'PandasDataset':
        return PandasDataset(data)
    elif dataset_type == 'SqlAlchemyDataset':
        # Create a new database
        engine = create_engine('sqlite://')

        # Add the data to the database as a new table
        df = pd.DataFrame(data)
        df.to_sql(name='test_data', con=engine, index=False)

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset('test_data', engine=engine)
    else:
        raise ValueError("Unknown dataset_type " + str(dataset_type))
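# Hedged usage example (assumes the same module scope as get_dataset above; the column
# values are made up for illustration):
pandas_ds = get_dataset('PandasDataset', {'x': [1, 2, 3]})
print(pandas_ds.expect_column_to_exist('x'))

sql_ds = get_dataset('SqlAlchemyDataset', {'x': [1, 2, 3]})
print(sql_ds.expect_column_to_exist('x'))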
def validate_expectations(cls, df_ge: PandasDataset,
                          specs: SchemaParserResult) -> defaultdict[list]:
    """
    Validates the dynamic expectations from the schema via the great expectations library.
    """
    invalid_elements = defaultdict(list)
    suite = ExpectationSuite(expectation_suite_name="custom_specifications")
    for column in specs.expectation_definitions.keys():
        for expectation in specs.expectation_definitions[column]:
            kwargs_extended = dict(expectation['kwargs'])
            kwargs_extended['column'] = column
            suite.append_expectation(
                ExpectationConfiguration(
                    expectation_type=expectation['expectation_type'],
                    kwargs=kwargs_extended))

    # noinspection PyTypeChecker
    result = df_ge.validate(expectation_suite=suite, result_format="BASIC")

    for expectation_result in result.results:
        if expectation_result.exception_info['raised_exception']:
            continue
        column_name = expectation_result.expectation_config.kwargs["column"]
        n_invalid = expectation_result.result['unexpected_count']
        invalid_elements[column_name].append(n_invalid)

    return invalid_elements
def _prepare_dataset(dataset: PandasDataset) -> PandasDataset:
    dataset_copy = dataset.copy(deep=True)

    for column in dataset.columns:
        if dataset.expect_column_values_to_be_in_type_list(
            column, type_list=sorted(list(ProfilerTypeMapping.DATETIME_TYPE_NAMES))
        ).success:
            # GE cannot parse Timestamp or other pandas datetime types
            dataset_copy[column] = dataset[column].dt.strftime("%Y-%m-%dT%H:%M:%S")

        if dataset[column].dtype == np.float32:
            # GE converts expectation arguments into native Python floats.
            # This could cause errors on comparison => so better to convert to double prematurely.
            dataset_copy[column] = dataset[column].astype(np.float64)

    return dataset_copy
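# Hedged illustration (not from the original source) of why the float32 -> float64 cast above
# matters: float32(0.1) is slightly different from the native Python float 0.1, so a tight
# bound expressed as a Python float may fail purely due to precision.
import numpy as np
import pandas as pd
from great_expectations.dataset import PandasDataset

float_ds = PandasDataset(pd.DataFrame({"x": np.array([0.1], dtype=np.float32)}))
# float32(0.1) upcasts to ~0.10000000149, which is strictly greater than the float64 literal 0.1,
# so this bound check is expected to fail on the uncast column:
print(float_ds.expect_column_values_to_be_between("x", min_value=0.0, max_value=0.1).success)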
def nulls_dataset():
    df = pd.DataFrame({
        "mostly_null": [i if i % 3 == 0 else None for i in range(0, 1000)],
        "mostly_not_null": [None if i % 3 == 0 else i for i in range(0, 1000)],
    })
    batch_df = PandasDataset(df)

    return batch_df
def udf(df: pd.DataFrame) -> pd.Series:
    from datadog.dogstatsd import DogStatsd

    reporter = (DogStatsd(
        host=os.environ["STATSD_HOST"],
        port=int(os.environ["STATSD_PORT"]),
        telemetry_min_flush_interval=0,
    ) if os.getenv("STATSD_HOST") and os.getenv("STATSD_PORT") else DogStatsd())

    ds = PandasDataset.from_dataset(df)
    result = ds.validate(expectations, result_format="COMPLETE")
    valid_rows = pd.Series([True] * df.shape[0])

    for check in result.results:
        if check.exception_info["raised_exception"]:
            # ToDo: probably we should mark all rows as invalid
            continue

        check_kwargs = check.expectation_config.kwargs
        check_kwargs.pop("result_format", None)
        check_name = "_".join([check.expectation_config.expectation_type] + [
            str(v) for v in check_kwargs.values()
            if isinstance(v, (str, int, float))
        ])

        if ("unexpected_count" in check.result
                and check.result["unexpected_count"] > 0):
            reporter.increment(
                "feast_feature_validation_check_failed",
                value=check.result["unexpected_count"],
                tags=[
                    f"feature_table:{os.getenv('FEAST_INGESTION_FEATURE_TABLE', 'unknown')}",
                    f"project:{os.getenv('FEAST_INGESTION_PROJECT_NAME', 'default')}",
                    f"check:{check_name}",
                ],
            )

            valid_rows.iloc[check.result["unexpected_index_list"]] = False

        elif "observed_value" in check.result and check.result["observed_value"]:
            reporter.gauge(
                "feast_feature_validation_observed_value",
                value=int(check.result["observed_value"] *
                          100)  # storing as decimal with precision 2
                if not check.success else 0,  # nullify everything below threshold
                tags=[
                    f"feature_table:{os.getenv('FEAST_INGESTION_FEATURE_TABLE', 'unknown')}",
                    f"project:{os.getenv('FEAST_INGESTION_PROJECT_NAME', 'default')}",
                    f"check:{check_name}",
                ],
            )

    return valid_rows
def test_dataset_from_pandas_source(tmpdir):
    data_file = tmpdir + '/data.json'
    json_data = [
        {"name": "my name", "birthdate": "2020-10-01", "address": "1234 Main st", "size": 12},
        {"name": "your name", "birthdate": "2020-06-01", "address": "1313 Mockingbird Ln", "size": 12}
    ]
    with open(data_file, mode='w') as out:
        json.dump(json_data, out)

    store_defaults = FilesystemStoreBackendDefaults(root_directory=tmpdir)
    project_config.stores = store_defaults.stores
    project_config.expectations_store_name = store_defaults.expectations_store_name
    project_config.validations_store_name = store_defaults.validations_store_name
    project_config.checkpoint_store_name = store_defaults.checkpoint_store_name

    ctx = BaseDataContext(project_config=project_config)
    pd_dataset = PandasDataset(pandas.read_json(data_file),
                               **{'batch_kwargs': {'path': 'gcs://my_bucket/path/to/my/data'},
                                  'data_context': ctx})

    action = OpenLineageValidationAction(ctx,
                                         openlineage_host='http://localhost:5000',
                                         openlineage_namespace='test_ns',
                                         job_name='test_job')
    datasets = action._fetch_datasets_from_pandas_source(pd_dataset,
                                                         validation_result_suite=result_suite)
    assert len(datasets) == 1
    input_ds = datasets[0]
    assert input_ds.name == '/path/to/my/data'
    assert input_ds.namespace == "gcs://my_bucket"

    assert "dataSource" in input_ds.facets
    assert input_ds.facets["dataSource"].name == "gcs://my_bucket"
    assert input_ds.facets["dataSource"].uri == 'gcs://my_bucket'

    assert 'schema' in input_ds.facets
    assert len(input_ds.facets['schema'].fields) == 4
    assert all(f in input_ds.facets['schema'].fields
               for f in [SchemaField('name', 'object'),
                         SchemaField('birthdate', 'object'),
                         SchemaField('address', 'object'),
                         SchemaField('size', 'int64')])

    assert len(input_ds.inputFacets) == 3
    assert all(k in input_ds.inputFacets
               for k in ['dataQuality', 'greatExpectations_assertions', 'dataQualityMetrics'])

    assert input_ds.inputFacets['dataQuality'].rowCount == 10
    assert 'size' in input_ds.inputFacets['dataQuality'].columnMetrics
    assert input_ds.inputFacets['dataQuality'].columnMetrics['size'].sum == 60

    assert len(input_ds.inputFacets['greatExpectations_assertions'].assertions) == 2
    assert all(a in input_ds.inputFacets['greatExpectations_assertions'].assertions
               for a in [GreatExpectationsAssertion('expect_table_row_count_to_equal', True),
                         GreatExpectationsAssertion('expect_column_sum_to_be_between', True, 'size')])
def create_suite():
    df = pd.DataFrame()
    df['num'] = np.random.randint(0, 10, 100)
    df['num2'] = np.random.randint(0, 20, 100)

    ds = PandasDataset.from_dataset(df)
    ds.expect_column_values_to_be_between('num', 0, 10)
    ds.expect_column_values_to_be_between('num2', 0, 20)
    return ds.get_expectation_suite()
def profiler_with_unrealistic_expectations(
        dataset: PandasDataset) -> ExpectationSuite:
    # need to create dataframe with corrupted data first
    df = pd.DataFrame()
    df["current_balance"] = [-100]
    df["avg_passenger_count"] = [0]

    other_ds = PandasDataset(df)
    other_ds.expect_column_max_to_be_between("current_balance", -1000, -100)
    other_ds.expect_column_values_to_be_in_set("avg_passenger_count", value_set={0})

    # this should pass
    other_ds.expect_column_min_to_be_between("avg_passenger_count", 0, 1000)

    return other_ds.get_expectation_suite()
def validate(df) -> pd.Series:
    ds = PandasDataset.from_dataset(df)
    # print(ds, ds.shape)
    result = ds.validate(suite, result_format='COMPLETE')
    valid_rows = pd.Series([True] * ds.shape[0])
    # print(result)

    for check in result.results:
        if check.success:
            continue

        valid_rows.iloc[check.result['unexpected_index_list']] = False

    return valid_rows
def cardinality_dataset():
    df = pd.DataFrame({
        "col_none": [None for i in range(0, 1000)],
        "col_one": [0 for i in range(0, 1000)],
        "col_two": [i % 2 for i in range(0, 1000)],
        "col_very_few": [i % 10 for i in range(0, 1000)],
        "col_few": [i % 50 for i in range(0, 1000)],
        "col_many": [i % 100 for i in range(0, 1000)],
        "col_very_many": [i % 500 for i in range(0, 1000)],
        "col_unique": [i for i in range(0, 1000)],
    })
    batch_df = PandasDataset(df)

    return batch_df
def analyze_dataset(self, df: pd.DataFrame) -> Profile:
    """
    Generate GEProfile with ExpectationSuite (set of expectations) from a given pandas
    dataframe by applying a user defined profiler.

    Some fixes are also applied to the dataset (see _prepare_dataset function) to make it
    compatible with GE.

    Returns GEProfile
    """
    dataset = PandasDataset(df)

    dataset = _prepare_dataset(dataset)

    return GEProfile(expectation_suite=self.user_defined_profiler(dataset))
def udf(df: pd.DataFrame) -> pd.Series:
    ds = PandasDataset.from_dataset(df)
    result = ds.validate(expectations, result_format="COMPLETE")
    valid_rows = pd.Series([True] * df.shape[0])

    for check in result.results:
        if check.success:
            continue

        if check.exception_info["raised_exception"]:
            # ToDo: probably we should mark all rows as invalid
            continue

        valid_rows.iloc[check.result["unexpected_index_list"]] = False

    return valid_rows
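# Hedged usage sketch (not part of the original snippet): the `expectations` suite referenced
# by udf above is assumed to be built beforehand, e.g. from a training frame, and the udf is
# then applied to a serving frame to get a per-row validity mask. Column names are made up.
import pandas as pd
from great_expectations.dataset import PandasDataset

train_ds = PandasDataset(pd.DataFrame({"num": [1, 2, 3]}))
train_ds.expect_column_values_to_be_between("num", 0, 100)
expectations = train_ds.get_expectation_suite()

print(udf(pd.DataFrame({"num": [5, 500]})))  # expected mask: [True, False]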
def validate(self, df: pd.DataFrame) -> "GEValidationReport":
    """
    Validate the provided dataframe against the GE expectation suite.

    1. The pandas dataframe is converted into a PandasDataset (GE type)
    2. Some fixes are applied to the data to avoid crashes inside GE (see _prepare_dataset)
    3. Each expectation from the ExpectationSuite instance is tested against the resulting dataset

    Returns GEValidationReport, which parses great expectation's schema into a list of
    generic ValidationErrors.
    """
    dataset = PandasDataset(df)

    dataset = _prepare_dataset(dataset)

    results = ge.validate(
        dataset, expectation_suite=self.expectation_suite, result_format="COMPLETE"
    )
    return GEValidationReport(results)
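# Hedged end-to-end sketch (class and constructor names are assumptions, not confirmed by the
# snippets above): profile a training frame with analyze_dataset, then validate a serving
# frame with the resulting profile.
import pandas as pd

def toy_profiler(ds):
    ds.expect_column_values_to_be_between("num", 0, 100)
    return ds.get_expectation_suite()

# Assumes GEProfiler takes the user defined profiler in its constructor and that the
# GEProfile returned by analyze_dataset exposes the validate() method shown above.
profile = GEProfiler(user_defined_profiler=toy_profiler).analyze_dataset(
    pd.DataFrame({"num": [1, 2, 3]}))
report = profile.validate(pd.DataFrame({"num": [5, 500]}))
print(report)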
def compare_attributes_with_schema(
        self, samples: list,
        schema_definition: SchemaDefinition) -> (dict, dict):
    """
    Calculates attribute integrity and specification of the samples.
    """
    specs = self.parse_schema(schema_definition)

    df_normalized = json_normalize(samples, sep="/")
    df_ge = PandasDataset(df_normalized)

    # calculate integrity
    integrity_details = self.calculate_integrity(df_ge, specs)

    # calculate specification
    specification_details = self.calculate_specification(df_ge, specs)

    return integrity_details, specification_details
def test_config_with_not_null_only(possible_expectations_set):
    """
    What does this test do and why?
    Confirms that the not_null_only key in config works as expected.
    """
    excluded_expectations = [i for i in possible_expectations_set if "null" not in i]

    df = pd.DataFrame(
        {
            "mostly_null": [i if i % 3 == 0 else None for i in range(0, 1000)],
            "mostly_not_null": [None if i % 3 == 0 else i for i in range(0, 1000)],
        }
    )
    batch_df = PandasDataset(df)

    profiler_without_not_null_only = UserConfigurableProfiler(
        batch_df, excluded_expectations, not_null_only=False
    )
    suite_without_not_null_only = profiler_without_not_null_only.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(
        suite_without_not_null_only
    )
    assert expectations == {
        "expect_column_values_to_be_null",
        "expect_column_values_to_not_be_null",
    }

    profiler_with_not_null_only = UserConfigurableProfiler(
        batch_df, excluded_expectations, not_null_only=True
    )
    not_null_only_suite = profiler_with_not_null_only.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(
        not_null_only_suite
    )
    assert expectations == {"expect_column_values_to_not_be_null"}

    no_config_profiler = UserConfigurableProfiler(batch_df)
    no_config_suite = no_config_profiler.build_suite()
    _, expectations = get_set_of_columns_and_expectations_from_suite(no_config_suite)
    assert "expect_column_values_to_be_null" in expectations
def test_pandas_datetime_evaluation_parameter():
    evaluation_parameters = {
        "now": pd.Timestamp.now(),
        "now_minus_48h": pd.Timestamp.now() - pd.to_timedelta(2, unit="d"),
    }

    test_data = {
        "data_refresh": [
            pd.Timestamp.now(),
            (pd.Timestamp.now() - pd.to_timedelta(1, unit="d")),
        ]
    }
    _df = pd.DataFrame(test_data)
    df = PandasDataset(_df)

    for param in evaluation_parameters:
        df.set_evaluation_parameter(param, evaluation_parameters[param])

    df.expect_column_max_to_be_between(
        column="data_refresh", min_value={"$PARAMETER": "now_minus_48h"})

    result = df.validate()
    assert result.success
def get_dataset(dataset_type, data, schemas=None, profiler=ColumnsExistProfiler, caching=True):
    """Utility to create datasets for json-formatted tests."""
    df = pd.DataFrame(data)
    if dataset_type == 'PandasDataset':
        if schemas and "pandas" in schemas:
            schema = schemas["pandas"]
            pandas_schema = {}
            for (key, value) in schema.items():
                # Note, these are just names used in our internal schemas to build datasets *for internal tests*
                # Further, some changes in pandas internal about how datetimes are created means to support pandas
                # pre- 0.25, we need to explicitly specify when we want timezone.

                # We will use timestamp for timezone-aware (UTC only) dates in our tests
                if value.lower() in ["timestamp", "datetime64[ns, tz]"]:
                    df[key] = pd.to_datetime(df[key], utc=True)
                    continue
                elif value.lower() in ["datetime", "datetime64", "datetime64[ns]"]:
                    df[key] = pd.to_datetime(df[key])
                    continue
                try:
                    type_ = np.dtype(value)
                except TypeError:
                    type_ = getattr(pd.core.dtypes.dtypes, value)
                    # If this raises AttributeError it's okay: it means someone built a bad test
                pandas_schema[key] = type_
            # pandas_schema = {key: np.dtype(value) for (key, value) in schemas["pandas"].items()}
            df = df.astype(pandas_schema)
        return PandasDataset(df, profiler=profiler, caching=caching)

    elif dataset_type == "sqlite":
        from sqlalchemy import create_engine
        engine = create_engine('sqlite://')
        conn = engine.connect()
        # Add the data to the database as a new table
        sql_dtypes = {}
        if schemas and "sqlite" in schemas and isinstance(engine.dialect, sqlitetypes.dialect):
            schema = schemas["sqlite"]
            sql_dtypes = {col: SQLITE_TYPES[dtype] for (col, dtype) in schema.items()}
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast='signed')
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        tablename = "test_data_" + ''.join([
            random.choice(string.ascii_letters + string.digits) for n in range(8)
        ])
        df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes)

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(tablename, engine=conn, profiler=profiler, caching=caching)

    elif dataset_type == 'postgresql':
        from sqlalchemy import create_engine
        # Create a new database
        engine = create_engine('postgresql://postgres@localhost/test_ci')
        conn = engine.connect()
        sql_dtypes = {}
        if schemas and "postgresql" in schemas and isinstance(engine.dialect, postgresqltypes.dialect):
            schema = schemas["postgresql"]
            sql_dtypes = {col: POSTGRESQL_TYPES[dtype] for (col, dtype) in schema.items()}
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast='signed')
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        tablename = "test_data_" + ''.join([
            random.choice(string.ascii_letters + string.digits) for n in range(8)
        ])
        df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes)

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(tablename, engine=conn, profiler=profiler, caching=caching)

    elif dataset_type == 'mysql':
        engine = create_engine('mysql://root@localhost/test_ci')
        conn = engine.connect()
        sql_dtypes = {}
        if schemas and "mysql" in schemas and isinstance(engine.dialect, mysqltypes.dialect):
            schema = schemas["mysql"]
            sql_dtypes = {col: MYSQL_TYPES[dtype] for (col, dtype) in schema.items()}
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast='signed')
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        tablename = "test_data_" + ''.join([
            random.choice(string.ascii_letters + string.digits) for n in range(8)
        ])
        df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes)

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(tablename, engine=conn, profiler=profiler, caching=caching)

    elif dataset_type == 'SparkDFDataset':
        from pyspark.sql import SparkSession
        import pyspark.sql.types as sparktypes

        SPARK_TYPES = {
            "StringType": sparktypes.StringType,
            "IntegerType": sparktypes.IntegerType,
            "LongType": sparktypes.LongType,
            "DateType": sparktypes.DateType,
            "TimestampType": sparktypes.TimestampType,
            "FloatType": sparktypes.FloatType,
            "DoubleType": sparktypes.DoubleType,
            "BooleanType": sparktypes.BooleanType,
            "DataType": sparktypes.DataType,
            "NullType": sparktypes.NullType
        }
        spark = SparkSession.builder.getOrCreate()
        # We need to allow null values in some column types that do not support them natively, so we skip
        # use of df in this case.
        data_reshaped = list(zip(*[v for _, v in data.items()]))  # create a list of rows
        if schemas and 'spark' in schemas:
            schema = schemas['spark']
            # sometimes first method causes Spark to throw a TypeError
            try:
                spark_schema = sparktypes.StructType([
                    sparktypes.StructField(column, SPARK_TYPES[schema[column]](), True)
                    for column in schema
                ])
                # We create these every time, which is painful for testing
                # However nuance around null treatment as well as the desire
                # for real datetime support in tests makes this necessary
                data = copy.deepcopy(data)
                if "ts" in data:
                    print(data)
                    print(schema)
                for col in schema:
                    type_ = schema[col]
                    if type_ in ["IntegerType", "LongType"]:
                        # Ints cannot be None...but None can be valid in Spark (as Null)
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(int(val))
                        data[col] = vals
                    elif type_ in ["FloatType", "DoubleType"]:
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(float(val))
                        data[col] = vals
                    elif type_ in ["DateType", "TimestampType"]:
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(parse(val))
                        data[col] = vals
                # Do this again, now that we have done type conversion using the provided schema
                data_reshaped = list(zip(*[v for _, v in data.items()]))  # create a list of rows
                spark_df = spark.createDataFrame(data_reshaped, schema=spark_schema)
            except TypeError:
                string_schema = sparktypes.StructType([
                    sparktypes.StructField(column, sparktypes.StringType())
                    for column in schema
                ])
                spark_df = spark.createDataFrame(data_reshaped, string_schema)
                for c in spark_df.columns:
                    spark_df = spark_df.withColumn(c, spark_df[c].cast(SPARK_TYPES[schema[c]]()))
        elif len(data_reshaped) == 0:
            # if we have an empty dataset and no schema, need to assign an arbitrary type
            columns = list(data.keys())
            spark_schema = sparktypes.StructType([
                sparktypes.StructField(column, sparktypes.StringType())
                for column in columns
            ])
            spark_df = spark.createDataFrame(data_reshaped, spark_schema)
        else:
            # if no schema provided, uses Spark's schema inference
            columns = list(data.keys())
            spark_df = spark.createDataFrame(data_reshaped, columns)
        return SparkDFDataset(spark_df, profiler=profiler, caching=caching)

    else:
        raise ValueError("Unknown dataset_type " + str(dataset_type))
def test_validation_with_ge(feast_client: Client, kafka_server):
    entity = Entity(name="key", description="Key", value_type=ValueType.INT64)
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    feature_table = FeatureTable(
        name="validation_test",
        entities=["key"],
        features=[
            Feature("num", ValueType.INT64),
            Feature("set", ValueType.STRING)
        ],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url="/dev/null",
        ),
        stream_source=KafkaSource(
            event_timestamp_column="event_timestamp",
            bootstrap_servers=kafka_broker,
            message_format=AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )

    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    train_data = generate_train_data()
    ge_ds = PandasDataset(train_data)
    ge_ds.expect_column_values_to_be_between("num", 0, 100)
    ge_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = ge_ds.get_expectation_suite()

    udf = create_validation_udf("testUDF", expectations)
    apply_validation(feast_client, feature_table, udf, validation_window_secs=1)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 120)

    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 120)

    test_data = generate_test_data()
    ge_ds = PandasDataset(test_data)
    validation_result = ge_ds.validate(expectations, result_format="COMPLETE")
    invalid_idx = list({
        idx
        for check in validation_result.results
        for idx in check.result["unexpected_index_list"]
    })

    entity_rows = [{"key": key} for key in test_data["key"].tolist()]

    try:
        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=["validation_test:num", "validation_test:set"],
            expected_ingested_count=test_data.shape[0] - len(invalid_idx),
        )
    finally:
        job.cancel()

    test_data["num"] = test_data["num"].astype(np.float64)
    test_data["num"].iloc[invalid_idx] = np.nan
    test_data["set"].iloc[invalid_idx] = None

    pd.testing.assert_frame_equal(
        ingested[["key", "validation_test:num", "validation_test:set"]],
        test_data[["key", "num", "set"]].rename(columns={
            "num": "validation_test:num",
            "set": "validation_test:set"
        }),
    )
def get_dataset(dataset_type, data, schemas=None,
                autoinspect_func=autoinspect.columns_exist, caching=False):
    """For Pandas, data should be either a DataFrame or a dictionary that can
    be instantiated as a DataFrame.

    For SQL, data should have the following shape:
        {
            'table': SqlAlchemy Table object,
            named_column: [list of values]
        }
    """
    if dataset_type == 'PandasDataset':
        df = pd.DataFrame(data)
        if schemas and "pandas" in schemas:
            pandas_schema = {
                key: np.dtype(value)
                for (key, value) in schemas["pandas"].items()
            }
            df = df.astype(pandas_schema)
        return PandasDataset(df, autoinspect_func=autoinspect_func, caching=caching)

    elif dataset_type == 'SqlAlchemyDataset':
        # Create a new database
        # Try to use a local postgres instance (e.g. on Travis); this will allow more testing than sqlite
        try:
            engine = create_engine('postgresql://*****:*****@localhost/test_ci')
            conn = engine.connect()
        except SQLAlchemyError:
            warnings.warn("Falling back to sqlite database.")
            engine = create_engine('sqlite://')
            conn = engine.connect()

        # Add the data to the database as a new table
        df = pd.DataFrame(data)

        sql_dtypes = {}
        if schemas and "sqlite" in schemas and isinstance(engine.dialect, sqlitetypes.dialect):
            schema = schemas["sqlite"]
            sql_dtypes = {col: SQLITE_TYPES[dtype] for (col, dtype) in schema.items()}
            for col in schema:
                type = schema[col]
                if type == "int":
                    df[col] = pd.to_numeric(df[col], downcast='signed')
                elif type == "float":
                    df[col] = pd.to_numeric(df[col], downcast='float')
                elif type == "datetime":
                    df[col] = pd.to_datetime(df[col])
        elif schemas and "postgresql" in schemas and isinstance(engine.dialect, postgresqltypes.dialect):
            schema = schemas["postgresql"]
            sql_dtypes = {col: POSTGRESQL_TYPES[dtype] for (col, dtype) in schema.items()}
            for col in schema:
                type = schema[col]
                if type == "int":
                    df[col] = pd.to_numeric(df[col], downcast='signed')
                elif type == "float":
                    df[col] = pd.to_numeric(df[col], downcast='float')
                elif type == "timestamp":
                    df[col] = pd.to_datetime(df[col])

        tablename = "test_data_" + ''.join([
            random.choice(string.ascii_letters + string.digits) for n in range(8)
        ])
        df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes)

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(tablename, engine=conn,
                                 autoinspect_func=autoinspect_func, caching=caching)

    elif dataset_type == 'SparkDFDataset':
        spark = SparkSession.builder.getOrCreate()
        data_reshaped = list(zip(*[v for _, v in data.items()]))
        if schemas and 'spark' in schemas:
            schema = schemas['spark']
            # sometimes first method causes Spark to throw a TypeError
            try:
                spark_schema = sparktypes.StructType([
                    sparktypes.StructField(column, SPARK_TYPES[schema[column]]())
                    for column in schema
                ])
                spark_df = spark.createDataFrame(data_reshaped, spark_schema)
            except TypeError:
                string_schema = sparktypes.StructType([
                    sparktypes.StructField(column, sparktypes.StringType())
                    for column in schema
                ])
                spark_df = spark.createDataFrame(data_reshaped, string_schema)
                for c in spark_df.columns:
                    spark_df = spark_df.withColumn(c, spark_df[c].cast(SPARK_TYPES[schema[c]]()))
        elif len(data_reshaped) == 0:
            # if we have an empty dataset and no schema, need to assign an arbitrary type
            columns = list(data.keys())
            spark_schema = sparktypes.StructType([
                sparktypes.StructField(column, sparktypes.StringType())
                for column in columns
            ])
            spark_df = spark.createDataFrame(data_reshaped, spark_schema)
        else:
            # if no schema provided, uses Spark's schema inference
            columns = list(data.keys())
            spark_df = spark.createDataFrame(data_reshaped, columns)
        return SparkDFDataset(spark_df, caching=caching)

    else:
        raise ValueError("Unknown dataset_type " + str(dataset_type))
def get_dataset(
    dataset_type,
    data,
    schemas=None,
    profiler=ColumnsExistProfiler,
    caching=True,
    table_name=None,
    sqlite_db_path=None,
):
    """Utility to create datasets for json-formatted tests."""
    df = pd.DataFrame(data)
    if dataset_type == "PandasDataset":
        if schemas and "pandas" in schemas:
            schema = schemas["pandas"]
            pandas_schema = {}
            for (key, value) in schema.items():
                # Note, these are just names used in our internal schemas to build datasets *for internal tests*
                # Further, some changes in pandas internal about how datetimes are created means to support pandas
                # pre- 0.25, we need to explicitly specify when we want timezone.

                # We will use timestamp for timezone-aware (UTC only) dates in our tests
                if value.lower() in ["timestamp", "datetime64[ns, tz]"]:
                    df[key] = pd.to_datetime(df[key], utc=True)
                    continue
                elif value.lower() in ["datetime", "datetime64", "datetime64[ns]"]:
                    df[key] = pd.to_datetime(df[key])
                    continue
                try:
                    type_ = np.dtype(value)
                except TypeError:
                    type_ = getattr(pd.core.dtypes.dtypes, value)
                    # If this raises AttributeError it's okay: it means someone built a bad test
                pandas_schema[key] = type_
            # pandas_schema = {key: np.dtype(value) for (key, value) in schemas["pandas"].items()}
            df = df.astype(pandas_schema)
        return PandasDataset(df, profiler=profiler, caching=caching)

    elif dataset_type == "sqlite":
        if not create_engine:
            return None

        if sqlite_db_path is not None:
            engine = create_engine(f"sqlite:////{sqlite_db_path}")
        else:
            engine = create_engine("sqlite://")
        conn = engine.connect()
        # Add the data to the database as a new table

        sql_dtypes = {}
        if (schemas and "sqlite" in schemas
                and isinstance(engine.dialect, sqlitetypes.dialect)):
            schema = schemas["sqlite"]
            sql_dtypes = {col: SQLITE_TYPES[dtype] for (col, dtype) in schema.items()}
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                    min_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=True)
                    max_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=False)
                    for api_schema_type in ["api_np", "api_cast"]:
                        min_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=True)
                        max_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=False)
                        df.replace(
                            to_replace=[min_value_api, max_value_api],
                            value=[min_value_dbms, max_value_dbms],
                            inplace=True,
                        )
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        if table_name is None:
            table_name = "test_data_" + "".join([
                random.choice(string.ascii_letters + string.digits) for _ in range(8)
            ])
        df.to_sql(
            name=table_name,
            con=conn,
            index=False,
            dtype=sql_dtypes,
            if_exists="replace",
        )

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(table_name, engine=conn, profiler=profiler, caching=caching)

    elif dataset_type == "postgresql":
        if not create_engine:
            return None

        # Create a new database
        engine = create_engine("postgresql://postgres@localhost/test_ci")
        conn = engine.connect()

        sql_dtypes = {}
        if (schemas and "postgresql" in schemas
                and isinstance(engine.dialect, postgresqltypes.dialect)):
            schema = schemas["postgresql"]
            sql_dtypes = {col: POSTGRESQL_TYPES[dtype] for (col, dtype) in schema.items()}
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                    min_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=True)
                    max_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=False)
                    for api_schema_type in ["api_np", "api_cast"]:
                        min_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=True)
                        max_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=False)
                        df.replace(
                            to_replace=[min_value_api, max_value_api],
                            value=[min_value_dbms, max_value_dbms],
                            inplace=True,
                        )
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        if table_name is None:
            table_name = "test_data_" + "".join([
                random.choice(string.ascii_letters + string.digits) for _ in range(8)
            ])
        df.to_sql(
            name=table_name,
            con=conn,
            index=False,
            dtype=sql_dtypes,
            if_exists="replace",
        )

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(table_name, engine=conn, profiler=profiler, caching=caching)

    elif dataset_type == "mysql":
        if not create_engine:
            return None

        engine = create_engine("mysql+pymysql://root@localhost/test_ci")
        conn = engine.connect()

        sql_dtypes = {}
        if (schemas and "mysql" in schemas
                and isinstance(engine.dialect, mysqltypes.dialect)):
            schema = schemas["mysql"]
            sql_dtypes = {col: MYSQL_TYPES[dtype] for (col, dtype) in schema.items()}
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                    min_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=True)
                    max_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=False)
                    for api_schema_type in ["api_np", "api_cast"]:
                        min_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=True)
                        max_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=False)
                        df.replace(
                            to_replace=[min_value_api, max_value_api],
                            value=[min_value_dbms, max_value_dbms],
                            inplace=True,
                        )
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        if table_name is None:
            table_name = "test_data_" + "".join([
                random.choice(string.ascii_letters + string.digits) for _ in range(8)
            ])
        df.to_sql(
            name=table_name,
            con=conn,
            index=False,
            dtype=sql_dtypes,
            if_exists="replace",
        )

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(table_name, engine=conn, profiler=profiler, caching=caching)

    elif dataset_type == "mssql":
        if not create_engine:
            return None

        engine = create_engine(
            "mssql+pyodbc://sa:ReallyStrongPwd1234%^&*@localhost:1433/test_ci?driver=ODBC Driver 17 for SQL Server&charset=utf8&autocommit=true",
            # echo=True,
        )

        # If "autocommit" is not desired to be on by default, then use the following pattern when explicit "autocommit"
        # is desired (e.g., for temporary tables, "autocommit" is off by default, so the override option may be useful).
        # engine.execute(sa.text(sql_query_string).execution_options(autocommit=True))

        conn = engine.connect()

        sql_dtypes = {}
        if (schemas and dataset_type in schemas
                and isinstance(engine.dialect, mssqltypes.dialect)):
            schema = schemas[dataset_type]
            sql_dtypes = {col: MSSQL_TYPES[dtype] for (col, dtype) in schema.items()}
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT"]:
                    df[col] = pd.to_numeric(df[col])
                    min_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=True)
                    max_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=False)
                    for api_schema_type in ["api_np", "api_cast"]:
                        min_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=True)
                        max_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=False)
                        df.replace(
                            to_replace=[min_value_api, max_value_api],
                            value=[min_value_dbms, max_value_dbms],
                            inplace=True,
                        )
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        if table_name is None:
            table_name = "test_data_" + "".join([
                random.choice(string.ascii_letters + string.digits) for _ in range(8)
            ])
        df.to_sql(
            name=table_name,
            con=conn,
            index=False,
            dtype=sql_dtypes,
            if_exists="replace",
        )

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(table_name, engine=conn, profiler=profiler, caching=caching)

    elif dataset_type == "SparkDFDataset":
        from pyspark.sql import SparkSession
        import pyspark.sql.types as sparktypes

        SPARK_TYPES = {
            "StringType": sparktypes.StringType,
            "IntegerType": sparktypes.IntegerType,
            "LongType": sparktypes.LongType,
            "DateType": sparktypes.DateType,
            "TimestampType": sparktypes.TimestampType,
            "FloatType": sparktypes.FloatType,
            "DoubleType": sparktypes.DoubleType,
            "BooleanType": sparktypes.BooleanType,
            "DataType": sparktypes.DataType,
            "NullType": sparktypes.NullType,
        }
        spark = SparkSession.builder.getOrCreate()
        # We need to allow null values in some column types that do not support them natively, so we skip
        # use of df in this case.
        data_reshaped = list(zip(*[v for _, v in data.items()]))  # create a list of rows
        if schemas and "spark" in schemas:
            schema = schemas["spark"]
            # sometimes first method causes Spark to throw a TypeError
            try:
                spark_schema = sparktypes.StructType([
                    sparktypes.StructField(column, SPARK_TYPES[schema[column]](), True)
                    for column in schema
                ])
                # We create these every time, which is painful for testing
                # However nuance around null treatment as well as the desire
                # for real datetime support in tests makes this necessary
                data = copy.deepcopy(data)
                if "ts" in data:
                    print(data)
                    print(schema)
                for col in schema:
                    type_ = schema[col]
                    if type_ in ["IntegerType", "LongType"]:
                        # Ints cannot be None...but None can be valid in Spark (as Null)
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(int(val))
                        data[col] = vals
                    elif type_ in ["FloatType", "DoubleType"]:
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(float(val))
                        data[col] = vals
                    elif type_ in ["DateType", "TimestampType"]:
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(parse(val))
                        data[col] = vals
                # Do this again, now that we have done type conversion using the provided schema
                data_reshaped = list(zip(*[v for _, v in data.items()]))  # create a list of rows
                spark_df = spark.createDataFrame(data_reshaped, schema=spark_schema)
            except TypeError:
                string_schema = sparktypes.StructType([
                    sparktypes.StructField(column, sparktypes.StringType())
                    for column in schema
                ])
                spark_df = spark.createDataFrame(data_reshaped, string_schema)
                for c in spark_df.columns:
                    spark_df = spark_df.withColumn(c, spark_df[c].cast(SPARK_TYPES[schema[c]]()))
        elif len(data_reshaped) == 0:
            # if we have an empty dataset and no schema, need to assign an arbitrary type
            columns = list(data.keys())
            spark_schema = sparktypes.StructType([
                sparktypes.StructField(column, sparktypes.StringType())
                for column in columns
            ])
            spark_df = spark.createDataFrame(data_reshaped, spark_schema)
        else:
            # if no schema provided, uses Spark's schema inference
            columns = list(data.keys())
            spark_df = spark.createDataFrame(data_reshaped, columns)
        return SparkDFDataset(spark_df, profiler=profiler, caching=caching)

    else:
        raise ValueError("Unknown dataset_type " + str(dataset_type))
def titanic_dataset():
    df = ge.read_csv(file_relative_path(__file__, "../test_sets/Titanic.csv"))
    batch_df = PandasDataset(df)

    return batch_df
def test_that_ge_pandas_datasets_are_memory_efficient(csvpath: Path):
    df = pd.read_csv(str(csvpath))
    df_ge = PandasDataset(df)

    bequals = df.values.base == df_ge.values.base
    assert bool(bequals.all())
def test_validation_with_ge(feast_client: Client, kafka_server):
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    entity, feature_table = create_schema(kafka_broker, topic_name, "validation_ge")

    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    train_data = generate_train_data()
    ge_ds = PandasDataset(train_data)
    ge_ds.expect_column_values_to_be_between("num", 0, 100)
    ge_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = ge_ds.get_expectation_suite()

    udf = create_validation_udf("testUDF", expectations, feature_table)
    apply_validation(feast_client, feature_table, udf, validation_window_secs=1)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 120)

    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 120)

    test_data = generate_test_data()
    ge_ds = PandasDataset(test_data)
    validation_result = ge_ds.validate(expectations, result_format="COMPLETE")
    invalid_idx = list({
        idx
        for check in validation_result.results
        for idx in check.result["unexpected_index_list"]
    })

    entity_rows = [{"key": key} for key in test_data["key"].tolist()]

    try:
        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=["validation_ge:num", "validation_ge:set"],
            expected_ingested_count=test_data.shape[0] - len(invalid_idx),
        )
    finally:
        job.cancel()

    test_data["num"] = test_data["num"].astype(np.float64)
    test_data["num"].iloc[invalid_idx] = np.nan
    test_data["set"].iloc[invalid_idx] = None

    pd.testing.assert_frame_equal(
        ingested[["key", "validation_ge:num", "validation_ge:set"]],
        test_data[["key", "num", "set"]].rename(columns={
            "num": "validation_ge:num",
            "set": "validation_ge:set"
        }),
    )
def test_validation_reports_metrics(feast_client: Client, kafka_server,
                                    statsd_server: StatsDServer):
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    entity, feature_table = create_schema(kafka_broker, topic_name,
                                          "validation_ge_metrics")

    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    train_data = generate_train_data()
    ge_ds = PandasDataset(train_data)
    ge_ds.expect_column_values_to_be_between("num", 0, 100)
    ge_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = ge_ds.get_expectation_suite()

    udf = create_validation_udf("testUDF", expectations, feature_table)
    apply_validation(feast_client, feature_table, udf, validation_window_secs=10)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 120)

    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 120)

    test_data = generate_test_data()
    ge_ds = PandasDataset(test_data)
    validation_result = ge_ds.validate(expectations, result_format="COMPLETE")
    unexpected_counts = {
        "expect_column_values_to_be_between_num_0_100":
        validation_result.results[0].result["unexpected_count"],
        "expect_column_values_to_be_in_set_set":
        validation_result.results[1].result["unexpected_count"],
    }
    invalid_idx = list({
        idx
        for check in validation_result.results
        for idx in check.result["unexpected_index_list"]
    })

    entity_rows = [{"key": key} for key in test_data["key"].tolist()]

    try:
        ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=[
                "validation_ge_metrics:num", "validation_ge_metrics:set"
            ],
            expected_ingested_count=test_data.shape[0] - len(invalid_idx),
        )
    finally:
        job.cancel()

    expected_metrics = [(
        f"feast_feature_validation_check_failed#check:{check_name},"
        f"feature_table:{feature_table.name},project:{feast_client.project}",
        value,
    ) for check_name, value in unexpected_counts.items()]
    wait_retry_backoff(
        lambda: (
            None,
            all(
                statsd_server.metrics.get(m) == v
                for m, v in expected_metrics),
        ),
        timeout_secs=30,
        timeout_msg="Expected metrics were not received: " +
        str(expected_metrics) + "\n"
        "Actual received metrics" + str(statsd_server.metrics),
    )