def build_configuration(cls, data_asset_type=None, generators=None, **kwargs):
    """
    Build a full configuration object for a datasource, potentially including generators with defaults.

    Args:
        data_asset_type: A ClassConfig dictionary
        generators: Generator configuration dictionary
        **kwargs: Additional kwargs to be part of the datasource constructor's initialization

    Returns:
        A complete datasource configuration.
    """
    if generators is None:
        generators = {"default": {"class_name": "TableGenerator"}}

    if data_asset_type is None:
        data_asset_type = ClassConfig(class_name="SqlAlchemyDataset")
    else:
        try:
            data_asset_type = ClassConfig(**data_asset_type)
        except TypeError:
            # In this case, we allow the passed config, for now, in case they're using a legacy string-only config
            pass

    configuration = kwargs
    configuration.update({
        "data_asset_type": data_asset_type,
        "generators": generators,
    })

    return configuration
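# Usage sketch (illustrative, not from the source): assuming build_configuration above is exposed
# as a classmethod on a SqlAlchemyDatasource-style class, a caller could assemble a config like this.
# The credential keys shown are only examples of the extra **kwargs that get merged in.
config = SqlAlchemyDatasource.build_configuration(
    data_asset_type={"class_name": "SqlAlchemyDataset"},
    credentials={"drivername": "postgresql", "host": "localhost", "database": "example"},
)
# config now carries the extra kwargs plus the "data_asset_type" and "generators" keys.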
def __init__(self, name="default", data_context=None, data_asset_type=None, profile=None, generators=None, **kwargs): if not sqlalchemy: raise DatasourceInitializationError( name, "ModuleNotFoundError: No module named 'sqlalchemy'") if generators is None: generators = {"default": {"type": "queries"}} if data_asset_type is None: data_asset_type = ClassConfig(class_name="SqlAlchemyDataset") else: try: data_asset_type = ClassConfig(**data_asset_type) except TypeError: # In this case, we allow the passed config, for now, in case they're using a legacy string-only config pass super(SqlAlchemyDatasource, self).__init__(name, type_="sqlalchemy", data_context=data_context, data_asset_type=data_asset_type, generators=generators) if profile is not None: self._datasource_config.update({"profile": profile}) try: # if an engine was provided, use that if "engine" in kwargs: self.engine = kwargs.pop("engine") # if a connection string or url was provided, use that elif "connection_string" in kwargs: connection_string = kwargs.pop("connection_string") self.engine = create_engine(connection_string, **kwargs) self.engine.connect() elif "url" in kwargs: url = kwargs.pop("url") self.engine = create_engine(url, **kwargs) self.engine.connect() # Otherwise, connect using remaining kwargs else: self.engine = create_engine( self._get_sqlalchemy_connection_options(**kwargs)) self.engine.connect() except sqlalchemy.exc.OperationalError as sqlalchemy_error: raise DatasourceInitializationError(self._name, str(sqlalchemy_error)) self._build_generators()
def build_configuration(cls, data_asset_type=None, generators=None, boto3_options=None, **kwargs):
    """
    Build a full configuration object for a datasource, potentially including generators with defaults.

    Args:
        data_asset_type: A ClassConfig dictionary
        generators: Generator configuration dictionary
        boto3_options: Optional dictionary with key-value pairs to pass to boto3 during instantiation.
        **kwargs: Additional kwargs to be part of the datasource constructor's initialization

    Returns:
        A complete datasource configuration.
    """
    if generators is None:
        # Provide a gentle way to build a datasource with a sane default,
        # including ability to specify the base_directory and reader_options
        base_directory = kwargs.pop("base_directory", "data")
        # By default, use CSV sniffer to infer separator, which requires the python engine
        reader_options = kwargs.pop("reader_options", {
            "sep": None,
            "engine": "python"
        })
        generators = {
            "default": {
                "class_name": "SubdirReaderGenerator",
                "base_directory": base_directory,
                "reader_options": reader_options
            }
        }

    if data_asset_type is None:
        data_asset_type = ClassConfig(class_name="PandasDataset")
    else:
        try:
            data_asset_type = ClassConfig(**data_asset_type)
        except TypeError:
            # In this case, we allow the passed config, for now, in case they're using a legacy string-only config
            pass

    configuration = kwargs
    configuration.update({
        "data_asset_type": data_asset_type,
        "generators": generators,
    })

    if boto3_options is not None:
        if isinstance(boto3_options, dict):
            configuration.update(boto3_options)
        else:
            raise ValueError("boto3_options must be a dictionary of key-value pairs to pass to boto3 upon "
                             "initialization.")

    return configuration
def build_configuration(cls, data_asset_type=None, generators=None, spark_config=None, **kwargs):
    """
    Build a full configuration object for a datasource, potentially including generators with defaults.

    Args:
        data_asset_type: A ClassConfig dictionary
        generators: Generator configuration dictionary
        spark_config: dictionary of key-value pairs to pass to the spark builder
        **kwargs: Additional kwargs to be part of the datasource constructor's initialization

    Returns:
        A complete datasource configuration.
    """
    if generators is None:
        # Provide a gentle way to build a datasource with a sane default,
        # including ability to specify the base_directory
        base_directory = kwargs.pop("base_directory", "/data")
        reader_options = kwargs.pop("reader_options", {})
        generators = {
            "default": {
                "class_name": "SubdirReaderGenerator",
                "base_directory": base_directory,
                "reader_options": reader_options
            }
        }

    if data_asset_type is None:
        data_asset_type = ClassConfig(class_name="SparkDFDataset")
    else:
        try:
            data_asset_type = ClassConfig(**data_asset_type)
        except TypeError:
            # In this case, we allow the passed config, for now, in case they're using a legacy string-only config
            pass

    if spark_config is None:
        spark_config = {}

    configuration = kwargs
    configuration.update({
        "data_asset_type": data_asset_type,
        "generators": generators,
        "spark_config": spark_config
    })

    return configuration
def build_configuration(cls, data_asset_type=None, batch_kwargs_generators=None, **kwargs):
    """
    Build a full configuration object for a datasource, potentially including generators with defaults.

    Args:
        data_asset_type: A ClassConfig dictionary
        batch_kwargs_generators: Generator configuration dictionary
        **kwargs: Additional kwargs to be part of the datasource constructor's initialization

    Returns:
        A complete datasource configuration.
    """
    if data_asset_type is None:
        data_asset_type = {
            "class_name": "SqlAlchemyDataset",
            "module_name": "great_expectations.dataset",
        }
    else:
        data_asset_type = classConfigSchema.dump(ClassConfig(**data_asset_type))

    configuration = kwargs
    configuration["data_asset_type"] = data_asset_type
    if batch_kwargs_generators is not None:
        configuration["batch_kwargs_generators"] = batch_kwargs_generators

    return configuration
def build_configuration(cls, data_asset_type=None, generators=None, spark_config=None, **kwargs):
    """
    Build a full configuration object for a datasource, potentially including generators with defaults.

    Args:
        data_asset_type: A ClassConfig dictionary
        generators: Generator configuration dictionary
        spark_config: dictionary of key-value pairs to pass to the spark builder
        **kwargs: Additional kwargs to be part of the datasource constructor's initialization

    Returns:
        A complete datasource configuration.
    """
    if data_asset_type is None:
        data_asset_type = {"class_name": "SparkDFDataset"}
    else:
        data_asset_type = classConfigSchema.dump(ClassConfig(**data_asset_type))

    if spark_config is None:
        spark_config = {}

    configuration = kwargs
    configuration.update({
        "data_asset_type": data_asset_type,
        "spark_config": spark_config
    })
    if generators:
        configuration["generators"] = generators

    return configuration
def build_configuration(cls, data_asset_type=None, generators=None, boto3_options=None, reader_method=None,
                        reader_options=None, limit=None, **kwargs):
    """
    Build a full configuration object for a datasource, potentially including generators with defaults.

    Args:
        data_asset_type: A ClassConfig dictionary
        generators: Generator configuration dictionary
        boto3_options: Optional dictionary with key-value pairs to pass to boto3 during instantiation.
        reader_method: Optional default reader_method for generated batches
        reader_options: Optional default reader_options for generated batches
        limit: Optional default limit for generated batches
        **kwargs: Additional kwargs to be part of the datasource constructor's initialization

    Returns:
        A complete datasource configuration.
    """
    if data_asset_type is None:
        data_asset_type = {"class_name": "PandasDataset"}
    else:
        data_asset_type = classConfigSchema.dump(ClassConfig(**data_asset_type))

    configuration = kwargs
    configuration["data_asset_type"] = data_asset_type
    if generators:
        configuration["generators"] = generators

    if boto3_options is not None:
        if isinstance(boto3_options, dict):
            configuration.update(boto3_options)
        else:
            raise ValueError("boto3_options must be a dictionary of key-value pairs to pass to boto3 upon "
                             "initialization.")

    if reader_options is not None:
        if isinstance(reader_options, dict):
            configuration.update(reader_options)
        else:
            raise ValueError("reader_options must be a dictionary of key-value pairs to pass to the reader upon "
                             "initialization.")

    if reader_method is not None:
        configuration["reader_method"] = reader_method

    if limit is not None:
        configuration["limit"] = limit

    return configuration
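# Usage sketch (illustrative; assumes this is a classmethod on a PandasDatasource-style class).
# Note that boto3_options and reader_options are flattened into the top level of the returned
# configuration via configuration.update(...), not nested under their own keys.
config = PandasDatasource.build_configuration(
    boto3_options={"region_name": "us-east-1"},
    reader_options={"sep": ","},
    reader_method="csv",
    limit=1000,
)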
def __init__(self, name="default", data_context=None, data_asset_type=None, generators=None, **kwargs): if generators is None: # Provide a gentle way to build a datasource with a sane default, # including ability to specify the base_directory base_directory = kwargs.pop("base_directory", "/data") reader_options = kwargs.pop("reader_options", {}) generators = { "default": { "type": "subdir_reader", "base_directory": base_directory, "reader_options": reader_options } } if data_asset_type is None: data_asset_type = ClassConfig(class_name="SparkDFDataset") else: try: data_asset_type = ClassConfig(**data_asset_type) except TypeError: # In this case, we allow the passed config, for now, in case they're using a legacy string-only config pass super(SparkDFDatasource, self).__init__(name, type_="spark", data_context=data_context, data_asset_type=data_asset_type, generators=generators) try: self.spark = SparkSession.builder.getOrCreate() except Exception: logger.error( "Unable to load spark context; install optional spark dependency for support." ) self.spark = None self._build_generators()
def _get_data_asset(self, batch_kwargs, expectation_suite, **kwargs):
    batch_kwargs.update(kwargs)
    if "data_asset_type" in batch_kwargs:
        # Sqlalchemy does not use reader_options or need to remove batch_kwargs since it does not pass
        # options through to a later reader
        data_asset_type_config = batch_kwargs["data_asset_type"]
        try:
            data_asset_type_config = ClassConfig(**data_asset_type_config)
        except TypeError:
            # We tried; we'll pass the config downstream, probably as a string, and handle an error later
            pass
    else:
        data_asset_type_config = self._data_asset_type

    data_asset_type = self._get_data_asset_class(data_asset_type_config)
    if not issubclass(data_asset_type, SqlAlchemyDataset):
        raise ValueError("SqlAlchemyDatasource cannot instantiate batch with data_asset_type: '%s'. It "
                         "must be a subclass of SqlAlchemyDataset." % data_asset_type.__name__)

    # We need to build a batch_id to be used in the dataframe
    batch_id = BatchId({
        "timestamp": time.time()
    })

    if "schema" in batch_kwargs:
        schema = batch_kwargs["schema"]
    else:
        schema = None

    if "table" in batch_kwargs:
        return data_asset_type(
            table_name=batch_kwargs["table"],
            engine=self.engine,
            schema=schema,
            data_context=self._data_context,
            expectation_suite=expectation_suite,
            batch_kwargs=batch_kwargs,
            batch_id=batch_id
        )
    elif "query" in batch_kwargs:
        query = Template(batch_kwargs["query"]).safe_substitute(**kwargs)
        return data_asset_type(
            custom_sql=query,
            engine=self.engine,
            data_context=self._data_context,
            expectation_suite=expectation_suite,
            batch_kwargs=batch_kwargs,
            batch_id=batch_id
        )
    else:
        raise ValueError("Invalid batch_kwargs: exactly one of 'table' or 'query' must be specified")
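# Hedged sketch of the two batch_kwargs shapes this method accepts: a "table" (optionally with a
# "schema"), or a "query" whose $-style placeholders are filled from the extra kwargs via
# string.Template.safe_substitute. Table and column names below are illustrative only.
table_batch_kwargs = {"table": "events", "schema": "public"}
query_batch_kwargs = {"query": "SELECT * FROM events WHERE ds = '$ds'"}  # pass ds=... as a kwarg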
def build_configuration(cls, data_asset_type=None, generators=None, **kwargs):
    """
    Build a full configuration object for a datasource, potentially including generators with defaults.

    Args:
        data_asset_type: A ClassConfig dictionary
        generators: Generator configuration dictionary
        **kwargs: Additional kwargs to be part of the datasource constructor's initialization

    Returns:
        A complete datasource configuration.
    """
    # As of 0.9.0, we do not require generators be configured
    # generators = {
    #     "default": {
    #         "class_name": "TableBatchKwargsGenerator"
    #     },
    #     "passthrough": {
    #         "class_name": "PassthroughGenerator",
    #     }
    # }

    if data_asset_type is None:
        data_asset_type = ClassConfig(class_name="SqlAlchemyDataset")
    else:
        try:
            data_asset_type = ClassConfig(**data_asset_type)
        except TypeError:
            # In this case, we allow the passed config, for now, in case they're using a legacy string-only config
            pass

    configuration = kwargs
    configuration["data_asset_type"] = data_asset_type
    if generators is not None:
        configuration["generators"] = generators

    return configuration
def __init__(self, name="pandas", data_context=None, data_asset_type=None, generators=None, **kwargs): if generators is None: # Provide a gentle way to build a datasource with a sane default, # including ability to specify the base_directory and reader_options base_directory = kwargs.pop("base_directory", "/data") # By default, use CSV sniffer to infer separator, which requires the python engine reader_options = kwargs.pop("reader_options", { "sep": None, "engine": "python" }) generators = { "default": { "type": "subdir_reader", "base_directory": base_directory, "reader_options": reader_options } } if data_asset_type is None: data_asset_type = ClassConfig(class_name="PandasDataset") else: try: data_asset_type = ClassConfig(**data_asset_type) except TypeError: # In this case, we allow the passed config, for now, in case they're using a legacy string-only config pass super(PandasDatasource, self).__init__(name, type_="pandas", data_context=data_context, data_asset_type=data_asset_type, generators=generators) self._build_generators()
def build_configuration(
    cls,
    data_asset_type=None,
    batch_kwargs_generators=None,
    spark_config=None,
    force_reuse_spark_context=False,
    **kwargs,
):
    """
    Build a full configuration object for a datasource, potentially including generators with defaults.

    Args:
        data_asset_type: A ClassConfig dictionary
        batch_kwargs_generators: Generator configuration dictionary
        spark_config: dictionary of key-value pairs to pass to the spark builder
        **kwargs: Additional kwargs to be part of the datasource constructor's initialization

    Returns:
        A complete datasource configuration.
    """
    if data_asset_type is None:
        data_asset_type = {
            "class_name": "SparkDFDataset",
            "module_name": "great_expectations.dataset",
        }
    else:
        data_asset_type = classConfigSchema.dump(ClassConfig(**data_asset_type))

    if spark_config is None:
        spark_config = {}

    configuration = kwargs
    configuration.update({
        "data_asset_type": data_asset_type,
        "spark_config": spark_config,
        "force_reuse_spark_context": force_reuse_spark_context,
    })

    if batch_kwargs_generators:
        configuration["batch_kwargs_generators"] = batch_kwargs_generators

    return configuration
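# Usage sketch (illustrative; assumes this is a classmethod on a SparkDFDatasource-style class).
# spark_config entries are passed through under the "spark_config" key; the option shown is an
# ordinary Spark setting used only as an example.
config = SparkDFDatasource.build_configuration(
    spark_config={"spark.sql.shuffle.partitions": "8"},
    force_reuse_spark_context=True,
)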
def __init__(self, batch, expectation_suite, expectation_engine=None, **kwargs):
    self.batch = batch
    self.expectation_suite = expectation_suite

    if isinstance(expectation_engine, dict):
        expectation_engine = ClassConfig(**expectation_engine)

    if isinstance(expectation_engine, ClassConfig):
        module_name = expectation_engine.module_name or "great_expectations.dataset"
        verify_dynamic_loading_support(module_name=module_name)
        expectation_engine = load_class(
            class_name=expectation_engine.class_name,
            module_name=module_name
        )

    self.expectation_engine = expectation_engine

    if self.expectation_engine is None:
        # Guess the engine
        try:
            import pandas as pd

            if isinstance(batch.data, pd.DataFrame):
                self.expectation_engine = PandasDataset
        except ImportError:
            pass

    if self.expectation_engine is None:
        if isinstance(batch.data, SqlAlchemyBatchReference):
            self.expectation_engine = SqlAlchemyDataset

    if self.expectation_engine is None:
        try:
            import pyspark

            if isinstance(batch.data, pyspark.sql.DataFrame):
                self.expectation_engine = SparkDFDataset
        except ImportError:
            pass

    if self.expectation_engine is None:
        raise ValueError(
            "Unable to identify expectation_engine. It must be a subclass of DataAsset."
        )

    self.init_kwargs = kwargs
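# Hedged sketch: the __init__ above appears to belong to a Validator-style class (the class name
# used here is assumed). Passing expectation_engine as a dict routes it through ClassConfig and
# load_class; passing None triggers the engine-guessing branches keyed off the type of batch.data.
validator = Validator(
    batch=batch,                      # an existing Batch whose .data is, e.g., a pandas DataFrame
    expectation_suite=expectation_suite,
    expectation_engine={"class_name": "PandasDataset"},
)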
def _get_data_asset(self, batch_kwargs, expectation_suite, **kwargs):
    for k, v in kwargs.items():
        if isinstance(v, dict):
            if k in batch_kwargs and isinstance(batch_kwargs[k], dict):
                batch_kwargs[k].update(v)
            else:
                batch_kwargs[k] = v
        else:
            batch_kwargs[k] = v

    if "data_asset_type" in batch_kwargs:
        # Sqlalchemy does not use reader_options or need to remove batch_kwargs since it does not pass
        # options through to a later reader
        data_asset_type_config = batch_kwargs["data_asset_type"]
        try:
            data_asset_type_config = ClassConfig(**data_asset_type_config)
        except TypeError:
            # We tried; we'll pass the config downstream, probably as a string, and handle an error later
            pass
    else:
        data_asset_type_config = self._data_asset_type

    data_asset_type = self._get_data_asset_class(data_asset_type_config)
    if not issubclass(data_asset_type, SqlAlchemyDataset):
        raise ValueError(
            "SqlAlchemyDatasource cannot instantiate batch with data_asset_type: '%s'. It "
            "must be a subclass of SqlAlchemyDataset." % data_asset_type.__name__)

    # We need to build a batch_id to be used in the dataframe
    batch_id = BatchId({"timestamp": time.time()})

    if "schema" in batch_kwargs:
        schema = batch_kwargs["schema"]
    else:
        schema = None

    if "table" in batch_kwargs:
        limit = batch_kwargs.get('limit')
        offset = batch_kwargs.get('offset')
        if limit is not None or offset is not None:
            logger.info("Generating query from table batch_kwargs based on limit and offset")
            raw_query = sqlalchemy.select([sqlalchemy.text("*")])\
                .select_from(sqlalchemy.schema.Table(batch_kwargs['table'], sqlalchemy.MetaData(), schema=schema))\
                .offset(offset)\
                .limit(limit)
            query = str(raw_query.compile(self.engine, compile_kwargs={"literal_binds": True}))
            return data_asset_type(custom_sql=query,
                                   engine=self.engine,
                                   data_context=self._data_context,
                                   expectation_suite=expectation_suite,
                                   batch_kwargs=batch_kwargs,
                                   batch_id=batch_id)
        else:
            return data_asset_type(table_name=batch_kwargs["table"],
                                   engine=self.engine,
                                   schema=schema,
                                   data_context=self._data_context,
                                   expectation_suite=expectation_suite,
                                   batch_kwargs=batch_kwargs,
                                   batch_id=batch_id)

    elif "query" in batch_kwargs:
        if "limit" in batch_kwargs or "offset" in batch_kwargs:
            logger.warning("Limit and offset parameters are ignored when using query-based batch_kwargs; consider "
                           "adding limit and offset directly to the generated query.")

        if "bigquery_temp_table" in batch_kwargs:
            table_name = batch_kwargs.get("bigquery_temp_table")
        else:
            table_name = None

        query = Template(batch_kwargs["query"]).safe_substitute(**kwargs)
        return data_asset_type(custom_sql=query,
                               engine=self.engine,
                               table_name=table_name,
                               data_context=self._data_context,
                               expectation_suite=expectation_suite,
                               batch_kwargs=batch_kwargs,
                               batch_id=batch_id)

    else:
        raise ValueError("Invalid batch_kwargs: exactly one of 'table' or 'query' must be specified")
def build_configuration(cls, data_asset_type=None, generators=None, boto3_options=None, reader_method=None,
                        reader_options=None, limit=None, **kwargs):
    """
    Build a full configuration object for a datasource, potentially including generators with defaults.

    Args:
        data_asset_type: A ClassConfig dictionary
        generators: Generator configuration dictionary
        boto3_options: Optional dictionary with key-value pairs to pass to boto3 during instantiation.
        reader_method: Optional default reader_method for generated batches
        reader_options: Optional default reader_options for generated batches
        limit: Optional default limit for generated batches
        **kwargs: Additional kwargs to be part of the datasource constructor's initialization

    Returns:
        A complete datasource configuration.
    """
    # PENDING DELETION - JPC - 20200130
    # if generators is None:
    #     # Provide a gentle way to build a datasource with a sane default,
    #     # including ability to specify the base_directory and reader_options
    #     base_directory = kwargs.pop("base_directory", "data")
    #     # By default, use CSV sniffer to infer separator, which requires the python engine
    #     reader_options = kwargs.pop("reader_options", {
    #         "sep": None,
    #         "engine": "python"
    #     })
    #     generators = {
    #         "default": {
    #             "class_name": "SubdirReaderBatchKwargsGenerator",
    #             "base_directory": base_directory,
    #             "reader_options": reader_options
    #         },
    #         "passthrough": {
    #             "class_name": "PassthroughGenerator",
    #         }
    #     }

    if data_asset_type is None:
        data_asset_type = ClassConfig(class_name="PandasDataset")
    else:
        try:
            data_asset_type = ClassConfig(**data_asset_type)
        except TypeError:
            # In this case, we allow the passed config, for now, in case they're using a legacy string-only config
            pass

    configuration = kwargs
    configuration["data_asset_type"] = data_asset_type
    if generators:
        configuration["generators"] = generators

    if boto3_options is not None:
        if isinstance(boto3_options, dict):
            configuration.update(boto3_options)
        else:
            raise ValueError("boto3_options must be a dictionary of key-value pairs to pass to boto3 upon "
                             "initialization.")

    if reader_options is not None:
        if isinstance(reader_options, dict):
            configuration.update(reader_options)
        else:
            raise ValueError("reader_options must be a dictionary of key-value pairs to pass to the reader upon "
                             "initialization.")

    if reader_method is not None:
        configuration["reader_method"] = reader_method

    if limit is not None:
        configuration["limit"] = limit

    return configuration
def _get_data_asset(self, batch_kwargs, expectation_suite, caching=True, **kwargs):
    """class-private implementation of get_data_asset"""
    if self.spark is None:
        logger.error("No spark session available")
        return None

    batch_kwargs.update(kwargs)
    reader_options = batch_kwargs.copy()

    if "data_asset_type" in reader_options:
        data_asset_type_config = reader_options.pop("data_asset_type")  # Get and remove the config
        try:
            data_asset_type_config = ClassConfig(**data_asset_type_config)
        except TypeError:
            # We tried; we'll pass the config downstream, probably as a string, and handle an error later
            pass
    else:
        data_asset_type_config = self._data_asset_type

    data_asset_type = self._get_data_asset_class(data_asset_type_config)
    if not issubclass(data_asset_type, SparkDFDataset):
        raise ValueError(
            "SparkDFDatasource cannot instantiate batch with data_asset_type: '%s'. It "
            "must be a subclass of SparkDFDataset." % data_asset_type.__name__)

    if "path" in batch_kwargs or "s3" in batch_kwargs:
        if "path" in batch_kwargs:
            path = reader_options.pop("path")  # We remove this so it is not used as a reader option
        else:
            path = reader_options.pop("s3")
        reader_options.pop("timestamp", "")  # ditto timestamp (but missing ok)
        reader_method = reader_options.pop("reader_method", None)
        if reader_method is None:
            reader_method = self._guess_reader_method_from_path(path)
            if reader_method is None:
                raise BatchKwargsError("Unable to determine reader for path: %s" % path, batch_kwargs)
        else:
            try:
                reader_method = ReaderMethods[reader_method]
            except KeyError:
                raise BatchKwargsError("Unknown reader method: %s" % reader_method, batch_kwargs)

        reader = self.spark.read
        for option in reader_options.items():
            reader = reader.option(*option)

        if reader_method == ReaderMethods.CSV:
            df = reader.csv(path)
        elif reader_method == ReaderMethods.parquet:
            df = reader.parquet(path)
        elif reader_method == ReaderMethods.delta:
            df = reader.format("delta").load(path)
        else:
            raise BatchKwargsError("Unsupported reader: %s" % reader_method.name, batch_kwargs)

    elif "query" in batch_kwargs:
        df = self.spark.sql(batch_kwargs["query"])

    elif "df" in batch_kwargs and isinstance(batch_kwargs["df"], (DataFrame, SparkDFDataset)):
        df = batch_kwargs.pop("df")  # We don't want to store the actual DataFrame in kwargs
        if isinstance(df, SparkDFDataset):
            # Grab just the spark_df reference, since we want to override everything else
            df = df.spark_df
        batch_kwargs["SparkDFRef"] = True

    else:
        raise BatchKwargsError("Unrecognized batch_kwargs for spark_source", batch_kwargs)

    return data_asset_type(df,
                           expectation_suite=expectation_suite,
                           data_context=self._data_context,
                           batch_kwargs=batch_kwargs,
                           caching=caching)
def _get_data_asset(self, batch_kwargs, expectation_suite, **kwargs):
    batch_kwargs.update(kwargs)
    # pandas cannot take unicode as a delimiter, which can happen in py2. Handle this case explicitly.
    # We handle it here so that the updated value will be in the batch_kwargs for transparency to the user.
    if PY2 and "sep" in batch_kwargs and batch_kwargs["sep"] is not None:
        batch_kwargs["sep"] = str(batch_kwargs["sep"])
    # We will use and manipulate reader_options along the way
    reader_options = batch_kwargs.copy()

    # We need to build a batch_id to be used in the dataframe
    batch_id = BatchId({"timestamp": time.time()})

    if "data_asset_type" in batch_kwargs:
        data_asset_type_config = reader_options.pop("data_asset_type")  # Get and remove the config
        try:
            data_asset_type_config = ClassConfig(**data_asset_type_config)
        except TypeError:
            # We tried; we'll pass the config downstream, probably as a string, and handle an error later
            pass
    else:
        data_asset_type_config = self._data_asset_type

    data_asset_type = self._get_data_asset_class(data_asset_type_config)
    if not issubclass(data_asset_type, PandasDataset):
        raise ValueError(
            "PandasDatasource cannot instantiate batch with data_asset_type: '%s'. It "
            "must be a subclass of PandasDataset." % data_asset_type.__name__)

    if "path" in batch_kwargs:
        path = reader_options.pop("path")  # We remove this so it is not used as a reader option
        reader_options.pop("timestamp", "")  # ditto timestamp (but missing ok)
        reader_options.pop("partition_id", "")
        reader_method = reader_options.pop("reader_method", None)
        reader_fn, reader_fn_options = self._get_reader_fn(reader_method, path, reader_options)
        try:
            df = getattr(pd, reader_fn)(path, **reader_fn_options)
        except AttributeError:
            raise BatchKwargsError("Unsupported reader: %s" % reader_method.name, batch_kwargs)

    elif "s3" in batch_kwargs:
        try:
            import boto3
            s3 = boto3.client("s3")
        except ImportError:
            raise BatchKwargsError("Unable to load boto3 client to read s3 asset.", batch_kwargs)
        raw_url = reader_options.pop("s3")  # We need to remove from the reader
        reader_options.pop("timestamp", "")  # ditto timestamp (but missing ok)
        reader_method = reader_options.pop("reader_method", None)
        url = S3Url(raw_url)
        logger.debug("Fetching s3 object. Bucket: %s Key: %s" % (url.bucket, url.key))
        s3_object = s3.get_object(Bucket=url.bucket, Key=url.key)
        reader_fn, reader_fn_options = self._get_reader_fn(reader_method, url.key, reader_options)

        try:
            df = getattr(pd, reader_fn)(
                StringIO(s3_object["Body"].read().decode(s3_object.get("ContentEncoding", "utf-8"))),
                **reader_fn_options)
        except AttributeError:
            raise BatchKwargsError("Unsupported reader: %s" % reader_method.name, batch_kwargs)
        except IOError:
            raise

    elif "dataset" in batch_kwargs and isinstance(batch_kwargs["dataset"], (pd.DataFrame, pd.Series)):
        df = batch_kwargs.pop("dataset")  # We don't want to store the actual dataframe in kwargs
        # Record this in the kwargs *and* the id
        batch_kwargs["PandasInMemoryDF"] = True
        batch_id["PandasInMemoryDF"] = True

    else:
        raise BatchKwargsError("Invalid batch_kwargs: path, s3, or dataset is required for a PandasDatasource",
                               batch_kwargs)

    if df.memory_usage().sum() < HASH_THRESHOLD:
        batch_id["fingerprint"] = hashlib.md5(pd.util.hash_pandas_object(df, index=True).values).hexdigest()

    return data_asset_type(df,
                           expectation_suite=expectation_suite,
                           data_context=self._data_context,
                           batch_kwargs=batch_kwargs,
                           batch_id=batch_id)
def _get_data_asset(self, batch_kwargs, expectation_suite, caching=True, **kwargs):
    """class-private implementation of get_data_asset"""
    if self.spark is None:
        logger.error("No spark session available")
        return None

    for k, v in kwargs.items():
        if isinstance(v, dict):
            if k in batch_kwargs and isinstance(batch_kwargs[k], dict):
                batch_kwargs[k].update(v)
            else:
                batch_kwargs[k] = v
        else:
            batch_kwargs[k] = v

    reader_options = batch_kwargs.get("reader_options", {})

    # We need to build a batch_id to be used in the dataframe
    batch_id = BatchId({"timestamp": time.time()})

    if "data_asset_type" in batch_kwargs:
        data_asset_type_config = reader_options.pop("data_asset_type")  # Get and remove the config
        try:
            data_asset_type_config = ClassConfig(**data_asset_type_config)
        except TypeError:
            # We tried; we'll pass the config downstream, probably as a string, and handle an error later
            pass
    else:
        data_asset_type_config = self._data_asset_type

    data_asset_type = self._get_data_asset_class(data_asset_type_config)
    if not issubclass(data_asset_type, SparkDFDataset):
        raise ValueError(
            "SparkDFDatasource cannot instantiate batch with data_asset_type: '%s'. It "
            "must be a subclass of SparkDFDataset." % data_asset_type.__name__)

    if "path" in batch_kwargs or "s3" in batch_kwargs:
        # If both are present, let s3 override
        path = batch_kwargs.get("path")
        path = batch_kwargs.get("s3", path)
        reader_method = batch_kwargs.get("reader_method")
        if reader_method is None:
            reader_method = self._guess_reader_method_from_path(path)
            if reader_method is None:
                raise BatchKwargsError("Unable to determine reader for path: %s" % path, batch_kwargs)
        else:
            try:
                reader_method = ReaderMethods[reader_method]
            except KeyError:
                raise BatchKwargsError("Unknown reader method: %s" % reader_method, batch_kwargs)

        reader = self.spark.read
        for option in reader_options.items():
            reader = reader.option(*option)

        if reader_method == ReaderMethods.CSV:
            df = reader.csv(path)
        elif reader_method == ReaderMethods.parquet:
            df = reader.parquet(path)
        elif reader_method == ReaderMethods.delta:
            df = reader.format("delta").load(path)
        else:
            raise BatchKwargsError("Unsupported reader: %s" % reader_method.name, batch_kwargs)

    elif "query" in batch_kwargs:
        df = self.spark.sql(batch_kwargs["query"])

    elif "dataset" in batch_kwargs and isinstance(batch_kwargs["dataset"], (DataFrame, SparkDFDataset)):
        df = batch_kwargs.get("dataset")
        # We don't want to store the actual dataframe in kwargs; copy the remaining batch_kwargs
        batch_kwargs = {k: batch_kwargs[k] for k in batch_kwargs if k != 'dataset'}
        if isinstance(df, SparkDFDataset):
            # Grab just the spark_df reference, since we want to override everything else
            df = df.spark_df
        # Record this in the kwargs *and* the id
        batch_kwargs["SparkDFRef"] = True
        batch_id["SparkDFRef"] = True

    else:
        raise BatchKwargsError("Unrecognized batch_kwargs for spark_source", batch_kwargs)

    if "limit" in batch_kwargs:
        df = df.limit(batch_kwargs['limit'])

    return data_asset_type(df,
                           expectation_suite=expectation_suite,
                           data_context=self._data_context,
                           batch_kwargs=batch_kwargs,
                           caching=caching,
                           batch_id=batch_id)
def _get_data_asset(self, batch_kwargs, expectation_suite, **kwargs):
    batch_kwargs.update(kwargs)
    reader_options = batch_kwargs.copy()

    if "data_asset_type" in reader_options:
        data_asset_type_config = reader_options.pop("data_asset_type")  # Get and remove the config
        try:
            data_asset_type_config = ClassConfig(**data_asset_type_config)
        except TypeError:
            # We tried; we'll pass the config downstream, probably as a string, and handle an error later
            pass
    else:
        data_asset_type_config = self._data_asset_type

    data_asset_type = self._get_data_asset_class(data_asset_type_config)
    if not issubclass(data_asset_type, PandasDataset):
        raise ValueError(
            "PandasDatasource cannot instantiate batch with data_asset_type: '%s'. It "
            "must be a subclass of PandasDataset." % data_asset_type.__name__)

    if "path" in batch_kwargs:
        path = reader_options.pop("path")  # We need to remove from the reader
        reader_options.pop("timestamp", "")  # ditto timestamp (but missing ok)
        reader_method = reader_options.pop("reader_method", None)
        if reader_method is None:
            reader_method = self._guess_reader_method_from_path(path)
            if reader_method is None:
                raise BatchKwargsError("Unable to determine reader for path: %s" % path, batch_kwargs)
        else:
            try:
                reader_method = ReaderMethods[reader_method]
            except KeyError:
                raise BatchKwargsError("Unknown reader method: %s" % reader_method, batch_kwargs)

        if reader_method == ReaderMethods.CSV:
            df = pd.read_csv(path, **reader_options)
        elif reader_method == ReaderMethods.parquet:
            df = pd.read_parquet(path, **reader_options)
        elif reader_method == ReaderMethods.excel:
            df = pd.read_excel(path, **reader_options)
        elif reader_method == ReaderMethods.JSON:
            df = pd.read_json(path, **reader_options)
        else:
            raise BatchKwargsError("Unsupported reader: %s" % reader_method.name, batch_kwargs)

    elif "df" in batch_kwargs and isinstance(batch_kwargs["df"], (pd.DataFrame, pd.Series)):
        df = batch_kwargs.pop("df")  # We don't want to store the actual dataframe in kwargs
        batch_kwargs["PandasInMemoryDF"] = True

    else:
        raise BatchKwargsError("Invalid batch_kwargs: path or df is required for a PandasDatasource",
                               batch_kwargs)

    return data_asset_type(df,
                           expectation_suite=expectation_suite,
                           data_context=self._data_context,
                           batch_kwargs=batch_kwargs)
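# Hedged sketch of the two batch_kwargs shapes accepted above: a "path" to a readable file
# (remaining keys are forwarded to the pandas reader), or an in-memory "df". Paths and data
# below are illustrative only.
import pandas as pd

path_batch_kwargs = {"path": "/tmp/example.csv", "sep": ","}  # reader inferred from the extension
df_batch_kwargs = {"df": pd.DataFrame({"a": [1, 2, 3]})}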