class AzureADConfig(ConfigModel):
    """Config to create a token and connect to Azure AD instance"""

    # Required
    client_id: str
    tenant_id: str
    client_secret: str
    redirect: str
    authority: str
    token_url: str
    graph_url: str

    # Optional: Customize the mapping to DataHub Username from an attribute in the REST API response
    # Reference: https://docs.microsoft.com/en-us/graph/api/user-list?view=graph-rest-1.0&tabs=http#response-1
    azure_ad_response_to_username_attr: str = "mail"
    azure_ad_response_to_username_regex: str = "([^@]+)"

    # Optional: Customize the mapping to DataHub Groupname from an attribute in the REST API response
    # Reference: https://docs.microsoft.com/en-us/graph/api/group-list?view=graph-rest-1.0&tabs=http#response-1
    azure_ad_response_to_groupname_attr: str = "displayName"
    azure_ad_response_to_groupname_regex: str = "(.*)"

    # Optional: to ingest users, groups or both
    ingest_users: bool = True
    ingest_groups: bool = True
    ingest_group_membership: bool = True
    ingest_groups_users: bool = True
    users_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    groups_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
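# A hedged illustration of how the default attr/regex pair above can be applied.
# The user record is hypothetical and this only mimics, rather than reproduces,
# the source's extraction logic: read the attribute named by
# azure_ad_response_to_username_attr and keep the first regex capture group.
import re

user_response = {"mail": "jane.doe@example.com", "displayName": "Jane Doe"}
match = re.search("([^@]+)", user_response["mail"])
username = match.group(1) if match else user_response["mail"]
# username == "jane.doe"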
class SQLAlchemyConfig(ConfigModel):
    env: str = DEFAULT_ENV
    options: dict = {}
    # Although the 'table_pattern' can be used to skip everything in certain schemas,
    # a separate schema-level allow/deny option is an optimization for the case where
    # there are many schemas to skip: it avoids needlessly fetching those tables only
    # to filter them out afterwards via the table_pattern.
    schema_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    table_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    view_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    include_views: Optional[bool] = True
    include_tables: Optional[bool] = True

    @abstractmethod
    def get_sql_alchemy_url(self):
        pass

    def get_identifier(self, schema: str, table: str) -> str:
        return f"{schema}.{table}"

    def standardize_schema_table_names(
        self, schema: str, entity: str
    ) -> Tuple[str, str]:
        # Some SQLAlchemy dialects need a standardization step to clean the schema
        # and table names. See BigQuery for an example of when this is useful.
        return schema, entity
class AwsSourceConfig(ConfigModel):
    """
    Common AWS credentials config.

    Currently used by:
        - Glue source
        - SageMaker source
    """

    env: str = DEFAULT_ENV

    database_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    table_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()

    aws_access_key_id: Optional[str] = None
    aws_secret_access_key: Optional[str] = None
    aws_session_token: Optional[str] = None
    aws_role: Optional[Union[str, List[str]]] = None
    aws_region: str

    def get_client(self, service: str) -> boto3.client:
        if (
            self.aws_access_key_id
            and self.aws_secret_access_key
            and self.aws_session_token
        ):
            return boto3.client(
                service,
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key,
                aws_session_token=self.aws_session_token,
                region_name=self.aws_region,
            )
        elif self.aws_access_key_id and self.aws_secret_access_key:
            return boto3.client(
                service,
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key,
                region_name=self.aws_region,
            )
        elif self.aws_role:
            if isinstance(self.aws_role, str):
                credentials = assume_role(self.aws_role, self.aws_region)
            else:
                # Chain the roles: each assume_role call reuses the credentials
                # returned by the previous one.
                credentials = reduce(
                    lambda new_credentials, role_arn: assume_role(
                        role_arn, self.aws_region, new_credentials
                    ),
                    self.aws_role,
                    {},
                )
            return boto3.client(
                service,
                aws_access_key_id=credentials["AccessKeyId"],
                aws_secret_access_key=credentials["SecretAccessKey"],
                aws_session_token=credentials["SessionToken"],
                region_name=self.aws_region,
            )
        else:
            return boto3.client(service, region_name=self.aws_region)
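# Hedged usage sketch for the role-chaining branch of get_client(). The role ARNs
# and region are hypothetical placeholders, and the call only succeeds when valid
# AWS credentials are available in the environment.
config = AwsSourceConfig(
    aws_role=[
        "arn:aws:iam::111111111111:role/first-hop",
        "arn:aws:iam::222222222222:role/second-hop",
    ],
    aws_region="us-east-1",
)
glue_client = config.get_client("glue")  # assumes the roles in order, then builds the client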
class MongoDBConfig(ConfigModel):
    # See the MongoDB authentication docs for details and examples.
    # https://pymongo.readthedocs.io/en/stable/examples/authentication.html
    connect_uri: str = "mongodb://localhost"
    username: Optional[str] = None
    password: Optional[str] = None
    authMechanism: Optional[str] = None
    options: dict = {}
    enableSchemaInference: bool = True
    schemaSamplingSize: Optional[PositiveInt] = 1000
    useRandomSampling: bool = True
    maxSchemaSize: Optional[PositiveInt] = 300
    # mongodb only supports 16MB as max size for documents. However, if we try to
    # retrieve a larger document it errors out with "16793600" as the maximum size supported.
    maxDocumentSize: Optional[PositiveInt] = 16793600
    env: str = DEFAULT_ENV
    database_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    collection_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()

    @validator("maxDocumentSize")
    def check_max_doc_size_filter_is_valid(cls, doc_size_filter_value):
        if doc_size_filter_value > 16793600:
            raise ValueError("maxDocumentSize must be a positive value <= 16793600.")
        return doc_size_filter_value
class DBTConfig(ConfigModel):
    manifest_path: str
    catalog_path: str
    sources_path: Optional[str]
    env: str = DEFAULT_ENV
    target_platform: str
    load_schemas: bool = True
    use_identifiers: bool = False
    node_type_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    tag_prefix: str = f"{DBT_PLATFORM}:"
    node_name_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    disable_dbt_node_creation: bool = False
    meta_mapping: Dict = {}
    enable_meta_mapping: bool = True
    write_semantics: str = "PATCH"
    strip_user_ids_from_email: bool = False

    @validator("target_platform")
    def validate_target_platform_value(cls, target_platform: str) -> str:
        if target_platform.lower() == DBT_PLATFORM:
            raise ValueError(
                "target_platform cannot be dbt. It should be the platform which dbt "
                "is operating on top of, e.g. postgres."
            )
        return target_platform

    @validator("write_semantics")
    def validate_write_semantics(cls, write_semantics: str) -> str:
        if write_semantics.lower() not in {"patch", "override"}:
            raise ValueError(
                "write_semantics must be either PATCH or OVERRIDE (the default is PATCH). "
                "For PATCH semantics consider using the datahub-rest sink or "
                "provide a datahub_api: configuration on your ingestion recipe"
            )
        return write_semantics
class SnowflakeUsageConfig(
    BaseSnowflakeConfig, BaseUsageConfig, StatefulIngestionConfigBase
):
    env: str = builder.DEFAULT_ENV
    options: dict = {}
    database_pattern: AllowDenyPattern = AllowDenyPattern(
        deny=[r"^UTIL_DB$", r"^SNOWFLAKE$", r"^SNOWFLAKE_SAMPLE_DATA$"]
    )
    email_domain: Optional[str]
    schema_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    table_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    view_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    apply_view_usage_to_tables: bool = False
    stateful_ingestion: Optional[SnowflakeStatefulIngestionConfig] = None

    @pydantic.validator("role", always=True)
    def role_accountadmin(cls, v):
        if not v or v.lower() != "accountadmin":
            # This isn't an error, since the privileges can be delegated to other
            # roles as well: https://docs.snowflake.com/en/sql-reference/account-usage.html#enabling-account-usage-for-other-roles
            logger.info(
                'snowflake usage tables are only accessible by role "accountadmin" by default; you set %s',
                v,
            )
        return v

    def get_sql_alchemy_url(self):
        return super().get_sql_alchemy_url(database="snowflake")
class GlueSourceConfig(ConfigModel):
    env: str = "PROD"

    database_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    table_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()

    aws_access_key_id: Optional[str] = None
    aws_secret_access_key: Optional[str] = None
    aws_session_token: Optional[str] = None
    aws_region: str

    @property
    def glue_client(self):
        if (
            self.aws_access_key_id
            and self.aws_secret_access_key
            and self.aws_session_token
        ):
            return boto3.client(
                "glue",
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key,
                aws_session_token=self.aws_session_token,
                region_name=self.aws_region,
            )
        elif self.aws_access_key_id and self.aws_secret_access_key:
            return boto3.client(
                "glue",
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key,
                region_name=self.aws_region,
            )
        else:
            return boto3.client("glue", region_name=self.aws_region)
class RedashConfig(ConfigModel):
    # See the Redash API for details
    # https://redash.io/help/user-guide/integrations-and-api/api
    connect_uri: str = Field(
        default="http://localhost:5000", description="Redash base URL."
    )
    api_key: str = Field(default="REDASH_API_KEY", description="Redash user API key.")

    # Optionals
    dashboard_patterns: AllowDenyPattern = Field(
        default=AllowDenyPattern.allow_all(),
        description="regex patterns for dashboards to filter for ingestion.",
    )
    chart_patterns: AllowDenyPattern = Field(
        default=AllowDenyPattern.allow_all(),
        description="regex patterns for charts to filter for ingestion.",
    )
    skip_draft: bool = Field(
        default=True, description="Only ingest published dashboards and charts."
    )
    api_page_limit: int = Field(
        default=sys.maxsize,
        description="Limit on ingested dashboards and charts API pagination.",
    )
    parse_table_names_from_sql: bool = Field(
        default=False, description="See note below."
    )
    sql_parser: str = Field(
        default="datahub.utilities.sql_parser.DefaultSQLParser",
        description="custom SQL parser. See note below for details.",
    )
    env: str = Field(
        default=DEFAULT_ENV,
        description="Environment to use in namespace when constructing URNs.",
    )
class SQLAlchemyConfig(StatefulIngestionConfigBase):
    env: str = DEFAULT_ENV
    options: dict = {}
    # Although the 'table_pattern' can be used to skip everything in certain schemas,
    # a separate schema-level allow/deny option is an optimization for the case where
    # there are many schemas to skip: it avoids needlessly fetching those tables only
    # to filter them out afterwards via the table_pattern.
    schema_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    table_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    view_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    profile_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    include_views: Optional[bool] = True
    include_tables: Optional[bool] = True

    from datahub.ingestion.source.ge_data_profiler import GEProfilingConfig

    profiling: GEProfilingConfig = GEProfilingConfig()
    # Custom Stateful Ingestion settings
    stateful_ingestion: Optional[SQLAlchemyStatefulIngestionConfig] = None

    @pydantic.root_validator()
    def ensure_profiling_pattern_is_passed_to_profiling(
        cls, values: Dict[str, Any]
    ) -> Dict[str, Any]:
        profiling = values.get("profiling")
        if profiling is not None and profiling.enabled:
            profiling.allow_deny_patterns = values["profile_pattern"]
        return values

    @abstractmethod
    def get_sql_alchemy_url(self):
        pass
class LookMLSourceConfig(ConfigModel):
    base_folder: pydantic.DirectoryPath
    connection_to_platform_map: Dict[str, str]
    platform_name: str = "looker"
    model_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    view_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    env: str = builder.DEFAULT_ENV
    parse_table_names_from_sql: bool = False
class GlueSourceConfig(ConfigModel):
    env: str = "PROD"

    database_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    table_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()

    extract_transforms: Optional[bool] = True

    aws_access_key_id: Optional[str] = None
    aws_secret_access_key: Optional[str] = None
    aws_session_token: Optional[str] = None
    aws_role: Optional[Union[str, List[str]]] = None
    aws_region: str

    def get_client(self, service: str) -> boto3.client:
        if (
            self.aws_access_key_id
            and self.aws_secret_access_key
            and self.aws_session_token
        ):
            return boto3.client(
                service,
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key,
                aws_session_token=self.aws_session_token,
                region_name=self.aws_region,
            )
        elif self.aws_access_key_id and self.aws_secret_access_key:
            return boto3.client(
                service,
                aws_access_key_id=self.aws_access_key_id,
                aws_secret_access_key=self.aws_secret_access_key,
                region_name=self.aws_region,
            )
        elif self.aws_role:
            if isinstance(self.aws_role, str):
                credentials = assume_role(self.aws_role, self.aws_region)
            else:
                credentials = reduce(
                    lambda new_credentials, role_arn: assume_role(
                        role_arn, self.aws_region, new_credentials
                    ),
                    self.aws_role,
                    {},
                )
            return boto3.client(
                service,
                aws_access_key_id=credentials["AccessKeyId"],
                aws_secret_access_key=credentials["SecretAccessKey"],
                aws_session_token=credentials["SessionToken"],
                region_name=self.aws_region,
            )
        else:
            return boto3.client(service, region_name=self.aws_region)

    @property
    def glue_client(self):
        return self.get_client("glue")

    @property
    def s3_client(self):
        return self.get_client("s3")
class LookMLSourceConfig(ConfigModel):  # pragma: no cover
    base_folder: str
    connection_to_platform_map: Dict[str, str]
    platform_name: str = "looker_views"
    actor: str = "urn:li:corpuser:etl"
    model_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    view_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    env: str = "PROD"
    parse_table_names_from_sql: bool = False
class LookerDashboardSourceConfig(ConfigModel):
    client_id: str
    client_secret: str
    base_url: str
    platform_name: str = "looker"
    actor: str = "urn:li:corpuser:etl"
    dashboard_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    chart_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    env: str = "PROD"
class LookMLSourceConfig(LookerCommonConfig):
    base_folder: pydantic.DirectoryPath
    connection_to_platform_map: Optional[Dict[str, LookerConnectionDefinition]]
    model_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    view_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    parse_table_names_from_sql: bool = False
    sql_parser: str = "datahub.utilities.sql_parser.DefaultSQLParser"
    api: Optional[LookerAPIConfig]
    project_name: Optional[str]
    transport_options: Optional[TransportOptions]

    @validator("platform_instance")
    def platform_instance_not_supported(cls, v: str) -> str:
        raise ConfigurationError(
            "LookML Source doesn't support platform instance at the top level. "
            "However, connection-specific platform instances are supported for "
            "generating lineage edges. Read the documentation to find out more."
        )

    @validator("connection_to_platform_map", pre=True)
    def convert_string_to_connection_def(cls, conn_map):
        # Previous versions of the config supported strings in the connection map.
        # This upconverts those strings to LookerConnectionDefinition objects.
        for key in conn_map:
            if isinstance(conn_map[key], str):
                platform = conn_map[key]
                if "." in platform:
                    platform_db_split = conn_map[key].split(".")
                    connection = LookerConnectionDefinition(
                        platform=platform_db_split[0],
                        default_db=platform_db_split[1],
                        default_schema="",
                    )
                    conn_map[key] = connection
                else:
                    logger.warning(
                        f"Connection map for {key} provides platform {platform} but does not provide a default database name. This might result in failed resolution"
                    )
                    conn_map[key] = LookerConnectionDefinition(
                        platform=platform, default_db="", default_schema=""
                    )
        return conn_map

    @root_validator()
    def check_either_connection_map_or_connection_provided(cls, values):
        """Validate that we must either have a connection map or an api credential"""
        if not values.get("connection_to_platform_map", {}) and not values.get(
            "api", {}
        ):
            raise ConfigurationError(
                "Neither api nor connection_to_platform_map config was found. LookML source requires either api credentials for Looker or a map of connection names to platform identifiers to work correctly"
            )
        return values

    @root_validator()
    def check_either_project_name_or_api_provided(cls, values):
        """Validate that we must either have a project name or an api credential to fetch project names"""
        if not values.get("project_name") and not values.get("api"):
            raise ConfigurationError(
                "Neither project_name nor an API credential was found. LookML source requires either api credentials for Looker or a project_name to accurately name views and models."
            )
        return values
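# A small sketch of the string-to-connection upconversion performed by
# convert_string_to_connection_def above; the connection name and the
# "platform.default_db" value are hypothetical.
legacy_conn_map = {"warehouse": "snowflake.analytics_db"}
platform, default_db = legacy_conn_map["warehouse"].split(".")
connection = LookerConnectionDefinition(
    platform=platform, default_db=default_db, default_schema=""
)
# connection is equivalent to what the validator would store for "warehouse".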
def test_trino_ingest(
    loaded_trino, test_resources_dir, pytestconfig, tmp_path, mock_time
):
    # Run the metadata ingestion pipeline.
    with fs_helpers.isolated_filesystem(tmp_path):
        # Run the metadata ingestion pipeline for trino catalog referring to postgres database
        mce_out_file = "trino_mces.json"
        events_file = tmp_path / mce_out_file

        pipeline_config = {
            "run_id": "trino-test",
            "source": {
                "type": data_platform,
                "config": TrinoConfig(
                    host_port="localhost:5300",
                    database="postgresqldb",
                    database_alias="library_catalog",
                    username="******",
                    schema_pattern=AllowDenyPattern(allow=["^librarydb"]),
                    profile_pattern=AllowDenyPattern(
                        allow=["library_catalog.librarydb.*"]
                    ),
                    profiling=GEProfilingConfig(
                        enabled=True,
                        include_field_null_count=True,
                        include_field_min_value=True,
                        include_field_max_value=True,
                        include_field_mean_value=True,
                        include_field_median_value=True,
                        include_field_stddev_value=True,
                        include_field_quantiles=True,
                        include_field_distinct_value_frequencies=True,
                        include_field_histogram=True,
                        include_field_sample_values=True,
                    ),
                ).dict(),
            },
            "sink": {
                "type": "file",
                "config": FileSinkConfig(filename=str(events_file)).dict(),
            },
        }

        # Run the metadata ingestion pipeline.
        pipeline = Pipeline.create(pipeline_config)
        pipeline.run()
        pipeline.pretty_print_summary()
        pipeline.raise_from_status(raise_warnings=True)

        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path="trino_mces.json",
            golden_path=test_resources_dir / "trino_mces_golden.json",
        )
class LookerDashboardSourceConfig(LookerAPIConfig, LookerCommonConfig):
    platform_name: str = "looker"
    actor: Optional[str]
    dashboard_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    chart_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    include_deleted: bool = False
    env: str = builder.DEFAULT_ENV
    extract_owners: bool = True
    strip_user_ids_from_email: bool = False
    skip_personal_folders: bool = False
def test_fully_speced():
    pattern = AllowDenyPattern(allow=["foo.mytable"])
    assert pattern.is_fully_specified_allow_list()

    pattern = AllowDenyPattern(allow=["foo.*", "foo.table"])
    assert not pattern.is_fully_specified_allow_list()

    pattern = AllowDenyPattern(allow=["foo.?", "foo.table"])
    assert not pattern.is_fully_specified_allow_list()
class MongoDBConfig(ConfigModel):
    # See the MongoDB authentication docs for details and examples.
    # https://pymongo.readthedocs.io/en/stable/examples/authentication.html
    connect_uri: str = "mongodb://localhost"
    username: Optional[str] = None
    password: Optional[str] = None
    authMechanism: Optional[str] = None
    options: dict = {}
    database_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    collection_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
class LookerDashboardSourceConfig(ConfigModel):
    client_id: str
    client_secret: str
    base_url: str
    platform_name: str = "looker"
    # The datahub platform where looker views are stored, must be the same as
    # `platform_name` in lookml source
    view_platform_name: str = "looker_views"
    actor: str = "urn:li:corpuser:etl"
    dashboard_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    chart_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    env: str = "PROD"
class RedashConfig(ConfigModel):
    # See the Redash API for details
    # https://redash.io/help/user-guide/integrations-and-api/api
    connect_uri: str = "http://localhost:5000"
    api_key: str = "REDASH_API_KEY"
    env: str = DEFAULT_ENV

    # Optionals
    dashboard_patterns: AllowDenyPattern = AllowDenyPattern.allow_all()
    chart_patterns: AllowDenyPattern = AllowDenyPattern.allow_all()
    skip_draft: bool = True
    api_page_limit: int = sys.maxsize
class SnowflakeUsageConfig(
    BaseSnowflakeConfig, BaseUsageConfig, StatefulIngestionConfigBase
):
    options: dict = pydantic.Field(
        default_factory=dict,
        description="Any options specified here will be passed to SQLAlchemy's create_engine as kwargs. See https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine for details.",
    )
    database_pattern: AllowDenyPattern = pydantic.Field(
        default=AllowDenyPattern(
            deny=[r"^UTIL_DB$", r"^SNOWFLAKE$", r"^SNOWFLAKE_SAMPLE_DATA$"]
        ),
        description="List of regex patterns for databases to include/exclude in usage ingestion.",
    )
    email_domain: Optional[str] = pydantic.Field(
        description="Email domain of your organisation so users can be displayed on UI appropriately."
    )
    schema_pattern: AllowDenyPattern = pydantic.Field(
        default=AllowDenyPattern.allow_all(),
        description="List of regex patterns for schemas to include/exclude in usage ingestion.",
    )
    table_pattern: AllowDenyPattern = pydantic.Field(
        default=AllowDenyPattern.allow_all(),
        description="List of regex patterns for tables to include in ingestion.",
    )
    view_pattern: AllowDenyPattern = pydantic.Field(
        default=AllowDenyPattern.allow_all(),
        description="List of regex patterns for views to include in ingestion.",
    )
    apply_view_usage_to_tables: bool = pydantic.Field(
        default=False,
        description="Whether to apply view usage to the underlying tables.",
    )
    stateful_ingestion: Optional[SnowflakeStatefulIngestionConfig] = pydantic.Field(
        default=None, description="Stateful ingestion related configs"
    )

    def get_options(self) -> dict:
        options_connect_args: Dict = super().get_sql_alchemy_connect_args()
        options_connect_args.update(self.options.get("connect_args", {}))
        self.options["connect_args"] = options_connect_args
        return self.options

    def get_sql_alchemy_url(self):
        return super().get_sql_alchemy_url(
            database="snowflake",
            username=self.username,
            password=self.password,
            role=self.role,
        )
class LookerDashboardSourceConfig(LookerAPIConfig, LookerCommonConfig):
    dashboard_pattern: AllowDenyPattern = Field(
        AllowDenyPattern.allow_all(),
        description="Patterns for selecting dashboard ids that are to be included",
    )
    chart_pattern: AllowDenyPattern = Field(
        AllowDenyPattern.allow_all(),
        description="Patterns for selecting chart ids that are to be included",
    )
    include_deleted: bool = Field(
        False, description="Whether to include deleted dashboards."
    )
    extract_owners: bool = Field(
        True,
        description="When enabled, extracts ownership from Looker directly. When disabled, ownership is left empty for dashboards and charts.",
    )
    actor: Optional[str] = Field(
        None,
        description="This config is deprecated in favor of `extract_owners`. Previously, was the actor to use in ownership properties of ingested metadata.",
    )
    strip_user_ids_from_email: bool = Field(
        False,
        description="When enabled, converts Looker user emails of the form name@example.com to urn:li:corpuser:name when assigning ownership",
    )
    skip_personal_folders: bool = Field(
        False,
        description="Whether to skip ingestion of dashboards in personal folders. Setting this to True will only ingest dashboards in the Shared folder space.",
    )
    max_threads: int = Field(
        os.cpu_count() or 40,
        description="Max parallelism for Looker API calls. Defaults to cpuCount or 40",
    )
    external_base_url: Optional[str] = Field(
        None,
        description="Optional URL to use when constructing external URLs to Looker if the `base_url` is not the correct one to use. For example, `https://looker-public.company.com`. If not provided, the external base URL will default to `base_url`.",
    )

    @validator("external_base_url", pre=True, always=True)
    def external_url_defaults_to_api_config_base_url(
        cls, v: Optional[str], *, values: Dict[str, Any], **kwargs: Dict[str, Any]
    ) -> str:
        return v or values["base_url"]

    @validator("platform_instance")
    def platform_instance_not_supported(cls, v: str) -> str:
        raise ConfigurationError("Looker Source doesn't support platform instances")
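# Minimal sketch of the strip_user_ids_from_email behavior described above;
# the email address is hypothetical.
email = "jane@example.com"
owner_urn = f"urn:li:corpuser:{email.split('@')[0]}"  # -> "urn:li:corpuser:jane"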
class MongoDBConfig(ConfigModel):
    # See the MongoDB authentication docs for details and examples.
    # https://pymongo.readthedocs.io/en/stable/examples/authentication.html
    connect_uri: str = "mongodb://localhost"
    username: Optional[str] = None
    password: Optional[str] = None
    authMechanism: Optional[str] = None
    options: dict = {}
    enableSchemaInference: bool = True
    schemaSamplingSize: Optional[PositiveInt] = 1000
    env: str = DEFAULT_ENV
    database_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
    collection_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
class RedashConfig(ConfigModel):
    # See the Redash API for details
    # https://redash.io/help/user-guide/integrations-and-api/api
    connect_uri: str = "http://localhost:5000"
    api_key: str = "REDASH_API_KEY"
    env: str = DEFAULT_ENV

    # Optionals
    dashboard_patterns: AllowDenyPattern = AllowDenyPattern.allow_all()
    chart_patterns: AllowDenyPattern = AllowDenyPattern.allow_all()
    skip_draft: bool = True
    api_page_limit: int = sys.maxsize
    parse_table_names_from_sql: bool = False
    sql_parser: str = "datahub.utilities.sql_parser.DefaultSQLParser"
class MongoDBConfig(EnvBasedSourceConfigBase):
    # See the MongoDB authentication docs for details and examples.
    # https://pymongo.readthedocs.io/en/stable/examples/authentication.html
    connect_uri: str = Field(
        default="mongodb://localhost", description="MongoDB connection URI."
    )
    username: Optional[str] = Field(default=None, description="MongoDB username.")
    password: Optional[str] = Field(default=None, description="MongoDB password.")
    authMechanism: Optional[str] = Field(
        default=None, description="MongoDB authentication mechanism."
    )
    options: dict = Field(
        default={},
        description="Additional options to pass to `pymongo.MongoClient()`.",
    )
    enableSchemaInference: bool = Field(
        default=True, description="Whether to infer schemas."
    )
    schemaSamplingSize: Optional[PositiveInt] = Field(
        default=1000,
        description="Number of documents to use when inferring schema size. If set to `0`, all documents will be scanned.",
    )
    useRandomSampling: bool = Field(
        default=True,
        description="If documents for schema inference should be randomly selected. If `False`, documents will be selected from start.",
    )
    maxSchemaSize: Optional[PositiveInt] = Field(
        default=300, description="Maximum number of fields to include in the schema."
    )
    # mongodb only supports 16MB as max size for documents. However, if we try to
    # retrieve a larger document it errors out with "16793600" as the maximum size supported.
    maxDocumentSize: Optional[PositiveInt] = Field(default=16793600, description="")
    database_pattern: AllowDenyPattern = Field(
        default=AllowDenyPattern.allow_all(),
        description="regex patterns for databases to filter in ingestion.",
    )
    collection_pattern: AllowDenyPattern = Field(
        default=AllowDenyPattern.allow_all(),
        description="regex patterns for collections to filter in ingestion.",
    )

    @validator("maxDocumentSize")
    def check_max_doc_size_filter_is_valid(cls, doc_size_filter_value):
        if doc_size_filter_value > 16793600:
            raise ValueError("maxDocumentSize must be a positive value <= 16793600.")
        return doc_size_filter_value
def test_case_sensitivity():
    pattern = AllowDenyPattern(allow=["Foo.myTable"])
    assert pattern.allowed("foo.mytable")
    assert pattern.allowed("FOO.MYTABLE")
    assert pattern.allowed("Foo.MyTable")

    pattern = AllowDenyPattern(allow=["Foo.myTable"], ignoreCase=False)
    assert not pattern.allowed("foo.mytable")
    assert pattern.allowed("Foo.myTable")
class NifiSourceConfig(ConfigModel):
    site_url: str

    auth: NifiAuthType = NifiAuthType.NO_AUTH

    provenance_days: int = 7  # Fetch provenance events for past 1 week
    process_group_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()

    # Required for nifi deployments using Remote Process Groups
    site_name: str = "default"
    site_url_to_site_name: Dict[str, str] = {}

    # Required to be set if auth is of type SINGLE_USER
    username: Optional[str]
    password: Optional[str]

    # Required to be set if auth is of type CLIENT_CERT
    client_cert_file: Optional[str]
    client_key_file: Optional[str]
    client_key_password: Optional[str]

    # Required to be set if nifi server certificate is not signed by
    # root CA trusted by client system, e.g. self-signed certificates
    ca_file: Optional[str]

    env: str = builder.DEFAULT_ENV
class SnowflakeConfig(BaseSnowflakeConfig, SQLAlchemyConfig):
    database_pattern: AllowDenyPattern = AllowDenyPattern(
        deny=[r"^UTIL_DB$", r"^SNOWFLAKE$", r"^SNOWFLAKE_SAMPLE_DATA$"]
    )

    provision_role: Optional[SnowflakeProvisionRoleConfig] = None
    ignore_start_time_lineage: bool = False
    upstream_lineage_in_report: bool = False

    def get_sql_alchemy_url(
        self,
        database: Optional[str] = None,
        username: Optional[str] = None,
        password: Optional[pydantic.SecretStr] = None,
        role: Optional[str] = None,
    ) -> str:
        return super().get_sql_alchemy_url(
            database=database, username=username, password=password, role=role
        )

    def get_options(self) -> dict:
        options_connect_args: Dict = super().get_sql_alchemy_connect_args()
        options_connect_args.update(self.options.get("connect_args", {}))
        self.options["connect_args"] = options_connect_args
        return self.options
def is_dataset_pattern_allowed(
    dataset_name: Optional[Any], dataset_type: Optional[Any]
) -> bool:
    # Note: this helper is presumably defined inside a source method, so `self`
    # below is assumed to be available via closure.
    # TODO: support table/view patterns for usage logs by pulling that information as well from the usage query
    if not dataset_type or not dataset_name:
        return True
    table_or_view_pattern: Optional[AllowDenyPattern] = AllowDenyPattern.allow_all()
    # Test domain type = external_table and then add it
    table_or_view_pattern = (
        self.config.table_pattern
        if dataset_type.lower() in {"table"}
        else (
            self.config.view_pattern
            if dataset_type.lower() in {"view", "materialized_view"}
            else None
        )
    )
    if table_or_view_pattern is None:
        return True

    dataset_params = dataset_name.split(".")
    assert len(dataset_params) == 3
    if (
        not self.config.database_pattern.allowed(dataset_params[0])
        or not self.config.schema_pattern.allowed(dataset_params[1])
        or not table_or_view_pattern.allowed(dataset_params[2])
    ):
        return False
    return True
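# The assert above implies three-part dataset names; a hypothetical example:
dataset_name = "ANALYTICS_DB.PUBLIC.ORDERS"
database, schema, table = dataset_name.split(".")
# database == "ANALYTICS_DB", schema == "PUBLIC", table == "ORDERS"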
class ElasticsearchSourceConfig(ConfigModel):
    host: str = "localhost:9200"
    username: str = ""
    password: str = ""
    env: str = DEFAULT_ENV
    index_pattern: AllowDenyPattern = AllowDenyPattern(
        allow=[".*"], deny=["^_.*", "^ilm-history.*"]
    )

    @validator("host")
    def host_colon_port_comma(cls, host_val: str) -> str:
        for entry in host_val.split(","):
            # The port can be provided but is not required.
            port = None
            if ":" in entry:
                (host, port) = entry.rsplit(":", 1)
            else:
                host = entry
            assert re.match(
                # This regex is quite loose. Many invalid hostnames or IPs will slip through,
                # but it serves as a good first line of validation. We defer to Elasticsearch
                # for the remaining validation.
                r"^[\w\-\.\:]+$",
                host,
            ), f"host contains bad characters, found {host}"
            if port is not None:
                assert port.isdigit(), f"port must be all digits, found {port}"
        return host_val
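# Hedged example of a host value the validator above accepts: a comma-separated
# list where each entry may optionally carry a port (hostnames are hypothetical).
config = ElasticsearchSourceConfig(host="es-1.internal:9200,es-2.internal")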