def ingest(config: str):
    """Main command for ingesting metadata into DataHub"""
    config_file = pathlib.Path(config)
    if not config_file.is_file():
        raise ConfigurationError(f"Cannot open config file {config}")

    config_mech: ConfigurationMechanism
    if config_file.suffix in [".yaml", ".yml"]:
        config_mech = YamlConfigurationMechanism()
    elif config_file.suffix == ".toml":
        config_mech = TomlConfigurationMechanism()
    else:
        raise ConfigurationError(
            "Only .toml and .yml are supported. Cannot process file type {}".format(
                config_file.suffix
            )
        )

    with config_file.open() as fp:
        pipeline_config = config_mech.load_config(fp)

    with nicely_formatted_validation_errors():
        logger.debug(f"Using config: {pipeline_config}")
        pipeline = Pipeline.create(pipeline_config)
        pipeline.run()
def host_colon_port_comma(cls, host_val: str) -> str:
    for entry in host_val.split(","):
        # The port can be provided but is not required.
        port = None
        for prefix in ["http://", "https://"]:
            if entry.startswith(prefix):
                entry = entry[len(prefix):]
        for suffix in ["/"]:
            if entry.endswith(suffix):
                entry = entry[:-len(suffix)]
        if ":" in entry:
            (host, port) = entry.rsplit(":", 1)
        else:
            host = entry
        if not re.match(
            # This regex is quite loose. Many invalid hostnames or IPs will slip through,
            # but it serves as a good first line of validation. We defer to Elastic for the
            # remaining validation.
            r"^[\w\-\.]+$",
            host,
        ):
            raise ConfigurationError(f"host contains bad characters, found {host}")
        if port is not None and not port.isdigit():
            raise ConfigurationError(f"port must be all digits, found {port}")
    return host_val
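# A minimal sketch (not from the source) of how a reusable validator like
# host_colon_port_comma is typically attached to a pydantic config model.
# The model and field names below are assumptions for illustration only.
from pydantic import BaseModel, validator


class ElasticsearchConnectionConfig(BaseModel):
    host: str = "localhost:9200"

    # pydantic supplies cls and the raw value, matching the signature above.
    _validate_host = validator("host", allow_reuse=True)(host_colon_port_comma)


# ElasticsearchConnectionConfig(host="localhost:abc")  # would raise ConfigurationError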
def web_service_url_scheme_host_port(cls, val: str) -> str:
    # Tokenize the web URL
    url = urlparse(val)
    if url.scheme not in ["http", "https"]:
        raise ConfigurationError(f"Scheme should be http or https, found {url.scheme}")
    if not _is_valid_hostname(str(url.hostname)):
        raise ConfigurationError(
            f"Not a valid hostname, hostname contains invalid characters, found {url.hostname}"
        )
    return config_clean.remove_trailing_slashes(val)
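# _is_valid_hostname is used above but not shown here. A plausible sketch
# (an assumption, not the actual helper) following the usual RFC 1123 rules:
import re


def _is_valid_hostname(hostname: str) -> bool:
    if len(hostname) > 253:
        return False
    # Allow one trailing dot on a fully qualified name.
    if hostname.endswith("."):
        hostname = hostname[:-1]
    # Each label: 1-63 chars, alphanumeric or hyphen, no leading/trailing hyphen.
    allowed = re.compile(r"(?!-)[A-Za-z0-9\-]{1,63}(?<!-)$")
    return all(allowed.match(label) for label in hostname.split("."))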
def platform_validator(cls, v: str) -> str:
    if not v or v in VALID_PLATFORMS:
        return v
    else:
        raise ConfigurationError(
            f"'platform' can only take the following values: {VALID_PLATFORMS}"
        )
def get_access_token(self):
    if self.__access_token != "":
        LOGGER.info("Returning the cached access token")
        return self.__access_token

    LOGGER.info("Generating PowerBi access token")
    auth_response = self.__msal_client.acquire_token_for_client(
        scopes=[self.__config.scope]
    )
    if not auth_response.get("access_token"):
        LOGGER.warning(
            "Failed to generate the PowerBi access token. Please check input configuration"
        )
        raise ConfigurationError(
            "PowerBi authorization failed. Please check your input configuration."
        )

    LOGGER.info("Generated PowerBi access token")
    self.__access_token = "Bearer {}".format(auth_response.get("access_token"))
    LOGGER.debug("{}={}".format(Constant.PBIAccessToken, self.__access_token))
    return self.__access_token
def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
    if values.get("enabled"):
        if values.get("state_provider") is None:
            raise ConfigurationError(
                "Must specify state_provider configuration if stateful ingestion is enabled."
            )
    return values
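# For context, a minimal sketch (assumed field shapes, not the source model)
# of the stateful ingestion config that validate_config guards as a pydantic
# root validator:
from typing import Any, Dict, Optional

from pydantic import BaseModel, root_validator


class StatefulIngestionConfig(BaseModel):
    enabled: bool = False
    state_provider: Optional[Dict[str, Any]] = None

    # Root validators see all fields at once, so the cross-field check works.
    _check_provider = root_validator(allow_reuse=True)(validate_config)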
def __init__(self, graph: DataHubGraph):
    self.graph = graph
    if not self._is_server_stateful_ingestion_capable():
        raise ConfigurationError(
            "Datahub server is not capable of supporting stateful ingestion."
            " Please consider upgrading to the latest server version to use this feature."
        )
def _initialize_state_provider(self) -> None:
    self.ingestion_state_provider: Optional[IngestionStateProvider] = None
    if (
        self.stateful_ingestion_config is not None
        and self.stateful_ingestion_config.state_provider is not None
        and self.stateful_ingestion_config.enabled
    ):
        if self.ctx.pipeline_name is None:
            raise ConfigurationError(
                "pipeline_name must be provided if stateful ingestion is enabled."
            )
        state_provider_class = ingestion_state_provider_registry.get(
            self.stateful_ingestion_config.state_provider.type
        )
        self.ingestion_state_provider = state_provider_class.create(
            self.stateful_ingestion_config.state_provider.dict().get("config", {}),
            self.ctx,
        )
        if self.stateful_ingestion_config.ignore_old_state:
            logger.warning(
                "The 'ignore_old_state' config is True. The old checkpoint state will not be provided."
            )
        if self.stateful_ingestion_config.ignore_new_state:
            logger.warning(
                "The 'ignore_new_state' config is True. The new checkpoint state will not be created."
            )
        logger.debug(
            f"Successfully created {self.stateful_ingestion_config.state_provider.type} state provider."
        )
def __init__(self, config: KafkaSourceConfig, ctx: PipelineContext):
    super().__init__(config, ctx)
    self.source_config = config
    if (
        self.is_stateful_ingestion_configured()
        and not self.source_config.platform_instance
    ):
        raise ConfigurationError(
            "Enabling kafka stateful ingestion requires specifying a platform instance."
        )

    self.consumer = confluent_kafka.Consumer(
        {
            "group.id": "test",
            "bootstrap.servers": self.source_config.connection.bootstrap,
            **self.source_config.connection.consumer_config,
        }
    )
    # Use the fully qualified name for SchemaRegistryClient to make it mock patchable for testing.
    self.schema_registry_client = (
        confluent_kafka.schema_registry.schema_registry_client.SchemaRegistryClient(
            {
                "url": self.source_config.connection.schema_registry_url,
                **self.source_config.connection.schema_registry_config,
            }
        )
    )
    self.report = KafkaSourceReport()
    self.known_schema_registry_subjects: List[str] = []
    try:
        self.known_schema_registry_subjects.extend(
            self.schema_registry_client.get_subjects()
        )
    except Exception as e:
        logger.warning(f"Failed to get subjects from schema registry: {e}")
def _authenticate(self):
    # https://tableau.github.io/server-client-python/docs/api-ref#authentication
    authentication = None
    if self.config.username and self.config.password:
        authentication = TableauAuth(
            username=self.config.username,
            password=self.config.password,
            site_id=self.config.site,
        )
    elif self.config.token_name and self.config.token_value:
        authentication = PersonalAccessTokenAuth(
            self.config.token_name, self.config.token_value, self.config.site
        )
    else:
        raise ConfigurationError(
            "Tableau Source: Either username/password or token_name/token_value must be set"
        )

    try:
        self.server = Server(self.config.connect_uri, use_server_version=True)
        self.server.auth.sign_in(authentication)
    except ServerResponseError as e:
        logger.error(e)
        self.report.report_failure(
            key="tableau-login",
            reason=f"Unable to login with the credentials provided. Reason: {str(e)}",
        )
    except Exception as e:
        logger.error(e)
        self.report.report_failure(
            key="tableau-login", reason=f"Unable to login. Reason: {str(e)}"
        )
def __init__(self, config: AddDatasetOwnershipConfig, ctx: PipelineContext):
    self.ctx = ctx
    self.config = config
    if self.config.semantics == Semantics.PATCH and self.ctx.graph is None:
        raise ConfigurationError(
            "With PATCH semantics, AddDatasetOwnership requires a datahub_api to connect to. "
            "Consider using the datahub-rest sink or providing a datahub_api: configuration on your ingestion recipe."
        )
def create(
    cls, config_dict: Dict[str, Any], ctx: PipelineContext
) -> IngestionStateProvider:
    if ctx.graph:
        return cls(ctx.graph)
    elif config_dict is None:
        raise ConfigurationError("Missing provider configuration")
    else:
        provider_config = DatahubIngestionStateProviderConfig.parse_obj(config_dict)
        if provider_config.datahub_api:
            graph = DataHubGraph(provider_config.datahub_api)
            return cls(graph)
        else:
            raise ConfigurationError(
                "Missing datahub_api. Provide either a global one or under the state_provider."
            )
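# Illustrative call for the create() factory above. The config shape is
# inferred from DatahubIngestionStateProviderConfig; the provider class name
# and server URL are placeholders, not confirmed by the source.
provider = DatahubIngestionStateProvider.create(
    config_dict={"datahub_api": {"server": "http://localhost:8080"}},
    ctx=ctx,  # a PipelineContext; if ctx.graph is already set, config_dict is ignored
)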
def ensure_only_issuer_or_token(
    cls, token: Optional[str], values: Dict[str, Optional[str]]
) -> Optional[str]:
    if token is not None and values.get("issuer_url") is not None:
        raise ConfigurationError(
            "Expected only one authentication method, either issuer_url or token."
        )
    return token
def create(
    cls, config_dict: Dict[str, Any], ctx: PipelineContext, name: str
) -> IngestionCheckpointingProviderBase:
    if ctx.graph:
        # Use the pipeline-level graph if set.
        return cls(ctx.graph, name)
    elif config_dict is None:
        raise ConfigurationError("Missing provider configuration.")
    else:
        provider_config = DatahubIngestionStateProviderConfig.parse_obj(config_dict)
        if provider_config.datahub_api:
            graph = DataHubGraph(provider_config.datahub_api)
            return cls(graph, name)
        else:
            raise ConfigurationError(
                "Missing datahub_api. Provide either a global one or under the state_provider."
            )
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    if self.config.write_semantics == "PATCH" and not self.ctx.graph:
        raise ConfigurationError(
            "With PATCH semantics, dbt source requires a datahub_api to connect to. "
            "Consider using the datahub-rest sink or provide a datahub_api: configuration on your ingestion recipe."
        )
    (
        nodes,
        manifest_schema,
        manifest_version,
        catalog_schema,
        catalog_version,
        manifest_nodes_raw,
    ) = loadManifestAndCatalog(
        self.config.manifest_path,
        self.config.catalog_path,
        self.config.sources_path,
        self.config.load_schemas,
        self.config.use_identifiers,
        self.config.tag_prefix,
        self.config.node_type_pattern,
        self.report,
        self.config.node_name_pattern,
    )
    additional_custom_props = {
        "manifest_schema": manifest_schema,
        "manifest_version": manifest_version,
        "catalog_schema": catalog_schema,
        "catalog_version": catalog_version,
    }
    additional_custom_props_filtered = {
        key: value
        for key, value in additional_custom_props.items()
        if value is not None
    }

    if not self.config.disable_dbt_node_creation:
        yield from self.create_platform_mces(
            nodes,
            additional_custom_props_filtered,
            manifest_nodes_raw,
            DBT_PLATFORM,
        )

    yield from self.create_platform_mces(
        nodes,
        additional_custom_props_filtered,
        manifest_nodes_raw,
        self.config.target_platform,
    )

    if self.is_stateful_ingestion_configured():
        # Clean up stale entities.
        yield from self.gen_removed_entity_workunits()
def validate_that_bigquery_audit_metadata_datasets_is_correctly_configured(
    cls, values: Dict[str, Any]
) -> Dict[str, Any]:
    if (
        values.get("use_exported_bigquery_audit_metadata")
        and not values.get("use_v2_audit_metadata")
        and not values.get("bigquery_audit_metadata_datasets")
    ):
        raise ConfigurationError(
            "bigquery_audit_metadata_datasets must be specified if using exported audit metadata. "
            "Otherwise set use_v2_audit_metadata to True."
        )
    return values
def ensure_client_id_and_secret_for_issuer_url(
    cls, client_secret: Optional[str], values: Dict[str, Optional[str]]
) -> Optional[str]:
    if values.get("issuer_url") is not None and (
        client_secret is None or values.get("client_id") is None
    ):
        raise ConfigurationError(
            "Missing configuration: client_id and client_secret are mandatory when issuer_url is set."
        )
    return client_secret
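# A sketch (assumed model and field names) of how the two paired validators,
# ensure_only_issuer_or_token and ensure_client_id_and_secret_for_issuer_url,
# could attach to one auth config. Field order matters: pydantic only exposes
# already-validated fields through `values`, so issuer_url and client_id must
# be declared before the fields these validators run on.
from typing import Optional

from pydantic import BaseModel, validator


class AuthConfig(BaseModel):
    issuer_url: Optional[str] = None
    client_id: Optional[str] = None
    client_secret: Optional[str] = None
    token: Optional[str] = None

    _single_method = validator("token", allow_reuse=True)(ensure_only_issuer_or_token)
    _issuer_creds = validator("client_secret", allow_reuse=True)(
        ensure_client_id_and_secret_for_issuer_url
    )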
def env_must_be_one_of(cls, v: str) -> str:
    # Get all the constants from FabricTypeClass. It's not an enum, so this is a bit hacky but works.
    allowed_envs = [
        value
        for name, value in vars(FabricTypeClass).items()
        if not name.startswith("_")
    ]
    if v.upper() not in allowed_envs:
        raise ConfigurationError(f"env must be one of {allowed_envs}, found {v}")
    return v.upper()
def __init__(self, config: SQLAlchemyConfig, ctx: PipelineContext, platform: str):
    super().__init__(ctx)
    self.config = config
    self.platform = platform
    self.report = SQLSourceReport()

    if self.config.profiling.enabled and not self._can_run_profiler():
        raise ConfigurationError(
            "Table profiles requested but profiler plugin is not enabled. "
            f"Try running: pip install '{__package_name__}[sql-profiles]'"
        )
def from_looker_connection(
    cls, looker_connection: DBConnection
) -> "LookerConnectionDefinition":
    """Dialect definitions are here: https://docs.looker.com/setup-and-management/database-config"""
    extractors: Dict[str, Any] = {
        "^bigquery": _get_bigquery_definition,
        ".*": _get_generic_definition,
    }

    if looker_connection.dialect_name is not None:
        for extractor_pattern, extracting_function in extractors.items():
            if re.match(extractor_pattern, looker_connection.dialect_name):
                (platform, db, schema) = extracting_function(looker_connection)
                return cls(platform=platform, default_db=db, default_schema=schema)
        raise ConfigurationError(
            f"Could not find an appropriate platform for looker_connection: {looker_connection.name} with dialect: {looker_connection.dialect_name}"
        )
    else:
        raise ConfigurationError(
            f"Unable to fetch a fully filled out connection for {looker_connection.name}. Please check your API permissions."
        )
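# The extractor functions above are expected to return a (platform, db, schema)
# tuple. A sketch of _get_generic_definition under assumed DBConnection
# attribute names (not the source implementation):
def _get_generic_definition(looker_connection: DBConnection):
    # e.g. a dialect_name like "postgres" or "snowflake" maps directly to a platform.
    platform = looker_connection.dialect_name.split("_")[0]
    db = looker_connection.database
    schema = looker_connection.schema  # may be None; some dialects have no schema
    return (platform, db, schema)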
def load_config_file(config_file: Union[pathlib.Path, str]) -> dict:
    if isinstance(config_file, str):
        config_file = pathlib.Path(config_file)
    if not config_file.is_file():
        raise ConfigurationError(f"Cannot open config file {config_file}")

    config_mech: ConfigurationMechanism
    if config_file.suffix in [".yaml", ".yml"]:
        config_mech = YamlConfigurationMechanism()
    elif config_file.suffix == ".toml":
        config_mech = TomlConfigurationMechanism()
    else:
        raise ConfigurationError(
            "Only .toml and .yml are supported. Cannot process file type {}".format(
                config_file.suffix
            )
        )

    with config_file.open() as raw_config_fp:
        raw_config_file = raw_config_fp.read()

    config_fp = io.StringIO(raw_config_file)
    config = config_mech.load_config(config_fp)
    resolve_env_variables(config)
    return config
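# Example usage (hypothetical recipe path), mirroring what the ingest command
# at the top of this section does with the loaded dict:
config = load_config_file("recipe.yml")
pipeline = Pipeline.create(config)
pipeline.run()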