Example #1
def ingest(config: str):
    """Main command for ingesting metadata into DataHub"""

    config_file = pathlib.Path(config)
    if not config_file.is_file():
        raise ConfigurationError(f"Cannot open config file {config}")

    config_mech: ConfigurationMechanism
    if config_file.suffix in [".yaml", ".yml"]:
        config_mech = YamlConfigurationMechanism()
    elif config_file.suffix == ".toml":
        config_mech = TomlConfigurationMechanism()
    else:
        raise ConfigurationError(
            "Only .toml and .yml are supported. Cannot process file type {}".format(
                config_file.suffix
            )
        )

    with config_file.open() as fp:
        pipeline_config = config_mech.load_config(fp)

    with nicely_formatted_validation_errors():
        logger.debug(f"Using config: {pipeline_config}")
        pipeline = Pipeline.create(pipeline_config)
    pipeline.run()
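For reference, the same Pipeline API can be driven directly from Python, bypassing the CLI's file handling. A minimal sketch under assumed settings (the recipe dict below is illustrative, not taken from this example):

# Minimal sketch: creating and running a Pipeline programmatically.
# The source/sink types and the filename here are illustrative assumptions.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {"type": "file", "config": {"filename": "./mces.json"}},
        "sink": {"type": "console"},
    }
)
pipeline.run()
pipeline.raise_from_status()  # surface any errors the run recorded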
Example #2
    def host_colon_port_comma(cls, host_val: str) -> str:
        for entry in host_val.split(","):
            # The port can be provided but is not required.
            port = None
            for prefix in ["http://", "https://"]:
                if entry.startswith(prefix):
                    entry = entry[len(prefix):]
            for suffix in ["/"]:
                if entry.endswith(suffix):
                    entry = entry[:-len(suffix)]

            if ":" in entry:
                (host, port) = entry.rsplit(":", 1)
            else:
                host = entry
            if not re.match(
                    # This regex is quite loose. Many invalid hostnames or IPs will slip through,
                    # but it serves as a good first line of validation. We defer to Elastic for the
                    # remaining validation.
                    r"^[\w\-\.]+$",
                    host,
            ):
                raise ConfigurationError(
                    f"host contains bad characters, found {host}")
            if port is not None and not port.isdigit():
                raise ConfigurationError(
                    f"port must be all digits, found {port}")
        return host_val
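Methods like this are pydantic validators shown without their class context. A minimal sketch of the typical wiring, assuming pydantic v1 (the model name and field are illustrative, not DataHub's actual ones):

from pydantic import BaseModel, validator

class ElasticsearchConfigSketch(BaseModel):  # hypothetical model name
    host: str = "localhost:9200"

    @validator("host")
    def host_colon_port_comma(cls, host_val: str) -> str:
        # ... per-entry validation as in the example above ...
        return host_val

# Accepts comma-separated entries with optional scheme, port, and trailing slash:
ElasticsearchConfigSketch(host="http://es1:9200,es2:9200/")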
Example #3
File: pulsar.py Project: hsheth2/datahub
    def web_service_url_scheme_host_port(cls, val: str) -> str:
        # Tokenize the web url
        url = urlparse(val)

        if url.scheme not in ["http", "https"]:
            raise ConfigurationError(
                f"Scheme should be http or https, found {url.scheme}"
            )

        if not _is_valid_hostname(str(url.hostname)):
            raise ConfigurationError(
                f"Not a valid hostname, hostname contains invalid characters, found {url.hostname}"
            )

        return config_clean.remove_trailing_slashes(val)
Example #4
 def platform_validator(cls, v: str) -> str:
     if not v or v in VALID_PLATFORMS:
         return v
     else:
         raise ConfigurationError(
             f"'platform' can only take following values: {VALID_PLATFORMS}"
         )
Example #5
File: powerbi.py Project: hsheth2/datahub
    def get_access_token(self):
        if self.__access_token != "":
            LOGGER.info("Returning the cached access token")
            return self.__access_token

        LOGGER.info("Generating PowerBi access token")

        auth_response = self.__msal_client.acquire_token_for_client(
            scopes=[self.__config.scope])

        if not auth_response.get("access_token"):
            LOGGER.warning(
                "Failed to generate the PowerBi access token. Please check input configuration"
            )
            raise ConfigurationError(
                "PowerBi authorization failed. Please check your input configuration."
            )

        LOGGER.info("Generated PowerBi access token")

        self.__access_token = "Bearer {}".format(
            auth_response.get("access_token"))

        LOGGER.debug("{}={}".format(Constant.PBIAccessToken,
                                    self.__access_token))

        return self.__access_token
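The __msal_client above comes from the msal library. A minimal sketch of how such a client is typically constructed (the authority, ids, and the PowerBI scope shown here are placeholder assumptions, not values from the example):

import msal

msal_client = msal.ConfidentialClientApplication(
    client_id="<application-id>",
    client_credential="<client-secret>",
    authority="https://login.microsoftonline.com/<tenant-id>",
)
response = msal_client.acquire_token_for_client(
    scopes=["https://analysis.windows.net/powerbi/api/.default"]
)
access_token = response.get("access_token")  # None on failure, as checked above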
Example #6
 def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
     if values.get("enabled"):
         if values.get("state_provider") is None:
             raise ConfigurationError(
                 "Must specify state_provider configuration if stateful ingestion is enabled."
             )
     return values
Example #7
 def __init__(self, graph: DataHubGraph):
     self.graph = graph
     if not self._is_server_stateful_ingestion_capable():
         raise ConfigurationError(
             "Datahub server is not capable of supporting stateful ingestion."
             " Please consider upgrading to the latest server version to use this feature."
         )
Example #8
    def _initialize_state_provider(self) -> None:
        self.ingestion_state_provider: Optional[IngestionStateProvider] = None
        if (
            self.stateful_ingestion_config is not None
            and self.stateful_ingestion_config.state_provider is not None
            and self.stateful_ingestion_config.enabled
        ):
            if self.ctx.pipeline_name is None:
                raise ConfigurationError(
                    "pipeline_name must be provided if stateful ingestion is enabled."
                )
            state_provider_class = ingestion_state_provider_registry.get(
                self.stateful_ingestion_config.state_provider.type
            )
            self.ingestion_state_provider = state_provider_class.create(
                self.stateful_ingestion_config.state_provider.dict().get("config", {}),
                self.ctx,
            )
            if self.stateful_ingestion_config.ignore_old_state:
                logger.warning(
                    "The 'ignore_old_state' config is True. The old checkpoint state will not be provided."
                )
            if self.stateful_ingestion_config.ignore_new_state:
                logger.warning(
                    "The 'ignore_new_state' config is True. The new checkpoint state will not be created."
                )

            logger.debug(
                f"Successfully created {self.stateful_ingestion_config.state_provider.type} state provider."
            )
Example #9
File: kafka.py Project: shirshanka/datahub
    def __init__(self, config: KafkaSourceConfig, ctx: PipelineContext):
        super().__init__(config, ctx)
        self.source_config = config
        if (
            self.is_stateful_ingestion_configured()
            and not self.source_config.platform_instance
        ):
            raise ConfigurationError(
                "Enabling kafka stateful ingestion requires to specify a platform instance."
            )

        self.consumer = confluent_kafka.Consumer(
            {
                "group.id": "test",
                "bootstrap.servers": self.source_config.connection.bootstrap,
                **self.source_config.connection.consumer_config,
            }
        )
        # Use the fully qualified name for SchemaRegistryClient to make it mock patchable for testing.
        self.schema_registry_client = (
            confluent_kafka.schema_registry.schema_registry_client.SchemaRegistryClient(
                {
                    "url": self.source_config.connection.schema_registry_url,
                    **self.source_config.connection.schema_registry_config,
                }
            )
        )
        self.report = KafkaSourceReport()
        self.known_schema_registry_subjects: List[str] = []
        try:
            self.known_schema_registry_subjects.extend(
                self.schema_registry_client.get_subjects()
            )
        except Exception as e:
            logger.warning(f"Failed to get subjects from schema registry: {e}")
Example #10
    def _authenticate(self):
        # https://tableau.github.io/server-client-python/docs/api-ref#authentication
        authentication = None
        if self.config.username and self.config.password:
            authentication = TableauAuth(
                username=self.config.username,
                password=self.config.password,
                site_id=self.config.site,
            )
        elif self.config.token_name and self.config.token_value:
            authentication = PersonalAccessTokenAuth(self.config.token_name,
                                                     self.config.token_value,
                                                     self.config.site)
        else:
            raise ConfigurationError(
                "Tableau Source: Either username/password or token_name/token_value must be set"
            )

        try:
            self.server = Server(self.config.connect_uri,
                                 use_server_version=True)
            self.server.auth.sign_in(authentication)
        except ServerResponseError as e:
            logger.error(e)
            self.report.report_failure(
                key="tableau-login",
                reason="Unable to login with the credentials provided. "
                f"Reason: {str(e)}",
            )
        except Exception as e:
            logger.error(e)
            self.report.report_failure(key="tableau-login",
                                       reason="Unable to login. "
                                       f"Reason: {str(e)}")
Example #11
 def __init__(self, config: AddDatasetOwnershipConfig, ctx: PipelineContext):
     self.ctx = ctx
     self.config = config
     if self.config.semantics == Semantics.PATCH and self.ctx.graph is None:
         raise ConfigurationError(
             "With PATCH semantics, AddDatasetOwnership requires a datahub_api to connect to. Consider using the datahub-rest sink or provide a datahub_api: configuration on your ingestion recipe"
         )
Example #12
 def create(cls, config_dict: Dict[str, Any],
            ctx: PipelineContext) -> IngestionStateProvider:
     if ctx.graph:
         return cls(ctx.graph)
     elif config_dict is None:
         raise ConfigurationError("Missing provider configuration")
     else:
         provider_config = DatahubIngestionStateProviderConfig.parse_obj(
             config_dict)
         if provider_config.datahub_api:
             graph = DataHubGraph(provider_config.datahub_api)
             return cls(graph)
         else:
             raise ConfigurationError(
                 "Missing datahub_api. Provide either a global one or under the state_provider."
             )
Example #13
File: pulsar.py Project: hsheth2/datahub
 def ensure_only_issuer_or_token(
     cls, token: Optional[str], values: Dict[str, Optional[str]]
 ) -> Optional[str]:
     if token is not None and values.get("issuer_url") is not None:
         raise ConfigurationError(
             "Expected only one authentication method, either issuer_url or token."
         )
     return token
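The values argument is a pydantic v1 convention: it holds the fields already validated, in declaration order, so issuer_url must be declared before token for this check to see it. A minimal sketch of the assumed wiring (model name and field layout are illustrative):

from typing import Optional
from pydantic import BaseModel, validator

class PulsarAuthSketch(BaseModel):  # hypothetical model name
    issuer_url: Optional[str] = None
    token: Optional[str] = None  # declared after issuer_url on purpose

    @validator("token")
    def ensure_only_issuer_or_token(cls, token, values):
        if token is not None and values.get("issuer_url") is not None:
            # pydantic v1 wraps ValueError into a ValidationError
            raise ValueError("Expected only one authentication method.")
        return token

PulsarAuthSketch(token="abc")  # ok; passing both fields raises instead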
Example #14
 def create(cls, config_dict: Dict[str, Any], ctx: PipelineContext,
            name: str) -> IngestionCheckpointingProviderBase:
     if ctx.graph:
         # Use the pipeline-level graph if set
         return cls(ctx.graph, name)
     elif config_dict is None:
         raise ConfigurationError("Missing provider configuration.")
     else:
         provider_config = DatahubIngestionStateProviderConfig.parse_obj(
             config_dict)
         if provider_config.datahub_api:
             graph = DataHubGraph(provider_config.datahub_api)
             return cls(graph, name)
         else:
             raise ConfigurationError(
                 "Missing datahub_api. Provide either a global one or under the state_provider."
             )
Example #15
    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        if self.config.write_semantics == "PATCH" and not self.ctx.graph:
            raise ConfigurationError(
                "With PATCH semantics, dbt source requires a datahub_api to connect to. "
                "Consider using the datahub-rest sink or provide a datahub_api: configuration on your ingestion recipe."
            )

        (
            nodes,
            manifest_schema,
            manifest_version,
            catalog_schema,
            catalog_version,
            manifest_nodes_raw,
        ) = loadManifestAndCatalog(
            self.config.manifest_path,
            self.config.catalog_path,
            self.config.sources_path,
            self.config.load_schemas,
            self.config.use_identifiers,
            self.config.tag_prefix,
            self.config.node_type_pattern,
            self.report,
            self.config.node_name_pattern,
        )

        additional_custom_props = {
            "manifest_schema": manifest_schema,
            "manifest_version": manifest_version,
            "catalog_schema": catalog_schema,
            "catalog_version": catalog_version,
        }

        additional_custom_props_filtered = {
            key: value
            for key, value in additional_custom_props.items()
            if value is not None
        }

        if not self.config.disable_dbt_node_creation:
            yield from self.create_platform_mces(
                nodes,
                additional_custom_props_filtered,
                manifest_nodes_raw,
                DBT_PLATFORM,
            )

        yield from self.create_platform_mces(
            nodes,
            additional_custom_props_filtered,
            manifest_nodes_raw,
            self.config.target_platform,
        )

        if self.is_stateful_ingestion_configured():
            # Clean up stale entities.
            yield from self.gen_removed_entity_workunits()
Example #16
File: bigquery.py Project: hsheth2/datahub
 def validate_that_bigquery_audit_metadata_datasets_is_correctly_configured(
         cls, values: Dict[str, Any]) -> Dict[str, Any]:
     if (values.get("use_exported_bigquery_audit_metadata")
             and not values.get("use_v2_audit_metadata")
             and not values.get("bigquery_audit_metadata_datasets")):
         raise ConfigurationError(
             "bigquery_audit_metadata_datasets must be specified if using exported audit metadata. Otherwise set use_v2_audit_metadata to True."
         )
     return values
Example #17
File: pulsar.py Project: hsheth2/datahub
 def ensure_client_id_and_secret_for_issuer_url(
     cls, client_secret: Optional[str], values: Dict[str, Optional[str]]
 ) -> Optional[str]:
     if values.get("issuer_url") is not None and (
         client_secret is None or values.get("client_id") is None
     ):
         raise ConfigurationError(
             "Missing configuration: client_id and client_secret are mandatory when issuer_url is set."
         )
     return client_secret
Example #18
 def env_must_be_one_of(cls, v: str) -> str:
     # Get all the constants from the FabricTypeClass. It's not an enum, so this is a bit hacky but works
     allowed_envs = [
         value for name, value in vars(FabricTypeClass).items()
         if not name.startswith("_")
     ]
     if (v.upper()) not in allowed_envs:
         raise ConfigurationError(
             f"env must be one of {allowed_envs}, found {v}")
     return v.upper()
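The vars() comprehension enumerates the public constants of a plain (non-Enum) class, as the comment notes. The same trick on a toy stand-in class, for illustration:

class FabricTypeSketch:  # hypothetical stand-in for the codegen'd FabricTypeClass
    PROD = "PROD"
    DEV = "DEV"
    _private = "filtered out"

allowed = [
    value for name, value in vars(FabricTypeSketch).items()
    if not name.startswith("_")
]
assert allowed == ["PROD", "DEV"]  # dunders like __module__ are filtered out too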
Example #19
    def __init__(self, config: SQLAlchemyConfig, ctx: PipelineContext,
                 platform: str):
        super().__init__(ctx)
        self.config = config
        self.platform = platform
        self.report = SQLSourceReport()

        if self.config.profiling.enabled and not self._can_run_profiler():
            raise ConfigurationError(
                "Table profiles requested but profiler plugin is not enabled. "
                f"Try running: pip install '{__package_name__}[sql-profiles]'")
Example #20
    def from_looker_connection(
        cls, looker_connection: DBConnection
    ) -> "LookerConnectionDefinition":
        """Dialect definitions are here: https://docs.looker.com/setup-and-management/database-config"""
        extractors: Dict[str, Any] = {
            "^bigquery": _get_bigquery_definition,
            ".*": _get_generic_definition,
        }

        if looker_connection.dialect_name is not None:
            for extractor_pattern, extracting_function in extractors.items():
                if re.match(extractor_pattern, looker_connection.dialect_name):
                    (platform, db, schema) = extracting_function(looker_connection)
                    return cls(platform=platform, default_db=db, default_schema=schema)
            raise ConfigurationError(
                f"Could not find an appropriate platform for looker_connection: {looker_connection.name} with dialect: {looker_connection.dialect_name}"
            )
        else:
            raise ConfigurationError(
                f"Unable to fetch a fully filled out connection for {looker_connection.name}. Please check your API permissions."
            )
Example #21
def load_config_file(config_file: Union[pathlib.Path, str]) -> dict:
    if isinstance(config_file, str):
        config_file = pathlib.Path(config_file)
    if not config_file.is_file():
        raise ConfigurationError(f"Cannot open config file {config_file}")

    config_mech: ConfigurationMechanism
    if config_file.suffix in [".yaml", ".yml"]:
        config_mech = YamlConfigurationMechanism()
    elif config_file.suffix == ".toml":
        config_mech = TomlConfigurationMechanism()
    else:
        raise ConfigurationError(
            "Only .toml and .yml are supported. Cannot process file type {}".format(
                config_file.suffix
            )
        )

    with config_file.open() as raw_config_fp:
        raw_config_file = raw_config_fp.read()
    config_fp = io.StringIO(raw_config_file)
    config = config_mech.load_config(config_fp)
    resolve_env_variables(config)
    return config
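A short usage sketch (the recipe filename and environment variable are illustrative; DataHub recipes support ${VAR}-style expansion, which is what resolve_env_variables handles here):

import os

# Hypothetical recipe.yml whose sink config references ${DATAHUB_GMS_TOKEN}.
os.environ["DATAHUB_GMS_TOKEN"] = "<token>"

config = load_config_file("recipe.yml")  # ConfigurationError if file is missing
print(config.get("sink"))  # plain dict, with environment variables resolved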