Exemplo n.º 1
0
    def from_dict(cls, config: dict) -> "GordoBaseDataProvider":
        """
        Instantiate a data provider from a config dictionary.

        Parameters
        ----------
        config: dict
            Provider configuration. The optional ``"type"`` key selects the
            provider class: either a bare class name looked up in
            ``gordo_dataset.data_provider.providers`` (default
            ``"DataLakeProvider"``), or a fully-qualified
            ``"module.path.ClassName"``. All remaining keys are passed to the
            provider class as keyword arguments.

        Returns
        -------
        GordoBaseDataProvider

        Raises
        ------
        ConfigException
            If the module cannot be imported or the class is not found in it.
        """
        provider_type = "DataLakeProvider"
        if "type" in config:
            # Copy so the pop() below does not mutate the caller's dict
            config = copy(config)
            provider_type = config.pop("type")

        if "." in provider_type:
            # Fully-qualified "module.path.ClassName"
            module_name, class_name = provider_type.rsplit(".", 1)

            # TODO validate module_name
            try:
                module = importlib.import_module(module_name)
            except ImportError as e:
                # Chain the cause so the original import failure is preserved
                raise ConfigException(
                    f"Unable to import module '{module_name}': {str(e)}"
                ) from e
        else:
            from gordo_dataset.data_provider import providers

            module_name, class_name = "gordo_dataset.data_provider", provider_type
            module = providers

        try:
            Provider = getattr(module, class_name)
        except AttributeError as e:
            raise ConfigException(
                f"Unable to find data provider '{class_name}' in module {module_name}"
            ) from e

        return Provider(**config)
Exemplo n.º 2
0
    def __init__(
        self,
        storage: Optional[Union[FileSystem, Dict[str, Any]]] = None,
        assets_config: Optional[AssetsConfig] = None,
        interactive: Optional[bool] = None,
        storename: Optional[str] = None,
        dl_service_auth_str: Optional[str] = None,
        **kwargs,
    ):
        """
        Instantiates a DataLakeBackedDataset, for fetching of data from the data lake.

        Parameters
        ----------
        storage: Optional[Union[FileSystem, Dict[str, Any]]]
            DataLake config. The structure depends on which DataLake you are going to use.
        assets_config: Optional[AssetsConfig]
            Uses assets config from `gordo_dataset.data_provider.resources` by default.
        interactive: Optional[bool]
            To perform authentication interactively, or attempt to do it
            automatically; in such a case the 'dl_service_auth_str' parameter
            (or the 'DL_SERVICE_AUTH_STR' env var) must be provided. Only for
            DataLake Gen1 and will be deprecated in new versions.
        storename: Optional[str]
            The store name to read data from. Only for DataLake Gen1 and will
            be deprecated in new versions.
        dl_service_auth_str: Optional[str]
            String on the format 'tenant_id:service_id:service_secret'. To
            perform authentication automatically; will default to the
            DL_SERVICE_AUTH_STR env var or None. Unsupported argument.

        Raises
        ------
        ConfigException
            If an unsupported argument (`secrets_loader` or
            `dl_service_auth_str`) is passed.

        .. deprecated::
            Arguments `interactive`, `storename`, `dl_service_auth_str`
        """
        if assets_config is None:
            assets_config = load_assets_config()
        self.assets_config = assets_config

        if "secrets_loader" in kwargs:
            raise ConfigException("Unsupported parameter secrets_loader")
        self.kwargs = kwargs

        # These arguments are only preserved for back-compatibility reasons
        # and will be removed in future versions of gordo
        self.adl1_kwargs: Dict[str, Any] = {}
        if interactive is not None:
            self.adl1_kwargs["interactive"] = interactive
        if storename is not None:
            self.adl1_kwargs["store_name"] = storename
        if dl_service_auth_str is not None:
            raise ConfigException("Unsupported parameter dl_service_auth_str")

        self.storage = storage
        self._storage_instance: Optional[FileSystem] = None
        self._sub_dataproviders = None
Exemplo n.º 3
0
def create_storage(storage_type: Optional[str] = None,
                   secrets_loader: Optional[ADLSecretsLoader] = None,
                   **kwargs) -> FileSystem:
    """
    Create ``FileSystem`` instance from the config.

    Parameters
    ----------
    storage_type: Optional[str]
        Storage type. Only `adl1`, `adl2` values are supported;
        defaults to ``DEFAULT_STORAGE_TYPE``.
    secrets_loader: Optional[ADLSecretsLoader]
        Loader used to fetch the Azure secrets for the chosen storage;
        defaults to ``DEFAULT_SECRETS_LOADER``.
    kwargs
        Forwarded to the chosen file system's ``create_from_env``.

    Returns
    -------
    FileSystem

    Raises
    ------
    ConfigException
        If `secrets_loader` has the wrong type or `storage_type` is unknown.
    """
    if storage_type is None:
        storage_type = DEFAULT_STORAGE_TYPE
    if secrets_loader is None:
        secrets_loader = DEFAULT_SECRETS_LOADER
    if not isinstance(secrets_loader, ADLSecretsLoader):
        # %-format the message explicitly: exceptions do not perform
        # logging-style lazy interpolation of extra positional arguments
        raise ConfigException(
            "secrets_loader should be instance of ADLSecretsLoader and not %s type"
            % type(secrets_loader)
        )
    storage: FileSystem
    if storage_type == "adl1":
        if "store_name" not in kwargs:
            kwargs["store_name"] = "dataplatformdlsprod"
        kwargs["adl_secret"] = secrets_loader.get_secret(
            storage_type, kwargs["store_name"])
        storage = adl1.ADLGen1FileSystem.create_from_env(**kwargs)
    elif storage_type == "adl2":
        if "account_name" not in kwargs:
            kwargs["account_name"] = "omniadlseun"
        if "file_system_name" not in kwargs:
            kwargs["file_system_name"] = "dls"
        kwargs["adl_secret"] = secrets_loader.get_secret(
            storage_type, kwargs["account_name"])
        storage = adl2.ADLGen2FileSystem.create_from_env(**kwargs)
    else:
        raise ConfigException("Unknown storage type '%s'" % storage_type)
    return storage
Exemplo n.º 4
0
 def get_secret(self, storage_type: str, storage_name: str) -> Optional[ADLSecret]:
     if storage_type not in self._secrets_envs:
         raise ConfigException("Unknown storage type '%s'" % storage_type)
     if storage_name not in self._secrets_envs[storage_type]:
         raise ConfigException(
             "Unknown storage name '%s' for type '%s'" % (storage_type, storage_name)
         )
     env_var_name = self._secrets_envs[storage_type][storage_name]
     env_var = os.environ.get(env_var_name)
     if not env_var:
         return None
     data = env_var.split(":")
     if len(data) != 3:
         raise ValueError(
             "Environment variable %s has %d fields, but 3 is required"
             % (env_var_name, len(data))
         )
     tenant_id, client_id, client_secret = data
     return ADLSecret(tenant_id, client_id, client_secret)
Exemplo n.º 5
0
 def _adl1_back_compatible_kwarg(self, storage_type: str,
                                 kwarg: Dict[str, Any]) -> Dict[str, Any]:
     """
     Merge the deprecated ADL Gen1 constructor arguments into `kwarg`.

     Parameters
     ----------
     storage_type: str
         Target storage type; the legacy arguments only apply to ``adl1``.
     kwarg: Dict[str, Any]
         Storage keyword arguments; values here take precedence over the
         legacy ones.

     Returns
     -------
     Dict[str, Any]

     Raises
     ------
     ConfigException
         If legacy ADL Gen1 arguments were provided but the storage type
         is not ``adl1``.
     """
     if storage_type == "adl1":
         if self.adl1_kwargs:
             # Merge into a copy so self.adl1_kwargs is never mutated
             adl1_kwarg = copy(self.adl1_kwargs)
             adl1_kwarg.update(kwarg)
             return adl1_kwarg
     else:
         if self.adl1_kwargs:
             arguments = ", ".join(self.adl1_kwargs.keys())
             # Pluralize based on the number of arguments, not the length
             # of the joined string
             raise ConfigException(
                 "%s argument%s not supported by storage '%s'" %
                 (arguments, "s" if len(self.adl1_kwargs) > 1 else "",
                  storage_type))
     return kwarg
Exemplo n.º 6
0
    def lookup(
        self,
        asset_config: AssetsConfig,
        tags: List[SensorTag],
        partitions: Iterable[Partition],
        threads_count: int = 1,
        base_dir: Optional[str] = None,
    ) -> Iterable[TagLocations]:
        """
        Takes assets paths from ``AssetsConfig`` and finds tags file paths in the data lake storage.

        Parameters
        ----------
        asset_config: AssetsConfig
        tags: List[SensorTag]
        partitions: Iterable[Partition]
        threads_count: int
            Number of threads for the internal ``ThreadPoolExecutor``.
            No thread pool is used when equal to 1.
        base_dir: Optional[str]

        Returns
        -------
        Iterable[TagLocations]

        Raises
        ------
        ConfigException
            If `threads_count` is falsy or less than 1. Note: this is a
            generator, so the exception is raised on first iteration,
            not at call time.
        """
        if not threads_count or threads_count < 1:
            raise ConfigException("threads_count should be bigger than or equal to 1")
        multi_thread = threads_count > 1
        tag_dirs = self.assets_config_tags_lookup(asset_config,
                                                  tags,
                                                  base_dir=base_dir)
        partitions_tuple = tuple(partitions)
        if multi_thread:
            with ThreadPoolExecutor(max_workers=threads_count) as executor:
                result = executor.map(
                    self._thread_pool_lookup_mapper,
                    tag_dirs,
                    self._partitions_inf_iterator(partitions_tuple),
                )
                for tag_locations in result:
                    yield tag_locations
        else:
            for tag, tag_dir in tag_dirs:
                if tag_dir is not None:
                    yield self.files_lookup(tag_dir, tag, partitions_tuple)
                else:
                    yield TagLocations(tag, None)
Exemplo n.º 7
0
    def create_from_env(
        cls,
        account_name: str,
        file_system_name: str,
        interactive: bool = False,
        adl_secret: Optional[ADLSecret] = None,
        **kwargs,
    ) -> "ADLGen2FileSystem":
        """
        Creates ADL Gen2 file system client.

        Parameters
        ----------
        account_name: str
            Azure account name
        file_system_name: str
            Container name
        interactive: bool
            If true then use interactive authentication
        adl_secret: Optional[ADLSecret]
            Azure authentication information

        Returns
        -------
        ADLGen2FileSystem

        Raises
        ------
        ConfigException
            If `interactive` is false and `adl_secret` is not an ``ADLSecret``.
        """
        if interactive:
            logger.info("Attempting to use interactive azure authentication")
            credential = InteractiveBrowserCredential()
        else:
            # isinstance (rather than an exact type() check) also accepts
            # ADLSecret subclasses and narrows the Optional for type checkers
            if not isinstance(adl_secret, ADLSecret):
                raise ConfigException("Unsupported type for adl_secret '%s'" %
                                      type(adl_secret))
            logger.info("Attempting to use datalake service authentication")
            credential = ClientSecretCredential(
                tenant_id=adl_secret.tenant_id,
                client_id=adl_secret.client_id,
                client_secret=adl_secret.client_secret,
            )
        return cls.create_from_credential(account_name, file_system_name,
                                          credential, **kwargs)
Exemplo n.º 8
0
    def create_from_env(
        cls,
        store_name: str,
        interactive: bool = False,
        adl_secret: Optional[ADLSecret] = None,
    ) -> "ADLGen1FileSystem":
        """
        Creates ADL Gen1 file system client.

        Parameters
        ----------
        store_name: str
            Name of datalake store.
        interactive: bool
            If true then use interactive authentication
        adl_secret: Optional[ADLSecret]
            Azure authentication information

        Returns
        -------
        ADLGen1FileSystem

        Raises
        ------
        ConfigException
            If `interactive` is false and `adl_secret` is not an ``ADLSecret``.
        """

        if interactive:
            logger.info("Attempting to use interactive azure authentication")
            token = lib.auth()
        else:
            # isinstance (rather than an exact type() check) also accepts
            # ADLSecret subclasses and narrows the Optional for type checkers
            if not isinstance(adl_secret, ADLSecret):
                raise ConfigException("Unsupported type for adl_secret '%s'" %
                                      type(adl_secret))
            logger.info("Attempting to use datalake service authentication")
            token = lib.auth(
                tenant_id=adl_secret.tenant_id,
                client_id=adl_secret.client_id,
                client_secret=adl_secret.client_secret,
                resource="https://datalake.azure.net/",
            )

        adl_client = core.AzureDLFileSystem(token, store_name=store_name)
        return cls(adl_client, store_name)