@classmethod
def from_dict(cls, config: dict) -> "GordoBaseDataProvider":
    provider_type = "DataLakeProvider"
    if "type" in config:
        config = copy(config)
        provider_type = config.pop("type")
    module = None
    if "." in provider_type:
        module_name, class_name = provider_type.rsplit(".", 1)
        # TODO validate module_name
        try:
            module = importlib.import_module(module_name)
        except ImportError as e:
            raise ConfigException(
                f"Unable to import module '{module_name}': {str(e)}"
            )
    else:
        from gordo_dataset.data_provider import providers

        module_name, class_name = "gordo_dataset.data_provider", provider_type
        module = providers
    try:
        Provider = getattr(module, class_name)
    except AttributeError:
        raise ConfigException(
            f"Unable to find data provider '{class_name}' in module {module_name}"
        )
    return Provider(**config)
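# Usage sketch for ``from_dict`` (the config values below are illustrative,
# not from a real deployment):
#
#   # Bare type: resolved inside gordo_dataset.data_provider.providers
#   provider = GordoBaseDataProvider.from_dict({"type": "DataLakeProvider"})
#
#   # Dotted type: "<module>.<class>" is imported; remaining keys are passed
#   # to the provider's constructor as keyword arguments
#   provider = GordoBaseDataProvider.from_dict(
#       {"type": "my_package.providers.CustomProvider", "option": 1}
#   )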
def __init__(
    self,
    storage: Optional[Union[FileSystem, Dict[str, Any]]] = None,
    assets_config: Optional[AssetsConfig] = None,
    interactive: Optional[bool] = None,
    storename: Optional[str] = None,
    dl_service_auth_str: Optional[str] = None,
    **kwargs,
):
    """
    Instantiates a DataLakeBackedDataset for fetching data from the data lake.

    Parameters
    ----------
    storage: Optional[Union[FileSystem, Dict[str, Any]]]
        DataLake config. The structure depends on which DataLake you are
        going to use.
    assets_config: Optional[AssetsConfig]
        Uses the assets config from ``gordo_dataset.data_provider.resources``
        by default.
    interactive: bool
        To perform authentication interactively, or attempt to do it
        automatically; in the latter case the 'dl_service_auth_str' parameter
        or the 'DL_SERVICE_AUTH_STR' env var must be provided.
        Only for DataLake Gen1 and will be deprecated in future versions.
    storename
        The store name to read data from.
        Only for DataLake Gen1 and will be deprecated in future versions.
    dl_service_auth_str: Optional[str]
        String in the format 'tenant_id:service_id:service_secret'.
        To perform authentication automatically; defaults to the
        DL_SERVICE_AUTH_STR env var or None. Unsupported argument.

    .. deprecated::
        Arguments `interactive`, `storename`, `dl_service_auth_str`
    """
    if assets_config is None:
        assets_config = load_assets_config()
    self.assets_config = assets_config
    if "secrets_loader" in kwargs:
        raise ConfigException("Unsupported parameter secrets_loader")
    self.kwargs = kwargs
    # These arguments are only preserved for backward-compatibility reasons
    # and will be removed in future versions of gordo
    self.adl1_kwargs: Dict[str, Any] = {}
    if interactive is not None:
        self.adl1_kwargs["interactive"] = interactive
    if storename is not None:
        self.adl1_kwargs["store_name"] = storename
    if dl_service_auth_str is not None:
        raise ConfigException("Unsupported parameter dl_service_auth_str")
    self.storage = storage
    self._storage_instance: Optional[FileSystem] = None
    self._sub_dataproviders = None
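# Construction sketch (assumption: this __init__ belongs to the
# DataLakeProvider class; the storage dict mirrors ``create_storage`` kwargs):
#
#   provider = DataLakeProvider(
#       storage={"type": "adl2", "account_name": "omniadlseun"}
#   )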
def create_storage(
    storage_type: Optional[str] = None,
    secrets_loader: Optional[ADLSecretsLoader] = None,
    **kwargs,
) -> FileSystem:
    """
    Create a ``FileSystem`` instance from the config.

    Parameters
    ----------
    storage_type: Optional[str]
        Storage type; only `adl1` and `adl2` values are supported.
    secrets_loader: Optional[ADLSecretsLoader]
    kwargs

    Returns
    -------
    FileSystem
    """
    if storage_type is None:
        storage_type = DEFAULT_STORAGE_TYPE
    if secrets_loader is None:
        secrets_loader = DEFAULT_SECRETS_LOADER
    if not isinstance(secrets_loader, ADLSecretsLoader):
        raise ConfigException(
            "secrets_loader should be an instance of ADLSecretsLoader, not %s"
            % type(secrets_loader)
        )
    storage: FileSystem
    if storage_type == "adl1":
        if "store_name" not in kwargs:
            kwargs["store_name"] = "dataplatformdlsprod"
        kwargs["adl_secret"] = secrets_loader.get_secret(
            storage_type, kwargs["store_name"]
        )
        storage = adl1.ADLGen1FileSystem.create_from_env(**kwargs)
    elif storage_type == "adl2":
        if "account_name" not in kwargs:
            kwargs["account_name"] = "omniadlseun"
        if "file_system_name" not in kwargs:
            kwargs["file_system_name"] = "dls"
        kwargs["adl_secret"] = secrets_loader.get_secret(
            storage_type, kwargs["account_name"]
        )
        storage = adl2.ADLGen2FileSystem.create_from_env(**kwargs)
    else:
        raise ConfigException("Unknown storage type '%s'" % storage_type)
    return storage
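# Usage sketch: with no arguments, DEFAULT_STORAGE_TYPE and
# DEFAULT_SECRETS_LOADER apply; the names below are the built-in defaults
# shown above:
#
#   storage = create_storage()  # default storage type
#   storage = create_storage("adl2", account_name="omniadlseun")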
def get_secret(self, storage_type: str, storage_name: str) -> Optional[ADLSecret]:
    if storage_type not in self._secrets_envs:
        raise ConfigException("Unknown storage type '%s'" % storage_type)
    if storage_name not in self._secrets_envs[storage_type]:
        raise ConfigException(
            "Unknown storage name '%s' for type '%s'" % (storage_name, storage_type)
        )
    env_var_name = self._secrets_envs[storage_type][storage_name]
    env_var = os.environ.get(env_var_name)
    if not env_var:
        return None
    data = env_var.split(":")
    if len(data) != 3:
        raise ValueError(
            "Environment variable %s has %d fields, but 3 are required"
            % (env_var_name, len(data))
        )
    tenant_id, client_id, client_secret = data
    return ADLSecret(tenant_id, client_id, client_secret)
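# Format of the environment variable this method parses; the variable name
# comes from self._secrets_envs ("DL2_SECRET" here is hypothetical):
#
#   export DL2_SECRET="<tenant_id>:<client_id>:<client_secret>"
#
# get_secret() then returns ADLSecret(tenant_id, client_id, client_secret),
# or None when the variable is unset or empty.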
def _adl1_back_compatible_kwarg(
    self, storage_type: str, kwarg: Dict[str, Any]
) -> Dict[str, Any]:
    if storage_type == "adl1":
        if self.adl1_kwargs:
            adl1_kwarg = copy(self.adl1_kwargs)
            adl1_kwarg.update(kwarg)
            return adl1_kwarg
    else:
        if self.adl1_kwargs:
            arguments = ", ".join(self.adl1_kwargs.keys())
            raise ConfigException(
                "Argument(s) %s not supported by storage '%s'"
                % (arguments, storage_type)
            )
    return kwarg
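# Behaviour sketch (hypothetical values):
#
#   self.adl1_kwargs = {"interactive": True}
#   self._adl1_back_compatible_kwarg("adl1", {"store_name": "x"})
#   # -> {"interactive": True, "store_name": "x"}
#   self._adl1_back_compatible_kwarg("adl2", {})  # raises ConfigException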
def lookup(
    self,
    asset_config: AssetsConfig,
    tags: List[SensorTag],
    partitions: Iterable[Partition],
    threads_count: int = 1,
    base_dir: Optional[str] = None,
) -> Iterable[TagLocations]:
    """
    Takes asset paths from ``AssetsConfig`` and finds the tags' file paths
    in the data lake storage.

    Parameters
    ----------
    asset_config: AssetsConfig
    tags: List[SensorTag]
    partitions: Iterable[Partition]
    threads_count: int
        Number of threads for the internal `ThreadPool`. The thread pool is
        not used if 1.
    base_dir: Optional[str]

    Returns
    -------
    Iterable[TagLocations]
    """
    if not threads_count or threads_count < 1:
        raise ConfigException("threads_count should be greater than or equal to 1")
    multi_thread = threads_count > 1
    tag_dirs = self.assets_config_tags_lookup(asset_config, tags, base_dir=base_dir)
    partitions_tuple = tuple(partitions)
    if multi_thread:
        with ThreadPoolExecutor(max_workers=threads_count) as executor:
            result = executor.map(
                self._thread_pool_lookup_mapper,
                tag_dirs,
                self._partitions_inf_iterator(partitions_tuple),
            )
            for tag_locations in result:
                yield tag_locations
    else:
        for tag, tag_dir in tag_dirs:
            if tag_dir is not None:
                yield self.files_lookup(tag_dir, tag, partitions_tuple)
            else:
                yield TagLocations(tag, None)
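# Usage sketch (assumes ``lookuper`` exposes this method, with assets config,
# tags and partitions built elsewhere in gordo_dataset; names are
# illustrative):
#
#   for tag_locations in lookuper.lookup(
#       assets_config, tags, partitions, threads_count=4
#   ):
#       ...  # each item is a TagLocations; it may hold no file locations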
@classmethod
def create_from_env(
    cls,
    account_name: str,
    file_system_name: str,
    interactive: bool = False,
    adl_secret: Optional[ADLSecret] = None,
    **kwargs,
) -> "ADLGen2FileSystem":
    """
    Creates an ADL Gen2 file system client.

    Parameters
    ----------
    account_name: str
        Azure account name
    file_system_name: str
        Container name
    interactive: bool
        If true, use interactive authentication
    adl_secret: ADLSecret
        Azure authentication information

    Returns
    -------
    ADLGen2FileSystem
    """
    if interactive:
        logger.info("Attempting to use interactive azure authentication")
        credential = InteractiveBrowserCredential()
    else:
        if type(adl_secret) is not ADLSecret:
            raise ConfigException(
                "Unsupported type for adl_secret '%s'" % type(adl_secret)
            )
        adl_secret = cast(ADLSecret, adl_secret)
        logger.info("Attempting to use datalake service authentication")
        credential = ClientSecretCredential(
            tenant_id=adl_secret.tenant_id,
            client_id=adl_secret.client_id,
            client_secret=adl_secret.client_secret,
        )
    return cls.create_from_credential(
        account_name, file_system_name, credential, **kwargs
    )
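# Usage sketch for Gen2 (assumes an ADLSecret built from real credentials):
#
#   fs = ADLGen2FileSystem.create_from_env(
#       account_name="omniadlseun",
#       file_system_name="dls",
#       adl_secret=ADLSecret(tenant_id, client_id, client_secret),
#   )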
@classmethod
def create_from_env(
    cls,
    store_name: str,
    interactive: bool = False,
    adl_secret: Optional[ADLSecret] = None,
) -> "ADLGen1FileSystem":
    """
    Creates an ADL Gen1 file system client.

    Parameters
    ----------
    store_name: str
        Name of the datalake store.
    interactive: bool
        If true, use interactive authentication
    adl_secret: ADLSecret
        Azure authentication information

    Returns
    -------
    ADLGen1FileSystem
    """
    if interactive:
        logger.info("Attempting to use interactive azure authentication")
        token = lib.auth()
    else:
        if type(adl_secret) is not ADLSecret:
            raise ConfigException(
                "Unsupported type for adl_secret '%s'" % type(adl_secret)
            )
        adl_secret = cast(ADLSecret, adl_secret)
        logger.info("Attempting to use datalake service authentication")
        token = lib.auth(
            tenant_id=adl_secret.tenant_id,
            client_id=adl_secret.client_id,
            client_secret=adl_secret.client_secret,
            resource="https://datalake.azure.net/",
        )
    adl_client = core.AzureDLFileSystem(token, store_name=store_name)
    return cls(adl_client, store_name)
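# Usage sketch for Gen1; interactive=True triggers a browser/device prompt
# via azure-datalake-store's lib.auth():
#
#   fs = ADLGen1FileSystem.create_from_env(
#       store_name="dataplatformdlsprod", interactive=True
#   )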