Example #1
    def __init__(self, connector_contract: ConnectorContract):
        """ initialise the Handler passing the connector_contract dictionary

        Extra Parameters in the ConnectorContract kwargs:
            - region_name (optional) session region name
            - profile_name (optional) session shared credentials file profile name
        """
        self.botocore = HandlerFactory.get_module('botocore.exceptions')
        self.boto3 = HandlerFactory.get_module('boto3')
        super().__init__(connector_contract)
        cc_params = connector_contract.kwargs
        cc_params.update(connector_contract.query
                         )  # Update kwargs with those in the uri query
        region_name = cc_params.pop('region_name', 'us-east-2')
        aws_access_key_id = cc_params.pop('aws_access_key_id',
                                          os.environ.get('AWS_ACCESS_KEY_ID'))
        aws_secret_access_key = cc_params.pop(
            'aws_secret_access_key', os.environ.get('AWS_SECRET_ACCESS_KEY'))
        aws_session_token = cc_params.pop('aws_session_token',
                                          os.environ.get('AWS_SESSION_TOKEN'))
        profile_name = cc_params.pop('profile_name', None)
        self._session = self.boto3.Session(
            region_name=region_name,
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            profile_name=profile_name,
            aws_session_token=aws_session_token)
        self._file_state = 0
        self._changed_flag = True
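
A usage sketch for the handler above: the optional session settings travel in the ConnectorContract kwargs and are popped out in __init__. The ConnectorContract import path and the AwsS3PersistHandler name below are assumptions for illustration, not taken from the snippet.

# minimal sketch, assuming the aistac ConnectorContract(uri, module_name, handler, **kwargs) signature
from aistac.handlers.abstract_handlers import ConnectorContract  # assumed import path

contract = ConnectorContract(uri='s3://my-bucket/data/sample.parquet',               # hypothetical bucket/key
                             module_name='ds_connectors.handlers.aws_s3_handlers',   # assumed module
                             handler='AwsS3PersistHandler',                          # hypothetical handler name
                             region_name='eu-west-1',                                # consumed by cc_params.pop('region_name', ...)
                             profile_name='analytics')                               # optional shared-credentials profile
# handler = AwsS3PersistHandler(contract)  # the boto3 Session is created inside __init__
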
Example #2
 def __init__(self, connector_contract: ConnectorContract):
     """ initialise the Hander passing the source_contract dictionary """
     # required module import
     self.psycopg2 = HandlerFactory.get_module('psycopg2')
     super().__init__(connector_contract)
     self._file_state = 0
     self._changed_flag = True
    def _yaml_dump(self, data, path_file, **kwargs) -> None:
        """ dump YAML file

        :param data: the data to persist
        :param path_file: the name and path of the file
        :param default_flow_style: (optional) if to include the default YAML flow style
        """
        module_name = 'yaml'
        if HandlerFactory.check_module(module_name=module_name):
            module = HandlerFactory.get_module(module_name=module_name)
        else:
            raise ModuleNotFoundError(
                f"The required module {module_name} has not been installed. "
                f"Please pip install the appropriate package in order to complete this action"
            )
        encoding = kwargs.pop('encoding', 'utf-8')
        default_flow_style = kwargs.pop('default_flow_style', False)
        with self._lock:
            # make sure the dump is clean
            try:
                with closing(open(path_file, mode='w',
                                  encoding=encoding)) as ymlfile:
                    module.safe_dump(data=data,
                                     stream=ymlfile,
                                     default_flow_style=default_flow_style,
                                     **kwargs)
            except IOError as e:
                raise IOError(
                    f"The yaml file {path_file} failed to open with: {e}")
        return
    @staticmethod
    def _yaml_load(path_file, **kwargs) -> dict:
        """ loads the YAML file

        :param path_file: the name and path of the file
        :return: a dictionary
        """
        module_name = 'yaml'
        if HandlerFactory.check_module(module_name=module_name):
            module = HandlerFactory.get_module(module_name=module_name)
        else:
            raise ModuleNotFoundError(
                f"The required module {module_name} has not been installed. "
                f"Please pip install the appropriate package in order to complete this action"
            )
        encoding = kwargs.pop('encoding', 'utf-8')
        try:
            with closing(open(path_file, mode='r',
                              encoding=encoding)) as ymlfile:
                rtn_dict = module.safe_load(ymlfile)
        except IOError as e:
            raise IOError(
                f"The yaml file {path_file} failed to open with: {e}")
        if not isinstance(rtn_dict, dict) or not rtn_dict:
            raise TypeError(
                f"The yaml file {path_file} could not be loaded as a dict type"
            )
        return rtn_dict
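
Both YAML helpers rely on the same lazy-import pattern: check_module confirms the optional dependency is importable before get_module loads it, so 'yaml' only needs to be installed when these methods are actually used. A minimal sketch of what such a pair could look like, assuming thin wrappers over importlib (the real HandlerFactory may differ):

import importlib
import importlib.util

class LazyFactory:
    """illustrative stand-in for HandlerFactory, not the library's implementation"""

    @staticmethod
    def check_module(module_name: str) -> bool:
        # True only if the module can be resolved on the current path
        try:
            return importlib.util.find_spec(module_name) is not None
        except ModuleNotFoundError:
            return False

    @staticmethod
    def get_module(module_name: str):
        # import on demand so optional extras stay optional
        return importlib.import_module(module_name)
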
 def has_changed(self) -> bool:
     """ returns the status of the change_flag indicating if the file has changed since last load or reset"""
     if not self.exists():
         return False
     # maintain the change flag
     _cc = self.connector_contract
     if _cc.schema.startswith('http') or _cc.schema.startswith('git'):
         if not isinstance(self.connector_contract, ConnectorContract):
             raise ValueError(
                 "The Pandas Connector Contract has not been set")
         module_name = 'requests'
         _address = _cc.address.replace("git://", "https://")
         if HandlerFactory.check_module(module_name=module_name):
             module = HandlerFactory.get_module(module_name=module_name)
             state = module.head(_address).headers.get('last-modified', 0)
         else:
             raise ModuleNotFoundError(
                 f"The required module {module_name} has not been installed. Please pip "
                 f"install the appropriate package in order to complete this action"
             )
     else:
         state = os.stat(_cc.address).st_mtime_ns
     if state != self._file_state:
         self._changed_flag = True
         self._file_state = state
     return self._changed_flag
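
A sketch of how a caller might use the change flag to avoid redundant loads; load_canonical and reset_changed are assumed companion methods on the handler and could be named differently in the real class:

# hypothetical polling pattern around has_changed()
if handler.has_changed():              # True on first call or when mtime / last-modified moves
    canonical = handler.load_canonical()
    handler.reset_changed(False)       # assumed helper that clears the internal _changed_flag
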
    def register_estimator(self,
                           canonical: pd.DataFrame,
                           target: str,
                           headers: list,
                           class_name: str,
                           module_name: str,
                           hyper_param: dict = None,
                           test_size: float = None,
                           random_state: int = None,
                           save_intent: bool = None,
                           model_name: str = None,
                           intent_order: int = None,
                           replace_intent: bool = None,
                           remove_duplicates: bool = None):
        """ registers and fits an estimator model returning the model fit

        :param canonical: the model canonical
        :param class_name: the name of the model class
        :param target: the model target
        :param headers: the model features header names
        :param hyper_param: (optional) hyper parameters for the model instance
        :param test_size: (optional) the size of the test sample (defaults to 0.33)
        :param random_state:  (optional) a random state value for the test sample
        :param module_name: the name of the module containing the model class
        :param save_intent: (optional) if the intent contract should be saved to the property manager
        :param model_name: (optional) the name of the model
        :param intent_order: (optional) the order in which each intent should run.
                        If None: default's to -1
                        if -1: added to a level above any current instance of the intent section, level 0 if not found
                        if int: added to the level specified, overwriting any that already exist
        :param replace_intent: (optional) if the intent method exists at the level, or default level
                        True - replaces the current intent method with the new
                        False - leaves it untouched, disregarding the new intent
        :param remove_duplicates: (optional) removes any duplicate intent in any level that is identical
        :return: the fitted estimator model
        """
        # resolve intent persist options
        _method = inspect.currentframe().f_code.co_name
        self._set_intend_signature(self._intent_builder(method=_method,
                                                        params=locals()),
                                   model_name=model_name,
                                   intent_order=intent_order,
                                   replace_intent=replace_intent,
                                   remove_duplicates=remove_duplicates,
                                   save_intent=save_intent)
        # Code block for intent
        local_intent = {}
        if model_name and self._pm.has_intent(model_name):
            local_intent = self._pm.get_intent(level=model_name,
                                               intent=_method)
        module_name = module_name if isinstance(
            module_name, str) else local_intent.get('module_name', None)
        X = Commons.filter_columns(canonical, headers=headers)
        y = Commons.filter_columns(canonical, headers=target)
        module = HandlerFactory.get_module(module_name='ds_behavioral')
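
The example is truncated after the feature/target split. A hedged sketch of how the remaining fit step could proceed, assuming a scikit-learn style estimator resolved dynamically from module_name and class_name; this is illustrative only and not the library's actual continuation:

import importlib
from sklearn.model_selection import train_test_split  # assumed dependency

# resolve the estimator class named in the intent and fit it on a train split
estimator_cls = getattr(importlib.import_module(module_name), class_name)
model = estimator_cls(**(hyper_param or {}))
test_size = test_size if isinstance(test_size, float) else 0.33
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
model.fit(X_train, y_train)
# the fitted model would then be returned to the caller
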
Example #7
 def __init__(self, connector_contract: ConnectorContract):
     """ Initialise the handler passing the source_contract dictionary """
     super().__init__(connector_contract)
     self.cortex_content = HandlerFactory.get_module('cortex.content')
     self.token = self._load_token()
     self.api_endpoint = self._load_api_endpoint()
     self.project = self._load_project_name()
     self.cortex_mc_client = self.cortex_content.ManagedContentClient(
         url=self.api_endpoint, token=self.token)
     self._etag = 0
     self._changed_flag = True
 def __init__(self, connector_contract: ConnectorContract):
     """ initialise the Hander passing the source_contract dictionary """
     # required module import
     self.mongo = HandlerFactory.get_module('pymongo')
     super().__init__(connector_contract)
     database = self.connector_contract.kwargs.get("database")
     self._mongo_database = self.mongo.MongoClient(
         self.connector_contract.uri)[database]
     self._mongo_collection = self._mongo_database[
         self.connector_contract.kwargs.get("collection")]
     self._file_state = 0
     self._changed_flag = True
 def _get_canonical(self,
                    data: [pd.DataFrame, pd.Series, list, str, dict],
                    header: str = None) -> pd.DataFrame:
     if isinstance(data, pd.DataFrame):
         return deepcopy(data)
     if isinstance(data, dict):
         method = data.pop('method', None)
         if method is None:
             raise ValueError(f"The data dictionary has no 'method' key.")
         if str(method).startswith('@generate'):
             task_name = data.pop('task_name', None)
             if task_name is None:
                 raise ValueError(
                     f"The data method '@generate' requires a 'task_name' key."
                 )
             repo_uri = data.pop('repo_uri', None)
             module = HandlerFactory.get_module(module_name='ds_behavioral')
             inst = module.SyntheticBuilder.from_env(task_name=task_name,
                                                     uri_pm_repo=repo_uri,
                                                     default_save=False)
             size = data.pop('size', None)
             seed = data.get('seed', None)
             run_book = data.pop('run_book', None)
             result = inst.tools.run_intent_pipeline(size=size,
                                                     columns=run_book,
                                                     seed=seed)
             return inst.tools.frame_selection(canonical=result,
                                               save_intent=False,
                                               **data)
         else:
             raise ValueError(
                 f"The data 'method' key {method} is not a recognised intent method"
             )
     elif isinstance(data, (list, pd.Series)):
         header = header if isinstance(header, str) else 'default'
         return pd.DataFrame(data=deepcopy(data), columns=[header])
     elif isinstance(data, str):
         if data == '@empty':
             return pd.DataFrame()
         if not self._pm.has_connector(connector_name=data):
             raise ValueError(
                 f"The data connector name '{data}' is not in the connectors catalog"
             )
         handler = self._pm.get_connector_handler(data)
         canonical = handler.load_canonical()
         if isinstance(canonical, dict):
             canonical = pd.DataFrame.from_dict(data=canonical,
                                                orient='columns')
         return canonical
      raise ValueError(
          f"The canonical format is not recognised: pd.DataFrame, pd.Series, "
          f"str, list or dict expected, {type(data)} passed")
 @staticmethod
 def _json_load(path_file: str, **kwargs) -> [dict, pd.DataFrame]:
     """ loads a json file """
     if path_file.startswith('http'):
         module_name = 'requests'
         if HandlerFactory.check_module(module_name=module_name):
             module = HandlerFactory.get_module(module_name=module_name)
             username = kwargs.get('username', None)
             password = kwargs.get('password', None)
             auth = (username, password) if username and password else None
             r = module.get(path_file, auth=auth)
             return r.json()
     with closing(open(path_file, mode='r')) as f:
         return json.load(f, **kwargs)
 def exists(self) -> bool:
     """ Returns True is the file exists """
     if not isinstance(self.connector_contract, ConnectorContract):
         raise ValueError("The Pandas Connector Contract has not been set")
     _cc = self.connector_contract
     if _cc.schema.startswith('http') or _cc.schema.startswith('git'):
         module_name = 'requests'
         _address = _cc.address.replace("git://", "https://")
         if HandlerFactory.check_module(module_name=module_name):
             module = HandlerFactory.get_module(module_name=module_name)
             return module.get(_address).status_code == 200
         raise ModuleNotFoundError(
             f"The required module {module_name} has not been installed. "
             f"Please pip install the appropriate package in order to complete this action"
         )
     if os.path.exists(_cc.address):
         return True
     return False
 @staticmethod
 def _pickle_load(path_file: str, **kwargs) -> [dict, pd.DataFrame]:
     """ loads a pickle file """
     fix_imports = kwargs.pop('fix_imports', True)
     encoding = kwargs.pop('encoding', 'ASCII')
     errors = kwargs.pop('errors', 'strict')
     if path_file.startswith('http'):
         module_name = 'requests'
         if HandlerFactory.check_module(module_name=module_name):
             module = HandlerFactory.get_module(module_name=module_name)
             username = kwargs.get('username', None)
             password = kwargs.get('password', None)
             auth = (username, password) if username and password else None
             r = module.get(path_file, auth=auth)
             return r.content
     with closing(open(path_file, mode='rb')) as f:
         return pickle.load(f,
                            fix_imports=fix_imports,
                            encoding=encoding,
                            errors=errors)
 @staticmethod
 def _get_constant(reference: str,
                   size: int = None,
                   shuffle: bool = True,
                   seed: int = None) -> [pd.DataFrame, list]:
     """private method to retrieve data constant"""
     module = HandlerFactory.get_module(
         module_name=f"ds_behavioral.sample.{reference}")
     if reference.startswith("lookup_"):
         return AbstractSample._select_list(selection=module.data,
                                            size=size,
                                            seed=seed,
                                            shuffle=shuffle)
     df = pd.DataFrame.from_dict(module.data, orient='columns')
     idx = df.index.to_list()
     selection = AbstractSample._select_list(selection=idx,
                                             size=size,
                                             seed=seed,
                                             shuffle=shuffle)
     rtn_df: pd.DataFrame = df.iloc[selection].reset_index(drop=True)
     return rtn_df
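
For illustration, two hedged calls against hypothetical sample modules; the reference names below are placeholders for whatever ds_behavioral.sample actually ships:

# 'lookup_' references return a plain list, anything else a sampled DataFrame
names = AbstractSample._get_constant(reference='lookup_first_names', size=10, seed=31)   # hypothetical reference
profiles = AbstractSample._get_constant(reference='profile_us', size=100, shuffle=True)  # hypothetical reference
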
Example #14
 def __init__(self, connector_contract: ConnectorContract):
     """ initialise the Handler passing the source_contract dictionary """
     self.pyhive = HandlerFactory.get_module('pyhive.hive')
     super().__init__(connector_contract)
     self._file_state = 0
     self._changed_flag = True
Example #15
    def _get_canonical(self, data: [pd.DataFrame, pd.Series, list, str, dict, int], header: str=None, size: int=None,
                       deep_copy: bool=None) -> pd.DataFrame:
        """ Used to return or generate a pandas Dataframe from a number of different methods.
        The following can be passed and their returns:
        - pd.DataFrame -> a deep copy of the pd.DataFrame
        - pd.Series or list -> creates a pd.DataFrame of one column with the 'header' name or 'default' if not given
        - str -> instantiates a connector handler with the connector_name and loads the DataFrame from the connection
        - int -> generates an empty pd.DataFrame with an index size of the int passed.
        - dict -> use the canonical2dict(...) method to construct a dict with a method and related parameters
            methods:
                - model_*(...) -> one of the builder model methods and parameters
                - *_selection(...) -> one of the builder selection methods (get_, correlate_, frame_) and parameters
                - @empty -> generates an empty pd.DataFrame where size and headers can be passed
                    :size sets the index size of the dataframe
                    :headers any initial headers for the dataframe
                - @generate -> generate a synthetic file from a remote Domain Contract
                    :task_name the name of the SyntheticBuilder task to run
                    :repo_uri the location of the Domain Product
                    :size (optional) a size to generate
                    :seed (optional) if a seed should be applied
                    :run_book (optional) if specific intent should be run only

        :param data: a dataframe or action event to generate a dataframe
        :param header: (optional) a header name used when a pd.Series or list is passed
        :param size: (optional) a size parameter for @empty or @generate
        :param deep_copy: (optional) if a pd.DataFrame is passed, whether to return a deep copy (default True)
        :return: a pd.DataFrame
        """
        deep_copy = deep_copy if isinstance(deep_copy, bool) else True
        if isinstance(data, pd.DataFrame):
            if deep_copy:
                return deepcopy(data)
            return data
        if isinstance(data, dict):
            data = data.copy()
            method = data.pop('method', None)
            if method is None:
                try:
                    return pd.DataFrame.from_dict(data=data)
                except ValueError:
                    raise ValueError("The canonical data passed was of type 'dict' but did not contain a 'method' key "
                                     "or was not convertible to Dataframe")
            if method in self.__dir__():
                if str(method).startswith('model_') or method == 'frame_selection':
                    data.update({'save_intent': False})
                    return eval(f"self.{method}(**data)", globals(), locals())
                if str(method).endswith('_selection'):
                    if not isinstance(header, str):
                        raise ValueError(f"The canonical type 'dict' method '{method}' must have a header parameter.")
                    data.update({'save_intent': False})
                    if method == 'get_selection':
                        if not isinstance(size, int):
                            raise ValueError(f"The canonical type 'dict' method '{method}' must have a size parameter.")
                        data.update({'size': size})
                    return pd.DataFrame(data=eval(f"self.{method}(**data)", globals(), locals()), columns=[header])
            elif str(method).startswith('@generate'):
                task_name = data.pop('task_name', None)
                if task_name is None:
                    raise ValueError(f"The data method '@generate' requires a 'task_name' key.")
                uri_pm_repo = data.pop('repo_uri', None)
                module = HandlerFactory.get_module(module_name='ds_discovery')
                inst = module.SyntheticBuilder.from_env(task_name=task_name, uri_pm_repo=uri_pm_repo,
                                                        default_save=False)
                size = size if isinstance(size, int) and 'size' not in data.keys() else data.pop('size', None)
                seed = data.get('seed', None)
                run_book = data.pop('run_book', None)
                result = inst.tools.run_intent_pipeline(canonical=size, columns=run_book, seed=seed)
                return inst.tools.frame_selection(canonical=result, save_intent=False, **data)
            elif str(method).startswith('@empty'):
                size = size if isinstance(size, int) and 'size' not in data.keys() else data.pop('size', None)
                headers = data.pop('headers', None)
                size = range(size) if size else None
                return pd.DataFrame(index=size, columns=headers)
            else:
                raise ValueError(f"The data 'method' key {method} is not a recognised intent method")
        elif isinstance(data, (list, pd.Series)):
            header = header if isinstance(header, str) else 'default'
            if deep_copy:
                data = deepcopy(data)
            return pd.DataFrame(data=data, columns=[header])
        elif isinstance(data, str):
            if not self._pm.has_connector(connector_name=data):
                if isinstance(size, int):
                    return pd.DataFrame(index=range(size))
                raise ValueError(f"The data connector name '{data}' is not in the connectors catalog")
            handler = self._pm.get_connector_handler(data)
            canonical = handler.load_canonical()
            if isinstance(canonical, dict):
                canonical = pd.DataFrame.from_dict(data=canonical)
            return canonical
        elif isinstance(data, int):
            return pd.DataFrame(index=range(data)) if data > 0 else pd.DataFrame()
        elif not data:
            return pd.DataFrame()
        raise ValueError(f"The canonical format is not recognised, pd.DataFrame, pd.Series, "
                         f"str, list or dict expected, {type(data)} passed")