def _yaml_load(path_file, **kwargs) -> dict:
    """Read a YAML file and return its content as a dictionary.

    :param path_file: the name and path of the file
    :param kwargs: optional settings; only 'encoding' (default 'utf-8') is used
    :return: the non-empty dictionary parsed from the file
    :raises ModuleNotFoundError: if the 'yaml' module is not available
    :raises IOError: if opening or reading the file raises an IOError
    :raises TypeError: if the parsed content is empty or is not a dictionary
    """
    module_name = 'yaml'
    if not HandlerFactory.check_module(module_name=module_name):
        raise ModuleNotFoundError(
            f"The required module {module_name} has not been installed. "
            f"Please pip install the appropriate package in order to complete this action")
    yaml_module = HandlerFactory.get_module(module_name=module_name)
    file_encoding = kwargs.pop('encoding', 'utf-8')
    try:
        with closing(open(path_file, mode='r', encoding=file_encoding)) as stream:
            loaded = yaml_module.safe_load(stream)
    except IOError as e:
        raise IOError(f"The yaml file {path_file} failed to open with: {e}")
    # only a populated dictionary counts as a successful load
    if isinstance(loaded, dict) and loaded:
        return loaded
    raise TypeError(f"The yaml file {path_file} could not be loaded as a dict type")
def has_changed(self) -> bool:
    """Report whether the source has changed since the last load or reset.

    The current state marker is the 'last-modified' response header for http/git
    schemas, or the file's modification time in nanoseconds for local paths.
    When the marker differs from the stored one, the change flag is set and the
    stored marker is updated.

    :return: False if the source does not exist, otherwise the current change flag
    :raises ValueError: if the connector contract has not been set
    :raises ModuleNotFoundError: if a remote source is used and 'requests' is not installed
    """
    if not self.exists():
        return False
    _cc = self.connector_contract
    # validate the contract before any of its attributes are read
    if not isinstance(_cc, ConnectorContract):
        raise ValueError("The Pandas Connector Contract has not been set")
    # maintain the change flag
    if _cc.schema.startswith(('http', 'git')):
        module_name = 'requests'
        if not HandlerFactory.check_module(module_name=module_name):
            raise ModuleNotFoundError(
                f"The required module {module_name} has not been installed. Please pip "
                f"install the appropriate package in order to complete this action")
        module = HandlerFactory.get_module(module_name=module_name)
        # git:// addresses are fetched over https
        _address = _cc.address.replace("git://", "https://")
        state = module.head(_address).headers.get('last-modified', 0)
    else:
        state = os.stat(_cc.address).st_mtime_ns
    if state != self._file_state:
        self._changed_flag = True
        self._file_state = state
    return self._changed_flag
def _yaml_dump(self, data, path_file, **kwargs) -> None:
    """Write data to a YAML file.

    :param data: the data to persist
    :param path_file: the name and path of the file
    :param encoding: (optional) the file encoding, default 'utf-8'
    :param default_flow_style: (optional) the YAML flow style passed to safe_dump, default False
    :param kwargs: any remaining keyword arguments are passed on to safe_dump
    :raises ModuleNotFoundError: if the 'yaml' module is not available
    :raises IOError: if opening or writing the file raises an IOError
    """
    module_name = 'yaml'
    if not HandlerFactory.check_module(module_name=module_name):
        raise ModuleNotFoundError(
            f"The required module {module_name} has not been installed. "
            f"Please pip install the appropriate package in order to complete this action")
    yaml_module = HandlerFactory.get_module(module_name=module_name)
    file_encoding = kwargs.pop('encoding', 'utf-8')
    flow_style = kwargs.pop('default_flow_style', False)
    # the write happens while holding the instance lock
    with self._lock:
        try:
            with closing(open(path_file, mode='w', encoding=file_encoding)) as stream:
                yaml_module.safe_dump(data=data, stream=stream, default_flow_style=flow_style, **kwargs)
        except IOError as e:
            raise IOError(f"The yaml file {path_file} failed to open with: {e}")
def recover_state(self):
    """Reload the persisted book state and replay the persisted event log.

    The book state is loaded through the state connector, or starts as an empty
    DataFrame when no state connector is set; the parent's ``_set_modified(True)``
    is then called. When an events connector is set, the event log is loaded and
    each entry is re-applied in sorted key order using the 'add', 'increment' or
    'decrement' action stored with it. Without an events connector the event log
    becomes an empty dict.
    """
    if isinstance(self._state_connector, ConnectorContract):
        self.__book_state = HandlerFactory.instantiate(self._state_connector).load_canonical()
    else:
        self.__book_state = pd.DataFrame()
    super()._set_modified(True)
    if not isinstance(self._events_connector, ConnectorContract):
        self.__events_log = dict()
        return
    self.__events_log = HandlerFactory.instantiate(self._events_connector).load_canonical()
    # replay the events in ascending key order
    ordered_keys = pd.Series(list(self.__events_log.keys())).sort_values().reset_index(drop=True)
    for event_key in ordered_keys:
        action, event = self.__events_log.get(event_key, ['add', pd.DataFrame()])
        action = str(action).lower()
        if action == 'add':
            self.add_event(event=event)
        elif action == 'increment':
            self.increment_event(event=event)
        elif action == 'decrement':
            self.decrement_event(event=event)
def __init__(self, connector_contract: ConnectorContract):
    """Initialise the handler and open a boto3 session from the connector contract.

    Session settings are taken from the contract kwargs, updated with the uri query.
    Recognised keys:
        - region_name (optional) session region name, default 'us-east-2'
        - profile_name (optional) session shared credentials file profile name
        - aws_access_key_id (optional) defaults to the AWS_ACCESS_KEY_ID environment variable
        - aws_secret_access_key (optional) defaults to the AWS_SECRET_ACCESS_KEY environment variable
        - aws_session_token (optional) defaults to the AWS_SESSION_TOKEN environment variable

    :param connector_contract: the contract passed on to the parent initialiser
    """
    # required module imports
    self.botocore = HandlerFactory.get_module('botocore.exceptions')
    self.boto3 = HandlerFactory.get_module('boto3')
    super().__init__(connector_contract)
    session_params = connector_contract.kwargs
    # values in the uri query take precedence over the kwargs
    session_params.update(connector_contract.query)
    session_args = {
        'region_name': session_params.pop('region_name', 'us-east-2'),
        'aws_access_key_id': session_params.pop('aws_access_key_id', os.environ.get('AWS_ACCESS_KEY_ID')),
        'aws_secret_access_key': session_params.pop('aws_secret_access_key',
                                                    os.environ.get('AWS_SECRET_ACCESS_KEY')),
        'aws_session_token': session_params.pop('aws_session_token', os.environ.get('AWS_SESSION_TOKEN')),
        'profile_name': session_params.pop('profile_name', None),
    }
    self._session = self.boto3.Session(**session_args)
    # initial change-tracking values
    self._file_state, self._changed_flag = 0, True
def _json_load(path_file: str, **kwargs) -> [dict, pd.DataFrame]:
    """Load JSON content from a local file or from an http(s) address.

    :param path_file: a local file path, or an address starting with 'http'
    :param username: (optional) user name for basic authentication on remote requests
    :param password: (optional) password for basic authentication on remote requests
    :param kwargs: for local files, any remaining keyword arguments are passed to json.load
    :return: the decoded JSON content
    :raises ModuleNotFoundError: if the path is remote and 'requests' is not installed
    """
    # credentials only apply to remote requests; remove them so they never reach json.load
    username = kwargs.pop('username', None)
    password = kwargs.pop('password', None)
    if path_file.startswith('http'):
        module_name = 'requests'
        # fail explicitly rather than falling through and trying to open a URL as a file
        if not HandlerFactory.check_module(module_name=module_name):
            raise ModuleNotFoundError(
                f"The required module {module_name} has not been installed. "
                f"Please pip install the appropriate package in order to complete this action")
        module = HandlerFactory.get_module(module_name=module_name)
        # authentication is only sent when both parts are provided
        auth = (username, password) if username and password else None
        r = module.get(path_file, auth=auth)
        return r.json()
    with open(path_file, mode='r') as f:
        return json.load(f, **kwargs)
def __init__(self, connector_contract: ConnectorContract):
    """Initialise the handler with its connector contract.

    :param connector_contract: the contract passed on to the parent initialiser
    """
    # resolve the required driver module before the parent initialiser runs
    self.psycopg2 = HandlerFactory.get_module('psycopg2')
    super().__init__(connector_contract)
    # initial change-tracking values
    self._file_state, self._changed_flag = 0, True
def exists(self) -> bool:
    """Return True if the contract's address exists.

    For http/git schemas the address is requested (with 'git://' rewritten to
    'https://') and a 200 status code counts as existing; any other address is
    checked as a local path.

    :return: True if the source exists, otherwise False
    :raises ValueError: if the connector contract has not been set
    :raises ModuleNotFoundError: if a remote source is used and 'requests' is not installed
    """
    if not isinstance(self.connector_contract, ConnectorContract):
        raise ValueError("The Pandas Connector Contract has not been set")
    contract = self.connector_contract
    if not contract.schema.startswith(('http', 'git')):
        return os.path.exists(contract.address)
    module_name = 'requests'
    remote_address = contract.address.replace("git://", "https://")
    if not HandlerFactory.check_module(module_name=module_name):
        raise ModuleNotFoundError(
            f"The required module {module_name} has not been installed. "
            f"Please pip install the appropriate package in order to complete this action")
    requests_module = HandlerFactory.get_module(module_name=module_name)
    return requests_module.get(remote_address).status_code == 200
def _pickle_load(path_file: str, **kwargs) -> [dict, pd.DataFrame]:
    """Load a pickle from a local file, or fetch its bytes from an http(s) address.

    :param path_file: a local file path, or an address starting with 'http'
    :param fix_imports: (optional) passed to pickle.load, default True
    :param encoding: (optional) passed to pickle.load, default 'ASCII'
    :param errors: (optional) passed to pickle.load, default 'strict'
    :param username: (optional) user name for basic authentication on remote requests
    :param password: (optional) password for basic authentication on remote requests
    :return: the unpickled object for local files; the raw response content for remote addresses
    :raises ModuleNotFoundError: if the path is remote and 'requests' is not installed
    """
    fix_imports = kwargs.pop('fix_imports', True)
    encoding = kwargs.pop('encoding', 'ASCII')
    errors = kwargs.pop('errors', 'strict')
    if path_file.startswith('http'):
        module_name = 'requests'
        # fail explicitly rather than falling through and trying to open a URL as a file
        if not HandlerFactory.check_module(module_name=module_name):
            raise ModuleNotFoundError(
                f"The required module {module_name} has not been installed. "
                f"Please pip install the appropriate package in order to complete this action")
        module = HandlerFactory.get_module(module_name=module_name)
        username = kwargs.get('username', None)
        password = kwargs.get('password', None)
        # authentication is only sent when both parts are provided
        auth = (username, password) if username and password else None
        r = module.get(path_file, auth=auth)
        # NOTE(review): the remote branch returns the raw bytes, not an unpickled object,
        # which does not match the declared return type - confirm what callers expect
        return r.content
    # NOTE: unpickling can execute arbitrary code; only load files from trusted sources
    with open(path_file, mode='rb') as f:
        return pickle.load(f, fix_imports=fix_imports, encoding=encoding, errors=errors)
def register_estimator(self, canonical: pd.DataFrame, target: str, headers: list, class_name: str,
                       module_name: str, hyper_param: dict = None, test_size: float = None,
                       random_state: int = None, save_intent: bool = None, model_name: str = None,
                       intent_order: int = None, replace_intent: bool = None,
                       remove_duplicates: bool = None):
    """Record the estimator intent and prepare the feature and target selections.

    :param canonical: the model canonical
    :param target: the model target
    :param headers: the model features header names
    :param class_name: the name of the model class
    :param module_name: the name of the module; when not a string, the value stored in the
            existing intent for this model_name is used, if any
    :param hyper_param: (optional) hyper parameters for the model instance
    :param test_size: (optional) the size of the test sample
    :param random_state: (optional) a random state value for the test sample
    :param save_intent: (optional) if the intent contract should be saved to the property manager
    :param model_name: (optional) the name of the model
    :param intent_order: (optional) the order in which each intent should run.
            If None: default's to -1
            if -1: added to a level above any current instance of the intent section, level 0 if not found
            if int: added to the level specified, overwriting any that already exist
    :param replace_intent: (optional) if the intent method exists at the level, or default level
            True - replaces the current intent method with the new
            False - leaves it untouched, disregarding the new intent
    :param remove_duplicates: (optional) removes any duplicate intent in any level that is identical
    """
    # resolve intent persist options
    # the locals() snapshot below must only see the parameters and _method, so no other
    # local name is introduced before this call
    _method = inspect.currentframe().f_code.co_name
    self._set_intend_signature(self._intent_builder(method=_method, params=locals()),
                               model_name=model_name, intent_order=intent_order,
                               replace_intent=replace_intent, remove_duplicates=remove_duplicates,
                               save_intent=save_intent)
    # Code block for intent
    if model_name and self._pm.has_intent(model_name):
        local_intent = self._pm.get_intent(level=model_name, intent=_method)
    else:
        local_intent = {}
    if not isinstance(module_name, str):
        module_name = local_intent.get('module_name', None)
    X = Commons.filter_columns(canonical, headers=headers)
    y = Commons.filter_columns(canonical, headers=target)
    module = HandlerFactory.get_module(module_name='ds_behavioral')
    # NOTE(review): the visible body ends here without fitting or returning a model;
    # confirm against the complete source
def __init__(self, connector_contract: ConnectorContract):
    """Initialise the handler and create its managed content client.

    :param connector_contract: the contract passed on to the parent initialiser
    """
    super().__init__(connector_contract)
    # required module import
    self.cortex_content = HandlerFactory.get_module('cortex.content')
    # connection settings resolved by the instance loaders
    self.token = self._load_token()
    self.api_endpoint = self._load_api_endpoint()
    self.project = self._load_project_name()
    client_class = self.cortex_content.ManagedContentClient
    self.cortex_mc_client = client_class(url=self.api_endpoint, token=self.token)
    # initial change-tracking values
    self._etag, self._changed_flag = 0, True
def save_state(self, with_reset: bool = None, fillna: bool = None, **kwargs):
    """Persist the current state and optionally reset the event book.

    Nothing happens when no state connector is set.

    :param with_reset: (optional) when True, the state is reset after it has been persisted
    :param fillna: (optional) passed to current_state when building the state to persist
    :param kwargs: passed on to the handler's persist_canonical
    """
    if not isinstance(self._state_connector, ConnectorContract):
        return
    snapshot = self.current_state(fillna=fillna)
    HandlerFactory.instantiate(self._state_connector).persist_canonical(snapshot, **kwargs)
    if with_reset is True:
        self.reset_state()
def __init__(self, connector_contract: ConnectorContract):
    """Initialise the handler and resolve its database and collection.

    The 'database' and 'collection' names are read from the contract kwargs and the
    client is created from the contract uri.

    :param connector_contract: the contract passed on to the parent initialiser
    """
    # required module import
    self.mongo = HandlerFactory.get_module('pymongo')
    super().__init__(connector_contract)
    database_name = self.connector_contract.kwargs.get("database")
    client = self.mongo.MongoClient(self.connector_contract.uri)
    self._mongo_database = client[database_name]
    collection_name = self.connector_contract.kwargs.get("collection")
    self._mongo_collection = self._mongo_database[collection_name]
    # initial change-tracking values
    self._file_state, self._changed_flag = 0, True
def _get_canonical(self, data: [pd.DataFrame, pd.Series, list, str, dict], header: str = None) -> pd.DataFrame:
    """Return or generate a pandas DataFrame from the data reference passed.

    - pd.DataFrame -> a deep copy of the DataFrame
    - pd.Series or list -> a single column DataFrame named by 'header', or 'default' if not given
    - str -> '@empty' gives an empty DataFrame, otherwise the name of a connector to load from
    - dict -> must hold a 'method' key; only '@generate' is recognised, with keys
            :task_name the name of the SyntheticBuilder task to run
            :repo_uri (optional) passed as the uri_pm_repo of the builder
            :size (optional) passed to the intent pipeline
            :seed (optional) passed to the intent pipeline
            :run_book (optional) passed as the columns of the intent pipeline
            any remaining keys are passed on to frame_selection

    The dictionary passed is not modified.

    :param data: a DataFrame or a reference used to build one
    :param header: (optional) column name used with a list or pd.Series
    :return: a pd.DataFrame
    :raises ValueError: if a dict has no 'method' or 'task_name', the method is not recognised,
            the connector name is not in the catalog, or the data type is not supported
    """
    if isinstance(data, pd.DataFrame):
        return deepcopy(data)
    if isinstance(data, dict):
        # work on a copy so that the keys popped below do not alter the caller's dictionary
        data = data.copy()
        method = data.pop('method', None)
        if method is None:
            raise ValueError("The data dictionary has no 'method' key.")
        if not str(method).startswith('@generate'):
            raise ValueError(f"The data 'method' key {method} is not a recognised intent method")
        task_name = data.pop('task_name', None)
        if task_name is None:
            raise ValueError("The data method '@generate' requires a 'task_name' key.")
        repo_uri = data.pop('repo_uri', None)
        module = HandlerFactory.get_module(module_name='ds_behavioral')
        inst = module.SyntheticBuilder.from_env(task_name=task_name, uri_pm_repo=repo_uri, default_save=False)
        size = data.pop('size', None)
        # 'seed' is read but left in place, so it is also passed on to frame_selection
        seed = data.get('seed', None)
        run_book = data.pop('run_book', None)
        result = inst.tools.run_intent_pipeline(size=size, columns=run_book, seed=seed)
        return inst.tools.frame_selection(canonical=result, save_intent=False, **data)
    if isinstance(data, (list, pd.Series)):
        header = header if isinstance(header, str) else 'default'
        return pd.DataFrame(data=deepcopy(data), columns=[header])
    if isinstance(data, str):
        if data == '@empty':
            return pd.DataFrame()
        if not self._pm.has_connector(connector_name=data):
            raise ValueError(f"The data connector name '{data}' is not in the connectors catalog")
        handler = self._pm.get_connector_handler(data)
        canonical = handler.load_canonical()
        if isinstance(canonical, dict):
            canonical = pd.DataFrame.from_dict(data=canonical, orient='columns')
        return canonical
    raise ValueError(f"The canonical format is not recognised, pd.DataFrame, pd.Series, "
                     f"str, list or dict expected, {type(data)} passed")
def backup_state(self, stamp_uri: str = None, fillna: bool = None, **kwargs):
    """Back up the current event book state to the given URI.

    Nothing happens unless a state connector is set and stamp_uri is a string.

    :param stamp_uri: the uri passed to the handler's backup_canonical
    :param fillna: (optional) passed to current_state when building the state to back up
    :param kwargs: passed on to the handler's backup_canonical
    :return:
    """
    can_backup = isinstance(self._state_connector, ConnectorContract) and isinstance(stamp_uri, str)
    if not can_backup:
        return
    snapshot = self.current_state(fillna=fillna)
    state_handler = HandlerFactory.instantiate(self._state_connector)
    state_handler.backup_canonical(canonical=snapshot, uri=stamp_uri, **kwargs)
def _get_constant(reference: str, size: int = None, shuffle: bool = True, seed: int = None) -> [pd.DataFrame, list]:
    """Retrieve a sample data constant from the module 'ds_behavioral.sample.<reference>'.

    References starting with 'lookup_' return a selection made directly from the module
    data; any other reference builds a DataFrame from the module data and returns the
    selected rows with a reset index.

    :param reference: the name of the sample module to load
    :param size: (optional) passed to the list selection
    :param shuffle: (optional) passed to the list selection, default True
    :param seed: (optional) passed to the list selection
    :return: a selection from the lookup data, or a pd.DataFrame of the selected rows
    """
    sample = HandlerFactory.get_module(module_name=f"ds_behavioral.sample.{reference}")
    pick_args = {'size': size, 'seed': seed, 'shuffle': shuffle}
    if reference.startswith("lookup_"):
        return AbstractSample._select_list(selection=sample.data, **pick_args)
    frame = pd.DataFrame.from_dict(sample.data, orient='columns')
    # select over the row positions, then pull those rows out of the frame
    chosen_rows = AbstractSample._select_list(selection=frame.index.to_list(), **pick_args)
    return frame.iloc[chosen_rows].reset_index(drop=True)
def _persist_events(self):
    """Persist the event log through the events connector, when one is set."""
    if not isinstance(self._events_connector, ConnectorContract):
        return
    events_handler = HandlerFactory.instantiate(self._events_connector)
    events_handler.persist_canonical(self.__events_log)
def __init__(self, connector_contract: ConnectorContract):
    """Initialise the handler with its connector contract.

    :param connector_contract: the contract passed on to the parent initialiser
    """
    # resolve the required driver module before the parent initialiser runs
    self.pyhive = HandlerFactory.get_module('pyhive.hive')
    super().__init__(connector_contract)
    # initial change-tracking values
    self._file_state, self._changed_flag = 0, True
def _get_canonical(self, data: [pd.DataFrame, pd.Series, list, str, dict, int], header: str=None, size: int=None,
                   deep_copy: bool=None) -> pd.DataFrame:
    """ Used to return or generate a pandas Dataframe from a number of different methods.
    The following can be passed and their returns
    - pd.Dataframe -> a deep copy of the pd.DataFrame, or the same object if deep_copy is False
    - pd.Series or list -> creates a pd.DataFrame of one column with the 'header' name or 'default' if not given
    - str -> loads the DataFrame through the connector handler of that connector name. If the name is not
            a known connector and 'size' is an int, an empty DataFrame with that index size is returned
    - int -> generates an empty pd.Dataframe with an index size of the int passed.
    - None or any other empty value -> an empty pd.DataFrame
    - dict -> without a 'method' key the dict is converted with pd.DataFrame.from_dict, otherwise
        methods:
            - model_*(...) or frame_selection(...) -> calls that method of this instance with the other keys
            - *_selection(...) -> calls that method of this instance and wraps the result in a one column
                    DataFrame named by 'header'; get_selection also requires 'size'
            - @empty -> generates an empty pd.DataFrame where size and headers can be passed
                :size sets the index size of the dataframe
                :headers any initial headers for the dataframe
            - @generate -> generate a synthetic file from a remote Domain Contract
                :task_name the name of the SyntheticBuilder task to run
                :repo_uri the location of the Domain Product
                :size (optional) a size to generate
                :seed (optional) if a seed should be applied
                :run_book (optional) if specific intent should be run only

    The dictionary passed is copied before use and is not modified.

    :param data: a dataframe or action event to generate a dataframe
    :param header: (optional) used in conjunction with lists, pd.Series or *_selection methods
                to give a header reference
    :param size: (optional) a size parameter for @empty, @generate, get_selection or an unknown connector name
    :param deep_copy: (optional) if a DataFrame, list or pd.Series should be deep copied. Default True
    :return: a pd.Dataframe
    :raises ValueError: if the data, or the method within a dict, is not recognised or is missing
                a required parameter
    """
    deep_copy = deep_copy if isinstance(deep_copy, bool) else True
    if isinstance(data, pd.DataFrame):
        if deep_copy:
            return deepcopy(data)
        return data
    if isinstance(data, dict):
        data = data.copy()
        method = data.pop('method', None)
        if method is None:
            try:
                return pd.DataFrame.from_dict(data=data)
            except ValueError as err:
                raise ValueError("The canonical data passed was of type 'dict' but did not contain a 'method' key "
                                 "or was not convertible to Dataframe") from err
        if method in self.__dir__():
            # the method name is known to be an attribute of this instance, so it is resolved with
            # getattr rather than by evaluating a built source string
            if str(method).startswith('model_') or method == 'frame_selection':
                data['save_intent'] = False
                return getattr(self, str(method))(**data)
            if str(method).endswith('_selection'):
                if not isinstance(header, str):
                    raise ValueError(f"The canonical type 'dict' method '{method}' must have a header parameter.")
                data['save_intent'] = False
                if method == 'get_selection':
                    if not isinstance(size, int):
                        raise ValueError(f"The canonical type 'dict' method '{method}' must have a size parameter.")
                    data['size'] = size
                return pd.DataFrame(data=getattr(self, str(method))(**data), columns=[header])
            # any other instance attribute name falls through to the final 'not recognised' error
        elif str(method).startswith('@generate'):
            task_name = data.pop('task_name', None)
            if task_name is None:
                raise ValueError("The data method '@generate' requires a 'task_name' key.")
            uri_pm_repo = data.pop('repo_uri', None)
            module = HandlerFactory.get_module(module_name='ds_discovery')
            inst = module.SyntheticBuilder.from_env(task_name=task_name, uri_pm_repo=uri_pm_repo,
                                                    default_save=False)
            # a 'size' key in the dict takes precedence over the size parameter
            size = size if isinstance(size, int) and 'size' not in data.keys() else data.pop('size', None)
            # 'seed' is read but left in place, so it is also passed on to frame_selection
            seed = data.get('seed', None)
            run_book = data.pop('run_book', None)
            result = inst.tools.run_intent_pipeline(canonical=size, columns=run_book, seed=seed)
            return inst.tools.frame_selection(canonical=result, save_intent=False, **data)
        elif str(method).startswith('@empty'):
            # a 'size' key in the dict takes precedence over the size parameter
            size = size if isinstance(size, int) and 'size' not in data.keys() else data.pop('size', None)
            headers = data.pop('headers', None)
            size = range(size) if size else None
            return pd.DataFrame(index=size, columns=headers)
        else:
            raise ValueError(f"The data 'method' key {method} is not a recognised intent method")
    elif isinstance(data, (list, pd.Series)):
        header = header if isinstance(header, str) else 'default'
        if deep_copy:
            data = deepcopy(data)
        return pd.DataFrame(data=data, columns=[header])
    elif isinstance(data, str):
        if not self._pm.has_connector(connector_name=data):
            if isinstance(size, int):
                return pd.DataFrame(index=range(size))
            raise ValueError(f"The data connector name '{data}' is not in the connectors catalog")
        handler = self._pm.get_connector_handler(data)
        canonical = handler.load_canonical()
        if isinstance(canonical, dict):
            canonical = pd.DataFrame.from_dict(data=canonical)
        return canonical
    elif isinstance(data, int):
        return pd.DataFrame(index=range(data)) if data > 0 else pd.DataFrame()
    elif not data:
        return pd.DataFrame()
    raise ValueError(f"The canonical format is not recognised, pd.DataFrame, pd.Series, "
                     f"str, list or dict expected, {type(data)} passed")