def test_source_handler(self):
    file = os.path.join(os.environ['HADRON_DEFAULT_PATH'], 'example01.csv')
    handler = PandasSourceHandler(ConnectorContract(file, '', ''))
    self.assertTrue(isinstance(handler.supported_types(), list) and len(handler.supported_types()) > 0)
    df = handler.load_canonical()
    self.assertEqual((1000, 11), df.shape)
    self.assertTrue(handler.has_changed())
    file = os.path.join(os.environ['HADRON_DEFAULT_PATH'], 'example01.dat')
    handler = PandasSourceHandler(ConnectorContract(file, '', '', file_type='csv'))
    df = handler.load_canonical()
    self.assertEqual((1000, 11), df.shape)
def add_book_contract(self, book_name: str, with_log: bool=None, file_type: str=None, versioned: bool=None,
                      stamped: bool=None, save: bool=None, **kwargs):
    """ adds an event book connector using the book connector template and appending a book pattern
    to the URI path

    :param book_name: the name of the event book
    :param with_log: (optional) if an events log connector should be created
    :param file_type: (optional) a file type extension. defaults to 'pickle'
    :param versioned: (optional) if the connector uri should be versioned
    :param stamped: (optional) if the connector uri should be timestamped
    :param save: (optional) override of the default save action set at initialisation
    :param kwargs: extra kwargs to pass to the connector
    """
    if not self.pm.has_connector(connector_name=self.BOOK_TEMPLATE_CONNECTOR):
        raise ConnectionError("The book template connector has not been set")
    template = self.pm.get_connector_contract(self.BOOK_TEMPLATE_CONNECTOR)
    uri_file = self.pm.file_pattern(name=book_name, file_type=file_type, versioned=versioned, stamped=stamped)
    uri = os.path.join(template.path, uri_file)
    if not isinstance(kwargs, dict):
        kwargs = {}
    kwargs.update(template.raw_kwargs)
    cc = ConnectorContract(uri=uri, module_name=template.module_name, handler=template.handler, **kwargs)
    self.add_connector_contract(connector_name=book_name, connector_contract=cc, template_aligned=True, save=save)
    # add the log persist
    if isinstance(with_log, bool) and with_log:
        log = f"{book_name}_log"
        uri_log = self.pm.file_pattern(name=log, file_type=file_type)
        # the log connector points at its own log file uri and is registered under the log name
        lc = ConnectorContract(uri=os.path.join(template.path, uri_log), module_name=template.module_name,
                               handler=template.handler, **kwargs)
        self.add_connector_contract(connector_name=log, connector_contract=lc, template_aligned=True, save=save)
    return
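# Usage sketch (illustrative assumption, not from the source): `EventBookPortfolio` stands in for the
# component class that defines set_book_contract_template() and add_book_contract() above; the call
# pattern and parameter names follow those method signatures.
portfolio = EventBookPortfolio.from_env('demo', default_save=False)  # hypothetical class and factory
portfolio.set_book_contract_template(uri_path='work/data/events')
portfolio.add_book_contract(book_name='customer_book', with_log=True, file_type='pickle', versioned=True)
book_cc = portfolio.pm.get_connector_contract(connector_name='customer_book')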
def test_persist_backup(self):
    handler = PandasPersistHandler(ConnectorContract('work/data/2_transition/example01.json', '', ''))
    df = SyntheticBuilder.scratch_pad().model_noise(pd.DataFrame(index=range(1000)), num_columns=10)
    self.assertTrue(handler.persist_canonical(df))
    df = pd.DataFrame(data=handler.load_canonical())
    self.assertEqual((1000, 10), df.shape)
    handler.remove_canonical()
    self.assertFalse(handler.exists())
    # Backup
    uri = 'work/data/2_transition/example01.pq.bak?file_type=parquet'
    self.assertFalse(os.path.exists(uri))
    handler.backup_canonical(canonical=df, uri=uri)
    self.assertTrue(os.path.exists(ConnectorContract.parse_address(uri)))
def set_book_contract_template(self, uri_path: str=None, module_name: str=None, handler: str=None,
                               save: bool=None, **kwargs):
    """ sets the book template connector that is used as the base for all event book persistence.
    For parameters not given, the persist connector template is used.

    :param uri_path: a uri path
    :param module_name: a module package name
    :param handler: a handler
    :param save: override of the default save action set at initialisation
    :param kwargs: additional kwargs
    """
    template = self.pm.get_connector_contract(self.TEMPLATE_PERSIST)
    uri_path = uri_path if isinstance(uri_path, str) else template.raw_uri
    module_name = module_name if isinstance(module_name, str) else template.raw_module_name
    handler = handler if isinstance(handler, str) else template.raw_handler
    if not isinstance(kwargs, dict):
        kwargs = {}
    kwargs.update(template.raw_kwargs)
    book_template = ConnectorContract(uri=uri_path, module_name=module_name, handler=handler, **kwargs)
    if self.pm.has_connector(self.BOOK_TEMPLATE_CONNECTOR):
        self.remove_connector_contract(connector_name=self.BOOK_TEMPLATE_CONNECTOR)
    self.pm.set_connector_contract(connector_name=self.BOOK_TEMPLATE_CONNECTOR, connector_contract=book_template)
    self.pm_persist(save=save)
    return
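# Usage sketch (illustrative assumption): the same hypothetical `portfolio` instance as in the earlier
# sketch. Called with no arguments, set_book_contract_template() falls back entirely to the registered
# TEMPLATE_PERSIST connector; the module path in the explicit call is a placeholder, not a known package.
portfolio.set_book_contract_template()
portfolio.set_book_contract_template(uri_path='s3://my-bucket/events',
                                     module_name='my_package.handlers.s3_handlers',  # placeholder module
                                     handler='S3PersistHandler')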
def test_persist(self):
    state_uri = os.path.join(os.environ['HADRON_PM_PATH'], 'state.pickle')
    events_uri = os.path.join(os.environ['HADRON_PM_PATH'], 'events_log.pickle')
    state_connector = ConnectorContract(uri=state_uri, module_name=self.MODULE, handler=self.HANDLER)
    events_connector = ConnectorContract(uri=events_uri, module_name=self.MODULE, handler=self.HANDLER)
    engine = PandasEventBook('test', state_connector=state_connector, events_log_connector=events_connector)
    self.assertEqual(False, os.path.exists(state_uri))
    self.assertEqual(False, os.path.exists(events_uri))
    for i in range(10):
        engine.increment_event(event=pd.DataFrame(data={'A': [i, i * 2, i * 3]}))
    self.assertEqual(0, len(engine._current_events_log.keys()), "loop run")
    self.assertEqual(False, os.path.exists(state_uri))
    self.assertEqual(False, os.path.exists(events_uri))
    engine.set_count_distance(3)
    engine.set_events_log_distance(2)
    # add one
    engine.increment_event(event=pd.DataFrame(data={'A': [1, 1, 1]}))
    self.assertEqual(False, os.path.exists(state_uri))
    self.assertEqual(False, os.path.exists(events_uri))
    self.assertEqual(1, len(engine._current_events_log.keys()), "loop One")
    # add two
    engine.increment_event(event=pd.DataFrame(data={'A': [1, 1, 1]}))
    self.assertEqual(False, os.path.exists(state_uri))
    self.assertEqual(True, os.path.exists(events_uri))
    self.assertEqual(0, len(engine._current_events_log.keys()), "loop Two")
    # add three
    engine.increment_event(event=pd.DataFrame(data={'A': [1, 1, 1]}))
    self.assertEqual(True, os.path.exists(state_uri))
    self.assertEqual(True, os.path.exists(events_uri))
    self.assertEqual(0, len(engine._current_events_log.keys()), "loop Three")
    # add four
    engine.increment_event(event=pd.DataFrame(data={'A': [1, 1, 1]}))
    self.assertEqual(1, len(engine._current_events_log.keys()), "loop Four")
def test_json(self):
    df = SyntheticBuilder.scratch_pad().model_noise(pd.DataFrame(index=range(1000)), num_columns=10)
    handler = PandasPersistHandler(ConnectorContract('work/data/2_transition/handler_test.json', '', '',
                                                     file_type='json'))
    handler.persist_canonical(df)
    self.assertTrue(handler.exists())
    result = pd.DataFrame(data=handler.load_canonical())
    self.assertEqual(df.shape, result.shape)
    self.assertCountEqual(df.columns, result.columns)
    handler.remove_canonical()
    self.assertFalse(handler.exists())
def test_aws_connector_init(self):
    handler = S3PersistHandler(connector_contract=ConnectorContract(
        uri='s3://project-hadron-cs-repo/factory/healthcare/members', module_name='', handler=''))
    data = {'a': [1, 2, 3, 4, 5]}
    handler.persist_canonical(data)
    result = handler.load_canonical()
    self.assertTrue(isinstance(result, dict))
    result = handler.load_canonical(read_params={'as_dataframe': True})
    self.assertTrue(isinstance(result, pd.DataFrame))
def test_handler(self):
    """Basic smoke test"""
    cc = ConnectorContract(uri='eb://test_book', module_name='', handler='')
    handler = EventPersistHandler(connector_contract=cc)
    # test persist and load
    df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [7, 2, 1, 4]})
    handler.persist_canonical(df)
    result = handler.load_canonical()
    self.assertEqual(['A', 'B'], list(result.columns))
    self.assertEqual((4, 2), result.shape)
def test_file_column(self):
    tools = DataBuilderTools
    df = pd.DataFrame()
    df['cat'] = tools.get_category(list('MFU'), size=10, seed=31)
    df['values'] = tools.get_number(10, size=10, seed=31)
    df.to_csv("test_df.csv", sep=',', index=False)
    connector_contract = ConnectorContract(uri="test_df.csv", module_name='aistac.handlers.python_handlers',
                                           handler='PythonSourceHandler')
    result = tools.get_column('cat', connector_contract, size=3, seed=31)
    self.assertEqual((3, 1), result.shape)
def test_transition_summary_report(self):
    tr: Transition = Transition.from_env('test', default_save=False, default_save_intent=False,
                                         has_contract=False)
    cc = ConnectorContract(uri=os.path.join(os.environ['HOME'], 'code', 'projects', 'data', 'sample',
                                            'synthetic_customer.csv'),
                           module_name=tr.DEFAULT_MODULE, handler=tr.DEFAULT_SOURCE_HANDLER)
    tr.set_source_contract(connector_contract=cc)
    report = tr.report_quality_summary(as_dict=True)
    self.assertEqual(['score', 'data_shape', 'data_type', 'usability', 'cost'], list(report.keys()))
def test_change_flags(self):
    """Basic smoke test"""
    file = os.path.join(os.environ['HADRON_DEFAULT_PATH'], 'example01.csv')
    open(file, 'a').close()
    cc = ConnectorContract(uri=file, module_name='', handler='')
    source = PandasSourceHandler(cc)
    self.assertTrue(source.has_changed())
    _ = source.load_canonical()
    self.assertFalse(source.has_changed())
    source.reset_changed(True)
    self.assertTrue(source.has_changed())
    source.reset_changed()
    self.assertFalse(source.has_changed())
    # touch the file
    os.remove(file)
    with open(file, 'a'):
        os.utime(file, None)
    self.assertTrue(source.has_changed())
def test_has_changed(self):
    # test the handler
    cc = ConnectorContract(uri='eb://test_portfolio/test_book', module_name='', handler='')
    handler = EventPersistHandler(connector_contract=cc)
    # Test has changed
    self.assertFalse(handler.has_changed())
    df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [7, 2, 1, 4]})
    handler.persist_canonical(df)
    self.assertTrue(handler.has_changed())
    result = handler.load_canonical()
    self.assertDictEqual(df.to_dict(), result.to_dict())
    self.assertTrue(handler.has_changed())
    handler.reset_changed()
    self.assertFalse(handler.has_changed())
    df = pd.DataFrame({'C': [9, 8, 7, 3], 'B': [4, 2, 1, 0]})
    handler.persist_canonical(df, reset_state=False)
    result = handler.load_canonical()
    print(result)
def test_persist_handler(self):
    handler = PandasPersistHandler(ConnectorContract('work/data/2_transition/example01.pkl', '', '',
                                                     file_type='pickle'))
    self.assertTrue(isinstance(handler.supported_types(), list) and len(handler.supported_types()) > 0)
    self.assertFalse(handler.has_changed())
    self.assertFalse(handler.exists())
    # create the file and persist
    df = SyntheticBuilder.scratch_pad().model_noise(pd.DataFrame(index=range(1000)), num_columns=10)
    self.assertTrue(handler.persist_canonical(df))
    self.assertTrue(handler.exists())
    self.assertTrue(handler.has_changed())
    df = handler.load_canonical()
    self.assertEqual((1000, 10), df.shape)
    # write again to check modified
    df['value'] = [0] * df.shape[0]
    self.assertTrue(handler.persist_canonical(df))
    df = handler.load_canonical()
    self.assertEqual((1000, 11), df.shape)
    self.assertTrue(handler.has_changed())
    df = df.drop('value', axis='columns')
    self.assertTrue(handler.persist_canonical(df))
    self.assertEqual((1000, 10), df.shape)
    self.assertTrue(handler.has_changed())
def backup_canonical(self, canonical: pd.DataFrame, uri: str, reset_state: bool=None, **kwargs) -> bool:
    """ persists the canonical into the event book extending or replacing the current state

    :param canonical: the canonical to persist to the event book
    :param uri: the uri of the event book
    :param reset_state: True - resets the event book (Default)
                        False - merges the canonical to the current state based on their index
    :return: True if the backup was persisted
    """
    _schema, _book_name, _ = ConnectorContract.parse_address_elements(uri=uri)
    if _schema != 'eb' or len(_book_name) == 0:
        raise ValueError("The connector contract uri must be in the format 'eb://<book_name>' as a minimum")
    reset_state = reset_state if isinstance(reset_state, bool) else True
    self._controller.add_event_book(book_name=_book_name, reset=reset_state)
    # the event is added to the backup book parsed from the uri, not the handler's own book
    self._controller.add_event(book_name=_book_name, event=canonical, fix_index=False)
    return True
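# Usage sketch (assumption): exercises backup_canonical() above through an EventPersistHandler, mirroring
# the 'eb://<book_name>' uri format the method validates; 'eb://test_book_backup' is an illustrative name.
df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [7, 2, 1, 4]})
handler = EventPersistHandler(connector_contract=ConnectorContract(uri='eb://test_book', module_name='',
                                                                   handler=''))
handler.persist_canonical(df)
handler.backup_canonical(canonical=df, uri='eb://test_book_backup')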
def test_runs(self):
    """Basic smoke test"""
    PandasSourceHandler(ConnectorContract('work/data/0_raw/example01.csv', '', '', file_type='csv'))
    PandasPersistHandler(ConnectorContract('work/data/2_transition/example01.pkl', '', '', file_type='pickle'))