Exemplo n.º 1
0
 def synthetic_agent(agent_name: str, size: int, remote_uri: str):
     """Run the synthetic, transition and feature pipelines in sequence.

     Each component is created with ``from_env`` and its pipeline is run
     before the next component is created.

     :param agent_name: first positional argument of every ``from_env`` call
     :param size: forwarded as ``size`` to ``run_synthetic_pipeline``
     :param remote_uri: forwarded as ``uri_pm_repo`` to every ``from_env`` call
     """
     synthetic = SyntheticBuilder.from_env(agent_name, uri_pm_repo=remote_uri)
     synthetic.run_synthetic_pipeline(size=size)

     transition = Transition.from_env(agent_name, uri_pm_repo=remote_uri)
     transition.run_transition_pipeline()

     catalog = FeatureCatalog.from_env(agent_name, uri_pm_repo=remote_uri)
     catalog.run_feature_pipeline()
Exemplo n.º 2
0
 def test_from_component(self):
     """Create the 'members' builder with EventBook defaults, then re-create it.

     The EventBook path, module and handler defaults are written to the
     environment before the first ``from_env`` call. The second ``from_env``
     call omits ``has_contract=False``.
     """
     # EventBook
     os.environ['HADRON_DEFAULT_PATH'] = 'eb://grey_storage/'
     os.environ[
         'HADRON_DEFAULT_MODULE'] = 'ds_engines.handlers.event_handlers'
     # the source default names the source handler class and the persist
     # default names the persist handler class
     os.environ['HADRON_DEFAULT_SOURCE_HANDLER'] = 'EventSourceHandler'
     os.environ['HADRON_DEFAULT_PERSIST_HANDLER'] = 'EventPersistHandler'
     # Portfolio
     builder = SyntheticBuilder.from_env('members', has_contract=False)
     builder.set_outcome(uri_file="synthetic_members")
     # NOTE(review): presumably this reloads the contract created above
     builder = SyntheticBuilder.from_env('members')
    def setUp(self):
        """Reset the HADRON environment and chain three persisted tasks.

        Removes every ``HADRON*`` environment variable, points the property
        manager and default data paths under ``work``, then creates 'task1'
        (SyntheticBuilder), 'task2' (Transition) and 'task3' (Wrangle). Each
        later task takes the previous task's persist URI as its source.

        :raises IOError: if the working directories cannot be created
        """
        # clean out any old environments; iterate over a snapshot of the
        # keys because entries are deleted inside the loop
        for key in list(os.environ):
            if key.startswith('HADRON'):
                del os.environ[key]

        os.environ['HADRON_PM_PATH'] = os.path.join('work', 'config')
        os.environ['HADRON_DEFAULT_PATH'] = os.path.join('work', 'data')
        try:
            # directories left over from an earlier run are acceptable
            os.makedirs(os.environ['HADRON_PM_PATH'], exist_ok=True)
            os.makedirs(os.environ['HADRON_DEFAULT_PATH'], exist_ok=True)
        except OSError as err:
            raise IOError('Unable to create directories') from err
        PropertyManager._remove_all()
        builder = SyntheticBuilder.from_env('task1', has_contract=False)
        builder.set_persist()
        builder.pm_persist()
        tr = Transition.from_env('task2', has_contract=False)
        tr.set_source_uri(builder.get_persist_contract().raw_uri)
        tr.set_persist()
        tr.pm_persist()
        wr = Wrangle.from_env('task3', has_contract=False)
        wr.set_source_uri(tr.get_persist_contract().raw_uri)
        wr.set_persist()
        wr.pm_persist()
Exemplo n.º 4
0
 def test_dict_generate(self):
     """Request '@generate' canonicals for the 'generator' task.

     The canonical is requested three ways: without a size, with a size of
     100, and with a size of 100 plus a selection on gender 'M'.
     """
     builder = SyntheticBuilder.from_env('generator', has_contract=False)
     tools: SyntheticIntentModel = builder.tools
     expected_headers = ['age', 'gender']

     # NOTE(review): passing column_name presumably records the intent
     # against the 'generator' task - confirm
     frame = pd.DataFrame()
     frame['gender'] = tools.get_category(selection=['M', 'F'],
                                          column_name='gender')
     frame['age'] = tools.get_number(from_value=18,
                                     to_value=90,
                                     column_name='age')

     # no explicit size
     request = {'method': '@generate', 'task_name': 'generator'}
     generated = tools._get_canonical(data=request)
     self.assertCountEqual(expected_headers, generated.columns.to_list())

     # explicit size
     request = {**request, 'size': 100}
     generated = tools._get_canonical(data=request)
     self.assertCountEqual(expected_headers, generated.columns.to_list())
     self.assertEqual(100, generated.shape[0])

     # explicit size and a selection
     only_male = [tools.select2dict(column='gender', condition="@=='M'")]
     request = {**request, 'selection': only_male}
     generated = tools._get_canonical(data=request)
     self.assertGreater(generated.shape[0], 0)
     females = generated[generated['gender'] == 'F']
     self.assertEqual(0, females.shape[0])
 def test_flatten_onehot(self):
     """Apply model_multihot on 'cat', then model_group by 'profile'.

     The grouped result is expected to hold exactly the headers 'profile',
     'value', 'cat_A', 'cat_B' and 'cat_C'.
     """
     builder = SyntheticBuilder.from_env('tester',
                                         default_save_intent=False)
     tools: SyntheticIntentModel = builder.tools
     rows = 10

     frame = pd.DataFrame()
     frame['profile'] = tools.get_number(from_value=1,
                                         to_value=9,
                                         at_most=3,
                                         size=rows)
     frame['cat'] = tools.get_category(selection=['A', 'B', 'C'], size=rows)
     frame['num'] = tools.get_number(from_value=1, to_value=9, size=rows)
     frame['value'] = tools.get_number(from_value=1, to_value=3, size=rows)

     encoded = tools.model_multihot(frame, header='cat')
     grouped = tools.model_group(encoded,
                                 group_by='profile',
                                 headers=['cat', 'value'],
                                 regex=True)

     expected_headers = ['profile', 'value', 'cat_A', 'cat_B', 'cat_C']
     self.assertCountEqual(expected_headers, grouped.columns.to_list())
 def test_runs(self):
     """Basic smoke test: the builder exposes a SyntheticIntentModel."""
     im = SyntheticBuilder.from_env('tester',
                                    default_save=False,
                                    default_save_intent=False,
                                    reset_templates=False,
                                    has_contract=False).intent_model
     # assertTrue(a, b) only checks that `a` is truthy and uses `b` as the
     # failure message, so it could never fail; check the type explicitly
     self.assertIsInstance(im, SyntheticIntentModel)
Exemplo n.º 7
0
 def setUp(self):
     """Reset the HADRON environment and run the 'builder' component pipeline.

     Removes every ``HADRON*`` environment variable, points the contract and
     data paths under ``working``, then calls the builder tools for a
     category, a normal, a poisson, a standardised and five jittered columns
     before running the component pipeline.
     """
     # clean out any old environments; iterate over a snapshot of the keys
     # because entries are deleted inside the loop
     for key in list(os.environ):
         if key.startswith('HADRON'):
             del os.environ[key]
     # Local Domain Contract
     os.environ['HADRON_PM_PATH'] = os.path.join('working', 'contracts')
     os.environ['HADRON_PM_TYPE'] = 'json'
     # Local Connectivity
     os.environ['HADRON_DEFAULT_PATH'] = Path('working/data').as_posix()
     # Specialist Component
     # already-existing directories are fine; other OS errors are no longer
     # silently swallowed
     os.makedirs(os.environ['HADRON_PM_PATH'], exist_ok=True)
     os.makedirs(os.environ['HADRON_DEFAULT_PATH'], exist_ok=True)
     PropertyManager._remove_all()
     builder = SyntheticBuilder.from_env('builder', has_contract=False)
     builder.set_persist()
     sample_size = 10
     df = pd.DataFrame()
     df['cat'] = builder.tools.get_category(selection=['a', 'b', 'c', 'd'],
                                            size=sample_size,
                                            column_name='cat')
     df['norm'] = builder.tools.get_dist_normal(mean=4,
                                                std=1,
                                                size=sample_size,
                                                column_name='norm')
     df['pois'] = builder.tools.get_dist_poisson(interval=7,
                                                 size=sample_size,
                                                 column_name='pois')
     df['norm_std'] = builder.tools.correlate_numbers(
         df, header='norm', standardize=True, column_name='norm_std')
     # jitter1..jitter5 are all derived from 'pois' with increasing jitter
     for position, jitter in enumerate((0.1, 0.8, 1.5, 2, 3), start=1):
         column = f'jitter{position}'
         df[column] = builder.tools.correlate_numbers(df,
                                                      header='pois',
                                                      jitter=jitter,
                                                      column_name=column)
     builder.run_component_pipeline()
Exemplo n.º 8
0
 def test_set_report_persist(self):
     """After bootstrap, the last connector URI file starts with the project name."""
     builder = SyntheticBuilder.from_env('tester',
                                         default_save=False,
                                         has_contract=False)
     builder.setup_bootstrap(domain='domain',
                             project_name='project_name',
                             path=None)
     connectors = builder.report_connectors(stylise=False)
     last_uri = connectors.uri.iloc[-1]
     # only the file-name part of the URI is checked
     file_name = os.path.basename(last_uri)
     self.assertTrue(file_name.startswith('project_name'))
Exemplo n.º 9
0
 def test_dict_method_model(self):
     """Resolve a 'model_sample_map' action built on a 100 row '@empty' canonical.

     The result is expected to have shape (100, 5) and exactly 30 rows with
     gender 'F' for a ``female_bias`` of 0.3.
     """
     builder = SyntheticBuilder.from_env('generator', has_contract=False)
     tools: SyntheticIntentModel = builder.tools

     empty_canonical = tools.action2dict(method='@empty', size=100)
     action = tools.canonical2dict(method='model_sample_map',
                                   canonical=empty_canonical,
                                   sample_map='us_persona',
                                   female_bias=0.3)
     persona = tools._get_canonical(data=action)

     self.assertEqual((100, 5), persona.shape)
     gender_counts = persona['gender'].value_counts()
     self.assertEqual(30, gender_counts.loc['F'])
Exemplo n.º 10
0
 def test_filter_correlate(self):
     """Check the shape after each auto_brute_force_correlated pass.

     The source canonical is loaded from the 'builder' persist URI and is
     expected to shrink from 9 to 7 columns by default, then to 3 columns
     with a threshold of 0.8.
     """
     builder = SyntheticBuilder.from_env('builder')
     transition = Transition.from_env("tr1", has_contract=False)
     cleaners: TransitionIntentModel = transition.cleaners
     source_uri = builder.get_persist_contract().raw_uri
     transition.set_source_uri(source_uri)
     transition.set_persist()

     canonical = transition.load_source_canonical()
     self.assertEqual((1000, 9), canonical.shape)

     canonical = cleaners.auto_brute_force_correlated(canonical)
     self.assertEqual((1000, 7), canonical.shape)

     canonical = cleaners.auto_brute_force_correlated(canonical,
                                                      threshold=0.8)
     self.assertEqual((1000, 3), canonical.shape)
 def test_remove_unwanted_rows(self):
     """frame_selection with 'survived' == 1 leaves a minimum 'survived' of 1."""
     builder = SyntheticBuilder.from_env("test",
                                         default_save=False,
                                         default_save_intent=False,
                                         has_contract=False)
     titanic_uri = (
         "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv"
     )
     builder.set_source_uri(uri=titanic_uri)

     survivors_only = [
         builder.tools.select2dict(column="survived", condition="@==1")
     ]
     selected = builder.tools.frame_selection(
         canonical=builder.CONNECTOR_SOURCE, selection=survivors_only)

     self.assertEqual(1, selected["survived"].min())
 def test_model_sample(self):
     """model_sample fills a 1973 row frame from a 4 row 'sample' connector."""
     builder = SyntheticBuilder.from_env("test",
                                         default_save=False,
                                         default_save_intent=False,
                                         has_contract=False)
     tools: SyntheticIntentModel = builder.tools
     builder.add_connector_persist(connector_name="sample",
                                   uri_file="sample.parquet")

     # persist a small reference frame under the 'sample' connector
     reference = pd.DataFrame({
         "age": [20, 34, 50, 75],
         "gender": list("MMFM"),
     })
     builder.persist_canonical(connector_name="sample", canonical=reference)

     target = pd.DataFrame(index=range(1973))
     target = tools.model_sample(target,
                                 other="sample",
                                 headers=["age", "gender"])
     self.assertEqual((1973, 2), target.shape)
Exemplo n.º 13
0
 def test_run_synthetic_pipeline_seed(self):
     """Two pipeline runs with seed 23 give the same gender counts and age mean."""
     builder = SyntheticBuilder.from_env('tester', has_contract=False)
     builder.set_persist()
     tools: SyntheticIntentModel = builder.tools
     _ = tools.get_category(selection=['M', 'F'],
                            relative_freq=[4, 3],
                            column_name='gender')
     _ = tools.get_number(from_value=18, to_value=80, column_name='age')

     def run_and_summarise():
         # run the pipeline, reload what was persisted and summarise it
         builder.run_component_pipeline(size=1000, seed=23)
         persisted = builder.load_persist_canonical()
         return (persisted['gender'].value_counts().values,
                 persisted['age'].mean())

     first_dist, first_mean = run_and_summarise()
     second_dist, second_mean = run_and_summarise()

     self.assertCountEqual(first_dist, second_dist)
     self.assertEqual(first_mean, second_mean)
Exemplo n.º 14
0
 def test_canonical_run_pipeline_runbook(self):
     """Run the intent pipeline with the primary run book and without one.

     Run book levels 'values' and 'numbers' are added; 'addition' is created
     afterwards with no run book level. With the primary run book only
     'values' and 'numbers' are expected; without a run book all three
     columns are expected.
     """
     builder = SyntheticBuilder.from_env('sample', has_contract=False)
     tools = builder.tools
     df = pd.DataFrame()
     df['values'] = tools.get_category(selection=['A', 'B'],
                                       column_name='values')
     builder.add_run_book_level(run_level='values')
     df['numbers'] = tools.get_number(1, 2, column_name='numbers')
     builder.add_run_book_level(run_level='numbers')
     df['addition'] = tools.get_number(1, 2, column_name='addition')
     # read the run book name from the builder configured in this test
     # rather than from an unrelated attribute on the test case
     result = tools.run_intent_pipeline(
         canonical=10, run_book=builder.pm.PRIMARY_RUN_BOOK)
     self.assertEqual(['values', 'numbers'], result.columns.to_list())
     result = tools.run_intent_pipeline(canonical=10)
     self.assertEqual(['values', 'numbers', 'addition'],
                      result.columns.to_list())
Exemplo n.º 15
0
 def setUp(self):
     """Point HADRON at the ``work`` directories and create the test fixtures.

     Sets ``self.tools`` to the intent model of a 'tester' SyntheticBuilder
     and ``self.fc`` to a 'tester' FeatureCatalog.
     """
     os.environ['HADRON_PM_PATH'] = os.path.join('work', 'config')
     os.environ['HADRON_DEFAULT_PATH'] = os.path.join('work', 'data')
     # create each directory independently so that an existing config
     # directory does not prevent the data directory from being created
     os.makedirs(os.environ['HADRON_PM_PATH'], exist_ok=True)
     os.makedirs(os.environ['HADRON_DEFAULT_PATH'], exist_ok=True)
     PropertyManager._remove_all()
     self.tools = SyntheticBuilder.from_env('tester',
                                            default_save=False,
                                            default_save_intent=False,
                                            has_contract=False).intent_model
     self.fc: FeatureCatalog = FeatureCatalog.from_env('tester',
                                                       default_save=False,
                                                       has_contract=False)
Exemplo n.º 16
0
 def test_dict_empty(self):
     """Check the shape returned for '@empty' canonical requests.

     Covers no arguments, a size of 100, and a size of 100 with three
     headers. Each case runs in its own sub-test so that one failure does
     not hide the others.
     """
     builder = SyntheticBuilder.from_env('generator', has_contract=False)
     tools: SyntheticIntentModel = builder.tools
     # (extra canonical2dict keyword arguments, expected shape)
     cases = [
         ({}, (0, 0)),
         ({'size': 100}, (100, 0)),
         ({'size': 100, 'headers': ['A', 'B', 'C']}, (100, 3)),
     ]
     for extra_kwargs, expected_shape in cases:
         with self.subTest(**extra_kwargs):
             action = tools.canonical2dict(method='@empty', **extra_kwargs)
             result = tools._get_canonical(data=action)
             self.assertEqual(expected_shape, result.shape)
Exemplo n.º 17
0
 def test_model_iterator(self):
     """Exercise model_iterator against the 'titanic' connector.

     Covers a plain call, a marker column, a selection, an iteration stop
     and per-iteration actions.
     """
     builder = SyntheticBuilder.from_env("test",
                                         default_save=False,
                                         default_save_intent=False,
                                         has_contract=False)
     tools: SyntheticIntentModel = builder.tools
     titanic_uri = (
         "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv"
     )
     builder.add_connector_uri("titanic", uri=titanic_uri)

     # do nothing
     iterated = tools.model_iterator(canonical="titanic")
     source_shape = builder.load_canonical("titanic").shape
     self.assertEqual(source_shape, iterated.shape)

     # add marker
     iterated = tools.model_iterator(canonical="titanic", marker_col="marker")
     expected_width = builder.load_canonical("titanic").shape[1] + 1
     self.assertEqual(expected_width, iterated.shape[1])

     # with selection
     survivors = [tools.select2dict(column="survived", condition="@==1")]
     control = tools.frame_selection(canonical="titanic",
                                     selection=survivors)
     iterated = tools.model_iterator(canonical="titanic",
                                     marker_col="marker",
                                     selection=survivors)
     self.assertEqual(control.shape[0], iterated.shape[0])

     # with iteration
     iterated = tools.model_iterator(canonical="titanic",
                                     marker_col="marker",
                                     iter_stop=3)
     markers = iterated["marker"].value_counts().index.to_list()
     self.assertCountEqual([0, 1, 2], markers)

     # with actions
     category_action = tools.action2dict(method="get_category",
                                         selection=[4, 5])
     actions = {2: category_action}
     iterated = tools.model_iterator(
         canonical="titanic",
         marker_col="marker",
         iter_stop=3,
         iteration_actions=actions,
     )
     markers = iterated["marker"].value_counts().index.to_list()
     self.assertCountEqual([0, 1, 4, 5], markers)
 def test_correlate_date(self):
     """correlate_date_diff returns 60.0 for each of the ten date pairs.

     Both date columns are generated with seed 31 and size 10; the second
     range starts on 2020-03-01 instead of 2020-01-01.
     """
     builder = SyntheticBuilder.from_env('tester',
                                         default_save_intent=False)
     tools: SyntheticIntentModel = builder.tools

     dates = pd.DataFrame()
     dates['date1'] = tools.get_datetime(start='2020-01-01',
                                         until='2020-02-01',
                                         seed=31,
                                         size=10)
     dates['date2'] = tools.get_datetime(start='2020-03-01',
                                         until='2020-04-01',
                                         seed=31,
                                         size=10)

     difference = tools.correlate_date_diff(dates,
                                            first_date='date1',
                                            second_date='date2')
     expected = [60.0] * 10
     self.assertEqual(expected, difference)
 def test_model_columns_headers(self):
     """model_concat by columns keeps only the requested headers and 300 rows."""
     builder = SyntheticBuilder.from_env("test",
                                         default_save=False,
                                         default_save_intent=False,
                                         has_contract=False)
     tools: SyntheticIntentModel = builder.tools
     titanic_uri = (
         "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv"
     )
     builder.set_source_uri(uri=titanic_uri)

     wanted = ["survived", "sex", "fare"]
     base = pd.DataFrame(index=range(300))
     combined = tools.model_concat(
         base,
         other=builder.CONNECTOR_SOURCE,
         as_rows=False,
         headers=["survived", "sex", "fare"],
     )

     self.assertCountEqual(wanted, list(combined.columns))
     self.assertEqual(300, combined.shape[0])
Exemplo n.º 20
0
 def test_synthetic_with_no_source(self):
     """After a controller run, 'task3' reports a persist connector but no source."""
     # start from an empty contract directory
     shutil.rmtree('work/config')
     os.makedirs(os.environ['HADRON_PM_PATH'])
     PropertyManager._remove_all()

     builder = SyntheticBuilder.from_env('task3', has_contract=False)
     tools: SyntheticIntentModel = builder.tools
     builder.set_persist()
     frame = pd.DataFrame(index=range(10))
     tools.model_noise(frame, num_columns=5, column_name='noise')

     controller = Controller.from_env(has_contract=False)
     controller.intent_model.synthetic_builder(frame, task_name='task3')
     controller.run_controller()

     connector_names = builder.report_connectors(
         stylise=False)['connector_name'].to_list()
     self.assertIn(builder.CONNECTOR_PERSIST, connector_names)

     connector_names = builder.report_connectors(
         stylise=False)['connector_name'].to_list()
     self.assertNotIn(builder.CONNECTOR_SOURCE, connector_names)
 def setUp(self):
     """Reset the HADRON environment and create a persisted 'tester' builder.

     Removes every ``HADRON*`` environment variable, points the contract and
     data paths under ``working``, clears the property managers and calls
     ``set_persist`` on a fresh 'tester' SyntheticBuilder.
     """
     # clean out any old environments; iterate over a snapshot of the keys
     # because entries are deleted inside the loop
     for key in list(os.environ):
         if key.startswith('HADRON'):
             del os.environ[key]
     # Local Domain Contract
     os.environ['HADRON_PM_PATH'] = os.path.join('working', 'contracts')
     os.environ['HADRON_PM_TYPE'] = 'json'
     # Local Connectivity
     os.environ['HADRON_DEFAULT_PATH'] = Path('working/data').as_posix()
     # Specialist Component
     # already-existing directories are fine; other OS errors are no longer
     # silently swallowed
     os.makedirs(os.environ['HADRON_PM_PATH'], exist_ok=True)
     os.makedirs(os.environ['HADRON_DEFAULT_PATH'], exist_ok=True)
     PropertyManager._remove_all()
     builder = SyntheticBuilder.from_env('tester', has_contract=False)
     builder.set_persist()
     tools: SyntheticIntentModel = builder.tools
 def setUp(self):
     """Set HADRON_PM_PATH to ``<working dir>/work`` and create ``self.tools``.

     ``self.tools`` is the intent model of a 'tester' SyntheticBuilder
     created with saving disabled and no existing contract.
     """
     # PWD is not guaranteed to be set (for example on Windows); fall back
     # to the process working directory instead of raising KeyError
     working_dir = os.environ.get('PWD', os.getcwd())
     os.environ['HADRON_PM_PATH'] = os.path.join(working_dir, 'work')
     self.tools: SyntheticIntentModel = SyntheticBuilder.from_env('tester', default_save=False,
                                                                  default_save_intent=False,
                                                                  has_contract=False).intent_model
Exemplo n.º 23
0
 def test_runs(self):
     """Basic smoke test: from_env returns exactly a SyntheticBuilder."""
     instance = SyntheticBuilder.from_env('tester', has_contract=False)
     actual_type = type(instance)
     self.assertEqual(SyntheticBuilder, actual_type)
Exemplo n.º 24
0
 def builder(self) -> SyntheticBuilder:
     """Return a 'tester' SyntheticBuilder created with ``has_contract=False``."""
     tester_builder = SyntheticBuilder.from_env('tester', has_contract=False)
     return tester_builder