Exemplo n.º 1
0
 def synthetic_agent(agent_name: str, size: int, remote_uri: str):
     """Run the full pipeline chain for one agent against a remote repo.

     Builds the synthetic data, runs the transition pipeline over it, then
     runs the feature-catalog pipeline, all using the same task name and
     property-manager repository URI.
     """
     synthetic = SyntheticBuilder.from_env(agent_name, uri_pm_repo=remote_uri)
     synthetic.run_synthetic_pipeline(size=size)
     transition = Transition.from_env(agent_name, uri_pm_repo=remote_uri)
     transition.run_transition_pipeline()
     catalog = FeatureCatalog.from_env(agent_name, uri_pm_repo=remote_uri)
     catalog.run_feature_pipeline()
Exemplo n.º 2
0
 def test_from_component(self):
     """Configure the EventBook handlers via env vars and build a component."""
     # EventBook
     os.environ['HADRON_DEFAULT_PATH'] = 'eb://grey_storage/'
     os.environ[
         'HADRON_DEFAULT_MODULE'] = 'ds_engines.handlers.event_handlers'
     # BUG FIX: the handler class names were swapped — the source handler
     # was set to 'EventPersistHandler' and the persist handler to
     # 'EventSourceHandler'.
     os.environ['HADRON_DEFAULT_SOURCE_HANDLER'] = 'EventSourceHandler'
     os.environ['HADRON_DEFAULT_PERSIST_HANDLER'] = 'EventPersistHandler'
     # Portfolio
     builder = SyntheticBuilder.from_env('members', has_contract=False)
     builder.set_outcome(uri_file="synthetic_members")
     builder = SyntheticBuilder.from_env('members')
 def test_model_group(self):
     """Group the titanic dataset by survived/sex with several aggregators."""
     builder = SyntheticBuilder.from_memory()
     tools: SyntheticIntentModel = builder.tools
     builder.add_connector_uri(
         "titanic",
         uri=
         "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv",
     )
     # 'sum' aggregation of a single numeric header
     df = tools.model_group("titanic",
                            headers="fare",
                            group_by=["survived", "sex"],
                            aggregator="sum")
     self.assertEqual((4, 3), df.shape)
     # 'set' aggregation with list_choice limiting each set
     df = tools.model_group(
         "titanic",
         headers=["class", "embark_town"],
         group_by=["survived", "sex"],
         aggregator="set",
         list_choice=2,
     )
     # print(df.loc[:, ['class', 'embark_town']])
     self.assertEqual((4, 4), df.shape)
     self.assertCountEqual(["class", "embark_town", "survived", "sex"],
                           df.columns.to_list())
     # include_weighting adds an extra 'weighting' column to the result
     df = tools.model_group(
         "titanic",
         headers=["fare", "survived"],
         group_by="sex",
         aggregator="sum",
         include_weighting=True,
     )
     self.assertEqual((2, 4), df.shape)
     self.assertCountEqual(["survived", "sex", "fare", "weighting"],
                           df.columns.to_list())
 def test_flatten_onehot(self):
     """One-hot encode a category column then group it back by 'profile'."""
     builder = SyntheticBuilder.from_env('tester',
                                         default_save_intent=False)
     tools: SyntheticIntentModel = builder.tools
     sample_size = 10
     df = pd.DataFrame()
     df['profile'] = tools.get_number(from_value=1,
                                      to_value=9,
                                      at_most=3,
                                      size=sample_size)
     df['cat'] = tools.get_category(selection=['A', 'B', 'C'],
                                    size=sample_size)
     df['num'] = tools.get_number(from_value=1,
                                  to_value=9,
                                  size=sample_size)
     df['value'] = tools.get_number(from_value=1,
                                    to_value=3,
                                    size=sample_size)
     # one-hot the 'cat' column, then group with regex=True so the
     # generated cat_A/cat_B/cat_C columns match the 'cat' header pattern
     result = tools.model_multihot(df, header='cat')
     result = tools.model_group(result,
                                group_by='profile',
                                headers=['cat', 'value'],
                                regex=True)
     self.assertCountEqual(['profile', 'value', 'cat_A', 'cat_B', 'cat_C'],
                           result.columns.to_list())
 def test_correlate_categories_nulls(self):
     """Correlate a categories map over a column containing nulls."""
     tools = self.tools
     mem_tools = SyntheticBuilder.from_memory().tools
     tax_ids = [
         '993406113', '133757370', '260089066', '448512481', '546434723'
     ]
     df = pd.DataFrame()
     # quantity=0.9 leaves roughly 10% of the values as nulls
     df['pcp_tax_id'] = mem_tools.get_category(selection=tax_ids,
                                               quantity=0.9,
                                               size=100,
                                               column_name='pcp_tax_id')
     names = [
         'LABCORP OF AMERICA',
         'LPCH MEDICAL GROUP',
         'ST JOSEPH HERITAGE MEDICAL',
         'MONARCH HEALTHCARE',
         'PRIVIA MEICAL GROUP',
     ]
     # actions map the positional index of each correlation to a name
     df['pcp_name'] = tools.correlate_categories(
         df,
         header='pcp_tax_id',
         correlations=tax_ids,
         actions=dict(enumerate(names)),
         column_name='pcp_name')
     print(df.head())
Exemplo n.º 6
0
 def test_associate_analysis_complex(self):
     """Model a synthetic frame from an association analysis of real data."""
     builder = SyntheticBuilder.from_memory()
     clinical_health = 'https://assets.datacamp.com/production/repositories/628/datasets/444cdbf175d5fbf564b564bd36ac21740627a834/diabetes.csv'
     builder.add_connector_uri('clinical_health', uri=clinical_health)
     discover: DataDiscovery = Transition.from_memory().discover
     # analyse 'age' as a bucketed int alongside 'pregnancies'
     A = discover.analysis2dict(header='age',
                                dtype='int',
                                granularity=10.0,
                                lower=21,
                                upper=90)
     B = discover.analysis2dict(header='pregnancies')
     columns_list = [A, B]
     df_clinical = builder.load_canonical('clinical_health')
     analysis_blob = discover.analyse_association(df_clinical,
                                                  columns_list=columns_list)
     canonical = pd.DataFrame(index=range(1973))
     df = builder.tools.model_analysis(canonical,
                                       analysis_blob=analysis_blob,
                                       column_name='clinical')
     self.assertEqual((1973, 2), df.shape)
     # the synthesised mean must fall inside the bootstrap confidence
     # interval of the source data's mean
     pregnancies = Commons.list_standardize(
         Commons.list_formatter(df_clinical.pregnancies))
     low, high = discover.bootstrap_confidence_interval(
         pd.Series(pregnancies), func=np.mean)
     pregnancies = Commons.list_standardize(
         Commons.list_formatter(df.pregnancies))
     self.assertTrue(low <= np.mean(pregnancies) <= high)
Exemplo n.º 7
0
 def test_dict_generate(self):
     """_get_canonical with an '@generate' dict runs a named builder task."""
     builder = SyntheticBuilder.from_env('generator', has_contract=False)
     tools: SyntheticIntentModel = builder.tools
     df = pd.DataFrame()
     df['gender'] = tools.get_category(selection=['M', 'F'],
                                       column_name='gender')
     df['age'] = tools.get_number(from_value=18,
                                  to_value=90,
                                  column_name='age')
     # bare generate: columns come from the 'generator' task intent
     target = {'method': '@generate', 'task_name': 'generator'}
     result = tools._get_canonical(data=target)
     self.assertCountEqual(['age', 'gender'], result.columns.to_list())
     # 'size' controls the number of generated rows
     target = {'method': '@generate', 'task_name': 'generator', 'size': 100}
     result = tools._get_canonical(data=target)
     self.assertCountEqual(['age', 'gender'], result.columns.to_list())
     self.assertEqual(100, result.shape[0])
     # a selection filters the generated rows (here: 'M' only, so no 'F')
     selection = [tools.select2dict(column='gender', condition="@=='M'")]
     target = {
         'method': '@generate',
         'task_name': 'generator',
         'size': 100,
         'selection': selection
     }
     result = tools._get_canonical(data=target)
     self.assertGreater(result.shape[0], 0)
     self.assertEqual(0, (result[result['gender'] == 'F']).shape[0])
 def test_correlate_mark_outliers(self):
     """Mark outliers by three methods and verify the marker counts."""
     builder = SyntheticBuilder.from_memory()
     tools: SyntheticIntentModel = builder.tools
     df = pd.DataFrame()
     # seeded normal sample so the outlier counts are reproducible
     df["number"] = tools.get_dist_normal(2, 1, size=1000, seed=99)
     # each method gets its own marker column, added in turn
     for method, measure in (('quantile', 1.5),
                             ('empirical', 3),
                             ('probability', 0.002)):
         df[method] = tools.correlate_mark_outliers(canonical=df,
                                                    header="number",
                                                    measure=measure,
                                                    method=method)
     expected = {'quantile': [992, 8],
                 'empirical': [995, 5],
                 'probability': [996, 4]}
     for column, counts in expected.items():
         self.assertEqual(counts,
                          df[column].value_counts().values.tolist())
Exemplo n.º 9
0
 def test_dict_int(self):
     """An int passed to _get_canonical yields an empty frame of that length."""
     tools = SyntheticBuilder.from_memory().tools
     for length in (0, 2):
         frame = tools._get_canonical(data=length)
         self.assertEqual((length, 0), frame.shape)
Exemplo n.º 10
0
 def test_dict_method_selection(self):
     """_get_canonical dispatches dict 'method' actions to intent methods."""
     builder = SyntheticBuilder.from_memory()
     tools: SyntheticIntentModel = builder.tools
     builder.add_connector_uri(
         'titanic',
         "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv"
     )
     # frame selection: keep only the named headers
     action = tools.canonical2dict(method='frame_selection',
                                   canonical='titanic',
                                   headers=['survived', 'sex', 'fare'])
     result = tools._get_canonical(data=action)
     self.assertEqual((891, 3), result.shape)
     # correlate selection: '@header' action copies the 'sex' column
     action = tools.action2dict(method='@header', header='sex')
     action = tools.canonical2dict(method='correlate_selection',
                                   canonical='titanic',
                                   selection=[],
                                   action=action)
     result = tools._get_canonical(data=action, header='default')
     self.assertEqual((891, 1), result.shape)
     # get selection: single column sized to the source row count
     sample_size = builder.load_canonical('titanic').shape[0]
     action = tools.canonical2dict(method='get_selection',
                                   canonical='titanic',
                                   column_header='survived')
     result = tools._get_canonical(data=action,
                                   header='default',
                                   size=sample_size)
     self.assertEqual((891, 1), result.shape)
 def test_discovery_associate(self):
     """Association analysis keys the result by the analysed column names."""
     tools = SyntheticBuilder.scratch_pad()
     df = pd.DataFrame()
     for column, values in (('cat', list('AB')), ('gender', list('MF'))):
         df[column] = tools.get_category(values, relative_freq=[1, 3],
                                         size=1000)
     result = Discover.analyse_association(df, columns_list=['cat', 'gender'])
     self.assertEqual(['cat', 'gender'], list(result))
 def test_discovery_analytics_class(self):
     """DataAnalytics exposes a category analysis as attribute access."""
     tools = SyntheticBuilder.scratch_pad()
     # category sample that also includes nulls
     dataset = tools.get_category(list('ABCDE')+[np.nan], relative_freq=[1,3,2,7,4], size=694)
     result = Discover.analyse_category(dataset)
     analytics = DataAnalytics(analysis=result)
     self.assertEqual(analytics.intent.selection, analytics.sample_map.index.to_list())
     self.assertEqual(analytics.patterns.sample_distribution, analytics.sample_map.to_list())
    def setUp(self):
        """Reset HADRON env vars and dirs, then build the task1→2→3 chain."""
        # clean out any old environments — snapshot the keys first: deleting
        # from os.environ while iterating its live view raises RuntimeError
        for key in list(os.environ.keys()):
            if key.startswith('HADRON'):
                del os.environ[key]

        os.environ['HADRON_PM_PATH'] = os.path.join('work', 'config')
        os.environ['HADRON_DEFAULT_PATH'] = os.path.join('work', 'data')
        # exist_ok=True: pre-existing dirs are fine on a re-run; genuine
        # failures (e.g. permissions) still raise OSError. The old bare
        # `except: raise IOError(...)` also failed when the dirs existed.
        os.makedirs(os.environ['HADRON_PM_PATH'], exist_ok=True)
        os.makedirs(os.environ['HADRON_DEFAULT_PATH'], exist_ok=True)
        PropertyManager._remove_all()
        # task1 (builder) feeds task2 (transition) feeds task3 (wrangle)
        builder = SyntheticBuilder.from_env('task1', has_contract=False)
        builder.set_persist()
        builder.pm_persist()
        tr = Transition.from_env('task2', has_contract=False)
        tr.set_source_uri(builder.get_persist_contract().raw_uri)
        tr.set_persist()
        tr.pm_persist()
        wr = Wrangle.from_env('task3', has_contract=False)
        wr.set_source_uri(tr.get_persist_contract().raw_uri)
        wr.set_persist()
        wr.pm_persist()
 def test_complex_sample_modelling(self):
     """Build zipcodes from a sample map, merge grouped practitioner ids."""
     tools = SyntheticBuilder.from_memory().tools
     # subset of US states to filter the zipcode sample map by
     state_code = [
         "CA",
         "NY",
         "LA",
         "NJ",
         "VA",
         "CO",
         "NV",
         "GA",
         "IN",
         "OH",
         "KY",
         "ME",
         "MO",
         "WI",
     ]
     df = pd.DataFrame(index=range(100))
     df = tools.model_sample_map(
         canonical=df,
         sample_map="us_zipcode",
         state_filter=state_code,
         column_name="zipcodes",
     )
     # nested action: draw the practitioner sample (city + pcp_tax_id) ...
     sample_data = tools.action2dict(
         method="model_sample_map",
         canonical=tools.action2dict(method="@empty"),
         sample_map="us_healthcare_practitioner",
         headers=["city", "pcp_tax_id"],
         shuffle=False,
     )
     # ... then group it to one list of tax ids per city ...
     merge_data = tools.action2dict(
         method="model_group",
         canonical=sample_data,
         headers="pcp_tax_id",
         group_by="city",
         aggregator="list",
     )
     # ... and left-merge onto the zipcode frame by city
     df = tools.model_merge(
         df,
         merge_data,
         how="left",
         left_on="city",
         right_on="city",
         column_name="pcp_tax_id",
     )
     self.assertCountEqual(
         [
             "city",
             "state_abbr",
             "state",
             "county_fips",
             "county",
             "zipcode",
             "pcp_tax_id",
         ],
         df.columns.to_list(),
     )
 def test_runs(self):
     """Basic smoke test"""
     im = SyntheticBuilder.from_env('tester',
                                    default_save=False,
                                    default_save_intent=False,
                                    reset_templates=False,
                                    has_contract=False).intent_model
     # BUG FIX: `assertTrue(SyntheticIntentModel, type(im))` always passed —
     # the class object is truthy and the second argument is merely the
     # failure message. Assert the type properly instead.
     self.assertIsInstance(im, SyntheticIntentModel)
Exemplo n.º 16
0
 def test_str(self):
     """A string passed to _get_canonical resolves as a connector name."""
     builder = SyntheticBuilder.from_memory()
     tools = builder.tools
     expected = pd.DataFrame(data={'A': list('12345')})
     # persist a frame under the connector name 'test' ...
     builder.add_connector_persist(connector_name='test',
                                   uri_file='test.pickle')
     builder.save_canonical(connector_name='test', canonical=expected)
     # ... and load it back by passing the name as a string
     loaded = tools._get_canonical(data='test')
     self.assertDictEqual(expected.to_dict(), loaded.to_dict())
 def test_model_modify_except(self):
     """model_modifier raises TypeError when a target column is non-numeric."""
     tools: SyntheticIntentModel = SyntheticBuilder.from_memory().tools
     # column B mixes ints with a string, so it is not numeric
     canonical = pd.DataFrame(data={"A": [1, 2, 3, 4, 5],
                                    "B": [1, 2, 'A', 4, 5]})
     other = {"headers": ["A", "B"], "target": [2, 0.2]}
     with self.assertRaises(TypeError) as context:
         tools.model_modifier(canonical, other, aggregator='sum')
     self.assertTrue(
         "The column B is not of type numeric" in str(context.exception))
Exemplo n.º 18
0
 def setUp(self):
     """Reset HADRON env vars/dirs and pre-build the 'builder' task frame."""
     # clean out any old environments — snapshot the keys first: deleting
     # from os.environ while iterating its live view raises RuntimeError
     for key in list(os.environ.keys()):
         if key.startswith('HADRON'):
             del os.environ[key]
     # Local Domain Contract
     os.environ['HADRON_PM_PATH'] = os.path.join('working', 'contracts')
     os.environ['HADRON_PM_TYPE'] = 'json'
     # Local Connectivity
     os.environ['HADRON_DEFAULT_PATH'] = Path('working/data').as_posix()
     # Specialist Component
     # exist_ok=True replaces the old bare `except: pass`, which also
     # swallowed genuine failures such as permission errors
     os.makedirs(os.environ['HADRON_PM_PATH'], exist_ok=True)
     os.makedirs(os.environ['HADRON_DEFAULT_PATH'], exist_ok=True)
     PropertyManager._remove_all()
     builder = SyntheticBuilder.from_env('builder', has_contract=False)
     builder.set_persist()
     sample_size = 10
     df = pd.DataFrame()
     df['cat'] = builder.tools.get_category(selection=['a', 'b', 'c', 'd'],
                                            size=sample_size,
                                            column_name='cat')
     df['norm'] = builder.tools.get_dist_normal(mean=4,
                                                std=1,
                                                size=sample_size,
                                                column_name='norm')
     df['pois'] = builder.tools.get_dist_poisson(interval=7,
                                                 size=sample_size,
                                                 column_name='pois')
     df['norm_std'] = builder.tools.correlate_numbers(
         df, header='norm', standardize=True, column_name='norm_std')
     # jitter1..jitter5: the same poisson column at increasing jitter levels
     for idx, jitter in enumerate((0.1, 0.8, 1.5, 2, 3), start=1):
         name = f'jitter{idx}'
         df[name] = builder.tools.correlate_numbers(df,
                                                    header='pois',
                                                    jitter=jitter,
                                                    column_name=name)
     builder.run_component_pipeline()
 def test_analyse_date(self):
     """Analyse string dates and timestamp dates against pinned controls.

     BUG FIX: the second `control` dict listed 'excluded_percent' twice
     inside 'stats'; the duplicate key meant the first entry was silently
     discarded. The dead duplicate has been removed.
     """
     tools = SyntheticBuilder.scratch_pad()
     str_dates = tools.get_datetime('12/01/2016', '12/01/2018', date_format='%d-%m-%Y', size=10, seed=31)
     ts_dates = tools.get_datetime('12/01/2016', '12/01/2018', size=10, seed=31)
     # string dates, analysed with an explicit output date_format
     result = Discover.analyse_date(str_dates, granularity=3, date_format='%Y-%m-%d')
     control = {'intent': {'date_format': '%Y-%m-%d',
                           'day_first': False,
                           'dtype': 'date',
                           'granularity': 3,
                           'lowest': '2017-12-02',
                           'selection': [('2017-12-02', '2018-03-13', 'both'),
                                         ('2018-03-13', '2018-06-22', 'right'),
                                         ('2018-06-22', '2018-10-01', 'right')],
                           'highest': '2018-10-01',
                           'freq_precision': 2,
                           'year_first': False},
                'patterns': {'sample_distribution': [1, 0, 3],
                             'relative_freq': [25.0, 0.0, 75.0]},
                'stats': {'bootstrap_bci': (17572.5, 17797.75),
                          'emp_outliers': [0, 0],
                          'excluded_percent': 0.0,
                          'irq_outliers': [1, 0],
                          'kurtosis': 3.86,
                          'mean': '2018-07-04',
                          'nulls_percent': 60.0,
                          'sample': 4,
                          'skew': -1.96}}
     self.assertEqual(control, result)
     # timezone-aware timestamps, analysed without a date_format
     result = Discover.analyse_date(ts_dates, granularity=3)
     control = {'intent': {'day_first': False,
                           'dtype': 'date',
                           'granularity': 3,
                           'lowest': pd.Timestamp('2017-02-12 19:02:11.531780+0000', tz='UTC'),
                           'selection': [(pd.Timestamp('2017-02-12 19:02:11.531780+0000', tz='UTC'),
                                          pd.Timestamp('2017-09-08 17:43:30.973860+0000', tz='UTC'),
                                          'both'),
                                         (pd.Timestamp('2017-09-08 17:43:30.973860+0000', tz='UTC'),
                                          pd.Timestamp('2018-04-04 16:24:50.415940+0000', tz='UTC'),
                                          'right'),
                                         (pd.Timestamp('2018-04-04 16:24:50.415940+0000', tz='UTC'),
                                          pd.Timestamp('2018-10-29 15:06:09.858020+0000', tz='UTC'),
                                          'right')],
                           'highest': pd.Timestamp('2018-10-29 15:06:09.858020+0000', tz='UTC'),
                           'freq_precision': 2,
                           'year_first': False},
                'patterns': {'sample_distribution': [2, 3, 5],
                             'relative_freq': [20.0, 30.0, 50.0]},
                'stats': {'bootstrap_bci': (17493.5054775573, 17724.4628926684),
                          'emp_outliers': [0, 0],
                          'excluded_percent': 0.0,
                          'irq_outliers': [1, 0], 'kurtosis': 0.64,
                          'mean': pd.Timestamp('2018-03-22 17:31:12+0000', tz='UTC'),
                          'nulls_percent': 0.0,
                          'sample': 10,
                          'skew': -0.94}}
     self.assertEqual(control, result)
 def test_correlate_number_encode(self):
     """Apply a log transform to a poisson-distributed numeric column."""
     tools: SyntheticIntentModel = SyntheticBuilder.from_memory().tools
     df = pd.DataFrame()
     df["number"] = tools.get_dist_poisson(interval=2, size=10)
     transformed = tools.correlate_numbers(canonical=df,
                                           header="number",
                                           transform="log")
     print(transformed)
 def test_model_dict(self):
     """Explode a column of dicts into sibling columns via _model_dict_column."""
     builder = SyntheticBuilder.from_memory()
     tools: SyntheticIntentModel = builder.tools
     # dicts as objects, including a None row
     sample = [
         {
             "task": "members_sim",
             "source": 100000
         },
         {
             "task": "pcp_sim",
             "source": 0
         },
         {
             "task": "members_gen",
             "source": "members_sim",
             "persist": True
         },
         None,
     ]
     df = pd.DataFrame(data={
         "A": [5, 2, 3, 4],
         "X": sample,
         "Y": list("VWXY")
     })
     df = tools._model_dict_column(df, header="X")
     self.assertCountEqual(["A", "Y", "task", "source", "persist"],
                           df.columns.to_list())
     # as strings
     sample = [
         "{'task': 'members_sim', 'source': 100000}",
         "{'task': 'pcp_sim', 'source': 0}",
         "{'task': 'members_gen', 'source': 'members_sim', 'persist': True}",
         None,
     ]
     df = pd.DataFrame(data={
         "A": [5, 2, None, 4],
         "X": sample,
         "Y": list("VWXY")
     })
     df = tools._model_dict_column(df, header="X", convert_str=True)
     self.assertCountEqual(["A", "Y", "task", "source", "persist"],
                           df.columns.to_list())
     # replace nulls: the None row's exploded cells take the replacement
     df = pd.DataFrame(data={
         "A": [5, 2, None, 4],
         "X": sample,
         "Y": list("VWXY")
     })
     df = tools._model_dict_column(df,
                                   header="X",
                                   convert_str=True,
                                   replace_null="default")
     self.assertEqual(
         ["default", "default", "default"],
         df.loc[3, ["task", "source", "persist"]].to_list(),
     )
Exemplo n.º 22
0
 def test_set_report_persist(self):
     """Bootstrap connector files are named with the project-name prefix."""
     builder = SyntheticBuilder.from_env('tester',
                                         default_save=False,
                                         has_contract=False)
     builder.setup_bootstrap(domain='domain',
                             project_name='project_name',
                             path=None)
     connectors = builder.report_connectors(stylise=False)
     # inspect the file name of the last reported connector uri
     file_name = os.path.split(connectors.uri.iloc[-1])[-1]
     self.assertTrue(file_name.startswith('project_name'))
 def test_json(self):
     """Round-trip a noise frame through the JSON persist handler."""
     df = SyntheticBuilder.scratch_pad().model_noise(
         pd.DataFrame(index=range(1000)), num_columns=10)
     contract = ConnectorContract(
         'work/data/2_transition/handler_test.json', '', '',
         file_type='json')
     handler = PandasPersistHandler(contract)
     handler.persist_canonical(df)
     self.assertTrue(handler.exists())
     reloaded = pd.DataFrame(data=handler.load_canonical())
     self.assertEqual(df.shape, reloaded.shape)
     self.assertCountEqual(df.columns, reloaded.columns)
     # clean up the persisted file
     handler.remove_canonical()
     self.assertFalse(handler.exists())
Exemplo n.º 24
0
 def test_dict_method_model(self):
     """A 'model_sample_map' action dict builds a persona sample frame."""
     builder = SyntheticBuilder.from_env('generator', has_contract=False)
     tools: SyntheticIntentModel = builder.tools
     # nested '@empty' action supplies a 100-row empty canonical
     action = tools.canonical2dict(method='model_sample_map',
                                   canonical=tools.action2dict(
                                       method='@empty', size=100),
                                   sample_map='us_persona',
                                   female_bias=0.3)
     result = tools._get_canonical(data=action)
     self.assertEqual((100, 5), result.shape)
     # female_bias=0.3 of 100 rows -> 30 'F' values
     self.assertEqual(30, result['gender'].value_counts().loc['F'])
Exemplo n.º 25
0
 def test_list(self):
     """Lists and Series become one-column frames; 'header' names the column."""
     tools = SyntheticBuilder.from_memory().tools
     sample = list('12345')
     # default column name when no header is given
     frame = tools._get_canonical(data=sample)
     self.assertEqual(sample, frame['default'].to_list())
     # explicit header names the column
     frame = tools._get_canonical(data=sample, header='sample')
     self.assertEqual(sample, frame['sample'].to_list())
     # a Series behaves like a list
     series = pd.Series(sample)
     frame = tools._get_canonical(data=series, header='sample')
     self.assertEqual(series.to_list(), frame['sample'].to_list())
 def setUp(self):
     """Create the working directories and the scratch-pad intent models."""
     os.environ['HADRON_PM_PATH'] = os.path.join('work', 'config')
     os.environ['HADRON_DEFAULT_PATH'] = os.path.join('work', 'data')
     # exist_ok=True replaces the old bare `except: pass`, which also
     # swallowed genuine failures such as permission errors
     os.makedirs(os.environ['HADRON_PM_PATH'], exist_ok=True)
     os.makedirs(os.environ['HADRON_DEFAULT_PATH'], exist_ok=True)
     PropertyManager._remove_all()
     self.tools = SyntheticBuilder.scratch_pad()
     self.clean = Transition.scratch_pad()
 def test_filter_correlated(self):
     """Smoke-test filter_correlated over strongly correlated columns."""
     tools = SyntheticBuilder.scratch_pad()
     df = pd.DataFrame()
     df['col1'] = [1, 2, 3, 4, 5, 6, 7]
     df['col2'] = [1, 2, 3, 4, 5, 6, 7]
     df['col3'] = [2, 2, 3, 2, 2, 2, 3]
     # BUG FIX: col4 was assigned twice — the first assignment (a copy of
     # col3) was immediately overwritten by the values below, so it was
     # dead code. Assigned once here, keeping the original column order.
     df['col4'] = [7, 2, 4, 2, 1, 6, 4]
     df['col5'] = [2, 2, 3, 2, 2, 2, 3]
     df['target'] = [1, 0, 1, 1, 0, 0, 1]
     result = DataDiscovery.filter_correlated(df, target='target')
     print(result)
Exemplo n.º 28
0
 def test_dict_to_dataframe(self):
     """Equal-length dict columns convert; ragged lengths raise ValueError."""
     tools: SyntheticIntentModel = SyntheticBuilder.from_memory().tools
     data = {'A': [1, 2, 3, 4, 5], 'B': list('ABCDE'), 'C': list('ABCDE')}
     frame = tools._get_canonical(data=data)
     self.assertEqual((5, 3), frame.shape)
     self.assertEqual(list('ABC'), frame.columns.to_list())
     # mismatched column lengths cannot form a frame
     ragged = {'A': [4, 5], 'B': list('ABCDE'), 'C': list('ABCD')}
     with self.assertRaises(ValueError) as context:
         tools._get_canonical(data=ragged)
     self.assertTrue("The canonical data passed was of type 'dict'" in str(
         context.exception))
Exemplo n.º 29
0
 def test_filter_correlate(self):
     """auto_brute_force_correlated drops correlated columns by threshold."""
     builder = SyntheticBuilder.from_env('builder')
     tr = Transition.from_env("tr1", has_contract=False)
     cleaners: TransitionIntentModel = tr.cleaners
     tr.set_source_uri(builder.get_persist_contract().raw_uri)
     tr.set_persist()
     df = tr.load_source_canonical()
     self.assertEqual((1000, 9), df.shape)
     # default threshold removes two correlated columns
     df = cleaners.auto_brute_force_correlated(df)
     self.assertEqual((1000, 7), df.shape)
     # a lower threshold is more aggressive
     df = cleaners.auto_brute_force_correlated(df, threshold=0.8)
     self.assertEqual((1000, 3), df.shape)
 def test_persist_backup(self):
     """Persist, reload, remove and backup a canonical via the handler."""
     handler = PandasPersistHandler(ConnectorContract('work/data/2_transition/example01.json', '', ''))
     df = SyntheticBuilder.scratch_pad().model_noise(pd.DataFrame(index=range(1000)), num_columns=10)
     self.assertTrue(handler.persist_canonical(df))
     df = pd.DataFrame(data=handler.load_canonical())
     self.assertEqual((1000, 10), df.shape)
     handler.remove_canonical()
     self.assertFalse(handler.exists())
     # Backup
     # the uri carries a query param, so existence is checked on the
     # parsed address rather than the raw uri string
     uri = 'work/data/2_transition/example01.pq.bak?file_type=parquet'
     self.assertFalse(os.path.exists(uri))
     handler.backup_canonical(canonical=df, uri=uri)
     self.assertTrue(os.path.exists(ConnectorContract.parse_address(uri)))