def synthetic_agent(agent_name: str, size: int, remote_uri: str):
    """Run the three-stage pipeline for an agent: synthesize, transition, then catalog features.

    :param agent_name: the named task to run each component against
    :param size: the number of synthetic rows to generate
    :param remote_uri: the property-manager repo uri shared by all three components
    """
    synth = SyntheticBuilder.from_env(agent_name, uri_pm_repo=remote_uri)
    synth.run_synthetic_pipeline(size=size)
    transition = Transition.from_env(agent_name, uri_pm_repo=remote_uri)
    transition.run_transition_pipeline()
    catalog = FeatureCatalog.from_env(agent_name, uri_pm_repo=remote_uri)
    catalog.run_feature_pipeline()
def test_from_component(self):
    """Configure an EventBook-backed default environment and build the 'members' component twice."""
    # EventBook
    os.environ['HADRON_DEFAULT_PATH'] = 'eb://grey_storage/'
    os.environ['HADRON_DEFAULT_MODULE'] = 'ds_engines.handlers.event_handlers'
    # NOTE(review): the SOURCE handler is set to 'EventPersistHandler' and the PERSIST
    # handler to 'EventSourceHandler' -- these look swapped; confirm against the
    # ds_engines event handler classes before relying on this wiring.
    os.environ['HADRON_DEFAULT_SOURCE_HANDLER'] = 'EventPersistHandler'
    os.environ['HADRON_DEFAULT_PERSIST_HANDLER'] = 'EventSourceHandler'
    # Portfolio
    builder = SyntheticBuilder.from_env('members', has_contract=False)
    builder.set_outcome(uri_file="synthetic_members")
    # re-load now that a contract exists for 'members'
    builder = SyntheticBuilder.from_env('members')
def test_model_group(self):
    """model_group over the seaborn titanic dataset: sum, set-with-choice and weighted-sum paths."""
    sb = SyntheticBuilder.from_memory()
    intent: SyntheticIntentModel = sb.tools
    sb.add_connector_uri(
        "titanic",
        uri="https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv",
    )
    # sum aggregation over two group keys
    grouped = intent.model_group("titanic", headers="fare",
                                 group_by=["survived", "sex"], aggregator="sum")
    self.assertEqual((4, 3), grouped.shape)
    # set aggregation bounded by list_choice
    grouped = intent.model_group(
        "titanic",
        headers=["class", "embark_town"],
        group_by=["survived", "sex"],
        aggregator="set",
        list_choice=2,
    )
    self.assertEqual((4, 4), grouped.shape)
    self.assertCountEqual(["class", "embark_town", "survived", "sex"],
                          grouped.columns.to_list())
    # weighted sum adds a 'weighting' column
    grouped = intent.model_group(
        "titanic",
        headers=["fare", "survived"],
        group_by="sex",
        aggregator="sum",
        include_weighting=True,
    )
    self.assertEqual((2, 4), grouped.shape)
    self.assertCountEqual(["survived", "sex", "fare", "weighting"],
                          grouped.columns.to_list())
def test_flatten_onehot(self):
    """model_multihot then a regex model_group flattens the category column into one-hot columns."""
    intent: SyntheticIntentModel = SyntheticBuilder.from_env(
        'tester', default_save_intent=False).tools
    rows = 10
    frame = pd.DataFrame()
    frame['profile'] = intent.get_number(from_value=1, to_value=9, at_most=3, size=rows)
    frame['cat'] = intent.get_category(selection=['A', 'B', 'C'], size=rows)
    frame['num'] = intent.get_number(from_value=1, to_value=9, size=rows)
    frame['value'] = intent.get_number(from_value=1, to_value=3, size=rows)
    encoded = intent.model_multihot(frame, header='cat')
    # regex=True lets 'cat' match the generated cat_A/cat_B/cat_C columns
    encoded = intent.model_group(encoded, group_by='profile',
                                 headers=['cat', 'value'], regex=True)
    self.assertCountEqual(['profile', 'value', 'cat_A', 'cat_B', 'cat_C'],
                          encoded.columns.to_list())
def test_correlate_categories_nulls(self):
    """correlate_categories maps tax ids to practice names while tolerating injected nulls."""
    tools = self.tools
    # original local was named 'builder' but actually held the intent tools
    gen = SyntheticBuilder.from_memory().tools
    df = pd.DataFrame()
    # quantity=0.9 leaves ~10% of the ids null
    df['pcp_tax_id'] = gen.get_category(
        selection=['993406113', '133757370', '260089066', '448512481', '546434723'],
        quantity=0.9, size=100, column_name='pcp_tax_id')
    correlations = ['993406113', '133757370', '260089066', '448512481', '546434723']
    actions = {0: 'LABCORP OF AMERICA',
               1: 'LPCH MEDICAL GROUP',
               2: 'ST JOSEPH HERITAGE MEDICAL',
               3: 'MONARCH HEALTHCARE',
               4: 'PRIVIA MEICAL GROUP'}
    df['pcp_name'] = tools.correlate_categories(df, header='pcp_tax_id',
                                                correlations=correlations,
                                                actions=actions,
                                                column_name='pcp_name')
    print(df.head())
def test_associate_analysis_complex(self):
    """model_analysis reproduces the pregnancies distribution of the diabetes dataset."""
    sb = SyntheticBuilder.from_memory()
    clinical_health = 'https://assets.datacamp.com/production/repositories/628/datasets/444cdbf175d5fbf564b564bd36ac21740627a834/diabetes.csv'
    sb.add_connector_uri('clinical_health', uri=clinical_health)
    discover: DataDiscovery = Transition.from_memory().discover
    columns_list = [
        discover.analysis2dict(header='age', dtype='int', granularity=10.0,
                               lower=21, upper=90),
        discover.analysis2dict(header='pregnancies'),
    ]
    df_clinical = sb.load_canonical('clinical_health')
    analysis_blob = discover.analyse_association(df_clinical, columns_list=columns_list)
    canonical = pd.DataFrame(index=range(1973))
    synthetic = sb.tools.model_analysis(canonical, analysis_blob=analysis_blob,
                                        column_name='clinical')
    self.assertEqual((1973, 2), synthetic.shape)
    # the synthetic mean must land inside the bootstrap CI of the source mean
    source_preg = Commons.list_standardize(Commons.list_formatter(df_clinical.pregnancies))
    low, high = discover.bootstrap_confidence_interval(pd.Series(source_preg), func=np.mean)
    synth_preg = Commons.list_standardize(Commons.list_formatter(synthetic.pregnancies))
    self.assertTrue(low <= np.mean(synth_preg) <= high)
def test_dict_generate(self):
    """_get_canonical with an '@generate' directive, with and without size and selection."""
    gen = SyntheticBuilder.from_env('generator', has_contract=False)
    intent: SyntheticIntentModel = gen.tools
    df = pd.DataFrame()
    df['gender'] = intent.get_category(selection=['M', 'F'], column_name='gender')
    df['age'] = intent.get_number(from_value=18, to_value=90, column_name='age')
    # bare generate: expected columns only
    result = intent._get_canonical(data={'method': '@generate', 'task_name': 'generator'})
    self.assertCountEqual(['age', 'gender'], result.columns.to_list())
    # explicit size is honoured
    result = intent._get_canonical(
        data={'method': '@generate', 'task_name': 'generator', 'size': 100})
    self.assertCountEqual(['age', 'gender'], result.columns.to_list())
    self.assertEqual(100, result.shape[0])
    # with a selection filter only males remain
    selection = [intent.select2dict(column='gender', condition="@=='M'")]
    result = intent._get_canonical(
        data={'method': '@generate', 'task_name': 'generator',
              'size': 100, 'selection': selection})
    self.assertGreater(result.shape[0], 0)
    self.assertEqual(0, (result[result['gender'] == 'F']).shape[0])
def test_correlate_mark_outliers(self):
    """correlate_mark_outliers flags outliers under quantile, empirical and probability methods."""
    intent: SyntheticIntentModel = SyntheticBuilder.from_memory().tools
    df = pd.DataFrame()
    df["number"] = intent.get_dist_normal(2, 1, size=1000, seed=99)
    # (method, measure) -> expected [inlier, outlier] counts on the seeded sample
    cases = [
        ('quantile', 1.5, [992, 8]),
        ('empirical', 3, [995, 5]),
        ('probability', 0.002, [996, 4]),
    ]
    for method, measure, expected in cases:
        df[method] = intent.correlate_mark_outliers(canonical=df, header="number",
                                                    measure=measure, method=method)
        self.assertEqual(expected, df[method].value_counts().values.tolist())
def test_dict_int(self):
    """An integer passed to _get_canonical becomes an empty frame with that many rows."""
    intent = SyntheticBuilder.from_memory().tools
    for rows in (0, 2):
        frame = intent._get_canonical(data=rows)
        self.assertEqual((rows, 0), frame.shape)
def test_dict_method_selection(self):
    """Action dicts for frame_selection, correlate_selection and get_selection resolve via _get_canonical."""
    sb = SyntheticBuilder.from_memory()
    intent: SyntheticIntentModel = sb.tools
    sb.add_connector_uri(
        'titanic',
        "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv")
    # frame selection
    action = intent.canonical2dict(method='frame_selection', canonical='titanic',
                                   headers=['survived', 'sex', 'fare'])
    outcome = intent._get_canonical(data=action)
    self.assertEqual((891, 3), outcome.shape)
    # correlate selection
    header_action = intent.action2dict(method='@header', header='sex')
    action = intent.canonical2dict(method='correlate_selection', canonical='titanic',
                                   selection=[], action=header_action)
    outcome = intent._get_canonical(data=action, header='default')
    self.assertEqual((891, 1), outcome.shape)
    # get selection, sized to the full titanic row count
    sample_size = sb.load_canonical('titanic').shape[0]
    action = intent.canonical2dict(method='get_selection', canonical='titanic',
                                   column_header='survived')
    outcome = intent._get_canonical(data=action, header='default', size=sample_size)
    self.assertEqual((891, 1), outcome.shape)
def test_discovery_associate(self):
    """analyse_association returns one entry per analysed column."""
    pad = SyntheticBuilder.scratch_pad()
    frame = pd.DataFrame()
    frame['cat'] = pad.get_category(list('AB'), relative_freq=[1, 3], size=1000)
    frame['gender'] = pad.get_category(list('MF'), relative_freq=[1, 3], size=1000)
    analysis = Discover.analyse_association(frame, columns_list=['cat', 'gender'])
    self.assertEqual(['cat', 'gender'], list(analysis))
def test_discovery_analytics_class(self):
    """DataAnalytics exposes the category analysis blob through attribute access."""
    pad = SyntheticBuilder.scratch_pad()
    # include NaN so nulls are part of the analysed sample
    sample = pad.get_category(list('ABCDE') + [np.nan],
                              relative_freq=[1, 3, 2, 7, 4], size=694)
    analytics = DataAnalytics(analysis=Discover.analyse_category(sample))
    self.assertEqual(analytics.intent.selection,
                     analytics.sample_map.index.to_list())
    self.assertEqual(analytics.patterns.sample_distribution,
                     analytics.sample_map.to_list())
def setUp(self):
    """Reset HADRON env vars, create the work dirs and seed the task1->task2->task3 persist chain."""
    # clean out any old environments; snapshot the keys first because deleting
    # while iterating os.environ.keys() raises RuntimeError in Python 3
    for key in list(os.environ.keys()):
        if key.startswith('HADRON'):
            del os.environ[key]
    os.environ['HADRON_PM_PATH'] = os.path.join('work', 'config')
    os.environ['HADRON_DEFAULT_PATH'] = os.path.join('work', 'data')
    try:
        # exist_ok=True: the bare except previously turned a leftover directory
        # from an earlier run (FileExistsError) into a spurious IOError
        os.makedirs(os.environ['HADRON_PM_PATH'], exist_ok=True)
        os.makedirs(os.environ['HADRON_DEFAULT_PATH'], exist_ok=True)
    except OSError as err:
        raise IOError('Unable to create directories') from err
    PropertyManager._remove_all()
    # task1 (builder) feeds task2 (transition) feeds task3 (wrangle)
    builder = SyntheticBuilder.from_env('task1', has_contract=False)
    builder.set_persist()
    builder.pm_persist()
    tr = Transition.from_env('task2', has_contract=False)
    tr.set_source_uri(builder.get_persist_contract().raw_uri)
    tr.set_persist()
    tr.pm_persist()
    wr = Wrangle.from_env('task3', has_contract=False)
    wr.set_source_uri(tr.get_persist_contract().raw_uri)
    wr.set_persist()
    wr.pm_persist()
def test_complex_sample_modelling(self):
    """Compose sample-map, group and merge actions into one modelled frame of zipcodes + tax ids."""
    intent = SyntheticBuilder.from_memory().tools
    state_code = ["CA", "NY", "LA", "NJ", "VA", "CO", "NV",
                  "GA", "IN", "OH", "KY", "ME", "MO", "WI"]
    frame = pd.DataFrame(index=range(100))
    frame = intent.model_sample_map(canonical=frame, sample_map="us_zipcode",
                                    state_filter=state_code, column_name="zipcodes")
    # nested action: practitioner sample reduced to a tax-id list per city
    sample_data = intent.action2dict(method="model_sample_map",
                                     canonical=intent.action2dict(method="@empty"),
                                     sample_map="us_healthcare_practitioner",
                                     headers=["city", "pcp_tax_id"],
                                     shuffle=False)
    merge_data = intent.action2dict(method="model_group", canonical=sample_data,
                                    headers="pcp_tax_id", group_by="city",
                                    aggregator="list")
    frame = intent.model_merge(frame, merge_data, how="left",
                               left_on="city", right_on="city",
                               column_name="pcp_tax_id")
    self.assertCountEqual(
        ["city", "state_abbr", "state", "county_fips",
         "county", "zipcode", "pcp_tax_id"],
        frame.columns.to_list())
def test_runs(self):
    """Basic smoke test: the builder exposes a SyntheticIntentModel."""
    im = SyntheticBuilder.from_env('tester', default_save=False,
                                   default_save_intent=False,
                                   reset_templates=False,
                                   has_contract=False).intent_model
    # was assertTrue(SyntheticIntentModel, type(im)), which always passes because
    # assertTrue's second argument is only the failure message
    self.assertIsInstance(im, SyntheticIntentModel)
def test_str(self):
    """A string passed to _get_canonical is resolved as a connector name and loaded."""
    sb = SyntheticBuilder.from_memory()
    frame = pd.DataFrame(data={'A': list('12345')})
    sb.add_connector_persist(connector_name='test', uri_file='test.pickle')
    sb.save_canonical(connector_name='test', canonical=frame)
    loaded = sb.tools._get_canonical(data='test')
    self.assertDictEqual(frame.to_dict(), loaded.to_dict())
def test_model_modify_except(self):
    """model_modifier raises TypeError when a targeted column is non-numeric."""
    intent: SyntheticIntentModel = SyntheticBuilder.from_memory().tools
    frame = pd.DataFrame(data={"A": [1, 2, 3, 4, 5], "B": [1, 2, 'A', 4, 5]})
    other = {"headers": ["A", "B"], "target": [2, 0.2]}
    with self.assertRaises(TypeError) as context:
        intent.model_modifier(frame, other, aggregator='sum')
    self.assertTrue(
        "The column B is not of type numeric" in str(context.exception))
def setUp(self):
    """Reset HADRON env, point contracts/data at local paths and generate the 'builder' dataset."""
    # clean out any old environments; snapshot the keys first because deleting
    # while iterating os.environ.keys() raises RuntimeError in Python 3
    for key in list(os.environ.keys()):
        if key.startswith('HADRON'):
            del os.environ[key]
    # Local Domain Contract
    os.environ['HADRON_PM_PATH'] = os.path.join('working', 'contracts')
    os.environ['HADRON_PM_TYPE'] = 'json'
    # Local Connectivity
    os.environ['HADRON_DEFAULT_PATH'] = Path('working/data').as_posix()
    # Specialist Component -- exist_ok replaces the two bare try/except blocks
    # that silently swallowed every error, not just FileExistsError
    os.makedirs(os.environ['HADRON_PM_PATH'], exist_ok=True)
    os.makedirs(os.environ['HADRON_DEFAULT_PATH'], exist_ok=True)
    PropertyManager._remove_all()
    builder = SyntheticBuilder.from_env('builder', has_contract=False)
    builder.set_persist()
    sample_size = 10
    df = pd.DataFrame()
    df['cat'] = builder.tools.get_category(selection=['a', 'b', 'c', 'd'],
                                           size=sample_size, column_name='cat')
    df['norm'] = builder.tools.get_dist_normal(mean=4, std=1, size=sample_size,
                                               column_name='norm')
    df['pois'] = builder.tools.get_dist_poisson(interval=7, size=sample_size,
                                                column_name='pois')
    df['norm_std'] = builder.tools.correlate_numbers(
        df, header='norm', standardize=True, column_name='norm_std')
    # increasing jitter levels over the same poisson column
    for idx, jitter in enumerate([0.1, 0.8, 1.5, 2, 3], start=1):
        column = 'jitter{}'.format(idx)
        df[column] = builder.tools.correlate_numbers(df, header='pois',
                                                     jitter=jitter,
                                                     column_name=column)
    builder.run_component_pipeline()
def test_analyse_date(self):
    """Discover.analyse_date on string-formatted and timestamp dates; pins the exact analysis blobs."""
    tools = SyntheticBuilder.scratch_pad()
    # the same seeded date range, once rendered as '%d-%m-%Y' strings ...
    str_dates = tools.get_datetime('12/01/2016', '12/01/2018', date_format='%d-%m-%Y', size=10, seed=31)
    # ... and once as native timestamps
    ts_dates = tools.get_datetime('12/01/2016', '12/01/2018', size=10, seed=31)
    result = Discover.analyse_date(str_dates, granularity=3, date_format='%Y-%m-%d')
    # expected analysis blob for the string dates (exact values pinned by seed=31)
    control = {'intent': {'date_format': '%Y-%m-%d', 'day_first': False, 'dtype': 'date',
                          'granularity': 3, 'lowest': '2017-12-02',
                          'selection': [('2017-12-02', '2018-03-13', 'both'),
                                        ('2018-03-13', '2018-06-22', 'right'),
                                        ('2018-06-22', '2018-10-01', 'right')],
                          'highest': '2018-10-01', 'freq_precision': 2, 'year_first': False},
               'patterns': {'sample_distribution': [1, 0, 3], 'relative_freq': [25.0, 0.0, 75.0]},
               'stats': {'bootstrap_bci': (17572.5, 17797.75), 'emp_outliers': [0, 0],
                         'excluded_percent': 0.0, 'irq_outliers': [1, 0], 'kurtosis': 3.86,
                         'mean': '2018-07-04', 'nulls_percent': 60.0, 'sample': 4, 'skew': -1.96}}
    self.assertEqual(control, result)
    result = Discover.analyse_date(ts_dates, granularity=3)
    # expected blob for the timestamp dates; boundaries are tz-aware Timestamps
    control = {'intent': {'day_first': False, 'dtype': 'date', 'granularity': 3,
                          'lowest': pd.Timestamp('2017-02-12 19:02:11.531780+0000', tz='UTC'),
                          'selection': [(pd.Timestamp('2017-02-12 19:02:11.531780+0000', tz='UTC'),
                                         pd.Timestamp('2017-09-08 17:43:30.973860+0000', tz='UTC'), 'both'),
                                        (pd.Timestamp('2017-09-08 17:43:30.973860+0000', tz='UTC'),
                                         pd.Timestamp('2018-04-04 16:24:50.415940+0000', tz='UTC'), 'right'),
                                        (pd.Timestamp('2018-04-04 16:24:50.415940+0000', tz='UTC'),
                                         pd.Timestamp('2018-10-29 15:06:09.858020+0000', tz='UTC'), 'right')],
                          'highest': pd.Timestamp('2018-10-29 15:06:09.858020+0000', tz='UTC'),
                          'freq_precision': 2, 'year_first': False},
               'patterns': {'sample_distribution': [2, 3, 5], 'relative_freq': [20.0, 30.0, 50.0]},
               # NOTE(review): 'excluded_percent' appears twice in this literal; the
               # second occurrence silently wins -- harmless but worth cleaning up
               'stats': {'bootstrap_bci': (17493.5054775573, 17724.4628926684),
                         'emp_outliers': [0, 0], 'excluded_percent': 0.0, 'irq_outliers': [1, 0],
                         'kurtosis': 0.64,
                         'mean': pd.Timestamp('2018-03-22 17:31:12+0000', tz='UTC'),
                         'nulls_percent': 0.0, 'excluded_percent': 0.0, 'sample': 10, 'skew': -0.94}}
    self.assertEqual(control, result)
def test_correlate_number_encode(self):
    """Smoke: correlate_numbers applies a log transform to a poisson column."""
    intent: SyntheticIntentModel = SyntheticBuilder.from_memory().tools
    frame = pd.DataFrame()
    frame["number"] = intent.get_dist_poisson(interval=2, size=10)
    transformed = intent.correlate_numbers(canonical=frame, header="number",
                                           transform="log")
    print(transformed)
def test_model_dict(self):
    """_model_dict_column expands a dict-bearing column into sibling columns."""
    intent: SyntheticIntentModel = SyntheticBuilder.from_memory().tools
    # dicts (and a None) carried directly in the column
    dict_sample = [
        {"task": "members_sim", "source": 100000},
        {"task": "pcp_sim", "source": 0},
        {"task": "members_gen", "source": "members_sim", "persist": True},
        None,
    ]
    frame = pd.DataFrame(data={"A": [5, 2, 3, 4], "X": dict_sample, "Y": list("VWXY")})
    frame = intent._model_dict_column(frame, header="X")
    self.assertCountEqual(["A", "Y", "task", "source", "persist"],
                          frame.columns.to_list())
    # the same payload serialised as strings, parsed via convert_str
    str_sample = [
        "{'task': 'members_sim', 'source': 100000}",
        "{'task': 'pcp_sim', 'source': 0}",
        "{'task': 'members_gen', 'source': 'members_sim', 'persist': True}",
        None,
    ]
    frame = pd.DataFrame(data={"A": [5, 2, None, 4], "X": str_sample, "Y": list("VWXY")})
    frame = intent._model_dict_column(frame, header="X", convert_str=True)
    self.assertCountEqual(["A", "Y", "task", "source", "persist"],
                          frame.columns.to_list())
    # nulls in the dict column can be replaced with a default value
    frame = pd.DataFrame(data={"A": [5, 2, None, 4], "X": str_sample, "Y": list("VWXY")})
    frame = intent._model_dict_column(frame, header="X", convert_str=True,
                                      replace_null="default")
    self.assertEqual(["default", "default", "default"],
                     frame.loc[3, ["task", "source", "persist"]].to_list())
def test_set_report_persist(self):
    """setup_bootstrap names report connector files after the project name."""
    sb = SyntheticBuilder.from_env('tester', default_save=False, has_contract=False)
    sb.setup_bootstrap(domain='domain', project_name='project_name', path=None)
    connectors = sb.report_connectors(stylise=False)
    _, filename = os.path.split(connectors.uri.iloc[-1])
    self.assertTrue(filename.startswith('project_name'))
def test_json(self):
    """PandasPersistHandler round-trips a noise frame through a json file."""
    noise = SyntheticBuilder.scratch_pad().model_noise(
        pd.DataFrame(index=range(1000)), num_columns=10)
    handler = PandasPersistHandler(
        ConnectorContract('work/data/2_transition/handler_test.json', '', '',
                          file_type='json'))
    handler.persist_canonical(noise)
    self.assertTrue(handler.exists())
    loaded = pd.DataFrame(data=handler.load_canonical())
    self.assertEqual(noise.shape, loaded.shape)
    self.assertCountEqual(noise.columns, loaded.columns)
    # removing the canonical deletes the backing file
    handler.remove_canonical()
    self.assertFalse(handler.exists())
def test_dict_method_model(self):
    """A model_sample_map action dict resolves through _get_canonical."""
    intent: SyntheticIntentModel = SyntheticBuilder.from_env(
        'generator', has_contract=False).tools
    action = intent.canonical2dict(method='model_sample_map',
                                   canonical=intent.action2dict(method='@empty', size=100),
                                   sample_map='us_persona',
                                   female_bias=0.3)
    personas = intent._get_canonical(data=action)
    self.assertEqual((100, 5), personas.shape)
    # female_bias of 0.3 over 100 rows yields exactly 30 'F'
    self.assertEqual(30, personas['gender'].value_counts().loc['F'])
def test_list(self):
    """Lists and Series passed to _get_canonical become single-column frames."""
    intent = SyntheticBuilder.from_memory().tools
    values = list('12345')
    # without a header the column defaults to 'default'
    frame = intent._get_canonical(data=values)
    self.assertEqual(values, frame['default'].to_list())
    frame = intent._get_canonical(data=values, header='sample')
    self.assertEqual(values, frame['sample'].to_list())
    series = pd.Series(values)
    frame = intent._get_canonical(data=series, header='sample')
    self.assertEqual(series.to_list(), frame['sample'].to_list())
def setUp(self):
    """Point PM/data paths at the local 'work' tree, reset property state and build scratch pads."""
    os.environ['HADRON_PM_PATH'] = os.path.join('work', 'config')
    os.environ['HADRON_DEFAULT_PATH'] = os.path.join('work', 'data')
    # exist_ok replaces a bare 'except: pass' that swallowed every error
    # (including KeyboardInterrupt), not just the expected FileExistsError
    os.makedirs(os.environ['HADRON_PM_PATH'], exist_ok=True)
    os.makedirs(os.environ['HADRON_DEFAULT_PATH'], exist_ok=True)
    PropertyManager._remove_all()
    self.tools = SyntheticBuilder.scratch_pad()
    self.clean = Transition.scratch_pad()
def test_filter_correlated(self):
    """Smoke: DataDiscovery.filter_correlated on a frame with duplicated (perfectly correlated) columns."""
    tools = SyntheticBuilder.scratch_pad()
    df = pd.DataFrame()
    df['col1'] = [1, 2, 3, 4, 5, 6, 7]
    df['col2'] = [1, 2, 3, 4, 5, 6, 7]
    df['col3'] = [2, 2, 3, 2, 2, 2, 3]
    # the original assigned col4 twice; the first value ([2, 2, 3, 2, 2, 2, 3])
    # was dead code immediately overwritten, so only the final value is kept
    df['col4'] = [7, 2, 4, 2, 1, 6, 4]
    df['col5'] = [2, 2, 3, 2, 2, 2, 3]
    df['target'] = [1, 0, 1, 1, 0, 0, 1]
    result = DataDiscovery.filter_correlated(df, target='target')
    print(result)
def test_dict_to_dataframe(self):
    """Dict input converts to a frame; ragged column lengths raise ValueError."""
    intent: SyntheticIntentModel = SyntheticBuilder.from_memory().tools
    payload = {'A': [1, 2, 3, 4, 5], 'B': list('ABCDE'), 'C': list('ABCDE')}
    frame = intent._get_canonical(data=payload)
    self.assertEqual((5, 3), frame.shape)
    self.assertEqual(list('ABC'), frame.columns.to_list())
    # unequal column lengths cannot be framed
    payload = {'A': [4, 5], 'B': list('ABCDE'), 'C': list('ABCD')}
    with self.assertRaises(ValueError) as context:
        intent._get_canonical(data=payload)
    self.assertTrue("The canonical data passed was of type 'dict'" in str(
        context.exception))
def test_filter_correlate(self):
    """auto_brute_force_correlated drops correlated columns, more aggressively with a threshold."""
    sb = SyntheticBuilder.from_env('builder')
    tr = Transition.from_env("tr1", has_contract=False)
    cleaners: TransitionIntentModel = tr.cleaners
    tr.set_source_uri(sb.get_persist_contract().raw_uri)
    tr.set_persist()
    frame = tr.load_source_canonical()
    self.assertEqual((1000, 9), frame.shape)
    # default threshold removes two correlated columns
    frame = cleaners.auto_brute_force_correlated(frame)
    self.assertEqual((1000, 7), frame.shape)
    # a looser 0.8 threshold removes four more
    frame = cleaners.auto_brute_force_correlated(frame, threshold=0.8)
    self.assertEqual((1000, 3), frame.shape)
def test_persist_backup(self):
    """Persist/load/remove round-trip plus backup_canonical to a parquet uri."""
    handler = PandasPersistHandler(
        ConnectorContract('work/data/2_transition/example01.json', '', ''))
    noise = SyntheticBuilder.scratch_pad().model_noise(
        pd.DataFrame(index=range(1000)), num_columns=10)
    self.assertTrue(handler.persist_canonical(noise))
    loaded = pd.DataFrame(data=handler.load_canonical())
    self.assertEqual((1000, 10), loaded.shape)
    handler.remove_canonical()
    self.assertFalse(handler.exists())
    # Backup: the raw uri carries a query string, so only the parsed address exists on disk
    uri = 'work/data/2_transition/example01.pq.bak?file_type=parquet'
    self.assertFalse(os.path.exists(uri))
    handler.backup_canonical(canonical=loaded, uri=uri)
    self.assertTrue(os.path.exists(ConnectorContract.parse_address(uri)))