def test_associate_analysis_complex(self):
    """Model an age -> pregnancies association and check the synthetic mean.

    An association analysis is taken from the remote diabetes dataset,
    used to model 1973 synthetic rows, and the mean of the processed
    synthetic ``pregnancies`` values must fall inside the bootstrap
    confidence interval computed from the processed source values.
    """
    builder = SyntheticBuilder.from_memory()
    diabetes_uri = 'https://assets.datacamp.com/production/repositories/628/datasets/444cdbf175d5fbf564b564bd36ac21740627a834/diabetes.csv'
    builder.add_connector_uri('clinical_health', uri=diabetes_uri)

    discover: DataDiscovery = Transition.from_memory().discover
    age_spec = discover.analysis2dict(
        header='age',
        dtype='int',
        granularity=10.0,
        lower=21,
        upper=90,
    )
    pregnancies_spec = discover.analysis2dict(header='pregnancies')

    df_source = builder.load_canonical('clinical_health')
    analysis_blob = discover.analyse_association(
        df_source,
        columns_list=[age_spec, pregnancies_spec],
    )

    # Model the synthetic frame from an empty canonical of fixed size.
    empty_canonical = builder.tools.canonical2dict(method='@empty', size=1973)
    df_synthetic = builder.tools.model_analysis(
        empty_canonical,
        analytics_model=analysis_blob,
        column_name='clinical',
    )
    self.assertEqual((1973, 2), df_synthetic.shape)

    # Confidence interval of the mean, taken from the source column.
    source_values = SyntheticCommons.list_formatter(df_source['pregnancies'])
    source_values = SyntheticCommons.list_standardize(source_values)
    low, high = discover.bootstrap_confidence_interval(
        pd.Series(source_values),
        func=np.mean,
    )

    # The synthetic column goes through the same two helper calls.
    synthetic_values = SyntheticCommons.list_formatter(df_synthetic['pregnancies'])
    synthetic_values = SyntheticCommons.list_standardize(synthetic_values)
    synthetic_mean = np.mean(synthetic_values)
    self.assertTrue(low <= synthetic_mean <= high)
def test_model_sample_map(self):
    """Check result shapes of ``model_sample_map`` for the practitioner map.

    Covers an empty canonical, a 50-row canonical, and a 50-row canonical
    restricted to the single ``pcp_tax_id`` header.
    """
    tools = SyntheticBuilder.from_memory(default_save_intent=False).tools
    sample_map = 'us_healthcare_practitioner'

    full_sample = tools.model_sample_map(pd.DataFrame(), sample_map=sample_map)
    self.assertEqual((70655, 10), full_sample.shape)

    # This call is only exercised; its result is not inspected.
    tools.model_sample_map(pd.DataFrame(index=range(50)), sample_map=sample_map)

    single_header = tools.model_sample_map(
        pd.DataFrame(index=range(50)),
        sample_map=sample_map,
        headers=['pcp_tax_id'],
    )
    self.assertEqual((50, 1), single_header.shape)
def test_str(self):
    """A connector name passed as ``data`` yields the frame saved under it."""
    builder = SyntheticBuilder.from_memory()
    intent_tools = builder.tools
    expected = pd.DataFrame(data={'A': list('12345')})

    # Persist the frame under the 'test' connector, then fetch it by name.
    builder.add_connector_persist(connector_name='test', uri_file='test.pickle')
    builder.save_canonical(connector_name='test', canonical=expected)
    loaded = intent_tools._get_canonical(data='test')

    expected_dict = expected.to_dict()
    loaded_dict = loaded.to_dict()
    self.assertDictEqual(expected_dict, loaded_dict)
def test_model_us_person(self):
    """Check the columns and row counts produced by the 'us_persona' map."""
    tools = SyntheticBuilder.from_memory(default_save_intent=False).tools
    expected_columns = ['first_name', 'middle_name', 'gender', 'family_name', 'email']

    personas = tools.model_sample_map(
        canonical=pd.DataFrame(index=range(300)),
        sample_map='us_persona',
    )
    self.assertCountEqual(expected_columns, personas.columns.to_list())
    self.assertEqual(300, len(personas.index))

    biased = tools.model_sample_map(
        canonical=pd.DataFrame(index=range(1000)),
        sample_map='us_persona',
        female_bias=0.3,
    )
    self.assertEqual((1000, 5), biased.shape)

    # The 'F' count is only printed for inspection, not asserted.
    gender_counts = biased['gender'].value_counts()
    print(gender_counts.loc['F'])
def test_list(self):
    """List and Series inputs to ``_get_canonical`` become a single column.

    Without ``header`` the column is read back as 'default'; with
    ``header='sample'`` it is read back as 'sample'.
    """
    tools = SyntheticBuilder.from_memory().tools
    values = list('12345')

    # Plain list, no header given.
    canonical = tools._get_canonical(data=values)
    self.assertEqual(values, canonical['default'].to_list())

    # Plain list with an explicit header.
    canonical = tools._get_canonical(data=values, header='sample')
    self.assertEqual(values, canonical['sample'].to_list())

    # pandas Series with an explicit header.
    series = pd.Series(values)
    canonical = tools._get_canonical(data=series, header='sample')
    self.assertEqual(series.to_list(), canonical['sample'].to_list())
def test_complex_sample_modelling(self):
    """Merge grouped practitioner tax ids onto a state-filtered zipcode sample.

    The merged frame must carry the zipcode sample columns plus
    ``pcp_tax_id``.
    """
    tools = SyntheticBuilder.from_memory().tools
    state_code = [
        'CA', 'NY', 'LA', 'NJ', 'VA', 'CO', 'NV',
        'GA', 'IN', 'OH', 'KY', 'ME', 'MO', 'WI',
    ]

    zipcodes = tools.model_sample_map(
        canonical=dict(method='@empty', size=100),
        sample_map='us_zipcode',
        state_filter=state_code,
        column_name='zipcodes',
    )

    # Describe, as nested action dictionaries, a practitioner sample
    # grouped by city with the tax ids aggregated as a list.
    empty_action = tools.action2dict(method='@empty')
    practitioner_action = tools.action2dict(
        method='model_sample_map',
        canonical=empty_action,
        sample_map='us_healthcare_practitioner',
        headers=['city', 'pcp_tax_id'],
        shuffle=False,
    )
    grouped_action = tools.action2dict(
        method='model_group',
        canonical=practitioner_action,
        headers='pcp_tax_id',
        group_by='city',
        aggregator='list',
    )

    merged = tools.model_merge(
        zipcodes,
        grouped_action,
        how='left',
        left_on='city',
        right_on='city',
        column_name='pcp_tax_id',
    )

    expected_columns = [
        'city', 'state_abbr', 'state', 'county_fips',
        'county', 'zipcode', 'pcp_tax_id',
    ]
    self.assertCountEqual(expected_columns, merged.columns.to_list())
def test_model_group(self):
    """Check ``model_group`` shapes and columns on the titanic dataset."""
    builder = SyntheticBuilder.from_memory()
    tools: SyntheticIntentModel = builder.tools
    builder.add_connector_uri(
        'titanic',
        uri="https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv",
    )

    # One header summed over two grouping columns.
    fare_sum = tools.model_group(
        'titanic',
        headers='fare',
        group_by=['survived', 'sex'],
        aggregator='sum',
    )
    self.assertEqual((4, 3), fare_sum.shape)

    # Two headers with the 'set' aggregator and list_choice=2.
    as_sets = tools.model_group(
        'titanic',
        headers=['class', 'embark_town'],
        group_by=['survived', 'sex'],
        aggregator='set',
        list_choice=2,
    )
    self.assertEqual((4, 4), as_sets.shape)
    self.assertCountEqual(
        ['class', 'embark_town', 'survived', 'sex'],
        as_sets.columns.to_list(),
    )

    # Single grouping column with include_weighting enabled.
    weighted = tools.model_group(
        'titanic',
        headers=['fare', 'survived'],
        group_by='sex',
        aggregator='sum',
        include_weighting=True,
    )
    self.assertEqual((2, 4), weighted.shape)
    self.assertCountEqual(
        ['survived', 'sex', 'fare', 'weighting'],
        weighted.columns.to_list(),
    )
def test_dict_generate_remote(self):
    """Concatenate a '@generate' canonical from a remote repo onto an empty one.

    There is no assertion here; the resulting columns are only printed.
    """
    builder = SyntheticBuilder.from_memory()
    tools: SyntheticIntentModel = builder.tools

    base_canonical = tools.canonical2dict(method='@empty', size=1000)
    members_canonical = tools.canonical2dict(
        method='@generate',
        task_name='members',
        uri_pm_repo='https://raw.githubusercontent.com/project-hadron/hadron-asset-bank/master/contracts/healthcare/factory/members/',
    )

    wanted_headers = ['member_id', 'state', 'prev_flu_shot', 'age', 'channel_pref']
    combined = builder.intent_model.model_concat(
        canonical=base_canonical,
        other=members_canonical,
        as_rows=False,
        headers=wanted_headers,
        column_name='member_reference',
    )
    print(combined.columns)
def test_associate_analysis_dominance(self):
    """Compare the top-value share of modelled data with that of the sample.

    The analysis is built with ``exclude_dominant=True`` and modelled with
    ``apply_bias=True``; the share of the most frequent value in the 1000
    modelled rows must match the sample's share to two decimal places.
    """
    sample = pd.DataFrame(
        {'values': [0, 1, 0, 0, 7, 0, 0, 4, 2, 0, 0, 5, 8, 7, 0, 0]}
    )

    discover: DataDiscovery = Transition.from_memory().discover
    values_spec = discover.analysis2dict(
        header='values',
        dtype='int',
        precision=0,
        exclude_dominant=True,
    )
    analysis_blob = discover.analyse_association(sample, columns_list=[values_spec])

    builder = SyntheticBuilder.from_memory()
    empty_canonical = builder.tools.canonical2dict(method='@empty', size=1000)
    modelled = builder.tools.model_analysis(
        empty_canonical,
        analytics_model=analysis_blob,
        apply_bias=True,
    )

    # value_counts() sorts by frequency, so iloc[0] is the top value's count.
    modelled_share = modelled['values'].value_counts().iloc[0] / modelled.shape[0]
    sample_share = sample['values'].value_counts().iloc[0] / sample.shape[0]
    self.assertAlmostEqual(modelled_share, sample_share, places=2)
def test_dataframe(self):
    """A DataFrame passed as ``data`` comes back with identical contents."""
    intent_tools = SyntheticBuilder.from_memory().tools
    expected = pd.DataFrame(data={'A': list('12345')})

    returned = intent_tools._get_canonical(data=expected)

    expected_dict = expected.to_dict()
    returned_dict = returned.to_dict()
    self.assertDictEqual(expected_dict, returned_dict)