示例#1
0
 def test_associate_analysis_complex(self):
     """Model synthetic rows from an association analysis of two columns.

     Analyses 'age' and 'pregnancies' from the clinical dataset, builds
     1973 synthetic rows from that analysis and checks that the mean of the
     generated 'pregnancies' values falls inside the bootstrap confidence
     interval computed from the source 'pregnancies' values.
     """
     builder = SyntheticBuilder.from_memory()
     source_uri = 'https://assets.datacamp.com/production/repositories/628/datasets/444cdbf175d5fbf564b564bd36ac21740627a834/diabetes.csv'
     builder.add_connector_uri('clinical_health', uri=source_uri)
     discover: DataDiscovery = Transition.from_memory().discover

     # Describe how each column should be analysed.
     age_spec = discover.analysis2dict(
         header='age', dtype='int', granularity=10.0, lower=21, upper=90)
     pregnancies_spec = discover.analysis2dict(header='pregnancies')

     df_source = builder.load_canonical('clinical_health')
     blob = discover.analyse_association(
         df_source, columns_list=[age_spec, pregnancies_spec])

     # Generate the synthetic frame from the analysis result.
     empty_canonical = builder.tools.canonical2dict(method='@empty', size=1973)
     df_generated = builder.tools.model_analysis(
         empty_canonical, analytics_model=blob, column_name='clinical')
     self.assertEqual((1973, 2), df_generated.shape)

     # Confidence interval of the mean taken from the source column.
     source_values = SyntheticCommons.list_standardize(
         SyntheticCommons.list_formatter(df_source.pregnancies))
     low, high = discover.bootstrap_confidence_interval(
         pd.Series(source_values), func=np.mean)

     # The generated column gets the same treatment before comparison.
     generated_values = SyntheticCommons.list_standardize(
         SyntheticCommons.list_formatter(df_generated.pregnancies))
     generated_mean = np.mean(generated_values)
     self.assertTrue(low <= generated_mean <= high)
 def test_model_sample_map(self):
     """Check the shapes returned by model_sample_map for one sample map.

     Covers three canonicals for 'us_healthcare_practitioner': an empty
     frame, a 50-row frame, and a 50-row frame restricted to one header.
     """
     builder = SyntheticBuilder.from_memory(default_save_intent=False)
     sample_map = 'us_healthcare_practitioner'

     result = builder.tools.model_sample_map(pd.DataFrame(), sample_map=sample_map)
     self.assertEqual((70655, 10), result.shape)

     result = builder.tools.model_sample_map(pd.DataFrame(index=range(50)), sample_map=sample_map)
     # Previously this result was overwritten without being checked. The
     # row count is expected to follow the 50-row canonical, as it does in
     # the headers-restricted call below.
     self.assertEqual(50, result.shape[0])

     result = builder.tools.model_sample_map(pd.DataFrame(index=range(50)), sample_map=sample_map,
                                             headers=['pcp_tax_id'])
     self.assertEqual((50, 1), result.shape)
 def test_str(self):
     """A connector name given as a string resolves to the saved canonical."""
     builder = SyntheticBuilder.from_memory()
     intent_tools = builder.tools
     expected = pd.DataFrame(data={'A': list('12345')})

     # Persist the frame under the 'test' connector, then read it back by name.
     builder.add_connector_persist(connector_name='test', uri_file='test.pickle')
     builder.save_canonical(connector_name='test', canonical=expected)
     loaded = intent_tools._get_canonical(data='test')

     self.assertDictEqual(expected.to_dict(), loaded.to_dict())
 def test_model_us_person(self):
     """Check columns and row counts produced by the 'us_persona' sample map."""
     builder = SyntheticBuilder.from_memory(default_save_intent=False)

     # 300-row canonical with default options.
     small = pd.DataFrame(index=range(300))
     result = builder.tools.model_sample_map(canonical=small, sample_map='us_persona')
     expected_columns = ['first_name', 'middle_name', 'gender', 'family_name', 'email']
     self.assertCountEqual(expected_columns, result.columns.to_list())
     self.assertEqual(300, result.shape[0])

     # 1000-row canonical with a female_bias of 0.3.
     large = pd.DataFrame(index=range(1000))
     biased = builder.tools.model_sample_map(canonical=large, sample_map='us_persona', female_bias=0.3)
     self.assertEqual((1000, 5), biased.shape)

     # The 'F' count is only printed, not asserted.
     female_count = biased['gender'].value_counts().loc['F']
     print(female_count)
 def test_list(self):
     """Lists and Series passed to _get_canonical come back as a single column."""
     builder = SyntheticBuilder.from_memory()
     intent_tools = builder.tools
     values = list('12345')

     # Without a header the column is named 'default'.
     frame = intent_tools._get_canonical(data=values)
     self.assertEqual(values, frame['default'].to_list())

     # An explicit header names the column.
     frame = intent_tools._get_canonical(data=values, header='sample')
     self.assertEqual(values, frame['sample'].to_list())

     # A pandas Series is handled the same way as a list.
     series = pd.Series(values)
     frame = intent_tools._get_canonical(data=series, header='sample')
     self.assertEqual(series.to_list(), frame['sample'].to_list())
 def test_complex_sample_modelling(self):
     """Merge grouped practitioner tax ids onto a zipcode sample by city."""
     tools = SyntheticBuilder.from_memory().tools
     states = ['CA', 'NY', 'LA', 'NJ', 'VA', 'CO', 'NV', 'GA', 'IN', 'OH', 'KY', 'ME', 'MO', 'WI']

     # Left side: 100 zipcode rows filtered to the listed states.
     zipcodes = tools.model_sample_map(canonical={'method': '@empty', 'size':100}, sample_map='us_zipcode',
                                       state_filter=states, column_name='zipcodes')

     # Right side, described as nested actions: practitioner sample -> group by city.
     practitioner_action = tools.action2dict(
         method='model_sample_map', canonical=tools.action2dict(method='@empty'),
         sample_map='us_healthcare_practitioner', headers=['city', 'pcp_tax_id'],
         shuffle=False)
     grouped_action = tools.action2dict(
         method='model_group', canonical=practitioner_action, headers='pcp_tax_id',
         group_by='city', aggregator='list')

     merged = tools.model_merge(zipcodes, grouped_action, how='left', left_on='city', right_on='city',
                                column_name='pcp_tax_id')

     expected_columns = ['city', 'state_abbr', 'state', 'county_fips', 'county', 'zipcode', 'pcp_tax_id']
     self.assertCountEqual(expected_columns, merged.columns.to_list())
 def test_model_group(self):
     """Check shapes and columns from model_group on the titanic dataset."""
     builder = SyntheticBuilder.from_memory()
     tools: SyntheticIntentModel = builder.tools
     builder.add_connector_uri('titanic', uri="https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv")

     # Single header summed over two grouping columns.
     summed = tools.model_group('titanic', headers='fare', group_by=['survived', 'sex'], aggregator='sum')
     self.assertEqual((4, 3), summed.shape)

     # Two headers aggregated as sets with list_choice=2.
     as_sets = tools.model_group('titanic', headers=['class', 'embark_town'], group_by=['survived', 'sex'],
                                 aggregator='set', list_choice=2)
     self.assertEqual((4, 4), as_sets.shape)
     self.assertCountEqual(['class', 'embark_town', 'survived', 'sex'], as_sets.columns.to_list())

     # include_weighting=True is expected to add a 'weighting' column.
     weighted = tools.model_group('titanic', headers=['fare', 'survived'], group_by='sex', aggregator='sum',
                                  include_weighting=True)
     self.assertEqual((2, 4), weighted.shape)
     self.assertCountEqual(['survived', 'sex', 'fare', 'weighting'], weighted.columns.to_list())
 def test_dict_generate_remote(self):
     """Concatenate columns from a remotely defined '@generate' canonical.

     No assertions are made; the resulting columns are printed.
     """
     builder = SyntheticBuilder.from_memory()
     tools: SyntheticIntentModel = builder.tools

     base = tools.canonical2dict(method='@empty', size=1000)
     repo_uri = 'https://raw.githubusercontent.com/project-hadron/hadron-asset-bank/master/contracts/healthcare/factory/members/'
     members = tools.canonical2dict(method='@generate', task_name='members', uri_pm_repo=repo_uri)

     wanted_headers = ['member_id', 'state', 'prev_flu_shot', 'age', 'channel_pref']
     result = builder.intent_model.model_concat(canonical=base,
                                                other=members,
                                                as_rows=False,
                                                headers=wanted_headers,
                                                column_name='member_reference')
     print(result.columns)
示例#9
0
 def test_associate_analysis_dominance(self):
     """The most frequent value keeps its share after modelling with exclude_dominant.

     Analyses a small integer sample with exclude_dominant=True, generates
     1000 rows from the analysis with apply_bias=True and compares the
     proportion of the top value count in the generated and source data
     to two decimal places.
     """
     source = pd.DataFrame()
     source['values'] = [0, 1, 0, 0, 7, 0, 0, 4, 2, 0, 0, 5, 8, 7, 0, 0]
     discover: DataDiscovery = Transition.from_memory().discover

     values_spec = discover.analysis2dict(
         header='values', dtype='int', precision=0, exclude_dominant=True)
     blob = discover.analyse_association(source, columns_list=[values_spec])

     builder = SyntheticBuilder.from_memory()
     empty_canonical = builder.tools.canonical2dict(method='@empty', size=1000)
     generated = builder.tools.model_analysis(
         empty_canonical, analytics_model=blob, apply_bias=True)

     # Share of rows taken by the top entry of value_counts() in each frame.
     generated_share = generated['values'].value_counts().iloc[0] / generated.shape[0]
     source_share = source['values'].value_counts().iloc[0] / source.shape[0]
     self.assertAlmostEqual(generated_share, source_share, places=2)
 def test_dataframe(self):
     """A DataFrame passed to _get_canonical comes back with the same content."""
     intent_tools = SyntheticBuilder.from_memory().tools
     expected = pd.DataFrame(data={'A': list('12345')})
     actual = intent_tools._get_canonical(data=expected)
     self.assertDictEqual(expected.to_dict(), actual.to_dict())