Пример #1
0
 def test_from_component(self):
     """Create the 'members' builder with event-book environment defaults, then reload it."""
     # EventBook: point the default path, module and handlers at the event handlers
     # NOTE(review): the source handler is set to 'EventPersistHandler' and the persist
     # handler to 'EventSourceHandler' -- these look swapped; confirm against ds_engines.
     os.environ.update({
         'HADRON_DEFAULT_PATH': 'eb://grey_storage/',
         'HADRON_DEFAULT_MODULE': 'ds_engines.handlers.event_handlers',
         'HADRON_DEFAULT_SOURCE_HANDLER': 'EventPersistHandler',
         'HADRON_DEFAULT_PERSIST_HANDLER': 'EventSourceHandler',
     })
     # Portfolio
     members = SyntheticBuilder.from_env('members', has_contract=False)
     members.set_outcome(uri_file="synthetic_members")
     # second load is made without has_contract=False
     members = SyntheticBuilder.from_env('members')
Пример #2
0
 def test_associate_analysis_complex(self):
     """Model an association analysis of 'age' and 'pregnancies' and compare the pregnancies mean.

     The mean of the modelled 'pregnancies' values must fall inside the bootstrap
     confidence interval computed from the source dataset.
     """
     builder = SyntheticBuilder.from_memory()
     builder.add_connector_uri(
         'clinical_health',
         uri='https://assets.datacamp.com/production/repositories/628/datasets/444cdbf175d5fbf564b564bd36ac21740627a834/diabetes.csv')
     discover: DataDiscovery = Transition.from_memory().discover
     age_params = discover.analysis2dict(
         header='age', dtype='int', granularity=10.0, lower=21, upper=90)
     pregnancy_params = discover.analysis2dict(header='pregnancies')
     association = [age_params, pregnancy_params]
     df_clinical = builder.load_canonical('clinical_health')
     analysis_blob = discover.analyse_association(
         df_clinical, columns_list=association)
     # model the analysis onto an empty canonical of 1973 rows
     empty_frame = builder.tools.canonical2dict(method='@empty', size=1973)
     df = builder.tools.model_analysis(
         empty_frame, analytics_model=analysis_blob, column_name='clinical')
     self.assertEqual((1973, 2), df.shape)
     # confidence interval of the mean, taken from the source values
     observed = SyntheticCommons.list_formatter(df_clinical.pregnancies)
     observed = SyntheticCommons.list_standardize(observed)
     low, high = discover.bootstrap_confidence_interval(
         pd.Series(observed), func=np.mean)
     # the modelled values go through the same formatting before comparison
     modelled = SyntheticCommons.list_formatter(df.pregnancies)
     modelled = SyntheticCommons.list_standardize(modelled)
     self.assertTrue(low <= np.mean(modelled) <= high)
 def test_runs(self):
     """Basic smoke test: the builder's intent model is a SyntheticIntentModel."""
     im = SyntheticBuilder.from_env('tester',
                                    default_save=False,
                                    default_save_intent=False,
                                    reset_templates=False).intent_model
     # assertTrue(a, b) uses b only as the failure message and passes for any
     # truthy a, so the type was never actually checked; assert it explicitly.
     self.assertIsInstance(im, SyntheticIntentModel)
 def test_dict_generate(self):
     """Resolve '@generate' canonical requests for the 'generator' task.

     Covers the bare request, a sized request and a request with a selection
     that keeps only the 'M' gender rows.
     """
     builder = SyntheticBuilder.from_env('generator', has_contract=False)
     tools: SyntheticIntentModel = builder.tools
     # the frame is never inspected afterwards; the calls are made with column_name set
     # (presumably this records the intent on the 'generator' task -- verify)
     frame = pd.DataFrame()
     frame['gender'] = tools.get_category(
         selection=['M', 'F'], column_name='gender')
     frame['age'] = tools.get_number(
         from_value=18, to_value=90, column_name='age')

     # no size given
     request = {'method': '@generate', 'task_name': 'generator'}
     generated = tools._get_canonical(data=request)
     self.assertCountEqual(['age', 'gender'], generated.columns.to_list())

     # explicit size
     request = {'method': '@generate', 'task_name': 'generator', 'size': 100}
     generated = tools._get_canonical(data=request)
     self.assertCountEqual(['age', 'gender'], generated.columns.to_list())
     self.assertEqual(100, generated.shape[0])

     # with a selection no 'F' rows may remain
     males_only = [tools.select2dict(column='gender', condition="@=='M'")]
     request = {
         'method': '@generate',
         'task_name': 'generator',
         'size': 100,
         'selection': males_only,
     }
     generated = tools._get_canonical(data=request)
     self.assertGreater(generated.shape[0], 0)
     females = generated[generated['gender'] == 'F']
     self.assertEqual(0, females.shape[0])
 def test_model_us_zip(self):
     """US zip modelling returns only the 'NY' and 'TX' state codes and keeps 300 rows."""
     builder = SyntheticBuilder.from_env(
         'test', default_save=False, default_save_intent=False, has_contract=False)
     base = pd.DataFrame(index=range(300))
     # 'FRED' is passed in the filter but is not expected in the result
     zips = builder.tools.model_us_zip(
         base, state_code_filter=['NY', 'TX', 'FRED'])
     found_states = zips['StateCode'].value_counts().index.to_list()
     self.assertCountEqual(['NY', 'TX'], found_states)
     self.assertCountEqual(
         ['StateAbbrev', 'Zipcode', 'City', 'State', 'StateCode', 'Phone'],
         zips.columns.to_list())
     self.assertEqual(300, zips.shape[0])
 def test_remove_unwanted_headers(self):
     """Frame selection returns only the requested headers and only rows with survived == 1."""
     builder = SyntheticBuilder.from_env(
         'test', default_save=False, default_save_intent=False, has_contract=False)
     builder.set_source_uri(
         uri="https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv")
     survivors = [builder.tools.select2dict(column='survived', condition='==1')]
     subset = builder.tools.frame_selection(
         canonical=builder.CONNECTOR_SOURCE,
         selection=survivors,
         headers=['survived', 'sex', 'fare'])
     self.assertCountEqual(['survived', 'sex', 'fare'], list(subset.columns))
     # the minimum being 1 shows no 0 values were selected
     self.assertEqual(1, subset['survived'].min())
Пример #7
0
 def test_run_pipeline_with_analytics(self):
     """Register the clinical health dataset as a connector on the 'sample' builder and load it."""
     builder: SyntheticBuilder = SyntheticBuilder.from_env('sample',
                                                           has_contract=False)
     tools: SyntheticIntentModel = builder.tools
     # load the sample dataset to analyse
     builder.add_connector_uri(
         'clinical_health',
         uri='https://assets.datacamp.com/production/repositories/628/datasets/444cdbf175d5fbf564b564bd36ac21740627a834/diabetes.csv')
     df_clinical = builder.load_canonical('clinical_health')
 def test_model_sample_map(self):
     """Sample-map modelling of 'us_healthcare_practitioner' with and without a sized canonical."""
     builder = SyntheticBuilder.from_memory(default_save_intent=False)
     tools = builder.tools
     # an empty canonical gives the (70655, 10) shape
     full_map = tools.model_sample_map(
         pd.DataFrame(), sample_map='us_healthcare_practitioner')
     self.assertEqual((70655, 10), full_map.shape)
     # a 50 row canonical; this result is not asserted on
     sized = tools.model_sample_map(
         pd.DataFrame(index=range(50)), sample_map='us_healthcare_practitioner')
     # a 50 row canonical restricted to a single header
     sized = tools.model_sample_map(
         pd.DataFrame(index=range(50)),
         sample_map='us_healthcare_practitioner',
         headers=['pcp_tax_id'])
     self.assertEqual((50, 1), sized.shape)
 def test_model_columns_headers(self):
     """Column-wise concat of the source connector keeps only the requested headers and 300 rows."""
     builder = SyntheticBuilder.from_env(
         'test', default_save=False, default_save_intent=False, has_contract=False)
     tools: SyntheticIntentModel = builder.tools
     builder.set_source_uri(
         uri="https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv")
     base = pd.DataFrame(index=range(300))
     combined = tools.model_concat(
         base,
         other=builder.CONNECTOR_SOURCE,
         as_rows=False,
         headers=['survived', 'sex', 'fare'])
     self.assertCountEqual(['survived', 'sex', 'fare'], list(combined.columns))
     self.assertEqual(300, combined.shape[0])
 def test_canonical_run_pipeline_dict(self):
     """Exercise number generation and correlation alongside a second 'sub_set' builder.

     No assertions are made; the test passes when none of the calls raise.
     """
     tools = self.builder.intent_model
     frame = pd.DataFrame()
     frame['numbers'] = tools.get_number(1, 2, column_name='numbers')
     # create a remote pm contract
     remote = SyntheticBuilder.from_env('sub_set', has_contract=False)
     remote.tools.get_category(selection=['A', 'B'], column_name='value')
     # the parameter dictionary is built but not used further here
     sub_set = SyntheticCommons.param2dict()
     frame['corr_num'] = tools.correlate_numbers(
         frame,
         offset=1,
         header='numbers',
         column_name='numbers',
         intent_order=1)
 def test_str(self):
     """A connector name passed as a string resolves to the canonical saved under it."""
     builder = SyntheticBuilder.from_memory()
     tools = builder.tools
     expected = pd.DataFrame(data={'A': list('12345')})
     # persist the frame under the 'test' connector, then read it back by name
     builder.add_connector_persist(connector_name='test', uri_file='test.pickle')
     builder.save_canonical(connector_name='test', canonical=expected)
     loaded = tools._get_canonical(data='test')
     self.assertDictEqual(expected.to_dict(), loaded.to_dict())
 def test_model_us_person(self):
     """The 'us_persona' sample map yields the five persona columns for the requested rows."""
     builder = SyntheticBuilder.from_memory(default_save_intent=False)
     personas = builder.tools.model_sample_map(
         canonical=pd.DataFrame(index=range(300)), sample_map='us_persona')
     self.assertCountEqual(
         ['first_name', 'middle_name', 'gender', 'family_name', 'email'],
         personas.columns.to_list())
     self.assertEqual(300, personas.shape[0])
     # with a female bias only the shape is asserted; the 'F' count is just printed
     biased = builder.tools.model_sample_map(
         canonical=pd.DataFrame(index=range(1000)),
         sample_map='us_persona',
         female_bias=0.3)
     self.assertEqual((1000, 5), biased.shape)
     print(biased['gender'].value_counts().loc['F'])
 def test_list(self):
     """Lists and Series become a single-column canonical named 'default' or the given header."""
     tools = SyntheticBuilder.from_memory().tools
     chars = list('12345')
     # without a header the column is called 'default'
     canonical = tools._get_canonical(data=chars)
     self.assertEqual(chars, canonical['default'].to_list())
     # an explicit header names the column
     canonical = tools._get_canonical(data=chars, header='sample')
     self.assertEqual(chars, canonical['sample'].to_list())
     # a pandas Series is handled the same way
     series = pd.Series(chars)
     canonical = tools._get_canonical(data=series, header='sample')
     self.assertEqual(series.to_list(), canonical['sample'].to_list())
 def test_dict_method(self):
     """A canonical dictionary naming 'model_sample_map' is executed when resolved."""
     builder = SyntheticBuilder.from_env('generator', has_contract=False)
     tools: SyntheticIntentModel = builder.tools
     # the inner action supplies a 100 row empty canonical to the sample map
     empty_hundred = tools.action2dict(method='@empty', size=100)
     request = tools.canonical2dict(
         method='model_sample_map',
         canonical=empty_hundred,
         sample_map='us_persona',
         female_bias=0.3)
     personas = tools._get_canonical(data=request)
     self.assertEqual((100, 5), personas.shape)
     female_count = personas['gender'].value_counts().loc['F']
     self.assertEqual(30, female_count)
 def test_complex_sample_modelling(self):
     """Merge zip-code samples with practitioner tax ids grouped as lists per city."""
     tools = SyntheticBuilder.from_memory().tools
     state_code = ['CA', 'NY', 'LA', 'NJ', 'VA', 'CO', 'NV', 'GA', 'IN', 'OH', 'KY', 'ME', 'MO', 'WI']
     # left side: 100 zip-code rows filtered by the states above
     zipcodes = tools.model_sample_map(
         canonical={'method': '@empty', 'size':100},
         sample_map='us_zipcode',
         state_filter=state_code,
         column_name='zipcodes')
     # right side: practitioner city / tax id pairs ...
     practitioners = tools.action2dict(
         method='model_sample_map',
         canonical=tools.action2dict(method='@empty'),
         sample_map='us_healthcare_practitioner',
         headers=['city', 'pcp_tax_id'],
         shuffle=False)
     # ... grouped by city with the tax ids aggregated using 'list'
     grouped = tools.action2dict(
         method='model_group',
         canonical=practitioners,
         headers='pcp_tax_id',
         group_by='city',
         aggregator='list')
     merged = tools.model_merge(
         zipcodes,
         grouped,
         how='left',
         left_on='city',
         right_on='city',
         column_name='pcp_tax_id')
     self.assertCountEqual(
         ['city', 'state_abbr', 'state', 'county_fips', 'county', 'zipcode', 'pcp_tax_id'],
         merged.columns.to_list())
Пример #16
0
 def setUp(self):
     """Prepare the working directories, clear property managers and bootstrap the 'sample' builder.

     Sets HADRON_PM_PATH and HADRON_DEFAULT_PATH, creates both directories when
     missing, and stores the builder and its tools on the test instance.
     """
     os.environ['HADRON_PM_PATH'] = os.path.join('work', 'config')
     os.environ['HADRON_DEFAULT_PATH'] = os.path.join('work', 'data')
     # exist_ok keeps each creation independent: previously an already existing
     # first directory raised inside a shared try block, which skipped creating
     # the second one, and the bare except hid every other failure as well.
     for env_key in ('HADRON_PM_PATH', 'HADRON_DEFAULT_PATH'):
         os.makedirs(os.environ[env_key], exist_ok=True)
     PropertyManager._remove_all()
     self.builder: SyntheticBuilder = SyntheticBuilder.from_env(
         'sample', has_contract=False)
     self.builder.setup_bootstrap()
     self.tools: SyntheticIntentModel = self.builder.tools
 def test_model_group(self):
     """Group the titanic dataset with 'sum' and 'set' aggregators and with weighting included."""
     builder = SyntheticBuilder.from_memory()
     tools: SyntheticIntentModel = builder.tools
     builder.add_connector_uri(
         'titanic',
         uri="https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv")

     # single header summed over two group-by columns
     grouped = tools.model_group(
         'titanic', headers='fare', group_by=['survived', 'sex'], aggregator='sum')
     self.assertEqual((4, 3), grouped.shape)

     # two headers aggregated with 'set' and list_choice=2
     grouped = tools.model_group(
         'titanic',
         headers=['class', 'embark_town'],
         group_by=['survived', 'sex'],
         aggregator='set',
         list_choice=2)
     self.assertEqual((4, 4), grouped.shape)
     self.assertCountEqual(
         ['class', 'embark_town', 'survived', 'sex'], grouped.columns.to_list())

     # include_weighting adds a 'weighting' column
     grouped = tools.model_group(
         'titanic',
         headers=['fare', 'survived'],
         group_by='sex',
         aggregator='sum',
         include_weighting=True)
     self.assertEqual((2, 4), grouped.shape)
     self.assertCountEqual(
         ['survived', 'sex', 'fare', 'weighting'], grouped.columns.to_list())
Пример #18
0
 def test_run_synthetic_pipeline_seed(self):
     """Two pipeline runs with the same seed give the same gender counts and mean age."""
     builder = SyntheticBuilder.from_env('tester', has_contract=False)
     builder.set_persist()
     tools: SyntheticIntentModel = builder.tools
     tools.get_category(
         selection=['M', 'F'], relative_freq=[4, 3], column_name='gender')
     tools.get_number(from_value=18, to_value=80, column_name='age')

     # first run: record the reference statistics
     builder.run_synthetic_pipeline(size=1000, seed=23)
     first = builder.load_synthetic_canonical()
     gender_counts = first['gender'].value_counts().values
     mean_age = first['age'].mean()

     # second run with the identical seed must reproduce them
     builder.run_synthetic_pipeline(size=1000, seed=23)
     second = builder.load_synthetic_canonical()
     self.assertCountEqual(gender_counts, second['gender'].value_counts().values)
     self.assertEqual(mean_age, second['age'].mean())
 def test_dict_empty(self):
     """The '@empty' canonical method honours the optional size and headers arguments."""
     builder = SyntheticBuilder.from_env('generator', has_contract=False)
     tools: SyntheticIntentModel = builder.tools
     # (extra canonical2dict arguments, expected shape); the size=100 case was
     # previously copy-pasted twice and is listed once here
     cases = [
         ({}, (0, 0)),
         ({'size': 100}, (100, 0)),
         ({'size': 100, 'headers': ['A', 'B', 'C']}, (100, 3)),
     ]
     for extra, expected_shape in cases:
         # subTest reports every failing case instead of stopping at the first
         with self.subTest(**extra):
             action = tools.canonical2dict(method='@empty', **extra)
             result = tools._get_canonical(data=action)
             self.assertEqual(expected_shape, result.shape)
 def test_dict_generate_remote(self):
     """Concat columns generated from a remote 'members' contract onto a 1000 row canonical.

     No assertions are made; the resulting columns are printed.
     """
     builder = SyntheticBuilder.from_memory()
     tools: SyntheticIntentModel = builder.tools
     base = tools.canonical2dict(method='@empty', size=1000)
     # the generating contract is read from the remote property manager repository
     remote_members = tools.canonical2dict(
         method='@generate',
         task_name='members',
         uri_pm_repo='https://raw.githubusercontent.com/project-hadron/hadron-asset-bank/master/contracts/healthcare/factory/members/')
     wanted = ['member_id', 'state', 'prev_flu_shot', 'age', 'channel_pref']
     combined = builder.intent_model.model_concat(
         canonical=base,
         other=remote_members,
         as_rows=False,
         headers=wanted,
         column_name='member_reference')
     print(combined.columns)
Пример #21
0
 def test_associate_analysis_dominance(self):
     """The share of the most frequent value in the modelled data matches the sample to two places."""
     sample = pd.DataFrame()
     sample['values'] = [0, 1, 0, 0, 7, 0, 0, 4, 2, 0, 0, 5, 8, 7, 0, 0]
     discover: DataDiscovery = Transition.from_memory().discover
     # analyse 'values' with the dominant value excluded
     values_params = discover.analysis2dict(
         header='values', dtype='int', precision=0, exclude_dominant=True)
     analysis_blob = discover.analyse_association(
         sample, columns_list=[values_params])
     builder = SyntheticBuilder.from_memory()
     empty_frame = builder.tools.canonical2dict(method='@empty', size=1000)
     df = builder.tools.model_analysis(
         empty_frame, analytics_model=analysis_blob, apply_bias=True)
     # relative frequency of the top value in each frame
     modelled_share = df['values'].value_counts().iloc[0] / df.shape[0]
     sample_share = sample['values'].value_counts().iloc[0] / sample.shape[0]
     self.assertAlmostEqual(modelled_share, sample_share, places=2)
    def recommend_heuristic(profile: pd.Series,
                            items: pd.DataFrame,
                            recommend: int = None,
                            top: int = None,
                            exclude_items: list = None) -> list:
        """ takes a profile of an entity where the index of the profile represents the columns in the items.
        for example the profile will be an index list of film genres and how many times these categories
        have been watched. The items will be columns of categories with the index the films and row values
        being the count of film watches in the column categories

        Categories are chosen from the profile's ``top`` highest counts, using those counts as the
        weight pattern. For each chosen category the highest-valued items in that column are the
        candidates, one per time the category was chosen. Candidates are taken from the end of that
        list, and a choice is skipped when its category has no candidates left, so fewer than
        ``recommend`` items can be returned.

        :param profile: a pandas series of categories (index) counters for a single profile
        :param items: a pandas dataframe of item counts (index) of columns (categories)
        :param recommend: the number of recommended items to select from. Default is 10
        :param top: limits the cut-off of the top categories to select from. Default is 10, also used if below 1
        :param exclude_items: item index to not include. Default is None, excluding nothing
        :return: a list of recommendations, empty when the profile is None or has no entries
        """
        recommend = 10 if recommend is None else recommend
        top = 10 if top is None or top < 1 else top
        # without a profile there is nothing to choose from, so return before doing any work
        if profile is None or profile.size == 0:
            return []
        # drop the entities in the exclude; DataFrame.drop raises a ValueError when every
        # label argument is None, so the default is replaced with an empty list
        _df = items.drop(index=[] if exclude_items is None else exclude_items, errors='ignore')
        categories = profile.sort_values(ascending=False).iloc[:top]
        choices = SyntheticBuilder.scratch_pad().get_category(
            selection=categories.index.to_list(),
            weight_pattern=categories.values.tolist(),
            size=recommend)
        choices_count = pd.Series(choices).value_counts()
        # per chosen category, as many of the highest-valued items as the category was chosen
        selection_dict = {
            category: _df[category].sort_values(ascending=False).iloc[:count].index.to_list()
            for category, count in choices_count.items()
        }
        rtn_list = []
        for item in choices:
            candidates = selection_dict[item]
            # a category may hold fewer items than the times it was chosen; skip rather than fail
            if candidates:
                rtn_list.append(candidates.pop())
        return rtn_list
 def test_model_iterator(self):
     """model_iterator with default arguments, a marker column, a selection, iterations and actions."""
     builder = SyntheticBuilder.from_env(
         'test', default_save=False, default_save_intent=False, has_contract=False)
     tools: SyntheticIntentModel = builder.tools
     builder.add_connector_uri(
         'titanic',
         uri="https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv")

     # do nothing: the shape equals the loaded canonical's shape
     iterated = tools.model_iterator(canonical='titanic')
     self.assertEqual(builder.load_canonical('titanic').shape, iterated.shape)

     # add marker: exactly one more column
     iterated = tools.model_iterator(canonical='titanic', marker_col='marker')
     self.assertEqual(
         builder.load_canonical('titanic').shape[1]+1, iterated.shape[1])

     # with selection: row count equals that of frame_selection with the same selection
     survivors = [tools.select2dict(column='survived', condition="==1")]
     control = tools.frame_selection(canonical='titanic', selection=survivors)
     iterated = tools.model_iterator(
         canonical='titanic', marker_col='marker', selection=survivors)
     self.assertEqual(control.shape[0], iterated.shape[0])

     # with iteration: markers 0, 1 and 2
     iterated = tools.model_iterator(
         canonical='titanic', marker_col='marker', iter_stop=3)
     markers = iterated['marker'].value_counts().index.to_list()
     self.assertCountEqual([0,1,2], markers)

     # with actions: the action keyed by 2 supplies the values 4 and 5
     actions = {2: (tools.action2dict(method='get_category', selection=[4,5]))}
     iterated = tools.model_iterator(
         canonical='titanic',
         marker_col='marker',
         iter_stop=3,
         iteration_actions=actions)
     markers = iterated['marker'].value_counts().index.to_list()
     self.assertCountEqual([0,1,4,5], markers)
 def test_dataframe(self):
     """A DataFrame passed as data is returned with the same content."""
     tools = SyntheticBuilder.from_memory().tools
     original = pd.DataFrame(data={'A': list('12345')})
     canonical = tools._get_canonical(data=original)
     self.assertDictEqual(original.to_dict(), canonical.to_dict())
 def builder(self) -> SyntheticBuilder:
     """Return a SyntheticBuilder created from the environment for the 'tester' task."""
     tester = SyntheticBuilder.from_env('tester')
     return tester
 def builder(self) -> SyntheticBuilder:
     """Return a SyntheticBuilder for the 'tester' task, created with has_contract=False."""
     tester = SyntheticBuilder.from_env('tester', has_contract=False)
     return tester
Пример #27
0
 def test_set_report_persist(self):
     """After bootstrap the last reported connector's file name starts with the project name."""
     builder = SyntheticBuilder.from_env(
         'tester', default_save=False, has_contract=False)
     builder.setup_bootstrap(
         domain='domain', project_name='project_name', path=None)
     connectors = builder.report_connectors(stylise=False)
     # only the file part of the last uri is of interest
     file_name = os.path.basename(connectors.uri.iloc[-1])
     self.assertTrue(file_name.startswith('project_name'))
Пример #28
0
 def test_runs(self):
     """Basic smoke test: from_env with has_contract=False returns exactly a SyntheticBuilder."""
     instance = SyntheticBuilder.from_env('tester', has_contract=False)
     self.assertEqual(SyntheticBuilder, type(instance))
 def tools(self) -> SyntheticIntentModel:
     """Return the object given by SyntheticBuilder.scratch_pad()."""
     scratch = SyntheticBuilder.scratch_pad()
     return scratch
 def test_runs(self):
     """Basic smoke test: from_env returns exactly a SyntheticBuilder."""
     instance = SyntheticBuilder.from_env('tester')
     self.assertEqual(SyntheticBuilder, type(instance))