def test_correlate_categories_builder(self):
    """Correlate PCP tax ids to PCP names and check the pipeline output shape.

    A ten-row frame (five tax ids repeated twice) is passed through
    ``correlate_categories`` and then ``run_intent_pipeline``; the result is
    expected to have ten rows and two columns.
    """
    wrangle = Wrangle.from_env('test', has_contract=False)
    persist_contract = ConnectorContract(
        uri="eb://synthetic_members",
        module_name='ds_engines.handlers.event_handlers',
        handler='EventPersistHandler',
    )
    wrangle.set_persist_contract(persist_contract)

    # The same five ids serve as both the column values (twice over) and
    # the correlation keys.
    correlations = [993406113, 133757370, 260089066, 448512481, 546434723]
    # Action index -> name, positionally aligned with ``correlations``.
    actions = dict(enumerate([
        'LABCORP OF AMERICA',
        'LPCH MEDICAL GROUP',
        'ST JOSEPH HERITAGE MEDICAL',
        'MONARCH HEALTHCARE',
        'PRIVIA MEICAL GROUP',
    ]))

    frame = pd.DataFrame()
    frame['pcp_tax_id'] = correlations * 2
    frame['pcp_name'] = wrangle.tools.correlate_categories(
        frame,
        header='pcp_tax_id',
        correlations=correlations,
        actions=actions,
        column_name='pcp_name',
    )

    outcome = wrangle.tools.run_intent_pipeline(frame)
    self.assertEqual((10, 2), outcome.shape)
def setUp(self):
    """Reset the HADRON environment and register three chained tasks.

    Clears every ``HADRON*`` environment variable, points the property
    manager and default data paths at ``work/config`` and ``work/data``,
    creates both directories, and then persists three components:
    a ``SyntheticBuilder`` ('task1'), a ``Transition`` ('task2') sourced
    from the builder's persist contract, and a ``Wrangle`` ('task3')
    sourced from the transition's persist contract.

    Raises:
        IOError: if either working directory cannot be created (including
            when it already exists).
    """
    # clean out any old environments
    # Iterate over a snapshot: deleting from os.environ while walking its
    # live key view is not safe on every Python version.
    for key in list(os.environ):
        if key.startswith('HADRON'):
            del os.environ[key]

    os.environ['HADRON_PM_PATH'] = os.path.join('work', 'config')
    os.environ['HADRON_DEFAULT_PATH'] = os.path.join('work', 'data')
    try:
        os.makedirs(os.environ['HADRON_PM_PATH'])
        os.makedirs(os.environ['HADRON_DEFAULT_PATH'])
    except OSError as err:
        # Only filesystem failures are translated; the original cause is
        # chained so the real reason is visible in the traceback.
        raise IOError('Unable to create directories') from err
    PropertyManager._remove_all()

    builder = SyntheticBuilder.from_env('task1', has_contract=False)
    builder.set_persist()
    builder.pm_persist()

    tr = Transition.from_env('task2', has_contract=False)
    tr.set_source_uri(builder.get_persist_contract().raw_uri)
    tr.set_persist()
    tr.pm_persist()

    wr = Wrangle.from_env('task3', has_contract=False)
    wr.set_source_uri(tr.get_persist_contract().raw_uri)
    wr.set_persist()
    wr.pm_persist()
def setUp(self):
    """Reset the HADRON environment and wire a transition/wrangle pair.

    Clears every ``HADRON*`` environment variable, configures the property
    manager path/type and the default data path under ``working/``,
    ensures both directories exist, then creates a ``Transition`` ('task1')
    reading the titanic CSV and a ``Wrangle`` ('task2') reading the
    transition's persisted output. Finally both tasks are added to a
    ``Controller`` through its intent model.
    """
    # clean out any old environments
    # Iterate over a snapshot: deleting from os.environ while walking its
    # live key view is not safe on every Python version.
    for key in list(os.environ):
        if key.startswith('HADRON'):
            del os.environ[key]

    # Local Domain Contract
    os.environ['HADRON_PM_PATH'] = os.path.join('working', 'contracts')
    os.environ['HADRON_PM_TYPE'] = 'json'
    # Local Connectivity
    os.environ['HADRON_DEFAULT_PATH'] = Path('working/data').as_posix()
    # Specialist Component
    # exist_ok tolerates directories left by a previous run, while real
    # failures (e.g. permissions) now surface instead of being swallowed.
    os.makedirs(os.environ['HADRON_PM_PATH'], exist_ok=True)
    os.makedirs(os.environ['HADRON_DEFAULT_PATH'], exist_ok=True)
    PropertyManager._remove_all()

    tr = Transition.from_env('task1', has_contract=False)
    tr.set_source_uri(
        "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv"
    )
    tr.set_persist()

    wr = Wrangle.from_env('task2', has_contract=False)
    wr.set_source_uri(tr.get_persist_contract().raw_uri)
    wr.set_persist()

    controller = Controller.from_env(has_contract=False)
    controller.intent_model.transition(canonical=pd.DataFrame(),
                                       task_name='task1',
                                       intent_level='transition')
    controller.intent_model.wrangle(canonical=pd.DataFrame(),
                                    task_name='task2',
                                    intent_level='wrangle')
def test_repeat_iterations(self):
    """Check how many files ``run_controller`` leaves for each repeat setting.

    The data directory is wiped and recreated between scenarios so each
    count reflects a single ``run_controller`` call.
    """
    data_dir = 'work/data'
    listing_dir = 'work/data/'

    def reset_data_dir():
        # Drop everything the previous run wrote and start from an empty dir.
        shutil.rmtree(data_dir)
        os.makedirs(os.environ['HADRON_DEFAULT_PATH'])

    wrangle = Wrangle.from_env('task2')
    plain_pattern = wrangle.pm.file_pattern(name='tester',
                                            prefix='result1_',
                                            file_type='parquet')
    wrangle.set_persist(plain_pattern)
    ctrl = Controller.from_env()

    # Scenario 1: one repeat with the un-stamped file pattern.
    ctrl.run_controller(repeat=1)
    self.assertEqual(['result1_tester.parquet'], os.listdir(listing_dir))
    reset_data_dir()

    # Scenario 2: three repeats with the pattern stamped at 'ns'.
    stamped_pattern = wrangle.pm.file_pattern(name='tester',
                                              prefix='result1_',
                                              file_type='parquet',
                                              stamped='ns')
    wrangle.set_persist(stamped_pattern)
    ctrl.run_controller(repeat=3)
    produced = os.listdir(listing_dir)
    self.assertEqual(3, len(produced))
    reset_data_dir()

    # Scenario 3: two repeats with sleep=1.
    ctrl.run_controller(repeat=2, sleep=1)
    produced = os.listdir(listing_dir)
    self.assertEqual(2, len(produced))
    reset_data_dir()

    # Scenario 4: the same repeat/sleep plus run_time=4; four files expected.
    ctrl.run_controller(repeat=2, sleep=1, run_time=4)
    produced = os.listdir(listing_dir)
    self.assertEqual(4, len(produced))
def setUp(self):
    """Reset the HADRON environment and wire a transition/wrangle pair.

    Clears every ``HADRON*`` environment variable, points the property
    manager and default data paths at ``work/config`` and ``work/data``,
    ensures both directories exist, then creates a ``Transition`` ('task1')
    reading the titanic CSV and a ``Wrangle`` ('task2') reading the
    transition's persisted output. Finally both tasks are added to a
    ``Controller`` through its intent model.
    """
    # clean out any old environments
    # Iterate over a snapshot: deleting from os.environ while walking its
    # live key view is not safe on every Python version.
    for key in list(os.environ):
        if key.startswith('HADRON'):
            del os.environ[key]

    os.environ['HADRON_PM_PATH'] = os.path.join('work', 'config')
    os.environ['HADRON_DEFAULT_PATH'] = os.path.join('work', 'data')
    # Create each directory independently. Previously both calls shared one
    # try/except, so an already-existing config directory meant the data
    # directory was never created.
    os.makedirs(os.environ['HADRON_PM_PATH'], exist_ok=True)
    os.makedirs(os.environ['HADRON_DEFAULT_PATH'], exist_ok=True)
    PropertyManager._remove_all()

    tr = Transition.from_env('task1', has_contract=False)
    tr.set_source_uri(
        "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv"
    )
    tr.set_persist()

    wr = Wrangle.from_env('task2', has_contract=False)
    wr.set_source_uri(tr.get_persist_contract().raw_uri)
    wr.set_persist()

    controller = Controller.from_env(has_contract=False)
    controller.intent_model.transition(canonical=pd.DataFrame(),
                                       task_name='task1',
                                       intent_level='task1_tr')
    controller.intent_model.wrangle(canonical=pd.DataFrame(),
                                    task_name='task2',
                                    intent_level='task2_wr')
def test_runs(self):
    """Basic smoke test: the component exposes a ``WrangleIntentModel``."""
    im = Wrangle.from_env('tester',
                          default_save=False,
                          default_save_intent=False,
                          reset_templates=False,
                          has_contract=False).intent_model
    # assertTrue(cls, type(im)) could never fail: the class object is always
    # truthy and the second argument is only the failure message.
    self.assertIsInstance(im, WrangleIntentModel)