def test_run_cache_states(clear_caches, int_frame, other_int_frame,
                          run_manager, report, remote_1):
    """Check that state saved on a train run is read back on a non-train run.

    ``func_1`` stores a random number in ``df.state['tmp']`` only when
    ``df.train`` is set and always emits that stored number as column ``tmp``.
    Running it with ``train=False`` on another frame under the same fold must
    therefore give a column with the same mean as the train run.
    """
    time.sleep(0.001)

    def func_1(df):
        # Re-seed from the clock so separate invocations draw different numbers.
        np.random.seed(int(time.time() * 1000000) % 1000000)
        if df.train:
            df.state['tmp'] = np.random.randint(1000)
        tmp = df.state['tmp']
        res = pd.DataFrame(index=df.index)
        res['tmp'] = tmp
        return res

    func_1 = FeatureConstructor(func_1)
    func_1.parallel = remote_1

    train_results = run_manager.run(
        [func_1],
        frame=int_frame,
        train=True,
        fold="test_run_cache_states",
        ret=True,
        report=report,
    )
    res_1 = train_results['func_1']

    time.sleep(0.001)
    run_manager.merge_scheduled()

    inference_results = run_manager.run(
        [func_1],
        frame=other_int_frame,
        train=False,
        fold="test_run_cache_states",
        ret=True,
        report=report,
    )
    res_2 = inference_results['func_1']

    train_mean = res_1.tmp.mean()
    inference_mean = res_2.tmp.mean()
    assert (train_mean == inference_mean)
def test_run_cache_columns(clear_caches, int_frame, other_int_frame,
                           run_manager, report, remote_1):
    """Check that the columns seen on the train run are kept for later runs.

    ``func_1`` emits ``w`` and ``kk`` when ``df.train`` is set and ``w`` and
    ``mm`` otherwise; the test expects the ``train=False`` result to still
    expose exactly ``['w', 'kk']``.
    """
    expected_columns = ['w', 'kk']

    def func_1(df):
        res = pd.DataFrame({'w': range(len(df))})
        if df.train:
            res['kk'] = 15
        else:
            res['mm'] = 18
        return res

    func_1 = FeatureConstructor(func_1)
    func_1.parallel = remote_1

    train_results = run_manager.run(
        [func_1],
        frame=int_frame,
        train=True,
        fold='test_run_cache_columns',
        ret=True,
        report=report,
    )
    res_1 = train_results['func_1']
    run_manager.merge_scheduled()

    assert all(res_1.columns == expected_columns)
    assert set(func_1.columns) == set(expected_columns)

    inference_results = run_manager.run(
        [func_1],
        frame=other_int_frame,
        train=False,
        fold='test_run_cache_columns',
        ret=True,
        report=report,
    )
    res_2 = inference_results['func_1']
    assert all(res_2.columns == expected_columns)
def test_deeply_nested_fc_run(clear_caches, int_frame, run_manager, report,
                              remote_1, remote_2, remote_3):
    """Run a feature constructor nested three levels deep and check its values.

    ``func_3`` calls ``func_2``, which in turn calls ``func_1``; each of them
    gets its own ``parallel`` setting from the ``remote_*`` arguments.
    """
    am.clear.remote()

    def func_1(df):
        return df + 1

    func_1 = FeatureConstructor(func_1)
    func_1.parallel = remote_1

    def func_2(df):
        return func_1(df)**2

    func_2 = FeatureConstructor(func_2)
    func_2.parallel = remote_2

    def func_3(df):
        return func_2(df)**2

    func_3 = FeatureConstructor(func_3)
    func_3.parallel = remote_3

    assert len(run_manager.scheduled) == 0
    res = run_manager.run([func_3],
                          frame=int_frame,
                          train=True,
                          fold='preview',
                          ret=True,
                          report=report)
    res_frame = res['func_3']
    run_manager.merge_scheduled()

    # func_3 == ((df + 1) ** 2) ** 2, i.e. (df + 1) ** 4.
    expected = (int_frame + 1)**4
    # Compare the underlying values: the builtin ``all()`` applied to a
    # DataFrame iterates over its column labels, so ``all(a == b)`` passes
    # for any frame whose column names are truthy.
    assert (res_frame.values == expected.values).all()
def test_scheduler_cache(clear_caches, int_frame, run_manager, report,
                         remote_2, remote_3, remote_1=True):
    """Check that one run computes a shared dependency only once.

    ``func_1`` adds a fresh random integer on every invocation, while
    ``func_2`` and ``func_3`` simply return ``func_1(df)``. All three results
    can only be equal if the same ``func_1`` output is reused within the run.

    NOTE(review): ``remote_1`` has a default value, so it is presumably not
    injected as a fixture and ``func_1`` always runs with ``parallel=True``
    -- confirm that this is intended.
    """
    def func_1(df):
        return df + np.random.randint(1000)

    func_1 = FeatureConstructor(func_1)
    func_1.parallel = remote_1

    def func_2(df):
        return func_1(df)

    func_2 = FeatureConstructor(func_2)
    func_2.parallel = remote_2

    def func_3(df):
        return func_1(df)

    func_3 = FeatureConstructor(func_3)
    func_3.parallel = remote_3

    res = run_manager.run([func_1, func_2, func_3],
                          frame=int_frame,
                          train=True,
                          fold='preview',
                          ret=True,
                          report=report)

    # Compare the underlying values: the builtin ``all()`` applied to a
    # DataFrame iterates over its column labels, so ``all(a == b)`` passes
    # even when the random offsets differ.
    base_values = res['func_1'].values
    assert (base_values == res['func_2'].values).all()
    assert (base_values == res['func_3'].values).all()
def register(function):
    """Turn ``function`` into a registered feature constructor and return it.

    A ``GenericFeatureConstructor`` is used as is; anything else is wrapped in
    a ``FeatureConstructor``. The ``cache``, ``parallel`` and ``verbose``
    values come from names defined outside this function.
    """
    if isinstance(function, GenericFeatureConstructor):
        feature_constructor = function
    else:
        feature_constructor = FeatureConstructor(function)

    # Apply the settings before the constructor is added to the feature list.
    for attribute, value in (('cache', cache),
                             ('parallel', parallel),
                             ('verbose', verbose)):
        setattr(feature_constructor, attribute, value)

    feature_list.register(feature_constructor)
    update_dashboard()
    return feature_constructor
def test_fc_run(clear_caches, int_frame, run_manager, report, remote):
    """Run a single feature constructor and check that it squares the frame."""
    am.clear.remote()

    def func(df):
        return df**2

    fc = FeatureConstructor(func)
    fc.parallel = remote

    assert len(run_manager.scheduled) == 0
    res = run_manager.run([fc],
                          frame=int_frame,
                          train=True,
                          fold='preview',
                          ret=True,
                          report=report)
    res_frame = res['func']
    run_manager.merge_scheduled()

    expected = int_frame**2
    # Compare the underlying values: the builtin ``all()`` applied to a
    # DataFrame iterates over its column labels, so ``all(a == b)`` passes
    # for any frame whose column names are truthy.
    assert (res_frame.values == expected.values).all()
def test_static_cache(clear_caches, int_frame, run_manager, report, remote_1):
    """Check that a repeated run on the same fold returns the first result.

    ``func_1`` adds a fresh random integer on every invocation, so the two
    results can only be equal if the second run reuses what the first one
    produced.
    """
    def func_1(df):
        return df + np.random.randint(1000)

    func_1 = FeatureConstructor(func_1)
    func_1.parallel = remote_1

    res_1 = run_manager.run([func_1],
                            frame=int_frame,
                            train=True,
                            fold='test_static_cache',
                            ret=True,
                            report=report)['func_1']
    run_manager.merge_scheduled()
    assert len(run_manager.scheduled) == 0

    res_2 = run_manager.run([func_1],
                            frame=int_frame,
                            train=True,
                            fold='test_static_cache',
                            ret=True,
                            report=report)['func_1']

    # Compare the underlying values: the builtin ``all()`` applied to a
    # DataFrame iterates over its column labels, so ``all(a == b)`` passes
    # even when the random offsets differ.
    assert (res_1.values == res_2.values).all()
def test_cv_feature_set(frame, folds, clear_caches, report, remote_1, remote_2):
    """Check a cross-validated feature set that contains a stateful feature.

    ``stateful_feature`` stores the mean of ``int`` in ``df.state`` when
    ``df.train`` is set and emits the stored value as a constant column, so
    within each fold the train and valid parts must carry the same single
    value in column 1.
    """
    def simple_feature(df):
        return df[['int']] * 2

    simple_feature = FeatureConstructor(simple_feature)
    simple_feature.parallel = remote_1

    def stateful_feature(df):
        res = pd.DataFrame(index=df.index)
        if df.train:
            df.state['mean'] = df['int'].mean()
        mean = df.state['mean']
        res['mean'] = mean
        return res

    stateful_feature = FeatureConstructor(stateful_feature)
    stateful_feature.parallel = remote_2

    fs = FeatureSet(
        [simple_feature],
        [stateful_feature],
        train_frame=frame,
        targets='float_rand',
    )
    cfs = fs.split(folds)
    cfs.compute(report=report)
    assert len(run_manager.scheduled) == 0

    n_folds = len(folds)
    for i in range(n_folds):
        train_mean = cfs.fold(i).train[:, 1]
        valid_mean = cfs.fold(i).valid[:, 1]
        # The column is constant inside each part of the fold ...
        assert len(set(train_mean)) == 1
        assert len(set(valid_mean)) == 1
        # ... and holds the same value in both parts.
        assert train_mean.mean() == pytest.approx(valid_mean.mean())
    assert len(run_manager.scheduled) == 0
def test_auxiliary(frame, clear_caches, report, remote_1, remote_2):
    """Check that auxiliary columns of a feature set are exposed via ``fs.aux``.

    Two of the auxiliary columns are produced by feature constructors and the
    third one (``str``) is requested by name and compared with ``frame``.
    """
    cfg.preview_mode = False

    def aux_feature_1(df):
        res = pd.DataFrame(index=df.index)
        res['aux_1'] = df[['int']] * 2
        return res

    aux_feature_1 = FeatureConstructor(aux_feature_1)
    aux_feature_1.parallel = remote_1

    def aux_feature_2(df):
        res = pd.DataFrame(index=df.index)
        res['aux_2'] = df[['float_rand']]**2
        return res

    aux_feature_2 = FeatureConstructor(aux_feature_2)
    aux_feature_2.parallel = remote_2

    fs = FeatureSet(
        [aux_feature_1, aux_feature_2],
        train_frame=frame,
        targets=['int'],
        auxiliary=['aux_1', 'aux_2', 'str'],
    )
    fs.compute()
    assert len(run_manager.scheduled) == 0

    aux = fs.aux
    expected_aux_1 = frame['int'].values * 2
    expected_aux_2 = frame['float_rand'].values**2
    assert (aux['aux_1'].values == pytest.approx(expected_aux_1))
    assert (aux['aux_2'].values == pytest.approx(expected_aux_2))
    pd.testing.assert_series_equal(aux['str'], frame['str'])
    assert len(run_manager.scheduled) == 0
def test_nested_progressbar(clear_caches, int_frame, run_manager, report,
                            remote_1, remote_2):
    """Run nested feature constructors that both iterate through ``pbar``.

    ``func_2`` loops over a titled ``pbar`` and then calls ``func_1``, which
    loops over an untitled one; the result must still be ``(df + 1) ** 2``.
    """
    am.clear.remote()

    def func_1(df):
        for _ in pbar(range(10)):
            time.sleep(0.1)
        return df + 1

    func_1 = FeatureConstructor(func_1)
    func_1.parallel = remote_1

    def func_2(df):
        for _ in pbar(range(10), title='test'):
            time.sleep(0.1)
        return func_1(df)**2

    func_2 = FeatureConstructor(func_2)
    func_2.parallel = remote_2

    assert len(run_manager.scheduled) == 0
    res = run_manager.run([func_2],
                          frame=int_frame,
                          train=True,
                          fold='preview',
                          ret=True,
                          report=report)
    res_frame = res['func_2']
    run_manager.merge_scheduled()

    expected = (int_frame + 1)**2
    # Compare the underlying values: the builtin ``all()`` applied to a
    # DataFrame iterates over its column labels, so ``all(a == b)`` passes
    # for any frame whose column names are truthy.
    assert (res_frame.values == expected.values).all()
def _preview(obj):
    """Compute ``obj`` on the head of ``frame`` for each size and display it.

    ``obj`` may be a ``GenericFeatureConstructor`` (called without arguments
    to get an instance), any ``BaseFeatureConstructor`` (used as is) or
    another object, which is wrapped in a ``FeatureConstructor``. ``frame``,
    ``sizes``, ``train`` and ``parallel`` come from names defined outside
    this function.

    Whatever happens during the runs, scheduled results are merged and
    ``cfg.feature_computing_report`` is set back to ``None`` afterwards.
    """
    report = FeatureComputingReport(feature_list)

    if isinstance(obj, GenericFeatureConstructor):
        feature_constructor = obj()
    elif isinstance(obj, BaseFeatureConstructor):
        feature_constructor = obj
    else:
        # in case of stl preview
        feature_constructor = FeatureConstructor(obj)
    feature_constructor.parallel = parallel

    try:
        cfg.feature_computing_report = report
        for n_rows in sizes:
            head = frame.head(n_rows)
            computed = run_manager.run(
                [feature_constructor],
                frame=head,
                train=train,
                fold='preview',
                ret=True,
                report=report,
            )
            report.finish()
            display(computed[feature_constructor.name])
    finally:
        run_manager.merge_scheduled()
        cfg.feature_computing_report = None
def test_computed_target(frame, folds, clear_caches, report, remote_1, remote_2):
    """Check that targets produced by feature constructors are split per fold.

    ``target_1`` is ``int * 2`` and ``target_2`` is ``float_rand ** 2``; for
    every fold the train/valid target columns must match these expressions
    evaluated on the corresponding rows of ``frame``.
    """
    cfg.preview_mode = False

    def target_feature_1(df):
        res = pd.DataFrame(index=df.index)
        res['target_1'] = df[['int']] * 2
        return res

    target_feature_1 = FeatureConstructor(target_feature_1)
    target_feature_1.parallel = remote_1

    def target_feature_2(df):
        res = pd.DataFrame(index=df.index)
        res['target_2'] = df[['float_rand']]**2
        return res

    target_feature_2 = FeatureConstructor(target_feature_2)
    target_feature_2.parallel = remote_2

    fs = FeatureSet(
        [target_feature_1, target_feature_2],
        train_frame=frame,
        targets=['target_1', 'target_2'],
    )
    cfs = fs.split(folds)
    cfs.compute(report=report)
    assert len(run_manager.scheduled) == 0

    for i, (idx_train, idx_valid) in enumerate(folds):
        train_target = cfs.fold(i).train_target
        valid_target = cfs.fold(i).valid_target

        expected_train_1 = frame['int'].values[idx_train] * 2
        assert train_target[:, 0] == pytest.approx(expected_train_1)
        expected_train_2 = frame['float_rand'].values[idx_train]**2
        assert train_target[:, 1] == pytest.approx(expected_train_2)

        expected_valid_1 = frame['int'].values[idx_valid] * 2
        assert valid_target[:, 0] == pytest.approx(expected_valid_1)
        expected_valid_2 = frame['float_rand'].values[idx_valid]**2
        assert valid_target[:, 1] == pytest.approx(expected_valid_2)
    assert len(run_manager.scheduled) == 0
def test_no_stateful(frame, folds, clear_caches, report, remote_1, remote_2):
    """Check fold sizes of a feature set built only from stateless features.

    For every fold the number of rows in the train and valid parts must equal
    the length of the corresponding index arrays in ``folds``.
    """
    def simple_feature(df):
        return df[['int']] * 2

    simple_feature = FeatureConstructor(simple_feature)
    simple_feature.parallel = remote_1

    def another_simple_feature(df):
        return df[['int_rand']] * 2

    another_simple_feature = FeatureConstructor(another_simple_feature)
    another_simple_feature.parallel = remote_2

    fs = FeatureSet(
        [simple_feature, another_simple_feature],
        train_frame=frame,
        targets='float_rand',
    )
    cfs = fs.split(folds)
    cfs.compute(report=report)
    assert len(run_manager.scheduled) == 0

    for i, (idx_train, idx_valid) in enumerate(folds):
        n_train_rows = cfs.fold(i).train.shape[0]
        assert n_train_rows == idx_train.shape[0]
        n_valid_rows = cfs.fold(i).valid.shape[0]
        assert n_valid_rows == idx_valid.shape[0]
    assert len(run_manager.scheduled) == 0
def call(self, *args, **kwargs):
    """Create a concrete feature constructor from this generic one.

    Positional arguments are matched, in order, with ``self.arg_names``;
    keyword arguments override entries of ``self.kwargs``. Arguments that are
    not passed keep the value stored in ``self.kwargs``.

    Returns:
        A ``FeatureConstructor`` built from
        ``self.modify(self.func, instance_kwargs)``. Its name is this
        constructor's name followed by the argument values, and its
        ``requirements``, ``parallel``, ``cache`` and ``verbose`` are taken
        from ``self``.

    Raises:
        TypeError: If more positional arguments are given than there are
            names in ``self.arg_names``, or if an argument is passed both
            positionally and by keyword.
        ValueError: If a keyword argument is not a key of ``self.kwargs``.
    """
    # zip() below would silently drop surplus positional arguments, so
    # reject them up front the way a regular function call would.
    if len(args) > len(self.arg_names):
        raise TypeError(
            f"{self.name}() takes at most {len(self.arg_names)} positional "
            f"arguments but {len(args)} were given")

    # Work on a copy so that the stored defaults are never modified.
    instance_kwargs = copy(self.kwargs)
    for k, v in zip(self.arg_names, args):
        if k in kwargs:
            raise TypeError(
                f"{self.name}() got multiple values for argument {repr(k)}"
            )
        instance_kwargs[k] = v
    for k, v in kwargs.items():
        if k not in self.kwargs:
            raise ValueError(f"Unexpected arg: {k}")
        instance_kwargs[k] = v

    res = FeatureConstructor(self.modify(self.func, instance_kwargs),
                             internal=True)
    res.name = f"{self.name}__" + "_".join(
        map(str, instance_kwargs.values()))
    res.description = f"An instance of generic feature constructor <tt>{self.name}</tt>"
    # The displayed source is the call expression with positional values only.
    call_args = ', '.join(repr(instance_kwargs[k]) for k in self.arg_names)
    res.source = f"{self.name}({call_args})"
    res.additional_source = self.source
    res.requirements = self.requirements
    res.dependencies = dict()
    res.parallel = self.parallel
    res.cache = self.cache
    res.verbose = self.verbose
    return res