示例#1
0
def test_run_cache_states(clear_caches, int_frame, other_int_frame,
                          run_manager, report, remote_1):
    """Check that state saved by a train run is seen by a later non-train run.

    ``func_1`` stores a random integer in ``df.state['tmp']`` only when
    ``df.train`` is truthy and always emits the stored value as column
    ``tmp``. Both runs use the same fold name; the test passes when the
    mean of ``tmp`` is identical in the two results.
    """
    time.sleep(0.001)

    def func_1(df):
        # Re-seed from the clock so separate invocations draw different values.
        np.random.seed(int(time.time() * 1000000) % 1000000)
        if df.train:
            df.state['tmp'] = np.random.randint(1000)
        stored = df.state['tmp']
        out = pd.DataFrame(index=df.index)
        out['tmp'] = stored
        return out

    func_1 = FeatureConstructor(func_1)
    func_1.parallel = remote_1

    fold_name = "test_run_cache_states"

    # First pass: train=True, so the state gets written.
    train_results = run_manager.run([func_1],
                                    frame=int_frame,
                                    train=True,
                                    fold=fold_name,
                                    ret=True,
                                    report=report)
    res_1 = train_results['func_1']
    time.sleep(0.001)
    run_manager.merge_scheduled()

    # Second pass: train=False on another frame, same fold name.
    inference_results = run_manager.run([func_1],
                                        frame=other_int_frame,
                                        train=False,
                                        fold=fold_name,
                                        ret=True,
                                        report=report)
    res_2 = inference_results['func_1']

    assert res_1.tmp.mean() == res_2.tmp.mean()
示例#2
0
def test_run_cache_columns(clear_caches, int_frame, other_int_frame,
                           run_manager, report, remote_1):
    """Check that the columns of the train run carry over to a non-train run.

    ``func_1`` adds column ``kk`` when ``df.train`` is truthy and ``mm``
    otherwise. The test expects both the train result and the subsequent
    ``train=False`` result on the same fold to expose columns ``w`` and
    ``kk``.
    """
    def func_1(df):
        res = pd.DataFrame({'w': range(len(df))})
        column, value = ('kk', 15) if df.train else ('mm', 18)
        res[column] = value
        return res

    func_1 = FeatureConstructor(func_1)
    func_1.parallel = remote_1

    fold_name = 'test_run_cache_columns'
    expected_columns = ['w', 'kk']

    train_results = run_manager.run([func_1],
                                    frame=int_frame,
                                    train=True,
                                    fold=fold_name,
                                    ret=True,
                                    report=report)
    res_1 = train_results['func_1']
    run_manager.merge_scheduled()
    assert all(res_1.columns == expected_columns)
    assert set(func_1.columns) == set(expected_columns)

    inference_results = run_manager.run([func_1],
                                        frame=other_int_frame,
                                        train=False,
                                        fold=fold_name,
                                        ret=True,
                                        report=report)
    res_2 = inference_results['func_1']
    assert all(res_2.columns == expected_columns)
示例#3
0
def test_deeply_nested_fc_run(clear_caches, int_frame, run_manager, report,
                              remote_1, remote_2, remote_3):
    """Run a three-level chain of feature constructors and verify the values.

    ``func_1`` adds one, ``func_2`` squares ``func_1`` and ``func_3`` squares
    ``func_2``, so the result of ``func_3`` must equal
    ``(int_frame + 1) ** 4`` element-wise.
    """
    # `am` is defined outside this block.
    # NOTE(review): presumably resets shared remote state between tests.
    am.clear.remote()

    def func_1(df):
        return df + 1

    func_1 = FeatureConstructor(func_1)
    func_1.parallel = remote_1

    def func_2(df):
        return func_1(df)**2

    func_2 = FeatureConstructor(func_2)
    func_2.parallel = remote_2

    def func_3(df):
        return func_2(df)**2  # + func_1(df)

    func_3 = FeatureConstructor(func_3)
    func_3.parallel = remote_3
    assert len(run_manager.scheduled) == 0
    res = run_manager.run([func_3],
                          frame=int_frame,
                          train=True,
                          fold='preview',
                          ret=True,
                          report=report)
    res_frame = res['func_3']
    run_manager.merge_scheduled()
    # The builtin ``all()`` applied to a DataFrame iterates its column labels,
    # not its values, so the comparison is reduced explicitly instead.
    # Expected value: ((x + 1) ** 2) ** 2 == (x + 1) ** 4.
    expected = (int_frame + 1)**4
    assert (res_frame == expected).all().all()
示例#4
0
def test_scheduler_cache(clear_caches,
                         int_frame,
                         run_manager,
                         report,
                         remote_2,
                         remote_3,
                         remote_1=True):
    """Check that one run reuses a single result of a shared dependency.

    ``func_1`` adds a random offset to the frame; ``func_2`` and ``func_3``
    both simply return ``func_1(df)``. When all three are computed in one
    ``run_manager.run`` call, the three results are expected to be equal
    element-wise.

    NOTE(review): ``remote_1`` has a default value, so pytest will not inject
    a fixture for it and it is always ``True`` here; confirm this is intended.
    """
    def func_1(df):
        return df + np.random.randint(1000)

    func_1 = FeatureConstructor(func_1)
    func_1.parallel = remote_1

    def func_2(df):
        return func_1(df)

    func_2 = FeatureConstructor(func_2)
    func_2.parallel = remote_2

    def func_3(df):
        return func_1(df)

    func_3 = FeatureConstructor(func_3)
    func_3.parallel = remote_3
    res = run_manager.run([func_1, func_2, func_3],
                          frame=int_frame,
                          train=True,
                          fold='preview',
                          ret=True,
                          report=report)
    # The builtin ``all()`` applied to a DataFrame iterates its column labels,
    # not its values, so each comparison is reduced explicitly instead.
    assert (res['func_1'] == res['func_2']).all().all()
    assert (res['func_1'] == res['func_3']).all().all()
示例#5
0
 def register(function):
     """Wrap ``function`` as a feature constructor, configure and register it.

     A ``GenericFeatureConstructor`` is used as is; anything else is wrapped
     in ``FeatureConstructor``. ``cache``, ``parallel`` and ``verbose`` are
     taken from the enclosing scope. Returns the configured constructor.
     """
     already_generic = isinstance(function, GenericFeatureConstructor)
     feature_constructor = (function if already_generic else
                            FeatureConstructor(function))
     # Apply the settings captured from the enclosing scope, in fixed order.
     for attribute, value in (('cache', cache), ('parallel', parallel),
                              ('verbose', verbose)):
         setattr(feature_constructor, attribute, value)
     feature_list.register(feature_constructor)
     update_dashboard()
     return feature_constructor
示例#6
0
def test_fc_run(clear_caches, int_frame, run_manager, report, remote):
    """Run a single squaring feature constructor and verify its output.

    The result stored under ``'func'`` must equal ``int_frame ** 2``
    element-wise, and nothing may be scheduled before the run starts.
    """
    # `am` is defined outside this block.
    # NOTE(review): presumably resets shared remote state between tests.
    am.clear.remote()

    def func(df):
        return df**2

    fc = FeatureConstructor(func)
    fc.parallel = remote
    assert len(run_manager.scheduled) == 0
    res = run_manager.run([fc],
                          frame=int_frame,
                          train=True,
                          fold='preview',
                          ret=True,
                          report=report)
    res_frame = res['func']
    run_manager.merge_scheduled()
    # The builtin ``all()`` applied to a DataFrame iterates its column labels,
    # not its values, so the comparison is reduced explicitly instead.
    assert (res_frame == int_frame**2).all().all()
示例#7
0
def test_static_cache(clear_caches, int_frame, run_manager, report, remote_1):
    """Check that a repeated run on the same frame and fold is stable.

    ``func_1`` adds a fresh random offset each time it is actually executed,
    so the two results are only expected to match element-wise when the
    second run does not recompute the feature.
    """
    def func_1(df):
        return df + np.random.randint(1000)

    func_1 = FeatureConstructor(func_1)
    func_1.parallel = remote_1
    res_1 = run_manager.run([func_1],
                            frame=int_frame,
                            train=True,
                            fold='test_static_cache',
                            ret=True,
                            report=report)['func_1']
    run_manager.merge_scheduled()
    assert len(run_manager.scheduled) == 0
    res_2 = run_manager.run([func_1],
                            frame=int_frame,
                            train=True,
                            fold='test_static_cache',
                            ret=True,
                            report=report)['func_1']
    # The builtin ``all()`` applied to a DataFrame iterates its column labels,
    # not its values, so the comparison is reduced explicitly instead.
    assert (res_1 == res_2).all().all()
示例#8
0
def test_cv_feature_set(frame, folds, clear_caches, report, remote_1,
                        remote_2):
    """Compute a split feature set that contains one stateful feature.

    ``stateful_feature`` stores the mean of column ``int`` in ``df.state``
    when ``df.train`` is truthy and emits the stored value as column
    ``mean``. For every fold, column 1 of both the train and valid parts
    must hold a single distinct value, and the two values must be
    approximately equal.
    """
    def simple_feature(df):
        return df[['int']] * 2

    simple_feature = FeatureConstructor(simple_feature)
    simple_feature.parallel = remote_1

    def stateful_feature(df):
        if df.train:
            df.state['mean'] = df['int'].mean()
        stored_mean = df.state['mean']
        out = pd.DataFrame(index=df.index)
        out['mean'] = stored_mean
        return out

    stateful_feature = FeatureConstructor(stateful_feature)
    stateful_feature.parallel = remote_2

    feature_set = FeatureSet([simple_feature], [stateful_feature],
                             train_frame=frame,
                             targets='float_rand')
    cfs = feature_set.split(folds)
    cfs.compute(report=report)
    assert len(run_manager.scheduled) == 0

    for fold_idx, _ in enumerate(folds):
        train_mean = cfs.fold(fold_idx).train[:, 1]
        valid_mean = cfs.fold(fold_idx).valid[:, 1]
        # Each part must be constant ...
        assert len(set(train_mean)) == 1
        assert len(set(valid_mean)) == 1
        # ... and the valid part must carry the value seen on train.
        assert train_mean.mean() == pytest.approx(valid_mean.mean())
    assert len(run_manager.scheduled) == 0
示例#9
0
def test_auxiliary(frame, clear_caches, report, remote_1, remote_2):
    """Check that auxiliary columns of a feature set are exposed via ``aux``.

    Two computed columns (``aux_1``, ``aux_2``) and one raw column (``str``)
    are requested as auxiliary; after ``compute()`` each must match the
    value derived directly from ``frame``.
    """
    cfg.preview_mode = False

    def aux_feature_1(df):
        out = pd.DataFrame(index=df.index)
        out['aux_1'] = df[['int']] * 2
        return out

    aux_feature_1 = FeatureConstructor(aux_feature_1)
    aux_feature_1.parallel = remote_1

    def aux_feature_2(df):
        out = pd.DataFrame(index=df.index)
        out['aux_2'] = df[['float_rand']]**2
        return out

    aux_feature_2 = FeatureConstructor(aux_feature_2)
    aux_feature_2.parallel = remote_2

    feature_set = FeatureSet([aux_feature_1, aux_feature_2],
                             train_frame=frame,
                             targets=['int'],
                             auxiliary=['aux_1', 'aux_2', 'str'])
    feature_set.compute()
    assert len(run_manager.scheduled) == 0

    aux = feature_set.aux

    expected_aux_1 = frame['int'].values * 2
    assert aux['aux_1'].values == pytest.approx(expected_aux_1)
    expected_aux_2 = frame['float_rand'].values**2
    assert aux['aux_2'].values == pytest.approx(expected_aux_2)
    pd.testing.assert_series_equal(aux['str'], frame['str'])

    assert len(run_manager.scheduled) == 0
示例#10
0
def test_nested_progressbar(clear_caches, int_frame, run_manager, report,
                            remote_1, remote_2):
    """Run nested feature constructors that each iterate through ``pbar``.

    ``func_2`` squares the output of ``func_1`` (``df + 1``), so the result
    must equal ``(int_frame + 1) ** 2`` element-wise.
    """
    # `am` is defined outside this block.
    # NOTE(review): presumably resets shared remote state between tests.
    am.clear.remote()

    def func_1(df):
        for _ in pbar(range(10)):
            time.sleep(0.1)
        return df + 1

    func_1 = FeatureConstructor(func_1)
    func_1.parallel = remote_1

    def func_2(df):
        for _ in pbar(range(10), title='test'):
            time.sleep(0.1)
        return func_1(df)**2

    func_2 = FeatureConstructor(func_2)
    func_2.parallel = remote_2
    assert len(run_manager.scheduled) == 0
    res = run_manager.run([func_2],
                          frame=int_frame,
                          train=True,
                          fold='preview',
                          ret=True,
                          report=report)
    res_frame = res['func_2']
    run_manager.merge_scheduled()
    # The builtin ``all()`` applied to a DataFrame iterates its column labels,
    # not its values, so the comparison is reduced explicitly instead.
    assert (res_frame == (int_frame + 1)**2).all().all()
示例#11
0
 def _preview(obj):
     """Run ``obj`` on the head of ``frame`` for every size and display it.

     ``obj`` is turned into a feature constructor: a
     ``GenericFeatureConstructor`` is called with no arguments, any other
     ``BaseFeatureConstructor`` is used as is, and everything else is wrapped
     in ``FeatureConstructor``. ``sizes``, ``frame``, ``train`` and
     ``parallel`` come from the enclosing scope. Scheduled results are merged
     and the global report is reset even when a run fails.
     """
     report = FeatureComputingReport(feature_list)
     if isinstance(obj, GenericFeatureConstructor):
         feature_constructor = obj()
     elif isinstance(obj, BaseFeatureConstructor):
         feature_constructor = obj
     else:
         # Plain callable, e.g. in case of stl preview.
         feature_constructor = FeatureConstructor(obj)
     feature_constructor.parallel = parallel
     try:
         cfg.feature_computing_report = report
         for size in sizes:
             preview_frame = frame.head(size)
             results = run_manager.run([feature_constructor],
                                       frame=preview_frame,
                                       train=train,
                                       fold='preview',
                                       ret=True,
                                       report=report)
             report.finish()
             display(results[feature_constructor.name])
     finally:
         run_manager.merge_scheduled()
         cfg.feature_computing_report = None
示例#12
0
def test_computed_target(frame, folds, clear_caches, report, remote_1,
                         remote_2):
    """Check that targets can be columns produced by feature constructors.

    ``target_1`` is ``int * 2`` and ``target_2`` is ``float_rand ** 2``. For
    every fold the train and valid target arrays must match those values on
    the corresponding fold indices.
    """
    cfg.preview_mode = False

    def target_feature_1(df):
        out = pd.DataFrame(index=df.index)
        out['target_1'] = df[['int']] * 2
        return out

    target_feature_1 = FeatureConstructor(target_feature_1)
    target_feature_1.parallel = remote_1

    def target_feature_2(df):
        out = pd.DataFrame(index=df.index)
        out['target_2'] = df[['float_rand']]**2
        return out

    target_feature_2 = FeatureConstructor(target_feature_2)
    target_feature_2.parallel = remote_2

    feature_set = FeatureSet([target_feature_1, target_feature_2],
                             train_frame=frame,
                             targets=['target_1', 'target_2'])
    cfs = feature_set.split(folds)
    cfs.compute(report=report)
    assert len(run_manager.scheduled) == 0

    for fold_idx, (idx_train, idx_valid) in enumerate(folds):
        train_target = cfs.fold(fold_idx).train_target
        valid_target = cfs.fold(fold_idx).valid_target

        # Column 0 is target_1, column 1 is target_2 (order of ``targets``).
        expected_train_1 = frame['int'].values[idx_train] * 2
        assert train_target[:, 0] == pytest.approx(expected_train_1)
        expected_train_2 = frame['float_rand'].values[idx_train]**2
        assert train_target[:, 1] == pytest.approx(expected_train_2)
        expected_valid_1 = frame['int'].values[idx_valid] * 2
        assert valid_target[:, 0] == pytest.approx(expected_valid_1)
        expected_valid_2 = frame['float_rand'].values[idx_valid]**2
        assert valid_target[:, 1] == pytest.approx(expected_valid_2)
    assert len(run_manager.scheduled) == 0
示例#13
0
def test_no_stateful(frame, folds, clear_caches, report, remote_1, remote_2):
    """Compute a split feature set built only from stateless features.

    For every fold, the number of rows in the train and valid parts must
    equal the number of train and valid indices of that fold.
    """
    def simple_feature(df):
        return df[['int']] * 2

    simple_feature = FeatureConstructor(simple_feature)
    simple_feature.parallel = remote_1

    def another_simple_feature(df):
        return df[['int_rand']] * 2

    another_simple_feature = FeatureConstructor(another_simple_feature)
    another_simple_feature.parallel = remote_2

    feature_set = FeatureSet([simple_feature, another_simple_feature],
                             train_frame=frame,
                             targets='float_rand')
    cfs = feature_set.split(folds)
    cfs.compute(report=report)
    assert len(run_manager.scheduled) == 0

    for fold_idx, (idx_train, idx_valid) in enumerate(folds):
        train_rows = cfs.fold(fold_idx).train.shape[0]
        assert train_rows == idx_train.shape[0]
        valid_rows = cfs.fold(fold_idx).valid.shape[0]
        assert valid_rows == idx_valid.shape[0]
    assert len(run_manager.scheduled) == 0
示例#14
0
文件: generic.py 项目: konodyuk/kts
 def call(self, *args, **kwargs):
     """Build a concrete ``FeatureConstructor`` from this generic one.

     Positional arguments are matched to ``self.arg_names`` in order and
     keyword arguments override entries of a copy of ``self.kwargs``.

     Raises:
         TypeError: an argument is given both positionally and by keyword.
         ValueError: a keyword is not a key of ``self.kwargs``.

     Returns:
         The new constructor, carrying a derived name, description and
         source plus this object's requirements, ``parallel``, ``cache``
         and ``verbose`` settings.
     """
     instance_kwargs = copy(self.kwargs)

     # Bind positional arguments, rejecting duplicates passed by keyword.
     for k, positional_value in zip(self.arg_names, args):
         if k in kwargs:
             raise TypeError(
                 f"{self.name}() got multiple values for argument {repr(k)}"
             )
         instance_kwargs[k] = positional_value

     # Apply keyword overrides; only known parameter names are accepted.
     for k, keyword_value in kwargs.items():
         if k not in self.kwargs:
             raise ValueError(f"Unexpected arg: {k}")
         instance_kwargs[k] = keyword_value

     modified_func = self.modify(self.func, instance_kwargs)
     res = FeatureConstructor(modified_func, internal=True)

     # The instance name is the generic name plus all bound values.
     value_suffix = "_".join(str(value) for value in instance_kwargs.values())
     res.name = f"{self.name}__" + value_suffix
     res.description = f"An instance of generic feature constructor <tt>{self.name}</tt>"
     res.source = f"{self.name}({', '.join(f'{repr(instance_kwargs[k])}' for k in self.arg_names)})"
     res.additional_source = self.source
     res.requirements = self.requirements
     res.dependencies = dict()
     res.parallel = self.parallel
     res.cache = self.cache
     res.verbose = self.verbose
     return res