Example #1
0
def run(config_path):
    """Download the pipeline config from GCS, build the d6tflow task graph and run it.

    Steps: fetch the YAML config from the bucket referenced by *config_path*,
    pre-process it, point d6tflow's output directory at the local download
    folder, run the production report task, then load and persist the trained
    model.

    Args:
        config_path: bucket URI of the config file (e.g. 'gs://bucket/path/config.yaml').

    Returns:
        True on completion.
    """
    download_dir = './experimentos/producao/'
    config_filename = 'config.yaml'

    # Bucket name: drop the first 5 chars (presumably the 'gs://' scheme —
    # TODO confirm) and take the first path segment.
    bucket = config_path[5:].split('/')[0]

    Utils.download_file_from_gcp(config_path,
                                 local_path=download_dir,
                                 filename=config_filename,
                                 bucket_name=bucket)

    # Environment-specific settings (development vs. production).
    config = config_pre_tratamento(download_dir + config_filename)
    project = config['project']  # read up-front; fails fast if the key is missing
    config['caminho_saida_dados'] = download_dir
    d6tflow.set_dir(config['caminho_saida_dados'])

    task_params = get_tasks(config)
    report_task = tasks.TaskPrdReport(**task_params)

    d6tflow.preview(report_task)
    d6tflow.run(report_task, workers=config['workers'])

    # Reload the trained model produced by the upstream training task and save it.
    trained_model = tasks.TaskTrainModel(
        task_engineer_params=task_params['task_engineer_params'],
        task_te_params=task_params['task_te_params'],
        task_ps_params=task_params['task_ps_params'],
        task_model_params=task_params['task_model_params']).output().load()
    salvar_modelo(report_task, trained_model, config)
    return True
Example #2
0
File: main.py  Project: savadev/d6tflow
def test_plot(cleanup):
    """Matplotlib tasks round-trip: save, check existence, invalidate —
    for both a single-plot task and a multi-output task."""
    # Two small frames whose bar charts serve as matplotlib fixtures.
    df1 = pd.DataFrame({'a': range(10)})
    df2 = pd.DataFrame({'b': range(10, 20)})

    plt1 = df1.plot.bar()
    plt2 = df2.plot.bar()

    # Single-output matplotlib task: run -> output exists -> invalidate -> gone.
    class TaskPlot(d6tflow.tasks.TaskMatplotlib):
        def run(self):
            self.save(plt1)

    TaskPlot().run()
    assert TaskPlot().output().exists()
    TaskPlot().invalidate(confirm=False)
    assert not TaskPlot().output().exists()

    # Multi-output task: two named plots persisted by a single save() call.
    class TaskPlot2(d6tflow.tasks.TaskMatplotlib):
        persist = ['plot1', 'plot2']

        def run(self):
            self.save({'plot1': plt1, 'plot2': plt2})

    TaskPlot2().run()
    assert TaskPlot2().complete()
    TaskPlot2().invalidate(confirm=False)
    assert not TaskPlot2().complete()

    # Smoke-check preview rendering with clipped parameter strings.
    d6tflow.preview(TaskPlot2(), clip_params=True)
Example #3
0
File: main.py  Project: keithsu0215/d6tflow
def test_preview():
    """Preview output reflects task state: all PENDING before a run, all
    COMPLETE afterwards, and a parameter change re-pends only the tail task."""
    first = Task1()
    second = Task2()
    final = Task3()
    d6tflow.invalidate_upstream(final, confirm=False)

    import io
    from contextlib import redirect_stdout

    # Fresh (invalidated) pipeline: every task should be pending.
    with io.StringIO() as captured, redirect_stdout(captured):
        d6tflow.preview(final)
        report = captured.getvalue()
        assert report.count('PENDING') == 3
        assert report.count('COMPLETE') == 0

    # After running the flow, every task should be complete.
    with io.StringIO() as captured, redirect_stdout(captured):
        d6tflow.run(final)
        d6tflow.preview(final)
        report = captured.getvalue()
        assert report.count('PENDING') == 0
        assert report.count('COMPLETE') == 3

    # A changed parameter invalidates only the task that depends on it.
    with io.StringIO() as captured, redirect_stdout(captured):
        d6tflow.preview(Task3(do_preprocess=False))
        report = captured.getvalue()
        assert report.count('PENDING') == 1
        assert report.count('COMPLETE') == 2
Example #4
0
 def preview(self, func_to_preview, params: dict):
     """Preview the d6tflow task registered for *func_to_preview*.

     If *params* is a truthy dict, preview a single instantiation with those
     parameters. Otherwise, if parameter sets were recorded for this step in
     ``self.params_used``, preview one instantiation per recorded set. With
     neither, preview the step with its defaults.
     """
     name = func_to_preview.__name__
     if params:
         d6tflow.preview(self.steps[name](**params))
         return
     recorded_sets = self.params_used.get(name, None)
     if recorded_sets:
         # Avoid shadowing the `params` argument inside the loop.
         for recorded in self.params_used[name]:
             d6tflow.preview(self.steps[name](**recorded))
     else:
         d6tflow.preview(self.steps[name]())
Example #5
0
import d6tflow
import cfg, tasks, visualize

# Check task dependencies and their execution status before running anything
d6tflow.preview(tasks.TaskTrain())

# Execute the model training task including dependencies. See https://d6tflow.readthedocs.io/en/latest/run.html
d6tflow.run(tasks.TaskTrain())

# Use the persisted task output for evaluation plots
visualize.accuracy()
visualize.plot_importances()

# change parameter and rerun, see https://d6tflow.readthedocs.io/en/latest/advparam.html
d6tflow.run(tasks.TaskTrain(do_preprocess=False))
visualize.accuracy(do_preprocess=False)  # task output is parameter specific

# rerun flow after code changes: reload the edited modules first
import importlib
importlib.reload(cfg)
importlib.reload(tasks)

# say you changed TaskGetData, reset all tasks depending on TaskGetData
d6tflow.invalidate_downstream(tasks.TaskGetData(), tasks.TaskTrain())

# Downstream tasks are now pending again; preview confirms, run re-executes
d6tflow.preview(tasks.TaskTrain())
d6tflow.run(tasks.TaskTrain())
Example #6
0
File: main.py  Project: savadev/d6tflow
def test_pipes_advanced(cleanup_pipe):
    """End-to-end d6tpipe integration: push task output to a remote pipe,
    pull it back from an external task in a fresh session, and verify
    multi-pipe push/pull previews."""
    import d6tflow.pipes
    # First init with local_pipe=True must yield a local pipe implementation...
    d6tflow.pipes.init(cfg['d6tpipe_pipe1'],
                       profile=cfg['d6tpipe_profile'],
                       local_pipe=True,
                       reset=True)
    assert 'Local' in d6tflow.pipes.get_pipe().__class__.__name__
    # ...then re-init against the real (remote) pipe for the rest of the test.
    d6tflow.pipes.init(cfg['d6tpipe_pipe1'],
                       profile=cfg['d6tpipe_profile'],
                       reset=True)

    class Task1(d6tflow.tasks.TaskPqPandas):
        def run(self):
            self.save(df)

    t1 = Task1()
    pipe1 = t1.get_pipe()
    pipedir = pipe1.dirpath
    t1filepath = t1.output().path
    # Pipe-relative POSIX path of the task's output file.
    t1file = str(PurePosixPath(t1filepath.relative_to(pipedir)))

    d6tflow.preview(t1)
    assert d6tflow.run(t1)
    assert t1.complete()

    # Best-effort remote cleanup; fuckit suppresses any errors if absent.
    with fuckit:
        pipe1._pullpush_luigi([t1file], op='remove')

    # Remote is empty, so there is nothing to pull and one file to push.
    assert pipe1.scan_remote(cached=False) == []
    assert t1.pull_preview() == []
    assert t1.push_preview() == [t1file]
    assert d6tflow.pipes.all_push_preview(t1) == {
        cfg['d6tpipe_pipe1']: [t1file]
    }
    assert d6tflow.pipes.all_push(t1) == {cfg['d6tpipe_pipe1']: [t1file]}

    # Redefine Task1 as external: its data now lives only in pipe1.
    class Task1(d6tflow.tasks.TaskPqPandas):
        external = True
        pipename = cfg['d6tpipe_pipe1']

    class Task2(d6tflow.tasks.TaskPqPandas):
        persist = ['df2', 'df4']

        def requires(self):
            return Task1()

        def run(self):
            df2fun(self)

    # Simulate a fresh session: reload d6tflow, clear cached pipes, and init
    # a second pipe under a different profile.
    import importlib
    importlib.reload(d6tflow)
    importlib.reload(d6tflow.pipes)
    d6tflow.cache.pipes = {}
    d6tflow.pipes.init(cfg['d6tpipe_pipe2'],
                       profile=cfg['d6tpipe_profile2'],
                       reset=True)
    t1 = Task1()
    # The external task still resolves to pipe1 and is incomplete locally
    # until its data is pulled from the remote.
    assert t1.get_pipename() == cfg['d6tpipe_pipe1']
    assert not t1.complete()
    assert t1.pull_preview() == [str(t1file)]
    assert d6tflow.pipes.all_pull_preview(t1) == {
        cfg['d6tpipe_pipe1']: [t1file]
    }
    assert t1.pull() == [str(t1file)]
    assert t1.complete()
    assert t1.output().load().equals(df)

    t2 = Task2()
    d6tflow.show([t2])
    assert d6tflow.run([t2])  # run as list

    pipe2 = t2.get_pipe()
    pipedir = t2.get_pipe().dirpath
    # assert False
    # Pipe-relative paths of both persisted outputs ('df2', 'df4').
    t2files = [
        str(PurePosixPath(p.path.relative_to(pipedir)))
        for p in t2.output().values()
    ]

    # t2's outputs push to pipe2 (the session's default), not pipe1.
    assert d6tflow.pipes.all_push_preview(t2) == {
        cfg['d6tpipe_pipe2']: t2files
    }

    # cleanup
    pipe1._pullpush_luigi([t1file], op='remove')
    assert pipe1.scan_remote(cached=False) == []
Example #7
0
    idx3 = luigi.Parameter(default='test3')
    export = False

    def run(self):
        # Persist the module-level dataframe under both output names.
        self.save({'df': df, 'df2': df})


@d6tflow.requires(Task1A, Task1B, Task1C)
class Task1All(d6tflow.tasks.TaskCache):
    """In-memory (cache) task depending on Task1A/B/C; saves the module df."""
    def run(self):
        self.save(df)

# Run the whole flow, then invalidate everything upstream and preview the
# resulting (all-pending) dependency tree.
d6tflow.run(Task1All())
d6tflow.invalidate_upstream(Task1All(), confirm=False)
d6tflow.preview(Task1All())

task = Task1All()

#**************************************************
# tests
#**************************************************

import pytest

import d6tflow.pipes


def readfile(file_dir):
    """Return the full text content of the file at *file_dir*.

    Bug fix: the original read the file into a local variable and returned
    None, so callers never received the content.

    Args:
        file_dir: path of the file to read.

    Returns:
        The file's entire contents as a string.
    """
    with open(file_dir, 'r') as f:
        return f.read()
Example #8
0
File: tmp.py  Project: wl1830/d6tflow
import importlib
import d6tflow
import luigi

import pandas as pd
# Shared fixture dataframe saved by the tasks below.
df = pd.DataFrame({'a': range(10)})


class Task1(d6tflow.tasks.TaskCache):
    """In-memory task with two string parameters; saves the fixture df."""
    persist = ['df']
    idx = luigi.Parameter(default='test')
    idx2 = luigi.Parameter(default='test')

    def run(self):
        self.save({'df': df})


@d6tflow.inherits(Task1)
@d6tflow.clone_parent
class Task2(d6tflow.tasks.TaskCache):
    """Inherits Task1's parameters and automatically requires a clone of it."""
    def run(self):
        self.save({'df': df})

# Preview Task2's dependency tree; clip_params truncates long parameter reprs.
d6tflow.preview(Task2(), clip_params=True)
Example #9
0
import d6tflow

# Import workflow tasks and output visualizations
import flow_tasks, flow_viz

# Instantiate the terminal (most-downstream) task with parameters
params = {'data_size': 6, 'mini_batch_size': 2}
task = flow_tasks.TaskModelTrain(**params)

# optional: reset everything every time workflow is run
d6tflow.invalidate_upstream(task, confirm=False)

# Preview terminal task (clip_params shortens long parameter strings)
d6tflow.preview(task, clip_params=True)

# Run terminal task and all of its dependencies
d6tflow.run(task)

# Show output only once the terminal task has finished successfully
if task.complete():
    flow_viz.show_test_prints(params)
Example #10
0
File: functional.py  Project: Mozin/d6tflow
 def preview(self, func_to_preview, params=None):
     """Instantiate the task registered for *func_to_preview* (optionally
     with *params*) and return d6tflow's preview of it."""
     self._instantiate([func_to_preview], params=params)
     task = self.instantiated_tasks[func_to_preview.__name__]
     return d6tflow.preview(task)
Example #11
0
File: example.py  Project: wl1830/d6tflow
        return self.clone_parent()  # automatically pass parameters upstream

    def run(self):
        # Train the classifier selected by self.model on all but the last
        # column; 'y' is the target column.
        df_train = self.input().load()
        if self.model == 'ols':
            # NOTE(review): the 'ols' option actually instantiates logistic
            # regression, not ordinary least squares — confirm intent.
            model = sklearn.linear_model.LogisticRegression()
        elif self.model == 'svm':
            model = sklearn.svm.SVC()
        else:
            raise ValueError('invalid model selection')
        model.fit(df_train.iloc[:, :-1], df_train['y'])
        self.save(model)


# Check task dependencies and their execution status; the transcript below
# shows the expected output on a fresh (all-pending) flow.
d6tflow.preview(TaskTrain())
'''
└─--[TaskTrain-{'do_preprocess': 'False', 'model': 'ols'} (PENDING)]
   └─--[TaskPreprocess-{'do_preprocess': 'False'} (PENDING)]
      └─--[TaskGetData-{} (PENDING)]
'''

# Execute the model training task including dependencies
d6tflow.run(TaskTrain())
'''
===== Luigi Execution Summary =====

Scheduled 3 tasks of which:
* 3 ran successfully:
    - 1 TaskGetData()
    - 1 TaskPreprocess(do_preprocess=False)
Example #12
0
    symbols = ['CAT','WMT'],
    lookback_period = 1
    )
# Derive two variants of the base strategy dict defined above.
strategy2 = strategy1.copy()
strategy2['symbols']=['MSFT','FB'] # run another universe
strategy3 = strategy1.copy()
strategy3['date_start']= datetime.date(2019,1,1) # run another time period

#************************************************************
# run backtests
#************************************************************

# run backtest including necessary dependencies
for istrat, strategy in enumerate([strategy1,strategy2,strategy3]):
    print(f'run strategy #{istrat+1}')
    print(d6tflow.preview(Backtest(**strategy)))  # show which tasks will be run
    d6tflow.run(Backtest(**strategy))
    df_pnl1 = Backtest(**strategy).output()['pnl'].load() # load task output
    print(f'pnl strategy #{istrat+1}:', df_pnl1.sum().sum().round(3))

def dev():
    """Development helper: reset the signals task after code updates."""
    TradingSignals(**strategy1).reset() # reset after making updates


#************************************************************
# backtest output
#************************************************************

'''
run strategy #1
Example #13
0
# run workflow for model 1; the transcript below shows the expected summary
d6tflow.run(TaskTrain(**params_model1))

'''
===== Luigi Execution Summary =====

Scheduled 3 tasks of which:
* 3 ran successfully:
    - 1 TaskGetData()
    - 1 TaskPreprocess(do_preprocess=False)
    - 1 TaskTrain(do_preprocess=False, model=ols)
'''

# Intelligently rerun workflow after changing parameters: only the tasks
# affected by the new parameters become pending again
d6tflow.preview(TaskTrain(**params_model2))

'''
└─--[TaskTrain-{'do_preprocess': 'False'} (PENDING)]
   └─--[TaskPreprocess-{'do_preprocess': 'False'} (PENDING)]
      └─--[TaskGetData-{} (COMPLETE)] => this doesn't change and doesn't need to rerun
'''

# run workflow for model 2
d6tflow.run(TaskTrain(**params_model2))

# compare results from new model
# Load task output to pandas dataframe and model object for model evaluation

model1 = TaskTrain(**params_model1).output().load()
df_train = TaskPreprocess(**params_model1).output().load()
Example #14
0
        self.save(df_train)

class TaskTrain(dt.tasks.TaskPickle):
    """Train a RandomForest on the preprocessed data and pickle the model."""
    do_preprocess = li.BoolParameter(default = True)

    def requires(self):
        return TaskPreprocess(do_preprocess = self.do_preprocess)

    def run(self):
        df_train = self.input().load()
        # Upweight the positive class 2:1; 'LuxL' is the label column.
        class_weights = {0:1, 1:2}
        model = RandomForestClassifier(n_estimators = 400, random_state = 0, max_depth=20, class_weight=class_weights, min_samples_split=5,min_samples_leaf=4)
        model.fit(df_train.iloc[:, :-1], df_train['LuxL'])
        self.save(model)

# Preview the dependency tree, run the flow, then load the trained model.
dt.preview(TaskTrain())
dt.run(TaskTrain())

model = TaskTrain().output().load()

# run model on test data
def load_test_data(file_name):
    """Load the test-set CSV at *file_name* into a DataFrame.

    Bug fix: the original ignored the ``file_name`` parameter and read a
    global ``test_file_name`` instead, which raises NameError unless that
    global happens to exist.

    Args:
        file_name: path to the CSV file to read.

    Returns:
        pandas.DataFrame with the file's contents.
    """
    test_df = pd.read_csv(file_name)
    return test_df

def get_test_preds(test_data):
    """Return model predictions for *test_data*.

    Bug fix: the original assigned ``test_df = test``, referencing an
    undefined global instead of the ``test_data`` parameter. A copy is taken
    so the fill/scale steps do not mutate the caller's frame.

    Args:
        test_data: pandas.DataFrame whose last column is the label; all
            preceding columns are features.

    Returns:
        Array of predictions from the module-level ``model``.
    """
    test_df = test_data.copy()
    # Impute missing values with column means, then standardize the features.
    test_df = test_df.fillna(test_df.mean())
    test_df.iloc[:, :-1] = sk.preprocessing.scale(test_df.iloc[:, :-1])
    preds = model.predict(test_df.iloc[:, :-1])
    return preds
Example #15
0
            assert df_train.equals(self.input()['data'].load())
            assert df_train.equals(self.inputLoad(task='data'))
            assert df_trainX.equals(self.input()['data-drain']['x'].load())
            assert df_trainX.equals(self.inputLoad(task='data-train')[0])
            assert df_trainX.equals(
                self.inputLoad(task='data-train', as_dict=True)['x'])

        df_train['target_naive1'] = df_train['target'].mean()
        df_train['target_ols'] = data['ols'].predict(df_trainX)
        df_train['target_lgbm'] = data['lgbm'].predict(df_trainX)

        self.save(df_train)


params = dict()
d6tflow.preview(ModelEval(**params))
d6tflow.run(ModelEval(
    **params))  #,forced_all=True,confirm=False, forced_all_upstream=True)

# multi model comparison: in-sample MSE of naive mean vs. OLS vs. LightGBM
df_train = ModelEval(**params).outputLoad()
print('insample errors')
print('naive mean',
      mean_squared_error(df_train[cfg_col_Y], df_train['target_naive1']))
print('ols', mean_squared_error(df_train[cfg_col_Y], df_train['target_ols']))
print('gbm', mean_squared_error(df_train[cfg_col_Y], df_train['target_lgbm']))

# Reload the trained models and training data for cross-validation scoring
print('cv errors')
model_ols = ModelTrainOLS(**params)
mod_lgbm = ModelTrainLGBM(**params)
df_trainX, df_trainY = DataTrain(**params).outputLoad()
import d6tflow
from d6tflow.tasks import TaskCSVPandas, TaskJson
import pandas as pd
from pandas.io.json import json_normalize


class Task_Dual(TaskJson):
    '''
    Illustrates bundling status information alongside a serialized dataframe
    in a single JSON task output.
    '''
    def run(self):
        # Serialize a small demo frame and pair it with a status flag.
        frame = pd.DataFrame({'Test': [1, 2, 3]})
        payload = {'dataframe': frame.to_json(), 'status': True}
        self.save(payload)


if __name__ == "__main__":

    # Preview the (single-task) flow and run it.
    print(d6tflow.preview(Task_Dual()))
    d6tflow.run(Task_Dual())

    # 'dataframe' holds the JSON-serialized frame; rebuild it with pandas.
    out_json = Task_Dual().output().load()['dataframe']
    out = pd.read_json(out_json)

    print(out)