Example #1
def test_preview():
    t1 = Task1()
    t2 = Task2()
    t3 = Task3()
    d6tflow.invalidate_upstream(t3, confirm=False)

    import io
    from contextlib import redirect_stdout

    with io.StringIO() as buf, redirect_stdout(buf):
        d6tflow.preview(t3)
        output = buf.getvalue()
        assert output.count('PENDING') == 3
        assert output.count('COMPLETE') == 0

    with io.StringIO() as buf, redirect_stdout(buf):
        d6tflow.run(t3)
        d6tflow.preview(t3)
        output = buf.getvalue()
        assert output.count('PENDING') == 0
        assert output.count('COMPLETE') == 3

    with io.StringIO() as buf, redirect_stdout(buf):
        d6tflow.preview(Task3(do_preprocess=False))
        output = buf.getvalue()
        assert output.count('PENDING') == 1
        assert output.count('COMPLETE') == 2
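
For context, a minimal sketch of the Task1 → Task2 → Task3 chain this test assumes. The task bodies are illustrative rather than the original fixtures; the key point is that do_preprocess is a parameter of Task3 only, which is why Task3(do_preprocess=False) previews as 1 PENDING while the two upstream tasks stay COMPLETE:

import d6tflow
import pandas as pd

class Task1(d6tflow.tasks.TaskCache):
    def run(self):
        self.save(pd.DataFrame({'a': range(3)}))  # raw data

@d6tflow.requires(Task1)
class Task2(d6tflow.tasks.TaskCache):
    def run(self):
        self.save(self.inputLoad() * 2)  # derived data

@d6tflow.requires(Task2)
class Task3(d6tflow.tasks.TaskCache):
    # keeping the parameter on Task3 only leaves the upstream task
    # signatures unchanged when its value is varied
    do_preprocess = d6tflow.BoolParameter(default=True)

    def run(self):
        self.save(self.inputLoad())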
Example #2
def test_task_inputLoad_multiple_multiple():
    @d6tflow.requires({
        'input1': TaskMultipleOutput1,
        'input2': TaskMultipleOutput2
    })
    class TaskMultipleInput2(d6tflow.tasks.TaskPqPandas):
        def run(self):
            data = self.inputLoad(as_dict=True)
            assert data["input1"]["output1"].equals(data["input2"]["output1"])
            assert data["input1"]["output2"].equals(data["input2"]["output2"])

            data1a, data1b = self.inputLoad()["input1"]
            data2a, data2b = self.inputLoad()["input2"]

            assert data1a.equals(data2a)
            assert data1b.equals(data2b)

            data1a, data1b = self.inputLoad(task='input1')
            data2a, data2b = self.inputLoad(task='input2')
            assert data1a.equals(data2a)
            assert data1b.equals(data2b)

            data1 = self.inputLoad(task='input1', as_dict=True)
            data2 = self.inputLoad(task='input2', as_dict=True)
            assert data1["output1"].equals(data2["output1"])
            assert data1["output2"].equals(data2["output2"])

    d6tflow.run(TaskMultipleInput2(),
                forced_all=True,
                forced_all_upstream=True,
                confirm=False)
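
The TaskMultipleOutput1/TaskMultipleOutput2 fixtures are not shown. A plausible minimal sketch (it also backs the tuple variant in Example #6 below): both tasks declare two named outputs via persist and save identical frames, which is what the cross-task equality assertions rely on:

import d6tflow
import pandas as pd

class TaskMultipleOutput1(d6tflow.tasks.TaskPqPandas):
    persist = ['output1', 'output2']  # two named outputs

    def run(self):
        self.save({'output1': pd.DataFrame({'a': range(3)}),
                   'output2': pd.DataFrame({'b': range(3)})})

class TaskMultipleOutput2(TaskMultipleOutput1):
    pass  # identical outputs, so the equality assertions hold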
Example #3
    def run(self, funcs_to_run, params: dict = None, *args, **kwargs):
        """
            Runs flow steps locally. See luigi.build for additional details
            Parameters
            ----------
            funcs_to_run : function or list of functions

            params : dict
                dictionary of parameters. Keys are param names and values are the param values.

            Examples
            --------

            flow.run(func, params={'multiplier':2})

            flow.run([func1, func2], params={'multiplier':42})

            flow.run(func)
        """
        funcs_to_run = funcs_to_run if isinstance(funcs_to_run,
                                                  list) else [funcs_to_run]

        self._instantiate(funcs_to_run, params=params)

        d6tflow.run(list(self.instantiated_tasks.values()), *args, **kwargs)
Example #4
def run(config_path):

    local_path = './experimentos/producao/'
    filename = 'config.yaml'

    # bucket path used to download the file to the local machine
    bucket_name_download = config_path[5:].split('/')[0]

    Utils.download_file_from_gcp(config_path,
                                 local_path=local_path,
                                 filename=filename,
                                 bucket_name=bucket_name_download)

    # environment variables depending on development or production
    config = config_pre_tratamento(local_path + filename)
    project = config['project']
    config['caminho_saida_dados'] = local_path
    d6tflow.set_dir(config['caminho_saida_dados'])
    params = get_tasks(config)
    t = tasks.TaskPrdReport(**params)

    d6tflow.preview(t)
    d6tflow.run(t, workers=config['workers'])

    model = tasks.TaskTrainModel(
        task_engineer_params=params['task_engineer_params'],
        task_te_params=params['task_te_params'],
        task_ps_params=params['task_ps_params'],
        task_model_params=params['task_model_params']).output().load()
    salvar_modelo(t, model, config)
    return True
Example #5
def process_one_file(file_path):
    """ Run the processing pipeline on one file """
    # Create an identifier based on the file path
    identifier = file_path.split("/")[-1]
    # Toggle the lines below depending on the aim of the processing and
    # which steps should be re-run.
    # The line below marks that all tasks should be re-run
    p.TaskGetInitialData(path=file_path,
                         identifier=identifier).invalidate(confirm=False)
    # Uncomment the line below to mark that tasks related to BGs should be re-run
    # p.TaskGetBGData(path=file_path, identifier=identifier).invalidate(confirm=False)
    # Uncomment the lines below to mark that the preprocessing tasks should be re-run
    # p.TaskPreprocessData(path=file_path, identifier=identifier).invalidate(confirm=False)
    # p.TaskPreprocessBGs(path=file_path, identifier=identifier).invalidate(confirm=False)
    # Uncomment the lines below to mark that tasks identifying abnormal boluses and/or basals with a KNN model should be re-run
    # p.TaskGetAbnormalBoluses(path=file_path, model_type="knn", identifier=identifier).invalidate(confirm=False)
    # p.TaskGetAbnormalBasals(path=file_path, identifier=identifier).invalidate(confirm=False)
    # Uncomment the line below to mark that tasks identifying abnormal boluses with an Isolation Forest model should be re-run
    # p.TaskGetAbnormalBoluses(path=file_path, model_type="isolation_forest", identifier=identifier).invalidate(confirm=False)
    # Find the abnormal boluses using k-nearest neighbors
    d6tflow.run(
        p.TaskGetAbnormalBoluses(path=file_path,
                                 model_type="knn",
                                 identifier=identifier))
    # Uncomment the line below to find the abnormal boluses using an Isolation Forest model
    # d6tflow.run(p.TaskGetAbnormalBoluses(path=file_path, model_type="isolation_forest", identifier=identifier))
    # Uncomment the line below to find the abnormal basals
    # d6tflow.run(p.TaskGetAbnormalBasals(path=file_path, identifier=identifier))
    # Uncomment the line below to process the dose data
Example #6
def test_task_inputLoad_multiple_multiple_tuple():
    @d6tflow.requires(TaskMultipleOutput1, TaskMultipleOutput2)
    class TaskMultipleInput2(d6tflow.tasks.TaskPqPandas):
        def run(self):
            data = self.inputLoad(as_dict=True)
            assert data[0]["output1"].equals(data[1]["output1"])
            assert data[0]["output2"].equals(data[1]["output2"])

            data1a, data1b = self.inputLoad()[0]
            data2a, data2b = self.inputLoad()[1]

            assert data1a.equals(data2a)
            assert data1b.equals(data2b)

            data1a, data1b = self.inputLoad(task=0)
            data2a, data2b = self.inputLoad(task=1)
            assert data1a.equals(data2a)
            assert data1b.equals(data2b)

            data1 = self.inputLoad(task=0, as_dict=True)
            data2 = self.inputLoad(task=1, as_dict=True)
            assert data1["output1"].equals(data2["output1"])
            assert data1["output2"].equals(data2["output2"])

    d6tflow.run(TaskMultipleInput2(),
                forced_all=True,
                forced_all_upstream=True,
                confirm=False)
Example #7
    def run(self):
        ball_start_pos = self.input()['ball_start_pos'].load()
        PPCFa = self.input()['pitch_control_frame'].load()['PPCFa']
        xgrid = self.input()['pitch_control_frame'].load()['xgrid']
        ygrid = self.input()['pitch_control_frame'].load()['ygrid']

        # initialise transition grid
        TP = np.zeros(shape=(len(ygrid), len(xgrid)))

        if np.sum(PPCFa[-1]) == 0:
            N_TP = TP
            self.save({'N_TP': N_TP, 'TP': TP, 'xgrid': xgrid, 'ygrid': ygrid})

        else:
            # calculate transition model at each location on the pitch
            for i in range(len(ygrid)):
                for j in range(len(xgrid)):
                    target_position = np.array([xgrid[j], ygrid[i]])
                    d6t.run(CalcTransitionProbabilityTarget(
                        target_position=tuple(target_position),
                        ball_start_pos=tuple(ball_start_pos),
                        PPCFa=PPCFa[-1, i, j]),
                            execution_summary=False)
                    TP[i, j] = CalcTransitionProbabilityTarget(
                        target_position=tuple(target_position),
                        ball_start_pos=tuple(ball_start_pos),
                        PPCFa=PPCFa[-1, i, j]).output().load()

            # normalize T to unity
            N_TP = TP / np.sum(TP)

            self.save({'N_TP': N_TP, 'TP': TP, 'xgrid': xgrid, 'ygrid': ygrid})
Example #8
def test_multiple_deps_on_input_load():
    # define 2 tasks that load raw data
    class Task1(d6tflow.tasks.TaskCache):
        persist = ['a1', 'a2']

        def run(self):
            df = pd.DataFrame({'a': range(3)})
            self.save({'a1': df, 'a2': df})  # quickly save dataframe

    class Task2(d6tflow.tasks.TaskCache):
        def run(self):
            df = pd.DataFrame({'a': range(3)})
            self.save(df)  # quickly save dataframe

    # define another task that depends on data from task1 and task2
    @d6tflow.requires(Task1, Task2)
    class Task3(d6tflow.tasks.TaskCache):
        def run(self):
            data = self.inputLoad()
            df1 = data[0]['a1']
            assert df1.equals(data[0]['a2'])
            df2 = data[1]
            assert df2.equals(df1)
            df = df1.join(df2, lsuffix='1', rsuffix='2')
            self.save(df)

    # Execute task including all its dependencies
    d6tflow.run(Task3(), forced_all_upstream=True, confirm=False)
Example #9
def test_outputLoadAllMeta():
    class Task1(d6tflow.tasks.TaskCache):

        def run(self):
            df = pd.DataFrame({'a': range(3)})
            self.save(df)  # quickly save dataframe
            self.metaSave({'columns': 42})

    class Task2(Task1):
        pass

    @d6tflow.requires({'upstream1': Task1, 'upstream2': Task2})
    class Task3(d6tflow.tasks.TaskCache):
        multiplier = d6tflow.IntParameter(default=2)

        def run(self):
            meta = self.metaLoad()['upstream1']
            print(meta)
            print(meta['columns'])

            df1 = self.input()['upstream1'].load()  # quickly load input data
            df2 = self.input()['upstream2'].load()  # quickly load input data
            df = df1.join(df2, lsuffix='1', rsuffix='2')
            df['b'] = df['a1']*self.multiplier  # use task parameter
            self.save(df)
            self.metaSave({'columns': 100})

    d6tflow.run(Task3())
    meta_all = Task3().outputLoadAllMeta()
    assert meta_all["Task1"]["columns"] == 42
    assert meta_all["Task2"]["columns"] == 42
    assert meta_all["Task3"]["columns"] == 100
Example #10
def test_outputLoadMeta():
    class MetaSave(d6tflow.tasks.TaskCache):
        def run(self):
            df = pd.DataFrame({'a': range(3)})
            self.save(df)  # quickly save dataframe
            self.metaSave({'metadata': df})

    d6tflow.run(MetaSave())
    df = pd.DataFrame({'a': range(3)})
    assert MetaSave().outputLoadMeta()["metadata"].equals(df)
Example #11
def test_task_inputLoad_single_single():
    @d6tflow.requires(TaskSingleOutput)
    class TaskSingleInput(d6tflow.tasks.TaskPqPandas):
        def run(self):
            data = self.inputLoad()
            assert data.equals(pd.DataFrame({'a': range(3)}))

    d6tflow.run(TaskSingleInput(),
                forced_all=True,
                forced_all_upstream=True,
                confirm=False)
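
TaskSingleOutput is assumed by this test; a minimal sketch consistent with the assertion:

import d6tflow
import pandas as pd

class TaskSingleOutput(d6tflow.tasks.TaskPqPandas):
    def run(self):
        self.save(pd.DataFrame({'a': range(3)}))  # single default output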
Example #12
def test_dynamic():

    class TaskCollector(d6tflow.tasks.TaskAggregator):
        def run(self):
            yield Task1()
            yield Task2()

    d6tflow.run(TaskCollector())
    assert Task1().complete() and Task2().complete() and TaskCollector().complete()
    assert TaskCollector().outputLoad()[0].equals(Task1().outputLoad())
    assert TaskCollector().outputLoad()[1][0].equals(Task2().outputLoad()[0])
    TaskCollector().invalidate(confirm=False)
    assert not (Task1().complete() and Task2().complete() and TaskCollector().complete())
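
The aggregated Task1 and Task2 are not shown. A hedged sketch consistent with the assertions above, where Task1 has a single output and Task2 has multiple outputs (hence outputLoad()[1][0]):

import d6tflow
import pandas as pd

class Task1(d6tflow.tasks.TaskCache):
    def run(self):
        self.save(pd.DataFrame({'a': range(3)}))  # single output

class Task2(d6tflow.tasks.TaskCache):
    persist = ['b1', 'b2']  # multiple outputs

    def run(self):
        df = pd.DataFrame({'b': range(3)})
        self.save({'b1': df, 'b2': df})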
Example #13
    def run(self):
        events = self.input()['events']['events'].load()
        home_ids = self.input()['ids'].load()['home_ids']
        away_ids = self.input()['ids'].load()['away_ids']

        # break the pitch down into a grid
        n_grid_cells_y = int(self.n_grid_cells_x * self.field_dimen[1] / self.field_dimen[0])
        xgrid = np.linspace(-self.field_dimen[0] / 2., self.field_dimen[0] / 2., self.n_grid_cells_x)
        ygrid = np.linspace(-self.field_dimen[1] / 2., self.field_dimen[1] / 2., n_grid_cells_y)

        RPCa_Home = np.zeros(shape=(len(events[events.Team == 'Home']), len(home_ids) + 1, len(ygrid), len(xgrid)))
        RPCd_Home = np.zeros(shape=(len(events[events.Team == 'Away']), len(home_ids) + 1, len(ygrid), len(xgrid)))
        RPCa_Away = np.zeros(shape=(len(events[events.Team == 'Away']), len(away_ids) + 1, len(ygrid), len(xgrid)))
        RPCd_Away = np.zeros(shape=(len(events[events.Team == 'Home']), len(away_ids) + 1, len(ygrid), len(xgrid)))

        home_rows = events[events.Team == 'Home'].index
        away_rows = events[events.Team == 'Away'].index

        home_int_fail = []
        away_int_fail = []

        for i in tqdm(range(len(home_rows)), desc="executing game{} home events".format(self.gameid)):
            d6t.settings.check_dependencies = False
            d6t.settings.log_level = 'ERROR'
            d6t.run(rpc.CalcRelevantPitchControlFrame(gameid=self.gameid, rownumber=events.loc[home_rows[i], 'Start Frame'], in_execution=True))
            RPCa_Home[i] = rpc.CalcRelevantPitchControlFrame(gameid=self.gameid, rownumber=events.loc[home_rows[i], 'Start Frame'], in_execution=True).output().load()['RPCa']
            RPCd_Away[i] = rpc.CalcRelevantPitchControlFrame(gameid=self.gameid, rownumber=events.loc[home_rows[i], 'Start Frame'], in_execution=True).output().load()['RPCd']
            if np.sum(RPCa_Home[i][-1]) == 0:
                home_int_fail.append(i)

        if len(home_int_fail) > 0:
            RPCa_Home = np.delete(RPCa_Home, obj=np.array(home_int_fail), axis=0)
            RPCd_Away = np.delete(RPCd_Away, obj=np.array(home_int_fail), axis=0)

        for i in tqdm(range(len(away_rows)), desc="executing game{} away events".format(self.gameid)):
            d6t.settings.check_dependencies = False
            d6t.settings.log_level = 'ERROR'
            d6t.run(rpc.CalcRelevantPitchControlFrame(gameid=self.gameid, rownumber=events.loc[away_rows[i], 'Start Frame'], in_execution=True))
            RPCa_Away[i] = rpc.CalcRelevantPitchControlFrame(gameid=self.gameid, rownumber=events.loc[away_rows[i], 'Start Frame'], in_execution=True).output().load()['RPCa']
            RPCd_Home[i] = rpc.CalcRelevantPitchControlFrame(gameid=self.gameid, rownumber=events.loc[away_rows[i], 'Start Frame'], in_execution=True).output().load()['RPCd']
            if np.sum(RPCa_Away[i][-1]) == 0:
                away_int_fail.append(i)

        if len(away_int_fail) > 0:
            RPCa_Away = np.delete(RPCa_Away, obj=np.array(away_int_fail), axis=0)
            RPCd_Home = np.delete(RPCd_Home, obj=np.array(away_int_fail), axis=0)

        self.save({'RPCa_Home': RPCa_Home, 'RPCd_Home': RPCd_Home, 'RPCa_Away': RPCa_Away, 'RPCd_Away': RPCd_Away,
                   'home_int_fail': home_int_fail, 'away_int_fail': away_int_fail})
Example #14
def test_task_inputLoad_multiple_single():
    @d6tflow.requires({
        'input1': TaskSingleOutput1,
        'input2': TaskSingleOutput2
    })
    class TaskMultipleInput(d6tflow.tasks.TaskPqPandas):
        def run(self):
            data1 = self.inputLoad()['input1']
            assert (data1.equals(pd.DataFrame({'b': range(3)})))
            data2 = self.inputLoad()['input2']
            assert (data2.equals(pd.DataFrame({'c': range(3)})))

    d6tflow.run(TaskMultipleInput(),
                forced_all=True,
                forced_all_upstream=True,
                confirm=False)
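
TaskSingleOutput1 and TaskSingleOutput2 are assumed; a minimal sketch matching the asserted frames:

import d6tflow
import pandas as pd

class TaskSingleOutput1(d6tflow.tasks.TaskPqPandas):
    def run(self):
        self.save(pd.DataFrame({'b': range(3)}))

class TaskSingleOutput2(d6tflow.tasks.TaskPqPandas):
    def run(self):
        self.save(pd.DataFrame({'c': range(3)}))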
Example #15
def test_metaLoad_single_input():
    class MetaSave(d6tflow.tasks.TaskCache):
        def run(self):
            df = pd.DataFrame({'a': range(3)})
            self.save(df)  # quickly save dataframe
            self.metaSave({'metadata': df})

    @d6tflow.requires(MetaSave)
    class MetaLoad(d6tflow.tasks.TaskCache):
        def run(self):
            meta = self.metaLoad()
            assert meta['metadata'].equals(
                pd.DataFrame({'a': range(3)})
            )

    d6tflow.run(MetaLoad())
Example #16
def test_tasks(cleanup):
    t1 = Task1()
    t2 = Task2()
    assert not t1.complete()
    assert not t2.complete()

    t1.run()
    assert t1.complete()
    assert t1.invalidate()
    assert not t1.complete()

    assert d6tflow.run([Task2()])
    assert t1.complete()
    assert t2.complete()
    assert (pathdata/'Task1'/'Task1__99914b932b-data.pq').exists()
    assert (pathdata/'Task2'/'Task2__99914b932b-df2.pq').exists()

    t1.output().load().equals(df)
    t1.loadall()['data'].equals(df)

    t2.output()['df2'].load().equals(dfc2)
    t2.loadall()['df2'].equals(dfc2)

    # check downstream incomplete
    t1.invalidate()
    assert not t2.complete()
    d6tflow.settings.check_dependencies=False
    assert t2.complete()
    d6tflow.settings.check_dependencies=True
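
Examples #16 and #20 are two revisions of the same test and rely on module-level fixtures that are not shown (pathdata is assumed to point at the configured d6tflow data directory). A hedged sketch consistent with the asserted file names; the .pq vs .parquet extensions reflect different d6tflow versions:

import d6tflow
import pandas as pd

df = pd.DataFrame({'a': range(10)})  # contents are illustrative
dfc2 = df.copy()

class Task1(d6tflow.tasks.TaskPqPandas):
    persist = ['data']  # single output named 'data' -> Task1__...-data.pq

    def run(self):
        self.save({'data': df})

class Task2(d6tflow.tasks.TaskPqPandas):
    persist = ['df2', 'df4']  # two outputs; Example #20 unpacks df2, df4

    def requires(self):
        return Task1()  # so invalidating t1 makes t2 incomplete

    def run(self):
        self.save({'df2': dfc2, 'df4': dfc2})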
Example #17
def test_task_inputLoad_single_multiple():
    @d6tflow.requires(TaskMultipleOutput)
    class TaskSingleInput2(d6tflow.tasks.TaskPqPandas):
        def run(self):
            output1, output2 = self.inputLoad()
            assert output1.equals(pd.DataFrame({'ax': range(3)}))
            assert output2.equals(pd.DataFrame({'ay': range(3)}))

            outputDict = self.inputLoad(as_dict=True)
            assert outputDict['output1'].equals(pd.DataFrame({'ax': range(3)}))
            assert outputDict['output2'].equals(pd.DataFrame({'ay': range(3)}))

    d6tflow.run(TaskSingleInput2(),
                forced_all=True,
                forced_all_upstream=True,
                confirm=False)
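
TaskMultipleOutput is assumed to save two named outputs matching the asserted frames; a minimal sketch:

import d6tflow
import pandas as pd

class TaskMultipleOutput(d6tflow.tasks.TaskPqPandas):
    persist = ['output1', 'output2']

    def run(self):
        self.save({'output1': pd.DataFrame({'ax': range(3)}),
                   'output2': pd.DataFrame({'ay': range(3)})})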
Example #18
def read(selection, plottype):
    # selection = 'Daimler'
    runDate = get_runDate()
    d6tflow.run(Task_getTickers(runDate=runDate))
    companyTicker = Task_getTickers(runDate=runDate).output().load()

    ticker = selection  # companyTicker[selection]
    plot_dict = dict(runDate=runDate, stockticker=ticker, plottype=plottype)

    d6tflow.run(Task_getPlot(**plot_dict))

    fig: dict = Task_getPlot(**plot_dict).output().load()

    fig_serialized = json.dumps(fig, cls=plotly.utils.PlotlyJSONEncoder)

    return fig_serialized
Example #19
    def run(self,
            funcs_to_run,
            params: dict = None,
            multi_params: dict = None,
            *args,
            **kwargs):
        """
            Runs flow steps locally. See luigi.build for additional details
            Parameters
            ----------
            funcs_to_run : function or list of functions

            params : dict
                dictionary of parameters. Keys are param names and values are the param values.

            Examples
            --------

            flow.run(func, params={'multiplier':2})

            flow.run([func1, func2], params={'multiplier':42})

            flow.run(func)
        """
        funcs_to_run = funcs_to_run if isinstance(funcs_to_run,
                                                  list) else [funcs_to_run]

        if multi_params:
            self.multi_params = multi_params
            self.multi_params_tasks = {}

            for params in multi_params:
                for func in funcs_to_run:
                    self._instantiate([func], params=multi_params[params])
                    self.multi_params_tasks[params] = self.instantiated_tasks[
                        func.__name__]
                    d6tflow.run(self.multi_params_tasks[params], *args,
                                **kwargs)
        else:
            # Reset to single params mode
            self.multi_params = None
            self.multi_params_tasks = {}
            self._instantiate(funcs_to_run, params=params)

            d6tflow.run(list(self.instantiated_tasks.values()), *args,
                        **kwargs)
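
The multi_params branch instantiates and runs each step once per named parameter set: multi_params maps a run name to a params dict, and the resulting task instances are kept in self.multi_params_tasks under that name. A hedged usage sketch (flow and train_model are hypothetical):

# run one step under two parameter sets; the dict keys name each run
flow.run(train_model,
         multi_params={'run_a': {'multiplier': 2},
                       'run_b': {'multiplier': 42}})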
Example #20
def test_tasks(cleanup):
    t1 = Task1()
    t2 = Task2()
    assert not t1.complete()
    assert not t2.complete()

    t1.run()
    assert t1.complete()
    assert t1.invalidate(confirm=False)
    assert not t1.complete()

    assert d6tflow.run([Task2()])
    assert t1.complete()
    assert t2.complete()
    assert (pathdata/'Task1'/'Task1__99914b932b-data.parquet').exists()
    assert (pathdata/'Task2'/'Task2__99914b932b-df2.parquet').exists()

    # load outputs
    t1.output().load().equals(df)
    t1.outputLoad(as_dict=True).equals(df)
    t1.outputLoad().equals(df)

    t2.output()['df2'].load().equals(dfc2)
    t2.outputLoad(as_dict=True)['df2'].equals(dfc2)
    df2, df4 = t2.outputLoad()
    df2.equals(dfc2)
    df2, = t2.outputLoad(keys=['df2'])
    df2.equals(dfc2)

    # test inputs
    class TaskMultiInput(d6tflow.tasks.TaskCache):
        def requires(self):
            return Task1()
        def run(self):
            dft1 = self.inputLoad()
            assert dft1.equals(df)
    TaskMultiInput().run()

    class TaskMultiInput(d6tflow.tasks.TaskCache):
        def requires(self):
            return Task1(), Task1()
        def run(self):
            dft1, dft2 = self.inputLoad()
            assert dft1.equals(dft2)
    TaskMultiInput().run()

    class TaskMultiInput(d6tflow.tasks.TaskCache):
        def requires(self):
            return {1:Task1(), 2:Task1()}
        def run(self):
            dft1, dft2 = self.inputLoad()
            assert dft1.equals(dft2)
    TaskMultiInput().run()

    # check downstream incomplete
    t1.invalidate(confirm=False)
    assert not t2.complete()
    d6tflow.settings.check_dependencies=False
    assert t2.complete()
    d6tflow.settings.check_dependencies=True
Example #21
def run(config_path):     
    config = handle_config(config_path)
    
    d6tflow.set_dir(config['run_dir'] + 'data/')
    
    bkt_params = backtest_generator.BacktestParameters(
            config                 = config,
            ds_name                = config['ds_name'],
            base_dir               = config['base_dir'],
            table_name             = config['table_name'],
            all_features           = config['dataset_generator_params']['all_features'],
            dataset_filter         = config['dataset_filter'],
            analysis_variables     = config['analysis_variables'],
            date_col               = config['original_params']['date_col'])
    
    
    parameter_generator = lambda: bkt_params.create_parameters(
                     initial_training_month = config['initial_training_month'], 
                     last_predicting_month  = config['last_predicting_month'], 
                     lead_time              = config['dataset_generator_params']['ld'], 
                     lead_time_mode         = config['dataset_generator_params']['ld_mode'],
                     training_length        = config['dataset_generator_params']['tl'],
                     training_length_mode   = config['dataset_generator_params']['tl_mode'],
                     test_length            = config['dataset_generator_params']['testl'],
                     test_length_mode       = config['dataset_generator_params']['testl_mode'],
                     stride_length          = config['dataset_generator_params']['sl'],
                     stride_length_mode     = config['dataset_generator_params']['sl_mode'])
    
    task_generator = lambda params: tasks.TaskModelMetrics(**params)
    
    backtest_tasks = backtest_generator.CreateTaskList(task_constructor  = task_generator, 
                                                       parameter_generator = parameter_generator)
    params =  parameter_generator()
    bkt_task = tasks.BacktestReport(task_bkt_params = params)
    d6tflow.run(bkt_task, workers = config['workers'])
    
    create_folder(config['run_dir'] + 'report/')
    
    metrics_df = bkt_task.output()['full_metrics'].load()
    
    # Save the .csv for the frontend
    path_final = config['run_dir'] + 'df_backtest_final.csv'
    
    metrics_df.to_csv(path_final,index=False)
Example #22
def test_metaSave_persists_after_run():
    class MetaSave(d6tflow.tasks.TaskCache):
        def run(self):
            df = pd.DataFrame({'a': range(3)})
            self.save(df)  # quickly save dataframe
            self.metaSave({'metadata': df})

    ms = MetaSave()
    d6tflow.run(ms)

    # From disk
    metadata = pickle.load(open(ms._get_meta_path(ms), "rb"))
    assert metadata['metadata'].equals(
        pd.DataFrame({'a': range(3)})
    )

    # From object
    assert ms.metadata['metadata'].equals(
        pd.DataFrame({'a': range(3)})
    )
Example #23
def test_requires():
    class Task1(d6tflow.tasks.TaskCache):
        def run(self):
            df = pd.DataFrame({'a': range(3)})
            self.save(df)  # quickly save dataframe

    class Task2(Task1):
        pass

    # define another task that depends on data from task1 and task2
    @d6tflow.requires({'a': Task1, 'b': Task2})
    class Task3(d6tflow.tasks.TaskCache):
        def run(self):
            df1 = self.input()['a'].load()  # quickly load input data
            df2 = self.input()['b'].load()  # quickly load input data

            assert (df1.equals(pd.DataFrame({'a': range(3)})))

    task3 = Task3()
    d6tflow.run(task3)
Example #24
    def run(self):
        centers = self.input().load()['centers']
        rpc_surface = centers[self.clusterid].reshape((50, 32), order='F')

        d6t.run(PlotPitch(), forced_all=True)
        pitch = PlotPitch().output().load()
        fig = pitch['fig']
        ax = pitch['ax']

        # plot axis labels
        ax.set_xlabel('x (m)', fontsize=20)
        ax.set_ylabel('y (m)', fontsize=20)
        ax.tick_params(labelsize=14)

        # break the pitch down into a grid
        n_grid_cells_y = int(self.n_grid_cells_x * self.field_dimen[1] /
                             self.field_dimen[0])
        xgrid = np.linspace(-self.field_dimen[0] / 2.,
                            self.field_dimen[0] / 2., self.n_grid_cells_x)
        ygrid = np.linspace(-self.field_dimen[1] / 2.,
                            self.field_dimen[1] / 2., n_grid_cells_y)

        ax.imshow(np.flipud(rpc_surface),
                  extent=(np.amin(xgrid), np.amax(xgrid), np.amin(ygrid),
                          np.amax(ygrid)),
                  interpolation='hanning',
                  vmin=0.0,
                  cmap=self.surface_color,
                  alpha=0.625)
        norm = colors.Normalize(vmin=0, vmax=np.max(rpc_surface))
        cb = fig.colorbar(plt.cm.ScalarMappable(norm=norm,
                                                cmap=self.surface_color),
                          ax=ax,
                          alpha=0.625,
                          shrink=0.80)
        cb.ax.tick_params(labelsize=14)
        ax.set_title('Relevant Pitch Control - Cluster {} Centroid'.format(
            self.clusterid),
                     fontdict={'fontsize': 30})

        self.save({'fig': fig, 'ax': ax})
Example #25
    def __init__(self,
                 tasks,
                 pipename,
                 write_dir='.',
                 write_filename_tasks='tasks_d6tpipe.py',
                 write_filename_run='run_d6tpipe.py',
                 run_load_values=True,
                 run=False,
                 run_params=None):
        # todo NN: copy = False # copy task output to pipe
        if not isinstance(tasks, (list, )):
            tasks = [tasks]
        if run:
            run_params = {} if run_params is None else run_params
            d6tflow.run(tasks, **run_params)

        self.tasks = tasks
        self.pipename = pipename
        self.write_dir = pathlib.Path(write_dir)
        self.write_filename_tasks = write_filename_tasks
        self.write_filename_run = write_filename_run
        self.run_load_values = run_load_values

        # file templates
        self.tmpl_tasks = '''
import d6tflow
import luigi

{% for task in tasks -%}

class {{task.name}}({{task.class}}):
    external=True
    persist={{task.obj.persist}}
    {% for param in task.params -%}
    {{param.name}}={{param.class}}(default={{param.value}})
    {% endfor %}
{% endfor %}
'''

        self.tmpl_run = '''
Example #26
def test_metaLoad_multiple_input_tuple():
    class MetaSave(d6tflow.tasks.TaskCache):
        def run(self):
            df = pd.DataFrame({'a': range(3)})
            self.save(df)  # quickly save dataframe
            self.metaSave({'metadata': df})

    class MetaSave2(MetaSave):
        pass

    @d6tflow.requires(MetaSave, MetaSave2)
    class MetaLoad(d6tflow.tasks.TaskCache):
        def run(self):
            meta = self.metaLoad()
            assert meta[0]['metadata'].equals(
                pd.DataFrame({'a': range(3)})
            )
            assert meta[1]['metadata'].equals(
                pd.DataFrame({'a': range(3)})
            )

    d6tflow.run(MetaLoad())
Example #27
def test_pipes_base(cleanup_pipe):
    import d6tflow.pipes
    d6tflow.pipes.init(cfg['d6tpipe_pipe1'], profile=cfg['d6tpipe_profile'])

    t1 = Task1()
    pipe1 = d6tflow.pipes.get_pipe()
    pipedir = pipe1.dirpath
    t1filepath = t1.output().path
    t1file = str(PurePosixPath(t1filepath.relative_to(pipedir)))

    assert d6tflow.run(t1)
    assert t1.complete()
    with fuckit:
        pipe1._pullpush_luigi([t1file], op='remove')
    assert pipe1.push_preview() == [t1file]
    assert pipe1.push() == [t1file]
    assert pipe1.scan_remote(cached=False) == [t1file]
    # cleanup
    pipe1.delete_files(confirm=False, all_local=True)
    assert pipe1.scan_remote(cached=False) == []
Example #28
def test_execute(cleanup):
    # execute
    t1 = Task1()
    t2 = Task2()
    t3 = Task3()
    [t.invalidate(confirm=False) for t in [t1, t2, t3]]
    d6tflow.run(t3)
    assert all(t.complete() for t in [t1, t2, t3])
    t1.invalidate(confirm=False)
    t2.invalidate(confirm=False)
    assert not t3.complete()  # cascade upstream
    d6tflow.settings.check_dependencies = False
    assert t3.complete()  # no cascade upstream
    d6tflow.run([t3])
    assert t3.complete() and not t1.complete()
    d6tflow.settings.check_dependencies = True
    d6tflow.run([t3])
    assert all(t.complete() for t in [t1, t2, t3])

    # forced single
    class TaskTest(d6tflow.tasks.TaskCachePandas):
        def run(self):
            self.save(df)

    d6tflow.run(TaskTest())
    assert TaskTest().output().load().equals(df)

    class TaskTest(d6tflow.tasks.TaskCachePandas):
        def run(self):
            self.save(df * 2)

    d6tflow.run(TaskTest())
    assert TaskTest().output().load().equals(df)
    d6tflow.run(TaskTest(), forced=TaskTest(), confirm=False)
    assert TaskTest().output().load().equals(df * 2)
    d6tflow.run([TaskTest()], forced=[TaskTest()], confirm=False)

    # forced flow
    mtimes = [
        t1.output().path.stat().st_mtime,
        t2.output()['df2'].path.stat().st_mtime
    ]
    d6tflow.run(t3, forced=t1, confirm=False)
    assert t1.output().path.stat().st_mtime > mtimes[0]
    assert t2.output()['df2'].path.stat().st_mtime > mtimes[1]

    # forced_all => run task3 only
    mtimes = [
        t1.output().path.stat().st_mtime,
        t2.output()['df2'].path.stat().st_mtime,
        t3.output().path.stat().st_mtime
    ]
    d6tflow.run(t3, forced_all=True, confirm=False)
    assert t1.output().path.stat().st_mtime == mtimes[0]
    assert t2.output()['df2'].path.stat().st_mtime == mtimes[1]
    assert t3.output().path.stat().st_mtime > mtimes[2]

    # forced_all_upstream => run all tasks
    mtimes = [
        t1.output().path.stat().st_mtime,
        t2.output()['df2'].path.stat().st_mtime,
        t3.output().path.stat().st_mtime
    ]
    d6tflow.run(t3, forced_all_upstream=True, confirm=False)
    assert t1.output().path.stat().st_mtime > mtimes[0]
    assert t2.output()['df2'].path.stat().st_mtime > mtimes[1]
    assert t3.output().path.stat().st_mtime > mtimes[2]

    # downstream
    assert d6tflow.run(t3)
    d6tflow.invalidate_downstream(t2, t3, confirm=False)
    assert not (t2.complete() and t3.complete()) and t1.complete()

    # upstream
    assert d6tflow.run(t3)
    d6tflow.invalidate_upstream(t3, confirm=False)
    assert not all(t.complete() for t in [t1, t2, t3])
Example #29
import d6tflow
import d6tflow.pipes

import cfg, tasks

d6tflow.pipes.init('top10-mistakes-stats',
                   local_pipe=True)  # save flow output to local pipe directory
pipe = d6tflow.pipes.get_pipe()
pipe.delete_files_local(confirm=False, delete_all=True)  # start clean

# run tasks
d6tflow.run(
    tasks.ModelOutliers())  # output automatically saved in pipe directory
d6tflow.run(tasks.ModelTS())
d6tflow.run(tasks.OLSvsRF())

# push output
do_push = True
if do_push:
    d6tflow.pipes.init('top10-mistakes-stats',
                       reset=True)  # connect to remote pipe
    pipe = d6tflow.pipes.get_pipe()
    pipe.delete_files_remote(confirm=False)  # start clean
    pipe.pull(cached=False)
    pipe.push()
Example #30
import d6tflow
import cfg, tasks
# import visualize

# d6tflow.preview(tasks.TaskTrain())

d6tflow.run(tasks.TaskPreprocess(), forced_all_upstream=True, confirm=False)
quit()
# visualize.accuracy()
# visualize.plot_importances()
#
# d6tflow.run(tasks.TaskTrain(do_preprocess=False))
# visualize.accuracy(do_preprocess=False)
#
# d6tflow.invalidate_downstream(tasks.TaskGetData(), tasks.TaskTrain())
#
# d6tflow.preview(tasks.TaskTrain())
# d6tflow.run(tasks.TaskTrain())