def test_preview():
    t1 = Task1()
    t2 = Task2()
    t3 = Task3()
    d6tflow.invalidate_upstream(t3, confirm=False)

    import io
    from contextlib import redirect_stdout

    with io.StringIO() as buf, redirect_stdout(buf):
        d6tflow.preview(t3)
        output = buf.getvalue()
        assert output.count('PENDING') == 3
        assert output.count('COMPLETE') == 0

    with io.StringIO() as buf, redirect_stdout(buf):
        d6tflow.run(t3)
        d6tflow.preview(t3)
        output = buf.getvalue()
        assert output.count('PENDING') == 0
        assert output.count('COMPLETE') == 3

    with io.StringIO() as buf, redirect_stdout(buf):
        d6tflow.preview(Task3(do_preprocess=False))
        output = buf.getvalue()
        assert output.count('PENDING') == 1
        assert output.count('COMPLETE') == 2
def test_task_inputLoad_multiple_multiple():
    @d6tflow.requires({'input1': TaskMultipleOutput1, 'input2': TaskMultipleOutput2})
    class TaskMultipleInput2(d6tflow.tasks.TaskPqPandas):
        def run(self):
            data = self.inputLoad(as_dict=True)
            assert data["input1"]["output1"].equals(data["input2"]["output1"])
            assert data["input1"]["output2"].equals(data["input2"]["output2"])

            data1a, data1b = self.inputLoad()["input1"]
            data2a, data2b = self.inputLoad()["input2"]
            assert data1a.equals(data2a)
            assert data1b.equals(data2b)

            data1a, data1b = self.inputLoad(task='input1')
            data2a, data2b = self.inputLoad(task='input2')
            assert data1a.equals(data2a)
            assert data1b.equals(data2b)

            data1 = self.inputLoad(task='input1', as_dict=True)
            data2 = self.inputLoad(task='input2', as_dict=True)
            assert data1["output1"].equals(data2["output1"])
            assert data1["output2"].equals(data2["output2"])

    d6tflow.run(TaskMultipleInput2(), forced_all=True, forced_all_upstream=True, confirm=False)
def run(self, funcs_to_run, params: dict = None, *args, **kwargs):
    """
    Runs flow steps locally. See luigi.build for additional details.

    Parameters
    ----------
    funcs_to_run : function or list of functions
    params : dict
        Dictionary of parameters. Keys are parameter names and values are
        the parameter values.

    Examples
    --------
    flow.run(func, params={'multiplier': 2})
    flow.run([func1, func2], params={'multiplier': 42})
    flow.run(func)
    """
    funcs_to_run = funcs_to_run if isinstance(funcs_to_run, list) else [funcs_to_run]

    self._instantiate(funcs_to_run, params=params)
    d6tflow.run(list(self.instantiated_tasks.values()), *args, **kwargs)
def run(config_path):
    local_path = './experimentos/producao/'
    filename = 'config.yaml'

    # bucket path used to download the config file to the local machine
    bucket_name_download = config_path[5:].split('/')[0]
    Utils.download_file_from_gcp(config_path, local_path=local_path,
                                 filename=filename, bucket_name=bucket_name_download)

    # environment settings depending on development or production
    config = config_pre_tratamento(local_path + filename)
    project = config['project']
    config['caminho_saida_dados'] = local_path
    d6tflow.set_dir(config['caminho_saida_dados'])

    params = get_tasks(config)
    t = tasks.TaskPrdReport(**params)
    d6tflow.preview(t)
    d6tflow.run(t, workers=config['workers'])

    model = tasks.TaskTrainModel(
        task_engineer_params=params['task_engineer_params'],
        task_te_params=params['task_te_params'],
        task_ps_params=params['task_ps_params'],
        task_model_params=params['task_model_params']).output().load()

    salvar_modelo(t, model, config)
    return True
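# A minimal usage sketch for the entry point above, not taken from the original project:
# the bucket name is parsed from the characters after the "gs://" prefix
# (config_path[5:]), so the config is expected at a GCS URI. The bucket and object
# names below are illustrative assumptions only.
if __name__ == '__main__':
    run('gs://my-bucket/experimentos/producao/config.yaml')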
def process_one_file(file_path):
    """ Run the processing pipeline on one file """
    # Create an identifier based on the file path
    identifier = file_path.split("/")[-1]

    # Uncomment the lines below depending on the aim of the processing
    # and which steps should be re-run.

    # Uncomment the line below to mark that all tasks should be re-run
    p.TaskGetInitialData(path=file_path, identifier=identifier).invalidate(confirm=False)

    # Uncomment the line below to mark that tasks related to BGs should be re-run
    # p.TaskGetBGData(path=file_path, identifier=identifier).invalidate(confirm=False)

    # Uncomment the lines below to mark that preprocessing tasks should be re-run
    # p.TaskPreprocessData(path=file_path, identifier=identifier).invalidate(confirm=False)
    # p.TaskPreprocessBGs(path=file_path, identifier=identifier).invalidate(confirm=False)

    # Uncomment the lines below to mark that tasks to identify abnormal boluses and/or
    # basals with a KNN model should be re-run
    # p.TaskGetAbnormalBoluses(path=file_path, model_type="knn", identifier=identifier).invalidate(confirm=False)
    # p.TaskGetAbnormalBasals(path=file_path, identifier=identifier).invalidate(confirm=False)

    # Uncomment the line below to mark that tasks to identify abnormal boluses with an
    # Isolation Forest model should be re-run
    # p.TaskGetAbnormalBoluses(path=file_path, model_type="isolation_forest", identifier=identifier).invalidate(confirm=False)

    # Uncomment the line below to find the abnormal boluses using k-nearest neighbors
    d6tflow.run(p.TaskGetAbnormalBoluses(path=file_path, model_type="knn", identifier=identifier))

    # Uncomment the line below to find the abnormal boluses using an Isolation Forest model
    # d6tflow.run(p.TaskGetAbnormalBoluses(path=file_path, model_type="isolation_forest", identifier=identifier))

    # Uncomment the line below to find the abnormal basals
    # d6tflow.run(p.TaskGetAbnormalBasals(path=file_path, identifier=identifier))

    # Uncomment the line below to process the dose data
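# A hedged usage sketch for process_one_file: iterate over a directory of exported
# files and run the pipeline once per file. The directory and glob pattern are
# illustrative assumptions, not taken from the original project.
import glob

for file_path in sorted(glob.glob("./data/*")):
    process_one_file(file_path)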
def test_task_inputLoad_multiple_multiple_tuple():
    @d6tflow.requires(TaskMultipleOutput1, TaskMultipleOutput2)
    class TaskMultipleInput2(d6tflow.tasks.TaskPqPandas):
        def run(self):
            data = self.inputLoad(as_dict=True)
            assert data[0]["output1"].equals(data[1]["output1"])
            assert data[0]["output2"].equals(data[1]["output2"])

            data1a, data1b = self.inputLoad()[0]
            data2a, data2b = self.inputLoad()[1]
            assert data1a.equals(data2a)
            assert data1b.equals(data2b)

            data1a, data1b = self.inputLoad(task=0)
            data2a, data2b = self.inputLoad(task=1)
            assert data1a.equals(data2a)
            assert data1b.equals(data2b)

            data1 = self.inputLoad(task=0, as_dict=True)
            data2 = self.inputLoad(task=1, as_dict=True)
            assert data1["output1"].equals(data2["output1"])
            assert data1["output2"].equals(data2["output2"])

    d6tflow.run(TaskMultipleInput2(), forced_all=True, forced_all_upstream=True, confirm=False)
def run(self):
    ball_start_pos = self.input()['ball_start_pos'].load()
    PPCFa = self.input()['pitch_control_frame'].load()['PPCFa']
    xgrid = self.input()['pitch_control_frame'].load()['xgrid']
    ygrid = self.input()['pitch_control_frame'].load()['ygrid']

    # initialise transition grid
    TP = np.zeros(shape=(len(ygrid), len(xgrid)))

    if np.sum(PPCFa[-1]) == 0:
        N_TP = TP
        self.save({'N_TP': N_TP, 'TP': TP, 'xgrid': xgrid, 'ygrid': ygrid})
    else:
        # calculate transition model at each location on the pitch
        for i in range(len(ygrid)):
            for j in range(len(xgrid)):
                target_position = np.array([xgrid[j], ygrid[i]])
                d6t.run(CalcTransitionProbabilityTarget(
                    target_position=tuple(target_position),
                    ball_start_pos=tuple(ball_start_pos),
                    PPCFa=PPCFa[-1, i, j]), execution_summary=False)
                TP[i, j] = CalcTransitionProbabilityTarget(
                    target_position=tuple(target_position),
                    ball_start_pos=tuple(ball_start_pos),
                    PPCFa=PPCFa[-1, i, j]).output().load()

        # normalize TP to unity
        N_TP = TP / np.sum(TP)
        self.save({'N_TP': N_TP, 'TP': TP, 'xgrid': xgrid, 'ygrid': ygrid})
def test_multiple_deps_on_input_load():
    # define 2 tasks that load raw data
    class Task1(d6tflow.tasks.TaskCache):
        persist = ['a1', 'a2']

        def run(self):
            df = pd.DataFrame({'a': range(3)})
            self.save({'a1': df, 'a2': df})  # quickly save dataframes

    class Task2(d6tflow.tasks.TaskCache):
        def run(self):
            df = pd.DataFrame({'a': range(3)})
            self.save(df)  # quickly save dataframe

    # define another task that depends on data from task1 and task2
    @d6tflow.requires(Task1, Task2)
    class Task3(d6tflow.tasks.TaskCache):
        def run(self):
            data = self.inputLoad()
            df1 = data[0]['a1']
            assert df1.equals(data[0]['a2'])
            df2 = data[1]
            assert df2.equals(df1)
            df = df1.join(df2, lsuffix='1', rsuffix='2')
            self.save(df)

    # Execute task including all its dependencies
    d6tflow.run(Task3(), forced_all_upstream=True, confirm=False)
def test_outputLoadAllMeta():
    class Task1(d6tflow.tasks.TaskCache):
        def run(self):
            df = pd.DataFrame({'a': range(3)})
            self.save(df)  # quickly save dataframe
            self.metaSave({'columns': 42})

    class Task2(Task1):
        pass

    @d6tflow.requires({'upstream1': Task1, 'upstream2': Task2})
    class Task3(d6tflow.tasks.TaskCache):
        multiplier = d6tflow.IntParameter(default=2)

        def run(self):
            meta = self.metaLoad()['upstream1']
            print(meta)
            print(meta['columns'])

            df1 = self.input()['upstream1'].load()  # quickly load input data
            df2 = self.input()['upstream2'].load()  # quickly load input data
            df = df1.join(df2, lsuffix='1', rsuffix='2')
            df['b'] = df['a1'] * self.multiplier  # use task parameter
            self.save(df)
            self.metaSave({'columns': 100})

    d6tflow.run(Task3())
    meta_all = Task3().outputLoadAllMeta()
    assert meta_all["Task1"]["columns"] == 42
    assert meta_all["Task2"]["columns"] == 42
    assert meta_all["Task3"]["columns"] == 100
def test_outputLoadMeta():
    class MetaSave(d6tflow.tasks.TaskCache):
        def run(self):
            df = pd.DataFrame({'a': range(3)})
            self.save(df)  # quickly save dataframe
            self.metaSave({'metadata': df})

    d6tflow.run(MetaSave())
    df = pd.DataFrame({'a': range(3)})
    assert MetaSave().outputLoadMeta()["metadata"].equals(df)
def test_task_inputLoad_single_single():
    @d6tflow.requires(TaskSingleOutput)
    class TaskSingleInput(d6tflow.tasks.TaskPqPandas):
        def run(self):
            data = self.inputLoad()
            assert data.equals(pd.DataFrame({'a': range(3)}))

    d6tflow.run(TaskSingleInput(), forced_all=True, forced_all_upstream=True, confirm=False)
def test_dynamic():
    class TaskCollector(d6tflow.tasks.TaskAggregator):
        def run(self):
            yield Task1()
            yield Task2()

    d6tflow.run(TaskCollector())
    assert Task1().complete() and Task2().complete() and TaskCollector().complete()
    assert TaskCollector().outputLoad()[0].equals(Task1().outputLoad())
    assert TaskCollector().outputLoad()[1][0].equals(Task2().outputLoad()[0])

    TaskCollector().invalidate(confirm=False)
    assert not (Task1().complete() and Task2().complete() and TaskCollector().complete())
def run(self):
    events = self.input()['events']['events'].load()
    home_ids = self.input()['ids'].load()['home_ids']
    away_ids = self.input()['ids'].load()['away_ids']

    # break the pitch down into a grid
    n_grid_cells_y = int(self.n_grid_cells_x * self.field_dimen[1] / self.field_dimen[0])
    xgrid = np.linspace(-self.field_dimen[0] / 2., self.field_dimen[0] / 2., self.n_grid_cells_x)
    ygrid = np.linspace(-self.field_dimen[1] / 2., self.field_dimen[1] / 2., n_grid_cells_y)

    RPCa_Home = np.zeros(shape=(len(events[events.Team == 'Home']), len(home_ids) + 1, len(ygrid), len(xgrid)))
    RPCd_Home = np.zeros(shape=(len(events[events.Team == 'Away']), len(home_ids) + 1, len(ygrid), len(xgrid)))
    RPCa_Away = np.zeros(shape=(len(events[events.Team == 'Away']), len(away_ids) + 1, len(ygrid), len(xgrid)))
    RPCd_Away = np.zeros(shape=(len(events[events.Team == 'Home']), len(away_ids) + 1, len(ygrid), len(xgrid)))

    home_rows = events[events.Team == 'Home'].index
    away_rows = events[events.Team == 'Away'].index
    home_int_fail = []
    away_int_fail = []

    for i in tqdm(range(len(home_rows)), desc="executing game{} home events".format(self.gameid)):
        d6t.settings.check_dependencies = False
        d6t.settings.log_level = 'ERROR'
        d6t.run(rpc.CalcRelevantPitchControlFrame(gameid=self.gameid,
                                                  rownumber=events.loc[home_rows[i], 'Start Frame'],
                                                  in_execution=True))
        RPCa_Home[i] = rpc.CalcRelevantPitchControlFrame(gameid=self.gameid,
                                                         rownumber=events.loc[home_rows[i], 'Start Frame'],
                                                         in_execution=True).output().load()['RPCa']
        RPCd_Away[i] = rpc.CalcRelevantPitchControlFrame(gameid=self.gameid,
                                                         rownumber=events.loc[home_rows[i], 'Start Frame'],
                                                         in_execution=True).output().load()['RPCd']
        if np.sum(RPCa_Home[i][-1]) == 0:
            home_int_fail.append(i)

    if len(home_int_fail) > 0:
        RPCa_Home = np.delete(RPCa_Home, obj=np.array(home_int_fail), axis=0)
        RPCd_Away = np.delete(RPCd_Away, obj=np.array(home_int_fail), axis=0)

    for i in tqdm(range(len(away_rows)), desc="executing game{} away events".format(self.gameid)):
        d6t.settings.check_dependencies = False
        d6t.settings.log_level = 'ERROR'
        d6t.run(rpc.CalcRelevantPitchControlFrame(gameid=self.gameid,
                                                  rownumber=events.loc[away_rows[i], 'Start Frame'],
                                                  in_execution=True))
        RPCa_Away[i] = rpc.CalcRelevantPitchControlFrame(gameid=self.gameid,
                                                         rownumber=events.loc[away_rows[i], 'Start Frame'],
                                                         in_execution=True).output().load()['RPCa']
        RPCd_Home[i] = rpc.CalcRelevantPitchControlFrame(gameid=self.gameid,
                                                         rownumber=events.loc[away_rows[i], 'Start Frame'],
                                                         in_execution=True).output().load()['RPCd']
        if np.sum(RPCa_Away[i][-1]) == 0:
            away_int_fail.append(i)

    if len(away_int_fail) > 0:
        RPCa_Away = np.delete(RPCa_Away, obj=np.array(away_int_fail), axis=0)
        RPCd_Home = np.delete(RPCd_Home, obj=np.array(away_int_fail), axis=0)

    self.save({'RPCa_Home': RPCa_Home, 'RPCd_Home': RPCd_Home,
               'RPCa_Away': RPCa_Away, 'RPCd_Away': RPCd_Away,
               'home_int_fail': home_int_fail, 'away_int_fail': away_int_fail})
def test_task_inputLoad_multiple_single():
    @d6tflow.requires({'input1': TaskSingleOutput1, 'input2': TaskSingleOutput2})
    class TaskMultipleInput(d6tflow.tasks.TaskPqPandas):
        def run(self):
            data1 = self.inputLoad()['input1']
            assert data1.equals(pd.DataFrame({'b': range(3)}))
            data2 = self.inputLoad()['input2']
            assert data2.equals(pd.DataFrame({'c': range(3)}))

    d6tflow.run(TaskMultipleInput(), forced_all=True, forced_all_upstream=True, confirm=False)
def test_metaLoad_single_input():
    class MetaSave(d6tflow.tasks.TaskCache):
        def run(self):
            df = pd.DataFrame({'a': range(3)})
            self.save(df)  # quickly save dataframe
            self.metaSave({'metadata': df})

    @d6tflow.requires(MetaSave)
    class MetaLoad(d6tflow.tasks.TaskCache):
        def run(self):
            meta = self.metaLoad()
            assert meta['metadata'].equals(pd.DataFrame({'a': range(3)}))

    d6tflow.run(MetaLoad())
def test_tasks(cleanup):
    t1 = Task1()
    t2 = Task2()
    assert not t1.complete()
    assert not t2.complete()

    t1.run()
    assert t1.complete()
    assert t1.invalidate()
    assert not t1.complete()

    assert d6tflow.run([Task2()])
    assert t1.complete()
    assert t2.complete()
    assert (pathdata / 'Task1' / 'Task1__99914b932b-data.pq').exists()
    assert (pathdata / 'Task2' / 'Task2__99914b932b-df2.pq').exists()

    t1.output().load().equals(df)
    t1.loadall()['data'].equals(df)
    t2.output()['df2'].load().equals(dfc2)
    t2.loadall()['df2'].equals(dfc2)

    # check downstream incomplete
    t1.invalidate()
    assert not t2.complete()
    d6tflow.settings.check_dependencies = False
    assert t2.complete()
    d6tflow.settings.check_dependencies = True
def test_task_inputLoad_single_multiple():
    @d6tflow.requires(TaskMultipleOutput)
    class TaskSingleInput2(d6tflow.tasks.TaskPqPandas):
        def run(self):
            output1, output2 = self.inputLoad()
            assert output1.equals(pd.DataFrame({'ax': range(3)}))
            assert output2.equals(pd.DataFrame({'ay': range(3)}))

            outputDict = self.inputLoad(as_dict=True)
            assert outputDict['output1'].equals(pd.DataFrame({'ax': range(3)}))
            assert outputDict['output2'].equals(pd.DataFrame({'ay': range(3)}))

    d6tflow.run(TaskSingleInput2(), forced_all=True, forced_all_upstream=True, confirm=False)
def read(selection, plottype):
    # selection = 'Daimler'
    runDate = get_runDate()
    d6tflow.run(Task_getTickers(runDate=runDate))
    companyTicker = Task_getTickers(runDate=runDate).output().load()

    ticker = selection  # companyTicker[selection]
    plot_dict = dict(runDate=runDate, stockticker=ticker, plottype=plottype)
    d6tflow.run(Task_getPlot(**plot_dict))
    fig: dict = Task_getPlot(**plot_dict).output().load()
    fig_serialized = json.dumps(fig, cls=plotly.utils.PlotlyJSONEncoder)
    return fig_serialized
def run(self, funcs_to_run, params: dict = None, multi_params: dict = None, *args, **kwargs):
    """
    Runs flow steps locally. See luigi.build for additional details.

    Parameters
    ----------
    funcs_to_run : function or list of functions
    params : dict
        Dictionary of parameters. Keys are parameter names and values are
        the parameter values.
    multi_params : dict
        Dictionary of named parameter sets. Keys are run names and values are
        parameter dictionaries; each step is instantiated and run once per set.

    Examples
    --------
    flow.run(func, params={'multiplier': 2})
    flow.run([func1, func2], params={'multiplier': 42})
    flow.run(func)
    """
    funcs_to_run = funcs_to_run if isinstance(funcs_to_run, list) else [funcs_to_run]

    if multi_params:
        self.multi_params = multi_params
        self.multi_params_tasks = {}
        for params in multi_params:
            for func in funcs_to_run:
                self._instantiate([func], params=multi_params[params])
                self.multi_params_tasks[params] = self.instantiated_tasks[func.__name__]
                d6tflow.run(self.multi_params_tasks[params], *args, **kwargs)
    else:
        # Reset to single params mode
        self.multi_params = None
        self.multi_params_tasks = {}
        self._instantiate(funcs_to_run, params=params)
        d6tflow.run(list(self.instantiated_tasks.values()), *args, **kwargs)
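# A hedged usage sketch for multi_params, based on the loop above: each key names a
# parameter set and each value is a params dict, and every step function is
# instantiated and run once per set. The flow instance and step function names are
# illustrative assumptions, not part of the original code.
# flow.run(train_model, params={'multiplier': 2})
# flow.run([train_model, evaluate_model],
#          multi_params={'run_small': {'multiplier': 2},
#                        'run_large': {'multiplier': 42}})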
def test_tasks(cleanup):
    t1 = Task1()
    t2 = Task2()
    assert not t1.complete()
    assert not t2.complete()

    t1.run()
    assert t1.complete()
    assert t1.invalidate(confirm=False)
    assert not t1.complete()

    assert d6tflow.run([Task2()])
    assert t1.complete()
    assert t2.complete()
    assert (pathdata / 'Task1' / 'Task1__99914b932b-data.parquet').exists()
    assert (pathdata / 'Task2' / 'Task2__99914b932b-df2.parquet').exists()

    # load outputs
    t1.output().load().equals(df)
    t1.outputLoad(as_dict=True).equals(df)
    t1.outputLoad().equals(df)
    t2.output()['df2'].load().equals(dfc2)
    t2.outputLoad(as_dict=True)['df2'].equals(dfc2)
    df2, df4 = t2.outputLoad()
    df2.equals(dfc2)
    df2, = t2.outputLoad(keys=['df2'])
    df2.equals(dfc2)

    # test inputs
    class TaskMultiInput(d6tflow.tasks.TaskCache):
        def requires(self):
            return Task1()

        def run(self):
            dft1 = self.inputLoad()
            assert dft1.equals(df)

    TaskMultiInput().run()

    class TaskMultiInput(d6tflow.tasks.TaskCache):
        def requires(self):
            return Task1(), Task1()

        def run(self):
            dft1, dft2 = self.inputLoad()
            assert dft1.equals(dft2)

    TaskMultiInput().run()

    class TaskMultiInput(d6tflow.tasks.TaskCache):
        def requires(self):
            return {1: Task1(), 2: Task1()}

        def run(self):
            dft1, dft2 = self.inputLoad()
            assert dft1.equals(dft2)

    TaskMultiInput().run()

    # check downstream incomplete
    t1.invalidate(confirm=False)
    assert not t2.complete()
    d6tflow.settings.check_dependencies = False
    assert t2.complete()
    d6tflow.settings.check_dependencies = True
def run(config_path):
    config = handle_config(config_path)
    d6tflow.set_dir(config['run_dir'] + 'data/')

    bkt_params = backtest_generator.BacktestParameters(
        config=config,
        ds_name=config['ds_name'],
        base_dir=config['base_dir'],
        table_name=config['table_name'],
        all_features=config['dataset_generator_params']['all_features'],
        dataset_filter=config['dataset_filter'],
        analysis_variables=config['analysis_variables'],
        date_col=config['original_params']['date_col'])

    parameter_generator = lambda: bkt_params.create_parameters(
        initial_training_month=config['initial_training_month'],
        last_predicting_month=config['last_predicting_month'],
        lead_time=config['dataset_generator_params']['ld'],
        lead_time_mode=config['dataset_generator_params']['ld_mode'],
        training_length=config['dataset_generator_params']['tl'],
        training_length_mode=config['dataset_generator_params']['tl_mode'],
        test_length=config['dataset_generator_params']['testl'],
        test_length_mode=config['dataset_generator_params']['testl_mode'],
        stride_length=config['dataset_generator_params']['sl'],
        stride_length_mode=config['dataset_generator_params']['sl_mode'])

    task_generator = lambda params: tasks.TaskModelMetrics(**params)
    backtest_tasks = backtest_generator.CreateTaskList(task_constructor=task_generator,
                                                       parameter_generator=parameter_generator)

    params = parameter_generator()
    bkt_task = tasks.BacktestReport(task_bkt_params=params)
    d6tflow.run(bkt_task, workers=config['workers'])

    create_folder(config['run_dir'] + 'report/')
    metrics_df = bkt_task.output()['full_metrics'].load()

    # save the .csv for the frontend
    path_final = config['run_dir'] + 'df_backtest_final.csv'
    metrics_df.to_csv(path_final, index=False)
def test_metaSave_persists_after_run():
    class MetaSave(d6tflow.tasks.TaskCache):
        def run(self):
            df = pd.DataFrame({'a': range(3)})
            self.save(df)  # quickly save dataframe
            self.metaSave({'metadata': df})

    ms = MetaSave()
    d6tflow.run(ms)

    # From disk
    metadata = pickle.load(open(ms._get_meta_path(ms), "rb"))
    assert metadata['metadata'].equals(pd.DataFrame({'a': range(3)}))

    # From object
    assert ms.metadata['metadata'].equals(pd.DataFrame({'a': range(3)}))
def test_requires():
    class Task1(d6tflow.tasks.TaskCache):
        def run(self):
            df = pd.DataFrame({'a': range(3)})
            self.save(df)  # quickly save dataframe

    class Task2(Task1):
        pass

    # define another task that depends on data from task1 and task2
    @d6tflow.requires({'a': Task1, 'b': Task2})
    class Task3(d6tflow.tasks.TaskCache):
        def run(self):
            df1 = self.input()['a'].load()  # quickly load input data
            df2 = self.input()['b'].load()  # quickly load input data
            assert df1.equals(pd.DataFrame({'a': range(3)}))

    task3 = Task3()
    d6tflow.run(task3)
def run(self):
    centers = self.input().load()['centers']
    rpc_surface = centers[self.clusterid].reshape((50, 32), order='F')

    d6t.run(PlotPitch(), forced_all=True)
    pitch = PlotPitch().output().load()
    fig = pitch['fig']
    ax = pitch['ax']

    # plot axis labels
    ax.set_xlabel('x (m)', fontsize=20)
    ax.set_ylabel('y (m)', fontsize=20)
    ax.tick_params(labelsize=14)

    # break the pitch down into a grid
    n_grid_cells_y = int(self.n_grid_cells_x * self.field_dimen[1] / self.field_dimen[0])
    xgrid = np.linspace(-self.field_dimen[0] / 2., self.field_dimen[0] / 2., self.n_grid_cells_x)
    ygrid = np.linspace(-self.field_dimen[1] / 2., self.field_dimen[1] / 2., n_grid_cells_y)

    ax.imshow(np.flipud(rpc_surface),
              extent=(np.amin(xgrid), np.amax(xgrid), np.amin(ygrid), np.amax(ygrid)),
              interpolation='hanning', vmin=0.0, cmap=self.surface_color, alpha=0.625)
    norm = colors.Normalize(vmin=0, vmax=np.max(rpc_surface))
    cb = fig.colorbar(plt.cm.ScalarMappable(norm=norm, cmap=self.surface_color),
                      ax=ax, alpha=0.625, shrink=0.80)
    cb.ax.tick_params(labelsize=14)
    ax.set_title('Relevant Pitch Control - Cluster {} Centroid'.format(self.clusterid),
                 fontdict={'fontsize': 30})

    self.save({'fig': fig, 'ax': ax})
def __init__(self, tasks, pipename, write_dir='.', write_filename_tasks='tasks_d6tpipe.py',
             write_filename_run='run_d6tpipe.py', run_load_values=True, run=False, run_params=None):
    # todo NN: copy = False  # copy task output to pipe

    if not isinstance(tasks, (list,)):
        tasks = [tasks]

    if run:
        run_params = {} if run_params is None else run_params
        d6tflow.run(tasks, **run_params)

    self.tasks = tasks
    self.pipename = pipename
    self.write_dir = pathlib.Path(write_dir)
    self.write_filename_tasks = write_filename_tasks
    self.write_filename_run = write_filename_run
    self.run_load_values = run_load_values

    # file templates
    self.tmpl_tasks = '''
import d6tflow
import luigi

{% for task in tasks -%}
class {{task.name}}({{task.class}}):
    external=True
    persist={{task.obj.persist}}
    {% for param in task.params -%}
    {{param.name}}={{param.class}}(default={{param.value}})
    {% endfor %}
{% endfor %}
'''

    self.tmpl_run = '''
def test_metaLoad_multiple_input_tuple():
    class MetaSave(d6tflow.tasks.TaskCache):
        def run(self):
            df = pd.DataFrame({'a': range(3)})
            self.save(df)  # quickly save dataframe
            self.metaSave({'metadata': df})

    class MetaSave2(MetaSave):
        pass

    @d6tflow.requires(MetaSave, MetaSave2)
    class MetaLoad(d6tflow.tasks.TaskCache):
        def run(self):
            meta = self.metaLoad()
            assert meta[0]['metadata'].equals(pd.DataFrame({'a': range(3)}))
            assert meta[1]['metadata'].equals(pd.DataFrame({'a': range(3)}))

    d6tflow.run(MetaLoad())
def test_pipes_base(cleanup_pipe):
    import d6tflow.pipes
    d6tflow.pipes.init(cfg['d6tpipe_pipe1'], profile=cfg['d6tpipe_profile'])
    t1 = Task1()
    pipe1 = d6tflow.pipes.get_pipe()
    pipedir = pipe1.dirpath
    t1filepath = t1.output().path
    t1file = str(PurePosixPath(t1filepath.relative_to(pipedir)))

    assert d6tflow.run(t1)
    assert t1.complete()
    with fuckit:
        pipe1._pullpush_luigi([t1file], op='remove')
    assert pipe1.push_preview() == [t1file]
    assert pipe1.push() == [t1file]
    assert pipe1.scan_remote(cached=False) == [t1file]

    # cleanup
    pipe1.delete_files(confirm=False, all_local=True)
    assert pipe1.scan_remote(cached=False) == []
def test_execute(cleanup):
    # execute
    t1 = Task1()
    t2 = Task2()
    t3 = Task3()
    [t.invalidate(confirm=False) for t in [t1, t2, t3]]
    d6tflow.run(t3)
    assert all(t.complete() for t in [t1, t2, t3])

    t1.invalidate(confirm=False)
    t2.invalidate(confirm=False)
    assert not t3.complete()  # cascade upstream
    d6tflow.settings.check_dependencies = False
    assert t3.complete()  # no cascade upstream
    d6tflow.run([t3])
    assert t3.complete() and not t1.complete()
    d6tflow.settings.check_dependencies = True
    d6tflow.run([t3])
    assert all(t.complete() for t in [t1, t2, t3])

    # forced single
    class TaskTest(d6tflow.tasks.TaskCachePandas):
        def run(self):
            self.save(df)

    d6tflow.run(TaskTest())
    assert TaskTest().output().load().equals(df)

    class TaskTest(d6tflow.tasks.TaskCachePandas):
        def run(self):
            self.save(df * 2)

    d6tflow.run(TaskTest())
    assert TaskTest().output().load().equals(df)
    d6tflow.run(TaskTest(), forced=TaskTest(), confirm=False)
    assert TaskTest().output().load().equals(df * 2)
    d6tflow.run([TaskTest()], forced=[TaskTest()], confirm=False)

    # forced flow
    mtimes = [t1.output().path.stat().st_mtime,
              t2.output()['df2'].path.stat().st_mtime]
    d6tflow.run(t3, forced=t1, confirm=False)
    assert t1.output().path.stat().st_mtime > mtimes[0]
    assert t2.output()['df2'].path.stat().st_mtime > mtimes[1]

    # forced_all => run task3 only
    mtimes = [t1.output().path.stat().st_mtime,
              t2.output()['df2'].path.stat().st_mtime,
              t3.output().path.stat().st_mtime]
    d6tflow.run(t3, forced_all=True, confirm=False)
    assert t1.output().path.stat().st_mtime == mtimes[0]
    assert t2.output()['df2'].path.stat().st_mtime == mtimes[1]
    assert t3.output().path.stat().st_mtime > mtimes[2]

    # forced_all_upstream => run all tasks
    mtimes = [t1.output().path.stat().st_mtime,
              t2.output()['df2'].path.stat().st_mtime,
              t3.output().path.stat().st_mtime]
    d6tflow.run(t3, forced_all_upstream=True, confirm=False)
    assert t1.output().path.stat().st_mtime > mtimes[0]
    assert t2.output()['df2'].path.stat().st_mtime > mtimes[1]
    assert t3.output().path.stat().st_mtime > mtimes[2]

    # downstream
    assert d6tflow.run(t3)
    d6tflow.invalidate_downstream(t2, t3, confirm=False)
    assert not (t2.complete() and t3.complete()) and t1.complete()

    # upstream
    assert d6tflow.run(t3)
    d6tflow.invalidate_upstream(t3, confirm=False)
    assert not all(t.complete() for t in [t1, t2, t3])
import d6tflow
import d6tflow.pipes
import cfg, tasks

d6tflow.pipes.init('top10-mistakes-stats', local_pipe=True)  # save flow output to local pipe directory
pipe = d6tflow.pipes.get_pipe()
pipe.delete_files_local(confirm=False, delete_all=True)  # start clean

# run tasks
d6tflow.run(tasks.ModelOutliers())  # output automatically saved in pipe directory
d6tflow.run(tasks.ModelTS())
d6tflow.run(tasks.OLSvsRF())

# push output
do_push = True
if do_push:
    d6tflow.pipes.init('top10-mistakes-stats', reset=True)  # connect to remote pipe
    pipe = d6tflow.pipes.get_pipe()
    pipe.delete_files_remote(confirm=False)  # start clean
    pipe.pull(cached=False)
    pipe.push()
import d6tflow
import cfg, tasks
# import visualize

# d6tflow.preview(tasks.TaskTrain())
d6tflow.run(tasks.TaskPreprocess(), forced_all_upstream=True, confirm=False)
quit()

# visualize.accuracy()
# visualize.plot_importances()
#
# d6tflow.run(tasks.TaskTrain(do_preprocess=False))
# visualize.accuracy(do_preprocess=False)
#
# d6tflow.invalidate_downstream(tasks.TaskGetData(), tasks.TaskTrain())
#
# d6tflow.preview(tasks.TaskTrain())
# d6tflow.run(tasks.TaskTrain())