def run(config_path):
    """Run the production pipeline end-to-end.

    Downloads the YAML config from a GCS bucket, executes the d6tflow
    production report task, loads the trained model and persists it.

    Args:
        config_path (str): GCS URI of the config file
            (assumed to look like 'gs://bucket/path/config.yaml').

    Returns:
        bool: True when the pipeline finished.
    """
    local_path = './experimentos/producao/'
    filename = 'config.yaml'
    # Bucket name extracted from the GCS URI: drop the 'gs://' scheme
    # (5 chars) and keep the first path component.
    # NOTE(review): assumes config_path always starts with 'gs://' — confirm with callers.
    bucket_name_download = config_path[5:].split('/')[0]
    Utils.download_file_from_gcp(config_path, local_path=local_path,
                                 filename=filename,
                                 bucket_name=bucket_name_download)
    # Environment-specific settings (development vs. production).
    config = config_pre_tratamento(local_path + filename)
    # (removed dead local: `project = config['project']` was never used)
    config['caminho_saida_dados'] = local_path
    d6tflow.set_dir(config['caminho_saida_dados'])
    params = get_tasks(config)
    t = tasks.TaskPrdReport(**params)
    d6tflow.preview(t)
    d6tflow.run(t, workers=config['workers'])
    # Load the model trained by the upstream training task of this flow.
    model = tasks.TaskTrainModel(
        task_engineer_params=params['task_engineer_params'],
        task_te_params=params['task_te_params'],
        task_ps_params=params['task_ps_params'],
        task_model_params=params['task_model_params']).output().load()
    salvar_modelo(t, model, config)
    return True
def set_default(name):
    """Activate the pipe called *name* as the default pipe.

    Also repoints the d6tflow working directory at that pipe's directory.

    Args:
        name (str): name of the pipe to activate.
    """
    target_pipe = get_pipe(name)
    d6tflow.cache.pipe_default_name = name
    # Clear the init flag around set_dir so changing the directory does not
    # re-trigger pipe initialization, then restore it.
    d6tflow.settings.isinitpipe = False
    d6tflow.set_dir(target_pipe.dir)
    d6tflow.settings.isinitpipe = True
def init(default_pipe_name, profile=None, local_pipe=False, local_api=False, reset=False, api=None, set_dir=True, api_args=None, pipe_args=None):
    """Initialize d6tpipe and register the default pipe.

    Args:
        default_pipe_name (str): name of pipe to store results. Override by
            setting Task.pipe attribute
        profile (str): name of d6tpipe profile to get api if api not provided
        local_pipe (bool): use `PipeLocal()`
        local_api (bool): use `APILocal()`
        reset (bool): reset api and pipe connection
        api (obj): d6tpipe api object. if not provided will be loaded
        set_dir (bool): if True, set d6tflow directory to default pipe directory
        api_args (dir): arguments to pass to api
        pipe_args (dir): arguments to pass to pipe
    """
    # Already initialized and no reset requested: nothing to do.
    if d6tflow.settings.isinitpipe and not reset:
        return
    d6tflow.cache.pipe_default_name = default_pipe_name
    api_kwargs = {} if api_args is None else api_args
    pipe_kwargs = {} if pipe_args is None else pipe_args
    if local_pipe:
        pipe_obj = d6tpipe.PipeLocal(default_pipe_name, profile=profile)
    else:
        # Resolve the API object first (explicit > local > remote client),
        # then build the pipe on top of it.
        if api is not None:
            d6tflow.cache.api = api
        elif local_api:
            d6tflow.cache.api = d6tpipe.APILocal(profile=profile, **api_kwargs)
        else:
            d6tflow.cache.api = d6tpipe.APIClient(profile=profile, **api_kwargs)
        pipe_obj = d6tpipe.Pipe(d6tflow.cache.api, default_pipe_name, **pipe_kwargs)
    d6tflow.cache.pipes[default_pipe_name] = pipe_obj
    if set_dir:
        # Toggle the init flag around set_dir to avoid re-entrant init.
        d6tflow.settings.isinitpipe = False
        d6tflow.set_dir(pipe_obj.dir)
        d6tflow.settings.isinitpipe = True
def run(config_path):
    """Run the full backtest flow and export the consolidated metrics CSV."""
    config = handle_config(config_path)
    run_dir = config['run_dir']
    d6tflow.set_dir(run_dir + 'data/')

    dsg = config['dataset_generator_params']
    bkt_params = backtest_generator.BacktestParameters(
        config=config,
        ds_name=config['ds_name'],
        base_dir=config['base_dir'],
        table_name=config['table_name'],
        all_features=dsg['all_features'],
        dataset_filter=config['dataset_filter'],
        analysis_variables=config['analysis_variables'],
        date_col=config['original_params']['date_col'])

    def parameter_generator():
        # One parameter set per backtest window, driven by the configured
        # lead time / training length / test length / stride settings.
        return bkt_params.create_parameters(
            initial_training_month=config['initial_training_month'],
            last_predicting_month=config['last_predicting_month'],
            lead_time=dsg['ld'],
            lead_time_mode=dsg['ld_mode'],
            training_length=dsg['tl'],
            training_length_mode=dsg['tl_mode'],
            test_length=dsg['testl'],
            test_length_mode=dsg['testl_mode'],
            stride_length=dsg['sl'],
            stride_length_mode=dsg['sl_mode'])

    def task_generator(params):
        return tasks.TaskModelMetrics(**params)

    backtest_tasks = backtest_generator.CreateTaskList(
        task_constructor=task_generator,
        parameter_generator=parameter_generator)
    params = parameter_generator()
    bkt_task = tasks.BacktestReport(task_bkt_params=params)
    d6tflow.run(bkt_task, workers=config['workers'])

    create_folder(run_dir + 'report/')
    metrics_df = bkt_task.output()['full_metrics'].load()
    # Saving the .csv for the frontend
    metrics_df.to_csv(run_dir + 'df_backtest_final.csv', index=False)
import pytest
import os
import glob
import shutil
from pathlib import Path, PurePosixPath
import pandas as pd
import fuckit
import warnings
import luigi
import d6tcollect
d6tcollect.submit = False
import d6tflow

# Direct all task outputs to a dedicated test-data directory.
pathdata = d6tflow.set_dir('tests-data/')
d6tflow.settings.log_level = 'WARNING'

# test data: base frame plus two derived frames (values x2 and x4).
df = pd.DataFrame({'a': range(10)})
dfc2 = df.copy()
dfc2['a'] = dfc2['a'] * 2
dfc4 = df.copy()
dfc4['a'] = dfc4['a'] * 2 * 2


# FIX: `scope` belongs in the decorator. The original declared
# `def cleanup(scope="module")`, which pytest interprets as a fixture
# request named 'scope' rather than a fixture scope.
@pytest.fixture(scope="module")
def cleanup():
    """Wipe and recreate the test-data directory around each use."""
    with fuckit:
        # Best-effort removal: fuckit suppresses errors (e.g. dir missing).
        shutil.rmtree(pathdata)
    pathdata.mkdir(exist_ok=True)
    yield True
def run(config_path):
    """Run hyperparameter optimization over the backtest pipeline.

    Defines an objective (`model_fn`) that runs a full backtest for a
    candidate parameter set and returns a weighted metric, then drives it
    with hyperopt via `gridsearch_generator.run_opt` and plots a report.

    Args:
        config_path: path to the run configuration (parsed by handle_config).
    """
    config = handle_config(config_path)

    # Objective function minimized by the optimizer.
    def model_fn(parameters):
        print(
            "################################################################################"
        )
        print(parameters)
        print(
            "################################################################################"
        )
        # Work on a deep copy so each trial gets a pristine config.
        config_cp = gridsearch_generator.update_config_tunning(
            copy.deepcopy(config), parameters)
        if len(config_cp['dataset_generator_params']['all_features']) > 0:
            bkt_params = backtest_generator.BacktestParameters(
                config=config_cp,
                ds_name=config_cp['ds_name'],
                # Timestamped subdir keeps each trial's artifacts separate.
                base_dir=config_cp['base_dir'] +
                "{0}/".format(datetime.now().strftime("%Y%m%d%H%M%f")),
                table_name=config_cp['table_name'],
                all_features=gridsearch_generator.get_all_possible_features(
                    config_cp['variable_groups']),
                dataset_filter=config_cp['dataset_filter'],
                analysis_variables=config_cp['analysis_variables'],
                date_col=config_cp['original_params']['date_col'])
            parameter_generator = lambda: bkt_params.create_parameters(
                initial_training_month=config_cp['initial_training_month'],
                last_predicting_month=config_cp['last_predicting_month'],
                lead_time=config_cp['dataset_generator_params']['ld'],
                lead_time_mode=config_cp['dataset_generator_params']['ld_mode'],
                training_length=config_cp['dataset_generator_params']['tl'],
                training_length_mode=config_cp['dataset_generator_params']['tl_mode'],
                test_length=config_cp['dataset_generator_params']['testl'],
                test_length_mode=config_cp['dataset_generator_params']['testl_mode'],
                stride_length=config_cp['dataset_generator_params']['sl'],
                stride_length_mode=config_cp['dataset_generator_params']['sl_mode'])
            task_generator = lambda params: tasks.TaskModelMetrics(**params)
            backtest_tasks = backtest_generator.CreateTaskList(
                task_constructor=task_generator,
                parameter_generator=parameter_generator)
            params = parameter_generator()
            bkt_task = tasks.BacktestReport(task_bkt_params=params)
            d6tflow.run(bkt_task, workers=config_cp['workers'])
            # FIX: the metrics frame was loaded three times and the first
            # calculate_metric() call's result was discarded. Load once,
            # compute once.
            full_metrics = bkt_task.output()['full_metrics'].load()
            gridsearch_generator.save_metrics(
                full_metrics, parameters, file_path=config_cp['run_dir'])
            metric = gridsearch_generator.calculate_metric(
                full_metrics, config_cp['metric_weights'])
        else:
            # No features selected: worst possible score.
            metric = np.inf
        print(
            "################################################################################"
        )
        print('Metric', metric)
        print(
            "################################################################################"
        )
        return metric

    d6tflow.set_dir(config['run_dir'] + 'data/')
    create_folder(config['run_dir'] + 'report/')
    opt_filename, best_pars = gridsearch_generator.run_opt(
        model_fn=model_fn,
        parameter_space=config['space'],
        max_iter=config['hyperopt_max_iterations'],
        save_iter=config['hyperopt_save_iterations'],
        load=config['hyperopt_load'],
        path=config['run_dir'])
    ## Plotting optimization report
    report = GridSearchReport.Report(opt_file=opt_filename,
                                     report_path=config['run_dir'] + 'report/',
                                     test_id=config['opt_name'])
    report.plot_optimization_overview()
    report.plot_k_best(k=10)
def run(config_path):
    """Train a model and produce real vs. model elasticity curves as CSVs.

    Builds the train/eval, predict/score, engineering, model and elasticity
    parameter dicts, runs TaskElasticityReport and writes the resulting
    real/model elasticity frames under the run directory.

    Args:
        config_path: path to the run configuration (parsed by handle_config).
    """
    config = handle_config(config_path)
    d6tflow.set_dir(config['run_dir'] + 'data/')

    # NOTE: 'taks_te_params' (sic) is the keyword TaskElasticityReport
    # expects — do not "fix" the spelling here without changing the task.
    taks_te_params = {
        'method': config_methods.train_eval_creator_name,
        'create_ds': {'ds_name': config['ds_name'],
                      'dir': config['base_dir'],
                      'table_name': config['table_name']},
        'ds_params': {'period_begin': config['train_begin'],
                      'period_end': config['train_end'],
                      'cols': config['all_features'],
                      'where': config['dataset_filter'],
                      'date_col': config['original_params']['date_col']},
        # Cache key: year+week of the training end date.
        'key': config['train_end'].strftime("%Y%W")
    }
    task_ps_params = {
        'method': config_methods.predict_score_creator_name,
        'create_ds': {'ds_name': config['ds_name'],
                      'dir': config['base_dir'],
                      'table_name': config['table_name']},
        'ds_params': {'period_begin': config['test_begin'],
                      'period_end': config['test_end'],
                      'cols': config['all_features'],
                      'analysis_variables': config['analysis_variables'],
                      'where': config['dataset_filter'],
                      'date_col': config['original_params']['date_col']},
        'key': config['train_end'].strftime("%Y%W")
    }
    task_engineer_params = {
        'method': config_methods.engineer_creator_name,
        'engineer': {'small_categorical': config['small_categorical'],
                     'large_categorical': config['large_categorical'],
                     'variable_set': config['all_features'],
                     'date': config['original_params']['date_col']}
    }
    key = 'teste'
    task_model_params = {
        'method': config_methods.model_creator_name,
        'model': {
            'model_params': {'colsample_bytree': 0.75,
                             'eta': 0.375,
                             'gamma': 0.15000000000000002,
                             'max_depth': 5,
                             'min_child_weight': 7.0,
                             'n_estimators': config['iterations'],
                             'reg_alpha': 2.5500000000000003,
                             # FIX: key was 'reg_lambda ' (trailing space),
                             # so XGBoost never received this parameter.
                             'reg_lambda': 0.12,
                             'subsample': 0.75,
                             'verbose': 20,
                             'eval_metric': 'logloss',  # 'auc',
                             'objective': 'binary:logistic',
                             'booster': "gbtree",
                             'tree_method': 'gpu_hist',
                             'gpu_id': 0,
                             'random_seed': 42},
            'ds_params': {'date_col': config['original_params']['date_col'],
                          'target': config['original_params']['target'],
                          'hash_col': config['original_params']['hash_col']}},
        'build_params': {'name': key}
    }
    task_el_ds_params = {
        'date_col': config['original_params']['date_col'],
        'elasticity_begin': config['elasticity_begin'],
        'elasticity_end': config['elasticity_end'],
        'n_min': config['factor_min'],
        'n_max': config['factor_max'],
        'qtd_pass': config['num_points'],
    }
    task_elasticity_params = {
        'model_name': key,
        'predict_params': {'elasticity_transform': False,
                           'elasticity_factor': 1,
                           'elasticity_col': 'TESTE'}
    }
    task_elasticity_report_params = {
        'model_name': key,
        'n_min': config['factor_min'],
        'n_max': config['factor_max'],
        'real_qtd_pass': config['real_num_points'],
        'qtd_pass': config['num_points'],
        'target': config['original_params']['target'],
        'output': config['params']['model_output']
    }

    t = el_tasks.TaskElasticityReport(
        taks_te_params=taks_te_params,
        task_ps_params=task_ps_params,
        task_engineer_params=task_engineer_params,
        task_model_params=task_model_params,
        task_el_ds_params=task_el_ds_params,
        task_elasticity_params=task_elasticity_params,
        task_elasticity_report_params=task_elasticity_report_params)
    d6tflow.run(t, workers=config['workers'])

    real_df = t.output()['real_df'].load()
    model_df = t.output()['model_df'].load()
    real_df.to_csv(config['run_dir'] + 'real_elasticity.csv', index=False)
    model_df.to_csv(config['run_dir'] + 'model_elasticity.csv', index=False)
import multiprocessing import d6tflow import d6tcollect from os import listdir from os.path import exists, isdir, join import optimized_analysis_pipeline as p d6tcollect.submit = False # Turn off automatic error reporting d6tflow.settings.log_level = "ERROR" # Decrease console printout d6tflow.set_dir("../results") # Save output to a results folder def find_csv_filenames(path_to_dir, suffix=".csv"): """ Find all csv filenames in given dir """ file_names = listdir(path_to_dir) return [ join(path_to_dir, filename) for filename in file_names if filename.endswith(suffix) ] def is_valid(path): """ Check if a file is valid, and print an appropriate error message """ if not exists(path): print(path, "is invalid file path") return False return True def is_directory(path):
def run(config_path):
    """Run a single train/predict/metrics pass and export the base-model metrics.

    Derives the time windows from the configured lead/training/test lengths,
    builds the parameter dicts for TaskModelMetrics, runs it via d6tflow and
    writes the resulting metrics frame to `<run_dir>/base_model.csv`.

    Args:
        config_path: path to the run configuration (parsed by handle_config).
    """
    config = handle_config(config_path)
    d6tflow.set_dir(config['run_dir'] + 'data/')
    available_gpu = [0, 1]  ## TODO USE AS PARAMETER
    # Window boundaries derived from lead time (ld), training length (tl)
    # and test length (testl) plus their modes.
    train_end, prediction_period, prediction_period_end, test_begin, test_end = BacktestParameters._get_times(
        config['initial_training_month'],
        config['dataset_generator_params']['ld'],
        config['dataset_generator_params']['ld_mode'],
        config['dataset_generator_params']['tl'],
        config['dataset_generator_params']['tl_mode'],
        config['dataset_generator_params']['testl'],
        config['dataset_generator_params']['testl_mode'])
    # NOTE(review): 'taks_te_params' (sic) is the keyword TaskModelMetrics
    # expects (see `parameters` below) — do not rename here alone.
    taks_te_params = {
        'method': config['dataset_generator_params']['method'],
        'create_ds': {
            'ds_name': config['ds_name'],
            'dir': config['base_dir'],
            'table_name': config['table_name'],
            # BigQuery-style `project.dataset.table` split (backticks stripped).
            'project': config['table_name'].replace("`", "").split(".")[0],
            'dataset': config['table_name'].replace("`", "").split(".")[1]},
        'ds_params': {
            'period_begin': config['initial_training_month'],
            'period_end': train_end,
            'cols': config['dataset_generator_params']['all_features'],
            'where': config['dataset_filter'],
            'date_col': config['original_params']['date_col'],
            'target': config['original_params']['target']},
        # Cache key: year+week of the training end date.
        'key': train_end.strftime("%Y%W")
    }
    task_ps_params = {
        'method': config['dataset_generator_params']['method'],
        'create_ds': {
            'ds_name': config['ds_name'],
            'dir': config['base_dir'],
            'table_name': config['table_name'],
            'project': config['table_name'].replace("`", "").split(".")[0],
            'dataset': config['table_name'].replace("`", "").split(".")[1]},
        'ds_params': {
            'period_begin': test_begin,
            'period_end': test_end,
            'cols': config['dataset_generator_params']['all_features'],
            'analysis_variables': config['analysis_variables'],
            'where': config['dataset_filter'],
            'date_col': config['original_params']['date_col'],
            'target': config['original_params']['target']},
        'key': train_end.strftime("%Y%W")
    }
    task_engineer_params = {
        'method': sorted(list(config['task_engineer_params']['methods'].keys())),
        'engineer': BacktestParameters.get_engineer_params(config)}
    task_model_params = {
        'method': config['task_model_params']['method'],
        'model': {'model_params': config['task_model_params']['model_params'],
                  'ds_params': {'date_col': config['original_params']['date_col'],
                                'target': config['original_params']['target'],
                                'hash_col': config['original_params']['hash_col']}},
        'build_params': {'name': 'single_run'}}
    # Pin the model to the first available GPU.
    task_model_params['model']['model_params']['gpu_id'] = available_gpu[0]
    task_predict_params = {
        'predict_params': {
            'elasticity_transform': False,
            'elasticity_factor': 1,
            'elasticity_col': config['elasticity_variables']}}
    task_metrics_params = {
        'method': sorted(list(config['task_metric_params']['methods'].keys())),
        'metrics': config['task_metric_params']['methods'],
        'model_name': config['model_name'],
        # In-sample vs. out-of-sample error windows.
        'score_params': {'erro_train': (config['initial_training_month'],
                                        config['dataset_generator_params']['tl']),
                         'erro_oos': (prediction_period,
                                      config['dataset_generator_params']['ld'])}
    }
    parameters = {
        'taks_te_params': taks_te_params,
        'task_ps_params': task_ps_params,
        'task_engineer_params': task_engineer_params,
        'task_model_params': task_model_params,
        'task_predict_params': task_predict_params,
        'task_metrics_params': task_metrics_params}
    tk_metrics = tasks.TaskModelMetrics(**parameters)
    d6tflow.run(tk_metrics, workers=config['workers'])
    create_folder(config['run_dir'] + 'report/')
    metrics_df = tk_metrics.output()['full_metrics'].load()
    metrics_df.to_csv(config['run_dir'] + 'base_model.csv', index=False)
def run(config):
    """Re-run the top-k feature-selection trials from a saved hyperopt pickle.

    Loads the trials file, decodes the k best parameter sets, re-evaluates
    each with `model_fn` (a full backtest), and dumps a parsed search
    summary to `<run_dir>/search.pkl`.

    Args:
        config (dict): run configuration with hyperopt and backtest settings.
    """
    # Objective: run a backtest for one decoded parameter set.
    def model_fn(parameters):
        print("################################################################################")
        print(parameters)
        print("################################################################################")
        selected_features = gridsearch_generator.get_features(parameters, config['variable_groups'])
        if len(selected_features) > 0:
            task_engineer_params = {
                'method': config_methods.engineer_creator_name,
                'engineer': {'small_categorical': config['small_categorical'],
                             'large_categorical': config['large_categorical'],
                             'variable_set': selected_features}
            }
            key = 'teste'
            task_model_params = {
                'method': config_methods.model_creator_name,
                'model': {'variable_set': selected_features,
                          'log_experiment': config['save_experiment'],
                          'model_params': {'colsample_bytree': 0.75,
                                           'eta': 0.375,
                                           'gamma': 0.15000000000000002,
                                           'max_depth': 5,
                                           'min_child_weight': 7.0,
                                           'n_estimators': parameters['iterations'],
                                           'reg_alpha': 2.5500000000000003,
                                           # FIX: key was 'reg_lambda ' (trailing
                                           # space) — XGBoost never saw it.
                                           'reg_lambda': 0.12,
                                           'subsample': 0.75,
                                           'verbose': 20,
                                           'eval_metric': 'logloss',  # 'auc',
                                           'objective': 'binary:logistic',
                                           'booster': "gbtree",
                                           'tree_method': 'gpu_hist',
                                           'gpu_id': 0,
                                           'random_seed': 42}},
                'build_params': {'name': key}
            }
            task_predict_params = {
                'predict_params': {'elasticity_transform': False,
                                   'elasticity_factor': 1,
                                   'elasticity_col': config['elasticity_variables']}
            }
            task_metrics_params = {
                'method': config_methods.metrics_creator_name,
                'method_regional': '',
                'model_name': config['model_name'],
                'score_params': {'training': (config['initial_training_month'], 1),
                                 'oos': (config['initial_training_month'], 1)}
            }
            bkt_params = backtest_generator.BacktestParameters(
                ds_name=config['ds_name'],
                # Separate artifact dir per training length.
                base_dir=config['base_dir'] + "{0}/".format(parameters['train_length']),
                table_name=config['table_name'],
                all_features=gridsearch_generator.get_all_possible_features(config['variable_groups']),
                dataset_filter=config['dataset_filter'],
                analysis_variables=config['analysis_variables'],
                train_eval_creator_name=config_methods.train_eval_creator_name,
                predict_score_creator_name=config_methods.predict_score_creator_name,
                task_engineer_params=task_engineer_params,
                task_model_params=task_model_params,
                task_predict_params=task_predict_params,
                task_metrics_params=task_metrics_params)
            parameter_generator = lambda: bkt_params.create_parameters(
                initial_training_month=config['initial_training_month'],
                last_predicting_month=config['last_predicting_month'],
                lead_time=config['ld'],
                lead_time_mode=config['ld_mode'],
                training_length=parameters['train_length'],
                training_length_mode=config['tl_mode'],
                test_length=config['testl'],
                test_length_mode=config['testl_mode'],
                stride_length=config['sl'],
                stride_length_mode=config['sl_mode'])
            task_generator = lambda params: tasks.TaskModelMetrics(**params)
            backtest_tasks = backtest_generator.CreateTaskList(
                task_constructor=task_generator,
                parameter_generator=parameter_generator)
            params = parameter_generator()
            bkt_task = tasks.BacktestReport(task_bkt_params=params)
            d6tflow.run(bkt_task, workers=config['workers'])
            # Load the metrics frame once (was loaded twice).
            full_metrics = bkt_task.output()['full_metrics'].load()
            metric = gridsearch_generator.calculate_metric(full_metrics, config['metric_weights'])
            gridsearch_generator.save_model_data(full_metrics, parameters, metric,
                                                 file_path=config['run_dir'])
        else:
            metric = np.inf
        print("################################################################################")
        print('Metric', metric)
        print("################################################################################")
        return metric

    d6tflow.set_dir(config['run_dir'] + 'data/')
    create_folder(config['run_dir'] + 'report/')
    k = config['k']
    with open(config['opt_file_name'], "rb") as f:
        tpe_trials = pickle.load(f)
    # Deduplicate (loss, params) pairs, sort by loss, keep the k best.
    loss_par = list(set([(x['result']['loss'], str(x['misc']['vals'])) for x in tpe_trials.trials]))
    loss_par = sorted(loss_par, key=lambda tup: tup[0])
    pars = [x[1] for x in loss_par][:k]
    for par in pars:
        # The params were stringified with repr(); convert quotes so they
        # parse as JSON.
        par = json.loads(par.replace("'", "\""))
        for key in par.keys():
            # hyperopt stores each value as a one-element list.
            par[key] = par[key][0]
            if 'group' in key:
                # hyperopt encodes group flags as 0/1; 0 means selected
                # (matches group_dict below).
                # FIX: original had `par[key] = False,` / `True,` — the
                # trailing commas made 1-tuples (both truthy), corrupting
                # the group flags.
                if par[key] == 1:
                    par[key] = False
                else:
                    par[key] = True
            if 'iterations' in key or 'train_length' in key:
                par[key] = int(par[key])
        print('running for par', par)
        model_fn(par)

    # Translation helpers: group flags map 0 -> True (selected), others are
    # unwrapped from their one-element list.
    def group_dict(x):
        if x[0] == 0:
            return True
        else:
            return False

    def identity(x):
        return x[0]

    translation = tunning_fs.get_translation_dict(tunning_trials=tpe_trials,
                                                  identity_func=identity,
                                                  group_func=group_dict)
    results = tunning_fs.parse_hyperopt_pickle(tunning_trials=tpe_trials,
                                               translation=translation)
    search = {'results': results, 'group_translation': config['variable_groups']}
    with open(config['run_dir'] + 'search.pkl', 'wb') as file:
        pickle.dump(search, file)
def run(config_path):
    """Run the optimization-summary flow and export its result as CSV.

    Builds the engineering/model/predict/metrics/elasticity/optimization
    parameter dicts, expands them over the configured backtest windows via
    OptParameters, runs TaskOptSummary and saves the loaded output.

    Args:
        config_path: path to the run configuration (parsed by handle_config).
    """
    config = handle_config(config_path)
    d6tflow.set_dir(config['run_dir'] + 'data/')

    task_engineer_params = {
        'method': config_methods.engineer_creator_name,
        'engineer': {'small_categorical': config['small_categorical'],
                     'large_categorical': config['large_categorical'],
                     'variable_set': config['all_features']}
    }
    key = 'teste'
    task_model_params = {
        'method': config_methods.model_creator_name,
        'model': {
            'model_params': {'colsample_bytree': 0.75,
                             'eta': 0.375,
                             'gamma': 0.15000000000000002,
                             'max_depth': 5,
                             'min_child_weight': 7.0,
                             'n_estimators': config['iterations'],
                             'reg_alpha': 2.5500000000000003,
                             # FIX: key was 'reg_lambda ' (trailing space),
                             # so XGBoost never received this parameter.
                             'reg_lambda': 0.12,
                             'subsample': 0.75,
                             'verbose': 20,
                             'eval_metric': 'logloss',  # 'auc',
                             'objective': 'binary:logistic',
                             'booster': "gbtree",
                             'tree_method': 'gpu_hist',
                             'gpu_id': 0,
                             'random_seed': 42}},
        'build_params': {'name': key}
    }
    task_predict_params = {
        'predict_params': {'elasticity_transform': False,
                           'elasticity_factor': 1,
                           'elasticity_col': []}
    }
    task_metrics_params = {
        'method': config_methods.metrics_creator_name,
        'model_name': config['model_name'],
        'method_regional': '',
        'score_params': None
    }
    task_el_ds_params = {
        'date_col': config['original_params']['date_col'],
        'elasticity_begin': None,
        'elasticity_end': None,
        'n_min': config['factor_min'],
        'n_max': config['factor_max'],
        'qtd_pass': config['num_points'],
    }
    task_elasticity_params = {
        'model_name': config['model_name'],
        'predict_params': {'elasticity_transform': False,
                           'elasticity_factor': 1,
                           'elasticity_col': 'TESTE'}
    }
    task_opt_params = {
        'train_tag': None,
        'prediction_tag': None,
        'prediction_begin': None,
        'prediction_end': None,
        'target': config['params']['target'],
        'output': config['params']['model_output'],
        'key': config['original_params']['hash_col'],
        'date_col': config['params']['date_col'],
        'metrics': ['perc_err', 'auc', 'elasticity_err'],
    }

    opt_params = optm_generator.OptParameters(
        ds_name=config['ds_name'],
        base_dir=config['base_dir'],
        table_name=config['table_name'],
        all_features=config['all_features'],
        dataset_filter=config['dataset_filter'],
        analysis_variables=config['analysis_variables'],
        train_eval_creator_name=config_methods.train_eval_creator_name,
        predict_score_creator_name=config_methods.predict_score_creator_name,
        task_engineer_params=task_engineer_params,
        task_model_params=task_model_params,
        task_predict_params=task_predict_params,
        task_metrics_params=task_metrics_params,
        task_el_ds_params=task_el_ds_params,
        task_elasticity_params=task_elasticity_params,
        task_opt_params=task_opt_params)
    params = opt_params.create_parameters(
        initial_training_month=config['initial_training_month'],
        last_predicting_month=config['last_predicting_month'],
        lead_time=config['ld'],
        lead_time_mode=config['ld_mode'],
        training_length=config['tl'],
        training_length_mode=config['tl_mode'],
        test_length=config['testl'],
        test_length_mode=config['testl_mode'],
        stride_length=config['sl'],
        stride_length_mode=config['sl_mode'])
    print('NUM_OPT', len(params))

    to_run_task = opt_tasks.TaskOptSummary(task_opt_summary_params=params)
    d6tflow.run(to_run_task, workers=config['workers'])
    create_folder(config['run_dir'] + 'report/')
    result = to_run_task.output().load()
    # NOTE(review): this writes to the current working directory, unlike the
    # sibling flows that write under config['run_dir'] — confirm intended.
    result.to_csv('opt_result.csv', index=False)