def get_scoring_case_data_paths() -> Tuple[str, str]:
    """Return full paths to the credit-scoring train and test CSV files.

    Each path is the project root (as given by ``project_root()``) joined
    with ``cases/data/scoring/<file name>``.

    Returns:
        A ``(train_path, test_path)`` tuple of strings.
    """
    relative_paths = [
        os.path.join('cases', 'data', 'scoring', file_name)
        for file_name in ('scoring_train.csv', 'scoring_test.csv')
    ]
    full_train_path, full_test_path = (
        os.path.join(str(project_root()), relative_path)
        for relative_path in relative_paths
    )
    return full_train_path, full_test_path
def get_cancer_case_data_paths() -> Tuple[str, str]:
    """Return full paths to the cancer benchmark train and test CSV files.

    Each path is the project root (as given by ``project_root()``) joined
    with ``cases/data/benchmark/<file name>``.

    Returns:
        A ``(train_path, test_path)`` tuple of strings.
    """
    relative_paths = [
        os.path.join('cases', 'data', 'benchmark', file_name)
        for file_name in ('cancer_train.csv', 'cancer_test.csv')
    ]
    full_train_path, full_test_path = (
        os.path.join(str(project_root()), relative_path)
        for relative_path in relative_paths
    )
    return full_train_path, full_test_path
def test_credit_scoring_problem():
    """Smoke-test the credit scoring case on a small classification dataset.

    The same ``data/simple_classification.csv`` file (located next to this
    test module) is passed as both the train and the test set, with a
    0.1-minute lead-time limit. The value returned by
    ``run_credit_scoring_problem`` must exceed 0.5.
    """
    tests_dir = str(os.path.dirname(__file__))
    dataset_path = os.path.join(tests_dir, 'data/simple_classification.csv')

    # Train and test deliberately point at the same file.
    train_path = os.path.join(str(project_root()), dataset_path)
    test_path = os.path.join(str(project_root()), dataset_path)

    roc_auc_test = run_credit_scoring_problem(
        train_path,
        test_path,
        max_lead_time=timedelta(minutes=0.1),
    )

    assert roc_auc_test > 0.5
def run_metocean_forecasting_problem(train_file_path,
                                     test_file_path,
                                     forecast_length=1,
                                     max_window_size=64,
                                     is_visualise=False):
    """Fit three forecasting chains and report their validation metrics.

    A composite LSTM chain, a single-node ``'lstm'`` chain and a single-node
    ``'ridge'`` chain are fitted on the train data and scored on the
    validation data with ``calculate_validation_metric``.

    Args:
        train_file_path: CSV path (joined with the project root) for training.
        test_file_path: CSV path (joined with the project root) for validation.
        forecast_length: forwarded to ``TsForecastingParams``.
        max_window_size: forwarded to ``TsForecastingParams``.
        is_visualise: forwarded to ``calculate_validation_metric``.

    Returns:
        The validation metric of the single-node ``'ridge'`` chain.
    """
    # specify the task to solve
    forecasting_task = Task(
        TaskTypesEnum.ts_forecasting,
        TsForecastingParams(forecast_length=forecast_length,
                            max_window_size=max_window_size))

    def _load_dataset(relative_path):
        # Resolve the path against the project root and read it as a time series.
        full_path = os.path.join(str(project_root()), relative_path)
        return InputData.from_csv(full_path,
                                  task=forecasting_task,
                                  data_type=DataTypesEnum.ts)

    train_data = _load_dataset(train_file_path)
    # a dataset for a final validation of the composed model
    validation_data = _load_dataset(test_file_path)

    composite_chain = get_composite_lstm_chain()

    ridge_chain = Chain()
    ridge_chain.add_node(PrimaryNode('ridge'))

    lstm_chain = Chain()
    lstm_chain.add_node(PrimaryNode('lstm'))

    def _fit_and_score(model_chain, plot_name):
        # Fit on the train set, then score the prediction for the validation set.
        model_chain.fit(input_data=train_data, verbose=False)
        return calculate_validation_metric(
            model_chain.predict(validation_data),
            validation_data,
            plot_name,
            is_visualise)

    composite_rmse = _fit_and_score(composite_chain,
                                    f'full-composite_{forecast_length}')
    lstm_rmse = _fit_and_score(lstm_chain,
                               f'full-lstm-only_{forecast_length}')
    ridge_rmse = _fit_and_score(ridge_chain,
                                f'full-simple_{forecast_length}')

    print(f'RMSE composite: {composite_rmse}')
    print(f'RMSE simple: {ridge_rmse}')
    print(f'RMSE LSTM only: {lstm_rmse}')

    return ridge_rmse
def test_metocean_forecasting_problem():
    """Smoke-test the metocean forecasting case on a small time series.

    The same ``data/simple_time_series.csv`` file (located next to this test
    module) serves as both train and test set, with ``forecast_length=1``
    and ``max_window_size=1``. The returned value must be below 50.
    """
    tests_dir = str(os.path.dirname(__file__))
    series_path = os.path.join(tests_dir, 'data/simple_time_series.csv')

    # Train and test deliberately point at the same file.
    train_path = os.path.join(str(project_root()), series_path)
    test_path = os.path.join(str(project_root()), series_path)

    rmse = run_metocean_forecasting_problem(train_path,
                                            test_path,
                                            forecast_length=1,
                                            max_window_size=1)

    assert rmse < 50
def create_multi_clf_examples_from_excel(file_path: str, return_df: bool = False):
    """Convert an Excel dataset into CSV file(s) under ``examples/data/<name>``.

    ``<name>`` is the second-to-last segment of ``file_path`` after every
    ``'.'`` is treated as a ``'/'`` separator (for ``dir/name.xlsx`` this is
    ``name``).

    Args:
        file_path: path to the Excel file, read with ``pd.read_excel``.
        return_df: when true, save the whole frame to a single CSV and
            return it; otherwise save the train/test split to two CSVs.

    Returns:
        ``(df, full_file_path)`` when ``return_df`` is true, otherwise
        ``(full_train_file_path, full_test_file_path)``.
    """
    df = pd.read_excel(file_path)
    train, test = split_data(df)

    file_dir_name = file_path.replace('.', '/').split('/')[-2]
    file_csv_name = f'{file_dir_name}.csv'
    directory_names = ['examples', 'data', file_dir_name]

    ensure_directory_exists(directory_names)

    if return_df:
        path = os.path.join(*directory_names, file_csv_name)
        full_file_path = os.path.join(str(project_root()), path)
        save_file_to_csv(df, full_file_path)
        return df, full_file_path

    full_train_file_path, full_test_file_path = get_split_data_paths(
        directory_names)
    save_file_to_csv(train, full_train_file_path)
    # Fix: the test split (not the train split) goes to the test path.
    save_file_to_csv(test, full_test_file_path)
    return full_train_file_path, full_test_file_path
class ComposerVisualiser:
    """Static helpers that draw chains and assemble a composing-history GIF.

    All intermediate PNG frames and the final GIF are written to
    ``temp_path``. Note that evaluating this class body creates the ``tmp``
    directory (if it is not already listed) next to the path computed from
    ``project_root()``.
    """
    root_parent_path = os.path.join('../', str(project_root()))
    root_parent_path_dirname = os.path.dirname(root_parent_path)
    temp_path = os.path.join(root_parent_path_dirname, 'tmp/')
    if 'tmp' not in os.listdir(root_parent_path_dirname):
        os.mkdir(temp_path)
    # File-name prefix of the merged frames that go into the final GIF.
    gif_prefix = 'for_gif_'

    @staticmethod
    def visualise(chain: Chain, save_path: Optional[str] = None):
        """Draw a single chain; show it, or save it when ``save_path`` is set.

        Any exception is caught and reported with ``print`` so that a
        plotting failure does not interrupt the caller.
        """
        try:
            graph, node_labels = as_nx_graph(chain=chain)
            pos = node_positions(graph.to_undirected())
            plt.figure(figsize=(10, 16))
            nx.draw(graph, pos=pos,
                    with_labels=True, labels=node_labels,
                    font_size=12, font_family='calibri', font_weight='bold',
                    node_size=7000, width=2.0,
                    node_color=colors_by_node_labels(node_labels), cmap='Set3')
            if not save_path:
                plt.show()
            else:
                plt.savefig(save_path)
        except Exception as ex:
            print(f'Visualisation failed with {ex}')

    @staticmethod
    def _visualise_chains(chains, fitnesses):
        """Save ``ch_<i>.png`` and ``best_ch_<i>.png`` for every chain.

        ``fitnesses`` is copied before being clamped to its running minimum,
        so the caller's list is left untouched. A chain replaces the current
        best one when its fitness is not greater than the best so far.
        """
        fitnesses = deepcopy(fitnesses)
        last_best_chain = chains[0]
        prev_fit = fitnesses[0]

        for ch_id, chain in enumerate(chains):
            graph, node_labels = as_nx_graph(chain=chain)
            pos = node_positions(graph.to_undirected())
            plt.rcParams['axes.titlesize'] = 20
            plt.rcParams['axes.labelsize'] = 20
            plt.rcParams['figure.figsize'] = [10, 10]
            plt.title('Current chain')
            nx.draw(graph, pos=pos,
                    with_labels=True, labels=node_labels,
                    font_size=12, font_family='calibri', font_weight='bold',
                    node_size=scaled_node_size(chain.length), width=2.0,
                    node_color=colors_by_node_labels(node_labels), cmap='Set3')
            path = f'{ComposerVisualiser.temp_path}ch_{ch_id}.png'
            plt.savefig(path, bbox_inches='tight')

            plt.cla()
            plt.clf()
            plt.close('all')

            path_best = f'{ComposerVisualiser.temp_path}best_ch_{ch_id}.png'

            if fitnesses[ch_id] > prev_fit:
                # Worse than the best so far: keep the previous best value.
                fitnesses[ch_id] = prev_fit
            else:
                last_best_chain = chain
            prev_fit = fitnesses[ch_id]

            best_graph, best_node_labels = as_nx_graph(chain=last_best_chain)
            pos = node_positions(best_graph.to_undirected())
            plt.rcParams['axes.titlesize'] = 20
            plt.rcParams['axes.labelsize'] = 20
            plt.rcParams['figure.figsize'] = [10, 10]
            plt.title(f'Best chain after {round(ch_id)} evals')
            # Fix: scale nodes by the chain actually drawn (the best one),
            # not by the current chain.
            nx.draw(best_graph, pos=pos,
                    with_labels=True, labels=best_node_labels,
                    font_size=12, font_family='calibri', font_weight='bold',
                    node_size=scaled_node_size(last_best_chain.length),
                    width=2.0,
                    node_color=colors_by_node_labels(best_node_labels),
                    cmap='Set3')

            plt.savefig(path_best, bbox_inches='tight')

            plt.cla()
            plt.clf()
            plt.close('all')

    @staticmethod
    def _visualise_convergence(fitness_history):
        """Save one convergence plot per evaluation as ``<1..n>.png``.

        The history is copied and clamped to its running minimum; the plotted
        values are the negated fitnesses. Each frame shows the same curve
        with a vertical marker at the current evaluation.
        """
        fitness_history = deepcopy(fitness_history)
        prev_fit = fitness_history[0]
        for fit_id, fit in enumerate(fitness_history):
            if fit > prev_fit:
                fitness_history[fit_id] = prev_fit
            prev_fit = fitness_history[fit_id]

        ts_set = list(range(len(fitness_history)))
        df = pd.DataFrame({
            'ts': ts_set,
            'fitness': [-f for f in fitness_history]
        })

        # Frame files are numbered from 1, evaluations from 0.
        for ind, ts in enumerate(ts_set, start=1):
            plt.rcParams['axes.titlesize'] = 20
            plt.rcParams['axes.labelsize'] = 20
            plt.rcParams['figure.figsize'] = [10, 10]

            plt.plot(df['ts'], df['fitness'], label='Composer')
            plt.xlabel('Evaluation', fontsize=18)
            plt.ylabel('Best ROC AUC', fontsize=18)

            plt.axvline(x=ts, color='black')
            plt.legend(loc='upper left')

            path = f'{ComposerVisualiser.temp_path}{ind}.png'
            plt.savefig(path, bbox_inches='tight')

            plt.cla()
            plt.clf()
            plt.close('all')

    @staticmethod
    def visualise_history(chains, fitnesses):
        """Render the composing history into a GIF inside ``temp_path``.

        Old PNG/GIF files are removed first, then the frames are drawn,
        merged, combined into a GIF, and the PNG frames are removed again.
        Any exception is caught and reported with ``print``.
        """
        print('START VISUALISATION')
        try:
            ComposerVisualiser._clean(with_gif=True)
            ComposerVisualiser._visualise_chains(chains, fitnesses)
            ComposerVisualiser._visualise_convergence(fitnesses)
            ComposerVisualiser._merge_images(len(chains))
            ComposerVisualiser._combine_gifs()
            ComposerVisualiser._clean()
        except Exception as ex:
            print(f'Visualisation failed with {ex}')

    @staticmethod
    def _merge_images(num_images):
        """Paste the three per-step PNGs side by side into one GIF frame.

        Frames are produced for indices ``1 .. num_images - 1`` and saved as
        ``<gif_prefix><index>.png``.
        """
        for img_idx in range(1, num_images):
            source_paths = [
                f'{ComposerVisualiser.temp_path}ch_{img_idx}.png',
                f'{ComposerVisualiser.temp_path}best_ch_{img_idx}.png',
                f'{ComposerVisualiser.temp_path}{img_idx}.png'
            ]
            images = []
            try:
                for source_path in source_paths:
                    images.append(Image.open(source_path))

                widths, heights = zip(*(i.size for i in images))

                total_width = sum(widths)
                max_height = max(heights)

                new_im = Image.new('RGB', (total_width, max_height))

                x_offset = 0
                for im in images:
                    new_im.paste(im, (x_offset, 0))
                    x_offset += im.size[0]

                new_im.save(
                    f'{ComposerVisualiser.temp_path}{ComposerVisualiser.gif_prefix}{img_idx}.png'
                )
            finally:
                # Fix: release the file handles held by the opened images.
                for im in images:
                    im.close()

    @staticmethod
    def _combine_gifs():
        """Write all merged frames, ordered by numeric index, into one GIF."""
        prefix = f'{ComposerVisualiser.temp_path}{ComposerVisualiser.gif_prefix}'
        suffix = '.png'

        def _frame_index(file_name):
            # The index is whatever sits between the prefix and '.png'.
            return int(file_name[len(prefix):(len(file_name) - len(suffix))])

        # Single directory scan, so names and indices cannot get out of sync.
        files = sorted(
            iglob(
                f'{ComposerVisualiser.temp_path}{ComposerVisualiser.gif_prefix}*.png'
            ),
            key=_frame_index)

        with get_writer(
                f'{ComposerVisualiser.temp_path}final_{str(time())}.gif',
                mode='I',
                duration=0.5) as writer:
            for filename in files:
                image = imread(filename)
                writer.append_data(image)

    @staticmethod
    def _clean(with_gif=False):
        """Remove PNG files (and GIF files if ``with_gif``) from ``temp_path``.

        Best effort: any exception is printed and otherwise ignored.
        """
        try:
            files = glob(f'{ComposerVisualiser.temp_path}*.png')
            if with_gif:
                files += glob(f'{ComposerVisualiser.temp_path}*.gif')
            for file in files:
                remove(file)
        except Exception as ex:
            print(ex)
chain_evo_composed.fit(input_data=dataset_to_compose, verbose=True) if is_visualise: ComposerVisualiser.visualise(chain_evo_composed) # the quality assessment for the obtained composite models roc_on_valid_evo_composed = calculate_validation_metric( chain_evo_composed, dataset_to_validate) print(f'Composed ROC AUC is {round(roc_on_valid_evo_composed, 3)}') return roc_on_valid_evo_composed if __name__ == '__main__': # the dataset was obtained from https://www.kaggle.com/c/GiveMeSomeCredit # a dataset that will be used as a train and test set during composition file_path_train = 'cases/data/scoring/scoring_train.csv' full_path_train = os.path.join(str(project_root()), file_path_train) # a dataset for a final validation of the composed model file_path_test = 'cases/data/scoring/scoring_test.csv' full_path_test = os.path.join(str(project_root()), file_path_test) run_credit_scoring_problem(full_path_train, full_path_test, is_visualise=True)
def run_metocean_forecasting_problem(train_file_path,
                                     test_file_path,
                                     forecast_length=1,
                                     max_window_size=64,
                                     with_visualisation=True):
    """Compose a chain from a fixed initial structure and score it.

    The composite LSTM chain is handed to ``FixedStructureComposer`` as the
    initial chain; the composed chain is then fitted on the train data and
    scored on the validation data with ``calculate_validation_metric``.

    Args:
        train_file_path: CSV path (joined with the project root) for training.
        test_file_path: CSV path (joined with the project root) for validation.
        forecast_length: forwarded to ``TsForecastingParams``.
        max_window_size: forwarded to ``TsForecastingParams``.
        with_visualisation: when true, the composed chain is drawn; the flag
            is also forwarded to ``calculate_validation_metric``.

    Returns:
        The validation metric of the composed chain.
    """
    # specify the task to solve
    forecasting_task = Task(
        TaskTypesEnum.ts_forecasting,
        TsForecastingParams(forecast_length=forecast_length,
                            max_window_size=max_window_size))

    def _load_dataset(relative_path):
        # Resolve the path against the project root and read it as a time series.
        full_path = os.path.join(str(project_root()), relative_path)
        return InputData.from_csv(full_path,
                                  task=forecasting_task,
                                  data_type=DataTypesEnum.ts)

    train_data = _load_dataset(train_file_path)
    # a dataset for a final validation of the composed model
    validation_data = _load_dataset(test_file_path)

    rmse_metric = MetricsRepository().metric_by_id(
        RegressionMetricsEnum.RMSE)

    initial_chain = get_composite_lstm_chain()

    primary_models = ['trend_data_model', 'residual_data_model']
    secondary_models = [
        'rfr', 'linear', 'ridge', 'lasso', 'additive_data_model'
    ]

    composer = FixedStructureComposer()

    requirements = GPComposerRequirements(
        primary=primary_models,
        secondary=secondary_models,
        max_arity=2,
        max_depth=4,
        pop_size=10,
        num_of_generations=10,
        crossover_prob=0,
        mutation_prob=0.8,
        max_lead_time=datetime.timedelta(minutes=20))

    composed_chain = composer.compose_chain(
        data=train_data,
        initial_chain=initial_chain,
        composer_requirements=requirements,
        metrics=rmse_metric,
        is_visualise=False)

    if with_visualisation:
        ComposerVisualiser.visualise(composed_chain)

    composed_chain.fit(input_data=train_data, verbose=False)
    composite_rmse = calculate_validation_metric(
        composed_chain.predict(validation_data),
        validation_data,
        f'full-composite_{forecast_length}',
        is_visualise=with_visualisation)

    print(f'RMSE composite: {composite_rmse}')

    return composite_rmse
def run_oil_forecasting_problem(train_file_path,
                                train_file_path_crm,
                                forecast_length,
                                max_window_size,
                                is_visualise=False,
                                well_id='Unknown'):
    """Forecast over four shifted windows and report the validation metrics.

    For each window, three chains are fitted from scratch: an ``'lstm'``
    chain on the plain data, an ``'lstm'`` chain on the CRM data and the
    chain from ``get_comp_chain()`` on the CRM data. Their predictions for
    the window shifted forward by ``depth`` samples are merged across steps
    and passed to ``calculate_validation_metric``.

    Args:
        train_file_path: CSV path (joined with the project root); it is used
            for both the training and the validation dataset.
        train_file_path_crm: CSV path (joined with the project root) of the
            CRM dataset; its copy serves as the CRM validation dataset.
        forecast_length: forwarded to ``TsForecastingParams``.
        max_window_size: forwarded to ``TsForecastingParams``.
        is_visualise: forwarded to ``calculate_validation_metric``.
        well_id: printed and forwarded to ``calculate_validation_metric``.

    Returns:
        The result of ``calculate_validation_metric``; its elements 0..7 are
        printed as rounded values.
    """
    # specify the task to solve
    forecasting_task = Task(
        TaskTypesEnum.ts_forecasting,
        TsForecastingParams(forecast_length=forecast_length,
                            max_window_size=max_window_size,
                            return_all_steps=False,
                            make_future_prediction=False))

    def _load_dataset(relative_path):
        # Resolve the path against the project root and read it as a time series.
        full_path = os.path.join(str(project_root()), relative_path)
        return InputData.from_csv(full_path,
                                  task=forecasting_task,
                                  data_type=DataTypesEnum.ts,
                                  delimiter=',')

    train_data = _load_dataset(train_file_path)
    # a dataset for a final validation of the composed model
    # (read from the same file as the train data)
    validation_data = _load_dataset(train_file_path)

    train_data_crm = _load_dataset(train_file_path_crm)
    validation_data_crm = copy(train_data_crm)

    merged_ml = None
    merged_crm = None
    merged_crm_opt = None

    forecast_window_shift_num = 4
    depth = 100

    for forecasting_step in range(forecast_window_shift_num):
        window_start = depth * forecasting_step
        window_end = depth * 2 + depth * (forecasting_step + 1)

        local_train = train_data.subset(window_start, window_end)
        local_train_crm = train_data_crm.subset(window_start, window_end)

        # The validation window is the train window shifted forward by `depth`.
        local_validation = validation_data.subset(
            window_start + depth, window_end + depth)
        local_validation_crm = validation_data_crm.subset(
            window_start + depth, window_end + depth)

        lstm_chain = Chain(PrimaryNode('lstm'))
        lstm_chain_crm = Chain(PrimaryNode('lstm'))
        composite_chain_crm = get_comp_chain()

        lstm_chain.fit_from_scratch(input_data=local_train, verbose=False)
        lstm_chain_crm.fit_from_scratch(input_data=local_train_crm,
                                        verbose=False)
        composite_chain_crm.fit_from_scratch(input_data=local_train_crm,
                                             verbose=False)

        step_prediction = lstm_chain.predict(local_validation)
        step_prediction_crm = lstm_chain_crm.predict(local_validation_crm)
        step_prediction_crm_opt = composite_chain_crm.predict(
            local_validation_crm)

        merged_ml = merge_datasets(merged_ml, step_prediction,
                                   forecasting_step)
        merged_crm = merge_datasets(merged_crm, step_prediction_crm,
                                    forecasting_step)
        merged_crm_opt = merge_datasets(merged_crm_opt,
                                        step_prediction_crm_opt,
                                        forecasting_step)

    metrics = calculate_validation_metric(
        merged_ml, merged_crm, merged_crm_opt,
        validation_data, well_id, is_visualise)

    print(well_id)
    print(f'RMSE CRM: {round(metrics[0])}')
    print(f'RMSE ML: {round(metrics[1])}')
    print(f'RMSE ML with CRM: {round(metrics[2])}')
    print(f'Evo RMSE ML with CRM: {round(metrics[3])}')

    print(f'DTW CRM: {round(metrics[4])}')
    print(f'DTW ML: {round(metrics[5])}')
    print(f'DTW ML with CRM: {round(metrics[6])}')
    print(f'DTW RMSE ML with CRM: {round(metrics[7])}')

    return metrics
print(f'RMSE CRM: {round(rmse_on_valid_simple[0])}') print(f'RMSE ML: {round(rmse_on_valid_simple[1])}') print(f'RMSE ML with CRM: {round(rmse_on_valid_simple[2])}') print(f'Evo RMSE ML with CRM: {round(rmse_on_valid_simple[3])}') print(f'DTW CRM: {round(rmse_on_valid_simple[4])}') print(f'DTW ML: {round(rmse_on_valid_simple[5])}') print(f'DTW ML with CRM: {round(rmse_on_valid_simple[6])}') print(f'DTW RMSE ML with CRM: {round(rmse_on_valid_simple[7])}') return rmse_on_valid_simple if __name__ == '__main__': # the dataset was obtained from Volve dataset of oil field for well in ['5351', '5599', '7078', '7289', '7405f']: full_path_train_crm = f'../production_forecasting/data/oil_crm_prod_X{well}.csv' full_path_train_crm = os.path.join(str(project_root()), full_path_train_crm) file_path_train = f'../production_forecasting/data/oil_prod_X{well}.csv' full_path_train = os.path.join(str(project_root()), file_path_train) run_oil_forecasting_problem(full_path_train, full_path_train_crm, forecast_length=100, max_window_size=100, is_visualise=True, well_id=well)