import datetime
import json
import os
import pathlib
import warnings

import mlflow
import numpy as np
import pandas as pd
import tensorflow as tf
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.early_stop import no_progress_loss
from tensorflow.keras.models import load_model
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

# Project-local helpers assumed importable from the surrounding package:
# _get_prep_data, createmodel, compile_and_fit, evaluate_model,
# CustomMeanDirectionalAccuracy, and my_helpers.


def main_run_linear_models(train_ds, val_ds, test_ds, data_props,
                           max_backlooking=None, layer_type='dense',
                           activation_funcs=['sigmoid', 'relu', 'tanh'],
                           max_search_iterations=200, NN_max_depth=3,
                           MAX_EPOCHS=800, patience=25, model_name='linear',
                           examples=None, return_permutation_importances=True,
                           redo_search_best_model=False):
    mlflow.set_experiment(model_name)
    experiment_date_time = int(datetime.datetime.now().strftime("%Y%m%d%H%M%S"))

    flatten_input = layer_type == 'dense'

    def _extract_just_important_data_props(data_props):
        kwargs = {}
        kwargs['dataset_cols_X_just_these'] = data_props['third_filter']['cols_just_these']
        kwargs['dataset_cols_X_exclude'] = data_props['third_filter']['cols_drop']
        kwargs['dataset_cols_y'] = data_props['third_filter']['y_cols_just_these']
        kwargs['dataset_hash_input'] = int(data_props['first_step']['dataset'])
        kwargs['dataset_hash_first'] = data_props['first_step_data_hash']
        kwargs['dataset_hash_second'] = data_props['second_step_data_hash']
        kwargs['dataset_split_method'] = data_props['second_step']['split_method']
        kwargs['dataset_split_steps_train'] = data_props['second_step']['split_props']['train_time_steps']
        kwargs['dataset_split_steps_val'] = data_props['second_step']['split_props']['val_time_steps']
        kwargs['dataset_split_steps_test'] = data_props['second_step']['split_props']['test_time_steps']
        kwargs['dataset_iter_step'] = data_props['iter_step']
        kwargs['dataset_normalization'] = data_props['second_step']['normalize_method']
        kwargs['dataset_window_backlooking'] = data_props['first_step']['window_input_width']
        kwargs['dataset_window_prediction'] = data_props['first_step']['window_pred_width']
        kwargs['dataset_window_shift'] = data_props['first_step']['window_shift']
        return kwargs

    def _hp_transform_param_dict(param_dict):
        # Lists become categorical choices, sets become uniform ranges,
        # everything else is passed through as a constant.
        new_param_dict = {}
        for key, value in param_dict.items():
            if type(value) == list:
                new_param_dict[key] = hp.choice(key, value)
            elif type(value) == set:
                new_param_dict[key] = hp.uniform(key, *value)
            else:
                new_param_dict[key] = value
        return new_param_dict

    max_backlooking = data_props['first_step']['window_input_width'] if max_backlooking is None else max_backlooking

    param_grid = dict(n_layers=list(range(1, NN_max_depth + 1)),
                      first_layer_nodes=[0] if NN_max_depth == 1 else [128, 64, 32, 16, 8],
                      last_layer_nodes=[0] if NN_max_depth == 1 else [64, 32, 16, 8, 4],
                      activation_func=activation_funcs,
                      backlooking_window=list(range(1, max_backlooking + 1)))
    hp_param_dict = _hp_transform_param_dict(param_dict=param_grid)
    hp_param_dict['model_name'] = model_name
    hp_param_dict['data_props'] = data_props
    hp_param_dict['layer_type'] = layer_type

    def _optimize_objective(*args, **kwargs):
        if args != ():
            kwargs = args[0]  # if called positionally, expect the first argument to be a dict holding all kwargs
        if type(kwargs) != dict:
            raise Exception(f'kwargs is not dict - it is {type(kwargs)} with values: {kwargs}')

        backlooking_window = kwargs.pop('backlooking_window')
        n_layers = kwargs.pop('n_layers')
        first_layer_nodes = kwargs.pop('first_layer_nodes')
        last_layer_nodes = kwargs.pop('last_layer_nodes')
        activation_func = kwargs.pop('activation_func')
        return_everything = kwargs.pop('return_everything', False)
        verbose = kwargs.pop('verbose', 0)
        model_name = kwargs.pop('model_name', 'linear')
        data_props = kwargs.pop('data_props')
        layer_type = kwargs.pop('layer_type', 'dense')

        dataset = _get_prep_data(train_ds, val_ds, test_ds, flatten=flatten_input,
                                 keep_last_n_periods=backlooking_window)

        now = datetime.datetime.now()
        date_time = str(now.strftime("%y%m%d%H%M%S"))
        model_name = f"{date_time}_{model_name}_w{backlooking_window}_l{n_layers}_a{activation_func}"

        kwargs = dict(model_name=model_name,
                      n_layers=n_layers,
                      first_layer_nodes=first_layer_nodes,
                      last_layer_nodes=last_layer_nodes,
                      activation_func=activation_func,
                      input_size=dataset['input_shape'] if layer_type == 'dense'
                                 else tuple(list(train_ds.element_spec[0].shape)[1:]),
                      output_size=dataset['output_shape'],
                      backlooking_window=backlooking_window,
                      layer_type=layer_type)

        model = createmodel(**kwargs)
        history, mlflow_additional_params = compile_and_fit(model=model,
                                                            train=dataset['train_ds'],
                                                            val=dataset['val_ds'],
                                                            MAX_EPOCHS=MAX_EPOCHS,
                                                            patience=patience,
                                                            model_name=model_name,
                                                            verbose=verbose)

        # Collect all data props for documentation in MLflow
        kwargs.update(_extract_just_important_data_props(data_props))
        kwargs['run'] = experiment_date_time
        mlflow_additional_params['kwargs'] = kwargs

        train_performance = dict(zip(model.metrics_names,
                                     evaluate_model(model=model, tf_data=dataset['train_ds'])))
        val_performance = dict(zip(model.metrics_names,
                                   evaluate_model(model=model, tf_data=dataset['val_ds'])))
        test_performance = dict(zip(model.metrics_names,
                                    evaluate_model(model=model, tf_data=dataset['test_ds'],
                                                   mlflow_additional_params=mlflow_additional_params)))
        mlflow_additional_params['data_props'] = data_props

        # Only save the model if its loss is within 15% of the best loss so far
        try:
            best_loss = float(trials.best_trial['result']['loss'])
            current_loss = min(history.history['val_loss'])
            save_model = current_loss <= best_loss * 1.15
        except Exception:  # no best trial yet (first iteration)
            save_model = True

        mlflow_saved = my_helpers.mlflow_last_run_add_param(param_dict=mlflow_additional_params,
                                                            save_model=save_model)

        tf.keras.backend.clear_session()

        return_metrics = dict(loss=val_performance['loss'],
                              all_metrics={'train': train_performance,
                                           'val': val_performance,
                                           'test': test_performance},
                              status=STATUS_OK,
                              mlflow=mlflow_saved,
                              model_name=model_name)
        if return_everything:
            return_metrics['model'] = model
            return_metrics['history'] = history

        return return_metrics

    ###### Get old best model records ######
    storage_file_path = os.path.join(my_helpers.get_project_directories(key='cache_dir'),
                                     'storage_best_model.json')
    if not os.path.exists(storage_file_path):
        best_model_storage = {}
    else:
        with open(storage_file_path) as json_file:
            best_model_storage = json.load(json_file)

    ######## Search for best model ########
    if redo_search_best_model or model_name not in best_model_storage \
            or data_props['iter_step'] not in best_model_storage[model_name]:
        warnings.filterwarnings('ignore')
        trials = Trials()
        best = fmin(fn=_optimize_objective,
                    space=hp_param_dict,
                    algo=tpe.suggest,
                    max_evals=max_search_iterations,
                    trials=trials,
                    early_stop_fn=no_progress_loss(iteration_stop_count=int(max_search_iterations / 4),
                                                   percent_increase=0.025))
        warnings.simplefilter('always')

        # Getting all parameters for best model storage
        mlflow_best_model = trials.best_trial['result']['mlflow']
        best_params = {}
        for key, idx in best.items():
            best_params[key] = param_grid[key][idx]

        coef_names_ = list(data_props['look_ups']['out_lookup_col_name']['X'].keys())
        coef_names_ = coef_names_ + [col + f'_sft_{i}'
                                     for i in range(1, best_params['backlooking_window'])
                                     for col in coef_names_]

        # Saving best model to storage
        if model_name not in best_model_storage:
            best_model_storage[model_name] = {}
        if data_props['iter_step'] not in best_model_storage[model_name]:
            best_model_storage[model_name][data_props['iter_step']] = {
                'best_model': {'result': {'loss': 10**10}},
                'history': {}}

        best_model_param = dict(result={'loss': trials.best_trial['result']['loss'],
                                        'all_metrics': trials.best_trial['result']['all_metrics']},
                                model_name=trials.best_trial['result']['model_name'],
                                model_id=trials.best_trial['result']['mlflow']['model_id'],
                                run_id=experiment_date_time,
                                input_coefs=coef_names_,
                                path_saved_model=trials.best_trial['result']['mlflow']['saved_model_path'],
                                status=trials.best_trial['result']['status'],
                                params=best_params,
                                data=_extract_just_important_data_props(data_props))

        best_model_storage[model_name][data_props['iter_step']]['history'][experiment_date_time] = best_model_param
        if trials.best_trial['result']['loss'] < \
                best_model_storage[model_name][data_props['iter_step']]['best_model']['result']['loss']:
            best_model_storage[model_name][data_props['iter_step']]['best_model'] = best_model_param

        with open(storage_file_path, 'w') as outfile:
            json.dump(best_model_storage, outfile)

    else:
        # Get best model from storage
        best_model_param = best_model_storage[model_name][data_props['iter_step']]['best_model']

    ######## Get Best model again ########
    best_model = tf.keras.models.load_model(best_model_param['path_saved_model'])
    best_model.compile(loss=tf.losses.MeanAbsoluteError(),
                       optimizer=tf.optimizers.Adam(),
                       metrics=[tf.metrics.MeanAbsoluteError(),
                                CustomMeanDirectionalAccuracy(),
                                tf.losses.Huber(),
                                tf.metrics.MeanAbsolutePercentageError(),
                                tf.metrics.MeanSquaredError(),
                                tf.metrics.MeanSquaredLogarithmicError()])

    print('Best model is:', best_model_param)

    out = dict(best_model_param)

    ####### Get examples for plotting #######
    if examples is not None:
        example_X = examples['X']
        periods = best_model_param['params']['backlooking_window']
        if layer_type == 'dense':
            example_X = tf.data.Dataset.from_tensors(
                np.reshape(example_X[:, -periods:, :], (example_X.shape[0], -1)))
        else:
            example_X = tf.data.Dataset.from_tensors(example_X)
        out['examples_pred_y'] = best_model.predict(example_X)

    ###### For 1-layer dense/linear models get coefs & p-values ######
    if NN_max_depth == 1 and isinstance(best_model.layers[0], tf.keras.layers.Dense):
        # Get coefs
        intercept_ = best_model.layers[0].bias.numpy()
        coef_ = best_model.layers[0].weights[0].numpy()
        out['coef_'] = pd.Series(dict(zip(['intercept_'] + best_model_param['input_coefs'],
                                          intercept_.tolist() + coef_.squeeze().tolist())))

        dataset = _get_prep_data(train_ds, val_ds, test_ds, flatten=True,
                                 keep_last_n_periods=best_model_param['params']['backlooking_window'])

        # Get p-values
        import app.d_prediction.my_custom_pvalue_calc as my_p_lib

        out['p_values'] = {}
        for data_set in ['train', 'val', 'test']:
            y_pred = best_model.predict(dataset[f'{data_set}_X'])
            y_pred = np.reshape(y_pred, (-1, 1))
            try:
                p_values = my_p_lib.coef_pval(dataset[f'{data_set}_X'], dataset[f'{data_set}_y'],
                                              coef_, intercept_, y_pred)
                p_values = pd.Series(dict(zip(best_model_param['input_coefs'], p_values)))
                out['p_values'][data_set] = p_values
            except Exception:
                warnings.warn("P-Values: ValueError: Input contains infinity or NaN.")
                out['p_values'][data_set] = pd.Series(dict(zip(best_model_param['input_coefs'],
                                                               ['error'] * len(best_model_param['input_coefs']))))
        out['p_values'] = pd.DataFrame(out['p_values'])

    ##### Get column feature importance #####
    if return_permutation_importances:
        if 'feature_importance' in best_model_param:
            out['feature_importance'] = best_model_param['feature_importance']
        else:
            import eli5
            from eli5.sklearn import PermutationImportance

            sklearn_model = KerasRegressor(build_fn=best_model)
            sklearn_model.model = best_model

            dataset = _get_prep_data(train_ds, val_ds, test_ds, flatten=flatten_input,
                                     keep_last_n_periods=best_model_param['params']['backlooking_window'])

            out['feature_importance'] = {}
            for data_set in ['train', 'val']:
                # Calculate actual feature importance
                try:
                    perm = PermutationImportance(sklearn_model, cv='prefit').fit(
                        dataset[f'{data_set}_X'].numpy(),
                        np.reshape(dataset[f'{data_set}_y'].numpy(), (-1, 1)))
                    feature_importances = eli5.format_as_dataframe(
                        eli5.explain_weights(perm,
                                             feature_names=best_model_param['input_coefs'],
                                             top=10**10))
                    out['feature_importance'][data_set] = feature_importances.set_index('feature').to_dict()
                except Exception:
                    warnings.warn("PermutationImportance: ValueError: Input contains infinity "
                                  "or a value too large for dtype('float16').")

            if out['feature_importance'] != {}:
                best_model_param['feature_importance'] = out['feature_importance']
                best_model_storage[model_name][data_props['iter_step']]['best_model']['feature_importance'] = \
                    out['feature_importance']
                best_model_storage[model_name][data_props['iter_step']]['history'][experiment_date_time]['feature_importance'] = \
                    out['feature_importance']
                with open(storage_file_path, 'w') as outfile:
                    json.dump(best_model_storage, outfile)

    out['status'] = 'ok'

    return out
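
# Usage sketch (hypothetical, for illustration only): `build_datasets` stands in
# for the project's own preprocessing pipeline that yields the three tf.data
# datasets and the `data_props` dict; it is not defined in this module. With
# NN_max_depth=1 the best model is a single Dense layer, so coefficients and
# p-values are also returned.
def _demo_linear_model_search(build_datasets):
    train_ds, val_ds, test_ds, data_props = build_datasets()
    out = main_run_linear_models(train_ds, val_ds, test_ds, data_props,
                                 NN_max_depth=1,
                                 max_search_iterations=50,
                                 model_name='linear')
    print(out['model_name'], out['result']['loss'])
    if 'coef_' in out:
        print(out['coef_'])  # pd.Series: intercept_ plus one coefficient per input column
    return out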
def TrainNetwork(model, modelfile, x_train=None, y_train=None, x_valid=None, y_valid=None,
                 sample_weight=None, callbacks=[], epochs=20, batch_size=200, verbose=1,
                 overwriteModel=False, finishTraining=True):
    model, custom_objects = model.model, model.custom_objects

    # Set up our KerasRegressor wrapper.
    # I'm not 100% sure why we do this for our regressors (but not our classifiers),
    # but as we use this in the original training code I'll keep it for now.
    regressor = KerasRegressor(build_fn=model, batch_size=batch_size, epochs=epochs, verbose=verbose)

    # Make the model directory if it does not already exist.
    model_dir = '/'.join(modelfile.split('/')[:-1])
    os.makedirs(model_dir, exist_ok=True)

    # Determine the history-log filename. If using the .tf format, there won't be
    # a file extension on the string at all.
    if '.h5' in modelfile:
        history_filename = '.'.join(modelfile.split('.')[:-1]) + '.csv'
    else:
        history_filename = modelfile + '.csv'

    # Check if the model exists -- and load it if not overwriting.
    initial_epoch = 0
    if pathlib.Path(modelfile).exists() and not overwriteModel:
        regressor.model = load_model(modelfile, custom_objects=custom_objects)

        # Now we want to figure out for how many epochs the loaded model was already trained,
        # so that it's trained, in total, for the requested number of epochs.
        # Keras models don't seem to hold on to an epoch attribute for whatever reason,
        # so we will figure out the current epoch based on CSVLogger output if it exists.
        if pathlib.Path(history_filename).exists():
            with open(history_filename) as f:
                for i, l in enumerate(f):
                    pass
            initial_epoch = i  # zero-indexing will take care of the 1st line, which has headers

        if not finishTraining:
            initial_epoch = regressor.get_params()['epochs']
        regressor.set_params(initial_epoch=initial_epoch)

    # Train the model if we've specified "finishTraining", or if we don't even
    # have a model yet. Setting finishTraining=False lets one immediately skip
    # to evaluating the model, which is especially helpful if EarlyStopping was used
    # and the final model didn't reach the specified last epoch.
    history = None
    if finishTraining or not pathlib.Path(modelfile).exists():
        history = regressor.fit(x=x_train, y=y_train,
                                validation_data=(x_valid, y_valid),
                                sample_weight=sample_weight,
                                callbacks=callbacks)

    saveModel = True
    if initial_epoch == epochs or not finishTraining:
        saveModel = False

    if saveModel:
        print('  Saving model to {}.'.format(modelfile))
        regressor.model.save(modelfile)

    # Now get the history from the log file, if it exists.
    # This is a better method than using the results of model.fit(),
    # since this will give us the whole history (not just whatever
    # was fitted right now). However, it relies on us having passed
    # a CSVLogger as one of our callbacks, which we normally do
    # but might not do in some specific circumstances.

    # Fallback: use the in-memory history from fit(), if there was one.
    try:
        history = history.history
    except AttributeError:
        history = {}

    if pathlib.Path(history_filename).exists():
        df = pd.read_csv(history_filename)
        history = {key: df[key].to_numpy() for key in df.keys()}
    else:
        print('Warning: No log file found for model {}.'.format(modelfile))
        print('This may result in an empty/incomplete history being returned.')
        print('Please provide a CSVLogger callback to prevent this in the future.')

    return regressor, history
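
# Usage sketch (hypothetical, for illustration only): `wrapper` must expose
# `.model` (a compiled Keras model) and `.custom_objects`, matching the
# unpacking at the top of TrainNetwork. The CSVLogger path shares its basename
# with `modelfile` so the function can read the full history back from disk;
# without it, only the in-memory history of the current fit() call is returned.
def _demo_train_network(wrapper):
    import numpy as np
    from tensorflow.keras.callbacks import CSVLogger, EarlyStopping

    modelfile = 'models/regressor.h5'
    x = np.random.rand(1000, 16).astype('float32')  # toy data for illustration
    y = np.random.rand(1000).astype('float32')

    regressor, history = TrainNetwork(wrapper, modelfile,
                                      x_train=x[:800], y_train=y[:800],
                                      x_valid=x[800:], y_valid=y[800:],
                                      callbacks=[CSVLogger('models/regressor.csv', append=True),
                                                 EarlyStopping(patience=5, restore_best_weights=True)],
                                      epochs=20, batch_size=64)
    print({k: v[-1] for k, v in history.items()})  # final value of each logged metric
    return regressor, history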