Example #1
import joblib
from os.path import join

from wofs.util import config


def _load_model(**parameters):
    """
    Load a saved ML model.
    """
    time = parameters['time']
    target = parameters['target']
    resample_method = parameters['resample']
    normalize_method = parameters['normalize']
    imputer_method = parameters['imputer']
    drop_opt = parameters['drop_opt']
    model_name = parameters['model_name']
    feature_selection_method = parameters.get('feature_selection_method', '')

    if model_name == "LogisticRegression":
        print(f'{model_name} is being loaded with "standard" normalization')
        normalize_method = 'standard'

    save_fname = f'{model_name}_{time}_{target}_{resample_method}_{normalize_method}_{imputer_method}{drop_opt}{feature_selection_method}.pkl'

    print(f'Loading {save_fname}...')
    model = joblib.load(join(config.ML_MODEL_SAVE_PATH, save_fname))

    return model
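
For reference, a hypothetical call to _load_model might look like the following; the parameter values are illustrative (drawn from the parameter sets used elsewhere in these examples) and must match the naming used when the model was saved.

model = _load_model(time='first_hour',
                    target='severe_hail',
                    resample=None,
                    normalize=None,
                    imputer='simple',
                    drop_opt='',
                    model_name='RandomForest')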
Example #2
import os
os.environ['KMP_WARNINGS'] = '0'  # silence OpenMP (KMP) runtime warnings

"""usage: stdbuf -oL python evaluate_models.py > log_evaluate_models 2>&1 &"""

from sklearn.metrics import roc_auc_score, average_precision_score
from wofs_ml.io.io import IO

#######################################################
# This script contains a pipeline method for training ML models.
# The pipeline includes imputations, normalizing, resampling,
# and calibration all with a date-based cross-validation
# train-test split. Hyperparameter tuning is performed by
# a random search method. The final model is saved
# as well as the cross-validation results, which contain
# training and testing evaluations for multiple metrics.
#######################################################

# Iterates over model type, target type, resampling method
io = IO()

########################################
# USER-DEFINED PARAMETERS              #
########################################
model_set = ['RandomForest']  #, 'LogisticRegression', 'XGBoost']
target_set = ['severe_hail']  #['tornado', 'severe_hail', 'severe_wind']
time_set = ['first_hour']  #, 'second_hour']
resampling_method_set = [None]  #['under', None]
normalize_method_set = [None]  #['standard', 'robust', None]
imputer_method = 'simple'
n_iter = 1000
#######################################

metrics = [
    roc_auc_score, average_precision_score,
    # Metrics is assumed to come from the project's evaluation
    # utilities; its import is not shown in this snippet.
    Metrics.performance_curve,
]
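
As a minimal sketch of the pipeline the header comment describes, assuming plain scikit-learn with an illustrative search space (the actual wofs_ml pipeline and hyperparameter distributions are not shown here):

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Imputation and normalization live inside the pipeline so they are
# re-estimated on each training fold of the cross-validation.
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier()),
])
param_distributions = {
    'model__n_estimators': [100, 250, 500],
    'model__max_depth': [5, 10, 20, None],
}
search = RandomizedSearchCV(pipeline, param_distributions,
                            n_iter=10, scoring='roc_auc', cv=5)
# search.fit(X, y) would run the random search; the real script uses a
# date-based splitter rather than the default cv=5.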
Example #3
from wofs_ml.preprocess.preprocess import CorrelationFilter
from wofs_ml.io.io import IO, base_vars_to_drop
from wofs.util import config

import itertools
import pickle 
from os.path import join

io = IO()

time_set = ['first_hour', 'second_hour']
target_set = ['tornado', 'severe_hail', 'severe_wind']
cc_value = 0.8  # correlation-coefficient threshold

drop_opt = '_manual_drop_time_max_spatial_mean'

filter_obj = CorrelationFilter()

iterator = itertools.product(time_set, target_set)
path = '/home/monte.flora/wofs_ml/interpret/correlation_filtering_results/'

for time, target in iterator:

    if drop_opt == '_manual_drop_time_max_spatial_mean':
        fname = 'time_max_spatial_mean_features.pkl'
        with open(join(path,fname), 'rb') as fp:
            columns_to_drop = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop
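
As a rough illustration of what correlation filtering at cc_value = 0.8 does, here is a sketch in plain pandas/numpy (not the wofs_ml CorrelationFilter implementation, whose API is not shown in this snippet):

import numpy as np

def drop_correlated_columns(df, threshold=0.8):
    """Drop one column from each pair of DataFrame columns whose
    absolute Pearson correlation exceeds the threshold."""
    corr = df.corr().abs()
    # Keep only the upper triangle so each pair is considered once.
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    to_drop = [col for col in upper.columns if (upper[col] > threshold).any()]
    return df.drop(columns=to_drop)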
Example #4
from wofs_ml.io.io import IO
from wofs.util import config

from os.path import join
from glob import glob

# Combine the individual netCDF files for
# each date, time, and forecast time index
# into a single pandas.DataFrame
# which can be used for ML.

io = IO()

file_start = 'PROBABILITY_OBJECTS'
file_end = '.nc'
times = ['second_hour']

# Glob patterns for the forecast time indices in each period:
# 00-12 for the first hour, 13-24 for the second hour.
times_dict = {
    'first_hour': ['*[0][0-9]', '*[1][0-2]'],
    'second_hour': ['*[1][3-9]', '*[2][0-4]']
}

for time in times:  # restrict to the subset selected above
    fname_strs = [f"{file_start}{pattern}{file_end}"
                  for pattern in times_dict[time]]

    nc_file_paths = []
    for fname_str in fname_strs:
        # glob returns a list of matches per date; extend keeps
        # nc_file_paths a flat list of file paths.
        for date in config.ml_dates:
            nc_file_paths.extend(
                glob(join(config.ML_INPUT_PATH, str(date), fname_str)))
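
From here, the collected paths could be combined into the single DataFrame the header comment describes; a minimal sketch, assuming the files can be opened with xarray:

import pandas as pd
import xarray as xr

# Open each object file and concatenate the rows into one DataFrame.
dfs = [xr.open_dataset(path).to_dataframe() for path in nc_file_paths]
full_df = pd.concat(dfs, ignore_index=True)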
Example #5
import pickle
from os.path import join

from wofs_ml.io.io import IO, base_vars_to_drop
from wofs.util import config


def _load_test_data(base_vars_to_drop=base_vars_to_drop,
                    return_info=False,
                    **parameters):
    """
    Load test data for a given lead time and target.
    """
    io = IO()
    time = parameters['time']
    target = parameters['target']
    drop_opt = parameters['drop_opt']
    model_name = parameters.get('model_name', None)

    path = '/home/monte.flora/wofs_ml/interpret/correlation_filtering_results/'
    if drop_opt == '_drop_high_corr_pred':
        fname = f'correlated_features_to_drop_{time}_{target}.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop

    elif drop_opt == '_drop_0.8_corr_pred':
        fname = f'correlated_features_to_drop_{time}_{target}_0.8.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop

    elif drop_opt == '_manual_drop_0.9_corr':
        fname = f'correlated_features_to_drop_{time}_{target}_0.9_manual_drop_time_max_spatial_mean.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop

        fname = 'time_max_spatial_mean_features.pkl'
        with open(join(path, fname), 'rb') as fp:
            add_columns_to_drop = pickle.load(fp)

        vars_to_drop += add_columns_to_drop

    elif drop_opt == '_manual_drop_0.8_corr':
        fname = f'correlated_features_to_drop_{time}_{target}_0.8_manual_drop_time_max_spatial_mean.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop

        fname = 'time_max_spatial_mean_features.pkl'
        with open(join(path, fname), 'rb') as fp:
            add_columns_to_drop = pickle.load(fp)

        vars_to_drop += add_columns_to_drop

    elif '_manual_drop_time_max_spatial_mean' in drop_opt:
        fname = 'time_max_spatial_mean_features.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop

    elif drop_opt == '_drop_irrelevant_features':
        fname = f'irrelevant_features_to_drop_{time}_{target}_{model_name}.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop

    elif drop_opt == '_drop_object_morph_pred':
        object_pred = ['area', 'minor_axis_length', 'major_axis_length']
        vars_to_drop = base_vars_to_drop + object_pred

    elif 'L1_based_feature_selection' in drop_opt and 'manual' not in drop_opt and 'aggres' not in drop_opt:
        path = '/home/monte.flora/wofs_ml/interpret/L1_based_features'
        fname = f'L1_based_features_to_drop_{time}_{target}.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = list(pickle.load(fp))
        if 'Run Date' in columns_to_drop:
            columns_to_drop.remove('Run Date')
        vars_to_drop = base_vars_to_drop + columns_to_drop

    elif 'L1_based_feature_selection_aggressive' in drop_opt:
        path = '/home/monte.flora/wofs_ml/interpret/L1_based_features'
        fname = f'L1_based_features_to_drop_{time}_{target}aggresive.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop = list(pickle.load(fp))
        if 'Run Date' in columns_to_drop:
            columns_to_drop.remove('Run Date')
        vars_to_drop = base_vars_to_drop + columns_to_drop

    elif 'L1_based_feature_selection_with_manual' in drop_opt:
        path1 = '/home/monte.flora/wofs_ml/interpret/L1_based_features'
        fname = f'L1_based_features_to_drop_{time}_{target}_manual_drop_time_max_spatial_mean.pkl'
        with open(join(path1, fname), 'rb') as fp:
            columns_to_drop1 = list(pickle.load(fp))
        if 'Run Date' in columns_to_drop1:
            columns_to_drop1.remove('Run Date')

        fname = 'time_max_spatial_mean_features.pkl'
        with open(join(path, fname), 'rb') as fp:
            columns_to_drop2 = pickle.load(fp)
        vars_to_drop = base_vars_to_drop + columns_to_drop1 + columns_to_drop2

    else:
        vars_to_drop = base_vars_to_drop

    # LOAD DATA
    print(f'Loading {time} {target} data...(from _load_test_data)')
    fname = join(config.ML_DATA_STORAGE_PATH,
                 f'{time}_testing_matched_to_{target}_0km_dataset.pkl')
    test_data = io.load_dataframe(fname=fname,
                                  target_vars=[
                                      'matched_to_tornado_0km',
                                      'matched_to_severe_hail_0km',
                                      'matched_to_severe_wind_0km'
                                  ],
                                  vars_to_drop=vars_to_drop)

    examples = test_data['examples']
    target_values = test_data[f'matched_to_{target}_0km'].values
    if drop_opt == '_only_important_pred':
        path = '/work/mflora/ML_DATA/permutation_importance/'
        if 'Log' in model_name:
            tag = '_drop_high_corr_pred'
        else:
            tag = ''
        fname = join(
            path,
            f'permutation_importance_{model_name}_{target}_{time}_training_norm_aupdc{tag}.pkl'
        )
        # load_pickle and InterpretToolkit come from the project's
        # interpretation utilities (their imports are not shown here).
        perm_imp_results = load_pickle([fname])
        myInterpreter = InterpretToolkit(model=[None])
        myInterpreter.set_results(perm_imp_results,
                                  option='permutation_importance')
        important_vars = myInterpreter.get_important_vars(perm_imp_results,
                                                          multipass=True)
        important_vars += ['Run Date']

        examples = examples[important_vars]

    if return_info:
        info = test_data['info']
        return examples, target_values, info
    else:
        return examples, target_values
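
A hypothetical call, with illustrative parameter values:

examples, target_values = _load_test_data(time='first_hour',
                                          target='tornado',
                                          drop_opt='',
                                          model_name='RandomForest')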
Example #6
from cross_validation_generator import DateBasedCV
from wofs.util import config
from wofs_ml.io.io import IO, vars_to_drop

import pandas as pd
from os.path import join

# Set up a date-based cross-validation split for the
# first-hour tornado training dataset.
io = IO()

time = 'first_hour'
fname = join(config.ML_DATA_STORAGE_PATH, f'{time}_training_dataset.pkl')
target_vars = ['matched_to_tornado_0km']

data = io.load_dataframe(fname=fname,
                         target_vars=target_vars,
                         vars_to_drop=vars_to_drop)

examples = data['examples']
targets = data['matched_to_tornado_0km']

feature_names = examples.columns.to_list()
date_col_idx = feature_names.index('Run Date')

cv = DateBasedCV(n_splits=5,
                 date_col_idx=date_col_idx,
                 y=targets,
                 verbose=True)

X = examples.to_numpy()
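
Assuming DateBasedCV follows the scikit-learn splitter convention, with a split method that yields (train, test) index arrays (an assumption; its interface is not shown here), the folds could be consumed as:

y = targets.to_numpy()
for train_idx, test_idx in cv.split(X):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]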