def own_grid_search(model_name, train_data, test_data, train_data2):
    '''
    Run a hand-rolled grid search over the hyperparameters for `model_name`,
    scoring each parameter combination with walk-forward validation splits.

    Input: String (model name), Pandas DataFrame (training data),
        Pandas DataFrame (test data), Pandas DataFrame (data to refit the
        best parameter combination on).
    Output: Best fit model from grid search of parameters, plus its ROC AUC
        (neural nets short-circuit and return only the fitted model).

    NOTE(review): the validation loop below reads the module-level `train`
    DataFrame rather than the `train_data` parameter, and also deletes
    train['year'] / train['month'] as a side effect -- confirm intentional.
    '''
    model = get_model(model_name, train_data)
    # Keras models are fit once by their own routine: no grid search, and
    # only the model (no ROC AUC) comes back on this early-return path.
    if isinstance(model, keras.models.Sequential):
        model = fit_neural_net(model, train_data, test_data)
        return model
    roc_auc_scores_list = []
    grid_parameters = get_grid_params(model_name)
    param_names, param_combs = prepare_grid_params(grid_parameters)
    for idx, param_comb in enumerate(param_combs):
        # One record per parameter combination: the combination's index
        # ('model'), its parameter values, and a list of per-split ROC AUCs.
        output_dict = defaultdict(list)
        param_dict = {}
        output_dict['model'] = idx
        # NOTE(review): `idx` is reused as the inner loop variable, shadowing
        # the combination index from this point on in the outer iteration.
        for idx, param in enumerate(param_names):
            output_dict[param] = param_comb[idx]
            param_dict[param] = param_comb[idx]
        # Walk-forward validation: slide the split date back from the latest
        # fire date in 2-unit steps. NOTE(review): the loop variable is named
        # months_forward but is passed to timedelta as weeks -- confirm unit.
        for months_forward in xrange(0, 78, 2):
            date_split = train.date_fire.max() - datetime.timedelta(weeks=months_forward)
            training_set, validation_set = tt_split_same_months(train, 2013, [1], days_back=14, exact_split_date = date_split, direct_prior_days=False, add_test=True)
            # for months_forward in xrange(0, 132, 2):
            # for months_forward in xrange(0, 33, 1):
            # date_split = datetime.date(2013, 1, 1)
            # training_set, validation_set = tt_split_early_late(train, date_split, months_forward, months_backward=0.5, days_forward=2, weeks_forward=months_forward)
            # training_set, validation_set = tt_split_same_months(train, 2013, [months_forward], days_back=None)
            # training_set, validation_set = tt_split_early_late(train, input_date, months_forward, months_backward=0.5, days_forward=30)
            # If there are no actual fires here, then training/testing on it
            # is pointless and the ROC area under the curve can't be
            # calculated.
            print training_set.shape, validation_set.shape
            if validation_set.fire_bool.sum() > 0 and training_set.fire_bool.sum() > 0:
                model = fit_model(model, param_dict, training_set.drop('date_fire', axis=1))
                roc_auc_score = predict_score_model(model, validation_set.drop('date_fire', axis=1))
                output_dict['roc_auc'].append(roc_auc_score)
        roc_auc_scores_list.append(output_dict)
    # Side effect on the module-level `train`: drop the helper columns used
    # by the splitters above.
    del train['year']
    del train['month']
    # Persist the raw per-combination scores so a run can be inspected later.
    roc_save_filename = './model_output/roc_auc_daysprioryear_lessm_15_' + model_name
    with open(roc_save_filename, 'w+') as f:
        pickle.dump(roc_auc_scores_list, f)
    # Refit on train_data2 using the best-scoring parameter combination.
    best_params, best_roc_auc = return_best_params(roc_auc_scores_list)
    model = fit_model(model, best_params, train_data2.drop('date_fire', axis=1))
    return model, best_roc_auc
if __name__ == '__main__':
    # Diagnostic script: load a pickled DataFrame (path in sys.argv[1]) and
    # print the per-year fire-date ranges produced by each walk-forward
    # split, to sanity-check tt_split_same_months.
    with open(sys.argv[1]) as f:
        input_df = pickle.load(f)
    days_back = 60
    train, test = tt_split_all_less_n_days(input_df, days_back=days_back)
    train.loc[:, 'date_fire'] = pd.to_datetime(train['date_fire'].copy())
    # NOTE(review): loop variable is named months_forward but is passed to
    # timedelta as weeks -- confirm the intended unit.
    for months_forward in xrange(0, 78, 2):
        date_split = train.date_fire.max() - datetime.timedelta(
            weeks=months_forward)
        training_set, validation_set = tt_split_same_months(
            train, 2013, [1], days_back=14, exact_split_date=date_split,
            direct_prior_days=False, add_test=True)
        # training_set, validation_set = tt_split_early_late(train, date_split, months_forward, months_backward=None, days_forward=2, weeks_forward=months_forward)
        print months_forward
        # Per-year min/max fire dates for the training split ...
        for year in training_set.year.unique():
            print training_set.query('year == @year').date_fire.min(
                ), training_set.query('year == @year').date_fire.max()
        print 'on to validation'
        # ... and for the validation split.
        for year in validation_set.year.unique():
            print validation_set.query('year == @year').date_fire.min(
                ), validation_set.query('year == @year').date_fire.max()
        print '\n' * 2
# NOTE(review): unpaired triple-quote below -- it appears to open (or close)
# a commented-out section whose other half lies outside this chunk.
'''
    # NOTE(review): tail of a function whose `def` line falls outside this
    # chunk -- the enclosing signature and indentation are not visible here.
    return input_df


if __name__ == '__main__':
    # sys.argv[1] will hold the name of the model we want to run (logit, random forest, etc.),
    # and sys.argv[2] will hold our input dataframe (data will all the features and target).
    model_name = sys.argv[1]
    with open(sys.argv[2]) as f:
        input_df = pickle.load(f)
    days_back = 14
    train, test = tt_split_all_less_n_days(input_df, days_back=days_back)
    # train2, test2 = tt_split_early_late(train, train.date_fire.max(), months_forward = 0, months_backward=0.5)
    # Hold-out split anchored at the test set's latest fire date.
    train2, test2 = tt_split_same_months(train, 2012, [1], days_back=14, exact_split_date=test.date_fire.max(), direct_prior_days=False, add_test=True)
    # Snapshot the column lists first, since the loops below delete columns
    # from the frames while iterating over these snapshots.
    train_cols = train.columns
    test_cols = test.columns
    train2_cols = train2.columns
    test2_cols = test2.columns
    # Drop every column whose name contains 'month' from each split.
    for col in train_cols:
        if 'month' in col :
            del train[col]
    for col in test_cols:
        if 'month' in col:
            del test[col]
    # NOTE(review): chunk is truncated here -- the body of this loop falls
    # outside this view.
    for col in train2_cols:
import datetime

import pandas as pd

from data_manip.tt_splits import tt_split_all_less_n_days, tt_split_early_late, tt_split_same_months

# NOTE(review): `sys` and `pickle` are used below but not imported in this
# chunk -- presumably imported elsewhere in the original file.

if __name__ == '__main__':
    # Diagnostic script (near-duplicate of the other __main__ block in this
    # file): print the per-year fire-date ranges each walk-forward split
    # produces from the pickled DataFrame at sys.argv[1].
    with open(sys.argv[1]) as f:
        input_df = pickle.load(f)
    days_back = 60
    train, test = tt_split_all_less_n_days(input_df, days_back=days_back)
    train.loc[:, 'date_fire'] = pd.to_datetime(train['date_fire'].copy())
    # NOTE(review): loop variable is named months_forward but is passed to
    # timedelta as weeks -- confirm the intended unit.
    for months_forward in xrange(0, 78, 2):
        date_split = train.date_fire.max() - datetime.timedelta(weeks=months_forward)
        training_set, validation_set = tt_split_same_months(train, 2013, [1], days_back=14, exact_split_date = date_split, direct_prior_days=False, add_test=True)
        # training_set, validation_set = tt_split_early_late(train, date_split, months_forward, months_backward=None, days_forward=2, weeks_forward=months_forward)
        print months_forward
        # Per-year min/max fire dates for the training split ...
        for year in training_set.year.unique():
            print training_set.query('year == @year').date_fire.min(), training_set.query('year == @year').date_fire.max()
        print 'on to validation'
        # ... and for the validation split.
        for year in validation_set.year.unique():
            print validation_set.query('year == @year').date_fire.min(), validation_set.query('year == @year').date_fire.max()
        print '\n' * 2
    # NOTE(review): unpaired triple-quote -- it appears to pair with one in
    # another chunk of the original file; the prints after it may therefore
    # be inside a string (i.e. commented out), not executable code.
    '''
    print training_set.date_fire.min(), training_set.date_fire.max()
    print validation_set.date_fire.min(), validation_set.date_fire.max()
    print training_set.fire_bool.sum(), validation_set.fire_bool.sum()
    print '\n' * 2