def main():
    """Run a cross-validated grid search for every dataset in ``args.d_name``.

    For each dataset the full ``X``/``y`` and the problem type are loaded, a
    model is built, and a ``GridSearchCV`` scored by ROC-AUC is fitted over
    five stratified shuffle splits (15% held out, seed 1377).  Nothing is
    returned, and the fitted search object is not stored or reported here.

    NOTE(review): the nesting below is a best reading of the code -- confirm
    which statements are really meant to sit under the ``'xgb'`` check.
    """
    for d_name in args.d_name:
        dataset = loaddata_utils.load_data(d_name)
        X, y, problem = dataset['full']['X'], dataset['full']['y'], dataset['problem']
        if 'xgb' in args.model_name:
            model = get_xgb_model(args.model_name, problem)
            # Same unpacking as a few lines above, repeated.
            X, y, problem = dataset['full']['X'], dataset['full']['y'], dataset['problem']
            cv = StratifiedShuffleSplit(n_splits=5, test_size=0.15, random_state=1377)
            # hyperparameters....
            # NOTE(review): this rebinds ``model``, discarding the object
            # returned by get_xgb_model() above.  It also reads a bare
            # ``model_name`` that is never assigned in this function --
            # presumably a module-level name (or meant to be
            # ``args.model_name``); verify.
            model = get_ebm_model(model_name, problem, random_state=1377)
            # A list of single-key dicts: each hyperparameter is varied in
            # its own grid instead of taking the cross product of all three.
            param_grid = [
                {'min_child_weight': [0., 0.5, 1., 2., 3.]},
                {'learning_rate': [0.5, 0.2, 0.1, 0.05]},
                {'reg_lambda': [1.0, 0.1, 0.01, 0.]},
            ]
            # with parallel_backend('threading'):
            # refit=False: only the CV scores are computed, no final refit.
            cv_model = GridSearchCV(model, param_grid=param_grid, n_jobs=5, scoring='roc_auc', cv=cv, refit=False)
            cv_model.fit(X, y)
def convert_old_model(model, d_name):
    """Bring a model saved in the old format up to the current attribute layout.

    A model that already has a ``cat_columns`` attribute is returned untouched.
    Otherwise the full feature table of dataset ``d_name`` is loaded and:

    * FLAM / RSpline models get ``GAM_plot_dataframe`` replaced by the result
      of ``revert_dataframe`` applied to their current GAM plot dataframe;
    * bagging models get ``not_revert = True``;
    * ``cat_columns`` is set to the names of the object-dtype columns of ``X``.

    Args:
        model: The unpickled model object to convert (mutated in place).
        d_name: Dataset name passed to ``load_data``.

    Returns:
        The same ``model`` object.

    Raises:
        Exception: If ``model`` is a ``LabelEncodingFitMixin``; note that the
            attributes above have already been set when this is raised.
    """
    # Presence of ``cat_columns`` marks an already-converted model.
    if hasattr(model, 'cat_columns'):
        return model

    from loaddata_utils import load_data
    features = load_data(d_name)['full']['X']

    revertible_types = (MyFLAMClassifier, MyFLAMRegressor,
                        MyRSplineClassifier, MyRSplineRegressor)
    if isinstance(model, revertible_types):
        # Fetch the old dataframe, transform it, and store it back.
        old_df = model.get_GAM_plot_dataframe()
        model.GAM_plot_dataframe = model.revert_dataframe(old_df)

    if isinstance(model, (MyBaggingRegressor, MyBaggingClassifier)):
        model.not_revert = True

    is_object_column = features.dtypes == object
    model.cat_columns = features.columns[is_object_column].values.tolist()

    if isinstance(model, LabelEncodingFitMixin):
        raise Exception('Should just discard these models!')

    return model
def main():
    """Run every (dataset, split, model) experiment and append results to a CSV.

    Reads its configuration from the module-level ``args`` and, for each
    dataset in ``args.d_name``:

    1. loads the dataset into the module-level global ``dataset``;
    2. builds ``args.n_splits`` train/test splits (stratified for
       classification problems, plain shuffle otherwise);
    3. for each split from ``args.start_split`` on and each name in
       ``args.model_name``, calls the ``get_<args.exp_mode>`` function and
       appends one record to ``./results/<args.identifier>.csv``.

    Combinations already present in that CSV are skipped when
    ``args.check_in_records`` is set.  Experiments that return ``None`` are
    skipped without writing a record.

    Side effects: creates ``<args.output_dir>/<args.identifier>`` and
    ``./results`` if missing; sets ``args.lam`` and ``args.split_cls``.
    """
    import gc

    global dataset  # to make it accessible in the function get_model()

    # makedirs(exist_ok=True) also creates a missing ``args.output_dir`` and
    # does not fail if the directory appears between a check and the call.
    os.makedirs(os.path.join(args.output_dir, args.identifier), exist_ok=True)
    os.makedirs('./results/', exist_ok=True)
    csv_path = os.path.join('./results/', '%s.csv' % args.identifier)

    # Index of already-finished runs, used to skip them below.
    curr_content_lookup = None
    if os.path.exists(csv_path):
        curr_content_lookup = pd.read_csv(csv_path).set_index(
            ['d_name', 'model_name', 'split_idx']).sort_index()

    # (model-name prefix, dataset key) pairs: a per-dataset value is forwarded
    # to the experiment only for models whose name starts with the prefix.
    per_dataset_model_args = (
        ('spline', 'search_lam'),
        ('spline', 'n_splines'),
        ('rspline', 'discrete'),
        ('rspline', 'maxk'),
    )

    for d_name in args.d_name:
        dataset = load_data(d_name)
        print(d_name)

        # Handle the spline lam parameters. Reset for every dataset.
        args.lam = None

        X, y, problem = dataset['full']['X'], dataset['full']['y'], dataset['problem']
        test_size = args.test_size

        args.split_cls = StratifiedShuffleSplit if problem == 'classification' else ShuffleSplit
        train_test_ss = args.split_cls(n_splits=args.n_splits,
                                       test_size=test_size,
                                       random_state=args.random_state)

        for split_idx, (train_idx, test_idx) in enumerate(train_test_ss.split(X, y)):
            if split_idx < args.start_split:
                continue

            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

            for model_name in args.model_name:
                if args.check_in_records and curr_content_lookup is not None \
                        and (d_name, model_name, split_idx) in curr_content_lookup.index:
                    print('Found in the record. Skip! \n"%s %s %d"'
                          % (d_name, model_name, split_idx))
                    continue

                print('Start running "%s %s %d"' % (d_name, model_name, split_idx))
                start_time = time.time()

                # Set the range of hyperparameters to search for each dataset
                # to save time.
                additional_model_args = {}
                for prefix, key in per_dataset_model_args:
                    if model_name.startswith(prefix) and key in dataset:
                        additional_model_args[key] = dataset[key]

                # SECURITY NOTE: eval() resolves a function name built from a
                # command-line value; only run this with trusted arguments.
                exp_mode_fn = eval('get_%s' % args.exp_mode)
                experiment_result = exp_mode_fn(X_train, y_train, X_test, y_test,
                                                problem, d_name, model_name, split_idx,
                                                **additional_model_args)
                if experiment_result is None:
                    continue

                record = OrderedDict()
                record['d_name'] = d_name
                record['model_name'] = model_name
                record['split_idx'] = split_idx
                record['n_splits'] = args.n_splits
                record['random_state'] = args.random_state
                record['fit_time'] = float(time.time() - start_time)
                record['test_size'] = test_size
                record.update(experiment_result)

                # Follow the column order
                output_csv(csv_path, record)

                print('finish %s %s %d/%d and %s with %.1fs'
                      % (args.exp_mode, d_name, split_idx, args.n_splits,
                         model_name, float(time.time() - start_time)))

                # Reclaim memory held by the finished model before the next run.
                gc.collect()
def main():
    """Build and cache, per dataset, the GAM plot dataframes of recorded models.

    Reads the records CSV at ``args.data_path`` (one row per trained model,
    with at least ``d_name``, ``model_name`` and ``model_path`` columns).  For
    each dataset in ``args.d_name`` (default: all datasets in the records) a
    dict ``{model_name: dataframe}`` is stored as
    ``<args.output_dir>/<args.identifier>-<records file stem>-df/<d_name>.pkl``.

    For every model the dataframe returned by
    ``get_GAM_plot_dataframe_by_models`` gets an ``importance`` column: the
    average of ``|y|`` weighted by how often each unique feature value occurs
    in the full data, or ``-1`` for the ``offset`` row.  For datasets whose
    name starts with ``ss`` the ``gnd_truth`` entry is built from the models
    at ``dset['models_path']`` and additionally gets a ``sample_weights``
    column.

    Existing entries are kept unless ``args.overwrite`` is set.  Models whose
    name marks them as non-GAM, or whose first pickled instance lacks a truthy
    ``is_GAM``, are skipped.  The pickle is rewritten after each model.
    """
    import sys

    def _save(result_dict, output_path):
        # The context manager closes (and thereby flushes) the file handle.
        with open(output_path, 'wb') as fp:
            pickle.dump(result_dict, fp)

    def _importance(row, weights):
        # Count-weighted mean absolute effect; the offset row has none.
        if row.feat_name == 'offset':
            return -1
        return np.average(np.abs(row.y), weights=weights)

    if not os.path.exists(args.data_path):
        sys.exit('Exit! Not existing this file %s' % args.data_path)

    data_path_filename = args.data_path.split('/')[-1].split('.')[0]
    output_dir = os.path.join(
        args.output_dir, '%s-%s-df' % (args.identifier, data_path_filename))
    # Creates ``args.output_dir`` as well when it does not exist yet.
    os.makedirs(output_dir, exist_ok=True)

    records_df = pd.read_csv(args.data_path)

    if args.d_name is None:
        args.d_name = records_df.d_name.unique()
    if args.model_name is None:
        args.model_name = ['gnd_truth'] + records_df.model_name.unique().tolist()

    for d_name in args.d_name:
        output_path = os.path.join(output_dir, '%s.pkl' % (d_name))

        # Resume from a previously written result file, if any.
        result_dict = {}
        if os.path.exists(output_path):
            with open(output_path, 'rb') as fp:
                result_dict = pickle.load(fp)

        with Timer(d_name, remove_start_msg=False):
            df2 = records_df[records_df.d_name == d_name]
            if len(df2) == 0:
                print('No record found for this dataset %s' % (d_name))
                continue

            dset = load_data(d_name)

            # Per feature: its sorted unique values and how often each occurs.
            default_x_values_lookup, default_x_counts = {}, {}
            for feat_name in dset['full']['X']:
                X_uni, X_counts = np.unique(dset['full']['X'][feat_name],
                                            return_counts=True)
                default_x_values_lookup[feat_name] = X_uni
                default_x_counts[feat_name] = X_counts

            for model_name in args.model_name:
                # Handle gnd_truth model class in ss experiments
                if model_name == 'gnd_truth' and d_name.startswith('ss'):
                    if 'gnd_truth' not in result_dict or args.overwrite:
                        gnd_truth_models = mypickle_load(dset['models_path'])
                        gnd_df = get_GAM_plot_dataframe_by_models(
                            gnd_truth_models, default_x_values_lookup)
                        result_dict['gnd_truth'] = gnd_df

                        gnd_df['sample_weights'] = gnd_df.apply(
                            lambda row: None if row.feat_name == 'offset'
                            else default_x_counts[row.feat_name],
                            axis=1)
                        gnd_df['importance'] = gnd_df.apply(
                            lambda row: _importance(row, row.sample_weights),
                            axis=1)

                        _save(result_dict, output_path)
                        print('Finish this %s gnd_truth' % (d_name))
                    else:
                        print('Already finish this %s %s' % (d_name, model_name))
                    continue

                df = df2[df2.model_name == model_name]
                if len(df) == 0:
                    print('No record found for this model %s in dataset %s'
                          % (model_name, d_name))
                    continue

                if not args.overwrite and model_name in result_dict:
                    print('Already finish this %s %s' % (d_name, model_name))
                    continue

                if 'rf' in model_name or 'xgb-d3' in model_name or 'skgbt-d3' in model_name:
                    print(model_name, 'is not a GAM. Skip!')
                    continue

                with Timer('loading %s to check if it is a GAM' % model_name):
                    model = mypickle_load(df.model_path.iloc[0])
                    if not hasattr(model, 'is_GAM') or not model.is_GAM:
                        print(model_name, 'is not a GAM. Skip!')
                        continue

                with Timer(d_name + ' ' + model_name):
                    # use a generator to save memory of loading each model to get df
                    models = model_generator(df.model_path.tolist())
                    model_df = get_GAM_plot_dataframe_by_models(
                        models, default_x_values_lookup)
                    result_dict[model_name] = model_df

                    model_df['importance'] = model_df.apply(
                        lambda row: _importance(
                            row, default_x_counts.get(row.feat_name)),
                        axis=1)

                _save(result_dict, output_path)
def main():
    """Run the feature-importance experiment for every recorded GAM model.

    Reads the records CSV at ``args.data_path``, optionally restricted to
    ``args.model_name``, ``args.d_name`` and splits below ``args.end_splits``.
    For each remaining record the pickled model is loaded, the record's
    train/test split is rebuilt, and ``args.exp_cls(...).run_exp()`` is run
    with the first half of the test set for selection and the second half for
    reporting.  One row per record is appended to
    ``<args.output_dir>/<args.identifier>-fimp-<args.exp_mode>-<records file stem>.tsv``.

    With ``args.overwrite`` set, existing output rows for the selected
    (dataset, model) pairs are dropped first (the whole file when no filter
    is given).  Without it, records already present for ``args.metric`` are
    skipped.  Regression datasets are only processed when the metric is
    ``'mse'``, and models without a truthy ``is_GAM`` are skipped.
    """
    import sys

    if not os.path.exists(args.data_path):
        sys.exit('Exit! Not existing this file %s' % args.data_path)

    # Read into the inputs
    records_df = pd.read_csv(args.data_path)
    if args.model_name is not None:
        records_df = records_df.loc[vector_in(records_df.model_name, args.model_name)]
    if args.d_name is not None:
        records_df = records_df.loc[vector_in(records_df.d_name, args.d_name)]
    if args.end_splits is not None:
        records_df = records_df.loc[records_df.split_idx < args.end_splits]

    # exist_ok avoids the check-then-create race and builds missing parents.
    os.makedirs(args.output_dir, exist_ok=True)

    data_path_filename = args.data_path.split('/')[-1].split('.')[0]
    output_path = os.path.join(
        args.output_dir,
        '%s-fimp-%s-%s.tsv' % (args.identifier, args.exp_mode, data_path_filename))

    # Check the record if overwrite flag is 1!
    if args.overwrite and os.path.exists(output_path):
        if args.model_name is None and args.d_name is None:
            os.remove(output_path)
        else:
            records_df_index = records_df.set_index(['d_name', 'model_name']).index
            df = pd.read_csv(output_path, sep='\t')
            # Keep only the rows whose (dataset, model) pair is NOT about to
            # be recomputed; a boolean mask keeps the frame's dtypes intact.
            keep = [key not in records_df_index
                    for key in zip(df.d_name, df.model_name)]
            new_df = df.loc[keep]
            if len(new_df) == 0:
                os.remove(output_path)
            else:
                new_df.to_csv(output_path, sep='\t', index=None)

    # Index of finished runs, used below to skip them when not overwriting.
    curr_content_lookup = None
    if not args.overwrite and os.path.exists(output_path):
        curr_content_lookup = pd.read_csv(output_path, sep='\t') \
            .set_index(['d_name', 'model_name', 'split_idx', 'metric']).sort_index()

    print('Total df size: ', len(records_df))

    metadata_keys = [
        'd_name', 'model_name', 'model_path', 'split_idx', 'n_splits',
        'test_size', 'random_state'
    ]

    for d_name, df in records_df.groupby('d_name'):
        dataset = load_data(d_name)

        if dataset['problem'] == 'regression' and args.metric != 'mse':
            print(
                'Regression dataset only uses mse as the metric. Skip dataset %s for metric %s.'
                % (d_name, args.metric))
            continue

        for row_idx, (df_idx, record) in enumerate(df.iterrows()):
            if curr_content_lookup is not None and (
                    d_name, record.model_name, record.split_idx,
                    args.metric) in curr_content_lookup.index:
                print('Found in the record. Skip! \n"%s %s %d %s"'
                      % (d_name, record.model_name, record.split_idx, args.metric))
                continue

            model = mypickle_load(record.model_path)
            if not hasattr(model, 'is_GAM') or not model.is_GAM:
                # model is not a GAM
                continue

            with Timer(
                    'handling record dataset %s %s %d with idx %d of total %d (%d)'
                    % (d_name, record.model_name, record.split_idx, row_idx,
                       df.shape[0], df_idx)):
                # Reload the train and test set for that record
                X_train, X_test, y_train, y_test = \
                    load_train_test_data(dataset, record.split_idx,
                                         record.n_splits, record.test_size,
                                         record.random_state)

                # Record the metadata
                the_result = OrderedDict()
                for k in metadata_keys:
                    the_result[k] = record[k]
                the_result['metric'] = args.metric

                # First half of the test set selects, second half reports.
                half = X_test.shape[0] // 2
                modes = [
                    ('test_test',
                     X_test.iloc[:half], y_test.iloc[:half],
                     X_test.iloc[half:], y_test.iloc[half:]),
                ]
                for mode_name, X_selection, y_selection, X_report, y_report in modes:
                    exp_obj = args.exp_cls(X_selection, y_selection,
                                           dataset['problem'], model,
                                           args.metric, args.n_features_limit,
                                           X_report, y_report)
                    exp_result = exp_obj.run_exp()
                    for k in exp_result:
                        the_result['%s_%s' % (mode_name, k)] = exp_result[k]

                output_csv(output_path, the_result, delimiter='\t')