from mime import *
import pmlb
import pandas as pd
from sklearn.svm import SVC
import matplotlib.pyplot as plt

x, y = pmlb.fetch_data('spambase', return_X_y=True)
data = pd.DataFrame(x)
explainer = Mime(data, y, categorical=[55, 56])

blackBox = SVC()
blackBox.fit(x[:3000], y[:3000])
print(blackBox.score(x[3000:], y[3000:]))

all_explanations = []
for i in range(100):
    explanation, pred = explainer.explain(x[-1], blackBox.predict)
    all_explanations.append(explanation)
print(all_explanations[:10])

importance_distributions = [list(column) for column in zip(*all_explanations)]
for feature in importance_distributions:
    plt.hist(feature)
    plt.xlim((0, 5))
    plt.show()
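The zip(*...) idiom above transposes the list of per-run explanations into per-feature importance samples. An equivalent NumPy formulation, shown here only as a sketch (it assumes each explanation is a fixed-length importance vector, as in the loop above):

import numpy as np

scores = np.array(all_explanations)  # shape: (n_repeats, n_features)
for j in range(scores.shape[1]):
    plt.hist(scores[:, j])  # distribution of feature j's importance across repeats
    plt.xlim((0, 5))
    plt.show()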
def fit(p):
    out_name = p._str(p)  # generate random fname str before saving
    seed(p.seed)
    s = S_save(p)

    #################################################################### DATA ##############################################################
    # testing data should always be generated with the same seed
    if p.dset == 'gaussian':
        p.n_train = int(p.n_train_over_num_features * p.num_features)
        # warning - this reseeds!
        X_train, y_train, X_test, y_test, s.betastar = \
            data.get_data_train_test(n_train=p.n_train, n_test=p.n_test,
                                     p=p.num_features,
                                     noise_std=p.noise_std,
                                     noise_distr=p.noise_distr, iid=p.iid,
                                     # parameters to be determined
                                     beta_type=p.beta_type,
                                     beta_norm=p.beta_norm,
                                     seed_for_training_data=p.seed,
                                     cov_param=p.cov_param)
    elif p.dset == 'pmlb':
        s.dset_name = data.REGRESSION_DSETS_LARGE_NAMES_RECOGNIZABLE[p.dset_num]
        seed(703858704)
        X, y = pmlb.fetch_data(s.dset_name, return_X_y=True)

        # normalize the data (note: this standardizes each sample, row-wise)
        X = (X - np.mean(X, axis=1).reshape(-1, 1)) / np.std(X, axis=1).reshape(-1, 1)
        y = (y - np.mean(y)) / np.std(y)
        X_train, X_test, y_train, y_test = train_test_split(X, y)  # get test set
        seed(p.seed)
        X_train, y_train = shuffle(X_train, y_train)
        p.num_features = X_train.shape[1]
        p.n_train = int(p.n_train_over_num_features * p.num_features)
        '''
        while p.n_train <= X_train.shape[0]:
            X_train = np.vstack((X_train, 1e-3 * np.random.randn(X_train.shape[0], X_train.shape[1])))
            y_train = np.vstack((y_train, y_train))
        '''
        if p.n_train > X_train.shape[0]:
            print('this value of n too large')
            exit(0)
        elif p.n_train <= 1:
            print('this value of n too small')
            exit(0)
        else:
            X_train = X_train[:p.n_train]
            y_train = y_train[:p.n_train]

    #################################################################### FITTING ##############################################################
    if not p.model_type == 'rf':
        # fit model
        if p.model_type == 'linear_sta':
            s.w = X_train.T @ y_train / X_train.shape[0]
        elif 'mdl' in p.model_type:
            if p.model_type == 'mdl_orig':
                U, sv, Vh = npl.svd(X_train / np.sqrt(p.n_train))
                a = U.T @ y_train  # / (np.sqrt(p.n_train) * p.noise_std)
                a = a[:sv.size]

                def mdl_loss(l):
                    return np.sum(np.square(a) / (1 + np.square(sv) / l) +
                                  np.log(1 + np.square(sv) / l))

                opt_solved = minimize(mdl_loss, x0=1e-10)
                s.lambda_opt = opt_solved.x
                s.loss_val = opt_solved.fun
                inv = npl.pinv(X_train.T @ X_train / p.n_train +
                               s.lambda_opt * np.eye(p.num_features))
                s.w = inv @ X_train.T @ y_train / p.n_train
            elif p.model_type == 'mdl_m1':
                eigenvals, eigenvecs = npl.eig(X_train.T @ X_train)
                if p.dset == 'pmlb' and p.n_train > p.num_features + 1:
                    def estimate_sigma_unbiased():
                        m = LinearRegression(fit_intercept=False)
                        m.fit(X_train, y_train)
                        y_pred = m.predict(X_train)
                        return np.sum(np.square(y_train - y_pred)) / \
                            (p.n_train - p.num_features - 1)

                    p.noise_std = estimate_sigma_unbiased()
                var = p.noise_std ** 2

                def mdl1_loss(l):
                    inv = npl.pinv(X_train.T @ X_train + l * np.eye(p.num_features))
                    thetahat = inv @ X_train.T @ y_train
                    mse_norm = npl.norm(y_train - X_train @ thetahat) ** 2 / (2 * var)
                    theta_norm = npl.norm(thetahat) ** 2 / (2 * var)
                    eigensum = 0.5 * np.sum(np.log((eigenvals + l) / l))
                    return mse_norm + theta_norm + eigensum

                opt_solved = minimize(mdl1_loss, x0=1e-10)
                s.lambda_opt = opt_solved.x
                s.loss_val = opt_solved.fun
                inv = npl.pinv(X_train.T @ X_train +
                               s.lambda_opt * np.eye(p.num_features))
                s.w = inv @ X_train.T @ y_train
        else:
            if p.model_type == 'ols':
                m = LinearRegression(fit_intercept=False)
            elif p.model_type == 'lasso':
                m = Lasso(fit_intercept=False, alpha=p.reg_param)
            elif p.model_type == 'ridge':
                if p.reg_param < 0:
                    if p.reg_param == -1:
                        m = RidgeCV(fit_intercept=False,
                                    alphas=np.logspace(-3, 3, num=10, base=10))
                    else:
                        m = RidgeCV(fit_intercept=False,
                                    alphas=np.logspace(-3, 3, num=10, base=10),
                                    cv=int(-1 * p.reg_param))
                else:
                    m = Ridge(fit_intercept=False, alpha=p.reg_param)
            m.fit(X_train, y_train)
            if p.reg_param < 0 and p.model_type == 'ridge':
                s.lambda_opt = m.alpha_
            s.w = m.coef_

        # save df
        if p.model_type == 'ridge':
            S = X_train @ np.linalg.pinv(X_train.T @ X_train +
                                         p.reg_param * np.eye(X_train.shape[1])) @ X_train.T
            s.df1 = np.trace(S @ S.T)
            s.df2 = np.trace(2 * S - S.T @ S)
            s.df3 = np.trace(S)
        else:
            s.df1 = min(p.n_train, p.num_features)
            s.df2 = s.df1
            s.df3 = s.df1

        # store predictions and things about w
        # s.H_trace = np.trace(H)
        s.wnorm = np.linalg.norm(s.w)
        s.num_nonzero = np.count_nonzero(s.w)
        s.preds_train = X_train @ s.w
        s.preds_test = X_test @ s.w
    elif p.model_type == 'rf':
        rf = RandomForestRegressor(n_estimators=p.num_trees, max_depth=p.max_depth)
        rf.fit(X_train, y_train)
        s.preds_train = rf.predict(X_train)
        s.preds_test = rf.predict(X_test)

    # set things
    s.train_mse = metrics.mean_squared_error(s.preds_train, y_train)
    s.test_mse = metrics.mean_squared_error(s.preds_test, y_test)

    save(out_name, p, s)
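To run the mdl_m1-style λ selection in isolation, here is a self-contained sketch on synthetic data. All names (X, y, lam, rng) are local to the example; it swaps npl.eig for eigvalsh (the Gram matrix is symmetric) and adds a positivity bound on λ, which the unconstrained minimize call above does not enforce:

import numpy as np
import numpy.linalg as npl
from scipy.optimize import minimize

rng = np.random.default_rng(0)
n, d, sigma = 100, 10, 0.5
X = rng.standard_normal((n, d))
y = X @ rng.standard_normal(d) + sigma * rng.standard_normal(n)

eigenvals = npl.eigvalsh(X.T @ X)  # symmetric, so real eigenvalues
var = sigma ** 2

def mdl1_loss(lam):
    # ridge estimate at this lambda, plus the MDL-style complexity terms
    inv = npl.pinv(X.T @ X + lam * np.eye(d))
    thetahat = inv @ X.T @ y
    mse_norm = npl.norm(y - X @ thetahat) ** 2 / (2 * var)
    theta_norm = npl.norm(thetahat) ** 2 / (2 * var)
    eigensum = 0.5 * np.sum(np.log((eigenvals + lam) / lam))
    return mse_norm + theta_norm + eigensum

# bound lambda away from zero so the log term stays defined
opt = minimize(mdl1_loss, x0=np.array([1e-2]),
               bounds=[(1e-8, None)], method='L-BFGS-B')
print('selected lambda:', opt.x, 'loss:', opt.fun)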
def get_run_results(dataset_name, params: Bunch, df=None):
    pd.set_option('display.max_columns', 500)
    pd.set_option('max_colwidth', 1000)
    pd.set_option('display.width', 1000)
    tf.set_random_seed(123)
    np.random.seed(123)
    model_dir = os.path.join('/tmp/robulin', dataset_name)

    # fetch a PMLB dataset as a data-frame
    if df is None:
        df = fetch_data(dataset_name)
    results = run_5_combos(dataset_name, df, model_dir, params)
    perturb = params.perturb_frac
    results_dir = make_sibling_dir(__file__, f'results/pert={perturb}')
    few_fields = [
        'train', 'test', 'loss', 'auc', 'acc', 'wts_ent', 'wts_l1',
        'wts_l1_linf', 'wts_1pct', 'wts_pct1pct', 'av_ent', 'av_high',
        'a_ent', 'g_ent', 'f_a_ent', 'f_g_ent'
    ]
    keys = ['nat_nat', 'nat_per', 'per_nat', 'per_per', 'per_per_all']
    results_few = pd.DataFrame(
        [sub_dict(results[k], few_fields) for k in keys])[few_fields]
    with open(os.path.join(results_dir, 'summary.csv'), 'w+') as fd:
        results_few.to_csv(fd, float_format='%.3f', index=False)
    print(results_few)

    # show nat_nat and per_nat IG attribs
    attr_nat = pd.DataFrame([results['nat_nat']['f_g_dict']]).transpose()
    attr_per = pd.DataFrame([results['per_nat']['f_g_dict']]).transpose()
    attribs: pd.DataFrame = pd.concat([attr_nat, attr_per], axis=1)
    attribs.columns = ['nat', 'adv']
    attribs['feature'] = attribs.index
    print('IG Attribs: nat_nat vs per_nat')
    print(attribs.sort_values(by='nat', ascending=False))
    if platform.system() != 'Linux':
        plot_multi(attribs, 'feature', value='attrib', order_by='nat',
                   var='train mode')

    # show nat_nat and per_nat wts
    wts_nat = pd.DataFrame([results['nat_nat']['wts_dict']]).transpose()
    wts_per = pd.DataFrame([results['per_nat']['wts_dict']]).transpose()
    wts: pd.DataFrame = pd.concat([wts_nat, wts_per], axis=1)
    wts.columns = ['nat', 'adv']
    wts['feature'] = wts.index
    wts['nat_abs'] = abs(wts['nat'])
    print('Wts: nat_nat vs per_nat')
    print(wts.sort_values(by='nat_abs', ascending=False))
    if platform.system() != 'Linux':
        plot_multi(wts, 'feature', value='wt', order_by='nat_abs',
                   ignore=['nat_abs'], var='train mode')

    # save various results
    for k in keys:
        dir = make_sibling_dir(
            __file__, f'results/{dataset_name}/'
            f'pert={perturb}/attrib/{k}')
        ig_dict = pd.DataFrame([results[k]['f_g_dict']]).transpose()
        afvi_dict = pd.DataFrame([results[k]['f_a_dict']]).transpose()
        attribs = pd.concat([ig_dict, afvi_dict], axis=1)
        attribs.columns = ['ig', 'afvi']
        # if k in ['nat_nat', 'per_nat']:
        #     print(f'attrib for {k}:')
        #     print(attribs)
        #     attribs['feature'] = attribs.index
        #     if platform.system() != 'Linux':
        #         plot_multibar(attribs, 'feature', value='attrib', order_by='ig')
        with open(os.path.join(dir, 'attrib.csv'), 'w+') as fd:
            attribs.to_csv(fd)

    tf.logging.info('*** tensorboard cmd:')
    tf.logging.info(f'tensorboard --logdir={model_dir}')
    return results_few
    raise RuntimeError(
        "Cannot use 'tpot_all' and 'use_classic' simultaneously")
if tpot_all and estimator_select:
    raise RuntimeError(
        "Cannot use 'tpot_all' and set 'estimator_select' simultaneously")

print(">> TRAINING TPOT NN EVALUATION MODEL")
print(">> JOB START TIME: {0:.2f}".format(time.time()))
print(">> DATASET: {0}".format(args.dataset))
print(">> USING CLASSIC TPOT: {0}".format(args.use_classic))
print(">> USING TPOT-NN: {0}".format(args.use_nn))
conf_type = 'template' if args.use_template else 'config_dict'
print(">> CONFIGURATION TYPE: {0}".format(conf_type))

X, y = fetch_data(args.dataset, return_X_y=True,
                  local_cache_dir="pmlb_data_cache/")

if conf_type == 'template':
    if tpot_all:
        template_str = 'Selector-Transformer-Estimator'
    elif use_nn:
        if estimator_select == 'lr':
            template_str = 'Selector-Transformer-PytorchLRClassifier'
        elif estimator_select == 'mlp':
            template_str = 'Selector-Transformer-PytorchMLPClassifier'
    else:
        if estimator_select == 'lr':
            template_str = 'Selector-Transformer-LogisticRegression'
        elif estimator_select == 'mlp':
            template_str = 'Selector-Transformer-MLPClassifier'
#!/usr/bin/env python3
from pmlb import fetch_data, dataset_names
from progressbar import ProgressBar, Percentage, Bar, ETA
import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd

adult_X, adult_labels = fetch_data('adult', return_X_y=True)
adult_Xdf = pd.DataFrame(adult_X)

### PCA
from sklearn.decomposition import PCA

# plt.show()

def cca_fig(name, data):
    pca = PCA(n_components=2)
    trans_X = pd.DataFrame(pca.fit_transform(data))
    sb.regplot(x=trans_X[0], y=trans_X[1], fit_reg=False)
    plt.savefig(name + ".png", dpi=400)
    plt.clf()

names = dataset_names
pbar = ProgressBar(widgets=[Percentage(), Bar(right="| "), ETA()],
                   maxval=len(names)).start()
for i, n in enumerate(names):
def make_column_specs(dataset):
    df = fetch_data(dataset)
    return df_column_specs(df)
def fetch_data_Xy(name):
    return fetch_data(name, return_X_y=True,
                      local_cache_dir="~/Isi/pmlb-cache")
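A possible call site for this helper; the dataset name 'iris' is chosen here purely for illustration:

# hypothetical usage: 'iris' is an arbitrary PMLB dataset name
X, y = fetch_data_Xy('iris')
print(X.shape, y.shape)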
        'svc__gamma': np.logspace(-3, 3, 13),
        'svc__C': np.logspace(-7, 5, 13)
    },
    cv=5, n_jobs=-1)

dum = GridSearchCV(
    make_pipeline(StandardScaler(), DummyClassifier()),
    {'dummyclassifier__strategy': ['stratified', 'most_frequent', 'uniform']},
    cv=5, n_jobs=-1)

n_max = 256

for dataset in classification_dataset_names:
    X, y = fetch_data(dataset, True)

    # maximum n_max samples
    if len(y) > n_max:
        S = np.random.permutation(len(y))[:n_max]
        I = np.zeros(len(y))
        I[S] = 1
        I = I > 0
        X = X[I]
        y = y[I]

    pscores = cross_val_score(poly, X, y, cv=5, n_jobs=-1)
    rscores = cross_val_score(rbf, X, y, cv=5, n_jobs=-1)
    dscores = cross_val_score(dum, X, y, cv=5, n_jobs=-1)
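The permutation-plus-boolean-mask subsampling above can be written more directly. A minimal equivalent sketch (same random subset, differing only in row order, assuming np is NumPy as in the snippet):

# draw n_max distinct row indices and index directly,
# instead of building a boolean mask
idx = np.random.permutation(len(y))[:n_max]
X, y = X[idx], y[idx]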
def experiment(dataset, exp_args):
    X, y = fetch_data(dataset, return_X_y=True, local_cache_dir='cache')

    # making features 0 mean and unit variance
    scaler = StandardScaler()
    train_X, test_X, train_y, test_y = train_test_split(X, y)
    scaler.fit(train_X)
    train_X_scaled = scaler.transform(train_X)
    test_X_scaled = scaler.transform(test_X)

    # optionally scale the targets; from a single test, overall performance
    # is worse with scaled targets
    # if exp_args.regression:
    #     scaler_reg = StandardScaler()
    #     # scaler_reg.fit(train_y.reshape(-1, 1))
    #     train_y = scaler_reg.transform(train_y.reshape(-1, 1)).reshape(-1)
    #     test_y = scaler_reg.transform(test_y.reshape(-1, 1)).reshape(-1)

    # don't use adam on the smaller datasets
    if len(train_X) > 1000:
        solver = 'adam'
    else:
        solver = 'lbfgs'

    # max_iters = [300]
    # hidden_layers = [(512,), (1024,), (256, 256)]
    # seeds = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    # scales = [0.25, 1.0]
    # dims = [256]
    max_iters = [args.max_iters]
    scales = [0.25]
    dims = [args.single_dim]

    if exp_args.encoding_type == 'all':
        enc_types = [
            'independent-ssp', 'combined-ssp', 'combined-simplex-ssp',
            'one-hot', 'tile-code', 'pc-gauss', 'pc-gauss-tiled'
        ]
    elif exp_args.encoding_type == 'all-ssp':
        enc_types = ['independent-ssp', 'combined-ssp', 'combined-simplex-ssp']
    elif exp_args.encoding_type == 'all-other':
        enc_types = ['one-hot', 'tile-code', 'pc-gauss', 'pc-gauss-tiled']
    else:
        enc_types = [exp_args.encoding_type]

    if exp_args.debug:
        seeds = [1, 2, 3]
        hidden_layers = [(512,)]
        inter_fname = '{}/debug_enc_{}_results_{}iters_{}.csv'.format(
            exp_args.folder, exp_args.encoding_type, exp_args.max_iters, dataset)
    else:
        seeds = [1, 2, 3, 4, 5, 6, 7, 8, 9]
        if args.less_hidden_layers:
            hidden_layers = [(512, 512), (1024,)]
        else:
            hidden_layers = [(256,), (512,), (1024,), (256, 256),
                             (512, 512), (1024, 1024)]
        inter_fname = '{}/enc_{}_results_{}iters_{}.csv'.format(
            exp_args.folder, exp_args.encoding_type, exp_args.max_iters, dataset)

    # only run if the data does not already exist
    if not os.path.exists(inter_fname):
        # contains all results for this dataset
        df = pd.DataFrame()
        for max_iter in max_iters:
            for hidden_layer_sizes in hidden_layers:
                for seed in seeds:
                    for scale in scales:
                        for dim in dims:
                            for enc_type in enc_types:
                                # train_X_enc = encode_dataset(train_X, dim=dim, seed=seed, scale=scale)
                                # test_X_enc = encode_dataset(test_X, dim=dim, seed=seed, scale=scale)
                                if enc_type == 'independent-ssp':
                                    train_X_enc_scaled = encode_dataset(
                                        train_X_scaled, dim=dim, seed=seed, scale=scale)
                                    test_X_enc_scaled = encode_dataset(
                                        test_X_scaled, dim=dim, seed=seed, scale=scale)
                                    encoding_name = 'SSP Normalized'
                                elif enc_type == 'combined-ssp':
                                    train_X_enc_scaled = encode_dataset_nd(
                                        train_X_scaled, dim=dim, seed=seed,
                                        scale=scale, style='normal')
                                    test_X_enc_scaled = encode_dataset_nd(
                                        test_X_scaled, dim=dim, seed=seed,
                                        scale=scale, style='normal')
                                    encoding_name = 'Combined SSP Normalized'
                                elif enc_type == 'combined-simplex-ssp':
                                    train_X_enc_scaled = encode_dataset_nd(
                                        train_X_scaled, dim=dim, seed=seed,
                                        scale=scale, style='simplex')
                                    test_X_enc_scaled = encode_dataset_nd(
                                        test_X_scaled, dim=dim, seed=seed,
                                        scale=scale, style='simplex')
                                    encoding_name = 'Combined Simplex SSP Normalized'
                                elif enc_type in [
                                        'one-hot', 'tile-code', 'pc-gauss',
                                        'pc-gauss-tiled', 'legendre', 'ssp-proj']:
                                    train_X_enc_scaled = encode_comparison_dataset(
                                        train_X_scaled, encoding=enc_type, seed=seed,
                                        dim=dim, scale=scale, **params)
                                    test_X_enc_scaled = encode_comparison_dataset(
                                        test_X_scaled, encoding=enc_type, seed=seed,
                                        dim=dim, scale=scale, **params)
                                    if enc_type == 'one-hot':
                                        encoding_name = 'One Hot'
                                    elif enc_type == 'tile-code':
                                        encoding_name = 'Tile Coding'
                                    elif enc_type == 'pc-gauss':
                                        encoding_name = 'RBF'
                                    elif enc_type == 'pc-gauss-tiled':
                                        encoding_name = 'RBF Tiled'
                                    elif enc_type == 'legendre':
                                        encoding_name = 'Legendre'
                                    elif enc_type == 'ssp-proj':
                                        encoding_name = 'SSP Projected Axis'
                                else:
                                    raise NotImplementedError(
                                        'unknown encoding type: {}'.format(enc_type))

                                mlp = MLP(
                                    hidden_layer_sizes=hidden_layer_sizes,
                                    activation='relu',
                                    solver=solver,
                                    max_iter=max_iter,
                                    random_state=seed,
                                    early_stopping=True,
                                    validation_fraction=0.1,
                                )
                                mlp.fit(train_X_enc_scaled, train_y)
                                acc = mlp.score(test_X_enc_scaled, test_y)

                                df = df.append(
                                    {
                                        'Dim': dim,
                                        'Seed': seed,
                                        'Scale': scale if 'ssp' in enc_type else 0,
                                        'N-Tiles': exp_args.n_tiles
                                        if enc_type == 'tile-code' else 0,
                                        'Sigma': exp_args.sigma
                                        if ((enc_type == 'pc-gauss') or
                                            (enc_type == 'pc-gauss-tiled')) else 0,
                                        'Encoding': encoding_name,
                                        'Dataset': dataset,
                                        'Model': 'MLP - {}'.format(hidden_layer_sizes),
                                        'Accuracy': acc,
                                        'Solver': solver,
                                        'Max Iter': max_iter,
                                    },
                                    ignore_index=True,
                                )

                                if not args.only_encoding:
                                    mlp = MLP(
                                        hidden_layer_sizes=hidden_layer_sizes,
                                        activation='relu',
                                        solver=solver,
                                        max_iter=max_iter,
                                        random_state=seed,
                                        early_stopping=True,
                                        validation_fraction=0.1,
                                    )
                                    mlp.fit(train_X_scaled, train_y)
                                    acc = mlp.score(test_X_scaled, test_y)

                                    df = df.append(
                                        {
                                            'Dim': 0,
                                            'Seed': seed,
                                            'Scale': 0,
                                            'N-Tiles': 0,
                                            'Sigma': 0,
                                            'Encoding': 'Normalized',
                                            'Dataset': dataset,
                                            'Model': 'MLP - {}'.format(hidden_layer_sizes),
                                            'Accuracy': acc,
                                            'Solver': solver,
                                            'Max Iter': max_iter,
                                        },
                                        ignore_index=True,
                                    )

        # save each dataset individually, in case the run crashes
        # and needs to be restarted
        df.to_csv(inter_fname)

    return dataset
# some continuous features
some_continuous = []
# 10 or fewer continuous features, only continuous
small_continuous = []

# dataset_names = classification_dataset_names
dataset_names = regression_dataset_names
n_datasets = len(dataset_names)

# for i, classification_dataset in enumerate(['banana', 'iris', 'titanic']):
for i, dataset in enumerate(dataset_names):
    print('\x1b[2K\r {} of {}. {}'.format(i + 1, n_datasets, dataset), end="\r")
    df = fetch_data(dataset, return_X_y=False)
    # feat = count_features_type(df.loc[:, df.columns != 'class'])
    feat = count_features_type(df.loc[:, df.columns != 'target'])
    n_binary = feat[0]
    n_integer = feat[1]
    n_float = feat[2]
    # if classification_dataset == 'banana':
    #     print('banana:')
    #     print(feat)
    #     print(df)
    # if classification_dataset == 'titanic':
    #     print('titanic:')
    #     print(feat)
    #     print(df)
    # if classification_dataset == 'iris':
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from pmlb import fetch_data, dataset_names, classification_dataset_names, regression_dataset_names
from operon.sklearn import SymbolicRegressor
import seaborn as sns
import matplotlib.pyplot as plt
from sympy import parse_expr, symbols, lambdify

# fetch data
df = fetch_data('192_vineyard', return_X_y=False, local_cache_dir='./data/')
print(df)
X = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

# split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75,
                                                    test_size=0.25, shuffle=True)

# do a regression
reg = SymbolicRegressor()
reg.fit(X_train, y_train)
# SPDX-License-Identifier: MIT
# SPDX-FileCopyrightText: Copyright 2019-2021 Heal Research

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, make_scorer
from scipy.stats import pearsonr

from operon import RSquared
from operon.sklearn import SymbolicRegressor
from pmlb import fetch_data, dataset_names, classification_dataset_names, regression_dataset_names

# print(regression_dataset_names)

X, y = fetch_data('1027_ESL', return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75,
                                                    test_size=0.25, shuffle=True,
                                                    random_state=1234)

reg = SymbolicRegressor(
    allowed_symbols='add,sub,mul,div,constant,variable',
    offspring_generator='basic',
    local_iterations=10,
    n_threads=4,
    objectives=['r2', 'shape'],
    random_state=1234)
    if d is None:
        d = D
    assert (N >= n and D >= d)
    return arr[:n, :d]

def dot(a, b):
    assert (len(a) == len(b))
    out = []
    for x, y in zip(a, b):
        out += ["{} * {}".format(x, y)]
    return " + ".join(out)

if __name__ == "__main__":
    X, y = fetch_data(sys.argv[1], return_X_y=True)
    # X, y = slice(Xorig, 4, 3), yorig[:3]
    n, d = X.shape
    for i in range(y.size):
        if y[i] == 0:
            y[i] = -1

    # Read in the file
    with open('sgd_temp.c', 'r') as file:
        C = file.read()

    ws = ["{}{}".format("w", i) for i in range(d)]
    WS = ["{}{}".format("W", i) for i in range(d)]
    xis = ["{}{}".format("x", i) for i in range(d)]
    substitutions = {
#!/usr/bin/env python3
import pmlb
import pandas as pd

data = pmlb.fetch_data('iris')
data.to_csv('iris.csv')
        index=[0])
    assert local_cache_dir is not None
    stats_df.to_csv(
        os.path.join(local_cache_dir, dataset_name, 'summary_stats.csv'))

if __name__ == '__main__':
    # assuming this is run from the repo root directory
    local_dir = 'datasets/'
    overwrite = True
    for d in classification_dataset_names:
        print(d, '...')
        df = fetch_data(d, local_cache_dir=local_dir)
        generate_description(df, d, 'classification',
                             overwrite_existing=overwrite,
                             local_cache_dir=local_dir)
        generate_summarystats(df, d, 'classification',
                              local_cache_dir=local_dir)
    for d in regression_dataset_names:
        print(d, '...')
        df = fetch_data(d, local_cache_dir=local_dir)
        generate_description(df, d, 'regression',
def benchmark(config='', dmin=5, dmax=6):
    from pmlb import fetch_data, classification_dataset_names
    from sdv.evaluation import evaluate

    for classification_dataset in classification_dataset_names[dmin:dmax]:
        X, y = fetch_data(classification_dataset, return_X_y=True)
        X_train_full, X_test, y_train_full, y_test = train_test_split(
            X, y, test_size=0.05, random_state=2021)
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_train_full, y_train_full, random_state=2021)

        def post_process_fun(y):
            return int(y)

        def pre_process_fun(y):
            return int(y)

        # y = y.astype('uint8')
        num_classes = len(np.unique(y))
        print(np.unique(y))

        model_pars = {
            'model_pars': {
                'original_dim': X.shape[1],
                'class_num': num_classes,
                'intermediate_dim': 64,
                'intermediate_dim_2': 16,
                'latent_dim': 3,
                'Lambda1': 1,
                'batch_size': 256,
                'Lambda2': 200,
                'Alpha': 0.075
            },
            'post_process_fun': post_process_fun,  ### After prediction
            'pre_process_pars': {
                'y_norm_fun': pre_process_fun,  ### Before training
                ### Pipeline for data processing
                'pipe_list': [
                    #### coly target processing
                    {'uri': 'source/prepro.py::pd_coly', 'pars': {},
                     'cols_family': 'coly', 'cols_out': 'coly', 'type': 'coly'},
                    {'uri': 'source/prepro.py::pd_colnum_bin', 'pars': {},
                     'cols_family': 'colnum', 'cols_out': 'colnum_bin', 'type': ''},
                    {'uri': 'source/prepro.py::pd_colcat_bin', 'pars': {},
                     'cols_family': 'colcat', 'cols_out': 'colcat_bin', 'type': ''},
                ],
            }
        }

        log(f'{classification_dataset} Metrics: ------------')
        column = [f'col_{i}' for i in range(X.shape[1])]
        real_df = pd.DataFrame(X_test, columns=column)

        ##### VAEMDN
        vae, vae_enc, vae_dec = VAEMDN(model_pars=model_pars['model_pars'])
        vae.fit([X_train_full, y_train_full], epochs=50)
        vae_data = vae.predict([X_test, y_test])
        vae_df = pd.DataFrame(vae_data, columns=column)
        evl_vae = evaluate(real_df, vae_df,
                           metrics=['LogisticDetection', 'CSTest', 'KSTest'])
        log(f'Evaluation on VAE: {evl_vae}')

        log("##### AE")
        basic_ae, ae_enc, ae_dec = AUTOENCODER_BASIC(X.shape[1])
        basic_ae.fit(X_train_full, X_train_full, epochs=50)
        basic_data = basic_ae.predict(X_test)
        basic_df = pd.DataFrame(basic_data, columns=column)
        evl_ae = evaluate(real_df, basic_df,
                          metrics=['LogisticDetection', 'CSTest', 'KSTest'])
        log(f'Evaluation on Basic_AE: {evl_ae}')
from pmlb import fetch_data
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.metrics import classification_report
import numpy as np
import random
import matplotlib.pyplot as plt

# Choose data set.
adult_data, adult_labels = fetch_data('adult', return_X_y=True,
                                      local_cache_dir='./')
print(adult_data.shape, adult_labels.shape)

# Algorithms to be used
logreg = LogisticRegression(solver='lbfgs')
gaussNB = GaussianNB()
sgd = SGDClassifier(loss="hinge", penalty="l2", max_iter=5)
linear = LinearRegression()
rfc = RandomForestClassifier(n_estimators=200)

# Columns used to create predictions
feature_columns = ['age', 'workclass', 'education', 'education-num',
                   'marital-status', 'occupation', 'relationship', 'race',
                   'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
                   'native-country']
if __name__ == '__main__':
    penn_data = Path('./datasets.csv')
    dataset = []
    if penn_data.is_file():
        df = pd.read_csv(penn_data)
        dataset = df['dataset_names'].values
    else:
        print('Please create nonempty csv-file with datasets')
    if len(dataset) == 0:
        dataset = classification_dataset_names + regression_dataset_names

    for name_of_dataset in dataset:
        pmlb_data = fetch_data(name_of_dataset)
        num_classes, _ = imbalance_metrics(pmlb_data['target'].tolist())
        problem_class, metric_names = _problem_and_metric_for_dataset(
            name_of_dataset, num_classes)
        if not problem_class or not metric_names:
            print('Incorrect dataset')
            continue
        train_file, test_file = get_penn_case_data_paths(name_of_dataset)
        config_models_data = get_models_hyperparameters()
        case_name = f'penn_ml_{name_of_dataset}'

        try:
            result_metrics = CaseExecutor(params=ExecutionParams(
                train_file=train_file,
                test_file=test_file,
def evaluate_model(dataset, pipeline_components, pipeline_parameters, resultdir="."):
    input_data = fetch_data(dataset)
    features = input_data.drop('target', axis=1).values.astype(float)
    labels = input_data['target'].values

    # pipelines = [dict(zip(pipeline_parameters.keys(), list(parameter_combination)))
    #              for parameter_combination in itertools.product(*pipeline_parameters.values())]
    # pipelines = pipeline_parameters

    results_dict = {}
    classifier_class = pipeline_components[-1]
    # tmpfn = '{}/tmp--{}--{}.pkl'.format(resultdir, dataset, classifier_class.__name__)
    # Path(tmpfn).touch()

    with warnings.catch_warnings():
        # Squash warning messages. Turn this off when debugging!
        warnings.simplefilter('ignore')

        # for pipe_parameters in pipelines:
        pipeline = []
        for component in pipeline_components:
            # if component in pipe_parameters:
            if component in pipeline_parameters:
                args = pipeline_parameters[component]
                pipeline.append(component(**args))
            else:
                pipeline.append(component())

        try:
            clf = make_pipeline(*pipeline)
            cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=90483257)
            scoring = {
                'accuracy': 'accuracy',
                'f1_macro': 'f1_macro',
                'bal_accuracy': make_scorer(balanced_accuracy_score)
            }
            validation = cross_validate(clf, features, labels, cv=cv, scoring=scoring)
            avg = map2_dict(lambda k, v: ("avg_{}".format(k), np.mean(v)), validation)
            stddev = map2_dict(lambda k, v: ("std_{}".format(k), np.std(v)), validation)
            # balanced_accuracy = balanced_accuracy_score(labels, cv_predictions)
        except KeyboardInterrupt:
            sys.exit(1)
        # This is a catch-all to make sure that the evaluation won't crash due to a bad
        # parameter combination or bad data. Turn this off when debugging!
        # except Exception as e:
        #     continue

        param_string = "default"
        if pipeline_parameters != {}:
            param_string = ','.join([
                '{}={}'.format(parameter, value)
                for parameter, value in pipeline_parameters[classifier_class].items()
            ])

        dict_safe_append(results_dict, 'dataset', dataset)
        dict_safe_append(results_dict, 'classifier', classifier_class.__name__)
        dict_safe_append(results_dict, 'parameters', param_string)

        merged = merge_dicts(avg, stddev)
        for key in merged.keys():
            dict_safe_append(results_dict, key, merged[key])

        # out_text = '\t'.join(map_dict(lambda v: str(v[-1]), results_dict).values())
        # print(out_text, flush=True)
        # pd.DataFrame(results_dict).to_pickle(tmpfn)
        # os.remove(tmpfn)

    # final_fn = '{}/final--{}--{}.pkl'.format(resultdir, dataset, classifier_class.__name__)
    # pd.DataFrame(results_dict).to_pickle(final_fn)
    return results_dict
## https://github.com/EpistasisLab/penn-ml-benchmarks
## pip install pmlb
import numpy as np
from pmlb import fetch_data
from pmlb import dataset_names

x = np.zeros(len(dataset_names))
for i, dn in enumerate(dataset_names):
    d = fetch_data(dn)
    n = d.describe()["class"]["count"]
    x[i] = n
    print(str(n) + " " + str(dn))

x.min()
np.percentile(x, 50)
np.percentile(x, 80)
np.percentile(x, 90)
x.max()

# In [6]: x.min()
# Out[6]: 32.0
#
# In [7]: np.percentile(x, 50)
# Out[7]: 690.0
#
# In [8]: np.percentile(x, 80)
# Out[8]: 3772.0
#
# In [9]: np.percentile(x, 90)
def test_fetch_data_1():
    """Test fetch_data can fetch data from GitHub."""
    mushroom = fetch_data('mushroom')
    assert not mushroom.empty
    assert not mushroom.isnull().values.any()
def load(dataset="monk3", multi="normal"):
    """
    Returns X (features) and y (classes) for a dataset.

    The dataset can be from PMLB, OpenML100, a .dat file in the data/
    directory, or one of the synthetic examples from Figure 1.

    Args:
        dataset (str): name of the dataset
        multi (str): mode for processing multi-class problems
            There are three valid choices:
            - "normal": return multi-class problems normally
            - "small": convert multi-class problem into a
              smallest class against all problem
            - "large": convert multi-class problem into a
              largest class against all problem

    Returns:
        X (np.array): features of the data points
        y (np.array): classes for the data points
    """
    try:
        # PMLB does not provide a simple way to check if a dataset is available.
        # Just attempt to load, and continue through list of datasets if not found.
        pathlib.Path(".pmlb").mkdir(parents=True, exist_ok=True)
        X, y = pmlb.fetch_data(dataset, return_X_y=True, local_cache_dir=".pmlb")
        isPMLB = True
    except ValueError:
        isPMLB = False

    if isPMLB:
        # PMLB data already loaded
        pass
    elif dataset in map(str, openml.study.get_study("OpenML100", "tasks").tasks):
        task = openml.tasks.get_task(dataset)
        X, y = task.get_X_and_y()
        X = X[:, sum(np.isnan(X)) == 0]
    elif os.path.isfile("data/" + dataset + ".dat"):
        # Datasets not in PMLB or OpenML (load from file)
        X = np.genfromtxt("data/" + dataset + ".dat", delimiter=",")
        X, y = X[:, :-1], X[:, -1].astype(np.int64)
    elif dataset == "easy":
        # Synthetic example for Figure 1a
        r1 = 225
        r2 = 25
        b = 250
        X = np.concatenate([
            np.concatenate([0.10 * np.random.rand(r1, 1) - 1.0,
                            2 * np.random.rand(r1, 1) - 1], axis=1),
            np.concatenate([0.25 * np.random.rand(r2, 1) - 0.2,
                            2 * np.random.rand(r2, 1) - 1], axis=1),
            np.concatenate([0.10 * np.random.rand(b, 1) + 0.0,
                            2 * np.random.rand(b, 1) - 1], axis=1)
        ])
        y = np.array((r1 + r2) * [1] + b * [0])
    elif dataset == "imbalance":
        # Synthetic example for Figure 1b
        r = 20
        b = 480
        X = np.concatenate([
            np.concatenate([1.05 * np.random.rand(r, 1) - 1.0,
                            2 * np.random.rand(r, 1) - 1], axis=1),
            np.concatenate([1.00 * np.random.rand(b, 1) + 0.0,
                            2 * np.random.rand(b, 1) - 1], axis=1)
        ])
        y = np.array(r * [1] + b * [0])
    elif dataset == "imbalance+outlier":
        # Synthetic example for Figure 1c
        r = 20
        b = 480
        X = np.concatenate([
            np.concatenate([1 * np.random.rand(r, 1) - 1.0,
                            2 * np.random.rand(r, 1) - 1], axis=1),
            np.concatenate([1 * np.random.rand(b, 1) + 0.0,
                            2 * np.random.rand(b, 1) - 1], axis=1),
            np.array([[-1.0, 0.0]])
        ])
        y = np.array(r * [1] + b * [0] + [0])
    elif dataset == "overlap":
        # Synthetic example for Figure 1d
        r = 250
        b = 250
        X = np.concatenate([
            np.concatenate([1 * np.random.rand(r, 1) - 1,
                            2 * np.random.rand(r, 1) - 1], axis=1),
            np.concatenate([2 * np.random.rand(b, 1) - 1,
                            2 * np.random.rand(b, 1) - 1], axis=1)
        ])
        y = np.array(r * [1] + b * [0])
    else:
        raise ValueError("Dataset " + dataset + " is not recognized.")

    # Map classes down to 0 to (number_of_classes - 1)
    unique = np.unique(y)
    new = {old: new for (new, old) in enumerate(unique)}
    y = np.array([new[i] for i in y])

    if multi == "normal":
        # Treat multi-class problems normally
        pass
    else:
        # Convert multi-class problems to binary
        count = np.bincount(y)
        if multi == "small":
            # Smallest class against all
            count = -count
        elif multi == "large":
            # Largest class against all
            pass
        else:
            raise ValueError("Multi-class setting \"" + multi + "\" not recognized.")
        ind = np.argmax(count)
        y = (y == ind).astype(int)

    return X, y
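A possible call site for load; 'monk3' matches the default above, and the "small" mode is shown purely for illustration:

# hypothetical usage: binarize a PMLB dataset by its
# smallest class against the rest
X, y = load("monk3", multi="small")
print(X.shape, np.bincount(y))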
def test_fetch_data_2():
    """Test fetch_data can fetch data from local cache."""
    mushroom = fetch_data('mushroom', local_cache_dir="datasets/")
    assert not mushroom.empty
    def _more_tags(self):
        return {'non_deterministic': True, 'binary_only': True}

TEST_SKLEARN = False
TEST_PYTORCH = True

if __name__ == "__main__":
    import warnings
    warnings.filterwarnings("ignore", category=ConvergenceWarning)

    # Good binary classification dataset with floating features and appx. equal
    # class balance. Very high accuracy attainable using LR (>0.99 accuracy)
    X, y = fetch_data('clean2', return_X_y=True)

    if True:
        # first two features are IDs for the molecule!
        # The decision function will just learn to look at these...
        X = X[:, 2:]

    X_train, X_test, y_train, y_test = train_test_split(X, y)

    if TEST_SKLEARN:
        clf_sklearn = LogisticRegression(penalty='l2', solver='sag', max_iter=1000)
        clf_sklearn.fit(X_train, y_train)
        print("SKLEARN ACCURACY: {0:.3f}".format(
            clf_sklearn.score(X_test, y_test)))
        # print(clf_sklearn.coef_)
def test_fetch_data_6():
    """Test fetch_data can fetch data from GitHub with return_X_y."""
    X, y = fetch_data('mushroom', return_X_y=True)
    assert isinstance(X, np.ndarray)
    assert isinstance(y, np.ndarray)
for name in classifier_names:
    for pre in preprocs:
        test_scores[name + pre] = []
        combination_names.append(name + pre)

for i, classification_dataset in enumerate(datasets):
    # temporarily skipping the bigger datasets to save time prototyping
    if (classification_dataset == 'shuttle') or (classification_dataset == 'magic'):
        continue
    print('\x1b[2K\r {} of {}. {}'.format(i + 1, n_datasets, classification_dataset),
          end="\r")
    X, y = fetch_data(classification_dataset, return_X_y=True)

    # making features 0 mean and unit variance
    scaler = StandardScaler()
    train_X, test_X, train_y, test_y = train_test_split(X, y)
    scaler.fit(train_X)
    train_X_scaled = scaler.transform(train_X)
    test_X_scaled = scaler.transform(test_X)

    train_X_enc = encode_dataset(train_X, dim=256, seed=13, scale=1.0)
    test_X_enc = encode_dataset(test_X, dim=256, seed=13, scale=1.0)
    train_X_enc_scaled = encode_dataset(train_X_scaled,
#!/usr/bin/env python3
import pmlb
import pandas as pd

data = pmlb.fetch_data('yeast')
data.to_csv('yeast.csv')
def evaluate_model(dataset, pipeline_components, pipeline_parameters, resultdir="."):
    '''dataset: str, pipeline_components: List[Object],
    pipeline_parameters: Dict[Object, Dict[str, Any]]'''
    # download dataset from PMLB
    input_data = fetch_data(dataset)
    # separate features and labels
    features = input_data.drop('target', axis=1).values.astype(float)
    labels = input_data['target'].values

    results_dict = {}  # initialize a dictionary to store the results
    # the classifier is the last element on the components list
    classifier_class = pipeline_components[-1]

    with warnings.catch_warnings():
        # Squash warning messages. Turn this off when debugging!
        warnings.simplefilter('ignore')

        # initialize each of the components in the pipeline,
        # passing in parameters if we have them
        pipeline = []
        for component in pipeline_components:
            if component in pipeline_parameters:
                args = pipeline_parameters[component]
                pipeline.append(component(**args))
            else:
                pipeline.append(component())

        try:
            clf = make_pipeline(*pipeline)  # make the pipeline
            # initialize the cross-validation
            cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=90483257)
            # these are the metrics we are collecting
            scoring = {'accuracy': 'accuracy',
                       'f1_macro': 'f1_macro',
                       'bal_accuracy': make_scorer(balanced_accuracy_score)}
            # perform the cross-validation
            validation = cross_validate(clf, features, labels, cv=cv, scoring=scoring)
            # save average of cross-validation
            avg = map2_dict(lambda k, v: ("avg_{}".format(k), np.mean(v)), validation)
            # save std dev of cross-validation
            stddev = map2_dict(lambda k, v: ("std_{}".format(k), np.std(v)), validation)
        except KeyboardInterrupt:
            sys.exit(1)
        # This is a catch-all to make sure that the evaluation won't crash due to a bad
        # parameter combination or bad data. Turn this off when debugging!
        except Exception as e:
            pass

        # construct parameter string
        param_string = "default"
        if pipeline_parameters != {}:
            param_string = ','.join(['{}={}'.format(parameter, value)
                                     for parameter, value in
                                     pipeline_parameters[classifier_class].items()])

        # add things to the results dictionary
        dict_safe_append(results_dict, 'dataset', dataset)
        dict_safe_append(results_dict, 'classifier', classifier_class.__name__)
        dict_safe_append(results_dict, 'parameters', param_string)

        # merge the avg and stddev dictionaries
        merged = {**avg, **stddev}
        # add everything to the results dictionary
        for key in merged:
            dict_safe_append(results_dict, key, merged[key])

    return results_dict
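A hypothetical invocation of evaluate_model, assuming scikit-learn components (the helpers map2_dict and dict_safe_append come from the surrounding module); the result keys follow cross_validate naming, prefixed with avg_/std_ as above:

from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# hypothetical call: a two-step pipeline with one parameterized component
results = evaluate_model(
    'mushroom',
    pipeline_components=[StandardScaler, DecisionTreeClassifier],
    pipeline_parameters={DecisionTreeClassifier: {'max_depth': 3}})
print(results)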
if __name__ == '__main__':
    results = {
        'problem': [],
        'method': [],
        'score': []
    }

    if len(sys.argv) > 1 and sys.argv[1] == '--skip-train':
        results = pd.read_csv("./data/results.csv")
    else:
        for classification_dataset in classification_dataset_names:
            print("Starting", classification_dataset)
            X, y = fetch_data(classification_dataset, return_X_y=True,
                              local_cache_dir='./data/')
            train_X, test_X, train_y, test_y = train_test_split(X, y)

            rf = RandomForestClassifier()
            lexRF = LexicaseForestClassifier()

            rf.fit(train_X, train_y)
            lexRF.fit(train_X, train_y)

            rf_score = rf.score(test_X, test_y)
            lexRF_score = lexRF.score(test_X, test_y)

            results['problem'] = results['problem'] + ([classification_dataset] * 2)
            results['method'] = results['method'] + ['RF', 'LexRF']
            results['score'].append(rf_score)
            results['score'].append(lexRF_score)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import *

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# dataset = sys.argv[1]
# input_data = pd.read_csv(dataset, compression='gzip', sep='\t')

dataset = []
cnt = 0
for reg_data in regression_dataset_names:
    X, y = fetch_data(reg_data, return_X_y=True, local_cache_dir='../dataset')
    if X.shape[0] > 100 or X.shape[1] > 10:
        continue
    dataset.append(reg_data)
    cnt += 1
print('There are in total %d datasets' % cnt)

hyper_params = [{
    'learning_rate': (0.01, 0.1, 1.0, 10.0,),
    'n_estimators': (10,
# set params
for i in range(1, len(sys.argv), 2):
    t = type(getattr(p, sys.argv[i]))
    if sys.argv[i + 1] == 'True':
        setattr(p, sys.argv[i], t(True))
    elif sys.argv[i + 1] == 'False':
        setattr(p, sys.argv[i], t(False))
    else:
        setattr(p, sys.argv[i], t(sys.argv[i + 1]))

out_name = p._str(p)  # generate random fname str before saving
np.random.seed(p.seed)
random_state = p.seed

data_dir = '/scratch/users/vision/data/pmlb'
dset_name = p.dset_name  # dset_names[p.dset_num]
X, y = pmlb.fetch_data(dset_name, return_X_y=True, local_cache_dir=data_dir)
type_orig = y.dtype
y -= np.min(y)
y = (y / np.max(y)).astype(type_orig)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=random_state)  # defaults to a 0.75 / 0.25 split

num_to_flip = np.max([2, int(X_train.shape[0] * p.flip_frac)])
flipped = np.zeros(X_train.shape[0], dtype=bool)
idxs = np.random.choice(X_train.shape[0], num_to_flip, replace=False)
flipped[idxs] = 1
y_train[idxs] = 1 - y_train[idxs]

num_to_flip = int(X_test.shape[0] * p.flip_frac)
flipped_test = np.zeros(X_test.shape[0], dtype=bool)