from mime import *
import pmlb
import pandas as pd
from sklearn.svm import SVC
import matplotlib.pyplot as plt

x, y = pmlb.fetch_data('spambase', return_X_y=True)
data = pd.DataFrame(x)
explainer = Mime(data, y, categorical=[55, 56])

blackBox = SVC()
blackBox.fit(x[:3000], y[:3000])
print(blackBox.score(x[3000:], y[3000:]))

all_explanations = []
for i in range(100):
    explanation, pred = explainer.explain(x[-1], blackBox.predict)
    all_explanations.append(explanation)
print(all_explanations[:10])

importance_distributions = [list(column) for column in zip(*all_explanations)]
for feature in importance_distributions:
    plt.hist(feature)
    plt.xlim((0, 5))
    plt.show()
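The zip(*...) idiom above transposes the list of per-run explanations into per-feature importance samples. An equivalent NumPy formulation, shown here only as a sketch (it assumes each explanation is a fixed-length importance vector, as in the loop above):

import numpy as np

scores = np.array(all_explanations)  # shape: (n_repeats, n_features)
for j in range(scores.shape[1]):
    plt.hist(scores[:, j])  # distribution of feature j's importance across repeats
    plt.xlim((0, 5))
    plt.show()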
def fit(p):
    out_name = p._str(p)  # generate random fname str before saving
    seed(p.seed)
    s = S_save(p)

    #################################################################### DATA ##############################################################
    # testing data should always be generated with the same seed
    if p.dset == 'gaussian':
        p.n_train = int(p.n_train_over_num_features * p.num_features)
        # warning - this reseeds!
        X_train, y_train, X_test, y_test, s.betastar = \
            data.get_data_train_test(n_train=p.n_train, n_test=p.n_test,
                                     p=p.num_features,
                                     noise_std=p.noise_std,
                                     noise_distr=p.noise_distr, iid=p.iid,
                                     # parameters to be determined
                                     beta_type=p.beta_type,
                                     beta_norm=p.beta_norm,
                                     seed_for_training_data=p.seed,
                                     cov_param=p.cov_param)
    elif p.dset == 'pmlb':
        s.dset_name = data.REGRESSION_DSETS_LARGE_NAMES_RECOGNIZABLE[p.dset_num]
        seed(703858704)
        X, y = pmlb.fetch_data(s.dset_name, return_X_y=True)

        # normalize the data (note: this standardizes each sample, row-wise)
        X = (X - np.mean(X, axis=1).reshape(-1, 1)) / np.std(X, axis=1).reshape(-1, 1)
        y = (y - np.mean(y)) / np.std(y)
        X_train, X_test, y_train, y_test = train_test_split(X, y)  # get test set
        seed(p.seed)
        X_train, y_train = shuffle(X_train, y_train)
        p.num_features = X_train.shape[1]
        p.n_train = int(p.n_train_over_num_features * p.num_features)
        '''
        while p.n_train <= X_train.shape[0]:
            X_train = np.vstack((X_train, 1e-3 * np.random.randn(X_train.shape[0], X_train.shape[1])))
            y_train = np.vstack((y_train, y_train))
        '''
        if p.n_train > X_train.shape[0]:
            print('this value of n too large')
            exit(0)
        elif p.n_train <= 1:
            print('this value of n too small')
            exit(0)
        else:
            X_train = X_train[:p.n_train]
            y_train = y_train[:p.n_train]

    #################################################################### FITTING ##############################################################
    if not p.model_type == 'rf':
        # fit model
        if p.model_type == 'linear_sta':
            s.w = X_train.T @ y_train / X_train.shape[0]
        elif 'mdl' in p.model_type:
            if p.model_type == 'mdl_orig':
                U, sv, Vh = npl.svd(X_train / np.sqrt(p.n_train))
                a = U.T @ y_train  # / (np.sqrt(p.n_train) * p.noise_std)
                a = a[:sv.size]

                def mdl_loss(l):
                    return np.sum(np.square(a) / (1 + np.square(sv) / l) +
                                  np.log(1 + np.square(sv) / l))

                opt_solved = minimize(mdl_loss, x0=1e-10)
                s.lambda_opt = opt_solved.x
                s.loss_val = opt_solved.fun
                inv = npl.pinv(X_train.T @ X_train / p.n_train +
                               s.lambda_opt * np.eye(p.num_features))
                s.w = inv @ X_train.T @ y_train / p.n_train
            elif p.model_type == 'mdl_m1':
                eigenvals, eigenvecs = npl.eig(X_train.T @ X_train)
                if p.dset == 'pmlb' and p.n_train > p.num_features + 1:
                    def estimate_sigma_unbiased():
                        m = LinearRegression(fit_intercept=False)
                        m.fit(X_train, y_train)
                        y_pred = m.predict(X_train)
                        return np.sum(np.square(y_train - y_pred)) / \
                            (p.n_train - p.num_features - 1)

                    p.noise_std = estimate_sigma_unbiased()
                var = p.noise_std ** 2

                def mdl1_loss(l):
                    inv = npl.pinv(X_train.T @ X_train + l * np.eye(p.num_features))
                    thetahat = inv @ X_train.T @ y_train
                    mse_norm = npl.norm(y_train - X_train @ thetahat) ** 2 / (2 * var)
                    theta_norm = npl.norm(thetahat) ** 2 / (2 * var)
                    eigensum = 0.5 * np.sum(np.log((eigenvals + l) / l))
                    return mse_norm + theta_norm + eigensum

                opt_solved = minimize(mdl1_loss, x0=1e-10)
                s.lambda_opt = opt_solved.x
                s.loss_val = opt_solved.fun
                inv = npl.pinv(X_train.T @ X_train +
                               s.lambda_opt * np.eye(p.num_features))
                s.w = inv @ X_train.T @ y_train
        else:
            if p.model_type == 'ols':
                m = LinearRegression(fit_intercept=False)
            elif p.model_type == 'lasso':
                m = Lasso(fit_intercept=False, alpha=p.reg_param)
            elif p.model_type == 'ridge':
                if p.reg_param < 0:
                    if p.reg_param == -1:
                        m = RidgeCV(fit_intercept=False,
                                    alphas=np.logspace(-3, 3, num=10, base=10))
                    else:
                        m = RidgeCV(fit_intercept=False,
                                    alphas=np.logspace(-3, 3, num=10, base=10),
                                    cv=int(-1 * p.reg_param))
                else:
                    m = Ridge(fit_intercept=False, alpha=p.reg_param)
            m.fit(X_train, y_train)
            if p.reg_param < 0 and p.model_type == 'ridge':
                s.lambda_opt = m.alpha_
            s.w = m.coef_

        # save df
        if p.model_type == 'ridge':
            S = X_train @ np.linalg.pinv(X_train.T @ X_train +
                                         p.reg_param * np.eye(X_train.shape[1])) @ X_train.T
            s.df1 = np.trace(S @ S.T)
            s.df2 = np.trace(2 * S - S.T @ S)
            s.df3 = np.trace(S)
        else:
            s.df1 = min(p.n_train, p.num_features)
            s.df2 = s.df1
            s.df3 = s.df1

        # store predictions and things about w
        # s.H_trace = np.trace(H)
        s.wnorm = np.linalg.norm(s.w)
        s.num_nonzero = np.count_nonzero(s.w)
        s.preds_train = X_train @ s.w
        s.preds_test = X_test @ s.w
    elif p.model_type == 'rf':
        rf = RandomForestRegressor(n_estimators=p.num_trees, max_depth=p.max_depth)
        rf.fit(X_train, y_train)
        s.preds_train = rf.predict(X_train)
        s.preds_test = rf.predict(X_test)

    # set things
    s.train_mse = metrics.mean_squared_error(s.preds_train, y_train)
    s.test_mse = metrics.mean_squared_error(s.preds_test, y_test)

    save(out_name, p, s)
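To run the mdl_m1-style λ selection in isolation, here is a self-contained sketch on synthetic data. All names (X, y, lam, rng) are local to the example; it swaps npl.eig for eigvalsh (the Gram matrix is symmetric) and adds a positivity bound on λ, which the unconstrained minimize call above does not enforce:

import numpy as np
import numpy.linalg as npl
from scipy.optimize import minimize

rng = np.random.default_rng(0)
n, d, sigma = 100, 10, 0.5
X = rng.standard_normal((n, d))
y = X @ rng.standard_normal(d) + sigma * rng.standard_normal(n)

eigenvals = npl.eigvalsh(X.T @ X)  # symmetric, so real eigenvalues
var = sigma ** 2

def mdl1_loss(lam):
    # ridge estimate at this lambda, plus the MDL-style complexity terms
    inv = npl.pinv(X.T @ X + lam * np.eye(d))
    thetahat = inv @ X.T @ y
    mse_norm = npl.norm(y - X @ thetahat) ** 2 / (2 * var)
    theta_norm = npl.norm(thetahat) ** 2 / (2 * var)
    eigensum = 0.5 * np.sum(np.log((eigenvals + lam) / lam))
    return mse_norm + theta_norm + eigensum

# bound lambda away from zero so the log term stays defined
opt = minimize(mdl1_loss, x0=np.array([1e-2]),
               bounds=[(1e-8, None)], method='L-BFGS-B')
print('selected lambda:', opt.x, 'loss:', opt.fun)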
def get_run_results(dataset_name, params: Bunch, df=None):
    pd.set_option('display.max_columns', 500)
    pd.set_option('max_colwidth', 1000)
    pd.set_option('display.width', 1000)
    tf.set_random_seed(123)
    np.random.seed(123)
    model_dir = os.path.join('/tmp/robulin', dataset_name)

    # fetch a PMLB dataset as a data-frame
    if df is None:
        df = fetch_data(dataset_name)
    results = run_5_combos(dataset_name, df, model_dir, params)
    perturb = params.perturb_frac
    results_dir = make_sibling_dir(__file__, f'results/pert={perturb}')
    few_fields = [
        'train', 'test', 'loss', 'auc', 'acc', 'wts_ent', 'wts_l1',
        'wts_l1_linf', 'wts_1pct', 'wts_pct1pct', 'av_ent', 'av_high',
        'a_ent', 'g_ent', 'f_a_ent', 'f_g_ent'
    ]
    keys = ['nat_nat', 'nat_per', 'per_nat', 'per_per', 'per_per_all']
    results_few = pd.DataFrame(
        [sub_dict(results[k], few_fields) for k in keys])[few_fields]
    with open(os.path.join(results_dir, 'summary.csv'), 'w+') as fd:
        results_few.to_csv(fd, float_format='%.3f', index=False)
    print(results_few)

    # show nat_nat and per_nat IG attribs
    attr_nat = pd.DataFrame([results['nat_nat']['f_g_dict']]).transpose()
    attr_per = pd.DataFrame([results['per_nat']['f_g_dict']]).transpose()
    attribs: pd.DataFrame = pd.concat([attr_nat, attr_per], axis=1)
    attribs.columns = ['nat', 'adv']
    attribs['feature'] = attribs.index
    print('IG Attribs: nat_nat vs per_nat')
    print(attribs.sort_values(by='nat', ascending=False))
    if platform.system() != 'Linux':
        plot_multi(attribs, 'feature', value='attrib', order_by='nat',
                   var='train mode')

    # show nat_nat and per_nat wts
    wts_nat = pd.DataFrame([results['nat_nat']['wts_dict']]).transpose()
    wts_per = pd.DataFrame([results['per_nat']['wts_dict']]).transpose()
    wts: pd.DataFrame = pd.concat([wts_nat, wts_per], axis=1)
    wts.columns = ['nat', 'adv']
    wts['feature'] = wts.index
    wts['nat_abs'] = abs(wts['nat'])
    print('Wts: nat_nat vs per_nat')
    print(wts.sort_values(by='nat_abs', ascending=False))
    if platform.system() != 'Linux':
        plot_multi(wts, 'feature', value='wt', order_by='nat_abs',
                   ignore=['nat_abs'], var='train mode')

    # save various results
    for k in keys:
        dir = make_sibling_dir(
            __file__, f'results/{dataset_name}/'
            f'pert={perturb}/attrib/{k}')
        ig_dict = pd.DataFrame([results[k]['f_g_dict']]).transpose()
        afvi_dict = pd.DataFrame([results[k]['f_a_dict']]).transpose()
        attribs = pd.concat([ig_dict, afvi_dict], axis=1)
        attribs.columns = ['ig', 'afvi']
        # if k in ['nat_nat', 'per_nat']:
        #     print(f'attrib for {k}:')
        #     print(attribs)
        #     attribs['feature'] = attribs.index
        #     if platform.system() != 'Linux':
        #         plot_multibar(attribs, 'feature', value='attrib', order_by='ig')
        with open(os.path.join(dir, 'attrib.csv'), 'w+') as fd:
            attribs.to_csv(fd)

    tf.logging.info('*** tensorboard cmd:')
    tf.logging.info(f'tensorboard --logdir={model_dir}')
    return results_few
    raise RuntimeError(
        "Cannot use 'tpot_all' and 'use_classic' simultaneously")
if tpot_all and estimator_select:
    raise RuntimeError(
        "Cannot use 'tpot_all' and set 'estimator_select' simultaneously")

print(">> TRAINING TPOT NN EVALUATION MODEL")
print(">> JOB START TIME: {0:.2f}".format(time.time()))
print(">> DATASET: {0}".format(args.dataset))
print(">> USING CLASSIC TPOT: {0}".format(args.use_classic))
print(">> USING TPOT-NN: {0}".format(args.use_nn))
conf_type = 'template' if args.use_template else 'config_dict'
print(">> CONFIGURATION TYPE: {0}".format(conf_type))

X, y = fetch_data(args.dataset, return_X_y=True,
                  local_cache_dir="pmlb_data_cache/")

if conf_type == 'template':
    if tpot_all:
        template_str = 'Selector-Transformer-Estimator'
    elif use_nn:
        if estimator_select == 'lr':
            template_str = 'Selector-Transformer-PytorchLRClassifier'
        elif estimator_select == 'mlp':
            template_str = 'Selector-Transformer-PytorchMLPClassifier'
    else:
        if estimator_select == 'lr':
            template_str = 'Selector-Transformer-LogisticRegression'
        elif estimator_select == 'mlp':
            template_str = 'Selector-Transformer-MLPClassifier'
#!/usr/bin/env python3
from pmlb import fetch_data, dataset_names
from progressbar import ProgressBar, Percentage, Bar, ETA
import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd

adult_X, adult_labels = fetch_data('adult', return_X_y=True)
adult_Xdf = pd.DataFrame(adult_X)

### PCA
from sklearn.decomposition import PCA

# plt.show()

def cca_fig(name, data):
    pca = PCA(n_components=2)
    trans_X = pd.DataFrame(pca.fit_transform(data))
    sb.regplot(x=trans_X[0], y=trans_X[1], fit_reg=False)
    plt.savefig(name + ".png", dpi=400)
    plt.clf()

names = dataset_names
pbar = ProgressBar(widgets=[Percentage(), Bar(right="| "), ETA()],
                   maxval=len(names)).start()
for i, n in enumerate(names):
def make_column_specs(dataset):
    df = fetch_data(dataset)
    return df_column_specs(df)
def fetch_data_Xy(name):
    return fetch_data(name, return_X_y=True,
                      local_cache_dir="~/Isi/pmlb-cache")
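A possible call site for this helper; the dataset name 'iris' is chosen here purely for illustration:

# hypothetical usage: 'iris' is an arbitrary PMLB dataset name
X, y = fetch_data_Xy('iris')
print(X.shape, y.shape)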
        'svc__gamma': np.logspace(-3, 3, 13),
        'svc__C': np.logspace(-7, 5, 13)
    },
    cv=5, n_jobs=-1)

dum = GridSearchCV(
    make_pipeline(StandardScaler(), DummyClassifier()),
    {'dummyclassifier__strategy': ['stratified', 'most_frequent', 'uniform']},
    cv=5, n_jobs=-1)

n_max = 256

for dataset in classification_dataset_names:
    X, y = fetch_data(dataset, True)

    # maximum n_max samples
    if len(y) > n_max:
        S = np.random.permutation(len(y))[:n_max]
        I = np.zeros(len(y))
        I[S] = 1
        I = I > 0
        X = X[I]
        y = y[I]

    pscores = cross_val_score(poly, X, y, cv=5, n_jobs=-1)
    rscores = cross_val_score(rbf, X, y, cv=5, n_jobs=-1)
    dscores = cross_val_score(dum, X, y, cv=5, n_jobs=-1)
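The permutation-plus-boolean-mask subsampling above can be written more directly. A minimal equivalent sketch (same random subset, differing only in row order, assuming np is NumPy as in the snippet):

# draw n_max distinct row indices and index directly,
# instead of building a boolean mask
idx = np.random.permutation(len(y))[:n_max]
X, y = X[idx], y[idx]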
def experiment(dataset, exp_args):
    X, y = fetch_data(dataset, return_X_y=True, local_cache_dir='cache')

    # making features 0 mean and unit variance
    scaler = StandardScaler()
    train_X, test_X, train_y, test_y = train_test_split(X, y)
    scaler.fit(train_X)
    train_X_scaled = scaler.transform(train_X)
    test_X_scaled = scaler.transform(test_X)

    # optionally scale the targets; from a single test, overall performance
    # is worse with scaled targets
    # if exp_args.regression:
    #     scaler_reg = StandardScaler()
    #     # scaler_reg.fit(train_y.reshape(-1, 1))
    #     train_y = scaler_reg.transform(train_y.reshape(-1, 1)).reshape(-1)
    #     test_y = scaler_reg.transform(test_y.reshape(-1, 1)).reshape(-1)

    # don't use adam on the smaller datasets
    if len(train_X) > 1000:
        solver = 'adam'
    else:
        solver = 'lbfgs'

    # max_iters = [300]
    # hidden_layers = [(512,), (1024,), (256, 256)]
    # seeds = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    # scales = [0.25, 1.0]
    # dims = [256]
    max_iters = [args.max_iters]
    scales = [0.25]
    dims = [args.single_dim]

    if exp_args.encoding_type == 'all':
        enc_types = [
            'independent-ssp', 'combined-ssp', 'combined-simplex-ssp',
            'one-hot', 'tile-code', 'pc-gauss', 'pc-gauss-tiled'
        ]
    elif exp_args.encoding_type == 'all-ssp':
        enc_types = ['independent-ssp', 'combined-ssp', 'combined-simplex-ssp']
    elif exp_args.encoding_type == 'all-other':
        enc_types = ['one-hot', 'tile-code', 'pc-gauss', 'pc-gauss-tiled']
    else:
        enc_types = [exp_args.encoding_type]

    if exp_args.debug:
        seeds = [1, 2, 3]
        hidden_layers = [(512,)]
        inter_fname = '{}/debug_enc_{}_results_{}iters_{}.csv'.format(
            exp_args.folder, exp_args.encoding_type, exp_args.max_iters, dataset)
    else:
        seeds = [1, 2, 3, 4, 5, 6, 7, 8, 9]
        if args.less_hidden_layers:
            hidden_layers = [(512, 512), (1024,)]
        else:
            hidden_layers = [(256,), (512,), (1024,), (256, 256),
                             (512, 512), (1024, 1024)]
        inter_fname = '{}/enc_{}_results_{}iters_{}.csv'.format(
            exp_args.folder, exp_args.encoding_type, exp_args.max_iters, dataset)

    # only run if the data does not already exist
    if not os.path.exists(inter_fname):
        # contains all results for this dataset
        df = pd.DataFrame()
        for max_iter in max_iters:
            for hidden_layer_sizes in hidden_layers:
                for seed in seeds:
                    for scale in scales:
                        for dim in dims:
                            for enc_type in enc_types:
                                # train_X_enc = encode_dataset(train_X, dim=dim, seed=seed, scale=scale)
                                # test_X_enc = encode_dataset(test_X, dim=dim, seed=seed, scale=scale)
                                if enc_type == 'independent-ssp':
                                    train_X_enc_scaled = encode_dataset(
                                        train_X_scaled, dim=dim, seed=seed, scale=scale)
                                    test_X_enc_scaled = encode_dataset(
                                        test_X_scaled, dim=dim, seed=seed, scale=scale)
                                    encoding_name = 'SSP Normalized'
                                elif enc_type == 'combined-ssp':
                                    train_X_enc_scaled = encode_dataset_nd(
                                        train_X_scaled, dim=dim, seed=seed,
                                        scale=scale, style='normal')
                                    test_X_enc_scaled = encode_dataset_nd(
                                        test_X_scaled, dim=dim, seed=seed,
                                        scale=scale, style='normal')
                                    encoding_name = 'Combined SSP Normalized'
                                elif enc_type == 'combined-simplex-ssp':
                                    train_X_enc_scaled = encode_dataset_nd(
                                        train_X_scaled, dim=dim, seed=seed,
                                        scale=scale, style='simplex')
                                    test_X_enc_scaled = encode_dataset_nd(
                                        test_X_scaled, dim=dim, seed=seed,
                                        scale=scale, style='simplex')
                                    encoding_name = 'Combined Simplex SSP Normalized'
                                elif enc_type in [
                                        'one-hot', 'tile-code', 'pc-gauss',
                                        'pc-gauss-tiled', 'legendre', 'ssp-proj']:
                                    train_X_enc_scaled = encode_comparison_dataset(
                                        train_X_scaled, encoding=enc_type, seed=seed,
                                        dim=dim, scale=scale, **params)
                                    test_X_enc_scaled = encode_comparison_dataset(
                                        test_X_scaled, encoding=enc_type, seed=seed,
                                        dim=dim, scale=scale, **params)
                                    if enc_type == 'one-hot':
                                        encoding_name = 'One Hot'
                                    elif enc_type == 'tile-code':
                                        encoding_name = 'Tile Coding'
                                    elif enc_type == 'pc-gauss':
                                        encoding_name = 'RBF'
                                    elif enc_type == 'pc-gauss-tiled':
                                        encoding_name = 'RBF Tiled'
                                    elif enc_type == 'legendre':
                                        encoding_name = 'Legendre'
                                    elif enc_type == 'ssp-proj':
                                        encoding_name = 'SSP Projected Axis'
                                else:
                                    raise NotImplementedError(
                                        'unknown encoding type: {}'.format(enc_type))

                                mlp = MLP(
                                    hidden_layer_sizes=hidden_layer_sizes,
                                    activation='relu',
                                    solver=solver,
                                    max_iter=max_iter,
                                    random_state=seed,
                                    early_stopping=True,
                                    validation_fraction=0.1,
                                )
                                mlp.fit(train_X_enc_scaled, train_y)
                                acc = mlp.score(test_X_enc_scaled, test_y)

                                df = df.append(
                                    {
                                        'Dim': dim,
                                        'Seed': seed,
                                        'Scale': scale if 'ssp' in enc_type else 0,
                                        'N-Tiles': exp_args.n_tiles
                                        if enc_type == 'tile-code' else 0,
                                        'Sigma': exp_args.sigma
                                        if ((enc_type == 'pc-gauss') or
                                            (enc_type == 'pc-gauss-tiled')) else 0,
                                        'Encoding': encoding_name,
                                        'Dataset': dataset,
                                        'Model': 'MLP - {}'.format(hidden_layer_sizes),
                                        'Accuracy': acc,
                                        'Solver': solver,
                                        'Max Iter': max_iter,
                                    },
                                    ignore_index=True,
                                )

                                if not args.only_encoding:
                                    mlp = MLP(
                                        hidden_layer_sizes=hidden_layer_sizes,
                                        activation='relu',
                                        solver=solver,
                                        max_iter=max_iter,
                                        random_state=seed,
                                        early_stopping=True,
                                        validation_fraction=0.1,
                                    )
                                    mlp.fit(train_X_scaled, train_y)
                                    acc = mlp.score(test_X_scaled, test_y)

                                    df = df.append(
                                        {
                                            'Dim': 0,
                                            'Seed': seed,
                                            'Scale': 0,
                                            'N-Tiles': 0,
                                            'Sigma': 0,
                                            'Encoding': 'Normalized',
                                            'Dataset': dataset,
                                            'Model': 'MLP - {}'.format(hidden_layer_sizes),
                                            'Accuracy': acc,
                                            'Solver': solver,
                                            'Max Iter': max_iter,
                                        },
                                        ignore_index=True,
                                    )

        # save each dataset individually, in case the run crashes
        # and needs to be restarted
        df.to_csv(inter_fname)

    return dataset
# some continuous features
some_continuous = []
# 10 or fewer continuous features, only continuous
small_continuous = []

# dataset_names = classification_dataset_names
dataset_names = regression_dataset_names
n_datasets = len(dataset_names)

# for i, classification_dataset in enumerate(['banana', 'iris', 'titanic']):
for i, dataset in enumerate(dataset_names):
    print('\x1b[2K\r {} of {}. {}'.format(i + 1, n_datasets, dataset), end="\r")
    df = fetch_data(dataset, return_X_y=False)
    # feat = count_features_type(df.loc[:, df.columns != 'class'])
    feat = count_features_type(df.loc[:, df.columns != 'target'])
    n_binary = feat[0]
    n_integer = feat[1]
    n_float = feat[2]
    # if classification_dataset == 'banana':
    #     print('banana:')
    #     print(feat)
    #     print(df)
    # if classification_dataset == 'titanic':
    #     print('titanic:')
    #     print(feat)
    #     print(df)
    # if classification_dataset == 'iris':
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from pmlb import fetch_data, dataset_names, classification_dataset_names, regression_dataset_names
from operon.sklearn import SymbolicRegressor
import seaborn as sns
import matplotlib.pyplot as plt
from sympy import parse_expr, symbols, lambdify

# fetch data
df = fetch_data('192_vineyard', return_X_y=False, local_cache_dir='./data/')
print(df)
X = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

# split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75,
                                                    test_size=0.25, shuffle=True)

# do a regression
reg = SymbolicRegressor()
reg.fit(X_train, y_train)
# SPDX-License-Identifier: MIT
# SPDX-FileCopyrightText: Copyright 2019-2021 Heal Research

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, make_scorer
from scipy.stats import pearsonr

from operon import RSquared
from operon.sklearn import SymbolicRegressor
from pmlb import fetch_data, dataset_names, classification_dataset_names, regression_dataset_names

# print(regression_dataset_names)

X, y = fetch_data('1027_ESL', return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75,
                                                    test_size=0.25, shuffle=True,
                                                    random_state=1234)

reg = SymbolicRegressor(
    allowed_symbols='add,sub,mul,div,constant,variable',
    offspring_generator='basic',
    local_iterations=10,
    n_threads=4,
    objectives=['r2', 'shape'],
    random_state=1234)
    if d is None:
        d = D
    assert (N >= n and D >= d)
    return arr[:n, :d]

def dot(a, b):
    assert (len(a) == len(b))
    out = []
    for x, y in zip(a, b):
        out += ["{} * {}".format(x, y)]
    return " + ".join(out)

if __name__ == "__main__":
    X, y = fetch_data(sys.argv[1], return_X_y=True)
    # X, y = slice(Xorig, 4, 3), yorig[:3]
    n, d = X.shape
    for i in range(y.size):
        if y[i] == 0:
            y[i] = -1

    # Read in the file
    with open('sgd_temp.c', 'r') as file:
        C = file.read()

    ws = ["{}{}".format("w", i) for i in range(d)]
    WS = ["{}{}".format("W", i) for i in range(d)]
    xis = ["{}{}".format("x", i) for i in range(d)]
    substitutions = {
#!/usr/bin/env python3
import pmlb
import pandas as pd

data = pmlb.fetch_data('iris')
data.to_csv('iris.csv')
        index=[0])
    assert local_cache_dir is not None
    stats_df.to_csv(
        os.path.join(local_cache_dir, dataset_name, 'summary_stats.csv'))

if __name__ == '__main__':
    # assuming this is run from the repo root directory
    local_dir = 'datasets/'
    overwrite = True
    for d in classification_dataset_names:
        print(d, '...')
        df = fetch_data(d, local_cache_dir=local_dir)
        generate_description(df, d, 'classification',
                             overwrite_existing=overwrite,
                             local_cache_dir=local_dir)
        generate_summarystats(df, d, 'classification',
                              local_cache_dir=local_dir)
    for d in regression_dataset_names:
        print(d, '...')
        df = fetch_data(d, local_cache_dir=local_dir)
        generate_description(df, d, 'regression',
def benchmark(config='', dmin=5, dmax=6):
    from pmlb import fetch_data, classification_dataset_names
    from sdv.evaluation import evaluate

    for classification_dataset in classification_dataset_names[dmin:dmax]:
        X, y = fetch_data(classification_dataset, return_X_y=True)
        X_train_full, X_test, y_train_full, y_test = train_test_split(
            X, y, test_size=0.05, random_state=2021)
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_train_full, y_train_full, random_state=2021)

        def post_process_fun(y):
            return int(y)

        def pre_process_fun(y):
            return int(y)

        # y = y.astype('uint8')
        num_classes = len(np.unique(y))
        print(np.unique(y))

        model_pars = {
            'model_pars': {
                'original_dim': X.shape[1],
                'class_num': num_classes,
                'intermediate_dim': 64,
                'intermediate_dim_2': 16,
                'latent_dim': 3,
                'Lambda1': 1,
                'batch_size': 256,
                'Lambda2': 200,
                'Alpha': 0.075
            },
            'post_process_fun': post_process_fun,  ### After prediction
            'pre_process_pars': {
                'y_norm_fun': pre_process_fun,  ### Before training
                ### Pipeline for data processing
                'pipe_list': [
                    #### coly target processing
                    {'uri': 'source/prepro.py::pd_coly', 'pars': {},
                     'cols_family': 'coly', 'cols_out': 'coly', 'type': 'coly'},
                    {'uri': 'source/prepro.py::pd_colnum_bin', 'pars': {},
                     'cols_family': 'colnum', 'cols_out': 'colnum_bin', 'type': ''},
                    {'uri': 'source/prepro.py::pd_colcat_bin', 'pars': {},
                     'cols_family': 'colcat', 'cols_out': 'colcat_bin', 'type': ''},
                ],
            }
        }

        log(f'{classification_dataset} Metrics: ------------')
        column = [f'col_{i}' for i in range(X.shape[1])]
        real_df = pd.DataFrame(X_test, columns=column)

        ##### VAEMDN
        vae, vae_enc, vae_dec = VAEMDN(model_pars=model_pars['model_pars'])
        vae.fit([X_train_full, y_train_full], epochs=50)
        vae_data = vae.predict([X_test, y_test])
        vae_df = pd.DataFrame(vae_data, columns=column)
        evl_vae = evaluate(real_df, vae_df,
                           metrics=['LogisticDetection', 'CSTest', 'KSTest'])
        log(f'Evaluation on VAE: {evl_vae}')

        log("##### AE")
        basic_ae, ae_enc, ae_dec = AUTOENCODER_BASIC(X.shape[1])
        basic_ae.fit(X_train_full, X_train_full, epochs=50)
        basic_data = basic_ae.predict(X_test)
        basic_df = pd.DataFrame(basic_data, columns=column)
        evl_ae = evaluate(real_df, basic_df,
                          metrics=['LogisticDetection', 'CSTest', 'KSTest'])
        log(f'Evaluation on Basic_AE: {evl_ae}')
from pmlb import fetch_data
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.metrics import classification_report
import numpy as np
import random
import matplotlib.pyplot as plt

# Choose data set.
adult_data, adult_labels = fetch_data('adult', return_X_y=True,
                                      local_cache_dir='./')
print(adult_data.shape, adult_labels.shape)

# Algorithms to be used
logreg = LogisticRegression(solver='lbfgs')
gaussNB = GaussianNB()
sgd = SGDClassifier(loss="hinge", penalty="l2", max_iter=5)
linear = LinearRegression()
rfc = RandomForestClassifier(n_estimators=200)

# Columns used to create predictions
feature_columns = ['age', 'workclass', 'education', 'education-num',
                   'marital-status', 'occupation', 'relationship', 'race',
                   'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
                   'native-country']
if __name__ == '__main__':
    penn_data = Path('./datasets.csv')
    dataset = []
    if penn_data.is_file():
        df = pd.read_csv(penn_data)
        dataset = df['dataset_names'].values
    else:
        print('Please create nonempty csv-file with datasets')
    if len(dataset) == 0:
        dataset = classification_dataset_names + regression_dataset_names

    for name_of_dataset in dataset:
        pmlb_data = fetch_data(name_of_dataset)
        num_classes, _ = imbalance_metrics(pmlb_data['target'].tolist())
        problem_class, metric_names = _problem_and_metric_for_dataset(
            name_of_dataset, num_classes)
        if not problem_class or not metric_names:
            print('Incorrect dataset')
            continue
        train_file, test_file = get_penn_case_data_paths(name_of_dataset)
        config_models_data = get_models_hyperparameters()
        case_name = f'penn_ml_{name_of_dataset}'

        try:
            result_metrics = CaseExecutor(params=ExecutionParams(
                train_file=train_file,
                test_file=test_file,
def evaluate_model(dataset, pipeline_components, pipeline_parameters, resultdir="."):
    input_data = fetch_data(dataset)
    features = input_data.drop('target', axis=1).values.astype(float)
    labels = input_data['target'].values

    # pipelines = [dict(zip(pipeline_parameters.keys(), list(parameter_combination)))
    #              for parameter_combination in itertools.product(*pipeline_parameters.values())]
    # pipelines = pipeline_parameters

    results_dict = {}
    classifier_class = pipeline_components[-1]
    # tmpfn = '{}/tmp--{}--{}.pkl'.format(resultdir, dataset, classifier_class.__name__)
    # Path(tmpfn).touch()

    with warnings.catch_warnings():
        # Squash warning messages. Turn this off when debugging!
        warnings.simplefilter('ignore')

        # for pipe_parameters in pipelines:
        pipeline = []
        for component in pipeline_components:
            # if component in pipe_parameters:
            if component in pipeline_parameters:
                args = pipeline_parameters[component]
                pipeline.append(component(**args))
            else:
                pipeline.append(component())

        try:
            clf = make_pipeline(*pipeline)
            cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=90483257)
            scoring = {
                'accuracy': 'accuracy',
                'f1_macro': 'f1_macro',
                'bal_accuracy': make_scorer(balanced_accuracy_score)
            }
            validation = cross_validate(clf, features, labels, cv=cv, scoring=scoring)
            avg = map2_dict(lambda k, v: ("avg_{}".format(k), np.mean(v)), validation)
            stddev = map2_dict(lambda k, v: ("std_{}".format(k), np.std(v)), validation)
            # balanced_accuracy = balanced_accuracy_score(labels, cv_predictions)
        except KeyboardInterrupt:
            sys.exit(1)
        # This is a catch-all to make sure that the evaluation won't crash due to a bad
        # parameter combination or bad data. Turn this off when debugging!
        # except Exception as e:
        #     continue

        param_string = "default"
        if pipeline_parameters != {}:
            param_string = ','.join([
                '{}={}'.format(parameter, value)
                for parameter, value in pipeline_parameters[classifier_class].items()
            ])

        dict_safe_append(results_dict, 'dataset', dataset)
        dict_safe_append(results_dict, 'classifier', classifier_class.__name__)
        dict_safe_append(results_dict, 'parameters', param_string)

        merged = merge_dicts(avg, stddev)
        for key in merged.keys():
            dict_safe_append(results_dict, key, merged[key])

        # out_text = '\t'.join(map_dict(lambda v: str(v[-1]), results_dict).values())
        # print(out_text, flush=True)
        # pd.DataFrame(results_dict).to_pickle(tmpfn)
        # os.remove(tmpfn)

    # final_fn = '{}/final--{}--{}.pkl'.format(resultdir, dataset, classifier_class.__name__)
    # pd.DataFrame(results_dict).to_pickle(final_fn)
    return results_dict
## https://github.com/EpistasisLab/penn-ml-benchmarks
## pip install pmlb
import numpy as np
from pmlb import fetch_data
from pmlb import dataset_names

x = np.zeros(len(dataset_names))
for i, dn in enumerate(dataset_names):
    d = fetch_data(dn)
    n = d.describe()["class"]["count"]
    x[i] = n
    print(str(n) + " " + str(dn))

x.min()
np.percentile(x, 50)
np.percentile(x, 80)
np.percentile(x, 90)
x.max()

# In [6]: x.min()
# Out[6]: 32.0
#
# In [7]: np.percentile(x, 50)
# Out[7]: 690.0
#
# In [8]: np.percentile(x, 80)
# Out[8]: 3772.0
#
# In [9]: np.percentile(x, 90)
def test_fetch_data_1():
    """Test fetch_data can fetch data from GitHub."""
    mushroom = fetch_data('mushroom')
    assert not mushroom.empty
    assert not mushroom.isnull().values.any()
def load(dataset="monk3", multi="normal"):
    """
    Returns X (features) and y (classes) for a dataset.

    The dataset can be from PMLB, OpenML100, a .dat file in the data/
    directory, or one of the synthetic examples from Figure 1.

    Args:
        dataset (str): name of the dataset
        multi (str): mode for processing multi-class problems
            There are three valid choices:
            - "normal": return multi-class problems normally
            - "small": convert multi-class problem into a
              smallest class against all problem
            - "large": convert multi-class problem into a
              largest class against all problem

    Returns:
        X (np.array): features of the data points
        y (np.array): classes for the data points
    """
    try:
        # PMLB does not provide a simple way to check if a dataset is available.
        # Just attempt to load, and continue through list of datasets if not found.
        pathlib.Path(".pmlb").mkdir(parents=True, exist_ok=True)
        X, y = pmlb.fetch_data(dataset, return_X_y=True, local_cache_dir=".pmlb")
        isPMLB = True
    except ValueError:
        isPMLB = False

    if isPMLB:
        # PMLB data already loaded
        pass
    elif dataset in map(str, openml.study.get_study("OpenML100", "tasks").tasks):
        task = openml.tasks.get_task(dataset)
        X, y = task.get_X_and_y()
        X = X[:, sum(np.isnan(X)) == 0]
    elif os.path.isfile("data/" + dataset + ".dat"):
        # Datasets not in PMLB or OpenML (load from file)
        X = np.genfromtxt("data/" + dataset + ".dat", delimiter=",")
        X, y = X[:, :-1], X[:, -1].astype(np.int64)
    elif dataset == "easy":
        # Synthetic example for Figure 1a
        r1 = 225
        r2 = 25
        b = 250
        X = np.concatenate([
            np.concatenate([0.10 * np.random.rand(r1, 1) - 1.0,
                            2 * np.random.rand(r1, 1) - 1], axis=1),
            np.concatenate([0.25 * np.random.rand(r2, 1) - 0.2,
                            2 * np.random.rand(r2, 1) - 1], axis=1),
            np.concatenate([0.10 * np.random.rand(b, 1) + 0.0,
                            2 * np.random.rand(b, 1) - 1], axis=1)
        ])
        y = np.array((r1 + r2) * [1] + b * [0])
    elif dataset == "imbalance":
        # Synthetic example for Figure 1b
        r = 20
        b = 480
        X = np.concatenate([
            np.concatenate([1.05 * np.random.rand(r, 1) - 1.0,
                            2 * np.random.rand(r, 1) - 1], axis=1),
            np.concatenate([1.00 * np.random.rand(b, 1) + 0.0,
                            2 * np.random.rand(b, 1) - 1], axis=1)
        ])
        y = np.array(r * [1] + b * [0])
    elif dataset == "imbalance+outlier":
        # Synthetic example for Figure 1c
        r = 20
        b = 480
        X = np.concatenate([
            np.concatenate([1 * np.random.rand(r, 1) - 1.0,
                            2 * np.random.rand(r, 1) - 1], axis=1),
            np.concatenate([1 * np.random.rand(b, 1) + 0.0,
                            2 * np.random.rand(b, 1) - 1], axis=1),
            np.array([[-1.0, 0.0]])
        ])
        y = np.array(r * [1] + b * [0] + [0])
    elif dataset == "overlap":
        # Synthetic example for Figure 1d
        r = 250
        b = 250
        X = np.concatenate([
            np.concatenate([1 * np.random.rand(r, 1) - 1,
                            2 * np.random.rand(r, 1) - 1], axis=1),
            np.concatenate([2 * np.random.rand(b, 1) - 1,
                            2 * np.random.rand(b, 1) - 1], axis=1)
        ])
        y = np.array(r * [1] + b * [0])
    else:
        raise ValueError("Dataset " + dataset + " is not recognized.")

    # Map classes down to 0 to (number_of_classes - 1)
    unique = np.unique(y)
    new = {old: new for (new, old) in enumerate(unique)}
    y = np.array([new[i] for i in y])

    if multi == "normal":
        # Treat multi-class problems normally
        pass
    else:
        # Convert multi-class problems to binary
        count = np.bincount(y)
        if multi == "small":
            # Smallest class against all
            count = -count
        elif multi == "large":
            # Largest class against all
            pass
        else:
            raise ValueError("Multi-class setting \"" + multi + "\" not recognized.")
        ind = np.argmax(count)
        y = (y == ind).astype(int)

    return X, y
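A possible call site for load; 'monk3' matches the default above, and the "small" mode is shown purely for illustration:

# hypothetical usage: binarize a PMLB dataset by its
# smallest class against the rest
X, y = load("monk3", multi="small")
print(X.shape, np.bincount(y))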
def test_fetch_data_2():
    """Test fetch_data can fetch data from local cache."""
    mushroom = fetch_data('mushroom', local_cache_dir="datasets/")
    assert not mushroom.empty
    def _more_tags(self):
        return {'non_deterministic': True, 'binary_only': True}

TEST_SKLEARN = False
TEST_PYTORCH = True

if __name__ == "__main__":
    import warnings
    warnings.filterwarnings("ignore", category=ConvergenceWarning)

    # Good binary classification dataset with floating features and appx. equal
    # class balance. Very high accuracy attainable using LR (>0.99 accuracy)
    X, y = fetch_data('clean2', return_X_y=True)

    if True:
        # first two features are IDs for the molecule!
        # The decision function will just learn to look at these...
        X = X[:, 2:]

    X_train, X_test, y_train, y_test = train_test_split(X, y)

    if TEST_SKLEARN:
        clf_sklearn = LogisticRegression(penalty='l2', solver='sag', max_iter=1000)
        clf_sklearn.fit(X_train, y_train)
        print("SKLEARN ACCURACY: {0:.3f}".format(
            clf_sklearn.score(X_test, y_test)))
        # print(clf_sklearn.coef_)
def test_fetch_data_6():
    """Test fetch_data can fetch data from GitHub with return_X_y."""
    X, y = fetch_data('mushroom', return_X_y=True)
    assert isinstance(X, np.ndarray)
    assert isinstance(y, np.ndarray)
for name in classifier_names:
    for pre in preprocs:
        test_scores[name + pre] = []
        combination_names.append(name + pre)

for i, classification_dataset in enumerate(datasets):
    # temporarily skipping the bigger datasets to save time prototyping
    if (classification_dataset == 'shuttle') or (classification_dataset == 'magic'):
        continue
    print('\x1b[2K\r {} of {}. {}'.format(i + 1, n_datasets, classification_dataset),
          end="\r")
    X, y = fetch_data(classification_dataset, return_X_y=True)

    # making features 0 mean and unit variance
    scaler = StandardScaler()
    train_X, test_X, train_y, test_y = train_test_split(X, y)
    scaler.fit(train_X)
    train_X_scaled = scaler.transform(train_X)
    test_X_scaled = scaler.transform(test_X)

    train_X_enc = encode_dataset(train_X, dim=256, seed=13, scale=1.0)
    test_X_enc = encode_dataset(test_X, dim=256, seed=13, scale=1.0)
    train_X_enc_scaled = encode_dataset(train_X_scaled,
#!/usr/bin/env python3
import pmlb
import pandas as pd

data = pmlb.fetch_data('yeast')
data.to_csv('yeast.csv')
def evaluate_model(dataset, pipeline_components, pipeline_parameters, resultdir="."):
    '''dataset: str, pipeline_components: List[Object],
    pipeline_parameters: Dict[Object, Dict[str, Any]]'''
    # download dataset from PMLB
    input_data = fetch_data(dataset)
    # separate features and labels
    features = input_data.drop('target', axis=1).values.astype(float)
    labels = input_data['target'].values

    results_dict = {}  # initialize a dictionary to store the results
    # the classifier is the last element on the components list
    classifier_class = pipeline_components[-1]

    with warnings.catch_warnings():
        # Squash warning messages. Turn this off when debugging!
        warnings.simplefilter('ignore')

        # initialize each of the components in the pipeline,
        # passing in parameters if we have them
        pipeline = []
        for component in pipeline_components:
            if component in pipeline_parameters:
                args = pipeline_parameters[component]
                pipeline.append(component(**args))
            else:
                pipeline.append(component())

        try:
            clf = make_pipeline(*pipeline)  # make the pipeline
            # initialize the cross-validation
            cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=90483257)
            # these are the metrics we are collecting
            scoring = {'accuracy': 'accuracy',
                       'f1_macro': 'f1_macro',
                       'bal_accuracy': make_scorer(balanced_accuracy_score)}
            # perform the cross-validation
            validation = cross_validate(clf, features, labels, cv=cv, scoring=scoring)
            # save average of cross-validation
            avg = map2_dict(lambda k, v: ("avg_{}".format(k), np.mean(v)), validation)
            # save std dev of cross-validation
            stddev = map2_dict(lambda k, v: ("std_{}".format(k), np.std(v)), validation)
        except KeyboardInterrupt:
            sys.exit(1)
        # This is a catch-all to make sure that the evaluation won't crash due to a bad
        # parameter combination or bad data. Turn this off when debugging!
        except Exception as e:
            pass

        # construct parameter string
        param_string = "default"
        if pipeline_parameters != {}:
            param_string = ','.join(['{}={}'.format(parameter, value)
                                     for parameter, value in
                                     pipeline_parameters[classifier_class].items()])

        # add things to the results dictionary
        dict_safe_append(results_dict, 'dataset', dataset)
        dict_safe_append(results_dict, 'classifier', classifier_class.__name__)
        dict_safe_append(results_dict, 'parameters', param_string)

        # merge the avg and stddev dictionaries
        merged = {**avg, **stddev}
        # add everything to the results dictionary
        for key in merged:
            dict_safe_append(results_dict, key, merged[key])

    return results_dict
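A hypothetical invocation of evaluate_model, assuming scikit-learn components (the helpers map2_dict and dict_safe_append come from the surrounding module); the result keys follow cross_validate naming, prefixed with avg_/std_ as above:

from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# hypothetical call: a two-step pipeline with one parameterized component
results = evaluate_model(
    'mushroom',
    pipeline_components=[StandardScaler, DecisionTreeClassifier],
    pipeline_parameters={DecisionTreeClassifier: {'max_depth': 3}})
print(results)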
if __name__ == '__main__':
    results = {
        'problem': [],
        'method': [],
        'score': []
    }

    if len(sys.argv) > 1 and sys.argv[1] == '--skip-train':
        results = pd.read_csv("./data/results.csv")
    else:
        for classification_dataset in classification_dataset_names:
            print("Starting", classification_dataset)
            X, y = fetch_data(classification_dataset, return_X_y=True,
                              local_cache_dir='./data/')
            train_X, test_X, train_y, test_y = train_test_split(X, y)

            rf = RandomForestClassifier()
            lexRF = LexicaseForestClassifier()

            rf.fit(train_X, train_y)
            lexRF.fit(train_X, train_y)

            rf_score = rf.score(test_X, test_y)
            lexRF_score = lexRF.score(test_X, test_y)

            results['problem'] = results['problem'] + ([classification_dataset] * 2)
            results['method'] = results['method'] + ['RF', 'LexRF']
            results['score'].append(rf_score)
            results['score'].append(lexRF_score)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import *

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# dataset = sys.argv[1]
# input_data = pd.read_csv(dataset, compression='gzip', sep='\t')

dataset = []
cnt = 0
for reg_data in regression_dataset_names:
    X, y = fetch_data(reg_data, return_X_y=True, local_cache_dir='../dataset')
    if X.shape[0] > 100 or X.shape[1] > 10:
        continue
    dataset.append(reg_data)
    cnt += 1
print('There are in total %d datasets' % cnt)

hyper_params = [{
    'learning_rate': (0.01, 0.1, 1.0, 10.0,),
    'n_estimators': (10,
# set params
for i in range(1, len(sys.argv), 2):
    t = type(getattr(p, sys.argv[i]))
    if sys.argv[i + 1] == 'True':
        setattr(p, sys.argv[i], t(True))
    elif sys.argv[i + 1] == 'False':
        setattr(p, sys.argv[i], t(False))
    else:
        setattr(p, sys.argv[i], t(sys.argv[i + 1]))

out_name = p._str(p)  # generate random fname str before saving
np.random.seed(p.seed)
random_state = p.seed

data_dir = '/scratch/users/vision/data/pmlb'
dset_name = p.dset_name  # dset_names[p.dset_num]
X, y = pmlb.fetch_data(dset_name, return_X_y=True, local_cache_dir=data_dir)
type_orig = y.dtype
y -= np.min(y)
y = (y / np.max(y)).astype(type_orig)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=random_state)  # defaults to a 0.75 / 0.25 split

num_to_flip = np.max([2, int(X_train.shape[0] * p.flip_frac)])
flipped = np.zeros(X_train.shape[0], dtype=bool)
idxs = np.random.choice(X_train.shape[0], num_to_flip, replace=False)
flipped[idxs] = 1
y_train[idxs] = 1 - y_train[idxs]

num_to_flip = int(X_test.shape[0] * p.flip_frac)
flipped_test = np.zeros(X_test.shape[0], dtype=bool)