Example #1
    """The function to predict."""
    return 1/3*x * np.sin(x)

# ----------------------------------------------------------------------
#  First the noiseless case
# X = np.atleast_2d([1., 3., 6., 8.]).T
X = np.atleast_2d([1, 3., 5, 8, 9, 10, 11, 12]).T
# Observations
y = f(X).ravel()

# Mesh the input space for evaluations of the real function, the prediction and
# its MSE
x = np.atleast_2d(np.linspace(0, 13, 1000)).T

# Instantiate a Gaussian Process model
kernel = C(1.0, (1e-3, 1e3)) * RBF(1, (1e-2, 1e2))
gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9)

# Fit to data using Maximum Likelihood Estimation of the parameters
gp.fit(X, y)

# Make the prediction on the meshed x-axis (ask for MSE as well)
y_pred, sigma = gp.predict(x, return_std=True)

# Plot the function, the prediction and the 95% confidence interval based on
# the MSE
plt.figure()
plt.plot(x, f(x), 'r:', label=u'Objective function')
plt.plot(X, y, 'r.', markersize=10, label=u'Observations')
plt.plot(x, y_pred, 'b-', label=u'Prediction')
# (continuation of the truncated excerpt: shade the 95% interval and finish the figure)
plt.fill(np.concatenate([x, x[::-1]]),
         np.concatenate([y_pred - 1.9600 * sigma,
                        (y_pred + 1.9600 * sigma)[::-1]]),
         alpha=.5, fc='b', ec='None', label='95% confidence interval')
plt.xlabel('$x$')
plt.ylabel('$f(x)$')
plt.legend(loc='upper left')
plt.show()
Example #2
def test_no_optimizer():
    # Test that kernel parameters are unmodified when optimizer is None.
    kernel = RBF(1.0)
    gpr = GaussianProcessRegressor(kernel=kernel, optimizer=None).fit(X, y)
    assert np.exp(gpr.kernel_.theta) == 1.0
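For contrast, a small illustrative check (not part of the original test) shows that with the default optimizer the RBF length scale is generally re-estimated, so kernel_.theta no longer matches its initial value:

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

# Hypothetical toy data, only for illustration.
rng = np.random.RandomState(0)
X_demo = rng.uniform(0, 5, (30, 1))
y_demo = np.sin(X_demo).ravel() + 0.1 * rng.randn(30)

gpr_opt = GaussianProcessRegressor(kernel=RBF(1.0)).fit(X_demo, y_demo)
print(np.exp(gpr_opt.kernel_.theta))  # typically no longer equal to 1.0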
Example #3
def timeseries_smooth(adata,
                      genes='none',
                      gene_symbols='none',
                      key='louvain',
                      groups='all',
                      style='-b',
                      n_restarts_optimizer=10,
                      likelihood_landscape=False,
                      normalize_y=False,
                      noise_level=0.5,
                      noise_level_bounds=(1e-2, 1e+1),
                      length_scale=1,
                      length_scale_bounds=(1e-2, 1e+1),
                      save='none',
                      title='long'):
    """
    Plot a timeseries of some genes in pseudotime
    
    Keyword arguments:
    adata -- anndata object
    genes -- list of genes. If 'none', the first 5 genes are plotted
    gene_symbols -- variable annotation. If 'none', the index is used
    key -- observation annotation. 
    groups -- basically branches, chosen from the annotations in key
    style -- line plotting style
    """

    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import matplotlib.colors as colors
    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C, WhiteKernel

    # select one branch
    if groups != 'all':
        adata_selected = adata[np.in1d(adata.obs[key], groups)]
    else:
        adata_selected = adata

    # select genes
    if genes == 'none':

        # no genes specified, we just use the first 5
        genes = adata_selected.var_names.values[0:5]
        m = {'Mapped': genes, 'Original': genes}

    elif gene_symbols != 'none':

        # a gene annotation is used, we map the gene names
        mapping_table = pd.DataFrame(adata_selected.var[gene_symbols])
        name_mapping = mapping_table.set_index(gene_symbols)
        name_mapping['Ensembl'] = mapping_table.index
        genes_mapped = name_mapping.loc[genes, :]

        # save in dict
        m = {'Mapped': genes_mapped['Ensembl'], 'Original': genes}
    else:
        m = {'Mapped': genes, 'Original': genes}

    # construct a look up table
    gene_table = pd.DataFrame(data=m)

    # extract the pseudotime
    time = adata_selected.obs['dpt_pseudotime']

    # construct a data frame which has time as index
    exp_data = pd.DataFrame(data=adata_selected[:, gene_table['Mapped']].X,
                            index=time,
                            columns=[gene_table['Original'].values])

    # sort according to pseudotime
    exp_data.sort_index(inplace=True)

    # remove the last entry
    (m, n) = exp_data.shape
    exp_data = exp_data.iloc[:m - 1, :]

    # loop counter
    i = 0

    # loop over all genes we wish to plot
    for index, row in gene_table.iterrows():

        # select data
        data_selected = exp_data.loc[:, row['Original']].reset_index()

        # create the inputs: pseudotime values as a column vector
        X = np.atleast_2d(data_selected['dpt_pseudotime'].values).T

        # create the targets
        y = data_selected[row['Original']].values.ravel()

        # Mesh the input space for evaluations of the prediction and
        # its MSE
        x = np.atleast_2d(np.linspace(0, 1, 1000)).T

        # Instantiate a Gaussian process model. We use a sum of two kernels here,
        # which allows us to estimate the noise level via optimisation of the
        # marginal likelihood as well
        kernel = 1.0 * RBF(length_scale=length_scale, length_scale_bounds=length_scale_bounds) \
            + WhiteKernel(noise_level=noise_level, noise_level_bounds=noise_level_bounds)
        gp = GaussianProcessRegressor(
            kernel=kernel,
            alpha=0.0,
            n_restarts_optimizer=n_restarts_optimizer,
            normalize_y=normalize_y).fit(X, y)

        # obtain a prediction from this model. Also return the covariance matrix, so we can calculate
        # confidence intervals
        y_mean, y_cov = gp.predict(x, return_cov=True)

        # plot current genes
        plt.figure(num=i, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
        plt.plot(x, y_mean, 'k', lw=3, zorder=9, label='Prediction')
        plt.fill_between(x.ravel(),
                         y_mean - np.sqrt(np.diag(y_cov)),
                         y_mean + np.sqrt(np.diag(y_cov)),
                         alpha=0.5,
                         color='k')
        plt.scatter(X,
                    y,
                    c='r',
                    s=50,
                    zorder=10,
                    edgecolors=(0, 0, 0),
                    label='Observation')
        if title == 'long':
            plt.title(
                "Gene: %s\nInitial: %s\nOptimum: %s\nLog-Marginal-Likelihood: %s"
                % (row['Original'], kernel, gp.kernel_,
                   gp.log_marginal_likelihood(gp.kernel_.theta)))
        else:
            plt.title("Gene: %s" % (row['Original']))
        plt.xlabel('$t_{pseudo}$')
        plt.ylabel('Expression')
        plt.legend(loc='upper left')
        if save != 'none':
            plt.savefig(save + row['Original'] + '_dynamics.pdf')

        if likelihood_landscape:

            # Plot LML landscape
            i += 1
            plt.figure(num=i,
                       figsize=(8, 6),
                       dpi=80,
                       facecolor='w',
                       edgecolor='k')
            theta0 = np.logspace(-2, 3, 49)  # length scale
            theta1 = np.logspace(-1.5, 0, 50)  # Noise level
            Theta0, Theta1 = np.meshgrid(theta0, theta1)
            LML = [[
                gp.log_marginal_likelihood(
                    np.log([0.36, Theta0[i, j], Theta1[i, j]]))
                for i in range(Theta0.shape[0])
            ] for j in range(Theta0.shape[1])]
            LML = np.array(LML).T

            vmin, vmax = (-LML).min(), (-LML).max()
            #vmax = 50
            level = np.around(np.logspace(np.log10(vmin), np.log10(vmax), 50),
                              decimals=1)
            plt.contour(Theta0,
                        Theta1,
                        -LML,
                        levels=level,
                        norm=colors.LogNorm(vmin=vmin, vmax=vmax))
            plt.colorbar()
            plt.xscale("log")
            plt.yscale("log")
            plt.xlabel("Length-scale")
            plt.ylabel("Noise-level")
            plt.title("Log-marginal-likelihood")
            #plt.tight_layout()
            plt.show()

        # increase loop counter
        i += 1
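A hypothetical usage sketch (gene names and branch labels below are illustrative, not from the original source). It assumes an AnnData object on which diffusion pseudotime and louvain clustering have already been computed, so that adata.obs['dpt_pseudotime'] and adata.obs['louvain'] exist:

# Illustrative call only; `adata` is assumed to carry 'dpt_pseudotime' and
# 'louvain' observation columns (e.g. from scanpy's sc.tl.dpt / sc.tl.louvain).
timeseries_smooth(adata,
                  genes=['Gata1', 'Klf1'],   # hypothetical gene names
                  key='louvain',
                  groups=['0', '2'],         # hypothetical branch labels
                  likelihood_landscape=True,
                  save='none',
                  title='short')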
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", type=str, default='Pendulum-v0')
    args = parser.parse_args()

    print(args)

    no_data_points = 100
    no_samples = 50
    env = gym.make(args.environment)
    #env.render(mode='human')

    states = []
    actions = []
    rewards = []
    next_states = []
    data_points = 0
    # Collect transitions with a uniformly random policy until we have enough data.
    while data_points < no_data_points:
        state = env.reset()

        while True:
            action = np.random.uniform(low=env.action_space.low,
                                       high=env.action_space.high)
            next_state, reward, done, _ = env.step(action)
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            next_states.append(next_state)
            data_points += 1

            state = next_state.copy()

            if done:
                break

    X_train = np.concatenate(
        [np.stack(states, axis=0),
         np.stack(actions, axis=0)], axis=-1)
    y_train = np.stack(next_states, axis=0)
    print(X_train.shape)
    print(y_train.shape)
    '''
    # Kernel with optimized parameters
    k1 = 50.0**2 * RBF(length_scale=50.0)  # long term smooth rising trend
    k2 = 2.0**2 * RBF(length_scale=100.0) \
        * ExpSineSquared(length_scale=1.0, periodicity=1.0,
                         periodicity_bounds="fixed")  # seasonal component
    # medium term irregularities
    k3 = 0.5**2 * RationalQuadratic(length_scale=1.0, alpha=1.0)
    k4 = 0.1**2 * RBF(length_scale=0.1) \
        + WhiteKernel(noise_level=0.1**2,
                      noise_level_bounds=(1e-3, np.inf))  # noise terms
    #kernel = k1 + k2 + k3 + k4
    kernel = k1 + k3 + k4
    '''
    #kernel = Matern(nu=.5) + RBF()
    kernel = RBF() + RationalQuadratic() + WhiteKernel()

    gp = GaussianProcessRegressor(kernel=kernel).fit(X_train, y_train)

    # Evaluate the learned GP dynamics model on fresh random rollouts.
    while True:
        state = env.reset()
        states = []
        actions = []
        rewards = []
        next_states = []
        while True:
            action = np.random.uniform(low=env.action_space.low,
                                       high=env.action_space.high)
            next_state, reward, done, _ = env.step(action)
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            next_states.append(next_state)

            state = next_state.copy()

            if done:
                break

        X_test = np.concatenate(
            [np.stack(states, axis=0),
             np.stack(actions, axis=0)], axis=-1)
        y_test = np.stack(next_states, axis=0)

        y_mean, y_cov = gp.predict(X_test, return_cov=True)

        for i in range(y_test.shape[-1]):
            plt.figure()
            plt.plot(np.arange(len(y_test[:, i])), y_test[:, i])

            y = y_mean[:, i]
            error = np.sqrt(np.diag(y_cov))

            plt.plot(np.arange(len(y)), y)
            plt.fill_between(np.arange(len(y)),
                             y + error,
                             y - error,
                             alpha=.4,
                             color='C1')

            plt.grid()
        plt.show()
        exit()
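Not part of the original script: a minimal sketch of how the fitted GP dynamics model could be rolled forward from a fresh initial state by feeding its own predictions back in (it assumes gp and env as defined above, and the old gym API used in this example):

# Illustrative multi-step rollout with the learned GP dynamics model.
state = env.reset()
predicted_states = []
for _ in range(50):
    action = env.action_space.sample()
    xa = np.concatenate([state, action])[None, :]  # (1, state_dim + action_dim)
    state = gp.predict(xa)[0]                      # feed the prediction back in
    predicted_states.append(state)
predicted_states = np.stack(predicted_states, axis=0)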
Example #5
from sklearn.utils._testing \
    import (assert_array_less,
            assert_almost_equal, assert_raise_message,
            assert_array_almost_equal, assert_array_equal,
            assert_allclose)


def f(x):
    return x * np.sin(x)


X = np.atleast_2d([1., 3., 5., 6., 7., 8.]).T
X2 = np.atleast_2d([2., 4., 5.5, 6.5, 7.5]).T
y = f(X).ravel()

fixed_kernel = RBF(length_scale=1.0, length_scale_bounds="fixed")
kernels = [RBF(length_scale=1.0), fixed_kernel,
           RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)),
           C(1.0, (1e-2, 1e2)) *
           RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)),
           C(1.0, (1e-2, 1e2)) *
           RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)) +
           C(1e-5, (1e-5, 1e2)),
           C(0.1, (1e-2, 1e2)) *
           RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)) +
           C(1e-5, (1e-5, 1e2))]
non_fixed_kernels = [kernel for kernel in kernels
                     if kernel != fixed_kernel]


@pytest.mark.parametrize('kernel', kernels)
Example #6
def main(_):
    num_steps_autoencoder = 0 if FLAGS.uniform_weights else TRAINING_STEPS

    training_df = pd.read_csv(FLAGS.training_data_path, header=0, sep=',')
    testing_df = pd.read_csv(FLAGS.testing_data_path, header=0, sep=',')
    validation_df = pd.read_csv(FLAGS.validation_data_path, header=0, sep=',')

    train_labels = training_df['label']
    validation_labels = validation_df['label']
    test_labels = testing_df['label']
    train_population = training_df['population']
    train_features = training_df[FEATURES]
    validation_features = validation_df[FEATURES]
    test_features = testing_df[FEATURES]
    train_wqs = training_df['racePctWhite_quantile']
    validation_wqs = validation_df['racePctWhite_quantile']
    test_wqs = testing_df['racePctWhite_quantile']

    tf.reset_default_graph()
    x = tf.placeholder(tf.float32, shape=(None, len(FEATURES)), name='x')
    y = tf.placeholder(tf.float32, shape=(None, OUTPUT_DIM), name='y')
    population = tf.placeholder(tf.float32,
                                shape=(None, OUTPUT_DIM),
                                name='population')

    xy = tf.concat([x, y], axis=1)
    autoencoder_layer1 = tf.layers.dense(inputs=xy,
                                         units=10,
                                         activation=tf.sigmoid)
    autoencoder_embedding_layer = tf.layers.dense(inputs=autoencoder_layer1,
                                                  units=EMBEDDING_DIM,
                                                  activation=tf.sigmoid)
    autoencoder_layer3 = tf.layers.dense(inputs=autoencoder_embedding_layer,
                                         units=10,
                                         activation=tf.sigmoid)
    autoencoder_out_x = tf.layers.dense(inputs=autoencoder_layer3,
                                        units=len(FEATURES))
    autoencoder_out_y_logits = tf.layers.dense(inputs=autoencoder_layer3,
                                               units=OUTPUT_DIM)

    autoencoder_y_loss = tf.losses.hinge_loss(labels=y,
                                              logits=autoencoder_out_y_logits)
    autoencoder_x_loss = tf.losses.mean_squared_error(
        labels=x, predictions=autoencoder_out_x)
    autoencoder_loss = autoencoder_x_loss + autoencoder_y_loss
    autoencoder_optimizer = tf.train.AdamOptimizer(LEARNING_RATE).minimize(
        autoencoder_loss)

    parallel_logits = []
    parallel_losses = []
    parallel_optimizers = []

    parallel_alphas = tf.placeholder(tf.float32,
                                     shape=(NUM_PARALLEL_ALPHAS,
                                            EMBEDDING_DIM),
                                     name='parallel_alphas')
    unstack_parallel_alphas = tf.unstack(parallel_alphas, axis=0)
    embedding = tf.placeholder(tf.float32,
                               shape=(None, EMBEDDING_DIM),
                               name='embedding')

    with tf.variable_scope('classifiers'):
        for alpha_index in range(NUM_PARALLEL_ALPHAS):
            logits = classifier(x)
            alpha = tf.reshape(unstack_parallel_alphas[alpha_index],
                               shape=[EMBEDDING_DIM, 1])
            optimizer, loss = optimization(logits, y, population, embedding,
                                           alpha)

            parallel_logits.append(logits)
            parallel_losses.append(loss)
            parallel_optimizers.append(optimizer)

    init = tf.global_variables_initializer()
    classifiers_init = tf.variables_initializer(
        tf.global_variables(scope='classifiers'))

    kernel = RBF(length_scale=FLAGS.sampling_radius,
                 length_scale_bounds=(FLAGS.sampling_radius * 1e-3,
                                      FLAGS.sampling_radius *
                                      1e3)) * ConstantKernel(1.0, (1e-3, 1e3))

    alphas = np.zeros(shape=(0, EMBEDDING_DIM))
    validation_metrics = []
    test_metrics = []

    with tf.Session() as sess:
        sess.run(init)
        # Training autoencoder
        for _ in range(num_steps_autoencoder):
            batch_index = random.sample(range(len(train_labels)), BATCH_SIZE)
            batch_x = train_features.iloc[batch_index, :].values
            batch_y = train_labels.iloc[batch_index].values.reshape(
                BATCH_SIZE, 1)
            _, _ = sess.run([autoencoder_optimizer, autoencoder_loss],
                            feed_dict={
                                x: batch_x,
                                y: batch_y,
                            })

        # GetCandidatesAlpha (Algorithm 2 in paper)
        for alpha_batch_index in range(NUM_ALPHA_BATCHES):
            sess.run(classifiers_init)
            if FLAGS.uniform_weights:
                alpha_batch = np.zeros(shape=(NUM_PARALLEL_ALPHAS,
                                              EMBEDDING_DIM))
            elif alpha_batch_index == 0:
                # We first start uniformly.
                alpha_batch = sample_from_ball(
                    size=(NUM_PARALLEL_ALPHAS, EMBEDDING_DIM),
                    sampling_radius=FLAGS.sampling_radius)
            else:
                # Use the GP lower confidence bound (LCB) to pick candidates.
                alpha_batch = np.zeros(shape=(0, EMBEDDING_DIM))
                sample_alphas = np.copy(alphas)
                sample_validation_metrics = [m[0] for m in validation_metrics]
                candidates = sample_from_ball(
                    size=(10000, EMBEDDING_DIM),
                    sampling_radius=FLAGS.sampling_radius)
                for alpha_index in range(NUM_PARALLEL_ALPHAS):
                    gp = GaussianProcessRegressor(
                        kernel=kernel,
                        alpha=1e-1).fit(sample_alphas,
                                        sample_validation_metrics)

                    metric_mles, metric_stds = gp.predict(candidates,
                                                          return_std=True)
                    metric_lcbs = metric_mles - 1.0 * metric_stds

                    best_index = np.argmin(metric_lcbs)
                    best_alpha = [candidates[best_index]]
                    best_alpha_metric_ucb = metric_mles[best_index] \
                      + 1.0 * metric_stds[best_index]
                    alpha_batch = np.concatenate([alpha_batch, best_alpha])

                    # Add the candidate to the GP data, using its UCB as a
                    # pessimistic stand-in for the not-yet-observed metric.
                    sample_alphas = np.concatenate([sample_alphas, best_alpha])
                    sample_validation_metrics.append(best_alpha_metric_ucb)

            # Training classifiers
            for _ in range(TRAINING_STEPS):
                batch_index = random.sample(range(len(train_labels)),
                                            BATCH_SIZE)
                batch_x = train_features.iloc[batch_index, :].values
                batch_y = train_labels.iloc[batch_index].values.reshape(
                    BATCH_SIZE, 1)
                batch_population = train_population.iloc[
                    batch_index].values.reshape(BATCH_SIZE, 1)
                batch_embedding = sess.run(autoencoder_embedding_layer,
                                           feed_dict={
                                               x: batch_x,
                                               y: batch_y,
                                           })
                _, _ = sess.run(
                    [parallel_optimizers, parallel_losses],
                    feed_dict={
                        x: batch_x,
                        y: batch_y,
                        population: batch_population,
                        embedding: batch_embedding,
                        parallel_alphas: alpha_batch,
                    })

            parallel_train_logits = sess.run(parallel_logits,
                                             feed_dict={
                                                 x:
                                                 train_features.values,
                                                 y:
                                                 train_labels.values.reshape(
                                                     len(train_labels), 1),
                                             })
            alphas = np.concatenate([alphas, alpha_batch])
            parallel_validation_logits = sess.run(
                parallel_logits,
                feed_dict={
                    x: validation_features.values,
                    y:
                    validation_labels.values.reshape(len(validation_labels),
                                                     1),
                })
            parallel_test_logits = sess.run(parallel_logits,
                                            feed_dict={
                                                x:
                                                test_features.values,
                                                y:
                                                test_labels.values.reshape(
                                                    len(test_labels), 1),
                                            })
            parallel_thresholds = [
                find_threshold(train_labels, train_logits, train_wqs,
                               FLAGS.post_shift)
                for train_logits in parallel_train_logits
            ]
            logits_thresholds = zip(parallel_validation_logits,
                                    parallel_thresholds)
            parallel_validation_metrics = [
                metrics(validation_labels, logits, validation_wqs, thresholds)
                for (logits, thresholds) in logits_thresholds
            ]
            validation_metrics.extend(parallel_validation_metrics)
            parallel_test_metrics = [
                metrics(test_labels, test_logits, test_wqs, thresholds)
                for (test_logits, thresholds
                     ) in zip(parallel_test_logits, parallel_thresholds)
            ]
            test_metrics.extend(parallel_test_metrics)

    best_observed_index = np.argmin([m[0] for m in validation_metrics])
    print('[metric] validation_acc={}'.format(
        validation_metrics[best_observed_index][0]))
    print('[metric] validation_violation={}'.format(
        validation_metrics[best_observed_index][1]))
    print('[metric] test_acc={}'.format(test_metrics[best_observed_index][0]))
    print('[metric] test_violation={}'.format(
        test_metrics[best_observed_index][1]))

    return 0
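A self-contained sketch of the candidate-selection step used above (array shapes and constants are illustrative assumptions, not values from the paper): fit a GP on previously evaluated alphas, then pick the candidate with the smallest lower confidence bound.

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel

rng = np.random.RandomState(0)
observed_alphas = rng.normal(size=(8, 4))    # hypothetical evaluated alphas
observed_metrics = rng.uniform(size=8)       # hypothetical validation metrics

kernel = RBF(length_scale=1.0) * ConstantKernel(1.0)
gp = GaussianProcessRegressor(kernel=kernel, alpha=1e-1)
gp.fit(observed_alphas, observed_metrics)

candidates = rng.normal(size=(1000, 4))      # hypothetical candidate alphas
mean, std = gp.predict(candidates, return_std=True)
lcb = mean - 1.0 * std                       # lower confidence bound
best_alpha = candidates[np.argmin(lcb)]      # most optimistic candidate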
Example #7
            ppmv_sums.append(float(ppmv))
            counts.append(1)
        else:
            # aggregate monthly sum to produce average
            ppmv_sums[-1] += float(ppmv)
            counts[-1] += 1

    months = np.asarray(months).reshape(-1, 1)
    avg_ppmvs = np.asarray(ppmv_sums) / counts
    return months, avg_ppmvs


X, y = load_mauna_loa_atmospheric_c02()

# Kernel with parameters given in GPML book
k1 = 66.0**2 * RBF(length_scale=67.0)  # long term smooth rising trend
k2 = 2.4**2 * RBF(length_scale=90.0) \
    * ExpSineSquared(length_scale=1.3, periodicity=1.0)  # seasonal component
# medium term irregularity
k3 = 0.66**2 \
    * RationalQuadratic(length_scale=1.2, alpha=0.78)
k4 = 0.18**2 * RBF(length_scale=0.134) \
    + WhiteKernel(noise_level=0.19**2)  # noise terms
kernel_gpml = k1 + k2 + k3 + k4

gp = GaussianProcessRegressor(kernel=kernel_gpml,
                              alpha=0,
                              optimizer=None,
                              normalize_y=True)
gp.fit(X, y)
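Because optimizer=None, the GPML hyperparameters above are used exactly as specified; a quick illustrative check (not in the original excerpt) confirms that the fitted kernel matches the prior and that the log-marginal likelihood can still be evaluated:

print(gp.kernel_)  # identical to kernel_gpml, since no optimization was run
print("Log-marginal-likelihood: %.3f"
      % gp.log_marginal_likelihood(gp.kernel_.theta))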
Example #8
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels \
    import RBF, WhiteKernel, RationalQuadratic, ExpSineSquared
from sklearn.datasets import fetch_mldata

# get data
data = fetch_mldata('mauna-loa-atmospheric-co2').data
x = data[:, [1]]
y = data[:, 0]
x_train, y_train = x[:int(0.7 * len(x))], y[:int(0.7 * len(y))]
x_test, y_test = x[int(0.7 * len(x)):], y[int(0.7 * len(y)):]

# Kernel from [1]
# it was implemented by Jan Hendrik Metzen <*****@*****.**>
k1 = 50.0**2 * RBF(length_scale=50.0)  # linear trend
k2 = 2.0**2 * RBF(length_scale=100.0) \
    * ExpSineSquared(length_scale=1.0, periodicity=1.0,
                     periodicity_bounds="fixed")  # oscillations
k3 = 0.5**2 * RationalQuadratic(length_scale=1.0, alpha=1.0)
k4 = 0.1**2 * RBF(length_scale=0.1) \
    + WhiteKernel(noise_level=0.1**2,
                  noise_level_bounds=(1e-3, np.inf)) # noise
kernel = k1 + k2 + k3 + k4

gp = GaussianProcessRegressor(kernel=kernel, alpha=0, normalize_y=True)

# fit GP
gp.fit(x_train, y_train)
# produce vector of values on x-axis
X_ = np.linspace(x.min(), x.max(), 1000)[:, np.newaxis]
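The excerpt stops here; a plausible continuation (a sketch, not the original author's code) evaluates the fitted GP on X_ with its predictive uncertainty and plots it against the held-out test points:

import matplotlib.pyplot as plt

y_pred, y_std = gp.predict(X_, return_std=True)

plt.scatter(x_train, y_train, c='k', s=5, label='train')
plt.scatter(x_test, y_test, c='r', s=5, label='test')
plt.plot(X_, y_pred, 'b-', label='GP mean')
plt.fill_between(X_[:, 0], y_pred - y_std, y_pred + y_std,
                 alpha=0.3, color='b', label='+/- 1 std')
plt.legend(loc='upper left')
plt.show()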
Example #9
[MIN, MAX] = h5py.File('./data/CNN/model_CNN_0521_K2M_rel.h5',
                       'r')['minmax'][:]

#src_path  = 'I:/AllData_0327/'
#src_path  = 'C:/Users/Dawnknight/Documents/GitHub/K_project/data/'
src_path = 'D:/Project/K_project/data/'
Mfolder = 'unified data array/Unified_MData/'
Kfolder = 'unified data array/Unified_KData/'
Rfolder = 'unified data array/reliability/'
gprfolder = 'GPR_Kernel/'
Errfolder = 'GPR_cluster_err/'

Rel_th = 0.7
factor = 5

k1 = 66.0**2 * RBF(length_scale=67.0)  # long term smooth rising trend

k4 = 0.18**2 * RBF(length_scale=0.134) \
    + WhiteKernel(noise_level=0.19**2)  # noise terms

kernel_gpml = k1 + k4

M_train_rel = cPickle.load(file('GPR_training_testing_set33.pkl',
                                'rb'))['Rel_train_M'][12:30, :].T
K_train_rel = cPickle.load(file('GPR_training_testing_set33.pkl',
                                'rb'))['Rel_train_K'][12:30, :].T

K_test_rel = (cPickle.load(file('GPR_training_testing_set33.pkl',
                                'rb'))['Rel_test_K'][12:30, :].T -
              MIN) / (MAX - MIN)
M_test_rel = cPickle.load(file('GPR_training_testing_set33.pkl',
Example #10
# reload = True
reload = False
n_iter = 1000
N_EARLY_STOPPING = 1000

# ALPHA = MEAN  # prior:
ALPHA = 0.001  # ndim ** 1

GAMMA = 10**(-2) * 2 * ndim
GAMMA0 = 0.01 * GAMMA
GAMMA_Y = 10**(-2)  # weight of adjacen

IS_EDGE_NORMALIZED = True

# kernel = Matern(nu=2.5)
kernel = C(1) * RBF(2)
# kernel = None

BURNIN = False  # TODO
INITIAL_K = 10
INITIAL_THETA = 10

UPDATE_HYPERPARAM_FUNC = 'pairwise_sampling'  # None

output_dir = 'output'  # _gmrf_min0max1_easy'
parameter_dir = os.path.join('param_dir', 'csv_files')
result_filename = os.path.join(output_dir, 'gaussian_result_2dim.csv')

ACQUISITION_FUNC = 'ucb'  # 'ei'
ACQUISITION_PARAM_DIC = {'beta': 5}
Example #11
    def test_gpr_rbf_unfitted(self):

        se = (C(1.0, (1e-3, 1e3)) *
              RBF(length_scale=10, length_scale_bounds=(1e-3, 1e3)))
        kernel = (Sum(
            se,
            C(0.1, (1e-3, 1e3)) *
            RBF(length_scale=1, length_scale_bounds=(1e-3, 1e3))))

        gp = GaussianProcessRegressor(alpha=1e-7,
                                      kernel=kernel,
                                      n_restarts_optimizer=15,
                                      normalize_y=True)

        # return_cov=False, return_std=False
        model_onnx = to_onnx(gp,
                             initial_types=[('X', FloatTensorType([]))],
                             dtype=np.float32)
        self.assertTrue(model_onnx is not None)
        dump_data_and_model(Xtest_.astype(np.float32),
                            gp,
                            model_onnx,
                            verbose=False,
                            basename="SklearnGaussianProcessRBFUnfitted")

        # return_cov=True, return_std=True
        options = {
            GaussianProcessRegressor: {
                "return_std": True,
                "return_cov": True
            }
        }
        try:
            to_onnx(gp, Xtrain_.astype(np.float32), options=options)
        except RuntimeError as e:
            assert "Not returning standard deviation" in str(e)

        # return_std=True
        options = {GaussianProcessRegressor: {"return_std": True}}
        model_onnx = to_onnx(gp,
                             options=options,
                             initial_types=[('X', FloatTensorType([None,
                                                                   None]))],
                             dtype=np.float32)
        self.assertTrue(model_onnx is not None)
        self.check_outputs(
            gp,
            model_onnx,
            Xtest_.astype(np.float32),
            predict_attributes=options[GaussianProcessRegressor])

        # return_cov=True
        options = {GaussianProcessRegressor: {"return_cov": True}}
        # model_onnx = to_onnx(gp, Xtrain_.astype(np.float32), options=options)
        model_onnx = to_onnx(gp,
                             options=options,
                             initial_types=[('X', FloatTensorType([None,
                                                                   None]))],
                             dtype=np.float32)
        self.assertTrue(model_onnx is not None)
        self.check_outputs(
            gp,
            model_onnx,
            Xtest_.astype(np.float32),
            predict_attributes=options[GaussianProcessRegressor])
Example #12
print(__doc__)

import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features.
y = np.array(iris.target, dtype=int)

h = .02  # step size in the mesh

kernel = 1.0 * RBF([1.0])
gpc_rbf_isotropic = GaussianProcessClassifier(kernel=kernel).fit(X, y)
kernel = 1.0 * RBF([1.0, 1.0])
gpc_rbf_anisotropic = GaussianProcessClassifier(kernel=kernel).fit(X, y)

# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

titles = ["Isotropic RBF", "Anisotropic RBF"]
plt.figure(figsize=(10, 5))
for i, clf in enumerate((gpc_rbf_isotropic, gpc_rbf_anisotropic)):
    # Plot the predicted probabilities. For that, we will assign a color to
    # each point in the mesh [x_min, x_max] x [y_min, y_max].
Example #13
# Author: Alexandre Gramfort <*****@*****.**>
# License: BSD 3 clause
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data[:, 0:2]  # we only take the first two features for visualization
y = iris.target
n_features = X.shape[1]
C = 10
kernel = 1.0 * RBF([1.0, 1.0])  # for GPC
# Create different classifiers.
classifiers = {
    'L1 logistic':
    LogisticRegression(C=C,
                       penalty='l1',
                       solver='saga',
                       multi_class='multinomial',
                       max_iter=10000),
    'L2 logistic (Multinomial)':
    LogisticRegression(C=C,
                       penalty='l2',
                       solver='saga',
                       multi_class='multinomial',
                       max_iter=10000),
    'L2 logistic (OvR)':
Example #14
def main(_):
    mnist = input_data.read_data_sets('/tmp/data/', one_hot=True, seed=12345)
    random_weight_vector = np.random.uniform(low=0.1,
                                             high=1.9,
                                             size=TRAIN_INPUT_SIZE)

    x = tf.placeholder(tf.float32, shape=(None, INPUT_DIM), name='x')
    y = tf.placeholder(tf.float32, shape=(None, OUTPUT_DIM), name='y')
    weight = tf.placeholder(tf.float32,
                            shape=(None, OUTPUT_DIM),
                            name='weight')
    parallel_alphas = tf.placeholder(tf.float32,
                                     shape=(FLAGS.num_parallel_alphas,
                                            OUTPUT_DIM),
                                     name='parallel_alphas')
    unstack_parallel_alphas = tf.unstack(parallel_alphas, axis=0)
    parallel_logits = []
    parallel_losses = []
    parallel_optimizers = []
    validation_metrics = []
    test_metrics = []
    all_test_metrics = []

    with tf.variable_scope('classifier'):
        for alpha_index in range(FLAGS.num_parallel_alphas):
            logits = classifier(x)
            alpha = tf.reshape(unstack_parallel_alphas[alpha_index],
                               shape=[OUTPUT_DIM, 1])
            optimizer, loss = optimization(logits, y, weight, alpha,
                                           LEARNING_RATE)
            parallel_logits.append(logits)
            parallel_losses.append(loss)
            parallel_optimizers.append(optimizer)

    init = tf.global_variables_initializer()
    classifiers_init = tf.variables_initializer(
        tf.global_variables(scope='classifier'))
    with tf.Session() as sess:
        sess.run(init)

        # GetCandidatesAlpha (Algorithm 2 in paper)
        sample_alphas = np.zeros(shape=(0, OUTPUT_DIM))
        for alpha_batch_index in range(FLAGS.num_alpha_batches):
            sess.run(classifiers_init)
            if FLAGS.uniform_weights:
                alpha_batch = np.zeros(shape=(FLAGS.num_parallel_alphas,
                                              OUTPUT_DIM))
            elif FLAGS.random_alpha or alpha_batch_index < 1:
                alpha_batch = sample_from_ball(
                    size=(FLAGS.num_parallel_alphas, OUTPUT_DIM),
                    sampling_radius=FLAGS.sampling_radius)
                sample_alphas = np.concatenate([sample_alphas, alpha_batch])
            else:
                # Use LCB to generate candidates.
                alpha_batch = np.zeros(shape=(0, OUTPUT_DIM))
                sample_metrics = validation_metrics[:]
                for alpha_index in range(FLAGS.num_parallel_alphas):
                    kernel = RBF(length_scale=FLAGS.sampling_radius,
                                 length_scale_bounds=(
                                     FLAGS.sampling_radius * 1e-3,
                                     FLAGS.sampling_radius *
                                     1e3)) * ConstantKernel(1.0, (1e-3, 1e3))
                    gp = GaussianProcessRegressor(kernel=kernel,
                                                  alpha=1e-4).fit(
                                                      sample_alphas,
                                                      np.log1p(sample_metrics))
                    candidates = sample_from_ball((10000, OUTPUT_DIM),
                                                  FLAGS.sampling_radius)

                    metric_mles, metric_stds = gp.predict(candidates,
                                                          return_std=True)
                    metric_lcbs = np.maximum(
                        np.expm1(metric_mles - 1.0 * metric_stds), 0.0)
                    metric_lcbs += np.random.random(
                        size=metric_lcbs.shape) * 0.001  # break ties
                    best_index = np.argmin(metric_lcbs)

                    best_alpha = [candidates[best_index]]
                    best_alpha_metric_estimate = np.minimum(
                        np.expm1(metric_mles[best_index] +
                                 1.0 * metric_stds[best_index]), 1.0)
                    alpha_batch = np.concatenate([alpha_batch, best_alpha])

                    sample_alphas = np.concatenate([sample_alphas, best_alpha])
                    sample_metrics.append(best_alpha_metric_estimate)

            # Training classifiers
            for step in range(TRAINING_STEPS):
                batch_index = range(
                    step * BATCH_SIZE % TRAIN_INPUT_SIZE,
                    step * BATCH_SIZE % TRAIN_INPUT_SIZE + BATCH_SIZE)
                (batch_x, batch_y) = mnist.train.next_batch(BATCH_SIZE,
                                                            shuffle=False)
                batch_weight = [[random_weight_vector[i]] * OUTPUT_DIM
                                for i in batch_index]
                _, _ = sess.run(
                    [parallel_optimizers, parallel_losses],
                    feed_dict={
                        x: batch_x,
                        y: batch_y,
                        weight: batch_weight,
                        parallel_alphas: alpha_batch,
                    })

            parallel_validation_logits = sess.run(parallel_logits,
                                                  feed_dict={
                                                      x:
                                                      mnist.validation.images,
                                                      y:
                                                      mnist.validation.labels,
                                                  })
            parallel_validation_metrics = [
                metric(mnist.validation.labels,
                       validation_logits,
                       all_digits=False)
                for validation_logits in parallel_validation_logits
            ]
            validation_metrics.extend(parallel_validation_metrics)

            parallel_test_logits = sess.run(parallel_logits,
                                            feed_dict={
                                                x: mnist.test.images,
                                                y: mnist.test.labels,
                                            })
            parallel_test_metrics = [
                metric(mnist.test.labels, test_logits, all_digits=False)
                for test_logits in parallel_test_logits
            ]
            test_metrics.extend(parallel_test_metrics)

            parallel_all_test_metrics = [
                metric(mnist.test.labels, test_logits, all_digits=True)
                for test_logits in parallel_test_logits
            ]
            all_test_metrics.extend(parallel_all_test_metrics)

    best_observed_index = np.argmin(validation_metrics)
    print('[metric] validation={}'.format(
        validation_metrics[best_observed_index]))
    print('[metric] test={}'.format(test_metrics[best_observed_index]))
    for i in range(10):
        print('[all test metrics] {}={}'.format(
            i, all_test_metrics[best_observed_index][i]))
Example #15
import numpy as np

from sklearn.metrics.pairwise \
    import PAIRWISE_KERNEL_FUNCTIONS, euclidean_distances, pairwise_kernels
from sklearn.gaussian_process.kernels \
    import (RBF, Matern, RationalQuadratic, ExpSineSquared, DotProduct,
            ConstantKernel, WhiteKernel, PairwiseKernel, KernelOperator,
            Exponentiation)
from sklearn.base import clone

from sklearn.utils.testing import (assert_almost_equal,
                                   assert_array_equal,
                                   assert_array_almost_equal)


X = np.random.RandomState(0).normal(0, 1, (5, 2))
Y = np.random.RandomState(0).normal(0, 1, (6, 2))

kernel_white = RBF(length_scale=2.0) + WhiteKernel(noise_level=3.0)
kernels = [RBF(length_scale=2.0), RBF(length_scale_bounds=(0.5, 2.0)),
           ConstantKernel(constant_value=10.0),
           2.0 * RBF(length_scale=0.33, length_scale_bounds="fixed"),
           2.0 * RBF(length_scale=0.5), kernel_white,
           2.0 * RBF(length_scale=[0.5, 2.0]),
           2.0 * Matern(length_scale=0.33, length_scale_bounds="fixed"),
           2.0 * Matern(length_scale=0.5, nu=0.5),
           2.0 * Matern(length_scale=1.5, nu=1.5),
           2.0 * Matern(length_scale=2.5, nu=2.5),
           2.0 * Matern(length_scale=[0.5, 2.0], nu=0.5),
           3.0 * Matern(length_scale=[2.0, 0.5], nu=1.5),
           4.0 * Matern(length_scale=[0.5, 0.5], nu=2.5),
           RationalQuadratic(length_scale=0.5, alpha=1.5),
           ExpSineSquared(length_scale=0.5, periodicity=1.5),
           DotProduct(sigma_0=2.0), DotProduct(sigma_0=2.0) ** 2,
Example #16
import pandas as pd
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import ConstantKernel, RBF
from sklearn.model_selection import train_test_split

url = 'https://raw.githubusercontent.com/carlson9/KocPython2019/master/Homework/immSurvey.csv'
tt = pd.read_csv(url, index_col=0, parse_dates=[0])
tt.head()

alphas = tt.stanMeansNewSysPooled
sample = tt.textToSend

from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(ngram_range=(2, 2), token_pattern=r'\b\w+\b', min_df=1)
analyze = vec.build_analyzer()
X = vec.fit_transform(sample)

Xtrain, Xtest, ytrain, ytest = train_test_split(X, alphas, random_state=1)

rbf = ConstantKernel(1.0) * RBF(length_scale=1.0)
gpr = GaussianProcessRegressor(kernel=rbf, alpha=1e-8)

gpr.fit(Xtrain.toarray(), ytrain)

# Compute posterior predictive mean and covariance
mu_s, cov_s = gpr.predict(Xtest.toarray(), return_cov=True)

# test correlation between the held-out targets and the predicted means
print(np.corrcoef(ytest, mu_s))
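Not part of the original snippet: a couple of standard regression metrics (illustrative sketch) give a more interpretable summary than the raw correlation matrix:

from sklearn.metrics import mean_squared_error, r2_score

print("R^2 :", r2_score(ytest, mu_s))
print("RMSE:", np.sqrt(mean_squared_error(ytest, mu_s)))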
Example #17
# Custom defined list of Gaussian Process regression models to be used by TPOT
import numpy as np
import pdb
from itertools import product

from skrvm import RVR

# Define list of Kernels
from sklearn.gaussian_process.kernels import (RBF, Matern, RationalQuadratic,
                                              ExpSineSquared, DotProduct,
                                              ConstantKernel)

# The hyperparameters for the GPR, will be optimised during fitting
kernels = [RBF(), RationalQuadratic(), ExpSineSquared(), Matern()]
tpot_config_gpr = {
    'sklearn.gaussian_process.GaussianProcessRegressor': {
        'kernel': kernels,
        'random_state': [42],
        'alpha': np.arange(1e-2, 10, 30)
    },
    'skrvm.RVR': {
        'kernel': kernels,
        'alpha': [1e-10, 1e-06, 1e-02, 1],
        'beta': [1e-10, 1e-06, 1e-02, 1],
    },
    'sklearn.svm.LinearSVR': {
        'loss': ["epsilon_insensitive", "squared_epsilon_insensitive"],
        'dual': [True, False],
        'random_state': [42],
        'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
        'C': [2**-6, 2**-5, 2**-4, 2**-3, 2**-2, 2**-1, 2**0, 2**1.],
Example #18
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

h = .02  # step size in the mesh

names = [
    "Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
    "Decision Tree", "Random Forest", "Neural Net", "AdaBoost", "Naive Bayes",
    "QDA"
]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()
]

X, y = make_classification(n_features=2,
                           n_redundant=0,
                           n_informative=2,
                           random_state=1,
                           n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
Example #19
#     train = val['train']
#     test = val['test']

numfeat = 9

classifiernames = [
    "Nearest Neighbors", "Linear SVC", "RBF SVC", "Gaussian Process",
    "Decision Tree", "Random Forest", "Multilayer Perceptron", "AdaBoost",
    "Naive Bayes", "QDA", "XGBoost", "Logistic Regression"
]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(n_estimators=100, max_features='auto'),
    MLPClassifier(alpha=1, max_iter=int(1e8)),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(), XGBClassifier(),
    LogisticRegression()
]

selectors = [
    reliefF.reliefF, fisher_score.fisher_score, gini_index.gini_index,
    chi_square.chi_square, JMI.jmi, CIFE.cife, DISR.disr, MIM.mim, CMIM.cmim,
    ICAP.icap, MRMR.mrmr, MIFS.mifs
]
Example #20
    def classificationHandler(self, e):
        model = None
        if e.get("Algorithm") == "Perceptron":
            perceptron_max_iter = int(e.get("Params").get("Max_Iter"))
            perceptron_penalty = e.get("Params").get("penalty")
            perceptron_alpha = float(e.get("Params").get("alpha"))
            perceptron_tol = float(e.get("Params").get("tol"))
            perceptron_shuffle = bool(e.get("Params").get("shuffle"))
            perceptron_eta0 = float(e.get("Params").get("eta0"))
            model = Perceptron(max_iter=perceptron_max_iter, penalty=perceptron_penalty,
                        alpha=perceptron_alpha, tol=perceptron_tol, shuffle=perceptron_shuffle, eta0= perceptron_eta0)

        elif e.get("Algorithm") == "Decision Tree":
            dtc_max_depth = e.get("Params").get("Max Depth")
            dtc_max_depth = int(dtc_max_depth) if dtc_max_depth != "None" else None
            dtc_criterion = e.get("Params").get("criterion")
            dtc_splitter = e.get("Params").get("splitter")
            dtc_min_samples_split = e.get("Params").get("min_samples_split")
            print(dtc_min_samples_split)
            dtc_min_samples_split = float(dtc_min_samples_split) if int(dtc_min_samples_split) == 0 else int(dtc_min_samples_split)
            dtc_min_samples_leaf = e.get("Params").get("min_samples_leaf")
            print(dtc_min_samples_leaf)
            dtc_min_samples_leaf = float(dtc_min_samples_leaf) if int(dtc_min_samples_leaf) == 0 else int(dtc_min_samples_leaf)
            model = DecisionTreeClassifier(max_depth=dtc_max_depth, criterion=dtc_criterion, splitter=dtc_splitter,
                                           min_samples_split=dtc_min_samples_split, min_samples_leaf=dtc_min_samples_leaf)
        elif e.get("Algorithm") == "Support Vector Classifier":
            svc_kernel = e.get("Params").get("Kernel")
            model = SVC(kernel=svc_kernel)
        elif e.get("Algorithm") == "K Neighbors Classifier":
            knc_n_neighbors = int(e.get("Params").get("n_neighbors"))
            model = KNeighborsClassifier(n_neighbors=knc_n_neighbors)
        elif e.get("Algorithm") == "Gaussian Process Classifier":
            gpc_kernel = 1.0 * RBF(1.0)
            model = GaussianProcessClassifier(kernel=gpc_kernel)
        elif e.get("Algorithm") == "Random Forest Classifier":
            rf_max_depth = e.get("Params").get("Max Depth")
            if rf_max_depth != "None":
                rf_max_depth = int(rf_max_depth)
            else:
                rf_max_depth = None
            rf_n_estimators = int(e.get("Params").get("Number of Trees (n_estimators)"))
            model = RandomForestClassifier(n_estimators=rf_n_estimators, max_depth=rf_max_depth)
        elif e.get("Algorithm") == "Multi-layer Perceptron":
            mlp_alpha = float(e.get("Params").get("alpha"))
            model = MLPClassifier(alpha=mlp_alpha)
        elif e.get("Algorithm") == "AdaBoost Classifier":
            model = AdaBoostClassifier()
        elif e.get("Algorithm") == "Gaussian Naive Bayes":
            model = GaussianNB()
        elif e.get("Algorithm") == "Quadratic Discriminant Analysis":
            model = QuadraticDiscriminantAnalysis()
        elif e.get("Algorithm") == "Neural Network (Keras)":
            networkDef = e.get("Params").get("Network")
            overallNetwork = networkDef[0]
            if overallNetwork[0] == "Sequential":
                model = k.Sequential()

            model.add(Dense(units=networkDef[1].get("Nodes"),  activation=networkDef[1].get("Activation")))
            layerDefs = networkDef[2:]
            for layer in layerDefs:
                if layer.get("Type") == "Dense":
                    model.add(Dense(units=layer.get("Nodes"), activation=layer.get("Activation")))
                elif layer.get("Type") == "Conv 1D":
                    model.add(Conv1D(layer.get("Nodes"), 3, activation=layer.get("Activation")))
            model.compile(loss='binary_crossentropy',
                          optimizer='adam',
                          metrics=[overallNetwork[1]])
        if e.get("Algorithm") == "Neural Network (Keras)":
            start = time.time()
            model.fit(self.trainingDataX.values, k.utils.to_categorical(self.trainingDataY.values), epochs=overallNetwork[2], batch_size=overallNetwork[3])
            end = time.time()
        else:
            start = time.time()
            try:
                model.fit(self.trainingDataX.values, self.trainingDataY.values)
            except MemoryError:
                print("Memory Error")
            end = time.time()
        entry = {
            "Type": "Classification",
            "Algorithm": e.get("Algorithm"),
            "Model": model,
            "Params": e.get("Params"),
            "Statistics":
                {
                }
        }
        if self.typeOfData == "K-Fold 1 CSV":
            scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
            scores = cross_validate(model, self.trainingDataX, self.trainingDataY, scoring=scoring, cv=3,
                                    return_train_score=True)
            entry["Statistics"] = {
                "Accuracy": str(scores['test_accuracy'].mean())[0:4],
                "Precision": str(scores["test_precision_macro"].mean())[0:4],
                "Recall": str(scores["test_recall_macro"].mean())[0:4],
                "F1": str(scores["test_f1_macro"].mean())[0:4]
            }

        elif self.typeOfData == "All-n-One" or self.typeOfData == "1 Training 1 Testing":
            if e.get("Algorithm") == "Neural Network (Keras)":
                entry["Statistics"] = {
                    "Accuracy": str(1.0),
                    "Precision": str(1.0),
                    "Recall": str(1.0),
                    "F1": str(1.0)
                }
            else:
                entry["Statistics"] = {
                    "Accuracy": str(metrics.accuracy_score(self.testingDataY.values, model.predict(self.testingDataX.values)))[0:4],
                    "Precision": str(
                        metrics.precision_score(self.testingDataY.values, model.predict(self.testingDataX.values), average='macro'))[0:4],
                    "Recall": str(
                        metrics.recall_score(self.testingDataY.values, model.predict(self.testingDataX.values), average='macro'))[0:4],
                    "F1": str(metrics.f1_score(self.testingDataY.values, model.predict(self.testingDataX.values), average='macro'))[0:4]
                }
        entry["Statistics"]["Fit Time"] = int(end - start)
        return entry
Example #21
    def fit(self, X, y):
        """Fit Gaussian process classification model

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training data

        y : array-like, shape = (n_samples,)
            Target values, must be binary

        Returns
        -------
        self : returns an instance of self.
        """
        if self.kernel is None:  # Use an RBF kernel as default
            self.kernel_ = C(1.0, constant_value_bounds="fixed") \
                * RBF(1.0, length_scale_bounds="fixed")
        else:
            self.kernel_ = clone(self.kernel)

        self.rng = check_random_state(self.random_state)

        self.X_train_ = np.copy(X) if self.copy_X_train else X

        # Encode class labels and check that it is a binary classification
        # problem
        label_encoder = LabelEncoder()
        self.y_train_ = label_encoder.fit_transform(y)
        self.classes_ = label_encoder.classes_
        if self.classes_.size > 2:
            raise ValueError("%s supports only binary classification. "
                             "y contains classes %s" %
                             (self.__class__.__name__, self.classes_))
        elif self.classes_.size == 1:
            raise ValueError("{0:s} requires 2 classes.".format(
                self.__class__.__name__))

        if self.optimizer is not None and self.kernel_.n_dims > 0:
            # Choose hyperparameters based on maximizing the log-marginal
            # likelihood (potentially starting from several initial values)
            def obj_func(theta, eval_gradient=True):
                if eval_gradient:
                    lml, grad = self.log_marginal_likelihood(
                        theta, eval_gradient=True)
                    return -lml, -grad
                else:
                    return -self.log_marginal_likelihood(theta)

            # First optimize starting from theta specified in kernel
            optima = [
                self._constrained_optimization(obj_func, self.kernel_.theta,
                                               self.kernel_.bounds)
            ]

            # Additional runs are performed from log-uniform chosen initial
            # theta
            if self.n_restarts_optimizer > 0:
                if not np.isfinite(self.kernel_.bounds).all():
                    raise ValueError(
                        "Multiple optimizer restarts (n_restarts_optimizer>0) "
                        "requires that all bounds are finite.")
                bounds = self.kernel_.bounds
                for iteration in range(self.n_restarts_optimizer):
                    theta_initial = np.exp(
                        self.rng.uniform(bounds[:, 0], bounds[:, 1]))
                    optima.append(
                        self._constrained_optimization(obj_func, theta_initial,
                                                       bounds))
            # Select result from run with minimal (negative) log-marginal
            # likelihood
            lml_values = list(map(itemgetter(1), optima))
            self.kernel_.theta = optima[np.argmin(lml_values)][0]
            self.log_marginal_likelihood_value_ = -np.min(lml_values)
        else:
            self.log_marginal_likelihood_value_ = \
                self.log_marginal_likelihood(self.kernel_.theta)

        # Precompute quantities required for predictions which are independent
        # of actual query points
        K = self.kernel_(self.X_train_)

        _, (self.pi_, self.W_sr_, self.L_, _, _) = \
            self._posterior_mode(K, return_temporaries=True)

        return self
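A minimal usage sketch with scikit-learn's public API (illustrative only; the fit method above closely resembles the binary Laplace-approximation classifier that GaussianProcessClassifier wraps internally):

from sklearn.datasets import make_moons
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

X_bin, y_bin = make_moons(n_samples=100, noise=0.2, random_state=0)
gpc = GaussianProcessClassifier(kernel=1.0 * RBF(1.0)).fit(X_bin, y_bin)
print(gpc.predict_proba(X_bin[:5]))  # predicted class probabilities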
Example #22
    def classification_summary(self):
        print('Starting classification_summary...')
        print('TOP 10 FEATURE IMPORTANCE')
        from sklearn.ensemble import AdaBoostClassifier
        import pandas as pd
        import warnings
        warnings.filterwarnings('ignore')
        ab = AdaBoostClassifier().fit(self.X, self.y)
        print(pd.Series(ab.feature_importances_, index=self.X.columns).sort_values(ascending=False).head(10))

        from sklearn.model_selection import train_test_split
        X_train, X_test, y_train, y_test = train_test_split(
                    self.X.values, self.y.values, test_size=float(input("Please enter test size (for eg. please enter 0.20 for 20% test size):\n")), random_state=1)
             
        from sklearn.neural_network import MLPClassifier
        from sklearn.neighbors import KNeighborsClassifier
        from sklearn.svm import SVC
        from sklearn.svm import LinearSVC
        from sklearn.gaussian_process import GaussianProcessClassifier
        from sklearn.gaussian_process.kernels import RBF
        from sklearn.tree import DecisionTreeClassifier
        from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
        from catboost import CatBoostClassifier
        from sklearn.naive_bayes import GaussianNB
        from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
        from sklearn.linear_model import LogisticRegression
        import xgboost as xgb
        from sklearn.metrics import accuracy_score, f1_score
        from sklearn.metrics import roc_auc_score
        import warnings
        warnings.filterwarnings('ignore')
        classifiers = {
            "Logistic"     : LogisticRegression(max_iter=1000),
            "KNN(3)"       : KNeighborsClassifier(3), 
            "Decision Tree": DecisionTreeClassifier(max_depth=7), 
            "Random Forest": RandomForestClassifier(max_depth=7, n_estimators=10, max_features=4), 
            "Neural Net"   : MLPClassifier(alpha=1), 
            "XGBoost"      : xgb.XGBClassifier(max_depth=4, n_estimators=10, learning_rate=0.1, n_jobs=1),
            "AdaBoost"     : AdaBoostClassifier(),
            "CatBoost"     : CatBoostClassifier(silent=True),
            "Naive Bayes"  : GaussianNB(), 
            "QDA"          : QuadraticDiscriminantAnalysis(),
            "Linear SVC"   : LinearSVC(),
            "Linear SVM"   : SVC(kernel="linear"), 
            "Gaussian Proc": GaussianProcessClassifier(1.0 * RBF(1.0)),
        }
        from time import time
        k = 10      
        head = list(classifiers.items())[:k]

        for name, classifier in head:
            start = time()
            classifier.fit(X_train, y_train)
            train_time = time() - start
            start = time()
            predictions = classifier.predict(X_test)
            predict_time = time()-start
            acc_score = accuracy_score(y_test, predictions)
            roc_score = roc_auc_score(y_test, predictions)
            f1_macro = f1_score(y_test, predictions, average='macro')
            print("{:<15}| acc_score = {:.3f} | roc_score = {:,.3f} | f1_score(macro) = {:,.3f} | Train time = {:,.3f}s | Pred. time = {:,.3f}s".format(name, acc_score, roc_score, f1_macro, train_time, predict_time))