Example #1
def test_pipeline():
    pipeline = dl.Pipeline([("scale", StandardScaler()), ("fdr", SelectFdr()),
                            ("svm", LinearSVC())])

    pipeline = pipeline.fit(X, y)
    y2 = pipeline.predict(X)
    score = pipeline.score(X, y)

    assert isinstance(y2, di.Value)
    assert isinstance(score, di.Value)

    assert isinstance(score.compute(), float)

    assert pipeline.score(X, y).key == pipeline.score(X, y).key
    assert score.compute() == score.compute()

    y22 = y2.compute()
    assert y22.shape == y.shape
    assert y22.dtype == y.dtype
    skpipeline = sklearn.pipeline.Pipeline([("scale", StandardScaler()),
                                            ("fdr", SelectFdr()),
                                            ("svm", LinearSVC())])

    skpipeline.fit(X, y)
    sk_y2 = skpipeline.predict(X)
    sk_score = skpipeline.score(X, y)
    assert sk_score == score.compute()
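The test assumes module aliases and a dataset that are not shown: dl is the dask-learn pipeline module, di its delayed-value module, and X, y a small classification set. A minimal sketch of the missing fixtures (the dl/di import paths are not shown in the source, so they are left as a comment; the toy dataset is an assumption):

import sklearn.pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFdr
from sklearn.svm import LinearSVC
from sklearn.datasets import make_classification

# dl / di: the dask-learn pipeline and delayed-value modules (aliases not shown above).
X, y = make_classification(n_samples=100, n_features=20, random_state=0)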
Example #2
def main(args):
    # Use the MNIST dataset.
    dataset = MNIST(data_size=5000)

    # Split the dataset into a train set and a test set.
    train_data, test_data, train_target, test_target = sklearn.model_selection.train_test_split(
        dataset.data, dataset.target, test_size=args.test_size, random_state=args.seed)

    pca = [("PCA", PCATransformer(args.pca, args.seed))] if args.pca else []
    if args.with_reference:
        pca = [("PCA", sklearn.decomposition.PCA(n_components=args.pca, random_state=args.seed))]

    pipeline = sklearn.pipeline.Pipeline(
        [("scaling", sklearn.preprocessing.MinMaxScaler())] +
        pca +
        [("classifier", sklearn.linear_model.LogisticRegression(solver="saga", max_iter=args.max_iter, random_state=args.seed))]
    )
    pipeline.fit(train_data, train_target)

    test_accuracy = pipeline.score(test_data, test_target)
    return test_accuracy
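main reads several attributes off args, but the parser is not shown. A minimal argparse sketch covering exactly the attributes the function uses (the defaults are illustrative):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--test_size", default=0.25, type=float, help="fraction held out for testing")
parser.add_argument("--seed", default=42, type=int, help="random seed")
parser.add_argument("--pca", default=0, type=int, help="number of PCA components (0 disables PCA)")
parser.add_argument("--with_reference", action="store_true", help="use sklearn's PCA instead of PCATransformer")
parser.add_argument("--max_iter", default=100, type=int, help="LogisticRegression iteration cap")

if __name__ == "__main__":
    print(main(parser.parse_args()))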
Example #3
import numpy as np
import matplotlib.pyplot as plt
import sklearn.pipeline
import mixtape.tica, mixtape.cluster, mixtape.markovstatemodel, mixtape.utils

# featurizer, n_components, lag_time, train, test, and trajectories are assumed
# to be defined as in the neighboring examples.
n_states = 3
tica = mixtape.tica.tICA(n_components=n_components, lag_time=lag_time)
subsampler = mixtape.utils.Subsampler(lag_time=lag_time)
msm = mixtape.markovstatemodel.MarkovStateModel(n_timescales=n_components)
cluster = mixtape.cluster.GMM(n_components=n_states, covariance_type='full')
feature_pipeline = sklearn.pipeline.Pipeline([("features", featurizer),
                                              ('tica', tica)])
cluster_pipeline = sklearn.pipeline.Pipeline([("features", featurizer),
                                              ('tica', tica),
                                              ("cluster", cluster)])
pipeline = sklearn.pipeline.Pipeline([("features", featurizer), ('tica', tica),
                                      ("subsampler", subsampler),
                                      ("cluster", cluster), ("msm", msm)])

pipeline.fit(train)
pipeline.score(train), pipeline.score(test)

X_all = feature_pipeline.transform(trajectories)
q = np.concatenate(X_all)

# Keep only the per-dimension variances from the full covariance matrices.
covars_ = cluster.covars_.diagonal(axis1=1, axis2=2)

for i, j in [(0, 1)]:
    fig = plt.figure()
    plt.hexbin(q[:, i], q[:, j], bins='log')
    plt.errorbar(cluster.means_[:, i],
                 cluster.means_[:, j],
                 xerr=covars_[:, i]**0.5,
                 yerr=covars_[:, j]**0.5,
                 fmt='kx',
                 linewidth=4)
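A quick shape check on the covariance handling above: with covariance_type='full', the GMM's covars_ has shape (n_states, n_features, n_features), and the diagonal() call reduces it to per-dimension variances whose square roots become the error bars. A minimal sketch (shapes only, values are dummies):

import numpy as np

covars = np.zeros((3, 2, 2))  # (n_states, n_features, n_features)
assert covars.diagonal(axis1=1, axis2=2).shape == (3, 2)  # per-state, per-dimension variances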
Example #4
import mixtape.featurizer, mixtape.tica, mixtape.cluster, mixtape.markovstatemodel, mixtape.ghmm
import numpy as np
import mdtraj as md
from parameters import load_trajectories, build_full_featurizer
import sklearn.pipeline, sklearn.externals.joblib
import mixtape.utils

n_choose = 50
stride = 1
lag_time = 1
n_components = 2

trj0, trajectories, filenames = load_trajectories(stride=stride)

train = trajectories[0::2]
test = trajectories[1::2]

featurizer = sklearn.externals.joblib.load("./featurizer-%d-%d.job" % (n_components, n_choose))


for n_states in [10, 20, 30, 40, 50]:
    tica = mixtape.tica.tICA(n_components=n_components, lag_time=lag_time)
    msm = mixtape.markovstatemodel.MarkovStateModel(n_timescales=5)
    cluster = mixtape.cluster.KMeans(n_clusters=n_states)
    pipeline = sklearn.pipeline.Pipeline([("features", featurizer), ('tica', tica), ("cluster", cluster), ("msm", msm)])
    pipeline.fit(train)
    print(pipeline.score(train), pipeline.score(test))
    pipeline.fit(trajectories)
    print(msm.timescales_)
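Since the featurizer itself is loaded with joblib, the fitted pipeline can be persisted the same way once a good n_states is found. A sketch (the output filename is illustrative, not from the original):

# Persist the fitted pipeline next to the featurizer it was built from.
sklearn.externals.joblib.dump(pipeline, "./pipeline-%d-%d-%d.job" % (n_components, n_choose, n_states))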
Example #5
n_components_list = [8]
n_states_list = range(5, 60)
train_scores = np.zeros((len(n_components_list), len(n_states_list)))
test_scores = np.zeros((len(n_components_list), len(n_states_list)))

for i, n_components in enumerate(n_components_list):
    for j, n_states in enumerate(n_states_list):
        print(n_components, n_states)
        tica = mixtape.tica.tICA(n_components=n_components, lag_time=lag_time)
        subsampler = mixtape.utils.Subsampler(lag_time=lag_time)
        msm = mixtape.markovstatemodel.MarkovStateModel(n_timescales=n_components)
        cluster = mixtape.cluster.KMeans(n_states)
        pipeline = sklearn.pipeline.Pipeline(
            [("features", featurizer), ("tica", tica), ("subsampler", subsampler), ("cluster", cluster), ("msm", msm)]
        )
        pipeline.fit(train)
        train_scores[i, j] = pipeline.score(train)
        test_scores[i, j] = pipeline.score(test)


# Plotting helpers come from the pylab namespace (e.g. via %pylab in IPython).
plot(n_states_list, train_scores.T, "o", label="train")
plot(n_states_list, test_scores.T, "o", label="test")

xlabel("n_states")
ylabel("Score")
title("tICA KMeans SETD2")
legend(loc=0)
ylim(4, 10)
savefig("/home/kyleb/src/kyleabeauchamp/MixtapeTalk/figures/SETD2_tICA_KMeans.png")
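A natural follow-up, not part of the original snippet, is to read off the grid point that maximizes the held-out score:

import numpy as np

best = np.unravel_index(np.argmax(test_scores), test_scores.shape)
print("best n_components=%d, n_states=%d" % (n_components_list[best[0]], n_states_list[best[1]]))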
Example #6
train = trajectories[0::2]
test = trajectories[1::2]

featurizer = sklearn.externals.joblib.load("./featurizer-%d.job" % n_choose)

n_components = 3
n_states = 100
tica = mixtape.tica.tICA(n_components=n_components, lag_time=lag_time)
subsampler = mixtape.utils.Subsampler(lag_time=lag_time)
msm = mixtape.markovstatemodel.MarkovStateModel(n_timescales=n_components)
cluster = mixtape.cluster.KMeans(n_states)
feature_pipeline = sklearn.pipeline.Pipeline([("features", featurizer), ('tica', tica)])
cluster_pipeline = sklearn.pipeline.Pipeline([("features", featurizer), ('tica', tica), ("cluster", cluster)])
pipeline = sklearn.pipeline.Pipeline([("features", featurizer), ('tica', tica), ("subsampler", subsampler), ("cluster", cluster), ("msm", msm)])

pipeline.fit(train)
pipeline.score(train), pipeline.score(test)


X_all = feature_pipeline.transform(trajectories)
q = np.concatenate(X_all)
# Keep only per-dimension variances (assumes the clusterer exposes covars_, as GMM does).
covars_ = cluster.covars_.diagonal(axis1=1, axis2=2)

for i, j in [(0, 1)]:
    figure()
    hexbin(q[:, i], q[:, j], bins='log')
    errorbar(cluster.means_[:, i], cluster.means_[:, j], xerr=covars_[:, i] ** 0.5, yerr=covars_[:, j] ** 0.5, fmt='kx', linewidth=4)


states = cluster_pipeline.transform(trajectories)
ind = msm.draw_samples(states, 3)
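draw_samples returns, per state, pairs of (trajectory index, frame index) into the sequences it was given. Assuming that convention, the sampled frames can be pulled straight out of trajectories (mdtraj trajectories support integer indexing, which returns a single-frame trajectory):

# One single-frame trajectory per sampled (traj, frame) pair, grouped by state.
samples = [[trajectories[traj_i][frame_i] for traj_i, frame_i in state_samples]
           for state_samples in ind]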
Example #7
#in[152]
log_reg.score(X_test, y_test)

#in[153]
from sklearn.pipeline import Pipeline

#in[154]
pipeline = Pipeline([
    ("kmeans", KMeans(n_clusters=50, random_state=42)),
    ("log_reg", LogisticRegression(multi_class="ovr", solver="liblinear", random_state=42))
])
pipeline.fit(X_train, y_train)

#in[155]
pipeline.score(X_test, y_test)

#in[156]
# Relative reduction in error rate from adding the KMeans step
# (pipeline accuracy vs. the raw log_reg accuracy above).
1 - (1 - 0.9822222)/(1 - 0.9666666)

#in[157]
from sklearn.model_selection import GridSearchCV

#in[158]
param_grid = dict(kmeans__n_clusters=range(2, 100))  # search over the 'kmeans' step's cluster count
grid_clf = GridSearchCV(pipeline, param_grid, cv=3, verbose=2)
grid_clf.fit(X_train, y_train)

#in[159]
grid_clf.best_params_
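The search refits on the full training set with the best parameters (refit=True is the GridSearchCV default), so the usual next cells read off the best cross-validated score and re-evaluate on the held-out set:

#in[160]
grid_clf.best_score_

#in[161]
grid_clf.score(X_test, y_test)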