def test_pipeline():
    """Check that ``dl.Pipeline`` yields ``di.Value`` results matching scikit-learn.

    The wrapped pipeline is fitted on the module-level ``X``/``y`` data, its
    ``predict``/``score`` outputs are checked to be ``di.Value`` objects with
    stable keys, and the computed score is compared against the score of an
    identically configured ``sklearn.pipeline.Pipeline``.
    """

    def make_steps():
        # Fresh estimator instances on every call so the two pipelines
        # never share fitted state.
        return [("scale", StandardScaler()),
                ("fdr", SelectFdr()),
                ("svm", LinearSVC())]

    lazy_pipe = dl.Pipeline(make_steps()).fit(X, y)
    predicted = lazy_pipe.predict(X)
    score = lazy_pipe.score(X, y)

    # Both results are wrapped values; only compute() produces concrete data.
    for wrapped in (predicted, score):
        assert isinstance(wrapped, di.Value)
    assert isinstance(score.compute(), float)

    # Scoring the same inputs twice must give the same key and the same value.
    assert lazy_pipe.score(X, y).key == lazy_pipe.score(X, y).key
    assert score.compute() == score.compute()

    labels = predicted.compute()
    assert labels.shape == y.shape
    assert labels.dtype == y.dtype

    # Reference run with plain scikit-learn.
    reference = sklearn.pipeline.Pipeline(make_steps())
    reference.fit(X, y)
    reference.predict(X)  # result is not inspected; the call is kept as in the original
    reference_score = reference.score(X, y)
    assert reference_score == score.compute()
def main(args):
    """Fit a scaling (+ optional PCA) + logistic-regression pipeline on MNIST.

    Reads ``args.test_size``, ``args.seed``, ``args.pca``,
    ``args.with_reference`` and ``args.max_iter``.

    Returns the accuracy of the fitted pipeline on the held-out test split.
    """
    # Use the MNIST dataset.
    dataset = MNIST(data_size=5000)

    # Split the dataset into a train set and a test set.
    train_data, test_data, train_target, test_target = sklearn.model_selection.train_test_split(
        dataset.data,
        dataset.target,
        test_size=args.test_size,
        random_state=args.seed,
    )

    # Optional dimensionality-reduction step.
    if args.pca:
        pca_steps = [("PCA", PCATransformer(args.pca, args.seed))]
    else:
        pca_steps = []
    # NOTE(review): the reference PCA is installed whenever `with_reference`
    # is set, even if `args.pca` is falsy -- confirm this is intended.
    if args.with_reference:
        reference_pca = sklearn.decomposition.PCA(n_components=args.pca, random_state=args.seed)
        pca_steps = [("PCA", reference_pca)]

    classifier = sklearn.linear_model.LogisticRegression(
        solver="saga", max_iter=args.max_iter, random_state=args.seed)
    steps = [("scaling", sklearn.preprocessing.MinMaxScaler())]
    steps += pca_steps
    steps += [("classifier", classifier)]

    pipeline = sklearn.pipeline.Pipeline(steps)
    pipeline.fit(train_data, train_target)

    return pipeline.score(test_data, test_target)
n_states = 3 tica = mixtape.tica.tICA(n_components=n_components, lag_time=lag_time) subsampler = mixtape.utils.Subsampler(lag_time=lag_time) msm = mixtape.markovstatemodel.MarkovStateModel(n_timescales=n_components) cluster = mixtape.cluster.GMM(n_components=n_states, covariance_type='full') feature_pipeline = sklearn.pipeline.Pipeline([("features", featurizer), ('tica', tica)]) cluster_pipeline = sklearn.pipeline.Pipeline([("features", featurizer), ('tica', tica), ("cluster", cluster)]) pipeline = sklearn.pipeline.Pipeline([("features", featurizer), ('tica', tica), ("subsampler", subsampler), ("cluster", cluster), ("msm", msm)]) pipeline.fit(train) pipeline.score(train), pipeline.score(test) X_all = feature_pipeline.transform(trajectories) q = np.concatenate(X_all) covars_ = cluster.covars_ covars_ = cluster.covars_.diagonal(axis1=1, axis2=2) for i, j in [(0, 1)]: fig = plt.figure() plt.hexbin(q[:, i], q[:, j], bins='log') plt.errorbar(cluster.means_[:, i], cluster.means_[:, j], xerr=covars_[:, i]**0.5, yerr=covars_[:, j]**0.5, fmt='kx',
import mixtape.featurizer
import mixtape.tica
import mixtape.cluster
import mixtape.markovstatemodel
import mixtape.ghmm
import numpy as np
import mdtraj as md
from parameters import load_trajectories, build_full_featurizer
import sklearn.pipeline
import sklearn.externals.joblib
import mixtape.utils

# Script parameters.
n_choose = 50
stride = 1
lag_time = 1
n_components = 2

# Even-indexed trajectories are used for fitting, odd-indexed ones for testing.
trj0, trajectories, filenames = load_trajectories(stride=stride)
train = trajectories[::2]
test = trajectories[1::2]

# Load the previously saved featurizer for this (n_components, n_choose) pair.
featurizer_path = "./featurizer-%d-%d.job" % (n_components, n_choose)
featurizer = sklearn.externals.joblib.load(featurizer_path)

# Sweep the number of cluster states and report train/test scores for each.
for n_states in (10, 20, 30, 40, 50):
    tica = mixtape.tica.tICA(n_components=n_components, lag_time=lag_time)
    msm = mixtape.markovstatemodel.MarkovStateModel(n_timescales=5)
    cluster = mixtape.cluster.KMeans(n_clusters=n_states)

    steps = [
        ("features", featurizer),
        ("tica", tica),
        ("cluster", cluster),
        ("msm", msm),
    ]
    pipeline = sklearn.pipeline.Pipeline(steps)

    pipeline.fit(train)
    print(pipeline.score(train), pipeline.score(test))

    # NOTE(review): the refit on all trajectories is assumed to belong to the
    # loop body (timescales reported per n_states) -- confirm against the
    # original layout.
    pipeline.fit(trajectories)
    print(msm.timescales_)
# Grid of hyper-parameters to scan: tICA components x number of cluster states.
n_components_list = [8]
n_states_list = range(5, 60)

# One row per n_components value, one column per n_states value.
grid_shape = (len(n_components_list), len(n_states_list))
train_scores = np.zeros(grid_shape)
test_scores = np.zeros(grid_shape)

for i, n_components in enumerate(n_components_list):
    for j, n_states in enumerate(n_states_list):
        print(n_components, n_states)

        # Fresh estimators for every grid point.
        tica = mixtape.tica.tICA(n_components=n_components, lag_time=lag_time)
        subsampler = mixtape.utils.Subsampler(lag_time=lag_time)
        msm = mixtape.markovstatemodel.MarkovStateModel(n_timescales=n_components)
        cluster = mixtape.cluster.KMeans(n_states)

        steps = [
            ("features", featurizer),
            ("tica", tica),
            ("subsampler", subsampler),
            ("cluster", cluster),
            ("msm", msm),
        ]
        pipeline = sklearn.pipeline.Pipeline(steps)
        pipeline.fit(train)

        train_scores[i, j] = pipeline.score(train)
        test_scores[i, j] = pipeline.score(test)

# Transposed so that each n_components row becomes one plotted series.
plot(n_states_list, train_scores.T, "o", label="train")
plot(n_states_list, test_scores.T, "o", label="test")

xlabel("n_states")
ylabel("Score")
title("tICA KMeans SETD2")
legend(loc=0)
ylim(4, 10)

savefig("/home/kyleb/src/kyleabeauchamp/MixtapeTalk/figures/SETD2_tICA_KMeans.png")
# Odd-indexed trajectories form the held-out test set.
test = trajectories[1::2]

featurizer = sklearn.externals.joblib.load("./featurizer-%d.job" % n_choose)

n_components = 3
n_states = 100

tica = mixtape.tica.tICA(n_components=n_components, lag_time=lag_time)
subsampler = mixtape.utils.Subsampler(lag_time=lag_time)
msm = mixtape.markovstatemodel.MarkovStateModel(n_timescales=n_components)
cluster = mixtape.cluster.KMeans(n_states)

# Three pipelines share the same estimator objects, so fitting the full
# pipeline also fits the shorter feature/cluster pipelines.
feature_pipeline = sklearn.pipeline.Pipeline([("features", featurizer), ('tica', tica)])
cluster_pipeline = sklearn.pipeline.Pipeline([("features", featurizer), ('tica', tica), ("cluster", cluster)])
pipeline = sklearn.pipeline.Pipeline([("features", featurizer), ('tica', tica), ("subsampler", subsampler), ("cluster", cluster), ("msm", msm)])

pipeline.fit(train)
# The scores were previously computed and discarded; print them so the
# script actually reports them.
print(pipeline.score(train), pipeline.score(test))

# Project every trajectory onto the tICA components and pool all frames.
X_all = feature_pipeline.transform(trajectories)
q = np.concatenate(X_all)

# Mixture-model clusterers expose `means_`/`covars_`; KMeans-style
# clusterers (scikit-learn convention) only expose `cluster_centers_`.
# Reading `covars_` unconditionally fails for the KMeans used above, so
# fall back to plotting bare centers when no covariances are available.
if hasattr(cluster, "covars_"):
    centers = cluster.means_
    # Per-component variances: the diagonal of each covariance matrix.
    covars_ = cluster.covars_.diagonal(axis1=1, axis2=2)
else:
    centers = cluster.cluster_centers_
    covars_ = None

for i, j in [(0, 1)]:
    figure()
    hexbin(q[:, i], q[:, j], bins='log')
    # Error bars are the per-component standard deviations when available.
    xerr = None if covars_ is None else covars_[:, i] ** 0.5
    yerr = None if covars_ is None else covars_[:, j] ** 0.5
    errorbar(centers[:, i], centers[:, j], xerr=xerr, yerr=yerr, fmt='kx', linewidth=4)

# Assign every frame to a cluster state, then draw 3 samples via the MSM.
states = cluster_pipeline.transform(trajectories)
ind = msm.draw_samples(states, 3)
#in[152]
# Baseline: accuracy of the plain logistic regression on the test split.
log_reg.score(X_test, y_test)

#in[153]
from sklearn.pipeline import Pipeline

#in[154]
# K-Means as a preprocessing step feeding a logistic-regression classifier.
kmeans_step = ("kmeans", KMeans(n_clusters=50, random_state=42))
log_reg_step = (
    "log_reg",
    LogisticRegression(multi_class="ovr", solver="liblinear", random_state=42),
)
pipeline = Pipeline([kmeans_step, log_reg_step])
pipeline.fit(X_train, y_train)

#in[155]
pipeline.score(X_test, y_test)

#in[156]
# Relative reduction in error rate between two hard-coded accuracies
# (presumably the two scores above -- verify if the data changes).
1 - (1 - 0.9822222)/(1 - 0.9666666)

#in[157]
from sklearn.model_selection import GridSearchCV

#in[158]
# Search over the number of clusters used by the "kmeans" pipeline step.
param_grid = {"kmeans__n_clusters": range(2, 100)}
grid_clf = GridSearchCV(pipeline, param_grid, cv=3, verbose=2)
grid_clf.fit(X_train, y_train)

#in[159]
grid_clf.best_params_