def fit(self, X, y):
    """Fit the randomized model on the training data.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training data.

    y : array-like, shape = [n_samples]
        Target values. Will be cast to X's dtype if necessary

    Returns
    -------
    self : object
        The fitted estimator, with ``all_scores_`` (2-D) and ``scores_``
        (row-wise maximum of ``all_scores_``) set.

    Raises
    ------
    ValueError
        If ``self.memory`` is neither None, a string, nor a ``Memory``
        instance.
    """
    X, y = check_X_y(X, y, ['csr', 'csc'], y_numeric=True,
                     ensure_min_samples=2, estimator=self)
    X = as_float_array(X, copy=False)
    # Kept from the original: unpacking requires X to be exactly 2-D.
    n_samples, n_features = X.shape

    X, y, _X_offset, _y_offset, _X_scale = self._preprocess_data(
        X, y, self.fit_intercept, self.normalize)

    estimator_func, params = self._make_estimator_and_params(X, y)

    # Resolve ``self.memory`` into a Memory object; None and a path string
    # are both handed to the Memory constructor as ``cachedir``.
    cache = self.memory
    if cache is None or isinstance(cache, six.string_types):
        cache = Memory(cachedir=cache, verbose=0)
    elif not isinstance(cache, Memory):
        raise ValueError("'memory' should either be a string or"
                         " a sklearn.utils.Memory"
                         " instance, got 'memory={!r}' instead.".format(
                             type(cache)))

    # These arguments are listed in ``ignore`` so they do not take part in
    # the cache key.
    cached_resample = cache.cache(
        _resample_model, ignore=['verbose', 'n_jobs', 'pre_dispatch'])
    all_scores = cached_resample(
        estimator_func, X, y,
        scaling=self.scaling,
        n_resampling=self.n_resampling,
        n_jobs=self.n_jobs,
        verbose=self.verbose,
        pre_dispatch=self.pre_dispatch,
        random_state=self.random_state,
        sample_fraction=self.sample_fraction,
        **params)

    # Promote a 1-D result to a single-column 2-D array.
    if all_scores.ndim == 1:
        all_scores = all_scores[:, np.newaxis]

    self.all_scores_ = all_scores
    self.scores_ = np.max(self.all_scores_, axis=1)
    return self
def test_make_pipeline_memory():
    """Check that ``make_pipeline`` forwards ``memory`` and defaults to None.

    The ``Memory`` object is built with ``cachedir`` for joblib < 0.12 and
    with ``location`` otherwise. The temporary cache directory is removed
    even when an assertion fails.
    """
    cachedir = mkdtemp()
    try:
        if LooseVersion(joblib_version) < LooseVersion('0.12'):
            # Deal with change of API in joblib
            memory = Memory(cachedir=cachedir, verbose=10)
        else:
            memory = Memory(location=cachedir, verbose=10)

        pipeline = make_pipeline(DummyTransf(), SVC(), memory=memory)
        assert_true(pipeline.memory is memory)

        pipeline = make_pipeline(DummyTransf(), SVC())
        assert_true(pipeline.memory is None)
    finally:
        # Clean up unconditionally so a failing test does not leak the
        # temporary directory.
        shutil.rmtree(cachedir)
def test_pipeline_memory():
    """Check that a ``Pipeline`` built with ``memory`` caches its transformer.

    A cached and an uncached pipeline are fitted on iris and must agree on
    every prediction method, on the score, and on the transformer's
    ``means_``. Refitting the cached pipeline, or fitting a fresh pipeline
    with a differently named step, must leave the transformer timestamp
    unchanged.
    """
    iris = load_iris()
    X, y = iris.data, iris.target

    def check_same_outputs(reference, candidate, candidate_step):
        # Compare the uncached ``reference`` pipeline with ``candidate``;
        # ``candidate_step`` is the name of the candidate's transformer step.
        for method in ('predict', 'predict_proba', 'predict_log_proba'):
            assert_array_equal(getattr(reference, method)(X),
                               getattr(candidate, method)(X))
        assert_array_equal(reference.score(X, y), candidate.score(X, y))
        assert_array_equal(reference.named_steps['transf'].means_,
                           candidate.named_steps[candidate_step].means_)

    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir=cachedir, verbose=10)

        # Test with Transformer + SVC
        clf = SVC(gamma='scale', probability=True, random_state=0)
        transf = DummyTransf()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        ts = cached_pipe.named_steps['transf'].timestamp_

        # Check that cached_pipe and pipe yield identical results
        check_same_outputs(pipe, cached_pipe, 'transf')
        assert_false(hasattr(transf, 'means_'))

        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        check_same_outputs(pipe, cached_pipe, 'transf')
        assert_equal(ts, cached_pipe.named_steps['transf'].timestamp_)

        # Create a new pipeline with cloned estimators
        # Check that even changing the name step does not affect the cache hit
        clf_2 = SVC(gamma='scale', probability=True, random_state=0)
        transf_2 = DummyTransf()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        check_same_outputs(pipe, cached_pipe_2, 'transf_2')
        assert_equal(ts, cached_pipe_2.named_steps['transf_2'].timestamp_)
    finally:
        shutil.rmtree(cachedir)
def test_make_pipeline_memory():
    """Check that ``make_pipeline`` forwards ``memory`` and defaults to None.

    The temporary cache directory is removed even when an assertion fails.
    """
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir=cachedir)

        pipeline = make_pipeline(DummyTransf(), SVC(), memory=memory)
        assert_true(pipeline.memory is memory)

        pipeline = make_pipeline(DummyTransf(), SVC())
        assert_true(pipeline.memory is None)
    finally:
        # Clean up unconditionally so a failing test does not leak the
        # temporary directory.
        shutil.rmtree(cachedir)
# Each row of X is reshaped to a size x size image, Gaussian-smoothed, and
# written back in place.
for x in X:  # smooth data
    x[:] = ndimage.gaussian_filter(x.reshape(size, size), sigma=1.0).ravel()

# Standardize every column of X in place: zero mean, unit standard deviation.
X -= X.mean(axis=0)
X /= X.std(axis=0)

y = np.dot(X, coef.ravel())
noise = np.random.randn(y.shape[0])
# Scale the noise so that norm(y) / norm(noise_coef * noise) == exp(snr / 20).
noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2)
y += noise_coef * noise  # add noise

# #############################################################################
# Compute the coefs of a Bayesian Ridge with GridSearch
cv = KFold(2)  # cross-validation generator for model selection
ridge = BayesianRidge()
# NOTE(review): nothing in this excerpt removes `cachedir`; confirm it is
# cleaned up later in the script.
cachedir = tempfile.mkdtemp()
mem = Memory(cachedir=cachedir, verbose=1)

# Ward agglomeration followed by BayesianRidge
connectivity = grid_to_graph(n_x=size, n_y=size)
ward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity,
                            memory=mem)
clf = Pipeline([('ward', ward), ('ridge', ridge)])
# Select the optimal number of parcels with grid search
clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv)
clf.fit(X, y)  # set the best parameters
# Take the coefficients of the last pipeline step ('ridge') and map them back
# through the first step ('ward') with inverse_transform.
coef_ = clf.best_estimator_.steps[-1][1].coef_
coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_)
coef_agglomeration_ = coef_.reshape(size, size)

# Anova univariate feature selection followed by BayesianRidge
import matplotlib.pyplot as plt
import pandas

from sklearn.utils.testing import ignore_warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition.nmf import NMF
from sklearn.decomposition.nmf import _initialize_nmf
from sklearn.decomposition.nmf import _beta_divergence
from sklearn.decomposition.nmf import INTEGER_TYPES, _check_init
from sklearn.utils import Memory
from sklearn.exceptions import ConvergenceWarning
from sklearn.utils.extmath import safe_sparse_dot, squared_norm
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted, check_non_negative

mem = Memory(cachedir='.', verbose=0)

###################
# Start of _PGNMF #
###################
# This class implements a projected gradient solver for the NMF.
# The projected gradient solver was removed from scikit-learn in version 0.19,
# and a simplified copy is used here for comparison purpose only.
# It is not tested, and it may change or disappear without notice.


def _norm(x):
    """Return the square root of ``squared_norm(x)``.

    Dot product-based Euclidean norm implementation.

    See: http://fseoane.net/blog/2011/computing-the-vector-norm/
    """
    sum_of_squares = squared_norm(x)
    return np.sqrt(sum_of_squares)
import numpy as np

from sklearn.datasets import fetch_covtype, get_data_home
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import zero_one_loss
from sklearn.utils import Memory
from sklearn.utils import check_array

# Memoize the data extraction and memory map the resulting
# train / test splits in readonly mode
memory = Memory(os.path.join(get_data_home(), 'covertype_benchmark_data'),
                mmap_mode='r')


@memory.cache
def load_data(dtype=np.float32, order='C', random_state=13):
    """Load the data, then cache and memmap the train/test split

    Parameters
    ----------
    dtype : numpy dtype, default np.float32
        Forwarded to ``check_array`` when converting the feature matrix.
    order : str, default 'C'
        Forwarded to ``check_array`` when converting the feature matrix.
    random_state : int, default 13
        Forwarded to ``fetch_covtype``, which is called with
        ``shuffle=True``.
    """
    ######################################################################
    # Load dataset
    print("Loading dataset...")
    data = fetch_covtype(download_if_missing=True, shuffle=True,
                         random_state=random_state)
    X = check_array(data['data'], dtype=dtype, order=order)
    # Binary target: 1 where the class is not 1, else 0. Builtin ``int`` is
    # used because the ``np.int`` alias was removed in NumPy 1.24.
    y = (data['target'] != 1).astype(int)

    # Create train-test split (as [Joachims, 2006])
import numpy as np
from sklearn.utils import Memory
from sklearn.datasets import load_svmlight_file

mem = Memory("./mycache")

training_dataset = "./a9a/a9a"
testing_dataset = "./a9a/a9a.t"
eps = 10e-3
lamb = 0.03


@mem.cache
def get_data(path):
    """Load a svmlight file and return ``(X, y)``.

    ``X`` is the densified feature matrix. ``y`` is reshaped to a single
    column and every -1 label is replaced by 0.
    """
    loaded = load_svmlight_file(path)
    dense_features = loaded[0].todense()
    targets = loaded[1].reshape((-1, 1))  # make y a proper vector
    targets[targets == -1] = 0  # make sure labels are {0, 1}
    return dense_features, targets


def w_init(d):
    """Return an all-zero weight column of shape ``(d, 1)``."""
    return np.zeros(shape=(d, 1))


def p(y, X, w):
    """Return ``exp(y * X.w) / (1 + exp(X.w))`` for a label ``y`` in {0, 1}.

    ``X`` must be 2-D and ``w`` must have shape ``(n_features, 1)``; both
    conditions, and the label value, are checked with ``assert``.
    """
    _, n_features = X.shape
    assert w.shape == (n_features, 1)
    assert y == 0 or y == 1
    scores = np.dot(X, w)
    numerator = np.exp(y * scores)
    denominator = 1 + np.exp(scores)
    return numerator / denominator


def pred(X, w):
    N, d = X.shape
embeddings_k2[:, 0], 'y_k2': embeddings_k2[:, 1], 'z_k2': embeddings_k2[:, 2] }) print('clusters') import hdbscan std_args = { 'algorithm': 'best', 'alpha': 1.0, 'approx_min_span_tree': True, 'gen_min_span_tree': False, 'leaf_size': 40, 'memory': Memory(cachedir=None), 'metric': 'euclidean', 'min_cluster_size': 15, 'min_samples': None, 'p': None } def cluster(embeddings_array, **kwargs): print('dimensionality', embeddings_array.shape) clusterer = hdbscan.HDBSCAN(**kwargs) clusterer.fit(np.array(embeddings_array)) print('number of clusters: ', max(clusterer.labels_)) return clusterer.labels_
resources = [
    (redirects_url, redirects_filename),
    (page_links_url, page_links_filename),
]

# Download each resource that is not already present on disk.
for url, filename in resources:
    if not os.path.exists(filename):
        print("Downloading data from '%s', please wait..." % url)
        opener = urlopen(url)
        try:
            # ``with`` guarantees the output file is flushed and closed; the
            # original left the handle to the garbage collector.
            with open(filename, 'wb') as out_file:
                out_file.write(opener.read())
        finally:
            opener.close()
        print()


# #############################################################################
# Loading the redirect files

memory = Memory(cachedir=".")


def index(redirects, index_map, k):
    """Find the index of an article name after redirect resolution

    ``k`` is first replaced by ``redirects[k]`` when such an entry exists.
    The resolved name is then looked up in ``index_map``; a name not yet
    present is inserted with the next free index, ``len(index_map)``.
    """
    k = redirects.get(k, k)
    return index_map.setdefault(k, len(index_map))


DBPEDIA_RESOURCE_PREFIX_LEN = len("http://dbpedia.org/resource/")
# Skips the leading '<' plus the common prefix and drops the trailing '>'.
SHORTNAME_SLICE = slice(DBPEDIA_RESOURCE_PREFIX_LEN + 1, -1)


def short_name(nt_uri):
    """Remove the < and > URI markers and the common URI prefix"""
    return nt_uri[SHORTNAME_SLICE]
import json
import argparse

from sklearn.utils import Memory
from sklearn.datasets import fetch_mldata
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from sklearn.utils import check_array
from sklearn.utils import shuffle as _shuffle

# Output directory; it also holds the on-disk cache created below.
LOG_DIR = "mnist_tsne_output"
if not os.path.exists(LOG_DIR):
    os.mkdir(LOG_DIR)


# Cache results under LOG_DIR; mmap_mode='r' requests read-only memory maps.
memory = Memory(os.path.join(LOG_DIR, 'mnist_tsne_benchmark_data'),
                mmap_mode='r')


@memory.cache
def load_data(dtype=np.float32, order='C', shuffle=True, seed=0):
    """Load the data, then cache and memmap the train/test split

    ``dtype`` and ``order`` are forwarded to ``check_array``. When
    ``shuffle`` is true, X and y are shuffled together with
    ``random_state=seed``.
    """
    print("Loading dataset...")
    data = fetch_mldata('MNIST original')

    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]

    if shuffle:
        X, y = _shuffle(X, y, random_state=seed)

    # Normalize features
# Each row of X is reshaped to a size x size image, Gaussian-smoothed, and
# written back in place.
for x in X:  # smooth data
    x[:] = ndimage.gaussian_filter(x.reshape(size, size), sigma=1.0).ravel()

# Standardize every column of X in place: zero mean, unit standard deviation.
X -= X.mean(axis=0)
X /= X.std(axis=0)

y = np.dot(X, coef.ravel())
noise = np.random.randn(y.shape[0])
# Scale the noise so that norm(y) / norm(noise_coef * noise) == exp(snr / 20).
noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2)
y += noise_coef * noise  # add noise

# #############################################################################
# Compute the coefs of a Bayesian Ridge with GridSearch
cv = KFold(2)  # cross-validation generator for model selection
ridge = BayesianRidge()
# NOTE(review): nothing in this excerpt removes `cachedir`; confirm it is
# cleaned up later in the script.
cachedir = tempfile.mkdtemp()
mem = Memory(cachedir=cachedir, verbose=1)

# Ward agglomeration followed by BayesianRidge
connectivity = grid_to_graph(n_x=size, n_y=size)
ward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity,
                            memory=mem)
clf = Pipeline([('ward', ward), ('ridge', ridge)])
# Select the optimal number of parcels with grid search
clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv)
clf.fit(X, y)  # set the best parameters
# Take the coefficients of the last pipeline step ('ridge') and map them back
# through the first step ('ward') with inverse_transform.
coef_ = clf.best_estimator_.steps[-1][1].coef_
coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_)
coef_agglomeration_ = coef_.reshape(size, size)

# Anova univariate feature selection followed by BayesianRidge
# Wrap f_regression with the same Memory object used for the Ward step.
f_regression = mem.cache(feature_selection.f_regression)  # caching function
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.utils import Memory
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_approximation import RBFSampler
from sklearn.metrics import zero_one_loss
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import check_array
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# Memoize the data extraction and memory map the resulting
# train / test splits in readonly mode
memory = Memory(os.path.join(get_data_home(), 'mnist_benchmark_data'),
                mmap_mode='r')


@memory.cache
def load_data(dtype=np.float32, order='F'):
    """Load the data, then cache and memmap the train/test split

    ``dtype`` and ``order`` are forwarded to ``check_array`` when the
    feature matrix is converted.
    """
    ######################################################################
    # Load dataset
    print("Loading dataset...")
    data = fetch_mldata('MNIST original')
    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]

    # Normalize features
    # The division builds a new array; it is not performed in place.
    X = X / 255
def build_model(cv=StratifiedKFold(3)):
    """Build a full nlp classification pipeline with GridSearchCV for
    best parameters.

    The pipeline joins a text branch (message selection, count
    vectorization, tf-idf) with a one-hot encoded ``genre`` branch, and
    feeds the result to a ``MultiOutputClassifier`` wrapping an
    ``SGDClassifier``. Pipeline steps are cached in a temporary directory
    that is removed when the interpreter exits.

    ARGUMENTS:
    cv: cross-validation splitter handed to GridSearchCV, default is
        StratifiedKFold(3)

    RETURNS:
    grid search object that can be fitted to the data to transform it and
    find the best parameters.
    """
    import atexit

    # define classifier and scoring function
    clf = SGDClassifier(loss='log', fit_intercept=False,
                        class_weight='balanced', max_iter=5, tol=None,
                        random_state=1, n_jobs=-1)
    scorer = make_scorer(f1_score, average='weighted')

    # create temporary folder to store pipeline transformers (cache)
    cachedir = mkdtemp()
    # The cache must outlive this function, because the returned grid search
    # uses it during fit(). Removal is therefore scheduled for interpreter
    # exit; the original ``rmtree`` sat after ``return`` and never ran.
    atexit.register(rmtree, cachedir, ignore_errors=True)
    memory = Memory(location=cachedir, verbose=1)

    full_pipe = Pipeline(
        [('features', FeatureUnion([
            ('text', Pipeline([
                ('text_select', MessageSelector('message')),
                ('vect', CountVectorizer(tokenizer=tokenize_text)),
                ('tfidf', TfidfTransformer()),
            ])),
            ('genre', Pipeline([
                ('cat_select', CategoricalsSelector('genre')),
                ('ohe', OneHotEncoder()),
            ])),
        ])),
         ('clf', MultiOutputClassifier(clf))],
        memory=memory)

    parameters = {
        'features__text__vect__max_df': [0.8, 0.9],
    }

    # create grid search object (separate name: do not shadow the ``cv``
    # argument with the object being returned)
    grid_search = GridSearchCV(full_pipe, param_grid=parameters,
                               scoring=scorer, cv=cv, error_score='raise',
                               n_jobs=1, verbose=1)

    return grid_search
from IPM import IPM
from sklearn.utils import Memory
from sklearn.datasets import load_svmlight_file
import os
import numpy as np
from scipy.sparse import coo_matrix,block_diag,eye,dia_matrix,csc_matrix,hstack,vstack
from scipy.sparse.linalg import spsolve
from numpy.linalg import norm

mem = Memory('./cache')

# NOTE(review): the imports and the Memory instance below repeat the block
# above verbatim; one of the two copies looks redundant.
from IPM import IPM
from sklearn.utils import Memory
from sklearn.datasets import load_svmlight_file
import os
import numpy as np
from scipy.sparse import coo_matrix, block_diag, eye, dia_matrix, csc_matrix, hstack, vstack
from scipy.sparse.linalg import spsolve
from numpy.linalg import norm

mem = Memory('./cache')


class SVM(IPM):
    # Subclass of IPM that stores a ``tao`` value and loads svmlight data.

    def __init__(self, tao):
        # NOTE(review): constructing an instance changes the process working
        # directory to './IPM-SVM' - confirm this side effect is intended.
        os.chdir('./IPM-SVM')
        self.tao = tao
        pass

    def load_data(self, filename):
        data = load_svmlight_file(filename)
# since it could be used again. Using a pipeline in ``GridSearchCV`` triggers # such situations. Therefore, we use the argument ``memory`` to enable caching. # # .. warning:: # Note that this example is, however, only an illustration since for this # specific case fitting PCA is not necessarily slower than loading the # cache. Hence, use the ``memory`` constructor parameter when the fitting # of a transformer is costly. from tempfile import mkdtemp from shutil import rmtree from sklearn.utils import Memory # Create a temporary folder to store the transformers of the pipeline cachedir = mkdtemp() memory = Memory(cachedir=cachedir, verbose=10) cached_pipe = Pipeline([('reduce_dim', PCA()), ('classify', LinearSVC())], memory=memory) # This time, a cached pipeline will be used within the grid search grid = GridSearchCV(cached_pipe, cv=3, n_jobs=1, param_grid=param_grid) digits = load_digits() grid.fit(digits.data, digits.target) # Delete the temporary cache before exiting rmtree(cachedir) ############################################################################### # The ``PCA`` fitting is only computed at the evaluation of the first # configuration of the ``C`` parameter of the ``LinearSVC`` classifier. The
def exp(solvers, penalties, single_target,
        n_samples=30000, max_iter=20,
        dataset='rcv1', n_jobs=1, skip_slow=False):
    """Fit every (solver, penalty) pair on a dataset and dump the results.

    Parameters
    ----------
    solvers, penalties : iterables
        Every combination is passed to ``fit_single``. Both are iterated
        twice, so they must be re-iterable (e.g. lists).
    single_target : bool
        When true, labels of 'rcv1', 'digits' and '20newspaper' are
        collapsed to {0, 1} using a per-dataset threshold. 'iris' is left
        unchanged.
    n_samples : int, default 30000
        Only the first ``n_samples`` rows of X and y are used.
    max_iter : int, default 20
        Forwarded to ``fit_single``.
    dataset : {'rcv1', 'digits', 'iris', '20newspaper'}, default 'rcv1'
        Which dataset to load.
    n_jobs : int, default 1
        Forwarded to ``Parallel``.
    skip_slow : bool, default False
        Forwarded to ``fit_single``. When true, the ('lightning', 'l1')
        result is not recorded.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the supported names.

    Side effects
    ------------
    Caches ``fit_single`` under ``~/cache`` and writes the collected
    results to ``bench_saga.json`` in the current directory.
    """
    mem = Memory(cachedir=expanduser('~/cache'), verbose=0)

    if dataset == 'rcv1':
        rcv1 = fetch_rcv1()

        lbin = LabelBinarizer()
        lbin.fit(rcv1.target_names)

        X = rcv1.data
        y = rcv1.target
        y = lbin.inverse_transform(y)
        le = LabelEncoder()
        y = le.fit_transform(y)
        if single_target:
            y_n = y.copy()
            y_n[y > 16] = 1
            y_n[y <= 16] = 0
            y = y_n

    elif dataset == 'digits':
        digits = load_digits()
        X, y = digits.data, digits.target
        if single_target:
            y_n = y.copy()
            y_n[y < 5] = 1
            y_n[y >= 5] = 0
            y = y_n
    elif dataset == 'iris':
        iris = load_iris()
        X, y = iris.data, iris.target
    elif dataset == '20newspaper':
        ng = fetch_20newsgroups_vectorized()
        X = ng.data
        y = ng.target
        if single_target:
            y_n = y.copy()
            # Both masks now use the same threshold. The original second
            # mask was ``y <= 16`` (apparently copied from the rcv1 branch),
            # which silently overrode the ``y > 4`` split.
            y_n[y > 4] = 1
            y_n[y <= 4] = 0
            y = y_n
    else:
        # Fail with a clear message instead of an UnboundLocalError on X.
        raise ValueError("Unknown dataset %r" % (dataset,))

    X = X[:n_samples]
    y = y[:n_samples]

    cached_fit = mem.cache(fit_single)
    out = Parallel(n_jobs=n_jobs, mmap_mode=None)(
        delayed(cached_fit)(solver, X, y,
                            penalty=penalty, single_target=single_target,
                            C=1, max_iter=max_iter, skip_slow=skip_slow)
        for solver in solvers
        for penalty in penalties)

    res = []
    idx = 0
    for solver in solvers:
        for penalty in penalties:
            if not (skip_slow and solver == 'lightning' and penalty == 'l1'):
                lr, times, train_scores, test_scores, accuracies = out[idx]
                this_res = dict(solver=solver, penalty=penalty,
                                single_target=single_target,
                                times=times, train_scores=train_scores,
                                test_scores=test_scores,
                                accuracies=accuracies)
                res.append(this_res)
            # ``out`` holds one entry per (solver, penalty) pair, skipped or
            # not, so the index advances on every iteration.
            idx += 1

    with open('bench_saga.json', 'w+') as f:
        json.dump(res, f)