def _save_model(model, outdir: Path):
    """
    Saves the model to disk so it can be loaded later.

    :param model: model to save
    :param outdir: directory where it will be saved
    """
    dump(model, str(outdir / "model.pth"))
    # Touch a flag file so downstream consumers can tell the save completed.
    flag = outdir / '.SUCCESS'
    flag.touch()
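# Hedged usage sketch (not from the original source): round-trips the model
# saved by _save_model above, assuming `dump` is joblib's and using a
# hypothetical output directory. The '.SUCCESS' flag gates the load.
from pathlib import Path
from joblib import load

outdir = Path("artifacts")  # hypothetical directory
if (outdir / ".SUCCESS").exists():
    model = load(str(outdir / "model.pth"))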
def setup_module():
    # Create some memory mapped data
    global X_mm, y_mm, y_ml_mm, TEMP_FOLDER, ESTIMATORS
    TEMP_FOLDER = tempfile.mkdtemp(prefix='sklearn_test_score_objects_')
    X, y = make_classification(n_samples=30, n_features=5, random_state=0)
    _, y_ml = make_multilabel_classification(n_samples=X.shape[0],
                                             random_state=0)
    filename = os.path.join(TEMP_FOLDER, 'test_data.pkl')
    _joblib.dump((X, y, y_ml), filename)
    X_mm, y_mm, y_ml_mm = _joblib.load(filename, mmap_mode='r')
    ESTIMATORS = _make_estimators(X_mm, y_mm, y_ml_mm)
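# Hedged companion sketch (assumption, not from the source): a matching
# teardown for setup_module above, dropping the memory-mapped arrays so their
# file handles close before the temporary folder is removed.
import shutil

def teardown_module():
    global X_mm, y_mm, y_ml_mm
    # Release the mmap-backed arrays before deleting the backing files.
    X_mm, y_mm, y_ml_mm = None, None, None
    shutil.rmtree(TEMP_FOLDER)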
def train(self, X: np.ndarray, label: str, n_components: int = 10,
          cov_type: str = 'diag', n_iter: int = 1000) -> None:
    print('Now training model for {} ----->'.format(label))
    self.label = label
    self.model = hmm.GaussianHMM(n_components=n_components,
                                 covariance_type=cov_type,
                                 n_iter=n_iter)
    np.seterr(all='ignore')
    self.model.fit(X)  # Train here
    # Persist the fitted HMM and its label next to each other.
    _joblib.dump(self.model, 'weights/' + label + ".m")
    with open('weights/' + label + '.lb', 'w') as f:
        f.write(label)
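# Hedged usage sketch (assumption, not from the source): loads the per-label
# HMMs saved by train() above and classifies a feature sequence by the label
# whose model yields the highest log-likelihood. `labels` and the 'weights/'
# layout are hypothetical.
import numpy as np
from sklearn.utils import _joblib

def predict_label(X: np.ndarray, labels):
    scores = {}
    for label in labels:
        model = _joblib.load('weights/' + label + '.m')
        scores[label] = model.score(X)  # log-likelihood under this HMM
    return max(scores, key=scores.get)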
def test_pickling(tmpdir, sample_weight):
    # Make sure that predictions are the same before and after pickling. Used
    # to be a bug because sample_weights wasn't pickled and the resulting tree
    # would miss some info.
    kde = KernelDensity()
    data = np.reshape([1., 2., 3.], (-1, 1))
    kde.fit(data, sample_weight=sample_weight)

    X = np.reshape([1.1, 2.1], (-1, 1))
    scores = kde.score_samples(X)

    file_path = str(tmpdir.join('dump.pkl'))
    _joblib.dump(kde, file_path)
    kde = _joblib.load(file_path)
    scores_pickled = kde.score_samples(X)

    assert_allclose(scores, scores_pickled)
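# Hedged sketch (assumption, not from the source): `sample_weight` in the test
# above is a pytest parameter; a parametrization along these lines would
# exercise both the unweighted and weighted code paths.
import pytest

@pytest.mark.parametrize('sample_weight', [None, [0.1, 0.2, 0.3]])
def test_pickling(tmpdir, sample_weight):
    ...  # body as defined above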
def save(self, overwrite=True, **kwargs):
    print("Saving model")
    path = models_store_path + self.name
    # Pickle the full wrapper object under the store path, and dump the
    # underlying estimator separately (note: to the current working directory).
    pickle.dump(self, open(path, 'wb'))
    _joblib.dump(self.model, self.name)
import warnings
warnings.filterwarnings("ignore")

# Save Model Using joblib
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.utils._joblib import dump
from sklearn.utils._joblib import load

filename = 'diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33,
                                                    random_state=7)

# Fit the model on 67% of the data, holding out 33% for evaluation
model = LogisticRegression()
model.fit(X_train, Y_train)

# save the model to disk
filename = 'finalized_model2.sav'
dump(model, filename)

# some time later...

# load the model from disk
loaded_model = load(filename)
result = loaded_model.score(X_test, Y_test)
print(result)
def save(self, overwrite=True, **kwargs):
    print("Saving model")
    abs_path = os.path.abspath(os.path.dirname(__file__))
    path = models_store_path + self.name
    _joblib.dump(self.model, os.path.join(abs_path, path))
def fetch_species_distributions(data_home=None, download_if_missing=True):
    """Loader for species distribution dataset from Phillips et al. (2006)

    Read more in the :ref:`User Guide <datasets>`.

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    download_if_missing : optional, True by default
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    Returns
    -------
    The data is returned as a Bunch object with the following attributes:

    coverages : array, shape = [14, 1592, 1212]
        These represent the 14 features measured at each point of the map
        grid. The latitude/longitude values for the grid are discussed below.
        Missing data is represented by the value -9999.

    train : record array, shape = (1624,)
        The training points for the data. Each point has three fields:

        - train['species'] is the species name
        - train['dd long'] is the longitude, in degrees
        - train['dd lat'] is the latitude, in degrees

    test : record array, shape = (620,)
        The test points for the data. Same format as the training data.

    Nx, Ny : integers
        The number of longitudes (x) and latitudes (y) in the grid

    x_left_lower_corner, y_left_lower_corner : floats
        The (x,y) position of the lower-left corner, in degrees

    grid_size : float
        The spacing between points of the grid, in degrees

    References
    ----------
    * `"Maximum entropy modeling of species geographic distributions"
      <http://rob.schapire.net/papers/ecolmod.pdf>`_ S. J. Phillips,
      R. P. Anderson, R. E. Schapire - Ecological Modelling, 190:231-259,
      2006.

    Notes
    -----
    This dataset represents the geographic distribution of species.
    The dataset is provided by Phillips et al. (2006).

    The two species are:

    - `"Bradypus variegatus" <http://www.iucnredlist.org/details/3038/0>`_ ,
      the Brown-throated Sloth.

    - `"Microryzomys minutus" <http://www.iucnredlist.org/details/13408/0>`_ ,
      also known as the Forest Small Rice Rat, a rodent that lives in Peru,
      Colombia, Ecuador, and Venezuela.

    For an example of using this dataset with scikit-learn, see
    :ref:`examples/applications/plot_species_distribution_modeling.py
    <sphx_glr_auto_examples_applications_plot_species_distribution_modeling.py>`.
    """
    data_home = get_data_home(data_home)
    if not exists(data_home):
        makedirs(data_home)

    # Define parameters for the data files. These should not be changed
    # unless the data model changes. They will be saved in the npz file
    # with the downloaded data.
    extra_params = dict(x_left_lower_corner=-94.8,
                        Nx=1212,
                        y_left_lower_corner=-56.05,
                        Ny=1592,
                        grid_size=0.05)
    dtype = np.int16

    archive_path = _pkl_filepath(data_home, DATA_ARCHIVE_NAME)

    if not exists(archive_path):
        if not download_if_missing:
            raise IOError("Data not found and `download_if_missing` is False")
        logger.info('Downloading species data from %s to %s' % (
            SAMPLES.url, data_home))
        samples_path = _fetch_remote(SAMPLES, dirname=data_home)
        with np.load(samples_path) as X:  # samples.zip is a valid npz
            for f in X.files:
                fhandle = BytesIO(X[f])
                if 'train' in f:
                    train = _load_csv(fhandle)
                if 'test' in f:
                    test = _load_csv(fhandle)
        remove(samples_path)

        logger.info('Downloading coverage data from %s to %s' % (
            COVERAGES.url, data_home))
        coverages_path = _fetch_remote(COVERAGES, dirname=data_home)
        with np.load(coverages_path) as X:  # coverages.zip is a valid npz
            coverages = []
            for f in X.files:
                fhandle = BytesIO(X[f])
                logger.debug(' - converting {}'.format(f))
                coverages.append(_load_coverage(fhandle))
            coverages = np.asarray(coverages, dtype=dtype)
        remove(coverages_path)

        bunch = Bunch(coverages=coverages,
                      test=test,
                      train=train,
                      **extra_params)
        _joblib.dump(bunch, archive_path, compress=9)
    else:
        bunch = _joblib.load(archive_path)

    return bunch
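# Hedged usage sketch (assumption, not from the source): reconstructs the
# latitude/longitude grid described in the docstring above from the Bunch
# attributes, using the corner coordinates and grid spacing.
import numpy as np

data = fetch_species_distributions()
xgrid = data.x_left_lower_corner + data.grid_size * np.arange(data.Nx)
ygrid = data.y_left_lower_corner + data.grid_size * np.arange(data.Ny)
# data.coverages[i, j, k] is feature i at longitude xgrid[k]; the grid rows
# run north to south, so row j corresponds to latitude ygrid[-(j + 1)].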
def save(self, overwrite=True, **kwargs):
    print("Saving model")
    # path = models_store_path + self.name
    _joblib.dump(self.model, 'oursvm.pkl')
def fetch_uci_libras(data_home=None, shuffle=False, random_state=0,
                     download_if_missing=True):
    """Load the UCI Libras Movement dataset (classification).

    Download it if necessary.

    ================= =====================
    Classes                              15
    Samples total                       360
    Dimensionality                       90
    Features                           real
    ================= =====================

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    shuffle : boolean, optional
        If True the order of the dataset is shuffled to avoid having
        samples of the same class grouped.

    random_state : int, RandomState instance or None (default=0)
        Determines random number generation for dataset shuffling. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    download_if_missing : optional, True by default
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    Returns
    -------
    data : numpy array of shape (360, 90)
        Each row corresponds to a Libras feature vector of dimension 90.

    target : numpy array of shape (360,)
        Labels associated with each sample. Those labels are from range(15)
        and correspond to the class IDs.
    """
    global LIBRAS
    data_home = get_data_home(data_home=data_home)
    if not exists(data_home):
        makedirs(data_home)
    filepath = _pkl_filepath(data_home, 'uci_libras.pkz')
    if not exists(filepath):
        if not download_if_missing:
            raise IOError("Data not found and `download_if_missing` is False")
        print('downloading UCI LIBRAS from %s to %s' % (LIBRAS.url, data_home))
        data_path = _fetch_remote(LIBRAS, dirname=data_home)
        libras = np.genfromtxt(data_path, delimiter=",")
        _joblib.dump(libras, filepath, compress=6)
        remove(data_path)
    else:
        libras = _joblib.load(filepath)

    feature = libras[:, 0:-1]
    target = libras[:, -1]
    if shuffle:
        random_state = check_random_state(random_state)
        order = random_state.permutation(len(libras))
        # Apply the same permutation to features and targets.
        feature = feature[order]
        target = target[order]
    return (feature, target)
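# Hedged usage sketch (assumption, not from the source): fetches the Libras
# data with shuffling and fits a simple classifier on a held-out split.
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

X, y = fetch_uci_libras(shuffle=True, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = KNeighborsClassifier().fit(X_train, y_train)
print(clf.score(X_test, y_test))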
from sklearn.utils import _joblib as joblib
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import pandas as pd

parameter_0_0_1_0 = pd.read_csv(
    filepath_or_buffer="/code/Media/207/upload/iris(1).csv", header=0)
data = parameter_0_0_1_0
parameter_1_0_3_0 = data.iloc[:, :-1]
parameter_1_1_3_1 = data.iloc[:, -1]
parameter_3_0_4_0, parameter_3_1_2_1, parameter_3_2_4_1, parameter_3_3_2_2 = \
    train_test_split(parameter_1_0_3_0, parameter_1_1_3_1,
                     random_state=None, test_size=0.8)
parameter_4_0_2_0 = DecisionTreeClassifier(criterion="entropy",
                                           splitter="random",
                                           max_depth=50,
                                           min_samples_split=2,
                                           min_samples_leaf=1,
                                           min_weight_fraction_leaf=0,
                                           max_features=None,
                                           random_state=None,
                                           max_leaf_nodes=50,
                                           min_impurity_decrease=1,
                                           min_impurity_split=0,
                                           class_weight=None,
                                           ccp_alpha=0)
parameter_4_0_2_0.fit(parameter_3_0_4_0, parameter_3_2_4_1)
joblib.dump(parameter_4_0_2_0, filename="/code/Media/207/upload/1.m")
parameter_2_0_2_null = parameter_4_0_2_0.score(parameter_3_1_2_1,
                                               parameter_3_3_2_2)
def save_model(self, path):
    _joblib.dump(self.model, path)
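# Hedged companion sketch (assumption, not from the source): the matching
# loader for save_model above, restoring the estimator from the given path.
def load_model(self, path):
    self.model = _joblib.load(path)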
def fetch_20newsgroups_vectorized(subset="train", remove=(), data_home=None,
                                  download_if_missing=True, return_X_y=False):
    """Load the 20 newsgroups dataset and vectorize it into token counts \
(classification).

    Download it if necessary.

    This is a convenience function; the transformation is done using the
    default settings for
    :class:`sklearn.feature_extraction.text.CountVectorizer`. For more
    advanced usage (stopword filtering, n-gram extraction, etc.), combine
    fetch_20newsgroups with a custom
    :class:`sklearn.feature_extraction.text.CountVectorizer`,
    :class:`sklearn.feature_extraction.text.HashingVectorizer`,
    :class:`sklearn.feature_extraction.text.TfidfTransformer` or
    :class:`sklearn.feature_extraction.text.TfidfVectorizer`.

    =================   ==========
    Classes                     20
    Samples total            18846
    Dimensionality          130107
    Features                  real
    =================   ==========

    Read more in the :ref:`User Guide <20newsgroups_dataset>`.

    Parameters
    ----------
    subset : 'train' or 'test', 'all', optional
        Select the dataset to load: 'train' for the training set, 'test'
        for the test set, 'all' for both, with shuffled ordering.

    remove : tuple
        May contain any subset of ('headers', 'footers', 'quotes'). Each of
        these are kinds of text that will be detected and removed from the
        newsgroup posts, preventing classifiers from overfitting on metadata.

        'headers' removes newsgroup headers, 'footers' removes blocks at the
        ends of posts that look like signatures, and 'quotes' removes lines
        that appear to be quoting another post.

    data_home : optional, default: None
        Specify a download and cache folder for the datasets. If None,
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    download_if_missing : optional, True by default
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    return_X_y : boolean, default=False
        If True, returns ``(data.data, data.target)`` instead of a Bunch
        object.

        .. versionadded:: 0.20

    Returns
    -------
    bunch : Bunch object
        bunch.data: sparse matrix, shape [n_samples, n_features]
        bunch.target: array, shape [n_samples]
        bunch.target_names: list, length [n_classes]
        bunch.DESCR: a description of the dataset.

    (data, target) : tuple if ``return_X_y`` is True
        .. versionadded:: 0.20
    """
    data_home = get_data_home(data_home=data_home)
    filebase = '20newsgroup_vectorized'
    if remove:
        filebase += 'remove-' + ('-'.join(remove))
    target_file = _pkl_filepath(data_home, filebase + ".pkl")

    # we shuffle but use a fixed seed for the memoization
    data_train = fetch_data(data_home=data_home,
                            subset='train',
                            categories=None,
                            shuffle=True,
                            random_state=12,
                            remove=remove,
                            download_if_missing=download_if_missing)

    data_test = fetch_data(data_home=data_home,
                           subset='test',
                           categories=None,
                           shuffle=True,
                           random_state=12,
                           remove=remove,
                           download_if_missing=download_if_missing)

    if os.path.exists(target_file):
        X_train, X_test = _joblib.load(target_file)
    else:
        vectorizer = CountVectorizer(dtype=np.int16)
        X_train = vectorizer.fit_transform(data_train.data).tocsr()
        X_test = vectorizer.transform(data_test.data).tocsr()
        _joblib.dump((X_train, X_test), target_file, compress=9)

    # the data is stored as int16 for compactness
    # but normalize needs floats
    X_train = X_train.astype(np.float64)
    X_test = X_test.astype(np.float64)
    normalize(X_train, copy=False)
    normalize(X_test, copy=False)

    target_names = data_train.target_names

    if subset == "train":
        data = X_train
        target = data_train.target
    elif subset == "test":
        data = X_test
        target = data_test.target
    elif subset == "all":
        data = sp.vstack((X_train, X_test)).tocsr()
        target = np.concatenate((data_train.target, data_test.target))
    else:
        raise ValueError("%r is not a valid subset: should be one of "
                         "['train', 'test', 'all']" % subset)

    module_path = dirname(__file__)
    with open(join(module_path, 'descr', 'twenty_newsgroups.rst')) as rst_file:
        fdescr = rst_file.read()

    if return_X_y:
        return data, target

    return Bunch(data=data,
                 target=target,
                 target_names=target_names,
                 DESCR=fdescr)
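# Hedged usage sketch (assumption, not from the source): fetches the
# vectorized 20 newsgroups data as (X, y) pairs and fits a sparse-friendly
# classifier on the normalized token counts.
from sklearn.naive_bayes import MultinomialNB

X, y = fetch_20newsgroups_vectorized(subset='train', return_X_y=True)
clf = MultinomialNB().fit(X, y)
X_test, y_test = fetch_20newsgroups_vectorized(subset='test', return_X_y=True)
print(clf.score(X_test, y_test))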