Example No. 1
def _save_model(model, outdir: Path):
    """
	Saves model and vectorizer offline to load it later
	:param model: model to save
	:param outdir: directory where it'll be saved
	"""
    dump(model, str(outdir / "model.pth"))
    flag = outdir / '.SUCCESS'
    flag.touch()
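
A load-side counterpart is not shown in the source; a minimal sketch, assuming the same outdir layout and that dump/load come from joblib, might be:

from pathlib import Path
from joblib import load  # assumption: the dump() above is joblib's

def _load_model(outdir: Path):
    # Only load if the .SUCCESS flag written by _save_model exists
    if not (outdir / '.SUCCESS').exists():
        raise FileNotFoundError(f"no completed model found in {outdir}")
    return load(str(outdir / "model.pth"))
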
Example No. 2
def setup_module():
    # Create some memory mapped data
    global X_mm, y_mm, y_ml_mm, TEMP_FOLDER, ESTIMATORS
    TEMP_FOLDER = tempfile.mkdtemp(prefix='sklearn_test_score_objects_')
    X, y = make_classification(n_samples=30, n_features=5, random_state=0)
    _, y_ml = make_multilabel_classification(n_samples=X.shape[0],
                                             random_state=0)
    filename = os.path.join(TEMP_FOLDER, 'test_data.pkl')
    _joblib.dump((X, y, y_ml), filename)
    X_mm, y_mm, y_ml_mm = _joblib.load(filename, mmap_mode='r')
    ESTIMATORS = _make_estimators(X_mm, y_mm, y_ml_mm)
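
The source does not show the matching teardown; a sketch of one that removes the temporary folder created above (assuming shutil is acceptable here) could be:

import shutil

def teardown_module():
    # Remove the memmapped temp folder created in setup_module
    shutil.rmtree(TEMP_FOLDER, ignore_errors=True)
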
Example No. 3
def setup_module():
    # Create some memory mapped data
    global X_mm, y_mm, y_ml_mm, TEMP_FOLDER, ESTIMATORS
    TEMP_FOLDER = tempfile.mkdtemp(prefix='sklearn_test_score_objects_')
    X, y = make_classification(n_samples=30, n_features=5, random_state=0)
    _, y_ml = make_multilabel_classification(n_samples=X.shape[0],
                                             random_state=0)
    filename = os.path.join(TEMP_FOLDER, 'test_data.pkl')
    _joblib.dump((X, y, y_ml), filename)
    X_mm, y_mm, y_ml_mm = _joblib.load(filename, mmap_mode='r')
    ESTIMATORS = _make_estimators(X_mm, y_mm, y_ml_mm)
Example No. 4
 def train(self,
           X: np.ndarray,
           label: str,
           n_components: int = 10,
           cov_type: str = 'diag',
           n_iter: int = 1000) -> None:
     print('Now training model for {} ----->'.format(label))
     self.label = label
     self.model = hmm.GaussianHMM(n_components=n_components,
                                  covariance_type=cov_type,
                                  n_iter=n_iter)
     np.seterr(all='ignore')
     self.model.fit(X)  # Train here
     _joblib.dump(self.model, 'weights/' + label + ".m")
     with open('weights/' + label + '.lb', 'w') as f:
         f.write(label)
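
To reuse the persisted HMM later, one might reload and score a sequence along these lines (file layout mirrors the example above; score() is hmmlearn's log-likelihood method):

from joblib import load  # assumption: _joblib above is the vendored joblib

def load_and_score(label: str, X) -> float:
    # Reload the GaussianHMM dumped by train() and return its log-likelihood for X
    model = load('weights/' + label + '.m')
    return model.score(X)
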
Example No. 5
def test_pickling(tmpdir, sample_weight):
    # Make sure that predictions are the same before and after pickling. Used
    # to be a bug because sample_weights wasn't pickled and the resulting tree
    # would miss some info.

    kde = KernelDensity()
    data = np.reshape([1., 2., 3.], (-1, 1))
    kde.fit(data, sample_weight=sample_weight)

    X = np.reshape([1.1, 2.1], (-1, 1))
    scores = kde.score_samples(X)

    file_path = str(tmpdir.join('dump.pkl'))
    _joblib.dump(kde, file_path)
    kde = _joblib.load(file_path)
    scores_pickled = kde.score_samples(X)

    assert_allclose(scores, scores_pickled)
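
The test takes sample_weight as a parameter, which pytest would typically supply via parametrization; a sketch of how that could look (the concrete weight values are illustrative, not taken from scikit-learn):

import pytest

@pytest.mark.parametrize('sample_weight', [None, [0.1, 1., 2.5]])
def test_pickling(tmpdir, sample_weight):
    ...  # body as shown above
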
Example No. 6
def test_pickling(tmpdir, sample_weight):
    # Make sure that predictions are the same before and after pickling. Used
    # to be a bug because sample_weights wasn't pickled and the resulting tree
    # would miss some info.

    kde = KernelDensity()
    data = np.reshape([1., 2., 3.], (-1, 1))
    kde.fit(data, sample_weight=sample_weight)

    X = np.reshape([1.1, 2.1], (-1, 1))
    scores = kde.score_samples(X)

    file_path = str(tmpdir.join('dump.pkl'))
    _joblib.dump(kde, file_path)
    kde = _joblib.load(file_path)
    scores_pickled = kde.score_samples(X)

    assert_allclose(scores, scores_pickled)
Example No. 7
 def save(self, overwrite=True, **kwargs):
     print("Saving model")
     path = models_store_path + self.name
     # Use a context manager so the pickle file handle is closed properly
     with open(path, 'wb') as f:
         pickle.dump(self, f)
     _joblib.dump(self.model, self.name)
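
A matching load path is not shown; a hedged sketch, assuming models_store_path and the file layout used above:

import pickle
from joblib import load  # assumption: _joblib is the vendored joblib

def load_saved(name):
    # models_store_path is assumed to be the same module-level constant as above
    with open(models_store_path + name, 'rb') as f:
        wrapper = pickle.load(f)
    # Reattach the joblib-dumped inner model
    wrapper.model = load(name)
    return wrapper
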
Example No. 8
import warnings
warnings.filterwarnings("ignore")

# Save Model Using joblib
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.utils._joblib import dump
from sklearn.utils._joblib import load
filename = 'diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=7)
# Fit the model on the 67% training split (33% is held out for testing)
model = LogisticRegression()
model.fit(X_train, Y_train)

# save the model to disk
filename = 'finalized_model2.sav'
dump(model, filename)

# some time later...

# load the model from disk
loaded_model = load(filename)
result = loaded_model.score(X_test, Y_test)
print(result)
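
joblib can also compress the pickle it writes; a small variation on the save step above (the compress level and file name are illustrative):

# Same model, written with compression to shrink the file on disk
dump(model, 'finalized_model2_compressed.sav', compress=3)
loaded_model = load('finalized_model2_compressed.sav')
print(loaded_model.score(X_test, Y_test))
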
Example No. 9
 def save(self, overwrite=True, **kwargs):
     print("Saving model")
     abs_path = os.path.abspath(os.path.dirname(__file__))
     path = models_store_path + self.name
     _joblib.dump(self.model, os.path.join(abs_path, path))
Example No. 10
def fetch_species_distributions(data_home=None,
                                download_if_missing=True):
    """Loader for species distribution dataset from Phillips et. al. (2006)

    Read more in the :ref:`User Guide <datasets>`.

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    download_if_missing : optional, True by default
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    Returns
    -------
    The data is returned as a Bunch object with the following attributes:

    coverages : array, shape = [14, 1592, 1212]
        These represent the 14 features measured at each point of the map grid.
        The latitude/longitude values for the grid are discussed below.
        Missing data is represented by the value -9999.

    train : record array, shape = (1624,)
        The training points for the data.  Each point has three fields:

        - train['species'] is the species name
        - train['dd long'] is the longitude, in degrees
        - train['dd lat'] is the latitude, in degrees

    test : record array, shape = (620,)
        The test points for the data.  Same format as the training data.

    Nx, Ny : integers
        The number of longitudes (x) and latitudes (y) in the grid

    x_left_lower_corner, y_left_lower_corner : floats
        The (x,y) position of the lower-left corner, in degrees

    grid_size : float
        The spacing between points of the grid, in degrees

    References
    ----------

    * `"Maximum entropy modeling of species geographic distributions"
      <http://rob.schapire.net/papers/ecolmod.pdf>`_
      S. J. Phillips, R. P. Anderson, R. E. Schapire - Ecological Modelling,
      190:231-259, 2006.

    Notes
    -----

    This dataset represents the geographic distribution of species.
    The dataset is provided by Phillips et al. (2006).

    The two species are:

    - `"Bradypus variegatus"
      <http://www.iucnredlist.org/details/3038/0>`_ ,
      the Brown-throated Sloth.

    - `"Microryzomys minutus"
      <http://www.iucnredlist.org/details/13408/0>`_ ,
      also known as the Forest Small Rice Rat, a rodent that lives in
      Colombia, Ecuador, Peru, and Venezuela.

    - For an example of using this dataset with scikit-learn, see
      :ref:`examples/applications/plot_species_distribution_modeling.py
      <sphx_glr_auto_examples_applications_plot_species_distribution_modeling.py>`.
    """
    data_home = get_data_home(data_home)
    if not exists(data_home):
        makedirs(data_home)

    # Define parameters for the data files.  These should not be changed
    # unless the data model changes.  They will be saved in the npz file
    # with the downloaded data.
    extra_params = dict(x_left_lower_corner=-94.8,
                        Nx=1212,
                        y_left_lower_corner=-56.05,
                        Ny=1592,
                        grid_size=0.05)
    dtype = np.int16

    archive_path = _pkl_filepath(data_home, DATA_ARCHIVE_NAME)

    if not exists(archive_path):
        if not download_if_missing:
            raise IOError("Data not found and `download_if_missing` is False")
        logger.info('Downloading species data from %s to %s' % (
            SAMPLES.url, data_home))
        samples_path = _fetch_remote(SAMPLES, dirname=data_home)
        with np.load(samples_path) as X:  # samples.zip is a valid npz
            for f in X.files:
                fhandle = BytesIO(X[f])
                if 'train' in f:
                    train = _load_csv(fhandle)
                if 'test' in f:
                    test = _load_csv(fhandle)
        remove(samples_path)

        logger.info('Downloading coverage data from %s to %s' % (
            COVERAGES.url, data_home))
        coverages_path = _fetch_remote(COVERAGES, dirname=data_home)
        with np.load(coverages_path) as X:  # coverages.zip is a valid npz
            coverages = []
            for f in X.files:
                fhandle = BytesIO(X[f])
                logger.debug(' - converting {}'.format(f))
                coverages.append(_load_coverage(fhandle))
            coverages = np.asarray(coverages, dtype=dtype)
        remove(coverages_path)

        bunch = Bunch(coverages=coverages,
                      test=test,
                      train=train,
                      **extra_params)
        _joblib.dump(bunch, archive_path, compress=9)
    else:
        bunch = _joblib.load(archive_path)

    return bunch
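
A quick usage sketch for the loader above, using the public scikit-learn entry point (attribute names follow the docstring):

from sklearn.datasets import fetch_species_distributions

data = fetch_species_distributions()
print(data.coverages.shape)              # (14, 1592, 1212) per the docstring
print(data.train.shape, data.test.shape)
print(data.Nx, data.Ny, data.grid_size)
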
Example No. 11
 def save(self, overwrite=True, **kwargs):
     print("Saving model")
     #path = models_store_path+self.name
     _joblib.dump(self.model, 'oursvm.pkl')
Example No. 12
def fetch_uci_libras(data_home=None,
                     shuffle=False,
                     random_state=0,
                     download_if_missing=True):
    """Load the UCI libra  data-set from AT&T (classification).
    Download it if necessary.

    =================   =====================
    Classes                                15
    Samples total                         360
    Dimensionality                         90
    Features                             real
    =================   =====================

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.
    shuffle : boolean, optional
        If True, the order of the dataset is shuffled to avoid having
        samples of the same class grouped together.
    random_state : int, RandomState instance or None (default=0)
        Determines random number generation for dataset shuffling. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.
    download_if_missing : optional, True by default
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.
    Returns
    -------
    data : numpy array of shape (360, 90)
        Each row corresponds to a 90-dimensional Libras feature vector.
    target : numpy array of shape (360, )
        Labels associated with each sample. Those labels are in
        range(15) and correspond to the movement class IDs.
    """
    global LIBRAS
    data_home = get_data_home(data_home=data_home)
    if not exists(data_home):
        makedirs(data_home)
    filepath = _pkl_filepath(data_home, 'uci_libras.pkz')
    if not exists(filepath):
        if not download_if_missing:
            raise IOError("Data not found and `download_if_missing` is False")

        print('downloading UCI LIBRAS from %s to %s' % (LIBRAS.url, data_home))
        data_path = _fetch_remote(LIBRAS, dirname=data_home)

        libras = np.genfromtxt(data_path, delimiter=",")
        _joblib.dump(libras, filepath, compress=6)
        remove(data_path)

    else:
        libras = _joblib.load(filepath)

    feature = libras[:, 0:-1]
    target = libras[:, -1]
    if shuffle:
        random_state = check_random_state(random_state)
        order = random_state.permutation(len(libras))
        feature = feature[order]  # permute the feature rows (libras still holds the label column)
        target = target[order]

    return (feature, target)
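
Assuming the loader above is importable, calling it might look like this:

# Hypothetical call site for the fetch_uci_libras defined above
X, y = fetch_uci_libras(shuffle=True, random_state=0)
print(X.shape, y.shape)   # expected (360, 90) and (360,) per the docstring
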
Example No. 13
from sklearn.utils import _joblib as joblib
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
# Load the iris CSV, train a decision tree, persist it with joblib,
# and score it on the held-out split.
iris = pd.read_csv(filepath_or_buffer="/code/Media/207/upload/iris(1).csv",
                   header=0)
X = iris.iloc[:, :-1]
y = iris.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=None, test_size=0.8)
clf = DecisionTreeClassifier(criterion="entropy",
                             splitter="random",
                             max_depth=50,
                             min_samples_split=2,
                             min_samples_leaf=1,
                             min_weight_fraction_leaf=0,
                             max_features=None,
                             random_state=None,
                             max_leaf_nodes=50,
                             min_impurity_decrease=1,
                             min_impurity_split=0,
                             class_weight=None,
                             ccp_alpha=0)
clf.fit(X_train, y_train)
joblib.dump(clf, filename="/code/Media/207/upload/1.m")
score = clf.score(X_test, y_test)
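
To reuse the persisted tree, it can be loaded back through the same joblib wrapper (the path and the split variables reuse the ones above):

# Reload the fitted DecisionTreeClassifier and score it on the held-out split
clf_loaded = joblib.load("/code/Media/207/upload/1.m")
print(clf_loaded.score(X_test, y_test))
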
Example No. 14
def fetch_species_distributions(data_home=None, download_if_missing=True):
    """Loader for species distribution dataset from Phillips et. al. (2006)

    Read more in the :ref:`User Guide <datasets>`.

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    download_if_missing : optional, True by default
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    Returns
    -------
    The data is returned as a Bunch object with the following attributes:

    coverages : array, shape = [14, 1592, 1212]
        These represent the 14 features measured at each point of the map grid.
        The latitude/longitude values for the grid are discussed below.
        Missing data is represented by the value -9999.

    train : record array, shape = (1624,)
        The training points for the data.  Each point has three fields:

        - train['species'] is the species name
        - train['dd long'] is the longitude, in degrees
        - train['dd lat'] is the latitude, in degrees

    test : record array, shape = (620,)
        The test points for the data.  Same format as the training data.

    Nx, Ny : integers
        The number of longitudes (x) and latitudes (y) in the grid

    x_left_lower_corner, y_left_lower_corner : floats
        The (x,y) position of the lower-left corner, in degrees

    grid_size : float
        The spacing between points of the grid, in degrees

    References
    ----------

    * `"Maximum entropy modeling of species geographic distributions"
      <http://rob.schapire.net/papers/ecolmod.pdf>`_
      S. J. Phillips, R. P. Anderson, R. E. Schapire - Ecological Modelling,
      190:231-259, 2006.

    Notes
    -----

    This dataset represents the geographic distribution of species.
    The dataset is provided by Phillips et al. (2006).

    The two species are:

    - `"Bradypus variegatus"
      <http://www.iucnredlist.org/details/3038/0>`_ ,
      the Brown-throated Sloth.

    - `"Microryzomys minutus"
      <http://www.iucnredlist.org/details/13408/0>`_ ,
      also known as the Forest Small Rice Rat, a rodent that lives in
      Colombia, Ecuador, Peru, and Venezuela.

    - For an example of using this dataset with scikit-learn, see
      :ref:`examples/applications/plot_species_distribution_modeling.py
      <sphx_glr_auto_examples_applications_plot_species_distribution_modeling.py>`.
    """
    data_home = get_data_home(data_home)
    if not exists(data_home):
        makedirs(data_home)

    # Define parameters for the data files.  These should not be changed
    # unless the data model changes.  They will be saved in the npz file
    # with the downloaded data.
    extra_params = dict(x_left_lower_corner=-94.8,
                        Nx=1212,
                        y_left_lower_corner=-56.05,
                        Ny=1592,
                        grid_size=0.05)
    dtype = np.int16

    archive_path = _pkl_filepath(data_home, DATA_ARCHIVE_NAME)

    if not exists(archive_path):
        if not download_if_missing:
            raise IOError("Data not found and `download_if_missing` is False")
        logger.info('Downloading species data from %s to %s' %
                    (SAMPLES.url, data_home))
        samples_path = _fetch_remote(SAMPLES, dirname=data_home)
        with np.load(samples_path) as X:  # samples.zip is a valid npz
            for f in X.files:
                fhandle = BytesIO(X[f])
                if 'train' in f:
                    train = _load_csv(fhandle)
                if 'test' in f:
                    test = _load_csv(fhandle)
        remove(samples_path)

        logger.info('Downloading coverage data from %s to %s' %
                    (COVERAGES.url, data_home))
        coverages_path = _fetch_remote(COVERAGES, dirname=data_home)
        with np.load(coverages_path) as X:  # coverages.zip is a valid npz
            coverages = []
            for f in X.files:
                fhandle = BytesIO(X[f])
                logger.debug(' - converting {}'.format(f))
                coverages.append(_load_coverage(fhandle))
            coverages = np.asarray(coverages, dtype=dtype)
        remove(coverages_path)

        bunch = Bunch(coverages=coverages,
                      test=test,
                      train=train,
                      **extra_params)
        _joblib.dump(bunch, archive_path, compress=9)
    else:
        bunch = _joblib.load(archive_path)

    return bunch
Example No. 15
 def save_model(self, path):
     _joblib.dump(self.model, path)
Example No. 16
def fetch_20newsgroups_vectorized(subset="train", remove=(), data_home=None,
                                  download_if_missing=True, return_X_y=False):
    """Load the 20 newsgroups dataset and vectorize it into token counts \
(classification).

    Download it if necessary.

    This is a convenience function; the transformation is done using the
    default settings for
    :class:`sklearn.feature_extraction.text.CountVectorizer`. For more
    advanced usage (stopword filtering, n-gram extraction, etc.), combine
    fetch_20newsgroups with a custom
    :class:`sklearn.feature_extraction.text.CountVectorizer`,
    :class:`sklearn.feature_extraction.text.HashingVectorizer`,
    :class:`sklearn.feature_extraction.text.TfidfTransformer` or
    :class:`sklearn.feature_extraction.text.TfidfVectorizer`.

    =================   ==========
    Classes                     20
    Samples total            18846
    Dimensionality          130107
    Features                  real
    =================   ==========

    Read more in the :ref:`User Guide <20newsgroups_dataset>`.

    Parameters
    ----------
    subset : 'train' or 'test', 'all', optional
        Select the dataset to load: 'train' for the training set, 'test'
        for the test set, 'all' for both, with shuffled ordering.

    remove : tuple
        May contain any subset of ('headers', 'footers', 'quotes'). Each of
        these are kinds of text that will be detected and removed from the
        newsgroup posts, preventing classifiers from overfitting on
        metadata.

        'headers' removes newsgroup headers, 'footers' removes blocks at the
        ends of posts that look like signatures, and 'quotes' removes lines
        that appear to be quoting another post.

    data_home : optional, default: None
        Specify a download and cache folder for the datasets. If None,
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    download_if_missing : optional, True by default
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    return_X_y : boolean, default=False
        If True, returns ``(data.data, data.target)`` instead of a Bunch
        object.

        .. versionadded:: 0.20

    Returns
    -------
    bunch : Bunch object
        bunch.data: sparse matrix, shape [n_samples, n_features]
        bunch.target: array, shape [n_samples]
        bunch.target_names: list, length [n_classes]
        bunch.DESCR: a description of the dataset.

    (data, target) : tuple if ``return_X_y`` is True

        .. versionadded:: 0.20
    """
    data_home = get_data_home(data_home=data_home)
    filebase = '20newsgroup_vectorized'
    if remove:
        filebase += 'remove-' + ('-'.join(remove))
    target_file = _pkl_filepath(data_home, filebase + ".pkl")

    # we shuffle but use a fixed seed for the memoization
    data_train = fetch_data(data_home=data_home,
                            subset='train',
                            categories=None,
                            shuffle=True,
                            random_state=12,
                            remove=remove,
                            download_if_missing=download_if_missing)

    data_test = fetch_data(data_home=data_home,
                           subset='test',
                           categories=None,
                           shuffle=True,
                           random_state=12,
                           remove=remove,
                           download_if_missing=download_if_missing)

    if os.path.exists(target_file):
        X_train, X_test = _joblib.load(target_file)
    else:
        vectorizer = CountVectorizer(dtype=np.int16)
        X_train = vectorizer.fit_transform(data_train.data).tocsr()
        X_test = vectorizer.transform(data_test.data).tocsr()
        _joblib.dump((X_train, X_test), target_file, compress=9)

    # the data is stored as int16 for compactness
    # but normalize needs floats
    X_train = X_train.astype(np.float64)
    X_test = X_test.astype(np.float64)
    normalize(X_train, copy=False)
    normalize(X_test, copy=False)

    target_names = data_train.target_names

    if subset == "train":
        data = X_train
        target = data_train.target
    elif subset == "test":
        data = X_test
        target = data_test.target
    elif subset == "all":
        data = sp.vstack((X_train, X_test)).tocsr()
        target = np.concatenate((data_train.target, data_test.target))
    else:
        raise ValueError("%r is not a valid subset: should be one of "
                         "['train', 'test', 'all']" % subset)

    module_path = dirname(__file__)
    with open(join(module_path, 'descr', 'twenty_newsgroups.rst')) as rst_file:
        fdescr = rst_file.read()

    if return_X_y:
        return data, target

    return Bunch(data=data,
                 target=target,
                 target_names=target_names,
                 DESCR=fdescr)
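
A short usage sketch for the vectorized loader above, via the public scikit-learn name (shapes are approximate and follow the table in the docstring):

from sklearn.datasets import fetch_20newsgroups_vectorized

bunch = fetch_20newsgroups_vectorized(subset='train')
print(bunch.data.shape)     # sparse matrix, roughly (11314, 130107)
X, y = fetch_20newsgroups_vectorized(subset='all', return_X_y=True)
print(X.shape, y.shape)     # about (18846, 130107) and (18846,)
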