Exemplo n.º 1
 def fit_mix(self, u_feats, l_feats, l_targets):
     """Run ``fit_mix_once`` ``n_init`` times and keep the lowest-inertia run.

     The winning run is stored on ``self.labels_``,
     ``self.cluster_centers_``, ``self.inertia_`` and ``self.n_iter_``.

     Parameters
     ----------
     u_feats, l_feats, l_targets
         Forwarded unchanged to ``fit_mix_once``.
         NOTE(review): presumably unlabelled features, labelled features
         and labelled targets -- confirm against ``fit_mix_once``.
     """
     random_state = check_random_state(self.random_state)
     best_inertia = None
     if effective_n_jobs(self.n_jobs) == 1:
         # Serial path: only the best run so far is kept, so memory use
         # does not grow with n_init.
         for _ in range(self.n_init):
             labels, inertia, centers, n_iters = self.fit_mix_once(
                 u_feats, l_feats, l_targets, random_state)
             if best_inertia is None or inertia < best_inertia:
                 # ``clone`` detaches the stored result from buffers the
                 # next run may reuse.
                 self.labels_ = labels.clone()
                 self.cluster_centers_ = centers.clone()
                 best_inertia = inertia
                 self.inertia_ = inertia
                 self.n_iter_ = n_iters
     else:
         # parallelisation of k-means runs: one independent seed per run
         seeds = random_state.randint(np.iinfo(np.int32).max,
                                      size=self.n_init)
         results = Parallel(n_jobs=self.n_jobs, verbose=0)(
             delayed(self.fit_mix_once)(u_feats, l_feats, l_targets, seed)
             for seed in seeds)
         # Get results with the lowest inertia
         labels, inertia, centers, n_iters = zip(*results)
         best = np.argmin(inertia)
         self.labels_ = labels[best]
         self.inertia_ = inertia[best]
         self.cluster_centers_ = centers[best]
         self.n_iter_ = n_iters[best]
def _parallel_pairwise_(X, Y, func, n_jobs, **kwds):
    """Evaluate ``func(X, Y, **kwds)``, splitting ``Y`` across workers.

    When ``Y`` is None, ``X`` is used for both arguments. With a single
    effective job ``func`` is called directly; otherwise ``Y`` is cut into
    even slices, one per effective job, each slice is evaluated in its own
    task, and the partial results are stacked horizontally.
    """
    other = X if Y is None else Y
    n_workers = effective_n_jobs(n_jobs)

    # Nothing to parallelise: skip the joblib machinery entirely.
    if n_workers == 1:
        return func(X, other, **kwds)

    # TODO: in some cases, backend='threading' may be appropriate
    task = delayed(func)
    pieces = Parallel(n_jobs=n_jobs, verbose=0)(
        task(X, other[chunk], **kwds)
        for chunk in gen_even_slices(_num_samples(other), n_workers))

    return np.hstack(pieces)
Exemplo n.º 3
    def score_samples(self, X, lengths=None):
        """Compute the log probability under the model and compute posteriors.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Feature matrix of individual samples.

        lengths : array-like of integers, shape (n_sequences, ), optional
            Lengths of the individual sequences in ``X``. The sum of
            these should be ``n_samples``.
            NOTE(review): the value passed in is overwritten below by a
            batch size derived from ``n_jobs`` -- confirm this is intended.

        Returns
        -------
        logprob : float
            Log likelihood of ``X`` (sum of the per-batch values).

        posteriors : array, shape (n_samples, n_components)
            State-membership probabilities for each sample in ``X``
            (per-batch results stacked vertically).

        See Also
        --------
        score : Compute the log probability under the model.
        decode : Find most likely state sequence corresponding to ``X``.
        """
        X = self._trim_array(X)
        check_is_fitted(self, self.check_fitted)

        X = _check_array(X)

        n_jobs = effective_n_jobs(self.n_jobs)
        parallel = Parallel(n_jobs=n_jobs, verbose=max(0, self.verbose - 1))

        # Batch size: rows of the first element of X divided evenly among
        # the workers. This rebinds the ``lengths`` argument.
        lengths = X[0].shape[0] // n_jobs

        results = parallel(
            delayed(batch_compute_posterior)(self, get_batch(X, i, j))
            for i, j in iter_from_X_lengths(X, lengths))

        # Each result is a 5-tuple; only the posteriors (2nd item) and the
        # log probability (5th item) are used here.
        _, posteriors, _, _, logprob_ = zip(*results)

        logprob = sum(logprob_)
        posteriors = np.vstack(posteriors)
        return logprob, posteriors
Exemplo n.º 4
    def robust_predict_proba(self, X):
        """Compute the robust posteriors across replicates.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Feature matrix of individual samples.

        Returns
        -------
        posteriors_mean : array, shape (n_samples, n_components)
            Average State-membership probabilities for each sample in ``X``
            across replicates.

        posteriors_std : array, shape (n_samples, n_components)
            Std. dev. State-membership probabilities for each sample in ``X``
            across replicates.

        See Also
        --------
        score : Compute the log probability under the model.
        decode : Find most likely state sequence corresponding to ``X``.
        """
        X = self._trim_array(X)
        check_is_fitted(self, self.check_fitted)

        X = _check_array(X)

        n_jobs = effective_n_jobs(self.n_jobs)
        parallel = Parallel(n_jobs=n_jobs, verbose=max(0, self.verbose - 1))

        # Batch size: rows of the first element of X divided evenly among
        # the workers.
        lengths = X[0].shape[0] // n_jobs

        results = parallel(
            delayed(batch_compute_posterior_robust)(self, get_batch(X, i, j))
            for i, j in iter_from_X_lengths(X, lengths))

        # Each result is a (mean, std) pair for one batch.
        posteriors_means, posteriors_stds = zip(*results)

        posteriors_means = np.vstack(posteriors_means)
        posteriors_stds = np.vstack(posteriors_stds)
        return posteriors_means, posteriors_stds
Exemplo n.º 5
def daal_pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None, **kwds):
    """ Compute the distance matrix from a vector array X and optional Y.

    This method takes either a vector array or a distance matrix, and returns
    a distance matrix. If the input is a vector array, the distances are
    computed. If the input is a distances matrix, it is returned instead.

    This method provides a safe way to take a distance matrix as input, while
    preserving compatibility with many other algorithms that take a vector
    array.

    If Y is given (default is None), then the returned matrix is the pairwise
    distance between the arrays from both X and Y.

    Valid values for metric are:

    - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
      'manhattan']. These metrics support sparse matrix inputs.

    - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
      'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',
      'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
      'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
      See the documentation for scipy.spatial.distance for details on these
      metrics. These metrics do not support sparse matrix inputs.

    Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are
    valid scipy.spatial.distance metrics), the scikit-learn implementation
    will be used, which is faster and has support for sparse matrices (except
    for 'cityblock'). For a verbose description of the metrics from
    scikit-learn, see the __doc__ of the sklearn.pairwise.distance_metrics
    function.

    Dense float64 input with ``Y=None`` and metric 'cosine' or 'correlation'
    is dispatched to the dedicated daal4py kernels instead.

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \
             [n_samples_a, n_features] otherwise
        Array of pairwise distances between samples, or a feature array.

    Y : array [n_samples_b, n_features], optional
        An optional second feature array. Only allowed if metric != "precomputed".

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by scipy.spatial.distance.pdist for its metric parameter, or
        a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
        If metric is "precomputed", X is assumed to be a distance matrix.
        Alternatively, if metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays from X as input and return a value indicating
        the distance between them.

    n_jobs : int
        The number of jobs to use for the computation. This works by breaking
        down the pairwise matrix into n_jobs even slices and computing them in
        parallel.

        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
        are used.

    `**kwds` : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Returns
    -------
    D : array [n_samples_a, n_samples_a] or [n_samples_a, n_samples_b]
        A distance matrix D such that D_{i, j} is the distance between the
        ith and jth vectors of the given matrix X, if Y is None.
        If Y is not None, then D_{i, j} is the distance between the ith array
        from X and the jth array from Y.

    Raises
    ------
    ValueError
        If ``metric`` is neither a known metric name, a callable, nor
        "precomputed".
    TypeError
        If a scipy metric is requested with sparse input.
    """
    if (metric not in _VALID_METRICS
        and not callable(metric)
        and metric != "precomputed"):
        raise ValueError("Unknown metric %s. "
                         "Valid metrics are %s, or 'precomputed', or a "
                         "callable" % (metric, _VALID_METRICS))

    if metric == "precomputed":
        # X already is a distance matrix: validate and hand it back.
        X, _ = check_pairwise_arrays(X, Y, precomputed=True)
        whom = ("`pairwise_distances`. Precomputed distance "
                                " need to have non-negative values.")
        check_non_negative(X, whom=whom)
        return X
    elif ((metric == 'cosine') and (Y is None)
          and (not issparse(X)) and X.dtype == np.float64):
        return _daal4py_cosine_distance_dense(X)
    elif ((metric == 'correlation') and (Y is None) and
          (not issparse(X)) and X.dtype == np.float64):
        return _daal4py_correlation_distance_dense(X)
    elif metric in PAIRWISE_DISTANCE_FUNCTIONS:
        func = PAIRWISE_DISTANCE_FUNCTIONS[metric]
    elif callable(metric):
        func = partial(_pairwise_callable, metric=metric, **kwds)
    else:
        # Remaining names are scipy.spatial.distance metrics.
        if issparse(X) or issparse(Y):
            raise TypeError("scipy distance metrics do not"
                            " support sparse matrices.")

        dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None

        if (dtype == bool
            and (X.dtype != bool or (Y is not None and Y.dtype != bool))):
            msg = "Data was converted to boolean for metric %s" % metric
            warnings.warn(msg, DataConversionWarning)

        X, Y = check_pairwise_arrays(X, Y, dtype=dtype)

        # precompute data-derived metric params and pass them on to scipy
        params = _precompute_metric_params(X, Y, metric=metric, **kwds)
        kwds.update(**params)

        if effective_n_jobs(n_jobs) == 1 and X is Y:
            # Single job on a symmetric problem: pdist computes each pair
            # only once.
            return distance.squareform(distance.pdist(X, metric=metric,
                                                      **kwds))
        func = partial(distance.cdist, metric=metric, **kwds)

    return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
Exemplo n.º 6
# NOTE(review): this listing of k_means appears to have lost lines during
# extraction (rest of the signature, several ``else:`` lines, and the tails of
# multi-line calls). Each suspected gap is flagged below; restore from the
# upstream source before use.
def k_means(X,
    """K-means clustering algorithm.

    Read more in the :ref:`User Guide <k_means>`.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        The observations to cluster. It must be noted that the data
        will be converted to C ordering, which will cause a memory copy
        if the given data is not C-contiguous.

    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.

    sample_weight : array-like, shape (n_samples,), optional
        The weights for each observation in X. If None, all observations
        are assigned equal weight (default: None)

    init : {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, default to 'k-means++':

        'k-means++' : selects initial cluster centers for k-mean
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': choose k observations (rows) at random from data for
        the initial centroids.

        If an ndarray is passed, it should be of shape (n_clusters, n_features)
        and gives the initial centers.

        If a callable is passed, it should take arguments X, k and
        and a random state and return an initialization.

    precompute_distances : {'auto', True, False}
        Precompute distances (faster but takes more memory).

        'auto' : do not precompute distances if n_samples * n_clusters > 12
        million. This corresponds to about 100MB overhead per job using
        double precision.

        True : always precompute distances

        False : never precompute distances

    n_init : int, optional, default: 10
        Number of time the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of
        n_init consecutive runs in terms of inertia.

    max_iter : int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.

    verbose : boolean, optional
        Verbosity mode.

    tol : float, optional
        The relative increment in the results before declaring convergence.

    random_state : int, RandomState instance or None (default)
        Determines random number generation for centroid initialization. Use
        an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.

    copy_x : boolean, optional
        When pre-computing distances it is more numerically accurate to center
        the data first.  If copy_x is True (default), then the original data is
        not modified, ensuring X is C-contiguous.  If False, the original data
        is modified, and put back before the function returns, but small
        numerical differences may be introduced by subtracting and then adding
        the data mean, in this case it will also not ensure that data is
        C-contiguous which may cause a significant slowdown.

    n_jobs : int or None, optional (default=None)
        The number of jobs to use for the computation. This works by computing
        each of the n_init runs in parallel.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    algorithm : "auto", "full" or "elkan", default="auto"
        K-means algorithm to use. The classical EM-style algorithm is "full".
        The "elkan" variation is more efficient by using the triangle
        inequality, but currently doesn't support sparse data. "auto" chooses
        "elkan" for dense data and "full" for sparse data.

    return_n_iter : bool, optional
        Whether or not to return the number of iterations.

    Returns
    -------
    centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.

    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.

    inertia : float
        The final value of the inertia criterion (sum of squared distances to
        the closest centroid for all observations in the training set).

    best_n_iter : int
        Number of iterations corresponding to the best results.
        Returned only if `return_n_iter` is set to True.
    """

    if n_init <= 0:
        raise ValueError("Invalid number of initializations."
                         " n_init=%d must be bigger than zero." % n_init)
    random_state = check_random_state(random_state)

    if max_iter <= 0:
        raise ValueError('Number of iterations should be a positive number,'
                         ' got %d instead' % max_iter)

    # avoid forcing order when copy_x=False
    order = "C" if copy_x else None
    # NOTE(review): the check_array call below is not closed in this listing;
    # the remaining keyword arguments (presumably including ``order``) are
    # missing.
    X = check_array(X,
                    dtype=[np.float64, np.float32],
    # verify that the number of samples given is larger than k
    if _num_samples(X) < n_clusters:
        raise ValueError("n_samples=%d should be >= n_clusters=%d" %
                         (_num_samples(X), n_clusters))

    tol = _tolerance(X, tol)

    # If the distances are precomputed every job will create a matrix of shape
    # (n_clusters, n_samples). To stop KMeans from eating up memory we only
    # activate this if the created matrix is guaranteed to be under 100MB. 12
    # million entries consume a little under 100MB if they are of type double.
    if precompute_distances == 'auto':
        n_samples = X.shape[0]
        precompute_distances = (n_clusters * n_samples) < 12e6
    # NOTE(review): as listed, the bool case raises; an ``else:`` between the
    # ``elif`` and the ``raise`` looks to be missing, and the format argument
    # of the message is cut off.
    elif isinstance(precompute_distances, bool):
        raise ValueError("precompute_distances should be 'auto' or True/False"
                         ", but a value of %r was passed" %

    # Validate init array
    if hasattr(init, '__array__'):
        init = check_array(init, dtype=X.dtype.type, copy=True)
        _validate_center_shape(X, n_clusters, init)

        if n_init != 1:
            # NOTE(review): the call that emits this message (presumably
            # ``warnings.warn(``) and its closing arguments are missing.
                'Explicit initial center position passed: '
                'performing only one init in k-means instead of n_init=%d' %
            n_init = 1

    # subtract of mean of x for more accurate distance computations
    if not sp.issparse(X):
        X_mean = X.mean(axis=0)
        # The copy was already done above
        X -= X_mean

        if hasattr(init, '__array__'):
            init -= X_mean

    # precompute squared norms of data points
    x_squared_norms = row_norms(X, squared=True)

    best_labels, best_inertia, best_centers = None, None, None

    if algorithm == "full":
        kmeans_single = _kmeans_single_lloyd
        # NOTE(review): an ``else:`` before this raise looks to be missing;
        # as listed the raise is unconditional inside the "full" branch.
        raise ValueError("Algorithm must be 'full'" " %s" % str(algorithm))
    # NOTE(review): this condition is truthy for any non-zero job count;
    # the parallel section below suggests a ``== 1`` comparison and an
    # ``else:`` were intended -- confirm.
    if effective_n_jobs(n_jobs):
        # For a single thread, less memory is needed if we just store one set
        # of the best results (as opposed to one set per run per thread).
        for it in range(n_init):
            # run a k-means once
            # NOTE(review): the argument list of kmeans_single is cut off.
            labels, inertia, centers, n_iter_ = kmeans_single(
            # determine if these results are the best so far
            if best_inertia is None or inertia < best_inertia:
                best_labels = labels.copy()
                best_centers = centers.copy()
                best_inertia = inertia
                best_n_iter = n_iter_
        # parallelisation of k-means runs
        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        # NOTE(review): the delayed(...) call and most of its arguments are
        # missing from the generator below; only ``group=group`` survives.
        results = Parallel(n_jobs=n_jobs, verbose=0)(
                # Change seed to ensure variety
                group=group) for seed in seeds)
        # Get results with the lowest inertia
        labels, inertia, centers, n_iters = zip(*results)
        best = np.argmin(inertia)
        best_labels = labels[best]
        best_inertia = inertia[best]
        best_centers = centers[best]
        best_n_iter = n_iters[best]

    # Undo the mean-centering applied above (dense input only).
    if not sp.issparse(X):
        if not copy_x:
            X += X_mean
        best_centers += X_mean

    distinct_clusters = len(set(best_labels))
    if distinct_clusters < n_clusters:
        # NOTE(review): the warn call below is not closed in this listing
        # (warning category / closing parenthesis missing).
        warnings.warn("Number of distinct clusters ({}) found smaller than "
                      "n_clusters ({}). Possibly due to duplicate points "
                      "in X.".format(distinct_clusters, n_clusters),

    if return_n_iter:
        return best_centers, best_labels, best_inertia, best_n_iter
        # NOTE(review): an ``else:`` before the next return looks to be
        # missing; as listed it is unreachable.
        return best_centers, best_labels, best_inertia
Exemplo n.º 7
    # NOTE(review): this listing appears to have lost lines during extraction
    # (``else:`` lines and the tails of multi-line calls). Suspected gaps are
    # flagged inline; restore from the upstream source before use.
    def fit(self, X, y, groups=None):
        """Fit the RFE model and automatically tune the number of selected
           features.

        When ``self.step`` is not a list the call is delegated to the parent
        class; otherwise a per-iteration step schedule is used.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vector, where `n_samples` is the number of samples and
            `n_features` is the total number of features.
        y : array-like, shape = [n_samples]
            Target values (integers for classification, real numbers for
            regression).
        groups : array-like, shape = [n_samples], optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Returns
        -------
        self
        """
        if type(self.step) is not list:
            return super(DyRFECV, self).fit(X, y, groups)

        X, y = check_X_y(X, y, "csr")

        # Initialization
        cv = check_cv(self.cv, y, is_classifier(self.estimator))
        scorer = check_scoring(self.estimator, scoring=self.scoring)
        n_features = X.shape[1]

        # Convert fractional steps (0 < s < 1) into absolute feature counts.
        # NOTE(review): as listed, steps >= 1 are never appended to ``step``;
        # an ``else:`` branch looks to be missing -- confirm.
        step = []
        for s in self.step:
            if 0.0 < s < 1.0:
                step.append(int(max(1, s * n_features)))
            if s <= 0:
                raise ValueError("Step must be >0")

        # Build an RFE object, which will evaluate and score each possible
        # feature count, down to self.min_features_to_select
        rfe = DyRFE(estimator=self.estimator,
                    step=self.step, verbose=self.verbose)

        # Determine the number of subsets of features by fitting across
        # the train folds and choosing the "features_to_select" parameter
        # that gives the least averaged error across all folds.

        # Note that joblib raises a non-picklable error for bound methods
        # even if n_jobs is set to 1 with the default multiprocessing
        # backend.
        # This branching is done so that to
        # make sure that user code that sets n_jobs to 1
        # and provides bound methods as scorers is not broken with the
        # addition of n_jobs parameter in version 0.18.

        # NOTE(review): an ``else:`` before the Parallel assignment looks to
        # be missing; as listed the single-job setup is always overwritten.
        if effective_n_jobs(self.n_jobs) == 1:
            parallel, func = list, _rfe_single_fit
            parallel = Parallel(n_jobs=self.n_jobs)
            func = delayed(_rfe_single_fit)

        scores = parallel(
            func(rfe, self.estimator, X, y, train, test, scorer)
            for train, test in cv.split(X, y, groups))

        # Sum the per-fold score arrays, then pad the step schedule by
        # repeating its last entry so it has one entry per score.
        scores = np.sum(scores, axis=0)
        diff = int(scores.shape[0]) - len(step)
        if diff > 0:
            step = np.r_[step, [step[-1]] * diff]
        # Searching the reversed scores makes ties resolve to the last
        # index holding the maximum.
        scores_rev = scores[::-1]
        argmax_idx = len(scores) - np.argmax(scores_rev) - 1
        # NOTE(review): the max(...) call below is not closed in this
        # listing; its second argument is missing.
        n_features_to_select = max(
            n_features - sum(step[:argmax_idx]),

        # Re-execute an elimination with best_k over the whole set
        # NOTE(review): the DyRFE(...) call below is not closed in this
        # listing; trailing arguments are missing.
        rfe = DyRFE(estimator=self.estimator,
                    n_features_to_select=n_features_to_select, step=self.step,

        rfe.fit(X, y)

        # Set final attributes
        self.support_ = rfe.support_
        self.n_features_ = rfe.n_features_
        self.ranking_ = rfe.ranking_
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(self.transform(X), y)

        # Fixing a normalization error, n is equal to get_n_splits(X, y) - 1
        # here, the scores are normalized by get_n_splits(X, y)
        self.grid_scores_ = scores[::-1] / cv.get_n_splits(X, y, groups)
        return self
Exemplo n.º 8
    # NOTE(review): this listing appears to have lost lines during extraction
    # (``else:`` lines and the tails of multi-line calls). Suspected gaps are
    # flagged inline; restore from the upstream source before use.
    def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
        """Finds the K-neighbors of a point.
        Returns indices of and distances to the neighbors of each point.

        Parameters
        ----------
        X : array-like, shape (n_query, n_features), \
                or (n_query, n_indexed) if metric == 'precomputed'
            The query point or points.
            If not provided, neighbors of each indexed point are returned.
            In this case, the query point is not considered its own neighbor.
        n_neighbors : int
            Number of neighbors to get (default is the value
            passed to the constructor).
        return_distance : boolean, optional. Defaults to True.
            If False, distances will not be returned

        Returns
        -------
        dist : array
            Array representing the lengths to points, only present if
            return_distance=True
        ind : array
            Indices of the nearest points in the population matrix.

        Examples
        --------
        In the following example, we construct a NeighborsClassifier
        class from an array representing our data set and ask who's
        the closest point to [1,1,1]
        >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]
        >>> from sklearn.neighbors import NearestNeighbors
        >>> neigh = NearestNeighbors(n_neighbors=1)
        >>> neigh.fit(samples) # doctest: +ELLIPSIS
        NearestNeighbors(algorithm='auto', leaf_size=30, ...)
        >>> print(neigh.kneighbors([[1., 1., 1.]])) # doctest: +ELLIPSIS
        (array([[0.5]]), array([[2]]))
        As you can see, it returns [[0.5]], and [[2]], which means that the
        element is at distance 0.5 and is the third element of samples
        (indexes start at 0). You can also query for multiple points:
        >>> X = [[0., 1., 0.], [1., 0., 1.]]
        >>> neigh.kneighbors(X, return_distance=False) # doctest: +ELLIPSIS
        """

        if n_neighbors is None:
            n_neighbors = self.n_neighbors
        elif n_neighbors <= 0:
            # NOTE(review): the format argument and closing parenthesis of
            # this raise, and an ``else:`` before the integer check, are
            # missing from the listing.
            raise ValueError(
                "Expected n_neighbors > 0. Got %d" %
            if not np.issubdtype(type(n_neighbors), np.integer):
                raise TypeError(
                    "n_neighbors does not take %s value, "
                    "enter integer value" %

        # NOTE(review): an ``else:`` between the two ``query_is_train``
        # assignments looks to be missing (and possibly input validation of
        # X in the first branch).
        if X is not None:
            query_is_train = False
            query_is_train = True
            X = self._fit_X
            # Include an extra neighbor to account for the sample itself being
            # returned, which is removed later
            n_neighbors += 1

        train_size = self._fit_X.shape[0]
        if n_neighbors > train_size:
            # NOTE(review): the closing parenthesis of this raise is missing.
            raise ValueError(
                "Expected n_neighbors <= n_samples, "
                " but n_samples = %d, n_neighbors = %d" %
                (train_size, n_neighbors)
        n_samples = X.shape[0]
        # Column vector of row indices, compared against neighbor indices
        # below to locate each sample's own entry.
        sample_range = np.arange(n_samples)[:, None]

        n_jobs = effective_n_jobs(self.n_jobs)
        if self._fit_method == 'brute':

            # NOTE(review): the partial(...) call below is not closed; its
            # keyword arguments are missing.
            reduce_func = partial(self._kneighbors_reduce_func,

            # for efficiency, use squared euclidean distances
            kwds = ({'squared': True} if self.effective_metric_ == 'euclidean'
                    else self.effective_metric_params_)

            # NOTE(review): the call below is not closed; trailing arguments
            # (presumably ``**kwds``) are missing.
            result = list(pairwise_distances_chunked_(
                X, self._fit_X, reduce_func=reduce_func,
                metric=self.effective_metric_, n_jobs=n_jobs,

        elif self._fit_method in ['ball_tree', 'kd_tree']:
            if issparse(X):
                raise ValueError(
                    "%s does not work with sparse matrices. Densify the data, "
                    "or set algorithm='brute'" % self._fit_method)
            old_joblib = LooseVersion(joblib_version) < LooseVersion('0.12')
            # NOTE(review): the old-joblib branch has an unclosed delayed(...)
            # call and the ``else:`` separating it from the new-joblib
            # settings is missing.
            if old_joblib:
                # Deal with change of API in joblib
                check_pickle = False if old_joblib else None
                delayed_query = delayed(_tree_query_parallel_helper,
                parallel_kwargs = {"backend": "threading"}
                delayed_query = delayed(_tree_query_parallel_helper)
                parallel_kwargs = {"prefer": "threads"}
            # NOTE(review): the ``delayed_query(`` opener and the closing
            # parenthesis of the Parallel call are missing, as is the
            # ``else:`` guarding the final raise.
            result = Parallel(n_jobs, **parallel_kwargs)(
                    self._tree, X[s], n_neighbors, return_distance)
                for s in gen_even_slices(X.shape[0], n_jobs)
            raise ValueError("internal: _fit_method not recognized")

        # Merge the per-chunk results into full arrays.
        # NOTE(review): an ``else:`` before the second assignment looks to be
        # missing.
        if return_distance:
            dist, neigh_ind = zip(*result)
            result = np.vstack(dist), np.vstack(neigh_ind)
            result = np.vstack(result)

        # NOTE(review): an ``else:`` after ``return result`` looks to be
        # missing; the self-exclusion logic below is meant for the
        # query-is-train case.
        if not query_is_train:
            return result
            # If the query data is the same as the indexed data, we would like
            # to ignore the first nearest neighbor of every sample, i.e
            # the sample itself.
            if return_distance:
                dist, neigh_ind = result
                neigh_ind = result

            sample_mask = neigh_ind != sample_range

            # Corner case: When the number of duplicates are more
            # than the number of neighbors, the first NN will not
            # be the sample, but a duplicate.
            # In that case mask the first duplicate.
            dup_gr_nbrs = np.all(sample_mask, axis=1)
            sample_mask[:, 0][dup_gr_nbrs] = False

            neigh_ind = np.reshape(
                neigh_ind[sample_mask], (n_samples, n_neighbors - 1))

            if return_distance:
                dist = np.reshape(
                    dist[sample_mask], (n_samples, n_neighbors - 1))
                return dist, neigh_ind
        return neigh_ind
Exemplo n.º 9
def daal_pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None,
                            force_all_finite=True, **kwds):
    """ Compute the distance matrix from a vector array X and optional Y.

    This method takes either a vector array or a distance matrix, and returns
    a distance matrix. If the input is a vector array, the distances are
    computed. If the input is a distances matrix, it is returned instead.

    This method provides a safe way to take a distance matrix as input, while
    preserving compatibility with many other algorithms that take a vector
    array.

    If Y is given (default is None), then the returned matrix is the pairwise
    distance between the arrays from both X and Y.

    Valid values for metric are:

    - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
      'manhattan']. These metrics support sparse matrix
      inputs.
      ['nan_euclidean'] but it does not yet support sparse matrices.

    - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
      'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',
      'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
      'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
      See the documentation for scipy.spatial.distance for details on these
      metrics. These metrics do not support sparse matrix inputs.

    Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are
    valid scipy.spatial.distance metrics), the scikit-learn implementation
    will be used, which is faster and has support for sparse matrices (except
    for 'cityblock'). For a verbose description of the metrics from
    scikit-learn, see the __doc__ of the sklearn.pairwise.distance_metrics
    function.

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \
             [n_samples_a, n_features] otherwise
        Array of pairwise distances between samples, or a feature array.

    Y : array [n_samples_b, n_features], optional
        An optional second feature array. Only allowed if
        metric != "precomputed".

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by scipy.spatial.distance.pdist for its metric parameter, or
        a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
        If metric is "precomputed", X is assumed to be a distance matrix.
        Alternatively, if metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays from X as input and return a value indicating
        the distance between them.

    n_jobs : int or None, optional (default=None)
        The number of jobs to use for the computation. This works by breaking
        down the pairwise matrix into n_jobs even slices and computing them in
        parallel.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    force_all_finite : boolean or 'allow-nan', (default=True)
        Whether to raise an error on np.inf and np.nan in array. The
        possibilities are:

        - True: Force all values of array to be finite.
        - False: accept both np.inf and np.nan in array.
        - 'allow-nan': accept only np.nan values in array. Values cannot
          be infinite.

        .. versionadded:: 0.22

    **kwds : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Returns
    -------
    D : array [n_samples_a, n_samples_a] or [n_samples_a, n_samples_b]
        A distance matrix D such that D_{i, j} is the distance between the
        ith and jth vectors of the given matrix X, if Y is None.
        If Y is not None, then D_{i, j} is the distance between the ith array
        from X and the jth array from Y.

    Raises
    ------
    ValueError
        If ``metric`` is neither a known metric name, ``"precomputed"`` nor a
        callable.
    TypeError
        If a scipy.spatial.distance metric is requested with sparse input.

    See also
    --------
    pairwise_distances_chunked : performs the same calculation as this
        function, but returns a generator of chunks of the distance matrix, in
        order to limit memory usage.
    paired_distances : Computes the distances between corresponding
                       elements of two arrays
    """
    if metric not in _VALID_METRICS and not callable(metric) and metric != "precomputed":
        raise ValueError("Unknown metric %s. Valid metrics are %s, or 'precomputed', "
                         "or a callable" % (metric, _VALID_METRICS))

    X = _daal_check_array(X, accept_sparse=['csr', 'csc', 'coo'],
                          force_all_finite=force_all_finite)

    # NOTE(review): the scope label and the write_log() call below were
    # restored from the upstream daal4py implementation -- confirm against
    # the project's PatchingConditionsChain before merging.
    _patching_status = PatchingConditionsChain(
        "sklearn.metrics.pairwise_distances")
    _dal_ready = _patching_status.and_conditions([
        (metric == 'cosine' or metric == 'correlation',
            f"'{metric}' metric is not supported. "
            "Only 'cosine' and 'correlation' metrics are supported."),
        (Y is None, "Second feature array is not supported."),
        (not issparse(X), "X is sparse. Sparse input is not supported."),
        (X.dtype == np.float64,
            f"{X.dtype} X data type is not supported. Only np.float64 is supported.")
    ])
    _patching_status.write_log()
    if _dal_ready:
        # Accelerated dense path: only reached for float64, non-sparse X with
        # no Y and one of the two metrics checked above.
        if metric == 'cosine':
            return _daal4py_cosine_distance_dense(X)
        if metric == 'correlation':
            return _daal4py_correlation_distance_dense(X)
        raise ValueError(f"'{metric}' distance is wrong for daal4py.")

    if metric == "precomputed":
        X, _ = check_pairwise_arrays(X, Y, precomputed=True,
                                     force_all_finite=force_all_finite)
        whom = ("`pairwise_distances`. Precomputed distance "
                " need to have non-negative values.")
        check_non_negative(X, whom=whom)
        return X

    if metric in PAIRWISE_DISTANCE_FUNCTIONS:
        func = PAIRWISE_DISTANCE_FUNCTIONS[metric]
    elif callable(metric):
        func = partial(_pairwise_callable, metric=metric,
                       force_all_finite=force_all_finite, **kwds)
    else:
        # Remaining metric names are delegated to scipy.spatial.distance.
        if issparse(X) or issparse(Y):
            raise TypeError("scipy distance metrics do not"
                            " support sparse matrices.")

        dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None

        if dtype == bool and (X.dtype != bool or (Y is not None and Y.dtype != bool)):
            msg = "Data was converted to boolean for metric %s" % metric
            warnings.warn(msg, DataConversionWarning)

        X, Y = check_pairwise_arrays(X, Y, dtype=dtype,
                                     force_all_finite=force_all_finite)

        # precompute data-derived metric params
        params = _precompute_metric_params(X, Y, metric=metric, **kwds)
        # Merge them so both the pdist shortcut and cdist receive them.
        kwds.update(**params)

        if effective_n_jobs(n_jobs) == 1 and X is Y:
            # Single job, X against itself: use the condensed pdist form.
            return distance.squareform(distance.pdist(X, metric=metric,
                                                      **kwds))
        func = partial(distance.cdist, metric=metric, **kwds)

    return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
Exemplo n.º 10
    def fit(self, X, lengths=None):
        """Estimate model parameters.

        An initialization step is performed before entering the
        EM algorithm. If you want to avoid this step for a subset of
        the parameters, pass proper ``init_params`` keyword argument
        to estimator's constructor.

        X : array-like, shape (n_samples, n_features)
            Feature matrix of individual samples.

        lengths : array-like of integers, shape (n_sequences, )
            Lengths of the individual sequences in ``X``. The sum of
            these should be ``n_samples``.

        self : object
            Returns self.
        # NOTE(review): the docstring above has no closing triple quote in
        # this copy and its section headers appear to have been dropped;
        # restore them from the original source before using this snippet.
        X = self._trim_array(X)
        X = _check_array(X)
        self._init(X, lengths=lengths)

        # The branch below is unconditional (`if True`).
        if True:
            n_jobs = effective_n_jobs(self.n_jobs)
            parallel = Parallel(n_jobs=n_jobs,
                                verbose=max(0, self.verbose - 1))

            # Rebinds the `lengths` argument: from here on it is the first
            # dimension of X[0] integer-divided by the number of jobs.
            lengths = X[0].shape[0] // n_jobs

            for iter_ in range(self.n_iter):
                #stats = self._initialize_sufficient_statistics()

                curr_logprob = 0

                # NOTE(review): the head of the generator expression (the
                # call whose last argument is `get_batch(X, i, j)`) is
                # missing in this copy, so the statement does not parse.
                results = parallel(
                                                         get_batch(X, i, j))
                    for i, j in iter_from_X_lengths(X, lengths))

                # Each worker result is unpacked as a 6-tuple; only the last
                # two fields (log-probability and per-batch statistics) are
                # used here.
                _, _, _, _, logprob, statssub = zip(*results)
                n = 0
                stats = self._initialize_sufficient_statistics()
                # Sum the per-batch statistics and log-probabilities; `n`
                # indexes the batch results in iteration order.
                for i, j in iter_from_X_lengths(X, lengths):
                    for k in stats:
                        if isinstance(stats[k], list):
                            # NOTE(review): this inner loop reuses `i`,
                            # shadowing the outer loop variable.
                            for i, _ in enumerate(stats[k]):
                                stats[k][i] += statssub[n][k][i]
                            # NOTE(review): as indented, this line also runs
                            # for list-valued entries after the element-wise
                            # loop; an `else:` seems to be missing -- confirm.
                            stats[k] += statssub[n][k]
                    curr_logprob += logprob[n]
                    n += 1

                # XXX must be before convergence check, because otherwise
                #     there won't be any updates for the case ``n_iter=1``.
                # NOTE(review): the statement this XXX comment refers to is
                # not present in this copy.

                # Change in log-probability versus the last recorded value,
                # or NaN when the monitor history is empty.
                delta = curr_logprob - self.monitor_.history[
                    -1] if self.monitor_.history else np.nan
                # NOTE(review): the next line is an orphaned continuation and
                # the `if` after it has no body; lines are missing here.
                    self.monitor_._template.format(iter=iter_ + 1,
                if self.monitor_.converged:

        return self
Exemplo n.º 11
Exemplo n.º 12
    def kneighbors(self,
        """Fast finds the approximate K-neighbors of each point using sketch.
        Returns indices of and distances to the neighbors of each point.

        X : array-like, shape (n_query, n_features).
            The query point or points.
        n_neighbors : :obj:`int`, :obj:
            Number of neighbors to get.
        sketch_method : {:obj:`None`, 'symmetric', 'asymmetric', 'g_asymmetric', 'PCA'}, defalut = :obj:`None`
            Method to be used to filter candidates before rank the real distances.
            If non None value passed to the constructor, this value will be
            ignored. If both constructor and this method get None, It will not
            use any sketch filter, act just like normal KNN. See constructor
            for more details.
        candidates_scale : :obj:`int`, default is the value passed to the constructor
            Scale up n_neighbors as number of candidate when filtering using
        return_distance : :obj:`boolean`, default = :obj:`False`.
            If False, distances will not be returned

        dist : :obj:`array`
            Array representing the lengths to points, only present if
            return_distance= :obj:`True`
        ind : :obj:`array`
            Indices of the nearest points in the population matrix.
        # NOTE(review): in this copy the parameter list after `self,` is
        # missing, the docstring above is never closed, and its section
        # headers have been dropped. Restore them from the original source.
        check_is_fitted(self, ["_fit_X"])

        # Fall back to the estimator-level neighbor count.
        if n_neighbors is None:
            n_neighbors = self.n_neighbors

        X = check_array(X)

        # A sketch method set on the estimator overrides the argument.
        if self.sketch_method is not None:
            sketch_method = self.sketch_method

        # reduce_func for neighbors
        # NOTE(review): the `partial(` call below is left open in this copy.
        reduce_func_k = partial(self._kneighbors_reduce_func,
        kwds = ({'squared': True})
        n_jobs = effective_n_jobs(self.n_jobs)

        # find candidates
        # NOTE(review): the body of this `if` looks like the sketch-based
        # path, so the plain-KNN body and an `else:` seem to be missing.
        if sketch_method is None:  # KNN
            if candidates_scale is None:
                candidates_scale = self.candidates_scale
            # NOTE(review): uses self.n_neighbors, not the `n_neighbors`
            # resolved above -- confirm this is intended.
            n_candidates = self.n_neighbors * candidates_scale
            # NOTE(review): this `partial(` and each `list(` opened in the
            # branches below are never closed in this copy.
            reduce_func_1 = partial(self._kneighbors_reduce_func,
            if sketch_method == 'symmetric':
                sketch_X = self._sketch(X)
                candidates = list(
            elif sketch_method == 'asymmetric':
                # TODO: sketch X (query points)
                sketch_X, weight = self._sketch(X, return_weight=True)
                _sketch_X_weight = sketch_X + weight  # encode sketch_X and weight together
                # TODO: filter candidates
                candidates = list(
            elif sketch_method == 'PCA':
                # sketch X (query points)
                sketch_X = self._pca.transform(X)
                # filter candidates
                candidates = list(
            elif sketch_method == 'g_asymmetric':
                # TODO: sketch X (query points)
                sketch_X, weight, g_sketch_X, g_weight = self._sketch(
                    X, return_weight=True, return_label=True)
                _sketch_X_weight = sketch_X + weight  # encode sketch_X and weight together
                # TODO: filter label
                # NOTE(review): nothing visible appends to Candidate_inds;
                # the stray `)` after `del wlist` suggests the append call
                # was lost.
                Candidate_inds = []
                for g_sketch_X_i, g_weight_i in zip(
                        g_sketch_X, g_weight):  # for each query point
                    labels = self._getlabels(
                        g_weight_i, g_sketch_X_i,
                        self.g_threshold)  # get query point's labels
                    # Union of the stored index sets for every label.
                    inds = set()
                    for label in labels:
                        inds |= self._g_dict[label]
                    # Plain assignment: this is the same object as
                    # g_sketch_X_i, so the flips below change it in place.
                    sketchlist = g_sketch_X_i
                    wlist = g_weight_i
                    # Keep flipping a sketch entry and adding the matching
                    # index set until there are enough candidates.
                    # NOTE(review): the list comprehension is missing its
                    # closing bracket. As shown, wlist is rebuilt from the
                    # unchanged g_weight_i on every pass, so `ind` would not
                    # change between iterations -- check the full source.
                    while len(inds) < n_candidates:
                        wlist = [
                            w if w >= self.g_threshold else math.inf
                            for w in g_weight_i
                        ind = np.argmin(wlist)  # find index of smallest weight
                        sketchlist[ind] = 1 - sketchlist[ind]
                        label = ''.join(str(elm) for elm in sketchlist)
                        inds |= self._g_dict[label]
                    del wlist
                    )  # get row number of data points that matched query point's labels
                # TODO: filter candidates
                candidates = []
                for i in range(len(Candidate_inds)):  # for each query point
                    candidate_inds = sorted(list(
                        Candidate_inds[i]))  # get matched inds
                    tmp1 = self._sketch_X[candidate_inds, :]
                    tmp2 = _sketch_X_weight[[i]]
                    # NOTE(review): the `list(` below is left open.
                    iinds = list(
                    # Replaces the first entry's indices with
                    # candidate_inds[ii] for each ii in it.
                    iinds[0][0] = np.array(
                        [candidate_inds[ii] for ii in list(iinds[0][0])])
                    candidates += iinds
                # NOTE(review): the `else:` before this raise is missing and
                # the call is left open. The message also uses a `%s`
                # placeholder with str.format, which will not substitute the
                # method name.
                raise ValueError(
                    "%s sketch_method has not been implemented.".format(
            candidates = np.vstack(candidates)

        # result to return
        # `dists` is only created when distances are requested.
        if return_distance:
            dists = np.empty([0, n_neighbors])
        neight_inds = np.empty([0, n_neighbors], dtype=int)

        # find neighbors
        if sketch_method is None:  # KNN
            # find neighbors from all data points
            # NOTE(review): the `list(` below is left open; the `else:`
            # lines pairing the return_distance branches here and further
            # down also appear to be missing.
            result = list(
            if return_distance:
                dist, neigh_ind = zip(*result)
                result = np.vstack(dist), np.vstack(neigh_ind)
                result = np.vstack(result)
            # find neighbors from the candidate points.
            for i in range(len(candidates)):
                # NOTE(review): the pairwise_distances_chunked call is left
                # open in this copy.
                result = list(
                    pairwise_distances_chunked(X[[i], :],
                if return_distance:
                    dist, neigh_ind = zip(*result)
                    dist = np.vstack(dist)
                    # Indexes candidates[i] with the chunk results to get
                    # the indices stored for this query row.
                    neigh_ind = candidates[i][np.vstack(neigh_ind).reshape(-1)]
                    dists = np.concatenate((dists, dist), axis=0)
                    neight_inds = np.vstack((neight_inds, neigh_ind))
                    neigh_ind = candidates[i][np.vstack(result)[0]]
                    neight_inds = np.vstack((neight_inds, neigh_ind))
            if return_distance:
                result = dists, neight_inds
                result = neight_inds

        return result
