def fit_mix(self, u_feats, l_feats, l_targets):
    """Run ``fit_mix_once`` ``self.n_init`` times and keep the best run.

    The run with the lowest inertia is stored on the estimator as
    ``labels_``, ``cluster_centers_``, ``inertia_`` and ``n_iter_``.

    Parameters
    ----------
    u_feats, l_feats, l_targets
        Forwarded unchanged to ``self.fit_mix_once``.
        NOTE(review): presumably unlabelled features, labelled features and
        the labelled targets -- confirm against ``fit_mix_once``.

    Returns
    -------
    None
        Results are written to attributes of ``self``.
    """
    rng = check_random_state(self.random_state)

    if effective_n_jobs(self.n_jobs) != 1:
        # Parallel path: every run receives its own integer seed so that the
        # initialisations differ between workers.
        run_seeds = rng.randint(np.iinfo(np.int32).max, size=self.n_init)
        runner = Parallel(n_jobs=self.n_jobs, verbose=0)
        outcomes = runner(
            delayed(self.fit_mix_once)(u_feats, l_feats, l_targets, run_seed)
            for run_seed in run_seeds)

        # Pick the run with the lowest inertia.
        all_labels, all_inertia, all_centers, all_n_iters = zip(*outcomes)
        winner = np.argmin(all_inertia)
        self.labels_ = all_labels[winner]
        self.inertia_ = all_inertia[winner]
        self.cluster_centers_ = all_centers[winner]
        self.n_iter_ = all_n_iters[winner]
        return

    # Sequential path: the shared random state is advanced by each run, and
    # only the best result seen so far is retained.
    lowest_inertia = None
    for _ in range(self.n_init):
        run_labels, run_inertia, run_centers, run_n_iters = \
            self.fit_mix_once(u_feats, l_feats, l_targets, rng)

        improved = lowest_inertia is None or run_inertia < lowest_inertia
        if improved:
            self.labels_ = run_labels.clone()
            self.cluster_centers_ = run_centers.clone()
            lowest_inertia = run_inertia
            self.inertia_ = run_inertia
            self.n_iter_ = run_n_iters
def _parallel_pairwise_(X, Y, func, n_jobs, **kwds):
    """Evaluate ``func(X, Y)`` by slicing ``Y`` evenly across parallel jobs.

    Parameters
    ----------
    X, Y : array-like
        Inputs of the pairwise function. ``Y`` defaults to ``X`` when None.
    func : callable
        Called as ``func(X, Y_slice, **kwds)``.
    n_jobs : int or None
        Job count, resolved through ``effective_n_jobs``.
    **kwds
        Extra keyword arguments forwarded to ``func``.

    Returns
    -------
    The direct result of ``func`` when a single job is used, otherwise the
    per-slice results joined with ``np.hstack``.
    """
    Y = X if Y is None else Y

    n_workers = effective_n_jobs(n_jobs)
    if n_workers == 1:
        # No parallel machinery at all for the single-job case.
        return func(X, Y, **kwds)

    # TODO: in some cases, backend='threading' may be appropriate
    delayed_func = delayed(func)
    runner = Parallel(n_jobs=n_jobs, verbose=0)
    pieces = runner(
        delayed_func(X, Y[part], **kwds)
        for part in gen_even_slices(_num_samples(Y), n_workers))

    return np.hstack(pieces)
def score_samples(self, X, lengths=None):
    """Compute the log probability under the model and the posteriors.

    The work is split into batches that are evaluated with
    ``batch_compute_posterior`` through joblib.

    Parameters
    ----------
    X : array-like
        Input data; passed through ``self._trim_array`` and ``_check_array``
        before use.
    lengths : optional
        Accepted for interface compatibility. The supplied value is not
        used: it is replaced by ``X[0].shape[0] // n_jobs`` before batching.

    Returns
    -------
    logprob : float
        Sum of the per-batch log probabilities.
    posteriors : array
        Per-batch posteriors stacked vertically.

    See Also
    --------
    score : Compute the log probability under the model.
    decode : Find most likely state sequence corresponding to ``X``.
    """
    X = self._trim_array(X)
    check_is_fitted(self, self.check_fitted)
    self._check()
    X = _check_array(X)

    n_workers = effective_n_jobs(self.n_jobs)
    runner = Parallel(n_jobs=n_workers, verbose=max(0, self.verbose - 1))

    # The batch size is derived from the data and the worker count; this
    # overwrites whatever the caller passed as ``lengths``.
    lengths = X[0].shape[0] // n_workers

    batch_jobs = (
        delayed(batch_compute_posterior)(self, get_batch(X, start, stop))
        for start, stop in iter_from_X_lengths(X, lengths))
    batch_results = runner(batch_jobs)

    # Each result is a 5-tuple; only the posteriors (2nd) and the log
    # probability (5th) are needed here.
    _, posterior_parts, _, _, logprob_parts = zip(*batch_results)

    return sum(logprob_parts), np.vstack(posterior_parts)
def robust_predict_proba(self, X):
    """Compute the robust posteriors across replicates.

    The work is split into batches that are evaluated with
    ``batch_compute_posterior_robust`` through joblib.

    Parameters
    ----------
    X : array-like
        Input data; passed through ``self._trim_array`` and ``_check_array``
        before use.

    Returns
    -------
    posteriors_mean : array
        First element of every batch result, stacked vertically (average
        state-membership probabilities across replicates).
    posteriors_std : array
        Second element of every batch result, stacked vertically (standard
        deviation of the state-membership probabilities across replicates).

    See Also
    --------
    score : Compute the log probability under the model.
    decode : Find most likely state sequence corresponding to ``X``.
    """
    X = self._trim_array(X)
    check_is_fitted(self, self.check_fitted)
    self._check()
    X = _check_array(X)

    n_workers = effective_n_jobs(self.n_jobs)
    runner = Parallel(n_jobs=n_workers, verbose=max(0, self.verbose - 1))

    # Batch size derived from the data and the number of workers.
    lengths = X[0].shape[0] // n_workers

    batch_jobs = (
        delayed(batch_compute_posterior_robust)(
            self, get_batch(X, start, stop))
        for start, stop in iter_from_X_lengths(X, lengths))
    batch_results = runner(batch_jobs)

    mean_parts, std_parts = zip(*batch_results)
    stacked_means = np.vstack(mean_parts)
    stacked_stds = np.vstack(std_parts)
    return stacked_means, stacked_stds
def daal_pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None,
                            **kwds):
    """ Compute the distance matrix from a vector array X and optional Y.

    This method takes either a vector array or a distance matrix, and
    returns a distance matrix. If the input is a vector array, the distances
    are computed. If the input is a distances matrix, it is returned instead.

    This method provides a safe way to take a distance matrix as input,
    while preserving compatibility with many other algorithms that take a
    vector array.

    If Y is given (default is None), then the returned matrix is the pairwise
    distance between the arrays from both X and Y.

    Valid values for metric are:

    - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
      'manhattan']. These metrics support sparse matrix inputs.

    - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
      'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
      'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
      'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath',
      'sqeuclidean', 'yule']
      See the documentation for scipy.spatial.distance for details on these
      metrics. These metrics do not support sparse matrix inputs.

    Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are
    valid scipy.spatial.distance metrics), the scikit-learn implementation
    will be used, which is faster and has support for sparse matrices (except
    for 'cityblock'). For a verbose description of the metrics from
    scikit-learn, see the __doc__ of the sklearn.pairwise.distance_metrics
    function.

    In this variant, 'cosine' and 'correlation' are dispatched to the
    daal4py dense routines when ``Y`` is None and ``X`` is a non-sparse
    object whose ``dtype`` is ``np.float64``. Any other input, including
    list-like ``X`` that has no ``dtype`` attribute, uses the generic path.

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \
             [n_samples_a, n_features] otherwise
        Array of pairwise distances between samples, or a feature array.

    Y : array [n_samples_b, n_features], optional
        An optional second feature array. Only allowed if
        metric != "precomputed".

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by scipy.spatial.distance.pdist for its metric parameter, or
        a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
        If metric is "precomputed", X is assumed to be a distance matrix.
        Alternatively, if metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The
        callable should take two arrays from X as input and return a value
        indicating the distance between them.

    n_jobs : int
        The number of jobs to use for the computation. This works by breaking
        down the pairwise matrix into n_jobs even slices and computing them
        in parallel.

        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but
        one are used.

    `**kwds` : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Returns
    -------
    D : array [n_samples_a, n_samples_a] or [n_samples_a, n_samples_b]
        A distance matrix D such that D_{i, j} is the distance between the
        ith and jth vectors of the given matrix X, if Y is None.
        If Y is not None, then D_{i, j} is the distance between the ith array
        from X and the jth array from Y.

    Raises
    ------
    ValueError
        If ``metric`` is neither a known metric name, "precomputed", nor a
        callable.
    TypeError
        If a scipy metric is requested with sparse input.
    """
    if (metric not in _VALID_METRICS and
            not callable(metric) and metric != "precomputed"):
        raise ValueError("Unknown metric %s. "
                         "Valid metrics are %s, or 'precomputed', or a "
                         "callable" % (metric, _VALID_METRICS))

    if metric == "precomputed":
        X, _ = check_pairwise_arrays(X, Y, precomputed=True)
        whom = ("`pairwise_distances`. Precomputed distance "
                " need to have non-negative values.")
        check_non_negative(X, whom=whom)
        return X

    if metric in ('cosine', 'correlation'):
        # daal4py fast path. X has not been validated yet, so it may be a
        # plain list without a ``dtype`` attribute; ``getattr`` keeps such
        # input on the generic path instead of raising AttributeError.
        use_daal = (Y is None and not issparse(X) and
                    getattr(X, "dtype", None) == np.float64)
        if use_daal:
            if metric == 'cosine':
                return _daal4py_cosine_distance_dense(X)
            return _daal4py_correlation_distance_dense(X)

    if metric in PAIRWISE_DISTANCE_FUNCTIONS:
        func = PAIRWISE_DISTANCE_FUNCTIONS[metric]
    elif callable(metric):
        func = partial(_pairwise_callable, metric=metric, **kwds)
    else:
        if issparse(X) or issparse(Y):
            raise TypeError("scipy distance metrics do not"
                            " support sparse matrices.")

        dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None

        if (dtype == bool and
                (X.dtype != bool or (Y is not None and Y.dtype != bool))):
            msg = "Data was converted to boolean for metric %s" % metric
            warnings.warn(msg, DataConversionWarning)

        X, Y = check_pairwise_arrays(X, Y, dtype=dtype)

        # precompute data-derived metric params
        params = _precompute_metric_params(X, Y, metric=metric, **kwds)
        kwds.update(**params)

        if effective_n_jobs(n_jobs) == 1 and X is Y:
            return distance.squareform(distance.pdist(X, metric=metric,
                                                      **kwds))
        func = partial(distance.cdist, metric=metric, **kwds)

    return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
def k_means(X, n_clusters, sample_weight=None, init='k-means++',
            precompute_distances='auto', n_init=10, max_iter=300,
            verbose=False, tol=1e-4, random_state=None, copy_x=True,
            n_jobs=None, algorithm="full", return_n_iter=False, group=None):
    """K-means clustering algorithm.

    Read more in the :ref:`User Guide <k_means>`.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        The observations to cluster. It must be noted that the data
        will be converted to C ordering, which will cause a memory copy
        if the given data is not C-contiguous.

    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.

    sample_weight : array-like, shape (n_samples,), optional
        The weights for each observation in X. If None, all observations
        are assigned equal weight (default: None)

    init : {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, default to 'k-means++':

        'k-means++' : selects initial cluster centers for k-mean
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': choose k observations (rows) at random from data for
        the initial centroids.

        If an ndarray is passed, it should be of shape (n_clusters, n_features)
        and gives the initial centers.

        If a callable is passed, it should take arguments X, k and
        a random state and return an initialization.

    precompute_distances : {'auto', True, False}
        Precompute distances (faster but takes more memory).

        'auto' : do not precompute distances if n_samples * n_clusters > 12
        million. This corresponds to about 100MB overhead per job using
        double precision.

        True : always precompute distances

        False : never precompute distances

    n_init : int, optional, default: 10
        Number of time the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of
        n_init consecutive runs in terms of inertia.

    max_iter : int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.

    verbose : boolean, optional
        Verbosity mode.

    tol : float, optional
        The relative increment in the results before declaring convergence.

    random_state : int, RandomState instance or None (default)
        Determines random number generation for centroid initialization. Use
        an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.

    copy_x : boolean, optional
        When pre-computing distances it is more numerically accurate to center
        the data first.  If copy_x is True (default), then the original data is
        not modified, ensuring X is C-contiguous.  If False, the original data
        is modified, and put back before the function returns, but small
        numerical differences may be introduced by subtracting and then adding
        the data mean, in this case it will also not ensure that data is
        C-contiguous which may cause a significant slowdown.

    n_jobs : int or None, optional (default=None)
        The number of jobs to use for the computation. This works by computing
        each of the n_init runs in parallel.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    algorithm : "full", default="full"
        K-means algorithm to use. Only the classical EM-style algorithm
        ("full") is supported by this implementation; any other value raises
        ``ValueError``.

    return_n_iter : bool, optional
        Whether or not to return the number of iterations.

    group : optional (default=None)
        Forwarded unchanged to the single-run routine
        (``_kmeans_single_lloyd``); its meaning is defined there.

    Returns
    -------
    centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.

    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.

    inertia : float
        The final value of the inertia criterion (sum of squared distances to
        the closest centroid for all observations in the training set).

    best_n_iter : int
        Number of iterations corresponding to the best results.
        Returned only if `return_n_iter` is set to True.

    Raises
    ------
    ValueError
        If ``n_init`` or ``max_iter`` is not positive, if there are fewer
        samples than clusters, or if ``precompute_distances`` or
        ``algorithm`` has an unsupported value.
    """
    if n_init <= 0:
        raise ValueError("Invalid number of initializations."
                         " n_init=%d must be bigger than zero." % n_init)
    random_state = check_random_state(random_state)

    if max_iter <= 0:
        raise ValueError('Number of iterations should be a positive number,'
                         ' got %d instead' % max_iter)

    # avoid forcing order when copy_x=False
    order = "C" if copy_x else None
    X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32],
                    order=order, copy=copy_x)
    # verify that the number of samples given is larger than k
    if _num_samples(X) < n_clusters:
        raise ValueError("n_samples=%d should be >= n_clusters=%d" % (
            _num_samples(X), n_clusters))

    tol = _tolerance(X, tol)

    # If the distances are precomputed every job will create a matrix of shape
    # (n_clusters, n_samples). To stop KMeans from eating up memory we only
    # activate this if the created matrix is guaranteed to be under 100MB. 12
    # million entries consume a little under 100MB if they are of type double.
    if precompute_distances == 'auto':
        n_samples = X.shape[0]
        precompute_distances = (n_clusters * n_samples) < 12e6
    elif not isinstance(precompute_distances, bool):
        raise ValueError("precompute_distances should be 'auto' or True/False"
                         ", but a value of %r was passed" %
                         precompute_distances)

    # Validate init array
    if hasattr(init, '__array__'):
        init = check_array(init, dtype=X.dtype.type, copy=True)
        _validate_center_shape(X, n_clusters, init)

        if n_init != 1:
            warnings.warn(
                'Explicit initial center position passed: '
                'performing only one init in k-means instead of n_init=%d'
                % n_init, RuntimeWarning, stacklevel=2)
            n_init = 1

    # subtract of mean of x for more accurate distance computations
    if not sp.issparse(X):
        X_mean = X.mean(axis=0)
        # The copy was already done above
        X -= X_mean

        if hasattr(init, '__array__'):
            init -= X_mean

    # precompute squared norms of data points
    x_squared_norms = row_norms(X, squared=True)

    best_labels, best_inertia, best_centers = None, None, None
    if algorithm == "full":
        kmeans_single = _kmeans_single_lloyd
    else:
        raise ValueError("Algorithm must be 'full', got"
                         " %s" % str(algorithm))

    # ``effective_n_jobs`` never returns 0, so the bare truthiness test used
    # previously always selected the sequential branch and ``n_jobs`` was
    # silently ignored. Compare against 1 explicitly.
    if effective_n_jobs(n_jobs) == 1:
        # For a single thread, less memory is needed if we just store one set
        # of the best results (as opposed to one set per run per thread).
        for _ in range(n_init):
            # run a k-means once
            labels, inertia, centers, n_iter_ = kmeans_single(
                X, sample_weight, n_clusters, max_iter=max_iter, init=init,
                verbose=verbose, precompute_distances=precompute_distances,
                tol=tol, x_squared_norms=x_squared_norms,
                random_state=random_state, group=group)
            # determine if these results are the best so far
            if best_inertia is None or inertia < best_inertia:
                best_labels = labels.copy()
                best_centers = centers.copy()
                best_inertia = inertia
                best_n_iter = n_iter_
    else:
        # parallelisation of k-means runs
        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        results = Parallel(n_jobs=n_jobs, verbose=0)(
            delayed(kmeans_single)(X, sample_weight, n_clusters,
                                   max_iter=max_iter, init=init,
                                   verbose=verbose, tol=tol,
                                   precompute_distances=precompute_distances,
                                   x_squared_norms=x_squared_norms,
                                   # Change seed to ensure variety
                                   random_state=seed, group=group)
            for seed in seeds)
        # Get results with the lowest inertia
        labels, inertia, centers, n_iters = zip(*results)
        best = np.argmin(inertia)
        best_labels = labels[best]
        best_inertia = inertia[best]
        best_centers = centers[best]
        best_n_iter = n_iters[best]

    if not sp.issparse(X):
        if not copy_x:
            # Restore the caller's data, which was centred in place above.
            X += X_mean
        best_centers += X_mean

    distinct_clusters = len(set(best_labels))
    if distinct_clusters < n_clusters:
        warnings.warn("Number of distinct clusters ({}) found smaller than "
                      "n_clusters ({}). Possibly due to duplicate points "
                      "in X.".format(distinct_clusters, n_clusters),
                      ConvergenceWarning, stacklevel=2)

    if return_n_iter:
        return best_centers, best_labels, best_inertia, best_n_iter
    return best_centers, best_labels, best_inertia
def fit(self, X, y, groups=None):
    """Fit the RFE model and automatically tune the number of selected
    features.

    When ``self.step`` is not a plain ``list`` the call is delegated to the
    parent class. Otherwise each entry of ``self.step`` is turned into an
    absolute feature count and used to map the best cross-validated score
    back to a number of features to keep.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Training vector, where `n_samples` is the number of samples and
        `n_features` is the total number of features.

    y : array-like, shape = [n_samples]
        Target values (integers for classification, real numbers for
        regression).

    groups : array-like, shape = [n_samples], optional
        Group labels for the samples used while splitting the dataset into
        train/test set.

    Returns
    -------
    self, or whatever the parent ``fit`` returns on the delegated path.

    Raises
    ------
    ValueError
        If any entry of ``self.step`` is not strictly positive.
    """
    if type(self.step) is not list:
        return super(DyRFECV, self).fit(X, y, groups)

    X, y = check_X_y(X, y, "csr")

    # Initialization
    cv = check_cv(self.cv, y, is_classifier(self.estimator))
    scorer = check_scoring(self.estimator, scoring=self.scoring)
    n_features = X.shape[1]

    # Resolve every step entry: fractions in (0, 1) are relative to the
    # number of features (at least 1), anything else is an absolute count.
    step = []
    for entry in self.step:
        if 0.0 < entry < 1.0:
            resolved = int(max(1, entry * n_features))
        else:
            resolved = int(entry)
        step.append(resolved)
        if entry <= 0:
            raise ValueError("Step must be >0")

    # Build an RFE object, which will evaluate and score each possible
    # feature count, down to self.min_features_to_select
    rfe = DyRFE(estimator=self.estimator,
                n_features_to_select=self.min_features_to_select,
                step=self.step, verbose=self.verbose)

    # Determine the number of subsets of features by fitting across
    # the train folds and choosing the "features_to_select" parameter
    # that gives the least averaged error across all folds.

    # Note that joblib raises a non-picklable error for bound methods
    # even if n_jobs is set to 1 with the default multiprocessing
    # backend.
    # This branching is done so that to
    # make sure that user code that sets n_jobs to 1
    # and provides bound methods as scorers is not broken with the
    # addition of n_jobs parameter in version 0.18.
    if effective_n_jobs(self.n_jobs) != 1:
        parallel = Parallel(n_jobs=self.n_jobs)
        func = delayed(_rfe_single_fit)
    else:
        parallel = list
        func = _rfe_single_fit

    fold_scores = parallel(
        func(rfe, self.estimator, X, y, train, test, scorer)
        for train, test in cv.split(X, y, groups))
    scores = np.sum(fold_scores, axis=0)

    # Pad the step schedule with its last value so that it covers every
    # scored elimination round.
    missing = int(scores.shape[0]) - len(step)
    if missing > 0:
        step = np.r_[step, [step[-1]] * missing]

    # Index of the last occurrence of the best score.
    reversed_scores = scores[::-1]
    argmax_idx = len(scores) - np.argmax(reversed_scores) - 1
    removed = sum(step[:argmax_idx])
    n_features_to_select = max(n_features - removed,
                               self.min_features_to_select)

    # Re-execute an elimination with best_k over the whole set
    rfe = DyRFE(estimator=self.estimator,
                n_features_to_select=n_features_to_select,
                step=self.step, verbose=self.verbose)
    rfe.fit(X, y)

    # Set final attributes
    self.support_ = rfe.support_
    self.n_features_ = rfe.n_features_
    self.ranking_ = rfe.ranking_
    self.estimator_ = clone(self.estimator)
    self.estimator_.fit(self.transform(X), y)

    # Fixing a normalization error, n is equal to get_n_splits(X, y) - 1
    # here, the scores are normalized by get_n_splits(X, y)
    n_splits = cv.get_n_splits(X, y, groups)
    self.grid_scores_ = scores[::-1] / n_splits
    return self
def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
    """Finds the K-neighbors of a point.

    Returns indices of and distances to the neighbors of each point.

    Parameters
    ----------
    X : array-like, shape (n_query, n_features), \
            or (n_query, n_indexed) if metric == 'precomputed'
        The query point or points.
        If not provided, neighbors of each indexed point are returned.
        In this case, the query point is not considered its own neighbor.

    n_neighbors : int
        Number of neighbors to get (default is the value
        passed to the constructor).

    return_distance : boolean, optional. Defaults to True.
        If False, distances will not be returned

    Returns
    -------
    dist : array
        Array representing the lengths to points, only present if
        return_distance=True

    ind : array
        Indices of the nearest points in the population matrix.

    Raises
    ------
    ValueError
        If ``n_neighbors`` is not positive or exceeds the number of fitted
        samples, if a tree method is queried with sparse ``X``, or if the
        fit method is not recognized.
    TypeError
        If an explicitly passed ``n_neighbors`` is not an integer.

    Examples
    --------
    In the following example, we construct a NeighborsClassifier
    class from an array representing our data set and ask who's
    the closest point to [1,1,1]

    >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]
    >>> from sklearn.neighbors import NearestNeighbors
    >>> neigh = NearestNeighbors(n_neighbors=1)
    >>> neigh.fit(samples) # doctest: +ELLIPSIS
    NearestNeighbors(algorithm='auto', leaf_size=30, ...)
    >>> print(neigh.kneighbors([[1., 1., 1.]])) # doctest: +ELLIPSIS
    (array([[0.5]]), array([[2]]))

    As you can see, it returns [[0.5]], and [[2]], which means that the
    element is at distance 0.5 and is the third element of samples
    (indexes start at 0). You can also query for multiple points:

    >>> X = [[0., 1., 0.], [1., 0., 1.]]
    >>> neigh.kneighbors(X, return_distance=False) # doctest: +ELLIPSIS
    array([[1],
           [2]]...)

    """
    if n_neighbors is None:
        n_neighbors = self.n_neighbors
    else:
        if n_neighbors <= 0:
            raise ValueError(
                "Expected n_neighbors > 0. Got %d" %
                n_neighbors
            )
        if not np.issubdtype(type(n_neighbors), np.integer):
            raise TypeError(
                "n_neighbors does not take %s value, "
                "enter integer value" %
                type(n_neighbors))

    query_is_train = X is None
    if query_is_train:
        X = self._fit_X
        # Include an extra neighbor to account for the sample itself being
        # returned, which is removed later
        n_neighbors += 1

    train_size = self._fit_X.shape[0]
    if n_neighbors > train_size:
        raise ValueError(
            "Expected n_neighbors <= n_samples, "
            " but n_samples = %d, n_neighbors = %d" %
            (train_size, n_neighbors)
        )

    n_samples = X.shape[0]
    sample_range = np.arange(n_samples)[:, None]
    n_jobs = effective_n_jobs(self.n_jobs)

    if self._fit_method == 'brute':
        reduce_func = partial(self._kneighbors_reduce_func,
                              n_neighbors=n_neighbors,
                              return_distance=return_distance)

        # for efficiency, use squared euclidean distances
        if self.effective_metric_ == 'euclidean':
            kwds = {'squared': True}
        else:
            kwds = self.effective_metric_params_

        chunks = pairwise_distances_chunked_(
            X, self._fit_X, reduce_func=reduce_func,
            metric=self.effective_metric_, n_jobs=n_jobs,
            **kwds)
        result = list(chunks)

    elif self._fit_method in ('ball_tree', 'kd_tree'):
        if issparse(X):
            raise ValueError(
                "%s does not work with sparse matrices. Densify the data, "
                "or set algorithm='brute'" % self._fit_method)

        old_joblib = LooseVersion(joblib_version) < LooseVersion('0.12')
        if old_joblib:
            # Deal with change of API in joblib: old versions need pickle
            # checking switched off explicitly.
            delayed_query = delayed(_tree_query_parallel_helper,
                                    check_pickle=False)
            parallel_kwargs = {"backend": "threading"}
        else:
            delayed_query = delayed(_tree_query_parallel_helper)
            parallel_kwargs = {"prefer": "threads"}

        runner = Parallel(n_jobs, **parallel_kwargs)
        result = runner(
            delayed_query(
                self._tree, X[part], n_neighbors, return_distance)
            for part in gen_even_slices(X.shape[0], n_jobs)
        )
    else:
        raise ValueError("internal: _fit_method not recognized")

    # Merge the per-chunk results into full arrays.
    if return_distance:
        dist, neigh_ind = zip(*result)
        result = np.vstack(dist), np.vstack(neigh_ind)
    else:
        result = np.vstack(result)

    if not query_is_train:
        return result

    # If the query data is the same as the indexed data, we would like
    # to ignore the first nearest neighbor of every sample, i.e
    # the sample itself.
    if return_distance:
        dist, neigh_ind = result
    else:
        neigh_ind = result

    sample_mask = neigh_ind != sample_range

    # Corner case: When the number of duplicates are more
    # than the number of neighbors, the first NN will not
    # be the sample, but a duplicate.
    # In that case mask the first duplicate.
    dup_gr_nbrs = np.all(sample_mask, axis=1)
    sample_mask[:, 0][dup_gr_nbrs] = False

    kept_shape = (n_samples, n_neighbors - 1)
    neigh_ind = np.reshape(neigh_ind[sample_mask], kept_shape)

    if not return_distance:
        return neigh_ind

    dist = np.reshape(dist[sample_mask], kept_shape)
    return dist, neigh_ind
def daal_pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None,
                            force_all_finite=True, **kwds):
    """ Compute the distance matrix from a vector array X and optional Y.

    This method takes either a vector array or a distance matrix, and
    returns a distance matrix. If the input is a vector array, the distances
    are computed. If the input is a distances matrix, it is returned instead.

    This method provides a safe way to take a distance matrix as input,
    while preserving compatibility with many other algorithms that take a
    vector array.

    If Y is given (default is None), then the returned matrix is the pairwise
    distance between the arrays from both X and Y.

    Valid values for metric are:

    - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
      'manhattan']. These metrics support sparse matrix
      inputs.
      ['nan_euclidean'] but it does not yet support sparse matrices.

    - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
      'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
      'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
      'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
      See the documentation for scipy.spatial.distance for details on these
      metrics. These metrics do not support sparse matrix inputs.

    Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are
    valid scipy.spatial.distance metrics), the scikit-learn implementation
    will be used, which is faster and has support for sparse matrices (except
    for 'cityblock'). For a verbose description of the metrics from
    scikit-learn, see the __doc__ of the sklearn.pairwise.distance_metrics
    function.

    The daal4py dense routines are used when every patching condition holds:
    the metric is 'cosine' or 'correlation', ``Y`` is None, ``X`` is not
    sparse and ``X.dtype`` is ``np.float64``.

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \
             [n_samples_a, n_features] otherwise
        Array of pairwise distances between samples, or a feature array.

    Y : array [n_samples_b, n_features], optional
        An optional second feature array. Only allowed if
        metric != "precomputed".

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by scipy.spatial.distance.pdist for its metric parameter, or
        a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
        If metric is "precomputed", X is assumed to be a distance matrix.
        Alternatively, if metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The
        callable should take two arrays from X as input and return a value
        indicating the distance between them.

    n_jobs : int or None, optional (default=None)
        The number of jobs to use for the computation. This works by breaking
        down the pairwise matrix into n_jobs even slices and computing them
        in parallel.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    force_all_finite : boolean or 'allow-nan', (default=True)
        Whether to raise an error on np.inf and np.nan in array. The
        possibilities are:

        - True: Force all values of array to be finite.
        - False: accept both np.inf and np.nan in array.
        - 'allow-nan': accept only np.nan values in array. Values cannot
          be infinite.

        .. versionadded:: 0.22

    **kwds : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Returns
    -------
    D : array [n_samples_a, n_samples_a] or [n_samples_a, n_samples_b]
        A distance matrix D such that D_{i, j} is the distance between the
        ith and jth vectors of the given matrix X, if Y is None.
        If Y is not None, then D_{i, j} is the distance between the ith array
        from X and the jth array from Y.

    See also
    --------
    pairwise_distances_chunked : performs the same calculation as this
        function, but returns a generator of chunks of the distance matrix, in
        order to limit memory usage.
    paired_distances : Computes the distances between corresponding
                       elements of two arrays
    """
    metric_accepted = (metric in _VALID_METRICS or
                       callable(metric) or
                       metric == "precomputed")
    if not metric_accepted:
        raise ValueError(
            "Unknown metric %s. Valid metrics are %s, or 'precomputed', "
            "or a callable" % (metric, _VALID_METRICS))

    X = _daal_check_array(X, accept_sparse=['csr', 'csc', 'coo'],
                          force_all_finite=force_all_finite)

    # Record which conditions allow (or prevent) the daal4py fast path.
    patching = PatchingConditionsChain(
        "sklearn.metrics.pairwise_distances")
    conditions = [
        (metric in ('cosine', 'correlation'),
            f"'{metric}' metric is not supported. "
            "Only 'cosine' and 'correlation' metrics are supported."),
        (Y is None, "Second feature array is not supported."),
        (not issparse(X), "X is sparse. Sparse input is not supported."),
        (X.dtype == np.float64,
            f"{X.dtype} X data type is not supported. Only np.float64 is supported.")
    ]
    dal_ready = patching.and_conditions(conditions)
    patching.write_log()

    if dal_ready:
        if metric == 'cosine':
            return _daal4py_cosine_distance_dense(X)
        if metric == 'correlation':
            return _daal4py_correlation_distance_dense(X)
        raise ValueError(f"'{metric}' distance is wrong for daal4py.")

    if metric == "precomputed":
        X, _ = check_pairwise_arrays(X, Y, precomputed=True,
                                     force_all_finite=force_all_finite)
        whom = ("`pairwise_distances`. Precomputed distance "
                " need to have non-negative values.")
        check_non_negative(X, whom=whom)
        return X

    if metric in PAIRWISE_DISTANCE_FUNCTIONS:
        pairwise_func = PAIRWISE_DISTANCE_FUNCTIONS[metric]
        return _parallel_pairwise(X, Y, pairwise_func, n_jobs, **kwds)

    if callable(metric):
        pairwise_func = partial(_pairwise_callable, metric=metric,
                                force_all_finite=force_all_finite, **kwds)
        return _parallel_pairwise(X, Y, pairwise_func, n_jobs, **kwds)

    # Remaining case: a scipy.spatial.distance metric name.
    if issparse(X) or issparse(Y):
        raise TypeError("scipy distance metrics do not"
                        " support sparse matrices.")

    needs_bool = metric in PAIRWISE_BOOLEAN_FUNCTIONS
    dtype = bool if needs_bool else None

    if needs_bool and (X.dtype != bool or
                       (Y is not None and Y.dtype != bool)):
        msg = "Data was converted to boolean for metric %s" % metric
        warnings.warn(msg, DataConversionWarning)

    X, Y = check_pairwise_arrays(X, Y, dtype=dtype,
                                 force_all_finite=force_all_finite)

    # precompute data-derived metric params
    params = _precompute_metric_params(X, Y, metric=metric, **kwds)
    kwds.update(**params)

    if effective_n_jobs(n_jobs) == 1 and X is Y:
        condensed = distance.pdist(X, metric=metric, **kwds)
        return distance.squareform(condensed)

    pairwise_func = partial(distance.cdist, metric=metric, **kwds)
    return _parallel_pairwise(X, Y, pairwise_func, n_jobs, **kwds)
def fit(self, X, lengths=None):
    """Estimate model parameters.

    An initialization step is performed before entering the EM algorithm.
    If you want to avoid this step for a subset of the parameters, pass
    proper ``init_params`` keyword argument to estimator's constructor.

    Each EM iteration accumulates sufficient statistics per batch through
    joblib, sums them, runs the M-step and reports the total log
    probability to ``self.monitor_``. Iteration stops early once the monitor
    reports convergence.

    Parameters
    ----------
    X : array-like
        Input data; passed through ``self._trim_array`` and ``_check_array``
        before use.

    lengths : optional
        Forwarded to ``self._init``. For batching it is then replaced by
        ``X[0].shape[0] // n_jobs``.

    Returns
    -------
    self : object
        Returns self.
    """
    X = self._trim_array(X)
    X = _check_array(X)
    self._init(X, lengths=lengths)
    self._check()

    self.monitor_._reset()

    n_workers = effective_n_jobs(self.n_jobs)
    runner = Parallel(n_jobs=n_workers, verbose=max(0, self.verbose - 1))
    # Batch size used from here on; the caller's ``lengths`` was only
    # needed for the initialization above.
    lengths = X[0].shape[0] // n_workers

    for iter_ in range(self.n_iter):
        curr_logprob = 0

        # E-step: one job per batch.
        results = runner(
            delayed(batch_accumulate_suff_state)(
                self, get_batch(X, start, stop))
            for start, stop in iter_from_X_lengths(X, lengths))
        _, _, _, _, logprob, statssub = zip(*results)

        # Sum the per-batch statistics and log probabilities, one entry for
        # every batch produced by the same batching iterator.
        stats = self._initialize_sufficient_statistics()
        for n, _batch in enumerate(iter_from_X_lengths(X, lengths)):
            for key in stats:
                if isinstance(stats[key], list):
                    for idx in range(len(stats[key])):
                        stats[key][idx] += statssub[n][key][idx]
                else:
                    stats[key] += statssub[n][key]
            curr_logprob += logprob[n]

        # XXX must be before convergence check, because otherwise
        #     there won't be any updates for the case ``n_iter=1``.
        self._do_mstep(stats)
        self.print_progress()

        if self.monitor_.history:
            delta = curr_logprob - self.monitor_.history[-1]
        else:
            delta = np.nan
        message = self.monitor_._template.format(
            iter=iter_ + 1, logprob=curr_logprob, delta=delta)
        logging.debug(message)

        self.monitor_.report(curr_logprob)
        if self.monitor_.converged:
            break

    return self
def kneighbors(self, X, n_neighbors=None, sketch_method=None,
               candidates_scale=None, return_distance=False):
    """Find the approximate K-neighbors of each point using a sketch.

    Returns indices of and (optionally) distances to the neighbors of
    each point. When a sketch method is active, a candidate set of
    ``n_neighbors * candidates_scale`` points is selected first using the
    sketch, and the real distances are only ranked among those candidates.

    Parameters
    ----------
    X : array-like, shape (n_query, n_features).
        The query point or points.

    n_neighbors : :obj:`int`, default is ``self.n_neighbors``
        Number of neighbors to get.

    sketch_method : {:obj:`None`, 'symmetric', 'asymmetric', 'g_asymmetric', 'PCA'}, default = :obj:`None`
        Method to be used to filter candidates before ranking the real
        distances. If ``self.sketch_method`` is not None, this argument
        is ignored. If both are None, no sketch filter is used and the
        method acts like a normal KNN.

    candidates_scale : :obj:`int`, default is ``self.candidates_scale``
        Scale up n_neighbors as number of candidates when filtering using
        sketch.

    return_distance : :obj:`boolean`, default = :obj:`False`.
        If False, distances will not be returned.

    Returns
    -------
    dist : :obj:`array`
        Array representing the lengths to points, only present if
        return_distance= :obj:`True`

    ind : :obj:`array`
        Indices of the nearest points in the population matrix.

    Raises
    ------
    ValueError
        If the effective ``sketch_method`` is not one of the supported
        values.
    """
    check_is_fitted(self, ["_fit_X"])

    if n_neighbors is None:
        n_neighbors = self.n_neighbors

    X = check_array(X)

    # The constructor setting takes precedence over the call argument.
    if self.sketch_method is not None:
        sketch_method = self.sketch_method

    # reduce_func for the final neighbor ranking
    reduce_func_k = partial(self._kneighbors_reduce_func,
                            n_neighbors=n_neighbors,
                            return_distance=return_distance)
    kwds = {'squared': True}
    n_jobs = effective_n_jobs(self.n_jobs)

    if sketch_method is None:
        # Plain KNN: rank the real distances to every fitted point.
        result = list(
            pairwise_distances_chunked(X,
                                       self._fit_X,
                                       reduce_func=reduce_func_k,
                                       metric=self.effective_metric_,
                                       n_jobs=n_jobs,
                                       **kwds))
        if return_distance:
            dist, neigh_ind = zip(*result)
            return np.vstack(dist), np.vstack(neigh_ind)
        return np.vstack(result)

    # ---- candidate selection via sketch --------------------------------
    if candidates_scale is None:
        candidates_scale = self.candidates_scale
    # Scale the *requested* number of neighbors so that a per-call
    # ``n_neighbors`` larger than the constructor value still gets a
    # sufficiently large candidate pool.
    n_candidates = n_neighbors * candidates_scale
    reduce_func_1 = partial(self._kneighbors_reduce_func,
                            n_neighbors=n_candidates,
                            return_distance=False)

    if sketch_method == 'symmetric':
        sketch_X = self._sketch(X)
        candidates = list(
            pairwise_distances_chunked(sketch_X,
                                       self._sketch_X,
                                       reduce_func=reduce_func_1,
                                       metric=paired_hamming_distance,
                                       n_jobs=n_jobs))
    elif sketch_method == 'asymmetric':
        # sketch X (query points)
        sketch_X, weight = self._sketch(X, return_weight=True)
        # encode sketch_X and weight together
        _sketch_X_weight = sketch_X + weight
        # filter candidates
        candidates = list(
            pairwise_distances_chunked(_sketch_X_weight,
                                       self._sketch_X,
                                       reduce_func=reduce_func_1,
                                       metric=paired_asymmetric_distance,
                                       n_jobs=n_jobs))
    elif sketch_method == 'PCA':
        # sketch X (query points)
        sketch_X = self._pca.transform(X)
        # filter candidates
        candidates = list(
            pairwise_distances_chunked(sketch_X,
                                       self._pca_X,
                                       reduce_func=reduce_func_1,
                                       metric=self.effective_metric_,
                                       n_jobs=n_jobs,
                                       **kwds))
    elif sketch_method == 'g_asymmetric':
        # sketch X (query points)
        sketch_X, weight, g_sketch_X, g_weight = self._sketch(
            X, return_weight=True, return_label=True)
        # encode sketch_X and weight together
        _sketch_X_weight = sketch_X + weight

        # filter by label: collect, for each query point, the row numbers
        # of the data points that matched the query point's labels
        candidate_sets = []
        for g_sketch_X_i, g_weight_i in zip(g_sketch_X, g_weight):
            labels = self._getlabels(g_weight_i, g_sketch_X_i,
                                     self.g_threshold)
            inds = set()
            for label in labels:
                inds |= self._g_dict[label]

            # Not enough matches: flip further bits, smallest weight
            # first, and add the rows stored under each new label.
            # Weights below the threshold are excluded from selection.
            # Each chosen bit is masked out afterwards so the loop moves
            # on to the next bit and terminates once all eligible bits
            # are used (the previous code re-selected the same bit on
            # every pass and could spin forever).
            sketchlist = g_sketch_X_i
            wlist = [
                w if w >= self.g_threshold else math.inf
                for w in g_weight_i
            ]
            while len(inds) < n_candidates:
                ind = int(np.argmin(wlist))
                if wlist[ind] == math.inf:
                    break  # no eligible bit left to flip
                wlist[ind] = math.inf
                sketchlist[ind] = 1 - sketchlist[ind]
                label = ''.join(str(elm) for elm in sketchlist)
                inds |= self._g_dict[label]
            candidate_sets.append(inds)

        # filter candidates: rank the label-matched rows per query point
        candidates = []
        for i, matched in enumerate(candidate_sets):
            candidate_inds = sorted(matched)
            tmp1 = self._sketch_X[candidate_inds, :]
            tmp2 = _sketch_X_weight[[i]]
            iinds = list(
                pairwise_distances_chunked(
                    tmp2,
                    tmp1,
                    reduce_func=reduce_func_1,
                    metric=paired_asymmetric_distance,
                    n_jobs=n_jobs))
            # Map positions inside the matched subset back to row
            # numbers of the fitted data.
            iinds[0][0] = np.array(
                [candidate_inds[ii] for ii in list(iinds[0][0])])
            candidates += iinds
    else:
        raise ValueError(
            "{} sketch_method has not been implemented.".format(
                sketch_method))

    candidates = np.vstack(candidates)

    # ---- rank real distances among the candidates ----------------------
    # The leading empty arrays keep the result shape/dtype of the
    # original incremental stacking.
    dist_rows = [np.empty([0, n_neighbors])]
    ind_rows = [np.empty([0, n_neighbors], dtype=int)]
    for i, cand in enumerate(candidates):
        result = list(
            pairwise_distances_chunked(X[[i], :],
                                       self._fit_X[cand],
                                       reduce_func=reduce_func_k,
                                       metric=self.effective_metric_,
                                       n_jobs=n_jobs,
                                       **kwds))
        if return_distance:
            dist, neigh_ind = zip(*result)
            dist_rows.append(np.vstack(dist))
            # translate candidate-local positions to fitted-row indices
            ind_rows.append(cand[np.vstack(neigh_ind).reshape(-1)])
        else:
            ind_rows.append(cand[np.vstack(result)[0]])

    neigh_inds = np.vstack(ind_rows)
    if return_distance:
        return np.concatenate(dist_rows, axis=0), neigh_inds
    return neigh_inds
def pairwise_distances_(X, Y=None, metric="euclidean", n_jobs=None, **kwds):
    """ Compute the distance matrix from a vector array X and optional Y.

    This method takes either a vector array or a distance matrix, and returns
    a distance matrix. If the input is a vector array, the distances are
    computed. If the input is a distances matrix, it is returned instead.

    This method provides a safe way to take a distance matrix as input, while
    preserving compatibility with many other algorithms that take a vector
    array.

    If Y is given (default is None), then the returned matrix is the pairwise
    distance between the arrays from both X and Y.

    Unlike a validating variant, this function does not convert or check
    ``X`` and ``Y`` itself (apart from the non-negativity check for
    ``metric="precomputed"`` and the sparse check for scipy metrics); they
    are handed to the metric function as given.

    Valid values for metric are:

    - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
      'manhattan']. These metrics support sparse matrix inputs.

    - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
      'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',
      'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
      'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
      See the documentation for scipy.spatial.distance for details on these
      metrics. These metrics do not support sparse matrix inputs.

    Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are
    valid scipy.spatial.distance metrics), the scikit-learn implementation
    will be used, which is faster and has support for sparse matrices (except
    for 'cityblock'). For a verbose description of the metrics from
    scikit-learn, see the __doc__ of the sklearn.pairwise.distance_metrics
    function.

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \
             [n_samples_a, n_features] otherwise
        Array of pairwise distances between samples, or a feature array.

    Y : array [n_samples_b, n_features], optional
        An optional second feature array. Only allowed if
        metric != "precomputed".

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by scipy.spatial.distance.pdist for its metric parameter, or
        a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
        If metric is "precomputed", X is assumed to be a distance matrix.
        Alternatively, if metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays from X as input and return a value indicating
        the distance between them.

    n_jobs : int or None, optional (default=None)
        The number of jobs to use for the computation. This works by breaking
        down the pairwise matrix into n_jobs even slices and computing them in
        parallel.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    **kwds : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.

    Returns
    -------
    D : array [n_samples_a, n_samples_a] or [n_samples_a, n_samples_b]
        A distance matrix D such that D_{i, j} is the distance between the
        ith and jth vectors of the given matrix X, if Y is None.
        If Y is not None, then D_{i, j} is the distance between the ith array
        from X and the jth array from Y.

    Raises
    ------
    ValueError
        If ``metric`` is neither a known metric name, ``"precomputed"``,
        nor a callable.
    TypeError
        If a scipy metric is requested with sparse ``X`` or ``Y``.

    See also
    --------
    pairwise_distances_chunked : performs the same calculation as this
        function, but returns a generator of chunks of the distance matrix, in
        order to limit memory usage.
    paired_distances : Computes the distances between corresponding
                       elements of two arrays
    """
    if (metric not in _VALID_METRICS and
            not callable(metric) and metric != "precomputed"):
        raise ValueError("Unknown metric %s. "
                         "Valid metrics are %s, or 'precomputed', or a "
                         "callable" % (metric, _VALID_METRICS))

    # A precomputed matrix is only sanity-checked and returned unchanged.
    if metric == "precomputed":
        whom = ("`pairwise_distances`. Precomputed distance "
                " need to have non-negative values.")
        check_non_negative(X, whom=whom)
        return X

    if metric in PAIRWISE_DISTANCE_FUNCTIONS:
        func = PAIRWISE_DISTANCE_FUNCTIONS[metric]
    elif callable(metric):
        func = partial(_pairwise_callable_, metric=metric, **kwds)
    else:
        # Remaining case: a scipy.spatial.distance metric name.
        if issparse(X) or issparse(Y):
            raise TypeError("scipy distance metrics do not"
                            " support sparse matrices.")

        # precompute data-derived metric params
        params = _precompute_metric_params(X, Y, metric=metric, **kwds)
        kwds.update(**params)

        # The condensed pdist shortcut is only taken when the caller
        # passed the very same object as X and Y; Y=None does not
        # qualify because no Y-defaulting happens in this function.
        if effective_n_jobs(n_jobs) == 1 and X is Y:
            return distance.squareform(distance.pdist(X, metric=metric,
                                                      **kwds))
        func = partial(distance.cdist, metric=metric, **kwds)

    return _parallel_pairwise_(X, Y, func, n_jobs, **kwds)