Example #1
    def transform(self, X):
        """Transform data X according to the fitted model.

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
            Document word matrix.

        Returns
        -------
        doc_topic_distr : array, shape=(n_samples, n_topics)
            Unnormalized document topic distribution for X.

        """
        X = self._check_inference(X, "HierarchicalDirichletProcess.transform")

        n_jobs = _get_n_jobs(self.n_jobs)
        verbose = max(0, self.verbose - 1)
        with Parallel(n_jobs=n_jobs, verbose=verbose) as parallel:
            doc_topic_distr, _, _ = self._e_step(X,
                                                 cal_sstats=False,
                                                 cal_doc_distr=True,
                                                 cal_likelihood=False,
                                                 parallel=parallel)
        return doc_topic_distr

    def _e_step(self, X, cal_sstats, random_init, parallel=None):
        """E-step in EM update.

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
            Document word matrix.

        cal_sstats : boolean
            Parameter that indicates whether to calculate sufficient
            statistics. Set ``cal_sstats`` to True when we need to run the
            M-step.

        random_init : boolean
            Parameter that indicates whether to initialize the document topic
            distribution randomly in the E-step. Set it to True during
            training.

        parallel : joblib.Parallel (optional)
            Pre-initialized instance of joblib.Parallel.

        Returns
        -------
        (doc_topic_distr, suff_stats) :
            `doc_topic_distr` is unnormalized topic distribution for each
            document. In the literature, this is called `gamma`.
            `suff_stats` is expected sufficient statistics for the M-step.
            When `cal_sstats == False`, it will be None.

        """

        # Run e-step in parallel
        random_state = self.random_state_ if random_init else None

        # TODO: make Parallel._effective_n_jobs public instead?
        n_jobs = _get_n_jobs(self.n_jobs)
        if parallel is None:
            parallel = Parallel(n_jobs=n_jobs,
                                verbose=max(0, self.verbose - 1))
        results = parallel(
            delayed(_update_doc_distribution)
            (X[idx_slice, :], self.exp_dirichlet_component_,
             self.doc_topic_prior_, self.max_doc_update_iter,
             self.mean_change_tol, cal_sstats, random_state)
            for idx_slice in gen_even_slices(X.shape[0], n_jobs))

        # merge result
        doc_topics, sstats_list = zip(*results)
        doc_topic_distr = np.vstack(doc_topics)

        if cal_sstats:
            # This step finishes computing the sufficient statistics for the
            # M-step.
            suff_stats = np.zeros(self.components_.shape)
            for sstats in sstats_list:
                suff_stats += sstats
            suff_stats *= self.exp_dirichlet_component_
        else:
            suff_stats = None

        return (doc_topic_distr, suff_stats)
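The E-step above fans the documents out with gen_even_slices, which cuts the row range into near-equal contiguous slices, one per job. A standalone illustration using the public sklearn.utils.gen_even_slices these snippets rely on:

from sklearn.utils import gen_even_slices

# 10 documents split across 3 jobs: the remainder goes to the first
# slices, so the sizes come out as 4, 3, 3.
print(list(gen_even_slices(10, 3)))
# [slice(0, 4, None), slice(4, 7, None), slice(7, 10, None)]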
Example #4
def _partition_estimators(n_estimators, n_jobs):
    """Private function used to partition estimators between jobs."""
    # Compute the number of jobs
    n_jobs = min(_get_n_jobs(n_jobs), n_estimators)

    # Partition estimators between jobs
    n_estimators_per_job = (n_estimators // n_jobs) * np.ones(n_jobs,
                                                              dtype=int)
    n_estimators_per_job[:n_estimators % n_jobs] += 1
    starts = np.cumsum(n_estimators_per_job)

    return n_jobs, n_estimators_per_job.tolist(), [0] + starts.tolist()
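A quick check of the partitioning logic, assuming _get_n_jobs simply returns the requested count: 10 estimators over 4 jobs give per-job counts [3, 3, 2, 2] (the remainder goes to the first jobs) and start offsets [0, 3, 6, 8, 10].

n_jobs, counts, starts = _partition_estimators(n_estimators=10, n_jobs=4)
print(n_jobs, counts, starts)   # 4 [3, 3, 2, 2] [0, 3, 6, 8, 10]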
Example #5
    def _approximate_bound(self, X):
        """Calculate approximate log-likelihood for the model

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
            Document word matrix.

        Returns
        -------
        likelihood : float
            Approximate log-likelihood for the variational parameters.
        """
        likelihood = 0.0
        # calculate the per-document likelihood term
        n_jobs = _get_n_jobs(self.n_jobs)
        verbose = max(0, self.verbose - 1)
        with Parallel(n_jobs=n_jobs, verbose=verbose) as parallel:
            _, _, doc_likelihood = self._e_step(X,
                                                cal_sstats=False,
                                                cal_doc_distr=False,
                                                cal_likelihood=True,
                                                parallel=parallel)
        likelihood += doc_likelihood

        # E[log(p(beta|eta)) - log(q(beta|lambda))]
        # `beta` is a Dirichlet distribution
        lambda_ = self.lambda_
        elog_beta_ = self.elog_beta_
        n_features = lambda_.shape[1]
        likelihood += np.sum((self.eta - lambda_) * elog_beta_)
        likelihood += np.sum(gammaln(lambda_) - gammaln(self.eta))
        likelihood += np.sum(gammaln(self.eta * n_features) -
                             gammaln(np.sum(lambda_, 1)))

        # E[log(p(v_k|omega)) - log(q(v_k|a_k))]
        # `v_k` is a Beta distribution
        v_k = self.v_stick_
        likelihood += (v_k.shape[1] * np.log(self.omega))
        v_k_col_sum = np.sum(v_k, 0)
        dig_sum = psi(v_k_col_sum)
        likelihood += np.sum(
            (np.array([1.0, self.omega])[:, np.newaxis] - v_k) *
            (psi(v_k) - dig_sum))
        likelihood += np.sum(gammaln(v_k))
        likelihood -= np.sum(gammaln(v_k_col_sum))
        return likelihood
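For reference, the Dirichlet block above (the three `likelihood +=` lines after the `beta` comment) computes the standard variational term for a Dirichlet posterior q(beta_k | lambda_k) against a symmetric prior with scalar concentration eta; a sketch with V = n_features:

\mathbb{E}_q\left[\log p(\beta \mid \eta) - \log q(\beta \mid \lambda)\right]
    = \sum_k \Big[ \sum_w (\eta - \lambda_{kw})\, \mathbb{E}_q[\log \beta_{kw}]
      + \sum_w \big( \log\Gamma(\lambda_{kw}) - \log\Gamma(\eta) \big)
      + \log\Gamma(V\eta) - \log\Gamma\big( \textstyle\sum_w \lambda_{kw} \big) \Big]

where \mathbb{E}_q[\log \beta_{kw}] = \psi(\lambda_{kw}) - \psi(\sum_{w'} \lambda_{kw'}) is the `elog_beta_` array, and \Gamma, \psi map to scipy's `gammaln` and `psi`.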
Example #6
    def get_neighbors(self, X=None, n_neighbors=None, return_distance=True):
        """Query the fitted tree for the k nearest neighbors of each point.

        If ``X`` is None, the training data is used as the query set.
        """
        if n_neighbors is None:
            n_neighbors = self.n_neighbors
        if X is None:
            X = self._fit_X
        else:
            X = check_array(X, accept_sparse='csr')
        n_jobs = _get_n_jobs(self.n_jobs)

        # Query the tree in parallel over even slices of the query set and
        # stack the per-slice results back together.
        result = Parallel(n_jobs, backend='threading')(
            delayed(self._tree.query, check_pickle=False)(
                X[s], n_neighbors, return_distance)
            for s in gen_even_slices(X.shape[0], n_jobs)
        )
        if return_distance:
            dist, neigh_ind = tuple(zip(*result))
            result = np.vstack(dist), np.vstack(neigh_ind)
        else:
            result = np.vstack(result)
        return result
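A self-contained sketch of the slice-query-stack pattern used above, with a plain scikit-learn KDTree standing in for the fitted self._tree (modern joblib drops the check_pickle argument to delayed):

import numpy as np
from joblib import Parallel, delayed
from sklearn.neighbors import KDTree
from sklearn.utils import gen_even_slices

rng = np.random.RandomState(0)
tree = KDTree(rng.rand(100, 3))            # stands in for self._tree
X_query, n_jobs = rng.rand(10, 3), 2

results = Parallel(n_jobs, backend='threading')(
    delayed(tree.query)(X_query[s], k=3, return_distance=True)
    for s in gen_even_slices(X_query.shape[0], n_jobs))
dist, ind = (np.vstack(r) for r in zip(*results))
print(dist.shape, ind.shape)               # (10, 3) (10, 3)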
Example #7
    def partial_fit(self, X, y=None):
        """Online VB with Mini-Batch update.

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
            Document word matrix.

        y : Ignored.

        Returns
        -------
        self
        """
        self._check_params()
        X = self._check_non_neg_array(X,
                                      "LatentDirichletAllocation.partial_fit")
        n_samples, n_features = X.shape
        batch_size = self.batch_size

        # initialize parameters or check
        if not hasattr(self, 'components_'):
            self._init_latent_vars(n_features)

        if n_features != self.components_.shape[1]:
            raise ValueError("The provided data has %d dimensions while "
                             "the model was trained with feature size %d." %
                             (n_features, self.components_.shape[1]))

        n_jobs = _get_n_jobs(self.n_jobs)
        with Parallel(n_jobs=n_jobs,
                      verbose=max(0, self.verbose - 1)) as parallel:
            for idx_slice in gen_batches(n_samples, batch_size):
                self._em_step(X[idx_slice, :],
                              total_samples=self.total_samples,
                              batch_update=False,
                              parallel=parallel)

        return self
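A minimal usage sketch of the mini-batch path on synthetic data, assuming the public scikit-learn estimator (recent releases name the topic count n_components; the snippets above predate that rename):

import numpy as np
from sklearn.decomposition import LatentDirichletAllocation

rng = np.random.RandomState(0)
X_stream = rng.poisson(0.5, size=(1000, 200))    # synthetic doc-word counts

lda = LatentDirichletAllocation(n_components=10, batch_size=128,
                                total_samples=1000)
for batch in np.array_split(X_stream, 5):
    lda.partial_fit(batch)                       # one online VB pass each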
Example #8
    def fit(self, X, y=None):
        """Learn model for the data X

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
            Document word matrix.

        y : Ignored.

        Returns
        -------
        self

        """
        self._check_params()
        X = self._check_non_neg_array(
            X, "HierarchicalDirichletProcess.fit")
        self._init_global_latent_vars(*X.shape)

        n_jobs = _get_n_jobs(self.n_jobs)
        verbose = max(0, self.verbose - 1)
        evaluate_every = self.evaluate_every
        with Parallel(n_jobs=n_jobs, verbose=verbose) as parallel:
            for i in range(self.max_iter):
                # batch update
                _, sstats, _ = self._e_step(X,
                                            cal_sstats=True,
                                            cal_doc_distr=False,
                                            cal_likelihood=False,
                                            parallel=parallel)
                self._m_step(sstats, n_samples=X.shape[0], online_update=False)

                # check perplexity
                if evaluate_every > 0 and (i + 1) % evaluate_every == 0:
                    bound = self.score(X)
                    if self.verbose:
                        print('iteration: %d, ELBO: %.4f' % (i + 1, bound))
                self.n_iter_ += 1
        return self
Example #9
    def fit(self, X, y=None):
        """Learn model for the data X with variational Bayes method.

        When `learning_method` is 'online', use mini-batch update.
        Otherwise, use batch update.

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
            Document word matrix.

        y : Ignored.

        Returns
        -------
        self
        """
        self._check_params()
        X = self._check_non_neg_array(X, "LatentDirichletAllocation.fit")
        n_samples, n_features = X.shape
        max_iter = self.max_iter
        evaluate_every = self.evaluate_every
        learning_method = self.learning_method
        if learning_method is None:
            warnings.warn(
                "The default value for 'learning_method' will be "
                "changed from 'online' to 'batch' in the release "
                "0.20. This warning was introduced in 0.18.",
                DeprecationWarning)
            learning_method = 'online'

        batch_size = self.batch_size

        # initialize parameters
        self._init_latent_vars(n_features)
        # change to perplexity later
        last_bound = None
        n_jobs = _get_n_jobs(self.n_jobs)
        with Parallel(n_jobs=n_jobs,
                      verbose=max(0, self.verbose - 1)) as parallel:
            for i in range(max_iter):
                if learning_method == 'online':
                    for idx_slice in gen_batches(n_samples, batch_size):
                        self._em_step(X[idx_slice, :],
                                      total_samples=n_samples,
                                      batch_update=False,
                                      parallel=parallel)
                else:
                    # batch update
                    self._em_step(X,
                                  total_samples=n_samples,
                                  batch_update=True,
                                  parallel=parallel)

                # check perplexity
                if evaluate_every > 0 and (i + 1) % evaluate_every == 0:
                    doc_topics_distr, _ = self._e_step(X,
                                                       cal_sstats=False,
                                                       random_init=False,
                                                       parallel=parallel)
                    bound = self._perplexity_precomp_distr(X,
                                                           doc_topics_distr,
                                                           sub_sampling=False)
                    if self.verbose:
                        print(
                            'iteration: %d of max_iter: %d, perplexity: %.4f' %
                            (i + 1, max_iter, bound))

                    if last_bound and abs(last_bound - bound) < self.perp_tol:
                        break
                    last_bound = bound

                elif self.verbose:
                    print('iteration: %d of max_iter: %d' % (i + 1, max_iter))
                self.n_iter_ += 1

            # calculate final perplexity value on train set; this must stay
            # inside the ``with`` block so ``parallel`` is still active
            doc_topics_distr, _ = self._e_step(X,
                                               cal_sstats=False,
                                               random_init=False,
                                               parallel=parallel)
            self.bound_ = self._perplexity_precomp_distr(X,
                                                         doc_topics_distr,
                                                         sub_sampling=False)

        return self
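And the batch path, with perplexity checked every evaluate_every iterations and early stopping on perp_tol as in the loop above (same hedges about the public estimator as in the previous sketch):

import numpy as np
from sklearn.decomposition import LatentDirichletAllocation

X = np.random.RandomState(0).poisson(0.5, size=(500, 100))
lda = LatentDirichletAllocation(n_components=10, learning_method='batch',
                                max_iter=20, evaluate_every=5,
                                perp_tol=0.1, verbose=1)
lda.fit(X)           # prints iteration/perplexity progress
print(lda.bound_)    # final train-set perplexity stored by fit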
Example #10
    def partial_fit(self, X, y=None):
        """Online VB with mini-batch update.

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
            Document word matrix.

        y : Ignored.

        Returns
        -------
        doc_topic_distr : array, shape=(n_samples, n_topics)
            Normalized document topic distribution for X.
        """
        self._check_params()
        X = self._check_non_neg_array(X,
                                      "LatentDirichletAllocation.partial_fit")
        n_samples, n_features = X.shape
        batch_size = self.batch_size

        self.total_samples += n_samples

        # check that the model has already been initialized by a `fit` call
        if not hasattr(self, 'components_'):
            raise ValueError("partial_fit called before the model was "
                             "initialized; call 'fit' first.")

        if n_features != self.components_.shape[1]:
            raise ValueError("The provided data has %d dimensions while "
                             "the model was trained with feature size %d." %
                             (n_features, self.components_.shape[1]))

        n_jobs = _get_n_jobs(self.n_jobs)
        max_iter = self.partial_max_iter
        evaluate_every = self.partial_evaluate_every

        self.n_partial_iter_ = 0
        last_bound = None
        doc_topic_distr = None

        with Parallel(n_jobs=n_jobs,
                      verbose=max(0, self.verbose - 1)) as parallel:
            for i in range(max_iter):
                for idx_slice in gen_batches(n_samples, batch_size):
                    self._em_step(X[idx_slice, :],
                                  total_samples=self.total_samples,
                                  batch_update=False,
                                  parallel=parallel)

                # check perplexity
                if evaluate_every > 0 and (i + 1) % evaluate_every == 0:
                    doc_topic_distr, _ = self._e_step(X,
                                                      cal_sstats=False,
                                                      random_init=False,
                                                      parallel=parallel)
                    bound = self.perplexity(X,
                                            doc_topic_distr,
                                            sub_sampling=False)
                    if self.verbose:
                        print('iteration: %d, perplexity: %.4f' %
                              (i + 1, bound))

                    if last_bound and abs(last_bound - bound) < self.perp_tol:
                        break
                    last_bound = bound
                self.n_partial_iter_ += 1

        if doc_topic_distr is None:
            doc_topic_distr = self.transform(X)
        else:
            doc_topic_distr /= doc_topic_distr.sum(axis=1)[:, np.newaxis]
        return doc_topic_distr
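The final branch above turns the unnormalized gamma into a proper per-document distribution by row normalization; a standalone illustration:

import numpy as np

gamma = np.array([[2.0, 1.0, 1.0],
                  [1.0, 3.0, 0.0]])
gamma /= gamma.sum(axis=1)[:, np.newaxis]
print(gamma)   # rows now sum to 1: [[0.5, 0.25, 0.25], [0.25, 0.75, 0.0]]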
Example #11
    def kneighbors(self,
                   X=None,
                   E=None,
                   n_neighbors=None,
                   return_distance=True):  # IY: modified to account for std dev
        """Finds the K-neighbors of a point.

        Returns indices of and distances to the neighbors of each point.

        Parameters
        ----------
        X : array-like, shape (n_query, n_features), \
                or (n_query, n_indexed) if metric == 'precomputed'
            The query point or points.
            If not provided, neighbors of each indexed point are returned.
            In this case, the query point is not considered its own neighbor.

        E : array-like, optional
            Per-query standard deviations; forwarded as ``sigma`` to the
            distance computation below.

        n_neighbors : int
            Number of neighbors to get (default is the value
            passed to the constructor).

        return_distance : boolean, optional. Defaults to True.
            If False, distances will not be returned

        Returns
        -------
        dist : array
            Array representing the lengths to points, only present if
            return_distance=True

        ind : array
            Indices of the nearest points in the population matrix.

        Examples
        --------
        In the following example, we construct a NearestNeighbors
        instance from an array representing our data set and ask who is
        the closest point to [1, 1, 1].

        >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]
        >>> from sklearn.neighbors import NearestNeighbors
        >>> neigh = NearestNeighbors(n_neighbors=1)
        >>> neigh.fit(samples) # doctest: +ELLIPSIS
        NearestNeighbors(algorithm='auto', leaf_size=30, ...)
        >>> print(neigh.kneighbors([[1., 1., 1.]])) # doctest: +ELLIPSIS
        (array([[ 0.5]]), array([[2]]...))

        As you can see, it returns [[0.5]], and [[2]], which means that the
        element is at distance 0.5 and is the third element of samples
        (indexes start at 0). You can also query for multiple points:

        >>> X = [[0., 1., 0.], [1., 0., 1.]]
        >>> neigh.kneighbors(X, return_distance=False) # doctest: +ELLIPSIS
        array([[1],
               [2]]...)

        """
        if self._fit_method is None:  # IY: usually set to 'auto' in SOMPY
            raise NotFittedError("Must fit neighbors before querying.")

        if n_neighbors is None:
            n_neighbors = self.n_neighbors

        if X is not None:
            query_is_train = False
            X = check_array(X, accept_sparse='csr')
        else:
            query_is_train = True
            X = self._fit_X
            # Include an extra neighbor to account for the sample itself
            # being returned, which is removed later; the query point is
            # therefore not counted as its own neighbor.
            n_neighbors += 1
        train_size = self._fit_X.shape[0]
        if n_neighbors > train_size:
            raise ValueError("Expected n_neighbors <= n_samples, "
                             " but n_samples = %d, n_neighbors = %d" %
                             (train_size, n_neighbors))
        n_samples, _ = X.shape
        sample_range = np.arange(n_samples)[:, None]

        n_jobs = _get_n_jobs(self.n_jobs)  # IY: single core at the moment
        if self._fit_method == 'brute':
            # for efficiency, use squared euclidean distances
            if self.effective_metric_ == 'euclidean':
                dist = pairwise_chidistances(
                    X,
                    self._fit_X,
                    'euclidean',  # IY
                    n_jobs=n_jobs,
                    sigma=E,
                    squared=True)
            else:
                raise ValueError("kneighbor for project_realdata() works only"
                                 " with euclidean metric")

            neigh_ind = np.argpartition(dist, n_neighbors - 1, axis=1)
            neigh_ind = neigh_ind[:, :n_neighbors]
            # argpartition doesn't guarantee sorted order, so we sort again
            neigh_ind = neigh_ind[sample_range,
                                  np.argsort(dist[sample_range, neigh_ind])]

            if return_distance:
                result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind
            else:
                result = neigh_ind

        ## IY: only brute force at the moment...
        #elif self._fit_method in ['ball_tree', 'kd_tree']:
        #    if issparse(X):
        #        raise ValueError(
        #            "%s does not work with sparse matrices. Densify the data, "
        #            "or set algorithm='brute'" % self._fit_method)
        #    result = Parallel(n_jobs, backend='threading')(
        #        delayed(self._tree.query, check_pickle=False)(
        #            X[s], n_neighbors, return_distance)
        #        for s in gen_even_slices(X.shape[0], n_jobs)
        #    )
        #    if return_distance:
        #        dist, neigh_ind = tuple(zip(*result))
        #        result = np.vstack(dist), np.vstack(neigh_ind)
        #    else:
        #        result = np.vstack(result)
        else:
            raise ValueError(
                "only the brute-force algorithm is accepted as _fit_method")

        if not query_is_train:
            return result
        else:
            # If the query data is the same as the indexed data, we would
            # like to ignore the first nearest neighbor of every sample,
            # i.e. the sample itself.
            if return_distance:
                dist, neigh_ind = result
            else:
                neigh_ind = result

            sample_mask = neigh_ind != sample_range

            # Corner case: when the number of duplicates is more than the
            # number of neighbors, the first NN will not be the sample,
            # but a duplicate. In that case mask the first duplicate.
            dup_gr_nbrs = np.all(sample_mask, axis=1)
            sample_mask[:, 0][dup_gr_nbrs] = False

            neigh_ind = np.reshape(neigh_ind[sample_mask],
                                   (n_samples, n_neighbors - 1))

            if return_distance:
                dist = np.reshape(dist[sample_mask],
                                  (n_samples, n_neighbors - 1))
                return dist, neigh_ind
            return neigh_ind
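The brute-force branch relies on argpartition followed by an argsort restricted to the selected columns: argpartition moves the k smallest distances to the front of each row in arbitrary order, and the row-wise argsort then fixes their order. A standalone illustration:

import numpy as np

dist = np.array([[0.9, 0.1, 0.4, 0.3]])
k = 2
neigh_ind = np.argpartition(dist, k - 1, axis=1)[:, :k]   # unordered top-k
rows = np.arange(dist.shape[0])[:, None]
neigh_ind = neigh_ind[rows, np.argsort(dist[rows, neigh_ind])]
print(neigh_ind)   # [[1 3]] -> distances 0.1, 0.3 in sorted order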