def test_check_X():
    from sklearn.mixture.base import _check_X
    rng = np.random.RandomState(0)

    n_samples, n_components, n_features = 10, 2, 2

    X_bad_dim = rng.rand(n_components - 1, n_features)
    assert_raise_message(ValueError,
                         'Expected n_samples >= n_components '
                         'but got n_components = %d, n_samples = %d'
                         % (n_components, X_bad_dim.shape[0]),
                         _check_X, X_bad_dim, n_components)

    X_bad_dim = rng.rand(n_components, n_features + 1)
    assert_raise_message(ValueError,
                         'Expected the input data X have %d features, '
                         'but got %d features'
                         % (n_features, X_bad_dim.shape[1]),
                         _check_X, X_bad_dim, n_components, n_features)

    X = rng.rand(n_samples, n_features)
    assert_array_equal(X, _check_X(X, n_components, n_features))
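For orientation, a minimal usage sketch of _check_X outside the test harness. It assumes the same pre-0.22 sklearn.mixture.base layout imported above, and the argument order follows the calls in the test:

import numpy as np
from sklearn.mixture.base import _check_X

rng = np.random.RandomState(0)
X = rng.rand(10, 2)  # 10 samples, 2 features

# Returns the validated array; raises ValueError when n_samples < n_components
# or when the number of features differs from the expected one.
X_checked = _check_X(X, 2, 2)
print(X_checked.shape)  # (10, 2)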
Example #3
    def fit(self, X, y=None):
        """Fit the clustered linear regressor model for a training 
        data set using the EM algorithm.
        It does n_init instances of the algorithm and keeps the one with the
        highest complete log-likelyhood.
        Each initialization of the algorithm runs until convergence or max_iter
        times.
        If we enable warm_start, we will have a unique initialisation.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)

        y : array, shape (n_samples, 1)

        Returns
        -------
        self
        """
        # Check that all the data is well conditioned
        X = _check_X(X, self.n_components)
        n_samples, n_features = X.shape
        y = self._check_y(y, n_samples)
        self._check_initial_parameters(X)
        random_state = check_random_state(self.random_state)

        # Extended X with a column of ones for the bias terms:
        X_ext = np.concatenate((np.ones((n_samples, 1)), X), axis=1)

        # If we enable warm_start, we will have a unique initialisation
        do_init = not (self.warm_start and hasattr(self, 'converged_'))
        n_init = self.n_init if do_init else 1

        max_lower_bound = -np.infty
        self.converged_ = False

        init = 0
        while init < n_init:
            self._print_verbose_msg_init_beg(init)
            if do_init:
                try:
                    self._initialize(X, X_ext, y, random_state)
                except (ValueError, linalg.LinAlgError) as error:
                    print("Bad conditions at init. Error type:")
                    print(error)
                    print("Please try a different initialisation strategy.")
                    sys.exit()
                self.lower_bound_ = -np.infty
            init += 1

            for n_iter in range(self.max_iter):
                prev_lower_bound = self.lower_bound_

                # EM steps
                (log_sum_gamma, log_resp, log_mix_probabilities,
                 log_reg_probabilities) = self._log_e_step_supervised(
                     X, X_ext, y)
                self.log_resp_ = log_resp
                self.log_mix_probabilities_ = log_mix_probabilities
                self.log_reg_probabilities_ = log_reg_probabilities
                try:
                    self._m_step_supervised(X, X_ext, y, np.exp(log_resp))
                except (ValueError, linalg.LinAlgError) as error:
                    print("Bad conditions at execution. Error type:")
                    print(error)
                    print("Resetting initialisation {}.".format(init))
                    init -= 1
                    break

                # Compute log likelihood
                self.lower_bound_ = self._compute_log_lower_bound(
                    log_sum_gamma)

                # Check convergence
                change = abs(self.lower_bound_ - prev_lower_bound)
                self._print_verbose_msg_iter_end(n_iter, change)

                if change < self.tol:
                    self.converged_ = True
                    break

            self._print_verbose_msg_init_end(self.lower_bound_)

            # If there is an improvement over the last best initialization, save data
            if self.lower_bound_ > max_lower_bound:
                max_lower_bound = self.lower_bound_
                best_params = self._get_parameters()
                best_n_iter = n_iter
                self.labels_ = self.log_resp_.argmax(axis=1)
                self.X_labels_ = self.log_mix_probabilities_.argmax(axis=1)
                self.y_labels_ = self.log_reg_probabilities_.argmax(axis=1)

        if not self.converged_:
            warnings.warn(
                'Initialization %d did not converge. '
                'Try different init parameters, '
                'or increase max_iter, tol '
                'or check for degenerate data.' % (init + 1),
                ConvergenceWarning)

        self._set_parameters(best_params)
        self.n_iter_ = best_n_iter

        return self
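The restart logic in the fit method above is the usual best-of-n_init EM pattern: run the algorithm from several initialisations and keep the parameters with the highest lower bound. Below is a minimal, self-contained sketch of that pattern, with a toy stand-in for a single EM run rather than the estimator's actual E/M steps:

import numpy as np

def em_best_of_n_init(run_one_em, n_init=3, seed=0):
    """Run `run_one_em(rng)` n_init times and keep the run with the
    highest lower bound, mirroring the restart loop in fit() above."""
    rng = np.random.RandomState(seed)
    max_lower_bound = -np.inf
    best_params = None
    for _ in range(n_init):
        params, lower_bound = run_one_em(rng)
        if lower_bound > max_lower_bound:
            max_lower_bound = lower_bound
            best_params = params
    return best_params, max_lower_bound

# Toy stand-in for one EM run: random "parameters" and a random lower bound.
params, lb = em_best_of_n_init(lambda rng: (rng.rand(2), rng.rand()))
print(lb)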
Example #4
    def fit(
        self,
        X,
        y=None
    ):  # TODO: replace with an iterative formula using super()._e_step() and the overridden ._m_step() (3.5)
        X = _check_X(X, self.n_components)
        self._check_initial_parameters(X)

        # if we enable warm_start, we will have a unique initialisation
        do_init = not (self.warm_start and hasattr(self, 'converged_'))
        n_init = self.n_init if do_init else 1

        max_lower_bound = -np.infty
        self.converged_ = False

        random_state = check_random_state(self.random_state)

        n_samples, _ = X.shape
        for init in range(n_init):
            self._print_verbose_msg_init_beg(init)

            if do_init:
                self._initialize_parameters(X, random_state)
                self.lower_bound_ = -np.infty

            for n_iter in range(self.max_iter):
                # ARD EM
                prev_lower_bound = self.lower_bound_
                log_prob_norm, log_resp = self._e_step(X)
                resp = np.exp(log_resp)
                self._m_step(X, resp)

                # resp : array-like, shape (n_samples, n_components)
                # The responsibilities for each data sample in X.

                # update covariance
                F = np.diag(1. / (np.dot(resp, self.weights_)**2))
                # F = (1./(np.diag(np.dot(resp, self.weights_)) + 1e-4)) ** 2 # array-like, shape (n_samples, n_samples)
                A = np.diag(self.reg_weights_
                            )  # array-like, shape (n_components, n_components)
                H = np.dot(np.dot(resp.T, F), resp) + A
                S = np.vstack([
                    np.diag(np.ones(self.n_components - 1)),
                    -1.0 * np.ones(self.n_components - 1)
                ])
                Sigma = np.linalg.inv(np.dot(np.dot(S.T, H), S))
                idx = np.arange(self.n_components - 1)

                # update weights
                self.reg_weights_ = np.hstack([
                    (1 - self.reg_weights_[idx] * Sigma[idx, idx]) /
                    (self.weights_[idx]**2),
                    (1 - self.reg_weights_[-1] * Sigma.sum()) /
                    self.weights_[-1]**2
                ])

                # drop extra components
                # flatnonzero returns a 1-d index array even when exactly one
                # component satisfies the condition, so len(keep_idx) is safe
                keep_idx = np.flatnonzero(
                    np.logical_and(
                        self.reg_weights_ < self.alpha_bound,
                        self.weights_ > self.weight_bound))
                if len(keep_idx) > 0:
                    self.weights_ = self.weights_[keep_idx]
                    self.reg_weights_ = self.reg_weights_[keep_idx]
                    self.means_ = self.means_[keep_idx, :]
                    if self.covariance_type != 'tied':
                        self.covariances_ = self.covariances_[keep_idx]
                        self.precisions_cholesky_ = self.precisions_cholesky_[
                            keep_idx]
                    self.n_components = keep_idx.shape[0]

                # early stopping criterion
                self.lower_bound_ = self._compute_lower_bound(
                    log_resp, log_prob_norm)
                change = self.lower_bound_ - prev_lower_bound
                self._print_verbose_msg_iter_end(n_iter, change)
                if abs(change) < self.tol and len(keep_idx) > 0:
                    self.converged_ = True
                    break

            self._print_verbose_msg_init_end(self.lower_bound_)

            if self.lower_bound_ > max_lower_bound:
                max_lower_bound = self.lower_bound_
                best_params = self._get_parameters()
                best_n_iter = n_iter

        if not self.converged_:
            warnings.warn(
                'Initialization %d did not converge. '
                'Try different init parameters, '
                'or increase max_iter, tol '
                'or check for degenerate data.' % (init + 1),
                ConvergenceWarning)

        self._set_parameters(best_params)
        self.n_iter_ = best_n_iter

        return self
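To make the pruning condition in the ARD loop concrete, here is a toy run with made-up numbers: a component is kept only when its ARD weight stays below alpha_bound and its mixture weight stays above weight_bound. np.flatnonzero returns a 1-d index array, so len() on the result is always valid:

import numpy as np

# Illustrative values only: ARD weights, mixture weights, and bounds
reg_weights = np.array([0.5, 80.0, 2.0])   # large ARD weight => irrelevant component
weights = np.array([0.40, 0.01, 0.59])     # tiny mixture weight => negligible component
alpha_bound, weight_bound = 50.0, 0.02

# Keep components that are both relevant and non-negligible
keep_idx = np.flatnonzero((reg_weights < alpha_bound) &
                          (weights > weight_bound))
print(keep_idx)  # [0 2]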
Example #5
    def fit(self, X_arr, y=None):
        """Estimate model parameters with the EM algorithm.
    
        Modified to do cotraining.

        The method fit the model `n_init` times and set the parameters with
        which the model has the largest likelihood or lower bound. Within each
        trial, the method iterates between E-step and M-step for `max_iter`
        times until the change of likelihood or lower bound is less than
        `tol`, otherwise, a `ConvergenceWarning` is raised.

        Parameters
        ----------
        X_arr : list of array-like, length n_diff_data_sources
            Each element is an array-like of shape (n_samples, n_features)
            containing n_features-dimensional data points; each row
            corresponds to a single data point.

        Returns
        -------
        self
        """

        print("Fitting with co-training...")

        for i, X in enumerate(X_arr):
            X = _check_X(X, self.n_components)
            self._check_initial_parameters(X)
            X_arr[i] = X

        # if we enable warm_start, we will have a unique initialisation
        do_init = not (self.warm_start and hasattr(self, 'converged_'))
        n_init = self.n_init if do_init else 1

        max_lower_bound = -np.infty
        self.converged_ = False

        random_state = check_random_state(self.random_state)

        n_samples, _ = X_arr[0].shape
        for init in range(n_init):
            self._print_verbose_msg_init_beg(init)

            if do_init:
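                # NOTE: X here is the last data source checked in the loop
                # above (X_arr[-1]); initial parameters come from that view only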
                self._initialize_parameters(X, random_state)
                self.lower_bound_ = -np.infty

            for n_iter in range(self.max_iter):
                # With co-training, each iteration goes through each data source

                prev_lower_bound = self.lower_bound_

                for X in X_arr:
                    log_prob_norm, log_resp = self._e_step(X)
                    self._m_step(X, log_resp)

                # use log_resp and log_prob_norm from last data src
                self.lower_bound_ = self._compute_lower_bound(
                    log_resp, log_prob_norm)

                change = self.lower_bound_ - prev_lower_bound
                self._print_verbose_msg_iter_end(n_iter, change)

                if abs(change) < self.tol:
                    self.converged_ = True
                    break

            self._print_verbose_msg_init_end(self.lower_bound_)

            if self.lower_bound_ > max_lower_bound:
                max_lower_bound = self.lower_bound_
                best_params = self._get_parameters()
                best_n_iter = n_iter

        if not self.converged_:
            warnings.warn(
                'Initialization %d did not converge. '
                'Try different init parameters, '
                'or increase max_iter, tol '
                'or check for degenerate data.' % (init + 1),
                ConvergenceWarning)

        self._set_parameters(best_params)
        self.n_iter_ = best_n_iter

        return self
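As a hedged illustration of what a valid X_arr looks like for this co-training fit, a short preparation sketch follows. It reuses the sklearn.mixture.base import from the examples above, and the shapes are made up. Because the mixture parameters are shared across data sources inside fit, every view must expose the same number of features:

import numpy as np
from sklearn.mixture.base import _check_X

rng = np.random.RandomState(0)
n_components = 2

# Two views of the same 200 samples; identical feature counts are required
# because means_, covariances_, etc. are shared across the views.
X_arr = [rng.randn(200, 3), rng.randn(200, 3) + 1.0]

# Same per-source validation loop as at the top of fit()
for i, X in enumerate(X_arr):
    X_arr[i] = _check_X(X, n_components)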