def test_check_X():
    from sklearn.mixture.base import _check_X
    rng = np.random.RandomState(0)
    n_samples, n_components, n_features = 10, 2, 2

    X_bad_dim = rng.rand(n_components - 1, n_features)
    assert_raise_message(ValueError,
                         'Expected n_samples >= n_components '
                         'but got n_components = %d, n_samples = %d'
                         % (n_components, X_bad_dim.shape[0]),
                         _check_X, X_bad_dim, n_components)

    X_bad_dim = rng.rand(n_components, n_features + 1)
    assert_raise_message(ValueError,
                         'Expected the input data X have %d features, '
                         'but got %d features'
                         % (n_features, X_bad_dim.shape[1]),
                         _check_X, X_bad_dim, n_components, n_features)

    X = rng.rand(n_samples, n_features)
    assert_array_equal(X, _check_X(X, n_components, n_features))
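
# A small usage sketch of the _check_X helper exercised by the test above.
# The sklearn.mixture.base path matches the import in the test (in recent
# scikit-learn releases this private helper lives under a different module
# path, so treat the import as an assumption). Valid data passes through
# unchanged; too few samples or a feature-count mismatch raises ValueError.
import numpy as np
# from sklearn.mixture.base import _check_X

rng = np.random.RandomState(42)
X_ok = rng.rand(10, 2)
# X_checked = _check_X(X_ok, 2, 2)      # returns X_ok unchanged
# _check_X(rng.rand(1, 2), 2)           # raises ValueError: n_samples < n_components
# _check_X(rng.rand(10, 3), 2, 2)       # raises ValueError: wrong number of features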
def fit(self, X, y=None):
    """Fit the clustered linear regressor model to a training data set
    using the EM algorithm.

    It runs n_init instances of the algorithm and keeps the one with the
    highest complete log-likelihood. Each initialisation of the algorithm
    runs until convergence or for at most max_iter iterations. If
    warm_start is enabled, a single initialisation is used.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)

    y : array, shape (n_samples, 1)

    Returns
    -------
    self
    """
    # Check that all the data is well conditioned
    X = _check_X(X, self.n_components)
    n_samples, n_features = X.shape
    y = self._check_y(y, n_samples)
    self._check_initial_parameters(X)

    random_state = check_random_state(self.random_state)

    # Extended X with a column of ones for the bias terms:
    X_ext = np.concatenate((np.ones((n_samples, 1)), X), axis=1)

    # If warm_start is enabled, we will have a unique initialisation
    do_init = not (self.warm_start and hasattr(self, 'converged_'))
    n_init = self.n_init if do_init else 1

    max_lower_bound = -np.infty
    self.converged_ = False

    init = 0
    while init < n_init:
        self._print_verbose_msg_init_beg(init)

        if do_init:
            try:
                self._initialize(X, X_ext, y, random_state)
            except (ValueError, linalg.LinAlgError) as error:
                print("Bad conditions at init. Error type:")
                print(error)
                print("Please try a different initialisation strategy.")
                sys.exit(1)
            self.lower_bound_ = -np.infty

        init += 1

        for n_iter in range(self.max_iter):
            prev_lower_bound = self.lower_bound_

            # EM steps
            (log_sum_gamma, log_resp, log_mix_probabilities,
             log_reg_probabilities) = self._log_e_step_supervised(
                X, X_ext, y)
            self.log_resp_ = log_resp
            self.log_mix_probabilities_ = log_mix_probabilities
            self.log_reg_probabilities_ = log_reg_probabilities
            try:
                self._m_step_supervised(X, X_ext, y, np.exp(log_resp))
            except (ValueError, linalg.LinAlgError) as error:
                print("Bad conditions at execution. Error type:")
                print(error)
                print("Resetting initialisation {}.".format(init))
                init -= 1
                break

            # Compute the log-likelihood
            self.lower_bound_ = self._compute_log_lower_bound(
                log_sum_gamma)

            # Check convergence
            change = abs(self.lower_bound_ - prev_lower_bound)
            self._print_verbose_msg_iter_end(n_iter, change)

            if change < self.tol:
                self.converged_ = True
                break

        self._print_verbose_msg_init_end(self.lower_bound_)

        # If there is an improvement over the best initialisation so far,
        # save its parameters and labels
        if self.lower_bound_ > max_lower_bound:
            max_lower_bound = self.lower_bound_
            best_params = self._get_parameters()
            best_n_iter = n_iter
            self.labels_ = self.log_resp_.argmax(axis=1)
            self.X_labels_ = self.log_mix_probabilities_.argmax(axis=1)
            self.y_labels_ = self.log_reg_probabilities_.argmax(axis=1)

    if not self.converged_:
        warnings.warn('Initialization %d did not converge. '
                      'Try different init parameters, '
                      'or increase max_iter, tol '
                      'or check for degenerate data.'
                      % (init + 1), ConvergenceWarning)

    self._set_parameters(best_params)
    self.n_iter_ = best_n_iter

    return self
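
# A minimal usage sketch of the clustered linear regressor fit above. The
# class name ClusteredLinearRegression and its import are assumptions for
# illustration; only the fit() method is shown in this excerpt. The
# hyperparameter names (n_components, n_init, max_iter, tol) come from the
# attributes read inside fit(), and labels_, X_labels_, y_labels_ are the
# attributes it sets for the best initialisation.
import numpy as np
# from cluster_lin_reg import ClusteredLinearRegression  # hypothetical import

rng = np.random.RandomState(0)
n_samples = 200
X = rng.rand(n_samples, 2)
# two linear regimes, selected by the first feature
y = np.where(X[:, 0] < 0.5,
             3.0 * X[:, 1] + 1.0,
             -2.0 * X[:, 1] + 0.5).reshape(-1, 1)
y += 0.05 * rng.randn(n_samples, 1)

# clr = ClusteredLinearRegression(n_components=2, n_init=5,
#                                 max_iter=100, tol=1e-3)
# clr.fit(X, y)
# clr.labels_     # hard assignments from the complete responsibilities
# clr.X_labels_   # assignments based on the mixture (input) densities only
# clr.y_labels_   # assignments based on the regression (output) densities only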
def fit(self, X, y=None):
    # TODO: replace with the iterative formula using super()._e_step()
    # and the overridden ._m_step() (3.5)
    X = _check_X(X, self.n_components)
    self._check_initial_parameters(X)

    # if warm_start is enabled, we will have a unique initialisation
    do_init = not (self.warm_start and hasattr(self, 'converged_'))
    n_init = self.n_init if do_init else 1

    max_lower_bound = -np.infty
    self.converged_ = False

    random_state = check_random_state(self.random_state)

    n_samples, _ = X.shape
    for init in range(n_init):
        self._print_verbose_msg_init_beg(init)

        if do_init:
            self._initialize_parameters(X, random_state)
            self.lower_bound_ = -np.infty

        for n_iter in range(self.max_iter):
            # ARD EM
            prev_lower_bound = self.lower_bound_

            log_prob_norm, log_resp = self._e_step(X)
            resp = np.exp(log_resp)
            self._m_step(X, resp)
            # resp : array-like, shape (n_samples, n_components)
            #     The responsibilities for each data sample in X.

            # update covariance
            F = np.diag(1. / (np.dot(resp, self.weights_)**2))
            # F = (1./(np.diag(np.dot(resp, self.weights_)) + 1e-4)) ** 2
            # array-like, shape (n_samples, n_samples)
            A = np.diag(self.reg_weights_)
            # array-like, shape (n_components, n_components)
            H = np.dot(np.dot(resp.T, F), resp) + A
            S = np.vstack([np.diag(np.ones(self.n_components - 1)),
                           -1.0 * np.ones(self.n_components - 1)])
            Sigma = np.linalg.inv(np.dot(np.dot(S.T, H), S))
            idx = np.arange(self.n_components - 1)

            # update weights
            self.reg_weights_ = np.hstack([
                (1 - self.reg_weights_[idx] * Sigma[idx, idx])
                / (self.weights_[idx]**2),
                (1 - self.reg_weights_[-1] * Sigma.sum())
                / self.weights_[-1]**2
            ])

            # drop extra components; ravel() keeps keep_idx 1-D even when
            # only a single component survives
            keep_idx = np.argwhere(
                np.logical_and(self.reg_weights_ < self.alpha_bound,
                               self.weights_ > self.weight_bound)).ravel()
            if len(keep_idx) > 0:
                self.weights_ = self.weights_[keep_idx]
                self.reg_weights_ = self.reg_weights_[keep_idx]
                self.means_ = self.means_[keep_idx, :]
                if self.covariance_type != 'tied':
                    self.covariances_ = self.covariances_[keep_idx]
                    self.precisions_cholesky_ = self.precisions_cholesky_[
                        keep_idx]
                self.n_components = keep_idx.shape[0]

            # early stopping criterion
            self.lower_bound_ = self._compute_lower_bound(
                log_resp, log_prob_norm)

            change = self.lower_bound_ - prev_lower_bound
            self._print_verbose_msg_iter_end(n_iter, change)

            if abs(change) < self.tol and len(keep_idx) > 0:
                self.converged_ = True
                break

        self._print_verbose_msg_init_end(self.lower_bound_)

        if self.lower_bound_ > max_lower_bound:
            max_lower_bound = self.lower_bound_
            best_params = self._get_parameters()
            best_n_iter = n_iter

    if not self.converged_:
        warnings.warn('Initialization %d did not converge. '
                      'Try different init parameters, '
                      'or increase max_iter, tol '
                      'or check for degenerate data.'
                      % (init + 1), ConvergenceWarning)

    self._set_parameters(best_params)
    self.n_iter_ = best_n_iter

    return self
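
# A minimal usage sketch of the ARD-style fit above, which starts with a
# deliberately large n_components and prunes components whose relevance
# weight (reg_weights_) or mixing weight (weights_) crosses the bounds.
# The class name ARDGaussianMixture and the constructor signature are
# assumptions for illustration; alpha_bound and weight_bound are the
# attributes actually read inside fit().
import numpy as np
# from ard_gmm import ARDGaussianMixture  # hypothetical import

rng = np.random.RandomState(1)
# three well-separated 2-D blobs of 100 points each
X = np.vstack([rng.randn(100, 2) + offset
               for offset in ([0, 0], [6, 0], [0, 6])])

# start with far more components than clusters and let ARD prune them
# ard = ARDGaussianMixture(n_components=15, alpha_bound=1e3,
#                          weight_bound=1e-3, max_iter=200, tol=1e-3)
# ard.fit(X)
# ard.n_components     # reduced to the number of surviving components
# ard.weights_.shape   # matches the pruned component count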
def fit(self, X_arr, y=None):
    """Estimate model parameters with the EM algorithm.

    Modified to do co-training. The method fits the model `n_init` times
    and sets the parameters with which the model has the largest
    likelihood or lower bound. Within each trial, the method iterates
    between E-step and M-step for `max_iter` times until the change of
    likelihood or lower bound is less than `tol`; otherwise, a
    `ConvergenceWarning` is raised.

    Parameters
    ----------
    X_arr : array-like, shape (n_diff_data_sources,)
        List of data sources. Each element is array-like, shape
        (n_samples, n_features): a list of n_features-dimensional data
        points, one row per data point.

    Returns
    -------
    self
    """
    print("Fitting with co-training...")
    for i, X in enumerate(X_arr):
        X = _check_X(X, self.n_components)
        self._check_initial_parameters(X)
        X_arr[i] = X

    # if warm_start is enabled, we will have a unique initialisation
    do_init = not (self.warm_start and hasattr(self, 'converged_'))
    n_init = self.n_init if do_init else 1

    max_lower_bound = -np.infty
    self.converged_ = False

    random_state = check_random_state(self.random_state)

    n_samples, _ = X_arr[0].shape
    for init in range(n_init):
        self._print_verbose_msg_init_beg(init)

        if do_init:
            # note: X is the last data source from the check loop above
            self._initialize_parameters(X, random_state)
            self.lower_bound_ = -np.infty

        for n_iter in range(self.max_iter):
            # With co-training, each iteration goes through each data source
            prev_lower_bound = self.lower_bound_

            for X in X_arr:
                log_prob_norm, log_resp = self._e_step(X)
                self._m_step(X, log_resp)

            # use log_resp and log_prob_norm from the last data source
            self.lower_bound_ = self._compute_lower_bound(
                log_resp, log_prob_norm)

            change = self.lower_bound_ - prev_lower_bound
            self._print_verbose_msg_iter_end(n_iter, change)

            if abs(change) < self.tol:
                self.converged_ = True
                break

        self._print_verbose_msg_init_end(self.lower_bound_)

        if self.lower_bound_ > max_lower_bound:
            max_lower_bound = self.lower_bound_
            best_params = self._get_parameters()
            best_n_iter = n_iter

    if not self.converged_:
        warnings.warn('Initialization %d did not converge. '
                      'Try different init parameters, '
                      'or increase max_iter, tol '
                      'or check for degenerate data.'
                      % (init + 1), ConvergenceWarning)

    self._set_parameters(best_params)
    self.n_iter_ = best_n_iter

    return self
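
# A minimal usage sketch of the co-training fit above: two "views" of the
# same samples (e.g. different feature extractions) are passed as a list,
# and every EM iteration runs an E-step/M-step pass over each view in turn.
# The class name CoTrainingGaussianMixture is an assumption for
# illustration. Because a single set of mixture parameters is shared
# across views, the views must agree on n_samples and n_features.
import numpy as np
# from cotraining_gmm import CoTrainingGaussianMixture  # hypothetical import

rng = np.random.RandomState(2)
n_samples = 300
base = np.vstack([rng.randn(n_samples // 2, 2),
                  rng.randn(n_samples // 2, 2) + 5.0])
# two noisy views of the same underlying samples
X_view1 = base + 0.1 * rng.randn(n_samples, 2)
X_view2 = base + 0.1 * rng.randn(n_samples, 2)

# gmm = CoTrainingGaussianMixture(n_components=2, n_init=3)
# gmm.fit([X_view1, X_view2])
# gmm.predict(X_view1)   # if the class inherits predict() from GaussianMixture,
#                        # as the _e_step/_m_step helper calls suggest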