def _init_random_gaussians(self, X):
    """Initialise the mixture with ``self.k`` randomly placed Gaussians.

    Sets ``self.priors`` to a uniform vector of length ``self.k`` and appends
    one ``{"mean": ..., "cov": ...}`` dict per component to
    ``self.parameters``. Each mean is the entry of ``X`` at a randomly drawn
    sample index; every covariance starts as the covariance of all of ``X``.

    Args:
        X: Data whose first axis enumerates the samples.
    """
    n_samples = np.shape(X)[0]
    self.priors = (1 / self.k) * np.ones(self.k)

    # The covariance of the full data set is the same for every component, so
    # it is computed once instead of once per loop iteration.
    data_cov = calculate_covariance_matrix(X)

    for _ in range(self.k):
        self.parameters.append(
            {
                "mean": X[np.random.choice(range(n_samples))],
                # Each component gets its own copy so that an in-place update
                # of one covariance can never leak into another.
                # NOTE(review): assumes the helper returns a NumPy array
                # (anything exposing ``.copy()``) -- confirm.
                "cov": data_cov.copy(),
            }
        )
def fit(self, X, y):
    """Compute the two-class projection vector and store it in ``self.w``.

    Samples where ``y == 0`` form the first class and samples where
    ``y == 1`` form the second. The stored vector is the pseudo-inverse of
    the summed per-class covariance applied to the difference of the class
    means.

    Args:
        X: Sample data, indexable by a boolean mask over its first axis.
        y: Labels compared against 0 and 1 to split ``X``.
    """
    # Split the samples into the two classes.
    first_class, second_class = X[y == 0], X[y == 1]

    # Total covariance is the sum of the two per-class covariances.
    summed_cov = (
        calculate_covariance_matrix(first_class)
        + calculate_covariance_matrix(second_class)
    )

    # Offset between the class means, forced to be at least 1-D.
    centroid_gap = np.atleast_1d(
        first_class.mean(axis=0) - second_class.mean(axis=0)
    )

    # Direction onto which projecting X best separates the classes:
    #   w = (mean1 - mean2) / (cov1 + cov2)
    # The pseudo-inverse is used so a singular covariance sum is tolerated.
    self.w = np.linalg.pinv(summed_cov).dot(centroid_gap)
def _transform(self, X, dim):
    """Project ``X`` onto the ``dim`` leading eigenvectors of its covariance.

    Args:
        X: Data matrix; its covariance is obtained from
            ``calculate_covariance_matrix(X)``.
        dim: Number of leading eigenvectors (largest eigenvalues first) to
            keep.

    Returns:
        ``X.dot(V)`` where the columns of ``V`` are the selected
        eigenvectors.
    """
    covariance = calculate_covariance_matrix(X)

    # ``eigh`` is the solver for symmetric matrices: it always returns real
    # eigenvalues and orthonormal eigenvectors. The general ``eig`` solver can
    # return complex output from round-off alone, which would corrupt both the
    # sort below and the projection.
    # NOTE(review): assumes the helper returns a real symmetric matrix --
    # confirm.
    eigenvalues, eigenvectors = np.linalg.eigh(covariance)

    # Column indices of the eigenvectors, largest eigenvalue first.
    leading = np.argsort(eigenvalues)[::-1][:dim]

    # Project the data onto the principal components.
    return X.dot(eigenvectors[:, leading])
def transform(self, X, n_components):
    """Project ``X`` onto its first ``n_components`` principal directions.

    The directions are the eigenvectors of ``calculate_covariance_matrix(X)``
    ordered from largest to smallest eigenvalue.

    Args:
        X: Data matrix to project.
        n_components: How many of the leading eigenvectors to keep.

    Returns:
        The product of ``X`` with the matrix whose columns are the kept
        eigenvectors.
    """
    covariance = calculate_covariance_matrix(X)

    # Eigen-decomposition; column ``eigenvectors[:, i]`` belongs to
    # ``eigenvalues[i]``. ``eigh`` is used instead of ``eig`` because it is
    # the symmetric-matrix solver and guarantees real eigenvalues, whereas
    # ``eig`` may return complex pairs caused purely by floating-point noise.
    # NOTE(review): assumes the helper returns a real symmetric matrix --
    # confirm.
    eigenvalues, eigenvectors = np.linalg.eigh(covariance)

    # Order from largest to smallest eigenvalue, keep the first n_components.
    order = eigenvalues.argsort()[::-1]
    components = eigenvectors[:, order][:, :n_components]

    # Project the data onto the principal components.
    return X.dot(components)
def _calculate_scatter_matrices(self, X, y):
    """Compute the within-class and between-class scatter matrices.

    Args:
        X: Two-dimensional data with samples along the first axis and
            features along the second.
        y: Labels; each distinct value selects one class via ``X[y == label]``.

    Returns:
        A tuple ``(SW, SB)`` of ``(n_features, n_features)`` arrays:

        * ``SW`` -- sum over classes of
          ``(n_class - 1) * calculate_covariance_matrix(X_class)``.
        * ``SB`` -- sum over classes of
          ``n_class * outer(mean_class - total_mean, mean_class - total_mean)``.
    """
    n_features = np.shape(X)[1]
    labels = np.unique(y)
    total_mean = np.mean(X, axis=0)

    # Accumulators must start at zero; ``np.empty`` would leave arbitrary
    # memory contents that then get added into the result.
    SW = np.zeros((n_features, n_features))
    SB = np.zeros((n_features, n_features))

    for label in labels:
        _X = X[y == label]

        # Within class scatter:
        #   SW = sum{ (X_for_class - mean_of_X_for_class)^2 }
        # NOTE(review): this equals the class scatter only if the helper
        # normalises by (n - 1) -- confirm.
        SW += (len(_X) - 1) * calculate_covariance_matrix(_X)

        # Between class scatter:
        #   SB = sum{ n_samples_for_class * (mean_for_class - total_mean)^2 }
        # The square is an outer product. ``v.dot(v.T)`` on a 1-D vector is
        # the scalar inner product, which would be broadcast over the matrix.
        mean_offset = np.mean(_X, axis=0) - total_mean
        SB += len(_X) * np.outer(mean_offset, mean_offset)

    return SW, SB