def _cov(X, shrinkage=None):
    """Estimate covariance matrix (using optional shrinkage).

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Input data.
    shrinkage : string or float, optional
        Shrinkage parameter, possible values:
          - None or 'empirical': no shrinkage (default).
          - 'auto': automatic shrinkage using the Ledoit-Wolf lemma.
          - float between 0 and 1: fixed shrinkage parameter.

    Returns
    -------
    s : array, shape (n_features, n_features)
        Estimated covariance matrix.
    """
    shrinkage = "empirical" if shrinkage is None else shrinkage
    if isinstance(shrinkage, str):
        if shrinkage == 'auto':
            sc = StandardScaler()  # standardize features
            X = sc.fit_transform(X)
            s = ledoit_wolf(X)[0]
            # rescale back to the original feature scale
            s = sc.scale_[:, np.newaxis] * s * sc.scale_[np.newaxis, :]
        elif shrinkage == 'empirical':
            s = empirical_covariance(X)
        else:
            raise ValueError('unknown shrinkage parameter')
    elif isinstance(shrinkage, (int, float)):
        if shrinkage < 0 or shrinkage > 1:
            raise ValueError('shrinkage parameter must be between 0 and 1')
        s = shrunk_covariance(empirical_covariance(X), shrinkage)
    else:
        raise TypeError('shrinkage must be a string or a number')
    return s
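
# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of calling _cov, assuming the sklearn helpers it relies on
# (StandardScaler, ledoit_wolf, empirical_covariance, shrunk_covariance) and
# numpy are imported at module level as below. The data is synthetic.
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.covariance import (empirical_covariance, ledoit_wolf,
                                shrunk_covariance)

rng = np.random.RandomState(0)
X_demo = rng.randn(50, 4)  # 50 samples, 4 features

s_emp = _cov(X_demo)                     # empirical MLE, no shrinkage
s_auto = _cov(X_demo, shrinkage='auto')  # Ledoit-Wolf on standardized data
s_fixed = _cov(X_demo, shrinkage=0.2)    # fixed shrinkage coefficient
assert s_emp.shape == s_auto.shape == s_fixed.shape == (4, 4)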
def test_covariance():
    """Tests Covariance module on a simple dataset."""
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    assert_array_almost_equal(empirical_covariance(X), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), norm='spectral'), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), norm='frobenius'), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), scaling=False), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), squared=False), 0)

    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    assert np.amax(mahal_dist) < 250
    assert np.amin(mahal_dist) > 50

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0)

    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)
def test_shrunk_covariance():
    """Tests ShrunkCovariance module on a simple dataset."""
    # compare shrunk covariance obtained from data and from MLE estimate
    cov = ShrunkCovariance(shrinkage=0.5)
    cov.fit(X)
    assert_array_almost_equal(
        shrunk_covariance(empirical_covariance(X), shrinkage=0.5),
        cov.covariance_, 4)

    # same test with shrinkage not provided
    cov = ShrunkCovariance()
    cov.fit(X)
    assert_array_almost_equal(
        shrunk_covariance(empirical_covariance(X)), cov.covariance_, 4)

    # same test with shrinkage = 0 (<==> empirical_covariance)
    cov = ShrunkCovariance(shrinkage=0.)
    cov.fit(X)
    assert_array_almost_equal(empirical_covariance(X), cov.covariance_, 4)

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = ShrunkCovariance(shrinkage=0.3)
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)

    # test shrinkage coeff on a simple data set (without saving precision)
    cov = ShrunkCovariance(shrinkage=0.5, store_precision=False)
    cov.fit(X)
    assert cov.precision_ is None
def _nonrobust_covariance(self, data, assume_centered=False):
    """Non-robust estimation of the covariance to be used within MCD.

    Parameters
    ----------
    data: array_like, shape (n_samples, n_features)
        Data for which to compute the non-robust covariance matrix.
    assume_centered: bool
        Whether or not the observations should be considered as centered.

    Returns
    -------
    nonrobust_covariance: array_like, shape (n_features, n_features)
        The non-robust covariance of the data.
    """
    try:
        cov, prec = graph_lasso(
            empirical_covariance(data, assume_centered=assume_centered),
            self.shrinkage)
    except Exception:
        # graph_lasso can fail on ill-conditioned empirical covariances;
        # regularize the diagonal and retry.
        print(" > Exception!")
        emp_cov = empirical_covariance(
            data, assume_centered=assume_centered)
        emp_cov.flat[::data.shape[1] + 1] += 1e-06
        cov, prec = graph_lasso(emp_cov, self.shrinkage)
    return cov
def flgl_path(X_train, links=None, etas=[0.1], mus=[0.1], X_test=None,
              tol=1e-3, max_iter=200, update_rho=False, verbose=0,
              score='ebic', random_state=None):
    # scores are only computed when a test set is provided; building the
    # EBIC partials requires X_test, so guard against X_test being None
    if X_test is not None:
        score_func = {'likelihood': log_likelihood,
                      'bic': BIC,
                      'ebic': partial(EBIC, n=X_test.shape[0]),
                      'ebicm': partial(EBIC_m, n=X_test.shape[0])}
        try:
            score_func = score_func[score]
        except KeyError:
            warnings.warn("The score type passed is not available, "
                          "using log likelihood.")
            score_func = log_likelihood

    emp_cov = empirical_covariance(X_train)
    covariance_ = emp_cov.copy()
    covariances_ = list()
    precisions_ = list()
    hiddens_ = list()
    scores_ = list()
    if X_test is not None:
        test_emp_cov = empirical_covariance(X_test)

    for eta in etas:
        for mu in mus:
            try:
                # Capture the errors, and move on
                cov_, prec_, hid_, _ = two_layers_fixed_links_GL(
                    emp_cov, links, mu, eta, max_iter=max_iter,
                    random_state=random_state, return_n_iter=False)
                covariances_.append(cov_)
                precisions_.append(prec_)
                hiddens_.append(hid_)
                if X_test is not None:
                    this_score = score_func(test_emp_cov, prec_)
            except FloatingPointError:
                this_score = -np.inf
                covariances_.append(np.nan)
                precisions_.append(np.nan)
                hiddens_.append(np.nan)  # keep the three lists aligned
            if X_test is not None:
                if not np.isfinite(this_score):
                    this_score = -np.inf
                scores_.append(this_score)
            if verbose:
                if X_test is not None:
                    print('[graphical_lasso_path] eta: %.2e, mu: %.2e, '
                          'score: %.2e' % (eta, mu, this_score))
                else:
                    print('[graphical_lasso_path] eta: %.2e, mu: %.2e'
                          % (eta, mu))

    if X_test is not None:
        return covariances_, precisions_, hiddens_, scores_
    return covariances_, precisions_, hiddens_
def test_covariance():
    """Tests Covariance module on a simple dataset."""
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    emp_cov = empirical_covariance(X)
    assert_array_almost_equal(emp_cov, cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(emp_cov), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm='spectral'), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm='frobenius'), 0)
    assert_almost_equal(cov.error_norm(emp_cov, scaling=False), 0)
    assert_almost_equal(cov.error_norm(emp_cov, squared=False), 0)
    assert_raises(NotImplementedError, cov.error_norm, emp_cov, norm='foo')

    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    print(np.amin(mahal_dist), np.amax(mahal_dist))
    assert np.amin(mahal_dist) > 0

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0)

    # test with one sample
    # FIXME I don't know what this test does
    X_1sample = np.arange(5)
    cov = EmpiricalCovariance()
    assert_warns(UserWarning, cov.fit, X_1sample)
    assert_array_almost_equal(cov.covariance_,
                              np.zeros(shape=(5, 5), dtype=np.float64))

    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)

    # test centered case
    cov = EmpiricalCovariance(assume_centered=True)
    cov.fit(X)
    assert_array_equal(cov.location_, np.zeros(X.shape[1]))
def test_covariance():
    """Tests Covariance module on a simple dataset."""
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    emp_cov = empirical_covariance(X)
    assert_array_almost_equal(emp_cov, cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(emp_cov), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm='spectral'), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm='frobenius'), 0)
    assert_almost_equal(cov.error_norm(emp_cov, scaling=False), 0)
    assert_almost_equal(cov.error_norm(emp_cov, squared=False), 0)
    assert_raises(NotImplementedError, cov.error_norm, emp_cov, norm='foo')

    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    print(np.amin(mahal_dist), np.amax(mahal_dist))
    assert np.amin(mahal_dist) > 0

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0)

    # test with one sample
    X_1sample = np.arange(5)
    cov = EmpiricalCovariance()
    with warnings.catch_warnings(record=True):
        cov.fit(X_1sample)

    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)

    # test centered case
    cov = EmpiricalCovariance(assume_centered=True)
    cov.fit(X)
    assert_equal(cov.location_, np.zeros(X.shape[1]))
def test_covariance():
    # Tests Covariance module on a simple dataset.
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    emp_cov = empirical_covariance(X)
    assert_array_almost_equal(emp_cov, cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(emp_cov), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm='spectral'), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm='frobenius'), 0)
    assert_almost_equal(cov.error_norm(emp_cov, scaling=False), 0)
    assert_almost_equal(cov.error_norm(emp_cov, squared=False), 0)
    with pytest.raises(NotImplementedError):
        cov.error_norm(emp_cov, norm='foo')

    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    assert np.amin(mahal_dist) > 0

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0)

    # test with one sample
    # Create X with 1 sample and 5 features
    X_1sample = np.arange(5).reshape(1, 5)
    cov = EmpiricalCovariance()
    warn_msg = (
        "Only one sample available. You may want to reshape your data array")
    with pytest.warns(UserWarning, match=warn_msg):
        cov.fit(X_1sample)
    assert_array_almost_equal(cov.covariance_,
                              np.zeros(shape=(5, 5), dtype=np.float64))

    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)

    # test centered case
    cov = EmpiricalCovariance(assume_centered=True)
    cov.fit(X)
    assert_array_equal(cov.location_, np.zeros(X.shape[1]))
def LinRegErr(A, B, iteractions=50):
    # returns the errors of the slope and of the intercept
    n = len(A)
    meanA = np.mean(A)
    meanB = np.mean(B)
    Iteractions = iteractions
    cont = 0
    Slps = []
    Ints = []
    Covs = []
    while cont < Iteractions:
        # draw a sample of size 'n' from the estimated population distribution
        Sample = mvn.rvs(mean=[meanA, meanB], cov=np.cov(A, B), size=n)
        # fit the straight line
        popt, pcov = curve_fit(linf, Sample[:, 0], Sample[:, 1])
        Slope = popt[0]
        Intercept = popt[1]
        Sample_cov = empirical_covariance(Sample)[0, 1]
        Slps = np.append(Slps, Slope)
        Ints = np.append(Ints, Intercept)
        Covs = np.append(Covs, Sample_cov)
        print(cont)
        cont += 1
    # average the per-sample covariances accumulated in Covs
    return (np.std(Slps), np.std(Ints), np.mean(Covs))
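
# --- Usage sketch (illustrative) ---
# LinRegErr expects the globals `np`, `mvn` (a scipy.stats multivariate
# normal), `curve_fit`, `linf` (a straight-line model) and
# `empirical_covariance` to be in scope; plausible definitions are supplied
# here purely for illustration.
import numpy as np
from scipy.optimize import curve_fit
from scipy.stats import multivariate_normal as mvn
from sklearn.covariance import empirical_covariance

def linf(x, a, b):
    # straight-line model fitted by curve_fit
    return a * x + b

rng = np.random.RandomState(0)
A = rng.randn(100)
B = 2.0 * A + 0.1 * rng.randn(100)

slope_err, intercept_err, mean_cov = LinRegErr(A, B, iteractions=20)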
def get_cov(data):
    dat = data.training_data_all_ways + data.testing_data_all_ways
    num_ways = len(data.get_list_of_ways())
    # map each WAY to a matrix index
    m = {}
    i = 0
    for way in data.get_list_of_ways():
        m[way] = i
        i += 1
    # count occurrences and co-occurrences of WAYS
    mat = np.zeros((num_ways, num_ways))
    for elem in dat:
        ways = elem[1]
        for way in ways:
            mat[m[way], m[way]] = mat[m[way], m[way]] + 1
        for w1 in ways:
            for w2 in ways:
                if w1 == w2:
                    continue
                mat[m[w1], m[w2]] = mat[m[w1], m[w2]] + 1
    print(mat)
    emp_cov = empirical_covariance(mat)
    print(emp_cov)
    # normalize the covariance into a correlation matrix
    corr = np.zeros((num_ways, num_ways))
    for i in range(num_ways):
        for j in range(num_ways):
            corr[i, j] = emp_cov[i, j] / (
                math.sqrt(emp_cov[i, i]) * math.sqrt(emp_cov[j, j]))
    print(corr)
    sns.heatmap(corr, vmin=-1, vmax=1, square=True,
                xticklabels=list(m.keys()), yticklabels=list(m.keys()))
    sns.plt.title("Covariance of WAYS frequencies")
    sns.plt.show()
def launch_mcd_on_dataset(n_samples, n_features, n_outliers, tol_loc, tol_cov,
                          tol_support):
    rand_gen = np.random.RandomState(0)
    data = rand_gen.randn(n_samples, n_features)
    # add some outliers
    outliers_index = rand_gen.permutation(n_samples)[:n_outliers]
    outliers_offset = 10. * \
        (rand_gen.randint(2, size=(n_outliers, n_features)) - 0.5)
    data[outliers_index] += outliers_offset
    inliers_mask = np.ones(n_samples).astype(bool)
    inliers_mask[outliers_index] = False

    pure_data = data[inliers_mask]
    # compute MCD by fitting an object
    mcd_fit = MinCovDet(random_state=rand_gen).fit(data)
    T = mcd_fit.location_
    S = mcd_fit.covariance_
    H = mcd_fit.support_
    # compare with the estimates learnt from the inliers
    error_location = np.mean((pure_data.mean(0) - T) ** 2)
    assert error_location < tol_loc
    error_cov = np.mean((empirical_covariance(pure_data) - S) ** 2)
    assert error_cov < tol_cov
    assert np.sum(H) >= tol_support
    assert_array_almost_equal(mcd_fit.mahalanobis(data), mcd_fit.dist_)
def fit(self, X, y=None):
    """Fit the GraphLasso model to X.

    Parameters
    ----------
    X : ndarray, shape (n_time, n_samples, n_features), or
            (n_samples, n_features, n_time)
        Data from which to compute the covariance estimate.
        If shape is (n_samples, n_features, n_time), then set
        `bypass_transpose = False`.
    y : (ignored)
    """
    if not self.bypass_transpose:
        X = X.transpose(2, 0, 1)  # put time as first dimension
    # Covariance does not make sense for a single feature
    # X = check_array(X, allow_nd=True, estimator=self)
    # if X.ndim != 3:
    #     raise ValueError("Found array with dim %d. %s expected <= 2."
    #                      % (X.ndim, self.__class__.__name__))
    X = np.array([check_array(x, ensure_min_features=2,
                              ensure_min_samples=2, estimator=self)
                  for x in X])
    if self.assume_centered:
        self.location_ = np.zeros((X.shape[0], 1, X.shape[2]))
    else:
        self.location_ = X.mean(1).reshape(X.shape[0], 1, X.shape[2])
    self.emp_cov = np.array([empirical_covariance(
        x, assume_centered=self.assume_centered) for x in X])

    self.precision_, self.latent_, self.covariance_, self.n_iter_ = \
        latent_time_graph_lasso(
            self.emp_cov, alpha=self.alpha, tau=self.tau, rho=self.rho,
            beta=self.beta, eta=self.eta, mode=self.mode,
            tol=self.tol, rtol=self.rtol, psi=self.psi, phi=self.phi,
            max_iter=self.max_iter, verbose=self.verbose,
            return_n_iter=True, return_history=False)
    return self
def test_oas():
    """Tests OAS module on a simple dataset."""
    # test shrinkage coeff on a simple data set
    oa = OAS()
    oa.fit(X, assume_centered=True)
    assert_almost_equal(oa.shrinkage_, 0.018740, 4)
    assert_almost_equal(oa.score(X, assume_centered=True), -5.03605, 4)
    # compare shrunk covariance obtained from data and from MLE estimate
    oa_cov_from_mle, oa_shrinkage_from_mle = oas(X, assume_centered=True)
    assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4)
    assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_)
    # compare estimates given by OAS and ShrunkCovariance
    scov = ShrunkCovariance(shrinkage=oa.shrinkage_)
    scov.fit(X, assume_centered=True)
    assert_array_almost_equal(scov.covariance_, oa.covariance_, 4)

    # test with n_features = 1
    oa = OAS()
    oa.fit(X_1d, assume_centered=True)
    oa_cov_from_mle, oa_shrinkage_from_mle = oas(X_1d, assume_centered=True)
    assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4)
    assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_)
    assert_array_almost_equal((X_1d ** 2).sum() / n_samples, oa.covariance_, 4)

    # test shrinkage coeff on a simple data set (without saving precision)
    oa = OAS(store_precision=False)
    oa.fit(X, assume_centered=True)
    assert_almost_equal(oa.score(X, assume_centered=True), -5.03605, 4)
    assert oa.precision_ is None

    ### Same tests without assuming centered data
    # test shrinkage coeff on a simple data set
    oa = OAS()
    oa.fit(X)
    assert_almost_equal(oa.shrinkage_, 0.020236, 4)
    assert_almost_equal(oa.score(X), 2.079025, 4)
    # compare shrunk covariance obtained from data and from MLE estimate
    oa_cov_from_mle, oa_shrinkage_from_mle = oas(X)
    assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4)
    assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_)
    # compare estimates given by OAS and ShrunkCovariance
    scov = ShrunkCovariance(shrinkage=oa.shrinkage_)
    scov.fit(X)
    assert_array_almost_equal(scov.covariance_, oa.covariance_, 4)

    # test with n_features = 1
    oa = OAS()
    oa.fit(X_1d)
    oa_cov_from_mle, oa_shrinkage_from_mle = oas(X_1d)
    assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4)
    assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_)
    assert_array_almost_equal(empirical_covariance(X_1d), oa.covariance_, 4)

    # test shrinkage coeff on a simple data set (without saving precision)
    oa = OAS(store_precision=False)
    oa.fit(X)
    assert_almost_equal(oa.score(X), 2.079025, 4)
    assert oa.precision_ is None
def construct_motion_gaussian_models(char_dict):
    # fit a 2D gaussian model for each key
    # plot the mean and variance ellipse on the keyboard layout
    # img = mpimg.imread('keyboard_screen_shot.jpg')
    # imgplot = plt.imshow(img)

    # convert data to numpy
    char_list = [
        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
        'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ' '
    ]
    # create a gaussian model for every character
    model_dict = {}
    scaler = MinMaxScaler()
    for char in char_list:
        if char in char_dict.keys():
            # print(char)
            # print(np.array(char_dict[char]).T.shape)
            dim = np.array(char_dict[char]).shape
            X = np.array(char_dict[char])
            scaler.fit(X)
            # X_t = scaler.transform(X)
            X_t = X
            dim = X_t.shape
            mu = np.mean(X_t, axis=0)
            # print(np.array(char_dict[char].shape))
            if dim[0] > 1:
                # cannot use np.cov when there is only one data point
                # cov = np.cov(np.array(char_dict[char]).T)
                cov = empirical_covariance(X_t)
                # print(cov)
            else:
                cov = np.zeros((dim[1], dim[1]))
            # save the model parameters in a dictionary
            model_dict[char] = {}
            model_dict[char]['mean'] = mu
            # model_dict[char]['cov'] = np.diag(np.diag(cov))
            model_dict[char]['cov'] = cov
            sigma_det = np.linalg.det(cov)
            # by definition sigma_det should not be zero,
            # as the covariance matrix is positive semi-definite
            if sigma_det <= 0:
                print(char + " : " + str(sigma_det))
                print(X_t.shape)
                print(cov)
                # print(cov.shape)
            # print("Generated plot for " + char)
            # plt.scatter(x, y)
            # plt.savefig(posture + '_training_data_pattern/' + posture +
            #             '_2D_Gauss.png', dpi=200)
    return model_dict
def fit(self, X, y):
    """Fit the TimeGraphicalLasso model to X.

    Parameters
    ----------
    X : ndarray, shape = (n_samples * n_times, n_dimensions)
        Data matrix.
    y : ndarray, shape = (n_times,)
        Indicate the temporal belonging of each sample.
    """
    # Covariance does not make sense for a single feature
    X, y = check_X_y(X, y, accept_sparse=False, dtype=np.float64,
                     order="C", ensure_min_features=2, estimator=self)

    n_dimensions = X.shape[1]
    self.classes_, n_samples = np.unique(y, return_counts=True)
    n_times = self.classes_.size

    # n_samples = np.array([x.shape[0] for x in X])
    if self.assume_centered:
        self.location_ = np.zeros((n_times, n_dimensions))
    else:
        self.location_ = np.array(
            [X[y == cl].mean(0) for cl in self.classes_])
    emp_cov = np.array(
        [empirical_covariance(X[y == cl],
                              assume_centered=self.assume_centered)
         for cl in self.classes_])

    return self._fit(emp_cov, n_samples)
def score(self, X_test, y=None):
    """Computes the log-likelihood of a Gaussian data set with
    `self.covariance_` as an estimator of its covariance matrix.

    Parameters
    ----------
    X_test : length-2 list of array-like of shape (n_samples1, n_features1)
        and (n_samples2, n_features2)
        Test data of which we compute the likelihood, where n_samples is
        the number of samples and n_features is the number of features.
        X_test is assumed to be drawn from the same distribution as the
        data used in fit (including centering). The number of features
        must correspond.

    y : not used, present for API consistency purpose.

    Returns
    -------
    res : float
        The likelihood of the data set with `self.covariance_` as an
        estimator of its covariance matrix.
    """
    if self.covariance_ is None:
        raise ValueError("The estimator is not fit on training data.")
    check_data_dimensions(X_test, layers=2)
    # compute empirical covariance of the test set
    test_cov = empirical_covariance(X_test - self.location_,
                                    assume_centered=True)
    res = log_likelihood(test_cov, self.get_precision())
    return res
def fit(self, X, y=None):
    """Fit the GraphLasso model to X.

    Parameters
    ----------
    X : array-like shape (n_samples, n_features)
        Data from which to compute the covariance estimate.
    y : (ignored)
    """
    self.random_state = check_random_state(self.random_state)
    # check_data_dimensions(X, layers=2)
    X = check_array(X, ensure_min_features=2, ensure_min_samples=2,
                    estimator=self)
    self.X_train = X
    if self.assume_centered:
        self.location_ = np.zeros((X.shape[0], X.shape[1]))
    else:
        self.location_ = X.mean(0)

    emp_cov = empirical_covariance(X, assume_centered=self.assume_centered)
    self.precision_, self.hidden_, \
        self.observed_, self.emp_cov, \
        self.n_iter_ = two_layers_fixed_links_GL(
            emp_cov, self.L, eta=self.eta, mu=self.mu,
            rho=self.rho, tol=self.tol, rtol=self.rtol,
            max_iter=self.max_iter, verbose=self.verbose,
            return_n_iter=True, return_history=False,
            compute_objective=self.compute_objective)
    return self
def test_graphical_lasso_iris_singular():
    # Small subset of rows to test the rank-deficient case
    # Need to choose samples such that none of the variances are zero
    indices = np.arange(10, 13)

    # Hard-coded solution from R glasso package for alpha=0.01
    cov_R = np.array([
        [0.08, 0.056666662595, 0.00229729713223, 0.00153153142149],
        [0.056666662595, 0.082222222222, 0.00333333333333, 0.00222222222222],
        [0.002297297132, 0.003333333333, 0.00666666666667, 0.00009009009009],
        [0.001531531421, 0.002222222222, 0.00009009009009, 0.00222222222222],
    ])
    icov_R = np.array([
        [24.42244057, -16.831679593, 0.0, 0.0],
        [-16.83168201, 24.351841681, -6.206896552, -12.5],
        [0.0, -6.206896171, 153.103448276, 0.0],
        [0.0, -12.499999143, 0.0, 462.5],
    ])
    X = datasets.load_iris().data[indices, :]
    emp_cov = empirical_covariance(X)
    for method in ("cd", "lars"):
        cov, icov = graphical_lasso(emp_cov, alpha=0.01, return_costs=False,
                                    mode=method)
        assert_array_almost_equal(cov, cov_R, decimal=5)
        assert_array_almost_equal(icov, icov_R, decimal=5)
def test_graph_lasso(random_state=0):
    # Sample data from a sparse multivariate normal
    dim = 20
    n_samples = 100
    random_state = check_random_state(random_state)
    prec = make_sparse_spd_matrix(dim, alpha=.95, random_state=random_state)
    cov = linalg.inv(prec)
    X = random_state.multivariate_normal(np.zeros(dim), cov, size=n_samples)
    emp_cov = empirical_covariance(X)
    for alpha in (.1, .01):
        covs = dict()
        for method in ('cd', 'lars'):
            cov_, _, costs = graph_lasso(emp_cov, alpha=alpha, mode=method,
                                         return_costs=True)
            covs[method] = cov_
            costs, dual_gap = np.array(costs).T
            # Check that the costs always decrease
            assert_array_less(np.diff(costs), 0)
        # Check that the 2 approaches give similar results
        assert_array_almost_equal(covs['cd'], covs['lars'])

    # Smoke test the estimator (alpha matches the last value in the loop)
    model = GraphLasso(alpha=.01).fit(X)
    assert_array_almost_equal(model.covariance_, covs['cd'])
def test_graphical_lasso_iris():
    # Hard-coded solution from R glasso package for alpha=1.0
    # (need to set penalize.diagonal to FALSE)
    cov_R = np.array([
        [0.68112222, 0.0000000, 0.265820, 0.02464314],
        [0.00000000, 0.1887129, 0.000000, 0.00000000],
        [0.26582000, 0.0000000, 3.095503, 0.28697200],
        [0.02464314, 0.0000000, 0.286972, 0.57713289],
    ])
    icov_R = np.array([
        [1.5190747, 0.000000, -0.1304475, 0.0000000],
        [0.0000000, 5.299055, 0.0000000, 0.0000000],
        [-0.1304475, 0.000000, 0.3498624, -0.1683946],
        [0.0000000, 0.000000, -0.1683946, 1.8164353],
    ])
    X = datasets.load_iris().data
    emp_cov = empirical_covariance(X)
    for method in ("cd", "lars"):
        cov, icov = graphical_lasso(emp_cov, alpha=1.0, return_costs=False,
                                    mode=method)
        assert_array_almost_equal(cov, cov_R)
        assert_array_almost_equal(icov, icov_R)
def get_moment(samples, name):
    if name == 'covariance':
        return empirical_covariance(samples)[0, 1]
    elif name == 'var':
        return np.square(np.std(samples, axis=0))
    else:
        raise ValueError("unknown name: {}".format(name))
def pca_it(self, spec, recipes, process_recipe):
    MB_SIZE = 10
    process_func = ProcessFunc(process_recipe, spec)
    output_director = MinibatchOutputDirector2(
        MB_SIZE,
        x_shape=(spec['target_channels'], spec['target_h'], spec['target_w']),
        y_shape=(self.Y_SHAPE,))

    iterator = create_standard_iterator(
        process_func, recipes, output_director,
        pool_size=6, buffer_size=40, chunk_size=MB_SIZE * 3)

    print('computing eigenvalues ...')
    X = np.concatenate(
        [batch['mb_x'][0, ...].reshape((3, -1)).T for batch in iterator])
    n = X.shape[0]
    limit = 125829120
    if n > limit:
        # subsample rows (with replacement) to keep the problem tractable
        X = X[np.random.randint(n, size=limit), :]
    print(X.shape)
    cov = empirical_covariance(X)
    print(cov)
    evs, U = eigh(cov)
    print(evs)
    print(U)
    return evs, U
def fit(self, X, y=None):
    """Fits the GraphLasso model to X.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Data from which to compute the covariance estimate.
    y : (ignored)
    """
    # Covariance does not make sense for a single feature
    X = check_array(X, ensure_min_features=2, ensure_min_samples=2,
                    estimator=self)
    if self.assume_centered:
        self.location_ = np.zeros(X.shape[1])
    else:
        self.location_ = X.mean(0)
    emp_cov = empirical_covariance(X, assume_centered=self.assume_centered)
    self.precision_, self.covariance_, self.n_iter_ = graph_lasso(
        emp_cov, alpha=self.alpha, tol=self.tol, rtol=self.rtol,
        max_iter=self.max_iter, over_relax=self.over_relax, rho=self.rho,
        verbose=self.verbose, return_n_iter=True, return_history=False,
        mode=self.mode, update_rho_options=self.update_rho_options,
        compute_objective=self.compute_objective)
    return self
def friedman_results(data_grid, K, K_obs, ells, alpha):
    from rpy2.robjects.packages import importr
    glasso = importr('glasso').glasso

    tic = time.time()
    iters = []
    precisions = []
    for d in data_grid.transpose(2, 0, 1):
        emp_cov = empirical_covariance(d)
        out = glasso(emp_cov, alpha)
        iters.append(int(out[-1][0]))
        precisions.append(np.array(out[1]))
    tac = time.time()
    iterations = np.max(iters)
    precisions = np.array(precisions)

    F1score = utils.structure_error(K, precisions)['f1']
    MSE_observed = None
    MSE_precision = utils.error_norm(K, precisions, upper_triangular=True)
    MSE_latent = None
    mean_rank_error = None

    res = dict(
        n_dim_obs=K.shape[1],
        time=tac - tic,
        iterations=iterations,
        F1score=F1score,
        MSE_precision=MSE_precision,
        MSE_observed=MSE_observed,
        MSE_latent=MSE_latent,
        mean_rank_error=mean_rank_error,
        likelihood=likelihood_score(data_grid.transpose(2, 0, 1), precisions),
        note=None,
        estimator=None)
    return res
def chandresekeran_results(data_grid, K, K_obs, ells, tau, alpha, **kwargs):
    emp_cov = np.array([
        empirical_covariance(x, assume_centered=True)
        for x in data_grid.transpose(2, 0, 1)
    ]).transpose(1, 2, 0)

    rho = 1. / np.sqrt(data_grid.shape[0])

    result = lvglasso(emp_cov, alpha, tau, rho)
    ma_output = Bunch(**result)

    R = np.array(ma_output.R).T
    S = np.array(ma_output.S).T
    L = np.array(ma_output.L).T

    ss = utils.structure_error(K, S)
    MSE_observed = utils.error_norm(K_obs, R)
    MSE_precision = utils.error_norm(K, S, upper_triangular=True)
    MSE_latent = utils.error_norm(ells, L)
    mean_rank_error = utils.error_rank(ells, L)

    res = dict(
        n_dim_obs=K.shape[1],
        time=ma_output.elapsed_time,
        iterations=np.max(ma_output.iter),
        MSE_precision=MSE_precision,
        MSE_observed=MSE_observed,
        MSE_latent=MSE_latent,
        mean_rank_error=mean_rank_error,
        note=None,
        estimator=ma_output,
        likelihood=likelihood_score(data_grid.transpose(2, 0, 1), R),
        latent=L)
    res = dict(res, **ss)
    return res
def GWishartFit(X, G, GWprior, mode='covsel'):
    """Fit G-Wishart distribution."""
    n_samples, n_dim = X.shape

    d0 = GWprior.d0
    S0 = GWprior.S0

    # check prior size violations
    if G.shape[0] != n_dim or G.shape[1] != n_dim:
        raise ValueError('G must be p-by-p, with p dimensions X')
    if S0.shape[0] != n_dim or S0.shape[1] != n_dim:
        raise ValueError('GWprior.S0 must be p-by-p, with p dimensions X')

    # compute posterior scatter matrix
    dn = n_samples + d0

    # X'*X - but X is not assumed to be centered
    emp_cov = empirical_covariance(X)
    S = n_samples * emp_cov
    C = (S + S0) / (dn - 2)

    if mode == 'covsel':
        precision = precision_selection(G, n_dim, C)
    else:
        # use graph_lasso: convert G to alpha
        alpha = np.zeros_like(G, dtype=float)
        alpha[~(G + G.T)] = np.inf
        precision = graphical_lasso(emp_cov=C, alpha=alpha)[0]
    return precision, S
def save(X, Y, **kwargs):
    """
    Parameters
    ----------
    X : array-like, shape = [N, D]
        Training data, where N is the number of samples and
        D is the number of features.

    Y : array-like, shape = [N]
        Response variable, where N is the number of samples.

    Argument dictionary should contain:
    kwargs = {
        'd' : intrinsic dimension (int)
        'n_levelsets' : number of slices to use (int)
        'split_by' : 'dyadic' (dyadic decomposition) or 'stateq'
                     (statistically equivalent blocks) (default: 'dyadic')
        'return_mat' : Boolean whether the key SAVE matrix should be
                       returned (defaults to False).
    }

    Returns
    -------
    proj_vecs : array-like, shape = [n_features, d]
        Orthonormal system spanning the sufficient dimension subspace,
        where d refers to the intrinsic dimension.

    M : SAVE matrix, only if return_mat option is True
    """
    # Extract arguments from dictionary
    d = kwargs['d']
    n_levelsets = kwargs['n_levelsets']
    split_by = kwargs.get('split_by', 'dyadic')
    return_mat = kwargs.get('return_mat', False)

    N, D = X.shape
    # Standardize X
    Z, cov_all_sqrtinv = whiten_data(X)
    # Create partition
    labels = split(Y, n_levelsets, split_by)
    M = np.zeros((D, D))  # Container for the key SAVE matrix
    # Compute SAVE matrix
    empirical_probabilities = np.zeros(n_levelsets)
    for i in range(n_levelsets):
        empirical_probabilities[i] = float(
            len(np.where(labels == i)[0])) / float(N)
        if empirical_probabilities[i] == 0:
            continue
        cov_sub = empirical_covariance(
            Z[labels == i, :])  # Covariance of the samples in this slice
        M += empirical_probabilities[i] * \
            (np.eye(D) - cov_sub).dot(np.eye(D) - cov_sub)
    U, S, V = np.linalg.svd(M)
    # Apply inverse transformation
    vecs = cov_all_sqrtinv.dot(U[:, :d])
    proj_vecs, dummy = np.linalg.qr(vecs)
    if return_mat:
        return proj_vecs, M
    return proj_vecs
def kfold_cv(X, K=10, isotonic=True):
    """K-fold cross-validated eigenvalues for LW nonlinear shrinkage"""
    S = empirical_covariance(X)
    lam, U = np.linalg.eigh(S)
    d = _nls_cv(X, S, K)
    if isotonic:
        d = isotonic_regression(d, increasing=True)
    return U @ np.diag(d) @ U.T
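
# --- Usage sketch (illustrative) ---
# kfold_cv depends on a module-private helper `_nls_cv` (not shown here) and
# on sklearn's empirical_covariance / isotonic_regression; assuming those are
# importable from the same module, a call looks like this. Data is synthetic.
import numpy as np

rng = np.random.RandomState(42)
X_demo = rng.randn(200, 10)  # 200 samples, 10 features

Sigma = kfold_cv(X_demo, K=5)       # nonlinearly shrunk covariance estimate
assert np.allclose(Sigma, Sigma.T)  # the estimate stays symmetric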
def fit(self, X, alpha):
    self.alpha = alpha
    emp_cov = empirical_covariance(X)
    self.covariance_, self.precision_ = graph_lasso(
        emp_cov, alpha=self.alpha, tol=self.tol, max_iter=self.max_iter)
    return self.covariance_, self.precision_
def empirical_covariances(subjects, assume_centered=False, standardize=False):
    """Compute empirical covariances for several signals.

    Parameters
    ----------
    subjects : list of numpy.ndarray, shape for each (n_samples, n_features)
        input subjects. Each subject is a 2D array, whose columns contain
        signals. Sample number can vary from subject to subject, but all
        subjects must have the same number of features (i.e. of columns).

    assume_centered : bool, optional
        if True, assume that all input signals are centered. This slightly
        decreases computation time by avoiding useless computation.
        Default=False.

    standardize : bool, optional
        if True, set every signal variance to one before computing their
        covariance matrix (i.e. compute a correlation matrix).
        Default=False.

    Returns
    -------
    emp_covs : numpy.ndarray, shape : (feature number, feature number,
        subject number)
        empirical covariances.

    n_samples : numpy.ndarray, shape : (subject number,)
        number of samples for each subject. dtype is np.float64.
    """
    if not hasattr(subjects, "__iter__"):
        raise ValueError("'subjects' input argument must be an iterable. "
                         "You provided {0}".format(subjects.__class__))

    n_subjects = [s.shape[1] for s in subjects]
    if len(set(n_subjects)) > 1:
        raise ValueError("All subjects must have the same number of "
                         "features.\nYou provided: {0}".format(
                             str(n_subjects)))
    n_subjects = len(subjects)
    n_features = subjects[0].shape[1]

    # Enable to change dtype here because depending on user, conversion from
    # single precision to double will be required or not.
    emp_covs = np.empty((n_features, n_features, n_subjects), order="F")
    for k, s in enumerate(subjects):
        if standardize:
            s = s / s.std(axis=0)  # copy on purpose
        M = empirical_covariance(s, assume_centered=assume_centered)

        # Force matrix symmetry, for numerical stability
        # of _group_sparse_covariance
        emp_covs[..., k] = M + M.T
    emp_covs /= 2

    n_samples = np.asarray([s.shape[0] for s in subjects], dtype=np.float64)

    return emp_covs, n_samples
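
# --- Usage sketch (illustrative) ---
# Two synthetic "subjects" with the same number of features but different
# numbers of samples, as the docstring above allows; assumes numpy and
# sklearn's empirical_covariance are imported as in the function's module.
import numpy as np

rng = np.random.RandomState(0)
subjects = [rng.randn(30, 4), rng.randn(50, 4)]

emp_covs, n_samples = empirical_covariances(subjects, standardize=True)
print(emp_covs.shape)  # (4, 4, 2): one symmetric matrix per subject
print(n_samples)       # [30. 50.]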
def likelihood_score(X, precision_):
    # compute empirical covariance of the test set
    location_ = X.mean(1).reshape(X.shape[0], 1, X.shape[2])
    test_cov = np.array(
        [empirical_covariance(x, assume_centered=True)
         for x in X - location_])
    res = sum(log_likelihood(S, K) for S, K in zip(test_cov, precision_))
    return res
def covariances():
    subject_to_means, subject_to_values = load_data(TRAINING_DATA_FILENAME,
                                                    True)
    subject_to_covariance = {}
    full_matrix = None
    for key in subject_to_values.keys():
        if full_matrix is None:
            full_matrix = subject_to_values[key]
            subject_to_covariance[key] = empirical_covariance(
                subject_to_values[key])
            print(subject_to_means[key])
            print(subject_to_covariance[key])
        else:
            full_matrix = np.append(full_matrix, subject_to_values[key],
                                    axis=0)
            subject_to_covariance[key] = empirical_covariance(
                subject_to_values[key])
    full_mean = full_matrix.mean(axis=0)
    full_covariance = empirical_covariance(full_matrix)
    print(full_mean)
    print(full_covariance)
    return subject_to_covariance, full_covariance, full_mean
def objective_function(self, data, location, covariance):
    """Objective function minimized at each step of the MCD algorithm."""
    precision = pinvh(covariance)
    det = fast_logdet(precision)
    trace = np.trace(
        np.dot(empirical_covariance(data - location, assume_centered=True),
               precision))
    pen = self.shrinkage * np.trace(precision)
    return -det + trace + pen
def test_covariance():
    """Tests Covariance module on a simple dataset."""
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    emp_cov = empirical_covariance(X)
    assert_array_almost_equal(emp_cov, cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(emp_cov), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm="spectral"), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm="frobenius"), 0)
    assert_almost_equal(cov.error_norm(emp_cov, scaling=False), 0)
    assert_almost_equal(cov.error_norm(emp_cov, squared=False), 0)
    assert_raises(NotImplementedError, cov.error_norm, emp_cov, norm="foo")

    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    print(np.amin(mahal_dist), np.amax(mahal_dist))
    assert np.amin(mahal_dist) > 0

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X_1d), norm="spectral"), 0)

    # test with one sample
    X_1sample = np.arange(5)
    cov = EmpiricalCovariance()
    with warnings.catch_warnings(record=True):
        cov.fit(X_1sample)

    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)

    # test centered case
    cov = EmpiricalCovariance(assume_centered=True)
    cov.fit(X)
    assert_array_equal(cov.location_, np.zeros(X.shape[1]))
def test_empirical_covariance(self):
    iris = datasets.load_iris()
    df = pdml.ModelFrame(iris)

    result = df.covariance.empirical_covariance()
    expected = covariance.empirical_covariance(iris.data)

    self.assertIsInstance(result, pdml.ModelFrame)
    tm.assert_index_equal(result.index, df.data.columns)
    tm.assert_index_equal(result.columns, df.data.columns)
    self.assert_numpy_array_almost_equal(result.values, expected)
def test_empirical_covariance(self):
    iris = datasets.load_iris()
    df = pdml.ModelFrame(iris)

    result = df.covariance.empirical_covariance()
    expected = covariance.empirical_covariance(iris.data)

    self.assertTrue(isinstance(result, pdml.ModelFrame))
    self.assert_index_equal(result.index, df.data.columns)
    self.assert_index_equal(result.columns, df.data.columns)
    self.assert_numpy_array_almost_equal(result.values, expected)
def empirical_covariances(subjects, assume_centered=False, standardize=False):
    """Compute empirical covariances for several signals.

    Parameters
    ----------
    subjects : list of numpy.ndarray, shape for each (n_samples, n_features)
        input subjects. Each subject is a 2D array, whose columns contain
        signals. Sample number can vary from subject to subject, but all
        subjects must have the same number of features (i.e. of columns).

    assume_centered : bool, optional
        if True, assume that all input signals are centered. This slightly
        decreases computation time by avoiding useless computation.

    standardize : bool, optional
        if True, set every signal variance to one before computing their
        covariance matrix (i.e. compute a correlation matrix).

    Returns
    -------
    emp_covs : numpy.ndarray, shape : (feature number, feature number,
        subject number)
        empirical covariances.

    n_samples : numpy.ndarray, shape : (subject number,)
        number of samples for each subject. dtype is np.float64.
    """
    if not hasattr(subjects, "__iter__"):
        raise ValueError("'subjects' input argument must be an iterable. "
                         "You provided {0}".format(subjects.__class__))

    n_subjects = [s.shape[1] for s in subjects]
    if len(set(n_subjects)) > 1:
        raise ValueError("All subjects must have the same number of "
                         "features.\nYou provided: {0}".format(
                             str(n_subjects)))
    n_subjects = len(subjects)
    n_features = subjects[0].shape[1]

    # Enable to change dtype here because depending on user, conversion from
    # single precision to double will be required or not.
    emp_covs = np.empty((n_features, n_features, n_subjects), order="F")
    for k, s in enumerate(subjects):
        if standardize:
            s = s / s.std(axis=0)  # copy on purpose
        M = empirical_covariance(s, assume_centered=assume_centered)

        # Force matrix symmetry, for numerical stability
        # of _group_sparse_covariance
        emp_covs[..., k] = M + M.T
    emp_covs /= 2

    n_samples = np.asarray([s.shape[0] for s in subjects], dtype=np.float64)

    return emp_covs, n_samples
def feat_select(f):
    cp = load(read)
    (X, y, t) = cp.export_data(f)
    data = numpy.c_[X, y]
    cov = empirical_covariance(data, False)
    print(cov)
    # print the covariance of each feature with the target (last column)
    for i in range(cov.shape[0] - 1):
        print(cov[i, -1])
def test_graph_lasso_2D():
    # Hard-coded solution from Python skggm package
    # obtained by calling `quic(emp_cov, lam=.1, tol=1e-8)`
    cov_skggm = np.array([[3.09550269, 1.186972],
                          [1.186972, 0.57713289]])
    icov_skggm = np.array([[1.52836773, -3.14334831],
                           [-3.14334831, 8.19753385]])
    X = datasets.load_iris().data[:, 2:]
    emp_cov = empirical_covariance(X)
    for method in ('cd', 'lars'):
        cov, icov = graphical_lasso(emp_cov, alpha=.1, return_costs=False,
                                    mode=method)
        assert_array_almost_equal(cov, cov_skggm)
        assert_array_almost_equal(icov, icov_skggm)
def train(self, use_entropy=False):
    """
    Train the classifier for all the models that it knows.
    """
    if len(self.dict_categories) < 2:
        print("At least two categories are needed for training...")
        print("Training is skipped.")
        return
    (X, Y, W) = self._get_example_matrix(use_entropy)
    if (hasattr(self.classifier, 'metric') and
            self.classifier.metric == 'mahalanobis'):
        # The mahalanobis distance needs the covariance of the data
        cov = covariance.empirical_covariance(X)
        self.classifier.metric_kwds['V'] = cov
    print("Training with {} categories and {} views.".format(
        len(self.dict_categories), len(Y)))
    print(self.classifier.fit(X, Y))
def plot_all(X):
    tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)

    # ----------------------------------------------------------------------
    # Pre-processing
    print("t-SNE Scaling")
    X_scaled = preprocessing.scale(X)  # zero mean, unit variance
    X_tsne_scaled = tsne.fit_transform(X_scaled)

    # normalize the data (scaling individual samples to have unit norm)
    print("t-SNE L2 Norm")
    X_normalized = preprocessing.normalize(X, norm='l2')
    X_tsne_norm = tsne.fit_transform(X_normalized)

    # whiten the data
    print("t-SNE Whitening")
    # The mean computed by the scaler is for the feature dimension.
    # We want the normalization to be in the feature dimension.
    # Zero mean for each sample assumes stationarity, which is not
    # necessarily true for CNN features.
    # X: NxD where N is the number of examples and D is the number of features.
    # scaler = preprocessing.StandardScaler(with_std=False).fit(X)
    scaler = preprocessing.StandardScaler().fit(X)  # scales each feature to have std-dev 1
    X_centered = scaler.transform(X)

    # U, s, Vh = linalg.svd(X_centered)
    shapeX = X_centered.shape
    IPython.embed()
    # this is a DxD matrix where D is the feature dimension
    # still to figure out: computation is not a problem, but carrying
    # around a 50kx50k matrix is a memory killer!
    sig = (1. / shapeX[0]) * np.dot(X_centered.T, X_centered)
    sig2 = covariance.empirical_covariance(X_centered, assume_centered=True)  # estimated -- this is better.
    sig3, shrinkage = covariance.oas(X_centered, assume_centered=True)  # estimated
    U, s, Vh = linalg.svd(sig, full_matrices=False)
    eps = 1e-2  # this affects how many low-freq eigenvalues are eliminated
    invS = np.diag(np.reciprocal(np.sqrt(s + eps)))

    # PCA-whiten
    X_pca = np.dot(invS, np.dot(U.T, X_centered))
    X_tsne_pca = tsne.fit_transform(X_pca)
    # whiten the data (ZCA)
    X_zca = np.dot(U, X_pca)
    X_tsne_zca = tsne.fit_transform(X_zca)

    return X_tsne_scaled, X_tsne_norm, X_tsne_pca, X_tsne_zca
def test_graphical_lasso(random_state=0):
    # Sample data from a sparse multivariate normal
    dim = 20
    n_samples = 100
    random_state = check_random_state(random_state)
    prec = make_sparse_spd_matrix(dim, alpha=.95, random_state=random_state)
    cov = linalg.inv(prec)
    X = random_state.multivariate_normal(np.zeros(dim), cov, size=n_samples)
    emp_cov = empirical_covariance(X)

    for alpha in (0., .1, .25):
        covs = dict()
        icovs = dict()
        for method in ('cd', 'lars'):
            cov_, icov_, costs = graphical_lasso(emp_cov, return_costs=True,
                                                 alpha=alpha, mode=method)
            covs[method] = cov_
            icovs[method] = icov_
            costs, dual_gap = np.array(costs).T
            # Check that the costs always decrease (doesn't hold if alpha == 0)
            if not alpha == 0:
                assert_array_less(np.diff(costs), 0)
        # Check that the 2 approaches give similar results
        assert_array_almost_equal(covs['cd'], covs['lars'], decimal=4)
        assert_array_almost_equal(icovs['cd'], icovs['lars'], decimal=4)

    # Smoke test the estimator
    model = GraphicalLasso(alpha=.25).fit(X)
    model.score(X)
    assert_array_almost_equal(model.covariance_, covs['cd'], decimal=4)
    assert_array_almost_equal(model.covariance_, covs['lars'], decimal=4)

    # For a centered matrix, assume_centered could be chosen True or False
    # Check that this returns indeed the same result for centered data
    Z = X - X.mean(0)
    precs = list()
    for assume_centered in (False, True):
        prec_ = GraphicalLasso(
            assume_centered=assume_centered).fit(Z).precision_
        precs.append(prec_)
    assert_array_almost_equal(precs[0], precs[1])
def _naive_ledoit_wolf_shrinkage(X):
    # A simple implementation of the formulas from Ledoit & Wolf.
    # The computation below follows
    # "O. Ledoit and M. Wolf, A Well-Conditioned Estimator for
    # Large-Dimensional Covariance Matrices";
    # beta and delta are given at the beginning of section 3.2.
    n_samples, n_features = X.shape
    emp_cov = empirical_covariance(X, assume_centered=False)
    mu = np.trace(emp_cov) / n_features
    delta_ = emp_cov.copy()
    delta_.flat[::n_features + 1] -= mu
    delta = (delta_ ** 2).sum() / n_features
    X2 = X ** 2
    beta_ = 1. / (n_features * n_samples) \
        * np.sum(np.dot(X2.T, X2) / n_samples - emp_cov ** 2)
    beta = min(beta_, delta)
    shrinkage = beta / delta
    return shrinkage
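
# --- Sanity-check sketch (illustrative) ---
# The naive value can be compared against sklearn's public
# ledoit_wolf_shrinkage; the data below is synthetic and is centered
# explicitly so that both formulations agree.
import numpy as np
from sklearn.covariance import ledoit_wolf_shrinkage

rng = np.random.RandomState(0)
X_demo = rng.randn(80, 6)
X_demo -= X_demo.mean(axis=0)  # center so the two computations match

naive = _naive_ledoit_wolf_shrinkage(X_demo)
reference = ledoit_wolf_shrinkage(X_demo, assume_centered=False)
assert abs(naive - reference) < 1e-7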
def lasso_gsc_comparison():
    """Check that graph lasso and group-sparse covariance give the same
    output for a single task."""
    from sklearn.covariance import graph_lasso, empirical_covariance

    parameters = {'n_tasks': 1, 'n_var': 20, 'density': 0.15,
                  'rho': .2, 'tol': 1e-4, 'max_iter': 50}

    _, _, gt = create_signals(parameters, output_dir=output_dir)
    signals = gt["signals"]

    _, gsc_precision = utils.timeit(group_sparse_covariance)(
        signals, parameters['rho'], max_iter=parameters['max_iter'],
        tol=parameters['tol'], verbose=1, debug=False)

    emp_cov = empirical_covariance(signals[0])
    _, gl_precision = utils.timeit(graph_lasso)(
        emp_cov, parameters['rho'], tol=parameters['tol'],
        max_iter=parameters['max_iter'])

    np.testing.assert_almost_equal(gl_precision, gsc_precision[..., 0],
                                   decimal=4)
def prepareProblem(filePath, shrinkage=False, subset=False, subsetSize=0):
    # Import data from .csv
    df = pd.read_csv(filePath, sep=';')
    df.index = df.date
    df = df.drop('date', axis=1)

    # Subset, if called via subset == True
    if subset == True:
        df = df.tail(subsetSize)

    # Estimate covariance using Empirical/MLE
    # Expected input is returns, hence set: assume_centered = True
    mleFitted = empirical_covariance(X=df, assume_centered=True)
    sigma = mleFitted

    if shrinkage == True:
        # Estimate covariance using LedoitWolf, first create instance of object
        lw = LedoitWolf(assume_centered=True)
        lwFitted = lw.fit(X=df).covariance_
        sigma = lwFitted

    return sigma
def test_graph_lasso_iris():
    # Hard-coded solution from R glasso package for alpha=1.0
    # The iris datasets in R and scikit-learn do not match in a few places,
    # these values are for the scikit-learn version.
    cov_R = np.array([
        [0.68112222, 0.0, 0.2651911, 0.02467558],
        [0.00, 0.1867507, 0.0, 0.00],
        [0.26519111, 0.0, 3.0924249, 0.28774489],
        [0.02467558, 0.0, 0.2877449, 0.57853156]
    ])
    icov_R = np.array([
        [1.5188780, 0.0, -0.1302515, 0.0],
        [0.0, 5.354733, 0.0, 0.0],
        [-0.1302515, 0.0, 0.3502322, -0.1686399],
        [0.0, 0.0, -0.1686399, 1.8123908]
    ])
    X = datasets.load_iris().data
    emp_cov = empirical_covariance(X)
    for method in ('cd', 'lars'):
        cov, icov = graph_lasso(emp_cov, alpha=1.0, return_costs=False,
                                mode=method)
        assert_array_almost_equal(cov, cov_R)
        assert_array_almost_equal(icov, icov_R)
def _nonrobust_covariance(self, data, assume_centered=False):
    """Non-robust estimation of the covariance to be used within MCD.

    Parameters
    ----------
    data: array_like, shape (n_samples, n_features)
        Data for which to compute the non-robust covariance matrix.
    assume_centered: bool
        Whether or not the observations should be considered as centered.

    Returns
    -------
    nonrobust_covariance: array_like, shape (n_features, n_features)
        The non-robust covariance of the data.
    """
    if self.cov_computation_method is None:
        cov = empirical_covariance(data, assume_centered=assume_centered)
        cov.flat[::data.shape[1] + 1] += self.shrinkage
    elif self.cov_computation_method == "diag":
        cov = np.diag(np.var(data, 0)) / self.shrinkage
    else:
        raise NotImplementedError(
            "Unknown covariance computation method: %r"
            % self.cov_computation_method)
    return cov
def set_optimal_shrinkage_amount(self, X, verbose=False):
    """
    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training data, where n_samples is the number of samples
        and n_features is the number of features.

    Returns
    -------
    The cross-validated alphas and scores. The optimal amount of
    shrinkage, chosen with a 10-fold cross-validation (or a Leave-One-Out
    cross-validation if n_samples < 10), is stored in `self.shrinkage`.
    """
    n_samples, n_features = X.shape
    std_shrinkage = np.trace(empirical_covariance(X)) / \
        float(n_samples * n_features)
    # use L2 here? (was done during research work, changed for consistency)
    rmcd = RMCDl1(shrinkage=std_shrinkage).fit(X)
    cov = GraphLassoCV().fit(X[rmcd.raw_support_])
    self.shrinkage = cov.alpha_
    return cov.cv_alphas_, cov.cv_scores
def fit(self, X, Y):
    self.predictions = []
    n = len(Y)
    self.MSEs = []
    self.weights = []
    for reg in self.regList:
        self.predictions.append(cross_val_predict(reg, X, Y, cv=self.cv))
        MSE = sum([(p - a) ** 2. for (p, a) in
                   zip(self.predictions[-1], Y)]) / n
        self.MSEs.append(MSE)
        reg.fit(X, Y)
    if self.weighting == 'uniform':
        self.weights = [1. / len(self.regList)] * len(self.regList)
    elif self.weighting == 'score':
        tot = sum([1. / s for s in self.MSEs])
        self.weights = [1. / (s * tot) for s in self.MSEs]
    elif self.weighting == 'varMin':
        self.covariance = empirical_covariance(np.array(self.predictions).T)
        self.weights = smallestVarianceWeights(self.covariance, self.MSEs,
                                               self.biasWeighting)
    elif self.weighting == 'linearReg':
        self.stacker.fit(np.array(self.predictions).T, Y)
        self.weights = self.stacker.coef_
    print(self.weights)
def launch_mcd_on_dataset(n_samples, n_features, n_outliers):
    rand_gen = np.random.RandomState(0)
    data = rand_gen.randn(n_samples, n_features)
    # add some outliers
    outliers_index = rand_gen.permutation(n_samples)[:n_outliers]
    outliers_offset = 10. * \
        (rand_gen.randint(2, size=(n_outliers, n_features)) - 0.5)
    data[outliers_index] += outliers_offset
    inliers_mask = np.ones(n_samples).astype(bool)
    inliers_mask[outliers_index] = False
    pure_data = data[inliers_mask]

    # compute MCD by fitting an object
    mcd_fit = MCD().fit(data)
    T = mcd_fit.location_
    S = mcd_fit.covariance_
    # compare with the estimates learnt from the inliers
    error_location = np.mean((pure_data.mean(0) - T) ** 2)
    print(error_location)
    assert error_location < 1.
    error_cov = np.mean((empirical_covariance(pure_data) - S) ** 2)
    print(error_cov)
    assert error_cov < 1.
def test_oas():
    """Tests OAS module on a simple dataset."""
    # test shrinkage coeff on a simple data set
    X_centered = X - X.mean(axis=0)
    oa = OAS(assume_centered=True)
    oa.fit(X_centered)
    shrinkage_ = oa.shrinkage_
    score_ = oa.score(X_centered)
    # compare shrunk covariance obtained from data and from MLE estimate
    oa_cov_from_mle, oa_shrinkage_from_mle = oas(X_centered,
                                                 assume_centered=True)
    assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4)
    assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_)
    # compare estimates given by OAS and ShrunkCovariance
    scov = ShrunkCovariance(shrinkage=oa.shrinkage_, assume_centered=True)
    scov.fit(X_centered)
    assert_array_almost_equal(scov.covariance_, oa.covariance_, 4)

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    oa = OAS(assume_centered=True)
    oa.fit(X_1d)
    oa_cov_from_mle, oa_shrinkage_from_mle = oas(X_1d, assume_centered=True)
    assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4)
    assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_)
    assert_array_almost_equal((X_1d ** 2).sum() / n_samples, oa.covariance_, 4)

    # test shrinkage coeff on a simple data set (without saving precision)
    oa = OAS(store_precision=False, assume_centered=True)
    oa.fit(X_centered)
    assert_almost_equal(oa.score(X_centered), score_, 4)
    assert oa.precision_ is None

    ### Same tests without assuming centered data
    # test shrinkage coeff on a simple data set
    oa = OAS()
    oa.fit(X)
    assert_almost_equal(oa.shrinkage_, shrinkage_, 4)
    assert_almost_equal(oa.score(X), score_, 4)
    # compare shrunk covariance obtained from data and from MLE estimate
    oa_cov_from_mle, oa_shrinkage_from_mle = oas(X)
    assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4)
    assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_)
    # compare estimates given by OAS and ShrunkCovariance
    scov = ShrunkCovariance(shrinkage=oa.shrinkage_)
    scov.fit(X)
    assert_array_almost_equal(scov.covariance_, oa.covariance_, 4)

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    oa = OAS()
    oa.fit(X_1d)
    oa_cov_from_mle, oa_shrinkage_from_mle = oas(X_1d)
    assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4)
    assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_)
    assert_array_almost_equal(empirical_covariance(X_1d), oa.covariance_, 4)

    # test with one sample
    X_1sample = np.arange(5)
    oa = OAS()
    with warnings.catch_warnings(record=True):
        oa.fit(X_1sample)

    # test shrinkage coeff on a simple data set (without saving precision)
    oa = OAS(store_precision=False)
    oa.fit(X)
    assert_almost_equal(oa.score(X), score_, 4)
    assert oa.precision_ is None
def test_ledoit_wolf():
    """Tests LedoitWolf module on a simple dataset."""
    # test shrinkage coeff on a simple data set
    lw = LedoitWolf()
    lw.fit(X, assume_centered=True)
    assert_almost_equal(lw.shrinkage_, 0.00192, 4)
    assert_almost_equal(lw.score(X, assume_centered=True), -2.89795, 4)
    # compare shrunk covariance obtained from data and from MLE estimate
    lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(
        X, assume_centered=True)
    assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4)
    assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_)
    # compare estimates given by LW and ShrunkCovariance
    scov = ShrunkCovariance(shrinkage=lw.shrinkage_)
    scov.fit(X, assume_centered=True)
    assert_array_almost_equal(scov.covariance_, lw.covariance_, 4)

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    lw = LedoitWolf()
    lw.fit(X_1d, assume_centered=True)
    lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(
        X_1d, assume_centered=True)
    assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4)
    assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_)
    assert_array_almost_equal((X_1d ** 2).sum() / n_samples, lw.covariance_, 4)

    # test shrinkage coeff on a simple data set (without saving precision)
    lw = LedoitWolf(store_precision=False)
    lw.fit(X, assume_centered=True)
    assert_almost_equal(lw.score(X, assume_centered=True), -2.89795, 4)
    assert lw.precision_ is None

    # Same tests without assuming centered data
    # test shrinkage coeff on a simple data set
    lw = LedoitWolf()
    lw.fit(X)
    assert_almost_equal(lw.shrinkage_, 0.007582, 4)
    assert_almost_equal(lw.score(X), 2.243483, 4)
    # compare shrunk covariance obtained from data and from MLE estimate
    lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X)
    assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4)
    assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_)
    # compare estimates given by LW and ShrunkCovariance
    scov = ShrunkCovariance(shrinkage=lw.shrinkage_)
    scov.fit(X)
    assert_array_almost_equal(scov.covariance_, lw.covariance_, 4)

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    lw = LedoitWolf()
    lw.fit(X_1d)
    lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_1d)
    assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4)
    assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_)
    assert_array_almost_equal(empirical_covariance(X_1d), lw.covariance_, 4)

    # test shrinkage coeff on a simple data set (without saving precision)
    lw = LedoitWolf(store_precision=False)
    lw.fit(X)
    assert_almost_equal(lw.score(X), 2.2434839, 4)
    assert lw.precision_ is None