import numpy as np
from sklearn.covariance import LedoitWolf, OAS


def simulateLogNormal(data, covtype='Estimate', nsamples=2000, **kwargs):
    """
    Simulate data from a multivariate log-normal distribution fitted to `data`.

    :param data: Input data matrix (samples x variables)
    :param covtype: Type of covariance matrix estimator. Allowed types are:
        - Estimate (default): sample covariance
        - Diagonal: variances only (no between-variable correlation)
        - ShrinkageLedoitWolf: Ledoit-Wolf shrinkage estimator
        - ShrinkageOAS: Oracle Approximating Shrinkage estimator
    :param int nsamples: Number of simulated samples to draw
    :return: simulated data and its empirical correlation matrix
    """
    # Offset data so all values are strictly positive before the log transform
    # (the original used np.min(data) + 1, which fails for negative minima)
    offset = 1 - np.min(data)
    offdata = data + offset

    # log of the offset data
    logdata = np.log(offdata)

    # Get the column means
    meanslog = np.mean(logdata, axis=0)

    # Specify covariance
    if covtype == "Estimate":
        # Regular (sample) covariance estimator
        covlog = np.cov(logdata, rowvar=0)
    elif covtype == "ShrinkageLedoitWolf":
        # Shrinkage covariance estimator, using Ledoit-Wolf
        scov = LedoitWolf()
        scov.fit(logdata)
        covlog = scov.covariance_
    elif covtype == "ShrinkageOAS":
        # Shrinkage covariance estimator, using OAS
        scov = OAS()
        scov.fit(logdata)
        covlog = scov.covariance_
    elif covtype == "Diagonal":
        # Diagonal covariance matrix (no between-variable correlation)
        covlogdata = np.var(logdata, axis=0)  # per-column variance of the log data
        covlog = np.diag(covlogdata)
    else:
        raise ValueError('Unknown covariance type')

    simData = np.random.multivariate_normal(meanslog, covlog, nsamples)
    simData = np.exp(simData)
    simData -= offset

    # Clip negative values to 0
    simData[simData < 0] = 0

    # Work out the correlation matrix by columns; each column is a variable
    corrMatrix = np.corrcoef(simData, rowvar=0)

    return simData, corrMatrix
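# A minimal usage sketch for simulateLogNormal above; the toy data, the sample
# count, and the choice of the Ledoit-Wolf estimator are illustrative only.
rng = np.random.RandomState(0)
toy = np.exp(rng.randn(100, 5))  # 100 samples, 5 strictly positive variables

sim, corr = simulateLogNormal(toy, covtype='ShrinkageLedoitWolf', nsamples=500)
print(sim.shape)   # (500, 5)
print(corr.shape)  # (5, 5)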
def test_ledoit_wolf_small(): # Compare our blocked implementation to the naive implementation X_small = X[:, :4] lw = LedoitWolf() lw.fit(X_small) shrinkage_ = lw.shrinkage_ assert_almost_equal(shrinkage_, _naive_ledoit_wolf_shrinkage(X_small))
def maximization(self):
    # mean maximization
    for i in range(self._K):
        mu[i] = mu_ss[i] / ndata_ss

    # covariance maximization
    for i in range(self._K):
        for j in range(self._K):
            cov[i, j] = (1.0 / ndata_ss) * cov_ss[i, j] \
                + ndata_ss * mu[i] * mu[j] \
                - mu_ss[i] * mu[j] - mu_ss[j] * mu[i]

    # covariance shrinkage
    lw = LedoitWolf()
    cov_result = lw.fit(cov, assume_centered=True).covariance_
    inv_cov = np.linalg.inv(cov_result)
    log_det_inv_cov = np.log(np.linalg.det(inv_cov))

    # topic maximization
    for i in range(self._K):
        sum_m = 0
        for j in range(self._W):
            sum_m += beta_ss[i, j]

        if sum_m == 0:
            sum_m = -1000 * self._W
        else:
            sum_m = np.log(sum_m)

        for j in range(self._W):
            # normalize in log space: log beta = log(beta_ss) - log(sum of beta_ss)
            # (the original np.log(beta_ss[i, j] - sum_m) misplaced the parenthesis)
            log_beta[i, j] = np.log(beta_ss[i, j]) - sum_m
from sklearn.covariance import LedoitWolf


def LW_est(X):
    '''
    Ledoit-Wolf shrunk covariance estimate (the shrinkage coefficient is
    chosen by the Ledoit-Wolf optimal formula).
    X shape = (n_samples, n_features)
    '''
    lw = LedoitWolf()
    cov_lw = lw.fit(X).covariance_
    return cov_lw
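# Hedged sketch comparing LW_est to the plain sample covariance; with few
# samples relative to features, the shrunk estimate is typically much better
# conditioned. The toy data below is illustrative only.
import numpy as np

rng = np.random.RandomState(42)
X_demo = rng.randn(30, 20)  # n_samples=30, n_features=20

cov_sample = np.cov(X_demo, rowvar=False)
cov_shrunk = LW_est(X_demo)

# The shrunk matrix usually has a much smaller condition number.
print(np.linalg.cond(cov_sample), np.linalg.cond(cov_shrunk))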
from sklearn.covariance import LedoitWolf


def covarianceEstimation(daily_returns, cov_estimator):
    if cov_estimator == "shrinkage":
        lw = LedoitWolf()
        return lw.fit(daily_returns).covariance_
    elif cov_estimator == "empirical":
        return daily_returns.cov()
    elif cov_estimator == "multifactor":
        # FIXME
        return None
    else:
        raise ValueError(
            "cov_estimator must be one of [shrinkage, empirical, multifactor]")
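# Toy call of covarianceEstimation; the random return series below stand in
# for real daily returns and are assumptions for illustration only.
import numpy as np
import pandas as pd

rng = np.random.RandomState(1)
returns = pd.DataFrame(rng.randn(250, 4) * 0.01,
                       columns=['A', 'B', 'C', 'D'])

cov_shrink = covarianceEstimation(returns, "shrinkage")  # ndarray, (4, 4)
cov_emp = covarianceEstimation(returns, "empirical")     # DataFrame, (4, 4)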
import numpy as np
from scipy import linalg
from sklearn.covariance import EmpiricalCovariance, LedoitWolf
from sklearn.linear_model import LinearRegression


def partial_corrconn(activity_matrix, estimator='EmpiricalCovariance', target_ts=None):
    """
    activity_matrix: Activity matrix should be nodes X time
    target_ts: Optional, used when only a single target time series (returns 1 X nnodes matrix)
    estimator: either 'EmpiricalCovariance' (the default) or 'LedoitWolf'
        (partial correlation with Ledoit-Wolf shrinkage)
    Output: connectivity_mat, formatted targets X sources
    Credit goes to nilearn connectivity_matrices.py, which contains code that was simplified for this use.
    """
    nnodes = activity_matrix.shape[0]
    timepoints = activity_matrix.shape[1]
    if nnodes > timepoints:
        print('activity_matrix shape: ', np.shape(activity_matrix))
        raise Exception(
            'More nodes (regressors) than timepoints! Use regularized regression'
        )
    if 2 * nnodes > timepoints:
        print('activity_matrix shape: ', np.shape(activity_matrix))
        print('Consider using a shrinkage method')

    if target_ts is None:
        # calculate covariance (string comparison uses ==, not 'is')
        if estimator == 'LedoitWolf':
            cov_estimator = LedoitWolf(store_precision=False)
        elif estimator == 'EmpiricalCovariance':
            cov_estimator = EmpiricalCovariance(store_precision=False)
        else:
            raise ValueError('Unknown estimator: %s' % estimator)
        covariance = cov_estimator.fit(activity_matrix.T).covariance_

        # calculate precision
        precision = linalg.inv(covariance)

        # precision to partial correlation
        diagonal = np.atleast_2d(1. / np.sqrt(np.diag(precision)))
        correlation = precision * diagonal * diagonal.T

        # Force exact 0. on the diagonal
        np.fill_diagonal(correlation, 0.)
        connectivity_mat = -correlation
    else:
        # Computing values for a single target node
        X = activity_matrix.T
        y = target_ts
        # Note: LinearRegression fits an intercept by default
        # (the intercept beta is not included in the coef_ output)
        reg = LinearRegression().fit(X, y)
        connectivity_mat = reg.coef_
    return connectivity_mat
from sklearn.covariance import EmpiricalCovariance, LedoitWolf, MinCovDet, OAS


def covariance_estimator(matrix, method='ledoit-wolf', assume_centered=True,
                         store_precision=True, **kwargs):
    """
    Return a pre-fit estimator for covariance from one of the scikit-learn estimators

    :param matrix: matrix to fit covariance to (variables in rows; `matrix.T` is fitted)
    :param method: method, one of `SUPPORTED_SKLEARN_COVARIANCE_ESTIMATORS`
    :param assume_centered: whether to assume data to be centered
    :param store_precision: if true, computes precision matrix (i.e. the inverse covariance) too
    :param kwargs: other kwargs to pass to estimator
    :return: the fitted estimator
    """
    if method == 'ledoit-wolf':
        estimator = LedoitWolf(assume_centered=assume_centered,
                               store_precision=store_precision,
                               **kwargs)
    elif method == 'oas':
        estimator = OAS(assume_centered=assume_centered,
                        store_precision=store_precision,
                        **kwargs)
    elif method == 'mincovdet':
        estimator = MinCovDet(assume_centered=assume_centered,
                              store_precision=store_precision,
                              **kwargs)
    elif method == 'empirical':
        estimator = EmpiricalCovariance(assume_centered=assume_centered,
                                        store_precision=store_precision,
                                        **kwargs)
    else:
        # format the offending method name, not the (undefined) estimator
        raise ValueError('Unsupported estimator {!r}'.format(method))

    estimator.fit(matrix.T)
    return estimator
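# Example use of covariance_estimator above. Note that it fits matrix.T, so it
# expects variables in rows and observations in columns. Hypothetical data only.
import numpy as np

rng = np.random.RandomState(0)
series = rng.randn(10, 500)  # 10 variables observed 500 times

est = covariance_estimator(series, method='oas', assume_centered=False)
print(est.covariance_.shape)  # (10, 10)
print(est.precision_.shape)   # (10, 10), available since store_precision=True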
def max_IC_weight(ic_df,
                  factors_dict,
                  holding_period,
                  covariance_type="shrink"):
    """
    Given ic_df (a matrix of IC time series), a holding period and a rolling
    window, compute the corresponding multi-factor portfolio weights.

    :param factors_dict: dict of factors, of the form:
                         {"factor_name_1": factor_1, "factor_name_2": factor_2}
                         Each factor value is a pd.DataFrame whose index is date and whose columns are assets.
    :param ic_df: matrix of IC time series (pd.DataFrame), indexed by datetime, with factor names as columns.
                  E.g.:
                            BP       CFP        EP  ILLIQUIDITY    REVS20      SRMI     VOL20
            date
            2016-06-24  0.165260  0.002198  0.085632    -0.078074  0.173832  0.214377  0.068445
            2016-06-27  0.165537  0.003583  0.063299    -0.048674  0.180890  0.202724  0.081748
            2016-06-28  0.135215  0.010403  0.059038    -0.034879  0.111691  0.122554  0.042489
            2016-06-29  0.068774  0.019848  0.058476    -0.049971  0.042805  0.053339  0.079592
            2016-06-30  0.039431  0.012271  0.037432    -0.027272  0.010902  0.077293 -0.050667
    :param holding_period: holding period (int)
    :param covariance_type: "shrink"/"simple"; estimate the covariance matrix
                            with Ledoit-Wolf shrinkage or with the simple sample estimate
    :return: weight_df: factor weights (pd.DataFrame), indexed by datetime,
             with the names of the factors to be combined as columns.
    """
    weight_df = pd.DataFrame(index=ic_df.index, columns=ic_df.columns)
    lw = LedoitWolf()
    # Maximizing the IC of day t uses data up to t+period (to compute returns);
    # the resulting weights are applied to the factors at t+period
    for dt in ic_df.index:
        f_dt = pd.concat([
            factors_dict[factor_name].loc[dt] for factor_name in ic_df.columns
        ], axis=1).dropna()
        if len(f_dt) == 0:
            continue
        if covariance_type == "shrink":
            try:
                f_cov_mat = lw.fit(f_dt.values).covariance_
            except Exception:
                f_cov_mat = np.mat(np.cov(f_dt.T.values).astype(float))
        else:
            f_cov_mat = np.mat(np.cov(f_dt.T.values).astype(float))
        inv_f_cov_mat = np.linalg.inv(f_cov_mat)
        weight = inv_f_cov_mat * np.mat(ic_df.loc[dt].values).reshape(
            len(inv_f_cov_mat), 1)
        weight = np.array(weight.reshape(len(weight), ))[0]
        weight_df.loc[dt] = weight / np.sum(np.abs(weight))
    return weight_df.shift(holding_period)
def maximization(self):
    '''
    M-step of the EM algorithm; uses scikit-learn's LedoitWolf method to
    perform covariance matrix shrinkage.

    Arguments:
        sufficient statistics, i.e. model parameters
    Returns:
        the updated sufficient statistics; everything is stored on self,
        so there are no return values
    '''
    logger.info("running maximization function")

    logger.info("mean maximization")
    mu = np.divide(self.mu, self.ndata)

    logger.info("covariance maximization")
    for i in range(self._K):
        for j in range(self._K):
            self.cov[i, j] = (1.0 / self.ndata) * self.cov[i, j] \
                + self.ndata * mu[i] * mu[j] \
                - self.mu[i] * mu[j] - self.mu[j] * mu[i]
    logger.info("performing covariance shrinkage using sklearn module")
    lw = LedoitWolf()
    cov_result = lw.fit(self.cov, assume_centered=True).covariance_
    self.inv_cov = np.linalg.inv(cov_result)
    self.log_det_inv_cov = math_utli.safe_log(np.linalg.det(self.inv_cov))

    logger.info("topic maximization")
    for i in range(self._K):
        # normalizer for topic i: total weight over all words
        # (the original summed over axis=0, i.e. over topics)
        sum_m = np.sum(self.beta[i, :])

        if sum_m == 0:
            sum_m = -1000 * self._W
        else:
            sum_m = np.log(sum_m)

        for j in range(self._W):
            # normalize in log space: log beta = log(beta) - log(sum of beta)
            self.log_beta[i, j] = math_utli.safe_log(self.beta[i, j]) - sum_m

    logger.info("write model parameters to file")
    logger.info("write gaussian")
    with open('ctm_nu', 'w') as ctm_nu_dump:
        cPickle.dump(self.nu, ctm_nu_dump)
    with open('ctm_cov', 'w') as ctm_cov_dump:
        cPickle.dump(self.cov, ctm_cov_dump)
    with open('ctm_inv_cov', 'w') as ctm_inv_cov_dump:
        cPickle.dump(self.inv_cov, ctm_inv_cov_dump)
    with open('ctm_log_det_inv_cov', 'w') as ctm_log_det_inv_cov_dump:
        cPickle.dump(self.log_det_inv_cov, ctm_log_det_inv_cov_dump)
    logger.info("write topic matrix")
    with open('ctm_log_beta', 'w') as ctm_log_beta_dump:
        cPickle.dump(self.log_beta, ctm_log_beta_dump)
def max_IR_weight(ic_df,
                  holding_period,
                  rollback_period=120,
                  covariance_type="shrink"):
    """
    Given ic_df (a matrix of IC time series), a holding period and a rolling
    window, compute the corresponding multi-factor portfolio weights.

    :param ic_df: matrix of IC time series (pd.DataFrame), indexed by datetime, with factor names as columns.
                  E.g.:
                            BP       CFP        EP  ILLIQUIDITY    REVS20      SRMI     VOL20
            date
            2016-06-24  0.165260  0.002198  0.085632    -0.078074  0.173832  0.214377  0.068445
            2016-06-27  0.165537  0.003583  0.063299    -0.048674  0.180890  0.202724  0.081748
            2016-06-28  0.135215  0.010403  0.059038    -0.034879  0.111691  0.122554  0.042489
            2016-06-29  0.068774  0.019848  0.058476    -0.049971  0.042805  0.053339  0.079592
            2016-06-30  0.039431  0.012271  0.037432    -0.027272  0.010902  0.077293 -0.050667
    :param holding_period: holding period (int)
    :param rollback_period: rolling window: the factor weights for each day are computed
                            from the previous rollback_period IC observations, which give
                            the IC mean vector and IC covariance matrix (int).
    :param covariance_type: "shrink"/"simple"; estimate the covariance matrix
                            with Ledoit-Wolf shrinkage or with the simple sample estimate
    :return: weight_df: factor weights (pd.DataFrame), indexed by datetime,
             with the names of the factors to be combined as columns.
    """
    # Maximizing the IC-IR over days t-n ~ t uses data up to t+period (to compute
    # returns); the resulting weights are applied to the factors at t+period
    n = rollback_period
    weight_df = pd.DataFrame(index=ic_df.index, columns=ic_df.columns)
    lw = LedoitWolf()
    for dt in ic_df.index:
        ic_dt = ic_df[ic_df.index <= dt].tail(n)
        if len(ic_dt) < n:
            continue
        if covariance_type == "shrink":
            try:
                ic_cov_mat = lw.fit(ic_dt.values).covariance_
            except Exception:
                ic_cov_mat = np.mat(np.cov(ic_dt.T.values).astype(float))
        else:
            ic_cov_mat = np.mat(np.cov(ic_dt.T.values).astype(float))
        inv_ic_cov_mat = np.linalg.inv(ic_cov_mat)
        weight = inv_ic_cov_mat * np.mat(ic_dt.mean().values).reshape(
            len(inv_ic_cov_mat), 1)
        weight = np.array(weight.reshape(len(weight), ))[0]
        weight_df.loc[dt] = weight / np.sum(np.abs(weight))
    return weight_df.shift(holding_period)
def get_ic_weight_shrink_df(self, ic_df, holding_period, rollback_period=120):
    """
    Given ic_df (a matrix of IC time series), a holding period and a rolling
    window, compute the corresponding multi-factor portfolio weights.

    :param ic_df: matrix of IC time series (pd.DataFrame), indexed by datetime, with factor names as columns.
                  E.g.:
                        BP       CFP        EP  ILLIQUIDITY    REVS20      SRMI     VOL20
        date
        2016-06-24  0.165260  0.002198  0.085632    -0.078074  0.173832  0.214377  0.068445
        2016-06-27  0.165537  0.003583  0.063299    -0.048674  0.180890  0.202724  0.081748
        2016-06-28  0.135215  0.010403  0.059038    -0.034879  0.111691  0.122554  0.042489
        2016-06-29  0.068774  0.019848  0.058476    -0.049971  0.042805  0.053339  0.079592
        2016-06-30  0.039431  0.012271  0.037432    -0.027272  0.010902  0.077293 -0.050667
    :param holding_period: holding period (int)
    :param rollback_period: rolling window: the factor weights for each day are computed
                            from the previous rollback_period IC observations, which give
                            the IC mean vector and IC covariance matrix (int).
    :return: ic_weight_shrink_df: factor weights obtained with Ledoit-Wolf shrinkage
             (pd.DataFrame), indexed by datetime, with the names of the factors
             to be combined as columns.
    """
    from sklearn.covariance import LedoitWolf
    import numpy as np
    n = rollback_period
    ic_weight_shrink_df = pd.DataFrame(index=ic_df.index, columns=ic_df.columns)
    lw = LedoitWolf()
    for dt in ic_df.index:
        ic_dt = ic_df[ic_df.index < dt].tail(n)
        if len(ic_dt) < n:
            continue
        try:
            ic_cov_mat = lw.fit(ic_dt.values).covariance_
        except Exception:
            ic_cov_mat = np.mat(np.cov(ic_dt.T.values).astype(float))
        inv_ic_cov_mat = np.linalg.inv(ic_cov_mat)
        weight = inv_ic_cov_mat * np.mat(ic_dt.mean()).reshape(len(inv_ic_cov_mat), 1)
        weight = np.array(weight.reshape(len(weight), ))[0]
        ic_weight_shrink_df.loc[dt] = weight / np.sum(weight)
    return ic_weight_shrink_df.shift(holding_period)
import pandas as pd
from sklearn.covariance import LedoitWolf, empirical_covariance


def prepareProblem(filePath, shrinkage=False, subset=False, subsetSize=0):
    # Import data from .csv
    df = pd.read_csv(filePath, sep=';')
    df.index = df.date
    df = df.drop('date', axis=1)

    # Subset, if requested via subset=True
    if subset:
        df = df.tail(subsetSize)

    # Estimate covariance using Empirical/MLE
    # Expected input is returns, hence set: assume_centered = True
    mleFitted = empirical_covariance(X=df, assume_centered=True)
    sigma = mleFitted

    if shrinkage:
        # Estimate covariance using Ledoit-Wolf; first create an instance of the estimator
        lw = LedoitWolf(assume_centered=True)
        lwFitted = lw.fit(X=df).covariance_
        sigma = lwFitted

    return sigma
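# Hypothetical invocations of prepareProblem; 'returns.csv' is a placeholder
# for a semicolon-separated file with a 'date' column plus one column per asset.
sigma_mle = prepareProblem('returns.csv')
sigma_lw = prepareProblem('returns.csv', shrinkage=True)
sigma_recent = prepareProblem('returns.csv', shrinkage=True,
                              subset=True, subsetSize=250)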
def Caculate_Weight_LW(fct_class_name, fct_class, n, dir, ic_df):
    store_path = dir + '/data_out/class_factor_weight/' + fct_class_name
    isExists = os.path.exists(store_path)
    if not isExists:
        os.makedirs(store_path)
    ic_weight_shrink_df = pd.DataFrame(index=ic_df.index, columns=ic_df.columns)
    e = 1e-10  # a value very close to 0
    lw = LedoitWolf()
    for dt in ic_df.index:
        ic_dt = ic_df[ic_df.index < dt].tail(n)
        if len(ic_dt) < n:
            continue
        ic_cov_mat = lw.fit(ic_dt.values).covariance_
        inv_ic_cov_mat = np.linalg.inv(ic_cov_mat)
        weight = np.matmul(inv_ic_cov_mat,
                           np.mat(ic_dt.mean()).reshape(len(inv_ic_cov_mat), 1))
        # weight = inv_ic_cov_mat * np.mat(ic_dt.mean()).reshape(len(inv_ic_cov_mat), 1)
        weight = np.array(weight.reshape(len(weight), ))[0]
        ic_weight_shrink_df.loc[dt] = weight / np.sum(weight)
        '''IC = np.array(ic_dt.mean()).reshape(4, 1)
        fun = lambda W: (-(np.matmul(W.T, IC) / np.sqrt(W.T * ic_cov_mat * W)))[0][0]
        # constraint function
        cons = ({'type': 'ineq', 'fun': lambda W: W - e})
        W0 = np.random.rand(len(fct_class), 1)
        res = minimize(fun, W0, method='SLSQP', constraints=cons)
        ic_weight_shrink_df.loc[dt] = res.x'''
    ic_weight_shrink_df = ic_weight_shrink_df.dropna(axis=0, how='any')
    color = ['green', 'blue', 'orange', 'gray']
    for fct in fct_class:
        # ic_weight_df[fct] = np.array(ic_weight_df[fct]) / np.array(ic_weight_df['Col_sum'])
        plt.plot(ic_weight_shrink_df.index, ic_weight_shrink_df[fct],
                 color=color[fct_class.index(fct)], label=fct)
    plt.legend()
    plt.title('Factor weight using Ledoit-Wolf covariance')
    plt.savefig(store_path + '/weight_maxIR_LW.png')
    plt.close()
    return ic_weight_shrink_df
def test_ledoit_wolf():
    """Tests LedoitWolf module on a simple dataset.

    """
    # test shrinkage coeff on a simple data set
    lw = LedoitWolf()
    lw.fit(X, assume_centered=True)
    assert_almost_equal(lw.shrinkage_, 0.00192, 4)
    assert_almost_equal(lw.score(X, assume_centered=True), -2.89795, 4)
    # compare shrunk covariance obtained from data and from MLE estimate
    lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X,
                                                         assume_centered=True)
    assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4)
    assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_)
    # compare estimates given by LW and ShrunkCovariance
    scov = ShrunkCovariance(shrinkage=lw.shrinkage_)
    scov.fit(X, assume_centered=True)
    assert_array_almost_equal(scov.covariance_, lw.covariance_, 4)

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    lw = LedoitWolf()
    lw.fit(X_1d, assume_centered=True)
    lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_1d,
                                                         assume_centered=True)
    assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4)
    assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_)
    assert_array_almost_equal((X_1d ** 2).sum() / n_samples, lw.covariance_, 4)

    # test shrinkage coeff on a simple data set (without saving precision)
    lw = LedoitWolf(store_precision=False)
    lw.fit(X, assume_centered=True)
    assert_almost_equal(lw.score(X, assume_centered=True), -2.89795, 4)
    assert(lw.precision_ is None)

    # Same tests without assuming centered data
    # test shrinkage coeff on a simple data set
    lw = LedoitWolf()
    lw.fit(X)
    assert_almost_equal(lw.shrinkage_, 0.007582, 4)
    assert_almost_equal(lw.score(X), 2.243483, 4)
    # compare shrunk covariance obtained from data and from MLE estimate
    lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X)
    assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4)
    assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_)
    # compare estimates given by LW and ShrunkCovariance
    scov = ShrunkCovariance(shrinkage=lw.shrinkage_)
    scov.fit(X)
    assert_array_almost_equal(scov.covariance_, lw.covariance_, 4)

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    lw = LedoitWolf()
    lw.fit(X_1d)
    lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_1d)
    assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4)
    assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_)
    assert_array_almost_equal(empirical_covariance(X_1d), lw.covariance_, 4)

    # test shrinkage coeff on a simple data set (without saving precision)
    lw = LedoitWolf(store_precision=False)
    lw.fit(X)
    assert_almost_equal(lw.score(X), 2.2434839, 4)
    assert(lw.precision_ is None)
def test_connectivity_measure_outputs(): n_subjects = 10 n_features = 49 n_samples = 200 # Generate signals and compute covariances emp_covs = [] ledoit_covs = [] signals = [] random_state = check_random_state(0) ledoit_estimator = LedoitWolf() for k in range(n_subjects): signal = random_state.randn(n_samples, n_features) signals.append(signal) signal -= signal.mean(axis=0) emp_covs.append((signal.T).dot(signal) / n_samples) ledoit_covs.append(ledoit_estimator.fit(signal).covariance_) kinds = ["correlation", "tangent", "precision", "partial correlation"] # Check outputs properties for cov_estimator, covs in zip([EmpiricalCovariance(), LedoitWolf()], [emp_covs, ledoit_covs]): input_covs = copy.copy(covs) for kind in kinds: conn_measure = ConnectivityMeasure(kind=kind, cov_estimator=cov_estimator) connectivities = conn_measure.fit_transform(signals) # Generic assert_true(isinstance(connectivities, np.ndarray)) assert_equal(len(connectivities), len(covs)) for k, cov_new in enumerate(connectivities): assert_array_equal(input_covs[k], covs[k]) assert(is_spd(covs[k], decimal=7)) # Positive definiteness if expected and output value checks if kind == "tangent": assert_array_almost_equal(cov_new, cov_new.T) gmean_sqrt = _map_eigenvalues(np.sqrt, conn_measure.mean_) assert(is_spd(gmean_sqrt, decimal=7)) assert(is_spd(conn_measure.whitening_, decimal=7)) assert_array_almost_equal(conn_measure.whitening_.dot( gmean_sqrt), np.eye(n_features)) assert_array_almost_equal(gmean_sqrt.dot( _map_eigenvalues(np.exp, cov_new)).dot(gmean_sqrt), covs[k]) elif kind == "precision": assert(is_spd(cov_new, decimal=7)) assert_array_almost_equal(cov_new.dot(covs[k]), np.eye(n_features)) elif kind == "correlation": assert(is_spd(cov_new, decimal=7)) d = np.sqrt(np.diag(np.diag(covs[k]))) if cov_estimator == EmpiricalCovariance(): assert_array_almost_equal(d.dot(cov_new).dot(d), covs[k]) assert_array_almost_equal(np.diag(cov_new), np.ones((n_features))) elif kind == "partial correlation": prec = linalg.inv(covs[k]) d = np.sqrt(np.diag(np.diag(prec))) assert_array_almost_equal(d.dot(cov_new).dot(d), -prec + 2 * np.diag(np.diag(prec)))
real_cov = toeplitz(r**np.arange(n_features)) coloring_matrix = cholesky(real_cov) n_samples_range = np.arange(6, 31, 1) repeat = 100 lw_mse = np.zeros((n_samples_range.size, repeat)) oa_mse = np.zeros((n_samples_range.size, repeat)) lw_shrinkage = np.zeros((n_samples_range.size, repeat)) oa_shrinkage = np.zeros((n_samples_range.size, repeat)) for i, n_samples in enumerate(n_samples_range): for j in range(repeat): X = np.dot( np.random.normal(size=(n_samples, n_features)), coloring_matrix.T) lw = LedoitWolf(store_precision=False) lw.fit(X, assume_centered=True) lw_mse[i,j] = lw.error_norm(real_cov, scaling=False) lw_shrinkage[i,j] = lw.shrinkage_ oa = OAS(store_precision=False) oa.fit(X, assume_centered=True) oa_mse[i,j] = oa.error_norm(real_cov, scaling=False) oa_shrinkage[i,j] = oa.shrinkage_ # plot MSE pl.subplot(2,1,1) pl.errorbar(n_samples_range, lw_mse.mean(1), yerr=lw_mse.std(1), label='Ledoit-Wolf', color='g') pl.errorbar(n_samples_range, oa_mse.mean(1), yerr=oa_mse.std(1), label='OAS', color='r') pl.ylabel("Squared error")
base_X_test = np.random.normal(size=(n_samples, n_features)) # Color samples coloring_matrix = np.random.normal(size=(n_features, n_features)) X_train = np.dot(base_X_train, coloring_matrix) X_test = np.dot(base_X_test, coloring_matrix) ############################################################################### # Compute Ledoit-Wolf and Covariances on a grid of shrinkages from sklearn.covariance import LedoitWolf, OAS, ShrunkCovariance, \ log_likelihood, empirical_covariance # Ledoit-Wolf optimal shrinkage coefficient estimate lw = LedoitWolf() loglik_lw = lw.fit(X_train, assume_centered=True).score( X_test, assume_centered=True) # OAS coefficient estimate oa = OAS() loglik_oa = oa.fit(X_train, assume_centered=True).score( X_test, assume_centered=True) # spanning a range of possible shrinkage coefficient values shrinkages = np.logspace(-3, 0, 30) negative_logliks = [-ShrunkCovariance(shrinkage=s).fit( X_train, assume_centered=True).score(X_test, assume_centered=True) \ for s in shrinkages] # getting the likelihood under the real model real_cov = np.dot(coloring_matrix.T, coloring_matrix) emp_cov = empirical_covariance(X_train)
def test_ledoit_wolf(): # Tests LedoitWolf module on a simple dataset. # test shrinkage coeff on a simple data set X_centered = X - X.mean(axis=0) lw = LedoitWolf(assume_centered=True) lw.fit(X_centered) shrinkage_ = lw.shrinkage_ score_ = lw.score(X_centered) assert_almost_equal(ledoit_wolf_shrinkage(X_centered, assume_centered=True), shrinkage_) assert_almost_equal(ledoit_wolf_shrinkage(X_centered, assume_centered=True, block_size=6), shrinkage_) # compare shrunk covariance obtained from data and from MLE estimate lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_centered, assume_centered=True) assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_) # compare estimates given by LW and ShrunkCovariance scov = ShrunkCovariance(shrinkage=lw.shrinkage_, assume_centered=True) scov.fit(X_centered) assert_array_almost_equal(scov.covariance_, lw.covariance_, 4) # test with n_features = 1 X_1d = X[:, 0].reshape((-1, 1)) lw = LedoitWolf(assume_centered=True) lw.fit(X_1d) lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_1d, assume_centered=True) assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_) assert_array_almost_equal((X_1d ** 2).sum() / n_samples, lw.covariance_, 4) # test shrinkage coeff on a simple data set (without saving precision) lw = LedoitWolf(store_precision=False, assume_centered=True) lw.fit(X_centered) assert_almost_equal(lw.score(X_centered), score_, 4) assert(lw.precision_ is None) # Same tests without assuming centered data # test shrinkage coeff on a simple data set lw = LedoitWolf() lw.fit(X) assert_almost_equal(lw.shrinkage_, shrinkage_, 4) assert_almost_equal(lw.shrinkage_, ledoit_wolf_shrinkage(X)) assert_almost_equal(lw.shrinkage_, ledoit_wolf(X)[1]) assert_almost_equal(lw.score(X), score_, 4) # compare shrunk covariance obtained from data and from MLE estimate lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X) assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_) # compare estimates given by LW and ShrunkCovariance scov = ShrunkCovariance(shrinkage=lw.shrinkage_) scov.fit(X) assert_array_almost_equal(scov.covariance_, lw.covariance_, 4) # test with n_features = 1 X_1d = X[:, 0].reshape((-1, 1)) lw = LedoitWolf() lw.fit(X_1d) lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_1d) assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_) assert_array_almost_equal(empirical_covariance(X_1d), lw.covariance_, 4) # test with one sample # warning should be raised when using only 1 sample X_1sample = np.arange(5).reshape(1, 5) lw = LedoitWolf() assert_warns(UserWarning, lw.fit, X_1sample) assert_array_almost_equal(lw.covariance_, np.zeros(shape=(5, 5), dtype=np.float64)) # test shrinkage coeff on a simple data set (without saving precision) lw = LedoitWolf(store_precision=False) lw.fit(X) assert_almost_equal(lw.score(X), score_, 4) assert(lw.precision_ is None)
from sklearn.covariance import LedoitWolf


def shrink(X):
    # Ledoit-Wolf shrunk covariance of X, shape (n_samples, n_features)
    lw = LedoitWolf(store_precision=False, assume_centered=False)
    lw.fit(X)
    return lw.covariance_
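# Quick sketch contrasting shrink() with the unshrunk sample covariance in the
# n_samples < n_features regime, where the sample estimate is singular.
# The toy data is an assumption for illustration.
import numpy as np

rng = np.random.RandomState(7)
X_wide = rng.randn(15, 40)  # fewer samples than features

sample_cov = np.cov(X_wide, rowvar=False)  # rank-deficient here
shrunk_cov = shrink(X_wide)                # full rank, by construction

print(np.linalg.matrix_rank(sample_cov), np.linalg.matrix_rank(shrunk_cov))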
def test_ledoit_wolf():
    # Tests LedoitWolf module on a simple dataset.
    # test shrinkage coeff on a simple data set
    X_centered = X - X.mean(axis=0)
    lw = LedoitWolf(assume_centered=True)
    lw.fit(X_centered)
    shrinkage_ = lw.shrinkage_
    score_ = lw.score(X_centered)
    assert_almost_equal(
        ledoit_wolf_shrinkage(X_centered, assume_centered=True), shrinkage_)
    assert_almost_equal(
        ledoit_wolf_shrinkage(X_centered, assume_centered=True, block_size=6),
        shrinkage_)
    # compare shrunk covariance obtained from data and from MLE estimate
    lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_centered,
                                                         assume_centered=True)
    assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4)
    assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_)
    # compare estimates given by LW and ShrunkCovariance
    scov = ShrunkCovariance(shrinkage=lw.shrinkage_, assume_centered=True)
    scov.fit(X_centered)
    assert_array_almost_equal(scov.covariance_, lw.covariance_, 4)

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    lw = LedoitWolf(assume_centered=True)
    lw.fit(X_1d)
    lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_1d,
                                                         assume_centered=True)
    assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4)
    assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_)
    assert_array_almost_equal((X_1d**2).sum() / n_samples, lw.covariance_, 4)

    # test shrinkage coeff on a simple data set (without saving precision)
    lw = LedoitWolf(store_precision=False, assume_centered=True)
    lw.fit(X_centered)
    assert_almost_equal(lw.score(X_centered), score_, 4)
    assert (lw.precision_ is None)

    # (too) large data set
    X_large = np.ones((20, 200))
    assert_raises(MemoryError, ledoit_wolf, X_large, block_size=100)

    # Same tests without assuming centered data
    # test shrinkage coeff on a simple data set
    lw = LedoitWolf()
    lw.fit(X)
    assert_almost_equal(lw.shrinkage_, shrinkage_, 4)
    assert_almost_equal(lw.shrinkage_, ledoit_wolf_shrinkage(X))
    assert_almost_equal(lw.shrinkage_, ledoit_wolf(X)[1])
    assert_almost_equal(lw.score(X), score_, 4)
    # compare shrunk covariance obtained from data and from MLE estimate
    lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X)
    assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4)
    assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_)
    # compare estimates given by LW and ShrunkCovariance
    scov = ShrunkCovariance(shrinkage=lw.shrinkage_)
    scov.fit(X)
    assert_array_almost_equal(scov.covariance_, lw.covariance_, 4)

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    lw = LedoitWolf()
    lw.fit(X_1d)
    lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_1d)
    assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4)
    assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_)
    assert_array_almost_equal(empirical_covariance(X_1d), lw.covariance_, 4)

    # test with one sample
    # warning should be raised when using only 1 sample
    X_1sample = np.arange(5)
    lw = LedoitWolf()
    assert_warns(UserWarning, lw.fit, X_1sample)
    assert_array_almost_equal(lw.covariance_,
                              np.zeros(shape=(5, 5), dtype=np.float64))

    # test shrinkage coeff on a simple data set (without saving precision)
    lw = LedoitWolf(store_precision=False)
    lw.fit(X)
    assert_almost_equal(lw.score(X), score_, 4)
    assert (lw.precision_ is None)
except ImportError:
    has_sklearn = False
    print('sklearn not available')


def cov2corr(cov):
    std_ = np.sqrt(np.diag(cov))
    corr = cov / np.outer(std_, std_)
    return corr

if has_sklearn:
    from sklearn.covariance import LedoitWolf, OAS, MCD

    lw = LedoitWolf(store_precision=False)
    lw.fit(rr, assume_centered=False)
    cov_lw = lw.covariance_
    corr_lw = cov2corr(cov_lw)

    oas = OAS(store_precision=False)
    oas.fit(rr, assume_centered=False)
    cov_oas = oas.covariance_
    corr_oas = cov2corr(cov_oas)

    mcd = MCD()  # .fit(rr, reweight=None)
    mcd.fit(rr, assume_centered=False)
    cov_mcd = mcd.covariance_
    corr_mcd = cov2corr(cov_mcd)

    titles = ['raw correlation', 'lw', 'oas', 'mcd']
    normcolor = None
def test_ledoit_wolf(): # Tests LedoitWolf module on a simple dataset. # test shrinkage coeff on a simple data set X_centered = X - X.mean(axis=0) lw = LedoitWolf(assume_centered=True) lw.fit(X_centered) shrinkage_ = lw.shrinkage_ score_ = lw.score(X_centered) assert_almost_equal( ledoit_wolf_shrinkage(X_centered, assume_centered=True), shrinkage_) assert_almost_equal( ledoit_wolf_shrinkage(X_centered, assume_centered=True, block_size=6), shrinkage_) # compare shrunk covariance obtained from data and from MLE estimate lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_centered, assume_centered=True) assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_) # compare estimates given by LW and ShrunkCovariance scov = ShrunkCovariance(shrinkage=lw.shrinkage_, assume_centered=True) scov.fit(X_centered) assert_array_almost_equal(scov.covariance_, lw.covariance_, 4) # test with n_features = 1 X_1d = X[:, 0].reshape((-1, 1)) lw = LedoitWolf(assume_centered=True) lw.fit(X_1d) lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_1d, assume_centered=True) assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_) assert_array_almost_equal((X_1d**2).sum() / n_samples, lw.covariance_, 4) # test shrinkage coeff on a simple data set (without saving precision) lw = LedoitWolf(store_precision=False, assume_centered=True) lw.fit(X_centered) assert_almost_equal(lw.score(X_centered), score_, 4) assert (lw.precision_ is None) # Same tests without assuming centered data # test shrinkage coeff on a simple data set lw = LedoitWolf() lw.fit(X) assert_almost_equal(lw.shrinkage_, shrinkage_, 4) assert_almost_equal(lw.shrinkage_, ledoit_wolf_shrinkage(X)) assert_almost_equal(lw.shrinkage_, ledoit_wolf(X)[1]) assert_almost_equal(lw.score(X), score_, 4) # compare shrunk covariance obtained from data and from MLE estimate lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X) assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_) # compare estimates given by LW and ShrunkCovariance scov = ShrunkCovariance(shrinkage=lw.shrinkage_) scov.fit(X) assert_array_almost_equal(scov.covariance_, lw.covariance_, 4) # test with n_features = 1 X_1d = X[:, 0].reshape((-1, 1)) lw = LedoitWolf() lw.fit(X_1d) lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_1d) assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_) assert_array_almost_equal(empirical_covariance(X_1d), lw.covariance_, 4) # test with one sample # warning should be raised when using only 1 sample X_1sample = np.arange(5).reshape(1, 5) lw = LedoitWolf() warn_msg = ( "Only one sample available. You may want to reshape your data array") with pytest.warns(UserWarning, match=warn_msg): lw.fit(X_1sample) assert_array_almost_equal(lw.covariance_, np.zeros(shape=(5, 5), dtype=np.float64)) # test shrinkage coeff on a simple data set (without saving precision) lw = LedoitWolf(store_precision=False) lw.fit(X) assert_almost_equal(lw.score(X), score_, 4) assert (lw.precision_ is None)
real_cov = toeplitz(r ** np.arange(n_features)) coloring_matrix = cholesky(real_cov) n_samples_range = np.arange(6, 31, 1) repeat = 100 lw_mse = np.zeros((n_samples_range.size, repeat)) oa_mse = np.zeros((n_samples_range.size, repeat)) lw_shrinkage = np.zeros((n_samples_range.size, repeat)) oa_shrinkage = np.zeros((n_samples_range.size, repeat)) for i, n_samples in enumerate(n_samples_range): for j in range(repeat): X = np.dot( np.random.normal(size=(n_samples, n_features)), coloring_matrix.T) lw = LedoitWolf(store_precision=False, assume_centered=True) lw.fit(X) lw_mse[i, j] = lw.error_norm(real_cov, scaling=False) lw_shrinkage[i, j] = lw.shrinkage_ oa = OAS(store_precision=False, assume_centered=True) oa.fit(X) oa_mse[i, j] = oa.error_norm(real_cov, scaling=False) oa_shrinkage[i, j] = oa.shrinkage_ # plot MSE plt.subplot(2, 1, 1) plt.errorbar(n_samples_range, lw_mse.mean(1), yerr=lw_mse.std(1), label='Ledoit-Wolf', color='g') plt.errorbar(n_samples_range, oa_mse.mean(1), yerr=oa_mse.std(1), label='OAS', color='r') plt.ylabel("Squared error")
import numpy as np
from sklearn.covariance import LedoitWolf as LW


def lda_train_scaled(fv, shrink=False):
    """Train the LDA classifier.

    Parameters
    ----------
    fv : ``Data`` object
        the feature vector must have 2 dimensional data, the first
        dimension being the class axis. The unique class labels must be
        0 and 1 otherwise a ``ValueError`` will be raised.
    shrink : Boolean, optional
        use shrinkage

    Returns
    -------
    w : 1d array
    b : float

    Raises
    ------
    ValueError : if the class labels are not exactly 0s and 1s

    Examples
    --------
    >>> clf = lda_train_scaled(fv_train, shrink=True)
    >>> out = lda_apply(fv_test, clf)

    See Also
    --------
    lda_apply

    """
    assert shrink is True
    x = fv.data
    y = fv.axes[0]
    if len(np.unique(y)) != 2:
        raise ValueError('Should only have two unique class labels, instead got'
                         ': {labels}'.format(labels=np.unique(y)))
    # Use sorted labels
    labels = np.sort(np.unique(y))
    mu1 = np.mean(x[y == labels[0]], axis=0)
    mu2 = np.mean(x[y == labels[1]], axis=0)
    # x' = x - m
    m = np.empty(x.shape)
    m[y == labels[0]] = mu1
    m[y == labels[1]] = mu2
    x2 = x - m
    # w = cov(x)^-1(mu2 - mu1)
    if shrink:
        estimator = LW()
        covm = estimator.fit(x2).covariance_
    else:
        covm = np.cov(x2.T)
    w = np.dot(np.linalg.pinv(covm), (mu2 - mu1))

    # From matlab bbci toolbox:
    # https://github.com/bbci/bbci_public/blob/fe6caeb549fdc864a5accf76ce71dd2a926ff12b/classification/train_RLDAshrink.m#L133-L134
    # C.w = C.w / (C.w' * diff(C_mean, 1, 2)) * 2;
    # C.b = -C.w' * mean(C_mean, 2);
    w = (w / np.dot(w.T, (mu2 - mu1))) * 2
    b = np.dot(-w.T, np.mean((mu1, mu2), axis=0))
    assert not np.any(np.isnan(w))
    assert not np.isnan(b)
    return w, b
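# Minimal sketch of calling lda_train_scaled. ``_FV`` is a hypothetical
# stand-in exposing the .data / .axes attributes the function expects, since
# the original ``Data`` feature-vector class is not shown in this snippet.
class _FV:
    def __init__(self, data, labels):
        self.data = data
        self.axes = [labels]

rng = np.random.RandomState(0)
X_demo = np.vstack([rng.randn(50, 8) - 1, rng.randn(50, 8) + 1])
y_demo = np.repeat([0, 1], 50)

w, b = lda_train_scaled(_FV(X_demo, y_demo), shrink=True)
print(w.shape, b)  # (8,) and a scalar bias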
def test_connectivity_measure_outputs():
    n_subjects = 10
    n_features = 49

    # Generate signals and compute covariances
    emp_covs = []
    ledoit_covs = []
    signals = []
    ledoit_estimator = LedoitWolf()
    for k in range(n_subjects):
        n_samples = 200 + k
        signal, _, _ = generate_signals(n_features=n_features, n_confounds=5,
                                        length=n_samples, same_variance=False)
        signals.append(signal)
        signal -= signal.mean(axis=0)
        emp_covs.append((signal.T).dot(signal) / n_samples)
        ledoit_covs.append(ledoit_estimator.fit(signal).covariance_)

    kinds = ["covariance", "correlation", "tangent", "precision",
             "partial correlation"]

    # Check outputs properties
    for cov_estimator, covs in zip([EmpiricalCovariance(), LedoitWolf()],
                                   [emp_covs, ledoit_covs]):
        input_covs = copy.copy(covs)
        for kind in kinds:
            conn_measure = ConnectivityMeasure(kind=kind,
                                               cov_estimator=cov_estimator)
            connectivities = conn_measure.fit_transform(signals)

            # Generic
            assert isinstance(connectivities, np.ndarray)
            assert len(connectivities) == len(covs)

            for k, cov_new in enumerate(connectivities):
                assert_array_equal(input_covs[k], covs[k])
                assert(is_spd(covs[k], decimal=7))

                # Positive definiteness if expected and output value checks
                if kind == "tangent":
                    assert_array_almost_equal(cov_new, cov_new.T)
                    gmean_sqrt = _map_eigenvalues(np.sqrt, conn_measure.mean_)
                    assert(is_spd(gmean_sqrt, decimal=7))
                    assert(is_spd(conn_measure.whitening_, decimal=7))
                    assert_array_almost_equal(conn_measure.whitening_.dot(
                        gmean_sqrt), np.eye(n_features))
                    assert_array_almost_equal(gmean_sqrt.dot(
                        _map_eigenvalues(np.exp, cov_new)).dot(gmean_sqrt),
                        covs[k])
                elif kind == "precision":
                    assert(is_spd(cov_new, decimal=7))
                    assert_array_almost_equal(cov_new.dot(covs[k]),
                                              np.eye(n_features))
                elif kind == "correlation":
                    assert(is_spd(cov_new, decimal=7))
                    d = np.sqrt(np.diag(np.diag(covs[k])))
                    if cov_estimator == EmpiricalCovariance():
                        assert_array_almost_equal(d.dot(cov_new).dot(d),
                                                  covs[k])
                    assert_array_almost_equal(np.diag(cov_new),
                                              np.ones((n_features)))
                elif kind == "partial correlation":
                    prec = linalg.inv(covs[k])
                    d = np.sqrt(np.diag(np.diag(prec)))
                    assert_array_almost_equal(d.dot(cov_new).dot(d),
                                              -prec + 2 * np.diag(
                                                  np.diag(prec)))

    # Check the mean_
    for kind in kinds:
        conn_measure = ConnectivityMeasure(kind=kind)
        conn_measure.fit_transform(signals)
        assert (conn_measure.mean_).shape == (n_features, n_features)
        if kind != 'tangent':
            assert_array_almost_equal(
                conn_measure.mean_,
                np.mean(conn_measure.transform(signals), axis=0))

    # Check that the mean isn't modified in transform
    conn_measure = ConnectivityMeasure(kind='covariance')
    conn_measure.fit(signals[:1])
    mean = conn_measure.mean_
    conn_measure.transform(signals[1:])
    assert_array_equal(mean, conn_measure.mean_)

    # Check vectorization option
    for kind in kinds:
        conn_measure = ConnectivityMeasure(kind=kind)
        connectivities = conn_measure.fit_transform(signals)
        conn_measure = ConnectivityMeasure(vectorize=True, kind=kind)
        vectorized_connectivities = conn_measure.fit_transform(signals)
        assert_array_almost_equal(vectorized_connectivities,
                                  sym_matrix_to_vec(connectivities))

    # Check not fitted error
    with pytest.raises(ValueError, match='has not been fitted. '):
        ConnectivityMeasure().inverse_transform(vectorized_connectivities)

    # Check inverse transformation
    kinds.remove('tangent')
    for kind in kinds:
        # without vectorization: input matrices are returned with no change
        conn_measure = ConnectivityMeasure(kind=kind)
        connectivities = conn_measure.fit_transform(signals)
        assert_array_almost_equal(
            conn_measure.inverse_transform(connectivities), connectivities)

        # with vectorization: input vectors are reshaped into matrices
        # if diagonal has not been discarded
        conn_measure = ConnectivityMeasure(kind=kind, vectorize=True)
        vectorized_connectivities = conn_measure.fit_transform(signals)
        assert_array_almost_equal(
            conn_measure.inverse_transform(vectorized_connectivities),
            connectivities)

    # with vectorization if diagonal has been discarded
    for kind in ['correlation', 'partial correlation']:
        connectivities = ConnectivityMeasure(kind=kind).fit_transform(signals)
        conn_measure = ConnectivityMeasure(kind=kind, vectorize=True,
                                           discard_diagonal=True)
        vectorized_connectivities = conn_measure.fit_transform(signals)
        assert_array_almost_equal(
            conn_measure.inverse_transform(vectorized_connectivities),
            connectivities)

    for kind in ['covariance', 'precision']:
        connectivities = ConnectivityMeasure(kind=kind).fit_transform(signals)
        conn_measure = ConnectivityMeasure(kind=kind, vectorize=True,
                                           discard_diagonal=True)
        vectorized_connectivities = conn_measure.fit_transform(signals)
        diagonal = np.array([np.diagonal(conn) / sqrt(2) for conn in
                             connectivities])
        inverse_transformed = conn_measure.inverse_transform(
            vectorized_connectivities, diagonal=diagonal)
        assert_array_almost_equal(inverse_transformed, connectivities)
        with pytest.raises(ValueError,
                           match='can not reconstruct connectivity matrices'):
            conn_measure.inverse_transform(vectorized_connectivities)

    # for 'tangent' kind, covariance matrices are reconstructed
    # without vectorization
    tangent_measure = ConnectivityMeasure(kind='tangent')
    displacements = tangent_measure.fit_transform(signals)
    covariances = ConnectivityMeasure(kind='covariance').fit_transform(
        signals)
    assert_array_almost_equal(
        tangent_measure.inverse_transform(displacements), covariances)

    # with vectorization
    # when diagonal has not been discarded
    tangent_measure = ConnectivityMeasure(kind='tangent', vectorize=True)
    vectorized_displacements = tangent_measure.fit_transform(signals)
    assert_array_almost_equal(
        tangent_measure.inverse_transform(vectorized_displacements),
        covariances)

    # when diagonal has been discarded
    tangent_measure = ConnectivityMeasure(kind='tangent', vectorize=True,
                                          discard_diagonal=True)
    vectorized_displacements = tangent_measure.fit_transform(signals)
    diagonal = np.array([np.diagonal(matrix) / sqrt(2) for matrix in
                         displacements])
    inverse_transformed = tangent_measure.inverse_transform(
        vectorized_displacements, diagonal=diagonal)
    assert_array_almost_equal(inverse_transformed, covariances)
    with pytest.raises(ValueError,
                       match='can not reconstruct connectivity matrices'):
        tangent_measure.inverse_transform(vectorized_displacements)
# different covariance matrix predictions cov_sample = predict_cov_sample(returns_sample) cor_sample = predict_cov_sample(returns_sample, True) cov_upper = cov_sample[np.triu_indices(cov_sample.shape[0], k=1)] cor_upper = cor_sample[np.triu_indices(cor_sample.shape[0], k=1)] sample_mean_cov = cov_upper.mean() sample_mean_cor = cor_upper.mean() sample_mean_var = np.diagonal(cov_sample).mean() if model_train_sample == "whole": if which_data == "both": if predict_corr: LW = LedoitWolf() cov_lw = LW.fit(returns_sample).covariance_ cov_model = predict_cov_matrix_both( lr, scaler, features_out_of_sample_reports, features_out_of_sample_industry, sample_mean_cor, standardize_cov_matrix, cov_lw) else: cov_model = predict_cov_matrix_both( lr, scaler, features_out_of_sample_reports, features_out_of_sample_industry, sample_mean_cov, standardize_cov_matrix, cov_sample, False) else: if predict_corr: LW = LedoitWolf() cov_lw = LW.fit(returns_sample).covariance_ cov_model = predict_correlation_matrix_model(
def plot_psds(psd_file, data_dir='/auto/tdrive/mschachter/data'):
    # read PairwiseCF file
    pcf_file = os.path.join(data_dir, 'aggregate', 'pairwise_cf.h5')
    pcf = AggregatePairwiseCF.load(pcf_file)
    # pcf.zscore_within_site()

    g = pcf.df.groupby(['bird', 'block', 'segment', 'electrode'])
    nsamps_electrodes = len(g)

    i = pcf.df.cell_index != -1
    g = pcf.df[i].groupby(['bird', 'block', 'segment', 'electrode', 'cell_index'])
    nsamps_cells = len(g)

    print('# of electrodes: %d' % nsamps_electrodes)
    print('# of cells: %d' % nsamps_cells)
    print('# of lfp samples: %d' % (pcf.lfp_psds.shape[0]))
    print('# of spike psd samples: %d' % (pcf.spike_psds.shape[0]))

    # compute the LFP mean and std
    lfp_psds = deepcopy(pcf.lfp_psds)
    print('lfp_psds_ind: max=%f, q99=%f' % (lfp_psds.max(), np.percentile(lfp_psds.ravel(), 99)))
    log_transform(lfp_psds)
    print('lfp_psds_ind: max=%f, q99=%f' % (lfp_psds.max(), np.percentile(lfp_psds.ravel(), 99)))
    nz = lfp_psds.sum(axis=1) > 0
    lfp_psds = lfp_psds[nz, :]
    lfp_psd_mean = lfp_psds.mean(axis=0)
    lfp_psd_std = lfp_psds.std(axis=0, ddof=1)
    nsamps_lfp = lfp_psds.shape[0]

    # get the spike rate
    spike_rate = pcf.df.spike_rate.values
    # plt.figure()
    # plt.hist(spike_rate, bins=20, color='g', alpha=0.7)
    # plt.title('Spike Rate Histogram, q1=%0.3f, q5=%0.3f, q10=%0.3f, q50=%0.3f, q99=%0.3f' %
    #           (np.percentile(spike_rate, 1), np.percentile(spike_rate, 5), np.percentile(spike_rate, 10),
    #            np.percentile(spike_rate, 50), np.percentile(spike_rate, 99)))
    # plt.show()

    # compute the covariance
    lfp_psd_z = deepcopy(lfp_psds)
    lfp_psd_z -= lfp_psd_mean
    lfp_psd_z /= lfp_psd_std
    lfp_and_spike_cov_est = LedoitWolf()
    lfp_and_spike_cov_est.fit(lfp_psd_z)
    lfp_and_spike_cov = lfp_and_spike_cov_est.covariance_

    """
    # read CRCNS file
    cell_data = dict()
    hf = h5py.File(psd_file, 'r')
    cnames = hf.attrs['col_names']
    for c in cnames:
        cell_data[c] = np.array(hf[c])
    crcns_psds = np.array(hf['psds'])
    freqs = hf.attrs['freqs']
    hf.close()

    cell_df = pd.DataFrame(cell_data)
    print 'regions=',cell_df.superregion.unique()

    name_map = {'brainstem':'MLd', 'thalamus':'OV', 'cortex':'Field L+CM'}
    """

    # resample the lfp mean and std
    freq_rs = np.linspace(pcf.freqs.min(), pcf.freqs.max(), 1000)
    lfp_mean_cs = interp1d(pcf.freqs, lfp_psd_mean, kind='cubic')
    lfp_mean_rs = lfp_mean_cs(freq_rs)

    lfp_std_cs = interp1d(pcf.freqs, lfp_psd_std, kind='cubic')
    lfp_std_rs = lfp_std_cs(freq_rs)

    # concatenate the lfp psd and log spike rate
    lfp_psd_and_spike_rate = list()
    for k, (li, si) in enumerate(zip(pcf.df['lfp_index'], pcf.df['spike_index'])):
        lpsd = pcf.lfp_psds[li, :]
        srate, sstd = pcf.spike_rates[si, :]
        if srate > 0:
            lfp_psd_and_spike_rate.append(np.hstack([lpsd, np.log(srate)]))
    lfp_psd_and_spike_rate = np.array(lfp_psd_and_spike_rate)

    nfreqs = len(pcf.freqs)
    lfp_rate_cc = np.zeros([nfreqs])
    for k in range(nfreqs):
        lfp_rate_cc[k] = np.corrcoef(lfp_psd_and_spike_rate[:, k], lfp_psd_and_spike_rate[:, -1])[0, 1]

    fig = plt.figure(figsize=(24, 12))
    fig.subplots_adjust(left=0.05, right=0.95, wspace=0.30, hspace=0.30)

    nrows = 2
    ncols = 100
    gs = plt.GridSpec(nrows, ncols)

    ax = plt.subplot(gs[0, :35])
    plt.errorbar(freq_rs, lfp_mean_rs, yerr=lfp_std_rs, c='k', linewidth=9.0,
                 elinewidth=3.0, ecolor='#D8D8D8', alpha=0.5, capthick=0.)
plt.axis('tight') plt.xlabel('Frequency (Hz)') plt.ylabel('Power (dB)') # plt.ylim(0, 1) plt.title('Mean LFP PSD') ax = plt.subplot(gs[1, :35]) plt.plot(pcf.freqs, lfp_rate_cc, '-', c=COLOR_BLUE_LFP, linewidth=9.0, alpha=0.7) plt.axhline(0, c='k') plt.axis('tight') plt.xlabel('Frequency (Hz)') plt.ylabel('Correlation Coefficient') plt.ylim(-0.05, 0.25) plt.title('LFP Power vs log Spike Rate') """ fi = freqs < 200 ax = plt.subplot(gs[1, :35]) clrs = ['k', '#d60036', COLOR_YELLOW_SPIKE] alphas = [0.8, 0.8, 0.6] for k,reg in enumerate(['brainstem', 'thalamus', 'cortex']): i = cell_df.superregion == reg indices = cell_df['index'][i].values psds = crcns_psds[indices, :] log_psds = deepcopy(psds) log_transform(log_psds) # compute the mean and sd of the power spectra psd_mean = log_psds.mean(axis=0) psd_std = log_psds.std(axis=0, ddof=1) psd_cv = psd_std / psd_mean # plot the mean power spectrum on the left plt.plot(freqs[fi], psd_mean[fi], c=clrs[k], linewidth=9.0, alpha=alphas[k]) plt.ylabel('Power (dB)') plt.xlabel('Frequency (Hz)') plt.axis('tight') plt.ylim(0, 1.0) plt.legend(['MLd', 'OV', 'Field L+CM'], fontsize='x-small', loc='upper right') plt.title('Mean PSTH PSDs (CRCNS Data)') """ ax = plt.subplot(gs[:, 40:]) plt.imshow(lfp_and_spike_cov, aspect='auto', interpolation='nearest', origin='lower', cmap=magma, vmin=0, vmax=1) plt.colorbar(label='Correlation Coefficient') xy = np.arange(len(pcf.freqs)) lbls = ['%d' % f for f in pcf.freqs] plt.xticks(xy, lbls, rotation=0) plt.yticks(xy, lbls) plt.axhline(nfreqs-0.5, c='w') plt.axvline(nfreqs-0.5, c='w') plt.xlabel('Frequency (Hz)') plt.ylabel('Frequency (Hz)') plt.title('LFP PSD Correlation Matrix') fname = os.path.join(get_this_dir(), 'crcns_data.svg') plt.savefig(fname, facecolor='w', edgecolor='none') plt.show()
from pathlib import Path

import mne
import numpy as np
from sklearn.covariance import LedoitWolf

from camcan.preprocessing import extract_connectivity
from camcan.processing import map_tangent

sample_data = Path(mne.datasets.sample.data_path())
fname = sample_data / Path('MEG/sample/sample_audvis_raw.fif')
raw = mne.io.read_raw_fif(str(fname), preload=True)

tmin = 0
tmax = 2
baseline = None
events = mne.find_events(raw)[:10]
raw.pick_types(meg='mag', eeg=False)
epochs = mne.Epochs(raw=raw, tmin=tmin, tmax=tmax, events=events, decim=5)
timeseries = epochs.get_data()

connectivity_tangent = extract_connectivity(timeseries, kind='tangent')

cov_estimator = LedoitWolf(store_precision=False)
connectivities = [cov_estimator.fit(x).covariance_ for x in timeseries]
connectivity_tangent2 = map_tangent(connectivities, diag=False)

np.testing.assert_array_equal(connectivity_tangent, connectivity_tangent2)
# build the condition mask: keep trials that are neither 'des' nor 'rest'
c_des_out = np.logical_not(label[:, 2] == b'des')
tmp_out = np.logical_and(c_des_out, mask_block)
c_rest_out = np.logical_not(label[:, 0] == b'rest')
cond_out = np.logical_and(tmp_out, c_rest_out)
y = label[cond_out, 2]
labels = np.unique(y)

# Prepare correlation
estimator = LedoitWolf()
scaler = StandardScaler()

# Result array: subjects x motor regions x conditions x conditions
result_matrix = np.empty([len(names), motor_region.shape[0],
                          labels.shape[0], labels.shape[0]])

# Analysis for each subject
for i, n in enumerate(sorted(names)):
    roi_name = fold_g + 'mni4060/asymroi_' + smt + '_' + n + '.npz'
    roi = np.load(roi_name)['roi'][cond_out]
    roi = roi[:, motor_region - 1]
    for j in range(motor_region.shape[0]):
        roi_j = roi[:, j]
        roi_mat = np.zeros(((y == b'imp').sum(), len(labels)))
        for z, lab in enumerate(sorted(labels)):
            roi_mat[:, z] = roi_j[y == lab]
        roi_sc = scaler.fit_transform(roi_mat)
        estimator.fit(roi_sc)
        matrix = estimator.covariance_
        # dissimilarity = 1 - covariance of the standardized condition patterns
        result_matrix[i, j] = 1 - matrix

np.savez_compressed('F:/IRM_Marche/dismatrix.npz', result_matrix)
def get_weight(self, date, IC_length, period, weight_way, halflife=0):
    IC_use_all = self.IC_all.loc[:date, self.factor_list].iloc[-IC_length - period:-period]
    IC_use = copy.deepcopy(IC_use_all)
    temp = -1
    loc = []
    for f in self.factor_list:
        temp += 1
        # drop factors whose IC series has too many missing values
        if Counter(np.isnan(IC_use[f]))[0] < IC_use.shape[0] * 0.2:
            loc.append(temp)
            IC_use = IC_use.drop(f, axis=1)
    # row indices where every factor has an IC value
    ind_valid = np.where(~np.isnan(IC_use.sum(axis=1, skipna=False).values))[0]
    IC_use = IC_use.iloc[ind_valid]
    IC_mean = IC_use.mean(axis=0).values.reshape(IC_use.shape[1], 1)
    if weight_way == 'ICIR_Ledoit':
        lw = LedoitWolf()
        IC_sig = lw.fit(IC_use.values).covariance_
        weight = np.dot(np.linalg.inv(IC_sig), IC_mean)
    elif weight_way == 'ICIR_sigma':
        IC_sig = np.cov(IC_use.values, rowvar=False)
        weight = np.dot(np.linalg.inv(IC_sig), IC_mean)
    elif weight_way == 'ICIR':
        IC_sig = (IC_use.std(axis=0)).values.reshape(IC_use.shape[1], 1)
        weight = IC_mean / IC_sig
    elif weight_way == 'IC_halflife':
        if halflife > 0:
            # decay rate from the half-life (the original hard-coded 60 here;
            # aligned with the ICIR_halflife branch below)
            lam = pow(1 / 2, 1 / halflife)
        else:
            lam = 1
        len_IC = IC_use.shape[0]
        w = np.array([pow(lam, len_IC - 1 - i) for i in range(len_IC)])
        w = w / sum(w)
        weight = IC_use.mul(pd.Series(data=w, index=IC_use.index), axis=0).sum(axis=0).values
    elif weight_way == 'ICIR_halflife':
        if halflife > 0:
            lam = pow(1 / 2, 1 / halflife)
        else:
            lam = 1
        len_IC = IC_use.shape[0]
        w = np.array([pow(lam, len_IC - 1 - i) for i in range(len_IC)])
        w = w / sum(w)
        ic_mean = IC_use.mul(pd.Series(data=w, index=IC_use.index), axis=0).sum(axis=0)
        ic_std = np.sqrt((np.power(IC_use - ic_mean, 2)).mul(pd.Series(data=w, index=IC_use.index), axis=0).sum(axis=0))
        weight = ic_mean.values / ic_std.values
    elif weight_way == 'equal':
        weight = np.sign(IC_mean)
    w = np.array([np.nan] * len(self.factor_list))
    flag = 0
    for i in range(len(self.factor_list)):
        if i not in loc:
            w[i] = weight[flag]
            flag += 1
        else:
            w[i] = 0.0  # too few valid IC values: factor weight is set to 0
    weight = pd.Series(w, index=self.factor_list)
    return weight
def test_ledoit_wolf():
    """Tests LedoitWolf module on a simple dataset.

    """
    # test shrinkage coeff on a simple data set
    X_centered = X - X.mean(axis=0)
    lw = LedoitWolf(assume_centered=True)
    lw.fit(X_centered)
    shrinkage_ = lw.shrinkage_
    score_ = lw.score(X_centered)
    assert_almost_equal(ledoit_wolf_shrinkage(X_centered,
                                              assume_centered=True),
                        shrinkage_)
    assert_almost_equal(ledoit_wolf_shrinkage(X_centered, assume_centered=True,
                                              block_size=6), shrinkage_)
    # compare shrunk covariance obtained from data and from MLE estimate
    lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_centered,
                                                         assume_centered=True)
    assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4)
    assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_)
    # compare estimates given by LW and ShrunkCovariance
    scov = ShrunkCovariance(shrinkage=lw.shrinkage_, assume_centered=True)
    scov.fit(X_centered)
    assert_array_almost_equal(scov.covariance_, lw.covariance_, 4)

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    lw = LedoitWolf(assume_centered=True)
    lw.fit(X_1d)
    lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_1d,
                                                         assume_centered=True)
    assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4)
    assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_)
    assert_array_almost_equal((X_1d ** 2).sum() / n_samples, lw.covariance_, 4)

    # test shrinkage coeff on a simple data set (without saving precision)
    lw = LedoitWolf(store_precision=False, assume_centered=True)
    lw.fit(X_centered)
    assert_almost_equal(lw.score(X_centered), score_, 4)
    assert(lw.precision_ is None)

    # (too) large data set
    X_large = np.ones((20, 200))
    assert_raises(MemoryError, ledoit_wolf, X_large, block_size=100)

    # Same tests without assuming centered data
    # test shrinkage coeff on a simple data set
    lw = LedoitWolf()
    lw.fit(X)
    assert_almost_equal(lw.shrinkage_, shrinkage_, 4)
    assert_almost_equal(lw.shrinkage_, ledoit_wolf_shrinkage(X))
    assert_almost_equal(lw.shrinkage_, ledoit_wolf(X)[1])
    assert_almost_equal(lw.score(X), score_, 4)
    # compare shrunk covariance obtained from data and from MLE estimate
    lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X)
    assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4)
    assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_)
    # compare estimates given by LW and ShrunkCovariance
    scov = ShrunkCovariance(shrinkage=lw.shrinkage_)
    scov.fit(X)
    assert_array_almost_equal(scov.covariance_, lw.covariance_, 4)

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    lw = LedoitWolf()
    lw.fit(X_1d)
    lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_1d)
    assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4)
    assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_)
    assert_array_almost_equal(empirical_covariance(X_1d), lw.covariance_, 4)

    # test with one sample
    X_1sample = np.arange(5)
    lw = LedoitWolf()
    with warnings.catch_warnings(record=True):
        lw.fit(X_1sample)

    # test shrinkage coeff on a simple data set (without saving precision)
    lw = LedoitWolf(store_precision=False)
    lw.fit(X)
    assert_almost_equal(lw.score(X), score_, 4)
    assert(lw.precision_ is None)
def threshold_from_simulations(self, X, precision=2000, verbose=False,
                               n_jobs=-1):
    """Estimate the inlier and outlier decision-function distributions by
    Monte-Carlo simulation under a Gaussian model fitted to the support set.
    """
    import multiprocessing as mp
    if n_jobs < 1:
        n_jobs = mp.cpu_count()
    n_samples, n_features = X.shape
    n = n_samples
    p = n_features
    h = self.support_.sum()
    lw = LedoitWolf()
    ref_covariance = lw.fit(X[self.support_]).covariance_
    c = sp.stats.chi2(p + 2).cdf(
        sp.stats.chi2(p).ppf(float(h) / n)) / (float(h) / n)
    sigma_root = np.linalg.cholesky(ref_covariance / c)
    all_h = []
    # inliers distribution
    dist_in = np.array([], ndmin=1)
    max_i = max(1, int(precision / float(self.support_.sum())))
    for i in range(max_i):
        if verbose and max_i > 4 and (i % (max_i // 4) == 0):
            print("\t", 50 * i / float(max_i), "%")
        X1, _ = dg.generate_gaussian(
            n_samples, n_features, np.zeros(n_features),
            cov_root=sigma_root)
        # learn location and shape
        clf = EllipticEnvelopeRMCDl1(
            correction=self.correction, shrinkage=self.shrinkage,
            h=self.support_.sum() / float(n_samples), no_fit=True).fit(X1)
        X2 = X1 - clf.location_
        dist_in = np.concatenate(
            (dist_in, clf.decision_function(X2[clf.support_],
                                            raw_values=True)))
        all_h.append(clf.h)
    # outliers distribution
    dist_out = np.array([], ndmin=1)
    max_i = max(1, int(precision / float(n_samples - self.support_.sum())))
    for i in range(max_i):
        if verbose and max_i > 4 and (i % (max_i // 4) == 0):
            print("\t", 50 * (1. + i / float(max_i)), "%")
        X1, _ = dg.generate_gaussian(
            n_samples, n_features, np.zeros(n_features),
            cov_root=sigma_root)
        # learn location and shape
        clf = EllipticEnvelopeRMCDl1(
            correction=self.correction, shrinkage=self.shrinkage,
            h=self.support_.sum() / float(n_samples), no_fit=True).fit(X1)
        X2 = X1 - clf.location_
        dist_out = np.concatenate(
            (dist_out, clf.decision_function(X2[~clf.support_],
                                             raw_values=True)))
        all_h.append(clf.h)
    self.dist_in = np.sort(dist_in)
    self.dist_out = np.sort(dist_out)
    self.h_mean = np.mean(all_h)
    return self.dist_out
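`dg.generate_gaussian` is an external helper not shown here. A minimal numpy stand-in, assuming it draws i.i.d. standard-normal rows, colors them with the supplied covariance root, and returns a second value that the caller discards:

import numpy as np

def generate_gaussian(n_samples, n_features, mean, cov_root=None, seed=None):
    """Hypothetical stand-in for dg.generate_gaussian: draw samples from
    N(mean, cov_root @ cov_root.T)."""
    rng = np.random.default_rng(seed)
    Z = rng.standard_normal((n_samples, n_features))
    if cov_root is not None:
        Z = Z.dot(cov_root.T)  # color white noise with the covariance root
    # the second return value mirrors the `X1, _ = ...` unpacking above
    return Z + mean, None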
class DCS_kd(BaseEstimator):
    def __init__(self, k=2, gamma=1.0, covariance_estimator='ledoit-wolf'):
        # store k as an int: numpy's reshape rejects float dimensions
        self.k = int(k)
        self.gamma = gamma
        self.covariance_estimator = covariance_estimator
        if covariance_estimator == 'empirical':
            self.cov = EmpiricalCovariance(store_precision=False)
        elif covariance_estimator == 'ledoit-wolf':
            self.cov = LedoitWolf(store_precision=False)
        else:
            raise NotImplementedError('%s is not implemented' %
                                      covariance_estimator)
        self.x0 = None
        self.x1 = None

    def fit(self, x, y):
        self.x0 = x[y == min(y)]
        self.x1 = x[y == max(y)]

    def __str__(self):
        return 'Analytical Cauchy-Schwarz Divergence in {}-d'.format(self.k)

    def value(self, v):
        # We need a matrix, not a vector
        v = v.reshape(-1, self.k)
        ipx0 = self._ipx(self.x0, self.x0, v)
        ipx1 = self._ipx(self.x1, self.x1, v)
        ipx2 = self._ipx(self.x0, self.x1, v)
        return np.log(ipx0) + np.log(ipx1) - 2 * np.log(ipx2)

    def derivative(self, v):
        # We need a matrix, not a vector
        v = v.reshape(-1, self.k)
        ret = (self._d_ipx(self.x0, self.x0, v) /
               self._ipx(self.x0, self.x0, v) +
               self._d_ipx(self.x1, self.x1, v) /
               self._ipx(self.x1, self.x1, v) -
               2 * self._d_ipx(self.x0, self.x1, v) /
               self._ipx(self.x0, self.x1, v))
        return ret.reshape(-1)

    def _H(self, X0, X1):
        # bandwidth matrix built from the fitted per-class covariances
        n = (4.0 / (self.k + 2)) ** (2.0 / (self.k + 4))
        p = (-2.0 / (self.k + 4))
        return n * (X0.shape[0] ** p * self.cov.fit(X0).covariance_ +
                    X1.shape[0] ** p * self.cov.fit(X1).covariance_)

    def _f1(self, X0, X1, v):
        Hxy = self.gamma * self.gamma * self._H(X0, X1)
        vHv = v.T.dot(Hxy).dot(v)
        return 1.0 / (X0.shape[0] * X1.shape[0] * np.sqrt(la.det(vHv)) *
                      (2 * np.pi) ** (self.k / 2))

    def _g1(self, X0, X1, v):
        Hxy = self.gamma * self.gamma * self._H(X0, X1)
        vHv = v.T.dot(Hxy).dot(v)
        return -self._f1(X0, X1, v) * Hxy.dot(v).dot(la.inv(vHv))

    def _f2(self, X0, X1, v):
        Hxy = self.gamma * self.gamma * self._H(X0, X1)
        vHv = v.T.dot(Hxy).dot(v)
        vHv_inv = la.inv(vHv)
        vx0 = X0.dot(v)
        vx1 = X1.dot(v)
        vx0c = vx0.dot(vHv_inv)
        vx1c = vx1.dot(vHv_inv)
        ret = 0.0
        for i in range(X0.shape[0]):
            ret += np.exp(-0.5 * ((vx0c[i] - vx1c) *
                                  (vx0[i] - vx1)).sum(axis=1)).sum()
        return ret

    def _g2(self, X0, X1, v):
        Hxy = self.gamma * self.gamma * self._H(X0, X1)
        vHv = v.T.dot(Hxy).dot(v)
        vHv_inv = la.inv(vHv)  # k x k
        vx0 = X0.dot(v)
        vx1 = X1.dot(v)
        vx0c = vx0.dot(vHv_inv)
        vx1c = vx1.dot(vHv_inv)
        eye = np.eye(v.shape[0])
        right_expr = (eye - Hxy.dot(v).dot(vHv_inv).dot(v.T))  # d x d
        d = v.shape[0]
        k = self.k
        ret = 0.0
        for i in range(X0.shape[0]):
            f2_vals = np.exp(-0.5 * ((vx0c[i] - vx1c) *
                                     (vx0[i] - vx1)).sum(axis=1)).reshape(-1, 1)
            ws = (X0[i] - X1).reshape(X1.shape[0], d, 1)
            vxdiffs = (-f2_vals * (vx0[i] - vx1)).reshape(X1.shape[0], 1, k)
            ret += np.tensordot(ws, vxdiffs, ([0, 2], [0, 1]))
        return right_expr.dot(ret).dot(vHv_inv)

    def _ipx(self, X0, X1, v):
        return self._f1(X0, X1, v) * self._f2(X0, X1, v)

    def _d_ipx(self, X0, X1, v):
        return (self._f1(X0, X1, v) * self._g2(X0, X1, v) +
                self._f2(X0, X1, v) * self._g1(X0, X1, v))
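A quick smoke test of the class above on synthetic two-class data. The `la` alias and the imports shown are assumptions about the snippet's surrounding module:

import numpy as np
import numpy.linalg as la  # assumed alias used inside DCS_kd
from sklearn.base import BaseEstimator
from sklearn.covariance import EmpiricalCovariance, LedoitWolf

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 6))
y = np.repeat([0, 1], 100)
X[y == 1, :2] += 1.0             # shift class 1 along two coordinates

dcs = DCS_kd(k=2)                # score 6-d -> 2-d projections
dcs.fit(X, y)
v0 = rng.standard_normal(6 * 2)  # flattened 6x2 projection matrix
print(dcs.value(v0))             # CS divergence for this projection
print(dcs.derivative(v0).shape)  # flattened gradient, shape (12,)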
X_unlab_patches = []
random.seed(42)
print("Gathering examples...")
# Use subsample of 200K for k-means and covariance estimates
for i in random.sample(range(0, unlab_X.shape[2]), 200000):
    patches = view_as_windows(unlab_X[:, :, i], (w, w), step=s)
    # flatten to (number of patches, patch size); note shape[0] * shape[1]
    # is the patch count (the two factors are equal only for square images)
    re_shaped = numpy.reshape(patches,
                              (patches.shape[0] * patches.shape[1], w * w))
    # normalize the patches, per sample
    re_shaped = preprocessing.scale(re_shaped, axis=1)
    X_unlab_patches.append(re_shaped)
X_unlab_patches = numpy.vstack(X_unlab_patches)

# build whitening transform matrix
print("Fitting ZCA Whitening Transform...")
cov = LedoitWolf()
cov.fit(X_unlab_patches)  # fit covariance estimate
D, U = numpy.linalg.eigh(cov.covariance_)
V = numpy.sqrt(numpy.linalg.inv(numpy.diag(D + zca_eps)))
Wh = numpy.dot(numpy.dot(U, V), U.T)
mu = numpy.mean(X_unlab_patches, axis=0)
X_unlab_patches = numpy.dot(X_unlab_patches - mu, Wh)

# run k-means on unlabelled data
print("Starting k-means...")
clustr = sklearn.cluster.MiniBatchKMeans(n_clusters=n_clust,
                                         compute_labels=False,
                                         batch_size=300)
k_means = clustr.fit(X_unlab_patches)


def f_unsup(img):
    img_ptchs = view_as_windows(img, (w, w), step=s)
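`f_unsup` is cut off above. Given the pipeline already built (per-patch scaling, ZCA whitening with `mu` and `Wh`, then k-means), a plausible completion, offered as an assumption rather than the original code:

def f_unsup(img):
    # extract and flatten patches from one image, mirroring the
    # unlabelled pipeline above
    img_ptchs = view_as_windows(img, (w, w), step=s)
    img_ptchs = numpy.reshape(
        img_ptchs, (img_ptchs.shape[0] * img_ptchs.shape[1], w * w))
    img_ptchs = preprocessing.scale(img_ptchs, axis=1)  # per-patch scaling
    img_ptchs = numpy.dot(img_ptchs - mu, Wh)           # apply ZCA whitening
    # distances to the k-means centroids serve as unsupervised features
    return k_means.transform(img_ptchs)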
real_cov = toeplitz(r ** np.arange(n_features))
coloring_matrix = cholesky(real_cov)

n_samples_range = np.arange(6, 31, 1)
repeat = 100
lw_mse = np.zeros((n_samples_range.size, repeat))
oa_mse = np.zeros((n_samples_range.size, repeat))
lw_shrinkage = np.zeros((n_samples_range.size, repeat))
oa_shrinkage = np.zeros((n_samples_range.size, repeat))
for i, n_samples in enumerate(n_samples_range):
    for j in range(repeat):
        X = np.dot(np.random.normal(size=(n_samples, n_features)),
                   coloring_matrix.T)

        lw = LedoitWolf(store_precision=False, assume_centered=True)
        lw.fit(X)
        lw_mse[i, j] = lw.error_norm(real_cov, scaling=False)
        lw_shrinkage[i, j] = lw.shrinkage_

        oa = OAS(store_precision=False, assume_centered=True)
        oa.fit(X)
        oa_mse[i, j] = oa.error_norm(real_cov, scaling=False)
        oa_shrinkage[i, j] = oa.shrinkage_

# plot MSE
plt.subplot(2, 1, 1)
plt.errorbar(n_samples_range, lw_mse.mean(1), yerr=lw_mse.std(1),
             label="Ledoit-Wolf")
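The snippet breaks off inside the first `plt.errorbar` call. The scikit-learn example it comes from continues along these lines (a reconstruction, not the verbatim source):

plt.errorbar(n_samples_range, oa_mse.mean(1), yerr=oa_mse.std(1),
             label="OAS")
plt.ylabel("Squared error")
plt.legend(loc="upper right")
plt.title("Comparison of covariance estimators")

# plot shrinkage coefficients
plt.subplot(2, 1, 2)
plt.errorbar(n_samples_range, lw_shrinkage.mean(1), yerr=lw_shrinkage.std(1),
             label="Ledoit-Wolf")
plt.errorbar(n_samples_range, oa_shrinkage.mean(1), yerr=oa_shrinkage.std(1),
             label="OAS")
plt.xlabel("n_samples")
plt.ylabel("Shrinkage")
plt.legend(loc="lower right")
plt.show()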
def test_connectivity_measure_outputs():
    n_subjects = 10
    n_features = 49

    # Generate signals and compute covariances
    emp_covs = []
    ledoit_covs = []
    signals = []
    ledoit_estimator = LedoitWolf()
    for k in range(n_subjects):
        n_samples = 200 + k
        signal, _, _ = generate_signals(n_features=n_features, n_confounds=5,
                                        length=n_samples, same_variance=False)
        signals.append(signal)
        signal -= signal.mean(axis=0)
        emp_covs.append((signal.T).dot(signal) / n_samples)
        ledoit_covs.append(ledoit_estimator.fit(signal).covariance_)

    kinds = ["covariance", "correlation", "tangent", "precision",
             "partial correlation"]

    # Check outputs properties
    for cov_estimator, covs in zip([EmpiricalCovariance(), LedoitWolf()],
                                   [emp_covs, ledoit_covs]):
        input_covs = copy.copy(covs)
        for kind in kinds:
            conn_measure = ConnectivityMeasure(kind=kind,
                                               cov_estimator=cov_estimator)
            connectivities = conn_measure.fit_transform(signals)

            # Generic
            assert_true(isinstance(connectivities, np.ndarray))
            assert_equal(len(connectivities), len(covs))

            for k, cov_new in enumerate(connectivities):
                assert_array_equal(input_covs[k], covs[k])
                assert(is_spd(covs[k], decimal=7))

                # Positive definiteness if expected, and output value checks
                if kind == "tangent":
                    assert_array_almost_equal(cov_new, cov_new.T)
                    gmean_sqrt = _map_eigenvalues(np.sqrt,
                                                  conn_measure.mean_)
                    assert(is_spd(gmean_sqrt, decimal=7))
                    assert(is_spd(conn_measure.whitening_, decimal=7))
                    assert_array_almost_equal(conn_measure.whitening_.dot(
                        gmean_sqrt), np.eye(n_features))
                    assert_array_almost_equal(gmean_sqrt.dot(
                        _map_eigenvalues(np.exp, cov_new)).dot(gmean_sqrt),
                        covs[k])
                elif kind == "precision":
                    assert(is_spd(cov_new, decimal=7))
                    assert_array_almost_equal(cov_new.dot(covs[k]),
                                              np.eye(n_features))
                elif kind == "correlation":
                    assert(is_spd(cov_new, decimal=7))
                    d = np.sqrt(np.diag(np.diag(covs[k])))
                    # Note: estimator instances do not define __eq__, so a
                    # `cov_estimator == EmpiricalCovariance()` comparison is
                    # always False; compare the type instead.
                    if type(cov_estimator) is EmpiricalCovariance:
                        assert_array_almost_equal(d.dot(cov_new).dot(d),
                                                  covs[k])
                    assert_array_almost_equal(np.diag(cov_new),
                                              np.ones((n_features)))
                elif kind == "partial correlation":
                    prec = linalg.inv(covs[k])
                    d = np.sqrt(np.diag(np.diag(prec)))
                    assert_array_almost_equal(d.dot(cov_new).dot(d), -prec +
                                              2 * np.diag(np.diag(prec)))

    # Check the mean_
    for kind in kinds:
        conn_measure = ConnectivityMeasure(kind=kind)
        conn_measure.fit_transform(signals)
        assert_equal((conn_measure.mean_).shape, (n_features, n_features))
        if kind != 'tangent':
            assert_array_almost_equal(
                conn_measure.mean_,
                np.mean(conn_measure.transform(signals), axis=0))

    # Check that the mean isn't modified in transform
    conn_measure = ConnectivityMeasure(kind='covariance')
    conn_measure.fit(signals[:1])
    mean = conn_measure.mean_
    conn_measure.transform(signals[1:])
    assert_array_equal(mean, conn_measure.mean_)

    # Check vectorization option
    for kind in kinds:
        conn_measure = ConnectivityMeasure(kind=kind)
        connectivities = conn_measure.fit_transform(signals)
        conn_measure = ConnectivityMeasure(vectorize=True, kind=kind)
        vectorized_connectivities = conn_measure.fit_transform(signals)
        assert_array_almost_equal(vectorized_connectivities,
                                  sym_matrix_to_vec(connectivities))

    # Check not fitted error
    assert_raises_regex(
        ValueError, 'has not been fitted. ',
        ConnectivityMeasure().inverse_transform,
        vectorized_connectivities)

    # Check inverse transformation
    kinds.remove('tangent')
    for kind in kinds:
        # without vectorization: input matrices are returned with no change
        conn_measure = ConnectivityMeasure(kind=kind)
        connectivities = conn_measure.fit_transform(signals)
        assert_array_almost_equal(
            conn_measure.inverse_transform(connectivities), connectivities)

        # with vectorization: input vectors are reshaped into matrices
        # if diagonal has not been discarded
        conn_measure = ConnectivityMeasure(kind=kind, vectorize=True)
        vectorized_connectivities = conn_measure.fit_transform(signals)
        assert_array_almost_equal(
            conn_measure.inverse_transform(vectorized_connectivities),
            connectivities)

    # with vectorization if diagonal has been discarded
    for kind in ['correlation', 'partial correlation']:
        connectivities = ConnectivityMeasure(kind=kind).fit_transform(signals)
        conn_measure = ConnectivityMeasure(kind=kind, vectorize=True,
                                           discard_diagonal=True)
        vectorized_connectivities = conn_measure.fit_transform(signals)
        assert_array_almost_equal(
            conn_measure.inverse_transform(vectorized_connectivities),
            connectivities)

    for kind in ['covariance', 'precision']:
        connectivities = ConnectivityMeasure(kind=kind).fit_transform(signals)
        conn_measure = ConnectivityMeasure(kind=kind, vectorize=True,
                                           discard_diagonal=True)
        vectorized_connectivities = conn_measure.fit_transform(signals)
        diagonal = np.array([np.diagonal(conn) / sqrt(2) for conn in
                             connectivities])
        inverse_transformed = conn_measure.inverse_transform(
            vectorized_connectivities, diagonal=diagonal)
        assert_array_almost_equal(inverse_transformed, connectivities)
        assert_raises_regex(ValueError,
                            'can not reconstruct connectivity matrices',
                            conn_measure.inverse_transform,
                            vectorized_connectivities)

    # for 'tangent' kind, covariance matrices are reconstructed
    # without vectorization
    tangent_measure = ConnectivityMeasure(kind='tangent')
    displacements = tangent_measure.fit_transform(signals)
    covariances = ConnectivityMeasure(kind='covariance').fit_transform(
        signals)
    assert_array_almost_equal(
        tangent_measure.inverse_transform(displacements), covariances)

    # with vectorization
    # when diagonal has not been discarded
    tangent_measure = ConnectivityMeasure(kind='tangent', vectorize=True)
    vectorized_displacements = tangent_measure.fit_transform(signals)
    assert_array_almost_equal(
        tangent_measure.inverse_transform(vectorized_displacements),
        covariances)

    # when diagonal has been discarded
    tangent_measure = ConnectivityMeasure(kind='tangent', vectorize=True,
                                          discard_diagonal=True)
    vectorized_displacements = tangent_measure.fit_transform(signals)
    diagonal = np.array([np.diagonal(matrix) / sqrt(2) for matrix in
                         displacements])
    inverse_transformed = tangent_measure.inverse_transform(
        vectorized_displacements, diagonal=diagonal)
    assert_array_almost_equal(inverse_transformed, covariances)
    assert_raises_regex(ValueError,
                        'can not reconstruct connectivity matrices',
                        tangent_measure.inverse_transform,
                        vectorized_displacements)
time_series = masker.fit_transform(func_filename, confounds=[confound_filename]) ########################################################################## # Display time series import matplotlib.pyplot as plt for time_serie, label in zip(time_series.T, labels): plt.plot(time_serie, label=label) plt.title('Default Mode Network Time Series') plt.xlabel('Scan number') plt.ylabel('Normalized signal') plt.legend() plt.tight_layout() ########################################################################## # Compute precision matrices from sklearn.covariance import LedoitWolf cve = LedoitWolf() cve.fit(time_series) ########################################################################## # Display connectome from nilearn import plotting plotting.plot_connectome(cve.precision_, dmn_coords, title="Default Mode Network Connectivity") plotting.show()
# Perform Factor analysis
fa = FactorAnalysis(n_components=64, random_state=1000)
fah = FactorAnalysis(n_components=64, random_state=1000)
Xfa = fa.fit_transform(X)
Xfah = fah.fit_transform(Xh)

print('Factor analysis score X: {}'.format(fa.score(X)))
print('Factor analysis score Xh: {}'.format(fah.score(Xh)))

# Perform Ledoit-Wolf shrinkage
ldw = LedoitWolf()
ldwh = LedoitWolf()
ldw.fit(X)
ldwh.fit(Xh)

print('Ledoit-Wolf score X: {}'.format(ldw.score(X)))
print('Ledoit-Wolf score Xh: {}'.format(ldwh.score(Xh)))

# Show the components
fig, ax = plt.subplots(8, 8, figsize=(10, 10))

for i in range(8):
    for j in range(8):
        ax[i, j].imshow(fah.components_[(i * 8) + j].reshape((28, 28)),
                        cmap='gray')
        ax[i, j].axis('off')

plt.show()
high_pass=0.01, t_r=2.5, memory='nilearn_cache', memory_level=1, verbose=2) func_filename = adhd_dataset.func[0] confound_filename = adhd_dataset.confounds[0] time_series = masker.fit_transform(func_filename, confounds=[confound_filename]) # Computing precision matrices ################################################ from sklearn.covariance import LedoitWolf cve = LedoitWolf() cve.fit(time_series) # Displaying results ########################################################## import matplotlib.pyplot as plt from nilearn import plotting # Display time series for time_serie, label in zip(time_series.T, labels): plt.plot(time_serie, label=label) plt.title('Default Mode Network Time Series') plt.xlabel('Scan number') plt.ylabel('Normalized signal') plt.legend() plt.tight_layout()
X = df_2.values[1:, :]
window_size = 300
slide_size = 30
no_samples = X.shape[0]
p = X.shape[1]
no_runs = math.floor((no_samples - window_size) / (slide_size))
print("We're running %s times" % no_runs)
X_new = X[0:window_size, :]
lw = LedoitWolf()
lw.fit(X_new)
prec = precision_matrix_to_partial_corr(lw.precision_)
shrinkage = lw.shrinkage_
np.fill_diagonal(prec, 0)
corr = covariance_matrix_to_corr(lw.covariance_)
np.fill_diagonal(corr, 0)
# from_numpy_matrix was removed in networkx 3.0;
# from_numpy_array is the current equivalent
G = nx.from_numpy_array(corr)
G = nx.relabel_nodes(G, dict(zip(G.nodes(), company_names)))
node_attributes = dict(
    zip(company_names[list(range(len(company_sectors)))], company_sectors))
nx.set_node_attributes(G, node_attributes, 'sector')
G.graph['l'] = shrinkage
nx.write_graphml(G, "network_over_time_%s.graphml" % 0)
print("%s non-zero values" % np.count_nonzero(prec))
np.save("prec_0", lw.precision_)
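`covariance_matrix_to_corr` and `precision_matrix_to_partial_corr` are not defined in this snippet. Assuming they implement the standard formulas rho_ij = Sigma_ij / sqrt(Sigma_ii * Sigma_jj) and p_ij = -Theta_ij / sqrt(Theta_ii * Theta_jj), stand-ins could look like:

import numpy as np


def covariance_matrix_to_corr(cov):
    """Rescale a covariance matrix to a correlation matrix."""
    d = np.sqrt(np.diag(cov))
    return cov / np.outer(d, d)


def precision_matrix_to_partial_corr(prec):
    """Convert a precision matrix to a partial-correlation matrix."""
    d = np.sqrt(np.diag(prec))
    partial_corr = -prec / np.outer(d, d)
    # unit diagonal by convention; the caller zeroes it out afterwards
    np.fill_diagonal(partial_corr, 1.0)
    return partial_corr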
    import sklearn
except ImportError:
    has_sklearn = False
    print('sklearn not available')


def cov2corr(cov):
    std_ = np.sqrt(np.diag(cov))
    corr = cov / np.outer(std_, std_)
    return corr

if has_sklearn:
    from sklearn.covariance import LedoitWolf, OAS, MinCovDet

    # Note: in current scikit-learn, assume_centered is a constructor
    # argument rather than a fit() argument, and the MCD estimator is
    # exposed as MinCovDet.
    lw = LedoitWolf(store_precision=False, assume_centered=False)
    lw.fit(rr)
    cov_lw = lw.covariance_
    corr_lw = cov2corr(cov_lw)

    oas = OAS(store_precision=False, assume_centered=False)
    oas.fit(rr)
    cov_oas = oas.covariance_
    corr_oas = cov2corr(cov_oas)

    mcd = MinCovDet()
    mcd.fit(rr)
    cov_mcd = mcd.covariance_
    corr_mcd = cov2corr(cov_mcd)

    titles = ['raw correlation', 'lw', 'oas', 'mcd']
    normcolor = None
# settings real_cov = np.dot(coloring_matrix.T, coloring_matrix) emp_cov = empirical_covariance(X_train) loglik_real = -log_likelihood(emp_cov, linalg.inv(real_cov)) # ############################################################################# # Compare different approaches to setting the parameter # GridSearch for an optimal shrinkage coefficient tuned_parameters = [{'shrinkage': shrinkages}] cv = GridSearchCV(ShrunkCovariance(), tuned_parameters, cv=5) cv.fit(X_train) # Ledoit-Wolf optimal shrinkage coefficient estimate lw = LedoitWolf() loglik_lw = lw.fit(X_train).score(X_test) # OAS coefficient estimate oa = OAS() loglik_oa = oa.fit(X_train).score(X_test) # ############################################################################# # Plot results fig = plt.figure() plt.title("Regularized covariance: likelihood and shrinkage coefficient") plt.xlabel('Regularization parameter: shrinkage coefficient') plt.ylabel('Error: negative log-likelihood on test data') # range shrinkage curve plt.loglog(shrinkages, negative_logliks, label="Negative log-likelihood") plt.plot(plt.xlim(), 2 * [loglik_real], '--r',
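`shrinkages` and `negative_logliks` are referenced above but defined earlier in the original example; presumably they are built by scoring `ShrunkCovariance` over a grid, along these lines (the grid bounds are an assumption; `X_train` and `X_test` are the snippet's own splits):

import numpy as np
from sklearn.covariance import ShrunkCovariance

# grid of shrinkage coefficients and their test-set negative log-likelihoods
shrinkages = np.logspace(-2, 0, 30)
negative_logliks = [
    -ShrunkCovariance(shrinkage=s).fit(X_train).score(X_test)
    for s in shrinkages
]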