def fit(self, data): """Fit VAR model to data. Parameters ---------- data : array, shape (trials, channels, samples) or (channels, samples) Epoched or continuous data set. Returns ------- self : :class:`VAR` The :class:`VAR` object to facilitate method chaining (see usage example). """ data = atleast_3d(data) if self.delta == 0 or self.delta is None: # ordinary least squares x, y = self._construct_eqns(data) else: # regularized least squares (ridge regression) x, y = self._construct_eqns_rls(data) b, res, rank, s = sp.linalg.lstsq(x, y) self.coef = b.transpose() self.residuals = data - self.predict(data) self.rescov = sp.cov(cat_trials(self.residuals[:, :, self.p:])) return self
def pca(data, dim):
    """ Return the first dim principal components as columns of a matrix.

    Every row of the matrix resembles a point in the data space. """
    assert dim <= data.shape[1], \
        "dim must be less than or equal to the original dimension"
    # We have to make a copy of the original data and subtract the mean
    # of every entry
    data = makeCentered(data)
    cm = cov(data.T)
    # OPT only calculate the dim first eigenvectors here
    # The following calculation may seem a bit "weird" but also correct to me.
    # The eigenvectors with the dim highest eigenvalues have to be selected.
    # We keep track of the indexes via enumerate to restore the right ordering
    # later.
    eigval, eigvec = eig(cm)
    eigval = [(val, ind) for ind, val in enumerate(eigval)]
    eigval.sort()
    eigval[:-dim] = []  # remove all but the highest dim elements
    # now we have to bring them back in the right order
    eig_indexes = [(ind, val) for val, ind in eigval]
    eig_indexes.sort(reverse=True)
    eig_indexes = [ind for ind, val in eig_indexes]
    return eigvec.take(eig_indexes, 1).T
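# The same "pick the dim largest eigenvalues" selection, as a self-contained
# sketch in plain numpy (np.linalg.eigh returns eigenvalues in ascending
# order, so the ordering bookkeeping above collapses to an argsort):
import numpy as np

rng = np.random.default_rng(1)
data = rng.standard_normal((500, 3)) @ rng.standard_normal((3, 3))
data = data - data.mean(axis=0)                   # center, as makeCentered does
eigval, eigvec = np.linalg.eigh(np.cov(data.T))
order = np.argsort(eigval)[::-1][:2]              # indexes of the 2 largest eigenvalues
components = eigvec.take(order, 1).T              # rows are principal components
print(components @ components.T)                  # ~ identity: orthonormal rows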
def _initParams_fast(self):
    """
    initialize the gp parameters
        1) project Y on the known factor X0 -> Y0
            average variance of Y0 is used to initialize the variance explained by X0
        2) considers the residual Y1 = Y-Y0 (this is equivalent to regressing out X0)
        3) perform PCA on cov(Y1) and considers the first k PC for initializing X
        4) the variance of all other PCs is used to initialize the noise
        5) the variance explained by interaction is set to a small random number
    """
    Xd = LA.pinv(self.X0)
    Y0 = self.X0.dot(Xd.dot(self.Y))
    Y1 = self.Y - Y0
    YY = SP.cov(Y1)
    S, U = LA.eigh(YY)
    X = U[:, -self.k:] * SP.sqrt(S[-self.k:])
    a = SP.array([SP.sqrt(Y0.var(0).mean())])
    b = 1e-3 * SP.randn(1)
    c = SP.array([SP.sqrt((YY - SP.dot(X, X.T)).diagonal().mean())])
    # gp hyper params
    params = limix.CGPHyperParams()
    if self.interaction:
        params['covar'] = SP.concatenate([a, X.reshape(self.N * self.k, order='F'), SP.ones(1), b])
    else:
        params['covar'] = SP.concatenate([a, X.reshape(self.N * self.k, order='F')])
    params['lik'] = c
    return params
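# A plain-numpy sketch of steps 1-4 of the initialization described in the
# docstring (shapes are illustrative; X0 plays the role of the known factors):
import numpy as np

rng = np.random.default_rng(2)
N, P, k = 10, 50, 3                            # samples, traits, latent factors
X0 = rng.standard_normal((N, 2))               # known factors
Y = rng.standard_normal((N, P))
Y0 = X0 @ np.linalg.pinv(X0) @ Y               # 1) projection of Y onto X0
Y1 = Y - Y0                                    # 2) residual (X0 regressed out)
YY = np.cov(Y1)                                # 3) N x N covariance of the residual
S, U = np.linalg.eigh(YY)                      #    eigenvalues in ascending order
X = U[:, -k:] * np.sqrt(S[-k:])                #    top-k PCs initialize X
a = np.sqrt(Y0.var(0).mean())                  #    variance explained by X0
c = np.sqrt((YY - X @ X.T).diagonal().mean())  # 4) leftover variance -> noise
print(a, c)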
def __init__(self, Y=None, Xr=None, F=None, Rr=None, factr=1e7, debug=False):
    """
    Args:
        Y: [N, P] phenotype matrix
        Xr: [N, S] genotype data of the set component
        F: [N, K] fixed effect design matrix
        Rr: [N, N] covariance of the set component (computed as
            dot(Xr, Xr.T) if not provided and Xr is not low rank)
        factr: parameter that determines the accuracy of the solution
            (see scipy.optimize.fmin_l_bfgs_b for more details)
    """
    # avoid SVD failure by adding some jitter
    Xr += 2e-6 * (sp.rand(*Xr.shape) - 0.5)
    # make sure it is normalised
    Xr -= Xr.mean(0)
    Xr /= Xr.std(0)
    Xr /= sp.sqrt(Xr.shape[1])
    self.Y = Y
    self.F = F
    self.Xr = Xr
    self.covY = sp.cov(Y.T)
    self.factr = factr
    self.debug = debug
    self.gp = {}
    self.info = {}
    self.lowrank = Xr.shape[1] < Xr.shape[0]
    if Rr is not None:
        self.Rr = Rr
    else:
        if self.lowrank:
            self.Rr = None
        else:
            self.Rr = sp.dot(Xr, Xr.T)
def fit(self, data):
    """ Fit VAR model to data.

    Parameters
    ----------
    data : array-like, shape = [n_samples, n_channels, n_trials] or
           [n_samples, n_channels]
        Continuous or segmented data set.

    Returns
    -------
    self : :class:`VAR`
        The :class:`VAR` object to facilitate method chaining (see usage
        example)
    """
    data = sp.atleast_3d(data)

    if self.delta == 0 or self.delta is None:
        # ordinary least squares
        (x, y) = self._construct_eqns(data)
    else:
        # regularized least squares (ridge regression)
        (x, y) = self._construct_eqns_rls(data)

    (b, res, rank, s) = sp.linalg.lstsq(x, y)

    self.coef = b.transpose()

    self.residuals = data - self.predict(data)
    self.rescov = sp.cov(cat_trials(self.residuals), rowvar=False)

    return self
def learn_gmm(self, x, y, tau=None):
    '''
    Function that learns the GMM from training samples
    It is possible to add a regularizer term Sigma = Sigma + tau*I

    Input:
        x : the training samples
        y : the labels
        tau : the value of the regularizer, if tau = None (default)
            no regularization
    Output:
        the mean, covariance and proportion of each class
    '''
    ## Get information from the data
    C = int(y.max(0))   # Number of classes
    n = x.shape[0]      # Number of samples
    d = x.shape[1]      # Number of variables

    ## Initialization
    self.ni = sp.empty((C, 1))      # Vector of number of samples for each class
    self.prop = sp.empty((C, 1))    # Vector of proportion
    self.mean = sp.empty((C, d))    # Vector of means
    self.cov = sp.empty((C, d, d))  # Matrix of covariance

    ## Learn the parameter of the model for each class
    for i in range(C):
        j = sp.where(y == (i + 1))[0]
        self.ni[i] = float(j.size)
        self.prop[i] = self.ni[i] / n
        self.mean[i, :] = sp.mean(x[j, :], axis=0)
        # Normalize by ni to be consistent with the update formulae
        self.cov[i, :, :] = sp.cov(x[j, :], bias=1, rowvar=0)

    if tau is not None:
        self.tau = tau * sp.eye(d)
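# Per-class moment estimation as a self-contained sketch (two synthetic
# classes, labels coded 1..C as this method expects):
import numpy as np

rng = np.random.default_rng(3)
x = np.vstack([rng.normal(0, 1, (100, 2)), rng.normal(3, 1, (80, 2))])
y = np.r_[np.ones(100), 2 * np.ones(80)]
for c in (1, 2):
    j = np.where(y == c)[0]
    prop = j.size / float(x.shape[0])               # class proportion
    mean = x[j, :].mean(axis=0)
    cov = np.cov(x[j, :], bias=True, rowvar=False)  # normalized by n_i, as above
    print(c, prop, mean.round(2))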
def _init_params(self, X):
    init = self.init
    n_samples, n_features = X.shape
    n_components = self.n_components

    if (init == 'kmeans'):
        km = Kmeans(n_components)
        clusters, mean, cov = km.cluster(X)
        coef = sp.array([c.shape[0] / n_samples for c in clusters])
        comps = [multivariate_normal(mean[i], cov[i], allow_singular=True)
                 for i in range(n_components)]
    elif (init == 'rand'):
        coef = sp.absolute(sprand.randn(n_components))
        coef = coef / coef.sum()
        means = X[sprand.permutation(n_samples)[0: n_components]]
        clusters = [[] for i in range(n_components)]
        for x in X:
            idx = sp.argmin([spla.norm(x - mean) for mean in means])
            clusters[idx].append(x)

        comps = []
        for k in range(n_components):
            mean = means[k]
            cov = sp.cov(clusters[k], rowvar=0, ddof=0)
            comps.append(multivariate_normal(mean, cov, allow_singular=True))

    self.coef = coef
    self.comps = comps
def __init__(self, Y=None, Xr=None, Rg=None, Ug=None, Sg=None, factr=1e7, debug=False):
    """
    Args:
        Y: [N, P] phenotype matrix
        Xr: [N, S] genotype data of the set component
        Rg: [N, N] background sample covariance (alternatively, its
            eigendecomposition can be provided through Ug and Sg)
        factr: parameter that determines the accuracy of the solution
            (see scipy.optimize.fmin_l_bfgs_b for more details)
    """
    # assert Xr
    Xr -= Xr.mean(0)
    Xr /= Xr.std(0)
    Xr /= sp.sqrt(Xr.shape[1])
    self.Y = Y
    self.Xr = Xr
    if Sg is None or Ug is None:
        Sg, Ug = la.eigh(Rg)
    self.Rg = Rg
    self.Ug = Ug
    self.Sg = Sg
    self.covY = sp.cov(Y.T)
    self.factr = factr
    self.debug = debug
    self.gp = {}
    self.info = {}
    #_trRr = sp.diagonal(sp.dot(self.Ug, sp.dot(sp.diag(self.Sg), self.Ug.T))).sum()
    self.trRg = ((self.Ug * self.Sg**0.5)**2).sum()
def _maximum_likelihood(self, X):
    n_samples, n_features = X.shape if X.ndim > 1 else (1, X.shape[0])
    n_components = self.n_components

    # Predict mean
    mu = X.mean(axis=0)

    # Predict covariance
    cov = sp.cov(X, rowvar=0)
    eigvals, eigvecs = self._eig_decomposition(cov)
    sigma2 = ((sp.sum(cov.diagonal()) - sp.sum(eigvals.sum())) /
              (n_features - n_components))  # FIXME: M < D?

    weight = sp.dot(eigvecs, sp.diag(sp.sqrt(eigvals - sigma2)))
    M = sp.dot(weight.T, weight) + sigma2 * sp.eye(n_components)
    inv_M = spla.inv(M)

    self.eigvals = eigvals
    self.eigvecs = eigvecs
    self.predict_mean = mu
    self.predict_cov = sp.dot(weight, weight.T) + sigma2 * sp.eye(n_features)
    self.latent_mean = sp.transpose(sp.dot(inv_M, sp.dot(weight.T, X.T - mu[:, sp.newaxis])))
    self.latent_cov = sigma2 * inv_M
    self.sigma2 = sigma2    # FIXME!
    self.weight = weight
    self.inv_M = inv_M

    return self.latent_mean
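# The closed-form probabilistic-PCA quantities computed above, as a
# self-contained numpy sketch (Tipping & Bishop: sigma2 is the mean of the
# discarded eigenvalues, the weights span the top eigenvectors):
import numpy as np

rng = np.random.default_rng(4)
X = rng.standard_normal((500, 5)) @ rng.standard_normal((5, 5))
q = 2                                              # latent dimensionality
cov = np.cov(X, rowvar=False)
eigvals, eigvecs = np.linalg.eigh(cov)
order = np.argsort(eigvals)[::-1]
top, rest = eigvals[order[:q]], eigvals[order[q:]]
sigma2 = rest.mean()                               # ML noise variance
W = eigvecs[:, order[:q]] * np.sqrt(top - sigma2)  # ML weights (up to rotation)
model_cov = W @ W.T + sigma2 * np.eye(X.shape[1])  # model covariance (predict_cov)
print(sigma2, model_cov.shape)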
def plot_covariance(history, dist_X):
    for dist_name in list(history.keys()):
        nTypes = len(history[dist_name].keys())
        errors = sp.zeros((2, nTypes))
        fig = plt.figure()
        fig.set_size_inches(6 * nTypes, 5)
        plt.subplot(1, nTypes + 1, 1)
        plt.imshow(dist_X.corr_matrix, cmap=plt.cm.gray, interpolation='none')
        counter = 0
        for samp_name in list(history[dist_name].keys()):
            counter += 1
            hist_single = history[dist_name][samp_name]
            nsteps = len(hist_single)
            nbatch = hist_single[-1]['X'].shape[1]
            N = hist_single[0]['X'].shape[0]
            X = sp.zeros((N, nbatch, nsteps))
            P = sp.zeros((N, nbatch, nsteps))
            for tt in range(nsteps):
                X[:, :, tt] = hist_single[tt]['X']
                P[:, :, tt] = hist_single[tt]['P']
            ax = plt.subplot(1, nTypes + 1, counter + 1)
            inv_var_diags = sp.diag(10.**sp.linspace(-dist_X.log_conditioning, 0, N))**.5
            corr_matrix_calc = sp.dot(sp.dot(inv_var_diags**.5, sp.cov(X.reshape(N, nbatch * nsteps), rowvar=1)), inv_var_diags**.5)
            plt.imshow(corr_matrix_calc, cmap=plt.cm.gray, interpolation='none')
            print(corr_matrix_calc)
    plt.show()
def cluster(self, X):
    self.fit(X)
    cluster = [X[sp.argmax(self.responsibility, axis=1) == k]
               for k in range(self.n_classes)]
    mean = self.center
    cov = [sp.cov(c, rowvar=0, ddof=0) for c in cluster]
    return cluster, mean, cov
def getEmpTraitCovar(self):
    """
    Returns the empirical trait covariance matrix
    """
    if self.P == 1:
        out = self.Y[self.Iok].var()
    else:
        out = SP.cov(self.Y[self.Iok].T)
    return out
def fit(self, X):
    cov = sp.cov(X, rowvar=0)
    eigvals, eigvecs = self._eig_decomposition(cov)
    self.eigvals = eigvals
    self.eigvecs = eigvecs
    self.mean = X.mean(axis=0)

    return sp.dot(X, eigvecs)
def _initParams(self, init_method=None):
    """ this function initializes the parameter and Ifilter """
    if self.P == 1:
        if self.bgRE:
            params0 = {'Cg': SP.sqrt(0.5) * SP.ones(1), 'Cn': SP.sqrt(0.5) * SP.ones(1)}
            Ifilter = None
        else:
            params0 = {'Cr': 1e-9 * SP.ones(1), 'Cn': SP.ones(1)}
            Ifilter = {'Cr': SP.zeros(1, dtype=bool), 'Cn': SP.ones(1, dtype=bool)}
    else:
        if self.bgRE:
            if self.colCovarType == 'freeform':
                if init_method == 'pairwise':
                    _RV = fitPairwiseModel(self.Y, XX=self.XX, S_XX=self.S_XX, U_XX=self.U_XX, verbose=False)
                    params0 = {'Cg': _RV['params0_Cg'], 'Cn': _RV['params0_Cn']}
                elif init_method == 'random':
                    params0 = {'Cg': SP.randn(self.Cg.getNumberParams()), 'Cn': SP.randn(self.Cn.getNumberParams())}
                else:
                    cov = 0.5 * SP.cov(self.Y.T) + 1e-4 * SP.eye(self.P)
                    chol = LA.cholesky(cov, lower=True)
                    params = chol[SP.tril_indices(self.P)]
                    params0 = {'Cg': params.copy(), 'Cn': params.copy()}
            Ifilter = None
        else:
            if self.colCovarType == 'freeform':
                cov = SP.cov(self.Y.T) + 1e-4 * SP.eye(self.P)
                chol = LA.cholesky(cov, lower=True)
                params = chol[SP.tril_indices(self.P)]
            #else:
            #    S,U=LA.eigh(cov)
            #    a = SP.sqrt(S[-self.rank_r:])[:,SP.newaxis]*U[:,-self.rank_r:]
            #    if self.colCovarType=='lowrank_id':
            #        c = SP.sqrt(S[:-self.rank_r].mean())*SP.ones(1)
            #    else:
            #        c = SP.sqrt(S[:-self.rank_r].mean())*SP.ones(self.P)
            #    params0_Cn = SP.concatenate([a.T.ravel(),c])
            params0 = {'Cr': 1e-9 * SP.ones(self.P), 'Cn': params}
            Ifilter = {'Cr': SP.zeros(self.P, dtype=bool),
                       'Cn': SP.ones(params.shape[0], dtype=bool)}
    if self.mean.F is not None and self.bgRE:
        params0['mean'] = 1e-6 * SP.randn(self.mean.getParams().shape[0])
        if Ifilter is not None:
            Ifilter['mean'] = SP.ones(self.mean.getParams().shape[0], dtype=bool)
    return params0, Ifilter
def infer_full_post(self, X_i, D_i):
    class MJMError(Exception):
        pass

    [m, V] = self.infer_full(X_i, D_i)
    ns = X_i.shape[0]
    cv = sp.zeros([ns, ns])
    for i in xrange(self.size):
        cv += V[ns * i:ns * (i + 1), :]
    cv = cv / self.size + sp.cov(m, rowvar=0, bias=1)
    return [sp.mean(m, axis=0).reshape([1, ns]), cv]
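# The combination above is the law of total variance: average the per-model
# posterior covariances and add the covariance of the per-model means. A
# self-contained numpy check (synthetic draws; bias=1 matches the usage above):
import numpy as np

rng = np.random.default_rng(5)
m = rng.normal(size=(6, 3))                               # 6 model means over 3 points
V = np.stack([np.eye(3) * (1.0 + i) for i in range(6)])   # per-model covariances
cv = V.mean(axis=0) + np.cov(m, rowvar=0, bias=1)         # total posterior covariance
print(cv)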
@classmethod
def randomized(cls, degree, dim, scale):
    mixcoeffs = scipy.random.random(degree)
    mixcoeffs /= mixcoeffs.sum()
    means = scipy.random.standard_normal((degree, dim)) * scale
    # Generate random covariances by generating random data.
    randomdata = (scipy.random.standard_normal((dim, 10)) * scale
                  for _ in xrange(degree))
    covs = [scipy.cov(i) for i in randomdata]
    return cls(mixcoeffs, means, covs)
def setUp(self):
    np.random.seed(1)

    # define phenotype
    N = 200
    P = 2
    Y = sp.randn(N, P)
    # define row covariance
    f = 10
    G = 1. * (sp.rand(N, f) < 0.2)
    X = 1. * (sp.rand(N, f) < 0.2)
    R = covar_rescale(sp.dot(X, X.T))
    R += 1e-4 * sp.eye(N)
    # define col covariances
    Cg = FreeFormCov(P)
    self._Cg = Cg
    Cn = FreeFormCov(P)
    Cg.setCovariance(0.5 * sp.cov(Y.T))
    Cn.setCovariance(0.5 * sp.cov(Y.T))
    # define gp
    self.gp = GP3KronSumLR(Y=Y, Cg=Cg, Cn=Cn, R=R, G=G, rank=1)
        self.mapping[indexes[i]] = finalbeta[i]
    return self.mapping

def stats(self, startdate, enddate, mktbasket, output=False):
    """
    Calculates statistics for a fund over a period.

    Parameters
    ----------
    startdate : datetime
        beginning of statistic period
    enddate : datetime
        end of statistic period
    mktbasket : dict
        dictionary of market streams
    output : bool
        if True, output results to db

    Returns
    -------
    stats : dict
        dictionary of statistics
    """
    inputmatrix, fundreturns, indexes, daterange = self.align(startdate, enddate, mktbasket)
    if self.mapping and not (inputmatrix is None):
        weights = scipy.array([self.mapping[mykey] if mykey in self.mapping else 0.0
                               for mykey in mktbasket.keys()])
        projected = scipy.dot(inputmatrix, weights.reshape(len(indexes), 1)).flatten()
        actual = fundreturns.flatten()
        diff = actual - projected
        outdata = {
            'TE':    scipy.std(diff) * 100.0 * 100.0,
            'BETA':  scipy.cov(projected, actual)[1, 0] / scipy.var(projected),
            'ALPHA': (scipy.product(diff + 1.0)) ** (1.0 / diff.size) - 1.0,
            'VOL':   scipy.std(actual) * scipy.sqrt(252.0),
            'PROJ':  scipy.product(1.0 + projected) - 1.0,
            'ACT':   scipy.product(1.0 + actual) - 1.0,
            'R2':    0.0 if scipy.all(actual == 0.0) else scipy.corrcoef(projected, actual)[1, 0] ** 2.0,
            'AV':    self.av(startdate),
            'DELTA': self.deltaestimate(startdate)
        }
        outdata['DIFF'] = outdata['ACT'] - outdata['PROJ']
        outdata['PL'] = outdata['DELTA'] * outdata['DIFF'] * 100.0
        if output:
            cnxn = pyodbc.connect(ORACLESTRING)
            cursor = cnxn.cursor()
            sql = 'INSERT INTO FUNDOUTPUT VALUES ({0!s},{1!s},{2!s},{3!s},{4!s},{5!s},{6},{7},{8!s},{9!s},{10!s},{11!s},{12!s},{13!s});'
            sql = sql.format(self.fundcode, outdata['PROJ'], outdata['ACT'], outdata['DIFF'],
                             outdata['DELTA'], outdata['PL'], oracledatebuilder(startdate),
                             oracledatebuilder(enddate), outdata['TE'], outdata['R2'], outdata['BETA'],
                             outdata['ALPHA'], outdata['VOL'], outdata['AV'])
            cursor.execute(sql)
            cnxn.commit()
            cnxn.close()
def simulate(self, l, noisefunc=None, random_state=None):
    """Simulate vector autoregressive (VAR) model.

    This function generates data from the VAR model.

    Parameters
    ----------
    l : int or [int, int]
        Number of samples to generate. Can be a tuple or list, where l[0]
        is the number of samples and l[1] is the number of trials.
    noisefunc : func, optional
        This function is used to create the generating noise process. If
        set to None, Gaussian white noise with zero mean and unit variance
        is used.

    Returns
    -------
    data : array, shape (n_trials, n_samples, n_channels)
        Generated data.
    """
    m, n = np.shape(self.coef)
    p = n // m

    try:
        l, t = l
    except TypeError:
        t = 1

    if noisefunc is None:
        rng = check_random_state(random_state)
        noisefunc = lambda: rng.normal(size=(1, m))

    n = l + 10 * p

    y = np.zeros((n, m, t))
    res = np.zeros((n, m, t))

    for s in range(t):
        for i in range(p):
            e = noisefunc()
            res[i, :, s] = e
            y[i, :, s] = e
        for i in range(p, n):
            e = noisefunc()
            res[i, :, s] = e
            y[i, :, s] = e
            for k in range(1, p + 1):
                y[i, :, s] += self.coef[:, (k - 1)::p].dot(y[i - k, :, s])

    self.residuals = res[10 * p:, :, :].T
    self.rescov = sp.cov(cat_trials(self.residuals).T, rowvar=False)

    return y[10 * p:, :, :].transpose([2, 1, 0])
def simulate(self, l, noisefunc=None):
    """ Simulate vector autoregressive (VAR) model

    This function generates data from the VAR model.

    Parameters
    ----------
    l : {int, [int, int]}
        Specify number of samples to generate. Can be a tuple or list where
        l[0] is the number of samples and l[1] is the number of trials.
    noisefunc : func, optional
        This function is used to create the generating noise process. If
        set to None Gaussian white noise with zero mean and unit variance
        is used.

    Returns
    -------
    data : array, shape = [n_samples, n_channels, n_trials]
    """
    (m, n) = sp.shape(self.coef)
    p = n // m

    try:
        (l, t) = l
    except TypeError:
        t = 1

    if noisefunc is None:
        noisefunc = lambda: sp.random.normal(size=(1, m))

    n = l + 10 * p

    y = sp.zeros((n, m, t))
    res = sp.zeros((n, m, t))

    for s in range(t):
        for i in range(p):
            e = noisefunc()
            res[i, :, s] = e
            y[i, :, s] = e
        for i in range(p, n):
            e = noisefunc()
            res[i, :, s] = e
            y[i, :, s] = e
            for k in range(1, p + 1):
                y[i, :, s] += self.coef[:, (k - 1)::p].dot(y[i - k, :, s])

    self.residuals = res[10 * p:, :, :]
    self.rescov = sp.cov(cat_trials(self.residuals), rowvar=False)

    return y[10 * p:, :, :]
def _initParams(self, init_method=None):
    """ this function initializes the parameter and Ifilter """
    if self.bgRE:
        if init_method == 'random':
            params0 = {'covar': sp.randn(self._gpNull.covar.getNumberParams())}
        else:
            if self.P == 1:
                params0 = {'covar': sp.sqrt(0.5) * sp.ones(2)}
            else:
                cov = 0.5 * sp.cov(self.Y.T) + 1e-4 * sp.eye(self.P)
                chol = la.cholesky(cov, lower=True)
                params = chol[sp.tril_indices(self.P)]
                params0 = {'covar': sp.concatenate([params, params])}
    else:
        if self.P == 1:
            params_cn = sp.array([1.])
        else:
            cov = sp.cov(self.Y.T) + 1e-4 * sp.eye(self.P)
            chol = la.cholesky(cov, lower=True)
            params_cn = chol[sp.tril_indices(self.P)]
        params0 = {'covar': params_cn}
    return params0
def setUp(self):
    np.random.seed(1)

    # define phenotype
    N = 200
    P = 2
    Y = sp.randn(N, P)
    # define fixed effects
    F = []; A = []
    F.append(1. * (sp.rand(N, 2) < 0.5))
    A.append(sp.eye(P))
    # define row covariance
    f = 10
    G = 1. * (sp.rand(N, f) < 0.2)
    # define col covariances
    Cr = FreeFormCov(P)
    self._Cr = Cr
    Cn = FreeFormCov(P)
    Cr.setCovariance(0.5 * sp.cov(Y.T))
    Cn.setCovariance(0.5 * sp.cov(Y.T))
    # define gp
    self.gp = GP2KronSumLR(Y=Y, F=F, A=A, Cn=Cn, G=G)
def __call__(self, gradient, error=None):
    # Append a copy to make sure this one is not changed after by the
    # client.
    self.samples.append(array(gradient))

    # Return None if no new estimate is being given.
    if len(self.samples) < self.samplesize:
        return None

    # After all the samples have been put into a single array, we can
    # delete them.
    gradientarray = array(self.samples).T
    inv_covar = inv(cov(gradientarray))
    self.values += dot(inv_covar, gradientarray.sum(axis=1))
    return self.values
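# A self-contained sketch of the covariance-whitened update above: collect
# gradient samples as columns and precondition their sum with the inverse
# sample covariance (values and shapes are illustrative):
import numpy as np

rng = np.random.default_rng(6)
samples = [rng.standard_normal(4) for _ in range(10)]  # ten 4-d gradient samples
G = np.array(samples).T                                # columns are samples
step = np.linalg.inv(np.cov(G)).dot(G.sum(axis=1))     # whitened aggregate step
print(step)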
def _init_params_default(self):
    """
    Internal method for default parameter initialization
    """
    # if there are some nan -> mean impute
    Yimp = self.Y.copy()
    Inan = sp.isnan(Yimp)
    Yimp[Inan] = Yimp[~Inan].mean()
    if self.P == 1:
        C = sp.array([[Yimp.var()]])
    else:
        C = sp.cov(Yimp.T)
    C /= float(self.n_randEffs)
    for ti in range(self.n_randEffs):
        self.getTraitCovarFun(ti).setCovariance(C)
def correlationMatrix(mdata, linit, lend, nstep):
    lstep = (lend - linit) / nstep
    corr = np.zeros((mdata.shape[0], mdata.shape[0]))
    liter = [linit + (i * lstep) for i in range(nstep)]
    print liter, len(liter), lend
    zz = 0
    for length in liter:
        corrs = cov(mdata[:, length:length + lstep])
        corr += corrs
        zz += 1
        print length, length + lstep,
        print zz
    corr /= nstep
    return corr
def calc_covariance_errors(history, dist_X):
    print('Calculating covariance errors...')
    for dist_name in list(history.keys()):
        nTypes = len(history[dist_name].keys())
        hist_single = history[dist_name][list(history[dist_name].keys())[0]]
        nsteps = len(hist_single)
        samp_names = []
        errors = sp.zeros((nsteps, nTypes, 2))
        counter = 0
        for samp_name in list(history[dist_name].keys()):
            samp_names.append(samp_name)
            hist_single = history[dist_name][samp_name]
            nsteps = len(hist_single)
            nbatch = hist_single[-1]['X'].shape[1]
            N = hist_single[0]['X'].shape[0]
            errors_tmp = sp.zeros((nsteps, nTypes))
            X = sp.zeros((N, nbatch, nsteps))
            P = sp.zeros((N, nbatch, nsteps))
            for tt in range(nsteps):
                X[:, :, tt] = hist_single[tt]['X']
                P[:, :, tt] = hist_single[tt]['P']
            inv_var_diags = 10.**sp.linspace(-dist_X.log_conditioning, 0, N)
            corr_matrix_calc = sp.zeros((N, N, nsteps))
            cov_matrix_calc = sp.zeros((N, N, nsteps))
            for iN in sp.arange(1, nsteps):
                if (iN % (nsteps / 10) == 0):
                    print("%s: %s errors calculated..." % (samp_name, iN))
                cov_matrix_calc[:, :, iN] = sp.cov(X[:, :, :iN].reshape(N, nbatch * iN), rowvar=1)
                corr_matrix_calc[:, :, iN] = sp.dot(sp.dot(sp.diag(inv_var_diags**.5), cov_matrix_calc[:, :, iN]), sp.diag(inv_var_diags**.5))
                errors_tmp[iN, 0] = sp.sum((sp.diag(sp.diag(corr_matrix_calc[:, :, iN])) - sp.diag(sp.diag(dist_X.corr_matrix)))**2.0) / N
                errors_tmp[iN, 1] = sp.sum((corr_matrix_calc[:, :, iN] - sp.diag(sp.diag(corr_matrix_calc[:, :, iN])) - dist_X.corr_matrix + sp.diag(sp.diag(dist_X.corr_matrix)))**2.0) / (N * (N - 1))
            print(corr_matrix_calc[:5, :5, -1])
            print(dist_X.corr_matrix[:5, :5])
            errors[:, counter, 0] = errors_tmp[:, 0]
            errors[:, counter, 1] = errors_tmp[:, 1]
            counter += 1
    return errors, samp_names
def setUp(self):
    np.random.seed(1)

    # define phenotype
    N = 200
    P = 2
    self.Y = sp.randn(N, P)
    # define fixed effects
    self.F = []; self.A = []
    self.F.append(1. * (sp.rand(N, 2) < 0.5))
    self.A.append(sp.eye(P))
    # define row covariance
    f = 10
    X = 1. * (sp.rand(N, f) < 0.2)
    self.R = covar_rescale(sp.dot(X, X.T))
    self.R += 1e-4 * sp.eye(N)
    # define col covariances
    self.Cg = FreeFormCov(P)
    self.Cn = FreeFormCov(P)
    self.Cg.setCovariance(0.5 * sp.cov(self.Y.T))
    self.Cn.setCovariance(0.5 * sp.cov(self.Y.T))
    # define gp
    self.gp = GP2KronSum(Y=self.Y, F=self.F, A=self.A, Cg=self.Cg,
                         Cn=self.Cn, R=self.R)
def __init__(self, dataTraining, classID, proportions=None):
    self.dataTraining = dataTraining
    # get the number of labels (since numbering goes from 0 to K-1, set class ID equal to K)
    nClasses = int(classID.max() + 1)
    # get the stats for each label
    self.means = []
    self.invVarCovarMatrix = []
    self.constant = []  # last 3 terms in equation
    for i in range(nClasses):
        id = classID == i  # array of bools
        proportions = id.mean()  # ratio of trues:falses (sum of ones / # of entries)
        self.means.append(dataTraining[id, :].mean(axis=0))
        varCovarMatrix = scipy.cov(dataTraining[id, :], rowvar=0)
        self.invVarCovarMatrix.append(inv(varCovarMatrix))
        self.constant.append(
            -0.5 * scipy.dot(scipy.dot(self.means[-1], self.invVarCovarMatrix[-1]),
                             scipy.transpose(self.means[-1]))
            + math.log(proportions)
            - 0.5 * math.log(scipy.linalg.det(varCovarMatrix)))
def PCA_EigenVectors_Values(fullhits):
    '''
    Input expects hits as a list of lists
    Utility function: from utilities package but here only requests the
    full set of eigenvectors in order to transform the data.
    '''
    X1 = array([row[:3] for row in fullhits])  # voxel data only
    # takes data as numpy array
    data_array = transpose(X1)
    # Get eigenvalues and eigenvectors
    eigenval = []
    etranspose = []
    if (len(data_array) > 0):
        eigenval, eigenvec = linalg.eig(cov(data_array))
        # Transpose eigenvec to return to dataset
        etranspose = transpose(eigenvec)
    return eigenval, etranspose
def PC_varExplained(Y, standardize=True):
    """Run PCA and calculate the cumulative fraction of variance

    Args:
        Y (dbl): phenotype values
        standardize (logical): if True, phenotypes are standardized

    Returns:
        var (dbl): cumulative distribution of variance explained
    """
    # figuring out the number of latent factors
    if standardize:
        Y -= Y.mean(0)
        Y /= Y.std(0)
    covY = SP.cov(Y)
    S, U = linalg.eigh(covY + 1e-6 * SP.eye(covY.shape[0]))
    S = S[::-1]
    rv = np.array([S[0:i].sum() for i in range(1, S.shape[0])])
    rv /= S.sum()
    return rv
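# The same cumulative-variance computation as a self-contained numpy sketch
# (standardize, eigendecompose the covariance, then accumulate the sorted
# eigenvalues):
import numpy as np

rng = np.random.default_rng(7)
Y = rng.standard_normal((50, 200))
Y = (Y - Y.mean(0)) / Y.std(0)
S = np.linalg.eigvalsh(np.cov(Y))[::-1]    # eigenvalues, descending
cum_var = np.cumsum(S) / S.sum()           # cumulative fraction of variance
print(cum_var[:5])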
def train(self, X):
    # Center the data
    self.X_mean = X.mean(0)
    X_centered = X - self.X_mean

    # Build the variance-covariance matrix
    V = sp.cov(X_centered.T)

    # Compute the eigenvalues of V
    self.eigvals, self.eigvecs = linalg.eig(V)

    # Take the n_components largest eigenvalues and stack the corresponding
    # eigenvectors to form the basis
    eigvals_idx = sp.argsort(self.eigvals)
    eigvals_idx = eigvals_idx[len(eigvals_idx)::-1]
    self.U = self.eigvecs[eigvals_idx[:self.n_components]]

    # Project the points onto the basis vectors
    X_pca = sp.dot(self.U, X_centered.T)
    X_pca = X_pca.T

    return X_pca, self.U
def roll_true():
    data = pd.read_csv('000032.csv', index_col=0, parse_dates=True)
    data = data[::-1]
    # print(data.index[-1:][0])
    enddate = data.index[-1:][0]
    begdate = enddate - relativedelta(months=2)
    print(begdate)
    print(enddate)
    month_data = data[data.index >= begdate]
    month_data = month_data[month_data.index <= enddate]
    print(month_data)
    month_data_close = month_data['close'].values
    d = np.diff(month_data_close)
    print(d)
    cov_ = sc.cov(d[:-1], d[1:])
    print(cov_)
    if cov_[0, 1] < 0:
        print('roll spread for negative', round(2 * sc.sqrt(-cov_[0, 1]), 3))
    else:
        print('roll spread for positive', round(cov_[0, 1]))
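# The Roll (1984) estimator used above: the effective spread is
# 2 * sqrt(-Cov(dp_t, dp_{t-1})) when the serial covariance of price changes
# is negative. A self-contained check on synthetic prices:
import numpy as np

rng = np.random.default_rng(8)
mid = np.cumsum(rng.normal(0, 0.01, 500)) + 10.0    # efficient mid price
trades = mid + 0.05 * rng.choice([-1, 1], 500)      # bounce of half-spread 0.05
d = np.diff(trades)
c = np.cov(d[:-1], d[1:])[0, 1]                     # first-order serial covariance
if c < 0:
    print('roll spread', round(2 * np.sqrt(-c), 3)) # ~ 0.1 = full spread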
def init_GPkronprod(Y, X_r, n_c):
    """ init parameters for kron(C + sigma I,R) + sigma*I """
    # build linear kernel with the features
    covar0_r = SP.array([0])
    covar_r = linear.LinearCF(n_dimensions=X_r.shape[1])
    covar_r.X = X_r
    R = covar_r.K(covar0_r)
    var_R = utils.getVariance(R)
    cov = SP.cov(Y)
    # split into likelihood and noise terms
    ratio = SP.random.rand(3)
    ratio /= ratio.sum()
    lik0 = ratio[0] * SP.diag(cov).min()
    covar0_c = ratio[1] * SP.diag(cov).min()
    # remaining variance is assigned to latent factors
    if n_c > 1:
        X0_c = SP.zeros((Y.shape[0], n_c))
        ratio = SP.random.rand(n_c)
        ratio /= ratio.sum()
        for i in range(n_c):
            # split further up, with a random sign per factor
            X0_c[:, i] = SP.sign(SP.random.rand() - 0.5) * SP.sqrt(
                ratio[i] * (SP.diag(cov) - lik0 - covar0_c))
    else:
        X0_c = SP.sign(SP.random.rand() - 0.5) * SP.sqrt(
            SP.diag(cov) - lik0 - covar0_c)
        X0_c = SP.reshape(X0_c, (X0_c.shape[0], n_c))
    # check if variance of initial values match observed variance
    assert SP.allclose(SP.diag(cov), (X0_c**2).sum(1) + lik0 + covar0_c), \
        'ouch, something is wrong'
    # bring in correct format and transform as necessary
    covar0_c = 0.5 * SP.log(SP.array([1. / var_R, covar0_c]))
    lik0 = 0.5 * SP.log(SP.array([lik0]))
    return X0_c, covar0_c, lik0, covar0_r
def column_covariances(X, uniformity_thresh): Xvert = high_frequency_vert(X, sigma=4.0) Xvertp = high_frequency_vert(X, sigma=3.0) models = [] use_C = [] for i in range(X.shape[2]): xsub = Xvert[:, :, i] xsubp = Xvertp[:, :, i] mu = xsub.mean(axis=0) dists = s.sqrt(pow((xsub - mu), 2).sum(axis=1)) distsp = s.sqrt(pow((xsubp - mu), 2).sum(axis=1)) thresh = percentile(dists, 95.0) uthresh = dists * uniformity_thresh #use = s.logical_and(dists<thresh, abs(dists-distsp) < uthresh) use = dists < thresh C = s.cov(xsub[use, :], rowvar=False) [U, V, D] = svd(C) V[V < 1e-8] = 1e-8 C = U.dot(s.diagflat(V)).dot(D) models.append(C) use_C.append(use) return s.array(models), Xvert, Xvertp, s.array(use_C).T
def __init__(self, Y=None, Xr=None, F=None, factr=1e7, Ie=None, debug=False):
    """
    Args:
        Y: [N, 1] phenotype matrix
        Xr: [N, S] genotype data of the set component
        F: [N, K] fixed effect design matrix (defaults to a column of ones)
        Ie: [N] boolean environment indicator used to kroneckerize F
        factr: parameter that determines the accuracy of the solution
            (see scipy.optimize.fmin_l_bfgs_b for more details)
    """
    if F is None:
        F = sp.ones((Y.shape[0], 1))
    # kroneckerize F
    W = sp.zeros((Y.shape[0], 2 * F.shape[1]))
    W[:, :F.shape[1]] = Ie[:, sp.newaxis] * F
    W[:, F.shape[1]:] = (~Ie[:, sp.newaxis]) * F
    from limix_core.mean import MeanBase
    self.mean = MeanBase(Y, W)
    # avoid SVD failures by adding some jitter
    Xr += 2e-6 * (sp.rand(*Xr.shape) - 0.5)
    # store stuff
    Xr -= Xr.mean(0)
    Xr /= Xr.std(0)
    Xr /= sp.sqrt(Xr.shape[1])
    self.Y = Y
    self.F = F
    self.Xr = Xr
    self.Ie = Ie
    self.covY = sp.cov(Y.T)
    self.factr = factr
    self.debug = debug
    self.gp = {}
    self.info = {}
def fit(self, data):
    """ Fit VAR model to data.

    Parameters
    ----------
    data : array-like, shape = [n_samples, n_channels, n_trials] or
           [n_samples, n_channels]
        Continuous or segmented data set.

    Returns
    -------
    self : :class:`VAR`
        The :class:`VAR` object.
    """
    data = sp.atleast_3d(data)
    (x, y) = self._construct_eqns(data)

    self.fitting_model.fit(x, y)

    self.coef = self.fitting_model.coef_

    self.residuals = data - self.predict(data)
    self.rescov = sp.cov(datatools.cat_trials(self.residuals[self.p:, :, :]), rowvar=False)

    return self
def fit(self, data):
    """Fit VAR model to data.

    Parameters
    ----------
    data : array, shape (trials, channels, samples)
        Continuous or segmented data set. If the data is continuous, a 2D
        array of shape (channels, samples) can be provided.

    Returns
    -------
    self : :class:`VAR`
        The :class:`VAR` object.
    """
    data = atleast_3d(data)
    (x, y) = self._construct_eqns(data)

    self.fitting_model.fit(x, y)

    self.coef = self.fitting_model.coef_

    self.residuals = data - self.predict(data)
    self.rescov = sp.cov(cat_trials(self.residuals[:, :, self.p:]))

    return self
def ex15( exclude=sc.array([1, 2, 3, 4]), plotfilename='ex15.png', bovyprintargs={}): """ex15: solve exercise 15 Input: exclude - ID numbers to exclude from the analysis plotfilename - filename for the output plot Output: plot History: 2010-05-07 - Written - Bovy (NYU) """ #Read the data data = read_data('data_allerr.dat', allerr=True) ndata = len(data) nsample = ndata - len(exclude) #Put the dat in the appropriate arrays and matrices Y = sc.zeros(nsample) X = sc.zeros(nsample) Z = sc.zeros((nsample, 2)) jj = 0 for ii in range(ndata): if sc.any(exclude == data[ii][0]): pass else: Y[jj] = data[ii][1][1] X[jj] = data[ii][1][0] Z[jj, 0] = X[jj] Z[jj, 1] = Y[jj] jj = jj + 1 #Now compute the PCA solution Zm = sc.mean(Z, axis=0) Q = sc.cov(Z.T) eigs = linalg.eig(Q) maxindx = sc.argmax(eigs[0]) V = eigs[1][maxindx] V = V / linalg.norm(V) m = sc.sqrt(1 / V[0]**2. - 1) bestfit = sc.array([-m * Zm[0] + Zm[1], m]) #Plot result plot.bovy_print(**bovyprintargs) xrange = [0, 300] yrange = [0, 700] plot.bovy_plot(sc.array(xrange), bestfit[1] * sc.array(xrange) + bestfit[0], 'k--', xrange=xrange, yrange=yrange, xlabel=r'$x$', ylabel=r'$y$', zorder=2) plot.bovy_plot(X, Y, marker='o', color='k', linestyle='None', zorder=0, overplot=True) plot.bovy_text(r'$y = %4.2f \,x %4.0f' % (bestfit[1], bestfit[0]) + r'$', bottom_right=True) plot.bovy_end_print(plotfilename)
def overlap_fp_fn(spikes, means=None, covariances=None):
    """ Return dicts of tuples (False positive rate, false negative rate)
    indexed by unit.

    This function needs :mod:`sklearn` if ``covariances`` is not set to
    ``'white'``.

    This function estimates the pairwise and total false positive and false
    negative rates for a number of waveform clusters. The results can be
    interpreted as follows: False positives are the fraction of spikes in a
    cluster that is estimated to belong to a different cluster (a specific
    cluster for pairwise results or any other cluster for total results).
    False negatives are the number of spikes from other clusters that are
    estimated to belong to a given cluster (also expressed as fraction, this
    number can be larger than 1 in extreme cases).

    Details for the calculation can be found in
    (Hill et al. The Journal of Neuroscience. 2011). The calculation for
    total false positive and false negative rates does not follow
    Hill et al., who propose a simple addition of pairwise probabilities.
    Instead, the total error probabilities are estimated using all clusters
    at once.

    :param dict spikes: Dictionary, indexed by unit, of lists of spike
        waveforms as :class:`neo.core.Spike` objects or numpy arrays. If the
        waveforms have multiple channels, they will be flattened
        automatically. All waveforms need to have the same number of samples.
    :param dict means: Dictionary, indexed by unit, of lists of spike
        waveforms as :class:`neo.core.Spike` objects or numpy arrays. Means
        for units that are not in this dictionary will be estimated using
        the spikes. Note that if you pass ``'white'`` for ``covariances``
        and you want to provide means, they have to be whitened in the same
        way as the spikes.
        Default: None, means will be estimated from data.
    :param covariances: Dictionary, indexed by unit, of lists of covariance
        matrices. Covariances for units that are not in this dictionary will
        be estimated using the spikes. It is useful to give a covariance
        matrix if few spikes are present - consider using the noise
        covariance. If you use prewhitened spikes (i.e. all clusters are
        normal distributed, so their covariance matrix is the identity),
        you can pass ``'white'`` here. The calculation will be much faster
        in this case and the sklearn package is not required.
        Default: None, covariances will be estimated from data.
    :type covariances: dict or str
    :returns: Two values:

        * A dictionary (indexed by unit) of total
          (false positive rate, false negative rate) tuples.
        * A dictionary of dictionaries, both indexed by units, of pairwise
          (false positive rate, false negative rate) tuples.
    :rtype: dict, dict
    """
    units = spikes.keys()
    total_spikes = 0
    for spks in spikes.itervalues():
        total_spikes += len(spks)
    if total_spikes < 1:
        return {u: (0.0, 0.0) for u in units}, {}

    if means is None:
        means = {}

    white = False
    if covariances is None:
        covariances = {}
    elif covariances == 'white':
        white = True
        covariances = {}

    # Convert Spike objects to arrays
    dimensionality = None
    spike_arrays = {}
    for u, spks in spikes.iteritems():
        spikelist = []
        if not spks or (len(spks) < 2 and u not in covariances):
            units.remove(u)
            continue
        for s in spks:
            if isinstance(s, neo.Spike):
                spikelist.append(
                    sp.asarray(s.waveform.rescale(pq.uV)).T.flatten())
            else:
                spikelist.append(s)
        spike_arrays[u] = sp.array(spikelist).T
        if dimensionality is None:
            dimensionality = spike_arrays[u].shape[0]
        elif dimensionality != spike_arrays[u].shape[0]:
            raise SpykeException('All spikes need to have the same number '
                                 'of samples!')

    if not units:
        return {}, {}
    if len(units) == 1:
        return {units[0]: (0.0, 0.0)}, {}

    # Convert or calculate means and covariances
    shaped_means = {}
    covs = {}
    if white:
        cov = sp.eye(dimensionality)
        covariances = {u: cov for u in units}

    for u in units:
        if u in means and _object_has_size(means[u], dimensionality):
            mean = means[u]
            if isinstance(mean, neo.Spike):
                shaped_means[u] = sp.asarray(mean.waveform.rescale(
                    pq.uV)).T.flatten()
            else:
                shaped_means[u] = means[u].T.flatten()
        else:
            shaped_means[u] = spike_arrays[u].mean(axis=1)

    if white:
        return _fast_overlap_whitened(spike_arrays, shaped_means)

    for u in units:
        if u not in covariances:
            covs[u] = sp.cov(spike_arrays[u])
        else:
            covs[u] = covariances[u]

    # Calculate pairwise false positives/negatives
    singles = {u: {} for u in units}
    for i, u1 in enumerate(units):
        u1 = units[i]
        for u2 in units[i + 1:]:
            error_rates = _pair_overlap(spike_arrays[u1], spike_arrays[u2],
                                        shaped_means[u1], shaped_means[u2],
                                        covs[u1], covs[u2])
            singles[u1][u2] = error_rates[0:2]
            singles[u2][u1] = error_rates[2:4]

    # Calculate complete false positives/negatives
    import sklearn
    mix = sklearn.mixture.GMM(n_components=2, covariance_type='full')
    mix_means = []
    mix_covars = []
    mix_weights = []
    for u in units:
        mix_means.append(shaped_means[u])
        mix_covars.append([covs[u]])
        mix_weights.append(spike_arrays[u].shape[1])
    mix.means_ = sp.vstack(mix_means)
    mix.covars_ = sp.vstack(mix_covars)
    mix_weights = sp.array(mix_weights, dtype=float)
    mix_weights /= mix_weights.sum()
    mix.weights_ = mix_weights

    # P(spikes of unit[i] in correct cluster)
    post_mean = sp.zeros(len(units))
    # sum(P(spikes of unit[i] in cluster[j])
    post_sum = sp.zeros((len(units), len(units)))
    for i, u in enumerate(units):
        posterior = mix.predict_proba(spike_arrays[u].T)
        post_mean[i] = posterior[:, i].mean()
        post_sum[i, :] = posterior.sum(axis=0)

    totals = {}
    for i, u in enumerate(units):
        fp = 1.0 - post_mean[i]
        ind = range(len(units))
        ind.remove(i)
        fn = post_sum[ind, i].sum() / float(spike_arrays[u].shape[1])
        totals[u] = (fp, fn)
    return totals, singles
def _propose(self, step, po=None): """ Generates proposals. returns two lists :Parameters: - `step`: Position in the markov chain history. - `po`: Process pool for parallel proposal generation :Returns: - `theta`: List of proposed self.dimensional points in parameter space - `prop`: List of self.nchains proposed phis. """ po = None thetalist = [] proplist = [] initcov = identity(self.dimensions) if self.meld.initheta and step <= 1: # start from user-defined point in parameter space. for i in range(self.nchains): thetalist.append(self.meld.initheta) self.lastcv = initcov # assume no covariance at the beginning else: for c in range(self.nchains): off = 0 if step <= 1 or self.seqhist[c] == []: # sample from the priors while off < 50: theta = [ self.parpriors[par].rvs() for par in self.parnames ] if not self.check_constraints(theta): continue if sum([ int(t >= self.parlimits[i][0] and t <= self.parlimits[i][1]) for i, t in enumerate(theta) ]) == self.dimensions: break off += 1 if off == 50: # try a compromising proposal theta = self.seqhist[c][ -1] # last accepted proposal for this chain # print "off:" , off self.lastcv = initcov # assume no covariance at the beginning else: # use gaussian proposal if step % 10 == 0 and len( self.seqhist[c] ) >= 10: # recalculate covariance matrix only every ten steps cv = self.scaling_factor * cov( array(self.seqhist[c][-10:]), rowvar=0 ) + self.scaling_factor * self.e * identity( self.dimensions) self.lastcv = cv else: cv = self.lastcv # print self.parlimits while off < 50: theta = multivariate_normal(self.seqhist[c][-1], cv, size=1).tolist()[0] if sum([ int(t >= self.parlimits[i][0] and t <= self.parlimits[i][1]) for i, t in enumerate(theta) ]) == self.dimensions: break off += 1 if off == 50: # try a compromising proposal theta = self.seqhist[c][ -1] # last accepted proposal for this chain # print "off:" , off thetalist.append(theta) if po: proplis = [ po.apply_async(model_as_ra, (t, self.meld.model, self.meld.phi.dtype.names)) for t in thetalist ] proplist = [job.get() for job in proplis] else: proplist = [ model_as_ra(t, self.meld.model, self.meld.phi.dtype.names) for t in thetalist ] propl = [p[:self.t] for p in proplist] return thetalist, propl
N = 1000 P = 4 K = 2 S = 500 Y, F, G, B0, Cg0, Cn0 = generate_data(N, P, K, S) # compute eigenvalue decomp of RRM R = sp.dot(G, G.T) R /= R.diagonal().mean() R += 1e-4 * sp.eye(R.shape[0]) Sr, Ur = la.eigh(R) # fit null model Cg = FreeFormCov(Y.shape[1]) Cn = FreeFormCov(Y.shape[1]) gp = GP2KronSum(Y=Y, S_R=Sr, U_R=Ur, Cg=Cg, Cn=Cn, F=F, A=sp.eye(P)) gp.covar.Cg.setCovariance(0.5 * sp.cov(Y.T)) gp.covar.Cn.setCovariance(0.5 * sp.cov(Y.T)) gp.optimize(factr=10) import pdb pdb.set_trace() # run MTLMM from limix_lmm.lmm_core import MTLMM mtlmm = MTLMM(Y, F=F, A=sp.eye(P), Asnp=sp.eye(P), covar=gp.covar) pv, B = mtlmm.process(G)
result.filters = filters result.args = args trace = result.trace result.trace = None save_results(result, rname) result.trace = trace # --- Plotting _ = display(result, savedir=args.plot_dir, show=args.display, root=pname) normchain = (result.chain - result.chain.mean(axis=0)) / result.chain.std(axis=0) corr = cov(normchain.T) # --- hemcee --- if args.backend == "hemcee": result = backends.run_hemcee(p0, scene, plans, scales=scales, nwarm=args.nwarm, niter=args.niter) result.labels = scene.parameter_names result.sourcepars = srcpars result.stamps = stamps result.filters = filters
def _propose(self, step, po=None): """ Generates proposals. returns two lists :Parameters: - `step`: Position in the markov chain history. - `po`: Process pool for parallel proposal generation :Returns: - `theta`: List of proposed self.dimensional points in parameter space - `prop`: List of self.nchains proposed phis. """ thetalist = [] proplist = [] initcov = np.identity(self.dimensions) for c in range(self.nchains): if step <= 1 or self.seqhist[c] == []: # sample from the priors while 1: theta = [self.parpriors[dist]() for dist in self.parnames] if not self.check_constraints(theta): continue if sum([ int( greater(t, self.parlimits[i][0]) and less(t, self.parlimits[i][1])) for i, t in enumerate(theta) ]) == self.dimensions: break self.lastcv = initcov # assume no covariance at the beginning else: # use gaussian proposal if step % 10 == 0 and len( self.seqhist[c] ) >= 10: # recalculate covariance matrix only every ten steps cv = self.scaling_factor * cov(array(self.seqhist[c][ -10:])) + self.scaling_factor * self.e * identity( self.dimensions) self.lastcv = cv else: cv = self.lastcv while 1: theta = multivariate_normal(self.seqhist[c][-1], cv, size=1).tolist()[0] if sum([ int( greater(t, self.parlimits[i][0]) and less(t, self.parlimits[i][1])) for i, t in enumerate(theta) ]) == self.dimensions: break thetalist.append(theta) if po: proplis = [ po.apply_async(model_as_ra, (t, self.meld.model, self.meld.phi.dtype.names)) for t in thetalist ] proplist = [job.get() for job in proplis] else: proplist = [ model_as_ra(t, self.meld.model, self.meld.phi.dtype.names) for t in thetalist ] propl = [p[:self.t] for p in proplist] return thetalist, propl
# Covariance
cov = sum((x - mu_x) * (y - mu_y)) / (N - 1)
cov

# 4 Variance-covariance matrix --------------------------------------------------------------------

# Check the source data
cov_data

# Extract the series
x = cov_data["x"]
y = cov_data["y"]

# Compute the variance-covariance matrix
# --- using N as the denominator
sp.cov(x, y, ddof=0)

# Compute the variance-covariance matrix
# --- using N - 1 as the denominator
sp.cov(x, y, ddof=1)

# 5 Pearson product-moment correlation coefficient ----------------------------------------------------------------

# <Key points>
# - The correlation coefficient can only evaluate linear relationships
# --- note that nonlinear relationships like the one on p. 128 cannot be evaluated properly

# Check the source data
cov_data

# Extract the series
def stats(self, startdate, enddate, mktbasket, avdate, output=False, mappingoverride=None): """ Calculates statistics for a fund over a period. Parameters ---------- startdate : datetime beginning of statistic period enddate : datetime end of statistic period mktbasket : dict dictionary of market streams output : bool if True, output results to db mappingoverride : None or mapping dictionary whether to override the db mapping Returns ------- stats : dict dictionary of statistics """ actualstream, projstream = self.project(mktbasket, mappingoverride) if actualstream[startdate:enddate] is None: return None if projstream[startdate:enddate] is None: return None actual = actualstream[startdate:enddate].returns projected = projstream[startdate:enddate].returns diff = actual - projected outdata = { 'TE': scipy.std(diff) * 100.0 * 100.0, 'BETA': scipy.cov(projected, actual, bias=1)[1, 0] / scipy.var(projected), 'ALPHA': (scipy.product(diff + 1.0))**(1.0 / diff.size) - 1.0, 'VOL': scipy.std(actual) * scipy.sqrt(252.0), 'PROJ': scipy.product(1.0 + projected) - 1.0, 'ACT': scipy.product(1.0 + actual) - 1.0, 'R2': 0.0 if scipy.all( actual == 0.0) else scipy.corrcoef(projected, actual)[1, 0]**2.0, 'AV': self.av(avdate), 'DELTA': self.deltaestimate(avdate) } outdata['DIFF'] = outdata['ACT'] - outdata['PROJ'] outdata['PL'] = outdata['DELTA'] * outdata['DIFF'] * 100.0 if output: cnxn = pyodbc.connect(ORACLESTRING) cursor = cnxn.cursor() sql = 'INSERT INTO FUNDOUTPUT VALUES ({0!s},{1!s},{2!s},{3!s},{4!s},{5!s},{6},{7},{8!s},{9!s},{10!s},{11!s},{12!s},{13!s});' sql = sql.format(self.fundcode, outdata['PROJ'], outdata['ACT'], outdata['DIFF'], outdata['DELTA'], outdata['PL'], oracledatebuilder(startdate), oracledatebuilder(enddate), outdata['TE'], outdata['R2'], outdata['BETA'], outdata['ALPHA'], outdata['VOL'], outdata['AV']) cursor.execute(sql) cnxn.commit() cnxn.close() return outdata
def correlated_noise(bias_files, target=0, make_plots=False, plot_corr=True, figsize=(8, 8), title=''): """ Compute the correlated noise statistics for the overscan regions of the list of files, optionally making plots of the distributions. Parameters ---------- bias_files: list List of bias files to analyze. This list must have at least as many files as the target file index + 1. target: int Bias frame to compare to the mean biases constructed from the remaining files. make_plots: bool [False] Flag to determine if the png plots will be generated. plot_corr: bool [True] Flag to plot the histograms of correlation-corrected pixel values. If False, then plot histograms of the uncorrected pixel values. figsize: tuple [(8, 8)] Figure size (in inches) of 4x4 grid of correlation plots. title: str [''] Title of 4x4 grid. Returns ------- (dict, figure, figure): tuple of results and matplotlib figures. The first item is a dict of BiasStats objects, BiasStats = namedtuple('BiasStats', \ 'noise_orig noise_corr corr_factor bias_oscan'.split()) that contain the results for each amplifier. """ # Extract the target filename and omit it from the list of bias files. target_file = bias_files.pop(target) # Get the target frame overscans. bias_oscans = get_overscans(target_file) oscan_shape = bias_oscans[1].shape # Construct the mean bias overscans from the remaining files. mean_oscans = get_mean_overscans(bias_files) # Loop over amps in target frame and compute statistics. bias_stats = dict() correlation_data = dict() for amp in bias_oscans: # Loop over other amps and construct the mean image of the # bias-subtracted overscans. Require included amps to have # (unsubtracted) overscans with 4 < stdev < 25 rms ADU. reduced_mean_oscan = np.zeros(oscan_shape) num_oscan = 0 for oamp, oscan in bias_oscans.items(): if oamp == amp: continue reduced_mean_oscan += (oscan - mean_oscans[oamp]) num_oscan += 1 reduced_mean_oscan -= np.mean(reduced_mean_oscan) reduced_mean_oscan /= num_oscan fdata1 = bias_oscans[amp] - mean_oscans[amp] fmean1 = np.mean(fdata1) fdata1 -= fmean1 dmat = np.vstack((reduced_mean_oscan.ravel(), fdata1.ravel())) covmat = scipy.cov(dmat, rowvar=True) corr_factor = covmat[0, 1] / covmat[0, 0] fdiff = fdata1 - corr_factor * reduced_mean_oscan bias_stats[amp] = BiasStats(np.sqrt(covmat[1, 1]), np.std(fdiff), corr_factor, np.mean(bias_oscans[amp])) correlation_data[amp] = reduced_mean_oscan, fdata1, fdiff f1 = None f2 = None if make_plots: f1, f2 = plot_correlated_noise(correlation_data, bias_stats, plot_corr=plot_corr, title=title, figsize=figsize) return (correlation_data, bias_stats), f1, f2
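# The core of the correction above: corr_factor is the least-squares
# regression coefficient of the target overscan on the mean reference, and
# subtracting corr_factor * reference removes the common-mode noise. A
# self-contained sketch on synthetic data:
import numpy as np

rng = np.random.default_rng(9)
common = rng.normal(size=10000)                    # noise shared across amps
target = 0.8 * common + rng.normal(size=10000)     # target amp adds its own noise
covmat = np.cov(np.vstack((common, target)), rowvar=True)
corr_factor = covmat[0, 1] / covmat[0, 0]          # ~ 0.8
corrected = target - corr_factor * common
print(target.std(), corrected.std())               # corrected rms is smaller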
ax1.set_title('log10 scaled counts (mean)') sns.distplot(mean2, ax=ax2, bins=20, kde=False) sns.despine() ax2.set_title('log10 scaled gene counts + gaussianized (mean)') # tweak the title ttl1 = ax1.title ttl1.set_weight('bold') ttl2 = ax2.title ttl2.set_weight('bold') PL.figtext(0.01, 0.01, date.today().isoformat()) PL.tight_layout() PL.savefig(plotFile) PL.close() # Produce a PCA plot of the samples covY = SP.cov(Y3_gene) eigenvals, eigenvecs = linalg.eigh(covY + 1e-6 * SP.eye(covY.shape[0])) eigenvals = eigenvals[::-1] eigenvecs = eigenvecs[::-1] df_pcs = pd.DataFrame({ 'PC1': eigenvecs[0], 'PC2': eigenvecs[1], 'PC3': eigenvecs[2], 'PC4': eigenvecs[3], 'PC5': eigenvecs[4] }) ## PC pairs plot coloured by assay time ## just use date and time for assaytime print "... producing PC pairs plot coloured by assay time" assaytime = [ str(dt).split(" ")[0][:-3] for dt in sampleInfo['assaytime_rnaseq']
def estimate(file, detailed):
    # load in the strata distribution
    dist_line = ["0.0"]
    dist_line += file.readline().split()
    dist = array(dist_line, float)
    p = dist  # probability of a program being in each stratum
    I = len(dist)  # number of strata, including passive
    A = I - 1  # active strata
    Y = [[] for i in range(I)]  # empty collection of samples divided up by stratum
    Y[0] = [0]
    s = ones((I))  # estimated standard deviations for each stage & strata

    # read in log file results
    num_samples = 0
    for result in file:
        stamp, stratum, perf1, perf2 = result.split()
        z = int(stratum)
        if True:  #z > 10:
            Y[int(stratum)].append((float(perf1), float(perf2)))
            num_samples += 2

    # compute empirical standard deviations for each stratum
    for i in range(1, I):
        if p[i] > 0.0 and len(Y[i]) > 2:
            YA = array(Y[i])
            sample1 = YA[:, 0]  # positive antithetic runs
            sample2 = YA[:, 1]  # negative antithetic runs
            s1 = sample1.std(ddof=1)  # 1 degree of freedom
            s2 = sample2.std(ddof=1)  # 1 degree of freedom
            covariance = cov(sample1, sample2)[0, 1]  # default is 1 df
            var = 0.25 * (s1 * s1 + s2 * s2 + 2.0 * covariance)
            s[i] = sqrt(var)
        else:
            s[i] = 1.0

    # report current estimates by strata
    if detailed:
        for i in range(1, I):
            stratum_samples = len(Y[i]) * 2.0
            print " % 3d % 5d" % (i, stratum_samples),
            if stratum_samples == 0:
                # no samples, so skip mean and half CI
                print
            elif stratum_samples < 4:
                # don't report half CI with less than 4 samples
                print " % 6.1f" % (array(Y[i]).mean())
            else:
                # do a full report
                print " % 6.1f +/- % 5.1f" \
                    % (array(Y[i]).mean(), 1.96*s[i]/sqrt(stratum_samples) )
        print

    # compute the current estimate and 95% confidence interval
    est = 0.0
    for i in range(1, I):
        stratum_samples = len(Y[i]) * 2.0
        if p[i] > 0.0 and stratum_samples > 2:
            est += p[i] / stratum_samples * array(Y[i]).sum()
    delta = 1.96 * sum(p * s) / sqrt(num_samples)
    print "%6i % 5.1f +/- % 5.1f" % (num_samples, est, delta),
    return
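# The variance formula above is Var((Y1 + Y2) / 2) for an antithetic pair:
# 0.25 * (s1^2 + s2^2 + 2 * Cov(Y1, Y2)). A self-contained check:
import numpy as np

rng = np.random.default_rng(10)
u = rng.random(1000)
y1, y2 = u**2, (1 - u)**2                          # negatively correlated pair
covariance = np.cov(y1, y2)[0, 1]
var_pair = 0.25 * (y1.var(ddof=1) + y2.var(ddof=1) + 2.0 * covariance)
print(var_pair, np.var((y1 + y2) / 2, ddof=1))     # identical by construction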
### mean imputation for remaining nans print('Imputing mean') for i in range(psi.shape[0]): n_idx = sp.where(sp.isnan(psi[i, :]))[0] if n_idx.shape[0] == 0: continue psi[i, n_idx] = spst.nanmean(psi[i, :]) ### center the data - I might not need to do this for the covariance psi -= sp.mean(psi, axis=1)[:, sp.newaxis] #psi -= sp.mean(psi, axis=0) ### compute kernel print('Computing covariances') K = sp.cov([psi[i, :] for i in range(psi.shape[0])]) ### PCA print('Compute PCA ...') w_g, Vt_g = eigh(K) V_g = Vt_g.T w_g = w_g[::-1] V_g = V_g[::-1, :] print('... done') pickle.dump((w_g, V_g, ctypes, tn_labels, psi), open(picklefile, 'w'), -1) else: print('Loading data from pickle: %s' % picklefile) (w_g, V_g, ctypes, tn_labels, psi) = pickle.load(open(picklefile, 'r'))
plt.imshow(ims, cmap="gray") plt.colorbar() # Visualization of band 2 plt.figure() ims = skip_extrem(im[:, :, b2]) plt.imshow(ims, cmap="gray") plt.colorbar() # Mean differences print "Median value of differences between band {} and {} is {}".format( b1, b2, 100. * sp.median( (im[:, :, b2].astype(float) - im[:, :, b1]) / im[:, :, b1])) # Computation of the correlation im.shape = (h * w, b) cov = sp.cov(im[::4, :], bias=1, rowvar=0) dcov = sp.sqrt(sp.diag(cov)) cor = cov / dcov[:, sp.newaxis] cor /= dcov[sp.newaxis, :] plt.figure() plt.imshow(cor, interpolation='nearest') plt.colorbar() # Compute condition number s = linalg.svd(cov, compute_uv=False) print("Condition number is {}".format(s[0] / s[-1])) # Plot all the figures plt.show()
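# Covariance-to-correlation normalization as used above, self-contained:
# divide the covariance by the outer product of the standard deviations,
# then take the SVD spectrum for the condition number:
import numpy as np

rng = np.random.default_rng(11)
X = rng.standard_normal((1000, 3)) @ rng.standard_normal((3, 3))
cov = np.cov(X, bias=1, rowvar=0)
dcov = np.sqrt(np.diag(cov))
cor = cov / dcov[:, np.newaxis]
cor /= dcov[np.newaxis, :]                         # unit diagonal now
s = np.linalg.svd(cov, compute_uv=False)
print(cor.round(2), s[0] / s[-1])                  # condition number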
def correlated_noise(bias_files, target=0, make_plots=False, plot_corr=True, figsize=(8, 8), title=''): """ Compute the correlated noise statistics for the overscan regions of the list of files, optionally making plots of the distributions. Parameters ---------- bias_files: list List of bias files to analyze. This list must have at least as many files as the target file index + 1. target: int Bias frame to compare to the mean biases constructed from the remaining files. make_plots: bool [False] Flag to determine if the png plots will be generated. plot_corr: bool [True] Flag to plot the histograms of correlation-corrected pixel values. If False, then plot histograms of the uncorrected pixel values. figsize: tuple [(8, 8)] Figure size (in inches) of 4x4 grid of correlation plots. title: str [''] Title of 4x4 grid. Returns ------- (dict, figure, figure): tuple of results and matplotlib figures. The first item is a dict of BiasStats objects, BiasStats = namedtuple('BiasStats', \ 'noise_orig noise_corr corr_factor bias_oscan'.split()) that contain the results for each amplifier. """ f1, f2 = None, None if make_plots: f1, ax1 = plt.subplots(4, 4, figsize=figsize) ax1 = {amp: subplot for amp, subplot in zip(imutils.allAmps(), ax1.flatten())} f2, ax2 = plt.subplots(4, 4, figsize=figsize) ax2 = {amp: subplot for amp, subplot in zip(imutils.allAmps(), ax2.flatten())} # Extract the target filename and omit it from the list of bias files. target_file = bias_files.pop(target) # Get the target frame overscans. bias_oscans = get_overscans(target_file) oscan_shape = bias_oscans[1].shape # Construct the mean bias overscans from the remaining files. mean_oscans = get_mean_overscans(bias_files) # Compute the mean values of the mean bias overscans. mean_oscan_values \ = {amp: np.mean(oscan) for amp, oscan in mean_oscans.items()} # Loop over amps in target frame and compute statistics. bias_stats = dict() for amp in bias_oscans: # Loop over other amps and construct the mean image of the # bias-subtracted overscans. Require included amps to have # (unsubtracted) overscans with 4 < stdev < 25 rms ADU. reduced_mean_oscan = np.zeros(oscan_shape) num_oscan = 0 for oamp, oscan in bias_oscans.items(): if oamp == amp: continue reduced_mean_oscan += (oscan - mean_oscans[oamp]) num_oscan += 1 reduced_mean_oscan -= np.mean(reduced_mean_oscan) reduced_mean_oscan /= num_oscan fdata1 = bias_oscans[amp] - mean_oscans[amp] fmean1 = np.mean(fdata1) fdata1 -= fmean1 dmat = np.vstack((reduced_mean_oscan.flatten(), fdata1.flatten())) covmat = scipy.cov(dmat, rowvar=True) corr_factor = covmat[0, 1]/covmat[0, 0] fdiff = fdata1 - corr_factor*reduced_mean_oscan bias_stats[amp] = BiasStats(np.sqrt(covmat[1, 1]), np.std(fdiff), corr_factor, np.mean(bias_oscans[amp])) #fmean1) if make_plots: f1.suptitle(title) f2.suptitle(title) ax1[amp].hist2d(reduced_mean_oscan.flatten(), fdata1.flatten(), bins=(100, 100), range=((-50, 50), (-50, 50))) label = 'amp %i, cov/var = %.2f' \ % (amp, bias_stats[amp].corr_factor) ax1[amp].text(-40, 40, label, fontsize=6, color='w', fontweight='bold') if plot_corr: ax2[amp].hist(fdiff.flatten(), bins=100, range=(-50, 50), histtype='step') else: ax2[amp].hist(fdata1.flatten(), bins=100, range=(-50, 50), histtype='step') return bias_stats, f1, f2
def bces(x1, x2, x1err=[], x2err=[], cerr=[], logify=True, model='yx',
         bootstrap=5000, verbose='normal', full_output=True):
    """
    Bivariate, Correlated Errors and intrinsic Scatter (BCES)
    translated from the FORTRAN code by Christina Bird and Matthew Bershady
    (Akritas & Bershady, 1996)

    Linear regression in the presence of heteroscedastic errors on both
    variables and intrinsic scatter

    Parameters
    ----------
    x1 : array of floats
        Independent variable, or observable
    x2 : array of floats
        Dependent variable
    x1err : array of floats (optional)
        Uncertainties on the independent variable
    x2err : array of floats (optional)
        Uncertainties on the dependent variable
    cerr : array of floats (optional)
        Covariances of the uncertainties in the dependent and independent
        variables
    logify : bool (default True)
        Whether to take the log of the measurements in order to estimate
        the best-fit power law instead of linear relation (note: not used
        in this implementation)
    model : {'yx', 'xy', 'bi', 'orth'}
        BCES model with which to calculate regression.  See Notes below
        for details.
    bootstrap : False or int (default 5000)
        get the errors from bootstrap resampling instead of the analytical
        prescription?  if bootstrap is an int, it is the number of
        bootstrap resamplings
    verbose : str (default 'normal')
        Verbose level.  Options are {'quiet', 'normal', 'debug'}
    full_output : bool (default True)
        If True, return also the covariance between the normalization and
        slope of the regression.

    Returns
    -------
    a : tuple of length 2
        Best-fit normalization and its uncertainty (a, da)
    b : tuple of length 2
        Best-fit slope and its uncertainty (b, db)

    Optional outputs
    ----------------
    cov_ab : 2x2 array of floats
        covariance between a and b.  Returned if full_output is set to
        True.

    Notes
    -----
    If verbose is normal or debug, the results from all the BCES models
    will be printed (still, only the one selected in *model* will be
    returned).

    the *model* parameter:
      -'yx' stands for BCES(Y|X)
      -'xy' stands for BCES(X|Y)
      -'bi' stands for BCES Bisector
      -'orth' stands for BCES Orthogonal
    """
    def _bess_bootstrap(npts, x1, x2, x1err, x2err, cerr, nsim):
        """
        Do the entire regression calculation for 4 slopes:
        OLS(Y|X), OLS(X|Y), bisector, orthogonal,
        vectorized over nsim bootstrap resamples.
        """
        # added by Gerrit, July 2014.  Unfortunately a copy of the _bess
        # function was needed for bootstrapping; it would be nicer if the
        # two could be combined.
        # calculate sigma's for datapoints using length of confidence
        # intervals
        sig11var = np.sum(x1err**2, axis=1, keepdims=True) / npts
        sig22var = np.sum(x2err**2, axis=1, keepdims=True) / npts
        sig12var = np.sum(cerr, axis=1, keepdims=True) / npts
        # calculate means and variances
        x1av = np.mean(x1, axis=1, keepdims=True)
        x1var = x1.var(axis=1, keepdims=True)
        x2av = np.mean(x2, axis=1, keepdims=True)
        x2var = x2.var(axis=1, keepdims=True)
        covar_x1x2 = np.mean((x1 - x1av) * (x2 - x2av),
                             axis=1, keepdims=True)
        # compute the regression slopes for OLS(X2|X1), OLS(X1|X2),
        # bisector and orthogonal
        if model == 'yx':
            modelint = 1
        else:
            modelint = 4
        b = np.zeros((modelint, nsim))
        b[0] = ((covar_x1x2 - sig12var) / (x1var - sig11var)).flatten()
        if model != 'yx':
            b[1] = ((x2var - sig22var) / (covar_x1x2 - sig12var)).flatten()
            b[2] = ((b[0]*b[1] - 1 +
                     np.sqrt((1 + b[0]**2) * (1 + b[1]**2))) /
                    (b[0] + b[1])).flatten()
            b[3] = 0.5 * ((b[1] - 1/b[0]) + np.sign(covar_x1x2).flatten() *
                          np.sqrt(4 + (b[1] - 1/b[0])**2))
        # compute intercepts for the above 4 cases:
        a = x2av.flatten() - b * x1av.flatten()
        # set up variables to calculate standard deviations of slope and
        # intercept
        xi = []
        xi.append(((x1 - x1av) *
                   (x2 - b[0].reshape(nsim, 1)*x1 -
                    a[0].reshape(nsim, 1)) +
                   b[0].reshape(nsim, 1)*x1err**2) /
                  (x1var - sig11var))
        if model != 'yx':
            xi.append(((x2 - x2av) *
                       (x2 - b[1].reshape(nsim, 1)*x1 -
                        a[1].reshape(nsim, 1)) + x2err**2) /
                      covar_x1x2)
            xi.append((xi[0] * (1 + b[1].reshape(nsim, 1)**2) +
                       xi[1] * (1 + b[0].reshape(nsim, 1)**2)) /
                      ((b[0].reshape(nsim, 1) + b[1].reshape(nsim, 1)) *
                       np.sqrt((1 + b[0].reshape(nsim, 1)**2) *
                               (1 + b[1].reshape(nsim, 1)**2))))
            xi.append((xi[0] / b[0].reshape(nsim, 1)**2 + xi[1]) *
                      b[3].reshape(nsim, 1) /
                      np.sqrt(4 + (b[1].reshape(nsim, 1) -
                                   1/b[0].reshape(nsim, 1))**2))
        zeta = []
        for i in range(modelint):
            zeta.append(x2 - b[i].reshape(nsim, 1)*x1 - x1av*xi[i])
        # calculate variance for all a and b
        bvar = np.zeros((4, nsim))
        avar = np.zeros((4, nsim))
        for i in range(modelint):
            bvar[i] = xi[i].var(axis=1, keepdims=False) / npts
            avar[i] = zeta[i].var(axis=1, keepdims=False) / npts
        return a, b, avar, bvar, xi, zeta

    def _bess(npts, x1, x2, x1err, x2err, cerr):
        """
        Do the entire regression calculation for 4 slopes:
        OLS(Y|X), OLS(X|Y), bisector, orthogonal
        """
        # calculate sigma's for datapoints using length of confidence
        # intervals
        sig11var = sum(x1err**2) / npts
        sig22var = sum(x2err**2) / npts
        sig12var = sum(cerr) / npts
        # calculate means and variances
        x1av = scipy.average(x1)
        x1var = scipy.std(x1)**2
        x2av = scipy.average(x2)
        x2var = scipy.std(x2)**2
        covar_x1x2 = sum((x1 - x1av) * (x2 - x2av)) / npts
        # compute the regression slopes for OLS(X2|X1), OLS(X1|X2),
        # bisector and orthogonal
        b = scipy.zeros(4)
        b[0] = (covar_x1x2 - sig12var) / (x1var - sig11var)
        b[1] = (x2var - sig22var) / (covar_x1x2 - sig12var)
        b[2] = (b[0]*b[1] - 1 + scipy.sqrt((1 + b[0]**2) *
                                           (1 + b[1]**2))) / (b[0] + b[1])
        b[3] = 0.5 * ((b[1] - 1/b[0]) + scipy.sign(covar_x1x2) *
                      scipy.sqrt(4 + (b[1] - 1/b[0])**2))
        # compute intercepts for the above 4 cases:
        a = x2av - b * x1av
        # set up variables to calculate standard deviations of slope
        # and intercept
        xi = []
        xi.append(((x1 - x1av) * (x2 - b[0]*x1 - a[0]) +
                   b[0]*x1err**2) / (x1var - sig11var))
        xi.append(((x2 - x2av) * (x2 - b[1]*x1 - a[1]) + x2err**2) /
                  covar_x1x2)
        xi.append((xi[0] * (1 + b[1]**2) + xi[1] * (1 + b[0]**2)) /
                  ((b[0] + b[1]) *
                   scipy.sqrt((1 + b[0]**2) * (1 + b[1]**2))))
        xi.append((xi[0] / b[0]**2 + xi[1]) * b[3] /
                  scipy.sqrt(4 + (b[1] - 1/b[0])**2))
        zeta = []
        for i in range(4):
            zeta.append(x2 - b[i]*x1 - x1av*xi[i])
        # calculate variance for all a and b
        bvar = scipy.zeros(4)
        avar = scipy.zeros(4)
        for i in range(4):
            bvar[i] = scipy.std(xi[i])**2 / npts
            avar[i] = scipy.std(zeta[i])**2 / npts
        return a, b, avar, bvar, xi, zeta

    def _bootspbec(npts, x, y, xerr, yerr, cerr):
        """
        Bootstrap samples
        """
        j = scipy.random.randint(npts, size=npts)
        xboot = x[j]
        xerrboot = xerr[j]
        yboot = y[j]
        yerrboot = yerr[j]
        cerrboot = cerr[j]
        return xboot, yboot, xerrboot, yerrboot, cerrboot

    # ---- Main routine starts here ---- #
    # convert to scipy arrays just in case
    x1 = scipy.array(x1)
    x2 = scipy.array(x2)
    x1err = scipy.array(x1err)
    x2err = scipy.array(x2err)
    cerr = scipy.array(cerr)
    models = [['yx', 'xy', 'bi', 'orth'],
              ['BCES(Y|X)', 'BCES(X|Y)',
               'BCES Bisector', 'BCES Orthogonal']]
    # which to return?
    j = models[0].index(model)
    npts = len(x1)
    # are the errors defined?
    if len(x1err) == 0:
        x1err = scipy.zeros(npts)
    if len(x2err) == 0:
        x2err = scipy.zeros(npts)
    if len(cerr) == 0:
        cerr = scipy.zeros(npts)
    if verbose == 'debug':
        print('x1 =', x1)
        print('x1err =', x1err)
        print('x2 =', x2)
        print('x2err =', x2err)
        print('cerr =', cerr)
        print('\n ** Returning values for', models[1][j], '**')
        if bootstrap is not False:
            print(' with errors from %d bootstrap resamplings' % bootstrap)
        print('')
    # calculate nominal fits
    bessresults = _bess(npts, x1, x2, x1err, x2err, cerr)
    (a, b, avar, bvar, xi, zeta) = bessresults
    # covariance between normalization and slope
    if full_output:
        covar_ab = scipy.cov(xi[j], zeta[j])
    if bootstrap is not False:
        # make bootstrap simulated datasets, and compute averages and
        # standard deviations of regression coefficients
        asum = scipy.zeros(4)
        assum = scipy.zeros(4)
        bsum = scipy.zeros(4)
        bssum = scipy.zeros(4)
        sda = scipy.zeros(4)
        sdb = scipy.zeros(4)
        for i in range(bootstrap):
            samples = _bootspbec(npts, x1, x2, x1err, x2err, cerr)
            (x1sim, x2sim, x1errsim, x2errsim, cerrsim) = samples
            besssim = _bess(npts, x1sim, x2sim, x1errsim, x2errsim, cerrsim)
            (asim, bsim, avarsim, bvarsim, xi, zeta) = besssim
            asum += asim
            assum += asim**2
            bsum += bsim
            bssum += bsim**2
        aavg = asum / bootstrap
        bavg = bsum / bootstrap
        for i in range(4):
            sdtest = assum[i] - bootstrap * aavg[i]**2
            if sdtest > 0:
                sda[i] = scipy.sqrt(sdtest / (bootstrap - 1))
            sdtest = bssum[i] - bootstrap * bavg[i]**2
            if sdtest > 0:
                sdb[i] = scipy.sqrt(sdtest / (bootstrap - 1))
    if verbose in ('normal', 'debug'):
        print('%s   B          err(B)' % 'Fit'.ljust(19), end=' ')
        print('     A          err(A)')
        for i in range(4):
            print('%s %9.2e +/- %8.2e    %10.3e +/- %9.3e'
                  % (models[1][i].ljust(16), b[i], scipy.sqrt(bvar[i]),
                     a[i], scipy.sqrt(avar[i])))
            if bootstrap is not False:
                print('%s %9.2e +/- %8.2e    %10.3e +/- %9.3e'
                      % ('bootstrap'.ljust(16), bavg[i], sdb[i],
                         aavg[i], sda[i]))
            print('')
        if verbose == 'debug':
            print('cov[%s] =' % models[1][j])
            print(covar_ab)
    if bootstrap is not False:
        if full_output:
            return (a[j], sda[j]), (b[j], sdb[j]), covar_ab
        return (a[j], sda[j]), (b[j], sdb[j])
    if full_output:
        out = ((a[j], scipy.sqrt(avar[j])),
               (b[j], scipy.sqrt(bvar[j])),
               covar_ab)
    else:
        out = ((a[j], scipy.sqrt(avar[j])),
               (b[j], scipy.sqrt(bvar[j])))
    return out
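# The vectorized _bess_bootstrap above expects inputs of shape (nsim, npts):
# one bootstrap resample per row, with keepdims=True keeping the per-row
# means and variances broadcastable.  A minimal sketch of building such
# inputs (illustrative names, numpy only):
import numpy as np

rng = np.random.default_rng(0)
npts, nsim = 100, 500
x1 = rng.normal(size=npts)
idx = rng.integers(npts, size=(nsim, npts))   # one row of indices per resample
x1boot = x1[idx]                              # shape (nsim, npts)
x1av = x1boot.mean(axis=1, keepdims=True)     # shape (nsim, 1), broadcasts
print(x1boot.shape, x1av.shape)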
# define mean term
# (fragment: Y, XX, N, P, LinearMean, SP (scipy) and LA (scipy.linalg) are
# assumed to be defined earlier in the test script)
mean = LinearMean(Y)
print(mean.Y)

# add first fixed effect
F = 1. * (SP.rand(N, 2) < 0.2)
A = SP.eye(P)
mean.addFixedEffect(F=F, A=A)
# add second fixed effect
F = 1. * (SP.rand(N, 3) < 0.2)
A = SP.ones((1, P))
mean.addFixedEffect(F=F, A=A)

# rotate by rows and columns
C = SP.cov(Y.T)
Sc, Uc = LA.eigh(C)
Sr, Ur = LA.eigh(XX)
d = SP.kron(Sc, Sr)
mean.d = d
mean.Lc = Uc.T
mean.Lr = Ur.T
mean.LRLdiag = Sr
mean.LCL = C**2

if 1:
    # calculate stuff to see if it goes through
    print(mean.Ystar())
    print(mean.Yhat())
    print(mean.Xstar())
    print(mean.Xhat())
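# The rotation above relies on a Kronecker identity: if C = Uc diag(Sc) Uc.T
# and XX = Ur diag(Sr) Ur.T, then kron(Sc, Sr) are exactly the eigenvalues
# of kron(C, XX).  A minimal numpy/scipy check of that identity, independent
# of limix (shapes illustrative):
import numpy as np
import scipy.linalg as LA

rng = np.random.default_rng(0)
C = np.cov(rng.standard_normal((10, 4)).T)   # 4x4 column covariance
XX = np.eye(10)                              # stand-in row covariance
Sc, Uc = LA.eigh(C)
Sr, Ur = LA.eigh(XX)
d = np.kron(Sc, Sr)                          # candidate eigenvalues of kron(C, XX)
np.testing.assert_allclose(np.sort(d),
                           np.sort(LA.eigvalsh(np.kron(C, XX))), atol=1e-8)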
from numpy import arange, loadtxt, mat, shape
from scipy import cov, linalg

# load columns 0-12 of the tab-separated file; with unpack=True each row of
# data2 is one variable
data2 = mat(loadtxt('raw3.dat', delimiter='\t',
                    usecols=arange(0, 13, 1), unpack=True))
time_series = mat(cov(data2, rowvar=1))
print('covariance matrix : ', shape(time_series))
eigvals, eigvecs = linalg.eig(mat(time_series))
print(shape(eigvals), shape(eigvecs))
print(abs(eigvecs))
print(abs(eigvals))
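# A modern equivalent of the snippet above using numpy alone (pylab.load is
# long gone); the random matrix simply stands in for raw3.dat:
import numpy as np

data = np.random.default_rng(0).standard_normal((13, 200))   # 13 variables
C = np.cov(data, rowvar=True)             # 13x13 covariance of the rows
eigvals, eigvecs = np.linalg.eigh(C)      # eigh, since C is symmetric
print('covariance matrix :', C.shape)
print(eigvals.shape, eigvecs.shape)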
def bces(x1, x2, x1err=None, x2err=None, cerr=None, nsim=5000, model='yx',
         bootstrap=5000, verbose='normal', full_output=True):
    """
    Bivariate, Correlated Errors and intrinsic Scatter (BCES)
    translated from the FORTRAN code by Christina Bird and Matthew Bershady
    (Akritas & Bershady, 1996)

    Linear regression in the presence of heteroscedastic errors on both
    variables and intrinsic scatter

    Parameters
    ----------
    x1 : array of floats
        Independent variable, or observable
    x2 : array of floats
        Dependent variable
    x1err : array of floats (optional)
        Uncertainties on the independent variable
    x2err : array of floats (optional)
        Uncertainties on the dependent variable
    cerr : array of floats (optional)
        Covariances of the uncertainties in the dependent and independent
        variables
    nsim : int (default 5000)
        Number of bootstrap samples for uncertainties on best-fit
        parameters
    model : {'yx', 'xy', 'bi', 'orth'}
        BCES model with which to calculate regression.  See Notes below
        for details.
    bootstrap : False or int (default 5000)
        get the errors from bootstrap resampling (with nsim resamplings)
        instead of the analytical prescription?
    verbose : str (default 'normal')
        Verbose level.  Options are {'quiet', 'normal', 'debug'}
    full_output : bool (default True)
        If True, return also the covariance between the normalization and
        slope of the regression.

    Returns
    -------
    a : tuple of length 2
        Best-fit normalization and its uncertainty (a, da)
    b : tuple of length 2
        Best-fit slope and its uncertainty (b, db)

    Optional outputs
    ----------------
    cov_ab : 2x2 array of floats
        covariance between a and b.  Returned if full_output is set to
        True.

    Notes
    -----
    If verbose is normal or debug, the results from all the BCES models
    will be printed (still, only the one selected in *model* will be
    returned).

    the *model* parameter:
      -'yx' stands for BCES(Y|X)
      -'xy' stands for BCES(X|Y)
      -'bi' stands for BCES Bisector
      -'orth' stands for BCES Orthogonal
    """
    def _bess(npts, x1, x2, x1err, x2err, cerr):
        """
        Do the entire regression calculation for 4 slopes:
        OLS(Y|X), OLS(X|Y), bisector, orthogonal
        """
        # calculate sigma's for datapoints using length of confidence
        # intervals
        sig11var = sum(x1err**2) / npts
        sig22var = sum(x2err**2) / npts
        sig12var = sum(cerr) / npts
        # calculate means and variances
        x1av = scipy.average(x1)
        x1var = scipy.std(x1)**2
        x2av = scipy.average(x2)
        x2var = scipy.std(x2)**2
        covar_x1x2 = sum((x1 - x1av) * (x2 - x2av)) / npts
        # compute the regression slopes for OLS(X2|X1), OLS(X1|X2),
        # bisector and orthogonal
        b = scipy.zeros(4)
        b[0] = (covar_x1x2 - sig12var) / (x1var - sig11var)
        b[1] = (x2var - sig22var) / (covar_x1x2 - sig12var)
        b[2] = (b[0]*b[1] - 1 + scipy.sqrt((1 + b[0]**2) *
                                           (1 + b[1]**2))) / (b[0] + b[1])
        b[3] = 0.5 * ((b[1] - 1/b[0]) + scipy.sign(covar_x1x2) *
                      scipy.sqrt(4 + (b[1] - 1/b[0])**2))
        # compute intercepts for the above 4 cases:
        a = x2av - b * x1av
        # set up variables to calculate standard deviations of slope and
        # intercept
        xi = []
        xi.append(((x1 - x1av) * (x2 - b[0]*x1 - a[0]) +
                   b[0]*x1err**2) / (x1var - sig11var))
        xi.append(((x2 - x2av) * (x2 - b[1]*x1 - a[1]) + x2err**2) /
                  covar_x1x2)
        xi.append((xi[0] * (1 + b[1]**2) + xi[1] * (1 + b[0]**2)) /
                  ((b[0] + b[1]) *
                   scipy.sqrt((1 + b[0]**2) * (1 + b[1]**2))))
        xi.append((xi[0] / b[0]**2 + xi[1]) * b[3] /
                  scipy.sqrt(4 + (b[1] - 1/b[0])**2))
        zeta = []
        for i in range(4):
            zeta.append(x2 - b[i]*x1 - x1av*xi[i])
        # calculate variance for all a and b
        bvar = scipy.zeros(4)
        avar = scipy.zeros(4)
        for i in range(4):
            bvar[i] = scipy.std(xi[i])**2 / npts
            avar[i] = scipy.std(zeta[i])**2 / npts
        return a, b, avar, bvar, xi, zeta

    def _bootspbec(npts, x, y, xerr, yerr, cerr):
        """
        Bootstrap samples
        """
        j = scipy.random.randint(npts, size=npts)
        xboot = x[j]
        xerrboot = xerr[j]
        yboot = y[j]
        yerrboot = yerr[j]
        cerrboot = cerr[j]
        return xboot, yboot, xerrboot, yerrboot, cerrboot

    # ---- Main routine starts here ---- #
    models = [['yx', 'xy', 'bi', 'orth'],
              ['BCES(Y|X)', 'BCES(X|Y)',
               'BCES Bisector', 'BCES Orthogonal']]
    # which to return?
    j = models[0].index(model)
    npts = len(x1)
    # are the errors defined?
    if x1err is None:
        x1err = scipy.zeros(npts)
    if x2err is None:
        x2err = scipy.zeros(npts)
    if cerr is None:
        cerr = scipy.zeros(npts)
        #cerr = scipy.cov(x1err, x2err)[1][0] * scipy.ones(npts)
    if verbose == 'debug':
        print('x1 =', x1)
        print('x1err =', x1err)
        print('x2 =', x2)
        print('x2err =', x2err)
        print('cerr =', cerr)
        print('\n ** Returning values for', models[1][j], '**')
        if bootstrap is not False:
            print(' with errors from %d bootstrap resamplings' % nsim)
        print('')
    # calculate nominal fits
    bessresults = _bess(npts, x1, x2, x1err, x2err, cerr)
    (a, b, avar, bvar, xi, zeta) = bessresults
    # covariance between normalization and slope
    if full_output:
        covar_ab = scipy.cov(xi[j], zeta[j])
    if bootstrap is not False:
        # make bootstrap simulated datasets, and compute averages and
        # standard deviations of regression coefficients
        asum = scipy.zeros(4)
        assum = scipy.zeros(4)
        bsum = scipy.zeros(4)
        bssum = scipy.zeros(4)
        sda = scipy.zeros(4)
        sdb = scipy.zeros(4)
        for i in range(nsim):
            samples = _bootspbec(npts, x1, x2, x1err, x2err, cerr)
            (x1sim, x2sim, x1errsim, x2errsim, cerrsim) = samples
            besssim = _bess(npts, x1sim, x2sim, x1errsim, x2errsim, cerrsim)
            (asim, bsim, avarsim, bvarsim, xi, zeta) = besssim
            asum += asim
            assum += asim**2
            bsum += bsim
            bssum += bsim**2
        aavg = asum / nsim
        bavg = bsum / nsim
        for i in range(4):
            sdtest = assum[i] - nsim * aavg[i]**2
            if sdtest > 0:
                sda[i] = scipy.sqrt(sdtest / (nsim - 1))
            sdtest = bssum[i] - nsim * bavg[i]**2
            if sdtest > 0:
                sdb[i] = scipy.sqrt(sdtest / (nsim - 1))
    if verbose in ('normal', 'debug'):
        print('%s   B          err(B)' % 'Fit'.ljust(19), end=' ')
        print('     A          err(A)')
        for i in range(4):
            print('%s %9.2e +/- %8.2e    %10.3e +/- %9.3e'
                  % (models[1][i].ljust(16), b[i], scipy.sqrt(bvar[i]),
                     a[i], scipy.sqrt(avar[i])))
            if bootstrap is not False:
                print('%s %9.2e +/- %8.2e    %10.3e +/- %9.3e'
                      % ('bootstrap'.ljust(16), bavg[i], sdb[i],
                         aavg[i], sda[i]))
            print('')
        if verbose == 'debug':
            print('cov[%s] =' % models[1][j])
            print(covar_ab)
    if bootstrap is not False:
        if full_output:
            return (a[j], sda[j]), (b[j], sdb[j]), covar_ab
        return (a[j], sda[j]), (b[j], sdb[j])
    if full_output:
        return ((a[j], scipy.sqrt(avar[j])),
                (b[j], scipy.sqrt(bvar[j])), covar_ab)
    return (a[j], scipy.sqrt(avar[j])), (b[j], scipy.sqrt(bvar[j]))
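# A minimal usage sketch for the function above, on synthetic data (not from
# the original source).  Note the implementation relies on legacy top-level
# scipy aliases (scipy.average, scipy.std, scipy.zeros, ...), so it needs an
# older scipy or a mechanical port to numpy.
import numpy as np

rng = np.random.default_rng(1)
n = 200
x_true = rng.uniform(0, 10, n)
x1err = np.full(n, 0.2)
x2err = np.full(n, 0.5)
x1 = x_true + rng.normal(0, x1err)              # observed with x-errors
x2 = 1.0 + 2.0*x_true + rng.normal(0, x2err)    # true intercept 1, slope 2

(a, da), (b, db), cov_ab = bces(x1, x2, x1err=x1err, x2err=x2err,
                                nsim=2000, model='yx', verbose='quiet')
print('a = %.2f +/- %.2f, b = %.2f +/- %.2f' % (a, da, b, db))   # b ~ 2.0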
from numpy import corrcoef, cov, median


def main():
    a = [[1, 2, 3], [4, 5, 6]]
    print(median(a))
    print(corrcoef(a))
    print(cov(a))


if __name__ == '__main__':
    main()
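# For reference, numpy treats each row of a as a variable here, so cov and
# corrcoef return 2x2 matrices; both rows deviate identically from their
# means, hence unit covariance and perfect correlation.  A quick check:
import numpy as np

a = [[1, 2, 3], [4, 5, 6]]
assert np.median(a) == 3.5                        # median of all six elements
assert np.allclose(np.cov(a), [[1, 1], [1, 1]])   # sample covariance, ddof=1
assert np.allclose(np.corrcoef(a), 1.0)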
""" Name : c8_19_Roll_spread.py Book : Python for Finance (2nd ed.) Publisher: Packt Publishing Ltd. Author : Yuxing Yan Date : 6/6/2017 email : [email protected] [email protected] """ from matplotlib.finance import quotes_historical_yahoo_ochl as getData import scipy as sp ticker='IBM' begdate=(2013,9,1) enddate=(2013,11,11) data= getData(ticker, begdate, enddate,asobject=True, adjusted=True) p=data.aclose d=sp.diff(p) cov_=sp.cov(d[:-1],d[1:]) if cov_[0,1]<0: print("Roll spread for ", ticker, 'is', round(2*sp.sqrt(-cov_[0,1]),3)) else: print("Cov is positive for ",ticker, 'positive', round(cov_[0,1],3))
# let's test the Roll spread for IBM
# let's import the modules we'll be using (numpy replaces the removed
# top-level scipy aliases sp.diff/sp.cov/sp.sqrt)
import numpy as np
import yfinance as yf

# download data; auto_adjust=False keeps the 'Adj Close' column
data = yf.download('IBM', start='2013-9-1', end='2013-11-11',
                   auto_adjust=False)
'''
Key note: the Roll spread is appropriate for high-frequency data.
For purposes of demonstration, however, we'll use daily historical
data from IBM.
'''
# determine the change in prices
returns = np.diff(data['Adj Close'].to_numpy().ravel())
# covariance matrix of consecutive price changes;
# covariance[0, 1] is the off-diagonal (lag-1 serial covariance) element
covariance = np.cov(returns[:-1], returns[1:])
if covariance[0, 1] < 0:
    print("Roll spread for IBM is",
          round(2 * np.sqrt(-covariance[0, 1]), 3))
else:
    print("Cov is positive for IBM ", round(covariance[0, 1], 3))
'''
When the Roll value is positive, Roll's model fails.  In the real world
this occurs in many cases.  Practitioners usually adopt one of two
approaches: when the spread is undefined, either ignore those cases or use
other methods to estimate the spread; alternatively, add a negative sign in
front of the positive covariance.
'''
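# A synthetic check of Roll's estimator (illustrative parameters): simulate
# an efficient price random walk, add a bid-ask bounce of +/- s/2, and
# verify that 2*sqrt(-cov) recovers the spread s.
import numpy as np

rng = np.random.default_rng(7)
s = 0.10                                       # true spread
m = np.cumsum(rng.normal(0, 0.02, 100_000))    # efficient (mid) price
q = rng.choice([-1.0, 1.0], size=m.size)       # trade direction
p = m + q*s/2                                  # observed trade prices

d = np.diff(p)
cov01 = np.cov(d[:-1], d[1:])[0, 1]            # lag-1 serial covariance ~ -s**2/4
print(round(2*np.sqrt(-cov01), 3))             # ~0.1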