def pearsonr(x, y): """ generalized from scipy.stats.pearsonr """ # x and y should have same length. x_shape = x.shape if len(x_shape) > 1: x = x.reshape((x_shape[0],prod(x_shape[1:]))) x = np.asarray(x) y = np.asarray(y) n = len(x) mx = x.mean(0) my = y.mean(0) xm, ym = x-mx, y-my r_num = n*np.dot(xm.T,ym) r_den = n*np.sqrt(np.outer(ss(xm),ss(ym,0))) r = (r_num / r_den) # Presumably, if r > 1, then it is only some small artifact of floating # point arithmetic. r = np.minimum(r, 1.0) df = n-2 # Use a small floating point value to prevent divide-by-zero nonsense # fixme: TINY is probably not the right value and this is probably not # the way to be robust. The scheme used in spearmanr is probably better. TINY = 1.0e-20 t = r*np.sqrt(df/((1.0-r+TINY)*(1.0+r+TINY))) prob = betai(0.5*df,0.5,df/(df+t*t)) return r,prob
def vectorized_correlation(x, y):
    """Compute correlation coefficient between arrays with vectorization.

    Parameters
    ----------
    x, y : array-like
        Dimensions on the final axis should match; computation will be
        vectorized over preceding axes. Dimensions will be matched, or
        broadcast, depending on shapes. In other words, passing two
        (m x n) arrays will compute the correlation between each pair of
        rows and return a vector of length m. Passing one vector of
        length n and one array of shape (m x n) will compute the
        correlation between the vector and each row in the array, also
        returning a vector of length m.

    Returns
    -------
    r : array
        Correlation coefficient(s).

    """
    x, y = np.asarray(x), np.asarray(y)
    mx = x.mean(axis=-1)
    my = y.mean(axis=-1)
    xm, ym = x - mx[..., None], y - my[..., None]
    r_num = np.add.reduce(xm * ym, axis=-1)
    r_den = np.sqrt(stats.ss(xm, axis=-1) * stats.ss(ym, axis=-1))
    r = r_num / r_den
    return r
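# Quick sanity-check sketch for vectorized_correlation: compare against
# scipy.stats.pearsonr row by row.  Assumes numpy/scipy are importable; if this
# SciPy build no longer ships stats.ss, a sum-of-squares helper is patched in.
import numpy as np
from scipy import stats

if not hasattr(stats, 'ss'):
    stats.ss = lambda a, axis=0: np.sum(np.asarray(a) ** 2, axis=axis)

_rng = np.random.default_rng(0)
_x = _rng.normal(size=(4, 100))
_y = _rng.normal(size=(4, 100))
_r = vectorized_correlation(_x, _y)                    # shape (4,): one r per row pair
_ref = [stats.pearsonr(a, b)[0] for a, b in zip(_x, _y)]
print(np.allclose(_r, _ref))                           # expected: True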
def fit(xdata, ydata):
    """Calculate 2D regression.

    Args:
        xdata (numpy.ndarray): 1D array of independent data [ntim], where
            ntim is the number of time points (or other independent points).
        ydata (numpy.ndarray): 2D array of dependent data [ntim, nspat],
            where nspat is the number of spatial points (or other dependent
            points).

    Returns:
        numpy.ndarray of dimension [5, nspat]. The 5 outputs are: slope,
        intercept, Pearson's correlation coefficient, two-sided p-value for
        a hypothesis test with null hypothesis that the slope is zero,
        standard error for the slope estimate.
    """
    # Small number to prevent divide-by-zero errors
    TINY = 1.0e-20

    # Dimensions
    ntim = xdata.shape[0]
    nspat = ydata.shape[1]

    # Add a constant (1) to the xdata to allow for intercept calculation
    xdata_plus_const = utils.add_constant(xdata)

    # Calculate parameters of the regression by solving the OLS problem
    # in its matrix form
    mat1 = np.swapaxes(
        np.dot(xdata_plus_const.T, (xdata_plus_const[np.newaxis, :, :])), 0, 1)
    mat2 = np.dot(xdata_plus_const.T, ydata)
    beta = np.linalg.solve(mat1, mat2.T)
    output = beta.T

    # Pearson correlation coefficient
    xm, ym = xdata - xdata.mean(0), ydata - ydata.mean(0)
    r_num = np.dot(xm, ym)
    r_den = np.sqrt(stats.ss(xm) * stats.ss(ym))
    pearson_r = r_num / r_den

    # Two-sided p-value for a hypothesis test whose null hypothesis is that
    # the slope is zero.
    df = ntim - 2
    tval = pearson_r * np.sqrt(df / ((1.0 - pearson_r + TINY) *
                                     (1.0 + pearson_r + TINY)))
    pval = stats.distributions.t.sf(np.abs(tval), df) * 2

    # Standard error of the slope estimate
    sst = np.sum(ym ** 2, 0)
    ssr = (output[0, :] ** 2) * np.sum(xm ** 2)
    se = np.sqrt((1. / df) * (sst - ssr))
    stderr = se / np.sqrt(np.sum(xm ** 2))

    return np.vstack([output, pearson_r, pval, stderr])
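# Hypothetical cross-check for fit(): each column of its [5, nspat] output
# (slope, intercept, r, p-value, stderr) should match scipy.stats.linregress
# run on one column of ydata at a time.  The call to fit() itself is left
# commented out because it depends on utils.add_constant and stats.ss.
import numpy as np
from scipy import stats

_xdata = np.arange(30, dtype=float)
_ydata = 0.5 * _xdata[:, None] + np.random.default_rng(4).normal(size=(30, 3))
# result = fit(_xdata, _ydata)                          # shape (5, 3)
slope, intercept, rval, pval, stderr = stats.linregress(_xdata, _ydata[:, 0])
print(slope, intercept, rval, pval, stderr)             # reference values for column 0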
def mypearsonr(x, y):
    """
    Calculates a Pearson correlation coefficient and the p-value for testing
    non-correlation.

    The Pearson correlation coefficient measures the linear relationship
    between two datasets. Strictly speaking, Pearson's correlation requires
    that each dataset be normally distributed. Like other correlation
    coefficients, this one varies between -1 and +1 with 0 implying no
    correlation. Correlations of -1 or +1 imply an exact linear relationship.
    Positive correlations imply that as x increases, so does y. Negative
    correlations imply that as x increases, y decreases.

    The p-value roughly indicates the probability of an uncorrelated system
    producing datasets that have a Pearson correlation at least as extreme as
    the one computed from these datasets. The p-values are not entirely
    reliable but are probably reasonable for datasets larger than 500 or so.

    Parameters
    ----------
    x : (N,) array_like
        Input
    y : (N,) array_like
        Input

    Returns
    -------
    (Pearson's correlation coefficient, 2-tailed p-value)

    References
    ----------
    http://www.statsoft.com/textbook/glosp.html#Pearson%20Correlation
    """
    # x and y should have same length.
    x = np.asarray(x)
    y = np.asarray(y)
    n = len(x)
    mx = x.mean()
    my = y.mean()
    xm, ym = x - mx, y - my
    r_num = np.add.reduce(xm * ym)
    r_den = np.sqrt(stats.ss(xm) * stats.ss(ym))
    r = r_num / r_den
    # r = max(min(r, 1.0), -1.0)

    df = n - 2
    # Note: r.all() collapses an array-valued r to a single bool, so this
    # guard is only meaningful when r is a scalar.
    if abs(r.all()) == 1.0:
        prob = 0.0
    else:
        t_squared = r * r * (df / ((1.0 - r) * (1.0 + r)))
        prob = betai(0.5 * df, 0.5, df / (df + t_squared))
    return r, prob
def calcBrownsCombinedPnanRemove(snpSet, genotypeArray):
    nSNPs = snpSet.nSNPs
    pValArray, adjpValArray = snpSet.getPvalues()
    chisq = sum(-2 * np.log(pValArray))
    adjchisq = sum(-2 * np.log(adjpValArray))

    # Drop genotype columns (samples) that contain missing data
    colsWithMissingData = np.where(np.isnan(genotypeArray))[1]
    genotypeArray = np.delete(genotypeArray, colsWithMissingData, 1)

    ms = genotypeArray.mean(axis=1)[(slice(None, None, None), None)]
    datam = genotypeArray - ms
    datass = np.sqrt(stats.ss(datam, axis=1))

    runningSum = 0
    for i in xrange(nSNPs - 1):
        temp = np.dot(datam[i:], datam[i].T)
        d = (datass[i:] * datass[i])
        rs = temp / d
        rs = np.absolute(rs)[1:]
        runningSum += 3.25 * np.sum(rs) + .75 * np.dot(rs, rs)

    sigmaSq = 4 * nSNPs + 2 * runningSum
    E = 2 * nSNPs
    df = (2 * (E * E)) / sigmaSq
    runningSum = sigmaSq / (2 * E)
    d = chisq / runningSum
    adjd = adjchisq / runningSum
    brownsP = stats.chi2.sf(d, df)
    adjBrownsP = stats.chi2.sf(adjd, df)
    return brownsP, adjBrownsP
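# Sketch of the rescaled chi-square behind Brown's method, mirroring the
# bookkeeping in calcBrownsCombinedPnanRemove above: X = -2*sum(log(p_i)) has
# mean E = 2k and, for correlated tests, variance var = 4k + 2*cov_sum, where
# cov_sum is the pairwise covariance term accumulated in the loop (there via
# the polynomial approximation 3.25*r + 0.75*r**2).  X/c is then treated as
# chi-square with f degrees of freedom; with cov_sum = 0 this reduces to
# Fisher's method (c = 1, f = 2k).
from scipy import stats


def browns_p(chisq, k, cov_sum):
    var = 4.0 * k + 2.0 * cov_sum
    E = 2.0 * k
    f = 2.0 * E * E / var        # effective degrees of freedom
    c = var / (2.0 * E)          # scale factor
    return stats.chi2.sf(chisq / c, f)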
def select_ss(dm, levels, included):
    bign = len(dm)
    # Materialize the pairs into a list: stats.ss expects an array_like,
    # not a generator object.
    distances = [dm[i][j] for i, j in above_diagonal(bign)
                 if included(levels[i], levels[j])]
    return stats.ss(distances)
def f_twoway(dm, levels):
    bign = len(levels)                       # number of observations
    dm = np.asarray(dm)                      # distance matrix
    l = len(set(levels))                     # number of levels
    a = len(set([l[0] for l in levels]))     # number of a-levels
    b = len(set([l[1] for l in levels]))     # number of b-levels
    n = bign / float(a * b)                  # number of observations per level

    # sum of all distances (materialize the chained rows: stats.ss expects
    # an array_like, not a generator)
    ## sst = np.sum(stats.ss(r) for r in
    ##              (s[n+1:] for n, s in enumerate(dm[:-1]))) / float(bign)
    sst = stats.ss(list(chain(*(r[i + 1:] for i, r in enumerate(dm))))) / float(bign)

    # same level of both a and b (error, within-group)
    ssr = select_ss(dm, levels, lambda a, b: a == b) / float(n)
    # same level of a
    sswa = select_ss(dm, levels, lambda a, b: a[0] == b[0]) / float(b * n)
    # same level of b
    sswb = select_ss(dm, levels, lambda a, b: a[1] == b[1]) / float(a * n)

    ssa = sst - sswa                 # effect of a
    ssb = sst - sswb                 # effect of b
    ssab = sst - ssa - ssb - ssr     # interaction sum-of-squares

    # these should each be separate functions?
    f_interaction = (ssab / float((a - 1) * (b - 1))) / (ssr / float(bign - a * b))
    f_a = (ssa / float((a - 1))) / (ssr / float(bign - a * b))
    f_b = (ssb / float((b - 1))) / (ssr / float(bign - a * b))
    return (f_interaction, f_a, f_b)
def f_oneway(dm, levels):
    bign = len(levels)      # number of observations
    dm = np.asarray(dm)     # distance matrix
    a = len(set(levels))    # number of levels
    n = bign / a            # number of observations per level
    assert dm.shape == (bign, bign)  # check the dist matrix is square and the
                                     # size corresponds to the length of levels

    # total sum of squared distances
    sst = np.sum(stats.ss(r) for r in
                 (s[n + 1:] for n, s in enumerate(dm[:-1]))) / float(bign)

    # sum of within-group squares
    # itertools.combinations(xrange(len(dm)), 2)  # top half of dm
    ssw = np.sum((dm[i][j] ** 2
                  for i, j in product(xrange(len(dm)), xrange(1, len(dm)))
                  if i < j and levels[i] == levels[j])) / float(n)

    ssa = sst - ssw
    fstat = (ssa / float(a - 1)) / (ssw / float(bign - a))
    # print (fstat, sst, ssa, ssw, a, bign, n)
    return fstat
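# Self-contained NumPy re-implementation sketch of the same one-way pseudo-F
# (a PERMANOVA-style statistic on a distance matrix), not the author's code:
# SST = sum of squared distances over all pairs / N, SSW = within-group pairs / n.
# Assumes a balanced design, like f_oneway above.
import numpy as np


def permanova_f(dm, labels):
    dm = np.asarray(dm, dtype=float)
    labels = np.asarray(labels)
    N = len(labels)
    a = len(np.unique(labels))
    n = N // a                               # observations per group
    iu = np.triu_indices(N, k=1)             # upper-triangle pair indices
    sst = np.sum(dm[iu] ** 2) / N
    within = labels[iu[0]] == labels[iu[1]]
    ssw = np.sum(dm[iu][within] ** 2) / n
    ssa = sst - ssw
    return (ssa / (a - 1)) / (ssw / (N - a))


# Tiny illustration: two well-separated groups of points on a line.
_pts = np.array([0.0, 0.1, 0.2, 5.0, 5.1, 5.2])
_dm = np.abs(_pts[:, None] - _pts[None, :])
print(permanova_f(_dm, ['a', 'a', 'a', 'b', 'b', 'b']))   # large F expected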
def fastPearsonCorrelation(TC):
    N = TC.shape[1]
    corr, TCm = fastCovariance(TC)
    TCss = sqrt(ss(TCm, axis=1))
    for i in xrange(N):
        corr[i, i:] /= TCss[i:] * TCss[i]
        corr[i:, i] = corr[i, i:]
    return where(isfinite(corr), corr, 0.)
def pearson(x, y):
    """ Correlates row vector x with each row vector in 2D array y. """
    data = np.vstack((x, y))
    ms = data.mean(axis=1)[(slice(None, None, None), None)]
    datam = data - ms
    datass = np.sqrt(ss(datam, axis=1))
    temp = np.dot(datam[1:], datam[0].T)
    rs = temp / (datass[1:] * datass[0])
    return rs
def pearson(self, x, y): """ Correlates row vector x with each row vector in 2D array y. """ data = np.vstack((x,y)) ms = data.mean(axis=1)[(slice(None,None,None),None)] datam = data - ms datass = np.sqrt(ss(datam,axis=1)) temp = np.dot(datam[1:],datam[0].T) rs = temp / (datass[1:]*datass[0]) return rs
def repeated_oneway(data):
    n = data.shape[0]
    k = data.shape[1]
    grand_mean = np.mean(data)
    measurement_mean = np.mean(data, axis=0)
    subject_mean = np.mean(data, axis=1)
    ssb = n * st.ss(measurement_mean - grand_mean)
    # ssw = st.ss(data - measurement_mean)
    ssw = np.sum(st.ss(data - measurement_mean))
    sss = k * st.ss(subject_mean - grand_mean)
    sse = ssw - sss
    dfb = k - 1
    dfe = (n - 1) * (k - 1)
    msb = ssb / float(dfb)
    mse = sse / float(dfe)
    f = msb / mse
    p = st.fprob(dfb, dfe, f)
    return f, p
def pearson(x, y):
    """ Correlates row vector x with each row vector in 2D array y.
    From neurosynth.stats.py - author: Tal Yarkoni
    """
    data = np.vstack((x, y))
    ms = data.mean(axis=1)[(slice(None, None, None), None)]
    datam = data - ms
    datass = np.sqrt(ss(datam, axis=1))
    temp = np.dot(datam[1:], datam[0].T)
    rs = temp / (datass[1:] * datass[0])
    return rs
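# Usage sketch for pearson(): correlate one vector against every row of a 2D
# array and check against scipy.stats.pearsonr.  Assumes numpy as np and an
# `ss` helper (e.g. the stand-in defined near the top of this file) are in scope.
import numpy as np
from scipy.stats import pearsonr

_v = np.random.default_rng(1).normal(size=50)
_m = np.random.default_rng(2).normal(size=(10, 50))
_rs = pearson(_v, _m)                              # shape (10,)
_ref = [pearsonr(_v, row)[0] for row in _m]
print(np.allclose(_rs, _ref))                      # expected: True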
def jackknife_bias_correct(pairs, confidence=None, return_all=False,
                           nan_remove=True, return_raw=False):
    '''
    Return jackknife-bias-corrected estimate from estimate-nsamples pairs

    Pairs can be either a list of tuples, or a 2 x nestimates array.

    If 'confidence' is between 0 and 1, return the mean with lower and upper
    bounds at -/+ the confidence interval. If 'confidence' is None, return
    the mean and standard error. If 'return_all' is True, return the mean,
    standard error, number of points, and confidence interval size.
    '''
    data = asarray(pairs)
    if nan_remove:
        data = data[isfinite(data)[:, 0], :]
    y = data[:, 0]
    x = 1. / data[:, 1]
    n = len(x)

    # Compute linear regression and standard error of intercept
    (slope, intercept, r, p, slope_se) = linregress(x, y)
    intercept_se = slope_se * sqrt(ss(x) / n)

    # Return mean and SE if no value is specified:
    if confidence is None:
        if return_all:
            if return_raw:
                # Note: 'np' here is a local array of sample sizes,
                # not the numpy module.
                np = data[:, 1]
                max_n = max(np)
                raw_mean = mean(data[np == max_n, 0])
                return intercept, intercept_se, n, raw_mean
            else:
                return intercept, intercept_se, n
        else:
            return intercept, intercept_se
    # Otherwise return intercept with confidence interval
    else:
        t_int = t._ppf((1 + confidence) / 2, n - 2)
        intercept_int = t_int * intercept_se
        if return_all:
            if return_raw:
                np = data[:, 1]
                max_n = max(np)
                raw_mean = mean(data[np == max_n, 0])
                return intercept, intercept_se, n, intercept_int, raw_mean
            else:
                return intercept, intercept_se, n, intercept_int
        else:
            return intercept, intercept - intercept_int, intercept + intercept_int
def test4(TC):
    t0 = time()
    ms = TC.mean(axis=0)[(slice(None, None, None), None)]
    TCm = TC.T - ms
    TCss = sqrt(ss(TCm, axis=1))
    N = TC.shape[1]
    corr = zeros((N, N))
    for i in xrange(N):
        corr[i, i:] = dot(TCm[i:], TCm[i].T)
        corr[i, i:] /= TCss[i:] * TCss[i]
        corr[i:, i] = corr[i, i:]
    print 'Pearson ', time() - t0
    return corr
def test5(TC):
    '''
    Note: TC is modified in place by this function.
    '''
    t0 = time()
    ms = TC.mean(axis=0)[(slice(None, None, None), None)]
    TC -= ms.T
    TCss = sqrt(ss(TC, axis=0))
    N = TC.shape[1]
    corr = zeros((N, N))
    for i in xrange(N):
        corr[i, i:] = dot(TC[:, i:].T, TC[:, i])
        corr[i, i:] /= TCss[i:] * TCss[i]
        corr[i:, i] = corr[i, i:]
    print 'Pearson ', time() - t0
    return corr
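# For TC of shape [time, nodes], the loop-based routines above build the same
# node-by-node Pearson matrix that np.corrcoef computes in one call (up to
# floating point); a quick shape illustration, assuming numpy is available.
import numpy as np

_TC = np.random.default_rng(3).normal(size=(200, 5))
print(np.corrcoef(_TC.T).shape)                    # (5, 5) correlation matrix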
def simpleLeastSquares(X, Y):
    """
    Compute the least-squares fit of y = ax + b
    Input: X is a list of sample x values, Y is a list of the corresponding Y values
    Output: A 2x1 matrix [a; b]
    """
    # Set up the normal equations A * P = c, where
    # A = [[sum(x^2), sum(x)], [sum(x), n]] and c = [[sum(x*y)], [sum(y)]]
    A = matrix([[ss(X), sum(X)], [sum(X), len(X)]], 'double')
    c = matrix([[Sxy(X, Y)], [sum(Y)]], 'double')
    # Solve for the parameter vector P = [a; b]
    P = linalg.solve(A, c)
    return P
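# The same normal equations written with plain NumPy arrays, as a sketch:
# [[sum(x^2), sum(x)], [sum(x), n]] @ [a, b] = [sum(x*y), sum(y)].
# simple_least_squares is a hypothetical helper name, not part of the original.
import numpy as np


def simple_least_squares(x, y):
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    A = np.array([[np.sum(x * x), np.sum(x)],
                  [np.sum(x), len(x)]])
    c = np.array([np.sum(x * y), np.sum(y)])
    return np.linalg.solve(A, c)                   # [slope a, intercept b]


_xs = np.array([0.0, 1.0, 2.0, 3.0])
_ys = 2.0 * _xs + 1.0
print(simple_least_squares(_xs, _ys))              # ~[2., 1.]
print(np.polyfit(_xs, _ys, 1))                     # same fit from NumPy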
def PearsonCorrelation(TC):
    # TODO change with covariance, see Plot.py
    '''
    Return the Pearson Correlation.
    The calculation is done for HALF of the matrix and duplicated for symmetry.
    TC needs to have a shape like [time, nodes].
    Note: TC is modified in place by this function.
    '''
    from scipy.stats import ss
    from pylab import sqrt as np_sqrt, dot as np_dot
    TC -= TC.mean(axis=0)[(slice(None, None, None), None)].T
    TCss = np_sqrt(ss(TC, axis=0))
    N = TC.shape[1]
    corr = zeros((N, N))
    for i in xrange(N):
        corr[i, i:] = np_dot(TC[:, i:].T, TC[:, i])
        corr[i, i:] /= TCss[i:] * TCss[i]
        corr[i:, i] = corr[i, i:]
    return corr
def write_correlation_matrix(in_file, mask_file, out_file):
    import nibabel as nb
    import numpy as np
    from scipy.stats import ss
    import os
    mask_nii = nb.load(mask_file)
    data_nii = nb.load(in_file)
    data = data_nii.get_data()[mask_nii.get_data() > 0, :]
    print(data.shape[0] * (data.shape[0] - 1) / 2)
    # return  # debug short-circuit removed; it prevented the matrix from
    #         # being written at all

    corr_matrix = np.memmap(out_file, dtype='int16', mode='w+',
                            shape=(data.shape[0] * (data.shape[0] - 1) / 2))
    counter = 0
    ms = data.mean(axis=1)[(slice(None, None, None), None)]
    datam = data - ms
    datass = np.sqrt(ss(datam, axis=1))
    status = 0
    for i in xrange(0, data.shape[0]):
        temp = np.dot(datam[i + 1:], datam[i].T)
        rs = temp / (datass[i + 1:] * datass[i])
        corr_matrix[counter:counter + len(rs)] = rs * 10000
        counter += len(rs)
        if (counter / float(len(corr_matrix))) * 100 - status > 1:
            print "%d" % (counter / float(len(corr_matrix)) * 100)
            status = (counter / float(len(corr_matrix))) * 100

    # counter = 0
    # for i in range(data.shape[0]):
    #     for j in range(i+1, data.shape[0]):
    #         print "%g" % (counter/float(data.shape[0]*(data.shape[0]-1)/2))
    #         r, _ = pearsonr(data[i,:], data[j,:])
    #         corr_matrix[counter] = r
    #         counter += 1

    del corr_matrix
    return os.path.abspath(out_file)
def loglike(self, endog, mu, scale=1.0):
    """
    The log-likelihood in terms of the fitted mean response.

    Parameters
    ----------
    endog : array-like
        Endogenous response variable
    mu : array-like
        Fitted mean response variable
    scale : float, optional
        Scales the loglikelihood function. The default is 1.

    Returns
    -------
    llf : float
        The value of the loglikelihood function evaluated at
        (endog, mu, scale) as defined below.

    Notes
    -----
    If the link is the identity link function then the
    loglikelihood function is the same as the classical OLS model.

    llf = -(nobs/2)*(log(SSR) + (1 + log(2*pi/nobs)))

    where SSR = sum((endog-link^(-1)(mu))**2)

    If the link is not the identity link then the loglikelihood
    function is defined as

    llf = sum((`endog`*`mu`-`mu`**2/2)/`scale` - `endog`**2/(2*`scale`) - \
            (1/2.)*log(2*pi*`scale`))
    """
    if isinstance(self.link, L.Power) and self.link.power == 1:
        # This is just the loglikelihood for classical OLS
        nobs2 = endog.shape[0] / 2.0
        SSR = ss(endog - self.fitted(mu))
        llf = -np.log(SSR) * nobs2
        llf -= (1 + np.log(np.pi / nobs2)) * nobs2
        return llf
    else:
        # Return the loglikelihood for Gaussian GLM
        return np.sum(
            (endog * mu - mu ** 2 / 2) / scale
            - endog ** 2 / (2 * scale)
            - 0.5 * np.log(2 * np.pi * scale)
        )
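# Numeric check sketch of the identity-link branch above: with sigma^2 at its
# MLE value SSR/n, the full Gaussian log-likelihood collapses to
# -(n/2)*(log(SSR) + 1 + log(2*pi/n)).  The fitted values here are illustrative.
import numpy as np

_yv = np.random.default_rng(5).normal(size=20)
_mu = np.full_like(_yv, _yv.mean())                # stand-in fitted values
_ssr = np.sum((_yv - _mu) ** 2)
_n = _yv.size
_llf_closed = -(_n / 2.0) * (np.log(_ssr) + 1 + np.log(2 * np.pi / _n))
_sigma2 = _ssr / _n
_llf_full = np.sum(-0.5 * np.log(2 * np.pi * _sigma2) - (_yv - _mu) ** 2 / (2 * _sigma2))
print(np.allclose(_llf_closed, _llf_full))         # expected: True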
def descstats(data, cols=None, axis=0):
    '''
    Prints descriptive statistics for one or multiple variables.

    Parameters
    ----------
    data : numpy array
        `x` is the data
    cols : list, optional
        A list of the column number or field names (for a recarray) of
        variables. Default is all columns.
    axis : 1 or 0
        axis order of data. Default is 0 for column-ordered data.

    Examples
    --------
    >>> descstats(data.exog, cols=['x_1', 'x_2', 'x_3'])
    '''
    x = np.array(data)  # or rather, the data we're interested in
    if cols is None:
        # if isinstance(x, np.recarray):
        #     cols = np.array(len(x.dtype.names))
        if not isinstance(x, np.recarray) and x.ndim == 1:
            x = x[:, None]

    if x.shape[1] == 1:
        desc = '''
    ---------------------------------------------
    Univariate Descriptive Statistics
    ---------------------------------------------

    Var. Name   %(name)12s
    ----------
    Obs.          %(nobs)22i   Range               %(range)22s
    Sum of Wts.   %(sum)22s    Coeff. of Variation %(coeffvar)22.4g
    Mode          %(mode)22.4g Skewness            %(skewness)22.4g
    Repeats       %(nmode)22i  Kurtosis            %(kurtosis)22.4g
    Mean          %(mean)22.4g Uncorrected SS      %(uss)22.4g
    Median        %(median)22.4g Corrected SS      %(ss)22.4g
    Variance      %(variance)22.4g Sum Observations %(sobs)22.4g
    Std. Dev.     %(stddev)22.4g
    ''' % {'name': cols, 'sum': 'N/A', 'nobs': len(x),
           'mode': stats.mode(x)[0][0], 'nmode': stats.mode(x)[1][0],
           'mean': x.mean(), 'median': np.median(x),
           'range': '(' + str(x.min()) + ', ' + str(x.max()) + ')',
           'variance': x.var(), 'stddev': x.std(),
           'coeffvar': stats.variation(x), 'skewness': stats.skew(x),
           'kurtosis': stats.kurtosis(x), 'uss': stats.ss(x),
           'ss': stats.ss(x - x.mean()), 'sobs': np.sum(x)}
        # ''' % {'name': cols[0], 'sum': 'N/A', 'nobs': len(x[cols[0]]), 'mode':
        #        stats.mode(x[cols[0]])[0][0], 'nmode': stats.mode(x[cols[0]])[1][0],
        #        'mean': x[cols[0]].mean(), 'median': np.median(x[cols[0]]), 'range':
        #        '('+str(x[cols[0]].min())+', '+str(x[cols[0]].max())+')', 'variance':
        #        x[cols[0]].var(), 'stddev': x[cols[0]].std(), 'coeffvar':
        #        stats.variation(x[cols[0]]), 'skewness': stats.skew(x[cols[0]]),
        #        'kurtosis': stats.kurtosis(x[cols[0]]), 'uss': stats.ss(x[cols[0]]),
        #        'ss': stats.ss(x[cols[0]]-x[cols[0]].mean()), 'sobs': np.sum(x[cols[0]])}

        desc += '''

    Percentiles
    -------------
    1  %% %12.4g
    5  %% %12.4g
    10 %% %12.4g
    25 %% %12.4g
    50 %% %12.4g
    75 %% %12.4g
    90 %% %12.4g
    95 %% %12.4g
    99 %% %12.4g
    ''' % tuple([stats.scoreatpercentile(x, per) for per in
                 (1, 5, 10, 25, 50, 75, 90, 95, 99)])

        t, p_t = stats.ttest_1samp(x, 0)
        M, p_M = sign_test(x)
        S, p_S = stats.wilcoxon(np.squeeze(x))

        desc += '''

    Tests of Location (H0: Mu0=0)
    -----------------------------
    Test                Statistic       Two-tailed probability
    -----------------+-----------------------------------------
    Student's t      |  t %7.5f   Pr > |t|   <%.4f
    Sign             |  M %8.2f   Pr >= |M|  <%.4f
    Signed Rank      |  S %8.2f   Pr >= |S|  <%.4f
    ''' % (t, p_t, M, p_M, S, p_S)
        # Should this be part of a 'descstats'
        # in any event these should be split up, so that they can be called
        # individually and only returned together if someone calls summary
        # or something of the sort

    elif x.shape[1] > 1:
        desc = '''
    Var. Name   |     Obs.        Mean    Std. Dev.           Range
    ------------+--------------------------------------------------------''' + \
            os.linesep
        # for recarrays with columns passed as names
        # if isinstance(cols[0], str):
        #     for var in cols:
        #         desc += "%(name)15s %(obs)9i %(mean)12.4g %(stddev)12.4g \
        # %(range)20s" % {'name': var, 'obs': len(x[var]), 'mean': x[var].mean(),
        #                 'stddev': x[var].std(), 'range': '('+str(x[var].min())+', '
        #                 +str(x[var].max())+')'+os.linesep}
        # else:
        for var in range(x.shape[1]):
            desc += "%(name)15s %(obs)9i %(mean)12.4g %(stddev)12.4g \
%(range)20s" % {'name': var, 'obs': len(x[:, var]), 'mean': x[:, var].mean(),
                'stddev': x[:, var].std(),
                'range': '(' + str(x[:, var].min()) + ', ' +
                         str(x[:, var].max()) + ')' + os.linesep}
    else:
        raise ValueError, "data not understood"

    return desc
def statistics(stat, infile, outfile, previous_p3_mosaic_dir=None):
    """Calculate the statistics
    """
    # Open the original file
    dataset = gdal.Open(infile, gdal.GA_ReadOnly)
    num_layers = dataset.RasterCount

    # get the projection information
    no_data_value, xsize, ysize, geo_trans, projection, data_type = get_geo_info(
        infile)

    # get the numpy 3rd dimension array stack of the bands of image
    raster_stack = bands2layerstack(infile)

    # define the default output type format
    output_type = gdal.GDT_Float32

    # call built-in numpy statistical functions with a specified axis:
    # axis=2 means they calculate along the 'depth' axis, per pixel,
    # returning an n by m array, the shape of each band.

    # Calculate the median statistic
    if stat == 'median':
        new_array = np.nanmedian(raster_stack, axis=2)
        output_type = gdal.GDT_UInt16

    # Calculate the mean statistic
    if stat == 'mean':
        new_array = np.nanmean(raster_stack, axis=2)
        output_type = gdal.GDT_UInt16

    # Calculate the standard deviation
    if stat == 'std':
        new_array = np.nanstd(raster_stack, axis=2)

    # Calculate the valid data
    if stat == 'valid_data':
        # calculate the amount of valid data used in statistics products as a
        # percentage (0-100%); this counts the valid data (no nans) across the
        # layers (time axis)
        new_array = (num_layers - np.isnan(raster_stack).sum(axis=2)) * 100 / num_layers

    # Calculate the signal-to-noise ratio
    if stat == 'snr':
        # this signal-to-noise ratio is defined as the mean divided by the
        # standard deviation.
        m = np.nanmean(raster_stack, axis=2)
        sd = np.nanstd(raster_stack, axis=2, ddof=0)
        new_array = np.where(sd == 0, 0, m / sd)

    # Calculate the coefficient of variation
    if stat == 'coeff_var':
        # the ratio of the biased standard deviation to the mean
        new_array = variation(raster_stack, axis=2, nan_policy='omit')

    # Calculate the Pearson's correlation coefficient
    if stat == 'pearson_corr':
        # https://github.com/scipy/scipy/blob/v0.14.0/scipy/stats/stats.py#L2392
        # get array of the previous mean file
        previous_dataset_file = os.path.join(
            previous_p3_mosaic_dir,
            os.path.basename(outfile).split('_pearson_corr.tif')[0] + '.tif')
        # get the numpy 3rd dimension array stack of the bands of image
        previous_raster_stack = bands2layerstack(previous_dataset_file)

        # raster_stack and previous_raster_stack should have the same length
        # along all axes
        if raster_stack.shape != previous_raster_stack.shape:
            z_rs = raster_stack.shape[2]
            z_prs = previous_raster_stack.shape[2]
            if z_rs > z_prs:
                raster_stack = np.delete(raster_stack, np.s_[z_prs - z_rs:], 2)
            if z_prs > z_rs:
                previous_raster_stack = np.delete(previous_raster_stack,
                                                  np.s_[z_rs - z_prs:], 2)

        # propagate the nan values across the pair values in the same position
        # for the two rasters in both directions
        mask1 = np.isnan(raster_stack)
        mask2 = np.isnan(previous_raster_stack)
        combined_mask = mask1 | mask2
        raster_stack = np.where(combined_mask, np.nan, raster_stack)
        previous_raster_stack = np.where(combined_mask, np.nan, previous_raster_stack)
        del mask1, mask2, combined_mask

        mean_rs = np.nanmean(raster_stack, axis=2, keepdims=True)
        mean_prs = np.nanmean(previous_raster_stack, axis=2, keepdims=True)
        m_rs = np.nan_to_num(raster_stack - mean_rs)
        m_prs = np.nan_to_num(previous_raster_stack - mean_prs)
        r_num = np.add.reduce(m_rs * m_prs, axis=2)
        r_den = np.sqrt(ss(m_rs, axis=2) * ss(m_prs, axis=2))
        r = r_num / r_den
        # return the r coefficient, -1 to 1
        new_array = r

    #### create the output geo tif
    # Set up the GTiff driver
    driver = gdal.GetDriverByName('GTiff')
    new_dataset = driver.Create(outfile, xsize, ysize, 1, output_type,
                                ["COMPRESS=LZW", "PREDICTOR=2", "TILED=YES"])
    # the '1' is for band 1
    new_dataset.SetGeoTransform(geo_trans)
    new_dataset.SetProjection(projection.ExportToWkt())
    # Write the array
    new_dataset.GetRasterBand(1).WriteArray(new_array)
    new_dataset.GetRasterBand(1).SetNoDataValue(np.nan)