def __init__(self, path, filename):
    new = Image.open(path + '/' + filename)
    self.size = new.size
    self.data = np.reshape(new.getdata(), self.size)
    self.pmax = prctile(new.getdata(), p=95)
    self.pmin = prctile(new.getdata(), p=5)
    self.title = 'New range = %i-%i' % (self.pmin, self.pmax)
    self.label = 'UT: %s\nExp. Times: %.1f sec\nTemp: %.1fC' % (
        new.info['UniversalTime'].strftime('%m-%d-%y %H:%M:%S'),
        new.info['ExposureTime'],
        new.info['CCDTemperature'])
def bootstrapMedian(data, N=5000):
    # determine 95% confidence intervals of the median
    M = len(data)
    percentile = [2.5, 97.5]
    estimate = np.zeros(N)
    for n in range(N):
        bsIndex = np.random.random_integers(0, M - 1, M)
        bsData = data[bsIndex]
        estimate[n] = mlab.prctile(bsData, 50)
    CI = mlab.prctile(estimate, percentile)
    return CI
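# Usage sketch for bootstrapMedian above -- synthetic data; assumes the
# module-level "import numpy as np" and "import matplotlib.mlab as mlab" the
# function relies on, and a matplotlib old enough (< 3.1) to still ship
# mlab.prctile:
import numpy as np
import matplotlib.mlab as mlab

data = np.random.randn(200)
ci_lo, ci_hi = bootstrapMedian(data, N=1000)
print('95%% CI on the median: (%.3f, %.3f)' % (ci_lo, ci_hi))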
def execute(self, seed=0):
    """Test the difference in means with bootstrapping.

    Data is drawn randomly from group1 and group2, with resampling.
    From these bootstraps, estimates with confidence intervals are
    calculated for the mean of each group and the difference in means.

    The estimated difference is positive if group2 > group1.

    Sets: mean1, CI_1, mean2, CI_2, diff_estimate, diff_CI, p1, p2

    p1 is the p-value estimated from the distribution of differences
    p2 is the p-value from a 1-sample ttest on that distribution
    """
    if len(self.data1) < self.min_bucket or len(self.data2) < self.min_bucket:
        #~ raise BootstrapError(
        #~     'insufficient data in bucket in bootstrap_two_groups')
        raise ValueError(
            'insufficient data in bucket in bootstrap_two_groups')
    if seed is not None:
        np.random.seed(seed)

    # Generate random samples, shape (n_boots, len(group))
    self.idxs1 = np.random.randint(0, len(self.data1),
        (self.n_boots, len(self.data1)))
    self.idxs2 = np.random.randint(0, len(self.data2),
        (self.n_boots, len(self.data2)))

    # Draw from the data
    self.draws1 = self.data1[self.idxs1]
    self.draws2 = self.data2[self.idxs2]

    # Bootstrapped means of each group
    self.means1 = self.draws1.mean(axis=1)
    self.means2 = self.draws2.mean(axis=1)

    # CIs on group means
    self.CI_1 = mlab.prctile(self.means1, (2.5, 97.5))
    self.CI_2 = mlab.prctile(self.means2, (2.5, 97.5))

    # Bootstrapped difference between the groups
    self.diffs = self.means2 - self.means1
    self.CI_diff = mlab.prctile(self.diffs, (2.5, 97.5))

    # p-value
    self.p_from_dist = pvalue_of_distribution(self.diffs, 0)

    # save memory
    del self.idxs1
    del self.idxs2
    del self.draws1
    del self.draws2
def createAsaInfo(self):
    'Return True on error.'
    if self.showMessages:
        nTdebug("Fetching WHATIF per-atom surface accessibility info...")
    fileNames = glob.glob(os.path.join(self.whatIfDataDir, "wsvacc*.log"))
    self.allWhatIfInfo = {'chains': {}}
    for fileName in fileNames:
        if self.readWhatIfAsaInfoFile(fileName):  # fills self.allWhatIfInfo
            nTerror("Failed %s when reading file." % (getCallerName()))
            return True
    # end for
    #
    # Now determine the median ASA for each
    #
    # whatIfInfo is used in super class whereas allWhatIfInfo was filled before.
    self.whatIfInfo = self.allWhatIfInfo
    d = self.whatIfInfo['chains']
    # medianIndex = None
    for chainCode in d.keys():
        for seqKey in d[chainCode].keys():
            for atomName in d[chainCode][seqKey]['atoms'].keys():
                asaList = d[chainCode][seqKey]['atoms'][atomName]
                asaList.sort()
                # if not medianIndex:
                #     medianIndex = int((len(asaList) / 2.0) + 0.5)  # fails with round off on single element lists.
                ml = mlab.prctile(asaList, [50])
                # if medianIndex < 0 or medianIndex >= len(asaList):
                #     nTerror("Found improper median index %s for %s" % (medianIndex, str(asaList)))
                #     return True
                # d[chainCode][seqKey]['atoms'][atomName] = [asaList[medianIndex]]
                # Resetting list to only include median
                d[chainCode][seqKey]['atoms'][atomName] = [ml[0]]
def comp_histo(a, **kwargs):
    """Return plot-ready histogram (h,l), with Freedman-Diaconis' choice
    for optimal bin width if not fixed.

    See http://en.wikipedia.org/wiki/Histogram"""
    if 'bins' in kwargs:
        try:
            nbins = len(kwargs['bins'])
        except:
            nbins = kwargs['bins']
        print "Using default numpy histogram: nbins=%d" % nbins
        h, l = N.histogram(a, **kwargs)
    else:  # Define optimal binning
        if 'range' in kwargs:
            vmin, vmax = kwargs['range']
        else:
            vmin, vmax = a.min(), a.max()
        # Freedman-Diaconis' choice for optimal bin width
        q1, q3 = prctile(a, p=(25., 75.))
        h = 2 * (q3 - q1) / len(a)**(1. / 3.)
        nbins = round((vmax - vmin) / h)
        print "Freedman-Diaconis optimal bin width: nbins=%d" % nbins
        h, l = N.histogram(a, bins=nbins, **kwargs)
    h = N.concatenate((h, [h[-1]]))  # Complete h
    #l = N.concatenate((l,[l[-1]+l[1]-l[0]]))  # Not needed w/ new=True
    return h, l
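# Usage sketch for comp_histo (synthetic data; assumes this module's own
# aliases "import numpy as N" and "from matplotlib.mlab import prctile", and
# the numpy of that era, which accepted a float bin count):
import numpy as N

h, l = comp_histo(N.abs(N.random.randn(1000)))
# len(h) == len(l), so (l, h) is ready for a steps-post style plot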
def harmonize_clim_in_subplots(fig=None, axa=None, clim=(None, None),
    center_clim=False, trim=1):
    """Set clim to be the same in all subplots in a figure

    fig : Figure to grab all axes from, or None
    axa : the list of subplots (if fig is None)
    clim : tuple of desired c-limits. If either or both values are
        unspecified, they are derived from the data.
    center_clim : if True, the mean of the new clim is always zero
        May overrule specified `clim`
    trim : does nothing if 1 or None
        otherwise, sets the clim to truncate extreme values
        for example, if .99, uses the 1% and 99% values of the data
    """
    # Which axes to operate on
    if axa is None:
        axa = fig.get_axes()
    axa = np.asarray(axa)

    # Two ways of getting new clim
    if trim is None or trim == 1:
        # Get all the clim
        all_clim = []
        for ax in axa.flatten():
            for im in ax.get_images():
                all_clim.append(np.asarray(im.get_clim()))

        # Find covering clim and optionally center
        all_clim_a = np.array(all_clim)
        new_clim = [np.min(all_clim_a[:, 0]), np.max(all_clim_a[:, 1])]
    else:
        # Trim to specified prctile of the image data
        data_l = []
        for ax in axa.flatten():
            for im in ax.get_images():
                data_l.append(np.asarray(im.get_array()).flatten())
        data_a = np.concatenate(data_l)

        # New clim
        new_clim = list(mlab.prctile(data_a, (100. * (1 - trim), 100. * trim)))

    # Take into account specified clim
    try:
        if clim[0] is not None:
            new_clim[0] = clim[0]
        if clim[1] is not None:
            new_clim[1] = clim[1]
    except IndexError:
        print "warning: problem with provided clim"

    # Optionally center
    if center_clim:
        new_clim = np.max(np.abs(new_clim)) * np.array([-1, 1])

    # Set to new value
    for ax in axa.flatten():
        for im in ax.get_images():
            im.set_clim(new_clim)

    return new_clim
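# Usage sketch: two images on different scales get a shared, centered clim
# (synthetic data; assumes "import numpy as np" and pyplot as plt, plus an
# mlab that still provides prctile):
import numpy as np
import matplotlib.pyplot as plt

fig, axa = plt.subplots(1, 2)
axa[0].imshow(np.random.randn(10, 10))
axa[1].imshow(5 * np.random.randn(10, 10))
new_clim = harmonize_clim_in_subplots(axa=axa, trim=.95, center_clim=True)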
def HoeffdingRuleMarkov(beta, G, H, U, FlowNum):
    """
    Estimate the K-L divergence and the threshold by use of weak convergence
    ----------------
    beta: the false alarm rate
    G: the gradient
    H: the Hessian
    U: a sample path of the Gaussian empirical measure
    FlowNum: the number of flows
    ----------------
    """
    _, SampNum, _ = U.shape

    # Estimate K-L divergence using 2nd-order Taylor expansion
    KL = []
    for j in range(0, SampNum):
        t = (1.0 / sqrt(FlowNum)) * np.dot(G, U[0, j, :]) + \
            (1.0 / 2) * (1.0 / FlowNum) * \
            np.dot(np.dot(U[0, j, :], H), U[0, j, :])
        KL.append(np.array(t.real)[0])
    eta = prctile(KL, 100 * (1 - beta))
    return eta
def bootstrapped_intercluster_mahalanobis(cluster1, cluster2, n_boots=1000,
    fix_covariances=True):
    """Bootstrap the intercluster distance.

    Returns:
        m - The mean distance
        CI - 95% confidence interval on the distance
        distances - an array of the distances measured on each boot
    """
    d_l = []

    # Determine the covariance matrices, or recalculate each time
    if fix_covariances:
        icov1 = np.linalg.inv(np.cov(cluster1, rowvar=0))
        icov2 = np.linalg.inv(np.cov(cluster2, rowvar=0))
    else:
        icov1, icov2 = None, None

    # Bootstrap
    for n_boot in range(n_boots):
        # Draw
        idxs1 = np.random.randint(0, len(cluster1), len(cluster1))
        idxs2 = np.random.randint(0, len(cluster2), len(cluster2))

        # Calculate and store
        d_l.append(intercluster_mahalanobis(
            cluster1[idxs1], cluster2[idxs2], icov1, icov2))

    # Statistics
    d_a = np.asarray(d_l)
    m = np.mean(d_a)
    CI = mlab.prctile(d_a, (2.5, 97.5))
    return m, CI, d_a
def print_stats(name, x=None):
    "Prints simple stats"
    if type(name) is not StringType:
        x = name
        name = 'mean,stdv,rms,min,25%,median,75%,max: '
    if name == '__header__':
        print ''
        n = (80 - len(x)) / 2
        print n * ' ' + x
        print n * ' ' + len(x) * '-'
        print ''
        print '   Name       mean    stdv     rms     min     25%  median     75%     max'
        print ' --------- ------- ------- ------- ------- ------- ------- ------- -------'
    elif name == '__sep__':
        print ' --------- ------- ------- ------- ------- ------- ------- ------- -------'
    elif name == '__footer__':
        print ' --------- ------- ------- ------- ------- ------- ------- ------- -------'
        print ''
    else:
        ave = x.mean()
        std = x.std()
        rms = sqrt(ave * ave + std * std)
        prc = prctile(x)
        print '%10s %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f ' % \
            (name, ave, std, rms, prc[0], prc[1], prc[2], prc[3], prc[4])
def HoeffdingRuleMarkovRobust_(beta, G_list, H_list, U_list, FlowNum):
    """
    Estimate the K-L divergence and the threshold by use of weak convergence
    ----------------
    beta: the false alarm rate
    G_list: the gradients
    H_list: the Hessians
    U_list: sample paths of the Gaussian empirical measure
    FlowNum: the number of flows
    ----------------
    """
    _, SampNum, _ = U_list[0].shape

    # Estimate K-L divergence using 2nd-order Taylor expansion; keep the
    # minimum estimate over the candidate (G, H, U) triples
    KL = []
    for j in range(0, SampNum):
        KL_est_list = []
        for G, H, U in zip(G_list, H_list, U_list):
            KL_est = (1.0 / sqrt(FlowNum)) * np.dot(G, U[0, j, :]) + \
                (1.0 / 2) * (1.0 / FlowNum) * \
                np.dot(np.dot(U[0, j, :], H), U[0, j, :])
            KL_est = np.array(KL_est.real)[0]
            KL_est_list.append(KL_est)
        KL.append(min(KL_est_list))
    eta = prctile(KL, 100 * (1 - beta))
    return eta
def compute(self):
    """Detect RFI
    """
    median_size = (self.median_size_time, self.median_size_freq)
    data = self.data - sp_dip.median_filter(self.data, size=median_size)
    # data1 = np.abs(np.sum(data,0))
    # data1 = np.abs(np.median(data,0))
    data1 = (np_median(data, 0))
    # th = np.percentile(data1, th_prctile)
    thl = []
    for ii in xrange(data1.shape[0] - 9):
        thl.append(prctile(data1[ii:ii + 10], p=90))
        # thl.append(max(data1[ii:ii+10]))
    th = self.th_k * np_median(thl)
    for ii in xrange(data1.shape[0]):
        if data1[ii] > th:
            z, p_value = sp_normaltest(data[:, ii])
            if p_value < self.p_th:
                if self.is_out_selected('Not_normal'):
                    self.flag_results['Not_normal'].flag_data[:, ii] = 1
            else:
                if self.is_out_selected('Normal'):
                    self.flag_results['Normal'].flag_data[:, ii] = 1
    return self.flag_results
def calc_kde1d(Xin, N=256, range=None, Verbose=False, name=None):
    """
    Calculates 1D KDE. On input,

    Xin   --- input data array
    N     --- number of bins to evaluate KDE
    range --- range of Xin values to work with

    Example:  bins, P = calc_kde1d(obs,range=(-2,2))
    """
    try:
        X = Xin.data[Xin.mask == False].ravel()
    except:
        X = Xin
    if range == None:
        prc = prctile(X)
        range = [prc[0], prc[4]]
    bins = linspace(range[0], range[1], N)
    if Verbose:
        if name != None:
            print name
        print 'Evaluating 1D kernel with %d observations' % len(X)
    kernel = stats.kde.gaussian_kde(X)
    if Verbose:
        print 'Evaluating 1D KDE with %d bins' % N
    P = kernel(bins)
    return (bins, P)
def difference_CI_bootstrap_wrapper(data, **boot_kwargs):
    """Given parsed data from single ulabel, return difference CIs.

    data : same format as bootstrap_main_effect expects

    Will calculate the following statistics:
        means : mean of each condition, across draws
        CIs : confidence intervals on each condition
        mean_difference : mean difference between conditions
        difference_CI : confidence interval on difference between conditions
        p : two-tailed p-value of 'no difference'

    Returns:
        dict of those statistics
    """
    # Yields a 1000 x 2 x N_trials matrix:
    # 1000 draws from the original data, under both conditions.
    bh = bootstrap_main_effect(data, meth=keep, **boot_kwargs)

    # Find the distribution of means of each draw, across trials
    # This is 1000 x 2, one for each condition
    # hist(means_of_all_draws) shows the comparison across conditions
    means_of_all_draws = bh.mean(axis=2)

    # Confidence intervals across the draw means for each condition
    condition_CIs = np.array([
        mlab.prctile(dist, (2.5, 97.5)) for dist in means_of_all_draws.T])

    # Means of each ulabel (centers of the CIs, basically)
    condition_means = means_of_all_draws.mean(axis=0)

    # Now the CI on the *difference between conditions*
    difference_of_conditions = np.diff(means_of_all_draws).flatten()
    difference_CI = mlab.prctile(difference_of_conditions, (2.5, 97.5))

    # p-value of 0. in the difference distribution
    cdf_at_value = np.sum(difference_of_conditions < 0.) / \
        float(len(difference_of_conditions))
    p_at_value = 2 * np.min([cdf_at_value, 1 - cdf_at_value])

    # Should probably floor the p-value at 1/n_boots
    return {'p': p_at_value, 'means': condition_means, 'CIs': condition_CIs,
        'mean_difference': difference_of_conditions.mean(),
        'difference_CI': difference_CI}
def get_sample_percentiles(self, percents):
    'It returns the percentiles given a percent list'
    if not self._sample:
        raise ValueError('No data to calculate percentiles')
    vect = numpy.ravel(self.sample)
    percentiles = mlab.prctile(vect, percents)
    return list(percentiles)
def percentile_box_plot(ax, data, indexer=None, box_top=75,
    box_bottom=25, whisker_top=98, whisker_bottom=2):
    if indexer is None:
        indexed_data = zip(range(1, len(data) + 1), data)
    else:
        indexed_data = [(indexer(datum), datum) for datum in data]
    for index, x in indexed_data:
        if whisker_top != None and whisker_bottom != None:
            bp = boxplotter(*(prctile(x, (50, box_top, box_bottom,
                whisker_top, whisker_bottom))))
            bp.draw_on(ax, index, data=x)
        elif whisker_top == None and whisker_bottom == None:
            bp = boxplotter(*(prctile(x, (50, box_top, box_bottom))))
            bp.draw_on(ax, index)
        else:
            raise Exception("Just one whisker? That's silly.")
def identify_outliers(test, chains, x):
    """
    Determine which chains have converged on a local maximum much lower
    than the maximum likelihood.

    *test* is the name of the test to use (one of IQR, Grubbs, Mahal or none).

    *chains* is a set of log likelihood values of shape (chain len, num chains)

    *x* is the current population of shape (num vars, num chains)

    See :module:`outliers` for details.
    """
    # Determine the mean log density of the active chains
    v = mean(chains, axis=0)

    # Check whether any of these active chains are outlier chains
    test = test.lower()
    if test == 'iqr':
        # Derive the upper and lower quartile of the chain averages
        Q1, Q3 = prctile(v, [25, 75])
        # Derive the Inter Quartile Range (IQR)
        IQR = Q3 - Q1
        # See whether there are any outlier chains
        outliers = where(v < Q1 - 2 * IQR)[0]
    elif test == 'grubbs':
        # Compute zscore for chain averages
        zscore = (mean(v) - v) / std(v, ddof=1)
        # Determine t-value of one-sided interval
        N = len(v)
        t2 = tinv(1 - 0.01 / N, N - 2)**2  # 95% interval
        # Determine the critical value
        Gcrit = ((N - 1) / sqrt(N)) * sqrt(t2 / (N - 2 + t2))
        # Then check against this
        outliers = where(zscore > Gcrit)[0]
    elif test == 'mahal':
        # Use the Mahalanobis distance to find outliers in the population
        alpha = 0.01
        Npop, Nvar = x.shape
        Gcrit = ACR(Nvar, Npop - 1, alpha)
        # Find which chain has minimum log_density
        minidx = argmin(v)
        # Then check the Mahalanobis distance of the current point to other chains
        d1 = mahalanobis(x[minidx, :], x[minidx != arange(Npop), :])
        # and see if it is an outlier
        outliers = [minidx] if d1 > Gcrit else []
    elif test == 'none':
        outliers = []
    else:
        raise ValueError("Unknown outlier test " + test)
    return outliers
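# Usage sketch, exercising the self-contained 'IQR' branch only (the Grubbs
# and Mahal branches additionally need the tinv/ACR/mahalanobis helpers).
# Assumes the same pylab-style namespace (mean, where, prctile, ...) the
# function body uses; data is synthetic:
import numpy as np

chains = np.random.randn(100, 8)   # (chain len, num chains)
chains[:, 3] -= 50.0               # chain 3 is stuck far below the rest
x = np.random.randn(8, 4)          # population; unused by the IQR test
print(identify_outliers('IQR', chains, x))   # expect array([3])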
def do_kde(X, range=None, N=256):
    if range is None:
        prc = prctile(X.ravel())
        a = prc[0]
        b = prc[4]
    else:
        a, b = range
    bins = linspace(a, b, N)
    kernel = kde.gaussian_kde(X.ravel())
    return bins, kernel(bins)
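# Usage sketch for do_kde (synthetic data; assumes the bare names above come
# from "from numpy import linspace", "from scipy.stats import kde" and
# "from matplotlib.mlab import prctile"):
import numpy as np

bins, density = do_kde(np.random.randn(500), N=128)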
def ThresCal(self):
    SampNum = 1000
    KL = []
    for i in range(0, SampNum):
        x = chain(self.mu_0, self.P, self.n)
        mu = np.reshape(self.mu, (self.N, self.N))
        KL.append(KL_est(x, mu))  # Get the actual relative entropy (K-L divergence)
    eta = prctile(KL, 100 * (1 - self.beta))
    return eta
def ThresCal(self):
    SampNum = 1000
    self.KL = []
    for i in range(0, SampNum):
        x = chain(self.mu_0, self.P, self.n)
        mu = np.reshape(self.mu, (self.N, self.N))
        self.KL.append(KL_est(x, mu))  # Get the actual relative entropy (K-L divergence)
    self.eta = prctile(self.KL, 100 * (1 - self.beta))
    KL = self.KL
    eta = self.eta
    return KL, eta
def modeifyer(times, fluxes, window=500, p=20, minpoints=10):
    """For each datapoint, takes the percentile p of the fluxes within
    the surrounding time window and subtracts it, detrending the fluxes.
    Returns corrected fluxes. For now done with a slow loop..."""
    detrend = fluxes.copy()
    for i in range(len(times)):
        near_fluxes = fluxes[where(
            (times < times[i] + window / 2) * (times > times[i] - window / 2))]
        trend = prctile(near_fluxes, p)
        detrend[i] = fluxes[i] - trend
    return detrend
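# Usage sketch on a synthetic light curve: a slow linear trend plus noise,
# removed with the running 20th-percentile filter (assumes pylab-style
# imports for where/prctile, matching the function body):
import numpy as np

times = np.arange(0., 5000., 10.)
fluxes = 1e-3 * times + np.random.randn(len(times))
detrended = modeifyer(times, fluxes, window=500, p=20)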
def prctile(self, p=(2.5, 97.5)):
    '''
    Returns the standard percentiles of the bootstrapped statistic.

    Arguments
    ---------
    p : A sequence of percentile values or a scalar
    '''
    from matplotlib.mlab import prctile
    return prctile(self.dist, p=p)
def _calculate_percentiles(numbers, percents):
    'It calculates the percentiles for some numbers'
    # we need a numpy array
    if 'any' not in dir(numbers):
        numbers = numpy.ravel(numbers)
    if not numbers.any():
        raise ValueError('No data to calculate percentiles')
    mlab = sys.modules['matplotlib.mlab']
    percentiles = mlab.prctile(numbers, percents)
    return list(percentiles)
def bootstrap_regress(x, y, n_boot=1000):
    from matplotlib import mlab
    x = np.asarray(x)
    y = np.asarray(y)

    m_l, b_l = [], []
    for n in range(n_boot):
        msk = np.random.randint(0, len(x), size=len(x))
        m, b, rval, pval, stderr = scipy.stats.stats.linregress(x[msk], y[msk])
        m_l.append(m)
        b_l.append(b)

    res = {
        'slope_m': np.mean(m_l),
        'slope_l': mlab.prctile(m_l, p=2.5),
        'slope_h': mlab.prctile(m_l, p=97.5),
        'intercept_m': np.mean(b_l),
        'intercept_l': mlab.prctile(b_l, p=2.5),
        'intercept_h': mlab.prctile(b_l, p=97.5),
    }
    return res
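# Usage sketch on noisy linear data (assumes module-level "import numpy as np"
# and "import scipy.stats", as the function itself does):
x = np.linspace(0., 10., 50)
y = 2.0 * x + 1.0 + np.random.randn(50)
res = bootstrap_regress(x, y, n_boot=500)
print(res['slope_l'], res['slope_m'], res['slope_h'])  # CI should bracket ~2.0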
def plot_kde(X, a=None, b=None, N=256, Title=None, Label=None):
    if a == None:
        prc = prctile(X.ravel())
        a = prc[0]
        b = prc[4]
    if Title is None:
        Title = 'Kernel Density Function'
    bins = linspace(a, b, N)
    kernel = kde.gaussian_kde(X.ravel())
    plot(bins, kernel(bins))
    ylabel('PDF')
    title(Title)
def test_prctile():
    # test odd lengths
    x = [1, 2, 3]
    assert mlab.prctile(x, 50) == np.median(x)

    # test even lengths
    x = [1, 2, 3, 4]
    assert mlab.prctile(x, 50) == np.median(x)

    # derived from email sent by jason-sage to MPL-user on 20090914
    ob1 = [1, 1, 2, 2, 1, 2, 4, 3, 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 7, 6, 4, 5, 5]
    p = [0, 75, 100]
    expected = [1, 5.5, 9]

    # test vectorized
    actual = mlab.prctile(ob1, p)
    assert np.allclose(expected, actual)

    # test scalar
    for pi, expectedi in zip(p, expected):
        actuali = mlab.prctile(ob1, pi)
        assert np.allclose(expectedi, actuali)
def main_chain(img_name, template_name, blursize=5, cliplim=3.0, gridsize=8):
    # FIXME what if foscam moves, then blind offset-from-template method will not work robustly, will it?

    # apply blurring and CLAHE to small (skinny garage door) roi
    img, final, xywh_template, topleft_sgd, botright_sgd = blurred_histogram_equalization(
        img_name, template_name, blursize=blursize, cliplim=cliplim,
        gridsize=gridsize)

    # histogram of skinny garage door subset after image processing
    colors = ('b', )
    fig, ax = plt.subplots(figsize=(12, 8))

    # ith-channel of skinny garage door (roi1) after image processing
    i = 0
    c = colors[i]
    sgd = final[topleft_sgd[1]:botright_sgd[1],
                topleft_sgd[0]:botright_sgd[0]][:, :, i]  # i = 0 for Luminance
    intensity_bins = range(0, 256)
    n, bins, patches = ax.hist([sgd], intensity_bins, normed=1, color=c,
        histtype='step', cumulative=True, label='Color: ' + c)
    # FIXME we may not always want histogram plot (maybe just during "gather")
    # TODO with each image file, always ratchet up a running sum type of histogram OR db each for future summing
    # TODO always put percentiles (10th, 20th, 30th, ... 90th) into db table!?

    # percentiles
    percs = mlab.prctile([sgd], p=np.arange(10.0, 100.0, 10.0))
    print percs

    # tidy up the figure
    ax.grid(True)
    #ax.legend(loc='right')
    #ax.set_title('Cumulative Step Histograms')
    ax.set_title('Cumulative Step Histogram, Blue Channel, %s' % img_name)
    ax.set_xlabel('Pixel [intensity?]')
    ax.set_ylabel('Likelihood of Occurrence')
    plt.xlim([0, 256])

    # save cumulative histogram figure as _chist.jpg
    outname = img_name.replace('.jpg', '_chist.jpg')
    plt.savefig(outname)
    print 'open -a Firefox file://%s' % outname

    return img, final, xywh_template
def simple_bootstrap(data, n_boots=1000, min_bucket=20):
    if len(data) < min_bucket:
        raise BootstrapError("too few samples")

    res = []
    data = np.asarray(data)
    for boot in range(n_boots):
        idxs = np.random.randint(0, len(data), len(data))
        draw = data[idxs]
        res.append(np.mean(draw))

    res = np.asarray(res)
    CI = mlab.prctile(res, (2.5, 97.5))
    return res, res.mean(), CI
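# Usage sketch (synthetic data; BootstrapError is assumed to be defined
# elsewhere in the module, and mlab.prctile to be available):
draws, mean_est, (ci_lo, ci_hi) = simple_bootstrap(np.random.randn(100))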
def HoeffdingRuleMarkov(beta, rho, G, H, W, Chi, FlowNum):
    """
    Estimate the K-L divergence and the threshold by use of weak convergence
    ----------------
    beta: the false alarm rate
    mu: the stationary distribution
    G: the gradient
    H: the Hessian
    Sigma: the covariance matrix
    W: a sample path of the Gaussian empirical measure
    Chi: a sample path of the "Chi-Square" estimation
    FlowNum: the number of flows
    ----------------
    """
    # Here, N equals the number of states in the new chain Z
    _, SampNum, N = W.shape

    # Estimate K-L divergence using 2nd-order Taylor expansion
    KL_1 = []
    for j in range(0, SampNum):
        t = (1.0 / sqrt(FlowNum)) * np.dot(G, W[0, j, :]) + \
            (1.0 / 2) * (1.0 / FlowNum) * \
            np.dot(np.dot(W[0, j, :], H), W[0, j, :])
        KL_1.append(np.array(t.real)[0])
    # Get the threshold
    eta1 = prctile(KL_1, 100 * (1 - beta))

    KL_2 = [Chi[idx] / (2 * FlowNum) for idx in xrange(len(Chi))]
    # Using the simplified formula
    # eta2 = 1.0 / (2 * FlowNum) * rho * chi2.ppf(1 - beta, N)
    eta2 = prctile(KL_2, 100 * (1 - beta))

    return KL_1, KL_2, eta1, eta2
def bootstrapMedian(data, N=5000):
    '''Bootstrapper to refine estimate of a percentile from data

    N = number of iterations for the bootstrapping
    M = number of data points
    output = MU.bootStrapper(data, 50, 10000)
    '''
    import numpy as np
    import matplotlib.mlab as mlab

    M = len(data)
    percentile = 50
    estimate = np.array([])
    for k in range(N):
        bsIndex = np.random.random_integers(0, M - 1, M)
        bsData = data[bsIndex]
        tmp = mlab.prctile(bsData, percentile)
        estimate = np.hstack((estimate, tmp))

    CI = mlab.prctile(estimate, [2.5, 97.5])
    med = np.mean(estimate)
    return med, CI, estimate
def calc_kde2d(x_values, y_values, x_range=None, y_range=None,
    Nx=256, Ny=256, npz=None, Verbose=True, name=None):
    if Verbose:
        if name != None:
            print "[] ", name
        print "Starting the 2D kernel density estimation with %d data points..." \
            % len(x_values)
    kernel = stats.kde.gaussian_kde(_cat(x_values, y_values))

    if x_range == None:
        prc = prctile(x_values)
        x_range = [prc[0], prc[4]]
    if y_range == None:
        y_range = x_range

    x_bins = linspace(x_range[0], x_range[1], Nx)
    y_bins = linspace(y_range[0], y_range[1], Ny)

    if Verbose:
        print "Evaluating 2D kernel on grid with (Nx,Ny)=(%d,%d) ..." % (Nx, Ny)
    X, Y = meshgrid(x_bins, y_bins)  # each has shape (Ny,Nx)
    P = kernel(_cat(X, Y))  # shape is (Ny*Nx)
    P = reshape(P, X.shape)

    if Verbose:
        print "X, Y, P shapes: ", X.shape, Y.shape, P.shape

    # Save to file
    # ------------
    if npz != None:
        print "Saving 2D KDE to file <" + npz + "> ..."
        savez(npz, pdf=P, x_bins=x_bins, y_bins=y_bins)

    return (x_bins, y_bins, P)
def test_Median(self):
    'test median'
    # Wiki: If there is an even number of observations, then there is no
    # single middle value; the median is then usually defined to be the
    # mean of the two middle values.[1][2]
    lol = [
        # [],  # fails
        [1.2],
        [1.0, 2.0],  # Get 1.5 (matplotlib 1.0.1) or 2.0 (matplotlib 0.99.3)
        [1.0, 2.0, 4.0],
    ]
    expectedMedianList = [1.2, 1.5, 2.0]  # matplotlib 1.0.1
    expectedMedianListOldMatplotlib = [1.2, 2.0, 2.0]  # matplotlib 0.99.3
    for i, floatList in enumerate(lol):
        ml = mlab.prctile(floatList, [50])
        nTdebug("Found: %s and expected (by new matplotlib): %s" %
            (ml[0], expectedMedianList[i]))
        if ml[0] != expectedMedianList[i]:
            self.assertEqual(ml[0], expectedMedianListOldMatplotlib[i])
def boxpoints(d, outlier_distance=1.5):
    # implementation pretty much the same as matplotlib axes.boxplot
    # get median and quartiles
    q1, med, q3 = mlab.prctile(d, [25, 50, 75])
    # whisker limits at outlier_distance times the inter-quartile range
    iq = q3 - q1
    hi_val = q3 + outlier_distance * iq
    lo_val = q1 - outlier_distance * iq
    print iq, q1, q3, '---', hi_val, lo_val
    outliers = r_[d[d > hi_val], d[d < lo_val]]
    inliers = list(set(d) - set(outliers))
    min_without_outliers = min(inliers)
    max_without_outliers = max(inliers)
    return outliers, min_without_outliers, q1, med, q3, max_without_outliers
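# Usage sketch (synthetic data with two planted outliers; assumes the
# pylab-style r_ and mlab names used in the function body):
import numpy as np

d = np.concatenate([np.random.randn(100), [15.0, -12.0]])
outliers, lo, q1, med, q3, hi = boxpoints(d, outlier_distance=1.5)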
def plot_transparent_histogram(arr, ax, frame_width, frame_height,
    upper_prctile_clim=95, cmap=plt.cm.gray_r):
    """Imshow a histogram with zero values transparent

    arr : single 2d array to plot. Should be non-negative
    frame_width, frame_height : passed to imshow to get the data limits right

    All zero bins will be transparent. The image color limits will be
    set to the 95th percentile of non-zero values.
    """
    # Determine the transparent threshold and upper clim
    vals = arr.flatten()
    vals = vals[vals > 0]
    transparent_threshold = vals.min()
    clim_upper = prctile(vals, upper_prctile_clim)

    # Mask the data to make zero bins transparent
    # We use .99 to avoid floating point comparison problems
    masked_data = np.ma.masked_where(arr < transparent_threshold * .99, arr)

    # Plot
    im = my.plot.imshow(
        masked_data, ax=ax,
        xd_range=(0, frame_width), yd_range=(0, frame_height),
        axis_call='image', cmap=cmap, skip_coerce=True,
    )

    # Set the clim to go from 0 to upper
    im.set_clim((0, clim_upper))

    return im
def HoeffdingRuleMarkovRobust(beta, G_1, H_1, U_1, G_2, H_2, U_2,
    G_3, H_3, U_3, FlowNum):
    """
    Estimate the K-L divergence and the threshold by use of weak convergence
    ----------------
    beta: the false alarm rate
    G_1, G_2, G_3: the gradients
    H_1, H_2, H_3: the Hessians
    U_1, U_2, U_3: sample paths of the Gaussian empirical measure
    FlowNum: the number of flows
    ----------------
    """
    _, SampNum, _ = U_1.shape

    # Estimate K-L divergence using 2nd-order Taylor expansion; keep the
    # minimum of the three estimates at each sample point
    KL = []
    for j in range(0, SampNum):
        t_1 = (1.0 / sqrt(FlowNum)) * np.dot(G_1, U_1[0, j, :]) + \
            (1.0 / 2) * (1.0 / FlowNum) * \
            np.dot(np.dot(U_1[0, j, :], H_1), U_1[0, j, :])
        t_2 = (1.0 / sqrt(FlowNum)) * np.dot(G_2, U_2[0, j, :]) + \
            (1.0 / 2) * (1.0 / FlowNum) * \
            np.dot(np.dot(U_2[0, j, :], H_2), U_2[0, j, :])
        t_3 = (1.0 / sqrt(FlowNum)) * np.dot(G_3, U_3[0, j, :]) + \
            (1.0 / 2) * (1.0 / FlowNum) * \
            np.dot(np.dot(U_3[0, j, :], H_3), U_3[0, j, :])
        t1 = np.array(t_1.real)[0]
        t2 = np.array(t_2.real)[0]
        t3 = np.array(t_3.real)[0]
        KL.append(min([t1, t2, t3]))
    eta = prctile(KL, 100 * (1 - beta))
    return eta
datas = [np.recfromtxt(f, names=True, case_sensitive=True) for f in files]
data = np.ma.concatenate(datas)
desired_unit = dict(O3='ppb', GMAO_TEMP='K', PRESS='hPa', TEMP='K')
unit_factor = {'ppt': 1e12, 'ppb': 1e9}
pfile.createDimension('time', data.shape[0])
for ki, key in enumerate(data.dtype.names):
    typecode = data[key].dtype.char
    if typecode not in ('c', 'S'):
        unit = desired_unit.get(key, 'ppt')
        factor = unit_factor.get(unit, 1)
        values = np.ma.masked_values(data[key], -1000) * factor
    else:
        unit = 'unknown'
        values = data[key]
    pfile.createVariable(key, typecode, dimensions=('time',), units=unit,
        values=values)

if __name__ == '__main__':
    import sys
    bfile1 = flightlogs(sys.argv[1:])
    from matplotlib.mlab import prctile
    for label, key in [('O3', 'O3[:]'), ('NO2', 'NO2[:]')]:
        bvar = eval(key, None, bfile1.variables)
        b2var = eval(key, None, bfile1.variables)
        assert((bvar == b2var).all())
        print('\n%s (BASE: %6.2f)' % (label, bvar.mean()), file=sys.stdout)
        print('\n BASE:', sep='', file=sys.stdout)
        prctile(bvar, np.ma.arange(.1, 1., .1) * 100).tofile(
            sys.stdout, sep=', ', format='%6.2f')
        print('', file=sys.stdout)
def read_state_trajectories():
    global state_array, NUM_DIM, NUM_STATES, TRAJ_LEN

    state_trajs = []
    to_put = []
    rewards = []
    trajs = open("state_trajectories.dat", 'r')
    if trajs:
        lines = trajs.readlines()
        for l in lines:
            s = l.split('\t')
            num_steps = len(s) - 1
            if (len(s) > 3):
                to_put = [int(s[x]) for x in range(num_steps)]
                state_trajs.append(to_put)
                if len(to_put) > TRAJ_LEN:
                    TRAJ_LEN = len(to_put)
            else:
                rewards.append(float(s[1]))
        trajs.close()

    # pad every trajectory out to TRAJ_LEN by repeating its last state
    num_traj = len(state_trajs)
    for ct in range(num_traj):
        last = state_trajs[ct][-1]
        curr_len = len(state_trajs[ct])
        for ti in np.linspace(curr_len, TRAJ_LEN - 1, TRAJ_LEN - curr_len):
            state_trajs[ct].append(last)

    state_trajs = np.array(state_trajs)
    traj_len = len(state_trajs[0])
    print "num_traj: ", num_traj, " traj_len: ", traj_len, " reward: ", np.average(rewards)

    """
    fig = figure(3)
    fig.add_subplot(111, aspect='equal')
    # create a hexbin map now for each trajectory
    for i in range(len(state_trajs)):
        curr_traj = np.array([ [state_array[x,0], state_array[x,1]] for x in state_trajs[i] ] )
        clf()
        scatter( curr_traj[:,0], curr_traj[:,1], marker='o', c='y', s= 25, alpha=0.7)
        #hexbin(curr_traj[:,0], curr_traj[:,1], gridsize=10, cmap=cm.get_cmap('Jet'), alpha=0.9, mincnt=1)
        fig.savefig("movie/"+str(i)+".png")
    """
    """
    if NUM_DIM==2:
        fig = figure(1)
        ax = fig.add_subplot(111, aspect='equal')
        for i in range(len(state_trajs)):
            curr_traj = np.array([ [state_array[x,i] for i in range(NUM_DIM)] for x in state_trajs[i] ] )
            plot(curr_traj[:,0], curr_traj[:,1], 'b-', lw=0.5, alpha=0.2)
            circle = Circle( (curr_traj[0,0], curr_traj[0,1]), 0.01, fc='red', alpha = 0.4)
            ax.add_patch(circle)
            circle = Circle( (curr_traj[traj_len-1,0], curr_traj[traj_len-1,1]), 0.01, fc='green', alpha = 0.4)
            ax.add_patch(circle)
    """

    fig = figure(2)
    state_traj_x = []
    state_traj_y = []
    for i in range(num_traj):
        curr_traj = np.array([[state_array[x, j] for j in range(NUM_DIM)]
            for x in state_trajs[i]])
        tmp = np.array([state_array[x, 0] for x in state_trajs[i]])
        state_traj_x.append(tmp)
        if NUM_DIM == 2:
            tmp = np.array([state_array[x, 1] for x in state_trajs[i]])
            state_traj_y.append(tmp)
        #subplot(111)
        #plot(curr_traj[:,0], 'b-', lw=0.5, alpha=0.10)
        #subplot(212)
        #plot(curr_traj[:,1], 'ro', lw=0.5, alpha=0.05)

    state_traj_x = np.array(state_traj_x)
    state_traj_y = np.array(state_traj_y)
    print state_traj_x.shape, state_traj_y.shape

    # per-timestep percentiles across trajectories
    state_traj_x_percentile_10 = np.array(
        [mlab.prctile(state_traj_x[:, i], p=10) for i in range(TRAJ_LEN)])
    state_traj_x_percentile_50 = np.array(
        [mlab.prctile(state_traj_x[:, i], p=50) for i in range(TRAJ_LEN)])
    state_traj_x_percentile_90 = np.array(
        [mlab.prctile(state_traj_x[:, i], p=90) for i in range(TRAJ_LEN)])
    state_traj_x_percentile = np.array(
        [state_traj_x_percentile_10, state_traj_x_percentile_90])

    if NUM_DIM == 2:
        state_traj_y_percentile_10 = np.array(
            [mlab.prctile(state_traj_y[:, i], p=10) for i in range(TRAJ_LEN)])
        state_traj_y_percentile_90 = np.array(
            [mlab.prctile(state_traj_y[:, i], p=90) for i in range(TRAJ_LEN)])
        state_traj_y_percentile = np.array(
            [state_traj_y_percentile_10, state_traj_y_percentile_90])

        subplot(211)
        plot(holding_time * np.linspace(0, TRAJ_LEN, num=TRAJ_LEN),
            np.average(state_traj_x, axis=0), 'b-', label='mean')
        plot(holding_time * np.linspace(0, TRAJ_LEN, num=TRAJ_LEN),
            state_traj_x_percentile_10, 'b--', label='10/90 percentile')
        plot(holding_time * np.linspace(0, TRAJ_LEN, num=TRAJ_LEN),
            state_traj_x_percentile_90, 'b--')
        legend()
        grid()
        ylabel('x (t)')
        xlabel('t [s]')
        axis('tight')

        subplot(212)
        plot(holding_time * np.linspace(0, TRAJ_LEN, num=TRAJ_LEN),
            np.average(state_traj_y, axis=0), 'b-', label='mean')
        plot(holding_time * np.linspace(0, TRAJ_LEN, num=TRAJ_LEN),
            state_traj_y_percentile_10, 'b--', label='10/90 percentile')
        plot(holding_time * np.linspace(0, TRAJ_LEN, num=TRAJ_LEN),
            state_traj_y_percentile_90, 'b--')
        legend()
        grid()
        ylabel('y (t)')
        xlabel('t [s]')
        axis('tight')

    elif NUM_DIM == 1:
        subplot(111)
        #plot( holding_time*np.linspace(0,TRAJ_LEN,num=TRAJ_LEN), np.average(state_traj_x, axis=0), 'b-', label='mean')
        plot(holding_time * np.linspace(0, TRAJ_LEN, num=TRAJ_LEN),
            state_traj_x_percentile_10, 'b--', label='10/50/90 percentile')
        plot(holding_time * np.linspace(0, TRAJ_LEN, num=TRAJ_LEN),
            state_traj_x_percentile_90, 'b--')
        plot(holding_time * np.linspace(0, TRAJ_LEN, num=TRAJ_LEN),
            state_traj_x_percentile_50, 'b--')
        legend()
        grid()
        xlabel('t [s]')
        axis('tight')
def imshow(arr, x=None, ax=None, vmin=None, vmax=None,
    percentile=True, strip=False, features=None, conf=0.95,
    line_kwargs=None, sort_by=None, fill_kwargs=None, figsize=(5, 12),
    width_ratios=(4, 1), height_ratios=(4, 1),
    subplot_params=dict(wspace=0.1, hspace=0.1),
    imshow_kwargs=None):
    """
    Parameters
    ----------
    arr : array-like

    x : 1D array
        X values to use.  If None, use range(arr.shape[1])

    ax : matplotlib.Axes
        If not None, then only plot the array on the provided axes.  This
        will ignore any additional arguments provided that apply to
        figure-level configuration or to the average line plot.  For
        example, `figsize`, `width_ratios`, `height_ratios`,
        `subplot_params`, `line_kwargs`, and `fill_kwargs` will be ignored.

    vmin, vmax : float

    percentile : bool
        If True, then treat values for `vmin` and `vmax` as percentiles
        rather than absolute values.

    strip : bool
        Include a strip plot alongside the array

    features : pybedtools.BedTool or string filename
        Features used to construct the array

    sort_by : array-like
        Use the provided array to sort the array (e.g., expression).
        This array is argsorted to get the proper order.

    line_kwargs, fill_kwargs : dict
        Passed directly to `ci_plot`.

    figsize : tuple
        (Width, height) of the figure to create.
    """
    if ax is None:
        fig = new_shell(
            figsize=figsize,
            strip=strip,
            subplot_params=subplot_params,
            width_ratios=width_ratios,
            height_ratios=height_ratios)

    if x is None:
        x = np.arange(arr.shape[1])

    if percentile:
        if vmin is None:
            vmin = arr.min()
        else:
            vmin = mlab.prctile(arr.ravel(), vmin)
        if vmax is None:
            vmax = arr.max()
        else:
            vmax = mlab.prctile(arr.ravel(), vmax)
    else:
        if vmin is None:
            vmin = arr.min()
        if vmax is None:
            vmax = arr.max()

    if imshow_kwargs is None:
        imshow_kwargs = {}

    cmap = colormap_adjust.smart_colormap(vmin, vmax)

    if sort_by is not None:
        ind = np.argsort(sort_by)
    else:
        ind = np.arange(arr.shape[0])

    if ax is None:
        array_ax = fig.array_axes
    else:
        array_ax = ax

    mappable = array_ax.imshow(
        arr[ind, :],
        aspect='auto',
        cmap=cmap,
        vmin=vmin,
        vmax=vmax,
        origin='lower',
        extent=(x.min(), x.max(), 0, arr.shape[0]),
        **imshow_kwargs
    )

    if ax is None:
        plt.colorbar(mappable, fig.cax)
        ci_plot(
            x,
            arr,
            ax=fig.line_axes,
            line_kwargs=line_kwargs,
            fill_kwargs=fill_kwargs,
        )
        return fig
    else:
        return ax.figure
def input_ip_plots(iparr, inputarr, diffed, x, sort_ind,
    prefix=None, limits1=(None, None), limits2=(None, None),
    hlines=None, vlines=None):
    """
    All-in-one plotting function to make a 5-panel figure.

    Panels are IP, input, and diffed; plus 2 line plots showing averages.

    :param iparr, inputarr: NumPy arrays constructed by a genomic_signal
        object
    :param diffed: Difference of `iparr` and `inputarr`, but can be some
        other transformation.
    :param x: Extent to use -- for TSSs, maybe something like
        np.linspace(-1000, 1000, bins), or for just bin IDs, something like
        `np.arange(bins)`.
    :param sort_ind: row order for each of the 3 panels -- usually
        interesting to use `clustered_sortind` or `tip_zscores`
    :param prefix: Used to prefix plot titles with '%(prefix)s IP', etc
    :param limits1: Tuple passed to the Normalize function for IP and input.
    :param limits2: Tuple passed to the Normalize function for the diffed
        array
    :param hlines: List of (position, kwarg) tuples for plotting horizontal
        lines.  Kwargs are passed directly to axhline.  Useful for
        delimiting clusters, if you used `clustered_sortind` and have both
        `row_order` and `breaks`.
    :param vlines: List of (position, kwargs) tuples.  A vertical line will
        be plotted at each position using kwargs.
    """
    # global min and max
    gmin = min(iparr.min(), inputarr.min())
    gmax = max(iparr.max(), inputarr.max())

    fig = plt.figure(figsize=(10, 10))

    # 3 arrays, 2 line plots, a gene strip, and 2 colorbars.  Plots share
    # the axes that make sense.
    #
    # 3 arrays
    ax1 = plt.subplot2grid(
        (9, 9), (0, 0), colspan=3, rowspan=6)
    ax2 = plt.subplot2grid(
        (9, 9), (0, 3), colspan=3, rowspan=6, sharex=ax1, sharey=ax1)
    ax3 = plt.subplot2grid(
        (9, 9), (0, 6), colspan=3, rowspan=6, sharex=ax1, sharey=ax1)

    # 2 line plots
    ax4 = plt.subplot2grid((9, 9), (6, 3), colspan=3, rowspan=3, sharex=ax1)
    ax5 = plt.subplot2grid((9, 9), (6, 6), colspan=3, rowspan=3, sharex=ax1)

    # 2 colorbars
    cax1 = plt.Axes(fig, rect=(0.05, 0.25, 0.25, 0.025))
    cax2 = plt.Axes(fig, rect=(0.05, 0.15, 0.25, 0.025))

    # For nice imshow axes
    extent = (min(x), max(x), 0, diffed.shape[0])

    cm = matplotlib.cm.gist_gray
    cm.set_bad('k')
    cm.set_over('r')
    cm.set_under('b')

    limits1 = list(limits1)
    limits2 = list(limits2)
    all_base = np.column_stack((iparr.ravel(), inputarr.ravel())).ravel()
    if limits1[0] is None:
        limits1[0] = mlab.prctile(
            all_base, 1. / all_base.size)
    if limits1[1] is None:
        limits1[1] = mlab.prctile(
            all_base, 100 - 1. / all_base.size)
    if limits2[0] is None:
        limits2[0] = mlab.prctile(
            diffed.ravel(), 1. / all_base.size)
    if limits2[1] is None:
        limits2[1] = mlab.prctile(
            diffed.ravel(), 100 - 1. / all_base.size)

    del all_base

    imshow_kwargs = dict(
        interpolation='nearest',
        aspect='auto',
        cmap=cm,
        norm=matplotlib.colors.Normalize(*limits1),
        extent=extent,
        origin='lower')

    # modify kwargs for diffed (by changing the normalization)
    diffed_kwargs = imshow_kwargs.copy()
    diffed_kwargs['norm'] = matplotlib.colors.Normalize(*limits2)

    # IP
    mappable1 = ax1.imshow(iparr[sort_ind, :], **imshow_kwargs)

    # input
    mappable2 = ax2.imshow(inputarr[sort_ind, :], **imshow_kwargs)

    # diffed
    mappable3 = ax3.imshow((diffed)[sort_ind, :], **diffed_kwargs)

    # IP and input line plot with vertical line
    ax4.plot(x, inputarr.mean(axis=0), color='k', linestyle='--',
        label='input')
    ax4.plot(x, iparr.mean(axis=0), color='k', label='ip')
    ax4.axvline(0, color='k', linestyle=':')

    # Diffed line plot with vertical line
    ax5.plot(x, diffed.mean(axis=0), 'k', label='enrichment')
    ax5.axvline(0, color='k', linestyle=':')

    # Colorbars
    cbar1 = fig.colorbar(mappable1, cax1, orientation='horizontal')
    cbar2 = fig.colorbar(mappable3, cax2, orientation='horizontal')
    fig.add_axes(cax1)
    fig.add_axes(cax2)

    # labeling...
    ax1.set_ylabel('features')
    plt.setp(ax2.get_yticklabels(), visible=False)
    plt.setp(ax3.get_yticklabels(), visible=False)
    ax4.set_xlabel('bp')
    ax4.set_ylabel('mean reads per million mapped reads')
    ax5.set_xlabel('bp')
    cax1.set_xlabel('Reads per million mapped reads')
    cax2.set_xlabel('Enrichment (RPMMR)')

    if prefix is None:
        prefix = ""
    ax1.set_title('%s IP' % prefix)
    ax2.set_title('%s input' % prefix)
    ax3.set_title('Difference')

    # diffed line plot should have y ax on right
    ax5.yaxis.set_ticks_position('right')
    ax5.yaxis.set_label_position('right')
    ax5.set_ylabel('enriched reads per million mapped reads')

    # Legends
    ax4.legend(loc='best', frameon=False)
    ax5.legend(loc='best', frameon=False)

    # Make sure everybody snaps to xmin/xmax
    for ax in [ax1, ax2, ax3, ax4, ax5]:
        ax.axis(xmin=extent[0], xmax=extent[1])

    if not hlines:
        hlines = []
    if not vlines:
        vlines = []

    for ax in [ax1, ax2, ax3]:
        for pos, kwargs in hlines:
            ax.axhline(pos, **kwargs)
        for pos, kwargs in vlines:
            ax.axvline(pos, **kwargs)

    fig.subplots_adjust(bottom=0.05, top=0.95, hspace=0.75, wspace=0.9)

    return fig
def boxplot(x, notch=0, sym='b+', positions=None, widths=None):
    """Makes a box and whisker plot.

    Adapted from matplotlib.axes 0.98.5.2
    Modified such that the caps are set to the 10th and 90th percentiles,
    and to have some control on the colors.

    call signature::

      boxplot(x, notch=0, sym='+', positions=None, widths=None)

    Make a box and whisker plot for each column of *x* or each vector in
    sequence *x*.  The box extends from the lower to upper quartile values
    of the data, with a line at the median.  The whiskers extend from the
    box to show the range of the data.  Flier points are those past the
    end of the whiskers.

    - *notch* = 0 (default) produces a rectangular box plot.
    - *notch* = 1 will produce a notched box plot

    *sym* (default 'b+') is the default symbol for flier points.  Enter an
    empty string ('') if you don't want to show fliers.

    *whis* (default 1.5) defines the length of the whiskers as a function
    of the inner quartile range.  They extend to the most extreme data
    point within ( ``whis*(75%-25%)`` ) data range.

    *positions* (default 1,2,...,n) sets the horizontal positions of the
    boxes.  The ticks and limits are automatically set to match the
    positions.

    *widths* is either a scalar or a vector and sets the width of each
    box.  The default is 0.5, or ``0.15*(distance between extreme
    positions)`` if that is smaller.

    *x* is an array or a sequence of vectors.

    Returns a dictionary mapping each component of the boxplot to a list
    of the :class:`matplotlib.lines.Line2D` instances created.

    Copyright (c) 2002-2009 John D. Hunter; All Rights Reserved
    """
    whiskers, caps, boxes, medians, fliers = [], [], [], [], []

    # convert x to a list of vectors
    if hasattr(x, 'shape'):
        if len(x.shape) == 1:
            if hasattr(x[0], 'shape'):
                x = list(x)
            else:
                x = [x, ]
        elif len(x.shape) == 2:
            nr, nc = x.shape
            if nr == 1:
                x = [x]
            elif nc == 1:
                x = [x.ravel()]
            else:
                x = [x[:, i] for i in xrange(nc)]
        else:
            raise ValueError, "input x can have no more than 2 dimensions"
    if not hasattr(x[0], '__len__'):
        x = [x]
    col = len(x)

    # get some plot info
    if positions is None:
        positions = range(1, col + 1)
    if widths is None:
        distance = max(positions) - min(positions)
        widths = min(0.15 * max(distance, 1.0), 0.5)
    if isinstance(widths, float) or isinstance(widths, int):
        widths = numpy.ones((col, ), float) * widths

    # loop through columns, adding each to plot
    for i, pos in enumerate(positions):
        d = numpy.ravel(x[i])
        row = len(d)
        # get median and quartiles; caps at the 10th and 90th percentiles
        wisk_lo, q1, med, q3, wisk_hi = mlab.prctile(d, [10, 25, 50, 75, 90])
        # get high extreme
        #iq = q3 - q1
        #hi_val = q3 + whis*iq
        #wisk_hi = numpy.compress( d <= hi_val , d )
        #if len(wisk_hi) == 0:
        #    wisk_hi = q3
        #else:
        #    wisk_hi = max(wisk_hi)
        ## get low extreme
        #lo_val = q1 - whis*iq
        #wisk_lo = numpy.compress( d >= lo_val, d )
        #if len(wisk_lo) == 0:
        #    wisk_lo = q1
        #else:
        #    wisk_lo = min(wisk_lo)

        # get fliers - if we are showing them
        flier_hi = []
        flier_lo = []
        flier_hi_x = []
        flier_lo_x = []
        if len(sym) != 0:
            flier_hi = numpy.compress(d > wisk_hi, d)
            flier_lo = numpy.compress(d < wisk_lo, d)
            flier_hi_x = numpy.ones(flier_hi.shape[0]) * pos
            flier_lo_x = numpy.ones(flier_lo.shape[0]) * pos

        # get x locations for fliers, whisker, whisker cap and box sides
        box_x_min = pos - widths[i] * 0.5
        box_x_max = pos + widths[i] * 0.5
        wisk_x = numpy.ones(2) * pos
        cap_x_min = pos - widths[i] * 0.25
        cap_x_max = pos + widths[i] * 0.25
        cap_x = [cap_x_min, cap_x_max]

        # get y location for median
        med_y = [med, med]

        # calculate 'regular' plot
        if notch == 0:
            # make our box vectors
            box_x = [box_x_min, box_x_max, box_x_max, box_x_min, box_x_min]
            box_y = [q1, q1, q3, q3, q1]
            # make our median line vectors
            med_x = [box_x_min, box_x_max]
        # calculate 'notch' plot
        else:
            raise NotImplementedError
            notch_max = med  #+ 1.57*iq/numpy.sqrt(row)
            notch_min = med  #- 1.57*iq/numpy.sqrt(row)
            if notch_max > q3:
                notch_max = q3
            if notch_min < q1:
                notch_min = q1
            # make our notched box vectors
            box_x = [box_x_min, box_x_max, box_x_max, cap_x_max, box_x_max,
                     box_x_max, box_x_min, box_x_min, cap_x_min, box_x_min,
                     box_x_min]
            box_y = [q1, q1, notch_min, med, notch_max, q3, q3, notch_max,
                     med, notch_min, q1]
            # make our median line vectors
            med_x = [cap_x_min, cap_x_max]
            med_y = [med, med]

        doplot = plt.plot
        whiskers.extend(doplot(wisk_x, [q1, wisk_lo],
                               color=whiskerscolor, linestyle='--'))
        whiskers.extend(doplot(wisk_x, [q3, wisk_hi],
                               color=whiskerscolor, linestyle='--'))
        caps.extend(doplot(cap_x, [wisk_hi, wisk_hi],
                           color=capscolor, linestyle='-'))
        caps.extend(doplot(cap_x, [wisk_lo, wisk_lo],
                           color=capscolor, linestyle='-'))
        boxes.extend(doplot(box_x, box_y, color=boxescolor, linestyle='-'))
        medians.extend(doplot(med_x, med_y, color=medianscolor,
                              linestyle='-'))
        fliers.extend(doplot(flier_hi_x, flier_hi, sym,
                             flier_lo_x, flier_lo, sym))

    # fix our axes/ticks up a little
    newlimits = min(positions) - 0.5, max(positions) + 0.5
    plt.gca().set_xlim(newlimits)
    plt.gca().set_xticks(positions)

    return dict(whiskers=whiskers, caps=caps, boxes=boxes,
                medians=medians, fliers=fliers)
def calculate_limits(array_dict, method='global', percentiles=None, limit=()):
    """
    Calculate limits for a group of arrays in a flexible manner.

    Returns a dictionary of calculated (vmin, vmax), with the same keys as
    `array_dict`.

    Useful for plotting heatmaps of multiple datasets, and the vmin/vmax
    values of the colormaps need to be matched across all (or a subset) of
    heatmaps.

    Parameters
    ----------
    array_dict : dict of np.arrays

    method : {'global', 'independent', callable}
        If method="global", then use the global min/max values across all
        arrays in array_dict.  If method="independent", then each array
        will have its own min/max calculated.  If a callable, then it will
        be used to group the keys of `array_dict`, and each group will
        have its own group-wise min/max calculated.

    limit : tuple, optional
        Tuple of 2 scalars passed directly to matplotlib.mlab.prctile to
        limit the calculation of the percentile.

    percentiles : None or list
        If not None, a list of (lower, upper) percentiles in the range
        [0,100].
    """
    if percentiles is not None:
        for percentile in percentiles:
            if not 0 <= percentile <= 100:
                raise ValueError(
                    "percentile (%s) not between [0, 100]" % percentile)

    if method == 'global':
        all_arrays = np.concatenate(
            [i.ravel() for i in array_dict.itervalues()]
        )
        if percentiles:
            vmin = mlab.prctile(
                all_arrays, percentiles[0], limit=limit)
            vmax = mlab.prctile(
                all_arrays, percentiles[1], limit=limit)
        else:
            vmin = all_arrays.min()
            vmax = all_arrays.max()
        d = dict([(i, (vmin, vmax)) for i in array_dict.keys()])

    elif method == 'independent':
        d = {}
        for k, v in array_dict.iteritems():
            d[k] = (v.min(), v.max())

    elif hasattr(method, '__call__'):
        d = {}
        sorted_keys = sorted(array_dict.keys(), key=method)
        for group, keys in groupby(sorted_keys, method):
            keys = list(keys)
            all_arrays = np.concatenate([array_dict[i] for i in keys])
            if percentiles:
                vmin = mlab.prctile(
                    all_arrays, percentiles[0], limit=limit)
                vmax = mlab.prctile(
                    all_arrays, percentiles[1], limit=limit)
            else:
                vmin = all_arrays.min()
                vmax = all_arrays.max()
            for key in keys:
                d[key] = (vmin, vmax)

    return d
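# Usage sketch: share one (vmin, vmax) across two heatmap arrays (synthetic
# data; note the percentiles path additionally assumes an mlab.prctile that
# accepts the `limit` keyword, as the function above expects):
import numpy as np

arrays = {'ip': np.random.rand(100, 50), 'input': 2 * np.random.rand(100, 50)}
limits = calculate_limits(arrays, method='global')
vmin, vmax = limits['ip']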
def ssi_scatter(timelock, iter=100):
    from myutils import bootstrap
    from matplotlib.mlab import prctile, find

    fig = plt.figure()
    ax = fig.add_subplot(111)

    storeSSI = {'PG': [], 'FG': []}
    mem = {'PG response': 'PG', 'response': 'FG'}
    units = timelock.units

    # For each unit, compute the SSI for PG and FG
    for unit in units:
        data = timelock.get(unit)
        # For now, I only want to look at hit-hit trials
        select = (data['PG outcome'] == consts['HIT']) & \
            (data['outcome'] == consts['HIT'])
        trials = data[select]
        inter = interval(unit, trials, 'PG out', 'onset')
        counts = inter[unit.id].map(len)
        rates = counts / (inter['onset'] - inter['PG out'])
        for goal in mem.keys():
            input = DataFrame({'rates': rates, 'stimulus': trials[goal]})
            storeSSI[mem[goal]].append(bootstrap(input, ssi, iters=iter))

    meanSSI = dict.fromkeys(storeSSI.keys())
    intervSSI = dict.fromkeys(storeSSI.keys())
    for key, ssis in storeSSI.iteritems():
        # Calculate the means of the bootstrapped SSIs
        meanSSI[key] = [np.mean(unitSSI) for unitSSI in ssis]
        # Calculate the 95% confidence intervals of the bootstrapped SSIs
        intervSSI[key] = [prctile(unitSSI, p=(2.5, 97.5)) for unitSSI in ssis]

    # Now let's check for significance
    sig = dict.fromkeys(meanSSI.keys())

    def check_between(check, between):
        is_it = (between[0] <= check) & (between[1] >= check)
        return is_it

    for key, iSSIs in intervSSI.iteritems():
        sig[key] = np.array([not check_between(0, issi) for issi in iSSIs])
    not_sig = [not (pg | fg) for pg, fg in zip(sig['PG'], sig['FG'])]
    not_sig = np.array(not_sig)

    sig_colors = {'PG': 'r', 'FG': 'b'}
    xpnts = np.array(meanSSI['PG'])
    ypnts = np.array(meanSSI['FG'])
    xbars = np.abs(np.array(intervSSI['PG']).T - xpnts)
    ybars = np.abs(np.array(intervSSI['FG']).T - ypnts)

    # First, plot the not significant units
    ax.errorbar(xpnts[not_sig], ypnts[not_sig],
        yerr=ybars[:, not_sig], xerr=xbars[:, not_sig],
        fmt='o', color='grey')

    # Then plot things that are significant for PG and FG
    for key in sig.iterkeys():
        if sig[key].any():
            ax.errorbar(xpnts[sig[key]], ypnts[sig[key]],
                yerr=ybars[:, sig[key]], xerr=xbars[:, sig[key]],
                fmt='o', color=sig_colors[key])

    xs = ax.get_xlim()
    ys = ax.get_ylim()
    ax.plot(xs, [0, 0], '-k')
    ax.plot([0, 0], ys, '-k')
    ax.plot([-10, 10], [-10, 10], '--', color='grey')
    ax.set_xlabel('PG SSI')
    ax.set_ylabel('FG SSI')
    ax.set_xlim(xs)
    ax.set_ylim(ys)
    ax.set_aspect('equal')
    #fig.show()

    return sig, not_sig
def make_tss_plot(bam_file, tss, prefix, chromsizes, read_len, bins=400,
                  bp_edge=2000, processes=8, greenleaf_norm=True):
    '''
    Take bootstraps, generate tss plots, and get a mean and
    standard deviation on the plot. Produces 2 plots. One is the
    aggregation plot alone, while the other also shows the signal
    at each TSS ordered by strength.
    '''
    logging.info('Generating tss plot...')
    tss_plot_file = '{0}_tss-enrich.png'.format(prefix)
    tss_plot_data_file = '{0}_tss-enrich.txt'.format(prefix)
    tss_plot_large_file = '{0}_large_tss-enrich.png'.format(prefix)

    # Load the TSS file
    tss = pybedtools.BedTool(tss)
    tss_ext = tss.slop(b=bp_edge, g=chromsizes)

    # Load the bam file
    # Need to shift reads and just get ends, just load bed file?
    bam = metaseq.genomic_signal(bam_file, 'bam')
    bam_array = bam.array(tss_ext, bins=bins,
                          shift_width=-read_len / 2,  # Shift to center the read on the cut site
                          processes=processes, stranded=True)

    # Actually first build an "ends" file
    #get_ends = '''zcat {0} | awk -F '\t' 'BEGIN {{OFS="\t"}} {{if ($6 == "-") {{$2=$3-1; print}} else {{$3=$2+1; print}} }}' | gzip -c > {1}_ends.bed.gz'''.format(bed_file, prefix)
    #print(get_ends)
    #os.system(get_ends)
    #bed_reads = metaseq.genomic_signal('{0}_ends.bed.gz'.format(prefix), 'bed')
    #bam_array = bed_reads.array(tss_ext, bins=bins,
    #                            processes=processes, stranded=True)

    # Normalization (Greenleaf style): Find the avg height
    # at the end bins and take fold change over that
    if greenleaf_norm:
        # Use enough bins to cover 100 bp on either end
        num_edge_bins = int(100 / (2 * bp_edge / bins))
        bin_means = bam_array.mean(axis=0)
        avg_noise = (sum(bin_means[:num_edge_bins]) +
                     sum(bin_means[-num_edge_bins:])) / (2 * num_edge_bins)
        bam_array /= avg_noise
    else:
        bam_array /= bam.mapped_read_count() / 1e6

    # Generate a line plot
    fig = plt.figure()
    ax = fig.add_subplot(111)
    x = np.linspace(-bp_edge, bp_edge, bins)

    ax.plot(x, bam_array.mean(axis=0), color='r', label='Mean')
    ax.axvline(0, linestyle=':', color='k')

    # Note the middle high point (TSS)
    tss_point_val = max(bam_array.mean(axis=0))

    ax.set_xlabel('Distance from TSS (bp)')
    ax.set_ylabel('Average read coverage (per million mapped reads)')
    ax.legend(loc='best')
    fig.savefig(tss_plot_file)

    # Print a more complicated plot with lots of info
    # write the plot data; numpy object
    np.savetxt(tss_plot_data_file, bam_array.mean(axis=0), delimiter=",")

    # Find a safe upper percentile - we can't use X if the Xth percentile is 0
    upper_prct = 99
    if mlab.prctile(bam_array.ravel(), upper_prct) == 0.0:
        upper_prct = 100.0

    plt.rcParams['font.size'] = 8
    fig = metaseq.plotutils.imshow(bam_array,
                                   x=x,
                                   figsize=(5, 10),
                                   vmin=5, vmax=upper_prct, percentile=True,
                                   line_kwargs=dict(color='k', label='All'),
                                   fill_kwargs=dict(color='k', alpha=0.3),
                                   sort_by=bam_array.mean(axis=1))

    # And save the file
    fig.savefig(tss_plot_large_file)

    return tss_plot_file, tss_plot_large_file, tss_point_val
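# Why the guard on upper_prct matters: if the chosen upper percentile of a
# sparse signal is 0, a percentile-based vmax collapses the color scale.
# A minimal sketch of the same check (helper name is illustrative,
# np.percentile replaces mlab.prctile):
import numpy as np

def safe_upper_percentile(arr, prct=99):
    # Fall back to the true maximum (the 100th percentile) for sparse arrays
    # whose prct-th percentile is exactly zero.
    if np.percentile(arr.ravel(), prct) == 0.0:
        return 100.0
    return prct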
def imshow(arr, x=None, ax=None, vmin=None, vmax=None, percentile=True,
           strip=False, features=None, conf=0.95, sort_by=None,
           line_kwargs=None, fill_kwargs=None, imshow_kwargs=None,
           figsize=(5, 12), width_ratios=(4, 1), height_ratios=(4, 1),
           subplot_params=dict(wspace=0.1, hspace=0.1),
           subset_by=None, subset_order=None):
    """
    Do-it-all function to help with plotting heatmaps

    Parameters
    ----------
    arr : array-like

    x : 1D array
        X values to use. If None, use range(arr.shape[1])

    ax : matplotlib.Axes
        If not None, then only plot the array on the provided axes. This
        will ignore any additional arguments provided that apply to
        figure-level configuration or to the average line plot. For example,
        `figsize`, `width_ratios`, `height_ratios`, `subplot_params`,
        `line_kwargs`, and `fill_kwargs` will all be ignored.

    vmin, vmax : float

    percentile : bool
        If True, then treat values for `vmin` and `vmax` as percentiles
        rather than absolute values.

    strip : bool
        Include a strip plot alongside the array

    features : pybedtools.BedTool or string filename
        Features used to construct the array

    conf : float
        Confidence interval to use in line plot.

    sort_by : array-like
        Use the provided array to sort the array (e.g., an array of
        expression values). This array will be argsorted to get the proper
        order.

    line_kwargs, fill_kwargs : dict
        Passed directly to `ci_plot`.

    figsize : tuple
        (Width, height) of the figure to create.

    imshow_kwargs : dict
        Passed directly to matplotlib.pyplot.imshow. By default, arguments
        used are `origin='lower'`, `aspect="auto"` and a colormap from
        colormap_adjust.smart_colormap generated using the provided `vmin`
        and `vmax`.

    width_ratios, height_ratios : tuple
        These tuples are passed to the `new_shell` function. The default
        values set up a 2x2 configuration of panels for heatmap, line plot,
        colorbar axes, and optional strip plot. However, modifying
        `width_ratios` or `height_ratios` can be used to create more or
        fewer panels.

    subplot_params : dict
        Passed to Figure.subplots_adjust

    subset_by : array
        An array of any type (but usually int or str) that contains a class
        label for each row in the heatmap array. For example, to subset by
        expression, an array of the values "up", "down", or "unchanged" at
        each of the positions could be provided. Note that the heatmap array
        is first sorted by `sort_by` and then split into groups according to
        `subset_by`, so each subset remains sorted by `sort_by`.

    subset_order : list-like
        This provides the order in which the subsets are plotted. Since the
        default imshow arguments contain `origin="lower"`, these will be
        plotted in order starting at the bottom of the heatmap.
    """
    if ax is None:
        fig = new_shell(
            figsize=figsize,
            strip=strip,
            subplot_params=subplot_params,
            width_ratios=width_ratios,
            height_ratios=height_ratios)

    if x is None:
        x = np.arange(arr.shape[1] + 1)

    if percentile:
        if vmin is None:
            vmin = arr.min()
        else:
            vmin = mlab.prctile(arr.ravel(), vmin)
        if vmax is None:
            vmax = arr.max()
        else:
            vmax = mlab.prctile(arr.ravel(), vmax)
    else:
        if vmin is None:
            vmin = arr.min()
        if vmax is None:
            vmax = arr.max()

    cmap = colormap_adjust.smart_colormap(vmin, vmax)
    _imshow_kwargs = dict(origin='lower', cmap=cmap, vmin=vmin,
                          vmax=vmax, aspect='auto')
    if imshow_kwargs is not None:
        _imshow_kwargs.update(imshow_kwargs)

    # previously we did an argsort first; with subsetting we don't want to
    # do that yet....
    #if sort_by is not None:
    #    ind = np.argsort(sort_by)
    #else:
    #    ind = np.arange(arr.shape[0])

    if sort_by is None:
        sort_by = np.arange(arr.shape[0])

    if ax is None:
        array_ax = fig.array_axes
    else:
        array_ax = ax

    # If not provided, assume all in the same subset.
    if subset_by is None:
        subset_by = np.zeros(arr.shape[0])

    # Ensure always array, since we're doing indexing tricks
    if not isinstance(subset_by, np.ndarray):
        subset_by = np.array(subset_by)

    # If not provided, use sorted order
    if subset_order is None:
        subset_order = sorted(np.unique(subset_by))

    inds = []
    for cls in subset_order:
        subset_ind = np.nonzero(subset_by == cls)[0]
        subset_sort_by = sort_by[subset_ind]
        subset_argsort_by = np.argsort(subset_sort_by)
        inds.append(subset_ind[subset_argsort_by])
    ind = np.concatenate(inds)

    mappable = array_ax.imshow(
        arr[ind, :],
        extent=(x.min(), x.max(), 0, arr.shape[0]),
        **_imshow_kwargs
    )

    if line_kwargs is None:
        line_kwargs = {}
    if fill_kwargs is None:
        fill_kwargs = {}

    if isinstance(line_kwargs, dict):
        line_kwargs = [line_kwargs]
    if isinstance(fill_kwargs, dict):
        fill_kwargs = [fill_kwargs]

    _line_kwargs = itertools.cycle(line_kwargs)
    _fill_kwargs = itertools.cycle(fill_kwargs)

    if ax is None:
        plt.colorbar(mappable, fig.cax)
        for subset_ind, label, _lkw, _fkw in zip(inds, subset_order,
                                                 _line_kwargs, _fill_kwargs):
            ci_plot(
                x,
                arr[subset_ind],
                ax=fig.line_axes,
                line_kwargs=_lkw,
                fill_kwargs=_fkw,
            )
        return fig
    else:
        return ax.figure
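# A hedged usage sketch for the function above, assuming metaseq and the
# helpers it relies on (new_shell, colormap_adjust, ci_plot) are importable.
# With percentile=True, vmin=5 / vmax=99 are read as the 5th and 99th
# percentiles of arr.ravel(), which keeps a few extreme pixels from washing
# out the color scale; the synthetic array is a stand-in:
import numpy as np

arr = np.random.RandomState(1).lognormal(size=(200, 100))  # stand-in signal
x = np.linspace(-1000, 1000, arr.shape[1])
fig = imshow(arr, x=x, vmin=5, vmax=99, percentile=True,
             sort_by=arr.mean(axis=1))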
#initializing function constant variables
N = Nmarkets * Nproducts
tolerance = 0.001
nogradient = 0

thetaspost = hlp.HMCMC(
    lambda theta: hlp.computeGMMobjective(
        theta, simshare, simoutshare, cdindex, weights, price, X, IV,
        vdraws, Nproducts, N, tolerance, nogradient),
    theta0, B)

posteriormeanpost = np.mean(thetaspost[t - 1:, :], axis=0)
posteriormedianpost = np.median(thetaspost[t - 1:, :], axis=0)
thetasdemedianed = np.abs(thetaspost[t - 1:, :] - (np.ones(
    (B - t + 1, 1)) @ posteriormedianpost[:, None].T))
criticalvaluesymmetricpost = matlab.prctile(thetasdemedianed,
                                            100 * (1 - alpha))
posteriorquantilealpha2post = matlab.prctile(thetaspost[t - 1:, :],
                                             100 * alpha / 2)
posteriorquantileoneminusalpha2post = matlab.prctile(thetaspost[t - 1:, :],
                                                     100 * (1 - alpha / 2))

### STANDARD ERRORS ###
betahat = np.zeros((1, dimX + 1))
for e in range(dimX + 1):
    betanew = posteriormeanpost[e]
    betahat[0, e] = betanew
betahat = betahat.conj().transpose()

theta2hat = np.zeros((1, dimX))
for c in range(dimX):
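# The symmetric band above is the (1 - alpha) percentile of |draws - median|,
# added to and subtracted from the per-parameter medians. A minimal
# self-contained sketch, assuming MATLAB-style column-wise percentiles
# (np.percentile with axis=0 stands in for matlab.prctile):
import numpy as np

alpha = 0.05
draws = np.random.RandomState(2).normal(size=(1000, 3))  # stand-in posterior draws
med = np.median(draws, axis=0)
crit = np.percentile(np.abs(draws - med), 100 * (1 - alpha), axis=0)
band_low, band_high = med - crit, med + crit  # symmetric (1 - alpha) band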
    s.join("lista",
           'lista_qtde_' + datetime.now().strftime("%d%m%Y_%H_%M_%S") +
           '.csv'),
    'w', 'utf-8')
for k, v in assuntos.items():
    if 'valor' in v:
        values.append(v['valor'])
        line = "%s;%s\n" % (k, str(v['valor']))
        print(line)
        handle.write(line)
handle.close()

plt.figure()
#d = np.sort(np.random.randint(0, 1000, 1000)).cumsum()
d = sorted(values)
print(d)

# Percentile values
p = np.array([0.0, 25.0, 50.0, 75.0, 100.0])
perc = mlab.prctile(d, p=p)

plt.plot(d)
# Place red dots on the percentiles
plt.plot((len(d) - 1) * p / 100., perc, 'ro')
# Set tick locations and labels
plt.xticks((len(d) - 1) * p / 100., map(str, p))

plt.savefig(
    s.join("figuras",
           'resultado_perc_' + datetime.now().strftime("%d%m%Y_%H_%M_%S") +
           '.png'))
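# The trick above generalizes: once the data are sorted, the p-th percentile
# (with linear interpolation) sits at fractional index (len(d) - 1) * p / 100,
# so the red dots land exactly on the curve. A self-contained version with
# synthetic data and np.percentile:
import numpy as np
import matplotlib.pyplot as plt

d = np.sort(np.random.RandomState(3).randint(0, 1000, 1000))
p = np.array([0.0, 25.0, 50.0, 75.0, 100.0])
perc = np.percentile(d, p)

plt.plot(d)
plt.plot((len(d) - 1) * p / 100., perc, 'ro')  # dots lie on the sorted curve
plt.xticks((len(d) - 1) * p / 100., [str(v) for v in p])
plt.show()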
def ssa(X, M=None, K=0):
    r"""Performs Singular Spectrum Analysis on time series X with the method
    of Vautard and Ghil, Phys. D. 1989.

    Parameters
    ----------
    X : 1D array
        Vector of evenly spaced observations.
    M : int
        Window length. Default value is M = len(X) / 10.
    K : int
        Number of EOFs used for reconstruction (AICC choice by default, K=0).
        If K = 0, the corrected Akaike Information Criterion (AICC) is used.
        If K = 'mcssa', the Monte Carlo spectral significance estimation of
        Allen & Smith (J Clim, 1996) is used.

    Returns
    -------
    spec : array_like
        Eigenvalue spectrum, in % variance.
    eig_vec : array_like
        Eigenvector matrix ("temporal EOFs").
    PC : array_like
        Matrix of principal components.
    RC : array_like
        Matrix of RCs (N*M, K) (only if K > 0).
    RCp : array_like
        Reconstructed time-series, involving only the modes retained, and
        rescaled to original mean and variance.

    Examples
    --------
    spec, eig_vec, PC, RC, RCp = ssa(X, [M, K])

    Notes
    -----
    Original file hepta_ssa.m from Hepta Technologies, 2004, written in
    MATLAB(TM). Last updated 03/14/2012 to include automated choice for K
    (AICC). Julien Emile-Geay, Lamont Doherty Earth Observatory, Dec 2004.
    """
    X = np.atleast_1d(X)
    if X.ndim > 1:
        raise ValueError("Input vector `X` has more than 1 dimension.")

    N = len(X)

    # Center the series.
    Xr, mu, sigma = standardize(X)  # NOTE: Original calls standardize.m.

    # Set default value for M.
    if not M:
        M = N // 10

    if K == 'mcssa':
        mcssa = True
        MC = 1000
    else:
        mcssa = False
        signif = np.arange(0, K)  # FIXME: 0, K

    Np = N - M + 1

    gam, lags = xcorr(Xr, maxlags=M - 1, matlab_compat='unbiased')

    # Fill in Covariance matrix. Take positive half of auto-correlation
    # diagram, hence M to 2M - 1.
    C = toeplitz(gam[M - 1:2 * M])

    # Solve eigenvalue problem.
    eig_vec, eig_val = eigd(C)  # FIXME: Matlab eig_vec have reversed signs.
    spec = eig_val / np.sum(eig_val)

    # Determine significant eigenvalues.
    if mcssa:
        # NOTE: Got this from: http://www.gps.caltech.edu/~tapio/arfit/
        # But this is commented out in the original code.
        #w, A, C, SBC, FPE, th = arfit(Xr, 1, 1)  # Fit AR(1) model.
        # NOTE: The original code uses ar1.m.
        # What is the difference between ar1.m and arfit.m?
        a, var, _ = ar1(Xr)
        s = np.sqrt(var)
        noise = np.zeros((N, MC))  # BUG FIX: was np.zeros(N, MC).
        noise[0, :] = np.tile(Xr[0], np.r_[1, MC])
        for jt in range(1, N):
            noise[jt, :] = a * noise[jt - 1, :] + s * np.random.randn(1, MC)
        noise, _, _ = standardize(noise)
        Lambda_R = np.zeros_like(MC)  # FIXME: Don't know the right shape yet.
        for m in range(0, MC):
            Gn, ln = xcorr(noise[:, m], M - 1, 'unbiased')
            Cn = toeplitz(Gn[M:2 * M - 1])
            # Noise "eigenvalues".
            tmp = np.dot(eig_vec, Cn)
            Lambda_R[:, m] = np.diag(np.dot(tmp, eig_vec))

        q95 = prctile(Lambda_R, 100 * 0.95)  # FIXME

        # Index of modes rising above the background.
        # BUG FIX: take the index array out of np.where's tuple.
        signif = np.where(eig_val > q95)[0]
        print('MCSSA modes retained: %s' % signif)

        fig, ax = plt.subplots()  # BUG FIX: was `fix, ax`.
        ax.set_title('MCSSA')
        v = np.arange(1, M + 1)  # BUG FIX: was np.arange[1, M + 1].
        ligr = [0.7000, 0.7000, 0.7000]
        lmin = Lambda_R.min(axis=1)
        lmax = Lambda_R.max(axis=1)
        # BUG FIX: MATLAB-style fill() call replaced with fill_between().
        ax.fill_between(v, lmin, lmax, color=ligr, alpha=0.3)
        ax.plot(v, eig_val, 'kx', linewidth=2.0)
        ax.plot(v, q95, 'r-', linewidth=2.0)
    elif K == 0:
        trunc = range(0, len(spec))
        # The pca_truncation_criteria.m original call:
        # [MDL, NE08, AIC, AICC] =
        #     pca_truncation_criteria(eig_val, 1, trunc, N, 1)
        WK85, NE08 = pca_truncation_criteria(eig_val, 1, trunc, N, 1)
        imin = (np.real(NE08['aicc'])).argmin()
        K = trunc[imin]
        print('AICC truncation choice, K = %s' % K)
        signif = np.arange(0, K)

    # Compute PCs.
    decal = np.zeros((Np, M))
    for t in range(0, N - M + 1):
        decal[t, :] = Xr[t:M + t]

    # The columns of this matrix are Ak(t), k=1 to M.
    PC = np.dot(decal, eig_vec)

    # Compute reconstructed timeseries if K > 0.
    if len(signif) > 0:  # BUG FIX: was `if signif:` (ambiguous for arrays).
        RC = np.zeros((N, len(signif)))

        # Off-by-one fixes below: the original 1-based MATLAB loops were
        # ported with 0-based indices, which dropped one element from each
        # slice and divided by zero at t = 0.

        # First M terms.
        for t in range(0, M - 1):
            Av = np.flipud(PC[0:t + 1, signif])
            eig_vec_red = eig_vec[0:t + 1, signif]
            RC[t, :] = 1.0 / (t + 1) * np.sum(Av * eig_vec_red, axis=0)

        # Middle of timeseries.
        for t in range(M - 1, Np):
            Av = np.flipud(PC[t - M + 1:t + 1, signif])
            eig_vec_red = eig_vec[0:M, signif]
            RC[t, :] = 1.0 / M * np.sum(Av * eig_vec_red, axis=0)

        # Last M terms.
        for t in range(Np, N):
            Av = np.flipud(PC[t - M + 1:Np, signif])
            eig_vec_red = eig_vec[t - N + M:M, signif]
            RC[t, :] = 1.0 / (N - t) * np.sum(Av * eig_vec_red, axis=0)

        # Sum and restore the mean and variance.
        RCp = sigma * np.sum(RC, axis=1) + mu
    else:
        RC, RCp = None, None

    return spec, eig_vec, PC, RC, RCp, signif
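# A hedged usage sketch for ssa() on a synthetic series (a sine in AR-like
# noise), assuming the helpers it calls (standardize, xcorr, eigd,
# pca_truncation_criteria, ...) are importable; the frequencies and window
# length here are illustrative:
import numpy as np

rng = np.random.RandomState(4)
t = np.arange(500)
X = np.sin(2 * np.pi * t / 48.0) + 0.5 * rng.randn(500)
spec, eig_vec, PC, RC, RCp, signif = ssa(X, M=50, K=2)
# spec: fraction of variance per mode; RCp: reconstruction from the K modes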
    np.log10(faked_rdiffs))
std_over_shuffles = np.log10(faked_rdiffs).std(axis=0)

# floored 1-tailed p-value of actual std vs faked distr
std_n_more_extreme = np.sum(std_over_shuffles > real_std) + \
    np.sum(~np.isfinite(std_over_shuffles))
mad_n_more_extreme = np.sum(mad_over_shuffles > real_mad)
assert np.sum(~np.isfinite(mad_over_shuffles)) == 0
std_pval = (std_n_more_extreme + 1) / float(faked_rdiffs.shape[1])
mad_pval = (mad_n_more_extreme + 1) / float(faked_rdiffs.shape[1])

# Text summary
sdump.append(region)
sdump.append("MAD, mean %0.3f, distr over shuffles: %s" % (
    mad_over_shuffles.mean(),
    ' '.join(['%0.3f' % v for v in mlab.prctile(
        mad_over_shuffles, (25, 50, 95, 97.5))])))
sdump.append("MAD, nP.E. actual=%0.3f, P.E. actual=%0.3f, pval=%0.6f" % (
    real_npe_mad, real_mad, mad_pval))
sdump.append("STDEV, nanmean %0.3f, distr over shuffles: %s" % (
    np.nanmean(std_over_shuffles),
    ' '.join(['%0.3f' % v for v in mlab.prctile(
        std_over_shuffles, (25, 50, 95, 97.5))])))
sdump.append("STDEV, nP.E. actual=%0.3f, P.E. actual=%0.3f, p=%0.6f" % (
    real_npe_std, real_std, std_pval))
sdump.append('')

# Pretty
axa[0, 0].set_yticks((0, 1, 2, 3, 4))
axa[0, 1].set_yticks((0, 2, 4, 6))
axa[1, 0].set_yticks((0, 1, 2, 3, 4))
axa[1, 1].set_yticks((0, 2, 4, 6))
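# The "+1" in the p-values above is the standard add-one (floored) correction
# for permutation tests: the observed statistic counts as one more "shuffle",
# so the estimated p-value can never be exactly zero. A minimal sketch with
# a stand-in null distribution:
import numpy as np

rng = np.random.RandomState(5)
real_stat = 2.0
shuffled_stats = rng.normal(size=1000)            # stand-in null distribution
n_more_extreme = np.sum(shuffled_stats > real_stat)
pval = (n_more_extreme + 1) / float(len(shuffled_stats))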
def sea(self, **kwargs):
    """Method called to perform superposed epoch analysis on data in object.

    Uses object attributes obj.data, obj.times, obj.epochs, obj.delta,
    obj.window, all of which must be available on instantiation.

    Other Parameters
    ================
    storedata : boolean
        saves matrix of epoch windows as obj.datacube (default = False)
    quartiles : list
        calculates the quartiles as the upper and lower bounds (and is
        default);
    ci : float
        will find the bootstrapped confidence intervals of ci_quan at the
        ci percent level (default=95)
    mad : float
        will use +/- the median absolute deviation for the bounds;
    ci_quan : string
        can be set to 'median' (default) or 'mean'

    Notes
    =====
    A basic plot can be raised with :meth:`plot`
    """
    #check this hasn't already been done
    #TODO: find out why doing two .sea() calls back-to-back fails 2nd time
    if hasattr(self, 'semedian') or hasattr(self, 'semean'):
        return None

    #check defaults
    defaults = {'storedata': True, 'quartiles': True, 'ci': False,
                'mad': False, 'ci_quan': 'median'}
    for default in defaults:
        if default not in kwargs:
            kwargs[default] = defaults[default]

    #ensure all input is np array
    delt = float(self.delta)
    if isinstance(self.data, np.ndarray):
        y = self.data
    else:
        y = np.asarray(self.data, dtype=float)

    if kwargs['ci']:
        kwargs['quartiles'], kwargs['mad'] = False, False
    if kwargs['mad']:
        kwargs['quartiles'], kwargs['ci'] = False, False

    time, t_epoch = self._timeepoch(delt)

    #build SEA matrix and perform analysis
    wind = int(self.window)
    m = int(2 * wind + 1)
    n = len(t_epoch)
    y_sea = np.zeros((n, m), dtype=float)
    blankslice = np.zeros([m], dtype=float)
    for i in range(n):
        dif = np.abs(time - t_epoch[i])
        j = np.where(dif == np.min(dif))
        stpt = j[0][0] - wind
        enpt = j[0][0] + wind + 1
        sea_slice = blankslice.copy()
        if stpt < 0:
            #fix for bad epochs not correctly moved to badepochs attr
            #TODO: make badepochs robust or do all checking here
            sea_slice[0:abs(stpt)] = np.NaN
            sea_slice[abs(stpt):] = y[0:enpt]
        elif enpt >= len(y):
            tmpslice = y[stpt:]
            sea_slice[:len(tmpslice)] = tmpslice
            sea_slice[len(tmpslice):] = np.NaN
        else:
            sea_slice = y[stpt:enpt]
        y_sea[i, 0:] = sea_slice

    #find SEA mean, median and percentiles - exclude NaNs (or badval)
    try:
        badval = kwargs['badval']
    except KeyError:
        badval = np.nan
        y_sea_m = ma.masked_where(np.isnan(y_sea), y_sea)
    else:
        y_sea_m = ma.masked_values(y_sea, badval)
    self.semean = [np.mean(y_sea_m[:, i].compressed()) for i in range(m)]
    self.semedian = [np.median(y_sea_m[:, i].compressed()) for i in range(m)]
    self.semean, self.semedian = np.array(self.semean), np.array(self.semedian)
    self.bound_low = np.zeros((m, 1))
    self.bound_high = np.zeros((m, 1))

    if kwargs['quartiles']:
        from matplotlib.mlab import prctile
        for i in range(m):
            dum = np.sort(y_sea_m[:, i].compressed())
            qul = prctile(dum, p=(25, 75))
            self.bound_low[i], self.bound_high[i] = qul[0], qul[1]
        self.bound_type = 'quartiles'
    elif kwargs['ci']:
        #bootstrapped confidence intervals (95%)
        funcdict = {'mean': np.mean, 'median': np.median}
        try:
            if isinstance(kwargs['ci'], bool):
                raise ValueError  #fall through to default case
            else:
                ci_level = float(kwargs['ci'])
        except ValueError:
            ci_level = 95
        from spacepy.poppy import boots_ci
        if hasattr(kwargs['ci_quan'], "__call__"):  #ci_quan is a function
            ci_func = kwargs['ci_quan']
        else:
            ci_func = funcdict[kwargs['ci_quan']]
        for i in range(m):
            dum = np.sort(y_sea_m[:, i].compressed())
            self.bound_low[i], self.bound_high[i] = \
                boots_ci(dum, 800, ci_level, ci_func)
        self.bound_type = 'ci'
    elif kwargs['mad']:
        #median absolute deviation
        for i in range(m):
            dum = np.sort(y_sea_m[:, i].compressed())
            spread_mad = tb.medAbsDev(dum)
            self.bound_low[i] = self.semedian[i] - spread_mad
            self.bound_high[i] = self.semedian[i] + spread_mad
        self.bound_type = 'mad'

    self.x = np.linspace(-1. * self.window * self.delta,
                         self.window * self.delta, len(self.semedian))
    if kwargs['storedata']:
        self.datacube = y_sea_m
        if self.verbose:
            print('sea(): datacube added as new attribute')

    if self.verbose:
        print('Superposed epoch analysis complete')
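# The quartile branch above is the simplest of the three bound types:
# per-time-bin percentiles of the epoch matrix. A minimal stand-alone sketch
# with a synthetic epoch matrix (np.percentile replaces the removed
# mlab.prctile; axis=0 takes percentiles within each time bin):
import numpy as np

y_sea = np.random.RandomState(6).randn(100, 21)  # stand-in epoch matrix
bound_low, bound_high = np.percentile(y_sea, (25, 75), axis=0)
semedian = np.median(y_sea, axis=0)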