# Assumed imports for this snippet: `mstats` is scipy.stats.mstats and
# `mpl` is matplotlib.
import matplotlib as mpl
import matplotlib.ticker  # makes mpl.ticker available
from scipy.stats import mstats


def _nice_intervals(data, nlevs):
    '''
    Purpose::
        Calculates nice intervals between each color level for colorbars
        and contour plots. The target minimum and maximum color levels
        are calculated by taking the minimum and maximum of the
        distribution after cutting off the tails to remove outliers.

    Input::
        data - an array of data to be plotted
        nlevs - an int giving the target number of intervals

    Output::
        clevs - A list of floats for the resultant colorbar levels
    '''
    # Find the min and max levels by cutting off the tails of the
    # distribution. This mitigates the influence of outliers.
    data = data.ravel()
    mnlvl = mstats.scoreatpercentile(data, 5)
    mxlvl = mstats.scoreatpercentile(data, 95)
    locator = mpl.ticker.MaxNLocator(nlevs)
    clevs = locator.tick_values(mnlvl, mxlvl)

    # Make sure the bounds of clevs are reasonable since sometimes
    # MaxNLocator gives values outside the domain of the input data
    clevs = clevs[(clevs >= mnlvl) & (clevs <= mxlvl)]
    return clevs

import scipy.stats as st  # assumed binding for `st` in this snippet


def nbins(sample, range_=None):
    # Freedman-Diaconis rule: bin width = 2 * IQR / n**(1/3)
    IQR = lambda x: st.scoreatpercentile(x, 75.0) - st.scoreatpercentile(x, 25.0)
    if range_ is None:
        mn, mx = sample.min(), sample.max()
    else:
        mn, mx = range_
    mask = (sample >= mn) & (sample <= mx)
    binsize = 2 * IQR(sample[mask]) / mask.sum() ** (1. / 3)
    return (mx - mn) / binsize, mn, mx, binsize

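# Usage sketch for nbins above (not from the original source): the bin
# count comes back as a float, so round it up before handing it to a
# histogram routine.
import numpy as np

np.random.seed(0)
sample = np.random.normal(size=1000)
n, mn, mx, binsize = nbins(sample)
counts, edges = np.histogram(sample, bins=int(np.ceil(n)), range=(mn, mx))
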
def binner(x, y, w_sta, nbins, rang=None, ebar=False, per=None):
    from numpy import array, digitize, lexsort, linspace
    from numpy.ma import average, median
    # Assumed source of these names; the original snippet used `mode` and
    # `scoreatpercentile` without importing them.
    from scipy.stats import mode, scoreatpercentile

    # Sort the points by x (ties broken by y)
    ind = lexsort((y, x))
    xs, ys = x[ind], y[ind]

    if rang is None:
        mn, mx = min(xs), max(xs)
    else:
        mn, mx = rang

    # Bin centres are computed from nbins + 1 edges; the digitize call
    # below uses nbins equally spaced edges, as in the original snippet
    bins = linspace(mn, mx, nbins + 1)
    x_cen = (bins[:-1] + bins[1:]) * 0.5
    bins = linspace(mn, mx, nbins)
    ibins = digitize(xs, bins)

    # Per-bin summary statistic
    if w_sta == "median":
        y_sta = array([median(ys[ibins == i]) for i in range(1, bins.size + 1)])
    elif w_sta == "mean":
        y_sta = array([average(ys[ibins == i]) for i in range(1, bins.size + 1)])
    elif w_sta == "mode":
        y_sta = array([mode(ys[ibins == i])[0] for i in range(1, bins.size + 1)])

    if not ebar:
        return x_cen, y_sta

    # Asymmetric error bars from percentiles: default to the 15.8th/84.0th
    # percentiles (roughly +/- 1 sigma), or use the pair given in `per`
    p_lo, p_hi = (15.8, 84.0) if per is None else per
    myer = abs(array([scoreatpercentile(ys[ibins == i], p_lo)
                      for i in range(1, bins.size + 1)]) - y_sta)
    pyer = abs(array([scoreatpercentile(ys[ibins == i], p_hi)
                      for i in range(1, bins.size + 1)]) - y_sta)
    yer = array([myer, pyer])
    return x_cen, y_sta, yer

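# Hypothetical call of binner above (not from the original source),
# assuming x and y are equal-length 1-D NumPy arrays. With ebar=True and
# per=None the error bars use the 15.8th/84.0th percentiles coded above,
# i.e. roughly +/- 1 sigma for a normal distribution.
import numpy as np

np.random.seed(42)
x = np.random.uniform(0, 10, 500)
y = 2.0 * x + np.random.normal(scale=1.0, size=500)
x_cen, y_med, yerr = binner(x, y, "median", 10, ebar=True)
# e.g. plt.errorbar(x_cen, y_med, yerr=yerr, fmt="o")
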
def clean_outliers(self):
    """
    Function to remove outliers.

    Parameters
    ----------
    self.outlier_perc : integer
        Percentile value for mstats.scoreatpercentile function.
        Mask all values greater than this value.
    """
    # Per-row [min, max] percentile bounds - rows_N * [min, max];
    # `ma` is numpy.ma and `mstats` is scipy.stats.mstats (assumed)
    outlier_all = ma.array(
        [[mstats.scoreatpercentile(self.xs[i, :], 100 - self.outlier_perc),
          mstats.scoreatpercentile(self.xs[i, :], self.outlier_perc)]
         for i in xrange(self.rows_N)])
    # Mask values outside the bounds, leaving the last keep_n_values
    # of each row untouched
    self.xs = ma.array(
        [ma.hstack((ma.masked_outside(self.xs[i, :-self.keep_n_values],
                                      outlier_all[i, 0],
                                      outlier_all[i, 1]),
                    self.xs[i, -self.keep_n_values:]))
         for i in xrange(self.rows_N)])

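# Standalone illustration of the masking step inside clean_outliers
# (made-up values, not from the original class). With outlier_perc = 95,
# values outside the 5th-95th percentile band of a row are masked.
import numpy as np
import numpy.ma as ma
from scipy.stats import mstats

outlier_perc = 95
row = ma.array(np.r_[np.arange(1., 20.), 100.])  # 100. is an outlier
lo = mstats.scoreatpercentile(row, 100 - outlier_perc)
hi = mstats.scoreatpercentile(row, outlier_perc)
clipped = ma.masked_outside(row, lo, hi)  # extreme values in both tails are masked
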
def test_2D(self):
    x = ma.array([[1, 1, 1],
                  [1, 1, 1],
                  [4, 4, 3],
                  [1, 1, 1],
                  [1, 1, 1]])
    assert_equal(mstats.scoreatpercentile(x, 50), [1, 1, 1])

def _nice_intervals(data, nlevs):
    '''
    Purpose::
        Calculates nice intervals between each color level for colorbars
        and contour plots. The target minimum and maximum color levels
        are calculated by taking the minimum and maximum of the
        distribution after cutting off the tails to remove outliers.

    Input::
        data - an array of data to be plotted
        nlevs - an int giving the target number of intervals

    Output::
        clevs - A list of floats for the resultant colorbar levels
    '''
    # Find the min and max levels by cutting off the tails of the
    # distribution. This mitigates the influence of outliers.
    data = data.ravel()
    mn = mstats.scoreatpercentile(data, 5)
    mx = mstats.scoreatpercentile(data, 95)

    # If min is less than 0 and max is more than 0, put 0 in the center
    # of the colorbar
    if mn < 0 and mx > 0:
        level = max(abs(mn), abs(mx))
        mnlvl = -1 * level
        mxlvl = level
    # Otherwise span the colorbar between min and max
    else:
        mnlvl = mn
        mxlvl = mx

    # Hack to make the intervals generated by mpl the same for all versions
    autolimit_mode = mpl.rcParams.get('axes.autolimit_mode')
    if autolimit_mode:
        mpl.rc('axes', autolimit_mode='round_numbers')

    locator = mpl.ticker.MaxNLocator(nlevs)
    clevs = locator.tick_values(mnlvl, mxlvl)
    if autolimit_mode:
        mpl.rc('axes', autolimit_mode=autolimit_mode)

    # Make sure the bounds of clevs are reasonable since sometimes
    # MaxNLocator gives values outside the domain of the input data
    clevs = clevs[(clevs >= mnlvl) & (clevs <= mxlvl)]
    return clevs

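# Usage sketch for _nice_intervals (not from the original source); the
# returned levels can be passed straight to contourf. Assumes the same
# bindings noted above (mstats from scipy.stats, matplotlib as mpl).
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(1)
data = np.random.normal(loc=2.0, scale=1.0, size=(50, 50))
clevs = _nice_intervals(data, 10)
cs = plt.contourf(data, levels=clevs)
plt.colorbar(cs)
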
def test_percentile(self):
    x = np.arange(8) * 0.5
    assert_equal(mstats.scoreatpercentile(x, 0), 0.)
    assert_equal(mstats.scoreatpercentile(x, 100), 3.5)
    assert_equal(mstats.scoreatpercentile(x, 50), 1.75)

# solution attempt 2 - heatmap
plt.clf()
gridsize = 20
plt.hexbin(xlist, ylist, gridsize=gridsize, cmap=cm.jet, bins=None)
cb = plt.colorbar()
cb.set_label('frequency')
xlabel('percentile by variation in first five plays')
ylabel('percentile by average in second five plays')
print "r = %.3f, p = %.5f" % pearsonr(xlist, ylist)
savefig('explore_exploit_scatterheatmap.png', dpi=300, facecolor='w',
        edgecolor='w', orientation='portrait', papertype=None, format=None,
        transparent=False, bbox_inches='tight', pad_inches=0.1)

# now do CI for r value
bootrec = pickle.load(open('save_a5_boot_bootrec.p', 'rb'))
bootrec = bootrec[0]
ci_upper = ssm.scoreatpercentile(bootrec, 97.5)
ci_lower = ssm.scoreatpercentile(bootrec, 2.5)
ci_mean = np.mean(bootrec)
print "Bootstrapped confidence intervals were Upper = %0.3f, Lower = %0.3f" % (
    ci_upper, ci_lower)

pickle.dump(a, open('save_a4_1_a.p', 'wb'))

# --------------------------------------------
# calc dict of maximum score for each player (= each key)
maxscore = {}
for key in big:
    maxscore[key] = max([big[key][attempt][0] for attempt in big[key]])

# sort maximum scores, smallest to biggest
ranked_maxscore = sorted(maxscore[key] for key in maxscore)

# calc percentile ranking for each player (= each key)
prcentiles = []
for p in range(100):
    prcentiles.append(ssm.scoreatpercentile(ranked_maxscore, p))

# assign each player the index of the highest percentile value
# their max score exceeds
decile = {}
for key in big:
    for i in prcentiles:
        if maxscore[key] > i:
            decile[key] = prcentiles.index(float(i))

# ------------------------------------------------
# now calculate some index of spread
# - the simplest one is range
timespread = {}

for attempt in second_plays:
    second.append(sample_wr(bootdata[attempt], 1))
av2[key] = sp.mean(second)
var2[key] = sp.var(second)

# make list of summary stats
x = []
y = []
for key in big:
    x.append(var1[key])
    y.append(av2[key])

# find percentile values
prcentiles_x = []
for p in range(100):
    prcentiles_x.append(ssm.scoreatpercentile(x, p))
prcentiles_y = []
for p in range(100):
    prcentiles_y.append(ssm.scoreatpercentile(y, p))

# make dict of percentile values for each statistic for each player
prcentile_xindex = {key: bisect.bisect(prcentiles_x, var1[key]) for key in big}
prcentile_yindex = {key: bisect.bisect(prcentiles_y, av2[key]) for key in big}

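# Toy version of the rank-by-percentile pattern above (not from the
# original source): bisect returns how many precomputed percentile values
# lie below a given score, i.e. its percentile rank. `ssm` is assumed to
# be scipy.stats.mstats.
import bisect
import numpy as np
import scipy.stats.mstats as ssm

values = np.arange(200)
prc = [ssm.scoreatpercentile(values, p) for p in range(100)]
rank = bisect.bisect(prc, values[150])  # ~75th percentile
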
# solution attempt 2 - heatmap
plt.clf()
gridsize = 20
plt.hexbin(xlist, ylist, gridsize=gridsize, cmap=cm.jet, bins=None)
cb = plt.colorbar()
cb.set_label('frequency')
xlabel('percentile by variation in first five plays')
ylabel('percentile by average in second five plays')
print "r = %.3f, p = %.5f" % pearsonr(xlist, ylist)
savefig('Figure6.png', dpi=300, facecolor='w', edgecolor='w',
        orientation='portrait', papertype=None, format=None,
        transparent=False, bbox_inches='tight', pad_inches=0.1)

generatepaperfigs = 0
if generatepaperfigs:
    savefig('../cogsci13/figures/a5_e-e_heatscatter.png', dpi=300,
            facecolor='w', edgecolor='w', orientation='portrait',
            papertype=None, format=None, transparent=False,
            bbox_inches='tight', pad_inches=0.1)

# now do CI for r value
bootrec = pickle.load(open('save_a5_boot_bootrec.p', 'rb'))
bootrec = bootrec[0]
ci_upper = ssm.scoreatpercentile(bootrec, 97.5)
ci_lower = ssm.scoreatpercentile(bootrec, 2.5)
ci_mean = np.mean(bootrec)
print "Bootstrapped confidence intervals were Upper = %0.3f, Lower = %0.3f" % (
    ci_upper, ci_lower)

def drawGraphs(outFolder, bootName, windowSizes1, windowSizes2,
               zBottom=-1, zTop=1):
    import matplotlib
    matplotlib.use('PDF')
    from matplotlib import pyplot as plt
    from mpl_toolkits.mplot3d import Axes3D

    print 'Drawing bootstrap graphs for: {}'.format(bootName)
    bootrec = pickle.load(open('{}/{}/bootrec.p'.format(outFolder, bootName),
                               'rb'))
    plt.close('all')

    Z_obs = np.zeros((len(windowSizes1), len(windowSizes2)))
    Z_lower = np.zeros((len(windowSizes1), len(windowSizes2)))
    Z_boot = np.zeros((len(windowSizes1), len(windowSizes2)))
    Z_upper = np.zeros((len(windowSizes1), len(windowSizes2)))
    Z_std = np.zeros((len(windowSizes1), len(windowSizes2)))

    currentFolder = outFolder + '/' + bootName
    for i1 in xrange(len(windowSizes1)):
        for i2 in xrange(len(windowSizes2)):
            groupn_i = windowSizes1[i1]
            groupn_j = windowSizes2[i2]
            curbootrec = bootrec[0, i1, i2]
            print
            print "Analyzing %i - %i" % (groupn_i, groupn_j)
            xlist = pickle.load(open(currentFolder + '/save_a5_xlist' +
                                     str(groupn_i) + "," + str(groupn_j) +
                                     '.p', 'rb'))
            ylist = pickle.load(open(currentFolder + '/save_a5_ylist' +
                                     str(groupn_i) + "," + str(groupn_j) +
                                     '.p', 'rb'))
            a, b = pearsonr(xlist, ylist)

            # now do CI for r value
            ci_upper = ssm.scoreatpercentile(curbootrec, 97.5)
            ci_lower = ssm.scoreatpercentile(curbootrec, 2.5)
            ci_mean = np.mean(curbootrec)
            ci_std = np.var(curbootrec)  # note: np.var returns the variance,
                                         # not the standard deviation
            print scipy.stats.norm(ci_mean, ci_std).cdf(abs(a))
            print "r = %.3f, p = %.5f, %s of confidence interval" % (
                a, b, 'outside' if a > ci_upper or a < ci_lower else 'inside')
            print "Bootstrapped confidence intervals were Upper = %0.3f, Lower = %0.3f" % (ci_upper, ci_lower)

            Z_obs[i1][i2] = a
            Z_upper[i1][i2] = ci_upper
            Z_lower[i1][i2] = ci_lower
            Z_boot[i1][i2] = ci_mean
            Z_std[i1][i2] = ci_std

    X = [[k for j in windowSizes2] for k in windowSizes1]
    Y = [[j for j in windowSizes2] for k in windowSizes1]
    # One-sided Z value to p value
    # (note: both comprehension bounds iterate over len(windowSizes1))
    Z_p = [[st.norm.sf((Z_obs[i][j] - Z_boot[i][j]) / Z_std[i][j])
            for j in range(len(windowSizes1))]
           for i in range(len(windowSizes1))]

    fig1 = plt.figure()
    fontsize = 16
    ax = fig1.add_subplot(111, projection='3d')
    ax.plot_surface(X, Y, Z_obs, rstride=1, cstride=1)
    fig1.suptitle('Observed Correlations', fontsize=20)
    ax.set_xlabel('Size window 1', fontsize=fontsize)
    ax.set_ylabel('Size window 2', fontsize=fontsize)
    ax.set_zlabel('r', fontsize=fontsize)
    ax.set_zlim(bottom=zBottom, top=zTop)
    plt.savefig('{}/figures/{}_corObs.pdf'.format(outFolder, bootName),
                bbox_inches='tight')

    fig2 = plt.figure()
    ax = fig2.add_subplot(111, projection='3d')
    ax.plot_surface(X, Y, Z_boot, rstride=1, cstride=1)
    fig2.suptitle('Bootstrap Average Correlations', fontsize=20)
    ax.set_xlabel('Size window 1', fontsize=fontsize)
    ax.set_ylabel('Size window 2', fontsize=fontsize)
    ax.set_zlabel('r', fontsize=fontsize)
    ax.set_zlim(bottom=zBottom, top=zTop)
    plt.savefig('{}/figures/{}_corBoot.pdf'.format(outFolder, bootName),
                bbox_inches='tight')

res = [rt.err(table[:, i], table[:, i + 1]) if i == 0 else
       rt.err(table[:, i], table[:, i + 1], False)
       for i in xrange(0, 10, 2)]
lab = ["mass residuals", "mass weighted age residuals",
       "flux weighted age residuals", "metallicity residuals",
       "dust extinction residuals"]
for j in xrange(len(res)):
    fig, axs = plt.subplots(5, 13, sharex=True, sharey=True, figsize=(20, 15))
    plt.xlim(-1.5, +1.5)
    plt.ylim(0, 40)
    axs = np.ravel(axs)
    for i in xrange(65):
        data = res[j][i * 100:(i + 1) * 100]
        median = np.median(data)
        p16 = st.scoreatpercentile(data, 16.0)
        p84 = st.scoreatpercentile(data, 84.0)
        counts, bins, patches = axs[i].hist(data, 30, histtype="step",
                                            hatch="///", lw=1,
                                            color="#1A1A1A",
                                            range=(-1.5, +1.5))
        axs[i].axvline(median, ls="--", lw=1.5, color="#000080")
        axs[i].axvline(p16, ls="-.", lw=1.5, color="#000080")
        axs[i].axvline(p84, ls="-.", lw=1.5, color="#000080")
    axs[-1].set_xticks([-1., 0, +1.])
    axs[52].set_yticks(list(axs[i].get_yticks()[1:-1]))
    axs[58].set_xlabel(lab[j], fontsize=16)
    axs[26].set_ylabel("counts", fontsize=16)
    plt.tight_layout()
    plt.subplots_adjust(wspace=0.01, hspace=0.01, bottom=0.06)

big = {k: data[k] for k in data if len(data[k]) > 9}  # pythonic

# --------------------------------------------
# calc dict of maximum score for each player (= each key)
maxscore = {}
for key in big:
    maxscore[key] = max([big[key][attempt][0] for attempt in big[key]])

# sort maximum scores, smallest to biggest
ranked_maxscore = sorted(maxscore[key] for key in maxscore)

# calc percentile ranking for each player (= each key)
prcentiles = []
for p in range(100):
    prcentiles.append(ssm.scoreatpercentile(ranked_maxscore, p))

# decile={}
#
# for key in big:
#     for i in prcentiles:
#         if maxscore[key]>i:
#             decile[key]=prcentiles.index(float(i))

# so now we know how good each player is
# now let's calc variance
av1 = {}
var1 = {}
av2 = {}

execfile("fig4_boot.py")
# this can take a long time (e.g. 24 hours) if you use many (e.g. 2000)
# resamples

# load
# observed data
plot_timespread = pickle.load(open('save_plot_timespread.p', 'rb'))
# bootstrap data
bootdata = pickle.load(open('save_a4_2boot_bootdata.p', 'rb'))

# find CIs, using ssm
ci_upper = np.zeros((1, 100))
ci_lower = np.zeros((1, 100))
m_boot = np.zeros((1, 100))
for i in range(100):
    ci_upper[0, i] = ssm.scoreatpercentile(bootdata[i, :], 97.5)
    ci_lower[0, i] = ssm.scoreatpercentile(bootdata[i, :], 2.5)
    m_boot[0, i] = np.mean(bootdata[i, :])

print "PLOTTING"
# plot -------------------------------------------
# thank you tomas http://www.staff.ncl.ac.uk/tom.holderness/software/pythonlinearfit
plt.clf()
# plot sample data
plot(plot_timespread, 'ro', label='Sample observations')
# plot line of best fit
plot(m_boot[(0,)], 'b-', label='bootstrap_mean')

# load
# observed data
plot_timespread = pickle.load(open('save_plot_timespread.p', 'rb'))
# bootstrap data
bootdata = pickle.load(open('save_a4_2boot_bootdata.p', 'rb'))

print "finding CIs"
# find CIs, using ssm
ci_upper = np.zeros((1, 100))
ci_lower = np.zeros((1, 100))
m_boot = np.zeros((1, 100))
for i in range(100):
    ci_upper[0, i] = ssm.scoreatpercentile(bootdata[i, :], 97.5)
    ci_lower[0, i] = ssm.scoreatpercentile(bootdata[i, :], 2.5)
    m_boot[0, i] = np.mean(bootdata[i, :])

print "running t-test"
# reshape to the same shape
expt = np.reshape(m_boot[(0,)], 100)     # expected values (from bootstrap, i.e. under H0)
obsv = np.reshape(plot_timespread, 100)  # observed values

# recode so that a positive difference supports the theory (i.e. that
# spacing helps):
# for the bottom 50% this means their observed is lower than expected
# for the top 50% this means their observed is higher than expected
diffs = np.concatenate([expt[0:50] - obsv[0:50], obsv[50:100] - expt[50:100]])

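# Minimal percentile-bootstrap CI in the style of the snippets above (not
# from the original source; `ssm` assumed to be scipy.stats.mstats).
import numpy as np
import scipy.stats.mstats as ssm

np.random.seed(0)
sample = np.random.normal(loc=5.0, scale=2.0, size=200)
boot_means = np.array([np.random.choice(sample, size=sample.size,
                                        replace=True).mean()
                       for _ in range(2000)])
ci_lower = ssm.scoreatpercentile(boot_means, 2.5)
ci_upper = ssm.scoreatpercentile(boot_means, 97.5)
# the sample mean should usually fall between ci_lower and ci_upper
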
"flux weighted age residuals", "metallicity residuals", "dust extinction residuals" ] for j in xrange(len(res)): fig, axs = plt.subplots(5, 13, sharex=True, sharey=True, figsize=(20, 15)) plt.xlim(-1.5, +1.5) plt.ylim(0, 40) axs = np.ravel(axs) for i in xrange(65): data = res[j][i * 100:(i + 1) * 100] median = np.median(data) p16 = st.scoreatpercentile(data, 16.0) p84 = st.scoreatpercentile(data, 84.0) counts, bins, patches = axs[i].hist(data, 30, histtype="step", hatch="///", lw=1, color="#1A1A1A", range=(-1.5, +1.5)) axs[i].axvline(median, ls="--", lw=1.5, color="#000080") axs[i].axvline(p16, ls="-.", lw=1.5, color="#000080") axs[i].axvline(p84, ls="-.", lw=1.5, color="#000080") axs[-1].set_xticks([-1., 0, +1.])
boot_n = 2000  # define how many resamples the bootstrap uses
bootdata = np.zeros((100, boot_n))

print "Starting bootstrap calculations"
for n in range(boot_n):
    print "iteration " + str(n) + " of " + str(boot_n)

    # find maxscores, when actual scores are a sample [attempts] long of a
    # maxscore_boot={key: max(random.sample(a,len(big[key]))) for key in big}
    maxscore_boot = {key: max(sample_wr(a, len(big[key]))) for key in big}

    # sort maximum scores, smallest to biggest, put in list
    ranked_maxscore_boot = sorted(maxscore_boot[key] for key in maxscore_boot)

    # calculate percentiles on these bootstrapped maximum scores
    prcentiles_boot = [ssm.scoreatpercentile(ranked_maxscore_boot, p)
                       for p in range(100)]

    # assign percentile to key in decile_boot
    decile_boot = {key: bisect.bisect(prcentiles_boot, maxscore_boot[key])
                   for key in big}

    # now calculate timespread to score percentile, using these
    # bootstrapped maximum scores
    spreads_b = np.zeros((100, 1))  # holding var for the time
    counts_b = np.zeros((100, 1))   # holding var for the number of players' data

    # sort timespread into holding variables according to decile value
    for key in decile_boot:
        spreads_b[decile_boot[key] - 1] += timespread[key]
        counts_b[decile_boot[key] - 1] += 1
    t = spreads_b / counts_b  # find average
