def plot_bourgdata(N1, N2):
    A = TRICLAIRModele()
    Tb15 = A.get_data_triathlon(link='/triathlon-bourg-resultats-1996.htm', year=2015)
    Tb14 = A.get_data_triathlon(link='/triathlon-bourg-resultats-1715.htm', year=2014)
    S15_ = map(lambda x: x.total_seconds()/60, Tb15['Scratch'].dropna())
    S14_ = map(lambda x: x.total_seconds()/60, Tb14['Scratch'].dropna())
    S15 = S15_[N1:N2]
    S14 = S14_[N1:N2]
    (mu14, sigma14) = norm.fit(S14)
    (mu15, sigma15) = norm.fit(S15)
    N_BINS = 50
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    n, bins, patches = ax.hist(S14, N_BINS, normed=1, facecolor='red', alpha=0.5,
                               label=r'$\mathrm{2014:}\ \mu=%.3f,\ \sigma=%.3f$' % (mu14, sigma14))
    y = mlab.normpdf(bins, mu14, sigma14)
    l = ax.plot(bins, y, 'r-', linewidth=4)
    n, bins, patches = ax.hist(S15, N_BINS, normed=1, facecolor='green', alpha=0.5,
                               label=r'$\mathrm{ 2015:}\ \mu=%.3f,\ \sigma=%.3f$' % (mu15, sigma15))
    y = mlab.normpdf(bins, mu15, sigma15)
    l = ax.plot(bins, y, 'g-', linewidth=4)
    fig.tight_layout()
    ax.set_xlabel('Scratch Time (minutes)')
    ax.set_ylabel('Number of athletes per scratch time (normalized)')
    ax.legend(loc='best', fancybox=True, framealpha=0.5)
    ax.set_title(r'$\mathrm{Athletes\ from\ rank\ } %d \mathrm{\ to\ } %d$' % (N1, N2))
    plt.show()
def get_sent_similarity(user_data):
    scores = []
    #=====[ Creates counts for each sentiment score in 21 buckets of width 0.1 from -1 to 1 ]=====
    for data in user_data:
        user_score = [0]*21
        for tweet in data:
            score = int(float("%.1f" % tweet['score'])*10 + 10)
            user_score[score] += 1
        scores.append(user_score)
    #=====[ Forms normalized probability distributions for each user's sentiments ]=====
    x = np.linspace(-1, 1, 100)
    mu, std = norm.fit(scores[0])
    p = norm.pdf(x, mu, std)
    mu, std = norm.fit(scores[1])
    p2 = norm.pdf(x, mu, std)
    #=====[ Takes Kullback-Leibler Divergence between probability distributions ]=====
    similarity = float("%.5f" % scipy.stats.entropy(p, p2))
    #=====[ Converts similarity score to a percentage from 10 - 90 to display on compatibility spectrum ]=====
    if similarity < 0.003:
        return 90
    elif similarity > 0.07:
        return 10
    else:
        return int(10 + ((similarity*100)-1)/6.7*80)
    return int(similarity)  # unreachable: every branch above already returns
def compare_dlospeak_fit_test_to_normfit():
    """ Simple sanity test comparing the dlospeak_fit function to
    scipy.stats.norm.fit. The sigma values are not expected to agree, because
    sigma is not the standard deviation of the dLOS peak values: the dLOS peak
    is not Gaussian and has heavier tails.

    Also tests that normalizing the dLOS distribution does NOT change the
    sigma or fpeak fit values. MPfit does a good job.
    """
    cat_corr = {
        'catalog': {'name': 'nseries', 'n_mock': 1},
        'correction': {'name': 'upweight'}
    }
    dlosclass = Dlos(cat_corr)
    dlosclass.read()

    print dlospeak_fit_test(dlosclass.dlos, fit='gauss', peak_range=[-15.0, 15.0])
    print dlospeak_fit_test(dlosclass.dlos, fit='gauss', peak_range=[-15.0, 15.0], normed=True)

    inpeak = np.where((dlosclass.dlos < 15.0) & (dlosclass.dlos > -15.0))
    print norm.fit(dlosclass.dlos[inpeak])
    print np.std(dlosclass.dlos[inpeak])

    return None
def plotter( fdict ): """ Go """ pgconn = psycopg2.connect(database='coop', host='iemdb', user='******') cursor = pgconn.cursor(cursor_factory=psycopg2.extras.DictCursor) month = int(fdict.get('month', 10)) day = int(fdict.get('day', 7)) state = fdict.get('state', 'IA') table = "alldata_%s" % (state,) cursor.execute(""" SELECT high, low from """+table+""" where sday = %s and high is not null and low is not null """, ("%02i%02i" % (month, day),)) highs = [] lows = [] for row in cursor: highs.append(row[0]) lows.append(row[1]) highs = np.array(highs) lows = np.array(lows) (fig, ax) = plt.subplots(1,1) ax.hist(highs, bins=(np.max(highs)-np.min(highs)), histtype='step', normed=True, color='r', zorder=1) mu, std = norm.fit(highs) xmin, xmax = plt.xlim() x = np.linspace(xmin, xmax, 100) p = norm.pdf(x, mu, std) ax.plot(x, p, 'r--', linewidth=2) ax.text(0.99, 0.99, "High Temp\n$\mu$ = %.1f$^\circ$F\n$\sigma$ = %.2f" % ( mu, std), va='top', ha='right', color='r', transform=ax.transAxes) ax.hist(lows, bins=(np.max(highs)-np.min(highs)), histtype='step', normed=True, color='b', zorder=1) mu, std = norm.fit(lows) xmin, xmax = plt.xlim() x = np.linspace(xmin, xmax, 100) p = norm.pdf(x, mu, std) ax.plot(x, p, 'b--', linewidth=2) ts = datetime.datetime(2000, month, day) ax.set_title("%s %s Temperature Distribution" % (STATES[state], ts.strftime("%d %B"))) ax.text(0.01, 0.99, "Low Temp\n$\mu$ = %.1f$^\circ$F\n$\sigma$ = %.2f" % ( mu, std), va='top', ha='left', color='b', transform=ax.transAxes) ax.grid(True) ax.set_xlabel("Temperature $^\circ$F") ax.set_ylabel("Probability") return fig
def getVelStat(self):
    """
    This method calculates the data's ensemble mean and standard deviation
    values and assigns them to new attributes of the instance vec.
    """
    u, v = self.u.flatten(), self.v.flatten()
    self.Umean, self.Ustd = norm.fit(u)
    self.Vmean, self.Vstd = norm.fit(v)
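# Added note (not part of the original snippet): norm.fit performs a plain
# maximum-likelihood fit, so for a 1-D sample it returns the sample mean and
# the population (ddof=0) standard deviation. Minimal self-contained check:
import numpy as np
from scipy.stats import norm

u = np.random.default_rng(0).normal(loc=2.0, scale=0.5, size=1000)
mu, sigma = norm.fit(u)
assert np.isclose(mu, u.mean())    # loc estimate equals the sample mean
assert np.isclose(sigma, u.std())  # scale estimate equals std with ddof=0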
def create_scatter_hist(trans_data,sigma): nullfmt = NullFormatter() # no labels left, width = 0.1, 0.65 bottom, height = 0.1, 0.65 bottom_h = left_h = left+width+0.02 rect_scatter = [left, bottom, width, height] rect_histx = [left, bottom_h, width, 0.2] rect_histy = [left_h, bottom, 0.2, height] fig = plt.figure(1,figsize=(8,8)) axScatter = fig.add_subplot(223, position=rect_scatter) plt.xlabel(r'$log_{10}(\eta_{\nu})$', fontsize=16) plt.ylabel(r'$log_{10}(V_{\nu})$', fontsize=16) axHistx=fig.add_subplot(221, position=rect_histx) axHisty=fig.add_subplot(224, position=rect_histy) axHistx.xaxis.set_major_formatter(nullfmt) axHisty.yaxis.set_major_formatter(nullfmt) col=['r','b','g','y'] for i in range(len(frequencies)): xdata=[np.log10(trans_data[n][1]) for n in range(len(trans_data)) if trans_data[n][6]==frequencies[i] if trans_data[n][1] > 0 if trans_data[n][3] > 0] ydata=[np.log10(trans_data[n][3]) for n in range(len(trans_data)) if trans_data[n][6]==frequencies[i] if trans_data[n][1] > 0 if trans_data[n][3] > 0] axScatter.scatter(xdata, ydata,color=col[i], s=5.) axScatter.legend(frequencies,loc=4) x=[np.log10(trans_data[n][1]) for n in range(len(trans_data)) if trans_data[n][1] > 0 if trans_data[n][3] > 0] y=[np.log10(trans_data[n][3]) for n in range(len(trans_data)) if trans_data[n][1] > 0 if trans_data[n][3] > 0] bins = 50 param=norm.fit(x) range_x=np.linspace(min(x),max(x),1000) fit=norm.pdf(range_x,loc=param[0],scale=param[1]) sigcutx = param[1]*sigma+param[0] axHistx.axvline(x=sigcutx, linewidth=2, color='k', linestyle='--') axHistx.plot(range_x,fit, 'k:', linewidth=2) param2=norm.fit(y) range_y=np.linspace(min(y),max(y),1000) fit2=norm.pdf(range_y,loc=param2[0],scale=param2[1]) sigcuty = param2[1]*sigma+param2[0] axHisty.axhline(y=sigcuty, linewidth=2, color='k', linestyle='--') axScatter.axhline(y=sigcuty, linewidth=2, color='k', linestyle='--') axScatter.axvline(x=sigcutx, linewidth=2, color='k', linestyle='--') axHisty.plot(fit2, range_y, 'k:', linewidth=2) axHistx.hist(x, bins=bins, normed=1, histtype='stepfilled', color='b') axHisty.hist(y, bins=bins, normed=1, histtype='stepfilled', orientation='horizontal', color='b') axHistx.set_xlim( axScatter.get_xlim() ) axHisty.set_ylim( axScatter.get_ylim() ) # xvals=[-3., -2., -1., 0., 1., 2., 3.] # xtxts=[str(10.**a) for a in xvals] # yvals=[-2., -1., 0.] # ytxts=[str(10.**a) for a in xvals] # axScatter.set_xticks(xvals) # axScatter.set_xticklabels(xtxts) # axScatter.set_yticks(yvals) # axScatter.set_yticklabels(ytxts) plt.savefig('scatter_hist.png') plt.close() return
def fit(self, X, y, sample_weight=None):
    # the new coordinate system based on the training X
    self.dm = DiffusionMap(X, self.eps_par)
    mindist = self.dm.onePercentDistances()
    numpy.log(mindist, mindist)
    mu, std = norm.fit(mindist)
    wok = numpy.abs(mindist - mu)/std < 3
    mu, std = norm.fit(mindist[wok])
    self.dm.par = (numpy.exp(mu + self.eps_par*std))**2
    self.dm.make_map()
    return self.estimator.fit(self.dm.dmap.X, y, sample_weight=sample_weight)
def sentiment_dist():
    data, dif = loadFeatures(), []
    pos = array(data['pos'], float)
    neg = array(data['neg'], float)
    pmean, pstd = norm.fit(pos)
    nmean, nstd = norm.fit(neg)
    print 'positive: mean, std: ', pmean, pstd
    print 'negative: mean, std: ', nmean, nstd
    for i in range(len(pos)):
        dif.append(pos[i] - neg[i])
    dmean, dstd = norm.fit(dif)
    print 'delta: mean, std: ', dmean, dstd
def fit(self, x, y): del(self.catastrophe) del(self.max_distance) del(self.mask_scale) del(self.dm) self.catastrophe = numpy.logical_or(y[:,0] < self.zmin, numpy.logical_or(y[:,0] > self.zmax, y[:,1] < self.oiimin)) if False: #os.path.isfile('estimator.pkl'): #print 'get pickle' pklfile=open('estimator.pkl','r') self.dm=pickle.load(pklfile) else: # the new coordinate system based on the training data data = Data(x,y,numpy.zeros(len(y)),xlabel=self.xlabel,ylabel=self.ylabel) self.dm=DiffusionMap(data,self.eps_par) mindist = self.dm.data_mindist() mindist[mindist < sys.float_info.min]=(mindist[mindist > sys.float_info.min]).min() mindist= numpy.log(mindist) mu, std = norm.fit(mindist) wok=numpy.abs(mindist-mu)/std < 3 mu, std = norm.fit(mindist[wok]) self.dm.par = numpy.exp(mu+self.eps_par*std) self.dm.make_map() # pklfile=open('estimator.pkl','w') # pickle.dump(self.dm,pklfile) # pklfile.close() # self.dm=DiffusionMap(x,self.eps_par) # self.dm.make_map() train_dist = sklearn.metrics.pairwise_distances(self.dm.data_dm().x,self.dm.data_dm().x) # catastrophe_distances = train_dist[numpy.outer(self.catastrophe,self.catastrophe)] # catastrophe_distances = catastrophe_distances[catastrophe_distances !=0] # catastrophe_distances = numpy.sort(catastrophe_distances) numpy.fill_diagonal(train_dist,train_dist.max()) #numpy.finfo('d').max) train_min_dist = numpy.min(train_dist,axis=0) train_min_dist = numpy.sort(train_min_dist) train_min_dist[train_min_dist < sys.float_info.min]=(train_min_dist[train_min_dist > sys.float_info.min]).min() catastrophe_min_dist = train_min_dist[self.catastrophe] catastrophe_min_dist=numpy.log(catastrophe_min_dist) mu, std = norm.fit(catastrophe_min_dist) wok=numpy.abs(catastrophe_min_dist-mu)/std < 3 mu, std = norm.fit(catastrophe_min_dist[wok]) self.max_distance = train_min_dist[x.shape[0]*self.outlier_cut] self.mask_scale = numpy.exp(mu+std*self.mask_var)
def getDistribution(filePath, refMeasurements_ds, refMeasurements_dth):
    # List containing all corner measurements
    measurements_ds = dE.getMeasurements(filePath, 'ds =')
    measurements_dth = dE.getMeasurements(filePath, 'dth =')

    # Measurements which are not due to random noise
    filteredMeasurements_ds = checkReference(refMeasurements_ds, measurements_ds, 0.2)
    filteredMeasurements_dth = checkReference(refMeasurements_dth, measurements_dth, 0.2)

    # Getting normal distribution parameters
    mu_ds, std_ds = norm.fit(filteredMeasurements_ds)
    mu_dth, std_dth = norm.fit(filteredMeasurements_dth)

    # return mu, std, allDifferences
    return mu_ds, std_ds, mu_dth, std_dth
def plot_histplot(self, files):
    # best fit of data
    (mu, sigma) = norm.fit(self.y)
    print "mu and sigma: " + str(mu) + ", " + str(sigma) + ""

    # Make the hist plot
    plt.figure(figsize=(12, 6))
    binwidth = 0.1
    color = "dodgerblue"

    # the histogram of the data
    n, bins, patches = plt.hist(
        self.y, normed=1, color=color,
        bins=np.arange(min(self.y), max(self.y) + binwidth, binwidth)
    )

    # add a 'best fit' line
    fit = mlab.normpdf(bins, mu, sigma)
    l = plt.plot(bins, fit, "b--", linewidth=2)

    plt.xlabel(u"Distance [Å]")
    plt.ylabel(u"Probability")
    title = "$\mathrm{Histogram\ of: \ " + files + "}$"
    title = title.replace("_", "\_")
    plt.title(r"" + title + "$\ \ \mu=%.3f,\ \sigma=%.3f$" % (mu, sigma))
    plt.savefig("plots/" + files + "_hist.png")
    plt.close()
def plot_t_value_hist(
        img_path='~/ni_data/ofM.dr/l1/as_composite/sub-5703/ses-ofM/sub-5703_ses-ofM_task-EPI_CBV_chr_longSOA_tstat.nii.gz',
        roi_path='~/ni_data/templates/roi/DSURQEc_ctx.nii.gz',
        mask_path='/usr/share/mouse-brain-atlases/dsurqec_200micron_mask.nii',
        save_as='~/qc_tvalues.pdf',
        ):
    """Make t-value histogram plot"""
    f, axarr = plt.subplots(1, sharex=True)

    roi = nib.load(path.expanduser(roi_path))
    roi_data = roi.get_data()
    mask = nib.load(path.expanduser(mask_path))
    mask_data = mask.get_data()
    idx = np.nonzero(np.multiply(roi_data, mask_data))
    img = nib.load(path.expanduser(img_path))
    data = img.get_data()[idx]

    (mu, sigma) = norm.fit(data)
    n, bins, patches = axarr.hist(data, 'auto', normed=1, facecolor='green', alpha=0.75)
    y = mlab.normpdf(bins, mu, sigma)
    axarr.plot(bins, y, 'r--', linewidth=2)
    axarr.set_title('Histogram of t-values $\mathrm{(\mu=%.3f,\ \sigma=%.3f}$)' % (mu, sigma))
    axarr.set_xlabel('t-values')
    plt.savefig(path.expanduser(save_as))
def plot_logistic_parameter_ratio(plot_conditions, plot_colors, control_condition, condition_logistic_params, xlim=[-.1,.2], ylim=[0,35]): fig=plt.figure() ax = fig.add_subplot(1, 1, 1) xx=np.arange(-.512,.512,.001) mean_a0=np.mean(condition_logistic_params['a0'][control_condition]) mean_a1=np.mean(condition_logistic_params['a1'][control_condition]) mean_a2=np.mean(condition_logistic_params['a2'][control_condition]) yy_r=1/(1+np.exp(-(mean_a0+mean_a1*xx+mean_a2*1))) yy_l=1/(1+np.exp(-(mean_a0+mean_a1*xx+mean_a2*-1))) ax.plot(xx,yy_l,'--', color=plot_colors[control_condition], linewidth=2, label='Left*') ax.plot(xx,yy_r,plot_colors[control_condition], linewidth=2, label='Right*') ax.legend(loc='best') ax.set_xlabel('coherence') ax.set_ylabel('P(R)') fig = plt.figure() ax = fig.add_subplot(1, 1, 1) xx=np.arange(xlim[0],xlim[1],0.001) binwidth=.02 for condition in plot_conditions: ratio=np.array(condition_logistic_params['a2'][condition]) / np.array(condition_logistic_params['a1'][condition]) bins=np.arange(min(ratio), max(ratio) + binwidth, binwidth) hist,edges=np.histogram(ratio, bins=bins) center = (bins[:-1] + bins[1:]) / 2 ax.bar(center, hist/float(len(ratio))*100.0, color=plot_colors[condition], alpha=0.75, label=condition, width=binwidth) (mu, sigma) = norm.fit(ratio) y = normpdf(xx, mu, sigma)*binwidth*100.0 ax.plot(xx, y, '--', color=plot_colors[condition], linewidth=2) ax.legend(loc='best') ax.set_xlim(xlim) ax.set_ylim(ylim) ax.set_xlabel('a2/a1') ax.set_ylabel('% subjects')
def PlotHistNorm(data, log=False):
    # distribution fitting
    param = norm.fit(data)
    mean = param[0]
    sd = param[1]

    # Set large limits
    xlims = [-6*sd + mean, 6*sd + mean]

    # Plot histogram
    histdata = hist(data, bins=12, alpha=.3, log=log)

    # Generate X points
    x = linspace(xlims[0], xlims[1], 500)

    # Get Y points via Normal PDF with fitted parameters
    pdf_fitted = norm.pdf(x, loc=mean, scale=sd)

    # Get histogram data, in this case bin edges
    xh = [0.5 * (histdata[1][r] + histdata[1][r+1]) for r in xrange(len(histdata[1])-1)]

    # Get bin width from this
    binwidth = (max(xh) - min(xh)) / len(histdata[1])

    # Scale the fitted PDF by area of the histogram
    pdf_fitted = pdf_fitted * (len(data) * binwidth)

    # Plot PDF
    plot(x, pdf_fitted, 'r-')
def plotDistribution(l, a, b, c=0, t="No title"):
    d = []
    if c == 0:
        data1 = np.array(l[a:b])
        data2 = np.array(l[b:len(l)])
        d = [(data1, "c", "Objective"),
             (data2, "r", "Subjective")]
    else:
        data1 = np.array(l[a:b])
        data2 = np.array(l[b:c])
        data3 = np.array(l[c:len(l)])
        d = [(data1, "r", "Negative"),
             (data2, "c", "Objective"),
             (data3, "g", "Positive")]

    for data in d:
        # fit a normal distribution to the data
        mu, std = norm.fit(data[0])
        lb = data[2] + " : mu = %.2f, std = %.2f" % (mu, std)

        # plot histogram
        plt.hist(data[0], normed=True, alpha=0, color='g')

        # Plot the PDF.
        xmin, xmax = plt.xlim()
        x = np.linspace(xmin, xmax, 100)
        p = norm.pdf(x, mu, std)
        plt.plot(x, p, data[1], linewidth=2, label=lb)

    plt.title(t)
    plt.legend(loc="upper right")
    plt.xlabel("sentiment score")
    plt.show()
def AnalysePredictor(self, train, predictor_transformation='none'):
    if self.predictor_name is None:
        raise TypeError("Execute the SetUpTrainTest method to use this feature")
        return
    # http://matplotlib.org/users/pyplot_tutorial.html
    if self.predictor_type == 'continuous':
        values = train[self.predictor_name]
        if predictor_transformation == 'log':
            values = np.log(values)
        else:
            predictor_transformation = 'none'  # in case of an unsupported transformation

        # fit the normal distribution on ln(loss)
        (mu, sigma) = norm.fit(values)

        # the histogram of the ln(loss)
        n, bins, patches = plt.hist(values, 60, normed=1, facecolor='green', alpha=0.75)

        # add the fitted line
        y = mlab.normpdf(bins, mu, sigma)
        l = plt.plot(bins, y, 'r--', linewidth=2)

        # plot
        plt.xlabel('Predictor: ' + self.predictor_name + ' - Transformation: ' + predictor_transformation)
        plt.ylabel('Probability')
        plt.title(r'$\mathrm{Histogram\ of\ Ln(Loss):}\ \mu=%.3f,\ \sigma=%.3f$' % (mu, sigma))
        plt.grid(True)
        plt.show()
    else:
        print 'predictor_type not implemented'
def plot_bandwith(file,subplot): bandwith = [] time = [] for line in file: l = line.split(",") if len(l) == 9: #the it is a client report transfered=l[7] Bps=float(l[8]) time_=l[6].split('-')[0] if (Bps > 0.0): bandwith.append(Bps/1000) time.append(time_) else: #the server's report total_trans=l[7] average_bandwith=l[8] jitter=l[9] loss=l[10] total_pack=l[11] loss_rate =l[12] # average_bandwith=0 # for i in bandwith: # average_bandwith+=i # average_bandwith = average_bandwith/len(bandwith) # average=[] # stdp=[] #standar deviation positive # stdn =[] #standard deviation negative # sd = np.std(bandwith) # for i in bandwith: # average.append(average_bandwith) # stdp.append(average_bandwith+sd) # stdn.append(average_bandwith-sd) # print "Average ", average_bandwith, "SD ",sd #plt.xlabel("Time (s)") #plt.ylabel("Bandwith (Kb)") #plt.plot(time[:-1],bandwith[:-1],"r.",label="Bandwith") #plt.plot(time[:-1],average[:-1],"b",label="Average") #plt.plot(time[:-1],stdp[:-1],"g",label="Standard deviation") #plt.plot(time[:-1],stdn[:-1],"g") #plt.legend() (mu,sigma) = norm.fit(bandwith[:-1]) #print mu, sigma # plt.subplot(340+current) # plt.subplots(nrows=4,ncols=3) n , bins , patches = subplot.hist(bandwith[:-1], 30,normed=True,facecolor='green',alpha=1) #print bins #y = mlab.normpdf(bins,average_bandwith,sd) y = mlab.normpdf(bins,mu,sigma) # plt.xlabel("Bandwidth (Mbps)",fontsize=15) subplot.set_xlabel("Bandwidth (Mbps)",fontsize=4.5,style='italic') #subplot.plot(bins,y,'b-') subplot.plot(bins, 1/(sigma * np.sqrt(2 * np.pi)) *np.exp( - (bins - mu)**2 / (2 * sigma**2) ),linewidth=2, color='r') # subplot.subplots_adjust(left=0.15) #print file.name.split(":")[0].split("/")[1] print file.name.split(":")[0].split("/")[1] subplot.set_title(r'$\mathrm{Histogram\ of\ GS\ %s:}\ \mu=%.3f,\ \sigma=%.3f$' %(file.name.split(":")[0].split("/")[1],mu, sigma),fontsize=6) subplot.grid(True)
def log_histogram(code, r, bins=30):
    # loads the file containing the masses for code
    mass_file = cd.get_output_file_name(code=code, attribute='tm', redshift=r)
    # remove units from data with 'Msun' unit attached
    mass_data = cd.format_data(load_file=mass_file, file_unit_type='mass', tuple_data='no')
    x = [float(i) for i in mass_data]
    # Take the log base 10 of the masses
    logx = np.log10(x)

    #######################################
    # Plot with dashed bars and curve fit #
    #######################################
    plt.figure(figsize=(15, 10))

    # best fit of data
    (mu, sigma) = norm.fit(logx)

    # the histogram of the data (n=logx bins=bins)
    n, bins, patches = plt.hist(logx, int(bins), normed=1, facecolor='green', alpha=0.75)

    # add a 'best fit' line
    y = mlab.normpdf(bins, mu, sigma)
    l = plt.plot(bins, y, 'r--', linewidth=2)

    # plot
    plt.xlabel('Msun')
    plt.ylabel('RFrequency')
    plt.title('Histogram of Mass distribution for %s %s' % (code, r))
    plt.xscale('log')
    plt.yscale('log')
    plt.xlim(left=0, right=14)
    plt.grid(True)
    plt.savefig(str(code) + str(r) + str(bins) + 'loghist.png')
def tVertexErrorHist(diffs, nEvents, title=None, ranges=None, quiet=True):
    """
    Plots an error histogram for tVertex-genVertex z values.
    Usage: tVertexErrorHist(differences, number of counts, quiet=True)
    """
    absDiffs = np.absolute(diffs)
    fig, ax = plt.subplots()                                            # |Set up graph
    n, bins, patches = ax.hist(diffs, normed=False, range=ranges, bins=100)
    (mu, sigma) = norm.fit(diffs)                                       # |Fit curve
    muerr = sigma / np.sqrt(nEvents)
    dx = bins[1] - bins[0]                                              # |Get bin width
    scale = dx * len(absDiffs)                                          # |Scale histogram
    fitline = mlab.normpdf(bins, mu, sigma) * scale
    ax.plot(bins, fitline, "r--", linewidth=2)                          # |Plot fit line
    ax.set_xlabel("Error (mm)")
    ax.set_ylabel("Counts ($\Sigma=%i$)" % nEvents)
    if title == None:
        ax.set_title("tVertexed $z$ - genVertex $z$ for 500GeV $\gamma$-gun")
    else:
        ax.set_title(title)
    # Build output and figure text string
    string = "$\mu$ = %.3f$\pm$%.3fmm, $\sigma_{fitted}$ = %.3fmm \n" % (mu, muerr, sigma)  # |Print out info box
    string += "68% error magnitude: {0:>6.3f}mm \n".format(np.percentile(absDiffs, 68.27))
    string += "Median error magnitude: {0:>6.3f}mm \n".format(np.median(absDiffs))
    string += "Mean error magnitude: {0:>6.3f}mm \n".format(np.mean(absDiffs))
    # Changing font to look nicer and line up colons
    font = {"family": "monospace", "weight": "bold", "size": 9}
    matplotlib.rc("font", **font)                                       # |Changes font to monospace
    # Annotate plot
    ax.text(0.02, 0.99, string, transform=ax.transAxes, verticalalignment="top")
    if not quiet:
        print string
    # Show it
    plt.show()
def estimate_normal_params(x, outlier=True, value=3.):
    """
    Estimate the mean (as median) and variance (as MAD).

    Parameters:
    ----------------------
    x: array, float
    outlier: bool, with or without outlier removal
    value: float, factor for outlier removal (points farther than value * sd
        from the median are removed)

    Returns:
    ---------------------
    (loc, scale): float, float
        tuple of loc and scale estimation
    """
    if outlier:
        mu, sd = norm.fit(x)
        # the ML estimates are replaced with robust ones before trimming
        mu = np.median(x)
        sd = mad(x)
        t = x
        t = t[(t >= mu - value * sd) & (t <= mu + value * sd)]
        loc = np.mean(t)
        scale = np.std(t)
    else:
        loc = np.median(x)
        scale = mad(x)
    return (loc, scale)
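# Added sketch (not from the original source): the snippet above relies on a
# `mad` helper that is not shown. A minimal implementation, assuming it is the
# median absolute deviation scaled to be consistent with a normal standard
# deviation (the 1.4826 factor is an assumption, not taken from the original):
import numpy as np

def mad(x, scale=1.4826):
    x = np.asarray(x)
    return scale * np.median(np.abs(x - np.median(x)))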
def getDistribution(filePath, refMeasurements):
    # List containing all corner measurements
    measurements = dE.getMeasurements(filePath, 'corners_world =')

    # Measurements which are not due to random noise
    filteredMeasurements = checkReference(refMeasurements, measurements, 50)

    # Split filtered corners in list of points that belong together
    measurements_point_A = dE.removeSublistLevel(filteredMeasurements, 0)
    measurements_point_B = dE.removeSublistLevel(filteredMeasurements, 1)
    measurements_point_C = dE.removeSublistLevel(filteredMeasurements, 2)

    # Extracting difference in x and y coordinates from split corners
    x_A = dE.removeSublistLevel(measurements_point_A, 0)
    y_A = dE.removeSublistLevel(measurements_point_A, 1)
    x_B = dE.removeSublistLevel(measurements_point_B, 0)
    y_B = dE.removeSublistLevel(measurements_point_B, 1)
    x_C = dE.removeSublistLevel(measurements_point_C, 0)
    y_C = dE.removeSublistLevel(measurements_point_C, 1)

    # Putting all the differences in one list
    allDifferences = []
    allDifferences.extend(x_A)
    allDifferences.extend(y_A)
    allDifferences.extend(x_B)
    allDifferences.extend(y_B)
    allDifferences.extend(x_C)
    allDifferences.extend(y_C)

    # Getting normal distribution parameters
    mu, std = norm.fit(allDifferences)

    return mu, std, allDifferences
def plot_assumption_free(scores, data, bins=50):
    """
    Plots the scores from the analysis using the assumption free algorithm.
    """
    plt.figure()
    plt.subplot(2, 1, 1)
    (data.acc / data.acc.max()).plot()
    (data.hr / data.hr.max()).plot()
    data.ratio_log.plot()
    plt.legend(loc='best')
    plt.subplot(2, 1, 2)
    plt.plot(data.index[:len(scores)], scores)

    scores = [x for x in scores if abs(x) > 10 ** -10]
    s_mean, sigma = norm.fit(scores)
    plt.figure()
    # keep the bin edges so the fitted PDF is evaluated on the histogram grid
    # (the original passed the integer bin count to norm.pdf, which cannot work)
    _, bin_edges, _ = plt.hist(scores, bins=bins, normed=True)
    plt.plot(bin_edges, norm.pdf(bin_edges, loc=s_mean, scale=sigma))

    vlin = linspace(s_mean - 3 * sigma, s_mean + 3 * sigma, 13)
    step = int(256 / ((len(vlin) - 1) / 2))
    colors = linspace(0, 1, 256)[::step][:(len(vlin) - 1) / 2]
    colors = [(c, 0, 0) for c in colors]
    colors += [(1, 1, 1)]
    colors += [(0, c, 0) for c in reversed(colors)]
    plt.vlines(vlin.tolist()[1:], 0, 1, colors[1:])
def plot(fileName, color, label):
    data = []
    # read the sample values (the original passed the builtin `file` to open();
    # `fileName` is clearly what was intended)
    f = open('%s%s' % (carpeta, fileName), 'r')
    lines = f.readlines()
    f.close()
    for line in lines:
        data.append(float(line))

    n, bins, patches = pyplot.hist(data, bins=15, range=(100, 500), normed=True,
                                   color="%c" % color, label=label, alpha=0.53, linewidth=0.3)

    # normal fitting
    (mu, sigma) = norm.fit(data)
    pdf_norm = mlab.normpdf(bins, mu, sigma)
    pyplot.plot(bins, pdf_norm, "%c--" % color, linewidth=1, label=None)

    # lognormal fitting
    shape, loc, scale = stats.lognorm.fit(data, floc=0)
    mu = np.log(scale)   # Mean of log(X)
    sigma = shape        # Standard deviation of log(X)
    M = np.exp(mu)       # Geometric mean == median
    s = np.exp(sigma)    # Geometric standard deviation
    x = np.linspace(100, 500)
    pdf_lognorm = stats.lognorm.pdf(x, shape, loc=0, scale=scale)
    pyplot.plot(x, pdf_lognorm, "%c" % color, linewidth=1, label=None)  # Plot fitted curve

    pyplot.vlines(mu, 0, pdf_norm.max(), linestyle='-', label=None)
    pyplot.vlines(M, 0, pdf_lognorm.max(), linestyle=':', label=None)

    ax = pyplot.gca()  # Get axis handle for text positioning
    ax.text(M, pdf_norm.max(), u"%s\nmedian=%.2f ms\n%i samples" % (fileName, M, len(data)),
            style='italic', color=color, size='small')
    pyplot.legend()
    pylab.savefig("%s%s" % (carpeta, 'img.png'))
def clik1((al, origclust, dellen)):
    allen = al.shape[0]
    stats = []
    for i in xrange(BOOTREPS):
        boot = np.array(random.sample(al, dellen))
        stats.append(clust(boot))
    return norm(*norm.fit(stats)).pdf(origclust)
def mcmc_tt(al=np.genfromtxt(ALIGNFILE, delimiter=',').astype(np.int), imps=IMPS):
    print 'Building likelihood distributions...'
    rdist = np.genfromtxt(RDIST, delimiter=',')
    ldist = norm(*norm.fit(rdist))
    pdist = cclass(al, imps)
    print 'Starting MCMC:'
    print 'Step#\t|New Lik\t|New PropLik\t|Old Lik\t|Old PropLik\t|Accept Prob'
    old = impute.impute(al, imps, orderfunc=ORDERFUNC)
    old_tt = tt.ttratio(old)
    old_lik = ldist.pdf(old_tt)
    old_plik = pdist.pdf(old_tt)
    states = [(clust(old), old_lik, old_plik, old_lik, old_plik, 1)]
    for i in xrange(STEPS):
        prop = impute.impute(al, imps, orderfunc=ORDERFUNC)
        prop_tt = tt.ttratio(prop)
        prop_lik = ldist.pdf(prop_tt)
        prop_plik = pdist.pdf(prop_tt)
        a = (prop_lik/old_lik)*(old_plik/prop_plik)
        states.append((clust(old), prop_lik, prop_plik, old_lik, old_plik, a))
        print '%d\t|%2f\t|%2f\t|%2f\t|%2f\t|%e' % (i+1, prop_lik, prop_plik, old_lik, old_plik, a)
        if random.random() < a:
            old, old_tt, old_lik, old_plik = prop, prop_tt, prop_lik, prop_plik
            states.append((clust(old), prop_lik, prop_plik, old_lik, old_plik, a))
    np.savetxt(OUT_STATES, np.array(states), delimiter=',')
def tlik((al, origtt, dellen)):
    allen = al.shape[0]
    stats = []
    for i in xrange(BOOTREPS):
        boot = np.array(random.sample(al, dellen))
        stats.append(tt.ttratio(boot))
    return norm(*norm.fit(stats)).pdf(origtt)
def clik((al, origclust, dellen)):
    allen = al.shape[0]
    stats = []
    for i in xrange(BOOTREPS):
        boot = al[np.random.choice(xrange(allen), dellen, replace=0)]
        stats.append(clust(boot))
    return norm(*norm.fit(stats)).pdf(origclust)
def construct_gs_hist(del_bl=8., num_bl=10, beam_sig=0.09, fq=0.1):
    save_tag = 'grid_del_bl_{0:.2f}_num_bl_{1}_beam_sig_{2:.2f}_fq_{3:.3f}'.format(del_bl, num_bl, beam_sig, fq)
    save_tag_mc = 'grid_del_bl_{0:.2f}_num_bl_{1}_beam_sig_{2:.2f}_fq_{3}'.format(del_bl, num_bl, beam_sig, fq)
    ys = load_mc_data('{0}/monte_carlo/{1}'.format(data_loc, save_tag_mc))
    print 'ys ', ys.shape

    alms_fg = qgea.generate_sky_model_alms(gsm_fits_file, lmax=3)
    alms_fg = alms_fg[:, 2]

    baselines, Q, lms = load_Q_file(gh='grid', del_bl=del_bl, num_bl=num_bl, beam_sig=beam_sig, fq=fq, lmax=3)
    N = total_noise_covar(0.1, baselines.shape[0], '{0}/gsm_matrices/gsm_{1}.npz'.format(data_loc, save_tag))
    MQN = return_MQdagNinv(Q, N, num_remov=None)
    print MQN

    ahat00s = n.array([])
    for ii in xrange(ys.shape[1]):
        #_,ahat,_ = qgea.test_recover_alms(ys[:,ii],Q,N,alms_fg,num_remov=None)
        ahat = uf.vdot(MQN, ys[:, ii])
        ahat00s = n.append(n.real(ahat[0]), ahat00s)
    #print ahat00s
    print ahat00s.shape

    _, bins, _ = p.hist(ahat00s, bins=36, normed=True)

    # plot best fit line
    mu, sigma = norm.fit(ahat00s)
    print "mu, sigma = ", mu, ', ', sigma
    y_fit = mpl.mlab.normpdf(bins, mu, sigma)
    p.plot(bins, y_fit, 'r--', linewidth=2)

    p.xlabel('ahat_00')
    p.ylabel('Probability')
    p.title(save_tag)
    p.annotate('mu = {0:.2f}\nsigma = {1:.2f}'.format(mu, sigma), xy=(0.05, 0.5), xycoords='axes fraction')
    p.savefig('./figures/monte_carlo/{0}.pdf'.format(save_tag))
    p.clf()
def plot_delay(file):
    delays = []
    for line in file:
        if line.find("time") != -1:
            if line[line.find("time")+4] == '=':
                data = line.split("time")[1].split("=")[1].split(" ")[0]
                delays.append(float(data))

    (mu, sigma) = norm.fit(delays)
    print mu, sigma
    n, bins, patches = plt.hist(delays, 15, normed=True, facecolor='green', alpha=1)
    #print bins
    #y = mlab.normpdf(bins,average_bandwith,sd)
    y = mlab.normpdf(bins, mu, sigma)
    # plt.xlabel("Bandwidth (Mbps)",fontsize=15)
    plt.xlabel("Delay (ms)", fontsize=13, style='italic')
    plt.ylabel("Occurrence probability", fontsize=13, style='italic')
    data1 = frange(140, 180, 0.3)
    #subplot.plot(bins,y,'b-')
    plt.plot(data1, 1/(sigma * np.sqrt(2 * np.pi)) * np.exp(-(data1 - mu)**2 / (2 * sigma**2)),
             linewidth=2, color='r')
    print file.name.split(":")[0].split("/")[1]
    plt.title(r'$\mathrm{Histogram\ of\ GS\ %s:}\ \mu=%.3f,\ \sigma=%.3f$'
              % (gs[file.name.split(":")[0].split("/")[1]], mu, sigma), fontsize=16)
    plt.grid(True)
    plt.tight_layout()
def mcmc_sym_dist(alignment, num_imp, dem_ratios, directory, length, burnin): acceptances = 0 d = transprobs(TRANSITIONS, MARGINAL) pd = pdn(alignment) mins = np.array([sorted(i) for i in pd]) nloc, nscale = norm.fit(mins) dist = norm(nloc, nscale) # Build first state of Markov chain print 'Imputing first alignment...' current = impute.imp_align(num_imp, alignment, dem_ratios) current.loglik = loglik(current)+math.log(distlik(current, num_imp, nloc, 1000)) print '\t Log likelihood %2f' % current.loglik if not burnin: AlignIO.write(current, '%s/%d.fasta' % (directory,0), 'fasta') # Run chain for i in xrange(1,length+1): proposal = propose(current,num_imp,max(norm(loc=2,scale=1).rvs(),1), d) l1 = loglik(proposal) l2 = math.log(distlik(proposal, num_imp, nloc, 1000)) proposal.loglik = l1+l2 p = proposal.loglik-current.loglik print 'Current LLH: %2f; Proposed LLH: %2f' % (current.loglik, proposal.loglik) print '\tPhylogeny component: %2f; Distance component: %2f' % (l1, l2) print '\tAcceptance probability %e' % math.exp(p) if random.random()<math.exp(p): current = proposal acceptances += 1 print '\tAccepted' else: print '\tNot accepted' if i > burnin: AlignIO.write(current, '%s/%d.fasta' % (directory,i-burnin), 'fasta') return float(acceptances)/length
def absolute_sdm(obs_cube, mod_cube, sce_cubes, *args, **kwargs): """ apply absolute scaled distribution mapping to all scenario cubes assuming a normal distributed parameter Args: * obs_cube (:class:`iris.cube.Cube`): the observational data * mod_cube (:class:`iris.cube.Cube`): the model data at the reference period * sce_cubes (:class:`iris.cube.CubeList`): the scenario data that shall be corrected Kwargs: * cdf_threshold (float): limit of the cdf-values (default: .99999) """ from scipy.stats import norm from scipy.signal import detrend cdf_threshold = kwargs.get('cdf_threshold', .99999) obs_cube_mask = np.ma.getmask(obs_cube.data) cell_iterator = np.nditer(obs_cube.data[0], flags=['multi_index']) while not cell_iterator.finished: index_list = list(cell_iterator.multi_index) cell_iterator.iternext() index_list.insert(0, 0) index = tuple(index_list) if obs_cube_mask and obs_cube_mask[index]: continue index_list[0] = slice(0, None, 1) index = tuple(index_list) # consider only cells with valid observational data obs_data = obs_cube.data[index] mod_data = mod_cube.data[index] obs_len = len(obs_data) mod_len = len(mod_data) obs_mean = obs_data.mean() mod_mean = mod_data.mean() # detrend the data obs_detrended = detrend(obs_data) mod_detrended = detrend(mod_data) obs_norm = norm.fit(obs_detrended) mod_norm = norm.fit(mod_detrended) obs_cdf = norm.cdf(np.sort(obs_detrended), *obs_norm) mod_cdf = norm.cdf(np.sort(mod_detrended), *mod_norm) obs_cdf = np.maximum(np.minimum(obs_cdf, cdf_threshold), 1 - cdf_threshold) mod_cdf = np.maximum(np.minimum(mod_cdf, cdf_threshold), 1 - cdf_threshold) for sce_cube in sce_cubes: sce_data = sce_cube[index].data sce_len = len(sce_data) sce_mean = sce_data.mean() sce_detrended = detrend(sce_data) sce_diff = sce_data - sce_detrended sce_argsort = np.argsort(sce_detrended) sce_norm = norm.fit(sce_detrended) sce_cdf = norm.cdf(np.sort(sce_detrended), *sce_norm) sce_cdf = np.maximum(np.minimum(sce_cdf, cdf_threshold), 1 - cdf_threshold) # interpolate cdf-values for obs and mod to the length of the # scenario obs_cdf_intpol = np.interp(np.linspace(1, obs_len, sce_len), np.linspace(1, obs_len, obs_len), obs_cdf) mod_cdf_intpol = np.interp(np.linspace(1, mod_len, sce_len), np.linspace(1, mod_len, mod_len), mod_cdf) # adapt the observation cdfs # split the tails of the cdfs around the center obs_cdf_shift = obs_cdf_intpol - .5 mod_cdf_shift = mod_cdf_intpol - .5 sce_cdf_shift = sce_cdf - .5 obs_inverse = 1. / (.5 - np.abs(obs_cdf_shift)) mod_inverse = 1. / (.5 - np.abs(mod_cdf_shift)) sce_inverse = 1. / (.5 - np.abs(sce_cdf_shift)) adapted_cdf = np.sign(obs_cdf_shift) * ( 1. - 1. / (obs_inverse * sce_inverse / mod_inverse)) adapted_cdf[adapted_cdf < 0] += 1. adapted_cdf = np.maximum(np.minimum(adapted_cdf, cdf_threshold), 1 - cdf_threshold) xvals = norm.ppf(np.sort(adapted_cdf), *obs_norm) \ + obs_norm[-1] / mod_norm[-1] \ * (norm.ppf(sce_cdf, *sce_norm) - norm.ppf(sce_cdf, *mod_norm)) xvals -= xvals.mean() xvals += obs_mean + (sce_mean - mod_mean) correction = np.zeros(sce_len) correction[sce_argsort] = xvals correction += sce_diff - sce_mean sce_cube.data[index] = correction
# Compile and run our program process = Popen(["gcc", "-o", "01_cachedemo", "01_cachedemo.c"]) process.wait() process = Popen( ["./01_cachedemo", "10000", "10000", "01_flush.txt", "01_noflush.txt"]) process.wait() # Generate histograms for measurements of NOT cached values s = "" with open('01_flush.txt') as f: s = f.read() s = numpy.fromstring(s, dtype=int, sep=',') srt_row = numpy.array(sorted(s, key=int, reverse=False)).astype(numpy.int) # Sort and cut away 5% of the biggest measurements cutSortedRow = srt_row[:int(len(srt_row) * 0.95)] (mu, sigma) = norm.fit(cutSortedRow) n, bins, patches = plt.hist(cutSortedRow, weights=numpy.zeros_like(cutSortedRow) + 1. / cutSortedRow.size) y = norm.pdf(bins, mu, sigma) plt.plot(bins, y, 'r--', linewidth=2) plt.xlabel('Cycles') plt.ylabel('Frequency') plt.title( f"""Main memory access speed - \u03BC: {round(mu, 2)}, \u03C3: {round(sigma, 2)}""" ) plt.grid(True) plt.locator_params(axis='x', nbins=5) plt.tight_layout() plt.savefig('01_flush.png') plt.close()
def hist(df, feature, bins=50): '''Plots bokeh histogram, PDF & CDF of a DF feature. Parameters ---------- df : DataFrame DF of the data. feature : str Column name of the df. bins : int Number of bins to plot. Returns ------- None ''' #not nan feature values x = df[feature][df[feature].notna()].values #Get the values for the histogram and bin edges (length(hist)+1)/ #Use density to plot pdf and cdf on the same plot. hist, edges = np.histogram(x, bins=bins, density=True) ### PDF & CDF ## #find normal distribution parameters mu, sigma = norm.fit(x) xs = np.linspace(min(x), max(x) + 1, len(x)) #x values to plot the line(s) pdf = norm.pdf(xs, loc=mu, scale=sigma) #probability distribution function cdf = norm.cdf(xs, loc=mu, scale=sigma) #cumulative distribution function #data sources for cdf source_cdf = ColumnDataSource({'cdf': cdf, 'xs': xs}) #create the canvas p1 = figure(title='Histogram, PDF & CDF', plot_height=400, x_axis_label=feature, y_axis_label='Density') #add histogram p1.quad(bottom=0, top=hist, left=edges[:-1], right=edges[1:], fill_color='royalblue', line_color='black', alpha=0.7) #add pdf p1.line(xs, pdf, line_color='red', line_width=5, alpha=0.5, legend_label='PDF') #set left-hand y-axis range p1.y_range = Range1d(0, max(hist) + 0.05 * max(hist)) #setting the second y axis range name and range p1.extra_y_ranges = {"cdf": Range1d(start=0, end=1.05)} #adding the second y axis to the plot and to the right. p1.add_layout(LinearAxis(y_range_name="cdf", axis_label='CDF'), 'right') #add cdf with y range on the right cdf_plot = p1.line('xs', 'cdf', source=source_cdf, alpha=0.8, line_color='darkgoldenrod', line_width=5, legend_label='CDF', y_range_name='cdf', name='cdf', hover_line_color='green') #hover tool p1.add_tools( HoverTool(renderers=[cdf_plot], tooltips=[('Prob', '@cdf{0.00}')], mode='hline')) #figure properties p1.xgrid.visible = False #hide entries when clocking on a legend p1.legend.click_policy = "hide" show(p1)
vec = P elif sys.argv[3] == 'eccentricity': vec = E elif sys.argv[3] == 'solidity': vec = S history = vec nn = int(steps) for counter in range(nn): vec = SS.dot(vec) history = np.hstack((history,vec)) ########################################## # Fit a normal distribution to the data: attribute = np.log2(np.mean(history,axis=1)) mu, std = norm.fit(attribute) # you could also fit to a lognorma the original data sns.set(style='white', rc={'figure.figsize':(5,5)}) plt.hist(attribute, bins=100, density=True, alpha=0.6, color='g') #Plot the PDF. xmin, xmax = plt.xlim() x = np.linspace(xmin, xmax, 100) p = norm.pdf(x, mu, std) plt.plot(x, p, 'k', linewidth=2) title = "Fit results: mu = %.2f, std = %.2f" % (mu, std) plt.title(title) plt.savefig("./png/distro-"+str(sys.argv[3])+".png") # save as png plt.close() ########################################### # create empty list for node colors pos = XY
vp.append(v) for w in v: wp.append(w) UN2s.append(size(np.unique(wp))) print np.mean(OBS2s), np.mean(DET2s), np.mean(CHAR2s), np.mean(UN2s) #D1 = UNs #D2 = UN2s #xmax = 11 D1 = DETs D2 = CHARs xmax = 18 bins = range(xmax+1) (mu, sigma) = norm.fit(D1) y1 = mlab.normpdf( bins, mu, sigma) (mu, sigma) = norm.fit(D2) y2 = mlab.normpdf( bins, mu, sigma) close('all') figure(2) grid('on') fsz = 18 plot(bins, y1, 'b-o', linewidth=2, markersize=5, label='KasdinBraems') plot(bins, y2, 'r-o', linewidth=2, markersize=5, label='Nemati') #xlabel('Unique planet detections', fontsize=fsz) xlabel('Total planet detections', fontsize=fsz) ylabel('Normalized frequency', fontsize=fsz) xlim(0,xmax) tick_params(axis='both', which='major', labelsize=fsz)
# zero_file = 'fms/agents/zerointelligencetrader.py' with open(zero_file, 'w') as f: f.write(zero_agent) # process = Popen(['python2', 'startfms.py', 'run', 'config.yml'], stdout=PIPE, stderr=PIPE) stdout, stderr = process.communicate() if len(stdout) != 0: print('STDOUT', stdout) if len(stderr) != 0: print('STDERR', stderr) # df = pd.read_csv('output.csv', skiprows=[0], sep=';') df['return'] = df['price'] / 100000 - 1 # mu, sigma = norm.fit(df['return']) skew, kurtosis = st.skew(df['return']), st.kurtosis(df['return']) autocorr = f_autocorr(df['return'].dropna().abs())[0, 1] print('{},{},{},{},{}'.format( mu, sigma, skew, kurtosis, autocorr)) result_df = result_df.append({ 'zero_pct': zero_pct, 'herding_pct': herding_pct, 'threshold_pct': threshold_pct, 'mu': mu, 'sigma': sigma, 'skew': skew, 'kurtosis': kurtosis, 'autocorr': autocorr, }, ignore_index=True) result_df.to_csv('result.csv.10times.2.csv', index=False)
extractor = AppearanceExtractor(0, 0, TEST_SEASONS, 1, 1) train_input, train_output = extractor.get_train_data() non_mol = [ data[0] for data, label in zip(train_input, train_output) if label == 0.0 ] mol = [ data[0] for data, label in zip(train_input, train_output) if label == 1.0 ] plt.figure(figsize=(12, 3)) plt.xlabel("Relative Appearance") plt.ylabel("Is 'mol'") plt.yticks(np.linspace(0.0, 1.0, 11)) plt.gcf().subplots_adjust(bottom=0.15) mol_norm = norm.fit(mol) X = np.linspace(-1.5, 1.0, 500) mol_Y = [norm.pdf(x, loc=mol_norm[0], scale=mol_norm[1]) for x in X] plt.plot(X, mol_Y, color='r') non_mol_norm = norm.fit(non_mol) non_mol_Y = [ norm.pdf(x, loc=non_mol_norm[0], scale=non_mol_norm[1]) for x in X ] plt.plot(X, non_mol_Y, color='g') non_mol_multiplier = len(non_mol) / len(train_output) mol_multiplier = len(mol) / len(train_output) posterior = [ my * mol_multiplier / (my * mol_multiplier + ny * non_mol_multiplier) for my, ny in zip(mol_Y, non_mol_Y)
Ea_ch3oh.append(b_ch3oh) dEa_all.append(dEa) print mat.cat, mat.cattype, mat.ets_ch4, mat.ets_ch3oh, dEa labels = [] colors = [] for mclass in dEa_dict: labels.append(mclass) colors.append(dEa_dict[mclass]['clr']) n, bins, patches = plt.hist(dEa_all, nbins, normed=1, label=labels, color=colors, stacked=True) mu, std = norm.fit(dEa_all) plt.xlim(0, 2) xmin, xmax = plt.xlim() x = np.linspace(xmin, xmax, 100) p = norm.pdf(x, mu, std) plt.plot(x, p, 'k', linewidth=2) title = r"Fit results: $\mu$ = %.2f, $\sigma$ = %.2f" % (mu, std) plt.ylabel(r'Counts') plt.xlabel(r'$E^a_{CH_4} - E^a_{CH_3OH}$ (eV)') plt.title(title) plt.tight_layout() plt.legend(fontsize=10) plt.savefig('fig-S7c-beef-RuO2-ECH4-ECH3OH.pdf') plt.cla() n, bins, patches = plt.hist(Ea_ch4,
# drop features drop_feats = [ 'WW_GRS', 'PERCENT', 'NM_0.5W_T', 'NM_0.5W_M24', 'NM_0.5W_M26', 'NM_0.5W_F24', 'NM_0.5W_F26', 'GENRE2' ] df.drop(drop_feats, axis=1, inplace=True) # check OBO df['OBO'].describe() # orginal data sns.distplot(df['OBO'], fit=norm) # Get the fitted parameters used by the function (mu, sigma) = norm.fit(df['OBO']) print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma)) #Now plot the distribution plt.legend( ['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc='best') plt.ylabel('Frequency') plt.title('distribution') #Get also the QQ-plot fig = plt.figure() res = stats.probplot(df['OBO'], plot=plt) plt.show() # log transformation sns.distplot(np.log(df['OBO']), fit=norm)
def draw_distribution_of_contacts(): import os import zipfile from bokeh.plotting import figure, output_file, show import numpy as np with zipfile.ZipFile('/Users/trman/OneDrive/Projects/PyTorch/trainingFiles/PDBBind/target_feature_vectors/aadistancematrix500.zip') as z: for dist_fl_name in z.namelist(): if not os.path.isdir(dist_fl_name) and dist_fl_name.endswith("tsv"): print(dist_fl_name) dist_lst = [] prot_id = dist_fl_name.split(".")[0] #dist_fl = open("{}/{}".format(dist_folder_path, dist_fl_name), "r") with z.open(dist_fl_name) as f: row_ind = 0 for line in f: col_values = str(line).split("\\t") # print(col_values) for col_ind in range(len(col_values)): dist = 0 if col_ind > row_ind: if col_ind==row_ind or (col_ind!=row_ind and col_values[col_ind]!="0.0"): try: dist = float(col_values[col_ind]) dist_lst.append(dist) except: pass row_ind += 1 dist_lst = sorted(dist_lst) # print(dist_lst) output_file("line.html") p = figure(plot_width=400, plot_height=400) lst_indices = list(range(len(dist_lst))) # print(lst_indices) # add a circle renderer with a size, color, and alpha arr_hist, edges = np.histogram(dist_lst, bins=1000, range=[0.0, 1.0]) # Put the information in a dataframe distances = pd.DataFrame({'arr_dist': arr_hist, 'left': edges[:-1], 'right': edges[1:]}) # print(distances) # Create the blank plot p = figure(plot_height=600, plot_width=600, title='Histogram distances of aminoacids on 3D', x_axis_label='Aminoacid pairs', y_axis_label='Distance') # Add a quad glyph p.quad(bottom=0, top=distances['arr_dist'], left=distances['left'], right=distances['right'], fill_color='red', line_color='black') print(pd.DataFrame(dist_lst).describe()) # Show the plot show(p) import numpy as np from scipy.stats import norm import matplotlib.pyplot as plt # Generate some data for this demonstration. data = np.asarray(dist_lst) print(type(data)) # Fit a normal distribution to the data: mu, std = norm.fit(np.asarray(data)) # Plot the histogram. plt.hist(data, bins=100, density=True, alpha=0.6, color='g') # Plot the PDF. xmin, xmax = plt.xlim() x = np.linspace(xmin, xmax, 100) p = norm.pdf(x, mu, std) plt.plot(x, p, 'k', linewidth=2) title = "Fit results: mu = %.2f, std = %.2f" % (mu, std) plt.title(title) plt.show()
def visualization(df_train, df_test): print(df_train['SalePrice'].describe()) # We’re going to predict the SalePrice column ($ USD) sns.set(style='whitegrid', palette='muted', font_scale=1.5) rcParams['figure.figsize'] = 14, 8 sns.distplot(df_train['SalePrice'], fit=norm) (mu, sigma) = norm.fit(df_train['SalePrice']) plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc='best') plt.title('Sale Prices') plt.xlabel('Sale Price') plt.ylabel('Probability') # plt.close() plt.show() # Most of the density lies between 100k and 250k, but there appears to be a lot of outliers on the pricier side. # -------------------------------- # # top 10 correlated features with the sale price: corr_matrix = df_train.corr() sns.heatmap(corr_matrix, vmax=.8, square=True) k = 10 # number of variables for heat map cols = corr_matrix.nlargest(k, 'SalePrice')['SalePrice'].index sns.heatmap(df_train[cols].corr().values.T, cbar=True, annot=True, square=True, yticklabels=cols.values, xticklabels=cols.values) # plt.close() plt.show() # Overall Quality vs Sale Price var = 'OverallQual' data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1) data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000), s=32) plt.show() # Living Area vs Sale Price var = 'GrLivArea' data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1) data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000), s=32) plt.show() # It makes sense that people would pay for the more living area. # What doesn't make sense is the two data points in the bottom-right of the plot. # Removing outliers manually (Two points in the bottom right) df_train = df_train.drop(df_train[(df_train['GrLivArea'] > 4000) & (df_train['SalePrice'] < 300000)].index).reset_index(drop=True) # After removing outliers, Living Area vs Sale Price var = 'GrLivArea' data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1) data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000), s=32) plt.show() # GarageCars vs Sale Price var = 'GarageCars' data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1) data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000), s=32) plt.show() # GarageArea vs Sale Price var = 'GarageArea' data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1) data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000), s=32) plt.show() # Up to this point, we were exploring the data # Do we have missing data - train? total = df_train.isnull().sum().sort_values(ascending=False) percent = (df_train.isnull().sum() / df_train.isnull().count()).sort_values(ascending=False) missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent']) missing_data = missing_data[missing_data.Total > 0] print(missing_data) total_test = df_test.isnull().sum().sort_values(ascending=False) percent_test = (df_test.isnull().sum() / df_test.isnull().count()).sort_values(ascending=False) missing_data_test = pd.concat([total_test, percent_test], axis=1, keys=['TotalTest', 'PercentTest']) missing_data_test = missing_data_test[missing_data_test.TotalTest > 0] print(missing_data_test)
def plot_disp(data, true_hadroness=False): """Plot the performance of reconstructed position Parameters: ----------- data: pandas DataFrame true_hadroness: boolean True: True gammas and proton events are plotted (they are separated using true hadroness). False: Gammas and protons are separated using reconstructed hadroness (hadro_rec) """ hadro = "reco_type" if true_hadroness: hadro = "mc_type" gammas = data[data[hadro] == 0] plt.subplot(221) reco_disp_norm = np.sqrt(gammas['reco_disp_dx']**2 + gammas['reco_disp_dy']**2) disp_res = ((gammas['disp_norm'] - reco_disp_norm) / gammas['disp_norm']) section = disp_res[abs(disp_res) < 0.5] mu, sigma = norm.fit(section) print("mu = {}\n sigma = {}".format(mu, sigma)) n, bins, patches = plt.hist( disp_res, bins=100, density=1, alpha=0.75, range=[-2, 1.5], ) y = norm.pdf(bins, mu, sigma) plt.plot(bins, y, 'r--', linewidth=2) plt.xlabel('$\\frac{disp\_norm_{gammas}-disp_{rec}}{disp\_norm_{gammas}}$', fontsize=15) plt.figtext(0.15, 0.7, 'Mean: ' + str(round(mu, 4)), fontsize=12) plt.figtext(0.15, 0.65, 'Std: ' + str(round(sigma, 4)), fontsize=12) plt.subplot(222) hD = plt.hist2d( gammas['disp_norm'], reco_disp_norm, bins=100, range=([0, 1.1], [0, 1.1]), ) plt.colorbar(hD[3]) plt.xlabel('$disp\_norm_{gammas}$', fontsize=15) plt.ylabel('$disp\_norm_{rec}$', fontsize=15) plt.plot(gammas['disp_norm'], gammas['disp_norm'], "-", color='red') plt.subplot(223) theta2 = (gammas['src_x'] - gammas['reco_src_x'])**2 + (gammas['src_y'] - gammas['src_y'])**2 plt.hist(theta2, bins=100, range=[0, 0.1], histtype=u'step') plt.xlabel(r'$\theta^{2}(º)$', fontsize=15) plt.ylabel(r'# of events', fontsize=15)
mu = np.zeros(len(test)) std = np.zeros(len(test)) batman_good = [] #plt.plot(test) #plt.show() ##Write data to file out_file = open('sample_candidates.txt', 'w') line1 = 'sector' + ',' + 'tessFile' + ',' + 'curveID' + ',' + 'correlation' + '\n' good = [] for row in range(len(test)): mu, std = norm.fit(test[row]) good.append(test[row][np.where(test[row] >= mu + 3 * std)]) #print('values: ',test[row][0]) #print('std: ', mu+3*std) #print('index: ',np.where(test[row] >= mu+1*std)[0]) #plt.plot(test[row]) #plt.show() batman_good.append(batman_indices[np.where(test[row] >= mu + 3 * std)]) good = np.asarray(good) batman_good = np.asarray(batman_good) try: for row in range(len(good)): for column in range(len(good[row] - 1)): line = str(sector[row]) + ',' + str(data[row]) + ',' + str(
residuals = gandalfs.zenith - primaries.zenith cut = (gandalfs["lambda"] < l) & (np.abs(residuals) < 2 * np.pi) residuals = residuals[cut] event_info[cut] # convert rad -> deg residuals = residuals * 180 / np.pi pi = 180 # x axis for plotting x = np.linspace(-pi, pi, 1000) c_loc, c_gamma = cauchy.fit(residuals) fwhm = 2 * c_gamma g_mu_bad, g_sigma_bad = norm.fit(residuals) g_mu, g_sigma = norm.fit(residuals[np.abs(residuals) < 10]) plt.hist(residuals, bins="auto", label="Histogram", density=True, alpha=0.7) plt.plot( x, cauchy(c_loc, c_gamma).pdf(x), label="Lorentz: FWHM $=${:.3f}".format(fwhm), linewidth=2, ) plt.plot( x, norm(g_mu_bad, g_sigma_bad).pdf(x), label="Unrestricted Gauss: $\sigma =$ {:.3f}".format(g_sigma_bad), linewidth=2, )
plt.xlabel('index') plt.ylabel('Tempo de Permanência') plt.title("Tempo de Permanência - Distribution") plt.show(); #Target Variable Analysis from scipy import stats from scipy.stats import norm, skew import seaborn as sns import matplotlib.pyplot as plt (mu, sigma) = norm.fit(df_maio_19_reg['TEMPO_PERM_INT_POSTERIOR']) plt.figure(figsize = (14, 7)) sns.distplot(df_maio_19_reg['TEMPO_PERM_INT_POSTERIOR'], fit = norm) plt.ylabel('Frequency') plt.title('Tempo de Permanência - Distribution') plt.legend(['Normal Dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc = 'best') quantile_plot = stats.probplot(df_maio_19_reg['TEMPO_PERM_INT_POSTERIOR'], plot = plt) import numpy as np df_maio_19_reg['TEMPO_PERM_INT_POSTERIOR'] = np.log1p(df_maio_19_reg['TEMPO_PERM_INT_POSTERIOR']) (mu, sigma) = norm.fit(df_maio_19_reg['TEMPO_PERM_INT_POSTERIOR']) plt.figure(figsize = (14, 7))
import numpy as np
from scipy.stats import norm, multinomial

original_data = norm.rvs(loc=1.0, scale=0.5, size=1000, random_state=1386)
original_data[:20]

# In[ ]:

# Now replace every other element with the mean 1.0
missing_elements = np.asarray([0, 1] * 500)
updated_data = original_data * (1 - missing_elements) + missing_elements
updated_data[:20]

# In[ ]:

# Now, let's get mean and std of the new distribution:
mean, std = norm.fit(updated_data)
print(f'Mean: {mean}, std: {std}')

# As you see, even though the mean is the same, the standard deviation is much
# smaller. While imputing data this way can increase model performance, it also
# amplifies the bias that already exists in the data. To prevent that
# amplification, we have to replace the missing values with samples drawn from
# a normal distribution with the same mean and standard deviation. For
# categorical features it would be a multinomial distribution.
#
# For debiasing we can try to increase the standard deviation of the
# distribution from which we sample data for numerical features, and apply a
# similar transformation to the multinomial distribution.
#
# In this notebook I suggest two classes, for the numerical and categorical
# features respectively.

# ## Proposed solution ##

# In[ ]:

from sklearn.base import BaseEstimator, TransformerMixin
import numpy.ma as ma
from sklearn.utils.validation import check_is_fitted
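# In[ ]:

# Added sketch (not from the original notebook): the sampling-based replacement
# described above, reusing `original_data`, `updated_data` and the
# `missing_elements` mask from the earlier cells. Missing slots get draws from
# a normal distribution fitted to the observed values only, so the refitted
# standard deviation is no longer shrunk by the imputation.
rng = np.random.default_rng(1386)
observed = original_data[missing_elements == 0]
mu_obs, std_obs = norm.fit(observed)

resampled = updated_data.copy()
resampled[missing_elements == 1] = rng.normal(mu_obs, std_obs, size=int(missing_elements.sum()))
print(norm.fit(resampled))  # std is now close to that of the observed data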
dt = dateutil.parser.parse(row['time']).astimezone( timezone(timedelta(hours=9))) all_conferences[0].append(dt) conferences[0].append(dt) all_participants[0].append(dt) participants[0].append(dt) all_conferences[1][j].append(int(row['conferences'])) all_participants[1][j].append(int(row['participants'])) j += 1 print("Cleaning time: %d seconds" % (time.time() - curr_time)) curr_time = time.time() for d in all_conferences[1]: conf_norm_dist_funcs.append(norm.fit(d)) print("First Normal Distribution Functions Fitting time: %d seconds" % (time.time() - curr_time)) curr_time = time.time() for d in all_participants[1]: part_norm_dist_funcs.append(norm.fit(d)) print("Second Normal Distribution Functions Fitting time: %d seconds" % (time.time() - curr_time)) curr_time = time.time() for loc, scale in conf_norm_dist_funcs: conferences[1].append(norm.rvs(loc=loc, scale=scale, random_state=8192)) conferences[1] = np.clip(conferences[1], 0, None) conferences[1] = savgol_filter(conferences[1], 91, 1) conferences[1] = np.around(conferences[1])
def absSDM(obs, mod, sce, cdf_threshold=0.9999999): '''absolute scaled distribution mapping assuming a normal distributed parameter rewritten from pyCAT for 1D data obs :: observed variable time series mod :: modelled variable for same time series as obs sce :: to unbias modelled time series cdf_threshold :: upper and lower threshold of CDF returns corrected timeseries tested with pandas series. ''' obs_len = len(obs) mod_len = len(mod) sce_len = len(sce) obs_mean = np.mean(obs) mod_mean = np.mean(mod) smean = np.mean(sce) odetrend = detrend(obs) mdetrend = detrend(mod) sdetrend = detrend(sce) obs_norm = norm.fit(odetrend) mod_norm = norm.fit(mdetrend) sce_norm = norm.fit(sdetrend) sce_diff = sce - sdetrend sce_argsort = np.argsort(sdetrend) obs_cdf = norm.cdf(np.sort(odetrend), *obs_norm) mod_cdf = norm.cdf(np.sort(mdetrend), *mod_norm) sce_cdf = norm.cdf(np.sort(sdetrend), *sce_norm) obs_cdf = np.maximum(np.minimum(obs_cdf, cdf_threshold), 1 - cdf_threshold) mod_cdf = np.maximum(np.minimum(mod_cdf, cdf_threshold), 1 - cdf_threshold) sce_cdf = np.maximum(np.minimum(sce_cdf, cdf_threshold), 1 - cdf_threshold) # interpolate cdf-values for obs and mod to the length of the scenario obs_cdf_intpol = np.interp(np.linspace(1, obs_len, sce_len), np.linspace(1, obs_len, obs_len), obs_cdf) mod_cdf_intpol = np.interp(np.linspace(1, mod_len, sce_len), np.linspace(1, mod_len, mod_len), mod_cdf) # adapt the observation cdfs # split the tails of the cdfs around the center obs_cdf_shift = obs_cdf_intpol - .5 mod_cdf_shift = mod_cdf_intpol - .5 sce_cdf_shift = sce_cdf - .5 obs_inverse = 1. / (.5 - np.abs(obs_cdf_shift)) mod_inverse = 1. / (.5 - np.abs(mod_cdf_shift)) sce_inverse = 1. / (.5 - np.abs(sce_cdf_shift)) adapted_cdf = np.sign(obs_cdf_shift) * ( 1. - 1. / (obs_inverse * sce_inverse / mod_inverse)) adapted_cdf[adapted_cdf < 0] += 1. adapted_cdf = np.maximum(np.minimum(adapted_cdf, cdf_threshold), 1 - cdf_threshold) xvals = norm.ppf(np.sort(adapted_cdf), *obs_norm) \ + obs_norm[-1] / mod_norm[-1] \ * (norm.ppf(sce_cdf, *sce_norm) - norm.ppf(sce_cdf, *mod_norm)) xvals -= xvals.mean() xvals += obs_mean + (smean - mod_mean) correction = np.zeros(sce_len) correction[sce_argsort] = xvals correction += sce_diff - smean return correction
def __init__(self, N=None, size=1, mu0=0.1, sigma_mean0=10, sigma_std0=1.0, sigma_min=0.1, sigma_max=10, data=None): self.N = N self.K = size # Parameter initialization #random init if data is None: # mu = random normal with std mu0,mean 0 self.mu = mu0 * np.random.randn(self.N, self.K).astype(DTYPE) # Sigma = random normal with mean sigma_mean0, std sigma_std0, and min/max of sigma_min, sigma_max self.Sigma = np.random.randn(self.N, 1).astype(DTYPE) self.Sigma *= sigma_std0 self.Sigma += sigma_mean0 self.Sigma = np.maximum(sigma_min, np.minimum(self.Sigma, sigma_max)) self.Gaussian = np.concatenate((self.mu, self.Sigma), axis=1) # TensorVariables for mi, mj, si, sj respectivelly. a, b = T.fvectors('a', 'b') c, d = T.fscalars('c', 'd') # Energy as a TensorVariable E = -0.5 * (self.K * d / c + T.sum( (a - b)**2 / c) - self.K - self.K * T.log(d / c)) self.enrg = function([a, b, c, d], E) g1 = T.grad(E, a) # dE/dmi self.f1 = function([a, b, c, d], g1) g2 = T.grad(E, b) # dE/dmj self.f2 = function([a, b, c, d], g2) g3 = T.grad(E, c) # dE/dsi self.f3 = function([a, b, c, d], g3) g4 = T.grad(E, d) # dE/dsj self.f4 = function([a, b, c, d], g4) #non random init else: self.mu = [] self.Sigma = [] for i in range(len(data)): mu, std = norm.fit(data[i]) var = np.power(std, 2) self.mu.append(mu) self.Sigma.append(var) self.Gaussian = np.concatenate( (np.asarray(self.mu), np.asarray(self.Sigma)), axis=1) self.Gaussian = np.reshape(self.Gaussian, (2, N)).T
import logging

import numpy as np
from scipy.stats import chisquare, norm


def create_1d_hist(fig, ax, hist, title=None, x_axis_title=None, y_axis_title=None, bins=101, x_min=None, x_max=None):
    if x_min is None:
        x_min = 0.0
    if x_max is None:
        if hist.all() is np.ma.masked:  # check if masked array is fully masked
            x_max = 1.0
        else:
            x_max = hist.max()
    hist_bins = int(x_max - x_min) + 1 if bins is None else bins
    if hist_bins > 1:
        bin_width = (x_max - x_min) / (hist_bins - 1)
    else:
        bin_width = 1.0
    hist_range = (x_min - bin_width / 2, x_max + bin_width / 2)

    masked_hist_compressed = np.ma.masked_invalid(np.ma.masked_array(hist)).compressed()
    if masked_hist_compressed.size == 0:
        ax.plot([])
    else:
        # re-bin to 1d histogram, x argument needs to be 1D
        # BUG: np.ma.compressed(np.ma.masked_array(hist, copy=True)) (2D) is not equal to np.ma.masked_array(hist, copy=True).compressed() (1D) if hist is ndarray
        _, _, _ = ax.hist(x=masked_hist_compressed, bins=hist_bins, range=hist_range, align='mid')
    ax.set_xlim(hist_range)  # overwrite xlim
    if hist.all() is np.ma.masked:  # or np.allclose(hist, 0.0):
        ax.set_ylim((0, 1))
        ax.set_xlim((-0.5, +0.5))
    elif masked_hist_compressed.size == 0:  # or np.allclose(hist, 0.0):
        ax.set_ylim((0, 1))

    if title is not None:
        ax.set_title(title)
    if x_axis_title is not None:
        ax.set_xlabel(x_axis_title)
    if y_axis_title is not None:
        ax.set_ylabel(y_axis_title)

    # Gaussian and chi-square helpers (defined but unused in this code path)
    def gauss(x, *p):
        amplitude, mu, sigma = p
        return amplitude * np.exp(-(x - mu)**2.0 / (2.0 * sigma**2.0))

    def chi_square(observed_values, expected_values):
        return (chisquare(observed_values, f_exp=expected_values))[0]

    # fit a normal distribution to the unmasked entries and overlay its scaled pdf
    xmin, xmax = ax.get_xlim()
    points = np.linspace(xmin, xmax, 500)
    param = norm.fit(masked_hist_compressed)
    pdf_fitted = norm.pdf(points, loc=param[0], scale=param[1]) * (len(masked_hist_compressed) * bin_width)
    ax.plot(points, pdf_fitted, "r--", label='Normal distribution')

    try:
        median = np.median(masked_hist_compressed)
    except IndexError:
        logging.warning('Cannot create 1D histogram named %s', title)
        return
    ax.axvline(x=median, color="g")

    textleft = '$\Sigma=%d$\n$\mathrm{mean\,\mu=%.2f}$\n$\mathrm{std\,\sigma=%.2f}$\n$\mathrm{median=%.2f}$' % (
        len(masked_hist_compressed), param[0], param[1], median)
    props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
    ax.text(0.05, 0.9, textleft, transform=ax.transAxes, fontsize=8, verticalalignment='top', bbox=props)
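A minimal usage sketch for create_1d_hist (the call pattern is assumed; data and labels are made up for illustration):

import matplotlib.pyplot as plt
import numpy as np

values = np.random.normal(50.0, 5.0, size=10000)  # illustrative 1D data
fig, ax = plt.subplots()
create_1d_hist(fig, ax, np.ma.masked_array(values), title='Example distribution',
               x_axis_title='value', y_axis_title='#', bins=101)
plt.show()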
# plt.ylabel('SalePrice', fontsize=13)
# plt.xlabel('GrLivArea', fontsize=13)
# plt.show()

# drop the two outliers with large living area but low sale price
train = train.drop(train[(train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)].index)

# fig, ax = plt.subplots()
# ax.scatter(train['GrLivArea'], train['SalePrice'])
# plt.ylabel('SalePrice', fontsize=13)
# plt.xlabel('GrLivArea', fontsize=13)
# plt.show()

sns.distplot(train['SalePrice'], fit=norm)
(mu, sigma) = norm.fit(train['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc='best')
# plt.ylabel('Frequency')
# plt.title('SalePrice distribution')
# fig = plt.figure()
# res = stats.probplot(train['SalePrice'], plot=plt)
# plt.show()

# log-transform the target to make it more normally distributed
train["SalePrice"] = np.log1p(train["SalePrice"])
sns.distplot(train['SalePrice'], fit=norm)
(mu, sigma) = norm.fit(train['SalePrice'])
def bootstrap_context(Unit, eval_method='fr1', shuffle_num=10000, isfig=False):
    init_t = 0
    spkt_ob = []
    tstart = []
    tend = []
    dur = []
    ttemp = 0
    # create a long spiketrain containing only spikes from context
    for i, record_end in enumerate(Unit.marker.record[1]):
        spkctx = Unit.spktrain[(Unit.spktrain >= Unit.marker.door[0][i]) &
                               (Unit.spktrain <= record_end)] - Unit.marker.door[0][i]
        spkt_ob.append(spkctx + init_t)
        dur.append((record_end - Unit.marker.door[0][i]))
        init_t += dur[i]
        tstart.append(ttemp)
        tend.append(ttemp + dur[i])
        ttemp += dur[i]
    spkt_observed = np.concatenate(spkt_ob)

    # keep the ISIs the same but shuffled
    ISI = np.insert(np.diff(spkt_observed), 0, Unit.spktrain[0])

    # create pseudo spiketrains
    spk_shuffle = []
    for i in range(shuffle_num):
        spk_new = []
        currentspk = 0
        new_ISI = np.random.permutation(ISI)
        for isi in new_ISI:
            spk_new.append(currentspk + isi)
            currentspk += isi
        spk_shuffle.append(np.array(spk_new))

    # 1. compare the cdi between observed and shuffled (not a good measure)
    if eval_method == 'cdi':
        thres = 2.17  # z threshold; P(|Z| < 2.17) is roughly 0.97
        cdi_observed = cal_ctx_cdi(spkt_observed, Unit, tstart, tend, dur)[1]
        cdi_shuffle = []
        for spk_s in spk_shuffle:
            cdi_shuffle.append(cal_ctx_cdi(spk_s, Unit, tstart, tend, dur)[1])
        mu, sigma = norm.fit(cdi_shuffle)
        CI1 = thres * sigma + mu
        CI2 = mu - thres * sigma
        if cdi_observed > CI1:
            cell_identity = 'A'
        elif cdi_observed < CI2:
            cell_identity = 'B'
        else:
            cell_identity = 'others'
        if isfig:
            plt.figure()
            n, bins, patches = plt.hist(cdi_shuffle, bins=100)
            plt.axvline(cdi_observed, color='g')
            CI = thres * sigma + mu
            plt.axvline(CI, color='r')
            plt.show()
        return cell_identity

    # 2. compare the firing rate of each trial to its shuffled results
    if eval_method == 'fr1':
        thres = 1.96  # 95% two-sided z critical value
        ctx_ob = cal_ctx_cdi(spkt_observed, Unit, tstart, tend, dur)[0]
        ctx_num = len(np.unique(Unit.marker.protocol))
        trl_num = np.unique(Unit.marker.protocol, return_counts=True)[1][0]
        ctx_shuffle_fr = np.zeros((ctx_num, trl_num, shuffle_num))
        for s, spk_s in enumerate(spk_shuffle):
            ctx_temp = cal_ctx_cdi(spk_s, Unit, tstart, tend, dur)[0]
            for c in range(ctx_num):
                for t in range(trl_num):
                    ctx_shuffle_fr[c][t][s] = ctx_temp[c]['fr'][t]

        # plot the shuffled distributions
        if isfig:
            f, ax = plt.subplots(ctx_num, trl_num, sharey=True, sharex=True)
            for c in range(ctx_num):
                for t in range(trl_num):
                    n, bins, patches = ax[c, t].hist(ctx_shuffle_fr[c][t], 60,
                                                     density=True, alpha=0.75)
                    ax[c, t].axvline(ctx_ob[c]['fr'][t], color='g')
                    # overlay the fitted normal and the confidence threshold
                    mu, sigma = norm.fit(ctx_shuffle_fr[c][t])
                    y = norm.pdf(bins, mu, sigma)
                    ax[c, t].plot(bins, y, 'y--', linewidth=2)
                    CI = thres * sigma + mu
                    ax[c, t].axvline(CI, color='r')

        # show the context preference of each trial
        ctx_pref = np.zeros((ctx_num, trl_num))
        for c in range(ctx_num):
            for t in range(trl_num):
                mu, sigma = norm.fit(ctx_shuffle_fr[c][t])
                zobserved = (ctx_ob[c]['fr'][t] - mu) / sigma
                if abs(zobserved) < thres:
                    ctx_pref[c][t] = 0
                elif zobserved > thres:
                    ctx_pref[c][t] = 1
                elif zobserved < -thres:
                    ctx_pref[c][t] = -1

        # decide which context this unit prefers
        if np.sum(ctx_pref[0] == 1) >= 2:
            cell_identity = ctx_ob[0]['name']
        elif np.sum(ctx_pref[1] == 1) >= 2:
            cell_identity = ctx_ob[1]['name']
        else:
            cell_identity = 'others'
        print('This unit prefers ' + cell_identity)
        return cell_identity, ctx_pref
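The core of the test above can be sketched independently of the Unit object (this is an illustrative reconstruction, not the original code): shuffle inter-spike intervals to build a null distribution of firing rates, fit a normal to it, and z-score the observed rate. All values below are made up:

import numpy as np
from scipy.stats import norm

rng = np.random.default_rng(1)
spikes = np.sort(rng.uniform(0, 100.0, 250))   # illustrative spike times (s)
isi = np.diff(spikes, prepend=0.0)             # inter-spike intervals
window = (20.0, 40.0)                          # hypothetical context window
width = window[1] - window[0]

observed_rate = np.sum((spikes >= window[0]) & (spikes < window[1])) / width

null_rates = []
for _ in range(1000):
    shuffled = np.cumsum(rng.permutation(isi))  # pseudo spiketrain with the same ISIs
    null_rates.append(np.sum((shuffled >= window[0]) & (shuffled < window[1])) / width)

mu, sigma = norm.fit(null_rates)
z = (observed_rate - mu) / sigma
print('significant' if abs(z) > 1.96 else 'not significant')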
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt

# Load file with two columns of data
data = np.loadtxt("10000phidiffp01.txt", dtype=float, delimiter='\t', usecols=range(2))

fig, axs = plt.subplots(1, 2)  # configure side-by-side plots
hrange = 0.02  # range of the plot
bins = np.linspace(-hrange, hrange, 100)  # number of bins and interval where they are defined

mu0, std0 = norm.fit(data[:, 0])  # normal (Gaussian) distribution fit
axs[0].hist(data[:, 0], bins, density=True)  # histogram plot
axs[0].set_title(
    "$\sigma_{\phi} = 0.01$ with fitting $\mu=%.5s$, $\sigma_{gaus}=%.5s$"
    % (mu0, std0))  # title of the subplot
axs[0].set(xlabel="$(\phi_{Kalman}-\phi_{real})$",
           ylabel="Distribution of tracks")  # axis labels
p = norm.pdf(bins, mu0, std0)  # fitted Gaussian
axs[0].plot(bins, p, 'k', linewidth=2)  # Gaussian plot

mu1, std1 = norm.fit(data[:, 1])
axs[1].hist(data[:, 1], bins, density=True)
axs[1].set_title(
    "$\sigma_{\phi} = 0.01$ with fitting $\mu=%.5s$, $\sigma_{gaus}=%.5s$"
    % (mu1, std1))
axs[1].set(xlabel="$(\phi_{Kalman}-\phi_{real})/\phi_{real}$",
           ylabel="Distribution of tracks")
from astropy.io import fits
from scipy.stats import norm
import matplotlib.pyplot as plt

# open file and get data
hdulist = fits.open("/Users/aliyah/Downloads/A1_mosaic.fits")
hdulist.info()
# hdulist[0].header()
image_data = hdulist[0].data
hdulist.close()

x_values = image_data
# print(image_data[:, 1])
x_values = x_values[x_values <= 3600]
x_value = x_values[x_values <= 3450]
xv = x_value[x_value >= 3390]

n, bins, patches = plt.hist(x_values, bins=3600)
mu, sigma = norm.fit(xv)
y = norm.pdf(bins, mu, sigma)
plt.figure(1)
plt.plot(bins, 10500000 * y)
plt.xlim([3300, 3600])
plt.show()
print(mu, sigma)  # mean background count

plt.figure(2)
plt.imshow(image_data, cmap='gray')
plt.colorbar()
        # tail of policy_iteration(); the surrounding loop and convergence test sit above this excerpt
        plt.xlabel('Steps')
        plt.ylabel('time(seconds)')
        print(score)
        print('Policy-Iteration converged at step %d.' % (i + 1))
        break
    policy = new_policy
    return s


if __name__ == '__main__':
    env_name = 'FrozenLake-v0'
    env = gym.make(env_name)
    # optimal_policy = policy_iteration(env, gamma=g)
    # print(optimal_policy)
    # env.render()
    # scores = evaluate_policy(env, optimal_policy, gamma=g)
    # print('Average scores = ', np.mean(scores))
    s = []
    for i in range(100):
        steps = policy_iteration(env, gamma=g)
        s.append(steps)
    s = np.array(s)
    (mu, sigma) = norm.fit(s)
    n, bins, patches = plt.hist(s, 60, normed=1, facecolor='green', alpha=0.75)  # use density=True on matplotlib >= 3
    plt.xlabel('Steps')
    plt.ylabel('Probability')
    plt.title(r'$\mathrm{Histogram\ of\ steps:}\ \mu=%.3f,\ \sigma=%.3f$' % (mu, sigma))
    plt.grid(True)
    plt.show()
# check again the data size after dropping the 'Id' variable
print("\nThe train data size after dropping Id feature is : {} ".format(train.shape))
print("The test data size after dropping Id feature is : {} ".format(test.shape))

# Data preprocessing
fig, ax = plt.subplots()
ax.scatter(x=train['growth_rate'], y=train['Attrition_rate'])
plt.ylabel('Attrition_rate', fontsize=13)
plt.xlabel('growth_rate', fontsize=13)
plt.show()

sns.distplot(train['Attrition_rate'], fit=norm)

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['Attrition_rate'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Now plot the distribution
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc='best')
plt.ylabel('Frequency')
plt.title('Attrition_rate distribution')

# Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(train['Attrition_rate'], plot=plt)
plt.show()

print("The skewness of Attrition_rate is {}".format(train['Attrition_rate'].skew()))
Ntime = 100  # number of timesteps
h = 1        # step length

part_pos_list = np.zeros(N, dtype=np.int)  # N particles, all starting at x = 0 (N is defined earlier)

# random walk in 1D
for t in range(Ntime):
    r_list = np.random.random(N)  # list with N random numbers between 0 and 1
    for i in range(N):
        if r_list[i] >= 0.5:
            part_pos_list[i] += h  # one step to the right
        else:
            part_pos_list[i] -= h  # one step to the left

# fit a normal distribution to part_pos_list: mu is the mean, sigma the standard deviation
mu, sigma = norm.fit(part_pos_list)
print("mu =", mu, "sigma =", sigma)

# pre-plotting
xMax = np.max(np.abs(part_pos_list))  # maximum absolute x position value
xRange = (-xMax * 1.1, xMax * 1.1)    # range for the plot
xAx = np.linspace(*xRange, 1000)      # list of x values for the normal distribution
p = norm.pdf(xAx, mu, sigma)          # normal distribution

# plotting
savename = "RandomWalkIn1D"
fig, ax = plt.subplots(1, 1, num=savename)

# new axis for p
ax2 = ax.twinx()
# set ax's patch invisible
ax.patch.set_visible(False)
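As a sanity check (not in the original script): for an unbiased walk of Ntime independent steps of size h, the position variance is Ntime * h**2, so the fitted sigma should land near sqrt(Ntime) * h = 10 here and mu near 0:

import numpy as np

expected_sigma = h * np.sqrt(Ntime)  # theoretical std of the walk, = 10 for Ntime=100, h=1
print("fitted sigma =", sigma, "expected ~", expected_sigma)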
def make_qoi_plots(data_directory, plot_directory, config=None, data_type='kde', iterations='all'):
    data_types = ['kde', 'results']

    assert isinstance(config, str) \
        or isinstance(config, PyposmatConfigurationFile) \
        or config is None
    assert os.path.isdir(data_directory)
    assert isinstance(plot_directory, str)
    assert data_type in data_types

    if not os.path.exists(plot_directory):
        os.mkdir(plot_directory)

    # process config argument
    if isinstance(config, str):
        o_config = PyposmatConfigurationFile()
        o_config.read(filename=config)
    elif isinstance(config, PyposmatConfigurationFile):
        o_config = config  # use the configuration object that was passed in
    elif config is None:
        o_config = PyposmatConfigurationFile()
        o_config.read(filename=os.path.join(data_directory, 'pyposmat.config.in'))
    else:
        m = 'config argument must either be a path string or a PyposmatConfigurationFile object'
        raise TypeError(m)

    if iterations == 'all':
        iterations = range(o_config.n_iterations)

    if data_type == 'kde':
        datas = [
            os.path.join(data_directory, 'pyposmat.kde.{}.out'.format(i + 1))
            for i in iterations
        ]
    elif data_type == 'results':
        datas = [
            os.path.join(data_directory, 'pyposmat.results.{}.out'.format(i))
            for i in iterations
        ]
    else:
        raise TypeError()

    plot_fns = []
    for qn in o_config.qoi_names:
        print('qoi_name:{}'.format(qn))
        plot_fn = os.path.join(plot_directory, '{}.eps'.format(qn.replace('.', '_')))
        plot_fns.append(plot_fn)

        xlabel = qn
        ylabel = 'probability density'

        o_plot = PyposmatQoiPlot(config=o_config)

        print('\tdetermining x_lims')
        x_min = None
        x_max = None
        for data_fn in datas:
            x_pctl_min = 0.15
            x_pctl_max = 1. - x_pctl_min

            o_data = PyposmatDataFile()
            o_data.read(filename=data_fn)

            from scipy.stats import norm
            mu, std = norm.fit(o_data.df[qn])
            norm_rv = norm(loc=mu, scale=std)

            if x_min is None:
                x_min = norm_rv.ppf(x_pctl_min)
            else:
                x_min = min(norm_rv.ppf(x_pctl_min), x_min)
            if x_max is None:
                x_max = norm_rv.ppf(x_pctl_max)
            else:
                x_max = max(norm_rv.ppf(x_pctl_max), x_max)

        for i, data_fn in enumerate(datas):
            print('\t{}'.format(data_fn))
            o_data = PyposmatDataFile()
            o_data.read(filename=data_fn)

            label = 'i={}'.format(iterations[i] + 1)
            o_plot.initialize_data(data=o_data)
            o_plot.add_qoi_plot(qoi_name=qn,
                                x_limits=[x_min, x_max],
                                label=label,
                                color=plt.cm.cool(i / len(datas)))

        o_plot.add_qoitarget(qoi_name=qn)
        o_plot.ax.set_xlim(x_min, x_max)
        o_plot.legend()
        o_plot.ax.set_xlabel(xlabel)
        o_plot.ax.set_ylabel(ylabel)
        o_plot.ax.ticklabel_format(axis='both', style='sci', scilimits=(0, 4))
        o_plot.savefig(filename=plot_fn, dpi=1300)

    return plot_fns
          'Price/Cash flow', 'Dividend Payout Ratio', 'Net Profit Margin', 'Gross Profit Margin', \
          'Cash Flow Margin', 'Return on Assets', 'Return on Equity', 'Return on Capital Employed', \
          'Gross Profit/Total Assets', 'Total Debt/Invested Capital', 'Inventory/Current Assets', \
          'Total Debt/Total Assets', 'Cash Ratio', 'Quick Ratio (Acid Test)', 'Current Ratio', 'Inventory Turnover', \
          'Asset Turnover', 'Price/Book', 'Dividend Yield', 'Volume Change (3mo)', \
          'Change in Shares Outstanding (3mo)', 'Total Volatility']
X = X[fields]
"""

yT = df.iloc[:, -1]
n = len(yT)
y = pd.Series([])
k = 523
inf = 100
for i in range(0, int(n / k)):
    vals = yT[i * k:(i + 1) * k]
    mu, std = norm.fit(vals)
    # bins = [-inf, mu - 2*std, mu + 2*std, inf]
    # bins = [-inf, mu - 2 * std, mu - std, mu, mu + std, mu + 2 * std, inf]
    # yCat = pd.cut(vals, bins=bins, labels=False)
    yCat = pd.qcut(vals, 4, labels=False)
    y = pd.concat([y, yCat])

X_trainDev, X_test, y_trainDev, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_dev, y_train, y_dev = train_test_split(X_trainDev, y_trainDev, test_size=0.25, random_state=1)
# print(N_Sigma_i_med)
# print(np.amin(N_Sigma_i))
# print(np.amax(N_Sigma_i))

################ Histogram 1
bins_wm = np.arange(np.amin(N_Sigma_i_wm) - 0.5, np.amax(N_Sigma_i_wm) + 0.5, 0.5)
bins2_wm = np.arange(np.amin(N_Sigma_i_wm) - 0.5, np.amax(N_Sigma_i_wm) + 0.5, 2)
# print(bins)
plt.hist(N_Sigma_i_wm, bins=bins_wm, alpha=0.4, histtype='stepfilled',
         normed=True, edgecolor='black', linewidth=0.8)
plt.xlabel("$N_σ$")
plt.ylabel("probability density")

############## fitting a gaussian to the above histogram
parameters = norm.fit(N_Sigma_i_wm)
pdf_x_wm = np.linspace(np.amin(N_Sigma_i_wm) - 0.5, np.amax(N_Sigma_i_wm) + 0.5, 500)
fitted_pdf_wm = norm.pdf(pdf_x_wm, loc=parameters[0], scale=parameters[1])
plt.plot(pdf_x_wm, fitted_pdf_wm, "black", linestyle="dashed", linewidth=1.5)
plt.legend()
plt.show()

############### histogram for median
bins_med = np.arange(np.amin(N_Sigma_i_med) - 0.5, np.amax(N_Sigma_i_med) + 0.5, 0.5)
bins2_med = np.arange(np.amin(N_Sigma_i_med) - 0.5, np.amax(N_Sigma_i_med) + 0.5, 2)
# print(bins)
binned_array, b, c = plt.hist(N_Sigma_i_med, bins=bins_med, alpha=0.4, histtype='stepfilled',
                              normed=True, edgecolor='black', linewidth=0.8)
plt.xlabel("$N_σ$")
plt.ylabel("probability density")