def calculateP(variables, k, data, WINDOW_LEN):
    """Return the p-value comparing category counts of two adjacent windows.

    The reference window is ``data[k:k + WINDOW_LEN]`` and the test window is
    the ``WINDOW_LEN`` items that follow it.  For every entry of ``variables``
    its number of occurrences in each window is obtained with ``.count``.

    A one-way ``chisquare`` (test counts against reference counts) is used
    when there are exactly two categories, or when either window contains a
    category with a zero count; otherwise ``chi2_contingency`` is applied to
    the two rows of counts.
    """
    reference = data[k:k + WINDOW_LEN]
    current = data[k + WINDOW_LEN:k + 2 * WINDOW_LEN]
    freq_old = np.array([reference.count(v) for v in variables], dtype=float)
    freq = np.array([current.count(v) for v in variables], dtype=float)

    # Author's note kept from the original: the exact binomial goodness-of-fit
    # test (binom_test) was tried for the two-category case and gave the same
    # results as the chi-square test.
    two_categories = len(variables) == 2
    if two_categories or (freq == 0).any() or (freq_old == 0).any():
        result = chisquare(freq, freq_old)
    else:
        result = chi2_contingency([freq, freq_old], correction=True)
    return result[1]
def min_rotation(x):
    """Objective function: chi-square of the DIFC values after a rotation.

    ``x`` is indexed as ``[Angle, X, Y, Z]``.  The component
    ``b + "/sixteenpack"`` of ``offset`` is rotated absolutely
    (``RelativeRotation=False``), DIFC is recalculated and the masked slice
    ``[firstDet:lastDet + 1]`` is compared with ``odm``.

    ``offset``, ``b``, ``odm``, ``firstDet``, ``lastDet`` and ``mask_array``
    are names defined outside this function.

    Returns the chi-square statistic (first element of the ``chisquare``
    result).
    """
    print(x)
    RotateInstrumentComponent(offset, b + "/sixteenpack", X=x[1], Y=x[2], Z=x[3],
                              Angle=x[0], RelativeRotation=False)
    difc = CalculateDIFC(offset)
    dm_new = ma.masked_array(difc.extractY().flatten()[firstDet:lastDet + 1],
                             mask=mask_array)
    # Evaluate the test once and reuse it for both the log line and the result.
    result = chisquare(f_obs=odm, f_exp=dm_new)
    print(result)
    return result[0]
def min_position(x):
    """Objective function: chi-square of the DIFC values after a move.

    ``x`` is indexed as ``[X, Y, Z]``.  Component ``b`` of ``offset`` is moved
    to that absolute position (``RelativePosition=False``), DIFC is
    recalculated and the masked slice ``[firstDet:lastDet + 1]`` is compared
    with ``odm``.

    ``offset``, ``b``, ``odm``, ``firstDet``, ``lastDet`` and ``mask_array``
    are names defined outside this function.

    Returns the chi-square statistic (first element of the ``chisquare``
    result).
    """
    print(x)
    MoveInstrumentComponent(offset, b, X=x[0], Y=x[1], Z=x[2],
                            RelativePosition=False)
    difc = CalculateDIFC(offset)
    dm_new = ma.masked_array(difc.extractY().flatten()[firstDet:lastDet + 1],
                             mask=mask_array)
    # Evaluate the test once and reuse it for both the log line and the result.
    result = chisquare(f_obs=odm, f_exp=dm_new)
    print(result)
    return result[0]
def two_c(N, nbins, mass1, mass2):
    """2c) Plot a histogram of the last ``N`` timesteps against MB curves.

    The last ``N`` entries of ``v_all`` are histogrammed into ``nbins`` bins
    (normalised to a density) and plotted together with ``MB_v(v, mass1)`` and
    ``MB_v(v, mass2)``.  The chi-square statistic of the histogram against
    each curve, evaluated at the bin centres, is written onto the plot, which
    is then saved as ``2c_two_masses_<numTsteps>_steps``.

    Only runs when the module-level flag ``twomasses`` is true; otherwise a
    message is printed.  Returns None.  Works best with ~(30, 5).
    """
    if not twomasses:
        print("must run with 2 masses")
        return None

    vend_ave = v_all[-N:]
    # ``normed`` was removed from np.histogram; ``density`` is the replacement.
    h, bins = np.histogram(vend_ave, bins=nbins, density=True)
    center = (bins[:-1] + bins[1:]) / 2
    lab1 = "ave over last {} tsteps".format(N)
    pl.plot(center, h, "black", label=lab1)

    # plot MB on top
    pl.plot(v, MB_v(v, mass1), "green", label="MB(v,mass1)")
    pl.plot(v, MB_v(v, mass2), "red", label="MB(v,mass2)")

    # built in chisquare
    a = chisquare(h, MB_v(center, mass1))
    b = chisquare(h, MB_v(center, mass2))
    pl.text(1.5, 0.6, "chi2_m1={:0.2}\nchi2_m2={:0.2}".format(a[0], b[0]))

    # makeplot
    pl.legend()
    pl.title("2c: two masses data v MB afer {0} steps".format(numTsteps))
    fname = "2c_two_masses_{}_steps".format(numTsteps)
    pl.savefig(fname)
    return None
def EMR_Chi2_Test(self, data):
    """Fit straight lines to the x-z and y-z space points and chi-square them.

    Collects ``x_pos``/``y_pos``/``z_pos`` from every track's
    ``space_points``, fits ``x(z)`` and ``y(z)`` with a first-degree
    ``np.polyfit`` and compares the measured positions with the fitted ones
    through ``chisquare``.

    Returns ``[passed, x_chi, y_chi, tot_chi]``.  ``passed`` is True only when
    the combined statistic is below 10.  When six or fewer points were
    collected in either projection, the three values are the ``-10``
    placeholders and ``passed`` is False.

    NOTE(review): the nesting below was reconstructed from a source whose
    indentation was lost; the fit is assumed to run once, after all tracks
    have been scanned -- confirm against the original file.
    """
    xz_array = [[],[]]; yz_array = [[],[]]
    x_chi = -10; y_chi = -10; tot_chi = -10
    for track in data:
        if len(track["space_points"]) > 0:
            for sp in range(len(track["space_points"])):
                space = track["space_points"][sp]
                # A point is kept only if neither of its coordinates has been
                # stored before in the respective projection.
                if not space["x_pos"] in xz_array[0] and not space["z_pos"] in xz_array[1]:
                    xz_array[0].append(space["x_pos"])
                    xz_array[1].append(space["z_pos"])
                if not space["y_pos"] in yz_array[0] and not space["z_pos"] in yz_array[1]:
                    yz_array[0].append(space["y_pos"])
                    yz_array[1].append(space["z_pos"])
    if len(xz_array[1]) > 5 and len(yz_array[1]) > 5:
        x_expect = []; y_expect = []
        # full=True makes polyfit return a tuple; element 0 holds the
        # coefficients (slope, intercept).
        x_fit = np.polyfit(xz_array[1], xz_array[0], 1, full=True)
        y_fit = np.polyfit(yz_array[1], yz_array[0], 1, full=True)
        for i in range(len(xz_array[1])):
            x_expect.append(xz_array[1][i] * x_fit[0][0] + x_fit[0][1])
        for i in range(len(yz_array[1])):
            y_expect.append(yz_array[1][i] * y_fit[0][0] + y_fit[0][1])
            # print "Expected Y Values"
            # print "y = ", y_fit[0][0], "*", yz_array[1][i], " + ", y_fit[0][1]
            # print yz_array[0][i], " / ", y_expect[i], "\n\n"
        x_chi = chisquare(xz_array[0], x_expect)
        y_chi = chisquare(yz_array[0], y_expect)
        # Combine the two statistics in quadrature.
        tot_chi = math.sqrt(x_chi[0] ** 2 + y_chi[0] ** 2)
        if tot_chi < 10:
            return [True, abs(x_chi[0]), abs(y_chi[0]), tot_chi]
        else:
            return [False, abs(x_chi[0]), abs(y_chi[0]), tot_chi]
    return [False, x_chi, y_chi, tot_chi]
def check_chisquare(f_obs, f_exp, ddof, axis, expected_chi2):
    """Assert that ``mstats.chisquare`` gives the expected statistic/p-value.

    Use this only for arrays that have no masked values.

    ``axis`` may be an integer, ``None`` (the data is treated as flattened) or
    the string ``'no'``, meaning that no ``axis`` argument is passed so the
    function's default is exercised.  The expected p-value is the chi-square
    survival function of ``expected_chi2`` with ``num_obs - 1 - ddof`` degrees
    of freedom.  The result is also compared with ``stats.chisquare``.

    Raises AssertionError (through the numpy.testing helpers) on a mismatch.
    """
    f_obs = np.asarray(f_obs)
    if axis is None:
        num_obs = f_obs.size
    else:
        use_axis = 0 if axis == 'no' else axis
        b = np.broadcast(f_obs, f_exp)
        num_obs = b.shape[use_axis]

    # 'no' means: do not pass ``axis`` at all.
    axis_kwargs = {} if axis == 'no' else {'axis': axis}

    chi2, p = mstats.chisquare(f_obs, f_exp=f_exp, ddof=ddof, **axis_kwargs)
    assert_array_equal(chi2, expected_chi2)

    ddof = np.asarray(ddof)
    # stats.chisqprob was removed from SciPy; chi2.sf is its replacement.
    expected_p = stats.chi2.sf(expected_chi2, num_obs - 1 - ddof)
    assert_array_equal(p, expected_p)

    # Also compare to stats.chisquare
    stats_chisq, stats_p = stats.chisquare(f_obs, f_exp=f_exp, ddof=ddof,
                                           **axis_kwargs)
    assert_array_almost_equal(chi2, stats_chisq)
    assert_array_almost_equal(p, stats_p)
def scoreCHISQ(self, pos_query_set, neg_query_set):
    """Score query sets against the DE sets with two chi-square tests.

    The chi-square test is used as an approximation to Fisher's exact test.
    The null proportions are the relative sizes of ``self.pos_de_set`` and
    ``self.neg_de_set``.  For each query set the observed counts of members
    falling in the positive / negative DE set are tested against those
    proportions scaled to the observed total (``chisquare`` expects expected
    *frequencies*, not probabilities).

    Returns the product of the two p-values.  A query set with no overlap at
    all carries no evidence and contributes a p-value of 1.0.
    """
    n_pos = len(self.pos_de_set)
    n_neg = len(self.neg_de_set)
    norm = float(n_pos + n_neg)
    # Probability of landing in each DE set by chance: [positive, negative].
    proportions = np.array([n_pos / norm, n_neg / norm])

    def _pvalue(observed):
        # Scale proportions to expected counts with the same total as observed.
        total = observed.sum()
        if total == 0:
            return 1.0
        return stats.chisquare(observed, proportions * total)[1]

    up_AGREE = float(len(pos_query_set.intersection(self.pos_de_set)))
    up_DISAGREE = float(len(pos_query_set.intersection(self.neg_de_set)))
    UP_pval = _pvalue(np.array([up_AGREE, up_DISAGREE]))

    down_AGREE = float(len(neg_query_set.intersection(self.neg_de_set)))
    down_DISAGREE = float(len(neg_query_set.intersection(self.pos_de_set)))
    # Same [positive, negative] category order as ``proportions``.
    DOWN_pval = _pvalue(np.array([down_DISAGREE, down_AGREE]))

    return UP_pval * DOWN_pval
def dm_sigma_shape(sps):
    """Chi-square statistics of Gaussian and Lorentzian fits to the envelope.

    The envelope of ``sps`` is interpolated, a Gaussian and a Lorentzian
    profile are fitted to the interpolated points, and each fitted profile is
    evaluated at the envelope's own ``dm`` values and compared with its
    ``sigma`` values.

    Returns a tuple ``(gaussian_statistic, lorentzian_statistic)``.
    """
    envelope = get_envelope(sps)
    interp_xs, interp_ys = interp_envelope(envelope)
    gauss_model = fit_gauss(interp_xs, interp_ys)
    lorentz_model = fit_lorentz(interp_xs, interp_ys)

    dms = envelope.dm
    sigmas = envelope.sigma
    gauss_ys = gauss_model(dms)
    lorentz_ys = lorentz_model(dms)

    # Third positional argument of chisquare is ddof; it does not affect the
    # statistic, which is all that is returned here.
    gauss_stat = chisquare(sigmas, gauss_ys, 2)[0]
    lorentz_stat = chisquare(sigmas, lorentz_ys, 2)[0]
    return (gauss_stat, lorentz_stat)
def Responses(self,PlotStartTime,PlotEndTime,DoSkip=[]):
    """Plot a spike raster with a PSTH overlay per group and tabulate chi-square.

    Groups from ``self.final_data['SortedGroupsList']`` whose name contains any
    string in ``DoSkip`` are left out.  The remaining groups are laid out on a
    two-column grid; each panel shows the spikes as a scatter plot and, on a
    twin y-axis, a step histogram with ``PSTH_bin``-wide bins.  The histogram
    cropped to ``[PlotStartTime, PlotEndTime]`` is passed to ``st.chisquare``
    and the statistic and p-value are collected per group.

    Side effects: shows the figure, saves ``cell_type_response.eps`` and writes
    ``table.txt`` into ``self.IllustratorOutputFolder``.

    NOTE(review): ``DoSkip=[]`` is a mutable default; it is only read here.
    NOTE(review): ``for col in row`` and ``axarr[y, x]`` require a 2-D axes
    array, i.e. more than two plotted groups -- confirm callers guarantee it.
    """
    NotSkippedGroups = [G for G in self.final_data['SortedGroupsList'] if not any(ext in G for ext in DoSkip)]
    # spio.savemat(os.path.join(self.CXOutputPath,'SpikesToPlot.mat'),GroupsSpikes)
    # else:
    #     GroupsSpikes = self.loadmat(os.path.join(self.CXOutputPath,'SpikesToPlot.mat'))
    # One grid row per pair of groups, two columns.
    f, axarr = plt.subplots(int(round(len(NotSkippedGroups)/2.)), 2, sharex=True, figsize=(8, 17))
    # Twin y-axes carry the PSTH so that it can have its own scale.
    axarr_twins = [col.twinx() for row in axarr for col in row]
    axarr_twins = np.reshape(axarr_twins, (int(round(len(NotSkippedGroups)/2.)), 2))
    PSTH_bin = 0.005
    statistics = []
    for idx, Group in enumerate(NotSkippedGroups):
        # Panels are filled column by column.
        plt_y_idx = idx % (int(round(len(NotSkippedGroups)/2.)))
        plt_x_idx = int(idx / (int(round(len(NotSkippedGroups)/2.))))
        Scatter_Xs = np.concatenate(self.final_data['GroupsSpikes'][Group])
        # y value of each spike is the 1-based index of the entry it came from.
        Scatter_Ys = np.concatenate( [(np.ones(len(TimePoint)) * (int(TimePoint_idx) + 1)).astype(int) for TimePoint_idx, TimePoint in enumerate(self.final_data['GroupsSpikes'][Group])])
        axarr[plt_y_idx, plt_x_idx].scatter(Scatter_Xs, Scatter_Ys, s=1, color='0.5')
        Step_Xs = np.arange(0, self.final_data['runtime'], PSTH_bin)
        Step_Ys = np.zeros_like(Step_Xs)
        for X in np.unique(Scatter_Xs):
            # Add the number of spikes at time X to the nearest bin position.
            bin_idx = np.where(Step_Xs == min(Step_Xs, key=lambda x: abs(x - X)))
            Step_Ys[bin_idx] += len(np.where(Scatter_Xs == X)[0])
        axarr_twins[plt_y_idx, plt_x_idx].step(Step_Xs, Step_Ys, where='mid', c='b')
        # Dotted vertical line at the input time.
        axarr[plt_y_idx, plt_x_idx].plot([self.final_data['InputTime'], self.final_data['InputTime']], [0, len(self.final_data['FileList'])], color='r', linestyle='dotted', linewidth=2)
        axarr[plt_y_idx, plt_x_idx].spines['top'].set_color('none')
        axarr_twins[plt_y_idx, plt_x_idx].spines['top'].set_color('none')
        axarr[plt_y_idx, plt_x_idx].xaxis.set_ticks_position('bottom')
        axarr_twins[plt_y_idx, plt_x_idx].xaxis.set_ticks_position('bottom')
        axarr[plt_y_idx, plt_x_idx].set_ylim(0, len(self.final_data['FileList']))
        # Title: the part of the group name after the first underscore.
        CurrentTitle = Group[Group.index('_') + 1:].replace('_L',' Layer ').replace('toL',' to Layer ')
        axarr[plt_y_idx, plt_x_idx].set_title(CurrentTitle)
        axarr[plt_y_idx, plt_x_idx].set_xlim(PlotStartTime, PlotEndTime)
        axarr_twins[plt_y_idx, plt_x_idx].set_xlim(PlotStartTime, PlotEndTime)
        # Scale the PSTH axis to the maximum inside the plotted time window.
        if max(Step_Ys[int(np.where(np.array(Step_Xs)>=PlotStartTime)[0][0]):int(np.where(np.array(Step_Xs)<=PlotEndTime)[0][-1])]) == 0:
            axarr_twins[plt_y_idx, plt_x_idx].set_ylim(0, 1)
        else:
            axarr_twins[plt_y_idx, plt_x_idx].set_ylim(0, max(Step_Ys[int(np.where(np.array(Step_Xs)>=PlotStartTime)[0][0]):int(np.where(np.array(Step_Xs)<=PlotEndTime)[0][-1])]) )
        y_lower,y_upper = axarr_twins[plt_y_idx, plt_x_idx].get_ylim()
        if y_upper > 5 :
            axarr_twins[plt_y_idx, plt_x_idx].yaxis.set_ticks(np.arange(int(y_lower), int(y_upper)+1, (int(y_upper) - int(y_lower)) / 5))
        else:
            axarr_twins[plt_y_idx, plt_x_idx].yaxis.set_ticks(np.arange(int(y_lower), int(y_upper)+1))
        if plt_x_idx == 0 :
            axarr[plt_y_idx, plt_x_idx].set_ylabel('Trials')
        cropped_steps = Step_Ys[int(np.where(np.array(Step_Xs) >= PlotStartTime)[0][0]):int(np.where(np.array(Step_Xs) <= PlotEndTime)[0][-1])]
        # With no f_exp, chisquare tests the cropped histogram against equal
        # expected frequencies.
        statistics.append([CurrentTitle,st.chisquare(cropped_steps)[0],st.chisquare(cropped_steps)[1]])
    axarr[-1,0].set_xlabel('time (s)')
    axarr[-1,1].set_xlabel('time (s)')
    plt.locator_params(axis='x', nbins=5)
    plt.tight_layout()
    plt.show()
    f.savefig(os.path.join(self.IllustratorOutputFolder,'cell_type_response.eps'))
    with open (os.path.join(self.IllustratorOutputFolder,'table.txt'),'w') as table_file:
        table_file.write(tabulate(statistics,headers=['Group Name','Chi-Square','p-Value']))
def chi2_from_sig_m(sig_m, err_t, sig_w):
    """Pearson chi-square of the normalised errors against a standard normal.

    ``err_t`` is centred on its mean and divided by
    ``sqrt(sig_m**2 + sig_w**2)``.  The result is histogrammed with the
    'sturges' bin rule and every bin count is compared with
    ``len(err_t)`` times the standard-normal probability of that bin.

    The statistic ``sum((observed - expected)**2 / expected)`` is computed
    directly.  The expected counts only cover the histogram range, so they
    never add up to ``len(err_t)``, and recent SciPy releases make
    ``stats.chisquare`` raise when the two totals disagree.

    Prints the statistic, the mean total sigma and the two totals, and
    returns the statistic.
    """
    sig_t = np.sqrt(sig_m**2 + sig_w**2)
    err_t_normalized = (err_t - np.mean(err_t)) / sig_t
    vals, bins = np.histogram(err_t_normalized, bins='sturges')

    std_norm = stats.norm(loc=0, scale=1)
    normal_vals = np.array([
        len(err_t) * integrate.quad(std_norm.pdf, lower, upper)[0]
        for lower, upper in zip(bins[:-1], bins[1:])
    ])

    chi2 = ((vals - normal_vals) ** 2 / normal_vals).sum()
    print(chi2)
    print(sig_t.mean())
    print("%s %s" % (sum(vals), sum(normal_vals)))
    return chi2
def GainvsVolt(folderpath):
    # Plot gain against bias voltage with a straight-line fit, first from
    # <folderpath>/pyspes.log (annotated "DSF") and then from
    # <folderpath>/spes.log (annotated "FFT").  Each plot is annotated with the
    # breakdown voltage (the zero crossing of the fitted line), the gain slope,
    # the mean temperature and chi^2/DOF.
    #
    # NOTE(review): the nesting was reconstructed from a source whose
    # indentation was lost -- confirm against the original file.
    filename = folderpath+'/pyspes.log'
    # fftplt is set to 1 here and again in the else branch below, so the FFT
    # section always runs.
    fftplt=1;
    if 'pyspes.log' in filename:
        if os.path.isfile(filename):
            gain,errgain,volt,temp = np.genfromtxt(filename,usecols=(6,7,12,13),unpack=True,dtype='float')
            # Flip the sign of the voltage column.
            volt = -1.*volt
            mygr = ROOT.TGraphErrors(len(volt),volt.flatten(),gain.flatten(),np.zeros(len(volt),dtype=float),errgain.flatten())
            mygr.Draw("AP")
            # Clamp gain errors to a minimum of 1 before using them as sigma.
            for i in range(len(errgain)):
                if errgain[i]<1:
                    errgain[i] = 1
            # NOTE(review): curve_fit's second return value is the covariance
            # matrix of popt; it is named perr here and its diagonal is used
            # as variances below.
            popt,perr = scop.curve_fit(linearfunc,volt,gain,sigma=errgain)
            dof = len(gain)-1-2
            # dof is passed in chisquare's ddof position; only chi is used
            # afterwards, pval is not.
            chi,pval = scist.chisquare(gain,linearfunc(volt,popt[0],popt[1]),dof)
            plt.errorbar(volt,gain,yerr=errgain,fmt='.')
            vbd = -1.*popt[0]/popt[1]
            errvbd = np.sqrt((perr[0][0]/(popt[0]**2)+perr[1][1]/(popt[1]**2))*(vbd**2))
            normg = popt[1]*20e-15/50./1.6e-19
            # errnormg is computed but not used in this function.
            errnormg = np.sqrt(perr[1][1])*20e-15/50./1.6e-19
            plt.plot(volt,linearfunc(volt,popt[0],popt[1]),'r--')
            plt.grid(True)
            plt.xlabel('Bias Voltage [V]',fontsize=16)
            plt.ylabel('Gain [adu.]',fontsize=16)
            plt.annotate("DSF\n$U_{bd}$ : "+"{0:.2f} $\pm$ {1:.2f} V\nGain: {2:.2e} $e_0$/V\nTemp: {3:.1f}$^\circ$C\n$\chi^2$/DOF : {4:.1f}/{5}".format(vbd,errvbd,normg,np.mean(temp),chi,dof),xy=(0.6,0.3),xycoords='axes fraction',fontsize=16)
            plt.xlim(np.min(volt)-0.1,np.max(volt)+0.1)
            plt.show()
        else:
            print 'pyspes.log not exists, using FFT fit data ...'
            fftplt=1
    if (fftplt):
        filename = folderpath+'/spes.log'
        if os.path.isfile(filename):
            voltage,temperature,GainFFT,errGainFFT = np.genfromtxt(filename,usecols=(0,1,7,8),unpack=True)
            voltage = -1.*voltage
            # Unlike the section above, the errors are not clamped here.
            popt,perr = scop.curve_fit(linearfunc,voltage,GainFFT,sigma=errGainFFT)
            dof = len(GainFFT)-1-2
            chi,pval = scist.chisquare(GainFFT,linearfunc(voltage,popt[0],popt[1]),dof)
            plt.errorbar(voltage,GainFFT,yerr=errGainFFT,fmt='.')
            vbd = -1.*popt[0]/popt[1]
            errvbd = np.sqrt((perr[0][0]/(popt[0]**2)+perr[1][1]/(popt[1]**2))*(vbd**2))
            normg = popt[1]*20e-15/50./1.6e-19
            errnormg = np.sqrt(perr[1][1])*20e-15/50./1.6e-19
            plt.plot(voltage,linearfunc(voltage,popt[0],popt[1]),'r--')
            plt.grid(True)
            plt.xlabel('Bias Voltage [V]',fontsize=16)
            plt.ylabel('Gain [adu.]',fontsize=16)
            plt.annotate("FFT\n$U_{bd}$ : "+"{0:.2f} $\pm$ {1:.2f} V\nGain: {2:.2e} $e_0$/V\nTemp: {3:.1f}$^\circ$C\n$\chi^2$/DOF : {4:.1f}/{5}".format(vbd,errvbd,normg,np.mean(temperature),chi,dof),xy=(0.6,0.3),xycoords='axes fraction',fontsize=16)
            # x-limits come from the first and last rows, not min/max as above.
            plt.xlim(voltage[0]-0.1,voltage[-1]+0.1)
            plt.show()
        else:
            print "no log file found!"
def statistic_test(tagging, feature_values):
    '''Chi-square comparison of the label counts on the two sides of a split.

    The samples are split by ``feature_values == 1`` versus
    ``feature_values != 1``.  For each side the number of samples with
    ``tagging == 1`` and with ``tagging != 1`` is counted, and the counts of
    the first side are tested against those of the second.

    Both arguments are expected to be NumPy arrays of the same length.

    Returns ``(statistic, p_value)``; a high statistic with a low p-value
    indicates a good split.  Special cases:

    * more than two distinct feature values -> ``(0.0, 0.0)``
    * a zero count on both sides -> ``(inf, 0.0)``
    * a zero count only on the second side -> the roles of the two sides are
      swapped so the zero is not used as an expected frequency.
    '''
    if len(frozenset(feature_values)) > 2:
        return 0.0, 0.0  # only works for 2 values

    # Boolean masks replace the removed pylab/mlab ``find`` helper.
    first_side = tagging[feature_values == 1]
    second_side = tagging[feature_values != 1]
    observed = array([int((first_side == 1).sum()), int((first_side != 1).sum())])
    expected = array([int((second_side == 1).sum()), int((second_side != 1).sum())])

    if any(expected == 0):
        if any(observed == 0):
            return inf, 0.0  # this is good for us
        return chisquare(expected, observed)
    return chisquare(observed, expected)  # high stat+low p->good
def test_4_14_add_poisson_noise():
    """Statistical checks on ``addPoissonNoise`` applied to rows of ``img``.

    For each of the first 75 rows, noise is added with
    ``starCam.images[0].addPoissonNoise`` and row 0 of ``img`` is subtracted;
    the mean, standard deviation and variance of the result are collected.
    The assertions then compare those sample statistics with each other and
    with the mean of ``img``.

    NOTE(review): the nesting was reconstructed from a source whose
    indentation was lost.  The chi-square check is assumed to run once after
    the loop, on the last ``fish`` -- confirm against the original file.
    NOTE(review): ``img[0,:]`` (not ``img[i,:]``) is subtracted from every
    row -- confirm this is intended.
    """
    from scipy.stats import chisquare
    rows = 75
    mew = []  # per-row sample means
    dev = []  # per-row sample standard deviations
    sig = []  # per-row sample variances
    for i in range(rows):
        fish = starCam.images[0].addPoissonNoise(img[i,:]) - img[0,:]
        muFish = np.mean(fish)
        varFish = np.var(fish)
        stdFish = np.std(fish)
        # Statistics of the whole image; they do not depend on i.
        muImg = np.mean(img)
        varImg = np.var(img)
        mew.append(muFish)
        dev.append(stdFish)
        sig.append(varFish)
    chi = chisquare(np.array(fish))[1]
    # Chi-Squared Test (p-value must be >= than 0.05 for good Poisson Fit)
    assert ( chi >= 0.05 )
    # Mean of Samples == Mean of Normal Image
    assert ( abs(np.mean(mew) - muImg) <= 0.01 )
    # Avg of Sample Std Dev == sqrt [ Avg of Sample Variances ]
    assert ( abs(np.mean(dev) - np.sqrt(np.mean(sig))) <= 1e-3 )
    # Avg of Sample Variance == Avg of Sample Mean
    assert ( abs(np.mean(sig) - np.mean(mew)) <= 0.1 )
def get_patient_ChiP(patient, geneToCases, patientToGenes):
    """Return the chi-square p-value of a patient's gene profile.

    :param patient: key into ``patientToGenes``
    :param geneToCases: mapping gene -> collection of cases having that gene
    :param patientToGenes: mapping patient -> collection of that patient's genes
    :return: the p-value of ``stats.chisquare`` comparing, over all genes in
        ``geneToCases``, the patient's 0/1 indicator of having the gene with
        the gene's marginal frequency ``len(geneToCases[gene]) / numCases``

    The patient is printed when the p-value is below 0.05.
    """
    patient_genes = patientToGenes[patient]
    numCases = len(patientToGenes)

    f_obs = [1. if gene in patient_genes else 0. for gene in geneToCases]
    # The expected value is the marginal probability of the gene's occurrence
    f_exp = [len(geneToCases[gene]) * 1.0 / numCases for gene in geneToCases]

    chisq, p = stats.chisquare(f_obs, f_exp)
    if p < 0.05:
        print(patient)
    return p
def check_counts(obj2counts, expected, threshold=0.001, verbose=False):
    """Check some counts according to a chi-squared statistic.

    We can use this to see if sampling counts, etc. are what they should be.

    :param obj2counts: dictionary mapping each thing to its observed count
    :param expected: either a *function* that takes an object and hands back
        its expected count (unnormalized), or a dictionary doing the same;
        objects missing from the dictionary get an expected count of 0.0
    :param threshold: the p-value must be strictly greater than this
    :param verbose: print the statistic (and a warning on a low p-value)
    :return: True when the check passes
    :raises AssertionError: when ``p`` is not greater than ``threshold`` or
        ``expected`` is neither a dict nor callable

    The expected counts are rescaled so that they sum to the total number of
    observations.

    TODO: We may want a normalized version?
    """
    # Real lists (not lazy ``map`` objects) so the values can be summed and
    # then still be handed to chisquare.
    objects = list(obj2counts.keys())
    actual_counts = [float(obj2counts[o]) for o in objects]
    N = sum(actual_counts)

    if isinstance(expected, dict):
        e = [expected.get(o, 0.0) for o in objects]
    else:
        if not callable(expected):
            raise AssertionError("expected must be a dict or a callable")
        e = [expected(o) for o in objects]

    Z = float(sum(e))
    expected_counts = [float(v * N) / Z for v in e]

    chi, p = chisquare(f_obs=actual_counts, f_exp=expected_counts)

    if verbose:
        print("# Chi squared gives chi=%f, p=%f" % (chi, p))
        if p < threshold:
            # The original ``assert "<text>"`` could never fire; report it.
            print("# *** SIGNIFICANT DEVIATION FOUND IN P")

    # Explicit raise so the check survives ``python -O``.
    if not p > threshold:
        raise AssertionError(
            "*** Chi squared test fail with chi=%f, p=%f" % (chi, p))
    return True
def get_chisquare(obs_data, obs_model, nbins=3):
    '''
    One-way chi-square test between binned experimental and simulated data.

    Bin edges are taken at ``nbins + 1`` evenly spaced percentiles (0..100)
    of the experimental data.  Both samples are histogrammed on those edges,
    each histogram is divided by its own sample size, and the two sets of
    proportions are compared.

    Parameters
    --------------
    obs_data : array of MEP amplitudes from experimental data
    obs_model : array of predicted MEP amplitudes from simulated trials
        (i.e. facilitation curves)
    nbins : number of percentile bins

    Returns
    ---------------
    the result of ``stats.chisquare`` (statistic and p-value) for how well
    the predicted data matches the experimental data
    '''
    percentiles = np.linspace(0, 100, nbins + 1)
    edges = np.percentile(obs_data, list(percentiles))

    data_counts, edges = np.histogram(obs_data, bins=edges)
    # NOTE (open question from the author): are proportions, rather than raw
    # frequencies, still appropriate input for the test?
    data_fraction = data_counts / float(obs_data.size)

    model_counts, edges = np.histogram(obs_model, bins=edges)
    model_fraction = model_counts / float(obs_model.size)

    return stats.chisquare(data_fraction, model_fraction)
def test_adjust_bad_positions():
    """``adjust_bad_positions`` must repair pages with implausible positions.

    Six well-formed pages define the mean column widths.  Three malformed
    pages are added, and after adjustment every page must have four
    positions, keep its first position, and have widths that do not differ
    significantly (chi-square, ``alpha``) from the mean widths.
    """
    pages_positions = {
        0: [8, 28, 33, 38],
        1: [10, 30, 35, 40],
        2: [10, 30, 35, 40],
        3: [0, 20, 25, 32],
        4: [3, 21, 25, 31],
        5: [3, 21, 25, 31],
    }
    # Mean position per column over the good pages, then widths in between.
    column_means = [np.mean(column) for column in zip(*pages_positions.values())]
    mean_widths = np.diff(column_means)

    pages_positions[6] = [3, 21, 20, 31]      # bad: neg. width
    pages_positions[7] = [3, 21, 25, 28, 31]  # bad: too many positions
    pages_positions[8] = [3, 21, 25, 70]      # bad: invalid last position

    alpha = 0.05
    adj_positions = adjust_bad_positions(pages_positions,
                                         pos_check_signif_level=alpha)
    assert pages_positions.keys() == adj_positions.keys()

    for p_num in pages_positions.keys():
        original = pages_positions[p_num]
        adjusted = adj_positions[p_num]
        assert len(adjusted) == 4
        assert adjusted[0] == original[0]
        _, p_val = chisquare(np.diff(adjusted), mean_widths)
        assert p_val >= alpha
def transnational_distribution(node):
    """Decide whether a node's friends are spread evenly over time zones.

    The counts of the three most common time zones among the node's friends
    are tested for uniformity with a chi-square test, so that a graph heavily
    skewed towards one time zone/country does not qualify.

    :param node: the node to test; ``node.friends`` is iterated and each
        friend's ``time_zone`` is read (``None`` values are ignored)
    :returns: True when the chi-square statistic is below 5 and the p-value
        is above 0.25; False otherwise, including when no friend has a
        time zone
    :rtype: boolean
    """
    zones = [tz for tz in (friend.time_zone for friend in node.friends)
             if tz is not None]

    # Without any known time zone the node cannot be quantified.
    if not zones:
        return False

    top_counts = [count for _, count in Counter(zones).most_common(3)]
    # chisquare returns (statistic, pvalue).
    statistic, pvalue = chisquare(top_counts)
    return bool(statistic < 5 and pvalue > 0.25)
def ejercicio5b():
    """Exercise 5b: print two chi-square results for the observed counts.

    First the result of ``st.chisquare`` for the observed counts ``val1``
    against fixed expected frequencies with ``ddof=1``, then the upper-tail
    chi-square probability (3 degrees of freedom) of the statistic returned
    by ``tabla2()``.
    """
    val1 = [6, 1, 1, 2, 6]
    valores = [6, 7, 3, 4, 7, 3, 7, 2, 6, 3, 7, 8, 2, 1, 3, 5, 8, 7]
    # The estimate is not used below; the call is kept in case
    # estimarPbinomial reports something itself.  The unused list built from
    # it (``val2``) was dropped.
    estimarPbinomial(valores, 8)
    print("%s %s" % (" chi (python)",
                     st.chisquare(val1, f_exp=[2.37, 3.49, 4.5, 3.62, 2.01], ddof=1)))
    print("%s %s" % (" chi2", 1 - st.chi2.cdf(tabla2(), 3)))
def check_initializer_statistics(self, xp, n):
    """Statistically check ``n`` independent runs of the initializer.

    ``n`` arrays of shape ``self.shape`` are initialised, and the values at
    each array position across the ``n`` runs form one sample.  For
    ``dim_in == 1`` every value must have magnitude ``expected_scale`` and
    the signs are chi-square tested; otherwise the sample is
    Kolmogorov-Smirnov tested against a Beta distribution on
    ``[-expected_scale, expected_scale]``.  The significance level 0.01 is
    divided by the number of samples tested.
    """
    from scipy import stats

    ws = xp.empty((n,) + self.shape, dtype=self.dtype)
    for run in range(n):
        initializer = self.target(**self.target_kwargs)
        # Initialise the run-th slice in place.
        initializer(xp.squeeze(ws[run:run + 1], axis=0))

    expected_scale = self.scale or 1.1
    # One row per array position, one column per run.
    per_position = cuda.to_cpu(ws.reshape(n, -1).T)
    alpha = 0.01 / len(per_position)
    ab = 0.5 * (self.dim_in - 1)

    for values in per_position:
        if self.dim_in != 1:
            reference = stats.beta(
                ab, ab, loc=-expected_scale, scale=2*expected_scale)
            _, p = stats.kstest(values, reference.cdf)
        else:
            numpy.testing.assert_allclose(abs(values), expected_scale)
            # Map signs -1/+1 to 0/1 before the chi-square test.
            sign_bits = (numpy.sign(values) + 1) // 2
            _, p = stats.chisquare(sign_bits)
        assert p >= alpha
def pymc3_random_discrete(dist, paramdomains, valuedomain=Domain([0]), ref_rand=None, size=100000, alpha=0.05, fails=20):
    """Compare random draws of a discrete distribution with a reference sampler.

    For each parameter point produced by ``product(domains, n_samples=100)``,
    ``size`` values are drawn from the model's ``value`` variable and from
    ``ref_rand``.  The counts per distinct reference value are compared with
    a chi-square test.  A point may be retried up to ``fails`` times; the
    assertion fails if the p-value never exceeds ``alpha``.
    """
    model = build_model(dist, valuedomain, paramdomains)
    domains = paramdomains.copy()
    for pt in product(domains, n_samples=100):
        pt = pm.Point(pt, model=model)
        p = alpha
        # Allow Chisq test to fail (i.e., the samples be different)
        # a certain number of times.
        remaining = fails
        while p <= alpha and remaining > 0:
            drawn = model.named_vars['value'].random(size=size, point=pt)
            reference = ref_rand(size=size, **pt)
            drawn = np.atleast_1d(drawn).flatten()
            reference = np.atleast_1d(reference).flatten()

            drawn_counts = dict(zip(*np.unique(drawn, return_counts=True)))
            reference_counts = dict(zip(*np.unique(reference, return_counts=True)))
            # One (observed, expected) row per value seen in the reference.
            k = np.array([(drawn_counts.get(value, 0), count)
                          for value, count in reference_counts.items()])

            if not np.all(k[:, 0] == k[:, 1]):
                _chi, p = st.chisquare(k[:, 0], k[:, 1])
            else:
                p = 1.
            remaining -= 1
        assert p > alpha, str(pt)
def test_RegenerationProposal(self):
    """Check that RegenerationProposal samples match ``lp_propose``.

    For every tree in ``self.trees``, ``NSAMPLES`` proposals are drawn and
    counted.  The counts over ``self.trees`` are chi-square tested against
    ``exp(rp.lp_propose(tree, x)) * NSAMPLES``; the test fails when the
    p-value is not greater than 0.001.
    """
    from LOTlib.Inference.Proposals.RegenerationProposal import RegenerationProposal
    rp = RegenerationProposal(self.grammar)

    for tree in self.trees:
        cnt = Counter()
        for _ in range(NSAMPLES):
            p, fb = rp.propose_tree(tree)
            cnt[p] += 1

            # Check the proposal
            self.check_tree(p)

        ## check that the proposals are what they should be -- rp.lp_propose is correct!
        # Each list is built once and reused for the test and the report.
        observed = [cnt[t] for t in self.trees]
        log_probs = [rp.lp_propose(tree, x) for x in self.trees]
        expected = [exp(lp) * NSAMPLES for lp in log_probs]
        csq, pv = chisquare(observed, expected)

        # Look at some
        for c, e, tt, lp in zip(observed, expected, self.trees, log_probs):
            print("%s %s %s %s" % (c, e, tt, lp))

        self.assertGreater(pv, 0.001, msg="Sampler failed chi squared!")
def serial_test(sequence):
    """
    serial test

    tests for randomness by looking at conversions from 1 digit to the next

    Every ordered pair of the symbols present in the sequence is a category,
    including pairs that never occur (count 0); under the null hypothesis all
    pairs are equally frequent.  A low p-value in the returned dict indicates
    the strength with which the null hypothesis of a random sequence may be
    rejected.

    http://books.google.com/books?id=EIbxfCGfzgcC&lpg=PA141&ots=o-8ymmqbs9&pg=PA142#v=onepage&q=&f=false

    :param sequence: any iterable with at most 2 values that can be turned
                     into an integer via int() . e.g.
                     '1001001'
                     [1, 0, 1, 0, 1]
    :rtype: returns dict of {'chi': <chisquare value>,
                             'p': <p-value of said chisquare>}

    >>> serial_test('101010101111000')
    {'chi': 1.4285714285714286, 'p': 0.69885130769248427}

    >>> serial_test('110000000000000111111111111')
    {'chi': 18.615384615384617, 'p': 0.00032831021826061683}
    """
    # A real list: the values are sliced below.
    digits = [int(item) for item in sequence]
    transitions = collections.Counter(zip(digits[1:], digits[:-1]))

    # Count every possible ordered pair so that a transition that never
    # happens contributes an observed frequency of 0 instead of vanishing.
    symbols = sorted(set(digits))
    obs = np.array([transitions[(nxt, prev)]
                    for nxt in symbols for prev in symbols])

    # order doesnt matter because the expected are all the same.
    exp = np.ones_like(obs) * obs.mean()

    chi, pval = chisquare(obs, exp)
    return {'chi': chi, 'p': pval}
def do_chi_square(self, catalog_file, alpha, seconds):
    """
    Run a chi-square hypothesis test on the observed frequencies of a catalog.

    Receives a catalog file, a significance level and a number of seconds.
        H0 -> catalog is poissonian
        H1 -> catalog is not poissonian

    The observed frequencies are obtained from
    ``Catalog().get_observed_frequencies(catalog_file, seconds)``.  A warning
    is printed if any frequency is below 5.  Prints the p-value and the
    significance level; nothing is returned.
    """
    observed_frequencies = Catalog().get_observed_frequencies(catalog_file, seconds)

    # warn if the observed frequencies are too low
    if any(count < 5 for count in observed_frequencies):
        print("Warning: the number of occurrences appear to be too low!")

    # 1 restriction, since lambda - rate of occurrence - is estimated
    # from the parameters
    restrictions = 1

    _, p_value = chisquare(observed_frequencies, ddof=restrictions)

    print("the p_value was: ", p_value)
    print("the significance level was: ", alpha)
def run(args):
    """Print a histogram and summary statistics of a column of p-values.

    Reads the tab-separated file ``args.file`` (lines starting with ``#`` are
    skipped), takes the column selected by ``args.c`` and prints:

    * a chart of the density histogram (``args.n`` bins if given),
    * median, mean, standard deviation, minimum and maximum,
    * a chi-square test of uniformity of the bin counts, if SciPy is
      importable,
    * one ``bin_start, bin_end, n`` row per bin.
    """
    col_num = get_col_num(args.c)
    # Read everything inside ``with`` so the file handle is closed.
    with open(args.file) as handle:
        pvals = np.array([
            float(line.rstrip("\r\n").split("\t")[col_num])
            for line in handle if line[0] != "#"
        ])

    kwargs = {"bins": args.n} if args.n else {}
    # ``normed`` was removed from np.histogram; ``density`` is the replacement.
    hist, bins = np.histogram(pvals, density=True, **kwargs)
    xlabels = "|".join("%.2f-%.2f" % b for b in pairwise(bins))
    print("# %s" % (chart(hist, xlabels),))

    # Raw counts for the test and the table.
    hist, bins = np.histogram(pvals, **kwargs)

    print("# median: %.3f mean:%.3f; std: %.3f min:%.3f; max:%.3f" % (
        np.median(pvals), pvals.mean(), pvals.std(), pvals.min(), pvals.max()))

    try:
        from scipy.stats import chisquare
        chisq, p = chisquare(hist)
        print("#chi-square test of uniformity. p: %.3g "
              "(low value means reject null of uniformity)" % p)
    except ImportError:
        pass

    print("#bin_start\tbin_end\tn")
    for edges, val in zip(pairwise(bins), hist):
        print("%.2f\t%.2f\t%i" % (edges[0], edges[1], val))
def mirror_clusters(data, labels, cutoff=0.01):
    """
    Merge mirrored profiles based on a chi2 test of the mean profiles.

    For every data track the mean profile of each cluster is computed and
    each pair of clusters ``(i, j)`` is tested by comparing profile ``i``
    with the reversed profile ``j``.  A pair only counts as mirrored if the
    p-value is at least ``cutoff`` on all data tracks.

    :param data: mapping track name -> 2-D array (rows are profiles)
    :param labels: array of cluster labels, one per row; labels are expected
        to be ``0 .. n-1``
    :param cutoff: minimum p-value required on every track
    :return: the labels ``(i, j)`` of the matching pair with the highest mean
        p-value, or ``(None, None)`` when there is no match or only one
        cluster
    """
    n = len(set(labels))
    if n == 1:
        return (None, None)

    mirror = {i: {} for i in range(n)}
    for track in data.keys():
        # The small offset keeps the expected frequencies away from zero.
        profiles = [numpy.mean(data[track][labels == i], 0) + 1e-10
                    for i in range(n)]
        for i in range(n - 1):
            for j in range(i + 1, n):
                p = chisquare(profiles[i], profiles[j][::-1])[1]
                mirror[i].setdefault(j, []).append(p)

    result = [((i, j), ps) for i in mirror for j, ps in mirror[i].items()]

    # Highest mean p-value first.  ``key=`` replaces the Python-2-only
    # ``cmp=``; sorting ascending and then reversing keeps the original order.
    ranked = sorted(result, key=lambda item: numpy.mean(item[1]))[::-1]
    for (i, j), ps in ranked:
        if (numpy.array(ps) >= cutoff).all():
            return (i, j)
    return (None, None)
def fe2_after_hbeta(wave, flux, error):
    """Fit five Gaussians plus a linear continuum to the spectrum.

    The fit is run with ``LevMarLSQFitter`` while warnings are turned into
    exceptions.  The data, the fitted model and the fitted continuum are
    plotted and saved to ``aft.jpg`` (``aft-failed.jpg`` on failure).

    Returns ``fit.parameters``.

    Raises SpectraException when the fit emits a warning, or when the
    chi-square statistic divided by ``|len(flux) - 17|`` exceeds 10.

    NOTE(review): the nesting was reconstructed from a source whose
    indentation was lost; the code after the ``try`` is assumed to be outside
    the ``with`` block -- confirm against the original file.
    NOTE(review): in the ``except Warning`` branch ``fit`` has not been
    assigned when the warning is raised by the fitter call itself, so that
    branch would fail with a NameError before reaching the SpectraException
    -- confirm and decide what should be plotted there.
    """
    fig = plt.figure()
    plt.plot(wave, flux)
    # (FeII, FeII), (FeII, FeII), FeII
    # The linear component starts from the line through the first and last
    # data points.
    hbeta_complex_fit_func = models.Gaussian1D(5.0, 5169.0, 7.0, bounds = {"amplitude": [0, 10.0], "mean": [5150, 5180]}) + \
        models.Gaussian1D(5.0, 5197.0, 7.0, bounds = {"amplitude": [0, 10.0], "mean": [5180, 5210]}) + \
        models.Gaussian1D(2.0, 5234.0, 7.0, bounds = {"amplitude": [0, 10.0], "mean": [5220, 5250]}) + \
        models.Gaussian1D(2.0, 5276.0, 7.0, bounds = {"amplitude": [0, 10.0], "mean": [5260, 5300]}) + \
        models.Gaussian1D(5.0, 5316.0, 2.0, bounds = {"amplitude": [0, 10.0], "mean": [5300, 5325]}) + \
        models.Linear1D((flux[0] - flux[-1])/(wave[0]-wave[-1]), (-flux[0] * wave[-1] + flux[-1] * wave[0])/(wave[0]-wave[-1]))
    fitter = fitting.LevMarLSQFitter()
    with warnings.catch_warnings():
        # Any warning raised during the fit is treated as a failed fit.
        warnings.filterwarnings('error')
        try:
            fit = fitter(hbeta_complex_fit_func, wave, flux, weights= error, maxiter = 10000)
        except Warning:
            expected = np.array(fit(wave))
            plt.plot(wave, expected)
            cont = models.Linear1D(fit.parameters[15], fit.parameters[16])
            plt.plot(wave, cont(wave))
            fig.savefig("aft-failed.jpg")
            plt.close()
            raise SpectraException("Line Fe2 after Hbeta fit failed")
    expected = np.array(fit(wave))
    plt.plot(wave, expected)
    # Parameters 0-14 belong to the five Gaussians (three each); 15 and 16
    # are the two parameters of the linear continuum.
    cont = models.Linear1D(fit.parameters[15], fit.parameters[16])
    plt.plot(wave, cont(wave))
    fig.savefig("aft.jpg")
    plt.close()
    # Reduced chi-square: 17 is the number of model parameters.
    rcs = chisquare(flux, expected)[0] / np.abs(len(flux) - 17)
    if rcs > 10.0:
        plt.close()
        raise SpectraException("Line Fe2 after Hbeta reduced chi-square too large" + str(rcs))
    return fit.parameters
def chi_squared_test(obs, exp):
    """
    Run a one-way chi-square test and return only its p-value.

    :param obs: observed frequencies
    :param exp: expected frequencies
    :return: P-value
    """
    _statistic, p_value = chisquare(obs, exp)
    return p_value
def calDMcurve(data2d, ddms, freqs, period):
    """Return one chi-square statistic per trial value in ``ddms``.

    For each trial value, every row ``j`` of ``data2d`` is passed to
    ``rotate`` together with ``ddm * 4.15e3 / freqs[j]**2 / period``.  The
    rotated rows are summed along axis 0 and the sum is chi-square tested
    against equal expected frequencies.

    Returns a NumPy array with the statistics, in the order of ``ddms``.
    """
    def _statistic(ddm):
        # One shift per row of data2d.
        deltaphases = ddm * 4.15e3 * 1. / freqs**2 / period
        rotated = np.array([rotate(data2d[row, :], shift)
                            for row, shift in enumerate(deltaphases)])
        return stats.chisquare(rotated.sum(0))[0]

    return np.array([_statistic(ddm) for ddm in ddms])
def compute_score(attr):
    """Score an attribute by how well it separates the groups.

    Returns a p-value (lower means better separation), or a sentinel above
    any p-value: 3 for the grouping variable itself, 2 when no test can be
    computed or the p-value is NaN.

    Continuous attributes use a one-way ANOVA over the non-NaN values of each
    non-empty group; other attributes use a chi-square test on the
    contingency table with empty rows and columns removed.

    ``data``, ``group_var``, ``group_col`` and ``n_groups`` are names defined
    outside this function.
    """
    if attr is group_var:
        return 3

    if not attr.is_continuous:
        # Chi-square with the given distribution into groups
        # (see degrees of freedom in computation of the p-value)
        if not (attr.values and group_var.values):
            return 2
        table = np.array(contingency.get_contingency(data, group_var, attr))
        table = table[table.sum(axis=1) != 0, :]
        table = table[:, table.sum(axis=0) != 0]
        if min(table.shape) < 2:
            return 2
        # Expected counts under independence: outer product of the margins
        # divided by the grand total.
        row_totals = table.sum(axis=1)
        col_totals = table.sum(axis=0)
        expected = np.outer(row_totals, col_totals) / np.sum(table)
        p = chisquare(table.ravel(), f_exp=expected.ravel(),
                      ddof=n_groups - 1)[1]
    else:
        # One-way ANOVA
        values = data.get_column_view(attr)[0].astype(float)
        groups = []
        for i in range(n_groups):
            members = values[group_col == i]
            members = members[~np.isnan(members)]
            if len(members):
                groups.append(members)
        p = f_oneway(*groups)[1] if len(groups) > 1 else 2

    return 2 if math.isnan(p) else p
# NOTE(review): the statements down to ``chi_squared_values.append`` read like
# the body of a loop whose header (and the definitions of ``sequence`` and
# ``chi_squared_values``) is not visible in this excerpt -- confirm against
# the original file before relying on this layout.

# Chi-square statistic of one sample: the counts of 0 ("male") and 1
# ("female") entries are each compared with an expected count of 150.
male_count = len(sequence[sequence == 0])
female_count = len(sequence[sequence == 1])
male_diff = (male_count - 150)**2 / 150
female_diff = (female_count - 150)**2 / 150
chi_squared = male_diff + female_diff
chi_squared_values.append(chi_squared)

plt.hist(chi_squared_values)

## 9. Increasing degrees of freedom ##

# Manual chi-square statistic over five categories: the sum of
# (observed - expected)**2 / expected.
diffs = []
observed = [27816, 3124, 1039, 311, 271]
expected = [26146.5, 3939.9, 944.3, 260.5, 1269.8]
for i, obs in enumerate(observed):
    exp = expected[i]
    diff = (obs - exp)**2 / exp
    diffs.append(diff)
race_chisq = sum(diffs)

## 10. Using SciPy ##

# The same comparison with scipy.stats.chisquare, which also gives a p-value.
from scipy.stats import chisquare
import numpy as np
observed = np.array([27816, 3124, 1039, 311, 271])
expected = np.array([26146.5, 3939.9, 944.3, 260.5, 1269.8])
chisquare_value, race_pvalue = chisquare(observed, expected)
def plot_bkg_templates(fnames_to_run):
    """
    Runs LOWESS smoothing algorithm ntoys times and finds 1 and 2 sigma bands for interpolation

    For every file in ``fnames_to_run`` and every systematic template of the
    selected lepton channel (``args.lepton``), ``ntoys`` toy templates are
    drawn bin by bin from a Gaussian (mean = bin value, sigma = bin error),
    each toy is smoothed with ``Plotter.smoothing_mttbins`` and its relative
    deviation from the nominal (``nosys``) template is stored.  The bands are
    read off the sorted toy deviations of each bin, and one figure per
    template is saved under ``outdir``.

    ``args``, ``nbins``, ``ntoys``, ``nbinsx``, ``nbinsy``, ``mtt_centers``,
    the ``*_sigma_ind`` indices, ``outdir`` and the label dictionaries are
    names defined outside this function.
    """
    for bkg_file in fnames_to_run:
        hdict = load(bkg_file)
        # Jet multiplicity is taken from the file name.
        jmult = "3Jets" if "3Jets" in os.path.basename(bkg_file) else "4PJets"
        for tname, orig_template in hdict[args.lepton].items():
            # Template names have the form "<proc>_<sys>".
            proc = tname.split(
                "_")[0] if not "data_obs" in tname else "data_obs"
            # NOTE: ``sys`` is a local string here, not the stdlib module.
            sys = sorted(filter(None, tname.split(f"{proc}_")))[0]
            if proc == "BKG": continue
            #if sys not in ["hdampUP", "hdampDOWN", "mtop1665", "mtop1695", "mtop1715", "mtop1735", "mtop1755", "mtop1785", "ueUP", "ueDOWN"]: continue
            if sys == "nosys": continue
            print(args.lepton, jmult, sys, proc)
            nosys_hist = hdict[args.lepton][f"{proc}_nosys"].copy()
            orig_smooth_hist = Plotter.smoothing_mttbins(
                nosys=nosys_hist,
                systematic=orig_template,
                mtt_centers=mtt_centers,
                nbinsx=nbinsx,
                nbinsy=nbinsy)
            x_lims = (0, nosys_hist.dense_axes()[0].centers().size)
            # get vals and errors of systematic variation
            sys_histo_vals, sys_histo_sumw2 = orig_template.values(
                sumw2=True)[()]
            sys_histo_errs = np.sqrt(sys_histo_sumw2)
            # make toys based on Gaussian distribution of mu=bin_val, sigma=bin_error
            toy_arrays = np.zeros((nbins, ntoys))
            for idx in range(nbins):
                toy_arrays[idx] = np.random.normal(sys_histo_vals[idx],
                                                   sys_histo_errs[idx],
                                                   size=ntoys)
            # get smoothed relative deviation distributions from toys
            smoothed_rel_dev_arrays = np.zeros((ntoys, nbins))
            # One (statistic, p-value) row per toy; filled below but not used
            # again in this function.
            chi2_pvals = np.zeros((ntoys, 2))
            for idx in range(ntoys):
                # Row idx of the transposed array is the idx-th toy template.
                smoothed_array = Plotter.smoothing_mttbins(
                    nosys=nosys_hist,
                    systematic=(toy_arrays.T)[idx],
                    mtt_centers=mtt_centers,
                    nbinsx=nbinsx,
                    nbinsy=nbinsy)
                chi2_pval = chisquare(
                    f_obs=smoothed_array,
                    f_exp=orig_smooth_hist.values()[()]
                )  # convert to expected yields so inputs are greater than 5
                chi2_pvals[idx] = np.array(
                    [chi2_pval.statistic, chi2_pval.pvalue])
                smoothed_rel_dev_arrays[idx] = (
                    smoothed_array -
                    nosys_hist.values()[()]) / nosys_hist.values()[()]
            ## find 68% and 95% intervals
            plus_one_sigma_smooth_vals, minus_one_sigma_smooth_vals = np.zeros(
                nbins), np.zeros(nbins)
            plus_two_sigma_smooth_vals, minus_two_sigma_smooth_vals = np.zeros(
                nbins), np.zeros(nbins)
            # NOTE: ``bin`` shadows the builtin inside this loop.
            for bin in range(nbins):
                # Band edges are order statistics of the toy deviations.
                plus_one_sigma_smooth_vals[bin] = np.sort(
                    smoothed_rel_dev_arrays[:, bin])[plus_one_sigma_ind]
                minus_one_sigma_smooth_vals[bin] = np.sort(
                    smoothed_rel_dev_arrays[:, bin])[minus_one_sigma_ind]
                plus_two_sigma_smooth_vals[bin] = np.sort(
                    smoothed_rel_dev_arrays[:, bin])[plus_two_sigma_ind]
                minus_two_sigma_smooth_vals[bin] = np.sort(
                    smoothed_rel_dev_arrays[:, bin])[minus_two_sigma_ind]
            # plot relative deviation
            fig, ax = plt.subplots()
            fig.subplots_adjust(hspace=.07)
            # original relative deviations
            orig_masked_vals, orig_masked_bins = Plotter.get_ratio_arrays(
                num_vals=orig_template.values()[()] - nosys_hist.values()[()],
                denom_vals=nosys_hist.values()[()],
                input_bins=nosys_hist.dense_axes()[0].edges())
            ax.step(orig_masked_bins,
                    orig_masked_vals,
                    where="post",
                    **{
                        "color": "k",
                        "linestyle": "-",
                        "label": "Original"
                    })
            # original smoothing relative deviations
            orig_smoothed_masked_vals, orig_smoothed_masked_bins = Plotter.get_ratio_arrays(
                num_vals=orig_smooth_hist.values()[()] -
                nosys_hist.values()[()],
                denom_vals=nosys_hist.values()[()],
                input_bins=nosys_hist.dense_axes()[0].edges())
            ax.step(orig_smoothed_masked_bins,
                    orig_smoothed_masked_vals,
                    where="post",
                    **{
                        "color": "r",
                        "linestyle": "-",
                        "label": "Original Smoothing"
                    })
            # plot 68 and 95% intervals for yields
            # The last value is repeated so each array has one entry per bin
            # edge, as needed for step="post".
            ax.fill_between(nosys_hist.dense_axes()[0].edges(),
                            np.r_[minus_one_sigma_smooth_vals,
                                  minus_one_sigma_smooth_vals[-1]],
                            np.r_[plus_one_sigma_smooth_vals,
                                  plus_one_sigma_smooth_vals[-1]],
                            where=np.r_[plus_one_sigma_smooth_vals,
                                        plus_one_sigma_smooth_vals[-1]] >
                            np.r_[minus_one_sigma_smooth_vals,
                                  minus_one_sigma_smooth_vals[-1]],
                            step="post",
                            **{
                                "label": "68%",
                                "facecolor": "#00cc00",
                                "alpha": 0.5
                            })
            ax.fill_between(nosys_hist.dense_axes()[0].edges(),
                            np.r_[minus_two_sigma_smooth_vals,
                                  minus_two_sigma_smooth_vals[-1]],
                            np.r_[plus_two_sigma_smooth_vals,
                                  plus_two_sigma_smooth_vals[-1]],
                            where=np.r_[plus_two_sigma_smooth_vals,
                                        plus_two_sigma_smooth_vals[-1]] >
                            np.r_[minus_two_sigma_smooth_vals,
                                  minus_two_sigma_smooth_vals[-1]],
                            step="post",
                            **{
                                "label": "95%",
                                "facecolor": "#ffcc00",
                                "alpha": 0.5
                            })
            ax.legend(loc="upper right", title=f"{sys}, {proc}")
            ax.axhline(
                0, **{
                    "linestyle": "--",
                    "color": (0, 0, 0, 0.5),
                    "linewidth": 1
                })
            ax.autoscale()
            # Leave headroom above the curves for the legend and labels.
            ax.set_ylim(ax.get_ylim()[0], ax.get_ylim()[1] * 1.15)
            ax.set_xlim(x_lims)
            ax.set_xlabel(
                "$m_{t\\bar{t}}$ $\otimes$ |cos($\\theta^{*}_{t_{l}}$)|")
            ax.set_ylabel("Rel. Deviaton from Nominal")
            # add lepton/jet multiplicity label
            ax.text(0.02,
                    0.94,
                    f"{leptypes[args.lepton]}, {jet_mults[jmult]}",
                    fontsize=rcParams["font.size"] * 0.9,
                    horizontalalignment="left",
                    verticalalignment="bottom",
                    transform=ax.transAxes)
            ## draw vertical lines for distinguishing different ctstar bins
            # Four lines at fifths of the x-range.
            vlines = [x_lims[1] * ybin / 5 for ybin in range(1, 5)]
            for vline in vlines:
                ax.axvline(vline, color="k", linestyle="--")
            hep.cms.label(ax=ax,
                          data=False,
                          paper=False,
                          year=args.year,
                          lumi=round(data_lumi_year[f"{args.lepton}s"] / 1000.,
                                     1))
            #set_trace()
            pltdir = os.path.join(outdir, args.lepton, jmult, sys)
            if not os.path.isdir(pltdir):
                os.makedirs(pltdir)
            figname = os.path.join(
                pltdir, "_".join([
                    jmult, args.lepton, sys, proc,
                    "SmoothingConfidenceIntervals"
                ]))
            fig.savefig(figname)
            print(f"{figname} written")
            plt.close()
def pearson_chisquare(dist, N):
    """Return the margin between the 5% critical value and the scaled statistic.

    ``dist`` is tested against the module-level ``ideal_distribution`` with
    ``scipy.stats.chisquare``; the statistic is multiplied by ``N`` and
    subtracted from 15.507, the chi-square critical value for 8 (9 - 1)
    degrees of freedom at the 5% significance level.

    A result greater than 0 means the test is passed (the data satisfies
    Benford's law at the 95% level, and larger is better); otherwise the
    test is failed.

    NOTE(review): ``dist`` is presumably a vector of proportions and ``N``
    the sample size, which is why the statistic is rescaled -- confirm
    against the callers.
    """
    critical_value = 15.507
    statistic = chisquare(dist, ideal_distribution)[0]
    return critical_value - statistic * N
def run_socnet_model(x, y, ct, g_id, cur, tag, ylabel, data_consolidated,
                     model_consolidated, curdate, text):
    """Fit the socnet-fitrs3 model to ``y`` and emit data, plot and report files.

    The forecast is requested from ``fitrs3.previsaoredeslp``.  Two outcomes:

    * No forecast (``None``): ``'n.a.'`` is appended twice to
      ``data_consolidated``, the raw data is dumped and a "no fit" SVG is
      produced.  No HTML report is written.
    * Forecast available: the chi-square statistic of ``y`` against the
      first ``len(y)`` forecast values is appended to ``data_consolidated``,
      the model name is appended to ``model_consolidated``, data/plot files
      are written and an HTML report is saved to ``report/<ct>-<g_id>.html``.

    Parameters
    ----------
    x, y : sequences
        Abscissa and observed values; ``y[-1]`` and ``len(y)`` are used to
        parameterise the fit.
    ct, g_id :
        Used to build every output file name; underscores in ``ct`` are
        replaced by spaces for display.
    cur, tag :
        Accepted for interface compatibility; not used in this function.
    ylabel, curdate, text : str
        Labels interpolated into plot titles and the report.
    data_consolidated, model_consolidated : list
        Mutated in place as described above.

    Returns
    -------
    None
    """
    import fitrs3
    from scipy.stats import chisquare

    rname = ct.replace('_', ' ')
    fdata = f'gpdata/dat/{ct}-{g_id}.dat'
    fgplot = f'gpdata/{ct}-{g_id}.gp'
    fsvg = f'svg/{ct}-{g_id}.svg'
    freport = f'report/{ct}-{g_id}.html'
    file1 = f'scnlog/{ct}-p1.dat'
    file2 = f'scnlog/{ct}-p2.dat'

    partition = len(y) // 4
    forecast = fitrs3.previsaoredeslp(y, 7, 200, 400, 100, file1, file2,
                                      partition, y[-1] + 50, y[-1] * 20, 4, 6,
                                      0.2, 0.7, 0, 101)

    if forecast is None:
        # NOTE(review): both placeholders go to data_consolidated, while the
        # success path appends one item to data_consolidated and one to
        # model_consolidated -- confirm this asymmetry is intended.
        data_consolidated.append('n.a.')
        data_consolidated.append('n.a.')
        dump_xy_dat(fdata, x, y)
        dump_svg(fgplot, fsvg, f'{text} for {rname} on {curdate}',
                 'Days from the first infected', f'{ylabel}', fdata, 2,
                 f"{rname} data", opt='colorsequence podo',
                 txt1='NO FIT AVAILABLE FOR THE CURRENT DATA', point=True)
        return

    # From here on a forecast exists, so no further None checks are needed
    # (the old ``forecast != None`` test is ambiguous for array forecasts).
    chisqr = chisquare(y, f_exp=forecast[:len(y)])[0]
    data_consolidated.append(chisqr)
    model_consolidated.append('socnet-fitrs3')

    nx = np.arange(len(forecast))
    dump_xyz_dat(fdata, nx, y, forecast)
    dump_svg2D(fgplot, fsvg, f'{text} for {rname} on {curdate}',
               'Days from the first infected', f'{ylabel}', fdata, 2, 3,
               f"{rname} data", f'{text}', opt='yrange [0<*:]',
               txt1=f'SOCNET', txt2=f'χ² = {chisqr:9.2}')

    table_info = f'<tr><td>Success status</td><td>Forecast calculated with socnet-fitrs3</td></tr>'
    table_info += f'<tr><td>Abort status</td><td>n.a.</td></tr>'
    table_info += f'<tr><td>Fit message</td><td>n.a.</td></tr>'
    table_stat = '<tr> <td>n.a</td></tr>'

    # Header row followed by one row per day; shorter series are padded
    # with the string 'nan' by zip_longest.
    rows = [f'<tr><th>Days from the first infected</th><th>{ylabel}</th><th>Model {ylabel}</th></tr>']
    for i, j, k in itertools.zip_longest(nx, y, forecast, fillvalue='nan'):
        rows.append(f'<tr><td>{i}</td><td>{j}</td><td>{k:.0f}</td></tr>')
    table_obs = ''.join(rows)

    with open(freport, 'w') as f:
        f.write(param_page(rname, table_info, table_stat, table_obs, fsvg))
    return
def get_top_labels():
    """Plot the labels most over-represented for images of women and of men.

    Reads ``mc_data_replicated.tsv`` (first row is a header; column 0 is the
    image name, column 1 a comma-separated label list), tallies how often
    each label is applied per gender (gender comes from ``_get_genders()``),
    converts the tallies of labels used at least 5 times into percentages
    of that gender's images, scores each label with a chi-square test
    against the other gender's percentage, and shows two horizontal bar
    charts with the 25 highest-scoring labels per gender.
    """
    genders = _get_genders()
    print(genders)

    female_tally = {}
    male_tally = {}
    with open('mc_data_replicated.tsv', 'rt') as handle:
        rows = csv.reader(handle, delimiter='\t')
        next(rows)  # skip the first row, which has headings
        for row in rows:
            image_name = unidecode.unidecode(row[0])
            raw_labels = row[1].split(",")
            # Anything that is not "Male" is counted on the female side.
            tally = male_tally if genders[image_name] == "Male" else female_tally
            for raw_label in raw_labels:
                key = raw_label.strip()
                tally[key] = tally.get(key, 0) + 1

    n_female = sum(1 for g in genders.values() if g == "Female")
    n_male = len(genders) - n_female

    def _rank(own_tally, own_total, other_tally, other_total):
        # Rows are [label, own %, other %, chi2] for labels used >= 5 times
        # (as done in the paper).
        scored = []
        for name, uses in own_tally.items():
            if uses < 5:
                continue
            own_pct = uses / own_total * 100
            other_pct = 0
            if name in other_tally:
                other_pct = other_tally[name] / other_total * 100
            stat, _ = chisquare([own_pct, other_pct])
            scored.append([name, own_pct, other_pct, stat])
        # Top 25 by chi2 among labels occurring more often than for the
        # other gender, then ordered by own percentage for display.
        by_stat = sorted(scored, key=lambda entry: entry[3])
        over_represented = [entry for entry in by_stat if entry[1] > entry[2]]
        return sorted(over_represented[-25:], key=lambda entry: entry[1])

    def _plot(entries, women_col, men_col, title):
        fig, ax = plt.subplots()
        positions = np.arange(len(entries))
        bar_width = 0.35
        ax.barh(positions - bar_width / 2, [e[women_col] for e in entries],
                bar_width, label='Women')
        ax.barh(positions + bar_width / 2, [e[men_col] for e in entries],
                bar_width, label='Men')
        ax.set_xlabel('% receiving each label')
        ax.set_title(title)
        ax.set_yticks(positions)
        ax.set_yticklabels([e[0] for e in entries])
        ax.legend()
        plt.show()

    top_female = _rank(female_tally, n_female, male_tally, n_male)
    top_male = _rank(male_tally, n_male, female_tally, n_female)

    _plot(top_female, 1, 2, 'Top labels for images of women')
    _plot(top_male, 2, 1, 'Top labels for images of men')
# Load the tab-separated input table.
df = pd.read_csv(infname, sep='\t')

# Column-wise totals of the five DLQ01 response columns (resp-0 .. resp-4).
# NOTE(review): presumably each column is a 0/1 indicator so the sums are
# response frequencies -- confirm against the file that writes `infname`.
DLQ_COLS = [f'DLQ01_resp-{i}' for i in range(5)]
freqs = df[DLQ_COLS].sum(axis=0).values

########## stats on the frequencies ###########

# Result table: one row per comparison, filled in as each test is run.
comparisons = ['across_DLQ01', 'across_nonzeroDLQ01', 'zeroVSnonzero_DLQ01']
index = pd.Index(comparisons, name='comparison')
stats_df = pd.DataFrame(columns=['test', 'chisq', 'pval'], index=index)

# Use chi2 to test the difference among a group
# of proportions, and then pairwise with binomial test.

# is there a difference among the whole DLQ score?
# (no f_exp is given, so chisquare tests against equal frequencies)
chisq, p = stats.chisquare(freqs)
stats_df.loc['across_DLQ01', ['test', 'chisq', 'pval']] = ['chisquare', chisq, p]

# is there a difference among the lucidity options (non-zero)?
nonzero_opts = freqs[1:]
chisq, p = stats.chisquare(nonzero_opts)
stats_df.loc['across_nonzeroDLQ01', ['test', 'chisq', 'pval']] = ['chisquare', chisq, p]

# compare if half the nights had LDs or not**
# but note that we don't really care about this,
# since we also highlight how it depends on how
# you measure success. But run this just to be able
# to say there were about half LDs
nonlucid = freqs[0]
# dx = bin_size/100 # f_exp2 = [] # f_exp4 = [] # # for i in range(len(bins5)-1): # x = np.arange(bins5[i], bins5[i+1], dx) # y2 = fit_function(x, *fitparams2) # y4 = bi_gaussian(x, *fitparams4) # area2 = np.trapz(y2, x) # area4 = np.trapz(y4, x) # f_exp2.append(area2) # f_exp4.append(area4) chisq2_bincenter, p2_bincenter = chisquare(n5, fit_function( bins5_mid, *fitparams2), ddof=len(guesses2) - 1) chisq4_bincenter, p4_bincenter = chisquare(n5, bi_gaussian( bins5_mid, *fitparams4), ddof=len(guesses4) - 1) # chisq2_area, p2_area = chisquare(n5, f_exp2, ddof=len(guesses2)-1) # chisq4_area, p4_area = chisquare(n5, f_exp4, ddof=len(guesses4)-1) chisqdof2 = chisq2_bincenter / dof2 chisqdof4 = chisq4_bincenter / dof4 pos2a, pos2b, wid2a, wid2b, amp2a, amp2b, r2a, r2b = fitparams2 f = open(
print(f"Skewness: {skew(bio[symbol])}") print(f"Kurtosis: {kurtosis(bio[symbol])}") print() for symbol in trio.columns: print(f"{symbol}:") print(f"Mean: {trio[symbol].mean()}") print(f"STD: {trio[symbol].std()}") print(f"Variance: {trio[symbol].var()}") print(f"Skewness: {skew(trio[symbol])}") print(f"Kurtosis: {kurtosis(trio[symbol])}") print() for symbol in uni.columns: print(f"{symbol}: ") print(kstest(uni[symbol], "norm")) print(chisquare(uni[symbol])) print() for symbols in bio.columns: print(f"{symbols}: ") print(kstest(bio[symbols], "norm")) print(chisquare(bio[symbols])) print() for symbols in trio.columns: print(f"{symbols}: ") print(kstest(trio[symbols], "norm")) print(chisquare(trio[symbols])) print()
#print(fil_max) #print(fil_max[0]) #print(fil_max[1]) pdays = [] for k in range(0, len(arr)): d = arr.iloc[k]['epoch'] - fil_max[0] #print(d) pdays.append(d) plt.plot(pdays, p, 'o') plt.gca().invert_yaxis() plt.show() obs = np.array([-2, -4, -9.0, -6]) pred = np.array([-3, -4, -8, -8]) a = ((obs - pred)**2) chi = chisquare(obs.astype(np.float64), pred.astype(np.float64)) print(chi) a = 4 print(a**2) #Working with Takashi's models list = os.listdir("/Users/bhagyasubrayan/Desktop/Plastic/public_data/") lines = open('/Users/bhagyasubrayan/Desktop/Plastic/public_data/' + list[0]).readlines() a = open('modelnames.txt', 'w').writelines(lines[2:]) with open('modelnames.txt', 'r') as in_file: stripped = (line.strip() for line in in_file) lines = (line.split() for line in stripped if line) with open('model.csv', 'w') as out_file: writer = csv.writer(out_file) #writer.writerow(('name', 'mass'))
def pearson_chisquare_pval(counts, N):
    """Return the chi-square p-value of ``counts`` against the ideal distribution.

    The expected frequencies are the module-level ``ideal_distribution``
    scaled by ``N``.

    Note: when computing a p-value, always pass observed counts (numbers of
    occurrences), not probabilities.
    """
    expected_counts = ideal_distribution * N
    result = chisquare(counts, expected_counts)
    return result[1]
1] + 1 for i in range(24, 28, 1): if datalist[row][i] != "Null": Multi_Flag_2017 = Multi_Flag_2017 + 1 if Multi_Flag_2017 > 2: if datalist[row][0][0] == "0": Multi_Count_2017[ int(datalist[row][0][1]) - 1] = Multi_Count_2017[int(datalist[row][0][1]) - 1] + 1 elif datalist[row][0][0] == "1": Multi_Count_2017[ int(datalist[row][0][0:2]) - 1] = Multi_Count_2017[int(datalist[row][0][0:2]) - 1] + 1 Multi_Flag_2017 = 0 Corr_Matrix = np.vstack((Total_Count_2017, Multi_Count_2017)) [chistatistics, ptest] = chisquare(Corr_Matrix) print(Total_Count_2017, Multi_Count_2017, chistatistics[0]) # We can use collision locations to estimate the areas of the zip code regions. All_Zip_Code = list('') All_Zip_Collision = list('') for row in range(1, row_count, 1): if datalist[row][0][-1:] == "7": if datalist[row][3] != 'Null': a = datalist[row][3] if a in All_Zip_Code and datalist[row][4] != 'Null' and datalist[ row][5] != 'Null': zipcode = All_Zip_Code.index(a) All_Zip_Collision[zipcode] = All_Zip_Collision[zipcode] + 1
else: cur_pixel = pixels[x, y] histogram[cur_pixel] += 1 obs = [] exp = [] X = 0 for y in range(1, len(histogram), 2): x = histogram[y - 1] z = (histogram[y - 1] + histogram[y]) / 2 if x > 0 and z > 0: obs.append(x) exp.append(z) obs = numpy.array(obs) exp = numpy.array(exp) chi, pval = stats.chisquare(obs, exp) chis.append(chi) pvals.append(pval) # print(pval) # if pval<=0.01: # if last_pval < 0.01: # print("Cover") # exit() # else: # break # last_pval=pval # print(sum(pvals), sum(chis)) # print("Stego, length: %d" % sz) if abs(sum(pvals) - 0.1) <= 0.1:
def enrichment_and_fold_change(self, seg_dict, min_valid=15):
    """Test whether the lowest/highest values concentrate in the extreme segments.

    ``seg_dict`` maps a segment key to a list of numeric values.  All values
    are pooled and sorted; the lowest block (sized like the segment with the
    smallest mean, extended over ties) and the highest block (sized like the
    segment with the largest mean, extended over ties) are each compared,
    with a chi-square test, against the membership expected from the
    segments' relative sizes.

    Parameters
    ----------
    seg_dict : dict
        Segment key -> non-empty list of values.
    min_valid : int
        Minimum number of strictly positive pooled values required to run
        the tests.

    Returns
    -------
    tuple
        ``(means, observed, (seg_min, seg_max), (p_low, p_high))`` where
        ``means`` maps segment -> mean value, ``observed`` maps segment ->
        fraction of strictly positive values, ``seg_min``/``seg_max`` are the
        keys with the smallest/largest mean, and ``p_low``/``p_high`` are the
        chi-square p-values for the low and high blocks.  The p-values are
        ``(1.0, 1.0)`` when there are too few positive values or when a
        block runs to the end of the pooled data.
    """
    seg_lens = {k: len(V) for k, V in seg_dict.items()}
    sum_lens = float(sum(seg_lens.values()))

    # Every value tagged with its segment, ascending by value (ties by key).
    seg_cnts = sorted((c, k) for k, V in seg_dict.items() for c in V)
    total = len(seg_cnts)

    seg_means = sorted([(k, np.mean(V)) for k, V in seg_dict.items()],
                       key=lambda x: x[1])
    seg_obs = sorted([(k, len([v for v in V if v > 0]) / float(len(V)))
                      for k, V in seg_dict.items()])
    seg_min, seg_max = seg_means[0][0], seg_means[-1][0]

    # Result returned whenever the tests cannot be run meaningfully.
    no_enrichment = (dict(seg_means), dict(seg_obs), (seg_min, seg_max),
                     (1.0, 1.0))

    seg_valid = sum(1 for c, _ in seg_cnts if c > 0)
    if seg_valid < min_valid or seg_valid < (seg_lens[seg_max] / 5.0):
        return no_enrichment

    min_len, max_len = seg_lens[seg_min], seg_lens[seg_max]

    # Low block: the first min_len values, extended over values tied with the
    # last one.  The bound is checked BEFORE indexing so a block that already
    # covers all data (e.g. a single segment) no longer raises IndexError.
    min_seg = seg_cnts[0:min_len]
    i = len(min_seg)
    while i < total and min_seg[-1][0] == seg_cnts[i][0]:
        min_seg.append(seg_cnts[i])
        i += 1
    if i == total:
        return no_enrichment

    if max_len > total - i:
        # Not enough values left: the high block is everything after the low one.
        max_seg = seg_cnts[i::]
    else:
        # High block: the last max_len values, extended over ties.
        seg_rev = seg_cnts[-1::-1]
        max_seg = seg_rev[0:max_len]
        i = len(max_seg)
        while i < total and max_seg[-1][0] == seg_rev[i][0]:
            max_seg.append(seg_rev[i])
            i += 1
        if i == total:
            return no_enrichment

    min_len, max_len = len(min_seg), len(max_seg)

    # Expected membership if block members were drawn proportionally to
    # segment size.
    AAexp, ABexp = min_len * (seg_lens[seg_min] / sum_lens), min_len * (
        seg_lens[seg_max] / sum_lens)
    BAexp, BBexp = max_len * (seg_lens[seg_min] / sum_lens), max_len * (
        seg_lens[seg_max] / sum_lens)

    AAobs = len([x for x in min_seg if x[1] == seg_min])
    ABobs = len([x for x in min_seg if x[1] == seg_max])
    BAobs = len([x for x in max_seg if x[1] == seg_min])
    BBobs = len([x for x in max_seg if x[1] == seg_max])

    chi_low = chisquare([AAobs, ABobs], f_exp=[AAexp, ABexp])[1]
    chi_hi = chisquare([BAobs, BBobs], f_exp=[BAexp, BBexp])[1]
    return dict(seg_means), dict(seg_obs), (seg_min, seg_max), (chi_low, chi_hi)
# plt.plot(np.linspace(int(.05*EPOCH), EPOCH, int(.95*EPOCH)), np.asarray(losses)[int(.05*EPOCH):], 'bo', label='Loss') # plt.plot(np.linspace(int(.05*EPOCH), EPOCH, int(.95*EPOCH)), np.zeros(int(0.95*EPOCH))+float(loss.data.float()), 'g--', label='Final Loss = %.3e' % (float(loss.data.float()))) # plt.legend() # plt.show() ReHfit = torch.mean(torch.transpose(p, 0, 1)[0]).data.numpy() ReEfit = torch.mean(torch.transpose(p, 0, 1)[1]).data.numpy() ReHTfit = torch.mean(torch.transpose(p, 0, 1)[2]).data.numpy() fit_cffs = [ReHfit, ReEfit, ReHTfit] # plt.plot(phi[a:b], ydat[a:b], 'bo', label='data') # plt.plot(phi[a:b], f(xdat,fit_cffs), 'g--', label='fit') # plt.legend() # plt.show() err_H.append(abs(100 * (abs(fit_cffs[0] - ReH_target[a])) / ReH_target[a])) err_E.append(abs(100 * (abs(fit_cffs[1] - ReE_target[a])) / ReE_target[a])) err_HT.append( abs(100 * (abs(fit_cffs[2] - ReHT_target[a])) / ReHT_target[a])) print('Chi-Squared Value for this fit: %.3e' % (chisquare(f(xdat, fit_cffs), ydat[a:b])[0])) print('MSE Loss Value for this fit: %.3e' % (float(loss.data.float()))) print('Average Error for set #%d using ANN = %.2f%%' % ((datset), ((err_H[-1] + err_E[-1] + err_HT[-1]) / 3))) #dvcsfit.fit_scipy(datset) print('\n\033[1m%s%.2f%%' % ('Avg. Error of ReH = ', sum(err_H) / len(err_H))) print('\033[1m%s%.2f%%' % ('Avg. Error of ReE = ', sum(err_E) / len(err_E))) print('\033[1m%s%.2f%%' % ('Avg. Error of ReHT = ', sum(err_HT) / len(err_HT)))
def fitmodelnewx(model, x, y, dy): # p0=np.array([k,B,omi,E0,alpha]) p0 = np.array([k, B, omi]) # popt, pcov = curve_fit(model, x, y, p0, sigma=dy, bounds=([0.001,0.0001,0.001,0.00001,0.0], [4.0, 100.,50.0,100.0,1.0])) popt, pcov = curve_fit(model, x, y, p0, sigma=dy, bounds=([0.001, 0.0001, 0.0001], [2.0, 100., 9.0])) # popt, pcov = curve_fit(model, x, y, p0, sigma=dy, bounds=(0., [1.8, 10.,10.,100.])) #popt, pcov = curve_fit(model, x, y, p0, sigma=dy) print "------ " print " k [", k, "] =", "%.5f" % popt[0], "+/-", "%.5f" % pcov[0, 0]**0.5 print " B [", B, "(10^14 G)] =", "%.5f" % popt[ 1], "+/-", "%.5f" % pcov[1, 1]**0.5 print " omi [2pi/spin_i=", omi, "(10^3 Hz)] =", "%.5f" % popt[ 2], "+/-", "%.5f" % pcov[2, 2]**0.5 print " Spin Period [ms]=", 2.0 * np.pi / popt[ 2], "+/-", 2.0 * np.pi / popt[2] * (pcov[2, 2]**0.5) / popt[2] print " E0 [fixed (10^51 erg)] =", E0newa print " alpha (fixed) =", alphax print " E051=(L(Ttstart))*Tstart/k=", 10**(model( np.log10(startTxrt), popt[0], popt[1], popt[2])) * startTxrt / popt[0] print "------ " E051 = 10**(model(np.log10(startTxrt), popt[0], popt[1], popt[2])) * startTxrt / popt[0] Pms = 2.0 * np.pi / popt[2] dPms = Pms * (pcov[2, 2]**0.5) / popt[2] ym = model(x, popt[0], popt[1], popt[2]) print stats.chisquare(f_obs=y, f_exp=ym) mychi = sum(((y - ym)**2) / dy**2) #mychi=sum(((y-ym)**2)/ym) dof = len(x) - len(popt) print "my chisquare=", mychi print "dof=", dof p_value = 1 - stats.chi2.cdf(x=mychi, df=dof) print "P value", p_value bfmodel = model(np.log10(t), popt[0], popt[1], popt[2]) out_file = open(outfilenewx, "a") out_file.write(fi + "," + str(startTxrt) + "," + str(E051) + "," + str(alphax) + "," + str("%.5f" % popt[0]) + "," + str("%.5f" % pcov[0, 0]**0.5) + "," + str("%.5f" % popt[1]) + "," + str("%.5f" % pcov[1, 1]**0.5) + "," + str("%.5f" % Pms) + "," + str("%.5f" % dPms) + "," + str("%.5f" % mychi) + "," + str("%.5f" % dof) + "," + str("%.5f" % p_value) + "\n") out_file.close() return plt.plot(np.log10(t), 
bfmodel, 'c', label='CS06 alpha = 0.1 (fit)')
def __call__(self, parameters):
    """Simulate every stake condition and return a chi-square cost against ``data``.

    For each ``(gain, loss)`` pair in ``self.allStakes`` the model is run
    ``self.numSimulationsPerCondition`` times via ``lcaWrapper``; simulations
    rejected by ``filterFunction`` are dropped.  The simulated reaction times
    are shifted so their mean matches the observed mean (shift capped at
    ``parameters[3]``), and the per-condition, per-choice RT histograms of
    model and data are compared with ``chisquare``.

    Returns ``(-1, parameters[3])`` as soon as any condition has fewer than a
    third of its simulations valid; otherwise
    ``(totalCost, parameters[3] - delta)``.

    Reads the module-level ``data`` array; as used here column 0 is the
    choice and column 1 the reaction time.
    """
    # Dummy first row so vstack has something to stack onto; dropped below.
    allModelData = np.zeros(4)
    for stakes in self.allStakes:
        gainValue, lossValue = stakes
        allSimulations = [
            lcaWrapper(gainValue, lossValue, *parameters)
            for _ in range(self.numSimulationsPerCondition)
        ]
        allValidResponseSimulations = list(
            filter(filterFunction, allSimulations))
        numValidResponses = len(allValidResponseSimulations)
        # Too few usable simulations for this condition: signal failure.
        if numValidResponses < self.numSimulationsPerCondition / 3:
            return (-1, parameters[3])
        _, allModelRTs, allModelResponses = zip(
            *allValidResponseSimulations)
        modelStakes = np.hstack((np.full(
            (numValidResponses, 1), gainValue), np.full((numValidResponses, 1), lossValue)))
        # Rows are laid out as [response, RT, gain, loss].
        modelDataForStakes = np.hstack(
            (np.array(allModelResponses).reshape(-1, 1),
             np.array(allModelRTs).reshape(-1, 1), modelStakes))
        allModelData = np.vstack((allModelData, modelDataForStakes))
    allModelData = allModelData[1:, :]
    # Align simulated and observed mean RT, shifting by at most parameters[3].
    actualDataMeanRT = np.mean(data[:, 1])
    simDataMeanRT = np.mean(allModelData[:, 1])
    delta = simDataMeanRT - actualDataMeanRT
    if delta > parameters[3]:
        delta = parameters[3]
    allModelData[:, 1] = allModelData[:, 1] - delta
    totalCost = 0
    quantiles = np.array([0.1, 0.3, 0.5, 0.7, 0.9])  # quantiles of the chi^2 function
    observedProportionsChoiceWise = np.array(
        [0.1, 0.2, 0.2, 0.2, 0.2, 0.1])  # this is to cover some edge cases
    for stakes in self.allStakes:  # loop over all combinations of possible gain and loss
        gain, loss = stakes
        observedTrials = selectConditionTrials(data, gain, loss)
        numObservedTrials = np.shape(observedTrials)[0]
        modelTrials = selectConditionTrials(allModelData, gain, loss)
        numModelTrials = np.shape(modelTrials)[0]
        for choice in range(
                2):  # loop over choice = 0 (reject) and 1 (accept)
            observedTrialsForChoice = observedTrials[
                observedTrials[:, 0] == choice]
            observedRTsForChoice = observedTrialsForChoice[:, 1]
            numObservedRTsForChoice = np.size(observedRTsForChoice)
            observedPOfThisChoice = numObservedRTsForChoice / numObservedTrials
            if numObservedRTsForChoice < 5:  # less than 5 trials --> can't compute quantile boundaries
                continue  # skip this combination of gain, loss, choice
            quantilesBoundaries = np.quantile(observedRTsForChoice, quantiles)
            # Histogram bin edges: 0, the five observed quantiles, 100.
            observedProportions = \
                np.histogram(observedRTsForChoice, bins=np.concatenate(([0], quantilesBoundaries, [100])))[
                    0] / numObservedTrials  # proportions of experimental RTs in all quantiles
            if numObservedRTsForChoice == 5 or 0 in observedProportions:  # some edge cases
                observedProportions = observedProportionsChoiceWise * observedPOfThisChoice
            observedFrequencies = numObservedTrials * observedProportions
            modelTrialsForChoice = modelTrials[modelTrials[:, 0] == choice]
            modelRTsForChoice = modelTrialsForChoice[:, 1]
            numModelRTsForChoice = np.size(modelRTsForChoice)
            # Model RTs are binned with the boundaries derived from the
            # observed RTs.
            modelProportions = \
                np.histogram(modelRTsForChoice, bins=np.concatenate(([0], quantilesBoundaries, [100])))[
                    0] / numModelTrials
            # Scaled by the observed trial count so both frequency vectors
            # are on the same scale.
            modelFrequencies = numObservedTrials * modelProportions
            totalCost += chisquare(modelFrequencies, observedFrequencies)[0]
    return (totalCost, parameters[3] - delta)
def time_chisqure(self):
    """Run ``stats.chisquare`` on ``self.chisq``; the result is discarded."""
    observed = self.chisq
    stats.chisquare(observed)
def compare_histograms(hist, hist_vocab):
    """Return the chi-square statistic of ``hist`` with ``hist_vocab`` as expectation."""
    statistic = chisquare(hist, f_exp=hist_vocab)[0]
    return statistic
def generate_page():
    """Render the Streamlit page on sampling and the chi-square test.

    The page has three parts:

    1. An explanation of sampling, followed by a simulation that draws
       ``nb_echantillons`` samples of ``sample_size`` individuals from a
       population whose genotype frequencies are chosen with a slider.
    2. An explanation of the chi-square test, illustrated with
       ``src.plots.chi2_curve``.
    3. A Hardy-Weinberg test on genotype counts entered by the user.

    Statistical notes:

    * The three genotype counts of a sample are drawn jointly from a
      multinomial distribution, so they always sum to ``sample_size`` and
      can never be negative.
    * The Hardy-Weinberg test estimates the allele frequency from the sample
      itself, so it has 3 - 1 - 1 = 1 degree of freedom (``ddof=1``).
    """
    st.markdown(
        """
        ## Qu'est-ce qu'un échantillon ?

        Si on souhaite mesurer une caractéristique sur une grande population, on doit mesurer tous les individus de la population, mais il est souvent impossible de faire la mesure sur la population entière. Dans la pratique, on choisit un échantillon aléatoire de la population, c'est a dire plusieurs individus pris au hasard que l'on mesure pour avoir une approximation.

        Dans une population de 3 million d'individus avec deux allèles `A` et `a` pour un gène et ses trois génotypes associés (`AA`), (`Aa`), et (`aa`), il est plus facile de compter les génotypes de 100 individus pour avoir les fréquences génotypiques plutôt que de compter les 3 millions d'individus. On distingue la **mesure sur l'échantillon** obtenue après la avoir compté les 100 individus, de la **mesure théorique** que l'on aurait obtenue si on avait compté toute la population.

        Un échantillon est une représentation imparfaite de la population. Il se peut que par hasard, il contienne plus d'individus (`AA`), ou au contraire, plus d'individus (`aa`). Par conséquent, la mesure obtenue sur l'échantillon n'est quasiment jamais exactement égale à la mesure théorique. D'ailleurs un autre échantillon, contenant 100 individus différents, aurait des fréquences génotypiques sensiblement différentes de notre premier échantillon.

        De manière générale, plus un échantillon est grand, plus il y a de chance que la valeur estimée soit proche de la valeur théorique. A l'inverse, plus l'échantillon est petit, plus les valeurs estimées seront, en moyenne, éloignées de la valeur théorique.

        ## Échantillonnage aléatoire d'une population connue

        Pour estimer la fréquence génotypique de la population, on prend plusieurs échantillons aléatoirement dans la population. On observe grâce à un tirage aléatoire d'individus qui forment nos échantillons, que chaque échantillon possède des proportions de génotypes différentes.
        """
    )

    # The two slider handles split [0, 1] into the three genotype frequencies.
    x = st.slider(
        "Fréquence théorique des génotypes (AA), (Aa), (aa)",
        0.0,
        1.0,
        (0.25, 0.75),
        0.01,
    )
    population_ratio = np.array([x[0], x[1] - x[0], 1 - x[1]])

    sample_size = st.number_input(
        "Nombre d'individus dans l'échantillon", 0, 1000000, 100, 10
    )
    nb_echantillons = st.number_input("Nombre d'échantillons", 0, 30, 10, 1)

    # One multinomial draw per sample: the three genotype counts are drawn
    # jointly and sum to sample_size.  (Independent binomial draws for AA and
    # Aa could exceed sample_size and produce negative aa counts.)
    multiple_echantillon = np.random.multinomial(
        sample_size, population_ratio, size=nb_echantillons
    ).astype(float)

    fig = src.plots.display_echantillons(
        multiple_echantillon, truth=population_ratio * sample_size
    )
    st.pyplot(fig)

    st.markdown(
        """
        ## Déduire la fréquence des génotypes de la population totale

        Une fois que l'on a mesuré les fréquences des 3 génotypes, on peut demander quelle est la fréquence génotypique dans la population totale. La meilleure estimation que l'on puisse avoir est celle de l'échantillon. Par exemple, si on compte 30% de (`AA`) dans notre échantillon, on peut dire "la fréquence allélique des (`AA`) dans la population totale est d'environ 30%". On a une chance de me tromper bien sûr, mais elle est moins importante que si on avait dit "la fréquence allélique des (`AA`) dans la population totale est d'environ 60%".

        On peut mesurer les chances de se tromper grâce aux tests statistiques. Pour cela il faut prendre le problème à l'envers. On va faire une hypothèse sur la population (par exemple, l'hypothèse qu'il y a 33% de (`AA`), 33% de (`Aa`), et 34% de (`aa`)) et on va mesurer la chance d'obtenir aléatoirement l'échantillon que l'on vient de mesurer si cette hypothèse est vraie.

        Ainsi dans cette exemple, si notre échantillon de 100 individus contient 60% de (`AA`), 20% de (`Aa`), et 20% de (`aa`), il y a moins de 1% de chance que notre échantillon soit originaire d'une population qui suit notre hypothèse. On peut donc dire:

        - L'échantillon provient d'une population avec 33% de (`AA`), 33% de (`Aa`), et 34% de (`aa`) mais on a moins de 0.1% de chance d'avoir raison
        - L'échantillon ne provient pas d'une population avec 33% de (`AA`), 33% de (`Aa`), et 34% de (`aa`) et on a plus de 99.9% de chance d'avoir raison

        Du coup on va conclure que la population a de grande chance de ne pas avoir 33% de (`AA`), 33% de (`Aa`), et 34% de (`aa`).

        En revanche, si notre échantillon de 100 individus contient 32% de (`AA`), 35% de (`Aa`), et 33% de (`aa`), il y a de grande chance que notre échantillon soit originaire d'une population qui suit notre hypothèse. La différence entre les fréquences alléliques de notre hypothèse, vient certainement du hasard de l'échantillon. On en conclura qu'il y a de grande chance que notre échantillon ait été pris dans une population avec des fréquences alléliques de 33% (`AA`), 33% (`Aa`), et 34% (`aa`).

        Il existe une équation qui nous donne la probabilité que notre échantillon ait été pris dans une population en fonction des fréquences génotypiques hypothétiques de la population, et de celle mesurées dans notre échantillon. On appelle cette équation l'équation du Chi2. Pour chaque génotype $i$, on compare le nombre d'individus que l'on a observé ($Obs_i$) contre le nombre d'individus que l'on aurait dû obtenir si la population suivait l'hypothèse $Theo_i$.

        $$
        Chi2 = \\sum{\\frac{(Obs_i - Theo_i)^2}{Theo_i} }
        $$

        Par exemple dans le cas de notre premier échantillon avec 60% de (`AA`), 20% de (`Aa`), et 20% de (`aa`), on a compté 60 (`AA`), 20 (`Aa`), et 20 (`aa`) dans notre échantillon. Si notre population suit l'hypothèse décrite dans notre exemple, on s'attend idéalement à avoir 33 (`AA`), 33 (`Aa`), et 34 (`aa`). Du coup notre Chi2 vaut:

        $$
        Chi2 = \\frac{(60-33)^2}{33} + \\frac{(20-33)^2}{33} + \\frac{(20-34)^2}{34} = 32.977
        $$

        On peut voir quelle est la probabilité d'obtenir cet échantillon sur la courbe du Chi2.
        """
    )

    fig = src.plots.chi2_curve()
    st.pyplot(fig)

    st.markdown(
        """
        On peut voir que l'échantillon a une probabilité très faible de venir de la population de notre hypothèse. En d'autre terme, si on observe un tel échantillon, on est presque certain que la population n'est pas distribuée avec 33% de (`AA`), 33% de (`Aa`), et 34% de (`aa`).

        ## Exemple pratique avec l'équilibre de Hardy Weinberg

        Grace au test du Chi2, on peut maintenant déterminer si un échantillon dont on vient de mesurer les fréquences génotypiques provient d'une population qui est à l'équilibre de Hardy Weinberg.

        Pour cela on fait l'hypothèse que notre population est à l'équilibre de Hardy Weinberg avec $p^2 %$ de (`AA`), $2pq %$ de (`Aa`), et $q^2 %$ de (`aa`), ou $p$ et $q$ sont les fréquences allèliques respectives de l'allèle `A` et `a`.

        Entrez les valeurs que vous observez dans votre échantillon pour chaque génotype:
        """
    )

    col1, col2, col3 = st.beta_columns(3)
    with col1:
        AA = st.number_input(
            "nombre de (AA) dans l'échantillon", 0, 1000000, 0, 1
        )
    with col2:
        Aa = st.number_input(
            "nombre de (Aa) dans l'échantillon", 0, 1000000, 0, 1
        )
    with col3:
        aa = st.number_input(
            "nombre de (aa) dans l'échantillon", 0, 1000000, 0, 1
        )

    if AA == Aa == aa == 0:
        # nothing is inputted
        return

    N = AA + Aa + aa
    # Allele frequencies estimated from the sample, then the genotype
    # frequencies expected under Hardy-Weinberg equilibrium.
    fA = (AA + 0.5 * Aa) / N
    fa = 1 - fA
    fAA = fA ** 2
    fAa = 2 * fA * fa
    faa = fa ** 2

    # ddof=1: one parameter (the allele frequency) is estimated from the
    # data, leaving 3 - 1 - 1 = 1 degree of freedom.
    chi2_value, pvalue = chisquare(
        [AA, Aa, aa], f_exp=[fAA * N, fAa * N, faa * N], ddof=1
    )

    st.markdown(
        f"""
        On a donc un échantillon de taille {N}. La fréquence allélique de l'échantillon est de {np.around(fA, 2)} pour `A` et {np.around(fa, 2)} pour `a`.

        Par conséquent, si la population est à l'équilibre de Hardy Weinberg, on s'attend à avoir:

        - {np.around(100*fAA, 2)} % de (`AA`)
        - {np.around(100*fAa, 2)} % de (`Aa`)
        - {np.around(100*faa, 2)} % de (`aa`)

        On observe:

        - {np.around(100*AA/N, 2)} % de (`AA`)
        - {np.around(100*Aa/N, 2)} % de (`Aa`)
        - {np.around(100*aa/N, 2)} % de (`aa`)

        Le test du Chi2 produit une valeur de {np.around(chi2_value, 4)} et il y a, par conséquent, {np.around(pvalue*100, 7)} % de chance que notre échantillon provienne d'une population à l'équilibre de Hardy Weinberg.
        """
    )

    if pvalue < 0.05:
        st.markdown(
            """
            **Comme cette valeur est inférieure à 5%, on en conclut, avec moins de 5% de chance de se tromper que la population ne suit pas l'équilibre de Hardy Weinberg.**
            """
        )
    else:
        st.markdown(
            """
            **Comme cette valeur est supérieure à 5%, on en conclut qu'il a une forte chance que la population suive l'équilibre de Hardy Weinberg**
            """
        )
def fitmodelold(model, x, y, dy): # p0=np.array([k,B,omi,E0]) p0 = np.array([k, B, omi]) popt, pcov = curve_fit(model, x, y, p0, sigma=dy, bounds=([0.0001, 0.0001, 0.0001], [2.0, 100., 10.0])) # popt, pcov = curve_fit(model, x, y, p0, sigma=dy, bounds=(0., [1.8, 10.,10.,100.])) #popt, pcov = curve_fit(model, x, y, p0, sigma=dy) #Ein = 0.5*Ine*popt[2]**2 # initial spin energy 27.7 10^51 erg #tsdi = 3*Ine*c**3/(popt[1]**2*(r0)**6*popt[2]**2)*10**5 # Initial spin down time for the standard magnetic dipole formula 3.799*10^6/B2*omi**2 s #Li=Ein/tsdi print "------ " print " k [", k, "] =", "%.5f" % popt[0], "+/-", "%.5f" % pcov[0, 0]**0.5 print " B [", B, "(10^14 G)] =", "%.5f" % popt[ 1], "+/-", "%.5f" % pcov[1, 1]**0.5 print " omi [2pi/spin_i=", omi, "(kHz)] =", "%.5f" % popt[ 2], "+/-", "%.5f" % pcov[2, 2]**0.5 print " Spin Period [ms]=", 2.0 * np.pi / popt[ 2], "+/-", 2.0 * np.pi / popt[2] * (pcov[2, 2]**0.5) / popt[2] # print "E0 [",E0,"(10^51 erg)] =", "%.5f" %popt[3], "+/-", "%.5f" %pcov[3,3]**0.5 print " E0 (fixed) [10^51 erg) =", E0old print " L(Tt)=", model(np.log10(startTxrt), popt[0], popt[1], popt[2]) print " E051=(L(Ttstart))*Tstart/k=", 10**(model( np.log10(startTxrt), popt[0], popt[1], popt[2])) * startTxrt / popt[0] print "------ " E051 = (10**(model(np.log10(startTxrt), popt[0], popt[1], popt[2]))) * startTxrt / popt[0] Pms = 2.0 * np.pi / popt[2] dPms = Pms * (pcov[2, 2]**0.5) / popt[2] print 'Pms, dPms=', Pms, dPms # ym=model(x,popt[0],popt[1],popt[2],popt[3]) ym = model(x, popt[0], popt[1], popt[2]) print stats.chisquare(f_obs=y, f_exp=ym) mychi = sum(((y - ym)**2) / dy**2) #mychi=sum(((y-ym)**2)/ym) dof = len(x) - len(popt) print "my chisquare=", mychi print "dof=", dof p_value = 1 - stats.chi2.cdf(x=mychi, df=dof) print "P value", p_value bfmodel = model(np.log10(t), popt[0], popt[1], popt[2]) out_file = open(outfileold, "a") #out_file.write(fi+","+str(startTxrt)+","+str(E051)+","+str("%.5f" %popt[0])+","+str("%.5f" %pcov[0,0]**0.5)+","+str("%.5f" 
%popt[1])+","+str("%.5f" %pcov[1,1]**0.5)+","+str("%.5f" %popt[2])+","+str("%.5f" %pcov[2,2]**0.5)+","+str("%.5f" %mychi)+","+str("%.5f" %dof)+","+str("%.5f" %p_value)+"\n") out_file.write(fi + "," + str(startTxrt) + "," + str(E051) + "," + str("%.5f" % popt[0]) + "," + str("%.5f" % pcov[0, 0]**0.5) + "," + str("%.5f" % popt[1]) + "," + str("%.5f" % pcov[1, 1]**0.5) + "," + str("%.5f" % Pms) + "," + str("%.5f" % dPms) + "," + str("%.5f" % mychi) + "," + str("%.5f" % dof) + "," + str("%.5f" % p_value) + "\n") out_file.close() return plt.plot(np.log10(t), bfmodel, 'r', label='D11 (fit)')
# Observed and expected counts for the four sex/income cells.
observed = [6662, 1179, 15128, 9592]
expected = [5249.8, 2597.4, 16533.5, 8180.3]

# Pearson chi-square statistic computed by hand: sum of (O - E)^2 / E.
chisq_gender_income = sum(
    (obs - exp)**2 / exp for obs, exp in zip(observed, expected)
)

## 4. Finding statistical significance ##

from scipy.stats import chisquare

observed = [6662, 1179, 15128, 9592]
expected = [5249.8, 2597.4, 16533.5, 8180.3]
# chisquare returns (statistic, pvalue); keep only the p-value so the
# variable holds what its name says rather than the whole result.
_, pvalue_gender_income = chisquare(observed, expected)

## 5. Cross tables ##

import pandas

# Contingency table of sex (rows) by race (columns).
table = pandas.crosstab(income["sex"], income["race"])
print(table)

## 6. Finding expected values ##

import pandas
from scipy.stats import chi2_contingency

table = pandas.crosstab(income["sex"], income["race"])
def benfords(numbers):
    '''
    Examine the distribution of the first digits in a given corpus of
    numbers to see if they correspond to Benford's Law using a chi square
    test.

    Benford's Law, also known as the "first digit law" or the "law of
    anomalous numbers" states that there is a specific distribution pattern
    of the first digits of certain groups of numbers.  See
    https://en.wikipedia.org/wiki/Benford%27s_law for more info.

    The first *significant* digit of each value is used, so the sign is
    ignored and 0.05 counts as a 5.  Zero and non-finite values (NaN,
    infinity) have no leading digit and are left out of the test.

    :param numbers: The set of numbers to check against Benford's Law
    :type numbers: A list-like object (list, tuple, set, Pandas DataFrame
        or Series) containing floats or integers

    :raises TypeError: if ``numbers`` is not list-like

    :Return Value:
    The function returns three values in a tuple (chi2, p, counts):

      * 'chi2' is the chi-square statistic comparing the observed
        first-digit counts with the counts Benford's Law predicts for the
        same number of samples.  It is non-negative and unbounded; lower
        means a closer match.
      * 'p' is the p-value of that test (range 0..1).  A small value
        (e.g. < 0.05) is evidence that the data do NOT follow Benford's
        Law; a large value means no significant deviation was detected.
      * 'counts' is a Pandas series where the indices are the possible
        first digits 1-9 and the values are the observed proportions of
        those digits.  If the observed distribution didn't match up with
        Benford's law, the counts may help you identify the anomalous
        values.

    If no usable value remains, chi2 and p are NaN and every proportion
    is 0.
    '''
    def _first_digit(value):
        # The first character in 1-9 of the textual form is the leading
        # significant digit.  This skips the sign, leading zeros and the
        # decimal point ('-0.05' -> 5) and also works for scientific
        # notation ('1e-07' -> 1).  Zero, NaN and infinity contain no such
        # character and map to 0, which is discarded below.
        for char in str(value):
            if char in '123456789':
                return int(char)
        return 0

    _BENFORDS = [
        0.301,  # 1
        0.176,  # 2
        0.125,  # 3
        0.097,  # 4
        0.079,  # 5
        0.067,  # 6
        0.058,  # 7
        0.051,  # 8
        0.046   # 9
    ]

    if not is_list_like(numbers):
        raise TypeError(
            f'The argument must be a list or list-like of numbers, not type {type(numbers)}.'
        )
    if isinstance(numbers, pd.core.series.Series):
        numbers = numbers.values

    numbers = pd.DataFrame(numbers, columns=['numbers'])
    digits = numbers['numbers'].apply(_first_digit)

    # Tally digits 1-9 in order.  Reindexing drops the 0 bucket (values
    # without a leading digit) and gives unseen digits a count of 0.
    observed = digits.value_counts().reindex(range(1, 10), fill_value=0)
    num_samples = int(observed.sum())

    if num_samples == 0:
        # Nothing to test: avoid dividing by zero.
        return float('nan'), float('nan'), observed.astype(float)

    # The chi-square test must be run on counts, not proportions: with
    # proportions the statistic is scaled by 1/N and the p-value is
    # meaningless.
    expected = [fraction * num_samples for fraction in _BENFORDS]
    chi2, p = chisquare(observed.values, expected)

    # Report the observed distribution as proportions of the sample.
    counts = observed / num_samples

    return chi2, p, counts
def return_chisquare(observed, expected):
    """Run a chi-square goodness-of-fit test of ``observed`` against ``expected``.

    Null hypothesis: diagnosed patients and control patients use the feature
    with the same frequency.

    :param observed: observed frequencies
    :param expected: expected frequencies, passed to SciPy as ``f_exp``
    :return: the result of ``stats.chisquare`` (statistic and p-value)
    """
    return stats.chisquare(observed, f_exp=expected)
def distribution_test(dist):
    """Print the chi-square statistic and p-value for ``dist``.

    ``dist`` is handed to ``ss.chisquare`` with no expected frequencies, so
    SciPy tests it against equal frequencies in every category.

    :param dist: observed frequencies
    :return: None; the two values are printed
    """
    result = ss.chisquare(dist)
    print('Chisquare:', result[0])
    print('P:', result[1])
def significance_testing(default_value, values, significane=0.05):
    """Flag each value whose excess over ``default_value`` beats a threshold.

    The threshold is the chi-square statistic of ``values`` (tested against
    equal frequencies) divided by its p-value and multiplied by
    ``significane``.

    :param default_value: baseline subtracted from every value
    :param values: observed frequencies
    :param significane: scale factor applied to statistic / p-value
    :return: list with one boolean per entry of ``values``
    """
    statistic, p_value = chisquare(values)
    threshold = (statistic / p_value) * significane

    flags = []
    for observed in values:
        flags.append(observed - default_value > threshold)
    return flags
def return_partition_DV(self, data, borders, r=2, alpha=0.05):
    """Recursively split a rectangular bin of 2-D data into r x r sub-bins.

    The points of ``data`` inside the rectangle described by ``borders`` are
    divided along percentile-based edges into ``r * r`` sub-bins.  A
    chi-square test on the sub-bin counts decides whether to keep splitting:
    while the test rejects equal counts at level ``alpha``, every non-empty
    sub-bin is partitioned again by a recursive call.

    :param data: 2-D array; column 0 is used as x and column 1 as y
    :param borders: dict whose ``'nodes'`` entry holds
        ``[Xmin, Xmax, Ymin, Ymax]`` for the bin to split
    :param r: number of sub-bins per axis
    :param alpha: significance level for the chi-square test
    :return: list of dicts, each with ``'nodes'`` (array
        ``[Xmin, Xmax, Ymin, Ymax]``) and ``'npts'`` (point count)
    """
    # extract the bin boundaries
    Xmin = borders['nodes'][0]
    Xmax = borders['nodes'][1]
    Ymin = borders['nodes'][2]
    Ymax = borders['nodes'][3]

    # find the number of bins
    # numBins = r ** 2
    # indices of the points that lie inside the bin (boundaries inclusive)
    idx = np.where((data[:, 0] >= Xmin) & (data[:, 0] <= Xmax) &
                   (data[:, 1] >= Ymin) & (data[:, 1] <= Ymax))

    # extract the points in the bin
    Xsub = data[idx, 0]
    Ysub = data[idx, 1]
    # print(Xsub.shape, '\t', Ysub.shape)

    # find the indices of the points in the x- and y-patches
    # (each strip constrains one coordinate only)
    idx_x = np.where((data[:, 0] >= Xmin) & (data[:, 0] <= Xmax))
    idx_y = np.where((data[:, 1] >= Ymin) & (data[:, 1] <= Ymax))

    # get the subpartitions: the inner edges are the floored 1/r, 2/r, ...
    # percentiles of the points in each strip
    ai = np.floor(
        np.percentile(data[idx_x, 0], 1 / r * np.arange(1, r) * 100))
    bj = np.floor(
        np.percentile(data[idx_y, 1], 1 / r * np.arange(1, r) * 100))

    # get the bin edges
    edges1 = np.concatenate(([Xmin], ai, [Xmax]))
    edges2 = np.concatenate(([Ymin], bj, [Ymax]))

    # first exit criteria: we cannot split into unique boundaries any more
    # preallocate the partition list
    partitions = []
    if (len(np.unique(edges1, return_counts=True)[1]) < r + 1
            or len(np.unique(edges2, return_counts=True)[1]) < r + 1):
        # reject further partitions, and return original bin
        partitions.insert(0, {
            'nodes': np.array([Xmin, Xmax, Ymin, Ymax]),
            'npts': len(idx[0])
        })
        return partitions

    # figure out the shift in the edges so that boundaries do not overlap
    xShift = np.zeros((2 * r, 2 * r))
    # this alias is replaced by the transpose two lines below
    yShift = xShift
    xShift[:, 1:-1] = np.tile(np.array([[-1, 0]]), (2 * r, r - 1))
    yShift = xShift.T

    # find the boundaries for each bin
    # duplicate inner nodes for x mesh
    dupMidNodesX = np.append(
        np.insert(np.repeat((edges1[1:-1]), 2, axis=0), 0, edges1[0]),
        edges1[-1])
    # duplicate inner nodes for y mesh
    dupMidNodesY = np.append(
        np.insert(np.repeat((edges2[1:-1]), 2, axis=0), 0, edges2[0]),
        edges2[-1])
    # reshape into a column so it broadcasts against yShift
    dupMidNodesY = np.reshape(dupMidNodesY, (-1, 1))

    # now find the nodes for each bin
    xBinBound = dupMidNodesX + xShift
    yBinBound = dupMidNodesY + yShift

    # find the number of points in each bin, and put this info into array
    binned_data = binned_statistic_2d(Xsub.flatten(),
                                      Ysub.flatten(),
                                      None,
                                      'count',
                                      bins=[edges1, edges2])
    # get the counts. Flatten columnwise to match the bin definition in the
    # loop that creates the dictionaries below
    binCounts = binned_data.statistic.flatten('F')

    # define an empty list to hold the dictionaries of the fresh partitions
    bins = []
    # create dictionaries for each bin
    # start with the loop over y
    # note how the loop counts were obtained above to match the convention
    # here
    for yInd in np.arange(r):
        # this is the loop over x
        for xInd in np.arange(r):
            # get the bin number
            binNo = yInd * r + xInd
            xLow, xHigh = xBinBound[yInd, 2 * xInd + np.arange(2)]
            yLow, yHigh = yBinBound[2 * yInd + np.arange(2), xInd]
            bins.append({
                'nodes': np.array([xLow, xHigh, yLow, yHigh]),
                'npts': binCounts[binNo]
            })

    # calculate the chi square statistic
    # (no expected frequencies are given, so equal counts are assumed)
    chi2 = chisquare(binCounts)

    # check for independence and start recursion
    # if the chi2 test fails, do further partitioning:
    if (chi2.pvalue < alpha and Xmax != Xmin and Ymax != Ymin).all():
        for binInfo in bins:
            if binInfo['npts'] != 0:  # if the bin is not empty:
                # append the sub-partitions to the list
                partitions.extend(
                    self.return_partition_DV(data=data,
                                             borders=binInfo,
                                             r=r,
                                             alpha=alpha))

    # Second exit criteria:
    # if the partitions are independent, reject further partitioning and
    # save the original, unpartitioned bin
    elif len(idx[0]) != 0:
        partitions.insert(0, {
            'nodes': np.array([Xmin, Xmax, Ymin, Ymax]),
            'npts': len(idx[0])
        })

    return partitions
def main():
    """Plot the digitizer linearity data and write a text report.

    ``mkPlot`` is called once per nominal frequency; each result is read
    through its ``'Frequenza'``, ``'Std'`` and ``'hist'`` entries.  Two
    figures are built: the measured-versus-nominal frequencies with a
    straight-line fit, and a 2 x 5 grid of histograms.  The results table
    and fit summary are written to
    ``./Characterization/Linearity_output.txt`` before the figures are shown.
    """
    n_bins = 20

    # mkPlot is called in the same order as before (MHz runs first, then
    # kHz) in case it has side effects that depend on the call order.
    call_order = ['1MHz', '3MHz', '5MHz', '10MHz', '12MHz',
                  '450kHz', '500kHz', '700kHz', '800kHz', '990kHz']
    results = {label: mkPlot(label) for label in call_order}

    # From here on the runs are handled in ascending nominal frequency.
    labels = ['450kHz', '500kHz', '700kHz', '800kHz', '990kHz',
              '1MHz', '3MHz', '5MHz', '10MHz', '12MHz']
    runs = [results[label] for label in labels]

    y = np.array([run['Frequenza'] for run in runs])
    y_err = np.array([run['Std'] for run in runs])
    x = np.array([0.45, 0.5, 0.7, 0.8, 0.99, 1, 3, 5, 10, 12])

    fig = plt.figure(figsize=(10, 5))
    ax = fig.add_subplot()

    # One histogram per run, laid out row by row on a 2 x 5 grid.
    fig1 = plt.figure(figsize=(10, 5))
    spec = gridspec.GridSpec(ncols=5, nrows=2, figure=fig1)
    for position, (label, run) in enumerate(zip(labels, runs)):
        row, col = divmod(position, 5)
        hist_ax = fig1.add_subplot(spec[row, col])
        hist_ax.hist(run['hist'], bins=n_bins)
        hist_ax.set_title(label)

    # The error bars are drawn at ten times the standard deviation.
    ax.errorbar(x, y, yerr=y_err * 10, label='Linearity data', ls='none',
                ecolor='r')

    # Straight-line fit of measured against nominal frequency.
    coef = np.polyfit(x, y, 1)
    poly1d_fn = np.poly1d(coef)
    fit_label_1 = 'm = {}\n'.format(round(coef[0], 5))
    fit_label_2 = 'q = {}'.format(round(coef[1], 5))
    fit_label = fit_label_1 + fit_label_2
    ax.plot(x, poly1d_fn(x), '--k', linewidth=0.5, label=fit_label)
    ax.legend(loc='upper left', shadow=True, fontsize='medium',
              prop={"size": 15})
    ax.set_xlabel('Nominal frequencies (MHz)', color='black', fontsize=15)
    ax.set_ylabel('Digitizer frequencies (MHz)', color='black', fontsize=15)

    rule = '---------------------------------------------------------------------------\n'
    rows = ''.join(
        ' ' + label + ' | ' + str(run['Frequenza']) + ' +/- '
        + str(run['Std']) + '\n'
        for label, run in zip(labels, runs))
    report = (
        ' ----------------------- LEGEND AND UNITS ----------------------\n'
        '| The frequencies are reported in [MHz] |\n'
        ' ---------------------------------------------------------------\n\n'
        ' *** RESULTS ***\n\n'
        'Nominal Frequencies | Derived Frequencies +/- Dev. Std \n'
        + rule
        + rows
        + rule + '\n'
        ' *** FIT RESULTS ***\n\n'
        'Slope (m): ' + str(coef[0]) + '\n'
        'Intercept (q): ' + str(coef[1]) + '\n'
        'Chi-square: ' + str(chisquare(f_obs=y, f_exp=x)) + '\n'
        'Delta (%): ' + str(abs(1 - coef[0]) * 100))

    # The context manager closes the report file; it was previously opened
    # inline in the print call and never closed.
    with open("./Characterization/Linearity_output.txt", "w") as out_file:
        print(report, file=out_file)

    plt.tight_layout()
    plt.show()
def fitgaussian(x, y, weights=None, guess=None, return_fit=True,
                return_uncertainties=False):
    """
    Fit "gauss_function" to the data "y" sampled at positions "x"

    The fit is done with curve_fit; the per-point sigma passed to it is the
    reciprocal of "weights". Any warnings raised during the fit are captured
    and discarded.

    :param x: numpy array (1D), the x values for the gaussian
    :param y: numpy array (1D), the y values for the gaussian
    :param weights: numpy array (1D), the weights for each y value, if None
                    every point gets a weight of one
    :param guess: list of floats, the initial guess for the fit parameters
                  in the following order:

                  [amplitude, center, fwhm, offset from 0 (in y-direction)]

                  if None it is set to
                  [nanmax(y), nanmean(y), nanstd(y), 0]
    :param return_fit: bool, if True also returns the model evaluated at x
                       i.e. yfit = gauss_function(x, *pfit)
    :param return_uncertainties: bool, if True also returns the parameter
                                 uncertainties, np.sqrt(np.diag(pcov))
                                 scaled by the square root of
                                 chisquare(y, yfit) / (len(y) - len(guess))

    :return pfit: numpy array (1D), the fit parameters in the same order as
                  "guess"
    :return yfit: numpy array (1D), the fit y values, only returned if
                  return_fit = True
    :return efit: numpy array (1D), the scaled parameter uncertainties, only
                  returned if return_uncertainties = True

    The return is (pfit, yfit, efit), (pfit, yfit), (pfit, efit) or just
    pfit depending on which of the two flags are set.
    """
    # equal weights unless the caller supplied some
    wvec = np.ones(len(x)) if weights is None else weights
    # curve_fit expects uncertainties, so invert the weights
    sigma = 1.0 / wvec
    # fall back on a guess derived from the data
    if guess is None:
        p0 = [np.nanmax(y), np.nanmean(y), np.nanstd(y), 0]
    else:
        p0 = guess
    # run the fit with warnings captured
    with warnings.catch_warnings(record=True) as _:
        pfit, pcov = curve_fit(gauss_function, x, y, p0=p0, sigma=sigma,
                               absolute_sigma=True)
    # nothing extra requested: only the parameters are returned
    if not (return_fit or return_uncertainties):
        return pfit
    # the model at the fitted parameters is needed for both extras
    yfit = gauss_function(x, *pfit)
    if not return_uncertainties:
        return pfit, yfit
    # scale the covariance errors by the chi-square per degree of freedom
    chis, _ = chisquare(y, f_exp=yfit)
    norm = chis / (len(y) - len(p0))
    efit = np.sqrt(np.diag(pcov)) * np.sqrt(norm)
    if return_fit:
        return pfit, yfit, efit
    return pfit, efit
y_int = np.interp(x1, x, yb) # All the models!! y1 = model_pickle['forest_clear_atmosphere.dat'] y2 = model_pickle['forest_t10_atmosphere.dat'] y3 = model_pickle['grass_clear_atmosphere.dat'] y4 = model_pickle['grass_t10_atmosphere.dat'] y5 = model_pickle['ice_clear_atmosphere.dat'] y6 = model_pickle['ice_t10_atmosphere.dat'] y7 = model_pickle['ocean_clear_atmosphere.dat'] y8 = model_pickle['ocean_t30_atmosphere.dat'] y9 = model_pickle['sand_clear_atmosphere.dat'] y10 = model_pickle['sand_t10_atmosphere.dat'] chisquaresb = [ chisquare(y_int, f_exp=y1), chisquare(y_int, f_exp=y2), chisquare(y_int, f_exp=y3), chisquare(y_int, f_exp=y4), chisquare(y_int, f_exp=y5), chisquare(y_int, f_exp=y6), chisquare(y_int, f_exp=y7), chisquare(y_int, f_exp=y8), chisquare(y_int, f_exp=y9), chisquare(y_int, f_exp=y10) ] # best fit for b is y10, sand_t10_atmosphere y_intc = np.interp(x1, x, yc) chisquaresc = [ chisquare(y_intc, f_exp=y1),
# Standardise the noise to zero mean and unit standard deviation.
noise = (noise - np.mean(noise)) / np.std(noise)
ndata = len(noise)

# bin data
nbin = 20
minb = -4.5
maxb = 4.5
bins, freq = freq_hist(minb, maxb, nbin, noise)

# get idealized values: the standard-normal probability mass in each bin,
# i.e. the difference of the CDF at consecutive bin edges
edge_cdf = st.norm.cdf(np.asarray(bins)[:nbin + 1])
f_ideal = edge_cdf[1:] - edge_cdf[:-1]

# do chi^2 test
# NOTE(review): f_ideal holds probabilities, so this is only a valid
# chi-square test if freq_hist returns relative frequencies scaled the same
# way; confirm, since ndata is computed above but never used.
chi2, p = st.chisquare(freq, f_ideal)
# A single pre-formatted string prints the same under Python 2 and 3.
print("%s %s" % (p, chi2))

# do the same with uniform transformation: mapping the noise through the
# normal CDF should give values uniform on [0, 1]
u = 0.5 * erfc(-noise / np.sqrt(2.))
nbin = 20
minb = 0.
maxb = 1.
bins, freq = freq_hist(minb, maxb, nbin, u)
print(freq)
chi2, p = st.chisquare(freq, np.full([nbin], 1. / nbin))
print("%s %s" % (p, chi2))

# plot
plt.plot(noise, 'k-', lw=1.)
plt.savefig('rx.png')