def chi2_dir(cause, effect, unknown, n, p_cause, p_effect_given_cause):
    """Score the two possible directions between ``cause`` and ``unknown``.

    The "reverse" model predicts the cause/unknown table from the chain
    cause -> effect -> unknown; the "forward" model is the ordinary
    chi-square contingency fit of the observed cause/unknown table.

    Returns a ``struct`` with:
      reject_indep  -- p-value of the effect/unknown independence test
      bayes_fwd_rev -- ratio of chi2(1) densities, forward over reverse
      reject_fwd    -- p-value of the forward model
      reject_rev    -- p-value of the reverse model
    """
    effect_unknown = count(zip(effect, unknown))
    chi_indep = chi2_contingency(effect_unknown)[1]

    # P(unknown == 1 | effect == e) for e in {0, 1}.
    p_unknown_given_effect = [
        float(effect_unknown[e][1]) / sum(effect_unknown[e]) for e in (0, 1)
    ]

    # Expected cause/unknown counts, marginalising over the effect variable.
    expected = [[0, 0], [0, 0]]
    for c_val in range(2):
        for e_val in range(2):
            for u_val in range(2):
                expected[c_val][u_val] += (
                    n
                    * p_of_val(p_cause, c_val)
                    * p_of_val(p_effect_given_cause[c_val], e_val)
                    * p_of_val(p_unknown_given_effect[e_val], u_val)
                )

    cause_unknown = count(zip(cause, unknown))
    chi_rev = chisquare(cause_unknown, expected, axis=None, ddof=2)
    chi_fwd = chi2_contingency(cause_unknown)

    forward_density = chi2.pdf(chi_fwd[0], 1)
    reverse_density = chi2.pdf(chi_rev.statistic, 1)
    return struct(
        reject_indep=chi_indep,
        bayes_fwd_rev=forward_density / reverse_density,
        reject_fwd=chi_fwd[1],
        reject_rev=chi_rev.pvalue,
    )
def getChis(crosstab, variable):
    """Run a chi-square independence test on a pandas cross-tabulation.

    Parameters:
        crosstab: pandas DataFrame of observed counts.
        variable: label stored under the ``'explanans'`` key of the result.

    Returns:
        dict with the statistic, p-value, degrees of freedom, whether the
        statistic exceeds the 95% critical value (``'eval'``), the observed
        and expected tables as nested lists, per-cell "+", "-" or " " markers
        derived from ``scipy.stats.zmap`` at +/-1.96, and the row and column
        labels.  The dict is also printed.
    """
    # One call is enough; the original ran the test twice and discarded one.
    chi2, p, dof, ex = sps.chi2_contingency(crosstab)
    crit = sps.chi2.ppf(q=0.95, df=dof)
    evaluation = bool(crit < chi2)

    # ``.values`` works on old and new pandas; ``as_matrix()`` was removed.
    obs_list = crosstab.values.tolist()
    ex_list = ex.tolist()

    z_list = sps.zmap(obs_list, ex_list).tolist()
    z_indicators = [
        ["+" if z > 1.96 else "-" if z < -1.96 else " " for z in row]
        for row in z_list
    ]

    results = {
        'chi-sq': chi2,
        'p-val': p,
        'eval': evaluation,
        'dof': dof,
        'explanans': variable,
        'expected': ex_list,
        'observed': obs_list,
        'z_scores': z_indicators,
        'row_lab': crosstab.index.tolist(),
        'col_lab': crosstab.columns.tolist(),
    }
    # Single-argument call form prints identically on Python 2 and 3.
    print(results)
    return results
def chiSquare():
    '''Chi-square test on a 2x2 table, with and without Yates' correction.

    Data are taken from Altman, Table 10.10: comparison of number of hours'
    swimming by swimmers with or without erosion of dental enamel.

        >= 6h: 32 yes, 118 no
        <  6h: 17 yes, 127 no

    Prints both results and returns the continuity-corrected one.
    '''

    # Enter the data
    observed = np.array([[32, 118],
                         [17, 127]])

    # --- >>> START stats <<< ---
    # Calculate the chi-square test
    with_yates = stats.chi2_contingency(observed, correction=True)
    without_yates = stats.chi2_contingency(observed, correction=False)
    # --- >>> STOP stats <<< ---

    # Print the result
    print('\nCHI SQUARE --------------------------------------------------')
    corrected_line = 'The corrected chi2 value is {0:5.3f}, with p={1:5.3f}'.format(
        with_yates[0], with_yates[1])
    print(corrected_line)
    uncorrected_line = 'The uncorrected chi2 value is {0:5.3f}, with p={1:5.3f}'.format(
        without_yates[0], without_yates[1])
    print(uncorrected_line)

    return with_yates
def test_basic(self): # median_test calls chi2_contingency to compute the test statistic # and p-value. Make sure it hasn't screwed up the call... x = [1, 2, 3, 4, 5] y = [2, 4, 6, 8] stat, p, m, tbl = stats.median_test(x, y) assert_equal(m, 4) assert_equal(tbl, [[1, 2], [4, 2]]) exp_stat, exp_p, dof, e = stats.chi2_contingency(tbl) assert_allclose(stat, exp_stat) assert_allclose(p, exp_p) stat, p, m, tbl = stats.median_test(x, y, lambda_=0) assert_equal(m, 4) assert_equal(tbl, [[1, 2], [4, 2]]) exp_stat, exp_p, dof, e = stats.chi2_contingency(tbl, lambda_=0) assert_allclose(stat, exp_stat) assert_allclose(p, exp_p) stat, p, m, tbl = stats.median_test(x, y, correction=False) assert_equal(m, 4) assert_equal(tbl, [[1, 2], [4, 2]]) exp_stat, exp_p, dof, e = stats.chi2_contingency(tbl, correction=False) assert_allclose(stat, exp_stat) assert_allclose(p, exp_p)
def main (contingency_table):
    """Compute statistics for 2x2 contingency tables read from a TSV file.

    Each row of the tab-separated file is ``ID, non_can, can`` where ``ID``
    has the form ``<SRS>_<tag>``.  For every SRS the ``<SRS>_YES`` and
    ``<SRS>_NO`` rows form a 2x2 table, and one space-separated line is
    printed with: SRS, Fisher odds ratio, log2 of the odds ratio, Fisher
    p-value, chi-square statistic and p-value, then the Yates-corrected
    chi-square statistic and p-value.

    Raises:
        KeyError: if an SRS lacks its ``_YES`` or ``_NO`` row.
        ValueError: on malformed rows, or from ``log`` when the odds ratio
            is zero.
    """
    SRS_types = set()
    tables = {}

    # Context manager so the handle is closed even if a row is malformed.
    with open(contingency_table) as handle:
        for ID, non_can, can in csv.reader(handle, delimiter='\t'):
            SRS, tag = ID.split("_")
            SRS_types.add(SRS)
            tables[ID] = [int(non_can), int(can)]

    for srs in SRS_types:
        table = [tables[srs + "_YES"], tables[srs + "_NO"]]
        obs = np.array(table)

        chi2, chi2_pvalue, chi2_dof, chi2_ex = chi2_contingency(obs, correction=False)
        chi2_yates, chi2_yates_pvalue, chi2_yates_dof, chi2_yates_ex = chi2_contingency(obs, correction=True)
        fisher_oddsratio, fisher_pvalue = stats.fisher_exact(table)

        fields = (srs, fisher_oddsratio, log(fisher_oddsratio, 2), fisher_pvalue,
                  chi2, chi2_pvalue, chi2_yates, chi2_yates_pvalue)
        # Joining str() of each field reproduces ``print a, b, ...`` output
        # while remaining valid syntax on Python 3.
        print(" ".join(str(field) for field in fields))
def _position_log_pvalues(sample_totals, population_totals, skipped_positions):
    """Return -log10(chi-square p) per column; skipped columns score 1."""
    scores = []
    for n in range(sample_totals.shape[1]):
        if n in skipped_positions:
            scores.append(1)
            continue
        conting = np.array([sample_totals[:, n], population_totals[:, n]])
        chi2, p, dof, expected = stats.chi2_contingency(conting)
        scores.append(np.log10(p) * -1)
    return scores


def position_wise_scores2(seq5_list, seq3_list, organism, title='Intron position strength'):
    '''Uses chi-contingency test to score base proportions at each position
    in sample against population.

    seq5_list / seq3_list are the sample 5' and 3' splice-site sequences;
    the population is built from the organism's annotation files.  Columns
    2-3 of the 5' site and 4-5 of the 3' site are not tested and get a fixed
    score of 1.  Shows a two-panel bar chart and returns the figure.
    '''
    organism, gff3, fa_dict, bowtie_index = SP.find_organism_files(organism)
    all_5p, all_3p = generate_all_ss_seqs(gff3, fa_dict, organism)

    pop_5p = seq_list_to_totals(all_5p)
    pop_3p = seq_list_to_totals(all_3p)
    samp_5p = seq_list_to_totals(seq5_list)
    samp_3p = seq_list_to_totals(seq3_list)
    print(samp_5p.shape)

    p5 = _position_log_pvalues(samp_5p, pop_5p, (2, 3))
    p3 = _position_log_pvalues(samp_3p, pop_3p, (4, 5))

    fig, ax = plt.subplots(2, 1, figsize=(4, 4))
    width = 0.7

    max_y = max(p5 + p3) + 0.1 * max(p5 + p3)

    ind5 = np.arange(len(p5))
    ax[0].bar(ind5, p5, color='k')
    ax[0].plot([0, 8], [2, 2], '--', color='0.7')
    ax[0].set_xlim([0, len(p5)])
    ax[0].set_ylabel("5' splice site\n-log10(p-value)")
    ax[0].set_title(title)
    ax[0].set_ylim([0, max_y])

    ind3 = np.arange(len(p3))
    ax[1].bar(ind3, p3, color='k')
    ax[1].plot([0, 8], [2, 2], '--', color='0.7')
    ax[1].set_xlim([0, len(p3)])
    ax[1].set_ylabel("3' splice site\n-log10(p-value)")
    ax[1].set_ylim([0, max_y])

    # Each panel takes its ticks from its own bar positions; the original
    # used ind3 for both, which is only right when the lengths match.
    ax[0].set_xticks(ind5 + width / 2)
    ax[1].set_xticks(ind3 + width / 2)
    ax[0].set_xticklabels(np.arange(-2, 6))
    ax[1].set_xticklabels(np.arange(-5, 3))

    fig.tight_layout()
    plt.show()
    return fig
def _allele_count_tables(np_feature_snp, labels, n_snps):
    """Return per-SNP allele counts as rows [A|label0, A|other, B|label0, B|other]."""
    # Must start from zeros: the counts are accumulated with +=.
    tables = np.zeros([n_snps, 4], dtype='float')
    for idxSample in range(0, np_feature_snp.shape[0]):
        column = 0 if labels[idxSample] == 0 else 1
        for idxSNP in range(0, n_snps):
            n_aa = np_feature_snp[idxSample, idxSNP * 3]
            n_ab = np_feature_snp[idxSample, idxSNP * 3 + 1]
            n_bb = np_feature_snp[idxSample, idxSNP * 3 + 2]
            tables[idxSNP, column] += n_aa * 2 + n_ab
            tables[idxSNP, column + 2] += n_ab + n_bb * 2
    return tables


def statistic_analysis(np_snp_info,np_feature_snp,np_label_classifyProgress,np_label_classifyPhenotype):
    """Per-SNP genotype proportions and association statistics.

    ``np_feature_snp`` holds three consecutive columns per SNP (indexed as
    AA, AB, BB below); only ``np_snp_info.shape`` is used.  For each label
    vector a 2x2 allele-count table (allele x label==0 / label!=0) is built
    per SNP and tested.

    Returns:
        Array with one row per SNP and nine columns: proportion AA, AB, BB;
        then chi-square p-value (no Yates correction), Fisher exact p-value
        and odds ratio for classifyProgress; then the same three for
        classifyPhenotype.
    """
    n_snps = np_snp_info.shape[0]

    ### proportion
    np_proportion = np.average(np_feature_snp, axis=0).reshape(n_snps, np_snp_info.shape[1])

    ### get 2X2 matrix
    # The original allocated these with np.empty and then added onto the
    # uninitialised contents, producing garbage counts.
    np_2_2_matrix_classifyProgress = _allele_count_tables(
        np_feature_snp, np_label_classifyProgress, n_snps)
    np_2_2_matrix_classifyPhenotype = _allele_count_tables(
        np_feature_snp, np_label_classifyPhenotype, n_snps)

    ### chi-square; fisher; oddsratio
    # np.empty is fine for these: every cell is assigned below.
    np_chi2 = np.empty([n_snps, 2], dtype='float')
    np_fisher = np.empty([n_snps, 2], dtype='float')
    np_oddsratio = np.empty([n_snps, 2], dtype='float')
    all_tables = (np_2_2_matrix_classifyProgress, np_2_2_matrix_classifyPhenotype)
    for idxSNP in range(0, n_snps):
        for idxLabel, tables in enumerate(all_tables):
            np_this_2_2_matrix = tables[idxSNP, :].reshape(2, 2)
            if idxLabel == 0:
                # Only the classifyProgress table was printed originally.
                print(np_this_2_2_matrix)
            chi2, p, dof, ex = st.chi2_contingency(np_this_2_2_matrix, correction=False)
            np_chi2[idxSNP, idxLabel] = p
            oddsratio, pvalue = st.fisher_exact(np_this_2_2_matrix)
            np_fisher[idxSNP, idxLabel] = pvalue
            np_oddsratio[idxSNP, idxLabel] = (
                (np_this_2_2_matrix[0, 0] * np_this_2_2_matrix[1, 1])
                / (np_this_2_2_matrix[1, 0] * np_this_2_2_matrix[0, 1]))

    #proportion(AA:AB:BB); ClassifyProgress(Chi2,Fisher,OddsRatio); ; ClassifyPhenotype(Chi2,Fisher,OddsRatio)
    np_statistic_result = np.empty([n_snps, 9], dtype='float')
    np_statistic_result[:, :3] = np_proportion
    np_statistic_result[:, 3] = np_chi2[:, 0]
    np_statistic_result[:, 4] = np_fisher[:, 0]
    np_statistic_result[:, 5] = np_oddsratio[:, 0]
    np_statistic_result[:, 6] = np_chi2[:, 1]
    np_statistic_result[:, 7] = np_fisher[:, 1]
    np_statistic_result[:, 8] = np_oddsratio[:, 1]

    return np_statistic_result
def contigencyAnalysis(dataTable, rowNames = None, columnNames = None, display = True, outputFile = None):
    """Run a chi-square contingency test and tabulate the results.

    Builds "Observed Counts", "Expected Counts" and "Results" tables with
    ``ezTable`` (displayed when ``display`` is true).  When ``outputFile`` is
    given, the three tables are written to it, separated by blank rows, via
    ``matrixToCSV``.

    Returns:
        (testStatistic, pValue, degreesOfFreedom, expectedTable) exactly as
        produced by ``chi2_contingency``.
    """
    testStatistic, pValue, degreesOfFreedom, expectedTable = chi2_contingency(dataTable)

    outputDataTable = ezTable(dataTable, columnNames, rowNames,
                              summarizeColumns ="SUM", summarizeRows = "SUM",
                              title = "Observed Counts", display = display,
                              returnFormat = "MATRIX")
    outputExpectedTable = ezTable(expectedTable, columnNames, rowNames,
                                  summarizeColumns ="SUM", summarizeRows = "SUM",
                                  title = "Expected Counts", display = display,
                                  returnFormat = "MATRIX")

    # Very small p-values are reported as a bound rather than as 0.0.
    if 0.0001 > pValue:
        pValueString = "< 0.0001"
    else:
        pValueString = str(round(pValue, 4))

    resultsTable = [["Test Statistic", testStatistic],
                    ["Degrees of Freedom", degreesOfFreedom],
                    ["p-value", pValueString]]
    outputResultsTable = ezTable(resultsTable, title="Results", display = display,
                                 returnFormat = "MATRIX")

    # ``<>`` is a syntax error on Python 3; identity is the right None test.
    if outputFile is not None:
        outputMatrix = outputDataTable + [['']] + outputExpectedTable + [['']] + outputResultsTable
        matrixToCSV(outputMatrix, outputFile)

    return testStatistic, pValue, degreesOfFreedom, expectedTable
def severs(a,b,cut,verbose=False):
    """Score whether conditioning on ``cut`` severs the a/b dependence.

    Three "not severed" models give expected counts within each value of
    ``cut``: the pooled a/b table scaled by P(cut) ('neither'), counts
    rebuilt from P(b|a) ('a'), and counts rebuilt from P(a|b) ('b').  For
    each model the evidence ratio severed / not-severed is multiplied over
    both cut values; cut values whose contingency test cannot be computed
    are skipped.

    Returns:
        The smallest of the three per-model products.

    Raises:
        ValueError, ZeroDivisionError: if the goodness-of-fit test against a
            model's expected counts fails (a diagnostic line is printed
            first).
    """
    cntall = count(zip(a,b))
    cntcut = count(zip(cut,a,b))
    p_b_given_a = [float(x[1])/sum(x) for x in cntall]
    p_a_given_b = [float(x[1])/sum(x) for x in zip(*cntall)]
    if verbose:
        print('orig=%s' % cntall)
        print('split=%s' % cntcut)
        print('p_a_given_b = %s' % p_a_given_b)
        print('p_b_given_a = %s' % p_b_given_a)

    pvar = count(zip(cut))
    mularr(pvar, 1.0/sum(pvar))

    # Model 0: pooled table scaled by the share of each cut value.
    expnsev = [deepcopy(cntall), deepcopy(cntall)]
    for i in [0,1]:
        mularr(expnsev[i], pvar[i])

    # Model 1: keep each a-margin within the cut, distribute b by P(b|a).
    exptoucha = deepcopy(expnsev) # We'll overwrite everything
    for i in [0,1]:
        for aval in [0,1]:
            n = cntcut[i][aval][0] + cntcut[i][aval][1]
            for bval in [0,1]:
                p = p_of_val(p_b_given_a[aval], bval)
                exptoucha[i][aval][bval] = n * p

    # Model 2: keep each b-margin within the cut, distribute a by P(a|b).
    exptouchb = deepcopy(expnsev) # We'll overwrite everything
    for i in [0,1]:
        for bval in [0,1]:
            n = cntcut[i][0][bval] + cntcut[i][1][bval]
            for aval in [0,1]:
                exptouchb[i][aval][bval] = n * p_of_val(p_a_given_b[bval], aval)

    if verbose:
        print('exp|touch a = %s' % exptoucha)
        print('exp|touch b = %s' % exptouchb)

    exps = [expnsev, exptoucha, exptouchb]
    bayes_factor = [1, 1, 1]
    for model in [0,1,2]:
        if verbose:
            print('Model Touches %s' % (['neither', 'a', 'b'])[model])
        for i in [0,1]:
            try:
                chi_sev = chi2_contingency(cntcut[i])
            except (ValueError, ZeroDivisionError):
                # Degenerate table for this cut value: contributes nothing.
                continue
            peg_sev = blurred_chi2_pdf(chi_sev[0], sumall(cntcut[i]))
            if verbose:
                print(' chi_sev=%s' % str(chi_sev))
                print(' p(e|sev)=%f' % peg_sev)
                print(' Cut=%d' % i)
                print(' Actual: %s' % cntcut[i])
                print(' Expected: %s' % exps[model][i])
            try:
                chi_nsev = chisquare(cntcut[i], exps[model][i], axis=None, ddof=2)
            except (ValueError, ZeroDivisionError):
                print('Failure for model %d cut %d act=%s exp=%s' % (model, i, cntcut, exps[model]))
                # Bare raise keeps the original traceback.
                raise
            peg_nsev = blurred_chi2_pdf(chi_nsev[0], sumall(cntcut[i]))
            if verbose:
                print(' Chi=%s' % str(chi_nsev))
                print(' p=%s' % peg_nsev)
            bayes_factor[model] *= peg_sev/peg_nsev
    return min(bayes_factor)
def binary_chi2(data, alternative):
    """Chi-square test of homogeneity for several groups of 0/1 outcomes.

    Parameters:
        data: sequence of 1-D integer NumPy arrays, one per group.  Each
            group must contain both a 0 and a 1, and nothing else.
        alternative: accepted for interface compatibility; unused.

    Returns:
        p-value of ``scipy.stats.chi2_contingency`` (no continuity
        correction) on the groups-by-outcome count table.

    Raises:
        AssertionError: if a group is not of integer dtype or its set of
            values is not exactly {0, 1}.
    """
    for group in data:
        assert issubclass(group.dtype.type, np.integer)
        assert set(np.unique(group)) == set((0, 1))

    n_outcomes = 2
    ct = np.zeros((len(data), n_outcomes))
    for i, group in enumerate(data):
        # Values are validated to be 0/1, so bincount gives exactly two bins.
        ct[i] = np.bincount(group, minlength=n_outcomes)

    # The original also summed (O - E)^2 / E by hand but never used it;
    # SciPy already computes that statistic.
    _, p, _, _ = stats.chi2_contingency(ct, correction=False)
    return p
def chi_mode(data,depth,low=lcut,alpha=chi,f=freq):
    """Select bases that pass the depth, frequency and strand tests.

    ``data[base]`` is indexed as (count at [0], count at [1], total at [2])
    and ``data['cover']`` is the denominator for the frequency.  A base is
    kept when both counts reach ``low[i] * depth``, its frequency is at
    least ``f``, and either the frequency exceeds 0.5 or the 2x2 test of
    (this base vs. other bases) x (index 0 vs. index 1) has p > ``alpha``.
    Fisher's exact test is used when any cell is below 5, chi-square
    otherwise.

    Returns a dict mapping each kept base to its frequency.
    """
    bases = ['A','T','G','C']
    result = dict()
    plus = data['A'][0]+data['T'][0]+data['G'][0]+data['C'][0]
    minus = data['A'][1]+data['T'][1]+data['G'][1]+data['C'][1]

    for key in bases:
        deep_enough = data[key][0] >= low[0]*depth and data[key][1] >= low[1]*depth
        if not deep_enough:
            continue

        ndep = data[key][2]
        frequency = ndep/float(data['cover'])
        if not frequency >= f:
            continue

        if frequency > 0.5:
            # Majority bases are accepted without the strand test.
            result[key] = frequency
            continue

        #add chi square test:
        a = data[key][0]
        b = data[key][1]
        c = plus-data[key][0]
        d = minus-data[key][1]
        table = [[a,b],[c,d]]
        if min(a, b, c, d) < 5:
            pvalue = stats.fisher_exact(table)[1]
        else:
            pvalue = stats.chi2_contingency(table)[1]
        if pvalue > alpha:
            result[key] = frequency
    return result
def RuleGeneration (D, globalL, minconf):
    """Generate single-consequent association rules from frequent itemsets.

    Parameters:
        D: the transaction collection; ``len(D)`` turns supports into counts.
        globalL: mapping whose values are collections of frozenset itemsets.
            The first entry is skipped.
        minconf: minimum confidence, used when ``chisquaremode`` is false.

    Returns:
        List of rule tuples.  In chi-square mode:
        ((antecedent, consequence), chi2, supp(item), supp(antecedent),
        supp(consequence), p) for rules with p <= ``p_value``.  Otherwise:
        ((antecedent, consequence), confidence, supp(item),
        supp(antecedent), supp(consequence)) for rules with confidence >=
        ``minconf``.
    """
    Rules = []
    # list(...) keeps the slice working on Python 3, where items() is a view.
    for key, value in list(globalL.items())[1:]:
        for item in value:
            for consequence in item:
                consequence = frozenset([consequence])
                antecedent = item.difference(consequence)
                rule = (tuple(antecedent), tuple(consequence))
                if (chisquaremode):
                    supp_antecedent = getSupp(antecedent, allFreq, D)
                    supp_consequence = getSupp(consequence, allFreq, D)
                    supp_item = getSupp(item, allFreq, D)
                    #calculate chi square value
                    #A->B
                    A = supp_antecedent*len(D)
                    B = supp_consequence*len(D)
                    AB = supp_item*len(D)
                    A_B = A-AB
                    _AB = B-AB
                    _A_B = len(D) - AB - A_B - _AB
                    chistatistics = chi2_contingency(np.array([[AB,A_B],[_AB,_A_B]]))
                    if chistatistics[1] <= p_value:
                        Rules.append((rule, chistatistics[0], supp_item,
                                      supp_antecedent, supp_consequence,
                                      chistatistics[1]))
                else:
                    supp_item = getSupp(item, allFreq, D)
                    supp_antecedent = getSupp(antecedent, allFreq, D)
                    confidence = supp_item / supp_antecedent
                    if confidence >= minconf:
                        Rules.append((rule, confidence, supp_item,
                                      supp_antecedent,
                                      getSupp(consequence, allFreq, D)))
    return Rules
def consistent_acceptance_rate(self, window_size=None, critical_pval=0.05):
    """
    A convenience function for `burnin`.

    Returns `True` if the acceptances of the two halves of the window are
    consistent with having the same acceptance rates.  This is done using a
    chi-squared contingency test.

    With `window_size=None` the window starts at the last entry of
    `self.updates`; if there are no updates yet, `False` is returned.
    Windows of length 2 or less, and windows in which every proposal was
    accepted or every proposal was rejected, are reported as consistent.
    """
    if window_size is None:
        if len(self.updates) == 0:
            return False
        window_start = self.updates[-1]
    else:
        window_start = self.iterations - window_size

    window_length = self.iterations - window_start

    # If window is really small, return `consistent` to avoid gratuitous updating
    if window_length <= 2:
        return True

    windowed_acceptances = self.acceptance[window_start:self.iterations].flatten()
    X1, X2 = np.array_split(windowed_acceptances, 2)

    n1, n2 = len(X1), len(X2)
    k1, k2 = np.sum(X1), np.sum(X2)

    # A window with no acceptances (or no rejections) has a zero expected
    # cell, which makes chi2_contingency raise ValueError.  Both halves then
    # share the same 0% / 100% rate, so they are trivially consistent.
    total_accepted = k1 + k2
    if total_accepted == 0 or total_accepted == n1 + n2:
        return True

    # Use chi^2 contingency test to test whether the halves have consistent acceptances
    table = [[k1, k2], [n1 - k1, n2 - k2]]
    p_val = chi2_contingency(table)[1]
    if p_val < critical_pval:
        return False
    return True
def independence(table, test):
    """Run a chi-square test for independence and print the result.

    Parameters:
        table: contingency table accepted by ``scipy.stats.chi2_contingency``.
        test: when equal to ``True`` the degrees of freedom and the expected
            values are printed as well; otherwise only the statistic and
            p-value.

    Returns nothing; the result is only printed.
    """
    #conducts test for independence and prints result depending on mode
    chi2, p, df, f_exp = stats.chi2_contingency(table)
    # Equality (not truthiness) is kept so existing callers see no change.
    if test == True:  # noqa: E712
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("chi-square statistic: %s \np-value: %s \ndegrees of freedom: %d \nexpected values: %s" % (chi2, p, df, f_exp))
    else:
        print("chi-square statistic: %s \np-value: %s" % (chi2, p))
def test_random_circuits(self):
    """Both simulators must produce statistically compatible counts.

    Every random circuit is run on ``pq_simulator`` and on
    ``local_qasm_simulator`` with the same seed and shot count; the count
    histograms are compared with a chi-square contingency test and the
    p-value must exceed 0.01.
    """
    qk_simulator = get_backend('local_qasm_simulator')
    for circuit in self.rqg.get_circuits(format_='QuantumCircuit'):
        self.log.info(circuit.qasm())
        compiled = compile_circuit(circuit)
        shots = 100

        pq_job = QuantumJob(compiled, backend=pq_simulator, seed=1, shots=shots)
        qk_job = QuantumJob(compiled, backend=qk_simulator, seed=1, shots=shots)
        pq_result = pq_simulator.run(pq_job).result()
        qk_result = qk_simulator.run(qk_job).result()

        counts_pq = pq_result.get_counts(pq_result.get_names()[0])
        counts_qk = qk_result.get_counts(qk_result.get_names()[0])
        self.log.info('local_qasm_simulator_projectq: %s', str(counts_pq))
        self.log.info('local_qasm_simulator: %s', str(counts_qk))

        # contingency table: one row per simulator, one column per state
        # observed by either of them.
        states = counts_qk.keys() | counts_pq.keys()
        ctable = numpy.array([
            [counts.get(key, 0) for key in states]
            for counts in (counts_pq, counts_qk)
        ])
        result = chi2_contingency(ctable)
        self.log.info('chi2_contingency: %s', str(result))
        with self.subTest(circuit=circuit):
            self.assertGreater(result[1], 0.01)
def rank_features(self, metric):
    """Score every feature in ``self.feature_set`` under ``metric``.

    Scores are stored in ``self.feature_rank[metric][feature]``:
      "info"     -- ``info_gain`` of the 2x2 feature/label table
      "spearman" -- absolute Spearman correlation between label and
                    feature presence over ``self.train_set``
      otherwise  -- 1 - p of the chi-square test (no continuity
                    correction) on the 2x2 feature/label table

    ``metric`` is also added to ``self.metrics_ranked``.
    """
    self.metrics_ranked.add(metric)

    # The label data does not depend on the feature, so build it once.
    if metric == "spearman":
        labels = [self.label_func[sample] for sample in self.train_set]
    else:
        label_yes = set(s for s in self.train_set if self.label_func[s] == 1)
        label_no = set(s for s in self.train_set if self.label_func[s] == 0)

    for feat in self.feature_set:
        feat_func = dict(
            (sample, 1 if feat in self.features[sample] else 0)
            for sample in self.train_set
        )

        if metric == "spearman":
            presence = [feat_func[sample] for sample in self.train_set]
            rho, pval = stats.spearmanr(labels, presence)
            self.feature_rank["spearman"][feat] = abs(rho)
            continue

        feat_yes = set(s for s in self.train_set if feat_func[s] == 1)
        feat_no = set(s for s in self.train_set if feat_func[s] == 0)
        # Rows: feature present / absent.  Columns: label 1 / label 0.
        obs = numpy.array([
            [len(feat_yes & label_yes), len(feat_yes & label_no)],
            [len(feat_no & label_yes), len(feat_no & label_no)],
        ])

        if metric == "info":
            self.feature_rank["info"][feat] = info_gain(obs)
        else:
            chi2, pval, dof, ex = stats.chi2_contingency(obs, correction=False)
            self.feature_rank[metric][feat] = 1-pval
def calculateP(variables, k, data, WINDOW_LEN):
    """Compare category frequencies in two consecutive windows of ``data``.

    The reference window is ``data[k:k+WINDOW_LEN]`` and the tested window
    is the ``WINDOW_LEN`` items that follow it.  ``data`` must support
    slicing and ``.count``.

    With exactly two categories, or when any category is absent from either
    window, a chi-square goodness-of-fit test of the tested window against
    the reference window is used; otherwise a chi-square contingency test
    on both frequency rows.

    Returns the p-value of the selected test.
    """
    # The windows do not depend on the category, so slice them once.
    old_window = data[k:k + WINDOW_LEN]
    new_window = data[k + WINDOW_LEN:k + 2 * WINDOW_LEN]
    freq_old = np.array([old_window.count(v) for v in variables], dtype=float)
    freq = np.array([new_window.count(v) for v in variables], dtype=float)

    # Tried the exact binomial goodness of fit method for two categories:
    # p = binom_test(freq, n=None, p=freq_old[0]/sum(freq_old))
    # The results were the same as Chi-square
    has_empty_cell = np.any(freq == 0) or np.any(freq_old == 0)
    if len(variables) == 2 or has_empty_cell:
        chi = chisquare(freq, freq_old)
    else:
        chi = chi2_contingency([freq, freq_old], correction=True)
    return chi[1]
def test_run_device(self):
    """Run the test circuit on the least busy real device.

    Polls the job every 4 seconds until it is done or has failed, re-raises
    any job exception, logs a chi-square comparison against an even
    '00' / '11' split, and asserts the counts are within 10% of the shots.
    """
    candidates = self._provider.available_backends({'simulator': False})
    self.log.info('devices: %s', [b.name for b in candidates])
    backend = lowest_pending_jobs(candidates)
    self.log.info('using backend: %s', backend.name)

    qobj = qiskit._compiler.compile(self._qc, backend)
    shots = qobj['config']['shots']
    job = backend.run(QuantumJob(qobj, backend, preformatted=True))

    while not job.done and not job.exception:
        self.log.info(job.status)
        time.sleep(4)
    if job.exception:
        raise job.exception
    self.log.info(job.status)

    result = job.result()
    counts_qx = result.get_counts(result.get_names()[0])
    counts_ex = {'00': shots/2, '11': shots/2}

    # contingency table: measured row first, expected row second.
    states = counts_qx.keys() | counts_ex.keys()
    ctable = numpy.array([
        [counts.get(key, 0) for key in states]
        for counts in (counts_qx, counts_ex)
    ])
    self.log.info('states: %s', str(states))
    self.log.info('ctable: %s', str(ctable))
    contingency = chi2_contingency(ctable)
    self.log.info('chi2_contingency: %s', str(contingency))
    self.assertDictAlmostEqual(counts_qx, counts_ex, shots*0.1)
def choose_vocabulary(data):
    """Return the chi-square contingency p-value of ``data``.

    Every cell equal to 0 is first bumped to 1.  This modifies ``data`` in
    place, so the caller's table is changed too.
    """
    for row_index in range(len(data)):
        row = data[row_index]
        for col_index in range(len(row)):
            if row[col_index] == 0:
                row[col_index] += 1
    result = stats.chi2_contingency(data)
    return result[1]
def Dep_GTest(C,X,S,M,alpha=0.05):
    """Report whether the count vectors ``X`` and ``C`` are dependent.

    Parameters:
        C, X: equal-length sequences of counts; they form the two rows of
            the contingency table.
        S, M: accepted for interface compatibility; unused.
        alpha: significance level.

    Returns:
        True when the chi-square p-value is <= ``alpha``, else False.
    """
    C = np.array(C)
    X = np.array(X)
    # NOTE(review): despite the name this is Pearson's chi-square; the
    # log-likelihood (G-test) option was commented out in the original.
    g, p, dof, expected = stats.chi2_contingency(np.array([X, C]))
    # The original returned the undefined names `true` / `false`, which
    # raised NameError on every call.
    return bool(p <= alpha)
def xtab(formula, covariate_df):
    """Cross-tabulate genotype against the formula's response and test it.

    The genotype-by-outcome table is tested under three encodings:
    additive (table as is), dominant (0 alts vs. any alts) and recessive
    (fewer than 2 alts vs. 2 alts).

    Returns:
        dict holding the additive table (``DataFrame.to_dict()``) plus
        ``'p.chi.<model>'`` entries, formatted with ``%.3g`` or the string
        ``'nan'`` when a model cannot be tested; ``None`` when the table
        does not have the expected two outcome columns.
    """
    y, _ = patsy.dmatrices(str(formula), covariate_df)
    X = patsy.dmatrix('genotype', covariate_df)
    ix = get_genotype_ix(X)

    tbl = pd.crosstab(X[:, ix], y.ravel())
    try:
        tbl.columns = ['%s_%i' % (y.design_info.column_names[-1], j) for j in range(2)]
    except Exception:
        return None # too few samples
    tbl.index = ['%i_alts' % i for i in tbl.index]
    alts = set(tbl.index)

    # Boolean row masks replace DataFrame.ix, which pandas has removed.
    if len(alts) < 2 or '0_alts' not in alts:
        tbl_dom = None
    else:
        has_alt = tbl.index != '0_alts'
        tbl_dom = pd.DataFrame({'0_alts': tbl.loc['0_alts', :],
                                'n_alts': tbl.loc[has_alt, :].sum()}).T

    # can't test recessive without any homoz alts.
    if '2_alts' not in alts or len(alts) < 2:
        tbl_rec = None
    else:
        # isin() tolerates a missing '1_alts' row instead of raising.
        lt2 = tbl.index.isin(['0_alts', '1_alts'])
        tbl_rec = pd.DataFrame({'lt2_alts': tbl.loc[lt2, :].sum(),
                                '2_alts': tbl.loc['2_alts', :]})

    d = {}
    for name, xtbl in (('additive', tbl), ('dominant', tbl_dom), ('recessive', tbl_rec)):
        if xtbl is None:
            d['p.chi.%s' % name] = 'nan'
            continue

        chi, p, ddof, e = chi2_contingency(xtbl)
        if name == 'additive':
            # Additive is always first, so this seeds the result dict.
            d = xtbl.to_dict()
        d['p.chi.%s' % name] = "%.3g" % p
    return d
def calculate_associations(self, covariate='passage', lookup=None):
    '''
    Calculate the association of amino acid state and sequence properties
    such as passage.

    covariate -- name of the node attribute to associate with; terminal
                 nodes lacking it are ignored
    lookup    -- optional function mapping the raw attribute value to a
                 category (identity by default)

    For every (protein, position) the result is stored in
    self.associations[(prot, pos)]: a dict with the joint counts keyed by
    (amino acid, covariate state), plus 'contingency matrix', 'aa',
    'covariates' and 'g_test' (statistic and p-value of the log-likelihood
    chi-square test).
    '''
    if not hasattr(self, 'mutation_count'):
        self.count_mutations_per_site()

    # calculate associations
    from scipy.stats import chi2_contingency
    self.associations = {}
    if lookup is None:
        lookup = lambda x: x

    # loop over all positions (currently rather clumsy)
    # NOTE(review): `mutation_dict` is not defined in this method; presumably
    # a module-level name -- confirm against the rest of the file.
    for prot, pos in mutation_dict:
        assoc = defaultdict(int)
        for node in self.tree.get_terminals():  # was `selt.tree` (NameError)
            # Read the requested covariate, not a hard-coded `passage`.
            if hasattr(node, covariate):
                state = (node.translations[prot][pos-1],
                         lookup(getattr(node, covariate)))
                assoc[state] += 1

        # make contingency matrix
        aa_states = sorted(set([x[0] for x in assoc]))
        cov_states = sorted(set([x[1] for x in assoc]))
        # np.zeros needs the dimensions, not the state lists themselves.
        contingeny_matrix = np.zeros((len(aa_states), len(cov_states)))
        for a, c in assoc:
            contingeny_matrix[aa_states.index(a), cov_states.index(c)] = assoc[(a, c)]
        g, p, dof, expctd = chi2_contingency(contingeny_matrix, lambda_="log-likelihood")

        assoc['contingency matrix'] = contingeny_matrix
        assoc['aa'] = aa_states
        assoc['covariates'] = cov_states
        assoc['g_test'] = (g, p)

        self.associations[(prot, pos)] = assoc
def MK_test(SNPs, test_mode):
    '''
    (dict, str) -> dict
    Take a dict of gene : [PN, PS, DN, DS] pairs and a string ('fisher' or
    'G_test') and return a new dict of gene : [PN, PS, DN, DS, p-val]
    pairs.  PN and DN are replacement polymorphisms and divergence, PS and
    DS are synonymous polymorphisms and divergence, and p-val is the p-value
    of the contingency test, using either Fisher's two-sided exact test or
    the G-test with Yates' correction.

    Raises ValueError if test_mode is not one of the two supported values.
    '''

    # create new dict
    MK = {}

    # loop over genes in dict
    for gene in SNPs:
        # initialize list with PN, PS
        polym = [SNPs[gene][0], SNPs[gene][1]]
        # initialize list with DN, DS
        diverg = [SNPs[gene][2], SNPs[gene][3]]
        # perform the MK test according to fisher 2-tailed or G-test
        if test_mode == 'fisher':
            # get the p-value
            P = stats.fisher_exact([polym, diverg])[1]
        elif test_mode == 'G_test':
            P = stats.chi2_contingency([polym, diverg], lambda_ = 'log-likelihood')[1]
        else:
            # Previously P stayed unbound (or kept a stale value), giving a
            # confusing NameError instead of a clear message.
            raise ValueError(
                "test_mode must be 'fisher' or 'G_test', got %r" % (test_mode,))
        # add p-val to list
        MK[gene] = list(SNPs[gene])
        MK[gene].append(P)

    return MK
def contingency_table(self, dead_strains, live_strains, output_file):
    """Write chi-square tests of dead vs. live strains per interval pair.

    For every subspecies combination and every ordered pair of elementary
    intervals (i < j) with a non-zero count in both groups, a 2x2 table
    (observed / not observed x dead / live) is tested and one CSV row is
    written to ``output_file`` with the positions, origins, statistic and
    p-value.
    """
    elem_intervals = self.make_elementary_intervals(
        [self.sample_dict[sn][0] for sn in dead_strains + live_strains]
    )
    num_dead = len(dead_strains)
    num_live = len(live_strains)
    dead_observed = self.build_pairwise_matrix(dead_strains, elem_intervals)
    live_observed = self.build_pairwise_matrix(live_strains, elem_intervals)

    with open(output_file, 'w+') as fp:
        writer = csv.writer(fp)
        writer.writerow(['Proximal chromosome', 'Proximal start', 'Proximal end',
                         'Distal chromosome', 'Distal start', 'Distal end',
                         'Proximal origin', 'Distal origin',
                         'chi squared', 'p-value'])
        # Prepend 0 so that interval i spans elem_intervals[i]..[i+1].
        elem_intervals.insert(0, 0)
        num_intervals = len(elem_intervals) - 1
        # range() rather than xrange(): the latter does not exist on Python 3.
        for combo in range(subspecies.NUM_SUBSPECIES**2):
            for i in range(num_intervals):
                for j in range(i + 1, num_intervals):
                    dead_count = dead_observed[combo, i, j]
                    live_count = live_observed[combo, i, j]
                    if dead_count and live_count:
                        contingency = np.array(
                            [[dead_count, live_count],
                             [num_dead - dead_count, num_live - live_count]])
                        chi_squared, p, _, _ = stats.chi2_contingency(contingency)
                        proximal_pos = self.chrom_and_pos(elem_intervals[i], elem_intervals[i+1])
                        distal_pos = self.chrom_and_pos(elem_intervals[j], elem_intervals[j+1])
                        writer.writerow(proximal_pos + distal_pos +
                                        (subspecies.proximal(combo),
                                         subspecies.distal(combo),
                                         chi_squared, p))
def myChisquare(self, values):
    """Chi-square contingency test that refuses sparse tables.

    Rows of ``values`` made up entirely of zeros are dropped first.  If any
    expected frequency is below 5 the test is treated as unreliable and
    ``(0.0, 1.0)`` is returned; otherwise ``(chi2, p)``.
    """
    # Uses chisquare
    informative_rows = []
    for pair in values:
        if np.all(np.array(pair) == 0):
            continue
        informative_rows.append(pair)

    chi2, p, dof, ex = chi2_contingency(informative_rows)
    if (ex < 5).any():
        return 0.0, 1.0
    return chi2, p
def chi(data1,data2):
    """Return the chi-square statistic for the two count rows.

    ``data1`` and ``data2`` form the rows of the contingency table.  If the
    test cannot be computed (for example a zero expected frequency),
    'Chi2 error' is printed and NaN is returned.
    """
    obs = np.array([data1,data2])
    try:
        chi2, p, dof, expected = stats.chi2_contingency(obs)
    except Exception:
        # The original fell through to `return chi2` with chi2 unbound,
        # turning the reported error into an UnboundLocalError.
        print('Chi2 error')
        return float('nan')
    return chi2
def chi_square_of_df_cols(self, df, col1, col2):
    """Chi-square contingency test between two columns of ``df``.

    The table has one row per category of ``col1`` and one column per
    category of ``col2`` (as given by ``self.categories``); each cell counts
    the rows of ``df`` in both categories.  Returns the result of
    ``scipy.stats.chi2_contingency`` unchanged.
    """
    first, second = df[col1], df[col2]
    table = []
    for cat1 in self.categories(first):
        in_cat1 = first == cat1
        counts = []
        for cat2 in self.categories(second):
            counts.append(sum(in_cat1 & (second == cat2)))
        table.append(counts)
    return stats.chi2_contingency(table)
def chiSqQuant(x, y, num_states_x, num_states_y):
    """Chi-square dependence between two discretised variables.

    Returns ``(p_value, statistic)``.  When either variable has a single
    state no test is possible and ``(1, 0)`` is returned.
    """
    single_state = num_states_x == 1 or num_states_y == 1
    if single_state:
        return (1, 0)

    # Shift both variables so their smallest value becomes state 0.
    shifted_x = x - min(x)
    shifted_y = y - min(y)
    n_mat = hist3(shifted_x, shifted_y, range(num_states_x), range(num_states_y))
    outcome = chi2_contingency(n_mat)
    T, result = outcome[0], outcome[1]
    return (result, T)
def doHitProcess(inp):
    """Compare hit proportions of one item between two files.

    ``inp`` is ``(idx, hits, n_f1_hits, n_f2_hits)`` with ``hits`` holding
    the item's hit counts in file 1 and file 2.

    Returns ``None`` when both counts are zero, otherwise the tuple
    ``(idx, ratio, hits1, prop1, hits2, prop2, chi2, p)`` where ``ratio`` is
    prop2 / prop1 rounded to 3 places.  When only one count is zero, the
    ratio is 999.0 (no file-1 hits) or 0.0 (no file-2 hits) and the test
    fields are 'NA'.
    """
    idx, hits, n_f1_hits, n_f2_hits = inp
    first, second = hits[0], hits[1]

    if first == 0:
        if second == 0:
            return
        return idx, 999.0, 0, 0, second, float(second)/float(n_f2_hits), 'NA', 'NA'
    if second == 0:
        return idx, 0.0, first, float(first)/float(n_f1_hits), 0, 0, 'NA', 'NA'

    h1_p = float(first)/float(n_f1_hits)
    h2_p = float(second)/float(n_f2_hits)
    table = [[second, n_f2_hits-second], [first, n_f1_hits-first]]
    chi, pvalue, _, _ = stats.chi2_contingency(table)
    return idx, round(h2_p/h1_p, 3), first, h1_p, second, h2_p, chi, pvalue
def first_sec(stats):
    """Chi-square test on a 2x3 table built from ``stats`` records.

    Each record's field 2 is added (via ``add_data``) to the first row when
    its field 5 equals 1, and to the second row otherwise.  Returns the
    first two items of the ``chi2_contingency`` result (statistic, p-value).
    """
    obs = [[0, 0, 0], [0, 0, 0]]
    for record in stats:
        row = obs[0] if record[5] == 1 else obs[1]
        add_data(record[2], row)
    outcome = chi2_contingency(obs)
    return outcome[0:2]
def chi2_homogeneity(c_tbl):
    """Chi-square test of homogeneity on the contingency table ``c_tbl``.

    Thin wrapper: returns the result of ``chi2_contingency`` unchanged.
    """
    outcome = chi2_contingency(c_tbl)
    return outcome
# Load the search log (one JSON record per line) and split users by the
# parity of their uid.
searchdata_file = '../data/searches.json'
searches = pd.read_json(searchdata_file, lines=True)
odd_id = searches[(searches['uid'] % 2 != 0)]
even_id = searches[(searches['uid'] % 2 == 0)]
# Within each group, separate users with at least one search from users
# with none.
odds_searched = odd_id[(odd_id['search_count'] > 0)]
odd_unsearched = odd_id[(odd_id['search_count'] == 0)]
evens_searched = even_id[(even_id['search_count'] > 0)]
evens_unsearched = even_id[(even_id['search_count'] == 0)]
"ANALYSIS"
# 2x2 table: rows are odd / even uid, columns are searched / never searched.
obs1 = np.array([[odds_searched.shape[0], odd_unsearched.shape[0]], [evens_searched.shape[0], evens_unsearched.shape[0]]])
chi = (chi2_contingency(obs1))
# Compare the search_count distributions of the two groups.
mannwhitneyu = stats.mannwhitneyu(odd_id['search_count'], even_id['search_count'])
# NOTE(review): the triple-quoted string opened below is not closed within
# this span, so its text is left exactly as found.
""" # INFUSER DOES NOT ACCEPT THE FOLLOWING LINES: UNABLE TO JUDGE TYPE FOR EXPRESSION odds_searched = odds_searched[(odds_searched['is_instructor'] == True)] odd_unsearched = odd_unsearched[(odd_unsearched['is_instructor'] == True)] evens_searched = evens_searched[(evens_searched['is_instructor'] == True)] evens_unsearched = evens_unsearched[(evens_unsearched['is_instructor'] == True)] odd_id = odd_id[(odd_id['is_instructor'] == True)] even_id = even_id[(even_id['is_instructor'] == True)]
# Label the expected-count table: 5 groups (rows) by 3 parties (columns).
# `expected` and `observed` are defined before this span.
expected.columns = ["democrat", "independent", "republican"]
expected.index = ["asian", "black", "hispanic", "other", "white"]
print(expected)

# Pearson chi-square statistic: sum over all cells of (O - E)^2 / E.
chi_squared_stat = (((observed - expected)**2) / expected).sum().sum()
print(chi_squared_stat)

# df = (5 rows - 1) * (3 columns - 1) = 8
crit = stats.chi2.ppf(q=0.95,  # Find the critical value for 95% confidence*
                      df=8)  # *
print("Critical value")
print(crit)

p_value = 1 - stats.chi2.cdf(x=chi_squared_stat,  # Find the p-value
                             df=8)
print("P value")
print(p_value)

# Cross-check the manual computation against SciPy's implementation.
print(stats.chi2_contingency(observed=observed))

# NOTE(review): the first message speaks of a "difference between the
# means", but the test above is a chi-square test of independence; and the
# second message states a conclusion regardless of the computed p-value.
# Both are runtime strings and are left unchanged here.
print(
    "If the p-value is less than 0.05, we reject the null hypothesis that there's no difference between the means and conclude that a significant difference does exist"
)
print(
    "As expected, given the high p-value, the test result does not detect a significant relationship between the variables."
)
# Share of each group that picked up an application.
app_pivot['Percent with Application'] = app_pivot.Application / app_pivot.Total
app_pivot

# It looks like more people from Group B turned in an application. Why might that be?
#
# We need to know if this difference is statistically significant.
#
# Choose a hypothesis test, import it from `scipy` and perform it. Be sure to note the p-value.
# Is this result significant?

# In[36]:

from scipy.stats import chi2_contingency

# NOTE(review): presumably one row per group with [application, no
# application] counts -- confirm against `app_pivot`.
contingency = [[250, 2254], [325, 2175]]
chi2_contingency(contingency)

# ## Step 4: Who purchases a membership?
# Of those who picked up an application, how many purchased a membership?
#
# Let's begin by adding a column to `df` called `is_member` which is `Member` if `purchase_date` is not `None`, and `Not Member` otherwise.

# In[35]:

df['is_member'] = df.purchase_date.apply(lambda x: 'Member' if pd.notnull(x) else 'Not Member')

# Now, let's create a DataFrame called `just_apps` that contains only people who picked up an application.

# In[38]:
def q_counts():
    """Print significance tests for the question-mark features by stance.

    For 'q_ends' and 'q_contains', runs pairwise two-sample t-tests between
    the three stances from hard-coded summary statistics, then a chi-square
    test of the stance-by-feature cross-tabulation of the loaded data.
    """
    # Statistics : Q-Feature (mean, std, number_samples)
    for_stats = {
        'q_ends': (0.00885, 0.09388, 1238),
        'q_contains': (0.022617, 0.14874, 1238)
    }
    observing_stats = {
        'q_ends': (0.090437, 0.286955, 962),
        'q_contains': (0.133056, 0.339812, 962)
    }
    against_stats = {
        'q_ends': (0.025316, 0.157284, 395),
        'q_contains': (0.075949, 0.265253, 395)
    }

    dataset = read_clean_dataset()
    q = read_pickle_file(_feature_file_map['Q'])
    q['Stance'] = dataset.articleHeadlineStance

    def p_from_summaries(first, second):
        # Each argument is a (mean, std, number_samples) triple.
        mean1, std1, nobs1 = first
        mean2, std2, nobs2 = second
        _, p_value = ttest_ind_from_stats(mean1=mean1, std1=std1, nobs1=nobs1,
                                          mean2=mean2, std2=std2, nobs2=nobs2)
        return p_value

    for feature in ['q_ends', 'q_contains']:
        # Run the t-tests
        p_fo = p_from_summaries(for_stats[feature], observing_stats[feature])
        p_fa = p_from_summaries(for_stats[feature], against_stats[feature])
        p_ao = p_from_summaries(against_stats[feature], observing_stats[feature])

        print(f"""P-values ({feature}) 1) For - Against: {p_fa} 2) Observing - Against: {p_ao} 3) For - Observing: {p_fo}""")

        # Chi-square test for dependency between feature and stance
        contingency_table = pd.crosstab(q['Stance'], q[feature], margins=False)
        chi2_stat, p_val, dof, ex = stats.chi2_contingency(contingency_table)

        print("\n")
        print(f"""=== Chi2 Stat ({feature}) ===""")
        print(chi2_stat)
        print("\n")
        print("===Degrees of Freedom===")
        print(dof)
        print("\n")
        print("===P-Value===")
        print(p_val)
        print("\n")
        # NOTE(review): this heading says "Contingency Table" but the value
        # printed is the expected-frequency array returned by the test.
        print("===Contingency Table===")
        print(ex)
def get_bias_chi2_pvals(clf, df, feature_names, categories,
                        low=None, high=None, num=100):
    """
    Compute chi-square statistics and p-values over a sweep of thresholds.

    Parameters
    ------------
    clf : sklearn clf object
        model classifier, must have a `decision_function` or
        `predict_proba` method
    df : pandas DataFrame
        contains untransformed data
    feature_names : list of strings
        features included in the classifier
    categories : list of strings
        names of demographic columns to check, e.g. ['gender', 'ethnicity']
    low : float
        lower threshold value; defaults to the smallest decision score
    high : float
        upper threshold value; defaults to the largest decision score
    num : int
        number of thresholds to consider

    Returns
    ---------
    thresholds_to_check : range of floats
        decision thresholds obtained by np.linspace(low, high, num)
    post_chi2stat_pvals : defaultdict(list)
        maps each category to a list of (chi2 statistic, p-value) tuples,
        one per threshold
    """
    feature_matrix = df[feature_names].values

    # Everything below works on a copy, so the caller's dataframe is untouched.
    df = df.copy()
    df['decision'] = ClassifierWrapper(clf).decision_function(feature_matrix)

    # Order rows by decreasing decision score so the matches at a given
    # threshold are simply the leading rows. Older pandas only offers
    # `DataFrame.sort`; newer pandas offers `sort_values`.
    sort_rows = df.sort_values if hasattr(df, 'sort_values') else df.sort
    sorted_df = df.reindex(sort_rows('decision', ascending=False).index)

    matched_col = get_unique_name('matched', df.columns)

    # Default the sweep to the observed range of decision scores.
    low = df.decision.min() if low is None else low
    high = df.decision.max() if high is None else high

    n_samples = sorted_df.shape[0]
    thresholds_to_check = np.linspace(low, high, num)
    post_chi2stat_pvals = defaultdict(list)

    for threshold in thresholds_to_check:
        # The top (1 - threshold) proportion of rows is flagged 1 (match),
        # the remainder 0 (not match).
        # NOTE(review): this treats the threshold as a fraction; presumably
        # decision scores lie in [0, 1] -- confirm against ClassifierWrapper.
        num_matches = int(n_samples * (1 - threshold))
        num_not_matches = n_samples - num_matches
        sorted_df[matched_col] = [1] * num_matches + [0] * num_not_matches

        for category in categories:
            # Restrict the test to rows whose category value is not NaN.
            observed_vals = set(sorted_df[category].dropna())
            has_value = sorted_df[category].isin(observed_vals)
            cat_df = sorted_df[has_value]

            cat_ctabs = pd.crosstab(cat_df[matched_col], cat_df[category])
            test_result = chi2_contingency(cat_ctabs)
            post_chi2stat_pvals[category].append(
                (test_result[0], test_result[1]))

    return thresholds_to_check, post_chi2stat_pvals
# Maximum exercise heart rate (thalach) split by chest pain type (cp).
thalach_typical = heart.loc[heart.cp == 'typical angina', 'thalach']
thalach_asymptom = heart.loc[heart.cp == 'asymptomatic', 'thalach']
thalach_nonangin = heart.loc[heart.cp == 'non-anginal pain', 'thalach']
thalach_atypical = heart.loc[heart.cp == 'atypical angina', 'thalach']

# run ANOVA across the four chest pain groups
from scipy.stats import f_oneway

Fstat, pval = f_oneway(
    thalach_typical,
    thalach_asymptom,
    thalach_nonangin,
    thalach_atypical,
)
print('p-value for ANOVA: ', pval)
# There is at least one pair of chest pain types (cp) for which people with
# those pain types have significantly different average max heart rates
# during exercise (thalach).

# run Tukey's range test to see which pairs differ
from statsmodels.stats.multicomp import pairwise_tukeyhsd

output = pairwise_tukeyhsd(endog=heart.thalach, groups=heart.cp)
print(output)
# For any pair where "Reject" is "True", we conclude that people with those
# chest pain types have significantly different maximum heart rates during
# exercise.

# contingency table of heart disease vs cp
Xtab = pd.crosstab(index=heart.cp, columns=heart.heart_disease)
print(Xtab)

# run chi-square test on the contingency table
from scipy.stats import chi2_contingency

chi2, pval, dof, exp = chi2_contingency(Xtab)
print('p-value for chi-square test: ', pval)
# This is less than 0.05, so we can conclude that there is a significant
# association between these variables.
def calculate(self):
    """Plot the cumulative chi-square p-value as batches of samples accumulate.

    Reads ``self.batchsize`` (a mapping with exactly two keys, each giving the
    number of samples added per run) and ``self.df`` (a DataFrame with one
    column per ``batchsize`` key). For run ``i`` the first
    ``batchsize * (i + 1)`` samples of each column are summed to get the pass
    count, the remainder of the window is the fail count, and a 2x2
    chi-square test without continuity correction is run on the table. The
    p-values are plotted against the run number, with a dashed line at
    alpha = 0.05 and a secondary axis showing the total number of samples.

    NOTE(review): the pass/fail arithmetic presumably expects each cell to be
    a 0/1 outcome per sample -- confirm against the code that builds
    ``self.df``.

    Returns:
        None. Validation problems are printed rather than raised.
    """
    # Validation is advisory: problems are reported on stdout and execution
    # continues, matching the existing print-and-carry-on behaviour.
    if self.df.shape[1] != 2 or len(self.batchsize) != 2:
        print('Lengths of survival_rate and batchsize must be =2')
    if self.df.nunique().min() == 0:
        print('One or more columns in dataframe is empty')

    # Unpacking raises ValueError unless there are exactly two keys.
    a_key, b_key = self.batchsize.keys()
    a = self.df.loc[:, a_key]
    b = self.df.loc[:, b_key]
    a_batch = self.batchsize[a_key]
    b_batch = self.batchsize[b_key]

    # Number of complete runs both columns can supply.
    nRuns = math.floor(min(a.shape[0] / a_batch, b.shape[0] / b_batch))
    if nRuns < 1:
        # Nothing to test or plot; the plotting code below cannot handle an
        # empty series.
        print('Not enough samples to complete a single run')
        return None

    # Cumulative p-value per run; NaN marks runs where it is not determined.
    p_values = np.full(nRuns, np.nan)
    for i in range(nRuns):
        # Window sizes after i + 1 batches. Positional slicing is
        # end-exclusive, so the slice end must equal the window size for the
        # pass count to cover every sample the fail count assumes.
        a_total = a_batch * (i + 1)
        b_total = b_batch * (i + 1)
        a_pass = a.iloc[:a_total].sum()
        b_pass = b.iloc[:b_total].sum()
        table = np.array([[a_pass, a_total - a_pass],
                          [b_pass, b_total - b_pass]])
        # P value is left undetermined if any cell of the table is zero.
        if np.min(table) != 0:
            p_values[i] = stats.chi2_contingency(table, correction=False)[1]

    # Plot cumulative p values for all runs that produced one.
    x = np.arange(1, nRuns + 1)
    Cumm_P_val = pd.DataFrame(
        {'N_runs': x, 'Cummulative_P_Value': p_values}).dropna()
    ax = Cumm_P_val.plot(x='N_runs',
                         y='Cummulative_P_Value',
                         grid=True,
                         label='p value')
    # Draw the significance threshold over the same run numbers as the data.
    ax.plot(x, np.full(nRuns, 0.05), color='red', ls="--",
            label='alpha=0.05')
    ax.set_title("Chi2 Results")
    ax.set_xlabel('N Runs')
    ax.set_ylabel('P Value')
    ax.set_xticks(np.arange(0, nRuns + 1, 10))
    ax.legend()

    samples_per_run = a_batch + b_batch

    def run2samples(x):
        return x * samples_per_run

    def samples2run(x):
        return x / samples_per_run

    # Secondary axis converts run number to total samples tested.
    secax = ax.secondary_xaxis('top', functions=(run2samples, samples2run))
    secax.set_xlabel('Total Samples tested')
    plt.show()
    return None
# Also not very fruitful # What is the overall conversion rate for each group? conversions = tests.groupby('price').aggregate( conversion_rate=('converted', lambda x: sum(x) / len(x)), conversion_count=('converted', 'sum'), nonconversion_count=('converted', lambda x: len(x) - sum(x)), visitor_count=('user_id', 'count')) conversions = conversions.reset_index() conversions['revenue_per_visitor'] = conversions[ 'conversion_count'] * conversions['price'] / conversions['visitor_count'] print(conversions) # Even with the decrease in conversion rate, the revenue earned per visitor is up by $0.14 # Is the difference in conversion rate significant? chi2, pvalue, dof, ex = chi2_contingency( conversions[['conversion_count', 'nonconversion_count']].transpose()) print( 'The decreased conversion rate of {:.3f} is statististically significant with p={:.3f}' .format(conversions['conversion_rate'].diff().max(), pvalue)) # Is the difference in revenue significant? # this doesn't seem like a valid question to ask here because it's just another version of "are these two numbers different?" # Plot all of the data! # Boxplots of each variable by conversion tests_melted = tests.melt( id_vars=['user_id', 'timestamp', 'converted', 'test', 'price']) tests_melted_conversion_rate = tests_melted.groupby( ['variable', 'value', 'test']).agg(conversion_rate=('converted', lambda x: sum(x) / len(x)),