def bandwidth_suppression_from_peak(tuningDict, subtractBaseline=False):
    """Compare the peak response with the responses at the two ends of each tuning curve.

    For every column of ``tuningDict['responseArray']`` two indices are computed:

    * suppression  = (peak - last row)  / (peak - baseline)
    * facilitation = (peak - first row) / (peak - baseline)

    and the spike counts of the trials at the peak row are compared (rank-sum
    test) against the trials of the last row and of the first row.

    Args:
        tuningDict: mapping with keys ``'responseArray'``, ``'baselineSpikeRate'``,
            ``'spikeCountMat'`` and ``'trialsEachCond'``.
            # assumes trialsEachCond is a boolean (trial, row, column) array - TODO confirm
        subtractBaseline: when False the baseline used in the denominators is 0.

    Returns:
        dict with ``'suppressionIndex'``, ``'suppressionpVal'``,
        ``'facilitationIndex'`` and ``'facilitationpVal'``, one entry per column.
    """
    responses = tuningDict['responseArray']
    baseline = tuningDict['baselineSpikeRate']
    spikeCounts = tuningDict['spikeCountMat']
    if not subtractBaseline:
        baseline = 0

    nColumns = responses.shape[1]
    suppressionIndex = np.zeros(nColumns)
    facilitationIndex = np.zeros_like(suppressionIndex)
    suppressionpVal = np.zeros_like(suppressionIndex)
    facilitationpVal = np.zeros_like(suppressionIndex)

    for col in range(nColumns):
        tuning = responses[:, col]
        peak = max(tuning)
        suppressionIndex[col] = (peak - tuning[-1]) / (peak - baseline)
        facilitationIndex[col] = (peak - tuning[0]) / (peak - baseline)

        # Boolean trial selectors for this column, one per row of the tuning curve.
        trialMasks = tuningDict['trialsEachCond'][:, :, col]
        peakRow = np.argmax(tuning)
        countsAtPeak = spikeCounts[trialMasks[:, peakRow]].flatten()
        countsAtLast = spikeCounts[trialMasks[:, -1]].flatten()
        countsAtFirst = spikeCounts[trialMasks[:, 0]].flatten()

        suppressionpVal[col] = stats.ranksums(countsAtPeak, countsAtLast)[1]
        facilitationpVal[col] = stats.ranksums(countsAtPeak, countsAtFirst)[1]

    return {
        'suppressionIndex': suppressionIndex,
        'suppressionpVal': suppressionpVal,
        'facilitationIndex': facilitationIndex,
        'facilitationpVal': facilitationpVal,
    }
def testNTDifference(oFN, minAvgPhast = .90, minSNR = 2, oRNAType = 'oRNA'):
    """Rank-sum test of per-position phast scores between two position groups.

    Loads ``phastScores`` and ``snrSS`` from the Nexus file ``oFN``, drops
    entries whose average phast score is below ``minAvgPhast``, whose SNR is
    below ``minSNR``, or whose average score is exactly 1.00, then pools the
    scores found at 1-based positions 10-13 (group A) and 4-7 (group B).

    Prints the number of pooled scores divided by four for each group,
    followed by the result of ``stats.ranksums`` on the two pools.

    Args:
        oFN: path handed to ``cgNexusFlat.Nexus``.
        minAvgPhast: minimum average phast score for an entry to be used.
        minSNR: minimum ``snrSS`` value for an entry to be used.
        oRNAType: accepted for interface compatibility; not used here.
    """
    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load(['phastScores', 'snrSS'])

    groupA = [10, 11, 12, 13]
    #groupB = [15,16,17,18]
    groupB = [4, 5, 6, 7]

    a = []
    b = []
    for oID in oNX.phastScores:
        scores = oNX.phastScores[oID]
        avgScore = sum(scores) / float(len(scores))

        # filter
        if (avgScore < float(minAvgPhast)) or (oNX.snrSS[oID] < float(minSNR)):
            continue
        if avgScore == 1.00:
            continue

        for i, pScore in enumerate(scores):
            if (i + 1) in groupA:
                a.append(pScore)
            if (i + 1) in groupB:
                b.append(pScore)

    # Single-argument print calls behave identically on Python 2 and 3;
    # floor division keeps the integer counts the Python 2 code printed.
    print("%s %s" % (len(a) // 4, len(b) // 4))
    print(stats.ranksums(a, b))
def WilcoxonTest(Original_input, Symbiotic_output, Modified_GA_output):
    '''
    Pick whichever candidate output is closer to Original_input according to
    the Wilcoxon rank-sum test (the larger p-value wins).

    Args:
        Original_input: Data with N features
        Symbiotic_output: Data with N-1 features extracted using symbiotic algorithm
        Modified_GA_output: Data with N-1 features extracted using Modified Genetic algorithm
    Returns:
        -1 if both p-values are below 1e-300 (neither candidate is usable),
         1 if Modified_GA_output has the strictly larger p-value,
         0 otherwise (Symbiotic_output is preferred, including ties).
    '''
    z_stat_for_symbiotic, p_val_for_symbiotic = stats.ranksums(
        Symbiotic_output, Original_input)
    z_stat_for_GA, p_val_for_GA = stats.ranksums(
        Modified_GA_output, Original_input)

    # Single-string print calls give the same output on Python 2 and 3.
    print("%s %s" % (p_val_for_symbiotic, p_val_for_GA))

    if max(p_val_for_GA, p_val_for_symbiotic) < 1e-300:
        return -1
    if p_val_for_GA > p_val_for_symbiotic:
        print("Forest one is better")
        return 1
    print("Symbiotic is better")
    return 0
def rank_sum_3_sites(measurements, details=False):
    """Order the 1-based labels 1..8 by pairwise rank-sum comparisons.

    Neighbouring labels are swapped whenever the rank-sum statistic of the
    left sample against the right sample is negative, and passes repeat until
    a full pass makes no swap. Label ``k`` refers to ``measurements[k - 1]``.

    Args:
        measurements: indexable collection of samples.
        details: when true, print the rank-sum result of every neighbouring
            pair in the final order.

    Returns:
        The list of labels in their final order.
    """
    output = [1, 2, 3, 4, 5, 6, 7, 8]
    pair_count = len(measurements) - 1

    swapped = True
    while swapped:
        swapped = False
        for pos in range(pair_count):
            left, right = output[pos], output[pos + 1]
            if ranksums(measurements[left - 1], measurements[right - 1])[0] < 0:
                output[pos], output[pos + 1] = right, left
                swapped = True

    if details:
        for pos in range(pair_count):
            print(ranksums(measurements[output[pos] - 1],
                           measurements[output[pos + 1] - 1]))
    return output
def getCompArray(self, datasetA, datasetB, plot):
    """Compare two datasets grid cell by grid cell.

    Walks the grid column-major (``gridWidth`` outer, ``gridHeight`` inner).
    A cell is compared when the nan-mean of result 0 exceeds one in at least
    one of the two datasets; otherwise it yields three NaNs.

    Note: this switches the process-wide warning filter to "error" so that a
    ``UserWarning`` raised during the rank-sum test can be caught below.

    Returns:
        list of ``(mean of A, mean of B, rank-sum p-value)`` tuples, one per
        cell; the p-value is NaN when the test raised ``UserWarning``.
    """
    warnings.filterwarnings("error")
    aggregateDataA, offScreen = datasetA.getAggregateData()
    aggregateDataB, offScreen = datasetB.getAggregateData()

    results = []
    for col in range(self.params['gridWidth']):
        for row in range(self.params['gridHeight']):
            cellA = aggregateDataA[col][row]
            cellB = aggregateDataB[col][row]
            setA = cellA.getResult(plot)
            setB = cellB.getResult(plot)

            enoughCounts = (st.nanmean(cellA.getResult(0)) > 1
                            or st.nanmean(cellB.getResult(0)) > 1)
            if not enoughCounts:
                results.append((numpy.nan, numpy.nan, numpy.nan))
                continue

            try:
                mww_z, p = stats.ranksums(setA, setB)
            except UserWarning:
                p = numpy.nan
            results.append((st.nanmean(setA), st.nanmean(setB), p))
    return results
def printBoxData(self, datasets, boxCoord, plot):
    """Print summary statistics and pairwise rank-sum tests for one grid box.

    For every dataset the nan-mean, standard deviation and size of the data
    in box ``boxCoord`` are printed, followed by the same summary over the
    per-dataset means. Then every pair of datasets is compared with
    ``stats.ranksums`` and reported as a difference when p <= 0.05.

    Args:
        datasets: objects offering ``getAggregateDataAsArray(plot)`` and ``label``.
        boxCoord: two-item index into the aggregate array.
        plot: selector forwarded to ``getAggregateDataAsArray``.
    """
    print("Box " + str(boxCoord))
    means = []
    boxes = []  # box data per dataset, fetched once and reused for the pairwise tests
    print("Mean, StdDev, n")
    for ds in datasets:
        boxdata = ds.getAggregateDataAsArray(plot)[boxCoord[0]][boxCoord[1]]
        boxes.append(boxdata)
        means.append(st.nanmean(boxdata))
        print(str(st.nanmean(boxdata)) + ", " + str(numpy.std(boxdata)) + ", " + str(len(boxdata)))
    print("-----")
    print(str(st.nanmean(means)) + ", " + str(numpy.std(means)) + ", " + str(len(means)))

    for i in range(len(datasets)):
        dsA = datasets[i]
        for j in range(i + 1, len(datasets)):
            dsB = datasets[j]
            try:
                mww_z, p = stats.ranksums(boxes[i], boxes[j])
            except UserWarning:
                # Only reachable when warnings are configured to raise;
                # treated as "no evidence of a difference".
                p = 1
            if p <= 0.05:
                print("Difference between " + dsA.label + " and " + dsB.label + ". p = " + str(p))
            else:
                print("Nothing between " + dsA.label + " and " + dsB.label + "(p=" + str(p) + ")")
def stats(d_lengths, dn,):
    """Compare distances of even vs. odd length differences and log the results.

    ``d_lengths`` maps ``dist_min -> {len_diff: count}``. Each ``dist_min`` is
    repeated ``count`` times into an "even" or "odd" pool depending on the
    parity of ``len_diff``. The comparison runs twice: once with every entry
    and once skipping ``len_diff == 1``. For each run three lines are appended
    to the file ``stats`` in the working directory: the Mann-Whitney U
    result, the rank-sum result and the two pool averages.

    Args:
        d_lengths: nested mapping described above.
        dn: label written at the end of every output line.
    """
    # Local import: inside this function the name ``stats`` must refer to
    # scipy's module, not to this function.
    from scipy import stats

    for bool_skip in [False, True]:
        even = []
        odd = []
        for dist_min in d_lengths.keys():
            for len_diff in d_lengths[dist_min].keys():
                if bool_skip == True and len_diff == 1:
                    continue
                if len_diff % 2 == 0:
                    even += d_lengths[dist_min][len_diff] * [dist_min]
                else:
                    odd += d_lengths[dist_min][len_diff] * [dist_min]

        # Computed before the file is opened so a failing test does not create it.
        u, p = stats.mannwhitneyu(even, odd)
        # One handle for all three lines; ``with`` closes it even if a later step raises.
        with open('stats', 'a') as fd:
            fd.write('mannwhitneyu u %s p %s %s %s\n' % (u, p, dn, bool_skip))

            z, p = stats.ranksums(even, odd)
            fd.write('ranksums z %s p %s %s %s\n' % (z, p, dn, bool_skip))

            average_even = sum(even) / len(even)
            average_odd = sum(odd) / len(odd)
            fd.write('average even %s odd %s %s %s\n' % (average_even, average_odd, dn, bool_skip))

    return
def compute_ranksum_p(start_gs,last_gs):
    """Rank-sum test of member vs. non-member gene scores for a slice of gene sets.

    Reads the gzip file ``gene_sets_discrete`` (tab separated; column 0 is the
    gene-set identifier, column 2 a ``|``-separated gene list) and processes
    the lines ``start_gs`` up to (not including) ``last_gs``. For each gene
    set the scores in the matching column of ``reconstituted_gene_sets_df``
    are split into genes inside and outside the set, compared with
    ``ranksums`` and Welch's t-test, and a summary line is printed.

    The rank-sum p-values are written to
    ``<outfile_prefix>_<start_gs>_<last_gs>.tab`` as ``id<TAB>p`` lines.

    Args:
        start_gs: index of the first gene-set line to process.
        last_gs: index one past the last line to process (clamped to the file length).
    """
    res = {}
    with gzip.open(gene_sets_discrete,"r") as infile:
        gene_set_lines = infile.readlines()

    scores_df = reconstituted_gene_sets_df
    for line in gene_set_lines[start_gs:min(last_gs,len(gene_set_lines))]:
        words = line.strip().split("\t")
        # A set gives O(1) membership tests inside the per-gene loop.
        gs_genes = set(words[2].split("|"))
        # The column position is the same for every gene, so look it up once.
        col_pos = scores_df.columns.get_loc(words[0])

        # Stratify
        gs_gene_scores = []
        other_genes_scores = []
        for g in scores_df.index:
            score = scores_df.iloc[scores_df.index.get_loc(g), col_pos]
            if g in gs_genes:
                gs_gene_scores.append(score)
            else:
                other_genes_scores.append(score)

        # Test
        z, p1 = ranksums(gs_gene_scores, other_genes_scores)
        t, p2 = ttest_ind(gs_gene_scores, other_genes_scores, equal_var=False)
        print("{}: gs_median={}, other_median={}, p_utest={} p_ttest={})".format(words[0],numpy.median(gs_gene_scores),numpy.median(other_genes_scores),p1,p2))
        res[words[0]] = p1

    # Write to file
    with open("{}_{}_{}.tab".format(outfile_prefix,start_gs,last_gs), "w") as f:
        for gs_id in res:
            f.write("{}\t{}\n".format(gs_id,res[gs_id]))
def testRankSum(self, ctrlData, expData): result=[] for k in range(ctrlData.shape[1]): result.append(ranksums(ctrlData[:,k], expData[:,k])[self.index]) return result
def ROC_base(X,I,m,n,cat0,cat1,y):
    """Build one result row per row of X and return the rows sorted descending.

    When ``y`` is an empty list, labels default to ``m`` zeros followed by
    ``n`` ones. Only positions labelled 0 or 1 are used. Each result row is
    the tail (from index 4) of ``ROC_ligne`` on the kept values, followed by
    ``I[i]``, the rank-sum p-value between the two label groups, and
    ``non_nul(X, y)[i]``.
    """
    if y == []:
        y = [0] * m + [1] * n

    # Labels restricted to the two categories, normalised to the ints 0 and 1.
    y2 = [0 if label == 0 else 1 for label in y if label == 0 or label == 1]

    res = []
    for i, x in enumerate(X):
        kept = []
        group0 = []
        group1 = []
        for j, value in enumerate(x):
            label = y[j]
            if label == 0:
                kept.append(value)
                group0.append(value)
            if label == 1:
                kept.append(value)
                group1.append(value)

        Wilcoxon = ranksums(group0, group1)
        #mannwhitneyu(x, y, use_continuity=True)
        row = ROC_ligne(kept, m, n, cat0, cat1, y=y2)[4:] + I[i] + [Wilcoxon[1]] + non_nul(X, y)[i]
        res.append(row)

    return sorted(res, reverse=True)
def do_significance_test(tpx_feature, test="Wilcoxon Ranksum"):
    """
    Do significance testing to see if the two distributions differ significantly.
    If p <= 0.05, we are highly confident that the distributions differ significantly.

    The metadata table (``md_csv``) and ``tpx-corpus-counts.csv`` are read from
    ``wdir`` and joined on their index; rows are split by the metadata column
    ``subgenre_hist`` into "historical" and "not_historical".

    Arguments:
    tpx_feature (string): Name of the temporal expression feature to test
    test (string): which test to do: "Mann Whitney" / "Mann Whitney U" for the
        Mann-Whitney U test; any other value runs the Wilcoxon rank-sum test.

    Returns:
    The result object of the selected scipy test.
    """
    # DataFrame.from_csv no longer exists in current pandas; read_csv with
    # index_col=0 and parse_dates=True is its documented equivalent.
    md_table = pd.read_csv(os.path.join(wdir, md_csv), header=0,
                           index_col=0, parse_dates=True)
    ht_table = pd.read_csv(os.path.join(wdir, "tpx-corpus-counts.csv"), header=0,
                           index_col=0, parse_dates=True)
    working_table = ht_table.join(md_table)

    # get data points
    data = copy.copy(working_table[tpx_feature])

    # get ids of historical novels
    idnos_hist = md_table[md_table["subgenre_hist"] == "historical"].index.tolist()
    # get ids of non-historical novels
    idnos_not_hist = md_table[md_table["subgenre_hist"] == "not_historical"].index.tolist()

    # split data into subgroups
    data_hist = data[idnos_hist]
    data_not_hist = data[idnos_not_hist]

    # Accept the spelling used in the documentation as well as the original one.
    if test in ("Mann Whitney", "Mann Whitney U"):
        test_stat = stats.mannwhitneyu(data_hist, data_not_hist)
    else:
        # do Wilcoxon Ranksum by default
        test_stat = stats.ranksums(data_hist, data_not_hist)
    return test_stat
def append_wilcoxmann(df,columns,multitest):
    """Append rank-sum p-value columns (and optionally combined p-values) to ``df``.

    For each entry of ``columns`` every row's list of values is tested with
    ``ranksums`` against the pooled values of the whole column. The p-values
    are inserted as ``<group>^wmannpvals``, where the group name comes from
    ``design.run.ratios`` at the same position as the column.

    When ``multitest`` is true, the per-group p-values of each row are
    combined with Fisher's method (``fisher_combined_wmannpval``) and then
    corrected with ``bh_correct`` (``benj_hoch_corrected_wmannpval``).

    Args:
        df: DataFrame whose cells in ``columns`` hold iterables of numbers.
        columns: column names to test.
        multitest: whether to add the combined and corrected columns.

    Returns:
        The same DataFrame, modified in place.
    """
    pvalcols = []
    groups = [ratcol+"_ratio" for ratcol in design.run.ratios]
    for cntr, col in enumerate(columns):
        data = df[col].values
        # The pooled background is identical for every row: build it once
        # instead of re-flattening the column per row.
        background = [item for sublist in data for item in sublist]
        pvals = [ranksums(list(vals), background)[1] for vals in data]
        colname = groups[cntr].replace("_ratio","")+"^wmannpvals"
        df.insert(len(df.columns), colname, pvals)
        pvalcols.append(colname)
    log("P-values calculated for following groups: "+str(columns),1)

    if multitest:
        # ``.values`` replaces the removed ``get_values()`` accessor.
        combined = [combine_pvalues(row, method='fisher', weights=None)[1]
                    for row in df[pvalcols].values]
        log("Fisher combined p_value test completed",1)
        df.insert(len(df.columns),'fisher_combined_wmannpval',combined)

        bh_corrected = bh_correct(dict(zip(df.index.values,df['fisher_combined_wmannpval'].values)))
        corrected_vals = [bh_corrected[k] for k in df.index.values]
        df.insert(len(df.columns),'benj_hoch_corrected_wmannpval',corrected_vals)
        log("Benjamini-hochberg correction successfully applied to combined p-values",1)
    return df
def select_feature(x,y):
    """Rank features by rank-sum p-value between the y == 0 and y == 1 rows.

    Fills the module-level ``pvalues`` for the first ``feature_number``
    columns of ``x`` and returns the indices of the ``top_n`` smallest
    p-values among ``len(feature_name)`` features (ties keep index order).
    """
    is_class0 = y == 0
    is_class1 = y == 1
    for col in range(0, feature_number):
        pvalues[col] = ranksums(x[is_class0, col], x[is_class1, col])[1]

    ranking = list(range(len(feature_name)))
    ranking.sort(key=lambda idx: pvalues[idx])
    return ranking[0:top_n]
def bandwidth_suppression_by_bins(tuningDict, lowBandInds=[1,2], highBandInds=[5,6], subtractBaseline=False):
    """Contrast pooled spike counts of a low group and a high group of rows.

    For each column of ``tuningDict['responseArray']`` the spike counts of
    all trials belonging to rows ``lowBandInds`` are pooled, likewise for
    ``highBandInds``, and the index

        (mean_low - mean_high) / (mean_low + mean_high - 2 * baseline)

    is computed together with the rank-sum p-value between the two pools.

    Args:
        tuningDict: mapping with ``'responseArray'``, ``'spikeCountMat'``,
            ``'baselineSpikeRate'`` and ``'trialsEachCond'``.
        lowBandInds: row indices pooled into the low group.
        highBandInds: row indices pooled into the high group.
        subtractBaseline: when False the baseline in the denominator is 0.

    Returns:
        dict with ``'suppressionIndex'`` and ``'suppressionpVal'`` arrays.
    """
    responses = tuningDict['responseArray']
    spikeCounts = tuningDict['spikeCountMat']
    baseline = tuningDict['baselineSpikeRate']
    if not subtractBaseline:
        baseline = 0

    suppressionIndex = np.zeros(responses.shape[1])
    suppressionpVal = np.zeros_like(suppressionIndex)

    for col in range(len(suppressionIndex)):
        trialMasks = tuningDict['trialsEachCond'][:, :, col]

        lowCounts = [count
                     for row in lowBandInds
                     for count in spikeCounts[trialMasks[:, row]].flatten()]
        highCounts = [count
                      for row in highBandInds
                      for count in spikeCounts[trialMasks[:, row]].flatten()]

        numerator = np.mean(lowCounts) - np.mean(highCounts)
        denominator = np.mean(lowCounts) + np.mean(highCounts) - 2 * baseline
        suppressionIndex[col] = numerator / denominator
        suppressionpVal[col] = stats.ranksums(lowCounts, highCounts)[1]

    return {'suppressionIndex': suppressionIndex,
            'suppressionpVal': suppressionpVal}
def rank_sum_n_sites(measurements, details=False):
    """Order genotypes by pairwise rank-sum comparisons of their measurements.

    ``measurements`` maps genotype -> sequence of values and must contain a
    power-of-two number of genotypes; otherwise a message is printed and the
    process exits via ``sys.exit()``. Neighbouring genotypes are swapped
    whenever the rank-sum statistic of the left against the right is
    negative, repeating until a full pass makes no swap.

    Args:
        measurements: dict of genotype -> measurements.
        details: when true, return ``[genotype, mean]`` pairs, where the mean
            skips the first element of each measurement sequence.

    Returns:
        The ordered list of genotypes, or the detailed pairs described above.
    """
    # frexp returns a mantissa of exactly 0.5 only for powers of two.
    if math.frexp(len(measurements))[0] != 0.5:
        print("rank_sum_n_sites received an input of length %s, which is not equal to the number of genotypes."
              "Quitting." % len(measurements))
        sys.exit()

    # Materialise the keys once: dict views cannot be indexed on Python 3,
    # and rebuilding the key list on every access was wasteful on Python 2.
    genotypes = list(measurements.keys())
    order = list(range(len(genotypes)))

    done = False
    while not done:
        done = True
        for i in range(len(order) - 1):
            left = measurements[genotypes[order[i]]]
            right = measurements[genotypes[order[i + 1]]]
            if ranksums(left, right)[0] < 0:
                order[i], order[i + 1] = order[i + 1], order[i]
                done = False

    output = [genotypes[index] for index in order]
    if not details:
        return output
    return [[genotype, np.mean(measurements[genotype][1:])] for genotype in output]
def computeRankSumZvalsPvals(errRates, lowIsBetter=True):
    """Pairwise rank-sum tests between the columns of ``errRates``.

    Each pair of columns is compared on the rows where neither value is NaN.
    Entry ``[i, j]`` (i < j) holds the statistic of column ``j`` tested
    against column ``i``; entry ``[j, i]`` holds its negation. P-values are
    symmetric, the z diagonal is 0 and the p diagonal is 1.

    Args:
        errRates: DataFrame with one column per classifier.
        lowIsBetter: accepted for interface compatibility; not used here.

    Returns:
        ``(zvals, pvals)`` DataFrames labelled with the column names of
        ``computeRanks(errRates, onlyFullRows=False)``.
    """
    ranks = computeRanks(errRates, onlyFullRows=False)
    classifierNames = ranks.columns.values

    # compute the ranked sums test p-value between different classifiers
    count = errRates.shape[1]
    zvals = np.empty((count, count))
    pvals = np.empty((count, count))
    np.fill_diagonal(zvals, 0)
    np.fill_diagonal(pvals, 1)

    for first in range(count):
        for second in range(first + 1, count):
            x = errRates.iloc[:, first]
            y = errRates.iloc[:, second]
            # compare using all datasets they have in common
            shared = ~(np.isnan(x) | np.isnan(y))
            z, p = ranksums(y[shared], x[shared])  # cols are indep var
            zvals[first, second] = z
            pvals[first, second] = p
            zvals[second, first] = -zvals[first, second]
            pvals[second, first] = pvals[first, second]

    zframe = pd.DataFrame(data=zvals, index=classifierNames, columns=classifierNames)
    pframe = pd.DataFrame(data=pvals, index=classifierNames, columns=classifierNames)
    return zframe, pframe
def calc_ranksum():
    """Print the rank-sum statistic for every pair of functions, per dataset and attribute.

    For each dataset in ``DATASETS``, each attribute in
    ('tteste', 'ttreinamento', 'precisao') and each pair of entries of
    ``FUNCOES``, the attribute values of the matching ``DADOS`` records are
    compared and one CSV line is printed:
    ``dataset,attribute,function1,function2,statistic``.
    """
    atributos = ['tteste', 'ttreinamento', 'precisao']
    for dataset in DATASETS:
        for atributo in atributos:
            for f1, f2 in combinations(FUNCOES, 2):
                d1 = np.array([i[atributo] for i in DADOS
                               if i['dataset_file'] == dataset and i['function_path'] == f1])
                d2 = np.array([i[atributo] for i in DADOS
                               if i['dataset_file'] == dataset and i['function_path'] == f2])
                # Single-argument print call: same output on Python 2 and 3.
                print(','.join([str(s) for s in [dataset, atributo, f1, f2, ranksums(d1, d2)[0]]]))
def rank_sums(features1, features2, **_):
    """
    Run the Wilcoxon rank-sum test on two samples.

    :param features1: first sample
    :param features2: second sample
    :param _: extra keyword arguments are accepted and ignored
    :return: the result of ``stats.ranksums(features1, features2)``
    """
    outcome = stats.ranksums(features1, features2)
    return outcome
def plot_histogram(histogram, html_writer, title='', max_pathway_length=8, xmin=None, xlim=20, error_bars=True, min_to_show=20, legend_loc='upper left'):
    """Plot one cumulative distribution curve per entry of ``histogram``.

    ``histogram`` maps a key to a list of values. Keys with fewer than
    ``min_to_show`` values are skipped. When ``error_bars`` is true, the
    standard deviation of the median is estimated from 1000 resamples (with
    replacement) and passed to ``plotting.cdf``. If the keys 'Not first' or
    'Inhibited' are present, rank-sum results against key 1 and key
    'No known regulation' respectively are printed.

    ``html_writer`` and ``max_pathway_length`` are not used in this function.

    Returns the pylab figure.
    """
    fig = pylab.figure()
    pylab.hold(True)

    reps = 1000  # number of bootstrap resamples per curve
    y_offset = 0
    offset_step = 0.007  # added to y_offset after every plotted curve
    # Line style per known key; unknown keys fall back to 'grey' below.
    colors = {1:'r', 2:'orange', 3:'green', 4:'cyan', 5:'blue', 'Rest':'violet', 'Not first':'k--', 'No known regulation':'grey', 'Activated':'green', 'Inhibited':'r', 'Mixed regulation':'blue'}
    for key, value in histogram.iteritems():
        if len(value) >= min_to_show:
            m = stats.cmedian(value)

            sample_std = None
            if error_bars:
                # Bootstrap: resample len(value) items with replacement,
                # record the median, repeat `reps` times.
                sample_vals = []
                i = 0
                while i < reps:
                    samples = []
                    while len(samples) < len(value):
                        samples.append(random.choice(value))
                    sample_vals.append(pylab.median(samples))
                    i += 1
                sample_std = pylab.std(sample_vals)

            plotting.cdf(value, label='%s (med=%.1f, N=%d)' % \
                         (key, m, len(value)), style=colors.get(key, 'grey'), std=sample_std, y_offset=y_offset)
            y_offset += offset_step

    # Default lower x limit mirrors the upper limit.
    xmin = -1 * xlim if xmin == None else xmin
    pylab.xlim(xmin, xlim)
    pylab.xlabel('Irreversability')
    #pylab.xlabel('deltaG')
    pylab.ylabel('Cumulative distribution')
    legendfont = matplotlib.font_manager.FontProperties(size=11)
    pylab.legend(loc=legend_loc, prop=legendfont)
    pylab.title(title)
    pylab.hold(False)

    # NOTE(review): these lookups assume key 1 / 'No known regulation' exist
    # whenever 'Not first' / 'Inhibited' do - confirm against callers.
    if 'Not first' in histogram:
        print '%s, first vs. non-first ranksum test: ' % title + '(%f, %f)' % stats.ranksums(histogram[1], histogram['Not first'])
    if 'Inhibited' in histogram:
        print '%s, inhibited vs. non-regulated ranksum test: ' % title + '(%f, %f)' % stats.ranksums(histogram['Inhibited'], histogram['No known regulation'])
    #for k1, h1 in histogram.iteritems():
    #    for k2, h2 in histogram.iteritems():
    #        print k1, k2, stats.ranksums(h1, h2)
    return fig
def caculation(data):
    """Summarise the treatment effect on column ``y`` between ``trt == 1`` and ``trt == 0``.

    Args:
        data: DataFrame with a ``trt`` column (0/1) and a numeric ``y`` column.

    Returns:
        ``[median difference, mean difference, number of rows, p-value]``
        where differences are treated minus control and the p-value comes
        from ``stats.ranksums(control, treated)``.
    """
    trt_1 = data[data['trt'] == 1]
    trt_0 = data[data['trt'] == 0]

    medi = statistics.median(trt_1['y']) - statistics.median(trt_0['y'])
    mean = statistics.mean(trt_1['y']) - statistics.mean(trt_0['y'])
    peop = len(trt_1) + len(trt_0)
    # The summed variance was computed but never used; it also made the
    # function fail for groups with a single observation, so it is dropped.
    z_stat, p_val = stats.ranksums(trt_0['y'], trt_1['y'])
    return [medi, mean, peop, p_val]
def BootstrapGenes(CNVTargets, ToSampleFrom, Nsamples, NGenes):
    '''
    (dict, dict, int, int) -> dict
    Bootstrap comparison of target counts between CNV and non-CNV genes.

    CNVTargets maps study -> gene -> record; element 2 of a record is the
    number of targets and the last element is 'CNV' or 'not_CNV'.
    ToSampleFrom maps study -> {'CNV': genes, 'not_CNV': genes}, each
    indexable by integer position.

    For every study, Nsamples replicates are drawn. Each replicate samples
    NGenes CNV genes and NGenes non-CNV genes with replacement and compares
    their target counts with a rank-sum test at the 0.05 level.

    Returns {study: [# replicates CNV > non-CNV,
                     # replicates CNV < non-CNV,
                     # replicates with no significant difference]}
    '''
    BootStrap = {study: [0, 0, 0] for study in ToSampleFrom}

    for study in ToSampleFrom:
        pools = ToSampleFrom[study]
        tally = BootStrap[study]
        replicates = Nsamples
        while replicates != 0:
            repCNVtargets, repNonCNVtargets = [], []
            for _ in range(NGenes):
                # One random position in each pool (CNV first, then non-CNV).
                j = random.randint(0, len(pools['CNV']) - 1)
                k = random.randint(0, len(pools['not_CNV']) - 1)
                gene1 = pools['CNV'][j]
                gene2 = pools['not_CNV'][k]
                assert CNVTargets[study][gene1][-1] == 'CNV', 'random gene should be CNV'
                assert CNVTargets[study][gene2][-1] == 'not_CNV', 'random gene should be non-CNV'
                repCNVtargets.append(CNVTargets[study][gene1][2])
                repNonCNVtargets.append(CNVTargets[study][gene2][2])

            # make sure that the correct numbers of genes is drawn
            assert len(repCNVtargets) == NGenes, 'number of CNV genes is not correct'
            assert len(repNonCNVtargets) == NGenes, 'number of non-CNV genes is not correct'

            Pval = stats.ranksums(repCNVtargets, repNonCNVtargets)[1]
            if Pval >= 0.05:
                # no significant difference
                tally[2] += 1
            elif Pval < 0.05:
                # significant: record the direction given by the means
                cnv_mean = np.mean(repCNVtargets)
                non_cnv_mean = np.mean(repNonCNVtargets)
                if cnv_mean > non_cnv_mean:
                    tally[0] += 1
                elif cnv_mean < non_cnv_mean:
                    tally[1] += 1
                assert np.mean(repCNVtargets) != np.mean(repNonCNVtargets), 'means are equal but significantly different'

            replicates -= 1

    return BootStrap
def rank_sums_test(treatment1, treatment2):
    """
    See if the distribution of treatment1 is different than the distribution
    of treatment2, using the Wilcoxon rank-sum test. The p-value is printed.

    Arguments:
    - `treatment1`: first sample
    - `treatment2`: second sample

    Returns:
    The ``(z_stat, p_val)`` pair from ``stats.ranksums``.
    """
    z_stat, p_val = stats.ranksums(treatment1, treatment2)
    # Formatted into one string so the output is the same on Python 2 and 3.
    print("Mann-Whitney-Wilcoxon RankSum P for treatments 1 and 2 = %s" % p_val)
    return z_stat, p_val
def rstest_mw(x, y):
    '''rank sum test p value of x > y.

    The direction is taken from the sign of the ``ranksums`` statistic and
    the p-value from ``mannwhitneyu``: ``p`` when the statistic is positive,
    ``1 - p`` otherwise. Returns 0.5 when ``mannwhitneyu`` raises.

    Not used because of some bug in mannwhitneyu.
    NOTE(review): ``p`` / ``1 - p`` only forms a one-sided p-value if
    ``mannwhitneyu`` itself returns a one-sided value - confirm against the
    installed scipy version's default ``alternative``.
    '''
    from scipy.stats import ranksums, mannwhitneyu
    #n1, n2 = len(x), len(y)
    #mu = n1 * n2 / 2.
    st1, p1 = ranksums(x, y)
    try:
        st, p = mannwhitneyu(x, y)
    except Exception:
        # Best-effort fallback kept from the original, but no longer
        # swallowing KeyboardInterrupt / SystemExit as a bare except did.
        return 0.5
    if st1 > 0:
        return p
    else:
        return 1 - p
def statistics_test(data, labels):
    """Per-column rank-sum test between the two label groups.

    Rows whose label is below 1e-5 form the first group and rows whose label
    is above 1 - 1e-5 form the second. Each p-value is multiplied by the
    number of columns (Bonferroni-style scaling, not clipped at 1).

    Args:
        data: DataFrame with one column per variable.
        labels: pandas object holding one label per row of ``data``.

    Returns:
        DataFrame indexed by the column names of ``data`` with a single
        ``pvalue`` column.
    """
    # ``.values`` replaces ``as_matrix()``, which current pandas no longer provides.
    d = data.values
    y = labels.values
    y = y.reshape([y.shape[0]])

    in1 = np.where(y < 1e-5)[0]
    in2 = np.where(y > 1 - (1e-5))[0]

    pvalues = []
    for i in range(d.shape[1]):
        s, p = stt.ranksums(d[in1, i], d[in2, i])
        pvalues.append(p)

    df = pd.DataFrame(data=np.array(pvalues) * len(pvalues),
                      index=data.columns.values, columns=['pvalue'])
    return df
def Wilcoxon(ipdc, ipdt):
    """Bootstrap rank-sum p-values between a control and a timepoint sample.

    Both inputs are resampled with replacement to the size of the smaller
    one, 100 times; each resampled pair is compared with ``stats.ranksums``.

    Args:
        ipdc: control values.
        ipdt: timepoint values.

    Returns:
        list of the 100 p-values.
    """
    sample = min(len(ipdc), len(ipdt))

    def _resampled_p_value():
        # The timepoint draw comes first, then the control draw.
        sampled_timepoint = numpy.random.choice(ipdt, sample, replace=True)
        sampled_control = numpy.random.choice(ipdc, sample, replace=True)
        return stats.ranksums(sampled_control, sampled_timepoint)[1]

    return [_resampled_p_value() for _ in range(100)]
def series_mean_ranksums(word_set1, word_set2, words_time_series, one_minus=True):
    """Rank-sum test between the per-word means of two word sets.

    Args:
        word_set1: first collection of words.
        word_set2: second collection of words.
        words_time_series: data handed to ``get_word_means`` together with each set.
        one_minus: when true, every mean ``m`` is replaced by ``1 - m`` before testing.

    Returns:
        dict with the statistic ``z``, p-value ``p``, the two set sizes and
        the two medians.
    """
    # list(...) is required on Python 3, where dict.values() is a view that
    # numpy would otherwise wrap as a single 0-d object.
    word_means1 = np.array(list(get_word_means(words_time_series, word_set1).values()))
    word_means2 = np.array(list(get_word_means(words_time_series, word_set2).values()))
    if one_minus:
        word_means1 = 1 - word_means1
        word_means2 = 1 - word_means2
    z, p = ranksums(word_means1, word_means2)
    return {"z" : z, "p" : p,
            "set1_size" : len(word_means1), "set2_size" : len(word_means2),
            "set1_med" : np.median(word_means1), "set2_med" : np.median(word_means2)}
def outcomeBoxplot(cyDf, cyVar, outcomeVar, printP=True, axh=None):
    """Box plot with overlaid strip plot of ``cyVar`` split by a 0/1 outcome.

    Args:
        cyDf: DataFrame holding both columns.
        cyVar: name of the column plotted on the y axis.
        outcomeVar: name of the 0/1 column used for the x axis.
        printP: when true, annotate the plot with the rank-sum p-value
            between the outcome == 1 and outcome == 0 groups (rows with
            missing values in either column are dropped first).
        axh: axes to draw on; defaults to the current axes. It is cleared first.
    """
    if axh is None:
        axh = plt.gca()
    axh.cla()

    sns.boxplot(y=cyVar, x=outcomeVar, data=cyDf, ax=axh, order=[0, 1])
    sns.stripplot(y=cyVar, x=outcomeVar, data=cyDf, jitter=True, ax=axh, order=[0, 1])
    plt.xticks([0, 1], ['False', 'True'])

    if printP:
        complete = cyDf[[cyVar, outcomeVar]].dropna()
        positives = complete[cyVar].loc[complete[outcomeVar] == 1]
        negatives = complete[cyVar].loc[complete[outcomeVar] == 0]
        z, pvalue = stats.ranksums(positives, negatives)

        annParams = {
            'textcoords': 'offset points',
            'xytext': (0, -5),
            'ha': 'center',
            'va': 'top',
            'color': 'black',
            'weight': 'bold',
            'size': 'medium',
        }
        # Centred between the two boxes, at the top of the current y range.
        plt.annotate('p = %1.3g' % pvalue, xy=(0.5, plt.ylim()[1]), **annParams)
    plt.show()
def main():
    """Print rank-sum and one-way ANOVA p-values for two fixed samples.

    The samples below are the "2nd phase" values. For reference, the
    "1st phase" values were:
    top1 = [70.0, 71.1, 72.5, 70.8, 68.1, 71.9, 71.1, 71.3, 68.4, 70.2]
    top3 = [75.8, 78.4, 77.8, 77.7, 80.0, 77.8, 78.7, 76.4, 79.1, 77.3]
    """
    x = [53.6, 54.5, 53.7, 52.7, 53.1, 55.5, 55.5, 52.8, 53.7, 52.7]
    y = [89.7, 89.1, 89.5, 88.7, 89.4, 88.6, 89.8, 89.5, 89.2, 89.7]

    # Compute the Wilcoxon rank-sum statistic for two samples.
    wilcoxon = stats.ranksums(x, y)
    anova = stats.f_oneway(x, y)

    # Single-argument print call: same output on Python 2 and 3.
    print("Wilcoxon: " + str(wilcoxon[1]) + "; ANOVA: " + str(anova[1]))
def ranksum_test_two_sample(sample_a, sample_b):
    '''
    Rank-sum test on two array_like samples, reporting both one-tailed p-values.

    Returns (z_statistic, pvalue_left, pvalue_right). When the statistic is
    negative the left-tail p-value is half the two-tailed one and the
    right-tail p-value is its complement; otherwise the two are exchanged.
    '''
    z_statistic, two_tailed_pvalue = stats.ranksums(sample_a, sample_b)
    z_statistic = z_statistic.tolist()

    pvalue_left = two_tailed_pvalue / 2
    pvalue_right = 1 - two_tailed_pvalue / 2
    if not z_statistic < 0:
        pvalue_left, pvalue_right = pvalue_right, pvalue_left
    return z_statistic, pvalue_left, pvalue_right
def wilcox_test(x, y):
    '''
    Run the Wilcoxon rank-sum test and return a strictly positive p-value.

    @param x: collection of numerics.
    @param y: collection of numerics.
    @return: the test p-value; 1.0 if the test produced NaN, and the smallest
    positive normal float64 if the test produced exactly 0.0.
    '''
    pval = ranksums(x, y)[-1].astype('float64')
    if numpy.isnan(pval):
        # an undefined result is reported as a poor p-value
        return 1.0
    if pval == 0.0:
        # keep the result non-zero
        return numpy.finfo(numpy.float64).tiny.astype('float')
    return pval
def violin_from_dict(ann_violin, dict_list, category_label, prefix, taskid, colormap=None, figsize=(1.2, 1.2), lfc=False):
    """Save violin plots of log2 expression per gene, grouped by ``category_label``.

    Works on a copy of ``ann_violin`` whose ``X`` is replaced by ``raw.X`` and
    normalised per cell. ``dict_list`` maps a group name to a list of genes.
    For every gene found in ``var_names`` a single-gene PDF
    ``<prefix><group>-<gene>-<taskid>-.pdf`` is written; groups with more than
    one gene additionally get a combined PDF ``<prefix><group>-<taskid>-.pdf``
    with one panel per gene. Genes that are not found are reported and skipped.

    The annotation value is the rank-sum p-value between the first category
    and all other cells, or, when ``lfc`` is true, the difference of
    log2(mean + 1) between the other cells and the first category.

    Args:
        ann_violin: annotated data object (copied, not modified).
        dict_list: mapping group name -> list of gene names.
        category_label: name of the categorical ``obs`` column used for the x axis.
        prefix: path prefix for all output files.
        taskid: string appended to the file names.
        colormap: palette passed to seaborn.
        figsize: size of a single-gene figure; the combined figure scales the width.
        lfc: annotate with the log fold change instead of the p-value.
    """
    from scipy.stats import ranksums
    ann_violin = ann_violin.copy()
    ann_violin.X = ann_violin.raw.X
    sc.pp.normalize_per_cell(ann_violin, copy=False, counts_per_cell_after=15000)

    for t, _genes in dict_list.items():
        multi_panel = len(_genes) > 1
        if multi_panel:
            fig_all, ax_sub = plt.subplots(1, len(_genes), figsize=((figsize[0] * len(_genes)), figsize[1] + 0.3))

        for _, g in enumerate(_genes):
            if not g in ann_violin.var_names:
                print(g, 'not found')
                continue

            # Extract the gene's expression once and derive both columns from it.
            expression = ann_violin.X[:, ann_violin.var_names == g].A.reshape(-1)
            ann_violin.obs['exp'] = expression
            ann_violin.obs['l2fc'] = np.log2(expression + 1)
            ann_violin.obs[g] = ann_violin.obs['l2fc']

            # Figure properties
            fig, ax = plt.subplots(figsize=figsize)

            # Violin Plot
            # Cells of the first category vs. everything else.
            mask = ann_violin.obs[category_label] == ann_violin.obs[
                category_label].cat.categories[0]
            if lfc:
                rep = np.log2(ann_violin.obs['exp'][~mask].mean() + 1) - np.log2(ann_violin.obs['exp'][mask].mean() + 1)
            else:
                stat, pval = ranksums(ann_violin.obs['l2fc'][mask], ann_violin.obs['l2fc'][~mask])
                rep = pval

            from statannot import add_stat_annotation
            if ann_violin.obs[category_label].dtype.name == 'category':
                ann_violin.obs[category_label] = ann_violin.obs[
                    category_label].cat.remove_unused_categories()

            # Draw on the single-gene axes and, if present, the combined panel.
            axs = [ax]
            if multi_panel:
                axs.append(ax_sub[_])
            for _ax in axs:
                sns.violinplot(data=ann_violin.obs, palette=colormap, y=g, x=category_label,
                               linewidth=1, ax=_ax)
                if lfc:
                    add_stat_annotation(
                        _ax, data=ann_violin.obs, y=g, x=category_label,
                        box_pairs=[(ann_violin.obs[category_label].unique())],
                        perform_stat_test=False, pvalues=[rep], text_format='custom',
                        line_offset_to_box=0.2, line_offset=0.1, line_height=0.05,
                        linewidth=0.6, text_offset=0.5)
                else:
                    add_stat_annotation(
                        _ax, data=ann_violin.obs, y=g, x=category_label,
                        box_pairs=[(ann_violin.obs[category_label].unique())],
                        perform_stat_test=False, pvalues=[rep],
                        line_offset_to_box=0.2, line_offset=0.1, line_height=0.05,
                        linewidth=0.6, text_offset=0.5)
                _ax.set_xlabel('')
                _ax.set_title(g)
                _ax.set_ylabel('')

            ax.set_ylabel(r'$ \log_{2}( expression) $')
            if _ == 0 and multi_panel:
                ax_sub[_].set_ylabel(r'$ \log_{2}( expression) $', fontsize=7)
            if multi_panel:
                ax_sub[_].tick_params(axis='y', pad=-3)

            path = prefix + t + '-' + g + '-' + taskid + '-' + ".pdf"
            fig.savefig(path, dpi=300, bbox_inches='tight')
            plt.close(fig)

        # The combined figure only exists for multi-gene groups; without this
        # guard a single-gene group hit an unbound (or stale) ``fig_all``.
        if multi_panel:
            fig_all.tight_layout(pad=0.3)
            fig_all.savefig(prefix + t + '-' + taskid + '-' + ".pdf", bbox_inches='tight')
        plt.close('all')
def batch_stats_extended(marker_exp, c_list, coi):
    """Applies t test , wilcoxon test, and likelihood ratio test (Based on
    logistic regression) to a gene expression matrix, gene by gene. Also gives
    simple up versus down regulation test (difference between means).

    :param marker_exp: A DataFrame whose rows are cell identifiers,
        columns are gene identifiers, and values are float values
        representing gene expression.
    :param c_list: A Series whose indices are cell identifiers, and
        whose values are the cluster which that cell is part of.
    :param coi: The cluster of interest.

    :returns: A DataFrame with one row per gene and the columns
        'gene_1', 't_stat', 't_pval', 'w_stat', 'w_pval', 'up_down'
        ('up' when the in-cluster mean exceeds the out-of-cluster mean,
        'down' otherwise) and 'LRT_pval'.

    :rtype: pandas.DataFrame

    """

    def LRT_LogReg(df):
        # Likelihood ratio test of a one-gene logistic regression against an
        # intercept-only model; returns the chi-square (1 dof) p-value.
        # Define model matrix and response
        X = np.matrix(df.drop('cluster', axis=1))
        y = df['cluster']
        # Train logistic regression with full model
        logreg1 = LogisticRegression(solver='lbfgs').fit(X, y)
        ll1 = -log_loss(y, logreg1.predict_proba(X), normalize=False)
        # Train logistic regression with null model (only intercept)
        # NOTE(review): the null model is fitted on an all-zero feature but
        # scored on X - confirm this is equivalent to an intercept-only fit.
        logreg0 = LogisticRegression(solver='lbfgs').fit([[0]] * len(X), y)
        ll0 = -log_loss(y, logreg0.predict_proba(X), normalize=False)
        # Likelihood ratio test
        stat = 2 * (ll1 - ll0)
        pval = ss.chi2.sf(stat, 1)
        return (pval)

    LRT_pvals = []
    up_v_down_vals = []
    for column in marker_exp:
        # One-column frame for this gene plus a 0/1 'cluster' response.
        log_reg_in = pd.DataFrame(data=[marker_exp[column]])
        log_reg_in = np.transpose(log_reg_in)
        c_list_2 = np.array(c_list)
        c_list_2 = np.array(c_list_2 == coi, dtype=int)
        c_list_2 = np.transpose(c_list_2)
        log_reg_in['cluster'] = c_list_2

        # Direction of regulation from the difference of group means.
        in_cls = marker_exp[column][c_list == coi].values
        out_cls = marker_exp[column][c_list != coi].values
        out_cls_mean = np.sum(out_cls) / len(out_cls)
        in_cls_mean = np.sum(in_cls) / len(in_cls)
        test = in_cls_mean - out_cls_mean
        if test <= 0:
            up_v_down_vals.append('down')
        else:
            up_v_down_vals.append('up')

        LRT_pval = LRT_LogReg(log_reg_in)
        LRT_pvals.append(LRT_pval)

    # Welch t-test and rank-sum test, in-cluster vs. out-of-cluster, per gene.
    t = marker_exp.apply(lambda col: ss.ttest_ind(
        col[c_list == coi], col[c_list != coi], equal_var=False))
    ws = marker_exp.apply(
        lambda col: ss.ranksums(col[c_list == coi], col[c_list != coi]))

    output = pd.DataFrame()
    output['gene_1'] = t.index
    #output['gene_1'] = ws.index
    output[['t_stat', 't_pval']] = pd.DataFrame(t.values.tolist(),
                                                columns=['t_stat', 't_pval'])
    output[['w_stat', 'w_pval']] = pd.DataFrame(ws.values.tolist(),
                                                columns=['w_stat', 'w_pval'])
    output['up_down'] = up_v_down_vals
    output['LRT_pval'] = LRT_pvals
    return output
for (j, k), l in grouped: bar_num = sorted(list_of_genotypes).index(j) index_num = sorted(list_of_treatments).index(k) p = plt.bar(index_num + (bar_width * bar_num), means[j, k], bar_width, alpha=opacity[index_num], color=colourlist[bar_num], yerr=sems[j, k], error_kw=error_config, label=[j, k]) #Mann-Whitney test = st.ranksums z_stat_gt, p_val_gt = st.ranksums( df[(df[groupinglist[0]] == j) & (df[groupinglist[1]] == k)][i], df[(df[groupinglist[0]] == CONTROL_GENOTYPE) & (df[groupinglist[1]] == k)][i]) z_stat_tr, p_val_tr = st.ranksums( df[(df[groupinglist[0]] == j) & (df[groupinglist[1]] == k)][i], df[(df[groupinglist[0]] == j) & (df[groupinglist[1]] == CONTROL_TREATMENT)][i]) p_vals_gt_rounded = ['%.4f' % elem for elem in p_vals_gt] p_vals_tr_rounded = ['%.4f' % elem for elem in p_vals_tr] q = plt.text( index_num + (bar_width * (bar_num + 0.5)), #centre of bar means[j, k] + 0.5 * sems[j, k] + 0.1 * means.values.max(), #just above error bar 'p(genotype) = ' + str(p_val_gt.round(4)) + '\np(treatment) = ' + str(p_val_tr.round(4)) + '\nn = ' + str(ns[j, k]),
# %% significance = np.zeros( ( cumulative_weighted_assessment_score.shape[1], unstacked_cumulative_weighted_assessment_score.shape[1], ), dtype=bool, ) for offset, date in enumerate(cumulative_weighted_assessment_score): df = unstacked_cumulative_weighted_assessment_score[date] columns = df.columns offset *= len(columns) for i, j in combinations(range(len(columns)), r=2): significance[i, j + offset] = significance[j, i + offset] = (ranksums( df.iloc[:, i], df.iloc[:, j], nan_policy="omit").pvalue <= 0.05) significance_df = (pd.DataFrame( significance, index=unstacked_cumulative_weighted_assessment_score.columns.levels[1], columns=unstacked_cumulative_weighted_assessment_score.T.index, ).rename_axis("").rename_axis(["", "H0 rejected"], axis=1)) display(significance_df) # %% [markdown] tags=[] # As in the work of Hlosta et al. the hypothesis test indicates a # significant difference between the groups in the time slices, starting from the # first time slice (the assessment from day 33). # # We also note that the test did not reject the null hypothesis for the (Withdrawn-Fail)
salineDataObjs[1], 'k') ax.tick_params(axis='both', which='major', labelsize=labelFontSize) ax.tick_params(axis='both', which='minor', labelsize=labelFontSize) title('sal-{}'.format([key for key, value in salineSoundTypes.iteritems()][1])) behavioranalysis.nice_psycurve_settings(ax, fontsize=10, lineweight=2) # figtext(0.075, 0.7, 'Fraction of trials going to the right', rotation='vertical') # figtext(0.4, 0.05, 'Log2(frequency) - octaves') plt.subplots_adjust(wspace=0.25, hspace=0.25) show() suptitle(animal) #We should be using nonparametric stats from scipy import stats print(stats.ranksums(salineAmpEstimates[:, 1], musAmpEstimates[:, 1])) print(stats.ranksums(salineChordEstimates[:, 1], musChordEstimates[:, 1])) sa = salineAmpEstimates[:, 1] ma = musAmpEstimates[:, 1] sc = salineChordEstimates[:, 1] mc = musChordEstimates[:, 1] figure() subplot(121) plot(zeros((len(sa), 1)), 1 / (4. * sa), 'ko') plot(ones((len(ma), 1)), 1 / (4. * ma), 'ro') xlim([-1, 2])
def _count_table(checks, high, low):
    """Tabulate, for every column in *checks*, how often each value occurs
    in the *high* and *low* frames (one row per distinct value)."""
    frames = [
        pd.DataFrame(
            dict(high=high[col].value_counts(), low=low[col].value_counts()))
        for col in checks
    ]
    if not frames:
        return pd.DataFrame(columns=['high', 'low'])
    # A value missing from one of the groups shows up as NaN -> count of 0.
    return pd.concat(frames).fillna(0)


def _fisher_pvalues(table, n_high, n_low):
    """Return one Fisher exact p-value per row of a high/low count table."""
    return [
        fisher_exact(([n_low - low, low], [n_high - high, high]))[1]
        for high, low in table[['high', 'low']].values
    ]


def _sample_group(frame, min_rows=20):
    """Randomly subsample *frame*, keeping at least *min_rows* rows; frames
    that are not larger than *min_rows* are returned as a copy."""
    n_rows = frame.shape[0]
    if n_rows > min_rows:
        return frame.sample(np.random.randint(min_rows, n_rows))
    return frame.copy()


def _gzip_and_remove(csv_path):
    """Compress *csv_path* to ``csv_path + '.gz'`` and delete the original."""
    with open(csv_path, 'rb') as f_in:
        with gzip.open(csv_path + ".gz", 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.remove(csv_path)


def rand_samp(checks, file_name, gene_high, gene_mid, gene_low,
              verheek_merged, verheek_mm):
    """Randomly subsample the expression groups until they are balanced.

    First writes ``<file_name>_pre_balanced.csv`` with the value counts of
    every column in *checks* for the high and low groups, plus a Fisher exact
    p-value per value.  Then repeatedly draws random subsamples of the three
    groups (at most 10000 times) until no Fisher p-value is <= 0.05 and the
    rank-sum test on the ``os`` column of the high vs. low subsample gives
    p <= 0.05.  The rows of the final subsample are exported from
    *verheek_merged* (``_balanced_unscaled.csv.gz``) and *verheek_mm*
    (``_balanced.csv.gz``).

    Note: if the iteration limit is reached, the last subsample drawn is
    exported even though it did not satisfy the stopping criterion.

    Parameters:
        checks: column names whose value distribution must be balanced.
        file_name: path prefix for every output file.
        gene_high, gene_mid, gene_low: DataFrames of the three groups; the
            high and low frames need the *checks* columns and an ``os``
            column.
        verheek_merged, verheek_mm: DataFrames indexed like the groups, from
            which the selected rows are exported.
    """
    table = _count_table(checks, gene_high, gene_low)
    table['p'] = _fisher_pvalues(table, gene_high.shape[0],
                                 gene_low.shape[0])
    table.to_csv(file_name + "_pre_balanced.csv")

    # random sampling until fisher is not significant or 10000 iterations
    c = 0
    sig2 = 1
    while c < 10000 and sig2 > 0.05:
        c = c + 1
        print('Iteration number = {}'.format(str(c)))
        gene_high_samp = _sample_group(gene_high)
        gene_mid_samp = _sample_group(gene_mid)
        # Each group is gated on its OWN size (the low group used to be
        # gated on the size of the mid group).
        gene_low_samp = _sample_group(gene_low)

        table = _count_table(checks, gene_high_samp, gene_low_samp)
        p = _fisher_pvalues(table, gene_high_samp.shape[0],
                            gene_low_samp.shape[0])
        table['p'] = p
        sig = min(p)
        print("Lowest fisher significance = {:.8f}".format(sig))
        if sig > 0.05:
            # Only a balanced draw is worth testing for a survival difference.
            sig2 = ranksums(gene_high_samp["os"], gene_low_samp["os"])[1]
            print("Survival significance = {:.3f}".format(sig2))

    a = (gene_high_samp.index.values.tolist() +
         gene_mid_samp.index.values.tolist() +
         gene_low_samp.index.values.tolist())

    print("exporting balanced unscaled to csv...")
    verheek_merged.loc[a].to_csv(file_name + "_balanced_unscaled.csv")
    print("gzipping balanced unscaled csv...")
    _gzip_and_remove(file_name + "_balanced_unscaled.csv")

    print("exporting balanced scaled to csv...")
    verheek_mm.loc[a].to_csv(file_name + "_balanced.csv")
    print("gzipping balanced scaled csv...")
    _gzip_and_remove(file_name + "_balanced.csv")
if miranda[species][gene][-1] == 'CNV': k += 1 elif miranda[species][gene][-1] == 'not_CNV': l += 1 # check that numbers match assert i == k, 'CNV genes should match between miranda and targetscan' assert j == l, 'non-CNV genes should match between miranda and targetscan' # populate dict GeneNumbers[species] = [i, j] print('counted CNV and non-CNV genes for each species') # perform stattistical tests between CNV and non-CNV genes # create dicts to store results {species: [P-value targetscan, P-value mirnada]} CompTargets = {} for species in SpeciesDataTargetscan: Ptargetscan = stats.ranksums(SpeciesDataTargetscan[species][0], SpeciesDataTargetscan[species][1])[1] Pmiranda = stats.ranksums(SpeciesDataMiranda[species][0], SpeciesDataMiranda[species][1])[1] CompTargets[species] = [Ptargetscan, Pmiranda] print('compared CNV and non-CNV genes') # get the significance level Significance = {} for species in CompTargets: Significance[species] = [] for i in range(len(CompTargets[species])): if CompTargets[species][i] >= 0.05: Significance[species].append('') elif CompTargets[species][i] < 0.05 and CompTargets[species][i] >= 0.01: Significance[species].append('*') elif CompTargets[species][i] < 0.01 and CompTargets[species][ i] >= 0.001:
# Summarise the whitespace-separated numeric columns of a results file and
# compare selected columns with several statistical tests.
#
# Usage: script.py [input_file]   (defaults to run2.csv)
from scipy.stats import ranksums
from scipy.stats import mannwhitneyu
from scipy.stats import normaltest
from scipy.stats import ttest_rel
from scipy.stats import wilcoxon
import matplotlib.pyplot as plot
import sys

input_filename = "run2.csv"
if len(sys.argv) > 1:
    input_filename = sys.argv[1]

# results[i] collects every value read from column i, in file order.
results = []
with open(input_filename, "r") as f:
    for line in f:
        cols = [float(x) for x in line.split()]
        for i, col in enumerate(cols):
            try:
                results[i].append(col)
            except IndexError:
                # First time this column index is seen: start its list.
                results.append([col])

# Column index followed by the column mean.
for i, result in enumerate(results):
    print("%s %s" % (i, sum(result) / len(result)))

# Each line is formatted as one string so the output is the same whether the
# script runs under Python 2 or Python 3.
print("---------- RESULTS ----------")
print("%s %s" % ("Normal Test", normaltest(results[1])))
print("%s %s" % ("Mann-Whitney", mannwhitneyu(results[1], results[3])))
print("%s %s" % ("Wilcoxon", wilcoxon(results[1], results[3])))
print("%s %s" % ("T-Test", ttest_rel(results[1], results[3])))
print("%s %s" % ("Rank Sums", ranksums(results[1], results[9])))
'worst_concavity', 'worst_concave', 'worst_symmetry', 'worst_fractal_dimension' ] df = pd.read_csv(endereco_dos_dados, header=None) df.columns = nomes_das_variaveis del endereco_dos_dados, nomes_das_variaveis """---------------------------------------------------------------------------- Verificando a base de dados """ # Top 5 dos dados df.head() # Tipagem dos dados df.ftypes # Descrição dos dados df.describe() """---------------------------------------------------------------------------- Aplicando o teste """ # Separando os dados diagnostico_m = df.query("diagnosis == 'M'") diagnostico_b = df.query("diagnosis == 'B'") # Aplicando o teste estatistica, p = ranksums(diagnostico_m["mean_radius"], diagnostico_b["mean_radius"]) print("p-valor: {}".format(round(p, 2))) print("estatistica: {}".format(round(estatistica, 2)))
def plot(handle, out, cols, names, bins, title, xlab, ylab, xlog, ylog, \ vmax, vmin, vMinSum, collapse, normed, alpha, legendLoc, colors,\ verbose, dlimit=1): """ """ if verbose: sys.stderr.write("Parsing data...\n") x = [[] for i in range(len(cols))] for l in handle: try: ldata = l[:-1].split('\t') vals = [] for col in cols: if col >= len(ldata) or not ldata[col]: continue v = float(ldata[col]) if vmin < v < vmax: vals.append(v) #skip entire line if one value out of bounds # or if sum of values below threshold if len(vals) != len(cols) or sum(vals) < vMinSum: continue for i, v in enumerate(vals): x[i].append(v) except: sys.stderr.write("[Error] Cannot parse line: %s\n" % ",".join(l.split('\t'))) if verbose: sys.stderr.write(" %s values loaded.\n" % len(x)) fig = plt.figure() #http://matplotlib.org/users/customizing.html #mpl.rcParams['figure.subplot.wspace'] = 0.3 '''mpl.rcParams['figure.subplot.hspace'] = 0.5 mpl.rcParams['axes.titlesize'] = 8 mpl.rcParams['axes.labelsize'] = 6 mpl.rcParams['xtick.labelsize'] = 5 mpl.rcParams['ytick.labelsize'] = 5''' #add subplots plt.rc('axes', color_cycle=colors) #['c', 'm', 'y', 'k'] #plot x, y = x # get correlation print "%s points\n mean X: %s +/- %s\n mean Y: %s +/- %s" % ( len(x), np.mean(x), np.std(x), np.mean(y), np.std(y)) print "Pearson: r=%s p=%s" % stats.pearsonr(x, y) print "Spearman: r=%s p=%s" % stats.spearmanr(x, y) print "Wilcoxon: T=%s p=%s" % ranksums(x, y) # wilcoxon(x, y) pairwise_wilcoxon(x, y) ax = fig.add_subplot(111) ax.plot(x, y, 'b.', alpha=0.5) #add title ax.set_title(title) #add subplots labels ax.set_xlabel(xlab) #, fontsize=30) ax.set_ylabel(ylab) #, fontsize=30) #plot legend only if collapsed if xlog: ax.set_xscale('log') if ylog: ax.set_yscale('log', nonposy='clip') ax.grid(True) #save or show if type(out) is file and out.name == '<stdout>': plt.show() else: fpath = out #handle.name+".png" fformat = fpath.split('.')[-1] plt.savefig(fpath, dpi=300, format=fformat, orientation='landscape', 
transparent=False) sys.stderr.write("Figure written to: %s\n" % fpath)
def plot_update_trend(self):
    """Refresh the trend, bound and patch sources and the statistics labels.

    When a y-axis variable is selected, the (selected) points of every group
    in GROUP_LABELS are collapsed per date, a moving average is computed and
    the percentile bounds are pushed to the ``time_trend_*``,
    ``time_bound_*`` and ``time_patch_*`` sources.  The histogram axis label
    and the normality / t-test / rank-sum p-value texts are updated as well.
    Without a y-axis variable, the derived sources are cleared and the
    p-value texts are reset.  The histograms are updated in both cases.
    """
    if self.y_axis.value:
        # Collect the points to analyse; an empty selection means "all".
        group = {n: {'x': [], 'y': []} for n in GROUP_LABELS}
        for n in GROUP_LABELS:
            source = getattr(self.sources, 'time_%s' % n)
            # A set makes the membership test constant-time per point.
            chosen = set(source.selected.indices or ())
            keep = [i for i in range(len(source.data['x']))
                    if not chosen or i in chosen]
            for v in ['x', 'y']:
                group[n][v] = [source.data[v][i] for i in keep]

        # Fall back to defaults when the widgets do not hold a number.
        try:
            avg_len = int(self.look_back_distance.value)
        except (ValueError, TypeError):
            avg_len = 1

        try:
            percentile = float(self.plot_percentile.value)
        except (ValueError, TypeError):
            percentile = 90.

        # average daily data and keep track of points per day, calculate moving average
        group_collapsed = {n: [] for n in GROUP_LABELS}
        for n in GROUP_LABELS:
            if group[n]['x']:
                group_collapsed[n] = collapse_into_single_dates(group[n]['x'], group[n]['y'])
                if self.look_back_units.value == "Dates with a Sim":
                    x_trend, moving_avgs = moving_avg(group_collapsed[n], avg_len)
                else:
                    x_trend, moving_avgs = moving_avg_by_calendar_day(group_collapsed[n], avg_len)

                # Bounds are symmetric percentiles around the median.
                y_np = np.array(group[n]['y'])
                upper_bound = float(np.percentile(y_np, 50. + percentile / 2.))
                average = float(np.percentile(y_np, 50))
                lower_bound = float(np.percentile(y_np, 50. - percentile / 2.))

                xs = group[n]['x']
                getattr(self.sources, 'time_trend_%s' % n).data = {'x': x_trend,
                                                                   'y': moving_avgs,
                                                                   'mrn': ['Avg'] * len(x_trend)}
                getattr(self.sources, 'time_bound_%s' % n).data = {'x': xs,
                                                                   'mrn': ['Bound'] * len(xs),
                                                                   'upper': [upper_bound] * len(xs),
                                                                   'avg': [average] * len(xs),
                                                                   'lower': [lower_bound] * len(xs)}
                getattr(self.sources, 'time_patch_%s' % n).data = {'x': [xs[0], xs[-1], xs[-1], xs[0]],
                                                                   'y': [upper_bound, upper_bound,
                                                                         lower_bound, lower_bound]}
            else:
                for v in ['trend', 'bound', 'patch']:
                    clear_source_data(self.sources, 'time_%s_%s' % (v, n))

        # Histogram x-axis label follows the selected variable.
        x_var = str(self.y_axis.value)
        if x_var.startswith('DVH Endpoint'):
            self.histograms.xaxis.axis_label = x_var.split("DVH Endpoint: ")[1]
        elif x_var == 'EUD':
            self.histograms.xaxis.axis_label = "%s (Gy)" % x_var
        elif x_var == 'NTCP/TCP':
            self.histograms.xaxis.axis_label = "NTCP or TCP"
        else:
            if self.range_categories[x_var]['units']:
                self.histograms.xaxis.axis_label = "%s (%s)" % (x_var, self.range_categories[x_var]['units'])
            else:
                self.histograms.xaxis.axis_label = x_var

        # Normal Test
        p = {n: '' for n in GROUP_LABELS}
        for n in GROUP_LABELS:
            if group[n]['y']:
                p[n] = "%0.3f" % normaltest(group[n]['y'])[1]

        # t-Test and Rank Sums
        pt, pr = '', ''
        if group['1']['y'] and group['2']['y']:
            pt = "%0.3f" % ttest_ind(group['1']['y'], group['2']['y'])[1]
            pr = "%0.3f" % ranksums(group['1']['y'], group['2']['y'])[1]

        self.histogram_normaltest_1_text.text = "Group 1 Normal Test p-value = %s" % p['1']
        self.histogram_normaltest_2_text.text = "Group 2 Normal Test p-value = %s" % p['2']
        self.histogram_ttest_text.text = "Two Sample t-Test (Group 1 vs 2) p-value = %s" % pt
        self.histogram_ranksums_text.text = "Wilcoxon rank-sum (Group 1 vs 2) p-value = %s" % pr

    else:
        for n in GROUP_LABELS:
            for k in ['trend', 'bound', 'patch']:
                clear_source_data(self.sources, "time_%s_%s" % (k, n))
        self.histogram_normaltest_1_text.text = "Group 1 Normal Test p-value = "
        self.histogram_normaltest_2_text.text = "Group 2 Normal Test p-value = "
        self.histogram_ttest_text.text = "Two Sample t-Test (Group 1 vs 2) p-value = "
        self.histogram_ranksums_text.text = "Wilcoxon rank-sum (Group 1 vs 2) p-value = "

    self.update_histograms()
#scores_df['fb_gain'] = (scores_df['second'] - scores_df['first'])/scores_df['first'] #scores_df = scores_df.fillna(0) factor_names = ['fb_type', 'metric_type', 'threshold_factor'] p_values_df = pd.DataFrame(columns=factor_names + ['runksum', 'comparison', 'pvalue']) pvalue_types = ['FBMock', 'FB500', 'FB250', 'FB0'][::-1] for factors_values, group in scores_df.groupby(factor_names): for pvalue_type in ['FBMock', 'FB500', 'FB250', 'FB0']: if factors_values[0] == pvalue_type: continue mock = scores_df.query( 'fb_type=="{}" & metric_type=="{}" & threshold_factor=={}'.format( pvalue_type, *factors_values[1:])) p_value_mock = ranksums(group['score'].values, mock['score'].values) pvalue = stats.ranksums(group['score'].values, mock['score'].values).pvalue print('*' if stats.shapiro(group['score'].values)[1] < 0.05 else '-', factors_values) #p_value_0 = stats.ranksums(group['score'].values, 0)[0] pvalue_dict = dict( zip( factor_names + ['runksum', 'comparison', 'pvalue'], list(factors_values) + [[p_value_mock], [pvalue_type], [pvalue]])) #print(pvalue_dict) p_values_df = p_values_df.append(pd.DataFrame(pvalue_dict), ignore_index=True) #sns.set(rc={'figure.figsize':(2,2)})
boxprops=boxprops, meanprops=meanlineprops, meanline=True, medianprops=medianprops, capprops=capprops, whiskerprops=whiskerprops) axes[3, 0].set_ylim(-0.1, ylimMax) axes[3, 0].set_ylabel("Surface Distance (mm)", fontsize=20) axes[3, 1].boxplot(dist_bs[1], labels=labels, showmeans=True, showfliers=False, boxprops=boxprops, meanprops=meanlineprops, meanline=True, medianprops=medianprops, capprops=capprops, whiskerprops=whiskerprops) axes[3, 1].set_ylim(-0.1, ylimMax) axes[3, 1].set_ylabel("Surface Distance (mm)", fontsize=20) plt.savefig(png_path_out, dpi=fig.dpi) plt.savefig(eps_path_out, dpi=fig.dpi) plt.show() print(ss.ranksums(dist[0][0], dist[0][1]), ss.ranksums(dist_apx[0][0], dist_apx[0][1]), ss.ranksums(dist_md[0][0], dist_md[0][1]), ss.ranksums(dist_bs[0][0], dist_bs[0][1])) results.close()
def _action_averages(test_id, action_id):
    """Return the recorded 'avg' samples of one action in one test."""
    rows = TestActionData.objects. \
        filter(test_id=test_id, action_id=action_id). \
        annotate(average=RawSQL("((data->>%s)::numeric)", ('avg',))). \
        values("average")
    return [row['average'] for row in queryset_to_json(rows)]


def _action_aggregate_mean(test_id, action_id):
    """Return the aggregated mean of one action in one test as a float."""
    rows = queryset_to_json(
        TestActionAggregateData.objects.filter(
            test_id=test_id, action_id=action_id).annotate(
                mean=RawSQL("((data->>%s)::numeric)", ('mean',))).annotate(
                    p50=RawSQL("((data->>%s)::numeric)",
                               ('50%',))).values("mean", "p50"))
    return float(rows[0]['mean'])


def tests_compare_report_experimental(request, test_id_1, test_id_2):
    """Render a comparison report between two tests.

    Actions whose aggregate average differs by more than 3 percent are
    listed as negatives (test 1 higher) or positives (test 1 lower).  In
    addition, for every action of test 2 the per-sample averages of both
    tests are compared with a rank-sum test; actions with p <= 0.05 are
    added to the negatives/positives according to the sign of the relative
    difference of their aggregated means.  Actions present in test 2 but
    without data in test 1 are reported under ``absense``.

    Parameters:
        request: the incoming HTTP request.
        test_id_1, test_id_2: ids of the two tests to compare.

    Returns:
        The rendered ``compare_report.html`` response.
    """
    data = Aggregate.objects.raw(
        """
        SELECT a.url as "id",
        a1.average as "average_1",
        a2.average as "average_2",
        a1.average - a2.average as "avg_diff",
        (((a1.average-a2.average)/a2.average)*100) as "avg_diff_percent",
        a1.median - a2.median as "median_diff",
        (((a1.median-a2.median)/a2.median)*100) as "median_diff_percent"
        FROM
        (SELECT action_id, average, median FROM jltc.aggregate WHERE test_id = %s) a1,
        (SELECT action_id, average, median FROM jltc.aggregate WHERE test_id = %s) a2,
        jltc.action a
        WHERE a1.action_id = a2.action_id and a.id = a1.action_id
        """, [test_id_1, test_id_2])
    reasonable_percent = 3
    negatives = []
    positives = []
    absense = []
    MWW_test = []

    for row in data:
        if row.avg_diff_percent > reasonable_percent:
            negatives.append(row)
        elif row.avg_diff_percent < -reasonable_percent:
            positives.append(row)

    test_1_actions = list(
        Aggregate.objects.annotate(url=F('action__url')).filter(
            test_id=test_id_1).values('url'))
    test_2_actions = list(
        Aggregate.objects.annotate(url=F('action__url')).filter(
            test_id=test_id_2).values('url'))
    # Each entry is a {'url': ...} dict, so comparing the urls is equivalent
    # to comparing the dicts while avoiding a list scan per entry.
    test_1_urls = set(entry['url'] for entry in test_1_actions)
    for url in test_2_actions:
        if url['url'] not in test_1_urls:
            absense.append(url)

    action_list_2 = TestActionAggregateData.objects.filter(
        test_id=test_id_2).values()
    for action in action_list_2:
        action_id = action['action_id']
        action_url = Action.objects.values().get(id=action_id)['url']
        # Samples are collected per action; sharing the lists across
        # iterations would mix the data of different actions into one test.
        avg_list_1 = _action_averages(test_id_1, action_id)
        avg_list_2 = _action_averages(test_id_2, action_id)
        logger.info(action_id)
        if not avg_list_1:
            absense.append(action_url)
        else:
            p_val = stats.ranksums(avg_list_1, avg_list_2)[1]
            if p_val <= 0.05:
                mean_1 = _action_aggregate_mean(test_id_1, action_id)
                mean_2 = _action_aggregate_mean(test_id_2, action_id)
                # Relative difference in percent; the subtraction must be
                # parenthesised, otherwise mean_2 / mean_2 is evaluated first.
                mean_diff_percent = ((mean_1 - mean_2) / mean_2) * 100
                entry = {
                    "id": action_url,
                    "mean_diff_percent": mean_diff_percent,
                    "mean_1": mean_1,
                    "mean_2": mean_2
                }
                if mean_diff_percent > 0:
                    negatives.append(entry)
                else:
                    positives.append(entry)
            MWW_test.append({"url": action_url, "p_val": p_val})
            logger.info("MWW RankSum P for 1 and 2 = {}".format(p_val))
    return render(
        request, 'compare_report.html', {
            'negatives': negatives,
            'positives': positives,
            'absense': absense,
            'MWW_test': MWW_test,
        })
data=np.stack([r2_sig_pr, r2_noise_pr, r2_interference_pr]).T)]) r2 = r2.melt() r2['corrected'] = np.tile(np.concatenate(((False * np.ones(len(r2_sig)).astype(bool), (True * np.ones(len(r2_sig)).astype(bool))))), [3]) r2 = r2.rename(columns={'value': r'$cvR^2$', 'variable': 'Regressor'}) # model coefficients coefs = pd.DataFrame(columns=[r"$\Delta$ Signal"+"\nmagnitude", r"$\Delta$ Shared"+"\nnoise variance", r"$\Delta$ Noise" +"\ninterference"], data=coefs[:, :-1]) coefs = coefs.melt() coefs = coefs.rename(columns={'value': 'Coefficient', 'variable': 'Regressor'}) # stats for r2 for each predictor across sites. Is significant? r2_raw = r2[r2.corrected==False] x = r2_raw[r2_raw.Regressor==r"$\Delta$ Signal magnitude"][r"$cvR^2$"] U, pval = ss.ranksums(x, np.zeros(x.shape[0])) m = x.mean() print(f"R2 for signal magnitude, pval: {pval}, U: {U}, mean: {m}\n") x = r2_raw[r2_raw.Regressor==r"$\Delta$ Shared noise variance"][r"$cvR^2$"] U, pval = ss.ranksums(x, np.zeros(x.shape[0])) m = x.mean() print(f"R2 for shared noise variance, pval: {pval}, U: {U}, mean: {m}\n") x = r2_raw[r2_raw.Regressor==r"$\Delta$ Noise interference"][r"$cvR^2$"] U, pval = ss.ranksums(x, np.zeros(x.shape[0])) m = x.mean() print(f"R2 for noise interference, pval: {pval}, U: {U}, mean: {m}\n") # same for corrected r2_raw = r2[r2.corrected==True]
ranks.append(rankdata(ms).tolist()) ranks = np.array(ranks) mean_ranks = np.mean(ranks, axis=0) best_clusters.append(np.argmax(mean_ranks) + 2) # print("\nRanks:\n", ranks) # print("\nMean ranks:\n", ) alpha = .05 length = len(clfs) s = np.zeros((length, length)) p = np.zeros((length, length)) for i in range(length): for j in range(length): s[i, j], p[i, j] = ranksums(ranks.T[i], ranks.T[j]) _ = np.where((p < alpha) * (s > 0)) conclusions = [list(1 + _[1][_[0] == i]) for i in range(length)] t.append(["%s" % div] + ["%.3f" % v for v in mean_ranks]) # t.append([''] + [", ".join(["%i" % i for i in c]) # if len(c) > 0 else nc # for c in conclusions]) t.append([''] + [ ", ".join(["%i" % i for i in c]) if len(c) > 0 and len(c) < len(clfs) - 1 else ("all" if len(c) == len(clfs) - 1 else "---") for c in conclusions ]) # print(t)
def calc_p_value(a_vec, b_vec, is_normal = True):
    """Return the p-value of a two-sample comparison of *a_vec* and *b_vec*.

    An independent two-sample t-test is used when *is_normal* is true,
    otherwise the Wilcoxon rank-sum test.
    """
    two_sample_test = stats.ttest_ind if is_normal else stats.ranksums
    result = two_sample_test(a_vec, b_vec)
    # Both tests return (statistic, p-value).
    return result[1]
else: normpop.append(False) print "population is NOT normal" fdatasize.write("--> population is NOT normal --> " "p-value (Shapiro's test) :" + str(p) + "\n\n") plotlabels.append(dirname) print normpop print sums print plotlabels if False in normpop: """ Non parametric Wilcoxon rank sum test.""" print "At least one sample does Not have a normal distibution" \ "--> wilcoxon rank sum test" fdatasize.write("At least one sample does Not have a normal " "distibution --> wilcoxon rank sum test" + "\n") statrank, prank = stats.ranksums(*sums) if prank > 0.05: print "--> populations are NOT statiscically different " \ "--> p-value is " + str(prank) fdatasize.write("--> populations are NOT statiscically different " "--> p-value (wilcoxon rank sum test) : " + str(prank) + "\n\n") else: print "--> populations are statiscically different " \ "--> p-value is " + str(prank) fdatasize.write("--> populations are statiscically different " "--> p-value (wilcoxon rank sum test) : " + str(prank) + "\n\n") else: """ Bartlett's test for equal variance."""
verheek_mm_noNN = verheek_merged.copy() verheek_mm_noNN = verheek_mm_noNN.query("karyotype != 'karyotype: NN'") verheek_mm_noNN.loc[:, '1007_s_at': 'AFFX-TrpnX-M_at'] = min_max_scaler.fit_transform( verheek_mm_noNN.loc[:, '1007_s_at':'AFFX-TrpnX-M_at']) ## loop to look for most significant genes x = {} y = 0 for i in verheek_mm.loc[:, '1007_s_at':'AFFX-TrpnX-M_at'].columns: y += 1 gene_high = verheek_mm.loc[verheek_mm.loc[:, i] > 0.7] gene_low = verheek_mm.loc[verheek_mm.loc[:, i] < 0.3] if gene_high.shape[0] > 20: if gene_low.shape[0] < 400: z = ranksums(gene_high.os, gene_low.os)[1] if z < 0.05: print(i, y) x[i] = [gene_high.shape[0], gene_low.shape[0], z] # change to dataframe sig = pd.DataFrame.from_dict(x, orient='index') sig.columns = ['high_no', 'low_no', 'sig'] sig = sig.join(probes['Gene Symbol']) sig.to_csv('sig.csv') # resume from here no need to test all sig again sig = pd.read_csv('sig.csv', index_col=0) gene = '217975_at' # WBP5 # gene = sig.sort_values('sig').index[4] symbol = probes.loc[gene]['Gene Symbol']
print(delta_class_gain(call_total, 'empirical_reg', 'BS')) print(delta_class_gain(call_total, 'DNN', 'empirical_reg')) separated_BS = pd.pivot_table(call_total, values='SSE_DNN', columns=['delta_class'], index=['ut'], aggfunc=np.sum) separated_MV = pd.pivot_table(call_total, values='SSE_empirical_reg', columns=['delta_class'], index=['ut'], aggfunc=np.sum) print(1 - separated_BS / separated_MV) for idx, data in call_total.groupby('delta_class'): print(delta_class_gain(data, 'empirical_reg', 'BS')) print(idx, ranksums(data['SSE_empirical_reg'], data['SSE_BS'])) for idx, data in call_total.groupby('delta_class'): print(delta_class_gain(data, 'DNN', 'empirical_reg')) print(idx, ranksums(data['SSE_empirical_reg'], data['SSE_DNN'])) for idx, data in call_total.groupby(['delta_class', 'ut']): print(idx, 1 - data['SSE_DNN'].sum() / data['SSE_empirical_reg'].sum(), ranksums(data['SSE_empirical_reg'], data['SSE_DNN'])) put_total = get_backtesting(total_data, start_date=datetime(2015, 1, 15), end_date=datetime(2019, 12, 30), cp=-1, rolling=False, TESTING_PERIOD=240 * 9,
if ith_base != None: t_alt_pos_from_end.append( min(ith_base, read_i.query_length - ith_base)) # Flanking indels: t_alt_flanking_indel.append(flanking_indel_i) # Inconsistent read or 2nd alternate calls: else: t_noise_read_count += 1 # Done extracting info from tumor tBAM. Now tally them: t_ref_mq = mean(t_ref_read_mq) t_alt_mq = mean(t_alt_read_mq) t_z_ranksums_mq = stats.ranksums(t_alt_read_mq, t_ref_read_mq)[0] t_ref_bq = mean(t_ref_read_bq) t_alt_bq = mean(t_alt_read_bq) t_z_ranksums_bq = stats.ranksums(t_alt_read_bq, t_ref_read_bq)[0] t_ref_NM = mean(t_ref_edit_distance) t_alt_NM = mean(t_alt_edit_distance) t_z_ranksums_NM = stats.ranksums(t_alt_edit_distance, t_ref_edit_distance)[0] t_NM_Diff = t_alt_NM - t_ref_NM - abs(indel_length) t_concordance_fet = stats.fisher_exact( ((t_ref_concordant_reads, t_alt_concordant_reads), (t_ref_discordant_reads, t_alt_discordant_reads)))[1]
noisePval = np.empty(len(db)) baseRange = [-0.2, 0] responseRange = [0, 0.2] for indCell, cell in db.iterrows(): spikeData, eventData = dataloader.get_session_ephys( cell, 'noiseburst') eventOnsetTimes = eventData.get_event_onset_times() alignmentRange = [baseRange[0], responseRange[1]] (spikeTimesFromEventOnset, trialIndexForEachSpike, indexLimitsEachTrial) = spikesanalysis.eventlocked_spiketimes( spikeData.timestamps, eventOnsetTimes, alignmentRange) nspkBase = spikesanalysis.spiketimes_to_spikecounts( spikeTimesFromEventOnset, indexLimitsEachTrial, baseRange) nspkResp = spikesanalysis.spiketimes_to_spikecounts( spikeTimesFromEventOnset, indexLimitsEachTrial, responseRange) [zScore, pVal] = stats.ranksums(nspkResp, nspkBase) noiseZscore[indCell] = zScore noisePval[indCell] = pVal db['noiseZscore'] = noiseZscore db['noisePval'] = noisePval #Laser pulse response #NOTE: This does the same thing as the noise burst response, but I am not making a function #because things are getting hidden and I want to be more explicit about what I am doing. pulseZscore = np.empty(len(db)) pulsePval = np.empty(len(db)) baseRange = [-0.1, 0] responseRange = [0, 0.1] for indCell, cell in db.iterrows(): spikeData, eventData = dataloader.get_session_ephys( cell, 'laserpulse')
fontsize=fontSizeLabels) plt.ylabel('Number of cells', fontsize=fontSizeLabels) extraplots.boxoff(plt.gca()) # -- Stats: test whether the modulation index distribution for all good cells is centered at zero -- # print 'Total number of sound responsive good cells is:', sum( soundRespAStr), '\nNumber of cells significantly modulated is:', len( sigModIAStr) (Z, pVal) = stats.wilcoxon(allModIAStr) print 'For AStr: Mean mod index is {:.3f}. Using the Wilcoxon signed-rank test, comparing the modulation index distribution for all good cells to zero yielded a p value of {:.3f}'.format( np.mean(allModIAStr), pVal) (Z, pVal) = stats.wilcoxon(sigModIAStr) print 'For significantly modulated cells in AC: Mean mod index is {:.3f}. Using the Wilcoxon signed-rank test, comparing the modulation index distribution to zero yielded a p value of {:.3f}'.format( np.mean(sigModIAC), pVal) (Z, pValBtAreas) = stats.ranksums(np.abs(allModIAC), np.abs(allModIAStr)) print 'Using wilcoxon rank sum test to compare ABSOLUTE modulation indices between AC and AStr, p value is {:.3f}'.format( pValBtAreas) print 'Median absolute mod index for AC: {}'.format( np.median(np.abs(allModIAC))) print 'Median absolute mod index for AStr: {}'.format( np.median(np.abs(allModIAStr))) #(oddRatio, pValFisher) = stats.fisher_exact([[sum(soundRespAC)-len(sigModIAC), len(sigModIAC)],[sum(soundRespAStr)-len(sigModIAStr), len(sigModIAStr)]]) #print 'Using Fishers exact test to compare fraction of modulated cells between AC and AStr, p value is {:.3f}'.format(pValFisher) (Z, pValBtAreasSig) = stats.ranksums(np.abs(sigModIAC), np.abs(sigModIAStr)) print 'Using wilcoxon rank sum test to compare ABSOLUTE modulation indices between significantly modulated cells in AC and AStr, p value is {:.3f}'.format( pValBtAreasSig) print 'Median absolute mod index for modulated cells in AC: {}'.format( np.median(np.abs(sigModIAC)))
odor_end=odor_end) correlation.plot_correlation_across_days(temp, days, loop_keys=['mouse', 'odor'], shuffle=shuffle, figure_path=figure_path, reuse=False, save=True, analyze=False, plot_bool=True, odor_end=odor_end) ixa = temp['odor_valence'] == 'CS+' ixb = temp['odor_valence'] == 'CS-' a = temp['corrcoef'][ixa] b = temp['corrcoef'][ixb] print(ranksums(a, b)) if condition.name == 'PIR': naive_config = statistics.analyze.PIR_NAIVE_Config() data_path_ = os.path.join(Config.LOCAL_DATA_PATH, Config.LOCAL_DATA_TIMEPOINT_FOLDER, naive_config.condition.name) save_path_ = os.path.join(Config.LOCAL_EXPERIMENT_PATH, 'COUNTING', naive_config.condition.name) res_naive = fio.load_pickle(os.path.join(save_path_, 'dict.pkl')) learned_day_per_mouse_, last_day_per_mouse_ = get_days_per_mouse( data_path_, naive_config.condition) # res = statistics.analyze.analyze_data(save_path, condition_config, m_threshold=.1)
# extract all predicted driver genes | sift_score genelist1 = pd.read_csv( '/encrypted/e3000/gatkwork/COREAD-ESCA-predicteddriver.tsv', header=None, skiprows=0, sep='\t') genelist1.columns = ['geneName'] merged_df1 = sift_df.merge(genelist1, how='inner', on=['geneName']) merged_df1.drop(['geneName'], axis=1, inplace=True) # calculate p-value for ranksums with SIFT stat, pvalue = ranksums(merged_df, merged_df1) print(pvalue) #################### POLYPHEN ################### # calculate ranksums for POLYPHEN polyphen_df = df[['geneName', 'polyphen']] # extract all non-driver genes | sift_score genelist = pd.read_csv('/encrypted/e3000/gatkwork/COREAD-ESCA-all-driver.tsv', header=None, skiprows=0, sep='\t') genelist.columns = ['geneName']
for i in range(nb): # ( actual labels, predicted probabilities ) fpr[i], tpr[i], _ = roc_curve(test_labels[:, i], test_prediction[:, i]) # flip here roc_auc[i] = auc(fpr[i], tpr[i]) return [round(roc_auc[x], 3) for x in range(nb)] Y_pred = F Y = np_utils.to_categorical(label, nb_classes) ROC = AUC(Y, Y_pred, nb_classes) print('AUC =', ROC[1]) import scipy.stats as stat a = Y_pred[:, 0] b = Y[:, 0] groups = [a[b == i] for i in xrange(2)] rs = stat.ranksums(groups[0], groups[1])[1] print('p = ', rs) score = model.evaluate(X, Y, verbose=0) print('Test loss:', score[0]) print('Test accuracy:', score[1]) #find indeces where Y is greater than a certain value idx = Y_pred[:, 0] indeces = [i for i, v in enumerate(idx >= 0.5) if v]
ortholog = subdata['ortholog'] if list(subdata.values())[0] == "E": try: essential.append(float(conservation[ortholog])) except: pass if list(subdata.values())[0] == "NE": try: non_essential.append(float(conservation[ortholog])) except: pass print("The number of essential genes: %d" % len(essential)) print("The number of non-essential genes: %d" % len(non_essential)) print(ranksums(essential, non_essential)) # # # https://blog.csdn.net/aijiudu/article/details/89387328 # print(pd.DataFrame(essential).describe()) # print(pd.DataFrame(non_essential).describe()) # print("-------------------------------------") # Results: # This organism is: S cerevisiae # The number of essential genes: 1033 # The number of non-essential genes: 4301 # RanksumsResult(statistic=4.84696083548837, pvalue=1.2536716457740872e-06) # This organism is: S pombe # The number of essential genes: 1140 # The number of non-essential genes: 2600
print(dic['Question_Text'][mask].iloc[i]) tags = tag_matrix.ix[:, mask].ix[:, i] print("Tags: "+'%s, '*tags.sum() % tuple(tags.index.str.lower()[tags])) # Checks Ordinal data, which is used in rating questions (rate from 1-6) if dic['Data_type'][mask].iloc[i] == 'Ordinal': print(dic['Data_values'][mask].iloc[i]) print("Category\t\tn\tMean\t1\t2\t3\t4\t5\t6\t(4-6)\tpWilc.\tpBinom.") # Rates data by each demographic for j in range(len(category_names)): width = np.zeros(6) total = subframe.ix[:, i].ix[categories.ix[:, j]].valid().count() width = np.histogram(subframe.ix[:, i].ix[categories.ix[:, j]], bins=np.arange(1, 8), range=(1,7), normed=True)[0] pval = stats.ranksums(subframe.ix[:, i].ix[categories.ix[:, j]].dropna(), subframe.ix[:, i].ix[-categories.ix[:, j]].dropna())[1] yes = (subframe.ix[:, i].ix[categories.ix[:, j]].dropna() > 3).sum() no = (subframe.ix[:, i].ix[categories.ix[:, j]].dropna() <= 3).sum() p0 = (subframe.ix[:, i].ix[-categories.ix[:, j]].dropna() > 3).mean() pval_binom = stats.binom_test((yes, no), p=p0) pval_comb = stats.combine_pvalues((pval, pval_binom))[1] print('%21s\t%i' % (category_names[j], total) + '\t%2.1f' % (subframe.ix[:, i].ix[categories.ix[:, j]]).mean() + '\t%3.1f%%'*6 % tuple(width*100)+'\t%3.1f%%' % (width[3:].sum()*100) +'\t%3.2f' % (pval)+'*'*(pval < 0.05)+'\t%3.2f' % (pval_binom)+'*'*(pval_binom < 0.05)) #print '%21s\t%i' % (category_names[j], total) + '\t%2.1f' % (subframe.ix[:, i].ix[categories.ix[:, j]]).mean() + '\t%3.1f%%'*6 % tuple(width*100) +'\t%3.1f%%' % (width[3:].sum()*100)+'\t%3.2f' % (pval_binom)+'*'*(pval < 0.05) print elif dic['Data_type'][mask].iloc[i] == 'Binary': responsetypes = dic['Data_values'][mask].iloc[i].split(';') print("Category\t\t n\t"+ '%s\t'*len(responsetypes) % tuple(responsetypes)+"p-value") for j in range(len(category_names)): yes = (subframe.ix[:, i].ix[categories.ix[:, j]]==responsetypes[0]).sum() no = (subframe.ix[:, i].ix[categories.ix[:, j]]==responsetypes[1]).sum() total = (yes+no)*1.
#print(wilcoxon(rt-np.mean(rt), correction = True)) print('Mu_ge equal 0') print('***') GE = pd.read_csv('C:/Users/anivia/Desktop/geDJ.txt', sep="\s+", header=None, names=['date', 'open', 'high', 'low', 'close', 'vol']) SP = pd.read_csv( 'https://www.math.ust.hk/~macwyu/MAFS5110_2018-2019/MAFS5110_2018-2019/Chapter_1/sp500.txt', sep="\s+") logreturn_GE = np.diff(np.log(np.array(GE["close"]))) logreturn_sp500 = np.diff(np.log(np.array(SP["close"]))) da2 = pd.concat([pd.DataFrame(logreturn_GE), pd.DataFrame(logreturn_sp500)], axis=1) da2.columns = ["logreturn_GE", "logreturn_sp500"] da2.boxplot(column=['logreturn_GE', 'logreturn_sp500']) #plt.show() print('***') print(stats.mood(logreturn_sp500, logreturn_GE)) print('H0 can be rejected, the variances are significantly different') print(ttest_ind(logreturn_sp500, logreturn_GE, equal_var=True)) print('Means are insignificantly different') #cm=sms.CompareMeans(sms.DescrStatsW(logreturn_sp500),sms.DescrStatsW(logreturn_GE)) #print('C.I. is ',cm.tconfint_diff()) print('so they are not equal.') from scipy.stats import ranksums print(ranksums(logreturn_sp500, logreturn_GE)) print('two groups do not have equal meDIans')
CLIP_y = [] for z in range(0, len(sorted_CLIP)): CLIP_y.append(float(1.0 / len(sorted_CLIP)) * z) random_y = [] for z in range(0, len(sorted_random)): random_y.append(float(1.0 / len(sorted_random)) * z) WSN_y = [] for z in range(0, len(sorted_WSN)): WSN_y.append(float(1.0 / len(sorted_WSN)) * z) statistic, pvalue_CLIP = stats.ranksums(sorted_total, sorted_CLIP) print pvalue_CLIP params = {'mathtext.default': 'regular'} plt.rcParams.update(params) plt.scatter(sorted_total, total_y, s=1, color='k', alpha=0.5, label="not CLIP n=" + str(len(sorted_total))) plt.scatter(sorted_CLIP, CLIP_y, s=1, color='r',