def test_qqplot_unequal():
    """Swapping two samples of unequal length must not change the plot.

    The data of the first artist on the axes and the number of artists are
    compared between ``qqplot_2samples(a, b)`` and ``qqplot_2samples(b, a)``.
    """
    rng = np.random.RandomState(0)
    short_sample = rng.standard_normal(100)
    long_sample = rng.standard_normal(200)

    fig_forward = qqplot_2samples(short_sample, long_sample)
    fig_swapped = qqplot_2samples(long_sample, short_sample)

    artists_forward = fig_forward.get_axes()[0].get_children()
    artists_swapped = fig_swapped.get_axes()[0].get_children()

    x_forward, y_forward = artists_forward[0].get_data()
    x_swapped, y_swapped = artists_swapped[0].get_data()
    np.testing.assert_allclose(x_forward, x_swapped)
    np.testing.assert_allclose(y_forward, y_swapped)

    assert len(artists_forward) == len(artists_swapped)


# NOTE(review): the tests below take ``self``; presumably they are methods of
# a test class whose header (and the setup of ``self.res``, ``self.prbplt``,
# ``self.other_prbplot`` and ``self.other_array``) lies outside this excerpt.
@pytest.mark.matplotlib
def test_qqplot(self, close_figures):
    """Smoke test: ``qqplot`` on ``self.res`` with ``line="r"``."""
    qqplot(self.res, line="r")


@pytest.mark.matplotlib
def test_qqplot_2samples_prob_plot_obj(self, close_figures):
    """Smoke test: two-sample Q-Q plot of two ``ProbPlot`` instances.

    Runs once per supported ``line`` option.
    """
    for line_option in ("r", "q", "45", "s"):
        qqplot_2samples(self.prbplt, self.other_prbplot, line=line_option)


@pytest.mark.matplotlib
def test_qqplot_2samples_arrays(self, close_figures):
    """Smoke test: two-sample Q-Q plot built from ``self.res`` and
    ``self.other_array``.

    Runs once per supported ``line`` option.
    """
    for line_option in ("r", "q", "45", "s"):
        qqplot_2samples(self.res, self.other_array, line=line_option)
def qq_plot_2samples(self):
    """
    Draw a Q-Q plot comparing ``self.column_data`` with ``self.var_data``.

    A new subplot is added to ``self.figure`` and stored in ``self.ax``; the
    two samples are wrapped in ``sm.ProbPlot`` objects, plotted on that axes
    and the canvas is redrawn.

    :return: None (the plot is drawn on ``self.ax``)
    """
    self.ax = self.figure.add_subplot(111)
    # ``Axes.hold`` was deprecated in matplotlib 2.0 and removed in 3.0, so
    # the former ``self.ax.hold(True)`` call raises AttributeError on current
    # releases. Axes always accumulate artists now, so no replacement is
    # needed.
    pp_x = sm.ProbPlot(self.column_data)
    pp_y = sm.ProbPlot(self.var_data)
    qqplot_2samples(pp_x, pp_y, ax=self.ax)
    self.canvas.draw()
def athlete_qqplot(df1, df2):
    """Draw one Q-Q plot per athlete variable on a 2x2 grid of subplots.

    For every name in the module-level ``athlete_var_list`` the column of
    that name is taken from ``df1[0]`` and ``df2[0]`` and compared with
    ``qqplot_2samples`` against a 45-degree reference line.

    :param df1: indexable pair; ``df1[0][var]`` supplies the first sample and
        ``df1[1]`` is passed as ``xlabel``.
    :param df2: indexable pair; ``df2[0][var]`` supplies the second sample and
        ``df2[1]`` is passed as ``ylabel``.
    :return: None (the figure is shown with ``plt.show()``)
    """
    n_rows, n_cols = 2, 2
    # NOTE(review): the grid has room for four panels only; ``plt.subplot``
    # will raise if ``athlete_var_list`` holds more than four names.
    for position, var in enumerate(athlete_var_list, start=1):
        ax = plt.subplot(n_rows, n_cols, position)
        # ``Axes.axis`` does not accept ``facecolor`` (ignored by old
        # matplotlib, TypeError on current releases); the background colour
        # is set through ``set_facecolor``.
        ax.set_facecolor('blue')
        qqplot_2samples(df1[0][var], df2[0][var],
                        xlabel=df1[1], ylabel=df2[1],
                        line='45', ax=ax)
        plt.title('{var}'.format(var=var))
    # Leave head-room above the panels for the figure-level title.
    plt.subplots_adjust(top=0.9)
    plt.gcf().suptitle('Comparison of Athlete Variables')
    plt.show()
def test_correct_labels(close_figures, reset_randomstate, line, x_size, y_size, labels):
    """Check which sample each axis label of ``qqplot_2samples`` refers to.

    With default labels the axis texts must mention "1st"/"2nd"; with
    user-supplied labels they must contain "X"/"Y". When ``x_size <= y_size``
    the second sample's label is expected on the x-axis, otherwise the first
    sample's.
    """
    rng = np.random.RandomState(9876554)
    first_sample = rng.normal(loc=0, scale=0.1, size=x_size)
    second_sample = rng.standard_t(3, size=y_size)
    first_pp = sm.ProbPlot(first_sample)
    second_pp = sm.ProbPlot(second_sample)

    axes = qqplot_2samples(first_pp, second_pp, line=line, **labels).get_axes()[0]
    x_text = axes.get_xlabel()
    y_text = axes.get_ylabel()

    # Marker expected in the label belonging to the first / second sample.
    if labels:
        first_marker, second_marker = "X", "Y"
    else:
        first_marker, second_marker = "1st", "2nd"

    if x_size <= y_size:
        expected_on_x, expected_on_y = second_marker, first_marker
    else:
        expected_on_x, expected_on_y = first_marker, second_marker

    assert expected_on_x in x_text
    assert expected_on_y in y_text
def test_axis_order(close_figures):
    """Check which sample lands on which axis by comparing axis extents.

    One sample is drawn with standard deviation 1 and the other with 0.01, so
    the axis carrying the wide sample has the larger span.
    """

    def axis_spans(first, second):
        """Plot the two samples and return ``(x_span, y_span)`` of the axes."""
        axes = qqplot_2samples(first, second, "x", "y").get_axes()[0]
        y_span = np.diff(axes.get_ylim())[0]
        x_span = np.diff(axes.get_xlim())[0]
        return x_span, y_span

    wide = np.random.normal(10, 1, (100,))
    narrow = np.random.normal(1, 0.01, (100,))

    # Equal sizes.
    x_span, y_span = axis_spans(wide, narrow)
    assert y_span < x_span

    # First sample longer than the second.
    wide_long = np.random.normal(10, 1, (1000,))
    x_span, y_span = axis_spans(wide_long, narrow)
    assert y_span < x_span

    # Second sample longer than the first.
    narrow_long = np.random.normal(1, 0.01, (1000,))
    x_span, y_span = axis_spans(wide, narrow_long)
    assert x_span < y_span
def qqplot_2(var, medal_df, non_medal_df, male_df, female_df, winter_df, summer_df):
    """Draw three two-sample Q-Q plots of column ``var``.

    The comparisons are medal vs. non-medal, female vs. male and winter vs.
    summer, each against a 45-degree reference line and each given a title
    via ``plt.title``.
    """
    # (first sample, second sample, xlabel, ylabel, title template)
    # NOTE(review): in the second entry the labels are ordered
    # ('Male', 'Female') while the data are (female, male); the other two
    # entries label in data order. Which axis each sample ends up on depends
    # on the statsmodels version - confirm which ordering is intended.
    comparisons = (
        (medal_df, non_medal_df, 'Medal', 'Non-Medal',
         '{var} for Medal v. Non-Medal'),
        (female_df, male_df, 'Male', 'Female',
         '{var} for Female v. Male'),
        (winter_df, summer_df, 'Winter', 'Summer',
         '{var} for Winter v. Summer'),
    )
    for first_df, second_df, x_text, y_text, title_template in comparisons:
        qqplot_2samples(first_df[var], second_df[var],
                        xlabel=x_text, ylabel=y_text, line='45')
        plt.title(title_template.format(var=var))
# Figure 14.1: comparing ACFs
# One ACF curve per MCMC sampler in ``algos`` and per selected component t of
# the chain, computed after discarding the first 10% of iterations.
plt.figure()
nlags = 160
cols = {'Gibbs': 'gray', 'marginal': 'black'}
lss = {'Gibbs': '--', 'marginal': '-'}
for t in [0, 49, 99, 149, 199]:
    for alg_name, alg in algos.items():
        if isinstance(alg, mcmc.MCMC):
            burnin = int(alg.niter / 10)
            acf_x = acf(alg.chain.x[burnin:, t], nlags=nlags, fft=True)
            # Labels starting with an underscore are left out of the legend,
            # so each sampler is listed only once (for t == 0).
            lbl = '_' if t > 0 else alg_name
            plt.plot(acf_x, label=lbl, color=cols[alg_name],
                     linestyle=lss[alg_name], linewidth=2)
plt.axis([0, nlags, -0.03, 1.])
plt.xlabel('lag')
plt.ylabel('ACF')
plt.legend()
if savefigs:
    plt.savefig('acf_gibbs_marginal_smoothing_stochvol.pdf')

# Figure 14.2: qq-plots to check that MCMC samplers target the same posterior
# qqplot_2samples opens a figure of its own when no ``ax`` is given, which
# used to leave the figure created here empty; hand it an axes of this figure
# so that exactly one figure is produced (and saved).
qq_fig = plt.figure()
qqplot_2samples(algos['Gibbs'].chain.x[:, 0],
                algos['marginal'].chain.x[:, 0],
                ax=qq_fig.add_subplot(111))
if savefigs:
    plt.savefig('qqplots_gibbs_vs_marginal_stochvol.pdf')

plt.show()
mean2 = _sum2/len(marks2) sd2 = statistics.stdev(marks2) step2 = sd2/3 print('mean '+str(mean)+ ' mean2 '+str(mean2)) print('sd '+str(sd)+'sd2 '+str(sd2)) print('step '+str(step)+'step2 '+str(step2)) while(i<len(marks2)): Z2.append((marks2[i]-mean2)/step2) i+=1 print(Z2) plt.figure() #plt.scatter(Z, Z2) pp_x = sm.ProbPlot(np.array(marks)) pp_y = sm.ProbPlot(np.array(marks2)) qqplot_2samples(pp_x, pp_y, line='45') #qqplot(Z, Z2, c='r', alpha=0.5, edgecolor='k') plt.xlabel('Section-1') plt.ylabel('Section-2') #plt.show() i=0 while (j < len(marks)): idx = 0 flag = False i = 0 while i < len(scores): if (marks[j] >= scores[i]): flag = True if (i != 0):
'''
Create a Q-Q plot of the quantiles of two randomly generated samples.
'''
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.graphics.gofplots import qqplot_2samples

# Two normal samples of 37 points each, with slightly different location
# and spread.
first_sample = np.random.normal(loc=8.5, scale=2.5, size=37)
second_sample = np.random.normal(loc=8.0, scale=3.0, size=37)

qqplot_2samples(sm.ProbPlot(first_sample), sm.ProbPlot(second_sample))
plt.show()
def test_qqplot_2samples_prob_plot_obj(self, close_figures):
    """Smoke test: two-sample Q-Q plot of two ``ProbPlot`` instances.

    Runs once per supported ``line`` option.
    """
    for line_option in ("r", "q", "45", "s"):
        qqplot_2samples(self.prbplt, self.other_prbplot, line=line_option)
def show_qq_plot(data, current, previous, title, ax, is_spiral=False):
    """Draw a Q-Q plot of ``current`` against ``previous`` on ``ax``.

    Both samples are wrapped in ``sm.ProbPlot``; a regression reference line
    (``line="r"``) is requested, the grid is switched on and ``title`` is set
    on the axes. ``data`` and ``is_spiral`` are accepted but not used here.
    """
    current_pp = sm.ProbPlot(current)
    previous_pp = sm.ProbPlot(previous)
    qqplot_2samples(current_pp, previous_pp, ax=ax, line="r")
    ax.grid()
    ax.set_title(title)
# -*- coding: utf-8 -*-
"""Show a two-sample Q-Q plot of two random normal samples and wait for Enter."""
import numpy as np
import statsmodels.api as sm
from statsmodels.graphics.gofplots import qqplot_2samples
from matplotlib import pyplot as plt

# ``raw_input`` only exists on Python 2; Python 3 renamed it to ``input``.
# Pick whichever is available so the script runs on both.
try:
    wait_for_enter = raw_input
except NameError:
    wait_for_enter = input

###################################################
# QQ-plot
x = np.random.normal(loc=8.5, scale=2.5, size=37)
y = np.random.normal(loc=8.0, scale=3.0, size=37)
pp_x = sm.ProbPlot(x)
pp_y = sm.ProbPlot(y)
fig = qqplot_2samples(pp_x, pp_y, xlabel="N(8.5,2.5)", ylabel="N(8,3)",
                      line=None, ax=None)
fig.show(warn=True)

# Block until the user presses Enter.
wait_for_enter("Enter: ")
def plotDists(topmedDict, nontopmedDict, topmedKeys, nontopmedKeys, graphFileName):
    """Plot and test topmed vs. non-topmed distributions.

    First for the pooled lists returned by ``createListsFromDict`` and then
    for every key pair ``(topmedKeys[i], nontopmedKeys[i])``, this draws a
    scatter plot (``plotScatter``), a histogram (``plotHist``) and a
    two-sample Q-Q plot saved as a PNG whose name starts with
    ``graphFileName``, then prints a two-sample KS test on the non-zero
    values.
    """
    # ---------------- pooled comparison ----------------
    ntm_values, tm_values = createListsFromDict(nontopmedDict, topmedDict)
    low = min(min(ntm_values), min(tm_values))
    high = max(max(ntm_values), max(tm_values))
    grid = numpy.arange(low, high, 0.1)

    plotScatter(ntm_values, tm_values, grid, graphFileName,
                'non_topmed', 'just_topmed')
    plotHist(ntm_values, tm_values, graphFileName, 'non_topmed', 'just_topmed')

    # Q-Q plot
    count = len(ntm_values)
    plt.title('all non-topmed vs all just-topmed QQ n=' + str(count))
    qq_axes = plt.gca()
    tm_prob = sm.ProbPlot(numpy.array(tm_values))
    ntm_prob = sm.ProbPlot(numpy.array(ntm_values))
    qqplot_2samples(data1=tm_prob, data2=ntm_prob,
                    xlabel='non-topmed', ylabel='just-topmed',
                    line="45", ax=qq_axes)
    pooled_png = (graphFileName + '_' + 'non_topmed' + '_vs_' + 'just_topmed'
                  + '_QQ_n=' + str(count) + '.png')
    plt.savefig(pooled_png)
    plt.close()

    # KS test on the entries that are not zero
    tm_nonzero = [value for value in tm_values if value != 0]
    ntm_nonzero = [value for value in ntm_values if value != 0]
    ks_result = ks_2samp(ntm_nonzero, tm_nonzero)
    print('ksTest for non-zero: ' + 'just_topmed' + ' vs ' + 'non_topmed'
          + ' : ' + str(ks_result))

    # ---------------- one comparison per key pair ----------------
    for idx in range(len(topmedKeys)):
        tmkey = topmedKeys[idx]
        ntmkey = nontopmedKeys[idx]

        ntm_values, tm_values = createListsPerEthnicity(
            nontopmedDict, topmedDict, ntmkey, tmkey)
        low = min(min(ntm_values), min(tm_values))
        high = max(max(ntm_values), max(tm_values))
        grid = numpy.arange(low, high, 0.1)

        plotScatter(ntm_values, tm_values, grid, graphFileName, tmkey, ntmkey)
        plotHist(ntm_values, tm_values, graphFileName, tmkey, ntmkey)

        # Q-Q plot
        count = len(ntm_values)
        plt.title(graphFileName + '_' + ntmkey + '_vs_' + tmkey + '_QQ_'
                  + 'n=' + str(count))
        qq_axes = plt.gca()
        ntm_prob = sm.ProbPlot(numpy.array(ntm_values))
        tm_prob = sm.ProbPlot(numpy.array(tm_values))
        qqplot_2samples(data1=tm_prob, data2=ntm_prob,
                        xlabel='non-topmed', ylabel='just-topmed',
                        line="45", ax=qq_axes)
        pair_png = (graphFileName + '_' + ntmkey + '_vs_' + tmkey
                    + '_QQ_n=' + str(count) + '.png')
        plt.savefig(pair_png)
        plt.close()

        # KS test on the non-zero entries of the dictionaries themselves
        # (not on the lists used for the plots above).
        tm_nonzero = [value for value in topmedDict[tmkey] if value != 0]
        ntm_nonzero = [value for value in nontopmedDict[ntmkey] if value != 0]
        ks_result = ks_2samp(ntm_nonzero, tm_nonzero)
        print('ksTest for non-zero: ' + ntmkey + ' vs ' + tmkey + ' : '
              + str(ks_result))
def qqplot(model, X, y, ax=None):
    """Q-Q plot of the observations against one sample drawn from ``model``.

    A single draw is requested with ``model.predict_f_samples(X, 1)`` and its
    ``[0, :, 0]`` slice is compared with the first column of ``y`` against a
    45-degree reference line. The figure is then shown with ``plt.show()``.
    """
    posterior_draw = model.predict_f_samples(X, 1)[0, :, 0]
    observed = y[:, 0]
    qqplot_2samples(
        observed,
        posterior_draw,
        xlabel='Data quantiles',
        ylabel='Posterior quantiles',
        line='45',
        ax=ax,
    )
    plt.show()
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.graphics.gofplots import qqplot_2samples

fig, ax = plt.subplots()

# Each sample is a list of fractions scaled by 5000.
first_sample = np.array([
    0.2938, 0.205, 0.1532, 0.1092, 0.077, 0.0598, 0.0522, 0.0498
]) * 5000
second_sample = np.array([
    0.354, 0.216, 0.144, 0.109, 0.0706, 0.058, 0.028, 0.016
]) * 5000

qqplot_2samples(first_sample, second_sample, ax=ax)

# Identity line across the current x-range, as a visual reference.
diagonal = np.linspace(*ax.get_xlim())
ax.plot(diagonal, diagonal)

plt.xlabel("Quantiles of wins in BBE")
plt.ylabel("Quantiles of wins in real horse race-events")
plt.show()
def make_qqplot(x: List, y: List) -> None:
    """Draw a two-sample Q-Q plot of ``x`` and ``y`` and save it as "testing".

    Both inputs are wrapped in ``sm.ProbPlot`` before plotting; the current
    figure is written with ``plt.savefig``.
    """
    qqplot_2samples(sm.ProbPlot(x), sm.ProbPlot(y))
    plt.savefig("testing")
for value in line.split(","): if counter == 0: k = value data[value] = {} elif counter < 5: data[k]["r" + str(counter)] = value else: data[k]["genre"] = value counter += 1 r1 = [] r3 = [] for k, v in data.items(): for key, value in data[k].items(): if key == "r1": r1.append(float(value)) if key == "r3": r3.append(float(value)) x = sm.ProbPlot(np.array(r1)) y = sm.ProbPlot(np.array(r3)) fig = sm.qqplot_2samples(x, y, xlabel="avg rating website 1 quantiles", ylabel="avg rating website 3 quantiles", line="r") plt.title("Q-Q Plot") plt.show()
def test_qqplot_2samples_arrays(self, close_figures):
    """Smoke test: two-sample Q-Q plot built from ``self.res`` and
    ``self.other_array``.

    Runs once per supported ``line`` option.
    """
    for line_option in ("r", "q", "45", "s"):
        qqplot_2samples(self.res, self.other_array, line=line_option)
] #2.4(1) age_mean = statistics.mean(ages) age_median = statistics.median(ages) age_deviation = statistics.pstdev(ages) print('%.2f' % age_mean, '%.2f' % age_median, '%.2f' % age_deviation) fats_mean = statistics.mean(fats) fats_median = statistics.median(fats) fats_deviation = statistics.pstdev(fats) print('%.2f' % fats_mean, '%.2f' % fats_median, '%.2f' % fats_deviation) #2.4(2) plt.boxplot(ages, patch_artist=True, labels=['ages']) plt.show() plt.boxplot(fats, patch_artist=True, labels=['fats%']) plt.show() #2.4(3) plt.scatter(ages, fats) plt.xlabel("ages") plt.ylabel("fats%") plt.show() ages_array = np.asarray(ages) fats_array = np.asarray(fats) pp_ages = sm.ProbPlot(ages_array) pp_fats = sm.ProbPlot(fats_array) qqplot_2samples(pp_fats, pp_ages, xlabel='ages', ylabel='fats%', line='r') plt.show()
def main():
    """Scale the SGA_NxN dataset onto the cF3 dataset and write diagnostics.

    Steps, in order:

    1. Read both datasets and collect the values of the interactions they
       share.
    2. Shift the sorted SGA values so that they contain as many negative
       values as the sorted cF3 values.
    3. Derive score -> scaling-factor pairs from overlapping windows at the
       two tails of the sorted values and fit a ``UnivariateSpline`` to them.
    4. Apply shift and (clamped) spline to every SGA value, write the scaled
       dataset, the spline table and a log file.
    5. Save scatter plots and Q-Q plots before and after scaling.

    All input and output paths are hard-coded relative paths.
    """
    outdir = '../../Supplemental_Figures/SGA_Scaling/scaler_output'

    # make output folder (an already existing folder is fine)
    try:
        os.makedirs(outdir)
    except FileExistsError:
        pass

    # define datasets, datasetB is scaled to match datasetA
    datasetA = '../../Data/SGA_Scaling/cF3.txt'
    datasetB = '../../Data/SGA_Scaling/SGA_NxN_avg.txt'

    # read in the two datasets
    # NOTE(review): ``ints`` / ``b_ints`` are used below as mappings from an
    # interaction key to a numeric score - confirm against
    # read_square_dataset_small.
    ints, profs, genes = read_square_dataset_small(datasetA, "", "\t",
                                                   split=True,
                                                   profiles=False)
    b_ints, b_profs, b_genes = read_square_dataset_small(datasetB, "", "\t",
                                                         split=True,
                                                         profiles=False)

    # reduce the paths to bare file names without extension
    # (not used again in this function)
    datasetA = datasetA.split('/')[-1].split('.')[0]
    datasetB = datasetB.split('/')[-1].split('.')[0]

    # values of the interactions present in both datasets, in matching order
    avalues = []
    bvalues = []
    for i in ints:
        if i in b_ints:
            avalues.append(ints[i])
            bvalues.append(b_ints[i])
    asorted = sorted(avalues)
    bsorted = sorted(bvalues)

    # shift datasetB so that it has the same number of negative values
    # as datasetA (makes it a little easier to scale): the B value at the
    # rank where A turns non-negative is moved to zero
    adjustment = -bsorted[len([x for x in asorted if x < 0])]
    bsorted = [x + adjustment for x in bsorted]

    # plot scatter plot showing shared interactions
    density_scatter_plot(ints, b_ints, outdir + '/unscaled_scatter.png',
                         xlabel='S-score', ylabel='SGA score')

    # record dataset information in log (file is created / overwritten)
    with open(outdir + '/scaler_log.txt', 'w') as f:
        f.write("cF3 EMAP has {} interactions\n".format(len(ints)))
        f.write("SGA_NxN has {} interactions\n".format(len(b_ints)))
        f.write("The sets have {} interactions in common\n".format(
            len(avalues)))
        f.write("Dataset correlation = {}\n".format(
            np.corrcoef(avalues, bvalues)[0][1]))
        f.write("Adjustment so that the SGA_NxN shared interaction "
                "set has the same number of negative values as the "
                "cF3 EMAP.\nadjustment={}\n".format(adjustment))

    ## Computing scaling values
    # The sorted shared values are partitioned into `bins` overlapping
    # windows (each window spans two bin widths).
    # For a window, the mean value in datasetA is divided by the mean value
    # in datasetB; this gives a scaling factor for datasetB scores around the
    # window's mean datasetB score.
    # Values close to zero give unpredictable scaling factors, so only the
    # windows below `lower_threshold` and above `upper_threshold` are used.
    # Depending on the size of your overlap you may want to tweak the number
    # of bins.
    bins = 500
    binsize = len(avalues) / bins
    score = []
    scale = []
    lower_threshold = 0.05
    upper_threshold = 0.99
    # lower tail
    for i in np.arange(1, bins * lower_threshold):
        start = int(i * binsize - binsize)
        end = int(i * binsize + binsize)
        score.append(np.mean(bsorted[start:end]))
        scale.append(np.mean(asorted[start:end]) / np.mean(bsorted[start:end]))
    # upper tail
    for i in np.arange(bins * upper_threshold, bins):
        start = int(i * binsize - binsize)
        end = int(i * binsize + binsize)
        score.append(np.mean(bsorted[start:end]))
        scale.append(np.mean(asorted[start:end]) / np.mean(bsorted[start:end]))

    # This function creates a curve which maps scores to scaling factors
    # the s=0.02 defines how close the curve fits your data points
    # large values give poor curves, small values may overfit your data
    # it's best to look at the resulting curve and tweak s= as appropriate
    svalue = 0.02
    s = UnivariateSpline(score, scale, s=svalue)

    # displays the fitted curve (in red) and the scaling values (in black)
    fig = plt.figure(figsize=(2, 2), dpi=300, facecolor='w', edgecolor='k')
    plt.plot(
        np.arange(min(score), max(score), 0.01),  # changed from scatter
        [s(x) for x in np.arange(min(score), max(score), 0.01)],
        color="red",
        linewidth=1)
    plt.scatter(score, scale, color="black", s=3)
    plt.xlim(1.1 * min(score), 1.1 * max(score))
    plt.ylim(0.9 * min(scale), 1.1 * max(scale))
    plt.ylabel('Scaling Factor', fontname='Helvetica', fontsize=6)
    plt.xlabel('SGA Score', fontname='Helvetica', fontsize=6)
    pylab.savefig(outdir + "/scaling_factor_curve.png",
                  format='png',
                  transparent=True,
                  bbox_inches='tight',
                  dpi=300)

    # if the value to be scaled lies outside the range of scores seen when
    # fitting (below the smallest or above the largest), we use the scaling
    # factor of the nearest observed score
    def s_bounded(x):
        if x < min(score):
            x = min(score)
        elif x > max(score):
            x = max(score)
        return s(x)

    # This function applies the shift and our scaling factor to a given value
    g = lambda x: (x + adjustment) * s_bounded(x + adjustment)

    # scale every datasetB value in place
    for i in b_ints:
        b_ints[i] = float(g(b_ints[i]))
    scaled_dataset_file = "../../Data/SGA_Scaling/SGA_NxN_scaled_to_cF3.txt"
    output_delimited_text(scaled_dataset_file, b_genes, b_genes, b_ints, True)

    # save scaling info to log (appended to the file written above)
    with open(outdir + '/scaler_log.txt', 'a') as f:
        f.write("Number of bins used: {}\n".format(bins))
        f.write("Lower threshold for bins: {}\n".format(lower_threshold))
        f.write("Upper threshold for bins: {}\n".format(upper_threshold))
        f.write("S value for fitting spline: {}\n".format(svalue))
        f.write("max_score={}\n".format(max(score)))
        f.write("min_score={}\n".format(min(score)))

    # save spline for scaling full SGA in R
    # (tab-separated table of the spline sampled every 0.01 score units)
    scores = np.arange(min(score), max(score), 0.01)
    scales = [float(s(x)) for x in np.arange(min(score), max(score), 0.01)]
    spline = pd.DataFrame(data=np.stack((scores, scales)).T,
                          columns=['score', 'scale'])
    spline.to_csv(outdir + '/spline.txt', sep='\t', index=False)

    # Plot scatter plot of shared interactions after scaling
    density_scatter_plot(ints, b_ints, outdir + '/scaled_scatter.png',
                         xlabel='S-score', ylabel='Scaled SGA score')

    # Make QQ Plots using the interactions before and after scaling
    avalues_after_scaling = []
    bvalues_after_scaling = []
    for i in ints:
        if i in b_ints:
            avalues_after_scaling.append(ints[i])
            bvalues_after_scaling.append(b_ints[i])

    # qqplot_2samples puts the "2nd Sample" on the x-axis
    # see documentation for statsmodels.graphics.gofplots
    # NOTE(review): axis assignment differs between statsmodels releases -
    # confirm it still holds for the installed version.
    fig, ax = plt.subplots(figsize=(2, 2))
    fig = qqplot_2samples(np.array(bvalues_after_scaling),
                          np.array(avalues_after_scaling),
                          line='r',
                          ax=ax)
    ax.set_xlabel('S-score Quantiles', fontname='Helvetica', fontsize=6)
    ax.set_ylabel('Scaled SGA score Quantiles',
                  fontname='Helvetica',
                  fontsize=6)
    pylab.savefig(outdir + "/qq_scaled.png",
                  transparent=True,
                  bbox_inches='tight',
                  dpi=300)

    # same plot for the unscaled (and unshifted) shared values
    fig, ax = plt.subplots(figsize=(2, 2))
    fig = qqplot_2samples(np.array(bvalues),
                          np.array(avalues),
                          line='r',
                          ax=ax)
    ax.set_xlabel('S-score Quantiles', fontname='Helvetica', fontsize=6)
    ax.set_ylabel('SGA score Quantiles', fontname='Helvetica', fontsize=6)
    pylab.savefig(outdir + "/qq_unscaled.png",
                  transparent=True,
                  bbox_inches='tight',
                  dpi=300)
if not os.path.exists("plots"): os.makedirs("plots") for label in top_features: features = top_features[label] plt.scatter(data[features[0]], data[features[1]], s=7) plt.xlabel(features[0].title()) plt.ylabel(features[1].title()) plt.title("Top Features (" + label + ")") plt.savefig("plots/scatter-plot-" + label.lower() + ".png") plt.clf() plot_feature1 = gofplots.ProbPlot(data[features[0]]) plot_feature2 = gofplots.ProbPlot(data[features[1]]) fig = gofplots.qqplot_2samples( plot_feature1, plot_feature2, line="r", xlabel=features[0], ylabel=features[1]) dots = fig.findobj(lambda x: hasattr( x, 'get_color') and x.get_color() == 'b') [d.set_ms(3) for d in dots] plt.title("Probability Plot (" + label + ")") plt.savefig("plots/pp-plot-" + label.lower() + ".png") plt.clf() def intuitive_partion(data, clip=True): if len(data) == 0: return data data = np.array(data, dtype=float)