def prob_dispersion(target, result, prob, nclasses=5, save_as=''):
    """Box-plot the probability of the predicted class, split by hit or miss.

    For every sample the probability assigned to its predicted class is put
    into one of ``2 * nclasses`` buckets: bucket ``2 * c`` when class ``c``
    was predicted correctly ("Good"), bucket ``2 * c + 1`` otherwise ("Bad").

    :param target: true class index of each sample.
    :param result: predicted class index of each sample.
    :param prob: per-sample sequence of probabilities, indexed by class.
    :param nclasses: number of classes (two boxes are drawn per class).
    :param save_as: output file; when empty the figure is shown instead.
    """
    buckets = [[] for _ in range(2 * nclasses)]
    for idx, predicted in enumerate(result):
        confidence = prob[idx][predicted]
        # Even slots hold correct predictions, odd slots hold wrong ones.
        offset = 0 if predicted == target[idx] else 1
        buckets[2 * predicted + offset].append(confidence)

    # Tick labels are 1-based: "1-Good", "1-Bad", "2-Good", ...
    tick_labels = []
    for cls in range(nclasses):
        tick_labels.append(str(cls + 1) + "-Good")
        tick_labels.append(str(cls + 1) + "-Bad")

    # Plotting the result
    fig = plt.figure()
    fig.suptitle('Probability distribution', fontsize=20)
    axes = fig.add_subplot(111)
    pylab.boxplot(buckets)
    pylab.xticks(range(1, 2 * nclasses + 1), tick_labels)
    axes.set_xlabel('Predicted', fontsize=16)
    axes.set_ylabel('Probabilities', fontsize=16)
    axes.tick_params(axis='both', which='major', labelsize=14)
    axes.tick_params(axis='both', which='minor', labelsize=8)

    # Save options
    if save_as == '':
        plt.show()
        return
    fig.savefig(save_as)
def chart(SW, a, b, label, folder, FILE):
    """Draw a LaTeX-column-sized box plot of the 'SWA', 'OA' and 'NS' series.

    The figure is written as a PDF to ``home + folder + FILE + '.pdf'`` (the
    home directory is hard-coded below) and then shown.

    :param SW: mapping with the keys 'SWA', 'OA' and 'NS'; each value is
        drawn as one box.
    :param a: lower y-axis limit.
    :param b: upper y-axis limit.
    :param label: y-axis label.
    :param folder: directory part appended to the home directory.
    :param FILE: output file name without extension.
    """
    pylab.ioff()

    fig_width_pt = 350                          # Get this from LaTeX using \showthe\columnwidth
    inches_per_pt = 1.0 / 72.27                 # Convert pt to inch
    golden_mean = ((5 ** 0.5) - 1.0) / 2.0      # Aesthetic ratio
    fig_width = fig_width_pt * inches_per_pt    # width in inches
    fig_height = fig_width * golden_mean        # height in inches
    fig_size = [fig_width, fig_height]

    params = {
        'backend': 'ps',
        'axes.labelsize': 10,
        # 'text.fontsize' was only a deprecated alias of 'font.size' and has
        # been removed from matplotlib; 'font.size' is valid on old and new
        # releases alike.
        'font.size': 10,
        'legend.fontsize': 10,
        'xtick.labelsize': 8,
        'ytick.labelsize': 8,
        'text.usetex': True,
        'figure.figsize': fig_size,
    }
    pylab.rcParams.update(params)

    home = '/home/nealbob'
    img_ext = '.pdf'

    pylab.figure()
    pylab.boxplot([SW['SWA'], SW['OA'], SW['NS']], whis=5)
    # Faint dotted reference line at y = 1.
    pylab.axhline(y=1.0, color='0.5', linewidth=0.5, alpha=0.75, linestyle=':')
    pylab.ylim(a, b)
    pylab.ylabel(label)
    # A real boolean is required: the string 'off' is truthy and current
    # matplotlib no longer converts it, so the tick labels would stay visible.
    pylab.tick_params(axis='x', which='both', labelbottom=False)
    # Hand-placed category names replace the hidden x tick labels.
    pylab.figtext(0.225, 0.06, 'SWA', fontsize=10)
    pylab.figtext(0.495, 0.06, 'OA', fontsize=10)
    pylab.figtext(0.76, 0.06, 'NS', fontsize=10)
    pylab.savefig(home + folder + FILE + img_ext)
    pylab.show()
def makeboxplot(filteredclusts, dblibrary, figname, pool=False):
    '''Box-plot the number of kept clusters per individual, grouped by lane or pool.

    filteredclusts -- dict mapping a cluster to a dict keyed by individual;
                      every individual present in a cluster counts once.
    dblibrary      -- passed to gdata_tools.get_table_as_dict to fetch the
                      sample table (an iterable of per-sample dicts).
    figname        -- file the figure is saved to.
    pool           -- when True, groups are (flowcell, lane, index, pool);
                      otherwise (flowcell, lane, index), the default.
    '''
    # Number of clusters each individual appears in.  The original code had
    # an if/else whose two branches were identical; a plain increment does
    # the same thing.
    indiv_cluster_count = defaultdict(int)
    for inddict in filteredclusts.values():
        for ind in inddict:
            indiv_cluster_count[ind] += 1

    # The former `db_ind_countd` computation was removed: its result was
    # never used and its `keys()[3]` lookup raised IndexError whenever fewer
    # than four individuals were present.
    t = gdata_tools.get_table_as_dict(dblibrary)

    # Collect the per-individual counts of every group found in the table.
    indiv_by_group = defaultdict(list)
    for d in t:
        if 'pool' not in d:
            continue
        indkey = (d.get('flowcell', None), d.get('lane', None),
                  d.get('index', None), d.get('sampleid', None))
        if indkey not in indiv_cluster_count:
            continue
        group = (d['flowcell'], d['lane'], d.get('index', None))
        if pool == True:  # noqa: E712 - comparison kept as in the original for non-bool arguments
            group = group + (d['pool'],)
        indiv_by_group[group].append(indiv_cluster_count[indkey])

    boxes = []
    labels = []
    for group, indcounts in indiv_by_group.items():
        boxes.append(indcounts)
        labels.append(group)

    boxplt = pylab.figure(1)
    pylab.boxplot(boxes)
    # Boxes sit at positions 1..len(labels).
    pylab.xticks(arange(1, (len(labels) + 1)), labels, fontsize='small')
    boxplt.savefig(figname)
def chart(idx, a, b, label, FILE):
    """Draw a narrow LaTeX-sized box plot of ``idx`` and save it as a PDF.

    The figure is written to the hard-coded thesis image folder as
    ``FILE + '.pdf'`` and then shown.

    :param idx: data handed directly to ``pylab.boxplot``.
    :param a: lower y-axis limit.
    :param b: upper y-axis limit.
    :param label: currently unused (the y label call is disabled).
    :param FILE: output file name without extension.
    """
    pylab.ioff()

    fig_width_pt = 350                          # Get this from LaTeX using \showthe\columnwidth
    inches_per_pt = 1.0 / 72.27                 # Convert pt to inch
    golden_mean = ((5 ** 0.5) - 1.0) / 2.0      # Aesthetic ratio
    fig_width = fig_width_pt * inches_per_pt    # width in inches
    fig_height = fig_width * golden_mean        # height in inches
    # Only 42% of the column width is used for this figure.
    fig_size = [fig_width * 0.42, fig_height]

    params = {
        'backend': 'ps',
        'axes.labelsize': 10,
        # 'text.fontsize' was only a deprecated alias of 'font.size' and has
        # been removed from matplotlib; 'font.size' is valid on old and new
        # releases alike.
        'font.size': 10,
        'legend.fontsize': 10,
        'xtick.labelsize': 8,
        'ytick.labelsize': 8,
        'text.usetex': True,
        'figure.figsize': fig_size,
    }
    pylab.rcParams.update(params)

    home = '/home/nealbob'
    folder = '/Dropbox/Thesis/IMG/chapter3/'
    img_ext = '.pdf'

    pylab.figure()
    pylab.boxplot(idx, whis=100)
    pylab.ylim(a, b)
    #pylab.ylabel(label)
    # A real boolean is required: the string 'off' is truthy and current
    # matplotlib no longer converts it, so the tick labels would stay visible.
    pylab.tick_params(axis='x', which='both', labelbottom=False)
    pylab.savefig(home + folder + FILE + img_ext)
    pylab.show()
def plot():
    """Box-plot run times bucketed by swarm size.

    Reads every (time file, size file) pair listed in the module-level
    ``lines_to_plot`` from ``parser_results/``.  Line *k* of the time file is
    paired with line *k* of the size file; the time goes into the first
    bucket whose upper bound is >= the size.  Sizes above the largest bound
    are dropped.  The figure is saved to the module-level
    ``output_filename``.
    """
    swarmsize_marks = [20, 50, 100, 200]
    times = dict((mark, []) for mark in swarmsize_marks)

    for time_filename, size_filename, _label, _style in lines_to_plot:
        # Context managers close both files even if parsing fails; the
        # original left them open.
        with open('parser_results/' + time_filename) as time_file, \
                open('parser_results/' + size_filename) as size_file:
            for line in time_file:
                elapsed = float(line.split()[0])
                size = int(next(size_file).split()[0])
                for mark in swarmsize_marks:
                    if size <= mark:
                        times[mark].append(elapsed)
                        break

    xs = [times[mark] for mark in swarmsize_marks]
    labels = ['<=%d' % mark for mark in swarmsize_marks]

    pylab.boxplot(xs)
    pylab.setp(pylab.gca(), 'xticklabels', labels)
    pylab.savefig(output_filename)
    print('Output saved to: %s' % (output_filename,))
def quartile_plot(
        fits, group_index_start, group_index_end, model_param_index,
        ylim=None, log=True, xlabel=None, ylabel=None, labels=None):
    """Show one box per group for a single fitted model parameter.

    ``fit_params(fits, group_index, model_param_index)`` is evaluated for
    every group index in ``[group_index_start, group_index_end)`` and each
    result becomes one box (with its mean marked).

    :param ylim: optional y-axis limits.
    :param log: use a logarithmic y axis only when this is exactly ``True``.
    :param xlabel: optional x-axis label.
    :param ylabel: optional y-axis label.
    :param labels: optional box labels, forwarded to ``plt.boxplot``.
    """
    model_param_values = []
    for group_index in xrange(group_index_start, group_index_end):
        model_param_values.append(
            fit_params(fits, group_index, model_param_index))

    # One inch of width per box.
    plt.figure(figsize=(len(model_param_values), 7))

    if log is True:
        plt.yscale('log')

    # Apply each optional axis setting only when a value was supplied.
    optional_settings = (
        (plt.ylim, ylim),
        (plt.xlabel, xlabel),
        (plt.ylabel, ylabel),
    )
    for apply_setting, value in optional_settings:
        if value is not None:
            apply_setting(value)

    plt.boxplot(model_param_values, labels=labels, showmeans=True)
    plt.grid()
    plt.show()
def genderBoxplots(self, women, men, labels, path):
    """Save two box plots of edition counts for women versus men.

    The first figure keeps the outlier markers, the second hides them
    (``sym=""``).  Both mark the group means with red triangles and are
    written below ``path``; the file names end in
    ``self.pre + "-" + self.post + ".png"``.

    :param women: object whose ``edition_count.values`` is the first sample.
    :param men: object whose ``edition_count.values`` is the second sample.
    :param labels: x tick labels, one per box.
    :param path: output directory.
    """
    data = [women.edition_count.values, men.edition_count.values]
    positions = range(1, len(data) + 1)

    # (file name stem, extra boxplot options) for each figure.
    variants = (
        ("/numeditions_gender_box_withOutlier", {}),
        ("/numeditions_gender_box", {"sym": ""}),
    )
    for stem, boxplot_options in variants:
        plt.figure()
        plt.boxplot(data, **boxplot_options)

        # mark the mean
        means = [np.mean(sample) for sample in data]
        print(means)
        plt.scatter(positions, means, color="red", marker=">", s=20)

        plt.ylabel("num editions")
        plt.xticks(positions, labels)
        plt.savefig(path + stem + self.pre + "-" + self.post + ".png",
                    bbox_inches="tight")
def plot_res_paper(df):
    """Show a box plot of accuracy per classifier.

    :param df: data frame with a ``classifier_name`` column (used for
        grouping) and an ``accuracy`` column (the plotted values)
    :return: None
    """
    names = []
    accuracies = []
    for classifier, rows in df.groupby(df.classifier_name):
        accuracies.append(rows['accuracy'].values)
        names.append(classifier)
        print(rows)

    pylab.boxplot(accuracies)
    pylab.xticks(range(1, 1 + len(accuracies)), names)

    axes = pylab.gca()
    axes.invert_xaxis()
    pylab.ylabel('Classification accuracy')
    pylab.xlabel('Fold (cross validation fold for test)')
    pylab.gca().yaxis.set_ticks(np.arange(0, 1, 0.1))
    pylab.ylim((0, 1))
    pylab.legend()
    pylab.show()
    return
def whiskers(i1, i2, lab1="", lab2=""):
    """Draw paired box plots of two columns for every array in ``data``.

    For each entry of the module-level ``data`` list, column ``i1`` is drawn
    slightly left of the integer position (blue) and column ``i2`` slightly
    right of it (green).  Tick labels are derived from the module-level
    ``fnames``.  ``lab1`` and ``lab2`` are accepted but not used.
    """
    width = 0.35
    centres = np.arange(len(data))
    shift = 1.03 * width / 2.

    l1 = pb.boxplot([d[:, i1] for d in data],
                    positions=centres - shift, widths=width)
    l2 = pb.boxplot([d[:, i2] for d in data],
                    positions=centres + shift, widths=width)

    tick_names = [fn.split('raw')[0].replace('_', ' ') for fn in fnames]
    pb.xticks(np.arange(len(data)), tick_names, rotation=45)
    pb.xlim(-1.2 * width, len(data) - 1 + 1.2 * width)

    def restyle(artists, colour, base_width):
        # Base line width for everything, then colour per artist kind;
        # medians are always black.
        for kind, lines in artists.items():
            pb.setp(lines, lw=base_width)
            if kind == "boxes":
                pb.setp(lines, color=colour, lw=1.4)
            elif kind in ('whiskers', 'fliers'):
                pb.setp(lines, color=colour)
            elif kind == 'medians':
                pb.setp(lines, color='k', lw=1.4)

    restyle(l1, 'b', 1)
    restyle(l2, 'g', 1.2)
def graphPageSizeComparison(benchType, backend, writeUnits):
    """Box-plot write latency against page size for one benchmark setup.

    Rows of the module-level ``table`` matching ``benchType``, ``backend``
    and ``writeUnits`` are selected, their latencies divided by 10 and
    grouped through ``condense`` before plotting.
    """
    clf()
    rows = filter(table, benchType=benchType, backend=backend,
                  writeUnits=writeUnits)
    page_sizes = project(rows, 'pageSize')
    raw_latencies = project(rows, 'latency')

    # Each sample represents 10 trials.
    latencies = [sample / 10 for sample in raw_latencies]
    page_sizes, latencies = condense(page_sizes, latencies)

    pylab.boxplot(latencies)

    ax = gca()
    tick_text = [str(size) for size in page_sizes]
    ax.get_xaxis().set_major_formatter(ticker.FixedFormatter(tick_text))
    ax.set_ylabel("Sequential 4K block write latency (s)")
    ax.set_xlabel("Page size (B)")

    y_axis = ax.get_yaxis()
    y_axis.grid(color='gray', linestyle='dashed')
    ax.get_yaxis().set_major_locator(ticker.MaxNLocator(15))

    if backend == 's3':
        caption = 'Backend: S3'
    else:
        caption = 'Backend: DynamoDB; Provisioning Units = %d' % writeUnits
    title(caption)
    pylab.show()
def finalgen(names):
    """Box-plot the efficiency (lift / drag) of each run's final generation.

    For every run name, ``results/<name>/gen049.dat`` is read one individual
    per line.  Efficiency is ``fitness[0] / (5.0 - fitness[1])``.  The mean
    and spread, computed by the module-level ``ave`` and ``std`` helpers, are
    printed per run.  Finally the reference efficiency derived from the
    module-level ``LIFT`` and ``DRAG`` is printed.

    :param names: string containing a Python expression that evaluates to
        the iterable of run names.
    """
    # NOTE(review): eval() executes arbitrary code.  It is kept because the
    # accepted input format is not visible here, but it must never be fed
    # untrusted input; consider ast.literal_eval if only literals occur.
    names = eval(names)

    totaleff = []
    for name in names:
        final = "results/" + name + "/" + "gen049.dat"

        # The file is now closed deterministically; the original leaked the
        # handle.  The unused `name = final.rstrip('.dat')` was dropped.
        with open(final, 'r') as resultsfile:
            # NOTE(review): each line is eval()'d as well - same caveat.
            population = [eval(line) for line in resultsfile]

        efflist = []
        for indiv in population:
            lift = indiv['fitness'][0]
            drag = 5.0 - indiv['fitness'][1]
            efflist.append(lift / drag)

        aveeff = ave(efflist)
        stdeff = std(efflist, aveeff)
        print("efficieny average %s +- %s" % (aveeff, stdeff))
        totaleff.append(efflist)

    pylab.boxplot(totaleff)
    pylab.show()

    bwblift, bwbdrag = LIFT, 5 - DRAG
    print("bwbefficiency %s" % (bwblift / bwbdrag,))
def graphDepthComparison(benchType):
    """Box-plot write latency against directory depth for one benchmark type.

    Only rows of the module-level ``table`` whose timestamp lies in the
    hard-coded range are used.  Latencies are divided by 5 and grouped
    through ``condense`` before plotting.
    """
    clf()
    rows = filter(table,
                  timestamp=range(1336921433, 1336922429 + 1),
                  benchType=benchType)
    writeUnits = 160

    depths = project(rows, 'depth')
    raw_latencies = project(rows, 'latency')
    print(len(depths))
    print(len(raw_latencies))

    # Each sample represents 5 trials.
    latencies = [sample / 5 for sample in raw_latencies]
    depths, latencies = condense(depths, latencies)

    pylab.boxplot(latencies)

    ax = gca()
    tick_text = [str(depth) for depth in depths]
    ax.get_xaxis().set_major_formatter(ticker.FixedFormatter(tick_text))
    ax.set_ylabel("Sequential 4K block write latency (s)")
    ax.set_xlabel("Depth (number of parent directories)")

    y_axis = ax.get_yaxis()
    y_axis.grid(color='gray', linestyle='dashed')
    ax.get_yaxis().set_major_locator(ticker.MaxNLocator(10))

    title('Backend: DynamoDB; Provisioning Units = %d' % writeUnits)
    pylab.ylim([0, 0.1])
    pylab.show()
def do_proc(resdir, timedir):
    """
    Run and/or plot the multi-core ("PROC") experiments.

    EXPS on IPC6_SEQ_ELEVATORS_12 & IPC6_TEMPO_OPENSTACKS_17

    steadyState=50

    1) popsize=48 & runmax=1 & maxseconds=0
    2) RESTART case: popsize=96 & runmax=0 & maxseconds=1799

    foreach nthreads: 1, 24, 48
    repeat 11 times

    Does nothing unless ``options.cores`` is set.  ``resdir`` and ``timedir``
    are substituted into the result/plan and timing file name patterns.
    """

    if not options.cores: return

    # Only the "PROC" configuration is active; the restart case is disabled.
    for field, popsize, runmax, maxseconds in [
        ("PROC", 48, 1, 0),
        #("RESTART_PROC", 96, 0, 1799)
        ]:
        for name, domain, instance in SAMPLES:
            local_logger = logging.getLogger("GECCO2011.PROC.%s" % name)

            # One entry per run that produced at least one timing row.
            plotdata = []

            for num in range(1, options.nruns+1):
                # One [t1, tp, t1 / tp] row per thread count of this run.
                subdata = []

                for nthreads in [1, 24, 48]:
                    field_name = "%s_%s_%d" % (field, "DYNAMIC" if options.dynamic else "STATIC", nthreads)
                    time_filename = PATTERN_TIME_FILENAME % {"TIMEDIR": timedir, "NAME": name, "FIELD": field_name, "NUM": num}
                    res_filename = PATTERN_RES_FILENAME % {"RESDIR": resdir, "NAME": name, "FIELD": field_name, "NUM": num}
                    plan_filename = PATTERN_PLAN_FILENAME % {"RESDIR": resdir, "NAME": name, "FIELD": field_name, "NUM": num}

                    cmd = PATTERN_CMD % {"DOMAIN": domain,
                                         "INSTANCE": instance,
                                         "LOOP": 1,
                                         "DYNAMIC": 1 if options.dynamic else 0,
                                         "THREADS": nthreads,
                                         "RUNMAX": runmax,
                                         "POPSIZE": popsize,
                                         "OFFSPRINGS": popsize*7,
                                         "MAXSECONDS": maxseconds,
                                         "GENSTEADY": 50,
                                         "TIME_FILENAME": time_filename,
                                         "RES_FILENAME": res_filename,
                                         "PLAN_FILENAME": plan_filename,
                                         }

                    local_logger.debug(cmd)

                    if options.execute:
                        os.system( cmd )

                    if options.plot:
                        try:
                            f = open(time_filename).readlines()
                            # Last token of the 2nd line, read as a float.
                            t1 = float(f[1].split()[-1])
                            # Last token of the 5th line is "<a>:<b>"; it is
                            # converted to a * 60 + b.
                            tp = f[4].split()[-1].split(':')
                            tp = float(int(tp[0]) * 60 + float(tp[1]))
                            subdata.append([t1, tp, t1 / tp])
                        except IOError:
                            # A timing file that cannot be read is skipped.
                            pass

                if options.plot:
                    if len(subdata):
                        plotdata.append(subdata)

            if options.plot:
                local_logger.info(plotdata)
                pylab.boxplot( plotdata )
def analyze(real, samples, skip=0, thr=0.9):
    """Box-plot how many random draws are needed to approach a reference ratio.

    A reference ratio of two means is computed per VM from the pickled
    ``real`` results.  For every VM in the pickled ``samples`` results, 1000
    trials are run: values are drawn at random (zeros excluded) until the
    running ratio reaches a precision above ``thr``, or more than 20 draws
    have been made.  The number of draws of each successful trial is
    collected per VM and box-plotted.

    :param real: path of the pickle holding the reference measurements; the
        loaded object must offer ``results`` and ``mapping``.
    :param samples: path of the pickle holding the sampled measurements.
    :param skip: number of leading values dropped from every measurement.
    :param thr: precision threshold (converted with ``float``).
    """
    # NOTE(review): pickle.load can execute arbitrary code - only load files
    # from a trusted source.  Files are now closed after loading; the
    # original leaked both handles.
    with open(real, 'rb') as real_file:
        real = pickle.load(real_file)
    with open(samples, 'rb') as samples_file:
        samples = pickle.load(samples_file)
    thr = float(thr)

    def flatten(measurements):
        shared, exclusive = [], []
        for es, ss in measurements:
            exclusive.extend(es[skip:])
            shared.extend(ss[skip:])
        return exclusive, shared

    def myfilter(l):
        # Zero readings are discarded before sampling.
        return [e for e in l if e != 0]

    # NOTE(review): flatten() returns (exclusive, shared) but both call sites
    # unpack it as (shared, exclusive), so the two names are swapped below.
    # This is applied consistently and therefore left untouched - confirm
    # which ratio is actually intended.
    true_values = OrderedDict()
    for vm, measurements in real.results.items():
        shared, exclusive = flatten(measurements)
        true_values[vm] = mean(shared) / mean(exclusive)
    print(true_values)

    nums = []
    for vm, measurements in sorted(samples.results.items()):
        print("calculating", vm)
        shared, exclusive = flatten(measurements)
        shared = myfilter(shared)
        exclusive = myfilter(exclusive)
        ns = []
        true = true_values[vm]
        for _ in range(1000):
            sh_samples, exc_samples = [], []
            n = 0
            while True:
                n += 1
                sh_samples.append(choice(shared))
                exc_samples.append(choice(exclusive))
                cur = mean(sh_samples) / mean(exc_samples)
                prec = 1 - abs(1 - cur / true)
                if prec > thr:
                    ns.append(n)
                    break
                if n > 20:
                    # Give up on this trial after 21 draws.
                    print(vm, "max precision:", prec)
                    break
        if not ns:
            print("no data points for", vm)
            continue
        nums.append(ns)

    ticks = real.mapping
    # NOTE(review): boxplot() installs its own ticks at positions 1..N, so
    # the labels set here are probably overridden; the number of labels may
    # also differ from len(nums) when a VM is skipped above.  Left as is
    # because the shape of `real.mapping` is not visible here.
    p.xticks(range(len(ticks)), ticks)
    p.boxplot(nums)
def main(): # exon, intron, unknown specific = [[], [], []] nonspecific = [[], [], []] foldspecific = [[], [], []] foldnonspecific = [[], [], []] for prefix in sys.argv[1:]: tempexonic, tempspecific, tempnonspecific, tempfoldspecific, tempfoldnonspecific, templength = getData( prefix + ".exonic.overlap.out.annotation.txt", ["exon"] ) exonic = tempexonic[0] exoniclength = templength[0] specific[0].append(tempspecific[0]) nonspecific[0].append(tempnonspecific[0]) foldspecific[0].append(tempfoldspecific[0]) foldnonspecific[0].append(tempfoldnonspecific[0]) tempData, tempspecific, tempnonspecific, tempfoldspecific, tempfoldnonspecific, templength = getData( prefix + ".novel.overlap.out.annotation.txt", ["intron", "unknown"] ) intronic = tempData[0] unknown = tempData[1] introniclength = templength[0] unknownlength = templength[1] specific[1].append(tempspecific[0]) specific[2].append(tempspecific[1]) nonspecific[1].append(tempnonspecific[0]) nonspecific[2].append(tempnonspecific[1]) foldspecific[1].append(tempfoldspecific[0]) foldspecific[2].append(tempfoldspecific[1]) foldnonspecific[1].append(tempfoldnonspecific[0]) foldnonspecific[2].append(tempfoldnonspecific[1]) plotData = [exonic, intronic, unknown] print prefix print "exonic: ", len(exonic) print "intronic: ", len(intronic) print "unknown: ", len(unknown) fig = pl.figure() pl.boxplot(plotData) pl.ylim([2, 15]) pl.ylabel("Log Expression Level") pl.xticks([1, 2, 3], ["Exonic", "Intronic", "Unknown"]) pl.title(prefix.replace("_fsorted", "").replace("_", " ")) fig.savefig(prefix + ".expression.png", dpi=fig.dpi) fig = pl.figure() pl.boxplot([exoniclength, introniclength, unknownlength]) pl.ylim([60, 2500]) pl.ylabel("Transcript Length") pl.xticks([1, 2, 3], ["Exonic", "Intronic", "Unknown"]) pl.title(prefix.replace("_fsorted", "").replace("_", " ")) fig.savefig(prefix + ".length.png", dpi=fig.dpi) # pl.show() abbr = [] for i in sys.argv[1:]: tokens = i.split("_") abbr.append(tokens[0][0].upper() + 
tokens[1][0:2].title()) plotSpec(specific, nonspecific, abbr, ["exonic", "intronic", "unknown"], "abs") plotSpec(foldspecific, foldnonspecific, abbr, ["exonic", "intronic", "unknown"], "fold")
def vizFeature(self):
    """Plot the selected sub-feature of every created thumbnail, tab by tab.

    When the box-plot check box is unset, one line (plus red point markers)
    is drawn per tab; otherwise one box per tab is drawn.  The feature and
    sub-feature are taken from the combo boxes of ``self.mainWindow``.
    """
    minPoints = []
    # Line colours, indexed by tab; only eight tabs are covered.
    col = ['b','g','r','c','m','y','k','w']
    boxpoints = []
    # NOTE(review): this value is overwritten inside the loop before any use.
    feat = self.mainWindow.subFeatCmb.currentText()
    bplot = self.mainWindow.boxPlotCheck.checkState()
    # Glyph names of the created thumbnails of the first tab (x tick labels
    # of the line plot).
    glyphNames = [thumb.scene().glyphtxt for thumb in self.thumbNails[0] if thumb.scene().created]
    #print glyphNames
    for ind in range(self.mainWindow.tabWidget.count()):
        for thumb in self.thumbNails[ind]:
            if thumb.scene().created:
                # Feature container of this glyph, then the key of the
                # selected sub-feature inside it.
                feat = getattr(thumb.scene().windowS,self.FeatSelect[self.mainWindow.featCmb.currentText()])
                subFeat = getattr(self,self.FeatSelect[self.mainWindow.featCmb.currentText()]+'Val')[self.mainWindow.subFeatCmb.currentText()]
                minPoints.append(feat[subFeat])
        if not bplot:
            pylab.plot(minPoints,col[ind],label=self.mainWindow.tabWidget.tabText(ind))
            pylab.plot(minPoints,'ro')
        else:
            boxpoints.append(minPoints)
        # Start a fresh list for the next tab.
        minPoints = []
    scriptNames = [self.mainWindow.tabWidget.tabText(ind) for ind in range(self.mainWindow.tabWidget.count())]
    # pylab.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=2, mode="expand", borderaxespad=0.)
    if not bplot:
        pylab.xticks(range(len(glyphNames)),glyphNames)
        pylab.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=2, mode="expand", borderaxespad=0.)
        pylab.show()
    else:
        pylab.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=2, mode="expand", borderaxespad=0.)
        # NOTE(review): ticks are set before boxplot() and start at 0, while
        # boxes are drawn at positions 1..N - verify the labels line up.
        pylab.xticks(range(len(scriptNames)),scriptNames)
        pylab.boxplot(boxpoints)
        pylab.show()
        # import rpy2.robjects as R
        #
        # result = R.r['t.test'](R.IntVector(boxpoints[0]),R.IntVector(boxpoints[1]))
        #
        # k = str(result)[str(result).find('p-value = '):]
        #
        # print k
    # Not used anywhere in the visible body.
    points1 = []
    points2 = []
    strokes = []
def plotChart(resultObj, patternName, stockPriceDataObj):
    """Save one chart per detected pattern.

    Each chart shows a box per price record (fields 1 to 4 of the record),
    the upper and lower pattern lines and a red vertical line at the
    identification position.  Images go to
    ``../demoImg/<patternName>/<code>``.

    :param resultObj: sequence of pattern results (``code``, ``upperLine``,
        ``downLine`` and ``identifyPos`` are read).
    :param patternName: sub-directory name for the images.
    :param stockPriceDataObj: mapping from stock code to its price records.
    """
    print(resultObj[0].identifyPos)
    for patternData in resultObj:
        records = stockPriceDataObj[patternData.code]
        boxes = [(rec[1], rec[2], rec[3], rec[4]) for rec in records]
        plt.boxplot(boxes)

        # Boxes are 1-based, hence the +1 on every x coordinate.
        for line in (patternData.upperLine, patternData.downLine):
            plt.plot([line.startX + 1, line.endX + 1],
                     [line.startPriceY, line.endPriceY])

        plt.axvline(x=patternData.identifyPos + 1, color='red')
        plt.savefig("../demoImg/%s/%s" % (patternName, patternData.code))
        plt.clf()
def UnivarDescStat(self,Data,FileOutPath):
    """Save a one-page univariate summary of ``Data``.

    The page has three panels: a text block with descriptive statistics and
    quartiles, a 15-bin normalised histogram overlaid with the normal
    density of the sample mean and standard deviation, and a horizontal
    box plot.  The figure is written to ``FileOutPath``.
    """
    Mean = np.mean(Data)
    Std = np.std(Data)

    # Analitic Descriptives text: (format, value) pairs rendered in order.
    moment_rows = [
        ("\nN : {0:8d}", len(Data)),
        ("\nMean : {0:8.6f}", Mean),
        ("\nMinimum : {0:8.6f}", np.min(Data)),
        ("\nMaximum : {0:8.6f}", np.max(Data)),
        ("\nVariance : {0:8.6f}", np.var(Data)),
        ("\nStd. deviation : {0:8.6f}", Std),
    ]
    quantile_rows = [
        ("\nMinimum : {0:8.6f}", np.percentile(Data, 0)),
        ("\n1st Quartile : {0:8.6f}", np.percentile(Data, 25)),
        ("\nMedian : {0:8.6f}", np.percentile(Data, 50)),
        ("\n3rd Quartile : {0:8.6f}", np.percentile(Data, 75)),
        ("\nMaximum : {0:8.6f}", np.percentile(Data, 100)),
    ]

    def render(rows):
        return "".join(template.format(value) for template, value in rows)

    txt = render(moment_rows) + "\n\n\n" + render(quantile_rows)

    # Grid to plot into.
    G = gridspec.GridSpec(2, 2, width_ratios=[2, 1])

    # Plot Analitics
    text_axes = P.subplot(G[:, 1])
    text_axes.set_title("Analitics")
    text_axes.axis('off')
    P.text(0.15, 0.25, txt, size=12)

    # Histogram and...
    hist_axes = P.subplot(G[0, 0])
    hist_axes.set_title("Histogram")
    bins = P.hist(Data, 15, normed=1)[1]

    # ... PDF Plots (Probability Distribution Function)
    density = mlab.normpdf(bins, Mean, Std)
    P.plot(bins, density, 'r--', linewidth=1)
    P.ylabel('Probability')

    # Plot boxplot
    box_axes = P.subplot(G[1, 0])
    box_axes.set_title("Boxplot")
    P.boxplot(Data, 0, 'rs', 0)

    # Store as SVG
    P.savefig(FileOutPath)
def draw_learning_curve(data_first=None, data_second=None, measure=None,
                        x_axis=None, delta=0.1, scaling=100, fname=None):
    """
    Draw two learning curves with box plots and fitted saturation curves.

    ``data_first`` and ``data_second`` each hold one list of numbers per
    entry of ``x_axis``.  For both series the list means are fitted with
    ``a * (1 - exp(-b * x))``; the boxes, the means and the fitted curve are
    drawn (red for the first series, green for the second).  The boxes of
    the two series are shifted by ``+delta`` and ``-delta`` so they do not
    overlap, and all x positions are multiplied by ``scaling``.  The figure
    is saved to ``fname`` when given, otherwise shown.
    """

    def learning_curve_function(x, a, b):
        return a * (1 - np.exp(-b * x))

    x_axis = np.array(x_axis)

    mean_originals = [np.mean(np.array(scores)) for scores in data_first]
    mean_originals_and_samples = [
        np.mean(np.array(scores)) for scores in data_second]

    # curve_fit returns (optimal parameters, covariance); only the
    # parameters are needed.
    params_first, _cov_first = curve_fit(
        learning_curve_function, x_axis, mean_originals)
    params_second, _cov_second = curve_fit(
        learning_curve_function, x_axis, mean_originals_and_samples)

    x_axis_fit = np.linspace(x_axis.min(), x_axis.max(), 100)
    fit_first = learning_curve_function(x_axis_fit, *params_first)
    fit_second = learning_curve_function(x_axis_fit, *params_second)

    fig, ax1 = plt.subplots(figsize=(10, 6))
    fig.canvas.set_window_title('Exponential Decay Learning Curves')
    # plt.subplots_adjust(left=0.04, right=0.35, top=0.9, bottom=0.25)
    ax1.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)
    ax1.set_title('Learning Curve Comparison for %s' % measure)
    ax1.set_xlabel('Dataset Percentage Used for Training')
    ax1.set_ylabel('%s Value' % measure)

    # (boxes, box positions, means, fitted curve, colour, legend label)
    series = (
        (data_first, (x_axis + delta) * scaling, mean_originals,
         fit_first, 'r', 'Original'),
        (data_second, (x_axis - delta) * scaling, mean_originals_and_samples,
         fit_second, 'g', 'Original+sampled'),
    )
    for boxes, positions, means, fitted, colour, legend_label in series:
        plt.boxplot(boxes, positions=positions, notch=False)
        plt.plot(positions, means, colour + 'o', label='')
        plt.plot((x_axis_fit) * scaling, fitted, colour + '-',
                 label=legend_label)

    plt.grid()
    plt.legend(loc='lower right')
    if fname is None:
        plt.show()
    else:
        plt.savefig(fname)
def make_boxplot(self, data, labels, filename, ylabel):
    """Save a box plot of ``data`` with the mean of every box marked in red.

    :param data: sequence of samples, one box each.
    :param labels: x tick labels, one per box.
    :param filename: output file.
    :param ylabel: y-axis label (also printed, followed by the means).
    """
    plt.figure()
    plt.boxplot(data)

    # mark the mean
    means = [np.mean(sample) for sample in data]
    print(ylabel)
    print(means)

    # Boxes sit at positions 1..len(data).
    positions = range(1, len(data) + 1)
    plt.scatter(positions, means, color="red", marker=">", s=20)
    plt.ylabel(ylabel)
    plt.xticks(positions, labels)
    plt.savefig(filename)
def graphBackendComparison(benchType, pageSize):
    """Box-plot S3 latency next to DynamoDB latency per provisioned units.

    Rows of the module-level ``table`` for the given benchmark type and page
    size are selected for both backends.  Latencies are divided by 10, the
    DynamoDB ones grouped by write units through ``condense``, and the S3
    sample is drawn as the first box.  Axis texts and y limits depend on
    ``benchType``.
    """
    clf()
    dynamo_rows = filter(table, benchType=benchType, backend='dynamodb',
                         pageSize=pageSize)
    s3_rows = filter(table, benchType=benchType, backend='s3',
                     pageSize=pageSize)

    write_units = project(dynamo_rows, 'writeUnits')
    dynamo_raw = project(dynamo_rows, 'latency')
    s3_raw = project(s3_rows, 'latency')

    # Each sample represents 10 trials.
    dynamo_latency = [sample / 10 for sample in dynamo_raw]
    s3_latency = [sample / 10 for sample in s3_raw]

    write_units, dynamo_latency = condense(write_units, dynamo_latency)

    # Merge the dynamodb and s3 datasets to they can be plotted in the same axes.
    pylab.boxplot([s3_latency] + dynamo_latency)

    # benchType -> (access order, direction, y limits)
    presets = {
        'seqwrite': ('Sequential', 'write', [0, 0.3]),
        'seqread': ('Sequential', 'read', [0, 0.075]),
        'randwrite': ('Random', 'write', [0, 0.3]),
        'randread': ('Random', 'read', [0, 0.075]),
    }
    order, direction, y_limits = presets[benchType]

    tick_text = ['S3'] + [str(units) for units in write_units]
    ax = gca()
    ax.get_xaxis().set_major_formatter(ticker.FixedFormatter(tick_text))
    ax.set_ylabel(order + " 4K block " + direction + " latency (s)")
    ax.set_xlabel("Provisioned read and write units")

    y_axis = ax.get_yaxis()
    y_axis.grid(color='gray', linestyle='dashed')
    ax.get_yaxis().set_major_locator(ticker.MaxNLocator(10))

    title('Page Size = %dK' % (pageSize / 1024))
    pylab.ylim(y_limits)

    dpi = 60
    figure_handle = gcf()
    figure_handle.dpi = dpi
    gcf().set_size_inches(400 / dpi, 300 / dpi)
def plotStats(self,save) : figure() # show boxplot, iff we have enough data if min(map(len, self.stat_avg_z)) > 3 : data = self.stat_avg_z boxplot(data,1) #else : figure() data2 = self.stat_avg_z_total plot(data2) show()
def generate( filenames ):
    """Save a PDF and a PNG box plot for every entry of ``filenames``.

    The result file of each entry is located through the module-level
    ``RESULT_FILE_FORMAT`` and the module-level sweep settings (``pwd``,
    ``p``, ``ps``, ``P``, ``d``, ``ds``, ``D``, ``r``, ``s``).  Its data is
    loaded with ``get_boxplot_data``.  The x label reports the number of
    iterations of the sweep.
    """
    def at_least_one(value):
        # Non-positive spans count as a single step.
        return value if value > 0 else 1

    for cur in filenames:
        filename = RESULT_FILE_FORMAT % (pwd, cur, p, ps, P, d, ds, D, r, s)
        pylab.boxplot( get_boxplot_data( filename ) )

        p_steps = at_least_one( P - p ) / ps
        d_steps = at_least_one( D - d ) / ds
        iters = p_steps * d_steps

        pylab.xlabel('%d iterations from %d,%d to %d,%d' % ( iters, p, d, P, D) )
        pylab.ylabel('%s - %s' % (cur, name))

        for extension in ('pdf', 'png'):
            pylab.savefig( filename + '.' + extension, format=extension )

        # Reset axes and figure before the next file.
        pylab.cla()
        pylab.clf()
def main(args):
    """Box-plot one column of a delimited text file grouped by another column.

    Reads ``args.input_tsv``, groups the values of column ``args.ydatacol``
    by the value of column ``args.xdatacol`` (both converted to float) and
    draws one box per distinct x value, in ascending order.  A dashed line
    from (1, 0) to (number of boxes, largest y value) is added.

    :param args: parsed options providing ``input_tsv``, ``xdatacol``,
        ``ydatacol``, ``delimiter`` ("t" means tab, "n" means newline,
        anything else is used literally), ``xlabel``, ``ylabel`` and ``s``.
        With ``s`` set the figure is saved next to the input as
        ``<input without extension>_boxplot.png``, otherwise it is shown.
    """
    ##======##
    ## init ##
    ##======##
    targetfile = args.input_tsv
    xdatacol = args.xdatacol
    ydatacol = args.ydatacol
    delim = {"t": "\t", "n": "\n"}.get(args.delimiter, args.delimiter)
    xlabel = args.xlabel
    ylabel = args.ylabel
    tosave = args.s
    if tosave:
        savefilename = os.path.splitext(targetfile)[0] + "_boxplot.png"

    ##===========##
    ## read data ##
    ##===========##
    # The file is read once and closed; the original opened it twice and
    # never closed either handle.
    with open(targetfile) as source:
        rows = [line.rstrip().split(delim) for line in source]

    data = {}
    for lineitems in rows:
        key, val = float(lineitems[xdatacol]), float(lineitems[ydatacol])
        data.setdefault(key, []).append(val)
    xitems = sorted(data)

    ##======##
    ## show ##
    ##======##
    pylab.figure()
    pylab.xlabel(unicode(xlabel, sys.stdin.encoding))
    pylab.ylabel(unicode(ylabel, sys.stdin.encoding))
    for key in xitems:
        print("%s %s" % (key, data[key]))
    pylab.boxplot([data[key] for key in xitems])
    # Boxes are drawn at positions 1..N and boxplot() installs its own
    # ticks, so the labels must be set afterwards and start at 1.  The
    # original set them first, at positions 0..N-1.
    pylab.xticks(range(1, len(xitems) + 1), xitems)
    maxy = max(max(data[key]) for key in xitems)
    pylab.plot([1.0, len(xitems)], [0, maxy], 'k--')
    if not tosave:
        pylab.show()
    else:
        pylab.savefig(savefilename)
    print(data)
def _plot_nominal(self, data, result_dir, x_key, y_key):
    """ Creates a boxplot of the y_keys for the given nominal parameter x_key.

    A method that allows to create a plot that visualizes the effect
    of differing one nominal variable onto a second one (e.g. the effect of
    differing the classifier onto the accuracy).

    A *y_key* that starts with "#" denotes a weighted cost function of the
    form ``#weight1#key1#weight2#key2``; the plotted value is then
    ``weight1 * data[key1][i] + weight2 * data[key2][i]``.

    **Expected arguments**

      :data: A dictionary, that contains a mapping from an attribute
             (e.g. accuracy) to a list of values taken by an attribute.
             An entry is the entirety of all i-th values over all dict-values
      :result_dir: The directory in which the plots will be saved.
      :x_key: The key of the dictionary whose values should be used as
              values for the x-axis (the independent variables)
      :y_key: The key of the dictionary whose values should be used as
              values for the y-axis, i.e. the dependent variable
    """
    # Strings must be compared by value: `is not "#"` tested object identity
    # and only worked by accident of string interning.
    is_weighted = y_key[0] == "#"
    if is_weighted:
        # A weighted cost function; parsed once instead of once per value.
        weight1, y_key1, weight2, y_key2 = y_key[1:].split("#")
        weight1, weight2 = float(weight1), float(weight2)

    # Create the plot for this specific dependent variable
    values = defaultdict(list)
    for i, parameter_value in enumerate(data[x_key]):
        if is_weighted:
            performance_value = weight1 * float(data[y_key1][i]) \
                                + weight2 * float(data[y_key2][i])
        else:
            performance_value = float(data[y_key][i])
        values[parameter_value].append(performance_value)

    # One box per parameter value, ordered by the parameter value.
    values = sorted(values.items())

    pylab.subplots_adjust(bottom=0.3)  # leave room for the rotated labels
    pylab.boxplot([performance for _, performance in values])
    axes = pylab.gca()
    axes.set_xticklabels([parameter for parameter, _ in values])
    pylab.setp(axes.get_xticklabels(), rotation=-90)
    pylab.setp(axes.get_xticklabels(), size='x-small')
    axes.set_xlabel(x_key.replace("_", " "))
    if is_weighted:
        axes.set_ylabel("%s*%s+%s*%s" % tuple(y_key[1:].split("#")))
    else:
        axes.set_ylabel(y_key.replace("_", " "))

    pylab.savefig("%s%s%s_%s.pdf" % (result_dir, os.sep, y_key, x_key))

    pylab.gca().clear()
    pylab.close("all")
def boxplot_data(results_list, title):
    """Save a box plot with one box per column of ``results_list``.

    :param results_list: sequence of equally long result rows; the i-th
        values of all rows form the i-th box.
    :param title: plot title; ``'_boxplot'`` is appended and the result is
        also used as file name inside ``./graphs`` (extension taken from the
        module-level ``FILETYPE``).
    """
    pylab.clf()
    pylab.figure(1)

    # Transpose rows into columns, using the first row's length.
    column_count = len(results_list[0])
    result_cols = [[row[col] for row in results_list]
                   for col in range(column_count)]

    pylab.boxplot(result_cols)
    pylab.figure(1).autofmt_xdate()

    title = title + '_boxplot'
    pylab.title(title)

    if not os.path.exists('./graphs'):
        os.makedirs('./graphs')
    pylab.savefig('graphs/' + title + FILETYPE)
def boxPlot( rankDict ):
    """Draw one box per entry of ``rankDict``.

    :param rankDict: mapping from a class name (used as tick label) to the
        sample of values drawn as that class's box.
    """
    import numpy as np

    pylab.rcParams.update({'lines.linewidth' : 2.0})

    counts = list(rankDict.values())
    pylab.boxplot(counts)

    # Boxes sit at positions 1..len(rankDict).
    tick_positions = np.arange(1, len(rankDict) + 1)
    pylab.xticks(tick_positions, rankDict.keys(), rotation=60)
    #pylab.ylim(0.0, 1.0)
    pylab.title('Rank Distributions by Orthology Group')
    pylab.xlabel('Orthology Classes')
    pylab.ylabel('Relative Ranks')
def draw_ind_by_clust_plots(rc_sort,cd_sort,inds,rc_orig,cd_cut=None,win=1000,rc_low=4,rc_hi=10,fignum=1,figsize=(8,10),filename=None):
    """Draw four stacked box plots of individuals-per-cluster by segment.

    ``rc_sort`` and ``cd_sort`` are split into segments of ``win`` entries
    with ``lol_by_segment``.  For each cluster (a dict of per-individual
    values) the number of values >= a threshold is counted; every segment
    yields one box.  Panels, top to bottom:

    1. threshold ``rc_low``, all clusters
    2. threshold ``rc_hi``, all clusters
    3. threshold ``rc_low``, only clusters whose ``cd_sort`` value is
       <= ``cd_cut``
    4. threshold ``rc_hi``, same filter as panel 3

    :param cd_cut: filter bound for panels 3 and 4 (0.1 when ``None``).
    :param filename: when given, the figure is saved there; an ``IOError``
        while saving is reported on stderr and otherwise ignored.

    ``inds`` and ``rc_orig`` are accepted but not used by this function.
    """
    if cd_cut is None:
        cd_cut = 0.1

    pylab.figure(fignum, figsize)

    lol = lol_by_segment(rc_sort, win)
    cdlol = lol_by_segment(cd_sort, win)

    ncat = len(lol)
    # Roughly 20 ticks per panel.  The step must never be 0: with fewer than
    # 20 segments the original `ncat/20` made numpy.arange fail.
    step = max(1, ncat // 20)
    tick_positions = numpy.arange(0, ncat, step)
    tick_labels = (tick_positions * win) / 1000

    def count_at_least(cluster, threshold):
        # Number of individuals in the cluster with a value >= threshold.
        return len([v for v in cluster.values() if v >= threshold])

    sys.stderr.write('draw boxplots\n')

    # (threshold, restrict to clusters with cd <= cd_cut) per panel.
    panels = ((rc_low, False), (rc_hi, False), (rc_low, True), (rc_hi, True))
    for panel_number, (threshold, filtered) in enumerate(panels, 1):
        pylab.subplot(4, 1, panel_number)
        if filtered:
            boxes = [[count_at_least(d, threshold)
                      for this_cd, d in zip(cdli, li) if this_cd <= cd_cut]
                     for cdli, li in zip(cdlol, lol)]
        else:
            boxes = [[count_at_least(d, threshold) for d in li] for li in lol]
        pylab.boxplot(boxes)
        pylab.xticks(tick_positions, tick_labels, rotation=90)

    if filename is not None:
        sys.stderr.write('store boxplots\n')
        try:
            pylab.savefig(filename)
        except IOError:
            sys.stderr.write('unable to write %s, output not stored\n' % filename)
def fwhm_whisker_plot(stampImgList=None,bkgList=None,sigma=1.1/scale):
    """Draw box plots of the whisker and FWHM measurements of a stamp list.

    The measurements come from ``get_fwhm_whisker_list``; each column of the
    two returned arrays becomes one box.  A green reference line is drawn
    at 0.2 in the whisker figure and at 0.9 in the FWHM figure.

    :return: the string ``'-----done !----'``.
    """
    whk, fwhm = get_fwhm_whisker_list(stampImgList, bkgList, sigma=sigma)
    whisker_columns = list(whk.T)
    fwhm_columns = list(fwhm.T)

    # Whisker figure.
    pl.figure(figsize=(7, 5))
    pl.boxplot(whisker_columns)
    pl.hlines(0.2, 0, 3, linestyle='solid', color='g')
    pl.ylim(0., .4)
    whisker_names = ['whisker_Wmoments', 'whisker_Amoments']
    pl.xticks(np.arange(1, 3), whisker_names)

    # FWHM figure.
    pl.figure(figsize=(12, 5))
    pl.boxplot(fwhm_columns)
    pl.ylim(0.4, 1.5)
    pl.hlines(0.9, 0, 6, linestyle='solid', color='g')
    fwhm_names = ['fwhm_weighted', 'fwhm_Amoments', 'fwhm_moffat',
                  'fwhm_gauss', 'fwhm_sech2']
    pl.xticks(np.arange(1, 6), fwhm_names)

    return '-----done !----'
def boxPlot(self):
    """
    Plots a box-plot of the contig lengths.

    The length of every sequence stored in ``self.contigsInfo`` is
    collected and drawn as a single box.

    Returns
    ------
    Box plot of contig sizes, saved in the file contig_boxplot.png
    """
    seqLengths = [len(sequence) for sequence in self.contigsInfo.values()]
    pylab.boxplot(seqLengths)
    pylab.savefig('contig_boxplot.png')
def estimate_possibility(n, mu, k, g, cs, ks, num_holdouts, percentages,
                         fuzzifier, verbose=False):
    """Evaluate membership and possibility estimates over repeated hold-outs.

    The ``n`` axiom indices and their labels ``mu`` are grouped into
    consecutive pairs (``2i`` with ``2i + 1``) before being handed to
    ``split_indices``, so a pair always lands in the same split.  For each
    hold-out a model is chosen with ``model_selection_holdout`` and scored on
    the test pairs:

    * membership error: squared difference between the estimated and the
      given label of every test value;
    * possibility error: squared difference between the actual and the
      estimated "possibility", computed here as first-minus-second value of
      each pair.

    Per successful hold-out a details CSV, a global-metrics CSV, a box plot
    and a histogram of the possibility errors are written under ``data/``.
    After the loop the metrics averaged over hold-outs are written to one
    more CSV (and printed when ``verbose``).

    :param n: number of axioms; must equal ``len(mu)``.
    :param mu: labels, one per axiom index.
    :param k: unused by this function.
    :param g: passed to model selection as ``sample_generator``.
    :param cs: forwarded to ``model_selection_holdout``.
    :param ks: forwarded to ``model_selection_holdout``.
    :param num_holdouts: number of hold-out repetitions.
    :param percentages: forwarded to ``split_indices``.
    :param fuzzifier: forwarded to model selection; ``fuzzifier.name`` is
        part of every output file name.
    :param verbose: print progress and the averaged metrics.
    :returns: ``None``.
    """
    axiom_indices = range(n)
    assert (len(axiom_indices) == len(mu) == n)

    pair_starts = range(0, n, 2)
    paired_axioms = [axiom_indices[s:s + 2] for s in pair_starts]
    paired_labels = [mu[s:s + 2] for s in pair_starts]

    # One entry per successful hold-out.
    metrics_membership_rmse = []
    metrics_membership_median = []
    metrics_membership_stdev = []
    metrics_possibility_rmse = []
    metrics_possibility_median = []
    metrics_possibility_stdev = []

    for h in range(num_holdouts):
        (paired_values_train, paired_values_validate, paired_values_test,
         paired_mu_train, paired_mu_validate,
         paired_mu_test) = split_indices(paired_axioms, paired_labels,
                                         percentages)

        if verbose:
            print('holdout {} of {}'.format(h, num_holdouts))

        # ``adjustment`` is not a parameter; it is read from the enclosing
        # (module) scope.
        best_c, _, result = model_selection_holdout(
            paired_values_train, paired_mu_train,
            paired_values_validate, paired_mu_validate,
            cs, ks,
            sample_generator=g, log=False, adjustment=adjustment,
            fuzzifier=fuzzifier, verbose=verbose)

        if best_c is None:
            if verbose:
                print('in holdout {} optimization always failed!'.format(h))
            continue

        if verbose:
            print('in holdout {} best C is {}'.format(h, best_c))

        estimated_membership = result[0]

        # Test values and labels are still grouped in pairs: flatten them.
        values_test = flatten(paired_values_test)
        mu_test = flatten(paired_mu_test)

        # --- membership error -------------------------------------------
        membership_square_err = [
            (estimated_membership(value) - label) ** 2
            for value, label in zip(values_test, mu_test)
        ]
        membership_rmse = math.sqrt(
            sum(membership_square_err) / len(values_test))
        membership_median = np.median(membership_square_err)
        membership_stdev = np.std(membership_square_err)
        metrics_membership_rmse.append(membership_rmse)
        metrics_membership_median.append(membership_median)
        metrics_membership_stdev.append(membership_stdev)

        # --- possibility error ------------------------------------------
        estimated_mu = [estimated_membership(value) for value in values_test]

        # Even positions hold the first element of each pair, odd positions
        # the second one.
        mu_first, mu_second = mu_test[::2], mu_test[1::2]
        est_first, est_second = estimated_mu[::2], estimated_mu[1::2]

        actual_possibility = [a - b for a, b in zip(mu_first, mu_second)]
        estimated_possibility = [a - b for a, b in zip(est_first, est_second)]

        possibility_square_err = [
            (actual - estimated) ** 2
            for actual, estimated in zip(actual_possibility,
                                         estimated_possibility)
        ]
        possibility_rmse = math.sqrt(
            sum(possibility_square_err) / len(possibility_square_err))
        possibility_median = np.median(possibility_square_err)
        possibility_stdev = np.std(possibility_square_err)
        metrics_possibility_rmse.append(possibility_rmse)
        metrics_possibility_median.append(possibility_median)
        metrics_possibility_stdev.append(possibility_stdev)

        # --- per-pair detail rows, smallest squared error first -----------
        indices = ['-'.join(map(str, pair)) for pair in paired_values_test]
        results = []
        for label, phi, notphi, p, ephi, enotphi, ep in zip(
                indices, mu_first, mu_second, actual_possibility,
                est_first, est_second, estimated_possibility):
            results.append((label,
                            phi, notphi, max(phi, notphi),
                            ephi, enotphi, max(ephi, enotphi),
                            p, ep, (p - ep) ** 2))
        results.sort(key=lambda row: row[-1])

        details_path = 'data/axioms-results-holdout-{}-{}-details.csv'.format(
            fuzzifier.name, h)
        with open(details_path, 'w') as output_file:
            csv.writer(output_file).writerows(results)

        global_path = 'data/axioms-results-holdout-{}-{}-global.csv'.format(
            fuzzifier.name, h)
        with open(global_path, 'w') as output_file:
            csv.writer(output_file).writerows([
                ('membership RMSE', membership_rmse),
                ('membership median', membership_median),
                ('membership STDEV', membership_stdev),
                ('possibility RMSE', possibility_rmse),
                ('possibility median', possibility_median),
                ('possibility STDEV', possibility_stdev),
            ])

        # --- plots of the squared possibility errors ----------------------
        errors = [row[-1] for row in results]

        plt.boxplot(errors)
        plt.savefig('data/axioms-results-holdout-{}-{}-boxplot.png'.format(
            fuzzifier.name, h))
        plt.clf()

        plt.hist(errors, bins=50)
        plt.savefig('data/axioms-results-holdout-{}-{}-histogram.png'.format(
            fuzzifier.name, h))
        plt.clf()

        gc.collect()

    if verbose:
        print('Membership average values:')
        print('RMSE: {}'.format(np.average(metrics_membership_rmse)))
        print('Median: {}'.format(np.average(metrics_membership_median)))
        print('STDEV: {}'.format(np.average(metrics_membership_stdev)))

        print('Possibility average values:')
        print('RMSE: {}'.format(np.average(metrics_possibility_rmse)))
        print('Median: {}'.format(np.average(metrics_possibility_median)))
        print('STDEV: {}'.format(np.average(metrics_possibility_stdev)))

    average_path = 'data/axioms-results-holdout-{}-average-metrics.csv'.format(
        fuzzifier.name)
    with open(average_path, 'w') as output_file:
        csv.writer(output_file).writerows([
            ('membership average RMSE',
             np.average(metrics_membership_rmse)),
            ('membership average median',
             np.average(metrics_membership_median)),
            ('membership average STDEV',
             np.average(metrics_membership_stdev)),
            ('possibility average RMSE',
             np.average(metrics_possibility_rmse)),
            ('possibility average median',
             np.average(metrics_possibility_median)),
            ('possibility average STDEV',
             np.average(metrics_possibility_stdev)),
        ])
B = [p[1], q[1]] C = [p[2], q[2]] # D=[p[3],q[3]] # E=[p[4],q[4]] # F=[p[5],q[5]] # G=[p[6],q[6]] # H=[p[7],q[7]] # I=[p[8],q[8]] fig = figure() ax = axes() hold(True) # first boxplot pair bp = boxplot(A, positions=[1, 2], widths=0.6, whis=100000000) setBoxColors(bp) for box in bp['boxes']: # change outline color box.set(linewidth=3) for cap in bp['whiskers']: cap.set(linewidth=4) for median in bp['medians']: median.set(linewidth=4) # second boxplot pair bp = boxplot(B, positions=[4, 5], widths=0.6, whis=100000000) setBoxColors(bp) for box in bp['boxes']:
def boxplot(data, **kwargs):
    """Draw ``pylab.boxplot(data, **kwargs)`` with an extra marker on some series.

    Before the box plot is drawn, every series whose low percentile equals
    ``-2`` gets a ``'_'`` marker at its maximum, placed at the series'
    x position.

    :param data: sequence of series, one per box.
    :param kwargs: forwarded unchanged to ``pylab.boxplot``; must contain
        ``'positions'`` (one x position per series).
    """
    x_positions = kwargs['positions']
    for series, x_pos in zip(data, x_positions):
        # NOTE(review): numpy.percentile takes q in [0, 100], so 0.25 is the
        # 0.25th percentile, not the first quartile -- confirm the intent.
        if numpy.percentile(series, 0.25) != -2:
            continue
        pylab.plot([x_pos], [max(series)], '_')

    pylab.boxplot(data, **kwargs)
def box_plot(self, df, x_label=None, fontsize=25, figsize=(15, 10), markersize=12, colors=None, custom_legend=None, legend_loc='best', legend_font_size='10', legend_marker_size=0.85, box_line_thickness=1.75, draw_points=False):
    """
    Plot every column of a dataframe as a horizontal box-and-whisker plot.

    One box is drawn per column of ``df`` (labelled with the column name on
    the y axis), a white dot marks each column's median, and the figure is
    shown with ``plt.show()``.

    :param df: dataframe whose columns are plotted.
    :param x_label: optional label of the value (x) axis.
    :param fontsize: size of tick labels and of the x label.
    :param figsize: figure size passed to ``plt.subplots``.
    :param markersize: size of the data points and of the median dots.
    :param colors: optional sequence of box colours, paired with the boxes
        in order; without it every box is black.
    :param custom_legend: optional pair; element 1 is passed to
        ``ax.legend`` as handles and element 0 as labels.
    :param legend_loc: legend location.
    :param legend_font_size: font size of the legend texts.
    :param legend_marker_size: used as both handle length and handle height.
    :param box_line_thickness: line width of boxes, caps, whiskers, medians.
    :param draw_points: when ``True`` the raw values are drawn on top of
        each box.
    """
    row_names = [str(name) for name in df.columns]

    # Figure with an opaque white background.
    fig, ax = plt.subplots(figsize=figsize)
    fig.patch.set_facecolor('white')

    # Dashed grid lines along the value axis only.
    ax.yaxis.grid(False)
    ax.xaxis.grid(True, linestyle='--', which='both', color='black',
                  alpha=0.5, zorder=1)

    # Keep only the left frame line, drawn in gray.
    left_spine = ax.spines['left']
    left_spine.set_linewidth(1.8)
    left_spine.set_color('gray')
    left_spine.set_alpha(1.0)
    for side in ('right', 'top', 'bottom'):
        ax.spines[side].set_visible(False)

    # Horizontal boxes; the median line is hidden because a white dot is
    # drawn at each median further down.
    solid_black = dict(color='k', linestyle='-', linewidth=box_line_thickness)
    bp = plt.boxplot(
        df.values,
        notch=0,
        sym='+',
        vert=False,
        whis=5,
        patch_artist=True,
        capprops=dict(solid_black),
        boxprops=dict(linestyle='-', linewidth=box_line_thickness,
                      color='black'),
        medianprops=dict(linestyle='none', color='k',
                         linewidth=box_line_thickness),
        whiskerprops=dict(solid_black))

    if colors:
        for box, box_color in zip(bp['boxes'], colors):
            box.set_color(box_color)
        # Only as many medians as there are colours are touched.
        for median_line, _ in zip(bp['medians'], colors):
            median_line.set_color('black')
    else:
        for artist in bp['boxes'] + bp['medians']:
            artist.set_color('black')

    # Optional overlay of the raw values, one row of dots per column.
    if draw_points == True:
        for row, name in enumerate(df.columns, start=1):
            samples = df[name]
            heights = row * np.ones(len(samples))
            plt.plot(samples.values, heights, '.', color='k',
                     markersize=markersize)

    # Tick labels and sizes.
    plt.setp(ax, yticklabels=row_names)
    plt.setp(ax.get_yticklabels(), fontsize=fontsize)
    plt.setp(ax.get_xticklabels(), fontsize=fontsize)

    # Extend the right limit by half a tick interval so plot elements are
    # not cut off, then pin the ticks back to their original positions.
    x_ticks, x_tick_labels = plt.xticks()
    range_factor = 2
    tick_spacing = x_ticks[-1] - x_ticks[-2]
    plt.xlim(x_ticks[0], x_ticks[-1] + tick_spacing / float(range_factor))
    plt.xticks(x_ticks)

    plt.xlabel(x_label, size=fontsize)

    ax.xaxis.set_ticks_position('none')
    ax.yaxis.set_ticks_position('left')

    if custom_legend:
        ax.legend(custom_legend[1], custom_legend[0],
                  handlelength=legend_marker_size,
                  handleheight=legend_marker_size,
                  frameon=False, loc=legend_loc)
        plt.setp(plt.gca().get_legend().get_texts(),
                 fontsize=legend_font_size)

    # White dot at the median of every column, above the boxes.
    for row, name in enumerate(df.columns, start=1):
        plt.plot(np.median(df[name].values), row * np.ones(1), 'o',
                 color='white', markersize=markersize,
                 markeredgecolor='white', zorder=3)

    plt.show()
h = np.random.uniform(0.020 * 0.95, 0.020 * 1.05) l = np.random.uniform(0.95, 1.05) b = np.random.uniform(0.95, 1.05) T = np.random.uniform(70 * 0.95, 70 * 1.05) - np.random.uniform( 200 * .095, 200 * 1.05) to = np.random.uniform(0.15 * 0.95, 0.15 * 1.05) ti = np.random.uniform(0.1 * 0.95, 0.1 * 1.05) P = np.random.uniform(2000 * 0.95, 2000 * 1.05) omegaVal = omega(G, h, Eo, to, Ei, ti) x = np.linspace(0, 0.5, 50) tensoes = [] for xi in x: tensoes.append( tensaoCisalhamento(P, omegaVal, b, xi, Eo, to, Ei, ti, alfao, alfai, T, l)) vetor.append(tensoes) if min(tensoes) < minimo: minimo = min(tensoes) if max(tensoes) > maximo: maximo = max(tensoes) pl.plot(x, tensoes) pl.grid() pl.title("Tensão cisalhante por distância") pl.xlabel("Distância (in)") pl.ylabel("Tensão cisalhante (lbf/in²)") pl.show() pl.figure() pl.boxplot(vetor) print("valor mínimo:" + str(minimo) + "\nvalor máximo: " + str(maximo))
# B = [q, [7, 2, 5]] # C = [[3, 2, 5, 7], [6, 7, 3]] A = [p[0], q[0]] B = [p[1], q[1]] C = [p[2], q[2]] D = [p[3], q[3]] E = [p[4], q[4]] fig = figure() ax = axes() hold(True) # first boxplot pair bp = boxplot(A, positions=[1, 2], widths=0.6, whis=100000000) setBoxColors(bp) for box in bp['boxes']: # change outline color box.set(linewidth=2) for cap in bp['whiskers']: cap.set(linewidth=2) for median in bp['medians']: median.set(linewidth=2) # second boxplot pair bp = boxplot(B, positions=[4, 5], widths=0.6, whis=100000000) setBoxColors(bp) for box in bp['boxes']:
# Load the housing data and give the 14 columns their names.
df = pd.read_excel('/Users/mac/Desktop/Machine Learning/module4/housing.xlsx', header=0)
df.columns = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
    'PTRATIO', 'B', 'LSTAT', 'MEDV'
]

# EDA: print the summary statistics and store the same printed text on disk.
summary = df.describe()
print(summary)
# The context manager closes the file even if writing fails.
with open('/Users/mac/Desktop/Machine Learning/module4/summary.csv', 'w') as des:
    print(summary, file=des)

# Box plot of the first 13 columns (all attributes except MEDV).
from pylab import boxplot
array = df.iloc[:, 0:13].values
boxplot(array)
plt.xlabel("Attribute Index")
plt.ylabel("Quartile Ranges")
plt.savefig('/Users/mac/Desktop/Machine Learning/module4/box-plot.jpg')
plt.show()

# Pairwise scatter plots, after dropping rows with missing values.
df = df.dropna(axis=0)
import seaborn as sns
# NOTE(review): newer seaborn releases call this argument `height` --
# confirm against the installed version.
sns.pairplot(df, size=2.5)
plt.tight_layout()
plt.savefig('/Users/mac/Desktop/Machine Learning/module4/pairplot.jpg')
plt.show()

# Correlation matrix of all columns (input for the heatmap).
from pandas import DataFrame
corMat = DataFrame(df.corr())
setp(bp['whiskers'][0], color='blue') setp(bp['whiskers'][1], color='blue') setp(bp['fliers'][0], color='blue') setp(bp['fliers'][1], color='blue') setp(bp['medians'][0], color='blue') setp(bp['boxes'][1], color='red') setp(bp['caps'][2], color='red') setp(bp['caps'][3], color='red') setp(bp['whiskers'][2], color='red') setp(bp['whiskers'][3], color='red') setp(bp['fliers'][2], color='red') setp(bp['fliers'][3], color='red') setp(bp['medians'][1], color='red') # Some fake data to plot A = [auc1, auc2] fig = figure() ax = axes() hold(True) bp = boxplot(A, positions=[1, 2], widths=0.6) #setBoxColors(bp) # set axes limits and labels ylim(0.8, 1.05) ax.set_xticklabels(['Deep Learning', 'SVM']) title('Comparision between Deep Learning and SVM') ylabel('AUC Score') savefig('boxcompare.png') show()
import numpy as np

# Two histograms of the same normal sample: a normalised one with 30 bins
# and a cumulative count with 10 bins.
data = np.random.randn(1000)
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(6, 3))
# NOTE(review): `normed` was superseded by `density` in newer matplotlib
# releases -- confirm against the installed version.
ax1.hist(data, color='b', normed=True, bins=30)
ax2.hist(data, color='r', cumulative=True, normed=False, bins=10)

# Two line plots stacked vertically (2 rows, 1 column).
x = [1, 2, 3, 2, 1]
y = [3, 2, 1, 3, 1]
for panel, series in enumerate((x, y), start=1):
    pl.subplot(2, 1, panel)
    pl.plot(series)

# The same two line plots side by side (1 row, 2 columns).
x = [1, 2, 3, 2, 1]
y = [3, 2, 1, 3, 1]
for panel, series in enumerate((x, y), start=1):
    pl.subplot(1, 2, panel)
    pl.plot(series)

# Horizontal box plot of a fresh normal sample.
x = np.random.randn(256)
pl.boxplot(x, vert=0)
pl.show()

# Box plots of three normal samples with different location, scale and size.
samp1 = np.random.normal(loc=0, scale=3., size=200)
samp2 = np.random.normal(loc=5., scale=10., size=500)
samp3 = np.random.normal(loc=0.3, scale=1.2, size=100)
f, ax = plt.subplots(1, 1, figsize=(5, 4))
ax.boxplot((samp1, samp2, samp3))
ax.set_xticklabels(['sample1', 'sample2', 'sample3'])
else: data = numpy.hstack((data, data_d)) mi_ests[nr] = mi_est nr += 1 pl.figure(tight_layout=True, figsize=(len(widths) * 4, 4)) i = 0 for T_s in datas: pl.subplot(1, len(datas), i + 1) pl.scatter(T_s[:, 0], T_s[:, 1], alpha=.3, s=81) pl.title('w: ' + str(widths[i])) pl.xlim([-1, 1]) pl.ylim([-1, 1]) i += 1 pl.savefig("ring_data.png") pl.figure(tight_layout=True, figsize=(6, 4)) pl.boxplot(data) title = "N=%i (ring)" % (n) pl.ylabel('MI') pl.xlabel('ring width') pl.gca().set_xticklabels(widths) pl.plot(range(1, len(widths) + 1), mi_ests, c="red", label="est MI (kd-tree)") pl.legend(loc=0, prop={'size': 8}) pl.title(title) pl.savefig("mi_vs_ring.png")
def _load_pickle(path):
    """Return the object pickled in ``path``, closing the file afterwards."""
    # NOTE(review): unpickling executes code from the file; these are
    # expected to be project-generated files, never untrusted input.
    with open(path) as pickle_fd:
        return cPickle.load(pickle_fd)


def _dump_pickle(obj, path):
    """Pickle ``obj`` into ``path``; closing the file guarantees the flush."""
    with open(path, 'w') as pickle_fd:
        cPickle.dump(obj, pickle_fd)


def analysis_results(options):
    """
    Analyzes the results of the comparisons

    Steps, in order:

    1. Collect the results of every comparison folder found in
       ``<workspace>/comparisons`` into one data frame (cached as
       ``<workspace>/analysis/dcdb_comparisons.csv``; an existing cache is
       loaded instead of being rebuilt).
    2. Drop the rows with missing values (a ``'None'`` string in the
       ``dcstructure`` column counts as missing).
    3. Identify me-too drug pairs (cached as pickles in
       ``analysis/me_too_drugs``) and, for every conflicting pair of drug
       pairs, drop the one that is involved in more conflicts.
    4. For each dcGUILD feature of threshold 1, run a repeated n-fold
       cross-validation with the selected classifier and collect the AUCs.
    5. Draw one box plot of the AUC distribution per feature, save it in
       ``analysis/figures`` and show it.

    :param options: object with the attributes ``workspace``,
        ``threshold_list`` and ``consider_se``.
    :returns: ``None``.  The process exits with status 10 when a comparison
        is missing from the ``pair2comb`` dictionary or has no results table.
    """

    # Start marker for time measure
    start = time.time()

    print(
        "\n\t\t------------------------------------------------------------------------------------------------------------------------\n"
    )
    print(
        "\t\tStarting Drug Interactions ANAlysis (DIANA), a program created by @OLIVA'S LAB. Analysis of results: Analysis by targets\n"
    )
    print(
        "\t\t------------------------------------------------------------------------------------------------------------------------\n"
    )

    # Get the script path
    main_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
    toolbox_dir = os.path.join(main_path, 'diana/toolbox')

    # Check the directory of the profiles, comparisons and analysis
    data_dir = os.path.join(options.workspace, "profiles")
    check_directory(data_dir)

    results_dir = os.path.join(options.workspace, "comparisons")
    check_directory(results_dir)

    analysis_dir = os.path.join(options.workspace, "analysis")
    check_directory(analysis_dir)

    # Get the list of thresholds to create the profiles
    if options.threshold_list and fileExist(options.threshold_list):
        threshold_list = get_values_from_threshold_file(options.threshold_list)
    else:
        threshold_list = [1, 5, 10, 20, 50]

    # Do we consider Side Effects/ATC?
    consider_se = bool(options.consider_se)

    # Get the names of the columns
    columns = diana_analysis.obtain_columns(threshold_list, ATC_SE=consider_se)

    #-----------------------------------------------------#
    #   PARSE THE RESULTS AND CREATE A PANDAS DATAFRAME   #
    #-----------------------------------------------------#

    pair2comb_file = os.path.join(toolbox_dir, 'pair2comb.pcl')
    pair2comb = _load_pickle(pair2comb_file)

    ddi = sum(1 for x in pair2comb.values() if x == 1)
    non_ddi = sum(1 for x in pair2comb.values() if x == 0)

    print('NUMBER OF DRUG COMBINATIONS:\t\t{}\n'.format(ddi))
    print('NUMBER OF NON-DRUG COMBINATIONS:\t{}\n'.format(non_ddi))

    output_dataframe = os.path.join(analysis_dir, 'dcdb_comparisons.csv')

    if not fileExist(output_dataframe):

        # Create a data frame to store the results
        df = pd.DataFrame(columns=columns)

        # Obtain all the results subfolders of the results main folder
        results_dir_list = [
            f for f in os.listdir(results_dir)
            if os.path.isdir(os.path.join(results_dir, f))
        ]

        # One single-row frame per comparison; they are concatenated once
        # after the loop instead of re-copying the frame on every row.
        comparison_frames = []

        for comparison in results_dir_list:

            drug_id1, drug_id2 = comparison.split('---')
            comparison_dir = os.path.join(results_dir, comparison)
            results_table = os.path.join(comparison_dir, 'results_table.tsv')

            # Add the Comb field (if it is drug combination or not)
            drug1 = drug_id1.split('_')[0].upper()
            drug2 = drug_id2.split('_')[0].upper()
            comparison_without_id = '{}---{}'.format(drug1, drug2)
            if comparison_without_id in pair2comb:
                combination_field = pair2comb[comparison_without_id]
            else:
                print(
                    'The comparison {} is not in the pair2comb dictionary!\n'.
                    format(comparison_without_id))
                print(pair2comb)
                sys.exit(10)

            if not fileExist(results_table):
                print('The comparison {} has not been executed properly!\n'.
                      format(comparison))
                sys.exit(10)

            results = get_results_from_table(results_table, columns,
                                             combination_field)

            comparison_frames.append(
                pd.DataFrame([results], columns=columns, index=[comparison]))

        # Add the information to the main data frame
        if comparison_frames:
            df = pd.concat([df] + comparison_frames)

        # Output the Pandas dataframe in a CSV file
        df.to_csv(output_dataframe)

    else:
        df = pd.read_csv(output_dataframe, index_col=0)

    #---------------------------#
    #   REMOVE MISSING VALUES   #
    #---------------------------#

    # Replace the None values in dcstructure by nan.  The test has to look
    # at the VALUES of the column: `'None' in series` checks the index.
    if df['dcstructure'].isin(['None']).any():
        df = df.replace(to_replace={'dcstructure': {'None': np.nan}})

    # Remove the nan values in dcstructure
    df = df.dropna()

    # Count the number of drug combinations / non-drug combinations
    dc_data = df[df['combination'] == 1]
    ndc_data = df[df['combination'] == 0]
    num_dc = len(dc_data.index)
    num_ndc = len(ndc_data.index)
    print('Number of drug combinations after removing missing values:\t{}\n'.
          format(num_dc))
    print(
        'Number of non-drug combinations after removing missing values:\t{}\n'.
        format(num_ndc))

    #---------------------------#
    #   IDENTIFY ME-TOO DRUGS   #
    #---------------------------#

    me_too_dir = os.path.join(analysis_dir, 'me_too_drugs')
    create_directory(me_too_dir)
    me_too_drugs_table = os.path.join(me_too_dir, 'me_too_drugs.tsv')
    me_too_drug_combs_table = os.path.join(me_too_dir,
                                           'me_too_drug_combinations.tsv')

    me_too_drug_pairs_file = os.path.join(me_too_dir, 'me_too_drug_pairs.pcl')
    me_too_drug_comb_pairs_file = os.path.join(me_too_dir,
                                               'me_too_drug_comb_pairs.pcl')

    if not fileExist(me_too_drug_pairs_file) or not fileExist(
            me_too_drug_comb_pairs_file):

        df_struc = df[['dcstructure']]
        df_struc = df_struc.astype(float)
        me_too_drug_pairs, me_too_drug_comb_pairs = diana_analysis.obtain_me_too_drugs_and_combinations(
            df_struc, columns, me_too_drugs_table, me_too_drug_combs_table)
        _dump_pickle(me_too_drug_pairs, me_too_drug_pairs_file)
        _dump_pickle(me_too_drug_comb_pairs, me_too_drug_comb_pairs_file)

    else:

        me_too_drug_pairs = _load_pickle(me_too_drug_pairs_file)
        me_too_drug_comb_pairs = _load_pickle(me_too_drug_comb_pairs_file)

    # Process me-too drug combination pairs: remember every conflicting pair
    # and count in how many conflicts each drug pair takes part
    me_too_drug_combinations = set()
    drug_pair_to_me_too_times = {}
    for pair in me_too_drug_comb_pairs:
        drug_comb1, drug_comb2 = pair.split('___')
        me_too_drug_combinations.add(frozenset([drug_comb1, drug_comb2]))
        drug_pair_to_me_too_times.setdefault(drug_comb1, 0)
        drug_pair_to_me_too_times.setdefault(drug_comb2, 0)
        drug_pair_to_me_too_times[drug_comb1] += 1
        drug_pair_to_me_too_times[drug_comb2] += 1

    # Of every conflicting couple that is still complete, remove the member
    # with more conflicts (the second one on ties)
    removed_drug_pairs = set()
    for pair in me_too_drug_comb_pairs:
        drug_comb1, drug_comb2 = pair.split('___')
        if drug_comb1 in removed_drug_pairs or drug_comb2 in removed_drug_pairs:
            continue
        if drug_pair_to_me_too_times[drug_comb1] > drug_pair_to_me_too_times[
                drug_comb2]:
            removed_drug_pairs.add(drug_comb1)
        else:
            removed_drug_pairs.add(drug_comb2)

    # Remove the drug pairs which appear in me-too pairs of drug pairs more times
    df = df.loc[~df.index.isin(list(removed_drug_pairs))]

    # Count the number of drug combinations / non-drug combinations
    dc_data = df[df['combination'] == 1]
    ndc_data = df[df['combination'] == 0]
    num_dc = len(dc_data.index)
    num_ndc = len(ndc_data.index)
    print(
        'Number of drug combinations after removing me-too conflictive drug pairs:\t{}\n'
        .format(num_dc))
    print(
        'Number of non-drug combinations after removing me-too conflictive drug pairs:\t{}\n'
        .format(num_ndc))

    #-------------------------#
    #   EVALUATE PERFORMANCE  #
    #-------------------------#

    img_dir = os.path.join(analysis_dir, 'figures')
    create_directory(img_dir)
    fig_format = 'png'

    tables_dir = os.path.join(analysis_dir, 'tables')
    create_directory(tables_dir)

    # Machine learning parameters
    repetitions = 25  # Number of repetititons
    n_fold = 10  # Number of folds
    classifier = 'SVC'
    classifiers = {
        'KNeighbors':
        KNeighborsClassifier(3),
        'SVC':
        SVC(probability=True),
        'SVC linear':
        SVC(kernel="linear", C=0.025),
        'SVC rbf':
        SVC(gamma=2, C=1),
        'DecisionTree':
        DecisionTreeClassifier(max_depth=5),
        'RandomForest':
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        'MLP':
        MLPClassifier(alpha=1),
        'AdaBoost':
        AdaBoostClassifier(),
        'GaussianNB':
        GaussianNB(),
        'QuadraticDiscr.':
        QuadraticDiscriminantAnalysis(),
        'SVC best 1':
        SVC(kernel="linear", C=0.1, probability=True),
        'SVC best 2':
        SVC(kernel="rbf", gamma=0.01, C=100.0, probability=True)
    }

    # Plot of distributions of AUC
    plot_name = os.path.join(img_dir,
                             'dcGUILD_1_threshold_auc.{}'.format(fig_format))

    # Get the targets file (not used below; loading it still fails early
    # when the toolbox file is missing)
    drugbank_to_targets_file = os.path.join(toolbox_dir,
                                            'drugbank_to_targets.pcl')
    drugbank_to_targets = _load_pickle(drugbank_to_targets_file)

    # Get the DIANA IDs file (not used below either)
    diana_id_to_drugbank_file = os.path.join(toolbox_dir,
                                             'diana_id_to_drugbank.pcl')
    diana_id_to_drugbank = _load_pickle(diana_id_to_drugbank_file)

    print('\nEVALUATION OF DCGUILD\n')

    # AUCs of all repetitions and folds, by feature
    feature_to_aucs = {}

    # Obtain n number of groups containing different non-drug combinations
    # to repeat the analysis n times.  This result is replaced inside the
    # feature loop; the call is kept so the sequence of calls into
    # diana_analysis stays as before.
    ndc_repetitions = diana_analysis.obtain_n_groups_of_k_length(
        ndc_data, repetitions, num_dc)

    # Get dcGUILD columns: one feature per data type and scoring function,
    # each evaluated together with the 'combination' label
    dcGUILD_features = []
    dcGUILD_feature_to_columns = {}
    for top_threshold in [1]:
        for data_type in ['node', 'edge', 'function']:
            for scoring_function in ['dot_product', 'spearman', 'jaccard']:
                col = 'dcg' + '_' + data_type + '_' + str(
                    top_threshold) + '_' + scoring_function
                dcGUILD_features.append(col)
                dcGUILD_feature_to_columns[col] = [col, 'combination']

    for feature in dcGUILD_features:

        df_method = df[dcGUILD_feature_to_columns[feature]]

        dc_data = df_method[df_method['combination'] == 1]
        ndc_data = df_method[df_method['combination'] == 0]
        num_dc = len(dc_data.index)
        num_ndc = len(ndc_data.index)

        print(feature)
        print(
            'Building {} repetition groups of {} (same) DC and {} (different) non-DC'
            .format(repetitions, num_dc, num_dc))
        # Obtain n number of groups containing different non-drug
        # combinations to repeat the analysis n times
        ndc_repetitions = diana_analysis.obtain_n_groups_of_k_length(
            ndc_data, repetitions, num_dc)

        mean_aucs = []  # Means of the AUCs of each cross-validation
        std_aucs = []  # Standard deviations of the AUCs of each cross-validation
        all_aucs = []  # ALL the AUCs
        all_probs = []  # All the probabilities and labels

        # Number of items in each group of the cross-validation
        num_items_group = int(float(num_dc) / float(n_fold))

        for num_repetitions, ndc_data_equal in enumerate(ndc_repetitions, 1):

            if num_repetitions == 1:
                print(
                    'Building {} fold groups of {} DC and {} non-DC x {} repetitions'
                    .format(n_fold, num_items_group, num_items_group,
                            repetitions))

            # Defining the drug combination groups in each cross-validation step
            dc_groups = diana_analysis.obtain_n_groups_of_k_length(
                dc_data, n_fold, num_items_group, me_too_drug_combinations)
            # Defining the non-drug combination groups in each cross-validation step
            ndc_groups = diana_analysis.obtain_n_groups_of_k_length(
                ndc_data_equal, n_fold, num_items_group,
                me_too_drug_combinations)
            merged_groups = [
                pd.concat([x, y]) for x, y in zip(dc_groups, ndc_groups)
            ]

            mean, var, std, list_auc, list_prob = diana_analysis.run_nfold_crossvalidation_scikit_with_prob(
                n_fold, merged_groups, classifiers[classifier])

            mean_aucs.append(mean)
            std_aucs.append(std)
            all_aucs.extend(list_auc)
            all_probs.extend(list_prob)

        final_mean = np.mean(mean_aucs)
        mean_std = np.mean(std_aucs)
        std = np.std(all_aucs)
        print('FINAL MEAN: {}'.format(final_mean))
        print('MEAN of STD: {}'.format(mean_std))
        print('STD: {}\n'.format(std))

        # Store the distribution of AUCs in the dictionary
        feature_to_aucs[feature] = all_aucs

    #------------------------------#
    #   PLOT DISTRIBUTION OF AUC   #
    #------------------------------#

    fig = pylab.figure(dpi=300)
    ax = pylab.axes()

    pos = 1
    xticks = []  # Define the places in which the labels will be
    xlabels = []  # Define the labels (the names of the features)

    for feature in dcGUILD_features:

        # One box per feature, drawn at `pos`; the label goes to the same place
        pylab.boxplot(feature_to_aucs[feature],
                      positions=[pos],
                      widths=0.6,
                      patch_artist=True)
        xticks.append(pos)
        xlabels.append(feature)  # Add the feature used at the x axis
        pos += 2  # Add separation between boxplots

    # Set axes limits and labels
    pylab.xlim(0, pos - 1)
    pylab.ylim(0, 1)
    # Fix the tick positions first so that the labels map onto them
    ax.set_xticks(xticks)
    ax.set_xticklabels(xlabels)
    pylab.xlabel('Features')
    pylab.ylabel('Distribution of AUC values')

    fig.autofmt_xdate()
    pylab.savefig(plot_name, format=fig_format)
    pylab.show()

    # End marker for time
    end = time.time()
    print(
        '\n  DIANA INFO:\tTIME OF EXECUTION: {:.3f} seconds or {:.3f} minutes.\n'
        .format(end - start, (end - start) / 60))

    return
liveness = Float32Col() energy = Float32Col() speechiness = Float32Col() valence = Float32Col() tempo = Float32Col() key = Int32Col() mode = StringCol(5) h5file = open_file('output.h5', mode='r', title='Spotify Tracks') for table in h5file.root.trackinfo: acousticness = np.array([]) for track in table: acousticness = np.append(acousticness, track['acousticness']) acousticness = acousticness.astype(float) p = P.figure() bp = P.boxplot(acousticness) p.suptitle('Acousticness Distribution for ' + table.name, fontsize=20) P.ylabel('Acousticness Score') P.ylim([0, 1]) for i in range(acousticness.size): y = acousticness x = np.random.normal(1 + i, 0.04, size=acousticness.size) P.plot(x, y, 'ro', alpha=0.2) P.show()
S = cc_state.cc_state([X[:, 0], X[:, 1]], ['normal'] * 2, ct_kernel=kernel, distargs=[None] * 2) S.transition(N=200) mi = iu.mutual_information(S, 0, 1) # linfoot = iu.mutual_information_to_linfoot(MI) MI[r, c] = mi print("w: %1.2f, MI: %1.6f" % (w, mi)) print("%i of %i" % (i + 1, len(W_list) * n_data_sets * n_samples * 2)) del S i += 1 r += 1 c += 1 w_labs = [str(w) for w in W_list] ax = pylab.subplot(1, 2, kernel + 1) pylab.boxplot(MI) pylab.ylim([0, 1]) pylab.ylabel('MI') pylab.xlabel('ring width') pylab.title("kernel %i" % kernel) ax.set_xticklabels(w_labs) pylab.show()
blah = numpy.load(outFileRaw + '.npz') W = blah['X'] for ii in range(X.shape[0]): Xi = X[ii, :, :] Zi = Z[ii, :, :] Wi = W[ii, :, :] pylab.subplot(2, 3, 1) pylab.imshow(Xi, interpolation='nearest', cmap='gist_earth') pylab.title('(mu=%0.2f, std=%0.2f, sk=%0.2f)' % (numpy.mean(Xi), numpy.std(Xi), stats.skew(col(Xi)))) pylab.colorbar() pylab.subplot(2, 3, 4) pylab.boxplot(col(Xi)) pylab.subplot(2, 3, 2) pylab.imshow(Zi, interpolation='nearest', cmap='gist_earth') pylab.title('(mu=%0.2f, std=%0.2f, sk=%0.2f)' % (numpy.mean(Zi), numpy.std(Zi), stats.skew(col(Zi)))) pylab.colorbar() pylab.subplot(2, 3, 5) pylab.boxplot(col(Zi)) pylab.subplot(2, 3, 3) pylab.imshow(Wi, interpolation='nearest', cmap='gist_earth') pylab.title('(mu=%0.2f, std=%0.2f, sk=%0.2f)' % (numpy.mean(Wi), numpy.std(Wi), stats.skew(col(Wi)))) pylab.colorbar()
sns.violinplot(x='target', y='lwt', hue="sit", data=D_, palette="muted", split=True, zorder=2) #%% # 2.2.3. Análise usando bwt (variável quantitativa) vars = ['smoke', 'race2', 'ptl2', 'ht', 'ui', 'ftv2'] f_trues = [] f_falses = [] for v in vars: ax = sns.boxplot(x="day", y="total_bill", hue="smoker", ... data=tips, palette="Set3") f_trues.append(X[X[v] == True]['bwt']) f_falses.append(X[X[v] == False]['bwt']) pl.boxplot([f_falses[-1], f_trues[-1]]) pl.xticks([1,2], [0, 1]) pl.yticks(range(0, 5001, 500)) # pl.grid(axis='y') pl.show() pl.close() # %% #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ @author: iagorosa """ # In[] import numpy as np
def plot(self, vert=True, alpha=0.4, widths=0.5, **kwargs):
    """Plot the boxplots and dots

    For every name in ``self.names`` the values ``self.data[name]`` are
    drawn as dots (spread sideways by ``self.beeswarm``) with a box plot on
    top, plus two purple lines at mean +/- one standard deviation.

    :param vert: ``True`` (the default) draws vertical boxes with the names
        on the x axis; any other value draws horizontal boxes with the
        names on the y axis.
    :param alpha: transparency of the dots.
    :param widths: width of the boxes; also stored in ``self.widths``.
    :param kwargs: accepted for compatibility; not used.
    :returns: the current axes (``pylab.gca()``).  The dictionary returned
        by ``pylab.boxplot`` is stored in ``self.tuning``.
    """
    self.widths = widths

    if self.hold is False:
        pylab.clf()

    ordered_data = [self.data[key] for key in self.names]
    positions = list(range(1, len(ordered_data) + 1))

    # Dots first; the boxes are raised above them further down.
    for i, vector in enumerate(ordered_data):
        color = self.colors[i % len(self.colors)]
        if vert is True:
            X, Y = self.beeswarm(vector, i + 1), vector
        else:
            X, Y = vector, self.beeswarm(vector, i + 1)
        pylab.plot(X, Y, 'o', markersize=self.markersize,
                   markerfacecolor=color, markeredgewidth=1, alpha=alpha)

    # show means but not outliers
    try:
        d = pylab.boxplot(ordered_data, widths=self.widths, vert=vert,
                          patch_artist=True, positions=positions,
                          showmeans=True, showfliers=False)
    except TypeError:
        # ReadTheDocs uses matplotlib 1.3.1 for now, which rejects the
        # showmeans/showfliers keywords, so fall back to a call without them
        d = pylab.boxplot(ordered_data, widths=self.widths, vert=vert,
                          patch_artist=True, positions=positions)

    # for further tuning if needed.
    self.tuning = d

    # additional lines at mean +/- 1 std
    means = [pylab.mean(data) for data in ordered_data]
    stds = [pylab.std(data) for data in ordered_data]
    half_span = widths / 2. / 1.5
    for i, this in enumerate(means):
        span = pylab.array([(i + 1) - half_span, (i + 1) + half_span])
        for level in (this + stds[i], this - stds[i]):
            if vert is True:
                pylab.plot(span, [level, level], lw=2, color='purple')
            else:
                pylab.plot([level, level], span, lw=2, color='purple')

    for i, this in enumerate(d['boxes']):
        this.set_color('k')
        this.set_linewidth(self.lw)
        color = self.colors[i % len(self.colors)]
        this.set_facecolor(color)
        # less than the default alpha of the dots, so as to see the dots
        # inside the boxes
        this.set_alpha(0.3)
        this.set_zorder(10)  # this moves the box on top of all dots

    for part in ('caps', 'whiskers', 'medians'):
        for this in d[part]:
            this.set_linewidth(self.lw)

    # we will extend the limits by 5%
    m = min([min(this) for this in self.data.values()])
    M = max([max(this) for this in self.data.values()])
    extend = 0.05
    R = (M - m) * extend

    X = positions
    Y = [name.replace("_", " ") for name in self.names]
    if vert is True:
        pylab.ylabel(self.ylabel, fontsize=self.fontsize)
        pylab.xticks(X, Y, fontsize=self.fontsize, rotation=90)
        # The x label was previously sent to ylabel(), which overwrote the
        # y label and left the x axis unlabelled.
        pylab.xlabel(self.xlabel, fontsize=self.fontsize)
        pylab.yticks(pylab.yticks()[0], fontsize=self.fontsize)
        pylab.ylim([m - R, M + R])
    else:
        pylab.xlabel(self.xlabel, fontsize=self.fontsize)
        # Shrink the name labels when there are many of them.
        if len(X) > 20:
            pylab.yticks(X, Y, fontsize=self.fontsize / 1.6, rotation=0)
        else:
            pylab.yticks(X, Y, fontsize=self.fontsize, rotation=0)
        pylab.ylabel(self.ylabel, fontsize=self.fontsize)
        pylab.xticks(pylab.xticks()[0], fontsize=self.fontsize)
        pylab.xlim([m - R, M + R])

    pylab.title(self.title, fontsize=self.fontsize * 1.25)
    pylab.grid()
    try:
        pylab.tight_layout()
    except Exception:
        # Best effort only: a failed layout adjustment must not prevent
        # the plot from being returned.
        pass

    return pylab.gca()
def _plot_nominal(self, data, result_dir, fig1, ax, x_key, y_key):
    """ Creates a boxplot of the y_keys for the given nominal parameter x_key.

    A method that allows to create a plot that visualizes the effect of
    differing one nominal variable onto a second one (e.g. the effect of
    differing the classifier onto the accuracy).

    **Expected parameters**

      *data*: A dictionary, that contains a mapping from an attribute \
              (e.g. accuracy) to a list of values taken by an attribute. \
              An entry is the entirety of all i-th values over all dict-values

      *result_dir*: The directory in which the plots will be saved \
              (not used inside this method).

      *fig1*: The figure to which a new subplot is added.

      *ax*: List of axes; the newly created axes object is appended to it.

      *x_key*: The key of the dictionary whose values should be used as \
               values for the x-axis (the independent variables)

      *y_key*: The key of the dictionary whose values should be used as\
               values for the y-axis, i.e. the dependent variable. A key of \
               the form ``#w1#key1#w2#key2`` plots the weighted sum \
               ``w1*data[key1] + w2*data[key2]`` instead.

    Returns the tuple ``(fig1, ax)``.
    """
    ax.append(fig1.add_subplot(111, label="%d" % (ax.__len__() + 1)))
    fig1.sca(ax[-1])

    # Compare by value: identity tests against a string literal only work
    # by accident of interning.
    weighted = y_key.startswith("#")
    if weighted:
        # A weighted cost function; parse the key once, not per sample.
        weight1, y_key1, weight2, y_key2 = y_key[1:].split("#")

    # Create the plot for this specific dependent variable
    values = defaultdict(list)
    for i, parameter_value in enumerate(data[x_key]):
        if weighted:
            performance_value = float(weight1) * float(data[y_key1][i]) \
                + float(weight2) * float(data[y_key2][i])
        else:
            performance_value = float(data[y_key][i])
        values[parameter_value].append(performance_value)

    values = sorted(values.items())
    # values = [("Standard_vs_Target", values["Standard_vs_Target"]),
    #           ("MissedTarget_vs_Target", values["MissedTarget_vs_Target"])]

    pylab.subplots_adjust(bottom=0.3, )  # the bottom of the subplots of the figure

    # Real lists (not lazy map objects), so this also works on Python 3.
    b = pylab.boxplot([samples for _, samples in values])

    medians = [medline.get_ydata()[0] for medline in b['medians']]

    # create array with median labels with 2 decimal places of precision
    upperLabels = [str(numpy.round(m, 2)) for m in medians]

    current_axes = pylab.gca()
    current_axes.set_xticklabels([name for name, _ in values])
    pylab.setp(current_axes.get_xticklabels(), rotation=-90)
    pylab.setp(current_axes.get_xticklabels(), size='x-small')
    current_axes.set_xlabel(x_key.replace("_", " "))

    # top = pylab.gca().get_ylim()[1]
    # for i in range(len(medians)):
    #     pylab.gca().text(i+1,top-(top*0.05),upperLabels[i],
    #                      horizontalalignment='center', size='x-small')

    # Write each median just beside the lower y-limit, under its box.
    bottom = current_axes.get_ylim()[0]
    for i, median_label in enumerate(upperLabels):
        current_axes.text(i + 1, bottom + (bottom * 0.05), median_label,
                          horizontalalignment='center', size='x-small')

    if weighted:
        current_axes.set_ylabel("%s*%s+%s*%s" % tuple(y_key[1:].split("#")))
    else:
        current_axes.set_ylabel(y_key.replace("_", " "))
    return fig1, ax
def plot_deviation(vals_of_replicas, vals_of_graph, metrics, figpath,
                   jaccard_edges=None, title_infix='', seed=0, Gname=''):
    """Plot replica metrics normalized by the original graph's metrics.

    For every metric a horizontal boxplot of ``replica value / original
    value`` is drawn, and summary statistics are collected.

    Args:
        vals_of_replicas: per metric, the list of values over the replicas.
        vals_of_graph: per metric, either a single value or a list of values
            (one per model graph, same length as the replica list).
        metrics: sequence of dicts; only ``metric['name']`` is read here.
        figpath: output path for the figure; when ``None`` a default path
            is built from ``Gname``, ``title_infix``, ``seed`` and
            ``timeNow()``.
        jaccard_edges: optional number shown in the figure header.
        title_infix: used only in the default figure path.
        seed: used only in the default figure path.
        Gname: label of the original graph.

    Returns:
        ``(replica_stats, figpath)`` where ``replica_stats`` is a dict of
        per-metric statistics and ``figpath`` is the (possibly generated)
        figure path.
    """
    #vals_of_graph could be a number (level 0) or a list (the same as the number of replicas)
    clean_names = {
        'num nodes': 'num nodes',
        'num edges': 'num edges',
        'clustering': 'clustering',
        'average degree': 'avg\ndegree',
        'degree assortativity': 'degree\nassortativity',
        'degree connectivity': 'degree\nconnectivity',
        'total deg*deg': 'total deg*deg\nassortativity',
        's-metric': 's metric',
        'mean ecc': 'avg\neccentricity',
        'num comps': 'num comps',
        'L eigenvalue sum': 'L eigen-\nvalue sum',
        'average shortest path': 'avg\ndistance',
        'harmonic mean path': 'harmonic avg\ndistance',
        'avg flow closeness': 'avg flow\ncloseness',
        'avg eigvec centrality': 'avg eigenvec.\ncentrality',
        'avg between. central.': 'avg between.\ncentrality',
        'modularity': 'modularity'
    }
    multiple_models = type(vals_of_graph[0]) is list

    pylab.show()
    fig = pylab.figure()
    # NOTE(review): pylab.hold does not exist in recent matplotlib releases;
    # kept because the surrounding project may pin an older version.
    pylab.hold(True)

    num_of_metrics = len(metrics)
    metric_range = range(num_of_metrics)
    med_vals = [np.median(vals_of_replicas[i]) for i in metric_range]
    avg_vals = [np.average(vals_of_replicas[i]) for i in metric_range]
    p25_vals = [np.percentile(vals_of_replicas[i], 25) for i in metric_range]
    p75_vals = [np.percentile(vals_of_replicas[i], 75) for i in metric_range]
    max_vals = [np.max(vals_of_replicas[i]) for i in metric_range]
    min_vals = [np.min(vals_of_replicas[i]) for i in metric_range]
    std_vals = [np.std(vals_of_replicas[i]) for i in metric_range]

    replica_stats = {
        'median_of_replicas': med_vals,
        'avg_of_replicas': avg_vals,
        'p25_of_replicas': p25_vals,
        'p75_of_replicas': p75_vals,
        'max_of_replicas': max_vals,
        'min_of_replicas': min_vals,
        'std_of_replicas': std_vals
    }

    normed_replica_vals = []
    avg_norms = []

    print('Medians' + (' (average of model graphs)' if multiple_models else ''))
    print('-------')
    print('metric\t\tOriginalG\t\tReplicas')
    for met_num, metric in enumerate(metrics):
        try:
            model_val = np.average(
                vals_of_graph[met_num]
            ) if multiple_models else vals_of_graph[met_num]
            print('%s\t\t%.5f\t\t%.5f' %
                  (metric['name'], model_val, med_vals[met_num]))
        except Exception:
            # The previous format string ('%\tserror') was itself invalid
            # and raised ValueError from inside this handler.
            print('%s\terror' % metric['name'])

    for met_num, metric in enumerate(metrics):
        #handle error in original, 0 in original, error in one replica, error in all replicas
        nor_vals = []
        if multiple_models:
            assert len(vals_of_graph[met_num]) == len(
                vals_of_replicas[met_num])
            pruned_model_vals = [
                v for v in vals_of_graph[met_num]
                if v != graphutils.METRIC_ERROR
            ]
            if len(pruned_model_vals) > 0:
                v_graph = np.average(pruned_model_vals)
            else:
                v_graph = graphutils.METRIC_ERROR
        else:
            v_graph = vals_of_graph[met_num]

        v_reps = vals_of_replicas[met_num]
        if v_graph != graphutils.METRIC_ERROR:
            if v_graph != 0.0:
                nor_vals = [
                    float(v) / v_graph for v in v_reps
                    if v != graphutils.METRIC_ERROR
                ]
            else:
                if v_reps != [] and np.abs(v_reps).sum() == 0.:
                    # NOTE(review): this appends ONE nested list, so
                    # nor_vals becomes a (1, n) array below; extending
                    # with n ones may have been intended. Left unchanged
                    # because the returned statistics depend on it.
                    nor_vals.append(len(v_reps) * [1.0])

            pylab.plot(1.0, met_num, 'o', color='k', linewidth=2., label=Gname)
            pylab.text(x=.0, y=(met_num - 2. / len(metrics)),
                       s='%.2e' % v_graph)
            #if type(v_graph) is int:
            #    pylab.text(x=.0, y=(met_num-2./len(metrics)), s=str(v_graph))
            #else:
            #    pylab.text(x=.0, y=(met_num-2./len(metrics)), s='%.3f'%v_graph)

            nor_vals = np.array(nor_vals)
            normed_replica_vals.append(nor_vals)
            if len(nor_vals) > 0:
                pylab.boxplot(nor_vals, positions=[met_num], vert=0,
                              widths=0.5)
                if (nor_vals == graphutils.METRIC_ERROR).any():
                    val_str = r'undefined'
                    avg_norm = -np.inf
                elif np.abs(nor_vals).sum() < 1000:
                    avg_norm = np.average(nor_vals)
                    val_str = (r'$%.2f$' % avg_norm
                               if latex_available else r'%.2f' % avg_norm)
                else:
                    avg_norm = np.inf
                    val_str = r'$\gg0$' if latex_available else r'>>0'
                avg_norms.append(avg_norm)
            else:
                val_str = r'undefined'
                avg_norms.append(None)
        else:
            val_str = r'undefined'
            normed_replica_vals.append([None, None])
            avg_norms.append(None)
        pylab.text(x=1.74, y=(met_num - 2. / len(metrics)), s=val_str)

    try:
        pylab.yticks(
            list(range(num_of_metrics)),
            [clean_names.get(met['name'], met['name']) for met in metrics],
            rotation=0)
        if multiple_models:
            pylab.xlabel(r'Relative to mean of coarse networks', rotation=0,
                         fontsize='20')  #, x=0.1)
        else:
            pylab.xlabel(r'Relative to real network', rotation=0,
                         fontsize='20')  #, x=0.1)
        #pylab.title(G.name)
        #pylab.legend(loc='best')
        max_axis = 2
        pylab.xlim(-0.02, max_axis)
        pylab.ylim(-1.0, len(metrics))
        pylab.text(x=0.00, y=len(metrics) + 0.05, s='Template\ngraph',
                   va='bottom')
        pylab.text(x=1.650, y=-1.05, s='Median of\nreplicas', va='top')
        if jaccard_edges is not None:
            pylab.text(x=0.30, y=len(metrics) + 0.05,
                       s='(Jaccard=%.3f)' % jaccard_edges, va='bottom')
            #pylab.text(x=-0.30, y=len(metrics)*(-0.15), s='E[EdgeJaccard]=%.3f'%jaccard_edges, ha='right', va='top')
        fig.subplots_adjust(left=0.17, right=0.95)

        if figpath is None:
            figpath = ('output/replica_vs_original_' + Gname + '_' +
                       title_infix + '_' + str(seed) + '__' + timeNow())
            figpath = clean_path(figpath)
        save_figure_helper(figpath)
        pylab.hold(False)
    except Exception as inst:
        # str(figpath): figpath can still be None if the failure happened
        # before the default path was built.
        print('Warning: could not save stats figure ' + str(figpath) +
              ':\n' + str(inst))
        exc_traceback = sys.exc_info()[2]
        print(
            str(inst) + "\n" +
            str(traceback.format_tb(exc_traceback)).replace('\\n', '\n'))

    replica_stats['normed_replica_vals'] = normed_replica_vals
    replica_stats['avg_norm_of_replicas'] = avg_norms

    mean_rel_errors = []
    mean_relstd_errors = []
    for met_i in range(num_of_metrics):
        normed_vals = normed_replica_vals[met_i]
        if graphutils.METRIC_ERROR in normed_vals or len(normed_vals) == 1:
            mean_rel_errors.append(None)
            mean_relstd_errors.append(None)
            continue
        rel_error_ar = [v - 1.0 for v in normed_vals if v is not None]
        if len(rel_error_ar) == 0:
            rel_error_ar = [graphutils.METRIC_ERROR, graphutils.METRIC_ERROR]
        mean_rel_errors.append(np.average(rel_error_ar))
        mean_relstd_errors.append(
            np.average(rel_error_ar) / (1E-20 + np.std(rel_error_ar)))

    replica_stats['mean_rel_errors'] = mean_rel_errors
    replica_stats['mean_relstd_errors'] = mean_relstd_errors
    try:
        replica_stats['mean_mean_error'] = np.average(
            mean_rel_errors)  #the grand stat
        replica_stats['mean_mean_errorstd'] = np.average(
            mean_relstd_errors)  #the grand stat
    except Exception:
        # Averaging fails when any per-metric entry is None.
        replica_stats['mean_mean_error'] = None
        replica_stats['mean_mean_errorstd'] = None

    return replica_stats, figpath
def create_boxplot(label, data, title, xlabel, ylabel):
    """Draw a labelled boxplot with mean markers on the current figure.

    ``label`` is passed to ``boxplot`` as ``labels``, ``title`` becomes a
    bold figure-level title, and ``xlabel``/``ylabel`` are the axis labels.
    Nothing is returned.
    """
    box_options = {'labels': label, 'showmeans': True}
    pl.boxplot(data, **box_options)
    pl.suptitle(title, fontweight='bold')
    for set_axis_label, text in ((pl.xlabel, xlabel), (pl.ylabel, ylabel)):
        set_axis_label(text)
make_lists(1, WK38)
make_lists(2, WK48)
make_lists(3, WK59)
make_lists(4, WK119)
make_lists(5, WK206)

############################################################################
#PLOT BOXPLOTS AND CALCULATE P-VALUES
############################################################################
P.figure()

data = [WK38, WK48, WK59, WK119, WK206]
# setting whiskers to an unreasonably large number forces a plot of the
# whiskers to represent the min and max of the data
bp = P.boxplot(data, whis=1000000)

for i, group in enumerate(data):
    y = np.array(group)
    # print() with a single argument behaves the same on Python 2 and 3
    print(i)
    print(y)
    # horizontal jitter around the box position (boxes sit at 1..n)
    x = np.random.normal(i + 1, 0.08, size=len(y))

    #calculate point density
    xy = np.vstack([x, y])
    z = gaussian_kde(xy)(xy)

    #sort points by density
    idx = z.argsort()
    x, y, z = x[idx], y[idx], z[idx]
repr((avg_values_RBBUP[1] / 2500) * 100) + " Length = " + repr((avg_values_RBBUP[2] / 2500) * 100) + " Turns = " + repr((avg_values_RBBUP[3] / avg_values_RBBUP[2]) * 100)) fig = plt.figure() fig.set_size_inches(10, 7) # Set properties for the plot. n_groups = 6 index = np.arange(n_groups) bar_width = 0.4 opacity = 0.8 # Dead ends subplot1 = plt.subplot(221) plot1 = plt.boxplot(dead_ends, vert=0) plt.xlabel('Algorithm') plt.ylabel('Cell') plt.title('Dead Ends') plt.yticks(index + 1, ('RB', 'P', 'W', 'RC', 'BU', 'RBBU')) # Rivers plt.subplot(222) plot2 = plt.boxplot(rivers, vert=0) plt.xlabel('Algorithm') plt.ylabel('Cell') plt.title('River Factor') plt.yticks(index + 1, ('RB', 'P', 'W', 'RC', 'BU', 'RBBU')) # Length plt.subplot(223)
def compare_nbrs():
    """Decide which number of nbrs is best

    For matrices 2 and 4 and each neighbour count, load the results of the
    ten seeded runs, print the mean of three counts (polymers found, chosen
    tetramers found, best tetramers found) and save one boxplot per count
    under ``pictures/``.

    Output recorded from an earlier run:

    *** 3 ***
    [1428.5, 49.100000000000001, 6.2999999999999998]
    *** 7 ***
    [2240.9000000000001, 56.799999999999997, 6.4000000000000004]
    *** 11 ***
    [3262.0, 66.900000000000006, 7.7999999999999998]
    *** 15 ***
    [4020.5, 66.299999999999997, 7.9000000000000004]
    *** 31 ***
    [5613.6999999999998, 70.700000000000003, 8.0]
    *** 66 ***
    [6858.1999999999998, 55.200000000000003, 6.5]
    *** 3 ***
    [1489.7, 40.700000000000003, 5.0999999999999996]
    *** 7 ***
    [2357.4000000000001, 58.700000000000003, 7.2000000000000002]
    *** 11 ***
    [3079.3000000000002, 66.700000000000003, 7.2000000000000002]
    *** 15 ***
    [3791.8000000000002, 62.799999999999997, 7.0999999999999996]
    *** 31 ***
    [5291.3999999999996, 68.799999999999997, 8.0999999999999996]
    *** 66 ***
    [6714.8999999999996, 60.600000000000001, 7.0999999999999996]
    """
    # Loop invariants, hoisted out of the nested loops.
    nbrs = [3, 7, 11, 15, 31, 66]
    db = os.path.join("..", "dims_and_tets", "alltetramers")
    ylabels = [
        'Number of polymers', 'Number of chosen tetramers',
        'Number of top 10 most efficient tetramers'
    ]

    for matrix in [2, 4]:
        alldata = []
        for nbr in nbrs:
            # Single-argument print() works on both Python 2 and 3.
            print("*** %s ***" % nbr)
            data = [[], [], []]
            for seed in range(10):
                filename = os.path.join(
                    "paramsweep", "output",
                    "alltetramers_CHR64_NBR%d_MAT%d_distance_SEED%d.txt" %
                    (nbr, matrix, seed))
                res = GA_Results(filename, db, 4)

                # Check
                result = (len(res.pols),
                          len(res.pols & chosentetramers),
                          len(res.pols & besttetramers))
                for series, count in zip(data, result):
                    series.append(count)
            print([pylab.mean(x) for x in data])
            alldata.append(data)

        for i, x in enumerate(ylabels):
            pylab.boxplot([y[i] for y in alldata])
            pylab.xlabel("Number of neighbours")
            pylab.ylabel(x)
            pylab.gca().set_xticklabels(nbrs)
            pylab.savefig(
                os.path.join("pictures",
                             "Nnbrs_matrix%d_%d.png" % (matrix, i)))
            pylab.clf()
def box_plot(df, val, factors=None, where=None, fname=None, output_dir='',
             quality='medium'):
    """
    Makes a box plot

       args:
          df:
             a pyvttbl.DataFrame object

          val:
             the label of the dependent variable

       kwds:
          factors:
             a list of factors to include in boxplot

          where:
             a string, list of strings, or list of tuples
             applied to the DataFrame before plotting

          fname:
             output file name; must end with .png or .svg. When omitted a
             name is built from val and factors

          output_dir:
             directory the figure is written to

          quality:
             {'low' | 'medium' | 'high'} specifies image file dpi

       returns:
          a dict describing what was plotted when df.TESTMODE is true,
          otherwise None

       raises:
          Exception if the table is empty, columns have unequal lengths,
          labels are duplicated or fname has an unsupported extension;
          KeyError if val or a factor is not a column;
          TypeError if factors is not iterable or fname is not a string
    """
    if factors is None:
        factors = []

    if where is None:
        where = []

    # check to see if there is any data in the table
    if df == {}:
        raise Exception('Table must have data to print data')

    # check to see if data columns have equal lengths
    if not df._are_col_lengths_equal():
        raise Exception('columns have unequal lengths')

    # check the supplied arguments
    if val not in list(df.keys()):
        raise KeyError(val)

    if not hasattr(factors, '__iter__'):
        raise TypeError("'%s' object is not iterable"
                        % type(factors).__name__)

    for k in factors:
        if k not in list(df.keys()):
            raise KeyError(k)

    # check for duplicate names
    dup = Counter([val] + factors)
    del dup[None]  # Counter ignores deletion of a missing key
    if not all([count == 1 for count in list(dup.values())]):
        raise Exception('duplicate labels specified as plot parameters')

    # check fname
    if not isinstance(fname, _strobj) and fname is not None:
        raise TypeError('fname must be None or string')

    if isinstance(fname, _strobj):
        if not fname.lower().endswith(('.png', '.svg')):
            raise Exception('fname must end with .png or .svg')

    test = {}

    if factors == []:
        # single box holding every selected value
        d = df.select_col(val, where=where)
        fig = pylab.figure()
        pylab.boxplot(np.array(d))
        xticks = pylab.xticks()[0]
        xlabels = [val]
        pylab.xticks(xticks, xlabels)

        test['d'] = d
        test['val'] = val

    else:
        # one box per combination of factor levels
        D = df.pivot(val, rows=factors, where=where, aggregate='tolist')

        fig = pylab.figure(figsize=(6 * len(factors), 6))
        fig.subplots_adjust(left=.05, right=.97, bottom=0.24)
        boxes = [np.array(_flatten(d)) for d in D]
        pylab.boxplot(boxes)
        xticks = pylab.xticks()[0]
        xlabels = ['\n'.join('%s = %s' % fc for fc in c) for c in D.rnames]
        pylab.xticks(xticks, xlabels, rotation=35, verticalalignment='top')

        test['d'] = boxes
        test['xlabels'] = xlabels

    maintitle = '%s' % val

    if factors != []:
        maintitle += ' by '
        maintitle += ' * '.join(factors)

    fig.text(0.5, 0.95, maintitle,
             horizontalalignment='center',
             verticalalignment='top')

    test['maintitle'] = maintitle

    if fname is None:
        fname = 'box(%s' % val
        if factors != []:
            fname += '~' + '_X_'.join([str(f) for f in factors])

        fname += ').png'

    fname = os.path.join(output_dir, fname)

    test['fname'] = fname

    # save figure; svg output and unrecognised quality values use the
    # default dpi
    dpi = None
    if not (quality == 'low' or fname.endswith('.svg')):
        dpi = {'medium': 200, 'high': 300}.get(quality)

    if dpi is None:
        pylab.savefig(fname)
    else:
        pylab.savefig(fname, dpi=dpi)

    pylab.close()

    if df.TESTMODE:
        return test
# setp(bp['fliers'][2], color='red') # setp(bp['fliers'][3], color='red') setp(bp['medians'][1], color='red') # Some fake data to plot A = [[1, 2, 5], [7, 2]] B = [[5, 7, 2, 2, 5], [7, 2, 5]] C = [[3, 2, 5, 7], [6, 7, 3]] fig = figure() ax = axes() # hold(True) # first boxplot pair bp = boxplot(data, positions=[1, 2], widths=0.6) setBoxColors(bp) # # second boxplot pair # bp = boxplot(B, positions = [4, 5], widths = 0.6) # setBoxColors(bp) # # # thrid boxplot pair # bp = boxplot(C, positions = [7, 8], widths = 0.6) # setBoxColors(bp) # set axes limits and labels # xlim(0,9) # ylim(0,9) # ylim(ymin=0) xlim(xmin=0)
def crossvalidate_krr(X, Y, f=5, kwidths=10.0**np.array([0, 1, 2]),
                      llambdas=10.0**np.array([-4, -2, 0])):
    '''
    Test generalization performance of kernel ridge regression with gaussian kernel

    Runs a nested cross-validation: the inner loop selects kernel width and
    regularizer by mean r-square, the outer loop evaluates the selected
    model and a linear model on the held-out fold. A boxplot comparing both
    is saved to 'krr_vs_lin_comparison.pdf'.

    Input:      X           data (dims-by-samples)
                Y           labels (dims2-by-samples)
                f           number of cross-validation folds
                kwidths     width of gaussian kernel function
                llambdas    regularizer (height of ridge on kernel matrix)

    Output:     r2_outer    r-square of kernel ridge regression per outer fold
                r2_linear   r-square of the linear model per outer fold

    Samples that do not fill a complete fold (the last X.shape[-1] % f
    indices) are left out.
    '''
    # Floor division keeps the fold size an integer on Python 3 as well;
    # with "/" the reshape below receives a float dimension and fails.
    fold_size = X.shape[-1] // f
    N = f * fold_size
    idx = np.reshape(np.random.permutation(np.arange(N, dtype=int)),
                     (f, fold_size))
    r2_outer = np.zeros((f))
    r2_linear = np.zeros((f))
    r2_inner = np.zeros((f - 1, kwidths.shape[-1], llambdas.shape[-1]))

    # to outer cross-validation (model evaluation)
    for ofold in range(f):
        # split in training and test (outer fold)
        otestidx = np.zeros((f), dtype=bool)
        otestidx[ofold] = 1
        otest = idx[otestidx, :].flatten()
        otrain = idx[~otestidx, :]

        # inner cross-validation (model selection)
        for ifold in range(f - 1):
            # split in training and test (inner fold)
            itestidx = np.zeros((f - 1), dtype=bool)
            itestidx[ifold] = 1
            itest = otrain[itestidx, :].flatten()
            itrain = otrain[~itestidx, :].flatten()

            # do inner cross-validation (model selection)
            for illambda in range(llambdas.shape[-1]):
                for ikwidth in range(kwidths.shape[-1]):
                    #compute kernel for all data points
                    alphas = train_krr(X[:, itrain], Y[:, itrain],
                                       kwidths[ikwidth], llambdas[illambda])
                    yhat = apply_krr(alphas, X[:, itrain], X[:, itest],
                                     kwidths[ikwidth])
                    r2_inner[ifold, ikwidth, illambda] = compute_rsquare(
                        yhat, Y[:, itest])

        #train again using optimal parameters
        r2_across_folds = r2_inner.mean(axis=0)
        optkwidthidx, optllambdaidx = np.unravel_index(
            r2_across_folds.flatten().argmax(), r2_across_folds.shape)

        #evaluate model on outer test fold
        otrain_flat = otrain.flatten()
        alphas = train_krr(X[:, otrain_flat], Y[:, otrain_flat],
                           kwidths[optkwidthidx], llambdas[optllambdaidx])
        yhat = apply_krr(alphas, X[:, otrain_flat], X[:, otest],
                         kwidths[optkwidthidx])
        r2_outer[ofold] = compute_rsquare(yhat, Y[:, otest])

        # for comparison: predict with linear model
        w_est = train_ols(X[:, otrain_flat], Y[:, otrain_flat])
        y_est_lin = apply_ols(w_est, X[:, otest])
        r2_linear[ofold] = compute_rsquare(y_est_lin, Y[:, otest])

        print('Fold %d'%ofold + ' best kernel width %f'%kwidths[optkwidthidx] +\
            ' best regularizer %f'%llambdas[optllambdaidx] + \
            ' rsquare %f'%r2_outer[ofold] + \
            ' rsquare linear %f'%r2_linear[ofold])

    pl.figure()
    pl.boxplot(np.vstack((r2_outer, r2_linear)).T)
    pl.ylabel('$r^2$')
    pl.xticks((1, 2), ('KRR', 'Lin'))
    pl.savefig('krr_vs_lin_comparison.pdf')
    return r2_outer, r2_linear
break n += 1 bins = np.array(list(range(0, 21))) / 10 inds = np.digitize(x, bins) print(bins) print(inds) x_bin = [] y_bin = [] for group in range(min(inds), max(inds)): # x_bin.append(x[inds == group]) y_bin.append(np.array(y)[inds == group]) fig, ax = plt.subplots() boxplot(y_bin, 0, '') ax.set_xticklabels(bins) plt.xlabel("Reserve Price", fontsize=30) ax.xaxis.set_tick_params(labelsize=20) plt.ylabel("Impression Revenue", fontsize=30) ax.yaxis.set_tick_params(labelsize=20) plt.show() fig, ax = plt.subplots() ax.plot(x, y, '.', markersize=0.3, color='black') plt.xlim(0, 2) plt.ylim(0, 4) plt.xlabel("Reserve Price", fontsize=30) ax.xaxis.set_tick_params(labelsize=20) plt.ylabel("Impression Revenue", fontsize=30) ax.yaxis.set_tick_params(labelsize=20)
#############################################################################
#############################################################################
import pylab as P
import numpy as np

#1rst component
# Scores on the first projection column, split by the label stored in y.
scz = projections[y == 1, 0]
scz_asd = projections[y == 2, 0]
asd = projections[y == 3, 0]

data = [scz, scz_asd, asd]

P.figure()
bp = P.boxplot(data)
P.ylabel('Predicted')
plt.ylabel('Score on 1rst component')
P.xticks([1, 2, 3], ['SCZ', 'SCZ-ASD', 'ASD'])

# Overlay the raw points with a little horizontal jitter around each box.
# This loop rebinds y, which is why the labels are reloaded further down.
for i, y in enumerate(data):
    x = np.random.normal(1 + i, 0.04, size=len(y))
    P.plot(x, y, 'bo', alpha=0.6)
P.show()

#2nd component
y = np.load(DATA_Y)
y = y[y != 0]
scz = projections[y == 1, 1]
# Simulate until every bin holds at least 10000 expected values.
while any(len(b.exp_frac_bi) < 10000 for b in bins):
    r = random.random()**2
    sim_states = [(random.random() < r, random.random() < r)
                  for p in sample_pairs]
    num_bi = sum(p and m for p, m in sim_states)
    num_silent = sum(not (p or m) for p, m in sim_states)
    bins[num_silent].addsim(num_bi)

if 'sayN' in o.plotstyle:
    Ns = [len(b.obs_frac_bi) for b in bins[:-1]]
    # %-formatting keeps the output identical on Python 2 and 3
    print('min N = %s max = %s' % (min(Ns), max(Ns)))

if 'violin' in o.plotstyle:
    violin_plot(pylab.axes(), [b.obs_frac_bi for b in bins[:-1]],
                [b.frac_silent for b in bins[:-1]],
                leftside=False, color='b', widthf=0.1)
    # Subsample the expected values to the size of the observed sample of
    # the same bin. The previous code read len(o.obs_frac_bi), i.e. from
    # the options object, which has no per-bin data.
    violin_plot(pylab.axes(),
                [random.sample(b.exp_frac_bi, len(b.obs_frac_bi))
                 for b in bins[:-1]],
                [b.frac_silent for b in bins[:-1]],
                rightside=False, color='r', widthf=0.1)

if 'boxplot' in o.plotstyle:
    pylab.plot([b.frac_silent for b in bins],
               [numpy.mean(b.exp_frac_bi) for b in bins],
               color='r', label='expected')
    pylab.boxplot([b.obs_frac_bi for b in bins],
                  positions=[b.frac_silent for b in bins],
                  widths=0.5/len(sample_pairs))

if 'mean_graph' in o.plotstyle:
    pylab.plot([b.frac_silent for b in bins],
               [numpy.mean(b.exp_frac_bi) for b in bins],
               color='r', label='expected')
    pylab.plot([b.frac_silent for b in bins],
               [numpy.mean(b.obs_frac_bi) for b in bins],
               color='b', label='observed')

if 'mean_sem' in o.plotstyle:
    pylab.plot([b.frac_silent for b in bins],
               [numpy.mean(b.exp_frac_bi) for b in bins],
               color='r', label='expected')
    pylab.errorbar([b.frac_silent for b in bins],
                   [numpy.mean(b.obs_frac_bi) for b in bins],
                   [sem(b.obs_frac_bi) for b in bins], label='observed')

if 'std' in o.plotstyle:
    pylab.plot([b.frac_silent for b in bins],
               [numpy.mean(b.exp_frac_bi) for b in bins],
               color='r', label='expected')
    pylab.errorbar([b.frac_silent for b in bins],
                   [numpy.mean(b.obs_frac_bi) for b in bins],
                   [numpy.std(b.obs_frac_bi) for b in bins],
                   label='observed')

if 'sayY' in o.plotstyle:
    print([numpy.mean(b.obs_frac_bi) for b in bins])
    print([numpy.mean(b.exp_frac_bi) for b in bins])
    from scipy import stats
    # Bins flagged by hasNaN() are left out of the paired test.
    paired = stats.ttest_rel(
        [numpy.mean(b.obs_frac_bi) for b in bins if not b.hasNaN()],
        [numpy.mean(b.exp_frac_bi) for b in bins if not b.hasNaN()])
    print('paired t test %s' % (paired,))

pylab.title(' '.join(o.stages))
def plotGCContent(all_result_outputs, label=''):
    """Boxplot of MH-mediated deletion read percentages by GC content.

    Args:
        all_result_outputs: one entry per sample; ``entry[0]['Data']`` is a
            DataFrame with the oligo/indel columns used below and
            ``entry[0]['RegrLines']`` maps MH length to regression lines.
        label: accepted for call compatibility; not used in this body.

    The figure is shown without blocking and stored through ``saveFig``.
    """
    #Merge data across samples
    unique_cols = ['Oligo ID', 'Indel', 'GC Content', 'MH Len', 'MH Dist']
    read_cols = ['Indel Reads', 'Non-Null Reads']
    datas = [x[0]['Data'][unique_cols + read_cols] for x in all_result_outputs]

    merged_data = datas[0]
    for i, data in enumerate(datas[1:]):
        # read columns of sample k (k >= 2) end up suffixed with 'k'
        merged_data = pd.merge(merged_data, data, on=unique_cols,
                               suffixes=('', '%d' % (i + 2)), how='outer')

    # column-name suffixes matching the merge above: '', '2', '3', ...
    suffixes = ['%d' % (i + 1) if i > 0 else '' for i in range(len(datas))]
    merged_data['Indel Reads Sum'] = merged_data[
        ['Indel Reads' + s for s in suffixes]].sum(axis=1)
    merged_data['Non-Null Reads Sum'] = merged_data[
        ['Non-Null Reads' + s for s in suffixes]].sum(axis=1)

    #Compute mean regression lines across samples for each MH length
    # (only needed once the residual adjustment below is re-enabled)
    mean_lines = {}
    for mh_len in range(2, 16):
        if mh_len not in all_result_outputs[0][0]['RegrLines']:
            continue
        regr_lines = [
            x[0]['RegrLines'][mh_len][:2] for x in all_result_outputs
        ]
        mean_lines[mh_len] = np.mean(regr_lines, axis=0)

    #Restrict to only MH dist in (0,10) and adjust for mh len-dist relationship
    for mh_len in [9]:
        in_range = ((merged_data['MH Len'] == mh_len) &
                    (merged_data['MH Dist'] >= 0) &
                    (merged_data['MH Dist'] <= 10))
        # Work on an explicit copy so the new columns are not written onto
        # a slice of merged_data (avoids pandas' SettingWithCopyWarning).
        sel_data = merged_data.loc[in_range].copy()
        sel_data['Perc Reads'] = (sel_data['Indel Reads Sum'] * 100.0 /
                                  sel_data['Non-Null Reads Sum'])
        # The residual is currently the raw percentage; the adjustment
        #   - getRegrValue(row['MH Len'],row['MH Dist'],mean_lines)
        # is disabled.
        sel_data['Perc Reads Residual'] = sel_data['Perc Reads']

        PL.figure(figsize=(4, 4))
        gcs = sel_data['GC Content'].unique()
        gcs.sort()
        boxdata_lk = {
            gc: sel_data.loc[sel_data['GC Content'] == gc][
                'Perc Reads Residual']
            for gc in gcs
        }
        #Limit to GC with at least 20 data points
        gcs = [gc for gc in gcs if len(boxdata_lk[gc]) > 20]
        boxdata = [boxdata_lk[gc] for gc in gcs]
        print([len(x) for x in boxdata])

        PL.boxplot(boxdata)
        PL.ylabel('Percent total mutated reads of MH-mediated deletion')
        PL.xlabel('GC content of microhomologous sequence')
        PL.title('Microhomology of length %d\n(at max 10 distance)' % mh_len)
        PL.xticks(range(1, len(gcs) + 1), gcs)
        PL.show(block=False)
        saveFig('gc_content_mh%d' % mh_len)
# Compression ratios of blocks holding more than 1000 transactions.
large_size_ratios = [r for s, r in zip(sizes, ratios) if s > 1000]

print('best compression overall: ', max(ratios))
print()
print('Results for all blocks, n: ', len(ratios))
print('mean compression: ', np.mean(ratios))
print('median compression: ', np.median(ratios))
print()
print('Results for blocks with more than 1K txs, n: ', len(large_size_ratios))
print('mean compression: ', np.mean(large_size_ratios))
print('median compression: ', np.median(large_size_ratios))

plt.scatter(sizes, ratios, alpha=0.75)
plt.xlabel('transactions in block')
plt.ylabel('compression ratio')
plt.ylim((0.95, 1.0))
plt.grid(True)

# Group the ratios into bins of 50 transactions.
size_ratios_map = defaultdict(list)
for size, ratio in zip(sizes, ratios):
    size_ratios_map[size//50].append(ratio)

# boxplot() draws its boxes at x = 1..n in the order given, so the boxes
# are fed in sorted bin order and the ticks placed at those same positions.
# Previously boxes followed dict insertion order while ticks sat at the raw
# bin keys, so labels and boxes did not line up.
bin_keys = sorted(size_ratios_map)

plt.figure()
plt.boxplot([size_ratios_map[k] for k in bin_keys])
plt.xlabel('transactions in block')
plt.ylabel('compression rate')
plt.ylim((0.95, 1.0))
plt.xticks(range(1, len(bin_keys) + 1), [50*k for k in bin_keys],
           rotation='vertical')
plt.tight_layout()
plt.show()