def get_scores_from_fits(fits, use_correlations):
    """Collect LOO scores from fits, keeping only values strictly above -1.

    When use_correlations is true, every fit contributes the pair
    (fit.LOO_score, fit.with_correlations.LOO_score); a pair is kept only if
    both members are above -1, so the two returned arrays stay aligned.

    Returns:
        (basic, multi) -- numpy arrays of scores. multi is None when
        use_correlations is false.
    """
    if not use_correlations:
        kept = [fit.LOO_score for fit in iterate_fits(fits) if fit.LOO_score > -1]
        return np.array(kept), None

    basic_scores = []
    multi_scores = []
    for fit in iterate_fits(fits):
        plain = fit.LOO_score
        correlated = fit.with_correlations.LOO_score
        # Drop the whole pair if either score fails the > -1 filter.
        if plain > -1 and correlated > -1:
            basic_scores.append(plain)
            multi_scores.append(correlated)
    return np.array(basic_scores), np.array(multi_scores)
def get_scores_from_fits(fits, use_correlations):
    """Gather LOO scores from fits, discarding any score that is not above -1.

    With use_correlations set, scores come in pairs
    (fit.LOO_score, fit.with_correlations.LOO_score) and a pair survives only
    when both of its values are above -1, which keeps both outputs aligned.

    Returns:
        (basic, multi) -- numpy arrays; multi is None if use_correlations is
        false.
    """
    if use_correlations:
        surviving = []
        for fit in iterate_fits(fits):
            pair = (fit.LOO_score, fit.with_correlations.LOO_score)
            if pair[0] > -1 and pair[1] > -1:
                surviving.append(pair)
        basic = np.array([first for first, _ in surviving])
        multi = np.array([second for _, second in surviving])
        return basic, multi

    basic = np.array(
        [fit.LOO_score for fit in iterate_fits(fits) if fit.LOO_score > -1])
    return basic, None
def plot_bootstrap_onset_variance(data, fits):
    """Plot the bootstrap standard deviation of mu against the fitted mu.

    For every fit yielded by iterate_fits(fits), the third entry of fit.theta
    (mu) is the x value and the standard deviation of the third row of
    fit.theta_samples (mu across the bootstrap samples) is the y value.
    The x axis is labelled with the scaled development stages, and a dashed
    vertical line marks the scaled age 0 (birth).

    Args:
        data: unused; kept so the signature matches the other plot functions.
        fits: fits container understood by iterate_fits.

    Returns:
        The matplotlib figure.
    """
    mu_values = []
    mu_stds = []
    for fit in iterate_fits(fits):
        # theta has four entries; only the third one (mu) is needed here.
        _, _, mu_global, _ = fit.theta
        # theta_samples is (nParams, nSamples), so row 2 is mu for every
        # bootstrap sample -- no need to copy it element by element.
        mu_values.append(mu_global)
        mu_stds.append(np.std(fit.theta_samples[2, :]))

    fig = plt.figure()
    ax = fig.add_axes([0.12, 0.12, 0.8, 0.8])
    ax.plot(mu_values, mu_stds, 'bx')
    ax.set_ylabel('onset time bootstrap std', fontsize=cfg.fontsize)

    # set the development stages as x labels
    stages = [stage.scaled(age_scaler) for stage in dev_stages]
    ax.set_xticks([stage.central_age for stage in stages])
    ax.set_xticklabels([stage.short_name for stage in stages],
                       fontsize=cfg.fontsize, fontstretch='condensed',
                       rotation=90)

    # keep only the lowest and highest y tick to reduce clutter
    yticks = ax.get_yticks()
    yticks = [yticks[0], yticks[-1]]
    ax.set_yticks(yticks)
    ax.set_yticklabels(['{:.1g}'.format(t) for t in yticks],
                       fontsize=cfg.fontsize)

    # mark birth time with a vertical line
    ymin, ymax = ax.get_ylim()
    birth_age = age_scaler.scale(0)
    ax.plot([birth_age, birth_age], [ymin, ymax], '--', color='0.85')
    return fig
def add_change_distributions(data, fitter, fits, age_range=None, n_bins=50):
    """
    Attach a "strength of transition" histogram over age to every fit.

    The bin layout (edges and centers) is stored once on
    fits.change_distribution_params. Each fit then receives three attributes:
    change_distribution_weights, change_distribution_spread and
    change_distribution_mean_std.

    Only sigmoid/sigslope shapes are supported: the computation uses the h
    parameter explicitly and relies on monotonicity. Generalizing to other
    shapes is probably not too hard.
    """
    shape = fitter.shape
    # Guard: the function currently works only for sigmoid/sigslope fits.
    assert shape.cache_name() in ['sigmoid', 'sigslope']

    edges, centers = get_bins(data, age_range, n_bins)
    fits.change_distribution_params = Bunch(bin_edges=edges, bin_centers=centers)

    # The keys are requested from iterate_fits but only the fit object is used.
    for _dsname, _g, _r, fit in iterate_fits(fits, return_keys=True):
        histogram = calc_bootstrap_change_distribution(
            shape, fit.theta_samples, edges)
        fit.change_distribution_weights = histogram
        fit.change_distribution_spread = change_distribution_spread_cumsum(
            centers, histogram)
        fit.change_distribution_mean_std = change_distribution_mean_and_std(
            centers, histogram)
def plot_comparison_bar(data, shapes, all_fits, threshold_percentile=None):
    """Bar chart of the mean LOO score per shape, with standard-error bars.

    Args:
        data: unused; kept for a uniform plotting signature.
        shapes: shape objects, one per entry of all_fits (display_name() is
            used for the x tick labels).
        all_fits: one fits container per shape.
        threshold_percentile: if given, only scores strictly above this
            percentile of each shape's scores enter the mean and the standard
            error. Previously the argument was ignored and the 50th
            percentile was always used.

    Returns:
        The matplotlib figure. Bars are ordered by decreasing mean score.
    """
    nShapes = len(shapes)
    mu = np.empty(nShapes)
    se = np.empty(nShapes)
    for i, fits in enumerate(all_fits):
        scores = np.array(
            [f.LOO_score for f in iterate_fits(fits, R2_threshold=-1)])
        if threshold_percentile is not None:
            # Honour the requested percentile instead of a hard-coded median.
            threshold_score = np.percentile(scores, threshold_percentile)
            scores = scores[scores > threshold_score]
        mu[i] = np.mean(scores)
        se[i] = scipy.stats.sem(scores)

    # reorder by mean score (best first)
    idx = np.argsort(mu)[::-1]
    mu = mu[idx]
    se = se[idx]
    shapes = [shapes[i] for i in idx]

    index = np.arange(nShapes)
    bar_width = 0.8
    fig = plt.figure()
    ax = fig.add_axes([0.12, 0.12, 0.8, 0.8])
    ax.bar(index, mu, yerr=se, width=bar_width, color='b',
           error_kw={'ecolor': '0.3', 'linewidth': 2})
    ax.set_ylabel('Mean $R^2$', fontsize=fontsize)
    ax.set_xticks(index + bar_width / 2)
    ax.set_xticklabels([s.display_name() for s in shapes], fontsize=fontsize)
    yticks = [0, 0.1, 0.2, 0.3]
    ax.set_yticks(yticks)
    ax.set_yticklabels(['{:g}'.format(t) for t in yticks], fontsize=fontsize)
    return fig
def plot_change_width_scatter(data, fitter, fits):
    """Scatter the change-distribution width of each single fit vs. bootstrap.

    For every fit, the width of the distribution computed from fit.theta is
    the x value and the width of the stored fit.change_distribution_weights
    is the y value. A red cross marks the mean of both, and a dashed diagonal
    marks equality. Axis limits and ticks are the integer range that covers
    all widths.

    fits must already carry change_distribution_params and per-fit
    change_distribution_weights. data is not used.

    Returns the matplotlib figure.
    """
    params = fits.change_distribution_params
    bin_edges = params.bin_edges
    bin_centers = params.bin_centers
    shape = fitter.shape

    points = []
    for fit in iterate_fits(fits):
        single_weights = calc_change_distribution(shape, fit.theta, bin_edges)
        from_single = change_distribution_width_cumsum(bin_centers, single_weights)
        from_bootstrap = change_distribution_width_cumsum(
            bin_centers, fit.change_distribution_weights)
        points.append((from_single, from_bootstrap))
    width_single, width_bootstrap = zip(*points)

    fig = plt.figure()
    ax = fig.add_axes([0.12, 0.12, 0.8, 0.8])

    # Integer bounds enclosing every width on either axis.
    largest = max(max(width_single), max(width_bootstrap))
    smallest = min(min(width_single), min(width_bootstrap))
    maxw = int(math.ceil(largest))
    minw = int(math.floor(smallest))

    ax.scatter(width_single, width_bootstrap, alpha=0.8)
    ax.plot(np.mean(width_single), np.mean(width_bootstrap), 'rx',
            markersize=8, markeredgewidth=2, label='mean')
    ax.plot([minw, maxw], [minw, maxw], 'k--')
    ax.set_xlim(minw, maxw)
    ax.set_ylim(minw, maxw)

    ticks = range(minw, maxw + 1)
    tick_labels = [str(t) for t in ticks]
    ax.set_yticks(ticks)
    ax.set_xticks(ticks)
    ax.set_xticklabels(tick_labels, fontsize=cfg.fontsize)
    ax.set_yticklabels(tick_labels, fontsize=cfg.fontsize)
    ax.set_xlabel('width of single fit', fontsize=cfg.fontsize)
    ax.set_ylabel('width by bootstrap', fontsize=cfg.fontsize)
    ax.set_title('change distribution of single fit vs. bootstrap',
                 fontsize=cfg.fontsize)
    return fig
def plot_onset_times(all_data, data, fitter, fits, dct_pathways, R2_threshold,
                     b_unique):
    """Plot change distributions for the whole genome and for each pathway.

    The thick first curve comes from get_change_distribution_for_whole_genome.
    One further curve is drawn per entry of dct_pathways (sorted by name),
    using only that pathway's fits that pass R2_threshold; pathways with no
    remaining fits are reported and skipped.

    Args:
        all_data: dataset collection; its datasets give the whole-genome fit
            count shown in the legend (genes times regions per dataset).
        data: unused.
        fitter: fitter whose shape is used to compute the distributions.
        fits: fits container, restricted per pathway with restrict_genes.
        dct_pathways: mapping of pathway name to its genes.
        R2_threshold: forwarded to iterate_fits for the pathway curves.
        b_unique: unused.

    Returns:
        The matplotlib figure.
    """
    fig = plt.figure()
    ax = fig.add_axes([0.12, 0.12, 0.8, 0.8])

    # Scaled stages give both the age range and, later, the x tick labels.
    stages = [stage.scaled(age_scaler) for stage in dev_stages]
    low = min(stage.from_age for stage in stages)
    high = max(stage.to_age for stage in stages)

    n_fits = sum(len(ds.gene_names) * len(ds.region_names)
                 for ds in all_data.datasets)
    bin_edges, change_vals = get_change_distribution_for_whole_genome(
        all_data, fitter)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
    ax.plot(bin_centers, change_vals, linewidth=5,
            label='whole genome ({} fits)'.format(n_fits))

    # Every 7 consecutive pathways share a line style. The index wraps around
    # so that more than 21 pathways no longer raise IndexError.
    linestyles = ['-', '--', '-.']
    for i, (pathway_name, genes) in enumerate(sorted(dct_pathways.items())):
        pathway_fits = restrict_genes(fits, genes)
        thetas = [(g, r, fit.theta)
                  for dsname, g, r, fit in iterate_fits(
                      pathway_fits, R2_threshold=R2_threshold,
                      return_keys=True)]
        if not thetas:
            print('Skipping {}. No fits left'.format(pathway_name))
            continue
        bin_edges, change_vals = compute_change_distribution(
            fitter.shape, thetas, low, high, n_bins=50)
        bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
        style = linestyles[(i // 7) % len(linestyles)]
        label = '{} ({} fits)'.format(pathway_name, len(thetas))
        ax.plot(bin_centers, change_vals, style, linewidth=3, label=label)

    ax.legend(loc='best', fontsize=18, frameon=False)
    ax.set_ylabel('expression change magnitude', fontsize=fontsize)

    # set the development stages as x labels
    ax.set_xticks([stage.central_age for stage in stages])
    ax.set_xticklabels([stage.short_name for stage in stages],
                       fontsize=fontsize, fontstretch='condensed', rotation=90)

    # keep only the lowest and highest y tick
    yticks = ax.get_yticks()
    yticks = [yticks[0], yticks[-1]]
    ax.set_yticks(yticks)
    ax.set_yticklabels(['{:g}'.format(t) for t in yticks], fontsize=fontsize)

    # mark birth time with a vertical line
    ymin, ymax = ax.get_ylim()
    birth_age = age_scaler.scale(0)
    ax.plot([birth_age, birth_age], [ymin, ymax], '--', color='0.85')
    return fig
def main():
    """Return the (g, r) keys of selected fits among those from get_fits().

    Only fits yielded by iterate_fits with R2_threshold=0.5 are considered.
    A fit is selected when the product of the second and fourth entries of
    its theta (h and w) is not positive and abs(w) is below 0.5.
    """
    fits = get_fits()

    def cond(fit):
        # theta has four entries; only h (second) and w (fourth) matter here.
        _, h, _, w = fit.theta
        return not (h * w > 0) and abs(w) < 0.5

    selected = []
    for dsname, g, r, fit in iterate_fits(fits, R2_threshold=0.5,
                                          return_keys=True):
        if cond(fit):
            selected.append((g, r))
    return selected
def plot_comparison_bar(data, shapes, all_fits):
    """Compare the LOO scores of exactly two shapes with a bar chart.

    Scores are paired through iterate_fits over both fit collections
    (R2_threshold=-1). Half of the Wilcoxon signed-rank p-value for the
    paired scores is printed as the one-sided p-value. Bars show each shape's
    mean score with its standard error, ordered by decreasing mean.

    data is not used. Returns the matplotlib figure.
    """
    n = len(shapes)
    assert len(all_fits) == n
    assert n == 2

    first_fits, second_fits = all_fits[0], all_fits[1]
    paired = [(f1.LOO_score, f2.LOO_score)
              for f1, f2 in iterate_fits(first_fits, second_fits,
                                         R2_threshold=-1)]
    scores1, scores2 = zip(*paired)

    _, pval = scipy.stats.wilcoxon(scores1, scores2)
    pval = pval / 2  # one sided p-value
    print('*** wilcoxon signed rank p-value (one sided) = {:.3g}'.format(pval))

    mu = np.empty(n)
    se = np.empty(n)
    for position, group in enumerate((scores1, scores2)):
        mu[position] = np.mean(group)
        se[position] = scipy.stats.sem(group)

    # reorder by mean score, best first
    order = np.argsort(mu)[::-1]
    mu = mu[order]
    se = se[order]
    shapes = [shapes[k] for k in order]

    positions = np.arange(n)
    bar_width = 0.8
    fig = plt.figure()
    ax = fig.add_axes([0.12, 0.12, 0.8, 0.8])
    ax.bar(positions, mu, yerr=se, width=bar_width, color='b',
           error_kw={'ecolor': '0.3', 'linewidth': 2})
    ax.set_xlabel('shape', fontsize=fontsize)
    ax.set_ylabel('Mean $R^2$', fontsize=fontsize)
    ax.set_xticks(positions + bar_width / 2)
    ax.set_xticklabels([s.cache_name() for s in shapes], fontsize=fontsize)
    yticks = [0, 0.1, 0.2, 0.3]
    ax.set_yticks(yticks)
    ax.set_yticklabels(['{:g}'.format(t) for t in yticks], fontsize=fontsize)
    return fig
def analyze_variant(theta, sigma):
    """Summarize the LOO scores of one prior configuration of Sigslope.

    Args:
        theta: truthy to build the shape with priors_name, otherwise with no
            priors.
        sigma: truthy to use the 'normal' sigma prior, otherwise none.

    Returns:
        Bunch with theta, sigma, LOO_scores (None scores removed) and the
        mu and sem values returned by bootstrap(LOO_scores, np.mean).
    """
    shape = Sigslope(priors_name if theta else None)
    fitter = Fitter(shape, 'normal' if sigma else None)
    # Only previously computed fits are used; nothing new is computed here.
    fits = get_all_fits(data, fitter, allow_new_computation=False)

    LOO_scores = []
    for f in iterate_fits(fits):
        if f.LOO_score is not None:
            LOO_scores.append(f.LOO_score)

    mu, sem = bootstrap(LOO_scores, np.mean)
    return Bunch(theta=theta, sigma=sigma, LOO_scores=LOO_scores, mu=mu, sem=sem)
def plot_comparison_scatter(data1, fitter1, fits1, data2, fitter2, fits2):
    """Scatter the LOO scores of fits1 (x) against those of fits2 (y).

    Both axes span [-1, 1] and a dashed diagonal marks equal scores. The
    title names the two shapes plus data1.name and data1.pathway; data2 is
    not used.

    Returns the matplotlib figure.
    """
    score_pairs = [(fit1.LOO_score, fit2.LOO_score)
                   for fit1, fit2 in iterate_fits(fits1, fits2)]
    xs, ys = zip(*score_pairs)

    fig = plt.figure()
    plt.scatter(xs, ys, alpha=0.5)
    plt.plot([-1, 1], [-1, 1], 'k--')
    plt.xlim(-1, 1)
    plt.ylim(-1, 1)

    title_lines = [
        r'Comparison of scores using {} vs. {}'.format(fitter1.shape,
                                                      fitter2.shape),
        r'{}, {}'.format(data1.name, data1.pathway),
    ]
    plt.title('\n'.join(title_lines), fontsize=cfg.fontsize)
    plt.xlabel('R2 for {}'.format(fitter1.shape), fontsize=cfg.fontsize)
    plt.ylabel('R2 for {}'.format(fitter2.shape), fontsize=cfg.fontsize)
    return fig
def analyze_paired_scores_with_and_without_priors(n_best=10):
    """Compare paired LOO scores of Sigslope fits without and with priors.

    Prints half of the Wilcoxon signed-rank p-value of the paired scores as
    the one-sided p-value, then lists the gene/region pairs whose score gains
    the most from the priors.

    Args:
        n_best: number of top improvements to print. Previously the argument
            was ignored and 10 entries were always printed.
    """
    nFitter = Fitter(Sigslope())
    yFitter = Fitter(Sigslope(priors_name), 'normal')
    # Only cached fits are used; no new fits are computed here.
    nFits = get_all_fits(data, nFitter, allow_new_computation=False)
    yFits = get_all_fits(data, yFitter, allow_new_computation=False)

    score_pairs = [(f1.LOO_score, f2.LOO_score)
                   for f1, f2 in iterate_fits(nFits, yFits)]
    nScores, yScores = zip(*score_pairs)
    _, pval = scipy.stats.wilcoxon(nScores, yScores)
    pval = pval / 2  # one sided p-value
    print('*** wilcoxon signed rank p-value (one sided) = {:.3g}'.format(pval))

    # find examples of best improvements
    diffs = [(f2.LOO_score - f1.LOO_score, f1.LOO_score, f2.LOO_score, g, r)
             for dsname, g, r, f1, f2 in iterate_fits(
                 nFits, yFits, R2_threshold=-1, return_keys=True)]
    # Tuples sort on their first element, so the largest gain comes first.
    diffs.sort(reverse=True)

    print('Gene/Regions for which priors produce best R2 improvement:')
    for i, (delta, R2_without, R2_with, g, r) in enumerate(diffs[:n_best]):
        # Explicit keywords instead of **locals(): the template only sees the
        # names it actually uses.
        print('{i}) {g}@{r}, delta-R2={delta:.3g}. R2_without={R2_without:.3g}, R2_with={R2_with:.3g}'.format(
            i=i, g=g, r=r, delta=delta, R2_without=R2_without,
            R2_with=R2_with))
def analyze_variant(theta, sigma):
    """Collect and summarize LOO scores for one Sigslope prior setting.

    Args:
        theta: if truthy the shape is built with priors_name, else without
            priors.
        sigma: if truthy the fitter uses the 'normal' sigma prior, else none.

    Returns:
        Bunch holding theta, sigma, the non-None LOO_scores, and the mu/sem
        pair produced by bootstrap(LOO_scores, np.mean).
    """
    if theta:
        theta_priors = priors_name
    else:
        theta_priors = None
    if sigma:
        sigma_prior = 'normal'
    else:
        sigma_prior = None

    fitter = Fitter(Sigslope(theta_priors), sigma_prior)
    # allow_new_computation=False: rely on fits that already exist.
    fits = get_all_fits(data, fitter, allow_new_computation=False)

    LOO_scores = []
    for f in iterate_fits(fits):
        if f.LOO_score is not None:
            LOO_scores.append(f.LOO_score)

    mu, sem = bootstrap(LOO_scores, np.mean)
    summary = Bunch(
        theta=theta,
        sigma=sigma,
        LOO_scores=LOO_scores,
        mu=mu,
        sem=sem,
    )
    return summary
def plot_onset_times(all_data, data, fitter, fits, dct_pathways, R2_threshold,
                     b_unique):
    """Plot the whole-genome change distribution together with each pathway's.

    The first (thick) curve is taken from
    get_change_distribution_for_whole_genome. For each pathway in
    dct_pathways (in sorted name order) a further curve is computed from the
    pathway's fits that pass R2_threshold; a pathway with no such fits is
    reported and skipped.

    Args:
        all_data: dataset collection used for the whole-genome curve and for
            its fit count (genes times regions summed over datasets).
        data: unused.
        fitter: fitter whose shape drives the distribution computation.
        fits: fits container; narrowed per pathway via restrict_genes.
        dct_pathways: mapping from pathway name to its genes.
        R2_threshold: passed to iterate_fits when selecting pathway fits.
        b_unique: unused.

    Returns:
        The matplotlib figure.
    """
    fig = plt.figure()
    ax = fig.add_axes([0.12, 0.12, 0.8, 0.8])

    # Computed once: used for the age range here and the x ticks below.
    stages = [stage.scaled(age_scaler) for stage in dev_stages]
    low = min(stage.from_age for stage in stages)
    high = max(stage.to_age for stage in stages)

    n_fits = sum(len(ds.gene_names) * len(ds.region_names)
                 for ds in all_data.datasets)
    bin_edges, change_vals = get_change_distribution_for_whole_genome(
        all_data, fitter)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
    ax.plot(bin_centers, change_vals, linewidth=5,
            label='whole genome ({} fits)'.format(n_fits))

    # Line style changes every 7 pathways. The modulo makes the styles cycle,
    # where indexing past the third style used to raise IndexError.
    linestyles = ['-', '--', '-.']
    for i, (pathway_name, genes) in enumerate(sorted(dct_pathways.items())):
        pathway_fits = restrict_genes(fits, genes)
        thetas = [(g, r, fit.theta)
                  for dsname, g, r, fit in iterate_fits(
                      pathway_fits, R2_threshold=R2_threshold,
                      return_keys=True)]
        if not thetas:
            print('Skipping {}. No fits left'.format(pathway_name))
            continue
        bin_edges, change_vals = compute_change_distribution(
            fitter.shape, thetas, low, high, n_bins=50)
        bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
        style = linestyles[(i // 7) % len(linestyles)]
        label = '{} ({} fits)'.format(pathway_name, len(thetas))
        ax.plot(bin_centers, change_vals, style, linewidth=3, label=label)

    ax.legend(loc='best', fontsize=18, frameon=False)
    ax.set_ylabel('expression change magnitude', fontsize=fontsize)

    # set the development stages as x labels
    ax.set_xticks([stage.central_age for stage in stages])
    ax.set_xticklabels([stage.short_name for stage in stages],
                       fontsize=fontsize, fontstretch='condensed', rotation=90)

    # show only the first and last y tick
    yticks = ax.get_yticks()
    yticks = [yticks[0], yticks[-1]]
    ax.set_yticks(yticks)
    ax.set_yticklabels(['{:g}'.format(t) for t in yticks], fontsize=fontsize)

    # mark birth time with a vertical line
    ymin, ymax = ax.get_ylim()
    birth_age = age_scaler.scale(0)
    ax.plot([birth_age, birth_age], [ymin, ymax], '--', color='0.85')
    return fig
def plot_comparison_scatter(data, shape1, fits1, shape2, fits2):
    """Scatter the LOO scores of fits1 (x axis) against fits2 (y axis).

    Both axes are fixed to [-1, 1] with ticks only at the ends, and a dashed
    diagonal marks equal scores. shape1 and shape2 are used only in the axis
    labels; data is not used.

    Returns the matplotlib figure.
    """
    score_pairs = [(f1.LOO_score, f2.LOO_score)
                   for f1, f2 in iterate_fits(fits1, fits2)]
    xs, ys = zip(*score_pairs)

    fig = plt.figure()
    ax = fig.add_axes([0.12, 0.12, 0.8, 0.8])
    ax.scatter(xs, ys, alpha=0.3)
    ax.plot([-1, 1], [-1, 1], 'k--')
    ax.set_xlim(-1, 1)
    ax.set_ylim(-1, 1)

    ticks = [-1, 1]
    tick_labels = [str(t) for t in ticks]
    ax.set_yticks(ticks)
    ax.set_xticks(ticks)
    ax.set_xticklabels(tick_labels, fontsize=fontsize)
    ax.set_yticklabels(tick_labels, fontsize=fontsize)
    ax.set_xlabel('$R^2$ for {}'.format(shape1), fontsize=fontsize)
    ax.set_ylabel('$R^2$ for {}'.format(shape2), fontsize=fontsize)
    return fig
def plot_comparison_scatter(data, shape1, fits1, shape2, fits2):
    """Draw paired LOO scores: fits1 on the x axis, fits2 on the y axis.

    The plot area is the square [-1, 1] x [-1, 1] with end ticks only and a
    dashed equality diagonal. shape1/shape2 appear in the axis labels; data
    is not used.

    Returns the matplotlib figure.
    """
    paired_scores = []
    for f1, f2 in iterate_fits(fits1, fits2):
        paired_scores.append((f1.LOO_score, f2.LOO_score))
    scores1, scores2 = zip(*paired_scores)

    fig = plt.figure()
    ax = fig.add_axes([0.12, 0.12, 0.8, 0.8])
    ax.scatter(scores1, scores2, alpha=0.3)
    ax.plot([-1, 1], [-1, 1], 'k--')
    ax.set_xlim(-1, 1)
    ax.set_ylim(-1, 1)

    end_ticks = [-1, 1]
    ax.set_yticks(end_ticks)
    ax.set_xticks(end_ticks)
    ax.set_xticklabels([str(t) for t in end_ticks], fontsize=fontsize)
    ax.set_yticklabels([str(t) for t in end_ticks], fontsize=fontsize)

    x_label = '$R^2$ for {}'.format(shape1)
    y_label = '$R^2$ for {}'.format(shape2)
    ax.set_xlabel(x_label, fontsize=fontsize)
    ax.set_ylabel(y_label, fontsize=fontsize)
    return fig
def get_onset_times(data, fitter, R2_threshold, b_force=False):
    """Return (bin_edges, change_vals), using a pickle cache on disk.

    The cache file lives under cache_dir() at
    fit_results_relative_path(data, fitter) + '.pkl'.

    Args:
        data: data set passed to get_all_fits.
        fitter: fitter passed to get_all_fits; its shape is used to compute
            the distribution.
        R2_threshold: forwarded to iterate_fits when selecting fits.
        b_force: when true, ignore an existing cache file and recompute.
            Previously this argument had no effect.

    Returns:
        (bin_edges, change_vals) as produced by compute_change_distribution
        with 50 bins over the scaled development-stage age range.

    NOTE(review): the cache file name does not depend on R2_threshold, so a
    cached result may come from a different threshold unless b_force is set.
    """
    filename = join(cache_dir(),
                    fit_results_relative_path(data, fitter) + '.pkl')

    if isfile(filename) and not b_force:
        print('Loading onset distribution from {}'.format(filename))
        # NOTE(review): the file is opened in text mode here and on write, as
        # before; binary mode would be required under Python 3.
        with open(filename) as f:
            bin_edges, change_vals = pickle.load(f)
        return bin_edges, change_vals

    print('Computing...')
    fits = get_all_fits(data, fitter)
    thetas = [fit.theta
              for fit in iterate_fits(fits, R2_threshold=R2_threshold)]
    stages = [stage.scaled(age_scaler) for stage in dev_stages]
    low = min(stage.from_age for stage in stages)
    high = max(stage.to_age for stage in stages)
    bin_edges, change_vals = compute_change_distribution(
        fitter.shape, thetas, low, high, n_bins=50)

    print('Saving result to {}'.format(filename))
    ensure_dir(dirname(filename))
    with open(filename, 'w') as f:
        pickle.dump((bin_edges, change_vals), f)
    return bin_edges, change_vals
def add_change_distributions(data, fitter, fits, age_range=None, n_bins=50):
    """
    Compute a per-fit histogram of "strength of transition" across ages.

    The shared bin edges/centers are saved on
    fits.change_distribution_params, and each fit object gets
    change_distribution_weights, change_distribution_spread and
    change_distribution_mean_std.

    Works only for sigmoid fits for now: it uses the h parameter explicitly,
    relies on monotonicity, etc. Extending it to other shapes is probably not
    too hard.
    """
    shape = fitter.shape
    supported = ['sigmoid', 'sigslope']
    # the function currently works only for sigmoid/sigslope fits
    assert shape.cache_name() in supported

    bin_edges, bin_centers = get_bins(data, age_range, n_bins)
    params = Bunch(
        bin_edges=bin_edges,
        bin_centers=bin_centers,
    )
    fits.change_distribution_params = params

    # return_keys=True yields the keys as well, but only the fit is needed.
    for keyed in iterate_fits(fits, return_keys=True):
        dsname, g, r, fit = keyed
        weights = calc_bootstrap_change_distribution(
            shape, fit.theta_samples, bin_edges)
        spread = change_distribution_spread_cumsum(bin_centers, weights)
        mean_std = change_distribution_mean_and_std(bin_centers, weights)
        fit.change_distribution_weights = weights
        fit.change_distribution_spread = spread
        fit.change_distribution_mean_std = mean_std
def analyze_paired_scores_with_and_without_priors(n_best=10):
    """Report how priors change the paired LOO scores of Sigslope fits.

    Prints half of the Wilcoxon signed-rank p-value for the paired scores
    (no priors vs. priors) as the one-sided p-value, followed by the
    gene/region pairs with the largest score improvement from the priors.

    Args:
        n_best: how many of the top improvements to print. The original code
            ignored this argument and always printed 10.
    """
    nFitter = Fitter(Sigslope())
    yFitter = Fitter(Sigslope(priors_name), 'normal')
    # Existing fits only: allow_new_computation=False.
    nFits = get_all_fits(data, nFitter, allow_new_computation=False)
    yFits = get_all_fits(data, yFitter, allow_new_computation=False)

    score_pairs = [(f1.LOO_score, f2.LOO_score)
                   for f1, f2 in iterate_fits(nFits, yFits)]
    nScores, yScores = zip(*score_pairs)
    _, pval = scipy.stats.wilcoxon(nScores, yScores)
    pval = pval / 2  # one sided p-value
    print('*** wilcoxon signed rank p-value (one sided) = {:.3g}'.format(pval))

    # find examples of best improvements
    diffs = [(f2.LOO_score - f1.LOO_score, f1.LOO_score, f2.LOO_score, g, r)
             for dsname, g, r, f1, f2 in iterate_fits(
                 nFits, yFits, R2_threshold=-1, return_keys=True)]
    # Sorting the tuples in reverse puts the largest improvement first.
    diffs.sort(reverse=True)

    print('Gene/Regions for which priors produce best R2 improvement:')
    for i, (delta, R2_without, R2_with, g, r) in enumerate(diffs[:n_best]):
        # Pass the fields explicitly rather than via **locals().
        print('{i}) {g}@{r}, delta-R2={delta:.3g}. R2_without={R2_without:.3g}, R2_with={R2_with:.3g}'.format(
            i=i, g=g, r=r, delta=delta, R2_without=R2_without,
            R2_with=R2_with))
def plot_change_width_scatter(data, fitter, fits):
    """Compare change-distribution widths: single fit (x) vs. bootstrap (y).

    The x value of each point is the width of the distribution computed from
    fit.theta; the y value is the width of the fit's stored
    change_distribution_weights. The mean point is drawn as a red cross and
    the dashed diagonal marks equal widths. Limits and ticks cover the
    integer range enclosing all widths.

    Requires fits.change_distribution_params and per-fit
    change_distribution_weights to be present already. data is not used.

    Returns the matplotlib figure.
    """
    bin_edges = fits.change_distribution_params.bin_edges
    bin_centers = fits.change_distribution_params.bin_centers
    shape = fitter.shape

    def width_of(weights):
        return change_distribution_width_cumsum(bin_centers, weights)

    width_pairs = [
        (width_of(calc_change_distribution(shape, fit.theta, bin_edges)),
         width_of(fit.change_distribution_weights))
        for fit in iterate_fits(fits)
    ]
    width_single, width_bootstrap = zip(*width_pairs)

    fig = plt.figure()
    ax = fig.add_axes([0.12, 0.12, 0.8, 0.8])

    # Round outwards so every point lies inside the integer-aligned square.
    upper = max(max(width_single), max(width_bootstrap))
    lower = min(min(width_single), min(width_bootstrap))
    maxw = int(math.ceil(upper))
    minw = int(math.floor(lower))

    ax.scatter(width_single, width_bootstrap, alpha=0.8)
    ax.plot(np.mean(width_single), np.mean(width_bootstrap), 'rx',
            markersize=8, markeredgewidth=2, label='mean')
    ax.plot([minw, maxw], [minw, maxw], 'k--')
    ax.set_xlim(minw, maxw)
    ax.set_ylim(minw, maxw)

    ticks = range(minw, maxw + 1)
    ax.set_yticks(ticks)
    ax.set_xticks(ticks)
    ax.set_xticklabels([str(t) for t in ticks], fontsize=cfg.fontsize)
    ax.set_yticklabels([str(t) for t in ticks], fontsize=cfg.fontsize)
    ax.set_xlabel('width of single fit', fontsize=cfg.fontsize)
    ax.set_ylabel('width by bootstrap', fontsize=cfg.fontsize)
    ax.set_title('change distribution of single fit vs. bootstrap',
                 fontsize=cfg.fontsize)
    return fig
def plot_theta_diff_scatter(show_title=False):
    """Scatter the score change from priors against the no-priors score.

    Fits are loaded for Sigslope with priors (plus a 'normal' sigma prior)
    and without priors. For each pair of fits where both LOO scores are not
    None, x is the no-priors score and y is (with priors - no priors). A
    dashed horizontal line marks zero change.

    Args:
        show_title: when true, add a title to the axes. This used to raise
            TypeError because the code called ax.title(...), which is a Text
            attribute rather than a method; ax.set_title is used now.

    Returns:
        The matplotlib figure.
    """
    yFitter = Fitter(Sigslope(priors_name), 'normal')
    nFitter = Fitter(Sigslope())
    yFits = get_all_fits(data, yFitter)
    nFits = get_all_fits(data, nFitter)

    pairs = [(nFit.LOO_score, yFit.LOO_score)
             for nFit, yFit in iterate_fits(nFits, yFits)]
    # Skip pairs where either score is missing.
    diff_pairs = [(n, y - n) for n, y in pairs
                  if n is not None and y is not None]
    n, d = zip(*diff_pairs)

    fig = plt.figure()
    ax = fig.add_axes([0.15, 0.12, 0.8, 0.8])
    ax.scatter(n, d, alpha=0.5)
    # Draw the zero line across the current x range, then restore the limits
    # so that the extra line does not rescale the axis.
    xlims = ax.get_xlim()
    ax.plot(xlims, [0, 0], 'k--')
    ax.set_xlim(*xlims)
    if show_title:
        ax.set_title(r'Improvement from prior on $\theta$ vs. baseline $R^2$',
                     fontsize=fontsize)
    ax.set_xlabel(r'$R^2$(no priors)', fontsize=fontsize)
    ax.set_ylabel(r'$R^2$($\theta$) - $R^2$(no priors)', fontsize=fontsize)
    ax.tick_params(axis='both', labelsize=fontsize)
    return fig
def plot_comparison_over_R2_score(data, shapes, all_fits, zoom=None, nbins=50):
    """Plot the density of LOO scores for each shape as a line.

    Scores below -0.999 are raised to -0.999 before the histogram is taken.
    Each shape's normalized histogram is drawn at the bin centers and
    labelled with shape.cache_name(). The x range is zoom (default (-1, 1))
    and the y range goes up to 1.1 times the tallest bin inside that range.

    data is not used. Returns the matplotlib figure.
    """
    if zoom is None:
        zoom = (-1, 1)

    fig = plt.figure()
    ax = fig.add_axes([0.12, 0.12, 0.8, 0.8])

    zoom_max = 0
    for shape, fits in zip(shapes, all_fits):
        scores = np.array([f.LOO_score for f in iterate_fits(fits)])
        too_low = scores < -0.999
        scores[too_low] = -0.999

        density, edges = np.histogram(scores, bins=nbins, density=True)
        centers = (edges[:-1] + edges[1:]) / 2

        # Track the tallest bin that is visible inside the zoom window.
        visible = (centers >= zoom[0]) & (centers <= zoom[1])
        zoom_max = max(max(density[visible]), zoom_max)

        ax.plot(centers, density, linewidth=3, label=shape.cache_name())

    ax.set_xlim(*zoom)
    ax.set_ylim(0, zoom_max * 1.1)
    ax.legend(loc='best', fontsize=fontsize, frameon=False)
    ax.set_xlabel('test $R^2$ score', fontsize=fontsize)
    ax.set_ylabel("probability density", fontsize=fontsize)
    ax.tick_params(axis='both', labelsize=fontsize)
    return fig
def plot_comparison_over_R2_score(data, shapes, all_fits, zoom=None, nbins=50):
    """Draw one LOO-score density curve per shape.

    Any score below -0.999 is set to -0.999, then a normalized histogram with
    nbins bins is plotted at its bin centers, labelled by
    shape.cache_name(). The x limits are zoom (default (-1, 1)); the y limit
    is 1.1 times the highest bin whose center lies inside zoom.

    data is not used. Returns the matplotlib figure.
    """
    zoom = (-1, 1) if zoom is None else zoom

    fig = plt.figure()
    ax = fig.add_axes([0.12, 0.12, 0.8, 0.8])

    zoom_max = 0
    for shape, fits in zip(shapes, all_fits):
        raw_scores = [f.LOO_score for f in iterate_fits(fits)]
        scores = np.array(raw_scores)
        scores[scores < -0.999] = -0.999

        heights, bin_edges = np.histogram(scores, bins=nbins, density=True)
        left_edges = bin_edges[:-1]
        right_edges = bin_edges[1:]
        xpos = (left_edges + right_edges) / 2

        # Only bins within the zoom window decide the y-axis height.
        inside_zoom = (xpos >= zoom[0]) & (xpos <= zoom[1])
        zoom_data = heights[inside_zoom]
        zoom_max = max(max(zoom_data), zoom_max)

        ax.plot(xpos, heights, linewidth=3, label=shape.cache_name())

    ax.set_xlim(*zoom)
    ax.set_ylim(0, zoom_max * 1.1)
    ax.legend(loc='best', fontsize=fontsize, frameon=False)
    ax.set_xlabel('test $R^2$ score', fontsize=fontsize)
    ax.set_ylabel("probability density", fontsize=fontsize)
    ax.tick_params(axis='both', labelsize=fontsize)
    return fig
def plot_bootstrap_onset_variance(data, fits):
    """Plot, per fit, the bootstrap std of mu against the fitted mu.

    The x value is the third entry of fit.theta (mu); the y value is the
    standard deviation of the third row of fit.theta_samples, i.e. mu over
    all bootstrap samples. The x axis carries the scaled development stages
    as tick labels, and a dashed vertical line marks the scaled age 0
    (birth).

    Args:
        data: unused; present for a uniform plotting signature.
        fits: fits container understood by iterate_fits.

    Returns:
        The matplotlib figure.
    """
    mu_values = []
    mu_stds = []
    for fit in iterate_fits(fits):
        # Only mu (third of the four theta entries) is needed.
        _, _, mu_global, _ = fit.theta
        # theta_samples has one column per bootstrap sample, so row 2 is the
        # vector of mu samples; slicing replaces the per-sample copy loop.
        mu_samples = fit.theta_samples[2, :]
        mu_values.append(mu_global)
        mu_stds.append(np.std(mu_samples))

    fig = plt.figure()
    ax = fig.add_axes([0.12, 0.12, 0.8, 0.8])
    ax.plot(mu_values, mu_stds, 'bx')
    ax.set_ylabel('onset time bootstrap std', fontsize=cfg.fontsize)

    # set the development stages as x labels
    stages = [stage.scaled(age_scaler) for stage in dev_stages]
    ax.set_xticks([stage.central_age for stage in stages])
    ax.set_xticklabels([stage.short_name for stage in stages],
                       fontsize=cfg.fontsize, fontstretch='condensed',
                       rotation=90)

    # show only the first and last y tick
    yticks = ax.get_yticks()
    yticks = [yticks[0], yticks[-1]]
    ax.set_yticks(yticks)
    ax.set_yticklabels(['{:.1g}'.format(t) for t in yticks],
                       fontsize=cfg.fontsize)

    # mark birth time with a vertical line
    ymin, ymax = ax.get_ylim()
    birth_age = age_scaler.scale(0)
    ax.plot([birth_age, birth_age], [ymin, ymax], '--', color='0.85')
    return fig
def plot_comparison_bar(data, shapes, all_fits):
    """Bar chart comparing the mean LOO score of exactly two shapes.

    The two fit collections are iterated together (R2_threshold=-1) to get
    paired scores. Half of the Wilcoxon signed-rank p-value is printed as the
    one-sided p-value. Each bar is a shape's mean score with a standard-error
    bar, and bars are sorted by decreasing mean.

    data is not used. Returns the matplotlib figure.
    """
    n = len(shapes)
    assert len(all_fits) == n
    assert n == 2

    score_pairs = []
    for f1, f2 in iterate_fits(all_fits[0], all_fits[1], R2_threshold=-1):
        score_pairs.append((f1.LOO_score, f2.LOO_score))
    scores1, scores2 = zip(*score_pairs)

    _, two_sided = scipy.stats.wilcoxon(scores1, scores2)
    pval = two_sided / 2  # one sided p-value
    print('*** wilcoxon signed rank p-value (one sided) = {:.3g}'.format(pval))

    mu = np.empty(n)
    se = np.empty(n)
    for i, scores in enumerate([scores1, scores2]):
        mu[i] = np.mean(scores)
        se[i] = scipy.stats.sem(scores)

    # reorder by mean score (highest first)
    ranking = np.argsort(mu)[::-1]
    mu, se = mu[ranking], se[ranking]
    shapes = [shapes[i] for i in ranking]

    index = np.arange(n)
    bar_width = 0.8
    error_style = {'ecolor': '0.3', 'linewidth': 2}
    fig = plt.figure()
    ax = fig.add_axes([0.12, 0.12, 0.8, 0.8])
    ax.bar(index, mu, yerr=se, width=bar_width, color='b',
           error_kw=error_style)
    ax.set_xlabel('shape', fontsize=fontsize)
    ax.set_ylabel('Mean $R^2$', fontsize=fontsize)
    ax.set_xticks(index + bar_width / 2)
    ax.set_xticklabels([s.cache_name() for s in shapes], fontsize=fontsize)
    yticks = [0, 0.1, 0.2, 0.3]
    ax.set_yticks(yticks)
    ax.set_yticklabels(['{:g}'.format(t) for t in yticks], fontsize=fontsize)
    return fig
def plot_theta_diff_scatter(show_title=False):
    """Plot the score gain from priors as a function of the no-priors score.

    Loads fits for Sigslope with priors (and a 'normal' sigma prior) and
    without priors. Each point has x = no-priors LOO score and
    y = (with-priors score - no-priors score); pairs with a None score are
    skipped. A dashed horizontal line at zero separates gains from losses.

    Args:
        show_title: when true, add a title to the axes. The original code
            called ax.title(...), which is a Text attribute and not callable,
            so this path raised TypeError; it now uses ax.set_title.

    Returns:
        The matplotlib figure.
    """
    yFitter = Fitter(Sigslope(priors_name), 'normal')
    nFitter = Fitter(Sigslope())
    yFits = get_all_fits(data, yFitter)
    nFits = get_all_fits(data, nFitter)

    pairs = [(nFit.LOO_score, yFit.LOO_score)
             for nFit, yFit in iterate_fits(nFits, yFits)]
    # Keep only pairs where both scores exist.
    diff_pairs = [(n, y - n) for n, y in pairs
                  if n is not None and y is not None]
    n, d = zip(*diff_pairs)

    fig = plt.figure()
    ax = fig.add_axes([0.15, 0.12, 0.8, 0.8])
    ax.scatter(n, d, alpha=0.5)
    # Capture the x limits before adding the zero line, then restore them so
    # that the line itself does not change the view.
    xlims = ax.get_xlim()
    ax.plot(xlims, [0, 0], 'k--')
    ax.set_xlim(*xlims)
    if show_title:
        ax.set_title(r'Improvement from prior on $\theta$ vs. baseline $R^2$',
                     fontsize=fontsize)
    ax.set_xlabel(r'$R^2$(no priors)', fontsize=fontsize)
    ax.set_ylabel(r'$R^2$($\theta$) - $R^2$(no priors)', fontsize=fontsize)
    ax.tick_params(axis='both', labelsize=fontsize)
    return fig
cfg.verbosity = 1 age_scaler = LogScaler() pathway = '17full' data = GeneData.load('both').restrict_pathway(pathway).scale_ages(age_scaler) data_shuffled = GeneData.load('both').restrict_pathway(pathway).scale_ages( age_scaler).shuffle() shape = Sigmoid('sigmoid_wide') fitter = Fitter(shape, sigma_prior='normal') fits = get_all_fits(data, fitter, allow_new_computation=False) fits_shuffled = get_all_fits(data_shuffled, fitter, allow_new_computation=False) R2_pairs = [(fit.LOO_score, fit2.LOO_score) for fit, fit2 in iterate_fits(fits, fits_shuffled)] R2 = np.array([r for r, r_shuffled in R2_pairs]) R2_shuffled = np.array([r_shuffled for r, r_shuffled in R2_pairs]) name = '{}-{}'.format(data.pathway, shape.cache_name()) fig = plot_score_distribution(R2, R2_shuffled) save_figure(fig, 'RP/R2-distribution-{}.png'.format(name), under_results=True, b_close=True) mu_shuffled = np.mean(R2_shuffled) std_shuffled = np.std(R2_shuffled) z_scores = (R2 - mu_shuffled) / std_shuffled fig = plot_z_scores(z_scores) save_figure(fig,
ttl = '\n'.join([ttl, ttl_fit]) plt.title(ttl) return vals cfg.verbosity = 1 age_scaler = LogScaler() pathway = 'serotonin' data = GeneData.load('both').restrict_pathway(pathway).scale_ages(age_scaler) shape = Sigmoid() fitter = Fitter(shape) fits = get_all_fits(data,fitter, allow_new_computation=False) def translate(g,r,fit): series = data.get_one_series(g,r) theta,sigma = fitter.translate_parameters_to_priors_scale(series.ages, series.single_expression, fit.theta, fit.sigma) a,h,mu,w = theta if h < 0: theta = (a+h,-h,mu,-w) # this is an equivalent sigmoid, with h now positive return Bunch( theta = theta, sigma = sigma, ) flat_fits = [translate(g,r,fit) for dsname,g,r,fit in iterate_fits(fits, return_keys=True)] # This script is meant to be run as a setup, then run commands interactively, e.g.: create_hist(flat_fits, 'a', -2, 1) create_hist(flat_fits, 'h', -1, 3) create_hist(flat_fits, 'w', -0.5, 1) create_hist(flat_fits, 'mu', -2, 2) create_hist(flat_fits, 'p', 0, 10)
# Load the pathway data and the previously computed sigmoid fits.
data = GeneData.load('both').restrict_pathway(pathway).scale_ages(age_scaler)
shape = Sigmoid()
fitter = Fitter(shape)
fits = get_all_fits(data, fitter, allow_new_computation=False)


def translate(g, r, fit):
    """Return the fit's theta and sigma translated to the priors scale.

    The translation is done by fitter.translate_parameters_to_priors_scale
    on the series for (g, r). If the second theta entry (h) is negative, the
    parameters are rewritten as the equivalent sigmoid with positive h.
    """
    series = data.get_one_series(g, r)
    theta, sigma = fitter.translate_parameters_to_priors_scale(
        series.ages, series.single_expression, fit.theta, fit.sigma)
    p_a, p_h, p_mu, p_w = theta
    if p_h < 0:
        # this is an equivalent sigmoid, with h now positive
        theta = (p_a + p_h, -p_h, p_mu, -p_w)
    return Bunch(theta=theta, sigma=sigma)


flat_fits = []
for dsname, g, r, fit in iterate_fits(fits, return_keys=True):
    flat_fits.append(translate(g, r, fit))

# This script is meant to be run as a setup, then run commands interactively, e.g.:
create_hist(flat_fits, 'a', -2, 1)
create_hist(flat_fits, 'h', -1, 3)
create_hist(flat_fits, 'w', -0.5, 1)
create_hist(flat_fits, 'mu', -2, 2)
create_hist(flat_fits, 'p', 0, 10)
import setup
import config as cfg
from load_data import GeneData
from shapes.sigmoid import Sigmoid
from fitter import Fitter
from all_fits import get_all_fits, iterate_fits
from scalers import LogScaler

# Script: list the (g, r) keys of serotonin-pathway sigmoid fits (no priors)
# that pass R2_threshold=0.5 and whose first theta entry exceeds 100 in
# absolute value.
cfg.verbosity = 1
age_scaler = LogScaler()
pathway = 'serotonin'
data = GeneData.load('both').restrict_pathway(pathway).scale_ages(age_scaler)
fitter = Fitter(Sigmoid(priors=None))
fits = get_all_fits(data, fitter)

extreme = []
for dsname, g, r, fit in iterate_fits(fits, R2_threshold=0.5,
                                      return_keys=True):
    if abs(fit.theta[0]) > 100:
        extreme.append((g, r))
def print_diff_points(data1, fitter1, fits1, data2, fitter2, fits2, n):
    """Print the fits with the largest LOO score gaps between two fitters.

    The difference is fit1.LOO_score - fit2.LOO_score for each paired fit.
    The last n entries of the ascending sort are printed under the
    "fitter1 > fitter2" header, and the first n under the "<" header, each in
    ascending order of difference. data1 and data2 are not used.
    """
    diffs = sorted(
        (fit1.LOO_score - fit2.LOO_score, g, r, fit1.LOO_score, fit2.LOO_score)
        for dsname, g, r, fit1, fit2 in iterate_fits(fits1, fits2,
                                                     return_keys=True))

    def report(header, rows):
        # One header line, then one tab-indented line per selected fit.
        print(header.format(n, fitter1.shape, fitter2.shape))
        for diff, g, r, score1, score2 in rows:
            print('\t{}@{}: diff={:.2g}, {}={:.2g}, {}={:.2g}'.format(
                g, r, diff, fitter1.shape, score1, fitter2.shape, score2))

    report('Top {} fits where {} > {}:', diffs[-n:])
    report('Top {} fits where {} < {}:', diffs[:n])
import setup
import config as cfg
from load_data import GeneData
from shapes.sigmoid import Sigmoid
from fitter import Fitter
from all_fits import get_all_fits, iterate_fits
from scalers import LogScaler

# Script: collect the (g, r) keys of serotonin-pathway sigmoid fits (fitted
# without priors) that pass R2_threshold=0.5 and have abs(theta[0]) > 100.
cfg.verbosity = 1
age_scaler = LogScaler()
pathway = 'serotonin'

loaded = GeneData.load('both')
data = loaded.restrict_pathway(pathway).scale_ages(age_scaler)
fitter = Fitter(Sigmoid(priors=None))
fits = get_all_fits(data, fitter)

extreme = []
for dsname, g, r, fit in iterate_fits(fits, R2_threshold=0.5,
                                      return_keys=True):
    first_param = fit.theta[0]
    if abs(first_param) > 100:
        extreme.append((g, r))
ax.set_xlabel('z score', fontsize=fontsize) ax.set_ylabel('probability', fontsize=fontsize) ax.tick_params(axis='both', labelsize=fontsize) return fig cfg.verbosity = 1 age_scaler = LogScaler() pathway = '17full' data = GeneData.load('both').restrict_pathway(pathway).scale_ages(age_scaler) data_shuffled = GeneData.load('both').restrict_pathway(pathway).scale_ages(age_scaler).shuffle() shape = Sigmoid('sigmoid_wide') fitter = Fitter(shape,sigma_prior='normal') fits = get_all_fits(data,fitter,allow_new_computation=False) fits_shuffled = get_all_fits(data_shuffled,fitter,allow_new_computation=False) R2_pairs = [(fit.LOO_score,fit2.LOO_score) for fit,fit2 in iterate_fits(fits,fits_shuffled)] R2 = np.array([r for r,r_shuffled in R2_pairs]) R2_shuffled = np.array([r_shuffled for r,r_shuffled in R2_pairs]) name = '{}-{}'.format(data.pathway,shape.cache_name()) fig = plot_score_distribution(R2,R2_shuffled) save_figure(fig,'RP/R2-distribution-{}.png'.format(name), under_results=True, b_close=True) mu_shuffled = np.mean(R2_shuffled) std_shuffled = np.std(R2_shuffled) z_scores = (R2-mu_shuffled)/std_shuffled fig = plot_z_scores(z_scores) save_figure(fig,'RP/R2-z-scores-{}.png'.format(name), under_results=True, b_close=True) T, signed_rank_p_value = wilcoxon(R2, R2_shuffled) maxShuffled = R2_shuffled.max()