def _get_dataset_fits(data, dataset, fitter, k_of_n, n_correlation_iterations, correlations_k_of_n, allow_new_computation):
    def arg_mapper(gr, f_proxy):
        g, r = gr
        series = dataset.get_one_series(g, r)
        return f_proxy(series, fitter)

    # Sharding is done by gene, so plots.plot_and_save_all_genes can work on a shard.
    # This also requires that the list of all genes be taken from the whole data
    # and not from each dataset. Otherwise we can get a mismatch between the genes
    # in the shard for different datasets.
    dataset_fits = job_splitting.compute(
        name = 'fits',
        f = _compute_fit,
        arg_mapper = arg_mapper,
        all_keys = list(product(dataset.gene_names, dataset.region_names)),
        all_sharding_keys = data.gene_names,
        f_sharding_key = lambda gr: gr[0],
        k_of_n = k_of_n,
        base_filename = fit_results_relative_path(dataset, fitter),
        allow_new_computation = allow_new_computation,
    )

    if n_correlation_iterations > 0:
        # If we're using a shard for the basic fits we won't have theta for all genes
        # in a region, which is necessary for computing correlations in that region.
        assert k_of_n is None, "Can't perform correlation computations when sharding is enabled at the basic fit level"
        _add_dataset_correlation_fits(dataset, fitter, dataset_fits, n_correlation_iterations, correlations_k_of_n, allow_new_computation)

    if cfg.verbosity > 0:
        print 'Adding fit scores...',
    _add_scores(dataset, dataset_fits)
    if cfg.verbosity > 0:
        print 'done!'

    return dataset_fits

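# Illustration only (an assumption about job_splitting internals, which are defined
# elsewhere): a k-of-n shard is taken to contain every key whose sharding key falls
# in shard k. Because f_sharding_key maps (gene, region) -> gene and
# all_sharding_keys spans the whole data, all regions of a gene land in the same
# shard, in every dataset. A minimal sketch of that contract:
def _example_keys_in_shard(all_keys, all_sharding_keys, f_sharding_key, k, n):
    # round-robin assignment of sharding keys (genes) to shards; the real
    # assignment scheme may differ, but the grouping guarantee is the same
    shard_of = {key: i % n for i, key in enumerate(sorted(all_sharding_keys))}
    return [key for key in all_keys if shard_of[f_sharding_key(key)] == k]
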
def get_change_distribution_for_whole_genome(all_data, fitter):
    # NOTE: the distribution for all genes should be precomputed by running onset_times_whole_genome.py
    filename = join(cache_dir(), fit_results_relative_path(all_data, fitter) + '.pkl')
    print 'Loading whole genome onset distribution from {}'.format(filename)
    with open(filename) as f:
        bin_edges, change_vals = pickle.load(f)
    return bin_edges, change_vals

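# Example (hypothetical helper, not used by the pipeline): reading a summary
# statistic off the returned distribution. The cumulative sum of change_vals
# approximates the fraction of the total transition completed by each age, so the
# median onset age sits where it crosses 0.5.
def _example_median_onset(bin_edges, change_vals):
    bin_centers = 0.5 * (bin_edges[:-1] + bin_edges[1:])
    cdf = np.cumsum(change_vals) / np.sum(change_vals)
    return bin_centers[np.searchsorted(cdf, 0.5)]
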
def create_top_correlations_html(data, fitter, fits, scores, regions, n_top=None):
    if n_top is None:
        n_top = len(scores)

    basedir = join(results_dir(), fit_results_relative_path(data, fitter))
    ensure_dir(basedir)
    gene_dir = 'gene-subplot'
    series_dir = 'gene-region-fits'

    def key_func(score):
        g, r, pval, lst_R2 = score
        return r
    scores.sort(key=key_func)
    top_genes = [g for g, r, pval, lst_R2 in scores[:n_top]]
    top_scores = {g: r for g, r, pval, lst_R2 in scores[:n_top]}
    top_pvals = {g: pval for g, r, pval, lst_R2 in scores[:n_top]}

    def get_onset_time(fit):
        a, h, mu, _ = fit.theta
        age = age_scaler.unscale(mu)
        txt = 'onset = {:.3g} years'.format(age)
        cls = ''
        return txt, cls

    create_html(
        data, fitter, fits, basedir, gene_dir, series_dir,
        gene_names = top_genes,
        region_names = regions,
        extra_columns = [('r', top_scores), ('p-value', top_pvals)],
        extra_fields_per_fit = [get_onset_time],
        b_inline_images = True,
        b_R2_dist = False,
        ttl = 'Fit for genes with top Spearman correlations',
        filename = 'top-gradual-maturation',
    )

def create_top_genes_html(data, fitter, fits, scores, regions, n_top=None, filename_suffix=''):
    if n_top is None:
        n_top = len(scores)

    basedir = join(results_dir(), fit_results_relative_path(data, fitter))
    ensure_dir(basedir)
    gene_dir = 'gene-subplot'
    series_dir = 'gene-region-fits'

    def key_func(score):
        g, pval, qval = score
        return pval
    scores.sort(key=key_func)
    top_genes = [g for g, pval, qval in scores[:n_top]]
    top_pvals = {g: pval for g, pval, qval in scores[:n_top]}
    top_qvals = {g: qval for g, pval, qval in scores[:n_top]}

    n = len(scores)
    n05 = len([g for g, pval, qval in scores if qval < 0.05])
    n01 = len([g for g, pval, qval in scores if qval < 0.01])
    top_text = """\
<pre>
one-sided t-test: {regions[0]} < {regions[1]}
{n05}/{n} q-values < 0.05
{n01}/{n} q-values < 0.01
</pre>
""".format(**locals())

    def get_onset_time(fit):
        a, h, mu, _ = fit.theta
        age = age_scaler.unscale(mu)
        return 'onset = {:.3g} years'.format(age)

    def get_onset_dist(fit):
        mu_vals = fit.theta_samples[2, :]
        mu = mu_vals.mean()
        vLow, vHigh = np.percentile(mu_vals, (20, 80))
        mu = age_scaler.unscale(mu)
        vLow = age_scaler.unscale(vLow)
        vHigh = age_scaler.unscale(vHigh)
        txt = 'onset re-estimate (mean [20%, 80%]) = {:.3g} [{:.3g}, {:.3g}]'.format(mu, vLow, vHigh)
        cls = ''
        return txt, cls

    create_html(
        data, fitter, fits, basedir, gene_dir, series_dir,
        gene_names = top_genes,
        region_names = regions,
        extra_columns = [('p-value', top_pvals), ('q-value', top_qvals)],
        extra_fields_per_fit = [get_onset_time, get_onset_dist],
        b_inline_images = True,
        inline_image_size = '30%',
        b_R2_dist = False,
        ttl = 'Fit for genes with top t-test scores',
        top_text = top_text,
        filename = 'gradual-maturation-t-test' + filename_suffix,
    )

def export_timing_info_for_all_fits(data, fitter, fits):
    change_dist = compute_timing_info_for_all_fits(data, fitter, fits)
    README = """\
mu:
    The mean age of the change distribution for given gene and region.
    Dimensions: <n-genes> X <n-regions>

std:
    The standard deviation of the change distribution for given gene and region.
    Dimensions: <n-genes> X <n-regions>

genes:
    Gene names for the genes represented in other arrays

weights:
    The change distributions for each gene and region.
    Dimensions: <n-genes> X <n-regions> X <n-bins>

bin_centers:
    The ages for the center of each bin used in calculating the histogram in "weights".
    Dimensions: <n-bins> X 1

bin_edges:
    The edges of the bins used in calculating the change histogram.
    (centers can be calculated from the bin_edges, but it's convenient to have it pre-calculated)
    Dimensions: <n-bins + 1> X 1

regions:
    Region names for the regions represented in other arrays

age_scaler:
    The scaling used for ages (i.e. 'log' means x' = log(x + 38/52))
"""
    mdict = dict(
        README_CHANGE_DISTRIBUTIONS=README,
        genes=list_of_strings_to_matlab_cell_array(change_dist.genes),
        regions=list_of_strings_to_matlab_cell_array(change_dist.regions),
        age_scaler=scalers.unify(change_dist.age_scaler).cache_name(),
        mu=change_dist.mu,
        std=change_dist.std,
        bin_edges=change_dist.bin_edges,
        bin_centers=change_dist.bin_centers,
        weights=change_dist.weights,
    )
    filename = join(cache_dir(), fit_results_relative_path(data, fitter) + '-change-dist.mat')
    save_matfile(mdict, filename)

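# Example (sketch, not part of the export): the file written above can be read back
# in Python for a quick sanity check; the keys match the mdict written above.
def _example_read_change_dist(matfile):
    from scipy.io import loadmat
    mat = loadmat(matfile)
    # means, standard deviations and full per-gene/region distributions
    return mat['mu'], mat['std'], mat['weights']
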
def get_onset_times(data, fitter, R2_threshold, b_force=False):
    filename = join(cache_dir(), fit_results_relative_path(data, fitter) + '.pkl')
    if isfile(filename) and not b_force:
        print 'Loading onset distribution from {}'.format(filename)
        with open(filename) as f:
            bin_edges, change_vals = pickle.load(f)
    else:
        print 'Computing...'
        fits = get_all_fits(data, fitter)
        thetas = [fit.theta for fit in iterate_fits(fits, R2_threshold=R2_threshold)]
        stages = [stage.scaled(age_scaler) for stage in dev_stages]
        low = min(stage.from_age for stage in stages)
        high = max(stage.to_age for stage in stages)
        bin_edges, change_vals = compute_change_distribution(fitter.shape, thetas, low, high, n_bins=50)

        print 'Saving result to {}'.format(filename)
        ensure_dir(dirname(filename))
        with open(filename, 'w') as f:
            pickle.dump((bin_edges, change_vals), f)
    return bin_edges, change_vals

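# Sketch of what compute_change_distribution (defined elsewhere) is assumed to do:
# average the single-theta change histograms over all well-fit genes, mirroring
# calc_bootstrap_change_distribution further below. The name and body here are an
# illustration, not the actual implementation.
def _example_compute_change_distribution(shape, thetas, low, high, n_bins=50):
    bin_edges = np.linspace(low, high, n_bins + 1)
    change_vals = np.zeros(n_bins)
    for theta in thetas:
        change_vals += calc_change_distribution(shape, theta, bin_edges)
    change_vals /= len(thetas)  # average over genes
    return bin_edges, change_vals
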
def save_theta_text_files(data, fitter, fits):
    assert fitter.shape.cache_name() == 'spline', "save to text is only supported for splines at the moment"
    for dataset in data.datasets:
        filename = join(cache_dir(), fit_results_relative_path(dataset, fitter) + '.txt')
        dataset_fits = fits[dataset.name]
        print 'Saving text file to {}'.format(filename)
        with open(filename, 'w') as f:
            for (g, r), fit in dataset_fits.iteritems():
                if fit.theta is None:
                    continue
                knots, coeffs, degree = fit.theta[0]
                knots = list(knots)
                coeffs = list(coeffs)
                gr_text = """\
Gene symbol: {g}
Region: {r}
Spline knots: {knots}
Spline coefficients: {coeffs}
Spline degree: {degree}
""".format(**locals())
                print >>f, gr_text

def _add_dataset_correlation_fits(dataset, fitter, ds_fits, n_iterations, k_of_n, allow_new_computation):
    def arg_mapper(key, f_proxy):
        ir, loo_point = key
        r = dataset.region_names[ir]
        series = dataset.get_several_series(dataset.gene_names, r)
        basic_theta = [ds_fits[(g, r)].theta for g in dataset.gene_names]
        return f_proxy(series, fitter, basic_theta, loo_point, n_iterations)

    all_keys = []
    for ir, r in enumerate(dataset.region_names):
        all_keys.append((ir, None))
        series = dataset.get_several_series(dataset.gene_names, r)
        for iy, g in enumerate(dataset.gene_names):
            for ix in xrange(len(series.ages)):
                loo_point = (ix, iy)
                all_keys.append((ir, loo_point))

    def f_sharding_key(key):
        # keep all x points in the same shard for the same r, iy
        r, loo_point = key
        if loo_point is None:
            return (r, None)
        else:
            ix, iy = loo_point
            return (r, iy)

    dct_results = job_splitting.compute(
        name='fits-correlations',
        f=_compute_fit_with_correlations,
        arg_mapper=arg_mapper,
        all_keys=all_keys,
        f_sharding_key=f_sharding_key,
        k_of_n=k_of_n,
        base_filename=fit_results_relative_path(dataset, fitter) + '-correlations-{}'.format(n_iterations),
        allow_new_computation=allow_new_computation,
    )
    _add_dataset_correlation_fits_from_results_dictionary(dataset, ds_fits, dct_results)

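# Illustration (hypothetical numbers): for region index ir=0 with 2 genes and 3
# ages, the keys generated above are the full-region fit plus one leave-one-out
# key per measurement; f_sharding_key then groups all ix of a (region, gene) pair
# into the same shard:
#   (0, None),
#   (0, (0, 0)), (0, (1, 0)), (0, (2, 0)),   # gene iy=0, ages ix=0..2
#   (0, (0, 1)), (0, (1, 1)), (0, (2, 1))    # gene iy=1, ages ix=0..2
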
def save_as_mat_files(data, fitter, fits, has_change_distributions):
    for dataset in data.datasets:
        filename = join(cache_dir(), fit_results_relative_path(dataset, fitter) + '.mat')
        dataset_fits = fits[dataset.name]
        print 'Saving mat file to {}'.format(filename)
        shape = fitter.shape

        gene_names = dataset.gene_names
        gene_idx = {g: i for i, g in enumerate(gene_names)}
        n_genes = len(gene_names)
        region_names = dataset.region_names
        region_idx = {r: i for i, r in enumerate(region_names)}
        n_regions = len(region_names)

        write_theta = shape.can_export_params_to_matlab()
        if write_theta:
            theta = init_array(np.NaN, shape.n_params(), n_genes, n_regions)
        else:
            theta = np.NaN

        fit_scores = init_array(np.NaN, n_genes, n_regions)
        LOO_scores = init_array(np.NaN, n_genes, n_regions)
        fit_predictions = init_array(np.NaN, *dataset.expression.shape)
        LOO_predictions = init_array(np.NaN, *dataset.expression.shape)
        high_res_predictions = init_array(np.NaN, cfg.n_curve_points_to_plot, n_genes, n_regions)
        scaled_high_res_ages = np.linspace(dataset.ages.min(), dataset.ages.max(), cfg.n_curve_points_to_plot)
        original_high_res_ages = scalers.unify(dataset.age_scaler).unscale(scaled_high_res_ages)
        if has_change_distributions:
            change_distribution_bin_centers = fits.change_distribution_params.bin_centers
            n_bins = len(change_distribution_bin_centers)
            change_distribution_weights = init_array(np.NaN, n_bins, n_genes, n_regions)
        else:
            change_distribution_bin_centers = []
            change_distribution_weights = []
        for (g, r), fit in dataset_fits.iteritems():
            series = dataset.get_one_series(g, r)
            ig = gene_idx[g]
            ir = region_idx[r]
            fit_scores[ig, ir] = fit.fit_score
            LOO_scores[ig, ir] = fit.LOO_score
            if write_theta and fit.theta is not None:
                theta[:, ig, ir] = fit.theta
            if fit.fit_predictions is not None:
                fit_predictions[series.original_inds, ig, ir] = fit.fit_predictions
            if fit.LOO_predictions is not None:
                LOO_predictions[series.original_inds, ig, ir] = fit.LOO_predictions
            if fit.theta is not None:
                high_res_predictions[:, ig, ir] = shape.f(fit.theta, scaled_high_res_ages)
            change_weights = getattr(fit, 'change_distribution_weights', None)
            if change_weights is not None:
                change_distribution_weights[:, ig, ir] = change_weights
        mdict = dict(
            gene_names=list_of_strings_to_matlab_cell_array(gene_names),
            region_names=list_of_strings_to_matlab_cell_array(region_names),
            theta=theta,
            fit_scores=fit_scores,
            LOO_scores=LOO_scores,
            fit_predictions=fit_predictions,
            LOO_predictions=LOO_predictions,
            high_res_predictions=high_res_predictions,
            high_res_ages=original_high_res_ages,
            change_distribution_bin_centers=change_distribution_bin_centers,
            change_distribution_weights=change_distribution_weights,
        )
        savemat(filename, mdict, oned_as='column')

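# Example (sketch): the arrays in the exported file are NaN-padded wherever a fit
# or measurement is missing, so consumers should use NaN-aware reductions:
def _example_mean_loo_score(matfile):
    from scipy.io import loadmat
    return np.nanmean(loadmat(matfile)['LOO_scores'])
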
def save_fits_and_create_html(data, fitter, fits=None, basedir=None,
                              do_genes=True, do_series=True, do_hist=True, do_html=True,
                              only_main_html=False, k_of_n=None,
                              use_correlations=False, correlations=None,
                              show_change_distributions=False, exons_layout=False,
                              html_kw=None, figure_kw=None):
    if fits is None:
        fits = get_all_fits(data, fitter, k_of_n)
    if basedir is None:
        basedir = join(results_dir(), fit_results_relative_path(data, fitter))
        if use_correlations:
            basedir = join(basedir, 'with-correlations')
    if html_kw is None:
        html_kw = {}
    if figure_kw is None:
        figure_kw = {}
    print 'Writing HTML under {}'.format(basedir)
    ensure_dir(basedir)
    gene_dir = 'gene-subplot'
    exons_dir = 'exons_subplot_series' if cfg.exons_plots_from_series else 'exons_subplot'
    series_dir = 'gene-region-fits'
    correlations_dir = 'gene-correlations'
    scores_dir = 'score_distributions'
    if do_genes and not only_main_html:  # relies on the sharding of the fits respecting gene boundaries
        plot_and_save_all_genes(data, fitter, fits, join(basedir, gene_dir), show_change_distributions)
    if do_series and not only_main_html:
        plot_and_save_all_series(data, fitter, fits, join(basedir, series_dir), use_correlations, show_change_distributions, exons_layout, figure_kw)
    if exons_layout and not only_main_html:
        if cfg.exons_plots_from_series:
            plot_and_save_all_exons_from_series(fits, join(basedir, exons_dir), join(basedir, series_dir))
        else:
            plot_and_save_all_exons(data, fitter, fits, join(basedir, exons_dir))
    if do_hist and k_of_n is None and not only_main_html:
        create_score_distribution_html(fits, use_correlations, join(basedir, scores_dir))
    if do_html and k_of_n is None:
        link_to_correlation_plots = use_correlations and correlations is not None
        if link_to_correlation_plots and not only_main_html:
            plot_and_save_all_gene_correlations(data, correlations, join(basedir, correlations_dir))
        dct_pathways = load_17_pathways_breakdown()
        pathway_genes = set.union(*dct_pathways.values())
        data_genes = set(data.gene_names)
        missing = pathway_genes - data_genes
        # simple heuristic: create pathways only if we have most of the genes (currently 61 genes are missing)
        b_pathways = len(missing) < len(pathway_genes) / 2
        create_html(
            data, fitter, fits, basedir, gene_dir, exons_dir, series_dir, scores_dir,
            correlations_dir=correlations_dir,
            use_correlations=use_correlations,
            link_to_correlation_plots=link_to_correlation_plots,
            b_pathways=b_pathways,
            exons_layout=exons_layout,
            **html_kw
        )

# NOTE: the header and setup lines of this function are a reconstruction (the
# originals are missing here); only the iterate_fits loop body is original. The
# age range is assumed to be derived from the scaled developmental stages, as in
# get_onset_times above.
def add_change_distributions(fitter, fits, n_bins=50):
    shape = fitter.shape
    stages = [stage.scaled(age_scaler) for stage in dev_stages]
    low = min(stage.from_age for stage in stages)
    high = max(stage.to_age for stage in stages)
    bin_edges = np.linspace(low, high, n_bins + 1)
    bin_centers = bin_edges_to_centers(bin_edges)
    for dsname, g, r, fit in iterate_fits(fits, return_keys=True):
        weights = calc_bootstrap_change_distribution(shape, fit.theta_samples, bin_edges)
        fit.change_distribution_weights = weights
        fit.change_distribution_spread = change_distribution_spread_cumsum(bin_centers, weights)
        fit.change_distribution_mean_std = change_distribution_mean_and_std(bin_centers, weights)


def calc_bootstrap_change_distribution(shape, theta_samples, bin_edges):
    bin_centers = bin_edges_to_centers(bin_edges)
    n_params, n_samples = theta_samples.shape
    weights = np.zeros(bin_centers.shape)
    for i in xrange(n_samples):
        weights += calc_change_distribution(shape, theta_samples[:, i], bin_edges)
    weights /= n_samples
    # now values are in fraction of total change
    # (doesn't have to sum up to 1 if ages don't cover the whole transition range)
    return weights


@cache(lambda data, fitter, fits: join(cache_dir(), fit_results_relative_path(data, fitter) + '-dprime-cube.pkl'))
def compute_dprime_measures_for_all_pairs(data, fitter, fits):
    genes = data.gene_names
    regions = data.region_names
    r2ds = data.region_to_dataset()
    cube_shape = (len(genes), len(regions), len(regions))
    d_mu = np.empty(cube_shape)  # mu2-mu1 for all genes and region pairs
    std = np.empty(cube_shape)  # std (combined) for all genes and region pairs

    def get_mu_std(g, r):
        dsfits = fits[r2ds[r]]
        fit = dsfits.get((g, r))
        if fit is None:
            return np.nan, np.nan
        else:
            return fit.change_distribution_mean_std

    # NOTE: the original body was truncated after the loop header below; filling
    # d_mu and std is reconstructed from the array definitions above (combined std
    # taken as the usual pooled value), and the return value is an assumption.
    for ig, g in enumerate(genes):
        for ir1, r1 in enumerate(regions):
            mu1, std1 = get_mu_std(g, r1)
            for ir2, r2 in enumerate(regions):
                mu2, std2 = get_mu_std(g, r2)
                d_mu[ig, ir1, ir2] = mu2 - mu1
                std[ig, ir1, ir2] = np.sqrt(0.5 * (std1 ** 2 + std2 ** 2))
    return d_mu, std

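# Example (sketch, depends on the reconstructed return value above): turning the
# cube into d-prime separation scores between region pairs for each gene.
def _example_dprime(d_mu, std):
    # guard against zero combined std to avoid division warnings
    return d_mu / np.where(std > 0, std, np.nan)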