def export_cube():
    """Export the pickled region-pair timing cube to ``export/cube.mat``.

    Loads the cube stored at ``RegionPairTiming.cube_filename`` and hands its
    arrays, the gene and region names, the age scaler's cache name and a
    README describing each variable to ``save_matfile``. The file is written
    under ``results_dir()``.
    """
    cube = load_pickle(RegionPairTiming.cube_filename)

    # README entry names must match the variable names exported in mdict
    # below; in particular the d' array is exported as "scores".
    README = """\
d_mu:
    mu(r2)-mu(r1) for every gene and region pair.
    Dimensions: <n-genes> X <n-regions> X <n-regions>

combined_std:
    The combined standard deviation of the two change distributions.
    std = sqrt(0.5*(std1^2 + std2^2))
    Dimensions: <n-genes> X <n-regions> X <n-regions>

scores:
    The d' for the two change distributions.
    Equal to d_mu ./ combined_std.
    Dimensions: <n-genes> X <n-regions> X <n-regions>

genes:
    Gene names for the genes represented in other arrays

regions:
    Region names for the regions represented in other arrays

age_scaler:
    The scaling used for ages (i.e. 'log' means x' = log(x + 38/52))
"""
    mdict = dict(
        README_CUBE=README,
        genes=list_of_strings_to_matlab_cell_array(cube.genes),
        regions=list_of_strings_to_matlab_cell_array(cube.regions),
        age_scaler=scalers.unify(cube.age_scaler).cache_name(),
        d_mu=cube.d_mu,
        combined_std=cube.std,
        scores=cube.d_mu / cube.std,
    )
    save_matfile(mdict, join(results_dir(), 'export', 'cube.mat'))
def export_cube():
    """Write the region-pair timing cube to ``export/cube.mat``.

    The cube is read with ``load_pickle`` from
    ``RegionPairTiming.cube_filename``. The exported variables are the d_mu
    and combined std arrays, their ratio (``scores``), the gene and region
    names, the age scaler's cache name and a README string documenting them.
    """
    cube = load_pickle(RegionPairTiming.cube_filename)

    # The README describes the exported variables by name, so each entry name
    # has to agree with the corresponding mdict key ("scores", not "score").
    README = """\
d_mu:
    mu(r2)-mu(r1) for every gene and region pair.
    Dimensions: <n-genes> X <n-regions> X <n-regions>

combined_std:
    The combined standard deviation of the two change distributions.
    std = sqrt(0.5*(std1^2 + std2^2))
    Dimensions: <n-genes> X <n-regions> X <n-regions>

scores:
    The d' for the two change distributions.
    Equal to d_mu ./ combined_std.
    Dimensions: <n-genes> X <n-regions> X <n-regions>

genes:
    Gene names for the genes represented in other arrays

regions:
    Region names for the regions represented in other arrays

age_scaler:
    The scaling used for ages (i.e. 'log' means x' = log(x + 38/52))
"""
    mdict = dict(
        README_CUBE=README,
        genes=list_of_strings_to_matlab_cell_array(cube.genes),
        regions=list_of_strings_to_matlab_cell_array(cube.regions),
        age_scaler=scalers.unify(cube.age_scaler).cache_name(),
        d_mu=cube.d_mu,
        combined_std=cube.std,
        scores=cube.d_mu / cube.std,
    )
    save_matfile(mdict, join(results_dir(), 'export', 'cube.mat'))
def export_pathways():
    """Export pathway gene membership and pathway lists to ``export/pathways.mat``.

    Gene indices refer to positions in the ``genes`` of the change
    distribution pickled at ``SingleRegion.change_dist_filename``; pathway
    indices refer to positions in the exported ``pathway_names``. Both are
    one-based for use in matlab.

    Raises:
        KeyError: if a pathway contains a gene that is not in the change
            distribution's gene list, or a list references an unknown pathway.
    """
    change_dist = load_pickle(SingleRegion.change_dist_filename)
    # NOTE that matlab is one based
    matlab_g2i = {g: (i + 1) for i, g in enumerate(change_dist.genes)}

    pathways = pathway_lists.read_all_pathways()
    # Snapshot the names into a list so every per-pathway array below uses
    # exactly the same, fixed order.
    pathway_names = list(pathways.keys())
    n_pathways = len(pathway_names)

    # Fill pre-allocated 1-D object arrays element by element. Building them
    # with np.array([...], dtype=object) collapses into a 2-D array whenever
    # all pathways have the same number of genes (or there is one pathway),
    # which would no longer be a cell array holding one item per pathway.
    pathway_genes_names = np.empty(n_pathways, dtype=object)
    pathway_genes_idx = np.empty(n_pathways, dtype=object)
    for i, p in enumerate(pathway_names):
        pathway_genes = pathways[p]
        pathway_genes_names[i] = list_of_strings_to_matlab_cell_array(pathway_genes)
        pathway_genes_idx[i] = np.array([matlab_g2i[g] for g in pathway_genes])

    # NOTE matlab indexing is one based
    matlab_p2i = {p: (i + 1) for i, p in enumerate(pathway_names)}

    list_names = pathway_lists.all_pathway_lists()
    list_pathway_names = np.empty(len(list_names), dtype=object)
    list_pathway_idx = np.empty(len(list_names), dtype=object)
    for i, listname in enumerate(list_names):
        pathways_in_list = pathway_lists.list_to_pathway_names(listname)
        list_pathway_names[i] = list_of_strings_to_matlab_cell_array(pathways_in_list)
        list_pathway_idx[i] = [matlab_p2i[p] for p in pathways_in_list]

    README = """\
pathway_names:
    Cell array of all pathway names. The name in cell number k is the name of the
    pathway at position k in "pathway_genes_names" and "pathway_genes_idx".

pathway_genes_names:
    Cell array (size <n-pathways>).
    Each cell contains a cell array of strings which are the gene symbols of
    the genes in that pathway.

pathway_genes_idx:
    Same as pathway_genes_names, but each cell in the outer cell array is now an
    array of gene indices corresponding to the gene positions in cube.mat and
    change-distributions.mat.
    Hopefully this should be easier to use in matlab.

list_names:
    Names of pathway lists prepared by Noa

list_pathway_names:
    Cell array. One item per list. Each item is a cell array of strings which are
    the names of the pathways belonging to that list.

list_pathway_idx:
    Same as list_pathway_names, but instead of listing the pathways by name, they
    are given as indices into the previous pathway_xxx structures.
"""
    mdict = dict(
        README_PATHWAYS=README,
        pathway_names=list_of_strings_to_matlab_cell_array(pathway_names),
        pathway_genes_names=pathway_genes_names,
        pathway_genes_idx=pathway_genes_idx,
        list_names=list_of_strings_to_matlab_cell_array(list_names),
        list_pathway_names=list_pathway_names,
        list_pathway_idx=list_pathway_idx,
    )
    save_matfile(mdict, join(results_dir(), 'export', 'pathways.mat'))
def export_timing_info_for_all_fits(data, fitter, fits):
    """Compute the change distributions for ``fits`` and save them as a mat file.

    The result of ``compute_timing_info_for_all_fits(data, fitter, fits)`` is
    exported, together with a README describing each variable, to
    ``<cache_dir>/<fit_results_relative_path(data, fitter)>-change-dist.mat``.
    """
    timing = compute_timing_info_for_all_fits(data, fitter, fits)

    README = """\
mu:
    The mean age of the change distribution for given gene and region.
    Dimensions: <n-genes> X <n-regions>

std:
    The standard deviation of the change distribution for given gene and region.
    Dimensions: <n-genes> X <n-regions>

genes:
    Gene names for the genes represented in other arrays

weights:
    The change distributions for each gene and region.
    Dimensions: <n-genes> X <n-regions> X <n-bins>

bin_centers:
    The ages for the center of each bin used in calculating the histogram in "weights".
    Dimensions: <n-bins> X 1

bin_edges:
    The edges of the bins used in calculating the change histogram.
    (centers can be calculated from the bin_edges, but it's convenient to have it pre-calculated)
    Dimensions: <n-bins + 1> X 1

regions:
    Region names for the regions represented in other arrays

age_scaler:
    The scaling used for ages (i.e. 'log' means x' = log(x + 38/52))
"""
    # Names are converted to matlab cell arrays; the scaler is exported by name.
    gene_cells = list_of_strings_to_matlab_cell_array(timing.genes)
    region_cells = list_of_strings_to_matlab_cell_array(timing.regions)
    scaler_name = scalers.unify(timing.age_scaler).cache_name()

    exported = dict(
        README_CHANGE_DISTRIBUTIONS=README,
        genes=gene_cells,
        regions=region_cells,
        age_scaler=scaler_name,
        mu=timing.mu,
        std=timing.std,
        bin_edges=timing.bin_edges,
        bin_centers=timing.bin_centers,
        weights=timing.weights,
    )

    cache_root = cache_dir()
    relative_name = fit_results_relative_path(data, fitter) + '-change-dist.mat'
    save_matfile(exported, join(cache_root, relative_name))
def save_to_mat(self):
    """Save the per-result fields of ``self.res`` as column vectors in a mat file.

    The file is written to
    ``<cache_dir>/both/dprime-all-pathways-and-regions-<suffix>.mat`` where
    ``<suffix>`` is ``self._filename_suffix``. Text fields (pathway, r1, r2)
    are exported as matlab cell arrays and the remaining fields as arrays.
    """
    filename = join(
        cache_dir(),
        'both',
        'dprime-all-pathways-and-regions-{}.mat'.format(self._filename_suffix))

    def text_column(attr):
        # One string per result, wrapped as a matlab cell array.
        return list_of_strings_to_matlab_cell_array(
            [getattr(x, attr) for x in self.res])

    def value_column(attr):
        # One value per result, collected into an array.
        return np.array([getattr(x, attr) for x in self.res])

    mdict = dict(
        pathway=text_column('pathway'),
        r1=text_column('r1'),
        r2=text_column('r2'),
        score=value_column('score'),
        delta=value_column('delta'),
        weighted_delta=value_column('weighted_delta'),
        mu1_years=value_column('mu1_years'),
        mu2_years=value_column('mu2_years'),
        pval=value_column('pval'),
        pathway_size=value_column('pathway_size'),
    )
    print('Saving results to {}'.format(filename))
    savemat(filename, mdict, oned_as='column')
def export_timing_info_for_all_fits(data, fitter, fits):
    """Export the change distributions computed from ``fits`` to a mat file.

    Calls ``compute_timing_info_for_all_fits`` and passes its mu/std/weights
    arrays, bin edges and centers, gene and region names, the age scaler's
    cache name and a README to ``save_matfile``. The target path is the fit
    results path for ``data`` and ``fitter`` under ``cache_dir()``, with the
    suffix ``-change-dist.mat``.
    """
    dist = compute_timing_info_for_all_fits(data, fitter, fits)

    README = """\
mu:
    The mean age of the change distribution for given gene and region.
    Dimensions: <n-genes> X <n-regions>

std:
    The standard deviation of the change distribution for given gene and region.
    Dimensions: <n-genes> X <n-regions>

genes:
    Gene names for the genes represented in other arrays

weights:
    The change distributions for each gene and region.
    Dimensions: <n-genes> X <n-regions> X <n-bins>

bin_centers:
    The ages for the center of each bin used in calculating the histogram in "weights".
    Dimensions: <n-bins> X 1

bin_edges:
    The edges of the bins used in calculating the change histogram.
    (centers can be calculated from the bin_edges, but it's convenient to have it pre-calculated)
    Dimensions: <n-bins + 1> X 1

regions:
    Region names for the regions represented in other arrays

age_scaler:
    The scaling used for ages (i.e. 'log' means x' = log(x + 38/52))
"""
    contents = dict(
        README_CHANGE_DISTRIBUTIONS=README,
        genes=list_of_strings_to_matlab_cell_array(dist.genes),
        regions=list_of_strings_to_matlab_cell_array(dist.regions),
        age_scaler=scalers.unify(dist.age_scaler).cache_name(),
        mu=dist.mu,
        std=dist.std,
        bin_edges=dist.bin_edges,
        bin_centers=dist.bin_centers,
        weights=dist.weights,
    )

    out_path = join(
        cache_dir(),
        fit_results_relative_path(data, fitter) + '-change-dist.mat')
    save_matfile(contents, out_path)
def save_to_mat(self):
    """Write every result in ``self.res`` to a mat file, one variable per field.

    The output goes to the ``both`` sub-directory of ``cache_dir()``; the file
    name embeds ``self._filename_suffix``. One-dimensional arrays are stored
    as columns (``oned_as='column'``).
    """
    cache_root = cache_dir()
    basename = 'dprime-all-pathways-and-regions-{}.mat'.format(
        self._filename_suffix)
    filename = join(cache_root, 'both', basename)

    # String-valued fields become matlab cell arrays.
    mdict = dict(
        pathway=list_of_strings_to_matlab_cell_array(
            [res.pathway for res in self.res]),
        r1=list_of_strings_to_matlab_cell_array([res.r1 for res in self.res]),
        r2=list_of_strings_to_matlab_cell_array([res.r2 for res in self.res]),
    )
    # The remaining fields are collected into plain arrays, one per field.
    array_fields = (
        'score',
        'delta',
        'weighted_delta',
        'mu1_years',
        'mu2_years',
        'pval',
        'pathway_size',
    )
    for field in array_fields:
        mdict[field] = np.array([getattr(res, field) for res in self.res])

    print('Saving results to {}'.format(filename))
    savemat(filename, mdict, oned_as='column')
def export_pathways():
    """Write pathway definitions and pathway lists to ``export/pathways.mat``.

    For every pathway returned by ``pathway_lists.read_all_pathways()`` the
    gene symbols and their one-based positions in the gene list of the change
    distribution (pickled at ``SingleRegion.change_dist_filename``) are
    exported. For every pathway list, the names and one-based positions of
    its pathways are exported.

    Raises:
        KeyError: if a pathway gene is missing from the change distribution's
            genes, or a list names a pathway that was not read.
    """
    change_dist = load_pickle(SingleRegion.change_dist_filename)
    # NOTE that matlab is one based
    matlab_g2i = {g: (i + 1) for i, g in enumerate(change_dist.genes)}

    pathways = pathway_lists.read_all_pathways()
    # Freeze the key order in a real list; all per-pathway arrays follow it.
    pathway_names = list(pathways.keys())

    # One cell per pathway. The arrays are pre-allocated and filled one
    # element at a time because np.array([...], dtype=object) produces a 2-D
    # array when every pathway happens to have the same number of genes (or
    # when there is a single pathway) instead of a 1-D array of arrays.
    pathway_genes_names = np.empty(len(pathway_names), dtype=object)
    pathway_genes_idx = np.empty(len(pathway_names), dtype=object)
    for i, name in enumerate(pathway_names):
        members = pathways[name]
        pathway_genes_names[i] = list_of_strings_to_matlab_cell_array(members)
        pathway_genes_idx[i] = np.array([matlab_g2i[g] for g in members])

    # NOTE matlab indexing is one based
    matlab_p2i = {p: (i + 1) for i, p in enumerate(pathway_names)}

    list_names = pathway_lists.all_pathway_lists()
    list_pathway_names = np.empty(len(list_names), dtype=object)
    list_pathway_idx = np.empty(len(list_names), dtype=object)
    for i, listname in enumerate(list_names):
        pathways_in_list = pathway_lists.list_to_pathway_names(listname)
        list_pathway_names[i] = list_of_strings_to_matlab_cell_array(
            pathways_in_list)
        list_pathway_idx[i] = [matlab_p2i[p] for p in pathways_in_list]

    README = """\
pathway_names:
    Cell array of all pathway names. The name in cell number k is the name of the
    pathway at position k in "pathway_genes_names" and "pathway_genes_idx".

pathway_genes_names:
    Cell array (size <n-pathways>).
    Each cell contains a cell array of strings which are the gene symbols of
    the genes in that pathway.

pathway_genes_idx:
    Same as pathway_genes_names, but each cell in the outer cell array is now an
    array of gene indices corresponding to the gene positions in cube.mat and
    change-distributions.mat.
    Hopefully this should be easier to use in matlab.

list_names:
    Names of pathway lists prepared by Noa

list_pathway_names:
    Cell array. One item per list. Each item is a cell array of strings which are
    the names of the pathways belonging to that list.

list_pathway_idx:
    Same as list_pathway_names, but instead of listing the pathways by name, they
    are given as indices into the previous pathway_xxx structures.
"""
    mdict = dict(
        README_PATHWAYS=README,
        pathway_names=list_of_strings_to_matlab_cell_array(pathway_names),
        pathway_genes_names=pathway_genes_names,
        pathway_genes_idx=pathway_genes_idx,
        list_names=list_of_strings_to_matlab_cell_array(list_names),
        list_pathway_names=list_pathway_names,
        list_pathway_idx=list_pathway_idx,
    )
    save_matfile(mdict, join(results_dir(), 'export', 'pathways.mat'))
def save_as_mat_files(data, fitter, fits, has_change_distributions):
    """Save the fit results of every dataset in ``data`` to its own mat file.

    For each dataset, the fits in ``fits[dataset.name]`` (keyed by
    ``(gene, region)``) are scattered into arrays indexed by the position of
    the gene and region in ``dataset.gene_names`` / ``dataset.region_names``
    and written with ``savemat`` to
    ``<cache_dir>/<fit_results_relative_path(dataset, fitter)>.mat``.
    The arrays are created with ``init_array(np.nan, ...)`` (presumably
    NaN-filled, so entries without a fit stay NaN - confirm against
    ``init_array``).

    Args:
        data: Object with a ``datasets`` iterable. Each dataset is read for
            ``name``, ``gene_names``, ``region_names``, ``expression``,
            ``ages``, ``age_scaler`` and ``get_one_series(g, r)``.
        fitter: Object whose ``shape`` evaluates the fitted curves; also used
            to build the output path.
        fits: Indexable by dataset name, giving a ``(gene, region) -> fit``
            mapping. Must also provide
            ``change_distribution_params.bin_centers`` when
            ``has_change_distributions`` is true.
        has_change_distributions: Whether to export change distribution
            weights. When false, empty placeholders are exported and any
            weights present on the fits are ignored.
    """
    for dataset in data.datasets:
        filename = join(cache_dir(), fit_results_relative_path(dataset, fitter) + '.mat')
        dataset_fits = fits[dataset.name]

        print('Saving mat file to {}'.format(filename))
        shape = fitter.shape

        gene_names = dataset.gene_names
        gene_idx = {g: i for i, g in enumerate(gene_names)}
        n_genes = len(gene_names)
        region_names = dataset.region_names
        region_idx = {r: i for i, r in enumerate(region_names)}
        n_regions = len(region_names)

        # Parameters are exported only for shapes that support it; otherwise
        # a scalar NaN placeholder is written.
        write_theta = shape.can_export_params_to_matlab()
        if write_theta:
            theta = init_array(np.nan, shape.n_params(), n_genes, n_regions)
        else:
            theta = np.nan

        fit_scores = init_array(np.nan, n_genes, n_regions)
        LOO_scores = init_array(np.nan, n_genes, n_regions)
        fit_predictions = init_array(np.nan, *dataset.expression.shape)
        LOO_predictions = init_array(np.nan, *dataset.expression.shape)
        high_res_predictions = init_array(np.nan, cfg.n_curve_points_to_plot, n_genes, n_regions)
        # Curves are evaluated on the scaled ages; the exported age axis is
        # converted back with the scaler's unscale().
        scaled_high_res_ages = np.linspace(dataset.ages.min(), dataset.ages.max(), cfg.n_curve_points_to_plot)
        original_high_res_ages = scalers.unify(dataset.age_scaler).unscale(scaled_high_res_ages)

        if has_change_distributions:
            change_distribution_bin_centers = fits.change_distribution_params.bin_centers
            n_bins = len(change_distribution_bin_centers)
            change_distribution_weights = init_array(np.nan, n_bins, n_genes, n_regions)
        else:
            change_distribution_bin_centers = []
            change_distribution_weights = []

        # .items() works for dicts on both Python 2 and Python 3.
        for (g, r), fit in dataset_fits.items():
            series = dataset.get_one_series(g, r)
            ig = gene_idx[g]
            ir = region_idx[r]
            fit_scores[ig, ir] = fit.fit_score
            LOO_scores[ig, ir] = fit.LOO_score
            if write_theta and fit.theta is not None:
                theta[:, ig, ir] = fit.theta
            if fit.fit_predictions is not None:
                fit_predictions[series.original_inds, ig, ir] = fit.fit_predictions
            if fit.LOO_predictions is not None:
                LOO_predictions[series.original_inds, ig, ir] = fit.LOO_predictions
            if fit.theta is not None:
                high_res_predictions[:, ig, ir] = shape.f(fit.theta, scaled_high_res_ages)
            # Without change distributions the weights container is an empty
            # list, which cannot take the [:, ig, ir] assignment, so weights
            # are only stored when the export was requested.
            change_weights = getattr(fit, 'change_distribution_weights', None)
            if has_change_distributions and change_weights is not None:
                change_distribution_weights[:, ig, ir] = change_weights

        mdict = dict(
            gene_names=list_of_strings_to_matlab_cell_array(gene_names),
            region_names=list_of_strings_to_matlab_cell_array(region_names),
            theta=theta,
            fit_scores=fit_scores,
            LOO_scores=LOO_scores,
            fit_predictions=fit_predictions,
            LOO_predictions=LOO_predictions,
            high_res_predictions=high_res_predictions,
            high_res_ages=original_high_res_ages,
            change_distribution_bin_centers=change_distribution_bin_centers,
            change_distribution_weights=change_distribution_weights,
        )
        savemat(filename, mdict, oned_as='column')
def save_as_mat_files(data, fitter, fits, has_change_distributions):
    """Write one mat file of fit results per dataset in ``data.datasets``.

    Each file holds the gene and region names, fit and LOO scores and
    predictions, high resolution curve predictions with their ages and,
    optionally, fitted parameters and change distribution weights. Per-fit
    values are placed at the index of the fit's gene and region within the
    dataset's ``gene_names`` and ``region_names``. Arrays come from
    ``init_array(np.nan, ...)`` (presumably NaN-filled - confirm against
    ``init_array``).

    Args:
        data: Provides ``datasets``; each dataset supplies ``name``,
            ``gene_names``, ``region_names``, ``expression``, ``ages``,
            ``age_scaler`` and ``get_one_series(g, r)``.
        fitter: Provides ``shape`` and is passed to
            ``fit_results_relative_path`` to build the file name.
        fits: ``fits[dataset.name]`` maps ``(gene, region)`` to a fit object.
            ``fits.change_distribution_params.bin_centers`` is read when
            ``has_change_distributions`` is true.
        has_change_distributions: Export change distribution weights when
            true; export empty placeholders (and skip any per-fit weights)
            when false.
    """
    for dataset in data.datasets:
        filename = join(cache_dir(),
                        fit_results_relative_path(dataset, fitter) + '.mat')
        dataset_fits = fits[dataset.name]

        print('Saving mat file to {}'.format(filename))
        shape = fitter.shape

        gene_names = dataset.gene_names
        gene_idx = {g: i for i, g in enumerate(gene_names)}
        n_genes = len(gene_names)
        region_names = dataset.region_names
        region_idx = {r: i for i, r in enumerate(region_names)}
        n_regions = len(region_names)

        # theta is a full array only when the shape can export its
        # parameters; otherwise a scalar NaN is saved in its place.
        write_theta = shape.can_export_params_to_matlab()
        if write_theta:
            theta = init_array(np.nan, shape.n_params(), n_genes, n_regions)
        else:
            theta = np.nan

        fit_scores = init_array(np.nan, n_genes, n_regions)
        LOO_scores = init_array(np.nan, n_genes, n_regions)
        fit_predictions = init_array(np.nan, *dataset.expression.shape)
        LOO_predictions = init_array(np.nan, *dataset.expression.shape)
        high_res_predictions = init_array(np.nan, cfg.n_curve_points_to_plot,
                                          n_genes, n_regions)
        # The shape is evaluated on scaled ages, while the saved age axis is
        # mapped back through the scaler's unscale().
        scaled_high_res_ages = np.linspace(dataset.ages.min(),
                                           dataset.ages.max(),
                                           cfg.n_curve_points_to_plot)
        original_high_res_ages = scalers.unify(
            dataset.age_scaler).unscale(scaled_high_res_ages)

        if has_change_distributions:
            change_distribution_bin_centers = fits.change_distribution_params.bin_centers
            n_bins = len(change_distribution_bin_centers)
            change_distribution_weights = init_array(np.nan, n_bins, n_genes,
                                                     n_regions)
        else:
            change_distribution_bin_centers = []
            change_distribution_weights = []

        # items() instead of iteritems(): valid for dicts on Python 2 and 3.
        for (g, r), fit in dataset_fits.items():
            series = dataset.get_one_series(g, r)
            ig = gene_idx[g]
            ir = region_idx[r]
            fit_scores[ig, ir] = fit.fit_score
            LOO_scores[ig, ir] = fit.LOO_score
            if write_theta and fit.theta is not None:
                theta[:, ig, ir] = fit.theta
            if fit.fit_predictions is not None:
                fit_predictions[series.original_inds, ig, ir] = fit.fit_predictions
            if fit.LOO_predictions is not None:
                LOO_predictions[series.original_inds, ig, ir] = fit.LOO_predictions
            if fit.theta is not None:
                high_res_predictions[:, ig, ir] = shape.f(fit.theta,
                                                          scaled_high_res_ages)
            # The placeholder used when change distributions are disabled is
            # a plain list and would raise on [:, ig, ir] assignment, so the
            # flag is checked before storing weights.
            change_weights = getattr(fit, 'change_distribution_weights', None)
            if has_change_distributions and change_weights is not None:
                change_distribution_weights[:, ig, ir] = change_weights

        mdict = dict(
            gene_names=list_of_strings_to_matlab_cell_array(gene_names),
            region_names=list_of_strings_to_matlab_cell_array(region_names),
            theta=theta,
            fit_scores=fit_scores,
            LOO_scores=LOO_scores,
            fit_predictions=fit_predictions,
            LOO_predictions=LOO_predictions,
            high_res_predictions=high_res_predictions,
            high_res_ages=original_high_res_ages,
            change_distribution_bin_centers=change_distribution_bin_centers,
            change_distribution_weights=change_distribution_weights,
        )
        savemat(filename, mdict, oned_as='column')