def summarize_heatmaps(python, arrayplot, cluster, file_layout, libpath=None):
    """Plot heatmaps of the final and the processed filtered data sets.

    Both GCT files are read so their dimensions can be checked against
    each other; a single pixel size is derived from those dimensions and
    both heatmaps are drawn with the same plotting options.  Afterwards
    the cluster "trash" files named in file_layout are deleted if they
    exist.

    Parameters:
      python, arrayplot, cluster -- forwarded unchanged to
          graphlib.plot_heatmap.
      file_layout -- object providing the DS_* path attributes used below.
      libpath -- forwarded to graphlib.plot_heatmap; defaults to a new
          empty list.

    Raises AssertionError if the two data sets differ in dimensions.
    """
    import arrayio
    from genomicode import graphlib

    # A literal [] default would be shared by every call; build a fresh
    # list instead.
    if libpath is None:
        libpath = []

    # Load the data sets.
    DATA_orig = arrayio.gct_format.read(file_layout.DS_PROC_FILTERED)
    DATA_final = arrayio.gct_format.read(file_layout.DS_FINAL_FILTERED)
    assert DATA_final.dim() == DATA_orig.dim()
    nrow, ncol = DATA_orig.dim()

    # One size is used for both plots.
    xpix, ypix = graphlib.find_tall_heatmap_size(
        nrow, ncol, max_total_height=2000, max_total_width=2000)

    # (input data set, output image); the final data set is plotted first.
    plots = [
        (file_layout.DS_FINAL_FILTERED, file_layout.DS_FINAL_HEATMAP),
        (file_layout.DS_PROC_FILTERED, file_layout.DS_PROC_HEATMAP),
        ]
    for infile, outfile in plots:
        graphlib.plot_heatmap(
            infile, outfile, xpix, ypix,
            color="bild", show_colorbar=True, show_grid=True,
            gene_center="mean", gene_normalize="var", array_label=True,
            python=python, arrayplot=arrayplot, cluster=cluster,
            libpath=libpath)

    # Remove the leftover cluster files, if any were written.
    trash_files = [
        file_layout.DS_PROC_CLUSTER_TRASH1,
        file_layout.DS_PROC_CLUSTER_TRASH2,
        file_layout.DS_FINAL_CLUSTER_TRASH1,
        file_layout.DS_FINAL_CLUSTER_TRASH2,
        ]
    for f in trash_files:
        if os.path.exists(f):
            os.unlink(f)
def summarize_heatmap(python, arrayplot, cluster, libpath, file_layout):
    """Draw the probabilities heatmap and archive the cluster output.

    Plots file_layout.PROBABILITIES_PCL to file_layout.PROBABILITIES_PNG
    with genes and arrays clustered, prints whatever
    graphlib.plot_heatmap returns, then moves the CDT, GTR and ATR files
    (those that exist) into file_layout.ATTIC.

    python, arrayplot, cluster and libpath are forwarded unchanged to
    graphlib.plot_heatmap.
    """
    from genomicode import graphlib

    # Bug: what if there are nan's in the probabilities?
    x_pixels, y_pixels = 30, 30
    plot_result = graphlib.plot_heatmap(
        file_layout.PROBABILITIES_PCL, file_layout.PROBABILITIES_PNG,
        x_pixels, y_pixels,
        color="bild", show_colorbar=True, show_grid=True,
        gene_label=True, cluster_genes=True,
        array_label=True, cluster_arrays=True,
        scale=-0.5, gain=2.0, no_autoscale=True,
        python=python, arrayplot=arrayplot, cluster=cluster,
        libpath=libpath)
    print(plot_result)

    # Clean up some extra files: move each one that exists into the
    # attic, keeping its file name.
    leftovers = (
        file_layout.PROBABILITIES_CDT,
        file_layout.PROBABILITIES_GTR,
        file_layout.PROBABILITIES_ATR,
        )
    for src in leftovers:
        if not os.path.exists(src):
            continue
        dst = os.path.join(file_layout.ATTIC, os.path.basename(src))
        os.rename(src, dst)
def summarize_heatmap(python, arrayplot, cluster, file_layout, libpath=None):
    """Draw the predictions heatmap and archive the cluster output.

    Reads file_layout.PREDICTIONS_PCL to size the plot, renders it to
    file_layout.PREDICTIONS_PNG with the arrays clustered, prints whatever
    graphlib.plot_heatmap returns, and moves the CDT and ATR files (those
    that exist) into file_layout.ATTIC.

    Parameters:
      python, arrayplot, cluster -- forwarded unchanged to
          graphlib.plot_heatmap.
      file_layout -- object providing the PREDICTIONS_* and ATTIC paths.
      libpath -- forwarded to graphlib.plot_heatmap; defaults to a new
          empty list.

    Raises AssertionError if the PNG does not exist after plotting.
    """
    import arrayio
    from genomicode import graphlib

    # A literal [] default would be shared by every call; build a fresh
    # list instead.
    if libpath is None:
        libpath = []

    M_predict = arrayio.read(file_layout.PREDICTIONS_PCL)
    nrow, ncol = M_predict.dim()

    # Set the size of the plot.
    xpix, ypix = graphlib.find_wide_heatmap_size(
        nrow, ncol, min_box_width=12, min_box_height=12,
        height_width_ratio=nrow * 1.618 / ncol)
    x = graphlib.plot_heatmap(
        file_layout.PREDICTIONS_PCL, file_layout.PREDICTIONS_PNG,
        xpix, ypix,
        color="bild", show_colorbar=True, show_grid=True,
        scale=-0.5, gain=1.5, no_autoscale=True,
        gene_label=True, array_label=True, cluster_arrays=True,
        python=python, arrayplot=arrayplot, cluster=cluster,
        libpath=libpath)
    print(x)

    # The cluster files are deliberately not deleted: they might be
    # required if people want to plot the data themselves with other
    # plotting software.  Move them to the attic instead.
    cluster_files = [
        file_layout.PREDICTIONS_CDT,
        file_layout.PREDICTIONS_ATR,
        ]
    for src in cluster_files:
        if not os.path.exists(src):
            continue
        dst = os.path.join(file_layout.ATTIC, os.path.split(src)[1])
        os.rename(src, dst)

    # Make sure the heatmap was generated correctly.  An error could
    # mean that arrayplot.py or cluster is missing.
    assert os.path.exists(file_layout.PREDICTIONS_PNG), \
        "Failed to make predictions heatmap."
def summarize_factor_scores(file_layout, python, arrayplot, cluster, libpath):
    """Write the projected factor scores and plot them as a heatmap.

    The factor names are read from "factorids.txt" inside the BFRM model
    archive, the factor matrix is read from file_layout.BFRM_AF, and both
    are reordered by model["FACTOR_O"].  The result is written to
    file_layout.FACTOR_SCORES in PCL format and plotted to
    file_layout.FACTOR_SCORES_PNG.  Cluster output files that exist are
    moved into file_layout.ATTIC.

    python, arrayplot, cluster and libpath are forwarded unchanged to
    graphlib.plot_heatmap.

    Raises AssertionError if the model is not a zip file, lacks
    factorids.txt, or if the matrix dimensions are inconsistent.
    """
    import zipfile
    import arrayio
    from genomicode import Matrix
    from genomicode import jmath
    from genomicode import archive
    from genomicode import graphlib
    from genomicode import bfrm

    DATA = arrayio.read(file_layout.DATASET)

    param_file = "parameters.txt"
    model = bfrm.read_clean_model(
        file_layout.BFRM_MODEL, param_file=param_file)
    num_factors = model["F"].nrow()

    # Load the factor names.
    assert zipfile.is_zipfile(file_layout.BFRM_MODEL)
    s2f = archive.unzip_dict(file_layout.BFRM_MODEL)
    assert "factorids.txt" in s2f, "Missing: factorids.txt"
    # Close both the archive and the member handle explicitly so no file
    # descriptor is left open after the names are read.
    zfile = zipfile.ZipFile(file_layout.BFRM_MODEL)
    try:
        handle = zfile.open(s2f["factorids.txt"])
        try:
            factor_names = [line.strip() for line in handle]
        finally:
            handle.close()
    finally:
        zfile.close()
    assert len(factor_names) == num_factors

    # sample x factor matrix
    F = arrayio.read(file_layout.BFRM_AF)
    assert F.nrow() == DATA.ncol()
    F_X = jmath.transpose(F._X)
    # F_X contains all factors, including intercept and design.
    # Remove all but the latent factors.
    # NOTE(review): when num_factors is 0 this slice is F_X[-0:], which
    # keeps every row; the length assertion below is then the only guard.
    F_X = F_X[-num_factors:]
    # Sort the factors so they'll be in the same order as the clean
    # model.
    assert len(F_X) == len(model["FACTOR_O"])
    F_X = [F_X[i] for i in model["FACTOR_O"]]
    factor_names = [factor_names[i] for i in model["FACTOR_O"]]

    # Write out the projected factor scores.
    SAMPLE_NAME = arrayio.tdf.SAMPLE_NAME
    row_names = {}
    col_names = {}
    row_names["xID"] = factor_names
    col_names[SAMPLE_NAME] = DATA.col_names(SAMPLE_NAME)
    M = Matrix.InMemoryMatrix(F_X, row_names, col_names)
    arrayio.pcl_format.write(M, file_layout.FACTOR_SCORES)

    # Make the heatmap.
    xpix, ypix = graphlib.find_wide_heatmap_size(
        M.nrow(), M.ncol(), min_box_height=10, min_box_width=10,
        max_total_height=768, max_total_width=1024)
    # Cap ypix at four times xpix.
    ypix = min(ypix, xpix * 4)
    graphlib.plot_heatmap(
        file_layout.FACTOR_SCORES, file_layout.FACTOR_SCORES_PNG,
        xpix, ypix,
        color="bild", show_colorbar=True, show_grid=True,
        gene_center="mean", gene_normalize="var",
        gene_label=True, cluster_genes=True,
        array_label=True, cluster_arrays=True,
        python=python, arrayplot=arrayplot, cluster=cluster,
        libpath=libpath)

    # Clean up the cluster files.
    files = [
        file_layout.FACTOR_CDT,
        file_layout.FACTOR_ATR,
        file_layout.FACTOR_GTR,
        ]
    for filename in files:
        if not os.path.exists(filename):
            continue
        src = filename
        dst = os.path.join(file_layout.ATTIC, os.path.split(filename)[1])
        os.rename(src, dst)
def summarize_gene_factor_probs(file_layout, factor_cutoff, python,
                                arrayplot, cluster, libpath):
    """Write the gene/factor probability tables and plot a heatmap.

    model["PostPib"] is written to file_layout.FACTOR_PROBS, labelled
    with the row annotations of the genes in model["VariablesIn"] and the
    factor names taken from file_layout.FACTOR_SCORES, and then plotted
    to file_layout.FACTOR_PROBS_PNG.  If the model carries a truthy
    "ExternalProb" entry, it is written to file_layout.FACTOR_PROBS_ALL
    with the row annotations of the full data set.

    Nothing is written when PostPib has no columns.  python, arrayplot,
    cluster and libpath are forwarded unchanged to graphlib.plot_heatmap.
    """
    import arrayio
    from genomicode import Matrix
    from genomicode import graphlib

    def _row_annotations(matrix):
        # Return matrix.row_names() plus a dict mapping each of those
        # entries to matrix.row_names(entry).
        order = matrix.row_names()
        return order, dict(
            (header, matrix.row_names(header)) for header in order)

    model = _read_model(file_layout, factor_cutoff)
    post_probs = model["PostPib"]
    external_probs = model.get("ExternalProb")

    # If there were no factors, then don't generate any files.
    if not post_probs.ncol():
        print("Not generating factor probabilities file. No factors detected.")
        return

    # Pull out the gene names.
    DATA = arrayio.read(file_layout.DATASET)
    model_genes = DATA.matrix(model["VariablesIn"], None)

    # Pull out the factor names.
    assert os.path.exists(file_layout.FACTOR_SCORES)
    scores = arrayio.read(file_layout.FACTOR_SCORES)
    factor_names = scores.row_names(arrayio.ROW_ID)
    assert len(factor_names) == post_probs.ncol()

    # Write the probabilities for the genes in the model.
    SAMPLE_NAME = arrayio.tdf.SAMPLE_NAME
    row_order, row_names = _row_annotations(model_genes)
    col_names = {SAMPLE_NAME: factor_names}
    M = Matrix.InMemoryMatrix(
        post_probs._X, row_names, col_names, row_order)
    arrayio.tab_delimited_format.write(M, file_layout.FACTOR_PROBS)

    # Make heatmap of the factor probs.
    # NOTE(review): the colorbar option is disabled here; show_grid is
    # taken to be active -- confirm against the original layout.
    xpix, ypix = 20, 20
    graphlib.plot_heatmap(
        file_layout.FACTOR_PROBS, file_layout.FACTOR_PROBS_PNG, xpix, ypix,
        color="red",
        show_grid=True, array_label=True, gene_label=True,
        scale=-0.5, gain=2.0,
        python=python, arrayplot=arrayplot, cluster=cluster,
        libpath=libpath)

    # If exists, write the probabilities for all genes in the data set.
    if not external_probs:
        return
    row_order, row_names = _row_annotations(DATA)
    col_names = {SAMPLE_NAME: factor_names}
    M = Matrix.InMemoryMatrix(
        external_probs._X, row_names, col_names, row_order)
    arrayio.tab_delimited_format.write(M, file_layout.FACTOR_PROBS_ALL)
def summarize_factor_scores(file_layout, factor_cutoff, python, arrayplot,
                            cluster, libpath):
    """Write the factor scores of the model and plot them as a heatmap.

    model["F"] is labelled with the factor names read from
    file_layout.BFRM_FACTOR_IDS (reordered by model["FACTOR_O"]) and the
    sample names of the data set, written to file_layout.FACTOR_SCORES in
    PCL format, and plotted to file_layout.FACTOR_SCORES_PNG.  Cluster
    output files that exist are moved into file_layout.ATTIC.

    Nothing is written when F has no rows.  python, arrayplot, cluster
    and libpath are forwarded unchanged to graphlib.plot_heatmap.

    Raises AssertionError if the matrix dimensions or the number of
    factor names are inconsistent.
    """
    import arrayio
    from genomicode import Matrix
    from genomicode import graphlib

    DATA = arrayio.read(file_layout.DATASET)
    model = _read_model(file_layout, factor_cutoff)

    F = model["F"]
    # If there were no factors, then don't generate any files.
    if not F.nrow():
        print("Not generating factor scores file. No factors detected.")
        return
    assert F.ncol() == DATA.ncol()

    # Read the factor names.  Close the file explicitly rather than
    # relying on garbage collection.
    handle = open(file_layout.BFRM_FACTOR_IDS)
    try:
        factor_names = [line.strip() for line in handle]
    finally:
        handle.close()
    assert len(factor_names) == F.nrow()
    # The factor names are in the same order as the data files.  Sort
    # them so they'll be in the same order as the clean model.
    factor_names = [factor_names[i] for i in model["FACTOR_O"]]

    SAMPLE_NAME = arrayio.tdf.SAMPLE_NAME
    row_names = {}
    col_names = {}
    row_names["xID"] = factor_names
    col_names[SAMPLE_NAME] = DATA.col_names(SAMPLE_NAME)
    M = Matrix.InMemoryMatrix(F._X, row_names, col_names)
    arrayio.pcl_format.write(M, file_layout.FACTOR_SCORES)

    # Make the heatmap.
    xpix, ypix = graphlib.find_wide_heatmap_size(
        M.nrow(), M.ncol(), min_box_height=10, min_box_width=10,
        max_total_height=768, max_total_width=1024)
    # Cap ypix at four times xpix.
    ypix = min(ypix, xpix * 4)
    # TODO: Don't show array label if there are too many samples.
    graphlib.plot_heatmap(
        file_layout.FACTOR_SCORES, file_layout.FACTOR_SCORES_PNG,
        xpix, ypix,
        color="bild", show_colorbar=True, show_grid=True,
        gene_label=True, cluster_genes=True,
        gene_center="mean", gene_normalize="var",
        array_label=True, cluster_arrays=True,
        python=python, arrayplot=arrayplot, cluster=cluster,
        libpath=libpath)

    # Clean up some of the cluster files.
    files = [
        file_layout.FACTOR_CDT,
        file_layout.FACTOR_ATR,
        file_layout.FACTOR_GTR,
        ]
    for filename in files:
        if not os.path.exists(filename):
            continue
        src = filename
        dst = os.path.join(file_layout.ATTIC, os.path.split(filename)[1])
        os.rename(src, dst)