예제 #1
0
def summarize_heatmaps(python, arrayplot, cluster, file_layout, libpath=None):
    """Plot heatmaps of the processed and the final filtered data sets.

    Both data sets are read in GCT format and must have identical
    dimensions, so one plot size is computed and shared by both
    heatmaps.  Afterwards, the cluster "trash" files named in
    file_layout are deleted if they exist.

    python, arrayplot, cluster
        Passed through unchanged to graphlib.plot_heatmap (presumably
        the interpreter and helper programs it runs -- confirm against
        graphlib).
    file_layout
        Object providing the DS_PROC_FILTERED, DS_FINAL_FILTERED,
        DS_PROC_HEATMAP and DS_FINAL_HEATMAP paths, plus the
        DS_PROC_CLUSTER_TRASH* / DS_FINAL_CLUSTER_TRASH* paths.
    libpath
        Optional list passed through to graphlib.plot_heatmap.  When
        omitted, an empty list is used.

    Raises AssertionError if the two data sets differ in dimensions.
    """
    import arrayio
    from genomicode import graphlib

    # Build a fresh list on every call; a literal [] default would be
    # one shared object that the callee could mutate across calls.
    if libpath is None:
        libpath = []

    # Load the data sets.
    DATA_orig = arrayio.gct_format.read(file_layout.DS_PROC_FILTERED)
    DATA_final = arrayio.gct_format.read(file_layout.DS_FINAL_FILTERED)
    assert DATA_final.dim() == DATA_orig.dim()

    # The dimensions match, so one size fits both plots.
    nrow, ncol = DATA_orig.dim()
    xpix, ypix = graphlib.find_tall_heatmap_size(
        nrow, ncol, max_total_height=2000, max_total_width=2000)

    # (input data, output image) pairs.  The final data set is plotted
    # first, then the processed one.
    plots = [
        (file_layout.DS_FINAL_FILTERED, file_layout.DS_FINAL_HEATMAP),
        (file_layout.DS_PROC_FILTERED, file_layout.DS_PROC_HEATMAP),
    ]
    for infile, outfile in plots:
        graphlib.plot_heatmap(infile,
                              outfile,
                              xpix,
                              ypix,
                              color="bild",
                              show_colorbar=True,
                              show_grid=True,
                              gene_center="mean",
                              gene_normalize="var",
                              array_label=True,
                              python=python,
                              arrayplot=arrayplot,
                              cluster=cluster,
                              libpath=libpath)

    # Remove the by-product files left next to the heatmaps.
    trash_files = [
        file_layout.DS_PROC_CLUSTER_TRASH1,
        file_layout.DS_PROC_CLUSTER_TRASH2,
        file_layout.DS_FINAL_CLUSTER_TRASH1,
        file_layout.DS_FINAL_CLUSTER_TRASH2,
    ]
    for f in trash_files:
        if os.path.exists(f):
            os.unlink(f)
예제 #2
0
def summarize_heatmap(python, arrayplot, cluster, libpath, file_layout):
    """Plot the probabilities heatmap and archive the cluster files.

    Renders file_layout.PROBABILITIES_PCL to
    file_layout.PROBABILITIES_PNG with genes and arrays clustered,
    prints whatever graphlib.plot_heatmap returns, and then moves the
    PROBABILITIES_CDT / _GTR / _ATR files (those that exist) into the
    file_layout.ATTIC directory.

    python, arrayplot, cluster, libpath are passed through unchanged
    to graphlib.plot_heatmap.
    """
    from genomicode import graphlib

    # Bug: what if there are nan's in the probabilities?
    box_width = box_height = 30
    output = graphlib.plot_heatmap(
        file_layout.PROBABILITIES_PCL,
        file_layout.PROBABILITIES_PNG,
        box_width,
        box_height,
        color="bild",
        show_colorbar=True,
        show_grid=True,
        gene_label=True,
        cluster_genes=True,
        array_label=True,
        cluster_arrays=True,
        scale=-0.5,
        gain=2.0,
        no_autoscale=True,
        python=python,
        arrayplot=arrayplot,
        cluster=cluster,
        libpath=libpath)
    print(output)

    # Clean up some extra files: move each one that exists into the
    # attic, keeping its base name.  Attributes are looked up one at a
    # time, in CDT, GTR, ATR order.
    for attr in ("PROBABILITIES_CDT", "PROBABILITIES_GTR",
                 "PROBABILITIES_ATR"):
        if not os.path.exists(getattr(file_layout, attr)):
            continue
        source = getattr(file_layout, attr)
        basename = os.path.split(getattr(file_layout, attr))[1]
        target = os.path.join(file_layout.ATTIC, basename)
        os.rename(source, target)
예제 #3
0
def summarize_heatmap(python, arrayplot, cluster, file_layout, libpath=None):
    """Plot the predictions heatmap and archive the cluster files.

    Reads file_layout.PREDICTIONS_PCL to size the plot, renders it to
    file_layout.PREDICTIONS_PNG with the arrays clustered, prints
    whatever graphlib.plot_heatmap returns, and moves the
    PREDICTIONS_CDT / PREDICTIONS_ATR files (those that exist) into
    file_layout.ATTIC.

    python, arrayplot, cluster
        Passed through unchanged to graphlib.plot_heatmap.
    file_layout
        Object providing the PREDICTIONS_* paths and ATTIC.
    libpath
        Optional list passed through to graphlib.plot_heatmap.  When
        omitted, an empty list is used.

    Raises AssertionError if the PNG file does not exist afterwards.
    """
    import arrayio
    from genomicode import graphlib

    # Build a fresh list on every call; a literal [] default would be
    # one shared object that the callee could mutate across calls.
    if libpath is None:
        libpath = []

    M_predict = arrayio.read(file_layout.PREDICTIONS_PCL)
    nrow, ncol = M_predict.dim()

    # Set the size of the plot.
    xpix, ypix = graphlib.find_wide_heatmap_size(
        nrow,
        ncol,
        min_box_width=12,
        min_box_height=12,
        height_width_ratio=nrow * 1.618 / ncol)

    x = graphlib.plot_heatmap(file_layout.PREDICTIONS_PCL,
                              file_layout.PREDICTIONS_PNG,
                              xpix,
                              ypix,
                              color="bild",
                              show_colorbar=True,
                              show_grid=True,
                              scale=-0.5,
                              gain=1.5,
                              no_autoscale=True,
                              gene_label=True,
                              array_label=True,
                              cluster_arrays=True,
                              python=python,
                              arrayplot=arrayplot,
                              cluster=cluster,
                              libpath=libpath)
    print(x)

    # Clean up some of the cluster files.  They are moved to the attic
    # rather than deleted, because people might want to plot them
    # themselves with other plotting software.
    for src in (file_layout.PREDICTIONS_CDT, file_layout.PREDICTIONS_ATR):
        if not os.path.exists(src):
            continue
        dst = os.path.join(file_layout.ATTIC, os.path.split(src)[1])
        os.rename(src, dst)

    # Make sure the signature was generated correctly.  An error could
    # mean that arrayplot.py or cluster is missing.
    assert os.path.exists(file_layout.PREDICTIONS_PNG), \
           "Failed to make predictions heatmap."
예제 #4
0
def summarize_factor_scores(file_layout, python, arrayplot, cluster, libpath):
    """Write the projected factor scores and plot them as a heatmap.

    Reads the data set (file_layout.DATASET), the cleaned BFRM model
    and the factor names stored in the model archive
    (file_layout.BFRM_MODEL), and the factor matrix
    (file_layout.BFRM_AF).  The latent factors are reordered with
    model["FACTOR_O"], written in PCL format to
    file_layout.FACTOR_SCORES, and plotted to
    file_layout.FACTOR_SCORES_PNG.  The FACTOR_CDT / _ATR / _GTR files
    (those that exist) are then moved into file_layout.ATTIC.

    python, arrayplot, cluster, libpath are passed through unchanged
    to graphlib.plot_heatmap.

    Raises AssertionError if the model file is not a zip archive, if it
    lacks factorids.txt, or if the factor/sample counts are
    inconsistent.
    """
    import zipfile
    from contextlib import closing
    import arrayio
    from genomicode import Matrix
    from genomicode import jmath
    from genomicode import archive
    from genomicode import graphlib
    from genomicode import bfrm

    DATA = arrayio.read(file_layout.DATASET)

    param_file = "parameters.txt"
    model = bfrm.read_clean_model(file_layout.BFRM_MODEL,
                                  param_file=param_file)
    num_factors = model["F"].nrow()

    # Load the factor names.
    assert zipfile.is_zipfile(file_layout.BFRM_MODEL)
    s2f = archive.unzip_dict(file_layout.BFRM_MODEL)
    assert "factorids.txt" in s2f, "Missing: factorids.txt"
    # Close both the archive and the member handle once the names have
    # been read, instead of leaving them to the garbage collector.
    with closing(zipfile.ZipFile(file_layout.BFRM_MODEL)) as zfile:
        with closing(zfile.open(s2f["factorids.txt"])) as handle:
            factor_names = [line.strip() for line in handle]
    assert len(factor_names) == num_factors

    # sample x factor matrix
    F = arrayio.read(file_layout.BFRM_AF)
    assert F.nrow() == DATA.ncol()
    F_X = jmath.transpose(F._X)

    # F_X contains all factors, including intercept and design.
    # Remove all but the latent factors.
    # NOTE: when num_factors is 0, the slice [-0:] keeps every row, so
    # the length assertion below fails unless F_X is empty too.
    F_X = F_X[-num_factors:]

    # Sort the factors so they'll be in the same order as the clean
    # model.
    assert len(F_X) == len(model["FACTOR_O"])
    F_X = [F_X[i] for i in model["FACTOR_O"]]
    factor_names = [factor_names[i] for i in model["FACTOR_O"]]

    # Write out the projected factor scores.
    SAMPLE_NAME = arrayio.tdf.SAMPLE_NAME
    row_names = {}
    col_names = {}
    row_names["xID"] = factor_names
    col_names[SAMPLE_NAME] = DATA.col_names(SAMPLE_NAME)
    M = Matrix.InMemoryMatrix(F_X, row_names, col_names)
    arrayio.pcl_format.write(M, file_layout.FACTOR_SCORES)

    # Make the heatmap.
    xpix, ypix = graphlib.find_wide_heatmap_size(M.nrow(),
                                                 M.ncol(),
                                                 min_box_height=10,
                                                 min_box_width=10,
                                                 max_total_height=768,
                                                 max_total_width=1024)
    # Cap the box height at four times the box width.
    ypix = min(ypix, xpix * 4)
    graphlib.plot_heatmap(file_layout.FACTOR_SCORES,
                          file_layout.FACTOR_SCORES_PNG,
                          xpix,
                          ypix,
                          color="bild",
                          show_colorbar=True,
                          show_grid=True,
                          gene_center="mean",
                          gene_normalize="var",
                          gene_label=True,
                          cluster_genes=True,
                          array_label=True,
                          cluster_arrays=True,
                          python=python,
                          arrayplot=arrayplot,
                          cluster=cluster,
                          libpath=libpath)

    # Clean up the cluster files: move each one that exists into the
    # attic, keeping its base name.
    files = [
        file_layout.FACTOR_CDT, file_layout.FACTOR_ATR, file_layout.FACTOR_GTR
    ]
    for filename in files:
        if not os.path.exists(filename):
            continue
        dst = os.path.join(file_layout.ATTIC, os.path.split(filename)[1])
        os.rename(filename, dst)
예제 #5
0
def summarize_gene_factor_probs(file_layout, factor_cutoff, python, arrayplot,
                                cluster, libpath):
    """Write the gene x factor probabilities and plot them.

    Reads the model through _read_model.  If model["PostPib"] has no
    columns, prints a message and returns without writing anything.
    Otherwise the probabilities for the genes in the model are written
    to file_layout.FACTOR_PROBS (tab-delimited) and plotted to
    file_layout.FACTOR_PROBS_PNG.  If the model also has a truthy
    "ExternalProb" entry, it is written to
    file_layout.FACTOR_PROBS_ALL, annotated with the rows of the whole
    data set.

    python, arrayplot, cluster, libpath are passed through unchanged
    to graphlib.plot_heatmap.

    Raises AssertionError if file_layout.FACTOR_SCORES is missing or
    its number of rows differs from the number of PostPib columns.
    """
    import arrayio
    from genomicode import Matrix
    from genomicode import graphlib

    model = _read_model(file_layout, factor_cutoff)
    PostPib = model["PostPib"]
    ExternalProb = model.get("ExternalProb")

    # If there were no factors, then don't generate any files.
    if not PostPib.ncol():
        print("Not generating factor probabilities file.  No factors detected.")
        return

    # Pull out the gene names.
    DATA = arrayio.read(file_layout.DATASET)
    DATA_m = DATA.matrix(model["VariablesIn"], None)

    # Pull out the factor names.
    assert os.path.exists(file_layout.FACTOR_SCORES)
    scores = arrayio.read(file_layout.FACTOR_SCORES)
    factor_names = scores.row_names(arrayio.ROW_ID)
    assert len(factor_names) == PostPib.ncol()

    SAMPLE_NAME = arrayio.tdf.SAMPLE_NAME

    def write_probabilities(values, annotated, outfile):
        # Annotate values with every row header of annotated, label the
        # columns with the factor names, and write a tab-delimited file.
        headers = annotated.row_names()
        annotations = dict(
            (header, annotated.row_names(header)) for header in headers)
        columns = {SAMPLE_NAME: factor_names}
        matrix = Matrix.InMemoryMatrix(values, annotations, columns, headers)
        arrayio.tab_delimited_format.write(matrix, outfile)

    # Write the probabilities for the genes in the model.
    write_probabilities(PostPib._X, DATA_m, file_layout.FACTOR_PROBS)

    # Make heatmap of the factor probs, with fixed-size boxes.
    box_width, box_height = 20, 20
    graphlib.plot_heatmap(
        file_layout.FACTOR_PROBS,
        file_layout.FACTOR_PROBS_PNG,
        box_width,
        box_height,
        color="red",
        array_label=True,
        gene_label=True,
        scale=-0.5,
        gain=2.0,
        python=python,
        arrayplot=arrayplot,
        cluster=cluster,
        libpath=libpath)

    # If exists, write the probabilities for all genes in the data set.
    if ExternalProb:
        write_probabilities(
            ExternalProb._X, DATA, file_layout.FACTOR_PROBS_ALL)
예제 #6
0
def summarize_factor_scores(file_layout, factor_cutoff, python, arrayplot,
                            cluster, libpath):
    """Write the factor scores of the model and plot them as a heatmap.

    Reads the data set (file_layout.DATASET) and the model (through
    _read_model).  If model["F"] has no rows, prints a message and
    returns without writing anything.  Otherwise the factor names are
    read from file_layout.BFRM_FACTOR_IDS (one per line) and reordered
    with model["FACTOR_O"]; the scores are written in PCL format to
    file_layout.FACTOR_SCORES and plotted to
    file_layout.FACTOR_SCORES_PNG.  The FACTOR_CDT / _ATR / _GTR files
    (those that exist) are then moved into file_layout.ATTIC.

    python, arrayplot, cluster, libpath are passed through unchanged
    to graphlib.plot_heatmap.

    Raises AssertionError if the number of columns of F differs from
    that of the data set, or the number of factor names differs from
    the number of rows of F.
    """
    import arrayio
    from genomicode import Matrix
    from genomicode import graphlib

    DATA = arrayio.read(file_layout.DATASET)
    model = _read_model(file_layout, factor_cutoff)

    F = model["F"]
    # If there were no factors, then don't generate any files.
    if not F.nrow():
        print("Not generating factor scores file.  No factors detected.")
        return
    assert F.ncol() == DATA.ncol()

    # Read the factor names.  The file is closed as soon as the names
    # have been read.
    with open(file_layout.BFRM_FACTOR_IDS) as handle:
        factor_names = [line.strip() for line in handle]
    assert len(factor_names) == F.nrow()
    # The factor names are in the same order as the data files.  Sort
    # them so they'll be in the same order as the clean model.
    factor_names = [factor_names[i] for i in model["FACTOR_O"]]

    SAMPLE_NAME = arrayio.tdf.SAMPLE_NAME
    row_names = {}
    col_names = {}
    row_names["xID"] = factor_names
    col_names[SAMPLE_NAME] = DATA.col_names(SAMPLE_NAME)
    M = Matrix.InMemoryMatrix(F._X, row_names, col_names)
    arrayio.pcl_format.write(M, file_layout.FACTOR_SCORES)

    # Make the heatmap.
    xpix, ypix = graphlib.find_wide_heatmap_size(M.nrow(),
                                                 M.ncol(),
                                                 min_box_height=10,
                                                 min_box_width=10,
                                                 max_total_height=768,
                                                 max_total_width=1024)
    # Cap the box height at four times the box width.
    ypix = min(ypix, xpix * 4)
    # TODO: Don't show array label if there are too many samples.
    graphlib.plot_heatmap(file_layout.FACTOR_SCORES,
                          file_layout.FACTOR_SCORES_PNG,
                          xpix,
                          ypix,
                          color="bild",
                          show_colorbar=True,
                          show_grid=True,
                          gene_label=True,
                          cluster_genes=True,
                          gene_center="mean",
                          gene_normalize="var",
                          array_label=True,
                          cluster_arrays=True,
                          python=python,
                          arrayplot=arrayplot,
                          cluster=cluster,
                          libpath=libpath)

    # Clean up some of the cluster files: move each one that exists
    # into the attic, keeping its base name.
    files = [
        file_layout.FACTOR_CDT, file_layout.FACTOR_ATR, file_layout.FACTOR_GTR
    ]
    for filename in files:
        if not os.path.exists(filename):
            continue
        dst = os.path.join(file_layout.ATTIC, os.path.split(filename)[1])
        os.rename(filename, dst)