示例#1
0
def _parse_cluster(options_cluster, indexes_include_headers, MATRIX):
    # Return a vector of clusters, where each cluster is an integer
    # from 0 to K-1.  K is the total number of clusters.  The length
    # of the vector should be the same as the number of annotations in
    # the matrix.
    from genomicode import parselib

    index2cluster = {}
    for clust_i, s in enumerate(options_cluster):
        ranges = parselib.parse_ranges(s)
        clean = []
        for s, e in ranges:
            # Ranges are 1-based, inclusive.  Convert to 0-based exclusive.
            s = s - 1
            if indexes_include_headers:
                s, e = s - 1, e - 1  # assume 1 header
                #i -= len(MATRIX._row_names)
            clean.append((s, e))
        # Set the clusters.
        for s, e in clean:
            for i in range(s, e):
                assert i < MATRIX.num_annots(), \
                       "Index %d out of range" % i
                assert i not in index2cluster, \
                       "Index %d in multiple clusters" % i
                index2cluster[i] = clust_i
    #cluster = [len(options_cluster)] * MATRIX.ncol()
    cluster = [None] * MATRIX.num_annots()
    for i, g in index2cluster.iteritems():
        cluster[i] = g
    return cluster
示例#2
0
def parse_indexes(MATRIX, indexes, indexes_include_headers):
    """Expand a range string into a list of 0-based column indexes.

    indexes is passed to parselib.parse_ranges; the resulting
    (first, last) pairs are treated as 1-based and inclusive.  If
    indexes_include_headers is true, the number of row-name headers in
    MATRIX is subtracted from both ends first.

    Raises AssertionError if the matrix has no columns or if a range
    reaches outside 1..MATRIX.ncol().
    """
    from genomicode import parselib

    ncol = MATRIX.ncol()
    header_count = len(MATRIX._row_names)
    assert ncol, "empty matrix"

    # Amount to subtract so that index 1 refers to the first data column.
    shift = 0
    if indexes_include_headers:
        shift = header_count

    selected = []
    for first, last in parselib.parse_ranges(indexes):
        first -= shift
        last -= shift
        assert first >= 1, "Index out of range: %s" % first
        assert last <= ncol, "Index out of range: %s" % last
        # 1-based inclusive -> 0-based with an exclusive end.
        stop = min(last, ncol)
        selected += range(first - 1, stop)
    return selected
示例#3
0
def _parse_cluster(options_cluster, indexes_include_headers, MATRIX):
    # Return a vector of clusters, where each cluster is an integer
    # from 0 to K-1.  K is the total number of clusters.  The length
    # of the vector should be the same as the number of samples in the
    # matrix.
    from genomicode import parselib

    index2cluster = {}
    for clust_i, s in enumerate(options_cluster):
        ranges = parselib.parse_ranges(s)
        for s, e in ranges:
            for i in range(s - 1, e):
                if indexes_include_headers:
                    i -= len(MATRIX._row_names)
                assert i < MATRIX.ncol(), "Index %d out of range" % i
                assert i not in index2cluster, \
                       "Index %d in multiple clusters" % i
                index2cluster[i] = clust_i
    #cluster = [len(options_cluster)] * MATRIX.ncol()
    cluster = [None] * MATRIX.ncol()
    for i, g in index2cluster.iteritems():
        cluster[i] = g
    return cluster
示例#4
0
def main():
    """Command-line entry point.

    Parses the options, validates the dataset path and the penalties,
    reads the dataset with arrayio and converts it to GCT format, and
    then, for each requested penalty, runs the model / prediction /
    summary steps through the helper functions of this module.  When
    more than one penalty is given, the per-penalty prediction files
    are also copied to their global locations and summarize_subgroups
    is called at the end.

    Invalid command-line input is reported with parser.error, which
    prints the usage message and exits.
    """
    import shutil
    from optparse import OptionParser, OptionGroup

    # matrix_file should be a pathway x sample file.
    usage = "usage: %prog [options] <dataset>"
    parser = OptionParser(usage=usage, version="%prog 01")

    parser.add_option("",
                      "--selap",
                      dest="selap_path",
                      default=None,
                      help="Specify the path to SELAPv3.")
    parser.add_option("",
                      "--matlab",
                      dest="matlab",
                      default="matlab",
                      help="Specify the command to run matlab.")
    parser.add_option("",
                      "--python",
                      dest="python",
                      default=None,
                      help="Specify the command to run python (optional).")
    parser.add_option("",
                      "--arrayplot",
                      dest="arrayplot",
                      default=None,
                      help="Specify the command to run arrayplot.")
    parser.add_option("",
                      "--cluster",
                      dest="cluster",
                      default=None,
                      help="Specify the command to run cluster.")
    # This doesn't give as much control over exactly which python
    # version is run.
    #parser.add_option(
    #    "", "--binpath", dest="binpath", action="append", default=[],
    #    help="Add to the binary search path.")
    parser.add_option("",
                      "--libpath",
                      dest="libpath",
                      action="append",
                      default=[],
                      help="Add to the Python library search path.")
    parser.add_option("-o",
                      "--outpath",
                      dest="outpath",
                      type="string",
                      default=None,
                      help="Save files in this path.")
    parser.add_option("-z",
                      "--archive",
                      dest="archive",
                      action="store_true",
                      default=None,
                      help="Archive the raw output.  Helpful for GenePattern.")

    group = OptionGroup(parser, "Model Parameters")
    # Higher numbers have more groups.
    # Range from 0 and lower.
    group.add_option(
        "-p",
        "--penalty",
        dest="penalty",
        default="-33",
        help="Penalty for tuning number of subgroups (default -33).")
    group.add_option(
        "-m",
        "--model",
        dest="model_file",
        default=None,
        help="Specify a file that contains a pre-built subtype model.")
    parser.add_option_group(group)

    # Parse the input arguments.
    options, args = parser.parse_args()
    if len(args) != 1:
        parser.error("Please specify a file with pathway probabilities.")
    filename, = args
    if not os.path.exists(filename):
        parser.error("I could not find file %s." % filename)

    if "." in options.penalty:
        parser.error("Penalties should be integers.")

    if options.libpath:
        sys.path = options.libpath + sys.path
    # Import after the library path is set.
    import arrayio
    from genomicode import genepattern
    from genomicode import archive
    from genomicode import parselib

    genepattern.fix_environ_path()

    # Maximum number of models that someone can create at a time.
    MAX_MODELS = 50

    # Allow people to supply more than one penalty.  Parse into a list
    # of ranges.  Penalties must be integers.
    penalties = []
    for (start, end) in parselib.parse_ranges(options.penalty):
        penalties.extend(range(start, end + 1))
    # Report bad penalties the same way as the other argument errors
    # above, rather than with asserts that vanish under "python -O".
    if len(penalties) > MAX_MODELS:
        parser.error("Too many penalties (max is %d)." % MAX_MODELS)
    if not penalties:
        parser.error("At least one penalty must be specified.")
    if options.model_file and len(penalties) != 1:
        parser.error(
            "Only one penalty can be specified with a pre-built model.")
    for p in penalties:
        if p > 0:
            parser.error("Penalties should be negative.")

    num_analyses = len(penalties)

    # Set up the files.
    file_layout = make_file_layout(options.outpath, num_analyses, penalties[0])
    init_paths(file_layout)

    # Read the matrix and convert to GCT format.
    MATRIX = arrayio.read(filename)
    MATRIX = arrayio.convert(MATRIX, to_format=arrayio.gct_format)

    # Align this matrix to the SELAP model, if it already exists.
    if options.model_file:
        MATRIX = align_dataset(MATRIX, options.model_file)
    # Write out the data set.
    write_dataset(file_layout.DATASET, MATRIX)

    for penalty in penalties:
        # Set up the files.
        file_layout = make_file_layout(options.outpath, num_analyses, penalty)
        init_paths(file_layout)

        # Make the model.
        write_selap_dataset(file_layout)
        if options.model_file:
            write_model(options.model_file, file_layout)
        else:
            make_model(options.selap_path, penalty, file_layout,
                       options.matlab)

        # Predict the subgroups.
        predict_subgroups(options.selap_path, file_layout, options.matlab)

        # Generate some files for output.
        summarize_predictions(file_layout)
        summarize_heatmap(options.python, options.arrayplot, options.cluster,
                          file_layout, options.libpath)

        # Archive the SELAP stuff, and any other big files.
        if options.archive:
            print("Archiving results.")
            archive.zip_path(file_layout.SELAP, noclobber=False)
            archive.zip_path(file_layout.ATTIC, noclobber=False)

        if num_analyses <= 1:
            continue
        # Now do some cleanup if multiple analyses were requested.

        # If there were multiple penalties specified, make a copy of
        # some files for convenience.
        fl = file_layout
        files_to_copy = [
            (fl.PREDICTIONS_PCL, fl.GLOBAL_PREDICTIONS_PCL),
            (fl.PREDICTIONS_PNG, fl.GLOBAL_PREDICTIONS_PNG),
        ]
        for src, dst in files_to_copy:
            assert os.path.exists(src), "Missing file: %s" % src
            # Copy without going through a shell, so that paths
            # containing quotes or other special characters work and a
            # failed copy raises instead of being ignored.  Like
            # "cp -p", copy2 also copies the mode and timestamps.
            shutil.copy2(src, dst)

        if options.archive:
            archive.zip_path(file_layout.ANALYSIS)
        sys.stdout.flush()

    if num_analyses > 1:
        summarize_subgroups(options.outpath, num_analyses, penalties)

    print("Done.")
示例#5
0
def resolve_classes(MATRIX, indexes1, indexes2, count_headers, name1, name2):
    """Split the columns of MATRIX into two named classes.

    indexes1 is a non-empty range string selecting the first class.
    indexes2 is a range string selecting the second class, or a false
    value (e.g. None), in which case the second class is every column
    not in the first.  Range strings are parsed with
    parselib.parse_ranges; the resulting pairs are treated as 1-based
    and inclusive.  If count_headers is true, the number of row-name
    headers in MATRIX is subtracted from every index first.

    name1 and name2 are optional class names.  A missing name defaults
    to the sample name when its class holds exactly one column and the
    matrix has column names, and to "group1" / "group2" otherwise.  If
    both names end up equal, "-1" and "-2" are appended.

    Returns a tuple (name1, name2, classes), where classes is a list
    with MATRIX.ncol() elements that are 0 (first class), 1 (second
    class), or None (in neither class).

    Raises AssertionError if the matrix is empty, indexes1 is not a
    non-empty string, an index is out of range, or the classes overlap.
    """
    from genomicode import parselib

    max_index = MATRIX.ncol()
    num_headers = len(MATRIX._row_names)
    assert max_index, "empty matrix"

    assert indexes1 and isinstance(indexes1, str), \
           "indexes1 must be a non-empty string"

    def to_indexes(spec):
        # Expand a range string into a list of 0-based column indexes.
        expanded = []
        for s, e in parselib.parse_ranges(spec):
            if count_headers:
                s, e = s - num_headers, e - num_headers
            assert s >= 1, "Index out of range: %s" % s
            assert e <= max_index, "Index out of range: %s" % e
            # 1-based inclusive -> 0-based with an exclusive end.
            expanded.extend(range(s - 1, min(e, max_index)))
        return expanded

    I1 = to_indexes(indexes1)
    # Set copy of I1 so the membership tests below are constant time.
    in_class1 = set(I1)

    if indexes2:
        I2 = to_indexes(indexes2)
    else:
        # If indexes2 not given, then I2 should be every index that's
        # not in I1.
        I2 = [i for i in range(max_index) if i not in in_class1]

    # Make sure no overlap between I1 and I2.
    assert not in_class1.intersection(I2), "Overlap in classes."

    # Provide default group names.
    # If there is only 1 index, then use the sample name from the
    # matrix.
    col_headers = MATRIX.col_names()
    col_header = None
    if col_headers:
        col_header = col_headers[0]
    if not name1 and col_header and len(I1) == 1:
        name1 = MATRIX.col_names(col_header)[I1[0]]
    if not name2 and col_header and len(I2) == 1:
        name2 = MATRIX.col_names(col_header)[I2[0]]

    name1 = name1 or "group1"
    name2 = name2 or "group2"
    if name1 == name2:
        name1 = "%s-1" % name1
        name2 = "%s-2" % name2

    classes = [None] * max_index
    for i in I1:
        classes[i] = 0
    for i in I2:
        classes[i] = 1

    return name1, name2, classes