def _parse_cluster(options_cluster, indexes_include_headers, MATRIX):
    """Return the cluster assignment for every annotation in MATRIX.

    options_cluster is a sequence of range strings (parsed with
    parselib.parse_ranges).  The position of each string in the sequence
    is its cluster number, so clusters are integers from 0 to K-1, where
    K is the total number of clusters.  Ranges are 1-based and inclusive.
    If indexes_include_headers is true, the indexes are shifted down by
    one (a single header is assumed).

    Returns a list with MATRIX.num_annots() entries.  Each entry is the
    cluster number of that annotation, or None if it was not assigned to
    any cluster.

    Raises AssertionError if an index falls outside the annotations or is
    assigned to more than one cluster.
    """
    from genomicode import parselib

    num_annots = MATRIX.num_annots()
    # Assume exactly 1 header when the indexes count headers.
    header_shift = 1 if indexes_include_headers else 0

    index2cluster = {}
    for clust_i, range_str in enumerate(options_cluster):
        for start, end in parselib.parse_ranges(range_str):
            # Ranges are 1-based, inclusive.  Convert to 0-based,
            # exclusive, then remove the header offset.
            first = start - 1 - header_shift
            stop = end - header_shift
            for i in range(first, stop):
                # A negative index would not fail below; it would
                # silently address the list from its end, so reject it.
                assert 0 <= i < num_annots, \
                    "Index %d out of range" % i
                assert i not in index2cluster, \
                    "Index %d in multiple clusters" % i
                index2cluster[i] = clust_i

    cluster = [None] * num_annots
    for i, g in index2cluster.items():
        cluster[i] = g
    return cluster
def parse_indexes(MATRIX, indexes, indexes_include_headers):
    """Expand a range string into a list of 0-based column indexes.

    indexes is parsed with parselib.parse_ranges into (start, end) pairs
    that are treated as 1-based and inclusive.  If
    indexes_include_headers is true, the number of row-name headers in
    MATRIX is subtracted from both ends first.

    Returns the indexes as a flat list, in the order the ranges were
    given.  Raises AssertionError if the matrix has no columns or a
    range falls outside 1..MATRIX.ncol().
    """
    from genomicode import parselib

    num_cols = MATRIX.ncol()
    header_count = len(MATRIX._row_names)
    assert num_cols, "empty matrix"

    selected = []
    for first, last in parselib.parse_ranges(indexes):
        if indexes_include_headers:
            first = first - header_count
            last = last - header_count
        assert first >= 1, "Index out of range: %s" % first
        assert last <= num_cols, "Index out of range: %s" % last
        # Convert to 0-based, exclusive; never run past the last column.
        selected.extend(range(first - 1, min(last, num_cols)))
    return selected
def _parse_cluster(options_cluster, indexes_include_headers, MATRIX):
    """Return the cluster assignment for every sample (column) in MATRIX.

    options_cluster is a sequence of range strings (parsed with
    parselib.parse_ranges).  The position of each string in the sequence
    is its cluster number, so clusters are integers from 0 to K-1, where
    K is the total number of clusters.  Ranges are 1-based and inclusive.
    If indexes_include_headers is true, the number of row-name headers in
    MATRIX is subtracted from every index.

    Returns a list with MATRIX.ncol() entries.  Each entry is the cluster
    number of that sample, or None if it was not assigned to any cluster.

    Raises AssertionError if an index falls outside the columns or is
    assigned to more than one cluster.
    """
    from genomicode import parselib

    num_cols = MATRIX.ncol()
    # Only look at the headers when the indexes actually count them.
    header_shift = 0
    if indexes_include_headers:
        header_shift = len(MATRIX._row_names)

    index2cluster = {}
    for clust_i, range_str in enumerate(options_cluster):
        for start, end in parselib.parse_ranges(range_str):
            # Ranges are 1-based, inclusive.  Convert to 0-based,
            # exclusive, then remove the header offset.
            for i in range(start - 1 - header_shift, end - header_shift):
                # A negative index would not fail below; it would
                # silently address the list from its end, so reject it.
                assert 0 <= i < num_cols, "Index %d out of range" % i
                assert i not in index2cluster, \
                    "Index %d in multiple clusters" % i
                index2cluster[i] = clust_i

    cluster = [None] * num_cols
    for i, g in index2cluster.items():
        cluster[i] = g
    return cluster
def main():
    """Command-line entry point: build and apply SELAP subtype models.

    Takes one positional argument, a file with pathway probabilities
    (a pathway x sample matrix).  The matrix is read, converted to GCT
    format and written out.  Then, for every requested penalty, a model
    is made (or a pre-built one is used), subgroups are predicted, and
    the predictions are summarized.  When several penalties are given,
    the per-penalty prediction files are also copied to their global
    locations and the subgroups are summarized across analyses.

    Exits through parser.error() on bad command-line usage and raises
    AssertionError on invalid penalties.
    """
    import shutil
    from optparse import OptionParser, OptionGroup

    # matrix_file should be a pathway x sample file.
    usage = "usage: %prog [options] <dataset>"
    parser = OptionParser(usage=usage, version="%prog 01")

    parser.add_option(
        "", "--selap", dest="selap_path", default=None,
        help="Specify the path to SELAPv3.")
    parser.add_option(
        "", "--matlab", dest="matlab", default="matlab",
        help="Specify the command to run matlab.")
    parser.add_option(
        "", "--python", dest="python", default=None,
        help="Specify the command to run python (optional).")
    parser.add_option(
        "", "--arrayplot", dest="arrayplot", default=None,
        help="Specify the command to run arrayplot.")
    parser.add_option(
        "", "--cluster", dest="cluster", default=None,
        help="Specify the command to run cluster.")
    # This doesn't give as much control over exactly which python
    # version is run.
    #parser.add_option(
    #    "", "--binpath", dest="binpath", action="append", default=[],
    #    help="Add to the binary search path.")
    parser.add_option(
        "", "--libpath", dest="libpath", action="append", default=[],
        help="Add to the Python library search path.")
    parser.add_option(
        "-o", "--outpath", dest="outpath", type="string", default=None,
        help="Save files in this path.")
    parser.add_option(
        "-z", "--archive", dest="archive", action="store_true", default=None,
        help="Archive the raw output. Helpful for GenePattern.")

    group = OptionGroup(parser, "Model Parameters")
    # Higher numbers have more groups.
    # Range from 0 and lower.
    group.add_option(
        "-p", "--penalty", dest="penalty", default="-33",
        help="Penalty for tuning number of subgroups (default -33).")
    group.add_option(
        "-m", "--model", dest="model_file", default=None,
        help="Specify a file that contains a pre-built subtype model.")
    parser.add_option_group(group)

    # Parse the input arguments.
    options, args = parser.parse_args()
    if len(args) != 1:
        parser.error("Please specify a file with pathway probabilities.")
    filename, = args
    if not os.path.exists(filename):
        parser.error("I could not find file %s." % filename)

    if "." in options.penalty:
        parser.error("Penalties should be integers.")

    if options.libpath:
        sys.path = options.libpath + sys.path
    # Import after the library path is set.
    import arrayio
    from genomicode import genepattern
    from genomicode import archive
    from genomicode import parselib

    genepattern.fix_environ_path()

    # Maximum number of models that someone can create at a time.
    MAX_MODELS = 50

    # Allow people to supply more than one penalty.  Parse into a list
    # of ranges.  Penalties must be integers.
    penalties = []
    for (start, end) in parselib.parse_ranges(options.penalty):
        penalties.extend(range(start, end + 1))
    assert len(penalties) <= MAX_MODELS, "Too many penalties (max is %d)." % \
        MAX_MODELS
    assert penalties, "At least one penalty must be specified."
    assert not (options.model_file and len(penalties) != 1), \
        "Exactly one penalty must be given when a model file is specified."
    for p in penalties:
        assert p <= 0, "Penalties should be negative."

    num_analyses = len(penalties)

    # Set up the files.
    file_layout = make_file_layout(options.outpath, num_analyses, penalties[0])
    init_paths(file_layout)

    # Read the matrix and convert to GCT format.
    MATRIX = arrayio.read(filename)
    MATRIX = arrayio.convert(MATRIX, to_format=arrayio.gct_format)

    # Align this matrix to the SELAP model, if it already exists.
    if options.model_file:
        MATRIX = align_dataset(MATRIX, options.model_file)
    # Write out the data set.
    write_dataset(file_layout.DATASET, MATRIX)

    for penalty in penalties:
        # Set up the files.
        file_layout = make_file_layout(options.outpath, num_analyses, penalty)
        init_paths(file_layout)

        # Make the model.
        write_selap_dataset(file_layout)
        if options.model_file:
            write_model(options.model_file, file_layout)
        else:
            make_model(options.selap_path, penalty, file_layout,
                       options.matlab)

        # Predict the subgroups.
        predict_subgroups(options.selap_path, file_layout, options.matlab)

        # Generate some files for output.
        summarize_predictions(file_layout)
        summarize_heatmap(options.python, options.arrayplot, options.cluster,
                          file_layout, options.libpath)

        # Archive the SELAP stuff, and any other big files.
        if options.archive:
            print("Archiving results.")
            archive.zip_path(file_layout.SELAP, noclobber=False)
            archive.zip_path(file_layout.ATTIC, noclobber=False)

        if num_analyses <= 1:
            continue
        # Now do some cleanup if multiple analyses were requested.

        # If there were multiple penalties specified, make a copy of
        # some files for convenience.
        fl = file_layout
        files_to_copy = [
            (fl.PREDICTIONS_PCL, fl.GLOBAL_PREDICTIONS_PCL),
            (fl.PREDICTIONS_PNG, fl.GLOBAL_PREDICTIONS_PNG),
        ]
        for src, dst in files_to_copy:
            assert os.path.exists(src)
            # Copy without going through a shell, so that paths with
            # quotes or other special characters work and a failed copy
            # raises instead of being ignored.  copy2 also keeps the
            # file's mode and timestamps.
            shutil.copy2(src, dst)

        if options.archive:
            archive.zip_path(file_layout.ANALYSIS)
        sys.stdout.flush()

    if num_analyses > 1:
        summarize_subgroups(options.outpath, num_analyses, penalties)

    print("Done.")
def resolve_classes(MATRIX, indexes1, indexes2, count_headers, name1, name2):
    """Split the columns of MATRIX into two classes and name them.

    indexes1 is a range string for the first class.  indexes2 is a range
    string for the second class, or None (or empty), in which case the
    second class is every column that is not in the first.  Ranges are
    parsed with parselib.parse_ranges and treated as 1-based, inclusive.
    If count_headers is true, the number of row-name headers in MATRIX
    is subtracted from each index first.

    name1 and name2 are the class names.  A missing name defaults to the
    column name when the class has exactly one column (taken from the
    first column header of the matrix), otherwise to "group1" /
    "group2".  If both names end up equal, "-1" and "-2" are appended.

    Returns (name1, name2, classes), where classes has one entry per
    column: 0 for the first class, 1 for the second, None for neither.

    Raises AssertionError if the matrix has no columns, indexes1 is not
    a non-empty string, an index is out of range, or the classes overlap.
    """
    from genomicode import parselib

    max_index = MATRIX.ncol()
    num_headers = len(MATRIX._row_names)
    assert max_index, "empty matrix"
    assert indexes1 and isinstance(indexes1, str)

    def expand(ranges_str):
        # Turn a range string into a list of 0-based column indexes.
        expanded = []
        for s, e in parselib.parse_ranges(ranges_str):
            if count_headers:
                s, e = s - num_headers, e - num_headers
            assert s >= 1, "Index out of range: %s" % s
            assert e <= max_index, "Index out of range: %s" % e
            expanded.extend(range(s - 1, min(e, max_index)))
        return expanded

    I1 = expand(indexes1)
    # Set for constant-time membership tests below.
    in_class1 = set(I1)
    if indexes2:
        I2 = expand(indexes2)
    else:
        # If indexes2 not given, then I2 should be every index that's
        # not in I1.
        I2 = [i for i in range(max_index) if i not in in_class1]

    # Make sure no overlap between I1 and I2.
    assert in_class1.isdisjoint(I2), "Overlap in classes."

    # Provide default group names.
    # If there is only 1 index, then use the sample name from the
    # matrix.
    col_header = None
    if MATRIX.col_names():
        col_header = MATRIX.col_names()[0]
    if not name1 and col_header and len(I1) == 1:
        name1 = MATRIX.col_names(col_header)[I1[0]]
    if not name2 and col_header and len(I2) == 1:
        name2 = MATRIX.col_names(col_header)[I2[0]]
    name1 = name1 or "group1"
    name2 = name2 or "group2"
    if name1 == name2:
        name1 = "%s-1" % name1
        name2 = "%s-2" % name2

    classes = [None] * MATRIX.ncol()
    for i in I1:
        classes[i] = 0
    for i in I2:
        classes[i] = 1
    return name1, name2, classes