def make_model(selap_path, penalty, file_layout, matlab):
    """Run SELAP to generate a subgroup model and package it as a zip file.

    selap_path   Passed to selap.selap_make_raw as selap_path.
    penalty      Integer penalty passed to selap.selap_make_raw.
    file_layout  Object providing the SELAP_DATASET, SELAP, SELAP_MU,
                 SELAP_SIG, SELAP_PROB, SMODEL_ZIP and DATASET paths.
    matlab       Passed to selap.selap_make_raw as matlab_bin.

    The mu/sig/prob files written by SELAP are moved into a model
    directory (SMODEL_ZIP without its ".zip" suffix) together with a
    generated var.txt (row IDs of DATASET) and clust.txt (default
    subgroup names).  The directory is then zipped and the resulting
    model is validated with check_model().

    Raises AssertionError if SELAP did not produce its output files, if
    the data set does not match the number of variables in the model,
    or if the zip file is missing afterwards.
    """
    import arrayio
    from genomicode import parselib
    from genomicode import archive
    from genomicode import selap

    def write_lines(filename, lines):
        # Write one item per line.  The handle is closed even if a
        # write fails, so no descriptor is leaked.
        handle = open(filename, 'w')
        try:
            for line in lines:
                handle.write("%s\n" % (line,))
        finally:
            handle.close()

    print("Generating subgroups with penalty %d." % penalty)
    output = selap.selap_make_raw(
        file_layout.SELAP_DATASET, penalty, matlab_bin=matlab,
        selap_path=selap_path, outpath=file_layout.SELAP)
    print(output)

    # Make sure SELAP ran correctly.
    msg = "Missing file. SELAPver3 did not run correctly."
    for filename in (file_layout.SELAP_MU, file_layout.SELAP_SIG,
                     file_layout.SELAP_PROB):
        assert os.path.exists(filename), msg

    # Figure out the number of variables and the number of subgroups.
    X = arrayio.read(file_layout.SELAP_MU)
    num_vars, num_subgroups = X.dim()

    # Make the model directory.  Only strip a trailing ".zip"; a plain
    # str.replace would also mangle any ".zip" that appears earlier in
    # the path (e.g. in a parent directory name).
    opj = os.path.join
    path = file_layout.SMODEL_ZIP
    if path.endswith(".zip"):
        path = path[:-len(".zip")]
    if not os.path.exists(path):
        os.mkdir(path)

    # Move over the files generated by SELAP.
    os.rename(file_layout.SELAP_MU, opj(path, "mu.txt"))
    os.rename(file_layout.SELAP_SIG, opj(path, "sig.txt"))
    os.rename(file_layout.SELAP_PROB, opj(path, "prob.txt"))

    # Generate the var.txt file.
    M = arrayio.read(file_layout.DATASET)
    assert M.nrow() == num_vars
    names = M.row_names(arrayio.ROW_ID)
    assert len(names) == num_vars
    write_lines(opj(path, "var.txt"), names)

    # Generate the clust.txt file.
    # Set the names of the subgroups to a reasonable default.
    group_names = [
        "GROUP%s" % x for x in parselib.pretty_range(0, num_subgroups)]
    write_lines(opj(path, "clust.txt"), group_names)

    # NOTE(review): archive.zip_path presumably writes <path>.zip, which
    # is what the assertion below relies on -- confirm.
    archive.zip_path(path, noclobber=False)
    assert os.path.exists(file_layout.SMODEL_ZIP)

    check_model(file_layout.SMODEL_ZIP)
def main():
    """Command-line entry point: normalize one or more data sets with BFRM.

    Parses the options, checks the input files and the requested number
    of factors, reads the matrices, converts them to GCT format, runs
    log_matrices() and run_bfrm(), and then writes the summary files.
    With -z/--archive the BFRM and ATTIC paths are zipped at the end.

    Exits through parser.error() when no input files are given or the
    number of factors is out of range.  Raises AssertionError if an
    input file cannot be found.
    """
    from optparse import OptionParser

    parser = OptionParser(
        usage="usage: %prog [options] <file1> <file2> ...",
        version="%prog 01")

    parser.add_option(
        "-f", "--num_factors", dest="num_factors", type="int", default=15,
        help="Number of factors to use for normalization.")
    # Any string in the control probe file can be a control probe.
    # Delimited by tabs and newlines.
    parser.add_option(
        "", "--control_probe_file", dest="control_probe_file", default=None,
        help="File that contains the control probes.")
    parser.add_option(
        "", "--python", dest="python", default=None,
        help="Specify the command to run python (optional).")
    parser.add_option(
        "", "--bfrm", dest="bfrm_path", default=None,
        help="Specify the path to the BFRM_normalize directory.")
    parser.add_option(
        "", "--matlab", dest="matlab", default="matlab",
        help="Specify the command to run matlab.")
    parser.add_option(
        "", "--arrayplot", dest="arrayplot", default=None,
        help="Specify the command to run arrayplot.")
    parser.add_option(
        "", "--povray", dest="povray", default="povray",
        help="Specify the command to run povray.")
    parser.add_option(
        "", "--cluster", dest="cluster", default=None,
        help="Specify the command to run cluster.")
    parser.add_option(
        "", "--libpath", dest="libpath", action="append", default=[],
        help="Add to the Python library search path.")
    parser.add_option(
        "-o", "--outpath", dest="outpath", type="string", default=None,
        help="Save files in this path.")
    parser.add_option(
        "-z", "--archive", dest="archive", action="store_true", default=None,
        help="Archive the raw output. Helpful for GenePattern.")

    # Parse the arguments.
    options, args = parser.parse_args()

    if options.libpath:
        sys.path = options.libpath + sys.path
    # These imports must come after the library path is set.
    import time
    import arrayio
    from genomicode import filelib
    from genomicode import archive
    from genomicode import genepattern

    start_time = time.time()

    genepattern.fix_environ_path()

    if not args:
        parser.error("Please specify files to normalize.")
    filenames = args
    names = [os.path.basename(filename) for filename in filenames]
    for filename in filenames:
        assert filelib.exists(filename), "File not found: %s" % filename

    # Check to make sure value for num_factors is reasonable.
    MIN_FACTORS, MAX_FACTORS = 1, 100
    if options.num_factors < MIN_FACTORS:
        # Pick the singular or plural form of the message.
        template = "At least %d factor is required."
        if MIN_FACTORS != 1:
            template = "At least %d factors are required."
        parser.error(template % MIN_FACTORS)
    elif options.num_factors > MAX_FACTORS:
        parser.error(
            "%d factors is too many. Maximum is %d." % (
                options.num_factors, MAX_FACTORS))

    # Set up the files.
    file_layout = make_file_layout(options.outpath)
    init_paths(file_layout)

    # Read each of the input files and align them.
    matrices = read_matrices(filenames)

    # Make sure the number of factors don't exceed the size of the
    # matrices.
    if matrices and options.num_factors > matrices[0].nrow():
        parser.error("Too many factors.")

    # Standardize each of the matrices to GCT format (in place).
    for index, matrix in enumerate(matrices):
        matrices[index] = arrayio.convert(
            matrix, to_format=arrayio.gct_format)
    write_dataset(file_layout.DS_ORIG, matrices)

    # Log each of the matrices if needed.
    log_matrices(names, matrices)
    write_dataset(file_layout.DS_PROC, matrices)
    sys.stdout.flush()

    # Format the parameters and output files for bfrm.
    run_bfrm(
        options.bfrm_path, options.num_factors, options.control_probe_file,
        file_layout, options.matlab)

    # Generate some files for output.
    summarize_dataset(file_layout)
    summarize_filtered_genes(file_layout)
    summarize_heatmaps(
        options.python, options.arrayplot, options.cluster, file_layout,
        options.libpath)
    summarize_pca(options.povray, file_layout, matrices)
    summarize_report(
        filenames, matrices, options.num_factors, start_time, file_layout)

    # Archive the BFRM stuff, and the big files.
    if options.archive:
        print("Archiving results.")
        archive.zip_path(file_layout.BFRM, noclobber=False)
        archive.zip_path(file_layout.ATTIC, noclobber=False)
        #archive.zip_path(file_layout.DS_PROC, noclobber=False)
        #archive.zip_path(file_layout.DS_FINAL, noclobber=False)

    print("Done.")
def main():
    """Command-line entry point: project a data set onto a BFRM model.

    Expects exactly two positional arguments, the BFRM model file and
    the data set.  The data set is read, converted to GCT format, passed
    through log_matrix() and written out; the model is saved with
    write_model(); run_bfrm_project() is run and the factor scores are
    summarized.  With -z/--archive the ATTIC and BFRM paths are zipped.

    Exits through parser.error() on a wrong number of arguments and
    raises AssertionError if either input file does not exist.
    """
    from optparse import OptionParser

    parser = OptionParser(
        usage="usage: %prog [options] <bfrm_model> <dataset>",
        version="%prog 01")

    parser.add_option(
        "", "--bfrm_path", dest="bfrm_path", default=None,
        help="Specify the path to BFRM_project.")
    parser.add_option(
        "", "--matlab", dest="matlab", default="matlab",
        help="Specify the command to run matlab.")
    parser.add_option(
        "", "--python", dest="python", default=None,
        help="Specify the command to run python (optional).")
    parser.add_option(
        "", "--arrayplot", dest="arrayplot", default=None,
        help="Specify the command to run arrayplot.")
    parser.add_option(
        "", "--cluster", dest="cluster", default=None,
        help="Specify the command to run cluster.")
    parser.add_option(
        "", "--libpath", dest="libpath", action="append", default=[],
        help="Add to the Python library search path.")
    parser.add_option(
        "-o", "--outpath", dest="outpath", type="string", default=None,
        help="Save files in this path.")
    parser.add_option(
        "-z", "--archive", dest="archive", action="store_true", default=None,
        help="Archive the raw output. Helpful for GenePattern.")

    # Parse the arguments.
    options, args = parser.parse_args()

    if options.libpath:
        sys.path = options.libpath + sys.path
    # These imports must come after the library path is set.
    import arrayio
    from genomicode import archive
    from genomicode import genepattern

    genepattern.fix_environ_path()

    if len(args) != 2:
        parser.error("Please specify files.")
    model_file, filename = args
    for required_file in (model_file, filename):
        assert os.path.exists(required_file), \
            "File not found: %s" % required_file

    # Set up the files.
    file_layout = make_file_layout(options.outpath)
    init_paths(file_layout)

    # Read the matrix and convert to GCT format.
    MATRIX = arrayio.convert(
        arrayio.read(filename), to_format=arrayio.gct_format)
    num_genes, num_samples = MATRIX.nrow(), MATRIX.ncol()
    print("Read data set with %d genes and %d samples." % (
        num_genes, num_samples))

    log_matrix(MATRIX)

    # Write out the data sets.
    write_dataset(file_layout.DATASET, MATRIX)

    # Save the BFRM model.
    write_model(model_file, file_layout)

    # Run BFRM projection.
    run_bfrm_project(file_layout, options.bfrm_path, options.matlab)

    # Generate output files.
    summarize_factor_scores(
        file_layout, options.python, options.arrayplot, options.cluster,
        options.libpath)

    if options.archive:
        print("Archiving results.")
        archive.zip_path(file_layout.ATTIC, noclobber=False)
        archive.zip_path(file_layout.BFRM, noclobber=False)

    print("Done.")
def main():
    """Command-line entry point: score signatures against expression data.

    At least one of the RMA (-r), MAS5 (-m) or Illumina (-i) data sets
    must be given.  The signatures are read from the signature database,
    filtered to those whose normalization matches a supplied data set,
    optionally adjusted by the GenePattern --gp_imod_all_vars string and
    limited by --max_signatures.  The data sets are column-aligned and
    written in GCT format, one pybinreg command is built and run per
    signature (in parallel when -j > 1 and GNU parallel is found), and
    the results are summarized.  With -z the ATTIC path and each
    signature's output path are zipped.

    Exits through parser.error() on stray positional arguments or an
    invalid -j value.  Raises AssertionError when a data file or the
    signature database is missing, when no data set is given, when no
    signature survives the normalization filter, or when the data sets
    cannot be aligned.
    """
    from optparse import OptionParser, OptionGroup

    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage, version="%prog 01")

    parser.add_option(
        "-r", "--rma", dest="rma_dataset", type="string", default=None,
        help="Specify the RMA-normalized data to analyze.")
    parser.add_option(
        "-m", "--mas5", dest="mas5_dataset", type="string", default=None,
        help="Specify the MAS5-normalized data to analyze.")
    parser.add_option(
        "-i", "--illu", dest="illu_dataset", type="string", default=None,
        help="Specify the Illumina data to analyze.")
    parser.add_option(
        "", "--sigdb_path", dest="sigdb_path", type="string", default=None,
        help="Location of the sigdb/ directory.")
    parser.add_option(
        "", "--sigtag", dest="signature_tags", default=[], action="append",
        help="Specify a specific tag to use.")
    parser.add_option(
        "", "--sigid", dest="signature_ids", default=[], action="append",
        help="Specify a specific signature to use.")
    parser.add_option(
        "", "--max_signatures", dest="max_signatures", type="int",
        default=None,
        help="Maximum number of signatures to run (for DEBUGGING).")
    parser.add_option(
        "-j", "", dest="num_procs", type="int", default=1,
        help="Number of jobs to run in parallel.")
    parser.add_option(
        "-z", "", dest="archive", action="store_true", default=False,
        help="Archive the individual signatures. Helpful for GenePattern.")
    parser.add_option(
        "", "--libpath", dest="libpath", action="append", default=[],
        help="Add to the Python library search path.")
    parser.add_option(
        "-o", "--outpath", dest="outpath", type="string", default=None,
        help="Save files in this path.")
    parser.add_option(
        "", "--gp_imod_all_vars", dest="gp_imod_all_vars", type="string",
        default=None,
        help="Special internal variable for use with GenePattern "
        "interactive modules.")
    parser.add_option(
        "", "--debug_gp_imod_all_vars", action="store_true", default=False,
        dest="debug_gp_imod_all_vars",
        )

    group = OptionGroup(parser, "Pybinreg")
    group.add_option(
        "", "--python", dest="python", default=None,
        help="Specify the command to run python.")
    group.add_option(
        "", "--matlab", dest="matlab", default=None,
        help="Specify the command to run matlab.")
    group.add_option(
        "", "--povray", dest="povray", default=None,
        help="Specify the command to run povray.")
    group.add_option(
        "", "--cluster", dest="cluster", default=None,
        help="Specify the command to run cluster.")
    group.add_option(
        "", "--binreg", dest="binreg_path", default=None,
        help="Specify the path to the BinReg2.0 code.")
    group.add_option(
        "", "--pybinreg", dest="pybinreg", default=None,
        help="Specify the command to run pybinreg.py.")
    group.add_option(
        "", "--arrayplot", dest="arrayplot", default=None,
        help="Specify the command to run arrayplot.")
    parser.add_option_group(group)

    options, args = parser.parse_args()
    if args:
        parser.error("Too many arguments.")

    # DEBUG the gp_imod_all_vars variable.
    if options.debug_gp_imod_all_vars:
        assert not options.gp_imod_all_vars
        options.gp_imod_all_vars = (
            "mas5_expression_file_cb=file&mas5_expression_file_url=&"
            "rma_expression_file_cb=file&rma_expression_file_url=&"
            # Skip AKT signature.
            "sig_AKT=no&"
            # Change BCAT normalization.
            "sig_BCAT=yes (custom parameters)&"
            "sig_BCAT_apply_quantile_normalization=no&"
            "sig_BCAT_apply_shiftscale_normalization=no&"
            "sig_BCAT_num_genes=85&sig_BCAT_num_metagenes=2&"
            # No changes in E2F1.
            "sig_E2F1=yes (custom parameters)&"
            "sig_E2F1_apply_quantile_normalization=yes&"
            "sig_E2F1_apply_shiftscale_normalization=yes&"
            "sig_E2F1_num_genes=150&sig_E2F1_num_metagenes=2&"
            # Change genes in EGFR.
            "sig_EGFR=yes (custom parameters)&"
            "sig_EGFR_apply_quantile_normalization=no&"
            "sig_EGFR_apply_shiftscale_normalization=yes&"
            #"sig_EGFR_num_genes=50000&sig_EGFR_num_metagenes=2&"
            "sig_EGFR_num_genes=501&sig_EGFR_num_metagenes=2&"
            # Change quantile, genes, metagenes in ER.
            "sig_ER=yes (custom parameters)&"
            "sig_ER_apply_quantile_normalization=no&"
            "sig_ER_apply_shiftscale_normalization=yes&"
            "sig_ER_num_genes=150&sig_ER_num_metagenes=3&"
            "sig_HER2=yes (default parameters)&"
            "sig_IFNalpha=yes (default parameters)&"
            "sig_IFNgamma=yes (default parameters)&"
            "sig_MYC=yes (default parameters)&"
            "sig_P53=yes (default parameters)&"
            "sig_P63=yes (default parameters)&"
            "sig_PI3K=yes (default parameters)&"
            "sig_PR=yes (default parameters)&"
            "sig_RAS=yes (default parameters)&"
            "sig_SRC=yes (default parameters)&"
            "sig_STAT3=yes (default parameters)&"
            "sig_TGFB=yes (default parameters)&"
            "sig_TNFa=yes (default parameters)&"
            "which_signatures=I choose myself"
            )

    datafile_rma = datafile_mas5 = datafile_illu = None
    if options.rma_dataset is not None:
        assert os.path.exists(options.rma_dataset), \
            "RMA file not found: %s" % options.rma_dataset
        datafile_rma = os.path.realpath(options.rma_dataset)
    if options.mas5_dataset is not None:
        assert os.path.exists(options.mas5_dataset), \
            "MAS5 file not found: %s" % options.mas5_dataset
        datafile_mas5 = os.path.realpath(options.mas5_dataset)
    if options.illu_dataset is not None:
        assert os.path.exists(options.illu_dataset), \
            "ILLU file not found: %s" % options.illu_dataset
        datafile_illu = os.path.realpath(options.illu_dataset)
    assert datafile_rma or datafile_mas5 or datafile_illu, \
        "Please specify at least one data set."

    if options.libpath:
        sys.path = options.libpath + sys.path
    # Import after the library path is set.
    import time
    import arrayio
    from genomicode import config
    from genomicode import parallel
    from genomicode import archive
    from genomicode import hashlib
    from genomicode import matrixlib
    from genomicode import genepattern

    def write_gct(data, filename):
        # Write a matrix in GCT format and close the handle right away,
        # so the file is complete on disk before the pybinreg
        # subprocesses read it.
        handle = open(filename, 'w')
        try:
            arrayio.gct_format.write(data, handle)
        finally:
            handle.close()

    x = options.sigdb_path or config.sigdb_path
    sigdb_path = os.path.realpath(x)
    assert os.path.exists(sigdb_path), \
        "I could not find the signatures database: %s." % sigdb_path

    start_time = time.time()

    genepattern.fix_environ_path()

    file_layout = make_file_layout(options.outpath)
    init_paths(file_layout)

    # Read the signatures and select the ones to score.
    # BUG: Should allow this to be specified on the command line.
    desired_tags = ["Pathway"]  # default
    if options.signature_tags:
        desired_tags = options.signature_tags[:]
    all_normalization = ["RMA", "MAS5", "ILLU"]
    desired_normalization = []
    if datafile_rma is not None:   # RMA datafile is specified.
        desired_normalization.append("RMA")
    if datafile_mas5 is not None:  # MAS5 datafile is specified.
        desired_normalization.append("MAS5")
    if datafile_illu is not None:  # ILLU datafile is specified.
        desired_normalization.append("ILLU")

    # If any signature IDs are specified, then use only those IDs and
    # ignore the desired tags.
    print("Reading signature database: %s." % sigdb_path)
    desired_ids = []
    if options.signature_ids:
        desired_ids = options.signature_ids[:]
    signatures = read_signatures(
        sigdb_path, all_normalization, desired_ids, desired_tags)
    orig_signatures = signatures[:]

    # Filter for just the normalization that we have data files for.
    # Keep track of why we filtered out certain signatures.
    why_dropped = {}  # ID -> explanation as string
    good = []
    for sig in signatures:
        if sig.Normalization.upper() in desired_normalization:
            good.append(sig)
            continue
        x = "Signature requires %s normalized data, but it was not provided."%(
            sig.Normalization.upper())
        why_dropped[sig.xID] = x
    signatures = good
    assert signatures, "No signatures available."

    # Process additional parameters from GenePattern.
    # o Do this before max_signatures, so that the maximum signatures
    #   is selected only out of the ones that the user specified.
    # o Do this before names and paths, so the variables will be
    #   aligned.
    # gp_imod_all_vars can be None or "".
    if options.gp_imod_all_vars:
        x = process_gp_imod_all_vars(
            options.gp_imod_all_vars, signatures, why_dropped)
        signatures, why_dropped = x
    sys.stdout.flush()

    DATA_rma = DATA_mas5 = DATA_illu = None
    if datafile_rma is not None:
        print("Reading RMA file: %s" % datafile_rma)
        DATA_rma = arrayio.read(datafile_rma)
        DATA_rma = arrayio.convert(DATA_rma, to_format=arrayio.gct_format)
    if datafile_mas5 is not None:
        print("Reading MAS5 file: %s" % datafile_mas5)
        DATA_mas5 = arrayio.read(datafile_mas5)
        DATA_mas5 = arrayio.convert(DATA_mas5, to_format=arrayio.gct_format)
    if datafile_illu is not None:
        print("Reading ILLU file: %s" % datafile_illu)
        DATA_illu = arrayio.read(datafile_illu)
        DATA_illu = arrayio.convert(DATA_illu, to_format=arrayio.gct_format)
    # Don't handle the log.  Let pybinreg do it.

    # Make sure the data sets contain the same samples.  Align them if
    # necessary.  The first available data set is the reference.
    DATA_all = [
        ("DATA_rma", DATA_rma),
        ("DATA_mas5", DATA_mas5),
        ("DATA_illu", DATA_illu)]
    DATA_all = [x for x in DATA_all if x[1]]
    for i in range(1, len(DATA_all)):
        key1, data1 = DATA_all[0]
        key2, data2 = DATA_all[i]
        assert key1 != key2
        assert data1 and data2
        assert data1.ncol() == data2.ncol(), \
            "%s and %s data sets have different numbers of samples." % (
                key1, key2)
        if matrixlib.are_cols_aligned(data1, data2):
            continue
        x = matrixlib.align_cols(data1, data2)
        data1_new, data2_new = x
        assert matrixlib.are_cols_aligned(data1_new, data2_new)
        # The samples in data1 (the reference) should not be changed.
        assert data1.ncol() == data1_new.ncol(), \
            "%s and %s data sets have different samples" % (
                key1, key2)
        assert matrixlib.are_cols_aligned(data1, data1_new)
        DATA_all[i] = key2, data2_new
    for key, data in DATA_all:
        if key == "DATA_rma":
            DATA_rma = data
        elif key == "DATA_mas5":
            DATA_mas5 = data
        elif key == "DATA_illu":
            DATA_illu = data
        else:
            raise AssertionError("Unknown key: %s" % key)

    print("Writing aligned signal files.")
    if DATA_rma:
        write_gct(DATA_rma, file_layout.DATASET_RMA)
    if DATA_mas5:
        write_gct(DATA_mas5, file_layout.DATASET_MAS5)
    if DATA_illu:
        write_gct(DATA_illu, file_layout.DATASET_ILLU)

    # Apply the signature limit BEFORE building names and paths.  These
    # three lists are handed together to extract_reports() and
    # summarize_probabilities(), so they must describe the same
    # signatures; truncating only `signatures` afterwards would leave
    # names/paths pointing at signatures that were never run.
    if options.max_signatures is not None:
        signatures = signatures[:options.max_signatures]

    # Figure out the names and paths for each signature.
    print("Finding signatures.")
    names = []  # SIG19_AKT[_modified]
    paths = []  # <path>/SIG19_AKT[_modified]
    for sig in signatures:
        name = "SIG%02d_%s" % (sig.xID, hashlib.hash_var(sig.Name))
        # If the user has modified the signature from the default
        # parameters, then make a note of it.
        if getattr(sig, "Changed", False):
            name = "%s_modified" % name
        names.append(name)
        paths.append(os.path.join(file_layout.OUTPATH, name))

    # Make a list of the jobs.
    jobs = []  # list of cmd, outpath, outfile
    for sig, outpath in zip(signatures, paths):
        quantile = sig.Quantile.upper()
        assert quantile in ["YES", "NO"]
        quantile_normalize = (quantile == "YES")

        shift_scale = sig.Shift_Scale.upper()
        assert shift_scale in ["YES", "NO"]
        shift_scale_normalize = (shift_scale == "YES")

        outfile = os.path.join(outpath, "out.txt")

        normalization = sig.Normalization.upper()
        if normalization == "RMA":
            datafile = file_layout.DATASET_RMA
            assert DATA_rma
        elif normalization == "MAS5":
            datafile = file_layout.DATASET_MAS5
            assert DATA_mas5
        elif normalization == "ILLU":
            datafile = file_layout.DATASET_ILLU
            assert DATA_illu
        else:
            raise AssertionError("Unknown normalization.")

        # If the entire analysis should be archived, then go ahead and
        # archive each of the pybinreg runs too.  This will prevent
        # large analyses from taking up too much disk space.  The
        # drawback is that the files that are archived are no longer
        # available for use here.  Hopefully this won't be a problem.
        cmd = make_pybinreg_cmd(
            options.pybinreg, options.python, options.binreg_path,
            options.matlab, options.arrayplot, options.povray,
            options.cluster, options.libpath,
            outpath, options.archive, sig.Genes, sig.Metagenes,
            quantile_normalize, shift_scale_normalize,
            sig.Train0, sig.Train1, datafile)
        jobs.append((cmd, outpath, outfile))

    # Run each of the jobs.
    if options.num_procs < 1 or options.num_procs > 100:
        parser.error("Please specify between 1 and 100 processes.")
    if options.num_procs > 1:
        if parallel._find_parallel():
            num_sigs = min(options.num_procs, len(jobs))
            if num_sigs > 1:
                print("Predicting %d signatures at a time." % num_sigs)
        else:
            print("I could not find GNU parallel. "
                  "Predicting 1 signature at a time.")
            options.num_procs = 1
    sys.stdout.flush()

    DEBUG = False   # Can disable pybinreg temporarily for debugging.
    if not DEBUG:
        if options.num_procs <= 1:
            for cmd, outpath, outfile in jobs:
                run_one_pybinreg(cmd, outpath, outfile)
        else:
            run_many_pybinreg(jobs, options.num_procs)

    if signatures:
        print("Extracting the reports from each signature.")
        report_files = extract_reports(names, paths, file_layout)

        print("Combining probabilities from each of the signatures.")
        summarize_probabilities(signatures, names, paths, file_layout)

        print("Making heatmap of the results.")
        sys.stdout.flush()
        summarize_heatmap(
            options.python, options.arrayplot, options.cluster,
            options.libpath, file_layout)

        print("Summarizing signatures.")
        summarize_signatures(signatures, file_layout)

        print("Making a report.")
        analysis_name = make_analysis_name(options)
        summarize_report(
            analysis_name, signatures, orig_signatures, report_files,
            start_time, why_dropped, file_layout)

    if options.archive:
        print("Compressing results.")
        sys.stdout.flush()
        archive.zip_path(file_layout.ATTIC)
        for outpath in paths:
            archive.zip_path(outpath)

    print("Done.")
def main():
    """Command-line entry point: run a BFRM factor analysis on a data set.

    Expects exactly one positional argument, the data set to factor.
    The data set is read and converted to GCT format; a copy is passed
    through log_matrix() and filter_dataset(); both versions are written
    out; run_bfrm() is run and the factor results are summarized.  The
    BFRM path is always zipped; the ATTIC path only with -z/--archive.

    Exits through parser.error() on invalid option values or a wrong
    number of arguments.  Raises AssertionError if the data set does not
    exist, or if a nucleus is given without --num_factors.
    """
    from optparse import OptionParser, OptionGroup

    parser = OptionParser(
        usage="usage: %prog [options] <dataset>", version="%prog 01")

    parser.add_option(
        "", "--python", dest="python", default=None,
        help="Specify the command to run python (optional).")
    parser.add_option(
        "", "--bfrm_bin", dest="bfrm_bin", default=None,
        help="Specify the path to the BFRM binary.")
    parser.add_option(
        "", "--arrayplot", dest="arrayplot", default=None,
        help="Specify the command to run arrayplot.")
    parser.add_option(
        "", "--cluster", dest="cluster", default=None,
        help="Specify the command to run cluster.")
    parser.add_option(
        "", "--libpath", dest="libpath", action="append", default=[],
        help="Add to the Python library search path.")
    parser.add_option(
        "-o", "--outpath", dest="outpath", type="string", default=None,
        help="Save files in this path.")
    parser.add_option(
        "-z", "--archive", dest="archive", action="store_true", default=None,
        help="Archive the raw output. Helpful for GenePattern.")

    filtering = OptionGroup(parser, "Filtering")
    filtering.add_option(
        "--filter_mean", dest="filter_mean", type=float, default=None,
        help="Remove this portion of genes based on mean expression.")
    filtering.add_option(
        "--filter_var", dest="filter_var", type=float, default=None,
        help="Remove this portion of genes based on variance.")
    filtering.add_option(
        "--cutoff", dest="cutoff", type=float, default=0.99,
        help="Cutoff probability for a gene to be in a factor.")
    parser.add_option_group(filtering)

    bfrm_params = OptionGroup(parser, "BFRM Parameters")
    bfrm_params.add_option(
        "--nc", dest="num_control_vars", type="int", default=None,
        help="Specify the number of control variables to use.")
    bfrm_params.add_option(
        "--num_factors", dest="num_factors", type="int", default=None,
        help="The number of factors to fit. "
        "For evolutionary search, starts with this number of factors.")
    bfrm_params.add_option(
        "--design_file", dest="design_file", default=None,
        help="A file containing a matrix with additional design variables.")
    bfrm_params.add_option(
        "--nucleus_file", dest="nucleus_file", default=None,
        help="A file that contains the genes to start the evolution. "
        "This should be a text file that contains a whitespace-separated "
        "list of genes. If this or --nucleus_geneset is given, "
        "the evolutionary search will be turned on.")
    bfrm_params.add_option(
        "--nucleus_geneset", dest="nucleus_geneset", default=None,
        help="A gene set that contains the genes to start the evolution. "
        "Format: <gmx/gmt_file>[,<geneset>,<geneset>,...]")
    bfrm_params.add_option(
        "--evol_max_factors", dest="evol_max_factors", default=None,
        help="Maximum number of factors for the evolution.")
    bfrm_params.add_option(
        "--evol_max_genes", dest="evol_max_genes", default=None,
        help="Maximum number of genes for the evolution.")
    parser.add_option_group(bfrm_params)

    # Parse the arguments.
    options, args = parser.parse_args()

    if options.cutoff <= 0 or options.cutoff > 1:
        parser.error("Cutoff probability should be between 0 and 1.")
    # Both filters share the same range check; unset (or zero) values
    # are skipped.
    for filter_name in ("filter_mean", "filter_var"):
        portion = getattr(options, filter_name)
        if portion and (portion < 0 or portion >= 1):
            parser.error(
                "%s filter should be between 0 and 1." % filter_name)

    if options.libpath:
        sys.path = options.libpath + sys.path
    # These imports must come after the library path is set.
    import arrayio
    from genomicode import archive
    from genomicode import genepattern

    genepattern.fix_environ_path()

    if len(args) != 1:
        parser.error("Please specify a file to factor.")
    filename = args[0]
    assert os.path.exists(filename), "File not found: %s" % filename

    if options.nucleus_file and options.nucleus_geneset:
        parser.error("Please specify either nucleus_file or nucleus_geneset.")

    nucleus = None
    if options.nucleus_file:
        nucleus = _read_nucleus_file(options.nucleus_file)
    elif options.nucleus_geneset:
        nucleus = _read_nucleus_geneset(options.nucleus_geneset)

    # Not sure if this is necessary.  Don't know if BFRM will provide
    # a default if not given.
    if nucleus:
        assert options.num_factors, "Please specify number of factors."

    # Set up the files.
    file_layout = make_file_layout(options.outpath)
    init_paths(file_layout)

    # Read the matrix and convert to GCT format.
    MATRIX_orig = arrayio.convert(
        arrayio.read(filename), to_format=arrayio.gct_format)
    print("Read data set with %d genes and %d samples." % (
        MATRIX_orig.nrow(), MATRIX_orig.ncol()))

    # Make a copy so that in-place changes (like log_matrix) won't
    # affect the original matrix.
    MATRIX = MATRIX_orig.matrix()

    # Log the data set if necessary.
    log_matrix(MATRIX)

    # Filter out based on mean and variance.
    MATRIX = filter_dataset(MATRIX, options.filter_mean, options.filter_var)
    if MATRIX.nrow() != MATRIX_orig.nrow():
        print("Filtered from %d genes to %d." % (
            MATRIX_orig.nrow(), MATRIX.nrow()))

    # Write out the data sets.
    write_dataset(file_layout.DATASET_ORIG, MATRIX_orig)
    write_dataset(file_layout.DATASET, MATRIX)

    # Run BFRM.
    DEBUG = False
    if not DEBUG:
        run_bfrm(
            file_layout, options.bfrm_bin, options.num_control_vars,
            options.num_factors, options.design_file, nucleus,
            options.evol_max_factors, options.evol_max_genes)

    # Generate output files.
    summarize_factor_scores(
        file_layout, options.cutoff, options.python, options.arrayplot,
        options.cluster, options.libpath)
    summarize_gene_factor_probs(
        file_layout, options.cutoff, options.python, options.arrayplot,
        options.cluster, options.libpath)
    summarize_factor_geneset(file_layout, options.cutoff)

    # BFRM model file should always be archived.
    archive.zip_path(file_layout.BFRM, noclobber=False)
    if options.archive:
        print("Archiving results.")
        archive.zip_path(file_layout.ATTIC, noclobber=False)

    print("Done.")
def main():
    """Command-line entry point: predict subgroups with SELAP.

    Expects exactly one positional argument, a data set file (per the
    original note, a pathway x sample file).  The -p/--penalty option
    may describe several integer penalties (parsed with
    parselib.parse_ranges); one analysis is run per penalty.  For each
    penalty the model is either copied from --model or built with
    make_model(), subgroups are predicted and the predictions are
    summarized.  When several penalties are given, the per-analysis
    prediction files are also copied to the GLOBAL_* locations and
    summarize_subgroups() is called at the end.

    Exits through parser.error() on a wrong number of arguments, a
    missing input file, or a penalty containing ".".  Raises
    AssertionError on an invalid set of penalties, and when --model is
    combined with more than one penalty.
    """
    from optparse import OptionParser, OptionGroup

    # matrix_file should be a pathway x sample file.
    usage = "usage: %prog [options] <dataset>"
    parser = OptionParser(usage=usage, version="%prog 01")

    parser.add_option(
        "", "--selap", dest="selap_path", default=None,
        help="Specify the path to SELAPv3.")
    parser.add_option(
        "", "--matlab", dest="matlab", default="matlab",
        help="Specify the command to run matlab.")
    parser.add_option(
        "", "--python", dest="python", default=None,
        help="Specify the command to run python (optional).")
    parser.add_option(
        "", "--arrayplot", dest="arrayplot", default=None,
        help="Specify the command to run arrayplot.")
    parser.add_option(
        "", "--cluster", dest="cluster", default=None,
        help="Specify the command to run cluster.")
    parser.add_option(
        "", "--libpath", dest="libpath", action="append", default=[],
        help="Add to the Python library search path.")
    parser.add_option(
        "-o", "--outpath", dest="outpath", type="string", default=None,
        help="Save files in this path.")
    parser.add_option(
        "-z", "--archive", dest="archive", action="store_true", default=None,
        help="Archive the raw output. Helpful for GenePattern.")

    group = OptionGroup(parser, "Model Parameters")
    # Higher numbers have more groups.
    # Range from 0 and lower.
    group.add_option(
        "-p", "--penalty", dest="penalty", default="-33",
        help="Penalty for tuning number of subgroups (default -33).")
    group.add_option(
        "-m", "--model", dest="model_file", default=None,
        help="Specify a file that contains a pre-built subtype model.")
    parser.add_option_group(group)

    # Parse the input arguments.
    options, args = parser.parse_args()
    if len(args) != 1:
        parser.error("Please specify a file with pathway probabilities.")
    filename, = args
    if not os.path.exists(filename):
        parser.error("I could not find file %s." % filename)

    if "." in options.penalty:
        parser.error("Penalties should be integers.")

    if options.libpath:
        sys.path = options.libpath + sys.path
    # Import after the library path is set.
    import shutil
    import arrayio
    from genomicode import genepattern
    from genomicode import archive
    from genomicode import parselib

    genepattern.fix_environ_path()

    # Maximum number of models that someone can create at a time.
    MAX_MODELS = 50

    # Allow people to supply more than one penalty.  Parse into a list
    # of ranges.  Penalties must be integers.
    penalties = []
    for (start, end) in parselib.parse_ranges(options.penalty):
        penalties.extend(range(start, end + 1))
    assert len(penalties) <= MAX_MODELS, "Too many penalties (max is %d)." % \
        MAX_MODELS
    assert penalties, "At least one penalty must be specified."
    assert not (options.model_file and len(penalties) != 1), \
        "A model file can only be used with a single penalty."
    for p in penalties:
        assert p <= 0, "Penalties should be negative."
    num_analyses = len(penalties)

    # Set up the files.
    file_layout = make_file_layout(options.outpath, num_analyses, penalties[0])
    init_paths(file_layout)

    # Read the matrix and convert to GCT format.
    MATRIX = arrayio.read(filename)
    MATRIX = arrayio.convert(MATRIX, to_format=arrayio.gct_format)

    # Align this matrix to the SELAP model, if it already exists.
    if options.model_file:
        MATRIX = align_dataset(MATRIX, options.model_file)

    # Write out the data set.
    write_dataset(file_layout.DATASET, MATRIX)

    for penalty in penalties:
        # Set up the files.
        file_layout = make_file_layout(options.outpath, num_analyses, penalty)
        init_paths(file_layout)

        # Make the model.
        write_selap_dataset(file_layout)
        if options.model_file:
            write_model(options.model_file, file_layout)
        else:
            make_model(
                options.selap_path, penalty, file_layout, options.matlab)

        # Predict the subgroups.
        predict_subgroups(options.selap_path, file_layout, options.matlab)

        # Generate some files for output.
        summarize_predictions(file_layout)
        summarize_heatmap(
            options.python, options.arrayplot, options.cluster, file_layout,
            options.libpath)

        # Archive the SELAP stuff, and any other big files.
        if options.archive:
            print("Archiving results.")
            archive.zip_path(file_layout.SELAP, noclobber=False)
            archive.zip_path(file_layout.ATTIC, noclobber=False)

        if num_analyses <= 1:
            continue

        # Now do some cleanup if multiple analyses were requested.

        # If there were multiple penalties specified, make a copy of
        # some files for convenience.
        fl = file_layout
        files_to_copy = [
            (fl.PREDICTIONS_PCL, fl.GLOBAL_PREDICTIONS_PCL),
            (fl.PREDICTIONS_PNG, fl.GLOBAL_PREDICTIONS_PNG),
            ]
        for src, dst in files_to_copy:
            assert os.path.exists(src)
            # Copy without going through a shell: a quoted "cp" command
            # line breaks on paths that contain a single quote, and
            # os.system silently ignores a failed copy.  copy2 keeps the
            # permission bits and timestamps and raises on failure.
            shutil.copy2(src, dst)

        if options.archive:
            archive.zip_path(file_layout.ANALYSIS)
        sys.stdout.flush()

    if num_analyses > 1:
        summarize_subgroups(options.outpath, num_analyses, penalties)

    print("Done.")