def get_main_logger(log_outdir, level=logging.WARNING, include_stdout=True):
    """
    Return logger object for main MISO thread.

    Arguments:
    - log_outdir: directory where the main log file is written
      (created if it does not already exist).
    - level: logging level applied to the root logger and to every
      handler attached here.
    - include_stdout: if True, also echo log records to stdout.

    Returns the configured "miso_main" logger.

    NOTE(review): this sets the level on logging.root (which affects all
    loggers in the process), and each call attaches fresh handlers to the
    same named logger — repeated calls would duplicate output. Confirm
    callers invoke this only once per process.
    """
    logger_name = "miso_main"
    misc_utils.make_dir(log_outdir)
    logger = logging.getLogger(logger_name)
    formatter = \
        logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                          datefmt='%m/%d/%Y %I:%M:%S %p')
    logging.root.setLevel(level)
    # Optionally add handler that streams all logs
    # to stdout
    if include_stdout:
        ch = logging.StreamHandler(sys.stdout)
        ch.setLevel(level)
        ch.setFormatter(formatter)
        logger.addHandler(ch)
    # Write to main logger filename along
    # with time stamp
    logger_basename = "main.%s.log" %(misc_utils.get_timestamp())
    logger_fname = os.path.join(log_outdir, logger_basename)
    fh = logging.FileHandler(logger_fname)
    fh.setLevel(level)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    return logger
def main(): from optparse import OptionParser parser = OptionParser() ## ## Psi utilities ## parser.add_option("--compare-samples", dest="samples_to_compare", nargs=3, default=None, help="Compute comparison statistics between the two " \ "given samples. Expects three directories: the first is " \ "sample1's MISO output, the second is sample2's MISO " \ "output, and the third is the directory where " \ "results of the sample comparison will be outputted.") parser.add_option("--comparison-labels", dest="comparison_labels", nargs=2, default=None, help="Use these labels for the sample comparison " "made by --compare-samples. " "Takes two arguments: the label for sample 1 " "and the label for sample 2, where sample 1 and " "sample 2 correspond to the order of samples given " "to --compare-samples.") parser.add_option("--use-compressed", dest="use_compressed", nargs=1, default=None, help="Use compressed event IDs. Takes as input a " "genes_to_filenames.shelve file produced by the " "index_gff script.") (options, args) = parser.parse_args() if options.samples_to_compare is None: greeting() use_compressed = None if options.use_compressed is not None: use_compressed = \ os.path.abspath(os.path.expanduser(options.use_compressed)) if not os.path.exists(use_compressed): print "Error: mapping filename from event IDs to compressed IDs %s " \ "is not found." %(use_compressed) sys.exit(1) else: print "Compression being used." if options.samples_to_compare is not None: sample1_dirname = os.path.abspath(options.samples_to_compare[0]) sample2_dirname = os.path.abspath(options.samples_to_compare[1]) output_dirname = os.path.abspath(options.samples_to_compare[2]) if not os.path.isdir(output_dirname): print "Making comparisons directory: %s" % (output_dirname) misc_utils.make_dir(output_dirname) ht.output_samples_comparison(sample1_dirname, sample2_dirname, output_dirname, sample_labels=options.comparison_labels, use_compressed=use_compressed)
def clear_output_dir(): output_dir = OUTPUT_DIR # Clear out the previous test output directory print "Clearing previous output directory..." if os.path.isdir(output_dir): shutil.rmtree(output_dir, ignore_errors=True) # Make new output directory misc_utils.make_dir(output_dir)
def main(): from optparse import OptionParser parser = OptionParser() parser.add_option("--summarize-samples", dest="summarize_samples", nargs=2, default=None, help="Compute summary statistics of the given set " "of samples. Expects a directory with MISO output " "and a directory to output summary file to.") parser.add_option("--summary-label", dest="summary_label", nargs=1, default=None, help="Label for MISO summary file. If not given, " "uses basename of MISO output directory.") parser.add_option("--use-compressed", dest="use_compressed", nargs=1, default=None, help="Use compressed event IDs. Takes as input a " "genes_to_filenames.shelve file produced by the " "index_gff script.") (options, args) = parser.parse_args() greeting() use_compressed = None if options.use_compressed is not None: use_compressed = \ os.path.abspath(os.path.expanduser(options.use_compressed)) if not os.path.exists(use_compressed): print "Error: mapping filename from event IDs to compressed IDs %s " \ "is not found." %(use_compressed) sys.exit(1) else: print "Compression being used." ## ## Summarizing samples ## if options.summarize_samples: samples_dir = \ os.path.abspath(os.path.expanduser(options.summarize_samples[0])) if options.summary_label != None: samples_label = options.summary_label print "Using summary label: %s" %(samples_label) else: samples_label = \ os.path.basename(os.path.expanduser(samples_dir)) assert(len(samples_label) >= 1) summary_output_dir = \ os.path.abspath(os.path.join(os.path.expanduser(options.summarize_samples[1]), 'summary')) if not os.path.isdir(summary_output_dir): misc_utils.make_dir(summary_output_dir) summary_filename = os.path.join(summary_output_dir, '%s.miso_summary' %(samples_label)) samples_utils.summarize_sampler_results(samples_dir, summary_filename, use_compressed=use_compressed)
def main(): from optparse import OptionParser parser = OptionParser() ## ## Psi utilities ## parser.add_option("--compare-samples", dest="samples_to_compare", nargs=3, default=None, help="Compute comparison statistics between the two " \ "given samples. Expects three directories: the first is " \ "sample1's MISO output, the second is sample2's MISO " \ "output, and the third is the directory where " \ "results of the sample comparison will be outputted.") parser.add_option("--comparison-labels", dest="comparison_labels", nargs=2, default=None, help="Use these labels for the sample comparison " "made by --compare-samples. " "Takes two arguments: the label for sample 1 " "and the label for sample 2, where sample 1 and " "sample 2 correspond to the order of samples given " "to --compare-samples.") parser.add_option("--use-compressed", dest="use_compressed", nargs=1, default=None, help="Use compressed event IDs. Takes as input a " "genes_to_filenames.shelve file produced by the " "index_gff script.") (options, args) = parser.parse_args() if options.samples_to_compare is None: greeting() use_compressed = None if options.use_compressed is not None: use_compressed = \ os.path.abspath(os.path.expanduser(options.use_compressed)) if not os.path.exists(use_compressed): print "Error: mapping filename from event IDs to compressed IDs %s " \ "is not found." %(use_compressed) sys.exit(1) else: print "Compression being used." if options.samples_to_compare is not None: sample1_dirname = os.path.abspath(options.samples_to_compare[0]) sample2_dirname = os.path.abspath(options.samples_to_compare[1]) output_dirname = os.path.abspath(options.samples_to_compare[2]) if not os.path.isdir(output_dirname): print "Making comparisons directory: %s" %(output_dirname) misc_utils.make_dir(output_dirname) ht.output_samples_comparison(sample1_dirname, sample2_dirname, output_dirname, sample_labels=options.comparison_labels, use_compressed=use_compressed)
def run_on_cluster(self, cmd, job_name, cluster_output_dir,
                   cluster_scripts_dir=None, queue_type=None):
    '''
    Composes job script and launches job
    '''
    # Make sure the output and script directories exist first
    misc_utils.make_dir(cluster_output_dir)
    if cluster_scripts_dir == None:
        cluster_scripts_dir = os.path.join(cluster_output_dir,
                                           'cluster_scripts')
        misc_utils.make_dir(cluster_scripts_dir)
    scripts_output_dir = \
        os.path.abspath(os.path.join(cluster_output_dir, 'scripts_output'))
    misc_utils.make_dir(scripts_output_dir)
    # Write the command into a time-stamped bash script, then submit it
    # with sbatch, directing job output under scripts_output_dir.
    timestamp = time.strftime("%m-%d-%y_%H_%M_%S")
    script_name = os.path.join(cluster_scripts_dir,
                               '%s_time_%s.sh' %(job_name, timestamp))
    self.make_bash_script(script_name, cmd)
    submit_cmd = 'sbatch -D \"%s\"' % (scripts_output_dir) + \
                 ' \"%s\"' % (script_name)
    return self.launch_job(submit_cmd)
def run_on_cluster(cmd, job_name, cluster_output_dir,
                   cluster_scripts_dir=None,
                   queue_type=None,
                   cmd_name="qsub",
                   settings_fname=None):
    """
    Submit `cmd` to the cluster (qsub-style submission by default).

    Writes the command into a time-stamped bash script under
    cluster_scripts_dir and submits it with the cluster submission
    command, directing the job's stdout/stderr into a scripts_output
    subdirectory of cluster_output_dir.

    Arguments:
    - cmd: shell command the job should run.
    - job_name: name used for the generated script filename.
    - cluster_output_dir: parent directory for scripts and job output.
    - cluster_scripts_dir: where job scripts go (defaults to
      cluster_output_dir/cluster_scripts).
    - queue_type: "long"/"short" queue selector; only resolved to an
      actual queue name when a settings file is given.
    - cmd_name: submission command; overridden by the settings file.
    - settings_fname: optional settings filename.

    Returns the job ID reported by launch_job.
    """
    print "Submitting job: %s" % (job_name)
    queue_name = None
    # Load command name from settings file.
    # NOTE(review): the queue name is only resolved when a settings file
    # is given; without one, queue_type is reported but no -q flag is
    # ever added below.
    if settings_fname != None:
        load_settings(settings_fname)
        cmd_name = Settings.get_cluster_command()
        if queue_type == "long":
            queue_name = Settings.get_long_queue_name()
        elif queue_type == "short":
            queue_name = Settings.get_short_queue_name()
        else:
            # Unknown queue types are passed through verbatim
            print "Warning: Unknown queue type: %s" % (queue_type)
            queue_name = queue_type
    if queue_type is None:
        print " - queue type: unspecified"
    else:
        print " - queue type: %s" % (queue_type)
    if queue_name is None:
        print " - queue name unspecified"
    else:
        print " - queue name: %s" % (queue_name)
    misc_utils.make_dir(cluster_output_dir)
    if cluster_scripts_dir == None:
        cluster_scripts_dir = os.path.join(cluster_output_dir,
                                           'cluster_scripts')
        misc_utils.make_dir(cluster_scripts_dir)
    scripts_output_dir = os.path.join(cluster_output_dir,
                                      'scripts_output')
    misc_utils.make_dir(scripts_output_dir)
    scripts_output_dir = os.path.abspath(scripts_output_dir)
    # Job stdout (-o) and stderr (-e) both go under scripts_output_dir
    cluster_call = '%s -o \"%s\" -e \"%s\"' % (cmd_name,
                                               scripts_output_dir,
                                               scripts_output_dir)
    # Add queue type if given one
    if queue_name != None:
        cluster_call += ' -q \"%s\"' % (queue_name)
    # valid_cluster_name presumably sanitizes the filename (e.g. the
    # colons from %H:%M:%S) -- confirm against its definition
    script_name = \
        valid_cluster_name(os.path.join(cluster_scripts_dir,
                                        '%s_time_%s.sh' \
                                        %(job_name,
                                          time.strftime("%m-%d-%y_%H:%M:%S"))))
    make_bash_script(script_name, cmd)
    cluster_cmd = cluster_call + ' \"%s\"' % (script_name)
    job_id = launch_job(cluster_cmd, cmd_name)
    return job_id
def run_on_cluster(self, cmd, job_name, cluster_output_dir, cluster_scripts_dir=None, queue_type=None): ''' Composes job script and launches job ''' print "Submitting job: %s" % (job_name) queue_name = None # Load command name from settings file cmd_name = self.settings.get_cluster_command() if queue_type == "long": queue_name = self.settings.get_long_queue_name() elif queue_type == "short": queue_name = self.settings.get_short_queue_name() else: print "Warning: Unknown queue type: %s" % (queue_type) queue_name = queue_type if queue_type is None: print " - queue type: unspecified" else: print " - queue type: %s" % (queue_type) if queue_name is None: print " - queue name unspecified" else: print " - queue name: %s" % (queue_name) misc_utils.make_dir(cluster_output_dir) if cluster_scripts_dir == None: cluster_scripts_dir = os.path.join(cluster_output_dir, 'cluster_scripts') misc_utils.make_dir(cluster_scripts_dir) scripts_output_dir = os.path.join(cluster_output_dir, 'scripts_output') misc_utils.make_dir(scripts_output_dir) scripts_output_dir = os.path.abspath(scripts_output_dir) cluster_call = 'bsub -o \"%s\" -e \"%s\"' % (scripts_output_dir, scripts_output_dir) # Add queue type if given one if queue_name != None: cluster_call += ' -q \"%s\"' % (queue_name) script_name = os.path.join(cluster_scripts_dir, '%s_time_%s.sh' \ %(job_name, time.strftime("%m-%d-%y_%H_%M_%S"))) self.make_bash_script(script_name, cmd) cluster_cmd = cluster_call + ' \"%s\"' % (script_name) job_id = self.launch_job(cluster_cmd) return job_id
def run_on_cluster( cmd, job_name, cluster_output_dir, cluster_scripts_dir=None, queue_type=None, cmd_name="qsub", settings_fname=None ): print "Submitting job: %s" % (job_name) queue_name = None # Load command name from settings file if settings_fname != None: load_settings(settings_fname) cmd_name = Settings.get_cluster_command() if queue_type == "long": queue_name = Settings.get_long_queue_name() elif queue_type == "short": queue_name = Settings.get_short_queue_name() else: print "Warning: Unknown queue type: %s" % (queue_type) queue_name = queue_type if queue_type is None: print " - queue type: unspecified" else: print " - queue type: %s" % (queue_type) if queue_name is None: print " - queue name unspecified" else: print " - queue name: %s" % (queue_name) misc_utils.make_dir(cluster_output_dir) if cluster_scripts_dir == None: cluster_scripts_dir = os.path.join(cluster_output_dir, "cluster_scripts") misc_utils.make_dir(cluster_scripts_dir) scripts_output_dir = os.path.join(cluster_output_dir, "scripts_output") misc_utils.make_dir(scripts_output_dir) scripts_output_dir = os.path.abspath(scripts_output_dir) cluster_call = '%s -o "%s" -e "%s"' % (cmd_name, scripts_output_dir, scripts_output_dir) # Add queue type if given one if queue_name != None: cluster_call += ' -q "%s"' % (queue_name) script_name = valid_cluster_name( os.path.join(cluster_scripts_dir, "%s_time_%s.sh" % (job_name, time.strftime("%m-%d-%y_%H:%M:%S"))) ) make_bash_script(script_name, cmd) cluster_cmd = cluster_call + ' "%s"' % (script_name) job_id = launch_job(cluster_cmd, cmd_name) return job_id
def compute_gene_psi(gene_ids, gff_index_filename, bam_filename,
                     output_dir, read_len, overhang_len,
                     paired_end=None,
                     event_type=None,
                     verbose=True):
    """
    Run Psi at the Gene-level (for multi-isoform inference.)

    Arguments:
    - gene_ids: set of gene IDs corresponding to gene IDs from the GFF
    - gff_index_filename: indexed GFF filename describing the genes
    - bam_filename: BAM filename with the reads (must be sorted and indexed)
    - output_dir: output directory
    - read_len: read length used to filter genes/parse reads
    - overhang_len: minimum junction overhang passed to the sampler
    - paired_end: optional; run in paired-end mode. Gives mean and
      standard deviation of fragment length distribution.
    - event_type: optional; when given, results for each gene go under
      an extra event-type subdirectory.
    """
    misc_utils.make_dir(output_dir)
    if not os.path.exists(gff_index_filename):
        print "Error: No GFF %s" %(gff_index_filename)
        return
    num_genes = len(gene_ids)
    print "Computing Psi for %d genes..." %(num_genes)
    print " - " + ", ".join(gene_ids)
    print " - GFF filename: %s" %(gff_index_filename)
    print " - BAM: %s" %(bam_filename)
    print " - Outputting to: %s" %(output_dir)
    if paired_end:
        print " - Paired-end mode: ", paired_end
    # Pull sampler configuration from the (already loaded) settings
    settings = Settings.get()
    settings_params = Settings.get_sampler_params()
    burn_in = settings_params["burn_in"]
    lag = settings_params["lag"]
    num_iters = settings_params["num_iters"]
    num_chains = settings_params["num_chains"]
    min_event_reads = Settings.get_min_event_reads()
    strand_rule = Settings.get_strand_param()
    mean_frag_len = None
    frag_variance = None
    if paired_end:
        # paired_end is (mean, stddev) of the fragment length
        # distribution; the sampler takes the variance
        mean_frag_len = int(paired_end[0])
        frag_variance = power(int(paired_end[1]), 2)
    # Load the genes from the GFF
    gff_genes = gff_utils.load_indexed_gff_file(gff_index_filename)
    # If given a template for the SAM file, use it
    template = None
    if settings and "sam_template" in settings:
        template = settings["sam_template"]
    # Read filtering defaults to on unless the settings disable it
    if "filter_reads" not in settings:
        filter_reads = True
    else:
        filter_reads = settings["filter_reads"]
    # Load the BAM file upfront
    bamfile = sam_utils.load_bam_reads(bam_filename,
                                       template=template)
    # Check if we're in compressed mode
    compressed_mode = misc_utils.is_compressed_index(gff_index_filename)
    for gene_id, gene_info in gff_genes.iteritems():
        lookup_id = gene_id
        # Skip genes that we were not asked to run on
        if lookup_id not in gene_ids:
            continue
        gene_obj = gene_info['gene_object']
        gene_hierarchy = gene_info['hierarchy']
        # Sanity check: if the isoforms are all shorter than the read,
        # skip the event
        if all(map(lambda l: l < read_len, gene_obj.iso_lens)):
            print "All isoforms of %s shorter than %d, so skipping" \
                %(gene_id, read_len)
            continue
        # Find the most inclusive transcription start and end sites
        # for each gene
        tx_start, tx_end = \
            gff_utils.get_inclusive_txn_bounds(gene_info['hierarchy'][gene_id])
        # Fetch reads aligning to the gene boundaries
        gene_reads = \
            sam_utils.fetch_bam_reads_in_gene(bamfile,
                                              gene_obj.chrom,
                                              tx_start,
                                              tx_end,
                                              gene_obj)
        # Parse reads: checking strandedness and pairing
        # reads in case of paired-end data
        reads, num_raw_reads = \
            sam_utils.sam_parse_reads(gene_reads,
                                      paired_end=paired_end,
                                      strand_rule=strand_rule,
                                      target_strand=gene_obj.strand,
                                      given_read_len=read_len)
        # Skip gene if none of the reads align to gene boundaries
        if filter_reads:
            if num_raw_reads < min_event_reads:
                print "Only %d reads in gene, skipping (needed >= %d reads)" \
                    %(num_raw_reads, min_event_reads)
                continue
            else:
                print "%d raw reads in event" %(num_raw_reads)
        num_isoforms = len(gene_obj.isoforms)
        # Symmetric Dirichlet hyperparameters (one per isoform)
        hyperparameters = ones(num_isoforms)
        ##
        ## Run the sampler
        ##
        # Create the sampler with the right parameters depending on whether
        # this is a paired-end or single-end data set.
        if paired_end:
            # Sampler parameters for paired-end mode
            sampler_params = \
                miso.get_paired_end_sampler_params(num_isoforms,
                                                   mean_frag_len,
                                                   frag_variance,
                                                   read_len,
                                                   overhang_len=overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=True,
                                       log_dir=output_dir)
        else:
            # Sampler parameters for single-end mode
            sampler_params = miso.get_single_end_sampler_params(num_isoforms,
                                                                read_len,
                                                                overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=False,
                                       log_dir=output_dir)
        # Make directory for chromosome -- if given an event type, put
        # the gene in the event type directory
        if event_type != None:
            chrom_dir = os.path.join(output_dir, event_type, gene_obj.chrom)
        else:
            chrom_dir = os.path.join(output_dir, gene_obj.chrom)
        # Tolerate the directory already existing (e.g. created by a
        # concurrent worker)
        try:
            os.makedirs(chrom_dir)
        except OSError:
            pass
        # Pick .miso output filename based on the pickle filename
        miso_basename = os.path.basename(gff_index_filename)
        if not miso_basename.endswith(".pickle"):
            print "Error: Invalid index file %s" %(gff_index_filename)
            sys.exit(1)
        miso_basename = miso_basename.replace(".pickle", "")
        output_filename = os.path.join(chrom_dir, "%s" %(miso_basename))
        sampler.run_sampler(num_iters, reads, gene_obj,
                            hyperparameters, sampler_params,
                            output_filename,
                            num_chains=num_chains,
                            burn_in=burn_in,
                            lag=lag)
def __init__(self, gff_dir, bam_filename, output_dir, read_len,
             overhang_len, main_logger,
             settings_fname=None,
             paired_end=None,
             use_cluster=False,
             chunk_jobs=200,
             SGEarray=False,
             sge_job_name="misojob",
             gene_ids=None,
             num_proc=None,
             wait_on_jobs=True):
    """
    Set up a dispatcher that batches genes from an indexed GFF
    directory and runs MISO on them (locally, on a cluster, or as an
    SGE array), writing batch bookkeeping directories under output_dir.

    Exits the process (sys.exit) if the BAM file is missing or if no
    genes are found to run on.
    """
    self.main_logger = main_logger
    # Worker threads/processes keyed by batch, populated later by run()
    self.threads = {}
    self.gff_dir = gff_dir
    self.bam_filename = bam_filename
    # Check that the BAM filename exists and that it has an index
    if not os.path.isfile(self.bam_filename):
        self.main_logger.error("BAM file %s not found." %(self.bam_filename))
        sys.exit(1)
    self.bam_index_fname = "%s.bai" %(self.bam_filename)
    if not os.path.isfile(self.bam_index_fname):
        # Missing index is only a warning -- downstream fetches may
        # still fail if the BAM is truly unindexed
        self.main_logger.warning("Expected BAM index file %s not found." \
                                 %(self.bam_index_fname))
        self.main_logger.warning("Are you sure your BAM file is indexed?")
    self.output_dir = output_dir
    self.read_len = read_len
    # For now setting overhang to 1 always
    #self.overhang_len = overhang_len
    self.overhang_len = 1
    self.settings_fname = settings_fname
    self.paired_end = paired_end
    self.use_cluster = use_cluster
    self.chunk_jobs = chunk_jobs
    self.settings = Settings.get()
    self.cluster_cmd = Settings.get_cluster_command()
    self.sge_job_name = sge_job_name
    self.wait_on_jobs = wait_on_jobs
    # if chunk_jobs not given (i.e. set to False),
    # then set it to arbitrary value
    if not self.chunk_jobs:
        self.chunk_jobs = 200
    self.SGEarray = SGEarray
    # Processor count comes from settings unless overridden by num_proc
    self.num_processors = Settings.get_num_processors()
    if num_proc is not None:
        num_proc = int(num_proc)
        self.num_processors = num_proc
        self.main_logger.info("Using %d processors" %(num_proc))
    # Batches longer than this many genes are considered "long" jobs
    self.long_thresh = 50
    self.batch_logs_dir = \
        os.path.join(output_dir, "batch-logs")
    self.batch_genes_dir = \
        os.path.join(output_dir, "batch-genes")
    self.cluster_scripts_dir = \
        os.path.join(output_dir, "cluster_scripts")
    self.scripts_output_dir = \
        os.path.join(output_dir, "scripts_output")
    misc_utils.make_dir(self.batch_logs_dir)
    misc_utils.make_dir(self.batch_genes_dir)
    misc_utils.make_dir(self.cluster_scripts_dir)
    misc_utils.make_dir(self.scripts_output_dir)
    # First compile a set of genes that should be run on
    # and output them to file along with their indexed
    # filenames
    self.gene_ids_to_gff_index = \
        gff_utils.get_gene_ids_to_gff_index(gff_dir)
    # If we're given filtered gene IDs, use them
    if gene_ids is not None:
        self.gene_ids = gene_ids
    else:
        self.gene_ids = self.gene_ids_to_gff_index.keys()
    if len(self.gene_ids) == 0:
        self.main_logger.error("No genes to run on. Did you pass me the wrong path " \
                               "to your index GFF directory? " \
                               "Or perhaps your indexed GFF directory " \
                               "is empty?")
        sys.exit(1)
    self.batch_filenames = self.output_batch_files()
def main():
    """
    Command-line entry point for the MISO run interface: computes Psi
    for genes, compares/summarizes samples, and views indexed genes,
    dispatching on which optparse options were supplied.
    """
    from optparse import OptionParser
    parser = OptionParser()
    ##
    ## Main options
    ##
    parser.add_option("--compute-gene-psi", dest="compute_gene_psi",
                      nargs=4, default=None,
                      help="Compute Psi using for a given multi-isoform gene. "
                      "Expects four arguments: the first is a gene ID or set "
                      "of comma-separated (no spaces) gene IDs, "
                      "the second is a GFF indexed file with the gene "
                      "information, the third is a sorted and "
                      "indexed BAM file with reads aligned to the gene, "
                      "and the fourth is an output directory.")
    parser.add_option("--paired-end", dest="paired_end",
                      nargs=2, default=None,
                      help="Run in paired-end mode. Takes a mean and standard "
                      "deviation for the fragment length distribution (assumed "
                      "to have discretized normal form.)")
    parser.add_option("--compute-genes-from-file", dest="compute_genes_from_file",
                      nargs=3, default=None,
                      help="Runs on a set of genes from a file. Takes as input: "
                      "(1) a two-column tab-delimited file, where column 1 is the "
                      "event ID (ID field from GFF) and the second column is "
                      "the path to the indexed GFF file for that event. "
                      "MISO will run on all the events described in the file, "
                      "(2) a sorted, indexed BAM file to run on, and (3) a "
                      "directory to output results to.")
    ##
    ## Psi utilities
    ##
    parser.add_option("--compare-samples", dest="samples_to_compare",
                      nargs=3, default=None,
                      help="Compute comparison statistics between the two "
                      "given samples. Expects three directories: the first is "
                      "sample1's MISO output, the second is sample2's MISO "
                      "output, and the third is the directory where "
                      "results of the sample comparison will be outputted.")
    parser.add_option("--comparison-labels", dest="comparison_labels",
                      nargs=2, default=None,
                      help="Use these labels for the sample comparison "
                      "made by --compare-samples. "
                      "Takes two arguments: the label for sample 1 "
                      "and the label for sample 2, where sample 1 and "
                      "sample 2 correspond to the order of samples given "
                      "to --compare-samples.")
    parser.add_option("--summarize-samples", dest="summarize_samples",
                      nargs=2, default=None,
                      help="Compute summary statistics of the given set "
                      "of samples. Expects a directory with MISO output "
                      "and a directory to output summary file to.")
    parser.add_option("--summary-label", dest="summary_label",
                      nargs=1, default=None,
                      help="Label for MISO summary file. If not given, "
                      "uses basename of MISO output directory.")
    parser.add_option("--use-cluster", action="store_true",
                      dest="use_cluster", default=False)
    parser.add_option("--chunk-jobs", dest="chunk_jobs",
                      default=False, type="int",
                      help="Size (in number of events) of each job to "
                      "chunk events file into. Only applies when "
                      "running on cluster.")
    parser.add_option("--settings-filename", dest="settings_filename",
                      default=os.path.join(miso_settings_path,
                                           "settings",
                                           "miso_settings.txt"),
                      help="Filename specifying MISO settings.")
    parser.add_option("--read-len", dest="read_len",
                      type="int", default=None)
    parser.add_option("--overhang-len", dest="overhang_len",
                      type="int", default=None)
    parser.add_option("--event-type", dest="event_type",
                      default=None,
                      help="Event type of two-isoform "
                      "events (e.g. 'SE', 'RI', 'A3SS', ...)")
    parser.add_option("--use-compressed", dest="use_compressed",
                      nargs=1, default=None,
                      help="Use compressed event IDs. Takes as input a "
                      "genes_to_filenames.shelve file produced by the "
                      "index_gff script.")
    ##
    ## Gene utilities
    ##
    parser.add_option("--view-gene", dest="view_gene",
                      nargs=1, default=None,
                      help="View the contents of a gene/event that has "
                      "been indexed. Takes as input an "
                      "indexed (.pickle) filename.")
    (options, args) = parser.parse_args()
    # Greeting is suppressed for the worker-style --compute-gene-psi calls
    if options.compute_gene_psi is None:
        greeting()
    ##
    ## Load the settings file
    ##
    Settings.load(os.path.expanduser(options.settings_filename))
    # Optional mapping from event IDs to compressed IDs
    use_compressed = None
    if options.use_compressed is not None:
        use_compressed = \
            os.path.abspath(os.path.expanduser(options.use_compressed))
        if not os.path.exists(use_compressed):
            print "Error: mapping filename from event IDs to compressed IDs %s " \
                  "is not found." %(use_compressed)
            sys.exit(1)
        else:
            print "Compression being used."
    # Sample-comparison mode
    if options.samples_to_compare is not None:
        sample1_dirname = os.path.abspath(options.samples_to_compare[0])
        sample2_dirname = os.path.abspath(options.samples_to_compare[1])
        output_dirname = os.path.abspath(options.samples_to_compare[2])
        if not os.path.isdir(output_dirname):
            print "Making comparisons directory: %s" %(output_dirname)
            misc_utils.make_dir(output_dirname)
        ht.output_samples_comparison(sample1_dirname,
                                     sample2_dirname,
                                     output_dirname,
                                     sample_labels=options.comparison_labels,
                                     use_compressed=use_compressed)
    ##
    ## Main interface based on SAM files
    ##
    if options.compute_genes_from_file != None:
        # Run on events given by file
        run_compute_genes_from_file(options)
    if options.compute_gene_psi != None:
        run_compute_gene_psi(options)
    ##
    ## Summarizing samples
    ##
    if options.summarize_samples:
        samples_dir = \
            os.path.abspath(os.path.expanduser(options.summarize_samples[0]))
        # Summary label defaults to the basename of the MISO output dir
        if options.summary_label != None:
            samples_label = options.summary_label
            print "Using summary label: %s" %(samples_label)
        else:
            samples_label = \
                os.path.basename(os.path.expanduser(samples_dir))
        assert(len(samples_label) >= 1)
        summary_output_dir = \
            os.path.abspath(os.path.join(os.path.expanduser(options.summarize_samples[1]),
                                         'summary'))
        if not os.path.isdir(summary_output_dir):
            os.makedirs(summary_output_dir)
        summary_filename = os.path.join(summary_output_dir,
                                        '%s.miso_summary' %(samples_label))
        summarize_sampler_results(samples_dir,
                                  summary_filename,
                                  use_compressed=use_compressed)
    # Pretty-print the contents of an indexed gene pickle
    if options.view_gene != None:
        indexed_gene_filename = \
            os.path.abspath(os.path.expanduser(options.view_gene))
        print "Viewing genes in %s" %(indexed_gene_filename)
        gff_genes = gff_utils.load_indexed_gff_file(indexed_gene_filename)
        if gff_genes == None:
            print "No genes."
            sys.exit(1)
        for gene_id, gene_info in gff_genes.iteritems():
            print "Gene %s" %(gene_id)
            gene_obj = gene_info['gene_object']
            print " - Gene object: ", gene_obj
            print "=="
            print "Isoforms: "
            for isoform in gene_obj.isoforms:
                print " - ", isoform
            print "=="
            print "mRNA IDs: "
            for mRNA_id in gene_info['hierarchy'][gene_id]['mRNAs']:
                print "%s" %(mRNA_id)
            print "=="
            print "Exons: "
            for exon in gene_obj.parts:
                print " - ", exon
def compute_all_genes_psi(gff_dir, bam_filename, read_len, output_dir,
                          main_logger,
                          use_cluster=False,
                          SGEarray=False,
                          chunk_jobs=800,
                          overhang_len=1,
                          paired_end=None,
                          settings_fname=None,
                          job_name="misojob",
                          num_proc=None,
                          prefilter=False,
                          wait_on_jobs=True):
    """
    Compute Psi values for genes using a GFF and a BAM filename.

    SGE functionality contributed by Michael Lovci.

    Options:
    - prefilter: if set to True, prefilter events by coverage.
      Uses bedtools to determine coverage of each event and remove
      events that do not meet the coverage criteria from the run.

    Exits the process if bedtools is unavailable (when prefiltering)
    or if no event passes the coverage filter.
    """
    print "Computing Psi values..."
    print " - GFF index: %s" %(gff_dir)
    print " - BAM: %s" %(bam_filename)
    print " - Read length: %d" %(read_len)
    print " - Output directory: %s" %(output_dir)
    misc_utils.make_dir(output_dir)
    # Check GFF and BAM for various errors like headers mismatch
    run_events.check_gff_and_bam(gff_dir, bam_filename, main_logger,
                                 given_read_len=read_len)
    # Prefilter events that do not meet the coverage criteria
    # If filtering is on, only run on events that meet
    # the filter.
    all_gene_ids = None
    if prefilter:
        main_logger.info("Prefiltering on")
        if misc_utils.which("bedtools") is None:
            main_logger.error("Error: Cannot use bedtools. Bedtools is " \
                              "required for --prefilter option")
            sys.exit(1)
        filtered_gene_ids = run_events.get_ids_passing_filter(gff_dir,
                                                              bam_filename,
                                                              output_dir)
        # Prefiltering succeeded, so process only gene ids that
        # pass the filter
        if filtered_gene_ids != None:
            num_pass = len(filtered_gene_ids)
            all_gene_ids = filtered_gene_ids
            # If none of the events meet the read coverage filter
            # something must have gone wrong, e.g. mismatch
            # in chromosome headers between BAM and GFF
            if num_pass == 0:
                main_logger.error("None of the events in %s appear to meet the " \
                                  "read coverage filter. Check that your BAM headers " \
                                  "in %s match the GFF headers of indexed events." \
                                  %(gff_dir, bam_filename))
                sys.exit(1)
            main_logger.info("Total of %d events pass coverage filter." \
                             %(num_pass))
    ##
    ## Submit jobs either using cluster or locally
    ## using multi-cores.
    ##
    dispatcher = GenesDispatcher(gff_dir,
                                 bam_filename,
                                 output_dir,
                                 read_len,
                                 overhang_len,
                                 main_logger,
                                 settings_fname=settings_fname,
                                 paired_end=paired_end,
                                 use_cluster=use_cluster,
                                 chunk_jobs=chunk_jobs,
                                 sge_job_name=job_name,
                                 SGEarray=SGEarray,
                                 gene_ids=all_gene_ids,
                                 num_proc=num_proc,
                                 wait_on_jobs=wait_on_jobs)
    dispatcher.run()
def run_SGEarray_cluster(arg_list, argfile, cluster_output_dir, queue_type="long", cluster_scripts_dir=None, chunk=2500, settings=None, cmd_name="qsub", job_name="miso_job"): """ Run MISO jobs on cluster using SGE. Function contributed by Michael Lovci, UCSD. """ misc_utils.make_dir(cluster_output_dir) # Create arguments file to pass on to job f = open(argfile, 'w') nargs = len(arg_list) if nargs % chunk == 0: njobs = nargs / chunk else: njobs = 1 + (nargs / chunk) for args in arg_list: f.write(args[0] + "\n") f.close() if cluster_scripts_dir == None: cluster_scripts_dir = os.path.join(cluster_output_dir, 'cluster_scripts') misc_utils.make_dir(cluster_scripts_dir) scripts_output_dir = os.path.join(cluster_output_dir, 'scripts_output') misc_utils.make_dir(scripts_output_dir) scripts_output_dir = os.path.abspath(scripts_output_dir) script_error = os.path.join(scripts_output_dir, string.join([job_name, "err"], ".")) script_out = os.path.join(scripts_output_dir, string.join([job_name, "out"], ".")) cluster_script = os.path.join(cluster_scripts_dir, "run_miso.sh") if settings != None: load_settings(settings) cmd_name = Settings.get_cluster_command() if queue_type == "long": queue_name = Settings.get_long_queue_name() elif queue_type == "short": queue_name = Settings.get_short_queue_name() else: raise Exception, "Unknown queue type: %s" % (queue_type) if queue_type == None: print " - queue: unspecified" else: print " - queue: %s, using queue name %s" % (queue_type, queue_name) cs = open(cluster_script, 'w') cs.write("#!/bin/sh" + "\n") cs.write("#$ -N %s\n" % (job_name)) cs.write("#$ -S /bin/sh\n") cs.write("#$ -p -1023\n") cs.write("#$ -o %s\n" % (script_out)) cs.write("#$ -e %s\n" % (script_error)) cs.write("#$ -t 1-%s\n" % (njobs)) ##execute from current working directory cs.write("#$ -cwd\n") ## import environment variables cs.write("#$ -V\n") if queue_name: cs.write("#$ -l %s\n" % (queue_name)) cs.write("echo \"hostname is:\"\n") cs.write("hostname\n") 
cs.write("ARGFILE=%s\n" % argfile) cs.write("SEQ=/usr/bin/seq\n") cs.write("index=0\n") cs.write("lastindex=0\n") cs.write("let \"index = $SGE_TASK_ID * %s\"\n" % (chunk)) chunk2 = chunk - 1 cs.write("let \"lastindex = $index - %s\"\n" % (chunk2)) if chunk2 > 0: cs.write("for i in `$SEQ $lastindex $index`\n") else: cs.write("for i in $index\n") # if user chooses 1 for chunk size cs.write("do\n") cs.write(" line=$(cat $ARGFILE | head -n $i | tail -n 1)\n") cs.write(" eval $line\n") cs.write("done\n") cs.close() # Make script executable os.system('chmod +x \"%s\"' % (cluster_script)) qsub_cmd = cmd_name + ' \"%s\"' % (cluster_script) os.system(qsub_cmd)
def compute_psi(sample_filenames, output_dir, event_type,
                read_len, overhang_len,
                use_cluster=False,
                chunk_jobs=False,
                filter_events=True,
                events_info_filename=None,
                settings_filename=None):
    """
    Compute Psi values for skipped exons.

    Sample filenames is a mapping from sample label to sample.

      - sample_filenames = [[sample_label1, sample_filename1],
                            [sample_label2, sample_filename2]]
      - output_dir: output directory
      - event_type: 'SE', 'RI', etc.

    For each sample, loads the event counts, optionally filters them,
    serializes them, and then shells out to run_miso.py (locally or on
    the cluster) to do the actual sampling.
    """
    misc_utils.make_dir(output_dir)
    # Results are nested under an event-type subdirectory
    output_dir = os.path.join(output_dir, event_type)
    output_dir = os.path.abspath(output_dir)
    misc_utils.make_dir(output_dir)
    print "Computing Psi for events of type %s" %(event_type)
    print " - samples used: ", sample_filenames.keys()
    for sample_label, sample_filename in sample_filenames.iteritems():
        print "Processing sample: label=%s, filename=%s" \
            %(sample_label, sample_filename)
        results_output_dir = os.path.join(output_dir, sample_label)
        misc_utils.make_dir(results_output_dir)
        # Load the set of counts and serialize them into JSON
        events = \
            as_events.load_event_counts(sample_filename,
                                        event_type,
                                        events_info_filename=events_info_filename)
        # Filter events
        if filter_events:
            print "Filtering events..."
            events.filter_events(settings=Settings.get())
        print "Running on a total of %d events." %(len(events.events))
        events_filename = events.output_file(results_output_dir,
                                             sample_label)
        # Run MISO on them
        miso_cmd = "python %s --compute-two-iso-psi %s %s --event-type %s " \
                   "--read-len %d --overhang-len %d " \
                   %(os.path.join(miso_path, 'run_miso.py'),
                     events_filename,
                     results_output_dir,
                     event_type,
                     read_len,
                     overhang_len)
        if use_cluster:
            if chunk_jobs:
                miso_cmd += ' --use-cluster --chunk-jobs %d' %(chunk_jobs)
            else:
                miso_cmd += ' --use-cluster'
        print "Executing: %s" %(miso_cmd)
        if use_cluster:
            print " - Using cluster"
        os.system(miso_cmd)
def output_samples_comparison(sample1_dir, sample2_dir, output_dir, alpha=.95, sample_labels=None, use_compressed=None): """ Compute the bayes factors, posterior means, and other statistics between the two samples and output them to a directory. Expects two directories with samples from a MISO run, where corresponding events in the two samples' directories begin with the same event name. """ print "Given output dir: %s" %(output_dir) print "Retrieving MISO files in sample directories..." sample1_obj = MISOSamples(sample1_dir, use_compressed=use_compressed) sample2_obj = MISOSamples(sample2_dir, use_compressed=use_compressed) print "Computing sample comparison between %s and %s..." %(sample1_dir, sample2_dir) print " - No. of events in %s: %d" %(sample1_dir, sample1_obj.num_events) print " - No. of events in %s: %d" %(sample2_dir, sample2_obj.num_events) # Output header for Bayes factor file if sample_labels is None: # Use directory names as sample labels sample1_label = os.path.basename(os.path.normpath(sample1_dir)) sample2_label = os.path.basename(os.path.normpath(sample2_dir)) else: # If we're given sample labels, use them sample1_label, sample2_label = sample_labels print "Using user-given sample labels (sample1 = %s, sample2 = %s)" \ %(sample1_label, sample2_label) output_dir = os.path.join(output_dir, "%s_vs_%s" %(sample1_label, sample2_label)) print "Creating comparisons parent directory: %s" %(output_dir) # Create parent directory for comparison misc_utils.make_dir(output_dir) # Create directory for Bayes factors bf_output_dir = os.path.join(output_dir, 'bayes-factors/') misc_utils.make_dir(bf_output_dir) header_fields = ['event_name', 'sample1_posterior_mean', 'sample1_ci_low', 'sample1_ci_high', 'sample2_posterior_mean', 'sample2_ci_low', 'sample2_ci_high', 'diff', 'bayes_factor', 'isoforms', 'sample1_counts', 'sample1_assigned_counts', 'sample2_counts', 'sample2_assigned_counts', 'chrom', 'strand', 'mRNA_starts', 'mRNA_ends'] header_line = 
"\t".join(header_fields) + "\n" output_filename = \ os.path.join(bf_output_dir, "%s_vs_%s.miso_bf" %(sample1_label, sample2_label)) output_file = open(output_filename, 'w') output_file.write(header_line) num_events_compared = 0 file_num = 0 # Compute the Bayes factors for each file for event_name in sample1_obj.all_event_names: sample1_results = sample1_obj.get_event_samples(event_name) # Parameters from raw MISO samples file samples1 = sample1_results[0] header1 = sample1_results[1] header1 = header1[0] params1 = parse_sampler_params_from_header(header1) # Extract gene information if available gene_info = get_gene_info_from_params(params1) # Find corresponding event filename in sample 2 sample2_results = sample2_obj.get_event_samples(event_name) if sample2_results is None: continue num_events_compared += 1 # Compute delta of posterior samples and Bayes factors diff_range = arange(-1, 1, 0.001) delta_densities = \ compute_delta_densities(sample1_results, sample2_results, diff_range, event_name=event_name, sample1_label=sample1_label, sample2_label=sample2_label) bf = delta_densities['bayes_factor'] num_isoforms = shape(delta_densities['samples1'])[1] sample1_posterior_mean = mean(delta_densities['samples1'], 0) sample2_posterior_mean = mean(delta_densities['samples2'], 0) # Get the labels of the isoforms isoforms_field = delta_densities['isoforms'] # Get the counts information about both samples sample1_counts_info = delta_densities['sample1_counts'] sample2_counts_info = delta_densities['sample2_counts'] # Compute posterior mean and credible intervals for sample 1 sample1_cred_intervals = \ format_credible_intervals(event_name, delta_densities['samples1'], confidence_level=alpha) sample1_ci_low = sample1_cred_intervals[2] sample1_ci_high = sample1_cred_intervals[3] # Compute posterior mean and credible intervals for sample 2 sample2_cred_intervals = \ format_credible_intervals(event_name, delta_densities['samples2'], confidence_level=alpha) sample2_ci_low = 
sample2_cred_intervals[2] sample2_ci_high = sample2_cred_intervals[3] posterior_diff = sample1_posterior_mean - sample2_posterior_mean # Use precision of two decimal places if num_isoforms == 2: sample1_posterior_mean = \ Decimal(str(sample1_posterior_mean[0])).quantize(Decimal('0.01')) sample2_posterior_mean = \ Decimal(str(sample2_posterior_mean[0])).quantize(Decimal('0.01')) posterior_diff = "%.2f" %(sample1_posterior_mean - sample2_posterior_mean) bayes_factor = "%.2f" %(bf[0]) else: posterior_diff = \ ",".join(["%.2f" %(v) for v in (sample1_posterior_mean - sample2_posterior_mean)]) sample1_posterior_mean = sample1_cred_intervals[1] sample2_posterior_mean = sample2_cred_intervals[1] bayes_factor = ",".join(["%.2f" %(max(v, 0)) for v in bf]) # Write comparison output line output_fields = [event_name, # Mean and confidence bounds for sample 1 "%s" %(sample1_posterior_mean), "%s" %(sample1_ci_low), "%s" %(sample1_ci_high), # Mean and confidence bounds for sample 2 "%s" %(sample2_posterior_mean), "%s" %(sample2_ci_low), "%s" %(sample2_ci_high), # Delta Psi value "%s" %(posterior_diff), # Bayes factor "%s" %(bayes_factor), # Description of the isoforms "%s" %(isoforms_field), # Counts information for sample 1 "%s" %(sample1_counts_info['counts']), "%s" %(sample1_counts_info['assigned_counts']), # Counts information for sample 2 "%s" %(sample2_counts_info['counts']), "%s" %(sample2_counts_info['assigned_counts']), # Gene information gene_info["chrom"], gene_info["strand"], gene_info["mRNA_starts"], gene_info["mRNA_ends"]] output_line = "%s\n" %("\t".join(output_fields)) output_file.write(output_line) print "Compared a total of %d events." %(num_events_compared) output_file.close()
def compute_gene_psi(gene_ids, gff_index_filename, bam_filename,
                     output_dir, read_len, overhang_len,
                     paired_end=None,
                     event_type=None,
                     verbose=True):
    """
    Run Psi at the Gene-level (for multi-isoform inference.)

    Arguments:

    - gene_ids: set of gene IDs corresponding to gene IDs from the GFF.
    - gff_index_filename: indexed (pickled) GFF filename describing the
      genes; must end in ".pickle".
    - bam_filename: BAM filename with the reads (must be sorted and
      indexed).
    - output_dir: output directory; one .miso output file per gene is
      written under a per-chromosome subdirectory (nested under
      event_type when that is given).
    - read_len / overhang_len: read parameters for the sampler.
    - paired_end: optional (mean, stddev) of the fragment length
      distribution; enables paired-end mode.
    - event_type: optional event type used only for output nesting.
    - verbose: accepted but never read here
      # NOTE(review): unused parameter -- confirm callers before removing.
    """
    misc_utils.make_dir(output_dir)
    if not os.path.exists(gff_index_filename):
        print "Error: No GFF %s" %(gff_index_filename)
        return
    num_genes = len(gene_ids)
    print "Computing Psi for %d genes..." %(num_genes)
    print " - " + ", ".join(gene_ids)
    print " - GFF filename: %s" %(gff_index_filename)
    print " - BAM: %s" %(bam_filename)
    print " - Outputting to: %s" %(output_dir)
    if paired_end:
        print " - Paired-end mode: ", paired_end
    # Pull sampler configuration from the globally loaded Settings
    settings = Settings.get()
    settings_params = Settings.get_sampler_params()
    burn_in = settings_params["burn_in"]
    lag = settings_params["lag"]
    num_iters = settings_params["num_iters"]
    num_chains = settings_params["num_chains"]
    min_event_reads = Settings.get_min_event_reads()
    strand_rule = Settings.get_strand_param()
    mean_frag_len = None
    frag_variance = None
    if paired_end:
        # paired_end gives (mean, stddev); the sampler wants the variance
        mean_frag_len = int(paired_end[0])
        frag_variance = power(int(paired_end[1]), 2)
    # Load the genes from the GFF
    gff_genes = gff_utils.load_indexed_gff_file(gff_index_filename)
    # If given a template for the SAM file, use it
    template = None
    if settings and "sam_template" in settings:
        template = settings["sam_template"]
    # Default to filtering reads unless the settings say otherwise
    if "filter_reads" not in settings:
        filter_reads = True
    else:
        filter_reads = settings["filter_reads"]
    # Load the BAM file upfront
    bamfile = sam_utils.load_bam_reads(bam_filename,
                                       template=template)
    # Check if we're in compressed mode
    # NOTE(review): compressed_mode is computed but not used below --
    # presumably relevant to ID lookup elsewhere; confirm.
    compressed_mode = misc_utils.is_compressed_index(gff_index_filename)
    for gene_id, gene_info in gff_genes.iteritems():
        lookup_id = gene_id
        # Skip genes that we were not asked to run on
        if lookup_id not in gene_ids:
            continue
        gene_obj = gene_info['gene_object']
        gene_hierarchy = gene_info['hierarchy']
        # Sanity check: if the isoforms are all shorter than the read,
        # skip the event
        if all(map(lambda l: l < read_len, gene_obj.iso_lens)):
            print "All isoforms of %s shorter than %d, so skipping" \
                %(gene_id, read_len)
            continue
        # Find the most inclusive transcription start and end sites
        # for each gene
        tx_start, tx_end = \
            gff_utils.get_inclusive_txn_bounds(gene_info['hierarchy'][gene_id])
        # Fetch reads aligning to the gene boundaries
        gene_reads = \
            sam_utils.fetch_bam_reads_in_gene(bamfile, gene_obj.chrom,
                                              tx_start, tx_end,
                                              gene_obj)
        # Parse reads: checking strandedness and pairing
        # reads in case of paired-end data
        reads, num_raw_reads = \
            sam_utils.sam_parse_reads(gene_reads,
                                      paired_end=paired_end,
                                      strand_rule=strand_rule,
                                      target_strand=gene_obj.strand)
        # Skip gene if none of the reads align to gene boundaries
        if filter_reads:
            if num_raw_reads < min_event_reads:
                print "Only %d reads in gene, skipping (needed >= %d reads)" \
                    %(num_raw_reads, min_event_reads)
                continue
            else:
                print "%d raw reads in event" %(num_raw_reads)
        num_isoforms = len(gene_obj.isoforms)
        # Flat (all-ones) Dirichlet hyperparameters over isoforms
        hyperparameters = ones(num_isoforms)
        ##
        ## Run the sampler
        ##
        # Create the sampler with the right parameters depending on whether
        # this is a paired-end or single-end data set.
        if paired_end:
            # Sampler parameters for paired-end mode
            sampler_params = \
                miso.get_paired_end_sampler_params(num_isoforms,
                                                   mean_frag_len,
                                                   frag_variance,
                                                   read_len,
                                                   overhang_len=overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=True,
                                       log_dir=output_dir)
        else:
            # Sampler parameters for single-end mode
            sampler_params = miso.get_single_end_sampler_params(num_isoforms,
                                                                read_len,
                                                                overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=False,
                                       log_dir=output_dir)
        # Make directory for chromosome -- if given an event type, put
        # the gene in the event type directory
        if event_type != None:
            chrom_dir = os.path.join(output_dir, event_type, gene_obj.chrom)
        else:
            chrom_dir = os.path.join(output_dir, gene_obj.chrom)
        # Tolerate the directory already existing (races with other workers)
        try:
            os.makedirs(chrom_dir)
        except OSError:
            pass
        # Pick .miso output filename based on the pickle filename
        miso_basename = os.path.basename(gff_index_filename)
        if not miso_basename.endswith(".pickle"):
            print "Error: Invalid index file %s" %(gff_index_filename)
            sys.exit(1)
        miso_basename = miso_basename.replace(".pickle", "")
        output_filename = os.path.join(chrom_dir, "%s" %(miso_basename))
        sampler.run_sampler(num_iters, reads, gene_obj,
                            hyperparameters, sampler_params,
                            output_filename,
                            num_chains=num_chains,
                            burn_in=burn_in,
                            lag=lag)
def __init__(self, gff_dir, bam_filename, output_dir,
             read_len, overhang_len,
             settings_fname=None,
             paired_end=None,
             use_cluster=False,
             chunk_jobs=200,
             SGEarray=False,
             sge_job_name="misojob",
             gene_ids=None,
             num_proc=None,
             wait_on_jobs=True):
    """
    Set up a dispatcher that batches per-gene MISO runs.

    Validates the BAM file (and warns if its .bai index is missing),
    records run parameters, creates the batch/log/script output
    directories, and compiles the set of gene IDs to run on from the
    indexed GFF directory. Exits the process if the BAM file is missing
    or if no genes are found to run on.

    Arguments mirror the attributes set below; gene_ids optionally
    restricts the run to a subset of genes from the GFF index.
    """
    # Map of worker/job handles, filled in later by the run machinery
    self.threads = {}
    self.gff_dir = gff_dir
    self.bam_filename = bam_filename
    # Check that the BAM filename exists and that it has an index
    if not os.path.isfile(self.bam_filename):
        print "Error: BAM file %s not found." %(self.bam_filename)
        sys.exit(1)
    self.bam_index_fname = "%s.bai" %(self.bam_filename)
    if not os.path.isfile(self.bam_index_fname):
        # Missing index is only a warning: downstream fetches may still
        # fail, but we let the user proceed
        print "WARNING: Expected BAM index file %s not found." \
            %(self.bam_index_fname)
        print "Are you sure your BAM file is indexed?"
    self.output_dir = output_dir
    self.read_len = read_len
    # For now setting overhang to 1 always, deliberately ignoring the
    # overhang_len argument
    #self.overhang_len = overhang_len
    self.overhang_len = 1
    self.settings_fname = settings_fname
    self.paired_end = paired_end
    self.use_cluster = use_cluster
    self.chunk_jobs = chunk_jobs
    self.settings = Settings.get()
    self.cluster_cmd = Settings.get_cluster_command()
    self.sge_job_name = sge_job_name
    self.wait_on_jobs = wait_on_jobs
    # if chunk_jobs not given (i.e. set to False),
    # then set it to arbitrary value
    if not self.chunk_jobs:
        self.chunk_jobs = 200
    self.SGEarray = SGEarray
    # Processor count defaults from Settings, overridable per-run
    self.num_processors = Settings.get_num_processors()
    if num_proc is not None:
        num_proc = int(num_proc)
        self.num_processors = num_proc
        print "Using %d processors" %(num_proc)
    # Genes with more isoforms than this threshold are treated as "long"
    # jobs by the scheduler  # NOTE(review): semantics inferred from the
    # name only -- confirm against the run machinery
    self.long_thresh = 50
    self.batch_logs_dir = \
        os.path.join(output_dir, "batch-logs")
    self.batch_genes_dir = \
        os.path.join(output_dir, "batch-genes")
    self.cluster_scripts_dir = \
        os.path.join(output_dir, "cluster_scripts")
    self.scripts_output_dir = \
        os.path.join(output_dir, "scripts_output")
    misc_utils.make_dir(self.batch_logs_dir)
    misc_utils.make_dir(self.batch_genes_dir)
    misc_utils.make_dir(self.cluster_scripts_dir)
    misc_utils.make_dir(self.scripts_output_dir)
    # First compile a set of genes that should be run on
    # and output them to file along with their indexed
    # filenames
    self.gene_ids_to_gff_index = \
        gff_utils.get_gene_ids_to_gff_index(gff_dir)
    # If we're given filtered gene IDs, use them
    if gene_ids is not None:
        self.gene_ids = gene_ids
    else:
        self.gene_ids = self.gene_ids_to_gff_index.keys()
    if len(self.gene_ids) == 0:
        print "Error: No genes to run on. Did you pass me the wrong path " \
              "to your index GFF directory? " \
              "Or perhaps your indexed GFF directory " \
              "is empty?"
        sys.exit(1)
    self.batch_filenames = self.output_batch_files()
def compute_all_genes_psi(gff_dir, bam_filename, read_len, output_dir, use_cluster=False, SGEarray=False, chunk_jobs=800, overhang_len=1, paired_end=None, settings_fname=None, job_name="misojob", num_proc=None, prefilter=False, wait_on_jobs=True): """ Compute Psi values for genes using a GFF and a BAM filename. SGE functionality contributed by Michael Lovci. Options: - prefilter: if set to True, prefilter events by coverage. Uses bedtools to determine coverage of each event and remove events that do not meet the coverage criteria from the run. """ print "Computing Psi values..." print " - GFF index: %s" %(gff_dir) print " - BAM: %s" %(bam_filename) print " - Read length: %d" %(read_len) print " - Output directory: %s" %(output_dir) misc_utils.make_dir(output_dir) # Check GFF and BAM for various errors like headers mismatch run_events.check_gff_and_bam(gff_dir, bam_filename, given_read_len=read_len) # Prefilter events that do not meet the coverage criteria # If filtering is on, only run on events that meet # the filter. all_gene_ids = None if prefilter: print " - Prefiltering on" if misc_utils.which("bedtools") is None: print "Error: Cannot use bedtools. Bedtools is " \ "required for --prefilter option" sys.exit(1) filtered_gene_ids = run_events.get_ids_passing_filter(gff_dir, bam_filename, output_dir) # Prefiltering succeeded, so process only gene ids that # pass the filter if filtered_gene_ids != None: num_pass = len(filtered_gene_ids) all_gene_ids = filtered_gene_ids # If none of the events meet the read coverage filter # something must have gone wrong, e.g. mismatch # in chromosome headers between BAM and GFF if num_pass == 0: print "Error: None of the events in %s appear to meet the " \ "read coverage filter. Check that your BAM headers " \ "in %s match the GFF headers of indexed events." \ %(gff_dir, bam_filename) sys.exit(1) print " - Total of %d events pass coverage filter." 
\ %(num_pass) ## ## Submit jobs either using cluster or locally ## using multi-cores. ## dispatcher = GenesDispatcher(gff_dir, bam_filename, output_dir, read_len, overhang_len, settings_fname=settings_fname, paired_end=paired_end, use_cluster=use_cluster, chunk_jobs=chunk_jobs, sge_job_name=job_name, SGEarray=SGEarray, gene_ids=all_gene_ids, num_proc=num_proc, wait_on_jobs=wait_on_jobs) dispatcher.run()
def run_SGEarray_cluster( arg_list, argfile, cluster_output_dir, queue_type="long", cluster_scripts_dir=None, chunk=2500, settings=None, cmd_name="qsub", job_name="miso_job", ): """ Run MISO jobs on cluster using SGE. Function contributed by Michael Lovci, UCSD. """ misc_utils.make_dir(cluster_output_dir) # Create arguments file to pass on to job f = open(argfile, "w") nargs = len(arg_list) if nargs % chunk == 0: njobs = nargs / chunk else: njobs = 1 + (nargs / chunk) for args in arg_list: f.write(args[0] + "\n") f.close() if cluster_scripts_dir == None: cluster_scripts_dir = os.path.join(cluster_output_dir, "cluster_scripts") misc_utils.make_dir(cluster_scripts_dir) scripts_output_dir = os.path.join(cluster_output_dir, "scripts_output") misc_utils.make_dir(scripts_output_dir) scripts_output_dir = os.path.abspath(scripts_output_dir) script_error = os.path.join(scripts_output_dir, string.join([job_name, "err"], ".")) script_out = os.path.join(scripts_output_dir, string.join([job_name, "out"], ".")) cluster_script = os.path.join(cluster_scripts_dir, "run_miso.sh") if settings != None: load_settings(settings) cmd_name = Settings.get_cluster_command() if queue_type == "long": queue_name = Settings.get_long_queue_name() elif queue_type == "short": queue_name = Settings.get_short_queue_name() else: raise Exception, "Unknown queue type: %s" % (queue_type) if queue_type == None: print " - queue: unspecified" else: print " - queue: %s, using queue name %s" % (queue_type, queue_name) cs = open(cluster_script, "w") cs.write("#!/bin/sh" + "\n") cs.write("#$ -N %s\n" % (job_name)) cs.write("#$ -S /bin/sh\n") cs.write("#$ -p -1023\n") cs.write("#$ -o %s\n" % (script_out)) cs.write("#$ -e %s\n" % (script_error)) cs.write("#$ -t 1-%s\n" % (njobs)) ##execute from current working directory cs.write("#$ -cwd\n") ## import environment variables cs.write("#$ -V\n") if queue_name: cs.write("#$ -l %s\n" % (queue_name)) cs.write('echo "hostname is:"\n') cs.write("hostname\n") 
cs.write("ARGFILE=%s\n" % argfile) cs.write("SEQ=/usr/bin/seq\n") cs.write("index=0\n") cs.write("lastindex=0\n") cs.write('let "index = $SGE_TASK_ID * %s"\n' % (chunk)) chunk2 = chunk - 1 cs.write('let "lastindex = $index - %s"\n' % (chunk2)) if chunk2 > 0: cs.write("for i in `$SEQ $lastindex $index`\n") else: cs.write("for i in $index\n") # if user chooses 1 for chunk size cs.write("do\n") cs.write(" line=$(cat $ARGFILE | head -n $i | tail -n 1)\n") cs.write(" eval $line\n") cs.write("done\n") cs.close() # Make script executable os.system('chmod +x "%s"' % (cluster_script)) qsub_cmd = cmd_name + ' "%s"' % (cluster_script) os.system(qsub_cmd)
def compute_psi(sample_filenames, output_dir, event_type, read_len, overhang_len, use_cluster=False, chunk_jobs=False, filter_events=True, events_info_filename=None, settings_filename=None): """ Compute Psi values for skipped exons. Sample filenames is a mapping from sample label to sample. - sample_filenames = [[sample_label1, sample_filename1], [sample_label2, sample_filename2]] - output_dir: output directory - event_type: 'SE', 'RI', etc. """ misc_utils.make_dir(output_dir) output_dir = os.path.join(output_dir, event_type) output_dir = os.path.abspath(output_dir) misc_utils.make_dir(output_dir) print "Computing Psi for events of type %s" %(event_type) print " - samples used: ", sample_filenames.keys() for sample_label, sample_filename in sample_filenames.iteritems(): print "Processing sample: label=%s, filename=%s" \ %(sample_label, sample_filename) results_output_dir = os.path.join(output_dir, sample_label) misc_utils.make_dir(results_output_dir) # Load the set of counts and serialize them into JSON events = \ as_events.load_event_counts(sample_filename, event_type, events_info_filename=events_info_filename) # Filter events if filter_events: print "Filtering events..." events.filter_events(settings=Settings.get()) print "Running on a total of %d events." %(len(events.events)) events_filename = events.output_file(results_output_dir, sample_label) # Run MISO on them miso_cmd = "python %s --compute-two-iso-psi %s %s --event-type %s " \ "--read-len %d --overhang-len %d " \ %(os.path.join(miso_path, 'run_miso.py'), events_filename, results_output_dir, event_type, read_len, overhang_len) if use_cluster: if chunk_jobs: miso_cmd += ' --use-cluster --chunk-jobs %d' %(chunk_jobs) else: miso_cmd += ' --use-cluster' print "Executing: %s" %(miso_cmd) if use_cluster: print " - Using cluster" os.system(miso_cmd)
def output_samples_comparison(sample1_dir, sample2_dir, output_dir, alpha=.95, sample_labels=None, use_compressed=None): """ Compute the bayes factors, posterior means, and other statistics between the two samples and output them to a directory. Expects two directories with samples from a MISO run, where corresponding events in the two samples' directories begin with the same event name. """ print "Given output dir: %s" % (output_dir) print "Retrieving MISO files in sample directories..." sample1_obj = MISOSamples(sample1_dir, use_compressed=use_compressed) sample2_obj = MISOSamples(sample2_dir, use_compressed=use_compressed) print "Computing sample comparison between %s and %s..." % (sample1_dir, sample2_dir) print " - No. of events in %s: %d" % (sample1_dir, sample1_obj.num_events) print " - No. of events in %s: %d" % (sample2_dir, sample2_obj.num_events) # Output header for Bayes factor file if sample_labels is None: # Use directory names as sample labels sample1_label = os.path.basename(os.path.normpath(sample1_dir)) sample2_label = os.path.basename(os.path.normpath(sample2_dir)) else: # If we're given sample labels, use them sample1_label, sample2_label = sample_labels print "Using user-given sample labels (sample1 = %s, sample2 = %s)" \ %(sample1_label, sample2_label) output_dir = os.path.join(output_dir, "%s_vs_%s" % (sample1_label, sample2_label)) print "Creating comparisons parent directory: %s" % (output_dir) # Create parent directory for comparison misc_utils.make_dir(output_dir) # Create directory for Bayes factors bf_output_dir = os.path.join(output_dir, 'bayes-factors/') misc_utils.make_dir(bf_output_dir) header_fields = [ 'event_name', 'sample1_posterior_mean', 'sample1_ci_low', 'sample1_ci_high', 'sample2_posterior_mean', 'sample2_ci_low', 'sample2_ci_high', 'diff', 'bayes_factor', 'isoforms', 'sample1_counts', 'sample1_assigned_counts', 'sample2_counts', 'sample2_assigned_counts', 'chrom', 'strand', 'mRNA_starts', 'mRNA_ends' ] header_line = 
"\t".join(header_fields) + "\n" output_filename = \ os.path.join(bf_output_dir, "%s_vs_%s.miso_bf" %(sample1_label, sample2_label)) output_file = open(output_filename, 'w') output_file.write(header_line) num_events_compared = 0 file_num = 0 # Compute the Bayes factors for each file for event_name in sample1_obj.all_event_names: sample1_results = sample1_obj.get_event_samples(event_name) # Parameters from raw MISO samples file samples1 = sample1_results[0] header1 = sample1_results[1] header1 = header1[0] params1 = parse_sampler_params_from_header(header1) # Extract gene information if available gene_info = get_gene_info_from_params(params1) # Find corresponding event filename in sample 2 sample2_results = sample2_obj.get_event_samples(event_name) if sample2_results is None: continue num_events_compared += 1 # Compute delta of posterior samples and Bayes factors diff_range = arange(-1, 1, 0.001) delta_densities = \ compute_delta_densities(sample1_results, sample2_results, diff_range, event_name=event_name, sample1_label=sample1_label, sample2_label=sample2_label) bf = delta_densities['bayes_factor'] num_isoforms = shape(delta_densities['samples1'])[1] sample1_posterior_mean = mean(delta_densities['samples1'], 0) sample2_posterior_mean = mean(delta_densities['samples2'], 0) # Get the labels of the isoforms isoforms_field = delta_densities['isoforms'] # Get the counts information about both samples sample1_counts_info = delta_densities['sample1_counts'] sample2_counts_info = delta_densities['sample2_counts'] # Compute posterior mean and credible intervals for sample 1 sample1_cred_intervals = \ format_credible_intervals(event_name, delta_densities['samples1'], confidence_level=alpha) sample1_ci_low = sample1_cred_intervals[2] sample1_ci_high = sample1_cred_intervals[3] # Compute posterior mean and credible intervals for sample 2 sample2_cred_intervals = \ format_credible_intervals(event_name, delta_densities['samples2'], confidence_level=alpha) sample2_ci_low = 
sample2_cred_intervals[2] sample2_ci_high = sample2_cred_intervals[3] posterior_diff = sample1_posterior_mean - sample2_posterior_mean # Use precision of two decimal places if num_isoforms == 2: sample1_posterior_mean = \ Decimal(str(sample1_posterior_mean[0])).quantize(Decimal('0.01')) sample2_posterior_mean = \ Decimal(str(sample2_posterior_mean[0])).quantize(Decimal('0.01')) posterior_diff = "%.2f" % (sample1_posterior_mean - sample2_posterior_mean) bayes_factor = "%.2f" % (bf[0]) else: posterior_diff = \ ",".join(["%.2f" %(v) for v in (sample1_posterior_mean - sample2_posterior_mean)]) sample1_posterior_mean = sample1_cred_intervals[1] sample2_posterior_mean = sample2_cred_intervals[1] bayes_factor = ",".join(["%.2f" % (max(v, 0)) for v in bf]) # Write comparison output line output_fields = [ event_name, # Mean and confidence bounds for sample 1 "%s" % (sample1_posterior_mean), "%s" % (sample1_ci_low), "%s" % (sample1_ci_high), # Mean and confidence bounds for sample 2 "%s" % (sample2_posterior_mean), "%s" % (sample2_ci_low), "%s" % (sample2_ci_high), # Delta Psi value "%s" % (posterior_diff), # Bayes factor "%s" % (bayes_factor), # Description of the isoforms "%s" % (isoforms_field), # Counts information for sample 1 "%s" % (sample1_counts_info['counts']), "%s" % (sample1_counts_info['assigned_counts']), # Counts information for sample 2 "%s" % (sample2_counts_info['counts']), "%s" % (sample2_counts_info['assigned_counts']), # Gene information gene_info["chrom"], gene_info["strand"], gene_info["mRNA_starts"], gene_info["mRNA_ends"] ] output_line = "%s\n" % ("\t".join(output_fields)) output_file.write(output_line) print "Compared a total of %d events." % (num_events_compared) output_file.close()