def runMISOlocal(pickledDir, bamFile, readlen, overhanglen, outdir,
                 paired_end, settings_f):
    """
    Run MISO on a single BAM file locally, i.e. without copying
    files to a compute node.

    Args:
        pickledDir (str/path): Directory containing the pickled MISO
            annotations. This database can be generated with MISO's
            index_gff script.
        bamFile (str/path): Path to a sorted, indexed BAM file.
            *Please note*: BAM files must not be trimmed. MISO cannot
            process mixed read lengths.
        readlen (int): Read length of bamFile.
        overhanglen (int): Number of nucleotides a read must overlap a
            splice junction by to be counted in subsequent analyses.
        outdir (str/path): Directory where MISO results will be stored.
        paired_end (bool): Paired-end mode. Since this wrapper does not
            support paired-end data, the flag defaults to False.
        settings_f (str/path): MISO settings file, which also lists the
            cluster flags used for job submission.

    Returns:
        Nothing. Populates <outdir> with pickled MISO events and
        PSI values.
    """
    if paired_end == False or paired_end == 'False':
        paired_end = None
    Settings.load(settings_f)
    run_events_analysis.compute_all_genes_psi(
        pickledDir, bamFile, int(readlen), outdir,
        overhang_len=int(overhanglen),
        paired_end=paired_end, settings_fname=settings_f)
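# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original pipeline): how one
# might invoke runMISOlocal. All paths below are hypothetical placeholders;
# a real run needs an index built with the index_gff script and a sorted,
# indexed BAM file.
# ---------------------------------------------------------------------------
def _demo_runMISOlocal():
    runMISOlocal("/data/miso_index",           # pickled annotation index
                 "/data/sample.sorted.bam",    # sorted, indexed BAM
                 readlen=50,
                 overhanglen=8,
                 outdir="/data/miso_out",
                 paired_end=False,             # wrapper is single-end only
                 settings_f="/data/miso_settings.txt")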
def get_ids_passing_filter(gff_index_dir,
                           bam_filename,
                           output_dir):
    """
    Apply a coverage filter to events using bedtools and return only
    the events that pass it.
    """
    min_event_reads = Settings.get_min_event_reads()
    # Check that this was indexed with a version that outputs
    # a genes.gff file
    genes_gff_fname = os.path.join(gff_index_dir, "genes.gff")
    if not os.path.isfile(genes_gff_fname):
        print "WARNING: Could not find 'genes.gff' in %s - " \
              "skipping prefilter stage. Please reindex your " \
              "GFF file with the latest version to enable " \
              "prefiltering." %(gff_index_dir)
        return None
    print "Prefiltering reads..."
    coverage_fname = exon_utils.get_bam_gff_coverage(bam_filename,
                                                     genes_gff_fname,
                                                     output_dir)
    ids_passing_filter = []
    with open(coverage_fname) as coverage_in:
        for line in coverage_in:
            # Skip comments
            if line.startswith("#"):
                continue
            fields = line.strip().split("\t")
            # Get the counts field and keep the event ID
            # only if it passes the filter
            counts = int(fields[9])
            if counts < min_event_reads:
                continue
            attribs = gff_utils.parse_gff_attribs(fields[8])
            if "ID" not in attribs:
                print "WARNING: No ID= found for line:\n%s\nSkipping..." \
                      %(line)
                continue
            event_id = attribs["ID"]
            ids_passing_filter.append(event_id)
    return ids_passing_filter
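# ---------------------------------------------------------------------------
# Worked example (illustrative, not from the original module): how a single
# bedtools coverage line flows through the filter above. Column 10
# (fields[9]) holds the read count that bedtools appends, and column 9
# (fields[8]) holds the GFF attributes; the inline attribute parsing here is
# a simplified stand-in for gff_utils.parse_gff_attribs.
# ---------------------------------------------------------------------------
def _demo_coverage_filter(min_event_reads=20):
    line = "chr1\tSE\tgene\t1000\t2000\t.\t+\t.\tID=myevent;Name=myevent\t35"
    fields = line.strip().split("\t")
    counts = int(fields[9])                 # 35 reads over this event
    if counts < min_event_reads:
        return None                         # event filtered out
    attribs = dict(pair.split("=", 1)
                   for pair in fields[8].split(";") if "=" in pair)
    return attribs.get("ID")                # -> "myevent"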
def compute_gene_psi(gene_ids, gff_index_filename, bam_filename,
                     output_dir, read_len, overhang_len,
                     paired_end=None,
                     event_type=None,
                     verbose=True):
    """
    Run Psi at the gene level (for multi-isoform inference).

    Arguments:

    - Set of gene IDs corresponding to gene IDs from the GFF
    - Indexed GFF filename describing the genes
    - BAM filename with the reads (must be sorted and indexed)
    - Output directory
    - Optional: run in paired-end mode. Takes the mean and standard
      deviation of the fragment length distribution.
    """
    misc_utils.make_dir(output_dir)

    if not os.path.exists(gff_index_filename):
        print "Error: No GFF %s" %(gff_index_filename)
        return

    num_genes = len(gene_ids)

    print "Computing Psi for %d genes..." %(num_genes)
    print " - " + ", ".join(gene_ids)
    print " - GFF filename: %s" %(gff_index_filename)
    print " - BAM: %s" %(bam_filename)
    print " - Outputting to: %s" %(output_dir)

    if paired_end:
        print " - Paired-end mode: ", paired_end

    settings = Settings.get()
    settings_params = Settings.get_sampler_params()
    burn_in = settings_params["burn_in"]
    lag = settings_params["lag"]
    num_iters = settings_params["num_iters"]
    num_chains = settings_params["num_chains"]

    min_event_reads = Settings.get_min_event_reads()
    strand_rule = Settings.get_strand_param()

    mean_frag_len = None
    frag_variance = None

    if paired_end:
        mean_frag_len = int(paired_end[0])
        frag_variance = power(int(paired_end[1]), 2)

    # Load the genes from the GFF
    gff_genes = gff_utils.load_indexed_gff_file(gff_index_filename)

    # If given a template for the SAM file, use it
    template = None

    if settings and "sam_template" in settings:
        template = settings["sam_template"]

    if "filter_reads" not in settings:
        filter_reads = True
    else:
        filter_reads = settings["filter_reads"]

    # Load the BAM file upfront
    bamfile = sam_utils.load_bam_reads(bam_filename, template=template)

    # Check if we're in compressed mode
    compressed_mode = misc_utils.is_compressed_index(gff_index_filename)

    for gene_id, gene_info in gff_genes.iteritems():
        lookup_id = gene_id
        # Skip genes that we were not asked to run on
        if lookup_id not in gene_ids:
            continue
        gene_obj = gene_info['gene_object']
        gene_hierarchy = gene_info['hierarchy']

        # Sanity check: if the isoforms are all shorter than the read,
        # skip the event
        if all(map(lambda l: l < read_len, gene_obj.iso_lens)):
            print "All isoforms of %s shorter than %d, so skipping" \
                  %(gene_id, read_len)
            continue

        # Find the most inclusive transcription start and end sites
        # for each gene
        tx_start, tx_end = \
            gff_utils.get_inclusive_txn_bounds(gene_info['hierarchy'][gene_id])

        # Fetch reads aligning to the gene boundaries
        gene_reads = \
            sam_utils.fetch_bam_reads_in_gene(bamfile, gene_obj.chrom,
                                              tx_start, tx_end,
                                              gene_obj)

        # Parse reads: check strandedness and, for paired-end
        # data, pair the reads
        reads, num_raw_reads = \
            sam_utils.sam_parse_reads(gene_reads,
                                      paired_end=paired_end,
                                      strand_rule=strand_rule,
                                      target_strand=gene_obj.strand,
                                      given_read_len=read_len)

        # Skip gene if none of the reads align to gene boundaries
        if filter_reads:
            if num_raw_reads < min_event_reads:
                print "Only %d reads in gene, skipping (needed >= %d reads)" \
                      %(num_raw_reads, min_event_reads)
                continue
            else:
                print "%d raw reads in event" %(num_raw_reads)

        num_isoforms = len(gene_obj.isoforms)
        hyperparameters = ones(num_isoforms)

        ##
        ## Run the sampler
        ##
        # Create the sampler with the right parameters depending on whether
        # this is a paired-end or single-end data set.
        if paired_end:
            # Sampler parameters for paired-end mode
            sampler_params = \
                miso.get_paired_end_sampler_params(num_isoforms,
                                                   mean_frag_len,
                                                   frag_variance,
                                                   read_len,
                                                   overhang_len=overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=True,
                                       log_dir=output_dir)
        else:
            # Sampler parameters for single-end mode
            sampler_params = miso.get_single_end_sampler_params(num_isoforms,
                                                                read_len,
                                                                overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=False,
                                       log_dir=output_dir)

        # Make directory for the chromosome -- if given an event type,
        # put the gene in the event type directory
        if event_type != None:
            chrom_dir = os.path.join(output_dir, event_type, gene_obj.chrom)
        else:
            chrom_dir = os.path.join(output_dir, gene_obj.chrom)

        try:
            os.makedirs(chrom_dir)
        except OSError:
            pass

        # Pick the .miso output filename based on the pickle filename
        miso_basename = os.path.basename(gff_index_filename)
        if not miso_basename.endswith(".pickle"):
            print "Error: Invalid index file %s" %(gff_index_filename)
            sys.exit(1)
        miso_basename = miso_basename.replace(".pickle", "")
        output_filename = os.path.join(chrom_dir, "%s" %(miso_basename))

        sampler.run_sampler(num_iters, reads, gene_obj,
                            hyperparameters, sampler_params,
                            output_filename,
                            num_chains=num_chains,
                            burn_in=burn_in,
                            lag=lag)
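# ---------------------------------------------------------------------------
# Illustrative call (hypothetical paths and gene ID, not from the original
# source): computing Psi for one gene from its indexed GFF pickle. Settings
# must be loaded first, since compute_gene_psi reads the sampler parameters
# from Settings.get_sampler_params().
# ---------------------------------------------------------------------------
def _demo_compute_gene_psi():
    Settings.load("/data/miso_settings.txt")
    compute_gene_psi(["ENSG0000001"],
                     "/data/miso_index/chr1/ENSG0000001.pickle",
                     "/data/sample.sorted.bam",
                     "/data/miso_out",
                     read_len=50,
                     overhang_len=1)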
def main():
    from optparse import OptionParser
    parser = OptionParser()

    ##
    ## Main options
    ##
    parser.add_option("--compute-gene-psi", dest="compute_gene_psi",
                      nargs=4, default=None,
                      help="Compute Psi for a given multi-isoform gene. "
                      "Expects four arguments: the first is a gene ID or set "
                      "of comma-separated (no spaces) gene IDs, "
                      "the second is a GFF indexed file with the gene "
                      "information, the third is a sorted and "
                      "indexed BAM file with reads aligned to the gene, "
                      "and the fourth is an output directory.")
    parser.add_option("--paired-end", dest="paired_end",
                      nargs=2, default=None,
                      help="Run in paired-end mode. Takes a mean and standard "
                      "deviation for the fragment length distribution (assumed "
                      "to have a discretized normal form).")
    parser.add_option("--compute-genes-from-file", dest="compute_genes_from_file",
                      nargs=3, default=None,
                      help="Runs on a set of genes from a file. Takes as input: "
                      "(1) a two-column tab-delimited file, where the first "
                      "column is the event ID (ID field from the GFF) and the "
                      "second column is the path to the indexed GFF file for "
                      "that event. MISO will run on all the events described "
                      "in the file, (2) a sorted, indexed BAM file to run on, "
                      "and (3) a directory to output results to.")
    ##
    ## Psi utilities
    ##
    parser.add_option("--compare-samples", dest="samples_to_compare",
                      nargs=3, default=None,
                      help="Compute comparison statistics between the two "
                      "given samples. Expects three directories: the first is "
                      "sample1's MISO output, the second is sample2's MISO "
                      "output, and the third is the directory where "
                      "results of the sample comparison will be outputted.")
    parser.add_option("--comparison-labels", dest="comparison_labels",
                      nargs=2, default=None,
                      help="Use these labels for the sample comparison "
                      "made by --compare-samples. "
                      "Takes two arguments: the label for sample 1 "
                      "and the label for sample 2, where sample 1 and "
                      "sample 2 correspond to the order of samples given "
                      "to --compare-samples.")
    parser.add_option("--summarize-samples", dest="summarize_samples",
                      nargs=2, default=None,
                      help="Compute summary statistics of the given set "
                      "of samples. Expects a directory with MISO output "
                      "and a directory to output the summary file to.")
    parser.add_option("--summary-label", dest="summary_label",
                      nargs=1, default=None,
                      help="Label for the MISO summary file. If not given, "
                      "uses the basename of the MISO output directory.")
    parser.add_option("--use-cluster", action="store_true",
                      dest="use_cluster", default=False)
    parser.add_option("--chunk-jobs", dest="chunk_jobs",
                      default=False, type="int",
                      help="Size (in number of events) of each job to "
                      "chunk events file into. Only applies when "
                      "running on cluster.")
    parser.add_option("--settings-filename", dest="settings_filename",
                      default=os.path.join(miso_settings_path,
                                           "settings",
                                           "miso_settings.txt"),
                      help="Filename specifying MISO settings.")
    parser.add_option("--read-len", dest="read_len", type="int",
                      default=None)
    parser.add_option("--overhang-len", dest="overhang_len", type="int",
                      default=None)
    parser.add_option("--event-type", dest="event_type",
                      default=None,
                      help="Event type of two-isoform "
                      "events (e.g. 'SE', 'RI', 'A3SS', ...)")
    parser.add_option("--use-compressed", dest="use_compressed",
                      nargs=1, default=None,
                      help="Use compressed event IDs. Takes as input a "
                      "genes_to_filenames.shelve file produced by the "
                      "index_gff script.")
    ##
    ## Gene utilities
    ##
    parser.add_option("--view-gene", dest="view_gene",
                      nargs=1, default=None,
                      help="View the contents of a gene/event that has "
                      "been indexed. Takes as input an "
                      "indexed (.pickle) filename.")

    (options, args) = parser.parse_args()

    if options.compute_gene_psi is None:
        greeting()

    ##
    ## Load the settings file
    ##
    Settings.load(os.path.expanduser(options.settings_filename))

    use_compressed = None
    if options.use_compressed is not None:
        use_compressed = \
            os.path.abspath(os.path.expanduser(options.use_compressed))
        if not os.path.exists(use_compressed):
            print "Error: mapping filename from event IDs to compressed IDs %s " \
                  "is not found." %(use_compressed)
            sys.exit(1)
        else:
            print "Compression being used."

    if options.samples_to_compare is not None:
        sample1_dirname = os.path.abspath(options.samples_to_compare[0])
        sample2_dirname = os.path.abspath(options.samples_to_compare[1])
        output_dirname = os.path.abspath(options.samples_to_compare[2])
        if not os.path.isdir(output_dirname):
            print "Making comparisons directory: %s" %(output_dirname)
            misc_utils.make_dir(output_dirname)
        ht.output_samples_comparison(sample1_dirname,
                                     sample2_dirname,
                                     output_dirname,
                                     sample_labels=options.comparison_labels,
                                     use_compressed=use_compressed)

    ##
    ## Main interface based on SAM files
    ##
    if options.compute_genes_from_file != None:
        # Run on events given by file
        run_compute_genes_from_file(options)
    if options.compute_gene_psi != None:
        run_compute_gene_psi(options)

    ##
    ## Summarizing samples
    ##
    if options.summarize_samples:
        samples_dir = \
            os.path.abspath(os.path.expanduser(options.summarize_samples[0]))
        if options.summary_label != None:
            samples_label = options.summary_label
            print "Using summary label: %s" %(samples_label)
        else:
            samples_label = \
                os.path.basename(os.path.expanduser(samples_dir))
        assert(len(samples_label) >= 1)
        summary_output_dir = \
            os.path.abspath(os.path.join(os.path.expanduser(options.summarize_samples[1]),
                                         'summary'))
        if not os.path.isdir(summary_output_dir):
            os.makedirs(summary_output_dir)
        summary_filename = os.path.join(summary_output_dir,
                                        '%s.miso_summary' %(samples_label))
        summarize_sampler_results(samples_dir, summary_filename,
                                  use_compressed=use_compressed)

    if options.view_gene != None:
        indexed_gene_filename = \
            os.path.abspath(os.path.expanduser(options.view_gene))
        print "Viewing genes in %s" %(indexed_gene_filename)
        gff_genes = gff_utils.load_indexed_gff_file(indexed_gene_filename)
        if gff_genes == None:
            print "No genes."
            sys.exit(1)
        for gene_id, gene_info in gff_genes.iteritems():
            print "Gene %s" %(gene_id)
            gene_obj = gene_info['gene_object']
            print " - Gene object: ", gene_obj
            print "=="
            print "Isoforms: "
            for isoform in gene_obj.isoforms:
                print " - ", isoform
            print "=="
            print "mRNA IDs: "
            for mRNA_id in gene_info['hierarchy'][gene_id]['mRNAs']:
                print "%s" %(mRNA_id)
            print "=="
            print "Exons: "
            for exon in gene_obj.parts:
                print " - ", exon
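# ---------------------------------------------------------------------------
# Example invocations of this CLI (illustrative; the script name run_miso.py
# is taken from the miso_cmd built in compute_psi and run() elsewhere in
# this module, and all paths/labels are hypothetical):
#
#   python run_miso.py --compute-gene-psi GENE_ID index/chr1/GENE_ID.pickle \
#       sample.sorted.bam miso_out/ --read-len 50 --overhang-len 1
#
#   python run_miso.py --compare-samples miso_out1/ miso_out2/ comparisons/ \
#       --comparison-labels control knockdown
# ---------------------------------------------------------------------------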
def __init__(self, gff_dir, bam_filename, output_dir,
             read_len, overhang_len,
             main_logger,
             settings_fname=None,
             paired_end=None,
             use_cluster=False,
             chunk_jobs=200,
             SGEarray=False,
             sge_job_name="misojob",
             gene_ids=None,
             num_proc=None,
             wait_on_jobs=True):
    self.main_logger = main_logger
    self.threads = {}
    self.gff_dir = gff_dir
    self.bam_filename = bam_filename
    # Check that the BAM filename exists and that it has an index
    if not os.path.isfile(self.bam_filename):
        self.main_logger.error("BAM file %s not found." %(self.bam_filename))
        sys.exit(1)
    self.bam_index_fname = "%s.bai" %(self.bam_filename)
    if not os.path.isfile(self.bam_index_fname):
        self.main_logger.warning("Expected BAM index file %s not found." \
                                 %(self.bam_index_fname))
        self.main_logger.warning("Are you sure your BAM file is indexed?")
    self.output_dir = output_dir
    self.read_len = read_len
    # For now, set the overhang to 1 always
    #self.overhang_len = overhang_len
    self.overhang_len = 1
    self.settings_fname = settings_fname
    self.paired_end = paired_end
    self.use_cluster = use_cluster
    self.chunk_jobs = chunk_jobs
    self.settings = Settings.get()
    self.cluster_cmd = Settings.get_cluster_command()
    self.sge_job_name = sge_job_name
    self.wait_on_jobs = wait_on_jobs
    # If chunk_jobs is not given (i.e. set to False),
    # default it to an arbitrary value
    if not self.chunk_jobs:
        self.chunk_jobs = 200
    self.SGEarray = SGEarray
    self.num_processors = Settings.get_num_processors()
    if num_proc is not None:
        num_proc = int(num_proc)
        self.num_processors = num_proc
        self.main_logger.info("Using %d processors" %(num_proc))
    self.long_thresh = 50
    self.batch_logs_dir = \
        os.path.join(output_dir, "batch-logs")
    self.batch_genes_dir = \
        os.path.join(output_dir, "batch-genes")
    self.cluster_scripts_dir = \
        os.path.join(output_dir, "cluster_scripts")
    self.scripts_output_dir = \
        os.path.join(output_dir, "scripts_output")
    misc_utils.make_dir(self.batch_logs_dir)
    misc_utils.make_dir(self.batch_genes_dir)
    misc_utils.make_dir(self.cluster_scripts_dir)
    misc_utils.make_dir(self.scripts_output_dir)
    # First compile the set of genes to run on and output them
    # to file along with their indexed filenames
    self.gene_ids_to_gff_index = \
        gff_utils.get_gene_ids_to_gff_index(gff_dir)
    # If we're given filtered gene IDs, use them
    if gene_ids is not None:
        self.gene_ids = gene_ids
    else:
        self.gene_ids = self.gene_ids_to_gff_index.keys()
    if len(self.gene_ids) == 0:
        self.main_logger.error("No genes to run on. Did you pass me the "
                               "wrong path to your indexed GFF directory? "
                               "Or perhaps your indexed GFF directory "
                               "is empty?")
        sys.exit(1)
    self.batch_filenames = self.output_batch_files()
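# ---------------------------------------------------------------------------
# Illustrative instantiation (a sketch under assumptions): this excerpt does
# not show the class name, so "GenesDispatcher" below is a hypothetical
# stand-in, and the logger setup mirrors the get_main_logger() call used in
# main() further down. Paths are placeholders.
# ---------------------------------------------------------------------------
def _demo_dispatcher():
    main_logger = get_main_logger(os.path.join("/data/miso_out", "logs"))
    dispatcher = GenesDispatcher("/data/miso_index",        # indexed GFF dir
                                 "/data/sample.sorted.bam", # sorted, indexed BAM
                                 "/data/miso_out",
                                 read_len=50,
                                 overhang_len=1,            # forced to 1 internally
                                 main_logger=main_logger,
                                 use_cluster=False,
                                 num_proc=4)
    dispatcher.run()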
def main():
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("--run", dest="compute_genes_psi",
                      nargs=2, default=None,
                      help="Compute Psi values for a given GFF annotation "
                      "of either whole mRNA isoforms or isoforms produced by "
                      "single alternative splicing events. Expects two "
                      "arguments: an indexed GFF directory with genes to "
                      "process, and a sorted, indexed BAM file (with "
                      "headers) to run on.")
    parser.add_option("--event-type", dest="event_type", nargs=1,
                      help="[OPTIONAL] Type of event (e.g. SE, RI, A3SS, ...)",
                      default=None)
    parser.add_option("--use-cluster", dest="use_cluster",
                      action="store_true", default=False,
                      help="Run events on cluster.")
    parser.add_option("--chunk-jobs", dest="chunk_jobs",
                      default=False, type="int",
                      help="Size (in number of events) of each job to chunk "
                      "events file into. Only applies when running on cluster.")
    parser.add_option("--no-filter-events", dest="no_filter_events",
                      action="store_true", default=False,
                      help="Do not filter events for computing Psi. "
                      "By default, MISO computes Psi only for events that "
                      "have a sufficient number of junction reads. "
                      "The default filter varies by event type.")
    parser.add_option("--settings-filename", dest="settings_filename",
                      default=os.path.join(miso_settings_path,
                                           "settings",
                                           "miso_settings.txt"),
                      help="Filename specifying MISO settings.")
    parser.add_option("--read-len", dest="read_len",
                      default=None, type="int",
                      help="Length of sequenced reads.")
    parser.add_option("--paired-end", dest="paired_end",
                      nargs=2, default=None,
                      help="Run in paired-end mode. Takes the mean and "
                      "standard deviation of the insert length distribution.")
    parser.add_option("--overhang-len", dest="overhang_len",
                      default=None, type="int",
                      help="Length of overhang constraints "
                      "imposed on junctions.")
    parser.add_option("--output-dir", dest="output_dir",
                      default=None,
                      help="Directory for MISO output.")
    parser.add_option("--job-name", dest="job_name", nargs=1,
                      help="Name for jobs submitted to queue for SGE jobs. "
                      "Default is misojob.",
                      default="misojob")
    parser.add_option("--SGEarray", dest="SGEarray",
                      action="store_true", default=False,
                      help="Use MISO on cluster with Sun Grid Engine. "
                      "To be used in conjunction with the --use-cluster option.")
    parser.add_option("--prefilter", dest="prefilter",
                      default=False, action="store_true",
                      help="Prefilter events based on coverage. If given as "
                      "argument, the run will begin by mapping BAM reads to "
                      "event regions (using bedtools), and omit events that "
                      "do not meet the coverage criteria from the run. Off by "
                      "default. Note that events that do not meet the coverage "
                      "criteria will not be processed regardless; --prefilter "
                      "simply does this filtering step at the start of the run, "
                      "potentially saving computation time so that low-coverage "
                      "events will not be processed or distributed to jobs if "
                      "MISO is run on a cluster. This option requires bedtools "
                      "to be installed and available on the path.")
    parser.add_option("-p", dest="num_proc",
                      default=None, nargs=1,
                      help="Number of processors to use. Only applies when "
                      "running MISO on a single machine with multiple cores; "
                      "does not apply to runs submitted to a cluster with "
                      "--use-cluster.")
    parser.add_option("--version", dest="version",
                      default=False, action="store_true",
                      help="Print MISO version.")
    parser.add_option("--no-wait", dest="no_wait",
                      default=False, action="store_true",
                      help="If passed in, do not wait on cluster jobs after "
                      "they are submitted. By default, wait.")
    ##
    ## Gene utilities
    ##
    parser.add_option("--view-gene", dest="view_gene",
                      nargs=1, default=None,
                      help="View the contents of a gene/event that has "
                      "been indexed. Takes as input an "
                      "indexed (.pickle) filename.")

    (options, args) = parser.parse_args()

    greeting()

    if options.version:
        print "MISO version %s\n" %(misopy.__version__)

    ##
    ## Load the settings file
    ##
    if not os.path.isdir(miso_settings_path):
        print "Error: %s is not a directory containing a default MISO " \
              "settings filename. Please specify a settings filename " \
              "using --settings-filename." %(miso_settings_path)
        return

    settings_filename = \
        os.path.abspath(os.path.expanduser(options.settings_filename))
    Settings.load(settings_filename)

    if (not options.use_cluster) and options.chunk_jobs:
        print "Error: Chunking jobs only applies when using " \
              "the --use-cluster option to run MISO on cluster."
        sys.exit(1)
    if (not options.use_cluster) and options.SGEarray:
        print "Error: SGEarray implies that you are using an SGE cluster, " \
              "please run again with the --use-cluster option enabled."
        sys.exit(1)

    ##
    ## Quantitation using BAM for all genes
    ##
    if options.compute_genes_psi != None:
        # GFF filename with genes to process
        gff_filename = \
            os.path.abspath(os.path.expanduser(options.compute_genes_psi[0]))
        # BAM filename with reads
        bam_filename = \
            os.path.abspath(os.path.expanduser(options.compute_genes_psi[1]))
        if options.output_dir == None:
            print "Error: need --output-dir to compute Psi values."
            sys.exit(1)
        # Output directory to use
        output_dir = os.path.abspath(os.path.expanduser(options.output_dir))
        ##
        ## Load the main logging object
        ##
        logs_output_dir = os.path.join(output_dir, "logs")
        main_logger = get_main_logger(logs_output_dir)
        if options.read_len == None:
            main_logger.error("need --read-len to compute Psi values.")
            sys.exit(1)
        overhang_len = 1
        if options.paired_end != None and options.overhang_len != None:
            main_logger.warning("cannot use --overhang-len in paired-end mode.\n"
                                "Using overhang = 1")
        if options.overhang_len != None:
            overhang_len = options.overhang_len
        # Whether to wait on cluster jobs or not
        wait_on_jobs = not options.no_wait
        compute_all_genes_psi(gff_filename, bam_filename,
                              options.read_len, output_dir,
                              main_logger,
                              overhang_len=overhang_len,
                              use_cluster=options.use_cluster,
                              SGEarray=options.SGEarray,
                              job_name=options.job_name,
                              chunk_jobs=options.chunk_jobs,
                              paired_end=options.paired_end,
                              settings_fname=settings_filename,
                              prefilter=options.prefilter,
                              num_proc=options.num_proc,
                              wait_on_jobs=wait_on_jobs)

    if options.view_gene != None:
        indexed_gene_filename = \
            os.path.abspath(os.path.expanduser(options.view_gene))
        print "Viewing genes in %s" %(indexed_gene_filename)
        gff_genes = gff_utils.load_indexed_gff_file(indexed_gene_filename)
        if gff_genes == None:
            print "No genes."
            sys.exit(1)
        for gene_id, gene_info in gff_genes.iteritems():
            print "Gene %s" %(gene_id)
            gene_obj = gene_info['gene_object']
            print " - Gene object: ", gene_obj
            print "=="
            print "Isoforms: "
            for isoform in gene_obj.isoforms:
                print " - ", isoform
            print "=="
            print "mRNA IDs: "
            for mRNA_id in gene_info['hierarchy'][gene_id]['mRNAs']:
                print "%s" %(mRNA_id)
            print "=="
            print "Exons: "
            for exon in gene_obj.parts:
                print " - ", exon
def compute_psi(sample_filenames, output_dir, event_type,
                read_len, overhang_len,
                use_cluster=False,
                chunk_jobs=False,
                filter_events=True,
                events_info_filename=None,
                settings_filename=None):
    """
    Compute Psi values for skipped exons.

    sample_filenames is a mapping from sample label to sample
    filename, e.g.:

      sample_filenames = {sample_label1: sample_filename1,
                          sample_label2: sample_filename2}

    - output_dir: output directory
    - event_type: 'SE', 'RI', etc.
    """
    misc_utils.make_dir(output_dir)
    output_dir = os.path.join(output_dir, event_type)
    output_dir = os.path.abspath(output_dir)
    misc_utils.make_dir(output_dir)

    print "Computing Psi for events of type %s" %(event_type)
    print " - samples used: ", sample_filenames.keys()

    for sample_label, sample_filename in sample_filenames.iteritems():
        print "Processing sample: label=%s, filename=%s" \
              %(sample_label, sample_filename)
        results_output_dir = os.path.join(output_dir, sample_label)
        misc_utils.make_dir(results_output_dir)

        # Load the set of counts and serialize them into JSON
        events = \
            as_events.load_event_counts(sample_filename,
                                        event_type,
                                        events_info_filename=events_info_filename)

        # Filter events
        if filter_events:
            print "Filtering events..."
            events.filter_events(settings=Settings.get())

        print "Running on a total of %d events." %(len(events.events))

        events_filename = events.output_file(results_output_dir,
                                             sample_label)

        # Run MISO on them
        miso_cmd = "python %s --compute-two-iso-psi %s %s --event-type %s " \
                   "--read-len %d --overhang-len %d " \
                   %(os.path.join(miso_path, 'run_miso.py'),
                     events_filename,
                     results_output_dir,
                     event_type,
                     read_len,
                     overhang_len)
        if use_cluster:
            if chunk_jobs:
                miso_cmd += ' --use-cluster --chunk-jobs %d' %(chunk_jobs)
            else:
                miso_cmd += ' --use-cluster'
        print "Executing: %s" %(miso_cmd)
        if use_cluster:
            print " - Using cluster"
        os.system(miso_cmd)
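# ---------------------------------------------------------------------------
# Illustrative call (hypothetical labels and paths): note that
# sample_filenames must be a dict, since compute_psi iterates over it with
# .iteritems().
# ---------------------------------------------------------------------------
def _demo_compute_psi():
    sample_filenames = {'control':   '/data/counts/control.counts',
                        'knockdown': '/data/counts/knockdown.counts'}
    compute_psi(sample_filenames, '/data/miso_out', 'SE',
                read_len=50, overhang_len=1)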
def __init__(self, gff_dir, bam_filename, output_dir,
             read_len, overhang_len,
             settings_fname=None,
             paired_end=None,
             use_cluster=False,
             chunk_jobs=200,
             SGEarray=False,
             sge_job_name="misojob",
             gene_ids=None,
             num_proc=None,
             wait_on_jobs=True):
    self.threads = {}
    self.gff_dir = gff_dir
    self.bam_filename = bam_filename
    # Check that the BAM filename exists and that it has an index
    if not os.path.isfile(self.bam_filename):
        print "Error: BAM file %s not found." %(self.bam_filename)
        sys.exit(1)
    self.bam_index_fname = "%s.bai" %(self.bam_filename)
    if not os.path.isfile(self.bam_index_fname):
        print "WARNING: Expected BAM index file %s not found." \
              %(self.bam_index_fname)
        print "Are you sure your BAM file is indexed?"
    self.output_dir = output_dir
    self.read_len = read_len
    # For now, set the overhang to 1 always
    #self.overhang_len = overhang_len
    self.overhang_len = 1
    self.settings_fname = settings_fname
    self.paired_end = paired_end
    self.use_cluster = use_cluster
    self.chunk_jobs = chunk_jobs
    self.settings = Settings.get()
    self.cluster_cmd = Settings.get_cluster_command()
    self.sge_job_name = sge_job_name
    self.wait_on_jobs = wait_on_jobs
    # If chunk_jobs is not given (i.e. set to False),
    # default it to an arbitrary value
    if not self.chunk_jobs:
        self.chunk_jobs = 200
    self.SGEarray = SGEarray
    self.num_processors = Settings.get_num_processors()
    if num_proc is not None:
        num_proc = int(num_proc)
        self.num_processors = num_proc
        print "Using %d processors" %(num_proc)
    self.long_thresh = 50
    self.batch_logs_dir = \
        os.path.join(output_dir, "batch-logs")
    self.batch_genes_dir = \
        os.path.join(output_dir, "batch-genes")
    self.cluster_scripts_dir = \
        os.path.join(output_dir, "cluster_scripts")
    self.scripts_output_dir = \
        os.path.join(output_dir, "scripts_output")
    misc_utils.make_dir(self.batch_logs_dir)
    misc_utils.make_dir(self.batch_genes_dir)
    misc_utils.make_dir(self.cluster_scripts_dir)
    misc_utils.make_dir(self.scripts_output_dir)
    # First compile the set of genes to run on and output them
    # to file along with their indexed filenames
    self.gene_ids_to_gff_index = \
        gff_utils.get_gene_ids_to_gff_index(gff_dir)
    # If we're given filtered gene IDs, use them
    if gene_ids is not None:
        self.gene_ids = gene_ids
    else:
        self.gene_ids = self.gene_ids_to_gff_index.keys()
    if len(self.gene_ids) == 0:
        print "Error: No genes to run on. Did you pass me the wrong path " \
              "to your indexed GFF directory? " \
              "Or perhaps your indexed GFF directory " \
              "is empty?"
        sys.exit(1)
    self.batch_filenames = self.output_batch_files()
def main():
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("--run", dest="compute_genes_psi",
                      nargs=2, default=None,
                      help="Compute Psi values for a given GFF annotation "
                      "of either whole mRNA isoforms or isoforms produced by "
                      "single alternative splicing events. Expects two "
                      "arguments: an indexed GFF directory with genes to "
                      "process, and a sorted, indexed BAM file (with "
                      "headers) to run on.")
    parser.add_option("--event-type", dest="event_type", nargs=1,
                      help="[OPTIONAL] Type of event (e.g. SE, RI, A3SS, ...)",
                      default=None)
    parser.add_option("--use-cluster", dest="use_cluster",
                      action="store_true", default=False,
                      help="Run events on cluster.")
    parser.add_option("--chunk-jobs", dest="chunk_jobs",
                      default=False, type="int",
                      help="Size (in number of events) of each job to chunk "
                      "events file into. Only applies when running on cluster.")
    parser.add_option("--no-filter-events", dest="no_filter_events",
                      action="store_true", default=False,
                      help="Do not filter events for computing Psi. "
                      "By default, MISO computes Psi only for events that "
                      "have a sufficient number of junction reads. "
                      "The default filter varies by event type.")
    parser.add_option("--settings-filename", dest="settings_filename",
                      default=os.path.join(miso_settings_path,
                                           "settings",
                                           "miso_settings.txt"),
                      help="Filename specifying MISO settings.")
    parser.add_option("--read-len", dest="read_len",
                      default=None, type="int",
                      help="Length of sequenced reads.")
    parser.add_option("--paired-end", dest="paired_end",
                      nargs=2, default=None,
                      help="Run in paired-end mode. Takes the mean and "
                      "standard deviation of the insert length distribution.")
    parser.add_option("--overhang-len", dest="overhang_len",
                      default=None, type="int",
                      help="Length of overhang constraints "
                      "imposed on junctions.")
    parser.add_option("--output-dir", dest="output_dir",
                      default=None,
                      help="Directory for MISO output.")
    parser.add_option("--job-name", dest="job_name", nargs=1,
                      help="Name for jobs submitted to queue for SGE jobs. "
                      "Default is misojob.",
                      default="misojob")
    parser.add_option("--SGEarray", dest="SGEarray",
                      action="store_true", default=False,
                      help="Use MISO on cluster with Sun Grid Engine. "
                      "To be used in conjunction with the --use-cluster option.")
    parser.add_option("--prefilter", dest="prefilter",
                      default=False, action="store_true",
                      help="Prefilter events based on coverage. If given as "
                      "argument, the run will begin by mapping BAM reads to "
                      "event regions (using bedtools), and omit events that "
                      "do not meet the coverage criteria from the run. Off by "
                      "default. Note that events that do not meet the coverage "
                      "criteria will not be processed regardless; --prefilter "
                      "simply does this filtering step at the start of the run, "
                      "potentially saving computation time so that low-coverage "
                      "events will not be processed or distributed to jobs if "
                      "MISO is run on a cluster. This option requires bedtools "
                      "to be installed and available on the path.")
    parser.add_option("-p", dest="num_proc",
                      default=None, nargs=1,
                      help="Number of processors to use. Only applies when "
                      "running MISO on a single machine with multiple cores; "
                      "does not apply to runs submitted to a cluster with "
                      "--use-cluster.")
    parser.add_option("--version", dest="version",
                      default=False, action="store_true",
                      help="Print MISO version.")
    parser.add_option("--no-wait", dest="no_wait",
                      default=False, action="store_true",
                      help="If passed in, do not wait on cluster jobs after "
                      "they are submitted. By default, wait.")
    ##
    ## Gene utilities
    ##
    parser.add_option("--view-gene", dest="view_gene",
                      nargs=1, default=None,
                      help="View the contents of a gene/event that has "
                      "been indexed. Takes as input an "
                      "indexed (.pickle) filename.")

    (options, args) = parser.parse_args()

    greeting()

    if options.version:
        print "MISO version %s\n" %(misopy.__version__)

    ##
    ## Load the settings file
    ##
    if not os.path.isdir(miso_settings_path):
        print "Error: %s is not a directory containing a default MISO " \
              "settings filename. Please specify a settings filename " \
              "using --settings-filename." %(miso_settings_path)
        return

    settings_filename = \
        os.path.abspath(os.path.expanduser(options.settings_filename))
    Settings.load(settings_filename)

    if (not options.use_cluster) and options.chunk_jobs:
        print "Error: Chunking jobs only applies when using " \
              "the --use-cluster option to run MISO on cluster."
        sys.exit(1)
    if (not options.use_cluster) and options.SGEarray:
        print "Error: SGEarray implies that you are using an SGE cluster, " \
              "please run again with the --use-cluster option enabled."
        sys.exit(1)

    ##
    ## Quantitation using BAM for all genes
    ##
    if options.compute_genes_psi != None:
        # GFF filename with genes to process
        gff_filename = \
            os.path.abspath(os.path.expanduser(options.compute_genes_psi[0]))
        # BAM filename with reads
        bam_filename = \
            os.path.abspath(os.path.expanduser(options.compute_genes_psi[1]))
        if options.output_dir == None:
            print "Error: need --output-dir to compute Psi values."
            sys.exit(1)
        # Output directory to use
        output_dir = os.path.abspath(os.path.expanduser(options.output_dir))
        if options.read_len == None:
            print "Error: need --read-len to compute Psi values."
            sys.exit(1)
        overhang_len = 1
        if options.paired_end != None and options.overhang_len != None:
            print "WARNING: cannot use --overhang-len in paired-end mode."
            print "Using overhang = 1"
        if options.overhang_len != None:
            overhang_len = options.overhang_len
        # Whether to wait on cluster jobs or not
        wait_on_jobs = not options.no_wait
        compute_all_genes_psi(gff_filename, bam_filename,
                              options.read_len, output_dir,
                              overhang_len=overhang_len,
                              use_cluster=options.use_cluster,
                              SGEarray=options.SGEarray,
                              job_name=options.job_name,
                              chunk_jobs=options.chunk_jobs,
                              paired_end=options.paired_end,
                              settings_fname=settings_filename,
                              prefilter=options.prefilter,
                              num_proc=options.num_proc,
                              wait_on_jobs=wait_on_jobs)

    if options.view_gene != None:
        indexed_gene_filename = \
            os.path.abspath(os.path.expanduser(options.view_gene))
        print "Viewing genes in %s" %(indexed_gene_filename)
        gff_genes = gff_utils.load_indexed_gff_file(indexed_gene_filename)
        if gff_genes == None:
            print "No genes."
            sys.exit(1)
        for gene_id, gene_info in gff_genes.iteritems():
            print "Gene %s" %(gene_id)
            gene_obj = gene_info['gene_object']
            print " - Gene object: ", gene_obj
            print "=="
            print "Isoforms: "
            for isoform in gene_obj.isoforms:
                print " - ", isoform
            print "=="
            print "mRNA IDs: "
            for mRNA_id in gene_info['hierarchy'][gene_id]['mRNAs']:
                print "%s" %(mRNA_id)
            print "=="
            print "Exons: "
            for exon in gene_obj.parts:
                print " - ", exon
def runMISOsingle(pickledDir, bamFile, readlen, overhanglen, outdir,
                  paired_end, settings_f, scratchDir):
    """
    Run MISO on a single BAM file, staging inputs in a scratch directory.

    Args:
        pickledDir (str/path): Directory containing the pickled MISO
            annotations. This database can be generated with MISO's
            index_gff script.
        bamFile (str/path): Path to a sorted, indexed BAM file.
            *Please note*: BAM files must not be trimmed. MISO cannot
            process mixed read lengths.
        readlen (int): Read length of bamFile.
        overhanglen (int): Number of nucleotides a read must overlap a
            splice junction by to be counted in subsequent analyses.
        outdir (str/path): Directory where the final MISO results will
            be stored.
        paired_end (bool): Paired-end mode. Since this wrapper does not
            support paired-end data, the flag defaults to False.
        settings_f (str/path): MISO settings file, which also lists the
            cluster flags used for job submission.
        scratchDir (str/path): Scratch directory where intermediate MISO
            output is written before being copied back to <outdir>.

    Returns:
        Nothing. Populates <outdir> with pickled MISO events and
        PSI values.
    """
    if paired_end == 'False':
        paired_end = None
    t = str(time.time()) + str(random.random())
    print os.path.basename(pickledDir)
    if not os.path.exists(scratchDir):
        cmd = 'mkdir ' + scratchDir
        process = subprocess.Popen(cmd, shell=True)
        process.wait()
    # Copy pickled dir.
    pickled = os.path.join(scratchDir,
                           os.path.basename(pickledDir) + "." + t)
    cmd = 'mkdir ' + pickled
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
    cmd = 'cp -r ' + pickledDir + '/* ' + pickled
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
    # Copy bam file.
    cmd = 'cp -fL ' + bamFile + ' ' + scratchDir
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
    cmd = 'cp -fL ' + bamFile + '.bai ' + scratchDir
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
    bam = os.path.join(scratchDir, os.path.basename(bamFile))
    # Give the output directory in scratch a timestamp
    out = os.path.join(scratchDir, os.path.basename(outdir + "." + t))
    # Load settings for MISO
    Settings.load(settings_f)
    run_events_analysis.compute_all_genes_psi(
        pickled, bam, int(readlen), out,
        overhang_len=int(overhanglen),
        paired_end=paired_end, settings_fname=settings_f,
        prefilter=False)
    # Summarize sample
    #summary_fname = os.path.join(out, os.path.basename(outdir) + '.miso_summary')
    #samples_utils.summarize_sampler_results(out, summary_fname)
    if not os.path.exists(outdir):
        cmd = 'mkdir -p ' + outdir
        process = subprocess.Popen(cmd, shell=True)
        process.wait()
    # Copy output back.
    cmd = 'cp -r ' + out + '/* ' + outdir
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
    # Remove bam, output, and pickled dir.
    cmd = 'rm ' + bam
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
    cmd = 'rm ' + bam + '.bai'
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
    cmd = 'rm -fr ' + out
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
    cmd = 'rm -fr ' + pickled
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
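# ---------------------------------------------------------------------------
# A minimal alternative sketch (not the original implementation): the
# staging steps above shell out to mkdir/cp via subprocess; the same staging
# can be done with shutil, which avoids shell quoting issues for paths
# containing spaces. The helper name and paths are hypothetical.
# ---------------------------------------------------------------------------
import shutil

def _stage_inputs(pickledDir, bamFile, scratchDir, tag):
    if not os.path.isdir(scratchDir):
        os.makedirs(scratchDir)
    pickled = os.path.join(scratchDir,
                           os.path.basename(pickledDir) + "." + tag)
    shutil.copytree(pickledDir, pickled)       # replaces mkdir + 'cp -r'
    shutil.copy(bamFile, scratchDir)           # follows symlinks, like 'cp -fL'
    shutil.copy(bamFile + '.bai', scratchDir)
    bam = os.path.join(scratchDir, os.path.basename(bamFile))
    return pickled, bam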
def run(self, delay_constant=0.9):
    """
    Run batches either locally on multiple cores or on a cluster.
    """
    batch_filenames = self.output_batch_files()
    # All MISO commands, each corresponding to a batch,
    # and the number of jobs in each batch
    all_miso_cmds = []
    num_batches = len(batch_filenames)
    ##
    ## Prepare all the files necessary to run each batch
    ##
    print "Preparing to run %d batches of jobs..." %(num_batches)
    miso_run = os.path.join(miso_path, "run_miso.py")
    for batch_num, batch in enumerate(batch_filenames):
        batch_filename, batch_size = batch
        miso_cmd = \
            "python %s --compute-genes-from-file \"%s\" %s %s --read-len %d " \
            %(miso_run,
              batch_filename,
              self.bam_filename,
              self.output_dir,
              self.read_len)
        # Add paired-end parameters and read len/overhang len
        if self.paired_end != None:
            # Run in paired-end mode
            frag_mean = float(self.paired_end[0])
            frag_sd = float(self.paired_end[1])
            miso_cmd += " --paired-end %.1f %.1f" %(frag_mean, frag_sd)
        else:
            # Overhang len is only used in single-end mode
            miso_cmd += " --overhang-len %d" %(self.overhang_len)
        # Add settings filename if given
        if self.settings_fname != None:
            miso_cmd += " --settings-filename %s" \
                        %(self.settings_fname)
        all_miso_cmds.append((miso_cmd, batch_size))
    ##
    ## Run all MISO commands for the batches,
    ## either locally using multiple cores or on a cluster
    ##
    # First handle the special case of SGE cluster submission
    if self.use_cluster and self.SGEarray:
        print "Using SGEarray..."
        # Call SGE
        batch_argfile = os.path.join(self.cluster_scripts_dir,
                                     "run_args.txt")
        cluster_utils.run_SGEarray_cluster(all_miso_cmds,
                                           batch_argfile,
                                           self.output_dir,
                                           settings=self.settings_fname,
                                           job_name=self.sge_job_name,
                                           chunk=self.chunk_jobs)
        # End SGE case
        return
    # All cluster jobs
    cluster_jobs = []
    for batch_num, cmd_info in enumerate(all_miso_cmds):
        miso_cmd, batch_size = cmd_info
        print "Running batch of %d genes..." %(batch_size)
        print " - Executing: %s" %(miso_cmd)
        # Make a log file for the batch, where all the output
        # will be redirected
        time_str = time.strftime("%m-%d-%y_%H:%M:%S")
        batch_logfile = os.path.join(self.batch_logs_dir,
                                     "batch-%d-%s.log" %(batch_num, time_str))
        cmd_to_run = "%s >> \"%s\";" %(miso_cmd, batch_logfile)
        if not self.use_cluster:
            # Run locally
            p = subprocess.Popen(cmd_to_run, shell=True)
            thread_id = "batch-%d" %(batch_num)
            print " - Submitted thread %s" %(thread_id)
            self.threads[thread_id] = p
        else:
            # Set up the cluster engine
            Settings.load(self.settings_fname)
            clustercmd = Settings.get_cluster_command()
            self.cluster_engine = getClusterEngine(clustercmd,
                                                   self.settings_fname)
            # Pick the queue based on the batch size
            if batch_size >= self.long_thresh:
                queue_type = "long"
            else:
                queue_type = "short"
            # Run on cluster
            job_name = "gene_psi_batch_%d" %(batch_num)
            print "Submitting to cluster: %s" %(cmd_to_run)
            job_id = \
                self.cluster_engine.run_on_cluster(cmd_to_run,
                                                   job_name,
                                                   self.output_dir,
                                                   queue_type=queue_type)
            if job_id is not None:
                cluster_jobs.append(job_id)
            time.sleep(delay_constant)
    # Extra delay constant
    time.sleep(delay_constant)
    # If we ran jobs on the cluster, wait for them if there are
    # any to wait on.
    if self.wait_on_jobs:
        if self.use_cluster and (len(cluster_jobs) == 0):
            # If we're asked to use the cluster but the list
            # of cluster jobs is empty, it means we could not
            # find the IDs of the jobs from the submission
            # system. Report this to the user.
            self.main_logger.warning("Asked to wait on cluster jobs but cannot "
                                     "parse their job IDs from the cluster "
                                     "submission system.")
        # Try to wait on jobs no matter what; though if 'cluster_jobs'
        # is empty here, it will not wait. (Guarded so that purely
        # local runs, which never create a cluster engine, do not
        # fail here.)
        if self.use_cluster:
            self.cluster_engine.wait_on_jobs(cluster_jobs,
                                             self.cluster_cmd)
    else:
        if self.use_cluster:
            # If we're running in cluster mode and asked not
            # to wait for jobs, let the user know
            self.main_logger.info("Not waiting on cluster jobs.")
    # If we ran jobs locally, wait for them to finish
    # (this does nothing if jobs were submitted to the cluster)
    self.wait_on_threads()
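# ---------------------------------------------------------------------------
# A minimal sketch (an assumption, not the original source) of the
# wait_on_threads() helper that run() calls above. It assumes self.threads
# maps thread IDs to the subprocess.Popen handles stored by run(), and it
# would live on the same class as run() and __init__ above.
# ---------------------------------------------------------------------------
def wait_on_threads(self):
    for thread_id, proc in self.threads.items():
        print "Waiting on thread %s" %(thread_id)
        retcode = proc.wait()
        if retcode != 0:
            print "WARNING: thread %s exited with code %s" %(thread_id,
                                                             retcode)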
def compute_gene_psi(gene_ids, gff_index_filename, bam_filename,
                     output_dir, read_len, overhang_len,
                     paired_end=None,
                     event_type=None,
                     verbose=True):
    """
    Run Psi at the gene level (for multi-isoform inference).

    Arguments:

    - Set of gene IDs corresponding to gene IDs from the GFF
    - Indexed GFF filename describing the genes
    - BAM filename with the reads (must be sorted and indexed)
    - Output directory
    - Optional: run in paired-end mode. Takes the mean and standard
      deviation of the fragment length distribution.
    """
    misc_utils.make_dir(output_dir)

    if not os.path.exists(gff_index_filename):
        print "Error: No GFF %s" %(gff_index_filename)
        return

    num_genes = len(gene_ids)

    print "Computing Psi for %d genes..." %(num_genes)
    print " - " + ", ".join(gene_ids)
    print " - GFF filename: %s" %(gff_index_filename)
    print " - BAM: %s" %(bam_filename)
    print " - Outputting to: %s" %(output_dir)

    if paired_end:
        print " - Paired-end mode: ", paired_end

    settings = Settings.get()
    settings_params = Settings.get_sampler_params()
    burn_in = settings_params["burn_in"]
    lag = settings_params["lag"]
    num_iters = settings_params["num_iters"]
    num_chains = settings_params["num_chains"]

    min_event_reads = Settings.get_min_event_reads()
    strand_rule = Settings.get_strand_param()

    mean_frag_len = None
    frag_variance = None

    if paired_end:
        mean_frag_len = int(paired_end[0])
        frag_variance = power(int(paired_end[1]), 2)

    # Load the genes from the GFF
    gff_genes = gff_utils.load_indexed_gff_file(gff_index_filename)

    # If given a template for the SAM file, use it
    template = None

    if settings and "sam_template" in settings:
        template = settings["sam_template"]

    if "filter_reads" not in settings:
        filter_reads = True
    else:
        filter_reads = settings["filter_reads"]

    # Load the BAM file upfront
    bamfile = sam_utils.load_bam_reads(bam_filename, template=template)

    # Check if we're in compressed mode
    compressed_mode = misc_utils.is_compressed_index(gff_index_filename)

    for gene_id, gene_info in gff_genes.iteritems():
        lookup_id = gene_id
        # Skip genes that we were not asked to run on
        if lookup_id not in gene_ids:
            continue
        gene_obj = gene_info['gene_object']
        gene_hierarchy = gene_info['hierarchy']

        # Sanity check: if the isoforms are all shorter than the read,
        # skip the event
        if all(map(lambda l: l < read_len, gene_obj.iso_lens)):
            print "All isoforms of %s shorter than %d, so skipping" \
                  %(gene_id, read_len)
            continue

        # Find the most inclusive transcription start and end sites
        # for each gene
        tx_start, tx_end = \
            gff_utils.get_inclusive_txn_bounds(gene_info['hierarchy'][gene_id])

        # Fetch reads aligning to the gene boundaries
        gene_reads = \
            sam_utils.fetch_bam_reads_in_gene(bamfile, gene_obj.chrom,
                                              tx_start, tx_end,
                                              gene_obj)

        # Parse reads: check strandedness and, for paired-end
        # data, pair the reads
        reads, num_raw_reads = \
            sam_utils.sam_parse_reads(gene_reads,
                                      paired_end=paired_end,
                                      strand_rule=strand_rule,
                                      target_strand=gene_obj.strand)

        # Skip gene if none of the reads align to gene boundaries
        if filter_reads:
            if num_raw_reads < min_event_reads:
                print "Only %d reads in gene, skipping (needed >= %d reads)" \
                      %(num_raw_reads, min_event_reads)
                continue
            else:
                print "%d raw reads in event" %(num_raw_reads)

        num_isoforms = len(gene_obj.isoforms)
        hyperparameters = ones(num_isoforms)

        ##
        ## Run the sampler
        ##
        # Create the sampler with the right parameters depending on whether
        # this is a paired-end or single-end data set.
        if paired_end:
            # Sampler parameters for paired-end mode
            sampler_params = \
                miso.get_paired_end_sampler_params(num_isoforms,
                                                   mean_frag_len,
                                                   frag_variance,
                                                   read_len,
                                                   overhang_len=overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=True,
                                       log_dir=output_dir)
        else:
            # Sampler parameters for single-end mode
            sampler_params = miso.get_single_end_sampler_params(num_isoforms,
                                                                read_len,
                                                                overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=False,
                                       log_dir=output_dir)

        # Make directory for the chromosome -- if given an event type,
        # put the gene in the event type directory
        if event_type != None:
            chrom_dir = os.path.join(output_dir, event_type, gene_obj.chrom)
        else:
            chrom_dir = os.path.join(output_dir, gene_obj.chrom)

        try:
            os.makedirs(chrom_dir)
        except OSError:
            pass

        # Pick the .miso output filename based on the pickle filename
        miso_basename = os.path.basename(gff_index_filename)
        if not miso_basename.endswith(".pickle"):
            print "Error: Invalid index file %s" %(gff_index_filename)
            sys.exit(1)
        miso_basename = miso_basename.replace(".pickle", "")
        output_filename = os.path.join(chrom_dir, "%s" %(miso_basename))

        sampler.run_sampler(num_iters, reads, gene_obj,
                            hyperparameters, sampler_params,
                            output_filename,
                            num_chains=num_chains,
                            burn_in=burn_in,
                            lag=lag)
def runMISOsingle(pickledDir, bamFile, readlen, overhanglen, outdir,
                  paired_end, settings_f, scratchDir):
    if paired_end == 'False':
        paired_end = None
    t = str(time.time()) + str(random.random())
    print os.path.basename(pickledDir)
    if not os.path.exists(scratchDir):
        cmd = 'mkdir ' + scratchDir
        process = subprocess.Popen(cmd, shell=True)
        process.wait()
    # Copy pickled dir.
    pickled = os.path.join(scratchDir,
                           os.path.basename(pickledDir) + "." + t)
    cmd = 'mkdir ' + pickled
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
    cmd = 'cp -r ' + pickledDir + '/* ' + pickled
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
    # Copy bam file.
    cmd = 'cp -fL ' + bamFile + ' ' + scratchDir
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
    cmd = 'cp -fL ' + bamFile + '.bai ' + scratchDir
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
    bam = os.path.join(scratchDir, os.path.basename(bamFile))
    # Give the output directory in scratch a timestamp
    out = os.path.join(scratchDir, os.path.basename(outdir + "." + t))
    # Load settings for MISO
    Settings.load(settings_f)
    run_events_analysis.compute_all_genes_psi(
        pickled, bam, int(readlen), out,
        overhang_len=int(overhanglen),
        paired_end=paired_end, settings_fname=settings_f,
        prefilter=True)
    # Summarize sample
    #summary_fname = os.path.join(out, os.path.basename(outdir) + '.miso_summary')
    #samples_utils.summarize_sampler_results(out, summary_fname)
    if not os.path.exists(outdir):
        cmd = 'mkdir -p ' + outdir
        process = subprocess.Popen(cmd, shell=True)
        process.wait()
    # Copy output back.
    cmd = 'cp -r ' + out + '/* ' + outdir
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
    # Remove bam, output, and pickled dir.
    cmd = 'rm ' + bam
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
    cmd = 'rm ' + bam + '.bai'
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
    cmd = 'rm -fr ' + out
    process = subprocess.Popen(cmd, shell=True)
    process.wait()
    cmd = 'rm -fr ' + pickled
    process = subprocess.Popen(cmd, shell=True)
    process.wait()