def get_ids_passing_filter(gff_index_dir, bam_filename, output_dir):
    """
    Apply the read-count prefilter to events and return only the IDs
    of the events that meet the filter.

    Coverage of the entries of the index's 'genes.gff' file by the reads
    of the BAM file is obtained from exon_utils.get_bam_gff_coverage().
    An event is kept when its read count is at least
    Settings.get_min_event_reads().

    Arguments:

    - gff_index_dir: indexed GFF directory, expected to contain 'genes.gff'
    - bam_filename: BAM filename with the reads
    - output_dir: directory handed to the coverage computation

    Returns a list of event IDs (in coverage-file order), or None when
    'genes.gff' is missing and the prefilter stage is skipped.
    """
    min_event_reads = Settings.get_min_event_reads()

    # Check that this was indexed with a version that outputs
    # genes.gff file
    genes_gff_fname = os.path.join(gff_index_dir, "genes.gff")
    if not os.path.isfile(genes_gff_fname):
        print("WARNING: Could not find \'genes.gff\' in %s - "
              "skipping prefilter stage. Please reindex your "
              "GFF file with the latest version to enable "
              "prefiltering." % (gff_index_dir))
        return None

    print("Prefiltering reads...")
    coverage_fname = exon_utils.get_bam_gff_coverage(bam_filename,
                                                     genes_gff_fname,
                                                     output_dir)
    ids_passing_filter = []
    with open(coverage_fname) as coverage_in:
        for line in coverage_in:
            # Skip comments
            if line.startswith("#"):
                continue
            stripped = line.strip()
            # Skip blank lines (e.g. a trailing newline at end of file);
            # they have no fields to index into
            if not stripped:
                continue
            fields = stripped.split("\t")
            # The read count is the 10th tab-separated field; the
            # GFF attributes (carrying the event ID) are the 9th
            counts = int(fields[9])
            if counts < min_event_reads:
                continue
            attribs = gff_utils.parse_gff_attribs(fields[8])
            if "ID" not in attribs:
                print("WARNING: No ID= found for line:\n%s\nSkipping..."
                      % (line))
                continue
            ids_passing_filter.append(attribs["ID"])
    return ids_passing_filter
def compute_gene_psi(gene_ids, gff_index_filename, bam_filename, output_dir,
                     read_len, overhang_len, paired_end=None, event_type=None,
                     verbose=True):
    """
    Run Psi at the Gene-level (for multi-isoform inference.)

    Arguments:

    - gene_ids: set of gene IDs corresponding to gene IDs from the GFF
    - gff_index_filename: indexed GFF filename describing the genes
      (its basename must end in '.pickle')
    - bam_filename: BAM filename with the reads (must be sorted and indexed)
    - output_dir: output directory
    - read_len: read length; genes whose isoforms are all shorter than
      this are skipped
    - overhang_len: overhang length handed to the sampler parameters
    - paired_end: optional; run in paired-end mode. Gives mean and standard
      deviation of fragment length distribution.
    - event_type: optional; when given, results go to
      output_dir/event_type/<chrom> instead of output_dir/<chrom>
    - verbose: not used by this function

    Returns None. Calls sys.exit(1) if the index filename does not end
    in '.pickle' once a gene is about to be sampled.
    """
    misc_utils.make_dir(output_dir)

    if not os.path.exists(gff_index_filename):
        print("Error: No GFF %s" % (gff_index_filename))
        return

    num_genes = len(gene_ids)

    print("Computing Psi for %d genes..." % (num_genes))
    print(" - " + ", ".join(gene_ids))
    print(" - GFF filename: %s" % (gff_index_filename))
    print(" - BAM: %s" % (bam_filename))
    print(" - Outputting to: %s" % (output_dir))

    if paired_end:
        print(" - Paired-end mode:  %s" % (paired_end,))

    settings = Settings.get()
    settings_params = Settings.get_sampler_params()
    burn_in = settings_params["burn_in"]
    lag = settings_params["lag"]
    num_iters = settings_params["num_iters"]
    num_chains = settings_params["num_chains"]

    min_event_reads = Settings.get_min_event_reads()
    strand_rule = Settings.get_strand_param()

    mean_frag_len = None
    frag_variance = None

    if paired_end:
        mean_frag_len = int(paired_end[0])
        # paired_end[1] is a standard deviation; the sampler takes variance
        frag_variance = power(int(paired_end[1]), 2)

    # Load the genes from the GFF
    gff_genes = gff_utils.load_indexed_gff_file(gff_index_filename)

    # If given a template for the SAM file, use it. Both lookups are
    # guarded so that missing/empty settings fall back to the defaults
    # instead of failing on the membership test.
    template = None
    filter_reads = True
    if settings:
        if "sam_template" in settings:
            template = settings["sam_template"]
        if "filter_reads" in settings:
            filter_reads = settings["filter_reads"]

    # Load the BAM file upfront
    bamfile = sam_utils.load_bam_reads(bam_filename,
                                       template=template)

    for gene_id, gene_info in gff_genes.iteritems():
        # Skip genes that we were not asked to run on
        if gene_id not in gene_ids:
            continue
        gene_obj = gene_info['gene_object']

        # Sanity check: if the isoforms are all shorter than the read,
        # skip the event
        if all(iso_len < read_len for iso_len in gene_obj.iso_lens):
            print("All isoforms of %s shorter than %d, so skipping"
                  % (gene_id, read_len))
            continue

        # Find the most inclusive transcription start and end sites
        # for each gene
        tx_start, tx_end = \
            gff_utils.get_inclusive_txn_bounds(gene_info['hierarchy'][gene_id])

        # Fetch reads aligning to the gene boundaries
        gene_reads = \
            sam_utils.fetch_bam_reads_in_gene(bamfile,
                                              gene_obj.chrom,
                                              tx_start,
                                              tx_end,
                                              gene_obj)
        # Parse reads: checking strandedness and pairing
        # reads in case of paired-end data
        reads, num_raw_reads = \
            sam_utils.sam_parse_reads(gene_reads,
                                      paired_end=paired_end,
                                      strand_rule=strand_rule,
                                      target_strand=gene_obj.strand,
                                      given_read_len=read_len)

        # Skip gene if too few of the reads align to gene boundaries
        if filter_reads:
            if num_raw_reads < min_event_reads:
                print("Only %d reads in gene, skipping (needed >= %d reads)"
                      % (num_raw_reads, min_event_reads))
                continue
            print("%d raw reads in event" % (num_raw_reads))

        num_isoforms = len(gene_obj.isoforms)
        hyperparameters = ones(num_isoforms)

        ##
        ## Run the sampler
        ##
        # Create the sampler with the right parameters depending on whether
        # this is a paired-end or single-end data set.
        if paired_end:
            # Sampler parameters for paired-end mode
            sampler_params = \
                miso.get_paired_end_sampler_params(num_isoforms,
                                                   mean_frag_len,
                                                   frag_variance,
                                                   read_len,
                                                   overhang_len=overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=True,
                                       log_dir=output_dir)
        else:
            # Sampler parameters for single-end mode
            sampler_params = miso.get_single_end_sampler_params(num_isoforms,
                                                                read_len,
                                                                overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=False,
                                       log_dir=output_dir)

        # Make directory for chromosome -- if given an event type, put
        # the gene in the event type directory
        if event_type is not None:
            chrom_dir = os.path.join(output_dir, event_type, gene_obj.chrom)
        else:
            chrom_dir = os.path.join(output_dir, gene_obj.chrom)

        try:
            os.makedirs(chrom_dir)
        except OSError:
            # An already-existing directory is expected; anything else
            # (permissions, a file in the way, ...) is a real failure
            if not os.path.isdir(chrom_dir):
                raise

        # Pick .miso output filename based on the pickle filename
        miso_basename = os.path.basename(gff_index_filename)
        if not miso_basename.endswith(".pickle"):
            print("Error: Invalid index file %s" % (gff_index_filename))
            sys.exit(1)
        # Drop only the trailing extension; a '.pickle' occurring earlier
        # in the name is part of the name and must be kept
        miso_basename = miso_basename[:-len(".pickle")]
        output_filename = os.path.join(chrom_dir, miso_basename)
        sampler.run_sampler(num_iters, reads, gene_obj, hyperparameters,
                            sampler_params, output_filename,
                            num_chains=num_chains,
                            burn_in=burn_in,
                            lag=lag)
def __init__(self, gff_dir, bam_filename, output_dir, read_len, overhang_len,
             main_logger, settings_fname=None, paired_end=None,
             use_cluster=False, chunk_jobs=200, SGEarray=False,
             sge_job_name="misojob", gene_ids=None, num_proc=None,
             wait_on_jobs=True):
    """
    Set up a run: validate the BAM file, record the run options,
    create the working directories and write out the batch files.

    Arguments:

    - gff_dir: indexed GFF directory
    - bam_filename: BAM filename; the process exits with status 1 if
      it does not exist
    - output_dir: directory under which the 'batch-logs', 'batch-genes',
      'cluster_scripts' and 'scripts_output' directories are made
    - read_len: read length
    - overhang_len: accepted but ignored; the overhang is always set to 1
    - main_logger: logger used for info, warning and error messages
    - settings_fname, paired_end, use_cluster, SGEarray, sge_job_name,
      wait_on_jobs: stored on the instance unchanged
    - chunk_jobs: stored on the instance; a false value is replaced by 200
    - gene_ids: optional gene IDs to run on; defaults to every gene ID
      found in the GFF index. The process exits with status 1 if the
      resulting collection is empty.
    - num_proc: optional number of processors, overriding the value
      taken from the settings
    """
    self.main_logger = main_logger
    self.threads = {}
    self.gff_dir = gff_dir
    self.bam_filename = bam_filename

    # The BAM file itself is required; a missing index only
    # produces warnings
    if not os.path.isfile(bam_filename):
        main_logger.error("BAM file %s not found." % (bam_filename))
        sys.exit(1)
    bam_index = "%s.bai" % (bam_filename)
    self.bam_index_fname = bam_index
    if not os.path.isfile(bam_index):
        main_logger.warning("Expected BAM index file %s not found."
                            % (bam_index))
        main_logger.warning("Are you sure your BAM file is indexed?")

    self.output_dir = output_dir
    self.read_len = read_len
    # For now the overhang is fixed at 1 regardless of overhang_len
    self.overhang_len = 1
    self.settings_fname = settings_fname
    self.paired_end = paired_end
    self.use_cluster = use_cluster
    # A chunk_jobs that was not given (i.e. is False) falls back
    # to an arbitrary value
    self.chunk_jobs = chunk_jobs if chunk_jobs else 200
    self.settings = Settings.get()
    self.cluster_cmd = Settings.get_cluster_command()
    self.sge_job_name = sge_job_name
    self.wait_on_jobs = wait_on_jobs
    self.SGEarray = SGEarray

    # Processor count comes from the settings unless given explicitly
    self.num_processors = Settings.get_num_processors()
    if num_proc is not None:
        requested_procs = int(num_proc)
        self.num_processors = requested_procs
        main_logger.info("Using %d processors" % (requested_procs))
    self.long_thresh = 50

    # Working directories, all directly below the output directory
    self.batch_logs_dir = os.path.join(output_dir, "batch-logs")
    self.batch_genes_dir = os.path.join(output_dir, "batch-genes")
    self.cluster_scripts_dir = os.path.join(output_dir, "cluster_scripts")
    self.scripts_output_dir = os.path.join(output_dir, "scripts_output")
    for work_dir in (self.batch_logs_dir,
                     self.batch_genes_dir,
                     self.cluster_scripts_dir,
                     self.scripts_output_dir):
        misc_utils.make_dir(work_dir)

    # Map every gene in the index to its indexed filename, then
    # settle on the genes to run on: the filtered IDs when given,
    # all indexed genes otherwise
    self.gene_ids_to_gff_index = \
        gff_utils.get_gene_ids_to_gff_index(gff_dir)
    if gene_ids is None:
        self.gene_ids = self.gene_ids_to_gff_index.keys()
    else:
        self.gene_ids = gene_ids
    if len(self.gene_ids) == 0:
        main_logger.error("No genes to run on. Did you pass me the wrong path "
                          "to your index GFF directory? "
                          "Or perhaps your indexed GFF directory "
                          "is empty?")
        sys.exit(1)
    self.batch_filenames = self.output_batch_files()
def compute_psi(sample_filenames, output_dir, event_type, read_len,
                overhang_len, use_cluster=False, chunk_jobs=False,
                filter_events=True, events_info_filename=None,
                settings_filename=None):
    """
    Compute Psi values for skipped exons.

    For every sample, the event counts are loaded, optionally filtered,
    written to an events file, and run_miso.py is launched on that file
    as a child process.

    Arguments:

    - sample_filenames: mapping from sample label to sample filename,
      e.g. {sample_label1: sample_filename1,
            sample_label2: sample_filename2}
    - output_dir: output directory; results are placed in
      output_dir/event_type/sample_label
    - event_type: 'SE', 'RI', etc.
    - read_len: read length (integer)
    - overhang_len: overhang length (integer)
    - use_cluster: if true, pass --use-cluster to run_miso.py
    - chunk_jobs: if true (non-zero) and use_cluster is set, passed on
      as --chunk-jobs
    - filter_events: if true, filter the events with the current settings
    - events_info_filename: passed on to as_events.load_event_counts
    - settings_filename: not used by this function
    """
    # Local import: only this function needs subprocess
    import subprocess

    misc_utils.make_dir(output_dir)
    output_dir = os.path.join(output_dir, event_type)
    output_dir = os.path.abspath(output_dir)
    misc_utils.make_dir(output_dir)

    print("Computing Psi for events of type %s" % (event_type))
    print(" - samples used:  %s" % (sample_filenames.keys(),))

    for sample_label, sample_filename in sample_filenames.iteritems():
        print("Processing sample: label=%s, filename=%s"
              % (sample_label, sample_filename))
        results_output_dir = os.path.join(output_dir, sample_label)
        misc_utils.make_dir(results_output_dir)

        # Load the set of counts and serialize them into JSON
        events = \
            as_events.load_event_counts(sample_filename, event_type,
                                        events_info_filename=events_info_filename)

        # Filter events
        if filter_events:
            print("Filtering events...")
            events.filter_events(settings=Settings.get())

        print("Running on a total of %d events." % (len(events.events)))

        events_filename = events.output_file(results_output_dir,
                                             sample_label)

        # Run MISO on them. The command is kept as an argument list and
        # run without a shell, so that paths containing spaces or shell
        # metacharacters reach run_miso.py intact.
        miso_cmd = ["python",
                    os.path.join(miso_path, 'run_miso.py'),
                    "--compute-two-iso-psi",
                    "%s" % (events_filename,),
                    results_output_dir,
                    "--event-type", "%s" % (event_type,),
                    "--read-len", "%d" % (read_len),
                    "--overhang-len", "%d" % (overhang_len)]
        if use_cluster:
            miso_cmd.append("--use-cluster")
            if chunk_jobs:
                miso_cmd.extend(["--chunk-jobs", "%d" % (chunk_jobs)])
        print("Executing: %s" % (" ".join(miso_cmd)))
        if use_cluster:
            print(" - Using cluster")
        exit_status = subprocess.call(miso_cmd)
        if exit_status != 0:
            # Keep going with the remaining samples, but do not let a
            # failed run pass unnoticed
            print("WARNING: MISO run for sample %s exited with status %d"
                  % (sample_label, exit_status))
def __init__(self, gff_dir, bam_filename, output_dir, read_len, overhang_len,
             settings_fname=None, paired_end=None, use_cluster=False,
             chunk_jobs=200, SGEarray=False, sge_job_name="misojob",
             gene_ids=None, num_proc=None, wait_on_jobs=True):
    """
    Set up a run: validate the BAM file, record the run options,
    create the working directories and write out the batch files.

    Arguments:

    - gff_dir: indexed GFF directory
    - bam_filename: BAM filename; the process exits with status 1 if
      it does not exist
    - output_dir: directory under which the 'batch-logs', 'batch-genes',
      'cluster_scripts' and 'scripts_output' directories are made
    - read_len: read length
    - overhang_len: accepted but ignored; the overhang is always set to 1
    - settings_fname, paired_end, use_cluster, SGEarray, sge_job_name,
      wait_on_jobs: stored on the instance unchanged
    - chunk_jobs: stored on the instance; a false value is replaced by 200
    - gene_ids: optional gene IDs to run on; defaults to every gene ID
      found in the GFF index. The process exits with status 1 if the
      resulting collection is empty.
    - num_proc: optional number of processors, overriding the value
      taken from the settings
    """
    self.threads = {}
    self.gff_dir = gff_dir
    self.bam_filename = bam_filename

    # The BAM file itself is required; a missing index only
    # produces warnings
    if not os.path.isfile(bam_filename):
        print("Error: BAM file %s not found." % (bam_filename))
        sys.exit(1)
    bam_index = "%s.bai" % (bam_filename)
    self.bam_index_fname = bam_index
    if not os.path.isfile(bam_index):
        print("WARNING: Expected BAM index file %s not found."
              % (bam_index))
        print("Are you sure your BAM file is indexed?")

    self.output_dir = output_dir
    self.read_len = read_len
    # For now the overhang is fixed at 1 regardless of overhang_len
    self.overhang_len = 1
    self.settings_fname = settings_fname
    self.paired_end = paired_end
    self.use_cluster = use_cluster
    # A chunk_jobs that was not given (i.e. is False) falls back
    # to an arbitrary value
    self.chunk_jobs = chunk_jobs if chunk_jobs else 200
    self.settings = Settings.get()
    self.cluster_cmd = Settings.get_cluster_command()
    self.sge_job_name = sge_job_name
    self.wait_on_jobs = wait_on_jobs
    self.SGEarray = SGEarray

    # Processor count comes from the settings unless given explicitly
    self.num_processors = Settings.get_num_processors()
    if num_proc is not None:
        requested_procs = int(num_proc)
        self.num_processors = requested_procs
        print("Using %d processors" % (requested_procs))
    self.long_thresh = 50

    # Working directories, all directly below the output directory
    self.batch_logs_dir = os.path.join(output_dir, "batch-logs")
    self.batch_genes_dir = os.path.join(output_dir, "batch-genes")
    self.cluster_scripts_dir = os.path.join(output_dir, "cluster_scripts")
    self.scripts_output_dir = os.path.join(output_dir, "scripts_output")
    for work_dir in (self.batch_logs_dir,
                     self.batch_genes_dir,
                     self.cluster_scripts_dir,
                     self.scripts_output_dir):
        misc_utils.make_dir(work_dir)

    # Map every gene in the index to its indexed filename, then
    # settle on the genes to run on: the filtered IDs when given,
    # all indexed genes otherwise
    self.gene_ids_to_gff_index = \
        gff_utils.get_gene_ids_to_gff_index(gff_dir)
    if gene_ids is None:
        self.gene_ids = self.gene_ids_to_gff_index.keys()
    else:
        self.gene_ids = gene_ids
    if len(self.gene_ids) == 0:
        print("Error: No genes to run on. Did you pass me the wrong path "
              "to your index GFF directory? "
              "Or perhaps your indexed GFF directory "
              "is empty?")
        sys.exit(1)
    self.batch_filenames = self.output_batch_files()
def compute_psi(sample_filenames, output_dir, event_type, read_len,
                overhang_len, use_cluster=False, chunk_jobs=False,
                filter_events=True, events_info_filename=None,
                settings_filename=None):
    """
    Compute Psi values for skipped exons.

    For every sample, the event counts are loaded, optionally filtered,
    written to an events file, and run_miso.py is launched on that file
    as a child process.

    Arguments:

    - sample_filenames: mapping from sample label to sample filename,
      e.g. {sample_label1: sample_filename1,
            sample_label2: sample_filename2}
    - output_dir: output directory; results are placed in
      output_dir/event_type/sample_label
    - event_type: 'SE', 'RI', etc.
    - read_len: read length (integer)
    - overhang_len: overhang length (integer)
    - use_cluster: if true, pass --use-cluster to run_miso.py
    - chunk_jobs: if true (non-zero) and use_cluster is set, passed on
      as --chunk-jobs
    - filter_events: if true, filter the events with the current settings
    - events_info_filename: passed on to as_events.load_event_counts
    - settings_filename: not used by this function
    """
    # Local import: only this function needs subprocess
    import subprocess

    misc_utils.make_dir(output_dir)
    output_dir = os.path.join(output_dir, event_type)
    output_dir = os.path.abspath(output_dir)
    misc_utils.make_dir(output_dir)

    print("Computing Psi for events of type %s" % (event_type))
    print(" - samples used:  %s" % (sample_filenames.keys(),))

    for sample_label, sample_filename in sample_filenames.iteritems():
        print("Processing sample: label=%s, filename=%s"
              % (sample_label, sample_filename))
        results_output_dir = os.path.join(output_dir, sample_label)
        misc_utils.make_dir(results_output_dir)

        # Load the set of counts and serialize them into JSON
        events = \
            as_events.load_event_counts(sample_filename, event_type,
                                        events_info_filename=events_info_filename)

        # Filter events
        if filter_events:
            print("Filtering events...")
            events.filter_events(settings=Settings.get())

        print("Running on a total of %d events." % (len(events.events)))

        events_filename = events.output_file(results_output_dir,
                                             sample_label)

        # Run MISO on them. The command is kept as an argument list and
        # run without a shell, so that paths containing spaces or shell
        # metacharacters reach run_miso.py intact.
        miso_cmd = ["python",
                    os.path.join(miso_path, 'run_miso.py'),
                    "--compute-two-iso-psi",
                    "%s" % (events_filename,),
                    results_output_dir,
                    "--event-type", "%s" % (event_type,),
                    "--read-len", "%d" % (read_len),
                    "--overhang-len", "%d" % (overhang_len)]
        if use_cluster:
            miso_cmd.append("--use-cluster")
            if chunk_jobs:
                miso_cmd.extend(["--chunk-jobs", "%d" % (chunk_jobs)])
        print("Executing: %s" % (" ".join(miso_cmd)))
        if use_cluster:
            print(" - Using cluster")
        exit_status = subprocess.call(miso_cmd)
        if exit_status != 0:
            # Keep going with the remaining samples, but do not let a
            # failed run pass unnoticed
            print("WARNING: MISO run for sample %s exited with status %d"
                  % (sample_label, exit_status))
def compute_gene_psi(gene_ids, gff_index_filename, bam_filename, output_dir,
                     read_len, overhang_len, paired_end=None, event_type=None,
                     verbose=True):
    """
    Run Psi at the Gene-level (for multi-isoform inference.)

    Arguments:

    - gene_ids: set of gene IDs corresponding to gene IDs from the GFF
    - gff_index_filename: indexed GFF filename describing the genes
      (its basename must end in '.pickle')
    - bam_filename: BAM filename with the reads (must be sorted and indexed)
    - output_dir: output directory
    - read_len: read length; genes whose isoforms are all shorter than
      this are skipped
    - overhang_len: overhang length handed to the sampler parameters
    - paired_end: optional; run in paired-end mode. Gives mean and standard
      deviation of fragment length distribution.
    - event_type: optional; when given, results go to
      output_dir/event_type/<chrom> instead of output_dir/<chrom>
    - verbose: not used by this function

    Returns None. Calls sys.exit(1) if the index filename does not end
    in '.pickle' once a gene is about to be sampled.
    """
    misc_utils.make_dir(output_dir)

    if not os.path.exists(gff_index_filename):
        print("Error: No GFF %s" % (gff_index_filename))
        return

    num_genes = len(gene_ids)

    print("Computing Psi for %d genes..." % (num_genes))
    print(" - " + ", ".join(gene_ids))
    print(" - GFF filename: %s" % (gff_index_filename))
    print(" - BAM: %s" % (bam_filename))
    print(" - Outputting to: %s" % (output_dir))

    if paired_end:
        print(" - Paired-end mode:  %s" % (paired_end,))

    settings = Settings.get()
    settings_params = Settings.get_sampler_params()
    burn_in = settings_params["burn_in"]
    lag = settings_params["lag"]
    num_iters = settings_params["num_iters"]
    num_chains = settings_params["num_chains"]

    min_event_reads = Settings.get_min_event_reads()
    strand_rule = Settings.get_strand_param()

    mean_frag_len = None
    frag_variance = None

    if paired_end:
        mean_frag_len = int(paired_end[0])
        # paired_end[1] is a standard deviation; the sampler takes variance
        frag_variance = power(int(paired_end[1]), 2)

    # Load the genes from the GFF
    gff_genes = gff_utils.load_indexed_gff_file(gff_index_filename)

    # If given a template for the SAM file, use it. Both lookups are
    # guarded so that missing/empty settings fall back to the defaults
    # instead of failing on the membership test.
    template = None
    filter_reads = True
    if settings:
        if "sam_template" in settings:
            template = settings["sam_template"]
        if "filter_reads" in settings:
            filter_reads = settings["filter_reads"]

    # Load the BAM file upfront
    bamfile = sam_utils.load_bam_reads(bam_filename,
                                       template=template)

    for gene_id, gene_info in gff_genes.iteritems():
        # Skip genes that we were not asked to run on
        if gene_id not in gene_ids:
            continue
        gene_obj = gene_info['gene_object']

        # Sanity check: if the isoforms are all shorter than the read,
        # skip the event
        if all(iso_len < read_len for iso_len in gene_obj.iso_lens):
            print("All isoforms of %s shorter than %d, so skipping"
                  % (gene_id, read_len))
            continue

        # Find the most inclusive transcription start and end sites
        # for each gene
        tx_start, tx_end = \
            gff_utils.get_inclusive_txn_bounds(gene_info['hierarchy'][gene_id])

        # Fetch reads aligning to the gene boundaries
        gene_reads = \
            sam_utils.fetch_bam_reads_in_gene(bamfile,
                                              gene_obj.chrom,
                                              tx_start,
                                              tx_end,
                                              gene_obj)
        # Parse reads: checking strandedness and pairing
        # reads in case of paired-end data
        reads, num_raw_reads = \
            sam_utils.sam_parse_reads(gene_reads,
                                      paired_end=paired_end,
                                      strand_rule=strand_rule,
                                      target_strand=gene_obj.strand)

        # Skip gene if too few of the reads align to gene boundaries
        if filter_reads:
            if num_raw_reads < min_event_reads:
                print("Only %d reads in gene, skipping (needed >= %d reads)"
                      % (num_raw_reads, min_event_reads))
                continue
            print("%d raw reads in event" % (num_raw_reads))

        num_isoforms = len(gene_obj.isoforms)
        hyperparameters = ones(num_isoforms)

        ##
        ## Run the sampler
        ##
        # Create the sampler with the right parameters depending on whether
        # this is a paired-end or single-end data set.
        if paired_end:
            # Sampler parameters for paired-end mode
            sampler_params = \
                miso.get_paired_end_sampler_params(num_isoforms,
                                                   mean_frag_len,
                                                   frag_variance,
                                                   read_len,
                                                   overhang_len=overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=True,
                                       log_dir=output_dir)
        else:
            # Sampler parameters for single-end mode
            sampler_params = miso.get_single_end_sampler_params(num_isoforms,
                                                                read_len,
                                                                overhang_len)
            sampler = miso.MISOSampler(sampler_params,
                                       paired_end=False,
                                       log_dir=output_dir)

        # Make directory for chromosome -- if given an event type, put
        # the gene in the event type directory
        if event_type is not None:
            chrom_dir = os.path.join(output_dir, event_type, gene_obj.chrom)
        else:
            chrom_dir = os.path.join(output_dir, gene_obj.chrom)

        try:
            os.makedirs(chrom_dir)
        except OSError:
            # An already-existing directory is expected; anything else
            # (permissions, a file in the way, ...) is a real failure
            if not os.path.isdir(chrom_dir):
                raise

        # Pick .miso output filename based on the pickle filename
        miso_basename = os.path.basename(gff_index_filename)
        if not miso_basename.endswith(".pickle"):
            print("Error: Invalid index file %s" % (gff_index_filename))
            sys.exit(1)
        # Drop only the trailing extension; a '.pickle' occurring earlier
        # in the name is part of the name and must be kept
        miso_basename = miso_basename[:-len(".pickle")]
        output_filename = os.path.join(chrom_dir, miso_basename)
        sampler.run_sampler(num_iters, reads, gene_obj, hyperparameters,
                            sampler_params, output_filename,
                            num_chains=num_chains,
                            burn_in=burn_in,
                            lag=lag)