def main():
    """
    Command-line entry point for building a GFF database.

    Reads --input-gff and --output-dir from the command line, creates the
    output directory when it is missing and passes the GFF file to
    create_db().  Exits with status 1 when --output-dir is not given or
    when the GFF file does not exist.
    """
    from optparse import OptionParser

    opt_parser = OptionParser()
    opt_parser.add_option(
        "--input-gff",
        dest="input_gff",
        nargs=1,
        default=None,
        help="Create a database for input GFF filename. Takes a "
             "GFF filename.",
    )
    opt_parser.add_option(
        "--output-dir",
        dest="output_dir",
        nargs=1,
        default=None,
        help="Output directory.",
    )
    opts, _ = opt_parser.parse_args()

    if opts.output_dir is None:
        print("Error: need --output-dir to be provided.\n")
        greeting()
        sys.exit(1)

    out_dir = utils.pathify(opts.output_dir)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    # Nothing else to do unless a GFF file was requested
    if opts.input_gff is None:
        return

    gff_path = utils.pathify(opts.input_gff)
    if not os.path.isfile(gff_path):
        print("Error: GFF file %s does not exist." % (gff_path))
        sys.exit(1)
    create_db(gff_path, out_dir)
def main():
    """
    Parse the command line and create a GFF database.

    --output-dir is mandatory (the program prints an error, shows the
    greeting and exits with status 1 without it).  When --input-gff is
    given, the file must exist; it is then handed to create_db() together
    with the output directory.
    """
    from optparse import OptionParser

    cli = OptionParser()
    cli.add_option("--input-gff", dest="input_gff", nargs=1, default=None,
                   help="Create a database for input GFF filename. Takes a "
                        "GFF filename.")
    cli.add_option("--output-dir", dest="output_dir", nargs=1, default=None,
                   help="Output directory.")
    parsed, _unused_args = cli.parse_args()

    requested_dir = parsed.output_dir
    if requested_dir is None:
        print("Error: need --output-dir to be provided.\n")
        greeting()
        sys.exit(1)

    # Make sure the destination exists before any work is done
    target_dir = utils.pathify(requested_dir)
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    requested_gff = parsed.input_gff
    if requested_gff is not None:
        gff_file = utils.pathify(requested_gff)
        if not os.path.isfile(gff_file):
            print("Error: GFF file %s does not exist." % (gff_file))
            sys.exit(1)
        create_db(gff_file, target_dir)
def compute_insert_lens(settings, output_dir, dry_run=False):
    """
    Compute insert lengths for all samples.

    Builds one ``--compute-insert-len`` command per BAM file known to the
    MISOWrap object and either submits it to the cluster or runs it
    locally.  Results are directed to an ``insert_lens`` subdirectory of
    'output_dir'.

    Args:
        settings: path to the settings file.
        output_dir: output directory; also handed to MISOWrap as its
            output directory.
        dry_run: when True, only print the commands, do not run them.

    Exits with status 1 if the constitutive exons GFF file is missing.
    """
    settings_filename = utils.pathify(settings)
    # This function receives no separate logs directory, so the
    # (normalized) output directory is what MISOWrap is given.
    logs_outdir = utils.pathify(output_dir)
    misowrap_obj = mw.MISOWrap(settings_filename,
                               logs_outdir,
                               logger_label="insert_lens")
    const_exons_gff = misowrap_obj.const_exons_gff
    if not os.path.isfile(const_exons_gff):
        print("Error: %s const exons GFF does not exist." % (const_exons_gff))
        sys.exit(1)
    pe_utils_path = misowrap_obj.pe_utils_cmd
    insert_len_output_dir = os.path.join(output_dir, "insert_lens")
    num_bams = len(misowrap_obj.bam_files)
    print("Computing insert lengths for %d files" % (num_bams))
    for bam_filename, sample_name in misowrap_obj.bam_files:
        print("Processing: %s" % (bam_filename))
        insert_len_cmd = "%s --compute-insert-len %s %s --output-dir %s" % (
            pe_utils_path,
            bam_filename,
            const_exons_gff,
            insert_len_output_dir,
        )
        print("Executing: %s" % (insert_len_cmd))
        job_name = "%s_insert_len" % (sample_name)
        if dry_run:
            # Commands are only reported in a dry run
            continue
        if misowrap_obj.use_cluster:
            misowrap_obj.my_cluster.launch_job(insert_len_cmd, job_name,
                                               ppn=1)
        else:
            os.system(insert_len_cmd)
def make_miso_annotation(tables_dir, output_dir, org_build):
    """
    Build the GFF alternative events annotation from UCSC tables.

    'tables_dir' is the directory holding the UCSC tables, 'output_dir'
    receives the annotation and 'org_build' is passed on as the genome
    label.  Raises Exception when no UCSC tables are found.

    Adapted from https://github.com/yarden/rnaseqlib/
    """
    ucsc_dir = utils.pathify(tables_dir)
    annotation_dir = utils.pathify(output_dir)
    print("Making GFF alternative events annotation...")
    print(" - UCSC tables read from: %s" % (ucsc_dir))
    print(" - Output dir: %s" % (annotation_dir))

    started = time.time()
    loaded_tables = def_events.load_ucsc_tables(ucsc_dir)
    table_count = len(loaded_tables)
    if table_count == 0:
        raise Exception("No UCSC tables found in %s." % (ucsc_dir))
    print("Loaded %d UCSC tables." % (table_count))

    def_events.defineAllSplicing(
        ucsc_dir,
        annotation_dir,
        flanking="commonshortest",
        multi_iso=False,
        sanitize=False,
        genome_label=org_build,
    )
    finished = time.time()
    elapsed_minutes = (finished - started) / 60.
    print("Took %.2f minutes to make the annotation." % (elapsed_minutes))
def make_annotation(args):
    """
    Build the GFF alternative events annotation.

    Reads 'tables_dir', 'output_dir', 'flanking_rule', 'multi_iso',
    'genome_label' and 'sanitize' from the 'args' object.  Raises
    Exception when the tables directory yields no UCSC tables.
    """
    ucsc_dir = utils.pathify(args.tables_dir)
    annotation_dir = utils.pathify(args.output_dir)
    print("Making GFF alternative events annotation...")
    print(" - UCSC tables read from: %s" % (ucsc_dir))
    print(" - Output dir: %s" % (annotation_dir))

    started = time.time()
    loaded_tables = def_events.load_ucsc_tables(ucsc_dir)
    table_count = len(loaded_tables)
    if table_count == 0:
        raise Exception("No UCSC tables found in %s." % (ucsc_dir))
    print("Loaded %d UCSC tables." % (table_count))

    def_events.defineAllSplicing(
        ucsc_dir,
        annotation_dir,
        flanking=args.flanking_rule,
        multi_iso=args.multi_iso,
        genome_label=args.genome_label,
        sanitize=args.sanitize,
    )
    finished = time.time()
    elapsed_minutes = (finished - started) / 60.
    print("Took %.2f minutes to make the annotation." % (elapsed_minutes))
def main():
    """
    Command-line entry point for the pipeline.

    --output-dir is always required.  --run and --run-on-sample also
    require --settings; --init takes a genome name.  Each missing
    requirement prints an error plus the usage text and exits with
    status 1.
    """
    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option("--run", dest="run", action="store_true",
                      default=False, help="Run pipeline.")
    parser.add_option("--run-on-sample", dest="run_on_sample", nargs=1,
                      default=None,
                      help="Run on a particular sample. Takes as input "
                           "the sample label.")
    parser.add_option("--settings", dest="settings", nargs=1, default=None,
                      help="Settings filename.")
    parser.add_option("--init", dest="initialize", nargs=1, default=None,
                      help="Initialize the pipeline. Takes as input a "
                           "genome, e.g. mm9 or hg18")
    parser.add_option("--output-dir", dest="output_dir", nargs=1,
                      default=None, help="Output directory.")
    opts, _ = parser.parse_args()

    greeting()

    def fail_with_usage(message):
        # Report the problem, show the usage text and stop
        print(message)
        parser.print_help()
        sys.exit(1)

    def required_settings_path():
        # Running the pipeline requires a settings filename
        if opts.settings is None:
            fail_with_usage("Error: need --settings")
        return utils.pathify(opts.settings)

    if opts.output_dir is None:
        fail_with_usage("Error: need --output-dir")

    out_dir = utils.pathify(opts.output_dir)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    if opts.run:
        run_pipeline(required_settings_path(), out_dir)

    if opts.run_on_sample is not None:
        settings_path = required_settings_path()
        run_on_sample(opts.run_on_sample, settings_path, out_dir)

    if opts.initialize is not None:
        initialize_pipeline(opts.initialize, out_dir)
def main():
    """
    Command-line entry point for fetching sequences of GFF events.

    --output-dir is mandatory.  When --input-gff is given, --fi (the
    FASTA file) is mandatory as well; the sequences are then fetched with
    gffutils_helpers.fetch_seq_from_gff().  Missing requirements show the
    greeting, print an error and exit with status 1.
    """
    from optparse import OptionParser

    cli = OptionParser()
    cli.add_option(
        "--input-gff", dest="input_gff", default=None, nargs=1,
        help="Fetch sequence from GFF events file. Takes as input: "
             "GFF filename.")
    cli.add_option(
        "--fi", dest="fasta_fname", default=None, nargs=1,
        help="FASTA filename to fetch sequences from.")
    cli.add_option(
        "--with-flanking-introns", dest="with_flanking_introns",
        default=False, action="store_true",
        help="Get sequence of flanking introns relative to skipped exon.")
    cli.add_option(
        "--flanking-introns-coords", dest="flanking_introns_coords",
        default=None, nargs=4,
        help="Fetch the sequences of the flanking introns "
             "(for SpliceGraph events). Takes as input the intervals to "
             "be used, which are: "
             "(1) start position relative to 5 prime splice site of SE "
             "(negative int), "
             "(2) end position 5 prime splice site (negative int), "
             "(3) start position relative to 3 prime splice site "
             "(positive int), "
             "(4) end position relative to 3 prime splice site. "
             "(posiitve int). "
             "Suggested settings are -250, -20, 20, -250.")
    cli.add_option(
        "--output-dir", dest="output_dir", nargs=1, default=None,
        help="Output directory.")
    opts, _ = cli.parse_args()

    if opts.output_dir is None:
        greeting()
        print("Error: need --output-dir to be provided.")
        sys.exit(1)

    out_dir = utils.pathify(opts.output_dir)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    # Sequences are only fetched when a GFF file was requested
    if opts.input_gff is None:
        return

    # Check for FASTA
    if opts.fasta_fname is None:
        greeting()
        print("Error: Must provide input fasta file with --fi.")
        sys.exit(1)

    gff_path = utils.pathify(opts.input_gff)
    fasta_path = utils.pathify(opts.fasta_fname)
    intron_coords = opts.flanking_introns_coords
    gffutils_helpers.fetch_seq_from_gff(
        gff_path,
        fasta_path,
        out_dir,
        with_flanking_introns=opts.with_flanking_introns,
        flanking_introns_coords=intron_coords,
    )
def load_summaries(self, miso_samples_dir):
    """
    Load MISO summary files into self.summaries_df.

    For every (sample, event type) pair the summary file found under
    <miso_samples_dir>/<sample name>/<event type> is read with
    self.delimiter; pairs whose directory or summary file is missing are
    skipped with a warning.
    """
    samples_root = utils.pathify(miso_samples_dir)
    print("Loading summary files..")
    loaded = defaultdict(dict)
    for sample in self.sample_labels:
        for event_type in self.event_types:
            sample_name, label = sample
            print("%s %s" % ("SAMPLE NAME: ", sample_name))
            print("%s %s" % (" SAMPLE LABEL: ", label))
            event_dir = os.path.join(samples_root, sample_name, event_type)
            if not os.path.isdir(event_dir):
                print("WARNING: Skipping %s..." % (event_dir))
                continue
            summary_path = get_summary_filename(event_dir)
            if not os.path.isfile(summary_path):
                print("WARNING: %s not a summary file" % (summary_path))
                continue
            # Index the table by event type first, then by sample name
            loaded[event_type][sample_name] = pandas.read_table(
                summary_path, sep=self.delimiter)
    self.summaries_df = pandas.DataFrame(loaded)
def load_events_to_genes(self, delimiter="\t"):
    """
    Load mapping from events to genes.  Use the new GFF files for this.

    Reads every ``*.gff3`` file in the directory named by the
    'events_to_genes_dir' setting and fills self.events_to_genes, indexed
    by event type (the part of the file's basename before the first dot)
    and then by the gene entry's ID.  Each value is a dict with the keys
    'ensg_id', 'refseq_id' and 'gsymbol'.

    Returns without touching self.events_to_genes when the setting is
    absent.  'delimiter' is accepted but not used.
    """
    settings_section = self.settings_info["settings"]
    # Test for the same key that is read below; testing a different key
    # either skipped a configured directory or raised KeyError.
    if "events_to_genes_dir" not in settings_section:
        return
    events_to_genes_dir = \
        utils.pathify(settings_section["events_to_genes_dir"])
    gff_fnames = glob.glob(os.path.join(events_to_genes_dir, "*.gff3"))
    print("Loading events to genes mapping...")
    print(" - Input directory: %s" % (events_to_genes_dir))
    print(" - Number of files: %d" % (len(gff_fnames)))
    self.events_to_genes = defaultdict(lambda: defaultdict(str))
    for fname in gff_fnames:
        event_type = os.path.basename(fname).split(".")[0]
        gff_entries = pybedtools.BedTool(fname)
        # Keep only the records whose third field says "gene"
        gene_entries = gff_entries.filter(lambda x: x.fields[2] == "gene")
        for gene in gene_entries:
            # Parse Ensembl gene, RefSeq and gene symbols
            attrs = gene.attrs
            self.events_to_genes[event_type][attrs["ID"]] = {
                "ensg_id": attrs["ensg_id"],
                "refseq_id": attrs["refseq_id"],
                "gsymbol": attrs["gsymbol"],
            }
def get_default_db_fname(gff_fname, db_dirname="gff_db"):
    """
    Locate the canonical GFF database file for 'gff_fname'.

    A filename already ending in .db is assumed to be the database and is
    returned as is.  Otherwise the database is expected in a 'db_dirname'
    subdirectory next to the GFF file, named after the GFF basename plus
    ".db": for /home/user/mygff.gff that is
    /home/user/gff_db/mygff.gff.db.

    Returns the database path, or None when the subdirectory or the file
    does not exist.
    """
    gff_path = utils.pathify(gff_fname)
    # If the input ends in .db, assume it is the database
    if gff_path.endswith(".db"):
        return gff_path
    parent_dir, basename = os.path.split(gff_path)
    db_dir = os.path.join(parent_dir, db_dirname)
    candidate = os.path.join(db_dir, "%s.db" % (basename))
    if os.path.isdir(db_dir) and os.path.isfile(candidate):
        return candidate
    return None
def load_events_to_genes(self, delimiter="\t"):
    """
    Load mapping from events to genes.  Use the new GFF files for this.

    Reads every ``*.gff3`` file in the directory named by the
    'events_to_genes_dir' setting and fills self.events_to_genes, indexed
    by event type (the part of the file's basename before the first dot)
    and then by the gene entry's ID.  Each value is a dict with the keys
    'ensg_id', 'refseq_id' and 'gsymbol'.

    Returns without touching self.events_to_genes when the setting is
    absent.  'delimiter' is accepted but not used.
    """
    settings_section = self.settings_info["settings"]
    # Test for the same key that is read below; testing a different key
    # either skipped a configured directory or raised KeyError.
    if "events_to_genes_dir" not in settings_section:
        return
    events_to_genes_dir = \
        utils.pathify(settings_section["events_to_genes_dir"])
    gff_fnames = glob.glob(os.path.join(events_to_genes_dir, "*.gff3"))
    print("Loading events to genes mapping...")
    print(" - Input directory: %s" % (events_to_genes_dir))
    print(" - Number of files: %d" % (len(gff_fnames)))
    self.events_to_genes = defaultdict(lambda: defaultdict(str))
    for fname in gff_fnames:
        event_type = os.path.basename(fname).split(".")[0]
        gff_entries = pybedtools.BedTool(fname)
        # Keep only the records whose third field says "gene"
        gene_entries = gff_entries.filter(lambda x: x.fields[2] == "gene")
        for gene in gene_entries:
            # Parse Ensembl gene, RefSeq and gene symbols
            attrs = gene.attrs
            self.events_to_genes[event_type][attrs["ID"]] = {
                "ensg_id": attrs["ensg_id"],
                "refseq_id": attrs["refseq_id"],
                "gsymbol": attrs["gsymbol"],
            }
def summarize(settings, logs_outdir, delay=5, dry_run=False):
    """
    Summarize samples in MISO directory.

    For every sample label, walks the entries of the sample's directory
    under the MISO output directory and runs (or submits to the cluster)
    one ``--summarize-samples`` command per event directory.  Samples and
    entries that are not directories are reported and skipped.

    Args:
        settings: path to the settings file.
        logs_outdir: directory handed to MISOWrap as its second argument.
        delay: accepted for interface compatibility; not used here.
        dry_run: when True, print the commands without executing them.
    """
    settings_filename = utils.pathify(settings)
    misowrap_obj = mw.MISOWrap(settings_filename,
                               logs_outdir,
                               logger_label="summarize")
    print("Summarizing MISO output...")
    for sample_label in misowrap_obj.sample_labels:
        sample_basename = sample_label[0]
        sample_dir_path = utils.pathify(
            os.path.join(misowrap_obj.miso_outdir, sample_basename))
        print("Processing: %s" % (sample_basename))
        if not os.path.isdir(sample_dir_path):
            # Really skip it: listing a non-directory raises OSError
            print("Skipping non-directory: %s" % (sample_dir_path))
            continue
        # List all event directories in the sample
        for event_dirname in os.listdir(sample_dir_path):
            event_dir_path = utils.pathify(
                os.path.join(sample_dir_path, event_dirname))
            if not os.path.isdir(event_dir_path):
                print("Skipping non-dir: %s" % (event_dir_path))
                continue
            print("Processing event type: %s" % (event_dirname))
            summary_cmd = "%s --summarize-samples %s %s --summary-label %s" % (
                misowrap_obj.summarize_miso_cmd,
                event_dir_path,
                event_dir_path,
                sample_basename,
            )
            job_name = "summarize_%s_%s" % (sample_basename,
                                            os.path.basename(event_dirname))
            print("Executing: %s" % (summary_cmd))
            if dry_run:
                continue
            if misowrap_obj.use_cluster:
                misowrap_obj.my_cluster.launch_job(summary_cmd, job_name,
                                                   ppn=1)
            else:
                os.system(summary_cmd)
def get_event_types_dirs(settings_info):
    """
    Return the paths of all entries found in the 'miso_events_dir'
    directory named in the settings, each joined onto that directory.
    """
    events_root = utils.pathify(settings_info["settings"]["miso_events_dir"])
    event_dirs = []
    for entry in os.listdir(events_root):
        event_dirs.append(os.path.join(events_root, entry))
    return event_dirs
def filter(settings, logs_outdir, dry_run=False):
    """
    Output a set of filtered MISO comparisons.

    Builds a MISOWrap object from the settings file, wraps it in a
    PsiTable and asks the table to write its filtered comparisons.
    'dry_run' is accepted but not consulted.
    """
    wrapper = mw.MISOWrap(utils.pathify(settings),
                          logs_outdir,
                          logger_label="filter")
    wrapper.logger.info("Filtering MISO events...")
    pt.PsiTable(wrapper).output_filtered_comparisons()
def summarize_miso_samples(settings_filename, output_dir):
    """
    Summarize samples in MISO directory.

    For every sample label, walks the entries of the sample's directory
    under the MISO output directory and runs (or submits to the cluster)
    one ``--summarize-samples`` command per event directory.  Samples and
    entries that are not directories are reported and skipped.

    Args:
        settings_filename: path to the settings file.
        output_dir: output directory handed to MISOWrap.
    """
    misowrap_obj = MISOWrap(settings_filename,
                            output_dir,
                            logger_label="summarize")
    print("Summarizing MISO output...")
    print(" - Output dir: %s" % (output_dir))
    run_miso_cmd = misowrap_obj.run_miso_cmd
    for sample_label in misowrap_obj.sample_labels:
        print("%s %s" % ("sample label: ", sample_label))
        sample_basename = sample_label[0]
        sample_dir_path = utils.pathify(
            os.path.join(misowrap_obj.miso_outdir, sample_basename))
        print("Processing: %s" % (sample_basename))
        if not os.path.isdir(sample_dir_path):
            # Really skip it: listing a non-directory raises OSError
            print("Skipping non-directory: %s" % (sample_dir_path))
            continue
        # List all event directories in the sample
        for event_dirname in os.listdir(sample_dir_path):
            event_dir_path = utils.pathify(
                os.path.join(sample_dir_path, event_dirname))
            if not os.path.isdir(event_dir_path):
                print("Skipping non-dir: %s" % (event_dir_path))
                continue
            print("Processing event type: %s" % (event_dirname))
            summary_cmd = "%s --summarize-samples %s %s --summary-label %s" % (
                run_miso_cmd,
                event_dir_path,
                event_dir_path,
                sample_basename,
            )
            job_name = "summarize_%s_%s" % (sample_basename,
                                            os.path.basename(event_dirname))
            print("Executing: %s" % (summary_cmd))
            if misowrap_obj.use_cluster:
                misowrap_obj.my_cluster.launch_job(summary_cmd, job_name)
            else:
                os.system(summary_cmd)
def main():
    """
    Command-line entry point for sanitizing a GFF file.

    --output-dir is mandatory (an error and the greeting are shown and
    the program exits with status 1 without it).  When --input-gff is
    given, the file is passed to sanitize_gff() together with the output
    directory.  The --gtf, --db-subdir and --no-db-output options are
    parsed but not consulted here.
    """
    from optparse import OptionParser

    cli = OptionParser()
    cli.add_option(
        "--input-gff", dest="input_gff", default=None, nargs=1,
        help="Input GFF filename for a GFF database.")
    cli.add_option(
        "--output-dir", dest="output_dir", nargs=1, default=None,
        help="Output directory.")
    cli.add_option(
        "--gtf", dest="gtf", default=False, action="store_true",
        help="Output a GTF file instead of GFF.")
    cli.add_option(
        "--db-subdir", dest="db_subdir", default="gff_db",
        help="Name of output subdirectory containing GFF "
             "database. By default, creates 'gff_db' "
             "subdirectory in the directory given to --output-dir.")
    cli.add_option(
        "--no-db-output", dest="no_db_output", default=False,
        action="store_true",
        help="Do not output a GFF database.")
    opts, _ = cli.parse_args()

    if opts.output_dir is None:
        print("Error: need --output-dir to be provided.\n")
        greeting()
        sys.exit(1)

    out_dir = utils.pathify(opts.output_dir)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    # Nothing to sanitize unless a GFF file was requested
    if opts.input_gff is None:
        return
    sanitize_gff(utils.pathify(opts.input_gff), out_dir)
def main():
    """
    Command-line entry point for extracting lengths from a GFF file.

    --output-dir is mandatory (an error and the greeting are shown and
    the program exits with status 1 without it).  When --input-gff is
    given, the file is passed to extract_lens_from_gff() together with
    the output directory.
    """
    from optparse import OptionParser

    cli = OptionParser()
    cli.add_option("--input-gff", dest="input_gff", default=None, nargs=1,
                   help="Extract lengths from GFF file.")
    cli.add_option("--output-dir", dest="output_dir", nargs=1, default=None,
                   help="Output directory.")
    opts, _ = cli.parse_args()

    if opts.output_dir is None:
        print("Error: need --output-dir to be provided.\n")
        greeting()
        sys.exit(1)

    out_dir = utils.pathify(opts.output_dir)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    # Nothing to extract unless a GFF file was requested
    if opts.input_gff is None:
        return
    extract_lens_from_gff(utils.pathify(opts.input_gff), out_dir)
def get_output_db_fname(gff_fname, output_dir, db_subdir="gff_db"):
    """
    Return the database filename for 'gff_fname' (either a GFF db or a
    regular GFF file) inside the 'db_subdir' subdirectory of
    'output_dir'.  A ".db" suffix is appended unless already present.
    """
    basename = os.path.basename(utils.pathify(gff_fname))
    if not basename.endswith(".db"):
        basename += ".db"
    return os.path.join(output_dir, db_subdir, basename)
def compare(settings, logs_outdir, delay=5, dry_run=False):
    """
    Run a MISO samples comparison between all pairs of samples.

    For every comparison group, every pair of samples in the group and
    every event type, builds a ``--compare-samples`` command and either
    submits it to the cluster or runs it locally.

    Args:
        settings: path to the settings file.
        logs_outdir: directory handed to MISOWrap as its second argument.
        delay: seconds to sleep after each cluster submission.
        dry_run: when True, log the commands without executing them.
    """
    settings_filename = utils.pathify(settings)
    misowrap_obj = mw.MISOWrap(settings_filename,
                               logs_outdir,
                               logger_label="compare")
    miso_output_dir = misowrap_obj.miso_outdir
    comparison_groups = misowrap_obj.comparison_groups
    comparisons_dir = misowrap_obj.comparisons_dir
    utils.make_dir(comparisons_dir)
    misowrap_obj.logger.info("Running MISO comparisons...")
    ##
    ## Compute comparisons between all pairs
    ## in a sample group
    ##
    for comp_group in comparison_groups:
        sample_pairs = utils.get_pairwise_comparisons(comp_group)
        print(" - Total of %d comparisons" % (len(sample_pairs)))
        for sample1, sample2 in sample_pairs:
            # For each pair of samples, compare their output
            # along each event type
            misowrap_obj.logger.info("Comparing %s %s" % (sample1, sample2))
            # Directories for each sample
            sample1_dir = os.path.join(miso_output_dir, sample1)
            sample2_dir = os.path.join(miso_output_dir, sample2)
            for event_type in misowrap_obj.event_types:
                sample1_event_dir = os.path.join(sample1_dir, event_type)
                sample2_event_dir = os.path.join(sample2_dir, event_type)
                job_name = "compare_%s_%s_%s" % (sample1, sample2, event_type)
                event_comparisons_dir = os.path.join(comparisons_dir,
                                                     event_type)
                compare_cmd = ("%s --compare-samples %s %s %s "
                               "--comparison-labels %s %s") % (
                    misowrap_obj.compare_miso_cmd,
                    sample1_event_dir,
                    sample2_event_dir,
                    event_comparisons_dir,
                    sample1,
                    sample2,
                )
                misowrap_obj.logger.info("Executing: %s" % (compare_cmd))
                if dry_run:
                    continue
                if misowrap_obj.use_cluster:
                    misowrap_obj.my_cluster.launch_job(compare_cmd,
                                                       job_name,
                                                       ppn=1)
                    # Space out the submissions to the cluster
                    time.sleep(delay)
                else:
                    os.system(compare_cmd)
def main():
    """
    Parse the command line and sanitize a GFF file.

    Without --output-dir the program prints an error, shows the greeting
    and exits with status 1.  With --input-gff, sanitize_gff() is called
    on the file and the output directory.  The --gtf, --db-subdir and
    --no-db-output options are parsed but not consulted here.
    """
    from optparse import OptionParser

    opt_parser = OptionParser()
    opt_parser.add_option("--input-gff", dest="input_gff", default=None,
                          nargs=1,
                          help="Input GFF filename for a GFF database.")
    opt_parser.add_option("--output-dir", dest="output_dir", nargs=1,
                          default=None, help="Output directory.")
    opt_parser.add_option("--gtf", dest="gtf", default=False,
                          action="store_true",
                          help="Output a GTF file instead of GFF.")
    opt_parser.add_option("--db-subdir", dest="db_subdir", default="gff_db",
                          help="Name of output subdirectory containing GFF "
                               "database. By default, creates 'gff_db' "
                               "subdirectory in the directory given to "
                               "--output-dir.")
    opt_parser.add_option("--no-db-output", dest="no_db_output",
                          default=False, action="store_true",
                          help="Do not output a GFF database.")
    parsed, _unused_args = opt_parser.parse_args()

    requested_dir = parsed.output_dir
    if requested_dir is None:
        print("Error: need --output-dir to be provided.\n")
        greeting()
        sys.exit(1)

    # Make sure the destination exists before any work is done
    target_dir = utils.pathify(requested_dir)
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    requested_gff = parsed.input_gff
    if requested_gff is not None:
        gff_file = utils.pathify(requested_gff)
        sanitize_gff(gff_file, target_dir)
def make_annotation(args):
    """
    Make the GFF alternative events annotation.

    The UCSC tables directory, output directory, flanking rule,
    multi-isoform flag, genome label and sanitize flag are all taken from
    attributes of 'args'.  Raises Exception when no UCSC tables are
    found in the tables directory.
    """
    src_dir = utils.pathify(args.tables_dir)
    dest_dir = utils.pathify(args.output_dir)
    for line in ("Making GFF alternative events annotation...",
                 " - UCSC tables read from: %s" % (src_dir),
                 " - Output dir: %s" % (dest_dir)):
        print(line)

    clock_start = time.time()
    ucsc_tables = def_events.load_ucsc_tables(src_dir)
    n_tables = len(ucsc_tables)
    if n_tables == 0:
        raise Exception("No UCSC tables found in %s." % (src_dir))
    print("Loaded %d UCSC tables." % (n_tables))

    def_events.defineAllSplicing(src_dir,
                                 dest_dir,
                                 flanking=args.flanking_rule,
                                 multi_iso=args.multi_iso,
                                 genome_label=args.genome_label,
                                 sanitize=args.sanitize)
    clock_end = time.time()
    minutes_taken = (clock_end - clock_start) / 60.
    print("Took %.2f minutes to make the annotation." % (minutes_taken))
def compute_insert_lens(settings, output_dir, dry_run=False):
    """
    Compute insert lengths for all samples.

    Builds one ``--compute-insert-len`` command per BAM file known to the
    MISOWrap object and either submits it to the cluster or runs it
    locally.  Results are directed to an ``insert_lens`` subdirectory of
    'output_dir'.

    Args:
        settings: path to the settings file.
        output_dir: output directory; also handed to MISOWrap as its
            output directory.
        dry_run: when True, only print the commands, do not run them.

    Exits with status 1 if the constitutive exons GFF file is missing.
    """
    settings_filename = utils.pathify(settings)
    # This function receives no separate logs directory, so the
    # (normalized) output directory is what MISOWrap is given.
    logs_outdir = utils.pathify(output_dir)
    misowrap_obj = mw.MISOWrap(settings_filename,
                               logs_outdir,
                               logger_label="insert_lens")
    const_exons_gff = misowrap_obj.const_exons_gff
    if not os.path.isfile(const_exons_gff):
        print("Error: %s const exons GFF does not exist." % (const_exons_gff))
        sys.exit(1)
    pe_utils_path = misowrap_obj.pe_utils_cmd
    insert_len_output_dir = os.path.join(output_dir, "insert_lens")
    num_bams = len(misowrap_obj.bam_files)
    print("Computing insert lengths for %d files" % (num_bams))
    for bam_filename, sample_name in misowrap_obj.bam_files:
        print("Processing: %s" % (bam_filename))
        insert_len_cmd = "%s --compute-insert-len %s %s --output-dir %s" % (
            pe_utils_path,
            bam_filename,
            const_exons_gff,
            insert_len_output_dir,
        )
        print("Executing: %s" % (insert_len_cmd))
        job_name = "%s_insert_len" % (sample_name)
        if dry_run:
            # Commands are only reported in a dry run
            continue
        if misowrap_obj.use_cluster:
            misowrap_obj.my_cluster.launch_job(insert_len_cmd, job_name,
                                               ppn=1)
        else:
            os.system(insert_len_cmd)
def summarize(settings, logs_outdir, delay=5, dry_run=False):
    """
    Summarize samples in MISO directory.

    For every sample label, walks the entries of the sample's directory
    under the MISO output directory and runs (or submits to the cluster)
    one ``--summarize-samples`` command per event directory.  Samples and
    entries that are not directories are reported and skipped.

    Args:
        settings: path to the settings file.
        logs_outdir: directory handed to MISOWrap as its second argument.
        delay: accepted for interface compatibility; not used here.
        dry_run: when True, print the commands without executing them.
    """
    settings_filename = utils.pathify(settings)
    misowrap_obj = mw.MISOWrap(settings_filename,
                               logs_outdir,
                               logger_label="summarize")
    print("Summarizing MISO output...")
    for sample_label in misowrap_obj.sample_labels:
        sample_basename = sample_label[0]
        sample_dir_path = utils.pathify(
            os.path.join(misowrap_obj.miso_outdir, sample_basename))
        print("Processing: %s" % (sample_basename))
        if not os.path.isdir(sample_dir_path):
            # Really skip it: listing a non-directory raises OSError
            print("Skipping non-directory: %s" % (sample_dir_path))
            continue
        # List all event directories in the sample
        for event_dirname in os.listdir(sample_dir_path):
            event_dir_path = utils.pathify(
                os.path.join(sample_dir_path, event_dirname))
            if not os.path.isdir(event_dir_path):
                print("Skipping non-dir: %s" % (event_dir_path))
                continue
            print("Processing event type: %s" % (event_dirname))
            summary_cmd = "%s --summarize-samples %s %s --summary-label %s" % (
                misowrap_obj.summarize_miso_cmd,
                event_dir_path,
                event_dir_path,
                sample_basename,
            )
            job_name = "summarize_%s_%s" % (sample_basename,
                                            os.path.basename(event_dirname))
            print("Executing: %s" % (summary_cmd))
            if dry_run:
                continue
            if misowrap_obj.use_cluster:
                misowrap_obj.my_cluster.launch_job(summary_cmd, job_name,
                                                   ppn=1)
            else:
                os.system(summary_cmd)
def main():
    """
    Command-line entry point for the MISO wrapper.

    --output-dir is mandatory.  Each of --run, --summarize, --compare,
    --filter and --compute-insert-lens takes a settings filename and
    triggers the matching step; several may be given at once and they are
    handled in that order.
    """
    from optparse import OptionParser

    parser = OptionParser()
    # Every action option takes exactly one value: a settings filename
    action_options = [
        ("--run", "run",
         "Run MISO on a set of events. "
         "Takes a settings filename."),
        ("--summarize", "summarize",
         "Run MISO summarize on a set of samples. "
         "Takes a settings filename."),
        ("--compare", "compare",
         "Run MISO sample comparisons on all pairwise "
         "comparisons. Takes a settings filename."),
        ("--filter", "filter",
         "Filter a set of MISO events. "
         "Takes a settings filename."),
        ("--compute-insert-lens", "compute_insert_lens",
         "Compute insert lengths for a set of BAM files. "
         "takes a settings filename."),
    ]
    for flag, dest_name, help_text in action_options:
        parser.add_option(flag, dest=dest_name, nargs=1, default=None,
                          help=help_text)
    parser.add_option("--output-dir", dest="output_dir", default=None,
                      help="Output directory.")
    opts, _ = parser.parse_args()

    greeting()

    if opts.output_dir is None:
        print("Error: need --output-dir.\n")
        parser.print_help()
        sys.exit(1)

    out_dir = utils.pathify(opts.output_dir)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    if opts.run is not None:
        run_miso_on_samples(utils.pathify(opts.run), out_dir)

    if opts.summarize is not None:
        summarize_miso_samples(utils.pathify(opts.summarize), out_dir)

    if opts.compare is not None:
        compare_miso_samples(utils.pathify(opts.compare), out_dir)

    if opts.filter is not None:
        filter_events(utils.pathify(opts.filter), out_dir)

    if opts.compute_insert_lens is not None:
        compute_insert_lens(utils.pathify(opts.compute_insert_lens), out_dir)
def read_pe_params(insert_len_filename):
    """
    Get paired-end parameters from .insert_len file.

    Only the first line of the file is read: its first character is
    dropped and the remainder is parsed as comma-separated ``key=value``
    fields.

    Returns:
        dict mapping each key to its value converted to float.

    Raises:
        ValueError: if a field is not of the form ``key=value`` or its
            value is not a number.

    Exits with status 1 if the file does not exist.
    """
    insert_len_filename = utils.pathify(insert_len_filename)
    if not os.path.isfile(insert_len_filename):
        print("Error: %s not a file." % (insert_len_filename))
        sys.exit(1)
    # The context manager closes the file even when parsing fails below
    with open(insert_len_filename, "r") as insert_file:
        header = insert_file.readline()
    pe_params = {}
    for field in header[1:].strip().split(","):
        k, v = field.split("=")
        pe_params[k] = float(v)
    return pe_params
def load_events_to_genes(self, source="ensGene", delimiter="\t"):
    """
    Load mapping from events to genes.

    Expects the 'events_to_genes_dir' setting to name a directory with
    files named according to events, e.g.: SE.mm9.gff3_to_ensGene.txt
    Each file is read as a delimited table with 'event_id' and 'gene_id'
    columns; 'gene_id' may hold several comma-separated genes.

    Fills self.events_to_genes, indexed by event type (the part of the
    file's basename before the first dot) and then by event ID, with the
    list of genes.  Returns without doing anything when the setting is
    absent; exits with status 1 when the directory is missing or holds no
    matching files.

    Args:
        source: gene table name used in the ``*_to_<source>.txt`` pattern.
        delimiter: field delimiter of the mapping files.
    """
    settings_section = self.settings_info["settings"]
    if "events_to_genes_dir" not in settings_section:
        return
    events_to_genes_dir = \
        utils.pathify(settings_section["events_to_genes_dir"])
    print("Loading events to genes mapping from: %s" % (events_to_genes_dir))
    # If we're given mapping from events to genes, load
    # these and index them by event type.
    if not os.path.isdir(events_to_genes_dir):
        print("Error: %s not a directory." % (events_to_genes_dir))
        sys.exit(1)
    basename_card = "*_to_%s.txt" % (source)
    events_to_genes_files = \
        glob.glob(os.path.join(events_to_genes_dir, basename_card))
    if len(events_to_genes_files) == 0:
        print("Error: %s directory contains no %s files."
              % (events_to_genes_dir, basename_card))
        sys.exit(1)
    self.events_to_genes = defaultdict(lambda: defaultdict(list))
    for fname in events_to_genes_files:
        # Extract event type based on filename
        event_type = os.path.basename(fname).split(".")[0]
        with open(fname, "r") as events_file:
            events_entries = csv.DictReader(events_file,
                                            delimiter=delimiter)
            for entry in events_entries:
                event_id = entry["event_id"]
                # Parse genes into a list
                genes = entry["gene_id"].split(",")
                # Index events by their type and then by
                # their ID
                self.events_to_genes[event_type][event_id].extend(genes)
def __init__(self, settings_filename, output_dir, logger_label=None):
    """
    Set up the wrapper: record the settings filename, create the main
    output directory, reset every attribute to its default and then load
    the settings and the events-to-genes annotation.

    Args:
        settings_filename: path to the settings file.
        output_dir: main output directory (created if needed).
        logger_label: label stored on the instance as self.logger_label.
    """
    self.settings_filename = settings_filename
    self.settings_info = None
    # Keep the caller's label; it used to be overwritten with None,
    # which silently discarded the argument
    self.logger_label = logger_label
    # Main output directory
    self.output_dir = utils.pathify(output_dir)
    utils.make_dir(self.output_dir)
    # MISO output directory (where raw output is)
    self.miso_outdir = None
    # Comparisons output directory
    self.comparisons_outdir = None
    # BAM files to process
    self.bam_files = None
    # Sample labels
    self.sample_labels = None
    self.comparison_groups = None
    # Insert length directory (for paired-end samples)
    self.insert_lens_dir = None
    # Logs output directory
    self.logs_outdir = None
    # Logger object
    self.logger = None
    # Cluster submission object
    self.my_cluster = None
    # Event types to process
    self.event_types = None
    # Whether to submit jobs to cluster
    self.use_cluster = False
    # run_miso cmd
    self.run_miso_cmd = None
    # run_events_analysis cmd
    self.run_events_cmd = None
    # Constitutive exons GFF file: used to compute
    # the insert length distribution
    self.const_exons_gff = None
    # Load settings
    self.load_settings()
    ##
    ## Load annotation of events, like a map
    ## events to genes.
    ##
    self.events_to_genes = None
    self.load_events_to_genes()
def get_bf_filename(pairwise_comparison_dir):
    """
    Return a Bayes factor filename from a pairwise comparisons directory.

    The ``*.miso_bf`` file is looked for in the "bayes-factors"
    subdirectory when it exists, otherwise directly in the comparison
    directory.

    Returns:
        The path of the single Bayes factor file, or None (after printing
        a message) when the directory is missing, when it holds no such
        file, or when it holds more than one.
    """
    pairwise_comparison_dir = utils.pathify(pairwise_comparison_dir)
    if not os.path.isdir(pairwise_comparison_dir):
        print("WARNING: Could not find %s" % (pairwise_comparison_dir))
        return None
    bf_dir = os.path.join(pairwise_comparison_dir, "bayes-factors")
    if not os.path.isdir(bf_dir):
        # Attempt current directory without "bayes-factor"
        # inner directory
        bf_dir = pairwise_comparison_dir
    bf_filenames = glob.glob(os.path.join(bf_dir, "*.miso_bf"))
    if len(bf_filenames) > 1:
        print("Error: Multiple BF filenames in %s" % (bf_dir))
        return None
    if not bf_filenames:
        # No match at all: report it instead of failing on an index
        print("WARNING: No BF file found in %s" % (bf_dir))
        return None
    return bf_filenames[0]
def load_pipeline_settings(self):
    """
    Load the settings file and populate the pipeline attributes.

    Sets self.settings, self.settings_info, self.parsed_settings,
    self.genome, self.is_paired_end, self.output_dir and self.init_dir,
    then loads the sequence files and the group information.  Exits with
    status 1 when the settings file does not exist.
    """
    settings_path = self.settings_filename
    if not os.path.isfile(settings_path):
        print("Error: %s is not a settings filename." % (settings_path))
        sys.exit(1)

    self.settings = settings.load_settings(settings_path)
    self.settings_info, self.parsed_settings = self.settings

    mapping_section = self.settings_info["mapping"]
    self.genome = mapping_section["genome"]
    # Determine if we're in paired-end mode
    self.is_paired_end = False
    if mapping_section["paired"]:
        self.is_paired_end = True

    # Load the sequence files
    self.load_sequence_files()

    # Directory where pipeline output should go
    configured_outdir = self.settings_info["data"]["outdir"]
    self.output_dir = utils.pathify(configured_outdir)
    print("Loaded pipeline settings (source: %s)." % (self.settings_filename))

    # Pipeline init directory
    configured_init_dir = self.settings_info["pipeline-files"]["init_dir"]
    self.init_dir = os.path.join(configured_init_dir)

    # Loading group information if there is any
    self.load_groups()
def load_sequence_files(self):
    """
    Load sequence files from settings file.

    Every entry of the 'sequence_files' setting must be a
    (filename, sample label) pair; the filename is joined onto the
    'indir' input directory and must exist.  The resulting
    [path, label] pairs are stored in self.sequence_filenames and
    returned.  Exits with status 1 when the settings are not loaded, an
    entry is malformed or a file is missing.
    """
    if self.settings_info is None:
        print("Error: cannot load sequence files if settings "
              "are not loaded.")
        sys.exit(1)

    data_section = self.settings_info["data"]
    entries = data_section["sequence_files"]
    # Get the absolute path names, with the prefix input directory,
    # for each sequence file
    input_dir = utils.pathify(data_section["indir"])
    resolved = []
    for entry in entries:
        if len(entry) != 2:
            print("Error: Must provide a sequence filename and a "
                  "sample label for each entry.")
            sys.exit(1)
        fname, seq_label = entry
        seq_path = os.path.join(input_dir, fname)
        if not os.path.isfile(seq_path):
            print("Error: Cannot find sequence file %s" % (seq_path))
            sys.exit(1)
        resolved.append([seq_path, seq_label])

    self.sequence_filenames = resolved
    return resolved
def launchJob(cmd, job_name, scriptOptions, verbose=False, test=False,
              fast=False, queue_type="quick", ppn="4"):
    """
    Submits a job on the cluster which will run command 'cmd',
    with options 'scriptOptions'.

    Parameters:
      cmd: command to run; a string, or a list/tuple of strings that
        is joined with spaces.
      job_name: job name used unless scriptOptions already has a
        "jobname" entry.
      scriptOptions: dict of job script options. It is updated in
        place: missing "workingdir", "nodes", "ppn", "jobname",
        "scriptuser", "queue" and "outdir" entries get defaults, and
        "command" and "outf" are always overwritten.
      verbose: output the job script.
      test: don't actually submit the job script (usually used in
        conjunction with verbose).
      fast: submit only to the fast nodes on coyote.
      queue_type: queue used unless scriptOptions has a "queue" entry.
      ppn: value used unless scriptOptions has a "ppn" entry.

    Returns the integer job ID if the job was submitted properly, and
    None when 'test' is set. Raises Exception when the output of qsub
    does not end in ".coyote.mit.edu".
    """
    # Accept a single command as well as a sequence of command parts
    if not isinstance(cmd, (list, tuple)):
        cmd = [cmd]

    scriptOptions.setdefault("workingdir", os.getcwd())
    scriptOptions.setdefault("nodes", "1")
    scriptOptions.setdefault("ppn", str(ppn))
    scriptOptions.setdefault("jobname", job_name)
    scriptOptions.setdefault("scriptuser", getpass.getuser())
    scriptOptions.setdefault("queue", queue_type)
    scriptOptions.setdefault("outdir", "")

    scriptOptions["command"] = " ".join(cmd)

    pid = os.getpid()
    outscriptName = "%s.%i" % (scriptOptions["jobname"], pid)
    scriptOptions["outf"] = \
        utils.pathify(os.path.join(scriptOptions["outdir"],
                                   outscriptName + ".out"))

    if fast:
        assert scriptOptions["nodes"] == "1", \
            "Can only choose specific nodes if you're " \
            "not restricting jobs to the fast nodes."
        scriptOptions["nodes"] = "1:E5450"

    # NOTE(review): the mail address on the '#PBS -M' line was corrupted
    # in the source ('%(scriptuser)[email protected]'), which is not a
    # valid format spec and made the '%' formatting below raise
    # ValueError. The '@mit.edu' domain is reconstructed from the
    # '.coyote.mit.edu' host check further down -- confirm.
    outtext = """#!/bin/bash
#PBS -l nodes=%(nodes)s:ppn=%(ppn)s
#PBS -j oe
#PBS -o %(outf)s
#PBS -m a
#PBS -M %(scriptuser)s@mit.edu
#PBS -N %(jobname)s
#PBS -q %(queue)s
#PBS -S /bin/bash
echo $HOSTNAME
echo Working directory is %(workingdir)s
cd %(workingdir)s
echo "%(command)s"
%(command)s
echo "===== %(command)s finished ====="
""" % scriptOptions

    if verbose:
        print(outscriptName)
        print(outtext)

    call = "qsub -"

    if not test:
        try:
            qsub = subprocess.Popen(call, shell=True,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    stdin=subprocess.PIPE)
            print("Executing:  %s" % (scriptOptions["command"],))
            # communicate() feeds the script to stdin, closes it and
            # drains both output pipes, so qsub cannot block on a
            # full pipe
            output = qsub.communicate(outtext)
            if output[0].strip().endswith(".coyote.mit.edu"):
                jobID = int(output[0].split(".")[0])
                if verbose:
                    print("Process launched with job ID: %s" % (jobID,))
                return jobID
            else:
                raise Exception("Failed to launch job '%s': %s"
                                % (outscriptName, str(output)))
        except Exception:
            print("failing...")
            raise
    return None
def combine_comparisons(
    settings,
    logs_outdir,
    common_cols=["isoforms", "chrom", "strand", "mRNA_starts", "mRNA_ends", "gene_id", "gene_symbol"],
    delay=5,
    dry_run=False,
    NA_VAL="NA",
):
    """
    Output combined MISO comparisons. For each event type,
    combine the MISO comparisons for the relevant groups
    based on the 'comparison_groups' in the misowrap
    settings file.

    Both the 'combined_comparisons' and 'filtered_events'
    subdirectories of the comparisons directory are processed when
    present. Unless 'dry_run' is set, one '<event_type>.miso_bf'
    table is written per event type. 'common_cols' and 'delay' are
    not used by the body.
    """
    settings_filename = utils.pathify(settings)
    logs_outdir = utils.pathify(logs_outdir)
    utils.make_dir(logs_outdir)
    wrapper = mw.MISOWrap(settings_filename,
                          logs_outdir,
                          logger_label="combine_comparisons")
    comparisons_dir = wrapper.comparisons_dir
    if not os.path.isdir(comparisons_dir):
        wrapper.logger.critical("Comparisons directory %s not found. "
                                % (comparisons_dir))
        sys.exit(1)
    # Unfiltered comparisons first, then filtered ones (if available)
    source_dirs = [os.path.join(comparisons_dir, subdir)
                   for subdir in ("combined_comparisons", "filtered_events")]
    groups = wrapper.comparison_groups
    for source_dir in source_dirs:
        if not os.path.isdir(source_dir):
            print("Comparisons directory %s not found, skipping" % (source_dir))
            continue
        for event_type in wrapper.event_types:
            # Dataframes (to be merged later) for the current event type
            frames = []
            labels = []
            event_dir = os.path.join(source_dir, event_type)
            if not os.path.isdir(event_dir):
                wrapper.logger.info("Cannot find event type %s dir, "
                                    "skipping..." % (event_type))
                continue
            # Only sample pairs within each comparison group are used
            for group in groups:
                pairs = utils.get_pairwise_comparisons(group)
                wrapper.logger.info(" - Total of %d comparisons"
                                    % (len(pairs)))
                for first, second in pairs:
                    pair_name = "%s_vs_%s" % (first, second)
                    pair_df = miso_utils.load_miso_bf_file(
                        event_dir, pair_name, substitute_labels=True)
                    if pair_df is None:
                        wrapper.logger.warning("Could not find comparison %s"
                                               % (pair_name))
                        continue
                    frames.append(pair_df)
                    labels.append(pair_name)
            print("Merging comparisons for %s" % (event_type))
            merged = pandas_utils.combine_dfs(frames)
            # NOTE(review): 'output_dir' is not assigned anywhere in this
            # function; presumably a module-level name -- confirm,
            # otherwise this line raises NameError.
            output_filename = os.path.join(output_dir,
                                           "%s.miso_bf" % (event_type))
            wrapper.logger.info("Outputting %s results to: %s"
                                % (event_type, output_filename))
            if dry_run:
                continue
            merged.to_csv(output_filename,
                          float_format="%.4f",
                          sep="\t",
                          na_rep=NA_VAL,
                          index=True)
def load_settings(self):
    """
    Load settings for misowrap.

    Parses self.settings_filename with
    misowrap_settings.load_misowrap_settings() and copies the values
    onto the instance: read/overhang lengths, MISO paths, BAM files,
    sample labels, comparison groups, output and log directories,
    cluster options, the logger, the prefilter flag and the paths to
    the MISO commands and gene tables. The logs directory is created.

    Exits with status 1 if the gene tables directory or the
    constitutive exons GFF does not exist.
    """
    settings_info, parsed_settings = \
        misowrap_settings.load_misowrap_settings(self.settings_filename)
    self.settings_info = settings_info
    # Load basic settings about data
    self.read_len = self.settings_info["settings"]["readlen"]
    self.overhang_len = self.settings_info["settings"]["overhanglen"]
    self.miso_bin_dir = \
        utils.pathify(self.settings_info["settings"]["miso_bin_dir"])
    self.miso_settings_filename = \
        utils.pathify(self.settings_info["settings"]["miso_settings_filename"])
    self.miso_events_dir = \
        utils.pathify(self.settings_info["settings"]["miso_events_dir"])
    self.miso_outdir = \
        utils.pathify(self.settings_info["settings"]["miso_output_dir"])
    # Load data-related parameters
    self.bam_files = self.settings_info["data"]["bam_files"]
    if "insert_lens_dir" in self.settings_info["data"]:
        self.insert_lens_dir = \
            utils.pathify(self.settings_info["data"]["insert_lens_dir"])
    # Sample labels
    self.sample_labels = self.settings_info["data"]["sample_labels"]
    # Set output directories
    self.comparisons_dir = os.path.join(self.output_dir, "comparisons")
    self.comparison_groups = \
        self.settings_info["data"]["comparison_groups"]
    self.logs_outdir = os.path.join(self.output_dir, "misowrap_logs")
    # Create necessary directories
    utils.make_dir(self.logs_outdir)
    if "cluster_type" in self.settings_info["settings"]:
        self.use_cluster = True
        self.cluster_type = \
            self.settings_info["settings"]["cluster_type"]
        self.chunk_jobs = \
            self.settings_info["settings"]["chunk_jobs"]
    if self.use_cluster:
        print("Loading cluster information.")
        # Load cluster object if given a cluster type
        self.load_cluster()
    # Create a logger object
    if self.logger_label is None:
        self.logger_label = "misowrap"
    else:
        # The label lives on the instance; the bare name 'logger_label'
        # used here before is not defined in this method
        self.logger_label = "misowrap_%s" % (self.logger_label)
    self.logger = utils.get_logger(self.logger_label,
                                   self.logs_outdir)
    # Whether to prefilter MISO events
    # Set general default settings
    if "prefilter_miso" not in settings_info["settings"]:
        # By default, set it so that MISO events are not
        # prefiltered
        settings_info["settings"]["prefilter_miso"] = False
    self.prefilter_miso = \
        self.settings_info["settings"]["prefilter_miso"]
    # Load event types
    self.load_event_types()
    # Set path to MISO scripts
    self.compare_miso_cmd = os.path.join(self.miso_bin_dir, "compare_miso")
    self.summarize_miso_cmd = os.path.join(self.miso_bin_dir,
                                           "summarize_miso")
    self.run_events_cmd = os.path.join(self.miso_bin_dir, "miso")
    self.pe_utils_cmd = os.path.join(self.miso_bin_dir, "pe_utils")
    # Files related to gene tables
    self.tables_dir = \
        os.path.join(self.settings_info["pipeline-files"]["init_dir"],
                     "ucsc")
    if not os.path.isdir(self.tables_dir):
        print("Error: %s directory does not exist."
              % (self.tables_dir))
        sys.exit(1)
    self.const_exons_gff = os.path.join(self.tables_dir,
                                        "exons",
                                        "const_exons",
                                        "ensGene.const_exons.gff")
    if not os.path.isfile(self.const_exons_gff):
        print("Error: Const. exons GFF %s does not exist."
              % (self.const_exons_gff))
        sys.exit(1)
def run(
    settings,
    logs_outdir,
    use_cluster=True,
    base_delay=10,
    # Batch delay (20 mins by default)
    batch_delay=60 * 20,
    delay_every_n_jobs=30,
    dry_run=False,
    event_types=None,
):
    """
    Run MISO on a set of samples.

    Builds one MISO command per (BAM file, event type) pair from the
    misowrap settings and either submits it through the cluster
    object or executes it with os.system. Nothing is launched when
    'dry_run' is set. When 'event_types' is given, only those event
    types are processed.
    """
    settings_filename = utils.pathify(settings)
    if event_types is not None:
        # Two spaces: the print statement this replaces emitted a
        # separator after the trailing space of the label
        print("Only running MISO on event types:  %s" % (event_types,))
    wrapper = mw.MISOWrap(settings_filename, logs_outdir, logger_label="run")
    output_dir = wrapper.miso_outdir
    bam_files = wrapper.bam_files
    read_len = wrapper.read_len
    overhang_len = wrapper.overhang_len
    events_dir = wrapper.miso_events_dir
    paired_end = wrapper.insert_lens_dir is not None
    if paired_end:
        insert_lens_dir = wrapper.insert_lens_dir
        wrapper.logger.info("Running in paired-end mode...")
        wrapper.logger.info(" - Insert length directory: %s" % (insert_lens_dir))
    else:
        wrapper.logger.info("Running in single-end mode...")
    miso_exe = wrapper.run_events_cmd
    event_types_dirs = miso_utils.get_event_types_dirs(wrapper.settings_info)
    miso_settings_filename = wrapper.miso_settings_filename
    # Jobs handled since the last long pause
    jobs_seen = 0
    for bam_filename, sample_label in bam_files:
        bam_filename = utils.pathify(bam_filename)
        wrapper.logger.info("Processing: %s" % (bam_filename))
        for event_type_dir in event_types_dirs:
            event_type = os.path.basename(event_type_dir)
            if event_types is not None and event_type not in event_types:
                print("Skipping event type: %s" % (event_type))
                continue
            print(" - Using event dir: %s" % (event_type_dir))
            miso_cmd = "%s" % (miso_exe)
            bam_basename = os.path.basename(bam_filename)
            # Each sample/event type pair gets its own output directory
            sample_output_dir = os.path.join(output_dir, sample_label, event_type)
            miso_cmd += " --run %s %s" % (event_type_dir, bam_filename)
            if paired_end:
                insert_len_filename = os.path.join(
                    insert_lens_dir, "%s.insert_len" % (bam_basename))
                wrapper.logger.info("Reading paired-end parameters "
                                    "from file...")
                wrapper.logger.info(" - PE file: %s" % (insert_len_filename))
                pe_params = miso_utils.read_pe_params(insert_len_filename)
                miso_cmd += " --paired-end %.2f %.2f" % (pe_params["mean"],
                                                         pe_params["sdev"])
            miso_cmd += " --read-len %d" % (read_len)
            miso_cmd += " --overhang-len %d" % (overhang_len)
            if wrapper.prefilter_miso:
                miso_cmd += " --prefilter"
            miso_cmd += " --output-dir %s" % (sample_output_dir)
            if wrapper.use_cluster:
                miso_cmd += " --use-cluster"
                miso_cmd += " --chunk-jobs %d" % (wrapper.chunk_jobs)
            miso_cmd += " --settings %s" % (miso_settings_filename)
            wrapper.logger.info("Executing: %s" % (miso_cmd))
            job_name = "%s_%s" % (sample_label, event_type)
            if not use_cluster:
                if not dry_run:
                    os.system(miso_cmd)
            else:
                if not dry_run:
                    wrapper.my_cluster.launch_job(miso_cmd, job_name, ppn=1)
                if jobs_seen == delay_every_n_jobs:
                    # Longer pause every time n jobs have gone through
                    wrapper.logger.info(
                        "Submitted %d jobs, now waiting %.2f mins."
                        % (jobs_seen, batch_delay / 60.0))
                    time.sleep(batch_delay)
                    jobs_seen = 0
                time.sleep(base_delay)
            jobs_seen += 1
def main():
    """
    Command-line entry point: fetch sequences for the events in a GFF
    file.

    Requires --output-dir (created if missing). When --input-gff is
    given, --fi (FASTA file) is required as well, and
    gffutils_helpers.fetch_seq_from_gff() is called with the parsed
    options. Prints an error and exits with status 1 when a required
    option is missing.
    """
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("--input-gff", dest="input_gff", default=None, nargs=1,
                      help="Fetch sequence from GFF events file. Takes as input: "
                      "GFF filename.")
    parser.add_option("--fi", dest="fasta_fname", default=None, nargs=1,
                      help="FASTA filename to fetch sequences from.")
    parser.add_option("--with-flanking-introns", dest="with_flanking_introns",
                      default=False, action="store_true",
                      help="Get sequence of flanking introns relative to "
                      "skipped exon.")
    parser.add_option("--flanking-introns-coords", dest="flanking_introns_coords",
                      default=None, nargs=4,
                      help="Fetch the sequences of the flanking introns "
                      "(for SpliceGraph events). Takes as input the intervals to "
                      "be used, which are: "
                      "(1) start position relative to 5 prime splice site of SE "
                      "(negative int), "
                      "(2) end position 5 prime splice site (negative int), "
                      "(3) start position relative to 3 prime splice site "
                      "(positive int), "
                      "(4) end position relative to 3 prime splice site. "
                      "(positive int). "
                      "Suggested settings are -250, -20, 20, -250.")
    parser.add_option("--output-dir", dest="output_dir", nargs=1, default=None,
                      help="Output directory.")
    (options, args) = parser.parse_args()

    if options.output_dir is None:
        greeting()
        print("Error: need --output-dir to be provided.")
        sys.exit(1)

    output_dir = utils.pathify(options.output_dir)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    if options.input_gff is not None:
        # Sequences can only be fetched when a FASTA file is given
        if options.fasta_fname is None:
            greeting()
            print("Error: Must provide input fasta file with --fi.")
            sys.exit(1)
        gff_filename = utils.pathify(options.input_gff)
        fasta_fname = utils.pathify(options.fasta_fname)
        gffutils_helpers.fetch_seq_from_gff(
            gff_filename,
            fasta_fname,
            output_dir,
            with_flanking_introns=options.with_flanking_introns,
            flanking_introns_coords=options.flanking_introns_coords)
def combine_comparisons(settings, logs_outdir,
                        common_cols=["isoforms",
                                     "chrom",
                                     "strand",
                                     "mRNA_starts",
                                     "mRNA_ends",
                                     "gene_id",
                                     "gene_symbol"],
                        delay=5,
                        dry_run=False,
                        NA_VAL="NA"):
    """
    Output combined MISO comparisons. For each event type,
    combine the MISO comparisons for the relevant groups
    based on the 'comparison_groups' in the misowrap
    settings file.

    The 'combined_comparisons' and 'filtered_events' subdirectories
    of the comparisons directory are each processed if they exist.
    The merged table of every event type is written as
    '<event_type>.miso_bf' unless 'dry_run' is set. 'common_cols'
    and 'delay' are not used by the body.
    """
    settings_filename = utils.pathify(settings)
    logs_outdir = utils.pathify(logs_outdir)
    utils.make_dir(logs_outdir)
    miso_wrapper = mw.MISOWrap(settings_filename,
                               logs_outdir,
                               logger_label="combine_comparisons")
    comparisons_dir = miso_wrapper.comparisons_dir
    if not os.path.isdir(comparisons_dir):
        miso_wrapper.logger.critical(
            "Comparisons directory %s not found. " % (comparisons_dir))
        sys.exit(1)
    comparison_groups = miso_wrapper.comparison_groups

    def load_event_frames(event_dir):
        # Load every available pairwise comparison table of one event dir
        loaded_dfs = []
        loaded_names = []
        for comp_group in comparison_groups:
            sample_pairs = utils.get_pairwise_comparisons(comp_group)
            miso_wrapper.logger.info(
                " - Total of %d comparisons" % (len(sample_pairs)))
            for sample1, sample2 in sample_pairs:
                name = "%s_vs_%s" % (sample1, sample2)
                bf_df = miso_utils.load_miso_bf_file(event_dir,
                                                     name,
                                                     substitute_labels=True)
                if bf_df is None:
                    miso_wrapper.logger.warning(
                        "Could not find comparison %s" % (name))
                else:
                    loaded_dfs.append(bf_df)
                    loaded_names.append(name)
        return loaded_dfs, loaded_names

    # Unfiltered comparisons, then filtered comparisons (if available)
    for subdir_name in ("combined_comparisons", "filtered_events"):
        curr_dir = os.path.join(comparisons_dir, subdir_name)
        if not os.path.isdir(curr_dir):
            print("Comparisons directory %s not found, skipping" % (curr_dir))
            continue
        for event_type in miso_wrapper.event_types:
            event_dir = os.path.join(curr_dir, event_type)
            if not os.path.isdir(event_dir):
                miso_wrapper.logger.info("Cannot find event type %s dir, "
                                         "skipping..." % (event_type))
                continue
            comparison_dfs, comparison_labels = load_event_frames(event_dir)
            print("Merging comparisons for %s" % (event_type))
            combined_df = pandas_utils.combine_dfs(comparison_dfs)
            # NOTE(review): 'output_dir' is never assigned in this
            # function; presumably defined at module level -- confirm,
            # otherwise this lookup raises NameError.
            output_filename = os.path.join(output_dir,
                                           "%s.miso_bf" % (event_type))
            miso_wrapper.logger.info("Outputting %s results to: %s"
                                     % (event_type, output_filename))
            if not dry_run:
                combined_df.to_csv(output_filename,
                                   float_format="%.4f",
                                   sep="\t",
                                   na_rep=NA_VAL,
                                   index=True)
def annotate_gff_with_genes(args):
    """
    Annotate GFF with genes table.

    Reads args.gff_filename and args.table_filename, intersects the
    'gene' entries of the GFF with the genes table and records the
    "ensg_id", "refseq_id" and "gsymbol" values of every overlapping
    table entry. Each gene record then gets these three attributes
    (comma-joined unique values, an already present value, or "NA")
    and the annotated records are written back to the same GFF
    filename through gffwriter.GFFWriter(..., in_place=True).

    Raises Exception if either input file cannot be found.
    """
    # Attribute names used in the genes table and written to the GFF
    id_keys = ("ensg_id", "refseq_id", "gsymbol")
    gff_fname = utils.pathify(args.gff_filename)
    if not os.path.isfile(gff_fname):
        raise Exception("Cannot find %s" % (gff_fname))
    table_fname = utils.pathify(args.table_filename)
    if not os.path.isfile(table_fname):
        raise Exception("Cannot find %s" % (table_fname))
    table_bed = get_table_as_bedtool(table_fname)
    # Get BedTool for events, containing only the gene entries
    all_events_bed = pybedtools.BedTool(gff_fname)
    event_genes = \
        all_events_bed.filter(lambda entry: entry.fields[2] == "gene")
    print("Determining overlap between events and genes...")
    # Intersect event genes with gene txStart/txEnd
    intersected_bed = \
        event_genes.intersect(table_bed, wb=True, s=True, f=1)
    # Map event genes to their IDs
    #
    #  event_gene1 -> refseq  -> value
    #              -> ensgene -> value
    #  event_gene2 -> refseq  ->
    #  ...
    event_genes_to_info = \
        defaultdict(lambda: defaultdict(list))
    for entry in intersected_bed:
        event_gene_attrs = utils.parse_attributes(entry.fields[8])
        event_gene_str = event_gene_attrs["ID"]
        gene_info_field = entry.fields[-1]
        # Strip semicolon of ID attributes
        if gene_info_field.endswith(";"):
            gene_info_field = gene_info_field[0:-1]
        # Convert attributes into dictionary
        gene_info = utils.parse_attributes(gene_info_field)
        for key in id_keys:
            value = gene_info[key]
            # Skip null entries
            if not is_null_id(value):
                event_genes_to_info[event_gene_str][key].append(value)
    # Incorporate the gene information into the GFF and output
    # it using gffutils
    print("Loading events into GFF database...")
    events_db = gffutils.create_db(gff_fname, ":memory:",
                                   verbose=False)
    output_fname = gff_fname
    events_out = gffwriter.GFFWriter(output_fname,
                                     in_place=True)
    print(" - Outputting annotated GFF to: %s" % (output_fname))

    def new_recs():
        for gene_recs in list(events_db.iter_by_parent_childs()):
            gene_rec = gene_recs[0]
            event_id = gene_rec.id
            for key in id_keys:
                # Use existing IDs if present. The membership test uses
                # the same key that is read; it used to test
                # "ensgene_id"/"gene_symbol" while reading
                # "ensg_id"/"gsymbol".
                if key in gene_rec.attributes:
                    value = gene_rec.attributes[key][0]
                else:
                    value = "NA"
                if event_id in event_genes_to_info:
                    found = \
                        utils.unique_list(event_genes_to_info[event_id][key])
                    if len(found) > 0 and found[0] != "NA":
                        value = ",".join(found)
                gene_rec.attributes[key] = [value]
            # Yield all the gene's records
            for g in gene_recs:
                yield g

    t1 = time.time()
    print("Creating annotated GFF database...")
    annotated_db = gffutils.create_db(new_recs(), ":memory:",
                                      verbose=False)
    t2 = time.time()
    print("Creation took %.2f secs" % (t2 - t1))
    # Write to file
    print("Writing annotated GFF to file...")
    for gene_rec in annotated_db.all_features(featuretype="gene"):
        events_out.write_gene_recs(annotated_db, gene_rec.id)
    events_out.close()
def load_settings(self):
    """
    Load settings for misowrap.

    Parses self.settings_filename with
    misowrap_settings.load_misowrap_settings() and copies the values
    onto the instance: read/overhang lengths, MISO paths, BAM files,
    sample labels, comparison groups, output and log directories,
    cluster options, the logger, the prefilter flag and the paths to
    the MISO scripts and gene tables. The MISO output, comparisons
    and logs directories are created.

    Exits with status 1 if the gene tables directory or the
    constitutive exons GFF does not exist.
    """
    settings_info, parsed_settings = \
        misowrap_settings.load_misowrap_settings(self.settings_filename)
    self.settings_info = settings_info
    # Load basic settings about data
    self.read_len = self.settings_info["settings"]["readlen"]
    self.overhang_len = self.settings_info["settings"]["overhanglen"]
    self.miso_bin_dir = \
        utils.pathify(self.settings_info["settings"]["miso_bin_dir"])
    self.miso_settings_filename = \
        utils.pathify(self.settings_info["settings"]["miso_settings_filename"])
    self.miso_events_dir = \
        utils.pathify(self.settings_info["settings"]["miso_events_dir"])
    self.miso_outdir = \
        utils.pathify(self.settings_info["settings"]["miso_output_dir"])
    # Load data-related parameters
    self.bam_files = self.settings_info["data"]["bam_files"]
    if "insert_lens_dir" in self.settings_info["data"]:
        self.insert_lens_dir = \
            utils.pathify(self.settings_info["data"]["insert_lens_dir"])
    # Sample labels
    self.sample_labels = self.settings_info["data"]["sample_labels"]
    # Set output directories
    self.comparisons_dir = os.path.join(self.output_dir, "comparisons")
    self.comparison_groups = \
        self.settings_info["data"]["comparison_groups"]
    self.logs_outdir = os.path.join(self.output_dir, "misowrap_logs")
    # Create necessary directories
    utils.make_dir(self.miso_outdir)
    utils.make_dir(self.comparisons_dir)
    utils.make_dir(self.logs_outdir)
    if "cluster_type" in self.settings_info["settings"]:
        self.use_cluster = True
        self.cluster_type = \
            self.settings_info["settings"]["cluster_type"]
        self.chunk_jobs = \
            self.settings_info["settings"]["chunk_jobs"]
    if self.use_cluster:
        print("Loading cluster information.")
        # Load cluster object if given a cluster type
        self.load_cluster()
    # Create a logger object
    if self.logger_label is None:
        self.logger_label = "misowrap"
    else:
        # The label lives on the instance; the bare name 'logger_label'
        # used here before is not defined in this method
        self.logger_label = "misowrap_%s" % (self.logger_label)
    self.logger = utils.get_logger(self.logger_label,
                                   self.logs_outdir)
    # Whether to prefilter MISO events
    # Set general default settings
    if "prefilter_miso" not in settings_info["settings"]:
        # By default, set it so that MISO events are not
        # prefiltered
        settings_info["settings"]["prefilter_miso"] = False
    self.prefilter_miso = \
        self.settings_info["settings"]["prefilter_miso"]
    # Load event types
    self.load_event_types()
    # Set path to MISO scripts
    self.run_miso_cmd = os.path.join(self.miso_bin_dir,
                                     "run_miso.py")
    self.run_events_cmd = os.path.join(self.miso_bin_dir,
                                       "run_events_analysis.py")
    self.pe_utils_cmd = os.path.join(self.miso_bin_dir,
                                     "pe_utils.py")
    # Files related to gene tables
    self.tables_dir = \
        os.path.join(self.settings_info["pipeline-files"]["init_dir"],
                     "ucsc")
    if not os.path.isdir(self.tables_dir):
        print("Error: %s directory does not exist."
              % (self.tables_dir))
        sys.exit(1)
    self.const_exons_gff = os.path.join(self.tables_dir,
                                        "exons",
                                        "const_exons",
                                        "ensGene.const_exons.gff")
    if not os.path.isfile(self.const_exons_gff):
        print("Error: Const. exons GFF %s does not exist."
              % (self.const_exons_gff))
        sys.exit(1)
def download_genome_seq(genome, output_dir):
    """
    Download genome sequence files from UCSC.

    Works inside '<output_dir>/genome' (created if needed; the
    process changes its working directory to it). The chromosome
    files are downloaded only when that directory is empty; files
    with '_' in their name are deleted after the download. Any '.gz'
    files present are uncompressed, and, unless '<genome>.fa' already
    exists, the '.fa' files are concatenated into it and indexed with
    'samtools faidx'.

    Exits with status 1 when the uncompress, concatenation or
    indexing command fails.
    """
    print("Downloading genome sequence files for %s" % (genome))
    print(" - Output dir: %s" % (output_dir))
    output_dir = utils.pathify(os.path.join(output_dir, "genome"))
    utils.make_dir(output_dir)
    dir_files = os.listdir(output_dir)
    # Change to output directory
    os.chdir(output_dir)
    ##
    ## Download the genome sequence files
    ##
    genome_url = "%s/%s/chromosomes/" % (UCSC_GOLDENPATH_FTP,
                                         genome)
    # Fetch all chromosome sequence files
    if len(dir_files) >= 1:
        print("Directory %s exists and contains files; "
              "skipping download of genome..."
              % (output_dir))
    else:
        download_utils.wget(os.path.join(genome_url, "*"))
        # Remove random chromosome contigs
        for fname in glob.glob(os.path.join(output_dir, "*.fa.gz")):
            if "_" in os.path.basename(fname):
                print("Deleting: %s" % (fname))
                os.remove(fname)
    ##
    ## Uncompress the files
    ##
    # gunzip fails when the glob matches nothing, which would abort a
    # run on a directory whose files are already uncompressed; only
    # run it when there is something to uncompress
    if glob.glob(os.path.join(output_dir, "*.gz")):
        print("Uncompressing files...")
        uncompress_cmd = "gunzip %s/*.gz" % (output_dir)
        print(" - Uncompress cmd: %s" % (uncompress_cmd))
        t1 = time.time()
        ret_val = os.system(uncompress_cmd)
        if ret_val != 0:
            print("Error: Cannot uncompress files in %s" % (output_dir))
            sys.exit(1)
        t2 = time.time()
        print("Uncompressing took %.2f minutes" % ((t2 - t1) / 60.))
    # Create a single genome FASTA file by concatenating the
    # chromosomes together
    genome_output_fname = \
        os.path.join(output_dir, "%s.fa" % (genome))
    if not os.path.isfile(genome_output_fname):
        print("Concatenating genome chromosomes into one file...")
        print(" - Output file: %s" % (genome_output_fname))
        t1 = time.time()
        concat_chrom_cmd = "cat %s/*.fa > %s" % (output_dir,
                                                 genome_output_fname)
        print(" - Concat cmd: %s" % (concat_chrom_cmd))
        ret_val = os.system(concat_chrom_cmd)
        if ret_val != 0:
            print("Error: Could not concatenate genome chromosomes.")
            sys.exit(1)
        # Create an index for resulting genome file
        print("Indexing genome file...")
        samtools_index_cmd = "samtools faidx %s" % (genome_output_fname)
        print(" - Index cmd: %s" % (samtools_index_cmd))
        ret_val = os.system(samtools_index_cmd)
        if ret_val != 0:
            print("Error: Could not index genome file.")
            sys.exit(1)
        t2 = time.time()
        print("Concatenation and indexing took %.2f minutes"
              % ((t2 - t1) / 60.))
def main():
    """
    Command-line entry point of the pipeline.

    Requires --output-dir (created if missing). Depending on the
    options, runs the whole pipeline (--run), runs it on one sample
    (--run-on-sample) or initializes the pipeline files for a genome
    (--init). --run and --run-on-sample require --settings. Prints an
    error plus the usage and exits with status 1 when a required
    option is missing.
    """
    from optparse import OptionParser
    parser = OptionParser()

    parser.add_option("--run", dest="run", action="store_true",
                      default=False, help="Run pipeline.")
    parser.add_option("--run-on-sample", dest="run_on_sample", nargs=1,
                      default=None,
                      help="Run on a particular sample. Takes as input the "
                      "sample label.")
    parser.add_option("--settings", dest="settings", nargs=1,
                      default=None, help="Settings filename.")
    parser.add_option("--init", dest="initialize", nargs=1, default=None,
                      help="Initialize the pipeline. Takes as input a genome, "
                      "e.g. mm9 or hg18")
    parser.add_option("--output-dir", dest="output_dir", nargs=1,
                      default=None, help="Output directory.")
    ##
    ## Options related to --init
    ##
    parser.add_option("--frac-constitutive", dest="frac_constitutive",
                      nargs=1, default=0.7, type="float",
                      help="Fraction (number between 0 and 1) of " \
                      "transcripts that an exon can be in to be considered " \
                      "constitutive. Default is 0.7 (i.e. 70% of " \
                      "transcripts.)")
    parser.add_option("--constitutive-exon-diff",
                      dest="constitutive_exon_diff",
                      nargs=1, default=10, type="int",
                      help="Number of \'wiggle\' bases by which an exon can " \
                      "differ in order to be considered constitutive. By " \
                      "default set to 10.")
    (options, args) = parser.parse_args()

    greeting()

    if options.output_dir is None:
        print("Error: need --output-dir argument.")
        parser.print_help()
        sys.exit(1)

    output_dir = utils.pathify(options.output_dir)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    def require_settings():
        # Running of pipeline requires settings filename
        if options.settings is None:
            print("Error: need --settings")
            parser.print_help()
            sys.exit(1)
        return utils.pathify(options.settings)

    settings_filename = None
    if options.run:
        settings_filename = require_settings()
        run_pipeline(settings_filename,
                     output_dir)

    if options.run_on_sample is not None:
        settings_filename = require_settings()
        sample_label = options.run_on_sample
        run_on_sample(sample_label, settings_filename,
                      output_dir)

    if options.initialize is not None:
        # Parse initialization-related settings
        frac_constitutive = float(options.frac_constitutive)
        constitutive_exon_diff = int(options.constitutive_exon_diff)
        init_params = {
            "frac_constitutive": frac_constitutive,
            "constitutive_exon_diff": constitutive_exon_diff,
        }
        genome = options.initialize
        initialize_pipeline(genome,
                            output_dir,
                            init_params=init_params)
def filter_comparisons(fname, output_dir,
                       event_type=None,
                       atleast_inc=None,
                       atleast_exc=None,
                       atleast_sum=None,
                       gene_table=None,
                       gene_id_cols=["ensg_id", "gsymbol"],
                       dry_run=False):
    """
    Filter a MISO comparison file (*.miso_bf) by read counts.

    Picks default read-count thresholds from the event type
    (inclusion, exclusion and sum of both), lets the explicit
    'atleast_*' arguments override them, and then filters a
    dataframe of comparison counts on the inclusion, exclusion,
    summed and constitutive read counts of the two samples.

    'gene_table', 'gene_id_cols' and 'dry_run' are not used by the
    body. Nothing is returned; the filtered dataframe is assigned to
    self.filtered_events[event_type].

    NOTE(review): this function looks unfinished. It reads 'self' and
    'comparisons_df', neither of which is a parameter or assigned
    here, so unless they exist at module level the filtering part
    raises NameError. The original indentation of this block was
    lost; the nesting below is a reconstruction -- confirm.
    """
    fname = utils.pathify(fname)
    output_dir = utils.pathify(output_dir)
    print "Filtering MISO comparisons file..."
    print " - MISO comparisons: %s" %(fname)
    print " - Event type: %s" %(event_type)
    if event_type is not None:
        output_dir = os.path.join(output_dir, event_type)
    utils.make_dir(output_dir)
    print " - Output dir: %s" %(output_dir)
    # Default thresholds per event type. The *_atleast_* names are not
    # defined in this function; presumably module-level constants --
    # confirm.
    # NOTE(review): 'event_type' defaults to None, and '"UTR" in None'
    # raises TypeError -- confirm that callers always pass it.
    if "UTR" in event_type:
        def_atleast_inc = tandemutr_atleast_inc
        def_atleast_exc = tandemutr_atleast_exc
        def_atleast_sum = tandemutr_atleast_sum
    elif "SE" in event_type:
        def_atleast_inc = se_atleast_inc
        def_atleast_exc = se_atleast_exc
        def_atleast_sum = se_atleast_sum
    elif "AFE" in event_type:
        def_atleast_inc = afe_atleast_inc
        def_atleast_exc = afe_atleast_exc
        def_atleast_sum = afe_atleast_sum
    elif "ALE" in event_type:
        def_atleast_inc = ale_atleast_inc
        def_atleast_exc = ale_atleast_exc
        def_atleast_sum = ale_atleast_sum
    elif "RI" in event_type:
        def_atleast_inc = ri_atleast_inc
        def_atleast_exc = ri_atleast_exc
        def_atleast_sum = ri_atleast_sum
    else:
        def_atleast_inc = 0
        def_atleast_exc = 0
        def_atleast_sum = 0
    # If read count filters are not given, use the default
    if atleast_inc is None:
        atleast_inc = def_atleast_inc
    if atleast_exc is None:
        atleast_exc = def_atleast_exc
    if atleast_sum is None:
        atleast_sum = def_atleast_sum
    # Filter the events file
    if not os.path.isfile(fname):
        print "Error: Cannot find MISO comparisons file %s" %(fname)
        sys.exit(1)
    if not fname.endswith(".miso_bf"):
        print "Warning: MISO comparisons file %s does not end in " \
              ".miso_bf.  Are you sure it is a comparisons file?" \
              %(fname)
    # Filter comparisons
    # ...
    filtered_df = None
    # NOTE(review): 'self' and 'comparisons_df' are undefined in this
    # function (see the docstring).
    comparison_counts = \
        self.load_comparisons_counts_from_df(comparisons_df[event_type])
    # Get counts for each read class for sample 1 and sample 2
    comparison_counts = \
        miso_utils.get_counts_by_class("sample1_counts_int",
                                       "sample1",
                                       comparison_counts)
    comparison_counts = \
        miso_utils.get_counts_by_class("sample2_counts_int",
                                       "sample2",
                                       comparison_counts)
    filtered_df = comparison_counts
    # Filter exclusion reads
    # Only apply this to events other than TandemUTRs!
    # NOTE(review): 'atleast_const' is assumed to belong to this branch;
    # if so it is undefined for every other event type when the
    # constitutive filter below runs -- confirm the intended nesting.
    if "TandemUTR" in event_type:
        atleast_exc = 0
        atleast_const = 5
    # NOTE(review): in every filter below '|' binds tighter than '>=',
    # so each condition is evaluated as '(sample1 | sample2) >= threshold'
    # (a bitwise OR of the two count columns compared to the threshold).
    # Presumably '(sample1 >= threshold) | (sample2 >= threshold)' was
    # intended -- confirm before relying on the results.
    # Filter inclusion reads
    filtered_df = \
        filtered_df[filtered_df["sample1_inc_counts"] \
                    | filtered_df["sample2_inc_counts"] \
                    >= atleast_inc]
    # Filter exclusion reads
    filtered_df = \
        filtered_df[filtered_df["sample1_exc_counts"] \
                    | filtered_df["sample2_exc_counts"] \
                    >= atleast_exc]
    # Filter the sum of inclusion and exclusion reads
    sample1_sum = \
        filtered_df["sample1_inc_counts"] + \
        filtered_df["sample1_exc_counts"]
    sample2_sum = \
        filtered_df["sample2_inc_counts"] + \
        filtered_df["sample2_exc_counts"]
    filtered_df = \
        filtered_df[sample1_sum | sample2_sum >= atleast_sum]
    # Filter constitutive reads
    filtered_df = \
        filtered_df[filtered_df["sample1_const_counts"] \
                    | filtered_df["sample2_const_counts"] \
                    >= atleast_const]
    self.filtered_events[event_type] = filtered_df
def run(settings, logs_outdir,
        use_cluster=True,
        base_delay=10,
        # Batch delay (20 mins by default)
        batch_delay=60*20,
        delay_every_n_jobs=30,
        dry_run=False,
        event_types=None):
    """
    Run MISO on a set of samples.

    One MISO command is built (and logged) for every combination of
    BAM file and event type directory taken from the settings.

    Parameters:
      - settings: path to the misowrap settings file.
      - logs_outdir: logs directory, handed to mw.MISOWrap.
      - use_cluster: if True, submit each command with
        misowrap_obj.my_cluster.launch_job; otherwise run it with
        os.system.  Note that the '--use-cluster' flag added to the
        MISO command itself is controlled by the settings
        (misowrap_obj.use_cluster), not by this argument.
      - base_delay: seconds to sleep after every cluster submission.
      - batch_delay: seconds to sleep whenever the job counter reaches
        delay_every_n_jobs (the counter is then reset).
      - delay_every_n_jobs: counter value that triggers batch_delay.
      - dry_run: if True, commands are only built and logged; nothing
        is submitted or executed, and no throttling delay is applied.
      - event_types: optional collection of event type names; event
        directories whose basename is not in it are skipped.
    """
    settings_filename = utils.pathify(settings)
    if event_types is not None:
        # Same text as the two-item print statement this replaces.
        print("Only running MISO on event types:  %s" % (event_types,))
    misowrap_obj = mw.MISOWrap(settings_filename,
                               logs_outdir,
                               logger_label="run")
    output_dir = misowrap_obj.miso_outdir
    bam_files = misowrap_obj.bam_files
    read_len = misowrap_obj.read_len
    overhang_len = misowrap_obj.overhang_len
    single_end = True
    if misowrap_obj.insert_lens_dir is not None:
        insert_lens_dir = misowrap_obj.insert_lens_dir
        misowrap_obj.logger.info("Running in paired-end mode...")
        misowrap_obj.logger.info(" - Insert length directory: %s"
                                 % (insert_lens_dir))
        single_end = False
    else:
        misowrap_obj.logger.info("Running in single-end mode...")
    run_events_analysis = misowrap_obj.run_events_cmd
    event_types_dirs = \
        miso_utils.get_event_types_dirs(misowrap_obj.settings_info)
    miso_settings_filename = misowrap_obj.miso_settings_filename
    # Number of jobs handled since the last batch delay
    n = 0
    for bam_input in bam_files:
        bam_filename, sample_label = bam_input
        bam_filename = utils.pathify(bam_filename)
        misowrap_obj.logger.info("Processing: %s" % (bam_filename))
        for event_type_dir in event_types_dirs:
            event_type = os.path.basename(event_type_dir)
            if event_types is not None:
                if event_type not in event_types:
                    print("Skipping event type: %s" % (event_type))
                    continue
            print(" - Using event dir: %s" % (event_type_dir))
            miso_cmd = "%s" % (run_events_analysis)
            bam_basename = os.path.basename(bam_filename)
            # Output directory for sample
            sample_output_dir = os.path.join(output_dir,
                                             sample_label,
                                             event_type)
            # Pass sample to MISO along with event
            miso_cmd += " --run %s %s" % (event_type_dir,
                                          bam_filename)
            if not single_end:
                insert_len_filename = \
                    os.path.join(insert_lens_dir,
                                 "%s.insert_len" % (bam_basename))
                misowrap_obj.logger.info("Reading paired-end parameters "
                                         "from file...")
                misowrap_obj.logger.info(" - PE file: %s"
                                         % (insert_len_filename))
                pe_params = miso_utils.read_pe_params(insert_len_filename)
                # Paired-end parameters
                miso_cmd += " --paired-end %.2f %.2f" % (pe_params["mean"],
                                                         pe_params["sdev"])
            # Read length
            miso_cmd += " --read-len %d" % (read_len)
            # Overhang length
            miso_cmd += " --overhang-len %d" % (overhang_len)
            # Prefilter?
            if misowrap_obj.prefilter_miso:
                miso_cmd += " --prefilter"
            # Output directory
            miso_cmd += " --output-dir %s" % (sample_output_dir)
            # Use cluster
            if misowrap_obj.use_cluster:
                miso_cmd += " --use-cluster"
                miso_cmd += " --chunk-jobs %d" % (misowrap_obj.chunk_jobs)
            # Settings
            miso_cmd += " --settings %s" % (miso_settings_filename)
            misowrap_obj.logger.info("Executing: %s" % (miso_cmd))
            job_name = "%s_%s" % (sample_label, event_type)
            # The delays exist to throttle real submissions, so a dry
            # run must not sleep: nothing was sent to the cluster.
            if not dry_run:
                if use_cluster:
                    misowrap_obj.my_cluster.launch_job(miso_cmd,
                                                       job_name,
                                                       ppn=1)
                    if n == delay_every_n_jobs:
                        # Larger delay everytime we've submitted n jobs
                        misowrap_obj.logger.info(
                            "Submitted %d jobs, now waiting %.2f mins."
                            % (n, batch_delay / 60.))
                        time.sleep(batch_delay)
                        n = 0
                    time.sleep(base_delay)
                else:
                    os.system(miso_cmd)
            n += 1
# NOTE(review): the statements between this note and main() do not belong
# to any definition visible here.  They start in the middle of a statement
# (the first line looks like the tail of a 'sample2_sum = ... + \'
# expression) and duplicate the end of the read-count filtering code, then
# refer to names ('self', 'get_genes_from_gff', 'sample1_sum', ...) that
# are not bound in this scope.  Presumably a leftover from a merge or
# paste -- confirm and remove, or re-attach to the function they came from.
filtered_df["sample2_exc_counts"]
filtered_df = \
    filtered_df[sample1_sum | sample2_sum >= atleast_sum]
# Filter constitutive reads
filtered_df = \
    filtered_df[filtered_df["sample1_const_counts"] \
                | filtered_df["sample2_const_counts"] \
                >= atleast_const]
self.filtered_events[event_type] = filtered_df
if not dry_run:
    # Call filtered comparisons here
    pass
# Add gene information
if get_genes_from_gff is not None:
    gene_table_fname = utils.pathify(get_genes_from_gff)
    print "Adding gene information from %s" %(gene_table_fname)
    if not os.path.isfile(gene_table_fname):
        print "Error: GFF file %s not found." %(gene_table_fname)
        sys.exit(1)
    events_to_genes = get_events_to_genes(gene_table_fname)

def main():
    """
    Command-line entry point: hand the filter_comparisons and
    combine_comparisons functions to argh.dispatch_commands.
    """
    # NOTE(review): combine_comparisons is not defined in the part of the
    # file visible here -- verify that it exists.
    argh.dispatch_commands([
        filter_comparisons,
        combine_comparisons
    ])
def compare(settings, logs_outdir,
            delay=5,
            dry_run=False):
    """
    Run a MISO samples comparison between all pairs of samples.

    For every comparison group in the settings, every pair of samples
    returned by utils.get_pairwise_comparisons is compared along each
    event type.  Each comparison command is logged before it is run.

    Parameters:
      - settings: path to the misowrap settings file.
      - logs_outdir: logs directory, handed to mw.MISOWrap.
      - delay: seconds to sleep after each cluster submission.
      - dry_run: if True, commands are only built and logged; nothing
        is submitted or executed, and no delay is applied.

    Whether commands go to the cluster or are run locally with
    os.system is decided by the settings (misowrap_obj.use_cluster).
    """
    settings_filename = utils.pathify(settings)
    misowrap_obj = mw.MISOWrap(settings_filename,
                               logs_outdir,
                               logger_label="compare")
    miso_output_dir = misowrap_obj.miso_outdir
    comparison_groups = misowrap_obj.comparison_groups
    comparisons_dir = misowrap_obj.comparisons_dir
    utils.make_dir(comparisons_dir)
    misowrap_obj.logger.info("Running MISO comparisons...")
    ##
    ## Compute comparisons between all pairs
    ## in a sample group
    ##
    for comp_group in comparison_groups:
        sample_pairs = utils.get_pairwise_comparisons(comp_group)
        print(" - Total of %d comparisons" % (len(sample_pairs)))
        for sample1, sample2 in sample_pairs:
            # For each pair of samples, compare their output
            # along each event type
            misowrap_obj.logger.info("Comparing %s %s" % (sample1,
                                                          sample2))
            # Directories for each sample
            sample1_dir = os.path.join(miso_output_dir,
                                       sample1)
            sample2_dir = os.path.join(miso_output_dir,
                                       sample2)
            for event_type in misowrap_obj.event_types:
                sample1_event_dir = os.path.join(sample1_dir,
                                                 event_type)
                sample2_event_dir = os.path.join(sample2_dir,
                                                 event_type)
                job_name = "compare_%s_%s_%s" % (sample1,
                                                 sample2,
                                                 event_type)
                event_comparisons_dir = \
                    os.path.join(comparisons_dir,
                                 event_type)
                compare_cmd = "%s --compare-samples %s %s %s " \
                              "--comparison-labels %s %s" \
                              % (misowrap_obj.compare_miso_cmd,
                                 sample1_event_dir,
                                 sample2_event_dir,
                                 event_comparisons_dir,
                                 sample1,
                                 sample2)
                misowrap_obj.logger.info("Executing: %s" % (compare_cmd))
                if dry_run:
                    # Nothing is launched, so there is nothing to
                    # throttle either.
                    continue
                if misowrap_obj.use_cluster:
                    misowrap_obj.my_cluster.launch_job(compare_cmd,
                                                       job_name,
                                                       ppn=1)
                    # Space out consecutive submissions
                    time.sleep(delay)
                else:
                    os.system(compare_cmd)
def run_miso_on_samples(settings_filename, output_dir,
                        use_cluster=True,
                        delay=120):
    """
    Launch MISO for every BAM file / event type combination.

    The samples, read parameters and event type directories all come
    from a MISOWrap object built from 'settings_filename'.  Results of
    each run go to <output_dir>/<sample label>/<event type>.

    With use_cluster set, every command is submitted through
    my_cluster.launch_job and followed by a pause of 'delay' seconds;
    otherwise the command is run directly with os.system.
    """
    wrapper = MISOWrap(settings_filename,
                       output_dir,
                       logger_label="run")
    samples = wrapper.bam_files
    read_len = wrapper.read_len
    overhang_len = wrapper.overhang_len
    # Read but not used further down (kept from the original code)
    events_dir = wrapper.miso_events_dir
    # Paired-end mode is switched on by the presence of an
    # insert lengths directory in the settings
    insert_lens_dir = wrapper.insert_lens_dir
    paired_end = insert_lens_dir is not None
    if paired_end:
        wrapper.logger.info("Running in paired-end mode...")
        wrapper.logger.info(" - Insert length directory: %s"
                            % (insert_lens_dir))
    else:
        wrapper.logger.info("Running in single-end mode...")
    events_cmd = wrapper.run_events_cmd
    event_dirs = miso_utils.get_event_types_dirs(wrapper.settings_info)
    miso_settings = wrapper.miso_settings_filename
    for raw_bam, sample_label in samples:
        bam_path = utils.pathify(raw_bam)
        bam_name = os.path.basename(bam_path)
        wrapper.logger.info("Processing: %s" % (bam_path))
        for event_dir in event_dirs:
            event_type = os.path.basename(event_dir)
            print(" - Using event dir: %s" % (event_dir))
            # Where the results for this sample / event type go
            sample_outdir = os.path.join(output_dir,
                                         sample_label,
                                         event_type)
            # Collect the pieces of the command line in order;
            # they are glued together once all options are known
            pieces = ["%s" % (events_cmd)]
            pieces.append(" --compute-genes-psi %s %s" % (event_dir,
                                                          bam_path))
            if paired_end:
                pe_filename = os.path.join(insert_lens_dir,
                                           "%s.insert_len" % (bam_name))
                wrapper.logger.info("Reading paired-end parameters "
                                    "from file...")
                wrapper.logger.info(" - PE file: %s" % (pe_filename))
                pe_params = miso_utils.read_pe_params(pe_filename)
                pieces.append(" --paired-end %.2f %.2f"
                              % (pe_params["mean"], pe_params["sdev"]))
            pieces.append(" --read-len %d" % (read_len))
            pieces.append(" --overhang-len %d" % (overhang_len))
            if wrapper.prefilter_miso:
                pieces.append(" --prefilter")
            pieces.append(" --output-dir %s" % (sample_outdir))
            if wrapper.use_cluster:
                pieces.append(" --use-cluster")
                pieces.append(" --chunk-jobs %d" % (wrapper.chunk_jobs))
            pieces.append(" --settings %s" % (miso_settings))
            miso_cmd = "".join(pieces)
            wrapper.logger.info("Executing: %s" % (miso_cmd))
            job_name = "%s_%s" % (sample_label, event_type)
            if not use_cluster:
                os.system(miso_cmd)
            else:
                wrapper.my_cluster.launch_job(miso_cmd, job_name)
                time.sleep(delay)