def output_global_alignment(kmers_fname, output_dir):
    """
    Write a global alignment (*.aln) for a FASTA file of kmers by
    shelling out to clustalw.

    Parameters:
    -----------
    kmers_fname : filename of FASTA file containing kmers
    output_dir : output directory (created with utils.make_dir)

    Returns the alignment filename: the basename of kmers_fname with
    ".aln" appended, placed in output_dir. If that file is already
    present, clustalw is not run again.
    """
    utils.make_dir(output_dir)
    aln_basename = "%s.aln" % (os.path.basename(kmers_fname))
    output_fname = os.path.join(output_dir, aln_basename)
    if os.path.isfile(output_fname):
        print("Alignment filename %s exists. Skipping..." % (output_fname))
        # Honor the message: without this return the alignment was
        # recomputed even though it had been reported as skipped.
        return output_fname
    clustalw_cmd = \
        "clustalw -INFILE=%s -OUTFILE=%s -PIM" % (kmers_fname, output_fname)
    print("Executing: %s" % (clustalw_cmd))
    t1 = time.time()
    # NOTE: the exit status of clustalw is not inspected here.
    os.system(clustalw_cmd)
    t2 = time.time()
    print("Global alignment took %.2f minutes." % ((t2 - t1) / 60.))
    return output_fname
def sanitize_splicegraph_events(genome, event_type, splicegraph_dir,
                                output_dir):
    """
    Sanitize and annotate old SpliceGraph events before merging
    with new events.

    The input is <splicegraph_dir>/<genome>/<event_type>.<genome>.gff3;
    the sanitized copy is written under <output_dir>/<genome>/ with the
    same basename and then annotated in place via
    gffutils_helpers.annotate_gff.

    Returns None. Does nothing (beyond printing) when the input is
    missing or the sanitized file already exists. Raises Exception when
    the sanitize command exits with a non-zero status.
    """
    gff_basename = "%s.%s.gff3" % (event_type, genome)
    gff_fname = os.path.join(splicegraph_dir, genome, gff_basename)
    if not os.path.isfile(gff_fname):
        print("Cannot find %s" % (gff_fname))
        return
    # Make output directory for sanitized files
    output_dir = os.path.join(output_dir, genome)
    utils.make_dir(output_dir)
    print("Sanitizing: %s" % (gff_fname))
    output_fname = os.path.join(output_dir, os.path.basename(gff_fname))
    if os.path.isfile(output_fname):
        print("%s already exists, skipping" % (output_fname))
        return
    sanitize_cmd = \
        "gffutils-cli sanitize %s > %s" % (gff_fname, output_fname)
    ret_val = os.system(sanitize_cmd)
    if ret_val != 0:
        # The shell redirection creates the output file even when the
        # command fails. Remove it so that the "already exists" check
        # above does not treat a partial file as finished on a rerun.
        if os.path.isfile(output_fname):
            os.remove(output_fname)
        raise Exception("Sanitize command failed.")
    # Now that it is sanitized, annotate it
    print("Annotating GFF...")
    gffutils_helpers.annotate_gff(output_fname, genome)
def get_const_exons(gff_filename, output_filename, base_diff=5):
    """
    Get constitutive exons for GFF filename and write them, as GFF
    records, to output_filename.

    - gff_filename: input GFF file
    - output_filename: file to write the constitutive exons to; its
      parent directory is created if needed
    - base_diff: Number of bases +/- that can be omitted for an exon
      to be considered constitutive. NOTE(review): this value is not
      passed on by this function; confirm whether
      const_exons_from_mRNAs should receive it.
    """
    print("Getting constitutive exons from: %s" % (gff_filename))
    dir_name = os.path.dirname(output_filename)
    # dirname is "" for a bare filename; there is nothing to create then
    if dir_name and not os.path.isdir(dir_name):
        utils.make_dir(dir_name)
    print("Loading GFF file...")
    gff_db = gff_utils.GFFDatabase(from_filename=gff_filename,
                                   reverse_recs=True)
    print("Done loading.")
    # The context manager closes the output even if a record fails
    with open(output_filename, "w") as out_file:
        gff_out = gff_utils.GFFWriter(out_file)
        for gene, mRNAs in gff_db.mRNAs_by_gene.iteritems():
            # Get constitutive exons from the current set of mRNAs.
            # The database loaded above is passed in; the name
            # 'gff_in' used here previously was never defined.
            const_exons = const_exons_from_mRNAs(gff_db, mRNAs)
            for exon_rec in const_exons:
                # Write exons to file
                gff_out.write_rec(exon_rec)
def output_dinuc_enriched_kmers(logger, fasta_fname, output_dir, kmer_lens,
                                num_shuffles=100):
    """
    For each kmer length, write the kmers of a FASTA file that are
    enriched relative to dinucleotide-shuffled versions of it.

    One "<fasta basename>.<k>_kmers.counts" file is produced per kmer
    length in output_dir; lengths whose file already exists are skipped.
    Shuffled FASTA files go under <output_dir>/shuffled_fasta.
    """
    logger.info("Output dinucleotide enriched Kmers..")
    logger.info(" - Input FASTA: %s" % (fasta_fname))
    logger.info(" - Output dir: %s" % (output_dir))
    utils.make_dir(output_dir)
    # Directory holding the shuffled versions of the FASTA
    shuffled_dir = os.path.join(output_dir, "shuffled_fasta")
    utils.make_dir(shuffled_dir)
    shuffled_fasta = ShuffledFasta(fasta_fname, shuffled_dir)
    for k in kmer_lens:
        kmers = Kmers(k,
                      fasta_fname=fasta_fname,
                      shuffled_fasta=shuffled_fasta)
        counts_basename = \
            "%s.%d_kmers.counts" % (os.path.basename(fasta_fname), k)
        enrichment_fname = os.path.join(output_dir, counts_basename)
        logger.info("Outputting enriched Kmers to: %s" % (enrichment_fname))
        if os.path.isfile(enrichment_fname):
            logger.info("Found %s, skipping.. " % (enrichment_fname))
            continue
        # Compute the enriched kmers, then write them out
        results = kmers.get_enriched_kmers(output_dir,
                                           num_shuffles=num_shuffles)
        kmers.output_enriched_kmers(results, enrichment_fname)
def output_table_seqs(table_gff_fname, fi_fname, output_dir):
    """
    Output the sequences of the entries of a GFF table to a FASTA file.

    - table_gff_fname: input GFF table
    - fi_fname: genome FASTA filename handed to BedTool.sequence as 'fi'
    - output_dir: output directory (created if needed)

    Returns the output FASTA filename (<table basename>.fa in
    output_dir). If that file already exists nothing is recomputed.
    A BEDToolsError raised during extraction is reported but not
    re-raised, so the returned file may be missing or incomplete.
    """
    print("Outputting sequences from GFF table...")
    print(" - Input GFF table: %s" % (table_gff_fname))
    print(" - Genome FASTA index: %s" % (fi_fname))
    print(" - Output dir: %s" % (output_dir))
    utils.make_dir(output_dir)
    table_basename = os.path.basename(table_gff_fname).rsplit(".", 1)[0]
    output_fname = os.path.join(output_dir, "%s.fa" % (table_basename))
    print(" - Output file: %s" % (output_fname))
    if os.path.isfile(output_fname):
        print("Found %s. Skipping..." % (output_fname))
        return output_fname
    entries = pybedtools.BedTool(table_gff_fname)

    def fields2name(f):
        """
        Replace the GFF featuretype field with the entry's
        coordinates followed by its attributes field.
        """
        custom_field = "%s:%s-%s:%s" % (f.chrom, f.start, f.stop, f.strand)
        f[2] = "%s;%s" % (custom_field, f[-1])
        return f

    # Output sequences as FASTA
    try:
        entries.each(fields2name).sequence(fi=fi_fname,
                                           fo=output_fname,
                                           s=True,
                                           name=True)
    except pybedtools.helpers.BEDToolsError as bedtools_err:
        # Stay best-effort, but do not hide the problem from the user
        print("Warning: BEDTools error while outputting sequences "
              "to %s: %s" % (output_fname, bedtools_err))
    return output_fname
def output_intron_table(tables_dir, intron_gff_fname, output_dir):
    """
    Output a table of introns. Just adds length ("region_len") and
    gene ("gene_id") information to each entry of the intron GFF.

    - tables_dir: directory handed to trans_to_gene_from_table to get
      the transcript -> gene mapping
    - intron_gff_fname: input GFF of introns
    - output_dir: output directory (created if needed)

    Returns the output filename (<intron basename>.gff in output_dir).
    Raises Exception if intron_gff_fname does not exist. A KeyError
    propagates if a parent transcript is absent from the mapping.
    """
    print("Outputting intron table from %s" % (intron_gff_fname))
    output_basename = os.path.basename(intron_gff_fname).rsplit(".", 1)[0]
    utils.make_dir(output_dir)
    output_fname = os.path.join(output_dir, "%s.gff" % (output_basename))
    print(" - Output file: %s" % (output_fname))
    if not os.path.isfile(intron_gff_fname):
        raise Exception("Cannot find %s" % (intron_gff_fname))
    # Build the mapping once. It used to be computed twice, next to a
    # pandas load of the combined table whose result was never used.
    trans_to_gene = trans_to_gene_from_table(tables_dir)
    intron_entries = pybedtools.BedTool(intron_gff_fname)
    # The context manager closes the output even if an entry fails
    with open(output_fname, "w") as output_file:
        for entry in intron_entries:
            transcripts = entry.attrs["Parent"].split(",")
            genes_str = \
                ",".join([trans_to_gene[trans] for trans in transcripts])
            entry.attrs["gene_id"] = genes_str
            entry.attrs["region_len"] = str(len(entry))
            output_file.write(str(entry))
    return output_fname
def output_table_seqs(table_gff_fname, fi_fname, output_dir):
    """
    Output the sequences of the entries of a GFF table to a FASTA file.

    - table_gff_fname: input GFF table
    - fi_fname: genome FASTA filename handed to BedTool.sequence as 'fi'
    - output_dir: output directory (created if needed)

    Returns the output FASTA filename (<table basename>.fa in
    output_dir). If that file already exists nothing is recomputed.
    A BEDToolsError raised during extraction is reported but not
    re-raised, so the returned file may be missing or incomplete.
    """
    print("Outputting sequences from GFF table...")
    print(" - Input GFF table: %s" % (table_gff_fname))
    print(" - Genome FASTA index: %s" % (fi_fname))
    print(" - Output dir: %s" % (output_dir))
    utils.make_dir(output_dir)
    table_basename = os.path.basename(table_gff_fname).rsplit(".", 1)[0]
    output_fname = os.path.join(output_dir, "%s.fa" % (table_basename))
    print(" - Output file: %s" % (output_fname))
    if os.path.isfile(output_fname):
        print("Found %s. Skipping..." % (output_fname))
        return output_fname
    entries = pybedtools.BedTool(table_gff_fname)

    def fields2name(f):
        """
        Replace the GFF featuretype field with the entry's
        coordinates followed by its attributes field.
        """
        custom_field = "%s:%s-%s:%s" % (f.chrom, f.start, f.stop, f.strand)
        f[2] = "%s;%s" % (custom_field, f[-1])
        return f

    # Output sequences as FASTA
    try:
        entries.each(fields2name).sequence(fi=fi_fname,
                                           fo=output_fname,
                                           s=True,
                                           name=True)
    except pybedtools.helpers.BEDToolsError as bedtools_err:
        # Stay best-effort, but do not hide the problem from the user
        print("Warning: BEDTools error while outputting sequences "
              "to %s: %s" % (output_fname, bedtools_err))
    return output_fname
def download_ucsc_tables(genome, output_dir):
    """
    Fetch the UCSC tables listed by get_ucsc_tables_urls(genome) into
    <output_dir>/ucsc and gunzip each one.

    A table is not downloaded again when its uncompressed file is
    already present; a failed download is reported and skipped.
    """
    tables_outdir = os.path.join(output_dir, "ucsc")
    utils.make_dir(tables_outdir)
    print("Download UCSC tables...")
    print(" - Output dir: %s" % (tables_outdir))
    for table_label, table_url in get_ucsc_tables_urls(genome):
        print("Downloading %s" % (table_label))
        table_filename = os.path.join(tables_outdir, table_label)
        # Uncompressed name: drop the last three characters
        # (presumably a ".gz" suffix -- TODO confirm against the labels)
        unzipped_table_fname = table_filename[:-3]
        if os.path.isfile(unzipped_table_fname):
            # Uncompressed table is already here, nothing to fetch
            print("Got %s already. Skipping download.."
                  % (unzipped_table_fname))
            continue
        status = download_utils.download_url(table_url, tables_outdir)
        if status is None:
            print("Failed to get %s, skipping.." % (table_label))
            continue
        # Uncompress the freshly downloaded table
        utils.gunzip_file(table_filename, tables_outdir)
def output_global_alignment(kmers_fname, output_dir):
    """
    Write a global alignment (*.aln) for a FASTA file of kmers by
    shelling out to clustalw.

    Parameters:
    -----------
    kmers_fname : filename of FASTA file containing kmers
    output_dir : output directory (created with utils.make_dir)

    Returns the alignment filename: the basename of kmers_fname with
    ".aln" appended, placed in output_dir. If that file is already
    present, clustalw is not run again.
    """
    utils.make_dir(output_dir)
    aln_basename = "%s.aln" % (os.path.basename(kmers_fname))
    output_fname = os.path.join(output_dir, aln_basename)
    if os.path.isfile(output_fname):
        print("Alignment filename %s exists. Skipping..." % (output_fname))
        # Honor the message: without this return the alignment was
        # recomputed even though it had been reported as skipped.
        return output_fname
    clustalw_cmd = \
        "clustalw -INFILE=%s -OUTFILE=%s -PIM" % (kmers_fname, output_fname)
    print("Executing: %s" % (clustalw_cmd))
    t1 = time.time()
    # NOTE: the exit status of clustalw is not inspected here.
    os.system(clustalw_cmd)
    t2 = time.time()
    print("Global alignment took %.2f minutes." % ((t2 - t1) / 60.))
    return output_fname
def main():
    """
    Print the gff_make_annotation command for each genome, annotate and
    clean the "commonshortest" event GFFs of every genome / event type,
    then zip and upload the annotations.

    The gff_make_annotation command is only printed; its os.system call
    is disabled.
    """
    genomes = ["mm9", "mm10", "hg18", "hg19"]
    event_types = ["SE", "MXE", "A3SS", "A5SS", "RI"]
    # Directory where UCSC tables are
    ucsc_tables_dir = os.path.expanduser("~/jaen/ucsc_tables/")
    events_dir = os.path.expanduser("~/jaen/gff-events/ver2/")
    for genome in genomes:
        print("Making annotations for %s" % (genome))
        genome_outdir = os.path.join(events_dir, genome)
        genome_tables_dir = os.path.join(ucsc_tables_dir, genome)
        utils.make_dir(genome_outdir)
        cmd = "gff_make_annotation %s %s --genome-label %s --sanitize " \
            % (genome_tables_dir, genome_outdir, genome)
        print("Executing: ")
        print(cmd)
    # Annotate the GFFs with gene information
    for genome in genomes:
        commonshortest_dir = \
            os.path.join(events_dir, genome, "commonshortest")
        for event_type in event_types:
            gff_basename = "%s.%s.gff3" % (event_type, genome)
            curr_gff = os.path.join(commonshortest_dir, gff_basename)
            gffutils_helpers.annotate_gff(curr_gff, genome)
            # Clean up empty attributes
            print("Cleaning up empty attributes")
            remove_empty_attrs.run(curr_gff)
    # Zip the annotations, then upload them
    zip_annotations(events_dir, genomes)
    upload_annotations(events_dir, genomes)
def get_dinuc_shuffled_fasta(self):
    """
    Produce self.num_shuffles dinucleotide-shuffled copies of
    self.fasta_fname inside self.output_dir.

    Copies are named "<fasta basename without extension>.shuffle_<i>.fa";
    a copy that already exists on disk is not regenerated. The list of
    filenames is stored in self.shuffled_fasta_fnames and returned.
    """
    utils.make_dir(self.output_dir)
    print("Shuffling FASTA %d times into: %s" % (self.num_shuffles,
                                                 self.output_dir))
    start_time = time.time()
    fnames = []
    for i in range(self.num_shuffles):
        # FASTA basename with its extension removed
        stem = os.path.basename(self.fasta_fname).rsplit(".", 1)[0]
        # Record that it's a shuffle in the filename
        curr_fname = os.path.join(self.output_dir,
                                  "%s.shuffle_%d.fa" % (stem, i))
        if not os.path.isfile(curr_fname):
            output_dinuc_shuffled_fasta(self.fasta_fname, curr_fname)
        fnames.append(curr_fname)
    end_time = time.time()
    print("Shuffling took %.2f seconds" % (end_time - start_time))
    self.shuffled_fasta_fnames = fnames
    return self.shuffled_fasta_fnames
def output_intron_table(tables_dir, intron_gff_fname, output_dir):
    """
    Output a table of introns. Just adds length ("region_len") and
    gene ("gene_id") information to each entry of the intron GFF.

    - tables_dir: directory handed to trans_to_gene_from_table to get
      the transcript -> gene mapping
    - intron_gff_fname: input GFF of introns
    - output_dir: output directory (created if needed)

    Returns the output filename (<intron basename>.gff in output_dir).
    Raises Exception if intron_gff_fname does not exist. A KeyError
    propagates if a parent transcript is absent from the mapping.
    """
    print("Outputting intron table from %s" % (intron_gff_fname))
    output_basename = os.path.basename(intron_gff_fname).rsplit(".", 1)[0]
    utils.make_dir(output_dir)
    output_fname = os.path.join(output_dir, "%s.gff" % (output_basename))
    print(" - Output file: %s" % (output_fname))
    if not os.path.isfile(intron_gff_fname):
        raise Exception("Cannot find %s" % (intron_gff_fname))
    # Build the mapping once. It used to be computed twice, next to a
    # pandas load of the combined table whose result was never used.
    trans_to_gene = trans_to_gene_from_table(tables_dir)
    intron_entries = pybedtools.BedTool(intron_gff_fname)
    # The context manager closes the output even if an entry fails
    with open(output_fname, "w") as output_file:
        for entry in intron_entries:
            transcripts = entry.attrs["Parent"].split(",")
            genes_str = \
                ",".join([trans_to_gene[trans] for trans in transcripts])
            entry.attrs["gene_id"] = genes_str
            entry.attrs["region_len"] = str(len(entry))
            output_file.write(str(entry))
    return output_fname
def fix_ale_gff(gff_fname, output_dir):
    """
    Load all entries of a GFF file and pass each group yielded by
    ale_iterator to fix_ale_entries.

    NOTE(review): the output path is computed but nothing is written to
    it here, and the return values of fix_ale_entries are discarded --
    confirm whether this function is unfinished.
    """
    utils.make_dir(output_dir)
    # Path where a fixed GFF would go (not used below)
    fixed_gff_fname = os.path.join(output_dir,
                                   os.path.basename(gff_fname))
    all_entries = list(pybedtools.BedTool(gff_fname))
    for entry_group in ale_iterator(all_entries):
        fix_ale_entries(entry_group)
def conserved_events_mouse_to_human(event_types=["SE",
                                                 "SE_shortest_noAceView"]):
    """
    For each event type, submit (via bsub) a run of the conservation
    script that maps the mm9 events GFF from mouse to human.

    Results are directed to <CONS_EVENTS_DIR>/mouse_to_human. Raises
    Exception if a mouse GFF is missing or a submission command returns
    a non-zero status.
    """
    mouse_genome = "mm9"
    output_dir = os.path.join(CONS_EVENTS_DIR, "mouse_to_human")
    utils.make_dir(output_dir)
    print("Generating conserved events from mouse to human...")
    print(" - Output dir: %s" % (output_dir))
    for event_type in event_types:
        print("Generating conserved events of type %s" % (event_type))
        gff_basename = "%s.%s.gff3" % (event_type, mouse_genome)
        mouse_gff_fname = os.path.join(GFF_EVENTS_DIR,
                                       mouse_genome,
                                       gff_basename)
        print("Mapping %s to human" % (mouse_gff_fname))
        if not os.path.isfile(mouse_gff_fname):
            raise Exception("Cannot find mouse gff %s" % (mouse_gff_fname))
        cmd = ('bsub time python %s --get-orthologs %s "mouse" "human" '
               '--output-dir %s'
               % (CONS_SCRIPT_FNAME, mouse_gff_fname, output_dir))
        print("Executing: %s" % (cmd))
        if os.system(cmd) != 0:
            raise Exception("Call to %s failed." % (CONS_SCRIPT_FNAME))
def conserved_events_mouse_to_human(event_types=["SE",
                                                 "SE_shortest_noAceView"]):
    """
    For each event type, submit (via bsub) a run of the conservation
    script that maps the mm9 events GFF from mouse to human.

    Results are directed to <CONS_EVENTS_DIR>/mouse_to_human. Raises
    Exception if a mouse GFF is missing or a submission command returns
    a non-zero status.
    """
    mouse_genome = "mm9"
    output_dir = os.path.join(CONS_EVENTS_DIR, "mouse_to_human")
    utils.make_dir(output_dir)
    print("Generating conserved events from mouse to human...")
    print(" - Output dir: %s" % (output_dir))
    for event_type in event_types:
        print("Generating conserved events of type %s" % (event_type))
        gff_basename = "%s.%s.gff3" % (event_type, mouse_genome)
        mouse_gff_fname = os.path.join(GFF_EVENTS_DIR,
                                       mouse_genome,
                                       gff_basename)
        print("Mapping %s to human" % (mouse_gff_fname))
        if not os.path.isfile(mouse_gff_fname):
            raise Exception("Cannot find mouse gff %s" % (mouse_gff_fname))
        cmd = ('bsub time python %s --get-orthologs %s "mouse" "human" '
               '--output-dir %s'
               % (CONS_SCRIPT_FNAME, mouse_gff_fname, output_dir))
        print("Executing: %s" % (cmd))
        if os.system(cmd) != 0:
            raise Exception("Call to %s failed." % (CONS_SCRIPT_FNAME))
def run_homer(logger, bed_fname, genome, output_dir, params):
    """
    Run Homer against an input BED file:

      findMotifsGenome.pl <pos file> <genome> <output directory>

    - params: mapping of option name -> value; each pair is appended to
      the command line as "<name> <value>"

    Returns output_dir. Homer is not rerun when output_dir already holds
    a "homerResults" directory. Exits the process with status 1 when
    the module-level homer_path is None or the Homer call fails.
    """
    if homer_path is None:
        logger.critical("Error: Cannot find or execute Homer program.")
        sys.exit(1)
    params_str = " ".join("%s %s" % (opt, params[opt]) for opt in params)
    utils.make_dir(output_dir)
    # An existing Homer results directory means there is nothing to do
    results_dir = os.path.join(output_dir, "homerResults")
    if os.path.isdir(results_dir):
        logger.info("Found Homer results, skipping..")
        return output_dir
    homer_cmd = "%s %s %s %s %s" % (homer_path,
                                    bed_fname,
                                    genome,
                                    output_dir,
                                    params_str)
    logger.info("Calling Homer: ")
    logger.info("Executing: %s" % (homer_cmd))
    start_time = time.time()
    if os.system(homer_cmd) != 0:
        logger.critical("Error: Homer call failed.")
        sys.exit(1)
    end_time = time.time()
    logger.info("Homer completed in %.2f minutes"
                % ((end_time - start_time) / 60.))
    return output_dir
def get_dinuc_shuffled_fasta(self):
    """
    Produce self.num_shuffles dinucleotide-shuffled copies of
    self.fasta_fname inside self.output_dir.

    Copies are named "<fasta basename without extension>.shuffle_<i>.fa";
    a copy that already exists on disk is not regenerated. The list of
    filenames is stored in self.shuffled_fasta_fnames and returned.
    """
    utils.make_dir(self.output_dir)
    print("Shuffling FASTA %d times into: %s" % (self.num_shuffles,
                                                 self.output_dir))
    start_time = time.time()
    fnames = []
    for i in range(self.num_shuffles):
        # FASTA basename with its extension removed
        stem = os.path.basename(self.fasta_fname).rsplit(".", 1)[0]
        # Record that it's a shuffle in the filename
        curr_fname = os.path.join(self.output_dir,
                                  "%s.shuffle_%d.fa" % (stem, i))
        if not os.path.isfile(curr_fname):
            output_dinuc_shuffled_fasta(self.fasta_fname, curr_fname)
        fnames.append(curr_fname)
    end_time = time.time()
    print("Shuffling took %.2f seconds" % (end_time - start_time))
    self.shuffled_fasta_fnames = fnames
    return self.shuffled_fasta_fnames
def sanitize_splicegraph_events(genome, event_type, splicegraph_dir,
                                output_dir):
    """
    Sanitize and annotate old SpliceGraph events before merging
    with new events.

    The input is <splicegraph_dir>/<genome>/<event_type>.<genome>.gff3;
    the sanitized copy is written under <output_dir>/<genome>/ with the
    same basename and then annotated in place via
    gffutils_helpers.annotate_gff.

    Returns None. Does nothing (beyond printing) when the input is
    missing or the sanitized file already exists. Raises Exception when
    the sanitize command exits with a non-zero status.
    """
    gff_basename = "%s.%s.gff3" % (event_type, genome)
    gff_fname = os.path.join(splicegraph_dir, genome, gff_basename)
    if not os.path.isfile(gff_fname):
        print("Cannot find %s" % (gff_fname))
        return
    # Make output directory for sanitized files
    output_dir = os.path.join(output_dir, genome)
    utils.make_dir(output_dir)
    print("Sanitizing: %s" % (gff_fname))
    output_fname = os.path.join(output_dir, os.path.basename(gff_fname))
    if os.path.isfile(output_fname):
        print("%s already exists, skipping" % (output_fname))
        return
    sanitize_cmd = \
        "gffutils-cli sanitize %s > %s" % (gff_fname, output_fname)
    ret_val = os.system(sanitize_cmd)
    if ret_val != 0:
        # The shell redirection creates the output file even when the
        # command fails. Remove it so that the "already exists" check
        # above does not treat a partial file as finished on a rerun.
        if os.path.isfile(output_fname):
            os.remove(output_fname)
        raise Exception("Sanitize command failed.")
    # Now that it is sanitized, annotate it
    print("Annotating GFF...")
    gffutils_helpers.annotate_gff(output_fname, genome)
def run_homer(logger, bed_fname, genome, output_dir, params):
    """
    Run Homer against an input BED file:

      findMotifsGenome.pl <pos file> <genome> <output directory>

    - params: mapping of option name -> value; each pair is appended to
      the command line as "<name> <value>"

    Returns output_dir. Homer is not rerun when output_dir already holds
    a "homerResults" directory. Exits the process with status 1 when
    the module-level homer_path is None or the Homer call fails.
    """
    if homer_path is None:
        logger.critical("Error: Cannot find or execute Homer program.")
        sys.exit(1)
    params_str = " ".join("%s %s" % (opt, params[opt]) for opt in params)
    utils.make_dir(output_dir)
    # An existing Homer results directory means there is nothing to do
    results_dir = os.path.join(output_dir, "homerResults")
    if os.path.isdir(results_dir):
        logger.info("Found Homer results, skipping..")
        return output_dir
    homer_cmd = "%s %s %s %s %s" % (homer_path,
                                    bed_fname,
                                    genome,
                                    output_dir,
                                    params_str)
    logger.info("Calling Homer: ")
    logger.info("Executing: %s" % (homer_cmd))
    start_time = time.time()
    if os.system(homer_cmd) != 0:
        logger.critical("Error: Homer call failed.")
        sys.exit(1)
    end_time = time.time()
    logger.info("Homer completed in %.2f minutes"
                % ((end_time - start_time) / 60.))
    return output_dir
def output_dinuc_enriched_kmers(logger, fasta_fname, output_dir, kmer_lens,
                                num_shuffles=100):
    """
    For each kmer length, write the kmers of a FASTA file that are
    enriched relative to dinucleotide-shuffled versions of it.

    One "<fasta basename>.<k>_kmers.counts" file is produced per kmer
    length in output_dir; lengths whose file already exists are skipped.
    Shuffled FASTA files go under <output_dir>/shuffled_fasta.
    """
    logger.info("Output dinucleotide enriched Kmers..")
    logger.info(" - Input FASTA: %s" % (fasta_fname))
    logger.info(" - Output dir: %s" % (output_dir))
    utils.make_dir(output_dir)
    # Directory holding the shuffled versions of the FASTA
    shuffled_dir = os.path.join(output_dir, "shuffled_fasta")
    utils.make_dir(shuffled_dir)
    shuffled_fasta = ShuffledFasta(fasta_fname, shuffled_dir)
    for k in kmer_lens:
        kmers = Kmers(k,
                      fasta_fname=fasta_fname,
                      shuffled_fasta=shuffled_fasta)
        counts_basename = \
            "%s.%d_kmers.counts" % (os.path.basename(fasta_fname), k)
        enrichment_fname = os.path.join(output_dir, counts_basename)
        logger.info("Outputting enriched Kmers to: %s" % (enrichment_fname))
        if os.path.isfile(enrichment_fname):
            logger.info("Found %s, skipping.. " % (enrichment_fname))
            continue
        # Compute the enriched kmers, then write them out
        results = kmers.get_enriched_kmers(output_dir,
                                           num_shuffles=num_shuffles)
        kmers.output_enriched_kmers(results, enrichment_fname)
def output_rpkm(sample, output_dir, settings_info, rna_base, logger):
    """
    Output RPKM tables for the sample, one per constitutive exons
    table in the RNA Base.

    Takes as input:

    - sample: a sample object
    - output_dir: output directory
    - settings_info: settings information (its "readlen" entry is used)
    - rna_base: an RNABase object
    - logger: logger for progress / error messages

    Returns the RPKM filename of the last table iterated over, or None
    when the RNA Base has no constitutive exon tables. Exits the
    process with status 1 if the sample has 0 mapped reads.
    """
    print("Outputting RPKM for: %s" % (sample.label))
    # Defined up front so that an empty set of tables yields None
    # rather than an UnboundLocalError at the return statement
    rpkm_output_filename = None
    for table_name, const_exons in \
            rna_base.tables_to_const_exons.iteritems():
        rpkm_output_filename = \
            "%s.rpkm" % (os.path.join(output_dir, table_name))
        if os.path.isfile(rpkm_output_filename):
            logger.info(" - Skipping RPKM output, found %s"
                        % (rpkm_output_filename))
            print(" - Skipping RPKM output, %s exists"
                  % (rpkm_output_filename))
            continue
        # Directory where BAM containing mapping to constitutive
        # exons will be stored
        bam2gff_outdir = os.path.join(output_dir, "bam2gff_const_exons")
        utils.make_dir(bam2gff_outdir)
        # Map reads to GFF of constitutive exons
        # Use the rRNA subtracted BAM file
        print("Using constitutive exons GFF -> %s"
              % (const_exons.gff_filename))
        exons_bam_fname = exon_utils.map_bam2gff(sample.ribosub_bam_filename,
                                                 const_exons.gff_filename,
                                                 bam2gff_outdir)
        # Compute RPKMs for sample
        num_mapped = int(sample.qc.qc_results["num_mapped"])
        if num_mapped == 0:
            logger.critical("Cannot compute RPKMs since sample %s has 0 "
                            "mapped reads." % (sample.label))
            print("Error: Cannot compute RPKMs since sample %s has 0 "
                  "mapped reads." % (sample.label))
            sys.exit(1)
        print("Sample %s has %s mapped reads" % (sample.label, num_mapped))
        read_len = settings_info["readlen"]
        logger.info("Outputting RPKM from GFF aligned BAM (table %s)"
                    % (table_name))
        output_rpkm_from_gff_aligned_bam(exons_bam_fname,
                                         num_mapped,
                                         read_len,
                                         const_exons,
                                         rpkm_output_filename)
        logger.info("Finished outputting RPKM for %s to %s"
                    % (sample.label, rpkm_output_filename))
    return rpkm_output_filename
def __init__(self, sample, pipeline):
    """
    Set up quality-control state for one sample of a pipeline: logger,
    QC field headers, output directories / filename, and the gene table.
    Finishes by calling self.load_qc_from_file().
    """
    # Pipeline instance that the sample is attached to
    self.pipeline = pipeline
    self.sample = sample
    self.settings_info = pipeline.settings_info
    label = sample.label
    outdirs = pipeline.pipeline_outdirs
    # Define logger
    self.logger = utils.get_logger("QualityControl.%s" % (label),
                                   outdirs["logs"])
    # QC header: order of QC fields to be outputted
    self.regions_header = ["num_ribo",
                           "num_exons",
                           "num_cds",
                           "num_introns",
                           "num_3p_utr",
                           "num_5p_utr",
                           "num_tRNAs",
                           "num_junctions"]
    self.qc_stats_header = ["percent_mapped",
                            "percent_unique",
                            "percent_ribo",
                            "percent_exons",
                            "percent_cds",
                            "percent_introns",
                            "percent_3p_utr",
                            "percent_5p_utr",
                            "percent_tRNAs",
                            "3p_to_cds",
                            "5p_to_cds",
                            "3p_to_5p",
                            "exon_intron_ratio"]
    read_count_fields = ["num_reads",
                         "num_mapped",
                         "num_ribosub_mapped",
                         "num_unique_mapped"]
    self.qc_header = \
        read_count_fields + self.qc_stats_header + self.regions_header
    # QC results: fields that were never set read as the NA value
    self.na_val = "NA"
    self.qc_results = defaultdict(lambda: self.na_val)
    # QC output dir
    self.qc_outdir = outdirs["qc"]
    # Per-sample QC directory
    self.sample_outdir = os.path.join(self.qc_outdir, label)
    utils.make_dir(self.sample_outdir)
    # Regions output dir
    self.regions_outdir = os.path.join(self.sample_outdir, "regions")
    utils.make_dir(self.regions_outdir)
    # QC filename for this sample
    self.qc_filename = os.path.join(self.sample_outdir,
                                    "%s.qc.txt" % (label))
    self.qc_loaded = False
    # use ensGene gene table for QC computations
    self.gene_table = pipeline.rna_base.gene_tables["ensGene"]
    # Load QC information if file corresponding to sample
    # already exists
    self.load_qc_from_file()
def output_filtered_comparisons(self,
                                output_dir=None,
                                sort_column="bayes_factor",
                                columns_to_write=[# "event_name" is disabled
                                                  "gene_id",
                                                  "gene_symbol",
                                                  "sample1_posterior_mean",
                                                  "sample1_ci_low",
                                                  "sample1_ci_high",
                                                  "sample2_posterior_mean",
                                                  "sample2_ci_low",
                                                  "sample2_ci_high",
                                                  "diff",
                                                  "bayes_factor",
                                                  "isoforms",
                                                  "sample1_counts",
                                                  "sample1_assigned_counts",
                                                  "sample2_counts",
                                                  "sample2_assigned_counts",
                                                  "chrom",
                                                  "strand",
                                                  "mRNA_starts",
                                                  "mRNA_ends"]):
    """
    Write the filtered comparisons, one file per (event type,
    comparison), under <output_dir>/filtered_events/<event type>/<label>/.

    - output_dir: base directory; defaults to the comparisons directory
      of self.misowrap_obj
    - sort_column: column each table is sorted by, in descending order
    - columns_to_write: columns included in each output file
    """
    base_dir = output_dir
    if base_dir is None:
        base_dir = self.misowrap_obj.comparisons_dir
    # Output each file by event type
    output_dir = os.path.join(base_dir, "filtered_events")
    print("Outputting filtered events...")
    print(" - Output dir: %s" % (output_dir))
    utils.make_dir(output_dir)
    for event_type, filtered_df in self.filtered_events.iteritems():
        event_dir = os.path.join(output_dir, event_type)
        print("Event type: %s" % (event_type))
        # Comparison labels come from the first index level
        labels = utils.unique_list(filtered_df.index.get_level_values(0))
        print("Outputting %d comparisons" % (len(labels)))
        for label in labels:
            print("Comparison: %s" % (label))
            label_dir = os.path.join(event_dir, label)
            utils.make_dir(label_dir)
            output_filename = os.path.join(
                label_dir,
                "%s.%s.filtered.miso_bf" % (label, event_type))
            print("Outputting to: %s" % (output_filename))
            sorted_df = filtered_df.ix[label].sort_index(by=sort_column,
                                                         ascending=False)
            sorted_df.to_csv(output_filename,
                             sep=self.delimiter,
                             float_format="%.4f",
                             cols=columns_to_write)
def intersect_events_with_genes(events_gff_fname, gene_tables_dir,
                                output_dir, genes_source="ensGene",
                                na_val="NA"):
    """
    Intersect GFF events with a genes table (also in GFF format).
    Computes the outermost transcription start/end bounds for each
    gene and then intersects the GFF events with these bounds.

    Outputs a tab-separated mapping (header "event_id", "gene_id") from
    event ID to the comma-joined gene IDs that it maps to, if the event
    overlaps an annotated gene.

    - events_gff_fname: GFF events filename
    - gene_tables_dir: Directory with gene tables (created by
      --init module of rnaseqlib)
    - output_dir: output directory
    - genes_source: source of genes table, e.g. ensGene or refGene.
      By default, assumes input is an Ensembl table.
    - na_val: not used in this function body

    Returns the filename of the events-to-genes file, whether it was
    found on disk or freshly generated.
    """
    utils.make_dir(output_dir)
    events_basename = os.path.basename(events_gff_fname)
    events_to_genes_fname = \
        os.path.join(output_dir,
                     "%s_to_%s.txt" % (events_basename, genes_source))
    print("Outputting events to genes...")
    print(" - Output file: %s" % (events_to_genes_fname))
    if os.path.isfile(events_to_genes_fname):
        print("Found %s. Skipping.." % (events_to_genes_fname))
        return events_to_genes_fname
    # Load the gene table without parsing the individual genes
    gene_table = tables.GeneTable(gene_tables_dir, genes_source)
    # Create a BED file containing the most inclusive txStart/txEnd
    # for each gene in the table
    bed_coords_fname = output_inclusive_trans_coords(gene_table, output_dir)
    # Intersect the GFF events with this BED file of coordinates
    # to determine what genes each event overlaps
    intersected_bed_fname = intersect_events_with_bed(events_gff_fname,
                                                      bed_coords_fname,
                                                      output_dir)
    # Parse the resulting intersectBed results to get a mapping
    # from events to the genes they map to
    events_to_genes = get_events_to_genes(intersected_bed_fname)
    # Output the result to a file
    with open(events_to_genes_fname, "w") as events_to_genes_out:
        events_to_genes_out.write("event_id\tgene_id\n")
        for event, genes in events_to_genes.iteritems():
            output_line = "%s\t%s\n" % (event, ",".join(genes))
            events_to_genes_out.write(output_line)
    # Return the filename here too, matching the skip path above,
    # which already returned it (this path used to return None)
    return events_to_genes_fname
def intersect_events_with_genes(events_gff_fname, gene_tables_dir,
                                output_dir, genes_source="ensGene",
                                na_val="NA"):
    """
    Intersect GFF events with a genes table (also in GFF format).
    Computes the outermost transcription start/end bounds for each
    gene and then intersects the GFF events with these bounds.

    Outputs a tab-separated mapping (header "event_id", "gene_id") from
    event ID to the comma-joined gene IDs that it maps to, if the event
    overlaps an annotated gene.

    - events_gff_fname: GFF events filename
    - gene_tables_dir: Directory with gene tables (created by
      --init module of rnaseqlib)
    - output_dir: output directory
    - genes_source: source of genes table, e.g. ensGene or refGene.
      By default, assumes input is an Ensembl table.
    - na_val: not used in this function body

    Returns the filename of the events-to-genes file, whether it was
    found on disk or freshly generated.
    """
    utils.make_dir(output_dir)
    events_basename = os.path.basename(events_gff_fname)
    events_to_genes_fname = \
        os.path.join(output_dir,
                     "%s_to_%s.txt" % (events_basename, genes_source))
    print("Outputting events to genes...")
    print(" - Output file: %s" % (events_to_genes_fname))
    if os.path.isfile(events_to_genes_fname):
        print("Found %s. Skipping.." % (events_to_genes_fname))
        return events_to_genes_fname
    # Load the gene table without parsing the individual genes
    gene_table = tables.GeneTable(gene_tables_dir, genes_source)
    # Create a BED file containing the most inclusive txStart/txEnd
    # for each gene in the table
    bed_coords_fname = output_inclusive_trans_coords(gene_table, output_dir)
    # Intersect the GFF events with this BED file of coordinates
    # to determine what genes each event overlaps
    intersected_bed_fname = intersect_events_with_bed(events_gff_fname,
                                                      bed_coords_fname,
                                                      output_dir)
    # Parse the resulting intersectBed results to get a mapping
    # from events to the genes they map to
    events_to_genes = get_events_to_genes(intersected_bed_fname)
    # Output the result to a file
    with open(events_to_genes_fname, "w") as events_to_genes_out:
        events_to_genes_out.write("event_id\tgene_id\n")
        for event, genes in events_to_genes.iteritems():
            output_line = "%s\t%s\n" % (event, ",".join(genes))
            events_to_genes_out.write(output_line)
    # Return the filename here too, matching the skip path above,
    # which already returned it (this path used to return None)
    return events_to_genes_fname
def output_rpkm(sample, output_dir, settings_info, rna_base, logger):
    """
    Output RPKM tables for the sample, one per constitutive exons
    table in the RNA Base.

    Takes as input:

    - sample: a sample object
    - output_dir: output directory
    - settings_info: settings information (its "readlen" entry is used)
    - rna_base: an RNABase object
    - logger: logger for progress / error messages

    Returns the RPKM filename of the last table iterated over, or None
    when the RNA Base has no constitutive exon tables. Exits the
    process with status 1 if the sample has 0 ribosub-mapped reads.
    """
    print("Outputting RPKM for: %s" % (sample.label))
    # Defined up front so that an empty set of tables yields None
    # rather than an UnboundLocalError at the return statement
    rpkm_output_filename = None
    for table_name, const_exons in \
            rna_base.tables_to_const_exons.iteritems():
        rpkm_output_filename = \
            "%s.rpkm" % (os.path.join(output_dir, table_name))
        if os.path.isfile(rpkm_output_filename):
            logger.info(" - Skipping RPKM output, found %s"
                        % (rpkm_output_filename))
            continue
        # Directory where BAM containing mapping to constitutive
        # exons will be stored
        bam2gff_outdir = os.path.join(output_dir, "bam2gff_const_exons")
        utils.make_dir(bam2gff_outdir)
        # Map reads to GFF of constitutive exons
        # Use the rRNA subtracted BAM file
        exons_bam_fname = exon_utils.map_bam2gff(sample.ribosub_bam_filename,
                                                 const_exons.gff_filename,
                                                 bam2gff_outdir)
        # Compute RPKMs for sample: use number of ribosub mapped reads
        num_mapped = int(sample.qc.qc_results["num_ribosub_mapped"])
        if num_mapped == 0:
            logger.critical("Cannot compute RPKMs since sample %s has 0 "
                            "mapped reads." % (sample.label))
            sys.exit(1)
        logger.info("Sample %s has %s mapped reads" % (sample.label,
                                                       num_mapped))
        read_len = settings_info["readlen"]
        logger.info("Outputting RPKM from GFF aligned BAM (table %s)"
                    % (table_name))
        output_rpkm_from_gff_aligned_bam(exons_bam_fname,
                                         num_mapped,
                                         read_len,
                                         const_exons,
                                         rpkm_output_filename)
        logger.info("Finished outputting RPKM for %s to %s"
                    % (sample.label, rpkm_output_filename))
    return rpkm_output_filename
def compare(settings, logs_outdir, delay=5, dry_run=False):
    """
    Run a MISO samples comparison between all pairs of samples in each
    comparison group, for every event type.

    - settings: settings filename (passed through utils.pathify)
    - logs_outdir: directory for the MISOWrap logger
    - delay: seconds to sleep after each cluster job submission
    - dry_run: when True, commands are logged but not run
    """
    settings_filename = utils.pathify(settings)
    misowrap_obj = mw.MISOWrap(settings_filename,
                               logs_outdir,
                               logger_label="compare")
    # These settings are read here but not used further below
    bam_files = misowrap_obj.bam_files
    sample_labels = misowrap_obj.sample_labels
    read_len = misowrap_obj.read_len
    overhang_len = misowrap_obj.overhang_len
    miso_bin_dir = misowrap_obj.miso_bin_dir
    miso_output_dir = misowrap_obj.miso_outdir
    comparison_groups = misowrap_obj.comparison_groups
    comparisons_dir = misowrap_obj.comparisons_dir
    utils.make_dir(comparisons_dir)
    log = misowrap_obj.logger
    log.info("Running MISO comparisons...")
    # Compute comparisons between all pairs in a sample group
    for comp_group in comparison_groups:
        sample_pairs = utils.get_pairwise_comparisons(comp_group)
        print(" - Total of %d comparisons" % (len(sample_pairs)))
        for sample1, sample2 in sample_pairs:
            # For each pair of samples, compare their output
            # along each event type
            log.info("Comparing %s %s" % (sample1, sample2))
            # Directories for each sample
            sample1_dir = os.path.join(miso_output_dir, sample1)
            sample2_dir = os.path.join(miso_output_dir, sample2)
            for event_type in misowrap_obj.event_types:
                sample1_event_dir = os.path.join(sample1_dir, event_type)
                sample2_event_dir = os.path.join(sample2_dir, event_type)
                job_name = "compare_%s_%s_%s" % (sample1,
                                                 sample2,
                                                 event_type)
                event_comparisons_dir = os.path.join(comparisons_dir,
                                                     event_type)
                compare_cmd = \
                    "%s --compare-samples %s %s %s --comparison-labels %s %s" \
                    % (misowrap_obj.compare_miso_cmd,
                       sample1_event_dir,
                       sample2_event_dir,
                       event_comparisons_dir,
                       sample1,
                       sample2)
                log.info("Executing: %s" % (compare_cmd))
                if dry_run:
                    # Nothing is launched in a dry run
                    continue
                if misowrap_obj.use_cluster:
                    misowrap_obj.my_cluster.launch_job(compare_cmd,
                                                       job_name,
                                                       ppn=1)
                    time.sleep(delay)
                else:
                    os.system(compare_cmd)
def run_meme_on_enriched_kmers(self, output_dir, fold_enriched_cutoff=2,
                               method="max", len_to_output=None):
    """
    Run MEME on all enriched kmers.

    The selected kmers are first written as FASTA (header and sequence
    are both the kmer) to <output_dir>/seqs, then MEME is run on that
    file with <output_dir>/meme_output as its output directory.

    - output_dir: base output directory
    - fold_enriched_cutoff: kmers whose "rank" value is >= this cutoff
      are kept. NOTE(review): the cutoff is compared against the "rank"
      column; confirm that this column holds the fold enrichment.
    - method: used in log messages and in the FASTA filename
    - len_to_output: restrict output to kmers of this length; None
      means all lengths

    Returns None. MEME is not run if its output directory is non-empty.
    """
    self.logger.info("Running MEME on enriched BindnSeq kmers...")
    self.logger.info(" - Output dir: %s" % (output_dir))
    self.logger.info(" - Fold enrichment cutoff: %.1f"
                     % (fold_enriched_cutoff))
    self.logger.info(" - Enrichment method: %s" % (method))
    # Make directory for all the kmer sequences to be
    # processed by MEME
    self.seqs_dir = os.path.join(output_dir, "seqs")
    utils.make_dir(self.seqs_dir)
    # Output all enriched kmers to file
    if len_to_output is None:
        len_to_output = "all"
    self.seqs_fname = \
        os.path.join(self.seqs_dir,
                     "enriched_kmers.cutoff_%.1f.method_%s.%s_kmers.fasta"
                     % (fold_enriched_cutoff, method, str(len_to_output)))
    self.logger.info("Outputting sequences as FASTA to: %s"
                     % (self.seqs_fname))
    # The context manager closes the FASTA even if ranking/writing fails
    with open(self.seqs_fname, "w") as seqs_out:
        # Kmer lengths are hard-coded here rather than self.kmer_lens
        for kmer_len in [4, 5, 6]:
            if len_to_output != "all" and len_to_output != kmer_len:
                print("Skipping %d" % (kmer_len))
                continue
            odds_ratios = self.odds_ratios[kmer_len]
            # Rank the odds ratios
            ranked_ratios = self.rank_enriched_kmers(odds_ratios)
            # Select only the kmers that meet the cutoff
            enriched_ratios = \
                ranked_ratios[ranked_ratios["rank"] >= fold_enriched_cutoff]
            # Write those to file
            for kmer in enriched_ratios["kmer"].values:
                seqs_out.write(">%s\n" % (kmer))
                seqs_out.write("%s\n" % (kmer))
    # Run MEME on FASTA file with kmers
    output_dir = os.path.join(output_dir, "meme_output")
    utils.make_dir(output_dir)
    self.logger.info("Running MEME on enriched BindnSeq kmers...")
    self.logger.info(" - MEME output dir: %s" % (output_dir))
    if len(glob.glob(os.path.join(output_dir, "*"))) >= 1:
        self.logger.info("MEME output exists. Skipping...")
        return
    meme_utils.run_meme(self.logger, self.seqs_fname, output_dir)
def find_motifs_homer(self, output_dir, homer_kmer_lens=[4, 5, 6, 7, 8]):
    """
    Find motifs with Homer.

    Homer is run twice with the same parameters: once on the
    experimental coordinates and once on the control coordinates.
    Results go to <output_dir>/homer_output/exp and
    <output_dir>/homer_output/control.
    """
    homer_dir = os.path.join(output_dir, "homer_output")
    utils.make_dir(homer_dir)
    # Kmer lengths are passed to Homer as a comma-separated string
    lens_str = ",".join([str(kmer_len) for kmer_len in homer_kmer_lens])
    params = {"-rna": "", "-len": lens_str}
    # Experimental sample first, then control
    runs = (("exp", self.exp_coords_fname),
            ("control", self.control_coords_fname))
    for run_label, coords_fname in runs:
        homer_utils.run_homer(self.logger,
                              coords_fname,
                              self.genome,
                              os.path.join(homer_dir, run_label),
                              params)
def init_dirs(self):
    """
    Create the exons, constitutive exons, introns and UTRs
    directories via utils.make_dir.
    """
    required_dirs = (self.exons_dir,
                     self.const_exons_dir,
                     self.introns_dir,
                     self.utrs_dir)
    for dirname in required_dirs:
        utils.make_dir(dirname)
def merge_events(genome, event_type, splicegraph_events_dir,
                 new_events_dir, output_dir):
    """
    Merge old SpliceGraph events with new events of the same type.

    Returns without merging (after printing a message) if either
    input GFF is missing. Raises Exception for an event type that
    has no merge function.
    """
    gff_basename = "%s.%s.gff3" % (event_type, genome)
    sg_gff_fname = os.path.join(splicegraph_events_dir, genome,
                                gff_basename)
    if not os.path.isfile(sg_gff_fname):
        print("Cannot find %s" % (sg_gff_fname))
        return
    # New events are named by the part before the first underscore;
    # split() returns the whole name when there is no underscore
    new_event_type = event_type.split("_")[0]
    new_gff_fname = os.path.join(new_events_dir, genome,
                                 "commonshortest",
                                 "%s.%s.gff3" % (new_event_type, genome))
    if not os.path.isfile(new_gff_fname):
        print("Cannot find %s" % (new_gff_fname))
        return
    genome_outdir = os.path.join(output_dir, genome)
    utils.make_dir(genome_outdir)
    output_gff_fname = os.path.join(genome_outdir, gff_basename)
    print("Merging %s.." % (event_type))
    print(" - Old: %s" % (sg_gff_fname))
    print(" - New: %s" % (new_gff_fname))
    # Pick the merge function from the event type prefix
    if event_type.startswith("SE"):
        merge_func = merge_se
    elif event_type.startswith("MXE"):
        merge_func = merge_mxe
    elif event_type.startswith("A5SS"):
        merge_func = merge_a5ss
    elif event_type.startswith("A3SS"):
        merge_func = merge_a3ss
    elif event_type.startswith("RI"):
        merge_func = merge_ri
    else:
        raise Exception("Unrecognized event type %s" % (event_type))
    # Make merge operation
    merge_func(sg_gff_fname, new_gff_fname, output_gff_fname, genome)
def __init__(self, sample, pipeline):
    """
    Set up quality control state for one sample of a pipeline:
    logger, output header fields, output directories and filenames.
    Finishes by calling load_qc_from_file().
    """
    # Pipeline instance that the sample is attached to
    self.pipeline = pipeline
    self.sample = sample
    self.settings_info = pipeline.settings_info
    sample_label = sample.label
    # Define logger
    self.logger = utils.get_logger("QualityControl.%s" % (sample_label),
                                   self.pipeline.pipeline_outdirs["logs"])
    # QC header: order of QC fields to be outputted
    self.regions_header = ["num_ribo",
                           "num_exons",
                           "num_cds",
                           "num_introns",
                           "num_3p_utr",
                           "num_5p_utr",
                           "num_tRNAs",
                           "num_junctions"]
    self.qc_stats_header = ["percent_mapped",
                            "percent_unique",
                            "percent_ribo",
                            "percent_exons",
                            "percent_cds",
                            "percent_introns",
                            "percent_3p_utr",
                            "percent_5p_utr",
                            "percent_tRNAs",
                            "3p_to_cds",
                            "5p_to_cds",
                            "3p_to_5p",
                            "exon_intron_ratio"]
    read_count_fields = ["num_reads",
                         "num_mapped",
                         "num_ribosub_mapped",
                         "num_unique_mapped"]
    # Full header: read counts, then stats, then per-region counts
    self.qc_header = (read_count_fields
                      + self.qc_stats_header
                      + self.regions_header)
    # QC results: fields with no value yet read as the NA marker
    self.na_val = "NA"
    self.qc_results = defaultdict(lambda: self.na_val)
    # QC output dir
    self.qc_outdir = self.pipeline.pipeline_outdirs["qc"]
    # Per-sample QC directory
    self.sample_outdir = os.path.join(self.qc_outdir, self.sample.label)
    utils.make_dir(self.sample_outdir)
    # Regions output dir
    self.regions_outdir = os.path.join(self.sample_outdir, "regions")
    utils.make_dir(self.regions_outdir)
    # QC filename for this sample
    self.qc_filename = os.path.join(self.sample_outdir,
                                    "%s.qc.txt" % (self.sample.label))
    self.qc_loaded = False
    # use ensGene gene table for QC computations
    self.gene_table = self.pipeline.rna_base.gene_tables["ensGene"]
    # Load QC information if file corresponding to sample
    # already exists
    self.load_qc_from_file()
def trim_polyA_ends(fastq_filename, output_dir,
                    compressed=False,
                    min_polyA_len=3,
                    min_read_len=22):
    """
    Trim polyA ends from reads.

    Parameters:
    -----------
    fastq_filename : input FASTQ filename
    output_dir : directory for the trimmed output file
    compressed : not used by this function
    min_polyA_len : a read must end with at least this many As
    min_read_len : reads shorter than this after trimming are dropped

    Returns the output filename. If the output file already exists,
    nothing is recomputed.
    """
    print("Trimming polyA trails from: %s" % (fastq_filename))
    # Strip the trailing extension
    output_basename = \
        ".".join(os.path.basename(fastq_filename).split(".")[0:-1])
    output_basename = "%s.trimmed_polyA.fastq.gz" % (output_basename)
    output_filename = os.path.join(output_dir, output_basename)
    utils.make_dir(output_dir)
    if os.path.isfile(output_filename):
        print("SKIPPING: %s already exists!" % (output_filename))
        return output_filename
    print(" - Outputting trimmed sequences to: %s" % (output_filename))
    input_file = fastq_utils.read_open_fastq(fastq_filename)
    output_file = fastq_utils.write_open_fastq(output_filename)
    polyA_suffix = "A" * min_polyA_len
    t1 = time.time()
    # try/finally so the output handle is closed even when reading or
    # writing a record raises
    try:
        for header, seq, header2, qual in fastq_utils.read_fastq(input_file):
            # NOTE(review): the original nesting could not be recovered
            # with certainty; this assumes that reads not ending in "A"
            # are not written at all -- confirm.
            if not seq.endswith("A"):
                continue
            # Skip sequences that do not end with at least N
            # many As
            if seq[-min_polyA_len:] != polyA_suffix:
                continue
            # Get sequence stripped of contiguous stretch of polyAs
            stripped_seq = rstrip_stretch(seq, "A")
            if len(stripped_seq) < min_read_len:
                # Skip altogether reads that are shorter than
                # the required length after trimming
                continue
            # Strip the quality scores to match trimmed sequence
            new_qual = qual[0:len(stripped_seq)]
            new_rec = (header, stripped_seq, header2, new_qual)
            # Write the record with trimmed sequence back out to file
            fastq_utils.write_fastq(output_file, new_rec)
        t2 = time.time()
        print("Trimming took %.2f mins." % ((t2 - t1) / 60.))
    finally:
        output_file.close()
    return output_filename
def trim_polyA_ends(fastq_filename, output_dir,
                    compressed=False,
                    min_polyA_len=3,
                    min_read_len=22):
    """
    Trim polyA ends from reads.

    Parameters:
    -----------
    fastq_filename : input FASTQ filename
    output_dir : directory for the trimmed output file
    compressed : not used by this function
    min_polyA_len : a read must end with at least this many As
    min_read_len : reads shorter than this after trimming are dropped

    Returns the output filename. If the output file already exists,
    nothing is recomputed.
    """
    print("Trimming polyA trails from: %s" % (fastq_filename))
    # Strip the trailing extension
    name_parts = os.path.basename(fastq_filename).split(".")
    output_basename = "%s.trimmed_polyA.fastq.gz" \
        % (".".join(name_parts[0:-1]))
    output_filename = os.path.join(output_dir, output_basename)
    utils.make_dir(output_dir)
    if os.path.isfile(output_filename):
        print("SKIPPING: %s already exists!" % (output_filename))
        return output_filename
    print(" - Outputting trimmed sequences to: %s" % (output_filename))
    input_file = fastq_utils.read_open_fastq(fastq_filename)
    output_file = fastq_utils.write_open_fastq(output_filename)
    required_tail = "A" * min_polyA_len
    t1 = time.time()
    # The finally clause guarantees the output handle is closed even
    # when a record fails to parse or write
    try:
        for line in fastq_utils.read_fastq(input_file):
            header, seq, header2, qual = line
            # NOTE(review): the original nesting could not be recovered
            # with certainty; this assumes that reads not ending in "A"
            # are not written at all -- confirm.
            if not seq.endswith("A"):
                continue
            # Skip sequences that do not end with at least N
            # many As
            if seq[-min_polyA_len:] != required_tail:
                continue
            # Get sequence stripped of contiguous stretch of polyAs
            stripped_seq = rstrip_stretch(seq, "A")
            if len(stripped_seq) < min_read_len:
                # Skip altogether reads that are shorter than
                # the required length after trimming
                continue
            # Strip the quality scores to match trimmed sequence
            new_qual = qual[0:len(stripped_seq)]
            # Write the record with trimmed sequence back out to file
            fastq_utils.write_fastq(output_file,
                                    (header, stripped_seq, header2, new_qual))
        t2 = time.time()
        print("Trimming took %.2f mins." % ((t2 - t1) / 60.))
    finally:
        output_file.close()
    return output_filename
def output_filtered_comparisons(
        self,
        output_dir=None,
        sort_column="bayes_factor",
        columns_to_write=[
            #"event_name",
            "gene_id",
            "gene_symbol",
            "sample1_posterior_mean",
            "sample1_ci_low",
            "sample1_ci_high",
            "sample2_posterior_mean",
            "sample2_ci_low",
            "sample2_ci_high",
            "diff",
            "bayes_factor",
            "isoforms",
            "sample1_counts",
            "sample1_assigned_counts",
            "sample2_counts",
            "sample2_assigned_counts",
            "chrom",
            "strand",
            "mRNA_starts",
            "mRNA_ends"
        ]):
    """
    Output filtered comparisons table.

    One file is written per (event type, comparison label) pair under
    <output_dir>/filtered_events/<event_type>/<label>/, sorted by
    sort_column in descending order. output_dir defaults to the
    comparisons directory of the misowrap object.
    """
    if output_dir is None:
        output_dir = self.misowrap_obj.comparisons_dir
    # Output each file by event type
    filtered_dir = os.path.join(output_dir, "filtered_events")
    print("Outputting filtered events...")
    print(" - Output dir: %s" % (filtered_dir))
    utils.make_dir(filtered_dir)
    for event_type, filtered_df in self.filtered_events.iteritems():
        event_dir = os.path.join(filtered_dir, event_type)
        print("Event type: %s" % (event_type))
        # View by comparison: first index level holds the labels
        first_level = filtered_df.index.get_level_values(0)
        comparison_labels = utils.unique_list(first_level)
        print("Outputting %d comparisons" % (len(comparison_labels)))
        for label in comparison_labels:
            print("Comparison: %s" % (label))
            label_dir = os.path.join(event_dir, label)
            utils.make_dir(label_dir)
            table_basename = "%s.%s.filtered.miso_bf" % (label, event_type)
            output_filename = os.path.join(label_dir, table_basename)
            print("Outputting to: %s" % (output_filename))
            label_df = filtered_df.ix[label]
            sorted_df = label_df.sort_index(by=sort_column,
                                            ascending=False)
            sorted_df.to_csv(output_filename,
                             sep=self.delimiter,
                             float_format="%.4f",
                             cols=columns_to_write)
def __init__(self, settings_filename, output_dir, logger_label=None):
    """
    Initialize the wrapper from a settings file.

    Creates the main output directory, sets all configuration
    attributes to their unset defaults, then calls load_settings()
    followed by load_events_to_genes().

    Parameters:
    -----------
    settings_filename : path to the settings file
    output_dir : main output directory (created if needed)
    logger_label : optional label stored on the instance as
      self.logger_label
    """
    self.settings_filename = settings_filename
    self.settings_info = None
    # Keep the label given by the caller; it was previously discarded
    # by an unconditional assignment of None
    self.logger_label = logger_label
    # Main output directory
    self.output_dir = utils.pathify(output_dir)
    utils.make_dir(self.output_dir)
    # MISO output directory (where raw output is)
    self.miso_outdir = None
    # Comparisons output directory
    self.comparisons_outdir = None
    # BAM files to process
    self.bam_files = None
    # Sample labels
    self.sample_labels = None
    self.comparison_groups = None
    # Insert length directory (for paired-end samples)
    self.insert_lens_dir = None
    # Logs output directory
    self.logs_outdir = None
    # Logger object
    self.logger = None
    # Cluster submission object
    self.my_cluster = None
    # Event types to process
    self.event_types = None
    # Whether to submit jobs to cluster
    self.use_cluster = False
    # run_miso cmd
    self.run_miso_cmd = None
    # run_events_analysis cmd
    self.run_events_cmd = None
    # Constitutive exons GFF file: used to compute
    # the insert length distribution
    self.const_exons_gff = None
    # Load settings
    self.load_settings()
    ##
    ## Load annotation of events, like a map
    ## events to genes.
    ##
    self.events_to_genes = None
    self.load_events_to_genes()
def index_merged_events():
    """
    Index the merged events GFF of every genome / event type pair
    with index_gff, placing each index under
    MERGED_EVENTS_DIR/pickled/<genome>/<event_type>.

    Missing GFF files are reported and skipped. Raises Exception if
    the indexing command returns a non-zero status.
    """
    genomes = ["mm9", "hg18", "hg19"]
    event_types = ["SE",
                   "SE_shortest_noAceView",
                   "MXE",
                   "A3SS",
                   "A5SS",
                   "RI"]
    pickled_dir = os.path.join(MERGED_EVENTS_DIR, "pickled")
    for genome in genomes:
        for event_type in event_types:
            gff_basename = "%s.%s.gff3" % (event_type, genome)
            gff_fname = os.path.join(MERGED_EVENTS_DIR, genome,
                                     gff_basename)
            output_dir = os.path.join(pickled_dir, genome, event_type)
            # The output directory is created before the input check
            if not os.path.isdir(output_dir):
                utils.make_dir(output_dir)
            if not os.path.isfile(gff_fname):
                print("Cannot find %s" % (gff_fname))
                continue
            cmd = "index_gff --index %s %s" % (gff_fname, output_dir)
            if os.system(cmd) != 0:
                raise Exception("Failed to index %s" % (gff_fname))
def index_merged_events():
    """
    Run index_gff on each merged events GFF file.

    For every genome and event type, the index is written to
    MERGED_EVENTS_DIR/pickled/<genome>/<event_type>. A missing GFF
    file is reported and skipped; a failing index command raises
    Exception.
    """
    event_types = ["SE",
                   "SE_shortest_noAceView",
                   "MXE",
                   "A3SS",
                   "A5SS",
                   "RI"]
    for genome in ["mm9", "hg18", "hg19"]:
        genome_events_dir = os.path.join(MERGED_EVENTS_DIR, genome)
        for event_type in event_types:
            gff_fname = os.path.join(genome_events_dir,
                                     "%s.%s.gff3" % (event_type, genome))
            output_dir = os.path.join(MERGED_EVENTS_DIR, "pickled",
                                      genome, event_type)
            # Directory is made even when the GFF turns out to be missing
            if not os.path.isdir(output_dir):
                utils.make_dir(output_dir)
            if os.path.isfile(gff_fname):
                cmd = "index_gff --index %s %s" % (gff_fname, output_dir)
                ret_val = os.system(cmd)
                if ret_val != 0:
                    raise Exception("Failed to index %s" % (gff_fname))
            else:
                print("Cannot find %s" % (gff_fname))
def __init__(self, event_ids, label, input_seqs_fname,
             remove_repeats=False,
             entry_types=None,
             output_dir=None):
    """
    Store the event description, create the output directory and
    output the event sequences and coordinates via
    output_event_seqs_and_coords().
    """
    # Identity of the event set
    self.event_ids = event_ids
    self.label = label
    self.entry_types = entry_types
    # Input and output locations
    self.input_seqs_fname = input_seqs_fname
    self.output_dir = output_dir
    # Whether to remove repeats or not from sequences
    self.remove_repeats = remove_repeats
    utils.make_dir(output_dir)
    # Per entry type results: sequence filenames, BED filenames
    # and total sequence lengths
    self.seqs_fnames = {}
    self.bed_fnames = {}
    self.total_lens = {}
    self.output_event_seqs_and_coords()
def build_indices(self):
    """
    Build relevant genome indices for use with Bowtie/Tophat.

    Does nothing when self.with_index is false, when there are no
    FASTA files to index, or when Bowtie 1 (*.ebwt) or Bowtie 2
    (*.bt2) index files named after the genome already exist in
    <output_dir>/indices.

    Side effect: changes the working directory of the process to the
    indices directory before running bowtie-build.
    """
    if not self.with_index:
        print("Not building indices.")
        return
    print("Building indices..")
    fasta_files = self.get_bowtie_index_fasta_files()
    num_files = len(fasta_files)
    if num_files == 0:
        print("WARNING: No FASTA files to build index from.")
        return
    self.indices_dir = os.path.join(self.output_dir, "indices")
    utils.make_dir(self.indices_dir)
    ##
    ## Check if the Bowtie index is already present, if so skip
    ##
    # Check for Bowtie 1 indices
    indices = glob.glob(os.path.join(self.indices_dir,
                                     "%s*.ebwt" % (self.genome)))
    # Check for Bowtie 2 indices. The genome name must be substituted
    # into the pattern; otherwise the glob looks for files whose names
    # literally start with "%s" and never matches
    indices += glob.glob(os.path.join(self.indices_dir,
                                      "%s*.bt2" % (self.genome)))
    if len(indices) >= 1:
        print("Found Bowtie index files in %s. Skipping index build.."
              % (self.indices_dir))
        return
    print("Building Bowtie index from %d files" % (num_files))
    for fasta_fname in fasta_files:
        print(" - %s" % (os.path.basename(fasta_fname)))
    fasta_str = ",".join(fasta_files)
    # Change to indices directory
    os.chdir(self.indices_dir)
    # Use the genome as basename for the bowtie index
    bowtie_build_cmd = "bowtie-build %s %s" % (fasta_str, self.genome)
    t1 = time.time()
    os.system(bowtie_build_cmd)
    t2 = time.time()
    print("Bowtie build took %.2f minutes" % ((t2 - t1) / 60.))
def jf_count_kmers(fastx_fname, kmer_len, output_dir,
                   hash_size=100000000):
    """
    Count kmers using jellyfish.

    Parameters:
    -----------
    fastx_fname : input sequence file; the process exits with status 1
      if it does not exist
    kmer_len : kmer length passed to 'jellyfish count -m'
    output_dir : base directory; results go to <output_dir>/jf_counts
    hash_size : hash size passed to 'jellyfish count -s'

    Returns the filename of the dumped kmer counts. Raises Exception
    if the jellyfish count or dump command fails.
    """
    if not os.path.isfile(fastx_fname):
        print("Error: fastx file %s not found." % (fastx_fname))
        sys.exit(1)
    # Count kmers, use temporary file for db
    fastx_basename = os.path.basename(fastx_fname)
    output_dir = os.path.join(output_dir, "jf_counts")
    utils.make_dir(output_dir)
    db_fname = "%s.jf" % (os.path.join(output_dir, fastx_basename))
    output_fname = "%s_counts" % (db_fname)
    # Remove results left over from a previous run
    for stale_fname in (db_fname, output_fname):
        if os.path.isfile(stale_fname):
            os.remove(stale_fname)
    count_cmd = "%s count -m %d -o %s -s %d %s" \
        % (jf_path, kmer_len, db_fname, hash_size, fastx_fname)
    ret_val = os.system(count_cmd)
    if ret_val != 0:
        raise Exception("jellyfish count call failed.")
    # Merge db results
    merged_fname = jf_merge(db_fname)
    # Load up kmer results
    dump_cmd = "%s dump -o %s %s" \
        % (jf_path, output_fname, merged_fname)
    ret_val = os.system(dump_cmd)
    if ret_val != 0:
        # Fail here rather than return the name of a file that the
        # dump step did not produce
        raise Exception("jellyfish dump call failed.")
    return output_fname
def merge_events(genome, event_type, splicegraph_events_dir,
                 new_events_dir, output_dir):
    """
    Merge the SpliceGraph events of one type with the newly defined
    events for a genome, writing the merged GFF under
    <output_dir>/<genome>.

    Prints a message and returns if either input file is missing.
    Raises Exception when the event type has no merge function.
    """
    def events_basename(type_name):
        # GFF files are named <event type>.<genome>.gff3
        return "%s.%s.gff3" % (type_name, genome)

    sg_gff_fname = os.path.join(splicegraph_events_dir, genome,
                                events_basename(event_type))
    if not os.path.isfile(sg_gff_fname):
        print("Cannot find %s" % (sg_gff_fname))
        return
    # Only the part before the first underscore names the new events
    new_event_type = event_type
    if "_" in event_type:
        new_event_type = event_type.split("_")[0]
    new_gff_fname = os.path.join(new_events_dir, genome,
                                 "commonshortest",
                                 events_basename(new_event_type))
    if not os.path.isfile(new_gff_fname):
        print("Cannot find %s" % (new_gff_fname))
        return
    output_dir = os.path.join(output_dir, genome)
    utils.make_dir(output_dir)
    output_gff_fname = os.path.join(output_dir,
                                    events_basename(event_type))
    print("Merging %s.." % (event_type))
    print(" - Old: %s" % (sg_gff_fname))
    print(" - New: %s" % (new_gff_fname))
    # Select merge function by event type prefix
    if event_type.startswith("SE"):
        merge_func = merge_se
    elif event_type.startswith("MXE"):
        merge_func = merge_mxe
    elif event_type.startswith("A5SS"):
        merge_func = merge_a5ss
    elif event_type.startswith("A3SS"):
        merge_func = merge_a3ss
    elif event_type.startswith("RI"):
        merge_func = merge_ri
    else:
        raise Exception("Unrecognized event type %s" % (event_type))
    # Make merge operation
    merge_func(sg_gff_fname, new_gff_fname, output_gff_fname, genome)
def download_genome_seq(genome, output_dir):
    """
    Download genome sequence files from UCSC into <output_dir>/genome,
    delete files whose names contain an underscore and gunzip the rest.

    Returns None without downloading if the genome directory already
    exists and contains files.

    Side effect: changes the working directory of the process to the
    genome directory.
    """
    print("Downloading genome sequence files for %s" % (genome))
    print(" - Output dir: %s" % (output_dir))
    genome_dir = os.path.join(output_dir, "genome")
    if os.path.isdir(genome_dir) and len(os.listdir(genome_dir)) >= 1:
        print("Directory %s exists and contains files; skipping download of genome..."
              % (genome_dir))
        return None
    utils.make_dir(genome_dir)
    # Change to output directory
    os.chdir(genome_dir)
    ##
    ## Download the genome sequence files
    ##
    genome_url = "%s/%s/chromosomes/" % (UCSC_GOLDENPATH_FTP, genome)
    # Fetch all chromosome sequence files
    download_utils.wget(os.path.join(genome_url, "*"))
    # Remove random chromosome contigs (names containing "_")
    downloaded_fnames = glob.glob(os.path.join(genome_dir, "*.fa.gz"))
    for fname in downloaded_fnames:
        if "_" not in os.path.basename(fname):
            continue
        print("Deleting: %s" % (fname))
        os.remove(fname)
    ##
    ## Uncompress the files
    ##
    print("Uncompressing files...")
    uncompress_cmd = "gunzip %s/*.gz" % (genome_dir)
    start_time = time.time()
    os.system(uncompress_cmd)
    elapsed_mins = (time.time() - start_time) / 60.
    print("Uncompressing took %.2f minutes" % (elapsed_mins))
def find_motifs_homer(self, output_dir, homer_kmer_lens=[4,5,6,7,8]):
    """
    Find motifs with Homer.

    Runs Homer with identical parameters on the experimental and
    then the control coordinates; output goes to the "exp" and
    "control" subdirectories of <output_dir>/homer_output.
    """
    homer_outdir = os.path.join(output_dir, "homer_output")
    utils.make_dir(homer_outdir)
    # Homer takes the kmer lengths as one comma-separated value
    kmer_lens_arg = ",".join([str(k) for k in homer_kmer_lens])
    params = {"-rna": "",
              "-len": kmer_lens_arg}
    exp_outdir = os.path.join(homer_outdir, "exp")
    control_outdir = os.path.join(homer_outdir, "control")
    # Run on exp
    homer_utils.run_homer(self.logger, self.exp_coords_fname,
                          self.genome, exp_outdir, params)
    # Run on control
    homer_utils.run_homer(self.logger, self.control_coords_fname,
                          self.genome, control_outdir, params)
def init_outdirs(self):
    """
    Create the output directories for the pipeline.

    Structure is:

    output_dir
      - rawdata: trimmed reads, etc.
      - mapping: mapped data files
      - qc: quality control output
      - analysis: analysis output

    Every directory named in self.toplevel_dirs is created and
    recorded in self.pipeline_outdirs, together with any
    subdirectories registered for it.
    """
    print("Initializing the pipeline output directories.")
    utils.make_dir(self.output_dir)
    # Subdirectories of toplevel subdirs
    subdirs_by_toplevel = defaultdict(list)
    subdirs_by_toplevel["analysis"] = ["rpkm", "insert_lens"]
    self.toplevel_subdirs = subdirs_by_toplevel
    for dirname in self.toplevel_dirs:
        toplevel_path = os.path.join(self.output_dir, dirname)
        print(" - Creating: %s" % (toplevel_path))
        utils.make_dir(toplevel_path)
        self.pipeline_outdirs[dirname] = toplevel_path
        # Indexing the defaultdict yields an empty list for
        # directories without registered subdirectories
        for subdir_name in self.toplevel_subdirs[dirname]:
            utils.make_dir(os.path.join(toplevel_path, subdir_name))
    # Variables storing commonly accessed directories
    analysis_dir = self.pipeline_outdirs["analysis"]
    self.rpkm_dir = os.path.join(analysis_dir, "rpkm")
def download_misc_seqs(genome, output_dir):
    """
    Download assorted sequences related to genome.

    For each labelled accession in NCBI_MISC_SEQS for the organism,
    the FASTA is downloaded into <output_dir>/ncbi and its first
    record is rewritten, with the sequence label as header, to
    <output_dir>/misc/<label>.fa. Labels whose output file already
    exists, or whose accession is None, are skipped.

    Exits with status 1 for genomes not starting with "hg" or "mm".
    """
    # Mapping from sequence label (e.g. rRNA)
    # to accession numbers
    if genome.startswith("hg"):
        organism = "human"
    elif genome.startswith("mm"):
        organism = "mouse"
    else:
        print("Error: Unsupported genome.")
        sys.exit(1)
    # Fetch the accession numbers for the organism's
    # misc sequences and download them
    misc_seqs = NCBI_MISC_SEQS[organism]
    ncbi_outdir = os.path.join(output_dir, "ncbi")
    misc_outdir = os.path.join(output_dir, "misc")
    utils.make_dir(ncbi_outdir)
    utils.make_dir(misc_outdir)
    for seq_label, access_id in misc_seqs.iteritems():
        if access_id is None:
            continue
        output_filename = os.path.join(misc_outdir,
                                       "%s.fa" % (seq_label))
        if os.path.isfile(output_filename):
            print("%s exists. Skipping download.." % (seq_label))
            continue
        print("Downloading: %s (NCBI: %s)" % (seq_label, access_id))
        url_filename = download_ncbi_fasta(access_id, ncbi_outdir)
        fasta_in = fasta_utils.read_fasta(url_filename)
        print(" - Writing to: %s" % (output_filename))
        # Fetch first FASTA record
        curr_label, fasta_seq = next(fasta_in)
        # Output it with the required label
        new_rec = (">%s" % (seq_label), fasta_seq)
        # The context manager closes the output handle, which was
        # previously left open for every downloaded sequence
        with open(output_filename, "w") as fasta_out:
            fasta_utils.write_fasta(fasta_out, [new_rec])
def build_indices(self):
    """
    Build relevant genome indices for use with Bowtie/Tophat.

    The build is skipped when self.with_index is false, when no FASTA
    files are available, or when <output_dir>/indices already holds
    Bowtie 1 (*.ebwt) or Bowtie 2 (*.bt2) files named after the
    genome.

    Side effect: the working directory of the process is changed to
    the indices directory before bowtie-build is run.
    """
    if not self.with_index:
        print("Not building indices.")
        return
    print("Building indices..")
    fasta_files = self.get_bowtie_index_fasta_files()
    num_files = len(fasta_files)
    if num_files == 0:
        print("WARNING: No FASTA files to build index from.")
        return
    self.indices_dir = os.path.join(self.output_dir, "indices")
    utils.make_dir(self.indices_dir)
    ##
    ## Check if the Bowtie index is already present, if so skip
    ##
    # Bowtie 1 indices end in .ebwt, Bowtie 2 indices in .bt2. Both
    # patterns are built from the genome name; the .bt2 pattern used
    # to be left as the unformatted string "%s*.bt2"
    indices = []
    for index_ext in ("ebwt", "bt2"):
        index_pattern = "%s*.%s" % (self.genome, index_ext)
        indices += glob.glob(os.path.join(self.indices_dir,
                                          index_pattern))
    if len(indices) >= 1:
        print("Found Bowtie index files in %s. Skipping index build.."
              % (self.indices_dir))
        return
    print("Building Bowtie index from %d files" % (num_files))
    for fasta_fname in fasta_files:
        print(" - %s" % (os.path.basename(fasta_fname)))
    fasta_str = ",".join(fasta_files)
    # Change to indices directory
    os.chdir(self.indices_dir)
    # Use the genome as basename for the bowtie index
    bowtie_build_cmd = "bowtie-build %s %s" % (fasta_str, self.genome)
    t1 = time.time()
    os.system(bowtie_build_cmd)
    t2 = time.time()
    print("Bowtie build took %.2f minutes" % ((t2 - t1) / 60.))
def jf_count_kmers(fastx_fname, kmer_len, output_dir,
                   hash_size=100000000):
    """
    Count kmers using jellyfish.

    Runs 'jellyfish count', merges the resulting database with
    jf_merge and dumps the counts to a text file in
    <output_dir>/jf_counts. Existing database and counts files for
    the same input are removed first.

    Returns the filename of the dumped counts. Exits with status 1 if
    the input file is missing; raises Exception if the count or dump
    command returns a non-zero status.
    """
    if not os.path.isfile(fastx_fname):
        print("Error: fastx file %s not found." % (fastx_fname))
        sys.exit(1)
    # Count kmers, use temporary file for db
    fastx_basename = os.path.basename(fastx_fname)
    output_dir = os.path.join(output_dir, "jf_counts")
    utils.make_dir(output_dir)
    db_fname = "%s.jf" % (os.path.join(output_dir, fastx_basename))
    output_fname = "%s_counts" % (db_fname)
    if os.path.isfile(db_fname):
        os.remove(db_fname)
    if os.path.isfile(output_fname):
        os.remove(output_fname)
    count_cmd = "%s count -m %d -o %s -s %d %s" \
        % (jf_path, kmer_len, db_fname, hash_size, fastx_fname)
    if os.system(count_cmd) != 0:
        raise Exception("jellyfish count call failed.")
    # Merge db results
    merged_fname = jf_merge(db_fname)
    # Load up kmer results
    dump_cmd = "%s dump -o %s %s" \
        % (jf_path, output_fname, merged_fname)
    if os.system(dump_cmd) != 0:
        # The dump status used to be ignored, so a failed dump
        # returned the name of a file that may not exist
        raise Exception("jellyfish dump call failed.")
    return output_fname
def launchJob(cmd, job_name, scriptOptions, output_dir,
              verbose=False, test=False, ppn="4",
              queue_type="normal"):
    """
    Submits a job on the cluster which will run command 'cmd', with
    options 'scriptOptions'

    Optionally:
      verbose: output the job script
      test: don't actually submit the job script (usually used in
        conjunction with verbose)

    'cmd' may be a single string or a list/tuple of strings, which
    are joined with spaces. 'scriptOptions' is modified in place:
    missing entries are filled with defaults and the "command" and
    "outf" entries are set. 'queue_type' is currently not used.

    Returns a job ID if the job was submitted properly, None if
    'test' is set. Raises Exception if bsub does not report a
    submitted job.
    """
    # isinstance also accepts subclasses of list and tuple
    if not isinstance(cmd, (list, tuple)):
        cmd = [cmd]
    scriptOptions.setdefault("workingdir", os.getcwd())
    scriptOptions.setdefault("ppn", str(ppn))
    scriptOptions.setdefault("scriptuser", getpass.getuser())
    scriptOptions.setdefault("jobname", job_name)
    # remove queue name option
    #scriptOptions.setdefault("queue", queue_type)
    scriptOptions.setdefault("outdir", output_dir)
    scriptOptions["command"] = " ".join(cmd)
    if verbose:
        print("==SUBMITTING TO CLUSTER==")
        print(cmd)
        print(scriptOptions)
    pid = os.getpid()
    outscriptName = "%s.%i" % (scriptOptions["jobname"], pid)
    script_outdir = os.path.join(scriptOptions["outdir"],
                                 "cluster_scripts")
    utils.make_dir(script_outdir)
    scriptOptions["outf"] = \
        os.path.abspath(os.path.join(script_outdir,
                                     outscriptName + ".out"))
    # Job script that is fed to bsub on standard input
    outtext = """#!/bin/sh
#BSUB -n %(ppn)s
#BSUB -R "rusage[mem=800]"
#BSUB -o %(outf)s
#BSUB -J %(jobname)s
echo Working directory is %(workingdir)s
cd %(workingdir)s
echo "%(command)s"
%(command)s
echo "===== %(command)s finished ====="
""" % scriptOptions
    if verbose:
        print(outscriptName)
    call = "bsub "
    if not test:
        try:
            qsub = subprocess.Popen(call,
                                    shell=True,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    stdin=subprocess.PIPE)
            print("Executing:  %s" % (scriptOptions["command"]))
            # Hand the script to communicate() instead of writing to
            # stdin first: it feeds stdin while draining stdout and
            # stderr, so the pipes cannot fill up and block
            output = qsub.communicate(outtext)
            if "is submitted to" in output[0]:
                # The job ID is the second word of the bsub output,
                # with its surrounding bracket characters removed
                jobID = int(output[0].strip().split()[1][1:-1])
                print("Process launched with job ID: %d" % (jobID))
                return jobID
            else:
                raise Exception("Failed to launch job '%s': %s"
                                % (outscriptName, str(output)))
        except Exception:
            print("failing...")
            raise
    return None
def download_genome_seq(genome, output_dir):
    """
    Download genome sequence files from UCSC.

    Files are placed in <output_dir>/genome. The download is skipped
    if that directory already contains files. Downloaded files whose
    names contain an underscore are deleted, the remaining *.gz files
    are uncompressed, and the chromosome FASTA files are concatenated
    into <genome>.fa, which is indexed with 'samtools faidx'.

    Exits with status 1 if uncompressing, concatenating or indexing
    fails.

    Side effect: changes the working directory of the process to the
    genome directory.
    """
    print("Downloading genome sequence files for %s" % (genome))
    print(" - Output dir: %s" % (output_dir))
    output_dir = utils.pathify(os.path.join(output_dir, "genome"))
    utils.make_dir(output_dir)
    dir_files = os.listdir(output_dir)
    # Change to output directory
    os.chdir(output_dir)
    ##
    ## Download the genome sequence files
    ##
    genome_url = "%s/%s/chromosomes/" % (UCSC_GOLDENPATH_FTP, genome)
    # Fetch all chromosome sequence files
    if len(dir_files) >= 1:
        print("Directory %s exists and contains files; "
              "skipping download of genome..."
              % (output_dir))
    else:
        download_utils.wget(os.path.join(genome_url, "*"))
    # Remove random chromosome contigs
    for fname in glob.glob(os.path.join(output_dir, "*.fa.gz")):
        if "_" in os.path.basename(fname):
            print("Deleting: %s" % (fname))
            os.remove(fname)
    ##
    ## Uncompress the files
    ##
    # Only call gunzip when compressed files are present. On a re-run
    # over an already uncompressed directory the shell pattern matches
    # nothing, gunzip fails and the function used to exit here.
    if glob.glob(os.path.join(output_dir, "*.gz")):
        print("Uncompressing files...")
        uncompress_cmd = "gunzip %s/*.gz" % (output_dir)
        print(" - Uncompress cmd: %s" % (uncompress_cmd))
        t1 = time.time()
        ret_val = os.system(uncompress_cmd)
        if ret_val != 0:
            print("Error: Cannot uncompress files in %s" % (output_dir))
            sys.exit(1)
        t2 = time.time()
        print("Uncompressing took %.2f minutes" % ((t2 - t1) / 60.))
    else:
        print("No compressed files found in %s; nothing to uncompress."
              % (output_dir))
    # Create a single genome FASTA file by concatenating the
    # chromosomes together
    genome_output_fname = \
        os.path.join(output_dir, "%s.fa" % (genome))
    # NOTE(review): the original nesting could not be recovered with
    # certainty; this assumes indexing only happens when the genome
    # file is newly created -- confirm.
    if not os.path.isfile(genome_output_fname):
        print("Concatenating genome chromosomes into one file...")
        print(" - Output file: %s" % (genome_output_fname))
        t1 = time.time()
        concat_chrom_cmd = "cat %s/*.fa > %s" % (output_dir,
                                                 genome_output_fname)
        print(" - Concat cmd: %s" % (concat_chrom_cmd))
        ret_val = os.system(concat_chrom_cmd)
        if ret_val != 0:
            print("Error: Could not concatenate genome chromosomes.")
            sys.exit(1)
        # Create an index for resulting genome file
        print("Indexing genome file...")
        samtools_index_cmd = "samtools faidx %s" % (genome_output_fname)
        print(" - Index cmd: %s" % (samtools_index_cmd))
        ret_val = os.system(samtools_index_cmd)
        if ret_val != 0:
            print("Error: Could not index genome file.")
            sys.exit(1)
        t2 = time.time()
        print("Concatenation and indexing took %.2f minutes"
              % ((t2 - t1) / 60.))
def fetch_seq_from_gff(gff_fname, fasta_fname, output_dir,
                       with_flanking_introns=False,
                       flanking_introns_coords=None,
                       overwrite=True,
                       entries_to_include=["gene", "mRNA", "exon"]):
    """
    Fetch sequence from GFF file.

    Outputs:

    (1) GFF file containing an annotation of the sequences.

    (2) FASTA file with the actual sequences.

    If asked, fetch the flanking intronic sequences.

    Flanking regions are marked below:

      U: region of upstream intron
      D: region of downstream intron

                 U           D

      [ U P ]-----[ S E ]-----[ D N ]

                a,b         c,d

    a,b,c,d correspond to optional flanking intron coordinates
    that determine the regions of the upstream/downstream
    introns that should be fetched:

       a, b: negative ints, position relative to 5' splice site of SE
             a < b

       c, d: positive ints, position relative to 3' splice site of SE
             c < d

    Returns the filename of the output FASTA file. When 'overwrite'
    is false and that file already exists, it is returned without
    recomputing. Raises Exception if flanking introns are requested
    but their coordinates cannot be found for a gene.
    'entries_to_include' is not used by this function.
    """
    # Load GFF genes
    # NOTE(review): the database object is not used below; the call is
    # kept in case loading is relied on to validate the GFF -- confirm
    # and remove if not needed.
    gff_db = miso_gff_utils.GFFDatabase(from_filename=gff_fname,
                                        reverse_recs=True)
    file_basename = re.sub(r"\.gff3?", "",
                           os.path.basename(gff_fname))
    output_basename = "%s.event_seqs" % (file_basename)
    if flanking_introns_coords is not None:
        output_basename = "%s.flank_intronic_%s_%s_%s_%s" \
            % (output_basename,
               flanking_introns_coords[0],
               flanking_introns_coords[1],
               flanking_introns_coords[2],
               flanking_introns_coords[3])
    gff_outdir = os.path.join(output_dir, "gff_coords")
    utils.make_dir(gff_outdir)
    gff_output_fname = os.path.join(gff_outdir,
                                    "%s.gff" % (output_basename))
    fasta_output_fname = os.path.join(output_dir,
                                      "%s.fa" % (output_basename))
    if not overwrite:
        if os.path.isfile(fasta_output_fname):
            print("Output file %s exists. Skipping..."
                  % (fasta_output_fname))
            return fasta_output_fname
    print("Outputting GFF coordinates to: %s" % (gff_output_fname))
    if os.path.isfile(gff_output_fname):
        print(" - Overwriting existing file")
    print("Outputting sequences to: %s" % (fasta_output_fname))
    if os.path.isfile(fasta_output_fname):
        print(" - Overwriting existing file")
    genes = gene_utils.load_genes_from_gff(gff_fname)
    gff_out_file = open(gff_output_fname, "w")
    # try/finally so the GFF handle is closed even if a gene fails
    try:
        gff_out = miso_gff_utils.Writer(gff_out_file)
        for gene_id in genes:
            gene_info = genes[gene_id]
            gene_tree = gene_info["hierarchy"]
            gene_obj = gene_info["gene_object"]
            # For mRNA entries, extract the flanking introns of the
            # alternative exon if asked
            event_recs = get_event_recs_from_gene(gene_obj, gene_tree)
            # The None check must come before event_recs is indexed
            if event_recs is None:
                continue
            long_mRNA_id = event_recs["long_mRNA"].get_id()
            # GFF records to write for the current gene:
            # up, se, and dn exons first
            recs_to_write = [event_recs["up_exon"]["record"],
                             event_recs["se_exon"]["record"],
                             event_recs["dn_exon"]["record"]]
            if with_flanking_introns:
                introns_coords = get_flanking_introns_coords(gene_obj)
                if introns_coords is None:
                    raise Exception("Cannot find flanking introns coordinates.")
                # Upstream intron coordinates and length
                up_intron_start, up_intron_end = \
                    introns_coords["up_intron"]
                up_intron_len = up_intron_end - up_intron_start + 1
                # Downstream intron coordinates and length
                dn_intron_start, dn_intron_end = \
                    introns_coords["dn_intron"]
                dn_intron_len = dn_intron_end - dn_intron_start + 1
                # If given custom coordinates, use them instead of
                # entire up/down flanking intronic coordinates.
                se_exon_rec = event_recs["se_exon"]["record"]
                if flanking_introns_coords is not None:
                    # (start,end) of upstream intron sequence
                    a, b = \
                        int(flanking_introns_coords[0]), \
                        int(flanking_introns_coords[1])
                    # (start,end) of downstream intron sequence
                    c, d = \
                        int(flanking_introns_coords[2]), \
                        int(flanking_introns_coords[3])
                    a, b, c, d = \
                        error_check_intronic_coords(a, b, c, d,
                                                    up_intron_len,
                                                    dn_intron_len)
                    # Upstream offsets are added to the start of the
                    # SE record and downstream offsets to its end
                    up_intron_start = se_exon_rec.start + a
                    up_intron_end = se_exon_rec.start + b
                    dn_intron_start = se_exon_rec.end + c
                    dn_intron_end = se_exon_rec.end + d
                # Make GFF records for up/dn intronic sequences
                chrom = se_exon_rec.seqid
                source = se_exon_rec.source
                strand = se_exon_rec.strand
                up_intron_str = "%s.up_intron" % (long_mRNA_id)
                up_intron_rec = \
                    miso_gff_utils.GFF(chrom, source, "intron",
                                       up_intron_start, up_intron_end,
                                       strand=strand,
                                       attributes={"ID": [up_intron_str],
                                                   "Parent": [gene_obj.label]})
                dn_intron_str = "%s.dn_intron" % (long_mRNA_id)
                dn_intron_rec = \
                    miso_gff_utils.GFF(chrom, source, "intron",
                                       dn_intron_start, dn_intron_end,
                                       strand=strand,
                                       attributes={"ID": [dn_intron_str],
                                                   "Parent": [gene_obj.label]})
                recs_to_write.append(up_intron_rec)
                recs_to_write.append(dn_intron_rec)
            # Write out records to GFF
            for rec in recs_to_write:
                gff_out.write(rec)
    finally:
        gff_out_file.close()
    # Output FASTA sequences
    output_fasta_seqs_from_gff(gff_output_fname,
                               fasta_fname,
                               fasta_output_fname)
    return fasta_output_fname
def _default_read_count_filters(event_type):
    """
    Return the default (atleast_inc, atleast_exc, atleast_sum) read count
    filters for the given event type.

    The event type is matched by substring, in the order UTR, SE, AFE,
    ALE, RI. An event type of None, or one matching none of these,
    gets filters of 0.
    """
    if event_type is None:
        return 0, 0, 0
    if "UTR" in event_type:
        return (tandemutr_atleast_inc,
                tandemutr_atleast_exc,
                tandemutr_atleast_sum)
    if "SE" in event_type:
        return (se_atleast_inc,
                se_atleast_exc,
                se_atleast_sum)
    if "AFE" in event_type:
        return (afe_atleast_inc,
                afe_atleast_exc,
                afe_atleast_sum)
    if "ALE" in event_type:
        return (ale_atleast_inc,
                ale_atleast_exc,
                ale_atleast_sum)
    if "RI" in event_type:
        return (ri_atleast_inc,
                ri_atleast_exc,
                ri_atleast_sum)
    return 0, 0, 0


def _keep_if_either_sample_atleast(df, sample1_values, sample2_values,
                                   threshold):
    """
    Return the rows of df for which sample 1 OR sample 2 has a value
    of at least threshold.
    """
    # Each comparison must be parenthesized: '|' binds tighter than '>='
    # in Python, so 'a | b >= n' would be evaluated as '(a | b) >= n'.
    keep = (sample1_values >= threshold) | (sample2_values >= threshold)
    return df[keep]


def filter_comparisons(fname, output_dir,
                       event_type=None,
                       atleast_inc=None,
                       atleast_exc=None,
                       atleast_sum=None,
                       gene_table=None,
                       gene_id_cols=["ensg_id", "gsymbol"],
                       dry_run=False):
    """
    Filter a MISO comparison file (*.miso_bf) by read counts.

    An event is kept only if, for each read class, at least one of the
    two samples has enough reads.

    Parameters:
    -----------
    fname : filename of the MISO comparisons file
    output_dir : output directory; if event_type is given, a
    subdirectory named after the event type is used
    event_type : event type (e.g. "SE", "TandemUTR"), used to pick the
    default read count filters
    atleast_inc : minimum number of inclusion reads
    atleast_exc : minimum number of exclusion reads (forced to 0 for
    TandemUTR events)
    atleast_sum : minimum sum of inclusion and exclusion reads
    gene_table, gene_id_cols, dry_run : currently unused by this function

    Filters left as None fall back to the defaults for the event type.
    Exits with status 1 if fname does not exist.
    """
    fname = utils.pathify(fname)
    output_dir = utils.pathify(output_dir)
    print("Filtering MISO comparisons file...")
    print(" - MISO comparisons: %s" % (fname))
    print(" - Event type: %s" % (event_type))
    if event_type is not None:
        output_dir = os.path.join(output_dir, event_type)
    utils.make_dir(output_dir)
    print(" - Output dir: %s" % (output_dir))
    def_atleast_inc, def_atleast_exc, def_atleast_sum = \
        _default_read_count_filters(event_type)
    # If read count filters are not given, use the default
    if atleast_inc is None:
        atleast_inc = def_atleast_inc
    if atleast_exc is None:
        atleast_exc = def_atleast_exc
    if atleast_sum is None:
        atleast_sum = def_atleast_sum
    # Filter the events file
    if not os.path.isfile(fname):
        print("Error: Cannot find MISO comparisons file %s" % (fname))
        sys.exit(1)
    if not fname.endswith(".miso_bf"):
        print("Warning: MISO comparisons file %s does not end in " \
              ".miso_bf. Are you sure it is a comparisons file?" \
              % (fname))
    # Filter comparisons
    # NOTE(review): 'self' and 'comparisons_df' are not defined in the
    # visible scope of this module-level function; this looks like code
    # moved out of a method. The table presumably needs to be loaded from
    # 'fname' instead -- confirm and fix before relying on this function.
    comparison_counts = \
        self.load_comparisons_counts_from_df(comparisons_df[event_type])
    # Get counts for each read class for sample 1 and sample 2
    comparison_counts = \
        miso_utils.get_counts_by_class("sample1_counts_int",
                                       "sample1",
                                       comparison_counts)
    comparison_counts = \
        miso_utils.get_counts_by_class("sample2_counts_int",
                                       "sample2",
                                       comparison_counts)
    filtered_df = comparison_counts
    # Exclusion reads are only filtered for events other than TandemUTRs
    if event_type is not None and "TandemUTR" in event_type:
        atleast_exc = 0
    # Constitutive read filter; defined for every event type so that the
    # constitutive filter below never hits an unbound name
    atleast_const = 5
    # Filter inclusion reads
    filtered_df = \
        _keep_if_either_sample_atleast(filtered_df,
                                       filtered_df["sample1_inc_counts"],
                                       filtered_df["sample2_inc_counts"],
                                       atleast_inc)
    # Filter exclusion reads
    filtered_df = \
        _keep_if_either_sample_atleast(filtered_df,
                                       filtered_df["sample1_exc_counts"],
                                       filtered_df["sample2_exc_counts"],
                                       atleast_exc)
    # Filter the sum of inclusion and exclusion reads
    sample1_sum = \
        filtered_df["sample1_inc_counts"] + \
        filtered_df["sample1_exc_counts"]
    sample2_sum = \
        filtered_df["sample2_inc_counts"] + \
        filtered_df["sample2_exc_counts"]
    filtered_df = \
        _keep_if_either_sample_atleast(filtered_df,
                                       sample1_sum,
                                       sample2_sum,
                                       atleast_sum)
    # Filter constitutive reads
    filtered_df = \
        _keep_if_either_sample_atleast(filtered_df,
                                       filtered_df["sample1_const_counts"],
                                       filtered_df["sample2_const_counts"],
                                       atleast_const)
    # NOTE(review): 'self' is undefined here as well (see note above).
    self.filtered_events[event_type] = filtered_df
], "hg19": ["SE", "TandemUTR", "A3SS", "A5SS", "ALE", "AFE", "MXE", "RI"] } # Gene tables indexed by genome gene_tables = { "mm9": "/home/yarden/jaen/pipeline_init/mm9/ucsc/", "mm10": "/home/yarden/jaen/pipeline_init/mm9/ucsc/", "hg18": "/home/yarden/jaen/pipeline_init/hg18/ucsc/", "hg19": "/home/yarden/jaen/pipeline_init/hg19/ucsc/" } intersect_events = "intersect_events.py" events_dir = "/home/yarden/jaen/gff-events" events_outdir = os.path.join(events_dir, "annotated_events") utils.make_dir(events_outdir) for genome, events in genomes_to_events.iteritems(): print "Processing genome %s" % (genome) curr_outdir = os.path.join(events_outdir, genome) print " - Output dir: %s" % (curr_outdir) for event in events: if ("AceView" in event) or ("3pseq" in event): continue print "Intersecting %s.." % (event) events_fname = os.path.join(events_dir, genome, "%s.%s.gff3" % (event, genome)) if not os.path.isfile(events_fname): raise Exception, "%s does not exist." % (events_fname) print " - Events file: %s" % (events_fname) cmd = "%s --intersect %s %s --output-dir %s" \
def load_settings(self):
    """
    Load settings for misowrap.

    Reads the settings file named by self.settings_filename and sets,
    in order: basic read/MISO settings, data-related parameters, output
    directories (creating the logs directory), cluster information,
    the logger, the prefiltering flag, event types, paths to the MISO
    scripts and paths to the gene table files.

    Exits with status 1 if the gene tables directory or the
    constitutive exons GFF does not exist.
    """
    settings_info, parsed_settings = \
        misowrap_settings.load_misowrap_settings(self.settings_filename)
    self.settings_info = settings_info
    settings = self.settings_info["settings"]
    data = self.settings_info["data"]
    # Load basic settings about data
    self.read_len = settings["readlen"]
    self.overhang_len = settings["overhanglen"]
    self.miso_bin_dir = utils.pathify(settings["miso_bin_dir"])
    self.miso_settings_filename = \
        utils.pathify(settings["miso_settings_filename"])
    self.miso_events_dir = utils.pathify(settings["miso_events_dir"])
    self.miso_outdir = utils.pathify(settings["miso_output_dir"])
    # Load data-related parameters
    self.bam_files = data["bam_files"]
    if "insert_lens_dir" in data:
        self.insert_lens_dir = utils.pathify(data["insert_lens_dir"])
    # Sample labels
    self.sample_labels = data["sample_labels"]
    # Set output directories
    self.comparisons_dir = os.path.join(self.output_dir, "comparisons")
    self.comparison_groups = data["comparison_groups"]
    self.logs_outdir = os.path.join(self.output_dir, "misowrap_logs")
    # Create necessary directories
    utils.make_dir(self.logs_outdir)
    # Giving a cluster type in the settings turns cluster use on
    if "cluster_type" in settings:
        self.use_cluster = True
        self.cluster_type = settings["cluster_type"]
        self.chunk_jobs = settings["chunk_jobs"]
    if self.use_cluster:
        print("Loading cluster information.")
        # Load cluster object if given a cluster type
        self.load_cluster()
    # Create a logger object
    if self.logger_label is None:
        self.logger_label = "misowrap"
    else:
        # Prefix the label already stored on the instance; there is no
        # local variable named 'logger_label' in this method
        self.logger_label = "misowrap_%s" % (self.logger_label)
    self.logger = utils.get_logger(self.logger_label,
                                   self.logs_outdir)
    # Whether to prefilter MISO events. By default, MISO events are
    # not prefiltered; the default is also recorded in the settings
    self.prefilter_miso = settings.setdefault("prefilter_miso", False)
    # Load event types
    self.load_event_types()
    # Set path to MISO scripts
    self.compare_miso_cmd = os.path.join(self.miso_bin_dir,
                                         "compare_miso")
    self.summarize_miso_cmd = os.path.join(self.miso_bin_dir,
                                           "summarize_miso")
    self.run_events_cmd = os.path.join(self.miso_bin_dir,
                                       "miso")
    self.pe_utils_cmd = os.path.join(self.miso_bin_dir,
                                     "pe_utils")
    # Files related to gene tables
    self.tables_dir = \
        os.path.join(self.settings_info["pipeline-files"]["init_dir"],
                     "ucsc")
    if not os.path.isdir(self.tables_dir):
        print("Error: %s directory does not exist." \
              % (self.tables_dir))
        sys.exit(1)
    self.const_exons_gff = os.path.join(self.tables_dir,
                                        "exons",
                                        "const_exons",
                                        "ensGene.const_exons.gff")
    if not os.path.isfile(self.const_exons_gff):
        print("Error: Const. exons GFF %s does not exist." \
              % (self.const_exons_gff))
        sys.exit(1)