class TestStatsManager(unittest.TestCase):
    """Unit tests for StatsManager and the column-formatting helpers.

    NOTE(review): the column padding inside the expected-table literals below
    is reproduced exactly as found in this file; confirm it against the real
    output of summary()/format_columns() before relying on these tests.
    """

    def setUp(self):
        self.mgr = StatsManager()

    # ---- fixtures -------------------------------------------------------

    def populate_ref(self):
        """Overwrite the manager's reference stats with fixed, known values."""
        reference_values = (
            ("Total sequence length", 100),
            ("Number of genes", 5),
            ("Number of mRNAs", 7),
            ("Number of exons", 7),
            ("Number of introns", 7),
            ("Number of CDS", 7),
            ("Overlapping genes", 3),
            ("Contained genes", 3),
            ("CDS: complete", 3),
            ("CDS: start, no stop", 1),
            ("CDS: stop, no start", 1),
            ("CDS: no stop, no start", 2),
            ("Longest gene", 25),
            ("Longest mRNA", 25),
            ("Longest exon", 21),
            ("Longest intron", 21),
            ("Longest CDS", 20),
            ("Shortest gene", 10),
            ("Shortest mRNA", 10),
            ("Shortest exon", 8),
            ("Shortest intron", 8),
            ("Shortest CDS", 6),
            ("Total gene length", 70),
            ("Total mRNA length", 70),
            ("Total exon length", 65),
            ("Total intron length", 65),
            ("Total CDS length", 60),
        )
        # Assign key by key (same order as before) so whatever mapping type
        # ref_stats is, it sees exactly the item assignments it used to.
        for key, value in reference_values:
            self.mgr.ref_stats[key] = value

    def get_new_dict(self):
        """Return a fresh stats dict describing a single small sequence."""
        return {
            "Total sequence length": 50,
            "Number of genes": 1,
            "Number of mRNAs": 1,
            "Number of exons": 1,
            "Number of introns": 1,
            "Number of CDS": 1,
            "Overlapping genes": 1,
            "Contained genes": 1,
            "CDS: complete": 3,
            "CDS: start, no stop": 1,
            "CDS: stop, no start": 1,
            "CDS: no stop, no start": 2,
            "Longest gene": 30,
            "Longest mRNA": 30,
            "Longest exon": 9,
            "Longest intron": 9,
            "Longest CDS": 8,
            "Shortest gene": 5,
            "Shortest mRNA": 5,
            "Shortest exon": 2,
            "Shortest intron": 2,
            "Shortest CDS": 3,
            "Total gene length": 15,
            "Total mRNA length": 15,
            "Total exon length": 15,
            "Total intron length": 15,
            "Total CDS length": 10,
        }

    # ---- tests ----------------------------------------------------------

    def test_initialize(self):
        self.assertEqual(self.mgr.ref_stats["Number of CDS"], 0)

    def test_clear_alt(self):
        self.mgr.update_alt(self.get_new_dict())
        self.assertEqual(self.mgr.alt_stats["Number of CDS"], 1)
        self.mgr.clear_alt()
        self.assertEqual(self.mgr.alt_stats["Number of CDS"], 0)

    def test_clear_all(self):
        self.populate_ref()
        self.mgr.update_alt(self.get_new_dict())
        self.assertEqual(self.mgr.alt_stats["Number of CDS"], 1)
        self.assertEqual(self.mgr.ref_stats["Number of CDS"], 7)
        self.mgr.clear_all()
        self.assertEqual(self.mgr.alt_stats["Number of CDS"], 0)
        self.assertEqual(self.mgr.ref_stats["Number of CDS"], 0)

    def test_alt_is_empty(self):
        self.assertTrue(self.mgr.alt_is_empty())
        self.mgr.update_alt(self.get_new_dict())
        self.assertFalse(self.mgr.alt_is_empty())

    def test_update_ref(self):
        self.populate_ref()
        newdict = self.get_new_dict()
        self.assertEqual(self.mgr.ref_stats["Total sequence length"], 100)
        self.assertEqual(self.mgr.ref_stats["Shortest CDS"], 6)
        self.assertEqual(self.mgr.ref_stats["Longest gene"], 25)
        self.mgr.update_ref(newdict)
        # Totals add up, "Shortest" takes the minimum, "Longest" the maximum.
        self.assertEqual(self.mgr.ref_stats["Total sequence length"], 150)
        self.assertEqual(self.mgr.ref_stats["Shortest CDS"], 3)
        self.assertEqual(self.mgr.ref_stats["Longest gene"], 30)

    def test_summary_with_modifications(self):
        self.populate_ref()
        self.mgr.update_alt(self.get_new_dict())
        expected = (
            " Reference Genome Modified Genome \n"
            " ---------------- --------------- \n"
            "Total sequence length 100 50 \n"
            "Number of genes 5 1 \n"
            "Number of mRNAs 7 1 \n"
            "Number of exons 7 1 \n"
            "Number of introns 7 1 \n"
            "Number of CDS 7 1 \n"
            "Overlapping genes 3 1 \n"
            "Contained genes 3 1 \n"
            "CDS: complete 3 3 \n"
            "CDS: start, no stop 1 1 \n"
            "CDS: stop, no start 1 1 \n"
            "CDS: no stop, no start 2 2 \n"
            "Total gene length 70 15 \n"
            "Total mRNA length 70 15 \n"
            "Total exon length 65 15 \n"
            "Total intron length 65 15 \n"
            "Total CDS length 60 10 \n"
            "Shortest gene 10 5 \n"
            "Shortest mRNA 10 5 \n"
            "Shortest exon 8 2 \n"
            "Shortest intron 8 2 \n"
            "Shortest CDS 6 3 \n"
            "Longest gene 25 30 \n"
            "Longest mRNA 25 30 \n"
            "Longest exon 21 9 \n"
            "Longest intron 21 9 \n"
            "Longest CDS 20 8 \n"
            "mean gene length 14 15 \n"
            "mean mRNA length 10 15 \n"
            "mean exon length 9 15 \n"
            "mean intron length 9 15 \n"
            "mean CDS length 9 10 \n"
            "% of genome covered by genes 70.0 30.0 \n"
            "% of genome covered by CDS 60.0 20.0 \n"
            "mean mRNAs per gene 1 1 \n"
            "mean exons per mRNA 1 1 \n"
            "mean introns per mRNA 1 1 \n"
        )
        summary = self.mgr.summary()
        self.assertEqual(summary, expected)

    def test_summary_without_modifications(self):
        self.populate_ref()
        expected = (
            " Genome \n"
            " ------ \n"
            "Total sequence length 100 \n"
            "Number of genes 5 \n"
            "Number of mRNAs 7 \n"
            "Number of exons 7 \n"
            "Number of introns 7 \n"
            "Number of CDS 7 \n"
            "Overlapping genes 3 \n"
            "Contained genes 3 \n"
            "CDS: complete 3 \n"
            "CDS: start, no stop 1 \n"
            "CDS: stop, no start 1 \n"
            "CDS: no stop, no start 2 \n"
            "Total gene length 70 \n"
            "Total mRNA length 70 \n"
            "Total exon length 65 \n"
            "Total intron length 65 \n"
            "Total CDS length 60 \n"
            "Shortest gene 10 \n"
            "Shortest mRNA 10 \n"
            "Shortest exon 8 \n"
            "Shortest intron 8 \n"
            "Shortest CDS 6 \n"
            "Longest gene 25 \n"
            "Longest mRNA 25 \n"
            "Longest exon 21 \n"
            "Longest intron 21 \n"
            "Longest CDS 20 \n"
            "mean gene length 14.0 \n"
            "mean mRNA length 10.0 \n"
            "mean exon length 9.28571428571 \n"
            "mean intron length 9.28571428571 \n"
            "mean CDS length 8.57142857143 \n"
            "% of genome covered by genes 0.7 \n"
            "% of genome covered by CDS 0.6 \n"
            "mean mRNAs per gene 1.4 \n"
            "mean exons per mRNA 1.0 \n"
            "mean introns per mRNA 1.0 \n"
        )
        # Still exercises summary() as a smoke test (it must not raise).
        summary = self.mgr.summary()
        # NOTE(review): this comparison was already disabled; the expected
        # text presumably no longer matches summary() -- confirm and either
        # update `expected` or drop it before re-enabling.
        #self.assertEqual(summary, expected)

    def test_format_column(self):
        column = ['a', 'sd', 'asdf']
        self.assertEqual(format_column(column, 5),
                         ['a ', 'sd ', 'asdf '])

    def test_format_columns(self):
        desired_tbl = (
            ' columnA columnB \n'
            ' ------- ------- \n'
            'dog 24 4222 \n'
            'foo 4232234 84 \n'
        )
        column_names = ['columnA', 'columnB']
        dictA = {'foo': 4232234, 'dog': 24}
        dictB = {'foo': 84, 'dog': 4222}
        self.assertEqual(
            format_columns(column_names, ['dog', 'foo'], [dictA, dictB], 1),
            desired_tbl)
class Controller:
    """Batch driver: load a genome, apply optional edits/filters, write results.

    State set up in __init__: the loaded sequences (seqs), every feature
    removed along the way (removed_features), and the filter/stats managers.
    """

    # Returned by stats() when no sequences are loaded. stats() previously
    # referenced this attribute without the class ever defining it.
    no_genome_message = "It looks like no genome is currently loaded.\n"

    # (noun used in the args option name, noun used in the filter name,
    #  noun used in the progress message)
    _FILTER_FEATURES = (
        ("cds", "cds", "CDS"),
        ("exons", "exon", "exons"),
        ("introns", "intron", "introns"),
        ("genes", "gene", "genes"),
    )
    # (args option prefix, verb for the progress message, filter mode)
    _FILTER_ACTIONS = (
        ("remove", "Removing", "REMOVE"),
        ("flag", "Flagging", "FLAG"),
    )

    def __init__(self):
        self.seqs = []
        self.removed_features = []
        self.filter_mgr = FilterManager()
        self.stats_mgr = StatsManager()

    def execute(self, args):
        """At a minimum, write a fasta, gff and tbl to output directory.
        Optionally do more.

        Reads args.fasta and args.gff, then applies whichever optional steps
        are set on args (anno, trim, fix_start_stop, fix_terminal_ns and the
        remove_*/flag_* length filters) before writing to args.out, or to
        "gag_output" when args.out is empty. Exits the process if the fasta
        is missing; returns early if the gff is missing.
        """
        # Verify and read fasta file
        fastapath = args.fasta
        if not os.path.isfile(fastapath):
            sys.stderr.write("Failed to find " + fastapath +
                             ". No genome was loaded.\n")
            sys.exit()
        sys.stderr.write("Reading fasta...\n")
        self.read_fasta(fastapath)
        sys.stderr.write("Done.\n")

        # Create output directory without going through a shell, so names
        # containing spaces or shell metacharacters are handled literally.
        out_dir = args.out if args.out else "gag_output"
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)

        # Verify and read gff file
        # This step also writes genome.ignored.gff,
        # genome.invalid.gff and genome.comments.gff
        gffpath = args.gff
        if not os.path.isfile(gffpath):
            sys.stderr.write("Failed to find " + gffpath +
                             ". No genome was loaded.\n")
            return
        sys.stderr.write("Reading gff...\n")
        self.read_gff(gffpath, out_dir)
        sys.stderr.write("Done.\n")

        # Calculate stats before genome is modified
        sys.stderr.write("Calculating stats on original genome\n")
        for seq in self.seqs:
            self.stats_mgr.update_ref(seq.stats())

        # Optional annotation step
        if args.anno:
            self.annotate_from_file(args.anno)

        # Optional step to trim sequences, subsequences or features
        if args.trim:
            self.trim_from_file(args.trim)

        # Optional step to create start and stop codons
        if args.fix_start_stop:
            sys.stderr.write("Creating start and stop codons...\n")
            self.fix_start_stop_codons()

        # Optional step to fix terminal Ns
        if args.fix_terminal_ns:
            sys.stderr.write("Fixing terminal Ns...\n")
            self.fix_terminal_ns()

        # Optional filtering steps (all REMOVE filters, then all FLAG filters)
        self._apply_length_filters(args)

        self._write_outputs(out_dir)

    def _apply_length_filters(self, args):
        """Apply each remove_*/flag_* length filter whose option is set on args."""
        for option_prefix, verb, mode in self._FILTER_ACTIONS:
            for option_noun, filter_noun, label in self._FILTER_FEATURES:
                for direction in ("shorter", "longer"):
                    option = "%s_%s_%s_than" % (option_prefix, option_noun,
                                                direction)
                    threshold = getattr(args, option)
                    if not threshold:
                        continue
                    sys.stderr.write("%s %s %s than %s...\n"
                                     % (verb, label, direction, threshold))
                    self.apply_filter("%s_%s_than" % (filter_noun, direction),
                                      threshold, mode)

    def _write_outputs(self, out_dir):
        """Compute stats on the modified genome and write all output files."""
        with open(out_dir + '/genome.fasta', 'w') as fasta, \
                open(out_dir + '/genome.gff', 'w') as gff, \
                open(out_dir + '/genome.tbl', 'w') as tbl, \
                open(out_dir + '/genome.proteins.fasta', 'w') as proteins, \
                open(out_dir + '/genome.removed.gff', 'w') as removed, \
                open(out_dir + '/genome.stats', 'w') as stats_file:
            # Calculate stats on modified genome
            sys.stderr.write("Calculating stats on modified genome\n")
            for seq in self.seqs:
                self.stats_mgr.update_alt(seq.stats())

            # Write stats file; writelines emits the summary's pieces in
            # order, exactly as the former element-by-element loop did.
            sys.stderr.write("Writing stats file to " + out_dir + "/ ...\n")
            stats_file.writelines(self.stats_mgr.summary())

            # Write fasta, gff, tbl, protein fasta
            sys.stderr.write("Writing gff, tbl and fasta to " + out_dir +
                             "/ ...\n")
            gff.write("##gff-version 3\n")
            for seq in self.seqs:
                fasta.write(seq.to_fasta())
                gff.write(seq.to_gff())
                tbl.write(seq.to_tbl())
                proteins.write(seq.to_protein_fasta())

            # Write removed.gff
            for feature in self.removed_features:
                removed.write(feature.to_gff())

    def add_annotations_from_list(self, anno_list):
        """Hand anno_list to every loaded sequence."""
        for seq in self.seqs:
            seq.add_annotations_from_list(anno_list)

    def trim_from_file(self, filename):
        """Read trim regions from a .bed file and apply them to the genome."""
        if not os.path.isfile(filename):
            sys.stderr.write("Error: " + filename +
                             " is not a file. Nothing trimmed.\n")
            return
        with open(filename, 'rb') as bed_file:
            trimlist = self.read_bed_file(bed_file)
        if not trimlist:
            sys.stderr.write("Failed to read .bed file; nothing trimmed.\n")
            return
        self.trim_from_list(trimlist)

    def annotate_from_file(self, filename):
        """Read annotations from a tab-separated file and add them to the genome."""
        if not os.path.isfile(filename):
            sys.stderr.write("Error: " + filename +
                             " is not a file. Nothing annotated.\n")
            return
        with open(filename, 'rb') as anno_file:
            annos = self.read_annotation_file(anno_file)
        if not annos:
            sys.stderr.write("Failed to read annotations from " + filename +
                             "; no annotations added.\n")
            return
        sys.stderr.write("Adding annotations to genome ...\n")
        self.add_annotations_from_list(annos)
        sys.stderr.write("...done\n")

    def trim_from_list(self, trimlist):
        """Trim each [header, start, stop] region from the matching sequence."""
        for seq in self.seqs:
            # In the case that there are multiple regions to trim in a single
            # sequence, trim from the end so indices don't get messed up
            to_trim_this_seq = sorted(
                (x for x in trimlist if x[0] == seq.header),
                key=lambda entry: entry[2], reverse=True)
            for entry in to_trim_this_seq:
                removed_genes = seq.trim_region(entry[1], entry[2])
                self.removed_features.extend(removed_genes)
                sys.stderr.write("Trimmed " + entry[0] + " from ")
                sys.stderr.write(str(entry[1]) + " to " + str(entry[2]) + "\n")
            self.remove_empty_features(seq)

    def get_filter_arg(self, filter_name):
        """Return the filter manager's current argument for filter_name."""
        return self.filter_mgr.get_filter_arg(filter_name)

    def apply_filter(self, filter_name, val, filter_mode):
        """Run one filter over every sequence, then drop emptied features."""
        for seq in self.seqs:
            self.filter_mgr.apply_filter(filter_name, val, filter_mode, seq)
            self.remove_empty_features(seq)

    def fix_terminal_ns(self):
        """Remove terminal Ns from every sequence, then drop emptied features."""
        for seq in self.seqs:
            seq.remove_terminal_ns()
            self.remove_empty_features(seq)

    def fix_start_stop_codons(self):
        """Ask every sequence to create its start and stop codons."""
        for seq in self.seqs:
            seq.create_starts_and_stops()

    ## Reading in files

    def read_fasta(self, line):
        """Replace self.seqs with the sequences read from the fasta at path `line`."""
        reader = FastaReader()
        with open(line, 'r') as fasta_file:
            self.seqs = reader.read(fasta_file)

    def read_gff(self, line, prefix):
        """Read genes from the gff at path `line` and attach them to their sequences.

        Takes prefix b/c the reader also returns comments, invalid lines and
        ignored features, and this method writes them to
        <prefix>/genome.comments.gff, genome.invalid.gff and
        genome.ignored.gff. That's kind of messy.
        """
        gffreader = GFFReader()
        with open(line, 'rb') as reader:
            genes, comments, invalids, ignored = gffreader.read_file(reader)
            for gene in genes:
                self.add_gene(gene)
        # Write comments, invalid lines and ignored features
        leftovers = (("/genome.comments.gff", comments),
                     ("/genome.invalid.gff", invalids),
                     ("/genome.ignored.gff", ignored))
        for suffix, entries in leftovers:
            with open(prefix + suffix, 'w') as out_file:
                for entry in entries:
                    out_file.write(entry)

    def read_bed_file(self, io_buffer):
        """Parse tab-separated `name<TAB>start<TAB>stop` lines.

        Returns a list of [name, int(start), int(stop)] entries, or an empty
        list as soon as any line is malformed (wrong column count or a
        non-integer coordinate).
        """
        trimlist = []
        for line in io_buffer:
            splitline = line.strip().split('\t')
            if len(splitline) != 3:
                return []
            try:
                entry = [splitline[0], int(splitline[1]), int(splitline[2])]
            except ValueError:
                # Was `sys.sdterr`, which raised AttributeError instead of
                # reporting the bad line.
                sys.stderr.write("Error reading .bed file. Non-integer value ")
                sys.stderr.write("in column 2 or 3. Here is the line:\n")
                sys.stderr.write(line)
                return []
            trimlist.append(entry)
        return trimlist

    def read_annotation_file(self, io_buffer):
        """Parse three-column tab-separated lines; return [] if any line is malformed."""
        annos = []
        for line in io_buffer:
            splitline = line.strip().split('\t')
            if len(splitline) != 3:
                return []
            annos.append(splitline)
        return annos

    ## Clean up

    def remove_empty_features(self, seq):
        """Removes any empty mRNAs or genes from a seq and adds them to self.removed_features."""
        self.removed_features.extend(seq.remove_empty_mrnas())
        self.removed_features.extend(seq.remove_empty_genes())

    def stats(self):
        """Return a text report: sequence count, stats summary, flagged-feature count.

        Returns no_genome_message when no sequences are loaded.
        """
        if not self.seqs:
            return self.no_genome_message
        number_of_gagflags = 0
        # TODO have stats mgr handle "number of sequences"
        first_line = "Number of sequences: " + str(len(self.seqs)) + "\n"
        sys.stderr.write("Calculating statistics on genome...\n")
        self.stats_mgr.clear_alt()
        for seq in self.seqs:
            self.stats_mgr.update_alt(seq.stats())
            number_of_gagflags += seq.number_of_gagflags()
        last_line = "(" + str(number_of_gagflags) + " features flagged)\n"
        return first_line + self.stats_mgr.summary() + last_line

    ## Utility methods

    def add_gene(self, gene):
        """Add gene to every sequence whose header equals gene.seq_name."""
        for seq in self.seqs:
            if seq.header == gene.seq_name:
                seq.add_gene(gene)

    def get_locus_tag(self):
        """Return the first truthy locus tag found among the sequences."""
        locus_tag = ""
        for seq in self.seqs:
            if locus_tag:
                break
            locus_tag = seq.get_locus_tag()
        return locus_tag

    def remove_from_list(self, bad_list):
        """Remove sequences named in bad_list, then pass bad_list down to the rest.

        Everything removed is appended to self.removed_features.
        """
        # First remove any seqs on the list
        to_remove = [seq for seq in self.seqs if seq.header in bad_list]
        if to_remove:
            for seq in to_remove:
                self.seqs.remove(seq)
                sys.stderr.write("Warning: removing seq " + seq.header + ".\n")
                sys.stderr.write("You must reload genome to get this sequence back.\n")
            self.removed_features.extend(to_remove)
        # Now pass the list down to each seq
        for seq in self.seqs:
            removed_from_seq = seq.remove_from_list(bad_list)
            self.removed_features.extend(removed_from_seq)

    def contains_mrna(self, mrna_id):
        """Return True if any loaded sequence contains the mRNA."""
        return any(seq.contains_mrna(mrna_id) for seq in self.seqs)

    def contains_gene(self, gene_id):
        """Return True if any loaded sequence contains the gene."""
        return any(seq.contains_gene(gene_id) for seq in self.seqs)
class ConsoleController:
    """Interactive-console driver for a loaded genome.

    The loaded sequences are never modified by the output commands: each one
    works on a deep copy to which the configured fixes and filters have been
    applied.
    """

    no_genome_message = ("It looks like no genome is currently loaded. Try the 'load' command.\n" +
                         "Type 'help load' to learn how to use it, or just 'help' for general advice.\n")

    ## Setup, loading and saving sessions

    def __init__(self):
        self.seqs = []
        self.annot = Annotator()
        self.filter_mgr = FilterManager()
        self.stats_mgr = StatsManager()
        self.seq_fixer = SeqFixer()
        # Flag count from the most recent stats() recalculation, so the
        # report stays correct when cached stats are reused.
        self._number_of_gagflags = 0

    def _fixed_and_filtered_copy(self, seq):
        """Return a deep copy of seq with the current fixes and filters applied."""
        cseq = copy.deepcopy(seq)
        self.seq_fixer.fix(cseq)
        self.filter_mgr.apply_filters(cseq)
        return cseq

    def genome_is_loaded(self):
        """Return True if any loaded sequence has genes."""
        for seq in self.seqs:
            if seq.genes:
                return True
        return False

    def barf_folder(self, line):
        """Write the fixed/filtered genome as gff, tbl and fasta files into directory `line`.

        Returns no_genome_message when nothing is loaded, None (after a usage
        message on stderr) when `line` is empty, otherwise a confirmation
        string.
        """
        if not self.seqs:
            return self.no_genome_message
        if len(line) == 0:
            sys.stderr.write("Usage: barffolder <directory>\n")
            return
        # Create the directory without a shell so the name is taken literally.
        if not os.path.isdir(line):
            os.makedirs(line)
        # `with` closes every file, including genome.removed.gff, which the
        # previous explicit close() list left open.
        with open(line+'/genome.gff', 'w') as gff, \
                open(line+'/genome.removed.gff', 'w') as removed_gff, \
                open(line+'/genome.tbl', 'w') as tbl, \
                open(line+'/genome.fasta', 'w') as fasta, \
                open(line+'/genome.mrna.fasta', 'w') as mrna_fasta, \
                open(line+'/genome.cds.fasta', 'w') as cds_fasta, \
                open(line+'/genome.proteins.fasta', 'w') as protein_fasta:
            # Deep copy each seq, apply fixes and filters, write
            sys.stderr.write("Writing gff, tbl and fasta...\n")
            for seq in self.seqs:
                cseq = self._fixed_and_filtered_copy(seq)
                gff.write(cseq.to_gff())
                removed_gff.write(cseq.removed_to_gff())
                tbl.write(cseq.to_tbl())
                mrna_fasta.write(cseq.to_mrna_fasta())
                cds_fasta.write(cseq.to_cds_fasta())
                protein_fasta.write(cseq.to_protein_fasta())
                fasta.write(cseq.to_fasta())
        return "Genome written to " + line

    def load_folder(self, line):
        """Load genome.fasta and genome.gff from directory `line` ("." if empty)."""
        if not line:
            line = "."
        fastapath = line + '/genome.fasta'
        gffpath = line + '/genome.gff'

        # Verify files
        for path in (fastapath, gffpath):
            if not os.path.isfile(path):
                sys.stderr.write("Failed to find " + path +
                                 ". No genome was loaded.\n")
                return

        # Read the fasta
        sys.stderr.write("Reading fasta...\n")
        self.read_fasta(fastapath)
        sys.stderr.write("Done.\n")

        # Read the gff
        sys.stderr.write("Reading gff...\n")
        self.read_gff(gffpath)
        sys.stderr.write("Done.\n")

        # Clear stats; read in new stats
        self.stats_mgr.clear_all()
        self._number_of_gagflags = 0
        for seq in self.seqs:
            self.stats_mgr.update_ref(seq.stats())

    def set_filter_arg(self, filter_name, val):
        self.filter_mgr.set_filter_arg(filter_name, val)

    def get_filter_arg(self, filter_name):
        return self.filter_mgr.get_filter_arg(filter_name)

    def set_filter_remove(self, filter_name, remove):
        self.filter_mgr.set_filter_remove(filter_name, remove)

    def apply_filters(self):
        """Apply the configured filters to the loaded sequences themselves."""
        for seq in self.seqs:
            self.filter_mgr.apply_filters(seq)

    def fix_terminal_ns(self):
        self.seq_fixer.fix_terminal_ns()
        return "Terminal Ns will now be fixed."

    def fix_start_stop_codons(self):
        self.seq_fixer.fix_start_stop_codons()
        return "Will verify and create start/stop codons."

    ## Assorted utilities

    def get_n_seq_ids(self, number):
        """Returns a message indicating the first n seq_ids in the genome.
        If no seqs loaded, returns a message to that effect. If fewer
        than n seqs loaded, returns the seq_ids of those seqs."""
        if not self.seqs:
            return "No sequences currently in memory.\n"
        if len(self.seqs) < number:
            number = len(self.seqs)
        seq_list = []
        for seq in self.seqs:
            seq_list.append(seq.header)
            if len(seq_list) == number:
                break
        result = "First " + str(len(seq_list)) + " seq ids are: "
        result += format_list_with_strings(seq_list)
        return result

    def _first_n_ids(self, number, ids_of):
        """Collect up to `number` ids, visiting each sequence at most once.

        A single pass is what makes this terminate when the genome holds
        fewer than `number` ids (or none), and keeps ids from being repeated.
        """
        collected = []
        for seq in self.seqs:
            if len(collected) >= number:
                break
            collected.extend(ids_of(seq))
        return collected[:number] if len(collected) > number else collected

    def get_n_gene_ids(self, number):
        """Returns a message indicating the first n gene_ids in the genome.
        If no genes are present, returns a message to that effect. If fewer
        than n genes are loaded, returns the gene_ids of those genes."""
        genes_list = self._first_n_ids(number, lambda seq: seq.get_gene_ids())
        if not genes_list:
            return "No genes currently in memory.\n"
        result = "First " + str(len(genes_list)) + " gene ids are: "
        result += format_list_with_strings(genes_list)
        return result

    def get_n_mrna_ids(self, number):
        """Returns a message indicating the first n mrna_ids in the genome.
        If no mrnas are present, returns a message to that effect. If fewer
        than n mrnas are loaded, returns the mrna_ids of those mrnas."""
        mrnas_list = self._first_n_ids(number, lambda seq: seq.get_mrna_ids())
        if not mrnas_list:
            return "No mrnas currently in memory.\n"
        result = "First " + str(len(mrnas_list)) + " mrna ids are: "
        result += format_list_with_strings(mrnas_list)
        return result

    ## Reading in files

    def read_fasta(self, line):
        """Replace self.seqs with the sequences read from the fasta at path `line`."""
        reader = FastaReader()
        with open(line, 'r') as fasta_file:
            self.seqs = reader.read(fasta_file)

    def read_gff(self, line):
        """Read genes from the gff at path `line` and attach them to their sequences."""
        gffreader = GFFReader()
        with open(line, 'rb') as reader:
            result = gffreader.read_file(reader)
            # NOTE(review): Controller.read_gff in this file unpacks a
            # 4-tuple (genes, comments, invalids, ignored) from read_file.
            # Accept that shape as well as a bare collection of genes;
            # confirm against GFFReader which one is current.
            genes = result[0] if isinstance(result, tuple) else result
            for gene in genes:
                self.add_gene(gene)

    ## Output info to console

    def barf_gene_gff(self, line):
        """Return the gff text for gene `line`; None if no sequence contains it."""
        if not self.seqs:
            return self.no_genome_message
        for seq in self.seqs:
            if seq.contains_gene(line):
                return self._fixed_and_filtered_copy(seq).gene_to_gff(line)

    def barf_seq(self, line):
        """Return a sequence, or a sub-range of it, named on the command line.

        `line` is either "<seq_id>" or "<seq_id> <start> <stop>"; any other
        argument count returns the usage string. Returns None if no loaded
        sequence has that id.
        """
        if not self.seqs:
            return self.no_genome_message
        args = line.split(' ')
        if len(args) == 1:
            bounds = ()
        elif len(args) == 3:
            bounds = (int(args[1]), int(args[2]))
        else:
            return "Usage: barfseq <seq_id> <start_index> <end_index>\n"
        seq_id = args[0]
        for seq in self.seqs:
            if seq.header == seq_id:
                return self._fixed_and_filtered_copy(seq).get_subseq(*bounds)

    def barf_cds_seq(self, line):
        """Return the CDS sequence of mRNA `line`, or an error message if not found."""
        if not self.seqs:
            return self.no_genome_message
        for seq in self.seqs:
            if seq.contains_mrna(line):
                return self._fixed_and_filtered_copy(seq).extract_cds_seq(line)
        return "Error: Couldn't find mRNA.\n"

    def cds_to_gff(self, line):
        """Return the CDS of mRNA `line` as gff, or an error message if not found."""
        if not self.seqs:
            return self.no_genome_message
        for seq in self.seqs:
            if seq.contains_mrna(line):
                return self._fixed_and_filtered_copy(seq).cds_to_gff(line)
        return "Error: Couldn't find mRNA.\n"

    def cds_to_tbl(self, line):
        """Return the CDS of mRNA `line` as tbl, or an error message if not found."""
        if not self.seqs:
            return self.no_genome_message
        for seq in self.seqs:
            if seq.contains_mrna(line):
                return self._fixed_and_filtered_copy(seq).cds_to_tbl(line)
        return "Error: Couldn't find mRNA.\n"

    def barf_gene_tbl(self, line):
        """Return tbl text for gene `line` from every sequence that contains it."""
        if not self.seqs:
            return self.no_genome_message
        output = ">Feature SeqId\n"
        for seq in self.seqs:
            if seq.contains_gene(line):
                output += self._fixed_and_filtered_copy(seq).gene_to_tbl(line)
        return output

    def stats(self):
        """Return a text report: sequence count, stats summary, flagged-feature count.

        Modified-genome stats are recalculated only when the filter manager
        or seq fixer is marked dirty; otherwise the previous results,
        including the flagged-feature count, are reused.
        """
        if not self.seqs:
            return self.no_genome_message
        first_line = "Number of sequences: " + str(len(self.seqs)) + "\n"
        if self.filter_mgr.dirty or self.seq_fixer.dirty:
            self.stats_mgr.clear_alt()
            sys.stderr.write("Calculating statistics on genome...\n")
            flagged = 0
            for seq in self.seqs:
                # Deep copy seq, apply fixes and filters, then update stats
                cseq = self._fixed_and_filtered_copy(seq)
                self.stats_mgr.update_alt(cseq.stats())
                flagged += cseq.number_of_gagflags()
            self._number_of_gagflags = flagged
            self.filter_mgr.dirty = False
            self.seq_fixer.dirty = False
        # getattr keeps this safe for instances built without __init__.
        number_of_gagflags = getattr(self, '_number_of_gagflags', 0)
        last_line = "(" + str(number_of_gagflags) + " features flagged)\n"
        return first_line + self.stats_mgr.summary() + last_line

    ## Utility methods

    def add_gene(self, gene):
        """Add gene to every sequence whose header equals gene.seq_name."""
        for seq in self.seqs:
            if seq.header == gene.seq_name:
                seq.add_gene(gene)

    def get_locus_tag(self):
        """Return the first truthy locus tag found among the sequences."""
        locus_tag = ""
        for seq in self.seqs:
            if locus_tag:
                break
            locus_tag = seq.get_locus_tag()
        return locus_tag

    def clear_seqs(self):
        """Empty the sequence list in place."""
        self.seqs[:] = []

    def contains_mrna(self, mrna_id):
        """Return True if any loaded sequence contains the mRNA."""
        return any(seq.contains_mrna(mrna_id) for seq in self.seqs)

    def contains_gene(self, gene_id):
        """Return True if any loaded sequence contains the gene."""
        return any(seq.contains_gene(gene_id) for seq in self.seqs)

    def contains_seq(self, seq_id):
        """Return True if a loaded sequence has header seq_id."""
        return any(seq.header == seq_id for seq in self.seqs)

    def can_write_to_path(self, path):
        """Return True if path has no internal whitespace and does not exist yet."""
        if len(path.split()) > 1:
            return False
        return not os.path.exists(path)
class Controller(object):
    """Run a one-shot GAG session: load a genome, modify it, write the results.

    State kept on the instance:
      seqs             -- sequences returned by the fasta reader
      removed_features -- features dropped by trimming, filtering or removal;
                          execute() writes them to genome.removed.gff
      filter_mgr       -- FilterManager used by apply_filter()
      stats_mgr        -- StatsManager holding reference and modified stats
    """

    # (args attribute stem, filter name stem, label used in log messages)
    _LENGTH_FILTERS = (
        ("cds", "cds", "CDS"),
        ("exons", "exon", "exons"),
        ("introns", "intron", "introns"),
        ("genes", "gene", "genes"),
    )

    # (args attribute prefix, filter mode, verb used in log messages)
    _FILTER_MODES = (
        ("remove", "REMOVE", "Removing"),
        ("flag", "FLAG", "Flagging"),
    )

    # Suffixes of the genome.* files written by execute()
    _OUTPUT_SUFFIXES = ("fasta", "gff", "tbl", "proteins.fasta",
                        "mrna.fasta", "removed.gff", "stats")

    def __init__(self):
        self.seqs = []
        self.removed_features = []
        self.filter_mgr = FilterManager()
        self.stats_mgr = StatsManager()

    def execute(self, args):
        """At a minimum, write a fasta, gff and tbl to output directory.

        Optionally annotate, trim, fix and filter the genome first, depending
        on which attributes of ``args`` are set.

        Exits the process if ``args.fasta`` is not a file; returns early
        (after creating the output directory) if ``args.gff`` is not a file.
        """
        # Verify and read fasta file
        fastapath = args.fasta
        if not os.path.isfile(fastapath):
            sys.stderr.write("Failed to find " + fastapath + ". No genome was loaded.\n")
            sys.exit()
        sys.stderr.write("Reading fasta...\n")
        self.read_fasta(fastapath)
        sys.stderr.write("Done.\n")

        # Create output directory. Done through os rather than a shell
        # command so paths containing spaces or shell metacharacters work.
        out_dir = args.out if args.out else "gag_output"
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)

        # Verify and read gff file
        # This step also writes genome.ignored.gff,
        # genome.invalid.gff and genome.comments.gff
        gffpath = args.gff
        if not os.path.isfile(gffpath):
            sys.stderr.write("Failed to find " + gffpath + ". No genome was loaded.\n")
            return
        sys.stderr.write("Reading gff...\n")
        self.read_gff(gffpath, out_dir)
        sys.stderr.write("Done.\n")

        # Calculate stats before genome is modified
        sys.stderr.write("Calculating stats on original genome\n")
        for seq in self.seqs:
            self.stats_mgr.update_ref(seq.stats())

        # Optional annotation step
        if args.anno:
            self.annotate_from_file(args.anno)

        # Optional step to trim sequences, subsequences or features
        if args.trim:
            self.trim_from_file(args.trim)

        # Optional step to create start and stop codons
        if args.fix_start_stop:
            sys.stderr.write("Creating start and stop codons...\n")
            self.fix_start_stop_codons()

        # Optional step to fix terminal Ns
        if args.fix_terminal_ns:
            sys.stderr.write("Fixing terminal Ns...\n")
            self.fix_terminal_ns()

        # Optional filtering steps (all removals first, then all flags)
        self._apply_length_filters(args)

        # Calculate stats on modified genome
        sys.stderr.write("Calculating stats on modified genome\n")
        for seq in self.seqs:
            self.stats_mgr.update_alt(seq.stats())

        self._write_outputs(out_dir, args)

    def _apply_length_filters(self, args):
        """Apply every remove_*/flag_* length filter whose value is set on args."""
        for prefix, mode, verb in self._FILTER_MODES:
            for attr_stem, filter_stem, label in self._LENGTH_FILTERS:
                for bound in ("shorter", "longer"):
                    limit = getattr(args, "%s_%s_%s_than" % (prefix, attr_stem, bound))
                    if not limit:
                        continue
                    sys.stderr.write("%s %s %s than %s...\n" % (verb, label, bound, limit))
                    self.apply_filter("%s_%s_than" % (filter_stem, bound), limit, mode)

    def _write_outputs(self, out_dir, args):
        """Write the genome.* output files into out_dir, closing all of them."""
        handles = {}
        try:
            for suffix in self._OUTPUT_SUFFIXES:
                handles[suffix] = open(out_dir + '/genome.' + suffix, 'w')

            # Write stats file
            sys.stderr.write("Writing stats file to " + out_dir + "/ ...\n")
            handles["stats"].write(self.stats_mgr.summary())

            # Write fasta, gff, tbl, protein fasta
            sys.stderr.write("Writing gff, tbl and fasta to " + out_dir + "/ ...\n")
            handles["gff"].write("##gff-version 3\n")
            for seq in self.seqs:
                if seq.is_empty():
                    continue
                handles["fasta"].write(seq.to_fasta())
                handles["gff"].write(seq.to_gff())
                if not args.skip_empty_scaffolds or len(seq.genes) > 0:
                    # Possibly skip empty sequences
                    handles["tbl"].write(seq.to_tbl())
                handles["proteins.fasta"].write(seq.to_protein_fasta())
                handles["mrna.fasta"].write(seq.to_mrna_fasta())

            # Write removed.gff
            for feature in self.removed_features:
                handles["removed.gff"].write(feature.to_gff())
        finally:
            # Close whatever was opened, even if a write above failed
            for handle in handles.values():
                handle.close()

    def add_annotations_from_list(self, anno_list):
        """Pass anno_list to every loaded sequence."""
        for seq in self.seqs:
            seq.add_annotations_from_list(anno_list)

    def trim_from_file(self, filename):
        """Read trim regions from a .bed file and apply them; log on failure."""
        if not os.path.isfile(filename):
            sys.stderr.write("Error: " + filename + " is not a file. Nothing trimmed.\n")
            return
        with open(filename, 'rb') as bed_file:
            trimlist = read_bed_file(bed_file)
        if not trimlist:
            sys.stderr.write("Failed to read .bed file; nothing trimmed.\n")
            return
        self.trim_from_list(trimlist)

    def annotate_from_file(self, filename):
        """Read annotations from filename and add them; log on failure."""
        if not os.path.isfile(filename):
            sys.stderr.write("Error: " + filename + " is not a file. Nothing annotated.\n")
            return
        with open(filename, 'rb') as anno_file:
            annos = read_annotation_file(anno_file)
        if not annos:
            sys.stderr.write("Failed to read annotations from " + filename + "; no annotations added.\n")
            return
        sys.stderr.write("Adding annotations to genome ...\n")
        self.add_annotations_from_list(annos)
        sys.stderr.write("...done\n")

    def trim_from_list(self, trimlist):
        """Trim each (header, start, stop) entry from the matching sequence.

        Genes returned by the sequence's trim_region() are recorded in
        self.removed_features.
        """
        for seq in self.seqs:
            # In the case that there are multiple regions to trim in a single
            # sequence, trim from the end so indices don't get messed up
            to_trim_this_seq = sorted(
                (entry for entry in trimlist if entry[0] == seq.header),
                key=lambda entry: entry[2],
                reverse=True)
            for entry in to_trim_this_seq:
                removed_genes = seq.trim_region(entry[1], entry[2])
                self.removed_features.extend(removed_genes)
                sys.stderr.write("Trimmed " + entry[0] + " from ")
                sys.stderr.write(str(entry[1]) + " to " + str(entry[2]) + "\n")
            self.remove_empty_features(seq)

    def get_filter_arg(self, filter_name):
        """Return the filter manager's current argument for filter_name."""
        return self.filter_mgr.get_filter_arg(filter_name)

    def apply_filter(self, filter_name, val, filter_mode):
        """Apply one filter to every sequence, then drop emptied features."""
        for seq in self.seqs:
            self.filter_mgr.apply_filter(filter_name, val, filter_mode, seq)
            self.remove_empty_features(seq)

    def fix_terminal_ns(self):
        """Remove terminal Ns from every sequence, then drop emptied features."""
        for seq in self.seqs:
            seq.remove_terminal_ns()
            self.remove_empty_features(seq)

    def fix_start_stop_codons(self):
        """Ask every sequence to create its start and stop codons."""
        for seq in self.seqs:
            seq.create_starts_and_stops()

    # Reading in files

    def read_fasta(self, line):
        """Load self.seqs from the fasta file at path ``line``."""
        reader = FastaReader()
        with open(line, 'r') as fasta_file:
            self.seqs = reader.read(fasta_file)

    def read_gff(self, line, prefix):
        """Read the gff at path ``line`` and add its genes to the sequences.

        Takes prefix b/c reader returns comments, invalids, ignored
        and this method writes them to output files under that directory.
        """
        gffreader = GFFReader()
        with open(line, 'rb') as gff_file:
            genes, comments, invalids, ignored = gffreader.read_file(gff_file)
        for gene in genes:
            self.add_gene(gene)

        # Write comments, invalid lines and ignored features
        leftovers = (("comments", comments), ("invalid", invalids), ("ignored", ignored))
        for label, entries in leftovers:
            with open(prefix + "/genome." + label + ".gff", 'w') as out_file:
                for entry in entries:
                    out_file.write(entry)

    # Clean up

    def remove_empty_features(self, seq):
        """Removes any empty mRNAs or genes from a seq and adds them to self.removed_features."""
        self.removed_features.extend(seq.remove_empty_mrnas())
        self.removed_features.extend(seq.remove_empty_genes())

    def stats(self):
        """Return a stats summary string, or "error: no sequences" if none are loaded.

        Recomputes the modified-genome stats from the current sequences and
        appends the total number of flagged features.
        """
        if not self.seqs:
            return "error: no sequences"
        number_of_gagflags = 0
        # TODO have stats mgr handle "number of sequences"
        first_line = "Number of sequences: " + str(len(self.seqs)) + "\n"
        sys.stderr.write("Calculating statistics on genome...\n")
        self.stats_mgr.clear_alt()
        for seq in self.seqs:
            self.stats_mgr.update_alt(seq.stats())
            number_of_gagflags += seq.number_of_gagflags()
        last_line = "(" + str(number_of_gagflags) + " features flagged)\n"
        return first_line + self.stats_mgr.summary() + last_line

    # Utility methods

    def add_gene(self, gene):
        """Add gene to every sequence whose header equals gene.seq_name."""
        for seq in self.seqs:
            if seq.header == gene.seq_name:
                seq.add_gene(gene)

    def get_locus_tag(self):
        """Return the first truthy locus tag reported by a sequence.

        With no sequences the result is ""; if no sequence reports a truthy
        tag, the last sequence's value is returned.
        """
        locus_tag = ""
        for seq in self.seqs:
            locus_tag = seq.get_locus_tag()
            if locus_tag:
                break
        return locus_tag

    def remove_from_list(self, bad_list):
        """Remove sequences and features named in bad_list.

        Everything removed is appended to self.removed_features.
        """
        # First remove any seqs on the list
        to_remove = [seq for seq in self.seqs if seq.header in bad_list]
        if to_remove:
            for seq in to_remove:
                self.seqs.remove(seq)
                sys.stderr.write("Warning: removing seq " + seq.header + ".\n")
                sys.stderr.write("You must reload genome to get this sequence back.\n")
            self.removed_features.extend(to_remove)
        # Now pass the list down to each seq
        for seq in self.seqs:
            removed_from_seq = seq.remove_from_list(bad_list)
            self.removed_features.extend(removed_from_seq)

    def contains_mrna(self, mrna_id):
        """Return True if any sequence contains the mRNA mrna_id."""
        for seq in self.seqs:
            if seq.contains_mrna(mrna_id):
                return True
        return False

    def contains_gene(self, gene_id):
        """Return True if any sequence contains the gene gene_id."""
        for seq in self.seqs:
            if seq.contains_gene(gene_id):
                return True
        return False
class TestStatsManager(unittest.TestCase):
    """Unit tests for StatsManager and the column-formatting helpers.

    Uses assertEqual throughout; the assertEquals alias is deprecated and
    no longer exists in recent Python versions.
    """

    def setUp(self):
        self.mgr = StatsManager()

    def test_initialize(self):
        self.assertEqual(self.mgr.ref_stats["Number of CDS"], 0)

    def test_clear_alt(self):
        self.mgr.update_alt(self.get_new_dict())
        self.assertEqual(self.mgr.alt_stats["Number of CDS"], 1)
        self.mgr.clear_alt()
        self.assertEqual(self.mgr.alt_stats["Number of CDS"], 0)

    def test_clear_all(self):
        self.populate_ref()
        self.mgr.update_alt(self.get_new_dict())
        self.assertEqual(self.mgr.alt_stats["Number of CDS"], 1)
        self.assertEqual(self.mgr.ref_stats["Number of CDS"], 7)
        self.mgr.clear_all()
        self.assertEqual(self.mgr.alt_stats["Number of CDS"], 0)
        self.assertEqual(self.mgr.ref_stats["Number of CDS"], 0)

    def populate_ref(self):
        """Fill the manager's reference stats with fixed fixture values."""
        fixture = (
            ("Total sequence length", 100),
            ("Number of genes", 5),
            ("Number of mRNAs", 7),
            ("Number of exons", 7),
            ("Number of introns", 7),
            ("Number of CDS", 7),
            ("CDS: complete", 3),
            ("CDS: start, no stop", 1),
            ("CDS: stop, no start", 1),
            ("CDS: no stop, no start", 2),
            ("Longest gene", 25),
            ("Longest mRNA", 25),
            ("Longest exon", 21),
            ("Longest intron", 21),
            ("Longest CDS", 20),
            ("Shortest gene", 10),
            ("Shortest mRNA", 10),
            ("Shortest exon", 8),
            ("Shortest intron", 8),
            ("Shortest CDS", 6),
            ("Total gene length", 70),
            ("Total mRNA length", 70),
            ("Total exon length", 65),
            ("Total intron length", 65),
            ("Total CDS length", 60),
        )
        for key, value in fixture:
            self.mgr.ref_stats[key] = value

    def get_new_dict(self):
        """Return a fresh stats dict used as update input by the tests."""
        return {
            "Total sequence length": 50,
            "Number of genes": 1,
            "Number of mRNAs": 1,
            "Number of exons": 1,
            "Number of introns": 1,
            "Number of CDS": 1,
            "CDS: complete": 3,
            "CDS: start, no stop": 1,
            "CDS: stop, no start": 1,
            "CDS: no stop, no start": 2,
            "Longest gene": 30,
            "Longest mRNA": 30,
            "Longest exon": 9,
            "Longest intron": 9,
            "Longest CDS": 8,
            "Shortest gene": 5,
            "Shortest mRNA": 5,
            "Shortest exon": 2,
            "Shortest intron": 2,
            "Shortest CDS": 3,
            "Total gene length": 15,
            "Total mRNA length": 15,
            "Total exon length": 15,
            "Total intron length": 15,
            "Total CDS length": 10,
        }

    def test_alt_is_empty(self):
        self.assertTrue(self.mgr.alt_is_empty())
        self.mgr.update_alt(self.get_new_dict())
        self.assertFalse(self.mgr.alt_is_empty())

    def test_update_ref(self):
        self.populate_ref()
        newdict = self.get_new_dict()
        self.assertEqual(self.mgr.ref_stats["Total sequence length"], 100)
        self.assertEqual(self.mgr.ref_stats["Shortest CDS"], 6)
        self.assertEqual(self.mgr.ref_stats["Longest gene"], 25)
        self.mgr.update_ref(newdict)
        self.assertEqual(self.mgr.ref_stats["Total sequence length"], 150)
        self.assertEqual(self.mgr.ref_stats["Shortest CDS"], 3)
        self.assertEqual(self.mgr.ref_stats["Longest gene"], 30)

    def test_summary_with_modifications(self):
        self.populate_ref()
        self.mgr.update_alt(self.get_new_dict())
        expected = " Reference Genome Modified Genome \n"
        expected += " ---------------- --------------- \n"
        expected += "Total sequence length 100 50 \n"
        expected += "Number of genes 5 1 \n"
        expected += "Number of mRNAs 7 1 \n"
        expected += "Number of exons 7 1 \n"
        expected += "Number of introns 7 1 \n"
        expected += "Number of CDS 7 1 \n"
        expected += "CDS: complete 3 3 \n"
        expected += "CDS: start, no stop 1 1 \n"
        expected += "CDS: stop, no start 1 1 \n"
        expected += "CDS: no stop, no start 2 2 \n"
        expected += "Total gene length 70 15 \n"
        expected += "Total mRNA length 70 15 \n"
        expected += "Total exon length 65 15 \n"
        expected += "Total intron length 65 15 \n"
        expected += "Total CDS length 60 10 \n"
        expected += "Shortest gene 10 5 \n"
        expected += "Shortest mRNA 10 5 \n"
        expected += "Shortest exon 8 2 \n"
        expected += "Shortest intron 8 2 \n"
        expected += "Shortest CDS 6 3 \n"
        expected += "Longest gene 25 30 \n"
        expected += "Longest mRNA 25 30 \n"
        expected += "Longest exon 21 9 \n"
        expected += "Longest intron 21 9 \n"
        expected += "Longest CDS 20 8 \n"
        expected += "mean gene length 14 15 \n"
        expected += "mean mRNA length 10 15 \n"
        expected += "mean exon length 9 15 \n"
        expected += "mean intron length 9 15 \n"
        expected += "mean CDS length 9 10 \n"
        expected += "% of genome covered by genes 70.0 30.0 \n"
        expected += "% of genome covered by CDS 60.0 20.0 \n"
        expected += "mRNAs per gene 1 1 \n"
        expected += "exons per mRNA 1 1 \n"
        expected += "introns per mRNA 1 1 \n"
        summary = self.mgr.summary()
        self.assertEqual(summary, expected)

    def test_summary_without_modifications(self):
        self.populate_ref()
        expected = " Genome \n"
        expected += " ------ \n"
        expected += "Total sequence length 100 \n"
        expected += "Number of genes 5 \n"
        expected += "Number of mRNAs 7 \n"
        expected += "Number of exons 7 \n"
        expected += "Number of introns 7 \n"
        expected += "Number of CDS 7 \n"
        expected += "CDS: complete 3 \n"
        expected += "CDS: start, no stop 1 \n"
        expected += "CDS: stop, no start 1 \n"
        expected += "CDS: no stop, no start 2 \n"
        expected += "Total gene length 70 \n"
        expected += "Total mRNA length 70 \n"
        expected += "Total exon length 65 \n"
        expected += "Total intron length 65 \n"
        expected += "Total CDS length 60 \n"
        expected += "Shortest gene 10 \n"
        expected += "Shortest mRNA 10 \n"
        expected += "Shortest exon 8 \n"
        expected += "Shortest intron 8 \n"
        expected += "Shortest CDS 6 \n"
        expected += "Longest gene 25 \n"
        expected += "Longest mRNA 25 \n"
        expected += "Longest exon 21 \n"
        expected += "Longest intron 21 \n"
        expected += "Longest CDS 20 \n"
        expected += "mean gene length 14.0 \n"
        expected += "mean mRNA length 10.0 \n"
        expected += "mean exon length 9.28571428571 \n"
        expected += "mean intron length 9.28571428571 \n"
        expected += "mean CDS length 8.57142857143 \n"
        expected += "% of genome covered by genes 0.7 \n"
        expected += "% of genome covered by CDS 0.6 \n"
        expected += "mRNAs per gene 1.4 \n"
        expected += "exons per mRNA 1.0 \n"
        expected += "introns per mRNA 1.0 \n"
        summary = self.mgr.summary()
        # The comparison below is disabled, so this test currently only
        # checks that summary() runs when no modified stats are present.
        # self.assertEqual(summary, expected)

    def test_format_column(self):
        column = ["a", "sd", "asdf"]
        self.assertEqual(format_column(column, 5), ["a ", "sd ", "asdf "])

    def test_format_columns(self):
        desired_tbl = (
            " columnA columnB \n"
            " ------- ------- \n"
            "dog 24 4222 \n"
            "foo 4232234 84 \n"
        )
        column_names = ["columnA", "columnB"]
        dictA = {"foo": 4232234, "dog": 24}
        dictB = {"foo": 84, "dog": 4222}
        self.assertEqual(format_columns(column_names, ["dog", "foo"], [dictA, dictB], 1), desired_tbl)