示例#1
0
 def setUp(self):
     """Create the StatsManager under test and store it on self.mgr."""
     self.mgr = StatsManager()
示例#2
0
文件: controller.py 项目: desiro/GAG
 def __init__(self):
     """Start with no sequences, no removed features and fresh managers."""
     # Both collections begin empty; each gets its own list object.
     self.seqs, self.removed_features = [], []
     self.filter_mgr = FilterManager()
     self.stats_mgr = StatsManager()
示例#3
0
class TestStatsManager(unittest.TestCase):
    def setUp(self):
        """Create the StatsManager under test and store it on self.mgr."""
        self.mgr = StatsManager()

    def test_initialize(self):
        self.assertEquals(self.mgr.ref_stats["Number of CDS"], 0)

    def test_clear_alt(self):
        """clear_alt() resets the alternate CDS count back to zero."""
        self.mgr.update_alt(self.get_new_dict())
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(self.mgr.alt_stats["Number of CDS"], 1)
        self.mgr.clear_alt()
        self.assertEqual(self.mgr.alt_stats["Number of CDS"], 0)

    def test_clear_all(self):
        """clear_all() zeroes the CDS count in both reference and alternate stats."""
        self.populate_ref()
        self.mgr.update_alt(self.get_new_dict())
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(self.mgr.alt_stats["Number of CDS"], 1)
        self.assertEqual(self.mgr.ref_stats["Number of CDS"], 7)
        self.mgr.clear_all()
        self.assertEqual(self.mgr.alt_stats["Number of CDS"], 0)
        self.assertEqual(self.mgr.ref_stats["Number of CDS"], 0)

    def populate_ref(self):
        self.mgr.ref_stats["Total sequence length"] = 100
        self.mgr.ref_stats["Number of genes"] = 5
        self.mgr.ref_stats["Number of mRNAs"] = 7
        self.mgr.ref_stats["Number of exons"] = 7
        self.mgr.ref_stats["Number of introns"] = 7
        self.mgr.ref_stats["Number of CDS"] = 7
        self.mgr.ref_stats["Overlapping genes"] = 3
        self.mgr.ref_stats["Contained genes"] = 3
        self.mgr.ref_stats["CDS: complete"] = 3
        self.mgr.ref_stats["CDS: start, no stop"] = 1
        self.mgr.ref_stats["CDS: stop, no start"] = 1
        self.mgr.ref_stats["CDS: no stop, no start"] = 2
        self.mgr.ref_stats["Longest gene"] = 25
        self.mgr.ref_stats["Longest mRNA"] = 25
        self.mgr.ref_stats["Longest exon"] = 21
        self.mgr.ref_stats["Longest intron"] = 21
        self.mgr.ref_stats["Longest CDS"] = 20
        self.mgr.ref_stats["Shortest gene"] = 10
        self.mgr.ref_stats["Shortest mRNA"] = 10
        self.mgr.ref_stats["Shortest exon"] = 8
        self.mgr.ref_stats["Shortest intron"] = 8
        self.mgr.ref_stats["Shortest CDS"] = 6
        self.mgr.ref_stats["Total gene length"] = 70
        self.mgr.ref_stats["Total mRNA length"] = 70
        self.mgr.ref_stats["Total exon length"] = 65
        self.mgr.ref_stats["Total intron length"] = 65
        self.mgr.ref_stats["Total CDS length"] = 60

    def get_new_dict(self):
        """Return a new dict of fixture stats, built fresh on every call."""
        return {
            "Total sequence length": 50,
            "Number of genes": 1,
            "Number of mRNAs": 1,
            "Number of exons": 1,
            "Number of introns": 1,
            "Number of CDS": 1,
            "Overlapping genes": 1,
            "Contained genes": 1,
            "CDS: complete": 3,
            "CDS: start, no stop": 1,
            "CDS: stop, no start": 1,
            "CDS: no stop, no start": 2,
            "Longest gene": 30,
            "Longest mRNA": 30,
            "Longest exon": 9,
            "Longest intron": 9,
            "Longest CDS": 8,
            "Shortest gene": 5,
            "Shortest mRNA": 5,
            "Shortest exon": 2,
            "Shortest intron": 2,
            "Shortest CDS": 3,
            "Total gene length": 15,
            "Total mRNA length": 15,
            "Total exon length": 15,
            "Total intron length": 15,
            "Total CDS length": 10,
        }

    def test_alt_is_empty(self):
        """alt_is_empty() is true before an update_alt call and false after it."""
        manager = self.mgr
        self.assertTrue(manager.alt_is_empty())
        manager.update_alt(self.get_new_dict())
        self.assertFalse(manager.alt_is_empty())

    def test_update_ref(self):
        """update_ref() sums totals, lowers shortest values and raises longest ones."""
        self.populate_ref()
        newdict = self.get_new_dict()
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(self.mgr.ref_stats["Total sequence length"], 100)
        self.assertEqual(self.mgr.ref_stats["Shortest CDS"], 6)
        self.assertEqual(self.mgr.ref_stats["Longest gene"], 25)
        self.mgr.update_ref(newdict)
        self.assertEqual(self.mgr.ref_stats["Total sequence length"], 150)
        self.assertEqual(self.mgr.ref_stats["Shortest CDS"], 3)
        self.assertEqual(self.mgr.ref_stats["Longest gene"], 30)

    def test_summary_with_modifications(self):
        """summary() renders reference and modified columns once alt stats exist."""
        self.populate_ref()
        self.mgr.update_alt(self.get_new_dict())
        # The expected table is whitespace-exact, including trailing padding.
        expected = "                                 Reference Genome     Modified Genome     \n"
        expected += "                                 ----------------     ---------------     \n"
        expected += "Total sequence length            100                  50                  \n"
        expected += "Number of genes                  5                    1                   \n"
        expected += "Number of mRNAs                  7                    1                   \n"
        expected += "Number of exons                  7                    1                   \n"
        expected += "Number of introns                7                    1                   \n"
        expected += "Number of CDS                    7                    1                   \n"
        expected += "Overlapping genes                3                    1                   \n"
        expected += "Contained genes                  3                    1                   \n"
        expected += "CDS: complete                    3                    3                   \n"
        expected += "CDS: start, no stop              1                    1                   \n"
        expected += "CDS: stop, no start              1                    1                   \n"
        expected += "CDS: no stop, no start           2                    2                   \n"
        expected += "Total gene length                70                   15                  \n"
        expected += "Total mRNA length                70                   15                  \n"
        expected += "Total exon length                65                   15                  \n"
        expected += "Total intron length              65                   15                  \n"
        expected += "Total CDS length                 60                   10                  \n"
        expected += "Shortest gene                    10                   5                   \n"
        expected += "Shortest mRNA                    10                   5                   \n"
        expected += "Shortest exon                    8                    2                   \n"
        expected += "Shortest intron                  8                    2                   \n"
        expected += "Shortest CDS                     6                    3                   \n"
        expected += "Longest gene                     25                   30                  \n"
        expected += "Longest mRNA                     25                   30                  \n"
        expected += "Longest exon                     21                   9                   \n"
        expected += "Longest intron                   21                   9                   \n"
        expected += "Longest CDS                      20                   8                   \n"
        expected += "mean gene length                 14                   15                  \n"
        expected += "mean mRNA length                 10                   15                  \n"
        expected += "mean exon length                 9                    15                  \n"
        expected += "mean intron length               9                    15                  \n"
        expected += "mean CDS length                  9                    10                  \n"
        expected += "% of genome covered by genes     70.0                 30.0                \n"
        expected += "% of genome covered by CDS       60.0                 20.0                \n"
        expected += "mean mRNAs per gene              1                    1                   \n"
        expected += "mean exons per mRNA              1                    1                   \n"
        expected += "mean introns per mRNA            1                    1                   \n"
        summary = self.mgr.summary()
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(summary, expected)

    def test_summary_without_modifications(self):
        """Build the expected single-column summary and call summary().

        NOTE(review): the final assertion is commented out, so this test
        currently only checks that summary() does not raise.
        """
        self.populate_ref()
        expected = "                                 Genome            \n"
        expected += "                                 ------            \n"
        expected += "Total sequence length            100               \n"
        expected += "Number of genes                  5                 \n"
        expected += "Number of mRNAs                  7                 \n"
        expected += "Number of exons                  7                 \n"
        expected += "Number of introns                7                 \n"
        expected += "Number of CDS                    7                 \n"
        expected += "Overlapping genes                3                 \n"
        expected += "Contained genes                  3                 \n"
        expected += "CDS: complete                    3                 \n"
        expected += "CDS: start, no stop              1                 \n"
        expected += "CDS: stop, no start              1                 \n"
        expected += "CDS: no stop, no start           2                 \n"
        expected += "Total gene length                70                \n"
        expected += "Total mRNA length                70                \n"
        expected += "Total exon length                65                \n"
        expected += "Total intron length              65                \n"
        expected += "Total CDS length                 60                \n"
        expected += "Shortest gene                    10                \n"
        expected += "Shortest mRNA                    10                \n"
        expected += "Shortest exon                    8                 \n"
        expected += "Shortest intron                  8                 \n"
        expected += "Shortest CDS                     6                 \n"
        expected += "Longest gene                     25                \n"
        expected += "Longest mRNA                     25                \n"
        expected += "Longest exon                     21                \n"
        expected += "Longest intron                   21                \n"
        expected += "Longest CDS                      20                \n"
        # NOTE(review): the derived rows below use a different number format
        # (floats, fractional percentages) from the two-column test's expected
        # table; confirm which format summary() actually emits.
        expected += "mean gene length                 14.0              \n"
        expected += "mean mRNA length                 10.0              \n"
        expected += "mean exon length                 9.28571428571     \n"
        expected += "mean intron length               9.28571428571     \n"
        expected += "mean CDS length                  8.57142857143     \n"
        expected += "% of genome covered by genes     0.7               \n"
        expected += "% of genome covered by CDS       0.6               \n"
        expected += "mean mRNAs per gene              1.4               \n"
        expected += "mean exons per mRNA              1.0               \n"
        expected += "mean introns per mRNA            1.0               \n"
        summary = self.mgr.summary()
        # TODO(review): disabled assertion; re-enable once `expected` matches
        # the real output.
        #self.assertEquals(summary, expected)

    def test_format_column(self):
        """format_column pads every entry to a common width plus the given spacing."""
        column = ['a', 'sd', 'asdf']
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(format_column(column, 5),
                         ['a        ', 'sd       ', 'asdf     '])

    def test_format_columns(self):
        """format_columns lays out a header, a rule and one row per key."""
        desired_tbl = '    columnA columnB \n' \
                      '    ------- ------- \n' \
                      'dog 24      4222    \n' \
                      'foo 4232234 84      \n'
        column_names = ['columnA', 'columnB']
        dictA = {'foo': 4232234, 'dog': 24}
        dictB = {'foo': 84, 'dog': 4222}
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(
            format_columns(column_names, ['dog', 'foo'], [dictA, dictB], 1),
            desired_tbl)
示例#4
0
 def __init__(self):
     """Start with no sequences, no removed features and fresh managers."""
     # Both collections begin empty; each gets its own list object.
     self.seqs, self.removed_features = [], []
     self.filter_mgr = FilterManager()
     self.stats_mgr = StatsManager()
示例#5
0
文件: controller.py 项目: desiro/GAG
class Controller:

    def __init__(self):
        """Start with no sequences, no removed features and fresh managers."""
        # Both collections begin empty; each gets its own list object.
        self.seqs, self.removed_features = [], []
        self.filter_mgr = FilterManager()
        self.stats_mgr = StatsManager()

    def execute(self, args):
        """At a minimum, write a fasta, gff and tbl to output directory. Optionally do more."""
        # Verify and read fasta file
        fastapath = args.fasta
        if not os.path.isfile(fastapath):
            sys.stderr.write("Failed to find " + fastapath + ". No genome was loaded.\n")
            sys.exit()
        sys.stderr.write("Reading fasta...\n")
        self.read_fasta(fastapath)
        sys.stderr.write("Done.\n")

        # Create output directory
        out_dir = "gag_output"
        if args.out:
            out_dir = args.out
        os.system('mkdir ' + out_dir)

        # Verify and read gff file
        # This step also writes genome.ignored.gff,
        # genome.invalid.gff and genome.comments.gff
        gffpath = args.gff
        if not os.path.isfile(gffpath):
            sys.stderr.write("Failed to find " + gffpath + ". No genome was loaded.")
            return
        sys.stderr.write("Reading gff...\n")
        self.read_gff(gffpath, out_dir)
        sys.stderr.write("Done.\n")

        # Calculate stats before genome is modified
        sys.stderr.write("Calculating stats on original genome\n")
        for seq in self.seqs:
            self.stats_mgr.update_ref(seq.stats())

        # Optional annotation step
        if args.anno:
            anno_filename = args.anno
            self.annotate_from_file(anno_filename)

        # Optional step to trim sequences, subsequences or features
        if args.trim:
            trim_filename = args.trim
            self.trim_from_file(trim_filename)

        # Optional step to create start and stop codons
        if args.fix_start_stop:
            sys.stderr.write("Creating start and stop codons...\n")
            self.fix_start_stop_codons()

        # Optional step to fix terminal Ns
        if args.fix_terminal_ns:
            sys.stderr.write("Fixing terminal Ns...\n")
            self.fix_terminal_ns()

        # Optional filtering steps
        # Remove
        if args.remove_cds_shorter_than:
            min_length = args.remove_cds_shorter_than
            sys.stderr.write("Removing CDS shorter than %s...\n" % min_length)
            self.apply_filter("cds_shorter_than", min_length, "REMOVE")
        if args.remove_cds_longer_than:
            max_length = args.remove_cds_longer_than
            sys.stderr.write("Removing CDS longer than %s...\n" % max_length)
            self.apply_filter("cds_longer_than", max_length, "REMOVE")
        if args.remove_exons_shorter_than:
            min_length = args.remove_exons_shorter_than
            sys.stderr.write("Removing exons shorter than %s...\n" % min_length)
            self.apply_filter("exon_shorter_than", min_length, "REMOVE")
        if args.remove_exons_longer_than:
            max_length = args.remove_exons_longer_than
            sys.stderr.write("Removing exons longer than %s...\n" % max_length)
            self.apply_filter("exon_longer_than", max_length, "REMOVE")
        if args.remove_introns_shorter_than:
            min_length = args.remove_introns_shorter_than
            sys.stderr.write("Removing exons shorter than %s...\n" % min_length)
            self.apply_filter("intron_shorter_than", min_length, "REMOVE")
        if args.remove_introns_longer_than:
            max_length = args.remove_introns_longer_than
            sys.stderr.write("Removing exons longer than %s...\n" % max_length)
            self.apply_filter("intron_longer_than", max_length, "REMOVE")
        if args.remove_genes_shorter_than:
            min_length = args.remove_genes_shorter_than
            sys.stderr.write("Removing genes shorter than %s...\n" % min_length)
            self.apply_filter("gene_shorter_than", min_length, "REMOVE")
        if args.remove_genes_longer_than:
            max_length = args.remove_genes_longer_than
            sys.stderr.write("Removing genes longer than %s...\n" % max_length)
            self.apply_filter("gene_longer_than", max_length, "REMOVE")
        # Flag
        if args.flag_cds_shorter_than:
            min_length = args.flag_cds_shorter_than
            sys.stderr.write("Flagging CDS shorter than %s...\n" % min_length)
            self.apply_filter("cds_shorter_than", min_length, "FLAG")
        if args.flag_cds_longer_than:
            max_length = args.flag_cds_longer_than
            sys.stderr.write("Flagging CDS longer than %s...\n" % max_length)
            self.apply_filter("cds_longer_than", max_length, "FLAG")
        if args.flag_exons_shorter_than:
            min_length = args.flag_exons_shorter_than
            sys.stderr.write("Flagging exons shorter than %s...\n" % min_length)
            self.apply_filter("exon_shorter_than", min_length, "FLAG")
        if args.flag_exons_longer_than:
            max_length = args.flag_exons_longer_than
            sys.stderr.write("Flagging exons longer than %s...\n" % max_length)
            self.apply_filter("exon_longer_than", max_length, "FLAG")
        if args.flag_introns_shorter_than:
            min_length = args.flag_introns_shorter_than
            sys.stderr.write("Flagging exons shorter than %s...\n" % min_length)
            self.apply_filter("intron_shorter_than", min_length, "FLAG")
        if args.flag_introns_longer_than:
            max_length = args.flag_introns_longer_than
            sys.stderr.write("Flagging exons longer than %s...\n" % max_length)
            self.apply_filter("intron_longer_than", max_length, "FLAG")
        if args.flag_genes_shorter_than:
            min_length = args.flag_genes_shorter_than
            sys.stderr.write("Flagging genes shorter than %s...\n" % min_length)
            self.apply_filter("gene_shorter_than", min_length, "FLAG")
        if args.flag_genes_longer_than:
            max_length = args.flag_genes_longer_than
            sys.stderr.write("Flagging genes longer than %s...\n" % max_length)
            self.apply_filter("gene_longer_than", max_length, "FLAG")

        # Write fasta, gff and tbl file to output folder
        # Open files
        fasta = open(out_dir + '/genome.fasta', 'w')
        gff = open(out_dir + '/genome.gff', 'w')
        tbl = open(out_dir + '/genome.tbl', 'w')
        proteins = open(out_dir + '/genome.proteins.fasta', 'w')
        removed = open(out_dir + '/genome.removed.gff', 'w')
        stats_file = open(out_dir + '/genome.stats', 'w')

        # Calculate stats on modified genome
        sys.stderr.write("Calculating stats on modified genome\n")
        for seq in self.seqs:
            self.stats_mgr.update_alt(seq.stats())

        # Write stats file
        sys.stderr.write("Writing stats file to " + out_dir + "/ ...\n")
        for line in self.stats_mgr.summary():
            stats_file.write(line)

        # Write fasta, gff, tbl, protein fasta
        sys.stderr.write("Writing gff, tbl and fasta to " + out_dir + "/ ...\n")
        gff.write("##gff-version 3\n")
        for seq in self.seqs:
            fasta.write(seq.to_fasta())
            gff.write(seq.to_gff())
            tbl.write(seq.to_tbl())
            proteins.write(seq.to_protein_fasta())

        # Write removed.gff
        for feature in self.removed_features:
            removed.write(feature.to_gff())

        # Close files
        gff.close()
        tbl.close()
        fasta.close()
        proteins.close()
        removed.close()
        stats_file.close()

    def add_annotations_from_list(self, anno_list):
        for seq in self.seqs:
            seq.add_annotations_from_list(anno_list)

    def trim_from_file(self, filename):
        if not os.path.isfile(filename):
            sys.stderr.write("Error: " + filename + " is not a file. Nothing trimmed.\n")
            return
        trimlist = self.read_bed_file(open(filename, 'rb'))
        if not trimlist:
            sys.stderr.write("Failed to read .bed file; nothing trimmed.\n")
            return
        else:
            self.trim_from_list(trimlist)

    def annotate_from_file(self, filename):
        if not os.path.isfile(filename):
            sys.stderr.write("Error: " + filename + " is not a file. Nothing annotated.\n")
            return
        annos = self.read_annotation_file(open(filename, 'rb'))
        if not annos:
            sys.stderr.write("Failed to read annotations from " + filename + "; no annotations added.\n")
            return
        else:
            sys.stderr.write("Adding annotations to genome ...\n")
            self.add_annotations_from_list(annos)
            sys.stderr.write("...done\n")

    def trim_from_list(self, trimlist):
        for seq in self.seqs:
            # In the case that there are multiple regions to trim in a single
            # sequence, trim from the end so indices don't get messed up
            to_trim_this_seq = [x for x in trimlist if x[0] == seq.header]
            to_trim_this_seq = sorted(to_trim_this_seq, key=lambda entry: entry[2], reverse=True)
            for entry in to_trim_this_seq:
                removed_genes = seq.trim_region(entry[1], entry[2])
                self.removed_features.extend(removed_genes)
                sys.stderr.write("Trimmed " + entry[0] + " from ")
                sys.stderr.write(str(entry[1]) + " to " + str(entry[2]) + "\n")
            self.remove_empty_features(seq)

    def get_filter_arg(self, filter_name):
        return self.filter_mgr.get_filter_arg(filter_name)
        
    def apply_filter(self, filter_name, val, filter_mode):
        for seq in self.seqs:
            self.filter_mgr.apply_filter(filter_name, val, filter_mode, seq)
            self.remove_empty_features(seq)

    def fix_terminal_ns(self):
        for seq in self.seqs:
            seq.remove_terminal_ns()
            self.remove_empty_features(seq)

    def fix_start_stop_codons(self):
        for seq in self.seqs:
            seq.create_starts_and_stops()

## Reading in files

    def read_fasta(self, line):
        """Read the fasta file at path `line` and store the result in self.seqs."""
        reader = FastaReader()
        # The with block closes the handle once the reader returns.
        # NOTE(review): assumes reader.read() has consumed the file by the
        # time it returns (self.seqs is later used with len() and .remove()).
        with open(line, 'r') as fasta_file:
            self.seqs = reader.read(fasta_file)

    def read_gff(self, line, prefix):
        """Read the gff at path `line`, add its genes and write the leftovers.

        Takes prefix (an output directory) because the reader also returns
        comments, invalid lines and ignored features, which this method
        writes to genome.comments.gff, genome.invalid.gff and
        genome.ignored.gff under that directory.
        """
        gffreader = GFFReader()
        # The with block closes the handle once the reader returns.
        # NOTE(review): assumes read_file() returns fully built collections.
        with open(line, 'rb') as reader:
            genes, comments, invalids, ignored = gffreader.read_file(reader)
        for gene in genes:
            self.add_gene(gene)
        # Write comments, invalid lines and ignored features
        with open(prefix + "/genome.comments.gff", 'w') as comments_file:
            for comment in comments:
                comments_file.write(comment)
        with open(prefix + "/genome.invalid.gff", 'w') as invalid_file:
            for invalid in invalids:
                invalid_file.write(invalid)
        with open(prefix + "/genome.ignored.gff", 'w') as ignored_file:
            for item in ignored:
                ignored_file.write(item)

    def read_bed_file(self, io_buffer):
        trimlist = []
        for line in io_buffer:
            splitline = line.strip().split('\t')
            if len(splitline) != 3:
                return []
            else:
                try:
                    entry = [splitline[0], int(splitline[1]), int(splitline[2])]
                except ValueError:
                    sys.stderr.write("Error reading .bed file. Non-integer value ")
                    sys.sdterr.write("in column 2 or 3. Here is the line:\n")
                    sys.stderr.write(line)
                    return []
                trimlist.append(entry)
        return trimlist

    def read_annotation_file(self, io_buffer):
        annos = []
        for line in io_buffer:
            splitline = line.strip().split('\t')
            if len(splitline) != 3:
                return []
            else:
                annos.append(splitline)
        return annos


## Clean up

    def remove_empty_features(self, seq):
        """Removes any empty mRNAs or genes from a seq and adds them to self.removed_features."""
        self.removed_features.extend(seq.remove_empty_mrnas())
        self.removed_features.extend(seq.remove_empty_genes())
        
    def stats(self):
        if not self.seqs:
            return self.no_genome_message
        else:
            number_of_gagflags = 0
            # TODO have stats mgr handle "number of sequences"
            first_line = "Number of sequences:   " + str(len(self.seqs)) + "\n"
            sys.stderr.write("Calculating statistics on genome...\n")
            self.stats_mgr.clear_alt()
            for seq in self.seqs:
                self.stats_mgr.update_alt(seq.stats())
                number_of_gagflags += seq.number_of_gagflags()
            last_line = "(" + str(number_of_gagflags) + " features flagged)\n"
            return first_line + self.stats_mgr.summary() + last_line

## Utility methods

    def add_gene(self, gene):
        for seq in self.seqs:
            if seq.header == gene.seq_name:
                seq.add_gene(gene)

    def get_locus_tag(self):
        locus_tag = ""
        for seq in self.seqs:
            if locus_tag:
                break
            else:
                locus_tag = seq.get_locus_tag()
        return locus_tag
    
    def remove_from_list(self, bad_list):
        # First remove any seqs on the list
        to_remove = []
        for seq in self.seqs:
            if seq.header in bad_list:
                to_remove.append(seq)
        if to_remove:
            for seq in to_remove:
                self.seqs.remove(seq)
                sys.stderr.write("Warning: removing seq " + seq.header + ".\n")
                sys.stderr.write("You must reload genome to get this sequence back.\n")
            self.removed_features.extend(to_remove)
        # Now pass the list down to each seq
        for seq in self.seqs:
            removed_from_seq = seq.remove_from_list(bad_list)
            self.removed_features.extend(removed_from_seq)

    def contains_mrna(self, mrna_id):
        for seq in self.seqs:
            if seq.contains_mrna(mrna_id):
                return True
        return False

    def contains_gene(self, gene_id):
        for seq in self.seqs:
            if seq.contains_gene(gene_id):
                return True
        return False
示例#6
0
class Controller(object):
    """Batch driver: load a genome, optionally modify it, write the results.

    Attributes:
        seqs: sequences produced by the fasta reader.
        removed_features: every sequence/feature removed while processing;
            written to ``genome.removed.gff`` by :meth:`execute`.
        filter_mgr: applies the named length filters.
        stats_mgr: accumulates reference ("ref") and modified ("alt") stats.
    """

    # (args-name fragment, filter-name prefix, label used in progress messages)
    _FILTER_FEATURES = (
        ("cds", "cds", "CDS"),
        ("exons", "exon", "exons"),
        ("introns", "intron", "introns"),
        ("genes", "gene", "genes"),
    )
    # (args-name prefix, progress verb, mode handed to the filter manager)
    _FILTER_MODES = (
        ("remove", "Removing", "REMOVE"),
        ("flag", "Flagging", "FLAG"),
    )

    def __init__(self):
        self.seqs = []
        self.removed_features = []
        self.filter_mgr = FilterManager()
        self.stats_mgr = StatsManager()

    def execute(self, args):
        """At a minimum, write a fasta, gff and tbl to output directory. Optionally do more.

        ``args`` is an options object exposing ``fasta``, ``gff``, ``out``,
        ``anno``, ``trim``, ``fix_start_stop``, ``fix_terminal_ns``,
        ``skip_empty_scaffolds`` and the ``remove_*``/``flag_*`` length limits.
        Exits the process if the fasta file is missing; returns early if the
        gff file is missing.
        """
        # Verify and read fasta file
        fastapath = args.fasta
        if not os.path.isfile(fastapath):
            sys.stderr.write("Failed to find " + fastapath + ". No genome was loaded.\n")
            sys.exit()
        sys.stderr.write("Reading fasta...\n")
        self.read_fasta(fastapath)
        sys.stderr.write("Done.\n")

        # Create output directory without going through a shell, so paths
        # containing spaces or shell metacharacters work.
        out_dir = args.out or "gag_output"
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)

        # Verify and read gff file
        # This step also writes genome.ignored.gff,
        # genome.invalid.gff and genome.comments.gff
        gffpath = args.gff
        if not os.path.isfile(gffpath):
            sys.stderr.write("Failed to find " + gffpath + ". No genome was loaded.\n")
            return
        sys.stderr.write("Reading gff...\n")
        self.read_gff(gffpath, out_dir)
        sys.stderr.write("Done.\n")

        # Calculate stats before genome is modified
        sys.stderr.write("Calculating stats on original genome\n")
        for seq in self.seqs:
            self.stats_mgr.update_ref(seq.stats())

        # Optional annotation step
        if args.anno:
            self.annotate_from_file(args.anno)

        # Optional step to trim sequences, subsequences or features
        if args.trim:
            self.trim_from_file(args.trim)

        # Optional step to create start and stop codons
        if args.fix_start_stop:
            sys.stderr.write("Creating start and stop codons...\n")
            self.fix_start_stop_codons()

        # Optional step to fix terminal Ns
        if args.fix_terminal_ns:
            sys.stderr.write("Fixing terminal Ns...\n")
            self.fix_terminal_ns()

        # Optional filtering steps, then output
        self._apply_length_filters(args)
        self._write_outputs(out_dir, args)

    def _apply_length_filters(self, args):
        """Apply every length filter requested in ``args``, removals before flags."""
        for arg_prefix, verb, mode in self._FILTER_MODES:
            for arg_name, filter_prefix, label in self._FILTER_FEATURES:
                for bound in ("shorter", "longer"):
                    limit = getattr(args, "%s_%s_%s_than" % (arg_prefix, arg_name, bound))
                    if not limit:
                        continue
                    sys.stderr.write("%s %s %s than %s...\n" % (verb, label, bound, limit))
                    self.apply_filter("%s_%s_than" % (filter_prefix, bound), limit, mode)

    def _write_outputs(self, out_dir, args):
        """Write stats, fasta, gff, tbl, protein/mRNA fasta and removed.gff into ``out_dir``."""
        # The with-block guarantees every handle (including the mRNA fasta)
        # is closed, even if a writer raises.
        with open(os.path.join(out_dir, 'genome.fasta'), 'w') as fasta, \
                open(os.path.join(out_dir, 'genome.gff'), 'w') as gff, \
                open(os.path.join(out_dir, 'genome.tbl'), 'w') as tbl, \
                open(os.path.join(out_dir, 'genome.proteins.fasta'), 'w') as proteins, \
                open(os.path.join(out_dir, 'genome.mrna.fasta'), 'w') as mrna, \
                open(os.path.join(out_dir, 'genome.removed.gff'), 'w') as removed, \
                open(os.path.join(out_dir, 'genome.stats'), 'w') as stats_file:
            # Calculate stats on modified genome
            sys.stderr.write("Calculating stats on modified genome\n")
            for seq in self.seqs:
                self.stats_mgr.update_alt(seq.stats())

            # Write stats file
            sys.stderr.write("Writing stats file to " + out_dir + "/ ...\n")
            stats_file.writelines(self.stats_mgr.summary())

            # Write fasta, gff, tbl, protein fasta
            sys.stderr.write("Writing gff, tbl and fasta to " + out_dir + "/ ...\n")
            gff.write("##gff-version 3\n")
            for seq in self.seqs:
                if seq.is_empty():
                    continue
                fasta.write(seq.to_fasta())
                gff.write(seq.to_gff())
                if not args.skip_empty_scaffolds or len(seq.genes) > 0:
                    # Possibly skip empty sequences
                    tbl.write(seq.to_tbl())
                proteins.write(seq.to_protein_fasta())
                mrna.write(seq.to_mrna_fasta())

            # Write removed.gff
            for feature in self.removed_features:
                removed.write(feature.to_gff())

    def add_annotations_from_list(self, anno_list):
        """Pass ``anno_list`` to every sequence."""
        for seq in self.seqs:
            seq.add_annotations_from_list(anno_list)

    def trim_from_file(self, filename):
        """Trim the regions listed in the .bed file ``filename``; warn if unreadable."""
        if not os.path.isfile(filename):
            sys.stderr.write("Error: " + filename + " is not a file. Nothing trimmed.\n")
            return
        with open(filename, 'rb') as bed_file:
            trimlist = read_bed_file(bed_file)
        if not trimlist:
            sys.stderr.write("Failed to read .bed file; nothing trimmed.\n")
            return
        self.trim_from_list(trimlist)

    def annotate_from_file(self, filename):
        """Add the annotations read from ``filename``; warn if unreadable."""
        if not os.path.isfile(filename):
            sys.stderr.write("Error: " + filename + " is not a file. Nothing annotated.\n")
            return
        with open(filename, 'rb') as anno_file:
            annos = read_annotation_file(anno_file)
        if not annos:
            sys.stderr.write("Failed to read annotations from " + filename + "; no annotations added.\n")
            return
        sys.stderr.write("Adding annotations to genome ...\n")
        self.add_annotations_from_list(annos)
        sys.stderr.write("...done\n")

    def trim_from_list(self, trimlist):
        """Trim each ``(seq_header, start, stop)`` entry from its sequence."""
        for seq in self.seqs:
            # In the case that there are multiple regions to trim in a single
            # sequence, trim from the end so indices don't get messed up
            to_trim_this_seq = sorted(
                (entry for entry in trimlist if entry[0] == seq.header),
                key=lambda _entry: _entry[2],
                reverse=True,
            )
            for entry in to_trim_this_seq:
                removed_genes = seq.trim_region(entry[1], entry[2])
                self.removed_features.extend(removed_genes)
                sys.stderr.write("Trimmed " + entry[0] + " from ")
                sys.stderr.write(str(entry[1]) + " to " + str(entry[2]) + "\n")
            self.remove_empty_features(seq)

    def get_filter_arg(self, filter_name):
        """Return the filter manager's current argument for ``filter_name``."""
        return self.filter_mgr.get_filter_arg(filter_name)

    def apply_filter(self, filter_name, val, filter_mode):
        """Apply one filter to every sequence, then drop features it emptied."""
        for seq in self.seqs:
            self.filter_mgr.apply_filter(filter_name, val, filter_mode, seq)
            self.remove_empty_features(seq)

    def fix_terminal_ns(self):
        """Remove terminal Ns from every sequence, then drop emptied features."""
        for seq in self.seqs:
            seq.remove_terminal_ns()
            self.remove_empty_features(seq)

    def fix_start_stop_codons(self):
        """Ask every sequence to create its start and stop codons."""
        for seq in self.seqs:
            seq.create_starts_and_stops()

    # Reading in files

    def read_fasta(self, line):
        """Replace ``self.seqs`` with the sequences read from the fasta at ``line``."""
        reader = FastaReader()
        with open(line, 'r') as fasta_file:
            self.seqs = reader.read(fasta_file)

    def read_gff(self, line, prefix):
        """Read the gff at ``line``, add its genes, and log the leftovers.

        Comments, invalid lines and ignored features returned by the reader
        are written to ``genome.comments.gff``, ``genome.invalid.gff`` and
        ``genome.ignored.gff`` inside the ``prefix`` directory.
        """
        gffreader = GFFReader()
        with open(line, 'rb') as gff_file:
            genes, comments, invalids, ignored = gffreader.read_file(gff_file)
        for gene in genes:
            self.add_gene(gene)
        # Write comments, invalid lines and ignored features
        leftovers = (
            ("/genome.comments.gff", comments),
            ("/genome.invalid.gff", invalids),
            ("/genome.ignored.gff", ignored),
        )
        for suffix, entries in leftovers:
            with open(prefix + suffix, 'w') as out_file:
                for entry in entries:
                    out_file.write(entry)

    # Clean up

    def remove_empty_features(self, seq):
        """Removes any empty mRNAs or genes from a seq and adds them to self.removed_features."""
        self.removed_features.extend(seq.remove_empty_mrnas())
        self.removed_features.extend(seq.remove_empty_genes())

    def stats(self):
        """Recompute the "alt" statistics and return them as a printable summary.

        Returns ``"error: no sequences"`` when nothing is loaded.
        """
        if not self.seqs:
            return "error: no sequences"
        number_of_gagflags = 0
        # TODO have stats mgr handle "number of sequences"
        first_line = "Number of sequences:   " + str(len(self.seqs)) + "\n"
        sys.stderr.write("Calculating statistics on genome...\n")
        self.stats_mgr.clear_alt()
        for seq in self.seqs:
            self.stats_mgr.update_alt(seq.stats())
            number_of_gagflags += seq.number_of_gagflags()
        last_line = "(" + str(number_of_gagflags) + " features flagged)\n"
        return first_line + self.stats_mgr.summary() + last_line

    # Utility methods

    def add_gene(self, gene):
        """Hand ``gene`` to every sequence whose header equals ``gene.seq_name``."""
        for seq in self.seqs:
            if seq.header == gene.seq_name:
                seq.add_gene(gene)

    def get_locus_tag(self):
        """Return the first truthy locus tag reported by a sequence.

        With no sequences the result is ``""``; if every answer is falsy the
        last sequence's answer is returned.
        """
        locus_tag = ""
        for seq in self.seqs:
            locus_tag = seq.get_locus_tag()
            if locus_tag:
                break
        return locus_tag

    def remove_from_list(self, bad_list):
        """Remove the sequences and features named in ``bad_list``.

        Everything removed is appended to ``self.removed_features``.
        """
        # First remove any seqs on the list
        to_remove = [seq for seq in self.seqs if seq.header in bad_list]
        for seq in to_remove:
            self.seqs.remove(seq)
            sys.stderr.write("Warning: removing seq " + seq.header + ".\n")
            sys.stderr.write("You must reload genome to get this sequence back.\n")
        self.removed_features.extend(to_remove)
        # Now pass the list down to each seq
        for seq in self.seqs:
            self.removed_features.extend(seq.remove_from_list(bad_list))

    def contains_mrna(self, mrna_id):
        """Return True if any sequence reports containing ``mrna_id``."""
        return any(seq.contains_mrna(mrna_id) for seq in self.seqs)

    def contains_gene(self, gene_id):
        """Return True if any sequence reports containing ``gene_id``."""
        return any(seq.contains_gene(gene_id) for seq in self.seqs)
示例#7
0
 def __init__(self):
     """Start with no sequences and freshly constructed helper objects."""
     # Loaded sequences; empty until a genome is read.
     self.seqs = []
     self.annot = Annotator()
     self.filter_mgr = FilterManager()
     self.stats_mgr = StatsManager()
     self.seq_fixer = SeqFixer()
示例#8
0
class ConsoleController:
    """Interactive-session controller for a loaded genome.

    Fixes and filters are never applied to the loaded sequences themselves:
    every output method works on a deep copy to which the current fixer and
    filter settings are applied first.
    """

    no_genome_message = (
        "It looks like no genome is currently loaded. Try the 'load' command.\n"
        "Type 'help load' to learn how to use it, or just 'help' for general advice.\n"
    )

## Setup, loading and saving sessions

    def __init__(self):
        self.seqs = []
        self.annot = Annotator()
        self.filter_mgr = FilterManager()
        self.stats_mgr = StatsManager()
        self.seq_fixer = SeqFixer()
        # Flag count from the most recent stats recomputation, reused while
        # the cached statistics are still valid.
        self._flagged_count = 0

    def _working_copy(self, seq):
        """Return a deep copy of ``seq`` with current fixes and filters applied."""
        cseq = copy.deepcopy(seq)
        self.seq_fixer.fix(cseq)
        self.filter_mgr.apply_filters(cseq)
        return cseq

    def genome_is_loaded(self):
        """Return True if at least one sequence has genes."""
        for seq in self.seqs:
            if seq.genes:
                return True
        return False

    def barf_folder(self, line):
        """Write the (fixed, filtered) genome into directory ``line``.

        Returns the no-genome message if nothing is loaded, ``None`` after
        printing usage when ``line`` is empty, otherwise a confirmation string.
        """
        if not self.seqs:
            return self.no_genome_message
        if len(line) == 0:
            sys.stderr.write("Usage: barffolder <directory>\n")
            return
        # Create the directory without going through a shell.
        if not os.path.isdir(line):
            os.makedirs(line)
        # The with-block closes every handle, including genome.removed.gff.
        with open(line+'/genome.gff', 'w') as gff, \
                open(line+'/genome.removed.gff', 'w') as removed_gff, \
                open(line+'/genome.tbl', 'w') as tbl, \
                open(line+'/genome.fasta', 'w') as fasta, \
                open(line+'/genome.mrna.fasta', 'w') as mrna_fasta, \
                open(line+'/genome.cds.fasta', 'w') as cds_fasta, \
                open(line+'/genome.proteins.fasta', 'w') as protein_fasta:
            # Deep copy each seq, apply fixes and filters, write
            sys.stderr.write("Writing gff, tbl and fasta...\n")
            for seq in self.seqs:
                cseq = self._working_copy(seq)
                gff.write(cseq.to_gff())
                removed_gff.write(cseq.removed_to_gff())
                tbl.write(cseq.to_tbl())
                mrna_fasta.write(cseq.to_mrna_fasta())
                cds_fasta.write(cseq.to_cds_fasta())
                protein_fasta.write(cseq.to_protein_fasta())
                fasta.write(cseq.to_fasta())
        return "Genome written to " + line

    def load_folder(self, line):
        """Load ``genome.fasta`` and ``genome.gff`` from directory ``line``.

        An empty ``line`` means the current directory. If either file is
        missing a message is written to stderr and nothing is loaded.
        """
        if not line:
            line = "."
        fastapath = line + '/genome.fasta'
        gffpath = line + '/genome.gff'

        # Verify files
        for required in (fastapath, gffpath):
            if not os.path.isfile(required):
                sys.stderr.write("Failed to find " + required + ". No genome was loaded.\n")
                return

        # Read the fasta
        sys.stderr.write("Reading fasta...\n")
        self.read_fasta(fastapath)
        sys.stderr.write("Done.\n")

        # Read the gff
        sys.stderr.write("Reading gff...\n")
        self.read_gff(gffpath)
        sys.stderr.write("Done.\n")

        # Clear stats; read in new stats
        self.stats_mgr.clear_all()
        self._flagged_count = 0
        for seq in self.seqs:
            self.stats_mgr.update_ref(seq.stats())

    def set_filter_arg(self, filter_name, val):
        """Set the filter manager's argument for ``filter_name``."""
        self.filter_mgr.set_filter_arg(filter_name, val)

    def get_filter_arg(self, filter_name):
        """Return the filter manager's argument for ``filter_name``."""
        return self.filter_mgr.get_filter_arg(filter_name)

    def set_filter_remove(self, filter_name, remove):
        """Forward the remove setting for ``filter_name`` to the filter manager."""
        self.filter_mgr.set_filter_remove(filter_name, remove)

    def apply_filters(self):
        """Apply the configured filters to the loaded sequences in place."""
        for seq in self.seqs:
            self.filter_mgr.apply_filters(seq)

    def fix_terminal_ns(self):
        """Turn on the fixer's terminal-N fix and return a confirmation."""
        self.seq_fixer.fix_terminal_ns()
        return "Terminal Ns will now be fixed."

    def fix_start_stop_codons(self):
        """Turn on the fixer's start/stop codon fix and return a confirmation."""
        self.seq_fixer.fix_start_stop_codons()
        return "Will verify and create start/stop codons."

## Assorted utilities

    def _first_ids(self, number, ids_of):
        """Return up to ``number`` ids, gathered sequence by sequence via ``ids_of``."""
        collected = []
        for seq in self.seqs:
            if len(collected) >= number:
                break
            collected.extend(ids_of(seq))
        # A single pass only: never loops forever and never repeats ids.
        return collected[:max(number, 0)]

    def get_n_seq_ids(self, number):
        """Returns a message indicating the first n seq_ids in the genome.

        If no seqs loaded, returns a message to that effect. If fewer than n
        seqs loaded, returns the seq_ids of those seqs."""
        if not self.seqs:
            return "No sequences currently in memory.\n"
        # Clamp so a zero or negative request yields no ids rather than all.
        seq_list = [seq.header for seq in self.seqs[:max(number, 0)]]
        result = "First " + str(len(seq_list)) + " seq ids are: "
        result += format_list_with_strings(seq_list)
        return result

    def get_n_gene_ids(self, number):
        """Returns a message indicating the first n gene_ids in the genome.

        If no genes are present, returns a message to that effect. If fewer than n
        genes are loaded, returns the gene_ids of those genes."""
        genes_list = self._first_ids(number, lambda seq: seq.get_gene_ids())
        if not genes_list:
            return "No genes currently in memory.\n"
        result = "First " + str(len(genes_list)) + " gene ids are: "
        result += format_list_with_strings(genes_list)
        return result

    def get_n_mrna_ids(self, number):
        """Returns a message indicating the first n mrna_ids in the genome.

        If no mrnas are present, returns a message to that effect. If fewer than n
        mrnas are loaded, returns the mrna_ids of those mrnas."""
        mrnas_list = self._first_ids(number, lambda seq: seq.get_mrna_ids())
        if not mrnas_list:
            return "No mrnas currently in memory.\n"
        result = "First " + str(len(mrnas_list)) + " mrna ids are: "
        result += format_list_with_strings(mrnas_list)
        return result


## Reading in files

    def read_fasta(self, line):
        """Replace ``self.seqs`` with the sequences read from the fasta at ``line``."""
        reader = FastaReader()
        with open(line, 'r') as fasta_file:
            self.seqs = reader.read(fasta_file)

    def read_gff(self, line):
        """Read genes from the gff at ``line`` and add them to their sequences."""
        gffreader = GFFReader()
        with open(line, 'rb') as gff_file:
            genes = gffreader.read_file(gff_file)
        for gene in genes:
            self.add_gene(gene)


## Output info to console

    def barf_gene_gff(self, line):
        """Return the gff for gene ``line``; ``None`` if no sequence contains it."""
        if not self.seqs:
            return self.no_genome_message
        for seq in self.seqs:
            if seq.contains_gene(line):
                return self._working_copy(seq).gene_to_gff(line)

    def barf_seq(self, line):
        """Return a sequence or subsequence described by ``line``.

        ``line`` is either ``"<seq_id>"`` or ``"<seq_id> <start> <stop>"``;
        any other argument count yields the usage string. Returns ``None``
        if no sequence has the requested header.
        """
        if not self.seqs:
            return self.no_genome_message
        args = line.split(' ')
        if len(args) == 1:
            bounds = ()
        elif len(args) == 3:
            bounds = (int(args[1]), int(args[2]))
        else:
            return "Usage: barfseq <seq_id> <start_index> <end_index>\n"
        seq_id = args[0]
        for seq in self.seqs:
            if seq.header == seq_id:
                return self._working_copy(seq).get_subseq(*bounds)

    def barf_cds_seq(self, line):
        """Return the CDS sequence of mRNA ``line``, or an error message."""
        if not self.seqs:
            return self.no_genome_message
        for seq in self.seqs:
            if seq.contains_mrna(line):
                return self._working_copy(seq).extract_cds_seq(line)
        return "Error: Couldn't find mRNA.\n"

    def cds_to_gff(self, line):
        """Return the CDS of mRNA ``line`` in gff form, or an error message."""
        if not self.seqs:
            return self.no_genome_message
        for seq in self.seqs:
            if seq.contains_mrna(line):
                return self._working_copy(seq).cds_to_gff(line)
        return "Error: Couldn't find mRNA.\n"

    def cds_to_tbl(self, line):
        """Return the CDS of mRNA ``line`` in tbl form, or an error message."""
        if not self.seqs:
            return self.no_genome_message
        for seq in self.seqs:
            if seq.contains_mrna(line):
                return self._working_copy(seq).cds_to_tbl(line)
        return "Error: Couldn't find mRNA.\n"

    def barf_gene_tbl(self, line):
        """Return the tbl text for gene ``line`` from every sequence containing it."""
        if not self.seqs:
            return self.no_genome_message
        output = ">Feature SeqId\n"
        for seq in self.seqs:
            if seq.contains_gene(line):
                output += self._working_copy(seq).gene_to_tbl(line)
        return output

    def stats(self):
        """Return the statistics summary for the genome.

        The "alt" statistics are recomputed only when the filter manager or
        the fixer is marked dirty; otherwise the cached statistics and the
        flag count from the last recomputation are reported.
        """
        if not self.seqs:
            return self.no_genome_message
        first_line = "Number of sequences:   " + str(len(self.seqs)) + "\n"
        if self.filter_mgr.dirty or self.seq_fixer.dirty:
            number_of_gagflags = 0
            self.stats_mgr.clear_alt()
            sys.stderr.write("Calculating statistics on genome...\n")
            for seq in self.seqs:
                # Deep copy seq, apply fixes and filters, then update stats
                cseq = self._working_copy(seq)
                self.stats_mgr.update_alt(cseq.stats())
                number_of_gagflags += cseq.number_of_gagflags()
            self._flagged_count = number_of_gagflags
            self.filter_mgr.dirty = False
            self.seq_fixer.dirty = False
        # getattr keeps instances created without __init__ working.
        flagged = getattr(self, "_flagged_count", 0)
        last_line = "(" + str(flagged) + " features flagged)\n"
        return first_line + self.stats_mgr.summary() + last_line

## Utility methods

    def add_gene(self, gene):
        """Hand ``gene`` to every sequence whose header equals ``gene.seq_name``."""
        for seq in self.seqs:
            if seq.header == gene.seq_name:
                seq.add_gene(gene)

    def get_locus_tag(self):
        """Return the first truthy locus tag reported by a sequence.

        With no sequences the result is ``""``; if every answer is falsy the
        last sequence's answer is returned.
        """
        locus_tag = ""
        for seq in self.seqs:
            locus_tag = seq.get_locus_tag()
            if locus_tag:
                break
        return locus_tag

    def clear_seqs(self):
        """Empty the sequence list in place."""
        self.seqs[:] = []

    def contains_mrna(self, mrna_id):
        """Return True if any sequence reports containing ``mrna_id``."""
        return any(seq.contains_mrna(mrna_id) for seq in self.seqs)

    def contains_gene(self, gene_id):
        """Return True if any sequence reports containing ``gene_id``."""
        return any(seq.contains_gene(gene_id) for seq in self.seqs)

    def contains_seq(self, seq_id):
        """Return True if a loaded sequence has header ``seq_id``."""
        return any(seq.header == seq_id for seq in self.seqs)

    def can_write_to_path(self, path):
        """Return True if ``path`` has no embedded whitespace and does not exist yet."""
        return len(path.split()) <= 1 and not os.path.exists(path)
示例#9
0
 def setUp(self):
     """Give each test a freshly constructed StatsManager as ``self.mgr``."""
     self.mgr = StatsManager()
示例#10
0
class TestStatsManager(unittest.TestCase):
    def setUp(self):
        """Give each test its own freshly constructed StatsManager as self.mgr."""
        self.mgr = StatsManager()

    def test_initialize(self):
        self.assertEquals(self.mgr.ref_stats["Number of CDS"], 0)

    def test_clear_alt(self):
        self.mgr.update_alt(self.get_new_dict())
        self.assertEquals(self.mgr.alt_stats["Number of CDS"], 1)
        self.mgr.clear_alt()
        self.assertEquals(self.mgr.alt_stats["Number of CDS"], 0)

    def test_clear_all(self):
        self.populate_ref()
        self.mgr.update_alt(self.get_new_dict())
        self.assertEquals(self.mgr.alt_stats["Number of CDS"], 1)
        self.assertEquals(self.mgr.ref_stats["Number of CDS"], 7)
        self.mgr.clear_all()
        self.assertEquals(self.mgr.alt_stats["Number of CDS"], 0)
        self.assertEquals(self.mgr.ref_stats["Number of CDS"], 0)

    def populate_ref(self):
        """Write a fixed set of reference statistics into self.mgr.ref_stats."""
        fixture = (
            ("Total sequence length", 100),
            ("Number of genes", 5),
            ("Number of mRNAs", 7),
            ("Number of exons", 7),
            ("Number of introns", 7),
            ("Number of CDS", 7),
            ("CDS: complete", 3),
            ("CDS: start, no stop", 1),
            ("CDS: stop, no start", 1),
            ("CDS: no stop, no start", 2),
            ("Longest gene", 25),
            ("Longest mRNA", 25),
            ("Longest exon", 21),
            ("Longest intron", 21),
            ("Longest CDS", 20),
            ("Shortest gene", 10),
            ("Shortest mRNA", 10),
            ("Shortest exon", 8),
            ("Shortest intron", 8),
            ("Shortest CDS", 6),
            ("Total gene length", 70),
            ("Total mRNA length", 70),
            ("Total exon length", 65),
            ("Total intron length", 65),
            ("Total CDS length", 60),
        )
        # Assign key by key, in the listed order, onto the manager's mapping.
        for label, value in fixture:
            self.mgr.ref_stats[label] = value

    def get_new_dict(self):
        """Return a new dict of fixed sample statistics for one small sequence."""
        return {
            "Total sequence length": 50,
            "Number of genes": 1,
            "Number of mRNAs": 1,
            "Number of exons": 1,
            "Number of introns": 1,
            "Number of CDS": 1,
            "CDS: complete": 3,
            "CDS: start, no stop": 1,
            "CDS: stop, no start": 1,
            "CDS: no stop, no start": 2,
            "Longest gene": 30,
            "Longest mRNA": 30,
            "Longest exon": 9,
            "Longest intron": 9,
            "Longest CDS": 8,
            "Shortest gene": 5,
            "Shortest mRNA": 5,
            "Shortest exon": 2,
            "Shortest intron": 2,
            "Shortest CDS": 3,
            "Total gene length": 15,
            "Total mRNA length": 15,
            "Total exon length": 15,
            "Total intron length": 15,
            "Total CDS length": 10,
        }

    def test_alt_is_empty(self):
        self.assertTrue(self.mgr.alt_is_empty())
        self.mgr.update_alt(self.get_new_dict())
        self.assertFalse(self.mgr.alt_is_empty())

    def test_update_ref(self):
        self.populate_ref()
        newdict = self.get_new_dict()
        self.assertEquals(self.mgr.ref_stats["Total sequence length"], 100)
        self.assertEquals(self.mgr.ref_stats["Shortest CDS"], 6)
        self.assertEquals(self.mgr.ref_stats["Longest gene"], 25)
        self.mgr.update_ref(newdict)
        self.assertEquals(self.mgr.ref_stats["Total sequence length"], 150)
        self.assertEquals(self.mgr.ref_stats["Shortest CDS"], 3)
        self.assertEquals(self.mgr.ref_stats["Longest gene"], 30)

    def test_summary_with_modifications(self):
        self.populate_ref()
        self.mgr.update_alt(self.get_new_dict())
        expected = "                                 Reference Genome     Modified Genome     \n"
        expected += "                                 ----------------     ---------------     \n"
        expected += "Total sequence length            100                  50                  \n"
        expected += "Number of genes                  5                    1                   \n"
        expected += "Number of mRNAs                  7                    1                   \n"
        expected += "Number of exons                  7                    1                   \n"
        expected += "Number of introns                7                    1                   \n"
        expected += "Number of CDS                    7                    1                   \n"
        expected += "CDS: complete                    3                    3                   \n"
        expected += "CDS: start, no stop              1                    1                   \n"
        expected += "CDS: stop, no start              1                    1                   \n"
        expected += "CDS: no stop, no start           2                    2                   \n"
        expected += "Total gene length                70                   15                  \n"
        expected += "Total mRNA length                70                   15                  \n"
        expected += "Total exon length                65                   15                  \n"
        expected += "Total intron length              65                   15                  \n"
        expected += "Total CDS length                 60                   10                  \n"
        expected += "Shortest gene                    10                   5                   \n"
        expected += "Shortest mRNA                    10                   5                   \n"
        expected += "Shortest exon                    8                    2                   \n"
        expected += "Shortest intron                  8                    2                   \n"
        expected += "Shortest CDS                     6                    3                   \n"
        expected += "Longest gene                     25                   30                  \n"
        expected += "Longest mRNA                     25                   30                  \n"
        expected += "Longest exon                     21                   9                   \n"
        expected += "Longest intron                   21                   9                   \n"
        expected += "Longest CDS                      20                   8                   \n"
        expected += "mean gene length                 14                   15                  \n"
        expected += "mean mRNA length                 10                   15                  \n"
        expected += "mean exon length                 9                    15                  \n"
        expected += "mean intron length               9                    15                  \n"
        expected += "mean CDS length                  9                    10                  \n"
        expected += "% of genome covered by genes     70.0                 30.0                \n"
        expected += "% of genome covered by CDS       60.0                 20.0                \n"
        expected += "mRNAs per gene                   1                    1                   \n"
        expected += "exons per mRNA                   1                    1                   \n"
        expected += "introns per mRNA                 1                    1                   \n"
        summary = self.mgr.summary()
        self.assertEquals(summary, expected)

    def test_summary_without_modifications(self):
        # Only reference stats are populated here; update_alt() is never called.
        # NOTE(review): the comparison at the bottom is commented out, so this test
        # currently only verifies that summary() runs without raising. `expected`
        # is built but never compared against anything.
        self.populate_ref()
        expected = "                                 Genome            \n"
        expected += "                                 ------            \n"
        expected += "Total sequence length            100               \n"
        expected += "Number of genes                  5                 \n"
        expected += "Number of mRNAs                  7                 \n"
        expected += "Number of exons                  7                 \n"
        expected += "Number of introns                7                 \n"
        expected += "Number of CDS                    7                 \n"
        expected += "CDS: complete                    3                 \n"
        expected += "CDS: start, no stop              1                 \n"
        expected += "CDS: stop, no start              1                 \n"
        expected += "CDS: no stop, no start           2                 \n"
        expected += "Total gene length                70                \n"
        expected += "Total mRNA length                70                \n"
        expected += "Total exon length                65                \n"
        expected += "Total intron length              65                \n"
        expected += "Total CDS length                 60                \n"
        expected += "Shortest gene                    10                \n"
        expected += "Shortest mRNA                    10                \n"
        expected += "Shortest exon                    8                 \n"
        expected += "Shortest intron                  8                 \n"
        expected += "Shortest CDS                     6                 \n"
        expected += "Longest gene                     25                \n"
        expected += "Longest mRNA                     25                \n"
        expected += "Longest exon                     21                \n"
        expected += "Longest intron                   21                \n"
        expected += "Longest CDS                      20                \n"
        # NOTE(review): the derived rows below (float means, 0.7 / 0.6 coverage) use a
        # different number format than the reference column expected by
        # test_summary_with_modifications (14, 70.0, ...) for the same populate_ref()
        # data -- presumably stale; confirm against StatsManager.summary() before
        # re-enabling the assertion.
        expected += "mean gene length                 14.0              \n"
        expected += "mean mRNA length                 10.0              \n"
        expected += "mean exon length                 9.28571428571     \n"
        expected += "mean intron length               9.28571428571     \n"
        expected += "mean CDS length                  8.57142857143     \n"
        expected += "% of genome covered by genes     0.7               \n"
        expected += "% of genome covered by CDS       0.6               \n"
        expected += "mRNAs per gene                   1.4               \n"
        expected += "exons per mRNA                   1.0               \n"
        expected += "introns per mRNA                 1.0               \n"
        summary = self.mgr.summary()
        # TODO(review): disabled comparison -- see the notes above.
        # self.assertEquals(summary, expected)

    def test_format_column(self):
        # Each entry is expected back right-padded with spaces to one common width
        # (9 characters for this input).
        # assertEqual is used because the assertEquals alias is deprecated and was
        # removed from unittest in Python 3.12.
        column = ["a", "sd", "asdf"]
        self.assertEqual(format_column(column, 5), ["a        ", "sd       ", "asdf     "])

    def test_format_columns(self):
        # format_columns() is expected to render a header row, a dashed rule and one
        # line per requested key, in the order the keys are given.
        # assertEqual is used because the assertEquals alias is deprecated and was
        # removed from unittest in Python 3.12.
        desired_tbl = (
            "    columnA columnB \n"
            "    ------- ------- \n"
            "dog 24      4222    \n"
            "foo 4232234 84      \n"
        )
        column_names = ["columnA", "columnB"]
        dictA = {"foo": 4232234, "dog": 24}
        dictB = {"foo": 84, "dog": 4222}
        self.assertEqual(format_columns(column_names, ["dog", "foo"], [dictA, dictB], 1), desired_tbl)