예제 #1
0
def parse_fams_r2r(fam_groups, fam, basefolder, cpus=1):
    """Aligns each family with cm of each group and creates r2r for alignment

    The family alignment is read from ``<basefolder>/<fam>/bayesfold-aln.sto``
    and its degapped sequences are written to ``degapped.fna`` in the same
    folder. For every group, those sequences are aligned against
    ``<basefolder>/<group>/cmfile.cm`` via ``cmalign``, the family's
    ``USE_THIS_WEIGHT_MAP`` GF entry is copied onto the resulting alignment,
    the alignment is written back to disk and ``make_r2r`` is run on it.

    Parameters
    ----------
    fam_groups : list of str
        groups in the family
    fam : str
        name of the family
    basefolder : str
        path to the base output directory
    cpus : int, optional
        Forwarded unchanged to ``cmalign`` as its final argument. Default 1.
    """
    fam_folder = join(basefolder, fam)
    # Load family stockholm file; from_file yields alignments, so take the
    # first one.
    fam_sto = next(
        StockholmAlignment.from_file(
            join(fam_folder, "bayesfold-aln.sto"), RNASequence))
    # grab r2r weights information
    r2r_counts = fam_sto.gf["USE_THIS_WEIGHT_MAP"]
    # write out degapped sequences
    degapped = join(fam_folder, "degapped.fna")
    with open(degapped, 'w') as fout:
        fout.write(fam_sto.degapped().to_fasta())
    # apply r2r weights to each family and re-create r2r drawing
    for group in fam_groups:
        # align family sequences to cm for group
        group_sto = join(fam_folder, "%s.sto" % group)
        cmalign(degapped, join(basefolder, group, "cmfile.cm"), group_sto,
                cpus)
        # add weight to stockholm file and write back out.
        # from_file is consumed with next() everywhere else; without it the
        # name would be bound to the iterator itself, which has no ``gf``.
        group_aln = next(StockholmAlignment.from_file(group_sto, RNASequence))
        group_aln.gf["USE_THIS_WEIGHT_MAP"] = r2r_counts
        with open(group_sto, 'w') as fout:
            fout.write(str(group_aln))
        # rebuild r2r with new alignment
        make_r2r(group_sto, fam_folder, "%s_%s" % (fam, group))
예제 #2
0
    def test_to_file(self):
        """Text written by to_file matches the expected stockholm output."""
        exp = ('# STOCKHOLM 1.0\n'
               '#=GF AC RF00360\n'
               '#=GF BM cmbuild  -F CM SEED\n'
               '#=GF BM cmsearch  -Z 274931 -E 1000000\n'
               '#=GF SQ 9\n'
               '#=GF RN [1]\n'
               '#=GF RM 11469857\n'
               '#=GF RT TITLE1\n'
               '#=GF RA Auth1;\n'
               '#=GF RL J Mol Biol\n'
               '#=GF RN [2]\n'
               '#=GF RM 12007400\n'
               '#=GF RT TITLE2\n'
               '#=GF RA Auth2;\n'
               '#=GF RL Cell\n'
               '#=GS seq1 AC 111\n'
               '#=GS seq2 AC 222\n'
               'seq1          ACC-G-GGTA\n'
               '#=GR seq1 SS  1110101111\n'
               'seq2          TCC-G-GGCA\n'
               '#=GR seq2 SS  0110101110\n'
               '#=GC SS_cons  (((....)))\n//')
        alignment = StockholmAlignment(self.seqs, gf=self.GF, gs=self.GS,
                                       gr=self.GR, gc=self.GC)

        # Write through a real temporary file, then rewind and read it back.
        with tempfile.NamedTemporaryFile('r+') as handle:
            alignment.to_file(handle)
            handle.flush()
            handle.seek(0)
            obs = handle.read()

        self.assertEqual(obs, exp)
예제 #3
0
 def test_from_file_GS(self):
     """GS lines in the input end up in the parsed alignment."""
     handle = StringIO(
         "# STOCKHOLM 1.0\n#=GS seq2 AC 222\n#=GS seq1 AC 111\n"
         "seq1          ACC-G-GGTA\n"
         "seq2          TCC-G-GGCA\n//")
     # from_file yields alignments; only the first one is needed here.
     parsed = StockholmAlignment.from_file(handle, DNA)
     observed = next(parsed)
     expected = StockholmAlignment(self.seqs, {}, self.GS, {}, {})
     self.assertEqual(observed, expected)
예제 #4
0
 def test_from_file_GC(self):
     """GC lines in the input end up in the parsed alignment."""
     handle = StringIO(
         "# STOCKHOLM 1.0\n"
         "seq1         ACC-G-GGTA\nseq2         TCC-G-GGCA\n"
         "#=GC SS_cons (((....)))\n//")
     # from_file yields alignments; only the first one is needed here.
     parsed = StockholmAlignment.from_file(handle, DNA)
     observed = next(parsed)
     expected = StockholmAlignment(self.seqs, {}, {}, {}, self.GC)
     self.assertEqual(observed, expected)
예제 #5
0
 def test_from_file_GR(self):
     """GR lines spread over two interleaved blocks are parsed correctly."""
     handle = StringIO(
         "# STOCKHOLM 1.0\nseq1          ACC-G\n"
         "#=GR seq1 SS  11101\nseq2          TCC-G\n"
         "#=GR seq2 SS  01101\n\nseq1          -GGTA\n"
         "#=GR seq1 SS  01111\nseq2          -GGCA\n"
         "#=GR seq2 SS  01110\n//")
     # from_file yields alignments; only the first one is needed here.
     parsed = StockholmAlignment.from_file(handle, DNA)
     observed = next(parsed)
     expected = StockholmAlignment(self.seqs, {}, {}, self.GR, {})
     self.assertEqual(observed, expected)
예제 #6
0
 def test_from_file_alignment(self):
     """A bare interleaved sto file (no metadata lines) can be parsed."""
     handle = StringIO(
         "# STOCKHOLM 1.0\n"
         "seq1      ACC-G\n"
         "seq2      TCC-G\n\n"
         "seq1      -GGTA\n"
         "seq2      -GGCA\n//")
     # from_file yields alignments; only the first one is needed here.
     parsed = StockholmAlignment.from_file(handle, DNA)
     observed = next(parsed)
     expected = StockholmAlignment(self.seqs)
     self.assertEqual(observed, expected)
예제 #7
0
 def test_from_file_GF(self):
     """GF lines in the input end up in the parsed alignment."""
     # Drop the RN entry from the expected metadata to make sure it is
     # auto-added.
     del self.GF["RN"]
     handle = StringIO(
         "# STOCKHOLM 1.0\n#=GF RN [1]\n#=GF RM 11469857\n"
         "#=GF RT TITLE1\n#=GF RA Auth1;\n#=GF RL J Mol Biol\n"
         "#=GF RN [2]\n#=GF RM 12007400\n#=GF RT TITLE2\n"
         "#=GF RA Auth2;\n#=GF RL Cell\n#=GF AC RF00360\n"
         "#=GF BM cmbuild  -F CM SEED\n"
         "#=GF BM cmsearch  -Z 274931 -E 1000000\n#=GF SQ 9\n"
         "seq1         ACC-G-GGTA\nseq2         TCC-G-GGCA\n//")
     # from_file yields alignments; only the first one is needed here.
     parsed = StockholmAlignment.from_file(handle, DNA)
     observed = next(parsed)
     expected = StockholmAlignment(self.seqs, self.GF, {}, {}, {})
     self.assertEqual(observed, expected)
예제 #8
0
 def test_from_file_multi(self):
     """Make sure yield works correctly with multi-alignment sto files

     The input holds two alignments back to back: the first carries only GS
     lines, the second only GR lines. Exactly two alignments must be
     yielded, in that order.
     """
     sto = StringIO("# STOCKHOLM 1.0\n#=GS seq2 AC 222\n#=GS seq1 AC 111\n"
                    "seq1          ACC-G-GGTA\n"
                    "seq2          TCC-G-GGCA\n//\n"
                    "# STOCKHOLM 1.0\nseq1          ACC-G-GGTA\n"
                    "#=GR seq1 SS  1110101111\nseq2          TCC-G-GGCA\n"
                    "#=GR seq2 SS  0110101110\n//")
     exp_stos = [StockholmAlignment(self.seqs, {}, self.GS, {}, {}),
                 StockholmAlignment(self.seqs, {}, {}, self.GR, {})]
     obs_stos = list(StockholmAlignment.from_file(sto, DNA))
     # Check the count explicitly: a parser that yields too few alignments
     # must fail the test as well, not only one that yields too many.
     self.assertEqual(len(obs_stos), len(exp_stos),
                      "Expected exactly 2 sto alignments to be parsed!")
     for obs, exp_sto in zip(obs_stos, exp_stos):
         self.assertEqual(obs, exp_sto)
예제 #9
0
 def setUp(self):
     """Setup for stockholm tests.

     Builds two gapped DNA sequences, one metadata mapping per stockholm
     annotation type (GF, GS, GR, GC) and an alignment holding all of them.
     """
     self.seqs = [DNASequence("ACC-G-GGTA", id="seq1"),
                  DNASequence("TCC-G-GGCA", id="seq2")]
     # Each key appears once: a repeated key in the pair list would only
     # overwrite the earlier value without changing its position.
     self.GF = OrderedDict([
         ("AC", "RF00360"),
         ("BM", ["cmbuild  -F CM SEED",
                 "cmsearch  -Z 274931 -E 1000000"]),
         ("SQ", "9"),
         ("RT", ["TITLE1",  "TITLE2"]),
         ("RN", ["[1]", "[2]"]),
         ("RA", ["Auth1;", "Auth2;"]),
         ("RL", ["J Mol Biol", "Cell"]),
         ("RM", ["11469857", "12007400"]),
     ])
     self.GS = {"AC": OrderedDict([("seq1", "111"), ("seq2", "222")])}
     self.GR = {"SS": OrderedDict([("seq1", "1110101111"),
                                   ("seq2", "0110101110")])}
     self.GC = {"SS_cons": "(((....)))"}
     self.st = StockholmAlignment(self.seqs, gc=self.GC, gf=self.GF,
                                  gs=self.GS, gr=self.GR)
예제 #10
0
class StockholmAlignmentTests(TestCase):
    """Tests for stockholmAlignment object

    Covers metadata retrieval, parsing through ``from_file``, the private
    ``_parse_*`` helpers' error handling, and ``str()`` serialization.
    """

    def setUp(self):
        """Setup for stockholm tests.

        Builds two gapped DNA sequences, one metadata mapping per stockholm
        annotation type (GF, GS, GR, GC) and an alignment holding all of
        them.
        """
        self.seqs = [DNASequence("ACC-G-GGTA", id="seq1"),
                     DNASequence("TCC-G-GGCA", id="seq2")]
        # Each key appears once: a repeated key in the pair list would only
        # overwrite the earlier value without changing its position.
        self.GF = OrderedDict([
            ("AC", "RF00360"),
            ("BM", ["cmbuild  -F CM SEED",
                    "cmsearch  -Z 274931 -E 1000000"]),
            ("SQ", "9"),
            ("RT", ["TITLE1",  "TITLE2"]),
            ("RN", ["[1]", "[2]"]),
            ("RA", ["Auth1;", "Auth2;"]),
            ("RL", ["J Mol Biol", "Cell"]),
            ("RM", ["11469857", "12007400"]),
        ])
        self.GS = {"AC": OrderedDict([("seq1", "111"), ("seq2", "222")])}
        self.GR = {"SS": OrderedDict([("seq1", "1110101111"),
                                      ("seq2", "0110101110")])}
        self.GC = {"SS_cons": "(((....)))"}
        self.st = StockholmAlignment(self.seqs, gc=self.GC, gf=self.GF,
                                     gs=self.GS, gr=self.GR)

    def test_retrieve_metadata(self):
        """Metadata passed to the constructor is exposed unchanged"""
        self.assertEqual(self.st.gc, self.GC)
        self.assertEqual(self.st.gf, self.GF)
        self.assertEqual(self.st.gs, self.GS)
        self.assertEqual(self.st.gr, self.GR)

    def test_from_file_alignment(self):
        """make sure can parse basic sto file with interleaved alignment"""
        sto = StringIO("# STOCKHOLM 1.0\n"
                       "seq1      ACC-G\n"
                       "seq2      TCC-G\n\n"
                       "seq1      -GGTA\n"
                       "seq2      -GGCA\n//")
        obs_sto = next(StockholmAlignment.from_file(sto, DNA))
        exp_sto = StockholmAlignment(self.seqs)
        self.assertEqual(obs_sto, exp_sto)

    def test_from_file_GF(self):
        """Make sure GF lines are parsed correctly"""
        # remove rn line to make sure auto-added
        self.GF.pop("RN")
        sto = StringIO("# STOCKHOLM 1.0\n#=GF RN [1]\n#=GF RM 11469857\n"
                       "#=GF RT TITLE1\n#=GF RA Auth1;\n#=GF RL J Mol Biol\n"
                       "#=GF RN [2]\n#=GF RM 12007400\n#=GF RT TITLE2\n"
                       "#=GF RA Auth2;\n#=GF RL Cell\n#=GF AC RF00360\n"
                       "#=GF BM cmbuild  -F CM SEED\n"
                       "#=GF BM cmsearch  -Z 274931 -E 1000000\n#=GF SQ 9\n"
                       "seq1         ACC-G-GGTA\nseq2         TCC-G-GGCA\n//")
        obs_sto = next(StockholmAlignment.from_file(sto, DNA))
        exp_sto = StockholmAlignment(self.seqs, self.GF, {}, {}, {})
        self.assertEqual(obs_sto, exp_sto)

    def test_from_file_GC(self):
        """Make sure GC lines are parsed correctly"""
        sto = StringIO("# STOCKHOLM 1.0\n"
                       "seq1         ACC-G-GGTA\nseq2         TCC-G-GGCA\n"
                       "#=GC SS_cons (((....)))\n//")
        obs_sto = next(StockholmAlignment.from_file(sto, DNA))
        exp_sto = StockholmAlignment(self.seqs, {}, {}, {}, self.GC)
        self.assertEqual(obs_sto, exp_sto)

    def test_from_file_GS(self):
        """Make sure GS lines are parsed correctly"""
        sto = StringIO("# STOCKHOLM 1.0\n#=GS seq2 AC 222\n#=GS seq1 AC 111\n"
                       "seq1          ACC-G-GGTA\n"
                       "seq2          TCC-G-GGCA\n//")
        obs_sto = next(StockholmAlignment.from_file(sto, DNA))
        exp_sto = StockholmAlignment(self.seqs, {}, self.GS, {}, {})
        self.assertEqual(obs_sto, exp_sto)

    def test_from_file_GR(self):
        """Make sure GR lines are parsed correctly"""
        sto = StringIO("# STOCKHOLM 1.0\nseq1          ACC-G\n"
                       "#=GR seq1 SS  11101\nseq2          TCC-G\n"
                       "#=GR seq2 SS  01101\n\nseq1          -GGTA\n"
                       "#=GR seq1 SS  01111\nseq2          -GGCA\n"
                       "#=GR seq2 SS  01110\n//")
        obs_sto = next(StockholmAlignment.from_file(sto, DNA))
        exp_sto = StockholmAlignment(self.seqs, {}, {}, self.GR, {})
        self.assertEqual(obs_sto, exp_sto)

    def test_from_file_multi(self):
        """Make sure yield works correctly with multi-alignment sto files

        The input holds two alignments back to back: the first carries only
        GS lines, the second only GR lines. Exactly two alignments must be
        yielded, in that order.
        """
        sto = StringIO("# STOCKHOLM 1.0\n#=GS seq2 AC 222\n#=GS seq1 AC 111\n"
                       "seq1          ACC-G-GGTA\n"
                       "seq2          TCC-G-GGCA\n//\n"
                       "# STOCKHOLM 1.0\nseq1          ACC-G-GGTA\n"
                       "#=GR seq1 SS  1110101111\nseq2          TCC-G-GGCA\n"
                       "#=GR seq2 SS  0110101110\n//")
        exp_stos = [StockholmAlignment(self.seqs, {}, self.GS, {}, {}),
                    StockholmAlignment(self.seqs, {}, {}, self.GR, {})]
        obs_stos = list(StockholmAlignment.from_file(sto, DNA))
        # Check the count explicitly: a parser that yields too few alignments
        # must fail the test as well, not only one that yields too many.
        self.assertEqual(len(obs_stos), len(exp_stos),
                         "Expected exactly 2 sto alignments to be parsed!")
        for obs, exp_sto in zip(obs_stos, exp_stos):
            self.assertEqual(obs, exp_sto)

    def test_parse_gf_multiline_nh(self):
        """Makes sure a multiline NH code is parsed correctly"""
        sto = ["#=GF TN MULTILINE TREE",
               "#=GF NH THIS IS FIRST", "#=GF NH THIS IS SECOND",
               "#=GF AC 1283394"]
        exp = {'TN': 'MULTILINE TREE',
               'NH': 'THIS IS FIRST THIS IS SECOND',
               'AC': '1283394'}
        self.assertEqual(self.st._parse_gf_info(sto), exp)

    def test_parse_gf_multiline_cc(self):
        """Makes sure a multiline CC code is parsed correctly"""
        sto = ["#=GF CC THIS IS FIRST", "#=GF CC THIS IS SECOND"]
        exp = {'CC': 'THIS IS FIRST THIS IS SECOND'}
        self.assertEqual(self.st._parse_gf_info(sto), exp)

    def test_parse_gf_info_nongf(self):
        """Makes sure error raised if non-GF line passed"""
        sto = ["#=GF AC BLAAAAAAAHHH", "#=GC HUH THIS SHOULD NOT BE HERE"]
        with self.assertRaises(StockholmParseError):
            self.st._parse_gf_info(sto)

    def test_parse_gf_info_malformed(self):
        """Makes sure error raised if too short a line passed"""
        sto = ["#=GF AC", "#=GF"]
        with self.assertRaises(StockholmParseError):
            self.st._parse_gf_info(sto)

    def test_parse_gc_info_nongf(self):
        """Makes sure error raised if non-GC line passed"""
        sto = ["#=GC AC BLAAAAAAAHHH", "#=GF HUH THIS SHOULD NOT BE HERE"]
        with self.assertRaises(StockholmParseError):
            # Exercise the GC parser: the first line is valid GC, the second
            # is the offending non-GC line.
            self.st._parse_gc_info(sto)

    def test_parse_gc_info_strict_len(self):
        """Make sure error raised if GC lines bad length and strict parsing"""
        sto = ["#=GC SS_cons (((..)))"]
        with self.assertRaises(StockholmParseError):
            self.st._parse_gc_info(sto, seqlen=20, strict=True)

    def test_parse_gc_info_strict_duplicate(self):
        """Make sure error raised if GC lines repeated"""
        sto = ["#=GC SS_cons (((..)))", "#=GC SS_cons (((..)))"]
        with self.assertRaises(StockholmParseError):
            self.st._parse_gc_info(sto, seqlen=8, strict=True)

    def test_parse_gc_info_malformed(self):
        """Makes sure error raised if too short a line passed"""
        sto = ["#=GC AC BLAAAAAAAHHH", "#=GC"]
        with self.assertRaises(StockholmParseError):
            self.st._parse_gc_info(sto)

    def test_parse_gs_gr_info_mixed(self):
        """Makes sure error raised if mixed GS and GR lines passed"""
        sto = ["#=GS seq1 AC BLAAA", "#=GR seq2 HUH THIS SHOULD NOT BE HERE"]
        with self.assertRaises(StockholmParseError):
            self.st._parse_gs_gr_info(sto)

    def test_parse_gs_gr_info_malformed(self):
        """Makes sure error raised if too short a line passed"""
        sto = ["#=GS AC BLAAAAAAAHHH", "#=GS"]
        with self.assertRaises(StockholmParseError):
            self.st._parse_gs_gr_info(sto)

    def test_parse_gs_gr_info_strict(self):
        """Make sure error raised if GR lines bad length and strict parsing"""
        sto = ["#=GR seq1 SS  10101111", "#=GR seq2 SS  01101"]
        with self.assertRaises(StockholmParseError):
            self.st._parse_gs_gr_info(sto, seqlen=20, strict=True)

    def test_str(self):
        """ Make sure stockholm with all information contained is formatted
        correctly """
        st = StockholmAlignment(self.seqs, gc=self.GC, gf=self.GF, gs=self.GS,
                                gr=self.GR)
        obs = str(st)
        exp = ('# STOCKHOLM 1.0\n'
               '#=GF AC RF00360\n'
               '#=GF BM cmbuild  -F CM SEED\n'
               '#=GF BM cmsearch  -Z 274931 -E 1000000\n'
               '#=GF SQ 9\n'
               '#=GF RN [1]\n'
               '#=GF RM 11469857\n'
               '#=GF RT TITLE1\n'
               '#=GF RA Auth1;\n'
               '#=GF RL J Mol Biol\n'
               '#=GF RN [2]\n'
               '#=GF RM 12007400\n'
               '#=GF RT TITLE2\n'
               '#=GF RA Auth2;\n'
               '#=GF RL Cell\n'
               '#=GS seq1 AC 111\n'
               '#=GS seq2 AC 222\n'
               'seq1          ACC-G-GGTA\n'
               '#=GR seq1 SS  1110101111\n'
               'seq2          TCC-G-GGCA\n'
               '#=GR seq2 SS  0110101110\n'
               '#=GC SS_cons  (((....)))\n//')
        self.assertEqual(obs, exp)

    def test_str_gc(self):
        """ Make sure stockholm with only GC information contained is formatted
        correctly """
        st = StockholmAlignment(self.seqs, gc=self.GC, gf=None, gs=None,
                                gr=None)
        obs = str(st)
        exp = ("# STOCKHOLM 1.0\nseq1          ACC-G-GGTA\n"
               "seq2          TCC-G-GGCA\n"
               "#=GC SS_cons  (((....)))\n//")
        self.assertEqual(obs, exp)

    def test_str_gf(self):
        """ Make sure stockholm with only GF information contained is formatted
        correctly """
        st = StockholmAlignment(self.seqs, gc=None, gf=self.GF, gs=None,
                                gr=None)
        obs = str(st)
        exp = ('# STOCKHOLM 1.0\n'
               '#=GF AC RF00360\n'
               '#=GF BM cmbuild  -F CM SEED\n'
               '#=GF BM cmsearch  -Z 274931 -E 1000000\n'
               '#=GF SQ 9\n'
               '#=GF RN [1]\n'
               '#=GF RM 11469857\n'
               '#=GF RT TITLE1\n'
               '#=GF RA Auth1;\n'
               '#=GF RL J Mol Biol\n'
               '#=GF RN [2]\n'
               '#=GF RM 12007400\n'
               '#=GF RT TITLE2\n'
               '#=GF RA Auth2;\n'
               '#=GF RL Cell\n'
               'seq1          ACC-G-GGTA\n'
               'seq2          TCC-G-GGCA\n//')
        self.assertEqual(obs, exp)

    def test_str_gs(self):
        """ Make sure stockholm with only GS information contained is formatted
        correctly """
        st = StockholmAlignment(self.seqs, gc=None, gf=None, gs=self.GS,
                                gr=None)
        obs = str(st)
        exp = ('# STOCKHOLM 1.0\n'
               '#=GS seq1 AC 111\n'
               '#=GS seq2 AC 222\n'
               'seq1          ACC-G-GGTA\n'
               'seq2          TCC-G-GGCA\n//')
        self.assertEqual(obs, exp)

    def test_str_gr(self):
        """ Make sure stockholm with only GR information contained is formatted
        correctly """
        st = StockholmAlignment(self.seqs, gc=None, gf=None, gs=None,
                                gr=self.GR)
        obs = str(st)
        exp = ("# STOCKHOLM 1.0\nseq1          ACC-G-GGTA\n"
               "#=GR seq1 SS  1110101111\nseq2          TCC-G-GGCA\n"
               "#=GR seq2 SS  0110101110\n//")
        self.assertEqual(obs, exp)

    def test_str_trees(self):
        """ Make sure stockholm with trees printed correctly"""
        # Built from a pair list so the key order is explicit and does not
        # rely on dict-literal ordering.
        GF = OrderedDict([("NH", ["IMATREE", "IMATREETOO"]),
                          ("TN", ["Tree2", "Tree1"])])
        st = StockholmAlignment(self.seqs, gc=None, gf=GF, gs=None,
                                gr=None)
        obs = str(st)
        exp = ("# STOCKHOLM 1.0\n#=GF TN Tree2\n#=GF NH IMATREE\n#=GF TN Tree1"
               "\n#=GF NH IMATREETOO\nseq1          ACC-G-GGTA\n"
               "seq2          TCC-G-GGCA\n//")

        self.assertEqual(obs, exp)