def test_parse_fasta__multiple_records():
    # Three records; the first and third have their sequence wrapped over
    # two lines, and the second carries meta text after the name.
    fasta_lines = [
        ">first\n",
        "TGTTCTCCACCGTGCACAAC\n",
        "CCTTCATCCA\n",
        ">Second XT:1:0\n",
        "GAGAGCTCAGCTAAC\n",
        ">Third\n",
        "CGCTGACCAAAAACGGACAG\n",
        "GGCATTCGGC\n",
    ]
    # Each expected item is ((name, meta), sequence), with wrapped sequence
    # lines joined and meta set to None when the header has none.
    wanted = [
        (("first", None), "TGTTCTCCACCGTGCACAACCCTTCATCCA"),
        (("Second", "XT:1:0"), "GAGAGCTCAGCTAAC"),
        (("Third", None), "CGCTGACCAAAAACGGACAGGGCATTCGGC"),
    ]
    assert_list_equals(parse_fasta(fasta_lines), wanted)
def test_parse_fasta__multiple_records():
    # Expected records, in input order, as ((name, meta), sequence).
    first = (("first", None), "TGTTCTCCACCGTGCACAACCCTTCATCCA")
    second = (("Second", "XT:1:0"), "GAGAGCTCAGCTAAC")
    third = (("Third", None), "CGCTGACCAAAAACGGACAGGGCATTCGGC")

    # Input: "first" and "Third" are split over two sequence lines each.
    records = parse_fasta(
        [
            ">first\n",
            "TGTTCTCCACCGTGCACAAC\n",
            "CCTTCATCCA\n",
            ">Second XT:1:0\n",
            "GAGAGCTCAGCTAAC\n",
            ">Third\n",
            "CGCTGACCAAAAACGGACAG\n",
            "GGCATTCGGC\n",
        ]
    )
    assert_list_equals(records, [first, second, third])
def parse_msa(lines, read_meta=False):
    """Parse FASTA-formatted lines into a multiple sequence alignment.

    Records are read with parse_fasta; each header is expected to look like

        >NAME META-INFORMATION
        SEQUENCE

    where the meta information is whatever follows the first space.

    Arguments:
      lines -- a file or list of lines, passed unchanged to parse_fasta.
      read_meta -- if true, also return the meta information per record.

    Returns a dictionary mapping names to sequences. If read_meta is true,
    returns a tuple (msa, metas), where metas maps each name to the meta
    value yielded by parse_fasta for that record.

    Raises MSAError if the same name occurs more than once. The collected
    alignment is passed to validate_msa before returning, so any error
    raised by that check propagates to the caller.
    """
    sequences = {}
    meta_by_name = {}
    for (name, meta), sequence in parse_fasta(lines):
        # A dictionary cannot hold two records under the same name.
        if name in sequences:
            raise MSAError("Duplicate names found, cannot be represented as MSA: " + name)

        sequences[name] = sequence
        meta_by_name[name] = meta

    validate_msa(sequences)

    return (sequences, meta_by_name) if read_meta else sequences
def parse_msa(lines, read_meta=False):
    """Read a MSA from a file/list of FASTA lines.

    The result is a dictionary of names to sequences. With read_meta set,
    a second dictionary of names to meta information is returned alongside
    it, as the tuple (msa, metas). Meta information is the text included
    after the first space in the header of each sequence:

        >NAME META-INFORMATION
        SEQUENCE

    Raises MSAError when two records share a name. validate_msa is called
    on the alignment before it is returned; errors from it are not caught
    here.
    """
    alignment = {}
    annotations = {}

    for record in parse_fasta(lines):
        (name, meta), sequence = record
        if name in alignment:
            # Names are the dictionary keys, so a repeat cannot be stored.
            message = "Duplicate names found, cannot be represented as MSA: " + name
            raise MSAError(message)
        alignment[name], annotations[name] = sequence, meta

    validate_msa(alignment)

    if not read_meta:
        return alignment
    return alignment, annotations
def test_parse_fasta__single_record():
    # One record whose sequence is wrapped over two lines; the parser is
    # expected to join them and report no meta information (None).
    fasta_lines = [
        ">single\n",
        "TGTTCTCCACCGTGCACAAC\n",
        "CCTTCATCCA\n",
    ]
    wanted = [
        (("single", None), "TGTTCTCCACCGTGCACAACCCTTCATCCA"),
    ]
    assert_list_equals(parse_fasta(fasta_lines), wanted)
def test_parse_fasta__no_records():
    # Empty input is expected to produce no records at all.
    no_lines = []
    records = parse_fasta(no_lines)
    assert_list_equals(records, [])
def test_parse_fasta__empty_name__alone():
    # Single record whose header is a bare ">" with no name.
    # NOTE(review): nothing is asserted here; the parser output is only
    # consumed. Presumably an error is expected (e.g. via a decorator not
    # visible in this view) -- confirm.
    records = parse_fasta([">\n", "ACGT\n"])
    list(records)
def test_parse_fasta__missing_name__alone():
    # Sequence line with no preceding ">" header at all.
    # NOTE(review): nothing is asserted here; the parser output is only
    # consumed. Presumably an error is expected (e.g. via a decorator not
    # visible in this view) -- confirm.
    records = parse_fasta(["ACGT\n"])
    list(records)
def test_parse_fasta__empty_record__middle():
    # ">fasta1" has no sequence lines and sits between two complete records.
    # NOTE(review): nothing is asserted here; the parser output is only
    # consumed. Presumably an error is expected (e.g. via a decorator not
    # visible in this view) -- confirm.
    fasta_lines = [
        ">fasta0\n",
        "ACGT\n",
        ">fasta1\n",
        ">fasta2\n",
        "AGTC\n",
    ]
    records = parse_fasta(fasta_lines)
    list(records)
def test_parse_empty_record_last():
    # The final header, ">fasta2", is not followed by any sequence line.
    # NOTE(review): nothing is asserted here; the parser output is only
    # consumed. Presumably an error is expected (e.g. via a decorator not
    # visible in this view) -- confirm.
    fasta_lines = [
        ">fasta1\n",
        "ACGT\n",
        ">fasta2\n",
    ]
    records = parse_fasta(fasta_lines)
    list(records)
def test_parse_fasta__empty_record_name_only__first():
    # The first header, ">fasta1", is immediately followed by another header.
    # NOTE(review): nothing is asserted here; the parser output is only
    # consumed. Presumably an error is expected (e.g. via a decorator not
    # visible in this view) -- confirm.
    fasta_lines = [
        ">fasta1\n",
        ">fasta2\n",
        "AGTC\n",
    ]
    list(parse_fasta(fasta_lines))
def test_parse_fasta__empty_record_name_only__nothing_else():
    # Input is a lone header line with no sequence and no other records.
    # NOTE(review): nothing is asserted here; the parser output is only
    # consumed. Presumably an error is expected (e.g. via a decorator not
    # visible in this view) -- confirm.
    fasta_lines = [">fasta1\n"]
    list(parse_fasta(fasta_lines))
def test_parse_fasta__missing_name__with_others():
    # A header-less sequence line comes before a complete "Foo" record.
    # NOTE(review): nothing is asserted here; the parser output is only
    # consumed. Presumably an error is expected (e.g. via a decorator not
    # visible in this view) -- confirm.
    fasta_lines = [
        "ACGT\n",
        ">Foo\n",
        "ACGGTA\n",
    ]
    records = parse_fasta(fasta_lines)
    list(records)
def test_parse_fasta__empty_name__with_others():
    # A record with a bare ">" header comes before a complete "Foo" record.
    # NOTE(review): nothing is asserted here; the parser output is only
    # consumed. Presumably an error is expected (e.g. via a decorator not
    # visible in this view) -- confirm.
    fasta_lines = [
        ">\n",
        "ACGT\n",
        ">Foo\n",
        "ACGGTA\n",
    ]
    records = parse_fasta(fasta_lines)
    list(records)