Example #1
    def test_parsers_error(self):
        """DelimitedRecordFinder should raise RecordError if trailing data"""
        good = [
            "  \t   abc  \n",
            "\t   def\n",
            "// \t\n",
            "\t\n",
            "\t efg \n",
            "\t\t//\n",
        ]
        blank = ["", "   ", "\t    \t\n\n"]
        bad = ["abc"]

        result = [["abc", "def", "//"], ["efg", "//"]]
        r = DelimitedRecordFinder("//")

        self.assertEqual(list(r(good)), result)
        self.assertEqual(list(r(good + blank)), result)
        try:
            list(r(good + bad))
        except RecordError:
            pass
        else:
            raise AssertionError("Parser failed to raise error on bad data")

        r = DelimitedRecordFinder("//", strict=False)
        self.assertEqual(list(r(good + bad)), result + [["abc"]])
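For comparison, a standalone sketch of the strict flag (the cogent3 import path is an assumption):

from cogent3.parse.record_finder import DelimitedRecordFinder  # assumed import path

lines = ["abc", "//", "trailing"]
lenient = DelimitedRecordFinder("//", strict=False)
# with strict=True (the default) the trailing line raises RecordError;
# with strict=False it comes back as a final, unterminated record
print(list(lenient(lines)))  # [['abc', '//'], ['trailing']]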
Example #2
    def test_parsers(self):
        """DelimitedRecordFinder should split records into lines correctly"""
        lines = "abc\ndef\n//\nefg\n//".split()
        self.assertEqual(
            list(DelimitedRecordFinder("//")(lines)),
            [["abc", "def", "//"], ["efg", "//"]],
        )
        self.assertEqual(
            list(DelimitedRecordFinder("//", keep_delimiter=False)(lines)),
            [["abc", "def"], ["efg"]],
        )
Example #3
def MinimalGreengenesParser(lines,
                            LineDelim="=",
                            RecStart="BEGIN",
                            RecEnd="END"):
    """Parses raw Greengeens 16S rRNA Gene records

    lines  :  open records file
    LineDelim  :  individual line delimiter, eg foo=bar
    RecStart  :  start identifier for a record
    RecEnd  :  end identifier for a record
    """
    line_parser = DefaultDelimitedSplitter(delimiter=LineDelim)

    # parse what the ending record looks like so it can match after being split
    RecordDelim = line_parser(RecEnd)

    # make sure to ignore the starting record
    ignore = make_ignore_f(RecStart)

    parser = DelimitedRecordFinder(RecordDelim,
                                   constructor=line_parser,
                                   keep_delimiter=False,
                                   ignore=ignore)

    for record in parser(lines):
        yield GenericRecord(record)
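A minimal usage sketch; the field names in the sample record are made up, and dict-style access on GenericRecord is an assumption:

from io import StringIO

sample = StringIO("BEGIN\nprokMSA_id=13652\nncbi_tax_string=Archaea\nEND\n")
for rec in MinimalGreengenesParser(sample):
    # one record per BEGIN..END block, minus the BEGIN/END marker lines
    print(rec["prokMSA_id"])  # -> "13652", assuming GenericRecord acts like a dict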
Example #4
    def test_parsers_strip(self):
        """DelimitedRecordFinder should trim each line correctly"""
        lines = "  \t   abc  \n \t   def\n  // \t\n\t\t efg \n//".split("\n")
        self.assertEqual(
            list(DelimitedRecordFinder("//")(lines)),
            [["abc", "def", "//"], ["efg", "//"]],
        )
Example #5
    def test_parsers_ignore(self):
        """DelimitedRecordFinder should skip lines to ignore."""
        def never(line):
            return False

        def ignore_labels(line):
            return (not line) or line.isspace() or line.startswith("#")

        lines = [">abc", "\n", "1", "$$", ">def", "#ignore", "2", "$$"]
        self.assertEqual(
            list(DelimitedRecordFinder("$$")(lines)),
            [[">abc", "1", "$$"], [">def", "#ignore", "2", "$$"]],
        )
        self.assertEqual(
            list(DelimitedRecordFinder("$$", ignore=never)(lines)),
            [[">abc", "", "1", "$$"], [">def", "#ignore", "2", "$$"]],
        )
        self.assertEqual(
            list(DelimitedRecordFinder("$$", ignore=ignore_labels)(lines)),
            [[">abc", "1", "$$"], [">def", "2", "$$"]],
        )
Example #6
all_chars = maketrans("", "")  # identity table under Python 2's string.maketrans; empty under str.maketrans
dna_lc = "utacgrywsmkbdhvn"
dna_lc_cmp = "aatgcyrwskmvhdbn"
dna_trans = maketrans(dna_lc + dna_lc.upper(), dna_lc_cmp + dna_lc_cmp.upper())
rna_lc = "utacgrywsmkbdhvn"
rna_lc_cmp = "aaugcyrwskmvhdbn"
rna_trans = maketrans(rna_lc + rna_lc.upper(), rna_lc_cmp + rna_lc_cmp.upper())
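# e.g. "tcag".translate(dna_trans) == "agtc"; the tables also complement the
# IUPAC degenerate codes, preserving case ("R" -> "Y"), and rna_trans pairs A with U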

locus_fields = [
    None, "locus", "length", None, "mol_type", "topology", "db", "date"
]
_locus_parser = FieldWrapper(locus_fields)

# need to turn off line stripping, because whitespace is significant
GbFinder = DelimitedRecordFinder("//", constructor=rstrip)


class PartialRecordError(Exception):
    pass


def parse_locus(line):
    """Parses a locus line, including conversion of Length to an int.

    WARNING: Gives incorrect results on legacy records that omit the topology.
    All records spot-checked on 8/30/05 had been updated to include the topology
    even when prior versions omitted it.
    """
    result = _locus_parser(line)
    try:
        # the docstring promises an int Length; the exception handling below is
        # an assumption, since the original source is truncated at this point
        result["length"] = int(result["length"])
    except (KeyError, TypeError, ValueError):
        raise PartialRecordError("Could not parse length from locus line: %s" % line)
    return result
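For a typical modern LOCUS line, parse_locus gives roughly the following (the sample line is illustrative; the field names come from locus_fields above):

line = "LOCUS       AB000263             368 bp    mRNA    linear   PRI 05-FEB-1999"
info = parse_locus(line)
# roughly: {"locus": "AB000263", "length": 368, "mol_type": "mRNA",
#           "topology": "linear", "db": "PRI", "date": "05-FEB-1999"}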
Example #7
    def test_parsers_empty(self):
        """DelimitedRecordFinder should return empty list on empty lines"""
        self.assertEqual(list(DelimitedRecordFinder("//")(["  ", "\n"])), [])
        self.assertEqual(list(DelimitedRecordFinder("//")([])), [])
Example #8
def QMEPsiBlast9(lines):
    """Returns successive query, match, e-value from lines of Psi-Blast run.

    Assumes tabular output. Uses last iteration from each query.

    WARNING: Allows duplicates in result
    """
    result = []
    for query in PsiBlastQueryFinder(lines):
        # exhaust the inner loop so `iteration` ends up bound to the last
        # iteration of this query (assumes each query has at least one)
        for iteration in PsiBlastFinder(query):
            pass
        result.extend(QMEBlast9(iteration))
    return result


fastacmd_taxonomy_splitter = DelimitedRecordFinder(delimiter="", ignore=never_ignore)
fasta_field_map = {
    "NCBI sequence id": "seq_id",
    "NCBI taxonomy id": "tax_id",
    "Common name": "common_name",
    "Scientific name": "scientific_name",
}


def FastacmdTaxonomyParser(lines):
    """Yields successive records from the results of fastacmd -T.

    Format is four lines separated by newline:
    NCBI sequence id
    NCBI taxonomy id
    Common name
    Scientific name
    """
    # body reconstructed from fasta_field_map above (the original example is
    # truncated here): map each "Label: value" line onto the short field names
    for record in fastacmd_taxonomy_splitter(lines):
        curr = {}
        for line in record:
            label, _, data = line.partition(":")
            if label.strip() in fasta_field_map:
                curr[fasta_field_map[label.strip()]] = data.strip()
        yield curr
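A sketch with fastacmd -T style input (the values are made up):

sample = [
    "NCBI sequence id: gi|1234",
    "NCBI taxonomy id: 9606",
    "Common name: human",
    "Scientific name: Homo sapiens",
    "",  # blank line is the record delimiter for fastacmd_taxonomy_splitter
]
for rec in FastacmdTaxonomyParser(sample):
    print(rec["scientific_name"])  # -> "Homo sapiens"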
Example #9

__author__ = "Sandra Smit"
__copyright__ = "Copyright 2007-2020, The Cogent Project"
__credits__ = ["Sandra Smit", "Rob Knight"]
__license__ = "BSD-3"
__version__ = "2020.6.30a"
__maintainer__ = "Sandra Smit"
__email__ = "*****@*****.**"
__status__ = "Development"

strip = str.strip
maketrans = str.maketrans


RdbFinder = DelimitedRecordFinder("//")

_field_names = {
    "acc": "rRNA",
    "src": "Source",
    "str": "Strain",
    "ta1": "Taxonomy1",
    "ta2": "Taxonomy2",
    "ta3": "Taxonomy3",
    "ta4": "Taxonomy4",
    "chg": "Changes",
    "rem": "Remarks",
    "aut": "Authors",
    "ttl": "Title",
    "jou": "Journal",
    "dat": "JournalYear",