def test_parsers_error(self):
    """DelimitedRecordFinder should raise RecordError if trailing data"""
    good = [
        " \t abc \n",
        "\t def\n",
        "// \t\n",
        "\t\n",
        "\t efg \n",
        "\t\t//\n",
    ]
    blank = ["", " ", "\t \t\n\n"]
    bad = ["abc"]
    result = [["abc", "def", "//"], ["efg", "//"]]
    r = DelimitedRecordFinder("//")
    self.assertEqual(list(r(good)), result)
    # blank lines are ignored, so appending them must not change the result
    self.assertEqual(list(r(good + blank)), result)
    # idiomatic unittest: assertRaises replaces the manual
    # try/except/else-raise-AssertionError pattern
    with self.assertRaises(RecordError):
        list(r(good + bad))
    # non-strict mode returns the trailing partial record instead of raising
    r = DelimitedRecordFinder("//", strict=False)
    self.assertEqual(list(r(good + bad)), result + [["abc"]])
def test_parsers(self):
    """DelimitedRecordFinder should split records into lines correctly"""
    lines = "abc\ndef\n//\nefg\n//".split()
    # default finder keeps the '//' delimiter line in each record
    keeping = DelimitedRecordFinder("//")
    self.assertEqual(
        list(keeping(lines)),
        [["abc", "def", "//"], ["efg", "//"]],
    )
    # keep_delimiter=False drops the '//' line from each record
    dropping = DelimitedRecordFinder("//", keep_delimiter=False)
    self.assertEqual(
        list(dropping(lines)),
        [["abc", "def"], ["efg"]],
    )
def MinimalGreengenesParser(lines, LineDelim="=", RecStart="BEGIN", RecEnd="END"):
    """Yield GenericRecord objects parsed from raw Greengenes 16S rRNA gene records.

    lines : open records file
    LineDelim : delimiter within an individual line, e.g. foo=bar
    RecStart : identifier marking the start of a record
    RecEnd : identifier marking the end of a record
    """
    splitter = DefaultDelimitedSplitter(delimiter=LineDelim)
    # Split the end-of-record marker the same way data lines are split, so the
    # record finder can still recognize it after each line has been parsed.
    end_marker = splitter(RecEnd)
    # Drop the start-of-record marker lines entirely.
    skip_start = make_ignore_f(RecStart)
    finder = DelimitedRecordFinder(
        end_marker,
        constructor=splitter,
        keep_delimiter=False,
        ignore=skip_start,
    )
    for rec in finder(lines):
        yield GenericRecord(rec)
def test_parsers_strip(self):
    """DelimitedRecordFinder should trim each line correctly"""
    # leading/trailing whitespace on every line must be stripped away
    raw = " \t abc \n \t def\n // \t\n\t\t efg \n//"
    expected = [["abc", "def", "//"], ["efg", "//"]]
    self.assertEqual(list(DelimitedRecordFinder("//")(raw.split("\n"))), expected)
def test_parsers_ignore(self):
    """DelimitedRecordFinder should skip lines to ignore."""

    def keep_everything(line):
        # ignore nothing, not even blank lines
        return False

    def skip_blank_and_comments(line):
        # drop empty lines, whitespace-only lines, and '#'-prefixed lines
        return (not line) or line.isspace() or line.startswith("#")

    lines = [">abc", "\n", "1", "$$", ">def", "#ignore", "2", "$$"]
    # default ignore: blank line dropped, comment kept
    self.assertEqual(
        list(DelimitedRecordFinder("$$")(lines)),
        [[">abc", "1", "$$"], [">def", "#ignore", "2", "$$"]],
    )
    # ignore nothing: the blank line survives (stripped to "")
    self.assertEqual(
        list(DelimitedRecordFinder("$$", ignore=keep_everything)(lines)),
        [[">abc", "", "1", "$$"], [">def", "#ignore", "2", "$$"]],
    )
    # custom ignore: blank line and comment both dropped
    self.assertEqual(
        list(DelimitedRecordFinder("$$", ignore=skip_blank_and_comments)(lines)),
        [[">abc", "1", "$$"], [">def", "2", "$$"]],
    )
all_chars = maketrans("", "") dna_lc = "utacgrywsmkbdhvn" dna_lc_cmp = "aatgcyrwskmvhdbn" dna_trans = maketrans(dna_lc + dna_lc.upper(), dna_lc_cmp + dna_lc_cmp.upper()) rna_lc = "utacgrywsmkbdhvn" rna_lc_cmp = "aaugcyrwskmvhdbn" rna_trans = maketrans(rna_lc + rna_lc.upper(), rna_lc_cmp + rna_lc_cmp.upper()) locus_fields = [ None, "locus", "length", None, "mol_type", "topology", "db", "date" ] _locus_parser = FieldWrapper(locus_fields) # need to turn off line stripping, because whitespace is significant GbFinder = DelimitedRecordFinder("//", constructor=rstrip) class PartialRecordError(Exception): pass def parse_locus(line): """Parses a locus line, including conversion of Length to an int. WARNING: Gives incorrect results on legacy records that omit the topology. All records spot-checked on 8/30/05 had been updated to include the topology even when prior versions omitted it. """ result = _locus_parser(line) try:
def test_parsers_empty(self):
    """DelimitedRecordFinder should return empty list on empty lines"""
    finder = DelimitedRecordFinder("//")
    # whitespace-only input yields no records
    self.assertEqual(list(finder([" ", "\n"])), [])
    # so does a completely empty input
    self.assertEqual(list(finder([])), [])
def QMEPsiBlast9(lines): """Returns successive query, match, e-value from lines of Psi-Blast run. Assumes tabular output. Uses last iteration from each query. WARNING: Allows duplicates in result """ result = [] for query in PsiBlastQueryFinder(lines): for iteration in PsiBlastFinder(query): pass result.extend(QMEBlast9(iteration)) return result fastacmd_taxonomy_splitter = DelimitedRecordFinder(delimiter="", ignore=never_ignore) fasta_field_map = { "NCBI sequence id": "seq_id", "NCBI taxonomy id": "tax_id", "Common name": "common_name", "Scientific name": "scientific_name", } def FastacmdTaxonomyParser(lines): """Yields successive records from the results of fastacmd -T. Format is four lines separated by newline: NCBI sequence NCBI taxonomy Common name
__author__ = "Sandra Smit" __copyright__ = "Copyright 2007-2020, The Cogent Project" __credits__ = ["Sandra Smit", "Rob Knight"] __license__ = "BSD-3" __version__ = "2020.6.30a" __maintainer__ = "Sandra Smit" __email__ = "*****@*****.**" __status__ = "Development" strip = str.strip maketrans = str.maketrans RdbFinder = DelimitedRecordFinder("//") _field_names = { "acc": "rRNA", "src": "Source", "str": "Strain", "ta1": "Taxonomy1", "ta2": "Taxonomy2", "ta3": "Taxonomy3", "ta4": "Taxonomy4", "chg": "Changes", "rem": "Remarks", "aut": "Authors", "ttl": "Title", "jou": "Journal", "dat": "JournalYear",