def test_parsers_error(self):
    """DelimitedRecordFinder should raise RecordError if trailing data"""
    good = [' \t abc \n',
            '\t def\n',
            '// \t\n',
            '\t\n',
            '\t efg \n',
            '\t\t//\n']
    blank = ['', ' ', '\t \t\n\n']
    bad = ['abc']

    result = [['abc', 'def', '//'], ['efg', '//']]
    r = DelimitedRecordFinder('//')
    self.assertEqual(list(r(good)), result)
    # blank lines are ignored and must not change the parse
    self.assertEqual(list(r(good + blank)), result)
    # strict (default) parser must reject trailing data with no delimiter;
    # assertRaises replaces the manual try/except/else and the Py2-only
    # `raise AssertionError, msg` statement syntax
    self.assertRaises(RecordError, list, r(good + bad))
    # non-strict parser keeps the trailing partial record instead
    r = DelimitedRecordFinder('//', strict=False)
    self.assertEqual(list(r(good + bad)), result + [['abc']])
def test_parsers(self):
    """DelimitedRecordFinder should split records into lines correctly"""
    data = 'abc\ndef\n//\nefg\n//'.split()
    # by default the delimiter line is kept at the end of each record
    self.assertEqual(list(DelimitedRecordFinder('//')(data)),
                     [['abc', 'def', '//'], ['efg', '//']])
    # with keep_delimiter=False the delimiter line is dropped
    finder = DelimitedRecordFinder('//', keep_delimiter=False)
    self.assertEqual(list(finder(data)),
                     [['abc', 'def'], ['efg']])
def __call__(self, infile):
    """Yield AAIndexRecord objects parsed from an AAIndex file.

    infile = file to parse as file object or list of lines

    Note: this is a generator, not a dict builder -- callers that need
    a dict keyed by record ID should build it from the yielded records.

    Usage:
        aa1p = AAIndex1Parser()
        aaIndex1Objects = aa1p('data/AAIndex1')

        aa2p = AAIndex2Parser()
        aaIndex2Objects = aa2p('data/AAIndex2')
    """
    # Break the file into records delimited by '//'; rstrip keeps leading
    # whitespace (significant in AAIndex data) while trimming line ends.
    AAIndexRecordFinder = DelimitedRecordFinder('//', constructor=rstrip)
    for r in AAIndexRecordFinder(infile):
        new_record = self._parse_record(r)
        # _parse_record may return a falsy value for unparseable records;
        # those are silently skipped
        if new_record:
            yield new_record
def MinimalGreengenesParser(lines, LineDelim="=", RecStart="BEGIN", RecEnd="END"):
    """Yield GenericRecords parsed from raw Greengenes 16S rRNA gene records.

    lines     : open records file
    LineDelim : individual line delimiter, eg foo=bar
    RecStart  : start identifier for a record
    RecEnd    : end identifier for a record
    """
    splitter = DefaultDelimitedSplitter(delimiter=LineDelim)
    # Pre-split the end marker so it matches record lines after they have
    # been run through the same splitter.
    end_marker = splitter(RecEnd)
    # The start marker carries no data, so it is ignored entirely.
    skip_start = make_ignore_f(RecStart)
    record_iter = DelimitedRecordFinder(end_marker,
                                        constructor=splitter,
                                        keep_delimiter=False,
                                        ignore=skip_start)
    for rec in record_iter(lines):
        yield GenericRecord(rec)
def test_parsers_ignore(self):
    """DelimitedRecordFinder should skip lines to ignore."""
    def keep_everything(line):
        return False

    def skip_blank_or_comment(line):
        return (not line) or line.isspace() or line.startswith('#')

    data = ['>abc', '\n', '1', '$$', '>def', '#ignore', '2', '$$']

    # default ignore: blank lines dropped, comment lines kept
    self.assertEqual(list(DelimitedRecordFinder('$$')(data)),
                     [['>abc', '1', '$$'], ['>def', '#ignore', '2', '$$']])
    # ignore nothing: the blank line survives as an empty string
    finder = DelimitedRecordFinder('$$', ignore=keep_everything)
    self.assertEqual(list(finder(data)),
                     [['>abc', '', '1', '$$'], ['>def', '#ignore', '2', '$$']])
    # custom ignore: both blanks and '#' comments are skipped
    finder = DelimitedRecordFinder('$$', ignore=skip_blank_or_comment)
    self.assertEqual(list(finder(data)),
                     [['>abc', '1', '$$'], ['>def', '2', '$$']])
def is_empty_or_html(line):
    """Return True for HTML line and empty (or whitespace only) line.

    line -- string

    The Rfam adaptor that retrieves records includes two HTML tags in
    the record. These lines need to be ignored in addition to empty lines.
    """
    # startswith accepts a tuple: one call covers both '<pre' and '</pre'
    if line.startswith(('<pre', '</pre')):
        return True
    return (not line) or line.isspace()

Sequence = BYTES.Sequence

# Rfam record finder: records are delimited by '//', with blank lines and
# the HTML tags from the Rfam adaptor ignored.
RfamFinder = DelimitedRecordFinder('//', ignore=is_empty_or_html)

def load_from_clustal(data, seq_constructor=Sequence, strict=True):
    """Return Alignment or SequenceCollection built from clustal data.

    data            -- clustal-formatted input handed to ClustalParser
    seq_constructor -- callable used to build each sequence object
    strict          -- passed through to ClustalParser

    Returns an Alignment when all sequences have equal length, otherwise
    a SequenceCollection.
    """
    recs = [(name, seq_constructor(seq))
            for name, seq in ClustalParser(data, strict)]
    lengths = [len(i[1]) for i in recs]
    # only equal-length sequences constitute a true alignment
    if lengths and max(lengths) == min(lengths):
        return Alignment(recs, MolType=BYTES)
    else:
        return SequenceCollection(recs, MolType=BYTES)

#all fields concerning the references are translated to None, except for
# the MedLine ID, so that we can lookup the information if needed.
#RC = Reference comment
# Identity translation table covering all characters.
all_chars = maketrans('', '')

# DNA complement table over lowercase IUPAC codes ('u' is accepted and
# complemented like 't'); the upper() variants extend it to uppercase.
dna_lc = 'utacgrywsmkbdhvn'
dna_lc_cmp = 'aatgcyrwskmvhdbn'
dna_trans = maketrans(dna_lc + dna_lc.upper(), dna_lc_cmp + dna_lc_cmp.upper())

# RNA complement table: same input codes, but complements use 'u' not 't'.
rna_lc = 'utacgrywsmkbdhvn'
rna_lc_cmp = 'aaugcyrwskmvhdbn'
rna_trans = maketrans(rna_lc + rna_lc.upper(), rna_lc_cmp + rna_lc_cmp.upper())

# Positional fields of a GenBank LOCUS line; None marks tokens to discard.
locus_fields = [None, 'locus', 'length', None, 'mol_type', 'topology', 'db',
    'date']
_locus_parser = FieldWrapper(locus_fields)

#need to turn off line stripping, because whitespace is significant
GbFinder = DelimitedRecordFinder('//', constructor=rstrip)

class PartialRecordError(Exception):
    # NOTE(review): presumably raised for incomplete GenBank records --
    # the raising code is outside this excerpt.
    pass

def parse_locus(line):
    """Parses a locus line, including conversion of Length to an int.

    WARNING: Gives incorrect results on legacy records that omit the
    topology. All records spot-checked on 8/30/05 had been updated to
    include the topology even when prior versions omitted it.
    """
    result = _locus_parser(line)
    try:
    # (function body continues past this excerpt)
def test_parsers_empty(self):
    """DelimitedRecordFinder should return empty list on empty lines"""
    # neither whitespace-only input nor zero lines should yield a record
    for empty_input in ([' ', '\n'], []):
        self.assertEqual(list(DelimitedRecordFinder('//')(empty_input)), [])
def test_parsers_strip(self):
    """DelimitedRecordFinder should trim each line correctly"""
    raw = ' \t abc \n \t def\n // \t\n\t\t efg \n//'.split('\n')
    expected = [['abc', 'def', '//'], ['efg', '//']]
    # leading/trailing whitespace must be stripped from every line,
    # including the delimiter lines
    self.assertEqual(list(DelimitedRecordFinder('//')(raw)), expected)
for ix, best_hit in enumerate(best_hits): new_val = cast_fun(hit[field]) old_val = cast_fun(best_hit[field]) if cmp_fun(new_val, old_val): best_hits[ix] = hit continue yield q, best_hits def filterByIteration(self, iteration=-1): """Returns copy of self containing only specified iteration. Negative indices count backwards.""" #raise error if both field and f passed, uses same dict as filterByField fastacmd_taxonomy_splitter = DelimitedRecordFinder(delimiter='', \ ignore=never_ignore) fasta_field_map = { 'NCBI sequence id': 'seq_id', 'NCBI taxonomy id': 'tax_id', 'Common name': 'common_name', 'Scientific name': 'scientific_name' } def FastacmdTaxonomyParser(lines): """Yields successive records from the results of fastacmd -T. Format is four lines separated by newline: NCBI sequence NCBI taxonomy Common name
    # Tail of a function whose definition begins above this excerpt.
    return ELinkResultParser(link.read())

def get_between_tags(line):
    """Returns portion of line between xml tags."""
    return line.split('>', 1)[1].rsplit('<', 1)[0]

def taxon_lineage_extractor(lines):
    """Extracts lineage from taxonomy record lines, not incl. species."""
    for line in lines:
        if '<Lineage>' in line:
            #expect line of form <Lineage>xxxx</Lineage> where xxxx semicolon-
            #delimited
            between_tags = line.split('>', 1)[1].rsplit('<', 1)[0]
            # one stripped string per lineage level
            yield map(strip, between_tags.split(';'))

# Records in the XML stream end at the closing </Taxon> tag; strict=False
# tolerates trailing data after the last record.
taxon_record_finder = DelimitedRecordFinder('</Taxon>', constructor=None,
    strict=False)

def get_taxid_name_lineage(rec):
    """Returns taxon id, name, and lineage from single xml taxon record.

    rec -- list of lines of one record, as produced by taxon_record_finder

    Any field absent from the record is returned as None.
    """
    # NOTE(review): the leading whitespace in these tags appears intended to
    # match only elements at a fixed XML indentation depth -- confirm against
    # the actual feed formatting.
    tax_tag = ' <TaxId>'
    name_tag = ' <ScientificName>'
    lineage_tag = ' <Lineage>'
    taxid = name = lineage = None
    for line in rec:
        if line.startswith(tax_tag):
            taxid = get_between_tags(line)
        elif line.startswith(name_tag):
            name = get_between_tags(line)
        elif line.startswith(lineage_tag):
            # semicolon-delimited lineage, whitespace-stripped per level
            lineage = map(strip, get_between_tags(line).split(';'))
    return taxid, name, lineage
from cogent.parse.record_finder import DelimitedRecordFinder from cogent.parse.record import RecordError from cogent.core.sequence import Sequence, RnaSequence from cogent.core.info import Info from cogent.core.alphabet import AlphabetError __author__ = "Sandra Smit" __copyright__ = "Copyright 2007-2012, The Cogent Project" __credits__ = ["Sandra Smit", "Rob Knight"] __license__ = "GPL" __version__ = "1.5.3-dev" __maintainer__ = "Sandra Smit" __email__ = "*****@*****.**" __status__ = "Development" RdbFinder = DelimitedRecordFinder('//') _field_names = {'acc':'rRNA',\ 'src':'Source',\ 'str':'Strain',\ 'ta1':'Taxonomy1',\ 'ta2':'Taxonomy2',\ 'ta3':'Taxonomy3',\ 'ta4':'Taxonomy4',\ 'chg':'Changes',\ 'rem':'Remarks',\ 'aut':'Authors',\ 'ttl':'Title',\ 'jou':'Journal',\ 'dat':'JournalYear',\ 'vol':'JournalVolume',\