def test_parsers_ignore(self):
    """LabeledRecordFinder should leave out the lines selected by ``ignore``."""

    def begins_record(line):
        return line.startswith(">")

    def ignore_nothing(line):
        return False

    def blank_or_hash(line):
        if not line:
            return True
        return line.isspace() or line.startswith("#")

    lines = [">abc", "\n", "1", ">def", "#ignore", "2"]

    # No ignore argument: the newline-only line is absent, "#ignore" stays.
    records = list(LabeledRecordFinder(begins_record)(lines))
    self.assertEqual(records, [[">abc", "1"], [">def", "#ignore", "2"]])

    # Nothing ignored: the newline-only line is kept and shows up as "".
    records = list(LabeledRecordFinder(begins_record, ignore=ignore_nothing)(lines))
    self.assertEqual(records, [[">abc", "", "1"], [">def", "#ignore", "2"]])

    # Blank and "#"-prefixed lines ignored: both extra lines are absent.
    records = list(LabeledRecordFinder(begins_record, ignore=blank_or_hash)(lines))
    self.assertEqual(records, [[">abc", "1"], [">def", "2"]])
# separate by semicolons # get rid of leading/trailing spaces taxa = list(map(strip, taxonomy.split(";"))) # delete trailing period if present last = taxa[-1] if last.endswith("."): taxa[-1] = last[:-1] return species, taxa def is_feature_component_start(line): """Checks if a line starts with '/', ignoring whitespace.""" return line.lstrip().startswith("/") feature_component_iterator = LabeledRecordFinder(is_feature_component_start) _join_with_empty = dict.fromkeys(["translation"]) _leave_as_lines = {} def parse_feature(lines): """Parses a feature. Doesn't handle subfeatures. Returns dict containing: 'type': source, gene, CDS, etc. 'location': unparsed location string ...then, key-value pairs for each annotation, e.g. '/gene="MNBH"' -> {'gene':['MNBH']} (i.e. quotes stripped) All relations are assumed 'to many', and order will be preserved. """
def is_gde_label(x):
    """Checks if x looks like a GDE label line.

    A label line is one whose first character is '%' or '#'.

    Note: the result is only guaranteed to be truthy/falsy. When x is
    empty (or otherwise falsy) x itself is returned, not False.
    """
    # `x and ...` guards the x[0] lookup against empty input.
    return x and x[0] in "%#"


def is_blank_or_comment(x):
    """Checks if x is blank or a FASTA comment line.

    Returns True when x is empty, starts with '#', or is whitespace only.
    """
    return (not x) or x.startswith("#") or x.isspace()


def is_blank(x):
    """Checks if x is blank.

    Returns True when x is empty or consists only of whitespace.
    """
    return (not x) or x.isspace()


# Record finder for FASTA input: is_fasta_label (defined elsewhere) is the
# record-start test, and lines accepted by is_blank_or_comment are passed
# as the ``ignore`` predicate.
FastaFinder = LabeledRecordFinder(is_fasta_label, ignore=is_blank_or_comment)


def MinimalFastaParser(
    infile, strict=True, label_to_name=str, finder=FastaFinder, label_characters=">"
):
    """Yields successive sequences from infile as (label, seq) tuples.

    If strict is True (default), raises RecordError when label or seq missing.
    """
    try:
        infile = open_(infile)
        close_at_end = True
    except (TypeError, AttributeError):
def setUp(self):
    """Store a FASTA-like LabeledRecordFinder on ``self.FastaLike``."""

    def is_label(x):
        # A label is any line that begins with ">".
        return x.startswith(">")

    self.FastaLike = LabeledRecordFinder(is_label)
WARNING: Only maps the data type if the key is in label_constructors above.
    """
    if not line.startswith("#"):
        raise ValueError("Labels must start with a # symbol.")

    if line.find(":") == -1:
        raise ValueError("Labels must contain a : symbol.")

    # Drop the leading '#', split on the first ':' only, strip both halves.
    key, value = list(map(strip, line[1:].split(":", 1)))
    key = key.upper()
    # Convert the value only when a constructor is registered for this key.
    if key in label_constructors:
        value = label_constructors[key](value)
    return key, value


# Record finders for the supported report flavours. All four pass
# constructor=strip; they differ in the record-start test (first argument)
# and in the ``ignore`` predicate. The predicates are defined elsewhere.

# BLAT and BLAST share query_finder and differ only in the junk filter.
BlatFinder = LabeledRecordFinder(query_finder, constructor=strip, ignore=is_blat_junk)

BlastFinder = LabeledRecordFinder(query_finder, constructor=strip, ignore=is_blast_junk)

# NOTE(review): presumably one record per PSI-BLAST iteration -- confirm
# against iter_finder.
PsiBlastFinder = LabeledRecordFinder(
    iter_finder, constructor=strip, ignore=is_blast_junk
)

# NOTE(review): presumably one record per query's set of iterations --
# confirm against iteration_set_finder.
PsiBlastQueryFinder = LabeledRecordFinder(
    iteration_set_finder, constructor=strip, ignore=is_blast_junk
)


def GenericBlastParser9(lines, finder, make_col_headers=False):
    """Yields successive records from lines (props, data list)
__version__ = "2019.10.24a"
__maintainer__ = "Rob Knight"
__email__ = "*****@*****.**"
__status__ = "Development"

# Unbound str methods exposed under short module-level names.
maketrans, strip, rstrip = str.maketrans, str.strip, str.rstrip


def ll_start(line):
    """Return True when ``line`` opens with the '>>' LocusLink record marker."""
    record_marker = ">>"
    return line.startswith(record_marker)


# Record finder that uses ll_start as its record-start test.
LLFinder = LabeledRecordFinder(ll_start)

# Field splitters. The meaning of the optional second argument is defined by
# DelimitedSplitter (NOTE(review): presumably a split count, with None
# meaning "split everywhere" -- confirm).
pipes = DelimitedSplitter("|", None)
first_pipe = DelimitedSplitter("|")
commas = DelimitedSplitter(",", None)
first_colon = DelimitedSplitter(":", 1)

# Both wrappers below split their input with ``pipes``.
accession_wrapper = FieldWrapper(["Accession", "Gi", "Strain"], pipes)


def _read_accession(line):
    """Build a MappedRecord from an 'Accession | Gi | Strain' line."""
    fields = accession_wrapper(line)
    return MappedRecord(fields)


rell_wrapper = FieldWrapper(["Description", "Id", "IdType", "Printable"], pipes)