def __init__(self):
    """Assemble the line parser and expose its ``parse`` method as ``self.parse``.

    A line is tried, in order, as a mapping (``U+XXXX ... => U+YYYY ...`` with
    an optional trailing comment), as a ``%%%`` comment, and finally as
    whitespace.
    """
    #
    # Parser combinators
    #
    blanks = spaces()
    maybe_blanks = optional(blanks)

    # Whitespace input is reported as the EMPTY marker.
    blank_line = blanks.parsecmap(lambda _: EMPTY)

    # '%%%' introduces a comment; the text after the marker is wrapped in Comment.
    remark = (string('%%%') >> regex('.*')).parsecmap(Comment)

    # 'U+' followed by upper-case hex digits, converted to an int.
    hex_value = regex('[0-9A-F]+').parsecmap(lambda digits: int(digits, 16))
    code_point = string('U+') >> hex_value
    code_points = sepBy(code_point, blanks).parsecmap(tuple)

    # '=>' with optional whitespace on either side.
    rule_arrow = maybe_blanks >> string('=>') << maybe_blanks

    rule = joint(
        code_points << rule_arrow,
        code_points,
        optional(remark),
    ).parsecmap(lambda parts: Mapping(parts[0], parts[1], parts[2]))

    self.parse = try_choice(rule, try_choice(remark, blank_line)).parse
def parse_ECOSTRESS(filename):
    """Load a spectrum from an ECOSTRESS file.

    The file is read as ISO-8859-1 text and parsed as a run of ``key: value``
    header lines followed by whitespace-separated pairs of numbers.

    Args:
        filename: Path of the file to read.

    Returns:
        A ``Spectrum`` built from the ``Name``, ``Type``, ``Class``,
        ``First X Value``, ``Last X Value`` and ``Number of X Values`` header
        fields, the remaining header fields, and the two sample columns.

    Raises:
        KeyError: If one of the header fields listed above is missing.
        ValueError: If ``First X Value`` or ``Last X Value`` is not a float.
        Exception: Any error raised while opening or parsing the file is
            re-raised after the file name has been printed.
    """
    spaces = parsec.regex(r'\s*', re.MULTILINE)

    @parsec.generate
    def header_field():
        # The key is everything up to the next ':'; the value is the rest of
        # the line with surrounding whitespace removed.
        k = yield spaces >> parsec.regex('[^:]*') << spaces
        yield parsec.string(':')
        v = yield parsec.ends_with(parsec.regex('[^\n]*'), parsec.string('\n'))
        return {k: v.strip()}

    @parsec.generate
    def header():
        items = yield parsec.many(header_field)
        d = {}
        for item in items:
            d.update(item)
        return d

    # Raw string: '\.' in a normal string literal is an invalid escape sequence.
    floating = parsec.regex(r'[-+]?([0-9]+(\.[0-9]+)?|\.[0-9]+)')

    @parsec.generate
    def sample():
        fwhm = yield spaces >> floating << spaces
        level = yield floating << spaces
        yield parsec.optional(parsec.string('\n'))
        return (float(fwhm), float(level))

    @parsec.generate
    def parser():
        head = yield header
        yield parsec.many(parsec.string('\n'))
        samps = yield parsec.many(sample)
        return head, samps

    def parse_spectrum(filename):
        try:
            with open(filename, 'r', encoding='iso-8859-1') as f:
                return parser.parse(f.read())
        except Exception:
            # Report which file failed, then let the caller see the error.
            print('Error parsing ' + filename)
            raise

    header, data = parse_spectrum(filename)

    name = header.pop('Name')
    type_ = header.pop('Type')
    class_ = header.pop('Class')
    x0 = float(header.pop('First X Value'))
    x1 = float(header.pop('Last X Value'))
    x_min, x_max = min(x0, x1), max(x0, x1)
    # NOTE(review): passed on as the raw header string, not converted to int.
    n_samples = header.pop('Number of X Values')

    x = [pair[0] for pair in data]
    y = [pair[1] for pair in data]

    return Spectrum(name, type_, class_, x_min, x_max, n_samples, header, x, y)
def param():
    """Parse IBIS parameter.

    Returns a ``(name, value)`` tuple in which the name is lower-cased.
    """
    # Parameters must begin with a letter in column 1.
    label = yield regex(r"^[a-zA-Z]\w*", re.MULTILINE)
    if DBG:
        print(label)

    # The value is the first alternative that matches, after optional whitespace.
    value_parser = (word(string("=")) >> number) | typminmax | name | rest_line
    value = yield regex(r"\s*") >> value_parser

    yield ignore  # So that ``param`` functions as a lexeme.
    return (label.lower(), value)
def n_ANY(n):
    """Return a parser for a run of exactly ``n`` characters.

    Useful for ignoring ``n`` characters with the compose ``>>`` or skip ``<<``
    operations, or for capturing fixed-length fields.

    NOTE(review): the pattern is built with ``.`` and no flags are passed here,
    so under default ``re`` behaviour a newline is not matched.
    """
    pattern = f'(.){{{n}}}'
    return p.regex(pattern)
def string_esc():
    """Return a parser for one backslash escape sequence.

    After the backslash the alternatives are tried in this order: a literal
    backslash, ``/``, the letters ``b f n r t`` (each mapped to its control
    character), ``uXXXX`` (converted with ``to_unichr``) and ``end_quote``.
    """
    escaped = string('\\') | string('/')
    control_codes = (
        ('b', '\b'),
        ('f', '\f'),
        ('n', '\n'),
        ('r', '\r'),
        ('t', '\t'),
    )
    for code, char in control_codes:
        escaped = escaped | string(code).result(char)
    escaped = escaped | regex(r'u[0-9a-fA-F]{4}').parsecmap(to_unichr)
    escaped = escaped | string(end_quote)
    return string('\\') >> escaped
def _parse_fasta(self, filehandle, sep="|"):
    """Parse a fasta file.

    The header is split into fields on ``sep``. The sequence is added as a
    final field.

    Args:
        filehandle: An object with a ``read()`` method; anything without one is
            handed to the parser as-is (e.g. in tests).
        sep: Separator used to split each header line into fields.

    Returns:
        One list per entry: the header fields followed by the sequence.
    """
    p_header = parsec.string(">") >> parsec.regex("[^\n\r]*") << parsec.spaces()
    p_seq = (
        parsec.sepBy1(
            parsec.regex("[^>\n\r]*"), sep=parsec.regex("[\r\n\t ]+")
        ).parsecmap(concat)
        << parsec.spaces()
    )
    p_entry = p_header + p_seq
    p_fasta = parsec.many1(p_entry)

    log(f"Reading {file_str(filehandle)} as a fasta file:")

    # Only the ``read()`` call is guarded: an AttributeError raised while
    # parsing must surface instead of triggering a second parse attempt.
    try:
        text = filehandle.read()
    except AttributeError:
        # in case I want to pass in a list of strings, e.g., in tests
        text = filehandle

    entries = p_fasta.parse(text)
    return [h.split(sep) + [q] for (h, q) in entries]
def fn():
    """Parse IBIS keyword.

    Returns the bracketed keyword with its wordlets joined by underscores. If
    the enclosing ``kywrd`` is set and does not match (case-insensitively), a
    described ``fail`` parser is returned instead.
    """
    yield regex(r"^\[", re.MULTILINE)
    # ``name`` gobbles up trailing space, which we don't want.
    parts = yield sepBy1(name_only, one_of(" _"))
    yield string("]")
    yield ignore  # So that ``keyword`` functions as a lexeme.

    # Canonicalize to: "<wordlet1>_<wordlet2>_...".
    canonical = "_".join(parts)

    # An ``assert`` on the comparison below was tried and does not work here,
    # hence the explicit check.
    if kywrd and canonical.lower() != kywrd.lower():
        return fail.desc(f"Expecting: {kywrd}; got: {canonical}.")
    return canonical
def number():
    """Parse an IBIS numerical value.

    A trailing non-digit run is treated as a suffix: its first character is
    looked up in ``IBIS_num_suf`` and the replacement text is appended to the
    numeric part before conversion to ``float``.

    Raises:
        ParseError: If the suffix character is not in ``IBIS_num_suf``.
    """
    text = yield word(
        regex(
            r"[-+]?[0-9]*\.?[0-9]+(([eE][-+]?[0-9]+)|([TknGmpMuf][a-zA-Z]*))?")
        << many(letter()))

    suffix = re.search(r'[^\d]+$', text)
    if not suffix:
        return float(text)

    pos = suffix.start()
    marker = text[pos]
    if marker not in IBIS_num_suf:
        raise ParseError("IBIS numerical suffix", text[pos:], pos)
    return float(text[:pos] + IBIS_num_suf[marker])
'''
"" => ""
"1" => "One"
"234" => "Two,Three,Four"
"10,000" => "One,Zero,Zero,Zero,Zero"
" 4619 " => "Four,Six,One,Nine"
'''
examples = ['', '1', '234', '10,000', ' 4619 ']
expected = [
    '',
    'One',
    'Two,Three,Four',
    'One,Zero,Zero,Zero,Zero',
    'Four,Six,One,Nine',
]

print(parsec.string("1").parse("111"))

# Whitespace and commas between digits carry no meaning and are skipped.
whitespace = parsec.regex(r'\s+')
commas = parsec.regex(r',+')
ignore = parsec.many((whitespace | commas))


def lexeme(p):
    """Run ``p`` and then discard any ignorable characters that follow."""
    return p << ignore


def _digit_word(digit, word):
    """Build a lexeme parser that matches ``digit`` and yields ``word``."""
    return lexeme(parsec.string(digit)).result(word)


zero = _digit_word('0', 'Zero')
one = _digit_word('1', 'One')
two = _digit_word('2', 'Two')
three = _digit_word('3', 'Three')
four = _digit_word('4', 'Four')
five = _digit_word('5', 'Five')
six = _digit_word('6', 'Six')
seven = _digit_word('7', 'Seven')
eight = _digit_word('8', 'Eight')
nine = _digit_word('9', 'Nine')
import parsec as ps import weakref from wurlitzer import sys_pipes __author__ = "Craig Ramsay" __copyright__ = "Copyright 2019, Xilinx" __email__ = "*****@*****.**" # The HWH file includes a lot of information about all of the available code # parameters. This includes nested lists, etc. so we use [parser # combinators](https://en.wikipedia.org/wiki/Parsec_(parser)) to keep this # managable. _whitespace = ps.regex(r'\s*') _lexeme = lambda p: p << _whitespace _lbrace = _lexeme(ps.string('{')) _rbrace = _lexeme(ps.string('}')) _separator = _lexeme(ps.regex(r'[ ,]')) _name = _lexeme(ps.regex(r'[\w]+')) _num_hex = _lexeme(ps.regex(r'0x[0-9a-fA-F]+')).parsecmap(lambda h: int(h, base=16)) _num_int = _lexeme(ps.regex(r'-?(0|[1-9][0-9]*)([.][0-9]+)?([eE][+-]?[0-9]+)?')).parsecmap(int) _num_float = _lexeme(ps.regex(r'-?(0|[1-9][0-9]*)([.][0-9]+)?([eE][+-]?[0-9]+)?')).parsecmap(float) _list_of = lambda elems: _lbrace >> ps.many(elems) << _rbrace _sep_list_of = lambda elems: _lbrace >> ps.sepBy(elems, _separator) << _rbrace _param_value = _num_int | _list_of(_num_int) @ps.generate
def int_number():
    """Return a parser for an optionally negated integer, converted to ``int``.

    The pattern accepts either a lone ``0`` or digits starting with 1-9.
    """
    integer_text = psc.regex('-?(0|[1-9][0-9]*)')
    return integer_text.parsecmap(int)
def number():
    """Return a lexeme parser for a decimal number, converted to ``float``.

    The pattern allows an optional sign, an integer part without leading
    zeros, an optional fraction and an optional exponent.
    """
    pattern = r'-?(0|[1-9][0-9]*)([.][0-9]+)?([eE][+-]?[0-9]+)?'
    numeric_text = lexeme(regex(pattern))
    return numeric_text.parsecmap(float)
def n_I(n):
    """Return a parser for an integer between one and ``n`` digits long.

    The matched digits are converted with ``int``.
    """
    pattern = f'([0-9]){{1,{n}}}'
    return p.regex(pattern).parsecmap(int)
if self.is_plain: return '%s%s' % (self.element_type, star_str) if self.is_list: return '[%s%s]' % (self.element_type, star_str) if self.is_dlist: return '[[%s%s]]' % (self.element_type, star_str) if self.is_set: return '{%s%s}' % (self.element_type, star_str) if self.is_dict: return '{%s: %s%s}' % (self.element_type[0], self.element_type[1], star_str) raise RuntimeError('Invalid codegen kind: %s' % self.kind) name_pattern = parsec.spaces() >> parsec.regex( r'[_a-zA-Z][_a-zA-Z0-9<>, ]*(::[_a-zA-Z][_a-zA-Z0-9<>, ]*)*' ) << parsec.spaces() star_pattern = parsec.spaces() >> parsec.optional(parsec.string('*'), '') << parsec.spaces() parse_meta = parsec.spaces().parsecmap(lambda _: CodeGenKind('meta')) parse_plain = (parsec.spaces() >> (name_pattern + star_pattern) << parsec.spaces() ).parsecmap(lambda value: CodeGenKind('plain', value)) parse_list = (parsec.string('[') >> (name_pattern + star_pattern) << parsec.string(']') ).parsecmap(lambda value: CodeGenKind('list', value)) parse_dlist = (parsec.string('[[') >> (name_pattern + star_pattern) << parsec.string(']]')
import collections import re import parsec # The targets of our parsing. # We look for instructions to insert text, chapters, # paragraphs and line breaks. Text = collections.namedtuple('Text', ['text']) NewChapter = collections.namedtuple('NewChapter', ['text']) NewParagraph = collections.namedtuple('NewParagraph', []) Break = collections.namedtuple('Break', []) # Parse and ignore trailing white space after our reserved words. whitespace = parsec.regex(r'\s*', re.MULTILINE) skip_whitespace = lambda p: p << whitespace # noqa # Break and new paragraph commands are simply reserved words. break_command = skip_whitespace(parsec.string('#break')).result(Break()) par_command = skip_whitespace(parsec.string('#par')).result(NewParagraph()) # The newch_parser is used to implement the newch_command parser below. newch_parser = skip_whitespace(parsec.string('#newch')) # The text parser consumes all input between reserved words. commands = '#newch|#par|#break' text_parser = parsec.regex('(?!(%s))(.+?)(?=%s|$)' % (commands, commands)) @parsec.Parser def text_command(text, index):
def header_field():
    """Parse one ``key: value`` header line into a single-entry dict.

    The key is the text before the first ``:``; the value is the rest of the
    line with surrounding whitespace stripped.
    """
    key_parser = spaces >> parsec.regex('[^:]*') << spaces
    rest_of_line = parsec.ends_with(parsec.regex('[^\n]*'), parsec.string('\n'))

    key = yield key_parser
    yield parsec.string(':')
    raw_value = yield rest_of_line
    return {key: raw_value.strip()}
def number():
    """Return a lexeme parser for a decimal number (matched text, not a float).

    Grammar: optional ``-``, an integer part without leading zeros, and an
    optional fraction made of one ``.`` followed by digits.

    The fraction used to be ``[.0-9]+``, which also accepted text such as
    ``1.2.3`` or ``007``; the former cannot be converted with ``float``.
    """
    return lexeme(parsec.regex(r'-?(0|[1-9][0-9]*)([.][0-9]+)?'))
import parsec

# "1 - (2 * (3 + 4) / 5)" => -1.8
example = "1 - (2 * (3 + 4) / 5)"
expected = -1.8

whitespace = parsec.regex(r'\s*')


def lexeme(p):
    """Run ``p`` and then skip any whitespace that follows it."""
    return p << whitespace


plus = lexeme(parsec.string('+'))
minus = lexeme(parsec.string('-'))
divide = lexeme(parsec.string('/'))
multiply = lexeme(parsec.string('*'))
lbrace = lexeme(parsec.string('('))
rbrace = lexeme(parsec.string(')'))


def number():
    """Return a lexeme parser for a decimal number (matched text, not a float).

    The fraction is one ``.`` followed by digits. It used to be ``[.0-9]+``,
    which also accepted text such as ``1.2.3`` that ``float`` cannot convert.
    """
    return lexeme(parsec.regex(r'-?(0|[1-9][0-9]*)([.][0-9]+)?'))


@parsec.generate
def addition():
    """Parse ``value + value`` and return the sum of both operands as a float."""
    n1 = yield value
    yield plus
    n2 = yield value
    return float(n1) + float(n2)
import parsec as p
import re


def clean_host(x):
    """Normalise a host description.

    The input is stripped, lower-cased and cut at the first ``;``. Text that
    mentions ``scrofa``, ``pig``, ``porcine`` or ``boar`` becomes ``"swine"``;
    otherwise text that mentions ``sapiens`` becomes ``"human"``; anything
    else is returned in its cleaned form.
    """
    host = re.sub(";.*", "", x.strip().lower())
    swine_markers = ("scrofa", "pig", "porcine", "boar")
    if any(marker in host for marker in swine_markers):
        return "swine"
    if "sapiens" in host:
        return "human"
    return host


# Matches either normalised host label, ignoring case.
p_host = p.regex(re.compile("swine|human", re.IGNORECASE))
def parse_program():
    """Parse a program as a list of ``(mnemonic, argument)`` pairs.

    Each instruction is ``nop``, ``acc`` or ``jmp``, one whitespace character,
    and a signed integer; a newline after an instruction is optional.
    """
    integer_arg = p.regex(r"[+-][0-9]+").parsecmap(int)

    def instruction(mnemonic):
        # mnemonic, a single whitespace character, then the signed argument
        return (p.string(mnemonic) << p.space()) + integer_arg

    expression = instruction("nop") | instruction("acc") | instruction("jmp")
    program = yield p.many(expression << p.optional(p.string("\n")))
    return program
import re from parsec import string, sepBy, regex, sepEndBy1, spaces, Parser, separated, Value, generate, many1, digit quoted_string = regex(r'"[^"]*"', re.MULTILINE) cell = quoted_string ^ regex(r'[^,"\r\n]*') end_line = regex(r'\r\n?', re.MULTILINE) row = sepBy(cell, string(",") << spaces()) header = row csv = (header << end_line) + sepEndBy1(row, end_line) def parser_by_count(value): try: num_cells = int(value) return separated(cell, string(",") << spaces(), mint=num_cells, maxt=num_cells) except ValueError: return Parser( lambda index, text: Value.failure(index, "expected a number")) first_cell = (cell << string(",") << spaces()) counting_parser = first_cell.bind(parser_by_count) # @generate def matrix_parser(): cell = many1(digit()).parsecmap(''.join).parsecmap(int)
def day_minus_one(day): m, d = day d -= 1 if d <= 0: m -= 1 if m <= 0: m = 12 return (m, max_days_in_month[m]) else: return (m, d) # === DAYS AND RANGES === # number = parsec.regex(r"[0-90-9]{1,2}") month_def = (number << parsec.string("月")) ^ parsec.string("") day_def = number << parsec.regex(r"日?") @parsec.generate def range_def(): start_month = yield month_def start_day = yield day_def yield parsec.string("~") end_month = yield month_def end_day = yield day_def start_month = int(start_month) if start_month else None
class Schematics(namedtuple("Schematics", "width height tags tiles")):
    """Named tuple with the fields ``width``, ``height``, ``tags`` and ``tiles``."""


HEADER = b"msch"
VERSION = b"\x00"

########################################
## Reader
########################################


def _unpacked(pattern, fmt):
    """Parser matching the bytes ``pattern`` and decoding them with ``unpack(fmt)``."""
    return regex(pattern).parsecmap(lambda raw: unpack(fmt, raw)[0])


header = string(HEADER)
version = string(VERSION)
everything = regex(b"(?s).*")  # don't forget newlines
byte = regex(b"(?s).")
char = byte.parsecmap(lambda raw: unpack("b", raw)[0])
short = _unpacked(b"(?s).{2}", ">h")
intp = _unpacked(b"(?s).{4}", ">i")


def nbytes(x):
    """Parser for exactly ``x`` bytes, returned as one ``bytes`` object."""
    return times(byte, x).parsecmap(lambda chunks: b"".join(chunks))


@generate
def utf8_bytes():
    """
    Parses utf8 string, prefixed with length.
    """
    size = yield short
    payload = yield nbytes(size)
    return payload.decode("utf8")
res[pname] = self.get(pname + "_")[pname + "_"] except: # If we get an exception, we have an ordinary (i.e. - not mapped) trait. res[pname] = self.get(pname)[pname] elif isinstance(param, dict): # We received a dictionary of subparameters, in 'param'. subs = {} for sname in param.keys(): subs.update(self.input_ami_param(param, sname)) res[pname] = subs return res ##### # AMI file parser. ##### # ignore cases. whitespace = regex(r"\s+", re.MULTILINE) comment = regex(r"\|.*") ignore = many((whitespace | comment)) def lexeme(p): """Lexer for words.""" return p << ignore # skip all ignored characters. def int2tap(x): """Convert integer to tap position.""" if (x[0] == '-'): res = ("pre" + x[1:]) else: res = ("post" + x) return res
Uses the parser-combinator library Parsec (https://github.com/sighingnow/parsec.py) To install from pypi:: pip install relaxedjson To install as an egg-link in development mode:: python setup.py develop -N """ import re from parsec import (sepBy, regex, string, generate, many, endBy) whitespace = regex(r'\s*', re.MULTILINE) lexeme = lambda p: p << whitespace comment = string('/*') >> regex(r'(?:[^*]|\*(?!\/))+', re.MULTILINE) << string('*/') comment = lexeme(comment) lbrace = lexeme(string('{')) rbrace = lexeme(string('}')) lbrack = lexeme(string('[')) rbrack = lexeme(string(']')) colon = lexeme(string(':')) comma = lexeme(string(',')) true = lexeme(string('true')).result(True) false = lexeme(string('false')).result(False)
def __init__(
    self,
    duty_expressions: Iterable[DutyExpression],
    monetary_units: Iterable[MonetaryUnit],
    permitted_measurements: Iterable[Measurement],
    component_output: Type[TrackedModel] = MeasureComponent,
):
    """Build the parsers for duty sentences.

    Stores three parsers on the instance: ``_monetary_unit``, ``_measurement``
    and ``_sentence``.

    Args:
        duty_expressions: Expressions a sentence may contain; they are tried in
            order of increasing ``sid``.
        monetary_units: Monetary units that may follow an amount.
        permitted_measurements: Measurements that may appear in a component.
        component_output: Callable invoked with ``duty_expression``,
            ``duty_amount``, ``monetary_unit`` and ``component_measurement``
            for every parsed component.
    """
    # The arguments are only promised to be iterables, but they are
    # truth-tested and walked more than once below. Materialise each one
    # exactly once so that one-shot iterators behave like lists.
    duty_expressions = list(duty_expressions or ())
    monetary_units = list(monetary_units or ())
    permitted_measurements = list(permitted_measurements or ())

    # Decimal numbers are a sequence of digits (without a leading zero)
    # followed optionally by a decimal point and a number of digits (we have seen
    # some percentage values have three decimal digits). Money values are similar
    # but only 2 digits are allowed after the decimal.
    # TODO: work out if float will cause representation problems.
    decimal = regex(r"(0|[1-9][0-9]*)([.][0-9]+)?").parsecmap(float)

    # Specific duty amounts reference various types of unit.
    # For monetary units, the expression just contains the same code as is
    # present in the sentence. Percentage values correspond to no unit.
    self._monetary_unit = (
        reduce(try_choice, map(code, monetary_units)) if monetary_units else fail
    )
    percentage_unit = token("%").result(None)

    # We have to try and parse measurements with qualifiers first
    # else we may match the first part of a unit without the qualifier
    with_qualifier = [
        m for m in permitted_measurements
        if m.measurement_unit_qualifier is not None
    ]
    no_qualifier = [
        m for m in permitted_measurements
        if m.measurement_unit_qualifier is None
    ]
    measurements = [
        measurement(m) for m in chain(with_qualifier, no_qualifier)
    ]
    self._measurement = reduce(try_choice, measurements) if measurements else fail

    # Each measure component can have an amount, monetary unit and measurement.
    # Which expression elements are allowed in a component is controlled by
    # the duty expression applicability codes. We convert the duty expressions
    # into parsers that will only parse the elements that are permitted for this type.
    def component(duty_exp: DutyExpression) -> Parser:
        """Matches a string prefix and returns the associated type id, along
        with any parsed amounts and units according to their applicability, as
        a 4-tuple of (id, amount, monetary unit, measurement)."""
        prefix = duty_exp.prefix
        has_amount = duty_exp.duty_amount_applicability_code
        has_measurement = duty_exp.measurement_unit_applicability_code
        has_monetary = duty_exp.monetary_unit_applicability_code

        expression_id = token(prefix).result(duty_exp)
        this_value = if_applicable(has_amount, decimal)
        this_monetary_unit = if_applicable(
            has_monetary,
            spaces() >> self._monetary_unit,
            # We must match the percentage if the amount should be there
            # and no monetary unit matches.
            default=(
                percentage_unit
                if has_amount == ApplicabilityCode.MANDATORY
                else optional(percentage_unit)
            ),
        )
        this_measurement = if_applicable(
            has_measurement,
            optional(token("/")) >> self._measurement,
        )

        full_component = joint(
            expression_id, this_value, this_monetary_unit, this_measurement
        )
        measurement_only = joint(expression_id, this_measurement).parsecmap(
            lambda t: (t[0], None, None, t[1]),
        )

        # It's possible for units that contain numbers (e.g. DTN => '100 kg')
        # to be confused with a simple specific duty (e.g 100.0 + kg)
        # So in the case that amounts are only optional and measurements are present,
        # we have to check for just measurements first.
        parser = (
            measurement_only ^ full_component
            if has_amount == ApplicabilityCode.PERMITTED
            and has_measurement != ApplicabilityCode.NOT_PERMITTED
            else full_component
        )
        return parser.parsecmap(
            lambda exp: component_output(
                duty_expression=exp[0],
                duty_amount=exp[1],
                monetary_unit=exp[2],
                component_measurement=exp[3],
            ),
        )

    # Duty sentences can only be of a finite length – each expression may only
    # appear once and in order of increasing expression id. So we try all expressions
    # in order and filter out the None results for ones that did not match.
    expressions = (
        [
            component(exp) ^ empty
            for exp in sorted(duty_expressions, key=lambda e: e.sid)
        ]
        if duty_expressions
        else [fail]
    )
    self._sentence = joint(*expressions).parsecmap(
        lambda sentence: [exp for exp in sentence if exp is not None],
    )
def string_part():
    """Return a parser for a run of characters with no ``end_quote`` and no backslash."""
    pattern = rf'[^{end_quote}\\]+'
    return regex(pattern)
def is_any(parsers):
    """Combine ``parsers`` into one choice using ``|``, in the given order.

    Returns ``None`` when ``parsers`` is empty.
    """
    if not parsers:
        return None  # ?
    remaining = iter(parsers)
    combined = next(remaining)
    for alternative in remaining:
        combined |= alternative
    return combined


def is_a(enum_cls):
    """Parser that matches any member name of ``enum_cls``.

    The matched text is passed to ``enum_cls`` to produce the result.
    """
    member_parsers = [string(member) for member in enum_cls.__members__]
    return is_any(member_parsers).parsecmap(enum_cls)


identifier = lexeme(regex(r'[^\d\W]\w*'))

number = lexeme(
    regex('-?(0|[1-9][0-9]*)([.][0-9]+)?([eE][+-]?[0-9]+)?'))


# eugh
@lexeme
@generate
def quoted_string():
    """Parse a double-quoted string and return the text between the quotes."""
    yield string('"')
    body = yield many(none_of('"'))
    yield string('"')
    return ''.join(body)


literal = number | quoted_string
value = identifier | literal
def object_pair():
    """Parse one ``key: value`` pair and deliver it as a ``(key, value)`` tuple.

    The key is either a quoted string or a bare word; comments are allowed on
    both sides of the colon.
    """
    bare_key = lexeme(regex(r'[a-zA-Z][-_a-zA-Z0-9]*'))
    pair_key = yield quoted | bare_key
    yield many(comment) << colon << many(comment)
    pair_value = yield value
    # The result is handed back by raising, as in the rest of this generator style.
    raise StopGenerator((pair_key, pair_value))
def number() -> int:
    """Parse number.

    Returns a parser that matches one or more decimal digits and converts the
    matched text with ``int``.

    The pattern requires at least one digit: with ``[0-9]*`` the parser also
    matched the empty string, and ``int('')`` then raised ``ValueError``.
    """
    # NOTE(review): the ``-> int`` annotation describes the parsed value; what
    # this function returns is the parser object — confirm and adjust.
    return regex(r"[0-9]+").parsecmap(int)