def __init__(self):
    #
    # Parser combinators
    #
    SPACES = spaces()
    optional_spaces = optional(SPACES)
    empty = SPACES.parsecmap(lambda x: EMPTY)
    comment = string('%%%') >> regex('.*')
    comment = comment.parsecmap(Comment)
    codepoint_hex = regex('[0-9A-F]+')
    codepoint_hex = codepoint_hex.parsecmap(lambda x: int(x, 16))
    codepoint = string('U+') >> codepoint_hex
    codepoint_seq = sepBy(codepoint, SPACES)
    codepoint_seq = codepoint_seq.parsecmap(tuple)
    arrow = string('=>')
    arrow = optional_spaces >> arrow << optional_spaces
    mapping = joint(
        codepoint_seq << arrow,
        codepoint_seq,
        optional(comment),
    )
    mapping = mapping.parsecmap(lambda x: Mapping(x[0], x[1], x[2]))
    line = try_choice(mapping, try_choice(
        comment,
        empty,
    ))
    self.parse = line.parse
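# Usage sketch (hypothetical: assume the __init__ above belongs to a class
# named MappingParser, with Mapping, Comment and EMPTY defined alongside it):
#
#   p = MappingParser()
#   p.parse('U+0041 U+0042 => U+00C6')
#   # -> Mapping((0x41, 0x42), (0xC6,), None)
#   p.parse('%%% a comment line')
#   # -> Comment(' a comment line')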
@generate  # yield-based parsers need parsec's @generate decorator
def enum():
    yield lexeme(string('enum'))
    name = yield identifier
    yield lexeme(string('{'))
    members = yield many(enum_value)
    yield lexeme(string('}'))
    return ProtobufEnum(name, members)
@generate
def message():
    yield lexeme(string('message'))
    name = yield identifier
    yield lexeme(string('{'))
    fields = yield many(field)
    yield lexeme(string('}'))
    return Message(name, fields)
@P.generate
def sdf_molecule():
    header = yield sdf_header
    atoms = yield P.times(sdf_atom, header.atom_num)
    bonds = yield P.times(sdf_bond, header.bond_num)
    yield (seperator >> P.string('M') >> seperator >> P.string('END')
           >> seperator >> P.string('> <value>'))
    value = yield seperator >> num_f
    yield seperator >> P.string('$$$$')
    return SdfMolecule(header, atoms, bonds, value)
@generate
def matrix_parser():
    cell = many1(digit()).parsecmap(''.join).parsecmap(int)
    height = yield cell
    yield string(",") << spaces()
    width = yield cell
    yield string('\n')
    row = separated(cell, string(",") << spaces(), mint=width, maxt=width)
    rows = separated(row, string('\n'), mint=height, maxt=height)
    # Run the rows parser; returning it unevaluated would make the
    # parse result a Parser object instead of the matrix.
    matrix = yield rows
    return matrix
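# Usage sketch: the "height, width" header fixes the expected shape, e.g.
#
#   matrix_parser.parse('2, 3\n1, 2, 3\n4, 5, 6')
#   # -> [[1, 2, 3], [4, 5, 6]]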
@P.generate
def sdf_header():
    mol_num = yield transparent >> num_i
    atom_num = yield P.string('\n\n\n') >> P.times(P.one_of(' 0123456789'), 3)
    atom_num = int(''.join(atom_num))
    bond_num = yield P.times(P.one_of(' 0123456789'), 3)
    bond_num = int(''.join(bond_num))
    yield P.times(seperator >> num_i, 5)
    yield seperator >> P.string("V2000")
    return SdfHeader(mol_num, atom_num, bond_num)
@generate
def pins():
    "Parse [Component].[Pin]."

    def filt(x):
        (_, (mod, _)) = x
        m = mod.upper()
        return m not in ("POWER", "GND", "NC")

    yield lexeme(string("signal_name")) << lexeme(string("model_name"))
    rlcs = yield optional(count(rlc, 3), [])
    prs = yield many1(pin(rlcs))
    prs_filt = list(filter(filt, prs))
    return dict(prs_filt)
def string_esc():
    return string('\\') >> (
        string('\\')
        | string('/')
        | string('b').result('\b')
        | string('f').result('\f')
        | string('n').result('\n')
        | string('r').result('\r')
        | string('t').result('\t')
        | regex(r'u[0-9a-fA-F]{4}').parsecmap(to_unichr)
        | string(end_quote)
    )
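# A minimal, self-contained sketch of how an escape parser like string_esc
# composes into a quoted-string parser (simplified to double quotes only;
# `string_esc` above relies on an `end_quote` closure from its enclosing
# parser, which this sketch does not reproduce):
from parsec import string, regex, many

esc = string('\\') >> (
    string('\\')
    | string('"')
    | string('n').result('\n')
    | string('t').result('\t')
)
plain = regex(r'[^"\\]+')  # any run of characters that is not a quote or escape
dq_string = string('"') >> many(esc | plain).parsecmap(''.join) << string('"')

assert dq_string.parse(r'"a\tb\nc"') == 'a\tb\nc'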
@generate
def ratio():
    [num, den] = yield (separated(number, string("/"), 2, maxt=2, end=False)
                        | na.result([0, 0]))
    if den:
        return num / den
    else:
        return None
@P.generate
def sdf_bond():
    fst = yield P.string('\n') >> P.times(P.one_of(' 0123456789'), 3)
    fst = int(''.join(fst))
    snd = yield P.times(P.one_of(' 0123456789'), 3)
    snd = int(''.join(snd))
    bond_type = yield seperator >> num_i
    yield P.times(seperator >> num_i, 3)
    return SdfBond(fst, snd, bond_type)
def abbrev(
    obj: Union[MeasurementUnit, MeasurementUnitQualifier],
) -> Parser:
    """
    Matches an abbreviation and returns the associated object.

    Humans cannot be relied upon to use spaces or thousand separators
    correctly, so these can be ignored.
    """
    return reduce(
        try_choice,
        [
            string(obj.abbreviation),
            string(obj.abbreviation.replace(" ", "")),
            string(obj.abbreviation.replace("1,000", "1000")),
            string(obj.abbreviation.replace(" ", "").replace("1,000", "1000")),
        ],
    ).result(obj)
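# A self-contained sketch of the same trick with a stand-in object (the real
# MeasurementUnit type lives elsewhere; `Unit` here is hypothetical):
from collections import namedtuple
from functools import reduce
from parsec import string, try_choice

Unit = namedtuple('Unit', 'abbreviation')
kg_1000 = Unit('1,000 kg')

# Accept the canonical abbreviation plus its space-less and
# separator-less spellings, all mapping back to the same object.
unit_parser = reduce(try_choice, [
    string(kg_1000.abbreviation),
    string(kg_1000.abbreviation.replace(" ", "")),
    string(kg_1000.abbreviation.replace("1,000", "1000")),
    string(kg_1000.abbreviation.replace(" ", "").replace("1,000", "1000")),
]).result(kg_1000)

assert unit_parser.parse('1000kg') is kg_1000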
@parsec.generate
def parse_untenbi():
    raw_pattern = yield (pattern_def ^ parsec.string(""))
    raw_rules = yield all_rules
    result = {}
    if raw_pattern:
        result["pattern"] = pattern_translate[raw_pattern]
    # Interpret each rule
    for raw_rule in raw_rules:
        if raw_rule["rule"] in {"start_date", "start_date_plus1"}:
            if raw_rule["rule"] == "start_date_plus1":
                result["start"] = day_plus_one(raw_rule["day"])
            else:
                result["start"] = raw_rule["day"]
            if "end" not in result:
                result["end"] = (12, 31)
            if "pattern" not in result:
                result["pattern"] = "毎日"
        elif raw_rule["rule"] in {"end_date", "end_date_minus1"}:
            if raw_rule["rule"] == "end_date_minus1":
                result["end"] = day_minus_one(raw_rule["day"])
            else:
                result["end"] = raw_rule["day"]
            if "start" not in result:
                result["start"] = (1, 1)
            if "pattern" not in result:
                result["pattern"] = "毎日"
        else:
            if "pattern" not in result:
                result["pattern"] = "全休" if raw_rule["rule"] == "added" else "毎日"
            if raw_rule["rule"] not in result:
                result[raw_rule["rule"]] = set()
            result[raw_rule["rule"]].update(raw_rule["days"])
    # Nullifying exceptions
    if "added" in result and "removed" in result:
        if result["pattern"] == "毎日":
            result["removed"].difference_update(result["added"])
            del result["added"]
        elif result["pattern"] == "全休":
            result["added"].difference_update(result["removed"])
            del result["removed"]
    return result
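# Worked example of the rule interpretation above (dates as (month, day)
# tuples): with no explicit pattern, a rule list such as
#
#   [{"rule": "start_date", "day": (4, 1)},
#    {"rule": "removed", "days": {(5, 3)}}]
#
# produces
#
#   {"start": (4, 1), "end": (12, 31), "pattern": "毎日", "removed": {(5, 3)}}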
@generate
def field():
    q = yield qualifier
    ft = yield field_type
    ident = yield identifier
    yield equals
    fi = yield field_id
    options = yield optional(field_options, default_value=[])
    yield lexeme(string(';'))
    return Field(q, ft, ident, fi, options)
def parser_by_count(value):
    try:
        num_cells = int(value)
        return separated(cell, string(",") << spaces(),
                         mint=num_cells, maxt=num_cells)
    except ValueError:
        # parsec calls the wrapped function as fn(text, index)
        return Parser(
            lambda text, index: Value.failure(index, "expected a number"))
@generate
def param():
    "Parse IBIS parameter."
    # Parameters must begin with a letter in column 1.
    pname = yield regex(r"^[a-zA-Z]\w*", re.MULTILINE)
    if DBG:
        print(pname)
    res = yield (regex(r"\s*") >> (
        (word(string("=")) >> number)
        | typminmax
        | name
        | rest_line))
    yield ignore  # So that ``param`` functions as a lexeme.
    return (pname.lower(), res)
@parsec.generate
def single_day_rule():
    date = yield single_day_def
    rule_type = yield (parsec.string("から運転")
                       ^ parsec.string("からは運転")
                       ^ parsec.string("からは運休")
                       ^ parsec.string("まで運転")
                       ^ parsec.string("までは運転")
                       ^ parsec.string("まで運休")
                       ^ parsec.string("までは運休"))
    if date["day"][0] is None:
        raise ValueError("month definition is required in から運転・まで運転 rules")
    return {"day": date["day"], "rule": rule_type_translations[rule_type]}
def fix_image_url(url, repo_name):
    '''Fix a GitHub image URL.

    Any link with `github.com` is invalid, because it returns *HTML*
    content; image links go through `raw.githubusercontent.com`. For example:

    - This returns HTML:
      https://github.com/Retrothopter/Niobium-Nanotech/blob/master/Preview.png
    - This returns a png:
      https://raw.githubusercontent.com/Retrothopter/Niobium-Nanotech/master/Preview.png

    Any relative links are also invalid. For example:

    - preview.png
    - sprites/preview.png
    - /sprites/preview.png'''
    # FIXME: this assumes `master` is always the branch we want, while in
    # reality we need the `default_branch` of the repository, which could
    # also for example be `main`
    from urllib.parse import urlparse
    from parsec import optional, string, none_of, many, ParseError

    glob = (
        optional(string('/'))
        >> string(repo_name)
        >> string("/blob/master/")
        >> many(none_of("?")).parsecmap(lambda x: "".join(x)))

    o = urlparse(url)
    if o.netloc == "raw.githubusercontent.com":
        return url

    try:
        path = glob.parse(o.path)
    except ParseError:
        path = None

    if o.netloc == "github.com" and path:
        return f"https://raw.githubusercontent.com/{repo_name}/master/{path}"
    if o.netloc == "":
        return f"https://raw.githubusercontent.com/{repo_name}/master/{o.path}"
    return url
def fix_image_url(url, repo_name):
    '''Fix a GitHub URL that should point to an image.

    Any link with `github.com` is invalid, because it's an HTML link, while
    image links go through `githubusercontent.com`, for example:

    - https://github.com/Retrothopter/Niobium-Nanotech/blob/master/Preview.png

    Any link that doesn't have a domain is relative and as such invalid,
    for example:

    - preview.png
    - sprites/preview.png
    - /sprites/preview.png

    This is also why a repo name is required.
    '''
    from urllib.parse import urlparse
    from parsec import optional, string, none_of, many, ParseError

    glob = (
        optional(string('/'))
        >> string(repo_name)
        >> string("/blob/master/")
        >> many(none_of("?")).parsecmap(lambda x: "".join(x)))

    o = urlparse(url)
    if o.netloc == "raw.githubusercontent.com":
        return url

    try:
        path = glob.parse(o.path)
    except ParseError:
        path = None

    if o.netloc == "github.com" and path:
        return f"https://raw.githubusercontent.com/{repo_name}/master/{path}"
    if o.netloc == "":
        return f"https://raw.githubusercontent.com/{repo_name}/master/{o.path}"
    # print('[warning] non github url:', url)
    return url
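# Usage sketch (hypothetical values):
#
#   fix_image_url('preview.png', 'Retrothopter/Niobium-Nanotech')
#   # -> 'https://raw.githubusercontent.com/Retrothopter/Niobium-Nanotech/master/preview.png'
#
#   fix_image_url('https://github.com/Retrothopter/Niobium-Nanotech/blob/master/Preview.png',
#                 'Retrothopter/Niobium-Nanotech')
#   # -> 'https://raw.githubusercontent.com/Retrothopter/Niobium-Nanotech/master/Preview.png'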
@generate
def fn():
    "Parse IBIS keyword."
    yield regex(r"^\[", re.MULTILINE)
    # ``name`` gobbles up trailing space, which we don't want.
    wordlets = yield sepBy1(name_only, one_of(" _"))
    yield string("]")
    yield ignore  # So that ``keyword`` functions as a lexeme.
    res = "_".join(wordlets)  # Canonicalize to: "<wordlet1>_<wordlet2>_...".
    if kywrd:
        # assert res.lower() == kywrd.lower(), f"Expecting: {kywrd}; got: {res}."  # Does not work!
        if res.lower() == kywrd.lower():
            return res
        else:
            return fail.desc(f"Expecting: {kywrd}; got: {res}.")
    return res
def test_memberMappings(self):
    from parsec import sepBy
    from parsec import string
    from proguard_mapping_parser.parser import memberMapping

    members = sepBy(memberMapping, string('\n'))
    self.assertEqual(
        [
            ((None, ('java.util.HashMap', None), 'mHashMap', None), 'a'),
            (((35, 37), ('void', None), '<init>', []), '<init>'),
            (((66, 66), ('boolean', None), 'contains', [
                ('java.lang.Object', None),
            ]), 'a'),
        ],
        members.parse(
            ' java.util.HashMap mHashMap -> a\n'
            ' 35:37:void <init>() -> <init>\n'
            ' 66:66:boolean contains(java.lang.Object) -> a\n'))
@parsec.generate
def range_def():
    start_month = yield month_def
    start_day = yield day_def
    yield parsec.string("~")
    end_month = yield month_def
    end_day = yield day_def
    start_month = int(start_month) if start_month else None
    start_day = int(start_day)
    end_month = int(end_month) if end_month else None
    end_day = int(end_day)
    return {
        "type": "range",
        "start": (start_month, start_day),
        "end": (end_month, end_day),
    }
def _parse_fasta(self, filehandle, sep="|"):
    """
    Parse a fasta file. The header is split into fields on 'sep'. The
    sequence is added as a final field.
    """
    p_header = parsec.string(">") >> parsec.regex("[^\n\r]*") << parsec.spaces()
    p_seq = (
        parsec.sepBy1(
            parsec.regex("[^>\n\r]*"), sep=parsec.regex("[\r\n\t ]+")
        ).parsecmap(concat)
        << parsec.spaces()
    )
    p_entry = p_header + p_seq
    p_fasta = parsec.many1(p_entry)
    log(f"Reading {file_str(filehandle)} as a fasta file:")
    try:
        entries = p_fasta.parse(filehandle.read())
    except AttributeError:
        # in case I want to pass in a list of strings, e.g., in tests
        entries = p_fasta.parse(filehandle)
    row = [h.split(sep) + [q] for (h, q) in entries]
    return row
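# Usage sketch (hypothetical input; `concat`, `log`, and `file_str` come from
# the surrounding module):
#
#   rows = self._parse_fasta(io.StringIO(">sp|P12345|desc\nMKV\nLT\n"))
#   # -> [['sp', 'P12345', 'desc', 'MKVLT']]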
    pip install relaxedjson

To install as an egg-link in development mode::

    python setup.py develop -N
"""
import re

from parsec import (sepBy, regex, string, generate, many, endBy)

whitespace = regex(r'\s*', re.MULTILINE)
lexeme = lambda p: p << whitespace

comment = string('/*') >> regex(r'(?:[^*]|\*(?!\/))+', re.MULTILINE) << string('*/')
comment = lexeme(comment)

lbrace = lexeme(string('{'))
rbrace = lexeme(string('}'))
lbrack = lexeme(string('['))
rbrack = lexeme(string(']'))
colon = lexeme(string(':'))
comma = lexeme(string(','))
true = lexeme(string('true')).result(True)
false = lexeme(string('false')).result(False)
null = lexeme(string('null')).result(None)
quote = string('"') | string("'")
# The targets of our parsing.
# We look for instructions to insert text, chapters,
# paragraphs and line breaks.
Text = collections.namedtuple('Text', ['text'])
NewChapter = collections.namedtuple('NewChapter', ['text'])
NewParagraph = collections.namedtuple('NewParagraph', [])
Break = collections.namedtuple('Break', [])

# Parse and ignore trailing white space after our reserved words.
whitespace = parsec.regex(r'\s*', re.MULTILINE)
skip_whitespace = lambda p: p << whitespace  # noqa

# Break and new paragraph commands are simply reserved words.
break_command = skip_whitespace(parsec.string('#break')).result(Break())
par_command = skip_whitespace(parsec.string('#par')).result(NewParagraph())

# The newch_parser is used to implement the newch_command parser below.
newch_parser = skip_whitespace(parsec.string('#newch'))

# The text parser consumes all input between reserved words.
commands = '#newch|#par|#break'
text_parser = parsec.regex('(?!(%s))(.+?)(?=%s|$)' % (commands, commands))


@parsec.Parser
def text_command(text, index):
    """Parse a text command returning the text to be inserted."""
    res = text_parser(text, index)
    if not res.status:
        return res
class Schematic(namedtuple("Schematic", "name pos config rotation")):
    pass


class Schematics(namedtuple("Schematics", "width height tags tiles")):
    pass


HEADER = b"msch"
VERSION = b"\x00"

########################################
## Reader
########################################

header = string(HEADER)
version = string(VERSION)
everything = regex(b"(?s).*")  # don't forget newlines
byte = regex(b"(?s).")
char = byte.parsecmap(lambda x: unpack("b", x)[0])
short = regex(b"(?s).{2}").parsecmap(lambda x: unpack(">h", x)[0])
intp = regex(b"(?s).{4}").parsecmap(lambda x: unpack(">i", x)[0])
nbytes = lambda x: times(byte, x).parsecmap(lambda x: b"".join(x))


@generate
def utf8_bytes():
    """ Parses utf8 string, prefixed with length. """
    length = yield short
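# A minimal, self-contained check of the fixed-width binary readers above,
# following the same bytes-regex pattern (requires a parsec version that
# accepts bytes patterns, as the snippet itself assumes):
from struct import unpack
from parsec import regex

short_demo = regex(b"(?s).{2}").parsecmap(lambda x: unpack(">h", x)[0])
assert short_demo.parse(b"\x00\x2a") == 42  # big-endian signed 16-bit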
            return '[%s%s]' % (self.element_type, star_str)
        if self.is_dlist:
            return '[[%s%s]]' % (self.element_type, star_str)
        if self.is_set:
            return '{%s%s}' % (self.element_type, star_str)
        if self.is_dict:
            return '{%s: %s%s}' % (self.element_type[0],
                                   self.element_type[1], star_str)
        raise RuntimeError('Invalid codegen kind: %s' % self.kind)


name_pattern = parsec.spaces() >> parsec.regex(
    r'[_a-zA-Z][_a-zA-Z0-9<>, ]*(::[_a-zA-Z][_a-zA-Z0-9<>, ]*)*'
) << parsec.spaces()

star_pattern = parsec.spaces() >> parsec.optional(parsec.string('*'), '') << parsec.spaces()

parse_meta = parsec.spaces().parsecmap(lambda _: CodeGenKind('meta'))
parse_plain = (parsec.spaces() >> (name_pattern + star_pattern) << parsec.spaces()
               ).parsecmap(lambda value: CodeGenKind('plain', value))
parse_list = (parsec.string('[') >> (name_pattern + star_pattern) << parsec.string(']')
              ).parsecmap(lambda value: CodeGenKind('list', value))
parse_dlist = (parsec.string('[[') >> (name_pattern + star_pattern) << parsec.string(']]')
               ).parsecmap(lambda value: CodeGenKind('dlist', value))
parse_set = (parsec.string('{') >> (name_pattern + star_pattern) << parsec.string('}')
import re

from parsec import (string, sepBy, regex, sepEndBy1, spaces, Parser,
                    separated, Value, generate, many1, digit)

quoted_string = regex(r'"[^"]*"', re.MULTILINE)
cell = quoted_string ^ regex(r'[^,"\r\n]*')
end_line = regex(r'\r\n?', re.MULTILINE)
row = sepBy(cell, string(",") << spaces())
header = row
csv = (header << end_line) + sepEndBy1(row, end_line)


def parser_by_count(value):
    try:
        num_cells = int(value)
        return separated(cell, string(",") << spaces(),
                         mint=num_cells, maxt=num_cells)
    except ValueError:
        # parsec calls the wrapped function as fn(text, index)
        return Parser(
            lambda text, index: Value.failure(index, "expected a number"))


first_cell = (cell << string(",") << spaces())
counting_parser = first_cell.bind(parser_by_count)


@generate
def matrix_parser():
    cell = many1(digit()).parsecmap(''.join).parsecmap(int)
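# Usage sketch: `counting_parser` reads the first cell as a count and then
# expects exactly that many further cells on the line, e.g.:
#
#   counting_parser.parse('3, a, b, c')   # -> ['a', 'b', 'c']
#   counting_parser.parse('x, a, b, c')   # fails with "expected a number"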
@parsec.generate
def parser():
    head = yield header
    yield parsec.many(parsec.string('\n'))
    samps = yield parsec.many(sample)
    return head, samps
@generate
def quoted():
    end_quote = yield quote
    body = yield many(charseq(end_quote))
    yield string(end_quote)
    # Return the value directly; raising a StopIteration subclass (the old
    # StopGenerator idiom) is a RuntimeError inside generators on Python 3.7+.
    return ''.join(body)
@parsec.generate
def sample():
    fwhm = yield spaces >> floating << spaces
    level = yield floating << spaces
    yield parsec.optional(parsec.string('\n'))
    return (float(fwhm), float(level))
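# Usage sketch (hypothetical data; `header`, `spaces`, and `floating` are
# assumed to be defined earlier in this module):
#
#   head, samps = parser.parse(open('samples.txt').read())  # 'samples.txt' is a placeholder
#   # samps -> [(1.5, 0.5), (2.0, 0.25), ...]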