def text_to_pages(txt: str) -> str:
    """Convert the raw text of a docket to an xml string.

    The nodes of the xml are the pages and sections of the docket, i.e.::

        <docket>
            <page>
                <section> </section>
            </page>
            <page>
                <section_continued> </section_continued>
            </page>
        </docket>

    Args:
        txt: The raw text of a docket.

    Returns:
        The xml string produced by visiting the parse tree, or the empty
        document ``"<docket></docket>"`` if parsing or visiting raises.
    """
    grammar = Grammar(docket_sections)
    try:
        nodes = grammar.parse(txt)
        visitor = CustomVisitorFactory(
            common_terminals,
            docket_sections_nonterminals,
            docket_sections_custom_nodevisitors,
        ).create_instance()
        return visitor.visit(nodes)
    except Exception:
        # Best-effort conversion: callers get an empty docket instead of an
        # exception. logging.exception keeps the traceback in the log so the
        # failure can still be diagnosed.
        logging.exception("text_to_pages failed.")
        return "<docket></docket>"
def __init__(self, drawer):
    """Set up parser state, the PEG grammar and the command dispatch tables.

    Args:
        drawer: Object providing ``pen_down``, ``pen_up``, ``select_pen``,
            ``go_along``, ``go_down`` and ``draw_line``.
    """
    super().__init__(drawer)
    self.drawer = drawer
    self.source = []
    self.command = ''
    self.data = 0

    # One source line: a directive letter, an optional numeric parameter and
    # an optional trailing '#' comment.
    line_grammar = r'''
        line = statement ws? comment? ws
        statement = directive ws? parameter?
        directive = ~"P|X|Y|D|W|N|E|S|U"
        parameter = ~"-?\d{0,}\.{0,1}\d{0,}"
        comment = ~"#.*"
        ws = ~"\s*"
    '''
    self.peg_grammar = Grammar(line_grammar)
    self.peg_visitor = TigrVisitor()

    # Directives that take no argument.
    self.no_parameter_commands = dict(
        D=self.drawer.pen_down,
        U=self.drawer.pen_up,
    )
    # Directives that take a single argument.
    self.one_parameter_commands = dict(
        P=self.drawer.select_pen,
        # 'G': self.drawer.goto,
        X=self.drawer.go_along,
        Y=self.drawer.go_down,
    )
    # Every compass directive draws a line; only the heading differs.
    self.draw_commands = {
        heading: self.drawer.draw_line for heading in ('N', 'E', 'S', 'W')
    }
    self.draw_degrees = {'N': 90, 'E': 0, 'S': 270, 'W': 180}
def text_to_pages(txt: str) -> Tuple[str, List[str]]:
    """Convert the raw text of a docket to an xml string.

    The nodes of the xml are the pages and sections of the docket, i.e.

    .. code-block::

        <docket>
            <page>
                <section> </section>
            </page>
            <page>
                <section_continued> </section_continued>
            </page>
        </docket>

    Args:
        txt: The raw text of a docket.

    Returns:
        A tuple ``(xml, errors)``. ``xml`` is the string produced by visiting
        the parse tree, or ``"<docket></docket>"`` if parsing or visiting
        raises. ``errors`` is a list of human-readable messages, empty on
        success.
    """
    errors = []
    grammar = Grammar(docket_sections)
    try:
        nodes = grammar.parse(txt)
        visitor = CustomVisitorFactory(
            common_terminals,
            docket_sections_nonterminals,
            docket_sections_custom_nodevisitors,
        ).create_instance()
        return visitor.visit(nodes), errors
    except Exception:
        # Best-effort conversion: report the failure through ``errors`` and
        # keep the traceback in the log rather than propagating.
        logger.exception("text_to_pages failed.")
        errors.append("Could not extract pages from docket text.")
        return "<docket></docket>", errors
def test_custom_visitor_factory():
    """A factory-built visitor renders a small parse tree as the expected xml."""
    sample = """Hi there, partner"""
    rules = r"""
    text = greeting punctuation identifier
    greeting = hi_there?
    punctuation = comma?
    identifier = partner?
    hi_there = "Hi there"
    comma = ", "
    partner = "partner"
    """
    parser = Grammar(rules)
    terminal_names = ["hi_there", "comma", "partner"]
    nonterminal_names = ["text", "greeting", "punctuation", "identifier"]
    factory = CustomVisitorFactory(terminal_names, nonterminal_names, dict())
    visitor = factory.create_instance()
    tree = parser.parse(sample)
    rendered = visitor.visit(tree)
    assert rendered == "<text> <greeting> Hi there </greeting><punctuation> , </punctuation><identifier> partner </identifier> </text>"
def parse(section_text):
    """Parse a section's text with the first grammar and return the visitor's output.

    A newline is appended to ``section_text`` before parsing.
    """
    terminated_text = section_text + "\n"
    section_grammar = Grammar(grammars[0])
    info_visitor = DefendantInfoVisitor()
    tree = section_grammar.parse(terminated_text)
    return info_visitor.visit(tree)
def test2():
    """Parse a double-quoted string containing escaped quotes and print the tree."""
    rules = r"""
    text = text_quoted / text_simple
    text_quoted = ~r'"([^"\\]|\\.)*"'
    text_simple = ~'[a-zA-Z0-9.*_-]+'
    """
    sample = '"z,@! \\" ok \\" "'
    tree = Grammar(rules).parse(sample)
    print(tree)
def parse_pdf(pdf: Union[BinaryIO, str], tempdir=None) -> Tuple[Person, Case, list]:
    """Parse a pdf of a criminal record docket.

    The 'see' references are to the DocketParse library, which also parses
    pdf dockets.

    Args:
        pdf: a binary reader or a string path to a pdf file.
        tempdir: The pdf must be written to txt with pdftotext, so we need a
            temporary directory for it.

    Returns:
        A tuple ``(defendant, case, errors)``: the Person to whom the docket
        relates, the Case to which the docket relates, and a list of
        messages for sections whose text failed to parse.
    """
    # a list of strings
    errors = []
    # pdf to raw text
    txt = get_text_from_pdf(pdf, tempdir=tempdir)
    # text to xml sections (see DocketParse.sectionize). This handles page breaks.
    pages_tree = etree.fromstring(text_to_pages(txt))
    sections_tree = sections_from_pages(pages_tree)
    # parse individual sections with grammars for those sections
    for section_name, grammar, terminals, nonterminals, custom_visitors in section_grammars:
        try:
            matches = sections_tree.xpath(f"//section[@name='{section_name}']")
            if not matches:
                # not all dockets have all sections, so not being able to find
                # a section is not necessarily an error.
                logging.info(f" Could not find section {section_name}")
                continue
            section = matches[0]
            # remove blank lines at the ends of the section.
            section_text = "\n".join(
                ln for ln in section.text.split("\n") if ln.strip())
            section_grammar = Grammar(grammar)
            try:
                nodes = section_grammar.parse(section_text)
            except Exception:
                errors.append(f" Text for {section_name} failed to parse.")
                logging.error(f" Text for {section_name} failed to parse.")
                continue
            visitor = CustomVisitorFactory(
                terminals, nonterminals, custom_visitors).create_instance()
            parsed_section_xml = etree.fromstring(visitor.visit(nodes))
            # replace original unparsed section's text w/ the parsed xml.
            section.text = ""
            section.append(parsed_section_xml)
        except Exception:
            # Best-effort: a failure in one section must not stop the others.
            # Log with traceback so it is not mistaken for a missing section.
            logging.exception(f" Could not process section {section_name}")
    # extract Person and Case information from xml.
    # i.e. defendant_name = section_tree.xpath("//caption/name")[0].text
    defendant = get_person(sections_tree)
    case = get_case(sections_tree)
    return defendant, case, errors
def test_parameters_ok(value):
    """``value`` must parse under the combined parameter grammar fragments."""
    fragments = (
        grammar.params,
        grammar.type_,
        grammar.symbols,
        grammar.ident,
        grammar.ws,
    )
    parser = Grammar("".join(fragments))
    assert parser.parse(value) is not None
def test_annotationlist_ok(value):
    """``value`` must parse with ``annotationlist`` as the start rule."""
    fragments = (
        "start = annotationlist\n",
        grammar.annotation,
        grammar.symbols,
        grammar.ident,
        grammar.ws,
    )
    parser = Grammar("".join(fragments))
    assert parser.parse(value) is not None
def test_enum_ok(value):
    """``value`` must parse with ``enum`` as the start rule."""
    fragments = (
        "start=enum\n",
        grammar.enum,
        grammar.ws,
        grammar.symbols,
        grammar.ident,
    )
    parser = Grammar("".join(fragments))
    assert parser.parse(value) is not None
def parse(section_text):
    """Parse ``section_text`` with the first grammar and render it via a factory-built visitor."""
    section_grammar = Grammar(grammars[0])
    visitor = CustomVisitorFactory(terminals, nonterminals, dict()).create_instance()
    tree = section_grammar.parse(section_text)
    # Uncomment to inspect the parse tree:
    # print(tree.prettily())
    return visitor.visit(tree)
def test_param_list_ok(value):
    """``value`` must parse with ``param_list`` as the start rule."""
    fragments = (
        "start = param_list\n",
        grammar.params,
        grammar.type_,
        grammar.symbols,
        grammar.ident,
        grammar.ws,
    )
    parser = Grammar("".join(fragments))
    assert parser.parse(value) is not None
def __init__(self, drawer):
    """Initialise the base parser, then build the PEG grammar and its visitor."""
    super().__init__(drawer)
    # One source line: a directive letter, an optional numeric parameter and
    # an optional trailing '#' comment.
    line_grammar = r'''
        line = statement ws? comment? ws
        statement = directive ws? parameter?
        directive = ~"P|X|Y|D|W|N|E|S|U"
        parameter = ~"-?\d{0,}\.{0,1}\d{0,}"
        comment = ~"#.*"
        ws = ~"\s*"
    '''
    self.peg_grammar = Grammar(line_grammar)
    self.peg_visitor = self.TigrVisitor()
def test_use_regex_library():
    """With ``use_regex_library=True`` a ``\\p{L}`` pattern matches a Cyrillic word."""
    parser = Grammar(
        r'''
        unicode_word = ~"[\p{L}]*"
        ''',
        use_regex_library=True,
    )
    text = 'Тест'
    pattern = Regex(pattern=r'[\p{L}]*', use_regex_library=True)
    expected = RegexNode(expr=pattern, full_text=text, start=0, end=4)
    eq_(parser.parse(text=text), expected)
def test13(grammar: Grammar):
    """Parse several filter expressions with ``grammar`` and print each tree.

    The second sample is parsed and printed once on its own before the full
    list is processed.
    """
    samples = [
        ' { ($.user.id = 1) && ($.users[0].email = "*****@*****.**") } ',
        '{($.user.id = 2 && $.users[0].email = "nonmatch") || $.actions[2] = "GET"}',
        '{ $.user.email = "*****@*****.**" || $.coordinates[0][1] = nonmatch && $.actions[2] = nomatch }',
        '{ ($.user.email = "*****@*****.**" || $.coordinates[0][1] = nonmatch) && $.actions[2] = nomatch }'
    ]
    print(grammar.parse(samples[1]))
    for sample in samples:
        print(grammar.parse(sample))
def parse(section_text):
    """Clean the headers out of ``section_text``, parse it and return the visitor's output."""
    cleaned = clean_headers(section_text)
    section_grammar = Grammar(grammars[0])
    disposition_visitor = DispositionVisitor()
    tree = section_grammar.parse(cleaned)
    return disposition_visitor.visit(tree)
def test1():
    """Parse two ``$``-rooted selectors with indexed items and print each tree."""
    selector_grammar = Grammar("""
    selector = root item*
    item = dot identifier ("[" index "]")*
    index = ~'([1-9][0-9]+)|[0-9]'
    identifier = ~'[a-zA-Z0-9_-]+'
    root = "$"
    dot = "."
    """)
    for sample in ('$.a.b[1][2].c[3]', '$.a.b[10][2]'):
        print(selector_grammar.parse(sample))
def test_class_ok(value):
    """``value`` must parse under the combined class grammar fragments."""
    fragments = (
        grammar.class_,
        grammar.annotation,
        grammar.function,
        grammar.params,
        grammar.type_,
        grammar.qualifier,
        grammar.ws,
        grammar.symbols,
        grammar.ident,
    )
    parser = Grammar("".join(fragments))
    assert parser.parse(value) is not None
def test_func_body_ok(value):
    """``value`` must parse with ``func_body`` as the start rule."""
    fragments = (
        "start=func_body\n",
        grammar.annotation,
        grammar.function,
        grammar.params,
        grammar.type_,
        grammar.qualifier,
        grammar.ws,
        grammar.symbols,
        grammar.ident,
    )
    parser = Grammar("".join(fragments))
    assert parser.parse(value) is not None
def grammar12():
    """Build the filter-condition grammar (variant 12).

    This variant factors the boolean continuation into ``cond_tail``, uses
    indexed ``$``-rooted selectors and accepts quoted or simple text values.
    """
    rules = r"""
    top_cond = _* '{' cond '}' _*
    cond = (_* cond_simple_seq _*) / (_* cond_quoted_seq _*)
    cond_simple_seq = cond_simple cond_tail*
    cond_quoted_seq = cond_quoted cond_tail*
    cond_quoted = '(' cond ')'
    cond_tail = _ + op_boolean _+ cond
    cond_simple = cmp_common / cmp_is_true / cmp_is_false / cmp_is_null / cmp_not_exists
    cmp_common = selector _* op_common _* text
    cmp_is_true = selector _+ 'IS' _+ 'TRUE'
    cmp_is_false = selector _+ 'IS' _+ 'FALSE'
    cmp_is_null = selector _+ 'IS' _+ 'NULL'
    cmp_not_exists = selector _+ 'NOT' _+ 'EXISTS'
    op_common = '=' / '!=' / '<=' / '>=' / '<' / '>'
    op_boolean = '||' / '&&'
    selector = root path+
    path = dot child ('[' index ']')*
    index = ~'([1-9][0-9]+)|[0-9]'
    child = ~'[a-zA-Z0-9_-]+'
    root = "$"
    dot = "."
    text = text_quoted / text_simple
    text_quoted = ~r'"([^"\\]|\\.)*"'
    text_simple = ~'[a-zA-Z0-9.*_-]+'
    _ = ~'[ \t]'
    """
    return Grammar(rules)
def grammar10():
    """Build the filter-condition grammar (variant 10).

    This variant has a single ``cmp_is`` rule for TRUE/FALSE/NULL and uses
    indexed ``$``-rooted selectors.
    """
    rules = """
    top_cond = _* '{' cond '}' _*
    cond = (_* cond_simple_seq _*) / (_* cond_quoted_seq _*)
    cond_simple_seq = cond_simple (_+ op_boolean _+ cond)*
    cond_quoted_seq = '(' cond ')' (_+ op_boolean _+ cond)*
    cond_simple = cmp_common / cmp_is / cmp_not_exists
    cmp_common = selector _* op_common _* text
    cmp_is = selector _+ 'IS' _+ ('TRUE' / 'FALSE' / 'NULL')
    cmp_not_exists = selector _+ 'NOT' _+ 'EXISTS'
    op_common = '=' / '!=' / '<=' / '>=' / '<' / '>'
    op_boolean = '||' / '&&'
    selector = root path*
    path = dot child ('[' index ']')*
    index = ~'([1-9][0-9]+)|[0-9]'
    child = ~'[a-zA-Z0-9_-]+'
    text = ~'[a-zA-Z0-9._-]+'
    root = "$"
    dot = "."
    _ = ~'[ \t]'
    """
    return Grammar(rules)
def test_visitor():
    """Assert a tree gets visited correctly."""
    bold_grammar = Grammar(r'''
        bold_text = bold_open text bold_close
        text = ~'[a-zA-Z 0-9]*'
        bold_open = '(('
        bold_close = '))'
        ''')
    source = '((o hai))'
    # Hand-built tree that parsing ``source`` is expected to produce.
    children = [
        Node(bold_grammar['bold_open'], source, 0, 2),
        Node(bold_grammar['text'], source, 2, 7),
        Node(bold_grammar['bold_close'], source, 7, 9),
    ]
    expected_tree = Node(bold_grammar['bold_text'], source, 0, 9, children)
    eq_(bold_grammar.parse(source), expected_tree)
    formatted = HtmlFormatter().visit(expected_tree)
    eq_(formatted, '<b>o hai</b>')
def grammar6():
    """Build the filter-condition grammar (variant 6).

    Here ``_`` itself matches zero or more blanks, and every comparison
    operator has its own rule.
    """
    rules = """
    top_cond = _ '{' cond '}' _
    cond = (_ cond_simple_seq _) / (_ cond_quoted_seq _)
    cond_simple_seq = cond_simple _ (boolean_op _ cond)*
    cond_quoted_seq = '(' cond ')' _ (boolean_op _ cond)*
    cond_simple = cmp_basic / cmp_numeric / cmp_is / cmp_not_exists
    cmp_basic = (_ cmp_eq _) / (_ cmp_ne _)
    cmp_numeric = (_ cmp_le _) / (_ cmp_ge _) / (_ cmp_lt _) / (_ cmp_gt _)
    cmp_is = (_ cmp_is_true _) / (_ cmp_is_false _) / (_ cmp_is_null _)
    cmp_eq = selector _ '=' _ text
    cmp_ne = selector _ '!=' _ text
    cmp_le = selector _ '<=' _ text
    cmp_ge = selector _ '>=' _ text
    cmp_lt = selector _ '<' _ text
    cmp_gt = selector _ '>' _ text
    cmp_is_true = selector _ 'IS' _ 'TRUE'
    cmp_is_false = selector _ 'IS' _ 'FALSE'
    cmp_is_null = selector _ 'IS' _ 'NULL'
    cmp_not_exists = selector _ 'NOT' _ 'EXISTS'
    boolean_op = '||' / '&&'
    selector = ~'[a-zA-Z0-9._-]+'
    text = ~'[a-zA-Z0-9._-]+'
    _ = ~'[ \t]*'
    """
    return Grammar(rules)
def get_action_sequence_and_all_actions(self,
                                        query: List[str] = None,
                                        prelinked_entities: Dict[str, Dict[str, str]] = None) -> Tuple[List[str], List[str]]:  # pylint: disable=line-too-long
    """Build a query-specific grammar and derive the query's action sequence.

    Args:
        query: Tokens of the SQL query; they are joined with single spaces
            before parsing.
        prelinked_entities: Entities linked ahead of time. Passing a value
            when ``self.use_prelinked_entities`` is false is an error.

    Returns:
        ``(action_sequence, sorted_actions)``. ``action_sequence`` is ``[]``
        for an empty or missing query and ``None`` when parsing raises
        ``ParseError`` or ``RecursionError``. ``sorted_actions`` is every
        valid action of the grammar, sorted.

    Raises:
        ConfigurationError: if prelinked entities are passed but not enabled.
    """
    grammar_with_context = deepcopy(self.base_grammar_dictionary)

    if prelinked_entities is not None and not self.use_prelinked_entities:
        raise ConfigurationError("The Text2SqlNoGrammarWorld was specified to not use prelinked "
                                 "entities, but prelinked entities were passed.")
    prelinked_entities = prelinked_entities or {}

    update_grammar_numbers_and_strings_with_variables(grammar_with_context,
                                                      prelinked_entities,
                                                      self.columns)
    update_grammar_with_tokens(grammar_with_context, query)

    grammar = Grammar(format_grammar_string(grammar_with_context))
    valid_actions = initialize_valid_actions(grammar)
    # Flatten the per-nonterminal action lists into one de-duplicated, sorted list.
    sorted_actions = sorted(set().union(*valid_actions.values()))

    sql_visitor = SqlVisitor(grammar)
    if not query:
        return [], sorted_actions

    joined_query = " ".join(query)
    try:
        action_sequence = sql_visitor.parse(joined_query)
    except ParseError as error:
        print("\nParse Error - details:\n", error.pos, '\n', error.expr, '\n', error.text)
        action_sequence = None
    except RecursionError:
        print("\nParse recursion error - details:\n", " ".join(query), '\n', grammar_with_context['terminal'])
        action_sequence = None
    return action_sequence, sorted_actions
def test_variable_free_world_cannot_parse_as_statements(self):
    """A query with an ``AS`` alias is rejected; the same query without it parses."""
    world = Text2SqlWorld(self.schema)
    base_dictionary = world.base_grammar_dictionary
    # NOTE(review): each entry is a (key, value) tuple from ``.items()``, so
    # this membership test compares "AS" with the key and with the value
    # object as a whole, not with individual productions.
    for entry in base_dictionary.items():
        assert "AS" not in entry

    aliased_tokens = [
        'SELECT', 'COUNT', '(', '*', ')', 'FROM', 'LOCATION', 'AS',
        'LOCATIONalias0', ',', 'RESTAURANT', 'WHERE', 'LOCATION', '.',
        'CITY_NAME', '=', "'city_name0'", 'AND', 'RESTAURANT', '.', 'NAME',
        '=', 'LOCATION', '.', 'RESTAURANT_ID', 'AND', 'RESTAURANT', '.',
        'NAME', '=', "'name0'", ';'
    ]
    grammar = Grammar(format_grammar_string(world.base_grammar_dictionary))
    visitor = SqlVisitor(grammar)
    with self.assertRaises(ParseError):
        visitor.parse(" ".join(aliased_tokens))

    plain_tokens = [
        'SELECT', 'COUNT', '(', '*', ')', 'FROM', 'LOCATION', ',',
        'RESTAURANT', 'WHERE', 'LOCATION', '.', 'CITY_NAME', '=',
        "'city_name0'", 'AND', 'RESTAURANT', '.', 'NAME', '=', 'LOCATION',
        '.', 'RESTAURANT_ID', 'AND', 'RESTAURANT', '.', 'NAME', '=',
        "'name0'", ';'
    ]
    # Without the AS we should still be able to parse it.
    visitor = SqlVisitor(grammar)
    visitor.parse(" ".join(plain_tokens))
def get_action_sequence_and_all_actions(self,
                                        query: List[str] = None,
                                        prelinked_entities: Dict[str, Dict[str, str]] = None) -> Tuple[List[str], List[str]]:  # pylint: disable=line-too-long
    """Build a query-specific grammar and derive the query's action sequence.

    Args:
        query: Tokens of the SQL query; they are joined with single spaces
            before parsing.
        prelinked_entities: Entities linked ahead of time. Passing a value
            when ``self.use_prelinked_entities`` is false is an error.

    Returns:
        ``(action_sequence, sorted_actions)``. ``action_sequence`` is ``[]``
        for an empty or missing query. ``sorted_actions`` is every valid
        action of the grammar, sorted.

    Raises:
        ConfigurationError: if prelinked entities are passed but not enabled.
    """
    grammar_with_context = deepcopy(self.base_grammar_dictionary)

    if prelinked_entities is not None and not self.use_prelinked_entities:
        raise ConfigurationError(
            "The Text2SqlWorld was specified to not use prelinked "
            "entities, but prelinked entities were passed.")
    prelinked_entities = prelinked_entities or {}

    if self.use_untyped_entities:
        update_grammar_values_with_variables(grammar_with_context,
                                             prelinked_entities)
    else:
        update_grammar_numbers_and_strings_with_variables(
            grammar_with_context, prelinked_entities, self.columns)

    grammar = Grammar(format_grammar_string(grammar_with_context))
    valid_actions = initialize_valid_actions(grammar)
    # Flatten the per-nonterminal action lists into one de-duplicated, sorted list.
    sorted_actions = sorted(set().union(*valid_actions.values()))

    sql_visitor = SqlVisitor(grammar)
    if query:
        action_sequence = sql_visitor.parse(" ".join(query))
    else:
        action_sequence = []
    return action_sequence, sorted_actions
def grammar8():
    """Build the filter-condition grammar (variant 8).

    Here ``_`` is a single blank, repeated explicitly with ``*`` or ``+``,
    and every comparison operator has its own rule.
    """
    rules = """
    top_cond = _* '{' cond '}' _*
    cond = (_* cond_simple_seq _*) / (_* cond_quoted_seq _*)
    cond_simple_seq = cond_simple (_+ boolean_op _+ cond)*
    cond_quoted_seq = '(' cond ')' (_+ boolean_op _+ cond)*
    cond_simple = cmp_basic / cmp_numeric / cmp_is / cmp_not_exists
    cmp_basic = cmp_eq / cmp_ne
    cmp_numeric = cmp_le / cmp_ge / cmp_lt / cmp_gt
    cmp_is = cmp_is_true / cmp_is_false / cmp_is_null
    cmp_eq = selector _* '=' _* text
    cmp_ne = selector _* '!=' _* text
    cmp_le = selector _* '<=' _* text
    cmp_ge = selector _* '>=' _* text
    cmp_lt = selector _* '<' _* text
    cmp_gt = selector _* '>' _* text
    cmp_is_true = selector _+ 'IS' _+ 'TRUE'
    cmp_is_false = selector _+ 'IS' _+ 'FALSE'
    cmp_is_null = selector _+ 'IS' _+ 'NULL'
    cmp_not_exists = selector _+ 'NOT' _+ 'EXISTS'
    boolean_op = '||' / '&&'
    selector = ~'[a-zA-Z0-9._-]+'
    text = ~'[a-zA-Z0-9._-]+'
    _ = ~'[ \t]'
    """
    return Grammar(rules)
def _construct(names):
    """Build a Grammar from the named ``.grammar`` files.

    The files are read from ``_grammar_dir`` in reverse order of ``names``
    and their contents joined with newlines.
    """
    def _read_source(name):
        # Load the text of one '<name>.grammar' file.
        grammar_path = path.join(_grammar_dir, f'{name}.grammar')
        with open(grammar_path, 'r') as handle:
            return handle.read()

    sources = [_read_source(name) for name in reversed(names)]
    return Grammar('\n'.join(sources))
def get_action_sequence_and_all_actions(self,
                                        allow_aliases: bool = False
                                        ) -> Tuple[List[str], List[str]]:
    """Build the schema-specific grammar and derive the stored query's action sequence.

    As a side effect, ``self.valid_actions`` and ``self.valid_actions_flat``
    are set.

    Args:
        allow_aliases: When false, the grammar is first made free of table
            names via ``update_grammar_to_be_table_names_free``.

    Returns:
        ``(action_sequence, sorted_actions)``. ``action_sequence`` is
        ``None`` when ``self.query`` is ``None`` or parsing raises
        ``ParseError``, and ``[]`` when the normalised query is empty.
    """
    grammar_with_context = deepcopy(self.base_grammar_dictionary)
    if not allow_aliases:
        update_grammar_to_be_table_names_free(grammar_with_context)

    update_grammar_with_tables(grammar_with_context, self.db_context.schema)

    grammar = Grammar(format_grammar_string(grammar_with_context))
    valid_actions = initialize_valid_actions(grammar)
    # Flatten the per-nonterminal action lists into one de-duplicated, sorted list.
    sorted_actions = sorted(set().union(*valid_actions.values()))

    self.valid_actions = valid_actions
    self.valid_actions_flat = sorted_actions

    action_sequence = None
    if self.query is not None:
        sql_visitor = SqlVisitor(grammar)
        # Lower-case the query and normalise `` and '' to a single quote.
        normalised = " ".join(self.query).lower()
        normalised = normalised.replace("``", "'").replace("''", "'")
        try:
            action_sequence = sql_visitor.parse(normalised) if normalised else []
        except ParseError:
            # Unparseable queries leave action_sequence as None.
            pass
    return action_sequence, sorted_actions
def test():
    """Parse a nested boolean-comparison sample with an experimental grammar and print the tree.

    The first rule, ``pattern7``, is the grammar's default rule. The
    ``item``/``index``/``identifier``/``selector``/``root``/``dot`` rules are
    defined but not referenced from ``pattern7``.
    """
    grammar = Grammar(
        #pattern = "{" ws text ws "=" ws text ws "}"
        """
        #pattern6 = _ "{" pattern5 "}" _
        #pattern6 = "(" pattern4 (logical pattern4)*
        #pattern5 = pattern4 (logical pattern4)*
        #pattern6 = pattern5 / pattern4
        #pattern7 = _ pattern6 _

        # (a || (b || c))
        # ((a || b) || c)
        # ((a || b || c))
        pattern7 = "(" pattern7 (logical pattern6)* ")"
        pattern6 = pattern5 / pattern4
        # (a || b || c)
        pattern5 = "(" pattern4 ")"
        # a || b || c
        pattern4 = pattern3 (logical pattern3)*
        pattern3 = _ (pattern2 / pattern1) _
        pattern2 = "(" _ (pattern2/pattern1) _ ")"
        pattern1 = _ (compare_eq / compare_ne) _
        compare_eq = text _ "=" _ text
        compare_ne = text _ "!=" _ text
        logical = "&&" / "||"
        _ = ~"[ \t]"*
        text = ~"[a-zA-Z0-9_-]+"
        item = dot identifier ("[" index "]")*
        index = ~"([1-9][0-9]+)|[0-9]"
        identifier = ~"[a-zA-Z0-9_-]+"
        selector = root item*
        root = "$"
        dot = "."
        """)
    # The index rule previously read "[0-9]|[1-9][0+9]+": the class [0+9]
    # was a typo for [0-9], and putting the single-digit alternative first
    # made multi-digit indices match only their first digit.
    #data = ' { ( -a_bc= 123 ) || abc = 123 } '
    data = '(((-a_bc=123))||abc=12)'
    print(grammar.parse(data))
def __init__(self, tune_fn):
    """Read and parse a tune file.

    Args:
        tune_fn: Path of the tune file, read as UTF-8.

    The grammar is loaded from the file ``doremi-grammar`` in the current
    working directory. The resulting syntax tree is stored in
    ``self.syntax``.
    """
    NodeVisitor.__init__(self)

    # start with an empty tune, voice, note, and list of modifiers
    self.tune = Tune()
    self.voice = Voice()
    self.note = Note()
    self.note_modifiers = []

    # at the outset, we are not in a voice's content
    self.in_content = False

    # set up the actual parser; the context manager closes the grammar file
    with open("doremi-grammar", "r") as grammar_file:
        grammar = Grammar(grammar_file.read())

    # read and parse the tune, closing the file as soon as it has been read
    with codecs.open(tune_fn, "r", "utf-8") as tune_file:
        tune_text = tune_file.read()
    self.syntax = grammar.parse(tune_text)
def grammar3():
    """Build a minimal grammar for ``||``-joined atoms with optional parentheses."""
    rules = """
    expr = atom_ext / expr_ext
    atom_ext = atom (or expr)*
    expr_ext = '(' expr ')' (or expr)*
    atom = ~'[a-zA-Z0-9._-]+'
    or = '||'
    """
    return Grammar(rules)
def parse(path):
    """
    Parse a pdf docket into an xml document.

    This xml document will be of the form:
    <docket>
        <page>
            <caption> </caption>
            <body>
                <section name='a'> </section>
                <section name='b'> </section>
                ...
            </body>
            <footer> </footer>
        </page>
        <page> ... </page>
    </docket>
    ...

    This xml most closely resembles the original docket. (The caveat is that
    section names are removed from the text and turned into name attributes
    of the section xml elements). But some sections extend across pages, and
    this xml schema leaves these sections separated from each other.

    The elapsed time of each stage (pdf-to-text, grammar creation, parsing,
    visiting) is logged in microseconds.

    TODO: Turn this into a real .xsd schema definition.
    """
    def _elapsed_microseconds(since):
        # timedelta.microseconds is only the sub-second component, so it
        # wraps around every second; total_seconds() gives the full duration.
        return int((datetime.now() - since).total_seconds() * 1000000)

    print("Starting parse {}".format(path))

    start = datetime.now()
    docket_text = pdf_to_text(path)
    pdf2text_time = _elapsed_microseconds(start)

    start = datetime.now()
    grammar = Grammar(grammar_list[0])
    create_grammar_time = _elapsed_microseconds(start)

    visitor = DocketVisitor()
    start = datetime.now()
    root = grammar.parse(docket_text)
    parse_grammar_time = _elapsed_microseconds(start)

    start = datetime.now()
    results = visitor.visit(root)
    node_visitor_time = _elapsed_microseconds(start)

    logging.info("{}, {}, {}, {}".format(pdf2text_time, create_grammar_time, parse_grammar_time, node_visitor_time))
    return results
def jexl_grammar(jexl_config):
    """Build the expression grammar for the operators in ``jexl_config``.

    The binary and unary operator rules are filled in from
    ``jexl_config.binary_operators`` and ``jexl_config.unary_operators``
    via ``operator_pattern``.
    """
    binary_ops = operator_pattern(jexl_config.binary_operators.values())
    unary_ops = operator_pattern(jexl_config.unary_operators.values())

    # Doubled braces in the template are str.format escapes for literal
    # '{' and '}' in the object_literal rule.
    template = r"""
        expression = (
            _ (conditional_expression / binary_expression / unary_expression / complex_value) _
        )

        conditional_expression = (
            conditional_test _ "?" _ expression _ ":" _ expression
        )
        conditional_test = (binary_expression / unary_expression / complex_value)

        binary_expression = binary_operand (_ binary_operator _ binary_operand)+
        binary_operator = {binary_op_pattern}
        binary_operand = (unary_expression / complex_value)

        unary_expression = unary_operator _ unary_operand
        unary_operator = {unary_op_pattern}
        unary_operand = (unary_expression / complex_value)

        complex_value = value (transform / attribute / filter_expression)*

        transform = "|" identifier transform_arguments?
        transform_arguments = "(" _ value_list _ ")"

        attribute = "." identifier

        filter_expression = "[" _ expression _ "]"

        value = (
            boolean / string / numeric / subexpression / object_literal /
            array_literal / identifier / relative_identifier
        )

        subexpression = "(" _ expression _ ")"

        object_literal = "{{" _ object_key_value_list? _ "}}"
        object_key_value_list = object_key_value (_ "," _ object_key_value)*
        object_key_value = identifier _ ":" _ expression

        array_literal = "[" _ value_list? _ "]"
        value_list = expression (_ "," _ expression)*

        identifier = ~r"[a-zA-Z_\$][a-zA-Z0-9_\$]*"
        relative_identifier = "." identifier

        boolean = "true" / "false"
        string = ~"\"[^\"\\\\\\n\\r]*(?:\\\\.[^\"\\\\\\n\\r]*)*\""is / ~"'[^'\\\\\\n\\r]*(?:\\\\.[^'\\\\\\n\\r]*)*'"is
        numeric = "-"? number ("." number)?

        number = ~r"[0-9]+"

        _ = ~r"\s*"
    """
    rules = template.format(
        binary_op_pattern=binary_ops,
        unary_op_pattern=unary_ops,
    )
    return Grammar(rules)
def __init__(self, text):
    """Parse lyric ``text`` into a syntax tree.

    Args:
        text: The lyric source text to parse.

    The grammar is loaded from the file ``lyric-grammar`` in the current
    working directory. The tree is stored in ``self.syntax``.
    """
    NodeVisitor.__init__(self)

    # start with a new empty lyric
    self.lyric = Lyric()

    # add an empty voice to it
    self.lyric.voices.append(LyricVoice())

    # build an abstract syntax tree; the context manager closes the
    # grammar file once it has been read
    with open("lyric-grammar", "r") as grammar_file:
        self.grammar = Grammar(grammar_file.read())
    self.syntax = self.grammar.parse(text)
class LyricParser(NodeVisitor):
    """Parses .drmw lyric files for association with Doremi tunes"""

    def __init__(self, text):
        """Parse lyric ``text`` into a syntax tree.

        Args:
            text: The lyric source text to parse.

        The grammar is loaded from the file ``lyric-grammar`` in the current
        working directory. The tree is stored in ``self.syntax``.
        """
        NodeVisitor.__init__(self)

        # start with a new empty lyric
        self.lyric = Lyric()

        # add an empty voice to it
        self.lyric.voices.append(LyricVoice())

        # build an abstract syntax tree; the context manager closes the
        # grammar file once it has been read
        with open("lyric-grammar", "r") as grammar_file:
            self.grammar = Grammar(grammar_file.read())
        self.syntax = self.grammar.parse(text)

    def convert(self):
        """Convert the syntax tree to our internal representation"""
        self.visit(self.syntax)

        # remove any extra empty voices
        self.lyric.voices = [voice for voice in self.lyric.voices
                             if voice.name != ""]

        # remove any extra empty verses (visit_verse appends a fresh Verse
        # after each one it sees, so the last entry is dropped)
        for voice in self.lyric.voices:
            voice.verses = voice.verses[:-1]

        return self.lyric

    def visit_title(self, node, vc):
        """Record the lyric's title."""
        self.lyric.title = get_string_val(node)

    def visit_author(self, node, vc):
        """Record the lyric's author."""
        self.lyric.author = get_string_val(node)

    def visit_meter(self, node, vc):
        """Record the lyric's meter."""
        self.lyric.meter = get_string_val(node)

    def visit_voicespec(self, node, vc):
        """Start a new voice after a voice specification."""
        # the current voice is complete, so start a new one
        self.lyric.voices.append(LyricVoice())

    def visit_voice(self, node, vc):
        """Name the most recently added voice."""
        self.lyric.voices[-1].name = get_node_val(node, "name")

    def visit_verse(self, node, vc):
        """Start a new verse in the most recently added voice."""
        # the verse is complete, so start a new one
        self.lyric.voices[-1].verses.append(Verse())

    def visit_word(self, node, vc):
        """Append the stripped word text to the current verse."""
        self.lyric.voices[-1].verses[-1].words.append(node.text.strip())

    def generic_visit(self, node, vc):
        """Ignore nodes that have no dedicated visit method."""
        pass
def test_custom_visitor_factory():
    """A factory-built visitor renders a small parse tree as the expected xml."""
    sample = """Hi there, partner"""
    rules = r"""
    text = greeting punctuation identifier
    greeting = hi_there?
    punctuation = comma?
    identifier = partner?
    hi_there = "Hi there"
    comma = ", "
    partner = "partner"
    """
    parser = Grammar(rules)
    terminal_names = ["hi_there", "comma", "partner"]
    nonterminal_names = ["text", "greeting", "punctuation", "identifier"]
    factory = CustomVisitorFactory(terminal_names, nonterminal_names, dict())
    visitor = factory.create_instance()
    tree = parser.parse(sample)
    rendered = visitor.visit(tree)
    assert rendered == "<text> <greeting> Hi there </greeting><punctuation> , </punctuation><identifier> partner </identifier> </text>"
Defendant eligible for work release. Probation Max of 3.00 Years 12/20/2011 3 years All conditions previously imposed to remain. """, """ Manufacture or Deliver Shreeves-Johns, Karen 07/13/2011 Probation Max of 3.00 Years 07/13/2011 3 years Defendant is to pay imposed mandatory court costs. To submit to random drug screens. To pursue a prescribed secular course of study or vocational training. Case relisted for status of compliance on 9/22/11 courtroom 605. Shreeves-Johns, Karen 12/20/2011 Confinement Min of 11.00 Months 15.00 Days 12/20/2011 Max of 23.00 Months 11 1/2 - 23 months Defendant eligible for work release. Probation Max of 3.00 Years 12/20/2011 3 years All conditions previously imposed to remain. """, ] grammar = Grammar(grammars[0]) root = grammar.parse(texts[0]) print("parsed.") visitor = DetailsVisitor() print(visitor.visit(root))
def stringify_list(self, list):
    """Concatenate the strings in ``list`` into one string.

    Args:
        list: An iterable of strings.

    Returns:
        The elements joined with no separator; ``""`` for an empty iterable.
    """
    # str.join builds the result in one pass instead of repeated ``+=``.
    return "".join(list)


# End of Class

test_num = 0
# grammar = Grammar(grammars[test_num])
# root = grammar.parse(texts[0])
# print("Parsed okay.")
# visitor = CaseInfoVisitor()
# results = visitor.visit(root)
# print(results)
# for r in results:
#     print(r)
# print(root.prettily())

#with open("./sample_dockets/CP-51-CR-0000001-2011.txt") as f:
with open("./sample_dockets/CP-51-CR-0005727-2011.txt") as f:
    grammar = Grammar(grammars[test_num])
    root = grammar.parse(f.read())
    visitor = DocketVisitor_2()
    print("Parse succeeded.")
    # Both ``with`` blocks close their files on exit, so no explicit
    # close() calls are needed.
    with open("output2.txt", 'w+') as f2:
        f2.write(visitor.visit(root))