Example #1
def text_to_pages(txt: str) -> str:
    """ Convert raw text of a docket to an xml-string, where the nodes are the pages and sections of the docket.
    
    i.e. 
    <docket>
        <page>
            <section> 
            </section>
        </page>
        <page>
            <section_continued>
            </section_continued>
        </page>
    </docket>"""
    grammar = Grammar(docket_sections)
    try:
        nodes = grammar.parse(txt)
        visitor = CustomVisitorFactory(
            common_terminals, docket_sections_nonterminals,
            docket_sections_custom_nodevisitors).create_instance()
        return visitor.visit(nodes)
    except Exception as e:
        logging.error("text_to_pages failed: %s", e)
        return "<docket></docket>"
Example #2
    def __init__(self, drawer):
        super().__init__(drawer)
        self.drawer = drawer
        self.source = []
        self.command = ''
        self.data = 0

        self.peg_grammar = Grammar(r'''
            line = statement ws? comment? ws
            statement   = directive ws? parameter?
            directive   = ~"P|X|Y|D|W|N|E|S|U"
            parameter   = ~"-?\d{0,}\.{0,1}\d{0,}"
            comment     = ~"#.*"
            ws          = ~"\s*"
        ''')
        self.peg_visitor = TigrVisitor()

        self.no_parameter_commands = {
            'D': self.drawer.pen_down,
            'U': self.drawer.pen_up
        }

        self.one_parameter_commands = {
            'P': self.drawer.select_pen,
            # 'G': self.drawer.goto,
            'X': self.drawer.go_along,
            'Y': self.drawer.go_down,
        }
        self.draw_commands = {
            'N': self.drawer.draw_line,
            'E': self.drawer.draw_line,
            'S': self.drawer.draw_line,
            'W': self.drawer.draw_line,
        }
        self.draw_degrees = {'N': 90 * 1, 'E': 0, 'S': 90 * 3, 'W': 90 * 2}
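
As a quick illustration of what peg_grammar accepts, here is a standalone sanity check; the grammar text is copied verbatim from the constructor above and the input line is hypothetical.

from parsimonious.grammar import Grammar

tigr_line = Grammar(r'''
    line = statement ws? comment? ws
    statement   = directive ws? parameter?
    directive   = ~"P|X|Y|D|W|N|E|S|U"
    parameter   = ~"-?\d{0,}\.{0,1}\d{0,}"
    comment     = ~"#.*"
    ws          = ~"\s*"
''')
# "X 5" is the go_along directive with parameter 5; the trailing comment and
# newline are consumed by the comment and ws rules.
print(tigr_line.parse("X 5 # move along the x axis\n"))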
Example #3
def text_to_pages(txt: str) -> Tuple[str, List[str]]:
    """ Convert raw text of a docket to an xml-string, where the nodes are the pages and sections of the docket.
    
    i.e.

    .. code-block:: 

        <docket>
            <page>
                <section> 
                </section>
            </page>
            <page>
                <section_continued>
                </section_continued>
            </page>
        </docket>
    """
    errors = []
    grammar = Grammar(docket_sections)
    try:
        nodes = grammar.parse(txt)
        visitor = CustomVisitorFactory(
            common_terminals,
            docket_sections_nonterminals,
            docket_sections_custom_nodevisitors,
        ).create_instance()
        return visitor.visit(nodes), errors
    except Exception as e:
        logger.error("text_to_pages failed: %s", e)
        errors.append("Could not extract pages from docket text.")
        return "<docket></docket>", errors
def test_custom_visitor_factory():
    text = """Hi there, partner"""
    grammar = r"""
  text = greeting punctuation identifier
  greeting = hi_there?
  punctuation = comma?
  identifier = partner?

  hi_there = "Hi there"
  comma = ", "
  partner = "partner"
  """
    grammar = Grammar(grammar)
    terminals = ["hi_there", "comma", "partner"]
    nonterminals = ["text", "greeting", "punctuation", "identifier"]
    custom_visitor = CustomVisitorFactory(terminals, nonterminals,
                                          dict()).create_instance()
    #custom_visitor = custom_visitor.create_instance()
    root = grammar.parse(text)
    #   print("The parse tree:")
    #   print(root.prettily())
    xml = custom_visitor.visit(root)
    assert xml == "<text> <greeting> Hi there </greeting><punctuation> ,  </punctuation><identifier> partner </identifier> </text>"


#   print(xml)
#   print("Finished.")
def parse(section_text):
  section_text += "\n"
  grammar = Grammar(grammars[0])
  visitor = DefendantInfoVisitor()
  root = grammar.parse(section_text)
  section_xml = visitor.visit(root)
  return section_xml
Example #6
def test2():
    grammar = Grammar(r"""
        text = text_quoted / text_simple
        text_quoted = ~r'"([^"\\]|\\.)*"'
        text_simple = ~'[a-zA-Z0-9.*_-]+'
        """)
    data = '"z,@! \\" ok \\" "'
    print(grammar.parse(data))
Example #7
def parse_pdf(pdf: Union[BinaryIO, str], tempdir=None) -> Tuple[Person, Case, List[str]]:
    """
    Parse a pdf of a criminal record docket.

    The 'see' references are to the DocketParse library, which also parses pdf dockets.

    Args:
        pdf: a binary reader or a string path to a pdf file.
        tempdir: The pdf must be written to txt with pdftotext, so we need a temporary directory for it.

    Returns:
        The Person to whom the docket relates, the Case to which the docket relates, and a list of error messages.
    """
    # a list of strings
    errors = []
    # pdf to raw text
    txt = get_text_from_pdf(pdf, tempdir=tempdir)
    # text to xml sections (see DocketParse.sectionize). This handles page breaks.
    pages_tree = etree.fromstring(text_to_pages(txt))
    sections_tree = sections_from_pages(pages_tree)
    # parse individual sections with grammars for those sections
    # TODO add try catch blocks that allow for continuing even after certain parts fail, like
    #       if a single section fails to parse.
    for section_name, grammar, terminals, nonterminals, custom_visitors in section_grammars:
        try:
            section = sections_tree.xpath(
                f"//section[@name='{section_name}']")[0]
            # remove blank lines at the ends of the section.
            section_text = "\n".join(
                [ln for ln in section.text.split("\n") if ln.strip()])
            grammar = Grammar(grammar)
            try:
                nodes = grammar.parse(section_text)
            except Exception as e:
                errors.append(f"    Text for {section_name} failed to parse.")
                logging.error(f"    Text for {section_name} failed to parse: {e}")
                continue
            visitor = CustomVisitorFactory(terminals, nonterminals,
                                           custom_visitors).create_instance()
            parsed_section_text = visitor.visit(nodes)
            parsed_section_xml = etree.fromstring(parsed_section_text)
            # replace original unparsed section's text w/ the parsed xml.
            sections_tree.xpath(
                f"//section[@name='{section_name}']")[0].text = ""
            sections_tree.xpath(f"//section[@name='{section_name}']"
                                )[0].append(parsed_section_xml)
        except Exception as e:
            # not all dockets have all sections, so not being able to find a
            # section is not necessarily an error.
            logging.info(f"    Could not find section {section_name}: {e}")
    # extract Person and Case information from xml.
    # i.e. defendant_name = section_tree.xpath("//caption/name")[0].text
    defendant = get_person(sections_tree)
    case = get_case(sections_tree)
    return defendant, case, errors
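
A minimal call sketch for parse_pdf, assuming a local docket pdf (the path is hypothetical) and a throwaway directory for the pdftotext output:

import tempfile

with tempfile.TemporaryDirectory() as tmp:
    person, case, errors = parse_pdf("dockets/CP-51-CR-0005727-2011.pdf", tempdir=tmp)
    print(person, case)
    if errors:
        print("\n".join(errors))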
Example #8
def test_parameters_ok(value):
    test_grammar = Grammar(
        grammar.params
        + grammar.type_
        + grammar.symbols
        + grammar.ident
        + grammar.ws)
    tree = test_grammar.parse(value)
    assert tree is not None
Example #9
def test_annotationlist_ok(value):
    test_grammar = Grammar(
        "start = annotationlist\n"
        + grammar.annotation
        + grammar.symbols
        + grammar.ident
        + grammar.ws)
    tree = test_grammar.parse(value)
    assert tree is not None
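
The test_*_ok examples (#8-#12, #19, #20) all build a throwaway grammar by concatenating fragment strings from a shared grammar module; most prepend a start rule so the fragment under test becomes the default rule, since parsimonious uses the first rule in the string as the default. A condensed, self-contained sketch of the pattern, with hypothetical stand-ins for grammar.ident and grammar.ws:

from parsimonious.grammar import Grammar

ws = 'ws = ~"[ \\t]*"\n'
ident = 'ident = ~"[a-zA-Z_][a-zA-Z0-9_]*" ws\n'

test_grammar = Grammar("start = ident\n" + ident + ws)
assert test_grammar.parse("foo") is not None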
Example #10
def test_enum_ok(value):
    test_grammar = Grammar(
        "start=enum\n"
        + grammar.enum
        + grammar.ws
        + grammar.symbols
        + grammar.ident
    )
    tree = test_grammar.parse(value)
    assert tree is not None
Example #11
def parse(section_text):

  grammar = Grammar(grammars[0])
  custom_visitor = CustomVisitorFactory(terminals, nonterminals, dict()).create_instance()
  root = grammar.parse(section_text)
#   print("Parse tree:")
#   print(root.prettily())
  xml = custom_visitor.visit(root)
  # print(xml)
  return xml
Example #12
def test_param_list_ok(value):
    test_grammar = Grammar(
        "start = param_list\n"
        + grammar.params
        + grammar.type_
        + grammar.symbols
        + grammar.ident
        + grammar.ws)
    tree = test_grammar.parse(value)
    assert tree is not None
Example #13
    def __init__(self, drawer):
        super().__init__(drawer)
        self.peg_grammar = Grammar(r'''
            line = statement ws? comment? ws
            statement   = directive ws? parameter?
            directive   = ~"P|X|Y|D|W|N|E|S|U"
            parameter   = ~"-?\d{0,}\.{0,1}\d{0,}"
            comment     = ~"#.*"
            ws          = ~"\s*"
        ''')
        self.peg_visitor = self.TigrVisitor()
Example #14
def parse(section_text):

    grammar = Grammar(grammars[0])
    custom_visitor = CustomVisitorFactory(terminals, nonterminals,
                                          dict()).create_instance()
    root = grammar.parse(section_text)
    #   print("Parse tree:")
    #   print(root.prettily())
    xml = custom_visitor.visit(root)
    # print(xml)
    return xml
Example #15
def test_use_regex_library():
    grammar = Grammar(r'''
    unicode_word = ~"[\p{L}]*"
    ''',
                      use_regex_library=True)
    text = 'Тест'
    expected = RegexNode(expr=Regex(pattern=r'[\p{L}]*',
                                    use_regex_library=True),
                         full_text=text,
                         start=0,
                         end=4)
    result = grammar.parse(text=text)
    eq_(result, expected)
Example #16
def test13(grammar: Grammar):
    data = '{($.user.id = 2 && $.users[0].email = "nonmatch") || $.actions[2] = "GET"}'
    print(grammar.parse(data))

    data = [
        ' { ($.user.id = 1) && ($.users[0].email = "*****@*****.**") } ',
        '{($.user.id = 2 && $.users[0].email = "nonmatch") || $.actions[2] = "GET"}',
        '{ $.user.email = "*****@*****.**" || $.coordinates[0][1] = nonmatch && $.actions[2] = nomatch }',
        '{ ($.user.email = "*****@*****.**" || $.coordinates[0][1] = nonmatch) && $.actions[2] = nomatch }'
    ]

    for datum in data:
        print(grammar.parse(datum))
Example #17
def parse(section_text):
  clean_section_text = clean_headers(section_text)
#   print("====")
#   print(clean_section_text)
#   print("====")
#   print("----")
#   print(temp_text)
#   print("-----")
  grammar = Grammar(grammars[0])
  visitor = DispositionVisitor()
  root = grammar.parse(clean_section_text)
  reconstituted_xml = visitor.visit(root)
  return reconstituted_xml
Example #18
def test1():
    grammar = Grammar("""
        selector = root item*
        item = dot identifier ("[" index "]")*
        index = ~'([1-9][0-9]+)|[0-9]'
        identifier = ~'[a-zA-Z0-9_-]+'
        root = "$"
        dot = "."
        """)
    data = '$.a.b[1][2].c[3]'
    print(grammar.parse(data))
    data = '$.a.b[10][2]'
    print(grammar.parse(data))
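
The selector grammar in test1 recognises JSONPath-like strings. The sketch below reuses the same rules and walks the parse tree to pull out the path segments; the real code presumably does this with a NodeVisitor, so treat it as illustration only.

from parsimonious.grammar import Grammar

selector_grammar = Grammar("""
    selector = root item*
    item = dot identifier ("[" index "]")*
    index = ~'([1-9][0-9]+)|[0-9]'
    identifier = ~'[a-zA-Z0-9_-]+'
    root = "$"
    dot = "."
    """)

def identifiers(node):
    # Collect the text of every node produced by the 'identifier' rule.
    found = []
    if node.expr_name == "identifier":
        found.append(node.text)
    for child in node.children:
        found.extend(identifiers(child))
    return found

print(identifiers(selector_grammar.parse("$.a.b[1][2].c[3]")))  # ['a', 'b', 'c']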
Example #19
def test_class_ok(value):
    test_grammar = Grammar(
        grammar.class_
        + grammar.annotation
        + grammar.function
        + grammar.params
        + grammar.type_
        + grammar.qualifier
        + grammar.ws
        + grammar.symbols
        + grammar.ident
    )
    tree = test_grammar.parse(value)
    assert tree is not None
Example #20
def test_func_body_ok(value):
    test_grammar = Grammar(
        "start=func_body\n"
        + grammar.annotation
        + grammar.function
        + grammar.params
        + grammar.type_
        + grammar.qualifier
        + grammar.ws
        + grammar.symbols
        + grammar.ident
    )
    tree = test_grammar.parse(value)
    assert tree is not None
Example #21
def grammar12():
    return Grammar(r"""
        top_cond = _* '{' cond '}' _*
        cond = (_* cond_simple_seq _*) / (_* cond_quoted_seq _*)
        cond_simple_seq = cond_simple cond_tail*
        cond_quoted_seq = cond_quoted cond_tail*
        cond_quoted = '(' cond ')'
        cond_tail = _+ op_boolean _+ cond
        cond_simple = cmp_common / cmp_is_true / cmp_is_false / cmp_is_null / cmp_not_exists

        cmp_common = selector _* op_common _* text
        cmp_is_true = selector _+ 'IS' _+ 'TRUE'
        cmp_is_false = selector _+ 'IS' _+ 'FALSE'
        cmp_is_null = selector _+ 'IS' _+ 'NULL'
        cmp_not_exists = selector _+ 'NOT' _+ 'EXISTS'

        op_common = '=' / '!=' / '<=' / '>=' / '<' / '>'
        op_boolean = '||' / '&&'

        selector = root path+
        path = dot child ('[' index ']')*
        index = ~'([1-9][0-9]+)|[0-9]'
        child = ~'[a-zA-Z0-9_-]+'
        root = "$"
        dot = "."

        text = text_quoted / text_simple
        text_quoted = ~r'"([^"\\]|\\.)*"'
        text_simple = ~'[a-zA-Z0-9.*_-]+'

        _ = ~'[ \t]'
        """)
Example #22
def grammar10():
    return Grammar("""
        top_cond = _* '{' cond '}' _*
        cond = (_* cond_simple_seq _*) / (_* cond_quoted_seq _*)
        cond_simple_seq = cond_simple (_+ op_boolean _+ cond)*
        cond_quoted_seq = '(' cond ')' (_+ op_boolean _+ cond)*
        cond_simple = cmp_common / cmp_is / cmp_not_exists

        cmp_common = selector _* op_common _* text
        cmp_is = selector _+ 'IS' _+ ('TRUE' / 'FALSE' / 'NULL')
        cmp_not_exists = selector _+ 'NOT' _+ 'EXISTS'

        op_common = '=' / '!=' / '<=' / '>=' / '<' / '>'
        op_boolean = '||' / '&&'
        
        selector = root path*
        path = dot child ('[' index ']')*
        
        index = ~'([1-9][0-9]+)|[0-9]'
        child = ~'[a-zA-Z0-9_-]+'
        text = ~'[a-zA-Z0-9._-]+'
        root = "$"
        dot = "."
        _ = ~'[ \t]'
        """)
def test_visitor():
    """Assert a tree gets visited correctly."""
    grammar = Grammar(r'''
        bold_text  = bold_open text bold_close
        text       = ~'[a-zA-Z 0-9]*'
        bold_open  = '(('
        bold_close = '))'
    ''')
    text = '((o hai))'
    tree = Node(grammar['bold_text'], text, 0, 9,
                [Node(grammar['bold_open'], text, 0, 2),
                 Node(grammar['text'], text, 2, 7),
                 Node(grammar['bold_close'], text, 7, 9)])
    eq_(grammar.parse(text), tree)
    result = HtmlFormatter().visit(tree)
    eq_(result, '<b>o hai</b>')
Example #24
def grammar6():
    return Grammar("""
        top_cond = _ '{' cond '}' _
        cond = (_ cond_simple_seq _) / (_ cond_quoted_seq _)
        cond_simple_seq = cond_simple _ (boolean_op _ cond)*
        cond_quoted_seq = '(' cond ')' _ (boolean_op _ cond)*
        cond_simple = cmp_basic / cmp_numeric / cmp_is / cmp_not_exists
        
        cmp_basic = (_ cmp_eq _) / (_ cmp_ne _)
        cmp_numeric = (_ cmp_le _) / (_ cmp_ge _) / (_ cmp_lt _) / (_ cmp_gt _)
        cmp_is = (_ cmp_is_true _) / (_ cmp_is_false _) / (_ cmp_is_null _)
        
        cmp_eq = selector _ '=' _ text
        cmp_ne = selector _ '!=' _ text
        
        cmp_le = selector _ '<=' _ text
        cmp_ge = selector _ '>=' _ text
        cmp_lt = selector _ '<' _ text
        cmp_gt = selector _ '>' _ text
        
        cmp_is_true = selector _ 'IS' _ 'TRUE'
        cmp_is_false = selector _ 'IS' _ 'FALSE'
        cmp_is_null = selector _ 'IS' _ 'NULL'
        cmp_not_exists = selector _ 'NOT' _ 'EXISTS'
        
        
        boolean_op = '||' / '&&'
        selector = ~'[a-zA-Z0-9._-]+'
        text = ~'[a-zA-Z0-9._-]+'
        _ = ~'[ \t]*'
        """)
Example #25
def test_visitor():
    """Assert a tree gets visited correctly."""
    grammar = Grammar(r'''
        bold_text  = bold_open text bold_close
        text       = ~'[a-zA-Z 0-9]*'
        bold_open  = '(('
        bold_close = '))'
    ''')
    text = '((o hai))'
    tree = Node(grammar['bold_text'], text, 0, 9,
                [Node(grammar['bold_open'], text, 0, 2),
                 Node(grammar['text'], text, 2, 7),
                 Node(grammar['bold_close'], text, 7, 9)])
    eq_(grammar.parse(text), tree)
    result = HtmlFormatter().visit(tree)
    eq_(result, '<b>o hai</b>')
Example #26
    def get_action_sequence_and_all_actions(self,
                                            query: List[str] = None,
                                            prelinked_entities: Dict[str, Dict[str, str]] = None) -> Tuple[List[str], List[str]]: # pylint: disable=line-too-long
        grammar_with_context = deepcopy(self.base_grammar_dictionary)

        if not self.use_prelinked_entities and prelinked_entities is not None:
            raise ConfigurationError("The Text2SqlNoGrammarWorld was specified to not use prelinked "
                                     "entities, but prelinked entities were passed.")
        prelinked_entities = prelinked_entities or {}

        update_grammar_numbers_and_strings_with_variables(
            grammar_with_context, prelinked_entities, self.columns)
        update_grammar_with_tokens(grammar_with_context, query)

        grammar = Grammar(format_grammar_string(grammar_with_context))

        valid_actions = initialize_valid_actions(grammar)
        all_actions = set()
        for action_list in valid_actions.values():
            all_actions.update(action_list)
        sorted_actions = sorted(all_actions)

        sql_visitor = SqlVisitor(grammar)
        try:
            action_sequence = sql_visitor.parse(" ".join(query)) if query else []
        except ParseError as e:
            print("\nParse Error - details:\n", e.pos, '\n', e.expr, '\n', e.text)
            action_sequence = None
        except RecursionError as er:
            print("\nParse recursion error - details:\n", " ".join(query), '\n', grammar_with_context['terminal'])
            action_sequence = None

        return action_sequence, sorted_actions
Example #27
    def test_variable_free_world_cannot_parse_as_statements(self):
        world = Text2SqlWorld(self.schema)
        grammar_dictionary = world.base_grammar_dictionary
        for productions in grammar_dictionary.items():
            assert "AS" not in productions

        sql_with_as = [
            'SELECT', 'COUNT', '(', '*', ')', 'FROM', 'LOCATION', 'AS',
            'LOCATIONalias0', ',', 'RESTAURANT', 'WHERE', 'LOCATION', '.',
            'CITY_NAME', '=', "'city_name0'", 'AND', 'RESTAURANT', '.', 'NAME',
            '=', 'LOCATION', '.', 'RESTAURANT_ID', 'AND', 'RESTAURANT', '.',
            'NAME', '=', "'name0'", ';'
        ]

        grammar = Grammar(format_grammar_string(world.base_grammar_dictionary))
        sql_visitor = SqlVisitor(grammar)

        with self.assertRaises(ParseError):
            sql_visitor.parse(" ".join(sql_with_as))

        sql = [
            'SELECT', 'COUNT', '(', '*', ')', 'FROM', 'LOCATION', ',',
            'RESTAURANT', 'WHERE', 'LOCATION', '.', 'CITY_NAME', '=',
            "'city_name0'", 'AND', 'RESTAURANT', '.', 'NAME', '=', 'LOCATION',
            '.', 'RESTAURANT_ID', 'AND', 'RESTAURANT', '.', 'NAME', '=',
            "'name0'", ';'
        ]

        # Without the AS we should still be able to parse it.
        sql_visitor = SqlVisitor(grammar)
        sql_visitor.parse(" ".join(sql))
Example #28
    def get_action_sequence_and_all_actions(self,
                                            query: List[str] = None,
                                            prelinked_entities: Dict[str, Dict[str, str]] = None) -> Tuple[List[str], List[str]]:  # pylint: disable=line-too-long
        grammar_with_context = deepcopy(self.base_grammar_dictionary)

        if not self.use_prelinked_entities and prelinked_entities is not None:
            raise ConfigurationError(
                "The Text2SqlWorld was specified to not use prelinked "
                "entities, but prelinked entities were passed.")
        prelinked_entities = prelinked_entities or {}

        if self.use_untyped_entities:
            update_grammar_values_with_variables(grammar_with_context,
                                                 prelinked_entities)
        else:
            update_grammar_numbers_and_strings_with_variables(
                grammar_with_context, prelinked_entities, self.columns)

        grammar = Grammar(format_grammar_string(grammar_with_context))

        valid_actions = initialize_valid_actions(grammar)
        all_actions = set()
        for action_list in valid_actions.values():
            all_actions.update(action_list)
        sorted_actions = sorted(all_actions)

        sql_visitor = SqlVisitor(grammar)
        action_sequence = sql_visitor.parse(" ".join(query)) if query else []
        return action_sequence, sorted_actions
Example #29
def grammar8():
    return Grammar("""
        top_cond = _* '{' cond '}' _*
        cond = (_* cond_simple_seq _*) / (_* cond_quoted_seq _*)
        cond_simple_seq = cond_simple (_+ boolean_op _+ cond)*
        cond_quoted_seq = '(' cond ')' (_+ boolean_op _+ cond)*
        cond_simple = cmp_basic / cmp_numeric / cmp_is / cmp_not_exists

        cmp_basic = cmp_eq / cmp_ne
        cmp_numeric = cmp_le / cmp_ge / cmp_lt / cmp_gt
        cmp_is = cmp_is_true / cmp_is_false / cmp_is_null

        cmp_eq = selector _* '=' _* text
        cmp_ne = selector _* '!=' _* text

        cmp_le = selector _* '<=' _* text
        cmp_ge = selector _* '>=' _* text
        cmp_lt = selector _* '<' _* text
        cmp_gt = selector _* '>' _* text

        cmp_is_true = selector _+ 'IS' _+ 'TRUE'
        cmp_is_false = selector _+ 'IS' _+ 'FALSE'
        cmp_is_null = selector _+ 'IS' _+ 'NULL'
        cmp_not_exists = selector _+ 'NOT' _+ 'EXISTS'

        boolean_op = '||' / '&&'
        selector = ~'[a-zA-Z0-9._-]+'
        text = ~'[a-zA-Z0-9._-]+'
        _ = ~'[ \t]'
        """)
Example #30
def _construct(names):
    grammar_list = []
    for name in reversed(names):
        grammar_path = path.join(_grammar_dir, '{}.grammar'.format(name))
        with open(grammar_path, 'r') as file:
            grammar_list.append(file.read())
    return Grammar('\n'.join(grammar_list))
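
Note that because the fragment files are read in reversed(names) order, the name listed last ends up first in the combined string, and parsimonious takes the first rule it sees as the grammar's default rule; presumably the caller lists the top-level fragment last. A hypothetical call (the fragment names are made up):

# 'program.grammar' is read first here, so its first rule becomes the default rule.
grammar = _construct(["literals", "expressions", "program"])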
Example #31
    def get_action_sequence_and_all_actions(self,
                                            allow_aliases: bool = False
                                            ) -> Tuple[List[str], List[str]]:
        grammar_with_context = deepcopy(self.base_grammar_dictionary)
        if not allow_aliases:
            update_grammar_to_be_table_names_free(grammar_with_context)

        schema = self.db_context.schema
        update_grammar_with_tables(grammar_with_context, schema)
        grammar = Grammar(format_grammar_string(grammar_with_context))

        valid_actions = initialize_valid_actions(grammar)
        all_actions = set()
        for action_list in valid_actions.values():
            all_actions.update(action_list)
        sorted_actions = sorted(all_actions)
        self.valid_actions = valid_actions
        self.valid_actions_flat = sorted_actions
        action_sequence = None
        if self.query is not None:
            sql_visitor = SqlVisitor(grammar)
            query = " ".join(self.query).lower().replace("``", "'").replace(
                "''", "'")
            try:
                action_sequence = sql_visitor.parse(query) if query else []
            except ParseError as e:
                pass

        return action_sequence, sorted_actions
Example #32
def test():
    grammar = Grammar(
        #pattern = "{" ws text ws "=" ws text ws "}"
        """
        #pattern6 = _ "{" pattern5 "}" _
        #pattern6 = "(" pattern4 (logical pattern4)*
        #pattern5 = pattern4 (logical pattern4)*
        
        #pattern6 = pattern5 / pattern4
        
        
        #pattern7 = _ pattern6 _
        
        # (a || (b || c))
        # ((a || b) || c)
        # ((a || b || c))
        
        pattern7 = "(" pattern7 (logical pattern6)* ")"
        
        pattern6 = pattern5 / pattern4
        # (a || b || c)
        pattern5 = "(" pattern4 ")"
        # a || b || c
        pattern4 = pattern3 (logical pattern3)*
        
        pattern3 = _ (pattern2 / pattern1) _
        pattern2 = "(" _ (pattern2/pattern1) _ ")"
        pattern1 = _ (compare_eq / compare_ne) _
        
        compare_eq = text _ "=" _ text
        compare_ne = text _ "!=" _ text
        logical = "&&" / "||"
        _ = ~"[ \t]"*
        text = ~"[a-zA-Z0-9_-]+"
        
        item = dot identifier ("[" index "]")*
        index = ~"[0-9]|[1-9][0+9]+"
        identifier = ~"[a-zA-Z0-9_-]+"
        
        selector = root item*
        
        root = "$"
        dot = "."
        """)
    #data = ' { ( -a_bc= 123 ) || abc = 123 } '
    data = '(((-a_bc=123))||abc=12)'
    print(grammar.parse(data))
Example #33
    def __init__(self, tune_fn):
        NodeVisitor.__init__(self)
        # start with an empty tune, voice, note, and list of modifiers
        self.tune = Tune()
        self.voice = Voice()
        self.note = Note()
        self.note_modifiers = []

        # at the outset, we are not in a voice's content
        self.in_content = False

        # set up the actual parser
        grammar = Grammar(open("doremi-grammar", "r").read())

        # read and parse the tune
        tune_text = codecs.open(tune_fn, "r", "utf-8").read()
        self.syntax = grammar.parse(tune_text)
Example #34
def grammar3():
    return Grammar("""
        expr =  atom_ext / expr_ext
        atom_ext = atom (or expr)*
        expr_ext = '(' expr ')' (or expr)*
        atom = ~'[a-zA-Z0-9._-]+'
        or = '||'
        """)
Example #35
def parse(path):
  """
  Parse a pdf docket into an xml document.
  This xml document will be of the form:
  <docket>
    <page>
      <caption> </caption>
      <body>
        <section name='a'> </section>
        <section name='b'> </section>
        ...
      </body>
      <footer> </footer>
    </page>
    <page>
      ...
    </page>
  </docket>
  ...

  This xml most closely resembles the original docket. (The caveat
  is that section names are removed from the text and turned into name
  attributes of the section xml elements).

  But some sections extend across pages, and this xml schema leaves these
  sections separated from each other.


  TODO: Turn this into a real .xsd schema definition.
  """
  print("Starting parse {}".format(path)) #
  start = datetime.now() #
  docket_text = pdf_to_text(path)
  pdf2text_time = (datetime.now()-start).microseconds #
  start = datetime.now() #
  grammar = Grammar(grammar_list[0])
  create_grammar_time = (datetime.now()-start).microseconds #
  visitor = DocketVisitor()
  start = datetime.now() #
  root = grammar.parse(docket_text)
  parse_grammar_time = (datetime.now()-start).microseconds #
  start = datetime.now() #
  results = visitor.visit(root)
  node_visitor_time = (datetime.now()-start).microseconds #
  logging.info("{}, {}, {}, {}".format(pdf2text_time, create_grammar_time, parse_grammar_time, node_visitor_time))
  return results
def jexl_grammar(jexl_config):
    return Grammar(r"""
        expression = (
            _ (conditional_expression / binary_expression / unary_expression / complex_value) _
        )

        conditional_expression = (
            conditional_test _ "?" _ expression _ ":" _ expression
        )
        conditional_test = (binary_expression / unary_expression / complex_value)

        binary_expression = binary_operand (_ binary_operator _ binary_operand)+
        binary_operator = {binary_op_pattern}
        binary_operand = (unary_expression / complex_value)

        unary_expression = unary_operator _ unary_operand
        unary_operator = {unary_op_pattern}
        unary_operand = (unary_expression / complex_value)

        complex_value = value (transform / attribute / filter_expression)*

        transform = "|" identifier transform_arguments?
        transform_arguments = "(" _ value_list _ ")"

        attribute = "." identifier

        filter_expression = "[" _ expression _ "]"

        value = (
            boolean / string / numeric / subexpression / object_literal /
            array_literal / identifier / relative_identifier
        )

        subexpression = "(" _ expression _ ")"

        object_literal = "{{" _ object_key_value_list? _ "}}"
        object_key_value_list = object_key_value (_ "," _ object_key_value)*
        object_key_value = identifier _ ":" _ expression

        array_literal = "[" _ value_list? _ "]"
        value_list = expression (_ "," _ expression)*

        identifier = ~r"[a-zA-Z_\$][a-zA-Z0-9_\$]*"
        relative_identifier = "." identifier

        boolean = "true" / "false"
        string = ~"\"[^\"\\\\\\n\\r]*(?:\\\\.[^\"\\\\\\n\\r]*)*\""is /
                 ~"'[^'\\\\\\n\\r]*(?:\\\\.[^'\\\\\\n\\r]*)*'"is
        numeric = "-"? number ("." number)?

        number = ~r"[0-9]+"

        _ = ~r"\s*"
    """.format(
        binary_op_pattern=operator_pattern(jexl_config.binary_operators.values()),
        unary_op_pattern=operator_pattern(jexl_config.unary_operators.values())
    ))
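
The two {..._op_pattern} placeholders above are filled in by an operator_pattern helper that is not shown here. A plausible sketch of it, assuming each configured operator object exposes a .symbol attribute (the library's real implementation may differ):

def operator_pattern(operators):
    # Sort longest symbol first so that, e.g., "==" is tried before "=",
    # then join the symbols into a PEG ordered choice of quoted literals.
    symbols = sorted((op.symbol for op in operators), key=len, reverse=True)
    return " / ".join('"{}"'.format(symbol) for symbol in symbols)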
Example #37
    def __init__(self, text):
        NodeVisitor.__init__(self)

        # start with a new empty lyric
        self.lyric = Lyric()
        # add an empty voice to it
        self.lyric.voices.append(LyricVoice())

        # build an abstract syntax tree
        self.grammar = Grammar(open("lyric-grammar", "r").read())
        self.syntax = self.grammar.parse(text)
Example #38
class LyricParser(NodeVisitor):
    """Parses .drmw lyric files for association with Doremi tunes"""
    def __init__(self, text):
        NodeVisitor.__init__(self)

        # start with a new empty lyric
        self.lyric = Lyric()
        # add an empty voice to it
        self.lyric.voices.append(LyricVoice())

        # build an abstract syntax tree
        self.grammar = Grammar(open("lyric-grammar", "r").read())
        self.syntax = self.grammar.parse(text)
        
    def convert(self):
        """Convert the syntax tree to our internal representation"""
        self.visit(self.syntax)

        # remove any extra empty voices
        self.lyric.voices = [voice for voice in self.lyric.voices
                             if voice.name != ""]

        # remove any extra empty verses
        for voice in self.lyric.voices:
            voice.verses = voice.verses[:-1]
            
        return self.lyric

    def visit_title(self, node, vc):
        self.lyric.title = get_string_val(node)

    def visit_author(self, node, vc):
        self.lyric.author = get_string_val(node)

    def visit_meter(self, node, vc):
        self.lyric.meter = get_string_val(node)

    def visit_voicespec(self, node, vc):
        # the current voice is complete, so start a new one
        self.lyric.voices.append(LyricVoice())
        
    def visit_voice(self, node, vc):
        self.lyric.voices[-1].name = get_node_val(node, "name")

    def visit_verse(self, node, vc):
        # the verse is complete, so start a new one
        self.lyric.voices[-1].verses.append(Verse())

    def visit_word(self, node, vc):
        self.lyric.voices[-1].verses[-1].words.append(node.text.strip())

    def generic_visit(self, node, vc):
        pass
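
A hypothetical usage sketch for LyricParser; the constructor reads lyric-grammar from the working directory, and the .drmw file name below is made up:

with open("song.drmw", encoding="utf-8") as f:  # hypothetical lyric file
    lyric = LyricParser(f.read()).convert()
print(lyric.title, [voice.name for voice in lyric.voices])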
def test_custom_visitor_factory():
  text = """Hi there, partner"""
  grammar = r"""
  text = greeting punctuation identifier
  greeting = hi_there?
  punctuation = comma?
  identifier = partner?

  hi_there = "Hi there"
  comma = ", "
  partner = "partner"
  """
  grammar = Grammar(grammar)
  terminals = ["hi_there", "comma", "partner"]
  nonterminals = ["text", "greeting", "punctuation", "identifier"]
  custom_visitor = CustomVisitorFactory(terminals, nonterminals, dict()).create_instance()
  #custom_visitor = custom_visitor.create_instance()
  root = grammar.parse(text)
#   print("The parse tree:")
#   print(root.prettily())
  xml = custom_visitor.visit(root)
  assert xml=="<text> <greeting> Hi there </greeting><punctuation> ,  </punctuation><identifier> partner </identifier> </text>"
#   print(xml)
#   print("Finished.")
Example #40
               Defendant eligible for work release.
          Probation                                                                 Max of 3.00 Years                                  12/20/2011
                                                                                    3 years
               All conditions previously imposed to remain.
""",
    """
      Manufacture or Deliver
         Shreeves-Johns, Karen                                                     07/13/2011
            Probation                                                                Max of 3.00 Years                                   07/13/2011
                                                                                     3 years
                  Defendant is to pay imposed mandatory court costs.
                  To submit to random drug screens.
                  To pursue a prescribed secular course of study or vocational training.
                  Case relisted for status of compliance on 9/22/11 courtroom 605.
       Shreeves-Johns, Karen                                                     12/20/2011
          Confinement                                                              Min of 11.00 Months 15.00 Days                      12/20/2011
                                                                                   Max of 23.00 Months
                                                                                   11 1/2 - 23 months
               Defendant eligible for work release.
          Probation                                                                 Max of 3.00 Years                                  12/20/2011
                                                                                    3 years
               All conditions previously imposed to remain.
""",
]

grammar = Grammar(grammars[0])
root = grammar.parse(texts[0])
print("parsed.")
visitor = DetailsVisitor()
print(visitor.visit(root))
Example #41
  def stringify_list(self, list):
    output = ""
    for element in list:
      output += element
    return output
# End of Class

test_num = 0

# grammar = Grammar(grammars[test_num])
# root = grammar.parse(texts[0])
# print("Parsed okay.")
# visitor = CaseInfoVisitor()
# results = visitor.visit(root)
# print(results)
# for r in results:
#   print(r)
# print(root.prettily())

#with open("./sample_dockets/CP-51-CR-0000001-2011.txt") as f:
with open("./sample_dockets/CP-51-CR-0005727-2011.txt") as f:
  grammar = Grammar(grammars[test_num])
  root = grammar.parse(f.read())
  visitor = DocketVisitor_2()
  print("Parse succeeded.")
  with open("output2.txt", 'w+') as f2:
    f2.write(visitor.visit(root))
# both files are closed automatically by their with blocks