Exemplo n.º 1
0
    def run_test_file(self, filename, debug=False):
        # some basic recognizers and constructors to parse a simple
        # structured plaintext format.
        #
        # RECOGNIZERS
        def is_header(parser):
            suspect = parser.reader.peek()
            return (len(suspect) > 100 and not suspect.endswith("."))

        def is_section(parser):
            (ordinal,title) = analyze_sectionstart(parser.reader.peek())
            return section_segments_count(ordinal) == 1

        def is_subsection(parser):
            (ordinal,title) = analyze_sectionstart(parser.reader.peek())
            return section_segments_count(ordinal) == 2

        def is_subsubsection(parser):
            (ordinal,title) = analyze_sectionstart(parser.reader.peek())
            return section_segments_count(ordinal) == 3

        def is_preformatted(parser):
            return "   " in parser.reader.peek()

        def is_definition(parser):
            return False

        def is_description(parser):
            return False

        def is_li_decimal(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('decimal','decimal-leading-zero')

        def is_li_alpha(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('lower-alpha','upper-alpha')

        def is_li_roman(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('lower-roman','upper-roman')

        def is_unordereditem(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('disc','circle','square','dash')

        def is_state_a(parser):
            return parser.reader.peek().startswith("State A:")

        def is_state_b(parser):
            return parser.reader.peek().startswith("State B:")

        def is_state_c(parser):
            return parser.reader.peek().startswith("State C:")
        
        def is_paragraph(parser):
            # c.f. test/files/fsmparser/invalid.txt
            return len(parser.reader.peek()) > 6

        # MAGIC
        def sublist_or_parent(symbol,state_stack):
            constructor = False
            newstate = None
            if symbol == is_li_alpha and "ol-alpha" not in state_stack: # maybe only check state_stack[-2]
                constructor = make_ol_alpha
                newstate = "ol-alpha"
            elif symbol == is_li_roman and "ol-roman" not in state_stack:
                constructor = make_ol_roman
                newstate = "ol-roman"
            elif symbol == is_li_decimal and "ol-decimal" not in state_stack:
                constructor = make_ol_roman
                newstate = "ol-roman"
            else:
                pass
            return (constructor,newstate)
        
        # CONSTRUCTORS
        def make_body(parser):
            parser._debug("Hello")
            b = elements.Body()
            return parser.make_children(b)
        setattr(make_body,'newstate','body')
        
        def make_section(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next())
            s = elements.Section(ordinal=secnumber,title=title)
            return parser.make_children(s)
        setattr(make_section,'newstate','section')

        def make_subsection(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next())
            s = elements.Subsection(ordinal=secnumber,title=title)
            return parser.make_children(s)
        setattr(make_subsection,'newstate','subsection')

        def make_subsubsection(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next())
            s = elements.Subsubsection(ordinal=secnumber,title=title)
            return parser.make_children(s)
        setattr(make_subsubsection,'newstate','subsubsection')

        def make_paragraph(parser):
            return elements.Paragraph([parser.reader.next().strip()])

        def make_preformatted(parser):
            return elements.Preformatted([parser.reader.next()])

#        def make_unorderedlist(parser):
#            listtype = analyze_listitem(parser.reader.peek())[0]
#            assert ordinal is None
#            ul = elements.UnorderedList(type=listtype)
#            ul.append(parser.make_child(IN_UNORDEREDLIST)) # 1st element of list
#            return parser.make_children(ul)
#        setattr(make_unorderedlist,'newstate','unorderedlist')

        def make_ol_decimal(parser):
            return make_orderedlist(parser,"decimal","ol-decimal")
        setattr(make_ol_decimal,'newstate','ol-decimal')

        def make_ol_alpha(parser):
            return make_orderedlist(parser,"lower-alpha", "ol-alpha")
        setattr(make_ol_alpha,'newstate','ol-alpha')

        def make_ol_roman(parser):
            return make_orderedlist(parser,"lower-roman", "ol-roman")
        setattr(make_ol_roman,'newstate','ol-romal')

        def make_listitem(parser):
            chunk = parser.reader.next()
            (listtype,ordinal,separator,rest) = analyze_listitem(chunk)
            li = elements.ListItem(ordinal=ordinal)
            li.append(rest)
            return parser.make_children(li)
        setattr(make_listitem,'newstate','listitem')

        def make_state_a(parser):
            return elements.Paragraph([parser.reader.next().strip()],id="state-a")
        # setattr(make_state_a, 'newstate', 'state-a')

        def make_state_b(parser):
            return elements.Paragraph([parser.reader.next().strip()],id="state-b")
        # setattr(make_state_b, 'newstate', 'state-b')

        def make_state_c(parser):
            return elements.Paragraph([parser.reader.next().strip()],id="state-c")
        # setattr(make_state_c, 'newstate', 'state-c')
        
        # HELPERS
        def section_segments_count(s):
            return ((s is not None) and 
                    len(list(filter(None,s.split(".")))))

        def make_orderedlist(parser,listtype,childstate):
            listtype = analyze_listitem(parser.reader.peek())[0]
            ol = elements.OrderedList(type=listtype)
            ol.append(parser.make_child(make_listitem,"listitem"))
            return parser.make_children(ol)

        # matches
        # "1 Blahonga"
        # "1.2.3. This is a subsubsection"
        re_sectionstart = re.compile("^(\d[\.\d]*) +(.*[^\.])$").match
        def analyze_sectionstart(chunk):
            m = re_sectionstart(chunk)
            if m:
                return (m.group(1).rstrip("."), m.group(2).strip())
            else:
                return (None,chunk)

        def analyze_listitem(chunk):
            # returns: same as list-style-type in CSS2.1, sans
            # 'georgian', 'armenian' and 'greek', plus 'dashed'
            listtype = ordinal = separator = rest = None
            # match "1. Foo…" or "14) bar…" but not "4 This is a heading"
            m = re.match('^(\d+)([\.\)]) +',chunk)
            if m:
                if chunk.startswith("0"):
                    listtype="decimal-leading-zero"
                else:
                    listtype="decimal"
                (ordinal,separator) = m.groups()
                rest = chunk[m.end():]
                return (listtype,ordinal,separator,rest)

            # match "IX. Foo… or "vii) bar…" but not "vi is a sucky
            # editor" or "MMXIII is the current year"
            m = re.match('^([IVXivx]+)([\.\)]) +', chunk)
            if m:
                if chunk[0].islower():
                    listtype = 'lower-roman'
                else:
                    listtype = 'upper-roman'
                (ordinal,separator) = m.groups()
                rest = chunk[m.end():]
                return (listtype,ordinal,separator,rest)

            # match "a. Foo… or "z) bar…" but not "to. Next sentence…"
            m = re.match('^([A-Za-z])([\.\)]) +', chunk)
            if m:
                if chunk[0].islower():
                    listtype = 'lower-alpha'
                else:
                    listtype = 'upper-alpha'
                (ordinal,separator) = m.groups()
                rest = chunk[m.end():]
                return (listtype,ordinal,separator,rest)

            if chunk.startswith("* "):
                return ("disc",None,None,chunk)
            if chunk.startswith("- "):
                return ("dash",None,None,chunk)
                
            return (listtype,ordinal,separator,chunk) # None * 3

        
        # MAIN CODE
        p = FSMParser()
        p.set_recognizers(is_li_decimal,
                          is_li_roman, 
                          is_li_alpha,
                          is_header,
                          is_section,
                          is_subsection,
                          is_subsubsection,
                          is_preformatted,
                          is_definition,
                          is_description,
                          is_state_a,
                          is_state_b,
                          is_state_c,
                          is_paragraph)
        p.set_transitions({("body", is_paragraph): (make_paragraph, None),
                           ("body", is_section): (make_section,"section"),
                           ("body", is_state_a): (make_state_a, "state-a"),
                           ("state-a", is_state_b): (make_state_b, "state-b"),
                           ("state-b", is_state_c): (make_state_c, "state-c"),
                           ("state-c", is_section): (False, None),
                           ("section", is_paragraph): (make_paragraph, None),
                           ("section", is_subsection): (make_subsection, "subsection"),
                           ("subsection", is_paragraph): (make_paragraph,None),
                           ("subsection", is_subsection): (False,None),
                           ("subsection", is_state_a): (False,"body"), 
                           ("subsection", is_subsubsection): (make_subsubsection,"subsubsection"),
                           ("subsubsection", is_paragraph): (make_paragraph,None),
                           ("subsubsection", is_section): (False, None),
                           ("subsection", is_section): (False, None),
                           ("section", is_section): (False, None),
                           ("body", is_li_decimal): (make_ol_decimal, "ol-decimal"),
                           ("ol-decimal",is_li_decimal):(make_listitem,"listitem"),
                           ("ol-decimal",is_li_alpha):(make_ol_alpha,"ol-alpha"),
                           ("ol-alpha",is_li_alpha):(make_listitem,"listitem"),
                           ("ol-alpha",is_li_roman):(make_ol_roman,"ol-roman"),
                           ("ol-roman",is_li_roman):(make_listitem,"listitem"),
                           ("ol-roman",is_li_alpha):(False,None),
                           ("ol-alpha",is_li_decimal):(False,None),
                           ("listitem",is_li_alpha):sublist_or_parent, 
                           ("listitem",is_li_roman):sublist_or_parent, 
                           ("listitem",is_li_decimal):sublist_or_parent, 
                           })

        p.debug = debug

        tr=TextReader(filename,encoding="utf-8",linesep=TextReader.UNIX)
        p.initial_state = "body"
        p.initial_constructor = make_body
        b = p.parse(tr.getiterator(tr.readparagraph))
        return p, b
Exemplo n.º 2
0
    def get_parser(self, basefile, sanitized, parseconfig="default"):
        """Build an FSMParser for numbered-section plaintext and return its parse method.

        The parser recognizes numbered section headings, unnumbered
        headers, dash-bulleted list items ("strecksatser") and plain
        paragraphs.

        :param basefile: not used by this implementation
        :param sanitized: not used by this implementation
        :param parseconfig: ``"simple"`` leaves out the numbered
            section/subsection recognizers; any other value uses the
            full recognizer set
        :returns: the bound ``parse`` method of the configured FSMParser
        """

        # RECOGNIZERS -- each peeks at upcoming chunks without consuming
        # them and returns a truthy value on a match.
        def is_header(parser):
            p = parser.reader.peek()
            # older direktiv sources start with dir number
            if re.match(r'Dir\.? \d{4}:\d+$', p):
                return False
            # a header-looking chunk directly followed by a list item is
            # not treated as a header
            return (headerlike(p) and
                    not is_strecksats(parser, parser.reader.peek(2)))

        def is_strecksats(parser, chunk=None):
            if chunk is None:
                chunk = parser.reader.peek()
            return chunk.startswith(("--", "- "))

        def is_section(parser):
            (ordinal, headingtype, title) = analyze_sectionstart(parser)
            if ordinal:
                return headingtype == "h1"

        def is_subsection(parser):
            (ordinal, headingtype, title) = analyze_sectionstart(parser)
            if ordinal:
                return headingtype == "h2"

        def is_paragraph(parser):
            return True

        # CONSTRUCTORS -- each consumes what it needs from parser.reader
        # and returns the element it built.
        @newstate('body')
        def make_body(parser):
            return parser.make_children(Body())

        @newstate('section')
        def make_section(parser):
            chunk = parser.reader.next()
            ordinal, headingtype, title = analyze_sectionstart(parser, chunk)
            s = Avsnitt(ordinal=ordinal, title=title)
            return parser.make_children(s)

        @newstate('strecksats')
        def make_strecksatslista(parser):
            # the chunk that triggered the list becomes its first item
            ul = Strecksatslista()
            li = make_listitem(parser)
            ul.append(li)
            res = parser.make_children(ul)
            return res

        def make_listitem(parser):
            chunk = parser.reader.next()
            s = str(chunk)
            if " " in s:
                # assume text before first space is the bullet
                s = s.split(" ", 1)[1]
            else:
                # assume the bullet is a single char
                s = s[1:]
            return Strecksatselement([s])

        def make_header(parser):
            return Heading([parser.reader.next()])

        def make_paragraph(parser):
            return Paragraph([parser.reader.next()])

        @newstate('unorderedsection')
        def make_unorderedsection(parser):
            s = UnorderedSection(title=parser.reader.next().strip())
            return parser.make_children(s)

        # HELPERS
        def headerlike(p):
            """Return True if p starts with an uppercase-able character, is
            shorter than 150 characters and doesn't end like a sentence."""
            if not p:
                # an empty chunk can't be a header (and p[0] would raise)
                return False
            # a trailing "." disqualifies, unless it belongs to one of
            # these abbreviations
            abbreviations = ("m.m.", "m. m.", "m.fl.", "m. fl.")
            return (p[0].lower() != p[0]
                    and len(p) < 150
                    and not (p.endswith(".") and
                             not p.endswith(abbreviations)))

        re_sectionstart = re.compile(r"^(\d[\.\d]*) +([A-ZÅÄÖ].*)$").match

        def analyze_sectionstart(parser, chunk=None):
            """returns (ordinal, headingtype, text) if it looks like a section
            heading, (None, None, chunk) otherwise."""
            if chunk is None:
                chunk = parser.reader.peek()
            m = re_sectionstart(chunk)
            if m and headerlike(m.group(2)):
                # heading level = number of dots in the ordinal, plus one
                return (m.group(1),
                        "h" + str(m.group(1).count(".") + 1),
                        m.group(2).strip())
            else:
                return None, None, chunk

        p = FSMParser()
        if parseconfig == "simple":
            recognizers = [is_header, is_strecksats, is_paragraph]
        else:
            recognizers = [is_section,
                           is_subsection,
                           is_header,
                           is_strecksats,
                           is_paragraph]
        p.set_recognizers(*recognizers)
        commonstates = ("body", "section", "subsection", "unorderedsection")
        # (state(s), recognizer) -> (constructor, newstate); a constructor
        # of False is used where the current element should end.
        p.set_transitions({(commonstates, is_paragraph): (make_paragraph, None),
                           (commonstates, is_strecksats): (make_strecksatslista, "strecksats"),
                           (commonstates, is_header): (make_unorderedsection, "unorderedsection"),
                           (commonstates, is_section): (make_section, "section"),

                           ("unorderedsection", is_header): (False, None),
                           ("unorderedsection", is_section): (False, None),
                           ("strecksats", is_paragraph): (False, None),
                           ("strecksats", is_strecksats): (make_listitem, None),
                           ("section", is_header): (False, None),
                           ("section", is_section): (False, None),
                           ("section", is_subsection): (make_section, "subsection"),
                           ("subsection", is_subsection): (False, None),
                           ("subsection", is_section): (False, None)})
        p.initial_state = "body"
        p.initial_constructor = make_body
        p.debug = os.environ.get('FERENDA_FSMDEBUG', False)
        return p.parse
Exemplo n.º 3
0
    def get_parser(self, basefile, sanitized, parseconfig="default"):
        """Build an FSMParser for positioned text chunks and return its parse method.

        Chunks are classified by the ``font.size``, ``top`` and ``left``
        attributes they expose and by their text, and are turned into
        metadata elements, an abstract, sections, blockquotes and
        paragraphs.

        :param basefile: not used by this implementation
        :param sanitized: not used by this implementation
        :param parseconfig: not used by this implementation
        :returns: the bound ``parse`` method of the configured FSMParser
        """
        # RECOGNIZERS -- each peeks at the next chunk without consuming
        # it and returns a truthy value on a match.
        def is_heading(parser):
            return parser.reader.peek().font.size == 17

        def is_dnr(parser):
            chunk = parser.reader.peek()
            return (chunk.font.size == 12 and
                    re.match(r'\d+-\d{2,4}', str(chunk)) is not None)

        def is_datum(parser):
            chunk = parser.reader.peek()
            return (chunk.font.size == 12 and
                    re.match(r'\d{4}-\d{2}-\d{2}', str(chunk)) is not None)

        def is_nonessential(parser):
            # chunks at or beyond these vertical positions are skipped
            # (presumably page headers/footers -- TODO confirm)
            chunk = parser.reader.peek()
            return chunk.top >= 1159 or chunk.top <= 146

        def is_abstract(parser):
            return str(parser.reader.peek()).startswith("Beslutet i korthet:")

        def is_section(parser):
            chunk = parser.reader.peek()
            strchunk = str(chunk)
            return (chunk.font.size == 14 and chunk[0].tag == "b"
                    and not strchunk.endswith("."))

        def is_blockquote(parser):
            return parser.reader.peek().left >= 255

        def is_normal(parser):
            return parser.reader.peek().left < 255

        def is_paragraph(parser):
            return True

        # CONSTRUCTORS -- each consumes what it needs from parser.reader
        # and returns the element it built (or None to emit nothing).
        @decorators.newstate("body")
        def make_body(parser):
            return parser.make_children(Body())

        def make_heading(parser):
            # h = Heading(str(parser.reader.next()).strip())
            h = Meta([str(parser.reader.next()).strip()],
                     predicate=DCTERMS.title,
                     lang="sv")
            return h

        @decorators.newstate("abstract")
        def make_abstract(parser):
            a = Abstract([Paragraph(parser.reader.next())])
            return parser.make_children(a)

        @decorators.newstate("section")
        def make_section(parser):
            s = UnorderedSection(title=str(parser.reader.next()).strip())
            return parser.make_children(s)

        @decorators.newstate("blockquote")
        def make_blockquote(parser):
            # Consumes nothing itself: the chunk that triggered the
            # blockquote is picked up by the ("blockquote",
            # is_blockquote) transition below.
            b = Blockquote()
            return parser.make_children(b)

        def make_paragraph(parser):
            # A Paragraph containing PDFReader.Textelement object will
            # render these as <span> objects (the default rendering). A
            # PDFReader.Textbox object containing same will render
            # unstyled Textelements as plain strings, cutting down on
            # unnecessary <span> elements. However, these themselves
            # render with unnecessary @style and @class attributes,
            # which we don't want. For now, let's stick with Paragraphs
            # as containers and maybe later figure out how to get
            # PDFReader.Textelements to render themselves sanely.
            #
            # p = parser.reader.next()
            p = Paragraph(parser.reader.next())
            return p

        def make_datum(parser):
            datestr = str(parser.reader.next()).strip()
            year = int(datestr.split("-")[0])
            if 2100 > year > 1970:
                # only the first plausible date is used; stop looking
                parser.remove_recognizer(is_datum)
                d = [datestr]
                return Meta(d, predicate=RPUBL.avgorandedatum,
                            datatype=XSD.date)
            else:
                self.log.warning("Year in %s doesn't look valid", datestr)
                return None

        def make_dnr(parser):
            # only the first match is used; stop looking
            parser.remove_recognizer(is_dnr)
            ds = str(parser.reader.next()).strip().split(" ")
            return Meta(ds, predicate=RPUBL.diarienummer)

        def skip_nonessential(parser):
            parser.reader.next()  # return nothing

        p = FSMParser()
        p.initial_state = "body"
        p.initial_constructor = make_body
        p.set_recognizers(is_datum,
                          is_dnr,
                          is_nonessential,
                          is_heading,
                          is_abstract,
                          is_section,
                          is_normal,
                          is_blockquote,
                          is_paragraph)
        # (state, recognizer) -> (constructor, newstate); a constructor
        # of False is used where the current element should end.
        p.set_transitions({("body", is_heading): (make_heading, None),
                           ("body", is_nonessential): (skip_nonessential, None),
                           ("body", is_datum): (make_datum, None),
                           ("body", is_dnr): (make_dnr, None),
                           ("body", is_abstract): (make_abstract, "abstract"),
                           ("body", is_section): (make_section, "section"),
                           ("body", is_blockquote): (make_blockquote, "blockquote"),
                           ("body", is_paragraph): (make_paragraph, None),
                           ("abstract", is_paragraph): (make_paragraph, None),
                           ("abstract", is_section): (False, None),
                           ("abstract", is_dnr): (False, None),
                           ("abstract", is_datum): (False, None),
                           ("section", is_paragraph): (make_paragraph, None),
                           ("section", is_nonessential): (skip_nonessential, None),
                           ("section", is_section): (False, None),
                           ("section", is_blockquote): (make_blockquote, "blockquote"),
                           ("section", is_datum): (make_datum, None),
                           ("section", is_dnr): (make_dnr, None),
                           ("blockquote", is_blockquote): (make_paragraph, None),
                           ("blockquote", is_nonessential): (skip_nonessential,  None),
                           ("blockquote", is_section): (False, None),
                           ("blockquote", is_normal): (False, None),
                           ("blockquote", is_datum): (make_datum, None),
                           ("blockquote", is_dnr): (make_dnr, None),
                           })
        p.debug = os.environ.get('FERENDA_FSMDEBUG', False)
        return p.parse
Exemplo n.º 4
0
    def get_parser(self, basefile, sanitized_body, parseconfig="default"):
        """Build an FSMParser for a tree of HTML elements and return its parse method.

        Chunks are expected to expose ``name``, ``children`` and
        ``get_text()``. ``h3`` elements start sections; ``p`` elements
        whose first child is ``em`` / ``u`` start subsections /
        subsubsections; everything else becomes a paragraph.

        :param basefile: not used by this implementation
        :param sanitized_body: not used by this implementation
        :param parseconfig: not used by this implementation
        :returns: the bound ``parse`` method of the configured FSMParser
        """
        # a typical decision structure:

        # [h1] Justitiekanslerns beslut
        #    ... text ...
        #    [h2] Ärendet (h3)
        #        [h3] Bakgrund (p/em)
        #        ... text ...
        #        [h3] Anspråket
        #        ... text ...
        #        [h3 class="reglering"] Rättslig reglering m.m. (p/strong)
        #    [h2] Justitiekanslerns bedömning
        #        [h3] Skadestånd
        #        [h3] Tillsyn
        def first_child_name(chunk):
            # Name of the chunk's first child, or None for a childless
            # chunk (indexing an empty child list would raise).
            first = next(iter(chunk.children), None)
            if first is None:
                return None
            return first.name

        def is_section(parser):
            return parser.reader.peek().name == "h3"

        def is_subsection(parser):
            chunk = parser.reader.peek()
            return chunk.name == "p" and first_child_name(chunk) == "em"

        def is_special_subsection(parser):
            chunk = parser.reader.peek()
            return chunk.name == "p" and first_child_name(chunk) == "strong"

        def is_subsubsection(parser):
            chunk = parser.reader.peek()
            return chunk.name == "p" and first_child_name(chunk) == "u"

        def is_paragraph(parser):
            return True

        @newstate('body')
        def make_body(parser):
            return parser.make_children(Body())

        @newstate('section')
        def make_section(parser):
            s = AnonSektion(title=parser.reader.next().get_text())
            return parser.make_children(s)

        @newstate('subsection')
        def make_subsection(parser):
            s = AnonSektion(title=parser.reader.next().get_text())
            return parser.make_children(s)

        @newstate('special_subsection')
        def make_special_subsection(parser):
            s = AnonSektion(title=parser.reader.next().get_text())
            return parser.make_children(s)

        @newstate('subsubsection')
        def make_subsubsection(parser):
            s = AnonSektion(title=parser.reader.next().get_text())
            return parser.make_children(s)

        def make_paragraph(parser):
            # FIXME: this strips out formatting tags NB: Now this is a
            # SFS stycke that has fragment_label, id/uri and other
            # crap. Let's see if it still works!
            return AnonStycke([parser.reader.next().get_text()])

        p = FSMParser()
        # NOTE(review): is_special_subsection has transitions below but
        # is not registered here, and "special_subsection" is missing
        # from the paragraph states at the end of the table. Registering
        # it would change parse results, so it is left as found --
        # confirm whether this is intentional.
        p.set_recognizers(is_section, is_subsection, is_subsubsection,
                          is_paragraph)
        # (state(s), recognizer) -> (constructor, newstate); a constructor
        # of False is used where the current element should end.
        p.set_transitions({
            ("body", is_section): (make_section, "section"),
            ("section", is_section): (False, None),
            ("section", is_subsection): (make_subsection, "subsection"),
            ("section", is_special_subsection):
            (make_special_subsection, "special_subsection"),
            ("subsection", is_section): (False, None),
            ("subsection", is_subsection): (False, None),
            ("subsection", is_special_subsection): (False, None),
            # uses the constructor declared for the state being entered
            # (was make_subsection, which builds the same element)
            ("subsection", is_subsubsection):
            (make_subsubsection, "subsubsection"),
            ("special_subsection", is_section): (False, None),
            ("special_subsection", is_subsection): (False, None),
            ("special_subsection", is_subsubsection):
            (make_subsubsection, "subsubsection"),
            ("subsubsection", is_section): (False, None),
            ("subsubsection", is_special_subsection): (False, None),
            ("subsubsection", is_subsection): (False, None),
            ("subsubsection", is_subsubsection): (False, None),
            (("body", "section", "subsection", "subsubsection"), is_paragraph):
            (make_paragraph, None)
        })
        p.initial_state = "body"
        p.initial_constructor = make_body
        p.debug = os.environ.get('FERENDA_FSMDEBUG', False)
        return p.parse
Exemplo n.º 5
0
    def get_parser():
        """Build and return an FSMParser for a stream of HTML element objects.

        Header elements (``html.H1`` .. ``html.H4``) whose text starts
        with an ordinal such as "1", "1.2" or "1.2.3." open sections,
        subsections and subsubsections; headers with certain fixed
        titles open preamble sections; every other chunk is passed
        through unchanged.

        :returns: the configured FSMParser instance (not its ``parse``
                  method)
        """

        # RECOGNIZERS -- each peeks at the next chunk without consuming
        # it and returns a truthy value on a match.
        def is_header(parser):
            chunk = parser.reader.peek()
            # exact type match, subclasses are deliberately not considered
            return type(chunk) in (html.H1, html.H2, html.H3, html.H4)

        def is_preamblesection(parser):
            if not is_header(parser):
                return False
            chunk = parser.reader.peek()
            return chunk.as_plaintext().lower() in ("abstract",
                                                    "status of this document",
                                                    "table of contents",
                                                    "appendices")

        def is_preambleending(parser):
            chunk = parser.reader.peek()

            return type(chunk) in (html.HR,)

        def is_section(parser):
            if not is_header(parser):
                return False
            chunk = parser.reader.peek()
            (ordinal, title) = analyze_sectionstart(chunk.as_plaintext())
            return section_segments_count(ordinal) == 1

        def is_subsection(parser):
            if not is_header(parser):
                return False
            chunk = parser.reader.peek()
            (ordinal, title) = analyze_sectionstart(chunk.as_plaintext())
            return section_segments_count(ordinal) == 2

        def is_subsubsection(parser):
            if not is_header(parser):
                return False
            chunk = parser.reader.peek()
            (ordinal, title) = analyze_sectionstart(chunk.as_plaintext())
            return section_segments_count(ordinal) == 3

        def is_other(parser, chunk=None):
            return True

        # CONSTRUCTORS -- each consumes what it needs from the reader of
        # the parser it is handed and returns the element it built.
        def make_body(parser):
            return parser.make_children(Body())
        setattr(make_body, 'newstate', 'body')

        def make_preamble_section(parser):
            s = PreambleSection(title=parser.reader.next().as_plaintext())
            return parser.make_children(s)
        setattr(make_preamble_section, 'newstate', 'preamblesection')

        def make_other(parser):
            # pass the chunk through untouched
            return parser.reader.next()

        def make_section(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next().as_plaintext())
            s = Section(ordinal=secnumber, title=title, uri=None, meta=None)
            return parser.make_children(s)
        setattr(make_section, 'newstate', 'section')

        def make_subsection(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next().as_plaintext())
            s = Subsection(ordinal=secnumber, title=title, uri=None, meta=None)
            return parser.make_children(s)
        setattr(make_subsection, 'newstate', 'subsection')

        def make_subsubsection(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next().as_plaintext())
            s = Subsubsection(ordinal=secnumber, title=title, uri=None, meta=None)
            return parser.make_children(s)
        setattr(make_subsubsection, 'newstate', 'subsubsection')

        # Some helpers for the above
        def section_segments_count(s):
            # Number of non-empty dot-separated segments in an ordinal
            # ("1.2." -> 2), or False when there is no ordinal at all.
            return ((s is not None) and
                    len(list(filter(None, s.split(".")))))

        # Matches
        # "1 Blahonga" => ("1","Blahonga")
        # "1.2.3. This is a subsubsection" => ("1.2.3", "This is a subsubsection")
        re_sectionstart = re.compile(r"^(\d[\.\d]*) +(.*[^\.])$").match

        def analyze_sectionstart(chunk):
            # returns (ordinal, title) if chunk looks like a numbered
            # heading, (None, chunk) otherwise
            m = re_sectionstart(chunk)
            if m:
                return (m.group(1).rstrip("."), m.group(2))
            else:
                return (None, chunk)

        p = FSMParser()

        p.set_recognizers(is_section,
                          is_subsection,
                          is_subsubsection,
                          is_preamblesection,
                          is_preambleending,
                          is_header,
                          is_other)
        commonstates = ("body", "preamblesection", "section", "subsection", "subsubsection")
        # (state(s), recognizer) -> (constructor, newstate); a constructor
        # of False is used where the current element should end.
        p.set_transitions(
            {("body", is_preamblesection): (make_preamble_section, "preamblesection"),
             ("preamblesection", is_preamblesection): (False, None),
             ("preamblesection", is_preambleending): (False, None),
             ("preamblesection", is_section): (False, None),
             ("body", is_section): (make_section, "section"),
             (commonstates, is_other): (make_other, None),
             ("section", is_subsection): (make_subsection, "subsection"),
             ("section", is_section): (False, None),
             ("subsection", is_subsubsection): (make_subsubsection, "subsubsection"),
             ("subsection", is_subsection): (False, None),
             ("subsection", is_section): (False, None),
             ("subsubsection", is_subsubsection): (False, None),
             ("subsubsection", is_subsection): (False, None),
             ("subsubsection", is_section): (False, None),
             })
        p.initial_state = "body"
        p.initial_constructor = make_body
        return p
Exemplo n.º 6
0
    def get_parser(self, basefile, sanitized, parseconfig="default"):
        def is_heading(parser):
            """Recognize the next chunk as a heading if its font size is 17."""
            upcoming = parser.reader.peek()
            return upcoming.font.size == 17

        def is_dnr(parser):
            """Recognize a size-12 chunk whose text starts like "123-2010".

            Returns True on a match and None otherwise.
            """
            chunk = parser.reader.peek()
            # raw string: "\d" is not a valid escape in a plain literal
            if (chunk.font.size == 12 and re.match(r'\d+-\d{2,4}', str(chunk))):
                return True

        def is_datum(parser):
            chunk = parser.reader.peek()
            if (chunk.font.size == 12
                    and re.match('\d{4}-\d{2}-\d{2}', str(chunk))):
                return True

        def is_nonessential(parser):
            chunk = parser.reader.peek()
            if chunk.top >= 1159 or chunk.top <= 146:
                return True

        def is_abstract(parser):
            if str(parser.reader.peek()).startswith("Beslutet i korthet:"):
                return True

        def is_section(parser):
            chunk = parser.reader.peek()
            strchunk = str(chunk)
            if chunk.font.size == 14 and chunk[
                    0].tag == "b" and not strchunk.endswith("."):
                return True

        def is_blockquote(parser):
            chunk = parser.reader.peek()
            if chunk.left >= 255:
                return True

        def is_normal(parser):
            chunk = parser.reader.peek()
            if chunk.left < 255:
                return True

        def is_paragraph(parser):
            return True

        @decorators.newstate("body")
        def make_body(parser):
            return parser.make_children(Body())

        def make_heading(parser):
            # h = Heading(str(parser.reader.next()).strip())
            h = Meta([str(parser.reader.next()).strip()],
                     predicate=DCTERMS.title,
                     lang="sv")
            return h

        @decorators.newstate("abstract")
        def make_abstract(parser):
            a = Abstract([Paragraph(parser.reader.next())])
            return parser.make_children(a)

        @decorators.newstate("section")
        def make_section(parser):
            s = UnorderedSection(title=str(parser.reader.next()).strip())
            return parser.make_children(s)

        @decorators.newstate("blockquote")
        def make_blockquote(parser):
            b = Blockquote()
            return parser.make_children(b)

        def make_paragraph(parser):
            # A Paragraph containing PDFReader.Textelement object will
            # render these as <span> objects (the default rendering. A
            # PDFReader.Textbox object containing same will render
            # unstyled Textelements as plain strings, cutting down on
            # unneccesary <span> elements. However, these themselves
            # render with unneccessary @style and @class attributes,
            # which we don't want. For now, lets stick with Paragraphs
            # as containers and maybe later figure out how to get
            # PDFReader.Textelements to render themselves sanely.
            #
            # p = parser.reader.next()
            p = Paragraph(parser.reader.next())
            return p

        def make_datum(parser):
            datestr = str(parser.reader.next()).strip()
            year = int(datestr.split("-")[0])
            if 2100 > year > 1970:
                parser.remove_recognizer(is_datum)
                d = [datestr]
                return Meta(d,
                            predicate=RPUBL.avgorandedatum,
                            datatype=XSD.date)
            else:
                self.log.warning("Year in %s doesn't look valid" % datestr)
                return None

        def make_dnr(parser):
            parser.remove_recognizer(is_dnr)
            ds = [x for x in str(parser.reader.next()).strip().split(" ")]
            return Meta(ds, predicate=RPUBL.diarienummer)

        def skip_nonessential(parser):
            parser.reader.next()  # return nothing

        p = FSMParser()
        p.initial_state = "body"
        p.initial_constructor = make_body
        p.set_recognizers(is_datum, is_dnr, is_nonessential, is_heading,
                          is_abstract, is_section, is_normal, is_blockquote,
                          is_paragraph)
        p.set_transitions({
            ("body", is_heading): (make_heading, None),
            ("body", is_nonessential): (skip_nonessential, None),
            ("body", is_datum): (make_datum, None),
            ("body", is_dnr): (make_dnr, None),
            ("body", is_abstract): (make_abstract, "abstract"),
            ("body", is_section): (make_section, "section"),
            ("body", is_blockquote): (make_blockquote, "blockquote"),
            ("body", is_paragraph): (make_paragraph, None),
            ("abstract", is_paragraph): (make_paragraph, None),
            ("abstract", is_section): (False, None),
            ("abstract", is_dnr): (False, None),
            ("abstract", is_datum): (False, None),
            ("section", is_paragraph): (make_paragraph, None),
            ("section", is_nonessential): (skip_nonessential, None),
            ("section", is_section): (False, None),
            ("section", is_blockquote): (make_blockquote, "blockquote"),
            ("section", is_datum): (make_datum, None),
            ("section", is_dnr): (make_dnr, None),
            ("blockquote", is_blockquote): (make_paragraph, None),
            ("blockquote", is_nonessential): (skip_nonessential, None),
            ("blockquote", is_section): (False, None),
            ("blockquote", is_normal): (False, None),
            ("blockquote", is_datum): (make_datum, None),
            ("blockquote", is_dnr): (make_dnr, None),
        })
        p.debug = os.environ.get('FERENDA_FSMDEBUG', False)
        return p.parse
Exemplo n.º 7
0
but by the catch-all is_paragraph. The recognizers are run in the
order specified by FSMParser.set_transitions().

    This is a preformatted section.
        It could be used for source code,
    +-------------------+
    |   line drawings   |
    +-------------------+
        or what have                 you.

Second section
==============

The above new section implicitly closed the first section which we
were in. This was made explicit by the last transition rule, which
stated that any time a section is encountered while in the "section"
state, we should not create any more children (False) but instead
return to our previous state (which in this case is "body", but for a
more complex language could be any number of states)."""

# Configure the parser with recognizers and a transition table; the
# names used here are defined outside this excerpt.
p = FSMParser()
p.set_recognizers(is_section, is_preformatted, is_paragraph)
p.set_transitions(transitions)
# Parsing starts in the "body" state, using make_body as the initial
# constructor.
p.initial_constructor = make_body
p.initial_state = "body"
# The input text is split into chunks on blank lines before parsing.
body = p.parse(text.split("\n\n"))
# print(elements.serialize(body))

# end main
# The serialized parse result is left in a module-level name.
return_value = elements.serialize(body)
Exemplo n.º 8
0
but by the catch-all is_paragraph. The recognizers are run in the
order specified by FSMParser.set_transitions().

    This is a preformatted section.
        It could be used for source code,
    +-------------------+
    |   line drawings   |
    +-------------------+
        or what have                 you.

Second section
==============

The above new section implicitly closed the first section which we
were in. This was made explicit by the last transition rule, which
stated that any time a section is encountered while in the "section"
state, we should not create any more children (False) but instead
return to our previous state (which in this case is "body", but for a
more complex language could be any number of states)."""

# Set up the state machine: which recognizers to try, and what each
# (state, recognizer) pair maps to. All names referenced here come
# from outside this excerpt.
p = FSMParser()
p.set_recognizers(is_section, is_preformatted, is_paragraph)
p.set_transitions(transitions)
# Start in the "body" state with make_body as the first constructor.
p.initial_constructor = make_body
p.initial_state = "body"
# Blank lines delimit the chunks handed to the parser.
body = p.parse(text.split("\n\n"))
# print(elements.serialize(body))

# end main
# Keep the serialized result available under a module-level name.
return_value = elements.serialize(body)
Exemplo n.º 9
0
    def get_parser(basefile="0"):
        """Build and return an FSMParser for plaintext RFC documents.

        Recognizers, constructors and helpers are created as nested
        ordinary functions, but could just as well be staticmethods
        (or module-global functions).

        :param basefile: value interpolated into every section
                         identifier ("RFC <basefile>, section <ordinal>")
        :returns: the configured FSMParser instance (parsing is not
                  started here)
        """

        # -- RECOGNIZERS --
        # Each takes an optional ``chunk``; when it is missing (or
        # empty) the next chunk is peeked from the parser's reader.

        def is_rfcheader(parser, chunk=None, lenient=True):
            if not chunk:
                chunk = parser.reader.peek()
            (leftlines, rightlines, linelens) = _splitcolumns(chunk)
            # all rfc headers are at least 2 lines long (eg. rfc 889)
            if len(linelens) < 2:
                return False
            targetlen = linelens[0]
            # Most modern RFC has justified right margin (which is what
            # the non-lenient test targets) but some older RFCs (like
            # 889) have ragged right margin (or rather left-justified
            # two columns). However, since make_rfcheader checks next
            # chunk as well (if there is a spurious double newline
            # right in the middle of the header, which is a thing that
            # has happened (RFC 6912)), this recognizer has a lenient
            # and a non-lenient mode.
            for (idx, length) in enumerate(linelens):
                if rightlines[idx] == "" and length > 40:
                    return False
                elif rightlines[idx] != "" and length != targetlen and not lenient:
                    return False
            return True

        # FIXME: use this in parse_header as well
        def _splitcolumns(chunk):
            # Split every line at the first run of three spaces into a
            # left and a right column; also record the full line length.
            linelens = []
            leftlines = []
            rightlines = []
            for line in chunk.split("\n"):
                linelens.append(len(line))
                if "   " in line:
                    (left, right) = line.split("   ", 1)
                else:
                    (left, right) = line, ""
                leftlines.append(left)
                rightlines.append(right)
            return (leftlines, rightlines, linelens)

        def is_doctitle(parser, chunk=None):
            return True

        def is_pagebreak(parser, chunk=None):
            if not chunk:
                chunk = parser.reader.peek()
            return ('\f' in chunk)

        def is_header(parser, chunk=None):
            if not chunk:
                chunk = parser.reader.peek()
            stripchunk = chunk.strip()
            # a header should be non-empty, be on a single line, not
            # end with "." and not start with an indent.
            return ((stripchunk != "") and
                    (len(stripchunk.split("\n")) == 1) and
                    (not stripchunk.endswith('.')) and
                    (not chunk.startswith(' ')))

        def is_section(parser, chunk=None):
            (ordinal, title, identifier) = analyze_sectionstart(parser, chunk)
            return section_segments_count(ordinal) == 1

        def is_subsection(parser, chunk=None):
            (ordinal, title, identifier) = analyze_sectionstart(parser, chunk)
            return section_segments_count(ordinal) == 2

        def is_subsubsection(parser, chunk=None):
            (ordinal, title, identifier) = analyze_sectionstart(parser, chunk)
            return section_segments_count(ordinal) == 3

        def is_preformatted(parser, chunk=None):
            if not chunk:
                chunk = parser.reader.peek()
            # all paragraphs start with a three space indent -- start
            # by removing this
            stripped = "\n".join([x[3:] for x in chunk.split("\n")])
            # replace double spaces after end of sentences to avoid
            # false positives:
            stripped = stripped.replace(".  ", ". ")
            # If any double spaces left, probably preformatted text
            # (eg. tables etc). Same if several periods are present
            # (indicative of leaders in TOCs)
            return ("  " in stripped or
                    "...." in stripped or
                    ". . . " in stripped)

        def is_bnf(parser, chunk=None):
            if not chunk:
                chunk = parser.reader.peek()
            # Evaluated for explicitly passed chunks as well as peeked
            # ones (the return must not be nested under the ``if``).
            return (is_preformatted(parser, chunk) and " = " in chunk)

        def is_paragraph(parser, chunk=None):
            return True

        def is_ul_listitem(parser, chunk=None):
            if not chunk:
                chunk = parser.reader.peek()
            return chunk.strip().startswith("o  ")

        def is_definition_title(parser, chunk=None):
            # looks like header but starts indented
            return False

        def is_definition(parser, chunk=None):
            # entire p is indented 6 spaces instead of 3. But if it
            # follows a ul li, probably continuation of that.
            return False

        # -- CONSTRUCTORS --
        # A ``newstate`` attribute is attached to every constructor
        # that goes on to build children via make_children().

        def make_body(parser):
            return parser.make_children(Body())
        make_body.newstate = 'body'

        def make_preamble_section(parser):
            s = PreambleSection(title=parser.reader.next())
            return parser.make_children(s)
        make_preamble_section.newstate = 'preamble-section'

        # used for older rfcs
        def make_abstract(parser):
            # does not consume a chunk: the untitled text becomes the
            # first child of a synthetic "(Abstract)" section
            s = PreambleSection(title="(Abstract)")
            return parser.make_children(s)
        make_abstract.newstate = 'preamble-section'

        def skip_pagebreak(parser):
            chunk = parser.reader.next()
            lastline = chunk.split("\n")[-1]
            # columns on the last line are separated by 2+ spaces; the
            # middle one is used as the short title
            parts = re.split("  +", lastline)
            if len(parts) > 2:
                return Pagebreak(shorttitle=parts[1])
            else:
                return None

        def make_header(parser):
            chunk = parser.reader.next()
            return Heading(chunk.strip())

        def make_paragraph(parser):
            chunk = parser.reader.next()
            # collapse all runs of whitespace (incl. newlines)
            return Paragraph([" ".join(chunk.split())])

        def make_preformatted(parser):
            chunk = parser.reader.next()
            return Preformatted([chunk])

        def make_bnf(parser):
            chunk = parser.reader.next()
            return Preformatted([chunk], **{'class': 'bnf'})

        def make_section(parser):
            (secnumber, title, identifier) = analyze_sectionstart(parser, parser.reader.next())
            s = Section(ordinal=secnumber,
                        title=title,
                        identifier=identifier)
            return parser.make_children(s)
        make_section.newstate = 'section'

        def make_subsection(parser):
            (secnumber, title, identifier) = analyze_sectionstart(parser, parser.reader.next())
            s = Subsection(ordinal=secnumber,
                           title=title,
                           identifier=identifier)
            return parser.make_children(s)
        make_subsection.newstate = 'subsection'

        def make_subsubsection(parser):
            (secnumber, title, identifier) = analyze_sectionstart(parser, parser.reader.next())
            s = Subsubsection(ordinal=secnumber,
                              title=title,
                              identifier=identifier)
            return parser.make_children(s)
        make_subsubsection.newstate = 'subsubsection'

        def make_unordered_list(parser):
            # only peek: the first item is consumed by make_listitem
            (listtype, ordinal, separator, rest) = analyze_listitem(parser.reader.peek())
            ol = UnorderedList(type=listtype)
            ol.append(parser.make_child(make_listitem, "listitem"))
            return parser.make_children(ol)
        # NOTE(review): the transition table names this state "ul-list",
        # not 'unorderedlist' -- confirm which one FSMParser honours.
        make_unordered_list.newstate = 'unorderedlist'

        def make_listitem(parser):
            chunk = parser.reader.next()
            (listtype, ordinal, separator, rest) = analyze_listitem(chunk)
            li = ListItem(ordinal=ordinal)
            li.append(rest)
            return parser.make_children(li)
        make_listitem.newstate = 'listitem'

        def make_rfcheader(parser):
            headerchunk = parser.reader.next()
            # a strictly header-shaped next chunk is taken to be the
            # second half of a header split by a stray blank line
            if is_rfcheader(parser, lenient=False):
                headerchunk += "\n" + parser.reader.next()
            return RFCHeader(headerchunk)

        def make_doctitle(parser):
            return DocTitle(parser.reader.next())

        # Some helpers for the above
        def section_segments_count(s):
            # number of non-empty dot-separated segments; False for None
            return ((s is not None) and
                    len(list(filter(None, s.split(".")))))

        # Matches
        # "1 Blahonga" => ("1","Blahonga", "RFC 1234, section 1")
        # "1.2.3. This is a subsubsection" => ("1.2.3", "This is a subsection", "RFC 1234, section 1.2.3")
        # "   Normal paragraph" => (None, "   Normal paragraph", None)
        re_sectionstart = re.compile(r"^(\d[\.\d]*) +(.*[^\.])$").match

        def analyze_sectionstart(parser, chunk=None):
            if not chunk:
                chunk = parser.reader.peek()
            m = re_sectionstart(chunk)
            if m:
                ordinal = m.group(1).rstrip(".")
                title = m.group(2)
                identifier = "RFC %s, section %s" % (basefile, ordinal)
                return (ordinal, title, identifier)
            else:
                return (None, chunk, None)

        def analyze_listitem(chunk):
            # returns: same as list-style-type in CSS2.1, sans
            # 'georgian', 'armenian' and 'greek', plus 'dashed'
            listtype = ordinal = separator = None

            # FIXME: Tighten these patterns to RFC conventions
            # match "1. Foo..." or "14) bar..." but not "4 This is a heading"
            if chunk.startswith("   o  "):
                return ("disc", None, None, chunk[6:])

            return (listtype, ordinal, separator, chunk)  # None * 3

        p = FSMParser()

        # NOTE(review): is_bnf appears in the transition table below but
        # is not registered here -- confirm whether it is ever consulted.
        p.set_recognizers(is_pagebreak,
                          is_rfcheader,
                          is_doctitle,
                          is_section,
                          is_subsection,
                          is_subsubsection,
                          is_header,
                          is_ul_listitem,
                          is_preformatted,
                          is_definition_title,
                          is_definition,
                          is_paragraph)
        # start_state: "body" or "rfcheader", then "title", then
        # "preamble" (consisting of preamblesections that has title
        # (eg "Abstract", "Status of This Memo" + content), then "section".
        commonstates = ("section", "subsection", "subsubsection")
        p.set_transitions({("body", is_rfcheader): (make_rfcheader, "doctitle"),
                           ("doctitle", is_doctitle): (make_doctitle, "preamble"),
                           ("preamble", is_header): (make_preamble_section, "preamble-section"),
                           ("preamble", is_paragraph): (make_abstract, "preamble-section"),
                           ("preamble-section", is_paragraph): (make_paragraph, None),
                           ("preamble-section", is_header): (False, None),
                           ("preamble-section", is_pagebreak): (skip_pagebreak, None),
                           ("preamble-section", is_section): (False, "after-preamble"),
                           ("after-preamble", is_section): (make_section, "section"),
                           ("section", is_subsection): (make_subsection, "subsection"),
                           ("section", is_section): (False, None),
                           ("subsection", is_subsubsection): (make_subsubsection, "subsubsection"),
                           ("subsection", is_subsection): (False, None),
                           ("subsection", is_section): (False, None),
                           ("subsubsection", is_subsubsection): (False, None),
                           ("subsubsection", is_subsection): (False, None),
                           ("subsubsection", is_section): (False, None),
                           (commonstates, is_ul_listitem): (make_unordered_list, "ul-list"),
                           ("ul-list", is_ul_listitem): (make_listitem, "listitem"),
                           ("ul-list", is_paragraph): (False, None),
                           ("listitem", is_paragraph): (False, None),
                           (commonstates, is_bnf): (make_bnf, None),
                           (commonstates, is_preformatted): (make_preformatted, None),
                           (commonstates, is_paragraph): (make_paragraph, None),
                           (commonstates, is_pagebreak): (skip_pagebreak, None),
                           })
        p.initial_state = "body"
        p.initial_constructor = make_body
        return p
Exemplo n.º 10
0
Arquivo: rfc.py Projeto: zigit/ferenda
    def get_parser(basefile="0"):
        """Create an FSMParser wired up for plaintext RFC documents.

        Recognizers, constructors and helpers are created as nested
        ordinary functions, but could just as well be staticmethods
        (or module-global functions).

        :param basefile: inserted into each section identifier, which
                         has the form "RFC <basefile>, section <ordinal>"
        :returns: the configured FSMParser object; the caller is
                  responsible for invoking it
        """

        # -- RECOGNIZERS --
        # ``chunk`` is optional everywhere: a missing (or empty) value
        # makes the recognizer peek at the reader's next chunk.

        def is_rfcheader(parser, chunk=None, lenient=True):
            if not chunk:
                chunk = parser.reader.peek()
            (leftlines, rightlines, linelens) = _splitcolumns(chunk)
            # all rfc headers are at least 2 lines long (eg. rfc 889)
            if len(linelens) < 2:
                return False
            targetlen = linelens[0]
            # Most modern RFC has justified right margin (which is what
            # the non-lenient test targets) but some older RFCs (like
            # 889) have ragged right margin (or rather left-justified
            # two columns). However, since make_rfcheader checks next
            # chunk as well (if there is a spurious double newline
            # right in the middle of the header, which is a thing that
            # has happened (RFC 6912)), this recognizer has a lenient
            # and a non-lenient mode.
            for (idx, length) in enumerate(linelens):
                if rightlines[idx] == "" and length > 40:
                    return False
                elif rightlines[
                        idx] != "" and length != targetlen and not lenient:
                    return False
            return True

        # FIXME: use this in parse_header as well
        def _splitcolumns(chunk):
            # For each line: its length, plus the text before and after
            # the first run of three spaces ("" when there is none).
            linelens = []
            leftlines = []
            rightlines = []
            for line in chunk.split("\n"):
                linelens.append(len(line))
                if "   " in line:
                    (left, right) = line.split("   ", 1)
                else:
                    (left, right) = line, ""
                leftlines.append(left)
                rightlines.append(right)
            return (leftlines, rightlines, linelens)

        def is_doctitle(parser, chunk=None):
            return True

        def is_pagebreak(parser, chunk=None):
            if not chunk:
                chunk = parser.reader.peek()
            return ('\f' in chunk)

        def is_header(parser, chunk=None):
            if not chunk:
                chunk = parser.reader.peek()
            stripchunk = chunk.strip()
            # a header should be non-empty, be on a single line, not
            # end with "." and not start with an indent.
            return ((stripchunk != "") and (len(stripchunk.split("\n")) == 1)
                    and (not stripchunk.endswith('.'))
                    and (not chunk.startswith(' ')))

        def is_section(parser, chunk=None):
            (ordinal, title, identifier) = analyze_sectionstart(parser, chunk)
            return section_segments_count(ordinal) == 1

        def is_subsection(parser, chunk=None):
            (ordinal, title, identifier) = analyze_sectionstart(parser, chunk)
            return section_segments_count(ordinal) == 2

        def is_subsubsection(parser, chunk=None):
            (ordinal, title, identifier) = analyze_sectionstart(parser, chunk)
            return section_segments_count(ordinal) == 3

        def is_preformatted(parser, chunk=None):
            if not chunk:
                chunk = parser.reader.peek()
            # all paragraphs start with a three space indent -- start
            # by removing this
            stripped = "\n".join([x[3:] for x in chunk.split("\n")])
            # replace double spaces after end of sentences to avoid
            # false positives:
            stripped = stripped.replace(".  ", ". ")
            # If any double spaces left, probably preformatted text
            # (eg. tables etc). Same if several periods are present
            # (indicative of leaders in TOCs)
            return ("  " in stripped or "...." in stripped
                    or ". . . " in stripped)

        def is_bnf(parser, chunk=None):
            if not chunk:
                chunk = parser.reader.peek()
            # The result must be computed whether the chunk was passed
            # in or peeked, so the return sits outside the ``if``.
            return (is_preformatted(parser, chunk) and " = " in chunk)

        def is_paragraph(parser, chunk=None):
            return True

        def is_ul_listitem(parser, chunk=None):
            if not chunk:
                chunk = parser.reader.peek()
            return chunk.strip().startswith("o  ")

        def is_definition_title(parser, chunk=None):
            # looks like header but starts indented
            return False

        def is_definition(parser, chunk=None):
            # entire p is indented 6 spaces instead of 3. But if it
            # follows a ul li, probably continuation of that.
            return False

        # -- CONSTRUCTORS --
        # Constructors that build children through make_children() get
        # a ``newstate`` attribute attached right after their definition.

        def make_body(parser):
            return parser.make_children(Body())

        make_body.newstate = 'body'

        def make_preamble_section(parser):
            s = PreambleSection(title=parser.reader.next())
            return parser.make_children(s)

        make_preamble_section.newstate = 'preamble-section'

        # used for older rfcs
        def make_abstract(parser):
            # no chunk is consumed here; the untitled paragraph ends up
            # as a child of a synthetic "(Abstract)" section
            s = PreambleSection(title="(Abstract)")
            return parser.make_children(s)

        make_abstract.newstate = 'preamble-section'

        def skip_pagebreak(parser):
            chunk = parser.reader.next()
            lastline = chunk.split("\n")[-1]
            # the last line is split on runs of 2+ spaces; the second
            # field becomes the short title
            parts = re.split("  +", lastline)
            if len(parts) > 2:
                return Pagebreak(shorttitle=parts[1])
            else:
                return None

        def make_header(parser):
            chunk = parser.reader.next()
            return Heading(chunk.strip())

        def make_paragraph(parser):
            chunk = parser.reader.next()
            # normalize every whitespace run (newlines too) to one space
            return Paragraph([" ".join(chunk.split())])

        def make_preformatted(parser):
            chunk = parser.reader.next()
            return Preformatted([chunk])

        def make_bnf(parser):
            chunk = parser.reader.next()
            return Preformatted([chunk], **{'class': 'bnf'})

        def make_section(parser):
            (secnumber, title,
             identifier) = analyze_sectionstart(parser, parser.reader.next())
            s = Section(ordinal=secnumber, title=title, identifier=identifier)
            return parser.make_children(s)

        make_section.newstate = 'section'

        def make_subsection(parser):
            (secnumber, title,
             identifier) = analyze_sectionstart(parser, parser.reader.next())
            s = Subsection(ordinal=secnumber,
                           title=title,
                           identifier=identifier)
            return parser.make_children(s)

        make_subsection.newstate = 'subsection'

        def make_subsubsection(parser):
            (secnumber, title,
             identifier) = analyze_sectionstart(parser, parser.reader.next())
            s = Subsubsection(ordinal=secnumber,
                              title=title,
                              identifier=identifier)
            return parser.make_children(s)

        make_subsubsection.newstate = 'subsubsection'

        def make_unordered_list(parser):
            # peek only -- make_listitem consumes the first item
            (listtype, ordinal, separator,
             rest) = analyze_listitem(parser.reader.peek())
            ol = UnorderedList(type=listtype)
            ol.append(parser.make_child(make_listitem, "listitem"))
            return parser.make_children(ol)

        # NOTE(review): transitions below call this state "ul-list"
        # rather than 'unorderedlist' -- confirm which name is in effect.
        make_unordered_list.newstate = 'unorderedlist'

        def make_listitem(parser):
            chunk = parser.reader.next()
            (listtype, ordinal, separator, rest) = analyze_listitem(chunk)
            li = ListItem(ordinal=ordinal)
            li.append(rest)
            return parser.make_children(li)

        make_listitem.newstate = 'listitem'

        def make_rfcheader(parser):
            headerchunk = parser.reader.next()
            # if the following chunk strictly looks like a header too,
            # treat it as the rest of a header broken by a blank line
            if is_rfcheader(parser, lenient=False):
                headerchunk += "\n" + parser.reader.next()
            return RFCHeader(headerchunk)

        def make_doctitle(parser):
            return DocTitle(parser.reader.next())

        # Some helpers for the above
        def section_segments_count(s):
            # count of non-empty dot-separated parts; False when s is None
            return ((s is not None) and len(list(filter(None, s.split(".")))))

        # Matches
        # "1 Blahonga" => ("1","Blahonga", "RFC 1234, section 1")
        # "1.2.3. This is a subsubsection" => ("1.2.3", "This is a subsection", "RFC 1234, section 1.2.3")
        # "   Normal paragraph" => (None, "   Normal paragraph", None)
        re_sectionstart = re.compile(r"^(\d[\.\d]*) +(.*[^\.])$").match

        def analyze_sectionstart(parser, chunk=None):
            if not chunk:
                chunk = parser.reader.peek()
            m = re_sectionstart(chunk)
            if m:
                ordinal = m.group(1).rstrip(".")
                title = m.group(2)
                identifier = "RFC %s, section %s" % (basefile, ordinal)
                return (ordinal, title, identifier)
            else:
                return (None, chunk, None)

        def analyze_listitem(chunk):
            # returns: same as list-style-type in CSS2.1, sans
            # 'georgian', 'armenian' and 'greek', plus 'dashed'
            listtype = ordinal = separator = None

            # FIXME: Tighten these patterns to RFC conventions
            # match "1. Foo..." or "14) bar..." but not "4 This is a heading"
            if chunk.startswith("   o  "):
                return ("disc", None, None, chunk[6:])

            return (listtype, ordinal, separator, chunk)  # None * 3

        p = FSMParser()

        # NOTE(review): is_bnf is referenced by the transitions below
        # yet is absent from this list -- confirm whether that is
        # intentional.
        p.set_recognizers(is_pagebreak, is_rfcheader, is_doctitle, is_section,
                          is_subsection, is_subsubsection, is_header,
                          is_ul_listitem, is_preformatted, is_definition_title,
                          is_definition, is_paragraph)
        # start_state: "body" or "rfcheader", then "title", then
        # "preamble" (consisting of preamblesections that has title
        # (eg "Abstract", "Status of This Memo" + content), then "section".
        commonstates = ("section", "subsection", "subsubsection")
        p.set_transitions({
            ("body", is_rfcheader): (make_rfcheader, "doctitle"),
            ("doctitle", is_doctitle): (make_doctitle, "preamble"),
            ("preamble", is_header):
            (make_preamble_section, "preamble-section"),
            ("preamble", is_paragraph): (make_abstract, "preamble-section"),
            ("preamble-section", is_paragraph): (make_paragraph, None),
            ("preamble-section", is_header): (False, None),
            ("preamble-section", is_pagebreak): (skip_pagebreak, None),
            ("preamble-section", is_section): (False, "after-preamble"),
            ("after-preamble", is_section): (make_section, "section"),
            ("section", is_subsection): (make_subsection, "subsection"),
            ("section", is_section): (False, None),
            ("subsection", is_subsubsection):
            (make_subsubsection, "subsubsection"),
            ("subsection", is_subsection): (False, None),
            ("subsection", is_section): (False, None),
            ("subsubsection", is_subsubsection): (False, None),
            ("subsubsection", is_subsection): (False, None),
            ("subsubsection", is_section): (False, None),
            (commonstates, is_ul_listitem): (make_unordered_list, "ul-list"),
            ("ul-list", is_ul_listitem): (make_listitem, "listitem"),
            ("ul-list", is_paragraph): (False, None),
            ("listitem", is_paragraph): (False, None),
            (commonstates, is_bnf): (make_bnf, None),
            (commonstates, is_preformatted): (make_preformatted, None),
            (commonstates, is_paragraph): (make_paragraph, None),
            (commonstates, is_pagebreak): (skip_pagebreak, None),
        })
        p.initial_state = "body"
        p.initial_constructor = make_body
        return p
Exemplo n.º 11
0
    def get_parser(self, basefile, sanitized_body, parseconfig="default"):
        """Build the state machine used to structure a decision document.

        Sets up an ``FSMParser`` with the recognizers, constructors and
        transition table defined below and returns its ``parse`` method.

        ``basefile``, ``sanitized_body`` and ``parseconfig`` are accepted
        but are not consulted when the parser is built.
        """
        # a typical decision structure:

        # [h1] Justitiekanslerns beslut
        #    ... text ...
        #    [h2] Ärendet (h3)
        #        [h3] Bakgrund (p/em)
        #        ... text ...
        #        [h3] Anspråket
        #        ... text ...
        #        [h3 class="reglering"] Rättslig reglering m.m. (p/strong)
        #    [h2] Justitiekanslerns bedömning
        #        [h3] Skadestånd
        #        [h3] Tillsyn

        # RECOGNIZERS: each one peeks at the next chunk without consuming it.
        def first_child_name(chunk):
            # Name of the chunk's first child, or None when the chunk has
            # no children at all (indexing an empty child list would
            # otherwise raise IndexError).
            children = list(chunk.children)
            return children[0].name if children else None

        def is_section(parser):
            return parser.reader.peek().name == "h3"

        def is_subsection(parser):
            chunk = parser.reader.peek()
            return chunk.name == "p" and first_child_name(chunk) == "em"

        def is_special_subsection(parser):
            chunk = parser.reader.peek()
            return chunk.name == "p" and first_child_name(chunk) == "strong"

        def is_subsubsection(parser):
            chunk = parser.reader.peek()
            return chunk.name == "p" and first_child_name(chunk) == "u"

        def is_paragraph(parser):
            # Catch-all: anything not claimed by an earlier recognizer.
            return True

        # CONSTRUCTORS: each one consumes the chunk it was selected for.
        @newstate('body')
        def make_body(parser):
            return parser.make_children(Body())

        @newstate('section')
        def make_section(parser):
            s = AnonSektion(title=parser.reader.next().get_text())
            return parser.make_children(s)

        @newstate('subsection')
        def make_subsection(parser):
            s = AnonSektion(title=parser.reader.next().get_text())
            return parser.make_children(s)

        @newstate('special_subsection')
        def make_special_subsection(parser):
            s = AnonSektion(title=parser.reader.next().get_text())
            return parser.make_children(s)

        @newstate('subsubsection')
        def make_subsubsection(parser):
            s = AnonSektion(title=parser.reader.next().get_text())
            return parser.make_children(s)

        def make_paragraph(parser):
            # FIXME: this strips out formatting tags. NB: this is now an
            # SFS stycke that has fragment_label, id/uri and other
            # attributes. Let's see if it still works!
            return AnonStycke([parser.reader.next().get_text()])

        p = FSMParser()
        # NOTE(review): is_special_subsection is not registered here, so
        # the transitions keyed on it below can never be selected and the
        # "special_subsection" state is never entered. Registering it
        # would also require paragraph (and body-level) transitions for
        # that recognizer/state, which the table lacks -- confirm the
        # intended behaviour before enabling it.
        p.set_recognizers(is_section,
                          is_subsection,
                          is_subsubsection,
                          is_paragraph)
        p.set_transitions({
            ("body", is_section): (make_section, "section"),
            ("section", is_section): (False, None),
            ("section", is_subsection): (make_subsection, "subsection"),
            ("section", is_special_subsection): (make_special_subsection, "special_subsection"),
            ("subsection", is_section): (False, None),
            ("subsection", is_subsection): (False, None),
            ("subsection", is_special_subsection): (False, None),
            # Use the constructor that matches the target state (this
            # entry previously named make_subsection).
            ("subsection", is_subsubsection): (make_subsubsection, "subsubsection"),
            ("special_subsection", is_section): (False, None),
            ("special_subsection", is_subsection): (False, None),
            ("special_subsection", is_subsubsection): (make_subsubsection, "subsubsection"),
            ("subsubsection", is_section): (False, None),
            ("subsubsection", is_special_subsection): (False, None),
            ("subsubsection", is_subsection): (False, None),
            ("subsubsection", is_subsubsection): (False, None),
            (("body", "section", "subsection", "subsubsection"), is_paragraph): (make_paragraph, None)
        })
        p.initial_state = "body"
        p.initial_constructor = make_body
        # Debug output is switched on through the FERENDA_FSMDEBUG
        # environment variable (any value set there is used as-is).
        p.debug = os.environ.get('FERENDA_FSMDEBUG', False)
        return p.parse
Exemplo n.º 12
0
    def run_test_file(self, filename, debug=False):
        """Parse a structured plaintext file with an ``FSMParser``.

        Defines a set of recognizers and constructors for a simple
        plaintext format (numbered sections, ordered lists, paragraphs and
        three "State X:" markers), wires them into a transition table and
        parses ``filename`` paragraph by paragraph.

        :param filename: path handed to ``TextReader`` (read as UTF-8 with
                         UNIX line separators)
        :param debug: value assigned to the parser's ``debug`` attribute
        :returns: a ``(parser, body)`` tuple, where ``body`` is the result
                  of ``parser.parse``
        """
        # some basic recognizers and constructors to parse a simple
        # structured plaintext format.
        #
        # RECOGNIZERS
        # NOTE(review): is_header, is_preformatted, is_definition and
        # is_description are registered below but no transition is keyed
        # on them; is_unordereditem is not registered at all.
        def is_header(parser):
            suspect = parser.reader.peek()
            return (len(suspect) > 100 and not suspect.endswith("."))

        def is_section(parser):
            (ordinal, title) = analyze_sectionstart(parser.reader.peek())
            return section_segments_count(ordinal) == 1

        def is_subsection(parser):
            (ordinal, title) = analyze_sectionstart(parser.reader.peek())
            return section_segments_count(ordinal) == 2

        def is_subsubsection(parser):
            (ordinal, title) = analyze_sectionstart(parser.reader.peek())
            return section_segments_count(ordinal) == 3

        def is_preformatted(parser):
            return "   " in parser.reader.peek()

        def is_definition(parser):
            return False

        def is_description(parser):
            return False

        def is_li_decimal(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('decimal', 'decimal-leading-zero')

        def is_li_alpha(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('lower-alpha', 'upper-alpha')

        def is_li_roman(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('lower-roman', 'upper-roman')

        def is_unordereditem(parser):
            listtype = analyze_listitem(parser.reader.peek())[0]
            return listtype in ('disc', 'circle', 'square', 'dash')

        def is_state_a(parser):
            return parser.reader.peek().startswith("State A:")

        def is_state_b(parser):
            return parser.reader.peek().startswith("State B:")

        def is_state_c(parser):
            return parser.reader.peek().startswith("State C:")

        def is_paragraph(parser):
            # c.f. test/files/fsmparser/invalid.txt
            return len(parser.reader.peek()) > 6

        # MAGIC
        def sublist_or_parent(symbol, state_stack):
            # Callable transition: given the recognizer that matched and
            # the current state stack, decide between starting a nested
            # list of that kind (when no list of that kind is already on
            # the stack) and returning (False, None).
            constructor = False
            newstate = None
            if symbol == is_li_alpha and "ol-alpha" not in state_stack:  # maybe only check state_stack[-2]
                constructor = make_ol_alpha
                newstate = "ol-alpha"
            elif symbol == is_li_roman and "ol-roman" not in state_stack:
                constructor = make_ol_roman
                newstate = "ol-roman"
            elif symbol == is_li_decimal and "ol-decimal" not in state_stack:
                # A decimal item opens a decimal list. This branch used
                # to hand back the roman constructor/state, for which no
                # is_li_decimal transition exists.
                constructor = make_ol_decimal
                newstate = "ol-decimal"
            return (constructor, newstate)

        # CONSTRUCTORS
        @newstate('body')
        def make_body(parser):
            parser._debug("Hello")
            b = elements.Body()
            return parser.make_children(b)

        @newstate('section')
        def make_section(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next())
            s = elements.Section(ordinal=secnumber, title=title)
            return parser.make_children(s)

        @newstate('subsection')
        def make_subsection(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next())
            s = elements.Subsection(ordinal=secnumber, title=title)
            return parser.make_children(s)

        @newstate('subsubsection')
        def make_subsubsection(parser):
            (secnumber, title) = analyze_sectionstart(parser.reader.next())
            s = elements.Subsubsection(ordinal=secnumber, title=title)
            return parser.make_children(s)

        def make_paragraph(parser):
            return elements.Paragraph([parser.reader.next().strip()])

        def make_preformatted(parser):
            return elements.Preformatted([parser.reader.next()])

        # NOTE: there is no constructor for unordered lists; only ordered
        # lists are built by this parser.

        @newstate('ol-decimal')
        def make_ol_decimal(parser):
            return make_orderedlist(parser, "decimal", "ol-decimal")

        @newstate('ol-alpha')
        def make_ol_alpha(parser):
            return make_orderedlist(parser, "lower-alpha", "ol-alpha")

        @newstate('ol-roman')
        def make_ol_roman(parser):
            return make_orderedlist(parser, "lower-roman", "ol-roman")

        @newstate('listitem')
        def make_listitem(parser):
            chunk = parser.reader.next()
            (listtype, ordinal, separator, rest) = analyze_listitem(chunk)
            li = elements.ListItem(ordinal=ordinal)
            li.append(rest)
            return parser.make_children(li)

        # NOTE: no @newstate decorator for these -- we transition from
        # one state to the next, not push a new state onto the stack
        def make_state_a(parser):
            return elements.Paragraph([parser.reader.next().strip()],
                                      id="state-a")

        def make_state_b(parser):
            return elements.Paragraph([parser.reader.next().strip()],
                                      id="state-b")

        def make_state_c(parser):
            return elements.Paragraph([parser.reader.next().strip()],
                                      id="state-c")

        # HELPERS
        def section_segments_count(s):
            # Number of non-empty dot-separated segments in an ordinal
            # ("1.2" -> 2); False when s is None, which never equals the
            # 1/2/3 the recognizers compare against.
            return ((s is not None) and len(list(filter(None, s.split(".")))))

        def make_orderedlist(parser, listtype, childstate):
            # The listtype argument is overridden by whatever the upcoming
            # chunk actually looks like, and childstate is not used; both
            # are kept so the make_ol_* callers stay unchanged.
            listtype = analyze_listitem(parser.reader.peek())[0]
            ol = elements.OrderedList(type=listtype)
            # The first item is built explicitly; make_children handles
            # whatever follows.
            ol.append(parser.make_child(make_listitem, "listitem"))
            return parser.make_children(ol)

        # matches
        # "1 Blahonga"
        # "1.2.3. This is a subsubsection"
        # (raw string: \d and \. are regex escapes, not string escapes)
        re_sectionstart = re.compile(r"^(\d[\.\d]*) +(.*[^\.])$").match

        def analyze_sectionstart(chunk):
            # Returns (ordinal, title) for a section heading, with any
            # trailing dot stripped from the ordinal, or (None, chunk)
            # when the chunk is not a section heading.
            m = re_sectionstart(chunk)
            if m:
                return (m.group(1).rstrip("."), m.group(2).strip())
            else:
                return (None, chunk)

        def analyze_listitem(chunk):
            # returns (listtype, ordinal, separator, rest), where listtype
            # is the same as list-style-type in CSS2.1, sans 'georgian',
            # 'armenian' and 'greek', plus 'dash'. All of listtype,
            # ordinal and separator are None if chunk is not a list item.
            listtype = ordinal = separator = rest = None
            # match "1. Foo..." or "14) bar..." but not "4 This is a heading"
            m = re.match(r'^(\d+)([\.\)]) +', chunk)
            if m:
                if chunk.startswith("0"):
                    listtype = "decimal-leading-zero"
                else:
                    listtype = "decimal"
                (ordinal, separator) = m.groups()
                rest = chunk[m.end():]
                return (listtype, ordinal, separator, rest)

            # match "IX. Foo..." or "vii) bar..." but not "vi is a sucky
            # editor" or "MMXIII is the current year". Tried before the
            # alphabetic pattern, so a lone "i." or "v." counts as roman.
            m = re.match(r'^([IVXivx]+)([\.\)]) +', chunk)
            if m:
                if chunk[0].islower():
                    listtype = 'lower-roman'
                else:
                    listtype = 'upper-roman'
                (ordinal, separator) = m.groups()
                rest = chunk[m.end():]
                return (listtype, ordinal, separator, rest)

            # match "a. Foo..." or "z) bar..." but not "to. Next sentence..."
            m = re.match(r'^([A-Za-z])([\.\)]) +', chunk)
            if m:
                if chunk[0].islower():
                    listtype = 'lower-alpha'
                else:
                    listtype = 'upper-alpha'
                (ordinal, separator) = m.groups()
                rest = chunk[m.end():]
                return (listtype, ordinal, separator, rest)

            # Bulleted items: the whole chunk, bullet included, is
            # returned as the rest.
            if chunk.startswith("* "):
                return ("disc", None, None, chunk)
            if chunk.startswith("- "):
                return ("dash", None, None, chunk)

            return (listtype, ordinal, separator, chunk)  # None * 3

        # MAIN CODE
        p = FSMParser()
        p.set_recognizers(is_li_decimal, is_li_roman, is_li_alpha, is_header,
                          is_section, is_subsection, is_subsubsection,
                          is_preformatted, is_definition, is_description,
                          is_state_a, is_state_b, is_state_c, is_paragraph)
        p.set_transitions({
            ("body", is_paragraph): (make_paragraph, None),
            ("body", is_section): (make_section, "section"),
            ("body", is_state_a): (make_state_a, "state-a"),
            ("state-a", is_state_b): (make_state_b, "state-b"),
            ("state-b", is_state_c): (make_state_c, "state-c"),
            ("state-c", is_section): (False, None),
            ("section", is_paragraph): (make_paragraph, None),
            ("section", is_subsection): (make_subsection, "subsection"),
            ("subsection", is_paragraph): (make_paragraph, None),
            ("subsection", is_subsection): (False, None),
            ("subsection", is_state_a): (False, "body"),
            ("subsection", is_subsubsection):
            (make_subsubsection, "subsubsection"),
            ("subsubsection", is_paragraph): (make_paragraph, None),
            ("subsubsection", is_section): (False, None),
            ("subsection", is_section): (False, None),
            ("section", is_section): (False, None),
            ("body", is_li_decimal): (make_ol_decimal, "ol-decimal"),
            ("ol-decimal", is_li_decimal): (make_listitem, "listitem"),
            ("ol-decimal", is_li_alpha): (make_ol_alpha, "ol-alpha"),
            ("ol-alpha", is_li_alpha): (make_listitem, "listitem"),
            ("ol-alpha", is_li_roman): (make_ol_roman, "ol-roman"),
            ("ol-roman", is_li_roman): (make_listitem, "listitem"),
            ("ol-roman", is_li_alpha): (False, None),
            ("ol-alpha", is_li_decimal): (False, None),
            # Inside a list item the outcome depends on the state stack,
            # so these entries are callables rather than fixed tuples.
            ("listitem", is_li_alpha): sublist_or_parent,
            ("listitem", is_li_roman): sublist_or_parent,
            ("listitem", is_li_decimal): sublist_or_parent,
        })

        p.debug = debug

        tr = TextReader(filename, encoding="utf-8", linesep=TextReader.UNIX)
        p.initial_state = "body"
        p.initial_constructor = make_body
        b = p.parse(tr.getiterator(tr.readparagraph))
        return p, b