def run_test_file(self, filename, debug=False):
    """Parse the plaintext test file *filename* with an FSMParser
    configured for a simple structured plaintext format.

    Returns a (parser, body) tuple: the FSMParser instance and the
    resulting document tree (an elements.Body).
    """
    # RECOGNIZERS: each peeks at the next chunk without consuming it.

    def is_header(parser):
        suspect = parser.reader.peek()
        return (len(suspect) > 100 and not suspect.endswith("."))

    def is_section(parser):
        (ordinal, title) = analyze_sectionstart(parser.reader.peek())
        return section_segments_count(ordinal) == 1

    def is_subsection(parser):
        (ordinal, title) = analyze_sectionstart(parser.reader.peek())
        return section_segments_count(ordinal) == 2

    def is_subsubsection(parser):
        (ordinal, title) = analyze_sectionstart(parser.reader.peek())
        return section_segments_count(ordinal) == 3

    def is_preformatted(parser):
        # A run of multiple spaces indicates preformatted text.
        # NOTE(review): the (whitespace-mangled) original showed a
        # single space here, which would match almost every chunk and
        # shadow is_paragraph -- a double space must be the intended
        # test; confirm against the pristine source.
        return "  " in parser.reader.peek()

    def is_definition(parser):
        return False

    def is_description(parser):
        return False

    def is_li_decimal(parser):
        listtype = analyze_listitem(parser.reader.peek())[0]
        return listtype in ('decimal', 'decimal-leading-zero')

    def is_li_alpha(parser):
        listtype = analyze_listitem(parser.reader.peek())[0]
        return listtype in ('lower-alpha', 'upper-alpha')

    def is_li_roman(parser):
        listtype = analyze_listitem(parser.reader.peek())[0]
        return listtype in ('lower-roman', 'upper-roman')

    def is_unordereditem(parser):
        listtype = analyze_listitem(parser.reader.peek())[0]
        return listtype in ('disc', 'circle', 'square', 'dash')

    def is_state_a(parser):
        return parser.reader.peek().startswith("State A:")

    def is_state_b(parser):
        return parser.reader.peek().startswith("State B:")

    def is_state_c(parser):
        return parser.reader.peek().startswith("State C:")

    def is_paragraph(parser):
        # c.f. test/files/fsmparser/invalid.txt
        return len(parser.reader.peek()) > 6

    # MAGIC: a transition *callable* -- given the recognized symbol and
    # the current state stack, decide whether the new list item opens a
    # nested sublist (constructor + newstate) or belongs to an ancestor
    # list (constructor=False pops back to the parent state).
    def sublist_or_parent(symbol, state_stack):
        constructor = False
        newstate = None
        if symbol == is_li_alpha and "ol-alpha" not in state_stack:
            # maybe only check state_stack[-2]
            constructor = make_ol_alpha
            newstate = "ol-alpha"
        elif symbol == is_li_roman and "ol-roman" not in state_stack:
            constructor = make_ol_roman
            newstate = "ol-roman"
        elif symbol == is_li_decimal and "ol-decimal" not in state_stack:
            # BUGFIX: this branch previously assigned make_ol_roman /
            # "ol-roman" (copy-paste error), so a decimal sublist was
            # created as a roman-numeral list.
            constructor = make_ol_decimal
            newstate = "ol-decimal"
        else:
            pass
        return (constructor, newstate)

    # CONSTRUCTORS: each consumes one or more chunks and returns a
    # document element, recursing via parser.make_children().

    def make_body(parser):
        parser._debug("Hello")
        b = elements.Body()
        return parser.make_children(b)
    setattr(make_body, 'newstate', 'body')

    def make_section(parser):
        (secnumber, title) = analyze_sectionstart(parser.reader.next())
        s = elements.Section(ordinal=secnumber, title=title)
        return parser.make_children(s)
    setattr(make_section, 'newstate', 'section')

    def make_subsection(parser):
        (secnumber, title) = analyze_sectionstart(parser.reader.next())
        s = elements.Subsection(ordinal=secnumber, title=title)
        return parser.make_children(s)
    setattr(make_subsection, 'newstate', 'subsection')

    def make_subsubsection(parser):
        (secnumber, title) = analyze_sectionstart(parser.reader.next())
        s = elements.Subsubsection(ordinal=secnumber, title=title)
        return parser.make_children(s)
    setattr(make_subsubsection, 'newstate', 'subsubsection')

    def make_paragraph(parser):
        return elements.Paragraph([parser.reader.next().strip()])

    def make_preformatted(parser):
        return elements.Preformatted([parser.reader.next()])

    # def make_unorderedlist(parser):
    #     listtype = analyze_listitem(parser.reader.peek())[0]
    #     assert ordinal is None
    #     ul = elements.UnorderedList(type=listtype)
    #     ul.append(parser.make_child(IN_UNORDEREDLIST))  # 1st element of list
    #     return parser.make_children(ul)
    # setattr(make_unorderedlist, 'newstate', 'unorderedlist')

    def make_ol_decimal(parser):
        return make_orderedlist(parser, "decimal", "ol-decimal")
    setattr(make_ol_decimal, 'newstate', 'ol-decimal')

    def make_ol_alpha(parser):
        return make_orderedlist(parser, "lower-alpha", "ol-alpha")
    setattr(make_ol_alpha, 'newstate', 'ol-alpha')

    def make_ol_roman(parser):
        return make_orderedlist(parser, "lower-roman", "ol-roman")
    # BUGFIX: newstate was misspelled 'ol-romal', so the parser could
    # never match any of the "ol-roman" transitions below.
    setattr(make_ol_roman, 'newstate', 'ol-roman')

    def make_listitem(parser):
        chunk = parser.reader.next()
        (listtype, ordinal, separator, rest) = analyze_listitem(chunk)
        li = elements.ListItem(ordinal=ordinal)
        li.append(rest)
        return parser.make_children(li)
    setattr(make_listitem, 'newstate', 'listitem')

    def make_state_a(parser):
        return elements.Paragraph([parser.reader.next().strip()], id="state-a")
    # setattr(make_state_a, 'newstate', 'state-a')

    def make_state_b(parser):
        return elements.Paragraph([parser.reader.next().strip()], id="state-b")
    # setattr(make_state_b, 'newstate', 'state-b')

    def make_state_c(parser):
        return elements.Paragraph([parser.reader.next().strip()], id="state-c")
    # setattr(make_state_c, 'newstate', 'state-c')

    # HELPERS

    def section_segments_count(s):
        # number of dot-separated segments in an ordinal like "1.2.3"
        # (False for a None ordinal)
        return ((s is not None) and len(list(filter(None, s.split(".")))))

    def make_orderedlist(parser, listtype, childstate):
        # NOTE: the listtype argument is deliberately overridden by
        # re-inspecting the next chunk; childstate is unused but both
        # are kept for signature compatibility with existing callers.
        listtype = analyze_listitem(parser.reader.peek())[0]
        ol = elements.OrderedList(type=listtype)
        ol.append(parser.make_child(make_listitem, "listitem"))
        return parser.make_children(ol)

    # matches
    # "1 Blahonga"
    # "1.2.3. This is a subsubsection"
    re_sectionstart = re.compile(r"^(\d[\.\d]*) +(.*[^\.])$").match

    def analyze_sectionstart(chunk):
        """Return (ordinal, title) for a section heading chunk,
        (None, chunk) otherwise."""
        m = re_sectionstart(chunk)
        if m:
            return (m.group(1).rstrip("."), m.group(2).strip())
        else:
            return (None, chunk)

    def analyze_listitem(chunk):
        """Return (listtype, ordinal, separator, rest) for a list item.

        listtype values are the same as list-style-type in CSS2.1, sans
        'georgian', 'armenian' and 'greek', plus 'dash'; all four
        fields are None (with rest=chunk) if the chunk is no list item.
        """
        listtype = ordinal = separator = rest = None
        # match "1. Foo..." or "14) bar..." but not "4 This is a heading"
        m = re.match(r'^(\d+)([\.\)]) +', chunk)
        if m:
            if chunk.startswith("0"):
                listtype = "decimal-leading-zero"
            else:
                listtype = "decimal"
            (ordinal, separator) = m.groups()
            rest = chunk[m.end():]
            return (listtype, ordinal, separator, rest)
        # match "IX. Foo..." or "vii) bar..." but not "vi is a sucky
        # editor" or "MMXIII is the current year"
        m = re.match(r'^([IVXivx]+)([\.\)]) +', chunk)
        if m:
            if chunk[0].islower():
                listtype = 'lower-roman'
            else:
                listtype = 'upper-roman'
            (ordinal, separator) = m.groups()
            rest = chunk[m.end():]
            return (listtype, ordinal, separator, rest)
        # match "a. Foo..." or "z) bar..." but not "to. Next sentence..."
        m = re.match(r'^([A-Za-z])([\.\)]) +', chunk)
        if m:
            if chunk[0].islower():
                listtype = 'lower-alpha'
            else:
                listtype = 'upper-alpha'
            (ordinal, separator) = m.groups()
            rest = chunk[m.end():]
            return (listtype, ordinal, separator, rest)
        if chunk.startswith("* "):
            return ("disc", None, None, chunk)
        if chunk.startswith("- "):
            return ("dash", None, None, chunk)
        return (listtype, ordinal, separator, chunk)  # None * 3

    # MAIN CODE
    p = FSMParser()
    p.set_recognizers(is_li_decimal,
                      is_li_roman,
                      is_li_alpha,
                      is_header,
                      is_section,
                      is_subsection,
                      is_subsubsection,
                      is_preformatted,
                      is_definition,
                      is_description,
                      is_state_a,
                      is_state_b,
                      is_state_c,
                      is_paragraph)
    p.set_transitions({("body", is_paragraph): (make_paragraph, None),
                       ("body", is_section): (make_section, "section"),
                       ("body", is_state_a): (make_state_a, "state-a"),
                       ("state-a", is_state_b): (make_state_b, "state-b"),
                       ("state-b", is_state_c): (make_state_c, "state-c"),
                       ("state-c", is_section): (False, None),
                       ("section", is_paragraph): (make_paragraph, None),
                       ("section", is_subsection): (make_subsection, "subsection"),
                       ("subsection", is_paragraph): (make_paragraph, None),
                       ("subsection", is_subsection): (False, None),
                       ("subsection", is_state_a): (False, "body"),
                       ("subsection", is_subsubsection): (make_subsubsection, "subsubsection"),
                       ("subsubsection", is_paragraph): (make_paragraph, None),
                       ("subsubsection", is_section): (False, None),
                       ("subsection", is_section): (False, None),
                       ("section", is_section): (False, None),
                       ("body", is_li_decimal): (make_ol_decimal, "ol-decimal"),
                       ("ol-decimal", is_li_decimal): (make_listitem, "listitem"),
                       ("ol-decimal", is_li_alpha): (make_ol_alpha, "ol-alpha"),
                       ("ol-alpha", is_li_alpha): (make_listitem, "listitem"),
                       ("ol-alpha", is_li_roman): (make_ol_roman, "ol-roman"),
                       ("ol-roman", is_li_roman): (make_listitem, "listitem"),
                       ("ol-roman", is_li_alpha): (False, None),
                       ("ol-alpha", is_li_decimal): (False, None),
                       ("listitem", is_li_alpha): sublist_or_parent,
                       ("listitem", is_li_roman): sublist_or_parent,
                       ("listitem", is_li_decimal): sublist_or_parent,
                       })
    p.debug = debug
    tr = TextReader(filename, encoding="utf-8", linesep=TextReader.UNIX)
    p.initial_state = "body"
    p.initial_constructor = make_body
    b = p.parse(tr.getiterator(tr.readparagraph))
    return p, b
def get_parser(self, basefile, sanitized, parseconfig="default"):
    """Build an FSMParser for direktiv-style plaintext documents and
    return its bound ``parse`` method.

    Recognizes numbered section headings ("1 Titel", "1.2 Titel"),
    header-like lines, dashed list items ("strecksats") and plain
    paragraphs.  With parseconfig="simple", section/subsection
    recognition is disabled.
    """
    # RECOGNIZERS: each peeks at the next chunk without consuming it.

    def is_header(parser):
        p = parser.reader.peek()
        # older direktiv sources start with dir number
        if re.match(r'Dir\.? \d{4}:\d+$', p):
            return False
        # header-like, and the following chunk must not be a list item
        # (peek(2) presumably looks one chunk further ahead -- confirm
        # TextReader semantics)
        return (headerlike(p) and
                not is_strecksats(parser, parser.reader.peek(2)))

    def is_strecksats(parser, chunk=None):
        # a dashed ("strecksats") list item; also usable on an
        # explicitly supplied chunk (see is_header)
        if chunk is None:
            chunk = parser.reader.peek()
        return chunk.startswith(("--", "- "))

    def is_section(parser):
        # top-level numbered heading ("1 Titel"); returns None (falsy)
        # when the chunk has no ordinal
        (ordinal, headingtype, title) = analyze_sectionstart(parser)
        if ordinal:
            return headingtype == "h1"

    def is_subsection(parser):
        # second-level numbered heading ("1.2 Titel")
        (ordinal, headingtype, title) = analyze_sectionstart(parser)
        if ordinal:
            return headingtype == "h2"

    def is_paragraph(parser):
        # catch-all; must be registered last
        return True

    # CONSTRUCTORS: each consumes chunk(s) and builds an element,
    # recursing via parser.make_children().

    @newstate('body')
    def make_body(parser):
        return parser.make_children(Body())

    @newstate('section')
    def make_section(parser):
        chunk = parser.reader.next()
        ordinal, headingtype, title = analyze_sectionstart(parser, chunk)
        s = Avsnitt(ordinal=ordinal, title=title)
        return parser.make_children(s)

    @newstate('strecksats')
    def make_strecksatslista(parser):
        # build the list and its first item, then collect the rest of
        # the items as children
        ul = Strecksatslista()
        li = make_listitem(parser)
        ul.append(li)
        res = parser.make_children(ul)
        return res

    def make_listitem(parser):
        chunk = parser.reader.next()
        s = str(chunk)
        if " " in s:
            # assume text before first space is the bullet
            s = s.split(" ", 1)[1]
        else:
            # assume the bullet is a single char
            s = s[1:]
        return Strecksatselement([s])

    def make_header(parser):
        return Heading([parser.reader.next()])

    def make_paragraph(parser):
        return Paragraph([parser.reader.next()])

    @newstate('unorderedsection')
    def make_unorderedsection(parser):
        s = UnorderedSection(title=parser.reader.next().strip())
        return parser.make_children(s)

    # HELPERS

    def headerlike(p):
        # starts with an uppercase letter, is reasonably short, and
        # does not end a sentence -- except for the common Swedish
        # abbreviations "m.m."/"m. m." and "m.fl."/"m. fl."
        return (p[0].lower() != p[0] and
                len(p) < 150 and
                not (p.endswith(".") and
                     not (p.endswith("m.m.") or
                          p.endswith("m. m.") or
                          p.endswith("m.fl.") or
                          p.endswith("m. fl."))))

    # matches e.g. "1 Rubrik" or "1.2 Rubrik" (title must start with
    # an uppercase letter, incl. A-ring/A-diaeresis/O-diaeresis)
    re_sectionstart = re.compile("^(\d[\.\d]*) +([A-ZÅÄÖ].*)$").match

    def analyze_sectionstart(parser, chunk=None):
        """Return (ordinal, headingtype, text) if the chunk looks like
        a section heading, (None, None, chunk) otherwise.

        headingtype is "h1"/"h2"/... derived from the number of dots
        in the ordinal.
        """
        if chunk is None:
            chunk = parser.reader.peek()
        m = re_sectionstart(chunk)
        if m and headerlike(m.group(2)):
            return (m.group(1),
                    "h" + str(m.group(1).count(".") + 1),
                    m.group(2).strip())
        else:
            return None, None, chunk

    # MAIN CODE: wire up recognizers and the transition table
    p = FSMParser()
    if parseconfig == "simple":
        recognizers = [is_header,
                       is_strecksats,
                       is_paragraph]
    else:
        recognizers = [is_section,
                       is_subsection,
                       is_header,
                       is_strecksats,
                       is_paragraph]
    p.set_recognizers(*recognizers)
    commonstates = ("body", "section", "subsection", "unorderedsection")
    p.set_transitions({(commonstates, is_paragraph): (make_paragraph, None),
                       (commonstates, is_strecksats): (make_strecksatslista, "strecksats"),
                       (commonstates, is_header): (make_unorderedsection, "unorderedsection"),
                       (commonstates, is_section): (make_section, "section"),
                       ("unorderedsection", is_header): (False, None),
                       ("unorderedsection", is_section): (False, None),
                       ("strecksats", is_paragraph): (False, None),
                       ("strecksats", is_strecksats): (make_listitem, None),
                       ("section", is_header): (False, None),
                       ("section", is_section): (False, None),
                       ("section", is_subsection): (make_section, "subsection"),
                       ("subsection", is_subsection): (False, None),
                       ("subsection", is_section): (False, None)})
    p.initial_state = "body"
    p.initial_constructor = make_body
    # debug tracing toggled via environment variable
    p.debug = os.environ.get('FERENDA_FSMDEBUG', False)
    return p.parse
def get_parser(self, basefile, sanitized, parseconfig="default"):
    """Build an FSMParser for PDF-derived decision documents and
    return its bound ``parse`` method.

    Chunks are PDFReader textboxes: font size and page coordinates
    drive recognition (17 = document heading, 12 = metadata lines,
    14 bold = section heading; indentation marks blockquotes).
    """
    # RECOGNIZERS: each peeks at the next chunk without consuming it.
    # They return True or fall through to an implicit None (falsy).

    def is_heading(parser):
        return parser.reader.peek().font.size == 17

    def is_dnr(parser):
        # case number ("diarienummer") line, e.g. "1234-56"
        chunk = parser.reader.peek()
        if (chunk.font.size == 12 and
                re.match('\d+-\d{2,4}', str(chunk))):
            return True

    def is_datum(parser):
        # decision date line in ISO format, e.g. "2013-01-31"
        chunk = parser.reader.peek()
        if (chunk.font.size == 12 and
                re.match('\d{4}-\d{2}-\d{2}', str(chunk))):
            return True

    def is_nonessential(parser):
        # chunks near the very top/bottom of the page -- presumably
        # running headers/footers in this PDF's coordinate space
        # (TODO confirm the 1159/146 thresholds against the layout)
        chunk = parser.reader.peek()
        if chunk.top >= 1159 or chunk.top <= 146:
            return True

    def is_abstract(parser):
        if str(parser.reader.peek()).startswith("Beslutet i korthet:"):
            return True

    def is_section(parser):
        # bold 14pt chunk not ending a sentence
        chunk = parser.reader.peek()
        strchunk = str(chunk)
        if chunk.font.size == 14 and chunk[0].tag == "b" and not strchunk.endswith("."):
            return True

    def is_blockquote(parser):
        # indented chunk (left edge at/past x=255)
        chunk = parser.reader.peek()
        if chunk.left >= 255:
            return True

    def is_normal(parser):
        chunk = parser.reader.peek()
        if chunk.left < 255:
            return True

    def is_paragraph(parser):
        # catch-all; registered last
        return True

    # CONSTRUCTORS

    @decorators.newstate("body")
    def make_body(parser):
        return parser.make_children(Body())

    def make_heading(parser):
        # h = Heading(str(parser.reader.next()).strip())
        # emitted as document metadata (dcterms:title) rather than as
        # a visible heading element
        h = Meta([str(parser.reader.next()).strip()],
                 predicate=DCTERMS.title,
                 lang="sv")
        return h

    @decorators.newstate("abstract")
    def make_abstract(parser):
        a = Abstract([Paragraph(parser.reader.next())])
        return parser.make_children(a)

    @decorators.newstate("section")
    def make_section(parser):
        s = UnorderedSection(title=str(parser.reader.next()).strip())
        return parser.make_children(s)

    @decorators.newstate("blockquote")
    def make_blockquote(parser):
        b = Blockquote()
        return parser.make_children(b)

    def make_paragraph(parser):
        # A Paragraph containing PDFReader.Textelement objects will
        # render these as <span> objects (the default rendering). A
        # PDFReader.Textbox object containing same will render
        # unstyled Textelements as plain strings, cutting down on
        # unnecessary <span> elements. However, these themselves
        # render with unnecessary @style and @class attributes,
        # which we don't want. For now, let's stick with Paragraphs
        # as containers and maybe later figure out how to get
        # PDFReader.Textelements to render themselves sanely.
        #
        # p = parser.reader.next()
        p = Paragraph(parser.reader.next())
        return p

    def make_datum(parser):
        # decision date -> rpubl:avgorandedatum metadata; the
        # recognizer is removed after the first plausible match since
        # the date appears once per document
        datestr = str(parser.reader.next()).strip()
        year = int(datestr.split("-")[0])
        if 2100 > year > 1970:
            parser.remove_recognizer(is_datum)
            d = [datestr]
            return Meta(d, predicate=RPUBL.avgorandedatum, datatype=XSD.date)
        else:
            # relies on the enclosing instance's logger
            self.log.warning("Year in %s doesn't look valid" % datestr)
            return None

    def make_dnr(parser):
        # case number(s) -> rpubl:diarienummer metadata; matched once
        parser.remove_recognizer(is_dnr)
        ds = [x for x in str(parser.reader.next()).strip().split(" ")]
        return Meta(ds, predicate=RPUBL.diarienummer)

    def skip_nonessential(parser):
        parser.reader.next()  # return nothing

    # MAIN CODE: wire up recognizers (order matters; catch-all last)
    # and the transition table
    p = FSMParser()
    p.initial_state = "body"
    p.initial_constructor = make_body
    p.set_recognizers(is_datum,
                      is_dnr,
                      is_nonessential,
                      is_heading,
                      is_abstract,
                      is_section,
                      is_normal,
                      is_blockquote,
                      is_paragraph)
    p.set_transitions({("body", is_heading): (make_heading, None),
                       ("body", is_nonessential): (skip_nonessential, None),
                       ("body", is_datum): (make_datum, None),
                       ("body", is_dnr): (make_dnr, None),
                       ("body", is_abstract): (make_abstract, "abstract"),
                       ("body", is_section): (make_section, "section"),
                       ("body", is_blockquote): (make_blockquote, "blockquote"),
                       ("body", is_paragraph): (make_paragraph, None),
                       ("abstract", is_paragraph): (make_paragraph, None),
                       ("abstract", is_section): (False, None),
                       ("abstract", is_dnr): (False, None),
                       ("abstract", is_datum): (False, None),
                       ("section", is_paragraph): (make_paragraph, None),
                       ("section", is_nonessential): (skip_nonessential, None),
                       ("section", is_section): (False, None),
                       ("section", is_blockquote): (make_blockquote, "blockquote"),
                       ("section", is_datum): (make_datum, None),
                       ("section", is_dnr): (make_dnr, None),
                       ("blockquote", is_blockquote): (make_paragraph, None),
                       ("blockquote", is_nonessential): (skip_nonessential, None),
                       ("blockquote", is_section): (False, None),
                       ("blockquote", is_normal): (False, None),
                       ("blockquote", is_datum): (make_datum, None),
                       ("blockquote", is_dnr): (make_dnr, None),
                       })
    # debug tracing toggled via environment variable
    p.debug = os.environ.get('FERENDA_FSMDEBUG', False)
    return p.parse
def get_parser(self, basefile, sanitized_body, parseconfig="default"):
    """Build an FSMParser for JK decision HTML documents and return
    its bound ``parse`` method.

    Chunks are BeautifulSoup-style tags: h3 elements open sections,
    while p elements whose first child is em/strong/u open
    subsections, special subsections and subsubsections respectively.
    """
    # a typical decision structure:
    # [h1] Justitiekanslerns beslut
    #    ... text ...
    #    [h2] Arendet (h3)
    #       [h3] Bakgrund (p/em)
    #          ... text ...
    #       [h3] Anspraket
    #          ... text ...
    #       [h3 class="reglering"] Rattslig reglering m.m. (p/strong)
    #    [h2] Justitiekanslerns bedomning
    #       [h3] Skadestand
    #       [h3] Tillsyn

    def is_section(parser):
        return parser.reader.peek().name == "h3"

    def is_subsection(parser):
        chunk = parser.reader.peek()
        return chunk.name == "p" and list(chunk.children)[0].name == "em"

    def is_special_subsection(parser):
        chunk = parser.reader.peek()
        return chunk.name == "p" and list(
            chunk.children)[0].name == "strong"

    def is_subsubsection(parser):
        chunk = parser.reader.peek()
        return chunk.name == "p" and list(chunk.children)[0].name == "u"

    def is_paragraph(parser):
        # catch-all; registered last
        return True

    @newstate('body')
    def make_body(parser):
        return parser.make_children(Body())

    @newstate('section')
    def make_section(parser):
        s = AnonSektion(title=parser.reader.next().get_text())
        return parser.make_children(s)

    @newstate('subsection')
    def make_subsection(parser):
        s = AnonSektion(title=parser.reader.next().get_text())
        return parser.make_children(s)

    @newstate('special_subsection')
    def make_special_subsection(parser):
        s = AnonSektion(title=parser.reader.next().get_text())
        return parser.make_children(s)

    @newstate('subsubsection')
    def make_subsubsection(parser):
        s = AnonSektion(title=parser.reader.next().get_text())
        return parser.make_children(s)

    def make_paragraph(parser):
        # FIXME: this strips out formatting tags. NB: Now this is a
        # SFS stycke that has fragment_label, id/uri and other
        # crap. Let's see if it still works!
        return AnonStycke([parser.reader.next().get_text()])

    p = FSMParser()
    # BUGFIX: is_special_subsection was missing from the recognizer
    # list even though the transition table references it, so p/strong
    # headings (documented in the structure sketch above) could never
    # be recognized.
    p.set_recognizers(is_section,
                      is_subsection,
                      is_special_subsection,
                      is_subsubsection,
                      is_paragraph)
    p.set_transitions({
        ("body", is_section): (make_section, "section"),
        ("section", is_section): (False, None),
        ("section", is_subsection): (make_subsection, "subsection"),
        ("section", is_special_subsection): (make_special_subsection,
                                             "special_subsection"),
        ("subsection", is_section): (False, None),
        ("subsection", is_subsection): (False, None),
        ("subsection", is_special_subsection): (False, None),
        # BUGFIX: previously used make_subsection (copy-paste error);
        # subsubsections should be built by make_subsubsection.
        ("subsection", is_subsubsection): (make_subsubsection,
                                           "subsubsection"),
        ("special_subsection", is_section): (False, None),
        ("special_subsection", is_subsection): (False, None),
        ("special_subsection", is_subsubsection): (make_subsubsection,
                                                   "subsubsection"),
        ("subsubsection", is_section): (False, None),
        ("subsubsection", is_special_subsection): (False, None),
        ("subsubsection", is_subsection): (False, None),
        ("subsubsection", is_subsubsection): (False, None),
        # "special_subsection" added to the paragraph catch-all tuple:
        # required once is_special_subsection can actually fire, since
        # special subsections contain plain paragraphs too.
        (("body", "section", "subsection", "special_subsection",
          "subsubsection"), is_paragraph): (make_paragraph, None)
    })
    p.initial_state = "body"
    p.initial_constructor = make_body
    # debug tracing toggled via environment variable
    p.debug = os.environ.get('FERENDA_FSMDEBUG', False)
    return p.parse
def get_parser():
    """Build and return an FSMParser for HTML documents (h1-h4
    headings) with numbered sections and named preamble sections
    ("Abstract", "Table of Contents", ...)."""
    # RECOGNIZERS: each peeks at the next chunk without consuming it.

    def is_header(parser):
        chunk = parser.reader.peek()
        # exact-type membership (not isinstance) is the original
        # semantics, kept here
        return type(chunk) in (html.H1, html.H2, html.H3, html.H4)

    def is_preamblesection(parser):
        if not is_header(parser):
            return False
        chunk = parser.reader.peek()
        return chunk.as_plaintext().lower() in ("abstract",
                                                "status of this document",
                                                "table of contents",
                                                "appendices")

    def is_preambleending(parser):
        chunk = parser.reader.peek()
        return type(chunk) in (html.HR,)

    def is_section(parser):
        if not is_header(parser):
            return False
        chunk = parser.reader.peek()
        (ordinal, title) = analyze_sectionstart(chunk.as_plaintext())
        return section_segments_count(ordinal) == 1

    def is_subsection(parser):
        if not is_header(parser):
            return False
        chunk = parser.reader.peek()
        (ordinal, title) = analyze_sectionstart(chunk.as_plaintext())
        return section_segments_count(ordinal) == 2

    def is_subsubsection(parser):
        if not is_header(parser):
            return False
        chunk = parser.reader.peek()
        (ordinal, title) = analyze_sectionstart(chunk.as_plaintext())
        return section_segments_count(ordinal) == 3

    def is_other(parser, chunk=None):
        # catch-all; registered last
        return True

    # CONSTRUCTORS
    # CONSISTENCY FIX: these previously reached for the enclosing
    # closure variable `p` (the parser built below) instead of their
    # `parser` argument -- the same object, but fragile and
    # inconsistent with the sibling constructors; all now use `parser`.

    def make_body(parser):
        return parser.make_children(Body())
    setattr(make_body, 'newstate', 'body')

    def make_preamble_section(parser):
        s = PreambleSection(title=parser.reader.next().as_plaintext())
        return parser.make_children(s)
    setattr(make_preamble_section, 'newstate', 'preamblesection')

    def make_other(parser):
        # pass the chunk through unchanged
        return parser.reader.next()

    def make_section(parser):
        (secnumber, title) = analyze_sectionstart(parser.reader.next().as_plaintext())
        s = Section(ordinal=secnumber, title=title, uri=None, meta=None)
        return parser.make_children(s)
    setattr(make_section, 'newstate', 'section')

    def make_subsection(parser):
        (secnumber, title) = analyze_sectionstart(parser.reader.next().as_plaintext())
        s = Subsection(ordinal=secnumber, title=title, uri=None, meta=None)
        return parser.make_children(s)
    setattr(make_subsection, 'newstate', 'subsection')

    def make_subsubsection(parser):
        (secnumber, title) = analyze_sectionstart(parser.reader.next().as_plaintext())
        s = Subsubsection(ordinal=secnumber, title=title, uri=None, meta=None)
        return parser.make_children(s)
    setattr(make_subsubsection, 'newstate', 'subsubsection')

    # Some helpers for the above

    def section_segments_count(s):
        # number of dot-separated segments in an ordinal like "1.2.3"
        # (False for a None ordinal)
        return ((s is not None) and
                len(list(filter(None, s.split(".")))))

    # Matches
    # "1 Blahonga" => ("1", "Blahonga")
    # "1.2.3. This is a subsubsection" => ("1.2.3", "This is a subsubsection")
    re_sectionstart = re.compile(r"^(\d[\.\d]*) +(.*[^\.])$").match

    def analyze_sectionstart(chunk):
        """Return (ordinal, title) for a section heading string,
        (None, chunk) otherwise."""
        m = re_sectionstart(chunk)
        if m:
            return (m.group(1).rstrip("."), m.group(2))
        else:
            return (None, chunk)

    # MAIN CODE: wire up recognizers (catch-all last) and transitions
    p = FSMParser()
    p.set_recognizers(is_section,
                      is_subsection,
                      is_subsubsection,
                      is_preamblesection,
                      is_preambleending,
                      is_header,
                      is_other)
    commonstates = ("body", "preamblesection", "section", "subsection",
                    "subsubsection")
    p.set_transitions(
        {("body", is_preamblesection): (make_preamble_section, "preamblesection"),
         ("preamblesection", is_preamblesection): (False, None),
         ("preamblesection", is_preambleending): (False, None),
         ("preamblesection", is_section): (False, None),
         ("body", is_section): (make_section, "section"),
         (commonstates, is_other): (make_other, None),
         ("section", is_subsection): (make_subsection, "subsection"),
         ("section", is_section): (False, None),
         ("subsection", is_subsubsection): (make_subsubsection, "subsubsection"),
         ("subsection", is_subsection): (False, None),
         ("subsection", is_section): (False, None),
         ("subsubsection", is_subsubsection): (False, None),
         ("subsubsection", is_subsection): (False, None),
         ("subsubsection", is_section): (False, None),
         })
    p.initial_state = "body"
    p.initial_constructor = make_body
    return p
def get_parser(self, basefile, sanitized, parseconfig="default"):
    """Build an FSMParser for PDF-derived decision documents and
    return its bound ``parse`` method.

    Chunks are PDFReader textboxes: font size and page coordinates
    drive recognition (17 = document heading, 12 = metadata lines,
    14 bold = section heading; indentation marks blockquotes).
    """
    # RECOGNIZERS: each peeks at the next chunk without consuming it.
    # They return True or fall through to an implicit None (falsy).

    def is_heading(parser):
        return parser.reader.peek().font.size == 17

    def is_dnr(parser):
        # case number ("diarienummer") line, e.g. "1234-56"
        chunk = parser.reader.peek()
        if (chunk.font.size == 12 and
                re.match('\d+-\d{2,4}', str(chunk))):
            return True

    def is_datum(parser):
        # decision date line in ISO format, e.g. "2013-01-31"
        chunk = parser.reader.peek()
        if (chunk.font.size == 12 and
                re.match('\d{4}-\d{2}-\d{2}', str(chunk))):
            return True

    def is_nonessential(parser):
        # chunks near the very top/bottom of the page -- presumably
        # running headers/footers in this PDF's coordinate space
        # (TODO confirm the 1159/146 thresholds against the layout)
        chunk = parser.reader.peek()
        if chunk.top >= 1159 or chunk.top <= 146:
            return True

    def is_abstract(parser):
        if str(parser.reader.peek()).startswith("Beslutet i korthet:"):
            return True

    def is_section(parser):
        # bold 14pt chunk not ending a sentence
        chunk = parser.reader.peek()
        strchunk = str(chunk)
        if chunk.font.size == 14 and chunk[
                0].tag == "b" and not strchunk.endswith("."):
            return True

    def is_blockquote(parser):
        # indented chunk (left edge at/past x=255)
        chunk = parser.reader.peek()
        if chunk.left >= 255:
            return True

    def is_normal(parser):
        chunk = parser.reader.peek()
        if chunk.left < 255:
            return True

    def is_paragraph(parser):
        # catch-all; registered last
        return True

    # CONSTRUCTORS

    @decorators.newstate("body")
    def make_body(parser):
        return parser.make_children(Body())

    def make_heading(parser):
        # h = Heading(str(parser.reader.next()).strip())
        # emitted as document metadata (dcterms:title) rather than as
        # a visible heading element
        h = Meta([str(parser.reader.next()).strip()],
                 predicate=DCTERMS.title, lang="sv")
        return h

    @decorators.newstate("abstract")
    def make_abstract(parser):
        a = Abstract([Paragraph(parser.reader.next())])
        return parser.make_children(a)

    @decorators.newstate("section")
    def make_section(parser):
        s = UnorderedSection(title=str(parser.reader.next()).strip())
        return parser.make_children(s)

    @decorators.newstate("blockquote")
    def make_blockquote(parser):
        b = Blockquote()
        return parser.make_children(b)

    def make_paragraph(parser):
        # A Paragraph containing PDFReader.Textelement objects will
        # render these as <span> objects (the default rendering). A
        # PDFReader.Textbox object containing same will render
        # unstyled Textelements as plain strings, cutting down on
        # unnecessary <span> elements. However, these themselves
        # render with unnecessary @style and @class attributes,
        # which we don't want. For now, let's stick with Paragraphs
        # as containers and maybe later figure out how to get
        # PDFReader.Textelements to render themselves sanely.
        #
        # p = parser.reader.next()
        p = Paragraph(parser.reader.next())
        return p

    def make_datum(parser):
        # decision date -> rpubl:avgorandedatum metadata; the
        # recognizer is removed after the first plausible match since
        # the date appears once per document
        datestr = str(parser.reader.next()).strip()
        year = int(datestr.split("-")[0])
        if 2100 > year > 1970:
            parser.remove_recognizer(is_datum)
            d = [datestr]
            return Meta(d, predicate=RPUBL.avgorandedatum, datatype=XSD.date)
        else:
            # relies on the enclosing instance's logger
            self.log.warning("Year in %s doesn't look valid" % datestr)
            return None

    def make_dnr(parser):
        # case number(s) -> rpubl:diarienummer metadata; matched once
        parser.remove_recognizer(is_dnr)
        ds = [x for x in str(parser.reader.next()).strip().split(" ")]
        return Meta(ds, predicate=RPUBL.diarienummer)

    def skip_nonessential(parser):
        parser.reader.next()  # return nothing

    # MAIN CODE: wire up recognizers (order matters; catch-all last)
    # and the transition table
    p = FSMParser()
    p.initial_state = "body"
    p.initial_constructor = make_body
    p.set_recognizers(is_datum,
                      is_dnr,
                      is_nonessential,
                      is_heading,
                      is_abstract,
                      is_section,
                      is_normal,
                      is_blockquote,
                      is_paragraph)
    p.set_transitions({
        ("body", is_heading): (make_heading, None),
        ("body", is_nonessential): (skip_nonessential, None),
        ("body", is_datum): (make_datum, None),
        ("body", is_dnr): (make_dnr, None),
        ("body", is_abstract): (make_abstract, "abstract"),
        ("body", is_section): (make_section, "section"),
        ("body", is_blockquote): (make_blockquote, "blockquote"),
        ("body", is_paragraph): (make_paragraph, None),
        ("abstract", is_paragraph): (make_paragraph, None),
        ("abstract", is_section): (False, None),
        ("abstract", is_dnr): (False, None),
        ("abstract", is_datum): (False, None),
        ("section", is_paragraph): (make_paragraph, None),
        ("section", is_nonessential): (skip_nonessential, None),
        ("section", is_section): (False, None),
        ("section", is_blockquote): (make_blockquote, "blockquote"),
        ("section", is_datum): (make_datum, None),
        ("section", is_dnr): (make_dnr, None),
        ("blockquote", is_blockquote): (make_paragraph, None),
        ("blockquote", is_nonessential): (skip_nonessential, None),
        ("blockquote", is_section): (False, None),
        ("blockquote", is_normal): (False, None),
        ("blockquote", is_datum): (make_datum, None),
        ("blockquote", is_dnr): (make_dnr, None),
    })
    # debug tracing toggled via environment variable
    p.debug = os.environ.get('FERENDA_FSMDEBUG', False)
    return p.parse
but by the catch-all is_paragraph. The recognizers are run in the order specified by FSMParser.set_transitions(). This is a preformatted section. It could be used for source code, +-------------------+ | line drawings | +-------------------+ or what have you. Second section ============== The above new section implicitly closed the first section which we were in. This was made explicit by the last transition rule, which stated that any time a section is encountered while in the "section" state, we should not create any more children (False) but instead return to our previous state (which in this case is "body", but for a more complex language could be any number of states).""" p = FSMParser() p.set_recognizers(is_section, is_preformatted, is_paragraph) p.set_transitions(transitions) p.initial_constructor = make_body p.initial_state = "body" body = p.parse(text.split("\n\n")) # print(elements.serialize(body)) # end main return_value = elements.serialize(body)
def get_parser(basefile="0"):
    """Build an FSMParser that parses the plaintext of an RFC into a
    document tree (RFCHeader, DocTitle, PreambleSection, Section, ...).

    *basefile* is the RFC number; it is only used to construct section
    identifiers such as "RFC 1234, section 1.2.3".
    """
    # recognizers, constructors and helpers are created as nested
    # ordinary functions, but could just as well be staticmethods
    # (or module-global functions)

    # RECOGNIZERS (each peeks at the next chunk unless one is passed in)

    def is_rfcheader(parser, chunk=None, lenient=True):
        # Most modern RFCs have a justified right margin (which is what
        # the non-lenient test targets) but some older RFCs (like 889)
        # have a ragged right margin (or rather left-justified two
        # columns). Since make_rfcheader checks the next chunk as well
        # (a spurious double newline right in the middle of the header
        # has happened, eg in RFC 6912), this recognizer has a lenient
        # and a non-lenient mode.
        if not chunk:
            chunk = parser.reader.peek()
        (leftlines, rightlines, linelens) = _splitcolumns(chunk)
        # all rfc headers are at least 2 lines long (eg. rfc 889)
        if len(linelens) < 2:
            return False
        targetlen = linelens[0]
        for (idx, length) in enumerate(linelens):
            if rightlines[idx] == "" and length > 40:
                return False
            elif rightlines[idx] != "" and length != targetlen and not lenient:
                return False
        return True

    # FIXME: use this in parse_header as well
    def _splitcolumns(chunk):
        # Split every line into a (left, right) column pair plus the
        # line length.
        # NOTE(review): the single-space separator below looks
        # whitespace-collapsed; the original presumably split on a run
        # of several spaces -- confirm against upstream.
        linelens = []
        leftlines = []
        rightlines = []
        for line in chunk.split("\n"):
            linelens.append(len(line))
            if " " in line:
                (left, right) = line.split(" ", 1)
            else:
                (left, right) = line, ""
            leftlines.append(left)
            rightlines.append(right)
        return (leftlines, rightlines, linelens)

    def is_doctitle(parser, chunk=None):
        # the chunk following the RFC header is always taken as title
        return True

    def is_pagebreak(parser, chunk=None):
        if not chunk:
            chunk = parser.reader.peek()
        return ('\f' in chunk)

    def is_header(parser, chunk=None):
        if not chunk:
            chunk = parser.reader.peek()
        stripchunk = chunk.strip()
        # a header should be non-empty, be on a single line, not
        # end with "." and not start with an indent.
        if ((stripchunk != "") and
                (len(stripchunk.split("\n")) == 1) and
                (not stripchunk.endswith('.')) and
                (not chunk.startswith(' '))):
            return True

    def is_section(parser, chunk=None):
        (ordinal, title, identifier) = analyze_sectionstart(parser, chunk)
        return section_segments_count(ordinal) == 1

    def is_subsection(parser, chunk=None):
        (ordinal, title, identifier) = analyze_sectionstart(parser, chunk)
        return section_segments_count(ordinal) == 2

    def is_subsubsection(parser, chunk=None):
        (ordinal, title, identifier) = analyze_sectionstart(parser, chunk)
        return section_segments_count(ordinal) == 3

    def is_preformatted(parser, chunk=None):
        if not chunk:
            chunk = parser.reader.peek()
        # all paragraphs start with a three space indent -- start
        # by removing this
        stripped = "\n".join([x[3:] for x in chunk.split("\n")])
        # replace double spaces after end of sentences to avoid
        # false positives:
        stripped = stripped.replace(".  ", ". ")
        # If any double spaces are left, probably preformatted text
        # (eg. tables etc). Same if several periods are present
        # (indicative of leaders in TOCs)
        return ("  " in stripped or
                "...." in stripped or
                ". . . " in stripped)

    def is_bnf(parser, chunk=None):
        if not chunk:
            chunk = parser.reader.peek()
        return (is_preformatted(parser, chunk) and " = " in chunk)

    def is_paragraph(parser, chunk=None):
        # catch-all; must be registered last
        return True

    def is_ul_listitem(parser, chunk=None):
        if not chunk:
            chunk = parser.reader.peek()
        return chunk.strip().startswith("o ")

    def is_definition_title(parser, chunk=None):
        # looks like header but starts indented
        return False

    def is_definition(parser, chunk=None):
        # entire p is indented 6 spaces instead of 3. But if it
        # follows a ul li, probably continuation of that.
        return False

    # CONSTRUCTORS. Each is invoked with the parser it is registered
    # on; they use that argument rather than closing over the local p
    # defined further down (the previous code mixed both styles).

    def make_body(parser):
        return parser.make_children(Body())
    setattr(make_body, 'newstate', 'body')

    def make_preamble_section(parser):
        s = PreambleSection(title=parser.reader.next())
        return parser.make_children(s)
    setattr(make_preamble_section, 'newstate', 'preamble-section')

    # used for older rfcs that lack an explicit "Abstract" heading
    def make_abstract(parser):
        s = PreambleSection(title="(Abstract)")
        return parser.make_children(s)
    setattr(make_abstract, 'newstate', 'preamble-section')

    def skip_pagebreak(parser):
        # Consume the pagebreak chunk; if its last line carries a
        # running footer, keep the short title found in it.
        chunk = parser.reader.next()
        lastline = chunk.split("\n")[-1]
        parts = re.split("  +", lastline)
        if len(parts) > 2:
            return Pagebreak(shorttitle=parts[1])
        else:
            return None

    def make_header(parser):
        chunk = parser.reader.next()
        h = Heading(chunk.strip())
        return h

    def make_paragraph(parser):
        chunk = parser.reader.next()
        # collapse internal whitespace runs to single spaces
        return Paragraph([" ".join(chunk.split())])

    def make_preformatted(parser):
        chunk = parser.reader.next()
        return Preformatted([chunk])

    def make_bnf(parser):
        chunk = parser.reader.next()
        return Preformatted([chunk], **{'class': 'bnf'})

    def make_section(parser):
        (secnumber, title, identifier) = analyze_sectionstart(
            parser, parser.reader.next())
        s = Section(ordinal=secnumber, title=title, identifier=identifier)
        return parser.make_children(s)
    setattr(make_section, 'newstate', 'section')

    def make_subsection(parser):
        (secnumber, title, identifier) = analyze_sectionstart(
            parser, parser.reader.next())
        s = Subsection(ordinal=secnumber, title=title, identifier=identifier)
        return parser.make_children(s)
    setattr(make_subsection, 'newstate', 'subsection')

    def make_subsubsection(parser):
        (secnumber, title, identifier) = analyze_sectionstart(
            parser, parser.reader.next())
        s = Subsubsection(ordinal=secnumber, title=title,
                          identifier=identifier)
        return parser.make_children(s)
    setattr(make_subsubsection, 'newstate', 'subsubsection')

    def make_unordered_list(parser):
        (listtype, ordinal, separator, rest) = analyze_listitem(
            parser.reader.peek())
        ol = UnorderedList(type=listtype)  # should
        # first item, then any further children of the list
        ol.append(parser.make_child(make_listitem, "listitem"))
        return parser.make_children(ol)
    setattr(make_unordered_list, 'newstate', 'unorderedlist')

    def make_listitem(parser):
        chunk = parser.reader.next()
        (listtype, ordinal, separator, rest) = analyze_listitem(chunk)
        li = ListItem(ordinal=ordinal)
        li.append(rest)
        return parser.make_children(li)
    setattr(make_listitem, 'newstate', 'listitem')

    def make_rfcheader(parser):
        headerchunk = parser.reader.next()
        # a spurious blank line may split the header in two; if the
        # following chunk also looks like a (strict) header, merge it
        if is_rfcheader(parser, lenient=False):
            headerchunk += "\n" + parser.reader.next()
        return RFCHeader(headerchunk)

    def make_doctitle(parser):
        return DocTitle(parser.reader.next())

    # HELPERS

    def section_segments_count(s):
        # "1.2.3" -> 3; None -> False
        return ((s is not None) and
                len(list(filter(None, s.split(".")))))

    # Matches
    # "1 Blahonga" => ("1", "Blahonga", "RFC 1234, section 1")
    # "1.2.3. This is a subsubsection" =>
    #     ("1.2.3", "This is a subsubsection", "RFC 1234, section 1.2.3")
    # "   Normal paragraph" => (None, "   Normal paragraph", None)
    re_sectionstart = re.compile(r"^(\d[\.\d]*) +(.*[^\.])$").match

    def analyze_sectionstart(parser, chunk=None):
        if not chunk:
            chunk = parser.reader.peek()
        m = re_sectionstart(chunk)
        if m:
            ordinal = m.group(1).rstrip(".")
            title = m.group(2)
            identifier = "RFC %s, section %s" % (basefile, ordinal)
            return (ordinal, title, identifier)
        else:
            return (None, chunk, None)

    def analyze_listitem(chunk):
        # returns: same as list-style-type in CSS2.1, sans
        # 'georgian', 'armenian' and 'greek', plus 'dashed'
        listtype = ordinal = separator = None
        # FIXME: Tighten these patterns to RFC conventions
        # a disc item looks like "   o  Foo..."; strip the 6-char prefix
        # (the [6:] slice pins the original prefix length)
        if chunk.startswith("   o  "):
            return ("disc", None, None, chunk[6:])
        return (listtype, ordinal, separator, chunk)  # None * 3

    # MAIN CODE
    p = FSMParser()
    # NOTE(review): is_bnf appears in the transition table below but is
    # not registered here, so the bnf transitions can never fire --
    # confirm whether that is intentional before adding it.
    p.set_recognizers(is_pagebreak,
                      is_rfcheader,
                      is_doctitle,
                      is_section,
                      is_subsection,
                      is_subsubsection,
                      is_header,
                      is_ul_listitem,
                      is_preformatted,
                      is_definition_title,
                      is_definition,
                      is_paragraph)
    # start_state: "body" or "rfcheader", then "title", then
    # "preamble" (consisting of preamblesections that has title
    # (eg "Abstract", "Status of This Memo" + content), then "section".
    commonstates = ("section", "subsection", "subsubsection")
    p.set_transitions({
        ("body", is_rfcheader): (make_rfcheader, "doctitle"),
        ("doctitle", is_doctitle): (make_doctitle, "preamble"),
        ("preamble", is_header): (make_preamble_section, "preamble-section"),
        ("preamble", is_paragraph): (make_abstract, "preamble-section"),
        ("preamble-section", is_paragraph): (make_paragraph, None),
        ("preamble-section", is_header): (False, None),
        ("preamble-section", is_pagebreak): (skip_pagebreak, None),
        ("preamble-section", is_section): (False, "after-preamble"),
        ("after-preamble", is_section): (make_section, "section"),
        ("section", is_subsection): (make_subsection, "subsection"),
        ("section", is_section): (False, None),
        ("subsection", is_subsubsection): (make_subsubsection,
                                           "subsubsection"),
        ("subsection", is_subsection): (False, None),
        ("subsection", is_section): (False, None),
        ("subsubsection", is_subsubsection): (False, None),
        ("subsubsection", is_subsection): (False, None),
        ("subsubsection", is_section): (False, None),
        (commonstates, is_ul_listitem): (make_unordered_list, "ul-list"),
        ("ul-list", is_ul_listitem): (make_listitem, "listitem"),
        ("ul-list", is_paragraph): (False, None),
        ("listitem", is_paragraph): (False, None),
        (commonstates, is_bnf): (make_bnf, None),
        (commonstates, is_preformatted): (make_preformatted, None),
        (commonstates, is_paragraph): (make_paragraph, None),
        (commonstates, is_pagebreak): (skip_pagebreak, None),
    })
    p.initial_state = "body"
    p.initial_constructor = make_body
    return p
def get_parser(basefile="0"):
    """Build an FSMParser that parses the plaintext of an RFC into a
    document tree (RFCHeader, DocTitle, PreambleSection, Section, ...).

    *basefile* is the RFC number; it is only used to construct section
    identifiers such as "RFC 1234, section 1.2.3".
    """
    # recognizers, constructors and helpers are created as nested
    # ordinary functions, but could just as well be staticmethods
    # (or module-global functions)

    # RECOGNIZERS (each peeks at the next chunk unless one is passed in)

    def is_rfcheader(parser, chunk=None, lenient=True):
        # Most modern RFCs have a justified right margin (which is what
        # the non-lenient test targets) but some older RFCs (like 889)
        # have a ragged right margin (or rather left-justified two
        # columns). Since make_rfcheader checks the next chunk as well
        # (a spurious double newline right in the middle of the header
        # has happened, eg in RFC 6912), this recognizer has a lenient
        # and a non-lenient mode.
        if not chunk:
            chunk = parser.reader.peek()
        (leftlines, rightlines, linelens) = _splitcolumns(chunk)
        # all rfc headers are at least 2 lines long (eg. rfc 889)
        if len(linelens) < 2:
            return False
        targetlen = linelens[0]
        for (idx, length) in enumerate(linelens):
            if rightlines[idx] == "" and length > 40:
                return False
            elif rightlines[
                    idx] != "" and length != targetlen and not lenient:
                return False
        return True

    # FIXME: use this in parse_header as well
    def _splitcolumns(chunk):
        # Split every line into a (left, right) column pair plus the
        # line length.
        # NOTE(review): the single-space separator below looks
        # whitespace-collapsed; the original presumably split on a run
        # of several spaces -- confirm against upstream.
        linelens = []
        leftlines = []
        rightlines = []
        for line in chunk.split("\n"):
            linelens.append(len(line))
            if " " in line:
                (left, right) = line.split(" ", 1)
            else:
                (left, right) = line, ""
            leftlines.append(left)
            rightlines.append(right)
        return (leftlines, rightlines, linelens)

    def is_doctitle(parser, chunk=None):
        # the chunk following the RFC header is always taken as title
        return True

    def is_pagebreak(parser, chunk=None):
        if not chunk:
            chunk = parser.reader.peek()
        return ('\f' in chunk)

    def is_header(parser, chunk=None):
        if not chunk:
            chunk = parser.reader.peek()
        stripchunk = chunk.strip()
        # a header should be non-empty, be on a single line, not
        # end with "." and not start with an indent.
        if ((stripchunk != "") and
                (len(stripchunk.split("\n")) == 1) and
                (not stripchunk.endswith('.')) and
                (not chunk.startswith(' '))):
            return True

    def is_section(parser, chunk=None):
        (ordinal, title, identifier) = analyze_sectionstart(parser, chunk)
        return section_segments_count(ordinal) == 1

    def is_subsection(parser, chunk=None):
        (ordinal, title, identifier) = analyze_sectionstart(parser, chunk)
        return section_segments_count(ordinal) == 2

    def is_subsubsection(parser, chunk=None):
        (ordinal, title, identifier) = analyze_sectionstart(parser, chunk)
        return section_segments_count(ordinal) == 3

    def is_preformatted(parser, chunk=None):
        if not chunk:
            chunk = parser.reader.peek()
        # all paragraphs start with a three space indent -- start
        # by removing this
        stripped = "\n".join([x[3:] for x in chunk.split("\n")])
        # replace double spaces after end of sentences to avoid
        # false positives:
        stripped = stripped.replace(".  ", ". ")
        # If any double spaces are left, probably preformatted text
        # (eg. tables etc). Same if several periods are present
        # (indicative of leaders in TOCs)
        return ("  " in stripped or
                "...." in stripped or
                ". . . " in stripped)

    def is_bnf(parser, chunk=None):
        if not chunk:
            chunk = parser.reader.peek()
        return (is_preformatted(parser, chunk) and " = " in chunk)

    def is_paragraph(parser, chunk=None):
        # catch-all; must be registered last
        return True

    def is_ul_listitem(parser, chunk=None):
        if not chunk:
            chunk = parser.reader.peek()
        return chunk.strip().startswith("o ")

    def is_definition_title(parser, chunk=None):
        # looks like header but starts indented
        return False

    def is_definition(parser, chunk=None):
        # entire p is indented 6 spaces instead of 3. But if it
        # follows a ul li, probably continuation of that.
        return False

    # CONSTRUCTORS. Each is invoked with the parser it is registered
    # on; they consistently use that argument instead of closing over
    # the local p defined further down (the previous code still used
    # p.reader in make_paragraph/make_preformatted/make_bnf).

    def make_body(parser):
        return parser.make_children(Body())
    setattr(make_body, 'newstate', 'body')

    def make_preamble_section(parser):
        s = PreambleSection(title=parser.reader.next())
        return parser.make_children(s)
    setattr(make_preamble_section, 'newstate', 'preamble-section')

    # used for older rfcs that lack an explicit "Abstract" heading
    def make_abstract(parser):
        s = PreambleSection(title="(Abstract)")
        return parser.make_children(s)
    setattr(make_abstract, 'newstate', 'preamble-section')

    def skip_pagebreak(parser):
        # Consume the pagebreak chunk; if its last line carries a
        # running footer, keep the short title found in it.
        chunk = parser.reader.next()
        lastline = chunk.split("\n")[-1]
        parts = re.split("  +", lastline)
        if len(parts) > 2:
            return Pagebreak(shorttitle=parts[1])
        else:
            return None

    def make_header(parser):
        chunk = parser.reader.next()
        h = Heading(chunk.strip())
        return h

    def make_paragraph(parser):
        chunk = parser.reader.next()
        # collapse internal whitespace runs to single spaces
        return Paragraph([" ".join(chunk.split())])

    def make_preformatted(parser):
        chunk = parser.reader.next()
        return Preformatted([chunk])

    def make_bnf(parser):
        chunk = parser.reader.next()
        return Preformatted([chunk], **{'class': 'bnf'})

    def make_section(parser):
        (secnumber, title, identifier) = analyze_sectionstart(
            parser, parser.reader.next())
        s = Section(ordinal=secnumber, title=title, identifier=identifier)
        return parser.make_children(s)
    setattr(make_section, 'newstate', 'section')

    def make_subsection(parser):
        (secnumber, title, identifier) = analyze_sectionstart(
            parser, parser.reader.next())
        s = Subsection(ordinal=secnumber, title=title, identifier=identifier)
        return parser.make_children(s)
    setattr(make_subsection, 'newstate', 'subsection')

    def make_subsubsection(parser):
        (secnumber, title, identifier) = analyze_sectionstart(
            parser, parser.reader.next())
        s = Subsubsection(ordinal=secnumber, title=title,
                          identifier=identifier)
        return parser.make_children(s)
    setattr(make_subsubsection, 'newstate', 'subsubsection')

    def make_unordered_list(parser):
        (listtype, ordinal, separator, rest) = analyze_listitem(
            parser.reader.peek())
        ol = UnorderedList(type=listtype)  # should
        # first item, then any further children of the list
        ol.append(parser.make_child(make_listitem, "listitem"))
        return parser.make_children(ol)
    setattr(make_unordered_list, 'newstate', 'unorderedlist')

    def make_listitem(parser):
        chunk = parser.reader.next()
        (listtype, ordinal, separator, rest) = analyze_listitem(chunk)
        li = ListItem(ordinal=ordinal)
        li.append(rest)
        return parser.make_children(li)
    setattr(make_listitem, 'newstate', 'listitem')

    def make_rfcheader(parser):
        headerchunk = parser.reader.next()
        # a spurious blank line may split the header in two; if the
        # following chunk also looks like a (strict) header, merge it
        if is_rfcheader(parser, lenient=False):
            headerchunk += "\n" + parser.reader.next()
        return RFCHeader(headerchunk)

    def make_doctitle(parser):
        return DocTitle(parser.reader.next())

    # HELPERS

    def section_segments_count(s):
        # "1.2.3" -> 3; None -> False
        return ((s is not None) and
                len(list(filter(None, s.split(".")))))

    # Matches
    # "1 Blahonga" => ("1", "Blahonga", "RFC 1234, section 1")
    # "1.2.3. This is a subsubsection" =>
    #     ("1.2.3", "This is a subsubsection", "RFC 1234, section 1.2.3")
    # "   Normal paragraph" => (None, "   Normal paragraph", None)
    re_sectionstart = re.compile(r"^(\d[\.\d]*) +(.*[^\.])$").match

    def analyze_sectionstart(parser, chunk=None):
        if not chunk:
            chunk = parser.reader.peek()
        m = re_sectionstart(chunk)
        if m:
            ordinal = m.group(1).rstrip(".")
            title = m.group(2)
            identifier = "RFC %s, section %s" % (basefile, ordinal)
            return (ordinal, title, identifier)
        else:
            return (None, chunk, None)

    def analyze_listitem(chunk):
        # returns: same as list-style-type in CSS2.1, sans
        # 'georgian', 'armenian' and 'greek', plus 'dashed'
        listtype = ordinal = separator = None
        # FIXME: Tighten these patterns to RFC conventions
        # a disc item looks like "   o  Foo..."; strip the 6-char prefix
        # (the [6:] slice pins the original prefix length)
        if chunk.startswith("   o  "):
            return ("disc", None, None, chunk[6:])
        return (listtype, ordinal, separator, chunk)  # None * 3

    # MAIN CODE
    p = FSMParser()
    # NOTE(review): is_bnf appears in the transition table below but is
    # not registered here, so the bnf transitions can never fire --
    # confirm whether that is intentional before adding it.
    p.set_recognizers(is_pagebreak,
                      is_rfcheader,
                      is_doctitle,
                      is_section,
                      is_subsection,
                      is_subsubsection,
                      is_header,
                      is_ul_listitem,
                      is_preformatted,
                      is_definition_title,
                      is_definition,
                      is_paragraph)
    # start_state: "body" or "rfcheader", then "title", then
    # "preamble" (consisting of preamblesections that has title
    # (eg "Abstract", "Status of This Memo" + content), then "section".
    commonstates = ("section", "subsection", "subsubsection")
    p.set_transitions({
        ("body", is_rfcheader): (make_rfcheader, "doctitle"),
        ("doctitle", is_doctitle): (make_doctitle, "preamble"),
        ("preamble", is_header): (make_preamble_section, "preamble-section"),
        ("preamble", is_paragraph): (make_abstract, "preamble-section"),
        ("preamble-section", is_paragraph): (make_paragraph, None),
        ("preamble-section", is_header): (False, None),
        ("preamble-section", is_pagebreak): (skip_pagebreak, None),
        ("preamble-section", is_section): (False, "after-preamble"),
        ("after-preamble", is_section): (make_section, "section"),
        ("section", is_subsection): (make_subsection, "subsection"),
        ("section", is_section): (False, None),
        ("subsection", is_subsubsection): (make_subsubsection,
                                           "subsubsection"),
        ("subsection", is_subsection): (False, None),
        ("subsection", is_section): (False, None),
        ("subsubsection", is_subsubsection): (False, None),
        ("subsubsection", is_subsection): (False, None),
        ("subsubsection", is_section): (False, None),
        (commonstates, is_ul_listitem): (make_unordered_list, "ul-list"),
        ("ul-list", is_ul_listitem): (make_listitem, "listitem"),
        ("ul-list", is_paragraph): (False, None),
        ("listitem", is_paragraph): (False, None),
        (commonstates, is_bnf): (make_bnf, None),
        (commonstates, is_preformatted): (make_preformatted, None),
        (commonstates, is_paragraph): (make_paragraph, None),
        (commonstates, is_pagebreak): (skip_pagebreak, None),
    })
    p.initial_state = "body"
    p.initial_constructor = make_body
    return p
def get_parser(self, basefile, sanitized_body, parseconfig="default"):
    """Return a parse callable for a typical JK decision document.

    The expected decision structure (chunks are markup elements with
    .name / .children / .get_text(), BeautifulSoup-style):

      [h1] Justitiekanslerns beslut
      ... text ...
      [h2] Ärendet (h3)
      [h3] Bakgrund (p/em)
      ... text ...
      [h3] Anspråket
      ... text ...
      [h3 class="reglering"] Rättslig reglering m.m. (p/strong)
      [h2] Justitiekanslerns bedömning
      [h3] Skadestånd
      [h3] Tillsyn

    NOTE(review): basefile, sanitized_body and parseconfig are unused
    in this implementation -- confirm against callers.
    """
    # RECOGNIZERS

    def is_section(parser):
        return parser.reader.peek().name == "h3"

    def is_subsection(parser):
        # a p whose first child is an em acts as a subsection heading
        chunk = parser.reader.peek()
        return chunk.name == "p" and list(chunk.children)[0].name == "em"

    def is_special_subsection(parser):
        # a p whose first child is a strong acts as a special heading
        chunk = parser.reader.peek()
        return chunk.name == "p" and list(chunk.children)[0].name == "strong"

    def is_subsubsection(parser):
        chunk = parser.reader.peek()
        return chunk.name == "p" and list(chunk.children)[0].name == "u"

    def is_paragraph(parser):
        # catch-all
        return True

    # CONSTRUCTORS

    @newstate('body')
    def make_body(parser):
        return parser.make_children(Body())

    @newstate('section')
    def make_section(parser):
        s = AnonSektion(title=parser.reader.next().get_text())
        return parser.make_children(s)

    @newstate('subsection')
    def make_subsection(parser):
        s = AnonSektion(title=parser.reader.next().get_text())
        return parser.make_children(s)

    @newstate('special_subsection')
    def make_special_subsection(parser):
        s = AnonSektion(title=parser.reader.next().get_text())
        return parser.make_children(s)

    @newstate('subsubsection')
    def make_subsubsection(parser):
        s = AnonSektion(title=parser.reader.next().get_text())
        return parser.make_children(s)

    def make_paragraph(parser):
        # FIXME: this strips out formatting tags NB: Now this is a
        # SFS stycke that has fragment_label, id/uri and other
        # crap. Let's see if it still works!
        return AnonStycke([parser.reader.next().get_text()])

    p = FSMParser()
    # BUGFIX: is_special_subsection was missing from the recognizer
    # list, which made make_special_subsection and every
    # special_subsection transition below unreachable.
    p.set_recognizers(is_section,
                      is_subsection,
                      is_special_subsection,
                      is_subsubsection,
                      is_paragraph)
    p.set_transitions({
        ("body", is_section): (make_section, "section"),
        ("section", is_section): (False, None),
        ("section", is_subsection): (make_subsection, "subsection"),
        ("section", is_special_subsection): (make_special_subsection,
                                             "special_subsection"),
        ("subsection", is_section): (False, None),
        ("subsection", is_subsection): (False, None),
        ("subsection", is_special_subsection): (False, None),
        # BUGFIX: entering the subsubsection state previously ran
        # make_subsection (copy-paste error)
        ("subsection", is_subsubsection): (make_subsubsection,
                                           "subsubsection"),
        ("special_subsection", is_section): (False, None),
        ("special_subsection", is_subsection): (False, None),
        # a sibling special subsection closes the current one (mirrors
        # the section/section and subsection/subsection rules)
        ("special_subsection", is_special_subsection): (False, None),
        ("special_subsection", is_subsubsection): (make_subsubsection,
                                                   "subsubsection"),
        ("subsubsection", is_section): (False, None),
        ("subsubsection", is_special_subsection): (False, None),
        ("subsubsection", is_subsection): (False, None),
        ("subsubsection", is_subsubsection): (False, None),
        (("body", "section", "subsection", "special_subsection",
          "subsubsection"), is_paragraph): (make_paragraph, None)
    })
    p.initial_state = "body"
    p.initial_constructor = make_body
    p.debug = os.environ.get('FERENDA_FSMDEBUG', False)
    return p.parse
def run_test_file(self, filename, debug=False):
    """Parse *filename* (a simple structured plaintext format, read as
    double-newline-separated paragraphs) with a demo FSMParser setup
    and return the (parser, document tree) pair.
    """
    # some basic recognizers and constructors to parse a simple
    # structured plaintext format.

    # RECOGNIZERS

    def is_header(parser):
        suspect = parser.reader.peek()
        return (len(suspect) > 100 and not suspect.endswith("."))

    def is_section(parser):
        (ordinal, title) = analyze_sectionstart(parser.reader.peek())
        return section_segments_count(ordinal) == 1

    def is_subsection(parser):
        (ordinal, title) = analyze_sectionstart(parser.reader.peek())
        return section_segments_count(ordinal) == 2

    def is_subsubsection(parser):
        (ordinal, title) = analyze_sectionstart(parser.reader.peek())
        return section_segments_count(ordinal) == 3

    def is_preformatted(parser):
        # a double space anywhere marks the chunk as preformatted
        return "  " in parser.reader.peek()

    def is_definition(parser):
        return False

    def is_description(parser):
        return False

    def is_li_decimal(parser):
        listtype = analyze_listitem(parser.reader.peek())[0]
        return listtype in ('decimal', 'decimal-leading-zero')

    def is_li_alpha(parser):
        listtype = analyze_listitem(parser.reader.peek())[0]
        return listtype in ('lower-alpha', 'upper-alpha')

    def is_li_roman(parser):
        listtype = analyze_listitem(parser.reader.peek())[0]
        return listtype in ('lower-roman', 'upper-roman')

    def is_unordereditem(parser):
        listtype = analyze_listitem(parser.reader.peek())[0]
        return listtype in ('disc', 'circle', 'square', 'dash')

    def is_state_a(parser):
        return parser.reader.peek().startswith("State A:")

    def is_state_b(parser):
        return parser.reader.peek().startswith("State B:")

    def is_state_c(parser):
        return parser.reader.peek().startswith("State C:")

    def is_paragraph(parser):
        # c.f. test/files/fsmparser/invalid.txt
        return len(parser.reader.peek()) > 6

    # MAGIC

    def sublist_or_parent(symbol, state_stack):
        # Transition callable for the "listitem" state: start a sublist
        # of the newly seen type unless we are already inside a list of
        # that type, in which case (False, None) pops back to the
        # parent list.
        constructor = False
        newstate = None
        if symbol == is_li_alpha and "ol-alpha" not in state_stack:
            # maybe only check state_stack[-2]
            constructor = make_ol_alpha
            newstate = "ol-alpha"
        elif symbol == is_li_roman and "ol-roman" not in state_stack:
            constructor = make_ol_roman
            newstate = "ol-roman"
        elif symbol == is_li_decimal and "ol-decimal" not in state_stack:
            # BUGFIX: this branch previously started a roman sublist
            # (make_ol_roman / "ol-roman") -- copy-paste error
            constructor = make_ol_decimal
            newstate = "ol-decimal"
        return (constructor, newstate)

    # CONSTRUCTORS

    @newstate('body')
    def make_body(parser):
        parser._debug("Hello")
        b = elements.Body()
        return parser.make_children(b)

    @newstate('section')
    def make_section(parser):
        (secnumber, title) = analyze_sectionstart(parser.reader.next())
        s = elements.Section(ordinal=secnumber, title=title)
        return parser.make_children(s)

    @newstate('subsection')
    def make_subsection(parser):
        (secnumber, title) = analyze_sectionstart(parser.reader.next())
        s = elements.Subsection(ordinal=secnumber, title=title)
        return parser.make_children(s)

    @newstate('subsubsection')
    def make_subsubsection(parser):
        (secnumber, title) = analyze_sectionstart(parser.reader.next())
        s = elements.Subsubsection(ordinal=secnumber, title=title)
        return parser.make_children(s)

    def make_paragraph(parser):
        return elements.Paragraph([parser.reader.next().strip()])

    def make_preformatted(parser):
        return elements.Preformatted([parser.reader.next()])

    # def make_unorderedlist(parser):
    #     listtype = analyze_listitem(parser.reader.peek())[0]
    #     assert ordinal is None
    #     ul = elements.UnorderedList(type=listtype)
    #     ul.append(parser.make_child(IN_UNORDEREDLIST)) # 1st element of list
    #     return parser.make_children(ul)
    # setattr(make_unorderedlist,'newstate','unorderedlist')

    @newstate('ol-decimal')
    def make_ol_decimal(parser):
        return make_orderedlist(parser, "decimal", "ol-decimal")

    @newstate('ol-alpha')
    def make_ol_alpha(parser):
        return make_orderedlist(parser, "lower-alpha", "ol-alpha")

    @newstate('ol-roman')
    def make_ol_roman(parser):
        return make_orderedlist(parser, "lower-roman", "ol-roman")

    @newstate('listitem')
    def make_listitem(parser):
        chunk = parser.reader.next()
        (listtype, ordinal, separator, rest) = analyze_listitem(chunk)
        li = elements.ListItem(ordinal=ordinal)
        li.append(rest)
        return parser.make_children(li)

    # NOTE: no @newstate decorator for these -- we transition from
    # one state to the next, not push a new state onto the stack
    def make_state_a(parser):
        return elements.Paragraph([parser.reader.next().strip()],
                                  id="state-a")

    def make_state_b(parser):
        return elements.Paragraph([parser.reader.next().strip()],
                                  id="state-b")

    def make_state_c(parser):
        return elements.Paragraph([parser.reader.next().strip()],
                                  id="state-c")

    # HELPERS

    def section_segments_count(s):
        # "1.2.3" -> 3; None -> False
        return ((s is not None) and
                len(list(filter(None, s.split(".")))))

    def make_orderedlist(parser, listtype, childstate):
        # NOTE: both arguments are effectively unused -- listtype is
        # re-derived from the next chunk and the child state is always
        # "listitem"; kept for call-site compatibility.
        listtype = analyze_listitem(parser.reader.peek())[0]
        ol = elements.OrderedList(type=listtype)
        ol.append(parser.make_child(make_listitem, "listitem"))
        return parser.make_children(ol)

    # matches
    # "1 Blahonga"
    # "1.2.3. This is a subsubsection"
    re_sectionstart = re.compile(r"^(\d[\.\d]*) +(.*[^\.])$").match

    def analyze_sectionstart(chunk):
        m = re_sectionstart(chunk)
        if m:
            return (m.group(1).rstrip("."), m.group(2).strip())
        else:
            return (None, chunk)

    def analyze_listitem(chunk):
        # returns: same as list-style-type in CSS2.1, sans
        # 'georgian', 'armenian' and 'greek', plus 'dashed'
        listtype = ordinal = separator = rest = None
        # match "1. Foo..." or "14) bar..." but not "4 This is a heading"
        m = re.match(r'^(\d+)([\.\)]) +', chunk)
        if m:
            if chunk.startswith("0"):
                listtype = "decimal-leading-zero"
            else:
                listtype = "decimal"
            (ordinal, separator) = m.groups()
            rest = chunk[m.end():]
            return (listtype, ordinal, separator, rest)
        # match "IX. Foo..." or "vii) bar..." but not "vi is a sucky
        # editor" or "MMXIII is the current year"
        m = re.match(r'^([IVXivx]+)([\.\)]) +', chunk)
        if m:
            if chunk[0].islower():
                listtype = 'lower-roman'
            else:
                listtype = 'upper-roman'
            (ordinal, separator) = m.groups()
            rest = chunk[m.end():]
            return (listtype, ordinal, separator, rest)
        # match "a. Foo..." or "z) bar..." but not "to. Next sentence..."
        m = re.match(r'^([A-Za-z])([\.\)]) +', chunk)
        if m:
            if chunk[0].islower():
                listtype = 'lower-alpha'
            else:
                listtype = 'upper-alpha'
            (ordinal, separator) = m.groups()
            rest = chunk[m.end():]
            return (listtype, ordinal, separator, rest)
        if chunk.startswith("* "):
            return ("disc", None, None, chunk)
        if chunk.startswith("- "):
            return ("dash", None, None, chunk)
        return (listtype, ordinal, separator, chunk)  # None * 3

    # MAIN CODE
    p = FSMParser()
    p.set_recognizers(is_li_decimal,
                      is_li_roman,
                      is_li_alpha,
                      is_header,
                      is_section,
                      is_subsection,
                      is_subsubsection,
                      is_preformatted,
                      is_definition,
                      is_description,
                      is_state_a,
                      is_state_b,
                      is_state_c,
                      is_paragraph)
    p.set_transitions({
        ("body", is_paragraph): (make_paragraph, None),
        ("body", is_section): (make_section, "section"),
        ("body", is_state_a): (make_state_a, "state-a"),
        ("state-a", is_state_b): (make_state_b, "state-b"),
        ("state-b", is_state_c): (make_state_c, "state-c"),
        ("state-c", is_section): (False, None),
        ("section", is_paragraph): (make_paragraph, None),
        ("section", is_subsection): (make_subsection, "subsection"),
        ("subsection", is_paragraph): (make_paragraph, None),
        ("subsection", is_subsection): (False, None),
        ("subsection", is_state_a): (False, "body"),
        ("subsection", is_subsubsection): (make_subsubsection,
                                           "subsubsection"),
        ("subsubsection", is_paragraph): (make_paragraph, None),
        ("subsubsection", is_section): (False, None),
        ("subsection", is_section): (False, None),
        ("section", is_section): (False, None),
        ("body", is_li_decimal): (make_ol_decimal, "ol-decimal"),
        ("ol-decimal", is_li_decimal): (make_listitem, "listitem"),
        ("ol-decimal", is_li_alpha): (make_ol_alpha, "ol-alpha"),
        ("ol-alpha", is_li_alpha): (make_listitem, "listitem"),
        ("ol-alpha", is_li_roman): (make_ol_roman, "ol-roman"),
        ("ol-roman", is_li_roman): (make_listitem, "listitem"),
        ("ol-roman", is_li_alpha): (False, None),
        ("ol-alpha", is_li_decimal): (False, None),
        ("listitem", is_li_alpha): sublist_or_parent,
        ("listitem", is_li_roman): sublist_or_parent,
        ("listitem", is_li_decimal): sublist_or_parent,
    })
    p.debug = debug
    tr = TextReader(filename, encoding="utf-8", linesep=TextReader.UNIX)
    p.initial_state = "body"
    p.initial_constructor = make_body
    b = p.parse(tr.getiterator(tr.readparagraph))
    return p, b