def get_parser(self, basefile, sanitized, parseconfig="default"):
    def is_heading(parser):
        return parser.reader.peek().font.size == 17

    def is_dnr(parser):
        chunk = parser.reader.peek()
        if (chunk.font.size == 12 and
                re.match(r'\d+-\d{2,4}', str(chunk))):
            return True

    def is_datum(parser):
        chunk = parser.reader.peek()
        if (chunk.font.size == 12 and
                re.match(r'\d{4}-\d{2}-\d{2}', str(chunk))):
            return True

    def is_nonessential(parser):
        chunk = parser.reader.peek()
        if chunk.top >= 1159 or chunk.top <= 146:
            return True

    def is_abstract(parser):
        if str(parser.reader.peek()).startswith("Beslutet i korthet:"):
            return True

    def is_section(parser):
        chunk = parser.reader.peek()
        strchunk = str(chunk)
        if (chunk.font.size == 14 and chunk[0].tag == "b" and
                not strchunk.endswith(".")):
            return True

    def is_blockquote(parser):
        chunk = parser.reader.peek()
        if chunk.left >= 255:
            return True

    def is_normal(parser):
        chunk = parser.reader.peek()
        if chunk.left < 255:
            return True

    def is_paragraph(parser):
        return True

    @decorators.newstate("body")
    def make_body(parser):
        return parser.make_children(Body())

    def make_heading(parser):
        # h = Heading(str(parser.reader.next()).strip())
        h = Meta([str(parser.reader.next()).strip()],
                 predicate=DCTERMS.title,
                 lang="sv")
        return h

    @decorators.newstate("abstract")
    def make_abstract(parser):
        a = Abstract([Paragraph(parser.reader.next())])
        return parser.make_children(a)

    @decorators.newstate("section")
    def make_section(parser):
        s = UnorderedSection(title=str(parser.reader.next()).strip())
        return parser.make_children(s)

    @decorators.newstate("blockquote")
    def make_blockquote(parser):
        b = Blockquote()
        return parser.make_children(b)

    def make_paragraph(parser):
        # A Paragraph containing PDFReader.Textelement objects will
        # render these as <span> elements (the default rendering). A
        # PDFReader.Textbox object containing the same will render
        # unstyled Textelements as plain strings, cutting down on
        # unnecessary <span> elements. However, those themselves
        # render with unnecessary @style and @class attributes, which
        # we don't want. For now, let's stick with Paragraphs as
        # containers and maybe later figure out how to get
        # PDFReader.Textelements to render themselves sanely.
        #
        # p = parser.reader.next()
        p = Paragraph(parser.reader.next())
        return p

    def make_datum(parser):
        datestr = str(parser.reader.next()).strip()
        year = int(datestr.split("-")[0])
        if 2100 > year > 1970:
            parser.remove_recognizer(is_datum)
            d = [datestr]
            return Meta(d, predicate=RPUBL.avgorandedatum, datatype=XSD.date)
        else:
            self.log.warning("Year in %s doesn't look valid" % datestr)
            return None

    def make_dnr(parser):
        parser.remove_recognizer(is_dnr)
        ds = str(parser.reader.next()).strip().split(" ")
        return Meta(ds, predicate=RPUBL.diarienummer)

    def skip_nonessential(parser):
        parser.reader.next()  # return nothing

    p = FSMParser()
    p.initial_state = "body"
    p.initial_constructor = make_body
    p.set_recognizers(is_datum,
                      is_dnr,
                      is_nonessential,
                      is_heading,
                      is_abstract,
                      is_section,
                      is_normal,
                      is_blockquote,
                      is_paragraph)
    p.set_transitions({("body", is_heading): (make_heading, None),
                       ("body", is_nonessential): (skip_nonessential, None),
                       ("body", is_datum): (make_datum, None),
                       ("body", is_dnr): (make_dnr, None),
                       ("body", is_abstract): (make_abstract, "abstract"),
                       ("body", is_section): (make_section, "section"),
                       ("body", is_blockquote): (make_blockquote, "blockquote"),
                       ("body", is_paragraph): (make_paragraph, None),
                       ("abstract", is_paragraph): (make_paragraph, None),
                       ("abstract", is_section): (False, None),
                       ("abstract", is_dnr): (False, None),
                       ("abstract", is_datum): (False, None),
                       ("section", is_paragraph): (make_paragraph, None),
                       ("section", is_nonessential): (skip_nonessential, None),
                       ("section", is_section): (False, None),
                       ("section", is_blockquote): (make_blockquote, "blockquote"),
                       ("section", is_datum): (make_datum, None),
                       ("section", is_dnr): (make_dnr, None),
                       ("blockquote", is_blockquote): (make_paragraph, None),
                       ("blockquote", is_nonessential): (skip_nonessential, None),
                       ("blockquote", is_section): (False, None),
                       ("blockquote", is_normal): (False, None),
                       ("blockquote", is_datum): (make_datum, None),
                       ("blockquote", is_dnr): (make_dnr, None),
                       })
    p.debug = os.environ.get('FERENDA_FSMDEBUG', False)
    return p.parse
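
# Illustrative sketch, not part of the original source: the callable returned
# by get_parser above is the parse method of a configured FSMParser, driven by
# an iterable of chunks (PDFReader textboxes in this case). The self-contained
# example below applies the same recognizer/constructor/transition pattern to
# plain strings, assuming the ferenda imports that the code above uses
# implicitly; every name starting with _demo is hypothetical.

import re

from ferenda import FSMParser
from ferenda import elements
from ferenda.decorators import newstate


def _demo_parse(chunks):
    def is_sectionstart(parser):
        # a chunk like "1 Introduction" starts a section
        return re.match(r"^\d+ ", parser.reader.peek()) is not None

    def is_paragraph(parser):
        return True

    @newstate("body")
    def make_body(parser):
        return parser.make_children(elements.Body())

    @newstate("section")
    def make_section(parser):
        ordinal, title = parser.reader.next().split(" ", 1)
        s = elements.Section(ordinal=ordinal, title=title)
        return parser.make_children(s)

    def make_paragraph(parser):
        return elements.Paragraph([parser.reader.next()])

    p = FSMParser()
    p.set_recognizers(is_sectionstart, is_paragraph)
    p.set_transitions({("body", is_sectionstart): (make_section, "section"),
                       ("body", is_paragraph): (make_paragraph, None),
                       ("section", is_paragraph): (make_paragraph, None),
                       ("section", is_sectionstart): (False, None)})
    p.initial_state = "body"
    p.initial_constructor = make_body
    return p.parse(chunks)


# _demo_parse(["1 Introduction", "Hello.", "2 Conclusion", "Bye."]) yields a
# Body containing two Section elements, each with one Paragraph child.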

def get_parser(self, basefile, sanitized, parseconfig="default"):
    def is_header(parser):
        p = parser.reader.peek()
        # older direktiv sources start with dir number
        if re.match(r'Dir\.? \d{4}:\d+$', p):
            return False
        return (headerlike(p) and
                not is_strecksats(parser, parser.reader.peek(2)))

    def is_strecksats(parser, chunk=None):
        if chunk is None:
            chunk = parser.reader.peek()
        return chunk.startswith(("--", "- "))

    def is_section(parser):
        (ordinal, headingtype, title) = analyze_sectionstart(parser)
        if ordinal:
            return headingtype == "h1"

    def is_subsection(parser):
        (ordinal, headingtype, title) = analyze_sectionstart(parser)
        if ordinal:
            return headingtype == "h2"

    def is_paragraph(parser):
        return True

    @newstate('body')
    def make_body(parser):
        return parser.make_children(Body())

    @newstate('section')
    def make_section(parser):
        chunk = parser.reader.next()
        ordinal, headingtype, title = analyze_sectionstart(parser, chunk)
        s = Avsnitt(ordinal=ordinal, title=title)
        return parser.make_children(s)

    @newstate('strecksats')
    def make_strecksatslista(parser):
        ul = Strecksatslista()
        li = make_listitem(parser)
        ul.append(li)
        res = parser.make_children(ul)
        return res

    def make_listitem(parser):
        chunk = parser.reader.next()
        s = str(chunk)
        if " " in s:
            # assume text before first space is the bullet
            s = s.split(" ", 1)[1]
        else:
            # assume the bullet is a single char
            s = s[1:]
        return Strecksatselement([s])

    def make_header(parser):
        return Heading([parser.reader.next()])

    def make_paragraph(parser):
        return Paragraph([parser.reader.next()])

    @newstate('unorderedsection')
    def make_unorderedsection(parser):
        s = UnorderedSection(title=parser.reader.next().strip())
        return parser.make_children(s)

    def headerlike(p):
        return (p[0].lower() != p[0] and
                len(p) < 150 and
                not (p.endswith(".") and
                     not (p.endswith("m.m.") or
                          p.endswith("m. m.") or
                          p.endswith("m.fl.") or
                          p.endswith("m. fl."))))

    re_sectionstart = re.compile(r"^(\d[\.\d]*) +([A-ZÅÄÖ].*)$").match

    def analyze_sectionstart(parser, chunk=None):
        """returns (ordinal, headingtype, text) if it looks like a section
        heading, (None, None, chunk) otherwise."""
        if chunk is None:
            chunk = parser.reader.peek()
        m = re_sectionstart(chunk)
        if m and headerlike(m.group(2)):
            return (m.group(1),
                    "h" + str(m.group(1).count(".") + 1),
                    m.group(2).strip())
        else:
            return None, None, chunk

    p = FSMParser()
    if parseconfig == "simple":
        recognizers = [is_header, is_strecksats, is_paragraph]
    else:
        recognizers = [is_section, is_subsection, is_header,
                       is_strecksats, is_paragraph]
    p.set_recognizers(*recognizers)
    commonstates = ("body", "section", "subsection", "unorderedsection")
    p.set_transitions({(commonstates, is_paragraph): (make_paragraph, None),
                       (commonstates, is_strecksats): (make_strecksatslista,
                                                       "strecksats"),
                       (commonstates, is_header): (make_unorderedsection,
                                                   "unorderedsection"),
                       (commonstates, is_section): (make_section, "section"),
                       ("unorderedsection", is_header): (False, None),
                       ("unorderedsection", is_section): (False, None),
                       ("strecksats", is_paragraph): (False, None),
                       ("strecksats", is_strecksats): (make_listitem, None),
                       ("section", is_header): (False, None),
                       ("section", is_section): (False, None),
                       ("section", is_subsection): (make_section, "subsection"),
                       ("subsection", is_subsection): (False, None),
                       ("subsection", is_section): (False, None)})
    p.initial_state = "body"
    p.initial_constructor = make_body
    p.debug = os.environ.get('FERENDA_FSMDEBUG', False)
    return p.parse
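
# Illustrative, self-contained sketch (not part of the original source) of the
# ordinal/heading-level split performed by analyze_sectionstart above. It
# reuses the same regex but omits the headerlike() filter; the name
# _split_sectionstart is hypothetical.

import re

_sectionstart = re.compile(r"^(\d[\.\d]*) +([A-ZÅÄÖ].*)$").match


def _split_sectionstart(chunk):
    m = _sectionstart(chunk)
    if m:
        ordinal = m.group(1)
        headingtype = "h" + str(ordinal.count(".") + 1)
        return ordinal, headingtype, m.group(2).strip()
    return None, None, chunk


# _split_sectionstart("1 Sammanfattning")  -> ("1", "h1", "Sammanfattning")
# _split_sectionstart("1.2 Uppdraget")     -> ("1.2", "h2", "Uppdraget")
# _split_sectionstart("Vanlig brödtext.")  -> (None, None, "Vanlig brödtext.")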

def get_parser(self, basefile, sanitized_body, parseconfig="default"):
    # a typical decision structure:
    #
    # [h1] Justitiekanslerns beslut
    #      ... text ...
    # [h2] Ärendet (h3)
    # [h3] Bakgrund (p/em)
    #      ... text ...
    # [h3] Anspråket
    #      ... text ...
    # [h3 class="reglering"] Rättslig reglering m.m. (p/strong)
    # [h2] Justitiekanslerns bedömning
    # [h3] Skadestånd
    # [h3] Tillsyn
    def is_section(parser):
        return parser.reader.peek().name == "h3"

    def is_subsection(parser):
        chunk = parser.reader.peek()
        return chunk.name == "p" and list(chunk.children)[0].name == "em"

    def is_special_subsection(parser):
        chunk = parser.reader.peek()
        return chunk.name == "p" and list(chunk.children)[0].name == "strong"

    def is_subsubsection(parser):
        chunk = parser.reader.peek()
        return chunk.name == "p" and list(chunk.children)[0].name == "u"

    def is_paragraph(parser):
        return True

    @newstate('body')
    def make_body(parser):
        return parser.make_children(Body())

    @newstate('section')
    def make_section(parser):
        s = AnonSektion(title=parser.reader.next().get_text())
        return parser.make_children(s)

    @newstate('subsection')
    def make_subsection(parser):
        s = AnonSektion(title=parser.reader.next().get_text())
        return parser.make_children(s)

    @newstate('special_subsection')
    def make_special_subsection(parser):
        s = AnonSektion(title=parser.reader.next().get_text())
        return parser.make_children(s)

    @newstate('subsubsection')
    def make_subsubsection(parser):
        s = AnonSektion(title=parser.reader.next().get_text())
        return parser.make_children(s)

    def make_paragraph(parser):
        # FIXME: this strips out formatting tags. NB: this is now a
        # SFS stycke that has fragment_label, id/uri and other
        # crap. Let's see if it still works!
        return AnonStycke([parser.reader.next().get_text()])

    p = FSMParser()
    # is_special_subsection must be registered for the special_subsection
    # transitions below to be reachable
    p.set_recognizers(is_section,
                      is_subsection,
                      is_special_subsection,
                      is_subsubsection,
                      is_paragraph)
    p.set_transitions({
        ("body", is_section): (make_section, "section"),
        ("section", is_section): (False, None),
        ("section", is_subsection): (make_subsection, "subsection"),
        ("section", is_special_subsection): (make_special_subsection,
                                             "special_subsection"),
        ("subsection", is_section): (False, None),
        ("subsection", is_subsection): (False, None),
        ("subsection", is_special_subsection): (False, None),
        ("subsection", is_subsubsection): (make_subsubsection, "subsubsection"),
        ("special_subsection", is_section): (False, None),
        ("special_subsection", is_subsection): (False, None),
        ("special_subsection", is_subsubsection): (make_subsubsection,
                                                   "subsubsection"),
        ("subsubsection", is_section): (False, None),
        ("subsubsection", is_special_subsection): (False, None),
        ("subsubsection", is_subsection): (False, None),
        ("subsubsection", is_subsubsection): (False, None),
        (("body", "section", "subsection", "subsubsection"), is_paragraph):
            (make_paragraph, None)
    })
    p.initial_state = "body"
    p.initial_constructor = make_body
    p.debug = os.environ.get('FERENDA_FSMDEBUG', False)
    return p.parse
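
# Illustration derived from the recognizers above (not from the original
# source): given the "typical decision structure" sketched in the comment,
# chunks are classified roughly as follows.
#
#   <h3>Ärendet</h3>                                 -> is_section            -> AnonSektion, state "section"
#   <p><em>Bakgrund</em></p>                         -> is_subsection         -> AnonSektion, state "subsection"
#   <p><strong>Rättslig reglering m.m.</strong></p>  -> is_special_subsection -> AnonSektion, state "special_subsection"
#   <p><u>...</u></p>                                -> is_subsubsection      -> AnonSektion, state "subsubsection"
#   any other <p>...</p>                             -> is_paragraph          -> AnonStycke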

def run_test_file(self, filename, debug=False):
    # some basic recognizers and constructors to parse a simple
    # structured plaintext format.
    #
    # RECOGNIZERS
    def is_header(parser):
        suspect = parser.reader.peek()
        return (len(suspect) > 100 and not suspect.endswith("."))

    def is_section(parser):
        (ordinal, title) = analyze_sectionstart(parser.reader.peek())
        return section_segments_count(ordinal) == 1

    def is_subsection(parser):
        (ordinal, title) = analyze_sectionstart(parser.reader.peek())
        return section_segments_count(ordinal) == 2

    def is_subsubsection(parser):
        (ordinal, title) = analyze_sectionstart(parser.reader.peek())
        return section_segments_count(ordinal) == 3

    def is_preformatted(parser):
        # runs of multiple spaces mark a chunk as preformatted
        return "  " in parser.reader.peek()

    def is_definition(parser):
        return False

    def is_description(parser):
        return False

    def is_li_decimal(parser):
        listtype = analyze_listitem(parser.reader.peek())[0]
        return listtype in ('decimal', 'decimal-leading-zero')

    def is_li_alpha(parser):
        listtype = analyze_listitem(parser.reader.peek())[0]
        return listtype in ('lower-alpha', 'upper-alpha')

    def is_li_roman(parser):
        listtype = analyze_listitem(parser.reader.peek())[0]
        return listtype in ('lower-roman', 'upper-roman')

    def is_unordereditem(parser):
        listtype = analyze_listitem(parser.reader.peek())[0]
        return listtype in ('disc', 'circle', 'square', 'dash')

    def is_state_a(parser):
        return parser.reader.peek().startswith("State A:")

    def is_state_b(parser):
        return parser.reader.peek().startswith("State B:")

    def is_state_c(parser):
        return parser.reader.peek().startswith("State C:")

    def is_paragraph(parser):
        # c.f. test/files/fsmparser/invalid.txt
        return len(parser.reader.peek()) > 6

    # MAGIC
    def sublist_or_parent(symbol, state_stack):
        constructor = False
        newstate = None
        if symbol == is_li_alpha and "ol-alpha" not in state_stack:
            # maybe only check state_stack[-2]
            constructor = make_ol_alpha
            newstate = "ol-alpha"
        elif symbol == is_li_roman and "ol-roman" not in state_stack:
            constructor = make_ol_roman
            newstate = "ol-roman"
        elif symbol == is_li_decimal and "ol-decimal" not in state_stack:
            constructor = make_ol_decimal
            newstate = "ol-decimal"
        else:
            pass
        return (constructor, newstate)

    # CONSTRUCTORS
    @newstate('body')
    def make_body(parser):
        parser._debug("Hello")
        b = elements.Body()
        return parser.make_children(b)

    @newstate('section')
    def make_section(parser):
        (secnumber, title) = analyze_sectionstart(parser.reader.next())
        s = elements.Section(ordinal=secnumber, title=title)
        return parser.make_children(s)

    @newstate('subsection')
    def make_subsection(parser):
        (secnumber, title) = analyze_sectionstart(parser.reader.next())
        s = elements.Subsection(ordinal=secnumber, title=title)
        return parser.make_children(s)

    @newstate('subsubsection')
    def make_subsubsection(parser):
        (secnumber, title) = analyze_sectionstart(parser.reader.next())
        s = elements.Subsubsection(ordinal=secnumber, title=title)
        return parser.make_children(s)

    def make_paragraph(parser):
        return elements.Paragraph([parser.reader.next().strip()])

    def make_preformatted(parser):
        return elements.Preformatted([parser.reader.next()])

    # def make_unorderedlist(parser):
    #     listtype = analyze_listitem(parser.reader.peek())[0]
    #     assert ordinal is None
    #     ul = elements.UnorderedList(type=listtype)
    #     ul.append(parser.make_child(IN_UNORDEREDLIST))  # 1st element of list
    #     return parser.make_children(ul)
    # setattr(make_unorderedlist, 'newstate', 'unorderedlist')

    @newstate('ol-decimal')
    def make_ol_decimal(parser):
        return make_orderedlist(parser, "decimal", "ol-decimal")

    @newstate('ol-alpha')
    def make_ol_alpha(parser):
        return make_orderedlist(parser, "lower-alpha", "ol-alpha")

    @newstate('ol-roman')
    def make_ol_roman(parser):
        return make_orderedlist(parser, "lower-roman", "ol-roman")

    @newstate('listitem')
    def make_listitem(parser):
        chunk = parser.reader.next()
        (listtype, ordinal, separator, rest) = analyze_listitem(chunk)
        li = elements.ListItem(ordinal=ordinal)
        li.append(rest)
        return parser.make_children(li)

    # NOTE: no @newstate decorator for these -- we transition from
    # one state to the next, not push a new state onto the stack
    def make_state_a(parser):
        return elements.Paragraph([parser.reader.next().strip()], id="state-a")

    def make_state_b(parser):
        return elements.Paragraph([parser.reader.next().strip()], id="state-b")

    def make_state_c(parser):
        return elements.Paragraph([parser.reader.next().strip()], id="state-c")

    # HELPERS
    def section_segments_count(s):
        return ((s is not None) and
                len(list(filter(None, s.split(".")))))

    def make_orderedlist(parser, listtype, childstate):
        listtype = analyze_listitem(parser.reader.peek())[0]
        ol = elements.OrderedList(type=listtype)
        ol.append(parser.make_child(make_listitem, "listitem"))
        return parser.make_children(ol)

    # matches
    # "1 Blahonga"
    # "1.2.3. This is a subsubsection"
    re_sectionstart = re.compile(r"^(\d[\.\d]*) +(.*[^\.])$").match

    def analyze_sectionstart(chunk):
        m = re_sectionstart(chunk)
        if m:
            return (m.group(1).rstrip("."), m.group(2).strip())
        else:
            return (None, chunk)

    def analyze_listitem(chunk):
        # returns: same as list-style-type in CSS2.1, sans
        # 'georgian', 'armenian' and 'greek', plus 'dashed'
        listtype = ordinal = separator = rest = None
        # match "1. Foo…" or "14) bar…" but not "4 This is a heading"
        m = re.match(r'^(\d+)([\.\)]) +', chunk)
        if m:
            if chunk.startswith("0"):
                listtype = "decimal-leading-zero"
            else:
                listtype = "decimal"
            (ordinal, separator) = m.groups()
            rest = chunk[m.end():]
            return (listtype, ordinal, separator, rest)

        # match "IX. Foo…" or "vii) bar…" but not "vi is a sucky
        # editor" or "MMXIII is the current year"
        m = re.match(r'^([IVXivx]+)([\.\)]) +', chunk)
        if m:
            if chunk[0].islower():
                listtype = 'lower-roman'
            else:
                listtype = 'upper-roman'
            (ordinal, separator) = m.groups()
            rest = chunk[m.end():]
            return (listtype, ordinal, separator, rest)

        # match "a. Foo…" or "z) bar…" but not "to. Next sentence…"
        m = re.match(r'^([A-Za-z])([\.\)]) +', chunk)
        if m:
            if chunk[0].islower():
                listtype = 'lower-alpha'
            else:
                listtype = 'upper-alpha'
            (ordinal, separator) = m.groups()
            rest = chunk[m.end():]
            return (listtype, ordinal, separator, rest)

        if chunk.startswith("* "):
            return ("disc", None, None, chunk)
        if chunk.startswith("- "):
            return ("dash", None, None, chunk)

        return (listtype, ordinal, separator, chunk)  # None * 3

    # MAIN CODE
    p = FSMParser()
    p.set_recognizers(is_li_decimal,
                      is_li_roman,
                      is_li_alpha,
                      is_header,
                      is_section,
                      is_subsection,
                      is_subsubsection,
                      is_preformatted,
                      is_definition,
                      is_description,
                      is_state_a,
                      is_state_b,
                      is_state_c,
                      is_paragraph)
    p.set_transitions({
        ("body", is_paragraph): (make_paragraph, None),
        ("body", is_section): (make_section, "section"),
        ("body", is_state_a): (make_state_a, "state-a"),
        ("state-a", is_state_b): (make_state_b, "state-b"),
        ("state-b", is_state_c): (make_state_c, "state-c"),
        ("state-c", is_section): (False, None),
        ("section", is_paragraph): (make_paragraph, None),
        ("section", is_subsection): (make_subsection, "subsection"),
        ("subsection", is_paragraph): (make_paragraph, None),
        ("subsection", is_subsection): (False, None),
        ("subsection", is_state_a): (False, "body"),
        ("subsection", is_subsubsection): (make_subsubsection, "subsubsection"),
        ("subsubsection", is_paragraph): (make_paragraph, None),
        ("subsubsection", is_section): (False, None),
        ("subsection", is_section): (False, None),
        ("section", is_section): (False, None),
        ("body", is_li_decimal): (make_ol_decimal, "ol-decimal"),
        ("ol-decimal", is_li_decimal): (make_listitem, "listitem"),
        ("ol-decimal", is_li_alpha): (make_ol_alpha, "ol-alpha"),
        ("ol-alpha", is_li_alpha): (make_listitem, "listitem"),
        ("ol-alpha", is_li_roman): (make_ol_roman, "ol-roman"),
        ("ol-roman", is_li_roman): (make_listitem, "listitem"),
        ("ol-roman", is_li_alpha): (False, None),
        ("ol-alpha", is_li_decimal): (False, None),
        ("listitem", is_li_alpha): sublist_or_parent,
        ("listitem", is_li_roman): sublist_or_parent,
        ("listitem", is_li_decimal): sublist_or_parent,
    })
    p.debug = debug

    tr = TextReader(filename, encoding="utf-8", linesep=TextReader.UNIX)
    p.initial_state = "body"
    p.initial_constructor = make_body
    b = p.parse(tr.getiterator(tr.readparagraph))
    return p, b