def extract_head(self, fp, basefile):
    """Parse out the SFSR register that contains all amendments to
    the statute text, from HTML files."""
    # NB: We should really call self.store.register_path, but that
    # custom func isn't mocked by ferenda.testutil.RepoTester,
    # and downloaded_path is. So we call that one and munge it.
    filename = self.store.downloaded_path(basefile).replace(
        "/downloaded/", "/register/")
    with codecs.open(filename, encoding=self.source_encoding) as rfp:
        soup = bs4.BeautifulSoup(rfp.read(), "lxml")
    # do we really have a registry?
    notfound = soup.find(text="Sökningen gav ingen träff!")
    if notfound:
        raise InteExisterandeSFS(str(notfound))
    textheader = fp.read(2048)
    if not isinstance(textheader, str):
        # Depending on whether the fp is opened through standard
        # open() or bz2.BZ2File() in self.parse_open(), it might
        # return bytes or unicode strings. This seems to be a
        # problem in BZ2File (or how we use it). Just roll with it.
        textheader = textheader.decode(self.source_encoding)
    idx = textheader.index("\r\n" * 4)
    fp.seek(idx + 8)
    reader = TextReader(string=textheader, linesep=TextReader.DOS)
    subreader = reader.getreader(reader.readchunk, reader.linesep * 4)
    return soup, subreader.getiterator(subreader.readparagraph)
def parametric_test(self, filename):
    resultfilename = filename.replace(".txt", ".xml")
    debug = not os.path.exists(resultfilename)
    p, b = self.run_test_file(filename, debug)
    self.maxDiff = 4096
    if os.path.exists(resultfilename):
        with codecs.open(resultfilename, encoding="utf-8") as fp:
            result = fp.read().strip()
        # print(elements.serialize(b))
        if result != elements.serialize(b).strip():
            # re-run the parse but with debugging on
            print("============DEBUG OUTPUT================")
            p.debug = True
            tr = TextReader(filename, encoding="utf-8",
                            linesep=TextReader.UNIX)
            b = p.parse(tr.getiterator(tr.readparagraph))
            print("===============RESULT===================")
            print(elements.serialize(b))
            self.fail("========See output above=======")
        else:
            self.assertEqual(result, elements.serialize(b).strip())
    else:
        print("\nResult:\n" + elements.serialize(b))
        self.fail()
def extract_body(self, fp, basefile):
    bodystring = fp.read()
    # see comment in extract_head for why we must handle both
    # bytes- and str-files
    if not isinstance(bodystring, str):
        bodystring = bodystring.decode(self.source_encoding)
    reader = TextReader(string=bodystring, linesep=TextReader.DOS)
    reader.autostrip = True
    return reader
def download(self):
    self.log.debug("download: Start at %s" % self.start_url)
    indextext = requests.get(self.start_url).text
    reader = TextReader(string=indextext)  # see TextReader class
    iterator = reader.getiterator(reader.readparagraph)
    if not isinstance(self.config.downloadmax, (int, type(None))):
        self.config.downloadmax = int(self.config.downloadmax)
    for basefile in self.download_get_basefiles(iterator):
        self.download_single(basefile)
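Most snippets in this collection share one core pattern: wrap the raw text in a TextReader and consume it in paragraph-sized chunks via getiterator(readparagraph). A minimal standalone sketch of just that pattern (the sample string is made up):

from ferenda import TextReader

# Hypothetical two-paragraph sample; readparagraph() chunks on
# blank lines, so this yields two strings.
sample = "First paragraph,\nstill first.\n\nSecond paragraph.\n"
reader = TextReader(string=sample)
for para in reader.getiterator(reader.readparagraph):
    print(repr(para))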
def _find_uppdaterad_tom(self, sfsnr, filename=None, reader=None):
    if not reader:
        reader = TextReader(filename, encoding=self.source_encoding)
    try:
        reader.cue("Ändring införd:<b> t.o.m. SFS")
        l = reader.readline()
        m = re.search('(\d+:\s?\d+)', l)
        if m:
            return m.group(1)
        else:
            # if m is None, the SFS id is using a non-standard
            # formatting (eg 1996/613-first-version) -- interpret
            # it as if it didn't exist
            return sfsnr
    except IOError:
        return sfsnr  # the base SFS nr
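A minimal sketch of the cue()/readline() pattern used above, on made-up input: cue() moves the read position to the start of the given string, and readline() then returns the remainder of that line.

import re
from ferenda import TextReader

t = TextReader(string="...\nÄndring införd: t.o.m. SFS 2013:363\n...")
t.cue("t.o.m. SFS")
line = t.readline()   # "t.o.m. SFS 2013:363"
print(re.search(r'(\d+:\s?\d+)', line).group(1))  # -> "2013:363"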
def parametric_test(self, filename):
    # these options adjust the constructed URIs. By default, the
    # official rpubl URIs are minted.
    #
    # self.repo.config.localizeuri = True
    # self.repo.config.url = "http://example.org/"
    # self.repo.config.urlpath = ''

    # a few of the subclasses have specialized rules. Make sure we
    # instantiate the correct class
    repo = os.path.basename(filename).split("-")[0]
    basefile = os.path.splitext(os.path.basename(filename))[0].replace(
        "-", "/", 1).replace("-", ":")
    repoclass = self.aliases[repo]
    self.repo = repoclass(
        datadir=self.datadir,
        storelocation=self.datadir + "/ferenda.sqlite",
        indexlocation=self.datadir + "/whoosh",
    )
    doc = self.repo.make_document(basefile)
    text = self.repo.sanitize_text(util.readfile(filename), basefile)
    reader = TextReader(string=text, encoding='utf-8')
    self.repo.parse_metadata_from_textreader(reader, doc)
    wantfile = filename.replace(".txt", ".n3")
    if os.path.exists(wantfile):
        self.assertEqualGraphs(wantfile, doc.meta, exact=False)
    else:
        self.fail(
            "Expected a %s with the following content:\n\n%s" %
            (wantfile, doc.meta.serialize(format="n3").decode("utf-8")))
def parametric_test(self, filename):
    # these options adjust the constructed URIs. By default, the
    # official rpubl URIs are minted.
    #
    # self.repo.config.localizeuri = True
    # self.repo.config.url = "http://example.org/"
    # self.repo.config.urlpath = ''

    # a few of the subclasses have specialized rules. Make sure we
    # instantiate the correct class
    repo, basefile = self.parse_filename(filename)
    doc = repo.make_document(basefile)
    text = repo.sanitize_text(util.readfile(filename), basefile)
    reader = TextReader(string=text, encoding='utf-8')
    props = repo.extract_metadata(reader, basefile)
    props = repo.sanitize_metadata(props, basefile)
    resource = repo.polish_metadata(props, basefile)
    repo.infer_metadata(resource, basefile)
    wantfile = filename.replace(".txt", ".n3")
    if os.path.exists(wantfile):
        self.assertEqualGraphs(wantfile, resource.graph, exact=False)
    else:
        self.fail(
            "Expected a %s with the following content:\n\n%s" %
            (wantfile, doc.meta.serialize(format="n3").decode("utf-8")))
def extract_body(self, fp, basefile):
    rawtext = fp.read().decode(self.source_encoding)
    # remove whitespace on otherwise empty lines
    rawtext = re.sub("\n\t\n", "\n\n", rawtext)
    reader = TextReader(string=rawtext, linesep=TextReader.UNIX)
    return reader
def find_version(f):
    # need to look at the file to find out its version
    encoding = self._sniff_encoding(f)
    with open(f, 'rb') as fp:
        raw = fp.read(8000)
    text = unescape(raw.decode(encoding, errors="replace"))
    reader = TextReader(string=text)
    updated_to = self._find_uppdaterad_tom(basefile, reader=reader)
    return updated_to
def extract_body(self, fp, basefile):
    rawtext = fp.read()
    if isinstance(rawtext, bytes):
        # happens when creating the intermediate file
        rawtext = rawtext.decode(self.source_encoding)
    # remove whitespace on otherwise empty lines
    rawtext = re.sub("\n\t\n", "\n\n", rawtext)
    reader = TextReader(string=rawtext, linesep=TextReader.UNIX)
    return reader
def download(self, basefile=None):
    """Download RFCs, starting from http://www.ietf.org/download/rfc-index.txt"""
    if basefile and self.document_url_template:
        return self.download_single(basefile)
    res = requests.get(self.start_url)
    indextext = res.text
    reader = TextReader(string=indextext,
                        linesep=TextReader.UNIX)  # see TextReader class
    iterator = reader.getiterator(reader.readparagraph)
    for (basefile, url) in self.download_get_basefiles(iterator):
        try:
            if not os.path.exists(self.store.downloaded_path(basefile)):
                self.download_single(basefile)
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 404:
                # create an empty dummy file in order to
                # avoid looking for it over and over again:
                with open(self.store.downloaded_path(basefile), "w"):
                    pass
            else:
                raise
def testparser(testcase, parser, filename):
    """Helper function to test :py:class:`~ferenda.FSMParser` based parsers."""
    wantfilename = filename.replace(".txt", ".xml")
    if not os.path.exists(wantfilename) or 'FERENDA_FSMDEBUG' in os.environ:
        parser.debug = True
    tr = TextReader(filename, encoding="utf-8", linesep=TextReader.UNIX)
    b = parser.parse(tr.getiterator(tr.readparagraph))
    if 'FERENDA_FSMDEBUG' in os.environ:
        print(elements.serialize(b))
    testcase.maxDiff = 4096
    if os.path.exists(wantfilename):
        with codecs.open(wantfilename, encoding="utf-8") as fp:
            want = fp.read().strip()
        got = elements.serialize(b).strip()
        testcase.assertEqualXML(want, got)
    else:
        raise AssertionError("Want file not found. Result of parse:\n" +
                             elements.serialize(b))
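A sketch of how this helper might be invoked from a test case. The repo class, its get_parser() factory, and the file paths are hypothetical; the import location assumes the helper lives in ferenda.testutil as the docstring suggests.

import unittest
from ferenda.testutil import testparser  # assumed location of the helper

class ParserTest(unittest.TestCase):
    def test_basic(self):
        # MyRepo and its get_parser() factory are hypothetical; the
        # helper compares the parse of basic.txt against basic.xml
        parser = MyRepo().get_parser("basic")
        testparser(self, parser, "test/files/basic.txt")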
def parse_basefile(self, basefile):
    # create a Document instance with an initialized doc.meta RDFLib graph
    doc = self.make_document()
    intermediate_path = self.generic_path(basefile, 'intermediate', '.txt')
    downloaded_path = self.downloaded_path(basefile)
    doc.uri = self.canonical_uri(basefile)
    doc.lang = "sv"
    html = codecs.open(downloaded_path, encoding="iso-8859-1").read()
    header_chunk = util.extract_text(
        html, '<pre>\n <pre>', '<hr>', strip_tags=False)
    self.make_meta(header_chunk, doc.meta, doc.uri, basefile)
    util.writefile(intermediate_path,
                   util.extract_text(html, '<pre>', '</pre>'),
                   encoding="utf-8")
    reader = TextReader(intermediate_path, encoding="utf-8")
    reader.readparagraph()
    self.make_body(reader, doc.body)
    # Iterate through the body tree and find things to link to (see
    # EurlexTreaties.process_body for inspiration)
    self.process_body(doc.body, '', doc.uri)
    return doc
def extract_body(self, fp, basefile):
    if util.name_from_fp(fp).endswith((".txt", ".txt.bz2")):
        bodystring = fp.read()
        if isinstance(bodystring, bytes):
            # fp is opened in bytestream mode
            bodystring = bodystring.decode("utf-8")
        return TextReader(string=bodystring)
    else:
        reader = super(PropTrips, self).extract_body(fp, basefile)
        pdffile = self.store.downloaded_path(basefile,
                                             attachment="index.pdf")
        for page in reader:
            page.src = pdffile
        return reader
def make_document(self, basefile=None):
    doc = super(SFS, self).make_document(basefile)
    if basefile:  # toc_generate_page calls this w/o basefile
        # We need to get the uppdaterad_tom field to create a proper
        # URI. First create a throwaway reader and make sure we have
        # the intermediate file at the ready
        # FIXME: this is broken
        fp = self.downloaded_to_intermediate(basefile)
        t = TextReader(string=fp.read(2048))
        fp.close()
        uppdaterad_tom = self._find_uppdaterad_tom(basefile, reader=t)
        doc.uri = self.canonical_uri(basefile, uppdaterad_tom)
    return doc
def parametric_test(self, filename):
    self.maxDiff = None
    reader = TextReader(filename=filename, encoding='iso-8859-1',
                        linesep=TextReader.DOS)
    reader.autostrip = True
    # p.lagrum_parser = FakeParser()
    parser = self.p.get_parser("9999:998", reader)
    b = parser(reader)
    elements = self.p._count_elements(b)
    # FIXME: How was this used? Where should we plug skipfragments?
    if 'K' in elements and elements['K'] > 1 and elements['P1'] < 2:
        self.p.skipfragments = [
            ('rinfoex:avdelningnummer', 'rpubl:kapitelnummer'),
            ('rpubl:kapitelnummer', 'rpubl:paragrafnummer')]
    else:
        self.p.skipfragments = [('rinfoex:avdelningnummer',
                                 'rpubl:kapitelnummer')]
    # NB: _construct_ids won't look for references
    self.p.visit_node(b, self.p.construct_id,
                      {'basefile': '9999:998', 'uris': set()})
    self.p.visit_node(b, self.p.find_definitions, False, debug=False)
    self.p.lagrum_parser.parse_recursive(b)
    self._remove_uri_for_testcases(b)
    resultfilename = filename.replace(".txt", ".xml")
    if os.path.exists(resultfilename):
        with codecs.open(resultfilename, encoding="utf-8") as fp:
            result = fp.read().strip()
        self.assertEqual(result, serialize(b).strip())
    else:
        self.assertEqual("", serialize(b).strip())
    # reset the state of the repo...
    self.p.current_section = '0'
    self.p.current_headline_level = 0
def fsmparse(self, functionname, source):
    """Parse a list of text chunks using a named FSM parser and
    output the parse tree and final result to stdout.

    :param functionname: A function that returns a configured
                         :py:class:`~ferenda.FSMParser`
    :type functionname: str
    :param source: A file containing the text chunks, separated
                   by double newlines
    :type source: str
    """
    modulename, classname, methodname = functionname.rsplit(".", 2)
    __import__(modulename)
    m = sys.modules[modulename]
    for name, cls in inspect.getmembers(m, inspect.isclass):
        if name == classname:
            break
    method = getattr(cls, methodname)
    parser = method()
    parser.debug = True
    tr = TextReader(source)
    b = parser.parse(tr.getiterator(tr.readparagraph))
    print(serialize(b))
def importarchive(self, archivedir):
    """Import downloaded data from an archive of legacy lagen.nu data.

    In particular, creates proper archive storage for older
    versions of each text.
    """
    current = archived = 0
    for f in util.list_dirs(archivedir, ".html"):
        if not f.startswith("downloaded/sfs"):  # sfst or sfsr
            continue
        for regex in self.templ:
            m = re.match(regex, f)
            if not m:
                continue
            if "vcheck" in m.groupdict():
                # silently ignore
                break
            basefile = "%s:%s" % (m.group("byear"), m.group("bnum"))

            # need to look at the file to find out its version
            # text = t.extractfile(f).read(4000).decode("latin-1")
            text = open(f, 'rb').read(4000).decode("latin-1")
            reader = TextReader(string=text)
            updated_to = self._find_uppdaterad_tom(basefile,
                                                   reader=reader)
            if "vyear" in m.groupdict():
                # this file is marked as an archival version
                archived += 1
                version = updated_to
                if m.group("vyear") != "first":
                    exp = "%s:%s" % (m.group("vyear"), m.group("vnum"))
                    if version != exp:
                        self.log.warning("%s: Expected %s, found %s" %
                                         (f, exp, version))
            else:
                version = None
                current += 1
                de = DocumentEntry()
                de.basefile = basefile
                de.id = self.canonical_uri(basefile, updated_to)
                # fudge timestamps as best as we can from the file
                de.orig_created = datetime.fromtimestamp(
                    os.path.getctime(f))
                de.orig_updated = datetime.fromtimestamp(
                    os.path.getmtime(f))
                de.orig_url = self.document_url_template % locals()
                de.published = datetime.now()
                de.url = self.generated_url(basefile)
                de.title = "SFS %s" % basefile
                # de.set_content()
                # de.set_link()
                de.save(self.store.documententry_path(basefile))

            # this yields more reasonable basefiles, but they are not
            # backwards compatible -- skip them for now
            # basefile = basefile.replace("_", "").replace(".", "")
            if "type" in m.groupdict() and m.group("type") == "sfsr":
                dest = self.store.register_path(basefile)
                current -= 1  # to offset the previous increment
            else:
                dest = self.store.downloaded_path(basefile, version)
            self.log.debug("%s: extracting %s to %s" % (basefile, f, dest))
            util.ensure_dir(dest)
            shutil.copy2(f, dest)
            break
        else:
            self.log.warning("Couldn't process %s" % f)
    self.log.info("Extracted %s current versions and %s archived versions"
                  % (current, archived))
def parse(self, doc):
    # some very simple heuristic rules for determining
    # what an individual paragraph is

    def is_heading(p):
        # If it's on a single line and it isn't indented with spaces
        # it's probably a heading.
        if p.count("\n") == 0 and not p.startswith(" "):
            return True

    def is_pagebreak(p):
        # if it contains a form feed character, it represents a page break
        return "\f" in p

    # Parsing a document consists mainly of two parts:
    # 1: First we parse the body of text and store it in doc.body
    from ferenda.elements import Body, Preformatted, Title, Heading
    from ferenda import Describer
    reader = TextReader(self.store.downloaded_path(doc.basefile))

    # First paragraph of an RFC is always a header block
    header = reader.readparagraph()
    # Preformatted is a ferenda.elements class representing a
    # block of preformatted text. It is derived from the built-in
    # list type, and must thus be initialized with an iterable, in
    # this case a single-element list of strings. (Note: if you
    # try to initialize it with a string, because strings are
    # iterables as well, you'll end up with a list where each
    # character in the string is an element, which is not what you
    # want.)
    preheader = Preformatted([header])
    # doc.body is a ferenda.elements.Body class, which is also
    # derived from list, so it has (amongst others) the append
    # method. We build our document by adding to this root element.
    doc.body.append(preheader)

    # Second paragraph is always the title, and we don't include
    # this in the body of the document, since we'll add it to the
    # metadata -- once is enough
    title = reader.readparagraph()

    # After that, just iterate over the document and guess what
    # everything is. TextReader.getiterator is useful for
    # iterating through a text in other chunks than single lines
    for para in reader.getiterator(reader.readparagraph):
        if is_heading(para):
            # Heading is yet another of these ferenda.elements classes.
            doc.body.append(Heading([para]))
        elif is_pagebreak(para):
            # Just drop these remnants of a page-and-paper-based past
            pass
        else:
            # If we don't know that it's something else, it's a
            # preformatted section (the safest bet for RFC text).
            doc.body.append(Preformatted([para]))

    # 2: Then we create metadata for the document and store it in
    # doc.meta (in this case using the convenience
    # ferenda.Describer class).
    desc = Describer(doc.meta, doc.uri)
    # Set the rdf:type of the document
    desc.rdftype(self.rdf_type)
    # Set the title we've captured as the dcterms:title of the
    # document and specify that it is in English
    desc.value(self.ns['dcterms'].title, util.normalize_space(title),
               lang="en")
    # Construct the dcterms:identifier (eg "RFC 6991") for this
    # document from the basefile
    desc.value(self.ns['dcterms'].identifier, "RFC " + doc.basefile)
    # find and convert the publication date in the header to a datetime
    # object, and set it as the dcterms:issued date for the document
    re_date = re.compile(
        "(January|February|March|April|May|June|July|August|September"
        "|October|November|December) (\d{4})").search
    # This is a context manager that temporarily sets the system
    # locale to the "C" locale in order to be able to use strptime
    # with a string on the form "August 2013", even though the
    # system may use another locale.
    dt_match = re_date(header)
    if dt_match:
        with util.c_locale():
            dt = datetime.strptime(re_date(header).group(0), "%B %Y")
        pubdate = date(dt.year, dt.month, dt.day)
        # Note that using some python types (cf. datetime.date)
        # results in a datatyped RDF literal, ie in this case
        #   <http://localhost:8000/res/rfc/6994> dcterms:issued "2013-08-01"^^xsd:date
        desc.value(self.ns['dcterms'].issued, pubdate)

    # find any older RFCs that this document updates or obsoletes
    obsoletes = re.search("^Obsoletes: ([\d+, ]+)", header, re.MULTILINE)
    updates = re.search("^Updates: ([\d+, ]+)", header, re.MULTILINE)

    # Find the category of this RFC, store it as dcterms:subject
    cat_match = re.search("^Category: ([\w ]+?)( |$)", header, re.MULTILINE)
    if cat_match:
        desc.value(self.ns['dcterms'].subject, cat_match.group(1))

    for predicate, matches in ((self.ns['rfc'].updates, updates),
                               (self.ns['rfc'].obsoletes, obsoletes)):
        if matches is None:
            continue
        # add references between this document and these older rfcs,
        # using either rfc:updates or rfc:obsoletes
        for match in matches.group(1).strip().split(", "):
            uri = self.canonical_uri(match)
            # Note that this uses our own unofficial
            # namespace/vocabulary http://example.org/ontology/rfc/
            desc.rel(predicate, uri)

    # And now we're done. We don't need to return anything as
    # we've modified the Document object that was passed to
    # us. The calling code will serialize this modified object to
    # XHTML and RDF and store it on disk
    # end parse1

    # Now do it again
    reader.seek(0)
    reader.readparagraph()
    reader.readparagraph()
    doc.body = Body()
    doc.body.append(preheader)
    # doc.body.append(Title([util.normalize_space(title)]))
    # begin parse2
    from ferenda.elements import Section, Subsection, Subsubsection

    # More heuristic rules: Section headers start at the beginning
    # of a line and are numbered. Subsections and subsubsections
    # have dotted numbers, optionally with a trailing period, ie
    # '9.2.' or '11.3.1'
    def is_section(p):
        return re.match(r"\d+\.? +[A-Z]", p)

    def is_subsection(p):
        return re.match(r"\d+\.\d+\.? +[A-Z]", p)

    def is_subsubsection(p):
        return re.match(r"\d+\.\d+\.\d+\.? +[A-Z]", p)

    def split_sectionheader(p):
        # returns a tuple of title, ordinal, identifier
        ordinal, title = p.split(" ", 1)
        ordinal = ordinal.strip(".")
        return title.strip(), ordinal, "RFC %s, section %s" % (
            doc.basefile, ordinal)

    # Use a list as a simple stack to keep track of the nesting
    # depth of a document. Every time we create a Section,
    # Subsection or Subsubsection object, we push it onto the
    # stack (and clear the stack down to the appropriate nesting
    # depth). Every time we create some other object, we append it
    # to whatever object is at the top of the stack. As your rules
    # for representing the nesting of structure become more
    # complicated, you might want to use the
    # :class:`~ferenda.FSMParser` class, which lets you define
    # heuristic rules (recognizers), states and transitions, and
    # takes care of putting your structure together.
    stack = [doc.body]

    for para in reader.getiterator(reader.readparagraph):
        if is_section(para):
            title, ordinal, identifier = split_sectionheader(para)
            s = Section(title=title, ordinal=ordinal,
                        identifier=identifier)
            stack[1:] = []      # clear all but bottom element
            stack[0].append(s)  # add new section to body
            stack.append(s)     # push new section on top of stack
        elif is_subsection(para):
            title, ordinal, identifier = split_sectionheader(para)
            s = Subsection(title=title, ordinal=ordinal,
                           identifier=identifier)
            stack[2:] = []      # clear all but bottom two elements
            stack[1].append(s)  # add new subsection to current section
            stack.append(s)
        elif is_subsubsection(para):
            title, ordinal, identifier = split_sectionheader(para)
            s = Subsubsection(title=title, ordinal=ordinal,
                              identifier=identifier)
            stack[3:] = []       # clear all but bottom three
            stack[-1].append(s)  # add new subsubsection to current subsection
            stack.append(s)
        elif is_heading(para):
            stack[-1].append(Heading([para]))
        elif is_pagebreak(para):
            pass
        else:
            pre = Preformatted([para])
            stack[-1].append(pre)
    # end parse2

    # begin citation1
    from pyparsing import Word, CaselessLiteral, nums
    section_citation = (
        CaselessLiteral("section") +
        Word(nums + ".").setResultsName("Sec")).setResultsName("SecRef")
    rfc_citation = ("[RFC" + Word(nums).setResultsName("RFC") +
                    "]").setResultsName("RFCRef")
    section_rfc_citation = (section_citation + "of" +
                            rfc_citation).setResultsName("SecRFCRef")
    # end citation1

    # begin citation2
    def rfc_uriformatter(parts):
        uri = ""
        if 'RFC' in parts:
            uri += self.canonical_uri(parts['RFC'].lstrip("0"))
        if 'Sec' in parts:
            uri += "#S" + parts['Sec']
        return uri
    # end citation2

    # begin citation3
    from ferenda import CitationParser, URIFormatter
    citparser = CitationParser(section_rfc_citation,
                               section_citation,
                               rfc_citation)
    citparser.set_formatter(URIFormatter(("SecRFCRef", rfc_uriformatter),
                                         ("SecRef", rfc_uriformatter),
                                         ("RFCRef", rfc_uriformatter)))
    citparser.parse_recursive(doc.body)
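A quick standalone check of the citation grammar defined above, with the outermost results name dropped for brevity. The sample citation string is made up; the expected output is noted in the comment.

from pyparsing import Word, CaselessLiteral, nums

section_citation = (CaselessLiteral("section") +
                    Word(nums + ".").setResultsName("Sec"))
rfc_citation = ("[RFC" + Word(nums).setResultsName("RFC") + "]")
section_rfc_citation = (section_citation + "of" + rfc_citation)

parts = section_rfc_citation.parseString("section 4.1 of [RFC 2616]")
print(parts["Sec"], parts["RFC"])   # -> 4.1 2616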
def parse(self, doc):
    """Parse downloaded documents into structured XML and RDF."""
    reader = TextReader(self.store.downloaded_path(doc.basefile),
                        linesep=TextReader.UNIX)
    # Some more preprocessing: remove the faux-bold formatting
    # used in some RFCs (repetitions of characters interleaved
    # with backspace control sequences). Note: that is '\b' as in
    # backspace, not r'\b' as in word boundary
    # docstring = re.sub('.\b','',docstring)
    cleanparagraphs = (re.sub('.\b', '', x) for x in
                       reader.getiterator(reader.readparagraph))

    parser = self.get_parser(doc.basefile)
    if not self.config.fsmdebug:
        self.config.fsmdebug = 'FERENDA_FSMDEBUG' in os.environ
    parser.debug = self.config.fsmdebug
    doc.body = parser.parse(cleanparagraphs)

    header = doc.body.pop(0)  # body.findByClass(RFCHeader)
    title = " ".join(doc.body.pop(0).split())  # body.findByClass(DocHeader)
    for part in doc.body:
        if (isinstance(part, PreambleSection) and
                part.title == "Table of Contents"):
            doc.body.remove(part)
            break

    # create (RDF) metadata for the document. Note: the provided
    # basefile may be incorrect -- let whatever is in the header
    # override it
    realid = self.get_rfc_num(header)
    if not realid:  # eg RFC 100 -- fall back to basefile in that case
        realid = doc.basefile
    doc.uri = self.canonical_uri(realid)
    desc = Describer(doc.meta, doc.uri)
    desc.value(self.ns['prov'].wasGeneratedBy,
               self.qualified_class_name())
    desc.value(self.ns['dcterms'].title, title, lang="en")
    self.parse_header(header, desc)
    # parse_header might have set .rdftype, but if not:
    try:
        desc.getrdftype()
    except KeyError:
        desc.rdftype(self.ns['rfc'].RFC)
    if not desc.getvalues(self.ns['dcterms'].identifier):
        desc.value(self.ns['dcterms'].identifier, "RFC %s" % doc.basefile)
    doc.lang = "en"

    # process body - remove the temporary Pagebreak objects, after
    # having extracted the shortTitle found in them
    shorttitle = self.cleanup_body(doc.body)
    if shorttitle and (desc.getvalue(self.ns['dcterms'].title) != shorttitle):
        desc.value(self.ns['bibo'].shortTitle, shorttitle, lang="en")

    # process body - add good metadata
    citparser = self.make_citation_parser()
    doc.body = citparser.parse_recursive(doc.body)
    PreambleSection.counter = 0
    # self.decorate_bodyparts(doc.body, doc.uri)
    if self.config.fsmdebug:
        print(serialize(doc.body))
    return True
def downloaded_to_intermediate(self, basefile, attachment=None):
    # Check to see if this might not be a proper SFS at all:
    # from time to time, other agencies publish their stuff in
    # SFS -- this seems to be handled by giving those documents
    # an SFS number on the form "N1992:31". Filter these out.
    if basefile.startswith('N'):
        raise IckeSFS("%s is not a regular SFS" % basefile)
    filename = self.store.downloaded_path(basefile)
    try:
        t = TextReader(filename, encoding=self.source_encoding)
    except IOError:
        self.log.warning("%s: Fulltext is missing" % basefile)
        # FIXME: This code needs to be rewritten
        baseuri = self.canonical_uri(basefile)
        if baseuri in registry:
            title = registry[baseuri].value(URIRef(baseuri),
                                            self.ns['dcterms'].title)
            desc.value(self.ns['dcterms'].title, title)
        desc.rel(self.ns['dcterms'].publisher,
                 self.lookup_resource("Regeringskansliet"))
        desc.value(self.ns['dcterms'].identifier, "SFS " + basefile)
        doc.body = Forfattning([Stycke(['Lagtext saknas'], id='S1')])
    # Check to see if the Författning has been revoked (using
    # plain fast string searching, no fancy HTML parsing and
    # traversing)
    if not self.config.keepexpired:
        try:
            t.cuepast('<i>Författningen är upphävd/skall upphävas: ')
            datestr = t.readto('</i></b>')
            if datetime.strptime(datestr, '%Y-%m-%d') < datetime.today():
                self.log.debug('%s: Expired' % basefile)
                raise UpphavdForfattning(
                    "%s is an expired SFS" % basefile,
                    dummyfile=self.store.parsed_path(basefile))
            t.seek(0)
        except IOError:
            t.seek(0)
    t.cuepast('<pre>')
    # unescape &auml; et al
    try:
        # this is the preferred way from py34 onwards. FIXME: Move
        # this to ferenda.compat
        import html
        txt = html.unescape(t.readto('</pre>'))
    except ImportError:
        # this is the old way.
        hp = HTMLParser()
        txt = hp.unescape(t.readto('</pre>'))
    if '\r\n' not in txt:
        txt = txt.replace('\n', '\r\n')
    re_tags = re.compile("</?\w{1,3}>")
    txt = re_tags.sub('', txt)
    # adding an ending CRLF aids in producing better diffs
    txt += "\r\n"
    util.writefile(self.store.intermediate_path(basefile), txt,
                   encoding=self.source_encoding)
    return codecs.open(self.store.intermediate_path(basefile),
                       encoding=self.source_encoding)
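The cuepast()/readto() combination used above, reduced to a minimal sketch on made-up data: cuepast() positions the reader just past the given string, and readto() returns everything up to the next occurrence of its argument.

from ferenda import TextReader

t = TextReader(string="junk<pre>the actual text</pre>more junk")
t.cuepast("<pre>")
print(t.readto("</pre>"))   # -> "the actual text"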
def run_test_file(self, filename, debug=False):
    # some basic recognizers and constructors to parse a simple
    # structured plaintext format.
    #
    # RECOGNIZERS
    def is_header(parser):
        suspect = parser.reader.peek()
        return (len(suspect) > 100 and not suspect.endswith("."))

    def is_section(parser):
        (ordinal, title) = analyze_sectionstart(parser.reader.peek())
        return section_segments_count(ordinal) == 1

    def is_subsection(parser):
        (ordinal, title) = analyze_sectionstart(parser.reader.peek())
        return section_segments_count(ordinal) == 2

    def is_subsubsection(parser):
        (ordinal, title) = analyze_sectionstart(parser.reader.peek())
        return section_segments_count(ordinal) == 3

    def is_preformatted(parser):
        # runs of multiple spaces indicate preformatted text
        return "  " in parser.reader.peek()

    def is_definition(parser):
        return False

    def is_description(parser):
        return False

    def is_li_decimal(parser):
        listtype = analyze_listitem(parser.reader.peek())[0]
        return listtype in ('decimal', 'decimal-leading-zero')

    def is_li_alpha(parser):
        listtype = analyze_listitem(parser.reader.peek())[0]
        return listtype in ('lower-alpha', 'upper-alpha')

    def is_li_roman(parser):
        listtype = analyze_listitem(parser.reader.peek())[0]
        return listtype in ('lower-roman', 'upper-roman')

    def is_unordereditem(parser):
        listtype = analyze_listitem(parser.reader.peek())[0]
        return listtype in ('disc', 'circle', 'square', 'dash')

    def is_state_a(parser):
        return parser.reader.peek().startswith("State A:")

    def is_state_b(parser):
        return parser.reader.peek().startswith("State B:")

    def is_state_c(parser):
        return parser.reader.peek().startswith("State C:")

    def is_paragraph(parser):
        # c.f. test/files/fsmparser/invalid.txt
        return len(parser.reader.peek()) > 6

    # MAGIC
    def sublist_or_parent(symbol, state_stack):
        constructor = False
        newstate = None
        if symbol == is_li_alpha and "ol-alpha" not in state_stack:
            # maybe only check state_stack[-2]
            constructor = make_ol_alpha
            newstate = "ol-alpha"
        elif symbol == is_li_roman and "ol-roman" not in state_stack:
            constructor = make_ol_roman
            newstate = "ol-roman"
        elif symbol == is_li_decimal and "ol-decimal" not in state_stack:
            constructor = make_ol_decimal
            newstate = "ol-decimal"
        return (constructor, newstate)

    # CONSTRUCTORS
    @newstate('body')
    def make_body(parser):
        parser._debug("Hello")
        b = elements.Body()
        return parser.make_children(b)

    @newstate('section')
    def make_section(parser):
        (secnumber, title) = analyze_sectionstart(parser.reader.next())
        s = elements.Section(ordinal=secnumber, title=title)
        return parser.make_children(s)

    @newstate('subsection')
    def make_subsection(parser):
        (secnumber, title) = analyze_sectionstart(parser.reader.next())
        s = elements.Subsection(ordinal=secnumber, title=title)
        return parser.make_children(s)

    @newstate('subsubsection')
    def make_subsubsection(parser):
        (secnumber, title) = analyze_sectionstart(parser.reader.next())
        s = elements.Subsubsection(ordinal=secnumber, title=title)
        return parser.make_children(s)

    def make_paragraph(parser):
        return elements.Paragraph([parser.reader.next().strip()])

    def make_preformatted(parser):
        return elements.Preformatted([parser.reader.next()])

    # def make_unorderedlist(parser):
    #     listtype = analyze_listitem(parser.reader.peek())[0]
    #     assert ordinal is None
    #     ul = elements.UnorderedList(type=listtype)
    #     ul.append(parser.make_child(IN_UNORDEREDLIST))  # 1st element of list
    #     return parser.make_children(ul)
    # setattr(make_unorderedlist, 'newstate', 'unorderedlist')

    @newstate('ol-decimal')
    def make_ol_decimal(parser):
        return make_orderedlist(parser, "decimal", "ol-decimal")

    @newstate('ol-alpha')
    def make_ol_alpha(parser):
        return make_orderedlist(parser, "lower-alpha", "ol-alpha")

    @newstate('ol-roman')
    def make_ol_roman(parser):
        return make_orderedlist(parser, "lower-roman", "ol-roman")

    @newstate('listitem')
    def make_listitem(parser):
        chunk = parser.reader.next()
        (listtype, ordinal, separator, rest) = analyze_listitem(chunk)
        li = elements.ListItem(ordinal=ordinal)
        li.append(rest)
        return parser.make_children(li)

    # NOTE: no @newstate decorator for these -- we transition from
    # one state to the next, not push a new state onto the stack
    def make_state_a(parser):
        return elements.Paragraph([parser.reader.next().strip()],
                                  id="state-a")

    def make_state_b(parser):
        return elements.Paragraph([parser.reader.next().strip()],
                                  id="state-b")

    def make_state_c(parser):
        return elements.Paragraph([parser.reader.next().strip()],
                                  id="state-c")

    # HELPERS
    def section_segments_count(s):
        return ((s is not None) and
                len(list(filter(None, s.split(".")))))

    def make_orderedlist(parser, listtype, childstate):
        listtype = analyze_listitem(parser.reader.peek())[0]
        ol = elements.OrderedList(type=listtype)
        ol.append(parser.make_child(make_listitem, "listitem"))
        return parser.make_children(ol)

    # matches
    # "1 Blahonga"
    # "1.2.3. This is a subsubsection"
    re_sectionstart = re.compile("^(\d[\.\d]*) +(.*[^\.])$").match

    def analyze_sectionstart(chunk):
        m = re_sectionstart(chunk)
        if m:
            return (m.group(1).rstrip("."), m.group(2).strip())
        else:
            return (None, chunk)

    def analyze_listitem(chunk):
        # returns: same as list-style-type in CSS2.1, sans
        # 'georgian', 'armenian' and 'greek', plus 'dash'
        listtype = ordinal = separator = rest = None
        # match "1. Foo…" or "14) bar…" but not "4 This is a heading"
        m = re.match('^(\d+)([\.\)]) +', chunk)
        if m:
            if chunk.startswith("0"):
                listtype = "decimal-leading-zero"
            else:
                listtype = "decimal"
            (ordinal, separator) = m.groups()
            rest = chunk[m.end():]
            return (listtype, ordinal, separator, rest)

        # match "IX. Foo…" or "vii) bar…" but not "vi is a sucky
        # editor" or "MMXIII is the current year"
        m = re.match('^([IVXivx]+)([\.\)]) +', chunk)
        if m:
            if chunk[0].islower():
                listtype = 'lower-roman'
            else:
                listtype = 'upper-roman'
            (ordinal, separator) = m.groups()
            rest = chunk[m.end():]
            return (listtype, ordinal, separator, rest)

        # match "a. Foo…" or "z) bar…" but not "to. Next sentence…"
        m = re.match('^([A-Za-z])([\.\)]) +', chunk)
        if m:
            if chunk[0].islower():
                listtype = 'lower-alpha'
            else:
                listtype = 'upper-alpha'
            (ordinal, separator) = m.groups()
            rest = chunk[m.end():]
            return (listtype, ordinal, separator, rest)

        if chunk.startswith("* "):
            return ("disc", None, None, chunk)
        if chunk.startswith("- "):
            return ("dash", None, None, chunk)

        return (listtype, ordinal, separator, chunk)  # None * 3

    # MAIN CODE
    p = FSMParser()
    p.set_recognizers(is_li_decimal, is_li_roman, is_li_alpha,
                      is_header, is_section, is_subsection,
                      is_subsubsection, is_preformatted,
                      is_definition, is_description,
                      is_state_a, is_state_b, is_state_c,
                      is_paragraph)
    p.set_transitions({
        ("body", is_paragraph): (make_paragraph, None),
        ("body", is_section): (make_section, "section"),
        ("body", is_state_a): (make_state_a, "state-a"),
        ("state-a", is_state_b): (make_state_b, "state-b"),
        ("state-b", is_state_c): (make_state_c, "state-c"),
        ("state-c", is_section): (False, None),
        ("section", is_paragraph): (make_paragraph, None),
        ("section", is_subsection): (make_subsection, "subsection"),
        ("subsection", is_paragraph): (make_paragraph, None),
        ("subsection", is_subsection): (False, None),
        ("subsection", is_state_a): (False, "body"),
        ("subsection", is_subsubsection): (make_subsubsection, "subsubsection"),
        ("subsubsection", is_paragraph): (make_paragraph, None),
        ("subsubsection", is_section): (False, None),
        ("subsection", is_section): (False, None),
        ("section", is_section): (False, None),
        ("body", is_li_decimal): (make_ol_decimal, "ol-decimal"),
        ("ol-decimal", is_li_decimal): (make_listitem, "listitem"),
        ("ol-decimal", is_li_alpha): (make_ol_alpha, "ol-alpha"),
        ("ol-alpha", is_li_alpha): (make_listitem, "listitem"),
        ("ol-alpha", is_li_roman): (make_ol_roman, "ol-roman"),
        ("ol-roman", is_li_roman): (make_listitem, "listitem"),
        ("ol-roman", is_li_alpha): (False, None),
        ("ol-alpha", is_li_decimal): (False, None),
        ("listitem", is_li_alpha): sublist_or_parent,
        ("listitem", is_li_roman): sublist_or_parent,
        ("listitem", is_li_decimal): sublist_or_parent,
    })
    p.debug = debug
    tr = TextReader(filename, encoding="utf-8", linesep=TextReader.UNIX)
    p.initial_state = "body"
    p.initial_constructor = make_body
    b = p.parse(tr.getiterator(tr.readparagraph))
    return p, b
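The same FSMParser wiring stripped to its minimum: one recognizer, one constructor, and a one-entry transition table. A sketch assuming only the APIs already used above (set_recognizers, set_transitions, initial_state/initial_constructor, parse, make_children, and the reader's next()); the sample string is made up.

from ferenda import FSMParser, TextReader
from ferenda import elements

def is_paragraph(parser):
    return True  # everything is a paragraph in this toy grammar

def make_body(parser):
    return parser.make_children(elements.Body())
make_body.newstate = 'body'

def make_paragraph(parser):
    return elements.Paragraph([parser.reader.next().strip()])

p = FSMParser()
p.set_recognizers(is_paragraph)
p.set_transitions({("body", is_paragraph): (make_paragraph, None)})
p.initial_state = "body"
p.initial_constructor = make_body
tr = TextReader(string="First chunk.\n\nSecond chunk.\n")
print(elements.serialize(p.parse(tr.getiterator(tr.readparagraph))))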