def htmlparser(chunks):
    b = Body()
    for block in chunks:
        tagtype = Preformatted if block.name == "pre" else Paragraph
        t = util.normalize_space(''.join(block.findAll(text=True)))
        block.extract()  # to avoid seeing it again
        if t:
            b.append(tagtype([t]))
    return b
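# A minimal usage sketch (assumed setup, not part of the original):
# htmlparser() expects already-parsed markup chunks, e.g. BeautifulSoup
# tags, where each chunk has .name, .findAll() and .extract().
from bs4 import BeautifulSoup

soup = BeautifulSoup("<pre>raw  text</pre><p>a paragraph</p>", "html.parser")
body = htmlparser(soup.find_all(["pre", "p"]))
# body is now roughly Body([Preformatted(["raw text"]),
#                           Paragraph(["a paragraph"])])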
def test_serialize_roundtrip(self):
    # Create an elements object tree
    tree = Body([Section([Paragraph(["Hello"]),
                          Paragraph(["World"])],
                         ordinal="1",
                         title="Main section"),
                 Section([42,
                          date(2013, 11, 27),
                          datetime(2013, 11, 27, 12, 0, 0),
                          b'bytestring',
                          {'foo': 'bar', 'x': 'y'}],
                         ordinal=2,
                         title="Native types")
                 ])
    # roundtrip using the default XML format
    serialized = serialize(tree)
    self.assertIsInstance(serialized, str)
    newtree = deserialize(serialized, caller_globals=globals())
    self.assertEqual(tree, newtree)

    # make another section with special (but commonly used) types
    # and try to roundtrip them. The XML serialization format does
    # not support this.
    graph = Graph().parse(data="""@prefix dcterms: <http://purl.org/dc/terms/> .

<http://example.org/1> dcterms:title "Hello world"@en .
""", format="turtle")
    parseresult = urlparser.parseString("http://example.org/1")
    tree.append(Section([parseresult, graph], meta=graph))

    # roundtrip using JSON (which uses fully qualified classnames,
    # so we don't need to pass globals() into deserialize())
    serialized = serialize(tree, format="json")
    self.assertIsInstance(serialized, str)
    newtree = deserialize(serialized, format="json")
    # two pyparsing.ParseResults objects cannot be directly
    # compared (they don't implement __eq__), therefore we compare
    # their XML representations
    tree[2][0] = util.parseresults_as_xml(tree[2][0])
    newtree[2][0] = util.parseresults_as_xml(newtree[2][0])
    self.assertEqual(tree, newtree)
def test_parse_existing(self):
    # make sure parse_recursive doesn't mess with existing structure.
    class MyHeader(UnicodeElement):
        pass

    doc = Body([MyHeader("My document"),
                Paragraph(["It's a very very fine document.",
                           MyHeader("Subheading"),
                           "And now we're done."])
                ])
    want = serialize(doc)

    # first test a blank CitationParser, w/o patterns or formatter
    cp = CitationParser()
    doccopy = deepcopy(doc)
    cp.parse_recursive(doccopy)
    got = serialize(doccopy)
    self.assertEqual(want, got)

    cp = CitationParser(ferenda.citationpatterns.url)
    cp.set_formatter(URIFormatter(("url", ferenda.uriformats.url)))
    doccopy = deepcopy(doc)
    cp.parse_recursive(doccopy)
    got = serialize(doccopy)
    self.assertEqual(want, got)
def toc_generate_page_body(self, documentlist, nav):
    # make a copy because toc_generate_page_body_thread will eat
    # it, and we need to reuse it
    documentlist = list(documentlist)
    # for item in documentlist:
    #     print(repr(str(item[0])) + ",")
    rootul = self.toc_generate_page_body_thread(documentlist)
    assert len(documentlist) == 0, \
        "toc_generate_page_body_thread left some items in the documentlist"
    uls = OrderedDict()
    # create one ul per two-char prefix (eg "Ab", "Ac", "Ad", "Af"
    # and so on)
    for li in rootul:
        strdoc = str(li)
        prefix = strdoc.replace(" ", "").replace("-", "")[:2].capitalize()
        # maybe clean even more, eg remove anything non-alphanumerical?
        if prefix not in uls:
            uls[prefix] = UnorderedList()
        currentul = uls[prefix]
        currentul.append(li)
    d = Div(**{'class': 'threecol'})
    for k, v in uls.items():
        if len(k) > 2:
            continue
        d.append(H2([k]))
        d.append(v)
    return Body([nav, d])
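# For example (hypothetical document labels, not from the original
# source): items rendering as "Abc 1", "Abel" and "Adam" get the
# two-char prefixes "Ab", "Ab" and "Ad", so the resulting three-column
# Div contains an H2 "Ab" with two entries followed by an H2 "Ad" with
# one entry.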
def parse(self, doc):
    # create a dummy text
    d = Describer(doc.meta, doc.uri)
    d.rdftype(self.rdf_type)
    d.value(self.ns['dcterms'].title, Literal(doc.basefile, lang=doc.lang))
    d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
    doc.body = Body()  # can be empty, all content is in doc.meta
    return True
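# For illustration only (hypothetical repo values): after the parse()
# above runs for basefile "123" with an English doc.lang and an rdf_type
# of foaf:Document, doc.meta holds triples along these lines (Turtle):
#
#   <doc.uri> a foaf:Document ;
#       dcterms:title "123"@en ;
#       prov:wasGeneratedBy "mymodule.MyRepo" .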
def test_parse_recursive(self):
    doc_citation = ("Doc" + Word(nums).setResultsName("ordinal")
                    + "/" +
                    Word(nums, exact=4).setResultsName("year")
                    ).setResultsName("DocRef")

    def doc_uri_formatter(parts):
        return "http://example.org/docs/%(year)s/%(ordinal)s/" % parts

    doc = Body([Heading(["About Doc 43/2012 and it's interpretation"]),
                Paragraph(["According to Doc 43/2012",
                           Footnote(["Available at http://example.org/xyz"]),
                           " the bizbaz should be frobnicated"])
                ])
    result = Body([Heading(["About ",
                            LinkSubject("Doc 43/2012",
                                        predicate="dcterms:references",
                                        uri="http://example.org/docs/2012/43/"),
                            " and it's interpretation"]),
                   Paragraph(["According to ",
                              LinkSubject("Doc 43/2012",
                                          predicate="dcterms:references",
                                          uri="http://example.org/docs/2012/43/"),
                              Footnote(["Available at ",
                                        LinkSubject("http://example.org/xyz",
                                                    predicate="dcterms:references",
                                                    uri="http://example.org/xyz")]),
                              " the bizbaz should be frobnicated"])
                   ])
    cp = CitationParser(ferenda.citationpatterns.url, doc_citation)
    cp.set_formatter(URIFormatter(("url", ferenda.uriformats.url),
                                  ("DocRef", doc_uri_formatter)))
    doc = cp.parse_recursive(doc)
    self.maxDiff = 4096
    self.assertEqual(serialize(doc), serialize(result))
def toc_generate_page_body(self, documentlist, nav):
    ul = UnorderedList([ListItem(x) for x in documentlist], role='main')
    dl = DL(**{'class': 'dl-horizontal'})
    for label, doclist in documentlist:
        dl.append(DT(label))
        for doc in doclist:
            dl.append(DD(doc))
    return Body([nav, dl])
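# The DL loop above assumes documentlist is (label, documents) pairs; a
# hypothetical example (titles and URIs made up), using
# ferenda.elements.Link elements:
documentlist = [("2012",
                 [Link("Doc 1/2012", uri="http://example.org/docs/2012/1"),
                  Link("Doc 2/2012", uri="http://example.org/docs/2012/2")])]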
def parse_from_pdfreader(self, pdfreader, doc):
    doc.body = Body([pdfreader])
    d = Describer(doc.meta, doc.uri)
    d.rdftype(self.rdf_type)
    d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
    return doc
def parse_pdfs(self, basefile, pdffiles):
    doc = Body()
    for pdffile in pdffiles:
        # FIXME: downloaded_path must be more fully mocked
        # (support attachments) by testutil.RepoTester. In the
        # meantime, we do some path munging ourselves
        pdf_path = self.store.downloaded_path(basefile).replace("index.html",
                                                                pdffile)
        intermediate_path = self.store.intermediate_path(basefile,
                                                         attachment=pdffile)
        intermediate_dir = os.path.dirname(intermediate_path)
        try:
            pdf = self.parse_pdf(pdf_path, intermediate_dir)
            for page in pdf:
                pass  # page.crop(left=50, top=0, bottom=900, right=700)
            doc.append(pdf)
        except ValueError:
            (exc_type, exc_value, exc_traceback) = sys.exc_info()
            self.log.warning("Ignoring exception %s (%s), skipping PDF %s" %
                             (exc_type, exc_value, pdffile))
    return doc
def postprocess_doc(self, doc):
    next_is_title = False
    newbody = Body()
    glue = lambda x, y, z: False
    for para in doc.body.textboxes(gluefunc=glue, pageobjects=True):
        strpara = str(para).strip()
        if strpara == "Kommittédirektiv":
            next_is_title = True
        elif next_is_title:
            doc.meta.add((URIRef(doc.uri), DCTERMS.title, Literal(strpara)))
            next_is_title = False
        elif strpara.startswith("Beslut vid regeringssammanträde den "):
            datestr = strpara[36:]  # length of above prefix
            if datestr.endswith("."):
                datestr = datestr[:-1]
            doc.meta.add((URIRef(doc.uri), DCTERMS.issued,
                          Literal(self.parse_swedish_date(datestr),
                                  datatype=XSD.date)))
        if isinstance(para, Page):
            newbody.append(Sidbrytning(ordinal=para.number,
                                       width=para.width,
                                       height=para.height,
                                       src=para.src))
        else:
            newbody.append(para)
    doc.body = newbody
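# A hypothetical stream of textboxes (contents made up) that the loop
# above would handle: the "Kommittédirektiv" marker, then the title,
# then the decision-date line:
#
#   "Kommittédirektiv"
#   "En utredning om exempel"
#   "Beslut vid regeringssammanträde den 27 november 2013."
#
# which would yield dcterms:title "En utredning om exempel" and
# dcterms:issued 2013-11-27 on doc.meta.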
def test_compound(self):
    x = CompoundElement(["hello", "world"], id="42", foo="bar")
    x.foo = "baz"
    with self.assertRaises(AttributeError):
        x.y = "z"
    # append a non-serializable object (in this case a function)
    x.append(os.listdir)
    self.assertEqual(
        b'<compoundelement xmlns="http://www.w3.org/1999/xhtml" id="42">'
        b'helloworld&lt;built-in function listdir&gt;</compoundelement>',
        etree.tostring(x.as_xhtml()))
    self.assertEqual(
        Body([Section([Paragraph(["Hello"]),
                       Paragraph(["World"])])]).as_plaintext(),
        "Hello World")
def test_serialize_pyparsing(self):
    # these objects can't be roundtripped
    from ferenda.citationpatterns import url
    x = url.parseString("http://example.org/foo?param=val")
    serialized = serialize(Body([x]))
    self.assertEqual("""<Body>
  <url>
    <netloc>example.org</netloc>
    <path>/foo</path>
    <query>param=val</query>
    <scheme>http</scheme>
  </url>
</Body>
""", serialized)
def parse(tokenstream):
    current_type = None
    body = Body()
    for p in tokenstream:
        new_type = guess_type(p, current_type)
        # if not new_type == None:
        #     print "Guessed %s for %r" % (new_type.__name__, p[:20])
        if new_type is None:
            pass
        elif new_type == Continuation and len(body) > 0:
            # Don't create a new text node, add this text to the last
            # text node created
            para = body.pop()
            para.append(p)
            body.append(para)
        else:
            if new_type == Continuation:
                new_type = Paragraph
            body.append(new_type([p]))
            current_type = new_type
    return body
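# A hypothetical guess_type() (not part of the original source), only to
# illustrate the contract parse() relies on: return an element class for
# a new node, Continuation to extend the previous node, or None to drop
# the chunk entirely.
class Continuation(object):
    pass

def guess_type(p, current_type):
    if not p.strip():
        return None          # blank chunk: ignore it
    if p.startswith(" "):
        return Continuation  # indented: continues the previous node
    return Paragraph         # otherwise: start a new paragraph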
def make_body(parser):
    return parser.make_children(Body())
def toc_generate_page_body(self, documentlist, nav):
    # move documentlist into an OrderedDict keyed on uri,
    # concatenating rpubl_konsolideringsunderlag as we go
    documents = OrderedDict()
    # make sure all rpubl:KonsolideradGrundforfattning come first in the list
    for row in documentlist:
        row = dict(row)
        if row['rdf_type'] == str(RPUBL.KonsolideradGrundforfattning):
            if row['uri'] not in documents:
                documents[row['uri']] = row
                # transform single value to a list, so we can
                # append more if other rows are about the same
                # rpubl:KonsolideradGrundforfattning
                row['rpubl_konsolideringsunderlag'] = [
                    row['rpubl_konsolideringsunderlag']]
            else:
                documents[row['uri']]['rpubl_konsolideringsunderlag'].append(
                    row['rpubl_konsolideringsunderlag'])
    # then the rest
    for row in documentlist:
        if row['rdf_type'] != str(RPUBL.KonsolideradGrundforfattning):
            documents[row['uri']] = row
    # now that we have all documents, check if some of them change
    # some others of them
    for uri in list(documents):
        row = documents[uri]
        if 'rpubl_andrar' in row:
            if row['rpubl_andrar'] not in documents:
                self.log.warning(
                    "%(uri)s: changes %(rpubl_andrar)s, but that doc "
                    "doesn't exist" % row)
                continue
            if 'andras_av' not in documents[row['rpubl_andrar']]:
                documents[row['rpubl_andrar']]['andras_av'] = []
            documents[row['rpubl_andrar']]['andras_av'].insert(0, uri)
            documents.move_to_end(uri)
    dl = html.DL(role='main')
    for uri in list(documents):
        if uri not in documents:
            continue  # we must have removed it earlier in the loop
        row = documents[uri]
        label = row.get('dcterms_title',
                        row.get('dcterms_identifier', '(Titel saknas)'))
        if row['dcterms_identifier'] not in label:
            label = "%s: %s" % (row['dcterms_identifier'], label)
        # in most cases we want to link this thing, but not if
        # this is the base act of a non-consolidated act (we link
        # to it in the DD element below instead)
        if (row['rdf_type'] == str(RPUBL.KonsolideradGrundforfattning) or
                'andras_av' not in row):
            label = Link(label, uri=uri)
        dl.append(html.DT([label]))
        # groups of base + change acts may be present whether we have
        # consolidated acts or not, and they might be grouped a
        # little differently, but we need to do the same things
        # with them.
        relevant_docs = []
        if row['rdf_type'] == str(RPUBL.KonsolideradGrundforfattning):
            relevant_docs = row['rpubl_konsolideringsunderlag']
        elif 'andras_av' in row:
            relevant_docs = [uri] + row['andras_av']
        if relevant_docs:
            fs = []
            for f in relevant_docs:
                if f in documents:
                    fs.append(Link(documents[f]['dcterms_identifier'],
                                   uri=documents[f]['uri']))
                    fs.append(", ")
                    del documents[f]
            if fs:
                dl.append(html.DD(["Grund- och ändringsförfattningar: ",
                                   *fs[:-1]]))
    return Body([nav, dl])
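# A hypothetical example (all values made up) of the kind of SPARQL
# result rows this method expects:
documentlist = [
    {'uri': 'http://example.org/sfs/1999:175',
     'rdf_type': str(RPUBL.KonsolideradGrundforfattning),
     'dcterms_identifier': 'SFS 1999:175',
     'dcterms_title': 'Exempelförordning',
     'rpubl_konsolideringsunderlag': 'http://example.org/sfs/1999:175/konsolideringsunderlag'},
]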
def parse(self, doc):
    # some very simple heuristic rules for determining
    # what an individual paragraph is
    def is_heading(p):
        # If it's on a single line and it isn't indented with spaces
        # it's probably a heading.
        if p.count("\n") == 0 and not p.startswith(" "):
            return True

    def is_pagebreak(p):
        # if it contains a form feed character, it represents a page break
        return "\f" in p

    # Parsing a document consists mainly of two parts:
    # 1: First we parse the body of text and store it in doc.body
    from ferenda.elements import Body, Preformatted, Title, Heading
    from ferenda import Describer
    reader = TextReader(self.store.downloaded_path(doc.basefile))

    # First paragraph of an RFC is always a header block
    header = reader.readparagraph()
    # Preformatted is a ferenda.elements class representing a
    # block of preformatted text. It is derived from the built-in
    # list type, and must thus be initialized with an iterable, in
    # this case a single-element list of strings. (Note: if you
    # try to initialize it with a string, because strings are
    # iterables as well, you'll end up with a list where each
    # character in the string is an element, which is not what you
    # want.)
    preheader = Preformatted([header])
    # doc.body is a ferenda.elements.Body class, which also is
    # derived from list, so it has (amongst others) the append
    # method. We build our document by adding to this root
    # element.
    doc.body.append(preheader)

    # Second paragraph is always the title, and we don't include
    # this in the body of the document, since we'll add it to the
    # metadata -- once is enough
    title = reader.readparagraph()

    # After that, just iterate over the document and guess what
    # everything is. TextReader.getiterator is useful for
    # iterating through a text in other chunks than single lines
    for para in reader.getiterator(reader.readparagraph):
        if is_heading(para):
            # Heading is yet another of these ferenda.elements
            # classes.
            doc.body.append(Heading([para]))
        elif is_pagebreak(para):
            # Just drop these remnants of a page-and-paper-based past
            pass
        else:
            # If we don't know that it's something else, it's a
            # preformatted section (the safest bet for RFC text).
            doc.body.append(Preformatted([para]))

    # 2: Then we create metadata for the document and store it in
    # doc.meta (in this case using the convenience
    # ferenda.Describer class).
    desc = Describer(doc.meta, doc.uri)
    # Set the rdf:type of the document
    desc.rdftype(self.rdf_type)
    # Set the title we've captured as the dcterms:title of the
    # document and specify that it is in English
    desc.value(self.ns['dcterms'].title,
               util.normalize_space(title),
               lang="en")
    # Construct the dcterms:identifier (eg "RFC 6991") for this
    # document from the basefile
    desc.value(self.ns['dcterms'].identifier, "RFC " + doc.basefile)
    # find and convert the publication date in the header to a
    # datetime object, and set it as the dcterms:issued date for
    # the document
    re_date = re.compile(
        r"(January|February|March|April|May|June|July|August|"
        r"September|October|November|December) (\d{4})").search
    # This is a context manager that temporarily sets the system
    # locale to the "C" locale in order to be able to use strptime
    # with a string on the form "August 2013", even though the
    # system may use another locale.
    dt_match = re_date(header)
    if dt_match:
        with util.c_locale():
            dt = datetime.strptime(re_date(header).group(0), "%B %Y")
        pubdate = date(dt.year, dt.month, dt.day)
        # Note that using some python types (cf. datetime.date)
        # results in a datatyped RDF literal, ie in this case
        # <http://localhost:8000/res/rfc/6994> dcterms:issued "2013-08-01"^^xsd:date
        desc.value(self.ns['dcterms'].issued, pubdate)

    # find any older RFCs that this document updates or obsoletes
    obsoletes = re.search(r"^Obsoletes: ([\d+, ]+)", header, re.MULTILINE)
    updates = re.search(r"^Updates: ([\d+, ]+)", header, re.MULTILINE)

    # Find the category of this RFC, store it as dcterms:subject
    cat_match = re.search(r"^Category: ([\w ]+?)(  |$)", header, re.MULTILINE)
    if cat_match:
        desc.value(self.ns['dcterms'].subject, cat_match.group(1))

    for predicate, matches in ((self.ns['rfc'].updates, updates),
                               (self.ns['rfc'].obsoletes, obsoletes)):
        if matches is None:
            continue
        # add references between this document and these older rfcs,
        # using either rfc:updates or rfc:obsoletes
        for match in matches.group(1).strip().split(", "):
            uri = self.canonical_uri(match)
            # Note that this uses our own unofficial
            # namespace/vocabulary
            # http://example.org/ontology/rfc/
            desc.rel(predicate, uri)

    # And now we're done. We don't need to return anything as
    # we've modified the Document object that was passed to
    # us. The calling code will serialize this modified object to
    # XHTML and RDF and store it on disk
    # end parse1

    # Now do it again
    reader.seek(0)
    reader.readparagraph()
    reader.readparagraph()
    doc.body = Body()
    doc.body.append(preheader)
    # doc.body.append(Title([util.normalize_space(title)]))
    # begin parse2
    from ferenda.elements import Section, Subsection, Subsubsection

    # More heuristic rules: Section headers start at the beginning
    # of a line and are numbered. Subsections and subsubsections
    # have dotted numbers, optionally with a trailing period, ie
    # '9.2.' or '11.3.1'
    def is_section(p):
        return re.match(r"\d+\.? +[A-Z]", p)

    def is_subsection(p):
        return re.match(r"\d+\.\d+\.? +[A-Z]", p)

    def is_subsubsection(p):
        return re.match(r"\d+\.\d+\.\d+\.? +[A-Z]", p)

    def split_sectionheader(p):
        # returns a tuple of title, ordinal, identifier
        ordinal, title = p.split(" ", 1)
        ordinal = ordinal.strip(".")
        return title.strip(), ordinal, "RFC %s, section %s" % (doc.basefile,
                                                               ordinal)

    # Use a list as a simple stack to keep track of the nesting
    # depth of a document. Every time we create a Section,
    # Subsection or Subsubsection object, we push it onto the
    # stack (and clear the stack down to the appropriate nesting
    # depth). Every time we create some other object, we append it
    # to whatever object is at the top of the stack. As your rules
    # for representing the nesting of structure become more
    # complicated, you might want to use the
    # :class:`~ferenda.FSMParser` class, which lets you define
    # heuristic rules (recognizers), states and transitions, and
    # takes care of putting your structure together.
    stack = [doc.body]

    for para in reader.getiterator(reader.readparagraph):
        if is_section(para):
            title, ordinal, identifier = split_sectionheader(para)
            s = Section(title=title, ordinal=ordinal, identifier=identifier)
            stack[1:] = []       # clear all but bottom element
            stack[0].append(s)   # add new section to body
            stack.append(s)      # push new section on top of stack
        elif is_subsection(para):
            title, ordinal, identifier = split_sectionheader(para)
            s = Subsection(title=title, ordinal=ordinal,
                           identifier=identifier)
            stack[2:] = []       # clear all but bottom two elements
            stack[1].append(s)   # add new subsection to current section
            stack.append(s)
        elif is_subsubsection(para):
            title, ordinal, identifier = split_sectionheader(para)
            s = Subsubsection(title=title, ordinal=ordinal,
                              identifier=identifier)
            stack[3:] = []       # clear all but bottom three
            stack[-1].append(s)  # add new subsubsection to current subsection
            stack.append(s)
        elif is_heading(para):
            stack[-1].append(Heading([para]))
        elif is_pagebreak(para):
            pass
        else:
            pre = Preformatted([para])
            stack[-1].append(pre)
    # end parse2

    # begin citation1
    from pyparsing import Word, CaselessLiteral, nums
    section_citation = (CaselessLiteral("section") +
                        Word(nums + ".").setResultsName("Sec")
                        ).setResultsName("SecRef")
    rfc_citation = ("[RFC" + Word(nums).setResultsName("RFC") +
                    "]").setResultsName("RFCRef")
    section_rfc_citation = (section_citation + "of" +
                            rfc_citation).setResultsName("SecRFCRef")
    # end citation1

    # begin citation2
    def rfc_uriformatter(parts):
        uri = ""
        if 'RFC' in parts:
            uri += self.canonical_uri(parts['RFC'].lstrip("0"))
        if 'Sec' in parts:
            uri += "#S" + parts['Sec']
        return uri
    # end citation2

    # begin citation3
    from ferenda import CitationParser, URIFormatter
    citparser = CitationParser(section_rfc_citation,
                               section_citation,
                               rfc_citation)
    citparser.set_formatter(URIFormatter(("SecRFCRef", rfc_uriformatter),
                                         ("SecRef", rfc_uriformatter),
                                         ("RFCRef", rfc_uriformatter)))
    citparser.parse_recursive(doc.body)
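    # For reference, a rough sketch (not in the original) of what the
    # citation grammar above recognizes:
    #
    #   rfc_citation.parseString("[RFC2119]")          # RFC='2119'
    #   section_citation.parseString("section 5.1")    # Sec='5.1'
    #   section_rfc_citation.parseString(
    #       "section 5.1 of [RFC2119]")                # Sec='5.1', RFC='2119'
    #
    # rfc_uriformatter() then combines the named parts into a URI such
    # as "<canonical_uri('2119')>#S5.1".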
def test_serialize_newstr(self):
    # really a test for future.types.newstr.newstr, here aliased
    # to str() -- this is only ever an issue on py2.
    tree = Body([], a=str("x"), b="y")
    serialized = serialize(tree, format="xml")
    self.assertEqual('<Body a="x" b="y" />\n', serialized)
Footnote(["Available at http://example.org/xyz"]), " the bizbaz should be frobnicated"]) ]) # end makedoc # begin derived-class from ferenda.elements import CompoundElement, OrdinalElement class Preamble(CompoundElement): pass class PreambleRecital(CompoundElement,OrdinalElement): tagname = "div" rdftype = "eurlex:PreambleRecital" doc = Preamble([PreambleRecital("Un",ordinal=1)], [PreambleRecital("Deux",ordinal=2)], [PreambleRecital("Trois",ordinal=3)]) # end derived-class # begin as-xhtml from ferenda.elements import SectionalElement p = SectionalElement(["Some content"], ordinal = "1a", identifier = "Doc pt 1(a)", title="Title or name of the part") body = Body([p]) from lxml import etree etree.tostring(body.as_xhtml("http://example.org/doc")) # end as-xhtml return_value = etree.tostring(body.as_xhtml("http://example.org/doc"), pretty_print=True)
def get_parser(self, basefile, sanitized, parseconfig="default"): return lambda stream: Body(list(stream))
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
# begin makedoc
from ferenda.elements import Body, Heading, Paragraph, Footnote

doc = Body([Heading(["About Doc 43/2012 and it's interpretation"],
                    predicate="dcterms:title"),
            Paragraph(["According to Doc 43/2012",
                       Footnote(["Available at http://example.org/xyz"]),
                       " the bizbaz should be frobnicated"])
            ])
# end makedoc

# begin derived-class
from ferenda.elements import CompoundElement, OrdinalElement

class Preamble(CompoundElement):
    pass

class PreambleRecital(CompoundElement, OrdinalElement):
    tagname = "div"
    rdftype = "eurlex:PreambleRecital"

doc = Preamble([PreambleRecital("Un", ordinal=1),
                PreambleRecital("Deux", ordinal=2),
                PreambleRecital("Trois", ordinal=3)])
# end derived-class

# begin as-xhtml
from ferenda.elements import SectionalElement

p = SectionalElement(["Some content"],
                     ordinal="1a",
                     identifier="Doc pt 1(a)",
                     title="Title or name of the part")
body = Body([p])

from lxml import etree
etree.tostring(body.as_xhtml("http://example.org/doc"))
# end as-xhtml
return_value = etree.tostring(body.as_xhtml("http://example.org/doc"),
                              pretty_print=True)
def test_create_body(self):
    b = Body()
    doc = Document(body=b)
    self.assertIs(b, doc.body)