def parse(self, doc):
    """Parse a downloaded sitenews file into metadata and body.

    The raw file consists of a head line ("YYYY-MM-DD HH:MM:SS title..."),
    a blank line, and an HTML body. Timestamp and title go into doc.meta,
    the body is converted to element objects, and the first body paragraph
    is additionally recorded as the abstract.
    """
    raw = util.readfile(self.store.downloaded_path(doc.basefile))
    head, body = raw.split("\n\n", 1)
    datestr, timestr, title = head.split(" ", 2)
    published = datetime.strptime(datestr + " " + timestr,
                                  "%Y-%m-%d %H:%M:%S")
    docuri = URIRef(doc.uri)
    doc.meta.add((docuri, RDF.type, self.rdf_type))
    doc.meta.add((docuri, DCTERMS.issued, Literal(published)))
    doc.meta.add((docuri, DCTERMS.title, Literal(title, lang=doc.lang)))
    # wrap the body so the soup has a single container element
    soup = bs4.BeautifulSoup(
        "<div class='sitenews-item'>" + body + "</div>", "lxml")
    doc.body = elements_from_soup(soup.body)
    # mark up the first real paragraph as the abstract (an XMLLiteral) ...
    doc.body[0][0] = Div([doc.body[0][0]],
                         datatype="rdf:XMLLiteral",
                         property="dcterms:abstract")
    # ... and add it to doc.meta right away as well
    doc.meta.add((docuri, DCTERMS.abstract,
                  Literal(body.split("\n\n")[0], datatype=RDF.XMLLiteral)))
    self.parse_entry_update(doc)  # sets published and possibly updated
    entry = DocumentEntry(self.store.documententry_path(doc.basefile))
    entry.published = published
    entry.save()
    return True
def parse(self, doc):
    """Parse a downloaded reStructuredText file into doc.body/doc.meta.

    The source is rendered to HTML with docutils, custom metadata
    directives are read from the generated docinfo table, and the
    remaining markup is converted to element objects.
    """
    source = util.readfile(self.store.downloaded_path(doc.basefile))
    # render reST to an HTML string via docutils
    html = publish_string(source, writer_name="html")
    soup = BeautifulSoup(html, "lxml")
    docinfo = soup.find("table", "docinfo")
    docuri = URIRef(doc.uri)
    if docinfo:
        # this is where our custom metadata goes
        for row in docinfo.find_all("tr", "field"):
            key, val = row.th.text.strip(), row.td.text.strip()
            if key == 'footer-order:':
                doc.meta.add((docuri, OLO['index'], Literal(int(val))))
            else:
                self.log.warning("%s: Unknown metadata directive %s (%s)" %
                                 (doc.basefile, key, val))
        # we don't need these in the final result
        docinfo.decompose()
        soup.find("h1", "title").decompose()
    doc.body = elements_from_soup(soup.body)
    # NOTE(review): lang is passed positionally here (Literal(value, lang)),
    # unlike the lang= keyword used elsewhere in this file — same effect,
    # but worth unifying.
    doc.meta.add((docuri, DCTERMS.title, Literal(soup.title.text, doc.lang)))
    doc.meta.add((docuri, PROV.wasGeneratedBy,
                  Literal(self.qualified_class_name())))
    doc.meta.add((docuri, RDF.type, self.rdf_type))
    self.parse_entry_update(doc)
    return True
def test_elements_from_soup(self):
    """Converting a bs4 tree should yield the matching element objects."""
    from ferenda.elements import html
    markup = """<body>
<h1>Sample</h1>
<div class="main">
<img src="xyz.png"/>
<p>Some <b>text</b></p>
<dl>
<dt>Term 1</dt>
<dd>Definition 1</dd>
</dl>
</div>
<div id="foot">
<hr/>
<a href="/">home</a> - <a href="/about">about</a>
</div>
</body>"""
    converted = html.elements_from_soup(BeautifulSoup(markup, "lxml").body)
    main_div = html.Div(
        [html.Img(src="xyz.png"),
         html.P(["Some ", html.B(["text"])]),
         html.DL([html.DT(["Term 1"]), html.DD(["Definition 1"])])],
        **{"class": "main"})
    foot_div = html.Div(
        [html.HR(),
         html.A(["home"], href="/"),
         " - ",
         html.A(["about"], href="/about")],
        id="foot")
    expected = html.Body([html.H1(["Sample"]), main_div, foot_div])
    self.maxDiff = 4096
    # compare serialized trees for a readable diff on failure
    self.assertEqual(serialize(converted), serialize(expected))
def parse(self, doc):
    """Parse a downloaded sitenews file (head line + blank line + HTML body).

    The head line is "<date> <time> <title...>"; date/time become
    dcterms:issued, the title dcterms:title, and the first body
    paragraph doubles as dcterms:abstract.
    """
    head, body = util.readfile(self.store.downloaded_path(doc.basefile)).split("\n\n", 1)
    datestr, timestr, title = head.split(" ", 2)
    published = datetime.strptime("%s %s" % (datestr, timestr),
                                  "%Y-%m-%d %H:%M:%S")
    doc.meta.add((URIRef(doc.uri), RDF.type, self.rdf_type))
    doc.meta.add((URIRef(doc.uri), DCTERMS.issued, Literal(published)))
    doc.meta.add((URIRef(doc.uri), DCTERMS.title, Literal(title, lang=doc.lang)))
    # wrap the body so elements_from_soup gets a single container element
    soup = bs4.BeautifulSoup("<div class='sitenews-item'>"+body+"</div>", "lxml")
    doc.body = elements_from_soup(soup.body)
    # move timestamp into dcterms:issued, title into dcterms:title
    # parse body with elements_from_soup
    # set first real para as dcterms:abstract (XMLLiteral)
    doc.body[0][0] = Div([doc.body[0][0]],
                         datatype="rdf:XMLLiteral",
                         property="dcterms:abstract")
    # but we need to add it to doc.meta RIGHT AWAY because of reasons...
    doc.meta.add((URIRef(doc.uri), DCTERMS.abstract,
                  Literal(body.split("\n\n")[0], datatype=RDF.XMLLiteral)))
    self.parse_entry_update(doc)  # need to set published and possibly updated
    entry = DocumentEntry(self.store.documententry_path(doc.basefile))
    entry.published = published
    entry.save()
    return True
def test_elements_from_soup(self):
    """Unrepresentable markup (marquee, center, comments) should be dropped."""
    markup = """<html>
<head>
<title>Example doc</title>
</head>
<body>
<marquee>Hello world</marquee>
<!-- Hello world -->
<center>Hello world</center>
<p>That's enough of this nonsense</p>
</body>"""
    converted = html.elements_from_soup(BeautifulSoup(markup, "lxml").html)
    expected = html.HTML(
        [html.Head([html.Title(["Example doc"])]),
         html.Body([html.P(["That's enough of this nonsense"])])])
    self.assertEqual(expected, converted)
def _decode_query_result(self, response, pagenum, pagelen):
    """Convert an ElasticSearch JSON response into (results, pager).

    Each hit's highlighted text fragments are joined with " ... ",
    whitespace-normalized, wrapped in a <p> and converted to element
    objects under the 'text' key. The pager dict describes the current
    page within the total result set.
    """
    json = response.json()
    res = []
    for hit in json['hits']['hits']:
        h = hit['_source']
        # wrap highlighted field in P, convert to elements
        hltext = " ... ".join(x.strip() for x in hit['highlight']['text'])
        # FIX: pass an explicit parser (as _decode_query_result_hit does)
        # instead of relying on bs4's platform-dependent default, and use
        # a raw string for the regex to avoid the invalid escape sequence
        # "\s" deprecation.
        soup = BeautifulSoup("<p>%s</p>" % re.sub(r"\s+", " ", hltext),
                             "lxml")
        h['text'] = html.elements_from_soup(soup.html.body.p)
        res.append(h)
    total = json['hits']['total']
    pager = {'pagenum': pagenum,
             'pagecount': int(math.ceil(total / float(pagelen))),
             'firstresult': (pagenum - 1) * pagelen + 1,
             'lastresult': (pagenum - 1) * pagelen + len(json['hits']['hits']),
             'totalresults': total}
    return res, pager
def test_elements_from_soup(self):
    # Unsupported elements (<marquee>, <center>) and HTML comments have
    # no counterpart in ferenda.elements.html and should be dropped,
    # leaving only the representable markup in the converted tree.
    soup = BeautifulSoup(
        """<html>
<head>
<title>Example doc</title>
</head>
<body>
<marquee>Hello world</marquee>
<!-- Hello world -->
<center>Hello world</center>
<p>That's enough of this nonsense</p>
</body>""", "lxml")
    got = html.elements_from_soup(soup.html)
    self.assertEqual(
        html.HTML([
            html.Head([html.Title(["Example doc"])]),
            html.Body([html.P(["That's enough of this nonsense"])])
        ]), got)
def _decode_query_result_hit(self, hit):
    """Convert a single ElasticSearch hit into a plain result dict.

    Strips the internal 'join' field and converts any highlighted
    'text'/'label' fragments into element objects, with <em> match
    markers rewritten to <strong class='match'>.
    """
    h = hit['_source']
    if "join" in h:
        del h["join"]
    if 'highlight' in hit:
        for hlfield in ('text', 'label'):
            if hlfield in hit['highlight']:
                # wrap highlighted field in P, convert to elements.
                # FIX: raw string for the regex — "\s" in a plain string
                # is an invalid escape sequence (DeprecationWarning).
                hltext = re.sub(r"\s+", " ",
                                " ... ".join(x.strip() for x in
                                             hit['highlight'][hlfield]))
                hltext = hltext.replace(
                    "<em>", "<strong class='match'>").replace(
                    "</em>", " </strong>")
                # FIXME: BeautifulSoup/lxml returns empty soup if
                # first char is '§' or some other non-ascii char (like
                # a smart quote). Padding with a space makes problem
                # disappear, but need to find root cause.
                soup = BeautifulSoup("<p> %s</p>" % hltext, "lxml")
                h[hlfield] = html.elements_from_soup(soup.html.body.p)
    return h
def test_elements_from_soup(self):
    # elements_from_soup should map each supported tag to the
    # corresponding ferenda.elements.html class, keep attributes
    # (src/href/class/id) and inline text, and drop whitespace-only
    # text nodes between elements (the " - " between the anchors is
    # kept because it is not pure whitespace).
    from ferenda.elements import html
    soup = BeautifulSoup(
        """<body>
<h1>Sample</h1>
<div class="main">
<img src="xyz.png"/>
<p>Some <b>text</b></p>
<dl>
<dt>Term 1</dt>
<dd>Definition 1</dd>
</dl>
</div>
<div id="foot">
<hr/>
<a href="/">home</a> - <a href="/about">about</a>
</div>
</body>""", "lxml")
    body = html.elements_from_soup(soup.body)
    # print("Body: \n%s" % serialize(body))
    result = html.Body([
        html.H1(["Sample"]),
        html.Div([
            html.Img(src="xyz.png"),
            html.P(["Some ", html.B(["text"])]),
            html.DL([html.DT(["Term 1"]),
                     html.DD(["Definition 1"])])
        ], **{"class": "main"}),
        html.Div([
            html.HR(),
            html.A(["home"], href="/"),
            " - ",
            html.A(["about"], href="/about")
        ], id="foot")
    ])
    self.maxDiff = 4096
    # compare serialized trees for a readable diff on failure
    self.assertEqual(serialize(body), serialize(result))
# -*- coding: utf-8 -*- from __future__ import unicode_literals from ferenda.compat import Mock from ferenda.elements.html import elements_from_soup from bs4 import BeautifulSoup doc = Mock() filedir = os.path.dirname(__file__) with open(filedir + "/../doc/examples/citationparsing-before.xhtml") as fp: doc.body = elements_from_soup(BeautifulSoup(fp.read(), "lxml").body) # begin from pyparsing import Word, nums from ferenda import CitationParser from ferenda import URIFormatter import ferenda.citationpatterns import ferenda.uriformats # Create two ParserElements for IETF document references and internal # references rfc_citation = "RFC" + Word(nums).setResultsName("RFCRef") bcp_citation = "BCP" + Word(nums).setResultsName("BCPRef") std_citation = "STD" + Word(nums).setResultsName("STDRef") ietf_doc_citation = (rfc_citation | bcp_citation | std_citation).setResultsName("IETFRef") endnote_citation = ("[" + Word(nums).setResultsName("EndnoteID") + "]").setResultsName("EndnoteRef")
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from ferenda.compat import Mock
from ferenda.elements.html import elements_from_soup
from bs4 import BeautifulSoup

# Build a minimal mock document whose body contains a bare URL in
# running text.
markup = """<html>
<body>
URLs often appear like http://example.org/foo, in running text
</body>
</html>"""
doc = Mock()
doc.body = elements_from_soup(BeautifulSoup(markup, "lxml").body)
# begin
from ferenda import CitationParser
from ferenda import URIFormatter
import ferenda.citationpatterns
import ferenda.uriformats

# CitationParser is initialized with a list of pyparsing
# ParserElements (or any other object that has a scanString method
# that returns a generator of (tokens,start,end) tuples, where start
# and end are integer string indicies and tokens are dict-like
# objects)
citparser = CitationParser(ferenda.citationpatterns.url)

# URIFormatter is initialized with a list of tuples, where each
# tuple is a string (identifying a named ParseResult) and a function
# (that takes as a single argument a dict-like object and returns a
# URI string (possibly relative)
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from ferenda.compat import Mock
from ferenda.elements.html import elements_from_soup
from bs4 import BeautifulSoup

doc = Mock()
# FIX: pass an explicit parser ("lxml", as the sibling examples do).
# Without it, bs4 emits a GuessedAtParserWarning and picks whatever
# parser happens to be installed, so parsing can differ between
# environments.
doc.body = elements_from_soup(BeautifulSoup("""<html>
<body>
URLs often appear like http://example.org/foo, in running text
</body>
</html>""", "lxml").body)
# begin
from ferenda import CitationParser
from ferenda import URIFormatter
import ferenda.citationpatterns
import ferenda.uriformats

# CitationParser is initialized with a list of pyparsing
# ParserElements (or any other object that has a scanString method
# that returns a generator of (tokens,start,end) tuples, where start
# and end are integer string indicies and tokens are dict-like
# objects)
citparser = CitationParser(ferenda.citationpatterns.url)

# URIFormatter is initialized with a list of tuples, where each
# tuple is a string (identifying a named ParseResult) and a function
# (that takes as a single argument a dict-like object and returns a
# URI string (possibly relative)
citparser.set_formatter(URIFormatter(("URLRef", ferenda.uriformats.url)))
# -*- coding: utf-8 -*- from __future__ import unicode_literals from ferenda.compat import Mock from ferenda.elements.html import elements_from_soup from bs4 import BeautifulSoup doc = Mock() filedir = os.path.dirname(__file__) doc.body = elements_from_soup( BeautifulSoup( open(filedir + "/../doc/examples/citationparsing-before.xhtml").read(), "lxml").body) # begin from pyparsing import Word, nums from ferenda import CitationParser from ferenda import URIFormatter import ferenda.citationpatterns import ferenda.uriformats # Create two ParserElements for IETF document references and internal # references rfc_citation = "RFC" + Word(nums).setResultsName("RFCRef") bcp_citation = "BCP" + Word(nums).setResultsName("BCPRef") std_citation = "STD" + Word(nums).setResultsName("STDRef") ietf_doc_citation = (rfc_citation | bcp_citation | std_citation).setResultsName("IETFRef") endnote_citation = ("[" + Word(nums).setResultsName("EndnoteID") +
# -*- coding: utf-8 -*- from __future__ import unicode_literals from ferenda.compat import Mock from ferenda.elements.html import elements_from_soup from bs4 import BeautifulSoup doc = Mock() filedir = os.path.dirname(__file__) doc.body = elements_from_soup(BeautifulSoup(open(filedir+"/../doc/examples/citationparsing-before.xhtml").read()).body) # begin from pyparsing import Word, nums from ferenda import CitationParser from ferenda import URIFormatter import ferenda.citationpatterns import ferenda.uriformats # Create two ParserElements for IETF document references and internal # references rfc_citation = "RFC" + Word(nums).setResultsName("RFCRef") bcp_citation = "BCP" + Word(nums).setResultsName("BCPRef") std_citation = "STD" + Word(nums).setResultsName("STDRef") ietf_doc_citation = (rfc_citation | bcp_citation | std_citation).setResultsName("IETFRef") endnote_citation = ("[" + Word(nums).setResultsName("EndnoteID") + "]").setResultsName("EndnoteRef") # Create a URI formatter for IETF documents (URI formatter for endnotes # is so simple that we just use a lambda function below def rfc_uri_formatter(parts):
# -*- coding: utf-8 -*- from __future__ import unicode_literals from ferenda.compat import Mock from ferenda.elements.html import elements_from_soup from bs4 import BeautifulSoup doc = Mock() filedir = os.path.dirname(__file__) with open(filedir+"/../doc/examples/citationparsing-before.xhtml") as fp: doc.body = elements_from_soup(BeautifulSoup(fp.read(), "lxml").body) # begin from pyparsing import Word, nums from ferenda import CitationParser from ferenda import URIFormatter import ferenda.citationpatterns import ferenda.uriformats # Create two ParserElements for IETF document references and internal # references rfc_citation = "RFC" + Word(nums).setResultsName("RFCRef") bcp_citation = "BCP" + Word(nums).setResultsName("BCPRef") std_citation = "STD" + Word(nums).setResultsName("STDRef") ietf_doc_citation = (rfc_citation | bcp_citation | std_citation).setResultsName("IETFRef") endnote_citation = ("[" + Word(nums).setResultsName("EndnoteID") + "]").setResultsName("EndnoteRef") # Create a URI formatter for IETF documents (URI formatter for endnotes # is so simple that we just use a lambda function below