def postprocess_doc(self, doc):
    """Pull title and decision date into ``doc.meta`` and rebuild ``doc.body``.

    Walks every textbox (and page object) of ``doc.body`` without gluing
    textboxes together. The textbox following the one that reads
    "Kommittédirektiv" is recorded as ``DCTERMS.title``; a textbox starting
    with "Beslut vid regeringssammanträde den " has the remainder parsed
    with ``self.parse_swedish_date`` and recorded as ``DCTERMS.issued``.
    ``doc.body`` is finally replaced by a new ``Body`` in which every
    ``Page`` is represented by a ``Sidbrytning`` and every other item is
    kept unchanged.

    :param doc: document whose ``body``, ``meta`` and ``uri`` are used;
        ``meta`` and ``body`` are modified in place
    """
    heading = "Kommittédirektiv"
    decision_prefix = "Beslut vid regeringssammanträde den "

    def never_glue(x, y, z):
        # Glue function that always declines, so each textbox stays separate.
        return False

    expecting_title = False
    rebuilt = Body()
    for item in doc.body.textboxes(gluefunc=never_glue, pageobjects=True):
        text = str(item).strip()
        if text == heading:
            # The title is whatever comes right after this heading.
            expecting_title = True
        elif expecting_title:
            doc.meta.add((URIRef(doc.uri), DCTERMS.title, Literal(text)))
            expecting_title = False
        elif text.startswith(decision_prefix):
            raw_date = text[len(decision_prefix):]
            if raw_date.endswith("."):
                # Drop the sentence-ending period before parsing the date.
                raw_date = raw_date[:-1]
            triple = (
                URIRef(doc.uri),
                DCTERMS.issued,
                Literal(self.parse_swedish_date(raw_date), datatype=XSD.date),
            )
            doc.meta.add(triple)

        # Every item ends up in the new body; pages become page-break markers.
        if isinstance(item, Page):
            replacement = Sidbrytning(
                ordinal=item.number,
                width=item.width,
                height=item.height,
                src=item.src,
            )
        else:
            replacement = item
        rebuilt.append(replacement)
    doc.body = rebuilt
def htmlparser(chunks):
    """Build a ``Body`` of text elements from an iterable of HTML blocks.

    Each block's text nodes are joined and whitespace-normalized with
    ``util.normalize_space``. Blocks named ``pre`` become ``Preformatted``
    elements, all others become ``Paragraph`` elements. Blocks whose
    normalized text is empty are skipped. Every block is extracted from its
    tree as a side effect.

    :param chunks: iterable of blocks exposing ``name``, ``findAll`` and
        ``extract``
    :returns: the populated ``Body``
    """
    body = Body()
    for chunk in chunks:
        if chunk.name == "pre":
            element_cls = Preformatted
        else:
            element_cls = Paragraph
        text_nodes = chunk.findAll(text=True)
        text = util.normalize_space(''.join(text_nodes))
        # Detach the block so that it is not seen again later.
        chunk.extract()
        if not text:
            continue
        body.append(element_cls([text]))
    return body
def test_serialize_roundtrip(self):
    """Round-trip an element tree through the XML and the JSON serializers."""
    # An element tree mixing nested elements with native Python values.
    text_section = Section(
        [Paragraph(["Hello"]), Paragraph(["World"])],
        ordinal="1",
        title="Main section",
    )
    native_section = Section(
        [
            42,
            date(2013, 11, 27),
            datetime(2013, 11, 27, 12, 0, 0),
            b'bytestring',
            {'foo': 'bar', 'x': 'y'},
        ],
        ordinal=2,
        title="Native types",
    )
    tree = Body([text_section, native_section])

    # First pass: the default (XML) format, which needs the caller's
    # globals passed to deserialize().
    xml_text = serialize(tree)
    self.assertIsInstance(xml_text, str)
    from_xml = deserialize(xml_text, caller_globals=globals())
    self.assertEqual(tree, from_xml)

    # Add a section holding special (but commonly used) types. The XML
    # serialization format does not support these.
    turtle = """@prefix dcterms: <http://purl.org/dc/terms/> . <http://example.org/1> dcterms:title "Hello world"@en . """
    graph = Graph().parse(data=turtle, format="turtle")
    parseresult = urlparser.parseString("http://example.org/1")
    tree.append(Section([parseresult, graph], meta=graph))

    # Second pass: JSON, which uses fully qualified class names, so
    # globals() does not have to be passed into deserialize().
    json_text = serialize(tree, format="json")
    self.assertIsInstance(json_text, str)
    from_json = deserialize(json_text, format="json")

    # Two pyparsing.ParseResult objects cannot be compared directly (they
    # don't implement __eq__), so swap each for its XML representation
    # before comparing the trees.
    for candidate in (tree, from_json):
        candidate[2][0] = util.parseresults_as_xml(candidate[2][0])
    self.assertEqual(tree, from_json)
def test_serialize_roundtrip(self):
    """Check that serialize()/deserialize() give back an equal tree.

    Exercises the default XML format first, then the JSON format with a
    section containing an RDF graph and a pyparsing parse result.
    """
    paragraphs = [Paragraph(["Hello"]), Paragraph(["World"])]
    native_values = [
        42,
        date(2013, 11, 27),
        datetime(2013, 11, 27, 12, 0, 0),
        b'bytestring',
        {'foo': 'bar', 'x': 'y'},
    ]
    tree = Body([
        Section(paragraphs, ordinal="1", title="Main section"),
        Section(native_values, ordinal=2, title="Native types"),
    ])

    # --- default XML format ---
    as_xml = serialize(tree)
    self.assertIsInstance(as_xml, str)
    restored = deserialize(as_xml, caller_globals=globals())
    self.assertEqual(tree, restored)

    # --- JSON format, with types the XML format does not support ---
    graph = Graph().parse(
        data="""@prefix dcterms: <http://purl.org/dc/terms/> . <http://example.org/1> dcterms:title "Hello world"@en . """,
        format="turtle",
    )
    parseresult = urlparser.parseString("http://example.org/1")
    special_section = Section([parseresult, graph], meta=graph)
    tree.append(special_section)

    # JSON stores fully qualified class names, so no globals() are needed
    # when deserializing.
    as_json = serialize(tree, format="json")
    self.assertIsInstance(as_json, str)
    restored = deserialize(as_json, format="json")

    # pyparsing.ParseResult objects don't implement __eq__ and cannot be
    # compared directly; compare their XML representations instead.
    tree[2][0] = util.parseresults_as_xml(tree[2][0])
    restored[2][0] = util.parseresults_as_xml(restored[2][0])
    self.assertEqual(tree, restored)
def parse(tokenstream):
    """Assemble a ``Body`` of text elements from a stream of text chunks.

    For each chunk, ``guess_type`` is asked for an element type given the
    type guessed for the previous chunk. A ``None`` guess drops the chunk.
    A ``Continuation`` guess adds the chunk to the most recently created
    element, or starts a new ``Paragraph`` when the body is still empty.
    Any other guess creates a new element of that type.

    :param tokenstream: iterable of text chunks
    :returns: the populated ``Body``
    """
    result = Body()
    previous_type = None
    for chunk in tokenstream:
        guessed = guess_type(chunk, previous_type)
        if guessed is not None:
            if guessed == Continuation and len(result) > 0:
                # Don't create a new text node; extend the last one created.
                result[-1].append(chunk)
            else:
                if guessed == Continuation:
                    # Nothing to continue yet, so start a plain paragraph.
                    guessed = Paragraph
                result.append(guessed([chunk]))
        # The next guess is made relative to this one (including None).
        previous_type = guessed
    return result
def parse_pdfs(self, basefile, pdffiles):
    """Parse each PDF attachment of *basefile* and collect the results.

    A file whose parsing raises ``ValueError`` is skipped with a logged
    warning; any other exception propagates to the caller.

    :param basefile: identifier handed to ``self.store`` to locate files
    :param pdffiles: iterable of PDF file names
    :returns: a ``Body`` holding one parsed PDF per successfully parsed file
    """
    doc = Body()
    for pdffile in pdffiles:
        # FIXME: downloaded_path must be more fully mocked
        # (support attachments) by testutil.RepoTester. In the
        # meantime, we do some path munging ourselves
        pdf_path = self.store.downloaded_path(basefile).replace(
            "index.html", pdffile)
        intermediate_path = self.store.intermediate_path(
            basefile, attachment=pdffile)
        intermediate_dir = os.path.dirname(intermediate_path)
        try:
            pdf = self.parse_pdf(pdf_path, intermediate_dir)
        except ValueError as exc:
            # Binding the exception with "as" (instead of unpacking
            # sys.exc_info() into locals) avoids keeping a
            # traceback/frame reference cycle alive.
            self.log.warning(
                "Ignoring exception %s (%s), skipping PDF %s",
                type(exc), exc, pdffile)
        else:
            # Pages are appended as parsed; no per-page cropping is applied.
            doc.append(pdf)
    return doc