class TestDescriber(unittest.TestCase):
    """Exercise the read accessors of ``Describer`` on a small fixture graph.

    The fixture describes ``http://example.org/doc`` with one title, two
    identifiers, one issued date, one ``references`` link and two
    ``subject`` links.
    """

    def setUp(self):
        """Parse the Turtle fixture and wrap the document node."""
        # The fixture is assembled from adjacent literals; Turtle does not
        # care how the statements are spread over lines.
        turtle = (
            ' @prefix dcterms: <http://purl.org/dc/terms/> .'
            ' @prefix foaf: <http://xmlns.com/foaf/0.1/> .'
            ' @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .'
            ' <http://example.org/doc> a foaf:Document;'
            ' dcterms:title "Hello world"@en ;'
            ' dcterms:identifier "ID1", "ID2";'
            ' dcterms:issued "2013-10-11"^^xsd:date;'
            ' dcterms:references <http://example.org/doc2>;'
            ' dcterms:subject <http://example.org/concept1>,'
            ' <http://example.org/concept2> . '
        )
        self.graph = Graph()
        self.graph.parse(data=turtle, format="turtle")
        self.desc = Describer(self.graph, "http://example.org/doc")

    def test_getvalues(self):
        """getvalues yields a list with zero, one or many literal values."""
        missing = self.desc.getvalues(DCTERMS.alternate)
        self.assertEqual(missing, [])

        titles = self.desc.getvalues(DCTERMS.title)
        self.assertEqual(titles, ["Hello world"])

        # Order of multiple values is not asserted, hence the set.
        identifiers = set(self.desc.getvalues(DCTERMS.identifier))
        self.assertEqual(identifiers, {"ID1", "ID2"})

    def test_getvalue(self):
        """getvalue yields exactly one value, else raises KeyError."""
        self.assertEqual(self.desc.getvalue(DCTERMS.title), "Hello world")

        expected_issued = datetime.date(2013, 10, 11)
        self.assertEqual(self.desc.getvalue(DCTERMS.issued), expected_issued)

        # No value at all.
        with self.assertRaises(KeyError):
            self.desc.getvalue(DCTERMS.alternate)
        # More than one value.
        with self.assertRaises(KeyError):
            self.desc.getvalue(DCTERMS.identifier)

    def test_getrels(self):
        """getrels yields a list with zero, one or many resource URIs."""
        self.assertEqual(self.desc.getrels(DCTERMS.replaces), [])

        references = self.desc.getrels(DCTERMS.references)
        self.assertEqual(references, ["http://example.org/doc2"])

        # Order of multiple relations is not asserted, hence the set.
        subjects = set(self.desc.getrels(DCTERMS.subject))
        self.assertEqual(
            subjects,
            {"http://example.org/concept1", "http://example.org/concept2"})

    def test_getrel(self):
        """getrel yields exactly one URI, else raises KeyError."""
        reference = self.desc.getrel(DCTERMS.references)
        self.assertEqual(reference, "http://example.org/doc2")

        # No relation at all.
        with self.assertRaises(KeyError):
            self.desc.getrel(DCTERMS.replaces)
        # More than one relation.
        with self.assertRaises(KeyError):
            self.desc.getrel(DCTERMS.subject)

    def test_getrdftype(self):
        """getrdftype yields the URI of the node's rdf:type."""
        rdftype = self.desc.getrdftype()
        self.assertEqual(rdftype, "http://xmlns.com/foaf/0.1/Document")
class TestDescriber(unittest.TestCase):
    """Tests for the getter methods of ``Describer``.

    Every test runs against the same one-resource graph built in
    :meth:`setUp`.
    """

    def setUp(self):
        """Build the fixture graph and a Describer for its single resource."""
        # Adjacent string literals are joined by the compiler into a single
        # Turtle document.
        fixture = (
            ' @prefix dcterms: <http://purl.org/dc/terms/> .'
            ' @prefix foaf: <http://xmlns.com/foaf/0.1/> .'
            ' @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .'
            ' <http://example.org/doc> a foaf:Document;'
            ' dcterms:title "Hello world"@en ;'
            ' dcterms:identifier "ID1", "ID2";'
            ' dcterms:issued "2013-10-11"^^xsd:date;'
            ' dcterms:references <http://example.org/doc2>;'
            ' dcterms:subject <http://example.org/concept1>,'
            ' <http://example.org/concept2> . '
        )
        self.graph = Graph()
        self.graph.parse(data=fixture, format="turtle")
        self.desc = Describer(self.graph, "http://example.org/doc")

    def test_getvalues(self):
        """An absent property gives []; present ones give all their values."""
        desc = self.desc
        self.assertEqual(desc.getvalues(DCTERMS.alternate), [])
        self.assertEqual(desc.getvalues(DCTERMS.title), ["Hello world"])
        # Compared as sets: the order of the two identifiers is not checked.
        self.assertEqual(set(desc.getvalues(DCTERMS.identifier)),
                         {"ID1", "ID2"})

    def test_getvalue(self):
        """A single value is returned; none or several raise KeyError."""
        desc = self.desc
        self.assertEqual(desc.getvalue(DCTERMS.title), "Hello world")
        self.assertEqual(desc.getvalue(DCTERMS.issued),
                         datetime.date(2013, 10, 11))
        # DCTERMS.alternate has no value; DCTERMS.identifier has two.
        for predicate in (DCTERMS.alternate, DCTERMS.identifier):
            with self.assertRaises(KeyError):
                desc.getvalue(predicate)

    def test_getrels(self):
        """An absent relation gives []; present ones give all target URIs."""
        desc = self.desc
        self.assertEqual(desc.getrels(DCTERMS.replaces), [])
        self.assertEqual(desc.getrels(DCTERMS.references),
                         ["http://example.org/doc2"])
        # Compared as sets: the order of the two subjects is not checked.
        self.assertEqual(
            set(desc.getrels(DCTERMS.subject)),
            {"http://example.org/concept1", "http://example.org/concept2"})

    def test_getrel(self):
        """A single relation is returned; none or several raise KeyError."""
        desc = self.desc
        self.assertEqual(desc.getrel(DCTERMS.references),
                         "http://example.org/doc2")
        # DCTERMS.replaces has no target; DCTERMS.subject has two.
        for predicate in (DCTERMS.replaces, DCTERMS.subject):
            with self.assertRaises(KeyError):
                desc.getrel(predicate)

    def test_getrdftype(self):
        """The rdf:type of the fixture resource is foaf:Document."""
        self.assertEqual(self.desc.getrdftype(),
                         "http://xmlns.com/foaf/0.1/Document")
def parse(self, doc):
    """Parse a downloaded document into a structured body plus RDF metadata.

    Reads the downloaded file for ``doc.basefile`` paragraph by paragraph,
    runs it through the parser from ``self.get_parser``, then fills in
    ``doc.body``, ``doc.uri``, ``doc.lang`` and the metadata graph
    ``doc.meta``.

    :param doc: the document object to populate; its ``basefile`` selects
                the file to read and ``meta`` receives the metadata.
    :returns: always ``True``.
    """
    source_path = self.store.downloaded_path(doc.basefile)
    text_reader = TextReader(source_path, linesep=TextReader.UNIX)

    # Some RFCs fake bold text by repeating characters interleaved with
    # backspace control characters. Strip every "<char><backspace>" pair.
    # Note that '\b' is the backspace character here, not the regex word
    # boundary r'\b'.
    paragraphs = text_reader.getiterator(text_reader.readparagraph)
    cleaned = (re.sub('.\b', '', paragraph) for paragraph in paragraphs)

    fsm = self.get_parser(doc.basefile)
    # The environment variable can switch debugging on when the config
    # has not already done so.
    if not self.config.fsmdebug:
        self.config.fsmdebug = 'FERENDA_FSMDEBUG' in os.environ
    fsm.debug = self.config.fsmdebug
    doc.body = fsm.parse(cleaned)

    # The first two body elements are consumed as the header and the title
    # (presumably RFCHeader and DocHeader objects -- TODO confirm).
    header = doc.body.pop(0)
    title_words = doc.body.pop(0).split()
    title = " ".join(title_words)

    # Drop the first "Table of Contents" preamble section, if there is one.
    toc = next(
        (section for section in doc.body
         if isinstance(section, PreambleSection)
         and section.title == "Table of Contents"),
        None)
    if toc is not None:
        doc.body.remove(toc)

    # Create the RDF metadata. The provided basefile may be incorrect, so
    # whatever number the header yields takes precedence; fall back to the
    # basefile when the header gives nothing (eg RFC 100).
    realid = self.get_rfc_num(header) or doc.basefile
    doc.uri = self.canonical_uri(realid)

    desc = Describer(doc.meta, doc.uri)
    namespaces = self.ns
    desc.value(namespaces['prov'].wasGeneratedBy, self.qualified_class_name())
    desc.value(namespaces['dcterms'].title, title, lang="en")
    self.parse_header(header, desc)

    # parse_header might have set the rdf:type already; supply the default
    # only if it did not.
    try:
        desc.getrdftype()
    except KeyError:
        desc.rdftype(namespaces['rfc'].RFC)

    # Likewise, add an identifier only when none is present yet.
    if not desc.getvalues(namespaces['dcterms'].identifier):
        desc.value(namespaces['dcterms'].identifier,
                   "RFC %s" % doc.basefile)

    doc.lang = "en"

    # Remove the temporary Pagebreak objects from the body, after the
    # shortTitle found in them has been extracted.
    shorttitle = self.cleanup_body(doc.body)
    if shorttitle:
        full_title = desc.getvalue(namespaces['dcterms'].title)
        if full_title != shorttitle:
            desc.value(namespaces['bibo'].shortTitle, shorttitle, lang="en")

    # Let the citation parser work through the whole body.
    citation_parser = self.make_citation_parser()
    doc.body = citation_parser.parse_recursive(doc.body)
    PreambleSection.counter = 0
    # (a call to self.decorate_bodyparts(doc.body, doc.uri) is currently
    # disabled here)

    if self.config.fsmdebug:
        print(serialize(doc.body))
    return True
def parse(self, doc):
    """Turn the downloaded text of ``doc`` into a parsed body and metadata.

    The downloaded file is read as paragraphs, cleaned, and handed to the
    parser returned by ``self.get_parser``. The result is stored in
    ``doc.body``; ``doc.uri`` and ``doc.lang`` are set, and statements
    about the document are added to ``doc.meta``.

    :param doc: document object identified by ``doc.basefile``.
    :returns: always ``True``.
    """
    reader = TextReader(self.store.downloaded_path(doc.basefile),
                        linesep=TextReader.UNIX)

    # Preprocessing: some RFCs use faux-bold formatting, i.e. repeated
    # characters interleaved with backspace control characters. The
    # pattern below removes each character that is followed by a
    # backspace. '\b' is a literal backspace, not r'\b' (word boundary).
    def strip_faux_bold(paragraph):
        return re.sub('.\b', '', paragraph)

    cleanparagraphs = (strip_faux_bold(raw)
                       for raw in reader.getiterator(reader.readparagraph))

    parser = self.get_parser(doc.basefile)
    # Debugging may also be enabled through the environment.
    if not self.config.fsmdebug:
        self.config.fsmdebug = 'FERENDA_FSMDEBUG' in os.environ
    parser.debug = self.config.fsmdebug
    doc.body = parser.parse(cleanparagraphs)

    # Take the header and title elements off the front of the body.
    header = doc.body.pop(0)
    raw_title = doc.body.pop(0)
    # Collapse all runs of whitespace in the title to single spaces.
    title = " ".join(raw_title.split())

    # Remove the first preamble section titled "Table of Contents".
    for candidate in doc.body:
        if not isinstance(candidate, PreambleSection):
            continue
        if candidate.title == "Table of Contents":
            doc.body.remove(candidate)
            break

    # The provided basefile may be incorrect, so the number found in the
    # header overrides it. When the header yields nothing (eg RFC 100),
    # the basefile is used after all.
    realid = self.get_rfc_num(header)
    if not realid:
        realid = doc.basefile
    doc.uri = self.canonical_uri(realid)

    # Record the basic metadata, then let parse_header add what it finds.
    dcterms = self.ns['dcterms']
    desc = Describer(doc.meta, doc.uri)
    desc.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
    desc.value(dcterms.title, title, lang="en")
    self.parse_header(header, desc)

    # Defaults for anything parse_header did not provide.
    try:
        desc.getrdftype()
    except KeyError:
        desc.rdftype(self.ns['rfc'].RFC)
    if not desc.getvalues(dcterms.identifier):
        desc.value(dcterms.identifier, "RFC %s" % doc.basefile)

    doc.lang = "en"

    # cleanup_body strips the temporary Pagebreak objects and hands back
    # the shortTitle extracted from them; store it only when it differs
    # from the full title.
    shorttitle = self.cleanup_body(doc.body)
    if shorttitle and desc.getvalue(dcterms.title) != shorttitle:
        desc.value(self.ns['bibo'].shortTitle, shorttitle, lang="en")

    # Run the citation parser over the entire body.
    doc.body = self.make_citation_parser().parse_recursive(doc.body)
    PreambleSection.counter = 0
    # (self.decorate_bodyparts(doc.body, doc.uri) is currently disabled)

    if self.config.fsmdebug:
        print(serialize(doc.body))
    return True