示例#1
0
class TestDescriber(unittest.TestCase):
    """Exercise the read accessors of ``Describer`` against a small graph.

    The fixture describes ``http://example.org/doc`` with one title, two
    identifiers, one issued date, one reference and two subjects, so each
    accessor is checked with zero, one and several matching statements.
    """

    def setUp(self):
        # Inline Turtle fixture; the Describer is pointed at the single
        # document resource that the fixture describes.
        turtle = """
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<http://example.org/doc> a foaf:Document;
        dcterms:title "Hello world"@en ;
        dcterms:identifier "ID1",
                       "ID2";
        dcterms:issued "2013-10-11"^^xsd:date;
        dcterms:references <http://example.org/doc2>;
        dcterms:subject <http://example.org/concept1>,
                    <http://example.org/concept2> .
        """
        self.graph = Graph()
        self.graph.parse(data=turtle, format="turtle")
        self.desc = Describer(self.graph, "http://example.org/doc")

    def test_getvalues(self):
        # Zero, one and several literal values. The multi-valued result
        # is compared as a set, so its ordering is not asserted.
        desc = self.desc
        nothing = desc.getvalues(DCTERMS.alternate)
        self.assertEqual(nothing, [])
        titles = desc.getvalues(DCTERMS.title)
        self.assertEqual(titles, ["Hello world"])
        identifiers = set(desc.getvalues(DCTERMS.identifier))
        self.assertEqual(identifiers, {"ID1", "ID2"})

    def test_getvalue(self):
        desc = self.desc
        title = desc.getvalue(DCTERMS.title)
        self.assertEqual(title, "Hello world")
        issued = desc.getvalue(DCTERMS.issued)
        self.assertEqual(issued, datetime.date(2013, 10, 11))
        # A predicate with no value and a predicate with two values are
        # both expected to raise KeyError from the single-value accessor.
        for predicate in (DCTERMS.alternate, DCTERMS.identifier):
            with self.assertRaises(KeyError):
                desc.getvalue(predicate)

    def test_getrels(self):
        # Same zero/one/several pattern as test_getvalues, for
        # resource-valued statements.
        desc = self.desc
        nothing = desc.getrels(DCTERMS.replaces)
        self.assertEqual(nothing, [])
        references = desc.getrels(DCTERMS.references)
        self.assertEqual(references, ["http://example.org/doc2"])
        subjects = set(desc.getrels(DCTERMS.subject))
        self.assertEqual(subjects, {"http://example.org/concept1",
                                    "http://example.org/concept2"})

    def test_getrel(self):
        desc = self.desc
        reference = desc.getrel(DCTERMS.references)
        self.assertEqual(reference, "http://example.org/doc2")
        # No statement at all, then two statements: both must raise.
        for predicate in (DCTERMS.replaces, DCTERMS.subject):
            with self.assertRaises(KeyError):
                desc.getrel(predicate)

    def test_getrdftype(self):
        rdftype = self.desc.getrdftype()
        self.assertEqual(rdftype, "http://xmlns.com/foaf/0.1/Document")
示例#2
0
class TestDescriber(unittest.TestCase):
    """Check the ``get*`` methods of ``Describer`` on a one-document graph.

    ``setUp`` parses a Turtle snippet describing ``http://example.org/doc``
    and wraps it in a ``Describer``; every test reads from that instance.
    """

    def setUp(self):
        # The fixture carries single-valued (title, issued, references)
        # and multi-valued (identifier, subject) predicates.
        fixture = """
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<http://example.org/doc> a foaf:Document;
        dcterms:title "Hello world"@en ;
        dcterms:identifier "ID1",
                       "ID2";
        dcterms:issued "2013-10-11"^^xsd:date;
        dcterms:references <http://example.org/doc2>;
        dcterms:subject <http://example.org/concept1>,
                    <http://example.org/concept2> .
        """
        self.graph = Graph()
        self.graph.parse(data=fixture, format="turtle")
        self.desc = Describer(self.graph, "http://example.org/doc")

    def test_getvalues(self):
        describer = self.desc
        # Absent predicate -> empty list.
        absent = describer.getvalues(DCTERMS.alternate)
        self.assertEqual(absent, [])
        # Single value -> one-element list.
        single = describer.getvalues(DCTERMS.title)
        self.assertEqual(single, ["Hello world"])
        # Several values -> compared as a set; order is not asserted.
        several = set(describer.getvalues(DCTERMS.identifier))
        self.assertEqual(several, {"ID1", "ID2"})

    def test_getvalue(self):
        describer = self.desc
        self.assertEqual(describer.getvalue(DCTERMS.title), "Hello world")
        expected_date = datetime.date(2013, 10, 11)
        self.assertEqual(describer.getvalue(DCTERMS.issued), expected_date)
        # Neither "no value" nor "more than one value" is acceptable to
        # the single-value accessor: each case must raise KeyError.
        for failing in (DCTERMS.alternate, DCTERMS.identifier):
            with self.assertRaises(KeyError):
                describer.getvalue(failing)

    def test_getrels(self):
        describer = self.desc
        absent = describer.getrels(DCTERMS.replaces)
        self.assertEqual(absent, [])
        single = describer.getrels(DCTERMS.references)
        self.assertEqual(single, ["http://example.org/doc2"])
        several = set(describer.getrels(DCTERMS.subject))
        expected = {"http://example.org/concept1",
                    "http://example.org/concept2"}
        self.assertEqual(several, expected)

    def test_getrel(self):
        describer = self.desc
        self.assertEqual(describer.getrel(DCTERMS.references),
                         "http://example.org/doc2")
        # Zero matches, then two matches: both must raise KeyError.
        for failing in (DCTERMS.replaces, DCTERMS.subject):
            with self.assertRaises(KeyError):
                describer.getrel(failing)

    def test_getrdftype(self):
        expected = "http://xmlns.com/foaf/0.1/Document"
        self.assertEqual(self.desc.getrdftype(), expected)
示例#3
0
文件: rfc.py 项目: zigit/ferenda
    def parse(self, doc):
        """Parse the downloaded text for ``doc`` into a body and metadata.

        Sets ``doc.body``, ``doc.uri`` and ``doc.lang``, and records
        metadata through a ``Describer`` built on ``doc.meta``.

        :param doc: the document object to populate; its ``basefile``
                    selects the downloaded file to read
        :returns: always ``True``
        """
        source_path = self.store.downloaded_path(doc.basefile)
        reader = TextReader(source_path, linesep=TextReader.UNIX)

        # Preprocessing: strip the faux-bold formatting found in some
        # RFCs, where characters are repeated with backspace control
        # characters in between. The pattern uses '\b' as in backspace,
        # not r'\b' as in word boundary.
        raw_paragraphs = reader.getiterator(reader.readparagraph)
        cleanparagraphs = (re.sub('.\b', '', para) for para in raw_paragraphs)

        parser = self.get_parser(doc.basefile)

        # The environment variable can switch debugging on, but never
        # overrides a config value that is already truthy.
        if not self.config.fsmdebug:
            self.config.fsmdebug = 'FERENDA_FSMDEBUG' in os.environ
        parser.debug = self.config.fsmdebug
        doc.body = parser.parse(cleanparagraphs)

        # The first two body elements are taken off the body: the first
        # is kept as the header, the second is whitespace-normalized
        # into the title.
        header = doc.body.pop(0)  # body.findByClass(RFCHeader)
        docheader = doc.body.pop(0)  # body.findByClass(DocHeader)
        title = " ".join(docheader.split())

        # Drop the first "Table of Contents" preamble section, if any.
        for candidate in doc.body:
            if not isinstance(candidate, PreambleSection):
                continue
            if candidate.title == "Table of Contents":
                doc.body.remove(candidate)
                break

        # Create (RDF) metadata for the document. The provided basefile
        # may be incorrect, so whatever the header yields takes
        # precedence; fall back to the basefile when the header yields
        # nothing (eg RFC 100).
        realid = self.get_rfc_num(header) or doc.basefile
        doc.uri = self.canonical_uri(realid)
        desc = Describer(doc.meta, doc.uri)
        generated_by = self.qualified_class_name()
        desc.value(self.ns['prov'].wasGeneratedBy, generated_by)
        desc.value(self.ns['dcterms'].title, title, lang="en")
        self.parse_header(header, desc)

        # parse_header might have set the rdf type; supply a default
        # only when it did not.
        try:
            desc.getrdftype()
        except KeyError:
            desc.rdftype(self.ns['rfc'].RFC)

        identifier = self.ns['dcterms'].identifier
        if not desc.getvalues(identifier):
            desc.value(identifier, "RFC %s" % doc.basefile)

        doc.lang = "en"

        # Process body: remove the temporary Pagebreak objects, after
        # having extracted the shortTitle found in them. The short
        # title is only recorded when it differs from the full title.
        shorttitle = self.cleanup_body(doc.body)
        if shorttitle:
            if desc.getvalue(self.ns['dcterms'].title) != shorttitle:
                desc.value(self.ns['bibo'].shortTitle, shorttitle, lang="en")

        # Process body: run the citation parser over the whole tree.
        citation_parser = self.make_citation_parser()
        doc.body = citation_parser.parse_recursive(doc.body)
        PreambleSection.counter = 0  # reset the class-level counter
        # self.decorate_bodyparts(doc.body,doc.uri)
        if self.config.fsmdebug:
            print(serialize(doc.body))
        return True
示例#4
0
文件: rfc.py 项目: staffanm/ferenda
    def parse(self, doc):
        """Turn the downloaded file for ``doc`` into a parsed body plus
        RDF metadata.

        ``doc.body``, ``doc.uri`` and ``doc.lang`` are assigned here, and
        metadata is written through a ``Describer`` over ``doc.meta``.

        :param doc: document object whose ``basefile`` identifies the
                    downloaded file
        :returns: always ``True``
        """
        downloaded = self.store.downloaded_path(doc.basefile)
        reader = TextReader(downloaded, linesep=TextReader.UNIX)

        # Some RFCs fake bold text by repeating characters interleaved
        # with backspace control characters; remove every "character
        # followed by backspace" pair. Note that '\b' here is a
        # backspace, not the regex word boundary r'\b'.
        paragraphs = reader.getiterator(reader.readparagraph)
        cleanparagraphs = (re.sub('.\b', '', chunk) for chunk in paragraphs)

        parser = self.get_parser(doc.basefile)

        # Only consult the environment when debugging is not already on.
        if not self.config.fsmdebug:
            self.config.fsmdebug = 'FERENDA_FSMDEBUG' in os.environ
        parser.debug = self.config.fsmdebug
        doc.body = parser.parse(cleanparagraphs)

        # Take the two leading elements off the body: the header
        # object, then the title text (whitespace-normalized).
        header = doc.body.pop(0)  # body.findByClass(RFCHeader)
        title_words = doc.body.pop(0).split()  # body.findByClass(DocHeader)
        title = " ".join(title_words)

        # Remove at most one "Table of Contents" preamble section.
        for section in doc.body:
            if not isinstance(section, PreambleSection):
                continue
            if section.title == "Table of Contents":
                doc.body.remove(section)
                break

        # Create (RDF) metadata for the document. The basefile we were
        # given may be incorrect, so the number found in the header
        # wins; when the header gives nothing (eg RFC 100) the basefile
        # is used instead.
        realid = self.get_rfc_num(header) or doc.basefile
        doc.uri = self.canonical_uri(realid)
        desc = Describer(doc.meta, doc.uri)
        producer = self.qualified_class_name()
        desc.value(self.ns['prov'].wasGeneratedBy, producer)
        desc.value(self.ns['dcterms'].title, title, lang="en")
        self.parse_header(header, desc)

        # Default the rdf type unless parse_header already set one.
        try:
            desc.getrdftype()
        except KeyError:
            desc.rdftype(self.ns['rfc'].RFC)

        identifier_pred = self.ns['dcterms'].identifier
        if not desc.getvalues(identifier_pred):
            desc.value(identifier_pred, "RFC %s" % doc.basefile)

        doc.lang = "en"

        # Process body: the temporary Pagebreak objects are removed
        # after the shortTitle found in them has been extracted. It is
        # stored only if it is different from the title set above.
        shorttitle = self.cleanup_body(doc.body)
        if shorttitle:
            if desc.getvalue(self.ns['dcterms'].title) != shorttitle:
                desc.value(self.ns['bibo'].shortTitle, shorttitle, lang="en")

        # Process body: apply the citation parser recursively.
        cit_parser = self.make_citation_parser()
        doc.body = cit_parser.parse_recursive(doc.body)
        PreambleSection.counter = 0  # reset the class-level counter
        # self.decorate_bodyparts(doc.body,doc.uri)
        if self.config.fsmdebug:
            print(serialize(doc.body))
        return True