Example #1
    def parse_metadata_from_soup(self, soup, doc):
        doc.lang = self.lang
        d = Describer(doc.meta, doc.uri)
        d.rdftype(self.rdf_type)
        d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
        dcterms = self.ns['dcterms']

        # dcterms:title
        d.value(dcterms.title, soup.find("title").string, lang=doc.lang)
        d.value(dcterms.identifier, doc.basefile)
        # dcterms:abstract
        abstract = soup.find(class_="abstract")
        if abstract:
            d.value(dcterms['abstract'], abstract.string, lang=doc.lang)

        # dcterms:published
        datehdr = soup.find(lambda x: x.name in ('h2', 'h3')
                            and re.search(r"W3C\s+Recommendation,?\s+", x.text))
        if datehdr:
            datestr = " ".join(datehdr.text.split())
            m = re.search(r"(\d+)[ \-](\w+),?[ \-](\d{4})", datestr)
            if not m:
                self.log.warning("%s: Couldn't parse datestr %s" %
                                 (doc.basefile, datestr))
            else:
                datestr = " ".join(m.groups())
                date = None
                try:
                    # 17 December 1996
                    date = util.strptime(datestr, "%d %B %Y").date()
                except ValueError:
                    try:
                        # 17 Dec 1996
                        date = util.strptime(datestr, "%d %b %Y").date()
                    except ValueError:
                        self.log.warning("%s: Could not parse datestr %s" %
                                         (doc.basefile, datestr))
                if date:
                    d.value(dcterms.issued, date)

        # dcterms:editor
        editors = soup.find("dt", text=re.compile("Editors?:"))
        if editors:
            for editor in editors.find_next_siblings("dd"):
                editor_string = " ".join(x for x in editor.stripped_strings
                                         if "@" not in x)
                editor_name = editor_string.split(", ")[0]
                d.value(dcterms.editor, editor_name)

        # dcterms:publisher
        d.rel(dcterms.publisher, "http://localhost:8000/ext/w3c")

        # ensure we got exactly one of each of the required properties
        for required in (dcterms.title, dcterms.issued):
            d.getvalue(required)  # throws KeyError if not found (or more than one)
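
The date handling above hinges on two regular expressions and a two-format strptime fallback. As a quick illustration, here is a self-contained sketch of that extraction run against a made-up fragment of W3C-style markup (the HTML below is an assumption for demonstration only, and the stdlib datetime.strptime stands in for ferenda's util.strptime):

    # Self-contained sketch of the header/date extraction above.
    # The HTML fragment is hypothetical; datetime.strptime stands in for util.strptime.
    import re
    from datetime import datetime
    from bs4 import BeautifulSoup

    html = ('<html><head><title>Example Spec 1.0</title></head>'
            '<body><h2>W3C Recommendation, 17 December 1996</h2></body></html>')
    soup = BeautifulSoup(html, "html.parser")
    datehdr = soup.find(lambda x: x.name in ('h2', 'h3')
                        and re.search(r"W3C\s+Recommendation,?\s+", x.text))
    datestr = " ".join(datehdr.text.split())
    m = re.search(r"(\d+)[ \-](\w+),?[ \-](\d{4})", datestr)
    date = datetime.strptime(" ".join(m.groups()), "%d %B %Y").date()
    print(date)  # 1996-12-17
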
Example #2
 def parse_metadata_from_soup(self, soup, doc):
     from rdflib import Namespace
     from ferenda import Describer
     from ferenda import util
     import re
     DCT = Namespace("http://purl.org/dc/terms/")
     FOAF = Namespace("http://xmlns.com/foaf/0.1/")
     d = Describer(doc.meta, doc.uri)
     d.rdftype(FOAF.Document)
     d.value(DCT.title, soup.find("title").text, lang=doc.lang)
     d.value(DCT.abstract, soup.find(True, "abstract").text, lang=doc.lang)
     # find the issued date -- assume it's the first thing that looks
     # like a date on the form "22 August 2013"
     re_date = re.compile(r'(\d+ \w+ \d{4})')
     datenode = soup.find(text=re_date)
     datestr = re_date.search(datenode).group(1)
     d.value(DCT.issued, util.strptime(datestr, "%d %B %Y"))
     editors = soup.find("dt", text=re.compile("Editors?:"))
     for editor in editors.find_next_siblings("dd"):
         editor_name = editor.text.strip().split(", ")[0]
         d.value(DCT.editor, editor_name)
Example #3
 def parse_metadata_from_soup(self, soup, doc):
     from rdflib import Namespace
     from ferenda import Describer
     from ferenda import util
     import re
     DCTERMS = Namespace("http://purl.org/dc/terms/")
     FOAF = Namespace("http://xmlns.com/foaf/0.1/")
     d = Describer(doc.meta, doc.uri)
     d.rdftype(FOAF.Document)
     d.value(DCTERMS.title, soup.find("title").text, lang=doc.lang)
     d.value(DCTERMS.abstract, soup.find(True, "abstract").text, lang=doc.lang)
     # find the issued date -- assume it's the first thing that looks
     # like a date on the form "22 August 2013"
     re_date = re.compile(r'(\d+ \w+ \d{4})')
     datenode = soup.find(text=re_date)
     datestr = re_date.search(datenode).group(1)
     d.value(DCTERMS.issued, util.strptime(datestr, "%d %B %Y"))
     editors = soup.find("dt", text=re.compile("Editors?:"))
     for editor in editors.find_next_siblings("dd"):
         editor_name = editor.text.strip().split(", ")[0]
         d.value(DCTERMS.editor, editor_name)
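
The Describer calls in the two examples above amount to adding RDF triples about doc.uri to the graph in doc.meta. A rough plain-rdflib sketch of the same triples, with a placeholder URI and invented literal values:

    # Rough rdflib equivalent of the Describer calls above; the URI and the
    # literal values are placeholders, not data from any real document.
    from rdflib import Graph, Literal, Namespace, URIRef
    from rdflib.namespace import RDF

    DCTERMS = Namespace("http://purl.org/dc/terms/")
    FOAF = Namespace("http://xmlns.com/foaf/0.1/")

    g = Graph()
    uri = URIRef("http://example.org/doc/sample")      # stands in for doc.uri
    g.add((uri, RDF.type, FOAF.Document))              # d.rdftype(FOAF.Document)
    g.add((uri, DCTERMS.title,
           Literal("Example Spec 1.0", lang="en")))    # d.value(DCTERMS.title, ...)
    g.add((uri, DCTERMS.editor, Literal("Jane Doe")))  # d.value(DCTERMS.editor, ...)
    print(g.serialize(format="turtle"))
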
Example #4
 def extract_metadata(self, soup, basefile):
     attribs = self.metadata_from_basefile(basefile)
     attribs["dcterms:title"] = soup.dokument.titel.text
     attribs["dcterms:issued"] = util.strptime(
         soup.dokument.publicerad.text, "%Y-%m-%d %H:%M:%S").date()
     return attribs
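
The dotted soup.dokument.titel navigation above resolves to the first matching tag at each step, so the method presumably receives an XML document with dokument/titel/publicerad elements. A minimal sketch against hypothetical input (element names taken from the code, content invented; the stdlib datetime.strptime stands in for util.strptime):

    # Hypothetical input matching the tag names used above; the content is invented.
    from datetime import datetime
    from bs4 import BeautifulSoup

    xml = ("<dokument><titel>Exempeldokument</titel>"
           "<publicerad>2013-08-22 10:30:00</publicerad></dokument>")
    soup = BeautifulSoup(xml, "html.parser")  # the "xml" parser also works if lxml is installed
    title = soup.dokument.titel.text
    issued = datetime.strptime(soup.dokument.publicerad.text,
                               "%Y-%m-%d %H:%M:%S").date()
    print(title, issued)  # Exempeldokument 2013-08-22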