Example No. 1
 def find_firstpage_metadata(self, firstpage, basefile):
     res = {}
     m = re.search("proposition till riksdagen *,? *(.*?); gif?ven",
                   util.normalize_space(firstpage),
                   flags=re.I)
     if not m:
         self.log.warning(
             "%s: Couldn't find title in first %s characters (first page)" %
             (basefile, len(firstpage)))
     else:
         res["dcterms:title"] = m.groups(1)
     m = re.search("gif?ven stockholms slott den (\d+ \w+ \d{4})",
                   util.normalize_space(firstpage),
                   flags=re.I)
     if not m:
         self.log.warning(
             "%s: Couldn't find date in first %s characters (first page)" %
             (basefile, len(firstpage)))
     else:
         try:
             res["dcterms:issued"] = self.parse_swedish_date(
                 m.group(1).lower())
         except ValueError as e:
             self.log.warning("%s: Couldn't parse date %s" %
                              (basefile, m.group(1)))
     return res
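
Every example here funnels text through util.normalize_space before matching or storing it. A minimal sketch of the behaviour these snippets assume (collapse whitespace runs to single spaces and trim the ends) -- an illustration, not the actual ferenda implementation:

 def normalize_space(string):
     # collapse all whitespace runs (spaces, tabs, newlines) into single
     # spaces and strip leading/trailing whitespace
     return " ".join(string.split())

 assert normalize_space("  proposition\n till  riksdagen ") == "proposition till riksdagen"
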
Example No. 2
 def extract_metadata(self, rawhead, basefile):
     d = self.metadata_from_basefile(basefile)
     if rawhead:  # sometimes there's no headnote.html
         for label, key in {"Ämbetsberättelse": 'dcterms:bibliographicCitation',
                            "Beslutsdatum": 'dcterms:issued',
                            "Diarienummer": 'rpubl:diarienummer'}.items():
             labelnode = rawhead.find(text=re.compile("%s:" % label))
             if labelnode:
                 d[key] = util.normalize_space(labelnode.next_sibling.text)
         # this data might contain spurious spaces due to <span
         # class="Definition"> tags -- see eg 3128-2002. Data in
         # the document is preferable
         d["dcterms:title"] = util.normalize_space(rawhead.find("h2").text)
     return d
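
The label lookup above relies on BeautifulSoup returning the label's text node, with the value sitting in that node's next sibling. A hedged, self-contained illustration of the pattern (the headnote fragment and title are invented):

 import re
 from bs4 import BeautifulSoup

 rawhead = BeautifulSoup(
     "<div><p>Diarienummer: <span class='Definition'>3128-2002</span></p>"
     "<h2>Kritik mot en myndighet</h2></div>", "html.parser")
 labelnode = rawhead.find(text=re.compile("Diarienummer:"))
 print(labelnode.next_sibling.text)   # -> "3128-2002"
 print(rawhead.find("h2").text)       # -> "Kritik mot en myndighet"
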
Example No. 3
    def download_get_basefiles_page(self, pagetree):
        # feed the lxml tree into beautifulsoup by serializing it to a
        # string -- is there a better way?
        soup = BeautifulSoup(etree.tostring(pagetree))
        for tr in soup.findAll("tr"):
            if ((not tr.find("a")) or
                    not re.match(self.basefile_regex, tr.find("a").text)):
                # FIXME: Maybe re.search instead of .match to find
                # "Prop. 2012/13:152"
                continue
            # First, look at desc (third td):
            descnodes = [util.normalize_space(x) for x
                         in tr.find_all("td")[2]
                         if isinstance(x, str)]
            bilaga = None
            if len(descnodes) > 1:
                if descnodes[1].startswith("Bilaga:"):
                    bilaga = util.normalize_space(descnodes[0].split(",")[-1])
            desc = "\n".join(descnodes)

            # then, find basefile (second td)
            tds = tr.find_all("td")
            td = tds[1]
            basefile = td.a.text
            assert re.match(self.basefile_regex, basefile)

            basefile = self.sanitize_basefile(basefile)

            url = td.a['href']

            # self.download_single(basefile, refresh=refresh, url=url)

            # and, if present, extra files (in td 4+5)
            extraurls = []
            for td in tr.findAll("td")[3:]:
                extraurls.append(td.a['href'])

            # we slightly abuse the protocol between
            # download_get_basefiles and this generator -- instead of
            # yielding just two strings, we yield two tuples with some
            # extra information that download_single will need.
            yield (basefile, bilaga), (url, extraurls)

        nextpage = None
        for element, attribute, link, pos in pagetree.iterlinks():
            if element.text == "Fler poster":
                nextpage = link
        raise NoMoreLinks(nextpage)
Example No. 4
    def test_fallback_ocr(self):
        try:
            # actually running tesseract takes ages -- for day-to-day
            # testing we can just as well use the canned hocr.html
            # files that _copy_sample fixes for us.
            if not os.environ.get("FERENDA_TEST_TESSERACT"):
                raise errors.ExternalCommandError
            reader = PDFReader(
                filename="test/files/pdfreader/scanned-ecma-99.pdf",
                workdir=self.datadir,
                images=False)
        except errors.ExternalCommandError:
            self._copy_sample()
            reader = PDFReader(
                filename="test/files/pdfreader/scanned-ecma-99.pdf",
                workdir=self.datadir,
                images=False)

        self.assertTrue(reader.is_empty())
        reader = PDFReader(filename="test/files/pdfreader/scanned-ecma-99.pdf",
                           workdir=self.datadir,
                           ocr_lang="eng")
        self.assertFalse(reader.is_empty())
        self.assertEqual(2, len(reader))
        self.assertEqual("EUROPEAN COMPUTER MANUFACTURERS ASSOCIATION",
                         util.normalize_space(str(reader[0][1])))
Example No. 5
 def extract_metadata(self, rawhead, basefile):
     d = self.metadata_from_basefile(basefile)
     if rawhead:  # sometimes there's no headnote.html
         for label, key in {
                 "Ämbetsberättelse": 'dcterms:bibliographicCitation',
                 "Beslutsdatum": 'dcterms:issued',
                 "Diarienummer": 'rpubl:diarienummer'
         }.items():
             labelnode = rawhead.find(text=re.compile("%s:" % label))
             if labelnode:
                 d[key] = util.normalize_space(labelnode.next_sibling.text)
         # this data might contain spurious spaces due to <span
         # class="Definition"> tags -- see eg 3128-2002. Data in
         # the document is preferable
         d["dcterms:title"] = util.normalize_space(rawhead.find("h2").text)
     return d
Example No. 6
    def sanitize_metadata(self, a, basefile):
        # trim space
        for k in ("dcterms:title", "dcterms:abstract"):
            if k in a:
                a[k] = util.normalize_space(a[k])
        # trim identifier
        a["dcterms:identifier"] = self.sanitize_identifier(
            a["dcterms:identifier"].replace("ID-nummer: ", ""))
        # FIXME call sanitize_identifier
        # save for later
        self._identifier = a["dcterms:identifier"]
        # it's rare, but in some cases a document can be published by
        # two different departments (eg dir. 2011:80). Convert string
        # to a list in these cases (SwedishLegalSource.polish_metadata
        # will handle that)
        if "rpubl:departement" in a and ", " in a["rpubl:departement"]:
            a["rpubl:departement"] = a["rpubl:departement"].split(", ")
        # remove empty utgarFran list
        if a["rpubl:utgarFran"]:
            a["rpubl:utgarFran"] = [URIRef(x) for x in a["rpubl:utgarFran"]]
        else:
            del a["rpubl:utgarFran"]

        # FIXME: possibly derive utrSerie from self.document_type?
        if self.rdf_type == RPUBL.Utredningsbetankande:
            altlabel = "SOU" if self.document_type == Regeringen.SOU else "Ds"
            a["rpubl:utrSerie"] = self.lookup_resource(altlabel, SKOS.altLabel)
        return a
Example No. 7
    def parse_antiword_docbook(self, text, basefile):
        soup = BeautifulSoup(text)
        head = {}
        header_elements = soup.find("para")
        header_text = ''
        for el in header_elements.contents:
            if hasattr(el, 'name') and el.name == "informaltable":
                break
            else:
                header_text += el.string

        # At the very top of each ruling is the court name ("Högsta
        # domstolen") followed by the report number ("NJA 1987
        # s. 113"). Depending on the Word document, however, the XML
        # structure varies. Usually the information is found in a
        # pipe-separated paragraph:

        parts = [x.strip() for x in header_text.split("|")]
        if len(parts) > 1:
            head['Domstol'] = parts[0]
            head['Referat'] = parts[1]
        else:
            # alternatively, they are on the first row of an informaltable
            row = soup.find("informaltable").tgroup.tbody.row.findAll('entry')
            head['Domstol'] = row[0].get_text(strip=True)
            head['Referat'] = row[1].get_text(strip=True)

        # Find the remaining simple metadata fields in the page header
        for key in self.labels:
            node = soup.find(text=re.compile(key + ':'))
            if node:
                txt = node.find_parent('entry').find_next_sibling('entry').get_text(strip=True)
                if txt:
                    head[key] = txt

        # Find compound metadata in the page header
        for key in ["Lagrum", "Rättsfall"]:
            node = soup.find(text=re.compile(key + ':'))
            if node:
                head[key] = []
                textchunk = node.find_parent(
                    'entry').find_next_sibling('entry').string
                for line in [util.normalize_space(x) for x in textchunk.split("\n\n")]:
                    if line:
                        head[key].append(line)

        body = []
        for p in soup.find(text=re.compile('REFERAT')).find_parent('tgroup').find_next_sibling('tgroup').find('entry').get_text(strip=True).split("\n\n"):
            body.append(p)

        # Find compound metadata in the page footer
        head['Sökord'] = soup.find(text=re.compile('Sökord:')).find_parent(
            'entry').next_sibling.next_sibling.get_text(strip=True)

        if soup.find(text=re.compile('^\s*Litteratur:\s*$')):
            n = soup.find(text=re.compile('^\s*Litteratur:\s*$')).find_parent(
                'entry').next_sibling.next_sibling.get_text(strip=True)
            head['Litteratur'] = n
        return head, body
Example No. 8
 def sanitize_term(self, term):
     # sanity checking -- not everything can be a legit
     # keyword. Must be under 100 chars and not start with . or /
     term = util.normalize_space(term)
     if (self.term_max_len >= len(term) >= self.term_min_len
             and term[0] not in self.invalid_term_start
             and term[-1] not in self.invalid_term_end):
         return term
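
A hedged usage sketch of the same checks as a standalone function; the length limits and invalid start/end characters are made-up stand-ins for the instance attributes, and " ".join(term.split()) stands in for util.normalize_space:

 def sanitize_term(term, term_min_len=2, term_max_len=100,
                   invalid_term_start=(".", "/"), invalid_term_end=(".", ",")):
     term = " ".join(term.split())   # stand-in for util.normalize_space
     if (term_max_len >= len(term) >= term_min_len
             and term[0] not in invalid_term_start
             and term[-1] not in invalid_term_end):
         return term                 # implicitly returns None otherwise

 print(sanitize_term("  Allmän   handling "))   # -> "Allmän handling"
 print(sanitize_term("/etc/passwd"))            # -> None (invalid first character)
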
Example No. 9
 def sanitize_metadata(self, attribs, basefile):
     attribs = super(PropTrips, self).sanitize_metadata(attribs, basefile)
     if ('dcterms:title' in attribs and 'dcterms:identifier' in attribs
             and attribs['dcterms:title'].endswith(
                 attribs['dcterms:identifier'])):
         x = attribs['dcterms:title'][:-len(attribs['dcterms:identifier'])]
         attribs['dcterms:title'] = util.normalize_space(x)
     return attribs
Example No. 10
 def sanitize_metadata(self, attribs, basefile):
     attribs = super(PropTrips, self).sanitize_metadata(attribs, basefile)
     if ('dcterms:title' in attribs and
         'dcterms:identifier' in attribs and
         attribs['dcterms:title'].endswith(attribs['dcterms:identifier'])):
         x = attribs['dcterms:title'][:-len(attribs['dcterms:identifier'])]
         attribs['dcterms:title'] = util.normalize_space(x)
     return attribs
Example No. 11
 def sanitize_term(self, term):
     # sanity checking -- not everything can be a legit
     # keyword. Must be under 100 chars and not start with . or /
     term = util.normalize_space(term)
     if (self.term_max_len >= len(term) >= self.term_min_len and 
         term[0] not in self.invalid_term_start and 
         term[-1] not in self.invalid_term_end):
         return term
Example No. 12
 def htmlparser(chunks):
     b = Body()
     for block in chunks:
         tagtype = Preformatted if block.name == "pre" else Paragraph
         t = util.normalize_space(''.join(block.findAll(text=True)))
         block.extract()  # to avoid seeing it again
         if t:
             b.append(tagtype([t]))
     return b
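
A hedged usage sketch for the parser above, assuming htmlparser and the ferenda.elements classes it uses are already in scope; the HTML fragment is invented:

 from bs4 import BeautifulSoup

 soup = BeautifulSoup("<pre>  raw   dump </pre><p>A  paragraph.</p>", "html.parser")
 body = htmlparser(soup.find_all(["pre", "p"]))
 # body is now roughly Body([Preformatted(["raw dump"]), Paragraph(["A paragraph."])])
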
Example No. 13
 def htmlparser(chunks):
     b = Body()
     for block in chunks:
         tagtype = Preformatted if block.name == "pre" else Paragraph
         t = util.normalize_space(''.join(block.findAll(text=True)))
         block.extract()  # to avoid seeing it again
         if t:
             b.append(tagtype([t]))
     return b
Example No. 14
 def find_firstpage_metadata(self, firstpage, basefile):
     res = {}
     m = re.search("proposition till riksdagen *,? *(.*?); gif?ven",
                   util.normalize_space(firstpage), flags=re.I)
     if not m:
         self.log.warning("%s: Couldn't find title in first %s characters (first page)" %
                          (basefile, len(firstpage)))
     else:
         res["dcterms:title"] = m.groups(1)
     m = re.search("gif?ven stockholms slott den (\d+ \w+ \d{4})", util.normalize_space(firstpage), flags=re.I)
     if not m:
         self.log.warning("%s: Couldn't find date in first %s characters (first page)" %
                          (basefile, len(firstpage)))
     else:
         try:
             res["dcterms:issued"] = self.parse_swedish_date(m.group(1).lower())
         except ValueError as e:
             self.log.warning("%s: Couldn't parse date %s" % (basefile, m.group(1)))
     return res
Example No. 15
 def as_plaintext(self):
     """Returns the plain text of this element, including child elements."""
     res = []
     for subpart in self:
         if isinstance(subpart, str):
             res.append(util.normalize_space(subpart))
         elif (isinstance(subpart, AbstractElement) or hasattr(subpart, 'as_plaintext')):
             res.append(subpart.as_plaintext())
     # the rule for concatenating children into a plaintext string is:
     # filter out all empty children, then place single space between the others.
     return " ".join(filter(None,res))
Example No. 16
 def extract_metadata(self, rawhead, basefile):
     res = self.metadata_from_basefile(basefile)
     # extracting title and other metadata (dep, publication date
     # etc) requires parsing of the body (and subsequent processing
     # in postprocess_doc). For documents marked as metadataonly in
     # options.py, the body is never parsed. Therefore, we do a
     # very limited parsing of the first page here.
     if self.get_parse_options(basefile) == "metadataonly":
         text = util.normalize_space(etree.tostring(rawhead, method="text", encoding="utf-8").decode("utf-8"))
         res.update(self.find_firstpage_metadata(text, basefile))
     return res
Example No. 17
 def sanitize_metadata(self, attribs, basefile):
     # remove trailing "Avgörande 1993-05-03; 92-2571"
     if attribs['dcterms:title'].strip():
         attribs['dcterms:title'] = Literal(
             re.sub("Avgörande \d+-\d+-\d+; \d+-\d+\.?",
                    "", util.normalize_space(attribs['dcterms:title'])),
             lang="sv")
     else:
         del attribs['dcterms:title'] # no real content -- delete
                                      # it and fill the value with
                                      # stuff from the document
                                      # later.
     return attribs
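
A hedged illustration of the title clean-up above, with an invented title string; rdflib's Literal is used as in the example:

 import re
 from rdflib import Literal

 title = "Fråga om reklamation   Avgörande 1993-05-03; 92-2571."
 cleaned = Literal(re.sub(r"Avgörande \d+-\d+-\d+; \d+-\d+\.?", "",
                          " ".join(title.split())), lang="sv")
 print(repr(cleaned))   # -> rdflib.term.Literal('Fråga om reklamation ', lang='sv')
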
Example No. 18
File: arn.py Project: zigit/ferenda
 def sanitize_metadata(self, attribs, basefile):
     # remove trailing "Avgörande 1993-05-03; 92-2571"
     if attribs['dcterms:title'].strip():
         attribs['dcterms:title'] = Literal(re.sub(
             "Avgörande \d+-\d+-\d+; \d+-\d+\.?", "",
             util.normalize_space(attribs['dcterms:title'])),
                                            lang="sv")
     else:
         del attribs['dcterms:title']  # no real content -- delete
         # it and fill the value with
         # stuff from the document
         # later.
     return attribs
Example No. 19
 def as_plaintext(self):
     """Returns the plain text of this element, including child elements."""
     res = []
     for subpart in self:
         if isinstance(subpart, str):
             res.append(util.normalize_space(subpart))
         elif (isinstance(subpart, AbstractElement) or
               hasattr(subpart, 'as_plaintext')):
             res.append(subpart.as_plaintext())
     # the rule for concatenating children into a plaintext string is:
     # filter out all empty children, then place single space between
     # the others.
     return " ".join(filter(None, res))
Example No. 20
 def _extract_plaintext(self, resource, resources):
     about = resource.get("about")
     if about and "#sid" in about:
         # select all text content contained in the first 2 <p>
         # tags following the pagebreak -- this should typically be
         # enough to show a helpful snippet in the autocomplete box
         nodes = resource.xpath("following::h:p[position() < 2]//text()",
                                namespaces={'h': 'http://www.w3.org/1999/xhtml'})
         plaintext = util.normalize_space(" ".join(nodes))
         if not plaintext:
             plaintext = "(Sid %s saknar text)" % about.split("#sid")[1]
         return plaintext
     else:
         return super(FixedLayoutSource, self)._extract_plaintext(resource, resources)
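
A hedged, self-contained illustration of the XPath above on a toy XHTML fragment; note that position() < 2 keeps only the first following paragraph:

 from lxml import etree

 doc = etree.fromstring(
     '<html xmlns="http://www.w3.org/1999/xhtml"><body>'
     '<span about="doc#sid3"/><p>First  snippet.</p><p>Second.</p>'
     '</body></html>')
 resource = doc.find(".//{http://www.w3.org/1999/xhtml}span")
 nodes = resource.xpath("following::h:p[position() < 2]//text()",
                        namespaces={'h': 'http://www.w3.org/1999/xhtml'})
 print(" ".join(" ".join(nodes).split()))   # -> "First snippet."
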
Example No. 21
 def postprocess_doc(self, doc):
     if self.get_parse_options(doc.basefile) == "metadataonly":
         return
     # the first thing will be a Sidbrytning; continue scanning text until next sidbrytning
     firstpage = ""
     for thing in doc.body[1:]:
         if isinstance(thing, Sidbrytning):
             break
         elif isinstance(thing, Textbox):
             firstpage += util.normalize_space(str(thing)) + "\n\n"
     metadata = self.find_firstpage_metadata(firstpage, doc.basefile)
     if "dcterms:title" in metadata:
         doc.meta.add((URIRef(doc.uri), DCTERMS.title, Literal(metadata["dcterms:title"], lang=self.lang)))
     if "dcterms:issued" in metadata:
         doc.meta.add((URIRef(doc.uri), DCTERMS.issued, Literal(metadata["dcterms:issued"])))
Example No. 22
 def _extract_plaintext(self, resource, resources):
     about = resource.get("about")
     if about and "#sid" in about:
         # select all text content contained in the first 2 <p>
         # tags following the pagebreak -- this should typically be
         # enough to show a helpful snippet in the autocomplete box
         nodes = resource.xpath(
             "following::h:p[position() < 2]//text()",
             namespaces={'h': 'http://www.w3.org/1999/xhtml'})
         plaintext = util.normalize_space(" ".join(nodes))
         if not plaintext:
             plaintext = "(Sid %s saknar text)" % about.split("#sid")[1]
         return plaintext
     else:
         return super(FixedLayoutSource,
                      self)._extract_plaintext(resource, resources)
Example No. 23
 def parse_from_textreader(self, textreader, doc):
     describer = Describer(doc.meta, doc.uri)
     for p in textreader.getiterator(textreader.readparagraph):
         # print "Handing %r (%s)" % (p[:40], len(doc.body))
         if not p.strip():
             continue
         elif not doc.body and 'Obs! Dokumenten i denna databas kan vara ofullständiga.' in p:
             continue
         elif not doc.body and p.strip().startswith("Dokument:"):
             # We already know this
             continue
         elif not doc.body and p.strip().startswith("Titel:"):
             describer.value(
                 self.ns['dct'].title, util.normalize_space(p[7:]))
         else:
             doc.body.append(Preformatted([p]))
Example No. 24
    def sanitize_metadata(self, a, basefile):
        # trim space
        for k in ("dcterms:title", "dcterms:abstract"):
            if k in a:
                a[k] = util.normalize_space(a[k])
        # trim identifier
        try:
            # The identifier displayed on the HTML page is not always
            # correct -- it might be missing digits (eg "SOU 207:111"
            # instead of "SOU 2017:111"). Try to sanitize it, but if
            # we fail, infer it from our basefile instead.
            a["dcterms:identifier"] = self.sanitize_identifier(
                a["dcterms:identifier"].replace("ID-nummer: ", ""))
        except ValueError as e:
            inferred_identifier = str(self.infer_identifier(basefile))
            self.log.warning(
                "%s: Irregular identifier %s, using inferred identifier %s instead"
                % (basefile, a["dcterms:identifier"], inferred_identifier))
            a["dcterms:identifier"] = inferred_identifier
        # save for later
        self._identifier = a["dcterms:identifier"]
        # it's rare, but in some cases a document can be published by
        # two different departments (eg dir. 2011:80). Convert string
        # to a list in these cases (SwedishLegalSource.polish_metadata
        # will handle that)
        if "rpubl:departement" in a and ", " in a["rpubl:departement"]:
            a["rpubl:departement"] = a["rpubl:departement"].split(", ")
        # remove empty utgarFran list
        if a["rpubl:utgarFran"]:
            a["rpubl:utgarFran"] = [URIRef(x) for x in a["rpubl:utgarFran"]]
        else:
            del a["rpubl:utgarFran"]

        # FIXME: possibly derive utrSerie from self.document_type?
        if self.rdf_type == RPUBL.Utredningsbetankande:
            altlabel = "SOU" if self.document_type == Regeringen.SOU else "Ds"
            a["rpubl:utrSerie"] = self.lookup_resource(altlabel, SKOS.altLabel)
        return a
Example No. 25
    def sanitize_metadata(self, a, basefile):
        # trim space
        for k in ("dcterms:title", "dcterms:abstract"):
            if k in a:
                a[k] = util.normalize_space(a[k])
        # trim identifier
        try:
            # The identifier displayed on the HTML page is not always
            # correct -- it might be missing digits (eg "SOU 207:111"
            # instead of "SOU 2017:111"). Try to sanitize it, but if
            # we fail, infer it from our basefile instead.
            a["dcterms:identifier"] = self.sanitize_identifier(
                a["dcterms:identifier"].replace("ID-nummer: ", ""))
        except ValueError as e:
            inferred_identifier = str(self.infer_identifier(basefile))
            self.log.warning("%s: Irregular identifier %s, using inferred identifier %s instead" % (basefile, a["dcterms:identifier"], inferred_identifier))
            a["dcterms:identifier"] = inferred_identifier
        # save for later
        self._identifier = a["dcterms:identifier"]
        # it's rare, but in some cases a document can be published by
        # two different departments (eg dir. 2011:80). Convert string
        # to a list in these cases (SwedishLegalSource.polish_metadata
        # will handle that)
        if "rpubl:departement" in a and ", " in a["rpubl:departement"]:
            a["rpubl:departement"] = a["rpubl:departement"].split(", ")
        # remove empty utgarFran list
        if a["rpubl:utgarFran"]:
            a["rpubl:utgarFran"] = [URIRef(x) for x in a["rpubl:utgarFran"]]
        else:
            del a["rpubl:utgarFran"]

        # FIXME: possibly derive utrSerie from self.document_type?
        if self.rdf_type == RPUBL.Utredningsbetankande:
            altlabel = "SOU" if self.document_type == Regeringen.SOU else "Ds"
            a["rpubl:utrSerie"] = self.lookup_resource(altlabel, SKOS.altLabel)
        return a
Example No. 26
    def test_ocr(self):
        try:
            if not os.environ.get("FERENDA_TEST_TESSERACT"):
                raise errors.ExternalCommandError
            reader = PDFReader(filename="test/files/pdfreader/scanned.pdf",
                               workdir=self.datadir,
                               ocr_lang="swe")
        except errors.ExternalCommandError:
            self._copy_sample()
            reader = PDFReader(filename="test/files/pdfreader/scanned.pdf",
                               workdir=self.datadir,
                               ocr_lang="swe")

        # assert that a hOCR file has been created
        self.assertTrue(
            os.path.exists(self.datadir + os.sep + "scanned.hocr.html"))

        # assert that we have two pages
        self.assertEqual(2, len(reader))

        # assert that first element in the first textbox in the first
        # page corresponds to the first bbox, scaled by the
        # pixel/point scaling factor.
        self.assertEqual("Regeringens ", str(reader[0][0][0]))
        self.assertEqual(47, reader[0][0][0].top)
        self.assertEqual(38, reader[0][0][0].left)
        self.assertEqual(21, reader[0][0][0].height)
        self.assertEqual(118, reader[0][0][0].width)

        # assert that the fifth textbox (which has mostly normal
        # text) is rendered correctly (note that we have a couple
        # of OCR errors).
        # self.assertEqual("Regeringen föreslår riksdagen att anta de förslag som har tagits. upp i bifogade utdrag ur regeringsprotokollet den 31 oktober l99l.", util.normalize_space(str(reader[0][3])))
        self.assertEqual(
            "Regeringen föreslår riksdagen att anta de förslag som har tagits. upp i",
            util.normalize_space(str(reader[0][5])))
Example No. 27
    def test_fallback_ocr(self):
        try:
            # actually running tesseract takes ages -- for day-to-day
            # testing we can just as well use the canned hocr.html
            # files that _copy_sample fixes for us.
            if not os.environ.get("FERENDA_TEST_TESSERACT"):
                raise errors.ExternalCommandError
            reader = PDFReader(filename="test/files/pdfreader/scanned-ecma-99.pdf",
                               workdir=self.datadir,
                               images=False)
        except errors.ExternalCommandError:
            self._copy_sample()
            reader = PDFReader(filename="test/files/pdfreader/scanned-ecma-99.pdf",
                               workdir=self.datadir,
                               images=False)

        self.assertTrue(reader.is_empty())
        reader = PDFReader(filename="test/files/pdfreader/scanned-ecma-99.pdf",
                           workdir=self.datadir,
                           ocr_lang="eng")
        self.assertFalse(reader.is_empty())
        self.assertEqual(2, len(reader))
        self.assertEqual("EUROPEAN COMPUTER MANUFACTURERS ASSOCIATION",
                         util.normalize_space(str(reader[0][1])))
Example No. 28
    def analyze_baseline_queries(self, analyzed_articles, num_of_keyterms=5):
        basefile = "tfeu"
        # Helper from http://effbot.org/zone/element-lib.htm

        def flatten(elem, include_tail=0):
            text = elem.text or ""
            for e in elem:
                text += flatten(e, 1)
                if include_tail and elem.tail:
                    text += elem.tail
            return text
        # step 1: Create a temporary whoosh index in order to find out
        # the most significant words for each article

        #ana = analysis.StandardAnalyzer()
        ana = analysis.StemmingAnalyzer()
        # vectorformat = formats.Frequency(ana)
        schema = fields.Schema(article=fields.ID(unique=True),
                               content=fields.TEXT(analyzer=ana,
                                                   stored=True))

        st = RamStorage()
        tmpidx = st.create_index(schema)
        w = tmpidx.writer()

        XHT_NS = "{http://www.w3.org/1999/xhtml}"
        tree = ET.parse(self.parsed_path(basefile))
        els = tree.findall("//" + XHT_NS + "div")
        articles = []
        for el in els:
            if 'typeof' in el.attrib and el.attrib['typeof'] == "eurlex:Article":
                text = util.normalize_space(flatten(el))
                article = str(el.attrib['about'])
                articles.append(article)
                w.update_document(article=article, content=text)
        w.commit()
        self.log.info("Indexed %d articles" % len(articles))

        # Step 2: Open the large whoosh index containing the text of
        # all cases. Then, for each article, use the 5 most distinctive terms
        # (filtering away numbers) to create a query against that index
        tempsearch = tmpidx.searcher()
        g = Graph()
        g.bind('celex', 'http://lagen.nu/ext/celex/')
        g.bind('ir', 'http://lagen.nu/informationretrieval#')
        IR = Namespace('http://lagen.nu/informationretrieval#')
        # celex:12008E264 ir:keyterm "blahonga"@en.

        outfile = self.generic_path("keyterms", "analyzed", ".tex")
        util.ensure_dir(outfile)
        fp = open(outfile, "w")
        fp.write("""
\\begin{tabular}{r|%s}
  \\hline
  \\textbf{Art.} & \\multicolumn{%s}{l}{\\textbf{Terms}} \\\\
  \\hline
""" % ("l" * num_of_keyterms, num_of_keyterms))

        for article in analyzed_articles:
            fp.write(str(int(article.split("E")[1])))
            r = tempsearch.search(query.Term("article", article))
            terms = r.key_terms("content", numterms=num_of_keyterms + 1)
            terms = [t[0] for t in terms if not t[0].isdigit(
            )][:num_of_keyterms]
            for term in terms:
                fp.write(" & " + term)
                g.add((
                    URIRef(article), IR["keyterm"], Literal(term, lang="en")))
            self.log.debug("Article %s:%r" % (article, terms))
            fp.write("\\\\\n")
        fp.write("""
  \\hline
\\end{tabular}
""")
        fp.close()

        outfile = self.generic_path("keyterms", "analyzed", ".n3")
        util.ensure_dir(outfile)
        fp = open(outfile, "w")
        fp.write(g.serialize(format="n3"))
        fp.close()
Example No. 29
    def polish_metadata(self, head, doc):
        basefile_regex = re.compile('(?P<type>\w+)/(?P<year>\d+)-(?P<ordinal>\d+)')

        def basefile_to_referat(basefile):
            templ = {'ADO': 'AD %(year)s nr %(ordinal)s',
                     'MD': 'MD %(year)s:%(ordinal)s'}
            m = basefile_regex.match(basefile)
            if m:
                return templ[m.group("type")] % (m.groupdict())

        def ref_to_uri(ref):
            # FIXME: We'd like to retire legalref and replace it with
            # pyparsing grammars.
            nodes = self.rattsfall_parser.parse(ref)
            uri = nodes[0].uri
            return localize_uri(uri)

        def dom_to_uri(domstol, malnr, avg):
            baseuri = self.config.url
            slug = self.slugs[domstol]
            return "%(baseuri)sres/dv/%(slug)s/%(malnr)s/%(avg)s" % locals()

        def localize_uri(uri):
            if "publ/rattsfall" in uri:
                return uri.replace("http://rinfo.lagrummet.se/publ/rattsfall",
                                   self.config.url + "res/dv")
            elif "publ/sfs/" in uri:
                return uri.replace("http://rinfo.lagrummet.se/publ/sfs",
                                   self.config.url + "res/sfs")

        def split_nja(value):
            # "NJA 2008 s 567 (NJA 2008:86)"=>("NJA 2008 s 567", "NJA 2008:86")
            return [x[:-1] for x in value.split("(")]

        def sokord_uri(value):
            return self.config.url + "concept/%s" % util.ucfirst(value).replace(' ', '_')

        # 0. create Referat key if not present
        if "Referat" not in head:
            # For some courts (MD, AD, MOD?, MIG?) this is possible
            head["Referat"] = basefile_to_referat(doc.basefile)

        # 1. mint uris and create the two Describers we'll use
        refuri = ref_to_uri(head["Referat"])
        refdesc = Describer(doc.meta, refuri)
        domuri = dom_to_uri(head["Domstol"],
                            head["Målnummer"],
                            head["Avgörandedatum"])
        domdesc = Describer(doc.meta, domuri)

        # 2. convert all strings in head to proper RDF
        for label, value in head.items():
            if label == "Rubrik":
                value = util.normalize_space(value)
                refdesc.value(self.ns['rpubl'].referatrubrik, value, lang="sv")
                domdesc.value(self.ns['dct'].title, value, lang="sv")

            elif label == "Domstol":
                domdesc.rel(self.ns['dct'].publisher, self.lookup_resource(value))
            elif label == "Målnummer":
                domdesc.rel(self.ns['rpubl'].malnummer, value)
            elif label == "Domsnummer":
                domdesc.rel(self.ns['rpubl'].domsnummer, value)
            elif label == "Diarienummer":
                domdesc.rel(self.ns['rpubl'].diarienummer, value)
            elif label == "Avdelning":
                domdesc.rel(self.ns['rpubl'].avdelning, value)
            elif label == "Referat":

                for pred, regex in {'rattsfallspublikation': r'([^ ]+)',
                                    'arsutgava': r'(\d{4})',
                                    'lopnummer': r'\d{4}(?:\:| nr )(\d+)',
                                    'sidnummer': r's.? ?(\d+)'}.items():
                    m = re.search(regex, value)
                    if m:
                        if pred == 'rattsfallspublikation':
                            # "NJA" -> "http://lcaolhost:8000/coll/dv/nja"
                            uri = self.config.url + "coll/dv/" + m.group(1).lower()
                            refdesc.rel(self.ns['rpubl'][pred], uri)
                        else:
                            refdesc.value(self.ns['rpubl'][pred], m.group(1))

                    if value.startswith("NJA"):
                        realvalue, extra = split_nja(value)
                        ordinal = extra.split(" ")[1]
                        refdesc.value(self.ns['dct'].bibliographicCitation,
                                      extra)
                        refdesc.rel(self.ns['owl'].sameAs,
                                    self.config.url + "res/dv/nja/" + ordinal)
                        refdesc.value(self.ns['dct'].identifier, realvalue)
                    else:
                        refdesc.value(self.ns['dct'].identifier, value)

            elif label == "Avgörandedatum":
                with util.c_locale():
                    d = datetime.strptime(value, '%Y-%m-%d')
                domdesc.value(self.ns['rpubl'].avgorandedatum, d)

            elif label == "Lagrum":
                for i in value:  # better be list not string
                    for node in self.lagrum_parser.parse(i):
                        if isinstance(node, Link):

                            domdesc.rel(self.ns['rpubl'].lagrum,
                                        localize_uri(node.uri))
            elif label == "Rättsfall":
                for i in value:
                    for node in self.rattsfall_parser.parse(i):
                        if isinstance(node, Link):
                            domdesc.rel(self.ns['rpubl'].rattsfall,
                                        localize_uri(node.uri))
            elif label == "Litteratur":
                for i in value.split(";"):
                    domdesc.value(self.ns['dct'].relation, util.normalize_space(i))
            elif label == "Sökord":
                for s in self.re_delimSplit(value):
                    s = util.normalize_space(s)
                    if not s:
                        continue
                    # terms longer than 72 chars are not legitimate
                    # terms, but more likely descriptions. If a term has
                    # a " - " in it, it's probably a separator between a
                    # term and a description
                    while len(s) >= 72 and " - " in s:
                        h, s = s.split(" - ", 1)
                        domdesc.rel(self.ns['dct'].subject, sokord_uri(h))
                    if len(s) < 72:
                        domdesc.rel(self.ns['dct'].subject, sokord_uri(s))

        # 3. mint some owl:sameAs URIs
        refdesc.rel(self.ns['owl'].sameAs, self.sameas_uri(refuri))
        domdesc.rel(self.ns['owl'].sameAs, self.sameas_uri(domuri))

        # 4. Add some same-for-everyone properties
        refdesc.rel(self.ns['dct'].publisher, self.lookup_resource('Domstolsverket'))
        refdesc.rdftype(self.ns['rpubl'].Rattsfallsreferat)
        domdesc.rdftype(self.ns['rpubl'].VagledandeDomstolsavgorande)
        refdesc.rel(self.ns['rpubl'].referatAvDomstolsavgorande, domuri)
        # 5. assert that we have everything we need

        # 6. done!
        return refuri
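
A hedged illustration of the basefile_to_referat conversion above, with an invented basefile:

 import re

 basefile_regex = re.compile(r'(?P<type>\w+)/(?P<year>\d+)-(?P<ordinal>\d+)')
 templ = {'ADO': 'AD %(year)s nr %(ordinal)s',
          'MD': 'MD %(year)s:%(ordinal)s'}
 m = basefile_regex.match("ADO/2010-42")
 print(templ[m.group("type")] % m.groupdict())   # -> "AD 2010 nr 42"
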
Example No. 30
    def analyze_baseline_queries(self, analyzed_articles, num_of_keyterms=5):
        basefile = "tfeu"

        # Helper from http://effbot.org/zone/element-lib.htm

        def flatten(elem, include_tail=0):
            text = elem.text or ""
            for e in elem:
                text += flatten(e, 1)
                if include_tail and elem.tail:
                    text += elem.tail
            return text

        # step 1: Create a temporary whoosh index in order to find out
        # the most significant words for each article

        #ana = analysis.StandardAnalyzer()
        ana = analysis.StemmingAnalyzer()
        # vectorformat = formats.Frequency(ana)
        schema = fields.Schema(article=fields.ID(unique=True),
                               content=fields.TEXT(analyzer=ana, stored=True))

        st = RamStorage()
        tmpidx = st.create_index(schema)
        w = tmpidx.writer()

        XHT_NS = "{http://www.w3.org/1999/xhtml}"
        tree = ET.parse(self.parsed_path(basefile))
        els = tree.findall("//" + XHT_NS + "div")
        articles = []
        for el in els:
            if 'typeof' in el.attrib and el.attrib[
                    'typeof'] == "eurlex:Article":
                text = util.normalize_space(flatten(el))
                article = str(el.attrib['about'])
                articles.append(article)
                w.update_document(article=article, content=text)
        w.commit()
        self.log.info("Indexed %d articles" % len(articles))

        # Step 2: Open the large whoosh index containing the text of
        # all cases. Then, for each article, use the 5 most distinctive terms
        # (filtering away numbers) to create a query against that index
        tempsearch = tmpidx.searcher()
        g = Graph()
        g.bind('celex', 'http://lagen.nu/ext/celex/')
        g.bind('ir', 'http://lagen.nu/informationretrieval#')
        IR = Namespace('http://lagen.nu/informationretrieval#')
        # celex:12008E264 ir:keyterm "blahonga"@en.

        outfile = self.generic_path("keyterms", "analyzed", ".tex")
        util.ensure_dir(outfile)
        fp = open(outfile, "w")
        fp.write("""
\\begin{tabular}{r|%s}
  \\hline
  \\textbf{Art.} & \\multicolumn{%s}{l}{\\textbf{Terms}} \\\\
  \\hline
""" % ("l" * num_of_keyterms, num_of_keyterms))

        for article in analyzed_articles:
            fp.write(str(int(article.split("E")[1])))
            r = tempsearch.search(query.Term("article", article))
            terms = r.key_terms("content", numterms=num_of_keyterms + 1)
            terms = [t[0] for t in terms
                     if not t[0].isdigit()][:num_of_keyterms]
            for term in terms:
                fp.write(" & " + term)
                g.add((URIRef(article), IR["keyterm"], Literal(term,
                                                               lang="en")))
            self.log.debug("Article %s:%r" % (article, terms))
            fp.write("\\\\\n")
        fp.write("""
  \\hline
\\end{tabular}
""")
        fp.close()

        outfile = self.generic_path("keyterms", "analyzed", ".n3")
        util.ensure_dir(outfile)
        fp = open(outfile, "w")
        fp.write(g.serialize(format="n3"))
        fp.close()
Example No. 31
 def header_lines(self, header_chunk):
     header = re.compile("([^:]+):\s*<b>([^<]*)</b>")
     for m in header.finditer(header_chunk):
         yield [util.normalize_space(x) for x in m.groups()]
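
A hedged, self-contained variant of the generator above, with an invented HTML fragment; " ".join(x.split()) stands in for util.normalize_space:

 import re

 def header_lines(header_chunk):
     header = re.compile(r"([^:]+):\s*<b>([^<]*)</b>")
     for m in header.finditer(header_chunk):
         yield [" ".join(x.split()) for x in m.groups()]

 chunk = "Departement: <b>Justitiedepartementet</b> Utfärdad: <b>1998-12-17</b>"
 print(list(header_lines(chunk)))
 # -> [['Departement', 'Justitiedepartementet'], ['Utfärdad', '1998-12-17']]
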
Example No. 32
    def parse(self, doc):
        # some very simple heuristic rules for determining 
        # what an individual paragraph is
   
        def is_heading(p):
            # If it's on a single line and it isn't indented with spaces
            # it's probably a heading.
            if p.count("\n") == 0 and not p.startswith(" "):
                return True
  
        def is_pagebreak(p):
            # if it contains a form feed character, it represents a page break
            return "\f" in p
        
        # Parsing a document consists mainly of two parts:
        # 1: First we parse the body of text and store it in doc.body
        from ferenda.elements import Body, Preformatted, Title, Heading
        from ferenda import Describer
        reader = TextReader(self.store.downloaded_path(doc.basefile))
  
        # First paragraph of an RFC is always a header block 
        header = reader.readparagraph()
        # Preformatted is a ferenda.elements class representing a
        # block of preformatted text. It is derived from the built-in
        # list type, and must thus be initialized with an iterable, in
        # this case a single-element list of strings. (Note: if you
        # try to initialize it with a string, because strings are
        # iterables as well, you'll end up with a list where each
        # character in the string is an element, which is not what you
        # want).
        preheader = Preformatted([header])
        # Doc.body is a ferenda.elements.Body class, which is also
        # is derived from list, so it has (amongst others) the append
        # method. We build our document by adding to this root
        # element.
        doc.body.append(preheader)
  
        # Second paragraph is always the title, and we don't include
        # this in the body of the document, since we'll add it to the
        # metadata -- once is enough
        title = reader.readparagraph()
        
        # After that, just iterate over the document and guess what
        # everything is. TextReader.getiterator is useful for
        # iterating through a text in other chunks than single lines
        for para in reader.getiterator(reader.readparagraph):
            if is_heading(para):
                # Heading is yet another of these ferenda.elements
                # classes.
                doc.body.append(Heading([para]))
            elif is_pagebreak(para):
                # Just drop these remnants of a page-and-paper-based past
                pass
            else:
                # If we don't know that it's something else, it's a
                # preformatted section (the safest bet for RFC text).
                doc.body.append(Preformatted([para])) 

        # 2: Then we create metadata for the document and store it in
        # doc.meta (in this case using the convenience
        # ferenda.Describer class).

        desc = Describer(doc.meta, doc.uri)

        # Set the rdf:type of the document
        desc.rdftype(self.rdf_type)

        # Set the title we've captured as the dct:title of the document and 
        # specify that it is in English
        desc.value(self.ns['dct'].title, util.normalize_space(title), lang="en")

        # Construct the dct:identifier (eg "RFC 6991") for this document from the basefile
        desc.value(self.ns['dct'].identifier, "RFC " + doc.basefile)
  
        # find and convert the publication date in the header to a datetime 
        # object, and set it as the dct:issued date for the document   
        re_date = re.compile("(January|February|March|April|May|June|July|August|September|October|November|December) (\d{4})").search
        # This is a context manager that temporarily sets the system
        # locale to the "C" locale in order to be able to use strptime
        # with a string on the form "August 2013", even though the
        # system may use another locale.
        dt_match = re_date(header)
        if dt_match:
            with util.c_locale(): 
                dt = datetime.strptime(re_date(header).group(0), "%B %Y")
            pubdate = date(dt.year,dt.month,dt.day)
            # Note that using some python types (cf. datetime.date)
            # results in a datatyped RDF literal, ie in this case
            #   <http://localhost:8000/res/rfc/6994> dct:issued "2013-08-01"^^xsd:date
            desc.value(self.ns['dct'].issued, pubdate)
  
        # find any older RFCs that this document updates or obsoletes
        obsoletes = re.search("^Obsoletes: ([\d+, ]+)", header, re.MULTILINE)
        updates = re.search("^Updates: ([\d+, ]+)", header, re.MULTILINE)

        # Find the category of this RFC, store it as dct:subject
        cat_match = re.search("^Category: ([\w ]+?)(  |$)", header, re.MULTILINE)
        if cat_match:
            desc.value(self.ns['dct'].subject, cat_match.group(1))
            
        for predicate, matches in ((self.ns['rfc'].updates, updates),
                                   (self.ns['rfc'].obsoletes, obsoletes)):
            if matches is None:
                continue
            # add references between this document and these older rfcs, 
            # using either rfc:updates or rfc:obsoletes
            for match in matches.group(1).strip().split(", "):
                uri = self.canonical_uri(match)
                # Note that this uses our own unofficial
                # namespace/vocabulary
                # http://example.org/ontology/rfc/
                desc.rel(predicate, uri)
  
        # And now we're done. We don't need to return anything as
        # we've modified the Document object that was passed to
        # us. The calling code will serialize this modified object to
        # XHTML and RDF and store it on disk

# end parse1
        # Now do it again
        reader.seek(0)
        reader.readparagraph()
        reader.readparagraph()
        doc.body = Body()
        doc.body.append(preheader)
        # doc.body.append(Title([util.normalize_space(title)]))
# begin parse2                                   
        from ferenda.elements import Section, Subsection, Subsubsection

        # More heuristic rules: Section headers start at the beginning
        # of a line and are numbered. Subsections and subsubsections
        # have dotted numbers, optionally with a trailing period, ie
        # '9.2.' or '11.3.1'
        def is_section(p):
            return re.match(r"\d+\.? +[A-Z]", p)

        def is_subsection(p):
            return re.match(r"\d+\.\d+\.? +[A-Z]", p)

        def is_subsubsection(p):
            return re.match(r"\d+\.\d+\.\d+\.? +[A-Z]", p)

        def split_sectionheader(p):
            # returns a tuple of title, ordinal, identifier
            ordinal, title = p.split(" ",1)
            ordinal = ordinal.strip(".")
            return title.strip(), ordinal, "RFC %s, section %s" % (doc.basefile, ordinal)

        # Use a list as a simple stack to keep track of the nesting
        # depth of a document. Every time we create a Section,
        # Subsection or Subsubsection object, we push it onto the
        # stack (and clear the stack down to the appropriate nesting
        # depth). Every time we create some other object, we append it
        # to whatever object is at the top of the stack. As your rules
        # for representing the nesting of structure become more
        # complicated, you might want to use the
        # :class:`~ferenda.FSMParser` class, which lets you define
        # heuristic rules (recognizers), states and transitions, and
        # takes care of putting your structure together.
        stack = [doc.body]

        for para in reader.getiterator(reader.readparagraph):
            if is_section(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Section(title=title, ordinal=ordinal, identifier=identifier)
                stack[1:] = [] # clear all but bottom element
                stack[0].append(s) # add new section to body
                stack.append(s)    # push new section on top of stack
            elif is_subsection(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Subsection(title=title, ordinal=ordinal, identifier=identifier)
                stack[2:] = [] # clear all but bottom two elements
                stack[1].append(s) # add new subsection to current section
                stack.append(s)
            elif is_subsubsection(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Subsubsection(title=title, ordinal=ordinal, identifier=identifier)
                stack[3:] = [] # clear all but bottom three
                stack[-1].append(s) # add new subsubsection to current subsection
                stack.append(s)
            elif is_heading(para):
                stack[-1].append(Heading([para]))
            elif is_pagebreak(para):
                pass
            else:
                pre = Preformatted([para])
                stack[-1].append(pre)
# end parse2                                   

# begin citation1                                   
        from pyparsing import Word, CaselessLiteral, nums
        section_citation = (CaselessLiteral("section") + Word(nums+".").setResultsName("Sec")).setResultsName("SecRef")
        rfc_citation = ("[RFC" + Word(nums).setResultsName("RFC") + "]").setResultsName("RFCRef")
        section_rfc_citation = (section_citation + "of" + rfc_citation).setResultsName("SecRFCRef")
# end citation1                                   

# begin citation2
        def rfc_uriformatter(parts):
            uri = ""
            if 'RFC' in parts:
                 uri += self.canonical_uri(parts['RFC'].lstrip("0"))
            if 'Sec' in parts:
                 uri += "#S" + parts['Sec']
            return uri
# end citation2                                   

# begin citation3
        from ferenda import CitationParser, URIFormatter
        citparser = CitationParser(section_rfc_citation, 
                                   section_citation,
                                   rfc_citation)
        citparser.set_formatter(URIFormatter(("SecRFCRef", rfc_uriformatter),
                                             ("SecRef", rfc_uriformatter),
                                             ("RFCRef", rfc_uriformatter)))
        citparser.parse_recursive(doc.body)
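
A hedged, self-contained check of what the citation grammar above matches; the real code passes the named parts on to rfc_uriformatter, which is skipped here:

 from pyparsing import Word, CaselessLiteral, nums

 section_citation = (CaselessLiteral("section") +
                     Word(nums + ".").setResultsName("Sec")).setResultsName("SecRef")
 rfc_citation = ("[RFC" + Word(nums).setResultsName("RFC") + "]").setResultsName("RFCRef")
 section_rfc_citation = (section_citation + "of" +
                         rfc_citation).setResultsName("SecRFCRef")

 parts = section_rfc_citation.parseString("section 5.3 of [RFC 2616]")
 print(parts["Sec"], parts["RFC"])   # -> 5.3 2616
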
Example No. 33
    def find_definitions(self, element, find_definitions):
        if not isinstance(element, CompoundElement):
            return None
        find_definitions_recursive = find_definitions
        # Find term definitions
        if isinstance(element, Paragraf):
            # check whether the first paragraph contains text that
            # suggests definitions will follow
            # self.log.debug("Testing %r against some regexes" % element[0][0])
            if self.re_definitions(element[0][0]):
                find_definitions = "normal"
            if (self.re_brottsdef(element[0][0]) or
                    self.re_brottsdef_alt(element[0][0])):
                find_definitions = "brottsrubricering"
            if self.re_parantesdef(element[0][0]):
                find_definitions = "parantes"
            if self.re_loptextdef(element[0][0]):
                find_definitions = "loptext"

            for p in element:
                if isinstance(p, Stycke):
                    # do an extra check in case "I denna paragraf
                    # avses med" occurs in the 2nd or later
                    # paragraph of a section
                    if self.re_definitions(p[0]):
                        find_definitions = "normal"
            find_definitions_recursive = find_definitions

        # Find statute references + definitions
        if isinstance(element, (Stycke, Listelement, Tabellrad)):
            nodes = []
            term = None

            # self.log.debug("handling text %s, find_definitions %s" % (element[0],find_definitions))
            if find_definitions:
                # For Tabellrad, this is a Tabellcell, not a string,
                # but we fix that later
                elementtext = element[0] 
                termdelimiter = ":"

                if isinstance(element, Tabellrad):
                    # only the first cell can be a definition, and
                    # only if it's not the text "Beteckning". So for
                    # the remainder of this func, we switch context to
                    # not the element itself but rather the first
                    # cell.
                    element = elementtext 
                    elementtext = element[0]
                    if elementtext != "Beteckning":
                        term = elementtext
                        self.log.debug(
                            '"%s" är nog en definition (1)' % term)
                elif isinstance(element, Stycke):

                    # Case 1: "antisladdsystem: ett tekniskt stödsystem"
                    # Sometimes, : is not the delimiter between
                    # the term and the definition, but even in
                    # those cases, : might figure in the
                    # definition itself, usually as part of the
                    # SFS number. Do some hairy heuristics to find
                    # out what delimiter to use
                    if find_definitions == "normal":
                        if not self.re_definitions(elementtext):
                            if " - " in elementtext:
                                if (":" in elementtext and
                                        (elementtext.index(":") < elementtext.index(" - "))):
                                    termdelimiter = ":"
                                else:
                                    termdelimiter = " - "
                            m = self.re_SearchSfsId(elementtext)

                            if termdelimiter == ":" and m and m.start() < elementtext.index(
                                    ":"):
                                termdelimiter = " "

                            if termdelimiter in elementtext:
                                term = elementtext.split(termdelimiter)[0]
                                self.log.debug('"%s" är nog en definition (2.1)' % term)

                    # case 2: "Den som berövar annan livet, döms
                    # för mord till fängelse"
                    m = self.re_brottsdef(elementtext)
                    if m:
                        term = m.group(2)
                        self.log.debug(
                            '"%s" är nog en definition (2.2)' % term)

                    # case 3: "För miljöbrott döms till böter"
                    m = self.re_brottsdef_alt(elementtext)
                    if m:
                        term = m.group(1)
                        self.log.debug(
                            '"%s" är nog en definition (2.3)' % term)

                    # case 4: "Inteckning får på ansökan av
                    # fastighetsägaren dödas (dödning)."
                    m = self.re_parantesdef(elementtext)
                    if m:
                        term = m.group(1)
                        # print("%s: %s" %  (basefile, elementtext))
                        self.log.debug(
                            '"%s" är nog en definition (2.4)' % term)

                    # case 5: "Med detaljhandel avses i denna lag
                    # försäljning av läkemedel"
                    m = self.re_loptextdef(elementtext)
                    if m:
                        term = m.group(1)
                        self.log.debug(
                            '"%s" är nog en definition (2.5)' % term)

                elif isinstance(element, Listelement):
                    for rx in (self.re_Bullet,
                               self.re_DottedNumber,
                               self.re_Bokstavslista):
                        elementtext = rx.sub('', elementtext)
                    term = elementtext.split(termdelimiter)[0]
                    self.log.debug('"%s" är nog en definition (3)' % term)

                # Longest legitimate term found "Valutaväxling,
                # betalningsöverföring och annan finansiell
                # verksamhet"
                if term and len(term) < 68:
                    term = util.normalize_space(term)
                    termnode = LinkSubject(term, uri=self._term_to_subject(
                        term), predicate="dcterms:subject")
                    find_definitions_recursive = False
                else:
                    term = None

            if term:
                idx = None
                for p in element:
                    if isinstance(p, str) and term in p:
                        (head, tail) = p.split(term, 1)
                        nodes = (head, termnode, tail)
                        idx = element.index(p)
                if idx is not None:
                    element[idx:idx + 1] = nodes

        return find_definitions_recursive
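The closing lines of this example splice a link node into the element by replacing the string child that contains the term with a (head, termnode, tail) triple. A minimal standalone sketch of that splice, using a made-up Link stand-in rather than ferenda's LinkSubject:

class Link(str):
    # Illustrative stand-in for ferenda's LinkSubject: a str that carries a URI.
    def __new__(cls, text, uri):
        obj = super().__new__(cls, text)
        obj.uri = uri
        return obj

def splice_term(element, term, uri):
    # element is a list of strings; wrap the first occurrence of term in a Link
    for idx, child in enumerate(element):
        if isinstance(child, str) and term in child:
            head, tail = child.split(term, 1)
            element[idx:idx + 1] = [head, Link(term, uri=uri), tail]
            return True
    return False

para = ["antisladdsystem: ett tekniskt stödsystem ..."]
splice_term(para, "antisladdsystem", "https://example.org/begrepp/antisladdsystem")
print(para)  # ['', 'antisladdsystem', ': ett tekniskt stödsystem ...'] -- the middle item carries .uri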
Exemplo n.º 34
0
    def parse_from_textreader(self, reader, basefile):
        tracelog = logging.getLogger("%s.tracelog" % self.alias)

        doc = self.make_document(basefile)
        g = doc.meta

        # 1.2: Load known entities and their URIs (we have to add some
        # that are not yet in the official resource lists)
        resource_list_file = self.store.path("resourcelist", "intermediate", ".rdf")
        if not os.path.exists(resource_list_file):
            self.download_resource_lists("http://service.lagrummet.se/var/common", resource_list_file)
        resources = Graph()
        resources.parse(resource_list_file, format="xml")

        # 1.3: Define regexps for the data we search for.
        fwdtests = {
            "dct:issn": ["^ISSN (\d+\-\d+)$"],
            "dct:title": ["((?:Föreskrifter|[\w ]+s (?:föreskrifter|allmänna råd)).*?)\n\n"],
            "dct:identifier": ["^([A-ZÅÄÖ-]+FS\s\s?\d{4}:\d+)$"],
            "rpubl:utkomFranTryck": ["Utkom från\strycket\s+den\s(\d+ \w+ \d{4})"],
            "rpubl:omtryckAv": ["^(Omtryck)$"],
            "rpubl:genomforDirektiv": ["Celex (3\d{2,4}\w\d{4})"],
            "rpubl:beslutsdatum": ["(?:har beslutats|beslutade|beslutat) den (\d+ \w+ \d{4})"],
            "rpubl:beslutadAv": [
                "\n([A-ZÅÄÖ][\w ]+?)\d? (?:meddelar|lämnar|föreskriver)",
                "\s(?:meddelar|föreskriver) ([A-ZÅÄÖ][\w ]+?)\d?\s",
            ],
            "rpubl:bemyndigande": [
                " ?(?:meddelar|föreskriver|Föreskrifterna meddelas|Föreskrifterna upphävs)\d?,? (?:följande |)med stöd av\s(.*?) ?(?:att|efter\ssamråd|dels|följande|i fråga om|och lämnar allmänna råd|och beslutar följande allmänna råd|\.\n)",
                "^Med stöd av (.*)\s(?:meddelar|föreskriver)",
            ],
        }

        # 2: Find metadata properties

        # 2.1 Find some of the properties on the first page (or the
        # 2nd, or 3rd... continue past TOC pages, cover pages etc
        # until the "real" first page is found) NB: FFFS 2007:1 has
        # ten (10) TOC pages!
        pagecnt = 0
        for page in reader.getiterator(reader.readpage):
            # replace single newlines with spaces, but keep double
            # newlines
            # page = "\n\n".join([util.normalize_space(x) for x in page.split("\n\n")])
            pagecnt += 1
            props = {}
            for (prop, tests) in list(fwdtests.items()):
                if prop in props:
                    continue
                for test in tests:
                    m = re.search(test, page, re.MULTILINE | re.DOTALL | re.UNICODE)
                    if m:
                        props[prop] = util.normalize_space(m.group(1))
            # Single required property. If we find this, we're done
            if "rpubl:beslutsdatum" in props:
                break
            self.log.warning("%s: Couldn't find required props on page %s" % (basefile, pagecnt))

        # 2.2 Find some of the properties on the last 'real' page (not
        # counting appendicies)
        reader.seek(0)
        pagesrev = reversed(list(reader.getiterator(reader.readpage)))
        # The language used to express these two properties differs
        # quite a lot, more than what is reasonable to express in a
        # single regex. We therefore define a set of possible
        # expressions and try them in turn.
        revtests = {
            "rpubl:ikrafttradandedatum": [
                "(?:Denna författning|Dessa föreskrifter|Dessa allmänna råd|Dessa föreskrifter och allmänna råd)\d* träder i ?kraft den (\d+ \w+ \d{4})",
                "Dessa föreskrifter träder i kraft, (?:.*), i övrigt den (\d+ \w+ \d{4})",
                "ska(?:ll|)\supphöra att gälla (?:den |)(\d+ \w+ \d{4}|denna dag|vid utgången av \w+ \d{4})",
                "träder i kraft den dag då författningen enligt uppgift på den (utkom från trycket)",
            ],
            "rpubl:upphaver": [
                "träder i kraft den (?:\d+ \w+ \d{4}), då(.*)ska upphöra att gälla",
                "ska(?:ll|)\supphöra att gälla vid utgången av \w+ \d{4}, nämligen(.*?)\n\n",
                "att (.*) skall upphöra att gälla (denna dag|vid utgången av \w+ \d{4})",
            ],
        }

        cnt = 0
        for page in pagesrev:
            cnt += 1
            # Normalize the whitespace in each paragraph so that a
            # linebreak in the middle of the natural language
            # expression doesn't break our regexes.
            page = "\n\n".join([util.normalize_space(x) for x in page.split("\n\n")])

            for (prop, tests) in list(revtests.items()):
                if prop in props:
                    continue
                for test in tests:
                    # Not re.DOTALL -- we've normalized whitespace and
                    # don't want to match across paragraphs
                    m = re.search(test, page, re.MULTILINE | re.UNICODE)
                    if m:
                        props[prop] = util.normalize_space(m.group(1))
                        # print u"%s: '%s' resulted in match '%s' at page %s from end" %
                        # (prop,test,props[prop], cnt)

            # Single required property. If we find this, we're done
            if "rpubl:ikrafttradandedatum" in props:
                break

        # 3: Clean up data - converting strings to Literals or
        # URIRefs, find legal references, etc
        if "dct:identifier" in props:
            (publication, year, ordinal) = re.split("[ :]", props["dct:identifier"])
            # FIXME: Read resources graph instead
            fs = resources.value(predicate=self.ns["skos"].altLabel, object=Literal(publication, lang="sv"))
            props["rpubl:forfattningssamling"] = fs
            publ = resources.value(subject=fs, predicate=self.ns["dct"].publisher)
            props["dct:publisher"] = publ

            props["rpubl:arsutgava"] = Literal(year)  # conversion to int, date not needed
            props["rpubl:lopnummer"] = Literal(ordinal)
            props["dct:identifier"] = Literal(props["dct:identifier"])

            # Now we can mint the uri (should be done through LegalURI)
            uri = "http://rinfo.lagrummet.se/publ/%s/%s:%s" % (
                props["rpubl:forfattningssamling"].split("/")[-1],
                props["rpubl:arsutgava"],
                props["rpubl:lopnummer"],
            )
            self.log.debug("URI: %s" % uri)
        else:
            self.log.error("Couldn't find dct:identifier, cannot create URI, giving up")
            return None

        tracelog.info("Cleaning rpubl:beslutadAv")
        if "rpubl:beslutadAv" in props:
            agency = resources.value(
                predicate=self.ns["foaf"].name, object=Literal(props["rpubl:beslutadAv"], lang="sv")
            )
            if agency:
                props["rpubl:beslutadAv"] = agency
            else:
                self.log.warning("Cannot find URI for rpubl:beslutadAv value %r" % props["rpubl:beslutadAv"])
                del props["rpubl:beslutadAv"]

        tracelog.info("Cleaning dct:issn")
        if "dct:issn" in props:
            props["dct:issn"] = Literal(props["dct:issn"])

        tracelog.info("Cleaning dct:title")

        # common false positive
        if "dct:title" in props and "denna f\xf6rfattning har beslutats den" in props["dct:title"]:
            del props["dct:title"]

        if "dct:title" in props:
            tracelog.info("Inspecting dct:title %r" % props["dct:title"])
            # sometimes the title isn't separated from the rest of the text by two newlines
            if "\nbeslutade den " in props["dct:title"]:
                props["dct:title"] = props["dct:title"].split("\nbeslutade den ")[0]
            props["dct:title"] = Literal(util.normalize_space(props["dct:title"]), lang="sv")

            if re.search("^(Föreskrifter|[\w ]+s föreskrifter) om ändring i ", props["dct:title"], re.UNICODE):
                tracelog.info("Finding rpubl:andrar in dct:title")
                orig = re.search("([A-ZÅÄÖ-]+FS \d{4}:\d+)", props["dct:title"]).group(0)
                (publication, year, ordinal) = re.split("[ :]", orig)
                origuri = "http://rinfo.lagrummet.se/publ/%s/%s:%s" % (
                    self.rpubl_uri_transform(publication),
                    year,
                    ordinal,
                )
                props["rpubl:andrar"] = URIRef(origuri)
                if "rpubl:omtryckAv" in props:
                    props["rpubl:omtryckAv"] = URIRef(origuri)
            if (
                re.search("^(Föreskrifter|[\w ]+s föreskrifter) om upphävande av", props["dct:title"], re.UNICODE)
                and not "rpubl:upphaver" in props
            ):
                tracelog.info("Finding rpubl:upphaver in dct:title")
                props["rpubl:upphaver"] = six.text_type(props["dct:title"])  # cleaned below

        tracelog.info("Cleaning date properties")
        for prop in ("rpubl:utkomFranTryck", "rpubl:beslutsdatum", "rpubl:ikrafttradandedatum"):
            if prop in props:
                if props[prop] == "denna dag" and prop == "rpubl:ikrafttradandedatum":
                    props[prop] = props["rpubl:beslutsdatum"]
                elif props[prop] == "utkom från trycket" and prop == "rpubl:ikrafttradandedatum":
                    props[prop] = props["rpubl:utkomFranTryck"]
                else:
                    props[prop] = Literal(self.parse_swedish_date(props[prop].lower()))

        tracelog.info("Cleaning rpubl:genomforDirektiv")
        if "rpubl:genomforDirektiv" in props:
            props["rpubl:genomforDirektiv"] = URIRef(
                "http://rinfo.lagrummet.se/ext/eur-lex/%s" % props["rpubl:genomforDirektiv"]
            )

        tracelog.info("Cleaning rpubl:bemyndigande")
        has_bemyndiganden = False

        if "rpubl:bemyndigande" in props:
            # SimpleParse can't handle unicode endash sign, transform
            # into regular ascii hyphen
            props["rpubl:bemyndigande"] = props["rpubl:bemyndigande"].replace("\u2013", "-")
            parser = LegalRef(LegalRef.LAGRUM)
            result = parser.parse(props["rpubl:bemyndigande"])
            bemyndigande_uris = [x.uri for x in result if hasattr(x, "uri")]

            # some of these uris need to be filtered away due to
            # over-matching by parser.parse
            filtered_bemyndigande_uris = []
            for bem_uri in bemyndigande_uris:
                keep = True
                for compare in bemyndigande_uris:
                    if len(compare) > len(bem_uri) and compare.startswith(bem_uri):
                        keep = False
                if keep:
                    filtered_bemyndigande_uris.append(bem_uri)

            for bem_uri in filtered_bemyndigande_uris:
                g.add((URIRef(uri), self.ns["rpubl"]["bemyndigande"], URIRef(bem_uri)))
                has_bemyndiganden = True
            del props["rpubl:bemyndigande"]

        tracelog.info("Cleaning rpubl:upphaver")
        if "rpubl:upphaver" in props:
            for upph in re.findall("([A-ZÅÄÖ-]+FS \d{4}:\d+)", util.normalize_space(props["rpubl:upphaver"])):
                (publication, year, ordinal) = re.split("[ :]", upph)
                upphuri = "http://rinfo.lagrummet.se/publ/%s/%s:%s" % (publication.lower(), year, ordinal)
                g.add((URIRef(uri), self.ns["rpubl"]["upphaver"], URIRef(upphuri)))
            del props["rpubl:upphaver"]

        tracelog.info("Deciding rdf:type")
        if "dct:title" in props and "allmänna råd" in props["dct:title"] and not "föreskrifter" in props["dct:title"]:
            props["rdf:type"] = self.ns["rpubl"]["AllmannaRad"]
        else:
            props["rdf:type"] = self.ns["rpubl"]["Myndighetsforeskrift"]

        # 3.5: Check to see that we have all properties that we expect
        # (should maybe be done elsewhere later?)
        tracelog.info("Checking required properties")
        for prop in (
            "dct:identifier",
            "dct:title",
            "rpubl:arsutgava",
            "dct:publisher",
            "rpubl:beslutadAv",
            "rpubl:beslutsdatum",
            "rpubl:forfattningssamling",
            "rpubl:ikrafttradandedatum",
            "rpubl:lopnummer",
            "rpubl:utkomFranTryck",
        ):
            if prop not in props:
                self.log.warning("%s: Failed to find %s" % (basefile, prop))

        tracelog.info("Checking rpubl:bemyndigande")
        if props["rdf:type"] == self.ns["rpubl"]["Myndighetsforeskrift"]:
            if not has_bemyndiganden:
                self.log.warning("%s: Failed to find rpubl:bemyndigande" % (basefile))

        # 4: Add the cleaned data to an RDFLib Graph
        # (maybe we should do that as early as possible?)
        tracelog.info("Adding items to rdflib.Graph")
        for (prop, value) in list(props.items()):
            (prefix, term) = prop.split(":", 1)
            p = self.ns[prefix][term]
            if not (isinstance(value, URIRef) or isinstance(value, Literal)):
                self.log.warning("%s: %s is a %s, not a URIRef or Literal" % (basefile, prop, type(value)))
            g.add((URIRef(uri), p, value))

        # 5: Create data for the body, removing various control characters
        # TODO: Use pdftohtml to create a nice viewable HTML
        # version instead of this plaintext stuff
        reader.seek(0)
        body = []

        # A fairly involved way of filtering out all control
        # characters from a string
        import unicodedata

        if six.PY3:
            all_chars = (chr(i) for i in range(0x10000))
        else:
            all_chars = (unichr(i) for i in range(0x10000))
        control_chars = "".join(c for c in all_chars if unicodedata.category(c) == "Cc")
        # tab and newline are technically Control characters in
        # unicode, but we want to keep them.
        control_chars = control_chars.replace("\t", "").replace("\n", "")

        control_char_re = re.compile("[%s]" % re.escape(control_chars))
        for page in reader.getiterator(reader.readpage):
            text = xml_escape(control_char_re.sub("", page))
            body.append("<pre>%s</pre>\n\n" % text)

        # 6: Done!
        #
        doc.body = body
        doc.lang = "sv"
        doc.uri = uri
        return doc
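Most of parse_from_textreader above reduces to one pattern: a table of candidate regexes per property is tried page by page, and the scan stops as soon as the single required property has been captured. A stripped-down, self-contained sketch of that pattern; the property names and sample text are illustrative only:

import re

FWD_TESTS = {
    "dct:identifier": [r"^([A-ZÅÄÖ-]+FS\s?\d{4}:\d+)$"],
    "rpubl:beslutsdatum": [r"beslutade den (\d+ \w+ \d{4})"],
}

def scan_pages(pages, tests, required="rpubl:beslutsdatum"):
    props = {}
    for page in pages:
        for prop, patterns in tests.items():
            if prop in props:
                continue
            for pattern in patterns:
                m = re.search(pattern, page, re.MULTILINE | re.UNICODE)
                if m:
                    # poor man's util.normalize_space
                    props[prop] = " ".join(m.group(1).split())
                    break
        if required in props:  # single required property: stop early
            break
    return props

pages = ["Innehållsförteckning ...", "XFS 2014:1\nbeslutade den 3 mars 2014"]
print(scan_pages(pages, FWD_TESTS))
# {'dct:identifier': 'XFS 2014:1', 'rpubl:beslutsdatum': '3 mars 2014'}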
Exemplo n.º 35
0
    def parse_from_soup(self, soup, doc):
        for block in soup.findAll(['div', 'p']):
            t = util.normalize_space(''.join(block.findAll(text=True)))
            block.extract()  # to avoid seeing it again
            if t:
                doc.body.append(Paragraph([t]))
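For reference, a hedged, self-contained illustration of what parse_from_soup does: pull the visible text out of every <div> and <p>, collapse the whitespace, and keep the non-empty blocks as paragraphs. Plain lists stand in for ferenda's Paragraph element:

from bs4 import BeautifulSoup

html = "<div>  Första   stycket </div><p></p><p>Andra\nstycket</p>"
soup = BeautifulSoup(html, "html.parser")

body = []
for block in soup.find_all(["div", "p"]):
    text = " ".join(block.get_text().split())  # normalize whitespace
    block.extract()                            # to avoid seeing it again
    if text:
        body.append([text])                    # Paragraph([t]) in the original

print(body)  # [['Första stycket'], ['Andra stycket']]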
Exemplo n.º 36
0
    def extract_metadata_header(self, reader, basefile):
        re_sfs = re.compile(r'(\d{4}:\d+)\s*$').search
        d = {}
        identifier = None  # set when the "SFS nr" line is seen, may be amended later
        for line in reader:
            if ":" in line:
                (key,
                 val) = [util.normalize_space(x) for x in line.split(":", 1)]
            # Simple string literals
            if key == 'Rubrik':
                d["dcterms:title"] = val
            elif key == 'Övrigt':
                d["rdfs:comment"] = val
            elif key == 'SFS nr':
                identifier = "SFS " + val
                # delay actual writing to graph, since we may need to
                # amend this

            # date literals
            elif key == 'Utfärdad':
                d["rpubl:utfardandedatum"] = val[:10]
            elif key == 'Tidsbegränsad':
                # FIXME: Should be done by lagen.nu.SFS
                d["rinfoex:tidsbegransad"] = val[:10]
            elif key == 'Upphävd':
                dat = datetime.strptime(val[:10], '%Y-%m-%d')
                d["rpubl:upphavandedatum"] = val[:10]
                if not self.config.keepexpired and dat < datetime.today():
                    raise UpphavdForfattning(
                        "%s is an expired SFS" % basefile,
                        dummyfile=self.store.parsed_path(basefile))

            # urirefs
            elif key == 'Departement/ myndighet':
                # this is only needed because of SFS 1942:724, which
                # has "Försvarsdepartementet, Socialdepartementet"...
                if "departementet, " in val:
                    val = val.split(", ")[0]
                d["dcterms:creator"] = val
            elif (key == 'Ändring införd' and re_sfs(val)):
                uppdaterad = re_sfs(val).group(1)
                # not sure we need to add this, since parse_metadata
                # catches the same
                d["rpubl:konsolideringsunderlag"] = [
                    URIRef(self.canonical_uri(uppdaterad))
                ]
                if identifier and identifier != "SFS " + uppdaterad:
                    identifier += " i lydelse enligt SFS " + uppdaterad
                d["dcterms:issued"] = uppdaterad

            elif (key == 'Omtryck' and re_sfs(val)):
                d["rinfoex:omtryck"] = self.canonical_uri(re_sfs(val).group(1))
            elif (key == 'Författningen har upphävts genom' and re_sfs(val)):
                s = re_sfs(val).group(1)
                d["rinfoex:upphavdAv"] = self.canonical_uri(s)
            else:
                self.log.warning('%s: Obekant nyckel [\'%s\']' %
                                 (basefile, key))

        d["dcterms:identifier"] = identifier

        # FIXME: This is a misuse of the dcterms:issued prop in order
        # to mint the correct URI. We need to remove this somehow afterwards.
        if "dcterms:issued" not in d:
            d["dcterms:issued"] = basefile

        if "dcterms:title" not in d:
            self.log.warning("%s: Rubrik saknas" % basefile)
        return d
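The "delay actual writing to graph" comment above refers to the identifier being amended if an "Ändring införd" line turns up later in the header. A minimal sketch of that delayed-identifier handling, with illustrative input lines:

import re

lines = [
    "SFS nr: 1998:204",
    "Rubrik: Personuppgiftslag (1998:204)",
    "Ändring införd: t.o.m. SFS 2010:1969",
]
re_sfs = re.compile(r"(\d{4}:\d+)\s*$").search

d = {}
identifier = None
for line in lines:
    key, val = [" ".join(x.split()) for x in line.split(":", 1)]
    if key == "SFS nr":
        identifier = "SFS " + val  # written to d only at the end
    elif key == "Rubrik":
        d["dcterms:title"] = val
    elif key == "Ändring införd" and re_sfs(val):
        uppdaterad = re_sfs(val).group(1)
        if identifier and identifier != "SFS " + uppdaterad:
            identifier += " i lydelse enligt SFS " + uppdaterad
d["dcterms:identifier"] = identifier
print(d["dcterms:identifier"])  # SFS 1998:204 i lydelse enligt SFS 2010:1969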
Exemplo n.º 37
0
    def parse(self, doc):
        # some very simple heuristic rules for determining
        # what an individual paragraph is

        def is_heading(p):
            # If it's on a single line and it isn't indented with spaces
            # it's probably a heading.
            if p.count("\n") == 0 and not p.startswith(" "):
                return True

        def is_pagebreak(p):
            # if it contains a form feed character, it represents a page break
            return "\f" in p

        # Parsing a document consists mainly of two parts:
        # 1: First we parse the body of text and store it in doc.body
        from ferenda.elements import Body, Preformatted, Title, Heading
        from ferenda import Describer
        reader = TextReader(self.store.downloaded_path(doc.basefile))

        # First paragraph of an RFC is always a header block
        header = reader.readparagraph()
        # Preformatted is a ferenda.elements class representing a
        # block of preformatted text. It is derived from the built-in
        # list type, and must thus be initialized with an iterable, in
        # this case a single-element list of strings. (Note: if you
        # try to initialize it with a string, because strings are
        # iterables as well, you'll end up with a list where each
        # character in the string is an element, which is not what you
        # want).
        preheader = Preformatted([header])
        # Doc.body is a ferenda.elements.Body class, which is also
        # derived from list, so it has (amongst others) the append
        # method. We build our document by adding to this root
        # element.
        doc.body.append(preheader)

        # Second paragraph is always the title, and we don't include
        # this in the body of the document, since we'll add it to the
        # metadata -- once is enough
        title = reader.readparagraph()

        # After that, just iterate over the document and guess what
        # everything is. TextReader.getiterator is useful for
        # iterating through a text in chunks other than single lines
        for para in reader.getiterator(reader.readparagraph):
            if is_heading(para):
                # Heading is yet another of these ferenda.elements
                # classes.
                doc.body.append(Heading([para]))
            elif is_pagebreak(para):
                # Just drop these remnants of a page-and-paper-based past
                pass
            else:
                # If we don't know that it's something else, it's a
                # preformatted section (the safest bet for RFC text).
                doc.body.append(Preformatted([para]))

        # 2: Then we create metadata for the document and store it in
        # doc.meta (in this case using the convenience
        # ferenda.Describer class).

        desc = Describer(doc.meta, doc.uri)

        # Set the rdf:type of the document
        desc.rdftype(self.rdf_type)

        # Set the title we've captured as the dcterms:title of the document and
        # specify that it is in English
        desc.value(self.ns['dcterms'].title,
                   util.normalize_space(title),
                   lang="en")

        # Construct the dcterms:identifier (eg "RFC 6991") for this document from the basefile
        desc.value(self.ns['dcterms'].identifier, "RFC " + doc.basefile)

        # find and convert the publication date in the header to a datetime
        # object, and set it as the dcterms:issued date for the document
        re_date = re.compile(
            "(January|February|March|April|May|June|July|August|September|October|November|December) (\d{4})"
        ).search
        # This is a context manager that temporarily sets the system
        # locale to the "C" locale in order to be able to use strptime
        # with a string on the form "August 2013", even though the
        # system may use another locale.
        dt_match = re_date(header)
        if dt_match:
            with util.c_locale():
                dt = datetime.strptime(dt_match.group(0), "%B %Y")
            pubdate = date(dt.year, dt.month, dt.day)
            # Note that using some python types (cf. datetime.date)
            # results in a datatyped RDF literal, ie in this case
            #   <http://localhost:8000/res/rfc/6994> dcterms:issued "2013-08-01"^^xsd:date
            desc.value(self.ns['dcterms'].issued, pubdate)

        # find any older RFCs that this document updates or obsoletes
        obsoletes = re.search("^Obsoletes: ([\d+, ]+)", header, re.MULTILINE)
        updates = re.search("^Updates: ([\d+, ]+)", header, re.MULTILINE)

        # Find the category of this RFC, store it as dcterms:subject
        cat_match = re.search("^Category: ([\w ]+?)(  |$)", header,
                              re.MULTILINE)
        if cat_match:
            desc.value(self.ns['dcterms'].subject, cat_match.group(1))

        for predicate, matches in ((self.ns['rfc'].updates, updates),
                                   (self.ns['rfc'].obsoletes, obsoletes)):
            if matches is None:
                continue
            # add references between this document and these older rfcs,
            # using either rfc:updates or rfc:obsoletes
            for match in matches.group(1).strip().split(", "):
                uri = self.canonical_uri(match)
                # Note that this uses our own unofficial
                # namespace/vocabulary
                # http://example.org/ontology/rfc/
                desc.rel(predicate, uri)

        # And now we're done. We don't need to return anything as
        # we've modified the Document object that was passed to
        # us. The calling code will serialize this modified object to
        # XHTML and RDF and store it on disk

# end parse1
# Now do it again
        reader.seek(0)
        reader.readparagraph()
        reader.readparagraph()
        doc.body = Body()
        doc.body.append(preheader)
        # doc.body.append(Title([util.normalize_space(title)]))
        # begin parse2
        from ferenda.elements import Section, Subsection, Subsubsection

        # More heuristic rules: Section headers start at the beginning
        # of a line and are numbered. Subsections and subsubsections
        # have dotted numbers, optionally with a trailing period, ie
        # '9.2.' or '11.3.1'
        def is_section(p):
            return re.match(r"\d+\.? +[A-Z]", p)

        def is_subsection(p):
            return re.match(r"\d+\.\d+\.? +[A-Z]", p)

        def is_subsubsection(p):
            return re.match(r"\d+\.\d+\.\d+\.? +[A-Z]", p)

        def split_sectionheader(p):
            # returns a tuple of title, ordinal, identifier
            ordinal, title = p.split(" ", 1)
            ordinal = ordinal.strip(".")
            return title.strip(), ordinal, "RFC %s, section %s" % (
                doc.basefile, ordinal)

        # Use a list as a simple stack to keep track of the nesting
        # depth of a document. Every time we create a Section,
        # Subsection or Subsubsection object, we push it onto the
        # stack (and clear the stack down to the appropriate nesting
        # depth). Every time we create some other object, we append it
        # to whatever object is at the top of the stack. As your rules
        # for representing the nesting of structure become more
        # complicated, you might want to use the
        # :class:`~ferenda.FSMParser` class, which lets you define
        # heuristic rules (recognizers), states and transitions, and
        # takes care of putting your structure together.
        stack = [doc.body]

        for para in reader.getiterator(reader.readparagraph):
            if is_section(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Section(title=title,
                            ordinal=ordinal,
                            identifier=identifier)
                stack[1:] = []  # clear all but bottom element
                stack[0].append(s)  # add new section to body
                stack.append(s)  # push new section on top of stack
            elif is_subsection(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Subsection(title=title,
                               ordinal=ordinal,
                               identifier=identifier)
                stack[2:] = []  # clear all but bottom two elements
                stack[1].append(s)  # add new subsection to current section
                stack.append(s)
            elif is_subsubsection(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Subsubsection(title=title,
                                  ordinal=ordinal,
                                  identifier=identifier)
                stack[3:] = []  # clear all but bottom three
                stack[-1].append(s)  # add new subsubsection to current subsection
                stack.append(s)
            elif is_heading(para):
                stack[-1].append(Heading([para]))
            elif is_pagebreak(para):
                pass
            else:
                pre = Preformatted([para])
                stack[-1].append(pre)
# end parse2

# begin citation1
        from pyparsing import Word, CaselessLiteral, nums
        section_citation = (
            CaselessLiteral("section") +
            Word(nums + ".").setResultsName("Sec")).setResultsName("SecRef")
        rfc_citation = ("[RFC" + Word(nums).setResultsName("RFC") +
                        "]").setResultsName("RFCRef")
        section_rfc_citation = (section_citation + "of" +
                                rfc_citation).setResultsName("SecRFCRef")

        # end citation1

        # begin citation2
        def rfc_uriformatter(parts):
            uri = ""
            if 'RFC' in parts:
                uri += self.canonical_uri(parts['RFC'].lstrip("0"))
            if 'Sec' in parts:
                uri += "#S" + parts['Sec']
            return uri
# end citation2

# begin citation3

        from ferenda import CitationParser, URIFormatter
        citparser = CitationParser(section_rfc_citation, section_citation,
                                   rfc_citation)
        citparser.set_formatter(
            URIFormatter(("SecRFCRef", rfc_uriformatter),
                         ("SecRef", rfc_uriformatter),
                         ("RFCRef", rfc_uriformatter)))
        citparser.parse_recursive(doc.body)
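The pyparsing grammar defined at the end of this example can be exercised on its own, outside ferenda's CitationParser/URIFormatter machinery. A small self-contained check; the base URI is made up for illustration:

from pyparsing import CaselessLiteral, Word, nums

section_citation = (CaselessLiteral("section") +
                    Word(nums + ".").setResultsName("Sec")).setResultsName("SecRef")
rfc_citation = ("[RFC" + Word(nums).setResultsName("RFC") + "]").setResultsName("RFCRef")
section_rfc_citation = (section_citation + "of" +
                        rfc_citation).setResultsName("SecRFCRef")

def rfc_uri(parts, base="http://example.org/res/rfc/"):
    # same logic as rfc_uriformatter above, with a hard-coded base URI
    uri = ""
    if "RFC" in parts:
        uri += base + parts["RFC"].lstrip("0")
    if "Sec" in parts:
        uri += "#S" + parts["Sec"]
    return uri

text = "as specified in Section 4.2 of [RFC 2119], implementations MUST ..."
for tokens, start, end in section_rfc_citation.scanString(text):
    print(text[start:end], "->", rfc_uri(tokens))
# Section 4.2 of [RFC 2119] -> http://example.org/res/rfc/2119#S4.2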
Exemplo n.º 39
0
    def extract_metadata_register(self, soup, basefile):
        d = {}
        rubrik = util.normalize_space(soup.body('table')[2].text)
        changes = soup.body('table')[3:-2]
        g = self.make_graph()  # used for qname lookup only
        for table in changes:
            sfsnr = table.find(text="SFS-nummer:").find_parent(
                "td").find_next_sibling("td").text.strip()
            docuri = self.canonical_uri(sfsnr)
            rowdict = {}
            parts = sfsnr.split(":")
            d[docuri] = {
                "dcterms:publisher": "Regeringskansliet",
                "rpubl:arsutgava": parts[0],
                "rpubl:beslutadAv": "Regeringskansliet",
                "rpubl:forfattningssamling": "SFS",
                "rpubl:lopnummer": parts[1]
            }
            for row in table('tr'):
                key = row.td.text.strip()
                if key.endswith(":"):
                    key = key[:-1]  # trim ending ":"
                elif key == '':
                    continue
                # FIXME: the \xa0 (&nbsp;) to space conversion should
                # maybe be part of normalize_space?
                val = util.normalize_space(row('td')[1].text)
                if val == "":
                    continue
                rowdict[key] = val
            # first change does not contain a "Rubrik" key. Fake it.
            if 'Rubrik' not in rowdict and rubrik:
                rowdict['Rubrik'] = rubrik
                rubrik = None
            for key, val in rowdict.items():
                if key == 'SFS-nummer':
                    (arsutgava, lopnummer) = val.split(":")
                    d[docuri]["dcterms:identifier"] = "SFS " + val
                    d[docuri]["rpubl:arsutgava"] = arsutgava
                    d[docuri]["rpubl:lopnummer"] = lopnummer

                elif key == 'Ansvarig myndighet':
                    d[docuri]["rpubl:departement"] = val
                    # FIXME: Sanitize this in
                    # sanitize_metadata->sanitize_department, lookup
                    # resource in polish_metadata
                elif key == 'Rubrik':
                    # Change acts to Balkar never contain the SFS no
                    # of the Balk.
                    if basefile not in val and not val.endswith("balken"):
                        self.log.warning("%s: Base SFS %s not in title %r" %
                                         (basefile, basefile, val))
                    d[docuri]["dcterms:title"] = val
                    d[docuri]["rdf:type"] = self._forfattningstyp(val)
                elif key == 'Observera':
                    if not self.config.keepexpired:
                        if 'Författningen är upphävd/skall upphävas: ' in val:
                            dateval = datetime.strptime(val[41:51], '%Y-%m-%d')
                            if dateval < datetime.today():
                                raise UpphavdForfattning(
                                    "%s is an expired SFS" % basefile,
                                    dummyfile=self.store.parsed_path(basefile))
                    d[docuri]["rdfs:comment"] = val
                elif key == 'Ikraft':
                    d[docuri]["rpubl:ikrafttradandedatum"] = val[:10]
                elif key == 'Omfattning':
                    # First, create rdf statements for every
                    # single modified section we can find
                    for changecat in val.split('; '):
                        if (changecat.startswith('ändr.')
                                or changecat.startswith('ändr ')
                                or changecat.startswith('ändring ')):
                            pred = self.ns['rpubl'].ersatter
                        elif (changecat.startswith('upph.')
                              or changecat.startswith('upp.')
                              or changecat.startswith('utgår')):
                            pred = self.ns['rpubl'].upphaver
                        elif (changecat.startswith('ny')
                              or changecat.startswith('ikrafttr.')
                              or changecat.startswith('ikrafftr.')
                              or changecat.startswith('ikraftr.')
                              or changecat.startswith('ikraftträd.')
                              or changecat.startswith('tillägg')):
                            pred = self.ns['rpubl'].inforsI
                        elif (changecat.startswith('nuvarande')
                              or changecat.startswith('rubr. närmast')
                              or changecat
                              in ('begr. giltighet', 'Omtryck', 'omtryck',
                                  'forts.giltighet', 'forts. giltighet',
                                  'forts. giltighet av vissa best.')):
                            # some of these changecats are renames, eg
                            # "nuvarande 2, 3, 4, 5 §§ betecknas 10,
                            # 11, 12, 13, 14, 15 §§;" or
                            # "rubr. närmast efter 1 § sätts närmast
                            # före 10 §"
                            pred = None
                        else:
                            self.log.warning("%s: Okänd omfattningstyp %r" %
                                             (basefile, changecat))
                            pred = None
                        old_currenturl = self.lagrum_parser._currenturl
                        self.lagrum_parser._currenturl = docuri
                        for node in self.lagrum_parser.parse_string(
                                changecat, pred):
                            if hasattr(node, 'predicate'):
                                qname = g.qname(node.predicate)
                                d[docuri][qname] = node.uri
                        self.lagrum_parser._currenturl = old_currenturl
                    # Secondly, preserve the entire text
                    d[docuri]["rpubl:andrar"] = val
                elif key == 'Förarbeten':
                    for node in self.forarbete_parser.parse_string(
                            val, "rpubl:forarbete"):
                        if hasattr(node, 'uri'):
                            if "rpubl:forarbete" not in d[docuri]:
                                d[docuri]["rpubl:forarbete"] = []
                            d[docuri]["rpubl:forarbete"].append(node.uri)
                            d[node.uri] = {"dcterms:identifier": str(node)}
                elif key == 'CELEX-nr':
                    for celex in re.findall('3\d{2,4}[LR]\d{4}', val):
                        b = BNode()
                        cg = Graph()
                        cg.add((b, RPUBL.celexNummer, Literal(celex)))
                        celexuri = self.minter.space.coin_uri(cg.resource(b))
                        if "rpubl:genomforDirektiv" not in d[docuri]:
                            d[docuri]["rpubl:genomforDirektiv"] = []
                        d[docuri]["rpubl:genomforDirektiv"].append(celexuri)
                        d[celexuri] = {"rpubl:celexNummer": celex}
                elif key == 'Tidsbegränsad':
                    d["rinfoex:tidsbegransad"] = val[:10]
                    expdate = datetime.strptime(val[:10], '%Y-%m-%d')
                    if expdate < datetime.today():
                        if not self.config.keepexpired:
                            raise UpphavdForfattning(
                                "%s is expired (time-limited) SFS" % basefile,
                                dummyfile=self.store.parsed_path(basefile))
                else:
                    self.log.warning('%s: Obekant nyckel [\'%s\']' %
                                     (basefile, key))
            utfardandedatum = self._find_utfardandedatum(sfsnr)
            if utfardandedatum:
                d[docuri]["rpubl:utfardandedatum"] = utfardandedatum
        return d
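The "Omfattning" branch above classifies each change category by prefix into one of three predicates (replace, repeal, insert) or none. A condensed sketch of that classification, with plain strings in place of the rpubl URIs:

def classify(changecat):
    if changecat.startswith(("ändr.", "ändr ", "ändring ")):
        return "rpubl:ersatter"
    if changecat.startswith(("upph.", "upp.", "utgår")):
        return "rpubl:upphaver"
    if changecat.startswith(("ny", "ikrafttr.", "ikrafftr.", "ikraftr.",
                             "ikraftträd.", "tillägg")):
        return "rpubl:inforsI"
    return None  # renames, prolonged validity etc. carry no predicate

val = "upph. 12 §; ändr. 3, 5 §§; ny 7 a §"
for changecat in val.split("; "):
    print(changecat, "->", classify(changecat))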
Exemplo n.º 41
0
    def find_definitions(self, element, find_definitions):
        if not isinstance(element, CompoundElement):
            return None
        find_definitions_recursive = find_definitions
        # Find concept definitions
        if isinstance(element, Paragraf):
            # check whether the first paragraph contains text that
            # suggests definitions will follow
            # self.log.debug("Testing %r against some regexes" % element[0][0])
            if self.re_definitions(element[0][0]):
                find_definitions = "normal"
            if (self.re_brottsdef(element[0][0])
                    or self.re_brottsdef_alt(element[0][0])):
                find_definitions = "brottsrubricering"
            if self.re_parantesdef(element[0][0]):
                find_definitions = "parantes"
            if self.re_loptextdef(element[0][0]):
                find_definitions = "loptext"

            for p in element:
                if isinstance(p, Stycke):
                    # do an extra check in case "I denna paragraf
                    # avses med" occurs in the 2nd or later
                    # paragraph of a section
                    if self.re_definitions(p[0]):
                        find_definitions = "normal"
            find_definitions_recursive = find_definitions

        # Find references to legal provisions + definitions
        if isinstance(element, (Stycke, Listelement, Tabellrad)):
            nodes = []
            term = None

            # self.log.debug("handling text %s, find_definitions %s" % (element[0],find_definitions))
            if find_definitions:
                # For Tabellrad, this is a Tabellcell, not a string,
                # but we fix that later
                elementtext = element[0]
                termdelimiter = ":"

                if isinstance(element, Tabellrad):
                    # only the first cell can be a definition, and
                    # only if it's not the text "Beteckning". So for
                    # the remainder of this function, we switch context
                    # from the element itself to its first cell.
                    element = elementtext
                    elementtext = element[0]
                    if elementtext != "Beteckning":
                        term = elementtext
                        self.log.debug('"%s" är nog en definition (1)' % term)
                elif isinstance(element, Stycke):

                    # Case 1: "antisladdsystem: ett tekniskt stödsystem"
                    # Sometimes, : is not the delimiter between
                    # the term and the definition, but even in
                    # those cases, : might figure in the
                    # definition itself, usually as part of the
                    # SFS number. Do some hairy heuristics to find
                    # out what delimiter to use
                    if find_definitions == "normal":
                        if not self.re_definitions(elementtext):
                            if " - " in elementtext:
                                if (":" in elementtext
                                        and (elementtext.index(":") <
                                             elementtext.index(" - "))):
                                    termdelimiter = ":"
                                else:
                                    termdelimiter = " - "
                            m = self.re_SearchSfsId(elementtext)

                            if (termdelimiter == ":" and m and
                                    m.start() < elementtext.index(":")):
                                termdelimiter = " "

                            if termdelimiter in elementtext:
                                term = elementtext.split(termdelimiter)[0]
                                self.log.debug(
                                    '"%s" är nog en definition (2.1)' % term)

                    # case 2: "Den som berövar annan livet, döms
                    # för mord till fängelse"
                    m = self.re_brottsdef(elementtext)
                    if m:
                        term = m.group(2)
                        self.log.debug('"%s" är nog en definition (2.2)' %
                                       term)

                    # case 3: "För miljöbrott döms till böter"
                    m = self.re_brottsdef_alt(elementtext)
                    if m:
                        term = m.group(1)
                        self.log.debug('"%s" är nog en definition (2.3)' %
                                       term)

                    # case 4: "Inteckning får på ansökan av
                    # fastighetsägaren dödas (dödning)."
                    m = self.re_parantesdef(elementtext)
                    if m:
                        term = m.group(1)
                        # print("%s: %s" %  (basefile, elementtext))
                        self.log.debug('"%s" är nog en definition (2.4)' %
                                       term)

                    # case 5: "Med detaljhandel avses i denna lag
                    # försäljning av läkemedel"
                    m = self.re_loptextdef(elementtext)
                    if m:
                        term = m.group(1)
                        self.log.debug('"%s" är nog en definition (2.5)' %
                                       term)

                elif isinstance(element, Listelement):
                    for rx in (self.re_Bullet, self.re_DottedNumber,
                               self.re_Bokstavslista):
                        elementtext = rx.sub('', elementtext)
                    term = elementtext.split(termdelimiter)[0]
                    self.log.debug('"%s" är nog en definition (3)' % term)

                # Longest legitimate term found "Valutaväxling,
                # betalningsöverföring och annan finansiell
                # verksamhet"
                if term and len(term) < 68:
                    term = util.normalize_space(term)
                    termnode = LinkSubject(term,
                                           uri=self._term_to_subject(term),
                                           predicate="dcterms:subject")
                    find_definitions_recursive = False
                else:
                    term = None

            if term:
                idx = None
                for p in element:
                    if isinstance(p, str) and term in p:
                        (head, tail) = p.split(term, 1)
                        nodes = (head, termnode, tail)
                        idx = element.index(p)
                if idx is not None:
                    element[idx:idx + 1] = nodes

        return find_definitions_recursive
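Case 2.1 above picks the term delimiter with a small heuristic: prefer ":" unless " - " comes first, and fall back to a plain space when the only ":" sits inside an SFS number rather than after the defined term. A condensed standalone sketch; the SFS-number regex is a simplified stand-in for re_SearchSfsId:

import re

re_sfsid = re.compile(r"\(\d{4}:\d+\)").search  # simplified stand-in

def guess_term(text):
    delim = ":"
    if " - " in text and not (":" in text and text.index(":") < text.index(" - ")):
        delim = " - "
    m = re_sfsid(text)
    if delim == ":" and m and m.start() < text.index(":"):
        delim = " "
    return text.split(delim)[0] if delim in text else None

print(guess_term("antisladdsystem: ett tekniskt stödsystem"))  # antisladdsystem
print(guess_term("bil - ett fordon som ..."))                  # bil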
Exemplo n.º 42
0
    def test_ocr(self):
        try:
            if not os.environ.get("FERENDA_TEST_TESSERACT"):
                raise errors.ExternalCommandError
            reader = PDFReader(filename="test/files/pdfreader/scanned.pdf",
                               workdir=self.datadir,
                               ocr_lang="swe")
        except errors.ExternalCommandError:
            self._copy_sample()
            reader = PDFReader(filename="test/files/pdfreader/scanned.pdf",
                               workdir=self.datadir,
                               ocr_lang="swe")

        # assert that a hOCR file has been created
        self.assertTrue(os.path.exists(self.datadir + os.sep + "scanned.hocr.html"))

        # assert that we have two pages
        self.assertEqual(2, len(reader))

        # assert that first element in the first textbox in the first
        # page corresponds to the first bbox, scaled by the
        # pixel/point scaling factor.
        self.assertEqual("Regeringens ", str(reader[0][0][0]))
        self.assertEqual(47, reader[0][0][0].top)
        self.assertEqual(38, reader[0][0][0].left)
        self.assertEqual(21, reader[0][0][0].height)
        self.assertEqual(118, reader[0][0][0].width)

        # assert that the fifth (previously third) textbox (which has mostly
        # normal text) is rendered correctly (note that we have a
        # couple of OCR errors).
        # self.assertEqual("Regeringen föreslår riksdagen att anta de förslag som har tagits. upp i bifogade utdrag ur regeringsprotokollet den 31 oktober l99l.", util.normalize_space(str(reader[0][3])))
        self.assertEqual("Regeringen föreslår riksdagen att anta de förslag som har tagits. upp i", util.normalize_space(str(reader[0][5])))