def downloaded_to_intermediate(self, basefile):
    """Convert the downloaded HTML for *basefile* into a plain-text
    intermediate file and return an open reader for it.

    The text between ``<pre>`` and ``</pre>`` in the downloaded file is
    extracted, HTML entities are unescaped, line endings are normalized
    to CRLF, short inline tags are stripped, and the result is written
    to ``self.store.intermediate_path(basefile)``.

    :param basefile: SFS number of the document, e.g. ``"1998:204"``.
    :returns: the file object produced by ``codecs.open`` on the
              intermediate path (the caller is responsible for closing it).
    :raises IckeSFS: if *basefile* starts with ``'N'``.
    :raises UpphavdForfattning: if ``self.config.keepexpired`` is false
              and the document states an expiry date earlier than today.
    """
    # Check to see if this might not be a proper SFS at all
    # (from time to time, other agencies publish their stuff
    # in SFS - this seems to be handled by giving those
    # documents a SFS nummer on the form "N1992:31". Filter
    # these out.
    if basefile.startswith('N'):
        raise IckeSFS("%s is not a regular SFS" % basefile)
    filename = self.store.downloaded_path(basefile)
    try:
        t = TextReader(filename, encoding=self.source_encoding)
    except IOError:
        self.log.warning("%s: Fulltext is missing" % basefile)
        # FIXME: This code needs to be rewritten
        # NOTE(review): `registry`, `desc` and `doc` are not defined
        # anywhere in this method, and `t` stays unbound after this
        # branch, so the code below cannot work when the fulltext is
        # missing. Left unchanged because the intended fallback is not
        # evident from here -- confirm and rewrite.
        baseuri = self.canonical_uri(basefile)
        if baseuri in registry:
            title = registry[baseuri].value(URIRef(baseuri),
                                            self.ns['dcterms'].title)
            desc.value(self.ns['dcterms'].title, title)
        desc.rel(self.ns['dcterms'].publisher,
                 self.lookup_resource("Regeringskansliet"))
        desc.value(self.ns['dcterms'].identifier, "SFS " + basefile)
        doc.body = Forfattning([Stycke(['Lagtext saknas'],
                                       id='S1')])
    # Check to see if the Författning has been revoked (using
    # plain fast string searching, no fancy HTML parsing and
    # traversing)
    if not self.config.keepexpired:
        try:
            t.cuepast('<i>Författningen är upphävd/skall upphävas: ')
            datestr = t.readto('</i></b>')
            if datetime.strptime(datestr, '%Y-%m-%d') < datetime.today():
                self.log.debug('%s: Expired' % basefile)
                raise UpphavdForfattning(
                    "%s is an expired SFS" % basefile,
                    dummyfile=self.store.parsed_path(basefile))
            t.seek(0)
        except IOError:
            # presumably raised by the reader when the marker text is
            # not found -- either way, rewind before the real extraction
            t.seek(0)
    t.cuepast('<pre>')
    # remove HTML entities (&auml; et al)
    try:
        # this is the preferred way from py34 onwards. FIXME: Move
        # this to ferenda.compat
        import html
        txt = html.unescape(t.readto('</pre>'))
    except ImportError:
        # this is the old way.
        hp = HTMLParser()
        txt = hp.unescape(t.readto('</pre>'))
    if '\r\n' not in txt:
        txt = txt.replace('\n', '\r\n')
    # Strip opening/closing tags whose name is 1-3 word characters.
    # Raw string: "\w" in a plain literal is an invalid escape sequence.
    re_tags = re.compile(r"</?\w{1,3}>")
    txt = re_tags.sub('', txt)
    # add ending CRLF aids with producing better diffs
    txt += "\r\n"
    intermediate = self.store.intermediate_path(basefile)
    util.writefile(intermediate, txt, encoding=self.source_encoding)
    return codecs.open(intermediate, encoding=self.source_encoding)
def extract_metadata_header(self, reader, basefile):
    """Parse the ``Key: value`` header lines of an SFS document.

    Each line yielded by *reader* that contains a colon is split on the
    first colon into a key and a value (both whitespace-normalized) and
    mapped onto a metadata property in the returned dict.

    :param reader: iterable yielding the header lines.
    :param basefile: SFS number of the document; used in log messages
                     and as the fallback value for ``dcterms:issued``.
    :returns: dict mapping qname strings (e.g. ``"dcterms:title"``) to
              the extracted values.
    :raises UpphavdForfattning: if an ``Upphävd`` date earlier than
              today is found and ``self.config.keepexpired`` is false.
    """
    re_sfs = re.compile(r'(\d{4}:\d+)\s*$').search
    d = {}
    # Only bound when a 'SFS nr' line is seen; start at None so later
    # reads cannot hit an unbound local.
    identifier = None
    for line in reader:
        if ":" not in line:
            # Not a "key: value" line -- nothing to extract.
            continue
        key, val = [util.normalize_space(x) for x in line.split(":", 1)]
        sfs_match = re_sfs(val)

        # Simple string literals
        if key == 'Rubrik':
            d["dcterms:title"] = val
        elif key == 'Övrigt':
            d["rdfs:comment"] = val
        elif key == 'SFS nr':
            # delay actual writing to graph, since we may need to
            # amend this
            identifier = "SFS " + val

        # date literals
        elif key == 'Utfärdad':
            d["rpubl:utfardandedatum"] = val[:10]
        elif key == 'Tidsbegränsad':
            # FIXME: Should be done by lagen.nu.SFS
            d["rinfoex:tidsbegransad"] = val[:10]
        elif key == 'Upphävd':
            dat = datetime.strptime(val[:10], '%Y-%m-%d')
            d["rpubl:upphavandedatum"] = val[:10]
            if not self.config.keepexpired and dat < datetime.today():
                raise UpphavdForfattning(
                    "%s is an expired SFS" % basefile,
                    dummyfile=self.store.parsed_path(basefile))

        # urirefs
        elif key == 'Departement/ myndighet':
            # this is only needed because of SFS 1942:724, which
            # has "Försvarsdepartementet, Socialdepartementet"...
            if "departementet, " in val:
                val = val.split(", ")[0]
            d["dcterms:creator"] = val
        elif key == 'Ändring införd' and sfs_match:
            uppdaterad = sfs_match.group(1)
            # not sure we need to add this, since parse_metadata
            # catches the same
            d["rpubl:konsolideringsunderlag"] = [
                URIRef(self.canonical_uri(uppdaterad))]
            if identifier and identifier != "SFS " + uppdaterad:
                identifier += " i lydelse enligt SFS " + uppdaterad
            d["dcterms:issued"] = uppdaterad
        elif key == 'Omtryck' and sfs_match:
            d["rinfoex:omtryck"] = self.canonical_uri(sfs_match.group(1))
        elif key == 'Författningen har upphävts genom' and sfs_match:
            d["rinfoex:upphavdAv"] = self.canonical_uri(sfs_match.group(1))
        else:
            self.log.warning(
                "%s: Obekant nyckel ['%s']" % (basefile, key))

    if identifier is not None:
        d["dcterms:identifier"] = identifier
    else:
        # No 'SFS nr' line was present; report it the same way a
        # missing title is reported instead of failing on an unbound name.
        self.log.warning("%s: SFS nr is missing from header" % basefile)

    # FIXME: This is a misuse of the dcterms:issued prop in order
    # to mint the correct URI. We need to remove this somehow afterwards.
    if "dcterms:issued" not in d:
        d["dcterms:issued"] = basefile

    if "dcterms:title" not in d:
        self.log.warning("%s: Rubrik saknas" % basefile)
    return d
def extract_metadata_register(self, soup, basefile):
    """Extract metadata for every change act listed in the SFS register
    page for *basefile*.

    The third ``<table>`` in the page body is read as the title of the
    base act; tables from the fourth up to (but excluding) the last two
    are each treated as one change act whose rows are ``key | value``
    pairs.

    :param soup: parsed register page; must support ``soup.body('table')``
                 and the ``find``/``find_parent``/``find_next_sibling``
                 navigation used below (presumably a BeautifulSoup tree).
    :param basefile: SFS number of the base act.
    :returns: dict keyed by URI; each value is a dict mapping qname
              strings to the values extracted for that resource. Extra
              entries are added for referenced preparatory works and
              CELEX documents.
    :raises UpphavdForfattning: if ``self.config.keepexpired`` is false
              and a row states an expiry / time-limit date before today.
    """
    expired_marker = 'Författningen är upphävd/skall upphävas: '
    d = {}
    rubrik = util.normalize_space(soup.body('table')[2].text)
    changes = soup.body('table')[3:-2]
    g = self.make_graph()  # used for qname lookup only
    for table in changes:
        sfsnr = table.find(text="SFS-nummer:").find_parent(
            "td").find_next_sibling("td").text.strip()
        docuri = self.canonical_uri(sfsnr)
        rowdict = {}
        parts = sfsnr.split(":")
        d[docuri] = {
            "dcterms:publisher": "Regeringskansliet",
            "rpubl:arsutgava": parts[0],
            "rpubl:beslutadAv": "Regeringskansliet",
            "rpubl:forfattningssamling": "SFS",
            "rpubl:lopnummer": parts[1],
        }
        for row in table('tr'):
            key = row.td.text.strip()
            if key.endswith(":"):
                key = key[:-1]  # trim ending ":"
            elif key == '':
                continue
            # FIXME: the \xa0 (&nbsp;) to space conversion should
            # maybe be part of normalize_space?
            val = util.normalize_space(row('td')[1].text)
            if val == "":
                continue
            rowdict[key] = val
        # first change does not contain a "Rubrik" key. Fake it.
        if 'Rubrik' not in rowdict and rubrik:
            rowdict['Rubrik'] = rubrik
            rubrik = None
        for key, val in rowdict.items():
            if key == 'SFS-nummer':
                (arsutgava, lopnummer) = val.split(":")
                d[docuri]["dcterms:identifier"] = "SFS " + val
                d[docuri]["rpubl:arsutgava"] = arsutgava
                d[docuri]["rpubl:lopnummer"] = lopnummer
            elif key == 'Ansvarig myndighet':
                d[docuri]["rpubl:departement"] = val
                # FIXME: Sanitize this in
                # sanitize_metadata->sanitize_department, lookup
                # resource in polish_metadata
            elif key == 'Rubrik':
                # Change acts to Balkar never contain the SFS no
                # of the Balk.
                if basefile not in val and not val.endswith("balken"):
                    self.log.warning(
                        "%s: Base SFS %s not in title %r" % (basefile,
                                                             basefile,
                                                             val))
                d[docuri]["dcterms:title"] = val
                d[docuri]["rdf:type"] = self._forfattningstyp(val)
            elif key == 'Observera':
                if not self.config.keepexpired and expired_marker in val:
                    # Take the 10-character date that follows the marker,
                    # wherever the marker sits in the text (the previous
                    # fixed [41:51] slice only worked at position 0).
                    datestr = val.partition(expired_marker)[2][:10]
                    dateval = datetime.strptime(datestr, '%Y-%m-%d')
                    if dateval < datetime.today():
                        raise UpphavdForfattning(
                            "%s is an expired SFS" % basefile,
                            dummyfile=self.store.parsed_path(basefile))
                d[docuri]["rdfs:comment"] = val
            elif key == 'Ikraft':
                d[docuri]["rpubl:ikrafttradandedatum"] = val[:10]
            elif key == 'Omfattning':
                # First, create rdf statements for every
                # single modified section we can find
                for changecat in val.split('; '):
                    if changecat.startswith(('ändr.', 'ändr ', 'ändring ')):
                        pred = self.ns['rpubl'].ersatter
                    elif changecat.startswith(('upph.', 'upp.', 'utgår')):
                        pred = self.ns['rpubl'].upphaver
                    elif changecat.startswith(('ny', 'ikrafttr.',
                                               'ikrafftr.', 'ikraftr.',
                                               'ikraftträd.', 'tillägg')):
                        pred = self.ns['rpubl'].inforsI
                    elif (changecat.startswith(('nuvarande',
                                                'rubr. närmast')) or
                          changecat in ('begr. giltighet',
                                        'Omtryck',
                                        'omtryck',
                                        'forts.giltighet',
                                        'forts. giltighet',
                                        'forts. giltighet av vissa best.')):
                        # some of these changecats are renames, eg
                        # "nuvarande 2, 3, 4, 5 §§ betecknas 10,
                        # 11, 12, 13, 14, 15 §§;" or
                        # "rubr. närmast efter 1 § sätts närmast
                        # före 10 §"
                        pred = None
                    else:
                        self.log.warning(
                            "%s: Okänd omfattningstyp %r" %
                            (basefile, changecat))
                        pred = None
                    # Temporarily point the parser at this change act;
                    # restore the previous value even if parsing fails.
                    old_currenturl = self.lagrum_parser._currenturl
                    self.lagrum_parser._currenturl = docuri
                    try:
                        for node in self.lagrum_parser.parse_string(
                                changecat, pred):
                            if hasattr(node, 'predicate'):
                                qname = g.qname(node.predicate)
                                d[docuri][qname] = node.uri
                    finally:
                        self.lagrum_parser._currenturl = old_currenturl
                # Secondly, preserve the entire text
                d[docuri]["rpubl:andrar"] = val
            elif key == 'Förarbeten':
                for node in self.forarbete_parser.parse_string(
                        val, "rpubl:forarbete"):
                    if hasattr(node, 'uri'):
                        d[docuri].setdefault(
                            "rpubl:forarbete", []).append(node.uri)
                        d[node.uri] = {"dcterms:identifier": str(node)}
            elif key == 'CELEX-nr':
                # Raw string: "\d" in a plain literal is an invalid
                # escape sequence.
                for celex in re.findall(r'3\d{2,4}[LR]\d{4}', val):
                    b = BNode()
                    cg = Graph()
                    cg.add((b, RPUBL.celexNummer, Literal(celex)))
                    celexuri = self.minter.space.coin_uri(cg.resource(b))
                    d[docuri].setdefault(
                        "rpubl:genomforDirektiv", []).append(celexuri)
                    d[celexuri] = {"rpubl:celexNummer": celex}
            elif key == 'Tidsbegränsad':
                # NOTE(review): this writes to the top-level dict rather
                # than d[docuri], unlike every other branch -- looks
                # unintended, but kept as-is since callers may rely on
                # it. Confirm.
                d["rinfoex:tidsbegransad"] = val[:10]
                expdate = datetime.strptime(val[:10], '%Y-%m-%d')
                if expdate < datetime.today():
                    if not self.config.keepexpired:
                        raise UpphavdForfattning(
                            "%s is expired (time-limited) SFS" % basefile,
                            dummyfile=self.store.parsed_path(basefile))
            else:
                # Both values must be passed as one tuple to the format
                # operator; the previous form raised TypeError here.
                self.log.warning(
                    "%s: Obekant nyckel ['%s']" % (basefile, key))
        utfardandedatum = self._find_utfardandedatum(sfsnr)
        if utfardandedatum:
            d[docuri]["rpubl:utfardandedatum"] = utfardandedatum
    return d