def downloaded_to_intermediate(self, basefile): # Check to see if this might not be a proper SFS at all # (from time to time, other agencies publish their stuff # in SFS - this seems to be handled by giving those # documents a SFS nummer on the form "N1992:31". Filter # these out. if basefile.startswith('N'): raise IckeSFS("%s is not a regular SFS" % basefile) filename = self.store.downloaded_path(basefile) try: t = TextReader(filename, encoding=self.source_encoding) except IOError: self.log.warning("%s: Fulltext is missing" % basefile) # FIXME: This code needs to be rewritten baseuri = self.canonical_uri(basefile) if baseuri in registry: title = registry[baseuri].value(URIRef(baseuri), self.ns['dcterms'].title) desc.value(self.ns['dcterms'].title, title) desc.rel(self.ns['dcterms'].publisher, self.lookup_resource("Regeringskansliet")) desc.value(self.ns['dcterms'].identifier, "SFS " + basefile) doc.body = Forfattning([Stycke(['Lagtext saknas'], id='S1')]) # Check to see if the Författning has been revoked (using # plain fast string searching, no fancy HTML parsing and # traversing) if not self.config.keepexpired: try: t.cuepast('<i>Författningen är upphävd/skall upphävas: ') datestr = t.readto('</i></b>') if datetime.strptime(datestr, '%Y-%m-%d') < datetime.today(): self.log.debug('%s: Expired' % basefile) raise UpphavdForfattning( "%s is an expired SFS" % basefile, dummyfile=self.store.parsed_path(basefile)) t.seek(0) except IOError: t.seek(0) t.cuepast('<pre>') # remove ä et al try: # this is the preferred way from py34 onwards. FIXME: Move # this to ferenda.compat import html txt = html.unescape(t.readto('</pre>')) except ImportError: # this is the old way. hp = HTMLParser() txt = hp.unescape(t.readto('</pre>')) if '\r\n' not in txt: txt = txt.replace('\n', '\r\n') re_tags = re.compile("</?\w{1,3}>") txt = re_tags.sub('', txt) # add ending CRLF aids with producing better diffs txt += "\r\n" util.writefile(self.store.intermediate_path(basefile), txt, encoding=self.source_encoding) return codecs.open(self.store.intermediate_path(basefile), encoding=self.source_encoding)
def downloaded_to_intermediate(self, basefile, attachment=None): # Check to see if this might not be a proper SFS at all # (from time to time, other agencies publish their stuff # in SFS - this seems to be handled by giving those # documents a SFS nummer on the form "N1992:31". Filter # these out. if basefile.startswith('N'): raise IckeSFS("%s is not a regular SFS" % basefile) filename = self.store.downloaded_path(basefile) try: t = TextReader(filename, encoding=self.source_encoding) except IOError: self.log.warning("%s: Fulltext is missing" % basefile) # FIXME: This code needs to be rewritten baseuri = self.canonical_uri(basefile) if baseuri in registry: title = registry[baseuri].value(URIRef(baseuri), self.ns['dcterms'].title) desc.value(self.ns['dcterms'].title, title) desc.rel(self.ns['dcterms'].publisher, self.lookup_resource("Regeringskansliet")) desc.value(self.ns['dcterms'].identifier, "SFS " + basefile) doc.body = Forfattning([Stycke(['Lagtext saknas'], id='S1')]) # Check to see if the Författning has been revoked (using # plain fast string searching, no fancy HTML parsing and # traversing) if not self.config.keepexpired: try: t.cuepast('<i>Författningen är upphävd/skall upphävas: ') datestr = t.readto('</i></b>') if datetime.strptime(datestr, '%Y-%m-%d') < datetime.today(): self.log.debug('%s: Expired' % basefile) raise UpphavdForfattning("%s is an expired SFS" % basefile, dummyfile=self.store.parsed_path(basefile)) t.seek(0) except IOError: t.seek(0) t.cuepast('<pre>') # remove ä et al try: # this is the preferred way from py34 onwards. FIXME: Move # this to ferenda.compat import html txt = html.unescape(t.readto('</pre>')) except ImportError: # this is the old way. hp = HTMLParser() txt = hp.unescape(t.readto('</pre>')) if '\r\n' not in txt: txt = txt.replace('\n', '\r\n') re_tags = re.compile("</?\w{1,3}>") txt = re_tags.sub('', txt) # add ending CRLF aids with producing better diffs txt += "\r\n" util.writefile(self.store.intermediate_path(basefile), txt, encoding=self.source_encoding) return codecs.open(self.store.intermediate_path(basefile), encoding=self.source_encoding)