def parse(self, doc):
    head, body = util.readfile(self.store.downloaded_path(doc.basefile)).split("\n\n", 1)
    datestr, timestr, title = head.split(" ", 2)
    published = datetime.strptime("%s %s" % (datestr, timestr), "%Y-%m-%d %H:%M:%S")
    doc.meta.add((URIRef(doc.uri), RDF.type, self.rdf_type))
    doc.meta.add((URIRef(doc.uri), DCTERMS.issued, Literal(published)))
    doc.meta.add((URIRef(doc.uri), DCTERMS.title, Literal(title, lang=doc.lang)))
    soup = bs4.BeautifulSoup("<div class='sitenews-item'>" + body + "</div>", "lxml")
    doc.body = elements_from_soup(soup.body)
    # move timestamp into dcterms:issued, title into dcterms:title
    # parse body with elements_from_soup
    # set first real para as dcterms:abstract (XMLLiteral)
    doc.body[0][0] = Div([doc.body[0][0]],
                         datatype="rdf:XMLLiteral",
                         property="dcterms:abstract")
    # but we need to add it to doc.meta RIGHT AWAY because of reasons...
    doc.meta.add((URIRef(doc.uri), DCTERMS.abstract,
                  Literal(body.split("\n\n")[0], datatype=RDF.XMLLiteral)))
    self.parse_entry_update(doc)  # need to set published and possibly updated
    entry = DocumentEntry(self.store.documententry_path(doc.basefile))
    entry.published = published
    entry.save()
    return True
def test_guess_type(self):
    d = DocumentEntry()
    self.assertEqual(d.guess_type("test.pdf"), "application/pdf")
    self.assertEqual(d.guess_type("test.rdf"), "application/rdf+xml")
    self.assertEqual(d.guess_type("test.html"), "text/html")
    self.assertEqual(d.guess_type("test.xhtml"), "application/html+xml")
    self.assertEqual(d.guess_type("test.bin"), "application/octet-stream")
def test_write_atom_inline(self):
    self.repo.faceted_data = Mock(return_value=self.faceted_data)
    for basefile in range(25):
        de = DocumentEntry(self.repo.store.documententry_path(str(basefile)))
        util.writefile(self.repo.store.parsed_path(str(basefile)),
                       "<html><p>Document #%s</p></html>" % basefile)
        de.set_content(self.repo.store.parsed_path(str(basefile)),
                       self.repo.canonical_uri(str(basefile)),
                       inline=True)
        de.save()

    unsorted_entries = self.repo.news_facet_entries()
    entries = sorted(list(unsorted_entries),
                     key=itemgetter('updated'), reverse=True)
    self.repo.news_write_atom(entries,
                              'New and updated documents',
                              'main',
                              archivesize=6)
    tree = etree.parse('%s/base/feed/main.atom' % self.datadir)
    NS = "{http://www.w3.org/2005/Atom}"
    content = tree.find(".//" + NS + "content")
    self.assertIsNotNone(content)
    self.assertIsNone(content.get("src"))
    self.assertIsNone(content.get("hash"))
    self.assertEqual(content.get("type"), "xhtml")
    self.assertEqualXML(etree.tostring(content[0]),
                        '<html xmlns="http://www.w3.org/2005/Atom" xmlns:le="http://purl.org/atompub/link-extensions/1.0"><p>Document #24</p></html>')
def test_save(self):
    path = self.repo.store.documententry_path("123/a")
    d = DocumentEntry()
    d.orig_checked = datetime(2013, 3, 27, 20, 46, 37)
    d.orig_url = 'http://source.example.org/doc/123/a'
    d.save(path=path)

    self.maxDiff = None
    self.assertEqual(self.d2u(util.readfile(path)), self.basic_json)
def test_incomplete_entries(self):
    self.repo.faceted_data = Mock(return_value=self.faceted_data)
    # make our entries incomplete in various ways
    entry = DocumentEntry(self.repo.store.documententry_path("1"))
    entry.published = None
    entry.save()

    # try very hard to remove title from everywhere
    entry = DocumentEntry(self.repo.store.documententry_path("2"))
    del entry.title
    entry.save()
    g = rdflib.Graph().parse(self.repo.store.distilled_path("2"))
    g.remove((rdflib.URIRef("http://localhost:8000/res/base/2"),
              self.repo.ns['dcterms'].title,
              rdflib.Literal("Doc #2")))
    with open(self.repo.store.distilled_path("2"), "wb") as fp:
        g.serialize(fp, format="pretty-xml")

    os.unlink(self.repo.store.distilled_path("3"))

    # entries w/o published date and w/o distilled file should not
    # be published, but w/o title is OK
    with silence():  # avoid warnings about stale entry files since
                     # the downloaded and intermediate file is
                     # missing, which would exist in a real scenario
        self.assertEqual(len(list(self.repo.news_entries())), 23)

        # also make sure that corresponding faceted_entries do not
        # show these non-published entries
        self.assertEqual(len(self.repo.news_facet_entries()), 23)
def test_set_link(self):
    t = tempfile.mktemp()
    with open(t + ".html", "w") as f:
        f.write("<div>xhtml fragment</div>")

    d = DocumentEntry()
    d.set_link(t + ".html", "http://example.org/test")
    self.assertEqual(d.link['href'], "http://example.org/test")
    self.assertEqual(d.link['type'], "text/html")
    self.assertEqual(d.link['length'], 25)
    self.assertEqual(d.link['hash'], "md5:ca8d87b5cf6edbbe88f51d45926c9a8d")
def setUp(self):
    super(News, self).setUp()
    self.faceted_data = []
    # create a bunch of DocumentEntry objects and save them
    basetime = datetime(2013, 1, 1, 12, 0)
    for basefile in range(25):
        v = {'id': self.repo.canonical_uri(basefile),
             'title': "Doc #%s" % basefile}
        self.faceted_data.append({'uri': v['id'],
                                  'dcterms_title': v['title'],
                                  'rdf_type': 'http://xmlns.com/foaf/0.1/Document'})
        de = DocumentEntry()
        de.orig_created = basetime + timedelta(hours=basefile)
        de.orig_updated = basetime + timedelta(hours=basefile, minutes=10)
        de.orig_checked = basetime + timedelta(hours=basefile, minutes=20)
        de.published = basetime + timedelta(hours=basefile, minutes=30)
        de.updated = basetime + timedelta(hours=basefile, minutes=40)
        de.orig_url = "http://source.example.org/doc/%s" % basefile
        de.title = v['title']
        de.save(self.repo.store.documententry_path(str(basefile)))

        g = rdflib.Graph()
        desc = Describer(g, self.repo.canonical_uri(basefile))
        dcterms = self.repo.ns['dcterms']
        desc.rdftype(self.repo.ns['foaf'].Document)
        desc.value(dcterms.title, "Invalid title")
        util.ensure_dir(self.repo.store.distilled_path(str(basefile)))
        with open(self.repo.store.distilled_path(str(basefile)), "wb") as fp:
            g.serialize(fp, format="pretty-xml")

        util.ensure_dir(self.repo.store.parsed_path(str(basefile)))
        with open(self.repo.store.parsed_path(str(basefile)), "w") as fp:
            fp.write("""<?xml version='1.0' encoding='utf-8'?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML+RDFa 1.0//EN" "http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:dcterms="http://purl.org/dc/terms/" xml:lang="en">
  <head about="%(id)s">
    <title>%(title)s</title>
  </head>
  <body about="%(id)s">
    <h1>%(title)s</h1>
  </body>
</html>""" % v)

        util.ensure_dir(self.repo.store.generated_path(str(basefile)))
        with open(self.repo.store.generated_path(str(basefile)), "w") as fp:
            fp.write("""<!DOCTYPE html>
<html>
  <head>
    <title>%(title)s</title>
  </head>
  <body>
    <h1>%(title)s</h1>
  </body>
</html>""" % v)
def test_init(self):
    d = DocumentEntry()
    self.assertIsNone(d.id)  # same for .updated, .published,
                             # .title, .summary, .url and .content
    self.assertEqual(d.content, {})
    self.assertEqual(d.link, {})

    path = self.repo.store.documententry_path("123/b")
    d = DocumentEntry(path=path)
    self.assertIsNone(d.id)  # same for .updated, .published,
                             # .title, .summary, .url and .content
    self.assertEqual(d.content, {})
    self.assertEqual(d.link, {})
def download_single(self, basefile, url=None):
    if not url:
        entry = DocumentEntry(self.store.documententry_path(basefile))
        url = entry.orig_url
    xml_downloaded_path = self.store.downloaded_path(basefile).replace(".pdf", ".xml")
    if self.get_parse_options(basefile) == "metadataonly":
        # in these cases, to save space, get the smaller XML OCR
        # data, not the actual scanned images-in-PDF
        url = url.replace(".pdf", ".xml").replace("pdf/web", "xml")
        # make store.downloaded_path return .xml suffixes (and set
        # the timestamp to the beginning of epoch so that the
        # resulting if-modified-since header doesn't contain the
        # current date/time)
        if not os.path.exists(xml_downloaded_path):
            util.writefile(xml_downloaded_path, "")
            os.utime(xml_downloaded_path, (0, 0))
    else:
        # if parse options have changed from metadataonly to
        # default, there will be a xml file lying about which will
        # make downloaded_path return its name. Remove it so that
        # we don't end up with pdf files that have a .xml
        # extension.
        if os.path.exists(xml_downloaded_path):
            os.unlink(xml_downloaded_path)
    return super(PropKB, self).download_single(basefile, url)
def infer_metadata(self, resource, basefile):
    super(InferTimes, self).infer_metadata(resource, basefile)
    desc = Describer(resource.graph, resource.identifier)
    de = DocumentEntry(self.store.documententry_path(basefile))
    if de.orig_updated:
        desc.value(RINFOEX.senastHamtad, de.orig_updated)
    if de.orig_checked:
        desc.value(RINFOEX.senastKontrollerad, de.orig_checked)
def source_url(self, basefile):
    # this source does not have any predictable URLs, so we try to
    # find if we made a note on the URL when we ran download()
    # FIXME: This code is repeated in jk.py and regeringen.py --
    # maybe we should let the default impl of source_url try this
    # strategy if eg self.remote_url is None?
    entry = DocumentEntry(self.store.documententry_path(basefile))
    return entry.orig_url
def trim_documententry(basefile):
    # if the path (typically for the distilled or parsed file) is
    # a 0-size file, the following steps should not be carried
    # out. But since they at some point might have done that
    # anyway, we're left with a bunch of stale error reports in
    # the entry files. As a one-time-thing, try to blank out
    # irrelevant sections.
    entry = DocumentEntry(self.documententry_path(basefile))
    sections = {'parse': ['parse', 'relate', 'generate'],
                'relate': ['relate', 'generate'],
                'generate': ['generate']}.get(action, {})
    for section in sections:
        if section in entry.status:
            del entry.status[section]
    entry.save()
def download_single(self, basefile, url):
    updated = False
    created = False
    filename = self.store.downloaded_path(basefile)
    created = not os.path.exists(filename)
    # util.print_open_fds()
    if self.download_if_needed(url, basefile):
        if created:
            self.log.info("%s: downloaded from %s" % (basefile, url))
        else:
            self.log.info("%s: downloaded new version from %s" % (basefile, url))
        updated = True
    else:
        self.log.debug("%s: exists and is unchanged" % basefile)

    soup = BeautifulSoup(util.readfile(filename), "lxml")
    for pdflink in soup.find_all("a", href=re.compile(r"\.pdf$")):
        slug = "-".join(pdflink["href"].rsplit("/")[-2:])
        attachment_path = self.store.downloaded_path(basefile, attachment=slug)
        self.download_if_needed(urljoin(url, pdflink["href"]),
                                basefile, filename=attachment_path)
    vm = soup.find("a", text="Visa Varumärke")
    if vm:
        attachment_path = self.store.downloaded_path(basefile,
                                                     attachment="varumarke.jpg")
        attachment_url = re.search("http[^'\"]*", vm["href"]).group(0)
        self.download_if_needed(attachment_url, basefile,
                                filename=attachment_path)

    entry = DocumentEntry(self.store.documententry_path(basefile))
    now = datetime.now()
    entry.orig_url = url
    if created:
        entry.orig_created = now
    if updated:
        entry.orig_updated = now
    entry.orig_checked = now
    entry.save()
def create_entry(self, basefile, timestampoffset=0):
    # create a entry file with indexed_{ft,ts,dep} set to the
    # current time with optional offset. Also
    # .status['generated']['date'], to test needed(...,
    # 'transformlinks')
    de = DocumentEntry(self.store.documententry_path(basefile))
    delta = timedelta(seconds=timestampoffset)
    ts = datetime.now() + delta
    de.indexed_ts = ts
    de.indexed_ft = ts
    de.indexed_dep = ts
    de.updated = ts
    de.status['generate'] = {'date': ts}
    de.save()
def inner_wrapper(self, *args, **kwargs):
    # try to find out if we have a basefile
    if args and args[0]:
        entrypath_arg = args[0]
    else:
        args = ()
        entrypath_arg = ".root"
    entrypath = self.store.documententry_path
    args = [self] + list(args)
    return DocumentEntry.updateentry(f, section, entrypath, entrypath_arg,
                                     None, *args, **kwargs)
def inner_wrapper(self, *args, **kwargs):
    if args and args[0]:  # try to find out if we have a basefile
        basefile = args[0]
    else:
        basefile = ".root"
        args = ()
    entrypath = self.store.documententry_path(basefile)
    args = [self] + list(args)
    return DocumentEntry.updateentry(f, section, entrypath, *args, **kwargs)
def test_load(self):
    path = self.repo.store.documententry_path("123/a")
    util.ensure_dir(path)
    with open(path, "w") as fp:
        fp.write(self.basic_json)

    d = DocumentEntry(path=path)
    self.assertEqual(d.orig_checked, datetime(2013, 3, 27, 20, 46, 37))
    self.assertIsNone(d.orig_updated)
    self.assertEqual(d.orig_url, 'http://source.example.org/doc/123/a')
    self.assertEqual(d.id, 'http://example.org/123/a')
    self.assertEqual('<DocumentEntry id=http://example.org/123/a>', repr(d))
def test_ifneeded_relate(self):
    @ifneeded("relate")
    def testfunc(repo, basefile, needed):
        repo.called = True
        repo.needed = needed

    try:
        datadir = tempfile.mkdtemp()
        mockbasefile = "1234"
        mockrepo = Mock()
        mockrepo.store = DocumentStore(datadir=datadir)
        mockrepo.called = False
        mockrepo.config.force = False

        # create some docentry file in a good place
        de = DocumentEntry(mockrepo.store.documententry_path("1234"))
        now = datetime.datetime.now()
        de.indexed_ts = now + datetime.timedelta(seconds=3600)
        de.indexed_ft = now + datetime.timedelta(seconds=-3600)
        de.indexed_dep = now + datetime.timedelta(seconds=-3600)
        de.save()

        # test 1: Outfile is newer - the ifneeded decorator should
        # make sure the actual testfunc code is never reached
        # NOTE: the "relate" branch of DocumentStore.needed doesn't
        # use outfile_is_newer, so we can't patch that, we have to
        # create actual files
        parsedpath = mockrepo.store.parsed_path("1234")
        util.writefile(parsedpath, "dummy")
        os.utime(parsedpath, (now.timestamp(), now.timestamp() - 7200))
        testfunc(mockrepo, mockbasefile)
        self.assertFalse(mockrepo.called)
        mockrepo.called = False

        # test 2: Outfile is older than the information in the
        # documententry file
        os.utime(parsedpath, (now.timestamp(), now.timestamp()))
        testfunc(mockrepo, mockbasefile)
        self.assertTrue(mockrepo.called)
        self.assertTrue(mockrepo.needed)
        self.assertFalse(mockrepo.needed.triples)
        self.assertFalse(mockrepo.needed.dependencies)
        self.assertTrue(mockrepo.needed.fulltext)
        mockrepo.called = False

        # test 3: Outfile is newer, but the global force option was set
        os.utime(parsedpath, (now.timestamp(), now.timestamp() - 7200))
        mockrepo.config.force = True
        testfunc(mockrepo, mockbasefile)
        self.assertTrue(mockrepo.called)
        mockrepo.config.force = None
        mockrepo.called = False
    finally:
        if os.path.exists(datadir):
            shutil.rmtree(datadir)
def test_modify(self):
    path = self.repo.store.documententry_path("123/a")
    util.ensure_dir(path)
    with open(path, "w") as fp:
        fp.write(self.basic_json)

    d = DocumentEntry(path=path)
    d.orig_updated = datetime(2013, 3, 27, 20, 59, 42, 325067)
    d.id = "http://example.org/123/a"
    # do this in setUp?
    with open(self.datadir + "/xhtml", "w") as f:
        f.write("<div>xhtml fragment</div>")
    d.set_content(self.datadir + "/xhtml", "http://example.org/test",
                  mimetype="xhtml", inline=True)
    d.save()
    self.assertEqual(self.d2u(util.readfile(path)), self.modified_json)
def test_load_status(self):
    path = self.repo.store.documententry_path("123/a")
    util.ensure_dir(path)
    with open(path, "w") as fp:
        fp.write(self.status_json)

    d = DocumentEntry(path=path)
    self.assertEqual(datetime(2018, 8, 14, 18, 15, 00), d.status['download']['date'])
    self.assertEqual(datetime(2018, 8, 14, 18, 16, 00), d.status['parse']['date'])
    self.assertEqual(datetime(2018, 8, 14, 18, 17, 00), d.status['relate']['date'])
    self.assertEqual(datetime(2018, 8, 14, 18, 18, 00), d.status['generate']['date'])
    self.assertEqual("2018-08-14T18:18:00", d.status['generate']['not_a_date'])
def download(self, basefile=None, url=None):
    if basefile:
        if not url:
            entry = DocumentEntry(self.store.documententry_path(basefile))
            url = entry.orig_url
        if url:
            return self.download_single(basefile, url)
        else:
            raise DownloadError("%s doesn't support downloading single basefiles w/o page URL" %
                                self.__class__.__name__)
    params = {'filterType': 'Taxonomy',
              'filterByType': 'FilterablePageBase',
              'preFilteredCategories': '1324',
              'rootPageReference': '0',
              'filteredContentCategories': self.document_type}
    if 'lastdownload' in self.config and not self.config.refresh:
        params['fromDate'] = self.config.lastdownload.strftime("%Y-%m-%d")
    # temporary test -- useful when troubleshooting behaviour related
    # to malformed entries in the search result list
    # params['fromDate'] = "2009-05-13"
    # params['toDate'] = "2009-05-20"
    self.log.debug("Loading documents starting from %s" %
                   params.get('fromDate', "the beginning"))
    try:
        for basefile, url in self.download_get_basefiles(params):
            try:
                # sleep(0.5)  # regeringen.se has a tendency to throw
                # 400 errors, maybe because we're too quick?
                self.download_single(basefile, url)
            except requests.exceptions.HTTPError as e:
                if self.download_accept_404 and e.response.status_code == 404:
                    self.log.error("%s: %s %s" % (basefile, url, e))
                    ret = False
                else:
                    raise e
    finally:
        urlmap_path = self.store.path("urls", "downloaded", ".map",
                                      storage_policy="file")
        util.ensure_dir(urlmap_path)
        with codecs.open(urlmap_path, "w", encoding="utf-8") as fp:
            for url, identifier in self.urlmap.items():
                fp.write("%s\t%s\n" % (url, identifier))
def infer_metadata(self, resource, basefile):
    # remove the bogus dcterms:issued thing that we only added to
    # aid URI generation. NB: This is removed in the superclass'
    # postprocess_doc as well, because for this lagen.nu-derived
    # class it needs to be done at this point, but for use of the
    # superclass directly, it needs to be done at some point.
    for o in resource.objects(DCTERMS.issued):
        if not o.datatype:
            resource.remove(DCTERMS.issued, o)
    sameas_uri = self.sameas_minter.space.coin_uri(resource)
    resource.add(OWL.sameAs, URIRef(sameas_uri))
    resource.graph.add((URIRef(self.canonical_uri(basefile, True)),
                        OWL.sameAs, resource.identifier))
    # then find each rpubl:konsolideringsunderlag, and create
    # owl:sameas for them as well
    for subresource in resource.objects(RPUBL.konsolideringsunderlag):
        # sometimes there'll be a rpubl:konsolideringsunderlag to a
        # resource URI but no actual data about that resource. This
        # seems to happen if SFST is updated but SFSR is not. In
        # those cases we can't generate a owl:sameAs URI since we
        # have no other data about the resource.
        if subresource.value(RDF.type):
            uri = self.sameas_minter.space.coin_uri(subresource)
            subresource.add(OWL.sameAs, URIRef(uri))
    desc = Describer(resource.graph, resource.identifier)
    de = DocumentEntry(self.store.documententry_path(basefile))
    if de.orig_updated:
        desc.value(RINFOEX.senastHamtad, de.orig_updated)
    if de.orig_checked:
        desc.value(RINFOEX.senastKontrollerad, de.orig_checked)
    rooturi = URIRef(desc.getrel(RPUBL.konsoliderar))
    v = self.commondata.value(rooturi, DCTERMS.alternate, any=True)
    if v:
        desc.value(DCTERMS.alternate, v)
    v = self.commondata.value(rooturi, RDFS.label, any=True)
    if v:
        # don't include labels if they're essentially the same as
        # dcterms:title (legalref needs it to be able to parse refs
        # to laws that typically don't include SFS numbers, so
        # that's why they're in sfs.ttl)
        basetitle = str(resource.value(DCTERMS.title)).rsplit(" (")[0]
        if not v.startswith(basetitle.lower()):
            desc.value(RDFS.label, util.ucfirst(v))
def download(self, basefile=None, url=None):
    if basefile:
        if not url:
            entry = DocumentEntry(self.store.documententry_path(basefile))
            url = entry.orig_url
        if url:
            return self.download_single(basefile, url)
        else:
            raise DownloadError("%s doesn't support downloading single basefiles w/o page URL" %
                                self.__class__.__name__)
    self.session = requests.session()
    if ('lastdownload' in self.config and
            self.config.lastdownload and
            not self.config.refresh):
        startdate = self.config.lastdownload - timedelta(days=30)
        self.start_url += "&from=%s" % datetime.strftime(startdate, "%Y-%m-%d")
    for basefile, url in self.download_get_basefiles(self.start_url):
        self.download_single(basefile, url)
def needed(self, basefile, action):
    # if this function is even called, it means that force is not
    # true (or ferenda-build.py has not been called with a single
    # basefile, which is an implied force)
    if action == "parse":
        infile = self.downloaded_path(basefile)
        outfile = self.parsed_path(basefile)
        return not util.outfile_is_newer([infile], outfile)
    elif action == "relate":
        entry = DocumentEntry(self.documententry_path(basefile))

        def newer(filename, dt):
            if not os.path.exists(filename):
                return False
            elif not dt:  # has never been indexed
                return True
            else:
                return datetime.fromtimestamp(os.stat(filename).st_mtime) > dt

        return Relate(fulltext=newer(self.parsed_path(basefile), entry.indexed_ft),
                      triples=newer(self.distilled_path(basefile), entry.indexed_ts),
                      dependencies=newer(self.distilled_path(basefile), entry.indexed_dep))
    elif action == "generate":
        infile = self.parsed_path(basefile)
        annotations = self.annotation_path(basefile)
        if os.path.exists(self.dependencies_path(basefile)):
            deptxt = util.readfile(self.dependencies_path(basefile))
            dependencies = deptxt.strip().split("\n")
        else:
            dependencies = []
        dependencies.extend((infile, annotations))
        outfile = self.generated_path(basefile)
        return util.outfile_is_newer(dependencies, outfile)
    else:
        # custom actions will need to override needed and provide logic there
        return True
def infer_metadata(self, resource, basefile):
    # remove the bogus dcterms:issued thing that we only added to
    # aid URI generation. NB: This is removed in the superclass'
    # postprocess_doc as well, because for this lagen.nu-derived
    # class it needs to be done at this point, but for use of the
    # superclass directly, it needs to be done at some point.
    for o in resource.objects(DCTERMS.issued):
        if not o.datatype:
            resource.remove(DCTERMS.issued, o)
    sameas_uri = self.sameas_minter.space.coin_uri(resource)
    resource.add(OWL.sameAs, URIRef(sameas_uri))
    resource.graph.add((URIRef(self.canonical_uri(basefile, True)),
                        OWL.sameAs, resource.identifier))
    # then find each rpubl:konsolideringsunderlag, and create
    # owl:sameas for them as well
    for subresource in resource.objects(RPUBL.konsolideringsunderlag):
        # sometimes there'll be a rpubl:konsolideringsunderlag to a
        # resource URI but no actual data about that resource. This
        # seems to happen if SFST is updated but SFSR is not. In
        # those cases we can't generate a owl:sameAs URI since we
        # have no other data about the resource.
        if subresource.value(RDF.type):
            uri = self.sameas_minter.space.coin_uri(subresource)
            subresource.add(OWL.sameAs, URIRef(uri))
    desc = Describer(resource.graph, resource.identifier)
    de = DocumentEntry(self.store.documententry_path(basefile))
    if de.orig_updated:
        desc.value(RINFOEX.senastHamtad, de.orig_updated)
    if de.orig_checked:
        desc.value(RINFOEX.senastKontrollerad, de.orig_checked)
    v = self.commondata.value(resource.identifier, DCTERMS.alternate, any=True)
    if v:
        desc.value(DCTERMS.alternate, v)
def download_single(self, basefile, url):
    # unpack the tuples we may receive instead of plain strings
    if isinstance(basefile, tuple):
        basefile, attachment = basefile
        if attachment:
            mainattachment = attachment + ".html"
        else:
            mainattachment = None
    if isinstance(url, tuple):
        url, extraurls = url

    updated = created = False
    checked = True
    filename = self.store.downloaded_path(basefile, attachment=mainattachment)
    created = not os.path.exists(filename)
    if self.download_if_needed(url, basefile, filename=filename):
        if created:
            self.log.info("%s: downloaded from %s" % (basefile, url))
        else:
            self.log.info("%s: downloaded new version from %s" % (basefile, url))
        updated = True
    else:
        self.log.debug("%s: exists and is unchanged" % basefile)

    for url in extraurls:
        if url.endswith('msword.application'):
            # NOTE: We cannot be sure that this is actually a Word
            # (CDF) file. For older files it might be a WordPerfect
            # file (.wpd) or a RDF file, for newer it might be a
            # .docx. We cannot be sure until we've downloaded it.
            # So we quickly read the first 4 bytes
            r = requests.get(url, stream=True)
            sig = r.raw.read(4)
            # r.raw.close()
            #bodyidx = head.index("\n\n")
            #sig = head[bodyidx:bodyidx+4]
            if sig == b'\xffWPC':
                doctype = ".wpd"
            elif sig == b'\xd0\xcf\x11\xe0':
                doctype = ".doc"
            elif sig == b'PK\x03\x04':
                doctype = ".docx"
            elif sig == b'{\\rt':
                doctype = ".rtf"
            else:
                self.log.error("%s: Attached file has signature %r -- don't know what type this is" %
                               (basefile, sig))
                continue
        elif url.endswith('pdf.application'):
            doctype = ".pdf"
        else:
            self.log.warning("Unknown doc type %s" % td.a['href'].split("=")[-1])
            doctype = None
        if doctype:
            if attachment:
                filename = self.store.downloaded_path(basefile,
                                                      attachment=attachment + doctype)
            else:
                filename = self.store.downloaded_path(basefile,
                                                      attachment="index" + doctype)
            self.log.debug("%s: downloading attachment %s" % (basefile, filename))
            self.download_if_needed(url, basefile, filename=filename)

    if mainattachment == None:
        entry = DocumentEntry(self.store.documententry_path(basefile))
        now = datetime.now()
        entry.orig_url = url
        if created:
            entry.orig_created = now
        if updated:
            entry.orig_updated = now
        if checked:
            entry.orig_checked = now
        entry.save()
    return updated
def source_url(self, basefile):
    # this source does not have any predictable URLs, so we try to
    # find if we made a note on the URL when we ran download()
    entry = DocumentEntry(self.store.documententry_path(basefile))
    return entry.orig_url
def download_single(self, basefile, url=None):
    if self.get_parse_options(basefile) == "skip":
        raise DocumentSkippedError("%s should not be downloaded according to options.py" %
                                   basefile)
    if not url:
        url = self.remote_url(basefile)
        if not url:  # remote_url failed
            return
    filename = self.store.downloaded_path(basefile)  # just the html page
    updated = filesupdated = False
    created = not os.path.exists(filename)
    if (not os.path.exists(filename) or self.config.refresh):
        existed = os.path.exists(filename)
        try:
            updated = self.download_if_needed(url, basefile, filename=filename)
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 400:
                # regeringen.se seems to have a problem with the
                # first req after a search -- unless slowed down,
                # raises a 400 error. Sleep on it, and try once more
                sleep(5)
                updated = self.download_if_needed(url, basefile, filename=filename)
            else:
                raise
        docid = url.split("/")[-1]
        if existed:
            if updated:
                self.log.info("%s: updated from %s" % (basefile, url))
            else:
                self.log.debug("%s: %s is unchanged, checking PDF files" %
                               (basefile, filename))
        else:
            self.log.info("%s: download OK from %s" % (basefile, url))

        if self.get_parse_options(basefile) == "metadataonly":
            self.log.debug("%s: Marked as 'metadataonly', not downloading actual PDF file" %
                           basefile)
        else:
            soup = BeautifulSoup(codecs.open(filename, encoding=self.source_encoding),
                                 "lxml")
            cnt = 0
            selected_files = self.find_doc_links(soup, basefile)
            if selected_files:
                for (filename, filetype, label) in selected_files:
                    fileurl = urljoin(url, filename)
                    basepath = filename.split("/")[-1]
                    filename = self.store.downloaded_path(basefile, attachment=basepath)
                    if not filename.lower().endswith(".pdf"):
                        filename += ".%s" % filetype
                    if self.download_if_needed(fileurl, basefile, filename=filename):
                        filesupdated = True
                        self.log.debug(" %s is new or updated" % filename)
                    else:
                        self.log.debug(" %s is unchanged" % filename)
            else:
                self.log.warning("%s (%s) has no downloadable files" % (basefile, url))
        if updated or filesupdated:
            pass
        else:
            self.log.debug("%s and all files are unchanged" % filename)
    else:
        self.log.debug("%s: %s already exists" % (basefile, filename))

    entry = DocumentEntry(self.store.documententry_path(basefile))
    now = datetime.now()
    entry.orig_url = url
    if created:
        entry.orig_created = now
    if updated or filesupdated:
        entry.orig_updated = now
    entry.orig_checked = now
    entry.save()
    return updated or filesupdated
def download_single(self, basefile, url):
    if self.get_parse_options(basefile) == "skip":
        raise errors.DocumentSkippedError("%s should not be downloaded according to options.py" %
                                          basefile)
    rdffilename = self.store.downloaded_path(basefile, attachment="index.rdf")
    if (self.get_parse_options(basefile) == "metadataonly" and
            os.path.exists(rdffilename) and (not self.config.refresh)):
        # it is kind of bad that we can even get here in these cases
        # (if a rdffile exists, and a empty index.pdf exists,
        # shouldn't download() skip that file? Right now it ignores
        # empty files and passes them to download_single.
        return False

    # url is really a 2-tuple
    url, title = url
    resp = self.session.get(url)
    soup = BeautifulSoup(resp.text, "lxml")
    pdflink = soup.find("a", href=re.compile(r".*\.pdf$"))
    pdfurl = pdflink.get("href")
    thumburl = urljoin(url, soup.find("img", "tumnagel").get("src"))
    librisid = url.rsplit("-")[1]
    rdfurl = "http://data.libris.kb.se/open/bib/%s.rdf" % librisid
    filename = self.store.downloaded_path(basefile)
    created = not os.path.exists(filename)
    updated = False
    # download rdf metadata before actual content
    try:
        # it appears that URLs like
        # http://data.libris.kb.se/open/bib/8351225.rdf now returns
        # empty responses. Until we find out the proper RDF endpoint
        # URLs, we should check and warn for this (and infer a
        # minimal RDF by hand from what we can, eg dc:title from the
        # link text)
        self.download_if_needed(rdfurl, basefile,
                                filename=rdffilename,
                                archive=False)
        if os.path.getsize(rdffilename) == 0:
            self.log.warning("%s: %s returned 0 response, infer RDF" %
                             (basefile, rdfurl))
            base = URIRef("http://libris.kb.se/resource/bib/%s" % librisid)
            fakegraph = Graph()
            fakegraph.bind("dc", str(DC))
            fakegraph.add((base, DC.title, Literal(title, lang="sv")))
            year = basefile.split(":")[0]  # Libris uses str type
            fakegraph.add((base, DC.date, Literal(year)))
            with open(rdffilename, "wb") as fp:
                fakegraph.serialize(fp, format="pretty-xml")
    except requests.exceptions.HTTPError as e:
        self.log.error("Failed to load attachment: %s" % e)
        raise

    if self.get_parse_options(basefile) == "metadataonly":
        self.log.debug("%s: Marked as 'metadataonly', not downloading actual PDF file" %
                       basefile)
        with self.store.open_downloaded(basefile, "w") as fp:
            pass
    else:
        if self.download_if_needed(pdfurl, basefile) or self.config.refresh:
            if created:
                self.log.info("%s: download OK from %s" % (basefile, pdfurl))
            else:
                self.log.info("%s: download OK (new version) from %s" % (basefile, pdfurl))
            updated = True
            try:
                self.download_if_needed(thumburl, basefile,
                                        filename=self.store.downloaded_path(
                                            basefile, attachment="thumb.jpg"))
            except requests.exceptions.HTTPError as e:
                self.log.error("Failed to load attachment: %s" % e)
                raise
        else:
            self.log.debug("%s: exists and is unchanged" % basefile)

    entry = DocumentEntry(self.store.documententry_path(basefile))
    now = datetime.now()
    entry.orig_url = url  # or pdfurl?
    if created:
        entry.orig_created = now
    if updated:
        entry.orig_updated = now
    entry.orig_checked = now
    entry.save()
    return updated
def download_single(self, basefile, url=None):
    if not url:
        url = self.remote_url(basefile)
        if not url:  # remote_url failed
            return
    filename = self.store.downloaded_path(basefile)  # just the html page
    updated = pdfupdated = False
    created = not os.path.exists(filename)
    if (not os.path.exists(filename) or self.config.force):
        existed = os.path.exists(filename)
        updated = self.download_if_needed(url, basefile, filename=filename)
        docid = url.split("/")[-1]
        if existed:
            if updated:
                self.log.debug("%s existed, but a new ver was downloaded" % filename)
            else:
                self.log.debug("%s is unchanged -- checking PDF files" % filename)
        else:
            self.log.debug("%s did not exist, so it was downloaded" % filename)

        soup = BeautifulSoup(codecs.open(filename, encoding=self.source_encoding))
        cnt = 0
        pdffiles = self.find_pdf_links(soup, basefile)
        if pdffiles:
            for pdffile in pdffiles:
                # note; the pdfurl goes to a redirect script; however
                # that part of the URL tree (/download/*) is
                # off-limits for robots. But we can figure out the
                # actual URL anyway!
                if len(docid) > 4:
                    path = "c6/%02d/%s/%s" % (int(docid[:-4]),
                                              docid[-4:-2],
                                              docid[-2:])
                else:
                    path = "c4/%02d/%s" % (int(docid[:-2]), docid[-2:])
                pdfurl = "http://www.regeringen.se/content/1/%s/%s" % (path, pdffile)
                pdffilename = self.store.downloaded_path(basefile, attachment=pdffile)
                if self.download_if_needed(pdfurl, basefile, filename=pdffilename):
                    pdfupdated = True
                    self.log.debug(" %s is new or updated" % pdffilename)
                else:
                    self.log.debug(" %s is unchanged" % pdffilename)
        else:
            self.log.warning("%s (%s) has no downloadable PDF files" % (basefile, url))
        if updated or pdfupdated:
            pass
        else:
            self.log.debug("%s and all PDF files are unchanged" % filename)
    else:
        self.log.debug("%s already exists" % (filename))

    entry = DocumentEntry(self.store.documententry_path(basefile))
    now = datetime.now()
    entry.orig_url = url
    if created:
        entry.orig_created = now
    if updated or pdfupdated:
        entry.orig_updated = now
    entry.orig_checked = now
    entry.save()
    return updated or pdfupdated
def wrapper(self, doc):
    # call the actual function that creates the doc data
    oldbasefile = doc.basefile
    ret = f(self, doc)
    if doc.basefile != oldbasefile:
        # means that basefile was adjusted. Touch the old parsed
        # path first so we don't regenerate.
        with self.store.open_parsed(oldbasefile, "w"):
            pass
        # move any intermediate files (in particular extracted
        # image backgrounds from PDF files) that might be
        # needed later.
        old_intermediate = self.store.intermediate_path(oldbasefile)
        new_intermediate = self.store.intermediate_path(doc.basefile)
        if self.store.storage_policy == "dir":
            old_intermediate = os.path.dirname(old_intermediate)
            new_intermediate = os.path.dirname(new_intermediate)
        if os.path.exists(old_intermediate) and not os.path.exists(new_intermediate):
            util.ensure_dir(new_intermediate)
            os.rename(old_intermediate, new_intermediate)

    # now render that doc data as files (JSON, XHTML, RDF/XML)
    if self.config.serializejson == True:
        with self.store.open_serialized(doc.basefile, "wb") as fp:
            r = serialize(doc, format="json")  # should be a (unicode) str
            fp.write(r.encode('utf-8'))
        self.log.debug("Created %s" % (self.store.serialized_path(doc.basefile)))

    # css file + background images + png renderings of text
    resources = self.create_external_resources(doc)
    if resources:
        cssuris = [cssuri(doc.uri, x) for x in resources if x.endswith(".css")]
    else:
        cssuris = []
    if cssuris:
        doc.cssuris = cssuris
    updated = self.render_xhtml(doc, self.store.parsed_path(doc.basefile,
                                                            version=doc.version))
    if updated:
        self.log.debug("Created %s" % (self.store.parsed_path(doc.basefile)))

    # Extract all triples on the XHTML/RDFa data to a separate
    # RDF/XML file
    distilled_graph = Graph()
    with codecs.open(self.store.parsed_path(doc.basefile, version=doc.version),
                     encoding="utf-8") as fp:  # unicode
        distilled_graph.parse(data=fp.read(), format="rdfa", publicID=doc.uri)
    # The act of parsing from RDFa binds a lot of namespaces in the
    # graph in an unnecessary manner. Particularly it binds both
    # 'dc' and 'dcterms' to 'http://purl.org/dc/terms/', which makes
    # serialization less than predictable. Blow these prefixes away.
    distilled_graph.bind("dc", URIRef("http://purl.org/dc/elements/1.1/"))
    distilled_graph.bind("dcterms",
                         URIRef("http://example.org/this-prefix-should-not-be-used"))

    util.ensure_dir(self.store.distilled_path(doc.basefile, version=doc.version))
    with open(self.store.distilled_path(doc.basefile, version=doc.version),
              "wb") as distilled_file:
        # print("============distilled===============")
        # print(distilled_graph.serialize(format="turtle").decode('utf-8'))
        distilled_graph.serialize(distilled_file, format="pretty-xml")
    self.log.debug('%s triples extracted to %s',
                   len(distilled_graph),
                   self.store.distilled_path(doc.basefile, version=doc.version))

    # Validate that all required triples are present (we check
    # distilled_graph, but we could just as well check doc.meta)
    required = sorted(set(self.get_required_predicates(doc)))
    for p in required:
        x = distilled_graph.value(URIRef(doc.uri), p)
        if not x:
            self.log.warning("Metadata is missing a %s triple" %
                             (distilled_graph.qname(p)))

    if 'validaterdfa' in self.config and self.config.validaterdfa:
        # Validate that all triples specified in doc.meta and any
        # .meta property on any body object is present in the
        # XHTML+RDFa file. NOTE: graph_diff has suddenly become
        # glacial on medium-large graphs (> ~1000 triples). Maybe we
        # don't have to validate them?
        huge_graph = False
        for g in iterate_graphs(doc.body):
            doc.meta += g
            if len(doc.meta) > 1000:
                huge_graph = True
                break
        if huge_graph:
            self.log.warning("Graph seems huge, skipping validation")
        else:
            # self.log.debug("diffing graphs")
            (in_both, in_first, in_second) = graph_diff(doc.meta, distilled_graph)
            self.log.debug("graphs diffed (-%s, +%s)" %
                           (len(in_first), len(in_second)))
            if in_first:  # original metadata not present in the XHTML file
                self.log.warning("%d triple(s) from the original metadata was "
                                 "not found in the serialized XHTML file:\n%s",
                                 len(in_first),
                                 in_first.serialize(format="n3").decode("utf-8"))

    # Validate that entry.title and entry.id has been filled (might
    # be from doc.meta and doc.uri, might be other things)
    entry = DocumentEntry(self.store.documententry_path(doc.basefile,
                                                        version=doc.version))
    if not entry.id:
        self.log.warning("entry.id missing")
    if not entry.title:
        self.log.warning("entry.title missing")
    return ret
def test_set_content(self):
    t = tempfile.mktemp()
    with open(t, "w") as f:
        f.write("<div>xhtml fragment</div>")

    d = DocumentEntry()
    d.set_content(t, "http://example.org/test", mimetype="xhtml", inline=True)
    # type must be either "text", "html", "xhtml" or a MIME media
    # type (RFC 4287, 4.1.3.1)
    self.assertEqual(d.content['type'], "xhtml")
    self.assertEqual(d.content['markup'], "<div>xhtml fragment</div>")
    self.assertIsNone(d.content['src'])

    d = DocumentEntry()
    d.set_content(t, "http://example.org/test", mimetype="xhtml")
    self.assertEqual(d.content['type'], "xhtml")
    self.assertIsNone(d.content['markup'])
    self.assertEqual(d.content['src'], "http://example.org/test")
    self.assertEqual(d.content['hash'], "md5:ca8d87b5cf6edbbe88f51d45926c9a8d")
    os.unlink(t)

    t = tempfile.mktemp()
    with open(t + ".pdf", "w") as f:
        f.write("This is not a real PDF file")

    d = DocumentEntry()
    d.set_content(t + ".pdf", "http://example.org/test")
    self.assertEqual(d.content['type'], "application/pdf")
    self.assertIsNone(d.content['markup'])
    self.assertEqual(d.content['src'], "http://example.org/test")
    self.assertEqual(d.content['hash'], "md5:0a461f0621ede53f1ea8471e34796b6f")

    d = DocumentEntry()
    with self.assertRaises(AssertionError):
        d.set_content(t + ".pdf", "http://example.org/test", inline=True)
    os.unlink(t + ".pdf")
def test_save(self):
    path = self.repo.store.documententry_path("123/x")
    d = DocumentEntry()
    d.title = StringIO("A file-like object, not a string")
    with self.assertRaises(TypeError):
        d.save(path=path)
def download_single(self, basefile, url=None):
    if url is None:
        url = self.remote_url(basefile)
        if not url:  # remote_url failed
            return
    updated = created = False
    checked = True
    mainattachment = None

    if url in self.urlmap:
        attachment = self.urlmap[url]
    else:
        attachment = self.sniff_attachment(url)
    if attachment:
        self.urlmap[url] = attachment
        attachment += ".html"
    else:
        self.urlmap[url] = ''
        attachment = "index.html"

    downloaded_path = self.store.downloaded_path(basefile, attachment=attachment)
    created = not os.path.exists(downloaded_path)
    if self.download_if_needed(url, basefile, filename=downloaded_path):
        text = util.readfile(downloaded_path)
        if "<div>Inga tr\xe4ffar</div>" in text:
            self.log.warning("%s: Could not find this prop at %s, might be a bug" %
                             (basefile, url))
            util.robust_remove(downloaded_path)
            return False
        if created:
            self.log.info("%s: download OK from %s" % (basefile, url))
        else:
            self.log.info("%s: download OK (new version) from %s" % (basefile, url))
        updated = True
    else:
        self.log.debug("%s: exists and is unchanged" % basefile)
        text = util.readfile(downloaded_path)

    soup = BeautifulSoup(text, "lxml")
    del text
    attachment = self.find_attachment(soup)

    extraurls = []
    results = soup.find("div", "search-results-content")
    a = results.find("a", string="Hämta Pdf")
    if a:
        extraurls.append(a.get("href"))
    a = results.find("a", string="Hämta Doc")
    if a:
        extraurls.append(a.get("href"))

    # parse downloaded html/text page and find out extraurls
    for url in extraurls:
        if url.endswith('get=doc'):
            # NOTE: We cannot be sure that this is actually a Word
            # (CDF) file. For older files it might be a WordPerfect
            # file (.wpd) or a RDF file, for newer it might be a
            # .docx. We cannot be sure until we've downloaded it.
            # So we quickly read the first 4 bytes
            r = requests.get(url, stream=True)
            sig = r.raw.read(4)
            # r.raw.close()
            #bodyidx = head.index("\n\n")
            #sig = head[bodyidx:bodyidx+4]
            if sig == b'\xffWPC':
                doctype = ".wpd"
            elif sig == b'\xd0\xcf\x11\xe0':
                doctype = ".doc"
            elif sig == b'PK\x03\x04':
                doctype = ".docx"
            elif sig == b'{\\rt':
                doctype = ".rtf"
            else:
                self.log.error("%s: Attached file has signature %r -- don't know what type this is" %
                               (basefile, sig))
                continue
        elif url.endswith('get=pdf'):
            doctype = ".pdf"
        else:
            self.log.warning("Unknown doc type %s" % url.split("get=")[-1])
            doctype = None
        if doctype:
            if attachment:
                filename = self.store.downloaded_path(basefile,
                                                      attachment=attachment + doctype)
            else:
                filename = self.store.downloaded_path(basefile,
                                                      attachment="index" + doctype)
            self.log.debug("%s: downloading attachment %s" % (basefile, filename))
            self.download_if_needed(url, basefile, filename=filename)

    entry = DocumentEntry(self.store.documententry_path(basefile))
    now = datetime.now()
    entry.orig_url = url
    if created:
        entry.orig_created = now
    if updated:
        entry.orig_updated = now
    if checked:
        entry.orig_checked = now
    entry.save()
    return updated
def importarchive(self, archivedir):
    """Imports downloaded data from an archive from legacy lagen.nu data.

    In particular, creates proper archive storage for older versions
    of each text.
    """
    current = archived = 0
    for f in util.list_dirs(archivedir, ".html"):
        if not f.startswith("downloaded/sfs"):  # sfst or sfsr
            continue
        for regex in self.templ:
            m = re.match(regex, f)
            if not m:
                continue
            if "vcheck" in m.groupdict():  # silently ignore
                break
            basefile = "%s:%s" % (m.group("byear"), m.group("bnum"))

            # need to look at the file to find out its version
            # text = t.extractfile(f).read(4000).decode("latin-1")
            text = open(f).read(4000).decode("latin-1")
            reader = TextReader(string=text)
            updated_to = self._find_uppdaterad_tom(basefile, reader=reader)

            if "vyear" in m.groupdict():  # this file is marked as
                                          # an archival version
                archived += 1
                version = updated_to

                if m.group("vyear") == "first":
                    pass
                else:
                    exp = "%s:%s" % (m.group("vyear"), m.group("vnum"))
                    if version != exp:
                        self.log.warning("%s: Expected %s, found %s" %
                                         (f, exp, version))
            else:
                version = None
                current += 1
                de = DocumentEntry()
                de.basefile = basefile
                de.id = self.canonical_uri(basefile, updated_to)
                # fudge timestamps best as we can
                de.orig_created = datetime.fromtimestamp(os.path.getctime(f))
                de.orig_updated = datetime.fromtimestamp(os.path.getmtime(f))
                de.orig_updated = datetime.now()
                de.orig_url = self.document_url_template % locals()
                de.published = datetime.now()
                de.url = self.generated_url(basefile)
                de.title = "SFS %s" % basefile
                # de.set_content()
                # de.set_link()
                de.save(self.store.documententry_path(basefile))

            # this yields more reasonable basefiles, but they are not
            # backwards compatible -- skip them for now
            # basefile = basefile.replace("_", "").replace(".", "")
            if "type" in m.groupdict() and m.group("type") == "sfsr":
                dest = self.store.register_path(basefile)
                current -= 1  # to offset the previous increment
            else:
                dest = self.store.downloaded_path(basefile, version)
            self.log.debug("%s: extracting %s to %s" % (basefile, f, dest))
            util.ensure_dir(dest)
            shutil.copy2(f, dest)
            break
        else:
            self.log.warning("Couldn't process %s" % f)
    self.log.info("Extracted %s current versions and %s archived versions" %
                  (current, archived))
def download_single(self, basefile, url=None):
    if url is None:
        url = self.remote_url(basefile)
        if not url:  # remote_url failed
            return
    updated = created = False
    checked = True
    mainattachment = None

    if url in self.urlmap:
        attachment = self.urlmap[url]
    else:
        attachment = self.sniff_attachment(url)
    if attachment:
        self.urlmap[url] = attachment
        attachment += ".html"
    else:
        self.urlmap[url] = ''
        attachment = "index.html"

    downloaded_path = self.store.downloaded_path(basefile, attachment=attachment)
    created = not os.path.exists(downloaded_path)
    if self.download_if_needed(url, basefile, filename=downloaded_path):
        text = util.readfile(downloaded_path)
        if "<div>Inga tr\xe4ffar</div>" in text:
            self.log.warning("%s: Could not find this prop at %s, might be a bug" %
                             (basefile, url))
            util.robust_remove(downloaded_path)
            return False
        if created:
            self.log.info("%s: downloaded from %s" % (basefile, url))
        else:
            self.log.info("%s: downloaded new version from %s" % (basefile, url))
        updated = True
    else:
        self.log.debug("%s: exists and is unchanged" % basefile)
        text = util.readfile(downloaded_path)

    soup = BeautifulSoup(text, "lxml")
    del text
    attachment = self.find_attachment(soup)

    extraurls = []
    results = soup.find("div", "search-results-content")
    a = results.find("a", string="Hämta Pdf")
    if a:
        extraurls.append(a.get("href"))
    a = results.find("a", string="Hämta Doc")
    if a:
        extraurls.append(a.get("href"))

    # parse downloaded html/text page and find out extraurls
    for url in extraurls:
        if url.endswith('get=doc'):
            # NOTE: We cannot be sure that this is actually a Word
            # (CDF) file. For older files it might be a WordPerfect
            # file (.wpd) or a RDF file, for newer it might be a
            # .docx. We cannot be sure until we've downloaded it.
            # So we quickly read the first 4 bytes
            r = requests.get(url, stream=True)
            sig = r.raw.read(4)
            # r.raw.close()
            #bodyidx = head.index("\n\n")
            #sig = head[bodyidx:bodyidx+4]
            if sig == b'\xffWPC':
                doctype = ".wpd"
            elif sig == b'\xd0\xcf\x11\xe0':
                doctype = ".doc"
            elif sig == b'PK\x03\x04':
                doctype = ".docx"
            elif sig == b'{\\rt':
                doctype = ".rtf"
            else:
                self.log.error("%s: Attached file has signature %r -- don't know what type this is" %
                               (basefile, sig))
                continue
        elif url.endswith('get=pdf'):
            doctype = ".pdf"
        else:
            self.log.warning("Unknown doc type %s" % url.split("get=")[-1])
            doctype = None
        if doctype:
            if attachment:
                filename = self.store.downloaded_path(basefile,
                                                      attachment=attachment + doctype)
            else:
                filename = self.store.downloaded_path(basefile,
                                                      attachment="index" + doctype)
            self.log.debug("%s: downloading attachment %s" % (basefile, filename))
            self.download_if_needed(url, basefile, filename=filename)

    entry = DocumentEntry(self.store.documententry_path(basefile))
    now = datetime.now()
    entry.orig_url = url
    if created:
        entry.orig_created = now
    if updated:
        entry.orig_updated = now
    if checked:
        entry.orig_checked = now
    entry.save()
    return updated