Example #1
    def parse(self, doc):
        head, body = util.readfile(self.store.downloaded_path(
            doc.basefile)).split("\n\n", 1)
        datestr, timestr, title = head.split(" ", 2)
        published = datetime.strptime("%s %s" % (datestr, timestr),
                                      "%Y-%m-%d %H:%M:%S")

        doc.meta.add((URIRef(doc.uri), RDF.type, self.rdf_type))
        doc.meta.add((URIRef(doc.uri), DCTERMS.issued, Literal(published)))
        doc.meta.add(
            (URIRef(doc.uri), DCTERMS.title, Literal(title, lang=doc.lang)))
        soup = bs4.BeautifulSoup(
            "<div class='sitenews-item'>" + body + "</div>", "lxml")
        doc.body = elements_from_soup(soup.body)
        # move timestamp into dcterms:issued, title into dcterms:title
        # parse body with elements_from_soup
        # set first real para as dcterms:abstract (XMLLiteral)
        doc.body[0][0] = Div([doc.body[0][0]],
                             datatype="rdf:XMLLiteral",
                             property="dcterms:abstract")

        # but we need to add it to doc.meta RIGHT AWAY because of reasons...
        doc.meta.add((URIRef(doc.uri), DCTERMS.abstract,
                      Literal(body.split("\n\n")[0], datatype=RDF.XMLLiteral)))
        self.parse_entry_update(
            doc)  # need to set published and possibly updated
        entry = DocumentEntry(self.store.documententry_path(doc.basefile))
        entry.published = published
        entry.save()
        return True
Example #2
    def parse(self, doc):
        head, body = util.readfile(self.store.downloaded_path(doc.basefile)).split("\n\n", 1)
        datestr, timestr, title = head.split(" ", 2)
        published = datetime.strptime("%s %s" % (datestr, timestr), "%Y-%m-%d %H:%M:%S")

        doc.meta.add((URIRef(doc.uri), RDF.type, self.rdf_type))
        doc.meta.add((URIRef(doc.uri), DCTERMS.issued, Literal(published)))
        doc.meta.add((URIRef(doc.uri), DCTERMS.title, Literal(title, lang=doc.lang)))
        soup = bs4.BeautifulSoup("<div class='sitenews-item'>"+body+"</div>", "lxml")
        doc.body = elements_from_soup(soup.body)
        # move timestamp into dcterms:issued, title into dcterms:title
        # parse body with elements_from_soup
        # set first real para as dcterms:abstract (XMLLiteral)
        doc.body[0][0] = Div([doc.body[0][0]],
                          datatype="rdf:XMLLiteral",
                          property="dcterms:abstract")

        # but we need to add it to doc.meta RIGHT AWAY because of reasons...
        doc.meta.add((URIRef(doc.uri), DCTERMS.abstract,
                      Literal(body.split("\n\n")[0], datatype=RDF.XMLLiteral)))
        self.parse_entry_update(doc) # need to set published and possibly updated
        entry = DocumentEntry(self.store.documententry_path(doc.basefile))
        entry.published = published
        entry.save()
        return True
Example #3
 def test_guess_type(self):
     d = DocumentEntry()
     self.assertEqual(d.guess_type("test.pdf"),  "application/pdf")
     self.assertEqual(d.guess_type("test.rdf"),  "application/rdf+xml")
     self.assertEqual(d.guess_type("test.html"), "text/html")
     self.assertEqual(d.guess_type("test.xhtml"),"application/html+xml")
     self.assertEqual(d.guess_type("test.bin"),  "application/octet-stream")
Example #4
 def test_write_atom_inline(self):
     self.repo.faceted_data = Mock(return_value=self.faceted_data)
     for basefile in range(25):
         de = DocumentEntry(self.repo.store.documententry_path(str(basefile)))
         util.writefile(self.repo.store.parsed_path(str(basefile)),
                        "<html><p>Document #%s</p></html>" % basefile)
         de.set_content(self.repo.store.parsed_path(str(basefile)),
                        self.repo.canonical_uri(str(basefile)),
                        inline=True)
         de.save()
     unsorted_entries = self.repo.news_facet_entries()
     entries = sorted(list(unsorted_entries),
                      key=itemgetter('updated'), reverse=True)
     self.repo.news_write_atom(entries,
                               'New and updated documents',
                               'main',
                               archivesize=6)
     tree = etree.parse('%s/base/feed/main.atom' % self.datadir)
     NS = "{http://www.w3.org/2005/Atom}"
     content = tree.find(".//"+NS+"content")
     self.assertIsNotNone(content)
     self.assertIsNone(content.get("src"))
     self.assertIsNone(content.get("hash"))
     self.assertEqual(content.get("type"), "xhtml")
     self.assertEqualXML(etree.tostring(content[0]),
                           '<html xmlns="http://www.w3.org/2005/Atom" xmlns:le="http://purl.org/atompub/link-extensions/1.0"><p>Document #24</p></html>')
Example #5
    def test_save(self):
        path = self.repo.store.documententry_path("123/a")
        d = DocumentEntry()
        d.orig_checked = datetime(2013,3,27,20,46,37)
        d.orig_url = 'http://source.example.org/doc/123/a'
        d.save(path=path)

        self.maxDiff = None
        self.assertEqual(self.d2u(util.readfile(path)), self.basic_json)
Example #6
    def test_incomplete_entries(self):
        self.repo.faceted_data = Mock(return_value=self.faceted_data)

        # make our entries incomplete in various ways
        entry = DocumentEntry(self.repo.store.documententry_path("1"))
        entry.published = None
        entry.save()

        # try very hard to remove title from everywhere
        entry = DocumentEntry(self.repo.store.documententry_path("2"))
        del entry.title
        entry.save()
        g = rdflib.Graph().parse(self.repo.store.distilled_path("2"))
        g.remove((rdflib.URIRef("http://localhost:8000/res/base/2"),
                  self.repo.ns['dcterms'].title, rdflib.Literal("Doc #2")))
        with open(self.repo.store.distilled_path("2"), "wb") as fp:
            g.serialize(fp, format="pretty-xml")

        os.unlink(self.repo.store.distilled_path("3"))

        # entries w/o published date and w/o distilled file should not
        # be published, but w/o title is OK
        with silence():  # avoid warnings about stale entry files
            # since the downloaded and intermediate file
            # is missing, which would exist in a real
            # scenario
            self.assertEqual(len(list(self.repo.news_entries())), 23)

            # also make sure that corresponding faceted_entries do not
            # show these non-published entries
            self.assertEqual(len(self.repo.news_facet_entries()), 23)
Example #7
    def test_set_link(self):
        t = tempfile.mktemp()
        with open(t+".html","w") as f:
             f.write("<div>xhtml fragment</div>")

        d = DocumentEntry()
        d.set_link(t+".html", "http://example.org/test")
        self.assertEqual(d.link['href'],"http://example.org/test")
        self.assertEqual(d.link['type'], "text/html")
        self.assertEqual(d.link['length'],25)
        self.assertEqual(d.link['hash'],"md5:ca8d87b5cf6edbbe88f51d45926c9a8d")
Example #9
    def setUp(self):
        super(News, self).setUp()
        self.faceted_data = []
        # create a bunch of DocumentEntry objects and save them
        basetime = datetime(2013, 1, 1, 12, 0)
        for basefile in range(25):
            v = {'id':self.repo.canonical_uri(basefile),
                 'title':"Doc #%s" % basefile}
            self.faceted_data.append({'uri': v['id'],
                                      'dcterms_title': v['title'],
                                      'rdf_type': 'http://xmlns.com/foaf/0.1/Document'})
            de = DocumentEntry()
            de.orig_created = basetime + timedelta(hours=basefile)
            de.orig_updated = basetime + timedelta(hours=basefile, minutes=10)
            de.orig_checked = basetime + timedelta(hours=basefile, minutes=20)
            de.published    = basetime + timedelta(hours=basefile, minutes=30)
            de.updated      = basetime + timedelta(hours=basefile, minutes=40)
            de.orig_url     = "http://source.example.org/doc/%s" % basefile
            de.title        = v['title']
            de.save(self.repo.store.documententry_path(str(basefile)))

            g = rdflib.Graph()
            desc = Describer(g, self.repo.canonical_uri(basefile))
            dcterms = self.repo.ns['dcterms']
            desc.rdftype(self.repo.ns['foaf'].Document)
            desc.value(dcterms.title, "Invalid title")
            util.ensure_dir(self.repo.store.distilled_path(str(basefile)))
            with open(self.repo.store.distilled_path(str(basefile)), "wb") as fp:
                g.serialize(fp, format="pretty-xml")

            util.ensure_dir(self.repo.store.parsed_path(str(basefile)))
            with open(self.repo.store.parsed_path(str(basefile)), "w") as fp:
                fp.write("""<?xml version='1.0' encoding='utf-8'?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML+RDFa 1.0//EN" "http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:dcterms="http://purl.org/dc/terms/" xml:lang="en">
  <head about="%(id)s">
    <title>%(title)s</title>
  </head>
  <body about="%(id)s">
    <h1>%(title)s</h1>
  </body>
</html>""" % v)

            util.ensure_dir(self.repo.store.generated_path(str(basefile)))
            with open(self.repo.store.generated_path(str(basefile)), "w") as fp:
                fp.write("""<!DOCTYPE html>
<html>
  <head>
    <title>%(title)s</title>
  </head>
  <body>
    <h1>%(title)s</h1>
  </body>
</html>""" % v)
Example #10
    def test_init(self):
        d = DocumentEntry()
        self.assertIsNone(d.id) # same for .updated, .published,
                                # .title, .summary, .url and .content
        self.assertEqual(d.content, {})
        self.assertEqual(d.link,   {})

        path = self.repo.store.documententry_path("123/b")
        d = DocumentEntry(path=path)
        self.assertIsNone(d.id) # same for .updated, .published,
                                # .title, .summary, .url and .content
        self.assertEqual(d.content, {})
        self.assertEqual(d.link,   {})
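The test above pins down DocumentEntry's defaults: whether or not a path is given, a fresh entry has id (and updated, published, title, summary, url) unset, and content and link start out as empty dicts. For orientation, here is a minimal sketch of the save/reload round-trip that the other tests build on; it assumes ferenda is installed and exports DocumentEntry at the package level, and the path and field values are made up for illustration:

    import os
    from datetime import datetime
    from ferenda import DocumentEntry  # assumed top-level export

    path = "/tmp/entries/123-a.json"   # hypothetical entry file
    os.makedirs(os.path.dirname(path), exist_ok=True)

    d = DocumentEntry()
    d.id = "http://example.org/123/a"
    d.title = "Doc 123/a"
    d.published = datetime(2013, 3, 27, 20, 46, 37)
    d.orig_url = "http://source.example.org/doc/123/a"
    d.save(path=path)                  # persists the entry as JSON

    # constructing with a path to an existing file loads the saved fields back
    d2 = DocumentEntry(path=path)
    assert d2.id == "http://example.org/123/a"
    assert d2.orig_url == "http://source.example.org/doc/123/a"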
Example #11
 def download_single(self, basefile, url=None):
     if not url:
         entry = DocumentEntry(self.store.documententry_path(basefile))
         url = entry.orig_url
     xml_downloaded_path = self.store.downloaded_path(basefile).replace(".pdf", ".xml")
     if self.get_parse_options(basefile) == "metadataonly":
         # in these cases, to save space, get
         # the smaller XML OCR data, not the
         # actual scanned images-in-PDF
         url = url.replace(".pdf", ".xml").replace("pdf/web", "xml")
         # make store.downloaded_path return .xml suffixes (and set
         # the timestamp to the beginning of epoch so that the
         # resulting if-modified-since header doesn't contain the
          # current date/time)
         if not os.path.exists(xml_downloaded_path):
             util.writefile(xml_downloaded_path, "")
             os.utime(xml_downloaded_path, (0,0))
     else:
         # if parse options have changed from metadataonly to
          # default, there will be an xml file lying about which will
         # make downloaded_path return its name. Remove it so that
         # we don't end up with pdf files that have a .xml
         # extension.
         if os.path.exists(xml_downloaded_path):
             os.unlink(xml_downloaded_path)
     return super(PropKB, self).download_single(basefile, url)
Example #12
 def infer_metadata(self, resource, basefile):
     super(InferTimes, self).infer_metadata(resource, basefile)
     desc = Describer(resource.graph, resource.identifier)
     de = DocumentEntry(self.store.documententry_path(basefile))
     if de.orig_updated:
         desc.value(RINFOEX.senastHamtad, de.orig_updated)
     if de.orig_checked:
         desc.value(RINFOEX.senastKontrollerad, de.orig_checked)
Example #13
 def source_url(self, basefile):
     # this source does not have any predictable URLs, so we try to
     # find if we made a note on the URL when we ran download()
     # FIXME: This code is repeated in jk.py and regeringen.py --
     # maybe we should let the default impl of source_url try this
     # strategy if eg self.remote_url is None?
     entry = DocumentEntry(self.store.documententry_path(basefile))
     return entry.orig_url
Example #14
 def trim_documententry(basefile):
     # if the path (typically for the distilled or
     # parsed file) is a 0-size file, the following
     # steps should not be carried out. But since
     # they at some point might have done that
     # anyway, we're left with a bunch of stale
     # error reports in the entry files. As a
     # one-time-thing, try to blank out irrelevant
     # sections.
     entry = DocumentEntry(self.documententry_path(basefile))
     sections = {'parse': ['parse', 'relate', 'generate'],
                 'relate': ['relate', 'generate'],
                 'generate': ['generate']}.get(action, {})
     for section in sections:
         if section in entry.status:
             del entry.status[section]
     entry.save()
Example #15
    def download_single(self, basefile, url):
        updated = False
        created = False
        filename = self.store.downloaded_path(basefile)
        created = not os.path.exists(filename)
        # util.print_open_fds()
        if self.download_if_needed(url, basefile):
            if created:
                self.log.info("%s: downloaded from %s" % (basefile, url))
            else:
                self.log.info(
                    "%s: downloaded new version from %s" % (basefile, url))
            updated = True
        else:
            self.log.debug("%s: exists and is unchanged" % basefile)
        soup = BeautifulSoup(util.readfile(filename), "lxml")
        for pdflink in soup.find_all("a", href=re.compile(r"\.pdf$")):
            slug =  "-".join(pdflink["href"].rsplit("/")[-2:])
            attachment_path = self.store.downloaded_path(basefile, attachment=slug)
            self.download_if_needed(urljoin(url, pdflink["href"]), basefile, filename=attachment_path)
        vm = soup.find("a", text="Visa Varumärke")
        if vm:
            attachment_path = self.store.downloaded_path(basefile, attachment="varumarke.jpg")
            attachment_url = re.search("http[^'\"]*", vm["href"]).group(0)
            self.download_if_needed(attachment_url, basefile, filename=attachment_path)

        entry = DocumentEntry(self.store.documententry_path(basefile))
        now = datetime.now()
        entry.orig_url = url
        if created:
            entry.orig_created = now
        if updated:
            entry.orig_updated = now
        entry.orig_checked = now
        entry.save()
Example #16
 def create_entry(self, basefile, timestampoffset=0):
      # create an entry file with indexed_{ft,ts,dep} set to the
      # current time with an optional offset. Also set
      # .status['generate']['date'], to test needed(...,
      # 'transformlinks')
     de = DocumentEntry(self.store.documententry_path(basefile))
     delta = timedelta(seconds=timestampoffset)
     ts = datetime.now() + delta
     de.indexed_ts = ts
     de.indexed_ft = ts
     de.indexed_dep = ts
     de.updated = ts
     de.status['generate'] = {'date': ts}
     de.save()
Example #17
    def test_incomplete_entries(self):
        self.repo.faceted_data = Mock(return_value=self.faceted_data)

        # make our entries incomplete in various ways
        entry = DocumentEntry(self.repo.store.documententry_path("1"))
        entry.published = None
        entry.save()

        # try very hard to remove title from everywhere
        entry = DocumentEntry(self.repo.store.documententry_path("2"))
        del entry.title
        entry.save()
        g = rdflib.Graph().parse(self.repo.store.distilled_path("2"))
        g.remove((rdflib.URIRef("http://localhost:8000/res/base/2"),
                  self.repo.ns['dcterms'].title,
                  rdflib.Literal("Doc #2")))
        with open(self.repo.store.distilled_path("2"), "wb") as fp:
            g.serialize(fp, format="pretty-xml")

        os.unlink(self.repo.store.distilled_path("3"))

        # entries w/o published date and w/o distilled file should not
        # be published, but w/o title is OK
        with silence():  # avoid warnings about stale entry files
                         # since the downloaded and intermediate file
                         # is missing, which would exist in a real
                         # scenario
            self.assertEqual(len(list(self.repo.news_entries())),
                             23)

            # also make sure that corresponding faceted_entries do not
            # show these non-published entries
            self.assertEqual(len(self.repo.news_facet_entries()), 23)
Example #18
 def inner_wrapper(self, *args, **kwargs):
     # try to find out if we have a basefile
     if args and args[0]:
         entrypath_arg = args[0]
     else:
         args = ()
         entrypath_arg = ".root"
     entrypath = self.store.documententry_path
     args = [self] + list(args)
     return DocumentEntry.updateentry(f, section, entrypath, entrypath_arg, None, *args, **kwargs)
Example #19
 def trim_documententry(basefile):
     # if the path (typically for the distilled or
     # parsed file) is a 0-size file, the following
     # steps should not be carried out. But since
     # they at some point might have done that
     # anyway, we're left with a bunch of stale
     # error reports in the entry files. As a
     # one-time-thing, try to blank out irrelevant
     # sections.
     entry = DocumentEntry(self.documententry_path(basefile))
     sections = {
         'parse': ['parse', 'relate', 'generate'],
         'relate': ['relate', 'generate'],
         'generate': ['generate']
     }.get(action, {})
     for section in sections:
         if section in entry.status:
             del entry.status[section]
     entry.save()
Example #20
 def inner_wrapper(self, *args, **kwargs):
     if args and args[0]:
         # try to find out if we have a basefile
         basefile = args[0]
     else:
         basefile = ".root"
         args = ()
     entrypath = self.store.documententry_path(basefile)
     args = [self] + list(args)
     return DocumentEntry.updateentry(f, section, entrypath, *args,
                                      **kwargs)
Example #21
 def test_load(self):
     path = self.repo.store.documententry_path("123/a")
     util.ensure_dir(path)
     with open(path, "w") as fp:
         fp.write(self.basic_json)
     d = DocumentEntry(path=path)
     self.assertEqual(d.orig_checked, datetime(2013,3,27,20,46,37))
     self.assertIsNone(d.orig_updated)
     self.assertEqual(d.orig_url,'http://source.example.org/doc/123/a')
     self.assertEqual(d.id,'http://example.org/123/a')
     self.assertEqual('<DocumentEntry id=http://example.org/123/a>', repr(d))
Example #23
    def test_ifneeded_relate(self):
        @ifneeded("relate")
        def testfunc(repo, basefile, needed):
            repo.called = True
            repo.needed = needed

        try:
            datadir = tempfile.mkdtemp()
            mockbasefile = "1234"
            mockrepo = Mock()
            mockrepo.store = DocumentStore(datadir=datadir)
            mockrepo.called = False
            mockrepo.config.force = False

            # create some docentry file in a good place
            de = DocumentEntry(mockrepo.store.documententry_path("1234"))
            now = datetime.datetime.now()
            de.indexed_ts = now + datetime.timedelta(seconds=3600)
            de.indexed_ft = now + datetime.timedelta(seconds=-3600)
            de.indexed_dep = now + datetime.timedelta(seconds=-3600)
            de.save()

            # test 1: Outfile is newer - the ifneeded decorator should
            # make sure the actual testfunc code is never reached

            # NOTE: the "relate" branch of DocumentStore.needed
            # doesn't use outfile_is_newer, so we can't patch that, we
            # have to create actual files
            parsedpath = mockrepo.store.parsed_path("1234")
            util.writefile(parsedpath, "dummy")
            os.utime(parsedpath, (now.timestamp(), now.timestamp() - 7200))
            testfunc(mockrepo, mockbasefile)
            self.assertFalse(mockrepo.called)
            mockrepo.called = False

            # test 2: Outfile is older than the information in the documententry file
            os.utime(parsedpath, (now.timestamp(), now.timestamp()))
            testfunc(mockrepo, mockbasefile)
            self.assertTrue(mockrepo.called)
            self.assertTrue(mockrepo.needed)
            self.assertFalse(mockrepo.needed.triples)
            self.assertFalse(mockrepo.needed.dependencies)
            self.assertTrue(mockrepo.needed.fulltext)

            mockrepo.called = False
            # test 3: Outfile is newer, but the global force option was set
            os.utime(parsedpath, (now.timestamp(), now.timestamp() - 7200))
            mockrepo.config.force = True
            testfunc(mockrepo, mockbasefile)
            self.assertTrue(mockrepo.called)
            mockrepo.config.force = None
            mockrepo.called = False
        finally:
            if os.path.exists(datadir):
                shutil.rmtree(datadir)
Example #24
 def test_write_atom_inline(self):
     self.repo.faceted_data = Mock(return_value=self.faceted_data)
     for basefile in range(25):
         de = DocumentEntry(
             self.repo.store.documententry_path(str(basefile)))
         util.writefile(self.repo.store.parsed_path(str(basefile)),
                        "<html><p>Document #%s</p></html>" % basefile)
         de.set_content(self.repo.store.parsed_path(str(basefile)),
                        self.repo.canonical_uri(str(basefile)),
                        inline=True)
         de.save()
     unsorted_entries = self.repo.news_facet_entries()
     entries = sorted(list(unsorted_entries),
                      key=itemgetter('updated'),
                      reverse=True)
     self.repo.news_write_atom(entries,
                               'New and updated documents',
                               'main',
                               archivesize=6)
     tree = etree.parse('%s/base/feed/main.atom' % self.datadir)
     NS = "{http://www.w3.org/2005/Atom}"
     content = tree.find(".//" + NS + "content")
     self.assertIsNotNone(content)
     self.assertIsNone(content.get("src"))
     self.assertIsNone(content.get("hash"))
     self.assertEqual(content.get("type"), "xhtml")
     self.assertEqualXML(
         etree.tostring(content[0]),
         '<html xmlns="http://www.w3.org/2005/Atom" xmlns:le="http://purl.org/atompub/link-extensions/1.0"><p>Document #24</p></html>'
     )
Example #27
    def test_modify(self):
        path = self.repo.store.documententry_path("123/a")
        util.ensure_dir(path)
        with open(path, "w") as fp:
            fp.write(self.basic_json)

        d = DocumentEntry(path=path)
        d.orig_updated = datetime(2013, 3, 27, 20, 59, 42, 325067)
        d.id = "http://example.org/123/a"
        # do this in setUp?
        with open(self.datadir+"/xhtml","w") as f:
            f.write("<div>xhtml fragment</div>")

        d.set_content(self.datadir+"/xhtml", "http://example.org/test",
                      mimetype="xhtml", inline=True)
        d.save()
        self.assertEqual(self.d2u(util.readfile(path)), self.modified_json)
Example #28
 def test_load_status(self):
     path = self.repo.store.documententry_path("123/a")
     util.ensure_dir(path)
     with open(path, "w") as fp:
         fp.write(self.status_json)
     d = DocumentEntry(path=path)
     self.assertEqual(datetime(2018, 8, 14, 18, 15, 00),
                      d.status['download']['date'])
     self.assertEqual(datetime(2018, 8, 14, 18, 16, 00),
                      d.status['parse']['date'])
     self.assertEqual(datetime(2018, 8, 14, 18, 17, 00),
                      d.status['relate']['date'])
     self.assertEqual(datetime(2018, 8, 14, 18, 18, 00),
                      d.status['generate']['date'])
     self.assertEqual("2018-08-14T18:18:00",
                      d.status['generate']['not_a_date'])
Example #29
    def test_ifneeded_relate(self):
        @ifneeded("relate")
        def testfunc(repo, basefile, needed):
            repo.called = True
            repo.needed = needed

        try:
            datadir = tempfile.mkdtemp()
            mockbasefile = "1234"
            mockrepo = Mock()
            mockrepo.store = DocumentStore(datadir=datadir)
            mockrepo.called = False
            mockrepo.config.force = False

            # create some docentry file in a good place
            de = DocumentEntry(mockrepo.store.documententry_path("1234"))
            now = datetime.datetime.now()
            de.indexed_ts = now + datetime.timedelta(seconds=3600)
            de.indexed_ft = now + datetime.timedelta(seconds=-3600)
            de.indexed_dep = now + datetime.timedelta(seconds=-3600)
            de.save()

            # test 1: Outfile is newer - the ifneeded decorator should
            # make sure the actual testfunc code is never reached

            # NOTE: the "relate" branch of DocumentStore.needed
            # doesn't use outfile_is_newer, so we can't patch that, we
            # have to create actual files
            parsedpath = mockrepo.store.parsed_path("1234")
            util.writefile(parsedpath,  "dummy")
            os.utime(parsedpath, (now.timestamp(), now.timestamp() - 7200))
            testfunc(mockrepo, mockbasefile)
            self.assertFalse(mockrepo.called)
            mockrepo.called = False

            # test 2: Outfile is older than the information in the documententry file
            os.utime(parsedpath, (now.timestamp(), now.timestamp()))
            testfunc(mockrepo, mockbasefile)
            self.assertTrue(mockrepo.called)
            self.assertTrue(mockrepo.needed)
            self.assertFalse(mockrepo.needed.triples)
            self.assertFalse(mockrepo.needed.dependencies)
            self.assertTrue(mockrepo.needed.fulltext)
            
            mockrepo.called = False
            # test 3: Outfile is newer, but the global force option was set
            os.utime(parsedpath, (now.timestamp(), now.timestamp() - 7200))
            mockrepo.config.force = True
            testfunc(mockrepo, mockbasefile)
            self.assertTrue(mockrepo.called)
            mockrepo.config.force = None
            mockrepo.called = False
        finally:
            if os.path.exists(datadir):
                shutil.rmtree(datadir)
Example #30
    def download(self, basefile=None, url=None):
        if basefile:
            if not url:
                entry = DocumentEntry(self.store.documententry_path(basefile))
                url = entry.orig_url
            if url:
                return self.download_single(basefile, url)
            else:
                raise DownloadError(
                    "%s doesn't support downloading single basefiles w/o page URL"
                    % self.__class__.__name__)
        params = {
            'filterType': 'Taxonomy',
            'filterByType': 'FilterablePageBase',
            'preFilteredCategories': '1324',
            'rootPageReference': '0',
            'filteredContentCategories': self.document_type
        }
        if 'lastdownload' in self.config and not self.config.refresh:
            params['fromDate'] = self.config.lastdownload.strftime("%Y-%m-%d")
        # temporary test -- useful when troubleshooting behaviour related to malformed entries in the search result list
        # params['fromDate'] = "2009-05-13"
        # params['toDate']   = "2009-05-20"

        self.log.debug("Loading documents starting from %s" %
                       params.get('fromDate', "the beginning"))
        try:
            for basefile, url in self.download_get_basefiles(params):
                try:
                    # sleep(0.5)  # regeringen.se has a tendency to throw 400 errors, maybe because we're too quick?
                    self.download_single(basefile, url)
                except requests.exceptions.HTTPError as e:
                    if self.download_accept_404 and e.response.status_code == 404:
                        self.log.error("%s: %s %s" % (basefile, url, e))
                        ret = False
                    else:
                        raise e
        finally:
            urlmap_path = self.store.path("urls",
                                          "downloaded",
                                          ".map",
                                          storage_policy="file")
            util.ensure_dir(urlmap_path)
            with codecs.open(urlmap_path, "w", encoding="utf-8") as fp:
                for url, identifier in self.urlmap.items():
                    fp.write("%s\t%s\n" % (url, identifier))
Example #31
    def infer_metadata(self, resource, basefile):
        # remove the bogus dcterms:issued thing that we only added to
        # aid URI generation. NB: This is removed in the superclass'
        # postprocess_doc as well, because for this lagen.nu-derived
        # class it needs to be done at this point, but for use of the
        # superclass directly, it needs to be done at some point.
        for o in resource.objects(DCTERMS.issued):
            if not o.datatype:
                resource.remove(DCTERMS.issued, o)
        sameas_uri = self.sameas_minter.space.coin_uri(resource)
        resource.add(OWL.sameAs, URIRef(sameas_uri))
        resource.graph.add((URIRef(self.canonical_uri(basefile, True)),
                            OWL.sameAs, resource.identifier))
        # then find each rpubl:konsolideringsunderlag, and create
        # owl:sameas for them as well
        for subresource in resource.objects(RPUBL.konsolideringsunderlag):
            # sometimes there'll be a rpubl:konsolideringsunderlag to
            # a resource URI but no actual data about that
            # resource. This seems to happen if SFST is updated but
            # SFSR is not. In those cases we can't generate a
            # owl:sameAs URI since we have no other data about the
            # resource.
            if subresource.value(RDF.type):
                uri = self.sameas_minter.space.coin_uri(subresource)
                subresource.add(OWL.sameAs, URIRef(uri))
        desc = Describer(resource.graph, resource.identifier)
        de = DocumentEntry(self.store.documententry_path(basefile))
        if de.orig_updated:
            desc.value(RINFOEX.senastHamtad, de.orig_updated)
        if de.orig_checked:
            desc.value(RINFOEX.senastKontrollerad, de.orig_checked)
        rooturi = URIRef(desc.getrel(RPUBL.konsoliderar))

        v = self.commondata.value(rooturi, DCTERMS.alternate, any=True)
        if v:
            desc.value(DCTERMS.alternate, v)
        v = self.commondata.value(rooturi, RDFS.label, any=True)
        if v:
            # don't include labels if they're essentially the same as
            # dcterms:title (legalref needs it to be able to parse
            # refs to laws that typically don't include SFS numbers,
            # so that's why they're in sfs.ttl)
            basetitle = str(resource.value(DCTERMS.title)).rsplit(" (")[0]
            if not v.startswith(basetitle.lower()):
                desc.value(RDFS.label, util.ucfirst(v))
Example #32
    def download_single(self, basefile, url):
        updated = False
        created = False
        filename = self.store.downloaded_path(basefile)
        created = not os.path.exists(filename)
        # util.print_open_fds()
        if self.download_if_needed(url, basefile):
            if created:
                self.log.info("%s: downloaded from %s" % (basefile, url))
            else:
                self.log.info("%s: downloaded new version from %s" %
                              (basefile, url))
            updated = True
        else:
            self.log.debug("%s: exists and is unchanged" % basefile)
        soup = BeautifulSoup(util.readfile(filename), "lxml")
        for pdflink in soup.find_all("a", href=re.compile(r"\.pdf$")):
            slug = "-".join(pdflink["href"].rsplit("/")[-2:])
            attachment_path = self.store.downloaded_path(basefile,
                                                         attachment=slug)
            self.download_if_needed(urljoin(url, pdflink["href"]),
                                    basefile,
                                    filename=attachment_path)
        vm = soup.find("a", text="Visa Varumärke")
        if vm:
            attachment_path = self.store.downloaded_path(
                basefile, attachment="varumarke.jpg")
            attachment_url = re.search("http[^'\"]*", vm["href"]).group(0)
            self.download_if_needed(attachment_url,
                                    basefile,
                                    filename=attachment_path)

        entry = DocumentEntry(self.store.documententry_path(basefile))
        now = datetime.now()
        entry.orig_url = url
        if created:
            entry.orig_created = now
        if updated:
            entry.orig_updated = now
        entry.orig_checked = now
        entry.save()
Example #34
 def download(self, basefile=None, url=None):
     if basefile:
         if not url:
             entry = DocumentEntry(self.store.documententry_path(basefile))
             url = entry.orig_url
         if url:
             return self.download_single(basefile, url)
         else:
             raise DownloadError(
                 "%s doesn't support downloading single basefiles w/o page URL"
                 % self.__class__.__name__)
     self.session = requests.session()
     if ('lastdownload' in self.config and self.config.lastdownload
             and not self.config.refresh):
         startdate = self.config.lastdownload - timedelta(days=30)
         self.start_url += "&from=%s" % datetime.strftime(
             startdate, "%Y-%m-%d")
     for basefile, url in self.download_get_basefiles(self.start_url):
         self.download_single(basefile, url)
Example #35
    def needed(self, basefile, action):
        # if this function is even called, it means that force is not
        # true (or ferenda-build.py has not been called with a single
        # basefile, which is an implied force)
        if action == "parse":
            infile = self.downloaded_path(basefile)
            outfile = self.parsed_path(basefile)
            return not util.outfile_is_newer([infile], outfile)
        elif action == "relate":
            entry = DocumentEntry(self.documententry_path(basefile))

            def newer(filename, dt):
                if not os.path.exists(filename):
                    return False
                elif not dt:  # has never been indexed
                    return True
                else:
                    return datetime.fromtimestamp(
                        os.stat(filename).st_mtime) > dt

            return Relate(fulltext=newer(self.parsed_path(basefile),
                                         entry.indexed_ft),
                          triples=newer(self.distilled_path(basefile),
                                        entry.indexed_ts),
                          dependencies=newer(self.distilled_path(basefile),
                                             entry.indexed_dep))
        elif action == "generate":
            infile = self.parsed_path(basefile)
            annotations = self.annotation_path(basefile)
            if os.path.exists(self.dependencies_path(basefile)):
                deptxt = util.readfile(self.dependencies_path(basefile))
                dependencies = deptxt.strip().split("\n")
            else:
                dependencies = []
            dependencies.extend((infile, annotations))
            outfile = self.generated_path(basefile)
            return util.outfile_is_newer(dependencies, outfile)
        else:
            # custom actions will need to override needed and provide logic there
            return True
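For the "relate" action, needed() returns a Relate value instead of a plain boolean; its fulltext, triples and dependencies flags say which index is stale, and the ifneeded tests (Example #23) assert on exactly those attributes. Below is a hedged sketch of how a caller might branch on such a result; relate_if_needed and the reindex_* helpers are hypothetical, only store.needed(basefile, "relate") and the three flag names come from the code above:

    # Hypothetical consumers of the Relate flags; in a real repo these would be
    # the fulltext indexing, triple store and dependency-tracking steps.
    def reindex_fulltext(basefile):
        print("would update the fulltext index for", basefile)

    def reindex_triples(basefile):
        print("would update the triple store for", basefile)

    def rebuild_dependencies(basefile):
        print("would rebuild dependency records for", basefile)

    def relate_if_needed(store, basefile):
        needed = store.needed(basefile, "relate")
        if needed.fulltext:       # parsed file changed since entry.indexed_ft
            reindex_fulltext(basefile)
        if needed.triples:        # distilled file changed since entry.indexed_ts
            reindex_triples(basefile)
        if needed.dependencies:   # distilled file changed since entry.indexed_dep
            rebuild_dependencies(basefile)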
Example #36
 def infer_metadata(self, resource, basefile):
     # remove the bogus dcterms:issued thing that we only added to
     # aid URI generation. NB: This is removed in the superclass'
     # postprocess_doc as well, because for this lagen.nu-derived
     # class it needs to be done at this point, but for use of the
     # superclass directly, it needs to be done at some point.
     for o in resource.objects(DCTERMS.issued):
         if not o.datatype:
             resource.remove(DCTERMS.issued, o)
     sameas_uri = self.sameas_minter.space.coin_uri(resource)
     resource.add(OWL.sameAs, URIRef(sameas_uri))
     resource.graph.add((URIRef(self.canonical_uri(basefile, True)),
                         OWL.sameAs, resource.identifier))
     # then find each rpubl:konsolideringsunderlag, and create
     # owl:sameas for them as well
     for subresource in resource.objects(RPUBL.konsolideringsunderlag):
         # sometimes there'll be a rpubl:konsolideringsunderlag to
         # a resource URI but no actual data about that
         # resource. This seems to happen if SFST is updated but
         # SFSR is not. In those cases we can't generate a
         # owl:sameAs URI since we have no other data about the
         # resource.
         if subresource.value(RDF.type):
             uri = self.sameas_minter.space.coin_uri(subresource)
             subresource.add(OWL.sameAs, URIRef(uri))
     desc = Describer(resource.graph, resource.identifier)
     de = DocumentEntry(self.store.documententry_path(basefile))
     if de.orig_updated:
         desc.value(RINFOEX.senastHamtad, de.orig_updated)
     if de.orig_checked:
         desc.value(RINFOEX.senastKontrollerad, de.orig_checked)
     v = self.commondata.value(resource.identifier,
                               DCTERMS.alternate,
                               any=True)
     if v:
         desc.value(DCTERMS.alternate, v)
Example #37
    def download_single(self, basefile, url):
        # unpack the tuples we may receive instead of plain strings
        if isinstance(basefile, tuple):
            basefile, attachment = basefile
            if attachment:
                mainattachment = attachment + ".html"
            else:
                mainattachment = None
        if isinstance(url, tuple):
            url, extraurls = url
        updated = created = False
        checked = True

        filename = self.store.downloaded_path(basefile, attachment=mainattachment)
        created = not os.path.exists(filename)
        if self.download_if_needed(url, basefile, filename=filename):
            if created:
                self.log.info("%s: downloaded from %s" % (basefile, url))
            else:
                self.log.info(
                    "%s: downloaded new version from %s" % (basefile, url))
            updated = True
        else:
            self.log.debug("%s: exists and is unchanged" % basefile)

        for url in extraurls:
            if url.endswith('msword.application'):
                # NOTE: We cannot be sure that this is
                # actually a Word (CDF) file. For older files
                # it might be a WordPerfect file (.wpd) or a
                # RDF file, for newer it might be a .docx. We
                # cannot be sure until we've downloaded it.
                # So we quickly read the first 4 bytes
                r = requests.get(url, stream=True)
                sig = r.raw.read(4)
                # r.raw.close()
                #bodyidx = head.index("\n\n")
                #sig = head[bodyidx:bodyidx+4]
                if sig == b'\xffWPC':
                    doctype = ".wpd"
                elif sig == b'\xd0\xcf\x11\xe0':
                    doctype = ".doc"
                elif sig == b'PK\x03\x04':
                    doctype = ".docx"
                elif sig == b'{\\rt':
                    doctype = ".rtf"
                else:
                    self.log.error(
                        "%s: Attached file has signature %r -- don't know what type this is" % (basefile, sig))
                    continue
            elif url.endswith('pdf.application'):
                doctype = ".pdf"
            else:
                self.log.warning("Unknown doc type %s" %
                                 td.a['href'].split("=")[-1])
                doctype = None
            if doctype:
                if attachment:
                    filename = self.store.downloaded_path(
                        basefile, attachment=attachment + doctype)
                else:
                    filename = self.store.downloaded_path(basefile, attachment="index" + doctype)
                self.log.debug("%s: downloading attachment %s" % (basefile, filename))
                self.download_if_needed(url, basefile, filename=filename)

        if mainattachment is None:
            entry = DocumentEntry(self.store.documententry_path(basefile))
            now = datetime.now()
            entry.orig_url = url
            if created:
                entry.orig_created = now
            if updated:
                entry.orig_updated = now
            if checked:
                entry.orig_checked = now
            entry.save()

        return updated
Example #38
 def source_url(self, basefile):
     # this source does not have any predictable URLs, so we try to
     # find if we made a note on the URL when we ran download()
     entry = DocumentEntry(self.store.documententry_path(basefile))
     return entry.orig_url
Example #39
    def download_single(self, basefile, url=None):
        if self.get_parse_options(basefile) == "skip":
            raise DocumentSkippedError(
                "%s should not be downloaded according to options.py" %
                basefile)
        if not url:
            url = self.remote_url(basefile)
            if not url:  # remote_url failed
                return
        filename = self.store.downloaded_path(basefile)  # just the html page
        updated = filesupdated = False
        created = not os.path.exists(filename)
        if (not os.path.exists(filename) or self.config.refresh):
            existed = os.path.exists(filename)
            try:
                updated = self.download_if_needed(url,
                                                  basefile,
                                                  filename=filename)
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 400:
                    # regeringen.se seems to have a problem with the
                    # first req after a search -- unless slowed down,
                    # raises a 400 error. Sleep on it, and try once more
                    sleep(5)
                    updated = self.download_if_needed(url,
                                                      basefile,
                                                      filename=filename)
                else:
                    raise
            docid = url.split("/")[-1]
            if existed:
                if updated:
                    self.log.info("%s: updated from %s" % (basefile, url))
                else:
                    self.log.debug("%s: %s is unchanged, checking PDF files" %
                                   (basefile, filename))
            else:
                self.log.info("%s: download OK from %s" % (basefile, url))

            if self.get_parse_options(basefile) == "metadataonly":
                self.log.debug(
                    "%s: Marked as 'metadataonly', not downloading actual PDF file"
                    % basefile)
            else:
                soup = BeautifulSoup(
                    codecs.open(filename, encoding=self.source_encoding),
                    "lxml")
                cnt = 0
                selected_files = self.find_doc_links(soup, basefile)
                if selected_files:
                    for (filename, filetype, label) in selected_files:
                        fileurl = urljoin(url, filename)
                        basepath = filename.split("/")[-1]
                        filename = self.store.downloaded_path(
                            basefile, attachment=basepath)
                        if not filename.lower().endswith(".pdf"):
                            filename += ".%s" % filetype
                        if self.download_if_needed(fileurl,
                                                   basefile,
                                                   filename=filename):
                            filesupdated = True
                            self.log.debug("    %s is new or updated" %
                                           filename)
                        else:
                            self.log.debug("    %s is unchanged" % filename)
                else:
                    self.log.warning("%s (%s) has no downloadable files" %
                                     (basefile, url))
            if updated or filesupdated:
                pass
            else:
                self.log.debug("%s and all files are unchanged" % filename)
        else:
            self.log.debug("%s: %s already exists" % (basefile, filename))

        entry = DocumentEntry(self.store.documententry_path(basefile))
        now = datetime.now()
        entry.orig_url = url
        if created:
            entry.orig_created = now
        if updated or filesupdated:
            entry.orig_updated = now
        entry.orig_checked = now
        entry.save()

        return updated or filesupdated
Example #40
    def download_single(self, basefile, url=None):
        if self.get_parse_options(basefile) == "skip":
            raise DocumentSkippedError("%s should not be downloaded according to options.py" % basefile)
        if not url:
            url = self.remote_url(basefile)
            if not url:  # remote_url failed
                return
        filename = self.store.downloaded_path(basefile)  # just the html page
        updated = filesupdated = False
        created = not os.path.exists(filename)
        if (not os.path.exists(filename) or self.config.refresh):
            existed = os.path.exists(filename)
            try:
                updated = self.download_if_needed(url, basefile, filename=filename)
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 400:
                    # regeringen.se seems to have a problem with the
                    # first req after a search -- unless slowed down,
                    # raises a 400 error. Sleep on it, and try once more
                    sleep(5)
                    updated = self.download_if_needed(url, basefile, filename=filename)
                else:
                    raise
            docid = url.split("/")[-1]
            if existed:
                if updated:
                    self.log.info("%s: updated from %s" % (basefile, url))
                else:
                    self.log.debug("%s: %s is unchanged, checking PDF files" %
                                   (basefile, filename))
            else:
                self.log.info("%s: download OK from %s" % (basefile, url))

            if self.get_parse_options(basefile) == "metadataonly":
                self.log.debug("%s: Marked as 'metadataonly', not downloading actual PDF file" % basefile)
            else:
                soup = BeautifulSoup(codecs.open(filename, encoding=self.source_encoding), "lxml")
                cnt = 0
                selected_files = self.find_doc_links(soup, basefile)
                if selected_files:
                    for (filename, filetype,label) in selected_files:
                        fileurl = urljoin(url, filename)
                        basepath = filename.split("/")[-1]
                        filename = self.store.downloaded_path(basefile, attachment=basepath)
                        if not filename.lower().endswith(".pdf"):
                            filename += ".%s" % filetype
                        if self.download_if_needed(fileurl, basefile, filename=filename):
                            filesupdated = True
                            self.log.debug(
                                "    %s is new or updated" % filename)
                        else:
                            self.log.debug("    %s is unchanged" % filename)
                else:
                    self.log.warning(
                        "%s (%s) has no downloadable files" % (basefile, url))
            if updated or filesupdated:
                pass
            else:
                self.log.debug("%s and all files are unchanged" % filename)
        else:
            self.log.debug("%s: %s already exists" % (basefile, filename))

        entry = DocumentEntry(self.store.documententry_path(basefile))
        now = datetime.now()
        entry.orig_url = url
        if created:
            entry.orig_created = now
        if updated or filesupdated:
            entry.orig_updated = now
        entry.orig_checked = now
        entry.save()

        return updated or filesupdated
Example #41
    def setUp(self):
        super(News, self).setUp()
        self.faceted_data = []
        # create a bunch of DocumentEntry objects and save them
        basetime = datetime(2013, 1, 1, 12, 0)
        for basefile in range(25):
            v = {
                'id': self.repo.canonical_uri(basefile),
                'title': "Doc #%s" % basefile
            }
            self.faceted_data.append({
                'uri': v['id'],
                'dcterms_title': v['title'],
                'rdf_type': 'http://xmlns.com/foaf/0.1/Document'
            })
            de = DocumentEntry()
            de.orig_created = basetime + timedelta(hours=basefile)
            de.orig_updated = basetime + timedelta(hours=basefile, minutes=10)
            de.orig_checked = basetime + timedelta(hours=basefile, minutes=20)
            de.published = basetime + timedelta(hours=basefile, minutes=30)
            de.updated = basetime + timedelta(hours=basefile, minutes=40)
            de.orig_url = "http://source.example.org/doc/%s" % basefile
            de.title = v['title']
            de.save(self.repo.store.documententry_path(str(basefile)))

            g = rdflib.Graph()
            desc = Describer(g, self.repo.canonical_uri(basefile))
            dcterms = self.repo.ns['dcterms']
            desc.rdftype(self.repo.ns['foaf'].Document)
            desc.value(dcterms.title, "Invalid title")
            util.ensure_dir(self.repo.store.distilled_path(str(basefile)))
            with open(self.repo.store.distilled_path(str(basefile)),
                      "wb") as fp:
                g.serialize(fp, format="pretty-xml")

            util.ensure_dir(self.repo.store.parsed_path(str(basefile)))
            with open(self.repo.store.parsed_path(str(basefile)), "w") as fp:
                fp.write("""<?xml version='1.0' encoding='utf-8'?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML+RDFa 1.0//EN" "http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:dcterms="http://purl.org/dc/terms/" xml:lang="en">
  <head about="%(id)s">
    <title>%(title)s</title>
  </head>
  <body about="%(id)s">
    <h1>%(title)s</h1>
  </body>
</html>""" % v)

            util.ensure_dir(self.repo.store.generated_path(str(basefile)))
            with open(self.repo.store.generated_path(str(basefile)),
                      "w") as fp:
                fp.write("""<!DOCTYPE html>
<html>
  <head>
    <title>%(title)s</title>
  </head>
  <body>
    <h1>%(title)s</h1>
  </body>
</html>""" % v)
Example #42
    def download_single(self, basefile, url):
        if self.get_parse_options(basefile) == "skip":
            raise errors.DocumentSkippedError("%s should not be downloaded according to options.py" % basefile)
        rdffilename = self.store.downloaded_path(basefile, attachment="index.rdf")
        if self.get_parse_options(basefile) == "metadataonly" and os.path.exists(rdffilename) and (not self.config.refresh):
            # it is kind of bad that we can even get here in these
            # cases (if an rdffile exists, and an empty index.pdf
            # exists, shouldn't download() skip that file?) Right now
            # it ignores empty files and passes them to
            # download_single.
            return False
        
        # url is really a 2-tuple
        url, title = url
        resp = self.session.get(url)
        soup = BeautifulSoup(resp.text, "lxml")
        pdflink = soup.find("a", href=re.compile(r".*\.pdf$"))
        pdfurl = pdflink.get("href")
        thumburl = urljoin(url, soup.find("img", "tumnagel").get("src"))
        librisid = url.rsplit("-")[1]
        rdfurl = "http://data.libris.kb.se/open/bib/%s.rdf" % librisid
        filename = self.store.downloaded_path(basefile)
        created = not os.path.exists(filename)
        updated = False
        
        # download rdf metadata before actual content
        try:
            # it appears that URLs like
            # http://data.libris.kb.se/open/bib/8351225.rdf now
            # return empty responses. Until we find out the proper
            # RDF endpoint URLs, we should check and warn for this
            # (and infer a minimal RDF by hand from what we can, eg
            # dc:title from the link text)
            self.download_if_needed(rdfurl, basefile,
                                    filename=rdffilename,
                                    archive=False)
            if os.path.getsize(rdffilename) == 0:
                self.log.warning("%s: %s returned 0 response, infer RDF" %
                                 (basefile, rdfurl))
                base = URIRef("http://libris.kb.se/resource/bib/%s" %
                              librisid)
                fakegraph = Graph()
                fakegraph.bind("dc", str(DC))
                fakegraph.add((base, DC.title, Literal(title, lang="sv")))
                year = basefile.split(":")[0] # Libris uses str type
                fakegraph.add((base, DC.date, Literal(year)))
                with open(rdffilename, "wb") as fp:
                    fakegraph.serialize(fp, format="pretty-xml")
        except requests.exceptions.HTTPError as e:
            self.log.error("Failed to load attachment: %s" % e)
            raise

        if self.get_parse_options(basefile) == "metadataonly":
            self.log.debug("%s: Marked as 'metadataonly', not downloading actual PDF file" % basefile)
            with self.store.open_downloaded(basefile, "w") as fp:
                pass
        else:
            if self.download_if_needed(pdfurl, basefile) or self.config.refresh:
                if created:
                    self.log.info("%s: download OK from %s" % (basefile, pdfurl))
                else:
                    self.log.info(
                        "%s: download OK (new version) from %s" % (basefile, pdfurl))
                updated = True
                try:
                    self.download_if_needed(thumburl, basefile,
                                            filename=self.store.downloaded_path(
                            basefile, attachment="thumb.jpg"))
                except requests.exceptions.HTTPError as e:
                    self.log.error("Failed to load attachment: %s" % e)
                    raise
            else:
                self.log.debug("%s: exists and is unchanged" % basefile)
        entry = DocumentEntry(self.store.documententry_path(basefile))
        now = datetime.now()
        entry.orig_url = url  # or pdfurl?
        if created:
            entry.orig_created = now
        if updated:
            entry.orig_updated = now
        entry.orig_checked = now
        entry.save()
        return updated
Example #43
0
    def download_single(self, basefile, url=None):
        if not url:
            url = self.remote_url(basefile)
            if not url:  # remote_url failed
                return
        filename = self.store.downloaded_path(basefile)  # just the html page
        updated = pdfupdated = False
        created = not os.path.exists(filename)
        if (not os.path.exists(filename) or self.config.force):
            existed = os.path.exists(filename)
            updated = self.download_if_needed(url, basefile, filename=filename)
            docid = url.split("/")[-1]
            if existed:
                if updated:
                    self.log.debug(
                        "%s existed, but a new ver was downloaded" % filename)
                else:
                    self.log.debug(
                        "%s is unchanged -- checking PDF files" % filename)
            else:
                self.log.debug(
                    "%s did not exist, so it was downloaded" % filename)

            soup = BeautifulSoup(codecs.open(filename, encoding=self.source_encoding), "lxml")
            cnt = 0
            pdffiles = self.find_pdf_links(soup, basefile)
            if pdffiles:
                for pdffile in pdffiles:
                    # note: the pdfurl goes to a redirect script; however, that
                    # part of the URL tree (/download/*) is off-limits for
                    # robots. But we can figure out the actual URL anyway!
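                    # (hypothetical example: docid "123456" maps to the path
                    # "c6/12/34/56", a shorter docid like "1234" to "c4/12/34")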
                    if len(docid) > 4:
                        path = "c6/%02d/%s/%s" % (
                            int(docid[:-4]), docid[-4:-2], docid[-2:])
                    else:
                        path = "c4/%02d/%s" % (int(docid[:-2]), docid[-2:])
                    pdfurl = "http://www.regeringen.se/content/1/%s/%s" % (
                        path, pdffile)
                    pdffilename = self.store.downloaded_path(basefile, attachment=pdffile)
                    if self.download_if_needed(pdfurl, basefile, filename=pdffilename):
                        pdfupdated = True
                        self.log.debug(
                            "    %s is new or updated" % pdffilename)
                    else:
                        self.log.debug("    %s is unchanged" % pdffilename)
            else:
                self.log.warning(
                    "%s (%s) has no downloadable PDF files" % (basefile, url))
            if updated or pdfupdated:
                pass
            else:
                self.log.debug("%s and all PDF files are unchanged" % filename)
        else:
            self.log.debug("%s already exists" % (filename))

        entry = DocumentEntry(self.store.documententry_path(basefile))
        now = datetime.now()
        entry.orig_url = url
        if created:
            entry.orig_created = now
        if updated or pdfupdated:
            entry.orig_updated = now
        entry.orig_checked = now
        entry.save()

        return updated or pdfupdated
Example #44
0
    def wrapper(self, doc):
        # call the actual function that creates the doc data
        oldbasefile = doc.basefile
        ret = f(self, doc)
        if doc.basefile != oldbasefile:
            # means that basefile was adjusted.  Touch the old parsed
            # path first so we don't regenerate.
            with self.store.open_parsed(oldbasefile, "w"):
                pass
            # move any intermediate files (in particular extracted
            # image backgrounds from PDF files) that might be
            # needed later. 
            old_intermediate = self.store.intermediate_path(oldbasefile)
            new_intermediate = self.store.intermediate_path(doc.basefile)
            if self.store.storage_policy == "dir":
                old_intermediate = os.path.dirname(old_intermediate)
                new_intermediate = os.path.dirname(new_intermediate)
            if os.path.exists(old_intermediate) and not os.path.exists(new_intermediate):
                util.ensure_dir(new_intermediate)
                os.rename(old_intermediate, new_intermediate)
        # now render that doc data as files (JSON, XHTML, RDF/XML)
        if self.config.serializejson == True:
            with self.store.open_serialized(doc.basefile, "wb") as fp:
                r = serialize(doc, format="json")  # should be a (unicode) str
                fp.write(r.encode('utf-8'))
            self.log.debug(
                "Created %s" %
                (self.store.serialized_path(
                    doc.basefile)))
        # css file + background images + png renderings of text
        resources = self.create_external_resources(doc)
        if resources:
            cssuris = [cssuri(doc.uri, x) for x in resources if x.endswith(".css")]
        else:
            cssuris = []
        if cssuris:
            doc.cssuris = cssuris
        updated = self.render_xhtml(doc, self.store.parsed_path(doc.basefile, version=doc.version))
        if updated:
            self.log.debug(
                "Created %s" %
                (self.store.parsed_path(
                    doc.basefile)))


        # Extract all triples on the XHTML/RDFa data to a separate
        # RDF/XML file
        distilled_graph = Graph()
        with codecs.open(self.store.parsed_path(doc.basefile, version=doc.version),
                         encoding="utf-8") as fp:  # unicode
            distilled_graph.parse(data=fp.read(), format="rdfa",
                                  publicID=doc.uri)

        # The act of parsing from RDFa binds a lot of namespaces
        # in the graph in an unnecessary manner. In particular, it
        # binds both 'dc' and 'dcterms' to
        # 'http://purl.org/dc/terms/', which makes serialization
        # less than predictable. Blow these prefixes away.
        distilled_graph.bind("dc", URIRef("http://purl.org/dc/elements/1.1/"))
        distilled_graph.bind(
            "dcterms",
            URIRef("http://example.org/this-prefix-should-not-be-used"))

        util.ensure_dir(self.store.distilled_path(doc.basefile, version=doc.version))
        with open(self.store.distilled_path(doc.basefile, version=doc.version),
                  "wb") as distilled_file:
            # print("============distilled===============")
            # print(distilled_graph.serialize(format="turtle").decode('utf-8'))
            distilled_graph.serialize(distilled_file, format="pretty-xml")
        self.log.debug(
            '%s triples extracted to %s',
            len(distilled_graph), self.store.distilled_path(doc.basefile, version=doc.version))

        # Validate that all required triples are present (we check
        # distilled_graph, but we could just as well check doc.meta)
        required = sorted(set(self.get_required_predicates(doc))) 
        for p in required:
            x = distilled_graph.value(URIRef(doc.uri), p)
            if not x:
                self.log.warning("Metadata is missing a %s triple" %
                                 (distilled_graph.qname(p)))
        if 'validaterdfa' in self.config and self.config.validaterdfa:
            # Validate that all triples specified in doc.meta and any
            # .meta property on any body object is present in the
            # XHTML+RDFa file.  NOTE: graph_diff has suddenly become
            # glacial on medium-large graphs (> ~1000 triples). Maybe we
            # don't have to validate them?
            huge_graph = False
            for g in iterate_graphs(doc.body):
                doc.meta += g
                if len(doc.meta) > 1000:
                    huge_graph = True
                    break
            if huge_graph:
                self.log.warning("Graph seems huge, skipping validation")
            else:
                # self.log.debug("diffing graphs")
                (in_both, in_first, in_second) = graph_diff(doc.meta, distilled_graph)
                self.log.debug("graphs diffed (-%s, +%s)" % (len(in_first), len(in_second)))

                if in_first:  # original metadata not present in the XHTML file
                    self.log.warning("%d triple(s) from the original metadata were "
                                     "not found in the serialized XHTML file:\n%s",
                                     len(in_first), in_first.serialize(format="n3").decode("utf-8"))

        # Validate that entry.title and entry.id have been filled
        # (they might come from doc.meta and doc.uri, or from other things)
        entry = DocumentEntry(self.store.documententry_path(doc.basefile, version=doc.version))
        if not entry.id:
            self.log.warning("entry.id missing")
        if not entry.title:
            self.log.warning("entry.title missing")
        return ret
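
The wrapper above closes over a function f that is not defined in the snippet, so it is evidently the inner function of a decorator applied to a repository's parse(self, doc) method. A minimal sketch of how such a decorator could be wired up (the names render and MyRepo are assumptions for illustration, not necessarily the library's actual API):

import functools

def render(f):
    # hypothetical decorator wrapping a parse(self, doc) method
    @functools.wraps(f)
    def wrapper(self, doc):
        ret = f(self, doc)   # run the wrapped parse() first
        # ... then serialize doc, distill the RDFa into RDF/XML, validate
        # the required triples and check the DocumentEntry, as the wrapper
        # in the example above does ...
        return ret
    return wrapper

class MyRepo:
    @render
    def parse(self, doc):
        # build doc.meta and doc.body here
        return True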
Example #45
0
    def test_set_content(self):
        t = tempfile.mktemp()
        with open(t,"w") as f:
             f.write("<div>xhtml fragment</div>")

        d = DocumentEntry()
        d.set_content(t, "http://example.org/test", mimetype="xhtml", inline=True)
        # type must be either "text", "html",  "xhtml" or a MIME media type (RFC 4287, 4.1.3.1)
        self.assertEqual(d.content['type'],"xhtml")
        self.assertEqual(d.content['markup'],"<div>xhtml fragment</div>")
        self.assertIsNone(d.content['src'])

        d = DocumentEntry()
        d.set_content(t, "http://example.org/test", mimetype="xhtml")
        self.assertEqual(d.content['type'],"xhtml")
        self.assertIsNone(d.content['markup'])
        self.assertEqual(d.content['src'], "http://example.org/test")
        self.assertEqual(d.content['hash'], "md5:ca8d87b5cf6edbbe88f51d45926c9a8d")

        os.unlink(t)
        
        t = tempfile.mktemp()
        with open(t+".pdf","w") as f:
             f.write("This is not a real PDF file")
        
        d = DocumentEntry()
        d.set_content(t+".pdf", "http://example.org/test")
        self.assertEqual(d.content['type'],"application/pdf")
        self.assertIsNone(d.content['markup'])
        self.assertEqual(d.content['src'], "http://example.org/test")
        self.assertEqual(d.content['hash'], "md5:0a461f0621ede53f1ea8471e34796b6f")

        d = DocumentEntry()
        with self.assertRaises(AssertionError):
            d.set_content(t+".pdf", "http://example.org/test", inline=True)

        os.unlink(t+".pdf")
Example #46
0
 def test_save(self):
     path = self.repo.store.documententry_path("123/x")
     d = DocumentEntry()
     d.title = StringIO("A file-like object, not a string")
     with self.assertRaises(TypeError):
         d.save(path=path)
Example #47
0
    def download_single(self, basefile, url=None):
        if url is None:
            url = self.remote_url(basefile)
            if not url:  # remote_url failed
                return

        updated = created = False
        checked = True
        mainattachment = None

        if url in self.urlmap:
            attachment = self.urlmap[url]
        else:
            attachment = self.sniff_attachment(url)
        if attachment:
            self.urlmap[url] = attachment
            attachment += ".html"
        else:
            self.urlmap[url] = ''
            attachment = "index.html"
        
        downloaded_path = self.store.downloaded_path(basefile,
                                                     attachment=attachment)
        
        created = not os.path.exists(downloaded_path)
        if self.download_if_needed(url, basefile, filename=downloaded_path):
            text = util.readfile(downloaded_path)
            if "<div>Inga tr\xe4ffar</div>" in text:
                self.log.warning("%s: Could not find this prop at %s, might be a bug" % (basefile, url))
                util.robust_remove(downloaded_path)
                return False
            if created:
                self.log.info("%s: download OK from %s" % (basefile, url))
            else:
                self.log.info(
                    "%s: download OK (new version) from %s" % (basefile, url))
            updated = True
        else:
            self.log.debug("%s: exists and is unchanged" % basefile)
            text = util.readfile(downloaded_path)
            
        soup = BeautifulSoup(text, "lxml")
        del text
        attachment = self.find_attachment(soup)

        extraurls = []
        results = soup.find("div", "search-results-content")
        a = results.find("a", string="Hämta Pdf")
        if a:
            extraurls.append(a.get("href"))
        a = results.find("a", string="Hämta Doc") 
        if a:
            extraurls.append(a.get("href"))
        

        # parse downloaded html/text page and find out extraurls
        for url in extraurls:
            if url.endswith('get=doc'):
                # NOTE: We cannot be sure that this is
                # actually a Word (CDF) file. For older files
                # it might be a WordPerfect file (.wpd) or an
                # RTF file, for newer ones it might be a .docx. We
                # cannot be sure until we've downloaded it.
                # So we quickly read the first 4 bytes
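                # and compare them against known magic numbers:
                # b'\xffWPC' = WordPerfect, b'\xd0\xcf\x11\xe0' = OLE2/CDF,
                # b'PK\x03\x04' = ZIP container (e.g. .docx), b'{\rt' = RTF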
                r = requests.get(url, stream=True)
                sig = r.raw.read(4)
                # r.raw.close()
                #bodyidx = head.index("\n\n")
                #sig = head[bodyidx:bodyidx+4]
                if sig == b'\xffWPC':
                    doctype = ".wpd"
                elif sig == b'\xd0\xcf\x11\xe0':
                    doctype = ".doc"
                elif sig == b'PK\x03\x04':
                    doctype = ".docx"
                elif sig == b'{\\rt':
                    doctype = ".rtf"
                else:
                    self.log.error(
                        "%s: Attached file has signature %r -- don't know what type this is" % (basefile, sig))
                    continue
            elif url.endswith('get=pdf'):
                doctype = ".pdf"
            else:
                self.log.warning("Unknown doc type %s" %
                                 url.split("get=")[-1])
                doctype = None
            if doctype:
                if attachment:
                    filename = self.store.downloaded_path(
                        basefile, attachment=attachment + doctype)
                else:
                    filename = self.store.downloaded_path(
                        basefile,
                        attachment="index" +
                        doctype)
                self.log.debug("%s: downloading attachment %s" % (basefile, filename))
                self.download_if_needed(url, basefile, filename=filename)

        entry = DocumentEntry(self.store.documententry_path(basefile))
        now = datetime.now()
        entry.orig_url = url
        if created:
            entry.orig_created = now
        if updated:
            entry.orig_updated = now
        if checked:
            entry.orig_checked = now
        entry.save()

        return updated
Example #48
0
    def importarchive(self, archivedir):
        """Imports downloaded data from an archive from legacy lagen.nu data.

        In particular, creates proper archive storage for older
        versions of each text.

        """
        current = archived = 0
        for f in util.list_dirs(archivedir, ".html"):
            if not f.startswith("downloaded/sfs"):  # sfst or sfsr
                continue
            for regex in self.templ:
                m = re.match(regex, f)
                if not m:
                    continue
                if "vcheck" in m.groupdict():  # silently ignore
                    break
                basefile = "%s:%s" % (m.group("byear"), m.group("bnum"))

                # need to look at the file to find out its version
                # text = t.extractfile(f).read(4000).decode("latin-1")
                text = open(f, "rb").read(4000).decode("latin-1")
                reader = TextReader(string=text)
                updated_to = self._find_uppdaterad_tom(basefile, reader=reader)

                if "vyear" in m.groupdict():  # this file is marked as
                    # an archival version
                    archived += 1
                    version = updated_to

                    if m.group("vyear") == "first":
                        pass
                    else:
                        exp = "%s:%s" % (m.group("vyear"), m.group("vnum"))
                        if version != exp:
                            self.log.warning("%s: Expected %s, found %s" %
                                             (f, exp, version))
                else:
                    version = None
                    current += 1
                    de = DocumentEntry()
                    de.basefile = basefile
                    de.id = self.canonical_uri(basefile, updated_to)
                    # fudge timestamps best as we can
                    de.orig_created = datetime.fromtimestamp(
                        os.path.getctime(f))
                    de.orig_updated = datetime.fromtimestamp(
                        os.path.getmtime(f))
                    de.orig_checked = datetime.now()
                    de.orig_url = self.document_url_template % locals()
                    de.published = datetime.now()
                    de.url = self.generated_url(basefile)
                    de.title = "SFS %s" % basefile
                    # de.set_content()
                    # de.set_link()
                    de.save(self.store.documententry_path(basefile))
                # this yields more reasonable basefiles, but they are not
                # backwards compatible -- skip them for now
                # basefile = basefile.replace("_", "").replace(".", "")
                if "type" in m.groupdict() and m.group("type") == "sfsr":
                    dest = self.store.register_path(basefile)
                    current -= 1  # to offset the previous increment
                else:
                    dest = self.store.downloaded_path(basefile, version)
                self.log.debug("%s: extracting %s to %s" % (basefile, f, dest))
                util.ensure_dir(dest)
                shutil.copy2(f, dest)
                break
            else:
                self.log.warning("Couldn't process %s" % f)
        self.log.info(
            "Extracted %s current versions and %s archived versions" %
            (current, archived))
Example #49
0
    def download_single(self, basefile, url=None):
        if url is None:
            url = self.remote_url(basefile)
            if not url:  # remote_url failed
                return

        updated = created = False
        checked = True
        mainattachment = None

        if url in self.urlmap:
            attachment = self.urlmap[url]
        else:
            attachment = self.sniff_attachment(url)
        if attachment:
            self.urlmap[url] = attachment
            attachment += ".html"
        else:
            self.urlmap[url] = ''
            attachment = "index.html"

        downloaded_path = self.store.downloaded_path(basefile,
                                                     attachment=attachment)

        created = not os.path.exists(downloaded_path)
        if self.download_if_needed(url, basefile, filename=downloaded_path):
            text = util.readfile(downloaded_path)
            if "<div>Inga tr\xe4ffar</div>" in text:
                self.log.warning(
                    "%s: Could not find this prop at %s, might be a bug" %
                    (basefile, url))
                util.robust_remove(downloaded_path)
                return False
            if created:
                self.log.info("%s: downloaded from %s" % (basefile, url))
            else:
                self.log.info("%s: downloaded new version from %s" %
                              (basefile, url))
            updated = True
        else:
            self.log.debug("%s: exists and is unchanged" % basefile)
            text = util.readfile(downloaded_path)

        soup = BeautifulSoup(text, "lxml")
        del text
        attachment = self.find_attachment(soup)

        extraurls = []
        results = soup.find("div", "search-results-content")
        a = results.find("a", string="Hämta Pdf")
        if a:
            extraurls.append(a.get("href"))
        a = results.find("a", string="Hämta Doc")
        if a:
            extraurls.append(a.get("href"))

        # parse downloaded html/text page and find out extraurls
        for url in extraurls:
            if url.endswith('get=doc'):
                # NOTE: We cannot be sure that this is
                # actually a Word (CDF) file. For older files
                # it might be a WordPerfect file (.wpd) or an
                # RTF file, for newer ones it might be a .docx. We
                # cannot be sure until we've downloaded it.
                # So we quickly read the first 4 bytes
                r = requests.get(url, stream=True)
                sig = r.raw.read(4)
                # r.raw.close()
                #bodyidx = head.index("\n\n")
                #sig = head[bodyidx:bodyidx+4]
                if sig == b'\xffWPC':
                    doctype = ".wpd"
                elif sig == b'\xd0\xcf\x11\xe0':
                    doctype = ".doc"
                elif sig == b'PK\x03\x04':
                    doctype = ".docx"
                elif sig == b'{\\rt':
                    doctype = ".rtf"
                else:
                    self.log.error(
                        "%s: Attached file has signature %r -- don't know what type this is"
                        % (basefile, sig))
                    continue
            elif url.endswith('get=pdf'):
                doctype = ".pdf"
            else:
                self.log.warning("Unknown doc type %s" % url.split("get=")[-1])
                doctype = None
            if doctype:
                if attachment:
                    filename = self.store.downloaded_path(
                        basefile, attachment=attachment + doctype)
                else:
                    filename = self.store.downloaded_path(basefile,
                                                          attachment="index" +
                                                          doctype)
                self.log.debug("%s: downloading attachment %s" %
                               (basefile, filename))
                self.download_if_needed(url, basefile, filename=filename)

        entry = DocumentEntry(self.store.documententry_path(basefile))
        now = datetime.now()
        entry.orig_url = url
        if created:
            entry.orig_created = now
        if updated:
            entry.orig_updated = now
        if checked:
            entry.orig_checked = now
        entry.save()

        return updated