Example #1
    def download_single(self, basefile, url):
        updated = False
        created = False
        filename = self.store.downloaded_path(basefile)
        created = not os.path.exists(filename)
        # util.print_open_fds()
        if self.download_if_needed(url, basefile):
            if created:
                self.log.info("%s: downloaded from %s" % (basefile, url))
            else:
                self.log.info(
                    "%s: downloaded new version from %s" % (basefile, url))
            updated = True
        else:
            self.log.debug("%s: exists and is unchanged" % basefile)
        soup = BeautifulSoup(util.readfile(filename), "lxml")
        for pdflink in soup.find_all("a", href=re.compile(r"\.pdf$")):
            slug = "-".join(pdflink["href"].rsplit("/")[-2:])
            attachment_path = self.store.downloaded_path(basefile, attachment=slug)
            self.download_if_needed(urljoin(url, pdflink["href"]), basefile, filename=attachment_path)
        vm = soup.find("a", text="Visa Varumärke")
        if vm:
            attachment_path = self.store.downloaded_path(basefile, attachment="varumarke.jpg")
            attachment_url = re.search("http[^'\"]*", vm["href"]).group(0)
            self.download_if_needed(attachment_url, basefile, filename=attachment_path)

        entry = DocumentEntry(self.store.documententry_path(basefile))
        now = datetime.now()
        entry.orig_url = url
        if created:
            entry.orig_created = now
        if updated:
            entry.orig_updated = now
        entry.orig_checked = now
        entry.save()
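A minimal usage sketch for the method above, assuming a hypothetical repository class and a made-up basefile/URL; it only restates the DocumentEntry bookkeeping that download_single performs, on the assumption that DocumentEntry reloads a previously saved entry when given an existing path.

# Hypothetical driver code -- repository class, basefile and URL are placeholders
repo = MyDocRepo()
repo.download_single("2014:12", "http://source.example.org/doc/2014/12")

# The entry file written at the end of download_single records the outcome
entry = DocumentEntry(repo.store.documententry_path("2014:12"))
print(entry.orig_created, entry.orig_updated, entry.orig_checked)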
Example #2
    def test_save(self):
        path = self.repo.store.documententry_path("123/a")
        d = DocumentEntry()
        d.orig_checked = datetime(2013, 3, 27, 20, 46, 37)
        d.orig_url = 'http://source.example.org/doc/123/a'
        d.save(path=path)

        self.maxDiff = None
        self.assertEqual(self.d2u(util.readfile(path)), self.basic_json)
Example #3
    def setUp(self):
        super(News, self).setUp()
        self.faceted_data = []
        # create a bunch of DocumentEntry objects and save them
        basetime = datetime(2013, 1, 1, 12, 0)
        for basefile in range(25):
            v = {'id': self.repo.canonical_uri(basefile),
                 'title': "Doc #%s" % basefile}
            self.faceted_data.append({'uri': v['id'],
                                      'dcterms_title': v['title'],
                                      'rdf_type': 'http://xmlns.com/foaf/0.1/Document'})
            de = DocumentEntry()
            de.orig_created = basetime + timedelta(hours=basefile)
            de.orig_updated = basetime + timedelta(hours=basefile, minutes=10)
            de.orig_checked = basetime + timedelta(hours=basefile, minutes=20)
            de.published    = basetime + timedelta(hours=basefile, minutes=30)
            de.updated      = basetime + timedelta(hours=basefile, minutes=40)
            de.orig_url     = "http://source.example.org/doc/%s" % basefile
            de.title        = v['title']
            de.save(self.repo.store.documententry_path(str(basefile)))

            g = rdflib.Graph()
            desc = Describer(g, self.repo.canonical_uri(basefile))
            dcterms = self.repo.ns['dcterms']
            desc.rdftype(self.repo.ns['foaf'].Document)
            desc.value(dcterms.title, "Invalid title")
            util.ensure_dir(self.repo.store.distilled_path(str(basefile)))
            with open(self.repo.store.distilled_path(str(basefile)), "wb") as fp:
                g.serialize(fp, format="pretty-xml")

            util.ensure_dir(self.repo.store.parsed_path(str(basefile)))
            with open(self.repo.store.parsed_path(str(basefile)), "w") as fp:
                fp.write("""<?xml version='1.0' encoding='utf-8'?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML+RDFa 1.0//EN" "http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:dcterms="http://purl.org/dc/terms/" xml:lang="en">
  <head about="%(id)s">
    <title>%(title)s</title>
  </head>
  <body about="%(id)s">
    <h1>%(title)s</h1>
  </body>
</html>""" % v)

            util.ensure_dir(self.repo.store.generated_path(str(basefile)))
            with open(self.repo.store.generated_path(str(basefile)), "w") as fp:
                fp.write("""<!DOCTYPE html>
<html>
  <head>
    <title>%(title)s</title>
  </head>
  <body>
    <h1>%(title)s</h1>
  </body>
</html>""" % v)
Example #4
    def download_single(self, basefile, url):
        updated = False
        created = False
        filename = self.store.downloaded_path(basefile)
        created = not os.path.exists(filename)
        # util.print_open_fds()
        if self.download_if_needed(url, basefile):
            if created:
                self.log.info("%s: downloaded from %s" % (basefile, url))
            else:
                self.log.info("%s: downloaded new version from %s" %
                              (basefile, url))
            updated = True
        else:
            self.log.debug("%s: exists and is unchanged" % basefile)
        soup = BeautifulSoup(util.readfile(filename), "lxml")
        for pdflink in soup.find_all("a", href=re.compile(r"\.pdf$")):
            slug = "-".join(pdflink["href"].rsplit("/")[-2:])
            attachment_path = self.store.downloaded_path(basefile,
                                                         attachment=slug)
            self.download_if_needed(urljoin(url, pdflink["href"]),
                                    basefile,
                                    filename=attachment_path)
        vm = soup.find("a", text="Visa Varumärke")
        if vm:
            attachment_path = self.store.downloaded_path(
                basefile, attachment="varumarke.jpg")
            attachment_url = re.search("http[^'\"]*", vm["href"]).group(0)
            self.download_if_needed(attachment_url,
                                    basefile,
                                    filename=attachment_path)

        entry = DocumentEntry(self.store.documententry_path(basefile))
        now = datetime.now()
        entry.orig_url = url
        if created:
            entry.orig_created = now
        if updated:
            entry.orig_updated = now
        entry.orig_checked = now
        entry.save()
Example #5
    def importarchive(self, archivedir):
        """Imports downloaded data from an archive from legacy lagen.nu data.

        In particular, creates proper archive storage for older
        versions of each text.

        """
        current = archived = 0
        for f in util.list_dirs(archivedir, ".html"):
            if not f.startswith("downloaded/sfs"):  # sfst or sfsr
                continue
            for regex in self.templ:
                m = re.match(regex, f)
                if not m:
                    continue
                if "vcheck" in m.groupdict():  # silently ignore
                    break
                basefile = "%s:%s" % (m.group("byear"), m.group("bnum"))

                # need to look at the file to find out its version
                # text = t.extractfile(f).read(4000).decode("latin-1")
                text = open(f, "rb").read(4000).decode("latin-1")
                reader = TextReader(string=text)
                updated_to = self._find_uppdaterad_tom(basefile, reader=reader)

                if "vyear" in m.groupdict():  # this file is marked as
                    # an archival version
                    archived += 1
                    version = updated_to

                    if m.group("vyear") == "first":
                        pass
                    else:
                        exp = "%s:%s" % (m.group("vyear"), m.group("vnum"))
                        if version != exp:
                            self.log.warning("%s: Expected %s, found %s" %
                                             (f, exp, version))
                else:
                    version = None
                    current += 1
                    de = DocumentEntry()
                    de.basefile = basefile
                    de.id = self.canonical_uri(basefile, updated_to)
                    # fudge timestamps best as we can
                    de.orig_created = datetime.fromtimestamp(
                        os.path.getctime(f))
                    de.orig_updated = datetime.fromtimestamp(
                        os.path.getmtime(f))
                    de.orig_checked = datetime.now()
                    de.orig_url = self.document_url_template % locals()
                    de.published = datetime.now()
                    de.url = self.generated_url(basefile)
                    de.title = "SFS %s" % basefile
                    # de.set_content()
                    # de.set_link()
                    de.save(self.store.documententry_path(basefile))
                # this yields more reasonable basefiles, but they are not
                # backwards compatible -- skip them for now
                # basefile = basefile.replace("_", "").replace(".", "")
                if "type" in m.groupdict() and m.group("type") == "sfsr":
                    dest = self.store.register_path(basefile)
                    current -= 1  # to offset the previous increment
                else:
                    dest = self.store.downloaded_path(basefile, version)
                self.log.debug("%s: extracting %s to %s" % (basefile, f, dest))
                util.ensure_dir(dest)
                shutil.copy2(f, dest)
                break
            else:
                self.log.warning("Couldn't process %s" % f)
        self.log.info(
            "Extracted %s current versions and %s archived versions" %
            (current, archived))
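A short invocation sketch for importarchive; the repository class name and archive directory below are placeholders, and the expectation that the directory holds downloaded/sfs/... HTML files is inferred from the startswith check in the loop above.

# Hypothetical invocation -- class name and path are placeholders
repo = LegacySFSRepo()
repo.importarchive("/backup/lagen.nu-data")  # expects downloaded/sfs/**/*.html inside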
Example #6
    def download_single(self, basefile, url=None):
        if self.get_parse_options(basefile) == "skip":
            raise DocumentSkippedError(
                "%s should not be downloaded according to options.py" %
                basefile)
        if not url:
            url = self.remote_url(basefile)
            if not url:  # remote_url failed
                return
        filename = self.store.downloaded_path(basefile)  # just the html page
        updated = filesupdated = False
        created = not os.path.exists(filename)
        if (not os.path.exists(filename) or self.config.refresh):
            existed = os.path.exists(filename)
            try:
                updated = self.download_if_needed(url,
                                                  basefile,
                                                  filename=filename)
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 400:
                    # regeringen.se seems to have a problem with the
                    # first req after a search -- unless slowed down,
                    # raises a 400 error. Sleep on it, and try once more
                    sleep(5)
                    updated = self.download_if_needed(url,
                                                      basefile,
                                                      filename=filename)
                else:
                    raise
            docid = url.split("/")[-1]
            if existed:
                if updated:
                    self.log.info("%s: updated from %s" % (basefile, url))
                else:
                    self.log.debug("%s: %s is unchanged, checking PDF files" %
                                   (basefile, filename))
            else:
                self.log.info("%s: download OK from %s" % (basefile, url))

            if self.get_parse_options(basefile) == "metadataonly":
                self.log.debug(
                    "%s: Marked as 'metadataonly', not downloading actual PDF file"
                    % basefile)
            else:
                soup = BeautifulSoup(
                    codecs.open(filename, encoding=self.source_encoding),
                    "lxml")
                cnt = 0
                selected_files = self.find_doc_links(soup, basefile)
                if selected_files:
                    for (filename, filetype, label) in selected_files:
                        fileurl = urljoin(url, filename)
                        basepath = filename.split("/")[-1]
                        filename = self.store.downloaded_path(
                            basefile, attachment=basepath)
                        if not filename.lower().endswith(".pdf"):
                            filename += ".%s" % filetype
                        if self.download_if_needed(fileurl,
                                                   basefile,
                                                   filename=filename):
                            filesupdated = True
                            self.log.debug("    %s is new or updated" %
                                           filename)
                        else:
                            self.log.debug("    %s is unchanged" % filename)
                else:
                    self.log.warning("%s (%s) has no downloadable files" %
                                     (basefile, url))
            if not (updated or filesupdated):
                self.log.debug("%s and all files are unchanged" % filename)
        else:
            self.log.debug("%s: %s already exists" % (basefile, filename))

        entry = DocumentEntry(self.store.documententry_path(basefile))
        now = datetime.now()
        entry.orig_url = url
        if created:
            entry.orig_created = now
        if updated or filesupdated:
            entry.orig_updated = now
        entry.orig_checked = now
        entry.save()

        return updated or filesupdated
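The 400-then-retry workaround in the example above could be factored into a small standalone helper. This is only a sketch under the same assumption (a single retry after a short pause is enough), not code from the repository; the helper name and signature are made up.

import requests
from time import sleep

def download_with_retry(download, url, basefile, filename, delay=5):
    """Call download(url, basefile, filename=...), retrying once after a
    pause if the server answers with HTTP 400."""
    try:
        return download(url, basefile, filename=filename)
    except requests.exceptions.HTTPError as e:
        if e.response.status_code != 400:
            raise
        sleep(delay)  # give the server a moment, then try once more
        return download(url, basefile, filename=filename)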
Example #7
    def download_single(self, basefile, url=None):
        if url is None:
            url = self.remote_url(basefile)
            if not url:  # remote_url failed
                return

        updated = created = False
        checked = True
        mainattachment = None

        if url in self.urlmap:
            attachment = self.urlmap[url]
        else:
            attachment = self.sniff_attachment(url)
        if attachment:
            self.urlmap[url] = attachment
            attachment += ".html"
        else:
            self.urlmap[url] = ''
            attachment = "index.html"

        downloaded_path = self.store.downloaded_path(basefile,
                                                     attachment=attachment)

        created = not os.path.exists(downloaded_path)
        if self.download_if_needed(url, basefile, filename=downloaded_path):
            text = util.readfile(downloaded_path)
            if "<div>Inga tr\xe4ffar</div>" in text:
                self.log.warning(
                    "%s: Could not find this prop at %s, might be a bug" %
                    (basefile, url))
                util.robust_remove(downloaded_path)
                return False
            if created:
                self.log.info("%s: downloaded from %s" % (basefile, url))
            else:
                self.log.info("%s: downloaded new version from %s" %
                              (basefile, url))
            updated = True
        else:
            self.log.debug("%s: exists and is unchanged" % basefile)
            text = util.readfile(downloaded_path)

        soup = BeautifulSoup(text, "lxml")
        del text
        attachment = self.find_attachment(soup)

        extraurls = []
        results = soup.find("div", "search-results-content")
        a = results.find("a", string="Hämta Pdf")
        if a:
            extraurls.append(a.get("href"))
        a = results.find("a", string="Hämta Doc")
        if a:
            extraurls.append(a.get("href"))

        # parse downloaded html/text page and find out extraurls
        for url in extraurls:
            if url.endswith('get=doc'):
                # NOTE: We cannot be sure that this is
                # actually a Word (CDF) file. For older files
                # it might be a WordPerfect file (.wpd) or a
                # RDF file, for newer it might be a .docx. We
                # cannot be sure until we've downloaded it.
                # So we quickly read the first 4 bytes
                r = requests.get(url, stream=True)
                sig = r.raw.read(4)
                # r.raw.close()
                #bodyidx = head.index("\n\n")
                #sig = head[bodyidx:bodyidx+4]
                if sig == b'\xffWPC':
                    doctype = ".wpd"
                elif sig == b'\xd0\xcf\x11\xe0':
                    doctype = ".doc"
                elif sig == b'PK\x03\x04':
                    doctype = ".docx"
                elif sig == b'{\\rt':
                    doctype = ".rtf"
                else:
                    self.log.error(
                        "%s: Attached file has signature %r -- don't know what type this is"
                        % (basefile, sig))
                    continue
            elif url.endswith('get=pdf'):
                doctype = ".pdf"
            else:
                self.log.warning("Unknown doc type %s" % url.split("get=")[-1])
                doctype = None
            if doctype:
                if attachment:
                    filename = self.store.downloaded_path(
                        basefile, attachment=attachment + doctype)
                else:
                    filename = self.store.downloaded_path(basefile,
                                                          attachment="index" +
                                                          doctype)
                self.log.debug("%s: downloading attachment %s" %
                               (basefile, filename))
                self.download_if_needed(url, basefile, filename=filename)

        entry = DocumentEntry(self.store.documententry_path(basefile))
        now = datetime.now()
        entry.orig_url = url
        if created:
            entry.orig_created = now
        if updated:
            entry.orig_updated = now
        if checked:
            entry.orig_checked = now
        entry.save()

        return updated
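The file-signature check in the loop above amounts to a lookup from the first four ("magic") bytes of the download to a file extension. The sketch below restates just that mapping; the signatures are the ones already listed in the example, and the function name is made up.

# Magic-byte signatures -> file extension (as used in the example above)
SIGNATURES = {
    b'\xffWPC': ".wpd",           # WordPerfect
    b'\xd0\xcf\x11\xe0': ".doc",  # OLE/CDF container (legacy Word)
    b'PK\x03\x04': ".docx",       # ZIP container (OOXML)
    b'{\\rt': ".rtf",             # RTF
}

def sniff_doctype(first_bytes):
    """Map the leading bytes of a downloaded file to an extension, or None."""
    return SIGNATURES.get(first_bytes[:4])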
Example #8
    def download_single(self, basefile, url):
        if self.get_parse_options(basefile) == "skip":
            raise errors.DocumentSkippedError(
                "%s should not be downloaded according to options.py" %
                basefile)
        rdffilename = self.store.downloaded_path(basefile,
                                                 attachment="index.rdf")
        if self.get_parse_options(
                basefile) == "metadataonly" and os.path.exists(
                    rdffilename) and (not self.config.refresh):
            # it is kind of bad that we can even get here in these
            # cases (if a rdffile exists, and an empty index.pdf
            # exists, shouldn't download() skip that file? Right now
            # it ignores empty files and passes them to
            # download_single.
            return False

        # url is really a 2-tuple
        url, title = url
        resp = self.session.get(url)
        soup = BeautifulSoup(resp.text, "lxml")
        pdflink = soup.find("a", href=re.compile(".*\.pdf$"))
        pdfurl = pdflink.get("href")
        thumburl = urljoin(url, soup.find("img", "tumnagel").get("src"))
        librisid = url.rsplit("-")[1]
        rdfurl = "http://data.libris.kb.se/open/bib/%s.rdf" % librisid
        filename = self.store.downloaded_path(basefile)
        created = not os.path.exists(filename)
        updated = False

        # download rdf metadata before actual content
        try:
            # it appears that URLs like
            # http://data.libris.kb.se/open/bib/8351225.rdf now
            # returns empty responses. Until we find out the proper
            # RDF endpoint URLs, we should check and warn for this
            # (and infer a minimal RDF by hand from what we can, eg
            # dc:title from the link text)
            self.download_if_needed(rdfurl,
                                    basefile,
                                    filename=rdffilename,
                                    archive=False)
            if os.path.getsize(rdffilename) == 0:
                self.log.warning("%s: %s returned 0 response, infer RDF" %
                                 (basefile, rdfurl))
                base = URIRef("http://libris.kb.se/resource/bib/%s" % librisid)
                fakegraph = Graph()
                fakegraph.bind("dc", str(DC))
                fakegraph.add((base, DC.title, Literal(title, lang="sv")))
                year = basefile.split(":")[0]  # Libris uses str type
                fakegraph.add((base, DC.date, Literal(year)))
                with open(rdffilename, "wb") as fp:
                    fakegraph.serialize(fp, format="pretty-xml")
        except requests.exceptions.HTTPError as e:
            self.log.error("Failed to load attachment: %s" % e)
            raise

        if self.get_parse_options(basefile) == "metadataonly":
            self.log.debug(
                "%s: Marked as 'metadataonly', not downloading actual PDF file"
                % basefile)
            with self.store.open_downloaded(basefile, "w") as fp:
                pass
        else:
            if self.download_if_needed(pdfurl,
                                       basefile) or self.config.refresh:
                if created:
                    self.log.info("%s: download OK from %s" %
                                  (basefile, pdfurl))
                else:
                    self.log.info("%s: download OK (new version) from %s" %
                                  (basefile, pdfurl))
                updated = True
                try:
                    self.download_if_needed(
                        thumburl,
                        basefile,
                        filename=self.store.downloaded_path(
                            basefile, attachment="thumb.jpg"))
                except requests.exceptions.HTTPError as e:
                    self.log.error("Failed to load attachment: %s" % e)
                    raise
            else:
                self.log.debug("%s: exists and is unchanged" % basefile)
        entry = DocumentEntry(self.store.documententry_path(basefile))
        now = datetime.now()
        entry.orig_url = url  # or pdfurl?
        if created:
            entry.orig_created = now
        if updated:
            entry.orig_updated = now
        entry.orig_checked = now
        entry.save()
        return updated
Example #9
    def download_single(self, basefile, url):
        if self.get_parse_options(basefile) == "skip":
            raise errors.DocumentSkippedError("%s should not be downloaded according to options.py" % basefile)
        rdffilename = self.store.downloaded_path(basefile, attachment="index.rdf")
        if self.get_parse_options(basefile) == "metadataonly" and os.path.exists(rdffilename) and (not self.config.refresh):
            # it is kind of bad that we can even get here in these
            # cases (if a rdffile exists, and an empty index.pdf
            # exists, shouldn't download() skip that file? Right now
            # it ignores empty files and passes them to
            # download_single.
            return False
        
        # url is really a 2-tuple
        url, title = url
        resp = self.session.get(url)
        soup = BeautifulSoup(resp.text, "lxml")
        pdflink = soup.find("a", href=re.compile(".*\.pdf$"))
        pdfurl = pdflink.get("href")
        thumburl = urljoin(url, soup.find("img", "tumnagel").get("src"))
        librisid = url.rsplit("-")[1]
        rdfurl = "http://data.libris.kb.se/open/bib/%s.rdf" % librisid
        filename = self.store.downloaded_path(basefile)
        created = not os.path.exists(filename)
        updated = False
        
        # download rdf metadata before actual content
        try:
            # it appears that URLs like
            # http://data.libris.kb.se/open/bib/8351225.rdf now
            # returns empty responses. Until we find out the proper
            # RDF endpoint URLs, we should check and warn for this
            # (and infer a minimal RDF by hand from what we can, eg
            # dc:title from the link text)
            self.download_if_needed(rdfurl, basefile,
                                    filename=rdffilename,
                                    archive=False)
            if os.path.getsize(rdffilename) == 0:
                self.log.warning("%s: %s returned 0 response, infer RDF" %
                                 (basefile, rdfurl))
                base = URIRef("http://libris.kb.se/resource/bib/%s" %
                              librisid)
                fakegraph = Graph()
                fakegraph.bind("dc", str(DC))
                fakegraph.add((base, DC.title, Literal(title, lang="sv")))
                year = basefile.split(":")[0] # Libris uses str type
                fakegraph.add((base, DC.date, Literal(year)))
                with open(rdffilename, "wb") as fp:
                    fakegraph.serialize(fp, format="pretty-xml")
        except requests.exceptions.HTTPError as e:
            self.log.error("Failed to load attachment: %s" % e)
            raise

        if self.get_parse_options(basefile) == "metadataonly":
            self.log.debug("%s: Marked as 'metadataonly', not downloading actual PDF file" % basefile)
            with self.store.open_downloaded(basefile, "w") as fp:
                pass
        else:
            if self.download_if_needed(pdfurl, basefile) or self.config.refresh:
                if created:
                    self.log.info("%s: download OK from %s" % (basefile, pdfurl))
                else:
                    self.log.info(
                        "%s: download OK (new version) from %s" % (basefile, pdfurl))
                updated = True
                try:
                    self.download_if_needed(
                        thumburl, basefile,
                        filename=self.store.downloaded_path(
                            basefile, attachment="thumb.jpg"))
                except requests.exceptions.HTTPError as e:
                    self.log.error("Failed to load attachment: %s" % e)
                    raise
            else:
                self.log.debug("%s: exists and is unchanged" % basefile)
        entry = DocumentEntry(self.store.documententry_path(basefile))
        now = datetime.now()
        entry.orig_url = url  # or pdfurl?
        if created:
            entry.orig_created = now
        if updated:
            entry.orig_updated = now
        entry.orig_checked = now
        entry.save()
        return updated
Example #10
    def download_single(self, basefile, url=None):
        if not url:
            url = self.remote_url(basefile)
            if not url:  # remote_url failed
                return
        filename = self.store.downloaded_path(basefile)  # just the html page
        updated = filesupdated = False
        created = not os.path.exists(filename)
        if (not os.path.exists(filename) or self.config.refresh):
            existed = os.path.exists(filename)
            updated = self.download_if_needed(url, basefile, filename=filename)
            docid = url.split("/")[-1]
            if existed:
                if updated:
                    self.log.info("%s: updated from %s" % (basefile, url))
                else:
                    self.log.debug("%s: %s is unchanged, checking PDF files" %
                                   (basefile, filename))
            else:
                self.log.info("%s: downloaded from %s" % (basefile, url))

            soup = BeautifulSoup(
                codecs.open(filename, encoding=self.source_encoding), "lxml")
            cnt = 0
            selected_files = self.find_doc_links(soup, basefile)
            if selected_files:
                for (filename, filetype, label) in selected_files:
                    fileurl = urljoin(url, filename)
                    basepath = filename.split("/")[-1]
                    filename = self.store.downloaded_path(basefile,
                                                          attachment=basepath)
                    if not filename.lower().endswith(".pdf"):
                        filename += ".%s" % filetype
                    if self.download_if_needed(fileurl,
                                               basefile,
                                               filename=filename):
                        filesupdated = True
                        self.log.debug("    %s is new or updated" % filename)
                    else:
                        self.log.debug("    %s is unchanged" % filename)
            else:
                self.log.warning("%s (%s) has no downloadable files" %
                                 (basefile, url))
            if not (updated or filesupdated):
                self.log.debug("%s and all files are unchanged" % filename)
        else:
            self.log.debug("%s: %s already exists" % (basefile, filename))

        entry = DocumentEntry(self.store.documententry_path(basefile))
        now = datetime.now()
        entry.orig_url = url
        if created:
            entry.orig_created = now
        if updated or filesupdated:
            entry.orig_updated = now
        entry.orig_checked = now
        entry.save()

        return updated or filesupdated
Example #11
    def download_single(self, basefile, url=None):
        if url is None:
            url = self.remote_url(basefile)
            if not url:  # remote_url failed
                return

        updated = created = False
        checked = True
        mainattachment = None

        if url in self.urlmap:
            attachment = self.urlmap[url]
        else:
            attachment = self.sniff_attachment(url)
        if attachment:
            self.urlmap[url] = attachment
            attachment += ".html"
        else:
            self.urlmap[url] = ''
            attachment = "index.html"
        
        downloaded_path = self.store.downloaded_path(basefile,
                                                     attachment=attachment)
        
        created = not os.path.exists(downloaded_path)
        if self.download_if_needed(url, basefile, filename=downloaded_path):
            text = util.readfile(downloaded_path)
            if "<div>Inga tr\xe4ffar</div>" in text:
                self.log.warning("%s: Could not find this prop at %s, might be a bug" % (basefile, url))
                util.robust_remove(downloaded_path)
                return False
            if created:
                self.log.info("%s: download OK from %s" % (basefile, url))
            else:
                self.log.info(
                    "%s: download OK (new version) from %s" % (basefile, url))
            updated = True
        else:
            self.log.debug("%s: exists and is unchanged" % basefile)
            text = util.readfile(downloaded_path)
            
        soup = BeautifulSoup(text, "lxml")
        del text
        attachment = self.find_attachment(soup)

        extraurls = []
        results = soup.find("div", "search-results-content")
        a = results.find("a", string="Hämta Pdf")
        if a:
            extraurls.append(a.get("href"))
        a = results.find("a", string="Hämta Doc") 
        if a:
            extraurls.append(a.get("href"))
        

        # parse downloaded html/text page and find out extraurls
        for url in extraurls:
            if url.endswith('get=doc'):
                # NOTE: We cannot be sure that this is
                # actually a Word (CDF) file. For older files
                # it might be a WordPerfect file (.wpd) or a
                # RDF file, for newer it might be a .docx. We
                # cannot be sure until we've downloaded it.
                # So we quickly read the first 4 bytes
                r = requests.get(url, stream=True)
                sig = r.raw.read(4)
                # r.raw.close()
                #bodyidx = head.index("\n\n")
                #sig = head[bodyidx:bodyidx+4]
                if sig == b'\xffWPC':
                    doctype = ".wpd"
                elif sig == b'\xd0\xcf\x11\xe0':
                    doctype = ".doc"
                elif sig == b'PK\x03\x04':
                    doctype = ".docx"
                elif sig == b'{\\rt':
                    doctype = ".rtf"
                else:
                    self.log.error(
                        "%s: Attached file has signature %r -- don't know what type this is" % (basefile, sig))
                    continue
            elif url.endswith('get=pdf'):
                doctype = ".pdf"
            else:
                self.log.warning("Unknown doc type %s" %
                                 url.split("get=")[-1])
                doctype = None
            if doctype:
                if attachment:
                    filename = self.store.downloaded_path(
                        basefile, attachment=attachment + doctype)
                else:
                    filename = self.store.downloaded_path(
                        basefile, attachment="index" + doctype)
                self.log.debug("%s: downloading attachment %s" % (basefile, filename))
                self.download_if_needed(url, basefile, filename=filename)

        entry = DocumentEntry(self.store.documententry_path(basefile))
        now = datetime.now()
        entry.orig_url = url
        if created:
            entry.orig_created = now
        if updated:
            entry.orig_updated = now
        if checked:
            entry.orig_checked = now
        entry.save()

        return updated
Example #12
    def download_single(self, basefile, url=None):
        if self.get_parse_options(basefile) == "skip":
            raise DocumentSkippedError("%s should not be downloaded according to options.py" % basefile)
        if not url:
            url = self.remote_url(basefile)
            if not url:  # remote_url failed
                return
        filename = self.store.downloaded_path(basefile)  # just the html page
        updated = filesupdated = False
        created = not os.path.exists(filename)
        if (not os.path.exists(filename) or self.config.refresh):
            existed = os.path.exists(filename)
            try:
                updated = self.download_if_needed(url, basefile, filename=filename)
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 400:
                    # regeringen.se seems to have a problem with the
                    # first req after a search -- unless slowed down,
                    # raises a 400 error. Sleep on it, and try once more
                    sleep(5)
                    updated = self.download_if_needed(url, basefile, filename=filename)
                else:
                    raise
            docid = url.split("/")[-1]
            if existed:
                if updated:
                    self.log.info("%s: updated from %s" % (basefile, url))
                else:
                    self.log.debug("%s: %s is unchanged, checking PDF files" %
                                   (basefile, filename))
            else:
                self.log.info("%s: download OK from %s" % (basefile, url))

            if self.get_parse_options(basefile) == "metadataonly":
                self.log.debug("%s: Marked as 'metadataonly', not downloading actual PDF file" % basefile)
            else:
                soup = BeautifulSoup(codecs.open(filename, encoding=self.source_encoding), "lxml")
                cnt = 0
                selected_files = self.find_doc_links(soup, basefile)
                if selected_files:
                    for (filename, filetype, label) in selected_files:
                        fileurl = urljoin(url, filename)
                        basepath = filename.split("/")[-1]
                        filename = self.store.downloaded_path(basefile, attachment=basepath)
                        if not filename.lower().endswith(".pdf"):
                            filename += ".%s" % filetype
                        if self.download_if_needed(fileurl, basefile, filename=filename):
                            filesupdated = True
                            self.log.debug(
                                "    %s is new or updated" % filename)
                        else:
                            self.log.debug("    %s is unchanged" % filename)
                else:
                    self.log.warning(
                        "%s (%s) has no downloadable files" % (basefile, url))
            if not (updated or filesupdated):
                self.log.debug("%s and all files are unchanged" % filename)
        else:
            self.log.debug("%s: %s already exists" % (basefile, filename))

        entry = DocumentEntry(self.store.documententry_path(basefile))
        now = datetime.now()
        entry.orig_url = url
        if created:
            entry.orig_created = now
        if updated or filesupdated:
            entry.orig_updated = now
        entry.orig_checked = now
        entry.save()

        return updated or filesupdated
Example #13
    def download_single(self, basefile, url):
        # unpack the tuples we may receive instead of plain strings
        if isinstance(basefile, tuple):
            basefile, attachment = basefile
            if attachment:
                mainattachment = attachment + ".html"
            else:
                mainattachment = None
        else:
            # plain-string basefile: no separate attachment to track
            attachment = mainattachment = None
        if isinstance(url, tuple):
            url, extraurls = url
        else:
            # plain-string url: no extra document links to fetch
            extraurls = []
        updated = created = False
        checked = True

        filename = self.store.downloaded_path(basefile, attachment=mainattachment)
        created = not os.path.exists(filename)
        if self.download_if_needed(url, basefile, filename=filename):
            if created:
                self.log.info("%s: downloaded from %s" % (basefile, url))
            else:
                self.log.info(
                    "%s: downloaded new version from %s" % (basefile, url))
            updated = True
        else:
            self.log.debug("%s: exists and is unchanged" % basefile)

        for url in extraurls:
            if url.endswith('msword.application'):
                # NOTE: We cannot be sure that this is
                # actually a Word (CDF) file. For older files
                # it might be a WordPerfect file (.wpd) or a
                # RDF file, for newer it might be a .docx. We
                # cannot be sure until we've downloaded it.
                # So we quickly read the first 4 bytes
                r = requests.get(url, stream=True)
                sig = r.raw.read(4)
                # r.raw.close()
                #bodyidx = head.index("\n\n")
                #sig = head[bodyidx:bodyidx+4]
                if sig == b'\xffWPC':
                    doctype = ".wpd"
                elif sig == b'\xd0\xcf\x11\xe0':
                    doctype = ".doc"
                elif sig == b'PK\x03\x04':
                    doctype = ".docx"
                elif sig == b'{\\rt':
                    doctype = ".rtf"
                else:
                    self.log.error(
                        "%s: Attached file has signature %r -- don't know what type this is" % (basefile, sig))
                    continue
            elif url.endswith('pdf.application'):
                doctype = ".pdf"
            else:
                self.log.warning("Unknown doc type %s" %
                                 td.a['href'].split("=")[-1])
                doctype = None
            if doctype:
                if attachment:
                    filename = self.store.downloaded_path(
                        basefile, attachment=attachment + doctype)
                else:
                    filename = self.store.downloaded_path(basefile, attachment="index" + doctype)
                self.log.debug("%s: downloading attachment %s" % (basefile, filename))
                self.download_if_needed(url, basefile, filename=filename)

        if mainattachment is None:
            entry = DocumentEntry(self.store.documententry_path(basefile))
            now = datetime.now()
            entry.orig_url = url
            if created:
                entry.orig_created = now
            if updated:
                entry.orig_updated = now
            if checked:
                entry.orig_checked = now
            entry.save()

        return updated
Example #14
    def download_single(self, basefile, url=None):
        if not url:
            url = self.remote_url(basefile)
            if not url:  # remote_url failed
                return
        filename = self.store.downloaded_path(basefile)  # just the html page
        updated = pdfupdated = False
        created = not os.path.exists(filename)
        if (not os.path.exists(filename) or self.config.force):
            existed = os.path.exists(filename)
            updated = self.download_if_needed(url, basefile, filename=filename)
            docid = url.split("/")[-1]
            if existed:
                if updated:
                    self.log.debug(
                        "%s existed, but a new ver was downloaded" % filename)
                else:
                    self.log.debug(
                        "%s is unchanged -- checking PDF files" % filename)
            else:
                self.log.debug(
                    "%s did not exist, so it was downloaded" % filename)

            soup = BeautifulSoup(
                codecs.open(filename, encoding=self.source_encoding), "lxml")
            cnt = 0
            pdffiles = self.find_pdf_links(soup, basefile)
            if pdffiles:
                for pdffile in pdffiles:
                    # note; the pdfurl goes to a redirect script; however that
                    # part of the URL tree (/download/*) is off-limits for
                    # robots. But we can figure out the actual URL anyway!
                    if len(docid) > 4:
                        path = "c6/%02d/%s/%s" % (
                            int(docid[:-4]), docid[-4:-2], docid[-2:])
                    else:
                        path = "c4/%02d/%s" % (int(docid[:-2]), docid[-2:])
                    pdfurl = "http://www.regeringen.se/content/1/%s/%s" % (
                        path, pdffile)
                    pdffilename = self.store.downloaded_path(basefile, attachment=pdffile)
                    if self.download_if_needed(pdfurl, basefile, filename=pdffilename):
                        pdfupdated = True
                        self.log.debug(
                            "    %s is new or updated" % pdffilename)
                    else:
                        self.log.debug("    %s is unchanged" % pdffilename)
            else:
                self.log.warning(
                    "%s (%s) has no downloadable PDF files" % (basefile, url))
            if not (updated or pdfupdated):
                self.log.debug("%s and all PDF files are unchanged" % filename)
        else:
            self.log.debug("%s already exists" % (filename))

        entry = DocumentEntry(self.store.documententry_path(basefile))
        now = datetime.now()
        entry.orig_url = url
        if created:
            entry.orig_created = now
        if updated or pdfupdated:
            entry.orig_updated = now
        entry.orig_checked = now
        entry.save()

        return updated or pdfupdated
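A worked sketch of the URL-path reconstruction used above: the numeric docid taken from the page URL is split into two-digit groups to rebuild the /content/1/... path that the robots-restricted /download/ redirect would have resolved to. The sample docids below are made up.

def content_path(docid):
    # mirrors the path logic in download_single above
    if len(docid) > 4:
        return "c6/%02d/%s/%s" % (int(docid[:-4]), docid[-4:-2], docid[-2:])
    else:
        return "c4/%02d/%s" % (int(docid[:-2]), docid[-2:])

# e.g. content_path("123456") -> "c6/12/34/56"
#      content_path("1234")   -> "c4/12/34"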