def extract_body(self, fp, basefile):
    """Return the document body as a PDF page reader or an HTML soup.

    If a downloaded ``index.pdf`` attachment exists for ``basefile``,
    the passed-in ``fp`` is ignored: the stream returned by
    ``self.parse_open(basefile)`` is read with ``StreamingPDFReader``
    and every page gets its ``src`` set to the path of the PDF.
    Otherwise the contents of ``fp`` are parsed as HTML.

    :param fp: An open file-like object (only used when no PDF exists)
    :param basefile: The basefile of the document being parsed
    :returns: The reader returned by ``StreamingPDFReader().read``, or
              a ``BeautifulSoup`` object built with the lxml parser
    :raises errors.DocumentRemovedError: If the HTML content is exactly
            the placeholder text ``Propositionen ej utgiven``
    """
    pdffile = self.store.downloaded_path(basefile, attachment="index.pdf")
    if os.path.exists(pdffile):
        # fp can now be a pointer to a hocr file or a pdf2xml file;
        # the parser is guessed from the file name.
        fp = self.parse_open(basefile)
        parser = "ocr" if ".hocr." in util.name_from_fp(fp) else "xml"
        reader = StreamingPDFReader().read(fp, parser=parser)
        for page in reader:
            page.src = pdffile
        return reader

    # fp points to a HTML file (or a StringIO object containing html
    # taken from index.xml), which we can use directly. A real file
    # will be a raw bytestream of a latin-1 file.
    try:
        filename = util.name_from_fp(fp)
    except ValueError:
        # fp has no usable name
        self.log.debug("%s: Loading placeholder soup" % (basefile))
    else:
        self.log.debug("%s: Loading soup from %s" % (basefile, filename))
    text = fp.read()
    placeholder = "Propositionen ej utgiven"
    if isinstance(text, bytes):
        # A bytes object never compares equal to a str, so compare
        # against the encoded marker (it is pure ASCII).
        never_published = text == placeholder.encode("ascii")
    else:
        never_published = text == placeholder
    if never_published:
        raise errors.DocumentRemovedError("%s was never published" % basefile)
    return BeautifulSoup(text, "lxml")
def _process_file(self, filename, buf, destdir, origin=""): """ Helper function to concatenate or copy CSS/JS (optionally processing them with e.g. Scss) or other files to correct place under the web root directory. :param filename: The name (relative to the ferenda package) of the file :param buf: A buffer into which the contents of the file is written (if combineresources == True) :param destdir: The directory into which the file will be copied (unless combineresources == True) :param origin: The source of the configuration that specifies this file :returns: The URL path of the resulting file, relative to the web root (or None if combineresources == True) :rtype: str """ if filename.startswith("http://") or filename.startswith("https://"): if self.config.combineresources: raise errors.ConfigurationError( "makeresources: Can't use combineresources=True in combination with external js/css URLs (%s)" % filename) self.log.debug("Using external url %s" % filename) return filename try: fp = self.resourceloader.openfp(filename, binary=True) except errors.ResourceNotFound: self.log.warning("file %(filename)s (specified in %(origin)s)" " doesn't exist" % locals()) return None (base, ext) = os.path.splitext(filename) if self.config.combineresources: self.log.debug("combining %s into buffer" % filename) d = fp.read() buf.write(d) fp.close() return None else: # FIXME: don't copy (at least not log) if the outfile # already exists. # self.log.debug("writing %s out to %s" % (filename, destdir)) outfile = destdir + os.sep + os.path.basename(filename) if (os.path.islink(outfile) and os.path.relpath( os.path.join(os.path.dirname(outfile), os.readlink(outfile))) == util.name_from_fp(fp)): self.log.warning( "%s is a symlink to source file %s, won't overwrite" % (outfile, util.name_from_fp(fp))) else: util.ensure_dir(outfile) with open(outfile, "wb") as fp2: fp2.write(fp.read()) fp.close() return self._filepath_to_urlpath(outfile, 2)
def close(self, *args, **kwargs):
    """Close the wrapped file object and, for writes, publish the result.

    In write mode the data has been written to a temporary file. After
    closing it, the temporary file replaces ``self.filename`` only if
    that file does not exist yet or its content differs; otherwise the
    temporary file is deleted and ``self.filename`` is left untouched.

    In any other mode, file objects wrapped by ``self.fp`` are closed
    explicitly before ``self.fp`` itself. Calling this method again
    after the wrapper has dropped its inner handle is harmless.

    :returns: Whatever ``self.fp.close()`` returns
    """
    if "w" in self.mode:
        tempname = util.name_from_fp(self.fp)
        ret = self.fp.close()
        # shallow=False forces a content comparison; the default would
        # treat two files with identical size and mtime as equal
        # without looking at their bytes.
        if (not os.path.exists(self.filename)
                or not filecmp.cmp(tempname, self.filename, shallow=False)):
            util.ensure_dir(self.filename)
            shutil.move(tempname, self.filename)
            # since _open uses NamedTemporaryFile, which creates
            # files only readable by the creating user, we need to
            # set more liberal permissions (rw-rw-r--). FIXME: This
            # should respect os.umask()
            os.chmod(
                self.filename,
                stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP |
                stat.S_IROTH)
        else:
            os.unlink(tempname)
        return ret

    # This is needed sometimes since Bzip2File/LZMAFile/GzipFile
    # doesn't close the open file objects that they wrap.
    # '_fp' is for Bzip2File/LZMAFile with IOBufferedReader, 'fileobj'
    # for GzipFile in the same situation. The attribute may exist but
    # be None (e.g. once the wrapper has been closed), so only close
    # real objects.
    for attr in ("_fp", "fileobj"):
        wrapped = getattr(self.fp, attr, None)
        if wrapped is not None:
            wrapped.close()
    return self.fp.close()
def _process_file(self, filename, buf, destdir, origin=""): """ Helper function to concatenate or copy CSS/JS (optionally processing them with e.g. Scss) or other files to correct place under the web root directory. :param filename: The name (relative to the ferenda package) of the file :param buf: A buffer into which the contents of the file is written (if combineresources == True) :param destdir: The directory into which the file will be copied (unless combineresources == True) :param origin: The source of the configuration that specifies this file :returns: The URL path of the resulting file, relative to the web root (or None if combineresources == True) :rtype: str """ if filename.startswith("http://") or filename.startswith("https://"): if self.config.combineresources: raise errors.ConfigurationError( "makeresources: Can't use combineresources=True in combination with external js/css URLs (%s)" % filename) self.log.debug("Using external url %s" % filename) return filename try: fp = self.resourceloader.openfp(filename, binary=True) except errors.ResourceNotFound: self.log.warning("file %(filename)s (specified in %(origin)s)" " doesn't exist" % locals()) return None (base, ext) = os.path.splitext(filename) if self.config.combineresources: self.log.debug("combining %s into buffer" % filename) d = fp.read() buf.write(d) fp.close() return None else: # FIXME: don't copy (at least not log) if the outfile # already exists. # self.log.debug("writing %s out to %s" % (filename, destdir)) outfile = destdir + os.sep + os.path.basename(filename) if (os.path.islink(outfile) and os.path.relpath(os.path.join(os.path.dirname(outfile), os.readlink(outfile))) == util.name_from_fp(fp)): self.log.warning("%s is a symlink to source file %s, won't overwrite" % (outfile, util.name_from_fp(fp))) else: util.ensure_dir(outfile) with open(outfile, "wb") as fp2: fp2.write(fp.read()) fp.close() return self._filepath_to_urlpath(outfile, 2)
def extract_body(self, fp, basefile):
    """Read the pages available through ``fp`` and link each to its image.

    Every page of the resulting reader gets a ``src`` of the form
    ``<canonical uri>/sid<page number>.png``.

    :param fp: An open file object for a hOCR HTML file or a PDF2XML file
    :param basefile: The basefile of the document being parsed
    :returns: The reader returned by ``StreamingPDFReader().read``
    :raises DocumentRemovedError: If the reader reports itself as empty
    """
    # If we can assume that the fp is a hOCR HTML file and not a
    # PDF2XML file, use the alternate parser. FIXME: There ought to be
    # a cleaner way than guessing based on filename
    if ".hocr." in util.name_from_fp(fp):
        parser_kind = "ocr"
    else:
        parser_kind = "xml"
    pages = StreamingPDFReader().read(fp, parser=parser_kind)
    uri_prefix = self.canonical_uri(basefile)
    for pg in pages:
        pg.src = "%s/sid%s.png" % (uri_prefix, pg.number)
    if not pages.is_empty():
        return pages
    raise DocumentRemovedError(dummyfile=self.store.parsed_path(basefile))
def extract_body(self, fp, basefile):
    """Return a reader for the document body, plain text or page based.

    Files whose name ends in ``.txt`` or ``.txt.bz2`` are read in full
    and wrapped in a ``TextReader``. Anything else is delegated to the
    parent class, after which every page's ``src`` is pointed at the
    downloaded ``index.pdf`` attachment.

    :param fp: An open file object, in text or bytestream mode
    :param basefile: The basefile of the document being parsed
    :returns: A ``TextReader`` or the reader produced by the parent class
    """
    source_name = util.name_from_fp(fp)
    if not source_name.endswith((".txt", ".txt.bz2")):
        pages = super(PropTrips, self).extract_body(fp, basefile)
        pdf_path = self.store.downloaded_path(basefile,
                                              attachment="index.pdf")
        for pg in pages:
            pg.src = pdf_path
        return pages

    raw = fp.read()
    # fp may have been opened in bytestream mode
    text = raw.decode("utf-8") if isinstance(raw, bytes) else raw
    return TextReader(string=text)
def close(self, *args, **kwargs):
    """Close the wrapped file object and, for writes, publish the result.

    In write mode the data has been written to a temporary file. After
    closing it, the temporary file replaces ``self.filename`` only if
    that file does not exist yet or its content differs; otherwise the
    temporary file is deleted and ``self.filename`` is left untouched.
    In any other mode the wrapped file object is simply closed.

    :returns: Whatever ``self.fp.close()`` returns
    """
    if "w" not in self.mode:
        return self.fp.close()

    tempname = util.name_from_fp(self.fp)
    ret = self.fp.close()
    # shallow=False forces a content comparison; the default would
    # treat two files with identical size and mtime as equal without
    # looking at their bytes, silently dropping the new content.
    if (not os.path.exists(self.filename)
            or not filecmp.cmp(tempname, self.filename, shallow=False)):
        util.ensure_dir(self.filename)
        shutil.move(tempname, self.filename)
    else:
        os.unlink(tempname)
    return ret
def extract_body(self, fp, basefile):
    """Parse ``fp`` into a page reader whose pages reference PNG images.

    The ``src`` of each page is set to
    ``<canonical uri>/sid<page number>.png``.

    :param fp: An open file object for a hOCR HTML file or a PDF2XML file
    :param basefile: The basefile of the document being parsed
    :returns: The reader returned by ``StreamingPDFReader().read``
    :raises DocumentRemovedError: If the reader reports itself as empty
    """
    # A ".hocr." part in the file name is taken to mean a hOCR HTML
    # file rather than a PDF2XML file, which needs the alternate
    # parser. FIXME: There ought to be a cleaner way than guessing
    # based on filename
    source_name = util.name_from_fp(fp)
    looks_like_hocr = ".hocr." in source_name
    reader = StreamingPDFReader().read(
        fp, parser="ocr" if looks_like_hocr else "xml")

    root = self.canonical_uri(basefile)
    for current in reader:
        current.src = "%s/sid%s.png" % (root, current.number)

    if reader.is_empty():
        placeholder = self.store.parsed_path(basefile)
        raise DocumentRemovedError(dummyfile=placeholder)
    return reader
def close(self, *args, **kwargs):
    """Close the wrapped file object and, for writes, publish the result.

    In write mode the data has been written to a temporary file. After
    closing it, the temporary file replaces ``self.filename`` only if
    that file does not exist yet or its content differs; otherwise the
    temporary file is deleted and ``self.filename`` is left untouched.

    In any other mode, file objects wrapped by ``self.fp`` are closed
    explicitly before ``self.fp`` itself. Calling this method again
    after the wrapper has dropped its inner handle is harmless.

    :returns: Whatever ``self.fp.close()`` returns
    """
    if "w" not in self.mode:
        # This is needed sometimes since Bzip2File/LZMAFile/GzipFile
        # doesn't close the open file objects that they wrap.
        # for Bzip2File/LZMAFile with IOBufferedReader
        inner = getattr(self.fp, '_fp', None)
        if inner is not None:  # attribute can be present but None
            inner.close()
        # for GzipFile in the same situation
        inner = getattr(self.fp, 'fileobj', None)
        if inner is not None:
            inner.close()
        return self.fp.close()

    tempname = util.name_from_fp(self.fp)
    ret = self.fp.close()
    # Compare contents, not just stat signatures: with the default
    # shallow comparison two files of equal size and mtime count as
    # equal even when their bytes differ.
    unchanged = (os.path.exists(self.filename)
                 and filecmp.cmp(tempname, self.filename, shallow=False))
    if unchanged:
        os.unlink(tempname)
    else:
        util.ensure_dir(self.filename)
        shutil.move(tempname, self.filename)
        # since _open uses NamedTemporaryFile, which creates
        # files only readable by the creating user, we need to
        # set more liberal permissions (rw-rw-r--). FIXME: This
        # should respect os.umask()
        os.chmod(self.filename,
                 stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP |
                 stat.S_IROTH)
    return ret