def textreader_from_basefile(self, basefile, encoding):
    """Return a TextReader over the plain-text rendering of the
    downloaded PDF for *basefile*.

    Copies the downloaded PDF into the intermediate store, runs
    ``pdftotext`` on it (which writes a ``.txt`` next to it), removes
    the temporary PDF and wraps the resulting text file.

    :param basefile: The basefile of the downloaded document
    :param encoding: The text encoding to use when reading the result
    :returns: A TextReader (UNIX line separators) over the extracted text
    """
    source_pdf = self.store.downloaded_path(basefile)
    scratch_pdf = self.store.path(basefile, "intermediate", ".pdf")
    textfile = self.store.path(basefile, "intermediate", ".txt")
    util.copy_if_different(source_pdf, scratch_pdf)
    # pdftotext with a single argument writes its output beside the
    # input, i.e. exactly at `textfile`
    util.runcmd("pdftotext %s" % scratch_pdf, require_success=True)
    util.robust_remove(scratch_pdf)
    return TextReader(textfile, encoding=encoding,
                      linesep=TextReader.UNIX)
def download_test(self, specfile):
    """Run a repo's download() against canned HTTP responses described by
    *specfile* (a JSON file mapping URLs to local fixture files), then
    compare the downloaded result against the files marked ``expect``.
    """
    # Replacement for requests.get: serve the fixture file that the
    # spec maps this URL to, as a Mock response object.
    def my_get(url, **kwargs):
        urlspec = spec[url]
        # A bare string value is shorthand for {'file': <string>}
        if isinstance(urlspec, str):
            urlspec = {'file': urlspec}
        if 'charset' not in urlspec:
            urlspec['charset'] = 'utf-8'
        url_location = os.path.join(os.path.dirname(specfile),
                                    urlspec['file'])
        res = Mock()
        # load the .content property
        with open(url_location, "rb") as fp:
            res.content = fp.read()
        # but only load .text if a charset is present (note
        # default value of 'utf-8' above -- set 'charset': null in
        # the json file for binary files
        if urlspec['charset']:
            with codecs.open(url_location, "r",
                             encoding=urlspec['charset']) as fp:
                res.text = fp.read()
        # FIXME: Using a defaultdict ensures that we'll never trip
        # over the non-existance of certain headers. WE should
        # specify only the most basic headers to make sure calling
        # code doesn't rely on eg. the etag header always being
        # there, because it won't
        res.headers = collections.defaultdict(lambda: None)
        res.headers['X-These-Headers-Are'] = 'Faked'
        res.status_code = 200
        return res

    with codecs.open(specfile, encoding="utf-8") as fp:
        spec = json.load(fp)
    # Run the actual download code with requests.get faked out
    with patch('requests.get', side_effect=my_get):
        self.repo.download()

    # organize a temporary copy of files that we can compare our
    # results to
    wantdir = "%s/%s-want" % (self.datadir, self.repoclass.alias)
    expected = False
    for url in spec:
        if "expect" in spec[url]:
            expected = True
            sourcefile = os.path.join(os.path.dirname(specfile),
                                      spec[url]['file'])
            wantfile = "%s/%s" % (wantdir, spec[url]['expect'])
            util.copy_if_different(sourcefile, wantfile)
    if expected:
        self.assertEqualDirs(wantdir,
                             "%s/%s" % (self.datadir,
                                        self.repoclass.alias))
    else:
        # A spec with no 'expect' entries can't verify anything --
        # treat it as a broken test rather than a silent pass.
        self.fail('No files were marked as "expect" in specfile %s'
                  % specfile)
def create_external_resources(self, doc):
    """Create the non-XHTML files (an ``index.css`` stylesheet and the
    per-page background PNG references) that accompany the parsed
    document.

    :param doc: The document object; ``doc.body`` is expected to be an
                iterable of :py:class:`PDFReader` objects.
    """
    cssfile = self.store.parsed_path(doc.basefile, attachment="index.css")
    with open(cssfile, "w") as fp:
        # Create CSS header with fontspecs
        for pdf in doc.body:
            assert isinstance(pdf, PDFReader)
            for spec in list(pdf.fontspec.values()):
                fp.write(".fontspec%s {font: %spx %s; color: %s;}\n" %
                         (spec['id'], spec['size'], spec['family'],
                          spec['color']))
        # 2 Copy all created png files to their correct locations
        totcnt = 0
        for pdf in doc.body:
            # FIX: compute the attachment stem per-PDF. This was
            # previously computed once, before this loop, from whatever
            # `pdf` was left over from the fontspec loop above, so a
            # document body containing several PDFs would look up every
            # page image under the *last* PDF's basename.
            pdfbase = os.path.splitext(os.path.basename(pdf.filename))[0]
            cnt = 0
            for page in pdf:
                totcnt += 1
                cnt += 1
                src = self.store.intermediate_path(
                    doc.basefile,
                    attachment="%s%03d.png" % (pdfbase, page.number))
                dest = self.store.parsed_path(
                    doc.basefile,
                    attachment="%s%03d.png" % (pdfbase, page.number))
                if util.copy_if_different(src, dest):
                    self.log.debug("Copied %s to %s" % (src, dest))
                fp.write("#page%03d { background: url('%s');}\n" %
                         (cnt, os.path.basename(dest)))
def create_external_resources(self, doc):
    """Create the non-XHTML files (an ``index.css`` stylesheet and the
    per-page background PNGs) that accompany the parsed document.

    :param doc: The document object; ``doc.body`` is expected to be an
                iterable of :py:class:`PDFReader` objects.
    :returns: The paths of all files created
    :rtype: list
    """
    resources = []
    cssfile = self.store.parsed_path(doc.basefile, attachment="index.css")
    resources.append(cssfile)
    util.ensure_dir(cssfile)
    with open(cssfile, "w") as fp:
        # Create CSS header with fontspecs
        for pdf in doc.body:
            assert isinstance(pdf, PDFReader), "doc.body is %s, not PDFReader -- still need to access fontspecs etc" % type(pdf)
            for spec in list(pdf.fontspec.values()):
                fp.write(".fontspec%s {font: %spx %s; color: %s;}\n" %
                         (spec['id'], spec['size'], spec['family'],
                          spec['color']))
        # 2 Copy all created png files to their correct locations
        # FIX: dropped the unused `pdfbase = os.path.splitext(...)`
        # assignment that used to sit here. It read the loop variable
        # left over from the fontspec loop (a NameError if doc.body is
        # empty) and its value was never used -- attachment names come
        # from page.background now.
        totcnt = 0
        for pdf in doc.body:
            cnt = 0
            for page in pdf:
                totcnt += 1
                cnt += 1
                if page.background:
                    src = self.store.intermediate_path(
                        doc.basefile,
                        attachment=os.path.basename(page.background))
                    dest = self.store.parsed_path(
                        doc.basefile,
                        attachment=os.path.basename(page.background))
                    if util.copy_if_different(src, dest):
                        self.log.debug("Copied %s to %s" % (src, dest))
                    resources.append(dest)
                    fp.write("#page%03d { background: url('%s');}\n" %
                             (cnt, os.path.basename(dest)))
    return resources
def test_copy_if_different(self):
    """copy_if_different: copies when dst is missing or stale, no-ops
    when src and dst are already identical."""
    src, dst = self.fname, self.fname2
    # Case 1: destination absent -> copy is performed
    util.writefile(src, "Hello")
    self.assertTrue(util.copy_if_different(src, dst))
    for path in (src, dst):
        self.assertTrue(os.path.exists(path))
    # Case 2: destination exists but differs -> copy is performed
    util.writefile(src, "Hello (different)")
    self.assertTrue(util.copy_if_different(src, dst))
    for path in (src, dst):
        self.assertTrue(os.path.exists(path))
    self.assertEqual("Hello (different)", util.readfile(dst))
    # Case 3: destination identical -> nothing is copied
    self.assertFalse(util.copy_if_different(src, dst))
def read(self, pdffile, workdir):
    """Initializes a PDFReader object from an existing PDF file. After
    initialization, the PDFReader contains a list of
    :py:class:`~ferenda.pdfreader.Page` objects.

    :param pdffile: The full path to the PDF file
    :param workdir: A directory where intermediate files (particularly
                    background PNG files) are stored
    """
    self.filename = pdffile
    assert os.path.exists(pdffile), "PDF %s not found" % pdffile
    basename = os.path.basename(pdffile)
    # The XML rendering acts as a cache: only re-run pdftohtml if it's
    # missing or older than the PDF itself.
    xmlfile = os.sep.join(
        (workdir, os.path.splitext(basename)[0] + ".xml"))
    if not util.outfile_is_newer([pdffile], xmlfile):
        # pdftohtml writes its output next to its input, so work on a
        # copy of the PDF inside workdir.
        # NOTE(review): this copy is never removed afterwards, and the
        # command lines below don't quote the path (breaks on paths
        # with spaces/shell metacharacters) -- confirm acceptable.
        tmppdffile = os.sep.join([workdir, basename])
        util.copy_if_different(pdffile, tmppdffile)
        # two pass coding: First use -c (complex) to extract
        # background pictures, then use -xml to get easy-to-parse
        # text with bounding boxes.
        cmd = "pdftohtml -nodrm -c %s" % tmppdffile
        self.log.debug("Converting: %s" % cmd)
        (returncode, stdout, stderr) = util.runcmd(cmd,
                                                   require_success=True)
        # we won't need the html files
        for f in os.listdir(workdir):
            if f.endswith(".html"):
                os.unlink(workdir + os.sep + f)
        cmd = "pdftohtml -nodrm -xml %s" % tmppdffile
        self.log.debug("Converting: %s" % cmd)
        (returncode, stdout, stderr) = util.runcmd(cmd,
                                                   require_success=True)
    return self._parse_xml(xmlfile)
def create_external_resources(self, doc):
    """Optionally create external files that go together with the
    parsed file (stylesheets, images, etc).

    Writes an ``index.css`` attachment with one rule per fontspec and
    one background rule per page, and copies the per-page PNGs from the
    intermediate dir into the parsed store under sequential
    ``%04d.png`` attachment names.

    :param doc: The document object; ``doc.body`` is expected to be an
                iterable of :py:class:`PDFReader` objects.
    """
    if len(doc.body) == 0:
        self.log.warning(
            "%s: No external resources to create", doc.basefile)
        return
    # Step 1: Create CSS
    # 1.1 find css name
    cssfile = self.store.parsed_path(doc.basefile, attachment='index.css')
    # 1.2 create static CSS
    # FIX: the file handle was previously opened with a bare open() and
    # never closed (leak, and buffered CSS could be lost on error); use
    # a with-block so it is always closed.
    with open(cssfile, "w") as fp:
        # 1.3 create css for fontspecs and pages
        for pdf in doc.body:
            assert isinstance(pdf, PDFReader)
            for spec in list(pdf.fontspec.values()):
                fp.write(".fontspec%s {font: %spx %s; color: %s;}\n" %
                         (spec['id'], spec['size'], spec['family'],
                          spec['color']))
        # 2 Copy all created png files to their correct locations
        totcnt = 0
        src_base = os.path.dirname(
            self.store.intermediate_path(doc.basefile))
        for pdf in doc.body:
            pdf_src_base = src_base + "/" + \
                os.path.splitext(os.path.basename(pdf.filename))[0]
            cnt = 0
            for page in pdf:
                totcnt += 1
                cnt += 1
                src = "%s%03d.png" % (pdf_src_base, page.number)
                # 4 digits, compound docs can be over 1K pages
                attachment = "%04d.png" % (totcnt)
                dest = self.store.parsed_path(doc.basefile,
                                              attachment=attachment)
                if util.copy_if_different(src, dest):
                    self.log.debug("Copied %s to %s" % (src, dest))
                fp.write("#page%03d { background: url('%s');}\n" %
                         (cnt, os.path.basename(dest)))
def create_external_resources(self, doc):
    """Create the non-XHTML files (an ``index.css`` stylesheet and the
    per-page background PNGs) that accompany the parsed document.

    :param doc: The document object; ``doc.body`` is expected to be an
                iterable of :py:class:`PDFReader` objects.
    :returns: The paths of all files created
    :rtype: list
    """
    resources = []
    cssfile = self.store.parsed_path(doc.basefile, attachment="index.css")
    resources.append(cssfile)
    util.ensure_dir(cssfile)
    with open(cssfile, "w") as fp:
        # Create CSS header with fontspecs
        for pdf in doc.body:
            assert isinstance(
                pdf, PDFReader
            ), "doc.body is %s, not PDFReader -- still need to access fontspecs etc" % type(
                pdf)
            for spec in list(pdf.fontspec.values()):
                fp.write(".fontspec%s {font: %spx %s; color: %s;}\n" %
                         (spec['id'], spec['size'], spec['family'],
                          spec['color']))
        # 2 Copy all created png files to their correct locations
        # FIX: dropped the unused `pdfbase = os.path.splitext(...)`
        # assignment that used to sit here. It read the loop variable
        # left over from the fontspec loop (a NameError if doc.body is
        # empty) and its value was never used -- attachment names come
        # from page.background now.
        totcnt = 0
        for pdf in doc.body:
            cnt = 0
            for page in pdf:
                totcnt += 1
                cnt += 1
                if page.background:
                    src = self.store.intermediate_path(
                        doc.basefile,
                        attachment=os.path.basename(page.background))
                    dest = self.store.parsed_path(
                        doc.basefile,
                        attachment=os.path.basename(page.background))
                    if util.copy_if_different(src, dest):
                        self.log.debug("Copied %s to %s" % (src, dest))
                    resources.append(dest)
                    fp.write("#page%03d { background: url('%s');}\n" %
                             (cnt, os.path.basename(dest)))
    return resources
def create_external_resources(self, doc):
    """Create the stylesheet and page-background images that accompany
    the parsed document, returning the paths of everything created.

    Returns an empty list when ``doc.body`` is a plain Body (i.e. the
    document was not derived from a PDF).
    """
    created = []
    if isinstance(doc.body, Body):
        # document wasn't derived from a PDF file, probably from HTML instead
        return created
    cssfile = self.store.parsed_path(doc.basefile, attachment="index.css")
    urltransform = self.get_url_transform_func(
        [self], os.path.dirname(cssfile), develurl=self.config.develurl)
    created.append(cssfile)
    util.ensure_dir(cssfile)
    with open(cssfile, "w") as cssfp:
        # Create CSS header with fontspecs
        assert isinstance(doc.body, PDFReader), "doc.body is %s, not PDFReader -- still need to access fontspecs etc" % type(doc.body)
        for spec in list(doc.body.fontspec.values()):
            rule = ".fontspec%s {font: %spx %s; color: %s;}\n" % (
                spec['id'], spec['size'], spec['family'],
                spec.get('color', 'black'))
            cssfp.write(rule)
        # 2 Copy all created png files to their correct locations
        for idx, page in enumerate(doc.body):
            if not page.background:
                background = ""
            else:
                name = os.path.basename(page.background)
                srcpath = self.store.intermediate_path(doc.basefile,
                                                       attachment=name)
                destpath = self.store.parsed_path(doc.basefile,
                                                  attachment=name)
                created.append(destpath)
                if util.copy_if_different(srcpath, destpath):
                    self.log.debug("Copied %s to %s" % (srcpath, destpath))
                desturi = urltransform(
                    "%s?dir=parsed&attachment=%s" %
                    (doc.uri, os.path.basename(destpath)))
                background = (" background: url('%s') no-repeat grey;"
                              % desturi)
            cssfp.write("#page%03d {width: %spx; height: %spx;%s}\n" %
                        (idx + 1, page.width, page.height, background))
    return created
def create_external_resources(self, doc):
    """Create the stylesheet and page-background images that accompany
    the parsed document, returning the paths of everything created."""
    resources = []
    # there are two types of doc.body objects
    # 1. PDFReader objects, ie raw PDF objects, structured by page
    # and with a top-level fontspec object
    # 2. elements.Body objects that are structured by logical
    # elements (chapters, sections etc) and where individual
    # Sidbrytning objects can be anywhere in the tree.
    if not hasattr(doc.body, 'fontspec'):
        # document wasn't derived from a PDF file, probably from HTML instead
        return resources
    cssfile = self.store.parsed_path(doc.basefile, attachment="index.css")
    urltransform = self.get_url_transform_func(
        [self], os.path.dirname(cssfile), develurl=self.config.develurl)
    resources.append(cssfile)
    util.ensure_dir(cssfile)
    with open(cssfile, "w") as fp:
        # Create CSS header with fontspecs
        for spec in list(doc.body.fontspec.values()):
            fp.write(".fontspec%s {font: %spx %s; color: %s;}\n" %
                     (spec['id'], spec['size'], spec['family'],
                      spec.get('color', 'black')))
        # 2 Copy all created png files to their correct locations
        # Choose how to enumerate "pages": a PDFReader body iterates
        # pages directly; an element tree is walked for Sidbrytning
        # (page break) nodes instead.
        if isinstance(doc.body, PDFReader):
            pageenumerator = enumerate(doc.body)
        else:
            sidbrytningar = []

            def collect(node, state):
                # visitor: accumulate every Sidbrytning node in `state`
                if isinstance(node, Sidbrytning):
                    state.append(node)
                return state
            self.visit_node(doc.body, collect, sidbrytningar)
            pageenumerator = enumerate(sidbrytningar)
        # assert isinstance(doc.body, PDFReader), "doc.body is %s, not PDFReader -- still need to access fontspecs etc" % type(doc.body)
        for cnt, page in pageenumerator:
            if page.background:
                src = self.store.intermediate_path(
                    doc.basefile,
                    attachment=os.path.basename(page.background))
                dest = self.store.parsed_path(
                    doc.basefile,
                    attachment=os.path.basename(
                        page.background))
                resources.append(dest)
                if util.copy_if_different(src, dest):
                    self.log.debug("Copied %s to %s" % (src, dest))
                # Rewrite the attachment URI (eg. for devel/static
                # hosting) before embedding it in the CSS.
                desturi = "%s?dir=parsed&attachment=%s" % (
                    doc.uri, os.path.basename(dest))
                desturi = urltransform(desturi)
                background = " background: url('%s') no-repeat grey;" % desturi
            else:
                background = ""
            fp.write("#%s {width: %spx; height: %spx;%s}\n" %
                     (page.id, page.width, page.height, background))
    return resources
def download_test(self, specfile, basefile=None):
    """This test is run for each json file found in docroot/source."""
    # this function can run in normal test mode or in
    # FERENDA_SET_TESTFILE mode. In the latter, all the normal download
    # code, including net access, is run. Calls to requests.get
    # are intercepted and notes are made of which URLs are
    # requested, and if this results in files on disk. The end
    # result is a JSON file and a set of cached files, all placed under
    # "source/"

    # Record into `spec` (and copy next to the specfile) any file that
    # appeared under downloaded/ since `filelist` was last snapshot.
    def add_downloaded_files(filelist, spec, url):
        downloaddir = os.sep.join(
            [self.datadir, self.repoclass.alias, "downloaded"])
        for f in list(util.list_dirs(downloaddir)):
            if f.endswith(".etag"):
                continue  # FIXME: this is ugly
            if f not in filelist:
                # print("Fetching %s resulted in downloaded file %s" % (url, f))
                filelist.append(f)
                expect = "downloaded" + f.replace(downloaddir, "")
                if os.sep != "/":
                    expect = expect.replace(os.sep, "/")
                spec[url]['expect'] = expect
                reldest = os.path.relpath(".." + os.sep + "downloaded",
                                          os.path.dirname(f))
                dest = os.path.normpath(
                    os.path.join(os.path.dirname(specfile), reldest))
                util.ensure_dir(dest)
                shutil.copy2(f, dest)

    with codecs.open(specfile, encoding="utf-8") as fp:
        spec = json.load(fp)
    # Normalize percent-encoded URL keys so lookups by decoded URL work.
    for k in list(spec.keys()):
        # NB: This exposes the encoded, possibly non-ascii, values
        # of the URL as byte strings. The encoding of these is
        # unknown (and we cannot generally assume UTF-8. Let's see
        # if this bites us.
        nk = unquote(k)
        if k != nk:
            spec[nk] = spec[k]
            del spec[k]

    # process the special '@settings' key (FIXME: didn't I already
    # implement this somewhere else?)
    #
    # a @settings like this:
    # "@settings": {
    #     "config": {"next_sfsnr": "2014:913"}
    # },
    #
    # will have the effect of this:
    #
    # self.repo.config.next_sfsnr = "2014:913"
    if '@settings' in spec:
        for attribute in spec['@settings']:
            if isinstance(spec['@settings'][attribute], dict):
                thing = getattr(self.repo, attribute)
                for key, value in spec['@settings'][attribute].items():
                    setattr(thing, key, value)
            else:
                setattr(self.repo, attribute,
                        spec['@settings'][attribute])

    if os.environ.get("FERENDA_SET_TESTFILE"):
        # Record mode: let real HTTP calls through, save each response
        # to disk and grow the spec file as we go.
        downloaddir = os.sep.join(
            [self.datadir, self.repoclass.alias, "downloaded"])
        state = {
            'downloaded': list(util.list_dirs(downloaddir)),
            'previous_url': None,
            'requests': 0
        }
        # FERENDA_SET_TESTFILE may carry the number of requests to
        # record before stopping.
        try:
            rc = int(os.environ.get("FERENDA_SET_TESTFILE"))
            state['total_requests'] = rc
        except (ValueError, TypeError):
            state['total_requests'] = 2  # search page, single payload

        def callback(req):
            # clean up after last callback
            add_downloaded_files(state['downloaded'], spec,
                                 state['previous_url'])
            if state['requests'] == state['total_requests']:
                raise MaxDownloadsReached()
            # make a real requests call somehow
            responses.stop()
            # when testing this testing function
            # (testTestutil.RepoTester.test_download_setfile) we
            # still want to disable responses, but we don't want
            # to make an actual HTTP call. Detect if we are
            # running that test by examining the stack, and if so,
            # mock the requests.get call in a different way.
            frames = [
                f for f in inspect.stack()
                if f[3] == "test_download_setfile"
            ]
            if frames:
                frame = frames[0][0]
                resp = frame.f_locals['self']._myget(req.url)
            else:
                resp = requests.get(req.url)
            responses.start()
            # create a filename. use .html as suffix unless we
            # should use something else
            contenttype = resp.headers["Content-type"]
            stem = os.path.splitext(specfile)[0]
            suffix = {
                'application/pdf': 'pdf',
                'application/json': 'json',
                'text/plain': 'txt'
            }.get(contenttype, "html")
            outfile = "%s-%s.%s" % (stem, state['requests'], suffix)
            with open(outfile, "wb") as fp:
                fp.write(resp.content)
            # Give the developer a chance to hand-edit recorded HTML
            # (skipped on CI).
            if not frames and os.environ.get("TRAVIS") != "true":
                if suffix == "html":
                    print(
                        "requested %s, saved as %s. Edit if needed, then press enter"
                        % (req.url, outfile))
                    x = input()
                else:
                    print("requested %s, saved %s" % (req.url, outfile))
            # Re-read from disk so any manual edits are what gets served.
            with open(outfile, "rb") as fp:
                content = fp.read()
            spec[req.url] = {'file': os.path.basename(outfile)}
            if resp.encoding != 'utf-8':
                spec[req.url]['encoding'] = resp.encoding
            state['requests'] += 1
            state['previous_url'] = req.url
            return (resp.status_code, resp.headers, content)
    else:
        # Replay mode: serve fixture files from the spec, 404 otherwise.
        def callback(req):
            headers = {'Content-type': 'text/html'}
            try:
                # normalize req.url. req.url might be a (byte)str
                # but keys in spec will be (and should be)
                # unicode. Assume that req.url is all ascii
                if isinstance(req.url, bytes):
                    url = req.url.decode()
                else:
                    url = req.url
                urlspec = spec[unquote(url)]
                if isinstance(urlspec, str):
                    urlspec = {'file': urlspec}
                url_location = os.path.join(os.path.dirname(specfile),
                                            urlspec['file'])
                # load the .content property
                with open(url_location, "rb") as fp:
                    content = fp.read()
                return (200, headers, content)
            except KeyError:
                return (404, headers, "Not found")

    responses.add_callback(responses.GET, re.compile("(.*)"), callback)
    # PERFORM THE TEST
    try:
        self.repo.download(basefile)
    except MaxDownloadsReached:
        pass

    if os.environ.get("FERENDA_SET_TESTFILE"):
        # process final file and save specfile
        add_downloaded_files(state['downloaded'], spec,
                             state['previous_url'])
        with open(specfile, "w") as fp:
            j = json.dumps(spec, indent=4, separators=(', ', ': '))
            fp.write(j)

    # organize a temporary copy of files that we can compare our
    # results to
    wantdir = "%s/%s-want" % (self.datadir, self.repoclass.alias)
    expected = False
    for url in spec:
        if url == "@settings":
            continue
        if "expect" in spec[url]:
            expected = True
            sourcefile = os.path.join(os.path.dirname(specfile),
                                      spec[url]['file'])
            wantfile = "%s/%s" % (wantdir, spec[url]['expect'])
            util.copy_if_different(sourcefile, wantfile)
    if expected:
        self.assertEqualDirs(wantdir,
                             "%s/%s" % (self.datadir,
                                        self.repoclass.alias),
                             subset=True)
    else:
        # the test doesn't actually result in any downloaded file
        if hasattr(self.repo, 'expect') and self.repo.expect is False:
            pass
        else:
            self.fail('No files were marked as "expect" in specfile %s'
                      % specfile)
def download_test(self, specfile, basefile=None):
    """This test is run for each json file found in docroot/source."""
    # this function can run in normal test mode or in
    # FERENDA_SET_TESTFILE mode. In the latter, all the normal download
    # code, including net access, is run. Calls to requests.get
    # are intercepted and notes are made of which URLs are
    # requested, and if this results in files on disk. The end
    # result is a JSON file and a set of cached files, all placed under
    # "source/"

    # Record into `spec` (and copy next to the specfile) any file that
    # appeared under downloaded/ since `filelist` was last snapshot.
    def add_downloaded_files(filelist, spec, url):
        downloaddir = os.sep.join([self.datadir, self.repoclass.alias,
                                   "downloaded"])
        for f in list(util.list_dirs(downloaddir)):
            if f.endswith(".etag"):
                continue  # FIXME: this is ugly
            if f not in filelist:
                # print("Fetching %s resulted in downloaded file %s" % (url, f))
                filelist.append(f)
                expect = "downloaded" + f.replace(downloaddir, "")
                if os.sep != "/":
                    expect = expect.replace(os.sep, "/")
                spec[url]['expect'] = expect
                reldest = os.path.relpath(
                    ".." + os.sep + "downloaded",
                    os.path.dirname(f))
                dest = os.path.normpath(
                    os.path.join(
                        os.path.dirname(specfile),
                        reldest))
                util.ensure_dir(dest)
                shutil.copy2(f, dest)

    with codecs.open(specfile, encoding="utf-8") as fp:
        spec = json.load(fp)
    # Normalize percent-encoded URL keys so lookups by decoded URL work.
    for k in list(spec.keys()):
        # NB: This exposes the encoded, possibly non-ascii, values
        # of the URL as byte strings. The encoding of these is
        # unknown (and we cannot generally assume UTF-8. Let's see
        # if this bites us.
        nk = unquote(k)
        if k != nk:
            spec[nk] = spec[k]
            del spec[k]

    # process the special '@settings' key (FIXME: didn't I already
    # implement this somewhere else?)
    #
    # a @settings like this:
    # "@settings": {
    #     "config": {"next_sfsnr": "2014:913"}
    # },
    #
    # will have the effect of this:
    #
    # self.repo.config.next_sfsnr = "2014:913"
    if '@settings' in spec:
        for attribute in spec['@settings']:
            if isinstance(spec['@settings'][attribute], dict):
                thing = getattr(self.repo, attribute)
                for key, value in spec['@settings'][attribute].items():
                    setattr(thing, key, value)
            else:
                setattr(self.repo, attribute,
                        spec['@settings'][attribute])

    if os.environ.get("FERENDA_SET_TESTFILE"):
        # Record mode: let real HTTP calls through, save each response
        # to disk and grow the spec file as we go.
        downloaddir = os.sep.join([self.datadir, self.repoclass.alias,
                                   "downloaded"])
        state = {'downloaded': list(util.list_dirs(downloaddir)),
                 'previous_url': None,
                 'requests': 0}
        # FERENDA_SET_TESTFILE may carry the number of requests to
        # record before stopping.
        try:
            rc = int(os.environ.get("FERENDA_SET_TESTFILE"))
            state['total_requests'] = rc
        except (ValueError, TypeError):
            state['total_requests'] = 2  # search page, single payload

        def callback(req):
            # clean up after last callback
            add_downloaded_files(state['downloaded'], spec,
                                 state['previous_url'])
            if state['requests'] == state['total_requests']:
                raise MaxDownloadsReached()
            # make a real requests call somehow
            responses.stop()
            # when testing this testing function
            # (testTestutil.RepoTester.test_download_setfile) we
            # still want to disable responses, but we don't want
            # to make an actual HTTP call. Detect if we are
            # running that test by examining the stack, and if so,
            # mock the requests.get call in a different way.
            frames = [f for f in inspect.stack()
                      if f[3] == "test_download_setfile"]
            if frames:
                frame = frames[0][0]
                resp = frame.f_locals['self']._myget(req.url)
            else:
                resp = requests.get(req.url)
            responses.start()
            # create a filename. use .html as suffix unless we
            # should use something else
            contenttype = resp.headers["Content-type"]
            stem = os.path.splitext(specfile)[0]
            suffix = {'application/pdf': 'pdf',
                      'application/json': 'json',
                      'text/plain': 'txt'}.get(contenttype, "html")
            outfile = "%s-%s.%s" % (stem, state['requests'], suffix)
            with open(outfile, "wb") as fp:
                fp.write(resp.content)
            # Give the developer a chance to hand-edit recorded HTML
            # (skipped on CI).
            if not frames and os.environ.get("TRAVIS") != "true":
                if suffix == "html":
                    print(
                        "requested %s, saved as %s. Edit if needed, then press enter"
                        % (req.url, outfile))
                    x = input()
                else:
                    print("requested %s, saved %s" % (req.url, outfile))
            # Re-read from disk so any manual edits are what gets served.
            with open(outfile, "rb") as fp:
                content = fp.read()
            spec[req.url] = {'file': os.path.basename(outfile)}
            if resp.encoding != 'utf-8':
                spec[req.url]['encoding'] = resp.encoding
            state['requests'] += 1
            state['previous_url'] = req.url
            return (resp.status_code, resp.headers, content)
    else:
        # Replay mode: serve fixture files from the spec, 404 otherwise.
        def callback(req):
            headers = {'Content-type': 'text/html'}
            try:
                # normalize req.url. req.url might be a (byte)str
                # but keys in spec will be (and should be)
                # unicode. Assume that req.url is all ascii
                if isinstance(req.url, bytes):
                    url = req.url.decode()
                else:
                    url = req.url
                urlspec = spec[unquote(url)]
                if isinstance(urlspec, str):
                    urlspec = {'file': urlspec}
                url_location = os.path.join(os.path.dirname(specfile),
                                            urlspec['file'])
                # load the .content property
                with open(url_location, "rb") as fp:
                    content = fp.read()
                return (200, headers, content)
            except KeyError:
                return (404, headers, "Not found")

    responses.add_callback(responses.GET, re.compile("(.*)"), callback)
    # PERFORM THE TEST
    try:
        self.repo.download(basefile)
    except MaxDownloadsReached:
        pass

    if os.environ.get("FERENDA_SET_TESTFILE"):
        # process final file and save specfile
        add_downloaded_files(state['downloaded'], spec,
                             state['previous_url'])
        with open(specfile, "w") as fp:
            j = json.dumps(spec, indent=4, separators=(', ', ': '))
            fp.write(j)

    # organize a temporary copy of files that we can compare our
    # results to
    wantdir = "%s/%s-want" % (self.datadir, self.repoclass.alias)
    expected = False
    for url in spec:
        if url == "@settings":
            continue
        if "expect" in spec[url]:
            expected = True
            sourcefile = os.path.join(os.path.dirname(specfile),
                                      spec[url]['file'])
            wantfile = "%s/%s" % (wantdir, spec[url]['expect'])
            util.copy_if_different(sourcefile, wantfile)
    if expected:
        self.assertEqualDirs(wantdir,
                             "%s/%s" % (self.datadir,
                                        self.repoclass.alias),
                             subset=True)
    else:
        # the test doesn't actually result in any downloaded file
        if hasattr(self.repo, 'expect') and self.repo.expect is False:
            pass
        else:
            self.fail('No files were marked as "expect" in specfile %s'
                      % specfile)