示例#1
0
    def textreader_from_basefile(self, basefile, encoding):
        """Convert the downloaded PDF for ``basefile`` to text and open it.

        The downloaded file is copied to an intermediate ``.pdf`` path and
        the external ``pdftotext`` command is run on that copy. The
        intermediate ``.txt`` path is then wrapped in a ``TextReader``
        (presumably ``pdftotext`` writes its output next to its input with
        a ``.txt`` suffix -- verify against the tool's behaviour).

        :param basefile: Identifier of the document to read
        :param encoding: Encoding handed on to ``TextReader``
        :returns: A ``TextReader`` over the intermediate text file, using
                  UNIX line separators
        """
        infile = self.store.downloaded_path(basefile)
        tmpfile = self.store.path(basefile, "intermediate", ".pdf")
        outfile = self.store.path(basefile, "intermediate", ".txt")
        util.copy_if_different(infile, tmpfile)
        try:
            util.runcmd("pdftotext %s" % tmpfile, require_success=True)
        finally:
            # Remove the temporary PDF copy even when the conversion
            # fails, so a failed run does not leave it behind.
            util.robust_remove(tmpfile)

        return TextReader(outfile, encoding=encoding, linesep=TextReader.UNIX)
示例#2
0
    def download_test(self, specfile):
        """Run ``self.repo.download()`` against canned HTTP responses.

        ``specfile`` is a JSON document mapping each URL to either a file
        name or a dict with a ``file`` key and optional ``charset`` and
        ``expect`` keys. While the download runs, ``requests.get`` is
        patched to serve those files (resolved relative to the spec file's
        directory). Afterwards every entry carrying ``expect`` is copied
        into a "want" directory, which is compared with the directory the
        repo downloaded into. The test fails if no entry has ``expect``.
        """
        def fake_get(url, **kwargs):
            entry = spec[url]
            if isinstance(entry, str):
                entry = {'file': entry}
            # Text content is decoded as utf-8 unless the spec says
            # otherwise; use 'charset': null in the json file for
            # binary payloads.
            entry.setdefault('charset', 'utf-8')
            payload_path = os.path.join(specdir, entry['file'])
            response = Mock()
            # .content is always populated with the raw bytes
            with open(payload_path, "rb") as stream:
                response.content = stream.read()
            # .text only when a (truthy) charset is configured
            charset = entry['charset']
            if charset:
                with codecs.open(payload_path, "r", encoding=charset) as stream:
                    response.text = stream.read()
            # FIXME: a defaultdict means a missing header never raises.
            # Only the most basic headers should be specified, so that
            # calling code does not come to rely on e.g. the etag header
            # always being present.
            faked_headers = collections.defaultdict(lambda: None)
            faked_headers['X-These-Headers-Are'] = 'Faked'
            response.headers = faked_headers
            response.status_code = 200
            return response

        with codecs.open(specfile, encoding="utf-8") as stream:
            spec = json.load(stream)
        specdir = os.path.dirname(specfile)
        with patch('requests.get', side_effect=fake_get):
            self.repo.download()

        # Assemble a temporary copy of the files the results are compared to
        wantdir = "%s/%s-want" % (self.datadir, self.repoclass.alias)
        found_expectation = False
        for entry in spec.values():
            if "expect" not in entry:
                continue
            found_expectation = True
            util.copy_if_different(os.path.join(specdir, entry['file']),
                                   "%s/%s" % (wantdir, entry['expect']))
        if not found_expectation:
            self.fail('No files were marked as "expect" in specfile %s' %
                      specfile)
        else:
            self.assertEqualDirs(wantdir,
                                 "%s/%s" % (self.datadir,
                                            self.repoclass.alias))
    def create_external_resources(self, doc):
        """Write ``index.css`` for ``doc`` and copy its page background images.

        For every PDF in ``doc.body`` the stylesheet gets one
        ``.fontspec<id>`` rule per font specification and one
        ``#page<NNN>`` rule per page. Each page's PNG, named
        ``<pdf stem><page number>.png``, is copied from the intermediate
        location to the parsed location.

        :param doc: Document whose ``body`` is an iterable of ``PDFReader``
                    objects
        """
        cssfile = self.store.parsed_path(doc.basefile, attachment="index.css")
        with open(cssfile, "w") as fp:
            # 1 Create CSS header with fontspecs
            for pdf in doc.body:
                assert isinstance(pdf, PDFReader)
                for spec in list(pdf.fontspec.values()):
                    fp.write(".fontspec%s {font: %spx %s; color: %s;}\n" %
                             (spec['id'], spec['size'], spec['family'], spec['color']))

            # 2 Copy all created png files to their correct locations
            for pdf in doc.body:
                # The image names are built from the stem of the PDF they
                # belong to, so it must be derived per PDF. Computing it
                # once before the loop used whichever PDF the previous
                # loop ended on, and raised NameError for an empty body.
                pdfbase = os.path.splitext(os.path.basename(pdf.filename))[0]
                # The page counter restarts for every PDF
                for cnt, page in enumerate(pdf, 1):
                    attachment = "%s%03d.png" % (pdfbase, page.number)
                    src = self.store.intermediate_path(
                        doc.basefile, attachment=attachment)
                    dest = self.store.parsed_path(
                        doc.basefile, attachment=attachment)
                    if util.copy_if_different(src, dest):
                        self.log.debug("Copied %s to %s" % (src, dest))

                    fp.write("#page%03d { background: url('%s');}\n" %
                             (cnt, os.path.basename(dest)))
示例#4
0
    def create_external_resources(self, doc):
        """Write ``index.css`` for ``doc`` and copy its page background images.

        The stylesheet gets one ``.fontspec<id>`` rule per font
        specification of every PDF in ``doc.body``, and one ``#page<NNN>``
        rule for every page that has a background image. Those images are
        copied from the intermediate location to the parsed location.

        :param doc: Document whose ``body`` is an iterable of ``PDFReader``
                    objects
        :returns: List of paths of the created resources: the CSS file
                  first, then each background image destination
        """
        resources = []
        cssfile = self.store.parsed_path(doc.basefile, attachment="index.css")
        resources.append(cssfile)
        util.ensure_dir(cssfile)
        with open(cssfile, "w") as fp:
            # 1 Create CSS header with fontspecs
            for pdf in doc.body:
                assert isinstance(pdf, PDFReader), "doc.body is %s, not PDFReader -- still need to access fontspecs etc" % type(pdf)
                for spec in list(pdf.fontspec.values()):
                    fp.write(".fontspec%s {font: %spx %s; color: %s;}\n" %
                             (spec['id'], spec['size'], spec['family'], spec['color']))

            # 2 Copy all created png files to their correct locations.
            # (An unused ``pdfbase`` local used to be computed here from
            # the loop variable left over by the loop above, which raised
            # NameError for an empty doc.body; image names come from
            # page.background, so it was dropped.)
            for pdf in doc.body:
                # The page counter restarts for every PDF and also counts
                # pages without a background.
                for cnt, page in enumerate(pdf, 1):
                    if page.background:
                        attachment = os.path.basename(page.background)
                        src = self.store.intermediate_path(
                            doc.basefile, attachment=attachment)
                        dest = self.store.parsed_path(
                            doc.basefile, attachment=attachment)
                        if util.copy_if_different(src, dest):
                            self.log.debug("Copied %s to %s" % (src, dest))
                        resources.append(dest)
                        fp.write("#page%03d { background: url('%s');}\n" %
                                 (cnt, os.path.basename(dest)))
        return resources
示例#5
0
文件: testUtil.py 项目: zigit/ferenda
    def test_copy_if_different(self):
        """Check ``util.copy_if_different`` for a missing, a differing
        and an identical destination file."""
        src, dst = self.fname, self.fname2

        # Case 1: destination is missing -> a copy is made
        util.writefile(src, "Hello")
        self.assertTrue(util.copy_if_different(src, dst))
        for path in (src, dst):
            self.assertTrue(os.path.exists(path))

        # Case 2: destination exists with other content -> overwritten
        util.writefile(src, "Hello (different)")
        self.assertTrue(util.copy_if_different(src, dst))
        for path in (src, dst):
            self.assertTrue(os.path.exists(path))
        self.assertEqual("Hello (different)", util.readfile(dst))

        # Case 3: destination already identical -> nothing is copied
        self.assertFalse(util.copy_if_different(src, dst))
示例#6
0
    def read(self, pdffile, workdir):
        """Populate this PDFReader from an existing PDF file.

        Unless an up-to-date XML file already exists in ``workdir``, the
        PDF is copied there and run through ``pdftohtml`` twice. The
        resulting XML file is then parsed.

        :param pdffile: The full path to the PDF file
        :param workdir: A directory where intermediate files (particularly
                        background PNG files) are stored
        :returns: Whatever ``self._parse_xml`` returns for the XML file

        """
        self.filename = pdffile
        assert os.path.exists(pdffile), "PDF %s not found" % pdffile
        pdfname = os.path.basename(pdffile)
        stem = os.path.splitext(pdfname)[0]
        xmlfile = os.sep.join((workdir, stem + ".xml"))

        # Nothing to convert when the XML is already newer than the PDF
        if util.outfile_is_newer([pdffile], xmlfile):
            return self._parse_xml(xmlfile)

        workcopy = os.sep.join([workdir, pdfname])
        util.copy_if_different(pdffile, workcopy)

        def convert(mode):
            # Run one pdftohtml pass over the working copy
            command = "pdftohtml -nodrm %s %s" % (mode, workcopy)
            self.log.debug("Converting: %s" % command)
            (returncode, stdout, stderr) = util.runcmd(command,
                                                       require_success=True)

        # Two-pass conversion: the -c (complex) pass extracts the
        # background pictures, the -xml pass yields easy-to-parse text
        # with bounding boxes.
        convert("-c")
        # The html files produced by the first pass are not needed
        for entry in os.listdir(workdir):
            if entry.endswith(".html"):
                os.unlink(workdir + os.sep + entry)
        convert("-xml")
        return self._parse_xml(xmlfile)
示例#7
0
    def create_external_resources(self, doc):
        """Optionally create external files that go together with the
        parsed file (stylesheets, images, etc).

        Writes ``index.css`` with one ``.fontspec<id>`` rule per font
        specification and one ``#page<NNN>`` rule per page, and copies
        every page's PNG from the intermediate directory to a parsed
        attachment. Attachments are numbered consecutively across all
        PDFs in ``doc.body``. Logs a warning and does nothing when
        ``doc.body`` is empty.

        :param doc: Document whose ``body`` is a sequence of ``PDFReader``
                    objects
        """
        if len(doc.body) == 0:
            self.log.warning(
                "%s: No external resources to create", doc.basefile)
            return
        # Step 1: Create CSS
        # 1.1 find css name
        cssfile = self.store.parsed_path(doc.basefile, attachment='index.css')
        # 1.2 create static CSS. The context manager guarantees the file
        # is flushed and closed, also when an error occurs halfway
        # (the handle was previously never closed).
        with open(cssfile, "w") as fp:
            # 1.3 create css for fontspecs and pages
            for pdf in doc.body:
                assert isinstance(pdf, PDFReader)
                for spec in list(pdf.fontspec.values()):
                    fp.write(".fontspec%s {font: %spx %s; color: %s;}\n" %
                             (spec['id'], spec['size'], spec['family'], spec['color']))

            # 2 Copy all created png files to their correct locations
            totcnt = 0  # running page count over all PDFs
            src_base = os.path.dirname(self.store.intermediate_path(doc.basefile))
            for pdf in doc.body:
                pdf_src_base = src_base + "/" + os.path.splitext(os.path.basename(pdf.filename))[0]

                # cnt restarts for every PDF; it is used for the CSS id
                for cnt, page in enumerate(pdf, 1):
                    totcnt += 1
                    src = "%s%03d.png" % (pdf_src_base, page.number)
                    # 4 digits, compound docs can be over 1K pages
                    attachment = "%04d.png" % (totcnt)
                    dest = self.store.parsed_path(doc.basefile,
                                                  attachment=attachment)

                    if util.copy_if_different(src, dest):
                        self.log.debug("Copied %s to %s" % (src, dest))

                    fp.write("#page%03d { background: url('%s');}\n" %
                             (cnt, os.path.basename(dest)))
示例#8
0
    def create_external_resources(self, doc):
        """Write ``index.css`` for ``doc`` and copy its page background images.

        The stylesheet gets one ``.fontspec<id>`` rule per font
        specification of every PDF in ``doc.body``, and one ``#page<NNN>``
        rule for every page that has a background image. Those images are
        copied from the intermediate location to the parsed location.

        :param doc: Document whose ``body`` is an iterable of ``PDFReader``
                    objects
        :returns: List of paths of the created resources: the CSS file
                  first, then each background image destination
        """
        resources = []
        cssfile = self.store.parsed_path(doc.basefile, attachment="index.css")
        resources.append(cssfile)
        util.ensure_dir(cssfile)
        with open(cssfile, "w") as fp:
            # 1 Create CSS header with fontspecs
            for pdf in doc.body:
                assert isinstance(
                    pdf, PDFReader
                ), "doc.body is %s, not PDFReader -- still need to access fontspecs etc" % type(
                    pdf)
                for spec in list(pdf.fontspec.values()):
                    fp.write(".fontspec%s {font: %spx %s; color: %s;}\n" %
                             (spec['id'], spec['size'], spec['family'],
                              spec['color']))

            # 2 Copy all created png files to their correct locations.
            # (An unused ``pdfbase`` local used to be computed here from
            # the loop variable left over by the loop above, which raised
            # NameError for an empty doc.body; image names come from
            # page.background, so it was dropped.)
            for pdf in doc.body:
                # The page counter restarts for every PDF and also counts
                # pages without a background.
                for cnt, page in enumerate(pdf, 1):
                    if page.background:
                        attachment = os.path.basename(page.background)
                        src = self.store.intermediate_path(
                            doc.basefile, attachment=attachment)
                        dest = self.store.parsed_path(
                            doc.basefile, attachment=attachment)
                        if util.copy_if_different(src, dest):
                            self.log.debug("Copied %s to %s" % (src, dest))
                        resources.append(dest)
                        fp.write("#page%03d { background: url('%s');}\n" %
                                 (cnt, os.path.basename(dest)))
        return resources
示例#9
0
    def create_external_resources(self, doc):
        """Write ``index.css`` for a PDF-derived ``doc`` and copy its page
        background images to the parsed location.

        The stylesheet gets one ``.fontspec<id>`` rule per font
        specification and one ``#page<NNN>`` rule (width, height and, if
        the page has one, a background image URL) per page.

        :param doc: Document whose ``body`` is a ``PDFReader``; a ``Body``
                    instance yields no resources at all
        :returns: List of created paths: the CSS file first, then each
                  background image destination
        """
        if isinstance(doc.body, Body):
            # Not derived from a PDF file (probably from HTML instead)
            return []

        cssfile = self.store.parsed_path(doc.basefile, attachment="index.css")
        urltransform = self.get_url_transform_func(
            [self], os.path.dirname(cssfile), develurl=self.config.develurl)
        created = [cssfile]
        util.ensure_dir(cssfile)
        with open(cssfile, "w") as css:
            assert isinstance(doc.body, PDFReader), "doc.body is %s, not PDFReader -- still need to access fontspecs etc" % type(doc.body)
            # CSS header: one rule per font specification
            for font in list(doc.body.fontspec.values()):
                css.write(".fontspec%s {font: %spx %s; color: %s;}\n" %
                          (font['id'], font['size'], font['family'],
                           font.get('color', 'black')))

            # One rule per page; background images are copied alongside
            for pageno, page in enumerate(doc.body, 1):
                background = ""
                if page.background:
                    imagename = os.path.basename(page.background)
                    src = self.store.intermediate_path(
                        doc.basefile, attachment=imagename)
                    dest = self.store.parsed_path(
                        doc.basefile, attachment=imagename)
                    created.append(dest)
                    if util.copy_if_different(src, dest):
                        self.log.debug("Copied %s to %s" % (src, dest))
                    desturi = urltransform(
                        "%s?dir=parsed&attachment=%s" %
                        (doc.uri, os.path.basename(dest)))
                    background = " background: url('%s') no-repeat grey;" % desturi
                css.write("#page%03d {width: %spx; height: %spx;%s}\n" %
                          (pageno, page.width, page.height, background))
        return created
示例#10
0
    def create_external_resources(self, doc):
        """Write ``index.css`` for ``doc`` and copy its page background
        images to the parsed location.

        ``doc.body`` is handled in two shapes:

        1. ``PDFReader`` objects, i.e. raw PDF objects structured by page,
           with a top-level ``fontspec``.
        2. Other bodies with a ``fontspec`` attribute, structured by
           logical elements (chapters, sections etc), where individual
           ``Sidbrytning`` objects can sit anywhere in the tree.

        A body without a ``fontspec`` attribute yields no resources.

        :returns: List of created paths: the CSS file first, then each
                  background image destination
        """
        if not hasattr(doc.body, 'fontspec'):
            # Not derived from a PDF file (probably from HTML instead)
            return []

        cssfile = self.store.parsed_path(doc.basefile, attachment="index.css")
        urltransform = self.get_url_transform_func(
            [self], os.path.dirname(cssfile), develurl=self.config.develurl)
        created = [cssfile]
        util.ensure_dir(cssfile)

        def gather_pagebreaks(node, found):
            # Visitor callback: remember every Sidbrytning node
            if isinstance(node, Sidbrytning):
                found.append(node)
            return found

        with open(cssfile, "w") as css:
            # CSS header: one rule per font specification
            for font in list(doc.body.fontspec.values()):
                css.write(".fontspec%s {font: %spx %s; color: %s;}\n" %
                          (font['id'], font['size'], font['family'],
                           font.get('color', 'black')))

            # Find the page-like objects: the pages themselves for a raw
            # PDF, otherwise the page break markers in the element tree
            if isinstance(doc.body, PDFReader):
                pages = doc.body
            else:
                pages = []
                self.visit_node(doc.body, gather_pagebreaks, pages)

            # One rule per page; background images are copied alongside
            for page in pages:
                background = ""
                if page.background:
                    imagename = os.path.basename(page.background)
                    src = self.store.intermediate_path(
                        doc.basefile, attachment=imagename)
                    dest = self.store.parsed_path(
                        doc.basefile, attachment=imagename)
                    created.append(dest)
                    if util.copy_if_different(src, dest):
                        self.log.debug("Copied %s to %s" % (src, dest))
                    desturi = urltransform(
                        "%s?dir=parsed&attachment=%s" %
                        (doc.uri, os.path.basename(dest)))
                    background = " background: url('%s') no-repeat grey;" % desturi

                css.write("#%s {width: %spx; height: %spx;%s}\n" %
                          (page.id, page.width, page.height, background))
        return created
示例#11
0
    def download_test(self, specfile, basefile=None):
        """This test is run for each json file found in docroot/source.

        Loads the JSON spec in ``specfile`` (a mapping from URL to the
        canned file that should be served for it), runs
        ``self.repo.download(basefile)`` with every HTTP GET routed
        through a ``responses`` callback, and finally compares the files
        marked with ``expect`` in the spec against what ended up under
        ``<datadir>/<alias>``.

        If the ``FERENDA_SET_TESTFILE`` environment variable is set, the
        method records instead of replaying: real requests are made,
        their bodies are saved next to ``specfile`` and the spec file is
        rewritten.

        :param specfile: Path to the JSON spec file; canned files are
                         resolved relative to its directory
        :param basefile: Passed straight on to ``self.repo.download``
        """

        # this function can run in normal test mode or in
        # FERENDA_SET_TESTFILE mode. In the latter, all the normal download
        # code, including net access, is run. Calls to requests.get
        # are intercepted and notes are made of which URLs are
        # requested, and if this results in files on disk. The end
        # result is a JSON file and a set of cached files, all placed under
        # "source/"
        def add_downloaded_files(filelist, spec, url):
            # Record every file under the download directory that is not
            # yet in ``filelist`` as the "expect" result of ``url``, and
            # copy it to a location derived from the spec file's directory.
            # NOTE(review): callers pass state['previous_url'], which is
            # None before the first request; spec[None] would raise
            # KeyError if a new file showed up that early -- confirm this
            # cannot happen.
            downloaddir = os.sep.join(
                [self.datadir, self.repoclass.alias, "downloaded"])
            for f in list(util.list_dirs(downloaddir)):
                if f.endswith(".etag"):
                    continue  # FIXME: this is ugly
                if f not in filelist:
                    # print("Fetching %s resulted in downloaded file %s" % (url, f))
                    filelist.append(f)
                    # "expect" is stored relative to the repo directory,
                    # always with forward slashes
                    expect = "downloaded" + f.replace(downloaddir, "")
                    if os.sep != "/":
                        expect = expect.replace(os.sep, "/")
                    spec[url]['expect'] = expect
                    reldest = os.path.relpath(".." + os.sep + "downloaded",
                                              os.path.dirname(f))
                    dest = os.path.normpath(
                        os.path.join(os.path.dirname(specfile), reldest))
                    util.ensure_dir(dest)
                    shutil.copy2(f, dest)

        with codecs.open(specfile, encoding="utf-8") as fp:
            spec = json.load(fp)
        # Iterate over a snapshot of the keys, since the loop re-keys spec
        for k in list(spec.keys()):
            # NB: This exposes the encoded, possibly non-ascii, values
            # of the URL as byte strings. The encoding of these is
            # unknown (and we cannot generally assume UTF-8. Let's see
            # if this bites us.
            nk = unquote(k)
            if k != nk:
                spec[nk] = spec[k]
                del spec[k]

            # process the special '@settings' key (FIXME: didn't I already
            # implement this somewhere else?)
            #
            # a @settings like this:
            #     "@settings": {
            #         "config": {"next_sfsnr": "2014:913"}
            #     },
            #
            # will have the effect of this:
            #
            # self.repo.config.next_sfsnr = "2014:913"
            #
            # NOTE(review): this sits inside the key loop, so the settings
            # are re-applied once per key in the spec. That looks
            # unintentional, but only repeats the same setattr calls.
            if '@settings' in spec:
                for attribute in spec['@settings']:
                    if isinstance(spec['@settings'][attribute], dict):
                        thing = getattr(self.repo, attribute)
                        for key, value in spec['@settings'][attribute].items():
                            setattr(thing, key, value)
                    else:
                        setattr(self.repo, attribute,
                                spec['@settings'][attribute])

        if os.environ.get("FERENDA_SET_TESTFILE"):
            # Record mode: remember which files existed before any request
            downloaddir = os.sep.join(
                [self.datadir, self.repoclass.alias, "downloaded"])
            state = {
                'downloaded': list(util.list_dirs(downloaddir)),
                'previous_url': None,
                'requests': 0
            }
            # A numeric value of the variable caps the number of requests
            try:
                rc = int(os.environ.get("FERENDA_SET_TESTFILE"))
                state['total_requests'] = rc
            except (ValueError, TypeError):
                state['total_requests'] = 2  # search page, single payload

            def callback(req):
                # clean up after last callback
                add_downloaded_files(state['downloaded'], spec,
                                     state['previous_url'])
                if state['requests'] == state['total_requests']:
                    raise MaxDownloadsReached()
                # make a real requests call somehow
                responses.stop()
                # when testing this testing function
                # (testTestutil.RepoTester.test_download_setfile) we
                # still want to disable responses, but we don't want
                # to make an actual HTTP call. Detect if we are
                # running that test by examining the stack, and if so,
                # mock the requests.get call in a different way.
                frames = [
                    f for f in inspect.stack()
                    if f[3] == "test_download_setfile"
                ]
                if frames:
                    frame = frames[0][0]
                    resp = frame.f_locals['self']._myget(req.url)
                else:
                    resp = requests.get(req.url)
                responses.start()
                # create a filename. use .html as suffix unless we
                # should use something else
                contenttype = resp.headers["Content-type"]
                stem = os.path.splitext(specfile)[0]
                suffix = {
                    'application/pdf': 'pdf',
                    'application/json': 'json',
                    'text/plain': 'txt'
                }.get(contenttype, "html")
                outfile = "%s-%s.%s" % (stem, state['requests'], suffix)
                with open(outfile, "wb") as fp:
                    fp.write(resp.content)

                # Outside the self-test and outside Travis, report the
                # saved file; for html, pause so it can be edited by hand
                if not frames and os.environ.get("TRAVIS") != "true":
                    if suffix == "html":
                        print(
                            "requested %s, saved as %s. Edit if needed, then press enter"
                            % (req.url, outfile))
                        x = input()
                    else:
                        print("requested %s, saved %s" % (req.url, outfile))

                # Re-read the file so that any manual edit is what gets
                # returned to the calling code
                with open(outfile, "rb") as fp:
                    content = fp.read()
                spec[req.url] = {'file': os.path.basename(outfile)}
                if resp.encoding != 'utf-8':
                    spec[req.url]['encoding'] = resp.encoding

                state['requests'] += 1
                state['previous_url'] = req.url
                return (resp.status_code, resp.headers, content)
        else:

            def callback(req):
                # Replay mode: serve the canned file registered for the
                # URL, or a 404 when the spec has no entry for it
                headers = {'Content-type': 'text/html'}
                try:
                    # normalize req.url. req.url might be a (byte)str
                    # but keys in spec will be (and should be)
                    # unicode. Assume that req.url is all ascii
                    if isinstance(req.url, bytes):
                        url = req.url.decode()
                    else:
                        url = req.url
                    urlspec = spec[unquote(url)]
                    if isinstance(urlspec, str):
                        urlspec = {'file': urlspec}
                    url_location = os.path.join(os.path.dirname(specfile),
                                                urlspec['file'])
                    # load the .content property
                    with open(url_location, "rb") as fp:
                        content = fp.read()
                    return (200, headers, content)
                except KeyError:
                    return (404, headers, "Not found")

        # Route every GET request, whatever the URL, through the callback
        responses.add_callback(responses.GET, re.compile("(.*)"), callback)
        # PERFORM THE TEST
        try:
            self.repo.download(basefile)
        except MaxDownloadsReached:
            # raised by the record-mode callback once the cap is hit
            pass

        if os.environ.get("FERENDA_SET_TESTFILE"):
            # process final file and save specfile
            add_downloaded_files(state['downloaded'], spec,
                                 state['previous_url'])
            with open(specfile, "w") as fp:
                j = json.dumps(spec, indent=4, separators=(', ', ': '))
                fp.write(j)

        # organize a temporary copy of files that we can compare our results to
        wantdir = "%s/%s-want" % (self.datadir, self.repoclass.alias)
        expected = False
        for url in spec:
            if url == "@settings":
                continue
            if "expect" in spec[url]:
                expected = True
                sourcefile = os.path.join(os.path.dirname(specfile),
                                          spec[url]['file'])
                wantfile = "%s/%s" % (wantdir, spec[url]['expect'])

                util.copy_if_different(sourcefile, wantfile)
        if expected:
            self.assertEqualDirs(wantdir,
                                 "%s/%s" %
                                 (self.datadir, self.repoclass.alias),
                                 subset=True)
        else:
            # the test doesn't actually result in any downloaded file
            if hasattr(self.repo, 'expect') and self.repo.expect is False:
                pass
            else:
                self.fail('No files were marked as "expect" in specfile %s' %
                          specfile)
示例#12
0
    def download_test(self, specfile, basefile=None):
        """This test is run for each json file found in docroot/source.

        Loads the JSON spec in ``specfile`` (a mapping from URL to the
        canned file that should be served for it), runs
        ``self.repo.download(basefile)`` with every HTTP GET routed
        through a ``responses`` callback, and finally compares the files
        marked with ``expect`` in the spec against what ended up under
        ``<datadir>/<alias>``.

        If the ``FERENDA_SET_TESTFILE`` environment variable is set, the
        method records instead of replaying: real requests are made,
        their bodies are saved next to ``specfile`` and the spec file is
        rewritten.

        :param specfile: Path to the JSON spec file; canned files are
                         resolved relative to its directory
        :param basefile: Passed straight on to ``self.repo.download``
        """
        # this function can run in normal test mode or in
        # FERENDA_SET_TESTFILE mode. In the latter, all the normal download
        # code, including net access, is run. Calls to requests.get
        # are intercepted and notes are made of which URLs are
        # requested, and if this results in files on disk. The end
        # result is a JSON file and a set of cached files, all placed under
        # "source/"
        def add_downloaded_files(filelist, spec, url):
            # Record every file under the download directory that is not
            # yet in ``filelist`` as the "expect" result of ``url``, and
            # copy it to a location derived from the spec file's directory.
            # NOTE(review): callers pass state['previous_url'], which is
            # None before the first request; spec[None] would raise
            # KeyError if a new file showed up that early -- confirm this
            # cannot happen.
            downloaddir = os.sep.join([self.datadir, self.repoclass.alias,
                                       "downloaded"])
            for f in list(util.list_dirs(downloaddir)):
                if f.endswith(".etag"):
                    continue  # FIXME: this is ugly
                if f not in filelist:
                    # print("Fetching %s resulted in downloaded file %s" % (url, f))
                    filelist.append(f)
                    # "expect" is stored relative to the repo directory,
                    # always with forward slashes
                    expect = "downloaded" + f.replace(downloaddir, "")
                    if os.sep != "/":
                        expect = expect.replace(os.sep, "/")
                    spec[url]['expect'] = expect
                    reldest = os.path.relpath(
                        ".." +
                        os.sep +
                        "downloaded",
                        os.path.dirname(f))
                    dest = os.path.normpath(
                        os.path.join(
                            os.path.dirname(specfile),
                            reldest))
                    util.ensure_dir(dest)
                    shutil.copy2(f, dest)

        with codecs.open(specfile, encoding="utf-8") as fp:
            spec = json.load(fp)
        # Iterate over a snapshot of the keys, since the loop re-keys spec
        for k in list(spec.keys()):
            # NB: This exposes the encoded, possibly non-ascii, values
            # of the URL as byte strings. The encoding of these is
            # unknown (and we cannot generally assume UTF-8. Let's see
            # if this bites us.
            nk = unquote(k)
            if k != nk:
                spec[nk] = spec[k]
                del spec[k]

            # process the special '@settings' key (FIXME: didn't I already
            # implement this somewhere else?)
            #
            # a @settings like this:
            #     "@settings": {
            #         "config": {"next_sfsnr": "2014:913"}
            #     },
            #
            # will have the effect of this:
            #
            # self.repo.config.next_sfsnr = "2014:913"
            #
            # NOTE(review): this sits inside the key loop, so the settings
            # are re-applied once per key in the spec. That looks
            # unintentional, but only repeats the same setattr calls.
            if '@settings' in spec:
                for attribute in spec['@settings']:
                    if isinstance(spec['@settings'][attribute], dict):
                        thing = getattr(self.repo, attribute)
                        for key, value in spec['@settings'][attribute].items():
                            setattr(thing, key, value)
                    else:
                        setattr(self.repo, attribute,
                                spec['@settings'][attribute])

        if os.environ.get("FERENDA_SET_TESTFILE"):
            # Record mode: remember which files existed before any request
            downloaddir = os.sep.join([self.datadir, self.repoclass.alias,
                                       "downloaded"])
            state = {'downloaded':  list(util.list_dirs(downloaddir)),
                     'previous_url': None,
                     'requests': 0}
            # A numeric value of the variable caps the number of requests
            try:
                rc = int(os.environ.get("FERENDA_SET_TESTFILE"))
                state['total_requests'] = rc
            except (ValueError, TypeError):
                state['total_requests'] = 2  # search page, single payload

            def callback(req):
                # clean up after last callback
                add_downloaded_files(state['downloaded'], spec, state['previous_url'])
                if state['requests'] == state['total_requests']:
                    raise MaxDownloadsReached()
                # make a real requests call somehow
                responses.stop()
                # when testing this testing function
                # (testTestutil.RepoTester.test_download_setfile) we
                # still want to disable responses, but we don't want
                # to make an actual HTTP call. Detect if we are
                # running that test by examining the stack, and if so,
                # mock the requests.get call in a different way.
                frames = [f for f in inspect.stack() if f[3] == "test_download_setfile"]
                if frames:
                    frame = frames[0][0]
                    resp = frame.f_locals['self']._myget(req.url)
                else:
                    resp = requests.get(req.url)
                responses.start()
                # create a filename. use .html as suffix unless we
                # should use something else
                contenttype = resp.headers["Content-type"]
                stem = os.path.splitext(specfile)[0]
                suffix = {'application/pdf': 'pdf',
                          'application/json': 'json',
                          'text/plain': 'txt'}.get(contenttype, "html")
                outfile = "%s-%s.%s" % (stem, state['requests'], suffix)
                with open(outfile, "wb") as fp:
                    fp.write(resp.content)

                # Outside the self-test and outside Travis, report the
                # saved file; for html, pause so it can be edited by hand
                if not frames and os.environ.get("TRAVIS") != "true":
                    if suffix == "html":
                        print(
                            "requested %s, saved as %s. Edit if needed, then press enter" %
                            (req.url, outfile))
                        x = input()
                    else:
                        print("requested %s, saved %s" % (req.url, outfile))

                # Re-read the file so that any manual edit is what gets
                # returned to the calling code
                with open(outfile, "rb") as fp:
                    content = fp.read()
                spec[req.url] = {'file': os.path.basename(outfile)}
                if resp.encoding != 'utf-8':
                    spec[req.url]['encoding'] = resp.encoding

                state['requests'] += 1
                state['previous_url'] = req.url
                return (resp.status_code, resp.headers, content)
        else:
            def callback(req):
                # Replay mode: serve the canned file registered for the
                # URL, or a 404 when the spec has no entry for it
                headers = {'Content-type': 'text/html'}
                try:
                    # normalize req.url. req.url might be a (byte)str
                    # but keys in spec will be (and should be)
                    # unicode. Assume that req.url is all ascii
                    if isinstance(req.url, bytes):
                        url = req.url.decode()
                    else:
                        url = req.url
                    urlspec = spec[unquote(url)]
                    if isinstance(urlspec, str):
                        urlspec = {'file': urlspec}
                    url_location = os.path.join(os.path.dirname(specfile),
                                                urlspec['file'])
                    # load the .content property
                    with open(url_location, "rb") as fp:
                        content = fp.read()
                    return (200, headers, content)
                except KeyError:
                    return (404, headers, "Not found")
        # Route every GET request, whatever the URL, through the callback
        responses.add_callback(responses.GET,
                               re.compile("(.*)"),
                               callback)
        # PERFORM THE TEST
        try:
            self.repo.download(basefile)
        except MaxDownloadsReached:
            # raised by the record-mode callback once the cap is hit
            pass

        if os.environ.get("FERENDA_SET_TESTFILE"):
            # process final file and save specfile
            add_downloaded_files(state['downloaded'], spec,
                                 state['previous_url'])
            with open(specfile, "w") as fp:
                j = json.dumps(spec, indent=4, 
                          separators=(', ', ': '))
                fp.write(j)

        # organize a temporary copy of files that we can compare our results to
        wantdir = "%s/%s-want" % (self.datadir, self.repoclass.alias)
        expected = False
        for url in spec:
            if url == "@settings":
                continue
            if "expect" in spec[url]:
                expected = True
                sourcefile = os.path.join(os.path.dirname(specfile),
                                          spec[url]['file'])
                wantfile = "%s/%s" % (wantdir, spec[url]['expect'])

                util.copy_if_different(sourcefile, wantfile)
        if expected:
            self.assertEqualDirs(wantdir,
                                 "%s/%s" % (self.datadir,
                                            self.repoclass.alias),
                                 subset=True)
        else:
            # the test doesn't actually result in any downloaded file
            if hasattr(self.repo, 'expect') and self.repo.expect is False:
                pass
            else:
                self.fail('No files were marked as "expect" in specfile %s' %
                          specfile)