Example #1
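These excerpts are methods of a single test class.  A minimal sketch of the
scaffolding they assume follows: the imports used by the test bodies plus the
self.regex pattern that every mocked hydroshare.org request is matched
against.  The import paths for the project code (schema_org.cuahsi,
schema_org.core) and the exact pattern are assumptions, not taken from the
excerpts themselves.

import asyncio
import importlib.resources as ir
import io
import re
import unittest
import zipfile

from aioresponses import aioresponses

# Assumed import paths -- the excerpts only name the class and the exceptions.
from schema_org.cuahsi import CUAHSIHarvester
from schema_org.core import SkipError, XMLMetadataParsingError


class TestCUAHSIHarvester(unittest.TestCase):

    # aioresponses matches every mocked GET against this pattern; responses
    # registered on the same pattern are consumed in registration order, so
    # the first request receives contents1, the second contents2, and so on.
    regex = re.compile(r'https://www\.hydroshare\.org/.*')
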
    def test_retrieve_record__no_url_for_zip_archive(self):
        """
        SCENARIO:  We have a URL for a landing page for a PUBLISHED document,
        but the landing page does not have a proper URL for the bagit zip
        archive.  Yeah, this happens.

        EXPECTED RESULT:  A SkipError is raised.
        """
        url = ('https://www.hydroshare.org'
               '/resource/81e947faccf04de59392dddaac77bc75/')

        # External I/O
        #
        # 1st:  landing page
        package = 'tests.data.cuahsi.81e947faccf04de59392dddaac77bc75'
        contents1 = ir.read_text(package, 'landing_page.no_zip_url.html')

        harvester = CUAHSIHarvester()

        with self.assertLogs(logger=harvester.logger, level='INFO'):
            with aioresponses() as m:
                m.get(self.regex, body=contents1)

                with self.assertRaises(SkipError):
                    asyncio.run(harvester.retrieve_record(url))
Example #2
    def test_retrieve_record__bad_metadata_document(self):
        """
        SCENARIO:  We have a URL for a landing page for a PUBLISHED document.
        The metadata document, however, is invalid.

        EXPECTED RESULT:  An XMLMetadataParsingError is raised.
        """
        url = ('https://www.hydroshare.org'
               '/resource/81e947faccf04de59392dddaac77bc75/')

        # External I/O
        #
        # 1st:  landing page
        # 2nd:  zip archive containing data and metadata
        package = 'tests.data.cuahsi.81e947faccf04de59392dddaac77bc75'
        contents1 = ir.read_text(package, 'landing_page.html')

        # Switch out the metadata document for something that is NOT xml.
        b = io.BytesIO()
        zf = zipfile.ZipFile(b, mode='w')
        zf.writestr('81e947faccf04de59392dddaac77bc75/data/resourcemetadata',
                    b'not xml')
        zf.close()
        b.seek(0)
        contents2 = b.read()

        harvester = CUAHSIHarvester()

        with self.assertLogs(logger=harvester.logger, level='INFO'):
            with aioresponses() as m:
                m.get(self.regex, body=contents1)
                m.get(self.regex, body=contents2)

                with self.assertRaises(XMLMetadataParsingError):
                    asyncio.run(harvester.retrieve_record(url))
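Examples #2 and #3 build the in-memory zip archive by hand: write a single
member under <resource-id>/data/, rewind the buffer, and hand the raw bytes
to aioresponses.  A hypothetical module-level helper (not part of the
original tests) capturing those steps could look like this:

def build_zip_body(member_name, payload):
    """
    Write a single member into an in-memory zip archive and return the raw
    bytes, suitable for use as an aioresponses response body.
    """
    b = io.BytesIO()
    with zipfile.ZipFile(b, mode='w') as zf:
        zf.writestr(member_name, payload)
    # getvalue() returns the whole buffer, so no explicit seek(0)/read() needed.
    return b.getvalue()

With it, Example #2's setup collapses to a single call:
contents2 = build_zip_body('81e947faccf04de59392dddaac77bc75/data/resourcemetadata', b'not xml')
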
Example #3
    def test_retrieve_record(self):
        """
        SCENARIO:  We have a URL for a landing page for a PUBLISHED document.

        EXPECTED RESULT:  The series identifier is retrieved.  The lastMod
        time is None because it is only retrieved via schema.org.
        """
        url = ('https://www.hydroshare.org'
               '/resource/81e947faccf04de59392dddaac77bc75/')

        # External I/O
        #
        # 1st:  landing page
        # 2nd:  zip archive containing data and metadata
        package = 'tests.data.cuahsi.81e947faccf04de59392dddaac77bc75'
        contents1 = ir.read_text(package, 'landing_page.html')

        b = io.BytesIO()
        zf = zipfile.ZipFile(b, mode='w')
        package = 'tests.data.cuahsi.81e947faccf04de59392dddaac77bc75.data'
        content = ir.read_binary(package, 'resourcemetadata.xml')
        zf.writestr('81e947faccf04de59392dddaac77bc75/data/resourcemetadata',
                    content)
        zf.close()
        b.seek(0)
        contents2 = b.read()

        harvester = CUAHSIHarvester()

        with self.assertLogs(logger=harvester.logger, level='INFO'):
            with aioresponses() as m:
                m.get(self.regex, body=contents1)
                m.get(self.regex, body=contents2)

                awaitable = harvester.retrieve_record(url)
                sid, pid, lastmod, doc = asyncio.run(awaitable)

        self.assertEqual(sid, '10.4211/hs.81e947faccf04de59392dddaac77bc75')
        self.assertIsNone(lastmod)
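
For context on the four-element tuple asserted above, a hypothetical caller
might consume retrieve_record as sketched below.  The reading of pid as a
persistent identifier and doc as the parsed metadata document pulled from the
zip archive is an assumption based on these tests, not something the excerpts
state.

async def summarize(url):
    harvester = CUAHSIHarvester()
    # retrieve_record is a coroutine returning (sid, pid, lastmod, doc).
    sid, pid, lastmod, doc = await harvester.retrieve_record(url)
    print(f'series identifier:      {sid}')
    print(f'persistent identifier:  {pid}')      # assumed meaning of pid
    print(f'last modified:          {lastmod}')  # None unless found via schema.org
    return doc                                   # assumed: parsed metadata document

# e.g. asyncio.run(summarize('https://www.hydroshare.org/resource/<resource-id>/'))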