def test_retrieve_record__no_url_for_zip_archive(self):
    """
    SCENARIO:  The landing page of a PUBLISHED document is retrieved, but
    the page carries no proper URL for the bagit zip archive.  This does
    occur in practice.

    EXPECTED RESULT:  retrieve_record raises a SkipError.
    """
    landing_page_url = (
        'https://www.hydroshare.org'
        '/resource/81e947faccf04de59392dddaac77bc75/'
    )

    # Mocked external I/O.  Only one response is registered: the landing
    # page, taken from the fixture variant without a zip archive URL.
    landing_page_html = ir.read_text(
        'tests.data.cuahsi.81e947faccf04de59392dddaac77bc75',
        'landing_page.no_zip_url.html',
    )

    harvester = CUAHSIHarvester()

    # assertLogs requires at least one record at INFO or above on the
    # harvester's logger while the request is mocked.
    with self.assertLogs(logger=harvester.logger, level='INFO'), \
            aioresponses() as mocked:
        mocked.get(self.regex, body=landing_page_html)

        with self.assertRaises(SkipError):
            asyncio.run(harvester.retrieve_record(landing_page_url))
def test_retrieve_record__bad_metadata_document(self):
    """
    SCENARIO:  We have a URL for a landing page for a PUBLISHED document.
    The metadata document inside the zip archive, however, is invalid.

    EXPECTED RESULT:  An XMLMetadataParsingError is issued.
    """
    url = (
        'https://www.hydroshare.org'
        '/resource/81e947faccf04de59392dddaac77bc75/'
    )

    # Mocked external I/O, registered in the order it is expected to be
    # requested:
    #
    # 1st:  landing page
    # 2nd:  zip archive containing data and metadata
    package = 'tests.data.cuahsi.81e947faccf04de59392dddaac77bc75'
    landing_page = ir.read_text(package, 'landing_page.html')

    # Build the archive in memory, switching out the metadata document for
    # something that is NOT xml.  The context manager guarantees the
    # archive is finalized (central directory written) before the bytes
    # are read back, even if writestr were to fail.
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, mode='w') as zf:
        zf.writestr(
            '81e947faccf04de59392dddaac77bc75/data/resourcemetadata',
            b'not xml',
        )
    zip_archive = buffer.getvalue()

    harvester = CUAHSIHarvester()

    with self.assertLogs(logger=harvester.logger, level='INFO'):
        with aioresponses() as m:
            m.get(self.regex, body=landing_page)
            m.get(self.regex, body=zip_archive)

            with self.assertRaises(XMLMetadataParsingError):
                asyncio.run(harvester.retrieve_record(url))
def test_retrieve_record(self):
    """
    SCENARIO:  We have a URL for a landing page for a PUBLISHED document.

    EXPECTED RESULT:  The series identifier is retrieved.  The lastMod
    time is None because this is only retrieved in schema.org.
    """
    url = (
        'https://www.hydroshare.org'
        '/resource/81e947faccf04de59392dddaac77bc75/'
    )

    # Mocked external I/O, registered in the order it is expected to be
    # requested:
    #
    # 1st:  landing page
    # 2nd:  zip archive containing data and metadata
    landing_package = 'tests.data.cuahsi.81e947faccf04de59392dddaac77bc75'
    landing_page = ir.read_text(landing_package, 'landing_page.html')

    # The metadata fixture lives in the "data" sub-package; it is stored
    # in the archive under the member name without the .xml suffix.
    data_package = landing_package + '.data'
    metadata = ir.read_binary(data_package, 'resourcemetadata.xml')

    # Build the archive in memory.  The context manager guarantees the
    # archive is finalized (central directory written) before the bytes
    # are read back, even if writestr were to fail.
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, mode='w') as zf:
        zf.writestr(
            '81e947faccf04de59392dddaac77bc75/data/resourcemetadata',
            metadata,
        )
    zip_archive = buffer.getvalue()

    harvester = CUAHSIHarvester()

    with self.assertLogs(logger=harvester.logger, level='INFO'):
        with aioresponses() as m:
            m.get(self.regex, body=landing_page)
            m.get(self.regex, body=zip_archive)

            # Unpacking four values also checks the arity of the result;
            # the second and fourth items are not examined by this test.
            sid, _, lastmod, _ = asyncio.run(
                harvester.retrieve_record(url)
            )

    self.assertEqual(sid, '10.4211/hs.81e947faccf04de59392dddaac77bc75')
    self.assertIsNone(lastmod)