def test__read_record__invalid_jsonld(self):
    """
    SCENARIO:  A landing page is properly retrieved, but has invalid
    JSON-LD.

    EXPECTED RESULT:  A JsonLdError is raised, and the harvester logs at
    least one message while processing the record.
    """
    url = (
        'https://www.archive.arm.gov'
        '/metadata/adc/html/nsaqcrad1longC2.c2.invalid_jsonld.html'
    )
    harvester = ARMHarvester()

    # NOTE(review): the fixture is named "no_dataset_in_jsonld" while the
    # URL says "invalid_jsonld" -- confirm this is the intended fixture.
    # The URL itself is only matched by prefix below, so the mismatch does
    # not affect which body is served.
    contents = ir.read_binary(
        'tests.data.arm',
        'nsaqcrad1longC2.c2.no_dataset_in_jsonld.html',
    )
    status_code = 200
    headers = {'Content-Type': 'text/html'}
    regex = re.compile('https://www.archive.arm.gov/metadata/adc')

    with aioresponses() as m:
        m.get(regex, body=contents, status=status_code, headers=headers)

        # assertLogs has to be the OUTER context manager: it skips its
        # own check whenever an exception propagates through it, so
        # assertRaises must absorb the expected error first.
        with self.assertLogs(logger=harvester.logger, level='DEBUG'):
            with self.assertRaises(JsonLdError):
                asyncio.run(harvester.retrieve_record(url))
def test__landing_page_is_empty(self):
    """
    SCENARIO:  A landing page has absolutely no content.

    EXPECTED RESULT:  A RuntimeError is raised, and the harvester logs at
    least one message while processing the record.
    """
    url = 'https://www.archive.arm.gov/metadata/adc/html/met.html'
    harvester = ARMHarvester()

    # Any GET whose URL matches this prefix is served the canned page.
    contents = ir.read_binary('tests.data.arm', 'met.html')
    status_code = 200
    headers = {'Content-Type': 'text/html'}
    regex = re.compile('https://www.archive.arm.gov/metadata/adc')

    with aioresponses() as m:
        m.get(regex, body=contents, status=status_code, headers=headers)

        # assertLogs has to be the OUTER context manager: it skips its
        # own check whenever an exception propagates through it, so
        # assertRaises must absorb the expected error first.
        with self.assertLogs(logger=harvester.logger, level='DEBUG'):
            with self.assertRaises(RuntimeError):
                asyncio.run(harvester.retrieve_record(url))
def test__retrieve_record__bad_series_identifier(self):
    """
    SCENARIO:  We have a valid landing page URL but the JSON-LD document
    has a series identifier that is not in the format that we want.

    EXPECTED RESULT:  A RuntimeError is raised, and the harvester logs at
    least one message while processing the record.
    """
    landing_page_url = (
        'https://www.archive.arm.gov/metadata/adc/html/wsacrcrcal.html'
    )
    harvester = ARMHarvester()

    # External calls to read the:
    #
    #   1) HTML document for the landing page
    #   2) XML document associated with the landing page
    #
    # The three lists below are parallel: one entry per mocked response,
    # registered in the order given.
    contents = [
        ir.read_binary('tests.data.arm', 'wsacrcrcal.bad_series_id.html'),
        ir.read_binary('tests.data.arm', 'wsacrcrcal.xml'),
    ]
    status_codes = [200, 200]
    headers = [
        {'Content-Type': 'text/html'},
        {'Content-Type': 'application/xml'},
    ]

    with aioresponses() as m:
        responses = zip(contents, status_codes, headers)
        for content, status_code, header in responses:
            m.get(self.regex,
                  body=content, status=status_code, headers=header)

        # assertLogs is outermost so that assertRaises absorbs the
        # expected error before the log check runs.
        with self.assertLogs(logger=harvester.logger, level='DEBUG'):
            with self.assertRaises(RuntimeError):
                asyncio.run(harvester.retrieve_record(landing_page_url))