def scrub_index_data(index_data: dict, bundle_id: str) -> list:
    """
    Remove from index_data any fields that the documents' JSON schemas do not allow, and drop
    documents whose schema cannot be determined or retrieved. Returns the dotted paths of
    everything that was removed.
    """
    cache = S3UrlCache()

    def request_json(url):
        return json.loads(cache.resolve(url).decode("utf-8"))

    resolver = validators.RefResolver(referrer='',
                                      base_uri='',
                                      handlers={'http': request_json,
                                                'https': request_json})
    extra_fields = []
    extra_documents = []
    for document in index_data.keys():
        for doc_list_ind, document_content in enumerate(index_data[document]):
            schema_info = SchemaInfo.from_json(document_content)
            if schema_info is not None:
                try:
                    schema = request_json(schema_info.url)
                except Exception as ex:
                    extra_documents.append(document)
                    logger.warning(
                        f"Unable to retrieve JSON schema information from {document} in bundle {bundle_id} "
                        f"because retrieving {schema_info.url} caused exception: {ex}.")
                else:
                    for error in DSS_Draft4Validator(schema, resolver=resolver).iter_errors(document_content):
                        if error.validator == 'additionalProperties':
                            path = [document, doc_list_ind, *error.path]
                            # Example error message: "Additional properties are not allowed ('extra_lst', 'extra_top'
                            # were unexpected)" or "'extra', does not match any of the regexes: '^characteristics_.*$'"
                            fields_to_remove = (path,
                                                [field for field in _utils.find_additional_properties(error.instance,
                                                                                                       error.schema)])
                            extra_fields.append(fields_to_remove)
            else:
                logger.warning(f"Unable to retrieve JSON schema information from {document} in bundle {bundle_id}.")
                extra_documents.append(document)

    if extra_documents:
        extra_fields.append(([], extra_documents))
    removed_fields = []
    for path, fields in extra_fields:
        remove_json_fields(index_data, path, fields)
        removed_fields.extend(['.'.join((*[str(p) for p in path], field)) for field in fields])
    if removed_fields:
        logger.info(f"In {bundle_id}, unexpected additional fields have been removed from the data"
                    f" to be indexed. Removed {removed_fields}.")
    return removed_fields
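# Hedged usage sketch (not part of the original module): illustrates how scrub_index_data might be
# called by an indexer. The bundle id, file name, and "describedBy" key below are illustrative
# assumptions; only the scrub_index_data signature and return value come from the code above.
def _example_scrub_usage():  # hypothetical helper, for illustration only
    index_data = {
        "donor_organism_0.json": [
            {"describedBy": "https://schema.example/donor_organism", "unexpected_field": 42},
        ],
    }
    removed = scrub_index_data(index_data, bundle_id="example-bundle-uuid.2021-01-01T000000Z")
    # `removed` lists dotted paths of stripped fields, e.g. ["donor_organism_0.json.0.unexpected_field"]
    # if the referenced schema disallows additional properties.
    return removed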
def __init__(self, schema_urls):
    """
    :param schema_urls: a list of JSON schema URLs.
    """
    self.schemas = dict()
    for url in schema_urls:
        name = url.split('/')[-1]
        self.schemas[name] = {'$ref': url, 'id': url}
    self.cache = S3UrlCache()
    self.resolver = self.resolver_factory()  # The resolver used to dereference JSON '$ref'.
    self._json_gen = JsonGenerator(resolver=self.resolver)
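# Illustrative note (an assumption, not from the source): given schema_urls such as
#     ["https://schema.example/type/biomaterial/donor_organism",
#      "https://schema.example/type/process/dissociation_process"]
# the constructor above keys self.schemas by the last path segment of each URL, i.e.
#     {"donor_organism": {"$ref": ..., "id": ...}, "dissociation_process": {"$ref": ..., "id": ...}}
# so each schema can later be dereferenced lazily through the '$ref' resolver.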
def test_stored_url_metadata(self):
    url = f"{HTTPInfo.url}/{KiB}"
    self.urls_to_cleanup.add(url)
    url_key = S3UrlCache._url_to_key(url)
    self.cache.resolve(url)

    with self.subTest("check dss_cached_url"):
        cached_url = self.cache._reverse_key_lookup(url_key)
        self.assertEqual(cached_url, url)

    with self.subTest("check content_type"):
        content_type = self.blobstore.get_content_type(self.test_bucket, url_key)
        self.assertEqual(content_type, "application/octet-stream")
def test_store_in_cache(self):
    """The URL contents are stored in S3 and returned when the requested URL is not found in the cache."""
    url = f"{HTTPInfo.url}/{KiB}"
    self.urls_to_cleanup.add(url)
    url_key = S3UrlCache._url_to_key(url)
    self._delete_cached_urls()

    with self.assertLogs(dss.logger, "INFO") as log_monitor:
        url_content = self.cache.resolve(url)

    original_data = randomdata[:KiB]
    self.assertEqual(len(url_content), KiB)
    self.assertEqual(url_content, original_data)
    self.assertTrue(log_monitor.output[0].endswith(f"{url} not found in cache. Adding it to "
                                                   f"{self.test_bucket} with key {url_key}."))
def test_evict(self):
    url = f"{HTTPInfo.url}/{KiB}"
    self.urls_to_cleanup.add(url)
    url_key = S3UrlCache._url_to_key(url)

    with self.assertLogs(dss.logger, "INFO") as log_monitor:
        # Verify the URL is cached
        self.cache.resolve(url)
        self.assertTrue(self.cache.contains(url))

        # Remove the URL from cache
        self.cache.evict(url)
        self.assertFalse(self.cache.contains(url))

        # Evicting an already-evicted URL is a no-op
        self.cache.evict(url)
        self.assertFalse(self.cache.contains(url))

    self.assertTrue(log_monitor.output[0].endswith(f"{url} not found in cache. Adding it to "
                                                   f"{self.test_bucket} with key {url_key}."))
    self.assertTrue(log_monitor.output[1].endswith(f"{url} removed from cache in {self.test_bucket}."))
    self.assertTrue(log_monitor.output[2].endswith(f"{url} not found and not removed from cache."))
def setUp(self):
    self.urls_to_cleanup = set()
    self.cache = S3UrlCache()
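# The cleanup path for cached URLs is not shown in this excerpt. A minimal tearDown sketch,
# assuming that evicting every URL recorded in urls_to_cleanup is sufficient; it relies only on
# S3UrlCache.evict(), which test_evict above exercises. The real suite may clean up differently.
def tearDown(self):
    for url in self.urls_to_cleanup:
        self.cache.evict(url)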