Example #1
def scrub_index_data(index_data: dict, bundle_id: str) -> list:
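    """
    Validate the documents in ``index_data`` against the JSON schemas they reference and strip any
    additional properties those schemas do not allow. Documents whose schema cannot be determined or
    retrieved are removed entirely. ``index_data`` is modified in place.

    :param index_data: a mapping of document names to lists of document contents to be indexed
    :param bundle_id: the bundle the documents belong to, used in log messages
    :return: the dotted paths of the removed fields
    """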
    cache = S3UrlCache()

    def request_json(url):
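        # Fetch the URL through the S3-backed cache and parse the body as JSON.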
        return json.loads(cache.resolve(url).decode("utf-8"))

    resolver = validators.RefResolver(referrer='',
                                      base_uri='',
                                      handlers={
                                          'http': request_json,
                                          'https': request_json
                                      })
    extra_fields = []
    extra_documents = []
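    # Validate each document against the JSON schema referenced from its own content.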
    for document in index_data.keys():
        for doc_list_ind, document_content in enumerate(index_data[document]):
            schema_info = SchemaInfo.from_json(document_content)
            if schema_info is not None:
                try:
                    schema = request_json(schema_info.url)
                except Exception as ex:
                    extra_documents.append(document)
                    logger.warning(
                        f"Unable to retrieve JSON schema information from {document} in bundle {bundle_id} "
                        f"because retrieving {schema_info.url} caused exception: {ex}."
                    )
                else:
                    for error in DSS_Draft4Validator(
                            schema,
                            resolver=resolver).iter_errors(document_content):
                        if error.validator == 'additionalProperties':
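                            # Record where the offending fields live: the document name, its index
                            # in the document list, and the error's path within the document content.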
                            path = [document, doc_list_ind, *error.path]
                            #  Example error message: "Additional properties are not allowed ('extra_lst', 'extra_top'
                            #  were unexpected)" or "'extra', does not match any of the regexes: '^characteristics_.*$'"
                            fields_to_remove = (path, list(_utils.find_additional_properties(
                                error.instance, error.schema)))
                            extra_fields.append(fields_to_remove)
            else:
                logger.warning(
                    f"Unable to retrieve JSON schema information from {document} in bundle {bundle_id}."
                )
                extra_documents.append(document)

    if extra_documents:
        extra_fields.append(([], extra_documents))
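    # Remove the offending fields and record their dotted paths for logging and the return value.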
    removed_fields = []
    for path, fields in extra_fields:
        remove_json_fields(index_data, path, fields)
        removed_fields.extend(
            ['.'.join((*[str(p) for p in path], field)) for field in fields])
    if removed_fields:
        logger.info(
            f"In {bundle_id}, unexpected additional fields have been removed from the data"
            f" to be indexed. Removed {removed_fields}.")
    return removed_fields
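
# A minimal usage sketch (hypothetical data; it assumes the schema URL is embedded in the document
# content, for example in a 'describedBy' field, so that SchemaInfo can find it). Note that
# scrub_index_data mutates its argument in place and returns the dotted paths of the removed fields:
#
#   index_data = {"metadata.json": [{"describedBy": "https://example.org/schema.json",
#                                    "known_field": 1, "extra_field": 2}]}
#   removed = scrub_index_data(index_data, bundle_id="example-bundle")
#   # e.g. removed == ['metadata.json.0.extra_field'] if the schema forbids 'extra_field'
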
    def __init__(self, schema_urls):
        """
        :param schema_urls: a list of JSON schema URLs.
        """
        self.schemas = dict()
        for url in schema_urls:
            name = url.split('/')[-1]
            self.schemas[name] = {'$ref': url, 'id': url}
        self.cache = S3UrlCache()
        self.resolver = self.resolver_factory()  # The resolver used to dereference JSON '$ref'.
        self._json_gen = JsonGenerator(resolver=self.resolver)
    def test_stored_url_metadata(self):
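        """The original URL can be looked up from the cache key, and the cached object has a generic content type."""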
        url = f"{HTTPInfo.url}/{KiB}"
        self.urls_to_cleanup.add(url)
        url_key = S3UrlCache._url_to_key(url)
        self.cache.resolve(url)

        with self.subTest("check dss_cached_url"):
            cached_url = self.cache._reverse_key_lookup(url_key)
            self.assertEqual(cached_url, url)

        with self.subTest("check content_type"):
            content_type = self.blobstore.get_content_type(self.test_bucket, url_key)
            self.assertEqual(content_type, "application/octet-stream")
    def test_store_in_cache(self):
        """The URL contents are stored in S3 and the contents returned, when requested url is not found in cache."""
        url = f"{HTTPInfo.url}/{KiB}"
        self.urls_to_cleanup.add(url)
        url_key = S3UrlCache._url_to_key(url)

        self._delete_cached_urls()
        with self.assertLogs(dss.logger, "INFO") as log_monitor:
            url_content = self.cache.resolve(url)

        original_data = randomdata[:KiB]
        self.assertEqual(len(url_content), KiB)
        self.assertEqual(url_content, original_data)
        self.assertTrue(log_monitor.output[0].endswith(
            f"{url} not found in cache. Adding it to "
            f"{self.test_bucket} with key {url_key}."))
    def test_evict(self):
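        """Evicting a cached URL removes it from the cache; evicting it again is a harmless no-op."""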
        url = f"{HTTPInfo.url}/{KiB}"
        self.urls_to_cleanup.add(url)
        url_key = S3UrlCache._url_to_key(url)

        with self.assertLogs(dss.logger, "INFO") as log_monitor:
            # Resolve the URL so it is added to the cache, then verify it is cached
            self.cache.resolve(url)
            self.assertTrue(self.cache.contains(url))
            # Remove the URL from cache
            self.cache.evict(url)
            self.assertFalse(self.cache.contains(url))
            self.cache.evict(url)
            self.assertFalse(self.cache.contains(url))

        self.assertTrue(log_monitor.output[0].endswith(
            f"{url} not found in cache. Adding it to "
            f"{self.test_bucket} with key {url_key}."))
        self.assertTrue(log_monitor.output[1].endswith(
            f"{url} removed from cache in {self.test_bucket}."))
        self.assertTrue(log_monitor.output[2].endswith(
            f"{url} not found and not removed from cache."))
    def setUp(self):
        self.urls_to_cleanup = set()
        self.cache = S3UrlCache()