示例#1
0
 def test_verify_dynamic_mapping(self):
     """Index a document containing assorted field types and verify that the
     index's dynamic templates map them to the expected Elasticsearch types."""
     sample_doc = {
         "manifest": {
             "data": "hello world!"
         },
         "description":
         "Scooby dooby do, where are you, we got some work to do now.",
         "time1": "2017-11-02T09:50:20.123123Z",
         "time2": "2017-11-02 09:55:12",
         "time3": "2017-11-02",
     }
     fqid = f"{str(uuid.uuid4())}.{get_version()}"
     client = ElasticsearchClient.get()
     client.index(index=self.dss_index_name,
                  doc_type=ESDocType.doc.name,
                  id=fqid,
                  body=sample_doc)
     mappings = client.indices.get_mapping(
         self.dss_index_name)[self.dss_index_name]['mappings']
     doc_props = mappings['doc']['properties']
     # Percolator queries keep their dedicated field type.
     self.assertEqual(mappings['query']['properties']['query']['type'],
                      'percolator')
     # Strings become keyword with a nested full-text sub-field.
     self.assertEqual(doc_props['description']['type'], 'keyword')
     self.assertEqual(doc_props['description']['fields']['text']['type'],
                      'text')
     # All three timestamp formats should be detected as dates.
     for time_field in ('time1', 'time2', 'time3'):
         self.assertEqual(doc_props[time_field]['type'], 'date')
示例#2
0
def get(uuid: str, replica: str):
    """Return the subscription identified by *uuid* in *replica* as JSON.

    :param uuid: the subscription's ID (also its ES document ID)
    :param replica: name of a Replica enum member
    :raises DSSException: 404 if no such subscription exists, 403 if it is
                          owned by a different user than the caller
    """
    owner = security.get_token_email(request.token_info)

    es_client = ElasticsearchClient.get()
    try:
        response = es_client.get(index=Config.get_es_index_name(
            ESIndexType.subscriptions, Replica[replica]),
                                 doc_type=ESDocType.subscription.name,
                                 id=uuid)
    except NotFoundError:
        raise DSSException(requests.codes.not_found, "not_found",
                           "Cannot find subscription!")

    source = response['_source']
    source['uuid'] = uuid
    source['replica'] = replica
    # FIX: the old code checked `'hmac_key_id' in response`, i.e. the ES
    # response envelope (_index/_id/_version/_source/...), which never contains
    # that key — the copy was dead code. The hmac_key_id lives inside the
    # stored document and is therefore already present in `source` when set.
    # Only the shared secret itself must never be returned to the caller.
    source.pop('hmac_secret_key', None)
    if source['owner'] != owner:
        # common_error_handler defaults code to capitalized 'Forbidden' for Werkzeug exception. Keeping consistent.
        raise DSSException(requests.codes.forbidden, "Forbidden",
                           "Your credentials can't access this subscription!")

    return jsonify(source), requests.codes.okay
示例#3
0
def delete(uuid: str, replica: str):
    """Delete the caller's subscription *uuid* from *replica*.

    :raises DSSException: 404 when the subscription does not exist, 403 when
                          it belongs to a different owner
    """
    owner = security.get_token_email(request.token_info)
    es_client = ElasticsearchClient.get()

    try:
        subscription = es_client.get(
            index=Config.get_es_index_name(ESIndexType.subscriptions,
                                           Replica[replica]),
            doc_type=ESDocType.subscription.name,
            id=uuid)
    except NotFoundError:
        raise DSSException(requests.codes.not_found, "not_found",
                           "Cannot find subscription!")

    if subscription['_source']['owner'] != owner:
        # common_error_handler defaults code to capitalized 'Forbidden' for Werkzeug exception. Keeping consistent.
        raise DSSException(requests.codes.forbidden, "Forbidden",
                           "Your credentials can't access this subscription!")

    _delete_subscription(es_client, uuid)

    # NOTE(review): the timestamp format omits colons, matching the DSS
    # version-string style — presumably intentional; confirm before changing.
    time_deleted = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%S.%fZ")
    return jsonify({'timeDeleted': time_deleted}), requests.codes.okay
    def _refresh_percolate_queries(self, index_name: str):
        """Re-register every subscription query as a percolator on *index_name*.

        When dynamic templates are used and queries for percolation were added
        to an index before the index contained mappings of the fields those
        queries reference, the queries must be reloaded once the mappings
        exist for the queries to match.
        See: https://github.com/elastic/elasticsearch/issues/5750
        """
        es_client = ElasticsearchClient.get()
        subscriptions_index = Config.get_es_index_name(ESIndexType.subscriptions, self.replica)
        if not es_client.indices.exists(subscriptions_index):
            return
        hits = scan(es_client,
                    index=subscriptions_index,
                    doc_type=ESDocType.subscription.name,
                    query={'query': {'match_all': {}}})
        actions = [{'_index': index_name,
                    '_type': ESDocType.query.name,
                    '_id': hit['_id'],
                    '_source': hit['_source']['es_query']} for hit in hits]
        if not actions:
            return
        try:
            bulk(es_client, iter(actions), refresh=True)
        except BulkIndexError as ex:
            logger.error(f"Error occurred when adding subscription queries "
                         f"to index {index_name} Errors: {ex.errors}")
def delete(uuid: str, replica: str):
    """Delete the subscription *uuid* from *replica*.

    Removes the subscription's percolate queries from every index behind the
    docs alias, then deletes the subscription document itself.

    :raises DSSException: 404 when the subscription does not exist, 403 when
                          it is owned by someone other than the caller
    """
    authenticated_user_email = request.token_info['email']

    es_client = ElasticsearchClient.get()

    try:
        response = es_client.get(index=Config.get_es_index_name(
            ESIndexType.subscriptions, Replica[replica]),
                                 doc_type=ESDocType.subscription.name,
                                 id=uuid)
    except NotFoundError as ex:
        # FIX: `ex` was previously captured but never used — chain it so the
        # original Elasticsearch error is preserved for debugging.
        raise DSSException(requests.codes.not_found, "not_found",
                           "Cannot find subscription!") from ex

    stored_metadata = response['_source']

    if stored_metadata['owner'] != authenticated_user_email:
        # common_error_handler defaults code to capitalized 'Forbidden' for Werkzeug exception. Keeping consistent.
        raise DSSException(requests.codes.forbidden, "Forbidden",
                           "Your credentials can't access this subscription!")

    #  get all indexes that use current alias
    alias_name = Config.get_es_alias_name(ESIndexType.docs, Replica[replica])
    doc_indexes = _get_indexes_by_alias(es_client, alias_name)
    _unregister_percolate(es_client, doc_indexes, uuid)

    es_client.delete(index=Config.get_es_index_name(ESIndexType.subscriptions,
                                                    Replica[replica]),
                     doc_type=ESDocType.subscription.name,
                     id=uuid)

    # NOTE(review): colon-free timestamp matches the DSS version-string style.
    timestamp = datetime.datetime.utcnow()
    time_deleted = timestamp.strftime("%Y-%m-%dT%H%M%S.%fZ")

    return jsonify({'timeDeleted': time_deleted}), requests.codes.okay
 def _prepare_index(self, dryrun):
     """Compute the docs index name for this document's shape and replica,
     creating the index unless *dryrun* is truthy. Returns the index name."""
     index_name = Config.get_es_index_name(ESIndexType.docs,
                                           self.replica,
                                           self.get_shape_descriptor())
     es = ElasticsearchClient.get()
     if not dryrun:
         IndexManager.create_index(es, self.replica, index_name)
     return index_name
 def _write_to_index(self, index_name: str, version: typing.Optional[int] = None):
     """Write the document via the superclass, then refresh the percolate
     queries if the write caused the index mappings to change."""
     client = ElasticsearchClient.get()
     mappings_before = client.indices.get_mapping(index_name)[index_name]['mappings']
     super()._write_to_index(index_name, version=version)
     mappings_after = client.indices.get_mapping(index_name)[index_name]['mappings']
     if mappings_before != mappings_after:
         self._refresh_percolate_queries(index_name)
示例#8
0
    def test_put(self):
        """Putting a subscription registers its percolate query under the
        subscription's UUID in the docs index."""
        subscription_uuid = self._put_subscription()

        hit = ElasticsearchClient.get().get(index=self.doc_index_name,
                                            doc_type=dss.ESDocType.query.name,
                                            id=subscription_uuid)
        self.assertEqual(self.sample_percolate_query, hit['_source'])
示例#9
0
 def from_index(cls,
                replica: Replica,
                bundle_fqid: BundleFQID,
                index_name,
                version=None):
     """Construct an instance from the copy of this document stored in the given index.

     :param replica: the replica the document belongss to
     :param bundle_fqid: fully-qualified bundle ID; its string form is the ES document ID
     :param index_name: the Elasticsearch index to read from
     :param version: optional ES document version to require; None reads the latest
     """
     es_client = ElasticsearchClient.get()
     # NOTE(review): the first three arguments to es_client.get() are passed
     # positionally; their mapping onto (index, doc_type, id) depends on the
     # pinned elasticsearch-py version — confirm against the client signature.
     source = es_client.get(index_name,
                            str(bundle_fqid),
                            ESDocType.doc.name,
                            version=version)['_source']
     return cls(replica, bundle_fqid, source)
示例#10
0
 def check_count(self, es_query, expected_count, timeout=5):
     """Poll Elasticsearch until *es_query* matches *expected_count* documents,
     failing the test if that does not happen within *timeout* seconds."""
     client = ElasticsearchClient.get()
     deadline = timeout + time.time()
     while time.time() <= deadline:
         result = client.count(index=self.dss_index_name,
                               doc_type=ESDocType.doc.name,
                               body=es_query)
         if result['count'] == expected_count:
             return
         time.sleep(0.5)
     self.fail("elasticsearch failed to return all results.")
示例#11
0
def elasticsearch_delete_index(index_name: str):
    """Best-effort deletion of a TEST Elasticsearch index.

    Asserts that we are running under the TEST configuration and that the
    index carries the test suffix, so a misconfiguration can never delete a
    non-test index. Failures are logged rather than raised.
    """
    # ensure the indexes are test index.
    assert Config._CURRENT_CONFIG == BucketConfig.TEST
    assert Config.test_index_suffix.value
    assert index_name.endswith(Config.test_index_suffix.value)

    try:
        # ignore=[404]: a missing index is fine — cleanup is idempotent.
        ElasticsearchClient.get().indices.delete(index=index_name, ignore=[404])
    except Exception as e:
        # Deliberate best-effort cleanup: log and continue.
        logger.warning(
            "Error occurred while removing Elasticsearch index:%s Exception: %s",
            index_name, e)
示例#12
0
 def _prepare_index(self, dryrun):
     """Compute the docs index name from the SHA-1 of this document's shape
     descriptor, creating the index unless *dryrun*. Returns the index name."""
     shape = self['shape_descriptor']
     # A missing shape descriptor maps to the empty string rather than a hash.
     hashed = (hashlib.sha1(str(shape).encode("utf-8")).hexdigest()
               if shape is not None else "")
     index_name = Config.get_es_index_name(ESIndexType.docs, self.replica, hashed)
     es = ElasticsearchClient.get()
     if not dryrun:
         IndexManager.create_index(es, self.replica, index_name)
     return index_name
 def _remove_versions(self, versions: typing.MutableMapping[str, int]):
     """
     Remove this document from each given index provided that it contains the given version of this document.
     """
     doc_id = str(self.fqid)
     actions = [{'_op_type': 'delete',
                 '_index': index_name,
                 '_type': ESDocType.doc.name,
                 '_version': doc_version,
                 '_id': doc_id}
                for index_name, doc_version in versions.items()]
     # raise_on_error=False: collect per-document failures instead of raising.
     _, errors = bulk(ElasticsearchClient.get(), raise_on_error=False, actions=actions)
     for item in errors:
         logger.warning(f"Document deletion failed: {json.dumps(item)}")
示例#14
0
 def _get_indexed_versions(self) -> typing.MutableMapping[str, int]:
     """
     Returns a dictionary mapping the name of each index containing this document to the
     version of this document in that index. Note that `version` denotes document version, not
     bundle version.
     """
     es_client = ElasticsearchClient.get()
     alias_name = Config.get_es_alias_name(ESIndexType.docs, self.replica)
     # First attempt to get the single instance of the document. The common case is that there is zero or one
     # instance. _source=False and stored_fields=[] suppress the document body; we only need the metadata.
     try:
         doc = es_client.get(id=str(self.fqid),
                             index=alias_name,
                             _source=False,
                             stored_fields=[])
         # One instance found
         return {doc['_index']: doc['_version']}
     except TransportError as e:
         if e.status_code == 404:
             # No instance found
             return {}
         elif e.status_code == 400:
             # This could be a general error or one complaining that we attempted a single-index operation
             # against a multi-index alias. If the latter, we can actually avoid a round trip by parsing the index
             # names out of the error message generated at https://github.com/elastic/elasticsearch/blob/5.5
             # /core/src/main/java/org/elasticsearch/cluster/metadata/IndexNameExpressionResolver.java#L194
             error = e.info.get('error')
             if error:
                 reason = error.get('reason')
                 if reason:
                     # NOTE(review): multi_index_error is presumably a compiled regex whose
                     # group(2) captures the comma-separated index list — confirm on the class.
                     match = self.multi_index_error.fullmatch(reason)
                     if match:
                         indices = map(str.strip, match.group(2).split(','))
                         # Now get the document version from all indices in the alias
                         doc = es_client.mget(_source=False,
                                              stored_fields=[],
                                              body={
                                                  'docs': [{
                                                      '_id':
                                                      str(self.fqid),
                                                      '_index':
                                                      index
                                                  } for index in indices]
                                              })
                         # Skip indices in which the document was not found.
                         return {
                             doc['_index']: doc['_version']
                             for doc in doc['docs'] if doc.get('found')
                         }
         # Anything we could not positively classify is re-raised unchanged.
         raise
 def _find_matching_subscriptions(self, index_name: str) -> typing.MutableSet[str]:
     """Percolate this document against *index_name* and return the IDs of all
     subscription queries that match it."""
     percolate_query = {
         'query': {
             'percolate': {
                 'field': "query",
                 'document_type': ESDocType.doc.name,
                 'document': self
             }
         }
     }
     subscription_ids = {hit["_id"]
                         for hit in scan(ElasticsearchClient.get(),
                                         index=index_name,
                                         query=percolate_query)}
     logger.debug(f"Found {len(subscription_ids)} matching subscription(s).")
     return subscription_ids
    def _write_to_index(self, index_name: str, version: typing.Optional[int] = None):
        """
        Place this document into the given index.

        :param version: if 0, write only if this document is currently absent from the given index
                        if > 0, write only if the specified version of this document is currently present
                        if None, write regardless
        """
        body = self.to_json()
        logger.debug(f"Writing document to index {index_name}: {body}")
        # version == 0 means "create only if absent" (op_type='create'); any
        # falsy version (0 or None) sends no version constraint to ES.
        ElasticsearchClient.get().index(index=index_name,
                                        doc_type=ESDocType.doc.name,
                                        id=str(self.fqid),
                                        body=body,
                                        op_type='create' if version == 0 else 'index',
                                        version=version if version else None)
示例#17
0
 def setUp(self):
     """Create the docs index, seed it with one document, and record the
     alias/index names and sample fixtures used by the subscription tests."""
     super().setUp()
     self.alias_name = dss.Config.get_es_alias_name(dss.ESIndexType.docs,
                                                    self.replica)
     self.sub_index_name = dss.Config.get_es_index_name(
         dss.ESIndexType.subscriptions, self.replica)
     self.doc_index_name = dss.Config.get_es_index_name(
         dss.ESIndexType.docs, self.replica,
         self.index_document.get_shape_descriptor())
     client = ElasticsearchClient.get()
     IndexManager.create_index(client, self.replica, self.doc_index_name)
     # refresh=True makes the seeded document searchable immediately.
     client.index(index=self.doc_index_name,
                  doc_type=dss.ESDocType.doc.name,
                  id=str(uuid.uuid4()),
                  body=self.index_document,
                  refresh=True)
     self.callback_url = "https://example.com"
     self.sample_percolate_query = smartseq2_paired_ends_v2_or_v3_query
示例#18
0
 def populate_search_index(self, index_document: dict, count: int) -> list:
     """Index *count* copies of *index_document*, each under a fresh bundle
     FQID, and return [(bundle_fqid, bundle_url), ...].

     Only the final write refreshes the index, so all documents become
     searchable together once the loop finishes.
     """
     client = ElasticsearchClient.get()
     bundles = []
     last = count - 1
     for i in range(count):
         bundle_uuid = str(uuid.uuid4())
         version = get_version()
         index_document['manifest']['version'] = version
         fqid = f"{bundle_uuid}.{version}"
         url = (
             f"https://127.0.0.1:{self.app._port}"
             f"/v1/bundles/{bundle_uuid}?version={version}&replica={self.replica.name}"
         )
         client.index(index=self.dss_index_name,
                      doc_type=ESDocType.doc.name,
                      id=fqid,
                      body=index_document,
                      refresh=(i == last))
         bundles.append((fqid, url))
     return bundles
 def _get_subscription(self, subscription_id: str) -> dict:
     """Fetch the subscription document with the given ID, asserting exactly
     one hit, and return its source with the ID injected under 'id'."""
     query = {
         'query': {
             'ids': {
                 'type': ESDocType.subscription.name,
                 'values': [subscription_id]
             }
         }
     }
     response = ElasticsearchClient.get().search(
         index=Config.get_es_index_name(ESIndexType.subscriptions, self.replica),
         body=query)
     hits = response['hits']['hits']
     assert len(hits) == 1
     hit = hits[0]
     assert hit['_id'] == subscription_id
     subscription = hit['_source']
     # The stored document must not already carry an 'id' field we'd clobber.
     assert 'id' not in subscription
     subscription['id'] = subscription_id
     return subscription
示例#20
0
def clear_indexes(index_names: List[str], doctypes: List[str]):
    """
    Erases all of the documents in indexes with any of the doctypes provided. This can only be used in TEST
    configuration with IndexSuffix.name set. Only indexes with the same IndexSuffix.name can be erased.
    """
    # Safety guard: refuse to touch anything that is not a test index.
    assert Config._CURRENT_CONFIG == BucketConfig.TEST
    assert Config.test_index_suffix.value
    assert all(name.endswith(Config.test_index_suffix.value)
               for name in index_names)

    es_client = ElasticsearchClient.get()
    if not es_client.indices.exists(index_names):
        return
    # conflicts='proceed': keep deleting even if some documents changed
    # underneath us; refresh=True makes the deletions visible immediately.
    es_client.delete_by_query(index=index_names,
                              doc_type=doctypes,
                              body={'query': {
                                  'match_all': {}
                              }},
                              refresh=True,
                              conflicts='proceed')
示例#21
0
def find(replica: str):
    """List the caller's subscriptions in *replica*, omitting each
    subscription's HMAC secret key from the response."""
    owner = security.get_token_email(request.token_info)

    search = Search(using=ElasticsearchClient.get(),
                    index=Config.get_es_index_name(
                        ESIndexType.subscriptions, Replica[replica]),
                    doc_type=ESDocType.subscription.name) \
        .query({'bool': {'must': [{'term': {'owner': owner}}]}})

    subscriptions = []
    for hit in search.scan():
        # Start from the request context, then overlay the stored fields
        # (stored fields win on key collisions), dropping the shared secret.
        entry = {'uuid': hit.meta.id, 'replica': replica, 'owner': owner}
        entry.update((k, v) for k, v in hit.to_dict().items()
                     if k != 'hmac_secret_key')
        subscriptions.append(entry)

    return jsonify({'subscriptions': subscriptions}), requests.codes.okay
def find(replica: str):
    """List the subscriptions owned by the authenticated user in *replica*."""
    owner = request.token_info['email']

    search = Search(using=ElasticsearchClient.get(),
                    index=Config.get_es_index_name(
                        ESIndexType.subscriptions, Replica[replica]),
                    doc_type=ESDocType.subscription.name) \
        .query({'match': {'owner': owner}})

    subscriptions = [{'uuid': hit.meta.id,
                      'replica': replica,
                      'owner': owner,
                      'callback_url': hit.callback_url,
                      'es_query': hit.es_query.to_dict()}
                     for hit in search.scan()]

    return jsonify({'subscriptions': subscriptions}), requests.codes.okay
示例#23
0
 def test_search_session_expired_when_session_deleted(self):
     """Deleting the scroll context server-side must surface as a 404 with
     error code elasticsearch_context_not_found on the next page request."""
     self.populate_search_index(self.index_document, 20)
     self.check_count(smartseq2_paired_ends_v3_query, 20)
     first_page = self.assertPostResponse(
         path=self.build_url({"per_page": 10}),
         json_request_body=dict(es_query=smartseq2_paired_ends_v3_query),
         expected_code=requests.codes.partial)
     self.verify_search_result(first_page.json,
                               smartseq2_paired_ends_v3_query, 20, 10)
     next_url = self.get_next_url(first_page.response.headers)
     scroll_id = self.verify_next_url(next_url, 10)
     # Kill the scroll context out from under the paging session.
     ElasticsearchClient.get().clear_scroll(scroll_id)
     self.assertPostResponse(
         path=self.strip_next_url(next_url),
         json_request_body=dict(es_query=smartseq2_paired_ends_v3_query),
         expected_code=requests.codes.not_found,
         expected_error=ExpectedErrorFields(
             code="elasticsearch_context_not_found",
             status=requests.codes.not_found))
示例#24
0
def _es_search_page(es_query: dict, replica: Replica, per_page: int,
                    _scroll_id: typing.Optional[str],
                    output_format: str) -> dict:
    """Fetch one page of search results.

    The first call (``_scroll_id is None``) creates a scroll context; later
    calls advance it. The time a scroll context stays open per page is '2m';
    a page must be retrieved before that expires, and each subsequent call
    refreshes the timeout. Time-unit format:
    https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#time-units
    """
    es_client = ElasticsearchClient.get()
    es_query = deepcopy(es_query)

    # Do not return the raw indexed data unless it is requested
    if output_format != 'raw':
        es_query['_source'] = False

    scroll = '2m'

    if _scroll_id is not None:
        page = es_client.scroll(scroll_id=_scroll_id, scroll=scroll)
        logger.debug(
            f"Retrieved ES results from scroll instance Scroll_id: {_scroll_id}"
        )
        return page

    # Per https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-scroll.html
    # sorting by _doc is the most efficient order when iterating all documents
    # regardless of relevance.
    page = es_client.search(index=Config.get_es_alias_name(ESIndexType.docs,
                                                           replica),
                            doc_type=ESDocType.doc.name,
                            scroll=scroll,
                            size=per_page,
                            body=es_query,
                            sort={"sort": ["_doc"]})
    logger.debug("Created ES scroll instance")
    return page
示例#25
0
 def setUp(self):
     """Create and seed the docs index and set up the subscription fixtures
     (endpoint, percolate query, HMAC credentials) used by the tests."""
     super().setUp()
     self.alias_name = dss.Config.get_es_alias_name(dss.ESIndexType.docs, self.replica)
     self.sub_index_name = dss.Config.get_es_index_name(dss.ESIndexType.subscriptions, self.replica)
     raw_shape = self.index_document._get_shape_descriptor()
     shape_hash = hashlib.sha1(f"{raw_shape}".encode("utf-8")).hexdigest()
     self.doc_index_name = dss.Config.get_es_index_name(dss.ESIndexType.docs, self.replica, shape_hash)
     client = ElasticsearchClient.get()
     IndexManager.create_index(client, self.replica, self.doc_index_name)
     # refresh=True makes the seeded document searchable immediately.
     client.index(index=self.doc_index_name,
                  doc_type=dss.ESDocType.doc.name,
                  id=str(uuid.uuid4()),
                  body=self.index_document,
                  refresh=True)
     self.endpoint = Endpoint(callback_url="https://example.com",
                              method="POST",
                              encoding="application/json",
                              form_fields={'foo': 'bar'},
                              payload_form_field='baz')
     self.sample_percolate_query = smartseq2_paired_ends_vx_query
     self.hmac_key_id = 'dss_test'
     self.hmac_secret_key = '23/33'
示例#26
0
def get_es_client():
    """Resolve this deployment's Elasticsearch domain endpoint and return a client.

    Side effect: exports DSS_ES_ENDPOINT so ElasticsearchClient.get() can
    connect to the resolved host.
    """
    stage = os.environ['DSS_DEPLOYMENT_STAGE']
    domain = boto3.client("es").describe_elasticsearch_domain(
        DomainName="dss-index-" + stage)
    os.environ['DSS_ES_ENDPOINT'] = domain['DomainStatus']['Endpoint']
    return ElasticsearchClient.get()
示例#27
0
 def setUp(self):
     """Create a fresh docs index for this replica before each test."""
     super().setUp()
     self.dss_index_name = dss.Config.get_es_index_name(dss.ESIndexType.docs,
                                                        self.replica)
     IndexManager.create_index(ElasticsearchClient.get(), self.replica,
                               self.dss_index_name)
def put(json_request_body: dict, replica: str):
    """Create a subscription: register its percolate query on every docs
    index, then store the subscription document itself.

    Registration is best-effort per index — an error is raised only if the
    query could not be registered on ANY index. If storing the subscription
    document fails afterwards, the percolate queries are rolled back.

    :param json_request_body: request payload; must contain 'es_query'
    :param replica: name of a Replica enum member
    :raises DSSException: 500 when percolate registration or subscription
                          storage fails
    """
    uuid = str(uuid4())
    es_query = json_request_body['es_query']
    owner = request.token_info['email']

    es_client = ElasticsearchClient.get()

    # NOTE(review): "string"/"not_analyzed" is pre-ES-5 mapping syntax —
    # confirm it matches the cluster version in use.
    index_mapping = {
        "mappings": {
            ESDocType.subscription.name: {
                "properties": {
                    "owner": {
                        "type": "string",
                        "index": "not_analyzed"
                    },
                    "es_query": {
                        "type": "object",
                        "enabled": "false"
                    }
                }
            }
        }
    }
    # Elasticsearch preprocesses inputs by splitting strings on punctuation.
    # So for [email protected], if I searched for people with the email address [email protected],
    # [email protected] would show up because elasticsearch matched example w/ example.
    # By including "index": "not_analyzed", Elasticsearch leaves all owner inputs alone.
    index_name = Config.get_es_index_name(ESIndexType.subscriptions,
                                          Replica[replica])
    IndexManager.get_subscription_index(es_client, index_name, index_mapping)

    #  get all indexes that use current alias
    alias_name = Config.get_es_alias_name(ESIndexType.docs, Replica[replica])
    doc_indexes = _get_indexes_by_alias(es_client, alias_name)

    #  try to subscribe query to each of the indexes.
    subscribed_indexes = []
    for doc_index in doc_indexes:
        try:
            percolate_registration = _register_percolate(
                es_client, doc_index, uuid, es_query, replica)
        except ElasticsearchException as ex:
            logger.debug(
                f"Exception occured when registering a document to an index. Exception: {ex}"
            )
            # Remember the most recent failure for the all-failed error below.
            last_ex = ex
        else:
            logger.debug(
                f"Percolate query registration succeeded:\n{percolate_registration}"
            )
            subscribed_indexes.append(doc_index)

    # Queries are unlikely to fit in all of the indexes, therefore errors will almost always occur. Only return an error
    # if no queries are successfully indexed.
    if doc_indexes and not subscribed_indexes:
        # last_ex is guaranteed to be bound here: at least one index existed
        # and every registration attempt raised.
        logger.critical(
            f"Percolate query registration failed: owner: {owner}, uuid: {uuid}, "
            f"replica: {replica}, es_query: {es_query}, Exception: {last_ex}")
        raise DSSException(
            requests.codes.internal_server_error, "elasticsearch_error",
            "Unable to register elasticsearch percolate query!") from last_ex

    json_request_body['owner'] = owner

    try:
        subscription_registration = _register_subscription(
            es_client, uuid, json_request_body, replica)
        logger.debug(
            f"Event Subscription succeeded:\n{subscription_registration}")
    except ElasticsearchException as ex:
        logger.critical(
            f"Event Subscription failed: owner: {owner}, uuid: {uuid}, "
            f"replica: {replica}, Exception: {ex}")

        # Delete percolate query to make sure queries and subscriptions are in sync.
        doc_indexes = _get_indexes_by_alias(es_client, alias_name)
        _unregister_percolate(es_client, doc_indexes, uuid)

        raise DSSException(
            requests.codes.internal_server_error, "elasticsearch_error",
            "Unable to register subscription! Rolling back percolate query.")

    return jsonify(dict(uuid=uuid)), requests.codes.created
示例#29
0
                         output_format: str) -> dict:
    result_list = []  # type: typing.List[dict]
    for hit in page['hits']['hits']:
        result = {'bundle_fqid': hit['_id'], 'search_score': hit['_score']}
        if output_format == 'raw':
            result['metadata'] = hit['_source']
        result_list.append(result)

    return {
        'es_query': es_query,
        'results': result_list,
        'total_hits': page['hits']['total']
    }


# Script setup: shared ES client and the search parameters for the scan below.
es_client = ElasticsearchClient.get()

# Search the AWS replica for every document that has a files.links_json
# field, returning raw indexed metadata, 1000 results per page.
replica = Replica.aws
es_query = {
    "query": {
        "bool": {
            "must": [{
                "exists": {
                    "field": "files.links_json"
                }
            }]
        }
    }
}
output_format = 'raw'
per_page = 1000