def elasticsearch(self):
    """
    Indicates whether the Elasticsearch cluster is responsive.
    """
    return {
        'up': ESClientFactory.get().ping(),
    }
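# Usage sketch, not part of the module above: ping() returns a boolean and reports
# transport failures as False rather than raising, which is why the health check
# above can embed its result directly. The endpoint below is a hypothetical
# assumption for illustration only.
from elasticsearch import Elasticsearch

if __name__ == '__main__':
    es = Elasticsearch(hosts=['http://localhost:9200'])  # hypothetical endpoint
    print({'up': es.ping()})  # same shape as the dict returned by elasticsearch()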
def add_cart_item(self, catalog: CatalogName, user_id, cart_id, entity_id, entity_type, entity_version):
    """
    Add an item to a cart and return the created item ID.

    An error will be raised if the cart does not exist or does not belong
    to the user.
    """
    # TODO: Cart item should have some user-readable name
    if cart_id is None:
        cart = self.get_or_create_default_cart(user_id)
    else:
        cart = self.get_cart(user_id, cart_id)
    real_cart_id = cart['CartId']
    if not entity_version:
        # When entity_version is not given, check data integrity and retrieve
        # the version from the index.
        entity = ESClientFactory.get().get(index=config.es_index_name(catalog=catalog,
                                                                      entity_type=entity_type,
                                                                      aggregate=True),
                                           id=entity_id,
                                           _source=True,
                                           _source_include=[
                                               'contents.files.uuid',  # data file UUID
                                               'contents.files.version',  # data file version
                                               'contents.projects.document_id',  # metadata file UUID
                                               'contents.samples.document_id',  # metadata file UUID
                                           ])['_source']
        normalized_entity = self.extract_entity_info(entity_type, entity)
        entity_version = normalized_entity['version']
    new_item = self.transform_entity_to_cart_item(real_cart_id, entity_type, entity_id, entity_version)
    self.dynamo_accessor.insert_item(config.dynamo_cart_item_table_name, new_item)
    return new_item['CartItemId']
def __init__(self,
             catalog: Optional[CatalogName],
             field_types: CataloguedFieldTypes,
             refresh: Union[bool, str],
             conflict_retry_limit: int,
             error_retry_limit: int) -> None:
    """
    :param field_types: A mapping of field paths to field types

    :param refresh: https://www.elastic.co/guide/en/elasticsearch/reference/5.5/docs-refresh.html

    :param conflict_retry_limit: The maximum number of retries (the second
                                 attempt is the first retry) on version
                                 conflicts. Specify 0 for no retries or
                                 None for unlimited retries.

    :param error_retry_limit: The maximum number of retries (the second
                              attempt is the first retry) on other errors.
                              Specify 0 for no retries or None for
                              unlimited retries.
    """
    super().__init__()
    self.catalog = catalog
    self.field_types = field_types
    self.refresh = refresh
    self.conflict_retry_limit = conflict_retry_limit
    self.error_retry_limit = error_retry_limit
    self.es_client = ESClientFactory.get()
    self.errors: MutableMapping[DocumentCoordinates, int] = defaultdict(int)
    self.conflicts: MutableMapping[DocumentCoordinates, int] = defaultdict(int)
    self.retries: Optional[MutableSet[DocumentCoordinates]] = None
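# Construction sketch; the class name `IndexWriter`, the catalog name and the
# argument values are illustrative assumptions, since the enclosing class is not
# shown here.
writer = IndexWriter(catalog='dcp2',              # hypothetical catalog name
                     field_types={},              # stand-in for a real CataloguedFieldTypes mapping
                     refresh=False,               # let Elasticsearch refresh on its own schedule
                     conflict_retry_limit=3,      # up to three retries on version conflicts
                     error_retry_limit=2)         # up to two retries on other errors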
def _read_aggregates(self,
                     entities: CataloguedTallies
                     ) -> Dict[CataloguedEntityReference, Aggregate]:
    coordinates = [
        AggregateCoordinates(entity=entity)
        for entity in entities
    ]
    request = {
        'docs': [
            {
                '_type': coordinate.type,
                '_index': coordinate.index_name,
                '_id': coordinate.document_id
            }
            for coordinate in coordinates
        ]
    }
    catalogs = {coordinate.entity.catalog for coordinate in coordinates}
    mandatory_source_fields = set()
    for catalog in catalogs:
        aggregate_cls = self.aggregate_class(catalog)
        mandatory_source_fields.update(aggregate_cls.mandatory_source_fields())
    response = ESClientFactory.get().mget(body=request,
                                          _source_include=list(mandatory_source_fields))

    def aggregates():
        for doc in response['docs']:
            if doc['found']:
                coordinate = DocumentCoordinates.from_hit(doc)
                aggregate_cls = self.aggregate_class(coordinate.entity.catalog)
                aggregate = aggregate_cls.from_index(self.catalogued_field_types(),
                                                     doc,
                                                     coordinates=coordinate)
                yield aggregate

    return {a.coordinates.entity: a for a in aggregates()}
def create_indices(self, catalog: CatalogName):
    es_client = ESClientFactory.get()
    for index_name in self.index_names(catalog):
        while True:
            settings = self.settings(index_name)
            mappings = self.metadata_plugin(catalog).mapping()
            try:
                with silenced_es_logger():
                    index = es_client.indices.get(index=index_name)
            except NotFoundError:
                try:
                    es_client.indices.create(index=index_name,
                                             body=dict(settings=settings,
                                                       mappings=mappings))
                except RequestError as e:
                    if e.error == 'resource_already_exists_exception':
                        log.info('Another party concurrently created index %r, retrying.',
                                 index_name)
                    else:
                        raise
            else:
                self._check_index(settings=settings,
                                  mappings=mappings,
                                  index=index[index_name])
                break
def create_indices(self, catalog: CatalogName):
    es_client = ESClientFactory.get()
    for index_name in self.index_names(catalog):
        es_client.indices.create(index=index_name,
                                 ignore=[400],
                                 body=dict(settings=self.settings(index_name),
                                           mappings=dict(doc=self.metadata_plugin(catalog).mapping())))
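# Sketch of what ignore=[400] buys in the variant above: elasticsearch-py swallows the
# HTTP 400 (resource_already_exists_exception) returned when the index already exists
# instead of raising RequestError. Endpoint and index name are hypothetical.
from elasticsearch import Elasticsearch

es = Elasticsearch(hosts=['http://localhost:9200'])  # hypothetical endpoint
es.indices.create(index='example_index')  # creates the index
es.indices.create(index='example_index', ignore=[400])  # second call returns the error body instead of raising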
def _assert_indices_exist(self, catalog: CatalogName):
    """
    Aside from checking that all indices exist, this method also asserts
    that we can instantiate a local ES client pointing at a real, remote
    ES domain.
    """
    es_client = ESClientFactory.get()
    service = IndexService()
    for index_name in service.index_names(catalog):
        self.assertTrue(es_client.indices.exists(index_name))
def setUpClass(cls):
    super().setUpClass()
    es_endpoint = cls._create_container('docker.elastic.co/elasticsearch/elasticsearch:6.8.0',
                                        container_port=9200,
                                        environment=['xpack.security.enabled=false',
                                                     'discovery.type=single-node',
                                                     'ES_JAVA_OPTS=-Xms512m -Xmx512m'])
    try:
        new_env = config.es_endpoint_env(es_endpoint=es_endpoint,
                                         es_instance_count=2)
        cls._env_patch = mock.patch.dict(os.environ, **new_env)
        cls._env_patch.__enter__()
        cls.es_client = ESClientFactory.get()
        cls._wait_for_es()
    except BaseException:  # no coverage
        cls._kill_containers()
        raise
def deindex(self, catalog: CatalogName, sources: Iterable[str]):
    plugin = self.repository_plugin(catalog)
    source_ids = [plugin.resolve_source(s).id for s in sources]
    es_client = ESClientFactory.get()
    indices = ','.join(self.index_service.index_names(catalog))
    query = {
        'query': {
            'bool': {
                'should': [
                    {
                        'terms': {
                            # Aggregate documents
                            'sources.id.keyword': source_ids
                        }
                    },
                    {
                        'terms': {
                            # Contribution documents
                            'source.id.keyword': source_ids
                        }
                    }
                ]
            }
        }
    }
    logger.info('Deindexing sources %r from catalog %r', sources, catalog)
    logger.debug('Using query: %r', query)
    response = es_client.delete_by_query(index=indices, body=query, slices='auto')
    if len(response['failures']) > 0:
        if response['version_conflicts'] > 0:
            logger.error('Version conflicts encountered. Do not deindex while '
                         'indexing is occurring. The index may now be in an '
                         'inconsistent state.')
        raise RuntimeError('Failures during deletion', response['failures'])
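# Minimal standalone sketch of the delete_by_query call used above; endpoint, index
# name and source ID are hypothetical. The response carries the 'failures' list and
# 'version_conflicts' counter that deindex() inspects.
from elasticsearch import Elasticsearch

es = Elasticsearch(hosts=['http://localhost:9200'])  # hypothetical endpoint
response = es.delete_by_query(index='example_index',  # hypothetical index
                              body={'query': {'terms': {'sources.id.keyword': ['example-source-id']}}},
                              slices='auto')  # parallelize the deletion across slices
assert not response['failures'], response['failures']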
def setUpClass(cls):
    super().setUpClass()
    cls.es_client = ESClientFactory.get()
def es_client(self) -> Elasticsearch:
    return ESClientFactory.get()
def _read_contributions(self,
                        tallies: CataloguedTallies
                        ) -> List[CataloguedContribution]:
    es_client = ESClientFactory.get()

    entity_ids_by_index: MutableMapping[str, MutableSet[str]] = defaultdict(set)
    for entity in tallies.keys():
        index = config.es_index_name(catalog=entity.catalog,
                                     entity_type=entity.entity_type,
                                     aggregate=False)
        entity_ids_by_index[index].add(entity.entity_id)

    query = {
        "query": {
            "bool": {
                "should": [
                    {
                        "bool": {
                            "must": [
                                {
                                    "term": {
                                        "_index": index
                                    }
                                },
                                {
                                    "terms": {
                                        "entity_id.keyword": list(entity_ids)
                                    }
                                }
                            ]
                        }
                    }
                    for index, entity_ids in entity_ids_by_index.items()
                ]
            }
        }
    }

    index = sorted(list(entity_ids_by_index.keys()))
    # scan() uses a server-side cursor and is expensive. Only use it if the
    # number of contributions is large.
    page_size = 1000  # page size of 100 caused excessive ScanError occurrences
    num_contributions = sum(tallies.values())
    hits = None
    if num_contributions <= page_size:
        log.info('Reading %i expected contribution(s) using search().', num_contributions)
        response = es_client.search(index=index,
                                    body=query,
                                    size=page_size,
                                    doc_type=Document.type)
        total_hits = response['hits']['total']
        if total_hits <= page_size:
            hits = response['hits']['hits']
            if len(hits) != total_hits:
                message = f'Search returned {len(hits)} hits but reports total to be {total_hits}'
                raise EventualConsistencyException(message)
        else:
            log.info('Expected only %i contribution(s) but got %i.', num_contributions, total_hits)
            num_contributions = total_hits
    if hits is None:
        log.info('Reading %i expected contribution(s) using scan().', num_contributions)
        hits = scan(es_client,
                    index=index,
                    query=query,
                    size=page_size,
                    doc_type=Document.type)

    contributions = [
        Contribution.from_index(self.catalogued_field_types(), hit)
        for hit in hits
    ]

    log.info('Read %i contribution(s).', len(contributions))
    if log.isEnabledFor(logging.DEBUG):
        entity_ref = attrgetter('entity')
        log.debug('Number of contributions read, by entity: %r', {
            f'{entity.entity_type}/{entity.entity_id}': sum(1 for _ in contribution_group)
            for entity, contribution_group in groupby(sorted(contributions, key=entity_ref),
                                                      key=entity_ref)
        })
    return contributions
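# Sketch of the helpers.scan() fallback used above: it drives a server-side scroll
# cursor under the hood, which is why the method prefers a single search() call when
# the expected number of contributions fits in one page. Endpoint and index name are
# hypothetical.
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan

es = Elasticsearch(hosts=['http://localhost:9200'])  # hypothetical endpoint
hits = scan(es,
            index='example_index',  # hypothetical index
            query={'query': {'match_all': {}}},
            size=1000)  # hits fetched per scroll round trip
for hit in hits:
    print(hit['_id'])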
def delete_indices(self, catalog: CatalogName):
    es_client = ESClientFactory.get()
    for index_name in self.index_names(catalog):
        if es_client.indices.exists(index_name):
            es_client.indices.delete(index=index_name)
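# Alternative sketch, not the author's code: the exists/delete pair above leaves a small
# race window; passing ignore=[404] to delete() drops a missing index silently in a single
# call. Endpoint and index name are hypothetical.
from elasticsearch import Elasticsearch

es = Elasticsearch(hosts=['http://localhost:9200'])  # hypothetical endpoint
es.indices.delete(index='example_index', ignore=[404])  # no-op if the index does not exist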
def _read_contributions(self,
                        tallies: CataloguedTallies
                        ) -> List[CataloguedContribution]:
    es_client = ESClientFactory.get()

    entity_ids_by_index: MutableMapping[str, MutableSet[str]] = defaultdict(set)
    for entity in tallies.keys():
        index = config.es_index_name(catalog=entity.catalog,
                                     entity_type=entity.entity_type,
                                     aggregate=False)
        entity_ids_by_index[index].add(entity.entity_id)

    query = {
        'bool': {
            'should': [
                {
                    'bool': {
                        'must': [
                            {
                                'term': {
                                    '_index': index
                                }
                            },
                            {
                                'terms': {
                                    'entity_id.keyword': list(entity_ids)
                                }
                            }
                        ]
                    }
                }
                for index, entity_ids in entity_ids_by_index.items()
            ]
        }
    }

    index = sorted(list(entity_ids_by_index.keys()))
    num_contributions = sum(tallies.values())
    log.info('Reading %i expected contribution(s)', num_contributions)

    def pages() -> Iterable[JSONs]:
        body = dict(query=query)
        while True:
            response = es_client.search(index=index,
                                        sort=['_index', 'document_id.keyword'],
                                        body=body,
                                        size=config.contribution_page_size,
                                        track_total_hits=False,
                                        seq_no_primary_term=Contribution.needs_seq_no_primary_term)
            hits = response['hits']['hits']
            log.debug('Read a page with %i contribution(s)', len(hits))
            if hits:
                yield hits
                body['search_after'] = hits[-1]['sort']
            else:
                break

    contributions = [
        Contribution.from_index(self.catalogued_field_types(), hit)
        for hits in pages()
        for hit in hits
    ]

    log.info('Read %i contribution(s)', len(contributions))
    if log.isEnabledFor(logging.DEBUG):
        entity_ref = attrgetter('entity')
        log.debug('Number of contributions read, by entity: %r', {
            f'{entity.entity_type}/{entity.entity_id}': sum(1 for _ in contribution_group)
            for entity, contribution_group in groupby(sorted(contributions, key=entity_ref),
                                                      key=entity_ref)
        })
    return contributions
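# A minimal, self-contained sketch of the search_after pagination pattern that pages()
# implements above. Endpoint, index name, page size and the print() consumer are
# hypothetical, and 'document_id.keyword' is assumed to be a unique keyword field in
# that index so the sort order is deterministic.
from elasticsearch import Elasticsearch

es = Elasticsearch(hosts=['http://localhost:9200'])  # hypothetical endpoint
body = {'query': {'match_all': {}}}
while True:
    hits = es.search(index='example_index',  # hypothetical index
                     sort=['_index', 'document_id.keyword'],
                     body=body,
                     size=500,
                     track_total_hits=False)['hits']['hits']
    if not hits:
        break
    for hit in hits:
        print(hit['_id'])  # stand-in for real processing
    body['search_after'] = hits[-1]['sort']  # resume after the last hit of this page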