def parallel_prep(self):
    # this is kind of a hack for elasticsearch connections to be
    # 'reset' with their existing settings kept intact
    for label in list(connections.connections._conns.keys()):
        # remove the existing connection (but keep the _kwargs settings for it)
        connections.connections._conns.pop(label, None)
        # recreate the connection using the retained _kwargs (view the source)
        connections.get_connection(label)
async def test_get_connection():
    conn = create_connection()

    assert conn is get_connection()
    assert conn is get_connection('async')
    assert conn is connections.get_connection('async')

    with raises(KeyError):
        get_connection('default')
        connections.get_connection()
def new_index(
    index_base_name: str,
    document_cls: Type[_T_BaseDocument],
    *,
    move_data: bool = False,
    update_alias: bool = True,
) -> str:
    """Creates a new Index with mapping settings from given class.

    The index is versioned by including the current timestamp in its name. Through
    this, existing previous indices with potentially incompatible mappings will not be
    affected. An alias is pointed to the newest index.

    Implements the alias migration pattern, based on:
    https://github.com/elastic/elasticsearch-dsl-py/blob/9b1a39dd47e8678bc4885b03b138293e189471d0/examples/alias_migration.py

    :param index_base_name: The index to create a new version of.
    :param document_cls: The elasticsearch-dsl-based class that defines the mapping.
    :param move_data: If true, reindex all data from the previous index to the new one
        (before updating the alias).
    :param update_alias: If true, move the alias to the newly created index.
    """
    _LOGGER.debug("Creating new index '{}'.", index_base_name)

    new_index_name = index_base_name + "-" + datetime.now().strftime("%Y%m%d-%H%M%S")
    new_index = Index(new_index_name)
    new_index.settings(**document_cls.index_settings())
    # The following is equivalent to `new_index.document(document_cls)` except that it
    # does not add `new_index` as a default index to `document_cls`.
    new_index._doc_types.append(document_cls)
    new_index.create()

    if move_data:
        _LOGGER.info("Reindexing data from previous copy to newly created one...")
        # TODO: test if this works and what happens if no previous index exists.
        connections.get_connection().reindex(
            body={
                "source": {"index": index_base_name},
                "dest": {"index": new_index_name},
            },
            request_timeout=3600,
            # TODO: find out if timeout works for large index
            # TODO: check if parameter name is actually `request_timeout` and not
            # `timeout` as indicated by source.
        )
        new_index.refresh()

    if update_alias:
        all_indices = Index(index_base_name + "-*")
        if all_indices.exists_alias(name=index_base_name):
            all_indices.delete_alias(name=index_base_name)
        new_index.put_alias(name=index_base_name)

    return new_index_name
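A minimal usage sketch for the alias-migration helper above, assuming an elasticsearch_dsl default connection has been registered; `ArticleDocument` and the base name "articles" are hypothetical stand-ins for a concrete Document subclass and its alias.

from elasticsearch_dsl import connections

connections.create_connection(hosts=["localhost"])
# Create a timestamp-versioned index, copy the old data over, and repoint the alias.
created = new_index("articles", ArticleDocument, move_data=True, update_alias=True)
print(f"alias 'articles' now points at {created}")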
def setup_elastic_connection(request):
    """Creates a connection to elasticsearch for use in tests.

    If the environment variable PYTEST_ENV is set, and a connection does not already
    exist, a new one will be created. Runs before every test.
    """
    if is_dev_env():
        return
    try:
        connections.get_connection()
    except KeyError:
        elastic_host = os.getenv("ELASTIC_HOST", "localhost")
        connections.create_connection(hosts=[elastic_host])
        initialize_models()
def index_listing(files):
    """
    Index the result of a Tapis listing. Files are indexed with a UUID comprising the
    SHA256 hash of the system + path.

    Parameters
    ----------
    files: list
        list of Tapis files (either dict or agavepy.agave.Attrdict)

    Returns
    -------
    Void
    """
    from portal.libs.elasticsearch.docs.base import IndexedFile
    idx = IndexedFile.Index.name
    client = get_connection('default')
    ops = []
    for _file in files:
        file_dict = dict(_file)
        if file_dict['name'][0] == '.':
            continue
        file_dict['lastUpdated'] = current_time()
        file_dict['basePath'] = os.path.dirname(file_dict['path'])
        file_uuid = file_uuid_sha256(file_dict['system'], file_dict['path'])
        ops.append({
            '_index': idx,
            '_id': file_uuid,
            'doc': file_dict,
            '_op_type': 'update',
            'doc_as_upsert': True
        })
    bulk(client, ops)
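A hypothetical call of `index_listing` above; the system ID and file dicts are made up, and only the 'system', 'path', and 'name' keys that the function actually reads are shown.

listing = [
    {'system': 'cep.storage.default', 'path': '/data/report.txt', 'name': 'report.txt'},
    {'system': 'cep.storage.default', 'path': '/data/.hidden', 'name': '.hidden'},  # skipped: name starts with '.'
]
index_listing(listing)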
def delete_recursive(system, path):
    """
    Recursively delete all Elasticsearch documents in a specified system/path.

    Parameters
    ----------
    system: str
        The Tapis system ID containing files to be deleted.
    path: str
        The path relative to the system root. All documents with this path as a
        prefix will be deleted.

    Returns
    -------
    Void
    """
    from portal.libs.elasticsearch.docs.base import IndexedFile
    hits = walk_children(system, path, recurse=True)
    idx = IndexedFile.Index.name
    client = get_connection('default')
    # Group children in batches of 100 for bulk deletion.
    for group in grouper(hits, 100):
        filtered_group = filter(lambda hit: hit is not None, group)
        ops = map(lambda hit: {'_index': idx,
                               '_id': hit.meta.id,
                               '_op_type': 'delete'},
                  filtered_group)
        bulk(client, ops)
def parallel_bulk_index(serializer_hash, index, options):
    serializer = Serializer.hash_registry[serializer_hash]
    using = options.get('using', '') or None
    client = connections.get_connection(using or 'default')
    serializer.bulk_operation(index=index, client=client, **options)
def setUp(self):
    self.index = f'test-wine-{uuid.uuid4()}'
    self.connection = connections.get_connection()
    self.connection.indices.create(index=self.index, body={
        'settings': {
            'number_of_shards': 1,
            'number_of_replicas': 0,
        },
        'mappings': ES_MAPPING,
    })

    # Load fixture data
    fixture_path = pathlib.Path(settings.BASE_DIR / 'catalog' / 'fixtures' / 'test_wines.json')
    with open(fixture_path, 'rt') as fixture_file:
        fixture_data = json.loads(fixture_file.read())
        for wine in fixture_data:
            fields = wine['fields']
            self.connection.create(index=self.index, id=fields['id'], body={
                'country': fields['country'],
                'description': fields['description'],
                'points': fields['points'],
                'price': fields['price'],
                'variety': fields['variety'],
                'winery': fields['winery'],
            }, refresh=True)

    # Start patching
    self.mock_constants = patch('catalog.views.constants').start()
    self.mock_constants.ES_INDEX = self.index
def get_score_for_ifra(self):
    from app import connections
    es = connections.get_connection()
    latlon = str(self.location).split(",")
    query = {
        "query": {
            "bool": {
                "must": {
                    "match_all": {}
                },
                "filter": {
                    "geo_distance": {
                        "distance": "10km",
                        "location": {
                            "lat": latlon[0],
                            "lon": latlon[1]
                        }
                    }
                }
            }
        }
    }
    res = es.search(index='prolepsyspoi', body=query)
    new_score = 0.0
    if 'hits' in res and 'hits' in res['hits'] and len(res['hits']['hits']) > 0:
        for i in res['hits']['hits']:
            new_score += float(i['_source']['score'] if 'score' in i['_source'] else 0)
        new_score = new_score / len(res['hits']['hits'])
    return new_score
def telephones(cls, lat, lon):
    from app import connections
    es = connections.get_connection()
    query = {
        "query": {
            "bool": {
                "must": {
                    "match_all": {}
                },
                "filter": {
                    "geo_distance": {
                        "distance": "10km",
                        "location": {
                            "lat": lat,
                            "lon": lon
                        }
                    }
                }
            }
        }
    }
    res = es.search(index='users', body=query)
    telephones = []
    if 'hits' in res and 'hits' in res['hits'] and len(res['hits']['hits']) > 0:
        for i in res['hits']['hits']:
            if 'telephone' in i['_source']:
                telephones.append(i['_source']['telephone'])
    return telephones
def installPipelines():
    conn = get_connection()
    client = IngestClient(conn)
    client.put_pipeline(id='ingest_attachment', body={
        'description': "Extract attachment information",
        'processors': [
            {
                "attachment": {
                    "field": "data"
                },
                "remove": {
                    "field": "data"
                }
            }
        ]
    })
    client.put_pipeline(id='add_timestamp', body={
        'description': "Adds an index_date timestamp",
        'processors': [
            {
                "set": {
                    "field": "index_date",
                    "value": "{{_ingest.timestamp}}",
                },
            },
        ]
    })
def create_patterned_index(alias: str, pattern: str, create_alias: bool = True) -> None:
    """Run only once to set up the index."""
    name = pattern.replace('*', datetime.datetime.now().strftime('%Y%m%d%H%M'))

    # create_index
    es = connections.get_connection()
    es.indices.create(index=name)

    if create_alias:
        es.indices.update_aliases(
            body={
                'actions': [
                    {"remove": {"alias": alias, "index": pattern}},
                    {"add": {"alias": alias, "index": name}},
                ]
            })
def sanity_check_new_index(self, attempt, document, new_index_name, previous_record_count):
    """ Ensure that we do not point to an index that looks like it has missing data. """
    current_record_count = self.get_record_count(document)
    percentage_change = self.percentage_change(current_record_count, previous_record_count)

    # Verify there was not a big shift in record count
    record_count_is_sane = percentage_change < settings.INDEX_SIZE_CHANGE_THRESHOLD

    # Spot check a known-flaky field type to detect VAN-391
    aggregation_type = Mapping.from_es(new_index_name)['aggregation_key'].name
    record_count_is_sane = record_count_is_sane and aggregation_type == 'keyword'

    if not record_count_is_sane:
        conn = get_connection()
        alternate_current_record_count = conn.search(
            {"query": {"match_all": {}}}, index=new_index_name
        ).get('hits', {}).get('total', {}).get('value', 0)
        message = '''
        Sanity check failed for attempt #{0}.
        Index name: {1}
        Percentage change: {2}
        Previous record count: {3}
        Base record count: {4}
        Search record count: {5}
        Aggregation key type: {6}
        '''.format(
            attempt,
            new_index_name,
            str(int(round(percentage_change * 100, 0))) + '%',
            previous_record_count,
            current_record_count,
            alternate_current_record_count,
            aggregation_type,
        )
        logger.info(message)
        logger.info('...sleeping for 5 seconds...')
        time.sleep(5)
    else:
        message = '''
        Sanity check passed for attempt #{0}.
        Index name: {1}
        Percentage change: {2}
        Previous record count: {3}
        Current record count: {4}
        '''.format(
            attempt,
            new_index_name,
            str(int(round(percentage_change * 100, 0))) + '%',
            previous_record_count,
            current_record_count
        )
        logger.info(message)

    index_info_string = (
        'The previous index contained [{}] records. '
        'The new index contains [{}] records, a [{:.2f}%] change.'.format(
            previous_record_count, current_record_count, percentage_change * 100
        )
    )

    return record_count_is_sane, index_info_string
def handle(self, *args, **options):
    LogEntry.init()
    count = LogEntry_db.objects.count()
    step = 10000
    # Copy database log entries into Elasticsearch in batches of `step`.
    for i in range(0, count, step):
        entries = []
        for entry_db in LogEntry_db.objects.all()[i:i + step]:
            entry = LogEntry(
                meta={'id': entry_db.pk},
                action=['create', 'update', 'delete'][entry_db.action],
                content_type_id=entry_db.content_type.pk,
                content_type_app_label=entry_db.content_type.app_label,
                content_type_model=entry_db.content_type.model,
                object_id=entry_db.object_id,
                object_pk=entry_db.object_pk,
                object_repr=entry_db.object_repr,
                timestamp=entry_db.timestamp
            )
            if entry_db.actor:
                entry.actor_id = str(entry_db.actor.pk)
                entry.actor_email = entry_db.actor.email
                entry.actor_first_name = entry_db.actor.first_name
                entry.actor_last_name = entry_db.actor.last_name
            if entry_db.remote_addr:
                entry.remote_addr = entry_db.remote_addr
            if entry_db.changes:
                entry.changes = [
                    Change(field=key, old=val[0], new=val[1])
                    for key, val in entry_db.changes.items()
                ]
            entries.append(entry)
        LogEntry.bulk(connections.get_connection(), entries)
        print(f'Uploaded {i + len(entries)} logs')
def setup_index(doctype):
    """
    Create the index template in elasticsearch specifying the mappings and any
    settings to be used. This can be run at any time, ideally at every new code
    deploy.
    """
    alias = doctype._index._name
    pattern = '{alias}-*'.format(alias=alias)

    # create an index template
    index_template = doctype._index.as_template(alias, pattern)
    # upload the template into elasticsearch
    # potentially overriding the one already there
    index_template.save()

    # get the low level connection
    es = get_connection()

    # create the first index if it doesn't exist
    if not es.indices.exists_alias(alias):
        index = get_next_index(pattern)
        es.indices.create(index=index)
        es.indices.update_aliases(body={'actions': [
            {"add": {"alias": alias, "index": index}},
        ]})
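A hedged sketch of running `setup_index` above at deploy time; `BlogPost` is a hypothetical elasticsearch_dsl Document subclass and the localhost connection is an assumption.

from elasticsearch_dsl import connections

connections.create_connection(hosts=['localhost'])
# Saves the index template and bootstraps the first '<alias>-*' index if the alias is missing.
setup_index(BlogPost)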
def delete(cls, elex: Dict[str, Any]) -> None:
    '''
    todo: docs
    '''
    idxs = connections.get_connection().indices
    logger().debug('Dropping index %s', elex.uid)
    idxs.delete(ignore=404, index=elex.uid)
async def execute(self, ignore_cache=False, raise_on_error=True):
    """
    Execute the multi search request and return a list of search results.
    """
    if ignore_cache or not hasattr(self, '_response'):
        es = get_connection(self._using)

        responses = await es.msearch(
            index=self._index, body=self.to_dict(), **self._params
        )

        out = []
        for s, r in zip(self._searches, responses['responses']):
            if r.get('error', False):
                if raise_on_error:
                    raise TransportError('N/A', r['error']['type'], r['error'])
                r = None
            else:
                r = Response(s, r)
            out.append(r)

        self._response = out

    return self._response
def update_search(self, data):
    doc = self.to_search_data()

    if 'parent' not in data:
        try:
            current_structure = self.tag.get_active_structure()
            parent = current_structure.parent
            if parent is not None:
                parent = {
                    'id': str(parent.tag.current_version.pk),
                    'index': parent.tag.current_version.elastic_index,
                }
        except TagStructure.DoesNotExist:
            parent = None

        data['parent'] = parent

    doc.update(data)
    doc['current_version'] = self.tag.current_version == self
    doc.pop('_id', None)
    doc.pop('_index', None)
    doc = {'doc_as_upsert': True, 'doc': doc}

    es = get_connection()
    es.update(self.elastic_index, 'doc', str(self.pk), body=doc)
def create(cls, elex: Dict[str, Any]) -> None:
    '''
    todo: docs
    '''
    idxs = connections.get_connection().indices
    logger().debug('Creating index %s', elex.uid)
    idxs.create(index=elex.uid, body=cls.__schema(elex))
def _rebuild(self, models, options):
    if options['atomic'] is False and not self._delete(models, options):
        return

    if options['atomic'] is True:
        alias_index_pairs = []
        index_suffix = "-" + datetime.now().strftime("%Y%m%d%H%M%S%f")
        for index in registry.get_indices(models):
            # The alias takes the original index name value. The
            # index name sent to Elasticsearch will be the alias
            # plus the suffix from above.
            new_index = index._name + index_suffix
            alias_index_pairs.append({
                'alias': index._name,
                'index': new_index
            })
            index._name = new_index

    self._create(models, options)
    self._populate(models, options)

    if options['atomic'] is True:
        es_conn = connections.get_connection()
        existing_aliases = []
        for index in es_conn.indices.get_alias().values():
            existing_aliases += index['aliases'].keys()

        for alias_index_pair in alias_index_pairs:
            alias = alias_index_pair['alias']
            alias_exists = alias in existing_aliases
            self._update_alias(
                es_conn, alias, alias_index_pair['index'], alias_exists, options
            )
def update_faces_index(self):
    q = {
        "script": {
            "inline": "ctx._source.person=params.person",
            "lang": "painless",
            "params": {
                "person": self.person
            }
        },
        "query": {
            "bool": {
                "must": [{
                    "terms": {
                        "_id": self.faces
                    }
                }, {
                    "bool": {
                        "must_not": [{
                            "exists": {
                                "field": "person"
                            }
                        }]
                    }
                }]
            }
        }
    }
    es = connections.get_connection()
    es.update_by_query(body=q, doc_type='doc', index='faces', conflicts='proceed')
def main(self) -> None:
    try:
        instance.config = ConfigParser()
        instance.config.read_dict(defaultconfig())
        logger().info('Started eisp with pid %s', getpid())
        for i in [i for i in argv if i.startswith('--')]:
            try:
                mod('eisp.param.{}'.format(i[2:])).__dict__[i[2:]](argv)
            except:
                exit('Invalid parameter or argument to {}'.format(i[2:]))
        conf = dotdict(instance.config['data'])
        connections.create_connection(hosts=[conf.host])
        delete_index(conf.index_name)
        create_index(conf.elastic_mapping, conf.index_name)
        for ok, info in helpers.parallel_bulk(
                connections.get_connection(),
                actions=index_pdfs(conf.index_name, conf.root),
                request_timeout=60,
                chunk_size=100,
                thread_count=8,
                queue_size=8):
            if not ok:
                print(info)
    except KeyboardInterrupt:
        print('\N{bomb}')
    except Exception as exception:
        logger().exception(exception)
    except SystemExit as exception:
        logger().critical(str(exception))
def handle(self, *args, **kwargs):
    connection = connections.get_connection()

    self.stdout.write(f'Checking if index "{ES_INDEX}" exists...')
    if connection.indices.exists(index=ES_INDEX):
        self.stdout.write(f'Index "{ES_INDEX}" already exists')
        self.stdout.write(f'Updating mapping on "{ES_INDEX}" index...')
        connection.indices.put_mapping(index=ES_INDEX, body=ES_MAPPING)
        self.stdout.write(f'Updated mapping on "{ES_INDEX}" successfully')
    else:
        self.stdout.write(f'Index "{ES_INDEX}" does not exist')
        self.stdout.write(f'Creating index "{ES_INDEX}"...')
        connection.indices.create(index=ES_INDEX, body={
            'settings': {
                'number_of_shards': 1,
                'number_of_replicas': 0,
            },
            'mappings': ES_MAPPING,
        })
        self.stdout.write(f'Index "{ES_INDEX}" created successfully')

    self.stdout.write(f'Bulk updating documents on "{ES_INDEX}" index...')
    succeeded, _ = bulk(connection, actions=self._document_generator(), stats_only=True)
    self.stdout.write(
        f'Updated {succeeded} documents on "{ES_INDEX}" successfully')
def _with_elastic(do: str, action: Callable[[Elasticsearch], None]) -> bool:
    try:
        action(get_connection())
        return True
    except Exception as e:
        LOG.warning('Could not %s elastic. Perhaps client is down?', do)
        return debug_ex(e, f'{do} elastic', LOG, silent=True)
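One possible way to drive the `_with_elastic` wrapper above; the 'books' index and the refresh action are illustrative only.

ok = _with_elastic('refresh', lambda es: es.indices.refresh(index='books'))
if not ok:
    # the wrapper already logged the failure; decide here whether to retry or skip
    pass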
def run(self, corpus, index_name="fact_corpus", document_class=Fact, **kwargs):
    connections.create_connection(hosts=["localhost"])
    document_class.init()
    documents = (
        document_class(meta={"id": id}, fact=doc["fact"]).to_dict(True)
        for id, doc in corpus.items()
    )
    logger.info(f"Building corpus index for {index_name}")
    # RayExecutor().run(documents, self.save_data, {})
    for success, info in tqdm(
        parallel_bulk(
            connections.get_connection(),
            documents,
            thread_count=kwargs.pop("batch_size", multiprocessing.cpu_count()),
            chunk_size=100000,
            max_chunk_bytes=2 * 1024 ** 3,
        )
    ):
        if not success:
            logger.error(f"A document failed: {info} ")
    logger.success("Elastic index successfully built")
    return index_name
def import_content(self, task, path, rootdir=None, ip=None):
    if not rootdir:
        rootdir = os.path.dirname(path)

    self.indexed_files = []
    self.task = task

    archive = self.get_archive(path)
    if not archive:
        archive = getattr(ip, 'tag', None)
        if not archive:
            raise ValueError('No archive found')
        else:
            archive = archive.tag.current_version

    logger.debug("Deleting task tags already in database...")
    Tag.objects.filter(task=self.task).delete()

    logger.debug("Deleting task tags already in Elasticsearch...")
    indices_to_delete = [doc._index._name for doc in [Archive, Component, File]]
    es = get_connection()
    Search(using=es, index=indices_to_delete).query('term', task_id=str(self.task.pk)).delete()

    tags, tag_versions, tag_structures, components = self.parse_eard(path, ip, rootdir, archive)
    self.update_progress(50)

    self.save_to_database(tags, tag_versions, tag_structures, archive)
    self.update_progress(75)

    self.save_to_elasticsearch(components)
    self.update_progress(100)

    return self.indexed_files
def bulk_index_public():
    public_publications = Publication.api.primary().filter(status='REVIEWED')
    PublicationDoc.init()
    AuthorDoc.init()
    PlatformDoc.init()
    SponsorDoc.init()
    TagDoc.init()
    logger.info('creating publication index')
    client = connections.get_connection()
    bulk(client=client,
         actions=(AuthorDoc.from_instance(a) for a in
                  Author.objects.filter(publications__in=public_publications)))
    bulk(client=client,
         actions=(PlatformDoc.from_instance(p) for p in
                  Platform.objects.filter(publications__in=public_publications)))
    bulk(client=client,
         actions=(SponsorDoc.from_instance(s) for s in
                  Sponsor.objects.filter(publications__in=public_publications)))
    bulk(client=client,
         actions=(TagDoc.from_instance(t) for t in
                  Tag.objects.filter(publications__in=public_publications)))
    bulk(client=client,
         actions=(PublicationDoc.from_instance(p) for p in
                  public_publications.select_related('container')
                  .prefetch_related('tags', 'sponsors', 'platforms', 'creators',
                                    'model_documentation').iterator()))
def installPipelines():
    conn = get_connection()
    client = IngestClient(conn)
    client.put_pipeline(id='ingest_attachment', body={
        'description': "Extract attachment information",
        'processors': [{
            "attachment": {
                "field": "data",
                "indexed_chars": "-1"
            },
            "remove": {
                "field": "data"
            }
        }]
    })
    client.put_pipeline(id='add_timestamp', body={
        'description': "Adds an index_date timestamp",
        'processors': [
            {
                "set": {
                    "field": "index_date",
                    "value": "{{_ingest.timestamp}}",
                },
            },
        ]
    })
def generate_vulns(self):
    docs = []
    for i in range(100):
        docs.append(
            create_vulnerability(create_asset(F'10.10.10.{i}', save=False),
                                 self.cve, save=False).to_dict())
    for i in range(100):
        vuln = create_vulnerability(create_asset(F'10.10.10.{i}', save=False),
                                    self.cve, save=False)
        vuln.tags.append(VulnerabilityStatus.FIXED)
        docs.append(vuln.to_dict())
    for i in range(100):
        asset = create_asset(F'10.10.10.{i}', save=False)
        asset.tags = [AssetStatus.DELETED]
        vuln = create_vulnerability(asset, self.cve, save=False)
        docs.append(vuln.to_dict())
    bulk(get_connection(), docs, refresh=True, index=VulnerabilityDocument.Index.name)
def bulk_create(cls, docs, using=None, index=None, handler=bulk, **kwargs):
    if index is not None:
        for doc in docs:
            doc._index = doc._index.clone(name=index)
    client = connections.get_connection(using or cls._index._using)
    docs = [doc.to_dict(include_meta=True) for doc in docs]
    return handler(client, docs, **kwargs)
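A hypothetical call of the `bulk_create` classmethod above, assuming `MyDoc` is an elasticsearch_dsl Document subclass that exposes it; the index name is made up.

docs = [MyDoc(title='first'), MyDoc(title='second')]
# Clones each doc's index to 'my-docs-v2' before handing the serialized actions to bulk().
MyDoc.bulk_create(docs, index='my-docs-v2')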
def put(self, request, *args, **kwargs):
    connection = get_connection()
    url_path = kwargs['url_path']
    body = json.loads(request.body)
    return JsonResponse(
        connection.transport.perform_request('PUT', f'/{url_path}', body=body))
def bulk(self, documents):
    """Takes a documents generator, converts it to ES format, and passes the
    generator on to the ES bulk call."""
    conn = connections.get_connection()
    for result in elasticsearch.helpers.streaming_bulk(
        client=conn,
        actions=self._bulk_generator(documents),
        chunk_size=self._bulk_chunk_size,
    ):
        pass
def bulk_index_public():
    client = connections.get_connection()
    for doc_class in (PublicationDoc, AuthorDoc, PlatformDoc, SponsorDoc, TagDoc):
        client.indices.delete(index=doc_class.Index.name, ignore=[400, 404])
        doc_class.init()
    public_publications = Publication.api.primary().filter(status='REVIEWED')
    bulk(client=client,
         actions=(AuthorDoc.from_instance(a) for a in
                  Author.objects.filter(publications__in=public_publications)))
    bulk(client=client,
         actions=(PlatformDoc.from_instance(p) for p in
                  Platform.objects.filter(publications__in=public_publications)))
    bulk(client=client,
         actions=(SponsorDoc.from_instance(s) for s in
                  Sponsor.objects.filter(publications__in=public_publications)))
    bulk(client=client,
         actions=(TagDoc.from_instance(t) for t in
                  Tag.objects.filter(publications__in=public_publications)))
    bulk(client=client,
         actions=(PublicationDoc.from_instance(p) for p in
                  public_publications.select_related('container')
                  .prefetch_related('code_archive_urls', 'tags', 'sponsors', 'platforms',
                                    'creators', 'model_documentation').iterator()))
def migrate(move_data=True, update_alias=True):
    """
    Upgrade function that creates a new index for the data. Optionally it also can
    (and by default will) reindex previous copy of the data into the new index
    (specify ``move_data=False`` to skip this step) and update the alias to
    point to the latest index (set ``update_alias=False`` to skip).

    Note that while this function is running the application can still perform
    any and all searches without any loss of functionality. It should, however,
    not perform any writes at this time as those might be lost.
    """
    # construct a new index name by appending current timestamp
    next_index = PATTERN.replace('*', datetime.now().strftime('%Y%m%d%H%M%S%f'))

    # get the low level connection
    es = connections.get_connection()

    # create new index, it will use the settings from the template
    es.indices.create(index=next_index)

    if move_data:
        # move data from current alias to the new index
        es.reindex(
            body={"source": {"index": ALIAS}, "dest": {"index": next_index}},
            request_timeout=3600
        )
        # refresh the index to make the changes visible
        es.indices.refresh(index=next_index)

    if update_alias:
        # repoint the alias to point to the newly created index
        es.indices.update_aliases(body={
            'actions': [
                {"remove": {"alias": ALIAS, "index": PATTERN}},
                {"add": {"alias": ALIAS, "index": next_index}},
            ]
        })
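A minimal sketch of driving `migrate` above, assuming the module-level `ALIAS`/`PATTERN` constants and a one-time bootstrap step (called `setup()` in the upstream alias-migration example) already exist; the localhost connection is an assumption.

from elasticsearch_dsl import connections

connections.create_connection(hosts=['localhost'])
setup()    # assumed one-time template/alias bootstrap from the same example
migrate(move_data=True, update_alias=True)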
def __init__(self, *args, **kwargs):
    self.client = get_connection()
    super().__init__(*args, **kwargs)