def delete_repo(to_delete):
    '''
    Deletes the repository (GitHub issues and git commits) from the dashboard
    by deleting the corresponding Elasticsearch data.

    :param to_delete: the URL of the repository to be deleted
    '''
    print("Repository", to_delete, "will be deleted")

    s = Search(using=es, index="git_test", doc_type="items").\
        query("match", origin=to_delete + ".git")
    response = s.delete()

    s = Search(using=es, index="git_test-raw", doc_type="items").\
        query("match", origin=to_delete + ".git")
    response = s.delete()

    s = Search(using=es, index="github_test", doc_type="items").\
        query("match", origin=to_delete)
    response = s.delete()

    s = Search(using=es, index="github_test-raw", doc_type="items").\
        query("match", origin=to_delete)
    response = s.delete()
def delete_by_repo(self, repo_id):
    """Delete all the docs of a repo.

    SQL: delete from repofiles where repo='xxx'
    """
    s = Search(using=self.es, index=self.INDEX_NAME).query('term', repo=repo_id)
    s.delete()
def _delete_distribution_data(self, distribution):
    fields_to_delete = list(
        distribution.field_set.filter(present=True)
        .exclude(identifier=None)
        .values_list('identifier', flat=True))

    series_data = Search(using=self.elastic, index=self.index._name).filter(
        'terms', series_id=fields_to_delete)
    series_data.delete()
def delete_by_repo_path_prefix(self, repo_id, path_prefix):
    """Delete docs of dirs and all files/sub-dirs in those dirs of a repo.

    SQL: delete from repofiles where repo='xxx' and path like '/dir_xxx/%'
    """
    s = Search(using=self.es, index=self.INDEX_NAME).query(
        'term', repo=repo_id).query('prefix', path=path_prefix)
    s.delete()
def _delete_distribution_data(self, distribution):
    fields_to_delete = list(
        SeriesRepository.get_present_series(distribution=distribution)
        .exclude(identifier=None)
        .values_list('identifier', flat=True)
    )
    for field in fields_to_delete:
        series_data = Search(using=self.elastic, index=self.index._name) \
            .params(conflicts='proceed') \
            .filter('term', series_id=field)
        series_data.delete()
def delete(properties, index='data_objects'):
    """ delete item from index """
    s = Search(using=client, index=index)
    clauses = []
    for k in properties.keys():
        v = properties[k]
        clauses.append('+{}:"{}"'.format(k, v))
    s = s.query("query_string", query=' '.join(clauses))
    s.delete()
def init_tm_index(**kwargs):
    from elasticsearch_dsl import Search
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT, ES_INDEX_TOPIC_MODELLING
    from mainapp.documents import TopicModellingIndex, DynamicTopicModellingIndex

    kwargs = kwargs.copy()
    corpus = kwargs['corpus']
    kwargs['is_multi_corpus'] = True
    if type(corpus) != list:
        corpus = [corpus]
        kwargs['is_multi_corpus'] = False
    source = kwargs['source']
    datetime_from = kwargs['datetime_from']
    datetime_to = kwargs['datetime_to']
    is_dynamic = 'is_dynamic' in kwargs and kwargs['is_dynamic']

    # Check if the index already exists
    if 'perform_actualize' not in kwargs:
        s = Search(using=ES_CLIENT, index=kwargs['index_tm'])
        s = s.filter("term", name=kwargs['name'])
        s.delete()
        s = Search(using=ES_CLIENT, index=kwargs['index_tm'])
        s = s.filter("term", **{"name.keyword": kwargs['name']})
        try:
            s.delete()
        except:
            pass
    else:
        return get_tm_index(**kwargs)

    s = Search(using=ES_CLIENT, index=ES_INDEX_DOCUMENT).filter("terms", corpus=corpus)
    if source:
        s = s.filter("term", **{"source": source})
    if datetime_from:
        s = s.filter('range', datetime={'gte': datetime_from})
    if datetime_to:
        s = s.filter('range', datetime={'lt': datetime_to})
    number_of_documents = s.count()
    kwargs["number_of_documents"] = number_of_documents
    kwargs["is_ready"] = False
    kwargs['corpus'] = "_".join(corpus)

    if is_dynamic:
        index = DynamicTopicModellingIndex(**kwargs)
    else:
        index = TopicModellingIndex(**kwargs)
    index.save()
    return index
def remove_cropped_if_asset_exists(asset):
    try:
        search = Search(index=cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_prefix) +
                        cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_asset_meta))
        search.query = Q('match', asset_id=asset.asset_id)
        search.exclude()
        for hit in search:
            idx = '{}-{}'.format(asset.asset_id, hit.cropped_id)
            s = Search(index=cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_prefix) +
                       cfg.resolve(cfg.ELASTICSEARCH_SERVER, cfg.index_cropped))
            s.query = Q('match', id=idx)
            s.delete()
        search.delete()
    except:
        print(sys.exc_info()[0])
def test3_delete():
    '''
    :return:
    '''
    s = Search(using=client, index='test-index').query('match', sport='gaming')
    get_dsl(s)
    # print 'Matched records:'
    # response = s.execute()
    # for hit in response:
    #     get_readable_rs(hit.to_dict())
    # get_dsl(s)
    print 'Deleting records:'
    s.delete()
    print 'Records after deletion:'
    response = s.execute()
    for hit in response:
        get_readable_rs(hit.to_dict())
def handle(self, *args, **options):
    self.get_tm_ids()
    IpcAppList.objects.filter(id__in=self.ids).delete()
    print(len(self.ids))
    print(self.ids)
    print(self.app_nums)

    es = Elasticsearch(settings.ELASTIC_HOST, timeout=settings.ELASTIC_TIMEOUT)
    for _id in self.ids:
        q = Q(
            'bool',
            must=[Q('match', _id=_id)],
        )
        s = Search(index=settings.ELASTIC_INDEX_NAME).using(es).query(q)
        s.delete()

    self.stdout.write(self.style.SUCCESS('Finished'))
def delete_rows(client, index, date, path_log):
    try:
        s = Search(using=client, index=index) \
            .filter('range', **{'Date': {'gte': date, "lte": date}})
        response = s.delete()
    except:
        error = sys.exc_info()
        simple_log(path_log, index, error[0], error[1])
        return None
def delete_all(self, **kwargs):
    output = {'success': False}
    index = kwargs.get('index')
    if index is None:
        msg = "Error: 'index' argument is required"
        output['msg'] = msg
        return output
    if not index_exists(index):
        msg = "Error: index {} does not exist".format(index)
        output['msg'] = msg
        return output
    cherrypy.log('delete_all in index {}'.format(index))
    s = Search(index=index)
    s.delete()
    s = Search(index=token_index_name)
    s.delete()
    output['success'] = True
    return output
def clear_topics_from_topic_index(self, topic):
    should = []
    if isinstance(topic, list):
        should.extend([Q('match', topic_id=t) for t in topic])
    else:
        should.append(Q('match', topic_id=topic))
    q = Q('bool', should=should)
    s = Search(using=self.client, index=self.topic_index) \
        .query(q)
    response = s.delete()
    return response.success()
def test_delete(self):
    person = Person(id="1", name="唐僧", age=66, create_time=datetime.now(),
                    desc="desc", meta={'id': 42})
    self.assertEqual(42, person.meta.id)
    person.save(using=es)

    # Insert a few test documents
    Person(id="2", name="张三", age=15, create_time="2013-09-10 23:40:00").save(using=es)
    Person(id="3", name="李四", age=16, create_time="2013-10-10 23:40:00").save(using=es)
    Person(id="4", name="王五", age=17, create_time="2013-11-10 23:40:00").save(using=es)
    Person._index.refresh(using=es)

    # Delete a single document
    p = Person.get(id=42, using=es, ignore=[400, 404])
    p.delete(using=es)
    # Pass ignore, otherwise an exception is raised when the document does not exist
    self.assertIsNone(Person.get(id=42, using=es, ignore=[400, 404]))

    # Bulk delete, approach 1: delete-by-query
    s = Search(using=es, index=Person._index._name).filter('term', name="张三")
    res = s.delete()
    self.assertEqual(1, res.deleted)

    # Bulk delete, approach 2: scan + bulk with _op_type='delete'
    s = Search(using=es, index=Person._index._name).filter('term', name="李四")
    res = [hit for hit in scan(es, query=s.to_dict(), index=Person._index._name)]
    for r in res:
        r['_op_type'] = 'delete'
    bulk(es, res, params={"refresh": 'true'})

    # Verify the result
    total = Search(using=es, index=Person._index._name).count()
    self.assertEqual(1, total)
def delete_repository(arr):
    es = Elasticsearch('http://localhost:9200', verify_certs=False)
    s = Search(using=es, index='git_test-raw')
    s.aggs.bucket('repository', 'terms', field='origin')
    result = s.execute()
    buckets_result = result['aggregations']['repository']['buckets']

    i = -1
    for repo in buckets_result:
        i = i + 1
        print(i, repo.key)

    index = int(input("\nEnter the index of the repository which you want to delete\n"))
    delete_repo = buckets_result[index]['key']

    for i in arr:
        print(i)
        s = Search(using=es, index=i).query("match", origin=delete_repo)
        response = s.delete()
def _delete_all(self, criteria: Q = None):
    """Delete all records matching criteria from the Repository"""
    conn = self._get_session()

    # Build the filters from the criteria
    q = elasticsearch_dsl.Q()
    if criteria and criteria.children:
        q = self._build_filters(criteria)

    s = Search(using=conn, index=self.model_cls._index._name).query(q)

    # Return the results
    try:
        response = s.delete()

        # `Search.delete` does not refresh the index, so we have to refresh manually
        index = Index(name=self.entity_cls.meta_.schema_name, using=conn)
        index.refresh()
    except Exception as exc:
        logger.error(f"Error while deleting records: {exc}")
        raise

    return response.deleted
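The refresh comment in the snippet above captures a general gotcha: Search.delete() maps to _delete_by_query, which does not refresh the index, so searches and counts issued immediately afterward may still see the deleted documents. A minimal sketch of that pattern, assuming a local cluster and a throwaway "people" index (both hypothetical):

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Index, Search

es = Elasticsearch("http://localhost:9200")    # assumed local cluster
idx = Index("people", using=es)                # throwaway index for illustration
if not idx.exists():
    idx.create()
es.index(index="people", body={"name": "alice"}, refresh=True)

Search(using=es, index="people").query("match", name="alice").delete()

# Until the index is refreshed, a count may still report the doc as present
print(Search(using=es, index="people").count())

idx.refresh()                                  # make the delete visible to searches
print(Search(using=es, index="people").count())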
def elasticsearch_delete_old():
    _from = NEVER
    _to = datetime.now() - timedelta(days=30)
    query = Search().filter(Q("range", visited_at={'from': _from, 'to': _to}))
    result = query.delete()
def delete(self, index, document_id):
    s = Search(index=index).query("match", _id=document_id)
    response = s.delete()
    return response
def delete_docs_by_unique_key(
    client: Elasticsearch,
    key: str,
    value_list: list,
    task_id: str,
    index,
    refresh_after: bool = True,
    delete_chunk_size: int = 1000,
) -> int:
    """
    Bulk delete a batch of documents whose field identified by ``key`` matches any value provided in the
    ``value_list``.

    NOTE: This delete routine looks at just the index name given. If there are duplicate records across
    multiple indexes, an alias or wildcard should be provided for the ``index`` param that covers multiple
    indices, or this will need to be run once per index.

    Args:
        client (Elasticsearch): elasticsearch-dsl client for making calls to an ES cluster
        key (str): name of the field in the targeted elasticsearch index that should have a unique value for
            every doc in the index. The field or sub-field provided MUST be of ``keyword`` type
            (or the ``_id`` meta field)
        value_list (list): if the key field has these values, the document will be deleted
        task_id (str): name of the ES ETL job being run, used in logging
        index (str): name of the index (or alias) to target for the ``_delete_by_query`` ES operation
        refresh_after (bool): whether to call ``_refresh`` on the index when all of the provided values in
            ``value_list`` have been processed for delete; defaults to ``True``. If many small deletes happen
            at a rapid rate, it may be best to set this ``False`` and await a deferred refresh afterward in
            the calling code. NOTE: This param will be ignored and a refresh will be attempted if this
            function errors-out during execution, in order to not leave un-refreshed deletes in the index.
        delete_chunk_size (int): the batch size of the terms value-array given to each ``_delete_by_query``
            call. Needs to be less than 65,536 (the max number of values for any terms query) and less than
            the index's ``max_result_window`` setting. Ideally use ``config["partition_size"]`` (derived from
            ``--partition-size``) to set this to a calibrated value. If not provided, uses 1000 as a safe
            default (10,000 resulted in some timeouts on a busy cluster).

    Returns:
        Number of ES documents deleted
    """
    start = perf_counter()

    if len(value_list) == 0:
        logger.info(format_log("Nothing to delete", action="Delete", name=task_id))
        return 0

    logger.info(format_log(f"Deleting up to {len(value_list):,} document(s)", action="Delete", name=task_id))
    if not index:
        raise RuntimeError("index name must be provided")

    if not _is_allowed_key_field_type(client, key, index):
        msg = (
            f'Cannot perform deletes in index "{index}" by key field "{key}" because its type is not one of '
            f"the allowed field types, or the field was not found in that index."
        )
        logger.error(format_log(msg=msg, action="Delete", name=task_id))
        raise RuntimeError(msg)

    if delete_chunk_size > 65536:
        # 65,536 is the max number of terms that can be added to an ES terms filter query
        msg = (
            f"{delete_chunk_size} is greater than 65,536, which is the max number of terms that can be added "
            f"to an ES terms filter query"
        )
        logger.error(format_log(msg=msg, action="Delete"))
        raise RuntimeError(msg)

    chunks_processed = 0
    deleted = 0
    is_error = False
    try:
        values_generator = chunks(value_list, delete_chunk_size)
        for chunk_of_values in values_generator:
            # Invoking _delete_by_query as per the elasticsearch-dsl docs:
            # https://elasticsearch-dsl.readthedocs.io/en/latest/search_dsl.html#delete-by-query
            # _refresh is deferred until the end of chunk processing
            q = Search(using=client, index=index).filter("terms", **{key: chunk_of_values})  # type: Search
            # params:
            # conflicts="proceed": Ignores version conflict errors if a doc delete is attempted more than once
            # slices="auto": Will create parallel delete batches per shard
            q = q.params(conflicts="proceed", slices="auto")
            response = q.delete()
            # Some subtle errors come back on the response
            if response["timed_out"]:
                msg = f"Delete request timed out on cluster after {int(response['took'])/1000:.2f}s"
                logger.error(format_log(msg=msg, action="Delete", name=task_id))
                raise RuntimeError(msg)
            if response["failures"]:
                fail_snippet = "\n\t\t" + "\n\t\t".join(map(str, response["failures"][0:4])) + "\n\t\t" + "..."
                msg = f"Some docs failed to delete on cluster:{fail_snippet}"
                logger.error(format_log(msg=msg, action="Delete", name=task_id))
                raise RuntimeError(msg)
            logger.info(
                format_log(
                    f"Deleted {response['deleted']:,} docs in ES from chunk of size {len(chunk_of_values):,} "
                    f"in {int(response['took'])/1000:.2f}s, "
                    f"and ignored {response['version_conflicts']:,} version conflicts",
                    action="Delete",
                    name=task_id,
                )
            )
            deleted += response["deleted"]
            chunks_processed += 1
    except Exception:
        is_error = True
        logger.exception(format_log("", name=task_id, action="Delete"))
        raise
    finally:
        if deleted > 0 and (refresh_after or is_error):
            if not is_error:
                refresh_msg = "Refreshing index so deletes take effect"
            else:
                refresh_msg = "Attempting index refresh while handling error so deletes take effect"
            logger.info(format_log(refresh_msg, action="Delete", name=task_id))
            client.indices.refresh(index=index)
        if chunks_processed > 1 or is_error:
            # This summary log is only useful when reporting the sum of multiple chunks' deletes (or an error)
            error_text = " before encountering an error" if is_error else ""
            duration = perf_counter() - start
            docs = f"document{'s' if deleted != 1 else ''}"
            msg = f"Delete operation took {duration:.2f}s. Removed {deleted:,} total {docs}{error_text}"
            logger.info(format_log(msg, action="Delete", name=task_id))

    return deleted
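For orientation only, a hedged usage sketch of the helper above; the client URL, key field name, index alias, and id values are illustrative assumptions, not taken from the surrounding codebase:

from elasticsearch import Elasticsearch

client = Elasticsearch("http://localhost:9200")   # assumed cluster location
doomed_ids = ["ID_0001", "ID_0002"]               # hypothetical unique-key values

deleted_count = delete_docs_by_unique_key(
    client=client,
    key="record_unique_id",        # hypothetical keyword-typed field
    value_list=doomed_ids,
    task_id="example_delete_job",
    index="records-alias",         # hypothetical alias covering the target index(es)
    delete_chunk_size=1000,
)
print(f"Deleted {deleted_count} documents")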
def delete_by_id(id):
    client = connect_es()
    s = Search(using=client, index=ELASTIC_INDEX)
    s.query = Q('term', _id=id)
    return s.delete()
def make_apartments_sold_in_elastic() -> None:
    s_obj = Search(index="test-apartment").query(
        "match", apartment_state_of_sale=ApartmentStateOfSale.FOR_SALE
    )
    s_obj.delete()
    sleep(3)
def remove(cls, name):
    q = [w for w in name.split(' ')]
    pers = Search(index='softwareprofs').query(
        "simple_query_string", query=' +'.join(q), fields=["name"])
    pers.delete()
def delete_document(book_id):
    s = Search(index='book-index').query('match', index__id=book_id)
    s.delete()
def deletePost(url):
    s = Search(index='blog').query('match', _id=url)
    response = s.delete()
    return response
# The API is chainable
s = Search().using(client).query('match', title='python')

# Send the request
response = s.execute()

# Requests are cached by default by the Python client;
# subsequent calls to execute will not trigger additional
# requests being sent to Elasticsearch.
# To force a request, specify `ignore_cache=True` when sending
# a request.

# Much like the Django ORM we are familiar with, we can delete
# the documents matching a search by calling delete
s = Search().query('match', title='python')
response = s.delete()

#####################################
# QUERIES

# Query objects are a one-to-one mapping to the ES query DSL:
from elasticsearch_dsl.query import MultiMatch, Match

MultiMatch(query='python django', fields=['title', 'body'])
# {"multi_match": {"query": "python django", "fields": ["title", "body"]}}

Match(title={'query': 'web framework', 'type': 'phrase'})
# {"match": {"title": {"query": "web framework", "type": "phrase"}}}
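The two behaviors noted above, response caching on execute() and delete-by-query via delete(), are easy to try end to end. A minimal sketch, assuming a local cluster at http://localhost:9200 and an illustrative "blog" index with a title field (both hypothetical here):

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

client = Elasticsearch("http://localhost:9200")  # assumed local cluster

s = Search(using=client, index="blog").query("match", title="python")
first = s.execute()                    # hits Elasticsearch
cached = s.execute()                   # served from the cached response
fresh = s.execute(ignore_cache=True)   # forces a new request

# delete() issues a _delete_by_query; the response mirrors that API
d = Search(using=client, index="blog").query("match", title="python").delete()
print(d["deleted"], d["took"])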
                    }
                }
            }
        }
    }
})
paper_info_s = paper_info_s.source(['PaperId'])

# Get number of query results
results = paper_info_s[:NUM_PAPERS]
papers = [x.PaperId for x in results.execute()]

# Check if the paper has been seen before, and thus needs to be deleted
checked_papers = last_papers.intersection(set(papers))
if checked_papers:
    delete_info_s = Search(index='paper_info', using=client)
    delete_info_s = delete_info_s.query("match", PaperId=list(checked_papers))
    delete_info_s.delete()

last_papers = set(papers).difference(checked_papers)
papers = list(last_papers)
print(papers)

# Get updated information
process_res, partial_res = paper_info_multiquery(papers)  # , force=True)

# Generate cached entries
cache_paper_info(process_res)
cache_paper_info(partial_res, chunk_size=100)
def _delete(_by_filter):
    search = Search(index=self._index, doc_type=self._mapping_type, using=self._es_client)
    search = search.query(_by_filter)
    return search.delete()
partial_papers = [p for (p, t) in paper_ids if t == 'partial']

print('[{}] -- Generate cache entries'.format(datetime.now()))
complete_res, partial_res = paper_info_multiquery(
    complete_papers, query_filter=cache_allow,
    partial_updates=partial_papers, recache=True)

print('[{}] -- Add to cache'.format(datetime.now()))
cache_paper_info(complete_res, additional_tag={'UpdateVersion': START_VERSION})
cache_paper_info(partial_res, additional_tag={'UpdateVersion': START_VERSION})

print('[{}] -- Remove old paper ids'.format(datetime.now()))
res_ids = [p['PaperId'] for p in complete_res + partial_res]
old_ids = [p for p in next(zip(*paper_ids)) if p not in res_ids]
if len(old_ids) > 0:
    remove_s = Search(index='paper_info', using=client)
    remove_s = remove_s.query('terms', PaperId=old_ids)
    remove_s.delete()

print('[{}] - Finish batch {}\n'.format(datetime.now(), counter))
counter += 1
complete_updated += len(complete_res)
partial_updated += len(partial_res)
removed += len(old_ids)

print('\n[{}] - Complete: {}, Partial: {}, Total: {}, Remove: {}\n'.format(
    datetime.now(), complete_updated, partial_updated,
    complete_updated + partial_updated, removed))