from django.conf import settings ELASTICSEARCH_ENABLED = hasattr(settings, 'ELASTICSEARCH_DSL') if ELASTICSEARCH_ENABLED: connections.create_connection( hosts=[settings.ELASTICSEARCH_DSL['default']['hosts']]) from elasticsearch import Elasticsearch es = Elasticsearch(settings.ELASTICSEARCH_DSL['default']['hosts']) from elasticsearch.client import IngestClient c = IngestClient(es) try: c.get_pipeline('geoip') except elasticsearch.exceptions.NotFoundError: c.put_pipeline('geoip', body='''{ "description" : "Add geoip info", "processors" : [ { "geoip" : { "field" : "ip" } } ] }''') class GeoIp(InnerDoc):
class IngestConnector:
    """Manage one Elasticsearch ingest pipeline and the index fed through it.

    The pipeline holds a single ``attachment`` processor that reads from
    ``field``. Documents are written to the index ``<pipeline_id>_index``.
    Every Elasticsearch call goes through ``current_app.elasticsearch``.
    """

    def __init__(
            self,
            pipeline_id: str = "pdf_content",
            field: str = "data",
            pipeline_description: str = "Extracting info from pdf content"
    ) -> None:
        """Store the pipeline settings and build the ingest client.

        Args:
            pipeline_id: Id of the ingest pipeline; also the prefix of the
                index name (``pipeline_id + "_index"``).
            field: Document field the ``attachment`` processor reads from.
            pipeline_description: Description stored with the pipeline.
        """
        self.pipeline_id: str = pipeline_id
        self.index_name: str = pipeline_id + "_index"
        self.field: str = field
        self.pipeline_description: str = pipeline_description
        self.ingest_client = IngestClient(current_app.elasticsearch)

    def create_pipeline(self):
        """Create or overwrite the pipeline with one ``attachment`` processor."""
        self.ingest_client.put_pipeline(
            id=self.pipeline_id,
            body={
                'description': self.pipeline_description,
                'processors': [{"attachment": {"field": self.field}}],
            })

    def delete_pipeline(self):
        """Delete the pipeline identified by ``self.pipeline_id``."""
        self.ingest_client.delete_pipeline(id=self.pipeline_id)

    def get_pipeline(self):
        """Return the client's response for ``self.pipeline_id``."""
        return self.ingest_client.get_pipeline(id=self.pipeline_id)

    def add_to_index(self, id_: int, content: str, content_page: int,
                     content_paragraph: int):
        """Index one piece of text through the pipeline.

        Args:
            id_: Document id to index under.
            content: Text to index; it is UTF-8 encoded and then
                base64-encoded before being stored under ``self.field``.
            content_page: Stored as ``content_page`` on the document.
            content_paragraph: Stored as ``content_paragraph`` on the
                document.
        """
        encoded = base64.b64encode(content.encode("utf-8")).decode("utf-8")
        current_app.elasticsearch.index(
            index=self.index_name,
            id=id_,
            pipeline=self.pipeline_id,
            body={
                self.field: encoded,
                "content_page": content_page,
                "content_paragraph": content_paragraph,
            })

    def remove_from_index(self, id_: int):
        """Delete the document with id ``id_`` from the index."""
        current_app.elasticsearch.delete(index=self.index_name, id=id_)

    def api_search(self, query: str):
        """Run a ``match`` query on ``attachment.content``; return the raw response."""
        return current_app.elasticsearch.search(
            index=self.index_name,
            body={"query": {"match": {"attachment.content": query}}})

    def search(self, query: str):
        """Return the first ``KnowledgePdfContent`` row matching ``query``.

        The ids of the Elasticsearch hits are looked up in the database,
        ordered the same way the hits were returned, and the first row found
        is returned.

        Returns:
            The first matching row, or ``None`` when Elasticsearch returns no
            hits or none of the hit ids exists in the database.
        """
        hits = self.api_search(query)['hits']['hits']
        ids = [int(hit['_id']) for hit in hits]
        if not ids:
            return None

        # Pair every id with its position in the hit list; the CASE
        # expression built from these pairs makes the database sort rows in
        # hit order.
        when = [(doc_id, rank) for rank, doc_id in enumerate(ids)]
        # Only the leading row is needed, so ask for just that one instead
        # of loading every match.
        return KnowledgePdfContent.query.filter(
            KnowledgePdfContent.id.in_(ids)).order_by(
                db.case(when, value=KnowledgePdfContent.id)).first()
# NOTE(review): `es`, `body`, `model_id` and `processors` are defined in
# earlier cells; `es` is assumed to be an `Elasticsearch` client instance,
# so the calls below go through its `ingest` / `indices` accessors rather
# than calling the API classes unbound with `es` passed as `self`.

# simulate ingest pipeline
es.ingest.simulate(body=body)

# In[ ]:

# store the pipeline for use in prod
pipeline_name = model_id + '_ingest_pipeline'
body = {'description': 'predict flower type', 'processors': processors}
es.ingest.put_pipeline(id=pipeline_name, body=body)

# In[ ]:

# verify pipeline
es.ingest.get_pipeline(id=pipeline_name)

# In[ ]:

# create index template with our new pipeline as the default pipeline
settings = {
    "index_patterns": ["flower_measurements-*"],
    "settings": {
        # Reuse the name the pipeline was stored under, so the template
        # cannot point at a different pipeline than the one created above.
        "default_pipeline": pipeline_name
    }
}
template_name = 'flowers_measurement'
es.indices.put_template(name=template_name, body=settings)