def __init__(self, url, index_name):
    super().__init__()
    self.index_name = index_name
    logger.info("Connecting to ES @ %s", url)
    self.es = elasticsearch.Elasticsearch(hosts=[url])
    self.filter = SearchFilter()
    if not self.es.indices.exists(self.index_name):
        self.init()
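# Usage sketch (host and index name below are illustrative, not defaults):
#   engine = ElasticSearchEngine("http://localhost:9200", "od-database")
# The constructor only runs init() when the index does not already exist,
# so re-instantiating against a live index is safe.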
def init(self):
    logger.info("Elasticsearch first time setup")
    if self.es.indices.exists(self.index_name):
        self.es.indices.delete(index=self.index_name)

    self.es.indices.create(index=self.index_name, body={
        "settings": {
            "index": {
                "number_of_shards": 50,
                "number_of_replicas": 0,
                "refresh_interval": "30s",
                "codec": "best_compression"
            },
            "analysis": {
                "analyzer": {
                    "my_nGram": {
                        "tokenizer": "my_nGram_tokenizer",
                        "filter": ["lowercase", "asciifolding"]
                    }
                },
                "tokenizer": {
                    "my_nGram_tokenizer": {
                        "type": "nGram",
                        "min_gram": 3,
                        "max_gram": 3
                    }
                }
            }
        }
    })

    # Index mappings; routing is required because documents are
    # routed by website_id
    self.es.indices.put_mapping(body={
        "properties": {
            "path": {"analyzer": "standard", "type": "text"},
            "name": {"analyzer": "standard", "type": "text",
                     "fields": {"nGram": {"type": "text", "analyzer": "my_nGram"}}},
            "mtime": {"type": "date", "format": "epoch_second"},
            "size": {"type": "long"},
            "website_id": {"type": "integer"},
            "ext": {"type": "keyword"},
        },
        "_routing": {"required": True}
    }, doc_type="file", index=self.index_name, include_type_name=True,
        request_timeout=60)

    self.es.indices.open(index=self.index_name)
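# What the custom analyzer buys us: with min_gram = max_gram = 3, a file
# name such as "Readme.txt" is lowercased, ascii-folded, then split into
# trigrams ("rea", "ead", "adm", "dme", "me.", "e.t", ".tx", "txt"), so a
# substring query like "adm" can match mid-word. One way to verify this
# against a running cluster (index name is illustrative):
#   self.es.indices.analyze(index="od-database", body={
#       "analyzer": "my_nGram", "text": "Readme.txt"})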
def search(self, query, page, per_page, sort_order, extensions, size_min,
           size_max, match_all, fields, date_min, date_max) -> dict:

    if self.filter.should_block(query):
        logger.info("Search was blocked")
        raise InvalidQueryException(
            "One or more terms in your query is blocked by the search filter. "
            "This incident has been reported.")

    filters = []

    if extensions:
        filters.append({"terms": {"ext": extensions}})

    if size_min > 0 or size_max:
        size_filter = dict()
        new_filter = {"range": {"size": size_filter}}
        if size_min > 0:
            size_filter["gte"] = size_min
        if size_max:
            size_filter["lte"] = size_max
        filters.append(new_filter)

    if date_min > 0 or date_max:
        date_filter = dict()
        new_filter = {"range": {"mtime": date_filter}}
        if date_min > 0:
            date_filter["gte"] = date_min
        if date_max:
            date_filter["lte"] = date_max
        filters.append(new_filter)

    sort_by = ElasticSearchEngine.SORT_ORDERS.get(sort_order, [])

    results = self.es.search(body={
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": fields,
                        # "and" requires every term to match when
                        # match_all is set
                        "operator": "and" if match_all else "or"
                    }
                },
                "filter": filters
            }
        },
        "sort": sort_by,
        "highlight": {
            "fields": {
                "name": {"pre_tags": ["<mark>"], "post_tags": ["</mark>"]},
                "name.nGram": {"pre_tags": ["<mark>"], "post_tags": ["</mark>"]},
                "path": {"pre_tags": ["<mark>"], "post_tags": ["</mark>"]}
            }
        },
        "size": per_page,
        # Elasticsearch rejects from + size > 10000 by default
        # (index.max_result_window), so clamp the offset
        "from": min(page * per_page, 10000 - per_page)
    }, index=self.index_name, request_timeout=20)

    return results
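# Usage sketch (all argument values are illustrative; "score" is assumed
# to be a key of ElasticSearchEngine.SORT_ORDERS, and the ^n suffixes are
# standard Elasticsearch per-field boosts):
#   results = engine.search(
#       query="ubuntu iso", page=0, per_page=50, sort_order="score",
#       extensions=["iso"], size_min=0, size_max=None, match_all=True,
#       fields=["name^5", "name.nGram^2", "path"],
#       date_min=0, date_max=None)
#   for hit in results["hits"]["hits"]:
#       print(hit["_source"]["name"])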