def url_ngram_analyzer():
    """
    An analyzer for creating URL safe n-grams.

    Returns:
        Analyzer
    """
    return analyzer(
        'url_ngram',
        tokenizer='keyword',
        filter=[
            'lowercase',
            # Strip the protocol, we don't need it for searching.
            token_filter(
                'url_protocol_filter',
                type='pattern_replace',
                pattern=r'^\w+:\/\/',
                replace='',
            ),
            'word_delimiter',
            # Create trigrams from the address.
            token_filter(
                'url_ngram_filter',
                type='ngram',
                min_gram=3,
                max_gram=3,
            ),
        ],
    )
def configure_index(idx):
    """Configure ES index settings.

    NOTE: This is unused at the moment. Current issues:

    1. The index needs to be created (index.create() or search_index --create)
       with update_all_types=True because the attribute name is the same in
       Person and Company.
       https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.client.IndicesClient.create
       name = fields.TextField(attr="fullname", analyzer=lb_analyzer)
    2. How to specify a token filter for an attribute?

    Therefore the index needs to be configured outside Django.
    """
    idx.settings(number_of_shards=1, number_of_replicas=0)
    lb_filter = token_filter(
        "lb_filter",
        "stop",
        stopwords=["i"]
    )
    lb_analyzer = analyzer(
        "lb_analyzer",
        tokenizer="standard",
        filter=["standard", "lb_filter", "asciifolding", "lowercase"]
    )
    return lb_analyzer, lb_filter
def email_main_analyzer():
    """
    An analyzer for creating "words" from email addresses.

    This analyzer splits email addresses on special characters. For example,
    john.doe@crm.example.com would become [john, doe, crm, example, com].
    These tokens, when combined with ngrams, provide nice fuzzy matching
    while boosting full word matches.

    Returns:
        Analyzer: An analyzer suitable for analyzing email addresses.
    """
    return analyzer(
        'email',
        # We tokenize with token filters, so use the no-op keyword tokenizer.
        tokenizer='keyword',
        filter=[
            'lowercase',
            # Split the email address on special characters.
            token_filter(
                'email_word_delimiter',
                type='word_delimiter',
                # Ensure words like hello2lily are kept as one token.
                split_on_numerics=False,
            ),
        ],
    )
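# A minimal verification sketch for the analyzer above (not part of the original
# module). It assumes an elasticsearch_dsl version that provides
# ``Analyzer.simulate()`` and a cluster registered as the default connection.
# It should print the word tokens described in the docstring.
if __name__ == '__main__':
    tokens = email_main_analyzer().simulate('john.doe@crm.example.com').tokens
    print([t.token for t in tokens])  # expected: ['john', 'doe', 'crm', 'example', 'com']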
def test_analyzers_returned_from_to_dict():
    random_analyzer_name = ''.join(
        (choice(string.ascii_letters) for _ in range(100)))
    random_analyzer = analyzer(
        random_analyzer_name, tokenizer="standard", filter="standard")
    index = Index('i', using='alias')
    index.analyzer(random_analyzer)

    assert index.to_dict()["settings"]["analysis"]["analyzer"][random_analyzer_name] == {
        "filter": ["standard"],
        "type": "custom",
        "tokenizer": "standard",
    }
def suggest():
    suggestfilter = token_filter("suggestfilter", type="ngram", **other_param)
    bestsuggest = analyzer(
        "bestsuggest",
        tokenizer="standard",
        filter=["lowercase", suggestfilter, "asciifolding"])
    return bestsuggest
def ngram(min_gram=2, max_gram=4):
    base_name = "ngram_%d_%d" % (min_gram, max_gram)
    return dsl.analyzer(
        base_name + "_analyzer",
        tokenizer=dsl.tokenizer(
            base_name + "_tokenizer",
            'nGram',
            min_gram=min_gram,
            max_gram=max_gram,
            token_chars=["letter", "digit"]),
        filter=['lowercase'])
def test_simulate_complex(client):
    a = analyzer(
        'my-analyzer',
        tokenizer=tokenizer('split_words', 'simple_pattern_split', pattern=':'),
        filter=['lowercase', token_filter('no-ifs', 'stop', stopwords=['if'])])

    tokens = a.simulate('if:this:works', using=client).tokens

    assert len(tokens) == 2
    assert ['this', 'works'] == [t.token for t in tokens]
def get_default_text_analyzer():
    return analyzer(
        'froide_analyzer',
        tokenizer='standard',
        filter=[
            'standard',
            'lowercase',
            'asciifolding',
        ]
    )
def get_default_ngram_analyzer():
    return analyzer(
        'froide_ngram_analyzer',
        tokenizer=tokenizer(
            'froide_ngram_tokenzier',
            type='edge_ngram',
            min_gram=1,
            max_gram=15,
            token_chars=['letter', 'digit']
        ),
        filter=[
            'standard',
            'lowercase',
            'asciifolding',
        ]
    )
def test_cloned_index_has_analysis_attribute():
    """
    Regression test for Issue #582 in which `Index.clone()` was not copying
    over the `_analysis` attribute.
    """
    client = object()
    i = Index('my-index', using=client)

    random_analyzer_name = ''.join(
        (choice(string.ascii_letters) for _ in range(100)))
    random_analyzer = analyzer(
        random_analyzer_name, tokenizer="standard", filter="standard")

    i.analyzer(random_analyzer)

    i2 = i.clone('my-clone-index')

    assert i.to_dict()['settings']['analysis'] == i2.to_dict()['settings']['analysis']
def get_text_analyzer():
    return analyzer(
        'fds_analyzer',
        tokenizer='standard',
        filter=[
            'keyword_repeat',
            token_filter('decomp', type='decompound', subwords_only=True),
            'lowercase',
            token_filter('stop_de', type='stop', stopwords="_german_"),
            'german_normalization',
            'asciifolding',
            token_filter('de_stemmer', type='stemmer', name='light_german'),
            token_filter('unique_stem', type='unique', only_on_same_position=True),
        ],
    )
def standard_ascii_analyzer():
    """
    Elasticsearch's standard analyzer with asciifolding.

    The asciifolding filter converts non-ascii letters to their ascii
    counterparts. It essentially cleans diacritics from strings.

    Returns:
        Analyzer
    """
    return analyzer(
        'standard_ascii',
        tokenizer='standard',
        filter=[
            'standard',
            'lowercase',
            'asciifolding',
        ]
    )
def add_analyzer(index: Index):
    """Adds a new analyzer to the index, available to all of its fields.

    The analyzer applies lower case + ascii folding: it strips accents and the
    use of 'ñ', among other things, to allow searching Spanish text.
    """
    synonyms = list(Synonym.objects.values_list('terms', flat=True))
    filters = ['lowercase', 'asciifolding']
    if synonyms:
        filters.append(
            token_filter(constants.SYNONYM_FILTER, type='synonym', synonyms=synonyms))

    index.analyzer(
        analyzer(constants.ANALYZER, tokenizer='standard', filter=filters)
    )
def gen_name_analyzer_excluding_terms(excluding_terms):
    """Creates a name analyzer that only returns ET (excluding terms).

    For example, if the ET configuration file contains the following rules:

        santa, salta, santo
        caba, cba

    then applying the analyzer to the query 'salta' should return 'santa' and
    'santo', while searching for 'caba' should return 'cba'. The analyzer is
    used to exclude results from specific searches.

    Args:
        excluding_terms (list): List of ET to use, specified as Solr synonyms.

    Returns:
        elasticsearch_dsl.analysis.Analyzer: text analyzer named
            'name_analyzer_excluding_terms'.
    """
    name_excluding_terms_filter = token_filter(
        'name_excluding_terms_filter',
        type='synonym',
        synonyms=excluding_terms
    )

    return analyzer(
        name_analyzer_excluding_terms,
        tokenizer='standard',
        filter=[
            'lowercase',
            'asciifolding',
            name_excluding_terms_filter,
            synonyms_only_filter,
            spanish_stopwords_filter
        ]
    )
def gen_name_analyzer_synonyms(synonyms):
    """Creates a name analyzer with synonyms.

    Args:
        synonyms (list): List of synonyms to use, in Solr format.

    Returns:
        elasticsearch_dsl.analysis.Analyzer: text analyzer named
            'name_analyzer_synonyms'.
    """
    name_synonyms_filter = token_filter(
        'name_synonyms_filter',
        type='synonym',
        synonyms=synonyms
    )

    return analyzer(
        name_analyzer_synonyms,
        tokenizer='standard',
        filter=[
            'lowercase',
            'asciifolding',
            name_synonyms_filter,
            spanish_stopwords_filter
        ]
    )
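# Illustrative sketch of the Solr synonym format consumed by the two
# generators above (not part of the original module; the filter and analyzer
# names here are made up). Comma-separated entries form a bidirectional
# equivalence group, while "=>" defines a one-way mapping.
from elasticsearch_dsl import analyzer, token_filter

example_synonyms_filter = token_filter(
    'example_synonyms_filter',
    type='synonym',
    synonyms=[
        'santa, salta, santo',                      # bidirectional group
        'caba => ciudad autonoma de buenos aires',  # one-way mapping
    ],
)

example_synonyms_analyzer = analyzer(
    'example_synonyms_analyzer',
    tokenizer='standard',
    filter=['lowercase', 'asciifolding', example_synonyms_filter],
)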
class Post(DocType):
    date = Date()
    url = Keyword()
    author = Keyword()
    topic = Keyword()
    board = Keyword()
    title_origin = Keyword()
    title_unigram = Text(analyzer=analyzer('whitespace'))
    title_ccjieba = Text(analyzer=analyzer('whitespace'))
    title_pos = Text(analyzer=analyzer('whitespace'))
    title_quality = HalfFloat()
    comments = Nested(
        properties={
            'comment_author': Keyword(),
            'comment_origin': Keyword(),
            'comment_unigram': Text(analyzer=analyzer('whitespace')),
            'comment_ccjieba': Text(analyzer=analyzer('whitespace')),
            'comment_pos': Text(analyzer=analyzer('whitespace')),
            'comment_audio_url': Keyword(),
            'comment_quality': HalfFloat()
        })

    class Meta:
        index = 'post'

    def save(self, *args, **kwargs):
        return super(Post, self).save(*args, **kwargs)

    def add_comment(self, comment_author, comment_origin, comment_unigram,
                    comment_ccjieba, comment_pos, comment_audio_url,
                    comment_quality):
        self.comments.append({
            'comment_author': comment_author,
            'comment_origin': comment_origin,
            'comment_unigram': comment_unigram,
            'comment_ccjieba': comment_ccjieba,
            'comment_pos': comment_pos,
            'comment_audio_url': comment_audio_url,
            'comment_quality': comment_quality
        })


def bulk_dicts(docs):
    dicts = (d.to_dict(include_meta=True) for d in docs)
    return dicts
def url_main_analyzer():
    """
    An analyzer for creating "words" from URLs.

    Returns:
        Analyzer
    """
    return analyzer(
        'url',
        tokenizer='keyword',
        filter=[
            'lowercase',
            # Strip the protocol, we don't need it for searching.
            token_filter(
                'url_protocol_filter',
                type='pattern_replace',
                pattern=r'^\w+:\/\/',
                replace='',
            ),
            'word_delimiter',
        ],
    )
def phone_number_analyzer():
    """
    An analyzer to do complex partial matching on phone numbers.

    Returns:
        Analyzer: An analyzer suitable for searching phone numbers.
    """
    return analyzer(
        'phone_number',
        # We only want n-grams, which we want to create as late as possible.
        tokenizer='keyword',
        filter=[
            # Strip all special chars, don't tokenize.
            token_filter(
                'phone_word_delimiter',
                type='word_delimiter',
                generate_word_parts=False,
                generate_number_parts=False,
                catenate_all=True,
            ),
            # Strip any zeros from the start of the number.
            token_filter(
                'leading_zero_filter',
                type='pattern_replace',
                pattern='^(0+)',
                replace='',
            ),
            # Create n-grams of all lengths to support partial matching.
            token_filter(
                'phone_ngram_filter',
                type='ngram',
                # Still undecided on whether this should be 3 or 4.
                # 3 means users have to type fewer digits to get results,
                # but the matching is less accurate.
                min_gram=3,
                max_gram=32,
            ),
        ],
    )
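# A small usage sketch for the phone analyzer above (not in the original
# module). It assumes ``Analyzer.simulate()`` is available and a default
# connection is configured; the phone number is an arbitrary example.
def _print_phone_tokens():
    tokens = phone_number_analyzer().simulate('+31 (0)20-1234567').tokens
    # The word_delimiter filter concatenates the digits, the pattern_replace
    # filter drops leading zeros, and the ngram filter then emits every
    # 3-to-32 character fragment, so partial queries like '1234' can match.
    print([t.token for t in tokens])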
def es_analyzer_for_locale(locale, search_analyzer=False):
    """Pick an appropriate analyzer for a given locale.

    If no analyzer is defined for `locale`, or the locale's analyzer uses a
    plugin but plugins are turned off in settings, return an analyzer named
    "default_sumo".
    """

    name = ""
    analyzer_config = config.ES_LOCALE_ANALYZERS.get(locale)

    if not analyzer_config or (
        analyzer_config.get("plugin") and not settings.ES_USE_PLUGINS
    ):
        name = config.ES_DEFAULT_ANALYZER_NAME
        analyzer_config = {}

    # use default values from ES_DEFAULT_ANALYZER if not overridden
    # using python 3.9's dict union operator
    analyzer_config = config.ES_DEFAULT_ANALYZER | analyzer_config

    # turn dictionaries into `char_filter` and `token_filter` instances
    filters = _insert_custom_filters(name or locale, analyzer_config["filter"])
    char_filters = _insert_custom_filters(
        name or locale, analyzer_config["char_filter"], char=True
    )

    if search_analyzer:
        # create a locale-specific search analyzer, even if the index-time analyzer is
        # `sumo_default`. we do this so that we can adjust the synonyms used in any locale,
        # even if it doesn't have a custom analysis chain set up, without having to re-index
        name = locale + "_search_analyzer"
        filters.append(_create_synonym_graph_filter(config.ES_ALL_SYNONYMS_NAME))
        filters.append(_create_synonym_graph_filter(locale))

    return analyzer(
        name or locale,
        tokenizer=analyzer_config["tokenizer"],
        filter=filters,
        char_filter=char_filters,
    )
def email_ngram_analyzer():
    """
    An analyzer for creating email safe ngrams.

    This analyzer first splits the local part and domain name, then creates
    n-grams (overlapping fragments) from the remaining strings, minus any
    special characters.

    Returns:
        Analyzer: An analyzer suitable for analyzing email addresses.
    """
    return analyzer(
        'email_ngram',
        # Split the email address at the @ sign.
        tokenizer=tokenizer(
            'at_sign_tokenizer',
            type='pattern',
            pattern='@',
        ),
        filter=[
            'lowercase',
            # Strip any special characters from the email address.
            token_filter(
                'email_ngram_word_delimiter',
                type='word_delimiter',
                split_on_numerics=False,
                catenate_all=True,
            ),
            # Create trigrams from the address.
            token_filter(
                'email_ngram_filter',
                type='ngram',
                min_gram=3,
                max_gram=3,
            ),
        ],
    )
class LagouJob(Document):
    # Completion suggestions
    suggestion = Completion(analyzer=analyzer('ik_smart'))
    job_id = Keyword()
    # Job title
    title = Text(analyzer="ik_max_word")
    # URL
    url = Keyword()
    # Salary
    salary = FloatRange()
    # # Salary lower bound
    # salary_min = Float()
    # Work experience
    work_years = FloatRange()
    # # Minimum years of experience
    # work_year_min = Integer()
    # Required education
    degree_need = Float()
    # Job type: internship / part-time / full-time
    job_type = Keyword()
    # Publication time
    publish_time = Date()
    # Job perks
    job_advantage = Text(analyzer="ik_max_word")
    # Job description
    job_desc = Text(analyzer="ik_max_word")
    # Job city
    job_city = Keyword()
    # Job address
    job_addr = Text(analyzer="ik_max_word")
    # Company URL
    company_url = Keyword()
    # Company name
    company_name = Keyword()

    class Index:
        name = 'a51job'
def bigram_analyzer():
    """
    An n-gram analyzer of length 2.

    Bigrams provide nice partial, fuzzy matching.

    Returns:
        Analyzer
    """
    return analyzer(
        'bigram',
        tokenizer=tokenizer(
            'bigram_tokenizer',
            type='ngram',
            min_gram=2,
            max_gram=2,
            token_chars=['letter', 'digit'],
        ),
        filter=[
            'standard',
            'lowercase',
            'asciifolding',
        ],
    )
def gen_suggests(index, info_tuple):
    # Generate the search-suggestion array from the given strings.
    used_words = set()
    suggests = []
    for text, weight in info_tuple:
        if text:
            # Call the ES analyze API to analyze the string.
            ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"])
            my_analyzer = analyzer(
                'my_analyzer',
                tokenizer=tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
                filter=['lowercase'])
            i = Index(index)
            i._analysis = ik_analyzer
            # i.analyzer(analyzer=ik_analyzer)
            # i.analyzer.default.type: "ik_max_word"
            a = i.analyze(params={'filter': ["lowercase"]}, body=text)
            # i.analyzer(analyzer = "ik_max_word")
            words = es.indices.analyze(
                index=index, params={'filter': ["lowercase"]}, body=text)
            analyzed_words = set(
                [r["token"] for r in words["tokens"] if len(r["token"]) > 1])
            new_words = analyzed_words - used_words
        else:
            new_words = set()

        if new_words:
            suggests.append({"input": list(new_words), "weight": weight})

    return suggests
class Book(Document):
    """An object representing a book in ES."""
    title = Text(
        fields={
            "no_vowels": Text(
                analyzer=analyzer("no_vowels", "pattern", pattern="[\Waeiouy]"),  # noqa: W605
                search_analyzer="standard")
        })
    ref = Keyword() if MAJOR_ES > 2 else Text(index="not_analyzed")
    edition = Text()
    author = Object(properties={"name": Text(), "birthdate": Date()})
    publication_date = Date()
    n_pages = Integer()

    if ES6:
        illustrators = Nested(Illustrator)

        class Index:
            name = "bk"
    else:
        illustrators = Nested(
            properties={
                "name": Text(),
                "birthdate": Date(),
                "nationality": Keyword() if MAJOR_ES > 2 else Text(index="not_analyzed"),
            })

        class Meta:
            index = "bk"
def _analyzer(self, species, dump=True):
    autophrase_syns, syns = self._synonyms(species)

    if dump:
        with open('autophrase_syns.txt', 'w') as f:
            f.writelines(l + '\n' for l in autophrase_syns)
        with open('syns.txt', 'w') as f:
            f.writelines(l + '\n' for l in syns)

    autophrase_filter = token_filter(
        f'species_autophrase_syn',
        type='synonym',
        synonyms=autophrase_syns)

    syn_filter = token_filter(
        f'species_syn',
        type='synonym',
        tokenizer='keyword',
        synonyms=syns)

    return analyzer(
        f'species_analyzer',
        tokenizer='lowercase',
        filter=[autophrase_filter, syn_filter],
    )
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, absolute_import

from elasticsearch_dsl import analyzer, tokenizer

# autocomplete tokenizer
edge_ngram_tokenizer = tokenizer(
    'edge_ngram_tokenizer',
    type='edge_ngram',
    min_gram=1,
    max_gram=20,
    token_chars=['letter', 'digit'])

# autocomplete analyzer
edge_ngram_analyzer = analyzer(
    'edge_ngram_analyzer',
    tokenizer=edge_ngram_tokenizer,
    filter=['lowercase', 'asciifolding'],
)

# autocomplete *search* tokenizer
edge_ngram_search_tokenizer = tokenizer(
    'edge_ngram_search_tokenizer',
    type='edge_ngram',
    token_chars=['letter', 'digit'])

search_tokenizer = tokenizer(
    'search_tokenizer',
    type='standard',
    token_chars=['letter', 'digit'])

# autocomplete *search* analyzer
edge_ngram_search_analyzer = analyzer(
    'edge_ngram_search_analyzer',
from elasticsearch_dsl import (
    Date,
    Document,
    Float,
    Integer,
    Keyword,
    Text,
    analyzer,
    token_filter,
)

edge_ngram_analyzer = analyzer(
    "edge_ngram_analyzer",
    type="custom",
    tokenizer="standard",
    filter=[
        "lowercase",
        token_filter("edge_ngram_filter", type="edgeNGram", min_gram=1, max_gram=20),
    ],
)


class PodcastDoc(Document):
    id = Keyword(required=True)
    thumbnail_348 = Keyword()
    thumbnail_160 = Keyword()
    times_picked = Integer()
    episodes_count = Integer()
    episodes_seconds = Float()
    slug = Keyword(required=True, index=False)
    name = Text(required=True, analyzer=edge_ngram_analyzer, search_analyzer="standard")
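# A minimal query sketch for the document above (not part of the original
# module): because ``name`` is indexed with edge ngrams but searched with the
# standard analyzer, a plain match query on a prefix behaves like
# search-as-you-type. Assumes the PodcastDoc index exists and is populated.
def search_podcasts_by_prefix(prefix):
    s = PodcastDoc.search().query('match', name=prefix)
    return [hit.name for hit in s.execute()]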
__author__ = 'mtianyan'
__date__ = '2017/6/25 10:18'

from elasticsearch_dsl import connections, Document, Text, Keyword, Integer, Date, Completion, analyzer

connections.create_connection(hosts=["localhost"])

my_analyzer = analyzer('ik_smart')


class LagouJobIndex(Document):
    suggest = Completion(analyzer=my_analyzer)
    title = Text(analyzer="ik_max_word")
    url = Keyword()
    url_object_id = Keyword()
    salary_min = Integer()
    salary_max = Integer()
    job_city = Keyword()
    work_years_min = Integer()
    work_years_max = Integer()
    degree_need = Text(analyzer="ik_max_word")
    job_type = Keyword()
    publish_time = Date()
    job_advantage = Text(analyzer="ik_max_word")
    job_desc = Text(analyzer="ik_smart")
    job_addr = Text(analyzer="ik_max_word")
    company_name = Keyword()
    company_url = Keyword()
    tags = Text(analyzer="ik_max_word")
    crawl_time = Date()
""" import logging import typing import elasticsearch_dsl as es from elasticsearch_dsl import analysis log = logging.getLogger(__name__) edge_ngram_filter = analysis.token_filter('edge_ngram_filter', type='edge_ngram', min_gram=1, max_gram=15) autocomplete = es.analyzer( 'autocomplete', tokenizer='standard', filter=['standard', 'asciifolding', 'lowercase', edge_ngram_filter]) class User(es.DocType): """Elastic document describing user.""" objectID = es.Keyword() username = es.Text(fielddata=True, analyzer=autocomplete) username_exact = es.Keyword() full_name = es.Text(fielddata=True, analyzer=autocomplete) roles = es.Keyword(multi=True) groups = es.Keyword(multi=True)
"""Elasticsearch DSL Date field chokes on None values and parses empty strings as current date, hence the workaround. TODO: move this upstream in some form.""" def _to_python(self, data): if data is None: return data return super(NoneAwareDate, self)._to_python(data) namesAutocompleteAnalyzer = analyzer( "namesAutocompleteAnalyzer", tokenizer=tokenizer( "autocompleteTokenizer", type="edge_ngram", min_gram=1, max_gram=25, token_chars=["letter", "digit"], ), filter=["lowercase"], ) namesAutocompleteSearchAnalyzer = analyzer( "namesAutocompleteSearchAnalyzer", tokenizer=tokenizer("whitespace"), filter=[ "lowercase" ] )
)

naam_stripper = analysis.char_filter(
    'naam_stripper',
    type='mapping',
    mappings=[
        "-=>' '",  # change '-' to separator
        ".=>' '",  # change '.' to separator
    ]
)

kadastrale_aanduiding = es.analyzer(
    'kadastrale_aanduiding',
    tokenizer='keyword',
    filter=['standard', 'lowercase']
)

adres = es.analyzer(
    'adres',
    tokenizer='standard',
    filter=['standard', 'lowercase', 'asciifolding', synonym_filter],
    char_filter=[adres_split, huisnummer_generate],
)

naam = es.analyzer(
    'naam',
    tokenizer='standard',
from elasticsearch_dsl.document import DocType
from elasticsearch_dsl import analyzer, String

# Slovenian lemmatizer
lemmagen_sl = analyzer('lemmagen_sl',
                       type='custom',
                       tokenizer="uax_url_email",
                       filter=["lowercase"],
                       )


class Document(DocType):
    """
    The :class:`Document` class defines a Type in ElasticSearch
    """
    title = String(analyzer=lemmagen_sl)
        location += '?{}'.format(query_string)
        return location

    class Index:
        name = 'publication'
        settings = {
            'number_of_shards': 1
        }


autocomplete_analyzer = analyzer(
    'autocomplete_analyzer',
    tokenizer=tokenizer(
        'edge_ngram_tokenizer',
        type='edge_ngram',
        min_gram=3,
        max_gram=10,
        token_chars=["letter", "digit"]),
    filter=['lowercase', 'asciifolding', 'trim'])


def get_search_index(model):
    lookup = {
        Author: AuthorDoc,
        Container: ContainerDoc,
        Platform: PlatformDoc,
        Sponsor: SponsorDoc,
        Tag: TagDoc,
    }
    DocType,
    Keyword,
    Text,
    Index,
    analyzer,
    tokenizer,
    token_filter,
    Date
)

namesAutocompleteAnalyzer = analyzer(
    "namesAutocompleteAnalyzer",
    tokenizer=tokenizer(
        "autocompleteTokenizer",
        type="edge_ngram",
        min_gram=1,
        max_gram=25,
        token_chars=["letter", "digit"],
    ),
    filter=["lowercase"],
)

namesAutocompleteSearchAnalyzer = analyzer(
    "namesAutocompleteSearchAnalyzer",
    tokenizer=tokenizer("lowercase")
)

ukrainianAddressesStopwordsAnalyzer = analyzer(
    "ukrainianAddressesStopwordsAnalyzer",
    type="ukrainian",
    filter=[
        token_filter(
kadaster_object_aanduiding = analysis.token_filter(
    'kad_obj_aanduiding_filter',
    type='ngram',
    min_gram=4,
    max_gram=16
)

####################################
#            Analyzers             #
####################################

bouwblok = es.analyzer(
    'bouwblok',
    tokenizer=tokenizer(
        'edge_ngram_filter',
        type='edge_ngram',
        min_gram=2,
        max_gram=4,
        token_chars=["letter", "digit"]),
    filter=['lowercase', divider_stripper],
    # char_filter=[divider_stripper]
)

adres = es.analyzer(
    'adres',
    tokenizer='standard',
    filter=['lowercase', 'asciifolding', synonym_filter],
    # filter=['lowercase', 'asciifolding'],
    char_filter=[naam_stripper],
)

straatnaam = es.analyzer(
    'straatnaam',
from elasticmodels import BooleanField, DateField, Index, IntegerField, StringField
from elasticsearch_dsl import MetaField, analyzer, token_filter, tokenizer

from .models import Report

# override the default analyzer for ES to use an ngram filter that breaks words using
# the standard tokenizer. Allow words to be broken up with underscores
name = analyzer(
    "name",
    # the standard analyzer splits the words nicely by default
    tokenizer=tokenizer("standard"),
    filter=[
        # technically, the standard filter doesn't do anything but we include
        # it anyway just in case ES decides to make use of it
        "standard",
        # obviously, lowercasing the tokens is a good thing
        "lowercase",
        # this emulates a 3-4 ngram, but also includes the whole token itself
        # (which prevents us from having to create multifields)
        token_filter("simple_edge", type="pattern_capture", patterns=["(?=(...))(?=(....))"]),
    ],
)


class ReportIndex(Index):
    category = StringField(
        attr="category.name",
        # need a non_analyzed field for sorting
        fields={"raw": StringField(index="not_analyzed")},
    )

    category_id = IntegerField(attr="category.pk")
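# An illustrative check of the pattern_capture trick above (not part of the
# original module). Assumes ``Analyzer.simulate()`` is available and a default
# connection is configured. The two lookahead groups capture a 3- and a
# 4-character window at each position, and pattern_capture keeps the original
# token too, so "report" should yield something like
# ['report', 'rep', 'repo', 'epo', 'epor', ...].
def _show_name_tokens():
    print([t.token for t in name.simulate('report').tokens])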
from django_elasticsearch_dsl import DocType, Index, fields
from elasticsearch_dsl import analyzer, token_filter

from ..account.models import User
from ..order.models import Order
from ..product.models import Product

storefront = Index('storefront')
storefront.settings(number_of_shards=1, number_of_replicas=0)


partial_words = token_filter(
    'partial_words', 'edge_ngram', min_gram=3, max_gram=15)
title_analyzer = analyzer(
    'title_analyzer',
    tokenizer='standard',
    filter=[partial_words, 'lowercase'])
email_analyzer = analyzer('email_analyzer', tokenizer='uax_url_email')


@storefront.doc_type
class ProductDocument(DocType):
    title = fields.StringField(analyzer=title_analyzer)

    def prepare_title(self, instance):
        return instance.name

    class Meta:
        model = Product
        fields = ['name', 'description', 'is_published']
from datetime import datetime
from pytz import timezone
from ipaddress import ip_address

from elasticsearch import ConflictError, NotFoundError
from elasticsearch_dsl import Document, Date, Text, Keyword, Mapping, InnerDoc, \
    Object, Nested, MetaField, Q, Long, Boolean, Double, Binary, Ip, analyzer
from elasticsearch_dsl.utils import AttrList

from pytest import raises, fixture

snowball = analyzer('my_snow', tokenizer='standard',
                    filter=['standard', 'lowercase', 'snowball'])


class User(InnerDoc):
    name = Text(fields={'raw': Keyword()})


class Wiki(Document):
    owner = Object(User)
    views = Long()

    class Index:
        name = 'test-wiki'


class Repository(Document):
    owner = Object(User)
    created_at = Date()
    description = Text(analyzer=snowball)
    tags = Keyword()
import elasticsearch_dsl as es
from django.conf import settings
from elasticsearch_dsl import analyzer, tokenizer

dutch_analyzer = es.analyzer('dutchanalyzer',
                             type='standard',
                             stopwords='_dutch_')

base_analyzer = analyzer('zorg_base_txt',
                         tokenizer=tokenizer('trigram', 'nGram',
                                             min_gram=2, max_gram=20),
                         filter=['lowercase'])

_index = es.Index(settings.ELASTIC_INDEX)


@_index.doc_type
class Term(es.DocType):
    term = es.Text()
    gewicht = es.Integer()


@_index.doc_type
class Organisatie(es.DocType):
    ext_id = es.String(index='not_analyzed')
    naam = es.String(analyzer=dutch_analyzer)  # ngram
    beschrijving = es.String(analyzer=dutch_analyzer)
    afdeling = es.String(index='not_analyzed')
from elasticsearch_dsl import DocType, String, token_filter, analyzer
from django.conf import settings

__author__ = 'erhmutlu'

turkish_stop = token_filter('turkish_stop', type='stop', stopwords="_turkish_")
turkish_lowercase = token_filter('turkish_lowercase', type='lowercase', language="turkish")
turkish_stemmer = token_filter('turkish_stemmer', type='stemmer', language='turkish')
custom_shingle_filter = token_filter('custom_shingle_filter', type='shingle',
                                     max_shingle_size=3, min_shingle_size=2,
                                     output_unigrams=True)

entity_synonym_index_analyzer = analyzer('entity_synonym_index_analyzer',
                                         tokenizer='keyword',
                                         filter=[turkish_lowercase, 'asciifolding', turkish_stemmer])

entity_synonym_search_analyzer = analyzer('entity_synonym_search_analyzer',
                                          tokenizer='standard',
                                          filter=[turkish_lowercase, 'apostrophe', 'asciifolding',
                                                  custom_shingle_filter, turkish_stemmer])


class Entity(DocType):
    entity_synonyms = String(index_analyzer=entity_synonym_index_analyzer,
                             search_analyzer=entity_synonym_search_analyzer,
                             include_in_all=True)
    entity_key = String(index='not_analyzed', include_in_all=False)
    value = String(index='not_analyzed', include_in_all=False)

    @classmethod
    def _get_index(self, index=None):
        return settings.ELASTICSEARCH_INDEX

    @classmethod
    def _get_doctype(self):
from itertools import permutations

from elasticsearch_dsl import Document, Integer, Text, Keyword, Completion, analyzer, token_filter, GeoPoint, Date

# custom analyzer for names
ascii_fold = analyzer(
    'ascii_fold',
    # we don't want to split O'Brian or Toulouse-Lautrec
    tokenizer='whitespace',
    filter=[
        'lowercase',
        token_filter('ascii_fold', 'asciifolding')
    ]
)


class Entity(Document):
    project_id = Integer()
    file_id = Integer()
    id = Text()
    name = Text(fields={'keywords': Keyword()})
    suggest = Completion(analyzer=ascii_fold)

    def clean(self):
        """
        Automatically construct the suggestion input and weight by taking all
        possible permutations of Person's name as ``input`` and taking their
        popularity as ``weight``.
        """
        self.suggest = {
            'input': [' '.join(p) for p in permutations(self.name.split())],
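# A minimal suggest-query sketch for the Completion field defined above (not
# part of the original module). Assumes the Entity index exists and documents
# have been saved so that clean() populated ``suggest``.
def suggest_names(prefix):
    s = Entity.search()
    s = s.suggest('name_suggestions', prefix, completion={'field': 'suggest'})
    response = s.execute()
    return [option.text for option in response.suggest.name_suggestions[0].options]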
from django_elasticsearch_dsl import DocType, Index, fields
from elasticsearch_dsl import analyzer, tokenizer

from genres.models import GenreNew

# Name of the Elasticsearch index
genre = Index('genres')

# See Elasticsearch Indices API reference for available settings
genre.settings(
    number_of_shards=1,
    number_of_replicas=0
)

html_strip = analyzer(
    'genre',
    tokenizer=tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
    filter=["lowercase"]
)


@genre.doc_type
class GenreDocument(DocType):
    name = fields.TextField(
        analyzer=html_strip,
        fields={'raw': fields.KeywordField()}
    )

    class Meta:
        model = GenreNew  # The model associated with this DocType
        # queryset_pagination = 50000

        # The fields of the model you want to be indexed in Elasticsearch
        fields = [
            'id'
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from elasticsearch_dsl import DocType, Text, Keyword, analyzer, MetaField, Date
from first import first
from packaging.version import parse as parse_version

from warehouse.search import doc_type


EmailAnalyzer = analyzer(
    "email",
    tokenizer="uax_url_email",
    filter=["standard", "lowercase", "stop", "snowball"],
)

NameAnalyzer = analyzer(
    "normalized_name",
    tokenizer="lowercase",
    filter=["standard", "lowercase", "word_delimiter"],
)


@doc_type
class Project(DocType):
    name = Text()
    normalized_name = Text(analyzer=NameAnalyzer, index_options="docs")
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from elasticsearch_dsl import DocType, Integer, String, Date, Nested, Boolean, analyzer

html_strip = analyzer(
    'html_strip',
    tokenizer="standard",
    filter=["standard", "lowercase", "snowball", "stop"],
    char_filter=["html_strip"]
)


class RecipeIndex(DocType):
    document_id = Integer(index='not_analyzed')
    name = String(analyzer=html_strip)
    preparation_time = Integer(index='not_analyzed')
    ingredients = String(analyzer=html_strip)
    servings = Integer(index='not_analyzed')
    likes = Integer(index='not_analyzed')
    source_text = String()
    slug = String(index='no')
    source_slug = String(index='not_analyzed')
    large_image = String(index='no')
    last_updated = Date(index='not_analyzed')
    courses = String()
    cuisines = String()
    holidays = String()

    class Meta:
from time import sleep

from elasticsearch_dsl.connections import connections
from elasticsearch_dsl import Index, Document, SearchAsYouType, analyzer, Search, Keyword
from elasticsearch_dsl.query import MultiMatch
import json

index = 'odrednica'
serbianAnalyzer = analyzer('serbian')
host = 'localhost'
data = []


class Odrednica(Document):
    pk = Keyword()
    rec = Keyword()
    varijante = SearchAsYouType(analyzer=serbianAnalyzer)
    vrsta = Keyword()


def createIndex():
    connections.create_connection(hosts=[host], timeout=20)
    if not connections.get_connection().indices.exists(index):
        odrednicaIdx = Index(index)
        odrednicaIdx.analyzer(serbianAnalyzer)
        odrednicaIdx.document(Odrednica)
        odrednicaIdx.create()


def saveOdrednica(item):
    varijante = ' '.join(item['varijante'])
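# A minimal search sketch for the SearchAsYouType field above (not part of the
# original module). MultiMatch is already imported; a ``bool_prefix`` query over
# the auto-generated ._2gram/._3gram subfields is the usual way to query
# search_as_you_type fields. Assumes the index has been created and populated
# via createIndex()/saveOdrednica().
def searchOdrednica(text):
    q = MultiMatch(
        query=text,
        type='bool_prefix',
        fields=['varijante', 'varijante._2gram', 'varijante._3gram'],
    )
    return Search(index=index).query(q).execute()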
fr_stem_filter = token_filter(
    'fr_stem_filter',
    type='stemmer',
    language='minimal_french')

# Deal with French specific aspects.
fr_elision = token_filter(
    'fr_elision',
    type='elision',
    articles=[
        'l', 'm', 't', 'qu', 'n', 's', 'j', 'd', 'c',
        'jusqu', 'quoiqu', 'lorsqu', 'puisqu'
    ]
)

# Language-related analyzers.
de_analyzer = analyzer(
    'de_analyzer',
    tokenizer=tokenizer('icu_tokenizer'),
    filter=['icu_folding', 'icu_normalizer', de_stop_filter, de_stem_filter],
    char_filter=[char_filter('html_strip')]
)

en_analyzer = analyzer(
    'en_analyzer',
    tokenizer=tokenizer('icu_tokenizer'),
    filter=['icu_folding', 'icu_normalizer', en_stop_filter, en_stem_filter],
    char_filter=[char_filter('html_strip')]
)

es_analyzer = analyzer(
    'es_analyzer',
    tokenizer=tokenizer('icu_tokenizer'),
    filter=['icu_folding', 'icu_normalizer', es_stop_filter, es_stem_filter],
    char_filter=[char_filter('html_strip')]
synonym_tokenfilter = token_filter(
    'synonym_tokenfilter',
    'synonym',
    synonyms=[
        'reactjs, react',  # <-- important
    ],
)

text_analyzer = analyzer(
    'text_analyzer',
    tokenizer='standard',
    filter=[
        # The ORDER is important here.
        'standard',
        'lowercase',
        'stop',
        synonym_tokenfilter,
        # Note! 'snowball' comes after 'synonym_tokenfilter'
        'snowball',
    ],
    char_filter=['html_strip'])


class Web(Document):
    url = Keyword()
    domain = Keyword()
    homepage = Text()
    created_date = Date()
    last_updated = Date()
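# An illustrative check of the filter ordering above (not part of the original
# module). Assumes ``Analyzer.simulate()`` is available and a default connection
# is configured. Because the synonym filter runs before 'snowball', text
# containing "reactjs" should produce the same stemmed tokens as text
# containing "react".
def _show_synonym_tokens():
    print([t.token for t in text_analyzer.simulate('ReactJS tutorial').tokens])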
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from elasticsearch_dsl import DocType, String, analyzer, MetaField

from warehouse.search import doc_type


EmailAnalyzer = analyzer(
    "email",
    tokenizer="uax_url_email",
    filter=["standard", "lowercase", "stop", "snowball"],
)


@doc_type
class Project(DocType):
    name = String()
    version = String(index="not_analyzed", multi=True)
    summary = String(analyzer="snowball")
    description = String(analyzer="snowball")
    author = String()
    author_email = String(analyzer=EmailAnalyzer)
    maintainer = String()
    maintainer_email = String(analyzer=EmailAnalyzer)
reHeader = re.compile(r"^(.*?):\s*(.*)$")
tz = get_localzone()


def parse_header(header):
    # TODO: support for multiline headers
    match = reHeader.search(header)
    if match:
        return {'name': match.group(1), 'value': match.group(2)}
    else:
        raise ValueError("No header matched")


identifierAnalyzer = analyzer("identifier", tokenizer="keyword", filter=["lowercase"])


class DocHTTPRequestResponse(DocType):
    class Meta:
        doc_type = 'HTTPRequestResponse'

    timestamp = Date()
    protocol = Text()
    host = Keyword()
    port = Integer()
    request = Object(
        properties={
            'method': Keyword(),
russian_stemmer_filter = analysis.token_filter('russian_stemmer', type='stemmer', language='russian')
english_stop_filter = analysis.token_filter('english_stop', type='stop', stopwords='_english_')
english_stemmer_filter = analysis.token_filter('english_stemmer', type='stemmer', language='english')
english_possessive_stemmer_filter = analysis.token_filter(
    'english_stemmer', type='stemmer', language='possessive_english')

# Create the analyzers.
ru_analyzer = analyzer(
    'ru_analyzer',
    type='custom',
    tokenizer='standard',
    filter=['lowercase', russian_stop_filter, russian_stemmer_filter],
)

en_analyzer = analyzer(
    'en_analyzer',
    type='custom',
    tokenizer='standard',
    filter=[
        english_possessive_stemmer_filter,
        'lowercase',
        english_stop_filter,
        english_stemmer_filter
    ])

# Register the analyzers on the index.
movie_index.analyzer(ru_analyzer)
movie_index.analyzer(en_analyzer)


@movie_index.doc_type
from django_elasticsearch_dsl import Document, Index, fields
from elasticsearch_dsl import analyzer
from django_elasticsearch_dsl.registries import registry

from .models import Product

products_index = Index("products")
products_index.settings(number_of_shards=1, number_of_replicas=1)

html_strip = analyzer(
    "html_strip",
    tokenizer="standard",
    filter=["standard", "lowercase", "stop", "snowball"],
    char_filter=["html_strip"],
)


# @registry.register_document
@products_index.doc_type
class ProductDocument(Document):
    # id = fields.IntegerField(attr='id')
    # title = fields.StringField(
    #     analyzer=html_strip,
    #     fields={
    #         'raw': fields.StringField(analyzer='keyword'),
    #     }
    # )
    # description = fields.TextField(
    #     analyzer=html_strip,
    #     fields={
    #         'raw': fields.TextField(analyzer='keyword'),
import os

from elasticsearch_dsl import (Index, tokenizer, analyzer)
from pprint import pprint

movie_index: Index = Index(os.environ.get('ES_INDEX', 'moovie'))
movie_index.settings(number_of_shards=5, number_of_replicas=1)

completion_analyzer = analyzer(
    'completion_analyzer',
    tokenizer=tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
    filter=['lowercase'])

normalization_analyzer = analyzer(
    'normalization_analyzer',
    tokenizer="standard",
    filter=["lowercase", "stop", "snowball"],
    char_filter=["html_strip"])

movie_index.analyzer(normalization_analyzer)


def init_index():
    if not movie_index.exists():
        movie_index.create()


def destroy_index():
    if movie_index.exists():
        movie_index.delete(ignore=404)
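# A small end-to-end sketch (not part of the original module): registering the
# completion analyzer as well and rebuilding the index so both custom analyzers
# are available to mappings. Assumes a default connection has been configured
# elsewhere, e.g. via elasticsearch_dsl.connections.create_connection().
movie_index.analyzer(completion_analyzer)


def rebuild_index():
    destroy_index()
    init_index()
    # The analysis settings that are sent on create can be inspected locally:
    pprint(movie_index.to_dict().get('settings', {}).get('analysis'))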