def test_unchanged_mapping_is_not_updated(write_client):
    m = mapping.Mapping('test-type')
    m.field('name', 'string', analyzer=analysis.analyzer(
        "my_analyzer",
        tokenizer="standard",
        filter=[
            token_filter("simple_edge", type="edgeNGram", min_gram=2, max_gram=3)
        ]
    ))
    m.save('test-mapping', using=write_client)
    # this should not trigger an error since the mapping didn't change
    m.save('test-mapping', using=write_client)

    # change the mapping just a little bit
    m.field('name', 'string', analyzer=analysis.analyzer(
        "my_analyzer",
        tokenizer="standard",
        filter=[
            token_filter("simple_edge", type="edgeNGram", min_gram=2, max_gram=4)  # changed from 3 to 4
        ]
    ))

    with raises(exceptions.IllegalOperation):
        m.save('test-mapping', using=write_client)
def url_ngram_analyzer():
    """
    An analyzer for creating URL safe n-grams.

    Returns:
        Analyzer
    """
    return analyzer(
        'url_ngram',
        tokenizer='keyword',
        filter=[
            'lowercase',
            # Strip the protocol, we don't need it for searching.
            token_filter(
                'url_protocol_filter',
                type='pattern_replace',
                pattern=r'^\w+:\/\/',
                replace='',
            ),
            'word_delimiter',
            # Create trigrams from the address.
            token_filter(
                'url_ngram_filter',
                type='ngram',
                min_gram=3,
                max_gram=3,
            ),
        ],
    )
def configure_index(idx):
    """Configure ES index settings.

    NOTE: This is unused at the moment. Current issues:

    1. The index needs to be created (index.create() or search_index --create)
       setting update_all_types=True because the attribute name is the same in
       Person and Company.
       https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.client.IndicesClient.create

           name = fields.TextField(attr="fullname", analyzer=lb_analyzer)

    2. How to specify a token filter for an attribute?

    Therefore the index needs to be configured outside Django.
    """
    idx.settings(number_of_shards=1, number_of_replicas=0)
    lb_filter = token_filter(
        "lb_filter",
        "stop",
        stopwords=["i"]
    )
    lb_analyzer = analyzer(
        "lb_analyzer",
        tokenizer="standard",
        # pass the filter object (not the string "lb_filter") so its
        # definition is included in the index settings
        filter=["standard", lb_filter, "asciifolding", "lowercase"]
    )
    return lb_analyzer, lb_filter
def get_text_analyzer(language: str) -> Analyzer:
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html
    # According to https://discuss.elastic.co/t/extend-built-in-analyzers/134778/7 we do have to copy and paste
    stop = token_filter("stop", "stop", stopwords="_" + language + "_")
    stemmer = token_filter("stemmer", "stemmer", language=language)
    unique_stem = token_filter("unique_stem", "unique", only_on_same_position=True)

    # This seems to be kinda patchwork in elastic itself
    if language == "german":
        # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html#german-analyzer
        # We can't use german_normalization here because that breaks with our keyword_repeat/unique_stem logic
        filters = ["keyword_repeat", "lowercase", stop, stemmer, unique_stem]
    elif language == "english":
        # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html#english-analyzer
        english_possessive_stemmer = token_filter(
            "english_possessive_stemmer", "stemmer", language="possessive_english"
        )
        filters = [
            "keyword_repeat",
            english_possessive_stemmer,
            "lowercase",
            stop,
            stemmer,
            unique_stem,
        ]
    else:
        filters = ["keyword_repeat", "lowercase", stop, stemmer, unique_stem]

    return analyzer("text_analyzer", tokenizer="standard", filter=filters)
class ArabicAnalyzer(Analyzer):
    token_filters = [
        'lowercase',
        'arabic_normalization',
        token_filter('arabic_stop', type='stop', stopwords='_arabic_'),
        token_filter('arabic_stemmer', type='stemmer', language='arabic'),
    ]

    def __init__(self):
        self.analyzer = analyzer('arabic', tokenizer='standard', filter=self.token_filters)
def init_index(cls):
    ''' Class method to init index '''
    # default analyzer
    shingle_filter = token_filter(
        'shingle_filter',
        type='shingle',
        min_shingle_size=2,
        max_shingle_size=3,
    )
    default_analyzer = analyzer(
        'default',
        tokenizer='standard',
        char_filter=['html_strip'],
        filter=['lowercase', 'asciifolding', shingle_filter]
    )

    # set the analyzers for the available languages
    # TODO: languages and languages_stopwords should be in settings
    languages = ('es', 'en')
    languages_stopwords = {
        'en': '_english_',
        'es': '_spanish_',
    }
    languages_analyzers = {}
    languages_filters = {}
    for language in languages:
        languages_filters[language] = token_filter(
            language + '_filter',
            type='stop',
            stopwords=languages_stopwords[language],
        )
        languages_analyzers[language] = analyzer(
            language + '_analyzer',
            tokenizer='standard',
            char_filter=['html_strip'],
            filter=['lowercase', 'asciifolding', languages_filters[language]]
        )

    # Add analyzers, the index has to be closed before any configuration
    searches_index = Index('searches')
    # default analyzer
    searches_index.analyzer(default_analyzer)
    # languages search analyzers
    for language in languages:
        searches_index.analyzer(languages_analyzers[language])
    searches_index.save()

    # create the mappings in elasticsearch
    cls.init()
class Filter:
    english_stop = token_filter("english_stop", type="stop", stopwords="_english_")
    # uses the default porter stemmer
    english_stemmer = token_filter("english_stemmer", type="stemmer", language="english")
    english_possessive_stemmer = token_filter(
        "english_possessive_stemmer", type="stemmer", language="possessive_english"
    )
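A minimal sketch of how these filters are commonly chained into one analyzer, following the usual rebuilt-English recipe from the Elasticsearch docs (possessive stemmer, then lowercase, stop, stemmer); the analyzer name `english_custom` is an assumption, not part of the original snippet:

from elasticsearch_dsl import analyzer

english_custom = analyzer(
    "english_custom",  # hypothetical name, not defined in the original class
    tokenizer="standard",
    filter=[
        Filter.english_possessive_stemmer,  # strip trailing 's before the other steps
        "lowercase",
        Filter.english_stop,
        Filter.english_stemmer,
    ],
)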
def get_edge_ngram_filter() -> token_filter:
    return token_filter(
        "custom_edge_ngram_filter",
        type="edge_ngram",
        min_gram=3,
        max_gram=15,
    )
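A minimal usage sketch, assuming the edge n-gram filter is applied only at index time so search terms are not n-grammed again; the analyzer and field names are illustrative:

from elasticsearch_dsl import Text, analyzer

edge_ngram_indexing = analyzer(
    "edge_ngram_indexing",  # hypothetical name
    tokenizer="standard",
    filter=["lowercase", get_edge_ngram_filter()],
)

# index with n-grams, search with the plain standard analyzer
title = Text(analyzer=edge_ngram_indexing, search_analyzer="standard")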
def _get_locale_specific_analyzer(locale):
    """Get an analyzer for locales specified in config, otherwise return `None`."""
    locale_analyzer = config.ES_LOCALE_ANALYZERS.get(locale)

    if locale_analyzer:
        if not settings.ES_USE_PLUGINS and locale_analyzer in settings.ES_PLUGIN_ANALYZERS:
            return None
        return analyzer(locale, type=locale_analyzer)

    snowball_language = config.ES_SNOWBALL_LOCALES.get(locale)
    if snowball_language:
        # The locale is configured to use a snowball filter
        token_name = "snowball_{}".format(locale.lower())
        snowball_filter = token_filter(token_name, type="snowball", language=snowball_language)

        # Use the language-specific snowball filter with the standard analyzer.
        # The standard analyzer is basically an analyzer with the standard tokenizer
        # and the standard, lowercase and stop filters.
        locale_analyzer = analyzer(
            locale,
            tokenizer="standard",
            filter=["lowercase", "stop", snowball_filter],
            char_filter=["html_strip"],
        )
        return locale_analyzer
def gen_name_analyzer_excluding_terms(excluding_terms):
    """Creates a name analyzer that only returns excluding terms (ET).

    For example, if the ET configuration file contains the following rules:

        santa, salta, santo
        caba, cba

    then applying the analyzer to the query 'salta' should return 'santa' and
    'santo', while searching for 'caba' should return 'cba'.

    The analyzer is used to exclude results from specific searches.

    Args:
        excluding_terms (list): List of ET to use, specified as Solr synonyms.

    Returns:
        elasticsearch_dsl.analysis.Analyzer: text analyzer named
            'name_analyzer_excluding_terms'.

    """
    name_excluding_terms_filter = token_filter(
        'name_excluding_terms_filter',
        type='synonym',
        synonyms=excluding_terms
    )

    return analyzer(name_analyzer_excluding_terms, tokenizer='standard', filter=[
        'lowercase',
        'asciifolding',
        name_excluding_terms_filter,
        synonyms_only_filter,
        spanish_stopwords_filter
    ])
def email_main_analyzer():
    """
    An analyzer for creating "words" from email addresses.

    This analyzer splits email addresses on special characters. For example,
    [email protected] would become [john, doe, crm, example, com]. These
    tokens, when combined with ngrams, provide nice fuzzy matching while
    boosting full word matches.

    Returns:
        Analyzer: An analyzer suitable for analyzing email addresses.
    """
    return analyzer(
        'email',
        # We tokenize with token filters, so use the no-op keyword tokenizer.
        tokenizer='keyword',
        filter=[
            'lowercase',
            # Split the email address on special characters.
            token_filter(
                'email_word_delimiter',
                type='word_delimiter',
                # Ensure words like hello2lily are kept as one token.
                split_on_numerics=False,
            ),
        ],
    )
def gen_name_analyzer_synonyms(synonyms):
    """Creates a name analyzer with synonyms.

    Args:
        synonyms (list): List of synonyms to use, in Solr format.

    Returns:
        elasticsearch_dsl.analysis.Analyzer: text analyzer named
            'name_analyzer_synonyms'.

    """
    name_synonyms_filter = token_filter(
        'name_synonyms_filter',
        type='synonym',
        synonyms=synonyms
    )

    return analyzer(
        name_analyzer_synonyms,
        tokenizer='standard',
        filter=[
            'lowercase',
            'asciifolding',
            name_synonyms_filter,
            spanish_stopwords_filter
        ]
    )
def suggest():
    suggestfilter = token_filter("suggestfilter", type="ngram", **other_param)
    bestsuggest = analyzer(
        "bestsuggest",
        tokenizer="standard",
        filter=["lowercase", suggestfilter, "asciifolding"],
    )
    return bestsuggest
def mapping_func(position_filter_tuple):
    position, filter = position_filter_tuple
    if type(filter) is dict:
        name = f'{locale}_{position}_{filter["type"]}'
        if char:
            return char_filter(name, **filter)
        return token_filter(name, **filter)
    return filter
def get_text_analyzer():
    return analyzer(
        'fds_analyzer',
        tokenizer='standard',
        filter=[
            'keyword_repeat',
            token_filter('decomp', type='decompound', subwords_only=True),
            'lowercase',
            token_filter('stop_de', type='stop', stopwords="_german_"),
            'german_normalization',
            'asciifolding',
            token_filter('de_stemmer', type='stemmer', name='light_german'),
            token_filter('unique_stem', type='unique', only_on_same_position=True),
        ],
    )
def test_simulate_complex(client):
    a = analyzer(
        'my-analyzer',
        tokenizer=tokenizer('split_words', 'simple_pattern_split', pattern=':'),
        filter=['lowercase', token_filter('no-ifs', 'stop', stopwords=['if'])],
    )

    tokens = a.simulate('if:this:works', using=client).tokens

    assert len(tokens) == 2
    assert ['this', 'works'] == [t.token for t in tokens]
def get_autocomplete_analyzer():
    autocomplete_filter = token_filter(
        "autocomplete_filter", "edge_ngram", min_gram=1, max_gram=20
    )

    # Using this analyzer with an empty field fails, so we're using methods instead that add a space
    return analyzer(
        "autocomplete", tokenizer="standard", filter=["lowercase", autocomplete_filter]
    )
def _create_synonym_graph_filter(synonym_file_name):
    filter_name = f"{synonym_file_name}_synonym_graph"
    return token_filter(
        filter_name,
        type="synonym_graph",
        synonyms_path=f"synonyms/{synonym_file_name}.txt",
        # we must use "true" instead of True to work around an elasticsearch-dsl bug
        expand="true",
        lenient="true",
        updateable="true",
    )
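Because the filter is marked updateable, Elasticsearch only accepts it in a search-time analyzer, so the index-time analyzer stays plain. A wiring sketch under that assumption; the file name "products", the analyzer name, and the field are illustrative:

from elasticsearch_dsl import Text, analyzer

product_synonym_search = analyzer(
    "product_synonym_search",  # hypothetical name
    tokenizer="standard",
    filter=["lowercase", _create_synonym_graph_filter("products")],
)

# plain analyzer at index time, synonym expansion only at query time
name = Text(analyzer="standard", search_analyzer=product_synonym_search)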
def _analyzer(synonyms):
    syn_filter = token_filter(
        'country_syn',
        type='synonym',
        tokenizer='keyword',
        synonyms=synonyms,
    )
    return analyzer(
        'country_analyzer',
        tokenizer='lowercase',
        filter=[syn_filter],
    )
def get_search_quote_analyzer():
    return analyzer(
        "fds_search_quote_analyzer",
        tokenizer="standard",
        filter=[
            "keyword_repeat",
            "lowercase",
            "german_normalization",
            "asciifolding",
            token_filter("de_stemmer", type="stemmer", name="light_german"),
            "remove_duplicates",
        ],
    )
def phone_number_analyzer():
    """
    An analyzer to do complex partial matching on phone numbers.

    Returns:
        Analyzer: An analyzer suitable for searching phone numbers.
    """
    return analyzer(
        'phone_number',
        # We only want n-grams, which we want to create as late as possible.
        tokenizer='keyword',
        filter=[
            # Strip all special chars, don't tokenize.
            token_filter(
                'phone_word_delimiter',
                type='word_delimiter',
                generate_word_parts=False,
                generate_number_parts=False,
                catenate_all=True,
            ),
            # Strip any zeros from the start of the number.
            token_filter(
                'leading_zero_filter',
                type='pattern_replace',
                pattern='^(0+)',
                replace='',
            ),
            # Create n-grams of all lengths to support partial matching.
            token_filter(
                'phone_ngram_filter',
                type='ngram',
                # Still undecided on whether this should be 3 or 4.
                # 3 means users have to type fewer digits to get results,
                # but the matching is less accurate.
                min_gram=3,
                max_gram=32,
            ),
        ],
    )
def email_ngram_analyzer():
    """
    An analyzer for creating email safe ngrams.

    This analyzer first splits the local part and domain name, then creates
    n-grams (overlapping fragments) from the remaining strings, minus any
    special characters.

    Returns:
        Analyzer: An analyzer suitable for analyzing email addresses.
    """
    return analyzer(
        'email_ngram',
        # Split the email address at the @ sign.
        tokenizer=tokenizer(
            'at_sign_tokenizer',
            type='pattern',
            pattern='@',
        ),
        filter=[
            'lowercase',
            # Strip any special characters from the email address.
            token_filter(
                'email_ngram_word_delimiter',
                type='word_delimiter',
                split_on_numerics=False,
                catenate_all=True,
            ),
            # Create trigrams from the address.
            token_filter(
                'email_ngram_filter',
                type='ngram',
                min_gram=3,
                max_gram=3,
            ),
        ],
    )
def _analyzer(self, species, dump=True):
    autophrase_syns, syns = self._synonyms(species)
    if dump:
        with open('autophrase_syns.txt', 'w') as f:
            f.writelines(l + '\n' for l in autophrase_syns)
        with open('syns.txt', 'w') as f:
            f.writelines(l + '\n' for l in syns)

    autophrase_filter = token_filter(
        'species_autophrase_syn',
        type='synonym',
        synonyms=autophrase_syns,
    )
    syn_filter = token_filter(
        'species_syn',
        type='synonym',
        tokenizer='keyword',
        synonyms=syns,
    )

    return analyzer(
        'species_analyzer',
        tokenizer='lowercase',
        filter=[autophrase_filter, syn_filter],
    )
def mapping_func(position_filter_tuple):
    position, filter = position_filter_tuple
    if type(filter) is dict:
        prefix = analyzer_name
        default_filters = config.ES_DEFAULT_ANALYZER["char_filter" if char else "filter"]

        if filter in default_filters:
            # detect if this filter exists in the default analyzer
            # if it does, use the same name as the default
            # to avoid defining the same filter for each locale
            prefix = config.ES_DEFAULT_ANALYZER_NAME
            position = default_filters.index(filter)

        name = f'{prefix}_{position}_{filter["type"]}'
        if char:
            return char_filter(name, **filter)
        return token_filter(name, **filter)
    return filter
def add_analyzer(index: Index):
    """Adds a new analyzer to the index, available to all of its fields.

    The analyzer applies lower case + ASCII folding: it removes accents and
    the use of ñ, among other things, to allow searching Spanish text.
    """
    synonyms = list(Synonym.objects.values_list('terms', flat=True))
    filters = ['lowercase', 'asciifolding']
    if synonyms:
        filters.append(
            token_filter(constants.SYNONYM_FILTER, type='synonym', synonyms=synonyms))

    index.analyzer(
        analyzer(constants.ANALYZER, tokenizer='standard', filter=filters))
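A minimal call-site sketch, assuming a Django context where the Synonym model is importable; the analyzer has to be registered before the index is created so it ends up in the index settings. The index name 'series' is made up:

from elasticsearch_dsl import Index

idx = Index('series')  # hypothetical index name
add_analyzer(idx)
idx.create()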
def url_main_analyzer():
    """
    An analyzer for creating "words" from URLs.

    Returns:
        Analyzer
    """
    return analyzer(
        'url',
        tokenizer='keyword',
        filter=[
            'lowercase',
            # Strip the protocol, we don't need it for searching.
            token_filter(
                'url_protocol_filter',
                type='pattern_replace',
                pattern=r'^\w+:\/\/',
                replace='',
            ),
            'word_delimiter',
        ],
    )
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

from elasticsearch_dsl import analyzer, tokenizer, token_filter, char_filter

# Custom filters for analyzers.
de_stop_filter = token_filter(
    'de_stop_filter', type='stop', stopwords='_german_')

de_stem_filter = token_filter(
    'de_stem_filter', type='stemmer', language='minimal_german')

en_stop_filter = token_filter(
    'en_stop_filter', type='stop', stopwords='_english_')

en_stem_filter = token_filter(
    'en_stem_filter', type='stemmer', language='minimal_english')

es_stop_filter = token_filter(
    'es_stop_filter', type='stop', stopwords='_spanish_')

es_stem_filter = token_filter(
    'es_stem_filter', type='stemmer', language='light_spanish')

pt_stop_filter = token_filter(
    'pt_stop_filter', type='stop', stopwords='_portuguese_')

pt_stem_filter = token_filter(
    'pt_stem_filter', type='stemmer', language='light_portuguese')

fr_stop_filter = token_filter(
    'fr_stop_filter', type='stop', stopwords='_french_')

fr_stem_filter = token_filter(
    'fr_stem_filter', type='stemmer', language='minimal_french')

# Deal with French specific aspects.
token_filter(
    "addresses_stopwords",
    type="stop",
    stopwords=[
        "будинок", "обл", "район", "вулиця", "місто", "м", "квартира", "вул",
        "село", "буд", "кв", "проспект", "область", "селище", "міського",
        "типу", "офіс", "н", "р", "б", "с", "провулок", "корпус", "бульвар",
        "кімната", "шосе", "в", "смт", "просп", "№",
    ],
),
import elasticsearch_dsl as dsl

from resolwe.elastic.indices import BaseDocument, BaseIndex

from .models import Feature, Mapping

# pylint: disable=invalid-name
# Analyzer for feature identifiers and names, used during boosting.
identifier_analyzer = dsl.analyzer('identifier_analyzer', tokenizer='keyword', filter=['lowercase'])

# During indexing, we lowercase terms and tokenize using edge_ngram.
autocomplete_analyzer = dsl.analyzer(
    'autocomplete_index',
    tokenizer='keyword',
    filter=[
        'lowercase',
        dsl.token_filter('autocomplete_filter', type='edgeNGram', min_gram=1, max_gram=15)
    ],
)

# During search, we only lowercase terms.
autocomplete_search_analyzer = dsl.analyzer('autocomplete_search', tokenizer='keyword', filter=['lowercase'])
# pylint: enable=invalid-name


class FeatureSearchDocument(BaseDocument):
    """Index for feature search."""

    # pylint: disable=no-member
    source = dsl.Keyword()
    feature_id = dsl.Keyword(
        # Additional subfield used for boosting during autocomplete.
        fields={
# override the default analyzer for ES to use an ngram filter that breaks words using
# the standard tokenizer. Allow words to be broken up with underscores
name = analyzer(
    "name",
    # the standard analyzer splits the words nicely by default
    tokenizer=tokenizer("standard"),
    filter=[
        # technically, the standard filter doesn't do anything but we include
        # it anyway just in case ES decides to make use of it
        "standard",
        # obviously, lowercasing the tokens is a good thing
        "lowercase",
        # this emulates a 3-4 ngram, but also includes the whole token itself
        # (which prevents us from having to create multifields)
        token_filter("simple_edge", type="pattern_capture", patterns=["(?=(...))(?=(....))"]),
    ],
)


class ReportIndex(Index):
    category = StringField(
        attr="category.name",
        # need a non_analyzed field for sorting
        fields={"raw": StringField(index="not_analyzed")},
    )

    category_id = IntegerField(attr="category.pk")

    species = StringField(
        attr="species.name",
        # need a non_analyzed field for sorting
from django_elasticsearch_dsl import DocType, Index, fields
from elasticsearch_dsl import analyzer, token_filter

from ..account.models import User
from ..order.models import Order
from ..product.models import Product

storefront = Index('storefront')
storefront.settings(number_of_shards=1, number_of_replicas=0)

partial_words = token_filter(
    'partial_words', 'edge_ngram', min_gram=3, max_gram=15)
title_analyzer = analyzer(
    'title_analyzer',
    tokenizer='standard',
    filter=[partial_words, 'lowercase'])
email_analyzer = analyzer('email_analyzer', tokenizer='uax_url_email')


@storefront.doc_type
class ProductDocument(DocType):
    title = fields.StringField(analyzer=title_analyzer)

    def prepare_title(self, instance):
        return instance.name

    class Meta:
        model = Product
        fields = ['name', 'description', 'is_published']
def autocomplete():
    autocompletefilter = token_filter("autocompletefilter", type="edge_ngram", **other_param)
    autocomplete = analyzer(
        "autocomplete",
        tokenizer="standard",
        filter=["lowercase", autocompletefilter],
    )
    return autocomplete
    Document,
    Float,
    Integer,
    Keyword,
    Text,
    analyzer,
    token_filter,
)

edge_ngram_analyzer = analyzer(
    "edge_ngram_analyzer",
    type="custom",
    tokenizer="standard",
    filter=[
        "lowercase",
        token_filter("edge_ngram_filter", type="edgeNGram", min_gram=1, max_gram=20),
    ],
)


class PodcastDoc(Document):
    id = Keyword(required=True)
    thumbnail_348 = Keyword()
    thumbnail_160 = Keyword()
    times_picked = Integer()
    episodes_count = Integer()
    episodes_seconds = Float()
    slug = Keyword(required=True, index=False)
    name = Text(required=True, analyzer=edge_ngram_analyzer, search_analyzer="standard")
    link = Keyword()
    subtitle = Text()
    | (?<=\D)(?=\d)   # or non-number followed by number,
    | (?<=\d)(?=\D)   # or number followed by non-number,
    ''',
    flags='CASE_INSENSITIVE|COMMENTS',
    lowercase=True,
)

# During indexing, we lowercase terms and tokenize using edge_ngram.
ngrams_analyzer = dsl.analyzer(
    'ngrams_index',
    tokenizer='standard',
    filter=[
        'lowercase',
        dsl.token_filter(
            'ngrams_filter',
            type='edgeNGram',
            min_gram=1,
            max_gram=15,
        ),
    ],
)

# During search, we only lowercase terms.
ngrams_search_analyzer = dsl.analyzer(
    'ngrams_search',
    tokenizer='standard',
    filter=['lowercase'],
)
# pylint: enable=invalid-name


class RawKeywordSubfieldMixin:
    """String field with a 'raw' subfield (e.g. for sorting)."""
    Text,
    Index,
    analyzer,
    Keyword,
    token_filter,
)
from django.conf import settings

edge_ngram_analyzer = analyzer(
    'edge_ngram_analyzer',
    type='custom',
    tokenizer='standard',
    filter=[
        'lowercase',
        token_filter('edge_ngram_filter', type='edgeNGram', min_gram=1, max_gram=20)
    ]
)


class TitleDoc(DocType):
    id = Keyword()
    domain = Keyword(required=True)
    url = Keyword(required=True, index=False)
    title = Text(required=True, analyzer=edge_ngram_analyzer, search_analyzer='standard')
    popularity = Float()
    group = Keyword()
from elasticsearch_dsl import DocType, String, token_filter, analyzer
from django.conf import settings

__author__ = 'erhmutlu'

turkish_stop = token_filter('turkish_stop', type='stop', stopwords="_turkish_")
turkish_lowercase = token_filter('turkish_lowercase', type='lowercase', language="turkish")
turkish_stemmer = token_filter('turkish_stemmer', type='stemmer', language='turkish')
custom_shingle_filter = token_filter('custom_shingle_filter', type='shingle', max_shingle_size=3,
                                     min_shingle_size=2, output_unigrams=True)

entity_synonym_index_analyzer = analyzer('entity_synonym_index_analyzer', tokenizer='keyword',
                                         filter=[turkish_lowercase, 'asciifolding', turkish_stemmer])
entity_synonym_search_analyzer = analyzer('entity_synonym_search_analyzer', tokenizer='standard',
                                          filter=[turkish_lowercase, 'apostrophe', 'asciifolding',
                                                  custom_shingle_filter, turkish_stemmer])


class Entity(DocType):
    entity_synonyms = String(index_analyzer=entity_synonym_index_analyzer,
                             search_analyzer=entity_synonym_search_analyzer, include_in_all=True)
    entity_key = String(index='not_analyzed', include_in_all=False)
    value = String(index='not_analyzed', include_in_all=False)

    @classmethod
    def _get_index(self, index=None):
        return settings.ELASTICSEARCH_INDEX

    @classmethod
    def _get_doctype(self):
from elasticsearch_dsl import DocType, String, token_filter, analyzer
from django.conf import settings

__author__ = 'erhmutlu'

turkish_stop = token_filter('turkish_stop', type='stop', stopwords="_turkish_")
turkish_lowercase = token_filter('turkish_lowercase', type='lowercase', language="turkish")
turkish_stemmer = token_filter('turkish_stemmer', type='stemmer', language='turkish')

turkish_whitespace_analyzer = analyzer('turkish_whitespace_analyzer', tokenizer='whitespace',
                                       filter=['apostrophe', 'asciifolding', turkish_lowercase,
                                               turkish_stop, turkish_stemmer])


class Intent(DocType):
    sentence = String(analyzer=turkish_whitespace_analyzer, include_in_all=True)
    original_sentence = String(analyzer='whitespace', include_in_all=False)
    action = String(index='not_analyzed', include_in_all=False)
    params = String(index='not_analyzed', include_in_all=False)

    def dict_with_id(self):
        dict = super(DocType, self).to_dict()
        dict['id'] = self._id
        return dict

    @classmethod
    def _get_index(self, index=None):
        return settings.ELASTICSEARCH_INDEX

    @classmethod
    def _get_doctype(self):
""" from __future__ import print_function, unicode_literals from itertools import permutations from elasticsearch_dsl import connections, Document, Completion, Text, Long, \ Keyword, analyzer, token_filter # custom analyzer for names ascii_fold = analyzer( 'ascii_fold', # we don't want to split O'Brian or Toulouse-Lautrec tokenizer='whitespace', filter=[ 'lowercase', token_filter('ascii_fold', 'asciifolding') ] ) class Person(Document): name = Text(fields={'keyword': Keyword()}) popularity = Long() # copletion field with a custom analyzer suggest = Completion(analyzer=ascii_fold) def clean(self): """ Automatically construct the suggestion input and weight by taking all possible permutation of Person's name as ``input`` and taking their
from itertools import permutations

from elasticsearch_dsl import Document, Integer, Text, Keyword, Completion, analyzer, token_filter, GeoPoint, Date

# custom analyzer for names
ascii_fold = analyzer(
    'ascii_fold',
    # we don't want to split O'Brian or Toulouse-Lautrec
    tokenizer='whitespace',
    filter=[
        'lowercase',
        token_filter('ascii_fold', 'asciifolding')
    ]
)


class Entity(Document):
    project_id = Integer()
    file_id = Integer()
    id = Text()
    name = Text(fields={'keywords': Keyword()})
    suggest = Completion(analyzer=ascii_fold)

    def clean(self):
        """
        Automatically construct the suggestion input and weight by taking all
        possible permutation of Person's name as ``input`` and taking their
        popularity as ``weight``.
        """
        self.suggest = {
            'input': [' '.join(p) for p in permutations(self.name.split())],
# therefore we want to match all and anything.
# E.g. "center" should find "...the center of..." and "...the centre for..."
# But also, should find the same when searching for "centre".
# So, rearrange the ba-ae.synonyms file for what's called
# "Simple expansion".
# https://www.elastic.co/guide/en/elasticsearch/guide/current/synonyms-expand-or-contract.html#synonyms-expansion  # noqa
#
with open(american_british_syns_fn) as f:
    for line in f:
        if "=>" not in line or line.strip().startswith("#"):
            continue
        all_synonyms.append(line.strip())

synonym_tokenfilter = token_filter(
    "synonym_tokenfilter", "synonym", synonyms=all_synonyms
)

edge_ngram_analyzer = analyzer(
    "edge_ngram_analyzer",
    type="custom",
    tokenizer="standard",
    filter=[
        "lowercase",
        token_filter("edge_ngram_filter", type="edgeNGram", min_gram=1, max_gram=20),
    ],
)

text_analyzer = analyzer(
    "text_analyzer",
__author__ = ["Amir Hossein Sorouri"] __copyright__ = "Copyright 2019, DSL-SE" __email__ = ["*****@*****.**"] __license__ = "Apache-2.0" __version__ = "2.0" from . import url from datetime import datetime from elasticsearch_dsl import Document, Date, token_filter \ , DateRange, Keyword, Text, Object, analyzer synonym_tokenfilter = token_filter( 'synonym_tokenfilter', 'synonym', synonyms=[ 'reactjs, react', # <-- important ], ) text_analyzer = analyzer( 'text_analyzer', tokenizer='standard', filter=[ # The ORDER is important here. 'standard', 'lowercase', 'stop', synonym_tokenfilter, # Note! 'snowball' comes after 'synonym_tokenfilter' 'snowball', ],
# override the default analyzer for ES to use an ngram filter that breaks words using
# the standard tokenizer. Allow words to be broken up with underscores
custom_analyzer = analyzer(
    "default",
    # the standard analyzer splits the words nicely by default
    tokenizer=tokenizer("standard"),
    filter=[
        # technically, the standard filter doesn't do anything but we include
        # it anyway just in case ES decides to make use of it
        "standard",
        # unfortunately, underscores are not used to break up words with the
        # standard tokenizer, so we do it ourselves
        token_filter(
            "underscore",
            type="pattern_capture",
            patterns=["([^_]+)"],
        ),
        # obviously, lowercasing the tokens is a good thing
        "lowercase",
        # ngram it up. Might want to change from an edge ngram to just an ngram
        token_filter(
            "simple_edge",
            type="edgeNGram",
            min_gram=2,
            max_gram=3,
        ),
    ],
)
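A hedged registration sketch: because the analyzer above is literally named "default", attaching it to an index makes Elasticsearch use it for every text field that doesn't declare its own analyzer. This assumes an elasticsearch_dsl Index (the original snippet uses a different Index class), and the index name "reports" is made up:

from elasticsearch_dsl import Index

reports = Index("reports")  # hypothetical index name
reports.analyzer(custom_analyzer)
reports.create()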