def test_mapping_can_collect_all_analyzers():
    a1 = analysis.analyzer('my_analyzer1',
        tokenizer='keyword',
        filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])],
    )
    a2 = analysis.analyzer('english')
    a3 = analysis.analyzer('unknown_custom')
    a4 = analysis.analyzer('my_analyzer2',
        tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
        filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])],
    )

    m = mapping.Mapping('article')
    m.field('title', 'string', analyzer=a1,
        fields={
            'english': String(index_analyzer=a2),
            'unknown': String(search_analyzer=a3),
        }
    )
    m.field('comments', Nested(properties={'author': String(index_analyzer=a4)}))

    assert {
        'analyzer': {
            'my_analyzer1': {'filter': ['lowercase', 'my_filter1'], 'tokenizer': 'keyword', 'type': 'custom'},
            'my_analyzer2': {'filter': ['my_filter2'], 'tokenizer': 'trigram', 'type': 'custom'},
        },
        'filter': {
            'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'},
            'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'},
        },
        'tokenizer': {
            'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'},
        },
    } == m._collect_analysis()
def test_custom_analyzer_can_collect_custom_items():
    trigram = analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3)
    my_stop = analysis.token_filter('my_stop', 'stop', stopwords=['a', 'b'])
    umlauts = analysis.char_filter('umlauts', 'pattern_replace', mappings=['ü=>ue'])
    a = analysis.analyzer(
        'my_analyzer',
        tokenizer=trigram,
        filter=['lowercase', my_stop],
        char_filter=['html_strip', umlauts]
    )

    assert a.to_dict() == 'my_analyzer'
    assert {
        'analyzer': {
            'my_analyzer': {
                'type': 'custom',
                'tokenizer': 'trigram',
                'filter': ['lowercase', 'my_stop'],
                'char_filter': ['html_strip', 'umlauts']
            }
        },
        'tokenizer': {'trigram': trigram.get_definition()},
        'filter': {'my_stop': my_stop.get_definition()},
        'char_filter': {'umlauts': umlauts.get_definition()}
    } == a.get_analysis_definition()
def test_mapping_can_collect_multiple_analyzers():
    a1 = analysis.analyzer(
        "my_analyzer1",
        tokenizer="keyword",
        filter=["lowercase", analysis.token_filter("my_filter1", "stop", stopwords=["a", "b"])],
    )
    a2 = analysis.analyzer(
        "my_analyzer2",
        tokenizer=analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3),
        filter=[analysis.token_filter("my_filter2", "stop", stopwords=["c", "d"])],
    )

    m = mapping.Mapping("article")
    m.field("title", "string", analyzer=a1, search_analyzer=a2)
    m.field(
        "text",
        "string",
        analyzer=a1,
        fields={"english": String(analyzer=a1), "unknown": String(analyzer=a1, search_analyzer=a2)},
    )

    assert {
        "analyzer": {
            "my_analyzer1": {"filter": ["lowercase", "my_filter1"], "tokenizer": "keyword", "type": "custom"},
            "my_analyzer2": {"filter": ["my_filter2"], "tokenizer": "trigram", "type": "custom"},
        },
        "filter": {
            "my_filter1": {"stopwords": ["a", "b"], "type": "stop"},
            "my_filter2": {"stopwords": ["c", "d"], "type": "stop"},
        },
        "tokenizer": {"trigram": {"max_gram": 3, "min_gram": 3, "type": "nGram"}},
    } == m._collect_analysis()
def test_custom_analyzer_can_collect_custom_items():
    trigram = analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3)
    my_stop = analysis.token_filter("my_stop", "stop", stopwords=["a", "b"])
    umlauts = analysis.char_filter("umlauts", "pattern_replace", mappings=["ü=>ue"])
    a = analysis.analyzer(
        "my_analyzer",
        tokenizer=trigram,
        filter=["lowercase", my_stop],
        char_filter=["html_strip", umlauts],
    )

    assert a.to_dict() == "my_analyzer"
    assert {
        "analyzer": {
            "my_analyzer": {
                "type": "custom",
                "tokenizer": "trigram",
                "filter": ["lowercase", "my_stop"],
                "char_filter": ["html_strip", "umlauts"],
            }
        },
        "tokenizer": {"trigram": trigram.get_definition()},
        "filter": {"my_stop": my_stop.get_definition()},
        "char_filter": {"umlauts": umlauts.get_definition()},
    } == a.get_analysis_definition()
def test_mapping_can_collect_multiple_analyzers():
    a1 = analysis.analyzer(
        'my_analyzer1',
        tokenizer='keyword',
        filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])],
    )
    a2 = analysis.analyzer(
        'my_analyzer2',
        tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
        filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])],
    )

    m = mapping.Mapping()
    m.field('title', 'text', analyzer=a1, search_analyzer=a2)
    m.field(
        'text', 'text', analyzer=a1,
        fields={
            'english': Text(analyzer=a1),
            'unknown': Keyword(analyzer=a1, search_analyzer=a2),
        }
    )

    assert {
        'analyzer': {
            'my_analyzer1': {'filter': ['lowercase', 'my_filter1'], 'tokenizer': 'keyword', 'type': 'custom'},
            'my_analyzer2': {'filter': ['my_filter2'], 'tokenizer': 'trigram', 'type': 'custom'},
        },
        'filter': {
            'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'},
            'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'},
        },
        'tokenizer': {'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'}},
    } == m._collect_analysis()
def test_conditional_token_filter():
    a = analysis.analyzer(
        "my_cond",
        tokenizer=analysis.tokenizer("keyword"),
        filter=[
            analysis.token_filter(
                "testing",
                "condition",
                script={"source": "return true"},
                filter=[
                    "lowercase",
                    analysis.token_filter("en", "snowball", language="English"),
                ],
            ),
            "stop",
        ],
    )

    assert {
        "analyzer": {
            "my_cond": {
                "filter": ["testing", "stop"],
                "tokenizer": "keyword",
                "type": "custom",
            }
        },
        "filter": {
            "en": {"language": "English", "type": "snowball"},
            "testing": {
                "script": {"source": "return true"},
                "filter": ["lowercase", "en"],
                "type": "condition",
            },
        },
    } == a.get_analysis_definition()
def test_mapping_can_collect_all_analyzers():
    a1 = analysis.analyzer('my_analyzer1',
        tokenizer='keyword',
        filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])],
    )
    a2 = analysis.analyzer('english')
    a3 = analysis.analyzer('unknown_custom')
    a4 = analysis.analyzer('my_analyzer2',
        tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
        filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])],
    )

    m = mapping.Mapping('article')
    m.field('title', 'string', analyzer=a1,
        fields={
            'english': String(analyzer=a2),
            'unknown': String(analyzer=a3),
        }
    )
    m.field('comments', Nested(properties={'author': String(analyzer=a4)}))

    assert {
        'analyzer': {
            'my_analyzer1': {'filter': ['lowercase', 'my_filter1'], 'tokenizer': 'keyword', 'type': 'custom'},
            'my_analyzer2': {'filter': ['my_filter2'], 'tokenizer': 'trigram', 'type': 'custom'},
        },
        'filter': {
            'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'},
            'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'},
        },
        'tokenizer': {
            'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'},
        },
    } == m._collect_analysis()
def test_custom_analyzer_can_collect_custom_items():
    trigram = analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3)
    my_stop = analysis.token_filter('my_stop', 'stop', stopwords=['a', 'b'])
    umlauts = analysis.char_filter('umlauts', 'pattern_replace', mappings=['ü=>ue'])
    a = analysis.analyzer('my_analyzer',
        tokenizer=trigram,
        filter=['lowercase', my_stop],
        char_filter=['html_strip', umlauts])

    assert a.to_dict() == 'my_analyzer'
    assert {
        'analyzer': {
            'my_analyzer': {
                'type': 'custom',
                'tokenizer': 'trigram',
                'filter': ['lowercase', 'my_stop'],
                'char_filter': ['html_strip', 'umlauts']
            }
        },
        'tokenizer': {'trigram': trigram.get_definition()},
        'filter': {'my_stop': my_stop.get_definition()},
        'char_filter': {'umlauts': umlauts.get_definition()}
    } == a.get_analysis_definition()
def test_tokenizer():
    t = analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3)

    assert t.to_dict() == "trigram"
    assert {"type": "nGram", "min_gram": 3, "max_gram": 3} == t.get_definition()
def test_tokenizer():
    t = analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3)

    assert t.to_dict() == 'trigram'
    assert {'type': 'nGram', 'min_gram': 3, 'max_gram': 3} == t.get_definition()
def test_mapping_can_collect_all_analyzers_and_normalizers():
    a1 = analysis.analyzer('my_analyzer1',
        tokenizer='keyword',
        filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])],
    )
    a2 = analysis.analyzer('english')
    a3 = analysis.analyzer('unknown_custom')
    a4 = analysis.analyzer('my_analyzer2',
        tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
        filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])],
    )
    a5 = analysis.analyzer('my_analyzer3', tokenizer='keyword')
    n1 = analysis.normalizer('my_normalizer1', filter=['lowercase'])
    n2 = analysis.normalizer('my_normalizer2',
        filter=['my_filter1', 'my_filter2', analysis.token_filter('my_filter3', 'stop', stopwords=['e', 'f'])]
    )
    n3 = analysis.normalizer('unknown_custom')

    m = mapping.Mapping()
    m.field('title', 'text', analyzer=a1,
        fields={
            'english': Text(analyzer=a2),
            'unknown': Keyword(search_analyzer=a3),
        }
    )
    m.field('comments', Nested(properties={'author': Text(analyzer=a4)}))
    m.field('normalized_title', 'keyword', normalizer=n1)
    m.field('normalized_comment', 'keyword', normalizer=n2)
    m.field('unknown', 'keyword', normalizer=n3)
    m.meta('_all', analyzer=a5)

    assert {
        'analyzer': {
            'my_analyzer1': {'filter': ['lowercase', 'my_filter1'], 'tokenizer': 'keyword', 'type': 'custom'},
            'my_analyzer2': {'filter': ['my_filter2'], 'tokenizer': 'trigram', 'type': 'custom'},
            'my_analyzer3': {'tokenizer': 'keyword', 'type': 'custom'},
        },
        'normalizer': {
            'my_normalizer1': {'filter': ['lowercase'], 'type': 'custom'},
            'my_normalizer2': {'filter': ['my_filter1', 'my_filter2', 'my_filter3'], 'type': 'custom'},
        },
        'filter': {
            'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'},
            'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'},
            'my_filter3': {'stopwords': ['e', 'f'], 'type': 'stop'},
        },
        'tokenizer': {
            'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'},
        },
    } == m._collect_analysis()

    assert json.loads(json.dumps(m.to_dict())) == m.to_dict()
def get_analyzer(lang_analyzer, delete_old_index, user_dictionary_file='', synonyms=None):
    """
    Return the analyzer for a specific language.

    If Japanese (``lang_analyzer == ja``) and the index doesn't need to be recreated
    (no delete required and no new synonyms), return only the name of the analyzer.

    :param lang_analyzer: ``str`` which analyzer to get, e.g. 'standard', 'kuromoji', 'english'
    :param delete_old_index: (only Japanese) ``bool`` whether the old index is deleted and re-created;
        if False and the synonyms list is empty, the previous analyzer with its synonyms is kept
    :param user_dictionary_file: (only Japanese) ``str`` user-dictionary file with custom terms in the form of
        東京スカイツリー,東京 スカイツリー,トウキョウ スカイツリー,カスタム名詞
        See: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji-tokenizer.html
    :param synonyms: (only Japanese) ``list`` of synonyms to be used, in the form of ['京産大, 京都産業大学', 'a, b'];
        if the list is empty and the index is not deleted, keep the previous analyzer with its synonyms
    :return: ``analyzer`` or ``str`` of the analyzer to be used
    """
    if synonyms is None:
        synonyms = []
    if lang_analyzer == constants.SUPPORTED_LANG_CODES_ANALYZERS['ja']:
        # Use existing analyzer (with synonyms) if the new synonyms list is empty
        # (only if the index is not re-built).
        if not delete_old_index and len(synonyms) == 0:
            analyzer_lang = '{0}_custom'.format(lang_analyzer)  # Use existing analyzer with existing synonyms
        else:
            analyzer_lang = analysis.analyzer(
                '{0}_custom'.format(lang_analyzer),
                tokenizer=analysis.tokenizer(
                    'kuromoji_tokenizer_user_dict',
                    type='kuromoji_tokenizer',
                    user_dictionary=user_dictionary_file),
                filter=[
                    'kuromoji_baseform',
                    'kuromoji_part_of_speech',
                    'cjk_width',
                    'ja_stop',
                    'kuromoji_stemmer',
                    'lowercase',
                    analysis.token_filter('synonym', type='synonym', synonyms=synonyms),  # ['京産大, 京都産業大学']
                ])
            # Extra token filters: kuromoji_number, kuromoji_readingform
            # Extra character filter: kuromoji_iteration_mark
            # user_dictionary="userdict_ja.txt")  # /etc/elasticsearch/
    else:
        analyzer_lang = analysis.analyzer(lang_analyzer)
    return analyzer_lang
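# Hypothetical usage sketch (not part of the original module). It assumes the imports used
# above and that constants.SUPPORTED_LANG_CODES_ANALYZERS['ja'] == 'kuromoji'; the file name
# and synonym values are illustrative only.
def _example_get_analyzer_usage():
    # Rebuilding the index: returns a full analysis.analyzer('kuromoji_custom', ...)
    # with the kuromoji tokenizer, user dictionary and synonym filter attached.
    ja_analyzer = get_analyzer('kuromoji', delete_old_index=True,
                               user_dictionary_file='userdict_ja.txt',
                               synonyms=['京産大, 京都産業大学'])
    # Updating in place with no new synonyms: returns just the name 'kuromoji_custom',
    # so the analyzer already configured on the index (with its synonyms) is kept.
    ja_existing = get_analyzer('kuromoji', delete_old_index=False)
    # Any non-Japanese language: returns analysis.analyzer('english') unchanged.
    en_analyzer = get_analyzer('english', delete_old_index=False)
    return ja_analyzer, ja_existing, en_analyzer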
def add_mapping_fields(self, mapping, analyzer_lang, analyzer_case_insensitive_sort):
    """
    Add custom fields for Mails to the passed index mapping.

    :param mapping: ``Mapping`` Elasticsearch DSL mapping to add fields to
    :param analyzer_lang: ``analyzer`` or ``str`` of the analyzer to be used for language-specific fields
    :param analyzer_case_insensitive_sort: ``analyzer`` to be used for case-insensitive sorting
    :return: None (the mapping is modified in place!)
    """
    # Mail-specific fields
    analyzer_email = analysis.analyzer('email',
                                       tokenizer=analysis.tokenizer('uax_url_email'),
                                       filter=['lowercase', 'unique'])

    mapping.field('fromName', 'text', analyzer=analyzer_lang, fields={'keyword': 'keyword'})
    mapping.field('fromEmail', 'text', analyzer=analyzer_email, fields={'keyword': 'keyword'})
    mapping.field('toName', 'text', analyzer=analyzer_lang, fields={'keyword': 'keyword'})
    mapping.field('toEmail', 'text', analyzer=analyzer_email, fields={'keyword': 'keyword'})
    mapping.field('replyToName', 'text', analyzer=analyzer_lang, fields={'keyword': 'keyword'})
    mapping.field('replyToEmail', 'text', analyzer=analyzer_email, fields={'keyword': 'keyword'})
    mapping.field('subject', 'text', analyzer=analyzer_lang)
    mapping.field('date', 'date')
    mapping.field('body', 'text', analyzer=analyzer_lang)
    mapping.field('spam', 'boolean')
    mapping.field('hasAttachmet', 'boolean')
    mapping.field('attachmentNames', 'text', analyzer=analyzer_lang)
def test_conflicting_nested_filters_cause_error():
    a = analysis.analyzer(
        "my_cond",
        tokenizer=analysis.tokenizer("keyword"),
        filter=[
            analysis.token_filter("en", "stemmer", language="english"),
            analysis.token_filter(
                "testing",
                "condition",
                script={"source": "return true"},
                filter=[
                    "lowercase",
                    analysis.token_filter("en", "snowball", language="English"),
                ],
            ),
        ],
    )

    with raises(ValueError):
        a.get_analysis_definition()
def test_conflicting_nested_filters_cause_error():
    a = analysis.analyzer('my_cond',
        tokenizer=analysis.tokenizer('keyword'),
        filter=[
            analysis.token_filter('en', 'stemmer', language='english'),
            analysis.token_filter(
                'testing', 'condition',
                script={'source': 'return true'},
                filter=[
                    'lowercase',
                    analysis.token_filter('en', 'snowball', language='English')
                ])
        ])

    with raises(ValueError):
        a.get_analysis_definition()
def test_conditional_token_filter():
    a = analysis.analyzer('my_cond',
        tokenizer=analysis.tokenizer('keyword'),
        filter=[
            analysis.token_filter(
                'testing', 'condition',
                script={'source': 'return true'},
                filter=[
                    'lowercase',
                    analysis.token_filter('en', 'snowball', language='English')
                ]),
            'stop'
        ])

    assert {
        "analyzer": {
            "my_cond": {
                "filter": ["testing", "stop"],
                "tokenizer": "keyword",
                "type": "custom"
            }
        },
        "filter": {
            "en": {"language": "English", "type": "snowball"},
            "testing": {
                "script": {"source": "return true"},
                "filter": ["lowercase", "en"],
                "type": "condition"
            }
        }
    } == a.get_analysis_definition()
def test_mapping_can_collect_all_analyzers():
    a1 = analysis.analyzer(
        "my_analyzer1",
        tokenizer="keyword",
        filter=["lowercase", analysis.token_filter("my_filter1", "stop", stopwords=["a", "b"])],
    )
    a2 = analysis.analyzer("english")
    a3 = analysis.analyzer("unknown_custom")
    a4 = analysis.analyzer(
        "my_analyzer2",
        tokenizer=analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3),
        filter=[analysis.token_filter("my_filter2", "stop", stopwords=["c", "d"])],
    )
    a5 = analysis.analyzer("my_analyzer3", tokenizer="keyword")

    m = mapping.Mapping("article")
    m.field(
        "title", "string", analyzer=a1,
        fields={"english": String(analyzer=a2), "unknown": String(search_analyzer=a3)}
    )
    m.field("comments", Nested(properties={"author": String(analyzer=a4)}))
    m.meta("_all", analyzer=a5)

    assert {
        "analyzer": {
            "my_analyzer1": {"filter": ["lowercase", "my_filter1"], "tokenizer": "keyword", "type": "custom"},
            "my_analyzer2": {"filter": ["my_filter2"], "tokenizer": "trigram", "type": "custom"},
            "my_analyzer3": {"tokenizer": "keyword", "type": "custom"},
        },
        "filter": {
            "my_filter1": {"stopwords": ["a", "b"], "type": "stop"},
            "my_filter2": {"stopwords": ["c", "d"], "type": "stop"},
        },
        "tokenizer": {"trigram": {"max_gram": 3, "min_gram": 3, "type": "nGram"}},
    } == m._collect_analysis()

    assert json.loads(json.dumps(m.to_dict())) == m.to_dict()
def add_mapping_to_index(self, lang_code, lang_analyzer, delete_old_index=False, kuromoji_synonyms=None):
    """
    Add or update the mail/irc mapping in the ES index; create/update the required analyzers and add the fields.

    :param lang_code: ``str`` language of the index, e.g. 'ja'
    :param lang_analyzer: ``str`` name of the analyzer for the language, e.g. 'kuromoji', 'standard' etc.
    :param delete_old_index: ``bool`` delete the index if it exists?
        Default: False = update the existing index (close, update, open)
    :param kuromoji_synonyms: ``list`` synonyms for the kuromoji Japanese analyzer;
        keep the old synonyms if the synonyms list is empty and the index is not deleted
    :return: None
    """
    if kuromoji_synonyms is None:
        kuromoji_synonyms = []
    analyzer_lang = helpers.get_analyzer(lang_analyzer,
                                         delete_old_index=delete_old_index,
                                         user_dictionary_file=self._user_dictionary_file,
                                         synonyms=kuromoji_synonyms)
    analyzer_case_insensitive_sort = analysis.analyzer('case_insensitive_sort',
                                                       tokenizer=analysis.tokenizer('keyword'),
                                                       filter=['lowercase'])

    mapping = Mapping(self._type_name)
    reopen_index = False
    index_name = self._index_prefix.format(lang_code)

    if self._es.indices.exists(index=index_name):
        if delete_old_index:
            self._es.indices.delete(index=index_name, ignore=[400, 404])
        else:
            self._es.indices.close(index=index_name)
            reopen_index = True
            mapping = Mapping.from_es(index_name, self._type_name, using=self._es)  # Get existing index from server

    self.add_mapping_fields(mapping, analyzer_lang, analyzer_case_insensitive_sort)

    mapping.save(index_name, using=self._es)  # Insert or update
    if reopen_index:
        self._es.indices.open(index=index_name)
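# Hypothetical usage sketch (not part of the original class): how the two methods above
# might be driven from an indexer instance. The class name, constructor arguments and
# index prefix below are assumptions for illustration only.
#
#   indexer = MailIndexer(es_client,                       # an elasticsearch.Elasticsearch() connection
#                         index_prefix='mails-{0}',        # formatted with the language code
#                         type_name='mail',
#                         user_dictionary_file='userdict_ja.txt')
#
#   # Rebuild the Japanese index with the kuromoji analyzer and fresh synonyms:
#   indexer.add_mapping_to_index('ja', 'kuromoji', delete_old_index=True,
#                                kuromoji_synonyms=['京産大, 京都産業大学'])
#
#   # Update the English index in place (close, save mapping, reopen):
#   indexer.add_mapping_to_index('en', 'english')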
from django.conf import settings as config
from shop.search.documents import ProductDocument
from elasticsearch_dsl.analysis import analyzer, token_filter, tokenizer

settings = {
    'number_of_shards': 1,
    'number_of_replicas': 0,
}

for language, _ in config.LANGUAGES:
    analyzer_name = language + '_' + _ + '_analyzer'
    language_analizers = {
        language: analyzer('german_analyzer',
            type='custom',
            tokenizer=tokenizer('trigram', 'ngram', min_gram=3, max_gram=3),
            filter=[
                'lowercase',
                token_filter('asciifolding', type='asciifolding', preserve_original=False),
                token_filter('german_stop', type='stop', language='german'),
                token_filter('german_stemmer', type='snowball', language='german'),
            ],
            char_filter=['html_strip'],
        )
    }
    ProductDocument(language=language, settings=settings, language_analizers=language_analizers)
    'max_result_window': settings.MAX_RESULT_WINDOW,
    'number_of_shards': 1,
    'number_of_replicas': 0
}

insitu_products.settings(**ELASTICSEARCH_INDEX_SETTINGS)
insitu_requirements.settings(**ELASTICSEARCH_INDEX_SETTINGS)
insitu_data.settings(**ELASTICSEARCH_INDEX_SETTINGS)
insitu_dataproviders.settings(**ELASTICSEARCH_INDEX_SETTINGS)

if not getattr(Search, '_patched', False):
    Search.order_by = Search.sort
    Search._patched = True

case_insensitive_analyzer = analyzer(
    'case_insensitive_analyzer',
    tokenizer=tokenizer('trigram', 'nGram'),
    filter=['lowercase']
)

case_insensitive_normalizer = normalizer(
    type="custom",
    name_or_instance='case_insensitive_normalizer',
    char_filter=[],
    filter="lowercase",
)


@insitu_products.doc_type
class ProductDoc(DocType):
    acronym = fields.KeywordField()
    description = fields.TextField()
    name = fields.TextField(analyzer=case_insensitive_analyzer,
    preserve_original=True
)

adres_split = analysis.char_filter(
    'adres_split',
    type='mapping',
    mappings=[
        "-=>' '",  # strip '-'
        ".=>' '",  # change '.' to separator
    ]
)

boutnummer_ngram = analysis.tokenizer(
    'boutnummer_ngram',
    'edgeNGram',
    min_gram=1,
    max_gram=8,
    token_chars=['letter', 'digit']
)

postcode_ngram = analysis.tokenizer(
    'postcode_ngram',
    'edgeNGram',
    min_gram=2,
    max_gram=4,
    token_chars=['letter', 'digit']
)

naam_stripper = analysis.char_filter(
    'naam_stripper',
    type='mapping',
from django_elasticsearch_dsl.registries import registry

from tardis.tardis_portal.models import Dataset, Experiment, \
    DataFile, Instrument, ObjectACL

logger = logging.getLogger(__name__)

elasticsearch_index_settings = getattr(settings, 'ELASTICSEARCH_DSL_INDEX_SETTINGS', {
    'number_of_shards': 1,
    'number_of_replicas': 0
})
elasticsearch_parallel_index_settings = getattr(
    settings, 'ELASTICSEARCH_PARALLEL_INDEX_SETTINGS', {})

trigram = analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3)

analyzer = analyzer(
    "analyzer",
    tokenizer=trigram,
    filter='lowercase',
)


@registry.register_document
class ExperimentDocument(Document):

    def parallel_bulk(self, actions, **kwargs):
        Document.parallel_bulk(self, actions=actions,
                               **elasticsearch_parallel_index_settings)
logger = getLogger(__name__)

# Normalises values to improve sorting (by keeping e, E, è, ê etc. together)
lowercase_asciifolding_normalizer = analysis.normalizer(
    'lowercase_asciifolding_normalizer',
    filter=('lowercase', 'asciifolding'),
)

# Trigram tokenizer enables us to support partial matching
trigram = analysis.tokenizer(
    'trigram',
    'nGram',
    min_gram=3,
    max_gram=3,
    token_chars=('letter', 'digit'),
)

# Filters out "-" so that t-shirt and tshirt can be matched
special_chars = analysis.char_filter('special_chars', 'mapping', mappings=('-=>',))

trigram_analyzer = analysis.CustomAnalyzer(
    'trigram_analyzer',
    tokenizer=trigram,
    char_filter=special_chars,
    filter=('lowercase',),
)

space_remover = analysis.token_filter(
    'space_remover',
""" Define base fields for all documents except ConceptDocument.""" # Use Any for typing because mypy does not recognize models as subtypes of ModelBase from typing import Any, Dict from django_elasticsearch_dsl import Document, fields from elasticsearch_dsl import analyzer from elasticsearch_dsl.analysis import tokenizer from ddionrails.studies.models import Study edge_ngram_completion = analyzer( "edge_ngram_completion", tokenizer=tokenizer("edge_ngram", "edge_ngram", min_gram=1, max_gram=10), ) class GenericDocument(Document): """Base for search documents.""" # attributes id = fields.TextField() name = fields.TextField(analyzer=edge_ngram_completion) label = fields.TextField(analyzer="english") label_de = fields.TextField(analyzer="german") description = fields.TextField(analyzer="english") description_de = fields.TextField(analyzer="german") # relations as attributes study = fields.ObjectField( properties={
def test_mapping_can_collect_multiple_analyzers():
    a1 = analysis.analyzer(
        "my_analyzer1",
        tokenizer="keyword",
        filter=[
            "lowercase",
            analysis.token_filter("my_filter1", "stop", stopwords=["a", "b"]),
        ],
    )
    a2 = analysis.analyzer(
        "my_analyzer2",
        tokenizer=analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3),
        filter=[
            analysis.token_filter("my_filter2", "stop", stopwords=["c", "d"]),
        ],
    )

    m = mapping.Mapping()
    m.field("title", "text", analyzer=a1, search_analyzer=a2)
    m.field(
        "text",
        "text",
        analyzer=a1,
        fields={
            "english": Text(analyzer=a1),
            "unknown": Keyword(analyzer=a1, search_analyzer=a2),
        },
    )

    assert {
        "analyzer": {
            "my_analyzer1": {
                "filter": ["lowercase", "my_filter1"],
                "tokenizer": "keyword",
                "type": "custom",
            },
            "my_analyzer2": {
                "filter": ["my_filter2"],
                "tokenizer": "trigram",
                "type": "custom",
            },
        },
        "filter": {
            "my_filter1": {"stopwords": ["a", "b"], "type": "stop"},
            "my_filter2": {"stopwords": ["c", "d"], "type": "stop"},
        },
        "tokenizer": {"trigram": {"max_gram": 3, "min_gram": 3, "type": "nGram"}},
    } == m._collect_analysis()
def test_mapping_can_collect_all_analyzers_and_normalizers():
    a1 = analysis.analyzer(
        "my_analyzer1",
        tokenizer="keyword",
        filter=[
            "lowercase",
            analysis.token_filter("my_filter1", "stop", stopwords=["a", "b"]),
        ],
    )
    a2 = analysis.analyzer("english")
    a3 = analysis.analyzer("unknown_custom")
    a4 = analysis.analyzer(
        "my_analyzer2",
        tokenizer=analysis.tokenizer("trigram", "nGram", min_gram=3, max_gram=3),
        filter=[
            analysis.token_filter("my_filter2", "stop", stopwords=["c", "d"]),
        ],
    )
    a5 = analysis.analyzer("my_analyzer3", tokenizer="keyword")
    n1 = analysis.normalizer("my_normalizer1", filter=["lowercase"])
    n2 = analysis.normalizer(
        "my_normalizer2",
        filter=[
            "my_filter1",
            "my_filter2",
            analysis.token_filter("my_filter3", "stop", stopwords=["e", "f"]),
        ],
    )
    n3 = analysis.normalizer("unknown_custom")

    m = mapping.Mapping()
    m.field(
        "title",
        "text",
        analyzer=a1,
        fields={
            "english": Text(analyzer=a2),
            "unknown": Keyword(search_analyzer=a3),
        },
    )
    m.field("comments", Nested(properties={"author": Text(analyzer=a4)}))
    m.field("normalized_title", "keyword", normalizer=n1)
    m.field("normalized_comment", "keyword", normalizer=n2)
    m.field("unknown", "keyword", normalizer=n3)
    m.meta("_all", analyzer=a5)

    assert {
        "analyzer": {
            "my_analyzer1": {
                "filter": ["lowercase", "my_filter1"],
                "tokenizer": "keyword",
                "type": "custom",
            },
            "my_analyzer2": {
                "filter": ["my_filter2"],
                "tokenizer": "trigram",
                "type": "custom",
            },
            "my_analyzer3": {"tokenizer": "keyword", "type": "custom"},
        },
        "normalizer": {
            "my_normalizer1": {"filter": ["lowercase"], "type": "custom"},
            "my_normalizer2": {
                "filter": ["my_filter1", "my_filter2", "my_filter3"],
                "type": "custom",
            },
        },
        "filter": {
            "my_filter1": {"stopwords": ["a", "b"], "type": "stop"},
            "my_filter2": {"stopwords": ["c", "d"], "type": "stop"},
            "my_filter3": {"stopwords": ["e", "f"], "type": "stop"},
        },
        "tokenizer": {"trigram": {"max_gram": 3, "min_gram": 3, "type": "nGram"}},
    } == m._collect_analysis()

    assert json.loads(json.dumps(m.to_dict())) == m.to_dict()
from django_elasticsearch_dsl.registries import registry
from elasticsearch_dsl import analysis, InnerDoc
from elasticsearch_dsl.field import Text
from django.db.models import Prefetch

from api.applications import models

address_analyzer = analysis.analyzer(
    "address_analyzer",
    tokenizer="whitespace",
    filter=["lowercase", "asciifolding", "trim"],
)

part_number_analyzer = analysis.analyzer(
    "part_number_analyzer",
    tokenizer=analysis.tokenizer("part_number_path_hierarchy", "path_hierarchy", delimiter="-"),
    filter=["lowercase", "trim"],
)

reference_code_analyzer = analysis.analyzer(
    "reference_code_analyzer", tokenizer="path_hierarchy", filter=["lowercase", "trim"]
)

descriptive_text_analyzer = analysis.analyzer(
    "descriptive_text_analyzer", tokenizer="classic", filter=["lowercase", "trim", "stemmer"]
)

ngram_filter = analysis.token_filter("ngram_filter", type="ngram", min_gram=2, max_gram=20)

ngram_analyzer = analysis.analyzer(
    "ngram_completion",
    tokenizer="whitespace",
    filter=["lowercase", "asciifolding", ngram_filter]