Example #1
def test_unchanged_mapping_is_not_updated(write_client):
    m = mapping.Mapping('test-type')
    m.field('name', 'string', analyzer=analysis.analyzer("my_analyzer",
        tokenizer="standard",
        filter=[
            token_filter("simple_edge",
                type="edgeNGram",
                min_gram=2,
                max_gram=3
            )]
        )
    )


    m.save('test-mapping', using=write_client)
    # this should not trigger an error since the mapping didn't change
    m.save('test-mapping', using=write_client)


    # change the mapping just a little bit
    m.field('name', 'string', analyzer=analysis.analyzer("my_analyzer",
        tokenizer="standard",
        filter=[
            token_filter("simple_edge",
                type="edgeNGram",
                min_gram=2,
                max_gram=4 # changed from 3 to 4
            )]
        )
    )

    with raises(exceptions.IllegalOperation):
        m.save('test-mapping', using=write_client)
Example #2
def url_ngram_analyzer():
    """
    An analyzer for creating URL safe n-grams.

    Returns:
        Analyzer
    """
    return analyzer(
        'url_ngram',
        tokenizer='keyword',
        filter=[
            'lowercase',
            # Strip the protocol, we don't need it for searching.
            token_filter(
                'url_protocol_filter',
                type='pattern_replace',
                pattern=r'^\w+:\/\/',
                replace='',
            ),
            'word_delimiter',
            # Create trigrams from the address.
            token_filter(
                'url_ngram_filter',
                type='ngram',
                min_gram=3,
                max_gram=3,
            ),
        ],
    )
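url_ngram_analyzer() above only builds an analyzer object; nothing is indexed with it until it is attached to a field. A minimal sketch of one way to wire it up, assuming the elasticsearch_dsl Document API (the class name and index name below are hypothetical, not from the original project):

from elasticsearch_dsl import Document, Text

class PageDocument(Document):  # hypothetical document class
    # Index the URL with the trigram analyzer, but analyze queries with the
    # plain standard analyzer so search terms are not exploded into trigrams.
    url = Text(analyzer=url_ngram_analyzer(), search_analyzer='standard')

    class Index:
        name = 'pages'  # hypothetical index name

Calling PageDocument.init() should then create the index with the analyzer and its token filters included in the analysis settings.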
Example #3
def configure_index(idx):
    """Configure ES index settings.

    NOTE: This is unused at the moment. Current issues:
    1. The index needs to be created (index.create() or search_index --create)
    setting update_all_types=True because of the attribute name being the same
    in Person and Company.
    https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.client.IndicesClient.create

    name = fields.TextField(attr="fullname", analyzer=lb_analyzer)

    2. How to specify a token filter for an attribute?

    Therefore the index needs to be configured outside Django.
    """
    idx.settings(number_of_shards=1, number_of_replicas=0)
    lb_filter = token_filter(
        "lb_filter",
        "stop",
        stopwords=["i"]
    )
    lb_analyzer = analyzer(
        "lb_analyzer",
        tokenizer="standard",
        filter=["standard", "lb_filter", "asciifolding", "lowercase"]
    )
    return lb_analyzer, lb_filter
Example #4
def url_ngram_analyzer():
    """
    An analyzer for creating URL safe n-grams.

    Returns:
        Analyzer
    """
    return analyzer(
        'url_ngram',
        tokenizer='keyword',
        filter=[
            'lowercase',
            # Strip the protocol, we don't need it for searching.
            token_filter(
                'url_protocol_filter',
                type='pattern_replace',
                pattern=r'^\w+:\/\/',
                replace='',
            ),
            'word_delimiter',
            # Create trigrams from the address.
            token_filter(
                'url_ngram_filter',
                type='ngram',
                min_gram=3,
                max_gram=3,
            ),
        ],
    )
Example #5
def get_text_analyzer(language: str) -> Analyzer:
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html
    # According to https://discuss.elastic.co/t/extend-built-in-analyzers/134778/7 we do have to copy and paste

    stop = token_filter("stop", "stop", stopwords="_" + language + "_")
    stemmer = token_filter("stemmer", "stemmer", language=language)
    unique_stem = token_filter("unique_stem", "unique", only_on_same_position=True)

    # This seems to be kinda patchwork in elastic itself
    if language == "german":
        # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html#german-analyzer
        # We can't use german_normalization here because that breaks with our keyword_repeat/unique_stem logic
        filters = ["keyword_repeat", "lowercase", stop, stemmer, unique_stem]
    elif language == "english":
        # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html#english-analyzer
        english_possessive_stemmer = token_filter(
            "english_possessive_stemmer", "stemmer", language="possessive_english"
        )

        filters = [
            "keyword_repeat",
            english_possessive_stemmer,
            "lowercase",
            stop,
            stemmer,
            unique_stem,
        ]
    else:
        filters = ["keyword_repeat", "lowercase", stop, stemmer, unique_stem]

    return analyzer("text_analyzer", tokenizer="standard", filter=filters)
Example #6
def test_unchanged_mapping_is_not_updated(write_client):
    m = mapping.Mapping('test-type')
    m.field('name',
            'string',
            analyzer=analysis.analyzer("my_analyzer",
                                       tokenizer="standard",
                                       filter=[
                                           token_filter("simple_edge",
                                                        type="edgeNGram",
                                                        min_gram=2,
                                                        max_gram=3)
                                       ]))

    m.save('test-mapping', using=write_client)
    # this should not trigger an error since the mapping didn't change
    m.save('test-mapping', using=write_client)

    # change the mapping just a little bit
    m.field(
        'name',
        'string',
        analyzer=analysis.analyzer(
            "my_analyzer",
            tokenizer="standard",
            filter=[
                token_filter(
                    "simple_edge",
                    type="edgeNGram",
                    min_gram=2,
                    max_gram=4  # changed from 3 to 4
                )
            ]))

    with raises(exceptions.IllegalOperation):
        m.save('test-mapping', using=write_client)
Example #7
class ArabicAnalyzer(Analyzer):
    token_filters = [
        'lowercase', 'arabic_normalization',
        token_filter('arabic_stop', type='stop', stopwords='_arabic_'),
        token_filter('arabic_stemmer', type='stemmer', language='arabic')
    ]

    def __init__(self):
        self.analyzer = analyzer('arabic',
                                 tokenizer='standard',
                                 filter=self.token_filters)
Example #8
    def init_index(cls):
        '''
        Class method to init index
        '''

        # default analyzer
        shingle_filter = token_filter(
            'shingle_filter',
            type='shingle',
            min_shingle_size=2,
            max_shingle_size=3,
        )
        default_analyzer = analyzer(
            'default',
            tokenizer='standard',
            char_filter=['html_strip'],
            filter=['lowercase', 'asciifolding', shingle_filter]
        )

        # set the analyzers for the available languages
        # TODO: languages and languages_stopwords should be in settings
        languages = ('es', 'en')
        languages_stopwords = {
            'en': '_english_',
            'es': '_spanish_',
        }
        languages_analyzers = {}
        languages_filters = {}
        for language in languages:
            languages_filters[language] = token_filter(
                language + '_filter',
                type='stop',
                stopwords=languages_stopwords[language],
            )
            languages_analyzers[language] = analyzer(
                language + '_analyzer',
                tokenizer='standard',
                char_filter=['html_strip'],
                filter=['lowercase', 'asciifolding', languages_filters[language]]
            )

        # Add analyzers, the index has to be closed before any configuration
        searches_index = Index('searches')
        # default analyzer
        searches_index.analyzer(default_analyzer)
        # languages search analyzers
        for language in languages:
            searches_index.analyzer(languages_analyzers[language])
        searches_index.save()

        # create the mappings in elasticsearch
        cls.init()
Example #9
class Filter:
    english_stop = token_filter("english_stop",
                                type="stop",
                                stopwords="_english_")

    # uses default porter stemmer
    english_stemmer = token_filter("english_stemmer",
                                   type="stemmer",
                                   language="english")

    english_possessive_stemmer = token_filter("english_possessive_stemmer",
                                              type="stemmer",
                                              language="possessive_english")
Example #10
def get_edge_ngram_filter() -> token_filter:
    return token_filter(
        "custom_edge_ngram_filter",
        type="edge_ngram",
        min_gram=3,
        max_gram=15,
    )
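As with the other snippets, the filter above is only a definition; a short sketch (index and analyzer names are hypothetical) of registering it on an index through a custom analyzer, in the same spirit as Example #8:

from elasticsearch_dsl import Index, analyzer

edge_ngram_analyzer = analyzer(
    'custom_edge_ngram_analyzer',  # hypothetical analyzer name
    tokenizer='standard',
    filter=['lowercase', get_edge_ngram_filter()],
)

documents_index = Index('documents')  # hypothetical index name
documents_index.analyzer(edge_ngram_analyzer)
# As Example #8 notes, an existing index has to be closed before its
# analysis settings can be changed; save() pushes the settings.
documents_index.save()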
Example #11
def _get_locale_specific_analyzer(locale):
    """Get an analyzer for locales specified in config otherwise return `None`"""

    locale_analyzer = config.ES_LOCALE_ANALYZERS.get(locale)
    if locale_analyzer:
        if not settings.ES_USE_PLUGINS and locale_analyzer in settings.ES_PLUGIN_ANALYZERS:
            return None

        return analyzer(locale, type=locale_analyzer)

    snowball_language = config.ES_SNOWBALL_LOCALES.get(locale)
    if snowball_language:
        # The locale is configured to use snowball filter
        token_name = "snowball_{}".format(locale.lower())
        snowball_filter = token_filter(token_name, type="snowball", language=snowball_language)

        # Use language specific snowball filter with standard analyzer.
        # The standard analyzer is basically an analyzer with the standard tokenizer
        # and the standard, lowercase and stop filters
        locale_analyzer = analyzer(
            locale,
            tokenizer="standard",
            filter=["lowercase", "stop", snowball_filter],
            char_filter=["html_strip"],
        )
        return locale_analyzer
Example #12
def gen_name_analyzer_excluding_terms(excluding_terms):
    """Crea un analizador para nombres que sólo retorna TE (términos
    excluyentes).

    Por ejemplo, si el archivo de configuración de TE contiene las siguientes
    reglas:

    santa, salta, santo
    caba, cba

    Entonces, aplicar el analizador a la búsqueda 'salta' debería retornar
    'santa' y 'santo', mientras que buscar 'caba' debería retornar 'cba'.

    El analizador se utiliza para excluir resultados de búsquedas específicas.

    Args:
        excluding_terms (list): Lista de TE a utilizar especificados como
            sinónimos Solr.

    Returns:
        elasticsearch_dsl.analysis.Analyzer: analizador de texto con nombre
            'name_analyzer_excluding_terms'.

    """
    name_excluding_terms_filter = token_filter('name_excluding_terms_filter',
                                               type='synonym',
                                               synonyms=excluding_terms)

    return analyzer(name_analyzer_excluding_terms,
                    tokenizer='standard',
                    filter=[
                        'lowercase', 'asciifolding',
                        name_excluding_terms_filter, synonyms_only_filter,
                        spanish_stopwords_filter
                    ])
Example #13
def email_main_analyzer():
    """
    An analyzer for creating "words" from email addresses.

    This analyzer splits email addresses on special characters. For example,
    john.doe@crm.example.com would become [john, doe, crm, example, com].
    These tokens, when combined with ngrams, provide nice fuzzy matching while
    boosting full word matches.

    Returns:
        Analyzer: An analyzer suitable for analyzing email addresses.
    """
    return analyzer(
        'email',
        # We tokenize with token filters, so use the no-op keyword tokenizer.
        tokenizer='keyword',
        filter=[
            'lowercase',
            # Split the email address on special characters.
            token_filter(
                'email_word_delimiter',
                type='word_delimiter',
                # Ensure words like hello2lily are kept as one token.
                split_on_numerics=False,
            ),
        ],
    )
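A quick way to check that the analyzer really produces the tokens promised in the docstring is the simulate() helper used in Example #19. A rough sketch, assuming a reachable Elasticsearch client (the host below is an assumption):

from elasticsearch_dsl import connections

client = connections.create_connection(hosts=['localhost:9200'])  # assumed host
tokens = email_main_analyzer().simulate('john.doe@crm.example.com', using=client).tokens
# Expected to be roughly ['john', 'doe', 'crm', 'example', 'com'].
print([t.token for t in tokens])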
Example #14
def gen_name_analyzer_synonyms(synonyms):
    """Crea un analizador para nombres con sinónimos.

    Args:
        synonyms (list): Lista de sinónimos a utilizar, en formato Solr.

    Returns:
        elasticsearch_dsl.analysis.Analyzer: analizador de texto con nombre
            'name_analyzer_synonyms'.

    """
    name_synonyms_filter = token_filter(
        'name_synonyms_filter',
        type='synonym',
        synonyms=synonyms
    )

    return analyzer(
        name_analyzer_synonyms,
        tokenizer='standard',
        filter=[
            'lowercase',
            'asciifolding',
            name_synonyms_filter,
            spanish_stopwords_filter
        ]
    )
Example #15
def email_main_analyzer():
    """
    An analyzer for creating "words" from email addresses.

    This analyzer splits email addresses on special characters. For example,
    john.doe@crm.example.com would become [john, doe, crm, example, com].
    These tokens, when combined with ngrams, provide nice fuzzy matching while
    boosting full word matches.

    Returns:
        Analyzer: An analyzer suitable for analyzing email addresses.
    """
    return analyzer(
        'email',
        # We tokenize with token filters, so use the no-op keyword tokenizer.
        tokenizer='keyword',
        filter=[
            'lowercase',
            # Split the email address on special characters.
            token_filter(
                'email_word_delimiter',
                type='word_delimiter',
                # Ensure words like hello2lily are kept as one token.
                split_on_numerics=False,
            ),
        ],
    )
Example #16
    def suggest():
        suggestfilter = token_filter("suggestfilter", type="ngram",
                                     **other_param)

        bestsuggest = analyzer("bestsuggest", tokenizer="standard",
                               filter=["lowercase", suggestfilter,
                                       "asciifolding"])
        return bestsuggest
Example #17
 def mapping_func(position_filter_tuple):
     position, filter = position_filter_tuple
     if type(filter) is dict:
         name = f'{locale}_{position}_{filter["type"]}'
         if char:
             return char_filter(name, **filter)
         return token_filter(name, **filter)
     return filter
Example #18
def get_text_analyzer():
    return analyzer(
        'fds_analyzer',
        tokenizer='standard',
        filter=[
            'keyword_repeat',
            token_filter('decomp', type='decompound', subwords_only=True),

            'lowercase',
            token_filter('stop_de', type='stop', stopwords="_german_"),

            'german_normalization',
            'asciifolding',

            token_filter('de_stemmer', type='stemmer', name='light_german'),
            token_filter('unique_stem', type='unique', only_on_same_position=True)
        ],
    )
Example #19
def test_simulate_complex(client):
    a = analyzer('my-analyzer',
                 tokenizer=tokenizer('split_words', 'simple_pattern_split', pattern=':'),
                 filter=['lowercase', token_filter('no-ifs', 'stop', stopwords=['if'])])

    tokens = a.simulate('if:this:works', using=client).tokens

    assert len(tokens) == 2
    assert ['this', 'works'] == [t.token for t in tokens]
Example #20
def get_autocomplete_analyzer():
    autocomplete_filter = token_filter(
        "autocomplete_filter", "edge_ngram", min_gram=1, max_gram=20
    )

    # Using this analyzer with an empty field fails, so we use methods that add a space instead
    return analyzer(
        "autocomplete", tokenizer="standard", filter=["lowercase", autocomplete_filter]
    )
Example #21
def test_simulate_complex(client):
    a = analyzer(
        "my-analyzer",
        tokenizer=tokenizer("split_words", "simple_pattern_split", pattern=":"),
        filter=["lowercase", token_filter("no-ifs", "stop", stopwords=["if"])],
    )

    tokens = a.simulate("if:this:works", using=client).tokens

    assert len(tokens) == 2
    assert ["this", "works"] == [t.token for t in tokens]
Example #22
def _create_synonym_graph_filter(synonym_file_name):
    filter_name = f"{synonym_file_name}_synonym_graph"
    return token_filter(
        filter_name,
        type="synonym_graph",
        synonyms_path=f"synonyms/{synonym_file_name}.txt",
        # we must use "true" instead of True to work around an elastic-dsl bug
        expand="true",
        lenient="true",
        updateable="true",
    )
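Because the filter is marked updateable, Elasticsearch only accepts it in search analyzers, so it is typically paired with a separate index-time analyzer. A minimal sketch (the field, analyzer names and synonym file name are hypothetical):

from elasticsearch_dsl import Text, analyzer

index_time_analyzer = analyzer(
    'plain_text', tokenizer='standard', filter=['lowercase'])
search_time_analyzer = analyzer(
    'synonym_search', tokenizer='standard',
    filter=['lowercase', _create_synonym_graph_filter('products')])

# The updateable synonym_graph filter may only appear in the search analyzer.
title = Text(analyzer=index_time_analyzer, search_analyzer=search_time_analyzer)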
Example #23
    def _analyzer(synonyms):

        syn_filter = token_filter(f'country_syn',
                                  type='synonym',
                                  tokenizer='keyword',
                                  synonyms=synonyms)

        return analyzer(
            f'country_analyzer',
            tokenizer='lowercase',
            filter=[syn_filter],
        )
Example #24
def test_simulate_complex(client):
    a = analyzer(
        'my-analyzer',
        tokenizer=tokenizer('split_words', 'simple_pattern_split',
                            pattern=':'),
        filter=['lowercase',
                token_filter('no-ifs', 'stop', stopwords=['if'])])

    tokens = a.simulate('if:this:works', using=client).tokens

    assert len(tokens) == 2
    assert ['this', 'works'] == [t.token for t in tokens]
Example #25
def get_search_quote_analyzer():
    return analyzer(
        "fds_search_quote_analyzer",
        tokenizer="standard",
        filter=[
            "keyword_repeat",
            "lowercase",
            "german_normalization",
            "asciifolding",
            token_filter("de_stemmer", type="stemmer", name="light_german"),
            "remove_duplicates",
        ],
    )
Example #26
def phone_number_analyzer():
    """
    An analyzer to do complex partial matching on phone numbers.

    Returns:
        Analyzer: An analyzer suitable for searching phone numbers.
    """
    return analyzer(
        'phone_number',
        # We only want n-grams, which we want to create as late as possible.
        tokenizer='keyword',
        filter=[
            # Strip all special chars, don't tokenize.
            token_filter(
                'phone_word_delimiter',
                type='word_delimiter',
                generate_word_parts=False,
                generate_number_parts=False,
                catenate_all=True,
            ),
            # Strip any zeros from the start of the number.
            token_filter(
                'leading_zero_filter',
                type='pattern_replace',
                pattern='^(0+)',
                replace='',
            ),
            # Create n-grams of all lengths to support partial matching.
            token_filter(
                'phone_ngram_filter',
                type='ngram',
                # Still undecided on whether this should be 3 or 4.
                # 3 means users have to type fewer digits to get results,
                # but the matching is less accurate.
                min_gram=3,
                max_gram=32,
            ),
        ],
    )
Example #27
def phone_number_analyzer():
    """
    An analyzer to do complex partial matching on phone numbers.

    Returns:
        Analyzer: An analyzer suitable for searching phone numbers.
    """
    return analyzer(
        'phone_number',
        # We only want n-grams, which we want to create as late as possible.
        tokenizer='keyword',
        filter=[
            # Strip all special chars, don't tokenize.
            token_filter(
                'phone_word_delimiter',
                type='word_delimiter',
                generate_word_parts=False,
                generate_number_parts=False,
                catenate_all=True,
            ),
            # Strip any zeros from the start of the number.
            token_filter(
                'leading_zero_filter',
                type='pattern_replace',
                pattern='^(0+)',
                replace='',
            ),
            # Create n-grams of all lengths to support partial matching.
            token_filter(
                'phone_ngram_filter',
                type='ngram',
                # Still undecided on whether this should be 3 or 4.
                # 3 means users have to type fewer digits to get results,
                # but the matching is less accurate.
                min_gram=3,
                max_gram=32,
            ),
        ],
    )
Example #28
def email_ngram_analyzer():
    """
    An analyzer for creating email safe ngrams.

    This analyzer first splits the local part and domain name, then creates
    n-grams (overlapping fragments) from the remaining strings, minus any
    special characters.

    Returns:
        Analyzer: An analyzer suitable for analyzing email addresses.
    """
    return analyzer(
        'email_ngram',
        # Split the email address at the @ sign.
        tokenizer=tokenizer(
            'at_sign_tokenizer',
            type='pattern',
            pattern='@',
        ),
        filter=[
            'lowercase',
            # Strip any special characters from the email address.
            token_filter(
                'email_ngram_word_delimiter',
                type='word_delimiter',
                split_on_numerics=False,
                catenate_all=True,
            ),
            # Create trigrams from the address.
            token_filter(
                'email_ngram_filter',
                type='ngram',
                min_gram=3,
                max_gram=3,
            ),
        ],
    )
Example #29
def email_ngram_analyzer():
    """
    An analyzer for creating email safe ngrams.

    This analyzer first splits the local part and domain name, then creates
    n-grams (overlapping fragments) from the remaining strings, minus any
    special characters.

    Returns:
        Analyzer: An analyzer suitable for analyzing email addresses.
    """
    return analyzer(
        'email_ngram',
        # Split the email address at the @ sign.
        tokenizer=tokenizer(
            'at_sign_tokenizer',
            type='pattern',
            pattern='@',
        ),
        filter=[
            'lowercase',
            # Strip any special characters from the email address.
            token_filter(
                'email_ngram_word_delimiter',
                type='word_delimiter',
                split_on_numerics=False,
                catenate_all=True,
            ),
            # Create trigrams from the address.
            token_filter(
                'email_ngram_filter',
                type='ngram',
                min_gram=3,
                max_gram=3,
            ),
        ],
    )
Example #30
    def _analyzer(self, species, dump=True):
        autophrase_syns, syns = self._synonyms(species)

        if dump:
            with open('autophrase_syns.txt', 'w') as f:
                f.writelines(l + '\n' for l in autophrase_syns)

            with open('syns.txt', 'w') as f:
                f.writelines(l + '\n' for l in syns)

        autophrase_filter = token_filter(f'species_autophrase_syn',
                                         type='synonym',
                                         synonyms=autophrase_syns)

        syn_filter = token_filter(f'species_syn',
                                  type='synonym',
                                  tokenizer='keyword',
                                  synonyms=syns)

        return analyzer(
            f'species_analyzer',
            tokenizer='lowercase',
            filter=[autophrase_filter, syn_filter],
        )
Example #31
 def mapping_func(position_filter_tuple):
     position, filter = position_filter_tuple
     if type(filter) is dict:
         prefix = analyzer_name
         default_filters = config.ES_DEFAULT_ANALYZER["char_filter" if char else "filter"]
         if filter in default_filters:
             # detect if this filter exists in the default analyzer
             # if it does use the same name as the default
             # to avoid defining the same filter for each locale
             prefix = config.ES_DEFAULT_ANALYZER_NAME
             position = default_filters.index(filter)
         name = f'{prefix}_{position}_{filter["type"]}'
         if char:
             return char_filter(name, **filter)
         return token_filter(name, **filter)
     return filter
Example #32
def add_analyzer(index: Index):
    """Agrega un nuevo analyzer al índice, disponible para ser usado
    en todos sus fields. El analyzer aplica lower case + ascii fold:
    quita acentos y uso de ñ, entre otros, para permitir búsqueda de
    texto en español
    """

    synonyms = list(Synonym.objects.values_list('terms', flat=True))

    filters = ['lowercase', 'asciifolding']
    if synonyms:
        filters.append(
            token_filter(constants.SYNONYM_FILTER,
                         type='synonym',
                         synonyms=synonyms))

    index.analyzer(
        analyzer(constants.ANALYZER, tokenizer='standard', filter=filters))
Example #33
def configure_index(idx):
    """Configure ES index settings.

    NOTE: This is unused at the moment. Current issues:
    1. The index needs to be created (index.create() or search_index --create)
    setting update_all_types=True because of the attribute name being the same
    in Person and Company.
    https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.client.IndicesClient.create

    name = fields.TextField(attr="fullname", analyzer=lb_analyzer)

    2. How to specify a token filter for an attribute?

    Therefore the index needs to be configured outside Django.
    """
    idx.settings(number_of_shards=1, number_of_replicas=0)
    lb_filter = token_filter("lb_filter", "stop", stopwords=["i"])
    lb_analyzer = analyzer(
        "lb_analyzer",
        tokenizer="standard",
        filter=["standard", "lb_filter", "asciifolding", "lowercase"])
    return lb_analyzer, lb_filter
Example #34
def url_main_analyzer():
    """
    An analyzer for creating "words" from URLs.

    Returns:
        Analyzer
    """
    return analyzer(
        'url',
        tokenizer='keyword',
        filter=[
            'lowercase',
            # Strip the protocol, we don't need it for searching.
            token_filter(
                'url_protocol_filter',
                type='pattern_replace',
                pattern=r'^\w+:\/\/',
                replace='',
            ),
            'word_delimiter',
        ],
    )
Example #35
def url_main_analyzer():
    """
    An analyzer for creating "words" from URLs.

    Returns:
        Analyzer
    """
    return analyzer(
        'url',
        tokenizer='keyword',
        filter=[
            'lowercase',
            # Strip the protocol, we don't need it for searching.
            token_filter(
                'url_protocol_filter',
                type='pattern_replace',
                pattern=r'^\w+:\/\/',
                replace='',
            ),
            'word_delimiter',
        ],
    )
Example #36
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

from elasticsearch_dsl import analyzer, tokenizer, token_filter, char_filter

# Custom filters for analyzers.
de_stop_filter = token_filter(
    'de_stop_filter', type='stop', stopwords='_german_')
de_stem_filter = token_filter(
    'de_stem_filter', type='stemmer', language='minimal_german')

en_stop_filter = token_filter(
    'en_stop_filter', type='stop', stopwords='_english_')
en_stem_filter = token_filter(
    'en_stem_filter', type='stemmer', language='minimal_english')

es_stop_filter = token_filter(
    'es_stop_filter', type='stop', stopwords='_spanish_')
es_stem_filter = token_filter(
    'es_stem_filter', type='stemmer', language='light_spanish')

pt_stop_filter = token_filter(
    'pt_stop_filter', type='stop', stopwords='_portuguese_')
pt_stem_filter = token_filter(
    'pt_stem_filter', type='stemmer', language='light_portuguese')

fr_stop_filter = token_filter(
    'fr_stop_filter', type='stop', stopwords='_french_')
fr_stem_filter = token_filter(
    'fr_stem_filter', type='stemmer', language='minimal_french')
# Deal with French specific aspects.
Example #37
 token_filter(
     "addresses_stopwords",
     type="stop",
     stopwords=[
         "будинок",
         "обл",
         "район",
         "вулиця",
         "місто",
         "м",
         "квартира",
         "вул",
         "село",
         "буд",
         "кв",
         "проспект",
         "область",
         "селище",
         "міського",
         "типу",
         "офіс",
         "н",
         "р",
         "б",
         "с",
         "провулок",
         "корпус",
         "бульвар",
         "кімната",
         "шосе",
         "в",
         "смт",
         "просп",
         "№",
     ],
 ),
Example #38
import elasticsearch_dsl as dsl

from resolwe.elastic.indices import BaseDocument, BaseIndex

from .models import Feature, Mapping

# pylint: disable=invalid-name
# Analyzer for feature identifiers and names, used during boosting.
identifier_analyzer = dsl.analyzer('identifier_analyzer', tokenizer='keyword', filter=['lowercase'])
# During indexing, we lowercase terms and tokenize using edge_ngram.
autocomplete_analyzer = dsl.analyzer(
    'autocomplete_index',
    tokenizer='keyword',
    filter=[
        'lowercase',
        dsl.token_filter('autocomplete_filter', type='edgeNGram', min_gram=1, max_gram=15)
    ],
)
# During search, we only lowercase terms.
autocomplete_search_analyzer = dsl.analyzer('autocomplete_search', tokenizer='keyword', filter=['lowercase'])
# pylint: enable=invalid-name


class FeatureSearchDocument(BaseDocument):
    """Index for feature search."""

    # pylint: disable=no-member
    source = dsl.Keyword()
    feature_id = dsl.Keyword(
        # Additional subfield used for boosting during autocomplete.
        fields={
Example #39
# override the default analyzer for ES to use an ngram filter that breaks words using
# the standard tokenizer. Allow words to be broken up with underscores
name = analyzer(
    "name",
    # the standard analyzer splits the words nicely by default
    tokenizer=tokenizer("standard"),
    filter=[
        # technically, the standard filter doesn't do anything but we include
        # it anyway just in case ES decides to make use of it
        "standard",
        # obviously, lowercasing the tokens is a good thing
        "lowercase",
        # this emulates a 3-4 ngram, but also includes the whole token itself
        # (which prevents us from having to create multifields)
        token_filter("simple_edge", type="pattern_capture", patterns=["(?=(...))(?=(....))"]),
    ],
)


class ReportIndex(Index):
    category = StringField(
        attr="category.name",
        # need a non_analyzed field for sorting
        fields={"raw": StringField(index="not_analyzed")},
    )
    category_id = IntegerField(attr="category.pk")

    species = StringField(
        attr="species.name",
        # need a non_analyzed field for sorting
Example #40
from django_elasticsearch_dsl import DocType, Index, fields
from elasticsearch_dsl import analyzer, token_filter

from ..account.models import User
from ..order.models import Order
from ..product.models import Product

storefront = Index('storefront')
storefront.settings(number_of_shards=1, number_of_replicas=0)


partial_words = token_filter(
    'partial_words', 'edge_ngram', min_gram=3, max_gram=15)
title_analyzer = analyzer(
    'title_analyzer',
    tokenizer='standard',
    filter=[partial_words, 'lowercase'])
email_analyzer = analyzer('email_analyzer', tokenizer='uax_url_email')


@storefront.doc_type
class ProductDocument(DocType):
    title = fields.StringField(analyzer=title_analyzer)

    def prepare_title(self, instance):
        return instance.name

    class Meta:
        model = Product
        fields = ['name', 'description', 'is_published']
Example #41
 def autocomplete():
     autocompletefilter = token_filter("autocompletefilter",
                                       type="edge_ngram", **other_param)
     autocomplete = analyzer("autocomplete", tokenizer="standard",
                             filter=["lowercase", autocompletefilter])
     return autocomplete
Example #42
    Document,
    Float,
    Integer,
    Keyword,
    Text,
    analyzer,
    token_filter,
)

edge_ngram_analyzer = analyzer(
    "edge_ngram_analyzer",
    type="custom",
    tokenizer="standard",
    filter=[
        "lowercase",
        token_filter("edge_ngram_filter", type="edgeNGram", min_gram=1, max_gram=20),
    ],
)


class PodcastDoc(Document):
    id = Keyword(required=True)
    thumbnail_348 = Keyword()
    thumbnail_160 = Keyword()
    times_picked = Integer()
    episodes_count = Integer()
    episodes_seconds = Float()
    slug = Keyword(required=True, index=False)
    name = Text(required=True, analyzer=edge_ngram_analyzer, search_analyzer="standard")
    link = Keyword()
    subtitle = Text()
Example #43
        | (?<=\D)(?=\d)                 # or non-number followed by number,
        | (?<=\d)(?=\D)                 # or number followed by non-number,
    ''',
    flags='CASE_INSENSITIVE|COMMENTS',
    lowercase=True,
)

# During indexing, we lowercase terms and tokenize using edge_ngram.
ngrams_analyzer = dsl.analyzer(
    'ngrams_index',
    tokenizer='standard',
    filter=[
        'lowercase',
        dsl.token_filter(
            'ngrams_filter',
            type='edgeNGram',
            min_gram=1,
            max_gram=15,
        ),
    ],
)
# During search, we only lowercase terms.
ngrams_search_analyzer = dsl.analyzer(
    'ngrams_search',
    tokenizer='standard',
    filter=['lowercase'],
)
# pylint: enable=invalid-name


class RawKeywordSubfieldMixin:
    """String field with a 'raw' subfield (e.g. for sorting)."""
Example #44
    Text,
    Index,
    analyzer,
    Keyword,
    token_filter,
)

from django.conf import settings

edge_ngram_analyzer = analyzer('edge_ngram_analyzer',
                               type='custom',
                               tokenizer='standard',
                               filter=[
                                   'lowercase',
                                   token_filter('edge_ngram_filter',
                                                type='edgeNGram',
                                                min_gram=1,
                                                max_gram=20)
                               ])


class TitleDoc(DocType):
    id = Keyword()
    domain = Keyword(required=True)
    url = Keyword(required=True, index=False)
    title = Text(required=True,
                 analyzer=edge_ngram_analyzer,
                 search_analyzer='standard')
    popularity = Float()
    group = Keyword()

Example #45
from elasticsearch_dsl import DocType, String, token_filter, analyzer
from django.conf import settings

__author__ = 'erhmutlu'

turkish_stop = token_filter('turkish_stop', type='stop', stopwords="_turkish_")
turkish_lowercase = token_filter('turkish_lowercase', type='lowercase', language="turkish")
turkish_stemmer = token_filter('turkish_stemmer', type='stemmer', language='turkish')
custom_shingle_filter = token_filter('custom_shingle_filter', type='shingle', max_shingle_size=3, min_shingle_size=2,
                                     output_unigrams=True)

entity_synonym_index_analyzer = analyzer('entity_synonym_index_analyzer', tokenizer='keyword', filter=[turkish_lowercase, 'asciifolding', turkish_stemmer])

entity_synonym_search_analyzer = analyzer('entity_synonym_search_analyzer', tokenizer='standard',
                                    filter=[turkish_lowercase, 'apostrophe', 'asciifolding',
                                            custom_shingle_filter, turkish_stemmer])


class Entity(DocType):
    entity_synonyms = String(index_analyzer=entity_synonym_index_analyzer,
                             search_analyzer=entity_synonym_search_analyzer,
                             include_in_all=True)
    entity_key = String(index='not_analyzed', include_in_all=False)
    value = String(index='not_analyzed', include_in_all=False)

    @classmethod
    def _get_index(self, index=None):
        return settings.ELASTICSEARCH_INDEX

    @classmethod
    def _get_doctype(self):
Example #46
from elasticsearch_dsl import DocType, String, token_filter, analyzer
from django.conf import settings

__author__ = 'erhmutlu'

turkish_stop = token_filter('turkish_stop', type='stop', stopwords="_turkish_")
turkish_lowercase = token_filter('turkish_lowercase', type='lowercase', language="turkish")
turkish_stemmer = token_filter('turkish_stemmer', type='stemmer', language='turkish')

turkish_whitespace_analyzer = analyzer('turkish_whitespace_analyzer', tokenizer='whitespace', filter=['apostrophe', 'asciifolding',
                                                                                turkish_lowercase, turkish_stop,
                                                                                turkish_stemmer])


class Intent(DocType):
    sentence = String(analyzer=turkish_whitespace_analyzer, include_in_all=True)
    original_sentence = String(analyzer='whitespace', include_in_all=False)
    action = String(index='not_analyzed', include_in_all=False)
    params = String(index='not_analyzed', include_in_all=False)

    def dict_with_id(self):
        dict = super(DocType, self).to_dict()
        dict['id'] = self._id
        return dict

    @classmethod
    def _get_index(self, index=None):
        return settings.ELASTICSEARCH_INDEX

    @classmethod
    def _get_doctype(self):
Example #47
"""
from __future__ import print_function, unicode_literals

from itertools import permutations

from elasticsearch_dsl import connections, Document, Completion, Text, Long, \
        Keyword, analyzer, token_filter

# custom analyzer for names
ascii_fold = analyzer(
    'ascii_fold',
    # we don't want to split O'Brian or Toulouse-Lautrec
    tokenizer='whitespace',
    filter=[
        'lowercase',
        token_filter('ascii_fold', 'asciifolding')
    ]
)


class Person(Document):
    name = Text(fields={'keyword': Keyword()})
    popularity = Long()

    # completion field with a custom analyzer
    suggest = Completion(analyzer=ascii_fold)

    def clean(self):
        """
        Automatically construct the suggestion input and weight by taking all
        possible permutation of Person's name as ``input`` and taking their
Example #48
from itertools import permutations

from elasticsearch_dsl import Document, Integer, Text, Keyword, Completion, analyzer, token_filter, GeoPoint, Date

# custom analyzer for names
ascii_fold = analyzer(
    'ascii_fold',
    # we don't want to split O'Brian or Toulouse-Lautrec
    tokenizer='whitespace',
    filter=[
        'lowercase',
        token_filter('ascii_fold', 'asciifolding')
    ]
)


class Entity(Document):
    project_id = Integer()
    file_id = Integer()
    id = Text()
    name = Text(fields={'keywords': Keyword()})
    suggest = Completion(analyzer=ascii_fold)

    def clean(self):
        """
        Automatically construct the suggestion input and weight by taking all
        possible permutation of Person's name as ``input`` and taking their
        popularity as ``weight``.
        """
        self.suggest = {
            'input': [' '.join(p) for p in permutations(self.name.split())],
Example #49
# therefore we want to match all and anything.
# E.g. "center" should find "...the center of..." and "...the centre for..."
# But also, should find the same when searching for "centre".
# So, rearrange the ba-ae.synonyms file for what's called
# "Simple expansion".
# https://www.elastic.co/guide/en/elasticsearch/guide/current/synonyms-expand-or-contract.html#synonyms-expansion  # noqa
#
with open(american_british_syns_fn) as f:
    for line in f:
        if "=>" not in line or line.strip().startswith("#"):
            continue
        all_synonyms.append(line.strip())


synonym_tokenfilter = token_filter(
    "synonym_tokenfilter", "synonym", synonyms=all_synonyms
)


edge_ngram_analyzer = analyzer(
    "edge_ngram_analyzer",
    type="custom",
    tokenizer="standard",
    filter=[
        "lowercase",
        token_filter("edge_ngram_filter", type="edgeNGram", min_gram=1, max_gram=20),
    ],
)

text_analyzer = analyzer(
    "text_analyzer",
Example #50
__author__ = ["Amir Hossein Sorouri"]
__copyright__ = "Copyright 2019, DSL-SE"
__email__ = ["*****@*****.**"]
__license__ = "Apache-2.0"
__version__ = "2.0"

from . import url
from datetime import datetime
from elasticsearch_dsl import Document, Date, token_filter \
        , DateRange, Keyword, Text, Object, analyzer

synonym_tokenfilter = token_filter(
    'synonym_tokenfilter',
    'synonym',
    synonyms=[
        'reactjs, react',  # <-- important
    ],
)

text_analyzer = analyzer(
    'text_analyzer',
    tokenizer='standard',
    filter=[
        # The ORDER is important here.
        'standard',
        'lowercase',
        'stop',
        synonym_tokenfilter,
        # Note! 'snowball' comes after 'synonym_tokenfilter'
        'snowball',
    ],
Example #51
# override the default analyzer for ES to use an ngram filter that breaks words using
# the standard tokenizer. Allow words to be broken up with underscores
custom_analyzer = analyzer(
    "default",
    # the standard analyzer splits the words nicely by default
    tokenizer=tokenizer("standard"),
    filter=[
        # technically, the standard filter doesn't do anything but we include
        # it anyway just in case ES decides to make use of it
        "standard",
        # unfortunately, underscores are not used to break up words with the
        # standard tokenizer, so we do it ourselves
        token_filter(
            "underscore",
            type="pattern_capture",
            patterns=["([^_]+)"],
        ),
        # obviously, lowercasing the tokens is a good thing
        "lowercase",
        # ngram it up. Might want to change from an edge ngram to just an ngram
        token_filter(
            "simple_edge",
            type="edgeNGram",
            min_gram=2,
            max_gram=3
        )
    ]
)