Example #1
def url_ngram_analyzer():
    """
    An analyzer for creating URL safe n-grams.

    Returns:
        Analyzer
    """
    return analyzer(
        'url_ngram',
        tokenizer='keyword',
        filter=[
            'lowercase',
            # Strip the protocol, we don't need it for searching.
            token_filter(
                'url_protocol_filter',
                type='pattern_replace',
                pattern=r'^\w+:\/\/',
                replace='',
            ),
            'word_delimiter',
            # Create trigrams from the address.
            token_filter(
                'url_ngram_filter',
                type='ngram',
                min_gram=3,
                max_gram=3,
            ),
        ],
    )
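
A quick way to see what this analyzer emits is `Analyzer.simulate()`, the same helper the test snippets further down use. A minimal sketch, assuming a default elasticsearch_dsl connection has already been registered via connections.create_connection():

# Hypothetical usage: inspect the trigrams produced for a URL.
tokens = url_ngram_analyzer().simulate('https://example.com/about').tokens
print([t.token for t in tokens])
# Expected roughly: trigrams of 'example', 'com' and 'about', e.g. 'exa', 'xam', ..., 'abo', 'bou', 'out'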
Example #2
def configure_index(idx):
    """Configure ES index settings.

    NOTE: This is unused at the moment. Current issues:
    1. The index needs to be created (index.create() or search_index --create)
    with update_all_types=True, because the attribute name is the same
    in Person and Company.
    https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.client.IndicesClient.create

    name = fields.TextField(attr="fullname", analyzer=lb_analyzer)

    2. How to specify a token filter for an attribute?

    Therefore the index needs to be configured outside Django.
    """
    idx.settings(number_of_shards=1, number_of_replicas=0)
    lb_filter = token_filter(
        "lb_filter",
        "stop",
        stopwords=["i"]
    )
    lb_analyzer = analyzer(
        "lb_analyzer",
        tokenizer="standard",
        filter=["standard", "lb_filter", "asciifolding", "lowercase"]
    )
    return lb_analyzer, lb_filter
Example #3
def email_main_analyzer():
    """
    An analyzer for creating "words" from email addresses.

    This analyzer splits email addresses on special characters. For example,
    john.doe@crm.example.com would become [john, doe, crm, example, com].
    These tokens, when combined with ngrams, provide nice fuzzy matching while
    boosting full word matches.

    Returns:
        Analyzer: An analyzer suitable for analyzing email addresses.
    """
    return analyzer(
        'email',
        # We tokenize with token filters, so use the no-op keyword tokenizer.
        tokenizer='keyword',
        filter=[
            'lowercase',
            # Split the email address on special characters.
            token_filter(
                'email_word_delimiter',
                type='word_delimiter',
                # Ensure words like hello2lily are kept as one token.
                split_on_numerics=False,
            ),
        ],
    )
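
A minimal sketch of the behaviour the docstring describes, again assuming a registered default connection:

# Hypothetical check of the word-splitting described above.
tokens = email_main_analyzer().simulate('john.doe@crm.example.com').tokens
print([t.token for t in tokens])  # roughly: ['john', 'doe', 'crm', 'example', 'com']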
def test_analyzers_returned_from_to_dict():
    random_analyzer_name = ''.join((choice(string.ascii_letters) for _ in range(100)))
    random_analyzer = analyzer(random_analyzer_name, tokenizer="standard", filter="standard")
    index = Index('i', using='alias')
    index.analyzer(random_analyzer)

    assert index.to_dict()["settings"]["analysis"]["analyzer"][random_analyzer_name] == {"filter": ["standard"], "type": "custom", "tokenizer": "standard"}
Example #5
    def suggest():
        suggestfilter = token_filter("suggestfilter", type="ngram",
                                     **other_param)

        bestsuggest = analyzer("bestsuggest", tokenizer="standard",
                               filter=["lowercase", suggestfilter,
                                       "asciifolding"])
        return bestsuggest
def ngram(min_gram=2, max_gram=4):
    base_name = "ngram_%d_%d" % (min_gram, max_gram)
    
    return dsl.analyzer(base_name + "_analyzer",
        tokenizer=dsl.tokenizer(base_name + "_tokenizer", 'nGram',
            min_gram=min_gram,
            max_gram=max_gram,
            token_chars=[ "letter", "digit" ]),
        filter=['lowercase'])
def test_simulate_complex(client):
    a = analyzer('my-analyzer',
                 tokenizer=tokenizer('split_words', 'simple_pattern_split', pattern=':'),
                 filter=['lowercase', token_filter('no-ifs', 'stop', stopwords=['if'])])

    tokens = a.simulate('if:this:works', using=client).tokens

    assert len(tokens) == 2
    assert ['this', 'works'] == [t.token for t in tokens]
Example #8
def get_default_text_analyzer():
    return analyzer(
        'froide_analyzer',
        tokenizer='standard',
        filter=[
            'standard',
            'lowercase',
            'asciifolding',
        ]
    )
Example #9
def get_default_ngram_analyzer():
    return analyzer(
        'froide_ngram_analyzer',
        tokenizer=tokenizer(
            'froide_ngram_tokenzier',
            type='edge_ngram',
            min_gram=1,
            max_gram=15,
            token_chars=['letter', 'digit']
        ),
        filter=[
            'standard',
            'lowercase',
            'asciifolding',
        ]
    )
def test_cloned_index_has_analysis_attribute():
    """
    Regression test for Issue #582 in which `Index.clone()` was not copying
    over the `_analysis` attribute.
    """
    client = object()
    i = Index('my-index', using=client)

    random_analyzer_name = ''.join((choice(string.ascii_letters) for _ in range(100)))
    random_analyzer = analyzer(random_analyzer_name, tokenizer="standard", filter="standard")

    i.analyzer(random_analyzer)

    i2 = i.clone('my-clone-index')

    assert i.to_dict()['settings']['analysis'] == i2.to_dict()['settings']['analysis']
Example #11
def get_text_analyzer():
    return analyzer(
        'fds_analyzer',
        tokenizer='standard',
        filter=[
            'keyword_repeat',
            token_filter('decomp', type='decompound', subwords_only=True),

            'lowercase',
            token_filter('stop_de', type='stop', stopwords="_german_"),

            'german_normalization',
            'asciifolding',

            token_filter('de_stemmer', type='stemmer', name='light_german'),
            token_filter('unique_stem', type='unique', only_on_same_position=True)
        ],
    )
Example #12
def standard_ascii_analyzer():
    """
    Elasticsearch's standard analyzer with asciifolding.

    The asciifolding filter converts non-ascii letters to their ascii
    counterparts. It essentially cleans diacritics from strings.

    Returns:
        Analyzer
    """
    return analyzer(
        'standard_ascii',
        tokenizer='standard',
        filter=[
            'standard',
            'lowercase',
            'asciifolding',
        ]
    )
def test_cloned_index_has_analysis_attribute():
    """
    Regression test for Issue #582 in which `Index.clone()` was not copying
    over the `_analysis` attribute.
    """
    client = object()
    i = Index('my-index', using=client)

    random_analyzer_name = ''.join(
        (choice(string.ascii_letters) for _ in range(100)))
    random_analyzer = analyzer(random_analyzer_name,
                               tokenizer="standard",
                               filter="standard")

    i.analyzer(random_analyzer)

    i2 = i.clone('my-clone-index')

    assert i.to_dict()['settings']['analysis'] == i2.to_dict(
    )['settings']['analysis']
Example #14
def add_analyzer(index: Index):
    """Agrega un nuevo analyzer al índice, disponible para ser usado
    en todos sus fields. El analyzer aplica lower case + ascii fold:
    quita acentos y uso de ñ, entre otros, para permitir búsqueda de
    texto en español
    """

    synonyms = list(Synonym.objects.values_list('terms', flat=True))

    filters = ['lowercase', 'asciifolding']
    if synonyms:
        filters.append(token_filter(constants.SYNONYM_FILTER,
                                    type='synonym',
                                    synonyms=synonyms))

    index.analyzer(
        analyzer(constants.ANALYZER,
                 tokenizer='standard',
                 filter=filters)
    )
Example #15
def gen_name_analyzer_excluding_terms(excluding_terms):
    """Crea un analizador para nombres que sólo retorna TE (términos
    excluyentes).

    Por ejemplo, si el archivo de configuración de TE contiene las siguientes
    reglas:

    santa, salta, santo
    caba, cba

    Entonces, aplicar el analizador a la búsqueda 'salta' debería retornar
    'santa' y 'santo', mientras que buscar 'caba' debería retornar 'cba'.

    El analizador se utiliza para excluir resultados de búsquedas específicas.

    Args:
        excluding_terms (list): Lista de TE a utilizar especificados como
            sinónimos Solr.

    Returns:
        elasticsearch_dsl.analysis.Analyzer: analizador de texto con nombre
            'name_analyzer_excluding_terms'.

    """
    name_excluding_terms_filter = token_filter(
        'name_excluding_terms_filter',
        type='synonym',
        synonyms=excluding_terms
    )

    return analyzer(
        name_analyzer_excluding_terms,
        tokenizer='standard',
        filter=[
            'lowercase',
            'asciifolding',
            name_excluding_terms_filter,
            synonyms_only_filter,
            spanish_stopwords_filter
        ]
    )
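
For reference, `excluding_terms` is simply a list of Solr-style synonym rules like the ones quoted in the docstring; a hedged usage sketch (the module-level filters such as `synonyms_only_filter` are assumed to exist as in the original project):

# Hypothetical call using the rules from the docstring above.
excluding_terms = [
    'santa, salta, santo',
    'caba, cba',
]
name_analyzer = gen_name_analyzer_excluding_terms(excluding_terms)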
Example #16
def gen_name_analyzer_synonyms(synonyms):
    """Crea un analizador para nombres con sinónimos.

    Args:
        synonyms (list): Lista de sinónimos a utilizar, en formato Solr.

    Returns:
        elasticsearch_dsl.analysis.Analyzer: analizador de texto con nombre
            'name_analyzer_synonyms'.

    """
    name_synonyms_filter = token_filter('name_synonyms_filter',
                                        type='synonym',
                                        synonyms=synonyms)

    return analyzer(name_analyzer_synonyms,
                    tokenizer='standard',
                    filter=[
                        'lowercase', 'asciifolding', name_synonyms_filter,
                        spanish_stopwords_filter
                    ])
Example #17
class Post(DocType):

    date = Date()
    url = Keyword()
    author = Keyword()
    topic = Keyword()
    board = Keyword()

    title_origin = Keyword()
    title_unigram = Text(analyzer=analyzer('whitespace'))
    title_ccjieba = Text(analyzer=analyzer('whitespace'))
    title_pos = Text(analyzer=analyzer('whitespace'))
    title_quality = HalfFloat()
    comments = Nested(
        properties={
            'comment_author': Keyword(),
            'comment_origin': Keyword(),
            'comment_unigram': Text(analyzer=analyzer('whitespace')),
            'comment_ccjieba': Text(analyzer=analyzer('whitespace')),
            'comment_pos': Text(analyzer=analyzer('whitespace')),
            'comment_audio_url': Keyword(),
            'comment_quality': HalfFloat()
        })

    class Meta:
        index = 'post'

    def save(self, *args, **kwargs):
        return super(Post, self).save(*args, **kwargs)

    def add_comment(self, comment_author, comment_origin, comment_unigram,
                    comment_ccjieba, comment_pos, comment_audio_url,
                    comment_quality):

        self.comments.append({
            'comment_author': comment_author,
            'comment_origin': comment_origin,
            'comment_unigram': comment_unigram,
            'comment_ccjieba': comment_ccjieba,
            'comment_pos': comment_pos,
            'comment_audio_url': comment_audio_url,
            'comment_quality': comment_quality
        })

    def bulk_dicts(docs):
        dicts = (d.to_dict(include_meta=True) for d in docs)
        return dicts
Example #18
def configure_index(idx):
    """Configure ES index settings.

    NOTE: This is unused at the moment. Current issues:
    1. The index needs to be created (index.create() or search_index --create)
    with update_all_types=True, because the attribute name is the same
    in Person and Company.
    https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.client.IndicesClient.create

    name = fields.TextField(attr="fullname", analyzer=lb_analyzer)

    2. How to specify a token filter for an attribute?

    Therefore the index needs to be configured outside Django.
    """
    idx.settings(number_of_shards=1, number_of_replicas=0)
    lb_filter = token_filter("lb_filter", "stop", stopwords=["i"])
    lb_analyzer = analyzer(
        "lb_analyzer",
        tokenizer="standard",
        filter=["standard", "lb_filter", "asciifolding", "lowercase"])
    return lb_analyzer, lb_filter
Example #19
def url_main_analyzer():
    """
    An analyzer for creating "words" from URLs.

    Returns:
        Analyzer
    """
    return analyzer(
        'url',
        tokenizer='keyword',
        filter=[
            'lowercase',
            # Strip the protocol, we don't need it for searching.
            token_filter(
                'url_protocol_filter',
                type='pattern_replace',
                pattern=r'^\w+:\/\/',
                replace='',
            ),
            'word_delimiter',
        ],
    )
Example #20
def url_main_analyzer():
    """
    An analyzer for creating "words" from URLs.

    Returns:
        Analyzer
    """
    return analyzer(
        'url',
        tokenizer='keyword',
        filter=[
            'lowercase',
            # Strip the protocol, we don't need it for searching.
            token_filter(
                'url_protocol_filter',
                type='pattern_replace',
                pattern=r'^\w+:\/\/',
                replace='',
            ),
            'word_delimiter',
        ],
    )
Example #21
def phone_number_analyzer():
    """
    An analyzer to do complex partial matching on phone numbers.

    Returns:
        Analyzer: An analyzer suitable for searching phone numbers.
    """
    return analyzer(
        'phone_number',
        # We only want n-grams, which we want to create as late as possible.
        tokenizer='keyword',
        filter=[
            # Strip all special chars, don't tokenize.
            token_filter(
                'phone_word_delimiter',
                type='word_delimiter',
                generate_word_parts=False,
                generate_number_parts=False,
                catenate_all=True,
            ),
            # Strip any zeros from the start of the number.
            token_filter(
                'leading_zero_filter',
                type='pattern_replace',
                pattern='^(0+)',
                replace='',
            ),
            # Create n-grams of all lengths to support partial matching.
            token_filter(
                'phone_ngram_filter',
                type='ngram',
                # Still undecided on whether this should be 3 or 4.
                # 3 means users have to type fewer digits to get results,
                # but the matching is less accurate.
                min_gram=3,
                max_gram=32,
            ),
        ],
    )
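
A short, hypothetical walk-through of the chain above using simulate() (assuming a registered default connection and an Elasticsearch version that accepts this n-gram range inline):

# '020-123-4567' is catenated by the word delimiter into '0201234567',
# the leading zero is stripped, and '201234567' is expanded into n-grams.
tokens = phone_number_analyzer().simulate('020-123-4567').tokens
print(sorted({t.token for t in tokens}))  # e.g. '012', '123', ..., up to the full '201234567'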
Example #22
def phone_number_analyzer():
    """
    An analyzer to do complex partial matching on phone numbers.

    Returns:
        Analyzer: An analyzer suitable for searching phone numbers.
    """
    return analyzer(
        'phone_number',
        # We only want n-grams, which we want to create as late as possible.
        tokenizer='keyword',
        filter=[
            # Strip all special chars, don't tokenize.
            token_filter(
                'phone_word_delimiter',
                type='word_delimiter',
                generate_word_parts=False,
                generate_number_parts=False,
                catenate_all=True,
            ),
            # Strip any zeros from the start of the number.
            token_filter(
                'leading_zero_filter',
                type='pattern_replace',
                pattern='^(0+)',
                replace='',
            ),
            # Create n-grams of all lengths to support partial matching.
            token_filter(
                'phone_ngram_filter',
                type='ngram',
                # Still undecided on whether this should be 3 or 4.
                # 3 means users have to type fewer digits to get results,
                # but the matching is less accurate.
                min_gram=3,
                max_gram=32,
            ),
        ],
    )
Example #23
def es_analyzer_for_locale(locale, search_analyzer=False):
    """Pick an appropriate analyzer for a given locale.
    If no analyzer is defined for `locale`, or the locale's analyzer uses a plugin
    but plugins are disabled in the settings, return an analyzer named "default_sumo".
    """

    name = ""
    analyzer_config = config.ES_LOCALE_ANALYZERS.get(locale)

    if not analyzer_config or (analyzer_config.get("plugin")
                               and not settings.ES_USE_PLUGINS):
        name = config.ES_DEFAULT_ANALYZER_NAME
        analyzer_config = {}

    # use default values from ES_DEFAULT_ANALYZER if not overridden
    # using python 3.9's dict union operator
    analyzer_config = config.ES_DEFAULT_ANALYZER | analyzer_config

    # turn dictionaries into `char_filter` and `token_filter` instances
    filters = _insert_custom_filters(name or locale, analyzer_config["filter"])
    char_filters = _insert_custom_filters(name or locale,
                                          analyzer_config["char_filter"],
                                          char=True)

    if search_analyzer:
        # create a locale-specific search analyzer, even if the index-time analyzer is
        # `sumo_default`. we do this so that we can adjust the synonyms used in any locale,
        # even if it doesn't have a custom analysis chain set up, without having to re-index
        name = locale + "_search_analyzer"
        filters.append(
            _create_synonym_graph_filter(config.ES_ALL_SYNONYMS_NAME))
        filters.append(_create_synonym_graph_filter(locale))

    return analyzer(
        name or locale,
        tokenizer=analyzer_config["tokenizer"],
        filter=filters,
        char_filter=char_filters,
    )
Example #24
def email_ngram_analyzer():
    """
    An analyzer for creating email safe ngrams.

    This analyzer first splits the local part and domain name, then creates
    n-grams (overlapping fragments) from the remaining strings, minus any
    special characters.

    Returns:
        Analyzer: An analyzer suitable for analyzing email addresses.
    """
    return analyzer(
        'email_ngram',
        # Split the email address at the @ sign.
        tokenizer=tokenizer(
            'at_sign_tokenizer',
            type='pattern',
            pattern='@',
        ),
        filter=[
            'lowercase',
            # Strip any special characters from the email address.
            token_filter(
                'email_ngram_word_delimiter',
                type='word_delimiter',
                split_on_numerics=False,
                catenate_all=True,
            ),
            # Create trigrams from the address.
            token_filter(
                'email_ngram_filter',
                type='ngram',
                min_gram=3,
                max_gram=3,
            ),
        ],
    )
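
The two email analyzers in this section are meant to be combined on one field (whole words for boosted exact matches, n-grams for fuzzy matching); a minimal, hypothetical mapping sketch:

from elasticsearch_dsl import Document, Text

class Contact(Document):  # hypothetical document type, not part of the original examples
    email = Text(
        analyzer=email_main_analyzer(),
        fields={'ngram': Text(analyzer=email_ngram_analyzer())},
    )

    class Index:
        name = 'contacts'  # assumed index name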
Example #25
class LagouJob(Document):
    # suggestion
    suggestion = Completion(analyzer=analyzer('ik_smart'))
    job_id = Keyword()
    # job title
    title = Text(analyzer="ik_max_word")
    # url
    url = Keyword()
    # salary
    salary = FloatRange()
    # # minimum salary
    # salary_min = Float()
    # work experience
    work_years = FloatRange()
    # # minimum years
    # work_year_min = Integer()
    # education requirement
    degree_need = Float()
    # job type: internship / part-time / full-time
    job_type = Keyword()
    # publish time
    publish_time = Date()
    # job perks
    job_advantage = Text(analyzer="ik_max_word")
    # job description
    job_desc = Text(analyzer="ik_max_word")
    # job city
    job_city = Keyword()
    # job address
    job_addr = Text(analyzer="ik_max_word")
    # company url
    company_url = Keyword()
    # company name
    company_name = Keyword()

    class Index:
        name = 'a51job'
Example #26
def email_ngram_analyzer():
    """
    An analyzer for creating email safe ngrams.

    This analyzer first splits the local part and domain name, then creates
    n-grams (overlapping fragments) from the remaining strings, minus any
    special characters.

    Returns:
        Analyzer: An analyzer suitable for analyzing email addresses.
    """
    return analyzer(
        'email_ngram',
        # Split the email address at the @ sign.
        tokenizer=tokenizer(
            'at_sign_tokenizer',
            type='pattern',
            pattern='@',
        ),
        filter=[
            'lowercase',
            # Strip any special characters from the email address.
            token_filter(
                'email_ngram_word_delimiter',
                type='word_delimiter',
                split_on_numerics=False,
                catenate_all=True,
            ),
            # Create trigrams from the address.
            token_filter(
                'email_ngram_filter',
                type='ngram',
                min_gram=3,
                max_gram=3,
            ),
        ],
    )
Example #27
def bigram_analyzer():
    """
    A n-gram analyzer of length 2.

    Bigrams provide nice partial, fuzzy matching.

    Returns:
        Analyzer
    """
    return analyzer(
        'bigram',
        tokenizer=tokenizer(
            'bigram_tokenizer',
            type='ngram',
            min_gram=2,
            max_gram=2,
            token_chars=['letter', 'digit'],
        ),
        filter=[
            'standard',
            'lowercase',
            'asciifolding',
        ],
    )
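
A brief sketch of the bigrams this yields, assuming a registered default connection:

tokens = bigram_analyzer().simulate('Lily').tokens
print([t.token for t in tokens])  # roughly: ['li', 'il', 'ly']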
Example #28
def gen_suggests(index, info_tuple):
    # Build a list of search suggestions from the given strings
    used_words = set()
    suggests = []
    for text, weight in info_tuple:
        if text:
            # Call ES's analyze API to analyze the string
            ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"])
            my_analyzer = analyzer('my_analyzer',
                                   tokenizer=tokenizer('trigram',
                                                       'nGram',
                                                       min_gram=3,
                                                       max_gram=3),
                                   filter=['lowercase'])
            i = Index(index)
            i._analysis = ik_analyzer
            # i.analyzer(analyzer=ik_analyzer)

            # i.analyzer.default.type: "ik_max_word"
            a = i.analyze(params={'filter': ["lowercase"]}, body=text)

            # i.analyzer(analyzer = "ik_max_word")

            words = es.indices.analyze(index=index,
                                       params={'filter': ["lowercase"]},
                                       body=text)
            analyzed_words = set(
                [r["token"] for r in words["tokens"] if len(r["token"]) > 1])
            new_words = analyzed_words - used_words
        else:
            new_words = set()

        if new_words:
            suggests.append({"input": list(new_words), "weight": weight})

    return suggests
Example #29
class Book(Document):
    """An objects representing a book in ES
    """
    title = Text(
        fields={
            "no_vowels":
            Text(
                analyzer=analyzer("no_vowels", "pattern",
                                  pattern="[\Waeiouy]"),  # noqa: W605
                search_analyzer="standard")
        })
    ref = Keyword() if MAJOR_ES > 2 else Text(index="not_analyzed")
    edition = Text()
    author = Object(properties={"name": Text(), "birthdate": Date()})
    publication_date = Date()
    n_pages = Integer()

    if ES6:
        illustrators = Nested(Illustrator)

        class Index:
            name = "bk"

    else:
        illustrators = Nested(
            properties={
                "name":
                Text(),
                "birthdate":
                Date(),
                "nationality":
                Keyword() if MAJOR_ES > 2 else Text(index="not_analyzed"),
            })

        class Meta:
            index = "bk"
Example #30
    def _analyzer(self, species, dump=True):
        autophrase_syns, syns = self._synonyms(species)

        if dump:
            with open('autophrase_syns.txt', 'w') as f:
                f.writelines(l + '\n' for l in autophrase_syns)

            with open('syns.txt', 'w') as f:
                f.writelines(l + '\n' for l in syns)

        autophrase_filter = token_filter('species_autophrase_syn',
                                         type='synonym',
                                         synonyms=autophrase_syns)

        syn_filter = token_filter('species_syn',
                                  type='synonym',
                                  tokenizer='keyword',
                                  synonyms=syns)

        return analyzer(
            'species_analyzer',
            tokenizer='lowercase',
            filter=[autophrase_filter, syn_filter],
        )
Example #31
def bigram_analyzer():
    """
    A n-gram analyzer of length 2.

    Bigrams provide nice partial, fuzzy matching.

    Returns:
        Analyzer
    """
    return analyzer(
        'bigram',
        tokenizer=tokenizer(
            'bigram_tokenizer',
            type='ngram',
            min_gram=2,
            max_gram=2,
            token_chars=['letter', 'digit'],
        ),
        filter=[
            'standard',
            'lowercase',
            'asciifolding',
        ],
    )
Example #32
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, absolute_import

from elasticsearch_dsl import analyzer, tokenizer

# autocomplete tokenizer
edge_ngram_tokenizer = tokenizer('edge_ngram_tokenizer',
                                 type='edge_ngram',
                                 min_gram=1,
                                 max_gram=20,
                                 token_chars=['letter', 'digit'])

# autocomplete analyzer
edge_ngram_analyzer = analyzer(
    'edge_ngram_analyzer',
    tokenizer=edge_ngram_tokenizer,
    filter=['lowercase', 'asciifolding'],
)

# autocomplete *search*  tokenizer
edge_ngram_search_tokenizer = tokenizer('edge_ngram_search_tokenizer',
                                        type='edge_ngram',
                                        token_chars=['letter', 'digit'])

search_tokenizer = tokenizer('search_tokenizer',
                             type='standard',
                             token_chars=['letter', 'digit'])

# autocomplete *search* analyzer
edge_ngram_search_analyzer = analyzer(
    'edge_ngram_search_analyzer',
Example #33
from elasticsearch_dsl import (
    Date,
    Document,
    Float,
    Integer,
    Keyword,
    Text,
    analyzer,
    token_filter,
)

edge_ngram_analyzer = analyzer(
    "edge_ngram_analyzer",
    type="custom",
    tokenizer="standard",
    filter=[
        "lowercase",
        token_filter("edge_ngram_filter", type="edgeNGram", min_gram=1, max_gram=20),
    ],
)


class PodcastDoc(Document):
    id = Keyword(required=True)
    thumbnail_348 = Keyword()
    thumbnail_160 = Keyword()
    times_picked = Integer()
    episodes_count = Integer()
    episodes_seconds = Float()
    slug = Keyword(required=True, index=False)
    name = Text(required=True, analyzer=edge_ngram_analyzer, search_analyzer="standard")
Example #34
__author__ = 'mtianyan'
__date__ = '2017/6/25 10:18'

from elasticsearch_dsl import connections, Document, Text, Keyword, Integer, Date, Completion, analyzer

connections.create_connection(hosts=["localhost"])

my_analyzer = analyzer('ik_smart')


class LagouJobIndex(Document):
    suggest = Completion(analyzer=my_analyzer)
    title = Text(analyzer="ik_max_word")
    url = Keyword()
    url_object_id = Keyword()
    salary_min = Integer()
    salary_max = Integer()
    job_city = Keyword()
    work_years_min = Integer()
    work_years_max = Integer()
    degree_need = Text(analyzer="ik_max_word")
    job_type = Keyword()
    publish_time = Date()
    job_advantage = Text(analyzer="ik_max_word")
    job_desc = Text(analyzer="ik_smart")
    job_addr = Text(analyzer="ik_max_word")
    company_name = Keyword()
    company_url = Keyword()
    tags = Text(analyzer="ik_max_word")
    crawl_time = Date()
Example #35
"""
import logging
import typing

import elasticsearch_dsl as es
from elasticsearch_dsl import analysis

log = logging.getLogger(__name__)

edge_ngram_filter = analysis.token_filter('edge_ngram_filter',
                                          type='edge_ngram',
                                          min_gram=1,
                                          max_gram=15)

autocomplete = es.analyzer(
    'autocomplete',
    tokenizer='standard',
    filter=['standard', 'asciifolding', 'lowercase', edge_ngram_filter])


class User(es.DocType):
    """Elastic document describing user."""

    objectID = es.Keyword()

    username = es.Text(fielddata=True, analyzer=autocomplete)
    username_exact = es.Keyword()
    full_name = es.Text(fielddata=True, analyzer=autocomplete)

    roles = es.Keyword(multi=True)
    groups = es.Keyword(multi=True)
    """Elasticsearch DSL Date field chokes on None values and parses empty
    strings as current date, hence the workaround.
    TODO: move this upstream in some form."""

    def _to_python(self, data):
        if data is None:
            return data
        return super(NoneAwareDate, self)._to_python(data)


namesAutocompleteAnalyzer = analyzer(
    "namesAutocompleteAnalyzer",
    tokenizer=tokenizer(
        "autocompleteTokenizer",
        type="edge_ngram",
        min_gram=1,
        max_gram=25,
        token_chars=["letter", "digit"],
    ),
    filter=["lowercase"],
)

namesAutocompleteSearchAnalyzer = analyzer(
    "namesAutocompleteSearchAnalyzer",
    tokenizer=tokenizer("whitespace"),

    filter=[
        "lowercase"
    ]
)
Example #37
)


naam_stripper = analysis.char_filter(
    'naam_stripper',
    type='mapping',
    mappings=[
        "-=>' '",   # change '-' to separator
        ".=>' '",   # change '.' to separator
    ]
)


kadastrale_aanduiding = es.analyzer(
    'kadastrale_aanduiding',
    tokenizer='keyword',
    filter=['standard', 'lowercase']
)


adres = es.analyzer(
    'adres',
    tokenizer='standard',
    filter=['standard', 'lowercase', 'asciifolding', synonym_filter],
    char_filter=[adres_split, huisnummer_generate],
)


naam = es.analyzer(
    'naam',
    tokenizer='standard',
from elasticsearch_dsl.document import DocType
from elasticsearch_dsl import analyzer, String

# slovenian lemmanizer
lemmagen_sl = analyzer('lemmagen_sl', type='custom',
                       tokenizer="uax_url_email",
                       filter=["lowercase"],
                       )


class Document(DocType):
    """
    The :class:`Document` class defines a Type in ElasticSearch
    """
    title = String(analyzer=lemmagen_sl)
Example #39
            location += '?{}'.format(query_string)
        return location

    class Index:
        name = 'publication'
        settings = {
            'number_of_shards': 1
        }


autocomplete_analyzer = analyzer('autocomplete_analyzer',
                                 tokenizer=tokenizer(
                                    'edge_ngram_tokenizer',
                                    type='edge_ngram',
                                    min_gram=3,
                                    max_gram=10,
                                    token_chars=[
                                        "letter",
                                        "digit"
                                    ]),
                                 filter=['lowercase', 'asciifolding', 'trim'])


def get_search_index(model):
    lookup = {
        Author: AuthorDoc,
        Container: ContainerDoc,
        Platform: PlatformDoc,
        Sponsor: SponsorDoc,
        Tag: TagDoc,
    }
    DocType,
    Keyword,
    Text,
    Index,
    analyzer,
    tokenizer,
    token_filter,
    Date
)

namesAutocompleteAnalyzer = analyzer(
    "namesAutocompleteAnalyzer",
    tokenizer=tokenizer(
        "autocompleteTokenizer",
        type="edge_ngram",
        min_gram=1,
        max_gram=25,
        token_chars=["letter", "digit"],
    ),
    filter=["lowercase"],
)

namesAutocompleteSearchAnalyzer = analyzer(
    "namesAutocompleteSearchAnalyzer", tokenizer=tokenizer("lowercase")
)

ukrainianAddressesStopwordsAnalyzer = analyzer(
    "ukrainianAddressesStopwordsAnalyzer",
    type="ukrainian",
    filter=[
        token_filter(
kadaster_object_aanduiding = analysis.token_filter(
    'kad_obj_aanduiding_filter',
    type='ngram',
    min_gram=4,
    max_gram=16
)

####################################
#           Analyzers              #
####################################

bouwblok = es.analyzer(
    'bouwblok',
    tokenizer=tokenizer(
        'edge_ngram_filter',
        type='edge_ngram',
        min_gram=2, max_gram=4,
        token_chars=["letter", "digit"]),
    filter=['lowercase', divider_stripper],
    # char_filter=[divider_stripper]
)

adres = es.analyzer(
    'adres',
    tokenizer='standard',
    filter=['lowercase', 'asciifolding', synonym_filter],
    # filter=['lowercase', 'asciifolding'],
    char_filter=[naam_stripper],
)

straatnaam = es.analyzer(
    'straatnaam',
Example #42
from elasticmodels import BooleanField, DateField, Index, IntegerField, StringField
from elasticsearch_dsl import MetaField, analyzer, token_filter, tokenizer

from .models import Report

# override the default analyzer for ES to use an ngram filter that breaks words using
# the standard tokenizer. Allow words to be broken up with underscores
name = analyzer(
    "name",
    # the standard analyzer splits the words nicely by default
    tokenizer=tokenizer("standard"),
    filter=[
        # technically, the standard filter doesn't do anything but we include
        # it anyway just in case ES decides to make use of it
        "standard",
        # obviously, lowercasing the tokens is a good thing
        "lowercase",
        # this emulates a 3-4 ngram, but also includes the whole token itself
        # (which prevents us from having to create multifields)
        token_filter("simple_edge", type="pattern_capture", patterns=["(?=(...))(?=(....))"]),
    ],
)
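
To sanity-check the pattern_capture trick described in the comments, the analyzer can be simulated directly; a hedged sketch, assuming a configured default connection:

# Hypothetical: each token should yield itself plus its overlapping 3- and 4-character grams.
tokens = name.simulate('Reports').tokens
print([t.token for t in tokens])
# roughly: ['reports', 'rep', 'repo', 'epo', 'epor', 'por', 'port', 'ort', 'orts']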


class ReportIndex(Index):
    category = StringField(
        attr="category.name",
        # need a non_analyzed field for sorting
        fields={"raw": StringField(index="not_analyzed")},
    )
    category_id = IntegerField(attr="category.pk")
Example #43
from django_elasticsearch_dsl import DocType, Index, fields
from elasticsearch_dsl import analyzer, token_filter

from ..account.models import User
from ..order.models import Order
from ..product.models import Product

storefront = Index('storefront')
storefront.settings(number_of_shards=1, number_of_replicas=0)


partial_words = token_filter(
    'partial_words', 'edge_ngram', min_gram=3, max_gram=15)
title_analyzer = analyzer(
    'title_analyzer',
    tokenizer='standard',
    filter=[partial_words, 'lowercase'])
email_analyzer = analyzer('email_analyzer', tokenizer='uax_url_email')


@storefront.doc_type
class ProductDocument(DocType):
    title = fields.StringField(analyzer=title_analyzer)

    def prepare_title(self, instance):
        return instance.name

    class Meta:
        model = Product
        fields = ['name', 'description', 'is_published']
Example #44
from datetime import datetime
from pytz import timezone
from ipaddress import ip_address

from elasticsearch import ConflictError, NotFoundError

from elasticsearch_dsl import Document, Date, Text, Keyword, Mapping, InnerDoc, \
    Object, Nested, MetaField, Q, Long, Boolean, Double, Binary, Ip, analyzer
from elasticsearch_dsl.utils import AttrList

from pytest import raises, fixture

snowball = analyzer('my_snow',
    tokenizer='standard',
    filter=['standard', 'lowercase', 'snowball'])

class User(InnerDoc):
    name = Text(fields={'raw': Keyword()})

class Wiki(Document):
    owner = Object(User)
    views = Long()

    class Index:
        name = 'test-wiki'

class Repository(Document):
    owner = Object(User)
    created_at = Date()
    description = Text(analyzer=snowball)
    tags = Keyword()
Example #45
import elasticsearch_dsl as es
from django.conf import settings
from elasticsearch_dsl import analyzer, tokenizer

dutch_analyzer = es.analyzer('dutchanalyzer',
                             type='standard',
                             stopwords='_dutch_')

base_analyzer = analyzer('zorg_base_txt',
                         tokenizer=tokenizer('trigram',
                                             'nGram',
                                             min_gram=2,
                                             max_gram=20),
                         filter=['lowercase'])

_index = es.Index(settings.ELASTIC_INDEX)


@_index.doc_type
class Term(es.DocType):
    term = es.Text()
    gewicht = es.Integer()


@_index.doc_type
class Organisatie(es.DocType):
    ext_id = es.String(index='not_analyzed')
    naam = es.String(analyzer=dutch_analyzer)  # ngram
    beschrijving = es.String(analyzer=dutch_analyzer)
    afdeling = es.String(index='not_analyzed')
Example #46
from elasticsearch_dsl import DocType, String, token_filter, analyzer
from django.conf import settings

__author__ = 'erhmutlu'

turkish_stop = token_filter('turkish_stop', type='stop', stopwords="_turkish_")
turkish_lowercase = token_filter('turkish_lowercase', type='lowercase', language="turkish")
turkish_stemmer = token_filter('turkish_stemmer', type='stemmer', language='turkish')
custom_shingle_filter = token_filter('custom_shingle_filter', type='shingle', max_shingle_size=3, min_shingle_size=2,
                                     output_unigrams=True)

entity_synonym_index_analyzer = analyzer('entity_synonym_index_analyzer', tokenizer='keyword', filter=[turkish_lowercase, 'asciifolding', turkish_stemmer])

entity_synonym_search_analyzer = analyzer('entity_synonym_search_analyzer', tokenizer='standard',
                                    filter=[turkish_lowercase, 'apostrophe', 'asciifolding',
                                            custom_shingle_filter, turkish_stemmer])


class Entity(DocType):
    entity_synonyms = String(index_analyzer=entity_synonym_index_analyzer,
                             search_analyzer=entity_synonym_search_analyzer,
                             include_in_all=True)
    entity_key = String(index='not_analyzed', include_in_all=False)
    value = String(index='not_analyzed', include_in_all=False)

    @classmethod
    def _get_index(self, index=None):
        return settings.ELASTICSEARCH_INDEX

    @classmethod
    def _get_doctype(self):
Example #47
from itertools import permutations

from elasticsearch_dsl import Document, Integer, Text, Keyword, Completion, analyzer, token_filter, GeoPoint, Date

# custom analyzer for names
ascii_fold = analyzer(
    'ascii_fold',
    # we don't want to split O'Brian or Toulouse-Lautrec
    tokenizer='whitespace',
    filter=[
        'lowercase',
        token_filter('ascii_fold', 'asciifolding')
    ]
)


class Entity(Document):
    project_id = Integer()
    file_id = Integer()
    id = Text()
    name = Text(fields={'keywords': Keyword()})
    suggest = Completion(analyzer=ascii_fold)

    def clean(self):
        """
        Automatically construct the suggestion input and weight by taking all
        possible permutation of Person's name as ``input`` and taking their
        popularity as ``weight``.
        """
        self.suggest = {
            'input': [' '.join(p) for p in permutations(self.name.split())],
Example #48
from django_elasticsearch_dsl import DocType, Index, fields
from elasticsearch_dsl import analyzer, tokenizer
from genres.models import GenreNew

# Name of the Elasticsearch index
genre = Index('genres')
# See Elasticsearch Indices API reference for available settings
genre.settings(
    number_of_shards=1,
    number_of_replicas=0
)

html_strip = analyzer(
    'genre',
    tokenizer=tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
    filter=["lowercase"]
)

@genre.doc_type
class GenreDocument(DocType):
    name = fields.TextField(
        analyzer=html_strip,
        fields={'raw': fields.KeywordField()}
    )

    class Meta:
        model = GenreNew # The model associated with this DocType
        # queryset_pagination = 50000
        # The fields of the model you want to be indexed in Elasticsearch
        fields = [
            'id'
Example #49
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from elasticsearch_dsl import DocType, Text, Keyword, analyzer, MetaField, Date
from first import first
from packaging.version import parse as parse_version

from warehouse.search import doc_type


EmailAnalyzer = analyzer(
    "email",
    tokenizer="uax_url_email",
    filter=["standard", "lowercase", "stop", "snowball"],
)

NameAnalyzer = analyzer(
    "normalized_name",
    tokenizer="lowercase",
    filter=["standard", "lowercase", "word_delimiter"],
)


@doc_type
class Project(DocType):

    name = Text()
    normalized_name = Text(analyzer=NameAnalyzer, index_options="docs")
Example #50
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from elasticsearch_dsl import DocType, Integer, String, Date, Nested, Boolean, analyzer

html_strip = analyzer(
    'html_strip',
    tokenizer="standard",
    filter=["standard", "lowercase", "snowball", "stop"],
    char_filter=["html_strip"]
)

class RecipeIndex(DocType):
    document_id = Integer(index='not_analyzed')
    name = String(analyzer=html_strip)
    preparation_time = Integer(index='not_analyzed')

    ingredients = String(analyzer=html_strip)
    servings = Integer(index='not_analyzed')
    likes = Integer(index='not_analyzed')
    source_text = String()
    slug = String(index='no')
    source_slug = String(index='not_analyzed')
    large_image = String(index='no')
    last_updated = Date(index='not_analyzed')

    courses = String()
    cuisines = String()
    holidays = String()

    class Meta:
Example #51
from time import sleep

from elasticsearch_dsl.connections import connections
from elasticsearch_dsl import Index, Document, SearchAsYouType, analyzer, Search, Keyword
from elasticsearch_dsl.query import MultiMatch
import json

index = 'odrednica'
serbianAnalyzer = analyzer('serbian')
host = 'localhost'
data = []


class Odrednica(Document):
    pk = Keyword()
    rec = Keyword()
    varijante = SearchAsYouType(analyzer=serbianAnalyzer)
    vrsta = Keyword()


def createIndex():
    connections.create_connection(hosts=[host], timeout=20)
    if not connections.get_connection().indices.exists(index):
        odrednicaIdx = Index(index)
        odrednicaIdx.analyzer(serbianAnalyzer)
        odrednicaIdx.document(Odrednica)
        odrednicaIdx.create()


def saveOdrednica(item):
    varijante = ' '.join(item['varijante'])
Example #52
fr_stem_filter = token_filter(
    'fr_stem_filter', type='stemmer', language='minimal_french')
# Deal with French specific aspects.
fr_elision = token_filter(
    'fr_elision',
    type='elision',
    articles=[
        'l', 'm', 't', 'qu', 'n', 's', 'j', 'd', 'c',
        'jusqu', 'quoiqu', 'lorsqu', 'puisqu'
    ]
)

# Languages related analyzers.
de_analyzer = analyzer(
    'de_analyzer',
    tokenizer=tokenizer('icu_tokenizer'),
    filter=['icu_folding', 'icu_normalizer', de_stop_filter, de_stem_filter],
    char_filter=[char_filter('html_strip')]
)

en_analyzer = analyzer(
    'en_analyzer',
    tokenizer=tokenizer('icu_tokenizer'),
    filter=['icu_folding', 'icu_normalizer', en_stop_filter, en_stem_filter],
    char_filter=[char_filter('html_strip')]
)

es_analyzer = analyzer(
    'es_analyzer',
    tokenizer=tokenizer('icu_tokenizer'),
    filter=['icu_folding', 'icu_normalizer', es_stop_filter, es_stem_filter],
    char_filter=[char_filter('html_strip')]
Example #53
synonym_tokenfilter = token_filter(
    'synonym_tokenfilter',
    'synonym',
    synonyms=[
        'reactjs, react',  # <-- important
    ],
)

text_analyzer = analyzer(
    'text_analyzer',
    tokenizer='standard',
    filter=[
        # The ORDER is important here.
        'standard',
        'lowercase',
        'stop',
        synonym_tokenfilter,
        # Note! 'snowball' comes after 'synonym_tokenfilter'
        'snowball',
    ],
    char_filter=['html_strip'])


class Web(Document):
    url = Keyword()
    domain = Keyword()
    homepage = Text()
    created_date = Date()
    last_updated = Date()
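
A brief, hypothetical check that the synonym expansion noted in the comments happens before stemming (assuming a registered default connection):

tokens = text_analyzer.simulate('reactjs').tokens
print([t.token for t in tokens])
# Expectation: the synonym 'react' is emitted alongside the (stemmed) original token.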
Example #54
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from elasticsearch_dsl import DocType, String, analyzer, MetaField

from warehouse.search import doc_type


EmailAnalyzer = analyzer(
    "email",
    tokenizer="uax_url_email",
    filter=["standard", "lowercase", "stop", "snowball"],
)


@doc_type
class Project(DocType):

    name = String()
    version = String(index="not_analyzed", multi=True)
    summary = String(analyzer="snowball")
    description = String(analyzer="snowball")
    author = String()
    author_email = String(analyzer=EmailAnalyzer)
    maintainer = String()
    maintainer_email = String(analyzer=EmailAnalyzer)
Example #55
reHeader = re.compile("^(.*?):\s*(.*)$")
tz = get_localzone()


def parse_header(header):
    # TODO: support for multiline headers
    match = reHeader.search(header)
    if match:
        return {'name': match.group(1), 'value': match.group(2)}
    else:
        raise ValueError("No header matched")


identifierAnalyzer = analyzer("identifier",
                              tokenizer="keyword",
                              filter=["lowercase"])


class DocHTTPRequestResponse(DocType):
    class Meta:
        doc_type = 'HTTPRequestResponse'

    timestamp = Date()
    protocol = Text()
    host = Keyword()
    port = Integer()
    request = Object(
        properties={
            'method':
            Keyword(),
Example #56
russian_stemmer_filter = analysis.token_filter('russian_stemmer',
                                               type='stemmer',
                                               language='russian')
english_stop_filter = analysis.token_filter('english_stop',
                                            type='stop',
                                            stopwords='_english_')
english_stemmer_filter = analysis.token_filter('english_stemmer',
                                               type='stemmer',
                                               language='english')
english_possessive_stemmer_filter = analysis.token_filter(
    'english_stemmer', type='stemmer', language='possessive_english')

# Create the analyzers
ru_analyzer = analyzer(
    'ru_analyzer',
    type='custom',
    tokenizer='standard',
    filter=['lowercase', russian_stop_filter, russian_stemmer_filter],
)
en_analyzer = analyzer('en_analyzer',
                       type='custom',
                       tokenizer='standard',
                       filter=[
                           english_possessive_stemmer_filter, 'lowercase',
                           english_stop_filter, english_stemmer_filter
                       ])
# Add the analyzers to the index
movie_index.analyzer(ru_analyzer)
movie_index.analyzer(en_analyzer)


@movie_index.doc_type
Example #57
from django_elasticsearch_dsl import Document, Index, fields
from elasticsearch_dsl import analyzer
from django_elasticsearch_dsl.registries import registry
from .models import Product

products_index = Index("products")
products_index.settings(number_of_shards=1, number_of_replicas=1)

html_strip = analyzer(
    "html_strip",
    tokenizer="standard",
    filter=["standard", "lowercase", "stop", "snowball"],
    char_filter=["html_strip"],
)


# @registry.register_document
@products_index.doc_type
class ProductDocument(Document):

    # id = fields.IntegerField(attr='id')
    # title = fields.StringField(
    #     analyzer=html_strip,
    #     fields={
    #         'raw': fields.StringField(analyzer='keyword'),
    #     }
    # )
    # description = fields.TextField(
    #     analyzer=html_strip,
    #     fields={
    #         'raw': fields.TextField(analyzer='keyword'),
Example #58
import os
from elasticsearch_dsl import (Index, tokenizer, analyzer)
from pprint import pprint

movie_index: Index = Index(os.environ.get('ES_INDEX', 'moovie'))

movie_index.settings(number_of_shards=5, number_of_replicas=1)

completion_analyzer = analyzer('completion_analyzer',
                               tokenizer=tokenizer('trigram',
                                                   'nGram',
                                                   min_gram=3,
                                                   max_gram=3),
                               filter=['lowercase'])

normalization_analyzer = analyzer('normalization_analyzer',
                                  tokenizer="standard",
                                  filter=["lowercase", "stop", "snowball"],
                                  char_filter=["html_strip"])

movie_index.analyzer(normalization_analyzer)


def init_index():
    if not movie_index.exists():
        movie_index.create()


def destroy_index():
    if movie_index.exists():
        movie_index.delete(ignore=404)