Example #1
import json

from elasticsearch_dsl import Keyword, Nested, Text, analysis, mapping


def test_mapping_can_collect_all_analyzers_and_normalizers():
    a1 = analysis.analyzer('my_analyzer1',
        tokenizer='keyword',
        filter=['lowercase', analysis.token_filter('my_filter1', 'stop', stopwords=['a', 'b'])],
    )
    a2 = analysis.analyzer('english')
    a3 = analysis.analyzer('unknown_custom')
    a4 = analysis.analyzer('my_analyzer2',
        tokenizer=analysis.tokenizer('trigram', 'nGram', min_gram=3, max_gram=3),
        filter=[analysis.token_filter('my_filter2', 'stop', stopwords=['c', 'd'])],
    )
    a5 = analysis.analyzer('my_analyzer3', tokenizer='keyword')
    n1 = analysis.normalizer('my_normalizer1',
        filter=['lowercase']
    )
    n2 = analysis.normalizer('my_normalizer2',
        filter=['my_filter1', 'my_filter2', analysis.token_filter('my_filter3', 'stop', stopwords=['e', 'f'])]
    )
    n3 = analysis.normalizer('unknown_custom')

    m = mapping.Mapping()
    m.field('title', 'text', analyzer=a1,
        fields={
            'english': Text(analyzer=a2),
            'unknown': Keyword(search_analyzer=a3),
        }
    )
    m.field('comments', Nested(properties={
        'author': Text(analyzer=a4)
    }))
    m.field('normalized_title', 'keyword', normalizer=n1)
    m.field('normalized_comment', 'keyword', normalizer=n2)
    m.field('unknown', 'keyword', normalizer=n3)
    m.meta('_all', analyzer=a5)

    assert {
        'analyzer': {
            'my_analyzer1': {'filter': ['lowercase', 'my_filter1'], 'tokenizer': 'keyword', 'type': 'custom'},
            'my_analyzer2': {'filter': ['my_filter2'], 'tokenizer': 'trigram', 'type': 'custom'},
            'my_analyzer3': {'tokenizer': 'keyword', 'type': 'custom'},
        },
        'normalizer': {
            'my_normalizer1': {'filter': ['lowercase'], 'type': 'custom'},
            'my_normalizer2': {'filter': ['my_filter1', 'my_filter2', 'my_filter3'], 'type': 'custom'},
        },
        'filter': {
            'my_filter1': {'stopwords': ['a', 'b'], 'type': 'stop'},
            'my_filter2': {'stopwords': ['c', 'd'], 'type': 'stop'},
            'my_filter3': {'stopwords': ['e', 'f'], 'type': 'stop'},
        },
        'tokenizer': {
            'trigram': {'max_gram': 3, 'min_gram': 3, 'type': 'nGram'},
        }
    } == m._collect_analysis()

    assert json.loads(json.dumps(m.to_dict())) == m.to_dict()
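A minimal follow-on sketch (not part of the quoted test; the index name is a placeholder): the output of _collect_analysis() is what ends up under settings.analysis once the Mapping is attached to an Index.

from elasticsearch_dsl import Index

idx = Index('my-index')   # hypothetical index name
idx.mapping(m)            # attach the Mapping built in the test above
# Index.to_dict() merges the mapping's collected analysis into the index settings
assert 'my_analyzer1' in idx.to_dict()['settings']['analysis']['analyzer']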
Example #2
from elasticsearch_dsl import analysis


def test_normalizer_serializes_as_name():
    n = analysis.normalizer('my_normalizer')

    assert 'my_normalizer' == n.to_dict()
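A hedged companion sketch (the normalizer name here is illustrative): a custom normalizer also serializes as just its name in to_dict(), while get_definition() returns the body that _collect_analysis() gathers into index settings.

n = analysis.normalizer('lowercase_only', filter=['lowercase'])

assert 'lowercase_only' == n.to_dict()
assert {'filter': ['lowercase'], 'type': 'custom'} == n.get_definition()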
Example #3
descriptive_text_analyzer = analysis.analyzer(
    "descriptive_text_analyzer", tokenizer="classic", filter=["lowercase", "trim", "stemmer"]
)

ngram_filter = analysis.token_filter("ngram_filter", type="ngram", min_gram=2, max_gram=20)

ngram_analyzer = analysis.analyzer(
    "ngram_completion", tokenizer="whitespace", filter=["lowercase", "asciifolding", ngram_filter]
)

whitespace_analyzer = analysis.analyzer(
    "whitespace_analyzer", tokenizer="whitespace", filter=["lowercase", "asciifolding"]
)

lowercase_normalizer = analysis.normalizer("lowercase_normalizer", filter=["lowercase"])

email_analyzer = analysis.analyzer(
    "email_analyzer",
    type="custom",
    tokenizer=analysis.tokenizer(
        "case_officer_email", "pattern", pattern="([a-zA-Z0-9_.-]+@[a-zA-Z0-9_.-]+\\.[a-zA-Z]{2,})", group=1,
    ),
    filter=["lowercase"],
)


class Country(InnerDoc):
    name = fields.KeywordField(
        fields={"raw": fields.KeywordField(normalizer=lowercase_normalizer), "suggest": fields.CompletionField(),},
        attr="country.name",
Example #4
import json

from elasticsearch_dsl import Keyword, Nested, Text, analysis, mapping


def test_mapping_can_collect_all_analyzers_and_normalizers():
    a1 = analysis.analyzer(
        "my_analyzer1",
        tokenizer="keyword",
        filter=[
            "lowercase",
            analysis.token_filter("my_filter1", "stop", stopwords=["a", "b"]),
        ],
    )
    a2 = analysis.analyzer("english")
    a3 = analysis.analyzer("unknown_custom")
    a4 = analysis.analyzer(
        "my_analyzer2",
        tokenizer=analysis.tokenizer("trigram",
                                     "nGram",
                                     min_gram=3,
                                     max_gram=3),
        filter=[
            analysis.token_filter("my_filter2", "stop", stopwords=["c", "d"])
        ],
    )
    a5 = analysis.analyzer("my_analyzer3", tokenizer="keyword")
    n1 = analysis.normalizer("my_normalizer1", filter=["lowercase"])
    n2 = analysis.normalizer(
        "my_normalizer2",
        filter=[
            "my_filter1",
            "my_filter2",
            analysis.token_filter("my_filter3", "stop", stopwords=["e", "f"]),
        ],
    )
    n3 = analysis.normalizer("unknown_custom")

    m = mapping.Mapping()
    m.field(
        "title",
        "text",
        analyzer=a1,
        fields={
            "english": Text(analyzer=a2),
            "unknown": Keyword(search_analyzer=a3)
        },
    )
    m.field("comments", Nested(properties={"author": Text(analyzer=a4)}))
    m.field("normalized_title", "keyword", normalizer=n1)
    m.field("normalized_comment", "keyword", normalizer=n2)
    m.field("unknown", "keyword", normalizer=n3)
    m.meta("_all", analyzer=a5)

    assert {
        "analyzer": {
            "my_analyzer1": {
                "filter": ["lowercase", "my_filter1"],
                "tokenizer": "keyword",
                "type": "custom",
            },
            "my_analyzer2": {
                "filter": ["my_filter2"],
                "tokenizer": "trigram",
                "type": "custom",
            },
            "my_analyzer3": {
                "tokenizer": "keyword",
                "type": "custom"
            },
        },
        "normalizer": {
            "my_normalizer1": {
                "filter": ["lowercase"],
                "type": "custom"
            },
            "my_normalizer2": {
                "filter": ["my_filter1", "my_filter2", "my_filter3"],
                "type": "custom",
            },
        },
        "filter": {
            "my_filter1": {
                "stopwords": ["a", "b"],
                "type": "stop"
            },
            "my_filter2": {
                "stopwords": ["c", "d"],
                "type": "stop"
            },
            "my_filter3": {
                "stopwords": ["e", "f"],
                "type": "stop"
            },
        },
        "tokenizer": {
            "trigram": {
                "max_gram": 3,
                "min_gram": 3,
                "type": "nGram"
            }
        },
    } == m._collect_analysis()

    assert json.loads(json.dumps(m.to_dict())) == m.to_dict()
Example #5
from contextlib import contextmanager
from logging import getLogger

from django.conf import settings
from elasticsearch.helpers import bulk as es_bulk
from elasticsearch_dsl import analysis, Index
from elasticsearch_dsl.connections import connections


logger = getLogger(__name__)


# Normalises values to improve sorting (by keeping e, E, è, ê etc. together)
lowercase_asciifolding_normalizer = analysis.normalizer(
    'lowercase_asciifolding_normalizer',
    filter=('lowercase', 'asciifolding'),
)

# Trigram tokenizer enables us to support partial matching
trigram = analysis.tokenizer(
    'trigram',
    'nGram',
    min_gram=3,
    max_gram=3,
    token_chars=('letter', 'digit'),
)

# Filters out "-" so that t-shirt and tshirt can be matched
special_chars = analysis.char_filter('special_chars', 'mapping', mappings=('-=>',))
trigram_analyzer = analysis.CustomAnalyzer(
    'trigram_analyzer',
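The listing above is cut off mid-definition. A plausible way to finish composing the analyzer from the pieces already defined in this snippet (a sketch under that assumption, not the repository's actual code):

trigram_analyzer = analysis.CustomAnalyzer(
    'trigram_analyzer',
    tokenizer=trigram,            # the nGram tokenizer defined above
    char_filter=[special_chars],  # strip '-' before tokenizing
    filter=['lowercase'],
)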
Example #6
import logging

from django_elasticsearch_dsl import Document, fields, Index
from django_elasticsearch_dsl.registries import registry
from elasticsearch_dsl.analysis import analyzer, normalizer, char_filter, token_filter
from .models import Document as DocumentModel, DKeyword


log = logging.getLogger(__name__)
info = log.info
debug = log.debug
warn = log.warning
error = log.error


lowercase_normalizer = normalizer(
    type="custom",
    name_or_instance="lowercase_normalizer",
    char_filter=[],
    filter="lowercase",
)

# The pattern_replace char filter's substitution parameter is "replacement" (not "replace")
no_digits_char_filter = char_filter(
    name_or_instance="no_digits", type="pattern_replace", pattern="(\\d+)", replacement=""
)

no_digits_analyzer = analyzer(
    name_or_instance="no_digits",
    tokenizer="standard",
    filter=["lowercase", "stop"],
    char_filter=[no_digits_char_filter],
)
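A short hedged sketch of how definitions like these are typically wired into a django_elasticsearch_dsl document; the index name, field names and their mapping onto DocumentModel are illustrative, not taken from the original module.

@registry.register_document
class DocumentDoc(Document):
    # reference the analysis objects defined above directly on the fields
    title = fields.TextField(analyzer=no_digits_analyzer)
    category = fields.KeywordField(normalizer=lowercase_normalizer)

    class Index:
        name = "documents"  # hypothetical index name

    class Django:
        model = DocumentModel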

Example #7
from elasticsearch_dsl import analysis


def test_normalizer_serializes_as_name():
    n = analysis.normalizer('my_normalizer')

    assert 'my_normalizer' == n.to_dict()
Example #8
from elasticsearch_dsl import analysis


def test_normalizer_serializes_as_name():
    n = analysis.normalizer("my_normalizer")

    assert "my_normalizer" == n.to_dict()
Example #9
from django_elasticsearch_dsl import DocType, Index, fields
from elasticsearch_dsl import analyzer
from elasticsearch_dsl.analysis import normalizer

from .models import Tender, Award, TenderDocument

# Note: token_chars belongs on an ngram/edge_ngram tokenizer (as in Example #5),
# not on an analyzer, so it is inert here
case_insensitive_analyzer = analyzer('case_insensitive_analyzer',
                                     tokenizer='standard',
                                     token_chars=['whitespace', 'punctuation'],
                                     filter=['lowercase'])

case_insensitive_normalizer = normalizer(
    type='custom',
    name_or_instance='case_insensitive_normalizer',
    char_filter=[],
    filter='lowercase',
)

tender = Index('tenders')
tender.settings(
    number_of_shards=1,
    number_of_replicas=0,
)

award = Index('awards')
award.settings(
    number_of_shards=1,
    number_of_replicas=0,
)

tender_document = Index('tender_documents')
Example #10
insitu_products.settings(**ELASTICSEARCH_INDEX_SETTINGS)
insitu_requirements.settings(**ELASTICSEARCH_INDEX_SETTINGS)
insitu_data.settings(**ELASTICSEARCH_INDEX_SETTINGS)
insitu_dataproviders.settings(**ELASTICSEARCH_INDEX_SETTINGS)

if not getattr(Search, '_patched', False):
    Search.order_by = Search.sort
    Search._patched = True

case_insensitive_analyzer = analyzer('case_insensitive_analyzer',
                                     tokenizer=tokenizer('trigram', 'nGram'),
                                     filter=['lowercase'])

case_insensitive_normalizer = normalizer(
    type="custom",
    name_or_instance='case_insensitive_normalizer',
    char_filter=[],
    filter="lowercase",
)


@insitu_products.doc_type
class ProductDoc(DocType):
    acronym = fields.KeywordField()
    description = fields.TextField()
    name = fields.TextField(analyzer=case_insensitive_analyzer,
                            fielddata=True,
                            fields={
                                'raw':
                                fields.KeywordField(
                                    multi=True,
                                    ignore_above=256,
Example #11
edge_ngram_filter = analysis.token_filter(
    'edge_ngram_filter',
    type='edge_ngram',
    min_gram=1,
    max_gram=20,
)

# Creating ngram filtering to kadastral objects
kadaster_object_aanduiding = analysis.token_filter(
    'kad_obj_aanduiding_filter',
    type='edge_ngram',
    min_gram=1,
    max_gram=6,
)

lowercase = analysis.normalizer('lowercase_keyword', filter=['lowercase'])

strip_zero = analysis.CustomCharFilter("strip_zero",
                                       builtin_type="pattern_replace",
                                       pattern="^0+(.*)",
                                       replacement="$1")

####################################
#           Analyzers              #
####################################

bouwblokid = es.analyzer(
    'bouwbloknummer',
    tokenizer=tokenizer('bouwbloktokens',
                        'edge_ngram',
                        min_gram=1,