Example #1
    def test_tokenization(self):
        """
        Tests whether the Elasticsearch german text analyzer yields the expected tokens.

        Check the comments in mainapp.documents.index for more details
        """
        tokenizations = {
            "die": [],
            "hunde": ["hunde", "hund"],
            "wi-fi": ["wi", "fi"],
            "Feuerwehr": ["feuerwehr"],  # Would ideally split the words
            "oktopoden": ["oktopoden", "oktopod"],
            "Äpfel": ["äpfel", "apfel"],
            "ging": ["ging"],
            "schwierigste": ["schwierigste", "schwierig"],
            "1234/89": ["1234", "89"],  # Would be better if it included "1234/89"
        }

        text_analyzer = get_text_analyzer("german")
        elastic_index = Index("mst-test-tokenization")
        if not elastic_index.exists():
            elastic_index.create()
        elastic_index.close()
        elastic_index.analyzer(text_analyzer)
        elastic_index.save()
        elastic_index.open()
        elastic_index.flush()

        for word, expected_tokens in tokenizations.items():
            analysis = elastic_index.analyze(
                body={"analyzer": "text_analyzer", "text": word}
            )
            actual_tokens = [i["token"] for i in analysis["tokens"]]
            self.assertEqual(expected_tokens, actual_tokens, "Word was {}".format(word))

    def analyze(self, text: str) -> Dict[str, List[Dict]]:
        """Shows what elasticsearch does with the tokens"""

        elastic_index_file = Index(settings.ELASTICSEARCH_PREFIX + "-file")
        elastic_index_file.analyzer(autocomplete_analyzer)
        elastic_index_file.analyzer(text_analyzer)
        return elastic_index_file.analyze(
            body={"analyzer": "text_analyzer", "text": text}
        )
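For reference, a minimal usage sketch of the analyze() helper above; the instance name indexer and the sample word are assumptions used only to illustrate the shape of the response.

# Hypothetical call to the analyze() helper above; `indexer` stands in for
# whatever object defines the method and is not part of the original snippet.
analysis = indexer.analyze("Feuerwehr")
tokens = [entry["token"] for entry in analysis["tokens"]]
# With the german text_analyzer from Example #1 this should be ["feuerwehr"]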
Example #3
# Create the analyzers
ru_analyzer = analyzer(
    'ru_analyzer',
    type='custom',
    tokenizer='standard',
    filter=['lowercase', russian_stop_filter, russian_stemmer_filter],
)
en_analyzer = analyzer('en_analyzer',
                       type='custom',
                       tokenizer='standard',
                       filter=[
                           english_possessive_stemmer_filter, 'lowercase',
                           english_stop_filter, english_stemmer_filter
                       ])
# Add the analyzers to the index
movie_index.analyzer(ru_analyzer)
movie_index.analyzer(en_analyzer)


@movie_index.doc_type
class MovieDocument(Document):
    title = fields.TextField(
        analyzer=ru_analyzer,  # Analyzer used for indexing
        search_analyzer=ru_analyzer  # Analyzer used for the search query
    )
    description = fields.TextField(
        analyzer=ru_analyzer,  # Analyzer used for indexing
        search_analyzer=ru_analyzer  # Analyzer used for the search query
    )
    subtitles = fields.TextField(
        attr='get_subtitles',
Example #4
# Name of the Elasticsearch index
from django.conf import settings
from django_elasticsearch_dsl import Index, DEDField, Integer
from elasticsearch_dsl import analyzer, token_filter


class RelatedToValueList(DEDField, Integer):
    """Indexes a to-many relation as a list of the related objects' integer ids."""

    def get_value_from_instance(self, data):
        return [obj.id for obj in super().get_value_from_instance(data)]


mainIndex = Index(settings.ELASTICSEARCH_INDEX)
# See Elasticsearch Indices API reference for available settings
mainIndex.settings(number_of_shards=1, number_of_replicas=0)

autocomplete_filter = token_filter(
    "autocomplete_filter",
    "edge_ngram",
    min_gram=1,
    max_gram=20,
)

# Using this analyzer on an empty field fails, so we use methods that add a space instead (see the sketch after this example)
autocomplete_analyzer = analyzer(
    'autocomplete',
    tokenizer="standard",
    filter=["lowercase", autocomplete_filter],
)
mainIndex.analyzer(autocomplete_analyzer)
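As the comment above notes, the edge_ngram autocomplete analyzer fails on an empty value, so the indexed string is produced by a method that appends a space. Below is a minimal sketch of such a method, following the registration pattern seen in Example #5; the document class, model and field names are assumptions for illustration only.

from django_elasticsearch_dsl import Document, fields


@mainIndex.document
class PersonDocument(Document):  # hypothetical document, not part of the original snippet
    # Field indexed with the autocomplete analyzer defined above
    name_autocomplete = fields.TextField(analyzer=autocomplete_analyzer)

    def prepare_name_autocomplete(self, instance):
        # Appending a space guarantees the analyzer never receives an empty string
        return (instance.name or "") + " "

    class Django:
        model = Person  # assumed Django model with a `name` field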
Example #5
from django.conf import settings
from django.db.models import QuerySet
from django_elasticsearch_dsl import Index

from metarecord.models import Classification
from search_indices import get_finnish_analyzer
from search_indices.documents.base import BaseDocument

# Name of the Elasticsearch index
INDEX = Index(settings.ELASTICSEARCH_INDEX_NAMES[__name__])

finnish_analyzer = get_finnish_analyzer()

INDEX.analyzer(finnish_analyzer)

INDEX.settings(max_result_window=500000)


@INDEX.document
class ClassificationDocument(BaseDocument):
    class Django:
        model = Classification

    def get_queryset(self) -> QuerySet:
        return Classification.objects.latest_version()


def make_index(suffix: str) -> Index:
    elastic_index = Index(settings.ELASTICSEARCH_PREFIX + "-" + suffix)
    elastic_index.analyzer(autocomplete_analyzer)
    elastic_index.analyzer(text_analyzer)
    return elastic_index
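A short hypothetical usage of the make_index() factory; only the "file" suffix appears elsewhere on this page, the rest is assumed.

# Hypothetical usage of make_index(); "file" matches the "-file" index used by
# the analyze() helper earlier, "person" is purely illustrative.
elastic_index_file = make_index("file")
elastic_index_person = make_index("person")
if not elastic_index_file.exists():
    elastic_index_file.create()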
Example #7
# Name of the Elasticsearch index
job_listing = Index('joblistings')
# See Elasticsearch Indices API reference for available settings
job_listing.settings(
    number_of_shards=1,
    number_of_replicas=0
)

# Splits on whitespace, dashes, newlines, slashes, commas, and periods followed by whitespace
custom_tokenizer = tokenizer(
    "pattern",
    "pattern",
    pattern=r"\s|-|\n|/|,|\.\s"
)

keyword_analyzer = analyzer(
    "default", type="custom", tokenizer=custom_tokenizer, filter=["lowercase"]
)
job_listing.analyzer(keyword_analyzer)


filter_shingle = token_filter(
    name_or_instance="filter_shingle",
    type="shingle",
    max_shingle_size=2,
    min_shingle_size=2,
    output_unigrams="false",
)
shingle_analyzer = analyzer(
    "shingle",
    tokenizer=custom_tokenizer,
    type="custom",
    filter=["lowercase", filter_shingle],
)
job_listing.analyzer(shingle_analyzer)

# triple_filter_shingle = token_filter(name_or_instance="triple_filter_shingle", type="shingle", max_shingle_size=3, min_shingle_size=3, output_unigrams="false")
# triple_shingle_analyzer = analyzer("triple_shingle", tokenizer=custom_tokenizer, type="custom", filter=["lowercase", triple_filter_shingle])
# job_listing.analyzer(triple_shingle_analyzer)


@job_listing.doc_type
class JobListingDocument(DocType):
    keywords = fields.TextField(attr="description", fielddata=True)
#    shingles = fields.TextField(attr="description", analyzer="shingle", fielddata=True)
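To make the shingle setup above concrete, a minimal, hedged check of what the analyzer emits; the sample text is an assumption, and the call only succeeds once the index has actually been created with these analyzers.

# Hypothetical smoke test of the shingle analyzer defined above: the pattern
# tokenizer splits on whitespace, lowercase is applied, and with shingle size
# fixed at 2 and output_unigrams="false" only two-word shingles are returned.
analysis = job_listing.analyze(
    body={"analyzer": "shingle", "text": "senior python developer"}
)
print([t["token"] for t in analysis["tokens"]])
# expected: ["senior python", "python developer"]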