Example #1
from elasticsearch_dsl import analysis


def test_analyzer_has_definition():
    a = analysis.CustomAnalyzer('my_analyzer',
                                tokenizer='keyword',
                                filter=['lowercase'])

    assert {
        'type': 'custom',
        'tokenizer': 'keyword',
        'filter': ["lowercase"],
    } == a.get_definition()

Example #2

from elasticsearch_dsl import analysis


def test_analyzer_has_definition():
    a = analysis.CustomAnalyzer(
        "my_analyzer", tokenizer="keyword", filter=["lowercase"]
    )

    assert {
        "type": "custom",
        "tokenizer": "keyword",
        "filter": ["lowercase"],
    } == a.get_definition()
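
Both tests above exercise the same API: CustomAnalyzer.get_definition() serializes the analyzer into the dict Elasticsearch expects in index settings. A minimal sketch of how such an analyzer is typically registered on an index (the index name "my-index" is illustrative):

from elasticsearch_dsl import Index, analysis

a = analysis.CustomAnalyzer(
    "my_analyzer", tokenizer="keyword", filter=["lowercase"]
)

# Registering the analyzer on an Index merges its definition into the
# index settings that are sent when the index is created.
idx = Index("my-index")
idx.analyzer(a)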
Example #3
from elasticsearch_dsl import analysis

# Trigram tokenizer enables us to support partial matching
trigram = analysis.tokenizer(
    'trigram',
    'nGram',
    min_gram=3,
    max_gram=3,
    token_chars=('letter', 'digit'),
)

# Filters out "-" so that t-shirt and tshirt can be matched
special_chars = analysis.char_filter('special_chars', 'mapping', mappings=('-=>',))
trigram_analyzer = analysis.CustomAnalyzer(
    'trigram_analyzer',
    tokenizer=trigram,
    char_filter=special_chars,
    filter=('lowercase',),
)

# Removes spaces from tokens so values with and without spaces match
space_remover = analysis.token_filter(
    'space_remover',
    type='pattern_replace',
    pattern=' ',
    replacement='',
)

# Regex fragments for the four parts of a UK postcode (e.g. "SW1A 1AA")
AREA_REGEX = r'[a-z]{1,2}'
DISTRICT_REGEX = r'(?:[0-9][a-z]|[0-9]{1,2})'
SECTOR_REGEX = r'[0-9]'
UNIT_REGEX = r'[a-z]{2}'
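
A hedged sketch of how the trigram analyzer above enables partial matching: index a text field through trigram_analyzer so queries match on shared 3-grams. The Product document and its "name" field are hypothetical, introduced only for illustration:

from elasticsearch_dsl import Document, Text

class Product(Document):
    # "shir" shares trigrams with both "t-shirt" and "tshirt", so a
    # match query against this field supports partial matching.
    name = Text(analyzer=trigram_analyzer)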
Example #4

from elasticsearch_dsl import analysis

# Extend the set of built-in tokenizer names so that 'path_hierarchy'
# and 'whitespace' (subclassed below) resolve by name.
analysis.Tokenizer._builtins = analysis.TOKENIZERS = frozenset(
    ('keyword', 'standard', 'path_hierarchy', 'whitespace'))


class PathHierarchyTokenizer(analysis.Tokenizer):
    name = 'path_hierarchy'


class WhitespaceTokenizer(analysis.Tokenizer):
    name = 'whitespace'


path_analyzer = analysis.CustomAnalyzer('path',
                                        tokenizer='path_hierarchy',
                                        filter=['lowercase'])

lower_whitespace_analyzer = analysis.analyzer('lower_whitespace',
                                              tokenizer='whitespace',
                                              filter=['lowercase', 'stop'],
                                              char_filter=['html_strip'])


class DocumentDocType(ImprovedDocType):
    """
    The main documentation doc type to be used for searching.
    It stores a bit of meta data so we don't have to hit the db
    when rendering search results.

    The search view will be using the 'lang' and 'version' fields.
    """

Example #5

from elasticsearch_dsl import analysis

# Trigram tokenizer enables us to support partial matching
trigram = analysis.tokenizer(
    'trigram',
    'nGram',
    min_gram=3,
    max_gram=3,
    token_chars=('letter', 'digit'),
)

# Filters out "-" so that t-shirt and tshirt can be matched
special_chars = analysis.char_filter('special_chars',
                                     'mapping',
                                     mappings=('-=>', ))
trigram_analyzer = analysis.CustomAnalyzer(
    'trigram_analyzer',
    tokenizer=trigram,
    char_filter=special_chars,
    filter=('lowercase', ),
)

# Strips trailing possessives ("dog's" -> "dog") before stemming
english_possessive_stemmer = analysis.token_filter(
    'english_possessive_stemmer',
    type='stemmer',
    language='possessive_english',
)

# Stems English tokens (e.g. "running" -> "run")
english_stemmer = analysis.token_filter(
    'english_stemmer',
    type='stemmer',
    language='english',
)
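
These two stemmer filters are the usual building blocks of a custom English analyzer. A minimal sketch of how they are typically composed, in the filter order the Elasticsearch docs use when rebuilding the english analyzer (the analyzer name 'english_custom' is illustrative, and stop/keyword filters are omitted):

english_analyzer = analysis.CustomAnalyzer(
    'english_custom',
    tokenizer='standard',
    filter=[
        english_possessive_stemmer,  # strip "'s" before anything else
        'lowercase',
        english_stemmer,             # stem the lowercased token
    ],
)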