def _check_locale_tokenization(self, locale, expected_tokens, p_tag=True):
    """Check that a given locale's document was tokenized correctly.

    * `locale` - The locale to check.
    * `expected_tokens` - An iterable of the tokens that should be
      found. If any tokens from this list are missing, or if any
      tokens not in this list are found, the check will fail.
    * `p_tag` - Default True. If True, an extra token will be added
      to `expected_tokens`: "p". This is because our wiki parser
      wraps its content in <p> tags, and many analyzers will
      tokenize a string like '<p>Foo</p>' as ['p', 'foo'] (the HTML
      tag is included in the tokenization), so it will show up in
      the tokenization during this test. Not all the analyzers do
      this, which is why it can be turned off.

    Why can't we fix the analyzers to strip out that HTML and not
    generate spurious tokens? That could probably be done, but it
    probably isn't worthwhile because:

    * ES will weight common words lower, thanks to its TF-IDF
      algorithm, which judges words based on how often they appear
      in the entire corpus and in the document, so the p tokens
      will be largely ignored.
    * The pre-l10n search code did it this way, so it doesn't break
      search.
    * When implementing l10n search, I wanted to minimize the
      number of changes needed, and this seemed like an unneeded
      change.
    """
    search = es_utils.Sphilastic(DocumentMappingType)
    search = search.filter(document_locale=locale)
    facet_filter = search._process_filters([('document_locale', locale)])
    # Facet on the raw terms indexed for document_content, limited to
    # documents in this locale, to recover the analyzer's tokens.
    search = search.facet_raw(tokens={
        'terms': {'field': 'document_content'},
        'facet_filter': facet_filter,
    })
    facets = search.facet_counts()

    expected = set(expected_tokens)
    if p_tag:
        # Since `expected` is a set, there is no problem adding this
        # twice, since duplicates will be ignored.
        expected.add(u'p')

    actual = set(t['term'] for t in facets['tokens'])
    eq_(actual, expected)
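
# A minimal usage sketch for the helper above, not a real test from the
# suite: it assumes the documents for the locale have already been
# indexed, and the locale and token values are hypothetical examples.
def test_en_us_tokenization(self):
    # p_tag is left at its default of True, so u'p' is added to the
    # expected token set automatically.
    self._check_locale_tokenization(u'en-US', [u'foo', u'bar'])
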
@classmethod
def search(cls):
    """Return a Sphilastic search object for this mapping type."""
    return es_utils.Sphilastic(cls)
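
# A usage sketch for the classmethod above, assuming it is defined on
# DocumentMappingType: it returns a Sphilastic search object that can
# be chained with filters, much as the tokenization helper above
# constructs one directly.
docs = DocumentMappingType.search().filter(document_locale=u'en-US')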