Example #1
    def test_tokenization(self):
        """
        The whether the elasticsearch analyzer yields the right tokens for the german analyzer.

        Check the comments in mainapp.documents.index for more details
        """
        tokenizations = {
            "die": [],
            "hunde": ["hunde", "hund"],
            "wi-fi": ["wi", "fi"],
            "Feuerwehr": ["feuerwehr"],  # Would ideally split the words
            "oktopoden": ["oktopoden", "oktopod"],
            "Äpfel": ["äpfel", "apfel"],
            "ging": ["ging"],
            "schwierigste": ["schwierigste", "schwierig"],
            "1234/89": ["1234", "89"],  # Would be better if it included "1234/89"
        }

        text_analyzer = get_text_analyzer("german")
        elastic_index = Index("mst-test-tokenization")
        if not elastic_index.exists():
            elastic_index.create()
        # Analysis settings can only be changed while the index is closed,
        # so close it, register the analyzer, save the settings and reopen.
        elastic_index.close()
        elastic_index.analyzer(text_analyzer)
        elastic_index.save()
        elastic_index.open()
        elastic_index.flush()

        for word, expected_tokens in tokenizations.items():
            analysis = elastic_index.analyze(
                body={"analyzer": "text_analyzer", "text": word}
            )
            actual_tokens = [i["token"] for i in analysis["tokens"]]
            self.assertEqual(expected_tokens, actual_tokens, "Word was {}".format(word))
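Note that this test talks to a live Elasticsearch cluster. For reference, here is a minimal sketch of the scaffolding the method appears to assume; the import path for get_text_analyzer follows the docstring's pointer to mainapp.documents.index, and the test-class name and base class are assumptions:

# Sketch only: import locations and the class name are assumptions.
from django.test import TestCase  # or unittest.TestCase
from elasticsearch_dsl import Index

from mainapp.documents.index import get_text_analyzer  # assumed location


class TestTokenization(TestCase):  # hypothetical name
    ...  # test_tokenization from above would live here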

    def analyze(self, text: str) -> Dict[str, List[Dict]]:
        """Shows which tokens Elasticsearch produces for the given text."""

        elastic_index_file = Index(settings.ELASTICSEARCH_PREFIX + "-file")
        # Register the analyzers with the local Index object; the "-file" index
        # on the server must already define them for the analyze call to work.
        elastic_index_file.analyzer(autocomplete_analyzer)
        elastic_index_file.analyzer(text_analyzer)
        return elastic_index_file.analyze(
            body={"analyzer": "text_analyzer", "text": text}
        )
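
A quick usage sketch; the wrapper class name and the sample word are made up for illustration, and the index named settings.ELASTICSEARCH_PREFIX + "-file" must already exist on the server with text_analyzer configured:

# Hypothetical usage; "ElasticsearchAnalyzer" stands in for whatever class
# defines analyze(). The index must already exist on the server.
analysis = ElasticsearchAnalyzer().analyze("oktopoden")
print([t["token"] for t in analysis["tokens"]])
# Per the test above this should print: ['oktopoden', 'oktopod']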