def test_sentence_tokenize_with_combined():
    """Sentences mixing corner-bracket quotes and round brackets stay intact.

    A quoted/bracketed run such as 「にゃ?」(にゃー) must not be split even
    though it contains sentence-ending punctuation.
    """
    corpus = SentenceTokenizer()
    expect = ["わんわん。", "「にゃ?」(にゃー)わんわん。", "「わおーん。」(犬より。)"]
    result = corpus.tokenize(DOCUMENT4)
    assert expect == result
def test_sentence_tokenize_with_quotation():
    """Periods inside 「…」 quotations do not terminate the enclosing sentence."""
    corpus = SentenceTokenizer()
    expect = ["猫「にゃおにゃ。ただしかわいいものとする。異議は認める」。", "にゃお。", "にゃにゃ"]
    result = corpus.tokenize(DOCUMENT3)
    assert expect == result
def test_sentence_tokenize_with_bracket():
    """Periods inside (…) parentheses do not terminate the enclosing sentence."""
    corpus = SentenceTokenizer()
    expect = ["私は猫である(ただしかわいいものとする。異議は認める)。", "にゃお。", "にゃにゃ"]
    result = corpus.tokenize(DOCUMENT2)
    assert expect == result
def test_sentence_tokenize():
    """Plain text splits on 。 and a trailing fragment without 。 is kept."""
    corpus = SentenceTokenizer()
    expect = ["私は猫である。", "にゃお。", "にゃにゃ", "わんわん。", "にゃーにゃー。"]
    result = corpus.tokenize(DOCUMENT1)
    assert expect == result
def test_sentence_tokenize_with_custom_period():
    """A custom period character splits sentences instead of the default 。."""
    corpus = SentenceTokenizer(period=".")
    expect = ["わんわん。「にゃ?」(にゃー)わんわん.", "「わおーん。」(犬より。)"]
    result = corpus.tokenize(DOCUMENT6)
    assert expect == result
def test_sentence_tokenize_with_custom_patterns():
    """Extending PATTERNS with 『…』 protects that quote style from splitting."""
    corpus = SentenceTokenizer(
        patterns=SentenceTokenizer.PATTERNS + [re.compile(r"『.*?』")]
    )
    expect = ["わんわん。", "「にゃ?」(にゃー)わんわん。", "『わおーん。』(犬より。)"]
    result = corpus.tokenize(DOCUMENT5)
    assert expect == result