def test_expand():
    '''Testing expansions'''
    tok = Tokenizer()
    assert tok.expand("Foo[x3]!") == "Foo Foo Foo!"
    assert tok.expand("Foo [x 3]!") == "Foo Foo Foo!"
    assert tok.expand("Foo [x3]!") == "Foo Foo Foo!"
    assert tok.expand("Foo [x3] bar!") == "Foo Foo Foo bar!"
    assert tok.expand("Foo [x3] bar [x3]!") == "Foo Foo Foo bar bar bar!"
    assert tok.expand("Foo <bar baz>[x2]!") == "Foo bar baz bar baz!"
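

# A minimal sketch, assuming the "[x N]" repetition markup exercised in
# test_expand: "[x N]" repeats the preceding word, and "<...>[x N]" repeats the
# whole angle-bracketed group. expand_sketch and its regexes are hypothetical
# illustrations, not the project's actual Tokenizer.expand implementation.
import re


def expand_sketch(text):
    '''Expand "[x N]" repetition markers (illustrative sketch only).'''
    def repeat(match):
        return ' '.join([match.group(1)] * int(match.group(2)))
    # Expand angle-bracketed groups first: "<bar baz>[x2]" -> "bar baz bar baz"
    text = re.sub(r'<([^>]+)>\s*\[x\s*(\d+)\]', repeat, text)
    # Then expand single-word repeats: "Foo [x3]" or "Foo[x3]" -> "Foo Foo Foo"
    text = re.sub(r'(\S+)\s*\[x\s*(\d+)\]', repeat, text)
    return text

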
def test_replace():
    '''Testing replacements in tokenization method'''
    tok = Tokenizer()
    assert tok.decontract("hey gimme") == 'hey give me'
    assert tok.decontract("hey let's") == 'hey let_us'
    assert tok.decontract("hey wanna go") == 'hey want to go'
    assert tok.decontract("hey gotta go") == 'hey got to go'
    assert tok.decontract("hey gonna go") == 'hey going to go'
    assert tok.decontract("hey cannot go") == 'hey can not go'
    assert tok.decontract("lookit here") == 'look at here'
    assert tok.tokenize("hey gimme") == ['hey', 'give', 'me']
    assert tok.tokenize("hey let's") == ['hey', 'let_us']
    assert tok.tokenize("hey wanna go") == ['hey', 'want', 'to', 'go']
    assert tok.tokenize("hey gotta go") == ['hey', 'got', 'to', 'go']
    assert tok.tokenize("hey gonna go") == ['hey', 'going', 'to', 'go']
    assert tok.tokenize("hey cannot go") == ['hey', 'can', 'not', 'go']
    assert tok.tokenize("lookit here") == ['look', 'at', 'here']
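

# A minimal sketch, assuming decontraction is a table-driven word replacement,
# as the assertions in test_replace suggest. DECONTRACTIONS and
# decontract_sketch are hypothetical names, not the project's actual
# Tokenizer.decontract.
DECONTRACTIONS = {
    "gimme": "give me",
    "let's": "let_us",
    "wanna": "want to",
    "gotta": "got to",
    "gonna": "going to",
    "cannot": "can not",
    "lookit": "look at",
}


def decontract_sketch(text):
    '''Replace informal contractions with their expanded forms (sketch).'''
    for short, full in DECONTRACTIONS.items():
        # \b keeps the replacement from firing inside longer words.
        text = re.sub(r'\b' + re.escape(short) + r'\b', full, text)
    return text

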
def test_tokenize():
    '''Testing the tokenize method'''
    tok = Tokenizer()
    assert tok.tokenize("") == []
    assert tok.tokenize("###") == []
    assert tok.tokenize("--") == []
    assert tok.tokenize("Foo Bar Baz.") == ["Foo", "Bar", "Baz"]
    assert tok.tokenize("Don't do that!") == ["Don't", "do", "that"]
    assert tok.tokenize("\"Don't do that!\"") == ["Don't", "do", "that"]
    assert tok.tokenize("'Foo' Bar Baz.") == ["Foo", "Bar", "Baz"]

    tok = Tokenizer(split_clitics=True)
    assert tok.tokenize("Don't do that!") == ["Do", "n't", "do", "that"]
    assert tok.tokenize("Foo'll go.") == ["Foo", "'ll", "go"]
    assert tok.tokenize("I'm new.") == ["I", "'m", "new"]
    assert tok.tokenize("He's got to go.") == ["He", "'s", "got", "to", "go"]
    assert tok.tokenize("Foo_Bar Baz.") == ["Foo_Bar", "Baz"]
    assert tok.tokenize("James' friend.") == ["James", "'s", "friend"]
    assert tok.tokenize("Piglet's eyes") == ["Piglet", "'s", "eyes"]

    tok = Tokenizer(nonword=r'[^a-zA-Z\+\'\-\&\@]')
    assert tok.tokenize("Foo_Bar Baz.") == ["Foo", "Bar", "Baz"]
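

# A minimal sketch, assuming split_clitics peels a trailing clitic (n't, 'll,
# 'm, 's, ...) off each token and normalizes a bare possessive apostrophe to an
# explicit "'s", as the split_clitics=True cases above expect.
# split_clitic_sketch is a hypothetical helper, not the project's actual
# implementation.
def split_clitic_sketch(token):
    '''Split a trailing clitic off a single token (illustrative sketch).'''
    match = re.match(r"^(.+?)(n't|'(?:ll|m|s|re|ve|d))$", token)
    if match:
        return [match.group(1), match.group(2)]
    # "James'" style possessives are normalized to an explicit "'s" clitic.
    if token.endswith("'") and len(token) > 1:
        return [token[:-1], "'s"]
    return [token]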