def test_parse_tokens(self):
    """ Token extraction with STRIP only, then with RIGHT-WALL and CAPS but without STRIP """
    # Stage 1: only STRIP is set (no RIGHT-WALL, no CAPS).
    options = BIT_STRIP

    # NOTE: the stage 1 check against self.tokens_all_walls is currently disabled.
    # The disabled assertion compared the result with
    # ['###LEFT-WALL###', 'dad', 'was', 'not', 'a', 'parent', 'before', '.'].

    # Input without walls.
    parsed = parse_tokens(self.tokens_no_walls, options)[0]
    expected = ['###LEFT-WALL###', 'eagle', 'has', 'wing', '.']
    self.assertTrue(self.cmp_lists(parsed, expected))

    # Stage 2: turn RIGHT-WALL and CAPS on, turn STRIP off.
    options = (options | (BIT_RWALL | BIT_CAPS)) & ~BIT_STRIP

    # Input with walls.
    parsed = parse_tokens(self.tokens_all_walls, options)[0]
    expected = [
        '###LEFT-WALL###',
        'Dad[!]',
        'was.v-d',
        'not.e',
        'a',
        'parent.n',
        'before',
        '.',
        '###RIGHT-WALL###',
    ]
    self.assertTrue(self.cmp_lists(parsed, expected))

    # Input without walls, now parsed with the stage 2 options.
    parsed = parse_tokens(self.tokens_no_walls, options)[0]
    expected = ['###LEFT-WALL###', 'eagle', 'has', 'wing', '.']
    self.assertTrue(self.cmp_lists(parsed, expected))
def test_parse_gutenchildren_bug_002(self):
    """ Parsed tokens must equal the reference list (bug from Gutenberg Children corpus) """
    flags = BIT_NO_LWALL | BIT_NO_PERIOD | BIT_STRIP
    result = parse_tokens(gutenberg_children_bug_002t, flags)
    # Only the first element of the parse result is compared.
    self.assertEqual(result[0], gutenberg_children_bug_002tr)
def test_parse_no_period_if_no_period(self):
    """ Parse input that has neither walls nor a period, with NO_PERIOD requested """
    flags = BIT_STRIP | BIT_NO_PERIOD | BIT_RWALL
    parsed = parse_tokens(self.tokens_no_walls_no_period, flags)[0]
    expected = ['###LEFT-WALL###', 'eagle', 'has', 'wing']
    self.assertTrue(self.cmp_lists(parsed, expected))
def test_parse_tokens_no_period(self):
    """ Parse the no-walls input with STRIP, NO_PERIOD and RWALL; no '.' token is expected """
    options = 0
    options |= BIT_STRIP | BIT_NO_PERIOD | BIT_RWALL
    tokens = parse_tokens(self.tokens_no_walls, options)[0]
    # The leftover debug print of the tokens was removed so the test no longer
    # writes to stdout on every run.
    self.assertTrue(self.cmp_lists(tokens, ['###LEFT-WALL###', 'eagle', 'has', 'wing']))
def test_parse_tokens_no_walls_no_period(self):
    """ Parse the all-walls input with STRIP, NO_PERIOD and NO_LWALL; only bare words are expected """
    flags = BIT_STRIP | BIT_NO_PERIOD | BIT_NO_LWALL
    parsed = parse_tokens(self.tokens_all_walls, flags)[0]
    expected = ['dad', 'was', 'not', 'a', 'parent', 'before']
    self.assertTrue(self.cmp_lists(parsed, expected))
def test_parse_tokens_no_left_wall(self):
    """ Parse the all-walls input with CAPS and NO_LWALL set, STRIP not set """
    # Neither STRIP nor RWALL is part of the option mask here.
    flags = BIT_CAPS | BIT_NO_LWALL
    parsed = parse_tokens(self.tokens_all_walls, flags)[0]
    expected = ['Dad[!]', 'was.v-d', 'not.e', 'a', 'parent.n', 'before', '.']
    self.assertTrue(self.cmp_lists(parsed, expected))
def test_parse_tokens_alice_004(self):
    """ Square brackets must be parsed properly (case revealed by Alice in Wonderland corpus) """
    flags = BIT_STRIP | BIT_NO_LWALL | BIT_NO_PERIOD

    # Postscript-style token string fed to the parser.
    postscript = (
        "(LEFT-WALL)(posting.g)(date.n)(:.j)(@date@[?].a)([)(ebook[?].a)([#])(@number@[?].n)(])(release.n)"
        "(date.n)(:.j)([@date@])(last.ord)(updated.v-d)(:.v)(@date@[?].n)"
    )

    # Reference token list the parse result is compared with.
    expected = [
        "###LEFT-WALL###", "posting", "date", ":", "@date@", "[", "ebook", "[#]", "@number@", "]",
        "release", "date", ":", "[@date@]", "last", "updated", ":", "@date@",
    ]

    parsed = parse_tokens(postscript, flags)[0]
    self.assertEqual(expected, parsed)
def test_parse_tokens_alice_003(self):
    """ '[(]' must be parsed properly (case revealed by Alice in Wonderland corpus) """
    flags = BIT_STRIP | BIT_NO_LWALL | BIT_NO_PERIOD

    # Sentence behind the token string below:
    # "(alice had no idea what latitude was, or longitude either, but thought they were nice grand words to say.)"
    postscript = (
        "(LEFT-WALL)([(])(alice[?].n)(had.v-d)(no.misc-d)(idea.n)(what)(latitude.n-u)(was.v-d)(,)(or.ij)"
        "(longitude.n-u)(either.r)(,)([but])(thought.q-d)(they)(were.v-d)(nice.a)(grand.a)(words.n)(to.r)(say.v)"
        "(.)([)])"
    )

    # Reference token list the parse result is compared with.
    expected = [
        "###LEFT-WALL###", "[(]", "alice", "had", "no", "idea", "what", "latitude", "was", ",", "or",
        "longitude", "either", ",", "[but]", "thought", "they", "were", "nice", "grand", "words", "to",
        "say", ".", "[)]",
    ]

    parsed = parse_tokens(postscript, flags)[0]
    self.assertEqual(expected, parsed)