def test_comment(self):
    """An HTML comment in wikitext should tokenize to at least one token."""
    wiki_comment = (
        '<!-- In the interest of restricting article length, please limit this section to '
        'two or three short paragraphs and add any substantial information to the main Issues '
        'in anarchism article. Thank you. -->'
    )
    result = Lexer().tokenize(wiki_comment)
    logging.info(result)
    self.assertGreater(len(result), 0)
def test_tokenize(self, name='wikitext'):
    """Tokenize a fixture file and verify that tokens are produced.

    Args:
        name: File name of the fixture to read from ``DATA_FOLDER``
            (defaults to ``'wikitext'``).
    """
    with (DATA_FOLDER / name).open(encoding="utf8") as f:
        text = f.read()
    lexer = Lexer()
    tokens = lexer.tokenize(text)
    logging.info(tokens)
    # Lazy %-style args: the message is only formatted if INFO is enabled,
    # unlike the previous eager str.format call.
    logging.info('TEXT_LENGTH: %d', len(text))
    self.assertGreater(len(tokens), 0)