def setup_tokenizers(terminal_punctuation): PunktLanguageVars.sent_end_chars = terminal_punctuation PunktLanguageVars.re_boundary_realignment = re.compile(r'[›»》’”\'\")\)\]\}\>]+?(?:\s+|(?=--)|$)', re.MULTILINE) global word_tokenizer global sentence_tokenizers #Accessing private variables of PunktLanguageVars because nltk has a faulty design pattern that necessitates it. #Issue reported here: https://github.com/nltk/nltk/issues/2068 word_tokenizer = PunktLanguageVars() word_tokenizer._re_word_tokenizer = re.compile(PunktLanguageVars._word_tokenize_fmt % { 'NonWord': r"(?:[\d\.\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡])", 'MultiChar': PunktLanguageVars._re_multi_char_punct, 'WordStart': r"[^\d\.\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡]", }, re.UNICODE | re.VERBOSE) word_tokenizer._re_period_context = re.compile(PunktLanguageVars._period_context_fmt % { 'NonWord': r"(?:[\d\.\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡])", 'SentEndChars': word_tokenizer._re_sent_end_chars, }, re.UNICODE | re.VERBOSE) x = PunktLanguageVars() x._re_word_tokenizer = re.compile(PunktLanguageVars._word_tokenize_fmt % { 'NonWord': r"(?:[\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡])", 'MultiChar': PunktLanguageVars._re_multi_char_punct, 'WordStart': r"[^\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡]", }, re.UNICODE | re.VERBOSE) x._re_period_context = re.compile(PunktLanguageVars._period_context_fmt % { 'NonWord': r"(?:[\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡])", 'SentEndChars': x._re_sent_end_chars, }, re.UNICODE | re.VERBOSE) #Read tokenizers from pickle files (also include an untrained tokenizer). Mapping from language name to tokenizer sentence_tokenizers = dict({None: PunktSentenceTokenizer(lang_vars=PunktLanguageVars())}, **{ current_file_name[:current_file_name.index('.')]: pickle.load(open(join(current_path, current_file_name), mode='rb')) for current_path, current_dir_names, current_file_names in os.walk(sentence_tokenizer_dir) for current_file_name in current_file_names if current_file_name.endswith('.pickle') }) for s in sentence_tokenizers.values(): s._lang_vars._re_period_context = x._re_period_context s._lang_vars._re_word_tokenizer = x._re_word_tokenizer
#΄´´``′″‴ textual_feature.setup_tokenizers(terminal_punctuation=('.', ';', ';')) p = PunktLanguageVars() #TODO don't mess with the PunktLanguageVars instance variables, mess with the class variables p._re_word_tokenizer = re.compile( PunktLanguageVars._word_tokenize_fmt % { 'NonWord': r"(?:[\d\.\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡])", 'MultiChar': PunktLanguageVars._re_multi_char_punct, 'WordStart': r"[^\d\.\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡]", }, re.UNICODE | re.VERBOSE) p._re_period_context = re.compile( PunktLanguageVars._period_context_fmt % { 'NonWord': r"(?:[\d\.\?¿؟\!¡!‽…⋯᠁ฯ,،,、、。°※··᛫~\:;;\\\/⧸⁄()\(\)\[\]\{\}\<\>\'\"‘’“”‹›«»《》\|‖\=\-\‐\‒\–\—\―_\+\*\^\$£€§%#@&†‡])", 'SentEndChars': p._re_sent_end_chars, }, re.UNICODE | re.VERBOSE) test_sentence_tokenizer = PunktSentenceTokenizer(lang_vars=p) class TestParsers(unittest.TestCase): def setUp(self): pass def test_sentences1(self): file = 'test test. test test test? test test test; test test. test.' result = textual_feature.tokenize_types['sentences']['func'](file) expected = [ 'test test.', 'test test test? test test test;', 'test test.', 'test.'
# From polybius.histories.tess s = "διόπερ οὐχ ὁρῶν ποίαν ἄν τις ὀξυτέραν ἢ μείζονα λάβοι μεταβολὴν τῶν καθ' ἡμᾶς τῆς γε ̔Ρωμαίοις συμβάσης, εἰς τοῦτον ἀπεθέμην τὸν καιρὸν τὸν ὑπὲρ τῶν προειρημένων ἀπολογισμόν: γνοίη δ' ἄν τις τὸ μέγεθος τῆς μεταβολῆς ἐκ τούτων. ζήτει ἐν τῷ περὶ στρατηγίας. [εχξ. Vατ. π. 369 μαι. 24, 4 ηεψς.]" print(p.word_tokenize(s)) print() #Sentence Tokenization s = 'a b c. "a b c". a b c. "a b c." a b c. “a b c”. a b c. “a b c.” a b c.' print('\n'.join( PunktSentenceTokenizer(lang_vars=PunktLanguageVars()).tokenize(s))) print() s = 'a b c. "a b c". a b c. "a b c." a b c. “a b c”. a b c. “a b c.” a b c.' PunktLanguageVars.re_boundary_realignment = re.compile( r'["”\')\]}]+?(?:\s+|(?=--)|$)', re.MULTILINE) p = PunktLanguageVars() p._re_word_tokenizer = re.compile( PunktLanguageVars._word_tokenize_fmt % { 'NonWord': r"(?:[0-9\-))\"“”‘’`··~,«»;;}\]\*\#:@&\'\(({\[])", 'MultiChar': PunktLanguageVars._re_multi_char_punct, 'WordStart': r"[^0-9\-))\"“”‘’`··~,«»;;}\]\*\#:@&\'\(({\[]", }, re.UNICODE | re.VERBOSE) p._re_period_context = re.compile( PunktLanguageVars._period_context_fmt % { 'NonWord': r"(?:[0-9\-))\"“”‘’`··~,«»;;}\]\*\#:@&\'\(({\[])", 'SentEndChars': PunktLanguageVars._re_sent_end_chars, }, re.UNICODE | re.VERBOSE) tok = PunktSentenceTokenizer(lang_vars=p) print('\n'.join(tok.tokenize(s))) print()