def moses_sentenize(text):
    from mosestokenizer import MosesSentenceSplitter

    global MOSES_SENT
    if not MOSES_SENT:
        MOSES_SENT = MosesSentenceSplitter('ru')
    chunks = MOSES_SENT([text])
    return find_substrings(chunks, text)
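# All wrappers in this module share the same lazy-initialization pattern:
# the heavy backend object is built on first call, cached in a module-level
# global (assumed to be declared as e.g. MOSES_SENT = None elsewhere in the
# module), and reused afterwards. find_substrings(chunks, text) then aligns
# the returned chunks back to their offsets in the original text.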
def spacy_tokenize(text):
    from spacy.lang.ru import Russian

    global NLP
    if not NLP:
        NLP = Russian()
    doc = NLP(text)
    chunks = [token.text for token in doc]
    return find_substrings(chunks, text)
def moses_tokenize(text):
    from mosestokenizer import MosesTokenizer

    global MOSES_TOK
    if not MOSES_TOK:
        MOSES_TOK = MosesTokenizer('ru')
        # disable escaping (" -> &quot;) and aggressive hyphen splitting (- -> @-@)
        MOSES_TOK.argv.append('-no-escape')
        MOSES_TOK.argv.remove('-a')
        MOSES_TOK.restart()
    chunks = MOSES_TOK(text)
    return find_substrings(chunks, text)
def mystem_tokenize(text):
    from pymystem3 import Mystem

    global MYSTEM
    if not MYSTEM:
        MYSTEM = Mystem(
            grammar_info=False,
            entire_input=True,
            disambiguation=False,
            weight=False
        )
    data = MYSTEM.analyze(text)
    chunks = parse_mystem(data)
    return find_substrings(chunks, text)
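# Hypothetical sketch of the parse_mystem helper used above (the real one
# lives elsewhere in the repo): pymystem3's analyze() yields dicts with a
# 'text' key, and entire_input=True keeps whitespace chunks, so a parser
# along these lines would keep only the non-space surface strings.
def parse_mystem_sketch(data):
    for item in data:
        text = item['text']
        if text.strip():
            yield text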
def parse_tokens(stream):
    stream = filter_xml(stream, tags={'source', 'tokens', 'token'})
    buffer = []
    for event, node in stream:
        if event == 'end':
            tag = node.tag
            if tag == 'source':
                sent = node.text.strip()
            elif tag == 'token':
                word = node.get('text')
                buffer.append(word)
            elif tag == 'tokens':
                substrings = find_substrings(buffer, sent)
                yield substrings_partition(substrings)
                buffer = []
            node.clear()
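# The stream above is assumed to follow the OpenCorpora annotation layout:
#   <source>Мама мыла раму.</source>
#   <tokens>
#     <token text="Мама"/>
#     ...
#   </tokens>
# One partition is yielded per closed <tokens> element, aligned against the
# text of the preceding <source>.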
def parse_tokens(lines):
    lines = iter(lines)
    sent = None
    buffer = []
    for line in lines:
        if not line:
            substrings = find_substrings(buffer, sent)
            yield substrings_partition(substrings)
            buffer = []
        else:
            match = re.match(r'# text = (.+)$', line)
            if match:
                sent = match.group(1)
            if re.match(r'^\d', line):
                parts = line.split('\t')
                word = parts[1]
                buffer.append(word)
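# Expects CoNLL-U input, for example:
#   # text = Мама мыла раму.
#   1	Мама	мама	NOUN	...
#   2	мыла	мыть	VERB	...
#
# The '# text = ...' comment supplies the raw sentence, numbered lines carry
# the token forms in their second tab-separated column, and a blank line
# closes the sentence and triggers a yield.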
def spacy_tokenize2(text):
    from spacy.lang.ru import Russian
    from spacy_russian_tokenizer import (
        RussianTokenizer,
        MERGE_PATTERNS,
        SYNTAGRUS_RARE_CASES
    )

    global NLP2
    if not NLP2:
        NLP2 = Russian()
        NLP2.add_pipe(
            RussianTokenizer(NLP2, MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
            name='russian_tokenizer'
        )
    doc = NLP2(text)
    chunks = [token.text for token in doc]
    return find_substrings(chunks, text)
# method body of the segmenter base class (class definition elided here)
def __call__(self, text):
    parts = self.split(text)
    chunks = self.segment(parts)
    if self.post:
        chunks = self.post(chunks)
    return find_substrings(chunks, text)
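# __call__ composes three hooks that concrete segmenters are expected to
# provide: split(text) produces raw parts, segment(parts) turns them into
# candidate chunks, and an optional post(chunks) filter cleans them up before
# the chunks are aligned back to text offsets.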
def nltk_tokenize(text):
    from nltk.tokenize import word_tokenize

    chunks = word_tokenize(text, 'russian')
    return find_substrings(chunks, text)
def re_tokenize(text):
    chunks = TOKEN.findall(text)
    return find_substrings(chunks, text)
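# TOKEN is a module-level regex defined elsewhere in the repo. A minimal
# assumption of its shape, matching word runs, digit runs, and single
# punctuation marks, would be:
#   TOKEN = re.compile(r'[^\W\d_]+|\d+|[^\w\s]')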
def space_tokenize(text):
    chunks = re.split(r'\s+', text)
    return find_substrings(chunks, text)
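# Note that re.split(r'\s+', ' а б ') returns ['', 'а', 'б', '']: leading and
# trailing whitespace produce empty chunks, which find_substrings is assumed
# to tolerate or skip.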
def segtok_sentenize(text):
    from segtok.segmenter import split_single

    chunks = split_single(text)
    return find_substrings(chunks, text)
def nltk_sentenize(text):
    from nltk import sent_tokenize

    chunks = sent_tokenize(text, 'russian')
    return find_substrings(chunks, text)
def deepmipt_sentenize(text):
    from rusenttokenize import ru_sent_tokenize

    with no_logger(LOGGER):
        chunks = ru_sent_tokenize(text)
    return find_substrings(chunks, text)
def dot_sentenize(text):
    chunks = dot_sentenize_(text)
    return find_substrings(chunks, text)
def segtok_tokenize(text):
    from segtok.tokenizer import word_tokenizer

    chunks = word_tokenizer(text)
    return find_substrings(chunks, text)
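# Usage sketch: every wrapper shares the signature f(text) -> substrings, so
# the implementations can be compared uniformly. The loop below is only
# illustrative; the benchmark harness lives elsewhere in the repo.
#
#   for tokenize in [space_tokenize, re_tokenize, nltk_tokenize, segtok_tokenize]:
#       print(tokenize.__name__, tokenize('Кружка-термос на 0.5л.'))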