Example #1
import json

import nltk
from spacy.lang.ru import Russian
from spacy_russian_tokenizer import RussianTokenizer, MERGE_PATTERNS


def count_simple_stats():
    with open('data/articles.json', 'r', encoding='utf8') as f:
        articles = json.load(f)
    nlp = Russian()
    russian_tokenizer = RussianTokenizer(nlp, MERGE_PATTERNS)
    nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')
    texts_count = 0
    sent_count = 0
    words_count = 0
    symbols_count = 0
    for title in articles:
        text = articles[title][0].strip()
        texts_count += 1
        sents = nltk.sent_tokenize(text, language="russian")
        sent_count += len(sents)
        tokens = nlp(text)
        words_count += len(tokens)
        symbols = [symb for symb in text if symb != ' ' and symb != '\n']
        symbols_count += len(symbols)
        # print([token.text for token in tokens])
    print("Texts count:", texts_count)
    print("Sentences count:", sent_count)
    print("Words count:", words_count)
    print("Symbols count:", symbols_count)
Example #2
    def add_to_index(self, document, doc_id):
        # parser = HTMLParser(text=document['data'])
        text = document['data']

        # print(1)

        nlp = Russian()
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(text)
        tokens = [token.lower() for token in tokens]
        tmp_text = ' '.join(tokens)
        if len(tokens) > 1_000_000:
            return
        self.doc_iter += 1
        nlp.max_length = 100_000_000
        doc_text = nlp(tmp_text, disable=['ner', 'parser'])
        lemmas = []
        # for lemma in tokens:
        for s in doc_text:
            lemma = s.lemma_
            lemmas.append(lemma)
            # if lemma not in set(stopwords.words('russian')) \
            #         and lemma not in set(stopwords.words('english')) \
            #         and len(lemma) > 1:
            #     lemmas.append(lemma)
        freq = FreqDist(lemmas)
        for k, v in freq.most_common():
            if k not in self.global_index:
                self.global_index[k] = []
            self.global_index[k].append((doc_id, v))
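For orientation, the global_index built above is an inverted index mapping a lemma to (doc_id, frequency) pairs; a small hypothetical sketch of how such an index could be queried (lookup is not part of the original class):

def lookup(global_index, lemma):
    # Postings for a lemma, most frequent documents first.
    postings = global_index.get(lemma, [])
    return sorted(postings, key=lambda pair: pair[1], reverse=True)

index = {'погода': [(1, 3), (7, 10)], 'ветер': [(2, 1)]}
print(lookup(index, 'погода'))   # [(7, 10), (1, 3)]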
Example #3
class SpacyTokenizer:
    def __init__(self):
        self.nlp = Russian()
        self.nlp.add_pipe(RussianTokenizer(self.nlp, MERGE_PATTERNS),
                          name="russian_tokenizer")

    def tokenize(self, text):
        return [token.text for token in self.nlp(text) if token.text.strip()]
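A possible usage sketch for the class above, assuming spacy_russian_tokenizer is installed and its RussianTokenizer/MERGE_PATTERNS are imported at module level (spaCy 2.x add_pipe API):

tokenizer = SpacyTokenizer()
print(tokenizer.tokenize("Не ветер, а какой-то ураган!"))
# expected: ['Не', 'ветер', ',', 'а', 'какой-то', 'ураган', '!']
# (MERGE_PATTERNS keeps the hyphenated 'какой-то' as one token)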
Example #4
    def __init__(self):
        from spacy.lang.ru import Russian
        from spacy_russian_tokenizer import (RussianTokenizer, MERGE_PATTERNS,
                                             SYNTAGRUS_RARE_CASES)

        self.nlp = Russian()
        self.nlp.add_pipe(RussianTokenizer(
            self.nlp, MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
                          name='russian_tokenizer')
Example #5
def load_spacy_model(model):
    if model == "ru":
        try:
            from spacy.lang.ru import Russian
            return Russian()
            # import stanfordnlp
            # from spacy_stanfordnlp import StanfordNLPLanguage
            # snlp = stanfordnlp.Pipeline(lang="ru", models_dir="/cs/labs/oabend/lovodkin93/TUPA1_project/stanfordnlp_resources")
            # return StanfordNLPLanguage(snlp)

            #import stanza
            #return stanza.Pipeline(lang='ru', processors='tokenize,pos,lemma,depparse,ner', models_dir="//stanza_resources")
        except OSError as e:
            raise OSError(
                "Failed to get spaCy Russian model. Install it using "
                "pip install git+https://github.com/aatimofeev/spacy_russian_tokenizer.git"
            ) from e
    import spacy
    try:
        return spacy.load(model)
    except OSError:
        spacy.cli.download(model)
        # Workaround from https://github.com/explosion/spaCy/issues/3435#issuecomment-474580269
        from spacy.cli import link
        from spacy.util import get_package_path
        link(model, model, force=True, model_path=get_package_path(model))
        try:
            return spacy.load(model)
        except OSError as e:
            raise OSError(
                "Failed to get spaCy model. Download it manually using "
                "`python -m spacy download %s`." % model) from e
Example #6
    def __init__(self):
        super(ClassifierKNN, self).__init__()
        self.data_sets = []
        self.texts = {}
        self.options = {}
        self.threshold = 0.3

        self.russian_stop_words = stop_words.get_stop_words('russian')
        self.parser = Russian()
        self.stop_list = set(stopwords.words('russian') + list(
            self.russian_stop_words))
        # List of symbols we don't care about
        self.escape_symbols = ' '.join(string.punctuation).split(' ') +\
                              ['-----', '---', '...', '“', '”', '\'ve']
        # the vectorizer and classifier to use
        # note that I changed the tokenizer in CountVectorizer
        # to use a custom function using spaCy's tokenizer
        self.vectorizer = CountVectorizer(
            tokenizer=self.tokenizeText,
            ngram_range=(1, 1)
        )
        self.clf = KNeighborsClassifier(
            n_neighbors=20, weights='uniform',
            algorithm='auto'#, metric='mahalanobis'
        )

        # the pipeline to clean, tokenize, vectorize, and classify
        self.pipe = Pipeline(
            [
                ('cleanText', ClassifierKNN.CleanTextTransformer()),
                ('vectorizer', self.vectorizer),
                ('clf', self.clf)
            ]
        )
Example #7
def load_spacy_model(model):
    if model == "ru":
        try:
            from spacy.lang.ru import Russian
            return Russian()
        except OSError as e:
            raise OSError(
                "Failed to get spaCy Russian model. Install it using "
                "pip install git+https://github.com/aatimofeev/spacy_russian_tokenizer.git"
            ) from e
    import spacy
    try:
        return spacy.load(model)
    except OSError:
        spacy.cli.download(model)
        # Workaround from https://github.com/explosion/spaCy/issues/3435#issuecomment-474580269
        from spacy.cli import link
        from spacy.util import get_package_path
        link(model, model, force=True, model_path=get_package_path(model))
        try:
            return spacy.load(model)
        except OSError as e:
            raise OSError(
                "Failed to get spaCy model. Download it manually using "
                "`python -m spacy download %s`." % model) from e
Example #8
    def __init__(self):
        super(ClassifierSpacy, self).__init__()
        self.data_sets = []
        self.texts = {}
        self.options = {}
        self.threshold = 0.3

        self.russian_stop_words = stop_words.get_stop_words('russian')
        self.parser = Russian()
        self.stop_list = set(stopwords.words('russian') + list(
            self.russian_stop_words))
        # List of symbols we don't care about
        self.escape_symbols = ' '.join(string.punctuation).split(' ') +\
                              ['-----', '---', '...', '“', '”', '\'ve']
        # the vectorizer and classifier to use
        # note that I changed the tokenizer in CountVectorizer
        # to use a custom function using spaCy's tokenizer
        self.vectorizer = CountVectorizer(
            tokenizer=self.tokenizeText,
            ngram_range=(1, 1)
        )
        self.clf = MultinomialNB()
        # self.clf = LinearSVC()
        # self.clf = SVC(probability=True)

        # the pipeline to clean, tokenize, vectorize, and classify
        self.pipe = Pipeline(
            [
                ('cleanText', ClassifierSpacy.CleanTextTransformer()),
                ('vectorizer', self.vectorizer),
                ('clf', self.clf)
            ]
        )
Example #9
import json

import nltk
from spacy.lang.ru import Russian
from spacy_russian_tokenizer import RussianTokenizer, MERGE_PATTERNS


def tokenize():
    with open('data/articles.json', 'r', encoding='utf8') as f:
        articles = json.load(f)
    nlp = Russian()
    russian_tokenizer = RussianTokenizer(nlp, MERGE_PATTERNS)
    nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')
    texts_count = 0
    sent_count = 0
    words_count = 0
    symbols_count = 0
    for title in articles:
        text = articles[title][0].strip()
        texts_count += 1
        sents = nltk.sent_tokenize(text, language="russian")
        sent_count += len(sents)
        tokens = nlp(text)
        words_count += len(tokens)
        symbols = [symb for symb in text if symb != ' ' and symb != '\n']
        symbols_count += len(symbols)
Example #10
class TimofeevTokenizer:
    label = 'aatimofeev/spacy_russian_tokenizer'

    def __init__(self):
        from spacy.lang.ru import Russian
        from spacy_russian_tokenizer import (RussianTokenizer, MERGE_PATTERNS,
                                             SYNTAGRUS_RARE_CASES)

        self.nlp = Russian()
        self.nlp.add_pipe(RussianTokenizer(
            self.nlp, MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
                          name='russian_tokenizer')

    def __call__(self, text):
        doc = self.nlp(text)
        chunks = (token.text for token in doc)
        return find_substrings(chunks, text)
Example #11
import spacy
from spacy.lang.ru import Russian


def text_decomposition(text, lang='de'):
    if lang == 'de':
        nlp = spacy.load('de_core_news_md')
    elif lang == 'en':
        nlp = spacy.load("en_core_web_md")
    elif lang == 'ru':
        nlp = Russian()
        sentencizer = nlp.create_pipe("sentencizer")
        nlp.add_pipe(sentencizer)
    else:
        print("Unsupported language. Choose from ['en', 'de', 'ru']")
        return

    doc = nlp(text)
    sentences = list()
    for sent in doc.sents:
        sentences.append(sent.text)
    return sentences
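A brief usage sketch for the Russian branch (the blank pipeline has no parser, so the sentencizer splits on sentence-final punctuation); 'de' and 'en' require the named models to be installed:

sentences = text_decomposition("Это первое предложение. А это второе!", lang='ru')
print(sentences)
# ['Это первое предложение.', 'А это второе!']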
Example #12
    def __init__(
            self,
            regexp_suffixes=BASE_SUFFIXES_REGEXPS,
            regexp_prefixes=BASE_PREFIXES_REGEXPS,
            regexp_infixes=BASE_INFIXES_REGEXPS,
            regexp_base_token_matches=BASE_TOKEN_MATCH,
            merge_patterns=tuple(MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
            terminal_patterns=tuple(NO_TERMINAL_PATTERNS),
    ):
        """
        Parameters
        ----------
        regexp_suffixes : list of dict
            Dict in spacy format. See above for explanation of spacy format.
        regexp_prefixes : list of dict
            Dict in spacy format.
        regexp_infixes : list of dict
            Dict in spacy format.
        regexp_base_token_matches : list of dict
            Dict in spacy format.
        merge_patterns : list of dict
            Dict in spacy format.
        terminal_patterns : list of dict
            Dict in spacy format.
        """
        merge_patterns = list(merge_patterns)
        terminal_patterns = list(terminal_patterns)

        self.nlp_pipeline = Russian()
        self.nlp_pipeline.tokenizer = self.create_custom_pretokenizer(
            nlp_model=self.nlp_pipeline,
            prefix_regexp=regexp_prefixes,
            suffix_regexp=regexp_suffixes,
            infix_regexp=regexp_infixes,
            token_match_regexp=regexp_base_token_matches,
        )

        self.tokenizer_postprocesser = RussianTokenizer(
            self.nlp_pipeline,
            merge_patterns=merge_patterns,
            terminal_patterns=terminal_patterns)

        self.nlp_pipeline.add_pipe(self.tokenizer_postprocesser,
                                   name='russian_tokenizer_postprocesser')
Example #13
NLP = None


def spacy_tokenize(text):
    from spacy.lang.ru import Russian

    global NLP
    if not NLP:
        NLP = Russian()

    doc = NLP(text)
    chunks = [token.text for token in doc]
    return find_substrings(chunks, text)
Example #14
NLP2 = None


def spacy_tokenize2(text):
    from spacy.lang.ru import Russian
    from spacy_russian_tokenizer import (
        RussianTokenizer,
        MERGE_PATTERNS,
        SYNTAGRUS_RARE_CASES
    )

    global NLP2
    if not NLP2:
        NLP2 = Russian()
        NLP2.add_pipe(
            RussianTokenizer(NLP2, MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
            name='russian_tokenizer'
        )

    doc = NLP2(text)
    chunks = [token.text for token in doc]
    return find_substrings(chunks, text)
Example #15
class RusWordTokenizer(PreProcesser):
    
    def __init__(self):
        
        self.rus_word_tokenizer = Russian()
        
        pipe = RussianTokenizer(self.rus_word_tokenizer, MERGE_PATTERNS + SYNTAGRUS_RARE_CASES)
        self.rus_word_tokenizer.add_pipe(pipe, name='russian_tokenizer')
    
    def transform_text(self, text):
        return [Token(token_id, token.text) for token_id, token in enumerate(self.rus_word_tokenizer(text), 1)]
    
    def transform_sent(self, sent):
        
        sent = sent.copy()
        sent.tokens = self.transform_text(sent.text)
        
        return sent
    
    def transform_item(self, x):
        return [self.transform_sent(sent) for sent in x]
Example #16
    def spacy_sentence_scores(self) -> Dict[str, float]:
        nlp = Russian()
        sentencizer = nlp.create_pipe('sentencizer')
        nlp.add_pipe(sentencizer)

        raw_text = self.text
        docx = nlp(raw_text)
        stopwords = list(STOP_WORDS)

        word_frequencies = {}
        for word in docx:
            if word.text not in stopwords:
                word = MORPH.parse(word.text)[0].normalized
                if not ('PREP' in word.tag or 'CONJ' in word.tag or 'PRCL' in word.tag or 'INTJ' in word.tag):
                    if word.word not in word_frequencies.keys():
                        word_frequencies[word.word] = 1
                    else:
                        word_frequencies[word.word] += 1

        maximum_frequency = max(word_frequencies.values())

        for word in word_frequencies.keys():
            word_frequencies[word] = (word_frequencies[word] / maximum_frequency)
        sentence_list = [sentence for sentence in docx.sents]

        sentence_scores = {}
        for sent in sentence_list:
            for word in sent:
                word = MORPH.parse(word.text)[0].normalized
                if not ('PREP' in word.tag or 'CONJ' in word.tag or 'PRCL' in word.tag or 'INTJ' in word.tag):
                    if word.word in word_frequencies.keys():
                        if sent not in sentence_scores.keys():
                            sentence_scores[sent] = word_frequencies[word.word]
                        else:
                            sentence_scores[sent] += word_frequencies[word.word]

        return sentence_scores
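A hedged follow-up showing one common way such scores are consumed, taking the three highest-scoring sentences as a summary; summarizer stands in for an instance of the (unshown) class this method belongs to:

from heapq import nlargest

scores = summarizer.spacy_sentence_scores()   # {Span: score}
top = nlargest(3, scores, key=scores.get)     # best-scoring spaCy spans
print(' '.join(sent.text for sent in top))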
Example #17
def main():
    _fn = 'test_ru.json'
    try:
        with open(_fn, 'r', encoding='utf-8') as _fd:
            _buf = json.load(_fd)
    except IOError as _err:
        print(_err)
        return
    # _text = json.dumps(_buf)
    _text = str(_buf)
    print(_text)
    input("Press ENTER for continue...")
    nlp_obj = Russian()
    _doc = nlp_obj(_text)
    for _token in _doc:
        print(_token.text)
    input("Press ENTER for continue...")
Example #18
def get_tokenizer(lang):
    if lang == "zh":
        # nlp = spacy.load("zh_core_web_sm")
        nlp = Chinese()
    elif lang == "en":
        # nlp = spacy.load("en_core_web_sm")
        nlp = English()
    elif lang == "cs":
        nlp = Czech()
    elif lang == "de":
        # nlp = spacy.load("de_core_news_sm")
        nlp = German()
    elif lang == "ru":
        nlp = Russian()
    else:
        raise Exception("Unacceptable language.")
    return nlp
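A small usage sketch, assuming the spacy.lang classes referenced above (Chinese, English, Czech, German, Russian) are imported at module level:

nlp = get_tokenizer("ru")
print([tok.text for tok in nlp("Привет, мир!")])   # ['Привет', ',', 'мир', '!']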
Example #19
    def make_document_tf(self, document):
        tf = {}
        parser = HTMLParser(text=document['data'])
        text = parser.get_text()

        nlp = Russian()
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(text)
        tmp_text = ' '.join(tokens)
        doc_text = nlp(tmp_text)
        lemmas = []
        for s in doc_text:
            if s.lemma_ not in set(stopwords.words('russian')) \
                    and s.lemma_ not in set(stopwords.words('english')):
                lemmas.append(s.lemma_)
        freq = FreqDist(lemmas)
        print(freq.most_common(10))

        # TODO most_common -> all
        for k, v in freq.most_common(10):
            tf = self.update_tf(tf, k, document['url'], v)

        return tf
Example #20
import os
import sys
import argparse
import codecs
import json

from tqdm import tqdm
from spacy.lang.ru import Russian
from spacy.lang.en import English

from dataset_utils.utils import save_output
from dataset_utils.global_vars import TEXT_FIELDS

TOKENIZERS = {'Russian': Russian().tokenizer, 'English': English().tokenizer}


def main(input_dir: str, output_dir: str, language: str):
    tasks = [task for task in os.listdir(input_dir) if task in TEXT_FIELDS]
    [
        preprocess_task(input_dir, output_dir, t, TOKENIZERS[language])
        for t in tqdm(tasks)
    ]


def preprocess_task(input_dir: str, output_dir, task: str, preproc_fn):
    """ replaces raw texts with preprocessed ones """
    if not os.path.isdir(output_dir + task):
        # create directories for preprocessed tasks
        os.makedirs(output_dir + task)

    samples = [
Example #21
#!/usr/bin/env python3
from __future__ import unicode_literals
import sys
import spacy.lang.ru
import re
from spacy.lang.ru import Russian
import spacy
corpora_path = 'raw_corpora.txt'

global_word_count = {}
texts_word_count = []

nlp = Russian()
tokenizer = nlp.Defaults.create_tokenizer()
with open(corpora_path, 'r') as f:
    for cnt, line in enumerate(f):
        line = line.strip()
        if line == '/***/':
            texts_word_count.append({})
            continue
        doc = nlp(line)
        for word in doc:
            if re.match(r'\w+', word.text, re.I) is None:
                continue
            if word.text not in global_word_count:
                global_word_count[word.text] = 1
            else:
                global_word_count[word.text] += 1

            if word.text not in texts_word_count[-1]:
                texts_word_count[-1][word.text] = 1
Example #22
    def __init__(self):
        self.nlp = Russian()
        self.nlp.add_pipe(RussianTokenizer(self.nlp, MERGE_PATTERNS),
                          name="russian_tokenizer")
Example #23
import pandas as pd

import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

import spacy
from spacy.lang.ru import Russian
from spacy_russian_tokenizer import RussianTokenizer, MERGE_PATTERNS

import stanza
from spacy_stanza import StanzaLanguage

#nltk.download("stopwords")
#stanza.download('ru')  # will take a while
#russian_stopwords = stopwords.words("russian")
russian_stopwords = spacy.lang.ru.stop_words.STOP_WORDS

# ================================================== EXAMPLE ===================================================
text = "Не ветер, а какой-то ураган!"
nlp = Russian()
doc = nlp(text)
russian_tokenizer = RussianTokenizer(nlp, MERGE_PATTERNS)
nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')
doc = nlp(text)
print([token.text for token in doc])
# =============================================================================================================

toxic_rus = pd.read_csv("./__DATA/NLP_Datasets/toxic_russian.csv")

toxic_rus.head()
toxic_rus.info()

# Removing punctuation
toxic_rus['comment'] = toxic_rus['comment'].str.replace(r'[.,!?<>-]', '', regex=True)
toxic_rus['comment'] = toxic_rus['comment'].str.replace("\n", " ")
Example #24
from spacy.lang.en import English
from spacy.lang.es import Spanish
from spacy.lang.fr import French
from spacy.lang.it import Italian
from spacy.lang.de import German
from spacy.lang.ru import Russian
from spacy.lang.zh import Chinese
from spacy.lang.ja import Japanese
from spacy.lang.ca import Catalan
from spacy.lang.eu import Basque

from DataHandler import load_df_twitter_sent, load_df_lorelei
from util import clean_str as test_clean_str
from nltk.corpus import stopwords
from util import identity_fn, lang2id

language_dict = {
    'english': English(),
    'spanish': Spanish(),
    'french': French(),
    'italian': Italian(),
    'german': German(),
    'russian': Russian(),
    'chinese': Chinese(),
    'japanese': Japanese(),
    'catalan': Catalan(),
    'basque': Basque(),
}


class Tokenizer:
    def __init__(self,
                 language,
                 tokenizer_method='spacy',
                 remove_stopwords=True,
                 lowercase=True,
                 strip_accents=None,
                 ngram_range=(1, 1),
Example #25
def main(args):

    if not os.path.exists(args.save_path):
        os.mkdir(args.save_path)

    tokenizers = {
        "en": spacy.load("en_core_web_sm"),
        "zh": spacy.load("zh_core_web_sm"),
        "ru": Russian(),
        "fr": spacy.load("fr_core_news_sm"),
        "es": spacy.load("es_core_news_sm"),
        "ar": WordTokenizer("arabic"),
    }

    src_tokenizer = None
    if args.src_tok is not None:
        src_tok = tokenizers[args.src_tok]
        if args.src_tok == "ar":

            def tokenize_src(text):
                return [tok for tok in src_tok.tokenize(text)]

        else:

            def tokenize_src(text):
                return [tok.text for tok in src_tok.tokenizer(text)]

        src_tokenizer = tokenize_src

    trg_tokenizer = None
    if args.trg_tok is not None:
        trg_tok = tokenizers[args.trg_tok]
        if args.trg_tok == "ar":

            def tokenize_trg(text):
                return [tok for tok in trg_tok.tokenize(text)]

        else:

            def tokenize_trg(text):
                return [tok.text for tok in trg_tok.tokenizer(text)]

        trg_tokenizer = tokenize_trg

    if args.task == "translation":
        indices = prep_trans_files(
            args.src_file,
            args.trg_file,
            args.save_path,
            src_tok=src_tokenizer,
            trg_tok=trg_tokenizer,
            max_len=args.max_len,
            min_len=args.min_len,
        )
    elif args.task == "tagging":
        indices = prep_tag_files(
            args.src_file,
            args.save_path,
            src_tok=src_tokenizer,
            max_len=args.max_len,
            min_len=args.min_len,
        )

    train, indices = train_test_split(indices, test_size=0.3, random_state=42)
    valid, test = train_test_split(indices, test_size=0.5, random_state=42)

    split_to_tsv("train", train, args.save_path)
    split_to_tsv("test", test, args.save_path)
    split_to_tsv("valid", valid, args.save_path)

    # delete temporary files
    os.remove(os.path.join(args.save_path, "temp_src.txt"))
    os.remove(os.path.join(args.save_path, "temp_trg.txt"))
Example #26
from spacy.lang.en import English
from spacy.lang.es import Spanish
from spacy.lang.fr import French
from spacy.lang.zh import Chinese
from spacy.lang.ru import Russian
from spacy.lang.ar import Arabic
from spacy.lang.de import German
from spacy.lang.uk import Ukrainian
from spacy.lang.ro import Romanian

lang_id_to_spacy = {
    'en': English(),
    'es': Spanish(),
    'fr': French(),
    'zh-cn': Chinese(),
    'ru': Russian(),
    'ar': Arabic(),
    'de': German(),
    'uk': Ukrainian(),
    'ro': Romanian()
}

#####################
### Globals
#####################

reddit = Reddit(client_id='OFsSWAsbFrzLpg',
                client_secret='tRReu7VAAyxgEXbGqaE19_OUrR4',
                password='******',
                user_agent='testscript by /u/pocaguirre',
                username='******')
Example #27
class SpacyRulesRussianTokenizer():
    """
    Tokenizer based on https://github.com/aatimofeev/spacy_russian_tokenizer.git
    Tokenizer was built on spacy and use spacy standart tokenization pipeline.
    You can read more about it here:
        * https://spacy.io/usage/linguistic-features#section-tokenization
        * https://spacy.io/usage/rule-based-matching
    Installation instruction:
    1) pip install spacy
    2) pip install git+https://github.com/aatimofeev/spacy_russian_tokenizer.git
    """
    def __init__(
            self,
            regexp_suffixes=BASE_SUFFIXES_REGEXPS,
            regexp_prefixes=BASE_PREFIXES_REGEXPS,
            regexp_infixes=BASE_INFIXES_REGEXPS,
            regexp_base_token_matches=BASE_TOKEN_MATCH,
            merge_patterns=tuple(MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
            terminal_patterns=tuple(NO_TERMINAL_PATTERNS),
    ):
        """
        Parameters
        ----------
        regexp_suffixes : list of dict
            Dict in spacy format. See above for explanation of spacy format.
        regexp_prefixes : list of dict
            Dict in spacy format.
        regexp_infixes : list of dict
            Dict in spacy format.
        regexp_base_token_matches : list of dict
            Dict in spacy format.
        merge_patterns : list of dict
            Dict in spacy format.
        terminal_patterns : list of dict
            Dict in spacy format.
        """
        merge_patterns = list(merge_patterns)
        terminal_patterns = list(terminal_patterns)

        self.nlp_pipeline = Russian()
        self.nlp_pipeline.tokenizer = self.create_custom_pretokenizer(
            nlp_model=self.nlp_pipeline,
            prefix_regexp=regexp_prefixes,
            suffix_regexp=regexp_suffixes,
            infix_regexp=regexp_infixes,
            token_match_regexp=regexp_base_token_matches,
        )

        self.tokenizer_postprocesser = RussianTokenizer(
            self.nlp_pipeline,
            merge_patterns=merge_patterns,
            terminal_patterns=terminal_patterns)

        self.nlp_pipeline.add_pipe(self.tokenizer_postprocesser,
                                   name='russian_tokenizer_postprocesser')

    @staticmethod
    def create_custom_pretokenizer(nlp_model, prefix_regexp, suffix_regexp,
                                   infix_regexp, token_match_regexp):
        custom_pretokenizer = SpacyBaseTokenizer(
            nlp_model.vocab,
            prefix_search=prefix_regexp.search,
            suffix_search=suffix_regexp.search,
            infix_finditer=infix_regexp.finditer,
            token_match=token_match_regexp.match,
        )
        return custom_pretokenizer

    def transform_element(self, element):
        """
        Get tokenization variant of the element.
        Parameters
        ----------
        element : str
            String, supposed to be a sentence, one document or something analogous.
        Returns
        -------
        tokens_array : list of str
            Tokenized string.
        """
        if not isinstance(element, str):
            raise TypeError(
                f"Cannot tokenize {type(element)} instead of {type('')}!")
        tokens_array = [token.text for token in self.nlp_pipeline(element)]
        return tokens_array

    def transform(self, elements_collection):
        """
        Apply transformer to collection of elements (objects).
        Parameters
        ----------
        elements_collection : iterable of optional
            Collection of objects to be transformed.
        Returns
        -------
        transformed_elements : list of optional
            Collection of transformed objects.
        """
        transformed_elements = [
            self.transform_element(element) for element in elements_collection
        ]
        return transformed_elements
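A possible usage sketch for the class above, assuming spacy_russian_tokenizer and its pattern constants are importable as in the (unshown) module header:

tokenizer = SpacyRulesRussianTokenizer()
print(tokenizer.transform_element("Не ветер, а какой-то ураган!"))
# ['Не', 'ветер', ',', 'а', 'какой-то', 'ураган', '!']

print(tokenizer.transform(["Первый текст.", "Второй текст."]))
# [['Первый', 'текст', '.'], ['Второй', 'текст', '.']]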
Example #28
    def __init__(self):

        self.rus_word_tokenizer = Russian()

        pipe = RussianTokenizer(self.rus_word_tokenizer, MERGE_PATTERNS + SYNTAGRUS_RARE_CASES)
        self.rus_word_tokenizer.add_pipe(pipe, name='russian_tokenizer')
Example #29
def tokenizer(inp):
    nlp = Russian()
    russian_tokenizer = RussianTokenizer(nlp, MERGE_PATTERNS)
    nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')
    return nlp(inp)
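Usage sketch; note that the function rebuilds the pipeline on every call, so for repeated use it is cheaper to create the Russian() object once and reuse it:

doc = tokenizer("Не ветер, а какой-то ураган!")
print([token.text for token in doc])
# ['Не', 'ветер', ',', 'а', 'какой-то', 'ураган', '!']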
Example #30
    def lemmatize(self, token, pos_tag):
        nlp = Russian()
        docs = iter(nlp(token))
        return next(docs).lemma_