def count_simple_stats():
    with open('data/articles.json', 'r', encoding='utf8') as f:
        json_str = f.readlines()[0]
    articles = json.loads(json_str)

    nlp = Russian()
    russian_tokenizer = RussianTokenizer(nlp, MERGE_PATTERNS)
    nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')

    texts_count = 0
    sent_count = 0
    words_count = 0
    symbols_count = 0
    for title in articles:
        text = articles[title][0].strip()
        texts_count += 1

        sents = nltk.sent_tokenize(text, language="russian")
        sent_count += len(sents)

        tokens = nlp(text)
        words_count += len(tokens)

        symbols = [symb for symb in text if symb != ' ' and symb != '\n']
        symbols_count += len(symbols)
        # print([token.text for token in tokens])

    print("Texts count:", texts_count)
    print("Sentences count:", sent_count)
    print("Words count:", words_count)
    print("Symbols count:", symbols_count)
def add_to_index(self, document, doc_id):
    # parser = HTMLParser(text=document['data'])
    text = document['data']
    # print(1)
    nlp = Russian()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    tokens = [token.lower() for token in tokens]
    tmp_text = ' '.join(tokens)
    if len(tokens) > 10e5:
        return
    self.doc_iter += 1
    nlp.max_length = 10e7
    doc_text = nlp(tmp_text, disable=['ner', 'parser'])
    lemmas = []
    # for lemma in tokens:
    for s in doc_text:
        lemma = s.lemma_
        lemmas.append(lemma)
        # if lemma not in set(stopwords.words('russian')) \
        #         and lemma not in set(stopwords.words('english')) \
        #         and len(lemma) > 1:
        #     lemmas.append(lemma)
    freq = FreqDist(lemmas)
    for k, v in freq.most_common():
        if k not in self.global_index:
            self.global_index[k] = []
        self.global_index[k].append((doc_id, v))
class SpacyTokenizer:
    def __init__(self):
        self.nlp = Russian()
        self.nlp.add_pipe(RussianTokenizer(self.nlp, MERGE_PATTERNS),
                          name="russian_tokenizer")

    def tokenize(self, text):
        return [token.text for token in self.nlp(text) if token.text.strip()]
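# Usage sketch (added for illustration, not from the original project): assumes spaCy 2.x
# and the spacy_russian_tokenizer package are installed so that RussianTokenizer and
# MERGE_PATTERNS resolve; MERGE_PATTERNS keeps hyphenated words such as "какой-то" as one token.
tokenizer = SpacyTokenizer()
print(tokenizer.tokenize("Не ветер, а какой-то ураган!"))
# expected, roughly: ['Не', 'ветер', ',', 'а', 'какой-то', 'ураган', '!']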
def __init__(self):
    from spacy.lang.ru import Russian
    from spacy_russian_tokenizer import (RussianTokenizer, MERGE_PATTERNS,
                                         SYNTAGRUS_RARE_CASES)

    self.nlp = Russian()
    self.nlp.add_pipe(
        RussianTokenizer(self.nlp, MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
        name='russian_tokenizer'
    )
def load_spacy_model(model):
    if model == "ru":
        try:
            from spacy.lang.ru import Russian
            return Russian()
            # import stanfordnlp
            # from spacy_stanfordnlp import StanfordNLPLanguage
            # snlp = stanfordnlp.Pipeline(lang="ru", models_dir="/cs/labs/oabend/lovodkin93/TUPA1_project/stanfordnlp_resources")
            # return StanfordNLPLanguage(snlp)
            # import stanza
            # return stanza.Pipeline(lang='ru', processors='tokenize,pos,lemma,depparse,ner', models_dir="//stanza_resources")
        except OSError as e:
            raise OSError(
                "Failed to get spaCy Russian model. Install it using "
                "pip install git+https://github.com/aatimofeev/spacy_russian_tokenizer.git"
            ) from e
    import spacy
    try:
        return spacy.load(model)
    except OSError:
        spacy.cli.download(model)
        # Workaround from https://github.com/explosion/spaCy/issues/3435#issuecomment-474580269
        from spacy.cli import link
        from spacy.util import get_package_path
        link(model, model, force=True, model_path=get_package_path(model))
        try:
            return spacy.load(model)
        except OSError as e:
            raise OSError(
                "Failed to get spaCy model. Download it manually using "
                "`python -m spacy download %s`." % model) from e
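# Usage sketch (illustration only): "ru" returns a blank rule-based Russian() pipeline,
# while any other name goes through spacy.load() with a download-and-link fallback if the
# package is missing; "en_core_web_sm" below is just an example model name.
nlp_ru = load_spacy_model("ru")              # blank pipeline, tokenizer only
nlp_en = load_spacy_model("en_core_web_sm")  # pretrained model, downloaded on first use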
def __init__(self):
    super(ClassifierKNN, self).__init__()
    self.data_sets = []
    self.texts = {}
    self.options = {}
    self.threshold = 0.3
    self.russian_stop_words = stop_words.get_stop_words('russian')
    self.parser = Russian()
    self.stop_list = set(stopwords.words('russian') +
                         list(self.russian_stop_words))

    # List of symbols we don't care about
    self.escape_symbols = ' '.join(string.punctuation).split(' ') + \
        ['-----', '---', '...', '“', '”', '\'ve']

    # the vectorizer and classifier to use
    # note that I changed the tokenizer in CountVectorizer
    # to use a custom function using spaCy's tokenizer
    self.vectorizer = CountVectorizer(
        tokenizer=self.tokenizeText,
        ngram_range=(1, 1)
    )
    self.clf = KNeighborsClassifier(
        n_neighbors=20,
        weights='uniform',
        algorithm='auto'  # , metric='mahalanobis'
    )

    # the pipeline to clean, tokenize, vectorize, and classify
    self.pipe = Pipeline([
        ('cleanText', ClassifierKNN.CleanTextTransformer()),
        ('vectorizer', self.vectorizer),
        ('clf', self.clf)
    ])
def load_spacy_model(model):
    if model == "ru":
        try:
            from spacy.lang.ru import Russian
            return Russian()
        except OSError as e:
            raise OSError(
                "Failed to get spaCy Russian model. Install it using "
                "pip install git+https://github.com/aatimofeev/spacy_russian_tokenizer.git"
            ) from e
    import spacy
    try:
        return spacy.load(model)
    except OSError:
        spacy.cli.download(model)
        # Workaround from https://github.com/explosion/spaCy/issues/3435#issuecomment-474580269
        from spacy.cli import link
        from spacy.util import get_package_path
        link(model, model, force=True, model_path=get_package_path(model))
        try:
            return spacy.load(model)
        except OSError as e:
            raise OSError(
                "Failed to get spaCy model. Download it manually using "
                "`python -m spacy download %s`." % model) from e
def __init__(self):
    super(ClassifierSpacy, self).__init__()
    self.data_sets = []
    self.texts = {}
    self.options = {}
    self.threshold = 0.3
    self.russian_stop_words = stop_words.get_stop_words('russian')
    self.parser = Russian()
    self.stop_list = set(stopwords.words('russian') +
                         list(self.russian_stop_words))

    # List of symbols we don't care about
    self.escape_symbols = ' '.join(string.punctuation).split(' ') + \
        ['-----', '---', '...', '“', '”', '\'ve']

    # the vectorizer and classifier to use
    # note that I changed the tokenizer in CountVectorizer
    # to use a custom function using spaCy's tokenizer
    self.vectorizer = CountVectorizer(
        tokenizer=self.tokenizeText,
        ngram_range=(1, 1)
    )
    self.clf = MultinomialNB()
    # self.clf = LinearSVC()
    # self.clf = SVC(probability=True)

    # the pipeline to clean, tokenize, vectorize, and classify
    self.pipe = Pipeline([
        ('cleanText', ClassifierSpacy.CleanTextTransformer()),
        ('vectorizer', self.vectorizer),
        ('clf', self.clf)
    ])
def tokenize():
    with open('data/articles.json', 'r', encoding='utf8') as f:
        json_str = f.readlines()[0]
    articles = json.loads(json_str)

    nlp = Russian()
    russian_tokenizer = RussianTokenizer(nlp, MERGE_PATTERNS)
    nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')

    # running totals over the whole corpus
    texts_count = 0
    sent_count = 0
    words_count = 0
    symbols_count = 0
    for title in articles:
        text = articles[title][0].strip()
        texts_count += 1

        sents = nltk.sent_tokenize(text, language="russian")
        sent_count += len(sents)

        tokens = nlp(text)
        words_count += len(tokens)

        symbols = [symb for symb in text if symb != ' ' and symb != '\n']
        symbols_count += len(symbols)
class TimofeevTokenizer:
    label = 'aatimofeev/spacy_russian_tokenizer'

    def __init__(self):
        from spacy.lang.ru import Russian
        from spacy_russian_tokenizer import (RussianTokenizer, MERGE_PATTERNS,
                                             SYNTAGRUS_RARE_CASES)

        self.nlp = Russian()
        self.nlp.add_pipe(
            RussianTokenizer(self.nlp, MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
            name='russian_tokenizer'
        )

    def __call__(self, text):
        doc = self.nlp(text)
        chunks = (token.text for token in doc)
        return find_substrings(chunks, text)
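# Usage sketch (illustration only): the tokenizer is callable and returns whatever
# find_substrings() yields, i.e. the spaCy token texts aligned back to spans of the
# original string (the exact record type depends on the caller's find_substrings helper).
tokenizer = TimofeevTokenizer()
for span in tokenizer('Не ветер, а какой-то ураган!'):
    print(span)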
def text_decomposition(text, lang='de'):
    if lang == 'de':
        nlp = spacy.load('de_core_news_md')
    elif lang == 'en':
        nlp = spacy.load("en_core_web_md")
    elif lang == 'ru':
        nlp = Russian()
        sentencizer = nlp.create_pipe("sentencizer")
        nlp.add_pipe(sentencizer)
    else:
        print("Unsupported language. Choose from ['en', 'de', 'ru']")
        return

    doc = nlp(text)
    sentences = list()
    for sent in doc.sents:
        sentences.append(sent.text)
    return sentences
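# Usage sketch (illustration only): for 'ru' the blank pipeline plus the rule-based
# sentencizer is enough to split on sentence-final punctuation, no pretrained model needed.
print(text_decomposition("Мама мыла раму. Папа читал газету.", lang='ru'))
# expected, roughly: ['Мама мыла раму.', 'Папа читал газету.']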
def __init__(
        self,
        regexp_suffixes=BASE_SUFFIXES_REGEXPS,
        regexp_prefixes=BASE_PREFIXES_REGEXPS,
        regexp_infixes=BASE_INFIXES_REGEXPS,
        regexp_base_token_matches=BASE_TOKEN_MATCH,
        merge_patterns=tuple(MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
        terminal_patterns=tuple(NO_TERMINAL_PATTERNS),
):
    """
    Parameters
    ----------
    regexp_suffixes : list of dict
        Dict in spacy format. See above for explanation of spacy format.
    regexp_prefixes : list of dict
        Dict in spacy format.
    regexp_infixes : list of dict
        Dict in spacy format.
    regexp_base_token_matches : list of dict
        Dict in spacy format.
    merge_patterns : list of dict
        Dict in spacy format.
    terminal_patterns : list of dict
        Dict in spacy format.
    """
    merge_patterns = list(merge_patterns)
    terminal_patterns = list(terminal_patterns)

    self.nlp_pipeline = Russian()
    self.nlp_pipeline.tokenizer = self.create_custom_pretokenizer(
        nlp_model=self.nlp_pipeline,
        prefix_regexp=regexp_prefixes,
        suffix_regexp=regexp_suffixes,
        infix_regexp=regexp_infixes,
        token_match_regexp=regexp_base_token_matches,
    )

    self.tokenizer_postprocesser = RussianTokenizer(
        self.nlp_pipeline,
        merge_patterns=merge_patterns,
        terminal_patterns=terminal_patterns)

    self.nlp_pipeline.add_pipe(self.tokenizer_postprocesser,
                               name='russian_tokenizer_postprocesser')
def spacy_tokenize(text):
    from spacy.lang.ru import Russian

    global NLP
    if not NLP:
        NLP = Russian()

    doc = NLP(text)
    chunks = [token.text for token in doc]
    return find_substrings(chunks, text)
def spacy_tokenize2(text):
    from spacy.lang.ru import Russian
    from spacy_russian_tokenizer import (
        RussianTokenizer,
        MERGE_PATTERNS,
        SYNTAGRUS_RARE_CASES
    )

    global NLP2
    if not NLP2:
        NLP2 = Russian()
        NLP2.add_pipe(
            RussianTokenizer(NLP2, MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
            name='russian_tokenizer'
        )

    doc = NLP2(text)
    chunks = [token.text for token in doc]
    return find_substrings(chunks, text)
class RusWordTokenizer(PreProcesser):
    def __init__(self):
        self.rus_word_tokenizer = Russian()
        pipe = RussianTokenizer(self.rus_word_tokenizer,
                                MERGE_PATTERNS + SYNTAGRUS_RARE_CASES)
        self.rus_word_tokenizer.add_pipe(pipe, name='russian_tokenizer')

    def transform_text(self, text):
        return [Token(token_id, token.text)
                for token_id, token in enumerate(self.rus_word_tokenizer(text), 1)]

    def transform_sent(self, sent):
        sent = sent.copy()
        sent.tokens = self.transform_text(sent.text)
        return sent

    def transform_item(self, x):
        return [self.transform_sent(sent) for sent in x]
def spacy_sentence_scores(self) -> Dict[str, float]:
    nlp = Russian()
    sentencizer = nlp.create_pipe('sentencizer')
    nlp.add_pipe(sentencizer)

    raw_text = self.text
    docx = nlp(raw_text)
    stopwords = list(STOP_WORDS)

    word_frequencies = {}
    for word in docx:
        if word.text not in stopwords:
            word = MORPH.parse(word.text)[0].normalized
            if not ('PREP' in word.tag or 'CONJ' in word.tag
                    or 'PRCL' in word.tag or 'INTJ' in word.tag):
                if word.word not in word_frequencies.keys():
                    word_frequencies[word.word] = 1
                else:
                    word_frequencies[word.word] += 1

    maximum_frequency = max(word_frequencies.values())

    for word in word_frequencies.keys():
        word_frequencies[word] = (word_frequencies[word] / maximum_frequency)

    sentence_list = [sentence for sentence in docx.sents]
    sentence_scores = {}

    for sent in sentence_list:
        for word in sent:
            word = MORPH.parse(word.text)[0].normalized
            if not ('PREP' in word.tag or 'CONJ' in word.tag
                    or 'PRCL' in word.tag or 'INTJ' in word.tag):
                if word.word in word_frequencies.keys():
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word.word]
                    else:
                        sentence_scores[sent] += word_frequencies[word.word]

    return sentence_scores
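# Possible follow-up (illustration only, not part of the original class): the returned
# dict maps spaCy sentence spans to weights, so an extractive summary can be built by
# keeping the highest-scoring sentences, e.g. with heapq.nlargest.
import heapq

def top_sentences(sentence_scores, n=3):
    # pick the n sentences with the largest scores and join their texts
    best = heapq.nlargest(n, sentence_scores, key=sentence_scores.get)
    return ' '.join(sent.text for sent in best)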
def main():
    _fn = 'test_ru.json'
    try:
        with open(_fn, 'r', encoding='utf-8') as _fd:
            _buf = json.load(_fd)
    except IOError as _err:
        print(_err)
        return

    # _text = json.dumps(_buf)
    _text = str(_buf)
    print(_text)
    input("Press ENTER to continue...")

    nlp_obj = Russian()
    _doc = nlp_obj(_text)
    for _token in _doc:
        print(_token.text)
    input("Press ENTER to continue...")
def get_tokenizer(lang):
    if lang == "zh":
        # nlp = spacy.load("zh_core_web_sm")
        nlp = Chinese()
    elif lang == "en":
        # nlp = spacy.load("en_core_web_sm")
        nlp = English()
    elif lang == "cs":
        nlp = Czech()
    elif lang == "de":
        # nlp = spacy.load("de_core_web_sm")
        nlp = German()
    elif lang == "ru":
        nlp = Russian()
    else:
        raise Exception("Unacceptable language.")
    return nlp
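# Usage sketch (illustration only): each branch returns a blank spaCy pipeline, so only
# the language-specific tokenizer is available (no tagging or parsing).
nlp = get_tokenizer("ru")
print([token.text for token in nlp("Привет, мир!")])
# expected, roughly: ['Привет', ',', 'мир', '!']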
def make_document_tf(self, document):
    tf = {}
    parser = HTMLParser(text=document['data'])
    text = parser.get_text()
    nlp = Russian()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    tmp_text = ' '.join(tokens)
    doc_text = nlp(tmp_text)
    lemmas = []
    for s in doc_text:
        if s.lemma_ not in set(stopwords.words('russian')) \
                and s.lemma_ not in set(stopwords.words('english')):
            lemmas.append(s.lemma_)
    freq = FreqDist(lemmas)
    print(freq.most_common(10))
    # TODO most_common -> all
    for k, v in freq.most_common(10):
        tf = self.update_tf(tf, k, document['url'], v)
    return tf
import os
import sys
import argparse
import codecs
import json

from tqdm import tqdm
from spacy.lang.ru import Russian
from spacy.lang.en import English

from dataset_utils.utils import save_output
from dataset_utils.global_vars import TEXT_FIELDS

TOKENIZERS = {'Russian': Russian().tokenizer,
              'English': English().tokenizer}


def main(input_dir: str, output_dir: str, language: str):
    tasks = [task for task in os.listdir(input_dir) if task in TEXT_FIELDS]
    [
        preprocess_task(input_dir, output_dir, t, TOKENIZERS[language])
        for t in tqdm(tasks)
    ]


def preprocess_task(input_dir: str, output_dir, task: str, preproc_fn):
    """ replaces raw texts with preprocessed ones """
    if not os.path.isdir(output_dir + task):
        # create directories for preprocessed tasks
        os.makedirs(output_dir + task)

    samples = [
#!/usr/bin/env python3
from __future__ import unicode_literals

import sys
import re

import spacy
import spacy.lang.ru
from spacy.lang.ru import Russian

corpora_path = 'raw_corpora.txt'

global_word_count = {}
texts_word_count = []

nlp = Russian()
tokenizer = nlp.Defaults.create_tokenizer()

with open(corpora_path, 'r') as f:
    for cnt, line in enumerate(f):
        line = line.strip()
        if line == '/***/':
            texts_word_count.append({})
            continue
        doc = nlp(line)
        for word in doc:
            if re.match(r'[\w]+', word.text, re.I) is None:
                continue
            if word.text not in global_word_count:
                global_word_count[word.text] = 1
            else:
                global_word_count[word.text] += 1
            if word.text not in texts_word_count[-1]:
                texts_word_count[-1][word.text] = 1
def __init__(self):
    self.nlp = Russian()
    self.nlp.add_pipe(RussianTokenizer(self.nlp, MERGE_PATTERNS),
                      name="russian_tokenizer")
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import stanza
from spacy_stanza import StanzaLanguage

# nltk.download("stopwords")
# stanza.download('ru')  # will take a while

# russian_stopwords = stopwords.words("russian")
russian_stopwords = spacy.lang.ru.stop_words.STOP_WORDS

# ================================================== EXAMPLE ===================================================
text = "Не ветер, а какой-то ураган!"

nlp = Russian()
doc = nlp(text)

russian_tokenizer = RussianTokenizer(nlp, MERGE_PATTERNS)
nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')
doc = nlp(text)
print([token.text for token in doc])
# =============================================================================================================

toxic_rus = pd.read_csv("./__DATA/NLP_Datasets/toxic_russian.csv")
toxic_rus.head()
toxic_rus.info()

# Removing punctuation
toxic_rus['comment'] = toxic_rus['comment'].str.replace(r'[.,!?<>-]', '')
toxic_rus['comment'] = toxic_rus['comment'].str.replace("\n", " ")
from spacy.lang.ja import Japanese
from spacy.lang.ca import Catalan
from spacy.lang.eu import Basque

from DataHandler import load_df_twitter_sent, load_df_lorelei
from util import clean_str as test_clean_str
from nltk.corpus import stopwords
from util import identity_fn, lang2id

language_dict = {
    'english': English(),
    'spanish': Spanish(),
    'french': French(),
    'italian': Italian(),
    'german': German(),
    'russian': Russian(),
    'chinese': Chinese(),
    'japanese': Japanese(),
    'catalan': Catalan(),
    'basque': Basque(),
}


class Tokenizer:
    def __init__(self, language, tokenizer_method='spacy',
                 remove_stopwords=True, lowercase=True,
                 strip_accents=None, ngram_range=(1, 1),
def main(args):
    if not os.path.exists(args.save_path):
        os.mkdir(args.save_path)

    tokenizers = {
        "en": spacy.load("en_core_web_sm"),
        "zh": spacy.load("zh_core_web_sm"),
        "ru": Russian(),
        "fr": spacy.load("fr_core_news_sm"),
        "es": spacy.load("es_core_news_sm"),
        "ar": WordTokenizer("arabic"),
    }

    src_tokenizer = None
    if args.src_tok is not None:
        src_tok = tokenizers[args.src_tok]
        if args.src_tok == "ar":
            def tokenize_src(text):
                return [tok for tok in src_tok.tokenize(text)]
        else:
            def tokenize_src(text):
                return [tok.text for tok in src_tok.tokenizer(text)]
        src_tokenizer = tokenize_src

    trg_tokenizer = None
    if args.trg_tok is not None:
        trg_tok = tokenizers[args.trg_tok]
        if args.trg_tok == "ar":
            def tokenize_trg(text):
                return [tok for tok in trg_tok.tokenize(text)]
        else:
            def tokenize_trg(text):
                return [tok.text for tok in trg_tok.tokenizer(text)]
        trg_tokenizer = tokenize_trg

    if args.task == "translation":
        indices = prep_trans_files(
            args.src_file,
            args.trg_file,
            args.save_path,
            src_tok=src_tokenizer,
            trg_tok=trg_tokenizer,
            max_len=args.max_len,
            min_len=args.min_len,
        )
    elif args.task == "tagging":
        indices = prep_tag_files(
            args.src_file,
            args.save_path,
            src_tok=src_tokenizer,
            max_len=args.max_len,
            min_len=args.min_len,
        )

    train, indices = train_test_split(indices, test_size=0.3, random_state=42)
    valid, test = train_test_split(indices, test_size=0.5, random_state=42)

    split_to_tsv("train", train, args.save_path)
    split_to_tsv("test", test, args.save_path)
    split_to_tsv("valid", valid, args.save_path)

    # delete temporary files
    os.remove(os.path.join(args.save_path, "temp_src.txt"))
    os.remove(os.path.join(args.save_path, "temp_trg.txt"))
from spacy.lang.en import English
from spacy.lang.es import Spanish
from spacy.lang.fr import French
from spacy.lang.zh import Chinese
from spacy.lang.ru import Russian
from spacy.lang.ar import Arabic
from spacy.lang.de import German
from spacy.lang.uk import Ukrainian
from spacy.lang.ro import Romanian

lang_id_to_spacy = {
    'en': English(),
    'es': Spanish(),
    'fr': French(),
    'zh-cn': Chinese(),
    'ru': Russian(),
    'ar': Arabic(),
    'de': German(),
    'uk': Ukrainian(),
    'ro': Romanian()
}

#####################
### Globals
#####################
reddit = Reddit(client_id='OFsSWAsbFrzLpg',
                client_secret='tRReu7VAAyxgEXbGqaE19_OUrR4',
                password='******',
                user_agent='testscript by /u/pocaguirre',
                username='******')
class SpacyRulesRussianTokenizer():
    """
    Tokenizer based on https://github.com/aatimofeev/spacy_russian_tokenizer.git

    The tokenizer is built on spaCy and uses spaCy's standard tokenization pipeline.
    You can read more about it here:
    * https://spacy.io/usage/linguistic-features#section-tokenization
    * https://spacy.io/usage/rule-based-matching

    Installation instructions:
    1) pip install spacy
    2) pip install git+https://github.com/aatimofeev/spacy_russian_tokenizer.git
    """

    def __init__(
            self,
            regexp_suffixes=BASE_SUFFIXES_REGEXPS,
            regexp_prefixes=BASE_PREFIXES_REGEXPS,
            regexp_infixes=BASE_INFIXES_REGEXPS,
            regexp_base_token_matches=BASE_TOKEN_MATCH,
            merge_patterns=tuple(MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
            terminal_patterns=tuple(NO_TERMINAL_PATTERNS),
    ):
        """
        Parameters
        ----------
        regexp_suffixes : list of dict
            Dict in spacy format. See above for explanation of spacy format.
        regexp_prefixes : list of dict
            Dict in spacy format.
        regexp_infixes : list of dict
            Dict in spacy format.
        regexp_base_token_matches : list of dict
            Dict in spacy format.
        merge_patterns : list of dict
            Dict in spacy format.
        terminal_patterns : list of dict
            Dict in spacy format.
        """
        merge_patterns = list(merge_patterns)
        terminal_patterns = list(terminal_patterns)

        self.nlp_pipeline = Russian()
        self.nlp_pipeline.tokenizer = self.create_custom_pretokenizer(
            nlp_model=self.nlp_pipeline,
            prefix_regexp=regexp_prefixes,
            suffix_regexp=regexp_suffixes,
            infix_regexp=regexp_infixes,
            token_match_regexp=regexp_base_token_matches,
        )

        self.tokenizer_postprocesser = RussianTokenizer(
            self.nlp_pipeline,
            merge_patterns=merge_patterns,
            terminal_patterns=terminal_patterns)

        self.nlp_pipeline.add_pipe(self.tokenizer_postprocesser,
                                   name='russian_tokenizer_postprocesser')

    @staticmethod
    def create_custom_pretokenizer(nlp_model, prefix_regexp, suffix_regexp,
                                   infix_regexp, token_match_regexp):
        custom_pretokenizer = SpacyBaseTokenizer(
            nlp_model.vocab,
            prefix_search=prefix_regexp.search,
            suffix_search=suffix_regexp.search,
            infix_finditer=infix_regexp.finditer,
            token_match=token_match_regexp.match,
        )
        return custom_pretokenizer

    def transform_element(self, element):
        """
        Get the tokenization variant of the element.

        Parameters
        ----------
        element : str
            String, supposed to be a sentence, one document or something analogous.

        Returns
        -------
        tokens_array : list of str
            Tokenized string.
        """
        if not isinstance(element, str):
            raise TypeError(
                f"Cannot tokenize {type(element)} instead of {type('')}!")
        tokens_array = [token.text for token in self.nlp_pipeline(element)]
        return tokens_array

    def transform(self, elements_collection):
        """
        Apply the transformer to a collection of elements (objects).

        Parameters
        ----------
        elements_collection : iterable of optional
            Collection of objects to be transformed.

        Returns
        -------
        transformed_elements : list of optional
            Collection of transformed objects.
        """
        transformed_elements = [
            self.transform_element(element)
            for element in elements_collection
        ]
        return transformed_elements
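# Usage sketch (illustration only), mirroring the class docstring above: assumes the
# default regexps and patterns imported from spacy_russian_tokenizer are available, then
# tokenizes one string with transform_element() or a whole collection with transform().
tokenizer = SpacyRulesRussianTokenizer()
print(tokenizer.transform_element('Не ветер, а какой-то ураган!'))
print(tokenizer.transform(['Первый текст.', 'Второй текст.']))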
def __init__(self):
    self.rus_word_tokenizer = Russian()
    pipe = RussianTokenizer(self.rus_word_tokenizer,
                            MERGE_PATTERNS + SYNTAGRUS_RARE_CASES)
    self.rus_word_tokenizer.add_pipe(pipe, name='russian_tokenizer')
def tokenizer(inp):
    nlp = Russian()
    russian_tokenizer = RussianTokenizer(nlp, MERGE_PATTERNS)
    nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')
    return nlp(inp)
def lemmatize(self, token, pos_tag):
    nlp = Russian()
    docs = iter(nlp(token))
    return next(docs).lemma_