def count_simple_stats():
    """Print corpus totals for the articles stored in ``data/articles.json``.

    The first line of the file is parsed as JSON. For every key, the first
    element of its value is taken as the article text. Four totals are
    accumulated and printed: texts, sentences (NLTK, Russian), tokens (spaCy
    ``Russian`` pipeline with the ``RussianTokenizer`` pipe) and symbols
    other than spaces and newlines.
    """
    with open('data/articles.json', 'r', encoding='utf8') as source:
        first_line = source.readlines()[0]
    corpus = json.loads(first_line)

    pipeline = Russian()
    pipeline.add_pipe(RussianTokenizer(pipeline, MERGE_PATTERNS),
                      name='russian_tokenizer')

    n_texts = n_sentences = n_words = n_symbols = 0
    for key in corpus:
        body = corpus[key][0].strip()
        n_texts += 1
        n_sentences += len(nltk.sent_tokenize(body, language="russian"))
        n_words += len(pipeline(body))
        # Count every character except spaces and newlines.
        n_symbols += sum(1 for char in body if char not in (' ', '\n'))

    print("Texts count:", n_texts)
    print("Sentences count:", n_sentences)
    print("Words count:", n_words)
    print("Symbols count:", n_symbols)
def add_to_index(self, document, doc_id):
    """Add the lemma frequencies of one document to ``self.global_index``.

    ``document['data']`` is split into ``\\w+`` tokens, lower-cased, re-joined
    with spaces and run through a spaCy ``Russian`` pipeline. Every lemma is
    counted and ``(doc_id, count)`` is appended to the posting list stored
    under that lemma in ``self.global_index``.

    Documents with more than ``10e5`` tokens are skipped entirely (nothing is
    indexed and ``self.doc_iter`` is not advanced).

    Parameters
    ----------
    document : mapping
        Must provide the raw text under the ``'data'`` key.
    doc_id
        Identifier stored in the posting lists.
    """
    text = document['data']
    tokens = [token.lower() for token in RegexpTokenizer(r'\w+').tokenize(text)]

    # Bail out before building the joined text or a spaCy pipeline: both are
    # wasted work for a document that is going to be skipped anyway.
    if len(tokens) > 10e5:
        return

    self.doc_iter += 1

    nlp = Russian()
    # Raise spaCy's input length limit so large documents are accepted.
    nlp.max_length = 10e7
    doc_text = nlp(' '.join(tokens), disable=['ner', 'parser'])

    freq = FreqDist(item.lemma_ for item in doc_text)
    for lemma, count in freq.most_common():
        self.global_index.setdefault(lemma, []).append((doc_id, count))
class SpacyTokenizer:
    """Word tokenizer backed by a spaCy ``Russian`` pipeline."""

    def __init__(self):
        """Create the pipeline and register the ``RussianTokenizer`` pipe."""
        pipeline = Russian()
        merge_pipe = RussianTokenizer(pipeline, MERGE_PATTERNS)
        pipeline.add_pipe(merge_pipe, name="russian_tokenizer")
        self.nlp = pipeline

    def tokenize(self, text):
        """Return the token texts of ``text``, dropping whitespace-only tokens."""
        pieces = []
        for token in self.nlp(text):
            if token.text.strip():
                pieces.append(token.text)
        return pieces
def __init__(self):
    """Build a spaCy ``Russian`` pipeline with the ``RussianTokenizer`` pipe.

    The pipe is configured with ``MERGE_PATTERNS + SYNTAGRUS_RARE_CASES`` and
    registered under the name ``'russian_tokenizer'``. Imports are local so
    the dependencies are only needed when an instance is created.
    """
    # The module path was missing from this import (a syntax error);
    # spaCy ships the Russian language class in ``spacy.lang.ru``.
    from spacy.lang.ru import Russian
    from spacy_russian_tokenizer import (
        RussianTokenizer,
        MERGE_PATTERNS,
        SYNTAGRUS_RARE_CASES,
    )

    self.nlp = Russian()
    self.nlp.add_pipe(
        RussianTokenizer(self.nlp, MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
        name='russian_tokenizer',
    )
def load_spacy_model(model):
    """Return a spaCy pipeline for ``model``.

    ``"ru"`` is special-cased: a blank ``Russian`` pipeline is constructed
    instead of loading an installed model package. Any other name is passed
    to ``spacy.load``; if that fails with ``OSError`` the installed package
    is force-linked under the same name and the load is retried once.

    Parameters
    ----------
    model : str
        ``"ru"`` or the name of an installed spaCy model.

    Raises
    ------
    OSError
        If the model cannot be obtained.
    """
    if model == "ru":
        try:
            # The module path was missing from this import (a syntax error);
            # spaCy ships the Russian language class in ``spacy.lang.ru``.
            from spacy.lang.ru import Russian
            # Earlier experiments with stanfordnlp / stanza based pipelines
            # were left here commented out; they are not used.
            return Russian()
        # NOTE(review): a missing package raises ImportError, which this
        # handler does not catch - confirm whether that is intended.
        except OSError as e:
            raise OSError(
                "Failed to get spaCy Russian model. Install it using "
                "pip install git+"
            ) from e

    import spacy
    try:
        return spacy.load(model)
    except OSError:
        # Workaround: force-link the installed package under the model name,
        # then retry the load once.
        from spacy.cli import link
        from spacy.util import get_package_path
        link(model, model, force=True, model_path=get_package_path(model))
        try:
            return spacy.load(model)
        except OSError as e:
            raise OSError(
                "Failed to get spaCy model. Download it manually using "
                "`python -m spacy download %s`." % model) from e
def __init__(self):
    """Set up stop words, the spaCy parser and the k-NN classification pipeline."""
    super(ClassifierKNN, self).__init__()

    # Plain state containers and the decision threshold.
    self.data_sets = []
    self.texts = {}
    self.options = {}
    self.threshold = 0.3

    # Stop words: union of the ``stop_words`` package list and NLTK's list.
    self.russian_stop_words = stop_words.get_stop_words('russian')
    self.parser = Russian()
    nltk_stop_words = stopwords.words('russian')
    self.stop_list = set(nltk_stop_words).union(self.russian_stop_words)

    # Symbols to ignore: every ASCII punctuation character plus a few extras.
    extra_symbols = ['-----', '---', '...', '“', '”', '\'ve']
    self.escape_symbols = list(string.punctuation) + extra_symbols

    # Bag-of-words vectorizer (unigrams) using this object's own tokenizer.
    self.vectorizer = CountVectorizer(
        tokenizer=self.tokenizeText,
        ngram_range=(1, 1),
    )
    # k-NN classifier (a 'mahalanobis' metric was considered but is not set).
    self.clf = KNeighborsClassifier(
        n_neighbors=20,
        weights='uniform',
        algorithm='auto',
    )

    # Clean -> vectorize -> classify.
    steps = [
        ('cleanText', ClassifierKNN.CleanTextTransformer()),
        ('vectorizer', self.vectorizer),
        ('clf', self.clf),
    ]
    self.pipe = Pipeline(steps)
def load_spacy_model(model):
    """Return a spaCy pipeline for ``model``.

    ``"ru"`` is special-cased: a blank ``Russian`` pipeline is constructed
    instead of loading an installed model package. Any other name is passed
    to ``spacy.load``; if that fails with ``OSError`` the installed package
    is force-linked under the same name and the load is retried once.

    Parameters
    ----------
    model : str
        ``"ru"`` or the name of an installed spaCy model.

    Raises
    ------
    OSError
        If the model cannot be obtained.
    """
    if model == "ru":
        try:
            # The module path was missing from this import (a syntax error);
            # spaCy ships the Russian language class in ``spacy.lang.ru``.
            from spacy.lang.ru import Russian
            return Russian()
        # NOTE(review): a missing package raises ImportError, which this
        # handler does not catch - confirm whether that is intended.
        except OSError as e:
            raise OSError(
                "Failed to get spaCy Russian model. Install it using "
                "pip install git+"
            ) from e

    import spacy
    try:
        return spacy.load(model)
    except OSError:
        # Workaround: force-link the installed package under the model name,
        # then retry the load once.
        from spacy.cli import link
        from spacy.util import get_package_path
        link(model, model, force=True, model_path=get_package_path(model))
        try:
            return spacy.load(model)
        except OSError as e:
            raise OSError(
                "Failed to get spaCy model. Download it manually using "
                "`python -m spacy download %s`." % model) from e
def __init__(self):
    """Set up stop words, the spaCy parser and the naive Bayes pipeline."""
    super(ClassifierSpacy, self).__init__()

    # Plain state containers and the decision threshold.
    self.data_sets = []
    self.texts = {}
    self.options = {}
    self.threshold = 0.3

    # Stop words: union of the ``stop_words`` package list and NLTK's list.
    self.russian_stop_words = stop_words.get_stop_words('russian')
    self.parser = Russian()
    nltk_stop_words = stopwords.words('russian')
    self.stop_list = set(nltk_stop_words).union(self.russian_stop_words)

    # Symbols to ignore: every ASCII punctuation character plus a few extras.
    extra_symbols = ['-----', '---', '...', '“', '”', '\'ve']
    self.escape_symbols = list(string.punctuation) + extra_symbols

    # Bag-of-words vectorizer (unigrams) using this object's own tokenizer.
    self.vectorizer = CountVectorizer(
        tokenizer=self.tokenizeText,
        ngram_range=(1, 1),
    )
    # Multinomial naive Bayes (LinearSVC / SVC were alternatives tried before).
    self.clf = MultinomialNB()

    # Clean -> vectorize -> classify.
    steps = [
        ('cleanText', ClassifierSpacy.CleanTextTransformer()),
        ('vectorizer', self.vectorizer),
        ('clf', self.clf),
    ]
    self.pipe = Pipeline(steps)
def tokenize():
    """Tokenize every article in ``data/articles.json`` and tally basic counts.

    The first line of the file is parsed as JSON. For every key, the first
    element of its value is taken as the article text, which is split into
    sentences (NLTK, Russian) and tokens (spaCy ``Russian`` pipeline with the
    ``RussianTokenizer`` pipe).

    NOTE(review): the totals are accumulated but neither returned nor
    printed, so the function currently has no observable result - confirm
    what callers expect.
    """
    with open('data/articles.json', 'r', encoding='utf8') as f:
        json_str = f.readlines()[0]
    articles = json.loads(json_str)

    nlp = Russian()
    russian_tokenizer = RussianTokenizer(nlp, MERGE_PATTERNS)
    nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')

    # These counters were previously incremented without ever being
    # initialised, which raised UnboundLocalError on the first article.
    texts_count = 0
    sent_count = 0
    words_count = 0
    symbols_count = 0

    for title in articles:
        text = articles[title][0].strip()
        texts_count += 1
        sents = nltk.sent_tokenize(text, language="russian")
        sent_count += len(sents)
        tokens = nlp(text)
        words_count += len(tokens)
        # Count every character except spaces and newlines.
        symbols_count += sum(1 for symb in text if symb != ' ' and symb != '\n')
class TimofeevTokenizer:
    """Tokenizer wrapping a spaCy ``Russian`` pipeline with ``RussianTokenizer``."""

    # Human-readable identifier of the wrapped tokenizer.
    label = 'aatimofeev/spacy_russian_tokenizer'

    def __init__(self):
        """Build the pipeline and register the ``'russian_tokenizer'`` pipe."""
        # The module path was missing from this import (a syntax error);
        # spaCy ships the Russian language class in ``spacy.lang.ru``.
        from spacy.lang.ru import Russian
        from spacy_russian_tokenizer import (
            RussianTokenizer,
            MERGE_PATTERNS,
            SYNTAGRUS_RARE_CASES,
        )

        self.nlp = Russian()
        self.nlp.add_pipe(
            RussianTokenizer(self.nlp, MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
            name='russian_tokenizer',
        )

    def __call__(self, text):
        """Tokenize ``text`` and pass the token texts to ``find_substrings``.

        Returns whatever ``find_substrings(chunks, text)`` returns, where
        ``chunks`` lazily yields the text of each token.
        """
        doc = self.nlp(text)
        chunks = (token.text for token in doc)
        return find_substrings(chunks, text)
def text_decomposition(text, lang='de'):
    """Split ``text`` into sentences with a spaCy pipeline for ``lang``.

    ``'de'`` and ``'en'`` load the medium-sized pretrained models; ``'ru'``
    uses a blank ``Russian`` pipeline with a ``sentencizer`` pipe added.
    For any other language a message is printed and ``None`` is returned.

    Returns a list with the text of every sentence.
    """
    if lang == 'de':
        pipeline = spacy.load('de_core_news_md')
    elif lang == 'en':
        pipeline = spacy.load("en_core_web_md")
    elif lang == 'ru':
        pipeline = Russian()
        pipeline.add_pipe(pipeline.create_pipe("sentencizer"))
    else:
        print("Unsupported language. Choose from ['en', 'de', 'ru']")
        return

    return [sentence.text for sentence in pipeline(text).sents]
def __init__(
        self,
        regexp_suffixes=BASE_SUFFIXES_REGEXPS,
        regexp_prefixes=BASE_PREFIXES_REGEXPS,
        regexp_infixes=BASE_INFIXES_REGEXPS,
        regexp_base_token_matches=BASE_TOKEN_MATCH,
        merge_patterns=tuple(MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
        terminal_patterns=tuple(NO_TERMINAL_PATTERNS),
):
    """
    Assemble the tokenization pipeline.

    A ``Russian`` pipeline is created, its tokenizer is replaced by the
    one built by ``create_custom_pretokenizer`` and a ``RussianTokenizer``
    pipe is appended under the name ``'russian_tokenizer_postprocesser'``.

    Parameters
    ----------
    regexp_suffixes
        Forwarded to ``create_custom_pretokenizer`` as ``suffix_regexp``.
    regexp_prefixes
        Forwarded to ``create_custom_pretokenizer`` as ``prefix_regexp``.
    regexp_infixes
        Forwarded to ``create_custom_pretokenizer`` as ``infix_regexp``.
    regexp_base_token_matches
        Forwarded to ``create_custom_pretokenizer`` as
        ``token_match_regexp``.
    merge_patterns : iterable
        Copied into a list and given to ``RussianTokenizer``.
    terminal_patterns : iterable
        Copied into a list and given to ``RussianTokenizer``.
    """
    self.nlp_pipeline = Russian()

    # Replace the default tokenizer with the rule-based pre-tokenizer.
    pretokenizer = self.create_custom_pretokenizer(
        nlp_model=self.nlp_pipeline,
        prefix_regexp=regexp_prefixes,
        suffix_regexp=regexp_suffixes,
        infix_regexp=regexp_infixes,
        token_match_regexp=regexp_base_token_matches,
    )
    self.nlp_pipeline.tokenizer = pretokenizer

    # The defaults are tuples; the post-processor receives list copies.
    self.tokenizer_postprocesser = RussianTokenizer(
        self.nlp_pipeline,
        merge_patterns=list(merge_patterns),
        terminal_patterns=list(terminal_patterns),
    )
    self.nlp_pipeline.add_pipe(
        self.tokenizer_postprocesser,
        name='russian_tokenizer_postprocesser',
    )
def spacy_tokenize(text):
    """Tokenize ``text`` with a blank spaCy ``Russian`` pipeline.

    The pipeline is created on first use and cached in the module-level
    ``NLP``. The token texts are passed to ``find_substrings`` together
    with the original text, and its result is returned.
    """
    # The module path was missing from this import (a syntax error);
    # spaCy ships the Russian language class in ``spacy.lang.ru``.
    from spacy.lang.ru import Russian

    global NLP
    if not NLP:
        NLP = Russian()
    doc = NLP(text)
    chunks = [token.text for token in doc]
    return find_substrings(chunks, text)
def spacy_tokenize2(text):
    """Tokenize ``text`` with spaCy ``Russian`` plus the ``RussianTokenizer`` pipe.

    The pipeline (configured with ``MERGE_PATTERNS + SYNTAGRUS_RARE_CASES``)
    is created on first use and cached in the module-level ``NLP2``. The
    token texts are passed to ``find_substrings`` together with the original
    text, and its result is returned.
    """
    # The module path was missing from this import (a syntax error);
    # spaCy ships the Russian language class in ``spacy.lang.ru``.
    from spacy.lang.ru import Russian
    from spacy_russian_tokenizer import (
        RussianTokenizer,
        MERGE_PATTERNS,
        SYNTAGRUS_RARE_CASES
    )

    global NLP2
    if not NLP2:
        NLP2 = Russian()
        NLP2.add_pipe(
            RussianTokenizer(NLP2, MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
            name='russian_tokenizer'
        )
    doc = NLP2(text)
    chunks = [token.text for token in doc]
    return find_substrings(chunks, text)
class RusWordTokenizer(PreProcesser):
    """Pre-processing step that fills in the word tokens of sentences."""

    def __init__(self):
        """Create the spaCy ``Russian`` pipeline with the ``RussianTokenizer`` pipe."""
        pipeline = Russian()
        patterns = MERGE_PATTERNS + SYNTAGRUS_RARE_CASES
        pipeline.add_pipe(RussianTokenizer(pipeline, patterns),
                          name='russian_tokenizer')
        self.rus_word_tokenizer = pipeline

    def transform_text(self, text):
        """Return a ``Token(token_id, text)`` per spaCy token, ids starting at 1."""
        result = []
        for position, spacy_token in enumerate(self.rus_word_tokenizer(text), 1):
            result.append(Token(position, spacy_token.text))
        return result

    def transform_sent(self, sent):
        """Return a copy of ``sent`` whose ``tokens`` are rebuilt from ``sent.text``."""
        tokenized = sent.copy()
        tokenized.tokens = self.transform_text(tokenized.text)
        return tokenized

    def transform_item(self, x):
        """Apply ``transform_sent`` to every sentence in ``x``; return a list."""
        return list(map(self.transform_sent, x))
def spacy_sentence_scores(self) -> Dict[str, float]:
    """Score every sentence of ``self.text`` by the frequency of its words.

    Each token is normalized with ``MORPH``; prepositions, conjunctions,
    particles and interjections are ignored. Normalized-word counts are
    collected over tokens whose surface form is not in ``STOP_WORDS`` and
    are scaled by the largest count. A sentence's score is the sum of the
    scaled frequencies of its normalized words.

    Returns
    -------
    dict
        Maps each scored sentence to its score. Sentences with no counted
        word are absent. An empty dict is returned when the text contains
        no countable word at all.

        NOTE(review): the keys are the sentence objects yielded by
        ``docx.sents``, not plain strings as the annotation suggests.
    """
    nlp = Russian()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    docx = nlp(self.text)

    skipped_tags = ('PREP', 'CONJ', 'PRCL', 'INTJ')

    def content_word(surface):
        """Return the normalized word for ``surface``, or None for function words."""
        normalized = MORPH.parse(surface)[0].normalized
        if any(tag in normalized.tag for tag in skipped_tags):
            return None
        return normalized.word

    # A set gives constant-time membership tests (this used to be a list).
    stopwords = frozenset(STOP_WORDS)

    word_frequencies = {}
    for token in docx:
        if token.text in stopwords:
            continue
        lemma = content_word(token.text)
        if lemma is not None:
            word_frequencies[lemma] = word_frequencies.get(lemma, 0) + 1

    # max() raises ValueError on an empty collection; nothing to score then.
    if not word_frequencies:
        return {}

    maximum_frequency = max(word_frequencies.values())
    for lemma in word_frequencies:
        word_frequencies[lemma] = word_frequencies[lemma] / maximum_frequency

    sentence_scores = {}
    for sent in docx.sents:
        for token in sent:
            lemma = content_word(token.text)
            if lemma in word_frequencies:
                sentence_scores[sent] = (
                    sentence_scores.get(sent, 0) + word_frequencies[lemma]
                )
    return sentence_scores
def main():
    """Load ``test_ru.json``, print it, then print each spaCy token of it.

    The parsed JSON is converted with ``str()`` and tokenized by a blank
    ``Russian`` pipeline. The user is asked to press ENTER after the raw
    text and again after the token listing. If the file cannot be opened
    the error is printed and the function returns.
    """
    path = 'test_ru.json'
    try:
        with open(path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)
    except IOError as error:
        print(error)
        return

    rendered = str(payload)
    print(rendered)
    input("Press ENTER for continue...")

    pipeline = Russian()
    for token in pipeline(rendered):
        print(token.text)
    input("Press ENTER for continue...")
def get_tokenizer(lang):
    """Return a blank spaCy language object for the language code ``lang``.

    Supported codes are ``"zh"``, ``"en"``, ``"cs"``, ``"de"`` and ``"ru"``.

    Raises
    ------
    Exception
        If ``lang`` is not one of the supported codes.
    """
    # Blank language classes are used rather than pretrained models.
    if lang == "zh":
        return Chinese()
    if lang == "en":
        return English()
    if lang == "cs":
        return Czech()
    if lang == "de":
        return German()
    if lang == "ru":
        return Russian()
    raise Exception("Unacceptable language.")
def make_document_tf(self, document):
    """Build the term-frequency structure for the top lemmas of a document.

    The text extracted from ``document['data']`` by ``HTMLParser`` is split
    into ``\\w+`` tokens, re-joined with spaces and run through a spaCy
    ``Russian`` pipeline. Lemmas that are NLTK Russian or English stop words
    are dropped. The ten most common remaining lemmas are printed and fed,
    one by one, to ``self.update_tf`` together with ``document['url']``.

    Returns the value produced by the last ``self.update_tf`` call, or an
    empty dict when there are no lemmas.
    """
    tf = {}
    parser = HTMLParser(text=document['data'])
    text = parser.get_text()

    nlp = Russian()
    tokens = RegexpTokenizer(r'\w+').tokenize(text)
    doc_text = nlp(' '.join(tokens))

    # Build the stop-word set once; it used to be rebuilt for every token.
    excluded = set(stopwords.words('russian')) | set(stopwords.words('english'))
    lemmas = [s.lemma_ for s in doc_text if s.lemma_ not in excluded]

    freq = FreqDist(lemmas)
    top_terms = freq.most_common(10)
    print(top_terms)
    # TODO most_common -> all
    for k, v in top_terms:
        tf = self.update_tf(tf, k, document['url'], v)
    return tf
import os import sys import argparse import codecs import json from tqdm import tqdm from import Russian from spacy.lang.en import English from dataset_utils.utils import save_output from dataset_utils.global_vars import TEXT_FIELDS TOKENIZERS = {'Russian': Russian().tokenizer, 'English': English().tokenizer} def main(input_dir: str, output_dir: str, language: str): tasks = [task for task in os.listdir(input_dir) if task in TEXT_FIELDS] [ preprocess_task(input_dir, output_dir, t, TOKENIZERS[language]) for t in tqdm(tasks) ] def preprocess_task(input_dir: str, output_dir, task: str, preproc_fn): """ replaces raw texts with preprocessed ones """ if not os.path.isdir(output_dir + task): # create directories for preprocessed tasks os.makedirs(output_dir + task) samples = [
#!/usr/bin/env python3 from __future__ import unicode_literals import sys import import re from import Russian import spacy corpora_path = 'raw_corpora.txt' global_word_count = {} texts_word_count = [] nlp = Russian() tokenizer = nlp.Defaults.create_tokenizer() with open(corpora_path, 'r') as f: for cnt, line in enumerate(f): line = line.strip() if line == '/***/': texts_word_count.append({}) continue doc = nlp(line) for word in doc: if re.match('[\w]+', word.text, re.I) == None: continue if not word.text in global_word_count: global_word_count[word.text] = 1 else: global_word_count[word.text] += 1 if not word.text in texts_word_count[-1]: texts_word_count[-1][word.text] = 1
def __init__(self):
    """Create the spaCy ``Russian`` pipeline and add the ``RussianTokenizer`` pipe."""
    pipeline = Russian()
    merge_pipe = RussianTokenizer(pipeline, MERGE_PATTERNS)
    pipeline.add_pipe(merge_pipe, name="russian_tokenizer")
    self.nlp = pipeline
import nltk from nltk.stem.snowball import SnowballStemmer from nltk.corpus import stopwords import stanza from spacy_stanza import StanzaLanguage"stopwords")'ru') # will take a while #russian_stopwords = stopwords.words("russian") russian_stopwords = # ================================================== EXAMPLE =================================================== text = "Не ветер, а какой-то ураган!" nlp = Russian() doc = nlp(text) russian_tokenizer = RussianTokenizer(nlp, MERGE_PATTERNS) nlp.add_pipe(russian_tokenizer, name='russian_tokenizer') doc = nlp(text) print([token.text for token in doc]) # ============================================================================================================= toxic_rus = pd.read_csv("./__DATA/NLP_Datasets/toxic_russian.csv") toxic_rus.head() # Removing punctuation toxic_rus['comment'] = toxic_rus['comment'].str.replace(r'[.,!?<>-]', '') toxic_rus['comment'] = toxic_rus['comment'].str.replace("\n"," ")
from spacy.lang.ja import Japanese from import Catalan from import Basque from DataHandler import load_df_twitter_sent, load_df_lorelei from util import clean_str as test_clean_str from nltk.corpus import stopwords from util import identity_fn, lang2id language_dict = { 'english': English(), 'spanish': Spanish(), 'french': French(), 'italian': Italian(), 'german': German(), 'russian': Russian(), 'chinese': Chinese(), 'japanese': Japanese(), 'catalan': Catalan(), 'basque': Basque(), } class Tokenizer: def __init__(self, language, tokenizer_method='spacy', remove_stopwords=True, lowercase=True, strip_accents=None, ngram_range=(1, 1),
def main(args):
    """Tokenize the input files and split the examples into train/valid/test.

    Attributes read from ``args``: ``save_path``, ``src_tok``, ``trg_tok``,
    ``task``, ``src_file``, ``trg_file``, ``max_len`` and ``min_len``.

    The examples are split 70% train, 15% valid and 15% test with a fixed
    random state, written with ``split_to_tsv``, and the temporary
    ``temp_src.txt`` / ``temp_trg.txt`` files in ``save_path`` are removed.

    Raises
    ------
    ValueError
        If ``args.task`` is neither ``"translation"`` nor ``"tagging"``.
    """
    if not os.path.exists(args.save_path):
        os.mkdir(args.save_path)

    tokenizers = {
        "en": spacy.load("en_core_web_sm"),
        "zh": spacy.load("zh_core_web_sm"),
        "ru": Russian(),
        "fr": spacy.load("fr_core_news_sm"),
        "es": spacy.load("es_core_news_sm"),
        "ar": WordTokenizer("arabic"),
    }

    src_tokenizer = None
    if args.src_tok is not None:
        src_tok = tokenizers[args.src_tok]
        if args.src_tok == "ar":
            # The Arabic tokenizer exposes ``tokenize`` and yields the tokens
            # directly, unlike the spaCy pipelines.
            def tokenize_src(text):
                return [tok for tok in src_tok.tokenize(text)]
        else:
            def tokenize_src(text):
                return [tok.text for tok in src_tok.tokenizer(text)]
        src_tokenizer = tokenize_src

    trg_tokenizer = None
    if args.trg_tok is not None:
        trg_tok = tokenizers[args.trg_tok]
        if args.trg_tok == "ar":
            def tokenize_trg(text):
                return [tok for tok in trg_tok.tokenize(text)]
        else:
            def tokenize_trg(text):
                # Previously referenced the undefined name ``tokz``, which
                # raised NameError for every non-Arabic target language.
                return [tok.text for tok in trg_tok.tokenizer(text)]
        trg_tokenizer = tokenize_trg

    if args.task == "translation":
        indices = prep_trans_files(
            args.src_file,
            args.trg_file,
            args.save_path,
            src_tok=src_tokenizer,
            trg_tok=trg_tokenizer,
            max_len=args.max_len,
            min_len=args.min_len,
        )
    elif args.task == "tagging":
        indices = prep_tag_files(
            args.src_file,
            args.save_path,
            src_tok=src_tokenizer,
            max_len=args.max_len,
            min_len=args.min_len,
        )
    else:
        # Without this, an unknown task surfaced later as an
        # UnboundLocalError on ``indices``.
        raise ValueError("Unsupported task: %r" % (args.task,))

    # 70% train; the remaining 30% is halved into valid and test.
    train, holdout = train_test_split(indices, test_size=0.3, random_state=42)
    valid, test = train_test_split(holdout, test_size=0.5, random_state=42)

    split_to_tsv("train", train, args.save_path)
    split_to_tsv("test", test, args.save_path)
    split_to_tsv("valid", valid, args.save_path)

    # delete temporary files
    os.remove(os.path.join(args.save_path, "temp_src.txt"))
    os.remove(os.path.join(args.save_path, "temp_trg.txt"))
# The module paths of several imports below were missing (a syntax error);
# each class is restored from spaCy's per-language module ``spacy.lang.<code>``.
from spacy.lang.en import English
from spacy.lang.es import Spanish
from spacy.lang.fr import French
from spacy.lang.zh import Chinese
from spacy.lang.ru import Russian
from spacy.lang.ar import Arabic
from spacy.lang.de import German
from spacy.lang.uk import Ukrainian
from spacy.lang.ro import Romanian

# Language id -> blank spaCy language object (all built at import time).
lang_id_to_spacy = {
    'en': English(),
    'es': Spanish(),
    'fr': French(),
    'zh-cn': Chinese(),
    'ru': Russian(),
    'ar': Arabic(),
    'de': German(),
    'uk': Ukrainian(),
    'ro': Romanian()
}

#####################
### Globals
#####################
# NOTE(security): API credentials are hard-coded in source. They should be
# rotated and loaded from the environment or a secrets store instead.
reddit = Reddit(client_id='OFsSWAsbFrzLpg',
                client_secret='tRReu7VAAyxgEXbGqaE19_OUrR4',
                password='******',
                user_agent='testscript by /u/pocaguirre',
                username='******')
class SpacyRulesRussianTokenizer():
    """
    Rule-based Russian tokenizer built on spaCy's standard tokenization
    pipeline.

    A spaCy ``Russian`` pipeline is given a custom regex-driven tokenizer,
    and a ``RussianTokenizer`` pipe post-processes its output using merge
    and terminal patterns.

    Installation instruction:
    1) pip install spacy
    2) install the package that provides ``RussianTokenizer``
    """

    def __init__(
            self,
            regexp_suffixes=BASE_SUFFIXES_REGEXPS,
            regexp_prefixes=BASE_PREFIXES_REGEXPS,
            regexp_infixes=BASE_INFIXES_REGEXPS,
            regexp_base_token_matches=BASE_TOKEN_MATCH,
            merge_patterns=tuple(MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
            terminal_patterns=tuple(NO_TERMINAL_PATTERNS),
    ):
        """
        Parameters
        ----------
        regexp_suffixes : compiled pattern
            Object whose ``search`` method detects token suffixes.
        regexp_prefixes : compiled pattern
            Object whose ``search`` method detects token prefixes.
        regexp_infixes : compiled pattern
            Object whose ``finditer`` method detects infixes.
        regexp_base_token_matches : compiled pattern
            Object whose ``match`` method is used as the tokenizer's
            ``token_match``.
        merge_patterns : iterable
            Patterns given to ``RussianTokenizer`` (copied into a list).
        terminal_patterns : iterable
            Patterns given to ``RussianTokenizer`` (copied into a list).
        """
        # The defaults are tuples; work on list copies.
        merge_patterns = list(merge_patterns)
        terminal_patterns = list(terminal_patterns)

        self.nlp_pipeline = Russian()
        self.nlp_pipeline.tokenizer = self.create_custom_pretokenizer(
            nlp_model=self.nlp_pipeline,
            prefix_regexp=regexp_prefixes,
            suffix_regexp=regexp_suffixes,
            infix_regexp=regexp_infixes,
            token_match_regexp=regexp_base_token_matches,
        )
        self.tokenizer_postprocesser = RussianTokenizer(
            self.nlp_pipeline,
            merge_patterns=merge_patterns,
            terminal_patterns=terminal_patterns)
        self.nlp_pipeline.add_pipe(self.tokenizer_postprocesser,
                                   name='russian_tokenizer_postprocesser')

    @staticmethod
    def create_custom_pretokenizer(nlp_model, prefix_regexp, suffix_regexp,
                                   infix_regexp, token_match_regexp):
        """
        Build the tokenizer that replaces the pipeline's default one.

        Parameters
        ----------
        nlp_model : spaCy language object
            Its ``vocab`` is shared with the new tokenizer.
        prefix_regexp, suffix_regexp, infix_regexp, token_match_regexp
            Compiled patterns; their ``search`` / ``search`` / ``finditer``
            / ``match`` methods are handed to the tokenizer.

        Returns
        -------
        custom_pretokenizer : SpacyBaseTokenizer
        """
        # The prefix and suffix arguments were missing from this call
        # (leaving ``vocab,,,`` - a syntax error); they are restored as the
        # tokenizer's ``prefix_search`` / ``suffix_search`` callbacks.
        custom_pretokenizer = SpacyBaseTokenizer(
            nlp_model.vocab,
            prefix_search=prefix_regexp.search,
            suffix_search=suffix_regexp.search,
            infix_finditer=infix_regexp.finditer,
            token_match=token_match_regexp.match,
        )
        return custom_pretokenizer

    def transform_element(self, element):
        """
        Get tokenization variant of the element.

        Parameters
        ----------
        element : str
            String, supposed to be a sentence, one document or something
            analogous.

        Return
        ------
        tokens_array : list of str
            Tokenized string

        Raises
        ------
        TypeError
            If ``element`` is not a ``str``.
        """
        if not isinstance(element, str):
            raise TypeError(
                f"Cannot tokenize {type(element)} instead of {type('')}!")
        tokens_array = [token.text for token in self.nlp_pipeline(element)]
        return tokens_array

    def transform(self, elements_collection):
        """
        Apply transformer to collection of elements (objects).

        Parameters
        ----------
        elements_collection : iterable of optional
            Collection of objects to be transformed.

        Returns
        -------
        transformed_elements : list of optional
            Collection of transformed objects.
        """
        transformed_elements = [
            self.transform_element(element) for element in elements_collection
        ]
        return transformed_elements
def __init__(self):
    """Create the spaCy ``Russian`` pipeline with the ``RussianTokenizer`` pipe."""
    pipeline = Russian()
    patterns = MERGE_PATTERNS + SYNTAGRUS_RARE_CASES
    pipeline.add_pipe(RussianTokenizer(pipeline, patterns),
                      name='russian_tokenizer')
    self.rus_word_tokenizer = pipeline
def tokenizer(inp):
    """Run ``inp`` through a fresh spaCy ``Russian`` pipeline and return the result.

    The pipeline is built on every call and includes the ``RussianTokenizer``
    pipe configured with ``MERGE_PATTERNS``.
    """
    pipeline = Russian()
    pipeline.add_pipe(RussianTokenizer(pipeline, MERGE_PATTERNS),
                      name='russian_tokenizer')
    return pipeline(inp)
def lemmatize(self, token, pos_tag):
    """Return the lemma of the first spaCy token found in ``token``.

    A fresh ``Russian`` pipeline is created on every call. ``pos_tag`` is
    accepted but not used.
    """
    parsed = Russian()(token)
    first_token = next(iter(parsed))
    return first_token.lemma_