def add_to_index(self, document, doc_id):
    # parser = HTMLParser(text=document['data'])
    text = document['data']
    nlp = Russian()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    tokens = [token.lower() for token in tokens]
    tmp_text = ' '.join(tokens)
    # Skip documents that are too long to lemmatize in reasonable time.
    if len(tokens) > 1_000_000:
        return
    self.doc_iter += 1
    nlp.max_length = 100_000_000
    doc_text = nlp(tmp_text, disable=['ner', 'parser'])
    lemmas = []
    for s in doc_text:
        lemma = s.lemma_
        lemmas.append(lemma)
        # if lemma not in set(stopwords.words('russian')) \
        #         and lemma not in set(stopwords.words('english')) \
        #         and len(lemma) > 1:
        #     lemmas.append(lemma)
    freq = FreqDist(lemmas)
    for k, v in freq.most_common():
        if k not in self.global_index:
            self.global_index[k] = []
        self.global_index[k].append((doc_id, v))
def __init__(self):
    super(ClassifierKNN, self).__init__()
    self.data_sets = []
    self.texts = {}
    self.options = {}
    self.threshold = 0.3
    self.russian_stop_words = stop_words.get_stop_words('russian')
    self.parser = Russian()
    self.stop_list = set(stopwords.words('russian') +
                         list(self.russian_stop_words))
    # List of symbols we don't care about
    self.escape_symbols = ' '.join(string.punctuation).split(' ') + \
        ['-----', '---', '...', '“', '”', '\'ve']
    # The vectorizer and classifier to use; the CountVectorizer tokenizer
    # is replaced with a custom function based on spaCy's tokenizer.
    self.vectorizer = CountVectorizer(
        tokenizer=self.tokenizeText,
        ngram_range=(1, 1)
    )
    self.clf = KNeighborsClassifier(
        n_neighbors=20,
        weights='uniform',
        algorithm='auto'  # , metric='mahalanobis'
    )
    # The pipeline to clean, tokenize, vectorize, and classify.
    self.pipe = Pipeline([
        ('cleanText', ClassifierKNN.CleanTextTransformer()),
        ('vectorizer', self.vectorizer),
        ('clf', self.clf)
    ])
def __init__(self):
    super(ClassifierSpacy, self).__init__()
    self.data_sets = []
    self.texts = {}
    self.options = {}
    self.threshold = 0.3
    self.russian_stop_words = stop_words.get_stop_words('russian')
    self.parser = Russian()
    self.stop_list = set(stopwords.words('russian') +
                         list(self.russian_stop_words))
    # List of symbols we don't care about
    self.escape_symbols = ' '.join(string.punctuation).split(' ') + \
        ['-----', '---', '...', '“', '”', '\'ve']
    # The vectorizer and classifier to use; the CountVectorizer tokenizer
    # is replaced with a custom function based on spaCy's tokenizer.
    self.vectorizer = CountVectorizer(
        tokenizer=self.tokenizeText,
        ngram_range=(1, 1)
    )
    self.clf = MultinomialNB()
    # self.clf = LinearSVC()
    # self.clf = SVC(probability=True)
    # The pipeline to clean, tokenize, vectorize, and classify.
    self.pipe = Pipeline([
        ('cleanText', ClassifierSpacy.CleanTextTransformer()),
        ('vectorizer', self.vectorizer),
        ('clf', self.clf)
    ])
def load_spacy_model(model):
    if model == "ru":
        try:
            from spacy.lang.ru import Russian
            return Russian()
            # import stanfordnlp
            # from spacy_stanfordnlp import StanfordNLPLanguage
            # snlp = stanfordnlp.Pipeline(
            #     lang="ru",
            #     models_dir="/cs/labs/oabend/lovodkin93/TUPA1_project/stanfordnlp_resources")
            # return StanfordNLPLanguage(snlp)
            # import stanza
            # return stanza.Pipeline(
            #     lang='ru', processors='tokenize,pos,lemma,depparse,ner',
            #     models_dir="//stanza_resources")
        except OSError as e:
            raise OSError(
                "Failed to get spaCy Russian model. Install it using "
                "pip install git+https://github.com/aatimofeev/spacy_russian_tokenizer.git"
            ) from e
    import spacy
    try:
        return spacy.load(model)
    except OSError:
        spacy.cli.download(model)
        # Workaround from
        # https://github.com/explosion/spaCy/issues/3435#issuecomment-474580269
        from spacy.cli import link
        from spacy.util import get_package_path
        link(model, model, force=True, model_path=get_package_path(model))
        try:
            return spacy.load(model)
        except OSError as e:
            raise OSError(
                "Failed to get spaCy model. Download it manually using "
                "`python -m spacy download %s`." % model) from e
def load_spacy_model(model):
    if model == "ru":
        try:
            from spacy.lang.ru import Russian
            return Russian()
        except OSError as e:
            raise OSError(
                "Failed to get spaCy Russian model. Install it using "
                "pip install git+https://github.com/aatimofeev/spacy_russian_tokenizer.git"
            ) from e
    import spacy
    try:
        return spacy.load(model)
    except OSError:
        spacy.cli.download(model)
        # Workaround from
        # https://github.com/explosion/spaCy/issues/3435#issuecomment-474580269
        from spacy.cli import link
        from spacy.util import get_package_path
        link(model, model, force=True, model_path=get_package_path(model))
        try:
            return spacy.load(model)
        except OSError as e:
            raise OSError(
                "Failed to get spaCy model. Download it manually using "
                "`python -m spacy download %s`." % model) from e
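A minimal usage sketch (not part of the original snippets): the special "ru" code returns a blank Russian() pipeline, while any other name goes through spacy.load with the download-and-link fallback. The model name "en_core_web_sm" below is only an illustrative assumption.

# Hypothetical calls to load_spacy_model as defined above.
nlp_ru = load_spacy_model("ru")               # blank Russian() pipeline
nlp_en = load_spacy_model("en_core_web_sm")   # downloaded/linked if missing
print([t.text for t in nlp_ru("Привет, мир!")])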
def count_simple_stats():
    with open('data/articles.json', 'r', encoding='utf8') as f:
        json_str = f.readlines()[0]
        articles = json.loads(json_str)
    nlp = Russian()
    russian_tokenizer = RussianTokenizer(nlp, MERGE_PATTERNS)
    nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')
    texts_count = 0
    sent_count = 0
    words_count = 0
    symbols_count = 0
    for title in articles:
        text = articles[title][0].strip()
        texts_count += 1
        sents = nltk.sent_tokenize(text, language="russian")
        sent_count += len(sents)
        tokens = nlp(text)
        words_count += len(tokens)
        symbols = [symb for symb in text if symb != ' ' and symb != '\n']
        symbols_count += len(symbols)
        # print([token.text for token in tokens])
    print("Texts count:", texts_count)
    print("Sentences count:", sent_count)
    print("Words count:", words_count)
    print("Symbols count:", symbols_count)
def __init__(self):
    from spacy.lang.ru import Russian
    from spacy_russian_tokenizer import (RussianTokenizer, MERGE_PATTERNS,
                                         SYNTAGRUS_RARE_CASES)

    self.nlp = Russian()
    self.nlp.add_pipe(
        RussianTokenizer(self.nlp, MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
        name='russian_tokenizer')
def spacy_tokenize(text):
    from spacy.lang.ru import Russian

    global NLP
    if not NLP:
        NLP = Russian()
    doc = NLP(text)
    chunks = [token.text for token in doc]
    return find_substrings(chunks, text)
def tokenize():
    with open('data/articles.json', 'r', encoding='utf8') as f:
        json_str = f.readlines()[0]
        articles = json.loads(json_str)
    nlp = Russian()
    russian_tokenizer = RussianTokenizer(nlp, MERGE_PATTERNS)
    nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')
    # Counters must be initialized before the loop.
    texts_count = 0
    sent_count = 0
    words_count = 0
    symbols_count = 0
    for title in articles:
        text = articles[title][0].strip()
        texts_count += 1
        sents = nltk.sent_tokenize(text, language="russian")
        sent_count += len(sents)
        tokens = nlp(text)
        words_count += len(tokens)
        symbols = [symb for symb in text if symb != ' ' and symb != '\n']
        symbols_count += len(symbols)
    return texts_count, sent_count, words_count, symbols_count
def main():
    _fn = 'test_ru.json'
    try:
        with open(_fn, 'r', encoding='utf-8') as _fd:
            _buf = json.load(_fd)
    except IOError as _err:
        print(_err)
        return
    # _text = json.dumps(_buf)
    _text = str(_buf)
    print(_text)
    input("Press ENTER to continue...")

    nlp_obj = Russian()
    _doc = nlp_obj(_text)
    for _token in _doc:
        print(_token.text)
    input("Press ENTER to continue...")
def get_tokenizer(lang):
    if lang == "zh":
        # nlp = spacy.load("zh_core_web_sm")
        nlp = Chinese()
    elif lang == "en":
        # nlp = spacy.load("en_core_web_sm")
        nlp = English()
    elif lang == "cs":
        nlp = Czech()
    elif lang == "de":
        # nlp = spacy.load("de_core_web_sm")
        nlp = German()
    elif lang == "ru":
        nlp = Russian()
    else:
        raise Exception("Unacceptable language.")
    return nlp
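A short usage sketch (illustrative, not from the source): the blank pipelines returned by get_tokenizer only tokenize, so iterating over the resulting Doc is the main thing they are good for. The sample sentence is made up.

# Hypothetical call of get_tokenizer defined above.
nlp = get_tokenizer("ru")
doc = nlp("Мама мыла раму.")
print([token.text for token in doc])  # expected: ['Мама', 'мыла', 'раму', '.']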
def text_decomposition(text, lang='de'):
    if lang == 'de':
        nlp = spacy.load('de_core_news_md')
    elif lang == 'en':
        nlp = spacy.load("en_core_web_md")
    elif lang == 'ru':
        nlp = Russian()
        sentencizer = nlp.create_pipe("sentencizer")
        nlp.add_pipe(sentencizer)
    else:
        print("Unsupported language. Choose from ['en', 'de', 'ru']")
        return

    doc = nlp(text)
    sentences = list()
    for sent in doc.sents:
        sentences.append(sent.text)
    return sentences
def __init__(
        self,
        regexp_suffixes=BASE_SUFFIXES_REGEXPS,
        regexp_prefixes=BASE_PREFIXES_REGEXPS,
        regexp_infixes=BASE_INFIXES_REGEXPS,
        regexp_base_token_matches=BASE_TOKEN_MATCH,
        merge_patterns=tuple(MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
        terminal_patterns=tuple(NO_TERMINAL_PATTERNS),
):
    """
    Parameters
    ----------
    regexp_suffixes : list of dict
        Dicts in spaCy format. See above for an explanation of the format.
    regexp_prefixes : list of dict
        Dicts in spaCy format.
    regexp_infixes : list of dict
        Dicts in spaCy format.
    regexp_base_token_matches : list of dict
        Dicts in spaCy format.
    merge_patterns : list of dict
        Dicts in spaCy format.
    terminal_patterns : list of dict
        Dicts in spaCy format.
    """
    merge_patterns = list(merge_patterns)
    terminal_patterns = list(terminal_patterns)
    self.nlp_pipeline = Russian()
    self.nlp_pipeline.tokenizer = self.create_custom_pretokenizer(
        nlp_model=self.nlp_pipeline,
        prefix_regexp=regexp_prefixes,
        suffix_regexp=regexp_suffixes,
        infix_regexp=regexp_infixes,
        token_match_regexp=regexp_base_token_matches,
    )
    self.tokenizer_postprocesser = RussianTokenizer(
        self.nlp_pipeline,
        merge_patterns=merge_patterns,
        terminal_patterns=terminal_patterns)
    self.nlp_pipeline.add_pipe(self.tokenizer_postprocesser,
                               name='russian_tokenizer_postprocesser')
def spacy_tokenize2(text):
    from spacy.lang.ru import Russian
    from spacy_russian_tokenizer import (
        RussianTokenizer, MERGE_PATTERNS, SYNTAGRUS_RARE_CASES
    )

    global NLP2
    if not NLP2:
        NLP2 = Russian()
        NLP2.add_pipe(
            RussianTokenizer(NLP2, MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
            name='russian_tokenizer'
        )
    doc = NLP2(text)
    chunks = [token.text for token in doc]
    return find_substrings(chunks, text)
def make_document_tf(self, document):
    tf = {}
    parser = HTMLParser(text=document['data'])
    text = parser.get_text()
    nlp = Russian()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    tmp_text = ' '.join(tokens)
    doc_text = nlp(tmp_text)
    lemmas = []
    for s in doc_text:
        if s.lemma_ not in set(stopwords.words('russian')) \
                and s.lemma_ not in set(stopwords.words('english')):
            lemmas.append(s.lemma_)
    freq = FreqDist(lemmas)
    print(freq.most_common(10))
    # TODO: most_common -> all
    for k, v in freq.most_common(10):
        tf = self.update_tf(tf, k, document['url'], v)
    return tf
def spacy_sentence_scores(self) -> Dict[str, float]:
    nlp = Russian()
    sentencizer = nlp.create_pipe('sentencizer')
    nlp.add_pipe(sentencizer)

    raw_text = self.text
    docx = nlp(raw_text)
    stopwords = list(STOP_WORDS)

    word_frequencies = {}
    for word in docx:
        if word.text not in stopwords:
            word = MORPH.parse(word.text)[0].normalized
            if not ('PREP' in word.tag or 'CONJ' in word.tag
                    or 'PRCL' in word.tag or 'INTJ' in word.tag):
                if word.word not in word_frequencies.keys():
                    word_frequencies[word.word] = 1
                else:
                    word_frequencies[word.word] += 1

    maximum_frequency = max(word_frequencies.values())
    for word in word_frequencies.keys():
        word_frequencies[word] = word_frequencies[word] / maximum_frequency

    sentence_list = [sentence for sentence in docx.sents]
    sentence_scores = {}
    for sent in sentence_list:
        for word in sent:
            word = MORPH.parse(word.text)[0].normalized
            if not ('PREP' in word.tag or 'CONJ' in word.tag
                    or 'PRCL' in word.tag or 'INTJ' in word.tag):
                if word.word in word_frequencies.keys():
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word.word]
                    else:
                        sentence_scores[sent] += word_frequencies[word.word]
    return sentence_scores
def __init__(self):
    self.nlp = Russian()
    self.nlp.add_pipe(RussianTokenizer(self.nlp, MERGE_PATTERNS),
                      name="russian_tokenizer")
def __init__(self, lowercase=True, keepcaps=False, normalize=3,
             ignore_quotes=False, ignore_reddit_quotes=False,
             ignore_stopwords=False, stem=False, remove_punct=True,
             remove_breaks=True, decontract=False, twitter_handles=False,
             urls=False, hashtags=False, numbers=False, subreddits=False,
             reddit_usernames=False, emails=False, extra_patterns=None,
             keep_untokenized=None, whitespaces_to_underscores=True,
             remove_nonunicode=False, pos_emojis=None, neg_emojis=None,
             neutral_emojis=None, print_url_warnings=False,
             latin_chars_fix=False, ngrams=1):
    self.params = locals()

    # self._nlp = English()
    self._nlp = Russian()
    russian_tokenizer = RussianTokenizer(
        self._nlp, MERGE_PATTERNS + SYNTAGRUS_RARE_CASES)
    self._nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')

    self._merging_matcher = Matcher(self._nlp.vocab)
    self._matcher = Matcher(self._nlp.vocab)

    self._replacements = {}
    self._domains = {}
    self._realnames = {}
    self._stopwords = None

    alpha_digits_flag = self._nlp.vocab.add_flag(alpha_digits_check)
    hashtag_flag = self._nlp.vocab.add_flag(hashtag_check)
    twitter_handle_flag = self._nlp.vocab.add_flag(twitter_handle_check)

    self._merging_matcher.add('HASHTAG', None,
                              [{'ORTH': '#'}, {'IS_ASCII': True}])
    self._merging_matcher.add('SUBREDDIT', None,
                              [{'ORTH': '/r'}, {'ORTH': '/'}, {alpha_digits_flag: True}],
                              [{'ORTH': 'r'}, {'ORTH': '/'}, {alpha_digits_flag: True}])
    self._merging_matcher.add('REDDIT_USERNAME', None,
                              [{'ORTH': '/u'}, {'ORTH': '/'}, {alpha_digits_flag: True}],
                              [{'ORTH': 'u'}, {'ORTH': '/'}, {alpha_digits_flag: True}])

    if isinstance(ignore_stopwords, str) and ('nltk' in sys.modules):
        try:
            self._stopwords = stopwords.words(ignore_stopwords)
        except OSError:
            raise ValueError(
                'Language {} was not found by NLTK'.format(ignore_stopwords))
    elif ignore_stopwords is True:
        self._matcher.add('STOPWORDS', self._remove_token, [{'IS_STOP': True}])
    elif isinstance(ignore_stopwords, list):
        self._stopwords = [word.lower() for word in ignore_stopwords]
    elif ignore_stopwords is not False:
        raise TypeError(
            'Type {} is not supported by ignore_stopwords parameter '
            'or NLTK is not installed'.format(type(ignore_stopwords)))

    if lowercase and (not keepcaps):
        self._matcher.add('LOWERCASE', self._lowercase, [{'IS_LOWER': False}])
    elif lowercase and keepcaps:
        self._matcher.add('LOWERCASE', self._lowercase,
                          [{'IS_LOWER': False, 'IS_UPPER': False}])

    if remove_punct:
        self._matcher.add('PUNCTUATION', self._remove_token, [{'IS_PUNCT': True}])

    if remove_breaks:
        def break_check(text):
            return bool(BREAKS_RE.fullmatch(text))
        break_flag = self._nlp.vocab.add_flag(break_check)
        self._matcher.add('BREAK', self._remove_token, [{break_flag: True}])

    if normalize:
        def normalize_check(text):
            return bool(NORMALIZE_RE.search(text))
        normalize_flag = self._nlp.vocab.add_flag(normalize_check)
        self._matcher.add('NORMALIZE', self._normalize, [{normalize_flag: True}])

    if numbers is not False:
        self._matcher.add('NUMBER', self._replace_token, [{'LIKE_NUM': True}])
        self._replacements['NUMBER'] = numbers

    if urls is not False:
        if urls in ['domain', 'domain_unwrap_fast', 'domain_unwrap', 'title']:
            self._urls = urls
            self._matcher.add('URL', self._process_url, [{'LIKE_URL': True}])
        elif isinstance(urls, dict):
            self._domains = urls
            self._urls = 'domain_unwrap_fast'
            self._matcher.add('URL', self._process_url, [{'LIKE_URL': True}])
        else:
            self._matcher.add('URL', self._replace_token, [{'LIKE_URL': True}])
            self._replacements['URL'] = urls

    if emails is not False:
        self._matcher.add('EMAIL',
                          self._replace_token, [{'LIKE_EMAIL': True}])
        self._replacements['EMAIL'] = emails

    if reddit_usernames is not False:
        def reddit_username_check(text):
            return bool(REDDITORS_RE.fullmatch(text))
        reddit_username_flag = self._nlp.vocab.add_flag(reddit_username_check)
        self._matcher.add('REDDIT_USERNAME', self._replace_token,
                          [{reddit_username_flag: True}])
        self._replacements['REDDIT_USERNAME'] = reddit_usernames

    if subreddits is not False:
        def subreddit_check(text):
            return bool(SUBREDDITS_RE.fullmatch(text))
        subreddit_flag = self._nlp.vocab.add_flag(subreddit_check)
        self._matcher.add('SUBREDDIT', self._replace_token, [{subreddit_flag: True}])
        self._replacements['SUBREDDIT'] = subreddits

    if twitter_handles is not False:
        self._matcher.add('TWITTER_HANDLE', self._handles_postprocess,
                          [{twitter_handle_flag: True}])

    if hashtags is not False:
        self._matcher.add('HASHTAG', self._hashtag_postprocess,
                          [{hashtag_flag: True}])

    if hashtags == 'split' or twitter_handles == 'split':
        file = os.path.join(DATA_PATH, 'wordsfreq_wiki2.txt')
        with open(file) as f:
            self._words = f.read().split()
        self._wordcost = dict((k, log((i + 1) * log(len(self._words))))
                              for i, k in enumerate(self._words))
        self._maxword = max(len(x) for x in self._words)

    if twitter_handles == 'realname':
        with open(os.path.join(DATA_PATH, 'realnames.json')) as f:
            self._realnames = json.load(f)

    if ignore_quotes:
        self._merging_matcher.add('QUOTE', None,
                                  [{'ORTH': '"'},
                                   {'OP': '*', 'IS_ASCII': True},
                                   {'ORTH': '"'}])

        def doublequote_check(text):
            return bool(QUOTES_RE.fullmatch(text))
        doublequote_flag = self._nlp.vocab.add_flag(doublequote_check)
        self._matcher.add('DOUBLE_QUOTES', self._remove_token,
                          [{doublequote_flag: True}])

    if self._stopwords:
        def stopword_check(text):
            return bool(text.lower() in self._stopwords)
        stopword_flag = self._nlp.vocab.add_flag(stopword_check)
        self._matcher.add('STOPWORD', self._remove_token, [{stopword_flag: True}])

    if keep_untokenized is not None:
        if not isinstance(keep_untokenized, list):
            raise ValueError("keep_untokenized has to be either None or a list")
        for i, phrase in enumerate(keep_untokenized):
            phrase_tokens = phrase.split(' ')
            rule = []
            for token in phrase_tokens:
                rule.append({'LOWER': token.lower()})
            self._merging_matcher.add('RULE_' + str(i), None, rule)

    if pos_emojis:
        if not isinstance(pos_emojis, list):
            pos_emojis = POS_EMOJIS
        pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emojis]
        self._matcher.add('HAPPY', self._replace_token, *pos_patterns)
        self._replacements['HAPPY'] = 'POS_EMOJI'

    if neg_emojis:
        if not isinstance(neg_emojis, list):
            neg_emojis = NEG_EMOJIS
        neg_patterns = [[{'ORTH': emoji}] for emoji in neg_emojis]
        self._matcher.add('SAD', self._replace_token, *neg_patterns)
        self._replacements['SAD'] = 'NEG_EMOJI'

    if neutral_emojis:
        if not isinstance(neutral_emojis, list):
            neutral_emojis = NEUTRAL_EMOJIS
        neutral_patterns = [[{'ORTH': emoji}] for emoji in neutral_emojis]
        self._matcher.add('NEUTRAL', self._replace_token, *neutral_patterns)
        self._replacements['NEUTRAL'] = 'NEUTRAL_EMOJI'

    if isinstance(extra_patterns, list):
        self._flags = {}
        for name, re_pattern, replacement_token in extra_patterns:
            def flag(text):
                return bool(re_pattern.match(text))
            self._flags[name] = self._nlp.vocab.add_flag(flag)
            self._matcher.add(name, self._replace_token, [{self._flags[name]: True}])
            self._replacements[name] = replacement_token

    if stem and ('nltk' in sys.modules):
        if stem == 'stem':
            self._stemmer = PorterStemmer()
        elif stem == 'lemm':
            self._stemmer = WordNetLemmatizer()
        # elif stem == 'rus':
        #     self._stemmer = SnowballStemmer("russian")
        else:
            raise ValueError('Stemming method {} is not supported'.format(stem))
        self._matcher.add('WORD_TO_STEM', self._stem_word, [{'IS_ALPHA': True}])

    retokenize_flag = self._nlp.vocab.add_flag(retokenize_check)
    self._matcher.add('RETOKENIZE', self._retokenize,
                      [{retokenize_flag: True, 'IS_PUNCT': False,
                        'LIKE_URL': False, 'LIKE_EMAIL': False,
                        'LIKE_NUM': False, hashtag_flag: False,
                        twitter_handle_flag: False}])

    self._nlp.add_pipe(self._merge_doc, name='merge_doc', last=True)
    self._nlp.add_pipe(self._match_doc, name='match_doc', last=True)
    self._nlp.add_pipe(self._postproc_doc, name='postproc_doc', last=True)
def __init__(self):
    self.rus_word_tokenizer = Russian()
    pipe = RussianTokenizer(self.rus_word_tokenizer,
                            MERGE_PATTERNS + SYNTAGRUS_RARE_CASES)
    self.rus_word_tokenizer.add_pipe(pipe, name='russian_tokenizer')
from spacy.lang.en import English
from spacy.lang.es import Spanish
from spacy.lang.fr import French
from spacy.lang.zh import Chinese
from spacy.lang.ru import Russian
from spacy.lang.ar import Arabic
from spacy.lang.de import German
from spacy.lang.uk import Ukrainian
from spacy.lang.ro import Romanian

lang_id_to_spacy = {
    'en': English(),
    'es': Spanish(),
    'fr': French(),
    'zh-cn': Chinese(),
    'ru': Russian(),
    'ar': Arabic(),
    'de': German(),
    'uk': Ukrainian(),
    'ro': Romanian()
}

#####################
### Globals
#####################
reddit = Reddit(client_id='OFsSWAsbFrzLpg',
                client_secret='******',
                password='******',
                user_agent='testscript by /u/pocaguirre',
                username='******')
def lemmatize(self, token, pos_tag):
    # Note: builds a new blank Russian pipeline on every call.
    nlp = Russian()
    docs = iter(nlp(token))
    return next(docs).lemma_
def main(args):
    if not os.path.exists(args.save_path):
        os.mkdir(args.save_path)

    tokenizers = {
        "en": spacy.load("en_core_web_sm"),
        "zh": spacy.load("zh_core_web_sm"),
        "ru": Russian(),
        "fr": spacy.load("fr_core_news_sm"),
        "es": spacy.load("es_core_news_sm"),
        "ar": WordTokenizer("arabic"),
    }

    src_tokenizer = None
    if args.src_tok is not None:
        src_tok = tokenizers[args.src_tok]
        if args.src_tok == "ar":
            def tokenize_src(text):
                return [tok for tok in src_tok.tokenize(text)]
        else:
            def tokenize_src(text):
                return [tok.text for tok in src_tok.tokenizer(text)]
        src_tokenizer = tokenize_src

    trg_tokenizer = None
    if args.trg_tok is not None:
        trg_tok = tokenizers[args.trg_tok]
        if args.trg_tok == "ar":
            def tokenize_trg(text):
                return [tok for tok in trg_tok.tokenize(text)]
        else:
            def tokenize_trg(text):
                return [tok.text for tok in trg_tok.tokenizer(text)]
        trg_tokenizer = tokenize_trg

    if args.task == "translation":
        indices = prep_trans_files(
            args.src_file,
            args.trg_file,
            args.save_path,
            src_tok=src_tokenizer,
            trg_tok=trg_tokenizer,
            max_len=args.max_len,
            min_len=args.min_len,
        )
    elif args.task == "tagging":
        indices = prep_tag_files(
            args.src_file,
            args.save_path,
            src_tok=src_tokenizer,
            max_len=args.max_len,
            min_len=args.min_len,
        )

    train, indices = train_test_split(indices, test_size=0.3, random_state=42)
    valid, test = train_test_split(indices, test_size=0.5, random_state=42)

    split_to_tsv("train", train, args.save_path)
    split_to_tsv("test", test, args.save_path)
    split_to_tsv("valid", valid, args.save_path)

    # Delete temporary files.
    os.remove(os.path.join(args.save_path, "temp_src.txt"))
    os.remove(os.path.join(args.save_path, "temp_trg.txt"))
    tokens = ' '.join(tokens)
    return tokens.strip()


def tokenizeText(sample):
    tokens = parser(sample)
    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip()
                      if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas
    return tokens


vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(1, 1))
parser = Russian()
clf = LogisticRegression()


class CleanTextTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        return [cleanup_text(text, False) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self


pipe = Pipeline([('cleanText', CleanTextTransformer()),
                 ('vectorizer', vectorizer),
                 ('clf', clf)])
p = pipe.fit(df['text'].to_list(), df['label'].to_list())
import os
import sys
import argparse
import codecs
import json

from tqdm import tqdm
from spacy.lang.ru import Russian
from spacy.lang.en import English

from dataset_utils.utils import save_output
from dataset_utils.global_vars import TEXT_FIELDS

TOKENIZERS = {'Russian': Russian().tokenizer,
              'English': English().tokenizer}


def main(input_dir: str, output_dir: str, language: str):
    tasks = [task for task in os.listdir(input_dir) if task in TEXT_FIELDS]
    [
        preprocess_task(input_dir, output_dir, t, TOKENIZERS[language])
        for t in tqdm(tasks)
    ]


def preprocess_task(input_dir: str, output_dir, task: str, preproc_fn):
    """ Replaces raw texts with preprocessed ones. """
    if not os.path.isdir(output_dir + task):
        # Create directories for preprocessed tasks.
        os.makedirs(output_dir + task)
    samples = [
print(stem_vectorizer.get_feature_names())

#### spaCy (https://github.com/kmike/pymorphy2)
from spacy.lang.ru import Russian  # requires pymorphy2
import spacy

nlp = Russian()  # use directly
txt = train.description[900]
doc = nlp(txt)
for token in doc:
    print(token.text)

==> Not seeing any results.
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import WordPunctTokenizer
from spacy.lang.ru import Russian

nltk.download('stopwords')

stopwords = set(stopwords.words('russian'))
stopwords.update([
    '.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}',
    '#', '№'
])

lemmer = Russian()
stemmer = SnowballStemmer(language='russian')
tokenizer = WordPunctTokenizer()


def filter_words(text: str) -> str:
    tokens = []
    for token in tokenizer.tokenize(text):
        if token not in stopwords and not token.isdigit():
            tokens.append(token)
    return " ".join(tokens)


def lemmatization(text: str) -> str:
    tokens = [token.lemma_ for token in lemmer(text)]
    return " ".join(tokens)
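A small usage sketch (not in the original): chaining the two helpers above first strips stopwords, digits, and punctuation tokens, then lemmatizes what is left. The sample sentence is made up, and lemma_ here depends on spaCy's Russian lemmatizer (pymorphy2) being available.

# Illustrative chaining of filter_words and lemmatization defined above.
raw = "Коты спят на тёплых крышах."
cleaned = filter_words(raw)          # drop stopwords, digits, punctuation tokens
normalized = lemmatization(cleaned)  # lemmatize the remaining tokens
print(normalized)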
#!/usr/bin/env python3
from __future__ import unicode_literals

import re
import sys

import spacy
import spacy.lang.ru
from spacy.lang.ru import Russian

corpora_path = 'raw_corpora.txt'

global_word_count = {}
texts_word_count = []

nlp = Russian()
tokenizer = nlp.Defaults.create_tokenizer()

with open(corpora_path, 'r') as f:
    for cnt, line in enumerate(f):
        line = line.strip()
        if line == '/***/':
            texts_word_count.append({})
            continue
        doc = nlp(line)
        for word in doc:
            # Skip tokens that contain no word characters.
            if re.match(r'[\w]+', word.text, re.I) is None:
                continue
            if word.text not in global_word_count:
                global_word_count[word.text] = 1
            else:
                global_word_count[word.text] += 1
            if word.text not in texts_word_count[-1]:
                texts_word_count[-1][word.text] = 1
            else:
                texts_word_count[-1][word.text] += 1
    lemmas_.append(doc[0].lemma_)
    return lemmas_


def tokenize_set(s, lemmas, lang):
    if lemmas:
        return [lemmatize(word_tokenize(instance[0]), lang) for instance in s]
    if not lemmas:
        return [word_tokenize(instance[0]) for instance in s]


def get_y(s):
    return [instance[1] for instance in s]


ru_nlp = Russian()
de_nlp = spacy.load('de_core_news_sm')


def delete_stop_words(lang, tweet):
    doc = ''
    allowed_words = []
    if lang == "rus":
        doc = ru_nlp(tweet)
        allowed_words = ["не"]
    elif lang == "ger":
        doc = de_nlp(tweet)
        allowed_words = [
            "gut", "gute", "guter", "gutes", "kaum", "kein", "keine",
            "keinem", "keinen", "keiner", "nicht", "nichts", "nie",
            "niemand", "niemandem", "niemanden", "schlecht"
def tokenizer(inp):
    nlp = Russian()
    russian_tokenizer = RussianTokenizer(nlp, MERGE_PATTERNS)
    nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')
    return nlp(inp)
from spacy.lang.ja import Japanese
from spacy.lang.ca import Catalan
from spacy.lang.eu import Basque

from DataHandler import load_df_twitter_sent, load_df_lorelei
from util import clean_str as test_clean_str
from nltk.corpus import stopwords
from util import identity_fn, lang2id

language_dict = {
    'english': English(),
    'spanish': Spanish(),
    'french': French(),
    'italian': Italian(),
    'german': German(),
    'russian': Russian(),
    'chinese': Chinese(),
    'japanese': Japanese(),
    'catalan': Catalan(),
    'basque': Basque(),
}


class Tokenizer:
    def __init__(self, language, tokenizer_method='spacy',
                 remove_stopwords=True, lowercase=True,
                 strip_accents=None, ngram_range=(1, 1),