def norm(file):
    """Spell-check a file, normalize its words with pymorphy2 and write the
    result into the ./train and ./sub question folders."""
    global tr_qd
    global sub_qd
    global morphdict
    global d
    idd = file[len(d):-4]
    morph = pymorphy2.MorphAnalyzer()
    ifile = open(file)
    text = " ".join(line[:-1] for line in ifile.readlines())
    ifile.close()
    if idd not in sub_qd.keys() and idd not in tr_qd.keys():
        return
    speller = YandexSpeller(max_requests=555)
    changes = {}
    newwords = {}
    # the speller call is serialized across workers
    lock.acquire()
    spelled = speller.spell(text)
    lock.release()
    for change in spelled:
        try:
            changes[change['word']] = change['s'][0]
        except:
            # no suggestion for this word
            continue
    for word, suggestion in changes.items():
        try:
            text = text.replace(word, suggestion)
        except:
            continue
    for word in text.split(' '):
        if word not in morphdict.keys():
            # not cached yet: look up the normal form
            newwords[word] = morph.parse(word)[0].normal_form
            try:
                text = text.replace(word, newwords[word])
            except:
                continue
        else:
            try:
                text = text.replace(word, morphdict[word])
            except:
                continue
    if idd in tr_qd.keys():
        for q in tr_qd[idd]:
            try:
                os.mkdir('./train/{}'.format(q))
            except:
                # the directory already exists
                n = None
            ofile = open('./train/{}/{}'.format(q, file), 'w')
            ofile.write(text)
            ofile.close()
    if idd in sub_qd.keys():
        for q in sub_qd[idd]:
            try:
                os.mkdir('./sub/{}'.format(q))
            except:
                n = None
            ofile = open('./sub/{}/{}'.format(q, file), 'w')
            ofile.write(text)
            ofile.close()
    return newwords
def correct_spelling(text):
    speller = YandexSpeller()
    # keep only changes that actually carry a suggestion, otherwise
    # change['s'][0] raises IndexError for words without suggestions
    changes = {
        change['word']: change['s'][0]
        for change in speller.spell(text)
        if change['s']
    }
    for word, suggestion in changes.items():
        text = text.replace(word, suggestion)
    return text
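# A minimal usage sketch for correct_spelling above (assumes pyaspeller is
# installed and the Yandex.Speller API is reachable; the sample sentence and the
# expected output are illustrative only).
from pyaspeller import YandexSpeller

print(correct_spelling('превет, как дила?'))  # expected something like 'привет, как дела?'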
class T9Bot(object):
    def __init__(self, config: dict):
        self.blanks = config['blanks']
        self.engine = Bot(environ['TOKEN'], parse_mode=types.ParseMode.HTML)
        self.dispatcher = Dispatcher(self.engine)
        self.speller = YandexSpeller()

    def __spelled(self, message: str):
        result = self.blanks['correctly']
        try:
            corrected = self.speller.spelled(message)
        except:
            # speller failure (network error, API limit): report it instead of
            # comparing against an undefined 'corrected'
            return self.blanks['error']
        if message != corrected:
            result = self.blanks['incorrectly'].format(corrected)
        return result

    def __init_handlers(self):
        @self.dispatcher.message_handler(commands=['start', 'help'])
        async def process_command(message: types.Message):
            response = self.blanks[message.text.replace('/', '')]
            await self.engine.send_message(message.from_user.id, response)

        @self.dispatcher.message_handler()
        async def process_message(message: types.Message):
            # strip HTML tags before spell checking
            response = self.__spelled(sub(compile('<.*?>'), '', message.text))
            await message.reply(response)

    def run(self):
        self.__init_handlers()
        try:
            executor.start_polling(self.dispatcher)
        except Exception as error:
            print(f'Error: {error}')
        finally:
            # collect garbage and restart polling
            collect()
            self.run()
def text_correction(speller: YandexSpeller, text: str) -> str:
    changes = {change['word']: change['s'][0]
               for change in speller.spell(text)
               if len(change['s']) > 0}
    for word, suggestion in changes.items():
        # do not touch capitalised words (likely proper nouns)
        if word[0].isupper():
            continue
        text = text.replace(word, suggestion)
    return text
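# Usage sketch for text_correction above: one YandexSpeller instance is reused
# across calls, and capitalised words are deliberately left untouched. The sample
# string is an assumption; the real output depends on the live API response.
speller = YandexSpeller()
print(text_correction(speller, 'превет из Масквы'))  # lowercase 'превет' is corrected, capitalised 'Масквы' is skipped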
def start_comment(message):
    if message.text == 'Нет':
        bot.send_message(message.chat.id, "Окей")
        return
    bot.send_message(message.chat.id, "Приступаю))")
    global api
    global speller
    try:
        api = vk_features.login_to_vk()
        speller = YandexSpeller()
    except Exception as e:
        bot.send_message(message.chat.id, "Не могу запустить API")
        print("Can't init APIs:", e)
        exit(0)
    bot.send_message(message.chat.id, "Загрузил API")
    print("Successfully logged in!")
    user = message.chat.id
    user_data[user] = {'posts_amount': vk_features.get_post_count(api, public_id),
                       'posts_cnt': 0,
                       'curr_post': None,
                       'changes': [],
                       'changes_cnt': 0}
    ask_permission(message)
def corrgraf(message):
    _text = message.text
    speller = YandexSpeller()
    try:
        changes = {
            change['word']: change['s'][0]
            for change in speller.spell(_text)
        }
        for word, suggestion in changes.items():
            _text = _text.replace(word, suggestion)
    except IndexError:
        bot.send_message(message.from_user.id, 'Определенного слова нет в словаре')
    else:
        bot.send_message(message.from_user.id, 'Проверенный текст :> \n' + _text)
def useSpeller(self):
    self.originalText, self.errorText = FP().prepareFiles()
    originalSentencesList, errorSentencesList = EC().textToSentences(self.originalText, self.errorText)
    print(len(originalSentencesList), len(errorSentencesList))
    processedSentencesList = []
    speller = YandexSpeller()
    for sentence in errorSentencesList:
        if sentence == "":
            processedSentencesList.append(sentence)
        else:
            for change in speller.spell(sentence):
                # keep the original word when the speller has no suggestion
                if change['s'] == []:
                    changes = {change['word']: change['word']}
                else:
                    changes = {change['word']: change['s'][0]}
                for word, suggestion in changes.items():
                    sentence = sentence.replace(word, suggestion)
            processedSentencesList.append(sentence)
    Metrics().estimateCorrections(self.originalText, originalSentencesList, processedSentencesList)
def spel(text):
    speller = YandexSpeller(lang="ru",
                            ignore_urls=True,
                            ignore_tags=True,
                            ignore_capitalization=True,
                            ignore_digits=True,
                            ignore_latin=True,
                            ignore_roman_numerals=True,
                            ignore_uppercase=True,
                            find_repeat_words=False)
    try:
        changes = {
            change["word"]: change["s"][0]
            for change in speller.spell(text)
        }
        for word, suggestion in changes.items():
            text = text.replace(word, suggestion)
        return text
    except Exception as e:
        text = "О, семпай использовал запретное заклинание! Нехорошо так делать."
        return text
# Note: the pyaspeller class must be imported under an alias (the name used here
# is assumed), otherwise the class defined below shadows it and __init__ recurses.
from pyaspeller import YandexSpeller as PyaYandexSpeller


class YandexSpeller(Cleaner):
    def __init__(self, sent=False):
        self.speller = PyaYandexSpeller()
        self.sent = sent

    def transform_text(self, text):
        try:
            changes = {change['word']: change['s'][0]
                       for change in self.speller.spell(text)
                       if change['s']}
            for word, suggestion in changes.items():
                text = text.replace(word, suggestion)
        except:
            pass
        return text
from pyaspeller import YandexSpeller  # used for SPELLER below; missing from the original import list

from app.api.rules.modal_particles import MODAL_PARTICLES_REGEX
from app.api.rules.rules import *
from app.api.utils.display_utils import highlight_match
from app.api.utils.morph_utils import parse_sentence_morph, MorphRegexConverter, match_morph, \
    parse_word_morph, tokenize_sentences, tokenize_corp_sentences, Match
from app.api.utils.ngrams import log_likelihood, create_bigrams, create_trigrams

NAN_ELEMENT = 'N/A'
VOWELS = 'ауоыиэяюёе'
IPM_MULTIPLIER = 1000000

SPELLER = YandexSpeller(
    lang='ru',
    find_repeat_words=False,
    ignore_digits=True,
    ignore_latin=True,
    ignore_roman_numerals=True,
    ignore_uppercase=True,
    ignore_urls=True,
)


class Text:
    def __init__(self, text, genre, attributes):
        self.text = text
        self.attributes = attributes
        self.genre = genre
        self.sentences = self.__tokenize_sentences()
        self.morph_parsed_sentences = [
            parse_sentence_morph(sentence) for sentence in self.sentences
        ]
import config
import kb
from time import sleep
import dbworker
import os
import requests
import random
import urllib
import azuretext
import time
import datetime
import multiprocessing
from pyaspeller import YandexSpeller

speller = YandexSpeller()

from multiprocessing import Process
# import schedule
from requests import get
from aiogram import Bot, types
from aiogram.utils import executor
from aiogram.utils.markdown import text
# from aiogram.utils.markdown import bold, code, italic, text
from aiogram.dispatcher import Dispatcher
from aiogram.types import ReplyKeyboardRemove, \
    ReplyKeyboardMarkup, KeyboardButton, \
    InlineKeyboardMarkup, InlineKeyboardButton

BASE_MEDIA_PATH = './agg'

# git - 10/10
bot = Bot(token=config.token)
dp = Dispatcher(bot)
tr_qd[line[1]] = []
tr_qd[line[1]].append(line[0])
trf.close()

lfiles = comm.recv()
morph = pymorphy2.MorphAnalyzer()
morphdict = {}
for file in lfiles:
    ifile = open(file)
    text = " ".join(line[:-1] for line in ifile.readlines())
    ltext = text.split('\t')
    header = ltext[1]
    text = ltext[2]
    ifile.close()
    if ltext[0] not in sub_qd.keys() and ltext[0] not in tr_qd.keys():
        continue
    speller = YandexSpeller()
    changes = {}
    for change in speller.spell(text):
        try:
            changes[change['word']] = change['s'][0]
        except:
            continue
    for change in speller.spell(header):
        try:
            changes[change['word']] = change['s'][0]
        except:
            continue
    recbuf = {}
    newwords = {}
    for sugg in changes.values():
        if sugg not in morphdict.keys():
class TextProcessor:
    _speller = YandexSpeller()
    _word_re = re.compile('[А-яA-zёЁ]+(?:-[а-яА-Яa-zA-ZёЁ]+)?')

    @classmethod
    def tokenize_and_process(cls, text, strip_accents=True, rm_not_ascii=True,
                             rm_stopwords=True, rm_not_words=True, spell_correct=False):
        if isinstance(text, list):
            text = ' '.join(text)
        if strip_accents:
            text = cls.strip_accents(text, rm_not_ascii=rm_not_ascii)
        tokens = cls.tokenize(text)
        if rm_not_words:
            tokens = cls.rm_not_words(tokens)
        if rm_stopwords:
            tokens = cls.rm_stop_words(tokens)
        if spell_correct:
            tokens = cls.spell_correct(tokens)
        return tokens

    # === TOKENIZING HARD-CODED FROM NLTK ===
    # (in order not to download megabytes of additional resources that won't be used)
    _punkt_tokenizer = nltk.load(os.path.join(os.path.dirname(__file__),
                                              'tokenizers/punkt/english.pickle'))
    _tokenizer = nltk.TreebankWordTokenizer()

    # See discussion on https://github.com/nltk/nltk/pull/1437
    # Adding to TreebankWordTokenizer, the splits on
    # - chevron quotes u'\xab' and u'\xbb'
    # - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'
    improved_open_quote_regex = re.compile(u'([«“‘])', re.U)
    improved_close_quote_regex = re.compile(u'([»”’])', re.U)
    improved_punct_regex = re.compile(r'([^\.])(\.)([\]\)}>"\'' u'»”’ ' r']*)\s*$', re.U)
    _tokenizer.STARTING_QUOTES.insert(0, (improved_open_quote_regex, r' \1 '))
    _tokenizer.ENDING_QUOTES.insert(0, (improved_close_quote_regex, r' \1 '))
    _tokenizer.PUNCTUATION.insert(0, (improved_punct_regex, r'\1 \2 \3 '))

    @classmethod
    def tokenize(cls, text):
        sentences = cls._punkt_tokenizer.tokenize(text)
        return [token for sent in sentences for token in cls._tokenizer.tokenize(sent)]

    # === END HARD-CODED FROM NLTK ===

    # === pre-processing ===
    @classmethod
    def strip_accents(cls, text, rm_not_ascii=True):
        not_accents = []
        exceptions = ['ё', 'й']
        for char in text:
            if rm_not_ascii and char not in printable_chars:
                continue
            char_nfd_form = list(unicodedata.normalize('NFD', char))
            if len(char_nfd_form) == 1:
                if unicodedata.category(char) != 'Mn':
                    not_accents.append(char)
            elif len(char_nfd_form) == 2:
                mark, _ = tuple(char_nfd_form)
                if char.lower() in exceptions:
                    not_accents.append(char)
                else:
                    not_accents.append(mark)
        return ''.join(not_accents)

    @classmethod
    def rm_not_words(cls, tokens: List[str]):
        words_tokens = []
        for t in tokens:
            words_tokens.extend(cls._word_re.findall(t))
        return words_tokens

    @classmethod
    def rm_stop_words(cls, words: List[str]):
        return [w for w in words if w.lower() not in stopwords_set]

    # === spell correction ===
    @classmethod
    def _get_spell_corrections_dict(cls, *words):
        corrections = defaultdict()
        try:
            words_generator = cls._speller.spell(words)
            for w_info in words_generator:
                corrections[w_info.get('word')] = w_info.get('s')
        except:
            pass
        return corrections

    @classmethod
    def get_spell_correction(cls, word):
        corrections = cls._get_spell_corrections_dict(word)
        return corrections.get(word, [])

    @classmethod
    def spell_correct(cls, tokens: List[str]):
        corrections = cls._get_spell_corrections_dict(*tokens)
        corrected_tokens = []
        for token in tokens:
            token_corrections = corrections.get(token)
            if token_corrections:
                if len(token_corrections) > 1:
                    # several corrections for non-local token
                    print('Warning: ambiguous corrections for non-local token %s: %s'
                          % (token, str(token_corrections)))
                    # accept first 2 corrections
                    corrected_tokens.extend(token_corrections[:2])
                else:
                    # accept first correction
                    corrected_tokens.append(token_corrections[0])
            else:
                # accept token without correction
                corrected_tokens.append(token)
        return corrected_tokens
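# Usage sketch for TextProcessor above. Assumptions: the module-level helpers it
# relies on (printable_chars, stopwords_set, the bundled punkt pickle) are present,
# and spell_correct=True needs network access to the Yandex.Speller API.
tokens = TextProcessor.tokenize_and_process(
    'превет, как дила?',
    strip_accents=False,   # keep Cyrillic untouched in this sketch
    rm_stopwords=True,
    spell_correct=True,
)
print(tokens)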
    else:
        ext_dict[key].append(ad[key])

ext_df = pd.DataFrame(ext_dict)
ext_df['Callout'] = ext_df['Callout'].map(lambda x: x['CalloutText'])
all_unique_extensions = ext_df['Callout'].unique().tolist()
for x in all_unique_extensions:
    print(x)

### STARTING THE TYPO CHECK
speller = YandexSpeller()
print('')
print('НАЧИНАЮ ПРОВЕРКУ ОПЕЧАТОК:')
print('')
print('Чекю заголовки...')
print('')

# ## Checking the titles
error_titles = []
all_unique_titles = txtdata['Title'].unique().tolist()
class TextProcessor(Mystem):
    """
    Should be used in conjunction with the 'with' context management operator
    """
    _lemma_pattern = re.compile('[а-яА-Яa-zA-Z0-9]+(-[а-яА-Яa-zA-Z0-9]+)*')
    text_groups_delimiter = '|'

    def __init__(self):
        super().__init__()
        self._speller = YandexSpeller()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_type:
            print('%s\nLanguageProcessor closed forcibly: "%s": "%s"'
                  % (str(exc_tb), str(exc_type), str(exc_val)))
        self.close()

    @classmethod
    def _preprocess_text(cls, _text: str) -> str:
        """ Replace all 'ё' with 'е' """
        return _text.lower().replace('ё', 'е')

    def get_spelling_variants(self, text: str) -> List[str]:
        """ Returns list of all possible text spellings """
        text_words = text.split(' ')
        # get pairs of {source word: [variants of correction]}
        corrections = {
            w_info.get('word'): w_info.get('s')
            for w_info in self._speller.spell(text)
        }
        # fill array like [['Мама'], ['мыла', 'мыло'], ['раму', 'рамп']]
        words_variants = []
        for word in text_words:
            word_spellings = [word]
            if corrections.get(word):
                word_spellings.extend(corrections.get(word))
            words_variants.append(word_spellings)
        # iterate through word products and create text variants
        text_variants = [
            ' '.join(words_product)
            for words_product in product(*words_variants)
        ]
        return text_variants

    def stemming(self, text: Union[str, List[str]]) -> Union[str, List[str]]:
        """
        Performs stemming on given text(s).
        Returns string of tokens in the original order for each given text.
        :param text: if a list of texts is passed - they are grouped by delimiter
            and are processed at once
        """
        if text is None:
            return []
        if isinstance(text, list):
            _text = (' %s ' % self.text_groups_delimiter).join(text)
        else:
            _text = text
        _text = self._preprocess_text(_text)
        analyzed = self.analyze(_text)
        sentence = self._build_sentence(analyzed)
        tokens = sentence.get_tokens()
        # if there were several texts given
        if tokens.count(self.text_groups_delimiter):
            tokens_string = ' '.join(tokens)
            return tokens_string.split(' %s ' % self.text_groups_delimiter)
        else:
            return ' '.join(tokens)

    @classmethod
    def _build_sentence(cls, analyzed: dict) -> Sentence:
        sentence = Sentence()
        for l_dict in analyzed:
            analysis = l_dict.get('analysis')
            # only words which have an 'analysis' section, and are not a 'bastard',
            # are really recognized lemmas
            if analysis and analysis[-1].get('qual') != 'bastard':
                try:
                    lemma = LemmaParser.parse_lemma(l_dict)
                    sentence.append(lemma)
                except Exception as e:
                    print('%s: %s' % (str(type(e)), str(e)))
                    traceback.print_tb(e.__traceback__)
            # groups delimiter
            elif cls.text_groups_delimiter in l_dict.get('text'):
                sentence.append(
                    LemmaParser.get_delimiter_lemma(cls.text_groups_delimiter))
            # not-recognized word, but still a word
            elif re.match(cls._lemma_pattern, l_dict.get('text')):
                sentence.append(
                    LemmaParser.get_arbitrary_lemma(l_dict.get('text')))
        return sentence
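# Usage sketch for the Mystem-based TextProcessor above. Assumptions: pymystem3
# can locate (or download) the mystem binary, LemmaParser and Sentence are
# importable, and the Yandex.Speller API is reachable for get_spelling_variants.
with TextProcessor() as processor:
    print(processor.get_spelling_variants('превет мир'))  # e.g. ['превет мир', 'привет мир']
    print(processor.stemming('мама мыла раму'))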