Example #1
def norm(file):
    global tr_qd
    global sub_qd
    global morphdict
    global d
    idd = file[len(d):-4]
    morph = pymorphy2.MorphAnalyzer()
    ifile = open(file)
    text = " ".join(line[:-1] for line in ifile.readlines())
    ifile.close()
    if idd not in sub_qd and idd not in tr_qd:
        return
    speller = YandexSpeller(max_requests=555)
    changes = {}
    newwords = {}
    lock.acquire()
    spelled = speller.spell(text)
    lock.release()
    for change in spelled:
        # Keep only the first suggestion; skip words the speller flagged without suggestions
        if change.get('s'):
            changes[change['word']] = change['s'][0]
    for word, suggestion in changes.items():
        text = text.replace(word, suggestion)
    for word in text.split(' '):
        if word not in morphdict:
            newwords[word] = morph.parse(word)[0].normal_form
            text = text.replace(word, newwords[word])
        else:
            text = text.replace(word, morphdict[word])
    if idd in tr_qd:
        for q in tr_qd[idd]:
            os.makedirs('./train/{}'.format(q), exist_ok=True)
            with open('./train/{}/{}'.format(q, file), 'w') as ofile:
                ofile.write(text)
    if idd in sub_qd:
        for q in sub_qd[idd]:
            os.makedirs('./sub/{}'.format(q), exist_ok=True)
            with open('./sub/{}/{}'.format(q, file), 'w') as ofile:
                ofile.write(text)
    return newwords
Example #2
def correct_spelling(text):
    speller = YandexSpeller()
    changes = {
        change['word']: change['s'][0]
        for change in speller.spell(text)
    }
    for word, suggestion in changes.items():
        text = text.replace(word, suggestion)
    return text
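A minimal usage sketch for the helper above, assuming pyaspeller is installed and the Yandex.Speller web service is reachable; the misspelled sample and the expected result are purely illustrative. Note that change['s'] may be an empty list, in which case change['s'][0] raises IndexError (Example #4 below guards against that case):

from pyaspeller import YandexSpeller

sample = 'превет, мир'           # deliberate misspelling of 'привет, мир'
print(correct_spelling(sample))  # illustrative output: 'привет, мир'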
Example #3
class T9Bot(object):
    def __init__(self, config : dict):
        self.blanks = config['blanks']
        self.engine = Bot(environ['TOKEN'], parse_mode = types.ParseMode.HTML)
        self.dispatcher = Dispatcher(self.engine)
        self.speller = YandexSpeller()

    def __spelled(self, message : str):
        try:
            corrected = self.speller.spelled(message)
        except Exception:
            # Speller service unavailable or the request failed
            return self.blanks['error']
        if message != corrected:
            return self.blanks['incorrectly'].format(corrected)
        return self.blanks['correctly']

    def __init_handlers(self):
        @self.dispatcher.message_handler(commands = ['start', 'help'])
        async def process_command(message : types.Message):
            response = self.blanks[message.text.replace('/', '')]
            await self.engine.send_message(message.from_user.id, response)
        @self.dispatcher.message_handler()
        async def process_message(message : types.Message):
            response = self.__spelled(sub('<.*?>', '', message.text))
            await message.reply(response)

    def run(self):
        self.__init_handlers()
        try:
            executor.start_polling(self.dispatcher)
        except Exception as error:
            print(f'Error: {error}')
        finally:
            # Collect garbage and restart polling if it ever stops
            collect()
            self.run()
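A hypothetical launch sketch for the class above. The 'TOKEN' environment variable and the 'blanks' keys ('start', 'help', 'correctly', 'incorrectly', 'error') are inferred from the code; the placeholder values are illustrative:

from os import environ

environ.setdefault('TOKEN', '123456789:replace-with-a-real-bot-token')
config = {
    'blanks': {
        'start': 'Send me a message and I will check its spelling.',
        'help': 'Send me a message and I will check its spelling.',
        'correctly': 'No mistakes found.',
        'incorrectly': 'Did you mean: {}',
        'error': 'The spelling service is unavailable right now.',
    }
}
T9Bot(config).run()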
Example #4
def text_correction(speller: YandexSpeller, text: str) -> str:
    changes = {change['word']: change['s'][0] for change in speller.spell(text) if len(change['s']) > 0}
    for word, suggestion in changes.items():
        if word[0].isupper():
            continue
        text = text.replace(word, suggestion)
    return text
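A hedged usage sketch for the helper above; the sample text and the corrected result are illustrative (capitalised words, e.g. proper nouns, are deliberately left untouched by the function):

from pyaspeller import YandexSpeller

speller = YandexSpeller(lang='ru')
print(text_correction(speller, 'Москва, сталица России'))  # illustrative result: 'Москва, столица России'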
Example #5
def start_comment(message):
    if message.text == 'Нет':
        bot.send_message(message.chat.id, "Окей")
        return

    bot.send_message(message.chat.id, "Приступаю))")

    global api
    global speller

    try:
        api = vk_features.login_to_vk()
        speller = YandexSpeller()
    except Exception as e:
        bot.send_message(message.chat.id, "Не могу запустить API")
        print("Can't init APIs:", e)
        exit(0)

    bot.send_message(message.chat.id, "Загрузил API")
    print("Successfully logged in!")

    user = message.chat.id
    user_data[user] = {'posts_amount': vk_features.get_post_count(api, public_id), 'posts_cnt': 0,
                       'curr_post': None, 'changes': [], 'changes_cnt': 0}
    ask_permission(message)
Example #6
def corrgraf(message):
    _text = message.text
    speller = YandexSpeller()
    try:
        changes = {
            change['word']: change['s'][0]
            for change in speller.spell(_text)
        }
        for word, suggestion in changes.items():
            _text = _text.replace(word, suggestion)
    except IndexError:
        bot.send_message(message.from_user.id,
                         'Определенного слова нет в словаре')
    else:
        bot.send_message(message.from_user.id,
                         'Проверенный текст :> \n' + _text)
Example #7
 def useSpeller(self):
     self.originalText, self.errorText = FP().prepareFiles()
     originalSentencesList, errorSentencesList = EC().textToSentences(self.originalText, self.errorText)
     print(len(originalSentencesList), len(errorSentencesList))
     processedSentencesList = []
     speller = YandexSpeller()
     for sentence in errorSentencesList:
         if sentence == "":
             processedSentencesList.append(sentence)
         else:
             for change in speller.spell(sentence):
                 # Apply the first suggestion when one exists; otherwise keep the word as is
                 if change['s']:
                     sentence = sentence.replace(change['word'], change['s'][0])
             processedSentencesList.append(sentence)
     Metrics().estimateCorrections(self.originalText,originalSentencesList, processedSentencesList)
Example #8
def spel(text):
    speller = YandexSpeller(lang="ru",
                            ignore_urls=True,
                            ignore_tags=True,
                            ignore_capitalization=True,
                            ignore_digits=True,
                            ignore_latin=True,
                            ignore_roman_numerals=True,
                            ignore_uppercase=True,
                            find_repeat_words=False)
    try:
        changes = {
            change["word"]: change["s"][0]
            for change in speller.spell(text)
        }
        for word, suggestion in changes.items():
            text = text.replace(word, suggestion)
        return text
    except Exception as e:
        text = "О, семпай использовал запретное заклинание! Нехорошо так делать."
        return text
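A hypothetical call of the helper above; the input string and the corrected output are illustrative:

print(spel('малако и хлеп'))  # e.g. 'молоко и хлеб'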
Example #9
class YandexSpeller(Cleaner):
    
    def __init__(self, sent=False):
        
        # NOTE: this class shadows pyaspeller's own YandexSpeller; for the line
        # below to create the library speller (rather than recurse into this
        # class), the library class must be imported under an alias, e.g.
        # "from pyaspeller import YandexSpeller as _PyaSpeller", and used here.
        self.speller = YandexSpeller()
        self.sent = sent
    
    def transform_text(self, text):
        
        try:
            changes = {change['word']: change['s'][0] 
                       for change in self.speller.spell(text) if change['s']}
            
            for word, suggestion in changes.items():
                text = text.replace(word, suggestion)
        
        except:
            pass
        
        return text
Example #10
from pyaspeller import YandexSpeller

from app.api.rules.modal_particles import MODAL_PARTICLES_REGEX
from app.api.rules.rules import *
from app.api.utils.display_utils import highlight_match
from app.api.utils.morph_utils import parse_sentence_morph, MorphRegexConverter, match_morph, \
    parse_word_morph, tokenize_sentences, tokenize_corp_sentences, Match
from app.api.utils.ngrams import log_likelihood, create_bigrams, create_trigrams

NAN_ELEMENT = 'N/A'
VOWELS = 'ауоыиэяюёе'
IPM_MULTIPLIER = 1000000

SPELLER = YandexSpeller(
    lang='ru',
    find_repeat_words=False,
    ignore_digits=True,
    ignore_latin=True,
    ignore_roman_numerals=True,
    ignore_uppercase=True,
    ignore_urls=True,
)


class Text:
    def __init__(self, text, genre, attributes):
        self.text = text
        self.attributes = attributes
        self.genre = genre

        self.sentences = self.__tokenize_sentences()
        self.morph_parsed_sentences = [
            parse_sentence_morph(sentence) for sentence in self.sentences
Example #11
 def __init__(self, config : dict):
     self.blanks = config['blanks']
     self.engine = Bot(environ['TOKEN'], parse_mode = types.ParseMode.HTML)
     self.dispatcher = Dispatcher(self.engine)
     self.speller = YandexSpeller()
Example #12
import config
import kb
from time import sleep
import dbworker
import os
import requests
import random
import urllib
import azuretext
import time
import datetime
import multiprocessing
from pyaspeller import YandexSpeller
from multiprocessing import Process
#import schedule
from requests import get
from aiogram import Bot, types
from aiogram.utils import executor
from aiogram.utils.markdown import text
#from aiogram.utils.markdown import bold, code, italic, text
from aiogram.dispatcher import Dispatcher
from aiogram.types import ReplyKeyboardRemove, \
    ReplyKeyboardMarkup, KeyboardButton, \
    InlineKeyboardMarkup, InlineKeyboardButton

speller = YandexSpeller()

BASE_MEDIA_PATH = './agg'
# git - 10/10 
bot = Bot(token=config.token)
dp = Dispatcher(bot)
Example #13
        tr_qd[line[1]] = []
    tr_qd[line[1]].append(line[0])
trf.close()
lfiles = comm.recv()
morph = pymorphy2.MorphAnalyzer()
morphdict = {}
for file in lfiles:
    ifile = open(file)
    text = " ".join(line[:-1] for line in ifile.readlines())
    ltext = text.split('\t')
    header = ltext[1]
    text = ltext[2]
    ifile.close()
    if ltext[0] not in sub_qd.keys() and ltext[0] not in tr_qd.keys():
        continue
    speller = YandexSpeller()
    changes = {}
    for change in speller.spell(text):
        # Keep only the first suggestion; skip words with an empty suggestion list
        if change.get('s'):
            changes[change['word']] = change['s'][0]
    for change in speller.spell(header):
        if change.get('s'):
            changes[change['word']] = change['s'][0]
    recbuf = {}
    newwords = {}
    for sugg in changes.values():
        if sugg not in morphdict.keys():
Example #14
 def __init__(self, sent=False):
     
     self.speller = YandexSpeller()
     self.sent = sent
Example #15
class TextProcessor:
    _speller = YandexSpeller()

    _word_re = re.compile('[А-яA-zёЁ]+(?:-[а-яА-Яa-zA-ZёЁ]+)?')

    @classmethod
    def tokenize_and_process(cls, text, strip_accents=True, rm_not_ascii=True, rm_stopwords=True, rm_not_words=True,
                             spell_correct=False):
        if isinstance(text, list):
            text = ' '.join(text)

        if strip_accents:
            text = cls.strip_accents(text, rm_not_ascii=rm_not_ascii)

        tokens = cls.tokenize(text)

        if rm_not_words:
            tokens = cls.rm_not_words(tokens)

        if rm_stopwords:
            tokens = cls.rm_stop_words(tokens)

        if spell_correct:
            tokens = cls.spell_correct(tokens)

        return tokens

    # === TOKENIZING HARD-CODED FROM NLTK ===
    # (to avoid downloading megabytes of additional resources that would not be used)

    _punkt_tokenizer = nltk.load(os.path.join(os.path.dirname(__file__), 'tokenizers/punkt/english.pickle'))

    _tokenizer = nltk.TreebankWordTokenizer()

    # See discussion on https://github.com/nltk/nltk/pull/1437
    # Adding to TreebankWordTokenizer, the splits on
    # - chervon quotes u'\xab' and u'\xbb' .
    # - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'

    improved_open_quote_regex = re.compile(u'([«“‘])', re.U)
    improved_close_quote_regex = re.compile(u'([»”’])', re.U)
    improved_punct_regex = re.compile(r'([^\.])(\.)([\]\)}>"\'' u'»”’ ' r']*)\s*$', re.U)
    _tokenizer.STARTING_QUOTES.insert(0, (improved_open_quote_regex, r' \1 '))
    _tokenizer.ENDING_QUOTES.insert(0, (improved_close_quote_regex, r' \1 '))
    _tokenizer.PUNCTUATION.insert(0, (improved_punct_regex, r'\1 \2 \3 '))

    @classmethod
    def tokenize(cls, text):
        sentences = cls._punkt_tokenizer.tokenize(text)

        return [token for sent in sentences
                for token in cls._tokenizer.tokenize(sent)]

    # === END HARD-CODED FROM NLTK ===

    # === pre-processing ===

    @classmethod
    def strip_accents(cls, text, rm_not_ascii=True):
        not_accents = []
        exceptions = ['ё', 'й']

        for char in text:
            if rm_not_ascii and char not in printable_chars:
                continue

            char_nfd_form = list(unicodedata.normalize('NFD', char))

            if len(char_nfd_form) == 1:
                if unicodedata.category(char) != 'Mn':
                    not_accents.append(char)

            elif len(char_nfd_form) == 2:
                # NFD splits an accented char into (base char, combining mark)
                base, _ = tuple(char_nfd_form)

                if char.lower() in exceptions:
                    not_accents.append(char)

                else:
                    not_accents.append(base)

        return ''.join(not_accents)

    @classmethod
    def rm_not_words(cls, tokens: List[str]):
        words_tokens = []

        for t in tokens:
            words_tokens.extend(cls._word_re.findall(t))

        return words_tokens

    @classmethod
    def rm_stop_words(cls, words: List[str]):
        return [w
                for w in words
                if w.lower() not in stopwords_set]

    # === spell correction ===

    @classmethod
    def _get_spell_corrections_dict(cls, *words):
        corrections = {}

        try:
            words_generator = cls._speller.spell(words)

            for w_info in words_generator:
                corrections[w_info.get('word')] = w_info.get('s')

        except Exception:
            # Speller/network failure: return whatever was collected so far
            pass

        return corrections

    @classmethod
    def get_spell_correction(cls, word):
        corrections = cls._get_spell_corrections_dict(word)

        return corrections.get(word, [])

    @classmethod
    def spell_correct(cls, tokens: List[str]):
        corrections = cls._get_spell_corrections_dict(*tokens)

        corrected_tokens = []

        for token in tokens:
            token_corrections = corrections.get(token)

            if token_corrections:
                if len(token_corrections) > 1:
                    # several corrections for a non-local token
                    print('Warning: ambiguous corrections for non-local token %s: %s' %
                          (token, str(token_corrections)))

                    # accept first 2 corrections
                    corrected_tokens.extend(token_corrections[:2])

                else:
                    # accept first correction
                    corrected_tokens.append(token_corrections[0])

            else:
                # accept token without correction
                corrected_tokens.append(token)

        return corrected_tokens
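A hypothetical usage sketch for the class above. It assumes the module-level stopwords_set global and the bundled punkt pickle referenced in the code are available; the sample text is illustrative:

tokens = TextProcessor.tokenize_and_process(
    'Превет, мир! Это тестовый текст.',
    strip_accents=False,   # keep the Cyrillic text untouched
    spell_correct=True,    # route tokens through YandexSpeller
)
print(tokens)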
Example #16
        else:
            ext_dict[key].append(ad[key])

ext_df = pd.DataFrame(ext_dict)
ext_df['Callout'] = ext_df['Callout'].map(lambda x: x['CalloutText'])


all_unique_extensions = ext_df['Callout'].unique().tolist()

for x in all_unique_extensions:
    print(x)
    
    
### STARTING THE TYPO CHECK

speller = YandexSpeller()
 
print('') 
print('НАЧИНАЮ ПРОВЕРКУ ОПЕЧАТОК:')    
print('') 

print('Чекю заголовки...')
print('') 


# ## Checking the titles


error_titles = []

all_unique_titles = txtdata['Title'].unique().tolist()
Example #17
 def __init__(self):
     super().__init__()
     self._speller = YandexSpeller()
Example #18
class TextProcessor(Mystem):
    """
    Should be used in conjunction with 'with' context management operator
    """
    _lemma_pattern = re.compile('[а-яА-Яa-zA-Z0-9]+(-[а-яА-Яa-zA-Z0-9]+)*')

    text_groups_delimiter = '|'

    def __init__(self):
        super().__init__()
        self._speller = YandexSpeller()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_type:
            print('%s\nTextProcessor closed forcibly: "%s": "%s"' %
                  (str(exc_tb), str(exc_type), str(exc_val)))

        self.close()

    @classmethod
    def _preprocess_text(cls, _text: str) -> str:
        """
        Replace all 'ё' with 'е'
        """
        return _text.lower().replace('ё', 'е')

    def get_spelling_variants(self, text: str) -> List[str]:
        """
        Returns list of all possible text spellings
        """
        text_words = text.split(' ')

        # get pairs of {source word: [variants of correction]}
        corrections = {
            w_info.get('word'): w_info.get('s')
            for w_info in self._speller.spell(text)
        }

        # fill a list of per-word variants like [['Мама'], ['мыла', 'мыло'], ['раму', 'рампу']]
        words_variants = []
        for word in text_words:
            word_spellings = [word]

            if corrections.get(word):
                word_spellings.extend(corrections.get(word))

            words_variants.append(word_spellings)

        # iterate over the Cartesian product of word variants and build text variants
        text_variants = [
            ' '.join(words_product)
            for words_product in product(*words_variants)
        ]

        return text_variants

    def stemming(self, text: Union[str, List[str]]) -> Union[str, List[str]]:
        """
        Performs stemming on given text(s).
        Returns string of tokens in the original order for each given text.

        :param text : if list of texts is passed - they are grouped by delimiter and are processed at once
        """
        if text is None:
            return []

        if isinstance(text, list):
            _text = (' %s ' % self.text_groups_delimiter).join(text)

        else:
            _text = text

        _text = self._preprocess_text(_text)

        analyzed = self.analyze(_text)

        sentence = self._build_sentence(analyzed)

        tokens = sentence.get_tokens()

        # if there were several texts given
        if tokens.count(self.text_groups_delimiter):
            tokens_string = ' '.join(tokens)

            return tokens_string.split(' %s ' % self.text_groups_delimiter)

        else:
            return ' '.join(tokens)

    @classmethod
    def _build_sentence(cls, analyzed: dict) -> Sentence:
        sentence = Sentence()

        for l_dict in analyzed:
            analysis = l_dict.get('analysis')

            # only words that have an 'analysis' section and are not marked 'bastard' are truly recognized lemmas
            if analysis and analysis[-1].get('qual') != 'bastard':
                try:
                    lemma = LemmaParser.parse_lemma(l_dict)
                    sentence.append(lemma)

                except Exception as e:
                    print('%s: %s' % (str(type(e)), str(e)))
                    traceback.print_tb(e.__traceback__)

            # groups delimiter
            elif cls.text_groups_delimiter in l_dict.get('text'):
                sentence.append(
                    LemmaParser.get_delimiter_lemma(cls.text_groups_delimiter))

            # not-recognized word, but still word
            elif re.match(cls._lemma_pattern, l_dict.get('text')):
                sentence.append(
                    LemmaParser.get_arbitrary_lemma(l_dict.get('text')))

        return sentence
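A hypothetical usage sketch for the class above, assuming pymystem3 (which fetches the Mystem binary on first run) and pyaspeller are installed; the sample sentence and its spelling variants are illustrative:

with TextProcessor() as processor:
    print(processor.get_spelling_variants('мама мыла раму'))
    print(processor.stemming('мама мыла раму'))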