예제 #1
0
def spellcheckword(word):
    """Return the first spelling suggestion for ``word``, or ``word`` itself.

    The word is checked with ``Word``. It is returned unchanged when the
    checker reports it as correct, or when it is reported as incorrect but
    the list of variants is empty.

    :param word: the single word to check
    :return: ``check.variants[0]`` for a misspelt word that has at least one
        variant, otherwise ``word``
    """
    check = Word(word)
    if check.correct:
        return word
    try:
        return check.variants[0]
    except IndexError:
        # Reported as misspelt, but the checker offered no variants.
        return word
예제 #2
0
def spell_word(word: str):
    """Spell-correct ``word`` unless it is very short or a function word.

    The part of speech is taken from the first ``morph.parse`` result. Words
    tagged as one of ``PREP``, ``CONJ``, ``PRCL``, ``INTJ`` or ``NPRO``, and
    words of two characters or fewer, are returned unchanged without being
    sent to the speller.

    This is a best-effort helper: any ordinary exception raised while
    analysing or checking the word makes it fall back to returning ``word``.
    ``KeyboardInterrupt`` and ``SystemExit`` are no longer swallowed.

    :param word: the single word to check
    :return: ``word`` if it is skipped, correct, or the check failed;
        otherwise the speller's ``spellsafe`` value
    """
    skipped_pos = ('PREP', 'CONJ', 'PRCL', 'INTJ', 'NPRO')
    try:
        pos = morph.parse(word)[0].tag.POS
        if pos in skipped_pos or len(word) <= 2:
            return word
        speller_w = Word(word)
        return word if speller_w.correct else speller_w.spellsafe
    except Exception:
        # Deliberate fallback: an unparsable word or a failed speller call
        # must not break the caller's pipeline.
        return word
예제 #3
0
def speller(text):
    """Return ``text`` with misspelt space-separated words corrected.

    ``text`` is split on single spaces. Every distinct token of at least two
    characters is checked with ``Word``; a token that is reported incorrect
    and has a truthy ``spellsafe`` value is replaced by that value. Tokens
    are replaced as whole words, so a correction never rewrites a substring
    of another word, and all corrections are kept (not only the last one).
    The original spacing is preserved.

    The function is best effort: if checking fails part-way, the corrections
    gathered so far are still applied and no exception escapes. Input that
    cannot be split as a string is returned untouched.

    :param text: the text to correct
    :return: the corrected text, or ``text`` itself if nothing was changed
    """
    try:
        tokens = text.split(" ")
    except (AttributeError, TypeError):
        # Not a str: nothing to correct, hand the input back as is.
        return text

    corrections = {}
    try:
        # Iterate over the distinct tokens so each word is checked once.
        for word in set(tokens):
            if len(word) < 2:
                continue
            check = Word(word)
            suggestion = None if check.correct else check.spellsafe
            if suggestion:
                corrections[word] = suggestion
    except Exception:
        # Deliberate best effort: a failing speller call stops the checking
        # but must not lose the text or the corrections already found.
        pass

    if not corrections:
        return text
    return " ".join(corrections.get(token, token) for token in tokens)
예제 #4
0
def check_text(word_list):
    """Print a report for every word in ``word_list`` the checker rejects.

    Each word is checked with ``Word``. Nothing is printed for words that are
    reported correct. For the others an "Unrecognized word" line is printed,
    followed by one line with the checker's suggestions.

    :param word_list: iterable of words (strings) to check
    :return: ``None``; the report goes to standard output
    """
    for word in word_list:
        check = Word(word)
        if check.correct:
            continue

        print('Unrecognized word: ' + word)
        if check.spellsafe is None:
            # NOTE(review): this branch can only ever print "None"; the two
            # messages look swapped. Output is kept as it was; confirm intent.
            print('Suggested edits: ' + str(check.spellsafe))
        else:
            print('Did you mean: ' + str(check.variants))
예제 #5
0
 def correct(txt):
     """Return ``txt`` with each misspelt space-separated word replaced.

     ``txt`` is split on single spaces and every piece is checked with
     ``Word``. A piece reported as correct is kept; otherwise it is replaced
     by the first entry of ``check.variants``. If reading the check result
     fails (for example because there are no variants), the original piece
     is kept. The pieces are joined back with single spaces.

     :param txt: the text to correct
     :return: the corrected text
     """
     fixed = []
     for ae in txt.split(" "):
         check = Word(ae)
         try:
             fixed.append(ae if check.correct else check.variants[0])
         except Exception:
             # No usable suggestion for this piece: keep it unchanged.
             fixed.append(ae)
     return " ".join(fixed)
예제 #6
0
def automatic_typo_correction(text):
    """
    Automatically fix typos in a text using pyaspeller.

    Each whitespace-separated word is replaced by ``Word(word).spellsafe``
    when that value is truthy and kept as is otherwise. Every word in the
    result, including the last one, is followed by a single space.

    :param text: string containing the text
    :return: the string after processing
    """
    pieces = [
        (Word(token).spellsafe or token) + " "
        for token in text.split()
    ]
    return "".join(pieces)
예제 #7
0
    def clean_typos(self, text):
        """Return ``text`` with typos fixed, word by word.

        For each whitespace-separated word, ``self.check_dict`` is consulted
        first and a truthy result is used directly (presumably a cache of
        earlier corrections; verify against ``check_dict``). Otherwise the
        word is checked with ``Word``: it is kept if reported correct or if
        no variants are offered, else replaced by the first variant. The
        outcome is then passed to ``self.update_dict`` together with the
        original word.

        :param text: the text to clean
        :return: the processed words joined with single spaces
        """
        cleaned = []

        for token in text.split():
            known = self.check_dict(token)
            if known:
                cleaned.append(known)
                continue

            check = Word(token)
            if not check.correct and len(check.variants) != 0:
                replacement = check.variants[0]
            else:
                replacement = token
            cleaned.append(replacement)
            self.update_dict(token, replacement)

        return ' '.join(cleaned)
예제 #8
0
def correctSpelling(line):
    """Return the spell-checked words of ``line`` joined by single spaces.

    Words containing any of the stop symbols (slashes, ``№``, ``:``, a digit
    or an en dash) are dropped without being checked. Every other word is
    checked with ``Word``:

    * a word reported correct is kept as is;
    * an incorrect word with a truthy ``spellsafe`` value is replaced by that
      value, with every ``-`` in it turned into a space;
    * an incorrect word without such a value is dropped.

    A word whose check raises an ordinary exception is dropped as well.
    ``KeyboardInterrupt`` and ``SystemExit`` are no longer swallowed.

    :param line: the text to process
    :return: the surviving words joined with single spaces
    """
    stop_sybols = (
        u'/', u'\\', u'№', u':', u'1', u'2', u'3', u'4', u'5', u'6', u'7',
        u'8', u'9', u'0', u'–',
    )
    checked_words = []
    for word in line.split():
        if any(st in word for st in stop_sybols):
            continue
        try:
            check = Word(word)
            if check.correct:
                checked_words.append(word)
            elif check.spellsafe:
                checked_words.append(
                    check.spellsafe.translate({ord(u"-"): " "}))
        except Exception:
            # Deliberate best effort: a word that cannot be checked is left
            # out of the result rather than aborting the whole line.
            continue
    return " ".join(checked_words)
예제 #9
0
def spellchecking(cleantweet):
    """Spell-correct and lemmatise every word of ``cleantweet``.

    Each whitespace-separated word is checked with ``Word``. A word reported
    incorrect that has at least one variant is printed together with its
    variants and replaced by the first variant. Every word is then reduced
    to the ``normal_form`` of its first ``morph.parse`` result.

    :param cleantweet: the text to process
    :return: the normal forms joined with single spaces
    """
    tokens = cleantweet.split()
    lemmas = []
    morph = pymorphy2.MorphAnalyzer()

    for token in tokens:
        check = Word(token)
        if not check.correct and len(check.variants) != 0:
            # Report the replacement on stdout before applying it.
            print(token)
            print(check.variants)
            token = check.variants[0]

        lemmas.append(str(morph.parse(token)[0].normal_form))
    return " ".join(lemmas)
예제 #10
0
def check_word(one_word):
    """Print a short report if ``one_word`` is rejected by the checker.

    Nothing is printed when ``Word`` reports the word as correct. Otherwise
    the word and the checker's variants are printed.

    :param one_word: the single word to check
    :return: ``None``; the report goes to standard output
    """
    check = Word(one_word)
    if not check.correct:
        print('Incorrect word: ' + one_word)
        # ``variants`` is a list, so it must be converted before being
        # concatenated to a str (plain ``+`` raises TypeError).
        print('Did you mean: ' + str(check.variants))
예제 #11
0
    raw = [line.rstrip() for line in f.readlines()]

# Read the reference normalisation and the output of run `run`, one entry
# per line with trailing whitespace removed.
with open('./by_wd/test.norm', 'r') as f:
    gold_norm = [line.rstrip() for line in f.readlines()]

with open('./by_wd/{}'.format(run), 'r') as f:
    csmt_norm = [line.rstrip() for line in f.readlines()]

# Write one TSV row per word: raw, gold, run output, spell-checked run output.
# The context manager closes the file even if a speller call raises.
with open('./0_results/aligned_{}.tsv'.format(run), 'w') as w:
    # NOTE: the loop target `raw` rebinds the name of the list being zipped.
    # This works because zip() receives the list before the first rebinding.
    for raw, gold, csmt in zip(raw, gold_norm, csmt_norm):
        raw_words = raw.split()
        gold_words = gold.split()
        csmt_words = csmt.split()
        csmt_spellcheck = []

        for word in csmt_words:
            sp = Word(word)
            checked = sp.spellsafe
            # Keep the original word when the speller has no suggestion.
            csmt_spellcheck.append(checked if checked else word)

        # zip_longest pads the shorter columns with None, which is written
        # out as the text "None".
        for line in itertools.zip_longest(raw_words, gold_words, csmt_words,
                                          csmt_spellcheck):
            w.write('{}\t{}\t{}\t{}\n'.format(*line))
예제 #12
0
from pyaspeller import Word

# f = open('./diff_sizes/norm.txt', 'r')
#
# for line in f:
#     for word in line.rstrip().split():
#         sp = Word(word)
#         checked = sp.spellsafe
#         if checked:
#             print(word)
# Build a Word for the same input ten times and print its `spellsafe` value
# each time (presumably a manual check that repeated queries give a stable
# answer; TODO confirm).
for i in range(10):
    sp = Word('кашаю')
    checked = sp.spellsafe
    print(checked)
def _caesar_shift_back(ciphertext, key, alphabet):
    """Shift every ``alphabet`` letter of ``ciphertext`` back by ``key``."""
    size = len(alphabet)
    decoded = []
    for symbol in ciphertext:
        position = alphabet.find(symbol)
        if position == -1:
            # Not a letter of the alphabet (space, punctuation): keep as is.
            decoded.append(symbol)
        else:
            # Modulo wraps indices that fall before 'A' round to 'Z'.
            decoded.append(alphabet[(position - key) % size])
    return ''.join(decoded)


def main():
    """Brute-force a Caesar-shifted message and print the first likely key.

    Every shift from 0 to 25 is applied to the hard-coded ciphertext. For
    each candidate plaintext the first two alphabetic words are checked with
    ``Word``. The first key for which at least one of the two is recognised
    is printed together with the plaintext, and the search stops. Nothing
    besides the alphabet is printed if no key qualifies.

    NOTE(review): previously the second word never influenced the result
    because the ``else`` was attached to the outer ``if``. The check now
    follows the inline comments (skip a key only when neither word is
    recognised); confirm that this is the intended rule.
    """
    import re
    import string

    # the ciphertext
    ciphertext = (
        'RQH YDULDWLRQ WR WKH VWDQGDUG FDHVDU FLSKHU LV ZKHQ WKH DOSKDEHW LV "NHBHG" EB XVLQJ D ZRUG. LQ WKH '
        'WUDGLWLRQDO YDULHWB, RQH FRXOG ZULWH WKH DOSKDEHW RQ WZR VWULSV DQG MXVW PDWFK XS WKH VWULSV DIWHU '
        'VOLGLQJ WKH ERWWRP VWULS WR WKH OHIW RU ULJKW. WR HQFRGH, BRX ZRXOG ILQG D OHWWHU LQ WKH WRS URZ DQG '
        'VXEVWLWXWH LW IRU WKH OHWWHU LQ WKH ERWWRP URZ. IRU D NHBHG YHUVLRQ, RQH ZRXOG QRW XVH D VWDQGDUG '
        'DOSKDEHW, EXW ZRXOG ILUVW ZULWH D ZRUG (RPLWWLQJ GXSOLFDWHG OHWWHUV) DQG WKHQ ZULWH WKH UHPDLQLQJ OHWWHUV '
        'RI WKH DOSKDEHW. IRU WKH HADPSOH EHORZ, L XVHG D NHB RI "UXPNLQ.FRP" DQG BRX ZLOO VHH WKDW WKH SHULRG LV '
        'UHPRYHG EHFDXVH LW LV QRW D OHWWHU. BRX ZLOO DOVR QRWLFH WKH VHFRQG "P" LV QRW LQFOXGHG EHFDXVH WKHUH ZDV '
        'DQ P DOUHDGB DQG BRX FDQ\'W KDYH GXSOLFDWHV. '
    )

    # the alphabet doubles as the list of possible keys (its indices 0 - 25)
    alphabet = string.ascii_uppercase

    # check to see if alphabet populates correctly
    print(alphabet)

    for key in range(len(alphabet)):
        plaintext = _caesar_shift_back(ciphertext, key, alphabet)

        # split the plaintext into alphabetic words, dropping the empty
        # strings re.split yields at the edges
        words = [w for w in re.split("[^a-zA-Z]+", plaintext) if w]

        # look at the first two words; skip the key if neither is recognised
        if not any(Word(candidate).correct for candidate in words[:2]):
            continue

        print("FOUND IT!")
        # print the value of the key and plaintext
        print("The key is: " + str(key) + "\nPlaintext: " + plaintext)
        # no need to check other keys
        break
예제 #14
0
                body[url] += ' ' + div.text
            except:
                continue

    print(
        f'Слов для проверки на странице {url}: {len(get_words(body[url]))}\n')

# #### Verification

# For every page in `body`, collect the words the speller rejects, together
# with the offered variants and the number of occurrences, then print them.
for url in body:

    errors = {}
    print(url)
    for clean_word in get_words(body[url]):
        try:
            check = Word(clean_word)
            if not check.correct:
                if clean_word in errors:
                    errors[clean_word]['count'] += 1
                else:
                    # Build the record in one step so that a failure while
                    # reading `variants` cannot leave a half-filled entry.
                    errors[clean_word] = {
                        'variants': check.variants,
                        'count': 1,
                    }
        except Exception as e:
            # Best effort: report the failing word and carry on.
            print(f'Что-то пошло не так: {e}, слово: {clean_word}')
            continue

    pprint(errors)
    print('\n')

browser.quit()