def spellcheckword(word):
    """Return the first suggested variant for ``word``, or ``word`` itself.

    :param word: the single word to check
    :return: ``word`` unchanged when the checker reports it as correct or
        offers no variants; otherwise the first entry of ``variants``
    """
    check = Word(word)
    if check.correct:
        return word
    variants = check.variants
    # An empty or missing variant list means there is nothing to substitute.
    return variants[0] if variants else word
def spell_word(word: str):
    """Return a spell-corrected form of ``word``, or ``word`` on any failure.

    The word is only sent to the spell checker when the POS of its first
    morphological parse is not one of ``PREP``, ``CONJ``, ``PRCL``, ``INTJ``,
    ``NPRO`` and it is longer than two characters.

    :param word: the word to check
    :return: ``word`` itself when it is skipped, reported correct, has no
        usable suggestion, or when any step raises; otherwise the checker's
        ``spellsafe`` suggestion
    """
    try:
        pos = morph.parse(word)[0].tag.POS
        if pos not in ('PREP', 'CONJ', 'PRCL', 'INTJ', 'NPRO') and len(word) > 2:
            checked = Word(word)
            if checked.correct:
                return word
            # Never hand back an empty/None suggestion in place of the word.
            return checked.spellsafe or word
        return word
    except Exception:
        # Best effort: any parsing or lookup problem leaves the word as-is.
        # ``Exception`` (not a bare except) lets KeyboardInterrupt through.
        return word
def speller(text):
    """Replace misspelt words in ``text`` with the checker's safe suggestion.

    The text is split on single spaces; every distinct token of at least two
    characters is checked once. Corrections are applied token by token, so
    all of them take effect and a correction never rewrites part of another
    word. Spacing of the input is preserved.

    :param text: the text to correct
    :return: the corrected text; the input is returned unchanged if it is
        not a string, and partially corrected if a lookup fails midway
    """
    try:
        tokens = text.split(" ")
    except AttributeError:
        # Non-string input is passed straight through.
        return text

    corrections = {}
    try:
        for word in set(tokens):
            if len(word) >= 2:
                check = Word(word)
                if not check.correct and check.spellsafe:
                    corrections[word] = check.spellsafe
    except Exception:
        # Best effort: keep the corrections gathered before the failure.
        pass

    return " ".join(corrections.get(token, token) for token in tokens)
def check_text(word_list):
    """Print a report for every word in ``word_list`` reported as incorrect.

    For each such word an ``Unrecognized word`` line is printed, followed by
    either the ``spellsafe`` value (when it is ``None``) or the list of
    variants.

    :param word_list: iterable of single words to check
    :return: None
    """
    for word in word_list:
        check = Word(word)
        if check.correct:
            continue
        print('Unrecognized word: ' + word)
        suggestion = check.spellsafe
        if suggestion is None:
            # NOTE(review): this prints the literal "None"; confirm whether
            # the variants were meant to be shown here instead.
            print('Suggested edits: ' + str(suggestion))
        else:
            print('Did you mean: ' + str(check.variants))
def correct(txt):
    """Return ``txt`` with each misspelt word replaced by its first variant.

    The text is split on single spaces, so the original spacing is kept.
    A word is left unchanged when it is reported correct or when reading
    its variants fails (for example, when the variant list is empty).

    :param txt: the text to correct
    :return: the corrected text
    """
    fixed = []
    for token in txt.split(" "):
        check = Word(token)
        try:
            fixed.append(token if check.correct else check.variants[0])
        except Exception:
            # No usable variant: keep the word as written.
            fixed.append(token)
    return " ".join(fixed)
def automatic_typo_correction(text):
    """
    Automatically fix typos in a text using pyaspeller.

    :param text: string containing the text
    :return: string after processing; every word is followed by one space,
        so a non-empty result ends with a trailing space
    """
    pieces = []
    for token in text.split():
        suggestion = Word(token).spellsafe
        # Fall back to the original token when there is no suggestion.
        pieces.append((suggestion if suggestion else token) + " ")
    return "".join(pieces)
def clean_typos(self, text):
    """Return ``text`` with typos replaced, consulting ``check_dict`` first.

    Each whitespace-separated token is looked up via ``self.check_dict``; a
    truthy result is used directly. Otherwise the token is spell-checked,
    replaced by its first variant when it is incorrect and variants exist,
    and the outcome is recorded with ``self.update_dict``.

    :param text: the text to clean
    :return: the processed tokens joined by single spaces
    """
    cleaned = []
    for token in text.split():
        known = self.check_dict(token)
        if known:
            cleaned.append(known)
            continue

        checked = Word(token)
        if not checked.correct and len(checked.variants) != 0:
            replacement = checked.variants[0]
        else:
            replacement = token
        cleaned.append(replacement)
        self.update_dict(token, replacement)
    return ' '.join(cleaned)
def correctSpelling(line):
    """Return ``line`` with misspelt words replaced by a safe suggestion.

    Tokens containing any of the stop symbols (slashes, ``№``, ``:``,
    digits, en dash) are dropped from the result. Every other token is
    kept: replaced by its ``spellsafe`` suggestion (with hyphens turned
    into spaces) when it is reported incorrect and a suggestion exists, and
    left as written otherwise, including when the lookup itself fails.

    :param line: the text line to correct
    :return: the resulting tokens joined by single spaces
    """
    stop_sybols = (
        u'/', u'\\', u'№', u':', u'1', u'2', u'3', u'4', u'5', u'6', u'7',
        u'8', u'9', u'0', u'–'
    )
    hyphen_to_space = {ord(u"-"): " "}

    checked_words = []
    for word in line.split():
        if any(st in word for st in stop_sybols):
            continue
        try:
            check = Word(word)
            suggestion = None if check.correct else check.spellsafe
        except Exception:
            # A failed lookup must not silently delete the word.
            suggestion = None
        if suggestion:
            checked_words.append(suggestion.translate(hyphen_to_space))
        else:
            checked_words.append(word)
    return " ".join(checked_words)
def spellchecking(cleantweet):
    """Spell-correct and normalise every word of ``cleantweet``.

    Each whitespace-separated word reported as incorrect is replaced by its
    first variant when variants exist (the word and its variants are
    printed). Every word is then replaced by the ``normal_form`` of its
    first pymorphy2 parse.

    :param cleantweet: the text to process
    :return: the normalised words joined by single spaces
    """
    tokens = cleantweet.split()
    analyzer = pymorphy2.MorphAnalyzer()

    normalised = []
    for token in tokens:
        checked = Word(token)
        if not checked.correct and len(checked.variants) != 0:
            print(token)
            print(checked.variants)
            token = checked.variants[0]
        normalised.append(str(analyzer.parse(token)[0].normal_form))
    return " ".join(normalised)
def check_word(one_word):
    """Print a short report when ``one_word`` is reported as incorrect.

    :param one_word: the single word to check
    :return: None
    """
    check = Word(one_word)
    if not check.correct:
        print('Incorrect word: ' + one_word)
        # ``variants`` is a list, so it must be converted before being
        # concatenated to the message.
        print('Did you mean: ' + str(check.variants))
# NOTE(review): ``f`` is not opened in the visible part of this script;
# presumably an enclosing ``with open(...) as f:`` precedes it. Confirm.
raw = [line.rstrip() for line in f.readlines()]

with open('./by_wd/test.norm', 'r') as f:
    gold_norm = [line.rstrip() for line in f.readlines()]

with open('./by_wd/{}'.format(run), 'r') as f:
    csmt_norm = [line.rstrip() for line in f.readlines()]

# Write one TSV row per word position: raw, gold, csmt, spell-checked csmt.
# The context manager closes the output file even if a lookup raises.
with open('./0_results/aligned_{}.tsv'.format(run), 'w') as w:
    # ``zip`` captures the ``raw`` list before the loop rebinds the name.
    for raw, gold, csmt in zip(raw, gold_norm, csmt_norm):
        raw_words = raw.split()
        gold_words = gold.split()
        csmt_words = csmt.split()

        # Use the safe suggestion where one exists, else the word itself.
        csmt_spellcheck = []
        for word in csmt_words:
            sp = Word(word)
            checked = sp.spellsafe
            if checked:
                csmt_spellcheck.append(checked)
            else:
                csmt_spellcheck.append(word)

        # if len(gold_words) == len(csmt_words):
        # Shorter columns are padded with None, which is written as "None".
        for line in itertools.zip_longest(raw_words, gold_words, csmt_words,
                                          csmt_spellcheck):
            w.write('{}\t{}\t{}\t{}\n'.format(*line))
from pyaspeller import Word # f = open('./diff_sizes/norm.txt', 'r') # # for line in f: # for word in line.rstrip().split(): # sp = Word(word) # checked = sp.spellsafe # if checked: # print(word) for i in range(10): sp = Word('кашаю') checked = sp.spellsafe print(checked)
def main():
    """Brute-force a shift cipher over a hard-coded ciphertext.

    Every shift from 0 to 25 is applied to the upper-case letters of the
    ciphertext (other characters are copied unchanged). The first two words
    of each candidate plaintext are spell-checked; the key and plaintext are
    printed, and the search stops, at the first shift whose first word is
    not reported as incorrect.
    """
    # the ciphertext
    ciphertext = (
        'RQH YDULDWLRQ WR WKH VWDQGDUG FDHVDU FLSKHU LV ZKHQ WKH DOSKDEHW LV "NHBHG" EB XVLQJ D ZRUG. LQ WKH '
        'WUDGLWLRQDO YDULHWB, RQH FRXOG ZULWH WKH DOSKDEHW RQ WZR VWULSV DQG MXVW PDWFK XS WKH VWULSV DIWHU '
        'VOLGLQJ WKH ERWWRP VWULS WR WKH OHIW RU ULJKW. WR HQFRGH, BRX ZRXOG ILQG D OHWWHU LQ WKH WRS URZ DQG '
        'VXEVWLWXWH LW IRU WKH OHWWHU LQ WKH ERWWRP URZ. IRU D NHBHG YHUVLRQ, RQH ZRXOG QRW XVH D VWDQGDUG '
        'DOSKDEHW, EXW ZRXOG ILUVW ZULWH D ZRUG (RPLWWLQJ GXSOLFDWHG OHWWHUV) DQG WKHQ ZULWH WKH UHPDLQLQJ OHWWHUV '
        'RI WKH DOSKDEHW. IRU WKH HADPSOH EHORZ, L XVHG D NHB RI "UXPNLQ.FRP" DQG BRX ZLOO VHH WKDW WKH SHULRG LV '
        'UHPRYHG EHFDXVH LW LV QRW D OHWWHU. BRX ZLOO DOVR QRWLFH WKH VHFRQG "P" LV QRW LQFOXGHG EHFDXVH WKHUH ZDV '
        'DQ P DOUHDGB DQG BRX FDQ\'W KDYH GXSOLFDWHV. '
    )

    # the alphabet doubles as the list of possible keys (its indices)
    alphabet = ''.join(chr(code).upper() for code in range(97, 123))
    # check to see if alphabet populates correctly
    print(alphabet)

    size = len(alphabet)
    # start decrypting: try every shift 0 - 25
    for key in range(size):
        decoded = []
        for symbol in ciphertext:
            if symbol in alphabet:
                # shift back by `key`, wrapping around past A to Z
                decoded.append(alphabet[(alphabet.find(symbol) - key) % size])
            else:
                # not a letter (e.g. white space): copy unchanged
                decoded.append(symbol)
        plaintext = ''.join(decoded)

        # split the plaintext to try identify english words
        words = re.split("[^a-zA-Z]+", plaintext)
        # look at the first two words in the string
        check1 = Word(words[0])
        check2 = Word(words[1])

        # NOTE(review): the source's indentation was lost; this `else` is
        # read as pairing with the outer `if`. Confirm against the
        # original layout.
        if check1.correct is False:
            if check2.correct is False:
                # neither word is in the dictionary
                continue
        else:
            print("FOUND IT!")
            # print the value of the key and plaintext
            print("The key is: " + str(key) + "\nPlaintext: " + plaintext)
            # no need to check other keys
            break
body[url] += ' ' + div.text except: continue print( f'Слов для проверки на странице {url}: {len(get_words(body[url]))}\n') # #### Проверка for url in body: errors = {} print(url) for clean_word in get_words(body[url]): try: check = Word(clean_word) if check.correct == False: if clean_word not in errors: errors[clean_word] = {} errors[clean_word]['variants'] = check.variants errors[clean_word]['count'] = 1 else: errors[clean_word]['count'] += 1 except Exception as e: print(f'Что-то пошло не так: {e}, слово: {clean_word}') continue pprint(errors) print('\n') browser.quit()