def __init__(self, dictionary_path):
    """Create the instance's spell checker from a custom dictionary file.

    Args:
        dictionary_path: Value forwarded to ``SpellChecker`` as its
            ``local_dictionary`` argument.
    """
    # Function-scope import: the package is resolved when an instance is built.
    from spellchecker import SpellChecker

    checker = SpellChecker(local_dictionary=dictionary_path)
    self.spell = checker
def predict_for_sentence(sentences, wordList):
    """Correct a batch of sentences with the GECToR ensemble.

    Intended to be used as a backend endpoint.

    Args:
        sentences: Iterable of sentence strings; each one is split on
            whitespace before being handed to the model.
        wordList: Extra words added to the spell checker's vocabulary.

    Returns:
        A tuple ``(correctList, notes, dics, cnt)``:
        the corrected sentences, a list of note strings describing the kinds
        of edits that were applied, the dict returned by ``handle_batch``
        (its ``'Spell'`` entry de-duplicated, and created empty when absent),
        and the count returned by ``handle_batch``.
    """
    model_path = [
        'E://pycharm//gector//models//pretrained_gectors//xlnet_0_gector.th',
        'E://pycharm//gector//models//pretrained_gectors//bert_0_gector.th',
        'E://pycharm//gector//models//pretrained_gectors//roberta_1_gector.th',
    ]
    vocab_path = 'E://pycharm//gector//data//output_vocabulary'
    model = GecBERTModel(vocab_path=vocab_path,
                         model_paths=model_path,
                         max_len=50,
                         min_len=3,
                         iterations=5,
                         min_error_probability=0.0,
                         min_probability=0.0,
                         lowercase_tokens=0,
                         model_name='xlnet',
                         special_tokens_fix=0,
                         log=False,
                         confidence=0,
                         is_ensemble=1,
                         )

    spell = SpellChecker()
    for word in wordList:
        spell.word_frequency.add(word)

    batch = [sentence.split() for sentence in sentences]

    st = time.time()
    preds, cnt, labels, dics = model.handle_batch(batch, spell)
    error_labels = set(labels)
    ed = time.time()

    # Join each prediction exactly once (the previous code re-joined the
    # whole batch twice per sentence).
    correctList = []
    for tokens in preds:
        corr = " ".join(tokens)
        print("after correct: ", corr)
        print("correct errors: ", cnt)
        correctList.append(corr)
    print(f'inference time: {ed - st}')

    notes = set()
    for label in error_labels:
        if label.startswith('$REPLACE'):
            notes.add("替换")
        elif label.startswith('$DELETE'):
            notes.add("删除")
        elif label.startswith('$APPEND'):
            notes.add("插入")
        elif label.startswith('$TRANSFORM'):
            # partition() yields '' when there is no '_' instead of raising
            # IndexError the way split('_', 1)[1] did.
            detail = label.partition('_')[2]
            if detail.startswith('VERB'):
                notes.add("动词形式有误")
            elif detail.startswith('AGREEMENT'):
                notes.add("请注意单复数问题")
            elif detail.startswith('CASE'):
                notes.add("注意大小写")

    if 'Spell' in dics:
        notes.add('拼写')
    for note in notes:
        print(note)

    # The 'Spell' entry may be missing (see the membership test above), so
    # read it defensively; callers always get a list back.
    dics['Spell'] = list(set(dics.get('Spell', [])))
    print(dics)

    ed1 = time.time()
    print(f'total time: {ed1 - st}')
    return correctList, list(notes), dics, cnt
def __init__(self):
    """Set up a case-insensitive spell checker fed from the local word list."""
    self.spell = checker = SpellChecker(language=None, case_sensitive=False)
    vocabulary = checker.word_frequency
    vocabulary.load_text_file('./varietal_dictionary.txt')
def specll_check(self, text):
    """Return *text* with each word replaced by its most likely spelling.

    The text is tokenised with ``SpellChecker.split_words`` and re-joined
    with single spaces.

    Args:
        text: The string to correct.

    Returns:
        The corrected words joined by spaces.
    """
    spell = SpellChecker()
    words = spell.split_words(text)
    # correction() may return None when it has no candidate; keep the
    # original word in that case so join() never receives a non-string.
    return " ".join(spell.correction(word) or word for word in words)
def runSpellChecker(word):
    """Return the most likely spelling of *word*.

    Args:
        word: A single word to check.

    Returns:
        The suggested correction when the checker does not know *word*,
        otherwise *word* itself. *word* is also returned when the checker
        has no suggestion, so the result is always a string.
    """
    spell = SpellChecker()
    if spell.unknown([word]):
        # correction() may return None when nothing close enough is found.
        return spell.correction(word) or word
    return word
from spellchecker import SpellChecker from scrap import update_vault_list from ocr import OcrCheck from db_operations import relic_from_screen_overwrite import numpy as np import cv2 import requests # Initialize ################################################################################## v_relic_list = update_vault_list() # Define reference file for Spellchecking spell_check = SpellChecker(distance=1) spell_check.word_frequency.load_text_file('ref/other_ref/ref_words.txt') # Define references files to use for Warframe Data Era_file = 'ref/other_ref/ref_era.txt' Lith_file = 'ref/other_ref/ref_lith.txt' Meso_file = 'ref/other_ref/ref_meso.txt' Neo_file = 'ref/other_ref/ref_neo.txt' Axi_file = 'ref/other_ref/ref_axi.txt' Quality_file = 'ref/other_ref/ref_quality.txt' Ressources_file = 'ref/other_ref/ref_ressources.txt' # Parse references files to lists def parse_ref_files(file): ref_list = [] with open(file, "r") as fileHandler: for line in fileHandler:
def get_text_layer(original_image):
    """Detect text regions with EAST, OCR them and spell-correct the result.

    Reads the module-level ``args`` mapping for ``"width"``, ``"height"``
    (the size the image is resized to for detection) and ``"east"`` (the
    path of the pre-trained EAST model).

    Args:
        original_image: Image array (assumes the layout OpenCV expects —
            TODO confirm against the caller).

    Returns:
        A tuple ``(text_list, orig_image, blank_img, blank_img2)``:
        the corrected word for every detected box, a copy of the input with
        boxes and text drawn on it, a white canvas holding only the detected
        regions, and a white canvas holding only the recognised text.
    """
    alphabets = set('abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    spell = SpellChecker()

    # Two white canvases with the same shape as the input.
    blank_img = np.full(original_image.shape, 255, dtype=np.uint8)
    blank_img2 = np.full(original_image.shape, 255, dtype=np.uint8)

    # Keep an untouched copy of the image and remember its size.
    orig = original_image.copy()
    (origH, origW) = original_image.shape[:2]

    # Target size for the detector comes from the args dictionary.
    (newW, newH) = (args["width"], args["height"])

    # Ratio between original and resized image, used to translate bounding
    # boxes back onto the original image.
    rW = origW / float(newW)
    rH = origH / float(newH)

    # Resize the original image to the new dimensions.
    image = cv2.resize(original_image, (newW, newH))
    (H, W) = image.shape[:2]

    # Construct a blob from the image to forward pass it through EAST.
    blob = cv2.dnn.blobFromImage(image, 1.0, (W, H),
                                 (123.68, 116.78, 103.94),
                                 swapRB=True, crop=False)

    # Load the pre-trained EAST model for text detection.
    net = cv2.dnn.readNet(args["east"])

    # The two output layers that need to be pulled from the EAST model.
    layerNames = ["feature_fusion/Conv_7/Sigmoid", "feature_fusion/concat_3"]

    # Forward pass the blob to get the desired output layers.
    net.setInput(blob)
    (scores, geometry) = net.forward(layerNames)

    # Find predictions and apply non-maxima suppression.
    (boxes, confidence_val) = predictions(scores, geometry)
    boxes = non_max_suppression(np.array(boxes), probs=confidence_val)

    # Tesseract configuration; identical for every box, so built once.
    configuration = ("-l eng --oem 1 --psm 8")

    results = []
    for (startX, startY, endX, endY) in boxes:
        # Scale the coordinates back to the original image.
        startX = int(startX * rW)
        startY = int(startY * rH)
        endX = int(endX * rW)
        endY = int(endY * rH)

        # Extract the region of interest and copy it onto the blank canvas.
        r = orig[startY:endY, startX:endX]
        blank_img[startY:endY, startX:endX] = orig[startY:endY, startX:endX]

        # Recognise the text inside the bounding box.
        text = pytesseract.image_to_string(r, config=configuration)
        results.append(((startX, startY, endX, endY), text))

    # Draw the boxes and the recognised text.
    orig_image = orig.copy()
    text_list = []
    for ((start_X, start_Y, end_X, end_Y), text) in results:
        # Keep letters and spaces only, then spell-correct.
        misspelled_word = ''.join(filter(alphabets.__contains__, text))
        # correction() may return None when it has no candidate; fall back
        # to the OCR text so the ASCII filter below never iterates None.
        final_word = spell.correction(misspelled_word) or misspelled_word
        text_list.append(final_word)

        text = "".join([x if ord(x) < 128 else "" for x in final_word]).strip()
        cv2.rectangle(orig_image, (start_X, start_Y), (end_X, end_Y),
                      (0, 0, 255), 1)
        cv2.putText(orig_image, text, (start_X, start_Y - 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 1)
        cv2.putText(blank_img2, text, (start_X, start_Y - 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 1)

    return text_list, orig_image, blank_img, blank_img2
def load_spell_check(self):
    """Create ``self.spell_check`` on first use; do nothing if it exists."""
    if self.spell_check is None:
        self.spell_check = SpellChecker()
def expand_query(query):
    '''
    Attempts to expand the given query by using synonyms from WordNet.

    As a consequence of this process, the query is also tokenized and
    lemmatized.

    Returns a single string: the kept tokens followed by the collected
    synonyms, joined by spaces.
    '''
    spell = SpellChecker(local_dictionary='chatbot/nlp/statics/no_50k.json')

    # Tokenize, tag and filter query using Spacy
    tokens = [
        # Store both token text and POS tag
        (token.text, token.pos_)
        for token in nb(query)
        # Filter away punctuation.
        # NOTE: this is a substring test against string.punctuation.
        if token.text not in string.punctuation
    ]

    # Add possible spelling corrections, without duplicates.
    # We also want to keep the original token, since the detected misspelling
    # might be intentional - power to the user!
    # NOTE(review): spell.correction() may return None for a word with no
    # candidate, which would then be passed to lemmatize() — confirm.
    tokens += [
        (spell.correction(token[0]), token[1])
        for token in tokens
        if not spell.correction(token[0]) in [token[0] for token in tokens]
    ]

    # Lemmatize tokens
    # NOTE(review): the second tuple element stored here is token[0] (the
    # token text), not token[1] (the POS tag), so the getattr(wn, token[1])
    # lookup further down is keyed by the token text. Confirm whether the
    # POS tag was intended.
    tokens = [
        (lemmatize(token[0], token[1])[0], token[0])
        for token in tokens
    ]

    # Filter away stopwords as we do not want to expand them.
    # NOTE(review): each token is a tuple at this point; this only filters
    # anything if get_stopwords() contains tuples — verify.
    tokens = [token for token in tokens if token not in get_stopwords()]

    # Store synonyms in a set, so duplicates are not added multiple times.
    synonyms = set()

    # The tokens in the expanded query.
    result = []

    for token in tokens:
        # Convert POS tags from Spacy to WordNet (None when wn has no
        # attribute of that name).
        pos = getattr(wn, token[1], None)

        # Find all synsets for the word, using the Norwegian language.
        synsets = wn.synsets(token[0], lang='nob', pos=pos)

        # Get a custom synset wrapper.
        custom_synsets = SynsetWrapper.get_instance()

        # Get the synset for this token.
        custom_synset = custom_synsets.get_synset(token[0])

        if custom_synset:
            # Remove the token itself to avoid duplication.
            # NOTE(review): this mutates the object returned by get_synset();
            # confirm it is not shared between calls.
            custom_synset.remove(token[0])
            synonyms.update(custom_synset)

        if synsets:
            for synset in synsets:
                # Find all lemmas in the synset.
                for name in synset.lemma_names(lang='nob'):
                    # Some lemmas contain underscores, which we replace
                    # with spaces.
                    synonyms.add(name.replace('_', ' '))

            # If we found synonyms, we only add the synonyms. This is because
            # the original word is already included in the synset, so this
            # avoids adding it to the result list twice.
            continue

        # Add the original token to the full query.
        result.append(token[0])

    # Add custom synset to the query
    result += synonyms

    return ' '.join(result)
""" import string import typing as t from datetime import date from enum import Enum, IntEnum from pathlib import Path from typing import List, Optional from pyaml import yaml from pydantic import BaseModel, EmailStr, HttpUrl, ValidationError from pydantic.color import Color from spellchecker import SpellChecker from typing_extensions import TypedDict GLOBAL_CHECKER = SpellChecker() class VersionNumber(IntEnum): """ Contains the different possible versions of `PortfolioEntry` items. """ version_0 = 0 class EntrySize(str, Enum): """ Describes the scope of a portfolio item. Note, this will have a direct impact on the way that the piece of media is displayed to reader when rendered in the portfolio. `large` items will be visually larger than smaller sized items.
import numpy as np
import pandas as pd
import re
import demoji
from spellchecker import SpellChecker
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from bs4 import BeautifulSoup
from city_state_dict import city_to_state_dict

demoji.download_codes()

# Initializes the spell checker, tokenizer and lemmatizer.
check = SpellChecker()
tokenizer = RegexpTokenizer(r'\w+')
lemma = WordNetLemmatizer()

# Create a set of stopwords
stop_words = set(stopwords.words('english'))

# Reads in the data.
data = pd.read_json('data.json')

########################### FUNCTIONS ###################################


def correct_text(text):
    """Spell-correct a token list and return the unique, known words.

    Args:
        text: A list of clean word tokens without other characters. As
            before, it is updated in place with the corrections.

    Returns:
        A list (in arbitrary order) of the distinct tokens after correction,
        with tokens that are still unknown to the checker removed.
    """
    misspelled = check.unknown(text)
    # unknown() reports lowercase words, so compare case-insensitively.
    # The previous text.index(word) lookup raised ValueError for tokens that
    # were not already lowercase and only fixed the first occurrence.
    for position, word in enumerate(text):
        if word.lower() in misspelled:
            fixed = check.correction(word)
            # correction() may return None when it has no candidate; leave
            # the token alone so it is dropped by the filter below.
            if fixed:
                text[position] = fixed
    return [word for word in set(text) if word.lower() not in misspelled]
def transform(self, X: dt.Frame):
    """Apply ``self.correction`` to every value of the first column of *X*.

    The frame is converted to pandas and cast to ``str`` first; a fresh
    ``SpellChecker`` is stored on ``self.spell`` before the values are
    processed.
    """
    from spellchecker import SpellChecker

    self.spell = SpellChecker()
    first_column = X.to_pandas().astype(str).iloc[:, 0]
    return first_column.apply(self.correction)
from autocorrect import Speller
from nltk import word_tokenize
import unidecode
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from emoticons_list import EMOTICONS
from emoticons_list import EMO_UNICODE
from string import punctuation
from nltk.corpus import stopwords
import spacy
import gensim
from collections import Counter
from contraction import CONTRACTION_MAP
from progress.bar import Bar

# NOTE(review): neither SpellChecker nor re is imported in the lines visible
# here (only autocorrect.Speller is) — confirm they are imported earlier in
# the file.
spell_corrector = SpellChecker()

# expanding contractions
contraction_mapping = CONTRACTION_MAP
# one alternation pattern built from all keys of contraction_mapping
contractions_pattern = re.compile('({})'.format('|'.join(
    contraction_mapping.keys())), flags=re.IGNORECASE | re.DOTALL)
# initialize porter stemmer object
stemmer = PorterStemmer()
# initialize lemmatizer object
lemma = WordNetLemmatizer()
# invert EMO_UNICODE so its values become the keys
UNICODE_EMO = {v: k for k, v in EMO_UNICODE.items()}
# list of stopwords from nltk
stopwords_nltk = list(stopwords.words('english'))
def test_words(self):
    ''' test the parsing of words '''
    checker = SpellChecker(language='en')
    expected = ['this', 'is', 'a', 'test', 'of', 'this']
    parsed = checker.split_words('This is a test of this')
    self.assertEqual(parsed, expected)
'DN': ['der', 'die', 'den', 'dem','einen', 'eine', 'einem', 'einer'] } ''' artikels = { 'NM': ['der', 'ein'], 'NF': ['die', 'eine'], 'NN': ['das', 'ein'], 'AM': ['den', 'einen'], 'AF': ['die', 'eine'], 'AN': ['das', 'ein'], 'DM': ['dem', 'einem'], 'DF': ['der', 'einer'], 'DN': ['dem', 'einem'] } spell = SpellChecker(language='de') def check_spell(doc): #SpellChecker words = [token.text for token in doc] misspelled = spell.unknown(words) misspelled errors = [] if misspelled: for misspell in misspelled: correct = spell.correction(misspell) tip = spell.candidates(misspell) error = {
from nltk import word_tokenize
from nltk.corpus import stopwords
import pandas as pn
import numpy as np
import nltk
from nltk.stem.snowball import SpanishStemmer
from spellchecker import SpellChecker
import spacy
import time

nlp = spacy.load('es_core_news_sm')
stoplist = stopwords.words('spanish')
spanishStem = SpanishStemmer('spanish')
spell = SpellChecker(language='es')


def Lematizar(preguntas):
    """Lemmatize column 1 of every row of a question/answer matrix in place.

    Each lemma is followed by a single space. The elapsed time is printed
    and the same (mutated) matrix is returned.
    """
    inicio = time.time()
    total_filas = preguntas.shape[0]
    for fila in range(total_filas):
        lemas = [token.lemma_ + ' ' for token in nlp(preguntas[fila][1])]
        preguntas[fila][1] = ''.join(lemas)
    print('Elapsed in lematizar: ', (time.time() - inicio))
    return preguntas


def LematizarOracion(sentence):
    """Return the lemmas of a sentence string, each followed by a space."""
    return ''.join(token.lemma_ + ' ' for token in nlp(sentence))
a = len(pd_speeches.speaker.unique().tolist()) #assign most similar correct name to speaker in scanned transcripts (takes a bit): pd_speeches.loc[((pd_speeches.session < 42) | (pd_speeches.session ==92)) & (pd_speeches.wp == 15), 'speaker'] = pd_speeches.loc[((pd_speeches.session < 42) | (pd_speeches.session ==92)) & (pd_speeches.wp == 15), 'speaker'].apply(lambda x: difflib.get_close_matches(x, names_right, n=1)[0] if difflib.get_close_matches(x, names_right, n=1) else x) b = len(pd_speeches.speaker.unique().tolist()) print('previous n of distinct speakers: {}; after managing typos: {}'.format(a,b)) #discontinued; not that easy to solve if False: #spellchecker from spellchecker import SpellChecker import string german = SpellChecker(language='de') # remove punct from text; create list of words and find those misspelled t_np = t.translate(str.maketrans('', '', string.punctuation)) w = [w for w in t_np.split(' ') if w != ''] misspelled = german.unknown(w) correct = [] misp = [] for word in misspelled: # Get the one `most likely` answer print(german.correction(word), word) correct.append(german.correction(word)) misp.append(word) # implement algorithm that substitutes word (something like this)
def __init__(self, parser, indexer, model=None):
    """Store the collaborators and create the ranker and spell checker.

    Args:
        parser: Stored as ``self._parser``.
        indexer: Stored as ``self._indexer``.
        model: Optional object stored as ``self._model``.
    """
    self._parser, self._indexer = parser, indexer
    self._ranker = Ranker()
    self._model = model
    self.spell = SpellChecker()
def is_spelling_correct(word_list):
    """Return True when the spell checker knows every word in *word_list*.

    A warning is logged through the logger held in ``g_vars`` when at least
    one word is unknown.
    """
    unknown_words = SpellChecker().unknown(word_list)
    if not unknown_words:
        return True
    g_vars.get('logger').warning('Spell Check failed for {}'.format(word_list))
    return False
from nltk.corpus import wordnet from spellchecker import SpellChecker dictionary = SpellChecker() def get_suggestions(word): candidates = dictionary.candidates(word) candidates = [w for w in candidates if wordnet.synsets(w)] return candidates def web_get_records(word): resp = "" syn = wordnet.synsets(word) if not syn: return None dform = { "n": "noun", "v": "verb", "a": "adjective", "r": "adverb", "s": "adjective satellite", } ctr1 = 1 ctr2 = 97 for i in syn[:10]: ctr2 = 97 definition, examples, form = i.definition(), i.examples(), i.pos() resp = resp + str(ctr1) + "." + "\n"
from django.contrib import messages
from django.utils import timezone
from django.db.models import Sum
from .models import Grocery_List
from colorama import init, Fore
from operator import itemgetter
import numpy as np
import logging
import math
import os
import re

# ______INITIALIZATIONS________
init(autoreset=True)  # Colorama for printing colored text in the terminal
dirpath = os.getcwd()  # path of the project
# NOTE(review): the paths below are built with literal backslashes and from
# the current working directory — presumably only valid when started from
# the project root on Windows; confirm.
spellchecker = SpellChecker(local_dictionary=dirpath + \
    "\\INEZ\\static\\INEZ\\json\\spellings.json", case_sensitive=True)  # Spellchecker

# loading the ~14000 products from cache if possible, else loading them from
# file and caching them
if cache.get("products") is not None:
    products = cache.get("products")
else:
    products = {}
    # Each line is split on "|": field 0 is the key, field 1 has "," replaced
    # by ".", field 2 has its newline removed.
    with open(os.getcwd() + "\\INEZ\\products.txt", encoding="utf-8-sig") as f:
        for line in f:
            values = line.split("|")
            products[values[0]] = [
                values[1].replace(",", "."),
                values[2].replace("\n", "")
            ]
    print(Fore.GREEN + 'Loaded %s products\n' % len(products))
    # Third argument is the cache timeout: 60 * 60.
    cache.set("products", products, (60 * 60))
def main():
    """Turn the OCR'd NY Mercantile Library 1825 catalog into a CSV of entries.

    Reads ``extracted_ocr\\NY_Mercantile_Lib_1825.html`` (relative to the
    parent of this file's directory), groups the ``word`` elements of every
    ``page`` into lines, merges continuation lines and "do" (ditto) lines
    into full entries, and writes one cleaned entry per row to
    ``replication\\NY_Mercantile_Lib_1825.csv``.
    """
    home_directory = os.path.dirname(os.path.dirname(
        os.path.abspath(__file__)))
    with open(
            os.path.join(home_directory,
                         'extracted_ocr\\NY_Mercantile_Lib_1825.html'),
            'r') as f:
        contents = f.read()
    file = html.unescape(bs(contents, 'lxml'))

    # Build the dictionary once. The hyphenation check below previously
    # constructed two SpellChecker objects for every continuation line.
    dictionary = SpellChecker()

    catalog = []
    final_catalog = []
    sum_of_heights = 0
    # Counting the words that go into the denominator for average word height
    counted_words = 0

    for page in file.find_all('page'):
        words = page.find_all('word')
        if len(words) == 0:
            continue
        words.sort(key=lambda x: float(x.get('xmin')))
        # First we sorted all of the words on the page from furthest left to
        # furthest right. Now we sort them all again by highest to lowest.
        # Assuming one column of text per page, this does an excellent job of
        # putting all of the words in normal reading order. Will need a
        # different method for catalogs with two columns per page.
        words.sort(key=lambda x: float(x.get('ymin')))
        old_line_y = float(words[0].get('ymin'))
        line = []
        for word in words:
            # Ignoring empty "words." Not sure where they're coming from; this
            # started happening when I switched to ASCII encoding from the
            # problematic-for-other-reasons UTF-8
            if word.text == '':
                continue
            # Encoding error that's pervasive in NY Mercantile 1825
            # NOTE(review): the two literals look identical here; they may
            # differ by a non-printing character — verify in the file.
            if word.text == 'â€' or word.text == 'â€':
                continue
            # Ignoring single lower case letters. These tend to be OCR
            # artifacts that aren't useful
            if re.match("[a-z]{1}$", word.text):
                continue
            # Ignoring et cetera
            if word.text == '&c':
                continue
            # Ignoring things in all caps because they're typically headers.
            # Even the catalogs where they're not necessarily (e.g., Ladies'
            # Lib of Kalamazoo), the OCR usually makes the all caps in
            # running text into normal text.
            if re.match("[A-Z]{2}", word.text):
                continue
            # Needed for Lib Co Boston 1830, where a lot of 1s got turned
            # into Is in the shelf number column
            # if word.text == 'I':
            #     continue
            # Ignoring random flecks on the page that get turned into
            # punctuation by OCR. Combined with number screen below.
            # if re.match("[_.,*:|'\"\^\-º]$", word.text):
            #     continue
            # Ignoring common column headings. "Mo." is a common OCR error
            # for an italicized "No."
            # For NY Mercantile 1825 we need "Vol" to identify contents of
            # multi-volume sets.
            # if re.match("Vol", word.text) or
            if re.match("No.", word.text) or re.match(
                    "Mo.", word.text) or re.match(
                        "Shelf", word.text) or re.match("Size", word.text):
                continue
            # Ignoring page numbers -- 1, 2, or 3-digit numbers not followed
            # by "nd", "rd", "th", etc. Can also adjust to ignore shelf
            # numbers when needed (e.g., NY Mechanics 1844). And a tweak to
            # ignore OCR-eaten numbers.
            # I swear that not all of this punctuation is in the ASCII code
            # space, but I've seen all of it in the OCR extracted by
            # pdftotext with ASCII7 encoding....
            # if re.match("[0-9 =%_.,*:#|'\"\^\-º•§]{1,4}$", word.text) or re.search("[0-9]{3,4}", word.text):
            # (raw string: same pattern, without invalid-escape warnings)
            if re.match(r"[ —=%_.,:#|'\"\^\-º•§]{1,4}$", word.text):
                # if re.match("[0-9]{1,4}$", word.text):
                continue
            counted_words += 1
            sum_of_heights = sum_of_heights + (float(word.get('ymax')) -
                                               float(word.get('ymin')))
            line_y = float(word.get('ymin'))
            # We know we're on a new line of text when the ymin increases by
            # more than the threshold below. 7 pixels was selected
            # empirically based on the first several catalogs processed and
            # may be too large for very small-type catalogs. It was raised to
            # 8 for Milwaukee YMA; 12 was needed for the non-straight lines
            # in Charleston Lib Co.
            if (line_y - 8) > old_line_y:
                old_line_y = line_y
                if line:
                    line.sort(key=lambda x: float(x.get('xmin')))
                    catalog.append(line)
                line = [word]
            else:
                line.append(word)
        # Append the last one on the page
        if len(line) > 0:
            line.sort(key=lambda x: float(x.get('xmin')))
            catalog.append(line)

        # Process the page, putting together split lines into single entries
        previous_line_xmin = None
        for entry in catalog:
            # Is it contents for a multi-volume set? We need to flag these
            # for removal, because the TOCs aren't in the HT metadata.
            # The length guard keeps one-word lines from raising IndexError.
            if len(entry) > 1 and re.match(
                    r'\s*Vols?\.\s*', entry[0].text) and re.match(
                        r'\s*\d{1,2}(\.|,)\s*', entry[1].text):
                entry[0].string = 'DELETE_ME'
            # Is the new line indented further than the old line? If so,
            # it needs some special handling.
            # If it's the first line on the page, the question is moot.
            if previous_line_xmin is None:
                indent = 0
            else:
                first_real_word = next(
                    (y for y in entry
                     if y.text.rstrip('.,?!').casefold() != "do"), None)
                if first_real_word:
                    this_line_xmin = float(first_real_word.get('xmin'))
                    indent = previous_line_xmin - this_line_xmin
                else:
                    continue
            # Allowing 10 pixels of slop to account for skewed scans etc.
            # Most indents are well over 10 pixels; could increase if needed.
            if (indent + 10) < 0 and (entry[0].string.rstrip('.,?!') == 'do'):
                # If this line is indented and not continuing the previous
                # line, we want to append to this line everything from the
                # previous line with an xmin smaller than the xmin of this
                # word. Since this will carry down all relevant information
                # from the words on the previous lines, this *should* work
                # even in catalogs with multiple levels of indents.
                # 'vocable' because 'word' was already taken in this script.
                # 10 pixels for slop again.
                for vocable in final_catalog[-1]:
                    if (float(vocable.get('xmin')) + 10) < this_line_xmin:
                        entry.append(vocable)
                # sort it again because we screwed up the sort appending more
                # words to it
                entry = sorted(entry, key=lambda x: float(x.get('xmin')))
            elif (indent + 10) < 0 and entry[0].string != "DELETE_ME":
                # If it's indented but the first word isn't "do", then it's
                # continuing the previous line.
                # If it's a TOC, don't append it to the previous line. We
                # want to keep the previous line if it's the title of the
                # series/set and only remove TOCs.
                # Try to reassemble hyphenated words. OCR process ate the
                # hyphens at the end of lines, so we have to guess if two
                # words go together or not. We'll assume that if the first
                # word on the second line is not capitalized and is not
                # recognized by the spellchecker then it should be
                # concatenated with the last word on the previous line.
                last_word_of_carryover = final_catalog[-1][-1].string
                first_word_of_line = entry[0].string
                if first_word_of_line and first_word_of_line != "DELETE_ME" and (
                        re.match("[A-Z]", first_word_of_line)
                        or re.match(r"[0-9 \-.,]+", first_word_of_line)
                        or (first_word_of_line.rstrip('.,?!').casefold()
                            in dictionary
                            and last_word_of_carryover.rstrip('.,?!').casefold()
                            in dictionary)):
                    final_catalog[-1] += entry
                    final_catalog_sorted = sorted(
                        final_catalog[-1], key=lambda x: float(x.get('xmin')))
                    previous_line_xmin = float(
                        final_catalog_sorted[0].get('xmin'))
                    entry = None
                else:
                    final_catalog[-1][-1].string = final_catalog[-1][
                        -1].text + entry[0].text
                    del entry[0]
                    final_catalog[-1] += entry
                    final_catalog_sorted = sorted(
                        final_catalog[-1], key=lambda x: float(x.get('xmin')))
                    previous_line_xmin = float(
                        final_catalog_sorted[0].get('xmin'))
                    entry = None
            # NOTE(review): entry[0] is an element, not a string, so the
            # comparison with "DELETE_ME" presumably never filters here; the
            # flagged lines are dropped when the CSV is written. Left as is
            # because later entries attach to final_catalog[-1].
            if entry and entry[0] != "DELETE_ME":
                entry_sorted = sorted(entry,
                                      key=lambda x: float(x.get('xmin')))
                previous_line_xmin = float(entry_sorted[0].get('xmin'))
                final_catalog.append(entry)
        catalog = []

    # No counted words means final_catalog is empty too, so the value is
    # unused; avoid dividing by zero.
    average_line_height = (sum_of_heights / counted_words
                           if counted_words else 0.0)
    # average_line_height = sum_of_heights / len(file.find_all('word'))

    with open(
            os.path.join(home_directory,
                         'replication\\NY_Mercantile_Lib_1825.csv'),
            'wb+') as outfile:
        csvwriter = unicodecsv.writer(outfile, encoding='utf-8')
        for item in final_catalog:
            # Write catalog entries out to a CSV, omitting shelf numbers and
            # sizes. Also omit headers, to the extent we can identify them by
            # having a line-height more than 20% larger than average
            # (testing this just on 1st word in line)
            if (float(item[0].get('ymax')) - float(item[0].get('ymin')) <
                    average_line_height * 1.2):
                final_entry = ''
                for vocable in item:
                    # Don't want to include shelf numbers etc. in output.
                    # Do want to include "1st", "12th", etc.
                    # Also want to get rid of "do"s now that we're done
                    # w/them. Get rid of punctuation that confuses Solr
                    # (including commas and periods, which interact badly
                    # with the fuzzy search ~ when they trail a word). This
                    # includes "'s" on the end of words.
                    stripped_vocable = re.sub(
                        r'[\+ \- & \| ! , \. ( ) \{ \} \[ \] \^ " ~ \* \? : \\ #”`]',
                        ' ', vocable.text)
                    if stripped_vocable == '':
                        continue
                    # if we filter out numbers here, then can't use them to
                    # identify tables of contents in the next step
                    # if stripped_vocable[0].isalpha() or stripped_vocable[-1] == 'h' or stripped_vocable[-1] == 'd' or stripped_vocable[-1] == 't':
                    stripped_vocable = stripped_vocable.replace("'s", '')
                    # stripped_vocable = stripped_vocable.replace("'", '')
                    if stripped_vocable.casefold(
                    ) != 'do' and stripped_vocable.casefold() != 'ditto':
                        final_entry += ' ' + stripped_vocable
                # Catching TOCs that slipped through the first filter for
                # whatever reason (often missing punctuation)
                if re.match(r'\s*Vols?\s*\d{1,2}', final_entry):
                    continue
                for vocable in final_entry.split():
                    # Drop numbers unless they look like ordinals ("12th",
                    # "2nd"/"3rd", "1st"). The 't' test previously looked at
                    # the stale stripped_vocable left over from the loop
                    # above instead of the word being examined.
                    if vocable[0].isdigit() and not vocable.endswith(
                            ('h', 'd', 't')):
                        # NOTE(review): vocable is used as a regex and
                        # removed wherever it occurs, including inside other
                        # words — confirm this is intended.
                        final_entry = re.sub(vocable, '', final_entry)
                final_entry = re.sub(r'Presented(.*)by(.*)$', '', final_entry)
                final_entry = re.sub(r'Gift(.*)of(.*)$', '', final_entry)
                # This is for NY Society 1813 and NY Mercantile 1825. These
                # translation notes break matching every time.
                # final_entry = final_entry.replace("translated from the Latin","")
                final_entry = final_entry.replace("translated from the French",
                                                  "")
                final_entry = final_entry.replace("translated from the German",
                                                  "")
                if final_entry != '':
                    if final_entry.split() and final_entry.split(
                    )[0] and final_entry.split()[0] != "DELETE_ME":
                        csvwriter.writerow([final_entry])
(options, args) = parser.parse_args() if len(args) != 1: parser.print_usage() sys.exit(1) if options.language not in ['fr', 'en']: print('language should be set to fr or en') sys.exit(1) notebook = None input_fname = args[0] with open(input_fname) as finput: notebook = json.load(finput) spell = SpellChecker(options.language) tokenizer = RegexpTokenizer(r'\w+') if options.language == 'en': dict_fname = os.path.expandvars('$HOME/.spellchecker_en.txt') else: dict_fname = os.path.expandvars('$HOME/.spellchecker_fr.txt') with open(dict_fname, 'a'): os.utime(dict_fname, None) spell.word_frequency.load_text_file(dict_fname) learnt_words = [] ignored_words = [] notebook_updated = False cells = notebook['cells'] cells_md = [cell for cell in cells if cell['cell_type'] == 'markdown']
def __init__(self, bot):
    """Load the word list and register the spelling listener on *bot*.

    The spell checker is configured with an edit distance of 1 and extended
    with the words from ``wordlist.txt`` in the user's home directory. An
    ``on_message`` listener is then added that suggests a corrected version
    of messages containing unknown words.

    Args:
        bot: The bot the listener is attached to.
    """
    self.bot = bot
    self.spell = SpellChecker()
    self.spell.distance = 1
    self.wordfile = pathlib.Path().home() / "wordlist.txt"
    self.spell.word_frequency.load_text_file(str(self.wordfile))

    async def grammar_module(message):
        """Reply with a corrected sentence when the message has typos."""
        if message.guild is None or message.guild.name.lower() != "cortex":
            return
        clean_message = message.clean_content.lower()

        # BLACKLIST CHANNELS
        blacklist = [
            "news",
            "rpg",
            "events",
            "recommends",
            "politisophy",
            "eyebleach",
            "weeb-lyfe",
            "out-of-context",
            "jokes",
            "anime-club",
        ]

        # Check for a missing name before lowercasing it; the previous order
        # raised AttributeError before the None test could run.
        channel_name = message.channel.name
        if channel_name is None:
            return
        message_channel = channel_name.lower()

        if (
            # DO NOT RESPOND TO SELF MESSAGES
            (bot.user.id == message.author.id or message.content.startswith("."))
            or (message_channel in blacklist)
            or ("thank" in clean_message)
            or ("http" in clean_message)
        ):
            return

        ctx = await bot.get_context(message)
        new_message = re.sub(
            "n?'[A-Za-z]+|[^A-Za-z ]", "", message.content.lower()
        ).split(" ")
        new_message = [w for w in new_message if w != ""]
        if len(new_message) == 0:
            return

        mispelled = self.spell.unknown(new_message)
        if len(mispelled) == 0:
            return

        message_changed = False
        for word in mispelled:
            correction = self.spell.correction(word)
            # correction() may return None when it has no candidate; skip
            # those so None is never spliced into the reply.
            if correction and correction != word:
                new_message = [w if w != word else correction for w in new_message]
                message_changed = True

        new_message = " ".join(new_message)
        if message_changed:
            await ctx.send('I think you meant to say, "{}"'.format(new_message))
        return

    self.bot.add_listener(grammar_module, "on_message")
def set_language(self, entry='en'):
    """Set the language and rebuild the spell checker for it.

    Supported values are 'en', 'de', 'fr' and 'es'; any other value is
    ignored and leaves the instance unchanged.
    """
    supported = ('en', 'de', 'fr', 'es')
    if entry not in supported:
        return
    self.language = entry
    self.spell = SpellChecker(language=self.language)
!pip install pyspellchecker from spellchecker import SpellChecker spell = SpellChecker(distance=1) # find those words that may be misspelled misspelled = spell.unknown(['tt']) for word in misspelled: # Get the one `most likely` answer print(spell.correction(word)) # Get a list of `likely` options print(spell.candidates(word)) def spell_check(df): for i in range(df.shape[0]): if i%100000==0: print("Reached {0}, percent {1}".format(i,float(i/df.shape[0])*100)) words=df['text'][i].split() misspelled = spell.unknown(words) l=[] for word in words: if word in misspelled: word=spell.correction(word) else: word=word l.append(word) #words=[spell.correction(word) for word in words] df['text'][i]=' '.join(word for word in l) return df def spell_collection(df):
from nltk.corpus import stopwords, wordnet from nltk.stem import WordNetLemmatizer from spellchecker import SpellChecker from constant import CONTRACTION_MAPPING, PUNCT_MAPPING # Constant ENG_STOPWORDS = set(stopwords.words("english")) WORDNET_MAP = { "N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV } # Instance SpellCheckerInstance = SpellChecker() LemmatizerInstance = WordNetLemmatizer() def clean_html_tag(text): """HTMLの削除""" return BeautifulSoup(text, "lxml").text def clean_url(text): """URLの削除""" url_pattern = re.compile(r'https?://\S+|www\.\S+') return url_pattern.sub(r'', text) def clean_number(text):
def token_spellchecker(tokens):
    """Return the most likely spelling for every token.

    Args:
        tokens: Iterable of word strings.

    Returns:
        A list with one string per input token, in the same order. A token
        is kept unchanged when the checker has no suggestion for it.
    """
    spell = SpellChecker()
    # correction() may return None when it has no candidate; fall back to
    # the token so the result never contains None.
    return [spell.correction(word) or word for word in tokens]
from nltk.corpus import words from nltk.tokenize import sent_tokenize,word_tokenize from nltk.corpus import cmudict nltk.download('words') nltk.download('cmudict') nltk.download('punkt') nltk.download('averaged_perceptron_tagger') from textblob import TextBlob import math import re import string import syllables d = cmudict.dict() tool = language_tool_python.LanguageTool('en-US') from spellchecker import SpellChecker spell = SpellChecker() ! pip install langdetect from langdetect import detect import requests, time url = 'https://farasa-api.qcri.org' import ast """# All Functions""" """ #my source is: https://readabilityformulas.com/the-LIX-readability-formula.php def w_g_4(txt): count =0 words=word_tokenize(txt) for x in words: #print(len(x))
def setUp(self):
    """Create the checker under test and load words from 'spellwords.txt'."""
    self.spellChecker = checker = SpellChecker()
    checker.load_words('spellwords.txt')