def spellCheck(string_to_be_checked):
    # Create a spell checker instance
    spell = SpellChecker()
    # spell.word_frequency.load_text_file('wordsDictionary.txt')
    misspelled = spell.unknown(string_to_be_checked.split(" "))
    # Add slang/censored words to the dictionary so they count as known
    spell.word_frequency.load_words(
        ['f**k', 'f****d', 'damm', 'lmaof', 'pissed', 'google'])
    spell.known(['f**k', 'f****d', 'damm', 'lmaof', 'pissed'])  # will return both now!
    for word in misspelled:
        # Get the one `most likely` answer
        correctedWord = ''
        if word != '':
            correctedWord = spell.correction(word)
        if correctedWord is None or correctedWord == '':
            continue
        string_to_be_checked = string_to_be_checked.replace(
            word, correctedWord, 1)
    return string_to_be_checked
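# Usage sketch for spellCheck (minimal; assumes
# `from spellchecker import SpellChecker` at module top, and the sample
# sentence and exact corrections are illustrative only):
if __name__ == '__main__':
    print(spellCheck("thiss sentense has two typoos"))
    # -> roughly: "this sentence has two typos"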
class DictionaryChecker:
    def __init__(self):
        self.dict = SpellChecker()

    def check(self, word):
        wordl = word.lower()
        result = self.dict.known([wordl])
        return len(result) > 0

    def getNumRealWords(self, wordlist):
        wordlistl = [x.lower() for x in wordlist]
        return len(self.dict.known(wordlistl))
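# Usage sketch for DictionaryChecker (word lists are illustrative):
checker = DictionaryChecker()
print(checker.check("Apple"))                                # -> True
print(checker.getNumRealWords(["apple", "bananna", "qzx"]))  # -> 1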
class SpellProcessor:
    def __init__(self, first_level_dict='dictionaries/ru_full.txt',
                 second_level_dict='dictionaries/products.txt'):
        # First level
        self.first_level_dict = first_level_dict
        self.spell_checker = SpellChecker()
        self.spell_checker.word_frequency.load_text_file(self.first_level_dict)
        # Second level
        self.second_level_dict = second_level_dict
        self.word_corrector = SpellChecker()
        if not os.path.exists(second_level_dict):
            # Create an empty dictionary file on first run
            with open(second_level_dict, 'w'):
                pass
        self.word_corrector.word_frequency.load_text_file(
            self.second_level_dict)

    async def write_to_dict(self, correct_words):
        with open(self.second_level_dict, "r+") as second_dict_file:
            text = second_dict_file.read()
        for word in correct_words:
            word_pos = text.find(word)
            if word_pos != -1:
                # Increase the stored usage frequency of the word
                end_pos = text.find('\n', word_pos)
                number = int(text[word_pos + len(word) + 1:end_pos])
                text = "".join((text[:word_pos + len(word) + 1],
                                str(number + 1), text[end_pos:]))
            else:
                text += word + " 1" + "\n"
        with open(self.second_level_dict, "w") as second_dict_file:
            # Save changes to the file
            second_dict_file.write(text)

    def correct(self, products):
        correct_words = []  # These words will be saved to the products dictionary
        result_list = []
        for product in products:
            words = product.split(' ')
            fixed_words = []
            for word in words:
                if len(word) > 2 and len(self.spell_checker.known([word])) == 1:
                    correct_words.append(word)
                fixed_words.append(self.word_corrector.correction(word))
            result_list.append(' '.join(fixed_words))
        correct_words = np.unique(np.array(correct_words))
        asyncio.run(self.write_to_dict(correct_words))
        return result_list
class SpellCheckerML:
    def __init__(self):
        self.spell_checker = SpellChecker()
        self.autocomplete = autocomplete
        self.autocomplete.load()

    def train(self, text, model_name=''):
        if model_name == '':
            self.autocomplete.models.train_models(text, model_name=False)
        else:
            self.autocomplete.models.train_models(text, model_name=model_name)
        self.autocomplete.load()

    def correction(self, previous_word, word):
        if self.spell_checker.known([word]):
            return word
        spell_checker_candidates = self.spell_checker.candidates(word)
        # Predictions are (word, score) tuples from the autocomplete model
        autocomplete_predictions = self.autocomplete.predict(previous_word, word[0])
        autocomplete_candidates = [elem[0] for elem in autocomplete_predictions]
        best_choices = []
        for candidate in spell_checker_candidates:
            try:
                candidate_index = autocomplete_candidates.index(candidate)
                best_choices.append(autocomplete_predictions[candidate_index])
            except ValueError:
                continue
        if best_choices:
            # Pick the candidate with the highest autocomplete score
            best_choices = sorted(best_choices, key=lambda t: t[1])
            return best_choices[-1][0]
        return random.choice(list(spell_checker_candidates))
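# Usage sketch for SpellCheckerML (the training corpus is illustrative and
# the `autocomplete` package referenced above is assumed to be importable):
ml = SpellCheckerML()
ml.train("the quick brown fox jumps over the lazy dog")
print(ml.correction("quick", "brwon"))  # -> likely "brown"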
def spell_correct(token):
    spell = SpellChecker()
    # known() expects an iterable of words, so wrap the single token in a
    # list (passing the bare string would check its individual characters)
    if spell.known([token]):
        return token
    return spell.correction(token)
def get_valid_words(digits):
    # Map each digit (2-9) to its T9 letters and take the cartesian product
    t9_letters = [t9[int(digit)] for digit in digits if int(digit) > 1]
    t9_words = product(*t9_letters)  # all possible combinations
    t9_words = [''.join(word) for word in t9_words]
    spell = SpellChecker()
    valid_words = spell.known(t9_words)
    return valid_words
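# Usage sketch for get_valid_words; `t9` is assumed to be a digit-to-letters
# lookup like the one below (indices 0 and 1 unused), with `product`
# imported from itertools:
t9 = ['', '', 'abc', 'def', 'ghi', 'jkl', 'mno', 'pqrs', 'tuv', 'wxyz']
print(get_valid_words('228'))  # -> a set such as {'act', 'bat', 'cat', ...}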
def test_word_known(self):
    ''' test if the word is a `known` word or not '''
    spell = SpellChecker()
    self.assertEqual(spell.known(['this']), {'this'})
    self.assertEqual(spell.known(['sherlock']), {'sherlock'})
    self.assertEqual(spell.known(['holmes']), {'holmes'})
    self.assertEqual(spell.known(['known']), {'known'})
    self.assertEqual(spell.known(['-']), set())
    self.assertEqual(spell.known(['foobar']), set())
    self.assertEqual(spell.known(['ths']), set())
    self.assertEqual(spell.known(['ergos']), set())
def run(project_id, repo_path, cursor, **options):
    num_core_commit_words = 0
    totalNumberOfCommitWords = 0
    cursor.execute(QUERY.format(project_id))
    repoName = cursor.fetchone()[0]
    os.chdir("path/" + str(project_id) + "/")
    commits_ratio = 0
    for repos in os.listdir():
        if repos == repoName:
            os.chdir(repos)
            Dirs = []
            Files = []
            # Collect lower-cased directory and file names of the repository
            # (walk the current directory; the names come as lists, so
            # lower-case them element by element)
            for (root, dirs, files) in inner_os.walk(".", topdown=True):
                Dirs.extend(d.lower() for d in dirs)
                Files.extend(f.lower() for f in files)
            stream = inner_os.popen(
                'git log --pretty=format:"%s"').read().split("\n")
            for commits in stream:
                commits = commits.lower()
                # Count commit words that reference directory or file names
                for ab in Dirs:
                    if ab in commits:
                        commits = commits.replace(ab, "")
                        num_core_commit_words += 1
                        totalNumberOfCommitWords += 1
                for ab in Files:
                    if ab in commits:
                        commits = commits.replace(ab, "")
                        num_core_commit_words += 1
                        totalNumberOfCommitWords += 1
                # Count purely numeric tokens
                nr = re.sub("[^0123456789 ]", "", commits)
                nr = ' '.join(nr.split())
                totalNumberOfCommitWords += len(nr.split())
                # Count alphabetic tokens
                trim_commit = re.sub("[^a-zA-Z ]+", "", commits)
                trim_commit = ' '.join(trim_commit.split())
                totalNumberOfCommitWords += len(trim_commit.split())
                spell = SpellChecker()
                # Drop one- and two-letter tokens before the dictionary lookup
                trim_commit = re.sub(r"\b[a-zA-Z]\b", "", trim_commit)
                trim_commit = re.sub(r"\b[a-zA-Z][a-zA-Z]\b", "", trim_commit)
                trim_commit = trim_commit.split()
                spelled = spell.known(trim_commit)
                num_core_commit_words += len(spelled)
            print("----- METRIC: COMMIT QUALITY -----")
            if totalNumberOfCommitWords > 0:
                commits_ratio = float(num_core_commit_words) / float(totalNumberOfCommitWords)
            print('core commits ratio: ', commits_ratio)
            break
    threshold = options['threshold']
    return (commits_ratio >= threshold, commits_ratio)
def spellcheck(input_data: dict, word_list: str) -> None:
    """Check package for spelling errors."""
    spell = SpellChecker()
    try:
        spell.word_frequency.load_text_file(word_list)
        with open(word_list, 'r') as data_file:
            word_list_data = data_file.read()
    except FileNotFoundError:
        info("Word list not found, searching up a directory...")
        # Fall back to the default word list location if not found
        search_path = "/opt/pycep/word_list.txt"
        spell.word_frequency.load_text_file(search_path)
        with open(search_path, 'r') as data_file:
            word_list_data = data_file.read()
    known_data_list = word_list_data.split("\n")
    spell.known(known_data_list)
    task_data = return_non_data_task(input_data)
    for package in task_data:
        for values, lines in task_data[package].items():
            spell_check_task(spell, lines, values, package)
def decrypt_BF(ciphertext, matchrate=0.8):
    """
    A brute-force approach to decipher any mono-alphabetic substitution
    cipher.
    THIS PROGRAM DOES NOT WORK: materializing all 26! permutations is
    impossible; the permutations need to be generated another way.
    """
    # Use a spell checker to test whether words are in the dictionary
    from spellchecker import SpellChecker

    # Create an English spell checker
    spell = SpellChecker(language=u'en')
    # Set the criterion for the number of matched words
    wordsCount = len(spell.split_words(ciphertext))
    wordsMatchCountMin = int(matchrate * wordsCount)
    # Create a list of the 26 uppercase letters for the ciphertext alphabet
    cipher = [None] * 26
    for i in range(26):
        cipher[i] = chr(i + ord('A'))
    # Generate all possible permutations (this is the impossible part)
    import itertools
    plain_lists = list(itertools.permutations(cipher))
    for i in range(len(plain_lists)):
        # Take the next candidate plaintext alphabet
        plain = plain_lists[i]
        # Build the decipher table mapping 'A'..'Z' to the candidate alphabet
        decipherDict = {}
        for seq in range(26):
            decipherDict.update({cipher[seq]: plain[seq]})
        # Decrypt with the current decipher table
        decrypted = decrypt(ciphertext, decipherDict)
        # Split the text into a list of words
        wordsList = spell.split_words(decrypted)
        wordsCount = len(wordsList)
        print(i)
        # Check how many of the words are real dictionary words
        dictWordsList = spell.known(wordsList)
        if len(dictWordsList) >= wordsMatchCountMin:
            print("Found dictionary words with permutation ", i)
            printCipherTable(decipherDict, isInverse=True)
            return decrypted
    print("All trials failed")
    return ""
def spellCorrectBackupBaseline(self, check_str):
    """
    Baseline spell checker using the spellchecker library.
    """
    print('spellCorrectBackupBaseline called')
    spell = SpellChecker()
    # Load these domain terms into the dictionary so they count as known
    # (known() alone only queries the dictionary, it does not add words)
    spell.word_frequency.load_words(['zwave', 'rheem'])
    splitted = check_str.split()
    for w_ix in range(len(splitted)):
        if splitted[w_ix].isalpha():
            mis_check = list(spell.unknown([splitted[w_ix].lower()]))
            if len(mis_check) == 1:
                splitted[w_ix] = spell.correction(mis_check[0])
    final_result = " ".join(splitted)
    return final_result
def scoreTextForSpellingCorrectness(article):
    score = 0
    articleChecker = SpellChecker()
    wordsInArticle = article.split()
    totalWords = len(wordsInArticle)
    numIncorrectWords = len(articleChecker.unknown(wordsInArticle))
    correctlySpelledWords = articleChecker.known(wordsInArticle)
    for word in correctlySpelledWords:
        score += len(word)  # Reward a text for having longer words
    score -= numIncorrectWords
    score /= totalWords
    return score
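# Usage sketch for scoreTextForSpellingCorrectness (sample texts are
# illustrative); correctly spelled, longer words raise the score while
# unknown words lower it:
print(scoreTextForSpellingCorrectness("the quick brown fox"))  # higher
print(scoreTextForSpellingCorrectness("teh qick brwn fx"))     # lower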
def test_large_words(self):
    ''' test checking for words that are clearly larger than the largest
    dictionary word '''
    spell = SpellChecker(language=None, distance=2)
    spell.word_frequency.add('Bob')

    words = ['Bb', 'bb', 'BB']
    self.assertEqual(spell.unknown(words), {'bb'})

    known_words = ['BOB', 'bOb']
    self.assertEqual(spell.known(known_words), {'bob'})

    self.assertEqual(spell.correction('bobs'), 'bob')
    self.assertEqual(spell.correction('bobb'), 'bob')
    self.assertEqual(spell.correction('bobby'), 'bob')
    self.assertEqual(spell.word_frequency.longest_word_length, 3)
    self.assertEqual(spell.correction('bobbys'), 'bobbys')
class SpellingSuggestor(object):
    def __init__(self, word):
        self.word = word
        self.spell = SpellChecker()

    def pre_process(self):
        """Replace underscores, dashes and other punctuation with spaces."""
        return re.sub(r'([^\s\w]|_|-)+', ' ', self.word)

    def reduce_lengthening(self):
        """Collapse letters that occur more than twice in a row."""
        pattern = re.compile(r"(.)\1{2,}")
        return pattern.sub(r"\1\1", self.word)

    def spell_checker_result(self):
        """Main method: clean the word and produce spelling suggestions."""
        self.word = self.pre_process()
        self.word = self.reduce_lengthening().lower()
        i = 1
        print("word after cleaning ", self.word)
        misspelled = self.spell.unknown([self.word])
        if len(misspelled) == 0:
            return [self.word]
        result = set()
        while i < len(self.word):
            # Candidates for the two halves of the word split at position i
            r1 = self.spell.candidates(self.word[:i].strip())
            r2 = self.spell.candidates(self.word[i:].strip())
            r1 = self.spell.known(r1)
            r2 = self.spell.known(r2)
            if len(r1) > 0 and len(r2) > 0:
                try:
                    for v1 in r1:
                        result.add(v1)
                        for v2 in r2:
                            if len(v2) > 2:
                                result.add(v2)
                                result.add(v1 + " " + v2)
                except Exception as ex:
                    print("some error", ex)
            i += 1
        return result
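# Usage sketch for SpellingSuggestor (input is illustrative); the result
# may include single-word corrections and two-word splits:
suggestor = SpellingSuggestor("spellcheker")
print(suggestor.spell_checker_result())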
def test_capitalization_when_case_sensitive_defaults_to_false(self):
    ''' test that capitalization doesn't affect comparisons '''
    spell = SpellChecker(language=None)
    spell.word_frequency.add('Bob')
    spell.word_frequency.add('Bob')
    spell.word_frequency.add('Bab')
    self.assertEqual('Bob' in spell, True)
    self.assertEqual('BOb' in spell, True)
    self.assertEqual('BOB' in spell, True)
    self.assertEqual('bob' in spell, True)

    words = ['Bb', 'bb', 'BB']
    self.assertEqual(spell.unknown(words), {'bb'})

    known_words = ['BOB', 'bOb']
    self.assertEqual(spell.known(known_words), {'bob'})

    self.assertEqual(spell.candidates('BB'), {'bob', 'bab'})
    self.assertEqual(spell.correction('BB'), 'bob')
def test_capitalization_when_case_sensitive_true(self):
    ''' test that capitalization affects comparisons '''
    spell = SpellChecker(language=None, case_sensitive=True)
    spell.word_frequency.add('Bob')
    self.assertEqual('Bob' in spell, True)
    self.assertEqual('BOb' in spell, False)
    self.assertEqual('BOB' in spell, False)
    self.assertEqual('bob' in spell, False)

    words = ['Bb', 'bb', 'BB']
    self.assertEqual(spell.unknown(words), {'Bb', 'bb', 'BB'})

    case_variant_words = ['BOB', 'bOb']
    self.assertEqual(spell.known(case_variant_words), set())

    self.assertEqual(spell.candidates('Bb'), {'Bob'})
    self.assertEqual(spell.candidates('bob'), {'Bob'})
    self.assertEqual(spell.correction('Bb'), 'Bob')
    self.assertEqual(spell.correction('bob'), 'Bob')
    self.assertEqual(spell.unknown(['bob']), {'bob'})
class SpellCheck:
    def __init__(self):
        self.spell = None

    def spell_correct(self, x):
        """
        Given a sentence x, check each word for misspellings and replace
        misspelled words with their most likely correction.
        """
        if self.spell is None:
            from spellchecker import SpellChecker
            self.spell = SpellChecker()
        word_list = word_tokenize(x)
        misspelled = self.spell.unknown(word_list)
        corrected_words_dict = dict(
            [(word, self.spell.correction(word)) for word in misspelled])
        word_corrected = [corrected_words_dict.get(w, w) for w in word_list]
        return ' '.join(word_corrected)

    def spell_check(self, x):
        """
        Given a sentence x, return the same sentence with all misspelled
        words removed. This function does not correct misspelled words, it
        only filters them out. To correct those words, use spell_correct.
        """
        if self.spell is None:
            from spellchecker import SpellChecker
            self.spell = SpellChecker()
        word_list = word_tokenize(x)
        correct_words = self.spell.known(word_list)
        word_list_filtered = [w for w in word_list if w in correct_words]
        return ' '.join(word_list_filtered)
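# Usage sketch for SpellCheck (sentence is illustrative; word_tokenize
# comes from nltk, which must be installed along with its 'punkt' data):
sc = SpellCheck()
print(sc.spell_correct("this sentense has typos"))  # corrects "sentense"
print(sc.spell_check("this sentense has typos"))    # drops "sentense"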
def suggest():
    ss = request.args.get('letters', default='hello')
    perms = []
    # Support up to 7 letters
    ss = ss[:7]
    print(ss)
    # Build every permutation of length 3..len(ss)
    for l in range(3, len(ss) + 1):
        perm = permutations(list(ss), l)
        for i in perm:
            w = ''.join(i)
            perms.append(w)
    spell = SpellChecker(distance=1)
    k = spell.known(perms)
    w = sorted(list(k))
    rmap = {"letters": ss, "words": w}
    jd = json.dumps(rmap)
    print(jd)
    return jd
def searchByText(cls, name, page_num):
    nameRevised = '%{}%'.format(name)
    check = cls.query.filter(cls.desc.ilike(nameRevised)).first()
    if check:
        resultRev = cls.query.filter(cls.desc.ilike(nameRevised)).paginate(
            per_page=4, page=page_num)
        return resultRev
    else:
        # No direct match: spell-correct each token (Portuguese dictionary)
        tokenized = name.split()
        spell = SpellChecker(language="pt")
        correctWord = ''
        for i in range(len(tokenized)):
            size = len(spell.known([tokenized[i]]))
            if size > 0:
                correctWord += f"{tokenized[i]}"
            else:
                correctWord += f"{spell.correction(tokenized[i])}"
        nameCorrected = '%{}%'.format(correctWord)
        result = cls.query.filter(
            cls.desc.ilike(nameCorrected)).paginate(per_page=3, page=page_num)
        return result
def xor_strings(self):
    puntaje_final_strings = []
    file_read = open(self.file_name, "r")
    out = open("salida", "w")
    spell = SpellChecker()
    for line in file_read:
        cadena_hex = line.rstrip("\n")
        # Decode the hex string into raw bytes (Python 3 equivalent of the
        # original Python 2 `.decode("hex")`)
        cadena = bytes.fromhex(cadena_hex)
        print(cadena)
        res = ''
        puntaje_final = []
        # Try every single-byte XOR key
        for i in range(256):
            for j in cadena:
                res_byte = j ^ i
                res += chr(res_byte)
            puntaje_actual = analiza_frecuencia(res)
            puntaje_final.append((puntaje_actual, res, i))
            res = ''
        # Keep the ten highest-scoring candidates
        ult = heapq.nlargest(10, puntaje_final)
        puntaje_actual = 0
        for i in range(10):
            palabras = ult[i][1].split(' ')
            mejor_palabra = spell.known(palabras)
            if mejor_palabra:
                # Score candidates by the probability of their known words
                for palabra in mejor_palabra:
                    puntaje_actual += spell.word_probability(palabra)
            puntaje_final_strings.append(
                (puntaje_actual, ult[i][1], ult[i][2]))
    ult = heapq.nlargest(1, puntaje_final_strings)
    print(ult[0][1])
    file_read.close()
    out.close()
class Corrector:
    def __init__(self, lenguaje, diccionario=None, distancia=2,
                 tokenizador=None):
        """
        Default constructor of the `Corrector` class, which performs
        spelling correction on texts.

        :param lenguaje: Language of the texts on which spelling correction
            will be applied. For more information, see the
            :ref:`Supported languages <seccion_lenguajes_soportados>`
            section.
        :type lenguaje: str
        :param diccionario: Dictionary (or string with the location of the
            JSON file containing it), or list, used to modify and add words.
            If it is a list, it contains the words that will be considered
            valid or correct. If it is a dictionary, its keys are the words
            considered valid or correct, and its values are the frequencies
            of each word. Frequencies are used as a tie-breaker when an
            incorrect word has more than one candidate correction. If this
            parameter is left as `None`, the default dictionary shipped with
            the `spellchecker` library for the given language is loaded.
        :type diccionario: dict, list, str, optional
        :param distancia: Maximum Levenshtein distance allowed between an
            incorrect (or unrecognized) word and the dictionary words when
            determining candidate corrections. Default value: `2`.
        :type distancia: int
        :param tokenizador: Object in charge of tokenizing and detokenizing
            texts. If `None`, an instance of the `TokenizadorNLTK` class is
            loaded by default. Default value: `None`.
        :type tokenizador: object, optional
        """
        # Set the language of the spell checker
        self.establecer_lenguaje(lenguaje)
        # Initialize the checker
        self.iniciar_corrector(diccionario)
        self.establecer_distancia(distancia)
        self.tokenizador = (TokenizadorNLTK()
                            if tokenizador is None else tokenizador)

    def establecer_lenguaje(self, lenguaje):
        """
        Defines or changes the language of the texts on which this
        `Corrector` object operates.

        :param lenguaje: Language of the texts to spell-check. For more
            information, see the
            :ref:`Supported languages <seccion_lenguajes_soportados>`
            section.
        :type lenguaje: str
        """
        self.lenguaje = definir_lenguaje(lenguaje)

    def iniciar_corrector(self, diccionario):
        """
        Initializes the `SpellChecker` object from the spellchecker library
        for the previously defined language and assigns it to the
        `corrector` attribute of this `Corrector` object.

        :param diccionario: Dictionary, list, or path to a JSON file with
            valid words and optional frequencies (see constructor).
        :type diccionario: dict, list, str, optional
        """
        self.corrector = None
        if self.lenguaje is not None:
            if isinstance(diccionario, str):
                self.corrector = SpellChecker(local_dictionary=diccionario)
            elif type(diccionario) in [dict, list]:
                self.corrector = SpellChecker(language=self.lenguaje)
                self.actualizar_diccionario(diccionario)
            else:
                self.corrector = SpellChecker(language=self.lenguaje)

    def establecer_distancia(self, distancia):
        """
        Sets the maximum distance used by the correction algorithm to decide
        whether there are candidate words for an incorrect or unrecognized
        word.

        :param distancia: Maximum Levenshtein distance allowed between an
            incorrect (or unrecognized) word and the dictionary words.
            Default value: `2`.
        :type distancia: int
        """
        if self.corrector is not None:
            self.corrector.distance = distancia

    def actualizar_diccionario(self, diccionario):
        """
        Updates the dictionary of valid or recognized words available for
        spelling correction. Words contained in the `diccionario` argument
        are added to (or their frequencies updated in) the dictionary that
        already exists in this `Corrector` object.

        :param diccionario: Dictionary, list, or path to a JSON file with
            valid words and optional frequencies (see constructor).
        :type diccionario: dict, list, str, optional
        """
        if isinstance(diccionario, str):
            diccionario = json.load(open(diccionario))
        if isinstance(diccionario, list):
            diccionario = [palabra.lower() for palabra in diccionario]
            self.corrector.word_frequency.load_words(diccionario)
        elif isinstance(diccionario, dict):
            self.quitar_palabras(list(diccionario.keys()))
            for key in diccionario.keys():
                self.corrector.word_frequency.load_words(
                    [key.lower()] * diccionario[key])

    def quitar_palabras(self, palabras):
        """
        Removes one or more words (the `palabras` argument) from the
        checker's dictionary, so they are no longer recognized as valid or
        correct during spelling correction.

        :param palabras: Word or list of words to remove from the dictionary
            of this `Corrector` object.
        :type palabras: str, list
        """
        if isinstance(palabras, str):
            palabras = [palabras]
        # Drop words that are not in the dictionary
        palabras = [p for p in palabras if self.frecuencia_palabra(p) > 0]
        if len(palabras) > 0:
            self.corrector.word_frequency.remove_words(palabras)

    def agregar_palabras(self, palabras):
        """
        Adds one or more words (the `palabras` argument) to the checker's
        dictionary, so they are recognized as valid or correct during
        spelling correction.

        :param palabras: Word or list of words to add to the dictionary of
            this `Corrector` object.
        :type palabras: str, list
        """
        if isinstance(palabras, str):
            palabras = [palabras]
        self.actualizar_diccionario(palabras)

    def palabras_conocidas(self, texto):
        """
        Given an input text, returns a Python `set` with the words of the
        text that are recognized because they appear in the checker's
        dictionary.

        :param texto: Text whose known words are wanted.
        :type texto: str
        :return: (set) Set of known words present in the input text.
        """
        tokens = self.tokenizador.tokenizar(texto)
        return self.corrector.known(tokens)

    def palabras_desconocidas(self, texto):
        """
        Given an input text, returns a Python `set` with the words of the
        text that are not included in the checker's dictionary and are
        therefore not recognized.

        :param texto: Text whose unknown words are wanted.
        :type texto: str
        :return: (set) Set of unknown words present in the input text.
        """
        tokens = self.tokenizador.tokenizar(texto)
        return self.corrector.unknown(tokens)

    def palabras_candidatas(self, palabra):
        """
        For an input word, returns the set of words that could be used to
        correct it. If the input word is correct (it is in the checker's
        dictionary) or has no candidate within the maximum distance set by
        the `distancia` parameter of the `Corrector` class, the function
        returns the input word itself.

        :param palabra: Word whose correction candidates are wanted.
        :type palabra: str
        :return: (set) Set of candidate words to correct the input word.
        """
        return self.corrector.candidates(palabra)

    def frecuencia_palabra(self, palabra):
        """
        For an input word, returns its frequency according to the checker's
        dictionary. If the word is unknown (not in the dictionary), the
        returned frequency is zero.

        :param palabra: Word whose frequency in the checker's dictionary is
            wanted.
        :type palabra: str
        :return: (int) Number greater than or equal to zero indicating the
            frequency of the queried word in the checker's dictionary.
        """
        return self.corrector[palabra]

    def probabilidad_palabra(self, palabra):
        """
        For an input word, returns its probability of occurrence, defined as
        its frequency divided by the sum of the frequencies of all available
        words, according to the checker's dictionary. If the word is unknown
        (not in the dictionary), the returned probability is zero.

        :param palabra: Word whose probability of occurrence in the
            checker's dictionary is wanted.
        :type palabra: str
        :return: (float) Probability, between 0 and 1, of the word's
            occurrence.
        """
        return self.corrector.word_probability(palabra)

    def correccion_ortografia(self, texto, limpieza=False):
        """
        Performs spelling correction on an input text by identifying the
        words that are not in the checker's dictionary and replacing them
        with their most frequent (most probable) candidate, provided there
        is at least one candidate within the maximum allowed Levenshtein
        distance.

        :param texto: Text on which to perform spelling correction.
        :param limpieza: Whether to apply basic cleaning (the
            `limpieza_basica` function of the `limpieza` module) to the text
            before correcting it. Default value: `False`.
        :type limpieza: bool, optional
        :return: (str) Input text after spelling correction.
        """
        if limpieza:
            # Basic cleaning so punctuation does not affect the correction
            texto = limpieza_basica(texto, quitar_numeros=False)
        lista_palabras = self.tokenizador.tokenizar(texto)
        desconocidas = self.corrector.unknown(lista_palabras)
        texto_corregido = [
            self.corrector.correction(p)
            if len(p) > 1 and p in desconocidas else p
            for p in lista_palabras
        ]
        return self.tokenizador.destokenizar(texto_corregido)
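# Usage sketch for Corrector (illustrative; assumes the module's own
# helpers, such as definir_lenguaje and TokenizadorNLTK, are importable):
corrector = Corrector('es')
corrector.agregar_palabras(['pyspellchecker'])
print(corrector.palabras_desconocidas('ojalá halya buen clima'))
print(corrector.correccion_ortografia('ojalá halya buen clima'))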
class Neuron:
    """
    Main processing object. sugaroid.brain.Neuron classifies texts
    initially.
    """
    def __init__(self, bot):
        self.bot = bot
        if self.bot.spell_checker:
            from spellchecker import SpellChecker
            self.spell = SpellChecker(distance=1)
            # Some privileges only for the creator: load these words into
            # the dictionary so they count as known (known() alone would
            # only query the dictionary)
            self.spell.word_frequency.load_words(
                ['Sugaroid', 'Sugarlabs', 'sugar', 'Srevin', 'Saju'])
        logging.info("Sugaroid Neuron Loaded to memory")

    def parse(self, var):
        if var.isspace():
            return 'Type something to begin'
        if 'time ' in var:
            response = self.time()
        else:
            for i in ARITHMETIC:
                if i in var:
                    response = self.alu(self.normalize(var))
                    if str(response).strip() == '-':
                        pass
                    elif response:
                        break
            else:
                # No arithmetic matched: fall back to the chatbot, with
                # optional spell correction first
                if self.bot.spell_checker:
                    wt = var.split(' ')
                    ct = []
                    for i in wt:
                        ct.append(self.spell.correction(i))
                    response = self.gen_best_match(' '.join(ct))
                else:
                    preprocessed = preprocess(var)
                    response = self.gen_best_match(preprocessed)
        return response

    def alu(self, var):
        conversation = ' '.join(var)
        return self.gen_arithmetic(conversation)

    def time(self):
        return self.gen_time()

    def gen_best_match(self, parsed):
        return self.bot.get_response(parsed)

    @staticmethod
    def gen_time():
        return 'The current time is {}'.format(
            strftime("%a, %d %b %Y %H:%M:%S", localtime()))

    def gen_arithmetic(self, parsed):
        try:
            me = MathematicalEvaluation(self.bot)
            return me.process(Statement(parsed))
        except Exception:
            return False

    @staticmethod
    def normalize(text):
        return WordPunctTokenizer().tokenize(text)
'''
Take input of a jumbled-up word and output all the possible real words
made using those letters.
'''
from spellchecker import SpellChecker
from nltk.corpus import words
from itertools import permutations

spell = SpellChecker()
input_string = input("Enter an anagram of a word\n")
# Generate every ordering of the input letters
stuff = [''.join(a) for a in permutations(input_string)]
possibilities = spell.known(stuff)
print("The possible real word/words are:")
for i in possibilities:
    if i in words.words():
        print(i)
def test_capitalization_when_language_set(self):
    ''' test that capitalization doesn't affect comparisons when language
    is not None '''
    spell = SpellChecker(language="en")
    self.assertEqual(spell.known(['Bike']), {'bike'})
def Spell_check(word_token):
    # Expects an iterable of words; returns the subset found in the
    # dictionary (the original also called unknown() and discarded the
    # result, which had no effect)
    spell = SpellChecker(distance=2)
    return spell.known(word_token)
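# Usage sketch for Spell_check (tokens are illustrative):
print(Spell_check(['hello', 'wrold']))  # -> {'hello'}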
    accepted = True
    if len(sentence) < 50 or len(sentence) > 300:
        accepted = False
    if not full_stop:
        accepted = False
    count += 1
    return sentence


sentence = generate_sentence(forwards_model, backwards_model,
                             seed_word=u" analytic ")
sentence = sentence[1:]
words = sentence.split(' ')
corrected_sentence = []
for word in words:
    if word == ' ':
        continue
    corrected_word = spell.correction(word)
    print('word = ', word, ' correction = ', corrected_word,
          ' probability = ', spell.word_probability(corrected_word))
    print('candidates = ', spell.candidates(word))
    # Keep only corrections that are dictionary words, plus numbers
    if len(spell.known([corrected_word])) == 1 or word.isnumeric():
        corrected_sentence.append(corrected_word)
sentence = ' '.join(corrected_sentence) + '.'
sentence = sentence[0].upper() + sentence[1:]
sentence = sentence.replace(' i ', ' I ')
print(sentence)
class Incubator:
    # Class variables:
    #   sample_block_table: A dictionary containing all blocks in sample_path and their frequency
    #   spellchecker: A pyspellchecker instance with all the words in words_path added
    #   population: Total population of the incubator, indicating how many chromosomes exist at one time
    #   elites: How many elites are carried over for each generation
    #   children: How many children are created for each generation
    #   randoms: How many random chromosomes are added each generation
    #   tournament_size: How many chromosomes are considered in a tournament
    #   cross_chance: Chance of crossing chromosomes when creating a child.
    #                 cross_chance + mutation_chance should equal one
    #   mutation_chance: Chance of mutating a chromosome when creating a child.
    #                    cross_chance + mutation_chance should equal one
    #   shock_enabled: True if genetic shock enabled, false otherwise
    #   shock_threshold: Number of cycles of fitness stagnation before genetic shock is triggered
    #   max_cycles: Cycle number at which the simulation terminates

    def __init__(self, sample_path, words_path, elites, children, randoms,
                 tournament_size, cross_chance, mutation_chance, shock_value,
                 max_cycles):
        # Parameters:
        #   sample_path: A path to a samples source file containing all training data for the incubator
        #   words_path: A path to all words which the cipher_breaker should consider valid,
        #               in addition to those already in pyspellchecker
        #   shock_value: 0 if genetic shock disabled; otherwise shock is enabled
        #                and shock_threshold is set to shock_value
        #   (remaining parameters as documented in the class variables above)

        # Initializes sample_block_table
        self.sample_block_table = self.getSampleBlockTable(sample_path)

        # Initializes spellchecker
        self.spellchecker = SpellChecker()
        self.spellchecker.word_frequency.load_text_file(words_path)

        # Checks cross_chance and mutation_chance are valid
        assert (cross_chance + mutation_chance) == 1

        # Loads all incubator parameters
        self.elites = elites
        self.children = children
        self.randoms = randoms
        self.population = self.elites + self.children + self.randoms
        self.tournament_size = tournament_size
        self.cross_chance = cross_chance
        self.mutation_chance = mutation_chance

        # Handles shock_value
        if shock_value <= 0:
            self.shock_enabled = False
            self.shock_threshold = 0
        else:
            self.shock_enabled = True
            self.shock_threshold = shock_value

        self.max_cycles = max_cycles

        # Prints incubator summary if verbose enabled
        if __VERBOSE__:
            print("Incubator Summary:")
            print("sample_path: " + sample_path + " words_path: " + words_path)
            print("Total population: " + str(self.population))
            print("Elites: " + str(self.elites) + " Children: " + str(self.children) + " Randoms: " + str(self.randoms))
            print("Tournament size: " + str(self.tournament_size) + " Cross chance: " + str(self.cross_chance) + " Mutation chance: " + str(self.mutation_chance))
            print("Shock enabled: " + str(self.shock_enabled) + " Shock threshold: " + str(self.shock_threshold))
            print("Max cycles: " + str(self.max_cycles))
            print("\n")

    """TRAINING FUNCTIONS"""

    # Takes ciphertext, returns a chromosome that should decrypt the ciphertext
    def train(self, cipher_text):
        # Initializes cycle counter
        cycles = 0

        # Generates pool of chromosomes
        chromosomes = []
        for chromosome_iter in range(self.population):
            chromosomes.append(self.getRandomChromosome())

        # Genetic shock trigger variables; shock fires if fitness is
        # stagnant for shock_threshold cycles
        best_fitness = 0
        shock_ticker = 0

        # Starts timer
        start_time = time.time()

        while True:
            # Increments cycle counter
            cycles += 1

            # List of (chromosome, fitness) tuples in order of decreasing fitness
            chromosome_fitness = []

            # Checks all chromosomes to see if the correct one has been found
            for chromosome in chromosomes:
                if len(self.spellchecker.unknown((chromosome.convertText(cipher_text)).split(" "))) == 0:
                    if __VERBOSE__:
                        print("Found key! " + str(chromosome))
                        print("Decrypted text: " + chromosome.convertText(cipher_text))
                        print("")
                    return (chromosome, cycles)

            # Gets fitness of each chromosome and sorts them according to fitness
            for chromosome in chromosomes:
                chromosome_fitness.append((chromosome, self.getFitness(chromosome, cipher_text)))
            chromosome_fitness.sort(key=lambda x: x[1])
            chromosome_fitness.reverse()

            # Checks if max_cycles is exceeded; if so, returns the fittest chromosome
            if cycles >= self.max_cycles:
                print("Best Key: " + str(chromosome_fitness[0][0]))
                print("Decrypted text: " + chromosome_fitness[0][0].convertText(cipher_text))
                print("")
                return (chromosome_fitness[0][0], cycles)

            # Checks if fitness is stagnant
            if chromosome_fitness[0][1] <= best_fitness:
                shock_ticker += 1
            else:
                best_fitness = max(chromosome_fitness[0][1], best_fitness)
                shock_ticker = 0

            # If __VERBOSE__, provide report on the most fit chromosome
            if __VERBOSE__:
                converted_text = chromosome_fitness[0][0].convertText(cipher_text)
                print("Cycle# " + str(cycles))
                print("Best Chromosome: " + str(chromosome_fitness[0][0]))
                print("Fitness: " + str(chromosome_fitness[0][1]))
                print("Shock Ticker: " + str(shock_ticker))
                print("Cycle Time: " + str(time.time() - start_time))
                print("Attempted Decrypt: " + converted_text)
                print("Known words: " + str(self.spellchecker.known(converted_text.split(" "))))
                print("Unknown words: " + str(self.spellchecker.unknown(converted_text.split(" "))))
                print("")
                start_time = time.time()

            # Creates a new chromosomes list
            new_chromosomes = []

            # Copies over elites to new_chromosomes
            for chromosome_iter in range(self.elites):
                new_chromosomes.append(chromosome_fitness[chromosome_iter][0].clone())

            # Creates children in new_chromosomes:
            # performs tournament process to select breeding candidates
            tournament_selections = []
            while len(tournament_selections) < self.children:
                tournament_selections.append(self.tournament(chromosome_fitness))

            # Breeds selected candidates
            while len(tournament_selections) > 0:
                chance = random.random()
                if chance < self.cross_chance and len(tournament_selections) > 1:
                    chromosome_one = tournament_selections.pop()
                    chromosome_two = tournament_selections.pop()
                    crossed_chromosomes = self.crossChromosomes(chromosome_one, chromosome_two)
                    new_chromosomes.append(crossed_chromosomes[0])
                    new_chromosomes.append(crossed_chromosomes[1])
                elif chance < (self.mutation_chance + self.cross_chance):
                    new_chromosomes.append(self.mutateChromosome(tournament_selections.pop()))
                else:
                    new_chromosomes.append(self.getRandomChromosome())

            # Adds random chromosomes to new_chromosomes
            for random_iter in range(self.randoms):
                new_chromosomes.append(self.getRandomChromosome())

            # Checks if genetic shock should be triggered
            if shock_ticker >= self.shock_threshold and self.shock_enabled:
                if __VERBOSE__:
                    print("Triggering genetic shock...\n")

                # Performs genetic shock, culling top 10% of population
                # and mutating all others
                for chromosome_iter in range(len(new_chromosomes)):
                    if self.getFitness(new_chromosomes[chromosome_iter], cipher_text) > .9 * best_fitness:
                        new_chromosomes[chromosome_iter] = self.getRandomChromosome()
                    else:
                        new_chromosomes[chromosome_iter] = self.mutateChromosome(new_chromosomes[chromosome_iter])

                # Resets shock tickers and trackers
                shock_ticker = 0
                best_fitness = 0

            # Shifts new_chromosomes into the gene pool
            chromosomes = new_chromosomes

    # Returns a mutated chromosome
    def mutateChromosome(self, chromosome):
        new_chromosome = chromosome.clone()

        # Chooses two mappings to swap
        mutation_one_index = random.randint(0, 25)
        mutation_two_index = random.randint(0, 25)
        while mutation_two_index == mutation_one_index:
            mutation_two_index = random.randint(0, 25)

        mutation_one = new_chromosome.mappings[mutation_one_index]
        mutation_two = new_chromosome.mappings[mutation_two_index]

        new_chromosome.removeMapping(mutation_one)
        new_chromosome.removeMapping(mutation_two)

        mapping_one = (mutation_one[0], mutation_two[1])
        mapping_two = (mutation_two[0], mutation_one[1])

        new_chromosome.addMapping(mapping_one)
        new_chromosome.addMapping(mapping_two)

        return new_chromosome

    # Takes two chromosomes and returns two crosses of those chromosomes
    # in the format (new_chromosome_one, new_chromosome_two)
    def crossChromosomes(self, chromosome_one, chromosome_two):
        new_chromosome_one = chromosome_one.clone()
        new_chromosome_two = chromosome_two.clone()

        for chromosome_iter in range(26):
            if random.random() > .5:
                old_mapping_one = new_chromosome_one.mappings[chromosome_iter]
                old_mapping_two = new_chromosome_two.mappings[chromosome_iter]

                if old_mapping_one != old_mapping_two:
                    complement_mapping_one = new_chromosome_one.getMappingTarget(old_mapping_two[1])
                    complement_mapping_two = new_chromosome_two.getMappingTarget(old_mapping_one[1])

                    old_origin_one = complement_mapping_one[0]
                    old_origin_two = complement_mapping_two[0]

                    new_chromosome_one.removeMapping(complement_mapping_one)
                    new_chromosome_two.removeMapping(complement_mapping_two)
                    new_chromosome_one.removeMapping(old_mapping_one)
                    new_chromosome_two.removeMapping(old_mapping_two)

                    complement_mapping_one = (old_origin_two, complement_mapping_one[1])
                    complement_mapping_two = (old_origin_one, complement_mapping_two[1])

                    new_chromosome_one.addMapping(old_mapping_two)
                    new_chromosome_one.addMapping(complement_mapping_two)
                    new_chromosome_two.addMapping(old_mapping_one)
                    new_chromosome_two.addMapping(complement_mapping_one)

        return (new_chromosome_one, new_chromosome_two)

    # Returns a new random chromosome
    def getRandomChromosome(self):
        new_chromosome = Chromosome()

        origin = []
        destination = []
        for letterIter in range(26):
            origin.append(chr(letterIter + 97))
            destination.append(chr(letterIter + 97))
        random.shuffle(destination)

        for mappingIter in range(26):
            new_chromosome.addMapping((origin[mappingIter], destination[mappingIter]))

        return new_chromosome

    # Performs a tournament selection of chromosomes based on tournament_size
    def tournament(self, chromosome_fitness):
        tournament_pool = []
        for tournament_iter in range(self.tournament_size):
            tournament_pool.append(chromosome_fitness[random.randint(0, self.population - 1)])
        return (max(tournament_pool, key=lambda x: x[1]))[0].clone()

    # Takes a chromosome and cipher_text and evaluates the chromosome's fitness
    def getFitness(self, chromosome, cipher_text):
        total_fitness = 0
        parsed_block_table = self.getBlockTable(chromosome.convertText(cipher_text))
        for block in parsed_block_table.keys():
            if block in self.sample_block_table.keys():
                total_fitness += math.log(self.sample_block_table[block], 2) * parsed_block_table[block]
        return total_fitness

    """BLOCK FUNCTIONS"""

    # Returns the blocks located in the passed samples path
    def getSampleBlockTable(self, sample_path):
        # Opens input file
        input_file = open(sample_path)
        block_table = {}

        for line in input_file:
            components = line.split(" ")
            # Strips the trailing newline from the count before parsing it
            components[1] = int(components[1][0:len(components[1]) - 1])
            block_table[components[0]] = components[1]

        input_file.close()
        return block_table

    # Takes a string and returns a hash table of blocks
    def getBlockTable(self, input_string):
        block_table = {}
        input_words = input_string.split(" ")

        # Hashes blocks in a dictionary to count them
        for word in input_words:
            word_blocks = self.getBlocks(word)
            for block in word_blocks:
                if block in block_table:
                    block_table[block] += 1
                else:
                    block_table[block] = 1

        return block_table

    # Returns all substrings of a passed string
    def getBlocks(self, input_string):
        blocks = []
        for block_len in range(len(input_string)):
            start_point = 0
            end_point = block_len + 1
            while end_point <= len(input_string):
                blocks.append(input_string[start_point:end_point])
                end_point += 1
                start_point += 1
        return blocks
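# Usage sketch for Incubator (paths and parameters are illustrative;
# assumes the Chromosome class and the __VERBOSE__ flag defined alongside
# this class, plus a samples file of "block count" lines):
incubator = Incubator(sample_path="samples.txt", words_path="words.txt",
                      elites=2, children=16, randoms=2, tournament_size=4,
                      cross_chance=0.8, mutation_chance=0.2,
                      shock_value=10, max_cycles=500)
best_chromosome, cycles = incubator.train("wkh txlfn eurzq ira")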
def _are_words_percentage(self, text):
    spell = SpellChecker()
    # split_words tokenizes the text; known() keeps the dictionary words
    words = spell.split_words(text)
    correct_words = spell.known(words)
    return (len(correct_words) / len(words)) * 100
start = word_id.find(token)
if start == -1:
    continue
else:
    end = start + len(token)
    check = word_id[end + 1]
    if check == "e":
        spell = SpellChecker()
        # known() expects an iterable of words, so wrap the token in a list
        if spell.known([token]):
            testimonial = TextBlob(token)
            polarity = testimonial.sentiment.polarity
            # Bucket the sentiment polarity into pos/neu/neg scores
            if polarity >= .2:
                pos_score += 30
            elif -.2 < polarity < .2:
                neu_score += 20
            else:
                neg_score += 10
    else:
x = vectorizer.transform(sentences_predict2)
pred = classifier.predict(x)
count_unknown = list(pred).count('incorrect')
print('The solution of the above problem looks like this:')
print('1. No. of incorrect sentences in the essay:\n', count_unknown)
count_known = len(pred) - count_unknown
percent_accuracy_sent = (count_known / len(pred)) * 100

# Saving pickle file:
# save_classifier = open("LR.pickle", "wb")
# pickle.dump(classifier, save_classifier)
# save_classifier.close()

# Tokenize the essay to check whether all spellings are correct
split_word_text = word_tokenize(sentences_predict)
spell = SpellChecker()
misspelled = split_word_text
# Words found in the dictionary
word = spell.known(misspelled)
# Words not in the dictionary
word2 = spell.unknown(misspelled)
print('2. No. of spelling mistakes in the essay:\n', len(word2))
percent_accuracy_word = (len(word) / (len(word) + len(word2))) * 100
print('Accuracy of essay:',
      (percent_accuracy_sent + percent_accuracy_word) / 2)
    used_chars = []
    for char in set(char_list):
        sub_list = (char_list[:char_list.index(char)]
                    + char_list[char_list.index(char) + 1:])
        for used_char in used_chars:
            sub_list = sub_list.replace(used_char, '')
        for clist in generate_sub_list(sub_list, size - 1):
            sub_lists.append([char] + clist)
        used_chars.append(char)
    return sub_lists


def shuffle_all(char_list, min_n=2, max_n=None):
    words = []
    if max_n is None:
        max_n = len(char_list)
    for i in range(min_n, max_n + 1):
        sub_char_lists = generate_sub_list(char_list, i)
        for sub_char_list in sub_char_lists:
            for word in [''.join(chars) for chars in shuffle(sub_char_list)]:
                words.append(word)
    return words


words = shuffle_all('igbnne', 4, 7)
for word in sorted(words):
    for i in range(4, 7):
        if (
            len(word) == i
            # and word[-2] == 'e'
            # and word[1] == 'r'
        ):
            if spell.known([word]):
                print(word)