def spellCheck(string_to_be_checked):
    """Replace misspelled words in *string_to_be_checked* with corrections.

    Splits the input on single spaces, collects the words the checker does
    not know, and replaces the first occurrence of each with its most
    likely correction.  Words with no correction are left untouched.

    :param string_to_be_checked: raw input text
    :return: the text with misspellings corrected where possible
    """
    spell = SpellChecker()
    # Unknown words are collected BEFORE the custom words are loaded, so the
    # custom words below only affect which corrections are considered valid.
    misspelled = spell.unknown(string_to_be_checked.split(" "))
    # Teach the checker slang/brand words so they survive as corrections.
    spell.word_frequency.load_words(
        ['f**k', 'f****d', 'damm', 'lmaof', 'pissed', 'google'])
    for word in misspelled:
        if word == '':
            continue
        corrected_word = spell.correction(word)
        # correction() may yield None (or '') when no candidate exists.
        if corrected_word is None or corrected_word == '':
            continue
        string_to_be_checked = string_to_be_checked.replace(
            word, corrected_word, 1)
    return string_to_be_checked
示例#2
0
class DictionaryChecker:
    """Thin wrapper around SpellChecker for dictionary-membership queries."""

    def __init__(self):
        # Default (English) frequency dictionary.
        self.dict = SpellChecker()

    def check(self, word):
        """Return True if *word* (case-insensitively) is a known word."""
        # known() returns the subset of inputs found in the dictionary;
        # a non-empty result means the word is real.
        return len(self.dict.known([word.lower()])) > 0

    def getNumRealWords(self, wordlist):
        """Return how many distinct words in *wordlist* are dictionary words.

        Note: known() returns a set, so duplicates in *wordlist* (and words
        differing only by case) are counted once.
        """
        wordlistl = [x.lower() for x in wordlist]
        return len(self.dict.known(wordlistl))
class SpellProcessor:
    """Two-level spell corrector.

    Level one (``spell_checker``) recognizes words from a large language
    dictionary; level two (``word_corrector``) corrects words against a
    smaller, self-maintained products dictionary whose frequencies are
    updated as new valid words are seen.
    """

    def __init__(self,
                 first_level_dict='dictionaries/ru_full.txt',
                 second_level_dict='dictionaries/products.txt'):
        # First level: general-language dictionary used only to decide
        # whether a word is "real" (and therefore worth remembering).
        self.first_level_dict = first_level_dict
        self.spell_checker = SpellChecker()
        self.spell_checker.word_frequency.load_text_file(self.first_level_dict)

        # Second level: domain (products) dictionary used for the actual
        # corrections; created empty on first run.
        self.second_level_dict = second_level_dict
        self.word_corrector = SpellChecker()

        if not os.path.exists(second_level_dict):
            with open(second_level_dict, 'w'):
                pass
        self.word_corrector.word_frequency.load_text_file(
            self.second_level_dict)

    async def write_to_dict(self, correct_words):
        """Persist *correct_words* into the second-level dictionary file.

        The file format appears to be one "word count" pair per line.
        NOTE(review): declared ``async`` but performs only blocking file
        I/O — there is no await point.  Also, ``text.find(word)`` can match
        *word* as a substring of a longer word — TODO confirm entries are
        unambiguous.
        """
        with open(self.second_level_dict, "r+") as second_dict_file:
            text = second_dict_file.read()
            for word in correct_words:
                word_pos = text.find(word)

                if not word_pos == -1:
                    # Word already present: bump its usage count in place.
                    # The count sits between "word " and the next newline.
                    end_pos = text.find('\n', word_pos)
                    number = int(text[word_pos + len(word) + 1:end_pos])

                    text = "".join((text[:word_pos + len(word) + 1],
                                    str(number + 1), text[end_pos:]))
                else:
                    # New word: append with an initial count of 1.
                    text += (word + " 1" + "\n")

        with open(self.second_level_dict, "w") as second_dict_file:
            # Rewrite the whole file with the updated contents.
            #second_dict_file.truncate()
            second_dict_file.write(text)

    def correct(self, products):
        """Correct each product name in *products* (list of strings).

        Words longer than 2 characters that the first-level checker knows
        are also recorded into the products dictionary for future runs.

        :return: list of corrected product strings, same order as input.
        """
        correct_words = []  # This word will be saved to products dictionary
        result_list = []

        for product in products:

            words = product.split(' ')
            fixed_words = []
            for word in words:
                if len(word) > 2 and len(self.spell_checker.known([word
                                                                   ])) == 1:
                    correct_words.append(word)

                # NOTE(review): correction() may return None for hopeless
                # words, which would put None into the join — confirm inputs.
                fixed_words.append(self.word_corrector.correction(word))

            result_list.append(' '.join(fixed_words))

        # Deduplicate before persisting.
        correct_words = np.unique(np.array(correct_words))
        asyncio.run(self.write_to_dict(correct_words))
        return result_list
class SpellCheckerML:
    """Context-aware spell corrector.

    Combines dictionary-based candidates from SpellChecker with next-word
    predictions from the `autocomplete` language model, preferring
    candidates the model considers likely after *previous_word*.
    """

    def __init__(self):
        self.spell_checker = SpellChecker()
        self.autocomplete = autocomplete
        self.autocomplete.load()

    def train(self, text, model_name=''):
        """(Re)train the autocomplete models on *text* and reload them.

        :param model_name: optional model identifier; empty string trains
            the default model.
        """
        if model_name == '':
            self.autocomplete.models.train_models(text, model_name=False)
        else:
            self.autocomplete.models.train_models(text, model_name=model_name)
        self.autocomplete.load()

    def correction(self, previous_word, word):
        """Return the best correction for *word* following *previous_word*.

        Known words are returned unchanged.  Otherwise, dictionary
        candidates that the language model also predicts are ranked by the
        model's score; if none overlap, a random dictionary candidate is
        returned.
        """
        if self.spell_checker.known([word]):
            return word
        spell_checker_candidates = self.spell_checker.candidates(word)
        autocomplete_predictions = self.autocomplete.predict(previous_word, word[0])
        autocomplete_candidates = [elem[0] for elem in autocomplete_predictions]
        best_choices = []
        for candidate in spell_checker_candidates:
            try:
                candidate_index = autocomplete_candidates.index(candidate)
            except ValueError:
                # Candidate not among the model's predictions; skip it.
                # (Was a bare `except:`, which also hid real errors.)
                continue
            best_choices.append(autocomplete_predictions[candidate_index])
        if best_choices:
            # Highest model score wins; predictions are (word, score) pairs.
            best_choices.sort(key=lambda t: t[1])
            return best_choices[-1][0]
        return random.choice(list(spell_checker_candidates))
示例#5
0
def spell_correct(token):
    """Return *token* unchanged if it is a known word, else its correction.

    :param token: a single word (string)
    :return: *token* or the checker's most likely correction for it
    """
    spell = SpellChecker()

    # known() takes an iterable of words, so the single token must be
    # wrapped in a list.  (Passing the bare string iterated over its
    # characters, which made almost any word look "known".)
    if spell.known([token]):
        return token
    return spell.correction(token)
def get_valid_words(digits):
    """Return the dictionary words a T9 digit sequence could spell."""
    # Letters for each digit above 1 (0 and 1 carry no letters on a keypad).
    letter_groups = [t9[int(digit)] for digit in digits if int(digit) > 1]

    # Spell out every possible letter combination as a candidate word.
    candidates = [''.join(combo) for combo in product(*letter_groups)]

    checker = SpellChecker()
    return checker.known(candidates)
示例#7
0
    def test_word_known(self):
        ''' test if the word is a `known` word or not '''
        spell = SpellChecker()

        # Common words must be recognized and echoed back lower-cased.
        for real_word in ('this', 'sherlock', 'holmes', 'known'):
            self.assertEqual(spell.known([real_word]), {real_word})

        # Punctuation, gibberish and typos must come back empty.
        for fake_word in ('-', 'foobar', 'ths', 'ergos'):
            self.assertEqual(spell.known([fake_word]), set())
示例#8
0
def run(project_id, repo_path, cursor, **options):
    """Compute a commit-message quality metric for one project.

    Walks the checked-out repository named by the database row for
    *project_id*, scans `git log` subjects, and computes the ratio of
    "core" words (directory/file names and dictionary words) to the total
    number of commit-message words.

    :param options: must contain 'threshold' (float) to compare against.
    :return: (passed, ratio) tuple.
    """
    num_core_commit_words = 0
    totalNumberOfCommitWords = 0
    # Initialized here so the final return cannot hit a NameError when the
    # repository directory is never found.
    commits_ratio = 0
    cursor.execute(QUERY.format(project_id))
    repoName = cursor.fetchone()[0]
    os.chdir("path/" + str(project_id) + "/")
    for repos in os.listdir():
        if repos == repoName:
            os.chdir(repos)
            Dirs = []
            Files = []
            for (root, dirs, files) in inner_os.walk("", topdown=True):
                # walk() yields LISTS of names; lower-case each entry.
                # (The original called .lower() on the list itself, which
                # raises AttributeError.)
                Dirs.extend(d.lower() for d in dirs)
                Files.extend(f.lower() for f in files)
            stream = inner_os.popen(
                'git log --pretty=format:"%s"').read().split("\n")
            # One checker for all commits (was rebuilt per commit).
            spell = SpellChecker()
            for commits in stream:
                commits = commits.lower()
                for ab in Dirs:
                    if ab in commits:
                        # str.replace returns a new string; keep the result.
                        commits = commits.replace(ab, "")
                        num_core_commit_words += 1
                        totalNumberOfCommitWords += 1
                for ab in Files:
                    if ab in commits:
                        commits = commits.replace(ab, "")
                        num_core_commit_words += 1
                        totalNumberOfCommitWords += 1
                # Count numeric tokens toward the total.
                nr = re.sub("[^0123456789 ]", "", commits)
                nr = ' '.join(nr.split())
                totalNumberOfCommitWords += len(nr.split())
                # Count alphabetic tokens toward the total.
                trim_commit = re.sub("[^a-zA-Z ]+", "", commits)
                trim_commit = ' '.join(trim_commit.split())
                totalNumberOfCommitWords += len(trim_commit.split())
                # Drop 1- and 2-letter tokens before spell checking.
                trim_commit = re.sub(r"\b[a-zA-Z]\b", "", trim_commit)
                trim_commit = re.sub(r"\b[a-zA-Z][a-zA-Z]\b", "", trim_commit)
                spelled = spell.known(trim_commit.split())
                num_core_commit_words += len(spelled)
            print("----- METRIC: COMMIT QUALITY -----")
            if totalNumberOfCommitWords > 0:
                commits_ratio = float(num_core_commit_words) / float(
                    totalNumberOfCommitWords)
                print('core commits ratio: ', commits_ratio)
            break
    threshold = options['threshold']
    return (commits_ratio >= threshold, commits_ratio)
示例#9
0
def spellcheck(input_data: dict, word_list: str) -> None:
    """Check package for spelling errors.

    Loads *word_list* into the checker's dictionary (falling back to the
    installed default list when the file is missing), then spell-checks
    every non-data task in *input_data* via spell_check_task().
    """
    spell = SpellChecker()
    try:
        spell.word_frequency.load_text_file(word_list)
        with open(word_list, 'r') as data_file:
            word_list_data = data_file.read()
    except FileNotFoundError:
        info("Word list not found searching up a directory...")
        # Fall back to the bundled default word list.
        # (Plain string: the original used an f-string with no placeholder.)
        search_path = "/opt/pycep/word_list.txt"
        spell.word_frequency.load_text_file(search_path)
        with open(search_path, 'r') as data_file:
            word_list_data = data_file.read()
    known_data_list = word_list_data.split("\n")
    # NOTE(review): known() only queries and has no side effect; this call
    # looks like a leftover sanity check — confirm it can be removed.
    spell.known(known_data_list)
    task_data = return_non_data_task(input_data)
    for package in task_data:
        for values, lines in task_data[package].items():
            spell_check_task(spell, lines, values, package)
示例#10
0
def decrypt_BF(ciphertext, matchrate=0.8):
    """
    Brute-force decipher for mono-alphabetic substitution ciphers.

    Tries every key (a permutation of the alphabet) and accepts the first
    decryption where at least *matchrate* of the words are dictionary words.

    WARNING: still computationally infeasible in general (26! keys), but
    permutations are now streamed lazily instead of being materialized into
    a list, so it no longer exhausts memory immediately.
    """

    # use a spellchecker to check whether words are in dictionary
    from spellchecker import SpellChecker
    # create an English spell checker
    spell = SpellChecker(language=u'en')

    # set the criterion for the number of matched words
    wordsCount = len(spell.split_words(ciphertext))
    wordsMatchCountMin = int(matchrate * wordsCount)

    # the ciphertext alphabet: 'A'..'Z'
    cipher = [chr(i + ord('A')) for i in range(26)]

    # iterate over all possible permutations lazily (was list(...), which
    # tried to build all 26! entries up front)
    import itertools
    for i, plain in enumerate(itertools.permutations(cipher)):
        # build the decipher table mapping cipher letter -> plain letter
        decipherDict = {cipher[seq]: plain[seq] for seq in range(26)}

        # decrypt with the current decipher table
        decrypted = decrypt(ciphertext, decipherDict)
        # split the text into a list of words
        wordsList = spell.split_words(decrypted)

        print(i)
        # check how many decrypted words are real dictionary words
        dictWordsList = spell.known(wordsList)
        if len(dictWordsList) >= wordsMatchCountMin:
            # (fixed: this print referenced an undefined name `shift`)
            print("Find dictionary words at permutation ", i)
            printCipherTable(decipherDict, isInverse=True)
            return decrypted

    print("All trials failed")
    return ""
示例#11
0
    def spellCorrectBackupBaseline(self, check_str):
        """
        Baseline spell checker using the `spellchecker` library.

        Corrects each purely-alphabetic word of *check_str* that the
        checker does not recognize and returns the rejoined string.

        :param check_str: input sentence
        :return: sentence with misspelled words corrected
        """
        print('spellCorrectBackupBaseline called')
        spell = SpellChecker()
        # Register domain words so they count as correctly spelled.
        # (The original called spell.known(...), which only queries the
        # dictionary and has no side effect; load_words() actually adds.)
        spell.word_frequency.load_words(['zwave', 'rheem'])
        splitted = check_str.split()

        for w_ix in range(len(splitted)):
            if splitted[w_ix].isalpha():
                mis_check = list(spell.unknown([splitted[w_ix].lower()]))
                if len(mis_check) == 1:
                    corrected = spell.correction(mis_check[0])
                    # correction() can return None when no candidate exists;
                    # keep the original word in that case.
                    if corrected is not None:
                        splitted[w_ix] = corrected

        final_result = " ".join(splitted)
        return final_result
示例#12
0
def scoreTextForSpellingCorrectness(article):
    """Score *article* by spelling quality, normalized by word count.

    Correctly spelled words add their length to the score (longer words are
    rewarded); each misspelled word subtracts one.  The total is divided by
    the number of words.

    :param article: text to score
    :return: float score; 0 for empty/whitespace-only input
    """
    wordsInArticle = article.split()
    totalWords = len(wordsInArticle)
    # Guard: the original divided by zero on empty input.
    if totalWords == 0:
        return 0
    score = 0
    articleChecker = SpellChecker()
    numIncorrectWords = len(articleChecker.unknown(wordsInArticle))
    correctlySpelledWords = articleChecker.known(wordsInArticle)
    for word in correctlySpelledWords:
        score += len(word)  # Reward a text for having longer words
    score -= numIncorrectWords
    score /= totalWords
    return score
示例#13
0
    def test_large_words(self):
        ''' test checking for words that are clearly larger than the largest dictionary word '''
        spell = SpellChecker(language=None, distance=2)
        spell.word_frequency.add('Bob')

        # Case-insensitive lookups against the single dictionary entry.
        self.assertEqual(spell.unknown(['Bb', 'bb', 'BB']), {'bb'})
        self.assertEqual(spell.known(['BOB', 'bOb']), {'bob'})

        # Typos within edit distance of the entry all correct to it.
        for near_miss in ('bobs', 'bobb', 'bobby'):
            self.assertEqual(spell.correction(near_miss), 'bob')
        self.assertEqual(spell.word_frequency.longest_word_length, 3)
        # Too long relative to the longest dictionary word: returned as-is.
        self.assertEqual(spell.correction('bobbys'), 'bobbys')
class SpellingSuggestor(object):
    """Suggest spellings for a word, including two-word splits of it."""

    def __init__(self, word):
        self.word = word
        self.spell = SpellChecker()

    def pre_process(self):
        """Replace underscores, dashes and other punctuation by spaces."""
        return re.sub(r'([^\s\w]|_|-)+', ' ', self.word)

    def reduce_lengthening(self):
        """Collapse letters repeated more than twice (e.g. 'sooo' -> 'soo')."""
        pattern = re.compile(r"(.)\1{2,}")
        return pattern.sub(r"\1\1", self.word)

    def spell_checker_result(self):
        """Main method: clean the word and return a set of suggestions.

        A known word is returned as a one-element list.  Otherwise, the word
        is split at every position and known corrections of both halves are
        combined into candidate phrases.
        """
        self.word = self.pre_process()
        self.word = self.reduce_lengthening().lower()
        print("word after cleaning ", self.word)
        misspelled = self.spell.unknown([self.word])
        if len(misspelled) == 0:
            return [self.word]
        result = set()
        for i in range(1, len(self.word)):
            # Candidates for the prefix and for the remainder of the word.
            # Fixed: the suffix was self.word[i] (a single character); the
            # intended remainder is self.word[i:].
            r1 = self.spell.candidates(self.word[:i].strip())
            r2 = self.spell.candidates(self.word[i:].strip())
            r1 = self.spell.known(r1)
            r2 = self.spell.known(r2)
            if len(r1) > 0 and len(r2) > 0:
                try:
                    for v1 in r1:
                        result.add(v1)
                        for v2 in r2:
                            if len(v2) > 2:
                                result.add(v2)
                                result.add(v1 + " " + v2)
                except Exception as ex:
                    print("some error", ex)
        return result
示例#15
0
    def test_capitalization_when_case_sensitive_defaults_to_false(self):
        ''' test that capitalization doesn't affect in comparisons '''
        spell = SpellChecker(language=None)
        # 'Bob' is added TWICE on purpose: its frequency (2) beats 'Bab' (1)
        # and decides the correction tie-break at the end of this test.
        spell.word_frequency.add('Bob')
        spell.word_frequency.add('Bob')
        spell.word_frequency.add('Bab')
        # Membership ignores case by default.
        self.assertEqual('Bob' in spell, True)
        self.assertEqual('BOb' in spell, True)
        self.assertEqual('BOB' in spell, True)
        self.assertEqual('bob' in spell, True)

        words = ['Bb', 'bb', 'BB']
        # unknown()/known() normalize to lower case before comparing.
        self.assertEqual(spell.unknown(words), {'bb'})

        known_words = ['BOB', 'bOb']
        self.assertEqual(spell.known(known_words), {'bob'})

        self.assertEqual(spell.candidates('BB'), {'bob', 'bab'})
        # 'bob' wins the tie because it was added twice (higher frequency).
        self.assertEqual(spell.correction('BB'), 'bob')
示例#16
0
    def test_capitalization_when_case_sensitive_true(self):
        ''' test that capitalization affects comparisons '''
        spell = SpellChecker(language=None, case_sensitive=True)
        spell.word_frequency.add('Bob')
        # Only the exact casing that was added counts as a member.
        self.assertEqual('Bob' in spell, True)
        self.assertEqual('BOb' in spell, False)
        self.assertEqual('BOB' in spell, False)
        self.assertEqual('bob' in spell, False)

        words = ['Bb', 'bb', 'BB']
        # With case sensitivity on, no variant matches 'Bob' exactly.
        self.assertEqual(spell.unknown(words), {'Bb', 'bb', 'BB'})

        case_variant_words = ['BOB', 'bOb']
        self.assertEqual(spell.known(case_variant_words), set())

        # Corrections still find 'Bob' within edit distance, preserving its
        # stored capitalization in the result.
        self.assertEqual(spell.candidates('Bb'), {'Bob'})
        self.assertEqual(spell.candidates('bob'), {'Bob'})
        self.assertEqual(spell.correction('Bb'), 'Bob')
        self.assertEqual(spell.correction('bob'), 'Bob')
        self.assertEqual(spell.unknown(['bob']), {'bob'})
示例#17
0
class SpellCheck:
    """Lazy wrapper around pyspellchecker to correct or filter sentences."""

    def __init__(self):
        # Created lazily on first use so importing this module stays cheap.
        self.spell = None

    def _ensure_spell(self):
        """Instantiate the SpellChecker the first time it is needed."""
        if self.spell is None:
            from spellchecker import SpellChecker
            self.spell = SpellChecker()

    def spell_correct(self, x):
        """
        Given a sentence x, check whether each word is misspelled and
        replace misspelled words with their most likely correction.

        :param x: sentence to correct
        :return: corrected sentence (words re-joined with single spaces)
        """
        self._ensure_spell()
        word_list = word_tokenize(x)
        misspelled = self.spell.unknown(word_list)
        # correction() may return None; keep the original word in that case.
        corrected_words_dict = {
            word: (self.spell.correction(word) or word) for word in misspelled
        }
        word_corrected = [corrected_words_dict.get(w, w) for w in word_list]
        return ' '.join(word_corrected)

    def spell_check(self, x):
        """
        Given a sentence x, return the same sentence with all misspelled
        words removed.

        Notice this function doesn't correct any misspelled words, it just
        filters them out. If you want to correct those words, use
        spell_correct instead.
        """
        self._ensure_spell()
        word_list = word_tokenize(x)
        correct_words = self.spell.known(word_list)
        word_list_filtered = [w for w in word_list if w in correct_words]
        return ' '.join(word_list_filtered)
示例#18
0
def suggest():
    """Return JSON listing dictionary words formed from the given letters."""
    ss = request.args.get('letters', default='hello')

    # support up to 7 letters
    ss = ss[:7]
    print(ss)

    # Build every permutation of length 3..len as a candidate word.
    perms = []
    for length in range(3, len(ss) + 1):
        for combo in permutations(list(ss), length):
            perms.append(''.join(combo))

    checker = SpellChecker(distance=1)
    found = sorted(list(checker.known(perms)))
    rmap = {"letters": ss, "words": found}
    jd = json.dumps(rmap)
    print(jd)
    return jd
示例#19
0
    def searchByText(cls, name, page_num):
        """Search records whose description matches *name*, with spelling
        fallback.

        First tries a case-insensitive LIKE search with the raw text; if
        nothing matches, spell-corrects the query (Portuguese dictionary)
        word by word and retries.

        NOTE(review): if every token is known — or the loop finishes without
        hitting the else-branch — this method falls off the end and returns
        None; confirm callers handle that.  Also, correctWord concatenates
        tokens without spaces between them — verify that is intended.
        """
        nameRevised = '%{}%'.format(name)
        check = cls.query.filter(cls.desc.ilike(nameRevised)).first()
        if check:
            # Exact (LIKE) hit: return the paginated results directly.
            resultRev = cls.query.filter(cls.desc.ilike(nameRevised)).paginate(
                per_page=4, page=page_num)
            return resultRev
        else:
            tokenized = name.split()
            spell = SpellChecker(language="pt")
            correctWord = ''

            for i in range(len(tokenized)):
                size = len(spell.known([tokenized[i]]))
                if size > 0:
                    # Word is already valid: keep it as-is.
                    correctWord += f"{tokenized[i]}"
                else:
                    # Misspelled: append the correction and search
                    # immediately (returns on the FIRST corrected word).
                    correctWord += f"{spell.correction(tokenized[i])}"
                    nameCorrected = '%{}%'.format(correctWord)
                    result = cls.query.filter(
                        cls.desc.ilike(nameCorrected)).paginate(per_page=3,
                                                                page=page_num)
                    return result
    def xor_strings(self):
        """Break single-byte-XOR "encrypted" hex strings from self.file_name.

        For each hex-encoded line, tries all 256 single-byte keys, ranks the
        decryptions by character frequency, then re-scores the top 10 by how
        many real dictionary words they contain, and finally prints the best
        decryption across all lines.
        """
        puntaje_final_strings = []
        file_read = open(self.file_name, "r")
        out = open("salida", "w")
        spell = SpellChecker()
        for line in file_read:
            cadena_hex = line.rstrip("\n")
            # Python 3 port: str has no .decode("hex"); use bytes.fromhex.
            cadena = bytes.fromhex(cadena_hex)
            print(cadena)
            res = ''
            puntaje_final = []
            for i in range(256):
                # XOR every byte with candidate key i (iterating bytes
                # yields ints, so no ord() is needed).
                for j in cadena:
                    res += chr(j ^ i)
                puntaje_actual = analiza_frecuencia(res)
                puntaje_final.append((puntaje_actual, res, i))
                res = ''
            # Keep the 10 highest-scoring decryptions for this line.
            ult = heapq.nlargest(10, puntaje_final)

            for i in range(10):
                # Score is reset per candidate (the original accumulated the
                # score across all 10 candidates by mistake).
                puntaje_actual = 0
                palabras = ult[i][1].split(' ')
                mejor_palabra = spell.known(palabras)
                if mejor_palabra:
                    for palabra in mejor_palabra:
                        puntaje_actual += spell.word_probability(palabra)
                    puntaje_final_strings.append(
                        (puntaje_actual, ult[i][1], ult[i][2]))

        # Best candidate overall, by dictionary-word probability.
        ult = heapq.nlargest(1, puntaje_final_strings)
        print(ult[0][1])
        file_read.close()
        out.close()
示例#21
0
class Corrector:
    """Spelling corrector for texts, backed by the `spellchecker` library.

    Wraps a `SpellChecker` instance (attribute ``corrector``) configured for
    a given language, with an optional user-supplied dictionary and a
    configurable maximum Levenshtein distance, plus a tokenizer used to
    split and re-join texts.
    """

    def __init__(self,
                 lenguaje,
                 diccionario=None,
                 distancia=2,
                 tokenizador=None):
        """Default constructor of the `Corrector` class, which performs \
        spelling correction on texts.

        :param lenguaje: Language of the texts the corrector will be \
            applied to.
        :type lenguaje: str
        :param diccionario: Dictionary (or string with the location of the \
            JSON file containing it), or list, used to modify and add \
            words. If it is a list, it holds the words to be considered \
            valid. If it is a dict, its keys are the words to be considered \
            valid and its values are the words' frequencies, used as a \
            tie-breaker when a misspelled word has more than one correction \
            candidate. If `None`, the default `spellchecker` dictionary for \
            the language is loaded.
        :type diccionario: dict, list, str, optional
        :param distancia: Maximum Levenshtein distance allowed between an \
            unknown word and dictionary words when looking for correction \
            candidates. Default `2`.
        :type distancia: int
        :param tokenizador: Object in charge of tokenizing and \
            detokenizing texts. If `None`, a `TokenizadorNLTK` instance is \
            loaded by default.
        :type tokenizador: object, optional
        """
        # Set the language of the spelling corrector.
        self.establecer_lenguaje(lenguaje)
        # Initialize the underlying checker.
        self.iniciar_corrector(diccionario)
        self.establecer_distancia(distancia)
        self.tokenizador = (TokenizadorNLTK()
                            if tokenizador is None else tokenizador)

    def establecer_lenguaje(self, lenguaje):
        """Define or change the language of the texts this `Corrector` \
        will be applied to.

        :param lenguaje: Language of the texts to be spell-corrected.
        :type lenguaje: str
        """
        self.lenguaje = definir_lenguaje(lenguaje)

    def iniciar_corrector(self, diccionario):
        """Initialize the `SpellChecker` object for the previously defined \
        language and assign it to this object's ``corrector`` attribute.

        :param diccionario: Dictionary (or path to a JSON file containing \
            one), or list, used to modify and add valid words; see the \
            constructor for the full description.
        :type diccionario: dict, list, str, optional
        """
        self.corrector = None
        if self.lenguaje is not None:
            if isinstance(diccionario, str):
                # A string is taken as the path to a local dictionary file.
                self.corrector = SpellChecker(local_dictionary=diccionario)
            elif type(diccionario) in [dict, list]:
                self.corrector = SpellChecker(language=self.lenguaje)
                self.actualizar_diccionario(diccionario)
            else:
                self.corrector = SpellChecker(language=self.lenguaje)

    def establecer_distancia(self, distancia):
        """Set the maximum Levenshtein distance the correction algorithm \
        uses when deciding whether an unknown word has candidates.

        :param distancia: Maximum Levenshtein distance between an unknown \
            word and dictionary words. Default `2`.
        :type distancia: int
        """
        if self.corrector is not None:
            self.corrector.distance = distancia

    def actualizar_diccionario(self, diccionario):
        """Update the dictionary of valid words used for correction. Words \
        in *diccionario* are added (or their frequencies updated) in the \
        corrector's existing dictionary.

        :param diccionario: Dictionary (or path to a JSON file containing \
            one), or list of words; see the constructor for the full \
            description.
        :type diccionario: dict, list, str, optional
        """
        if isinstance(diccionario, str):
            diccionario = json.load(open(diccionario))
        if isinstance(diccionario, list):
            diccionario = [palabra.lower() for palabra in diccionario]
            self.corrector.word_frequency.load_words(diccionario)
        elif isinstance(diccionario, dict):
            # Remove existing entries first so the given frequencies replace
            # (rather than add to) any previous counts.
            self.quitar_palabras(list(diccionario.keys()))
            for key in diccionario.keys():
                # Loading a word N times sets its frequency to N.
                self.corrector.word_frequency.load_words([key.lower()] *
                                                         diccionario[key])
        else:
            pass

    def quitar_palabras(self, palabras):
        """Remove one or more words from the corrector's dictionary so they \
        are no longer recognized as valid during correction.

        :param palabras: Word or list of words to remove from this \
            `Corrector`'s dictionary.
        :type palabras: str, list
        """
        if isinstance(palabras, str):
            palabras = [palabras]
        # Drop words that are not in the dictionary to begin with.
        palabras = [p for p in palabras if self.frecuencia_palabra(p) > 0]
        if len(palabras) > 0:
            self.corrector.word_frequency.remove_words(palabras)

    def agregar_palabras(self, palabras):
        """Add one or more words to the corrector's dictionary so they are \
        recognized as valid during correction.

        :param palabras: Word or list of words to add to this `Corrector`'s \
            dictionary.
        :type palabras: str, list
        """
        if isinstance(palabras, str):
            palabras = [palabras]
        self.actualizar_diccionario(palabras)

    def palabras_conocidas(self, texto):
        """Return the set of words in *texto* that are recognized (present \
        in the corrector's dictionary).

        :param texto: Text whose known words are wanted.
        :type texto: str
        :return: (set) Known words present in the input text.
        """
        tokens = self.tokenizador.tokenizar(texto)
        return self.corrector.known(tokens)

    def palabras_desconocidas(self, texto):
        """Return the set of words in *texto* that are NOT in the \
        corrector's dictionary and therefore not recognized.

        :param texto: Text whose unknown words are wanted.
        :type texto: str
        :return: (set) Unknown words present in the input text.
        """
        tokens = self.tokenizador.tokenizar(texto)
        return self.corrector.unknown(tokens)

    def palabras_candidatas(self, palabra):
        """Return the set of candidate words that could correct *palabra*. \
        If the input word is correct, or has no candidate within the \
        configured distance, the input word itself is returned.

        :param palabra: Word whose correction candidates are wanted.
        :type palabra: str
        :return: (set) Candidate words for correcting the input word.
        """
        return self.corrector.candidates(palabra)

    def frecuencia_palabra(self, palabra):
        """Return the frequency of *palabra* according to the corrector's \
        dictionary; 0 if the word is unknown.

        :param palabra: Word whose dictionary frequency is wanted.
        :type palabra: str
        :return: (int) Frequency (>= 0) of the queried word.
        """
        return self.corrector[palabra]

    def probabilidad_palabra(self, palabra):
        """Return the probability of *palabra*: its frequency divided by \
        the sum of all word frequencies in the corrector's dictionary; 0 \
        if the word is unknown.

        :param palabra: Word whose probability is wanted.
        :type palabra: str
        :return: (float) Probability, between 0 and 1, of the word.
        """
        return self.corrector.word_probability(palabra)

    def correccion_ortografia(self, texto, limpieza=False):
        """Spell-correct an input text: unknown words are replaced by their \
        most frequent/probable candidate, provided at least one candidate \
        exists within the allowed Levenshtein distance.

        :param texto: Text to spell-correct.
        :param limpieza: Whether to apply basic cleaning (the `limpieza` \
            module's `limpieza_basica`) before correcting. Default `False`.
        :type limpieza: bool, optional
        :return: (str) Input text after spelling correction.
        """
        if limpieza:
            # Basic text cleaning so punctuation etc. does not skew results.
            texto = limpieza_basica(texto, quitar_numeros=False)
        lista_palabras = self.tokenizador.tokenizar(texto)
        desconocidas = self.corrector.unknown(lista_palabras)
        # Single-character tokens are left as-is.
        texto_corregido = [
            self.corrector.correction(p)
            if len(p) > 1 and p in desconocidas else p for p in lista_palabras
        ]
        return self.tokenizador.destokenizar(texto_corregido)
示例#22
0
class Neuron:
    """
    Main processing object.

    sugaroid.brain.Neuron performs the first-pass classification of user
    input: time queries, arithmetic expressions, or free-form text that
    is (optionally) spell-corrected before being matched by the bot.
    """
    def __init__(self, bot):
        # Keep a handle to the chatbot; responses are generated through it.
        self.bot = bot
        if self.bot.spell_checker:
            from spellchecker import SpellChecker
            # distance=1 keeps corrections conservative (single-edit only)
            self.spell = SpellChecker(distance=1)
            # some privileges only for the creator
            self.spell.known(
                ['Sugaroid', 'Sugarlabs', "sugar", 'Srevin', 'Saju'])

        logging.info("Sugaroid Neuron Loaded to memory")

    def parse(self, var):
        """Classify *var* and return the bot's response string."""
        if not var or var.isspace():
            # Also guard the empty string: ''.isspace() is False, so the
            # original check let empty input fall through to the matcher.
            return 'Type something to begin'
        if 'time ' in var:
            response = self.time()
        else:

            for i in ARITHMETIC:
                if i in var:
                    response = self.alu(self.normalize(var))
                    if str(response).strip() == '-':
                        pass
                    elif response:
                        break
            else:
                # for/else: reached only when no arithmetic token caused a
                # break above, i.e. the input is ordinary conversation.
                if self.bot.spell_checker:
                    corrected = []
                    for token in var.split(' '):
                        # correction() returns None when it has no candidate;
                        # fall back to the original token so the join below
                        # cannot raise TypeError.
                        corrected.append(self.spell.correction(token) or token)
                    response = self.gen_best_match(' '.join(corrected))
                else:
                    response = self.gen_best_match(preprocess(var))

        return response

    def alu(self, var):
        """Join tokenized input and attempt arithmetic evaluation."""
        conversation = ' '.join(var)
        return self.gen_arithmetic(conversation)

    def time(self):
        """Return the current-time response."""
        return self.gen_time()

    def gen_best_match(self, parsed):
        """Delegate response generation to the underlying chatbot."""
        return self.bot.get_response(parsed)

    @staticmethod
    def gen_time():
        """Return a human-readable string with the current local time."""
        return 'The current time is {}'.format(
            strftime("%a, %d %b %Y %H:%M:%S", localtime()))

    def gen_arithmetic(self, parsed):
        """Evaluate *parsed* mathematically; return False on any failure."""
        try:
            me = MathematicalEvaluation(self.bot)
            return me.process(Statement(parsed))
        except Exception:
            # Any failure (bad expression, adapter error) means "not math".
            return False

    @staticmethod
    def normalize(text):
        """Tokenize *text* into words and punctuation for the ALU."""
        return WordPunctTokenizer().tokenize(text)
示例#23
0
''' Take a jumbled-up word as input and print every real English word
 that can be formed using all of its letters. '''

from spellchecker import SpellChecker
from nltk.corpus import words
from itertools import permutations
spell = SpellChecker()
input_string = input("Enter an anagram of a word\n")
# A set of distinct orderings avoids re-checking duplicate permutations
# that arise from repeated letters.
candidates = {''.join(p) for p in permutations(input_string)}
possibilities = spell.known(candidates)
print("The possible real word/words are:")
# Build the corpus membership set ONCE: testing `i in words.words()`
# inside the loop is a linear scan of the whole corpus per candidate.
corpus_words = set(words.words())
for candidate in possibilities:
    if candidate in corpus_words:
        print(candidate)
示例#24
0
 def test_capitalization_when_language_set(self):
     """Known-word lookup ignores case when a language is specified."""
     checker = SpellChecker(language="en")
     self.assertEqual(checker.known(['Bike']), {'bike'})
示例#25
0
def Spell_check(word_token):
    """Return the subset of *word_token* recognised by the spell checker.

    :param word_token: iterable of word strings to look up
    :return: set of words known to the checker (lower-cased)
    """
    spell = SpellChecker(distance=2)
    # The previous spell.unknown(word_token) call discarded its result and
    # had no side effects, so it has been removed.
    return spell.known(word_token)
示例#26
0
            accepted = True
        if len(sentence) < 50 or len(sentence) > 300:
            accepted = False
        if not full_stop:
            accepted = False
        count += 1

    return sentence


# Generate a raw sentence around the seed word, then spell-correct it
# word by word before printing.
sentence = generate_sentence(forwards_model,
                             backwards_model,
                             seed_word=u" analytic ")
sentence = sentence[1:]  # drop the leading space surrounding the seed
words = sentence.split(' ')
corrected_sentence = []
for word in words:
    # str.split(' ') yields '' (never ' ') for consecutive spaces, so the
    # empty string is the value to skip.
    if not word:
        continue
    corrected_word = spell.correction(word)
    print('word = ', word, ' correction = ', corrected_word, ' probability = ',
          spell.word_probability(corrected_word))
    print('candidates = ', spell.candidates(word))
    if word.isnumeric():
        # Keep numbers verbatim: correction() may alter them or return
        # None, which would break the join below.
        corrected_sentence.append(word)
    elif corrected_word and len(spell.known([corrected_word])) == 1:
        corrected_sentence.append(corrected_word)

# Re-assemble, capitalise the first letter, and fix the pronoun "i".
sentence = ' '.join(corrected_sentence) + '.'
sentence = sentence[0].upper() + sentence[1:]
sentence = sentence.replace(' i ', ' I ')
print(sentence)
class Incubator:
	"""Genetic-algorithm incubator that evolves substitution-cipher keys.

	A pool of Chromosome objects (letter mappings) is evolved against a
	ciphertext; fitness is the log-weighted overlap between letter blocks
	of the attempted decryption and block frequencies from a training
	sample.
	"""
	#Class variables:
	#	sample_block_table: A dictionary containing all blocks in sample_path and their frequency
	#	spellchecker: A pyspellchecker instance with all the words in words_path added
	#	population: Total population of the incubator, indicating how many chromosomes exist at one time
	#	elites: How many elites are carried over for each generation
	#	children: How many children are created for each generation
	#	randoms: How many random chromosomes are added each generation
	#	tournament_size: How many chromosomes are considered in a tournament
	#	cross_chance: Chance of crossing chromosomes when creating a child. cross_chance + mutation_chance should equal one
	#	mutation_chance: Change of mutating a chromosome when creating a child. cross_chance + mutation_chance should equal one
	#	shock_enabled: True if genetic shock enabled, false otherwise
	#	shock_threshold: Number of cycles of fitness stagnation before genetic shock is triggered.
	#	max_cycles: Cycle # at which the simulation terminates
	def __init__(self, sample_path, words_path, elites, children, randoms, tournament_size, cross_chance, mutation_chance, shock_value, max_cycles):
		#Parameters:
			#	sample_path: A path to a samples source file containing all training data to be fed to the incubator
			#	words_path: A path to all words which the cipher_breaker should consider valid in addition
			#		to those already in pyspellchecker.
			#	elites: How many elites are carried over for each generation
			#	children: How many children are created for each generation
			#	randoms: How many random chromosomes are added each generation
			#	tournament_size: How many chromosomes are considered in a tournament
			#	cross_chance: Chance of crossing chromosomes when creating a child. cross_chance + mutation_chance should equal one
			#	mutation_chance: Change of mutating a chromosome when creating a child. cross_chance + mutation_chance should equal one
			#	shock_value: 0 if genetic shock disabled. Otherwise shock is enabled and shock_threshold is set to shock_value
			#	max_cycles: Cycle # at which the simulation terminates

			#Initializes sample_block_tables
			self.sample_block_table = self.getSampleBlockTable(sample_path)

			#Initializes spellchecker
			self.spellchecker = SpellChecker()
			self.spellchecker.word_frequency.load_text_file(words_path)

			#Checks cross_chance and mutation_chance are valid
			assert (cross_chance + mutation_chance) == 1

			#Loads all incubator paramaters
			self.elites = elites
			self.children = children
			self.randoms = randoms
			self.population = self.elites + self.children + self.randoms

			self.tournament_size = tournament_size

			self.cross_chance = cross_chance
			self.mutation_chance = mutation_chance

			#Handles shock_value
			if shock_value <= 0:
				self.shock_enabled = False
				self.shock_threshold = 0
			else:
				self.shock_enabled = True
				self.shock_threshold = shock_value

			self.max_cycles = max_cycles

			#Prints incubator summary if verbose enables
			if __VERBOSE__:
				print("Incubator Summary:")
				print("sample_path: " + sample_path + "  words_path: " + words_path)
				print("Total population: " + str(self.population))
				print("Elites: " + str(self.elites) + "  Children: " + str(self.children) + "  Randoms: " + str(self.randoms))
				print("Tournament size: " + str(self.tournament_size) + "  Cross chance: " + str(self.cross_chance) + "  Mutation chance: " + str(self.mutation_chance))
				print("Shock enabled: " + str(self.shock_enabled) + "  Shock threshold: " + str(self.shock_threshold))
				print("Max cycles: " + str(self.max_cycles))
				print("\n")

	"""TRAINING FUNCTIONS"""
	#Takes ciphertext, returns a chromosome that should decrypt ciphertext
	def train(self, cipher_text):
		"""Evolve chromosomes until one decrypts *cipher_text* into all
		known words or max_cycles is reached; returns (chromosome, cycles).
		"""
		#Initializes cycle counter
		cycles = 0

		#Generates pool of chromosomes
		chromosomes = []

		for chromosome_iter in range(self.population):
			chromosomes.append(self.getRandomChromosome())

		#Genetic shock trigger variables. Triggers if fitness is stagnant for shock_threshold cycles
		best_fitness = 0
		shock_ticker = 0

		#Starts timer
		start_time = time.time()

		while True:
			#Increments cycle counter
			cycles += 1

			#Creates list of (chromosome, fitness) tuples in order of increasing fitness
			chromosome_fitness = []

			#Checks all chromosomes to see if the correct one has been found
			for chromosome in chromosomes:
				if len(self.spellchecker.unknown((chromosome.convertText(cipher_text)).split(" "))) == 0:
					if __VERBOSE__:
						print("Found key! " + str(chromosome))
						print("Decrypted text:  " + chromosome.convertText(cipher_text))
						print("")

					#Bug fix: return unconditionally. This return used to be
					#nested under the __VERBOSE__ check, so in quiet mode a
					#found key was silently discarded and training continued.
					return (chromosome, cycles)

			#Gets fitness of each chromosome and sorts them according to fitness
			for chromosome in chromosomes:
				chromosome_fitness.append((chromosome, self.getFitness(chromosome, cipher_text)))

			chromosome_fitness.sort(key=lambda x: x[1])
			chromosome_fitness.reverse()

			#Checks if max_cycles exceeded. If so, returns the fittest chromosome
			if cycles >= self.max_cycles:
				print("Best Key: " + str(chromosome_fitness[0][0]))
				print("Decrypted text:  " + chromosome_fitness[0][0].convertText(cipher_text))
				print("")
				return (chromosome_fitness[0][0], cycles)

			#Checks if fitness is stagnant
			if chromosome_fitness[0][1] <= best_fitness:
				shock_ticker += 1
			else:
				best_fitness = max(chromosome_fitness[0][1], best_fitness)
				shock_ticker = 0

			#If __VERBOSE__, provide report on most fit chromosome
			if __VERBOSE__:
				converted_text = chromosome_fitness[0][0].convertText(cipher_text)
				print("Cycle# " + str(cycles))
				print("Best Chromosome: " + str(chromosome_fitness[0][0]))
				print("Fitness: " + str(chromosome_fitness[0][1]))
				print("Shock Ticker: " + str(shock_ticker))
				print("Cycle Time: " + str(time.time()-start_time))
				print("Attempted Decrypt: " + converted_text)
				print("Known words: " + str(self.spellchecker.known((chromosome_fitness[0][0].convertText(cipher_text).split(" ")))))
				print("Unknown words: " + str(self.spellchecker.unknown((chromosome_fitness[0][0].convertText(cipher_text).split(" ")))))
				print("")

			start_time = time.time()

			#Creates a new chromosomes list
			new_chromosomes = []

			#Copies over elite to new chromosomes
			for chromosome_iter in range(self.elites):
				new_chromosomes.append(chromosome_fitness[chromosome_iter][0].clone())

			#Creates children in new_chromsomes

			#Performs tournament process to select breeding candidates
			tournament_selections = []
			while len(tournament_selections) < (self.children):
				tournament_selections.append(self.tournament(chromosome_fitness))

			#Breeds selected candidates
			while len(tournament_selections)>0:
				chance = random.random()
				if chance < self.cross_chance and len(tournament_selections) > 1:
					chromosome_one = tournament_selections.pop()
					chromosome_two = tournament_selections.pop()

					crossed_chromosomes = self.crossChromosomes(chromosome_one, chromosome_two)

					new_chromosomes.append(crossed_chromosomes[0])
					new_chromosomes.append(crossed_chromosomes[1])
				elif chance < (self.mutation_chance + self.cross_chance):
					new_chromosomes.append(self.mutateChromosome(tournament_selections.pop()))
				else:
					new_chromosomes.append(self.getRandomChromosome())

			#Adds random chromosomes to new_chromosomes
			for random_iter in range(self.randoms):
				new_chromosomes.append(self.getRandomChromosome())

			#Checks if genetic shock should be triggered
			if shock_ticker >= self.shock_threshold and self.shock_enabled:
				if __VERBOSE__:
					print("Triggering genetic shock...\n")

				#Performs genetic shock, culling top 10% of population and mutation all others
				for chromosome_iter in range(len(new_chromosomes)):
					if self.getFitness(new_chromosomes[chromosome_iter], cipher_text) > .9 * best_fitness:
						new_chromosomes[chromosome_iter] = self.getRandomChromosome()
					else:
						new_chromosomes[chromosome_iter] = self.mutateChromosome(new_chromosomes[chromosome_iter])

				#Resets shock tickers and trackers
				shock_ticker = 0
				best_fitness = 0

			#Shifts new_chromosomes into gene pool
			chromosomes = new_chromosomes

	#Returns a mutated chromosome
	def mutateChromosome(self, chromosome):
		"""Return a clone of *chromosome* with two mappings swapped."""
		new_chromosome = chromosome.clone()

		#Chooses two mappings to swap
		mutation_one_index = random.randint(0,25)
		mutation_two_index = random.randint(0,25)

		while mutation_two_index == mutation_one_index:
			mutation_two_index = random.randint(0,25)

		mutation_one = new_chromosome.mappings[mutation_one_index]
		mutation_two = new_chromosome.mappings[mutation_two_index]

		new_chromosome.removeMapping(mutation_one)
		new_chromosome.removeMapping(mutation_two)

		mapping_one = (mutation_one[0], mutation_two[1])
		mapping_two = (mutation_two[0], mutation_one[1])

		new_chromosome.addMapping(mapping_one)
		new_chromosome.addMapping(mapping_two)

		return new_chromosome

	#Takes two chromosomes and returns two crosses of those chromosomes in the format (new_chromosome_one, new_chromosome_two)
	def crossChromosomes(self, chromosome_one, chromosome_two):
		"""Cross two chromosomes; each position has a 50% chance of being
		exchanged, with complement mappings repaired to keep each result a
		valid permutation. Returns (new_chromosome_one, new_chromosome_two).
		"""
		new_chromosome_one = chromosome_one.clone()
		new_chromosome_two = chromosome_two.clone()

		for chromosome_iter in range(26):
			if(random.random() > .5):
				old_mapping_one = new_chromosome_one.mappings[chromosome_iter]
				old_mapping_two = new_chromosome_two.mappings[chromosome_iter]

				if old_mapping_one != old_mapping_two:
					complement_mapping_one = new_chromosome_one.getMappingTarget(old_mapping_two[1])
					complement_mapping_two = new_chromosome_two.getMappingTarget(old_mapping_one[1])

					old_origin_one = complement_mapping_one[0]
					old_origin_two = complement_mapping_two[0]

					new_chromosome_one.removeMapping(complement_mapping_one)
					new_chromosome_two.removeMapping(complement_mapping_two)

					new_chromosome_one.removeMapping(old_mapping_one)
					new_chromosome_two.removeMapping(old_mapping_two)

					complement_mapping_one = (old_origin_two, complement_mapping_one[1])
					complement_mapping_two = (old_origin_one, complement_mapping_two[1])

					new_chromosome_one.addMapping(old_mapping_two)
					new_chromosome_one.addMapping(complement_mapping_two)
					new_chromosome_two.addMapping(old_mapping_one)
					new_chromosome_two.addMapping(complement_mapping_one)

		return (new_chromosome_one, new_chromosome_two)

	#Returns a new random chromosome
	def getRandomChromosome(self):
		"""Return a chromosome mapping a-z onto a random permutation of a-z."""
		new_chromosome = Chromosome()

		origin = []
		destination = []

		for letterIter in range(26):
			origin.append(chr(letterIter+97))
			destination.append(chr(letterIter+97))

		random.shuffle(destination)

		for mappingIter in range(26):
			new_chromosome.addMapping((origin[mappingIter], destination[mappingIter]))

		return new_chromosome

	#Performs a tournament selection of chromosomes based on tournament_size
	def tournament(self, chromosome_fitness):
		"""Sample tournament_size entries at random and return a clone of
		the fittest one.
		"""
		tournament_pool = []

		for tournament_iter in range(self.tournament_size):
			tournament_pool.append(chromosome_fitness[random.randint(0, self.population-1)])

		return (max(tournament_pool, key=lambda x: x[1]))[0].clone()

	#Takes a chromosome and cipher_text and evaluates the chromosomes fitness
	def getFitness(self, chromosome, cipher_text):
		"""Score the decryption produced by *chromosome*: sum over shared
		blocks of log2(sample frequency) weighted by occurrence count.
		"""
		total_fitness = 0
		parsed_block_table = self.getBlockTable(chromosome.convertText(cipher_text))

		for block in parsed_block_table.keys():
			if block in self.sample_block_table.keys():
				total_fitness += math.log(self.sample_block_table[block],2)*(parsed_block_table[block])

		return total_fitness

	"""
	BLOCK FUNCTIONS
	"""
	#Returns the blocks located in the passed samples path.
	def getSampleBlockTable(self, sample_path):
		"""Parse a "<block> <count>" per-line sample file into a dict."""
		block_table = {}

		#with-statement guarantees the file is closed even on a parse error
		with open(sample_path) as input_file:
			for line in input_file:
				components = line.split(" ")

				#Strips the trailing newline before converting the count
				components[1] = int(components[1][0:len(components[1])-1])

				block_table[components[0]] = components[1]

		return block_table

	#Takes a string and returns a hash table of blocks
	def getBlockTable(self, input_string):
		"""Count every substring-block of every space-separated word."""
		block_table = {}
		input_words = input_string.split(" ")

		#Hashes blocks in dictionary to count them
		for word in input_words:
			word_blocks = self.getBlocks(word)

			for block in word_blocks:
				if block in block_table:
					block_table[block] += 1
				else:
					block_table[block] = 1

		return block_table

	#Returns all substrings of a passed string
	def getBlocks(self, input_string):
		"""Return every contiguous substring of *input_string*."""
		blocks = []

		for block_len in range(len(input_string)):
			start_point = 0
			end_point = block_len+1

			while end_point <= len(input_string):
				blocks.append(input_string[start_point:end_point])
				end_point+=1
				start_point+=1

		return blocks
示例#28
0
 def _are_words_percentage(self, text):
     """Return the percentage (0-100) of tokens in *text* that the spell
     checker recognises as correctly spelled words.
     """
     spell = SpellChecker()
     words = spell.words(text)
     if not words:
         # Empty/whitespace-only input previously raised ZeroDivisionError.
         return 0.0
     correct_words = spell.known(words)
     return (len(correct_words) / len(words)) * 100
示例#29
0
                    start = word_id.find(token)

                    if (start == -1):
                        continue

                    else:

                        end = start + len(token)
                        check = word_id[end + 1]

                        if (check == "e"):

                            spell = SpellChecker()

                            if (spell.known(token)):

                                testimonial = TextBlob(token)
                                polarity = testimonial.sentiment.polarity

                                if (polarity >= .2):
                                    pos_score += 30

                                elif (polarity > -.2 and polarity < .2):
                                    neu_score += 20

                                else:
                                    neg_score += 10

                            else:
示例#30
0
    x = vectorizer.transform(sentences_predict2)
    pred = classifier.predict(x)
    #print(pred)
    count_unknown = list(pred).count('incorrect')
    print('The solution of above problem like this')
    print('1.No of sentence incorrect in the Essay.\n', count_unknown)
    count_known = len(pred) - count_unknown
    percent_accuracy_sent = (count_known / len(pred)) * 100

    ###saving pickle file
    # save_classifier = open("LR.pickle","wb")
    # pickle.dump(classifier, save_classifier)
    # save_classifier.close()

    ##put string in split_word_text to check whether all spellings are correct or not
    split_word_text = word_tokenize(sentences_predict)

    spell = SpellChecker()
    misspelled = split_word_text
    #if word is in dictionary
    word = spell.known(misspelled)
    #if word not in dictionary
    word2 = spell.unknown(misspelled)
    #print(word)
    #print(word2,len(word2))
    print('2.No of spelling mistake in the Essay.\n', len(word2))
    percent_accuracy_word = (len(word) / (len(word) + len(word2))) * 100
    #print(percent_accuracy_word)
    print('Accuracy of Essay.',
          (percent_accuracy_sent + percent_accuracy_word) / 2)
        used_chars = []
        for char in set(char_list):
            sub_list = char_list[:char_list.index(char)]+char_list[char_list.index(char)+1:]
            for used_char in used_chars:
                sub_list = sub_list.replace(used_char, '')
            for clist in generate_sub_list(sub_list, size - 1):
                sub_lists.append([char] + clist)
            used_chars.append(char)
    return(sub_lists) 
def shuffle_all(char_list, min_n = 2, max_n = None):
    """Build every word of length min_n..max_n obtainable by ordering
    sub-lists of *char_list* (max_n defaults to the full length)."""
    if max_n is None:
        max_n = len(char_list)
    result = []
    for size in range(min_n, max_n + 1):
        for subset in generate_sub_list(char_list, size):
            result.extend(''.join(arrangement) for arrangement in shuffle(subset))
    return result


words = shuffle_all('igbnne', 4, 7)
for word in sorted(words):
    # Lengths 4-6 qualify (the original range(4, 7) excludes 7); print the
    # word only when the spell checker recognises it.
    if len(word) in (4, 5, 6) and spell.known([word]):
        print(word)