def decrypt_BF(ciphertext, matchrate=0.8):
    """Brute-force a mono-alphabetic substitution cipher by trying key permutations.

    A candidate key is accepted when at least ``matchrate`` of the decrypted
    words are found in an English dictionary.

    NOTE(review): even when iterating lazily, 26! permutations is not feasible
    to exhaust; this remains a demonstration of the approach, not a practical
    attack. (The original additionally tried to *materialize* the full
    permutation list with ``list(itertools.permutations(...))``, which cannot
    fit in memory — fixed here by iterating the generator lazily.)

    :param ciphertext: the text to decrypt
    :param matchrate: fraction of words that must be dictionary words (default 0.8)
    :return: the decrypted text on success, or "" if all trials fail
    """
    # use a spellchecker to check whether words are in the dictionary
    from spellchecker import SpellChecker
    import itertools
    import string

    spell = SpellChecker(language=u'en')

    # criterion: how many decrypted words must be real dictionary words
    words_total = len(spell.split_words(ciphertext))
    words_match_min = int(matchrate * words_total)

    # ciphertext alphabet 'A'..'Z'
    cipher = list(string.ascii_uppercase)

    # BUG FIX: iterate permutations lazily instead of list()-ing 26! of them
    for trial, plain in enumerate(itertools.permutations(cipher)):
        # build the decipher table for this permutation
        decipher_dict = dict(zip(cipher, plain))
        # decrypt with the current decipher table (helper defined elsewhere)
        decrypted = decrypt(ciphertext, decipher_dict)
        words_list = spell.split_words(decrypted)
        print(trial)
        # accept when enough decrypted words are dictionary words
        if len(spell.known(words_list)) >= words_match_min:
            # BUG FIX: original printed the undefined name `shift`;
            # report the permutation index instead
            print("Find dictionary words at permutation ", trial)
            printCipherTable(decipher_dict, isInverse=True)
            return decrypted
    print("All trials failed")
    return ""
def spelling_correction(query):
    """Return ``query`` with each word replaced by its best spelling correction.

    :param query: the (possibly misspelled) query string
    :return: the corrected query, words joined with single spaces
    """
    print("Actual(misspelled) Query words :", query.split())
    spell = SpellChecker()
    words = spell.split_words(query)
    # BUG FIX: SpellChecker.correction() returns None when it has no candidate;
    # fall back to the original word so the join below never sees None
    corrected = [spell.correction(word) or word for word in words]
    print("Modified Query words : ", corrected)
    return " ".join(corrected)
def test_split_words(self):
    """split_words keeps contractions intact and drops punctuation."""
    checker = SpellChecker()
    tokens = checker.split_words("This isn't a good test, but it is a test!!!!")
    expected = {"This", "isn't", "a", "good", "test", "but", "it", "is"}
    self.assertEqual(set(tokens), expected)
class PySpellChecker(BaseCrafter):
    """
    :class:`PySpellChecker` wraps pyspellchecker
    (https://github.com/barrust/pyspellchecker) to provide spelling
    correction capacity as a crafter in Jina.

    :param language: The language of the dictionary to load or None for no
        dictionary. Supported languages are `en`, `es`, `de`, `fr`, `pt`
        and `ru`. Defaults to `en`. A list of languages may be provided
        and all languages will be loaded.
    :param local_dictionary: The path to a locally stored word frequency
        dictionary; if provided, no language will be loaded.
    :param distance: The edit distance to use. Defaults to 2.
    :param case_sensitive: Flag to use a case sensitive dictionary or not,
        only available when not using a language dictionary.
    :param args: Additional positional arguments
    :param kwargs: Additional keyword arguments
    """

    def __init__(self,
                 language: str = 'en',
                 local_dictionary: Optional[str] = None,
                 distance: int = 2,
                 case_sensitive: bool = False,
                 *args,
                 **kwargs):
        """Store the checker configuration; the checker itself is built in post_init."""
        super().__init__(*args, **kwargs)
        self.language = language
        self.local_dictionary = local_dictionary
        self.distance = distance
        self.case_sensitive = case_sensitive

    def post_init(self):
        """Instantiate the underlying SpellChecker from the stored configuration.

        The import is done here (not at module level) so the dependency is
        only required when the crafter is actually initialized.
        """
        from spellchecker import SpellChecker
        super().post_init()
        self.speller = SpellChecker(language=self.language,
                                    local_dictionary=self.local_dictionary,
                                    distance=self.distance,
                                    case_sensitive=self.case_sensitive)

    @single
    def craft(self, text: str, *args, **kwargs):
        """
        Craft sentences correcting misspelled words.

        :param text: The text to be corrected
        :param args: Additional positional arguments
        :param kwargs: Additional keyword arguments
        :return: A dictionary with the corrected text under the ``text`` key
        """
        # tokenize, correct each word, then rejoin with single spaces
        words = self.speller.split_words(text)
        corrected_text = ' '.join(
            [self.speller.correction(word) for word in words])
        return dict(text=corrected_text)
def test_words_more_complete(self):
    """split_words lowercases tokens and strips punctuation, preserving order."""
    checker = SpellChecker()
    expected = ('this is a test of the word parser '
                'it should work correctly').split()
    actual = checker.split_words(
        'This is a test of the word parser. It should work correctly!!!')
    self.assertEqual(actual, expected)
def correctTypos(text):
    """Return ``text`` with every word replaced by its best spelling correction.

    :param text: input text (coerced to str before tokenizing)
    :return: corrected words joined with single spaces ("" for empty input)
    """
    speller = SpellChecker()
    words = speller.split_words(str(text))
    # BUG FIX: join once instead of quadratic `result = result + " " + word`
    # concatenation; also fall back to the original word when correction()
    # returns None (no candidate), which would have crashed the old code too.
    return " ".join(speller.correction(word) or word for word in words)
def check(text):
    """Report the first misspelled word in ``text``.

    :param text: input text (coerced to str before tokenizing)
    :return: "Typo: <word>" for the first word whose correction differs,
             or "Typo:------" when every word checks out
    """
    speller = SpellChecker()
    words = speller.split_words(str(text))
    # Simplification: the original corrected every word up front, then scanned
    # by index with a redundant elif; an early-exit loop gives the same result
    # without the extra list or the second pass.
    for word in words:
        if speller.correction(word) != word:
            return "Typo: " + word
    return "Typo:------"
def removeWordRepeat(text):
    """Tokenize ``text`` (Portuguese dictionary) and drop duplicate words.

    For each repeated word the LAST occurrence is kept, in its original
    position — the same end state the original's remove-from-the-front
    approach converged to.

    BUG FIX: the original removed items from the list *while iterating it*,
    which makes the iterator skip elements; it ran the whole pass twice to
    compensate, but could still leave duplicates behind. A single reverse
    scan with a seen-set is correct and O(n).

    :param text: text to tokenize and de-duplicate
    :return: list of unique words
    """
    spell = SpellChecker(language='pt')
    words = spell.split_words(text)
    seen = set()
    deduped = []
    # walk backwards so the last occurrence of each word is the one kept
    for word in reversed(words):
        if word not in seen:
            seen.add(word)
            deduped.append(word)
    deduped.reverse()
    return deduped
def SpellCheck(data):
    """Spell-correct ``data`` word by word, coloring changed words blue.

    :param data: input string, split on single spaces
    :return: the detokenized corrected text
    """
    spell = SpellChecker()
    # teach the checker tokens it should never flag — once, not per word
    # (the original reloaded this list on every loop iteration)
    spell.word_frequency.load_words(['molded', '.', '(', ')'])

    # BUG FIX: the original did `words = spell.split_words(words)` before
    # `words` existed (NameError) and called the non-existent str.split_words;
    # split the input string on spaces instead.
    corrected_words = []
    for token in data.split(' '):
        w = Word(token)
        suggestion = spell.correction(w)
        if suggestion != w:
            # highlight words that were actually changed
            suggestion = colored(suggestion, 'blue')
        corrected_words.append(suggestion)
    return TreebankWordDetokenizer().detokenize(corrected_words)
def check_name(name, type):
    """Spell-check a snake_case column name and sanity-check its SQL type.

    Prints a warning when the type looks inconsistent with the name's suffix
    (key/id -> integer, dtm -> datetime, nm/txt -> char types), and prints a
    suggested corrected name when any word is misspelled.

    :param name: column name, words separated by underscores
    :param type: SQL type string, or None to skip the type checks
    """
    incorrect_ind = False
    new_name = ''
    spell = SpellChecker()
    full_name = spell.split_words(name.replace('_', ' '))
    for word in full_name:
        # leading single-letter prefixes and correctly-spelled words pass through
        if (full_name[0] == word and word in ('d', 'b', 'f', 'r')) \
                or spell.correction(word) == word:
            new_name += word + ' '
        # BUG FIX: `word in ('prev')` tested substring membership in the string
        # 'prev' (so 'p', 're', ... matched); use equality instead
        elif (full_name[-1] == word
              and word in ('key', 'txt', 'nb', 'amt', 'dtm', 'qty', 'dt')) \
                or word == 'prev':
            new_name += word + ' '
        else:
            incorrect_ind = True
            new_name += spell.correction(word) + ' '
    if type is not None:
        # BUG FIX: the highlight escapes were malformed ('\028[32m');
        # ANSI green is '\033[32m'
        if full_name[-1] in ('key', 'id') and type not in ('smallint', 'int', 'bigint'):
            print('column ' + name + ' type: <\033[32m' + type + '\033[0m> may be wrong.')
        # BUG FIX: `type not in ('datetime')` was substring membership
        # ('date' and 'time' matched); compare for equality
        elif full_name[-1] == 'dtm' and type != 'datetime':
            print('column ' + name + ' type: <\033[32m' + type + '\033[0m> may be wrong.')
        elif full_name[-1] == 'nm' and not (str(type).startswith('varchar')
                                            or str(type).startswith('char')):
            print('column ' + name + ' type: <\033[32m' + type + '\033[0m> may be wrong.')
        elif full_name[-1] == 'txt' and not (str(type).startswith('varchar')
                                             or str(type).startswith('char')):
            print('column ' + name + ' type: <\033[32m' + type + '\033[0m> may be wrong.')
    if incorrect_ind:
        print(name + ' should be \033[32m'
              + new_name.upper().replace(' ', '_')[:len(new_name) - 1] + '\033[0m')
# --- fragment of a socket-server message loop; the enclosing `while`, the
# --- recv call, and the conditional that owns this `break` are outside this
# --- excerpt ---
break
msg = msg.decode(encoding='utf-8')
print('Message to received: ', msg)
# lightweight health check: answer a plain 'ping' with a JSON-encoded 'pong'
if msg == 'ping':
    c.sendall(json.dumps('pong').encode(encoding='utf-8'))
    continue
msg = json.loads(msg)
# payload is SJCL-encrypted; decrypt with the shared key from config
msg = sjcl.decrypt(msg, config['crypto_key']).decode()
print("received: {}".format(msg))
words = spellchecker.split_words(msg)
#wrong_indices = []
# for i in range(len(words)):
#     if spellchecker.correction(words[i]) != words[i]:
#         print("{} is wrong. Correct is {}".format(words[i], spellchecker.correction(words[i])))
#         wrong_indices.append(i)
# unknown() returns the set of words not found in the dictionary
# NOTE(review): despite the name, wrong_indices holds the *words*, not indices
wrong_indices = list(spellchecker.unknown(words))
print("Erros encontrados: ", wrong_indices)
# charge per word processed (helper defined elsewhere in the file)
bill = calculate_bill(len(words))
data = {'wrong_words': wrong_indices, 'bill': bill}
from spellchecker import SpellChecker

# Demo: print the best correction and the full candidate set for each word
# of a deliberately misspelled sentence.
spell = SpellChecker()
# BUG FIX: the original called split_words once and discarded the result
# before calling it again; tokenize exactly once.
words = spell.split_words("this sentnce has misspelled werds")
for word in words:
    print(spell.correction(word))
    print(spell.candidates(word))
# --- fragment of an HTML-crawl loop; the enclosing loops over pages/tags and
# --- the names spell, class_dict, href_list, main, etc. are defined outside
# --- this excerpt ---
if tag.name == "a":
    # collect each new local hyperlink once, remembering the page it came from
    href = tag.get("href", None)
    if href and href not in href_list and is_local(href):
        href_list.append(href)
        href_csv.append([href, dirname])
if tag.name not in class_dict[parent_slug].keys():
    class_dict[parent_slug][tag.name] = dict()
tag_class = tag.get("class", ["None"])
if tag_class:
    # record the first page on which each tag/class combination appears;
    # multi-class attributes are keyed as a '|'-joined string
    if "|".join(tag_class) not in class_dict[parent_slug][tag.name].keys():
        class_dict[parent_slug][tag.name]["|".join(tag_class)] = dirname
# spell-check the tag's visible text; unknown() yields out-of-dictionary words
misspelled = spell.unknown(
    spell.split_words(tag.get_text(separator=" ", strip=True)))
for m_word in misspelled:
    if dirname not in word_dict.keys():
        word_dict[dirname] = list()
    if m_word not in word_dict[dirname]:
        word_dict[dirname].append(m_word)
        word_csv.append([m_word, dirname])
# strip configured tags (optionally filtered by class) from the parsed page
for tag_to_remove in class_transformations["remove_tags"]:
    remove_tag = tag_to_remove["tag"]
    remove_class = tag_to_remove["class"]
    if len(remove_class) == 0:
        remove_matches = main.findAll(remove_tag)
    else:
        remove_matches = main.findAll(remove_tag,
                                      attrs={'class': remove_class})
# (closes a list literal — apparently excluded element names — begun before
# this excerpt)
]


def get_text_from_child_nodes(element, aggregator: List):
    """Recursively collect the text of ``element``'s descendants into ``aggregator``.

    Text nodes are appended verbatim; element children are descended into
    unless their tag name appears in the module-level ``excluded_elements``.

    :param element: a minidom node to walk
    :param aggregator: list that accumulates the text fragments (mutated in place)
    """
    for child in element.childNodes:
        if child.nodeType == child.TEXT_NODE:
            aggregator.append(child.data)
        elif child.nodeType == child.ELEMENT_NODE and child.tagName not in excluded_elements:
            get_text_from_child_nodes(child, aggregator)


# --- script body: spell-check every DITA topic under resources/dita ---
spell = SpellChecker()
working_dir = Path(__file__).absolute()
topics_dir = working_dir.parent.parent / "resources" / "dita"
for topic in topics_dir.rglob('*.dita'):
    doc = minidom.parse(topic.__str__())
    all_text = []
    get_text_from_child_nodes(doc, all_text)
    # NOTE(review): fragments are joined with no separator, so text spanning
    # two nodes can fuse into a single token — confirm this is intended
    all_words = spell.split_words("".join(all_text))
    # words missing from the dictionary, minus the project's allow-list
    unknown_words = [
        word for word in spell.unknown(all_words) if word not in ignored_words
    ]
    if unknown_words:
        for word in unknown_words:
            print(
                f'Unknown word in {topic.name}: "{word}". Did you mean "{spell.correction(word)}"?'
            )
    else:
        print(f'Perfect spelling in file {topic.name}')
def correction_sent(sent):
    """Tokenize ``sent`` and return the best spelling correction of each word."""
    checker = SpellChecker()
    corrected = []
    for token in checker.split_words(sent):
        corrected.append(checker.correction(token))
    return corrected
def test_words(self):
    """split_words should lowercase tokens and preserve their order."""
    checker = SpellChecker()
    expected = 'this is a test of this'.split()
    self.assertEqual(checker.split_words('This is a test of this'), expected)
from spellchecker import SpellChecker

# Read a line from the user, correct each word, and echo the result.
text = input("INPUT TEXT: ")
spell = SpellChecker()
tokens = spell.split_words(text)
# each corrected word is followed by a single space, matching the
# original's trailing-space output exactly
final = "".join(spell.correction(token) + " " for token in tokens)
print(final)
def specll_check(self, text):
    """Return ``text`` with each word replaced by its best spelling correction."""
    checker = SpellChecker()
    corrected = [checker.correction(token) for token in checker.split_words(text)]
    return " ".join(corrected)
#!/usr/bin/env python3
"""Spell-check the words given on the command line and print corrections.

Words that are unknown and have no better suggestion are shown as "?".
"""
import sys

from spellchecker import SpellChecker

# BUG FIX: the original bound the joined argv to `input`, shadowing the
# builtin; use a descriptive name instead.
text = " ".join(sys.argv[1:])
if any(c.isnumeric() for c in text):
    # it's got numbers, probably not meant for spell checking
    sys.exit(0)

spell = SpellChecker()
words = spell.split_words(text)
correction_pairs = [(word, spell.correction(word)) for word in words]
# '?' marks words that are out-of-dictionary AND unchanged by correction()
display = [
    "?" if not spell.known([word]) and word == correction else correction
    for word, correction in correction_pairs
]
print(" ".join(display), end="")
# --- fragment: build a Word document listing organization names with their
# --- spelling issues; document, update_date, complete_df, spell and now are
# --- defined earlier in the file ---
document.styles['Normal'].font.name = 'SimHei'
p = document.add_paragraph()
p_run = p.add_run('Org Name List Spell Checker')
p2 = document.add_paragraph('Last Update: ' + str(update_date))
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
p2.alignment = WD_ALIGN_PARAGRAPH.RIGHT
p_run.font.size = Pt(24)
table = document.add_table(rows=1, cols=1)
table.style = 'Table Grid'
# i = 0
item = ""
for index, row in complete_df.iterrows():
    # "<sign-up number>. <English organization name>"
    first_row = str(int(row['OrganizationSignUpListNumber您的机构在接龙里的序号'])
                    ) + '. ' + row['OrganizationNameInEnglish']
    words = spell.split_words(first_row)
    words_book = [spell.correction(word) for word in words]
    # NOTE(review): unknown() is run on the *corrected* words, not the
    # originals — confirm this is intended
    spell_check_result = spell.unknown(words_book)
    item = item + first_row + '\n' + str(spell_check_result) + '\n\n'
### breakdown
# everything is written into a single table cell
cell = table.cell(0, 0)
cell.text = item
document.save('./output/Org Name List ' + now + '.docx')
print('Word file generate successful!')
conversation.grid(column=0, row=2, sticky='nesw', padx=10, pady=10) #image load = Image.open("trumppet.png") render = ImageTk.PhotoImage(load) logo = ttk.Label(window, image=render, background='orange') logo.grid(column=1, row=2) window.pack(padx=10, pady=50) window.mainloop() while 1: data = "" msg = input("Enter message: ") # spellchecking, first tokenize, then run through spelling engine msg = spell.split_words(msg) msg = [spell.correction(token) for token in msg] # rejoin tokens and passes to sentiment analysis query = " ".join(msg) score = sentiment.polarity_scores(query)['compound'] print(score) # run through POS / named entity engine to get the user's topic things = pos(query) topicStuff = [[token.lemma_, token.text] for token in things if (token.dep_ == "dobj")] try: topic = topicStuff[0][0] except: