예제 #1
0
def main():
    """Segment a sentence that contains no spaces and print the result.

    Builds one SymSpell instance, loads the English frequency dictionary
    stored under ``./data`` next to this file, runs ``word_segmentation`` on
    a space-free sentence, and prints the corrected string followed by the
    ``distance_sum`` and ``log_prob_sum`` fields of the result.

    If ``load_dictionary`` reports failure, a message is printed and the
    function returns without segmenting anything.
    """
    initial_capacity = 83000
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 0
    prefix_length = 7
    # create the object once; building it and loading the dictionary are the
    # expensive steps, so they are not repeated
    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                         prefix_length)
    # load dictionary
    dictionary_path = os.path.join(
        os.path.dirname(__file__), "./data/frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    # a sentence without any spaces
    input_term = "thequickbrownfoxjumpsoverthelazydog"
    result = sym_spell.word_segmentation(input_term)
    # display the segmented string, the summed edit distance, and the summed
    # log probability
    print("{}, {}, {}".format(result.corrected_string, result.distance_sum,
                              result.log_prob_sum))
예제 #2
0
class SymSpellCorrection:
    """
        Use SymSpell for correction

        Instances are callable: calling one with a sentence returns the
        ``corrected_string`` produced by SymSpell's ``word_segmentation``.
    """
    def __init__(self, dictionary_path, term_index=0, count_index=1, max_edit_distance_dictionary=0, prefix_length=7, **args):
        """
        Input:
            - dictionary_path: string
            - term_index: int, column of the term in the dictionary text file, default is 0
            - count_index: int, column of the term frequency in the dictionary text file, default is 1
            - max_edit_distance_dictionary: int, maximum edit distance per dictionary precalculation, default is 0
            - prefix_length, int, default is 7
            - **args: any extra keyword arguments are accepted and ignored
        """
        # Imported lazily so the dependency is only needed when an instance
        # is actually created.
        from symspellpy.symspellpy import SymSpell
        self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
        # Report a failed load instead of silently continuing with an empty
        # dictionary; construction still succeeds, as it did before.
        if not self.sym_spell.load_dictionary(dictionary_path, term_index, count_index):
            print("Dictionary file not found:", dictionary_path)

    def __call__(self, sentence):
        """
            Input:
                - sentence: string

            Output:
                - string: the segmented/corrected sentence. Empty input (or
                  None) is returned unchanged, and the original sentence is
                  returned if segmentation raises an error.
        """
        if not sentence:
            return sentence
        try:
            corrected = self.sym_spell.word_segmentation(sentence).corrected_string
        except Exception:
            # Best effort: fall back to the input on ordinary errors, but let
            # KeyboardInterrupt / SystemExit propagate.
            print("Error spell correction:", sentence)
            corrected = sentence
        return corrected
예제 #3
0
def main():
    """Run the three SymSpell demos: lookup, compound lookup, segmentation.

    A SymSpell instance is created with a precalculated edit distance of 2
    and a prefix length of 7, and the frequency dictionary located next to
    this file is loaded (term in column 0, count in column 1). If loading
    fails, a message is printed and nothing else runs.

    Every lookup suggestion is printed as ``term, distance, count``; the
    segmentation result is printed as
    ``corrected_string, distance_sum, log_prob_sum``.
    """
    sym_spell = SymSpell(2, 7)

    base_dir = os.path.dirname(__file__)
    dictionary_path = os.path.join(base_dir,
                                   "frequency_dictionary_en_82_765.txt")
    loaded = sym_spell.load_dictionary(dictionary_path, 0, 1)
    if not loaded:
        print("Dictionary file not found")
        return

    row = "{}, {}, {}"
    # edit-distance limit applied per lookup; it must not exceed the value
    # the dictionary was precalculated with
    lookup_distance = 2

    # 1) single-word lookup of a misspelled term
    single_word = "pyth"
    # Verbosity offers TOP, CLOSEST and ALL
    word_hits = sym_spell.lookup(single_word, Verbosity.CLOSEST,
                                 lookup_distance)
    for hit in word_hits:
        print(row.format(hit.term, hit.distance, hit.count))

    # 2) multi-word lookup (supports compound splitting & merging); the
    # distance limit applies per word, not to the whole input string
    phrase = ("whereis th elove hehad dated forImuch of thepast who "
              "couqdn'tread in sixtgrade and ins pired him")
    phrase_hits = sym_spell.lookup_compound(phrase, lookup_distance)
    for hit in phrase_hits:
        print(row.format(hit.term, hit.distance, hit.count))

    # 3) segmentation of a sentence written without any spaces
    unspaced = "thequuickbrownfoxjumpsoverthelazydog"
    segmented = sym_spell.word_segmentation(unspaced)
    print(row.format(segmented.corrected_string, segmented.distance_sum,
                     segmented.log_prob_sum))
예제 #4
0
def main():
    """Build a dictionary from a corpus, save it, and segment unspaced text.

    Steps:
      1. Create the SymSpell dictionary from ``training_data.txt``
         (read as ISO-8859-1); print a message and return on failure.
      2. Print every ``term count`` pair and write the pairs to
         ``dict.txt``, one pair per line.
      3. Load ``dict.txt`` back through ``load_dictionary``; print a message
         and return on failure.
      4. Read ``missing_spaces.txt``, split it on commas, and print the
         segmented form of every non-blank piece.
    """
    max_edit_distance_dictionary = 2
    prefix_length = 7
    # create object
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    if not sym_spell.create_dictionary('training_data.txt',
                                       encoding="ISO-8859-1"):
        print("Corpus file not found")
        return
    dictlist = []
    for key, count in sym_spell.words.items():
        entry = "{} {}\n".format(key, count)
        print(entry)
        dictlist.append(entry)

    # save dictionary: "w" so repeated runs do not append duplicate entries,
    # and writelines() so the file holds one "term count" pair per line
    # rather than the repr of a Python list
    with open("dict.txt", "w", encoding="ISO-8859-1") as text_file:
        text_file.writelines(dictlist)
    print('Saved Dic')
    # load dictionary
    dictionary_path = os.path.join(os.path.dirname('./'), "dict.txt")
    print(dictionary_path)
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    # read the file back with the same encoding it was written in
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index,
                                     encoding="ISO-8859-1"):
        print("Dictionary file not found")
        return
    # comma-separated sentences without any spaces
    with open('missing_spaces.txt', 'r', encoding="utf8") as myfile:
        data = myfile.read()

    for strval in data.split(','):
        # a trailing comma or blank field yields nothing to segment
        if not strval.strip():
            continue
        try:
            # NOTE(review): prefix_length is passed as the third positional
            # argument, which symspellpy documents as
            # max_segmentation_word_length -- confirm this is intended.
            result = sym_spell.word_segmentation(strval,
                                                 max_edit_distance_dictionary,
                                                 prefix_length)
        except Exception as exc:
            # keep going with the remaining pieces, but say what went wrong
            print("Segmentation failed for {!r}: {}".format(strval, exc))
        else:
            # display the segmented string
            print("{}".format(result.corrected_string))
예제 #5
0
def postprocessing(text):
    """Return *text* lower-cased and segmented into words by SymSpell.

    A SymSpell instance (edit distance 2, prefix length 7) is created on
    every call and loaded with the unigram and bigram frequency dictionaries
    that ship inside the ``symspellpy`` package.

    Input:
        - text: string to segment; it is lower-cased before segmentation

    Output:
        - the ``corrected_string`` of the segmentation result, or None
          (after printing a message) when either dictionary fails to load
    """
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    # locate the dictionaries bundled with symspellpy
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    if not sym_spell.load_dictionary(dictionary_path, term_index=0,
                                     count_index=1):
        print("Dictionary file not found")
        return
    # the bigram loader must read the bigram file; its count is in column 2
    # because each term spans two columns
    if not sym_spell.load_bigram_dictionary(bigram_path, term_index=0,
                                            count_index=2):
        print("Bigram dictionary file not found")
        return

    result = sym_spell.word_segmentation(text.lower())
    return result.corrected_string
예제 #6
0
def main():
    """Segment a single term with a custom dictionary and print the outcome.

    Loads ``dictionary_final.txt`` from the directory of this file into a
    SymSpell instance (edit distance 0, prefix length 7). On load failure a
    message is printed and the function returns. Otherwise the term is
    segmented, the resulting words are printed as a list, and then the
    corrected string is printed with its ``distance_sum`` and
    ``log_prob_sum``.
    """
    # edit distance 0 for dictionary precalculation, prefix length 7
    sym_spell = SymSpell(0, 7)

    # term is in column 0 of the dictionary file, its frequency in column 1
    dictionary_path = os.path.join(os.path.dirname(__file__),"dictionary_final.txt")
    if not sym_spell.load_dictionary(dictionary_path, 0, 1):
        print("Dictionary file not found")
        return

    # text to split into words
    input_term = "bangalore"
    result = sym_spell.word_segmentation(input_term)
    segmented = result.corrected_string

    # the individual words, then the summary line
    print(segmented.split(' '))
    summary = "{}, {}, {}".format(segmented, result.distance_sum,
                                  result.log_prob_sum)
    print(summary)
예제 #7
0
import time
import pkg_resources
from symspellpy.symspellpy import SymSpell
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string
import re
import os
import time
import pandas as pd
# Raise max_dictionary_edit_distance to allow corrections at a larger edit
# distance.
sym_spell = SymSpell(max_dictionary_edit_distance=3, prefix_length=4)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

# Segment every .txt file in TestData line by line and write the result to a
# file of the same name under TestData/Temp, then report the elapsed time.
tic = time.time()
directory = r'TestData'
output_directory = os.path.join(directory, "Temp")
# make sure the destination exists before any output file is opened
os.makedirs(output_directory, exist_ok=True)
new_path = None  # stays None when no .txt file is processed
for entry in os.scandir(directory):
    if not (entry.is_file() and entry.path.endswith(".txt")):
        continue
    # entry.name is the bare file name on every platform, so no manual
    # splitting on a path separator is needed
    new_path = os.path.join(output_directory, entry.name)
    # context managers close both files even if segmentation raises
    with open(entry.path, "r") as a_file, \
            open(new_path, "w", encoding='utf-8') as out_file:
        for line in a_file:
            result = sym_spell.word_segmentation(line)
            out_file.write(format(result.corrected_string) + '\n')
toc = time.time()
if new_path is None:
    print("No .txt files found in " + directory)
else:
    print("Time "+new_path+": " +str(toc-tic))