Example #1
    def test_lookup_compound(self):
        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(83000, edit_distance_max, prefix_length)
        sym_spell.load_dictionary(self.dictionary_path, 0, 1)

        typo = ("whereis th elove hehad dated forImuch of thepast who "
                "couqdn'tread in sixthgrade and ins pired him")
        correction = ("where is the love he had dated for much of the past "
                      "who couldn't read in sixth grade and inspired him")
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = "in te dhird qarter oflast jear he hadlearned ofca sekretplan"
        correction = ("in the third quarter of last year he had learned of a "
                      "secret plan")
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = ("the bigjest playrs in te strogsommer film slatew ith plety "
                "of funn")
        correction = ("the biggest players in the strong summer film slate "
                      "with plenty of fun")
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = ("Can yu readthis messa ge despite thehorible sppelingmsitakes")
        correction = ("can you read this message despite the horrible "
                      "spelling mistakes")
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
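Each call above yields exactly one result, and the next example additionally checks its distance and count attributes. A minimal standalone sketch of the same call, assuming the frequency dictionary bundled with symspellpy is used in place of self.dictionary_path:

import pkg_resources
from symspellpy import SymSpell

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

# lookup_compound returns a list with a single SuggestItem whose
# term, distance and count attributes the tests assert on
result = sym_spell.lookup_compound("whereis th elove", max_edit_distance=2)[0]
print(result.term, result.distance, result.count)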
Example #2
    def test_lookup_compound(self):
        print('  - %s' % inspect.stack()[0][3])
        cwd = os.path.realpath(os.path.dirname(__file__))
        dictionary_path = os.path.realpath(
            os.path.join(cwd, pardir, "symspellpy",
                         "frequency_dictionary_en_82_765.txt"))

        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(83000, edit_distance_max, prefix_length)
        sym_spell.load_dictionary(dictionary_path, 0, 1)

        typo = ("whereis th elove hehad dated forImuch of thepast who "
                "couqdn'tread in sixthgrade and ins pired him")
        correction = ("where is the love he had dated for much of the past "
                      "who couldn't read in sixth grade and inspired him")
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
        self.assertEqual(9, results[0].distance)
        self.assertEqual(300000, results[0].count)

        typo = "in te dhird qarter oflast jear he hadlearned ofca sekretplan"
        correction = ("in the third quarter of last year he had learned of a "
                      "secret plan")
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
        self.assertEqual(9, results[0].distance)
        self.assertEqual(23121323, results[0].count)

        typo = ("the bigjest playrs in te strogsommer film slatew ith plety "
                "of funn")
        correction = ("the biggest players in the strong summer film slate "
                      "with plenty of fun")
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
        self.assertEqual(9, results[0].distance)
        self.assertEqual(3813904, results[0].count)

        typo = ("Can yu readthis messa ge despite thehorible sppelingmsitakes")
        correction = ("can you read this message despite the horrible "
                      "spelling mistakes")
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
        self.assertEqual(10, results[0].distance)
        self.assertEqual(6218089, results[0].count)
Example #3
class SegmentText():
    def __init__(self,
                 dictionary_path=None,
                 bigram_path=None):

        self.name = "SegmentText"
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        # dictionary_path = pkg_resources.resource_filename(
        #     "symspellpy", "frequency_dictionary_en_82_765.txt")
        if dictionary_path is not None:
            self.dictionary_path = dictionary_path
        else:
            self.dictionary_path = os.path.join("./symspellfre_", "frequency_dictionary_en_82_765.txt")

        if bigram_path is not None:
            self.bigram_path = bigram_path
        else:
            self.bigram_path = os.path.join("./symspellfre_", "frequency_bigramdictionary_en_243_342.txt")
            # self.bigram_path = pkg_resources.resource_filename("symspellpy", "frequency_bigramdictionary_en_243_342.txt")
        self.sym_spell.load_dictionary(self.dictionary_path, term_index=0, count_index=1)
        self.sym_spell.load_bigram_dictionary(self.bigram_path, term_index=0, count_index=2)

    
    def split(self, sentence):
        # lookup suggestions for multi-word input strings (supports compound
        # splitting & merging)
        # input_term = ("in te dhird qarter oflast jear he hadlearned ofca sekretplan")
        # input_term = ("in te dhird qarter oflast jear he hadlearned ofca sekretplan eoy")
        # max edit distance per lookup (per single word, not per whole input string)
        suggestions = self.sym_spell.lookup_compound(sentence, max_edit_distance=2)
        # display suggestion term, edit distance, and term frequency
        for suggestion in suggestions:
            print(suggestion)
        return suggestions
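A short usage sketch for the class above, assuming the two frequency files are passed in explicitly (otherwise the hard-coded "./symspellfre_" paths must exist):

# hypothetical local copies of the symspellpy frequency files
segmenter = SegmentText(
    dictionary_path="frequency_dictionary_en_82_765.txt",
    bigram_path="frequency_bigramdictionary_en_243_342.txt")
suggestions = segmenter.split("in te dhird qarter oflast jear")
print(suggestions[0].term)  # expected: "in the third quarter of last year"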
Example #4
    class _Project:
        def __init__(self):
            initial_capacity = 83000
            max_edit_distance_dictionary = 2
            prefix_length = 7
            self.sym_spell = SymSpell(initial_capacity,
                                      max_edit_distance_dictionary,
                                      prefix_length)

            # load dictionary
            dictionary_path = Path('dict_final.txt')

            count_index = 1  # column of the term frequency in the dictionary text file
            term_index = 0  # column of the term in the dictionary text file
            if not self.sym_spell.load_dictionary(dictionary_path, term_index,
                                                  count_index):
                print("Dictionary file not found")
                return

        def correct_name(self, query):
            input_term = query
            # max edit distance per lookup (per single word, not per whole input string)
            max_edit_distance_lookup = 2
            suggestions = self.sym_spell.lookup_compound(
                input_term, max_edit_distance_lookup)

            # display suggestion term, edit distance, and term frequency
            # writer = csv.writer(f, delimiter='\t')
            for suggestion in suggestions:
                # writer.writerow(['']+[suggestion.term])
                return suggestion.term
Example #5
class SpellCorrect():
    def __init__(self,
                 dictionary_path=dictionary_path__,
                 bigram_path=bigram_path__):

        self.sym_spell = SymSpell(max_dictionary_edit_distance=2,
                                  prefix_length=7)
        if self.is_valid_path(dictionary_path) and self.is_valid_path(
                bigram_path):
            self.sym_spell.load_dictionary(dictionary_path,
                                           term_index=0,
                                           count_index=1)
            self.sym_spell.load_bigram_dictionary(bigram_path,
                                                  term_index=0,
                                                  count_index=2)
            self.load_status = True
        else:
            self.load_status = False
        self.name = "Spell Corrector"

    def is_valid_path(self, path_file):
        if not os.path.exists(path_file):
            logging.error("The {} is not exists".format(path_file))
            return False
        return True

    def correct(self, sentence):
        if self.load_status:
            # max edit distance per lookup (per single word, not per whole input string)
            suggestions = self.sym_spell.lookup_compound(sentence,
                                                         max_edit_distance=2)
            # display suggestion term, edit distance, and term frequency
            for suggestion in suggestions:
                return suggestion.term
        return self.load_status
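A brief usage sketch, assuming the module-level defaults dictionary_path__ and bigram_path__ point at the standard symspellpy frequency files:

corrector = SpellCorrect()
if corrector.load_status:
    print(corrector.correct("whereis th elove"))  # e.g. "where is the love"
else:
    print("dictionaries not found")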
Example #6
def fix_spelling(directory, filename):
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    # create object
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    # load dictionary
    dictionary_path = os.path.join(os.path.dirname(__file__), "frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    # max edit distance per lookup (per single word, not per whole input string)
    max_edit_distance_lookup = 2
    corrected_list = []
    with open(directory + filename, 'r') as infile:
        for line in infile:
            suggestions = sym_spell.lookup_compound(line, max_edit_distance_lookup)
            for suggestion in suggestions:
                corrected_list.append(suggestion.term)
    print(corrected_list)
    # text = " ".join(corrected_list)
    with open("output/" + filename + ".spell", 'w') as f:
        for line in corrected_list:
            f.write(line)
            f.write('\n')            
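A usage sketch for fix_spelling, assuming an existing "output/" directory next to the script (the corrected lines are written to "output/<filename>.spell") and a hypothetical input file name:

# each line of the input file is corrected independently by lookup_compound
fix_spelling("data/", "ocr_page.txt")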
Example #7
    def test_lookup_compound_no_suggestion(self):
        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(edit_distance_max, prefix_length)
        sym_spell.create_dictionary_entry("steam", 1)
        sym_spell.create_dictionary_entry("machine", 1)

        typo = "qwer erty ytui a"
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(typo, results[0].term)
Example #8
    def test_lookup_compound_only_combi(self):
        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(edit_distance_max, prefix_length)
        sym_spell.create_dictionary_entry("steam", 1)
        sym_spell.create_dictionary_entry("machine", 1)

        typo = "ste am machie"
        correction = "steam machine"
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
Example #9
    def test_lookup_compound_transfer_casing(self):
        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(edit_distance_max, prefix_length)
        sym_spell.load_dictionary(self.dictionary_path, 0, 1)

        typo = ("Whereis th elove hehaD Dated forImuch of thepast who "
                "couqdn'tread in sixthgrade AND ins pired him")
        correction = ("Where is the love he haD Dated for much of the past "
                      "who couldn't read in sixth grade AND inspired him")

        results = sym_spell.lookup_compound(typo, edit_distance_max,
                                            transfer_casing=True)
        self.assertEqual(correction, results[0].term)
Example #10
    def test_lookup_compound_ignore_non_words(self):
        print('  - %s' % inspect.stack()[0][3])
        cwd = os.path.realpath(os.path.dirname(__file__))
        dictionary_path = os.path.realpath(
            os.path.join(cwd, pardir, "symspellpy",
                         "frequency_dictionary_en_82_765.txt"))

        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(83000, edit_distance_max, prefix_length)
        sym_spell.load_dictionary(dictionary_path, 0, 1)

        typo = ("whereis th elove 123 hehad dated forImuch of THEPAST who "
                "couqdn'tread in SIXTHgrade and ins pired him")
        correction = ("where is the love 123 he had dated for much of THEPAST "
                      "who couldn't read in sixth grade and inspired him")
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = "in te DHIRD 1 qarter oflast jear he hadlearned ofca sekretplan"
        correction = ("in the DHIRD 1 quarter of last year he had learned "
                      "of a secret plan")
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = ("the bigjest playrs in te stroGSOmmer film slatew ith PLETY "
                "of 12 funn")
        correction = ("the biggest players in the strong summer film slate "
                      "with PLETY of 12 fun")
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = ("Can yu readtHIS messa ge despite thehorible 1234 "
                "sppelingmsitakes")
        correction = ("can you read this message despite the horrible 1234 "
                      "spelling mistakes")
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = ("Can yu readtHIS messa ge despite thehorible AB1234 "
                "sppelingmsitakes")
        correction = ("can you read this message despite the horrible AB1234 "
                      "spelling mistakes")
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = "PI on leave, arrange Co-I to do screening"
        correction = "PI on leave arrange co i to do screening"
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
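In the calls above, the third positional argument of lookup_compound is ignore_non_words; written with the keyword it reads as in this sketch, which reuses the sym_spell object loaded in the test:

# tokens containing digits and all-caps tokens such as "123" or "THEPAST"
# are passed through unchanged instead of being corrected
results = sym_spell.lookup_compound(typo, edit_distance_max,
                                    ignore_non_words=True)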
Example #11
    def symspell_correction(misspelled):  # not used because it is too expensive
        from symspellpy import SymSpell, Verbosity

        sym_spell = SymSpell(83000, 2)
        dictionary_path = resdir + "frequency_dictionary_en_82_765.txt"
        if not sym_spell.load_dictionary(dictionary_path, 0, 1):
            return ""
        suggestions = sym_spell.lookup(misspelled, Verbosity.CLOSEST, 2)
        if suggestions:
            return sorted(suggestions, key=lambda x: x.count,
                          reverse=True)[0].term
        return sorted(sym_spell.lookup_compound(misspelled, 2),
                      key=lambda x: x.count,
                      reverse=True)[0].term
Example #12
    def spell_checker(inputTerm, path='./dictionary.txt'):
        symspell = SymSpell()
        symspell.load_dictionary(path, term_index=0, count_index=1)
        maxEditDistance = 2
        # ignore_non_words=True leaves tokens containing digits and all-caps
        # tokens (acronyms) unchanged instead of trying to correct them.
        correct_sent = []
        for i in inputTerm.split():
            if i.isalnum():
                suggestion = symspell.lookup_compound(i, maxEditDistance, ignore_non_words=True)
                suggestion = suggestion[0].term.strip()
            else:
                suggestion = i.strip()
            correct_sent.append(suggestion)

        return " ".join(correct_sent)
Example #13
def SpellCorrect(strings):
    sym_spell = SymSpell(max_dictionary_edit_distance=1, prefix_length=7)
    # dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
    # term_index is the column of the term and count_index is the
    # column of the term frequency
    sym_spell.load_dictionary(os.path.join(os.getcwd(), 'frequency.txt'),
                              term_index=0,
                              count_index=1)
    temp = []
    # lookup suggestions for each input string (lookup_compound supports
    # compound splitting & merging of multi-word strings)
    for row in strings:
        try:
            suggestions = sym_spell.lookup_compound(row, max_edit_distance=1)
            temp.append(suggestions[0].term)
        except Exception:
            temp.append('\n')
    return temp
Example #14
    def test_lookup_compound_ignore_non_words(self):
        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(edit_distance_max, prefix_length)
        sym_spell.load_dictionary(self.dictionary_path, 0, 1)
        sym_spell.load_bigram_dictionary(self.bigram_path, 0, 2)

        typo = ("whereis th elove 123 hehad dated forImuch of THEPAST who "
                "couqdn'tread in SIXTHgrade and ins pired him")
        correction = ("where is the love 123 he had dated for much of THEPAST "
                      "who couldn't read in sixth grade and inspired him")
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = "in te DHIRD 1 qarter oflast jear he hadlearned ofca sekretplan"
        correction = ("in the DHIRD 1 quarter of last year he had learned "
                      "of a secret plan")
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = ("the bigjest playrs in te stroGSOmmer film slatew ith PLETY "
                "of 12 funn")
        correction = ("the biggest players in the strong summer film slate "
                      "with PLETY of 12 fun")
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = ("Can yu readtHIS messa ge despite thehorible 1234 "
                "sppelingmsitakes")
        correction = ("can you read this message despite the horrible 1234 "
                      "spelling mistakes")
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = ("Can yu readtHIS messa ge despite thehorible AB1234 "
                "sppelingmsitakes")
        correction = ("can you read this message despite the horrible AB1234 "
                      "spelling mistakes")
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = "PI on leave, arrange Co-I to do screening"
        correction = "PI on leave arrange co i to do screening"
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
Example #15
class WordCorrector(LogicAdapter):

    def __init__(self, chatbot, **kwargs):
        super().__init__(chatbot, **kwargs)
        self.language = kwargs.get('language', languages.ENG)
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        self.dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
        # TODO: no numbers in the dictionary; the dictionary needs modifying
        self.bigram_path = pkg_resources.resource_filename("symspellpy", "frequency_bigramdictionary_en_243_342.txt")
        self.sym_spell.load_dictionary(self.dictionary_path, term_index=0, count_index=1)
        self.sym_spell.load_bigram_dictionary(self.bigram_path, term_index=0, count_index=2)

    def can_process(self, statement):
        try:
            if " " in statement.text.lower():
                return False
            else:
                response = self.process(statement)
                return response.confidence == 1
        except Exception:
            return False


    def process(self, statement, additional_response_selection_parameters=None):
        input_text = statement.text
        response = Statement(text="")  # fallback in case no suggestions come back
        response.confidence = 0
        suggestions = self.sym_spell.lookup_compound(input_text, max_edit_distance=2)

        for suggestion in suggestions:
            #print(suggestion)
            #print(type(suggestion))
            expression = "Do you mean \"" + suggestion.term + "\""

            if input_text == suggestion.term:
                expression = ""
            response = Statement(text=expression)
            response.confidence = 1
            # TODO: used to set confidence 0 when the correction matched the input
            # and 1 when it differed; got sidetracked and left it like this
        return response
Example #16
class Spell_Checker():
    def __init__(self):
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        self.dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt")
        self.bigram_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_bigramdictionary_en_243_342.txt")

        self.sym_spell.load_dictionary(self.dictionary_path, term_index=0, count_index=1)
        self.sym_spell.load_bigram_dictionary(self.bigram_path, term_index=0, count_index=2)

    def Correct_It(self, data):
        suggestions = self.sym_spell.lookup_compound(data, max_edit_distance=2,
                                                     transfer_casing=True)

        clean_data = list()
        for suggestion in suggestions:
            clean_data.append(str(suggestion.term))

        correct_data = " ".join(clean_data)

        return correct_data
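A usage sketch for Spell_Checker; transfer_casing=True in Correct_It carries the input casing over to the corrected output, as in the transfer-casing test earlier:

checker = Spell_Checker()
print(checker.Correct_It("Whereis th elove"))  # e.g. "Where is the love"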
Example #17
import pkg_resources
from symspellpy import SymSpell

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_bigramdictionary_en_243_342.txt")

sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

s = "02/05/2016"
input_term = (s.replace(" ", "")).lower()

suggestions = sym_spell.lookup_compound(input_term, max_edit_distance=2)

# display the top suggestion's term
print(suggestions[0].term)
Example #18
sym_spell.load_dictionary(dictionary_path,
                          term_index=0,
                          count_index=1,
                          encoding='utf-8')
# sym_spell.load_dictionary('C:/Users/nt.anh6/PycharmProjects/aicr_vn/nlp_model/spell_checker/dict/vi_full.txt', term_index=0, count_index=1, encoding='utf-8')
sym_spell.load_bigram_dictionary(bigram_path,
                                 term_index=0,
                                 count_index=2,
                                 encoding='utf-8')

# lookup suggestions for multi-word input strings (supports compound
# splitting & merging)
input_term = "Ngyễn tành nm"
# max edit distance per lookup (per single word, not per whole input string)
# suggestions = sym_spell.lookup(input_term, Verbosity.ALL, max_edit_distance=2, include_unknown=True)
suggestions = sym_spell.lookup_compound(input_term, max_edit_distance=2)
# display suggestion term, edit distance, and term frequency
for suggestion in suggestions:
    print(suggestion)


def load_name_corection(dictionary_path, bigram_path):
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    # dictionary_path = pkg_resources.resource_filename(
    #     dictionary_path)
    # bigram_path = pkg_resources.resource_filename(
    #     bigram_path)
    sym_spell.load_dictionary(dictionary_path,
                              term_index=0,
                              count_index=1,
                              encoding='utf-8')
    sym_spell.load_bigram_dictionary(bigram_path,
                                     term_index=0,
                                     count_index=2,
                                     encoding='utf-8')
    return sym_spell
Example #19
class SpellCheck():
    def __init__(self, init_path=None):
        """Spelling checker: symspellpy==6.5.2.

        https://symspellpy.readthedocs.io/en/latest/examples/lookup.html#basic-usage.
        https://towardsdatascience.com/essential-text-correction-process-for-nlp-tasks-f731a025fcc3."""
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2,
                                  prefix_length=7)
        self.set_dictionary_path(init_path)
        self.set_dictionary()
        # self.sym_spell.load_dictionary(self.path, term_index=0, count_index=1)

    def set_dictionary_path(self, path):
        if path:
            self.path = path
        else:
            self.path = pkg_resources.resource_filename(
                "symspellpy", "frequency_dictionary_en_82_765.txt")
        return self.path

    def set_df(self):
        self.df = pd.read_csv(self.path,
                              sep=' ',
                              header=None,
                              dtype={
                                  0: str,
                                  1: int
                              })
        return self.df

    def set_dict(self):
        self.set_df()
        self.dictionary = {
            self.df.loc[i, 0]: self.df.loc[i, 1]
            for i in self.df.index
        }
        return self.dictionary

    def set_dictionary(self):
        self.sym_spell.load_dictionary(self.path, term_index=0, count_index=1)
        self.set_dict()
        return None

    def find(self, term):
        return self.dictionary.get(term, 'nothing found')

    def append_dict(self, df_custom, cust_path='./data/cust_freq_dict_en.txt'):
        """Add custom dictionary.

        df: [term, freq]"""
        df_init = self.set_df()
        try:
            df_custom = df_custom.replace([np.inf, -np.inf, np.nan], 99)
            df_custom[1] = df_custom[1].astype(int)
            df = pd.concat([df_init, df_custom], ignore_index=True)
        except Exception as err:
            st.write('something went wrong', err)
            return -1

        # Remove duplicate terms and sort on frequency
        df.drop_duplicates(subset=[0], keep='first', inplace=True)
        df.sort_values(by=[1], ascending=False, inplace=True)

        # Save & Load after adding custom dictionary
        self.set_dictionary_path(cust_path)
        df.to_csv(self.path, sep=' ', index=False, header=False)
        # self.sym_spell.load_dictionary(self.path, term_index=0, count_index=1)
        self.set_dictionary()
        return None

    def __call__(self, input_term, N=8):
        """lookup suggestions for single- and multi-word input strings"""
        # Check lone or short inputs (< N chars) for possible concatenation
        # https://symspellpy.readthedocs.io/en/latest/api/symspellpy.html#symspellpy.symspellpy.Verbosity
        if (len(input_term.split(' '))) == 1 or (len(input_term) < N):
            suggestions = self.sym_spell.lookup(input_term,
                                                Verbosity.TOP,
                                                max_edit_distance=2,
                                                transfer_casing=True,
                                                include_unknown=True)
        else:
            # Punctuation gets lost!
            suggestions = self.sym_spell.lookup_compound(input_term,
                                                         max_edit_distance=2,
                                                         transfer_casing=True)
        # Suggestion term, term frequency, and edit distance
        # return [(sug.term, sug.count, sug.distance) for sug in suggestions]
        return [sug.term for sug in suggestions][0]
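A usage sketch for SpellCheck; single-word or short inputs go through lookup, longer multi-word inputs through lookup_compound:

sc = SpellCheck()                     # falls back to the bundled frequency dictionary
print(sc("memebers"))                 # single word -> Verbosity.TOP lookup
print(sc("whereis th elove hehad"))   # multi-word  -> lookup_compound
print(sc.find("love"))                # raw frequency from the loaded dictionary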
Example #20
    def test_lookup_compound_replaced_words(self):
        print('  - %s' % inspect.stack()[0][3])
        cwd = os.path.realpath(os.path.dirname(__file__))
        dictionary_path = os.path.realpath(
            os.path.join(cwd, pardir, "symspellpy",
                         "frequency_dictionary_en_82_765.txt"))

        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(83000, edit_distance_max, prefix_length)
        sym_spell.load_dictionary(dictionary_path, 0, 1)

        typo = ("whereis th elove hehad dated forImuch of thepast who "
                "couqdn'tread in sixthgrade and ins pired him")
        correction = ("where is the love he had dated for much of the past "
                      "who couldn't read in sixth grade and inspired him")
        replacement_1 = {
            "whereis": "where is",
            "th": "the",
            "elove": "love",
            "hehad": "he had",
            "forimuch": "for much",
            "thepast": "the past",
            "couqdn'tread": "couldn't read",
            "sixthgrade": "sixth grade",
            "ins": "in"
        }
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(len(replacement_1), len(sym_spell.replaced_words))
        for k, v in replacement_1.items():
            self.assertEqual(v, sym_spell.replaced_words[k].term)

        typo = "in te dhird qarter oflast jear he hadlearned ofca sekretplan"
        correction = ("in the third quarter of last year he had learned of a "
                      "secret plan")
        replacement_2 = {
            "te": "the",
            "dhird": "third",
            "qarter": "quarter",
            "oflast": "of last",
            "jear": "year",
            "hadlearned": "had learned",
            "ofca": "of a",
            "sekretplan": "secret plan"
        }
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(
            len(replacement_1) + len(replacement_2),
            len(sym_spell.replaced_words))
        for k, v in replacement_2.items():
            self.assertEqual(v, sym_spell.replaced_words[k].term)

        typo = ("the bigjest playrs in te strogsommer film slatew ith plety "
                "of funn")
        correction = ("the biggest players in the strong summer film slate "
                      "with plenty of fun")
        replacement_3 = {
            "bigjest": "biggest",
            "playrs": "players",
            "strogsommer": "strong summer",
            "slatew": "slate",
            "ith": "with",
            "plety": "plenty",
            "funn": "fun"
        }
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(
            len(replacement_1) + len(replacement_2) + len(replacement_3),
            len(sym_spell.replaced_words))
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
        for k, v in replacement_3.items():
            self.assertEqual(v, sym_spell.replaced_words[k].term)
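The replaced_words dictionary asserted on above accumulates across calls, mapping each misspelled token to the SuggestItem that replaced it. A small sketch of inspecting it directly, reusing the sym_spell object from this test:

results = sym_spell.lookup_compound("whereis th elove", edit_distance_max)
for original, replacement in sym_spell.replaced_words.items():
    print(original, "->", replacement.term)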
Example #21
import pkg_resources
from symspellpy import SymSpell, Verbosity

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename("symspellpy", "frequency_bigramdictionary_en_243_342.txt")
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

file_contents = ''
with open('../lycurgusOCR.txt', 'r', encoding='utf-8') as file:
    for line in file:
        if len(line) < 15:
            continue
        file_contents += line
suggestions = sym_spell.lookup_compound(file_contents, max_edit_distance=2)
for suggestion in suggestions:
    print(suggestion)
Example #22
def export():
    import os
    import torch
    import zipfile
    import torchaudio
    from glob import glob

    device = torch.device('cpu')  # gpu also works, but our models are fast enough for CPU
    model, decoder, utils = torch.hub.load('snakers4/silero-models',
                                        model='silero_stt',
                                        language='en')
    (read_batch, split_into_batches,
    read_audio, prepare_model_input) = utils  # see function signature for details
    
    
    os.system("ffmpeg -i 'video.mp4' -vn -acodec copy audio.aac")
    os.system("ffmpeg -i audio.aac audio.wav")


    # download a single file, any format compatible with TorchAudio (soundfile backend)
    # torch.hub.download_url_to_file('https://opus-codec.org/static/examples/samples/speech_orig.wav',
    #                                dst ='speech_orig.wav', progress=True)
    test_files = glob('audio.wav') 
    batches = split_into_batches(test_files, batch_size=10)
    model_input = prepare_model_input(read_batch(batches[0]))

    text = ""
    output = model(model_input)
    for example in output:
        pred = decoder(example.cpu())
        text = text + pred
        
    os.system("curl -LJO https://raw.githubusercontent.com/mammothb/symspellpy/master/symspellpy/frequency_dictionary_en_82_765.txt")
    os.system("curl -LJO https://raw.githubusercontent.com/mammothb/symspellpy/master/symspellpy/frequency_bigramdictionary_en_243_342.txt")



    import pkg_resources
    from symspellpy import SymSpell, Verbosity

    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    # term_index is the column of the term and count_index is the
    # column of the term frequency
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
    sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

    # lookup suggestions for multi-word input strings (supports compound
    # splitting & merging)
    # input_term = ("whereis th elove hehad dated forImuch of thepast who "
    #              "couqdn'tread in sixtgrade and ins pired him")
    # max edit distance per lookup (per single word, not per whole input string)
    suggestions = sym_spell.lookup_compound(text, max_edit_distance=2)
    # display suggestion term, edit distance, and term frequency
    for suggestion in suggestions:
        print(suggestion)
        
        
    text = suggestions[0].term


    cnt = 0
    textlines = []
    while cnt < len(text.split(" ")):
        print(text.split(" ")[cnt:cnt+5])
        line = "\n" + " ".join(text.split(" ")[cnt:cnt+5])
        textlines.append(line)
        cnt += 5
        
        
    f = open("script_cleaned.txt", "a")
    f.writelines(textlines)
    f.close()


    os.system("python -m aeneas.tools.execute_task \
        audio.wav \
        script_cleaned.txt \
        'task_language=eng|os_task_file_format=srt|is_text_type=plain' \
        subtitles.srt")



    with open("subtitles.srt") as f:
        srt = f.read()
        
    return Response(
        srt,
        mimetype="text/srt",
        headers={
            "Content-disposition": "attachment; filename=subtitiles.srt"
        }
    )
Example #23
class spellchecker:
    def __init__(
        self,
        max_dictionary_edit_distance,
        prefix_length,
        unigram_freq_file,
        bigram_freq_file=None,
        pickle_file=None,
    ):
        self.sym_spell = SymSpell(
            max_dictionary_edit_distance=max_dictionary_edit_distance,
            prefix_length=prefix_length,
        )

        if pickle_file is not None:
            self.sym_spell.load_pickle(pickle_file)
        else:
            self.sym_spell.load_dictionary(
                unigram_freq_file,
                term_index=0,
                count_index=1,
                encoding="utf-8",
            )

            if bigram_freq_file:
                self.sym_spell.load_bigram_dictionary(
                    bigram_freq_file,
                    term_index=0,
                    count_index=2,
                    encoding="utf-8",
                )

    def suggest(
        self,
        word,
        max_edit_dist=None,
        include_unknown=True,
        verbosity=Verbosity.CLOSEST,
    ):
        # defaults
        if max_edit_dist is None:
            max_edit_dist = DEFAULT_MAX_EDIT_DISTANCE

        # spellcheck
        suggestions = self.sym_spell.lookup(
            word,
            verbosity,
            max_edit_distance=max_edit_dist,
            include_unknown=include_unknown,
        )
        return {
            'original_term': word,
            'suggestions': suggestions,
        }

    def suggest_compound(
        self,
        phrase,
        max_edit_dist=None,
    ):
        if max_edit_dist is None:
            max_edit_dist = DEFAULT_MAX_EDIT_DISTANCE

        # spellcheck
        suggestions = self.sym_spell.lookup_compound(
            phrase,
            max_edit_distance=max_edit_dist,
            # ignore_non_words=False,
            # split_phrase_by_space=True,
        )
        return {
            'original_term': phrase,
            'suggestions': suggestions,
        }

    def tokenize(self, phrases):
        return tokenize_sentence(phrases)

    # Tokenize into individual phrases and return a list of suggestions for each
    def suggest_tokenize(
        self,
        phrases,
        max_edit_dist=None,
        include_unknown=True,
        verbosity=Verbosity.CLOSEST,
    ):
        if max_edit_dist is None:
            max_edit_dist = DEFAULT_MAX_EDIT_DISTANCE

        words = self.tokenize(phrases)

        sentence_suggestions = []
        for word in words:
            suggestions = self.sym_spell.lookup(
                word,
                verbosity,
                max_edit_distance=max_edit_dist,
                include_unknown=include_unknown,
            )
            sentence_suggestions.append({
                'original_term': word,
                'suggestions': suggestions,
            })

        return sentence_suggestions
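A usage sketch for the spellchecker wrapper above; DEFAULT_MAX_EDIT_DISTANCE and tokenize_sentence are module-level names assumed to be defined elsewhere in the same file, and the frequency files are hypothetical local copies:

sc = spellchecker(max_dictionary_edit_distance=2, prefix_length=7,
                  unigram_freq_file="frequency_dictionary_en_82_765.txt",
                  bigram_freq_file="frequency_bigramdictionary_en_243_342.txt")
print(sc.suggest("memebers")["suggestions"])
print(sc.suggest_compound("whereis th elove")["suggestions"][0].term)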
Example #24
    def test_lookup_compound_replaced_words_no_bigram(self):
        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(edit_distance_max, prefix_length)
        sym_spell.load_dictionary(self.dictionary_path, 0, 1)

        typo = ("whereis th elove hehad dated forImuch of thepast who "
                "couqdn'tread in sixthgrade and ins pired him")
        correction = ("whereas the love head dated for much of the past who "
                      "couldn't read in sixth grade and inspired him")
        replacement_1 = {
            "whereis": "whereas",
            "th": "the",
            "elove": "love",
            "hehad": "head",
            "forimuch": "for much",
            "thepast": "the past",
            "couqdn'tread": "couldn't read",
            "sixthgrade": "sixth grade",
            "ins": "in"
        }
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(len(replacement_1), len(sym_spell.replaced_words))
        for k, v in replacement_1.items():
            self.assertEqual(v, sym_spell.replaced_words[k].term)

        typo = "in te dhird qarter oflast jear he hadlearned ofca sekretplan"
        correction = ("in the third quarter of last year he had learned of a "
                      "secret plan")
        replacement_2 = {
            "te": "the",
            "dhird": "third",
            "qarter": "quarter",
            "oflast": "of last",
            "jear": "year",
            "hadlearned": "had learned",
            "ofca": "of a",
            "sekretplan": "secret plan"
        }
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(
            len(replacement_1) + len(replacement_2),
            len(sym_spell.replaced_words))
        for k, v in replacement_2.items():
            self.assertEqual(v, sym_spell.replaced_words[k].term)

        typo = ("the bigjest playrs in te strogsommer film slatew ith plety "
                "of funn")
        correction = ("the biggest players in the strong summer film slate "
                      "with plenty of fun")
        replacement_3 = {
            "bigjest": "biggest",
            "playrs": "players",
            "strogsommer": "strong summer",
            "slatew": "slate",
            "ith": "with",
            "plety": "plenty",
            "funn": "fun"
        }
        results = sym_spell.lookup_compound(typo, edit_distance_max)
        self.assertEqual(
            len(replacement_1) + len(replacement_2) + len(replacement_3),
            len(sym_spell.replaced_words))
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
        for k, v in replacement_3.items():
            self.assertEqual(v, sym_spell.replaced_words[k].term)
Example #25
def test3():
    # from autocorrect import Speller
    # doc = docx.Document("Word docs_Peace/1_CTS_119_eng_text.docx")
    # result = [p.text for p in doc.paragraphs]
    #
    # spell = Speller(lang='en')
    #
    # for j in range(15):
    #     print(spell(result[j]))

    # import jamspell
    #
    # corrector = jamspell.TSpellCorrector()
    # corrector.LoadLangModel('en.bin')
    # text = "tended by one againft another upon this account, fhall\nbe bury'd in perpetual Oblivion.\nIII. According to this Foundation of a general and un-\nlimited Amnefty, all and every the Electors of the Sa-\ncred Roman Enmpire, the Princes and States therein inclu-\nded, the Nobility that hold immediately of the Empire,\ntheir Vaffals, Subjects, Citizens and Inhabitants, who\nupon occafion of the Troubles of Bohemia and Germany,\nor upon the account of Alliances contracted on one fide\nand another, may have fuffer'd any Prejudice or Damage\nfrom either Party, in any manner, or under any pretext\nwhatfoever, either in their Domains, Goods, Fees,\nSub-Fees, Állodials, or in their Dignities, Immunities,\nRights and Privileges, fhal be fully re-eftablifh'd on both\nfides, in the fame Štate, both as to Spirituals and Tem-\nporals, which they enjoy'd, or could of Right enjoy be-\nfore thofe Troubles, notwithftanding all the Changes\nmade to the contrary, which fhall be annul'd and remain\nvoid.\nBut as thefe and fuch like Reftitutions ought to be al\nunderftood, faving whatfoever Rights, either of Domi-\nnium directum, or Dominium utile, go along with the\nGoods which are to be reftor'd, whether Secular or Ec-\nclefiaftical, and belong to him who makes Reftitution,\nor to him to whom Reftitution is made, or to any third\nPerfon; faving alfo the Rights which lie undeternin'd ei-\nther in the Imperial Court, or in the Imperial Chamber,\n",
    #
    # text = corrector.FixFragment(text)
    # print(text)
    sys.path.append("treatyUtil")
    import pkg_resources
    from symspellpy import SymSpell, Verbosity
    from treatyUtil import spellcheck_keep_punctuation

    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    # term_index is the column of the term and count_index is the
    # column of the term frequency
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
    sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

    # lookup suggestions for multi-word input strings (supports compound
    # splitting & merging)
    input_term1 = "tended by one againft another upon this account, fhall\nbe bury'd in perpetual Oblivion.\nIII.\
    According to this Foundation of a general and un-\nlimited Amnefty, all and every the Electors of the Sa-\ncred \
    Roman Enmpire, the Princes and States therein inclu-\nded, the Nobility that hold immediately of the Empire,\ntheir \
    Vaffals, Subjects, Citizens and Inhabitants, who\nupon occafion of the Troubles of Bohemia and Germany,\nor upon the \
    account of Alliances contracted on one fide\nand another, may have fuffer'd any Prejudice or Damage\nfrom either \
    Party, in any manner, or under any pretext\nwhatfoever, either in their Domains, Goods, Fees,\nSub-Fees, Állodials, \
    or in their Dignities, Immunities,\nRights and Privileges, fhal be fully re-eftablifh'd on both\nfides, in the fame Štate, \
    both as to Spirituals and Tem-\nporals, which they enjoy'd, or could of Right enjoy be-\nfore thofe Troubles, notwithftanding \
    all the Changes\nmade to the contrary, which fhall be annul'd and remain\nvoid.\nBut as thefe and fuch like Reftitutions \
    ought to be al\nunderftood, faving whatfoever Rights, either of Domi-\nnium directum, or Dominium utile, go along with \
    the\nGoods which are to be reftor'd, whether Secular or Ec-\nclefiaftical, and belong to him who makes Reftitution,\nor \
    to him to whom Reftitution is made, or to any third\nPerfon; faving alfo the Rights which lie undeternin'd ei-\nther in the\
    Imperial Court, or in the Imperial Chamber,\n"

    #input_term = "tended by one againft another upon this account, fhall\nbe bury'd in perpetual Oblivion.\nIII. According to this Foundation of a general and un-\nlimited "

    input_term = "God, and Safety of the Chriſtian World (the Electors,\nPrinces and States of the Sacred Roman Empire \
    being\npreſent, approving and conſenting) the Articles of Peace\nand Anity, whereof the Tenour follows.\n1. That \
    there be a Chriſtian, univerſal\nThe Re-efta. and perpetual Peace, and a true and ſincere\nbliſhment of Friendſhip and \
    Amity between his Sacred\nPeace and A. Imperial Majeſty, the Houſe of Austria,\nmity.\nand all his Allies and Adherents, \
    and the\nHeirs and Succeffors of each of them, chiefly the King\nof Spain, and the Electors, Princes and States of the En-\npire,\
    of the one ſide, and her Sacred Royal Majeſty,\nand the Kingdom of Sweden, her Allies and Adherents,\nand the Heirs and Succeſſors\
    of each of them, eſpecially\nthe moſt Chriſtian King, the reſpective Electors, Princes\nand States of the Empire, of the other ſide ; \
    and that this\nPeace be obſerv'd and cultivated ſincerely and ſeriouſly,\nſo that each Party may procure the Benefit, Honour and\nAdvantage \
    of one another, and thereby the Fruits of this\nPeace and Amity may be ſeen to grow up and fouriſh a-\nnew, by a ſure and reciprocal \
    maintaining of a good\nand faithful Neighbourhood between the Roman Empire\nand the Kingdom of Sweden reciprocally,\nII. That there be \
    on both ſides à perpe-\nAn Amneſty\ntua) Oblivion and Amneſty of all that has\nfrom all Hoffi- been done Since the beginning of theſe\nlity.\nTroubles, \
    in what Place or in what Man-\n"

    input_term2 = "God, and Safety of the Chriſtian World (the Electors,\nPrinces"
    input_term = re.sub("\n", " ", input_term)
    input_term = re.sub("- ", "", input_term)
    #input_term = re.sub("-", "", input_term)
    input_term = re.sub("ſ", "s", input_term)

    # word_split = re.compile(r"[^\W]+", re.U)
    # suggestions = sym_spell.lookup_compound((input_term), ignore_non_words=True, max_edit_distance=2)
    # for suggestion in suggestions:
    #    print(suggestion)
    #
    # corrected = suggestions[0].term
    # # This combined with split_phrase_by_space=True would be enough just to spell check
    # # but punctuation is lost.
    #
    # # The spell check is already done in 'corrected'. Now we just want to keep the punctuation.
    # in_list = word_split.findall(input_term)
    # chk_list = word_split.findall(corrected)
    # print(input_term)
    # print(corrected)
    # print(in_list)
    # print(chk_list)
    # pdb.set_trace()
    #
    # # To keep punctuation we take the original phrase and do word by word replacement
    # out_term = ""
    # outs  = input_term.split()
    # word_count = 0
    # for word in in_list:
    #     print(out_term)
    #     print(outs[word_count].lower(), word, chk_list[word_count])
    #     temp = outs[word_count].lower().replace(word, chk_list[word_count])
    #     word_count += 1
    #     out_term += temp+" "
    #
    # print(out_term)
    # return

    # max edit distance per lookup (per single word, not per whole input string)
    #pdb.set_trace()
    #print(spellcheck_keep_punctuation(input_term))
    suggestions = sym_spell.lookup_compound((input_term),
                                            transfer_casing=True,
                                            ignore_non_words=True,
                                            max_edit_distance=2)
    # display suggestion term, edit distance, and term frequency
    #print(suggestions)
    for suggestion in suggestions:
        print(suggestion)