示例#1
0
    def setUp(self):
        self.file_name = 'sukuntest.csv'
        self.sukun_words = ExcelHelperMethod.read_csv_file(path + self.file_name)
        self.chars_count = WordLetterProcessingHelperMethod.get_chars_count_for_each_word_in_this(deepcopy(self.sukun_words))
        self.Chars = WordLetterProcessingHelperMethod.convert_list_of_words_to_list_of_chars(deepcopy(self.sukun_words))

        self.Chars_And_Its_Location = WordLetterProcessingHelperMethod.get_location_of_each_char(
            self.Chars, self.chars_count)
示例#2
0
def get_diac_version_with_smallest_dist(list_of_objects):

    list_of_actual_words_after_dictionary_correction = []
    list_of_undiac_objects = WordLetterProcessingHelperMethod.remove_diacritics_from(
        list_of_objects)
    list_of_undiac_words = WordLetterProcessingHelperMethod.reform_word_from(
        list_of_undiac_objects)

    diacritized_rnn_op_words = WordLetterProcessingHelperMethod.reform_word_from(
        list_of_objects)

    if len(diacritized_rnn_op_words) != len(list_of_undiac_words):
        raise Exception(
            "error appeared in get_diac_version_with_smallest_dist")

    for each_corrected_word, each_un_diacritized_word in zip(
            diacritized_rnn_op_words, list_of_undiac_words):

        minimum_error = 100000000
        dictionary_diacritized_words = DBHelperMethod.\
            get_dictionary_all_diacritized_version_of(each_un_diacritized_word)

        if len(dictionary_diacritized_words) == 0:
            dictionary_diacritized_words.append(each_corrected_word)

        dictionary_diacritized_words_after_sukun_correction = SukunCorrection.\
            sukun_correction_for_list_of_words(dictionary_diacritized_words)

        if do_we_need_to_search_in_dictionary(
                dictionary_diacritized_words_after_sukun_correction,
                each_corrected_word):

            for each_word in dictionary_diacritized_words_after_sukun_correction:
                error_count = 0

                decomposed_dic_word = WordLetterProcessingHelperMethod.decompose_word_into_letters(
                    each_word)
                decomposed_act_word = WordLetterProcessingHelperMethod.decompose_word_into_letters(
                    each_corrected_word)

                norm_dic_word = WordLetterProcessingHelperMethod.normalize(
                    decomposed_dic_word)
                norm_act_word = WordLetterProcessingHelperMethod.normalize(
                    decomposed_act_word)

                for each_diacritized_version_letter, each_current_word_letter in zip(
                        norm_dic_word, norm_act_word):

                    if (len(each_diacritized_version_letter) -
                            len(each_current_word_letter)
                            == 1) or ((len(each_diacritized_version_letter) -
                                       len(each_current_word_letter) == -1)):
                        error_count += 1

                    elif (len(each_diacritized_version_letter) -
                          len(each_current_word_letter)
                          == 2) or ((len(each_diacritized_version_letter) -
                                     len(each_current_word_letter) == -2)):
                        error_count += 2

                    else:
                        for each_item_in_diacritized_version, each_item_in_current_word in \
                                zip(each_diacritized_version_letter, each_current_word_letter):
                            if each_item_in_diacritized_version != each_item_in_current_word:
                                error_count += 1

                if error_count < minimum_error:
                    minimum_error = error_count

                    selected_dictionary_word = each_word

            list_of_actual_words_after_dictionary_correction.append(
                selected_dictionary_word)
        else:
            list_of_actual_words_after_dictionary_correction.append(
                each_corrected_word)

    chars_after_dic_correction = WordLetterProcessingHelperMethod.convert_list_of_words_to_list_of_chars(
        list_of_actual_words_after_dictionary_correction)
    if len(list_of_objects) != len(chars_after_dic_correction):
        raise Exception("Error Happened Here")

    for x in range(0, len(list_of_objects)):
        list_of_objects[x].letter = chars_after_dic_correction[x]

    return list_of_objects