Example No. 1
def create_vocab():
    # Load the full dataset and build a character vocabulary from the
    # input-letter and letter-location columns (0 and 6).
    DBHelperMethod.connect_to_db()
    dataset = DBHelperMethod.load_data_set()
    chars = dp.load_nn_input_dataset_string(dataset[:, [0, 6]])
    vocab, vocab_inv = dp.build_vocab(chars)
    return vocab, vocab_inv, chars, dataset
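
dp.build_vocab is a project helper whose body is not shown in these examples. As a point of reference, a minimal sketch of the pattern such helpers usually follow (frequency-ordered index assignment; the function name and the ordering are assumptions, not the project's actual code):

from collections import Counter

def build_vocab_sketch(chars):
    # Assign an index to every character, most frequent first.
    counts = Counter(chars)
    vocab_inv = [c for c, _ in counts.most_common()]  # index -> char
    vocab = {c: i for i, c in enumerate(vocab_inv)}   # char -> index
    return vocab, vocab_inv

# build_vocab_sketch(list("abracadabra"))
# -> ({'a': 0, 'b': 1, 'r': 2, 'c': 3, 'd': 4}, ['a', 'b', 'r', 'c', 'd'])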
Example No. 2
from itertools import chain

import numpy


def load_testing_data():
    dp.establish_db_connection()
    sequence_list = []
    padded_output = []

    testing_dataset = DBHelperMethod.load_dataset_by_type("testing")
    # For debugging a single sentence:
    # testing_dataset = DBHelperMethod.load_dataset_by_type_and_sentence_number_for_testing_purpose("testing", 3228)
    sentence_numbers = DBHelperMethod.get_list_of_sentence_numbers_by(
        "testing")
    labels_and_equiv_encoding = dp.get_label_table()

    # Pad each sentence separately and collect its padded input and labels.
    for each_sentence_number in sentence_numbers:
        selected_sentence = testing_dataset[numpy.where(
            testing_dataset[:, 3] == str(each_sentence_number))]
        x, y = pad_data(selected_sentence, selected_sentence[:, [0, 1]],
                        labels_and_equiv_encoding)

        sequence_list.append(x)
        padded_output.append(y)

    padded_input, vocabulary, vocabulary_inv = convert_input_to_vocab(
        sequence_list)
    # Flatten the per-sentence label arrays into one array of rows.
    padded_output = numpy.array(list(chain(*padded_output)))

    # Per-letter metadata columns used when reporting results.
    testing_words = numpy.take(testing_dataset, 4, axis=1)
    input_testing_letters = numpy.take(testing_dataset, 0, axis=1)
    op_testing_letters = numpy.take(testing_dataset, 5, axis=1)
    sent_num = numpy.take(testing_dataset, 3, axis=1)
    letters_loc = numpy.take(testing_dataset, 6, axis=1)
    undiac_word = numpy.take(testing_dataset, 7, axis=1)

    return (padded_input, padded_output, vocabulary, vocabulary_inv,
            testing_words, input_testing_letters, op_testing_letters,
            sent_num, letters_loc, undiac_word)
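
The chain(*padded_output) call is what flattens the list of per-sentence label arrays into a single letter-level array before the final numpy.array conversion. A self-contained illustration of that step:

from itertools import chain

import numpy

per_sentence = [numpy.array([[1, 0], [0, 1]]), numpy.array([[0, 1]])]
flat = numpy.array(list(chain(*per_sentence)))
print(flat.shape)  # (3, 2): sentence boundaries are gone, one row per letter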
Example No. 3
def get_testing_data():
    DBHelperMethod.connect_to_db()
    testing_dataset = DBHelperMethod.load_dataset_by_type("testing")

    # Columns 0 and 6 are the input letter and its location; columns 0 and 1
    # pair each letter with its diacritic label.
    x = dp.load_nn_input_dataset_string(testing_dataset[:, [0, 6]])
    y = dp.load_nn_labels_dataset_string(testing_dataset[:, [0, 1]])

    return x, y
Example No. 4
def load_testing_data():
    dp.establish_db_connection()
    testing_dataset = DBHelperMethod.load_dataset_by_type("testing")

    # Alternative loader: dp.load_nn_input_dataset_string(testing_dataset[:, [0, 6]])
    x = dp.load_nn_input_dataset_string_space_only(testing_dataset[:, [0, 6]])
    y = dp.load_nn_labels_dataset_string(testing_dataset[:, [0, 1]])

    # req_char_index and window_size are module-level configuration values.
    sent_num, sen_len = dp.load_nn_seq_lengths(testing_dataset[:, [3]])
    sentences_padded, vocabulary, vocabulary_inv = dp.pad_sentences1(
        x, sen_len, req_char_index, window_size)

    return sentences_padded, y, vocabulary, vocabulary_inv
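
dp.pad_sentences1 is not shown in these examples; judging from its arguments, it builds fixed-size character windows positioned relative to req_char_index and pads where a window runs past the sentence. A hypothetical sketch of that windowing idea (all names here are assumptions):

def window_around(chars, center, window_size, pad_char=' '):
    # Take window_size characters so that position `center` lands at a fixed
    # offset, padding with pad_char past either end of the sentence.
    half = window_size // 2
    return [chars[i] if 0 <= i < len(chars) else pad_char
            for i in range(center - half, center - half + window_size)]

# window_around(list("hello"), 1, 5) -> [' ', 'h', 'e', 'l', 'l']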
Example No. 5
def load_testing_data():
    dp.establish_db_connection()
    testing_dataset = DBHelperMethod.load_dataset_by_type("testing")

    x = dp.load_nn_input_dataset_string_space_only(testing_dataset[:, [0, 6]])
    y = dp.load_nn_labels_dataset_string(testing_dataset[:, [0, 1]])

    # Split the character stream into sentences and pad both inputs and
    # labels to a uniform length.
    sent_num, sen_len = dp.load_nn_seq_lengths(testing_dataset[:, [3]])
    sentences_padded, vocabulary, vocabulary_inv = dp.extract_sent_and_pad(
        x, sen_len, window_size)
    padded_output = dp.extract_sent_and_pad_output(y, sen_len, window_size)

    return sentences_padded, padded_output, vocabulary, vocabulary_inv
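
extract_sent_and_pad is likewise unshown; its arguments suggest it splits the flat character stream into sentences using the per-sentence lengths and pads each one to a fixed size. A minimal sketch of that split-and-pad step (function name and padding token are assumptions):

def split_and_pad(chars, sentence_lengths, max_len, pad_char=' '):
    sentences, start = [], 0
    for length in sentence_lengths:
        sentence = chars[start:start + length]
        start += length
        # Truncate long sentences; right-pad short ones to max_len.
        sentences.append(sentence[:max_len] +
                         [pad_char] * max(0, max_len - length))
    return sentences

# split_and_pad(list("abcde"), [2, 3], 4)
# -> [['a', 'b', ' ', ' '], ['c', 'd', 'e', ' ']]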
Example No. 6
import numpy


def load_training_data():
    dp.establish_db_connection()
    training_dataset = DBHelperMethod.load_dataset_by_type("training")

    # Alternative loader: dp.load_nn_input_dataset_string(training_dataset[:, [0, 6]])
    x = dp.load_nn_input_dataset_string_space_only(training_dataset[:, [0, 6]])
    y = dp.load_nn_labels_dataset_string(training_dataset[:, [0, 1]])

    sent_num, sen_len = dp.load_nn_seq_lengths(training_dataset[:, [3]])
    sentences_padded, vocabulary, vocabulary_inv = dp.build_one_to_one_input_data(
        x, sen_len, req_char_index, window_size)

    return numpy.array(sentences_padded), numpy.array(y), vocabulary, vocabulary_inv
Example No. 7
from itertools import chain

import numpy


def load_testing_data():
    dp.establish_db_connection()
    sequence_list = []
    padded_output = []

    testing_dataset = DBHelperMethod.load_dataset_by_type("testing")
    sentence_numbers = DBHelperMethod.get_list_of_sentence_numbers_by(
        "testing")
    labels_and_equiv_encoding = dp.get_label_table()

    # Pad each sentence separately and collect its padded input and labels.
    for each_sentence_number in sentence_numbers:
        selected_sentence = testing_dataset[numpy.where(
            testing_dataset[:, 3] == str(each_sentence_number))]
        x, y = pad_data(selected_sentence, selected_sentence[:, [0, 1]],
                        labels_and_equiv_encoding)

        sequence_list.append(x)
        padded_output.append(y)

    padded_input, vocabulary, vocabulary_inv = convert_input_to_vocab(
        sequence_list)
    # Flatten the per-sentence label arrays into one array of rows.
    padded_output = numpy.array(list(chain(*padded_output)))

    return padded_input, padded_output, vocabulary, vocabulary_inv
Example No. 8
import numpy


def load_testing_data():
    dp.establish_db_connection()
    testing_dataset = DBHelperMethod.load_dataset_by_type("testing")

    x = dp.load_nn_input_dataset_string_space_only(testing_dataset[:, [0, 6]])
    y = dp.load_nn_labels_dataset_string(testing_dataset[:, [0, 1]])
    vocab, vocab_inv = dp.build_vocab(x)

    sent_num, sen_len = dp.load_nn_seq_lengths(testing_dataset[:, [3]])

    # Cut the character stream into fixed-size windows, map the input
    # characters to vocabulary indices, and drop the trailing window from
    # both inputs and labels so they stay aligned.
    input_sentences = sqp.create_window_of_chars(list(x), window_size)
    input_sentences = dp.build_input_data_without_flattening(
        input_sentences, vocab)
    input_sentences = numpy.array(input_sentences[:-1])

    output_labels = sqp.create_window_of_chars(list(y), window_size)
    output_labels = numpy.array(output_labels[:-1])

    return input_sentences, output_labels, vocab, vocab_inv
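
sqp.create_window_of_chars is another unshown helper; the call site only tells us it turns the character list into a list of window_size slices, with a trailing partial window that the caller discards. A sketch under that assumption (the stride is not visible from the call, so it is parameterized here):

def window_chars(chars, window_size, stride=1):
    # Sliding windows of window_size characters; stride=1 overlaps windows,
    # stride=window_size gives non-overlapping chunks. Trailing positions
    # yield shorter windows, which the caller above drops with [:-1].
    return [chars[i:i + window_size] for i in range(0, len(chars), stride)]

# window_chars(list("abcd"), 3)
# -> [['a', 'b', 'c'], ['b', 'c', 'd'], ['c', 'd'], ['d']]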
Example No. 9
import numpy as np


def load_testing_data():
    dp.establish_db_connection()
    testing_dataset = DBHelperMethod.load_dataset_by_type("testing")
    # For debugging a single sentence:
    # testing_dataset = DBHelperMethod.load_dataset_by_type_and_sentence_number_for_testing_purpose("testing", 3062)

    x = dp.load_nn_input_dataset_string_space_only(testing_dataset[:, [0, 6]])
    y = dp.load_nn_labels_dataset_string(testing_dataset[:, [0, 1]])

    sent_num, sen_len = dp.load_nn_seq_lengths(testing_dataset[:, [3]])
    sentences_padded, vocabulary, vocabulary_inv = dp.pad_sentences1(
        x, sen_len, req_char_index, window_size)

    # Per-letter metadata columns used when reporting results; sent_num is
    # re-read here as the raw column.
    testing_words = np.take(testing_dataset, 4, axis=1)
    input_testing_letters = np.take(testing_dataset, 0, axis=1)
    op_testing_letters = np.take(testing_dataset, 5, axis=1)
    sent_num = np.take(testing_dataset, 3, axis=1)
    letters_loc = np.take(testing_dataset, 6, axis=1)
    undiac_word = np.take(testing_dataset, 7, axis=1)

    return (sentences_padded, y, vocabulary, vocabulary_inv, testing_words,
            input_testing_letters, op_testing_letters, sent_num, letters_loc,
            undiac_word)
Example No. 10
def get_all_dic_words():
    return DBHelperMethod.get_dictionary()
Example No. 11
def get_all_undiac_words(word_type):
    return DBHelperMethod.get_all_un_diacritized_words_in_sentences(word_type)
Example No. 12
    # Excerpt from the evaluation routine: map the network's and the expected
    # label indices to diacritic labels, then walk the test sentences.
    nn_labels = labels[nn_indices]
    nn_labels = np.take(nn_labels, 1, axis=1)
    expected_labels = labels[expected_indices]
    expected_labels = np.take(expected_labels, 1, axis=1)

    if not (len(nn_labels) == len(expected_labels) == len(ip_letters)):
        raise Exception("mismatch in number of elements in the array")

    nn_op_letters = dp.concatenate_char_and_diacritization(
        ip_letters, nn_labels)
    expected_op_letters = op_letters

    list_of_sentence_numbers = DBHelperMethod.get_list_of_sentence_numbers_by(
        'testing')
    list_of_all_words_and_sent_num = get_all_undiac_words('testing')

    current_sentence_counter = 0
    counter = 0
    start_range = 0
    end_range = 0
    all_sentences = DBHelperMethod.get_all_sentences_by('testing')
    for sentence_number in list_of_sentence_numbers:
        # For debugging a single sentence: sentence_number = 2838
        print("we will begin processing sentence number:", sentence_number)
        indices_of_selected_sentence = np.where(
            all_sentences == str(sentence_number))
        selected_sentence1 = list(
            all_sentences[indices_of_selected_sentence[0], 0])
        # Alternative: DBHelperMethod.get_sentence_by(sentence_number, 'testing')
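
dp.concatenate_char_and_diacritization pairs each input letter with the diacritization the network predicted for it. A minimal sketch of that pairing, assuming the labels are diacritic strings and the empty string means "no diacritic":

def concatenate_char_and_diacritization_sketch(letters, diacritics):
    # Append each predicted diacritic mark to its base letter.
    return [letter + mark for letter, mark in zip(letters, diacritics)]

# With Arabic combining marks: letters=['ب'], diacritics=['َ'] -> ['بَ']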
Example No. 13
import glob
import os
from copy import deepcopy

import DBHelperMethod
import ExcelHelperMethod
import RNNOPProcessingHelperMethod
import SukunCorrection
import WordLetterProcessingHelperMethod

extension = 'csv'
# Windows path to the per-sentence RNN output CSV files.
path = 'D:\\Repos\\results\\ATB3\\5\\new\\ff\\files\\'
current_row_1 = 0
current_row_2 = 0
Total_Error = 0
Total_Error_without_last_char = 0

if __name__ == "__main__":
    dataset_type = 'testing'
    list_of_sentence_numbers = DBHelperMethod.get_list_of_sentence_numbers_by(
        dataset_type)
    os.chdir(path)
    result = glob.glob('*.{}'.format(extension))
    current_sentence_counter = 0
    counter = 0
    # Every sentence in the database must have a matching result file.
    if len(list_of_sentence_numbers) != len(result):
        raise Exception('Mismatch In Number Of Sentences')

    for file_name, sentence_number in zip(result, list_of_sentence_numbers):

        selected_sentence = DBHelperMethod.get_sentence_by(sentence_number)

        # Read the network's per-letter output rows and pick, for each row,
        # the output neuron with the highest value.
        rnn_output = ExcelHelperMethod.read_rnn_op_csv_file(path + file_name)
        neurons_with_highest_probability, neurons_op_value = RNNOPProcessingHelperMethod.get_neurons_numbers_with_highest_output_value(
            rnn_output)
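
get_neurons_numbers_with_highest_output_value is a project helper; given a matrix with one row of output activations per letter, the natural implementation is a row-wise argmax that returns both the winning neuron index and its value. A sketch under that assumption:

import numpy as np

def argmax_per_row(rnn_output):
    rnn_output = np.asarray(rnn_output, dtype=float)
    winners = np.argmax(rnn_output, axis=1)  # winning neuron per letter
    values = rnn_output[np.arange(len(rnn_output)), winners]  # its activation
    return winners, values

# argmax_per_row([[0.1, 0.7, 0.2], [0.6, 0.3, 0.1]])
# -> (array([1, 0]), array([0.7, 0.6]))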
Example No. 14
    # Excerpt from the dictionary-correction evaluation: map the network's
    # and the expected label indices to diacritic labels, then walk the test
    # sentences.
    nn_labels = labels[nn_indices]
    nn_labels = np.take(nn_labels, 1, axis=1)
    expected_labels = labels[expected_indices]
    expected_labels = np.take(expected_labels, 1, axis=1)

    if not (len(nn_labels) == len(expected_labels) == len(ip_letters)):
        raise Exception("mismatch in number of elements in the array")

    nn_op_letters = dp.concatenate_char_and_diacritization(
        ip_letters, nn_labels)
    expected_op_letters = op_letters

    list_of_sentence_numbers = DBHelperMethod.get_list_of_sentence_numbers_by(
        'testing')
    list_of_all_words_and_sent_num = get_all_undiac_words()

    current_sentence_counter = 0
    counter = 0
    start_range = 0
    end_range = 0
    for sentence_number in list_of_sentence_numbers:
        # For debugging a single sentence: sentence_number = 4514
        print("we start in sentence #", sentence_number)

        selected_sentence = DBHelperMethod.get_sentence_by(sentence_number)
        undiac_words = get_undiac_words_for_selected_sentence(
            list_of_all_words_and_sent_num, sentence_number)
        dic_words_for_selected_sent = get_dic_words_for_selected_sentence(
        # ... (excerpt truncated mid-call; a separate function follows)

def get_diac_version_with_smallest_dist(list_of_objects, sentence_number):

    list_of_actual_words_after_dictionary_correction = []

    diacritized_rnn_op_words = WordLetterProcessingHelperMethod.reform_word_from(
        list_of_objects)
    undiacritized_words = DBHelperMethod.get_un_diacritized_words_from(
        sentence_number, 'testing')

    if len(diacritized_rnn_op_words) != len(undiacritized_words):
        raise Exception(
            "mismatch between diacritized and undiacritized word counts in "
            "get_diac_version_with_smallest_dist")

    for each_corrected_word, each_un_diacritized_word in zip(
            diacritized_rnn_op_words, undiacritized_words):

        minimum_error = float('inf')  # best score over dictionary candidates
        dictionary_diacritized_words = DBHelperMethod.\
            get_dictionary_all_diacritized_version_of(each_un_diacritized_word)

        if len(dictionary_diacritized_words) == 0:
            dictionary_diacritized_words.append(each_corrected_word)

        dictionary_diacritized_words_after_sukun_correction = SukunCorrection.\
            sukun_correction_for_list_of_words(dictionary_diacritized_words)

        if do_we_need_to_search_in_dictionary(
                dictionary_diacritized_words_after_sukun_correction,
                each_corrected_word):

            for each_word in dictionary_diacritized_words_after_sukun_correction:
                error_count = 0

                decomposed_dic_word = WordLetterProcessingHelperMethod.decompose_word_into_letters(
                    each_word)
                decomposed_act_word = WordLetterProcessingHelperMethod.decompose_word_into_letters(
                    each_corrected_word)

                norm_dic_word = WordLetterProcessingHelperMethod.normalize(
                    decomposed_dic_word)
                norm_act_word = WordLetterProcessingHelperMethod.normalize(
                    decomposed_act_word)

                for dic_letter, act_letter in zip(norm_dic_word,
                                                  norm_act_word):
                    # A length mismatch of one or two components counts as
                    # that many errors; otherwise compare component by
                    # component.
                    length_diff = abs(len(dic_letter) - len(act_letter))
                    if length_diff in (1, 2):
                        error_count += length_diff
                    else:
                        for dic_item, act_item in zip(dic_letter, act_letter):
                            if dic_item != act_item:
                                error_count += 1

                if error_count < minimum_error:
                    minimum_error = error_count
                    selected_dictionary_word = each_word

            list_of_actual_words_after_dictionary_correction.append(
                selected_dictionary_word)
        else:
            list_of_actual_words_after_dictionary_correction.append(
                each_corrected_word)

    chars_after_dic_correction = WordLetterProcessingHelperMethod.convert_list_of_words_to_list_of_chars(
        list_of_actual_words_after_dictionary_correction)
    if len(list_of_objects) != len(chars_after_dic_correction):
        raise Exception(
            "letter count changed during dictionary correction")

    for each_object, corrected_letter in zip(
            list_of_objects, chars_after_dic_correction):
        each_object.letter = corrected_letter

    return list_of_objects
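
To make the scoring above concrete: the inner loops compute a per-letter distance between a dictionary word and the corrected output word, where each letter is a group of components (base character plus diacritics) produced by the decompose/normalize helpers. A stand-alone rendition of that distance:

def letter_distance(dic_letter, act_letter):
    # Same rule as the loop above: a length mismatch of 1 or 2 components
    # costs that much; otherwise compare component by component.
    diff = abs(len(dic_letter) - len(act_letter))
    if diff in (1, 2):
        return diff
    return sum(a != b for a, b in zip(dic_letter, act_letter))

def word_distance(dic_word, act_word):
    return sum(letter_distance(d, a) for d, a in zip(dic_word, act_word))

# word_distance([['b', 'a'], ['t']], [['b', 'u'], ['t', 'x']]) -> 1 + 1 == 2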