def main():
    """Run the interactive phrase-search loop.

    Loads the thesaurus and the Greek corpus once via ``preprocessing()``,
    then repeatedly prompts for a search phrase (searched as "Latin"), looks
    up a translation for every word of the phrase and prints the resulting
    scoreboard.

    The program exits with status 0 when:
      * the input contains nothing but spaces and ``q`` characters
        (this includes an empty line),
      * the input contains nothing but spaces and the word ``quit``,
      * the user presses Ctrl-C, or
      * standard input is closed (EOF).
    """
    transDict, Greek_word_num, Greek_search_dict, Greek_text = preprocessing()

    # Save lemma to translations found (shared across all searches)
    found_translist = {}

    try:
        while True:
            # A fresh scoreboard is created for every prompt.
            scoreKeeper = scoreboard(MAX_SCOREBOARD_SIZE, MIN_SCORE)
            input_phrase = input("Enter Search Phrase> ")

            # Quit sentinels: strip the keyword first, then spaces, exactly
            # as the original regex-based check did.
            only_q = input_phrase.replace("q", "").replace(" ", "") == ""
            only_quit = input_phrase.replace("quit", "").replace(" ", "") == ""
            if only_q or only_quit:
                # sys.exit() rather than the interactive-only exit() helper.
                sys.exit(0)

            if valid_search(input_phrase):
                search = search_phrase(input_phrase, "Latin")

                # Find all the translations of the given words
                for i in range(search.search_len):
                    search.has_translation[i] = trn.get_translation(
                        search.text[i], transDict, found_translist)

                xls.try_all_search_combos(search, scoreKeeper, Greek_word_num,
                                          Greek_search_dict, Greek_text)
                print(scoreKeeper)
            else:
                print('Please enter a valid string\n')

    except (KeyboardInterrupt, EOFError):
        # EOFError: input() hit end-of-file (e.g. piped stdin ran out).
        print('\nProgram Terminated\n')
        sys.exit(0)
def search_by_phrase(input_phrase, language, transDict, Greek_word_num, Greek_search_dict, Greek_text, max_scoreboard_size, min_score):
    """Search the Greek corpus for a single phrase and return the results.

    Args:
        input_phrase: raw phrase to search for.
        language: language label handed to ``search_phrase``.
        transDict: thesaurus used by ``trn.get_translation``.
        Greek_word_num, Greek_search_dict, Greek_text: Greek corpus data
            forwarded unchanged to ``xls.try_all_search_combos``.
        max_scoreboard_size, min_score: forwarded to ``scoreboard``.

    Returns:
        ``ERROR`` when ``valid_search`` rejects the phrase; otherwise the
        tuple ``(scoreboard, translation_matrix)``.
    """
    # Guard clause: reject phrases that fail validation.
    if not valid_search(input_phrase):
        return ERROR

    # The matrix is seeded with one placeholder row ([""]) which is removed
    # again before returning.
    # NOTE(review): presumably try_all_search_combos appends its rows after
    # the placeholder - confirm against that function.
    translation_rows = [[""]]

    # Cache of lemma -> translations found, local to this call
    lemma_cache = {}
    board = scoreboard(max_scoreboard_size, min_score)
    query = search_phrase(input_phrase, language)

    # Look up the translation of every word in the phrase
    for position in range(query.search_len):
        query.has_translation[position] = trn.get_translation(
            query.text[position], transDict, lemma_cache)

    xls.try_all_search_combos(query, board, Greek_word_num, Greek_search_dict,
                              Greek_text, translation_rows)

    # Drop the placeholder row (in place, same list object is returned).
    del translation_rows[0]
    return board, translation_rows
def preprocessing():
    """Load the inputs for a whole-text comparison.

    Obtains the thesaurus, Latin and Greek filenames from
    ``xls.get_whole_text_comparison_Args()``, builds the thesaurus and builds
    a search dictionary for each corpus.

    Returns:
        ``(Latin_word_num, Latin_search_dict, Latin_text,
        Greek_word_num, Greek_search_dict, Greek_text)``
    """
    # Get filenames for the thesaurus, the latin text, and greek text
    thesaurus_filename, Latin_filename, Greek_filename = xls.get_whole_text_comparison_Args()

    # Read the thesaurus CSV file into a dictionary object for easy access
    # NOTE(review): transDict is built here but is not part of the return
    # value - confirm whether callers expect it to be returned as well.
    transDict = ths.build_thesaurus(thesaurus_filename)

    # Read both the latin text and greek text into a dictionary for easy access
    Latin_word_num, Latin_search_dict, Latin_text = xls.build_search_dictionary(
        Latin_filename, "Latin", True)
    # Fixed: this used to read `xls.build_search_dictionary = (...)`, which
    # overwrote the function with a tuple and unpacked the filename, "Greek"
    # and True instead of the built corpus data.
    Greek_word_num, Greek_search_dict, Greek_text = xls.build_search_dictionary(
        Greek_filename, "Greek", True)

    return Latin_word_num, Latin_search_dict, Latin_text, Greek_word_num, Greek_search_dict, Greek_text
def find_best_match(L1, L2, L1_translation, L2_translation, Greek_search_dict, Latin_text, Latin_word_num, Greek_text, Greek_word_num, scoreKeeper):
    """Try translation combinations of a Latin word pair and record matches.

    Every combination of one translation from ``L1.translations`` and one
    from ``L2.translations`` is passed to ``xls.find_match_pair`` until
    ``MAX_TRANSLATION_ATTEMPTS`` combinations have been tried.  Each truthy
    match is printed and added to ``scoreKeeper``.

    The incoming ``L1_translation`` / ``L2_translation`` argument values are
    never read; only the translations stored on ``L1`` and ``L2`` are used.

    Returns:
        None.
    """
    remaining = MAX_TRANSLATION_ATTEMPTS

    for first_translation in L1.translations:
        for second_translation in L2.translations:
            # Attempt budget exhausted: stop the whole search.
            if remaining <= 0:
                return
            remaining -= 1

            # Find a match for this particular pair of translations
            candidate = xls.find_match_pair(L1, L2, first_translation, second_translation,
                                            Greek_search_dict, Latin_text, Latin_word_num,
                                            Greek_text, Greek_word_num)
            if candidate:
                print('Match Found!\n' + str(candidate))
                # Add match to the scoreboard
                scoreKeeper.add_newMatch(candidate)

                # NOTE(review): the original comment says "if quit turned on,
                # quits after first match", yet the check fires when the flag
                # equals 0 - confirm the intended meaning of the constant.
                if QUIT_AFTER_FIRST_MATCH == 0:
                    return
def preprocessing():
    """Load everything the phrase search needs.

    Imports the Latin and Greek model corpora through ``CorpusImporter``
    (in that order), obtains the thesaurus and Greek filenames from
    ``xls.get_search_by_phrase_Args()``, builds the thesaurus and builds the
    search dictionary for the Greek text.

    Returns:
        ``(transDict, Greek_word_num, Greek_search_dict, Greek_text)``
    """
    # Import the model corpora, Latin first and then Greek
    model_corpora = (
        ('latin', 'latin_models_cltk'),
        ('greek', 'greek_models_cltk'),
    )
    for corpus_language, corpus_name in model_corpora:
        CorpusImporter(corpus_language).import_corpus(corpus_name)

    # Filenames for the thesaurus and the greek text
    thesaurus_filename, Greek_filename = xls.get_search_by_phrase_Args()

    # Thesaurus CSV file -> dictionary object for easy access
    transDict = ths.build_thesaurus(thesaurus_filename)

    # Greek text -> word count, search dictionary and indexed text
    greek_corpus = xls.build_search_dictionary(Greek_filename, "Greek", True)
    Greek_word_num, Greek_search_dict, Greek_text = greek_corpus

    return transDict, Greek_word_num, Greek_search_dict, Greek_text
def test_build_search_dict(curr_test, filename, words_in_file , language, lemmatized_version = False):
    """Check ``xls.build_search_dictionary`` against the raw contents of a file.

    Builds the search dictionary for ``filename`` and then verifies that
      * the reported word count equals ``words_in_file``, and
      * every normalized, non-empty word of the file is a key of the search
        dictionary whose index list contains that word's running position.

    Args:
        curr_test: test record; ``passed`` is set to False and a message is
            appended to ``errors`` for every problem found.
        filename: path of the text file to index.
        words_in_file: expected number of words.
        language: language label passed to ``build_search_dictionary``.
        lemmatized_version: passed through to ``build_search_dictionary``.

    Returns:
        The same ``curr_test`` object.
    """
    word_num, search_dict, _indexed_corpus = xls.build_search_dictionary(filename, language, lemmatized_version)

    if word_num != words_in_file:
        curr_test.passed = False
        curr_test.errors.append("Wrong number of words added (only " + str(word_num) + " out of " + str(words_in_file) + " words added) ")

    # Read the file once; the context manager closes it even on errors.
    with open(filename, 'r') as test_file:
        contents = test_file.read()

    # Words are separated by any Unicode whitespace.  The previous pattern
    # "[\p{Z}\t\r\n\v\f\s]" is not valid for Python's `re` module (\p is a
    # bad escape); str.split() covers whitespace and the Unicode separator
    # characters the pattern was aiming for.
    word_index = 0
    for raw_word in contents.split():
        curr_word = normalize_word(raw_word)
        if curr_word == "":
            # Nothing left after normalization: not counted as a word.
            continue

        if curr_word not in search_dict:
            curr_test.passed = False
            error_message = curr_word + " was not found in the search dict"
            curr_test.errors.append(error_message)
        elif word_index not in search_dict[curr_word]:
            curr_test.passed = False
            error_message = curr_word + " did not have the proper index in the search dict"
            error_message += "\n\t\t\t word number: " + str(word_index) + " Indices: " + str(search_dict[curr_word])
            curr_test.errors.append(error_message)

        word_index += 1

    return curr_test
# Save lemma to translations found found_translist = {} #Create scoreboard scoreKeeper = scoreboard(MAX_SCOREBOARD_SIZE, MIN_SCORE) try: print("Beginning Search\n") # Finds all word pairs in Latin_text that are within 1,2,3, .. MAX_DISTANCE_LATIN positions of each other for j in range(1, MAX_DISTANCE_LATIN): for i in range(Latin_word_num - j - 1): # For both the first and second word in the pair find the word, its position, and the occurences in the text L1, L2 = xls.get_LatinWordStats(Latin_text, Latin_search_dict, i, j) if -1 == trn.get_translation_pair(L1, L2, transDict, found_translist): continue # Determine the maximum score for the word pair using all possible combinations of translations find_best_match(L1, L2, L1_translation, L2_translation, Greek_search_dict, Latin_text, Latin_word_num, Greek_text, Greek_word_num, scoreKeeper ) print(scoreKeeper) except KeyboardInterrupt: print('\nProgram Terminated\n') print(scoreKeeper) sys.exit(0)
def process_corpus(self, filename, language, make_IndexedText=True, use_lemmatized_text=False):
    """Build the search data for a corpus file and store it on the instance.

    Sets ``self.word_num``, ``self.search_dict`` and ``self.indexed_corpus``
    from ``xls.build_search_dictionary``.  ``self.corpus_ready`` is False
    while the build runs and is only set to True after it succeeds.

    ``make_IndexedText`` is accepted but not used by this method.
    """
    # Mark the corpus unavailable until the build has finished.
    self.corpus_ready = False

    corpus_data = xls.build_search_dictionary(filename, language, use_lemmatized_text)
    self.word_num, self.search_dict, self.indexed_corpus = corpus_data

    self.corpus_ready = True
def _record_failure(curr_test, message):
    """Mark ``curr_test`` as failed and append ``message`` to its errors."""
    curr_test.passed = False
    curr_test.errors.append(message)


def test_functions():
    """Run the unit tests for the cross-lingual search helpers.

    Each scenario creates a ``test`` record, exercises one helper from
    ``xls`` / ``trn`` and records any failed expectation on that record.

    Returns:
        The list of ``test`` records, one per scenario, in execution order.
    """
    print("Starting unit testing of simpleXLing.py")
    tests = []

    #====Build Search Dict====#
    # Attempts to build a search dictionary for a .txt file containing Latin
    # words (doesn't check lemmatized text file)
    curr_test = test("Build latin search dictionary (XLingFunctions.py)")
    latin_filename = "./test_files/small_latin.txt"
    words_in_file = 663
    language = "Latin"
    lemmatized_version = False
    tests.append(test_build_search_dict(curr_test, latin_filename, words_in_file, language, lemmatized_version))

    # Attempts to build a search dictionary for a .txt file containing Greek
    # words (doesn't check lemmatized text file)
    curr_test = test("Build greek search dictionary (XLingFunctions.py)")
    greek_filename = "./test_files/small_greek.txt"
    # Fixed: this was assigned to the misspelled `word_in_file`, so the Greek
    # test silently reused the Latin word count.
    words_in_file = 789
    language = "Greek"
    lemmatized_version = False
    tests.append(test_build_search_dict(curr_test, greek_filename, words_in_file, language, lemmatized_version))

    # Builds an arbitrary translation dictionary and attempts to find
    # translations for a valid word pair
    curr_test = test("Get valid translations (translate.py)")
    LA = word("latina", 1, 3, None)
    LB = word("latinb", 2, 4, None)
    latin_a_translations = ["greek1", "greek2", "greek3"]
    latin_b_translations = ["greek4"]
    transDict = {
        "latina": latin_a_translations,
        "latinb": latin_b_translations,
    }
    # NOTE(review): other call sites pass a fourth "found translations"
    # argument to get_translation_pair - confirm it is optional.
    result = trn.get_translation_pair(LA, LB, transDict)
    if result == -1:
        _record_failure(curr_test, "Translations for two valid dictionary entries were not found")
    if LA.translations != latin_a_translations:
        _record_failure(curr_test, "Latina.translations doesn't match the actual translations")
    if LB.translations != latin_b_translations:
        _record_failure(curr_test, "Latinb.translations doesn't match the actual translations")
    tests.append(curr_test)

    # Attempts to find a translation for an invalid word pair
    curr_test = test("Get invalid translations (translate.py)")
    LC = word("latinc", 2, 3, None)
    result = trn.get_translation_pair(LA, LC, transDict)
    if result != -1:
        # Fixed: the message was copy-pasted from the valid-pair test and
        # described the opposite failure.
        _record_failure(curr_test, "A translation pair was reported even though one word is not in the dictionary")
    if LA.translations != latin_a_translations:
        _record_failure(curr_test, "Latin_a.translations doesn't match the actual translations")
    if LC.translations is not None:
        _record_failure(curr_test, "Latin_c. shouldn't have any translations")
    tests.append(curr_test)

    # Search for a translation of a pair of latin words in a greek search dictionary
    curr_test = test("Get Greek translation pair (XLingFunctions.py)")
    L1_translation = "greek1"
    L2_translation = "greek2"
    G_search_dict = {'greek1': [1, 3], 'greek2': [2]}
    G1, G2 = xls.get_GreekPair(L1_translation, L2_translation, G_search_dict)
    if G1 != [1, 3]:
        _record_failure(curr_test, "Translation array (array of indices in greek text where translation of latin word appear) is incorrect")
    if G2 != [2]:
        _record_failure(curr_test, "Translation array (array of indices in greek text where translation of latin word appear) is incorrect")
    tests.append(curr_test)

    # Search for a translation of a pair of latin words in a greek search
    # dictionary (when translations aren't in dictionary)
    curr_test = test("Get Greek translation pair with incomplete search dict (XLingFunctions.py)")
    L3_translation = "greek3"
    G1, G3 = xls.get_GreekPair(L1_translation, L3_translation, G_search_dict)
    if G1 is not None:
        _record_failure(curr_test, "Translation array (array of indices in greek text where translation of latin word appear) is incorrect")
    if G3 is not None:
        _record_failure(curr_test, "Translation array (array of indices in greek text where translation of latin word appear) is incorrect")
    tests.append(curr_test)

    # Get latin word stats
    curr_test = test("Get Latin word stats (XLingFunctions.py)")
    latin_text = ['L1', 'L2', 'L1', 'L1', 'L2', 'L3', 'L4', 'L5', 'L5', 'L5']
    latin_search_dict = {'L1': [0, 2, 3], 'L2': [1, 4], 'L3': [5], 'L4': [6], 'L5': [7, 8, 9]}
    i = 3
    j = 4
    L1, L2 = xls.get_LatinWordStats(latin_text, latin_search_dict, i, j)
    if L1.word != "L1":
        _record_failure(curr_test, "Latin word object #1 corresponds to the wrong word (" + str(L1.word) + ")")
    if L1.pos != 3:
        _record_failure(curr_test, "Latin word object #1 corresponds to the wrong position in text (" + str(L1.pos) + ")")
    if L1.occurences != 3:
        _record_failure(curr_test, "Latin word object #1 should occur 3 times, only occurs" + str(L1.occurences) + " times")
    if L2.word != "L5":
        _record_failure(curr_test, "Latin word object #2 corresponds to the wrong word (" + str(L2.word) + ")")
    if L2.pos != 7:
        _record_failure(curr_test, "Latin word object #2 corresponds to the wrong position in text (" + str(L2.pos) + ")")
    if L2.occurences != 3:
        _record_failure(curr_test, "Latin word object #2 should occur 3 times, only occurs" + str(L2.occurences) + " times")
    tests.append(curr_test)

    # Find match pair test (three possible matches in greek text, should take
    # the match with the two words side by side).  Uses the same L1, L2 and
    # latin text as above.
    curr_test = test("Find match pair given three matches in the greek corpus (XLingFunctions.py")
    L1_translation = "g1"
    L2_translation = "g2"
    Greek_text = ["g1", "x", "x", "g2", "g1", "x", "g2", "x", "x", "x", "g1"]
    Greek_search_dict = {"g1": [0, 4, 10], "g2": [3, 6], "x": [1, 2, 5, 7, 8, 9]}
    # NOTE(review): this call passes a trailing None that other call sites of
    # find_match_pair do not - confirm against its signature.
    bestMatch = xls.find_match_pair(L1, L2, L1_translation, L2_translation, Greek_search_dict,
                                    latin_text, len(latin_text), Greek_text, len(Greek_text), None)
    if not bestMatch:
        _record_failure(curr_test, "No match was found")
    elif bestMatch.G1_pos != 4 or bestMatch.G2_pos != 3:
        _record_failure(curr_test, "The best match did not occur in the expected position ( G1 = " + str(bestMatch.G1_pos) + " G2 = " + str(bestMatch.G2_pos))
    tests.append(curr_test)

    # Ensures that try_all_search_combos produces all combos of position indices
    curr_test = test("Try all search combos (finds best match in a greek text given a search prhase)")
    search = search_phrase("L1 L2 L3", "Latin")
    search.has_translation = [1, 1, 1]
    search.text[0].translations = ["g1", "g4", "g3"]
    search.text[1].translations = ["g6"]
    search.text[2].translations = ["g5", "g2"]
    score = scoreboard(1)
    xls.try_all_search_combos(search, score, len(Greek_text), Greek_search_dict, Greek_text)
    # An empty scoreboard is reported as a failure instead of raising on
    # matches[0] and aborting the remaining bookkeeping.
    if not score.matches or not (score.matches[0].G1_pos == 4 and score.matches[0].G2_pos == 3):
        _record_failure(curr_test, "The wrong top match was found for the search")
    tests.append(curr_test)

    return tests