import pickle
import re

# get_text, save_docx, list_numbered_glosses, find_page and the other loaders used
# below are project helpers assumed to be defined elsewhere in this repo


def create_tokeniser_test_training():
    """Splits the glosses into two lists: one for training a character-level, LSTM-based
       tokeniser, and another containing 41 pre-selected glosses as a test set.
       The test set is split into two further lists: the first contains the untokenised
       glosses of the test set; the second contains the same glosses, manually tokenised.
       Saves each list as a pickle file."""
    glist = list_numbered_glosses("Wurzburg Glosses", 499, 712)

    # List numbers of all chosen test-set glosses in order
    testglossids = [
        "2c4.", "5b11.", "5b28.", "6c7.", "6c9.", "9a14.", "9b4.", "9c20.", "10b27.",
        "10c21.", "10d23.", "10d36.", "11a24.", "12a22.", "12c9.", "12c29.", "12c32.",
        "12c36.", "14a8.", "14c2a.", "14c18.", "14c23.", "14d17.", "14d26.", "15a18.",
        "16d8.", "17d27.", "18a14.", "18c6.", "19b6.", "21a8.", "21c19.", "23b7.",
        "23d10.", "26b6.", "27a24.", "28c2.", "28d16.", "29d19.", "30b4.", "31c7."
    ]

    # Takes manually tokenised test-set glosses from file and adds them to a dictionary
    # so that they can be found using the list above as keys
    man_tok_glosslist = get_text("Manually Tokenised Glosses").split("\n")
    mtgidpat = re.compile(r'\(\d{1,2}\w \d{1,2}\w?\) ')
    mtgs_with_ids = {}
    for mtg in man_tok_glosslist:
        mtgpatitir = mtgidpat.finditer(mtg)
        for mtgiditir in mtgpatitir:
            mtgloss = "".join(mtg.split(mtgiditir.group()))
            mtgid = "".join(mtg.split(mtgloss))
            mtgid_fix = "".join(mtgid.split(" ")) + "."
            mtgid_fix = "".join(mtgid_fix.split("("))
            mtgid_fix = "".join(mtgid_fix.split(")"))
            mtgs_with_ids[mtgid_fix] = mtgloss

    # Creates the test and training lists
    testglosses = []
    testglosses_tokenised = []
    trainglosses = []
    for g in glist:
        if g[0] in testglossids:
            testglosses.append(g[1])
            testglosses_tokenised.append(mtgs_with_ids.get(g[0]))
        else:
            trainglosses.append(g[1])

    # Combines the untokenised and tokenised test sets
    testglosses_set = [testglosses, testglosses_tokenised]

    # Saves the test and training sets to pickle files
    pickletest_out = open("toktest.pkl", "wb")
    pickle.dump(testglosses_set, pickletest_out)
    pickletest_out.close()
    pickletrain_out = open("toktrain.pkl", "wb")
    pickle.dump(trainglosses, pickletrain_out)
    pickletrain_out.close()

    return "\nTest and Training Sets Compiled for Gloss Tokenisation.\n"
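# For reference, a minimal sketch of how the two pickle files written above can be
# loaded back (assumes they sit in the working directory):
# with open("toktest.pkl", "rb") as f:
#     testglosses, testglosses_tokenised = pickle.load(f)  # parallel test lists
# with open("toktrain.pkl", "rb") as f:
#     trainglosses = pickle.load(f)  # flat list of training glosses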
def opchsave(filename):
    """Opens a document, applies an edit to its text, and saves a copy of the document."""

    # Open Document and get text
    text = get_text(filename)

    # Change Document somehow
    # Here instances of "...[a][/GLat]" are being changed to "...[/GLat][a]"
    glatpat = re.compile(r'\[\w\]\[/GLat\]')
    glatpatitir = glatpat.finditer(text)
    swaplist = []
    for i in glatpatitir:
        if i.group() not in swaplist:
            swaplist.append(i.group())
    for error in swaplist:
        letter = error[:3]  # the bracketed letter, e.g. "[a]"
        fix = "[/GLat]" + letter
        textlist = text.split(error)
        text = fix.join(textlist)

    # Save a copy of the updated Document
    save_docx(text, filename)

    return "Completed!"
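# A quick illustration of the tag swap above on an invented sample string
# (the string and its contents are hypothetical; file I/O is omitted):
# sample = "dominus[a][/GLat] glossed here"
# for match in set(re.findall(r'\[\w\]\[/GLat\]', sample)):
#     sample = ("[/GLat]" + match[:3]).join(sample.split(match))
# print(sample)  # dominus[/GLat][a] glossed here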
def get_pages(filename, startpage=499, endpage=712):
    """Opens the text of the Glosses from the document and returns the text of the
       selected page range"""
    alltext = get_text(filename)
    pagestext = []
    lastpage = 0
    nextpagepoint = 0
    startpoint = 0

    # Clamp the requested range to the valid page span, 499-712;
    # an impossible bound falls back to the corresponding default
    if startpage < 499:
        startpage = 499
    if endpage > 712:
        endpage = 712
    if startpage > 712:
        startpage = 499
    if endpage < 499:
        endpage = 712

    # The full span can be returned as-is; otherwise skip ahead to the start page
    if startpage == 499 and endpage == 712:
        return alltext
    elif startpage > 499:
        alltext = alltext[find_page(alltext, startpage):]

    # Collect each page's text, cutting at the last line break before the
    # next page number
    for page in range(startpage, endpage + 1):
        if lastpage == 0:
            pageno = startpage
        else:
            pageno = lastpage + 1
        startpoint += nextpagepoint
        if pageno == 712:
            pagetext = alltext[startpoint:]
        else:
            nextpage = pageno + 1
            pagetext = alltext[startpoint:]
            nextpagepoint = pagetext.find(str(nextpage))
            pagetext = pagetext[:nextpagepoint]
            nextpagepoint = pagetext.rfind("\n")
            pagetext = pagetext[:nextpagepoint]
        pagestext.append(pagetext)
        lastpage = pageno
    pagestext = "".join(pagestext)
    return pagestext
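# Example usage (a sketch; "Wurzburg Glosses" is the document name used elsewhere
# in this repo):
# pages = get_pages("Wurzburg Glosses", 500, 510)  # text of pages 500-510
# full = get_pages("Wurzburg Glosses", 400, 800)   # out-of-range bounds are
#                                                  # clamped, so this returns all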
from os import listdir
from os import path as op

# # Import the training set of glosses for use as the single training text
# one_text_in = open("toktrain.pkl", "rb")
# one_text = " ".join(pickle.load(one_text_in))
# text_name = "Wb. Training Glosses"

# Import and clean CELT texts for use as the single training text
clean_text_list = []
all_clean_files = [
    f for f in listdir("CELT_Texts_Clean") if op.isfile(op.join("CELT_Texts_Clean", f))
]
for cf in all_clean_files:
    cf = "".join(cf.split(".docx"))
    if cf != ".DS_Store":
        cf = op.join("CELT_Texts_Clean", cf)
        clean_text_list.append(get_text(cf))
one_text = " ".join(clean_text_list)
one_text = " ".join(one_text.split("\n"))
while "  " in one_text:  # collapse double spaces until only single spaces remain
    one_text = " ".join(one_text.split("  "))
text_name = "CELT Collection"

# Import test and train sets for character mapping
train_in = open("toktrain.pkl", "rb")
train_set = pickle.load(train_in)
test_in = open("toktest.pkl", "rb")
test_set = pickle.load(test_in)
x_train = remove_non_glosses(train_set)

# temp = []  # Reverse x_train for reverse models
# for x_trainer in x_train[::-1]:  # Reverse x_train for reverse models
#     new_trainer = x_trainer[::-1]  # Reverse x_train for reverse models
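# The double-space collapse above converges to single spaces; an equivalent
# one-pass alternative (a sketch using the re module):
# one_text = re.sub(r' {2,}', ' ', one_text)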
def cleantext_CELT(filename):
    """Opens docx file of text as copied from CELT, removes line numbers and
       punctuation, saves the updated document."""

    # Open Document and get text
    text = get_text(filename)
    linelist = text.split("\n")

    # Here lines are stripped of white space at the end
    textlist = []
    for line in linelist:
        line = line.strip()
        textlist.append(line)
    text = "\n".join(textlist)

    # Here irregular intrusions into texts are removed
    removes = [
        "[LU1]", "[LU2]", "\nf. L.", 'L. f.', '.r.', '.C.', '.u.', ' m.', " c.",
        " e.", " R.", " U.", ".ix.", ".x.", ".xx.", ".xxx.", ".u."
    ]
    for rem in removes:
        text = "".join(text.split(rem))

    # Here punctuated items are replaced so that they are not changed when punctuation is removed
    rep_list = [".i.", ".l.", "rl."]
    for replacer in rep_list:
        rep_str = "***".join(replacer.split("."))
        text = rep_str.join(text.split(replacer))

    # Here hyphenated items are replaced appropriately
    post_hyph = ["h-", "m-", "n-", "l-", "t-", "s-", "c-", "r-"]
    for hyph_item in post_hyph:
        hyphpat = re.compile(r'[ ‘\n]' + hyph_item)
        hyphpatitir = hyphpat.finditer(text)
        for hyphpatitem in hyphpatitir:
            thishyphitem = hyphpatitem.group()
            hyph_gone = "".join(thishyphitem.split("-"))
            text = hyph_gone.join(text.split(thishyphitem))
    # NOTE: the separator on the next line was invisible in the source and is assumed
    # here to be a soft hyphen (U+00AD); splitting on an empty string would raise a
    # ValueError
    text = " ".join(text.split("\u00ad"))
    text = " ".join(text.split("-"))

    # Here the letter v is replaced with u wherever used
    text = "u".join(text.split("v"))
    text = "u".join(text.split("V"))

    # Here apostrophes are removed where they represent a split word
    apostlist = ["'s", "'S", "m' ", "d' ", "th' ", "t' ", "T' "]
    for apost in apostlist:
        apost_gone = "".join(apost.split("'"))
        text = apost_gone.join(text.split(apost))

    # Here line numbers are removed
    remnumlist = []
    linopat = re.compile(r'\n\d{1,4}\] ?')
    linopatitir = linopat.finditer(text)
    for i in linopatitir:
        num = i.group()
        remnumlist.append(num)
    for i in remnumlist[::-1]:
        text = "\n".join(text.split(i))

    # Here page numbers are removed
    pnumlist = []
    pnopat = re.compile(r'p\.\d{1,3}')
    pnopatitir = pnopat.finditer(text)
    for i in pnopatitir:
        pnum = i.group()
        pnumlist.append(pnum)
    for i in pnumlist[::-1]:
        text = "".join(text.split(i))

    # Here folio information is removed
    follist = []
    folpat = re.compile(r'-?{.+}')
    folpatitir = folpat.finditer(text)
    for j in folpatitir:
        fol = j.group()
        follist.append(fol)
    for j in follist:
        text = "".join(text.split(j))

    # Here & and 'et' are replaced with ⁊
    text = "⁊".join(text.split("&"))
    text = " ⁊ ".join(text.split(" et "))

    # Here punctuation is removed
    punclist = [
        '!', ',', '.', ':', ';', '?', '"', "'", '‘', '’', '[', ']', '(', ')',
        '|', '/', '—', '_'
    ]
    for punc in punclist:
        text = "".join(text.split(punc))

    # Here punctuated items are reinserted into the text where they were replaced
    reinst_list = ["***i***", "***l***", "rl***"]
    for reinstater in reinst_list:
        reinst_str = ".".join(reinstater.split("***"))
        text = reinst_str.join(text.split(reinstater))

    # Here double spacing and triple line spacing are removed
    while "\n\n\n" in text:
        text = "\n\n".join(text.split("\n\n\n"))
    while "  " in text:
        text = " ".join(text.split("  "))
    while "\n " in text:
        text = "\n".join(text.split("\n "))

    # Here text is stripped and lower-cased
    text = text.lower()
    text = text.strip()

    # Save a copy of the updated Document (clean_dir and raw_dir are module-level
    # directory paths)
    save_docx(text, op.join(clean_dir, filename[len(raw_dir) + 1:] + "_cleaned"))

    return "Complete!"
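# Example usage (a sketch; the raw-directory name is assumed, and save_docx is
# assumed to append the .docx extension, as the extension-stripping in the CELT
# loader suggests):
# raw_dir = "CELT_Texts_Raw"
# clean_dir = "CELT_Texts_Clean"
# cleantext_CELT(op.join(raw_dir, "TBF"))  # writes CELT_Texts_Clean/TBF_cleaned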
# Save Model
model.save(NAME)
print("Model {} saved".format(NAME))


"""Parameters Input:"""

# # Choose and name text to train on
# text_name = "Wb. Training Glosses"
# text_designation = "Wb"
# one_text = [" ".join(pickle.load(open("toktrain.pkl", "rb")))]
text_name = "Táin Bó Fraích"
text_designation = "TBF"
one_text = [rem_dubspace(" ".join((get_text("TBF_cleaned")).split("\n")))]

# Map all test and training characters
mappings = map_chars(load_data(one_text, text_name))
chardict, rchardict, vocab_size = mappings[0], mappings[1], mappings[2]
# # Save the mapping
# pickle.dump(chardict, open('char_mappingTBF.pkl', 'wb'))  # Name mapping

# Set how many characters the model should look at before predicting an upcoming character
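# A brief sketch of how the mappings are typically used, assuming chardict maps
# characters to integer indices and rchardict is its inverse:
# encoded = [chardict[c] for c in "táin"]            # characters -> integers
# decoded = "".join(rchardict[i] for i in encoded)   # integers -> characters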
def openhandlists(file):
    """Gets the text from a gloss-hand file"""
    filetext = get_text(file)
    return filetext