def _find_gloss_folio(gloss, foliolist):
    """Return the folio label for *gloss*, or a placeholder if none matches.

    *foliolist* holds ``[folio, foliotext]`` pairs.  The whole gloss is looked
    for first; failing that, its first eleven characters are looked for.  Only
    the first matching folio is returned, so a row never gains more than one
    folio column.
    """
    for folio, foliotext in foliolist:
        if gloss in foliotext:
            return folio
    # Fall back to the opening of the gloss when the full text is not found
    # verbatim inside any single folio's text.
    glossstub = gloss[:11]
    for folio, foliotext in foliolist:
        if glossstub in foliotext:
            return folio
    return "No Folio Information Found"


def get_glinfobasic(file, startpage=499, stoppage=712):
    """Return an infolist of basic information for every gloss in a page range.

    The first sublist contains the headers for an info-table.  Each later
    sublist describes one gloss found on pages ``startpage`` to ``stoppage``
    (inclusive): Epistle, Page No., Folio, Gloss no. and tag-free Gloss Text.

    Args:
        file: Source handed unchanged to ``get_pages``.
        startpage: First page to process (inclusive).
        stoppage: Last page to process (inclusive).

    Returns:
        A list of lists, header row first.  The number and text columns are
        added once per gloss-number match found in the gloss, so a gloss with
        no recognisable number yields a row that ends after the folio column.
    """
    # Compiled once; the pattern is the same for every gloss on every page.
    glossnopat = re.compile(r'(\d{1,2}[a-z]?, )?\d{1,2}[a-z]?\. ')
    curepist = "Unknown"
    infolist = [["Epistle", "Page", "Folio", "Gloss No.", "Gloss Text"]]
    for page in range(startpage, stoppage + 1):
        # The page is fetched once and reused for the epistle heading, the
        # gloss list and the folio list.
        pagetext = get_pages(file, page, page)
        epfunc = get_tagtext(pagetext, "H2")
        if epfunc:
            # An "H2" tag on the page gives the epistle name used from here on.
            curepist = epfunc[0]
        sgtext = "\n\n".join(get_section(pagetext, "SG"))
        glosslist = order_glosslist(clear_tags(sgtext))
        # get_fol yields (text, folio) pairs; store them as [folio, text].
        foliolist = [
            [folinfo[1], folinfo[0]]
            for folinfo in get_fol(order_glosses(clear_tags(sgtext, "fol")))
        ]
        for gloss in glosslist:
            thisglosslist = [
                curepist, page,
                _find_gloss_folio(gloss, foliolist)
            ]
            for match in glossnopat.finditer(gloss):
                number = match.group()
                # Drop the trailing ". " from the number; the gloss text is
                # whatever follows the number's first occurrence in the gloss.
                thisglosslist.extend([
                    number[:-2],
                    gloss[gloss.find(number) + len(number):]
                ])
            infolist.append(thisglosslist)
    return infolist
def list_numbered_glosses(file, startpage, stoppage):
    """List glosses by their folio ID and gloss number.

    Args:
        file: Source handed unchanged to ``get_pages``.
        startpage: First page to process (inclusive).
        stoppage: Last page to process (inclusive).

    Returns:
        A list of ``[gloss_id, gloss]`` pairs.  ``gloss_id`` is the folio
        label minus its first three characters, followed by the gloss number
        with its trailing space removed.  ``gloss`` is ``cleangloss`` applied
        to the gloss as found on the page.  One pair is produced for every
        (folio containing the gloss, number match in the gloss) combination.
    """
    # Compiled once; the pattern is the same for every gloss on every page.
    numpat = re.compile(r'(\d{1,2}[a-z]?, )?\d{1,2}[a-z]?\. ')
    glist = []
    for p in range(startpage, stoppage + 1):
        # Fetch and join the page's "SG" section once for both uses below.
        sgtext = "\n\n".join(get_section(get_pages(file, p, p), "SG"))
        fcont = get_fol(order_glosses(clear_tags(sgtext, "fol")))
        for g in order_glosslist(sgtext):
            # The tag-free gloss does not depend on the folio being tested.
            raw_gloss = clear_tags(g)
            numbers = [match.group() for match in numpat.finditer(raw_gloss)]
            for fol in fcont:
                if raw_gloss in fol[0]:
                    for number in numbers:
                        glist.append(
                            [fol[1][3:] + number[:-1], cleangloss(g)])
    return glist
def get_glinfo(file, startpage=499, stoppage=712):
    """Returns an infolist containing multiple sublists.

       The first sublist contains the headers for an info-table.
       Subsequent lists contain, for every gloss in Wb. within a set page range, the columns
       named in that header row: Epistle, Page, Folio, Gloss No., the gloss with all of its
       markup tags, the Gloss Text (with [GLat][/GLat] tags converted to html emphasis tags
       before being passed through clear_tags), the gloss text after footnote-marker
       processing, the footnotes relevant to the gloss, a personal note, and the
       gloss translation.

       file is passed unchanged to the page, footnote, note and translation helpers.
       startpage and stoppage are both inclusive.

       Raises RuntimeError if a personal note does not begin with exactly one gloss number."""
    curepist = "Unknown"
    infolist = [[
        "Epistle", "Page", "Folio", "Gloss No.", "Gloss Full-Tags",
        "Gloss Text", "Gloss Footnotes", "Relevant Footnotes",
        "Adrian's Notes", "Gloss Translation"
    ]]
    # Translations for the whole page range; entries are consumed from the front further down.
    pagestrans = get_transpagesinfo(file, startpage, stoppage)
    for page in range(startpage, stoppage + 1):
        thispage = page
        pagetext = get_pages(file, thispage, thispage)
        # Gets all page Footnotes for the first time (for the gloss)
        footnotelist = order_footlist(file, page)
        # Gets all notes supplied by me (for the gloss)
        notelist = order_newlist(file, page)
        newnotelist = list()
        notefol = False
        # A list holding one empty string is treated as "no notes on this page".
        if notelist != ['']:
            for notenum, note in enumerate(notelist):
                # Folio markers of the form [f. 1a] / [/f. 1a] inside the note.
                noteidpat = re.compile(r'\[/?f\. \d{1,2}[a-d]\]')
                noteiditer = noteidpat.findall(note)
                if noteiditer:
                    # Removes every folio marker from the note text.
                    for folinfo in noteiditer:
                        note = "".join(note.split(folinfo))
                    # Keeps the first marker, stripped of brackets and slash, as the folio id.
                    folinfo = "".join(i for i in noteiditer[0]
                                      if i not in ["[", "]", "/"])
                    if notenum == 0:
                        notefol = folinfo
                    elif notefol != folinfo:
                        notefol = folinfo
                # A note without a marker keeps the folio of the note before it.
                notenumpat = re.compile(r'^\d{1,2}[a-z]?\. ')
                notenumiter = notenumpat.findall(note)
                if not notenumiter:
                    raise RuntimeError(
                        f"Personal note found without link to gloss number.\nNote: {note}"
                    )
                elif len(notenumiter) > 1 or notenumiter[0] != note[:len(notenumiter[0])]:
                    raise RuntimeError(
                        f"Multiple possible gloss numbers found for personal note.\nNote: {note}"
                    )
                elif notenumiter[0] == note[:len(notenumiter[0])]:
                    # Splits the leading gloss number (minus ". ") from the note text.
                    glossnum = notenumiter[0][:-2]
                    note = note[len(notenumiter[0]):].strip()
                    newnotelist.append([notefol, glossnum, note])
        # Checks for a new epistle on the current page.
        epfunc = get_tagtext(pagetext, "H2")
        if epfunc:
            curepist = epfunc[0]
        # Identifies individual glosses on the current page, and adds them to a gloss-list.
        glosslist = order_glosslist(
            clear_spectags("\n\n".join(get_section(pagetext, "SG")), "fol"))
        foliolist = []
        # Creates a list of folios and related gloss text for the current page.
        for folinfo in get_fol(
                order_glosses("\n\n".join(
                    get_section(get_pages(file, thispage, thispage), "SG")))):
            folio = folinfo[1]
            foliotext = folinfo[0]
            foliolist.append([folio, foliotext])
        # Creates a list with the current epistle name and page,
        # Checks for each gloss on the current page which folio it is in,
        # Adds folio information to the list.
        for gloss in glosslist:
            thisglosslist = [curepist, thispage]
            glossfound = False
            # NOTE(review): a folio is appended for every folio whose text contains the gloss,
            # so more than one match would shift the later columns - confirm this cannot occur.
            for foltextlist in foliolist:
                if gloss in foltextlist[1]:
                    thisfolio = foltextlist[0]
                    thisglosslist.append(thisfolio)
                    glossfound = True
            # Falls back to matching only the first eleven characters of the gloss.
            if not glossfound:
                glossstub = gloss[:11]
                for foltextlist in foliolist:
                    if glossstub in foltextlist[1]:
                        thisfolio = foltextlist[0]
                        thisglosslist.append(thisfolio)
                        glossfound = True
            if not glossfound:
                thisglosslist.append("No Folio Information Found")
            # Identifies gloss numbers and removes them from the gloss text.
            glossnopat = re.compile(r'(\d{1,2}[a-z]?, )?\d{1,2}[a-z]?\. ')
            glosspatitir = glossnopat.finditer(gloss)
            # NOTE(review): the columns below are added once per number match; the later code
            # reads columns 3 and 9, which presumes exactly one match per gloss - confirm.
            for i in glosspatitir:
                # Adds gloss number to list.
                thisglosslist.append(i.group()[:-2])
                # Identifies foundational gloss including all markup tags.
                glossfulltags = gloss[gloss.find(i.group()) + len(i.group()):]
                # Creates a display copy of the gloss text, replacing Latin tags with html emphasis tags.
                glosstext = glossfulltags
                if "[GLat]" in glosstext:
                    glosstextlist = glosstext.split("[GLat]")
                    glosstext = "<em>".join(glosstextlist)
                if "[/GLat]" in glosstext:
                    glosstextlist = glosstext.split("[/GLat]")
                    glosstext = "</em>".join(glosstextlist)
                # Creates 2 copies of display gloss text, one primary, one retaining footnotes in superscript tags.
                basegloss = clear_tags(glosstext)
                footnotesgloss = glosstext[:]
                # Single-letter footnote tags such as [a] and [/a].
                footnotepat = re.compile(r'\[/?[a-z]\]')
                fnpatitir = footnotepat.finditer(footnotesgloss)
                fnlist = []
                for j in fnpatitir:
                    fnlist.append(j.group())
                # Without footnote tags the "Relevant Footnotes" column is an empty string.
                if not fnlist:
                    fnstring = ""
                    thisglosslist.extend([
                        glossfulltags, basegloss,
                        clear_tags(footnotesgloss), fnstring
                    ])
                if fnlist:
                    for fntag in fnlist:
                        if "[/" in fntag:
                            # For a closing tag: removes its opening tag from the text and
                            # rebuilds the text around the closing tag with the letter in <sup> tags.
                            endtag = fntag
                            begintag = "".join(endtag.split("/"))
                            footnotesgloss = "".join(
                                footnotesgloss.split(begintag))
                            tagplace = footnotesgloss.find(endtag)
                            footnotesgloss = footnotesgloss[:tagplace] + "<sup>" +\
                                footnotesgloss[tagplace + 2: tagplace + 3] + "</sup>" +\
                                footnotesgloss[tagplace + 4:]
                            # The opening tag is also dropped from fnlist, which is the list
                            # being iterated over here.
                            if begintag in fnlist:
                                del fnlist[fnlist.index(begintag)]
                    glossfnlist = []
                    for footnote in fnlist:
                        fnletter = footnote[-2]
                        # Collects footnotes relevant to this gloss and adds them to a list.
                        for fnote in footnotelist:
                            fnoteid = fnote[:1]
                            if fnletter == fnoteid:
                                glossfnlist.append(
                                    clear_tags(fnote[:1] + ":" + fnote[1:]))
                        fnsuperscript = "<sup>" + fnletter + "</sup>"
                        footnotesgloss = fnsuperscript.join(
                            footnotesgloss.split(footnote))
                    thisglosslist.extend([
                        glossfulltags, basegloss,
                        clear_tags(footnotesgloss), glossfnlist
                    ])
                # Adds the personal note for this folio and gloss number, or an empty
                # string, so that the note column is filled once per gloss.
                if newnotelist:
                    for note in newnotelist:
                        folinfo = note[0]
                        glossnum = note[1]
                        notetext = note[2]
                        if folinfo == thisglosslist[2] and glossnum == thisglosslist[3]:
                            if len(thisglosslist) == 8:
                                thisglosslist.extend([notetext])
                            elif len(thisglosslist) == 9:
                                # Replaces a placeholder left by an earlier, non-matching note.
                                if thisglosslist[8] == "":
                                    thisglosslist[8] = notetext
                        else:
                            anstring = ""
                            if len(thisglosslist) == 8:
                                thisglosslist.extend([anstring])
                elif not newnotelist:
                    anstring = ""
                    thisglosslist.extend([anstring])
            infolist.append(thisglosslist)
    # add translations to the end of the info-lists where they are available
    # NOTE(review): pagestrans[0] is read for every row, so this presumes the translation
    # list is never exhausted before the rows are - confirm.
    for infoset in infolist[1:]:  # exclude the first info-set containing the titles
        glossid = infoset[3]
        curpagetrans = pagestrans[0]
        curtransid = curpagetrans[0]
        curtrans = curpagetrans[1]
        # deal with the conjoined gloss on TPH p. 500 (1b10 + 1b11)
        # split the gloss id into the two numbers, use these to identify the two translations
        # conjoin the two translations and append them to the info-set for the conjoined gloss ids
        if ", " in glossid:
            glossidlist = glossid.split(", ")
            splittranslations = []
            for newid in glossidlist:
                if newid == curtransid:
                    splittranslations.append(curtrans)
                    # Moves on to the next translation in the queue.
                    del pagestrans[0]
                    curpagetrans = pagestrans[0]
                    curtransid = curpagetrans[0]
                    curtrans = curpagetrans[1]
            joinedtrans = " i.e. ".join(splittranslations)
            if "[GLat]" in joinedtrans:
                transtextlist = joinedtrans.split("[GLat]")
                joinedtrans = "<em>".join(transtextlist)
            if "[/GLat]" in joinedtrans:
                transtextlist = joinedtrans.split("[/GLat]")
                joinedtrans = "</em>".join(transtextlist)
            infoset.append(joinedtrans)
        else:
            # The translation at the front of the queue belongs to this gloss.
            if glossid == curtransid:
                if "[GLat]" in curtrans:
                    transtextlist = curtrans.split("[GLat]")
                    curtrans = "<em>".join(transtextlist)
                if "[/GLat]" in curtrans:
                    transtextlist = curtrans.split("[/GLat]")
                    curtrans = "</em>".join(transtextlist)
                infoset.append(curtrans)
                del pagestrans[0]
            # deal with page 587 where glosses 27, 28, and 29 share the one translation, numbered '27 – 29.'.
            elif " – " in curtransid:
                curtransidlist = curtransid.split(" – ")
                curtransidrange = [
                    int(curtransidlist[0]),
                    int(curtransidlist[1])
                ]
                idstart = curtransidrange[0]
                idstop = curtransidrange[1]
                # Expands the range into every gloss id it covers.
                curtransidlist = []
                for i in range(idstart, idstop + 1):
                    curtransidlist.append(str(i))
                # NOTE(review): a gloss id outside the range gets no translation column here,
                # and infoset[9] is read further down - confirm this cannot occur.
                if glossid in curtransidlist:
                    if "[GLat]" in curtrans:
                        transtextlist = curtrans.split("[GLat]")
                        curtrans = "<em>".join(transtextlist)
                    if "[/GLat]" in curtrans:
                        transtextlist = curtrans.split("[/GLat]")
                        curtrans = "</em>".join(transtextlist)
                    infoset.append(curtrans)
                    # The shared translation is only consumed by the last gloss in the range.
                    if glossid == curtransidlist[-1]:
                        del pagestrans[0]
            # if no translation is given in TPH
            else:
                infoset.append("No translation available.")
    # Gets all page Footnotes for the second time (for the translation)
    curpage = None
    for infoset in infolist[1:]:  # exclude the first info-set containing the titles
        thistransfns = []
        # ensures page footnotes are only generated once per page, and not for every gloss
        if curpage:
            if curpage != infoset[1]:
                curpage = infoset[1]
                footnotelist = order_footlist(file, curpage)
        elif not curpage:
            curpage = infoset[1]
            footnotelist = order_footlist(file, curpage)
        trans = infoset[9]
        # finds which translations have footnotes, looks for the associated footnote in the list generated above
        if "<sup>" in trans:
            superscriptpat = re.compile(r'<sup>\w</sup>')
            superscriptpatitir = superscriptpat.finditer(trans)
            for i in superscriptpatitir:
                # The character between <sup> and </sup> is the footnote id.
                fnid = i.group()[5]
                for footnote in footnotelist:
                    if footnote[0] == fnid:
                        # if the footnote is found and not already in the footnote list for the gloss it is added
                        if infoset[7]:
                            if clear_tags(footnote[:1] + ":" +
                                          footnote[1:]) not in infoset[7]:
                                thistransfns.append(
                                    clear_tags(footnote[:1] + ":" +
                                               footnote[1:]))
                        else:
                            thistransfns.append(
                                clear_tags(footnote[:1] + ":" + footnote[1:]))
        # all footnotes found for the gloss are combined
        # if there are translation footnotes
        if thistransfns:
            # if there are no gloss footnotes to add them to
            if not infoset[7]:
                infoset[7] = thistransfns
            # if there are gloss footnotes to add them to
            elif infoset[7]:
                for i in thistransfns:
                    infoset[7].append(i)
    return infolist
def get_allinfo(file, startpage, stoppage=None):
    """Returns an infolist containing multiple sub-lists.

       The first sublist contains the headers for an info-table.
       Subsequent lists contain, respectively, for a set page range:
       page no., folio, gloss no., gloss text, Latin lemma, Position of lemma in Latin text,
       Latin verse number (prefixed with an epistle name), Latin text, and a list of the
       footnotes referenced in the Latin text.

       file is passed unchanged to the page, footnote and Latin helpers.
       startpage and stoppage are both inclusive; when stoppage is omitted only
       startpage is processed."""
    if stoppage is None:
        stoppage = startpage
    infolist = [[
        "Page", "Folio", "Gloss No.", "Gloss Text", "Lemma", "Lemma Position",
        "Verse", "Glossed Latin", "Latin Footnotes"
    ]]
    for page in range(startpage, stoppage + 1):
        thispage = page
        # Gets all page Footnotes (for the Latin)
        footnotelist = order_footlist(file, page)
        # Collect folio information, one page at a time
        foliolist = []
        for folinfo in get_fol(
                order_glosses(
                    clear_tags(
                        "\n\n".join(
                            get_section(get_pages(file, thispage, thispage),
                                        "SG")), "fol"))):
            folio = folinfo[1]
            foliotext = folinfo[0]
            foliolist.append([folio, foliotext])
        # Gets gloss, glossed latin, lemma and lemma position for all glosses, one page at a time
        for sublist in get_latpageinfo(file, page):
            glosslistplus = [thispage]
            thisgloss = sublist[0]
            glossfound = False
            # For each folio on the page compares gloss from Latin list to the gloss in the folio list
            # If found, folio is identified for gloss
            for foltextlist in foliolist:
                if thisgloss in foltextlist[1]:
                    thisfolio = foltextlist[0]
                    glosslistplus.append(thisfolio)
                    glossfound = True
            # If gloss not found still, compares first eleven characters of gloss from Latin list to gloss in folio list
            # Only seems to affect f.2a 21 ([f. 2b]) marker bisects gloss in TPH
            # NOTE(review): if neither comparison matches, no folio is appended and every later
            # column of this row shifts one place to the left - confirm this cannot occur.
            if not glossfound:
                glossstub = thisgloss[:11]
                for foltextlist in foliolist:
                    if glossstub in foltextlist[1]:
                        thisfolio = foltextlist[0]
                        glosslistplus.append(thisfolio)
            # Gets each gloss on this page by finding its number
            # Returns gloss number and then the gloss in a list for each gloss
            # Adds each list to a gloss list for the page
            glossnopat = re.compile(r'(\d{1,2}[a-z]?, )?\d{1,2}[a-z]?\. ')
            glosspatitir = glossnopat.finditer(thisgloss)
            glosslist = []
            for i in glosspatitir:
                glosslist.extend([
                    i.group(),
                    thisgloss[thisgloss.find(i.group()) + len(i.group()):]
                ])
            # Identifies glossno, glosstext, latin text, and latin lemma from their lists (all already found)
            # Only the first number match and the text following it are used.
            glossno = glosslist[0]
            glosstext = glosslist[1]
            lempos = sublist[3]
            lemma = sublist[2]
            latin = sublist[1]
            rawfns = []
            fns = []
            # Footnote markers in the Latin arrive as <sup>a</sup> ... <sup>d</sup>.
            fnpat = re.compile(r'<sup>[a-d]</sup>')
            fnpatitir = fnpat.finditer(latin)
            for fn in fnpatitir:
                rawfns.append(fn.group())
            if rawfns:
                for rawfn in rawfns:
                    # The letter directly after "<sup>".
                    fnletter = rawfn[5:6]
                    for fnote in footnotelist:
                        fnoteid = fnote[:1]
                        if fnletter == fnoteid:
                            fns.append(clear_tags(fnote[:1] + ":" + fnote[1:]))
            # Identifies Latin Verse Numbers and Latin text for that verse
            # Adds '00. ' to the start of every latin line in the page's Latin list (for folios that start with no no.)
            # 'Rom. ' is removed later, but must be included here for regex to work
            versenopat = re.compile(
                r'00\. (\[NV\]|((Rom\. )?([IVX]{1,4}\. )?(\d{1,2}[a-z]?, )?(\d{1,2}[a-z]?\. )?))'
            )
            latpatitir = versenopat.finditer("00. " + latin)
            for i in latpatitir:
                # "0" marks a line with no verse number of its own.
                if i.group() == "00. ":
                    verseno = "0"
                elif i.group() == "00. [NV]":
                    verseno = "[NV]"
                else:
                    # Drops the "00. " prefix and the trailing ". ", then removes the verse
                    # number and its ". " from the start of the Latin text.
                    verseno = (i.group())[4:-2]
                    latin = latin[len(verseno) + 2:]
            # Adds glossno, glosstext, lemma, verseno, and Latin text to glosslistplus
            # Now glosslistplus is: pageno, folio, glossno, glosstext, lemma, lempos, verseno, and Latin text
            # Adds glosslistplus to infolist (which is returned once fixed)
            glosslistplus.extend([
                glossno[:glossno.rfind(".")], glosstext, lemma, lempos,
                verseno,
                clear_spectags(latin, ["NV"]), fns
            ])
            infolist.append(glosslistplus)
    # Fixes versenos in infolist (combines numberless verses, adds chapter to verses with verseno only)
    # Adds all versenos from infolist to versenofixlist
    # The header row's "Verse" entry is included, so positions line up with infolist.
    versenofixlist = []
    for info in infolist:
        versenofixlist.append(info[6])
    # Joins all versenos together
    numsearch = "".join(versenofixlist)
    romnumlist = []
    # Finds all chapter numerals throughout all the verse info, adds these to romnumlist
    numeralpat = re.compile(r'(\[NV\]|[IVX]{1,4}\. )')
    numpatitir = numeralpat.finditer(numsearch)
    for numfind in numpatitir:
        if numfind.group() not in romnumlist:
            romnumlist.append(numfind.group())
    # Goes through every verseno in the versenofixlist
    curverse = "0"
    poscount = 0
    curnum = ""
    for item in versenofixlist:
        # Removes 'Rom. ' from the numeral
        if "Rom. " in item:
            versenofixlist[poscount] = item[5:]
            item = versenofixlist[poscount]
        if item != "0":
            curverse = item
        # Replaces no-number verses not at epistle boundaries with the number of the previous verse
        elif item == "0":
            versenofixlist[poscount] = curverse
            item = versenofixlist[poscount]
        # Replaces no-number verses at epistle boundaries with a comment that no information is available about verse.
        if item == "[NV]":
            versenofixlist[
                poscount] = "- No Chapter or Verse Information Available"
            item = versenofixlist[poscount]
        # Updates the current chapter numeral when the verse begins with one of the numerals found above
        for romnum in romnumlist:
            if romnum in item:
                if item.find(romnum) == 0:
                    curnum = romnum
        # Combines chapter numerals to verse numbers that don't have them already
        if curnum not in item:
            if item != "- No Chapter or Verse Information Available":
                versenofixlist[poscount] = curnum + item
        poscount += 1
    fixcount = 0
    # The versenos in infolist are updated with the corrected forms from versenofixlist
    for wronglist in infolist:
        wronglist[6] = versenofixlist[fixcount]
        fixcount += 1
    # Adds Epistle Name to chapter and verse
    epnames = [
        "Rom.", "1 Cor.", "2 Cor.", "Gal.", "Eph.", "Phil.", "1 Thes.",
        "2 Thes.", "Col.", "1 Tim.", "2 Tim.", "Tit.", "Philem.", "Heb."
    ]
    eppages = [
        543, 591, 619, 631, 643, 654, 663, 669, 679, 690, 698, 703, 705, 713
    ]
    # spot indexes both lists; a row on a page below eppages[spot] gets epnames[spot].
    # NOTE(review): spot only advances when a row's page equals eppages[spot], so a row on a
    # later page reached without such a row gets no epistle name (e.g. a range starting after
    # page 543), and a row on page 713 would index past the end of epnames - confirm.
    spot = 0
    for item in infolist[1:]:
        if int(item[0]) < eppages[spot]:
            item[6] = epnames[spot] + " " + item[6]
        elif int(item[0]) == eppages[spot]:
            spot += 1
            item[6] = epnames[spot] + " " + item[6]
    return infolist
# Footnote tags ([a] ... [d], opening or closing) as they appear in the Latin text.
_LAT_FOOTNOTE_TAG_PAT = re.compile(r'\[/?[a-d]\]')
# Optional verse numbering; every part is optional, so empty matches are expected.
_LAT_VERSE_NUMBER_PAT = re.compile(
    r'(\[NV\]|((Rom\. )?([IVX]{1,4}\. )?(\d{1,2}[a-z]?, )?(\d{1,2}[a-z]?\. )?))'
)
# Gloss numbers such as "12. ", "3a. " or "10, 11. ".
_GLOSS_NUMBER_PAT = re.compile(r'(\d{1,2}[a-z]?, )?\d{1,2}[a-z]?\. ')


def _footnote_tags_to_superscript(text):
    """Return *text* with its [a]-[d] footnote tags turned into superscripts."""
    markers = _LAT_FOOTNOTE_TAG_PAT.findall(text)
    # Closing tags first: remove the matching opening tag, then replace the
    # closing tag itself with the superscripted footnote letter.
    for marker in markers:
        if "[/" in marker:
            text = "".join(text.split(marker[0] + marker[-2:]))
            supscr = "</em><sup>{}</sup><em>".format(marker[2:3])
            text = supscr.join(text.split(marker))
    # Any tag still present is an opening tag that had no closing partner.
    for marker in markers:
        if marker in text:
            supscr = "</em><sup>{}</sup><em>".format(marker[1:2])
            text = supscr.join(text.split(marker))
    return text


def _lemma_and_position(line, num):
    """Return the lemma preceding *num* in *line* and its position.

    The lemma is the last space-separated token before the gloss-number tag.
    The position is found with ``rfind`` in the text before the tag, after
    that text has had its tags cleared, the length of its last non-empty
    verse-number match cut from the front, and its footnote tags replaced by
    superscript markup.
    """
    linetext = line[:line.find(num)]
    lemma = linetext[linetext.rfind(" ") + 1:]
    if "[" in lemma:
        lemma = clear_tags(lemma)
    notagtext = clear_tags(linetext, ["let"])
    # Stays empty for text with no verse number (pages which begin
    # mid-Latin-line), in which case nothing is cut from the front.
    versenumber = ""
    for remnum in _LAT_VERSE_NUMBER_PAT.finditer(notagtext):
        if remnum.group() != "":
            versenumber = remnum.group()
    notagtext = _footnote_tags_to_superscript(notagtext[len(versenumber):])
    return lemma, notagtext.rfind(lemma)


def get_latpageinfo(file, page):
    """Return a list of gloss-lists for a specified page of TPH.

    Each gloss-list contains the gloss [0], the Latin line it is attached to
    [1], the lemma [2], and the lemma position [3].

    Args:
        file: Source handed unchanged to ``get_pages``.
        page: The page to process.

    Returns:
        One ``[gloss, latin, lemma, position]`` list per gloss number found
        in the page's gloss section, in the order the numbers occur.

    Raises:
        RuntimeError: If a gloss number has no matching tag in any Latin line
            of the page.
    """
    # The page is fetched once and shared by the Latin and gloss sections.
    pagetext = get_pages(file, page, page)
    latlines = order_latlist("\n\n".join(get_section(pagetext, "Lat")))
    sgtext = clear_tags("\n\n".join(get_section(pagetext, "SG")))
    eachgloss = order_glosslist(sgtext)
    glosses = order_glosses(sgtext)
    # Gets gloss numbers from the Irish text and converts them to match the
    # tags in the Latin text ("10, 11. " becomes "[10–11]").
    glossnums = []
    for match in _GLOSS_NUMBER_PAT.finditer(glosses):
        glossnum = match.group()[:-2]
        if ", " in glossnum:
            glossnum = "–".join(glossnum.split(", "))
        glossnums.append("[" + glossnum + "]")
    # A reversed copy of the Latin lines is searched for repeat occurrences
    # of a gloss number, so that two glosses with the same number do not both
    # land on the first Latin line carrying that number.
    backlist = list(latlines)
    backlist.reverse()
    latpergloss = []
    lemmata = []
    positions = []
    usednums = set()
    for num in glossnums:
        if num in usednums:
            searchlines = backlist
        else:
            usednums.add(num)
            searchlines = latlines
        line = next(
            (candidate for candidate in searchlines if num in candidate),
            None)
        if line is None:
            # Searching again could never succeed, so fail loudly instead.
            raise RuntimeError(
                f"Gloss number {num} not found in the Latin text of page {page}."
            )
        lemma, lempos = _lemma_and_position(line, num)
        latpergloss.append(line)
        lemmata.append(lemma)
        positions.append(lempos)
    # Compiles a list of the gloss, the Latin line, and the lemma for the
    # gloss within the Latin line.
    latininfolist = []
    for idx, latline in enumerate(latpergloss):
        latininfolist.append([
            eachgloss[idx],
            clear_tags(_footnote_tags_to_superscript(latline), ["NV"]),
            clear_tags(lemmata[idx]),
            positions[idx]
        ])
    return latininfolist