示例#1
0
def get_glinfobasic(file, startpage=499, stoppage=712):
    """Build a basic info-table of glosses for a range of pages.

    The first sublist of the returned list holds the table headers. Every
    following sublist describes one gloss found on pages ``startpage`` to
    ``stoppage`` (inclusive): Epistle, Page No., Folio, Gloss no. and tag-free
    Gloss Text.

    Args:
        file: Source passed straight through to ``get_pages``.
        startpage: First page to process.
        stoppage: Last page to process (inclusive).

    Returns:
        A list of lists, header row first.
    """
    # Compiled once: an optional "<no>, " prefix followed by "<no>. ".
    glossnopat = re.compile(r'(\d{1,2}[a-z]?, )?\d{1,2}[a-z]?\. ')
    curepist = "Unknown"
    infolist = [["Epistle", "Page", "Folio", "Gloss No.", "Gloss Text"]]
    for page in range(startpage, stoppage + 1):
        # The page is read once and reused for the epistle heading, the
        # gloss list and the folio list.
        pagetext = get_pages(file, page, page)
        epfunc = get_tagtext(pagetext, "H2")
        if epfunc:
            # An H2 heading on this page starts a new epistle; it stays
            # current until the next heading is met.
            curepist = epfunc[0]
        sgtext = "\n\n".join(get_section(pagetext, "SG"))
        glosslist = order_glosslist(clear_tags(sgtext))
        # Each get_fol entry is (text, folio); store it as [folio, text].
        foliolist = [
            [folinfo[1], folinfo[0]]
            for folinfo in get_fol(order_glosses(clear_tags(sgtext, "fol")))
        ]
        for gloss in glosslist:
            thisglosslist = [curepist, page]
            # First look for the whole gloss inside each folio's text.
            folios = [fol for fol, text in foliolist if gloss in text]
            if not folios:
                # Fall back to the first 11 characters of the gloss.
                glossstub = gloss[:11]
                folios = [fol for fol, text in foliolist if glossstub in text]
            if not folios:
                folios = ["No Folio Information Found"]
            # NOTE(review): every matching folio is appended, as before, so a
            # gloss matching several folios yields a longer row.
            thisglosslist.extend(folios)
            for i in glossnopat.finditer(gloss):
                # Gloss number without its trailing ". ", then the text that
                # follows the first occurrence of the number in the gloss.
                thisglosslist.extend([
                    i.group()[:-2],
                    gloss[gloss.find(i.group()) + len(i.group()):]
                ])
            infolist.append(thisglosslist)
    return infolist
def testsectext(sec, startpage, stoppage, file="Wurzburg Glosses"):
    """Collect the text of one section for every page in a range.

    Args:
        sec: Section tag handed to ``get_section``.
        startpage: First page to read.
        stoppage: Last page to read (inclusive).
        file: Source handed to ``get_pages``. Defaults to the previously
            hard-coded "Wurzburg Glosses", so existing callers are unaffected.

    Returns:
        A list of ``[page number as str, section text]`` lists, where the
        section text is the page's matching sections joined by blank lines.
    """
    return [
        [str(page), "\n\n".join(get_section(get_pages(file, page, page), sec))]
        for page in range(startpage, stoppage + 1)
    ]
示例#3
0
def list_numbered_glosses(file, startpage, stoppage):
    """List glosses by their folio ID and gloss number.

    Args:
        file: Source passed straight through to ``get_pages``.
        startpage: First page to process.
        stoppage: Last page to process (inclusive).

    Returns:
        A list of ``[identifier, cleaned gloss]`` lists. The identifier is the
        folio label from ``get_fol`` minus its first three characters,
        followed by the gloss number including its full stop.
    """
    # Compiled once instead of once per gloss/folio pair.
    numpat = re.compile(r'(\d{1,2}[a-z]?, )?\d{1,2}[a-z]?\. ')
    glist = []
    for p in range(startpage, stoppage + 1):
        # Read and join the page's "SG" sections once; both the folio list
        # and the gloss list are derived from the same text.
        sgtext = "\n\n".join(get_section(get_pages(file, p, p), "SG"))
        fcont = get_fol(order_glosses(clear_tags(sgtext, "fol")))
        for g in order_glosslist(sgtext):
            # The tag-free gloss does not depend on the folio being checked.
            raw_gloss = clear_tags(g)
            for fol in fcont:
                if raw_gloss in fol[0]:
                    for i in numpat.finditer(raw_gloss):
                        # [:-1] drops only the trailing space of the number.
                        glist.append(
                            [fol[1][3:] + i.group()[:-1],
                             cleangloss(g)])
    return glist
def get_transpageinfo(file, page):
    """Return the numbered translations found on one page.

    The page's "Eng" sections are joined, folio tags are removed with
    ``clear_spectags(..., "fol")``, and the text is split at every translation
    number (e.g. ``"12. "``, ``"3a. "`` or a range such as ``"27 – 29. "``).

    Args:
        file: Source passed straight through to ``get_pages``.
        page: Page whose translations are wanted.

    Returns:
        A list of ``[glossno, translation]`` lists. ``glossno`` is the number
        without its trailing full stop and space. Line breaks in the
        translation are replaced by spaces and the result is stripped.
        Footnote tags such as ``[a]`` become ``<sup>a</sup>``.
    """
    english = clear_spectags(
        "\n\n".join(get_section(get_pages(file, page, page), "Eng")), "fol")
    engpat = re.compile(r'(\d{1,2} – )?\d{1,2}[a-z]?\. ')
    fnpat = re.compile(r'\[(\w)\]')
    # Slice by match positions rather than searching for each number's text.
    # Text searches go wrong when the next number is contained in the current
    # one (e.g. "1. " inside "21. " on page 503), or when text precedes the
    # first number.
    markers = list(engpat.finditer(english))
    translist = []
    for idx, marker in enumerate(markers):
        # A translation runs from the end of its number to the start of the
        # next number, or to the end of the text for the last one.
        if idx + 1 < len(markers):
            stop = markers[idx + 1].start()
        else:
            stop = len(english)
        line = " ".join(english[marker.end():stop].split("\n")).strip()
        # Remove the full stop and space from the end of the number.
        number = marker.group()
        number = number[:number.rfind(".")]
        # Swap footnote tags for html superscript footnotes.
        translist.append([number, fnpat.sub(r'<sup>\1</sup>', line)])
    return translist
def order_footnotes(file, page):
    """Return the footnotes of the selected page as a single string.

    The page's "FN" sections are joined with blank lines between them.
    """
    pagetext = get_pages(file, page, page)
    return "\n\n".join(get_section(pagetext, "FN"))
示例#6
0
def get_glinfo(file, startpage=499, stoppage=712):
    """Build the full gloss info-table for a range of pages.

    The first sublist of the returned list holds the table headers. Every
    following sublist describes one gloss: Epistle, Page No., Folio, Gloss no.,
    the gloss with all tags, the tag-free gloss text (with [GLat][/GLat] tags
    converted to html emphasis tags), the gloss with footnote markers as
    superscripts, the relevant footnotes, personal notes, and the translation.

    Args:
        file: Source passed straight through to the page/section helpers.
        startpage: First page to process.
        stoppage: Last page to process (inclusive).

    Returns:
        A list of lists, header row first.

    Raises:
        RuntimeError: If a personal note does not start with exactly one
            gloss number.
    """
    # Patterns are loop-invariant, so they are compiled once up front.
    noteidpat = re.compile(r'\[/?f\. \d{1,2}[a-d]\]')
    notenumpat = re.compile(r'^\d{1,2}[a-z]?\. ')
    glossnopat = re.compile(r'(\d{1,2}[a-z]?, )?\d{1,2}[a-z]?\. ')
    footnotepat = re.compile(r'\[/?[a-z]\]')
    superscriptpat = re.compile(r'<sup>\w</sup>')
    notranslation = "No translation available."

    def _glat_to_em(text):
        # Replace Latin tags with html emphasis tags.
        return text.replace("[GLat]", "<em>").replace("[/GLat]", "</em>")

    curepist = "Unknown"
    infolist = [[
        "Epistle", "Page", "Folio", "Gloss No.", "Gloss Full-Tags",
        "Gloss Text", "Gloss Footnotes", "Relevant Footnotes",
        "Adrian's Notes", "Gloss Translation"
    ]]
    pagestrans = get_transpagesinfo(file, startpage, stoppage)
    for page in range(startpage, stoppage + 1):
        pagetext = get_pages(file, page, page)
        # Gets all page Footnotes for the first time (for the gloss)
        footnotelist = order_footlist(file, page)
        # Gets all personal notes (for the gloss)
        notelist = order_newlist(file, page)
        newnotelist = []
        notefol = False
        if notelist != ['']:
            for note in notelist:
                noteids = noteidpat.findall(note)
                if noteids:
                    # Strip every folio tag from the note; the first tag
                    # (without brackets and slash) becomes the current folio,
                    # which also applies to later notes that carry no tag.
                    for foltag in noteids:
                        note = note.replace(foltag, "")
                    notefol = "".join(ch for ch in noteids[0]
                                      if ch not in ("[", "]", "/"))
                notenums = notenumpat.findall(note)
                if not notenums:
                    raise RuntimeError(
                        f"Personal note found without link to gloss number.\nNote: {note}"
                    )
                if len(notenums) > 1 or not note.startswith(notenums[0]):
                    raise RuntimeError(
                        f"Multiple possible gloss numbers found for personal note.\nNote: {note}"
                    )
                glossnum = notenums[0][:-2]
                note = note[len(notenums[0]):].strip()
                newnotelist.append([notefol, glossnum, note])
        # Checks for a new epistle on the current page.
        epfunc = get_tagtext(pagetext, "H2")
        if epfunc:
            curepist = epfunc[0]
        # The page's "SG" text is built once and reused for glosses and folios.
        sgtext = "\n\n".join(get_section(pagetext, "SG"))
        # Identifies individual glosses on the current page.
        glosslist = order_glosslist(clear_spectags(sgtext, "fol"))
        # Creates a list of [folio, related gloss text] for the current page.
        foliolist = [[folinfo[1], folinfo[0]]
                     for folinfo in get_fol(order_glosses(sgtext))]
        for gloss in glosslist:
            thisglosslist = [curepist, page]
            # Checks which folio the gloss is in: whole gloss first, then its
            # first 11 characters, then a placeholder.
            folios = [fol for fol, text in foliolist if gloss in text]
            if not folios:
                glossstub = gloss[:11]
                folios = [fol for fol, text in foliolist if glossstub in text]
            if not folios:
                folios = ["No Folio Information Found"]
            # NOTE(review): every matching folio is appended, as before; the
            # fixed column positions used below assume exactly one match.
            thisglosslist.extend(folios)
            # Identifies gloss numbers and removes them from the gloss text.
            for i in glossnopat.finditer(gloss):
                # Adds gloss number to list.
                thisglosslist.append(i.group()[:-2])
                # Identifies foundational gloss including all markup tags.
                glossfulltags = gloss[gloss.find(i.group()) + len(i.group()):]
                # Display copy of the gloss text with html emphasis tags.
                glosstext = _glat_to_em(glossfulltags)
                # Two display copies: one primary, one retaining footnotes in
                # superscript tags.
                basegloss = clear_tags(glosstext)
                footnotesgloss = glosstext
                fnlist = [j.group() for j in footnotepat.finditer(footnotesgloss)]
                if not fnlist:
                    thisglosslist.extend([
                        glossfulltags, basegloss,
                        clear_tags(footnotesgloss), ""
                    ])
                else:
                    # NOTE(review): fnlist is shortened while it is being
                    # iterated; this is kept exactly as it was because the
                    # second loop below relies on the remaining entries.
                    for fntag in fnlist:
                        if "[/" in fntag:
                            endtag = fntag
                            begintag = "".join(endtag.split("/"))
                            # Drop the opening tag, then turn the first
                            # closing tag into a superscript letter.
                            footnotesgloss = "".join(
                                footnotesgloss.split(begintag))
                            tagplace = footnotesgloss.find(endtag)
                            footnotesgloss = footnotesgloss[:tagplace] + "<sup>" +\
                                             footnotesgloss[tagplace + 2: tagplace + 3] + "</sup>" +\
                                             footnotesgloss[tagplace + 4:]
                            if begintag in fnlist:
                                del fnlist[fnlist.index(begintag)]
                    glossfnlist = []
                    for footnote in fnlist:
                        fnletter = footnote[-2]
                        # Collects footnotes relevant to this gloss.
                        for fnote in footnotelist:
                            if fnletter == fnote[:1]:
                                glossfnlist.append(
                                    clear_tags(fnote[:1] + ":" + fnote[1:]))
                        fnsuperscript = "<sup>" + fnletter + "</sup>"
                        footnotesgloss = fnsuperscript.join(
                            footnotesgloss.split(footnote))
                    thisglosslist.extend([
                        glossfulltags, basegloss,
                        clear_tags(footnotesgloss), glossfnlist
                    ])
                if newnotelist:
                    for pagenote in newnotelist:
                        notefolio = pagenote[0]
                        notegloss = pagenote[1]
                        notetext = pagenote[2]
                        if notefolio == thisglosslist[
                                2] and notegloss == thisglosslist[3]:
                            if len(thisglosslist) == 8:
                                thisglosslist.extend([notetext])
                            elif len(thisglosslist) == 9:
                                # Only fill a note slot that is still empty.
                                if thisglosslist[8] == "":
                                    thisglosslist[8] = notetext
                        else:
                            if len(thisglosslist) == 8:
                                thisglosslist.extend([""])
                else:
                    thisglosslist.extend([""])
            infolist.append(thisglosslist)
    # add translations to the end of the info-lists where they are available
    for infoset in infolist[1:]:  # exclude the first info-set containing the titles
        glossid = infoset[3]
        # The translations are consumed front to back; once they run out the
        # remaining glosses simply have no translation.
        if pagestrans:
            curtransid = pagestrans[0][0]
            curtrans = pagestrans[0][1]
        else:
            curtransid = None
            curtrans = None
        # deal with the conjoined gloss on TPH p. 500 (1b10 + 1b11)
        # split the gloss id into the two numbers, use these to identify the two translations
        # conjoin the two translations and append them to the info-set for the conjoined gloss ids
        if ", " in glossid:
            splittranslations = []
            for newid in glossid.split(", "):
                if curtransid is not None and newid == curtransid:
                    splittranslations.append(curtrans)
                    del pagestrans[0]
                    if pagestrans:
                        curtransid = pagestrans[0][0]
                        curtrans = pagestrans[0][1]
                    else:
                        curtransid = None
                        curtrans = None
            infoset.append(_glat_to_em(" i.e. ".join(splittranslations)))
        elif curtransid is None:
            infoset.append(notranslation)
        elif glossid == curtransid:
            infoset.append(_glat_to_em(curtrans))
            del pagestrans[0]
        # deal with page 587 where glosses 27, 28, and 29 share the one translation, numbered '27 – 29.'.
        elif " – " in curtransid:
            bounds = curtransid.split(" – ")
            idrange = [
                str(n) for n in range(int(bounds[0]), int(bounds[1]) + 1)
            ]
            if glossid in idrange:
                infoset.append(_glat_to_em(curtrans))
            else:
                # Keep the translation column present for glosses that fall
                # outside the shared range.
                infoset.append(notranslation)
            if idrange and glossid == idrange[-1]:
                del pagestrans[0]
        # if no translation is given in TPH
        else:
            infoset.append(notranslation)
    # Gets all page Footnotes for the second time (for the translation)
    curpage = None
    for infoset in infolist[1:]:  # exclude the first info-set containing the titles
        thistransfns = []
        # ensures page footnotes are only generated once per page, and not for every gloss
        if curpage != infoset[1]:
            curpage = infoset[1]
            footnotelist = order_footlist(file, curpage)
        trans = infoset[9]
        # finds which translations have footnotes, looks for the associated footnote in the list generated above
        if "<sup>" in trans:
            for i in superscriptpat.finditer(trans):
                fnid = i.group()[5]
                for footnote in footnotelist:
                    # [:1] rather than [0] so an empty footnote entry is
                    # skipped instead of raising.
                    if footnote[:1] == fnid:
                        entry = clear_tags(footnote[:1] + ":" + footnote[1:])
                        # the footnote is added unless the gloss already lists it
                        if not infoset[7] or entry not in infoset[7]:
                            thistransfns.append(entry)
        # all footnotes found for the gloss are combined
        if thistransfns:
            if not infoset[7]:
                # there are no gloss footnotes to add them to
                infoset[7] = thistransfns
            else:
                infoset[7].extend(thistransfns)
    return infolist
def get_allinfo(file, startpage, stoppage=None):
    """Build an info-table linking each gloss to its Latin context.

    The first sublist of the returned list holds the table headers. Every
    following sublist holds, for one gloss in the page range: page no., folio,
    gloss no., gloss text, Latin lemma, position of the lemma in the Latin
    text, Latin verse number, the Latin text, and the Latin footnotes.

    Args:
        file: Source passed straight through to the page/section helpers.
        startpage: First page to process.
        stoppage: Last page to process (inclusive). Defaults to ``startpage``.

    Returns:
        A list of lists, header row first.
    """
    if stoppage is None:
        stoppage = startpage
    # Patterns are loop-invariant, so they are compiled once up front.
    glossnopat = re.compile(r'(\d{1,2}[a-z]?, )?\d{1,2}[a-z]?\. ')
    fnpat = re.compile(r'<sup>[a-d]</sup>')
    # 'Rom. ' is removed later, but must be included here for the regex to work
    versenopat = re.compile(
        r'00\. (\[NV\]|((Rom\. )?([IVX]{1,4}\. )?(\d{1,2}[a-z]?, )?(\d{1,2}[a-z]?\. )?))'
    )
    numeralpat = re.compile(r'(\[NV\]|[IVX]{1,4}\. )')
    infolist = [[
        "Page", "Folio", "Gloss No.", "Gloss Text", "Lemma", "Lemma Position",
        "Verse", "Glossed Latin", "Latin Footnotes"
    ]]
    for page in range(startpage, stoppage + 1):
        # Gets all page Footnotes (for the Latin)
        footnotelist = order_footlist(file, page)
        # Collect folio information, one page at a time
        sgtext = "\n\n".join(get_section(get_pages(file, page, page), "SG"))
        foliolist = [
            [folinfo[1], folinfo[0]]
            for folinfo in get_fol(order_glosses(clear_tags(sgtext, "fol")))
        ]
        # Gets gloss, glossed latin, lemma and lemma position for all glosses, one page at a time
        for sublist in get_latpageinfo(file, page):
            glosslistplus = [page]
            thisgloss = sublist[0]
            # For each folio on the page compares the gloss from the Latin
            # list to the gloss text in the folio list.
            folios = [fol for fol, text in foliolist if thisgloss in text]
            if not folios:
                # Compares only the first 11 characters of the gloss.
                # Only seems to affect f.2a 21 ([f. 2b]) marker bisects gloss in TPH
                glossstub = thisgloss[:11]
                folios = [fol for fol, text in foliolist if glossstub in text]
            if not folios:
                # Keep the folio column present so later columns do not shift.
                folios = ["No Folio Information Found"]
            # NOTE(review): every matching folio is appended, as before; the
            # fixed column positions used below assume exactly one match.
            glosslistplus.extend(folios)
            # Splits the gloss into its number and the text after the number.
            glosslist = []
            for i in glossnopat.finditer(thisgloss):
                glosslist.extend([
                    i.group(),
                    thisgloss[thisgloss.find(i.group()) + len(i.group()):]
                ])
            # Identifies glossno, glosstext, latin text, and latin lemma from their lists (all already found)
            glossno = glosslist[0]
            glosstext = glosslist[1]
            lempos = sublist[3]
            lemma = sublist[2]
            latin = sublist[1]
            # Collects the page footnotes referenced by superscripts in the Latin.
            fns = []
            for fn in fnpat.finditer(latin):
                fnletter = fn.group()[5:6]
                for fnote in footnotelist:
                    if fnletter == fnote[:1]:
                        fns.append(clear_tags(fnote[:1] + ":" + fnote[1:]))
            # Identifies Latin Verse Numbers and Latin text for that verse
            # '00. ' is prefixed to the Latin so lines with no number still match
            for i in versenopat.finditer("00. " + latin):
                if i.group() == "00. ":
                    verseno = "0"
                elif i.group() == "00. [NV]":
                    verseno = "[NV]"
                else:
                    verseno = (i.group())[4:-2]
                    latin = latin[len(verseno) + 2:]
            # Now glosslistplus is: pageno, folio, glossno, glosstext, lemma, lempos, verseno, Latin text, footnotes
            glosslistplus.extend([
                glossno[:glossno.rfind(".")], glosstext, lemma, lempos,
                verseno,
                clear_spectags(latin, ["NV"]), fns
            ])
            infolist.append(glosslistplus)
    # Fixes versenos in infolist (combines numberless verses, adds chapter to verses with verseno only)
    # Adds all versenos from infolist (header row included) to versenofixlist
    versenofixlist = [info[6] for info in infolist]
    # Finds all chapter numerals throughout all the verse info, adds these to romnumlist
    romnumlist = []
    for numfind in numeralpat.finditer("".join(versenofixlist)):
        if numfind.group() not in romnumlist:
            romnumlist.append(numfind.group())
    # Goes through every verseno in the versenofixlist
    curverse = "0"
    curnum = ""
    for poscount, item in enumerate(versenofixlist):
        # Removes 'Rom. ' from the numeral
        if "Rom. " in item:
            item = item[5:]
            versenofixlist[poscount] = item
        if item != "0":
            curverse = item
        else:
            # Replaces no-number verses with the number of the previous verse
            item = curverse
            versenofixlist[poscount] = item
        # Replaces no-number verses at epistle boundaries with a comment that no information is available about verse.
        if item == "[NV]":
            item = "- No Chapter or Verse Information Available"
            versenofixlist[poscount] = item
        # Updates the current chapter numeral when the verse starts with one
        for romnum in romnumlist:
            if item.startswith(romnum):
                curnum = romnum
        # Combines chapter numerals to verse numbers that don't have them already
        if curnum not in item:
            if item != "- No Chapter or Verse Information Available":
                versenofixlist[poscount] = curnum + item
    # The versenos in infolist are updated with the corrected forms from versenofixlist
    for wronglist, fixedverse in zip(infolist, versenofixlist):
        wronglist[6] = fixedverse
    # Adds Epistle Name to chapter and verse
    epnames = [
        "Rom.", "1 Cor.", "2 Cor.", "Gal.", "Eph.", "Phil.", "1 Thes.",
        "2 Thes.", "Col.", "1 Tim.", "2 Tim.", "Tit.", "Philem.", "Heb."
    ]
    # eppages[k] is the first page that no longer belongs to epnames[k].
    eppages = [
        543, 591, 619, 631, 643, 654, 663, 669, 679, 690, 698, 703, 705, 713
    ]
    spot = 0
    for item in infolist[1:]:
        pageno = int(item[0])
        # Skip every boundary at or before this page. This also copes with
        # ranges that start mid-way and boundary pages that produced no rows.
        while spot < len(eppages) and pageno >= eppages[spot]:
            spot += 1
        # Pages at or past the last boundary belong to no listed epistle.
        if spot < len(epnames):
            item[6] = epnames[spot] + " " + item[6]
    return infolist
示例#8
0
def scribe_split(glossfile, startpage=499, stoppage=712):
    """Separate the glosses of a page range by scribal hand.

    Prima manus glosses are identified first, through footnotes containing
    "prima" that are referenced from the gloss. The remaining glosses are
    split at the first gloss containing the "[f. 33a]" marker: glosses before
    it go to hand two, and that gloss and all later ones go to hand three.

    Args:
        glossfile: Source passed straight through to the page helpers.
        startpage: First page passed to ``get_pageinfo``.
        stoppage: Last page passed to ``get_pageinfo``.

    Returns:
        ``[allglosses, primanlist, handiilist, handiiilist]``. Each list
        starts with a title string followed by its glosses.
    """
    glossfnpat = re.compile(r'\[[a-z]\]')
    # For every page: its number and the individual glosses of its first
    # "SG" section, parsed once and reused by all three passes below.
    pageglosses = []
    for page in get_pageinfo(glossfile, startpage, stoppage):
        pageno = page[0]
        irish = get_section(get_pages(glossfile, pageno, pageno), "SG")[0]
        pageglosses.append((pageno, list(order_glosslist(irish))))
    # adds all glosses to a single list
    allglosses = ['All Glosses']
    for _, glosslist in pageglosses:
        allglosses.extend(glosslist)
    # adds prima manus glosses to a prima manus list
    primanlist = ['Prima Manus']
    # Mirror of primanlist used only for fast membership tests.
    primaseen = set(primanlist)
    for pageno, glosslist in pageglosses:
        footnotes = order_footlist(glossfile, pageno)
        for curgloss in glosslist:
            # find footnote markers in each individual gloss
            for marker in glossfnpat.finditer(curgloss):
                let = marker.group()[1:-1]
                for fn in footnotes:
                    # Find the footnote associated with the gloss; if it
                    # mentions "prima" the gloss goes on the prima list.
                    # [:1] rather than [0] so an empty entry is skipped.
                    if (fn[:1] == let and "prima" in fn
                            and curgloss not in primaseen):
                        primanlist.append(curgloss)
                        primaseen.add(curgloss)
    # adds remaining glosses to separate lists for hands 2 and 3
    handiilist = ['Hand Two']
    handiiilist = ['Hand Three']
    handtwo = True
    for _, glosslist in pageglosses:
        for curgloss in glosslist:
            if "[f. 33a]" in curgloss:
                handtwo = False
            if curgloss in primaseen:
                continue
            if handtwo:
                handiilist.append(curgloss)
            else:
                handiiilist.append(curgloss)
    return [allglosses, primanlist, handiilist, handiiilist]
# Gloss numbering as it appears in the gloss text, e.g. "5. " or "4, 5. ".
_LAT_GLOSSNUM_PAT = re.compile(r'(\d{1,2}[a-z]?, )?\d{1,2}[a-z]?\. ')
# "[NV]" or a (possibly empty) run of chapter/verse numbering such as "Rom. IV. 3, 4. ".
_LAT_REMNUM_PAT = re.compile(
    r'(\[NV\]|((Rom\. )?([IVX]{1,4}\. )?(\d{1,2}[a-z]?, )?(\d{1,2}[a-z]?\. )?))'
)
# Opening ("[a]") and closing ("[/a]") footnote markers, letters a-d only.
_LAT_FN_MARKER_PAT = re.compile(r'\[/?[a-d]\]')


def _superscript_latin_footnotes(text):
    """Return text with its [a]-[d] footnote markers turned into HTML superscripts.

    Where a closing marker such as "[/a]" occurs, every matching opening marker
    "[a]" is deleted and the closing marker becomes "</em><sup>a</sup><em>".
    Any marker still present afterwards (an opening marker with no closing
    partner) is replaced by the same superscript markup for its letter.
    """
    markers = _LAT_FN_MARKER_PAT.findall(text)
    # Pass 1: paired markers - drop the opener, superscript the closer.
    for marker in markers:
        if "[/" in marker:
            text = text.replace(marker[0] + marker[-2:], "")
            text = text.replace(
                marker, "</em><sup>{}</sup><em>".format(marker[2:3]))
    # Pass 2: whatever markers survived pass 1 are superscripted in place.
    for marker in markers:
        if marker in text:
            text = text.replace(
                marker, "</em><sup>{}</sup><em>".format(marker[1:2]))
    return text


def _latin_lemma_and_position(line, num):
    """Return (lemma, position) for the gloss tag num found in the Latin line.

    The lemma is the last space-delimited word of the text preceding the first
    occurrence of num. The position is the result of str.rfind for that lemma
    in the preceding text after tag clearing, numbering removal and footnote
    superscripting, so it is -1 when the lemma cannot be located there.
    """
    preceding = line[:line.find(num)]
    lemma = preceding[preceding.rfind(" ") + 1:]
    if "[" in lemma:
        lemma = clear_tags(lemma)
    # The ["let"] argument presumably keeps the letter (footnote) tags so the
    # marker conversion below can still see them - TODO confirm in clear_tags.
    plaintext = clear_tags(preceding, ["let"])
    # Defaults to "" for pages which begin mid-Latin-line (nothing to strip).
    numbering = ""
    # NOTE(review): the length of the LAST non-empty match is cut from the
    # START of the text. This mirrors the existing behaviour; confirm it is
    # intended for lines that contain more than one numbering match.
    for match in _LAT_REMNUM_PAT.finditer(plaintext):
        if match.group() != "":
            numbering = match.group()
    plaintext = _superscript_latin_footnotes(plaintext[len(numbering):])
    return lemma, plaintext.rfind(lemma)


def get_latpageinfo(file, page):
    """Return a list of gloss-lists for a specified page of TPH.

    Each gloss-list contains the gloss [0], the Latin line carrying the gloss
    number [1], the lemma [2], and the lemma position [3].

    Args:
        file: passed unchanged to get_pages.
        page: the page number; used as both the first and the last page
            requested from get_pages.

    Returns:
        One four-item list per gloss number found in the page's gloss text, in
        order of appearance. If a gloss number is not present in any Latin
        line, its entry is [gloss, "", "", -1] (empty Latin line and lemma,
        position -1) rather than the search looping forever.

    Raises:
        IndexError: if more gloss numbers are found than order_glosslist
            returned glosses.
    """
    # Fetch the page and clean the gloss section once; both are reused below.
    pagetext = get_pages(file, page, page)
    latlines = order_latlist("\n\n".join(get_section(pagetext, "Lat")))
    glosstext = clear_tags("\n\n".join(get_section(pagetext, "SG")))
    eachgloss = order_glosslist(glosstext)
    glosses = order_glosses(glosstext)
    # Gets gloss numbers from the Irish text and converts them to match the
    # tags in the Latin text, e.g. "4, 5. " becomes "[4–5]".
    glossnums = []
    for match in _LAT_GLOSSNUM_PAT.finditer(glosses):
        glossnum = match.group()[:-2]
        if ", " in glossnum:
            glossnum = "–".join(glossnum.split(", "))
        glossnums.append("[" + glossnum + "]")
    # A reversed copy of latlines is searched on repeat occurrences of a gloss
    # number. This prevents two glosses with the same number interacting with
    # each other's Latin lines.
    backlist = list(latlines)
    backlist.reverse()
    usednums = set()
    latininfolist = []
    for i, num in enumerate(glossnums):
        if num in usednums:
            # Not the first instance of this glossno on this page.
            searchlines = backlist
        else:
            # The first instance of this glossno on this page.
            usednums.add(num)
            searchlines = latlines
        latline = next((line for line in searchlines if num in line), None)
        if latline is None:
            # No Latin line carries this gloss number: emit a placeholder entry
            # so the output stays aligned with the glosses.
            latininfolist.append([eachgloss[i], "", "", -1])
            continue
        lemma, lempos = _latin_lemma_and_position(latline, num)
        latininfolist.append([
            eachgloss[i],
            clear_tags(_superscript_latin_footnotes(latline), ["NV"]),
            clear_tags(lemma),
            lempos,
        ])
    return latininfolist