Example #1
def generatesortedoutputbyword(completeindexdict: dict, onework: bool,
                               alphabetical: bool) -> List[tuple]:
    """

	the simple case: just spew the index out alphabetically

	:param completeindexdict:
	:param onework:
	:param alphabetical:
	:return:
	"""

    unsortedoutput = htmlifysimpleindex(completeindexdict, onework)
    if alphabetical:
        sortkeys = [x[0] for x in unsortedoutput]
        outputdict = {x[0]: x for x in unsortedoutput}
        sortkeys = polytonicsort(sortkeys)
        sortedoutput = [outputdict[x] for x in sortkeys]
    else:
        sortedoutput = sorted(unsortedoutput,
                              key=lambda x: int(x[1]),
                              reverse=True)
    # pad position 0 with a fake, unused headword so that these tuples
    # have the same shape as the ones in the other branch of the condition
    sortedoutput = [(s[0], s[0], s[1], s[2], False) for s in sortedoutput]

    return sortedoutput
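
The alphabetical branch above is a decorate-sort-reindex pattern: extract the sort keys, sort them separately, then rebuild the tuple list in key order. Here is a minimal self-contained sketch of the same pattern, with the builtin sorted() standing in for polytonicsort(), whose implementation is not part of these excerpts:

# toy (word, count, loci) tuples; sorted() stands in for polytonicsort()
unsortedoutput = [('gamma', '3', 'c'), ('alpha', '5', 'a'), ('beta', '1', 'b')]
sortkeys = [x[0] for x in unsortedoutput]
outputdict = {x[0]: x for x in unsortedoutput}
sortedoutput = [outputdict[k] for k in sorted(sortkeys)]
print(sortedoutput)
# [('alpha', '5', 'a'), ('beta', '1', 'b'), ('gamma', '3', 'c')]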
Example #2
def offerlemmatahints(query: str) -> list:
    """

    fill in the hint box with eligible values

    since there are a crazy number of words, don't offer hints until the
    query is more than one character long

    :param query: the (partial) word typed so far
    :return: a list of {'value': word} hint entries (truncated at 50)
    """

    hintlist = list()

    invals = 'jvσς'
    outvals = 'iuϲϲ'

    if len(query) > 1:
        query = stripaccents(query)
        qlen = len(query)
        bag = query[0:2]
        key = stripaccents(bag.translate(str.maketrans(invals, outvals)))
        try:
            wordlist = keyedlemmata[key]
        except KeyError:
            wordlist = list()

        wordlist = polytonicsort(wordlist)

        if qlen > 2:
            # this was always true when the guard above read 'len(term) > 2';
            # with 'len(query) > 1' the else branch handles two-character queries
            q = key + query[2:]
        else:
            q = key
        hintlist = [{'value': w} for w in wordlist
                    if q == stripaccents(w[0:qlen])]

    if len(hintlist) > 50:
        hintlist = hintlist[0:50]
        hintlist = ['(>50 items: list was truncated)'] + hintlist

    return hintlist
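
The two-character 'bag' is normalized before it is used as a bucket key into keyedlemmata: j and v fold to i and u, both sigma shapes fold to lunate sigma, and then the accents are stripped. stripaccents() is not shown in these excerpts; the unicodedata-based stand-in below is an assumption about its behavior, but it makes the whole normalization step runnable:

import unicodedata

invals, outvals = 'jvσς', 'iuϲϲ'
table = str.maketrans(invals, outvals)

def stripaccents_sketch(text: str) -> str:
    # decompose, then drop combining marks (accents, breathings, etc.)
    decomposed = unicodedata.normalize('NFD', text)
    return ''.join(c for c in decomposed if not unicodedata.combining(c))

print(stripaccents_sketch('ἄνθρωποσ'.translate(table)))  # ανθρωποϲ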
Example #3
    def generatequotesummary(self, lemmaobject=None) -> List[str]:
        """

        harvest the quotes from a dictionary entry body; if the session
        asks for a summary, collapse them into a count instead

        :param lemmaobject: its formlist lets you drop 'quotes' that are
            really just forms of the word
        :return: the quotes, or a one-item count summary
        """
        qfinder = re.compile(
            r'<span class="dictquote dictlang_\w+">(.*?)</span>')
        quotelist = re.findall(qfinder, self.body)

        # many of the 'quotes' are really just forms of the word
        # trim these
        if lemmaobject:
            morphologylist = lemmaobject.formlist
        else:
            morphologylist = list()

        quotelist = [x for x in quotelist if x not in morphologylist]
        quotelist = polytonicsort(quotelist)

        if session['quotesummary']:
            qq = len(quotelist)
            if qq != 1:
                quotelist = ['{n} quotes'.format(n=qq)]
            else:
                quotelist = ['1 quote']

        return quotelist
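
The regex does the real work here: it harvests whatever sits inside the dictionary's quote spans. A self-contained demonstration against an invented fragment of entry HTML whose class names follow the pattern the method expects:

import re

qfinder = re.compile(r'<span class="dictquote dictlang_\w+">(.*?)</span>')
body = ('<span class="dictquote dictlang_gr">μῆνιν ἄειδε</span> and '
        '<span class="dictquote dictlang_la">arma uirumque</span>')
print(re.findall(qfinder, body))
# ['μῆνιν ἄειδε', 'arma uirumque']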
Example #4
def generatesortedoutputbyheadword(completeindexdict: dict, onework: bool,
                                   alphabetical: bool,
                                   activepoll) -> List[tuple]:
    """

	arrange the index by headword

	:return:
	"""

    # [a] find the morphologyobjects needed
    remaining = len(completeindexdict)
    activepoll.statusis('Finding headwords for entries')
    activepoll.setnotes('({r} entries found)'.format(r=remaining))

    morphobjects = getrequiredmorphobjects(completeindexdict.keys())

    activepoll.statusis('Assigning headwords to entries')
    remaining = len(completeindexdict)
    activepoll.setnotes('({bf} baseforms found)'.format(bf=remaining))
    activepoll.allworkis(remaining)

    # [b] find the baseforms
    augmentedindexdict = findindexbaseforms(completeindexdict, morphobjects,
                                            activepoll)

    # sample items in an augmentedindexdict
    # erant {'baseforms': 'sum¹', 'homonyms': None, 'loci': [('lt2300w001', 5, '1.4')]}
    # regis {'baseforms': ['rex', 'rego (to keep straight)'], 'homonyms': 2, 'loci': [('lt2300w001', 7, '1.6')]}
    # qui {'baseforms': ['qui¹', 'quis²', 'quis¹', 'qui²'], 'homonyms': 4, 'loci': [('lt2300w001', 7, '1.6'), ('lt2300w001', 4, '1.3')]}

    # [c] remap under the headwords you found

    activepoll.statusis('Remapping entries')
    activepoll.allworkis(-1)

    if hipparchia.config['DELETEUNACCENTEDGREEKFROMINDEX']:
        hasaccent = re.compile(
            r'[ἀἁἂἃἄἅἆἇᾀᾁᾂᾃᾄᾅᾆᾇᾲᾳᾴᾶᾷᾰᾱὰάἐἑἒἓἔἕὲέἰἱἲἳἴἵἶἷὶίῐῑῒΐῖῗὀὁὂὃὄὅόὸὐὑὒὓὔὕὖὗϋῠῡῢΰῦῧύὺᾐᾑᾒᾓᾔᾕᾖᾗῂῃῄῆῇἤἢἥἣὴήἠἡἦἧὠὡὢὣὤὥὦὧᾠᾡᾢᾣᾤᾥᾦᾧῲῳῴῶῷώὼ]'
        )
        accenteddict = {
            k: augmentedindexdict[k]
            for k in augmentedindexdict.keys() if re.search(hasaccent, k)
        }
        latindict = {
            k: augmentedindexdict[k]
            for k in augmentedindexdict.keys() if re.search(r'[a-z]', k)
        }
        augmentedindexdict = {**accenteddict, **latindict}
        if hipparchia.config['DROPLATININAGREEKINDEX']:
            if len(latindict.keys()) * 5 < len(accenteddict.keys()):
                augmentedindexdict = accenteddict

    if session['indexskipsknownwords']:
        # 'baseforms' is either False or a list
        augmentedindexdict = {
            k: augmentedindexdict[k]
            for k in augmentedindexdict.keys()
            if not augmentedindexdict[k]['baseforms']
        }

    headwordindexdict = generateheadwordindexdict(augmentedindexdict)

    # [d] format and arrange the output

    if not alphabetical:
        sorter = list()
        for wd in headwordindexdict:
            forms = headwordindexdict[wd]
            allhits = sum([len(forms[f]) for f in forms])
            sorter.append((allhits, wd))
        sorter = sorted(sorter, reverse=True)
        sortedheadwordindexdictkeys = [
            s[1] for s in sorter if s[1] != '•••unparsed•••'
        ]
        sortedheadwordindexdictkeys.append('•••unparsed•••')
    else:
        sortedheadwordindexdictkeys = polytonicsort(headwordindexdict.keys())

    htmlindexdict = dict()
    sortedoutput = list()
    for headword in sortedheadwordindexdictkeys:
        # normalize the Latin orthography: v -> u and j -> i
        hw = re.sub('v', 'u', headword)
        hw = re.sub('j', 'i', hw)
        sortedoutput.append(('&nbsp;', '', '', '', False))
        if len(headwordindexdict[headword].keys()) > 1:
            formcount = 0
            homonymcount = 0
            for form in headwordindexdict[headword].keys():
                formcount += len(headwordindexdict[headword][form])
                homonymcount += len(
                    [x for x in headwordindexdict[headword][form] if x[3]])
            if formcount > 1 and homonymcount > 0:
                sortedoutput.append((hw, '({fc} / {hc})'.format(
                    fc=formcount, hc=homonymcount), '', '', False))
            elif formcount > 1:
                sortedoutput.append(
                    (hw, '({fc})'.format(fc=formcount), '', '', False))

        if alphabetical:
            sortedforms = polytonicsort(headwordindexdict[headword].keys())
        else:
            sortedforms = sorted(
                headwordindexdict[headword].keys(),
                key=lambda x: len(headwordindexdict[headword][x]),
                reverse=True)

        for form in sortedforms:
            hits = sorted(headwordindexdict[headword][form])
            isahomonym = hits[0][3]
            if onework:
                hits = [h[2] for h in hits]
                loci = ', '.join(hits)
            else:
                previouswork = hits[0][0]
                loci = '<span class="work">{wk}</span>: '.format(
                    wk=previouswork[6:10])
                for hit in hits:
                    if hit[0] == previouswork:
                        loci += hit[2] + ', '
                    else:
                        loci = loci[:-2] + '; '
                        previouswork = hit[0]
                        loci += '<span class="work">{wk}</span>: '.format(
                            wk=previouswork[6:10])
                        loci += hit[2] + ', '
                loci = loci[:-2]
            htmlindexdict[headword] = loci
            sortedoutput.append(
                (hw, form, len(hits), htmlindexdict[headword], isahomonym))

    return sortedoutput
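
The fiddliest part of the formatting is the multi-work loci string: the sorted hits are tuples whose first element is a work id and whose third is a citation, and each change of work closes the current comma-separated run with a semicolon and opens a new work label. An isolated, lightly condensed version of that loop, run against invented ids of the same 'lt2300w001' shape:

hits = [('lt2300w001', 4, '1.3'), ('lt2300w001', 7, '1.6'),
        ('lt2300w002', 2, '2.1')]
previouswork = hits[0][0]
loci = '<span class="work">{wk}</span>: '.format(wk=previouswork[6:10])
for hit in hits:
    if hit[0] != previouswork:
        # close the previous run and open a label for the new work
        loci = loci[:-2] + '; '
        previouswork = hit[0]
        loci += '<span class="work">{wk}</span>: '.format(wk=previouswork[6:10])
    loci += hit[2] + ', '
loci = loci[:-2]
print(loci)
# <span class="work">w001</span>: 1.3, 1.6; <span class="work">w002</span>: 2.1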
Example #5
def generatevocabfor(searchid: str,
                     author: str,
                     work=None,
                     passage=None,
                     endpoint=None,
                     citationdelimiter='|') -> JSON_STR:
    """

	given a text span
		figure out what words are used by this span
		then provide a vocabulary list from that list

	ex:
		http://localhost:5000/vocabularyfor/SEARCHID/lt0631/001/1/20

	this is a lot like building an index so we just leverage buildindexto() but pull away from it after the initial
	bit where we establish endpoints and get ready to gather the lines

	:param searchid:
	:param author:
	:param work:
	:param passage:
	:param endpoint:
	:param citationdelimiter:
	:return:
	"""

    starttime = time.time()
    segmenttext = str()

    dbconnection = ConnectionObject('autocommit')
    dbcursor = dbconnection.cursor()

    justvocab = True

    cdict = buildindexto(searchid, author, work, passage, endpoint,
                         citationdelimiter, justvocab)
    lineobjects = grabbundlesoflines(cdict, dbcursor)

    allwords = [l.wordset() for l in lineobjects]
    allwords = set(flattenlistoflists(allwords))

    morphobjects = getrequiredmorphobjects(allwords)
    # 'dominatio': <server.hipparchiaobjects.dbtextobjects.dbMorphologyObject object at 0x14ab92d68>, ...

    baseformsmorphobjects = list()
    for m in morphobjects:
        try:
            baseformsmorphobjects.extend(morphobjects[m].getpossible())
        except AttributeError:
            # 'NoneType' object has no attribute 'getpossible'
            pass

    vocabset = {
        '{w} ~~~ {t}'.format(w=b.getbaseform(), t=b.gettranslation())
        for b in baseformsmorphobjects if b.gettranslation()
    }
    vocabset = {
        v.split(' ~~~ ')[0]: v.split(' ~~~ ')[1].strip()
        for v in vocabset
    }
    vocabset = {v: vocabset[v] for v in vocabset if vocabset[v]}

    # the following can be in entries and will cause problems...:
    #   <tr opt="n">which had become milder</tr>

    vocabset = {
        v: re.sub(r'<(|/)tr.*?>', str(), vocabset[v])
        for v in vocabset
    }

    # now you have { word1: definition1, word2: definition2, ...}

    vocabcounter = [
        b.getbaseform() for b in baseformsmorphobjects if b.gettranslation()
    ]
    vocabcount = dict()
    for v in vocabcounter:
        try:
            vocabcount[v] += 1
        except KeyError:
            vocabcount[v] = 1

    po = IndexmakerInputParsingObject(author, work, passage, endpoint,
                                      citationdelimiter)

    ao = po.authorobject
    wo = po.workobject
    psg = po.passageaslist
    stop = po.endpointlist

    tableheadtemplate = """
	<tr>
		<th class="vocabtable">word</th>
		<th class="vocabtable">count</th>
		<th class="vocabtable">definitions</th>
	</tr>
	"""

    tablerowtemplate = """
	<tr>
		<td class="word"><vocabobserved id="{w}">{w}</vocabobserved></td>
		<td class="count">{c}</td>
		<td class="trans">{t}</td>
	</tr>
	"""

    tablehtml = """
	<table>
		{head}
		{rows}
	</table>
	"""

    # byfrequency is hard-coded for now; the else branch below is the
    # (currently unreachable) frequency-sorted alternative
    byfrequency = False
    if not byfrequency:
        rowhtml = [
            tablerowtemplate.format(w=k, t=vocabset[k], c=vocabcount[k])
            for k in polytonicsort(vocabset.keys())
        ]
    else:
        vc = [(vocabcount[v], v) for v in vocabcount]
        vc.sort(reverse=True)
        vk = [v[1] for v in vc]
        vk = [v for v in vk if v in vocabset]
        rowhtml = [
            tablerowtemplate.format(w=k, t=vocabset[k], c=vocabcount[k])
            for k in vk
        ]

    wordsfound = len(rowhtml)
    rowhtml = '\n'.join(rowhtml)

    vocabhtml = tablehtml.format(head=tableheadtemplate, rows=rowhtml)

    if not ao:
        ao = makeanemptyauthor('gr0000')

    buildtime = time.time() - starttime
    buildtime = round(buildtime, 2)

    if not stop:
        segmenttext = '.'.join(psg)

    results = dict()
    results['authorname'] = avoidsmallvariants(ao.shortname)
    results['title'] = avoidsmallvariants(wo.title)
    results['structure'] = avoidsmallvariants(wo.citation())
    results['worksegment'] = segmenttext
    results['elapsed'] = buildtime
    results['wordsfound'] = wordsfound
    results['texthtml'] = vocabhtml
    results['keytoworks'] = str()
    results['newjs'] = supplementalvocablistjs()
    results = json.dumps(results)

    return results
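
Two small patterns in this function deserve isolating. The try/except tallying that builds vocabcount is a hand-rolled counter, equivalent to collections.Counter, and the ' ~~~ ' join/split dance deduplicates (baseform, translation) pairs by passing them through a set of strings. A minimal sketch with invented baseforms and glosses:

from collections import Counter

baseforms = ['sum¹', 'rex', 'sum¹', 'qui¹', 'rex', 'sum¹']
vocabcount = dict(Counter(baseforms))
print(vocabcount)  # {'sum¹': 3, 'rex': 2, 'qui¹': 1}

pairs = [('rex', 'king'), ('rex', 'king'), ('sum¹', 'to be')]
vocabset = {'{w} ~~~ {t}'.format(w=w, t=t) for w, t in pairs}
vocabset = {v.split(' ~~~ ')[0]: v.split(' ~~~ ')[1].strip() for v in vocabset}
print(vocabset)  # {'rex': 'king', 'sum¹': 'to be'} (order may vary)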