def findsetofallwords(listofwordclusters: list) -> set:
    """
    get ready to vectorize by splitting and cleaning a set of lines or sentences

    :param listofwordclusters: list of strings (lines/sentences)
    :return: set of cleaned, de-accented words with punctuation stripped
    """

    # split every cluster on spaces and flatten into one long word list
    flattened = list()
    for cluster in listofwordclusters:
        flattened.extend(cluster.split(' '))

    transtable = buildhipparchiatranstable()

    # greek words lose only their grave accents; everything else is fully de-accented
    cleaned = list()
    for w in flattened:
        if re.search(minimumgreek, w):
            cleaned.append(removegravity(w))
        else:
            cleaned.append(stripaccents(w, transtable))

    # zap punctuation, then drop any empties produced along the way
    punct = re.compile('[{s}]'.format(s=re.escape(punctuation + extrapunct)))
    cleaned = [re.sub(punct, str(), w) for w in cleaned]

    return set(cleaned) - {''}
def polytonicsort(unsortedwords: list) -> list:
    """
    sort() compares codepoints, but α and ά and ᾶ need not have neighboring numerical values;
    stripping diacriticals can help, but then distinct words collide, so:

        [a] prefix each word with an unaccented copy of itself
        [b] substitute sigmas for lunate sigmas in the prefix (lunate sorts after omega):
            θαλαττησ-snip-θαλάττηϲ
        [c] sort the augmented strings (accents only matter as a late tie-break)
        [d] strip the augment off again
        [e] return

    :param unsortedwords: list of (possibly accented) words
    :return: the words sorted polytonically; falsy entries are dropped
    """

    transtable = buildhipparchiatranstable()

    # [a] + [b]: build 'unaccented-snip-original' composites
    augmented = [
        stripaccents(w, transtable).replace('ϲ', 'σ') + '-snip-' + w
        for w in unsortedwords if w
    ]

    # [c]
    augmented.sort()

    # [d]: drop everything through the '-snip-' marker
    snipper = re.compile(r'(.*?)(-snip-)(.*?)')

    return [re.sub(snipper, r'\3', w) for w in augmented]
示例#3
0
def bulkfindwordcounts(listofwords: List[str]) -> List[dbWordCountObject]:
    """
    grab the wordcount rows for a batch of observed forms via a temp table

    note that the lists of words should all start with the same letter since the wordcount tables are letter-keyed

    hipparchiaDB=# CREATE TEMP TABLE bulkcounter_51807f8bbe08 AS SELECT values AS  entriestocheck FROM unnest(ARRAY['κατακλειούϲηϲ', 'κατακλῇϲαι', 'κατακλεῖϲαι']) values;

    hipparchiaDB=# SELECT * FROM wordcounts_κ WHERE EXISTS (SELECT 1 FROM bulkcounter_51807f8bbe08 tocheck WHERE tocheck.entriestocheck = wordcounts_κ.entry_name);
      entry_name   | total_count | gr_count | lt_count | dp_count | in_count | ch_count
    ---------------+-------------+----------+----------+----------+----------+----------
     κατακλεῖϲαι   |          31 |       30 |        0 |        0 |        1 |        0
     κατακλειούϲηϲ |           3 |        3 |        0 |        0 |        0 |        0
     κατακλῇϲαι    |           1 |        1 |        0 |        0 |        0 |        0
    (3 rows)

    :param listofwords: words whose counts are wanted (ideally sharing a first letter)
    :return: list of dbWordCountObject (empty on empty input or missing tables)
    """

    # determine the keying letter *before* opening a db connection: the old code
    # opened the connection first and leaked it on the empty-input early return
    try:
        firstletteroffirstword = stripaccents(listofwords[0][0])
    except IndexError:
        return list()

    if firstletteroffirstword not in 'abcdefghijklmnopqrstuvwxyzαβψδεφγηιξκλμνοπρϲτυωχθζ':
        # non-letter-keyed material lives in wordcounts_0
        firstletteroffirstword = '0'

    dbconnection = ConnectionObject(readonlyconnection=False)
    dbcursor = dbconnection.cursor()

    tqtemplate = """
	CREATE TEMP TABLE bulkcounter_{rnd} AS
		SELECT values AS 
			entriestocheck FROM unnest(ARRAY[%s]) values
	"""

    uniquename = assignuniquename(12)
    tempquery = tqtemplate.format(rnd=uniquename)
    data = (listofwords, )
    dbcursor.execute(tempquery, data)

    qtemplate = """
	SELECT * FROM wordcounts_{x} WHERE EXISTS 
		(SELECT 1 FROM bulkcounter_{rnd} tocheck WHERE tocheck.entriestocheck = wordcounts_{x}.entry_name)
	"""

    query = qtemplate.format(rnd=uniquename, x=firstletteroffirstword)
    try:
        dbcursor.execute(query)
        results = resultiterator(dbcursor)
    except psycopg2.ProgrammingError:
        # if you do not have the wordcounts installed: 'ProgrammingError: relations "wordcounts_a" does not exist
        results = list()

    wordcountobjects = [dbWordCountObject(*r) for r in results]

    dbconnection.connectioncleanup()

    return wordcountobjects
示例#4
0
def offerlemmatahints(query) -> list:
    """
    fill in the hint box with eligible values

    since there are a crazy number of words, don't update until you are beyond 3 chars

    :param query: the (partial) term typed into the hint box so far
    :return: list of {'value': word} dicts for the JS autocomplete
    """

    hintlist = list()

    # normalize j/v and both sigma forms before keying into keyedlemmata
    invals = u'jvσς'
    outvals = u'iuϲϲ'

    if len(query) > 1:
        # query = stripaccents(term.lower())
        query = stripaccents(query)
        qlen = len(query)
        bag = query[0:2]
        key = stripaccents(bag.translate(str.maketrans(invals, outvals)))
        try:
            wordlist = keyedlemmata[key]
        except KeyError:
            wordlist = list()

        wordlist = polytonicsort(wordlist)

        # print('offerlemmatahints() wordlist', wordlist)

        if qlen > 2:
            # always true, but what if you changed 'len(term) > 2'?
            q = key + query[2:]
        else:
            q = key
        #hintlist = [{'value': w} for w in wordlist if q == stripaccents(w.lower()[0:qlen])]
        hintlist = [{
            'value': w
        } for w in wordlist if q == stripaccents(w[0:qlen])]

    if len(hintlist) > 50:
        hintlist = hintlist[0:50]
        # wrap the notice like every other entry: the consumer expects a list of
        # {'value': ...} dicts, and the old bare string broke that contract
        hintlist = [{'value': '(>50 items: list was truncated)'}] + hintlist

    return hintlist
示例#5
0
    def _generatekeyedwordcounts(self) -> dict:
        """
        return something like:

        {'πνεύμαϲι': 378, 'πνευμάτων': 2161, 'πνεῦμ': 89, 'πνεύμαϲ': 1, 'πνεῦμα': 23686, 'πνεύματα': 1959, 'πνεύματ': 17,
        'πνεύματοϲ': 19025, 'πνεύμαϲιν': 299, 'πνεύματι': 8855}

        :return: dict mapping observed form to its total count
        """
        # drop any trailing apostrophe from each observed form
        words = {re.sub(r"'$", str(), analysis.word) for analysis in self.listofanalyses}

        # the wordcount tables are keyed by unaccented first letter, so bucket accordingly
        buckets = dict()
        for w in words:
            buckets.setdefault(stripaccents(w[0]), []).append(w)

        # one bulk db hit per initial letter
        counts = list()
        for letter in buckets:
            counts.extend(bulkfindwordcounts(buckets[letter]))

        return {c.entryname: c.t for c in counts if c}
示例#6
0
    def _entrylistsplitter(matchgroup):
        """wrap each comma-separated synonym in a <dictionaryentry> tag and reassemble the matched string"""
        entrytemplate = r'<dictionaryentry id="{clean}">{dirty}</dictionaryentry>'

        head = matchgroup.group(1)
        synonymns = matchgroup.group(2).split(', ')
        tail = matchgroup.group(3)

        # accent-stripped form becomes the id; the original text stays visible
        tagged = ', '.join(
            entrytemplate.format(clean=stripaccents(s), dirty=s)
            for s in synonymns
        )

        return head + tagged + tail
示例#7
0
def buildkeyedlemmata(listofentries: list) -> defaultdict:
    """
    a list of 140k words is too long to send to 'getlemmahint' without offering quicker access

    a dict keyed on the first two (normalized) letters of each entry

    :param listofentries: lemmata headwords
    :return: defaultdict(list) mapping two-letter keys to lists of entries
    """

    invals = u'jvσς'
    outvals = u'iuϲϲ'
    # hoist the invariant translation table out of the loop
    transtable = str.maketrans(invals, outvals)

    keyedlemmata = defaultdict(list)

    if track:
        iterable = track(listofentries,
                         description='building keyedlemmata',
                         transient=True)
    else:
        print('building keyedlemmata', end=str())
        iterable = listofentries

    for e in iterable:
        try:
            # might IndexError here...
            bag = e[0:2]
            key = stripaccents(bag.translate(transtable))
            # NB: defaultdict(list) can never raise KeyError on append, so the
            # old 'try: ... except KeyError: keyedlemmata[key] = [e]' was dead code
            keyedlemmata[key].append(e)
        except IndexError:
            pass

    if track:
        print('building keyedlemmata', end=str())

    return keyedlemmata
示例#8
0
def findcountsviawordcountstable(wordtocheck):
    """
    used to look up a list of specific observed forms
    (vs. dictionary headwords)

    :param wordtocheck: an observed form
    :return: the matching wordcounts row (tuple) or None
    """

    dbconnection = ConnectionObject()
    dbcursor = dbconnection.cursor()

    validinitials = 'abcdefghijklmnopqrstuvwxyzαβψδεφγηιξκλμνοπρϲτυωχθζ'
    initial = stripaccents(wordtocheck[0])

    # alternatives = re.sub(r'[uv]','[uv]',c)
    # alternatives = '^'+alternatives+'$'
    if initial in validinitials:
        # note that we just lost "'φερον", "'φερεν", "'φέρεν", "'φερεϲ", "'φερε",...
        # but the punctuation killer probably zapped them long ago
        # this needs to be addressed in HipparchiaBuilder
        tablename = 'wordcounts_{i}'.format(i=initial)
    else:
        # non-letter-initial material lives in the catch-all table
        tablename = 'wordcounts_0'

    q = 'SELECT * FROM {t} WHERE entry_name = %s'.format(t=tablename)
    d = (wordtocheck, )

    try:
        dbcursor.execute(q, d)
        result = dbcursor.fetchone()
    except psycopg2.ProgrammingError:
        # psycopg2.ProgrammingError: relation "wordcounts_ε" does not exist
        # you did not build the wordcounts at all?
        result = None

    dbconnection.connectioncleanup()

    return result
示例#9
0
def lookformorphologymatches(word: str,
                             dbcursor,
                             trialnumber=0,
                             revertword=None,
                             rewrite=None,
                             furtherdeabbreviate=False) -> dbMorphologyObject:
    """
    look up the morphological analyses of an observed form; several retry
    strategies are attempted (recursively) before giving up

    hipparchiaDB=# select * from greek_morphology limit 1;
     observed_form |   xrefs   | prefixrefs |                                                             possible_dictionary_forms
    ---------------+-----------+------------+---------------------------------------------------------------------------------------------------------------------------------------------------
     Τηνίουϲ       | 114793123 |            | <possibility_1>Τήνιοϲ<xref_value>114793123</xref_value><xref_kind>0</xref_kind><transl> </transl><analysis>masc acc pl</analysis></possibility_1>+
               |           |            |
    hipparchiaDB=# select * from greek_lemmata where xref_number=114793123;
     dictionary_entry | xref_number |                  derivative_forms
    ------------------+-------------+----------------------------------------------------
     τήνιοϲ           |   114793123 | {τηνίων,τήνια,τηνίουϲ,τήνιοι,τηνίοιϲ,τηνία,τήνιοϲ}

    funky because we need to poke at words several times and to try combinations of fixes

    ought to pass a cursor to this one because this function will have trouble cleaning the connection properly

    :param word: the observed form to analyze
    :param dbcursor: a db cursor (caller owns the connection)
    :param trialnumber: which retry we are on (0 on the initial call)
    :param revertword: the pre-modification form to restore before the next retry
    :param rewrite: report this as the observed form on any hit (used after abbreviation expansion)
    :param furtherdeabbreviate: a vector run has already turned 'm.' into Marcus, so it is safe to turn 'm' into 'mille'
    :return: a single (possibly merged) dbMorphologyObject, or None on failure
    """

    # any ascii letter marks the word as latin
    if re.search(r'[a-z]', word):
        usedictionary = 'latin'
    else:
        usedictionary = 'greek'

    # strip the combining dot below (U+0323): βοῶ̣ντεϲ -> βοῶντεϲ
    word = re.sub(r'̣', str(), word)

    ihavesession = True
    try:
        session['available'][usedictionary + '_morphology']
    except RuntimeError:
        # vectorbot thread does not have access to the session...
        # we will *dangerously guess* that we can skip the next check because vectorbotters
        # are quite likely to have beefy installations...
        ihavesession = False

    if ihavesession and not session['available'][usedictionary +
                                                 '_morphology']:
        return None

    maxtrials = 4
    retrywithcapitalization = 1
    trialnumber += 1

    # the things that can confuse me
    terminalacute = re.compile(r'[άέίόύήώ]')

    morphobjects = None

    # syntax = '~' if you have to deal with '[uv]' problems, e.g.
    # but that opens up a whole new can of worms

    query = 'SELECT * FROM {d}_morphology WHERE observed_form = %s'.format(
        d=usedictionary)
    data = (word, )

    # print('lookformorphologymatches() q/d', query, data)

    dbcursor.execute(query, data)
    # NOT TRUE: fetchone() because all possiblities are stored inside the analysis itself
    # loss of case sensitivity is a problem here: Latro vs latro.
    analyses = dbcursor.fetchall()

    if analyses:
        morphobjects = [dbMorphologyObject(*a) for a in analyses]
        if rewrite:
            # report the caller-supplied form (e.g. the unexpanded abbreviation) as what was observed
            for m in morphobjects:
                m.observed = rewrite
                m.rewritten = True
    elif trialnumber < maxtrials:
        # turn 'kal' into 'kalends', etc.
        # not very costly as this is a dict lookup, and less costly than any call to the db
        newword = unpackcommonabbreviations(word, furtherdeabbreviate)
        if newword != word:
            # restart the trial count for the expanded word; remember the original via 'rewrite'
            return lookformorphologymatches(newword, dbcursor, 0, rewrite=word)

        if revertword:
            word = revertword
        # this code lets you make multiple stabs at an answer if you have already failed once
        # need to be careful about the retries that reset the trialnumber: could infinite loop if not careful
        # [a] something like πλακουντάριόν τι will fail because of the enclitic (greek_morphology can find πλακουντάριον and πλακουντάριοϲ)
        # [b] something like προχοίδιόν τι will fail twice over because of the enclitic and the diaresis

        try:
            # have to 'try...' because there might not be a word[-2]
            if trialnumber == 1:
                # elided ending? you will ask for ἀλλ, but you need to look for ἀλλ'
                newword = word + "'"
                morphobjects = lookformorphologymatches(newword,
                                                        dbcursor,
                                                        trialnumber,
                                                        revertword=word)
            elif trialnumber == 2:
                # a proper noun?
                newword = word[0].upper() + word[1:]
                morphobjects = lookformorphologymatches(newword,
                                                        dbcursor,
                                                        trialnumber,
                                                        revertword=word)
            elif re.search(r'\'$', word):
                # the last word in a greek quotation might have a 'close quote' that was mistaken for an elision
                newword = re.sub(r'\'', '', word)
                morphobjects = lookformorphologymatches(
                    newword, dbcursor, trialnumber)
            elif re.search(r'[ΐϊΰῧϋî]', word):
                # desperate: ῥηϊδίωϲ --> ῥηιδίωϲ
                diacritical = 'ΐϊΰῧϋî'
                plain = 'ίιύῦυi'
                xform = str.maketrans(diacritical, plain)
                newword = word.translate(xform)
                # NOTE: trialnumber is deliberately reset to 1 here so the elision/capitalization retries run again
                morphobjects = lookformorphologymatches(
                    newword, dbcursor, trialnumber=retrywithcapitalization)
            elif re.search(terminalacute, word[-1]):
                # an enclitic problem?
                sub = stripaccents(word[-1])
                newword = word[:-1] + sub
                morphobjects = lookformorphologymatches(
                    newword, dbcursor, trialnumber=retrywithcapitalization)
            elif re.search(terminalacute, word[-2]):
                # πλακουντάριόν?
                sub = stripaccents(word[-2])
                newword = word[:-2] + sub + word[-1]
                morphobjects = lookformorphologymatches(
                    newword, dbcursor, trialnumber=retrywithcapitalization)
            else:
                return None
        except IndexError:
            morphobjects = None

    if not morphobjects:
        return None

    # OK: we have a list of dbMorphologyObjects; this needs to be turned into a single object...
    # def __init__(self, observed, xrefs, prefixrefs, possibleforms):

    if isinstance(morphobjects, dbMorphologyObject):
        # you got here after multiple tries
        # if you don't do the next, the len() check will fail
        morphobjects = [morphobjects]

    if len(morphobjects) == 1:
        morphobject = morphobjects[0]
    else:
        # merge every row into one composite object
        ob = morphobjects[0].observed
        xr = flattenlistoflists([m.xrefs for m in morphobjects])
        xr = ', '.join(xr)
        pr = flattenlistoflists([m.prefixrefs for m in morphobjects])
        pr = ', '.join(pr)
        pf = [m.possibleforms for m in morphobjects]
        hw = flattenlistoflists([m.headwords for m in morphobjects])

        # note that you will have multiple '<possibility_1>' entries now... Does not matter ATM, but a bug waiting to bite
        mergedpf = dict()
        for p in pf:
            mergedpf = {**mergedpf, **p}

        morphobject = dbMorphologyObject(ob, xr, pr, mergedpf, hw)

    return morphobject
示例#10
0
def probedictionary(usedictionary: str,
                    usecolumn: str,
                    seeking: str,
                    syntax: str,
                    dbcursor=None,
                    trialnumber=0) -> List:
    """
    this will make several stabs at finding a word in the dictionary

    we need to do this because sometimes a find in the morphology dictionary does not point to something
    you can find in the dictionary of meanings

    sample values:
        dictionary:	'greek_dictionary'
        usecolumn: 'entry_name'
        seeking: 'προχοΐδιον'
        syntax: '=' or 'LIKE'

    still unimplemented:
        τήθη vs τηθή; the parser has the latter, the dictionary expects the former (but knows of the latter)

    :param dbcursor: optional cursor; if omitted, a connection is opened and cleaned up locally
    :param usedictionary: 'greek_dictionary' or 'latin_dictionary'
    :param usecolumn: column to match against
    :param seeking: the search term
    :param syntax: SQL comparison operator ('=', 'LIKE', '~', ...)
    :param trialnumber: which retry we are on (0 on the initial call)
    :return: list of word objects, or None if every trial failed
    """
    # print('seeking/trial',seeking,trialnumber)

    dbconnection = None
    if not dbcursor:
        # no cursor passed in: open our own connection and clean it up at the end
        dbconnection = ConnectionObject()
        dbconnection.setautocommit()
        dbcursor = dbconnection.cursor()

    maxtrials = 8
    trialnumber += 1
    # vowel pairs with/without an accent: candidates for diaresis repair
    accenteddiaresis = re.compile(r'αί|εί|οί|υί|ηί|ωί')
    unaccenteddiaresis = re.compile(r'αι|ει|οι|υι|ηι|ωι')

    # nothingfound = convertdictionaryfindintoobject('nothing', 'nodict')

    if usedictionary == 'latin_dictionary':
        extracolumn = 'entry_key'
    else:
        extracolumn = 'unaccented_entry'

    qtemplate = """SELECT entry_name, metrical_entry, id_number, pos, translations, 
					entry_body, {ec}
					FROM {d} WHERE {col} {sy} %s ORDER BY id_number ASC"""
    query = qtemplate.format(ec=extracolumn,
                             d=usedictionary,
                             col=usecolumn,
                             sy=syntax)
    data = (seeking, )
    # print('searchdictionary()',query,'\n\t',data)

    try:
        dbcursor.execute(query, data)
        found = dbcursor.fetchall()
    except psycopg2.DataError:
        # thrown by dbcursor.execute()
        # invalid regular expression: parentheses () not balanced
        # ό)μβροϲ is a (bogus) headword; how many others are there?
        found = list()

    # we might be at trial 2+ and so we need to strip the supplement we used at trial #1
    if trialnumber > 2:
        seeking = re.sub(r'\[¹²³⁴⁵⁶⁷⁸⁹\]', '', seeking)
        seeking = re.sub(r'\^', '', seeking)

    foundobjects = None

    if len(found) > 0:
        foundobjects = [
            convertdictionaryfindintowordobject(f, usedictionary, dbcursor)
            for f in found
        ]
    elif trialnumber == 1:
        # failure...
        # the word is probably there, we have just been given the wrong search term; try some other solutions
        # [1] first guess: there were multiple possible entries, not just one
        newword = re.sub(r'[¹²³⁴⁵⁶⁷⁸⁹]', '', seeking.lower())
        foundobjects = probedictionary(usedictionary, usecolumn, newword, '=',
                                       dbcursor, trialnumber)
    elif trialnumber == 2:
        # grab any/all variants: ⁰¹²³⁴⁵⁶⁷⁸⁹
        newword = '^{sk}[¹²³⁴⁵⁶⁷⁸⁹]'.format(sk=seeking)
        foundobjects = probedictionary(usedictionary, usecolumn, newword, '~',
                                       dbcursor, trialnumber)
    # elif trialnumber < maxtrials and '-' in seeking:
    # 	newword = attemptelision(seeking)
    # 	foundobject = searchdictionary(cursor, dictionary, usecolumn, newword, '=', trialnumber)
    elif trialnumber < maxtrials and seeking[-1] == 'ω':
        # ὑποϲυναλείφομαι is in the dictionary, but greek_lemmata says to look for ὑπό-ϲυναλείφω
        newword = seeking[:-1] + 'ομαι'
        foundobjects = probedictionary(usedictionary, usecolumn, newword, '=',
                                       dbcursor, trialnumber)
    elif trialnumber < maxtrials and re.search(r'ομαι$', seeking):
        # χαρίζω is in the dictionary, but greek_lemmata says to look for χαρίζομαι
        newword = seeking[:-4] + 'ω'
        foundobjects = probedictionary(usedictionary, usecolumn, newword, '=',
                                       dbcursor, trialnumber)
    elif trialnumber < maxtrials and re.search(accenteddiaresis, seeking):
        # false positives very easy here, but we are getting desperate and have nothing to lose
        diaresis = re.search(accenteddiaresis, seeking)
        head = seeking[:diaresis.start()]
        tail = seeking[diaresis.end():]
        vowels = diaresis.group(0)
        vowels = vowels[0] + 'ΐ'
        newword = head + vowels + tail
        foundobjects = probedictionary(usedictionary, usecolumn, newword, '=',
                                       dbcursor, trialnumber)
    elif trialnumber < maxtrials and re.search(unaccenteddiaresis, seeking):
        # same idea with the unaccented second vowel
        diaresis = re.search(unaccenteddiaresis, seeking)
        head = seeking[:diaresis.start()]
        tail = seeking[diaresis.end():]
        vowels = diaresis.group(0)
        vowels = vowels[0] + 'ϊ'
        newword = head + vowels + tail
        foundobjects = probedictionary(usedictionary, usecolumn, newword, '=',
                                       dbcursor, trialnumber)
    elif trialnumber < maxtrials:
        # τήθη vs τηθή; the parser has the latter, the dictionary expects the former (but knows of the latter)
        # last-ditch: a fuzzy, accent-free regex match; force this to be the final trial
        trialnumber = maxtrials - 1
        newword = re.sub(r'\[¹²³⁴⁵⁶⁷⁸⁹\]', '', seeking)
        newword = stripaccents(newword)
        newword = universalregexequivalent(newword)
        # strip '(' and ')'
        newword = '^{wd}$'.format(wd=newword[1:-1])
        foundobjects = probedictionary(usedictionary, usecolumn, newword, '~',
                                       dbcursor, trialnumber)

    if dbconnection:
        # only clean up a connection we opened ourselves
        dbconnection.connectioncleanup()

    return foundobjects
示例#11
0
def headwordsearch(seeking: str, limit: str, usedictionary: str,
                   usecolumn: str) -> List[tuple]:
    """
    dictsearch() uses this

    hipparchiaDB=# SELECT entry_name, id_number FROM latin_dictionary WHERE entry_name ~* '.*?scrof.*?' ORDER BY id_number ASC LIMIT 50;
      entry_name  | id_number
    --------------+-----------
     scrofa¹      |     43118
     Scrofa²      |     43119
     scrofinus    |     43120
     scrofipascus |     43121
     scrofulae    |     43122
    (5 rows)

    :param seeking: the search term; leading/trailing spaces act as anchors
    :param limit: maximum number of rows to return
    :param usedictionary: 'latin' or 'greek'
    :param usecolumn: column to match against
    :return: list of (entry_name, id_number) tuples
    """

    cleanpoolifneeded()
    dbconnection = ConnectionObject()
    dbcursor = dbconnection.cursor()

    qstring = 'SELECT entry_name, id_number FROM {d}_dictionary WHERE {c} ~* %s ORDER BY id_number ASC LIMIT {lim}'

    query = qstring.format(d=usedictionary, c=usecolumn, lim=limit)

    # translate leading/trailing spaces into regex anchors
    if seeking[0] == ' ' and seeking[-1] == ' ':
        data = ('^' + seeking[1:-1] + '$', )
    elif seeking[0] == ' ' and seeking[-1] != ' ':
        data = ('^' + seeking[1:] + '.*?', )
    elif seeking[0] == '^' and seeking[-1] == '$':
        # esp if the dictionary sent this via next/previous entry
        data = (seeking, )
    else:
        data = ('.*?' + seeking + '.*?', )

    dbcursor.execute(query, data)

    # note that the dictionary db has a problem with vowel lengths vs accents
    # SELECT * FROM greek_dictionary WHERE entry_name LIKE %s d ('μνᾱ/αϲθαι,μνάομαι',)
    try:
        foundentries = dbcursor.fetchall()
    except psycopg2.Error:
        # was a bare 'except:' which would also have swallowed KeyboardInterrupt, etc.;
        # only database errors should turn into 'nothing found'
        foundentries = list()

    # print('foundentries', foundentries)
    # '/dictsearch/scrof'
    # foundentries [('scrofa¹', 43118), ('scrofinus', 43120), ('scrofipascus', 43121), ('Scrofa²', 43119), ('scrofulae', 43122)]

    if not foundentries:
        # maybe there are only numbered variants: ask for 'word[¹²³...]'
        variantseeker = seeking[:-1] + '[¹²³⁴⁵⁶⁷⁸⁹]' + seeking[-1]
        data = (variantseeker, )
        dbcursor.execute(query, data)
        foundentries = dbcursor.fetchall()

    if not foundentries:
        # maybe an inflected form was requested (can happen via clicks inside of an entry)
        morph = lookformorphologymatches(seeking, dbcursor)
        if morph:
            guesses = morph.getpossible()
            firstguess = guesses[0].getbaseform()
            seeking = stripaccents(firstguess)
            data = ('^{s}$'.format(s=seeking), )
            # print('lookformorphologymatches() new data=', data)
            dbcursor.execute(query, data)
            foundentries = dbcursor.fetchall()

    dbconnection.connectioncleanup()

    return foundentries
示例#12
0
 def _entrywordcleaner(foundword, substitutionstring):
     """Slot 'foundword' into 'substitutionstring', using its accent-stripped form as the clean id."""
     # example substitute: r'<dictionaryentry id="{clean}">{dirty}</dictionaryentry>'
     cleaned = stripaccents(foundword)
     # print('entrywordcleaner()', foundword, cleaned)
     return substitutionstring.format(clean=cleaned, dirty=foundword)
示例#13
0
def dictsearch(searchterm) -> JSON_STR:
	"""
	look up words
	return dictionary entries
	json packing

	:param searchterm: the raw query string from the browser
	:return: JSON-encoded dict with 'newhtml' and 'newjs' keys
	"""
	returndict = dict()

	searchterm = searchterm[:hipparchia.config['MAXIMUMLEXICALLENGTH']]
	probeforsessionvariables()

	dbconnection = ConnectionObject()
	dbcursor = dbconnection.cursor()

	if hipparchia.config['UNIVERSALASSUMESBETACODE']:
		searchterm = replacegreekbetacode(searchterm.upper())

	allowedpunct = '^$.'
	seeking = depunct(searchterm, allowedpunct)
	seeking = seeking.lower()
	seeking = re.sub('[σς]', 'ϲ', seeking)
	stripped = stripaccents(seeking)

	# don't turn 'injurius' into '[iiII]n[iiII][uuVV]r[iiII][uuVV]s'
	# that will happen if you call stripaccents() prematurely
	stripped = re.sub(r'[uv]', '[uvUV]', stripped)
	stripped = re.sub(r'[ij]', '[ijIJ]', stripped)

	if re.search(r'[a-z]', seeking):
		usedictionary = 'latin'
		usecolumn = 'entry_name'
	else:
		usedictionary = 'greek'
		usecolumn = 'unaccented_entry'

	# (an exact duplicate of this availability check has been removed)
	if not session['available'][usedictionary + '_dictionary']:
		returndict['newhtml'] = 'cannot look up {w}: {d} dictionary is not installed'.format(d=usedictionary, w=seeking)
		# was leaking the connection on this early return
		dbconnection.connectioncleanup()
		return json.dumps(returndict)

	limit = hipparchia.config['CAPONDICTIONARYFINDS']

	foundtuples = headwordsearch(stripped, limit, usedictionary, usecolumn)

	# example:
	# results are presorted by ID# via the postgres query
	# foundentries [('scrofa¹', 43118), ('scrofinus', 43120), ('scrofipascus', 43121), ('Scrofa²', 43119), ('scrofulae', 43122)]

	returnlist = list()

	if len(foundtuples) == limit:
		returnlist.append('[stopped searching after {lim} finds]<br>'.format(lim=limit))

	if len(foundtuples) > 0:

		if len(foundtuples) == 1:
			# sending '0' to browserdictionarylookup() will hide the count number
			usecounter = False
		else:
			usecounter = True

		wordobjects = [probedictionary(setdictionarylanguage(f[0]) + '_dictionary', 'entry_name', f[0], '=', dbcursor=dbcursor, trialnumber=0) for f in foundtuples]
		wordobjects = flattenlistoflists(wordobjects)
		outputobjects = [lexicalOutputObject(w) for w in wordobjects]

		# very top: list the finds
		if usecounter:
			findstemplate = '({n})&nbsp;<a class="nounderline" href="#{w}_{wdid}">{w}</a>'
			findslist = [findstemplate.format(n=f[0]+1, w=f[1][0], wdid=f[1][1]) for f in enumerate(foundtuples)]
			returnlist.append('\n<br>\n'.join(findslist))

		# the actual entries
		count = 0
		for oo in outputobjects:
			count += 1
			if usecounter:
				entry = oo.generatelexicaloutput(countervalue=count)
			else:
				entry = oo.generatelexicaloutput()
			returnlist.append(entry)
	else:
		returnlist.append('[nothing found]')

	if session['zaplunates']:
		returnlist = [attemptsigmadifferentiation(x) for x in returnlist]
		returnlist = [abbreviatedsigmarestoration(x) for x in returnlist]

	returndict['newhtml'] = '\n'.join(returnlist)
	returndict['newjs'] = '\n'.join([dictionaryentryjs(), insertlexicalbrowserjs()])

	jsondict = json.dumps(returndict)

	dbconnection.connectioncleanup()

	return jsondict
示例#14
0
def findbyform(observedword, authorid=None) -> JSON_STR:
	"""
	this function sets off a chain of other functions:
	find dictionary form
	find the other possible forms
	look up the dictionary form
	return a formatted set of info

	:param observedword: the inflected form as seen in a text
	:param authorid: optional author restriction for the prevalence data
	:return: JSON-encoded dict with 'newhtml' and 'newjs' keys
	"""

	if authorid and authorid not in authordict:
		authorid = None

	observedword = observedword[:hipparchia.config['MAXIMUMLEXICALLENGTH']]

	probeforsessionvariables()

	dbconnection = ConnectionObject()
	dbcursor = dbconnection.cursor()

	sanitationerror = '[empty search: <span class="emph">{w}</span> was sanitized into nothingness]'
	dberror = '<br />[the {lang} morphology data has not been installed]'
	notfounderror = '<br />[could not find a match for <span class="emph">{cw}</span> in the morphology table]'
	nodataerror = '<br /><br />no prevalence data for {w}'

	# the next is pointless because: 'po/lemon' will generate a URL '/parse/po/lemon'
	# that will 404 before you can get to replacegreekbetacode()
	# this is a bug in the interaction between Flask and the JS

	# if hipparchia.config['UNIVERSALASSUMESBETACODE']:
	# 	observedword = replacegreekbetacode(observedword.upper())

	# the next makes sense only in the context of pointedly invalid input
	w = depunct(observedword)
	w = w.strip()
	w = tidyupterm(w)
	w = re.sub(r'[σς]', 'ϲ', w)

	# python seems to know how to do this with greek...
	w = w.lower()
	retainedgravity = w
	cleanedword = removegravity(retainedgravity)

	# index clicks will send you things like 'αὖ²'
	cleanedword = re.sub(r'[⁰¹²³⁴⁵⁶⁷⁸⁹]', str(), cleanedword)

	# the search syntax is '=' and not '~', so the next should be avoided unless a lot of refactoring will happen
	# cleanedword = re.sub(r'[uv]', r'[uv]', cleanedword)
	# cleanedword = re.sub(r'[ij]', r'[ij]', cleanedword)

	# a collection of HTML items that the JS will just dump out later; i.e. a sort of pseudo-page
	returndict = dict()

	try:
		cleanedword[0]
	except IndexError:
		returndict['newhtml'] = sanitationerror.format(w=observedword)
		# was leaking the connection on this early return
		dbconnection.connectioncleanup()
		return json.dumps(returndict)

	isgreek = True
	if re.search(r'[a-z]', cleanedword[0]):
		cleanedword = stripaccents(cleanedword)
		isgreek = False

	morphologyobject = lookformorphologymatches(cleanedword, dbcursor)
	# print('findbyform() mm',morphologyobject.getpossible()[0].transandanal)
	# φέρεται --> morphologymatches [('<possibility_1>', '1', 'φέρω', '122883104', '<transl>fero</transl><analysis>pres ind mp 3rd sg</analysis>')]

	if morphologyobject:
		oo = multipleWordOutputObject(cleanedword, morphologyobject, authorid)
		returndict['newhtml'] = oo.generateoutput()
	else:
		newhtml = list()
		if isgreek and not session['available']['greek_morphology']:
			newhtml.append(dberror.format(lang='Greek'))
		elif not isgreek and not session['available']['latin_morphology']:
			newhtml.append(dberror.format(lang='Latin'))
		else:
			newhtml.append(notfounderror.format(cw=cleanedword))

		prev = getobservedwordprevalencedata(cleanedword)
		if not prev:
			# fall back to the accented form before giving up
			# (was: appending the lookup result -- possibly None -- straight into newhtml
			# without ever reassigning 'prev', so the fallback data was never used)
			prev = getobservedwordprevalencedata(retainedgravity)
		if not prev:
			newhtml.append(nodataerror.format(w=retainedgravity))
		else:
			newhtml.append(prev)
		try:
			returndict['newhtml'] = '\n'.join(newhtml)
		except TypeError:
			returndict['newhtml'] = '[nothing found]'

	returndict['newjs'] = '\n'.join([dictionaryentryjs(), insertlexicalbrowserjs()])
	jsondict = json.dumps(returndict)

	dbconnection.connectioncleanup()

	return jsondict