def findsetofallwords(listofwordclusters: list) -> set:
	"""
	prepare a collection of lines/sentences for vectorization:
	split on spaces, normalize accents (Greek loses gravity, Latin
	is transliterated via the hipparchia table), strip punctuation,
	and return the unique non-empty words

	:param listofwordclusters: list of strings (lines or sentences)
	:return: set of cleaned words
	"""

	transtable = buildhipparchiatranstable()

	# flatten the clusters into a single stream of raw words
	rawwords = (w for cluster in listofwordclusters for w in cluster.split(' '))

	# classify each word once and normalize accordingly
	normalized = list()
	for w in rawwords:
		if re.search(minimumgreek, w):
			normalized.append(removegravity(w))
		else:
			normalized.append(stripaccents(w, transtable))

	# zap punctuation, then drop the empty string that pure-punctuation words become
	zappunct = re.compile('[{s}]'.format(s=re.escape(punctuation + extrapunct)))
	cleaned = {zappunct.sub(str(), w) for w in normalized}

	return cleaned - {''}
def polytonicsort(unsortedwords: list) -> list:
	"""
	sort Greek words in a way that plain sort() cannot: α, ά and ᾶ do not
	have adjacent code points, so naive numeric sorting scatters them

	decorate-sort-undecorate:
		[a] prefix each word with an unaccented copy of itself
		[b] swap lunate sigmas for regular sigmas in that prefix
		    (lunate sigma sorts after omega)
		[c] sort the decorated strings: accents now only break ties
		[d] strip the decoration and return

	example decoration: θαλαττησ-snip-θαλάττηϲ

	:param unsortedwords: list of words to sort
	:return: the words, polytonically sorted
	"""

	transtable = buildhipparchiatranstable()
	marker = '-snip-'

	decorated = list()
	for word in unsortedwords:
		if word:
			sortkey = re.sub(r'ϲ', r'σ', stripaccents(word, transtable))
			decorated.append(sortkey + marker + word)

	decorated.sort()

	# drop everything up to and including the marker, leaving the original word
	snipper = re.compile(r'(.*?)(-snip-)(.*?)')
	return [snipper.sub(r'\3', d) for d in decorated]
def bulkfindwordcounts(listofwords: List[str]) -> List[dbWordCountObject]:
	"""
	look up the wordcount rows for a batch of observed forms

	note that the lists of words should all start with the same letter since
	the wordcount tables are letter-keyed

	hipparchiaDB=# CREATE TEMP TABLE bulkcounter_51807f8bbe08 AS SELECT values AS entriestocheck FROM unnest(ARRAY['κατακλειούϲηϲ', 'κατακλῇϲαι', 'κατακλεῖϲαι']) values;

	hipparchiaDB=# SELECT * FROM wordcounts_κ WHERE EXISTS (SELECT 1 FROM bulkcounter_51807f8bbe08 tocheck WHERE tocheck.entriestocheck = wordcounts_κ.entry_name);
	  entry_name   | total_count | gr_count | lt_count | dp_count | in_count | ch_count
	---------------+-------------+----------+----------+----------+----------+----------
	 κατακλεῖϲαι   |          31 |       30 |        0 |        0 |        1 |        0
	 κατακλειούϲηϲ |           3 |        3 |        0 |        0 |        0 |        0
	 κατακλῇϲαι    |           1 |        1 |        0 |        0 |        0 |        0
	(3 rows)

	:param listofwords: observed forms, all sharing an initial letter
	:return: list of dbWordCountObject (empty if no words or no wordcount tables)
	"""

	# BUGFIX: determine the key letter BEFORE opening a connection; the old code
	# opened the connection first and the 'except IndexError: return list()' path
	# leaked it (connectioncleanup() was never reached)
	try:
		firstletteroffirstword = stripaccents(listofwords[0][0])
	except IndexError:
		# empty word list: nothing to look up
		return list()

	# words starting with anything outside the keyed alphabets live in wordcounts_0
	if firstletteroffirstword not in 'abcdefghijklmnopqrstuvwxyzαβψδεφγηιξκλμνοπρϲτυωχθζ':
		firstletteroffirstword = '0'

	dbconnection = ConnectionObject(readonlyconnection=False)
	dbcursor = dbconnection.cursor()

	# stage the words in a uniquely-named temp table so the main query can
	# do a single EXISTS join instead of N individual lookups
	tqtemplate = """
	CREATE TEMP TABLE bulkcounter_{rnd} AS
		SELECT values AS 
			entriestocheck FROM unnest(ARRAY[%s]) values
	"""

	uniquename = assignuniquename(12)
	tempquery = tqtemplate.format(rnd=uniquename)
	data = (listofwords, )
	dbcursor.execute(tempquery, data)

	qtemplate = """
	SELECT * FROM wordcounts_{x} WHERE EXISTS 
		(SELECT 1 FROM bulkcounter_{rnd} tocheck WHERE tocheck.entriestocheck = wordcounts_{x}.entry_name)
	"""

	query = qtemplate.format(rnd=uniquename, x=firstletteroffirstword)
	try:
		dbcursor.execute(query)
		results = resultiterator(dbcursor)
	except psycopg2.ProgrammingError:
		# if you do not have the wordcounts installed: 'ProgrammingError: relations "wordcounts_a" does not exist
		results = list()

	wordcountobjects = [dbWordCountObject(*r) for r in results]

	dbconnection.connectioncleanup()

	return wordcountobjects
def offerlemmatahints(query) -> list:
	"""
	fill the hint box with eligible headwords for a partial query

	the full word list is huge, so nothing happens until at least two
	characters are available; lookups go through the two-character keys
	of 'keyedlemmata' and the candidates are then filtered by prefix match

	:param query: the (partial) search term
	:return: list of {'value': word} dicts, truncated at 50 items
	"""

	swapin = u'jvσς'
	swapout = u'iuϲϲ'

	hintlist = list()

	if len(query) > 1:
		# query = stripaccents(term.lower())
		cleaned = stripaccents(query)
		querylength = len(cleaned)
		key = stripaccents(cleaned[0:2].translate(str.maketrans(swapin, swapout)))

		try:
			candidates = keyedlemmata[key]
		except KeyError:
			candidates = list()

		candidates = polytonicsort(candidates)

		# print('offerlemmatahints() wordlist', candidates)

		if querylength > 2:  # always true, but what if you changed 'len(term) > 2'?
			probe = key + cleaned[2:]
		else:
			probe = key

		#hintlist = [{'value': w} for w in wordlist if q == stripaccents(w.lower()[0:qlen])]
		hintlist = [{'value': c} for c in candidates if probe == stripaccents(c[0:querylength])]

	if len(hintlist) > 50:
		hintlist = ['(>50 items: list was truncated)'] + hintlist[0:50]

	return hintlist
def _generatekeyedwordcounts(self) -> dict:
	"""
	map each observed form in self.listofanalyses to its total count

	returns something like:
		{'πνεύμαϲι': 378, 'πνευμάτων': 2161, 'πνεῦμ': 89, 'πνεύμαϲ': 1, 'πνεῦμα': 23686,
		'πνεύματα': 1959, 'πνεύματ': 17, 'πνεύματοϲ': 19025, 'πνεύμαϲιν': 299, 'πνεύματι': 8855}

	:return: {word: totalcount, ...}
	"""

	# trailing elision marks would break the wordcount lookup
	words = {re.sub(r"'$", str(), analysis.word) for analysis in self.listofanalyses}

	# the wordcount tables are keyed by (unaccented) initial letter,
	# so group before querying in bulk
	groupedbyinitial = defaultdict(list)
	for w in words:
		groupedbyinitial[stripaccents(w[0])].append(w)

	countobjects = list()
	for initial in groupedbyinitial:
		countobjects.extend(bulkfindwordcounts(groupedbyinitial[initial]))

	return {c.entryname: c.t for c in countobjects if c}
def _entrylistsplitter(matchgroup):
	"""
	rewrap a comma-separated run of synonyms (regex group 2) so each one
	becomes a clickable <dictionaryentry> tag; groups 1 and 3 are the
	untouched text before and after the run
	"""
	entrytemplate = r'<dictionaryentry id="{clean}">{dirty}</dictionaryentry>'

	prefix = matchgroup.group(1)
	suffix = matchgroup.group(3)

	tagged = [entrytemplate.format(clean=stripaccents(s), dirty=s) for s in matchgroup.group(2).split(', ')]

	return prefix + ', '.join(tagged) + suffix
def buildkeyedlemmata(listofentries: list) -> defaultdict:
	"""
	a list of 140k words is too long to send to 'getlemmahint' without offering quicker access

	build a dict keyed by the (normalized, accent-stripped) first two letters
	of each entry so later lookups touch only a small bucket

	:param listofentries: list of lemma headwords
	:return: defaultdict(list) of {two-letter-key: [entries, ...]}
	"""

	invals = u'jvσς'
	outvals = u'iuϲϲ'

	# FIX: build the translation table once; the old code rebuilt it on every
	# iteration of a 140k-entry loop
	transtable = str.maketrans(invals, outvals)

	keyedlemmata = defaultdict(list)

	if track:
		iterable = track(listofentries, description='building keyedlemmata', transient=True)
	else:
		print('building keyedlemmata', end=str())
		iterable = listofentries

	for e in iterable:
		try:
			# slicing itself cannot IndexError; this guard protects against
			# odd entries blowing up inside stripaccents() — kept from the original
			bag = e[0:2]
			key = stripaccents(bag.translate(transtable))
			# keyedlemmata is a defaultdict(list): missing keys auto-create a
			# list, so plain append suffices (the old try/except KeyError
			# fallback was dead code)
			keyedlemmata[key].append(e)
		except IndexError:
			pass

	if track:
		print('building keyedlemmata', end=str())

	return keyedlemmata
def findcountsviawordcountstable(wordtocheck):
	"""
	look up a specific observed form (vs. a dictionary headword) in the
	letter-keyed wordcount tables

	:param wordtocheck: the observed form to count
	:return: the matching db row, or None
	"""

	dbconnection = ConnectionObject()
	dbcursor = dbconnection.cursor()

	initial = stripaccents(wordtocheck[0])

	# alternatives = re.sub(r'[uv]','[uv]',c)
	# alternatives = '^'+alternatives+'$'

	validinitials = 'abcdefghijklmnopqrstuvwxyzαβψδεφγηιξκλμνοπρϲτυωχθζ'
	if initial in validinitials:
		# note that we just lost "'φερον", "'φερεν", "'φέρεν", "'φερεϲ", "'φερε",...
		# but the punctuation killer probably zapped them long ago
		# this needs to be addressed in HipparchiaBuilder
		tablename = 'wordcounts_{i}'.format(i=initial)
	else:
		# anything with a non-alphabetic initial lives in the catch-all table
		tablename = 'wordcounts_0'

	query = 'SELECT * FROM {t} WHERE entry_name = %s'.format(t=tablename)
	data = (wordtocheck, )

	try:
		dbcursor.execute(query, data)
		result = dbcursor.fetchone()
	except psycopg2.ProgrammingError:
		# psycopg2.ProgrammingError: relation "wordcounts_ε" does not exist
		# you did not build the wordcounts at all?
		result = None

	dbconnection.connectioncleanup()

	return result
def lookformorphologymatches(word: str, dbcursor, trialnumber=0, revertword=None, rewrite=None, furtherdeabbreviate=False) -> dbMorphologyObject:
	"""
	find the morphology object(s) for an observed form; retries with
	progressively modified spellings when the first lookup fails

	hipparchiaDB=# select * from greek_morphology limit 1;
	 observed_form |   xrefs   | prefixrefs |                                                              possible_dictionary_forms
	---------------+-----------+------------+---------------------------------------------------------------------------------------------------------------------------------------------------
	 Τηνίουϲ       | 114793123 |            | <possibility_1>Τήνιοϲ<xref_value>114793123</xref_value><xref_kind>0</xref_kind><transl> </transl><analysis>masc acc pl</analysis></possibility_1>+
	               |           |            |

	hipparchiaDB=# select * from greek_lemmata where xref_number=114793123;
	 dictionary_entry | xref_number |                  derivative_forms
	------------------+-------------+----------------------------------------------------
	 τήνιοϲ           | 114793123   | {τηνίων,τήνια,τηνίουϲ,τήνιοι,τηνίοιϲ,τηνία,τήνιοϲ}

	funky because we need to poke at words several times and to try combinations of fixes

	ought to pass a cursor to this one because this function will have trouble cleaning the connection properly

	NOTE(review): despite the annotation this returns None on total failure
	or when the relevant morphology table is not installed

	:param word: the observed form to parse
	:param dbcursor: open db cursor (caller owns the connection)
	:param trialnumber: which retry we are on; 0 on the initial call
	:param revertword: original spelling to restore before the next retry
	:param rewrite: spelling to force onto the found objects' .observed
	:param furtherdeabbreviate: a vector run has already turned 'm.' into Marcus, so it is safe to turn 'm' into 'mille'
	:return: a single (possibly merged) dbMorphologyObject, or None
	"""

	# any latin letter at all routes the whole word to the latin table
	if re.search(r'[a-z]', word):
		usedictionary = 'latin'
	else:
		usedictionary = 'greek'

	# strip combining-dot-below ("uncertain letter" marker): βοῶ̣ντεϲ -> βοῶντεϲ
	word = re.sub(r'̣', str(), word)

	ihavesession = True
	try:
		session['available'][usedictionary + '_morphology']
	except RuntimeError:
		# vectorbot thread does not have access to the session...
		# we will *dangerously guess* that we can skip the next check because vectorbotters
		# are quite likely to have beefy installations...
		ihavesession = False

	if ihavesession and not session['available'][usedictionary + '_morphology']:
		return None

	maxtrials = 4
	retrywithcapitalization = 1
	trialnumber += 1

	# the things that can confuse me
	terminalacute = re.compile(r'[άέίόύήώ]')

	morphobjects = None

	# syntax = '~' if you have to deal with '[uv]' problems, e.g.
	# but that opens up a whole new can of worms
	query = 'SELECT * FROM {d}_morphology WHERE observed_form = %s'.format(d=usedictionary)
	data = (word,)

	# print('lookformorphologymatches() q/d', query, data)

	dbcursor.execute(query, data)
	# NOT TRUE: fetchone() because all possiblities are stored inside the analysis itself
	# loss of case sensitivity is a problem here: Latro vs latro.
	analyses = dbcursor.fetchall()

	if analyses:
		morphobjects = [dbMorphologyObject(*a) for a in analyses]
		if rewrite:
			# caller asked us to report these hits under a different spelling
			# (e.g. the pre-deabbreviation form)
			for m in morphobjects:
				m.observed = rewrite
				m.rewritten = True
	elif trialnumber < maxtrials:
		# turn 'kal' into 'kalends', etc.
		# not very costly as this is a dict lookup, and less costly than any call to the db
		newword = unpackcommonabbreviations(word, furtherdeabbreviate)
		if newword != word:
			# restart the trial counter: the expanded word deserves a fresh cascade
			return lookformorphologymatches(newword, dbcursor, 0, rewrite=word)

		if revertword:
			word = revertword
		# this code lets you make multiple stabs at an answer if you have already failed once
		# need to be careful about the retries that reset the trialnumber: could infinite loop if not careful
		# [a] something like πλακουντάριόν τι will fail because of the enclitic (greek_morphology can find πλακουντάριον and πλακουντάριοϲ)
		# [b] something like προχοίδιόν τι will fail twice over because of the enclitic and the diaresis
		try:
			# have to 'try...' because there might not be a word[-2]
			if trialnumber == 1:
				# elided ending? you will ask for ἀλλ, but you need to look for ἀλλ'
				newword = word + "'"
				morphobjects = lookformorphologymatches(newword, dbcursor, trialnumber, revertword=word)
			elif trialnumber == 2:
				# a proper noun?
				newword = word[0].upper() + word[1:]
				morphobjects = lookformorphologymatches(newword, dbcursor, trialnumber, revertword=word)
			elif re.search(r'\'$', word):
				# the last word in a greek quotation might have a 'close quote' that was mistaken for an elision
				newword = re.sub(r'\'', '', word)
				morphobjects = lookformorphologymatches(newword, dbcursor, trialnumber)
			elif re.search(r'[ΐϊΰῧϋî]', word):
				# desperate: ῥηϊδίωϲ --> ῥηιδίωϲ
				diacritical = 'ΐϊΰῧϋî'
				plain = 'ίιύῦυi'
				xform = str.maketrans(diacritical, plain)
				newword = word.translate(xform)
				# NOTE: resets trialnumber to 1 so the whole cascade reruns on the fixed word
				morphobjects = lookformorphologymatches(newword, dbcursor, trialnumber=retrywithcapitalization)
			elif re.search(terminalacute, word[-1]):
				# an enclitic problem?
				sub = stripaccents(word[-1])
				newword = word[:-1] + sub
				morphobjects = lookformorphologymatches(newword, dbcursor, trialnumber=retrywithcapitalization)
			elif re.search(terminalacute, word[-2]):
				# πλακουντάριόν?
				sub = stripaccents(word[-2])
				newword = word[:-2] + sub + word[-1]
				morphobjects = lookformorphologymatches(newword, dbcursor, trialnumber=retrywithcapitalization)
			else:
				return None
		except IndexError:
			morphobjects = None

	if not morphobjects:
		return None

	# OK: we have a list of dbMorphologyObjects; this needs to be turned into a single object...
	# def __init__(self, observed, xrefs, prefixrefs, possibleforms):

	if isinstance(morphobjects, dbMorphologyObject):
		# you got here after multiple tries
		# if you don't do the next, the len() check will fail
		morphobjects = [morphobjects]

	if len(morphobjects) == 1:
		morphobject = morphobjects[0]
	else:
		# merge the hits into a single composite object
		ob = morphobjects[0].observed
		xr = flattenlistoflists([m.xrefs for m in morphobjects])
		xr = ', '.join(xr)
		pr = flattenlistoflists([m.prefixrefs for m in morphobjects])
		pr = ', '.join(pr)
		pf = [m.possibleforms for m in morphobjects]
		hw = flattenlistoflists([m.headwords for m in morphobjects])
		# note that you will have multiple '<possibility_1>' entries now... Does not matter ATM, but a bug waiting to bite
		mergedpf = dict()
		for p in pf:
			mergedpf = {**mergedpf, **p}
		morphobject = dbMorphologyObject(ob, xr, pr, mergedpf, hw)

	return morphobject
def probedictionary(usedictionary: str, usecolumn: str, seeking: str, syntax: str, dbcursor=None, trialnumber=0) -> List:
	"""
	this will make several stabs at finding a word in the dictionary

	we need to do this because sometimes a find in the morphology dictionary does not point to something
	you can find in the dictionary of meanings

	sample values:
		dictionary:	'greek_dictionary'
		usecolumn: 'entry_name'
		seeking: 'προχοΐδιον'
		syntax: '=' or 'LIKE'

	still unimplemented:
		τήθη vs τηθή; the parser has the latter, the dictionary expects the former (but knows of the latter)

	:param dbcursor: reuse this cursor if given; otherwise a connection is opened and cleaned up here
	:param usedictionary: table name ('greek_dictionary' or 'latin_dictionary')
	:param usecolumn: column to match against
	:param seeking: the probe string (may be a regex when syntax is '~')
	:param syntax: SQL comparison operator for the WHERE clause
	:param trialnumber: which retry of the cascade we are on
	:return: list of word objects, or None if every trial failed
	"""
	# print('seeking/trial',seeking,trialnumber)

	dbconnection = None
	if not dbcursor:
		dbconnection = ConnectionObject()
		dbconnection.setautocommit()
		dbcursor = dbconnection.cursor()

	maxtrials = 8
	trialnumber += 1
	accenteddiaresis = re.compile(r'αί|εί|οί|υί|ηί|ωί')
	unaccenteddiaresis = re.compile(r'αι|ει|οι|υι|ηι|ωι')

	# nothingfound = convertdictionaryfindintoobject('nothing', 'nodict')

	# the latin and greek dictionary tables differ in one column name
	if usedictionary == 'latin_dictionary':
		extracolumn = 'entry_key'
	else:
		extracolumn = 'unaccented_entry'

	qtemplate = """SELECT entry_name, metrical_entry, id_number, pos, translations, 
				entry_body, {ec} FROM {d} WHERE {col} {sy} %s ORDER BY id_number ASC"""
	query = qtemplate.format(ec=extracolumn, d=usedictionary, col=usecolumn, sy=syntax)
	data = (seeking, )
	# print('searchdictionary()',query,'\n\t',data)

	try:
		dbcursor.execute(query, data)
		found = dbcursor.fetchall()
	except psycopg2.DataError:
		# thrown by dbcursor.execute()
		# invalid regular expression: parentheses () not balanced
		# ό)μβροϲ is a (bogus) headword; how many others are there?
		found = list()

	# we might be at trial 2+ and so we need to strip the supplement we used at trial #1
	if trialnumber > 2:
		seeking = re.sub(r'\[¹²³⁴⁵⁶⁷⁸⁹\]', '', seeking)
		seeking = re.sub(r'\^', '', seeking)

	foundobjects = None

	if len(found) > 0:
		foundobjects = [convertdictionaryfindintowordobject(f, usedictionary, dbcursor) for f in found]
	elif trialnumber == 1:
		# failure...
		# the word is probably there, we have just been given the wrong search term; try some other solutions
		# [1] first guess: there were multiple possible entries, not just one
		newword = re.sub(r'[¹²³⁴⁵⁶⁷⁸⁹]', '', seeking.lower())
		foundobjects = probedictionary(usedictionary, usecolumn, newword, '=', dbcursor, trialnumber)
	elif trialnumber == 2:
		# grab any/all variants: ⁰¹²³⁴⁵⁶⁷⁸⁹
		newword = '^{sk}[¹²³⁴⁵⁶⁷⁸⁹]'.format(sk=seeking)
		foundobjects = probedictionary(usedictionary, usecolumn, newword, '~', dbcursor, trialnumber)
	# elif trialnumber < maxtrials and '-' in seeking:
	# 	newword = attemptelision(seeking)
	# 	foundobject = searchdictionary(cursor, dictionary, usecolumn, newword, '=', trialnumber)
	elif trialnumber < maxtrials and seeking[-1] == 'ω':
		# ὑποϲυναλείφομαι is in the dictionary, but greek_lemmata says to look for ὑπό-ϲυναλείφω
		newword = seeking[:-1] + 'ομαι'
		foundobjects = probedictionary(usedictionary, usecolumn, newword, '=', dbcursor, trialnumber)
	elif trialnumber < maxtrials and re.search(r'ομαι$', seeking):
		# χαρίζω is in the dictionary, but greek_lemmata says to look for χαρίζομαι
		newword = seeking[:-4] + 'ω'
		foundobjects = probedictionary(usedictionary, usecolumn, newword, '=', dbcursor, trialnumber)
	elif trialnumber < maxtrials and re.search(accenteddiaresis, seeking):
		# false positives very easy here, but we are getting desperate and have nothing to lose
		diaresis = re.search(accenteddiaresis, seeking)
		head = seeking[:diaresis.start()]
		tail = seeking[diaresis.end():]
		vowels = diaresis.group(0)
		vowels = vowels[0] + 'ΐ'
		newword = head + vowels + tail
		foundobjects = probedictionary(usedictionary, usecolumn, newword, '=', dbcursor, trialnumber)
	elif trialnumber < maxtrials and re.search(unaccenteddiaresis, seeking):
		# same trick, unaccented flavor
		diaresis = re.search(unaccenteddiaresis, seeking)
		head = seeking[:diaresis.start()]
		tail = seeking[diaresis.end():]
		vowels = diaresis.group(0)
		vowels = vowels[0] + 'ϊ'
		newword = head + vowels + tail
		foundobjects = probedictionary(usedictionary, usecolumn, newword, '=', dbcursor, trialnumber)
	elif trialnumber < maxtrials:
		# τήθη vs τηθή; the parser has the latter, the dictionary expects the former (but knows of the latter)
		# jump straight to the last-ditch regex trial
		trialnumber = maxtrials - 1
		newword = re.sub(r'\[¹²³⁴⁵⁶⁷⁸⁹\]', '', seeking)
		newword = stripaccents(newword)
		newword = universalregexequivalent(newword)
		# strip '(' and ')'
		newword = '^{wd}$'.format(wd=newword[1:-1])
		foundobjects = probedictionary(usedictionary, usecolumn, newword, '~', dbcursor, trialnumber)

	# only close the connection if we opened it ourselves
	if dbconnection:
		dbconnection.connectioncleanup()

	return foundobjects
def headwordsearch(seeking: str, limit: str, usedictionary: str, usecolumn: str) -> List[tuple]:
	"""
	dictsearch() uses this

	hipparchiaDB=# SELECT entry_name, id_number FROM latin_dictionary WHERE entry_name ~* '.*?scrof.*?' ORDER BY id_number ASC LIMIT 50;
	  entry_name  | id_number
	--------------+-----------
	 scrofa¹      |     43118
	 Scrofa²      |     43119
	 scrofinus    |     43120
	 scrofipascus |     43121
	 scrofulae    |     43122
	(5 rows)

	:param seeking: probe string; leading/trailing spaces and ^/$ adjust the match anchoring
	:param limit: max number of rows to return
	:param usedictionary: 'greek' or 'latin'
	:param usecolumn: column to match against
	:return: list of (entry_name, id_number) tuples
	"""

	cleanpoolifneeded()
	dbconnection = ConnectionObject()
	dbcursor = dbconnection.cursor()

	qstring = 'SELECT entry_name, id_number FROM {d}_dictionary WHERE {c} ~* %s ORDER BY id_number ASC LIMIT {lim}'
	query = qstring.format(d=usedictionary, c=usecolumn, lim=limit)

	# FIX: an empty probe used to raise IndexError on seeking[0]
	if not seeking:
		dbconnection.connectioncleanup()
		return list()

	if seeking[0] == ' ' and seeking[-1] == ' ':
		data = ('^' + seeking[1:-1] + '$', )
	elif seeking[0] == ' ' and seeking[-1] != ' ':
		data = ('^' + seeking[1:] + '.*?', )
	elif seeking[0] == '^' and seeking[-1] == '$':
		# esp if the dictionary sent this via next/previous entry
		data = (seeking, )
	else:
		data = ('.*?' + seeking + '.*?', )

	dbcursor.execute(query, data)

	# note that the dictionary db has a problem with vowel lengths vs accents
	# SELECT * FROM greek_dictionary WHERE entry_name LIKE %s d ('μνᾱ/αϲθαι,μνάομαι',)

	try:
		foundentries = dbcursor.fetchall()
	except psycopg2.Error:
		# FIX: was a bare 'except:'; only swallow database errors,
		# not KeyboardInterrupt/SystemExit/etc.
		foundentries = list()

	# print('foundentries', foundentries)
	# '/dictsearch/scrof'
	# foundentries [('scrofa¹', 43118), ('scrofinus', 43120), ('scrofipascus', 43121), ('Scrofa²', 43119), ('scrofulae', 43122)]

	if not foundentries:
		# maybe the headword carries a homonym marker: retry with one injected
		variantseeker = seeking[:-1] + '[¹²³⁴⁵⁶⁷⁸⁹]' + seeking[-1]
		data = (variantseeker, )
		dbcursor.execute(query, data)
		foundentries = dbcursor.fetchall()

	if not foundentries:
		# maybe an inflected form was requested (can happen via clicks inside of an entry)
		morph = lookformorphologymatches(seeking, dbcursor)
		if morph:
			guesses = morph.getpossible()
			# FIX: guard against an empty guess list before indexing [0]
			if guesses:
				firstguess = guesses[0].getbaseform()
				seeking = stripaccents(firstguess)
				data = ('^{s}$'.format(s=seeking), )
				# print('lookformorphologymatches() new data=', data)
				dbcursor.execute(query, data)
				foundentries = dbcursor.fetchall()

	dbconnection.connectioncleanup()

	return foundentries
def _entrywordcleaner(foundword, substitutionstring):
	"""
	drop a word into a substitution template, keyed by its accent-stripped form

	example substitute: r'<dictionaryentry id="{clean}">{dirty}</dictionaryentry>'
	"""
	# print('entrywordcleaner()', foundword, stripaccents(foundword))
	return substitutionstring.format(clean=stripaccents(foundword), dirty=foundword)
def dictsearch(searchterm) -> JSON_STR:
	"""
	look up words

	return dictionary entries

	json packing

	:param searchterm: the raw search string from the browser
	:return: JSON string with 'newhtml' and 'newjs' keys
	"""
	returndict = dict()

	searchterm = searchterm[:hipparchia.config['MAXIMUMLEXICALLENGTH']]
	probeforsessionvariables()

	dbconnection = ConnectionObject()
	dbcursor = dbconnection.cursor()

	if hipparchia.config['UNIVERSALASSUMESBETACODE']:
		searchterm = replacegreekbetacode(searchterm.upper())

	allowedpunct = '^$.'
	seeking = depunct(searchterm, allowedpunct)
	seeking = seeking.lower()
	seeking = re.sub('[σς]', 'ϲ', seeking)
	stripped = stripaccents(seeking)

	# don't turn 'injurius' into '[iiII]n[iiII][uuVV]r[iiII][uuVV]s'
	# that will happen if you call stripaccents() prematurely
	stripped = re.sub(r'[uv]', '[uvUV]', stripped)
	stripped = re.sub(r'[ij]', '[ijIJ]', stripped)

	if re.search(r'[a-z]', seeking):
		usedictionary = 'latin'
		usecolumn = 'entry_name'
	else:
		usedictionary = 'greek'
		usecolumn = 'unaccented_entry'

	# FIX: this availability check was duplicated verbatim, and its early
	# return leaked the db connection (connectioncleanup() was skipped)
	if not session['available'][usedictionary + '_dictionary']:
		returndict['newhtml'] = 'cannot look up {w}: {d} dictionary is not installed'.format(d=usedictionary, w=seeking)
		dbconnection.connectioncleanup()
		return json.dumps(returndict)

	limit = hipparchia.config['CAPONDICTIONARYFINDS']

	foundtuples = headwordsearch(stripped, limit, usedictionary, usecolumn)

	# example:
	# results are presorted by ID# via the postgres query
	# foundentries [('scrofa¹', 43118), ('scrofinus', 43120), ('scrofipascus', 43121), ('Scrofa²', 43119), ('scrofulae', 43122)]

	returnlist = list()

	if len(foundtuples) == limit:
		returnlist.append('[stopped searching after {lim} finds]<br>'.format(lim=limit))

	if len(foundtuples) > 0:
		# sending '0' to browserdictionarylookup() will hide the count number
		usecounter = len(foundtuples) != 1

		wordobjects = [probedictionary(setdictionarylanguage(f[0]) + '_dictionary', 'entry_name', f[0], '=', dbcursor=dbcursor, trialnumber=0) for f in foundtuples]
		wordobjects = flattenlistoflists(wordobjects)
		outputobjects = [lexicalOutputObject(w) for w in wordobjects]

		# very top: list the finds
		if usecounter:
			findstemplate = '({n}) <a class="nounderline" href="#{w}_{wdid}">{w}</a>'
			findslist = [findstemplate.format(n=f[0]+1, w=f[1][0], wdid=f[1][1]) for f in enumerate(foundtuples)]
			returnlist.append('\n<br>\n'.join(findslist))

		# the actual entries
		for count, oo in enumerate(outputobjects, start=1):
			if usecounter:
				entry = oo.generatelexicaloutput(countervalue=count)
			else:
				entry = oo.generatelexicaloutput()
			returnlist.append(entry)
	else:
		returnlist.append('[nothing found]')

	if session['zaplunates']:
		returnlist = [attemptsigmadifferentiation(x) for x in returnlist]
		returnlist = [abbreviatedsigmarestoration(x) for x in returnlist]

	returndict['newhtml'] = '\n'.join(returnlist)
	returndict['newjs'] = '\n'.join([dictionaryentryjs(), insertlexicalbrowserjs()])

	jsondict = json.dumps(returndict)

	dbconnection.connectioncleanup()

	return jsondict
def findbyform(observedword, authorid=None) -> JSON_STR:
	"""
	this function sets of a chain of other functions
	find dictionary form
	find the other possible forms
	look up the dictionary form
	return a formatted set of info

	:param observedword: the inflected form clicked/typed by the user
	:param authorid: optional author context; dropped if not in authordict
	:return: JSON string with 'newhtml' and 'newjs' keys
	"""

	if authorid and authorid not in authordict:
		authorid = None

	observedword = observedword[:hipparchia.config['MAXIMUMLEXICALLENGTH']]

	probeforsessionvariables()

	dbconnection = ConnectionObject()
	dbcursor = dbconnection.cursor()

	sanitationerror = '[empty search: <span class="emph">{w}</span> was sanitized into nothingness]'
	dberror = '<br />[the {lang} morphology data has not been installed]'
	notfounderror = '<br />[could not find a match for <span class="emph">{cw}</span> in the morphology table]'
	nodataerror = '<br /><br />no prevalence data for {w}'

	# the next is pointless because: 'po/lemon' will generate a URL '/parse/po/lemon'
	# that will 404 before you can get to replacegreekbetacode()
	# this is a bug in the interaction between Flask and the JS

	# if hipparchia.config['UNIVERSALASSUMESBETACODE']:
	# 	observedword = replacegreekbetacode(observedword.upper())

	# the next makes sense only in the context of pointedly invalid input
	w = depunct(observedword)
	w = w.strip()
	w = tidyupterm(w)
	w = re.sub(r'[σς]', 'ϲ', w)

	# python seems to know how to do this with greek...
	w = w.lower()
	retainedgravity = w
	cleanedword = removegravity(retainedgravity)

	# index clicks will send you things like 'αὖ²'
	cleanedword = re.sub(r'[⁰¹²³⁴⁵⁶⁷⁸⁹]', str(), cleanedword)

	# the search syntax is '=' and not '~', so the next should be avoided unless a lot of refactoring will happen
	# cleanedword = re.sub(r'[uv]', r'[uv]', cleanedword)
	# cleanedword = re.sub(r'[ij]', r'[ij]', cleanedword)

	# a collection of HTML items that the JS will just dump out later; i.e. a sort of pseudo-page
	returndict = dict()

	try:
		cleanedword[0]
	except IndexError:
		returndict['newhtml'] = sanitationerror.format(w=observedword)
		# FIX: the early return used to skip connectioncleanup() and leak the connection
		dbconnection.connectioncleanup()
		return json.dumps(returndict)

	isgreek = True
	if re.search(r'[a-z]', cleanedword[0]):
		cleanedword = stripaccents(cleanedword)
		isgreek = False

	morphologyobject = lookformorphologymatches(cleanedword, dbcursor)
	# print('findbyform() mm',morphologyobject.getpossible()[0].transandanal)
	# φέρεται --> morphologymatches [('<possibility_1>', '1', 'φέρω', '122883104', '<transl>fero</transl><analysis>pres ind mp 3rd sg</analysis>')]

	if morphologyobject:
		oo = multipleWordOutputObject(cleanedword, morphologyobject, authorid)
		returndict['newhtml'] = oo.generateoutput()
	else:
		newhtml = list()
		if isgreek and not session['available']['greek_morphology']:
			newhtml.append(dberror.format(lang='Greek'))
		elif not isgreek and not session['available']['latin_morphology']:
			newhtml.append(dberror.format(lang='Latin'))
		else:
			newhtml.append(notfounderror.format(cw=cleanedword))

		# FIX: the old logic appended the fallback lookup result even when it was
		# None AND appended the no-data error even when the fallback succeeded;
		# the resulting '\n'.join(None) TypeError then threw away every message
		# and reported '[nothing found]'. Try the cleaned word first, then the
		# gravity-retaining spelling, and report exactly one outcome.
		prev = getobservedwordprevalencedata(cleanedword)
		if not prev:
			prev = getobservedwordprevalencedata(retainedgravity)
		if prev:
			newhtml.append(prev)
		else:
			newhtml.append(nodataerror.format(w=retainedgravity))

		returndict['newhtml'] = '\n'.join(newhtml)

	returndict['newjs'] = '\n'.join([dictionaryentryjs(), insertlexicalbrowserjs()])

	jsondict = json.dumps(returndict)

	dbconnection.connectioncleanup()

	return jsondict