def threejsgraphofvectors(sentencetuples, workssearched, so, vectorspace):
    """
    Package a three-dimensional reduction of a vector space as a JSON string.

    sentencetuples and workssearched are never read: they are accepted only so
    that this function takes the same arguments as the parallel graphing
    functions.

    see https://github.com/tobydoig/3dword2vec

    :param sentencetuples: unused
    :param workssearched: unused
    :param so: the search object; passed to SearchOutputObject() and reducetothreedimensions()
    :param vectorspace: passed to reducetothreedimensions()
    :return: json.dumps() of output.generateoutput()
    """
    payload = SearchOutputObject(so)
    reduced = reducetothreedimensions(so, vectorspace)

    # the text fields are blanked first; 'found' is then given the page skeleton
    payload.found = ''
    payload.htmlsearch = ''
    payload.found = trheedimensionalhtml()
    payload.js = threedimensionaljs(reduced)

    return json.dumps(payload.generateoutput())
def lsiformatoutput(findshtml: str, workssearched: int, matches: list, searchobject: SearchObject) -> str:
    """
    Wrap pre-rendered result html in a SearchOutputObject and serialize it.

    The poll attached to the search object is deactivated just before the
    JSON is generated.

    :param findshtml: stored unmodified as the 'found' field
    :param workssearched: passed to output.setscope()
    :param matches: only its length is used (for the result count text)
    :param searchobject: supplies .poll, .seeking and .getelapsedtime()
    :return: json.dumps() of output.generateoutput()
    """
    poll = searchobject.poll

    result = SearchOutputObject(searchobject)
    result.found = findshtml
    result.js = insertbrowserclickjs('browser')
    result.setscope(workssearched)

    titletemplate = 'Sentences that are reminiscent of »{skg}«'
    htmltemplate = 'sentences that are reminiscent of <span class="sought">»{skg}«</span>'
    counttemplate = '{n} sentences above the cutoff'

    result.title = titletemplate.format(skg=searchobject.seeking)
    # the plain-text description of the search is simply the title
    result.thesearch = result.title
    result.htmlsearch = htmltemplate.format(skg=searchobject.seeking)
    result.resultcount = counttemplate.format(n=len(matches))
    result.searchtime = searchobject.getelapsedtime()

    poll.deactivate()

    return json.dumps(result.generateoutput())
def dispatchvectorsearch(vectortype: str, searchid: str, one=None, two=None, three=None) -> JSON_STR:
    """
    dispatcher for "/vectors/..." requests

    Looks vectortype up in a table of known vector functions, checks that the
    matching configuration flag is on, builds a one- or three-lemma search
    object, attaches a fresh ProgressPoll to it and hands it to either
    externalvectors() or pythonvectors().

    :param vectortype: name of the requested vector function; a name that is
        not in the table is answered the same way as a disabled one
    :param searchid: passed through validatepollid() to obtain the poll id
    :param one: first term; passed through depunct()
    :param two: second term (only used by three-term queries); passed through depunct()
    :param three: third term (only used by three-term queries); passed through depunct()
    :return: whatever the vector worker returns, or a short message when the
        feature or the specific search type is not enabled
    """
    if not hipparchia.config['SEMANTICVECTORSENABLED']:
        so = SearchObject(str(), str(), str(), str(), str(), session)
        oo = SearchOutputObject(so)
        target = 'searchsummary'
        message = '[semantic vectors have not been enabled]'
        return oo.generatenulloutput(itemname=target, itemval=message)

    pollid = validatepollid(searchid)
    one = depunct(one)
    two = depunct(two)
    three = depunct(three)

    # the argument lists for the two kinds of search object builders
    simple = [pollid, one]
    triple = [pollid, one, two, three]

    knownfunctions = {
        'nearestneighborsquery': {
            'bso': simple,
            'pref': 'CONCEPTMAPPINGENABLED'
        },
        'analogies': {
            'bso': triple,
            'pref': 'VECTORANALOGIESENABLED'
        },
        'topicmodel': {
            'bso': simple,
            'pref': 'TOPICMODELINGENABLED'
        },
        'vectortestfunction': {
            'bso': simple,
            'pref': 'TESTINGVECTORBUTTONENABLED'
        },
        'unused': {
            'fnc': lambda: str(),
            'bso': None,
            'pref': None
        },
    }

    # vectortype is caller-supplied: indexing the table directly raised a
    # KeyError for an unrecognised name; treat that like a disabled search
    requested = knownfunctions.get(vectortype)
    pref = requested['pref'] if requested else None

    if not pref or not hipparchia.config[pref]:
        return json.dumps('this type of search has not been enabled')

    bso = requested['bso']

    so = None
    if len(bso) == 4:
        so = buildtriplelemmasearchobject(*bso)
    elif len(bso) == 2:
        so = buildsinglelemmasearchobject(*bso)

    so.vectorquerytype = vectortype

    progresspolldict[pollid] = ProgressPoll(pollid)
    so.poll = progresspolldict[pollid]
    so.poll.activate()
    so.poll.statusis('Preparing to vectorize')

    if hipparchia.config['EXTERNALVECTORHELPER']:
        j = externalvectors(so)
    else:
        j = pythonvectors(so)

    if hipparchia.config['JSONDEBUGMODE']:
        print('/vectors/{f}\n\t{j}'.format(f=vectortype, j=j))

    # best effort: the poll attribute may already be gone by now
    try:
        del so.poll
    except AttributeError:
        pass

    return j
def ldatopicsgenerateoutput(ldavishtmlandjs: str, searchobject: SearchObject):
    """
    Split pyLDAvis output into html and js, point it at local assets, and
    return the whole thing as a JSON string.

    pyLDAvis.prepared_data_to_html() outputs something that is almost pure JS and looks like this:

        <link rel="stylesheet" type="text/css" href="https://cdn.rawgit.com/bmabey/pyLDAvis/files/ldavis.v1.0.0.css">

        <div id="ldavis_el7428760626948328485476648"></div>
        <script type="text/javascript">

        var ldavis_el7428760626948328485476648_data = {"mdsDat": ...

        }
        </script>

    the settings read from searchobject.vectorvalues have this shape:

        instance = {
            'maxfeatures': 2000,
            'components': 15,  # topics
            'maxfreq': .75,  # fewer than n% of sentences should have this word (i.e., purge common words)
            'minfreq': 5,  # word must be found >n times
            'iterations': 12,
            'mustbelongerthan': 3
        }

    :param ldavishtmlandjs: the combined html + script text to be split
    :param searchobject: supplies .poll, .searchlist, .vectorvalues,
        .numberofauthorssearched() and .getelapsedtime()
    :return: json.dumps() of output.generateoutput()
    """
    so = searchobject
    activepoll = so.poll
    output = SearchOutputObject(so)

    workssearched = len(so.searchlist)

    vv = searchobject.vectorvalues
    settings = {
        'maxfeatures': vv.ldamaxfeatures,
        'components': vv.ldacomponents,  # topics
        'maxfreq': vv.ldamaxfreq,  # fewer than n% of sentences should have this word (i.e., purge common words)
        'minfreq': vv.ldaminfreq,  # word must be found >n times
        'iterations': vv.ldaiterations,
        'mustbelongerthan': vv.ldamustbelongerthan
    }

    scripttag = '<script type="text/javascript">'

    # drop empty lines and strip tabs; reversed so that pop() yields the lines in document order
    lines = [l.replace('\t', str()) for l in ldavishtmlandjs.split('\n') if l]
    lines.reverse()

    # everything before the opening script tag is html; the tag itself is discarded
    thisline = str()
    html = list()
    while scripttag not in thisline:
        html.append(thisline)
        try:
            thisline = lines.pop()
        except IndexError:
            # oops, we never found the script...
            thisline = scripttag

    # we cut '<script>'; now drop '</script>'
    lines.reverse()
    js = lines[:-1]

    findshtml = '\n'.join(html)
    findsjs = '\n'.join(js)

    # brittle: ldavis might change its URLs between versions, etc.
    # should probably make this conditional upon the presence of the file locally...
    # the URLs are literal text, so plain replacement is used instead of a regex
    # whose unescaped dots would match any character
    ldacssurl = 'https://cdn.rawgit.com/bmabey/pyLDAvis/files/ldavis.v1.0.0.css'
    ldacsslocal = '/css/ldavis.css'
    findshtml = findshtml.replace(ldacssurl, ldacsslocal)

    ldajsurl = 'https://cdn.rawgit.com/bmabey/pyLDAvis/files/ldavis.v1.0.0.js'
    ldajslocal = '/static/jsforldavis.js'
    findsjs = findsjs.replace(ldajsurl, ldajslocal)

    # NB: the d3 CDN url is deliberately left alone: swapping it for a local
    # copy was noted to break the reloaded figure

    authorcount = searchobject.numberofauthorssearched()

    who = str()
    where = '{n} authors'.format(n=authorcount)

    if authorcount == 1:
        # the first six characters of a searchlist item are the key into authordict
        a = authordict[searchobject.searchlist[0][:6]]
        who = a.akaname
        where = who

    if workssearched == 1:
        try:
            w = workdict[searchobject.searchlist[0]].title
        except KeyError:
            w = str()
        where = '{a}, <worktitle>{w}</worktitle>'.format(a=who, w=w)

    output.title = 'Latent Dirichlet Allocation'
    output.found = findshtml
    output.js = findsjs

    output.setscope(workssearched)
    output.sortby = 'weight'
    output.thesearch = 'thesearch'
    output.resultcount = 'the following topics'
    output.htmlsearch = '{n} topics in {w}'.format(n=settings['components'], w=where)

    output.searchtime = so.getelapsedtime()
    activepoll.deactivate()

    jsonoutput = json.dumps(output.generateoutput())

    return jsonoutput
def analogiesgenerateoutput(searchobject, findstuples: list) -> JSON_STR:
    """
    Render analogy results as an html table and return them as a JSON string.

    findstuples: [(word1, value1), (word2, value2), ...]

    @htmlcommentdecorator here will actually kill the json...

    :param searchobject: supplies .session, .poll, .getelapsedtime() and either
        the three lemma objects or the three raw terms for the title
    :param findstuples: one table row is emitted per tuple: item 0 is the
        word, item 1 the value
    :return: json.dumps() of output.generateoutput()
    """
    so = searchobject
    output = SearchOutputObject(so)

    meth = so.session['baggingmethod']

    # the title shows dictionary entries unless the bags were unlemmatized
    if meth != 'unlemmatized':
        a = so.lemmaone.dictionaryentry
        b = so.lemmatwo.dictionaryentry
        c = so.lemmathree.dictionaryentry
    else:
        a = so.seeking
        b = so.proximate
        c = so.termthree

    # the final tag used to be a second '<table>': the table was never closed
    tabletemplate = """ <table class="vectortable outline"> {thdr} {rows} </table> """

    thdrtemplate = """ <tr> <th>{a}</th> <th>{b}</th> <th>{c}</th> </tr> """

    rowtemplate = """ <tr> <td>{wrd}</td> <td></td> <td>{val}</td> </tr> """

    thdr = thdrtemplate.format(a='Bagging method:', b=meth, c=str())

    therows = '\n'.join(rowtemplate.format(wrd=t[0], val=t[1]) for t in findstuples)

    output.found = tabletemplate.format(thdr=thdr, rows=therows)

    activepoll = so.poll
    output.title = '{a} : {b} :: {c} : ???'.format(a=a, b=b, c=c)

    output.searchtime = so.getelapsedtime()
    activepoll.deactivate()

    jsonoutput = json.dumps(output.generateoutput())

    return jsonoutput
def nearestneighborgenerateoutput(findshtml: str, mostsimilar: list, imagename: str, workssearched: int, searchobject: SearchObject) -> str:
    """
    Assemble the JSON reply for a nearest-neighbors query.

    The search counts as lemmatized when searchobject.lemma has a
    dictionaryentry; otherwise searchobject.seeking is used as the headword.

    :param findshtml: stored unmodified as the 'found' field
    :param mostsimilar: only its length is used; a TypeError raised while
        setting the result count (e.g. when this is None) is ignored
    :param imagename: stored as the 'image' field
    :param workssearched: passed to output.setscope()
    :param searchobject: supplies .poll, .lemma, .proximatelemma, .seeking and .getelapsedtime()
    :return: json.dumps() of output.generateoutput()
    """
    poll = searchobject.poll
    result = SearchOutputObject(searchobject)
    result.image = imagename

    vectorjs = generatevectorjs()

    try:
        headword = searchobject.lemma.dictionaryentry
    except AttributeError:
        # AttributeError: 'NoneType' object has no attribute 'dictionaryentry'
        headword = searchobject.seeking
        lemmatized = False
    else:
        lemmatized = True

    try:
        proximate = searchobject.proximatelemma.dictionaryentry
    except AttributeError:
        # proximatelemma is None
        proximate = None

    if not lemmatized:
        prefix = ''
        htmldescription = '<span class="sought">»{skg}«</span>'.format(skg=headword)
    else:
        prefix = 'all forms of '
        htmldescription = 'all {n} known forms of <span class="sought">»{skg}«</span>'.format(
            n=len(searchobject.lemma.formlist), skg=headword)

    # 'pr' is handed to format() although the template has no slot for it
    result.title = 'Neighbors for {es}»{skg}«'.format(skg=headword, pr=proximate, es=prefix)
    result.found = findshtml
    result.js = vectorjs

    try:
        result.setresultcount(len(mostsimilar), 'proximate terms to graph')
    except TypeError:
        pass

    result.setscope(workssearched)
    result.thesearch = '{es}»{skg}«'.format(skg=headword, es=prefix)
    result.htmlsearch = htmldescription
    result.sortby = 'proximity'
    result.image = imagename
    result.searchtime = searchobject.getelapsedtime()

    # serialize first, then shut the poll down
    serialized = json.dumps(result.generateoutput())

    poll.deactivate()
    if isinstance(poll, RedisProgressPoll):
        poll.deleteredispoll()

    return serialized
def executesearch(searchid: str, so=None, req=request) -> JSON_STR:
    """
    the interface to all of the other search functions

    tell me what you are looking for and i'll try to find it

    the results are returned in a json bundle that will be used to update the html on the page

    note that cosdistbysentence vector queries also flow through here: they need a hitdict

    overview:
        buildsearchobject() and then start modifying elements of the SearchObject

        build a search list via compilesearchlist()
            modify search list via flagexclusions()
            modify search list via calculatewholeauthorsearches()
        build search list restrictions via indexrestrictions()

        search via searchdispatcher()

        format results via buildresultobjects()

    NOTE(review): the overview above is the original author's; in the body
    below the search itself is run by precomposedsqlsearch() and the search
    list is adjusted via updatesearchlistandsearchobject() - confirm whether
    the overview is still current.

    :param searchid: passed through validatepollid(); the result keys progresspolldict
    :param so: an already-built search object; when falsy a new one is built
        from req and the session
    :param req: only used when a search object has to be built
    :return: json.dumps() of output.generateoutput(), or the return value of
        findabsolutevectorsfromhits() for 'cosdistbylineorword' vector queries
    """

    pollid = validatepollid(searchid)

    if not so:
        # there is a so if singlewordsearch() sent you here
        probeforsessionvariables()
        so = buildsearchobject(pollid, req, session)

    frozensession = so.session

    # register a fresh poll under this id and attach it to the search object
    progresspolldict[pollid] = ProgressPoll(pollid)
    so.poll = progresspolldict[pollid]

    so.poll.activate()
    so.poll.statusis('Preparing to search')

    # stays True unless a non-empty search list is compiled below
    nosearch = True
    output = SearchOutputObject(so)

    allcorpora = [
        'greekcorpus', 'latincorpus', 'papyruscorpus', 'inscriptioncorpus',
        'christiancorpus'
    ]
    # the corpora whose session flag is truthy
    activecorpora = [c for c in allcorpora if frozensession[c]]

    # only compile a search list if there is something to look for and somewhere to look
    if (len(so.seeking) > 0 or so.lemma or frozensession['tensorflowgraph']
            or frozensession['topicmodel']) and activecorpora:
        so.poll.statusis('Compiling the list of works to search')
        so.searchlist = compilesearchlist(listmapper, frozensession)

    if so.searchlist:
        # do this before updatesearchlistandsearchobject() which collapses items and cuts your total
        workssearched = len(so.searchlist)

        # calculatewholeauthorsearches() + configurewhereclausedata()
        so = updatesearchlistandsearchobject(so)

        nosearch = False
        skg = None
        prx = None

        # character class used to decide whether a search term contains greek
        isgreek = re.compile(
            '[α-ωϲἀἁἂἃἄἅἆἇᾀᾁᾂᾃᾄᾅᾆᾇᾲᾳᾴᾶᾷᾰᾱὰάἐἑἒἓἔἕὲέἰἱἲἳἴἵἶἷὶίῐῑῒΐῖῗὀὁὂὃὄὅόὸὐὑὒὓὔὕὖὗϋῠῡῢΰῦῧύὺᾐᾑᾒᾓᾔᾕᾖᾗῂῃῄῆῇἤἢἥἣὴήἠἡἦἧὠὡὢὣὤὥὦὧᾠᾡᾢᾣᾤᾥᾦᾧῲῳῴῶῷώὼ]'
        )

        # NOTE(review): the guard tests so.lemmaone but the form list is read
        # from so.lemma - presumably these refer to the same object; confirm
        if so.lemmaone:
            so.termone = wordlistintoregex(so.lemma.formlist)
            skg = so.termone
            if re.search(isgreek, skg):
                # 'v' is a problem because the lemmata list is going to send 'u'
                # but the greek lemmata are accented
                so.usecolumn = 'accented_line'

        if so.lemmatwo:
            so.termtwo = wordlistintoregex(so.lemmatwo.formlist)
            prx = so.termtwo
            if re.search(isgreek, prx):
                so.usecolumn = 'accented_line'

        so.setsearchtype()
        thesearch = so.generatesearchdescription()
        htmlsearch = so.generatehtmlsearchdescription()

        # now that the SearchObject is built, do the search...
        hits = precomposedsqlsearch(so)
        so.poll.statusis('Putting the results in context')

        # hits is List[dbWorkLine]
        hitdict = sortresultslist(hits, so, authordict, workdict)

        if so.vectorquerytype == 'cosdistbylineorword':
            # print('executesearch(): h - cosdistbylineorword')
            # take these hits and head on over to the vector worker
            output = findabsolutevectorsfromhits(so, hitdict, workssearched)
            # early exit: the poll is unregistered here because the common exit below is skipped
            del progresspolldict[pollid]
            return output

        resultlist = buildresultobjects(hitdict, authordict, workdict, so)

        so.poll.statusis('Converting results to HTML')

        # rewriteskgandprx() hands back replacement values for all three items
        sandp = rewriteskgandprx(skg, prx, htmlsearch, so)
        skg = sandp['skg']
        prx = sandp['prx']
        htmlsearch = sandp['html']

        for r in resultlist:
            r.lineobjects = flagsearchterms(r, skg, prx, so)

        # a context of zero gets the no-context formatter
        if so.context > 0:
            findshtml = htmlifysearchfinds(resultlist, so)
        else:
            findshtml = nocontexthtmlifysearchfinds(resultlist)

        if hipparchia.config['INSISTUPONSTANDARDANGLEBRACKETS']:
            findshtml = gtltsubstitutes(findshtml)

        findsjs = insertbrowserclickjs('browser')

        resultcount = len(resultlist)

        # hitmax flags that the number of results reached the cap
        if resultcount < so.cap:
            hitmax = False
        else:
            hitmax = True

        output.title = thesearch
        output.found = findshtml
        output.js = findsjs
        output.setresultcount(resultcount, 'passages')
        output.setscope(workssearched)
        output.searchtime = so.getelapsedtime()
        output.thesearch = thesearch
        output.htmlsearch = htmlsearch
        output.hitmax = hitmax

    if nosearch:
        # collect every applicable reason why nothing was searched
        if not activecorpora:
            output.reasons.append('there are no active databases')
        if len(so.seeking) == 0:
            output.reasons.append('there is no search term')
        if len(so.seeking) > 0 and len(so.searchlist) == 0:
            output.reasons.append('zero works match the search criteria')

        output.title = '(empty query)'
        output.setresultcount(0, 'passages')
        output.explainemptysearch()

    so.poll.deactivate()
    jsonoutput = json.dumps(output.generateoutput())

    # the poll is no longer needed: unregister it
    del progresspolldict[pollid]

    return jsonoutput
def generateabsolutevectorsoutput(listsofwords: list, workssearched: list, searchobject, vtype: str):
    """
    Build a rudimentary vector space from the words supplied, measure the
    cosine distance of every headword to the focus term, and return the
    terms that fall inside the cutoff as a JSON string.

    :param listsofwords: the word lists to vectorize; its length is also
        reported as the size of the space that was searched
    :param workssearched: passed to output.setscope()
    :param searchobject: supplies .vectorvalues, .poll, .lemma, .seeking,
        .session and .getelapsedtime()
    :param vtype: label for the items in listsofwords; only used in the result count text
    :return: json.dumps() of output.generateoutput()
    """
    so = searchobject
    vv = so.vectorvalues
    activepoll = so.poll

    # find all words in use
    allwords = findsetofallwords(listsofwords)

    # find all possible forms of all the words we used
    # consider subtracting some set like: rarewordsthatpretendtobecommon = {}
    activepoll.statusis('Finding headwords')
    morphdict = getrequiredmorphobjects(allwords, furtherdeabbreviate=True)
    morphdict = convertmophdicttodict(morphdict)

    # find all possible headwords of all of the forms in use
    # note that we will not know what we did not know: count unparsed words too and deliver that as info at the end?
    # a headword shared by several forms ends up mapped to the last form seen
    allheadwords = {h: m for m in morphdict for h in morphdict[m]}

    if so.lemma:
        # set to none for now
        subtractterm = None
    else:
        subtractterm = so.seeking

    activepoll.statusis('Building vectors')
    vectorspace = buildrudimentaryvectorspace(allheadwords, morphdict, listsofwords, subtractterm=subtractterm)

    if so.lemma:
        focus = so.lemma.dictionaryentry
    else:
        focus = so.seeking

    activepoll.statusis('Calculating cosine distances')
    cosinevalues = caclulatecosinevalues(focus, vectorspace, allheadwords.keys())

    # apply the threshold and drop the 'None' items
    # what is stored is 1 - value, and the list is sorted on that in descending order
    threshold = 1.0 - vv.localcutoffdistance
    falseidentity = .02
    cosinevalues = {
        c: 1 - cosinevalues[c]
        for c in cosinevalues
        if cosinevalues[c] and falseidentity < cosinevalues[c] < threshold
    }
    mostsimilar = sorted(cosinevalues.items(), key=lambda t: t[1], reverse=True)

    findshtml = formatnnmatches(mostsimilar, vv)

    # next we look for the interrelationships of the words that are above the threshold
    activepoll.statusis('Calculating metacosine distances')
    imagename = graphbliteraldistancematches(focus, mostsimilar, so)

    findsjs = generatevectorjs()

    output = SearchOutputObject(so)

    output.title = 'Cosine distances to »{skg}«'.format(skg=focus)
    output.found = findshtml
    output.js = findsjs

    if not so.session['cosdistbylineorword']:
        space = 'related terms in {s} {t}'.format(s=len(listsofwords), t=vtype)
    else:
        dist = so.session['proximity']
        scale = {'words': 'word', 'lines': 'line'}
        plural = 's' if int(dist) > 1 else str()
        space = 'related terms within {a} {b}{s}'.format(a=dist, b=scale[so.session['searchscope']], s=plural)

    # this was max(), which reported at least neighborscap results even when
    # fewer terms survived the cutoff; the count can never exceed what was found
    # NOTE(review): assumes the display is capped at vv.neighborscap - confirm
    found = min(vv.neighborscap, len(cosinevalues))
    output.setresultcount(found, space)
    output.setscope(workssearched)

    if so.lemma:
        xtra = 'all forms of '
    else:
        xtra = str()

    output.thesearch = '{x}»{skg}«'.format(x=xtra, skg=focus)
    output.htmlsearch = '{x}<span class="sought">»{skg}«</span>'.format(x=xtra, skg=focus)

    output.sortby = 'distance with a cutoff of {c}'.format(c=vv.localcutoffdistance)
    output.image = imagename
    output.searchtime = so.getelapsedtime()

    activepoll.deactivate()

    jsonoutput = json.dumps(output.generateoutput())

    return jsonoutput
def tsnegraphofvectors(sentencetuples, workssearched, so, vectorspace):
    """
    Plot a two-dimensional reduction of the vector space, store the image,
    and return a JSON string that refers to it.

    lifted from https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#sphx-glr-auto-examples-tutorials-run-word2vec-py

    sentencetuples and workssearched are never read: they are accepted only so
    that this function takes the same arguments as the parallel graphing
    functions.

    :param sentencetuples: unused
    :param workssearched: unused
    :param so: the search object; passed to SearchOutputObject()
    :param vectorspace: passed to reducetotwodimensions()
    :return: json.dumps() of output.generateoutput()
    """
    plotdict = reducetotwodimensions(vectorspace)
    xvalues = plotdict['xvalues']
    yvalues = plotdict['yvalues']
    labels = plotdict['labels']

    plt.figure(figsize=(12, 12))
    # https://jonasjacek.github.io/colors/
    plt.scatter(xvalues, yvalues, color='#c6c6c6')

    # Label up to 25 randomly subsampled data points: random.sample() raises
    # ValueError if asked for more items than there are, so clamp the sample
    # size for small vector spaces
    indices = list(range(len(labels)))
    selected_indices = random.sample(indices, min(25, len(indices)))
    for i in selected_indices:
        plt.annotate(labels[i], (xvalues[i], yvalues[i]))

    # render into memory, then release the figure
    graphobject = BytesIO()
    plt.savefig(graphobject)
    plt.clf()
    plt.close()

    graphobject = graphobject.getvalue()

    imagename = storevectorgraph(graphobject)

    output = SearchOutputObject(so)
    output.image = imagename

    findsjs = generatevectorjs()

    htmltemplate = """ <p id="imagearea"></p> """

    output.htmlsearch = str()
    output.found = htmltemplate
    output.js = findsjs

    jsonoutput = json.dumps(output.generateoutput())

    return jsonoutput