def updatesearchlistandsearchobject(so: SearchObject) -> SearchObject:
    """
    Finish preparing a search object whose searchlist is already in place.

    This lives in its own function so that golangvectors() can call it too.

    :param so: the search object; its searchlist, usedcorpora and
        indexrestrictions attributes are overwritten
    :return: the same search object that was passed in
    """
    poll = so.poll

    # works with passage exclusions attached get a different marker:
    # gr0001x001 instead of gr0001w001 if part of w001 is being skipped
    flagged = flagexclusions(so.searchlist, so.session)
    so.searchlist = flagged

    poll.statusis('Calculating full authors to search')
    collapsed = calculatewholeauthorsearches(so.searchlist, authordict)
    so.searchlist = collapsed
    so.usedcorpora = so.wholecorporasearched()

    poll.statusis('Configuring the search restrictions')
    restrictions = configurewhereclausedata(so.searchlist, workdict, so)
    so.indexrestrictions = restrictions

    return so
def gensimexperiment(so):
    """
    Experimental: build a log-entropy vector space from the sentences of the
    works selected in the session.

    The search list is compiled first and only then passed through
    flagexclusions(); doing it in the opposite order reads the local name
    before it is bound and raises UnboundLocalError.

    :param so: the search object; so.searchlist is overwritten and so.poll
        receives status messages
    :return: the object returned by logentropybuildspace()
    """

    activepoll = so.poll

    # compile the list before anything tries to read it
    searchlist = compilesearchlist(listmapper, so.session)
    searchlist = flagexclusions(searchlist, so.session)
    searchlist = calculatewholeauthorsearches(searchlist, authordict)
    so.searchlist = searchlist

    # NOTE(review): the sibling search functions also set so.indexrestrictions
    # via configurewhereclausedata() before calling vectorprepdispatcher();
    # confirm whether that is required here as well
    sentencetuples = vectorprepdispatcher(so)

    # find all words in use: item 1 of each tuple is what gets vectorized
    listsofwords = [s[1] for s in sentencetuples]
    allwords = findsetofallwords(listsofwords)

    # find all possible forms of all the words we used
    # consider subtracting some set like: rarewordsthatpretendtobecommon = {}
    wl = '{:,}'.format(len(listsofwords))
    activepoll.statusis('Finding headwords for {n} sentences'.format(n=wl))

    morphdict = getrequiredmorphobjects(allwords)
    morphdict = convertmophdicttodict(morphdict)

    # find all possible headwords of all of the forms in use
    # note that we will not know what we did not know: count unparsed words
    # too and deliver that as info at the end?
    # (this mapping is not consumed by anything below yet)
    allheadwords = {h: m for m in morphdict for h in morphdict[m]}

    vectorspace = logentropybuildspace(so, morphdict, listsofwords)

    # the previous 'return output' named a variable that was never assigned;
    # the vector space is the only thing this function produces
    return vectorspace
def tensorgraphelectedworks(searchobject):
    """
    Collect every sentence in the works selected in the session and hand them
    to a tensorflow training function.

    adapted from https://raw.githubusercontent.com/tensorflow/tensorflow/r1.5/tensorflow/examples/tutorials/word2vec/word2vec_basic.py

    :param searchobject: the search object; its usecolumn, searchlist,
        indexrestrictions and seeking attributes are overwritten
    :return: whatever the training function returns, or the result of
        emptyvectoroutput() when no corpus is active, the summed word count
        exceeds MAXVECTORSPACE, or the search list is empty
    """

    so = searchobject
    activepoll = so.poll

    # manual switch: the last assignment decides which function actually runs
    tffunctiontocall = tftrainondata
    tffunctiontocall = tfnlptraining

    activepoll.statusis('Preparing to search')

    so.usecolumn = 'marked_up_line'

    allcorpora = ['greekcorpus', 'latincorpus', 'papyruscorpus', 'inscriptioncorpus', 'christiancorpus']
    activecorpora = [c for c in allcorpora if so.session[c]]

    if not activecorpora:
        reasons = ['search list contained zero items']
        return emptyvectoroutput(so, reasons)

    activepoll.statusis('Compiling the list of works to search')
    searchlist = compilesearchlist(listmapper, so.session)

    # make sure you don't go nuts
    maxwords = hipparchia.config['MAXVECTORSPACE']
    wordstotal = 0
    for work in searchlist:
        # workdict is keyed by the first ten characters of the searchlist entry
        work = work[:10]
        try:
            wordstotal += workdict[work].wordcount
        except TypeError:
            # TypeError: unsupported operand type(s) for +=: 'int' and 'NoneType'
            # i.e. the work has no recorded wordcount; it is left out of the total
            pass

    if wordstotal > maxwords:
        # locale.format() was removed in python 3.12; format_string() gives the
        # same result for a single '%d' specifier
        reasons = ['the vector scope max exceeded: {a} > {b} '.format(
            a=locale.format_string('%d', wordstotal, grouping=True),
            b=locale.format_string('%d', maxwords, grouping=True))]
        return emptyvectoroutput(so, reasons)

    if not searchlist:
        return emptyvectoroutput(so)

    searchlist = flagexclusions(searchlist, so.session)
    searchlist = calculatewholeauthorsearches(searchlist, authordict)
    so.searchlist = searchlist

    indexrestrictions = configurewhereclausedata(searchlist, workdict, so)
    so.indexrestrictions = indexrestrictions

    # find all sentences
    activepoll.statusis('Finding all sentences')
    so.seeking = r'.'
    sentences = vectorprepdispatcher(so)

    return tffunctiontocall(sentences, searchobject)
def fortestingpurposessklearnselectedworks(searchobject):
    """
    Test harness: run one of several sklearn functions over every sentence in
    the works selected in the session.

    :param searchobject: the search object; its usecolumn, vectortype,
        searchlist, indexrestrictions and seeking attributes are overwritten
    :return: the html returned by ldatopicgraphing when that is the function
        under test; otherwise the result of generatesimilarsentenceoutput();
        or the result of emptyvectoroutput() when no corpus is active, a size
        limit is exceeded, or the search list is empty
    """

    # manual switch: the last assignment decides which function is tested
    skfunctiontotest = sklearntextfeatureextractionandevaluation
    skfunctiontotest = simplesktextcomparison
    skfunctiontotest = ldatopicmodeling
    skfunctiontotest = ldatopicgraphing

    so = searchobject
    activepoll = so.poll

    activepoll.statusis('Preparing to search')

    so.usecolumn = 'marked_up_line'
    so.vectortype = 'sentencesimilarity'

    allcorpora = ['greekcorpus', 'latincorpus', 'papyruscorpus', 'inscriptioncorpus', 'christiancorpus']
    activecorpora = [c for c in allcorpora if so.session[c]]

    if not activecorpora:
        reasons = ['search list contained zero items']
        return emptyvectoroutput(so, reasons)

    activepoll.statusis('Compiling the list of works to search')
    searchlist = compilesearchlist(listmapper, so.session)

    # make sure you don't go nuts
    maxwords = hipparchia.config['MAXVECTORSPACE']
    wordstotal = 0
    for work in searchlist:
        # workdict is keyed by the first ten characters of the searchlist entry
        work = work[:10]
        try:
            wordstotal += workdict[work].wordcount
        except TypeError:
            # TypeError: unsupported operand type(s) for +=: 'int' and 'NoneType'
            # i.e. the work has no recorded wordcount; it is left out of the total
            pass

    if wordstotal > maxwords:
        # locale.format() was removed in python 3.12; format_string() gives the
        # same result for a single '%d' specifier
        reasons = ['the vector scope max exceeded: {a} > {b} '.format(
            a=locale.format_string('%d', wordstotal, grouping=True),
            b=locale.format_string('%d', maxwords, grouping=True))]
        return emptyvectoroutput(so, reasons)

    if not searchlist:
        return emptyvectoroutput(so)

    searchlist = flagexclusions(searchlist, so.session)
    # counted after flagging but before whole-author collapsing
    workssearched = len(searchlist)
    searchlist = calculatewholeauthorsearches(searchlist, authordict)
    so.searchlist = searchlist

    indexrestrictions = configurewhereclausedata(searchlist, workdict, so)
    so.indexrestrictions = indexrestrictions

    # find all sentences
    activepoll.statusis('Finding all sentences')
    so.seeking = r'.'

    # if skfunctiontotest == ldatopicgraphing:
    #     so.sentencebundlesize = 2

    sentencetuples = vectorprepdispatcher(so)
    maxsentences = hipparchia.config['MAXSENTENCECOMPARISONSPACE']
    if len(sentencetuples) > maxsentences:
        reasons = ['scope of search exceeded allowed maximum: {a} > {b}'.format(
            a=len(sentencetuples), b=maxsentences)]
        return emptyvectoroutput(so, reasons)

    # NOTE(review): sklearnselectedworks() calls ldatopicgraphing() with three
    # arguments (sentencetuples, workssearched, so); confirm the two-argument
    # call below matches its signature
    similaritiesdict = skfunctiontotest(sentencetuples, so)

    if skfunctiontotest == ldatopicgraphing:
        # kludge for now: this is already html
        corehtml = similaritiesdict
        return corehtml

    # similaritiesdict: {id: (scoreA, lindobjectA1, sentA1, lindobjectA2, sentA2),
    #                    id2: (scoreB, lindobjectB1, sentB1, lindobjectB2, sentB2), ... }
    corehtml = skformatmostimilar(similaritiesdict)
    output = generatesimilarsentenceoutput(corehtml, so, workssearched, len(similaritiesdict))

    return output
def sklearnselectedworks(searchobject):
    """
    Produce the topic model graph for the works selected in the session.

    Every early exit goes through emptyvectoroutput(): missing software, no
    active corpus, too many words, an empty search list, or too many sentences.

    :param searchobject: the search object; its usecolumn, vectortype,
        searchlist, indexrestrictions and seeking attributes are overwritten
    :return: the result of ldatopicgraphing(), or of emptyvectoroutput()
    """

    if not (ldavis and CountVectorizer):
        return emptyvectoroutput(
            searchobject,
            ['requisite software not installed: sklearn and/or ldavis is unavailable'])

    so = searchobject
    poll = so.poll

    poll.statusis('Preparing to search')

    so.usecolumn = 'marked_up_line'
    so.vectortype = 'topicmodel'

    corpusflags = ('greekcorpus', 'latincorpus', 'papyruscorpus', 'inscriptioncorpus', 'christiancorpus')
    switchedon = [flag for flag in corpusflags if so.session[flag]]

    if not switchedon:
        return emptyvectoroutput(so, ['search list contained zero items'])

    poll.statusis('Compiling the list of works to search')
    works = compilesearchlist(listmapper, so.session)

    # refuse to build anything bigger than the configured word ceiling
    ceiling = hipparchia.config['MAXVECTORSPACE']
    tally = 0
    for entry in works:
        key = entry[:10]
        try:
            tally += workdict[key].wordcount
        except TypeError:
            # wordcount was None: 'int' += 'NoneType' is unsupported
            pass

    if tally > ceiling:
        toomany = 'the vector scope max exceeded: {a} > {b} '.format(
            a=locale.format_string('%d', tally, grouping=True),
            b=locale.format_string('%d', ceiling, grouping=True))
        return emptyvectoroutput(so, [toomany])

    if not works:
        return emptyvectoroutput(so)

    works = flagexclusions(works, so.session)
    workssearched = len(works)
    works = calculatewholeauthorsearches(works, authordict)
    so.searchlist = works
    so.indexrestrictions = configurewhereclausedata(works, workdict, so)

    # a bare '.' as the search term matches everything
    poll.statusis('Finding all sentences')
    so.seeking = r'.'

    sentencetuples = vectorprepdispatcher(so)
    found = len(sentencetuples)
    if found > hipparchia.config['MAXSENTENCECOMPARISONSPACE']:
        toobig = 'scope of search exceeded allowed maximum: {a} > {b}'.format(
            a=found, b=hipparchia.config['MAXSENTENCECOMPARISONSPACE'])
        return emptyvectoroutput(so, [toobig])

    return ldatopicgraphing(sentencetuples, workssearched, so)
def findabsolutevectorsbysentence(searchobject):
    """
    Use the searchlist to grab a collection of sentences, then pass them to
    generateabsolutevectorsoutput().

    Generators are tempting here, but generators + multiprocessing do not mix:
        TypeError: can't pickle generator objects

    This is not really a route at the moment; it is called by the search
    executor when the δ option is checked, which is why nothing is read from
    the request.

    :param searchobject: the search object; its usecolumn, searchlist and
        indexrestrictions attributes are overwritten
    :return: the result of generateabsolutevectorsoutput(), or of
        emptyvectoroutput() when the search cannot run
    """

    so = searchobject
    poll = so.poll

    try:
        lemma = lemmatadict[so.lemma.dictionaryentry]
    except (KeyError, AttributeError):
        # KeyError: the entry is not in lemmatadict
        # AttributeError: 'NoneType' object has no attribute 'dictionaryentry'
        lemma = None

    poll.statusis('Preparing to search')

    so.usecolumn = 'marked_up_line'

    corpusflags = ('greekcorpus', 'latincorpus', 'papyruscorpus', 'inscriptioncorpus', 'christiancorpus')
    switchedon = [flag for flag in corpusflags if so.session[flag]]

    if not (lemma or so.seeking) or not switchedon:
        return emptyvectoroutput(so, ['search list contained zero items'])

    poll.statusis('Compiling the list of works to search')
    works = compilesearchlist(listmapper, so.session)

    # refuse to build anything bigger than the configured word ceiling
    ceiling = hipparchia.config['MAXVECTORSPACE']
    tally = 0
    for entry in works:
        key = entry[:10]
        try:
            tally += workdict[key].wordcount
        except TypeError:
            # wordcount was None: 'int' += 'NoneType' is unsupported
            pass

    if tally > ceiling:
        toomany = 'the vector scope max exceeded: {a} > {b} '.format(
            a=locale.format_string('%d', tally, grouping=True),
            b=locale.format_string('%d', ceiling, grouping=True))
        return emptyvectoroutput(so, [toomany])

    if not works:
        return emptyvectoroutput(so)

    works = flagexclusions(works, so.session)
    workssearched = len(works)
    works = calculatewholeauthorsearches(works, authordict)
    so.searchlist = works
    so.indexrestrictions = configurewhereclausedata(works, workdict, so)

    poll.statusis('Finding all sentences')
    # only item 1 of each tuple is forwarded
    sentences = [pair[1] for pair in vectorprepdispatcher(so)]

    return generateabsolutevectorsoutput(sentences, workssearched, so, 'sentences')
def executegensimsearch(searchobject, outputfunction, indextype):
    """
    Use the searchlist to fetch a stored vector space, or failing that the
    sentences needed to build one, and hand the lot to outputfunction.

    :param searchobject: the search object; its usecolumn, searchlist and
        indexrestrictions attributes are overwritten, and seeking is
        overwritten when no stored vector space is found
    :param outputfunction: called as
        outputfunction(sentencetuples, workssearched, so, vectorspace)
    :param indextype: passed through to checkforstoredvector()
    :return: the result of outputfunction, or of emptyvectoroutput() when the
        search cannot run
    """

    so = searchobject
    poll = so.poll

    poll.statusis('Preparing to search')

    so.usecolumn = 'marked_up_line'

    activecorpora = so.getactivecorpora()

    if not activecorpora:
        return emptyvectoroutput(so, ['no active corpora'])

    # so.seeking should only be set via a fallback when session['baggingmethod'] == 'unlemmatized'
    if not (so.lemma or so.tovectorize or so.seeking):
        return emptyvectoroutput(so, ['there was no search term'])

    poll.statusis('Compiling the list of works to search')
    works = compilesearchlist(listmapper, so.session)

    # refuse to build anything bigger than the configured word ceiling
    ceiling = hipparchia.config['MAXVECTORSPACE']
    tally = 0
    for entry in works:
        key = entry[:10]
        try:
            tally += workdict[key].wordcount
        except TypeError:
            # wordcount was None: 'int' += 'NoneType' is unsupported
            pass

    if tally > ceiling:
        toomany = 'the vector scope max exceeded: {a} > {b} '.format(
            a=format(tally, ','), b=format(ceiling, ','))
        return emptyvectoroutput(so, [toomany])

    if not works:
        return emptyvectoroutput(so, ['search list contained zero items'])

    works = flagexclusions(works, so.session)
    workssearched = len(works)
    works = calculatewholeauthorsearches(works, authordict)
    so.searchlist = works
    so.indexrestrictions = configurewhereclausedata(works, workdict, so)

    # 'False' if there is no vectorspace; 'failed' if there can never be one; otherwise vectors
    vectorspace = checkforstoredvector(so, indextype)

    if not vectorspace and hipparchia.config['FORBIDUSERDEFINEDVECTORSPACES']:
        forbidden = 'you are only allowed to fetch pre-stored vector spaces; <b>try a single author or corpus search using the default vector values</b>'
        return emptyvectoroutput(so, [forbidden])

    if vectorspace:
        poll.statusis('Finding neighbors')
        sentencetuples = None
    else:
        poll.statusis('No stored model for this search. Finding all sentences')
        # blanking out the search term will return every last sentence...
        # otherwise you only return sentences with the search term in them (i.e. rudimentaryvectorsearch)
        so.seeking = r'.'
        sentencetuples = vectorprepdispatcher(so)

    return outputfunction(sentencetuples, workssearched, so, vectorspace)