示例#1
0
def updatesearchlistandsearchobject(so: SearchObject) -> SearchObject:
    """
    Finish configuring a search object from the searchlist it already carries.

    Kept as a separate step so that golangvectors() can call it too.

    :param so: search object; its searchlist, usedcorpora and indexrestrictions
        attributes are overwritten in place
    :return: the same search object that was passed in
    """

    # works with passage exclusions attached get re-tagged:
    # gr0001x001 rather than gr0001w001 when part of w001 is being skipped
    flagged = flagexclusions(so.searchlist, so.session)
    so.searchlist = flagged

    poll = so.poll
    poll.statusis('Calculating full authors to search')
    so.searchlist = calculatewholeauthorsearches(so.searchlist, authordict)
    so.usedcorpora = so.wholecorporasearched()

    poll.statusis('Configuring the search restrictions')
    restrictions = configurewhereclausedata(so.searchlist, workdict, so)
    so.indexrestrictions = restrictions

    return so
def gensimexperiment(so):
    """
    Experimental: build a log-entropy vector space from the sentences of the
    works selected in the current session.

    Steps: compile the search list, flag exclusions, collapse whole-author
    searches, fetch the sentences, look up the morphology of every word seen,
    then pass the lot to logentropybuildspace().

    NOTE(review): this function used to end with 'return output' although no
    'output' was ever assigned; the vector space is the only result computed
    here, so that is what is returned now. Confirm what callers expect.

    :param so: search object; so.poll and so.session are read, so.searchlist is overwritten
    :return: the value returned by logentropybuildspace()
    """

    activepoll = so.poll

    # the list must exist before exclusions can be flagged on it: the previous
    # ordering read 'searchlist' before it had been assigned
    searchlist = compilesearchlist(listmapper, so.session)
    searchlist = flagexclusions(searchlist, so.session)
    searchlist = calculatewholeauthorsearches(searchlist, authordict)
    so.searchlist = searchlist

    sentencetuples = vectorprepdispatcher(so)

    # find all words in use
    listsofwords = [s[1] for s in sentencetuples]
    allwords = findsetofallwords(listsofwords)

    # find all possible forms of all the words we used
    # consider subtracting some set like: rarewordsthatpretendtobecommon = {}
    wl = '{:,}'.format(len(listsofwords))
    activepoll.statusis('Finding headwords for {n} sentences'.format(n=wl))

    morphdict = getrequiredmorphobjects(allwords)
    morphdict = convertmophdicttodict(morphdict)

    # TODO: we will not know what we did not know; count unparsed words too
    # and deliver that as info at the end?

    vectorspace = logentropybuildspace(so, morphdict, listsofwords)

    return vectorspace
def tensorgraphelectedworks(searchobject):
    """
    Collect every sentence in the selected works and hand them to a tensorflow trainer.

    adapted from https://raw.githubusercontent.com/tensorflow/tensorflow/r1.5/tensorflow/examples/tutorials/word2vec/word2vec_basic.py

    Returns early via emptyvectoroutput() when no corpus is active in the
    session, when the total word count of the selection exceeds
    hipparchia.config['MAXVECTORSPACE'], or when the compiled search list is empty.

    :param searchobject: search object; its usecolumn, searchlist,
        indexrestrictions and seeking attributes are overwritten here
    :return: the trainer's return value, or the result of emptyvectoroutput()
    """
    so = searchobject
    activepoll = so.poll

    # trainer toggle: the last assignment wins
    tffunctiontocall = tftrainondata
    tffunctiontocall = tfnlptraining

    activepoll.statusis('Preparing to search')

    so.usecolumn = 'marked_up_line'

    allcorpora = [
        'greekcorpus', 'latincorpus', 'papyruscorpus', 'inscriptioncorpus',
        'christiancorpus'
    ]
    activecorpora = [c for c in allcorpora if so.session[c]]

    if not activecorpora:
        reasons = ['search list contained zero items']
        return emptyvectoroutput(so, reasons)

    activepoll.statusis('Compiling the list of works to search')
    searchlist = compilesearchlist(listmapper, so.session)

    # make sure you don't go nuts
    maxwords = hipparchia.config['MAXVECTORSPACE']
    wordstotal = 0
    for work in searchlist:
        # workdict is keyed on the first ten characters of a searchlist entry
        work = work[:10]
        try:
            wordstotal += workdict[work].wordcount
        except TypeError:
            # TypeError: unsupported operand type(s) for +=: 'int' and 'NoneType'
            pass

    if wordstotal > maxwords:
        # locale.format() was removed in Python 3.12; format_string() is the
        # drop-in replacement and yields the same grouped digits
        reasons = [
            'the vector scope max exceeded: {a} > {b} '.format(
                a=locale.format_string('%d', wordstotal, grouping=True),
                b=locale.format_string('%d', maxwords, grouping=True))
        ]
        return emptyvectoroutput(so, reasons)

    if len(searchlist) == 0:
        return emptyvectoroutput(so)

    searchlist = flagexclusions(searchlist, so.session)
    searchlist = calculatewholeauthorsearches(searchlist, authordict)
    so.searchlist = searchlist
    so.indexrestrictions = configurewhereclausedata(searchlist, workdict, so)

    # find all sentences: a search term of '.' matches everything
    activepoll.statusis('Finding all sentences')
    so.seeking = r'.'
    sentences = vectorprepdispatcher(so)

    return tffunctiontocall(sentences, searchobject)
示例#4
0
def fortestingpurposessklearnselectedworks(searchobject):
    """
    Test harness: run one of the sklearn experiments over every sentence in the selected works.

    Returns early via emptyvectoroutput() when no corpus is active in the
    session, when the total word count exceeds hipparchia.config['MAXVECTORSPACE'],
    when the compiled search list is empty, or when the number of sentences
    exceeds hipparchia.config['MAXSENTENCECOMPARISONSPACE'].

    :param searchobject: search object; its usecolumn, vectortype, searchlist,
        indexrestrictions and seeking attributes are overwritten here
    :return: the html from ldatopicgraphing when that is the selected experiment,
        otherwise the result of generatesimilarsentenceoutput(); or the result
        of emptyvectoroutput() on an early exit
    """

    # experiment toggle: the last assignment wins
    skfunctiontotest = sklearntextfeatureextractionandevaluation
    skfunctiontotest = simplesktextcomparison
    skfunctiontotest = ldatopicmodeling
    skfunctiontotest = ldatopicgraphing

    so = searchobject
    activepoll = so.poll

    activepoll.statusis('Preparing to search')

    so.usecolumn = 'marked_up_line'
    so.vectortype = 'sentencesimilarity'

    allcorpora = [
        'greekcorpus', 'latincorpus', 'papyruscorpus', 'inscriptioncorpus',
        'christiancorpus'
    ]
    activecorpora = [c for c in allcorpora if so.session[c]]

    if not activecorpora:
        reasons = ['search list contained zero items']
        return emptyvectoroutput(so, reasons)

    activepoll.statusis('Compiling the list of works to search')
    searchlist = compilesearchlist(listmapper, so.session)

    # make sure you don't go nuts
    maxwords = hipparchia.config['MAXVECTORSPACE']
    wordstotal = 0
    for work in searchlist:
        # workdict is keyed on the first ten characters of a searchlist entry
        work = work[:10]
        try:
            wordstotal += workdict[work].wordcount
        except TypeError:
            # TypeError: unsupported operand type(s) for +=: 'int' and 'NoneType'
            pass

    if wordstotal > maxwords:
        # locale.format() was removed in Python 3.12; format_string() is the
        # drop-in replacement and yields the same grouped digits
        reasons = [
            'the vector scope max exceeded: {a} > {b} '.format(
                a=locale.format_string('%d', wordstotal, grouping=True),
                b=locale.format_string('%d', maxwords, grouping=True))
        ]
        return emptyvectoroutput(so, reasons)

    if len(searchlist) == 0:
        return emptyvectoroutput(so)

    searchlist = flagexclusions(searchlist, so.session)
    workssearched = len(searchlist)
    searchlist = calculatewholeauthorsearches(searchlist, authordict)
    so.searchlist = searchlist
    so.indexrestrictions = configurewhereclausedata(searchlist, workdict, so)

    # find all sentences: a search term of '.' matches everything
    activepoll.statusis('Finding all sentences')
    so.seeking = r'.'

    # if skfunctiontotest == ldatopicgraphing:
    # 	so.sentencebundlesize = 2

    sentencetuples = vectorprepdispatcher(so)
    if len(sentencetuples) > hipparchia.config['MAXSENTENCECOMPARISONSPACE']:
        reasons = [
            'scope of search exceeded allowed maximum: {a} > {b}'.format(
                a=len(sentencetuples),
                b=hipparchia.config['MAXSENTENCECOMPARISONSPACE'])
        ]
        return emptyvectoroutput(so, reasons)

    similaritiesdict = skfunctiontotest(sentencetuples, so)

    if skfunctiontotest == ldatopicgraphing:
        # kludge for now: this is already html
        return similaritiesdict

    # similaritiesdict: {id: (scoreA, lindobjectA1, sentA1, lindobjectA2, sentA2), id2: (scoreB, lindobjectB1, sentB1, lindobjectB2, sentB2), ... }
    corehtml = skformatmostimilar(similaritiesdict)
    output = generatesimilarsentenceoutput(corehtml, so, workssearched,
                                           len(similaritiesdict))

    return output
示例#5
0
def sklearnselectedworks(searchobject):
    """
    Produce an LDA topic graph from every sentence in the selected works.

    Each of the following ends the call early with emptyvectoroutput():
    ldavis or CountVectorizer is falsy; no corpus is active in the session;
    the selection's word count exceeds hipparchia.config['MAXVECTORSPACE'];
    the compiled search list is empty; the sentence count exceeds
    hipparchia.config['MAXSENTENCECOMPARISONSPACE'].

    :param searchobject: search object; its usecolumn, vectortype, searchlist,
        indexrestrictions and seeking attributes are overwritten here
    :return: the result of ldatopicgraphing(), or of emptyvectoroutput()
    """

    if not (ldavis and CountVectorizer):
        missing = [
            'requisite software not installed: sklearn and/or ldavis is unavailable'
        ]
        return emptyvectoroutput(searchobject, missing)

    so = searchobject
    poll = so.poll
    poll.statusis('Preparing to search')

    so.usecolumn = 'marked_up_line'
    so.vectortype = 'topicmodel'

    corpusflags = ('greekcorpus', 'latincorpus', 'papyruscorpus',
                   'inscriptioncorpus', 'christiancorpus')
    switchedon = [flag for flag in corpusflags if so.session[flag]]
    if not switchedon:
        return emptyvectoroutput(so, ['search list contained zero items'])

    poll.statusis('Compiling the list of works to search')
    searchlist = compilesearchlist(listmapper, so.session)

    # keep the job to a sane size
    ceiling = hipparchia.config['MAXVECTORSPACE']
    tally = 0
    for item in searchlist:
        workid = item[:10]
        try:
            tally += workdict[workid].wordcount
        except TypeError:
            # the wordcount was None and cannot be added to an int
            continue

    if tally > ceiling:
        found = locale.format_string('%d', tally, grouping=True)
        allowed = locale.format_string('%d', ceiling, grouping=True)
        complaint = 'the vector scope max exceeded: {a} > {b} '.format(
            a=found, b=allowed)
        return emptyvectoroutput(so, [complaint])

    if len(searchlist) == 0:
        return emptyvectoroutput(so)

    searchlist = flagexclusions(searchlist, so.session)
    workssearched = len(searchlist)
    searchlist = calculatewholeauthorsearches(searchlist, authordict)
    so.searchlist = searchlist
    so.indexrestrictions = configurewhereclausedata(searchlist, workdict, so)

    # a search term of '.' pulls in every sentence
    poll.statusis('Finding all sentences')
    so.seeking = r'.'

    sentencetuples = vectorprepdispatcher(so)
    sentencecount = len(sentencetuples)
    limit = hipparchia.config['MAXSENTENCECOMPARISONSPACE']
    if sentencecount > limit:
        complaint = 'scope of search exceeded allowed maximum: {a} > {b}'.format(
            a=sentencecount, b=limit)
        return emptyvectoroutput(so, [complaint])

    return ldatopicgraphing(sentencetuples, workssearched, so)
示例#6
0
def findabsolutevectorsbysentence(searchobject):
    """
    Gather the sentences of the selected works and build the absolute-vector
    output from them.

    use the searchlist to grab a collection of sentences, then pass the bare
    sentences to generateabsolutevectorsoutput()

    generators are tempting, but dealing with generators+MP is a trick:

        TypeError: can't pickle generator objects

    Each of the following ends the call early with emptyvectoroutput():
    there is neither a known lemma nor so.seeking, or no corpus is active;
    the selection's word count exceeds hipparchia.config['MAXVECTORSPACE'];
    the compiled search list is empty.

    :param searchobject: search object; its usecolumn, searchlist and
        indexrestrictions attributes are overwritten here
    :return: the result of generateabsolutevectorsoutput(), or of emptyvectoroutput()
    """

    so = searchobject
    poll = so.poll

    # we are not really a route at the moment, but instead being called by execute search
    # when the δ option is checked; so the lemma comes from the search object
    # rather than from request.args

    try:
        lemma = lemmatadict[so.lemma.dictionaryentry]
    except (KeyError, AttributeError):
        # KeyError: the entry is not in lemmatadict
        # AttributeError: 'NoneType' object has no attribute 'dictionaryentry'
        lemma = None

    poll.statusis('Preparing to search')

    so.usecolumn = 'marked_up_line'

    corpusflags = ('greekcorpus', 'latincorpus', 'papyruscorpus',
                   'inscriptioncorpus', 'christiancorpus')
    switchedon = [flag for flag in corpusflags if so.session[flag]]

    if not (lemma or so.seeking) or not switchedon:
        return emptyvectoroutput(so, ['search list contained zero items'])

    poll.statusis('Compiling the list of works to search')
    searchlist = compilesearchlist(listmapper, so.session)

    # keep the job to a sane size
    ceiling = hipparchia.config['MAXVECTORSPACE']
    tally = 0
    for item in searchlist:
        workid = item[:10]
        try:
            tally += workdict[workid].wordcount
        except TypeError:
            # the wordcount was None and cannot be added to an int
            continue

    if tally > ceiling:
        found = locale.format_string('%d', tally, grouping=True)
        allowed = locale.format_string('%d', ceiling, grouping=True)
        complaint = 'the vector scope max exceeded: {a} > {b} '.format(
            a=found, b=allowed)
        return emptyvectoroutput(so, [complaint])

    if len(searchlist) == 0:
        return emptyvectoroutput(so)

    searchlist = flagexclusions(searchlist, so.session)
    workssearched = len(searchlist)
    searchlist = calculatewholeauthorsearches(searchlist, authordict)
    so.searchlist = searchlist
    so.indexrestrictions = configurewhereclausedata(searchlist, workdict, so)

    poll.statusis('Finding all sentences')
    # only the second element of each tuple is passed on
    sentences = [pair[1] for pair in vectorprepdispatcher(so)]

    return generateabsolutevectorsoutput(sentences, workssearched, so,
                                         'sentences')
示例#7
0
def executegensimsearch(searchobject, outputfunction, indextype):
    """
    Shared driver for the gensim searches: validate the request, look for a
    stored vector space and only gather sentences when none was found, then
    delegate to outputfunction.

    Each of the following ends the call early with emptyvectoroutput():
    no active corpora; no lemma / tovectorize / seeking on the search object;
    the selection's word count exceeds hipparchia.config['MAXVECTORSPACE'];
    the compiled search list is empty; no stored vector space was found while
    hipparchia.config['FORBIDUSERDEFINEDVECTORSPACES'] is set.

    :param searchobject: search object; its usecolumn, searchlist,
        indexrestrictions and (when sentences are gathered) seeking attributes
        are overwritten here
    :param outputfunction: callable invoked as
        outputfunction(sentencetuples, workssearched, so, vectorspace)
    :param indextype: passed through to checkforstoredvector()
    :return: the result of outputfunction, or of emptyvectoroutput()
    """

    so = searchobject
    poll = so.poll

    poll.statusis('Preparing to search')

    so.usecolumn = 'marked_up_line'

    activecorpora = so.getactivecorpora()

    # so.seeking should only be set via a fallback when session['baggingmethod'] == 'unlemmatized'
    hasterm = so.lemma or so.tovectorize or so.seeking

    if not activecorpora:
        return emptyvectoroutput(so, ['no active corpora'])

    if not hasterm:
        return emptyvectoroutput(so, ['there was no search term'])

    poll.statusis('Compiling the list of works to search')
    searchlist = compilesearchlist(listmapper, so.session)

    # keep the job to a sane size
    ceiling = hipparchia.config['MAXVECTORSPACE']
    tally = 0
    for item in searchlist:
        workid = item[:10]
        try:
            tally += workdict[workid].wordcount
        except TypeError:
            # the wordcount was None and cannot be added to an int
            continue

    if tally > ceiling:
        complaint = 'the vector scope max exceeded: {a} > {b} '.format(
            a=format(tally, ','), b=format(ceiling, ','))
        return emptyvectoroutput(so, [complaint])

    # DEBUGGING
    # Frogs and mice
    # so.lemma = lemmatadict['βάτραχοϲ']
    # searchlist = ['gr1220']

    # Euripides
    # so.lemma = lemmatadict['ἄτη']
    # print(so.lemma.formlist)
    # so.lemma.formlist = ['ἄτῃ', 'ἄταν', 'ἄτηϲ', 'ἄτηι']
    # searchlist = ['gr0006']

    if len(searchlist) == 0:
        return emptyvectoroutput(so, ['search list contained zero items'])

    searchlist = flagexclusions(searchlist, so.session)
    workssearched = len(searchlist)
    searchlist = calculatewholeauthorsearches(searchlist, authordict)
    so.searchlist = searchlist
    so.indexrestrictions = configurewhereclausedata(searchlist, workdict, so)

    # 'False' if there is no vectorspace; 'failed' if there can never be one; otherwise vectors
    vectorspace = checkforstoredvector(so, indextype)

    if not vectorspace and hipparchia.config['FORBIDUSERDEFINEDVECTORSPACES']:
        refusal = [
            'you are only allowed to fetch pre-stored vector spaces; <b>try a single author or corpus search using the default vector values</b>'
        ]
        return emptyvectoroutput(so, refusal)

    if vectorspace:
        poll.statusis('Finding neighbors')
        sentencetuples = None
    else:
        poll.statusis(
            'No stored model for this search. Finding all sentences')
        # blanking out the search term will return every last sentence...
        # otherwise you only return sentences with the search term in them (i.e. rudimentaryvectorsearch)
        so.seeking = r'.'
        sentencetuples = vectorprepdispatcher(so)

    return outputfunction(sentencetuples, workssearched, so, vectorspace)