Пример #1
0
def gensimexperiment(so):
	"""
	Experimental gensim pipeline: build a log-entropy vector space for the
	works selected in the current search session.

	:param so: SearchObject whose session determines the search list and
		whose .poll is used for progress reporting
	:return: the vector space produced by logentropybuildspace()
	"""

	activepoll = so.poll

	# BUGFIX: the search list must be compiled before it can be filtered;
	# the original consumed 'searchlist' one line before first assigning it
	searchlist = compilesearchlist(listmapper, so.session)
	searchlist = flagexclusions(searchlist, so.session)
	searchlist = calculatewholeauthorsearches(searchlist, authordict)
	so.searchlist = searchlist

	sentencetuples = vectorprepdispatcher(so)
	# find all words in use
	listsofwords = [s[1] for s in sentencetuples]
	allwords = findsetofallwords(listsofwords)

	# find all possible forms of all the words we used
	# consider subtracting some set like: rarewordsthatpretendtobecommon = {}
	wl = '{:,}'.format(len(listsofwords))
	activepoll.statusis('Finding headwords for {n} sentences'.format(n=wl))

	morphdict = getrequiredmorphobjects(allwords)
	morphdict = convertmophdicttodict(morphdict)

	# find all possible headwords of all of the forms in use
	# note that we will not know what we did not know: count unparsed words too and deliver that as info at the end?
	allheadwords = dict()
	for m in morphdict.keys():
		for h in morphdict[m]:
			allheadwords[h] = m

	vectorspace = logentropybuildspace(so, morphdict, listsofwords)

	# BUGFIX: the original returned an undefined name 'output'; the vector
	# space is the only product this function computes
	output = vectorspace

	return output
Пример #2
0
def tensorgraphelectedworks(searchobject):
    """
    Run a tensorflow word2vec analysis over all sentences of the works
    selected in the session.

    adapted from https://raw.githubusercontent.com/tensorflow/tensorflow/r1.5/tensorflow/examples/tutorials/word2vec/word2vec_basic.py

    :param searchobject: SearchObject carrying the session, scope, and poll
    :return: output of the tf training function, or emptyvectoroutput() when
        the search scope is empty or too large
    """

    so = searchobject
    activepoll = so.poll

    # alternative trainer: tftrainondata; the original assigned it and then
    # immediately overwrote it, so only tfnlptraining was ever reachable
    tffunctiontocall = tfnlptraining

    activepoll.statusis('Preparing to search')

    so.usecolumn = 'marked_up_line'

    allcorpora = [
        'greekcorpus', 'latincorpus', 'papyruscorpus', 'inscriptioncorpus',
        'christiancorpus'
    ]
    activecorpora = [c for c in allcorpora if so.session[c]]

    if activecorpora:
        activepoll.statusis('Compiling the list of works to search')
        searchlist = compilesearchlist(listmapper, so.session)
    else:
        reasons = ['search list contained zero items']
        return emptyvectoroutput(so, reasons)

    # make sure you don't go nuts
    maxwords = hipparchia.config['MAXVECTORSPACE']
    wordstotal = 0
    for work in searchlist:
        # first 10 chars are the work id; the rest is passage-level info
        work = work[:10]
        try:
            wordstotal += workdict[work].wordcount
        except TypeError:
            # TypeError: unsupported operand type(s) for +=: 'int' and 'NoneType'
            pass

    if wordstotal > maxwords:
        # FIX: locale.format() is deprecated (removed in Python 3.12);
        # format_string() is the supported API and what sibling functions use
        reasons = [
            'the vector scope max exceeded: {a} > {b} '.format(
                a=locale.format_string('%d', wordstotal, grouping=True),
                b=locale.format_string('%d', maxwords, grouping=True))
        ]
        return emptyvectoroutput(so, reasons)

    if len(searchlist) > 0:
        searchlist = flagexclusions(searchlist, so.session)
        searchlist = calculatewholeauthorsearches(searchlist, authordict)
        so.searchlist = searchlist

        indexrestrictions = configurewhereclausedata(searchlist, workdict, so)
        so.indexrestrictions = indexrestrictions

        # find all sentences: '.' matches every line, so every sentence returns
        activepoll.statusis('Finding all sentences')
        so.seeking = r'.'
        sentences = vectorprepdispatcher(so)
        output = tffunctiontocall(sentences, searchobject)
    else:
        return emptyvectoroutput(so)

    return output
Пример #3
0
def fortestingpurposessklearnselectedworks(searchobject):
    """
    Development harness: run one of several sklearn-based sentence analyses
    over the works selected in the session.

    The candidate function is chosen by the chain of assignments to
    'skfunctiontotest' below: only the last assignment is live; move it
    (or comment others out) to test a different pipeline.

    :param searchobject: SearchObject carrying the session, scope, and poll
    :return: formatted html output, or emptyvectoroutput() on failure
    """

    # candidates; only the final assignment takes effect
    skfunctiontotest = sklearntextfeatureextractionandevaluation
    skfunctiontotest = simplesktextcomparison
    skfunctiontotest = ldatopicmodeling
    skfunctiontotest = ldatopicgraphing

    so = searchobject
    activepoll = so.poll

    activepoll.statusis('Preparing to search')

    so.usecolumn = 'marked_up_line'
    so.vectortype = 'sentencesimilarity'

    allcorpora = [
        'greekcorpus', 'latincorpus', 'papyruscorpus', 'inscriptioncorpus',
        'christiancorpus'
    ]
    activecorpora = [c for c in allcorpora if so.session[c]]

    if activecorpora:
        activepoll.statusis('Compiling the list of works to search')
        searchlist = compilesearchlist(listmapper, so.session)
    else:
        reasons = ['search list contained zero items']
        return emptyvectoroutput(so, reasons)

    # make sure you don't go nuts
    maxwords = hipparchia.config['MAXVECTORSPACE']
    wordstotal = 0
    for work in searchlist:
        # first 10 chars are the work id; the rest is passage-level info
        work = work[:10]
        try:
            wordstotal += workdict[work].wordcount
        except TypeError:
            # TypeError: unsupported operand type(s) for +=: 'int' and 'NoneType'
            pass

    if wordstotal > maxwords:
        # FIX: locale.format() is deprecated (removed in Python 3.12);
        # format_string() is the supported API and what sibling functions use
        reasons = [
            'the vector scope max exceeded: {a} > {b} '.format(
                a=locale.format_string('%d', wordstotal, grouping=True),
                b=locale.format_string('%d', maxwords, grouping=True))
        ]
        return emptyvectoroutput(so, reasons)

    if len(searchlist) > 0:
        searchlist = flagexclusions(searchlist, so.session)
        workssearched = len(searchlist)
        searchlist = calculatewholeauthorsearches(searchlist, authordict)
        so.searchlist = searchlist

        indexrestrictions = configurewhereclausedata(searchlist, workdict, so)
        so.indexrestrictions = indexrestrictions

        # find all sentences: '.' matches every line, so every sentence returns
        activepoll.statusis('Finding all sentences')
        so.seeking = r'.'

        # if skfunctiontotest == ldatopicgraphing:
        # 	so.sentencebundlesize = 2

        sentencetuples = vectorprepdispatcher(so)
        if len(sentencetuples
               ) > hipparchia.config['MAXSENTENCECOMPARISONSPACE']:
            reasons = [
                'scope of search exceeded allowed maximum: {a} > {b}'.format(
                    a=len(sentencetuples),
                    b=hipparchia.config['MAXSENTENCECOMPARISONSPACE'])
            ]
            return emptyvectoroutput(so, reasons)
        similaritiesdict = skfunctiontotest(sentencetuples, so)

        if skfunctiontotest == ldatopicgraphing:
            # kludge for now: this is already html
            corehtml = similaritiesdict
            return corehtml

        # similaritiesdict: {id: (scoreA, lindobjectA1, sentA1, lindobjectA2, sentA2), id2: (scoreB, lindobjectB1, sentB1, lindobjectB2, sentB2), ... }
        corehtml = skformatmostimilar(similaritiesdict)
        output = generatesimilarsentenceoutput(corehtml, so, workssearched,
                                               len(similaritiesdict))
    else:
        return emptyvectoroutput(so)

    return output
Пример #4
0
def sklearnselectedworks(searchobject):
    """
    Topic-model the works selected in the session via sklearn + LDA and
    return the graphed output.

    :param searchobject: SearchObject carrying the session, scope, and poll
    :return: html from ldatopicgraphing(), or emptyvectoroutput() on failure
    """

    # bail out early when the optional scientific stack is absent
    if not ldavis or not CountVectorizer:
        reasons = [
            'requisite software not installed: sklearn and/or ldavis is unavailable'
        ]
        return emptyvectoroutput(searchobject, reasons)

    so = searchobject
    activepoll = so.poll

    activepoll.statusis('Preparing to search')

    so.usecolumn = 'marked_up_line'
    so.vectortype = 'topicmodel'

    corpora = [
        'greekcorpus', 'latincorpus', 'papyruscorpus', 'inscriptioncorpus',
        'christiancorpus'
    ]

    if not [c for c in corpora if so.session[c]]:
        # no corpus is active: nothing can be searched
        return emptyvectoroutput(so, ['search list contained zero items'])

    activepoll.statusis('Compiling the list of works to search')
    searchlist = compilesearchlist(listmapper, so.session)

    # make sure you don't go nuts
    maxwords = hipparchia.config['MAXVECTORSPACE']
    wordstotal = 0
    for w in searchlist:
        try:
            # first 10 chars identify the work
            wordstotal += workdict[w[:10]].wordcount
        except TypeError:
            # a wordcount of None contributes nothing
            pass

    if wordstotal > maxwords:
        toobig = 'the vector scope max exceeded: {a} > {b} '.format(
            a=locale.format_string('%d', wordstotal, grouping=True),
            b=locale.format_string('%d', maxwords, grouping=True))
        return emptyvectoroutput(so, [toobig])

    if not searchlist:
        return emptyvectoroutput(so)

    searchlist = flagexclusions(searchlist, so.session)
    workssearched = len(searchlist)
    searchlist = calculatewholeauthorsearches(searchlist, authordict)
    so.searchlist = searchlist
    so.indexrestrictions = configurewhereclausedata(searchlist, workdict, so)

    # find all sentences: '.' matches every line
    activepoll.statusis('Finding all sentences')
    so.seeking = r'.'

    sentencetuples = vectorprepdispatcher(so)
    cap = hipparchia.config['MAXSENTENCECOMPARISONSPACE']
    if len(sentencetuples) > cap:
        toomany = 'scope of search exceeded allowed maximum: {a} > {b}'.format(
            a=len(sentencetuples), b=cap)
        return emptyvectoroutput(so, [toomany])

    return ldatopicgraphing(sentencetuples, workssearched, so)
Пример #5
0
def findabsolutevectorsbysentence(searchobject):
    """
    use the searchlist to grab a collection of sentences

    then take a lemmatized search term and build association semanticvectors
    around that term in those passages

    generators are tempting, but dealing with generators+MP is a trick:

        TypeError: can't pickle generator objects

    :param searchobject: SearchObject carrying the session, scope, and poll
    :return: output of generateabsolutevectorsoutput(), or emptyvectoroutput()
    """

    so = searchobject
    activepoll = so.poll

    # we are not really a route at the moment, but instead being called by execute search
    # when the δ option is checked; hence the commenting out of the following
    # lemma = cleaninitialquery(request.args.get('lem', ''))

    try:
        lemma = lemmatadict[so.lemma.dictionaryentry]
    except (KeyError, AttributeError):
        # AttributeError: 'NoneType' object has no attribute 'dictionaryentry'
        lemma = None

    activepoll.statusis('Preparing to search')

    so.usecolumn = 'marked_up_line'

    corpora = [
        'greekcorpus', 'latincorpus', 'papyruscorpus', 'inscriptioncorpus',
        'christiancorpus'
    ]
    corporaactive = any(so.session[c] for c in corpora)

    # need both something to look for and somewhere to look
    if not ((lemma or so.seeking) and corporaactive):
        return emptyvectoroutput(so, ['search list contained zero items'])

    activepoll.statusis('Compiling the list of works to search')
    searchlist = compilesearchlist(listmapper, so.session)

    # make sure you don't go nuts
    maxwords = hipparchia.config['MAXVECTORSPACE']
    wordstotal = 0
    for w in searchlist:
        try:
            # first 10 chars identify the work
            wordstotal += workdict[w[:10]].wordcount
        except TypeError:
            # a wordcount of None contributes nothing
            pass

    if wordstotal > maxwords:
        toobig = 'the vector scope max exceeded: {a} > {b} '.format(
            a=locale.format_string('%d', wordstotal, grouping=True),
            b=locale.format_string('%d', maxwords, grouping=True))
        return emptyvectoroutput(so, [toobig])

    if not searchlist:
        return emptyvectoroutput(so)

    searchlist = flagexclusions(searchlist, so.session)
    workssearched = len(searchlist)
    searchlist = calculatewholeauthorsearches(searchlist, authordict)
    so.searchlist = searchlist
    so.indexrestrictions = configurewhereclausedata(searchlist, workdict, so)

    # find all sentences
    activepoll.statusis('Finding all sentences')
    sentencetuples = vectorprepdispatcher(so)
    sentences = [tup[1] for tup in sentencetuples]

    return generateabsolutevectorsoutput(sentences, workssearched, so,
                                         'sentences')
Пример #6
0
def lsifindmatches(sentencestuples, searchobject, lsispace):
	"""
	Score every sentence against an LSI query built from so.tovectorize and
	return the matches above the session's cutoff, resolved to db lines.

	If no prebuilt 'lsispace' is supplied, one is built from the sentence
	tuples via lsibuildspace() and stored in the database for reuse.

	:param sentencestuples: list of (location, sentence) tuples; if falsy it
		is (re)fetched via vectorprepdispatcher() before db-line resolution
	:param searchobject: SearchObject carrying the poll and vector settings
	:param lsispace: a stored LSI space, or a falsy value to force a rebuild
	:return: list of dicts with keys 'count', 'score', 'line', 'sentence', 'words'
	"""

	so = searchobject
	vv = so.vectorvalues

	activepoll = so.poll

	makespace = lsibuildspace

	if not lsispace:
		# find all words in use
		listsofwords = [s[1] for s in sentencestuples]
		allwords = findsetofallwords(listsofwords)

		# find all possible forms of all the words we used
		# consider subtracting some set like: rarewordsthatpretendtobecommon = {}
		wl = '{:,}'.format(len(listsofwords))
		activepoll.statusis('Finding headwords for {n} sentences'.format(n=wl))

		morphdict = getrequiredmorphobjects(allwords, furtherdeabbreviate=True)
		morphdict = convertmophdicttodict(morphdict)

		# find all possible headwords of all of the forms in use
		# note that we will not know what we did not know: count unparsed words too and deliver that as info at the end?
		allheadwords = dict()
		for m in morphdict.keys():
			for h in morphdict[m]:
				allheadwords[h] = m

		hw = '{:,}'.format(len(allheadwords.keys()))
		activepoll.statusis('Building vectors for {h} headwords in {n} sentences'.format(h=hw, n=wl))

		# build, then persist so the next identical search skips the rebuild
		lsispace = makespace(searchobject, morphdict, listsofwords)
		storevectorindatabase(so, 'lsi', lsispace)

	# project the search term into the LSI space
	vectorquerylsi = lsispace.findquerylsi(so.tovectorize)

	vectorindex = MatrixSimilarity(lsispace.semantics)

	# similarity of the query against every indexed sentence
	similis = vectorindex[vectorquerylsi]
	# print('similis', similis)

	threshold = vv.lemmapaircutoffdistance

	matches = list()
	# best scores first
	sims = sorted(enumerate(similis), key=lambda item: -item[1])
	count = 0
	activepoll.statusis('Sifting results')

	# a prebuilt lsispace arrives with no tuples: fetch them now so the
	# matched sentences can be traced back to db lines
	if not sentencestuples:
		sentencestuples = vectorprepdispatcher(so)

	dbconnection = ConnectionObject('autocommit')
	cursor = dbconnection.cursor()
	for s in sims:
		# s is (sentence index, similarity score)
		if s[1] > threshold:
			thissentence = lsispace.sentences[s[0]]
			# this part is slow and needs MP refactoring?
			# dblines = finddblinefromsentence(thissentence, subsearchobject)
			dblines = finddblinesfromsentences(thissentence, sentencestuples, cursor)
			if dblines:
				if len(dblines) > 1:
					xtra = ' <span class="small">[1 of {n} occurrences]</span>'.format(n=len(dblines))
				else:
					xtra = ''
				dbline = dblines[0]
				count += 1
				thismatch = dict()
				thismatch['count'] = count
				thismatch['score'] = float(s[1])  # s[1] comes back as <class 'numpy.float32'>
				thismatch['line'] = dbline
				thismatch['sentence'] = '{s}{x}'.format(s=' '.join(thissentence), x=xtra)
				thismatch['words'] = lsispace.bagsofwords[s[0]]
				matches.append(thismatch)

	dbconnection.connectioncleanup()

	# discard trivially short matches (two words or fewer)
	matches = [m for m in matches if len(m['sentence'].split(' ')) > 2]

	return matches
Пример #7
0
def executegensimsearch(searchobject, outputfunction, indextype):
    """
    use the searchlist to grab a collection of sentences

    then take a lemmatized search term and build association semanticvectors
    around that term in those passages

    :param searchobject: SearchObject carrying the session, scope, and poll
    :param outputfunction: callable invoked as f(sentencetuples, workssearched, so, vectorspace)
    :param indextype: key used by checkforstoredvector() to locate a stored space
    :return: output of outputfunction(), or emptyvectoroutput() on failure
    """

    so = searchobject
    activepoll = so.poll

    # print('so.vectorquerytype', so.vectorquerytype)

    activepoll.statusis('Preparing to search')

    so.usecolumn = 'marked_up_line'

    activecorpora = so.getactivecorpora()

    if not activecorpora:
        return emptyvectoroutput(so, ['no active corpora'])

    # so.seeking should only be set via a fallback when session['baggingmethod'] == 'unlemmatized'
    if not (so.lemma or so.tovectorize or so.seeking):
        return emptyvectoroutput(so, ['there was no search term'])

    activepoll.statusis('Compiling the list of works to search')
    searchlist = compilesearchlist(listmapper, so.session)

    # make sure you don't go nuts
    maxwords = hipparchia.config['MAXVECTORSPACE']
    wordstotal = 0
    for w in searchlist:
        try:
            # first 10 chars identify the work
            wordstotal += workdict[w[:10]].wordcount
        except TypeError:
            # a wordcount of None contributes nothing
            pass

    if wordstotal > maxwords:
        toobig = 'the vector scope max exceeded: {a} > {b} '.format(
            a='{:,}'.format(wordstotal), b='{:,}'.format(maxwords))
        return emptyvectoroutput(so, [toobig])

    # DEBUGGING
    # Frogs and mice
    # so.lemma = lemmatadict['βάτραχοϲ']
    # searchlist = ['gr1220']

    # Euripides
    # so.lemma = lemmatadict['ἄτη']
    # print(so.lemma.formlist)
    # so.lemma.formlist = ['ἄτῃ', 'ἄταν', 'ἄτηϲ', 'ἄτηι']
    # searchlist = ['gr0006']

    if not searchlist:
        return emptyvectoroutput(so, ['search list contained zero items'])

    searchlist = flagexclusions(searchlist, so.session)
    workssearched = len(searchlist)
    searchlist = calculatewholeauthorsearches(searchlist, authordict)
    so.searchlist = searchlist
    so.indexrestrictions = configurewhereclausedata(searchlist, workdict, so)

    # 'False' if there is no vectorspace; 'failed' if there can never be one; otherwise vectors
    vectorspace = checkforstoredvector(so, indextype)

    if not vectorspace and hipparchia.config[
            'FORBIDUSERDEFINEDVECTORSPACES']:
        reasons = [
            'you are only allowed to fetch pre-stored vector spaces; <b>try a single author or corpus search using the default vector values</b>'
        ]
        return emptyvectoroutput(so, reasons)

    # blanking out the search term will return every last sentence...
    # otherwise you only return sentences with the search term in them (i.e. rudimentaryvectorsearch)
    if vectorspace:
        activepoll.statusis('Finding neighbors')
        sentencetuples = None
    else:
        activepoll.statusis(
            'No stored model for this search. Finding all sentences')
        so.seeking = r'.'
        sentencetuples = vectorprepdispatcher(so)

    return outputfunction(sentencetuples, workssearched, so, vectorspace)