Example #1
def logentropybuildspace(searchobject, morphdict, sentences):
	"""

	currently unused

	:param searchobject:
	:param morphdict:
	:param sentences:
	:return:
	"""

	sentences = [[w for w in words.lower().split() if w] for words in sentences if words]
	sentences = [s for s in sentences if s]

	bagsofwords = buildwordbags(searchobject, morphdict, sentences)

	logentropydictionary = corpora.Dictionary(bagsofwords)
	logentropycorpus = [logentropydictionary.doc2bow(bag) for bag in bagsofwords]
	logentropyxform = LogEntropyModel(logentropycorpus)
	lsixform = LsiModel(corpus=logentropycorpus,
						id2word=logentropydictionary,
						onepass=False,
						num_topics=400)

	corpus = LogEntropyVectorCorpus(lsixform, logentropyxform, logentropydictionary, logentropycorpus, bagsofwords, sentences)

	return corpus
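
A minimal sketch (not part of the original project) of how the log-entropy + LSI space built above could be queried with gensim's MatrixSimilarity; the argument names mirror the locals inside logentropybuildspace(), and the query conventions are assumptions for illustration only.

from gensim.similarities import MatrixSimilarity

def querylogentropyspace(querywords, logentropydictionary, logentropyxform, lsixform, logentropycorpus):
	# index every sentence of the corpus in the reduced LSI space
	# (num_features matches num_topics=400 used above)
	index = MatrixSimilarity(lsixform[logentropyxform[logentropycorpus]], num_features=400)
	# push the query through the same bow -> log-entropy -> LSI chain
	querybow = logentropydictionary.doc2bow(querywords)
	querylsi = lsixform[logentropyxform[querybow]]
	# cosine similarities of the query against every sentence in the corpus
	return index[querylsi]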
Example #2
def lsibuildspace(searchobject, morphdict, sentences):
	"""

	:param searchobject:
	:param morphdict:
	:param sentences:
	:return:
	"""

	sentences = [[w for w in words.lower().split() if w] for words in sentences if words]
	sentences = [s for s in sentences if s]

	bagsofwords = buildwordbags(searchobject, morphdict, sentences)

	lsidictionary = corpora.Dictionary(bagsofwords)
	lsicorpus = [lsidictionary.doc2bow(bag) for bag in bagsofwords]
	termfreqinversedocfreq = TfidfModel(lsicorpus)
	corpustfidf = termfreqinversedocfreq[lsicorpus]
	semanticindex = LsiModel(corpustfidf, id2word=lsidictionary, num_topics=250)

	"""	
	"An empirical study of required dimensionality for large-scale latent semantic indexing applications"
	Bradford 2008

	For a term-document matrix that has been decomposed via SVD with a non-zero diagonal... 

	Dimensionality is reduced by deleting all but the k largest values on 
	this diagonal, together with the corresponding columns in the
	other two matrices. This truncation process is used to generate a
	k-dimensional vector space. Both terms and documents are represented
	by k-dimensional vectors in this vector space.

	Landauer and Dumais in 1997: They found that the degree of match 
	between cosine measures in the LSI space and human judgment
	was strongly dependent on k, with a maximum for k = 300

	It is clear that there will be a growing degradation of representational
	fidelity as the dimensionality is increased beyond 400. Depending
	upon the application, such behavior may preclude use of
	dimensionality greater than 400.  

	recommendation: k = 300 for corpora of thousands to tens of thousands of documents

	"""

	corpus = LSIVectorCorpus(semanticindex, corpustfidf, lsidictionary, lsicorpus, bagsofwords, sentences)

	return corpus
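
The Bradford 2008 passage quoted above describes truncating the SVD to the k largest singular values; here is a small sketch of that truncation (not part of lsibuildspace(), and assuming numpy and a dense term-document matrix purely for illustration).

import numpy as np

def truncatesvd(termdocmatrix: np.ndarray, k: int = 300):
	# thin SVD: termdocmatrix = u @ diag(s) @ vt, singular values in descending order
	u, s, vt = np.linalg.svd(termdocmatrix, full_matrices=False)
	# keep only the k largest singular values and the matching columns/rows;
	# both terms and documents end up as k-dimensional vectors
	termvectors = u[:, :k] * s[:k]
	documentvectors = vt[:k, :].T * s[:k]
	return termvectors, documentvectors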
Example #3
def ldatopicgraphing(sentencetuples,
                     workssearched,
                     searchobject,
                     headwordstops=True):
    """

	a sentence tuple looks like:
		('gr2397w001_ln_42', 'ποίῳ δὴ τούτων ἄξιον τὸν κόϲμον φθείρεϲθαι φάναι')

	see:
		http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py

	see also:

		https://nlpforhackers.io/topic-modeling/

	CountVectorizer:
	max_df : float in range [0.0, 1.0] or int, default=1.0
	    When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words).

	min_df : float in range [0.0, 1.0] or int, default=1
	    When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature.


	see:
		https://stackoverflow.com/questions/27697766/understanding-min-df-and-max-df-in-scikit-countvectorizer#35615151

	max_df is used for removing terms that appear too frequently, also known as "corpus-specific stop words". For example:

		max_df = 0.50 means "ignore terms that appear in more than 50% of the documents".
		max_df = 25 means "ignore terms that appear in more than 25 documents".

	The default max_df is 1.0, which means "ignore terms that appear in more than 100% of the documents". Thus, the default setting does not ignore any terms.

	min_df is used for removing terms that appear too infrequently. For example:

		min_df = 0.01 means "ignore terms that appear in less than 1% of the documents".
		min_df = 5 means "ignore terms that appear in less than 5 documents".

	The default min_df is 1, which means "ignore terms that appear in less than 1 document". Thus, the default setting does not ignore any terms.

	notes:
		maxfreq of 1 will give you a lot of excessively common words: 'this', 'that', etc.

	on the general issue of graphing see also:
		https://speakerdeck.com/bmabey/visualizing-topic-models
		https://de.dariah.eu/tatom/topic_model_visualization.html

	on the axes:
		https://stats.stackexchange.com/questions/222/what-are-principal-component-scores

	:param sentencetuples:
	:param workssearched:
	:param searchobject:
	:param headwordstops:
	:return:
	"""

    if headwordstops:
        stops = mostcommonwordsviaheadwords()
    else:
        stops = mostcommoninflectedforms()

    sentencetuples = [(a, removestopwords(b, stops))
                      for a, b in sentencetuples]

    activepoll = searchobject.poll
    vv = searchobject.vectorvalues

    settings = {
        'maxfeatures': vv.ldamaxfeatures,
        'components': vv.ldacomponents,  # topics
        'maxfreq': vv.ldamaxfreq,  # fewer than n% of sentences should have this word (i.e., purge common words)
        'minfreq': vv.ldaminfreq,  # word must be found >n times
        'iterations': vv.ldaiterations,
        'mustbelongerthan': vv.ldamustbelongerthan
    }

    # not easy to store/fetch since you need both ldavectorizer and ldamodel
    # so we just store the actual graph...
    ldavishtmlandjs = checkforstoredvector(searchobject, 'lda')

    if not ldavishtmlandjs:
        sentencetuples = [
            s for s in sentencetuples
            if len(s[1].strip().split(' ')) > settings['mustbelongerthan']
        ]
        sentences = [s[1] for s in sentencetuples]

        sentencesaslists = [s.split(' ') for s in sentences]
        allwordsinorder = [
            item for sublist in sentencesaslists for item in sublist if item
        ]

        activepoll.statusis('Finding all headwords')
        morphdict = getrequiredmorphobjects(set(allwordsinorder),
                                            furtherdeabbreviate=True)
        morphdict = convertmophdicttodict(morphdict)

        bagsofwordlists = buildwordbags(searchobject, morphdict,
                                        sentencesaslists)
        bagsofsentences = [' '.join(b) for b in bagsofwordlists]

        # print('bagsofsentences[:3]', bagsofsentences[:3])

        activepoll.statusis('Running the LDA vectorizer')
        # Use tf (raw term count) features for LDA.
        ldavectorizer = CountVectorizer(max_df=settings['maxfreq'],
                                        min_df=settings['minfreq'],
                                        max_features=settings['maxfeatures'])

        ldavectorized = ldavectorizer.fit_transform(bagsofsentences)

        ldamodel = LatentDirichletAllocation(
            n_components=settings['components'],
            max_iter=settings['iterations'],
            learning_method='online',
            learning_offset=50.,
            random_state=0)

        ldamodel.fit(ldavectorized)

        visualisation = ldavis.prepare(ldamodel, ldavectorized, ldavectorizer)
        # pyLDAvis.save_html(visualisation, 'ldavis.html')

        ldavishtmlandjs = pyLDAvis.prepared_data_to_html(visualisation)
        storevectorindatabase(searchobject, 'lda', ldavishtmlandjs)

    jsonoutput = ldatopicsgenerateoutput(ldavishtmlandjs, searchobject)

    return jsonoutput
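
A toy demonstration (not from the original code) of the max_df / min_df behaviour that the docstring above describes, using the same CountVectorizer class.

from sklearn.feature_extraction.text import CountVectorizer

toydocuments = [
    'the dog barks',
    'the cat meows',
    'the dog and the cat sleep',
    'the bird sings',
]

# max_df=0.5 drops terms found in more than 50% of the documents ('the' is in all four);
# min_df=2 drops terms found in fewer than 2 documents ('barks', 'bird', ...)
vectorizer = CountVectorizer(max_df=0.5, min_df=2)
vectorizer.fit_transform(toydocuments)
print(sorted(vectorizer.vocabulary_))  # ['cat', 'dog']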
Example #4
def ldatopicmodeling(sentencetuples, searchobject):
    """

	see:
		http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py

	CountVectorizer:
	max_df : float in range [0.0, 1.0] or int, default=1.0
		When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words).

	min_df : float in range [0.0, 1.0] or int, default=1
		When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature.

	see sample results at end of file

	:param sentencetuples:
	:param searchobject:
	:return:
	"""

    maxfeatures = 2000
    components = 15
    topwords = 15

    maxfreq = .60
    minfreq = 5
    iterations = 12

    mustbelongerthan = 2

    sentencetuples = [
        s for s in sentencetuples
        if len(s[1].strip().split(' ')) > mustbelongerthan
    ]
    sentences = [s[1] for s in sentencetuples]

    sentences = [s.split(' ') for s in sentences]
    allwordsinorder = [
        item for sublist in sentences for item in sublist if item
    ]

    morphdict = getrequiredmorphobjects(set(allwordsinorder))
    morphdict = convertmophdicttodict(morphdict)

    bagsofwords = buildwordbags(searchobject, morphdict, sentences)

    bagsofsentences = [' '.join(b) for b in bagsofwords]

    # Use tf (raw term count) features for LDA.
    ldavectorizer = CountVectorizer(max_df=maxfreq,
                                    min_df=minfreq,
                                    max_features=maxfeatures)

    ldavectorized = ldavectorizer.fit_transform(bagsofsentences)

    lda = LatentDirichletAllocation(n_components=components,
                                    max_iter=iterations,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)

    lda.fit(ldavectorized)

    print("\nTopics in LDA model:")
    tf_feature_names = ldavectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names, topwords)

    # Use tf-idf features for NMF.
    tfidfvectorizer = TfidfVectorizer(max_df=0.95,
                                      min_df=2,
                                      max_features=maxfeatures)

    tfidf = tfidfvectorizer.fit_transform(bagsofsentences)

    # Fit the NMF model
    nmf = NMF(n_components=components, random_state=1, alpha=.1,
              l1_ratio=.5).fit(tfidf)

    print("\nTopics in NMF model (Frobenius norm):")
    tfidffeaturenames = tfidfvectorizer.get_feature_names()
    print_top_words(nmf, tfidffeaturenames, topwords)

    # Fit the NMF model
    print(
        "Fitting the NMF model (generalized Kullback-Leibler divergence) with "
        "tf-idf features, n_samples=%d and n_features=%d..." %
        (len(sentences), maxfeatures))

    nmf = NMF(n_components=components,
              random_state=1,
              beta_loss='kullback-leibler',
              solver='mu',
              max_iter=1000,
              alpha=.1,
              l1_ratio=.5).fit(tfidf)

    print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
    tfidffeaturenames = tfidfvectorizer.get_feature_names()
    print_top_words(nmf, tfidffeaturenames, topwords)

    return
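
print_top_words() is used above but not defined in this example; below is a minimal sketch along the lines of the helper in the scikit-learn tutorial linked in the docstring (note that newer scikit-learn releases replace get_feature_names() with get_feature_names_out()).

def print_top_words(model, feature_names, n_top_words):
    # model.components_ holds one weight-per-feature row for each topic (LDA and NMF alike)
    for topicnumber, topic in enumerate(model.components_):
        # indices of the n_top_words largest weights in this topic
        topindices = topic.argsort()[:-n_top_words - 1:-1]
        words = ' '.join(feature_names[i] for i in topindices)
        print('Topic #{n}: {w}'.format(n=topicnumber, w=words))
    print()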
Example #5
def buildgensimmodel(searchobject, morphdict: dict,
                     sentences: List[str]) -> Word2Vec:
    """

    returns a Word2Vec model

    then you use one of the many ill-documented class functions that come with
    the model to make queries against it

    WordEmbeddingsKeyedVectors in keyedvectors.py is your friend here for learning what you can really do
        most_similar(positive=None, negative=None, topn=10, restrict_vocab=None, indexer=None)
            [analogies; most_similar(positive=['woman', 'king'], negative=['man']) --> queen]

        similar_by_word(word, topn=10, restrict_vocab=None)
            [the top-N most similar words]

        similar_by_vector(vector, topn=10, restrict_vocab=None)

        similarity_matrix(dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100, dtype=REAL)

        wmdistance(document1, document2)
            [Word Mover's Distance between two documents]

        most_similar_cosmul(positive=None, negative=None, topn=10)
            [analogy finder; most_similar_cosmul(positive=['baghdad', 'england'], negative=['london']) --> iraq]

        cosine_similarities(vector_1, vectors_all)

        distances(word_or_vector, other_words=())

        distance(w1, w2)
            [distance('woman', 'man')]

        similarity(w1, w2)
            [similarity('woman', 'man')]

        n_similarity(ws1, ws2)
            [sets of words: n_similarity(['sushi', 'shop'], ['japanese', 'restaurant'])]


    FYI: Doc2VecKeyedVectors
        doesnt_match(docs)
            [Which doc from the given list doesn't go with the others?]

    note that Word2Vec will hurl out lots of DeprecationWarnings; we are blocking them
    one hopes that this does not yield a surprise some day... [surprise: it did...]

    this code is a candidate for refactoring because of the gensim 3.8 vs 4.0 API difference
    a drop down from model to model.wv requires refactoring dependent functions

    :return:
    """

    vv = searchobject.vectorvalues

    sentences = [[w for w in words.lower().split() if w] for words in sentences
                 if words]
    sentences = [s for s in sentences if s]

    bagsofwords = buildwordbags(searchobject, morphdict, sentences)
    # debugmessage('first bag is {b}'.format(b=bagsofwords[0]))
    # debugmessage('# of bags is {b}'.format(b=len(bagsofwords)))

    workers = setthreadcount()

    computeloss = False

    # Note that for a fully deterministically-reproducible run, you must also limit the model to a single worker thread
    # (workers=1), to eliminate ordering jitter from OS thread scheduling.
    try:
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=DeprecationWarning)
            try:
                gensimmodel = Word2Vec(
                    bagsofwords,
                    min_count=vv.minimumpresence,
                    seed=1,
                    iter=vv.trainingiterations,
                    size=vv.dimensions,
                    sample=vv.downsample,
                    sg=1,  # the results seem terrible if you say sg=0
                    window=vv.window,
                    workers=workers,
                    compute_loss=computeloss)
            except TypeError:
                # TypeError: __init__() got an unexpected keyword argument 'iter'
                # i.e., gensim 4.0.0 changed the API
                # see: https://radimrehurek.com/gensim/models/word2vec.html
                #
                # class gensim.models.word2vec.Word2Vec(sentences=None, corpus_file=None, vector_size=100, alpha=0.025,
                # window=5, min_count=5, max_vocab_size=None, sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0,
                # hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=<built-in function hash>, epochs=5,
                # null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000, compute_loss=False, callbacks=(),
                # comment=None, max_final_vocab=None)
                #
                # epochs (int, optional) – Number of iterations (epochs) over the corpus. (Formerly: iter)
                # vector_size (int, optional) – Dimensionality of the word vectors.
                gensimmodel = Word2Vec(
                    bagsofwords,
                    min_count=vv.minimumpresence,
                    seed=1,
                    epochs=vv.trainingiterations,
                    vector_size=vv.dimensions,
                    sample=vv.downsample,
                    sg=1,  # the results seem terrible if you say sg=0
                    window=vv.window,
                    workers=workers,
                    compute_loss=computeloss)

    except RuntimeError:
        # RuntimeError: you must first build vocabulary before training the model
        # this will happen if you have a tiny author with too few words
        gensimmodel = None

    if computeloss:
        print('loss after {n} iterations was: {l}'.format(
            n=vv.trainingiterations, l=gensimmodel.get_latest_training_loss()))

    reducedmodel = None

    if gensimmodel:
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=DeprecationWarning)
            try:
                gensimmodel.delete_temporary_training_data(
                    replace_word_vectors_with_normalized=True)
            except AttributeError:
                # AttributeError: 'Word2Vec' object has no attribute 'delete_temporary_training_data'
                # i.e., gensim 4.0.0 changed the API
                # see: https://radimrehurek.com/gensim/models/word2vec.html
                # 	If you’re finished training a model (i.e. no more updates, only querying), you can switch to the KeyedVectors instance:
                # 	word_vectors = model.wv
                # 	del model
                # this complicates our backwards-compatible-life, though.
                # we want to return a Word2Vec and not a KeyedVectors instance
                # gensimmodel = gensimmodel.wv
                reducedmodel = Word2Vec(
                    [["cat", "say", "meow"], ["dog", "say", "woof"]],
                    min_count=1)
                reducedmodel.wv = gensimmodel.wv

    if reducedmodel:
        gensimmodel = reducedmodel

    # print(gensimmodel.wv['ludo'])

    storevectorindatabase(searchobject, 'nn', gensimmodel)

    return gensimmodel
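
A minimal usage sketch (not from the original project) of querying the returned Word2Vec model through its KeyedVectors, as the docstring above describes; the searchobject, morphdict, and sentences arguments are assumed to come from the surrounding application, and the sample words are placeholders that must actually be in the trained vocabulary.

model = buildgensimmodel(searchobject, morphdict, sentences)

if model:
    wv = model.wv  # KeyedVectors: same query interface under gensim 3.8 and 4.0
    print(wv.most_similar('ludus', topn=10))             # nearest neighbours of one word
    print(wv.similarity('ludus', 'lusus'))               # cosine similarity of two words
    print(wv.doesnt_match(['ludus', 'lusus', 'nauis']))  # the odd one out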