Пример #1
0
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized


# Build an LSI topic model from the "message" column of df.
# df, clean, corpora, LsiModel and pprint are all defined/imported outside
# the visible part of the file.

# Pull every entry of the "message" column into a plain Python list.
list_of_docs = df["message"].tolist()
# clean() returns a string; splitting it on whitespace yields one token list
# per document.
doc_clean = [clean(doc).split() for doc in list_of_docs]
# Build the gensim Dictionary from the tokenized documents.
dictionary = corpora.Dictionary(doc_clean)
# Convert each token list to its bag-of-words form with doc2bow.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
# Fit an LSI model with 5 topics on the bag-of-words corpus.
lsimodel = LsiModel(corpus=doc_term_matrix, num_topics=5, id2word=dictionary)
# Pretty-print whatever print_topics() returns for the fitted model.
pprint(lsimodel.print_topics())

# --------------
from gensim.models import LdaModel
from gensim.models import CoherenceModel

# doc_term_matrix - Word matrix created in the last task
# dictionary - Dictionary created in the last task


# Function to calculate coherence values
def compute_coherence_values(dictionary,
                             corpus,
                             texts,
                             limit,
                             start=2,
!pip install pyLDAvis

import pyLDAvis.gensim                             
# Switch pyLDAvis to notebook output, then prepare the visualization for the
# LDA model. lda, journals_corpus and journals_dictionary are defined outside
# the visible part of the file.
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda, journals_corpus, journals_dictionary)

from gensim.models import CoherenceModel
# Score the LDA model with the 'c_v' coherence measure.
# NOTE(review): `texts` receives the journals['Full title'] column as-is;
# gensim documents `texts` as tokenized documents, so confirm this column
# holds token lists rather than raw title strings.
lda_cm=CoherenceModel(model=lda,corpus=journals_corpus,dictionary=journals_dictionary,texts= journals['Full title'],coherence='c_v')
LDA_cm=lda_cm.get_coherence()
# Bare expression: presumably here so the notebook cell displays the score.
LDA_cm

from gensim.models.lsimodel import LsiModel

# Fit a 20-topic LSI model on the journals corpus (journals_corpus and
# journals_dictionary are defined outside the visible part of the file).
lsi = LsiModel(corpus=journals_corpus,id2word=journals_dictionary,num_topics=20)

# Print each entry returned by print_topics() on its own line.
lsi_topics = lsi.print_topics()
for topic in lsi_topics:
  print(topic)

# First sample title: run it through custom_preprocess (defined outside this
# view) and convert the result to bag-of-words with the journals dictionary.
test_doc = 'Journal of medicines and herbs'
test_doc = custom_preprocess(test_doc)
test_doc_bow = journals_dictionary.doc2bow(test_doc)
print(test_doc_bow)

# Index the LSI model with the bag-of-words vector and print the result.
print(lsi[test_doc_bow])

# Second sample title, prepared the same way; only its bag-of-words vector is
# printed in the visible part of the file.
test_doc2 = 'Material and physics'
test_doc2 = custom_preprocess(test_doc2)
test_doc_bow2 = journals_dictionary.doc2bow(test_doc2)
print(test_doc_bow2)
Пример #3
0
# Build an LSI topic model from the "message" column of df.
# df, clean, corpora and LsiModel are defined/imported outside the visible
# part of the file.
list_of_docs = df["message"].tolist()

# Run clean() on every document and split the result on whitespace, giving
# one token list per document.
doc_clean = [clean(doc).split() for doc in list_of_docs]

# Build the gensim Dictionary from the tokenized documents.
dictionary = corpora.Dictionary(doc_clean)

# Convert each token list to its bag-of-words form with doc2bow.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

# Fit an LSI model with 5 topics on the bag-of-words corpus.
lsimodel = LsiModel(corpus=doc_term_matrix, num_topics=5, id2word=dictionary)

# The return value of print_topics() is not captured or printed here.
lsimodel.print_topics()

#Topic modeling using LDA --------------
from gensim.models import LdaModel
from gensim.models import CoherenceModel

# doc_term_matrix - Word matrix created in the last task
# dictionary - Dictionary created in the last task


# Function to calculate coherence values
def compute_coherence_values(dictionary,
                             corpus,
                             texts,
                             limit,
                             start=2,
Пример #4
0
# Round-trip a bag-of-words corpus through a Matrix Market file and fit LSI
# and LDA models on the reloaded corpus. `corpus`, `dictionary`, `corpora`,
# `LsiModel` and `LdaModel` come from outside the visible part of the file.
#
# print calls use the parenthesised single-argument form, which is valid on
# both Python 2 and Python 3 and prints the same text on each.
print(corpus)
# Disabled TF-IDF similarity experiment, kept for reference:
# tfidf = models.TfidfModel(corpus)
# vec = [(0, 1), (4, 1)]
# print(tfidf[vec])
# index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=12)
# sims = index[tfidf[vec]]
# print(list(enumerate(sims)))

# Write the corpus to 'file.mm', then reopen it as an MmCorpus.
corpora.MmCorpus.save_corpus('file.mm', corpus)
#id2word= corpora.Dictionary.load('deerwester.dict')
mmCorpus = corpora.MmCorpus("file.mm")
print(mmCorpus)

# Fit a 10-topic LSI model on the reloaded corpus.
lsi = LsiModel(mmCorpus, id2word=dictionary, num_topics=10)
print("lsi:")
#print(lsi[new_vec])
# The return values of the three calls below are discarded.
lsi.print_debug(4, 4)
lsi.print_topics(4, 2)
# NOTE(review): the model is built with num_topics=10, so topic index 10 is
# presumably one past the last topic (0-9) -- confirm the intended index.
lsi.show_topic(10, 10)

# Fit a 10-topic LDA model on the same corpus.
lda = LdaModel(mmCorpus, id2word=dictionary, num_topics=10)
lda.print_topics(4, 4)
# NOTE(review): new_vec is not defined in the visible part of the file --
# confirm it is created earlier in the script.
doc_lda = lda[new_vec]

print("lda:")
#print doc_lda
         
# corpus = [[(0, 1.0), (1, 1.0), (2, 1.0)],
#            [(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (8, 1.0)],
#            [(1, 1.0), (3, 1.0), (4, 1.0), (7, 1.0)],
#            [(0, 1.0), (4, 2.0), (7, 1.0)],
#            [(3, 1.0), (5, 1.0), (6, 1.0)],
#            [(9, 1.0)],
Пример #5
0
def fit_model(corpus, id2word, num_topics):
    """Train an LSI model and return it.

    Args:
        corpus: Passed to ``LsiModel`` as ``corpus``.
        id2word: Passed to ``LsiModel`` as ``id2word``.
        num_topics: Number of topics to fit; also passed to ``print_topics``.

    Returns:
        The fitted ``LsiModel`` instance.
    """
    # Train the model.
    model = LsiModel(corpus=corpus, id2word=id2word, num_topics=num_topics)
    # Called for its side effect only; the returned value is discarded.
    model.print_topics(num_topics)
    return model
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = "".join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized


# Build an LSI topic model from the "message" column of df.
# df, clean, corpora and LsiModel are defined/imported outside the visible
# part of the file.

# Pull every entry of the "message" column into a plain Python list.
list_of_docs = df["message"].tolist()
# Run clean() on every document and split the result on whitespace, giving
# one token list per document.
doc_clean = [clean(doc).split() for doc in list_of_docs]
# Build the gensim Dictionary, then convert each token list to its
# bag-of-words form with doc2bow.
dictionary = corpora.Dictionary(doc_clean)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
# Fit an LSI model with 5 topics on the bag-of-words corpus.
lsimodel = LsiModel(corpus=doc_term_matrix, num_topics=5, id2word=dictionary)
# Request 5 topics with 3 words each; the return value is not captured.
lsimodel.print_topics(num_topics=5, num_words=3)

# --------------
from gensim.models import LdaModel
from gensim.models import CoherenceModel

# doc_term_matrix - Word matrix created in the last task
# dictionary - Dictionary created in the last task


# Function to calculate coherence values
def compute_coherence_values(dictionary,
                             corpus,
                             texts,
                             limit,
                             start=2,