Example #1
def build_lsi(docs):
    '''
    Build an LSI model from scratch.
    docs: the documents from which to extract topics.
    '''
    logging.info('There are {} documents'.format(docs.count()))
    # copy the iterator
    # build the dictionary
    logging.info('Building the dictionary...')
    dictionary = Dict.build_dict(docs)
    corpus = [i for i in get_corpus(dictionary)]  # materialize the whole corpus in memory
    logging.info('number of documents in corpus: {}'.format(len(corpus)))
    logging.info('Construction Completed.')

    # build the tfidf model
    logging.info('Building the tfidf model...')
    tfidf_model = TfidfModel(corpus, normalize=True)
    corpus_tfidf = tfidf_model[corpus]

    logging.info('Construction Completed.')

    # build the lsi model
    logging.info('Building the LSI model...')
    lsi_model = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
    corpus_lsi = lsi_model[corpus_tfidf]
    logging.info('Construction Complete.')

    lsi_model.show_topics()
    return
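
# For reference, a self-contained sketch of the same dictionary -> bag-of-words -> TF-IDF
# -> LSI pipeline on a toy corpus (Dict.build_dict and get_corpus above are
# project-specific helpers, so plain gensim calls are substituted here):
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LsiModel

toy_texts = [
    ['human', 'interface', 'computer'],
    ['survey', 'user', 'computer', 'system', 'response', 'time'],
    ['graph', 'minors', 'trees'],
]
toy_dictionary = Dictionary(toy_texts)                       # token -> id mapping
toy_corpus = [toy_dictionary.doc2bow(t) for t in toy_texts]  # bag-of-words vectors
toy_tfidf = TfidfModel(toy_corpus, normalize=True)
toy_lsi = LsiModel(toy_tfidf[toy_corpus], id2word=toy_dictionary, num_topics=2)
for topic_id, topic in toy_lsi.show_topics(formatted=False):
    print(topic_id, topic)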
Example #2
def train_lsa(is_tfidf, num_topics):
    # Create corpus
    print('Create corpus')
    corpus = doc_processor.create_corpus(dictionary, doc_list, is_tfidf)

    # Set training parameters.
    chunksize = 20000

    start = time.time()
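    # accessing the dictionary once forces gensim to populate its id2token mapping (used below)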
    temp = dictionary[0]
    id2word = dictionary.id2token
    print('Start LSI training')

    lsi_model = LsiModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        chunksize=chunksize,
    )
    lsi_model.show_topics()

    ir_method = 'tfidf' if is_tfidf else 'bow'

    lsi_model.save('saved_models/lsi_model_%s_%s' % (ir_method, num_topics))
    print('LSA for %s %s done in %.1f seconds' % (ir_method, num_topics, time.time() - start))
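
# Follow-up sketch (assumption, not part of the original project): the saved model can be
# reloaded later and applied to new bag-of-words vectors built with the same dictionary.
#
#   reloaded = LsiModel.load('saved_models/lsi_model_tfidf_100')   # hypothetical path
#   new_bow = dictionary.doc2bow(['some', 'new', 'document'])
#   print(reloaded[new_bow])   # list of (topic_id, weight) pairs for the new document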
Example #4
def latent_semantic_indexing(corpus, num_topics, id2word):
    ''' LATENT SEMANTIC INDEXING
    Advantage of LSI: it ranks topics itself, so topics come back in ranked order.
    Requires a num_topics parameter (200 by default) to determine the number of latent
    dimensions after the SVD.
    '''
    print('Latent Semantic Indexing')
    lsi_model = LsiModel(corpus=corpus, num_topics=num_topics, id2word=id2word)
    lsi_model.show_topics(num_topics=num_topics)
    lsi_topic = lsi_model.show_topics(formatted=False)
    return lsi_model
Example #5
    def topicsLSI(self,
                  num_topics=10,
                  num_words=10,
                  num_iterations=2000,
                  chunksize=20000,
                  decay=0.5,
                  onepass=False):
        # LsiModel(corpus=None, num_topics=200, id2word=None, chunksize=20000, decay=1.0, distributed=False, onepass=True, power_iters=2, extra_samples=100)
        lsi = LsiModel(corpus=self.corpus,
                       num_topics=num_topics,
                       id2word=self.id2word,
                       chunksize=chunksize,
                       onepass=onepass,
                       power_iters=num_iterations,
                       decay=decay)

        # documents for each topic
        if self.doc2class:
            doc_idx = 0
            for line in lsi[self.corpus]:
                # get topic with maximum percentage
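                # (note: LSI weights can be negative, so selecting by absolute value,
                # key=lambda item: abs(item[1]), is a common alternative)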
                if line:
                    topic_idx = max(line, key=lambda item: item[1])[0]
                else:
                    # if there is no topic assign a random one
                    topic_idx = random.randint(0, num_topics - 1)
                # make the dictionary
                if self.doc2topicLSI.get(self.doc2class[doc_idx]) is None:
                    self.doc2topicLSI[self.doc2class[doc_idx]] = {}
                    for i in range(0, num_topics):
                        self.doc2topicLSI[self.doc2class[doc_idx]][i] = 0
                self.doc2topicLSI[self.doc2class[doc_idx]][topic_idx] += 1
                doc_idx += 1
            print self.doc2topicLSI

        # show_topics(num_topics=-1, num_words=10, log=False, formatted=True)
        # Return num_topics most significant topics (return all by default).
        # For each topic, show num_words most significant words (10 words by default).
        # The topics are returned as a list – a list of strings if formatted is True, or a list of (weight, word) 2-tuples if False.
        # If log is True, also output this result to log.
        return lsi.show_topics(num_topics=num_topics,
                               num_words=num_words,
                               formatted=False)

Example #6
class LSITransformation:
    def __init__(self, input_space_vectors_map):
        self.input_space_vectors = input_space_vectors_map.values()
        self.transform()

    def transform(self):
        self.space = LSISpace(self.input_space_vectors)
        #TODO Handle Saner Reduction
        self.reduced_space = 15

        input_BOWs = [self.space.doc2bow(vector) for vector in self.input_space_vectors]
        self.lsi_model = LsiModel(corpus=input_BOWs, num_topics=self.reduced_space, id2word=self.space.id2Word())
        return self.lsi_model

    def dissimilarity_score(self, tokens, other_tokens):
        bows = self.space.doc2bow(tokens)
        other_bows = self.space.doc2bow(other_tokens)

        vector = self.infer_and_vectorize(bows)
        other_vector = self.infer_and_vectorize(other_bows)
        similarity = CosineSimilarity().calculate(vector, other_vector)
        return 1 - similarity

    def infer_and_vectorize(self, bows):
        transformed_bow = defaultdict(float)
        transformed_bow.update(dict(self.lsi_model[bows]))
        return [transformed_bow[dimension] for dimension in range(0, self.reduced_space)]

    def print_transformation(self):
        topics = self.lsi_model.show_topics(num_words=self.space.length(), formatted=False)
        for topic in topics:
                print [(round(value, 4), token) for value, token in topic]
def get_topic(text):
    np.random.seed(100)
    nlp = spacy.load('en')
    my_stop_words = [
        u'say', u'\'s', u'Mr', u'be', u'said', u'says', u'saying', u'get'
    ]
    for stopword in my_stop_words:
        lexeme = nlp.vocab[stopword]
        lexeme.is_stop = True
    doc = nlp(text)
    article = []
    texts = []
    for w in doc:
        # if it's not a stop word or punctuation mark, add it to our article!
        if w.text != '\n' and not w.is_stop and not w.is_punct and not w.like_num:
            # we add the lemmatized version of the word
            article.append(w.lemma_)
    texts.append(article)
    # getting bigrams out of words using gensim
    bigram = gensim.models.Phrases(texts)
    texts = [bigram[line] for line in texts]
    # Creating corpus with our words
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(i) for i in texts]
    # Applying LDA and LSI models
    lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)
    ldamodel = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary)
    lsitopics = [[word for word, prob in topic]
                 for topicid, topic in lsimodel.show_topics(formatted=False)]
    ldatopics = [[word for word, prob in topic]
                 for topicid, topic in ldamodel.show_topics(formatted=False)]
    topics = []
    for i in ldatopics:
        topics.append(i[0])
    tags = nltk.pos_tag(topics)
    # removing verbs as generally nouns are topics
    lfinaltopics = [
        word for word, pos in tags
        if pos != 'VB' and pos != 'VBD' and pos != 'VBN' and pos != 'VBP'
        and pos != 'VBZ' and pos != 'VBG' and pos != 'JJ' and pos != 'RB'
    ]
    ldafinaltopics = list(set(lfinaltopics))
    lstopics = []
    for i in lsitopics:
        for j in i:
            lstopics.append(j)
    ltags = nltk.pos_tag(lstopics)
    lsifinaltopics = [
        word for word, pos in ltags
        if pos != 'VB' and pos != 'VBD' and pos != 'VBN' and pos != 'VBP'
        and pos != 'VBZ' and pos != 'VBG' and pos != 'RB' and pos != 'JJ'
    ]

    # Intersection of results from both models
    finaltopics = list(set(ldafinaltopics) & set(lsifinaltopics))
    final_topics = []
    for i in finaltopics:
        if len(i) >= 2:
            final_topics.append(i)
    return final_topics
Example #8
def lsi(corpus, dictionary):
    lsi_model = LsiModel(corpus, id2word=dictionary, num_topics=100)
    lsi_corpus = []
    for i in range(len(corpus)):
        lsi_corpus.append(lsi_model[corpus[i]])

    lsi_similarity_matrix = MatrixSimilarity(lsi_corpus)
    print(lsi_model.show_topics())
    return lsi_similarity_matrix
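
# Usage sketch (hedged): MatrixSimilarity indexes the documents in LSI space, so a new query
# has to be projected through the same LsiModel before lookup. Since lsi() above returns only
# the index, a small helper that takes both the model and the index is assumed here.
def lsi_query(lsi_model, index, dictionary, query_tokens):
    query_bow = dictionary.doc2bow(query_tokens)   # tokens -> bag-of-words
    query_lsi = lsi_model[query_bow]               # project the query into LSI space
    return index[query_lsi]                        # cosine similarities against all indexed docs
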
def comparison(texts):
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lsimodel = LsiModel(corpus=corpus, num_topics=2, id2word=dictionary)
    print('LSI Model output')
    print(lsimodel.show_topics())

    hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
    print('hdp model output')
    print(hdpmodel.show_topics())

    ldamodel = LdaModel(corpus=corpus, num_topics=2, id2word=dictionary)
    print('LDA Model output')
    print(ldamodel.show_topics())


    pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

    lsitopics = [[word for word, prob in topic] for topicid, topic in lsimodel.show_topics(formatted=False)]

    hdptopics = [[word for word, prob in topic] for topicid, topic in hdpmodel.show_topics(formatted=False)]

    ldatopics = [[word for word, prob in topic] for topicid, topic in ldamodel.show_topics(formatted=False)]

    lsi_coherence = CoherenceModel(topics=lsitopics[:10], texts=texts, dictionary=dictionary,
                                   window_size=10).get_coherence()

    hdp_coherence = CoherenceModel(topics=hdptopics[:10], texts=texts, dictionary=dictionary,
                                   window_size=10).get_coherence()

    lda_coherence = CoherenceModel(topics=ldatopics, texts=texts, dictionary=dictionary, window_size=10).get_coherence()

    def evaluate_bar_graph(coherences, indices):
        assert len(coherences) == len(indices)
        n = len(coherences)
        x = np.arange(n)
        plt.bar(x, coherences, width=0.2, tick_label=indices, align='center')
        plt.xlabel('Models')
        plt.ylabel('Coherence Value')
        plt.show()

    evaluate_bar_graph([lsi_coherence, hdp_coherence, lda_coherence], ['LSI', 'HDP', 'LDA'])
Example #10
    def topicsLSI(self, num_topics=10, num_words=10):
        # LsiModel(corpus=None, num_topics=200, id2word=None, chunksize=20000, decay=1.0, distributed=False, onepass=True, power_iters=2, extra_samples=100)
        lsi = LsiModel(corpus=self.corpus, num_topics=num_topics, id2word=self.id2word)

        # show_topics(num_topics=-1, num_words=10, log=False, formatted=True)
        # Return num_topics most significant topics (return all by default).
        # For each topic, show num_words most significant words (10 words by default).
        # The topics are returned as a list – a list of strings if formatted is True, or a list of (weight, word) 2-tuples if False.
        # If log is True, also output this result to log.

        return lsi.show_topics(num_words=num_words, formatted=False)
Example #11
def create_gensim_lsa_model(doc_clean, number_of_topics, words):
    """
    Input  : clean document, number of topics and number of words associated with each topic
    Purpose: create LSA model using gensim
    Output : return LSA model
    """
    dictionary, doc_term_matrix = prepare_corpus(doc_clean)
    # generate LSA model
    lsamodel = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word=dictionary)  # train model
    # print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words))
    print(lsamodel.show_topics(number_of_topics, words))
    return lsamodel
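
# The prepare_corpus() helper above is project-specific; a minimal sketch of what it is
# assumed to do, using standard gensim calls:
from gensim import corpora

def prepare_corpus(doc_clean):
    """Build a Dictionary and a bag-of-words document-term matrix from tokenized documents."""
    dictionary = corpora.Dictionary(doc_clean)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    return dictionary, doc_term_matrix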
Example #12
    def topicsLSI(self, num_topics=10, num_words=10):
        # LsiModel(corpus=None, num_topics=200, id2word=None, chunksize=20000, decay=1.0, distributed=False, onepass=True, power_iters=2, extra_samples=100)
        lsi = LsiModel(corpus=self.corpus,
                       num_topics=num_topics,
                       id2word=self.id2word)

        # show_topics(num_topics=-1, num_words=10, log=False, formatted=True)
        # Return num_topics most significant topics (return all by default).
        # For each topic, show num_words most significant words (10 words by default).
        # The topics are returned as a list – a list of strings if formatted is True, or a list of (weight, word) 2-tuples if False.
        # If log is True, also output this result to log.

        return lsi.show_topics(num_words=num_words, formatted=False)
Example #13
  def algorithm_lsi(self, category_id, objs, goldstandards):
    numTopics = self.calculate_k_using_firstnames(objs)
    print "Using k = "+str(numTopics)

    texts = []
    for obj in objs:
      texts.append(get_categorizedproduct_content(obj))

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts] # bag of words

    print "Create models"
    lsi_model = LsiModel(corpus, id2word=dictionary, num_topics=numTopics)
    corpus_lsi = lsi_model[corpus]
    print "Done creating models"

    results = []
    labels = []
    cont = 0
    for probabilities, obj in izip(corpus_lsi, objs):
      if probabilities:
        max_prop = max(probabilities, key=lambda item:item[1])[0]
      else:
        max_prop = "WARNING "+str(texts[cont])
      labels.append(max_prop)
      results.append(str(max_prop)+" # "+obj['name'].encode('utf8'))
      cont += 1
    results.sort()
    for r in results:
      print r

    topic_id = 0
    for topic in lsi_model.show_topics(num_words=5):
        print "TOPIC (LSI2) " + str(topic_id) + " : " + topic
        topic_id+=1

    if numTopics > 1:
      self.calculate_metrics(category_id, objs, labels, goldstandards)
    else:
      print "number of clusters equals or lower than 1, ignoring metric"
Example #14
def get_best_model(token_list,
                   min_topic_num=3,
                   max_topic_num=14,
                   coherence_metric="c_v",
                   model_type="lsi"):
    model_list = []
    coherence_values = []
    #create the corpus for the model
    corpus, tfidf_vect = create_corpus_and_vectorizer(token_list)
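    # NOTE: `dataset` used below is assumed to be a module-level gensim Dictionary
    # built from token_list elsewhere in the original project.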
    for topics_num in range(min_topic_num, max_topic_num + 1):
        # Create the LSI (or NMF) model with an increasing number of topics
        if model_type == "nmf":
            model = nmf.Nmf(tfidf_vect[corpus],
                            id2word=dataset,
                            num_topics=topics_num)
        else:
            model = LsiModel(tfidf_vect[corpus],
                             id2word=dataset,
                             num_topics=topics_num)
        model_list.append(model)

        topics_model = [[
            word for word, prob in topic
        ] for topicid, topic in model.show_topics(formatted=False)]
        #Create the CoherenceModel and evaluate its score
        coherence_model = CoherenceModel(topics=topics_model,
                                         texts=token_list,
                                         dictionary=dataset,
                                         coherence=coherence_metric,
                                         window_size=30)
        coherence_values.append(coherence_model.get_coherence())
    try:
        index_value = coherence_values.index(max(coherence_values))
    except ValueError:
        # max() raises ValueError on an empty sequence; fall back to the first model
        index_value = 0
    best_model = model_list[index_value]
    return best_model, corpus
Example #15
print ("创建模型")
tfidf_model = TfidfModel(corpus)#转换成局部/全局加权TF_IDF矩阵,它可以将一个简单的计数表示成TFIDF空间。
# tfidf = TfidfModel(corpus)
# print(tfidf[some_doc])    # apply the model to a document
# tfidf.save('/tmp/foo.tfidf_model')    # save the model

lsi_model = LsiModel(corpus)
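# note: when num_topics is not passed, LsiModel keeps its default of 200 latent dimensions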
# LSA (latent semantic analysis), also known as LSI (latent semantic indexing),
# is an indexing and retrieval method. Like the traditional vector space model, it
# represents terms and documents as vectors and judges the relationship between them
# through the relationship between the vectors (e.g. the angle between them);
# the difference is that LSA maps terms and documents into a latent semantic space.
# Synonymy and polysemy degrade the retrieval precision of the traditional vector space model;
# the goal of LSA is to uncover what terms really mean in documents and queries,
# i.e. their latent semantics, and thereby address that problem.

topic_id = 0
for topic in lsi_model.show_topics():
    topic_id+=1
    print ("TOPIC (LSI) " + str(topic_id) + " : ", topic)

print('#'*50)
print(lsi_model.num_topics)
for i in range(lsi_model.num_topics):
    if lsi_model.print_topic(i):
        print (lsi_model.print_topic(i))

corpus_tfidf = tfidf_model[corpus]
corpus_lsi = lsi_model[corpus]

lsi_model_2 = LsiModel(corpus_tfidf, id2word=corpus.dictionary, num_topics=300)
corpus_lsi_2 = lsi_model_2[corpus_tfidf]
print('Done creating models')
Example #16
class LSA(object):
    def __init__(self, stopwords, ignorechars):
        #self.stopwords = stopwords
        self.ignorechars = ignorechars
        self.wdict = {}
        self.dcount = 0

    def createStopwords(self, stopword_path):
        with open(stopword_path, 'r') as file1:
            temp = file1.read()
            self.stopwords = temp.split()

    def parse_dic_bow(self, seg_post):
        self.posts = [post for post in seg_post.values()]
        logger.info("BOW process... ")
        print "original post:"
        logger.debug("original post:")
        logger.debug(self.posts)
        #print self.posts
        self.mergeLineForOnePost = [
            " ".join(post) for post in self.posts
        ]  #change to ['\xe9\xa3\x9f\xe8\xa8\x98 \xe8\xa7\x92\xe9\xa0\xad',' efffe wedw']
        #print self.mergeLineForOnePost
        #self.texts = [[word for word in post.split()] for post in self.mergeLineForOnePost] #change to [['human', 'interface', 'computer'],['survey', 'user']]
        ## convert unicode to UTF-8 byte strings
        self.texts = [
            [word.encode('utf8') for word in post.split()]
            for post in self.mergeLineForOnePost
        ]  #change to [['human', 'interface', 'computer'],['survey', 'user']]
        print "self.mergeLineForOnePost: "

        self.dictionary = gensim.corpora.Dictionary(self.texts)

        self.postIdList = [str(postId) for postId in seg_post.keys()]
        logger.debug("original dic and list:")
        logger.debug(self.dictionary, len(self.dictionary), self.postIdList)
        print "original dic and list:"
        print self.dictionary, self.postIdList

        ### preprocess - remove words that occur only once, stopwords, and other noise
        stop_ids = [
            self.dictionary.token2id[stopword] for stopword in self.stopwords
            if stopword in self.dictionary.token2id
        ]
        once_ids = [
            tokenid for tokenid, docfreq in self.dictionary.dfs.iteritems()
            if docfreq == 1
        ]
        ### remove once_id sometime cause invalid shape of LSA (TOO LESS words to cluster)

        #self.dictionary.filter_tokens(once_ids)
        self.dictionary.filter_tokens(stop_ids)
        logger.info("removed once-words and stopwords......")
        logger.debug(self.dictionary, len(self.dictionary))
        print "removed once-words and stopwords......"
        print self.dictionary
        self.dictionary.compactify()
        self.new_vec = [self.dictionary.doc2bow(post) for post in self.texts]
        #self.new_vec = self.dictionary.doc2bow(post for post in self.coverts)
    def store(self):
        logger.info("store process starts")
        self.dictionary.save(testDictionary)
        self.dictionary.save_as_text(testDictionaryString)
        corpora.MmCorpus.serialize(
            testBOWCorpus, self.new_vec)  # store to disk, for later use
        #corpus = corpora.MmCorpus(testBOWCorpus) # comes from the store
        #dictionary = corpora.Dictionary.load(testDictionary) # comes from the store
    def TFIDF(self):
        logger.info("TFIDF process starts")
        self.tfidf = TfidfModel(self.new_vec)
        self.corpus_tfidf = self.tfidf[self.new_vec]

    def printInfo(self):
        print 'show Dic: '
        print self.dictionary
        print 'show BOW: '
        for bow in self.new_vec:
            print bow
        print 'show corpus_tfidf model: '
        print self.tfidf
        print "show corpus_tfidf: "
        for i in self.corpus_tfidf:
            print i
        print "show LSA assignment of each post: "
        #self.num = len(self.corpus_lsi)
        #for doc, i in zip(self.corpus_lsi, range(self.num)): # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
        for doc, postId in zip(self.corpus_lsi, self.postIdList):
            templist = []
            print 'post: {0}'.format(postId)
            print doc
            #print "breakdown"
            #for each in doc:
            #	templist.append(abs(each[1]))
            #print "templist: "
            #print templist
            theLarge = nlargest(
                1, doc,
                key=lambda e: abs(e[1]))  ## 1 means find the largest one
            if theLarge:
                print "the largest one with absoule value: ", theLarge[0][0]
            else:
                print "cannot find it!!!!"
        print "LSA Topics : "
        print self.topics
        print "Break down : "
        for i in self.topics:
            print i
            print type(i)

    def build(self):
        ### need to find out a way to pick the proper number of the cluster - may be based on the number of POST
        self.lsi_model = LsiModel(self.corpus_tfidf,
                                  id2word=self.dictionary,
                                  num_topics=3)
        self.corpus_lsi = self.lsi_model[self.corpus_tfidf]
        ##self.topics = self.lsi_model.print_topics(num_topics=5, num_words=4)
        #print "topics difference"
        #print self.lsi_model.print_topic(2, topn=4)
        self.topics = self.lsi_model.show_topics(num_topics=5,
                                                 num_words=4,
                                                 log=False,
                                                 formatted=False)
        #print "tuple!@!"
        #print ss
    def repaserForOutput(self):
        ### post_assignment = {post_id:topic} Ex. {"p1":"t1"}
        ### topic_assignment = {topic_id:[keywords]} Ex. {"t1":["秘密", "飛行器", "新華", "任務"]
        #print "start to extact info for post_assignment"
        self.post_assignment = {}
        self.topic_assignment = {}
        for doc, postId in zip(
                self.corpus_lsi, self.postIdList
        ):  #self.postIdList // ['p2', 'p3', 'p1', 'p6', 'p7', 'p4', 'p5', 'p8']
            theTopic = nlargest(1, doc, key=lambda e: abs(e[1]))
            if theTopic:
                self.post_assignment[postId] = theTopic[0][0]
            else:
                self.post_assignment[postId] = "NB"
            #self.post_assignment[postId] = theTopic[0]
        self.num = len(self.topics)
        for topic, num in zip(self.topics, range(self.num)):
            topicWords = []
            for each in topic:
                #convert from string to unicode
                topicWords.append(each[1].decode('utf8'))
                #topicWords.append(each[1])
            ## just extract the first topic content, for example, use "秘密" in ["秘密", "飛行器", "新華", "任務"]
            #self.topic_assignment[str(num)] = topicWords[0]
            self.topic_assignment[str(num)] = topicWords
        #matchObj = re.match( r'(.*) are(\.*)', line)
        #rerurn(self.post_assignment,self.topic_assignment)
        return (self.post_assignment, self.topic_assignment)

    def create_result(self, seg_post):
        logger.info('LSA main process starts.....')
        self.createStopwords(stopword_path)
        self.parse_dic_bow(seg_post)
        self.TFIDF()
        self.build()
        self.store()

    def get_result(self):
        self.printInfo()
        return (self.repaserForOutput())
Example #17
    print vector

topics = 200
num_clusters = 4

print "Create models"
lsi_model = LsiModel(corpus, id2word=corpus.dictionary, num_topics=topics)
corpus_lsi = lsi_model[corpus]

print "Done creating models"


#lsi_model_2 .print_topics(5)

topic_id = 0
for topic in lsi_model.show_topics(num_words=5):
    print "TOPIC (LSI2) " + str(topic_id) + " : " + topic
    topic_id+=1


#for doc in corpus_lsi: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
#    print "Doc " + str(doc)


corpus_lsi_dense = corpus2dense(corpus_lsi, topics)
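# corpus2dense returns a dense array of shape (num_terms, num_documents), i.e. one column per
# document; to cluster documents rather than LSI dimensions, KMeans would normally be fit on
# the transpose (corpus_lsi_dense.T).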
print "Dense Matrix Shape " + str(corpus_lsi_dense.shape)


#attempt scikit integration
km = KMeans(num_clusters, init='random', max_iter=100, n_init=1, verbose=1)
km.fit(corpus_lsi_dense)
# In[29]:

lsi2 = LsiModel(bows, num_topics=2, id2word=vocab, extra_samples=100, power_iters=2)
lsi2


# In[30]:

lsi.save(os.path.join(DATA_PATH, 'lsi100'))
lsi2.save(os.path.join(DATA_PATH, 'lsi2'))


# In[16]:

lsi2.show_topics()


# In[23]:

# for topic in lsi.show_topics():
#     print(topic)

lsi.show_topic(0, 100)


# ## Hold onto your hat
# This will take a lot of RAM!  
# (and CPU)  

# In[31]:
Example #19

def get_topics(model, num_topics):
    word_dict = {}
    for i in range(num_topics):
        words = model.show_topic(i, topn=20)
        word_dict['Topic # ' + '{:02d}'.format(i + 1)] = [word for word, _ in words]
    return pd.DataFrame(word_dict)

get_topics(lda, num_topics)

pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda, corpus, id2word)


lsitopics = [[word for word, prob in topic] for topicid, topic in lsimodel.show_topics(formatted=False)]
ldatopics = [[word for word, prob in topic] for topicid, topic in lda.show_topics(formatted=False)]

lsi_coherence = CoherenceModel(model=lsimodel,topics=lsitopics,dictionary=id2word, texts=train_headlines,window_size=10).get_coherence()
lda_coherence = CoherenceModel(model=lda,topics=ldatopics,dictionary=id2word,texts=train_headlines,window_size=10).get_coherence()

#lda_coherence =CoherenceModel(model=lsimodel, corpus=corpus, coherence='u_mass').get_coherence() 

def evaluate_bar_graph(coherences, indices):
    """
    Function to plot bar graph.
    
    coherences: list of coherence values
    indices: Indices to be used to mark bars. Length of this and coherences should be equal.
    """
    assert len(coherences) == len(indices)
    n = len(coherences)
    x = np.arange(n)
    plt.bar(x, coherences, width=0.2, tick_label=indices, align='center')
    plt.xlabel('Models')
    plt.ylabel('Coherence Value')
    plt.show()
Example #20
    print vector

topics = 200
num_clusters = 4

print "Create models"
lsi_model = LsiModel(corpus, id2word=corpus.dictionary, num_topics=topics)
corpus_lsi = lsi_model[corpus]

print "Done creating models"


#lsi_model_2 .print_topics(5)

topic_id = 0
for topic in lsi_model.show_topics(num_words=5):
    print "TOPIC (LSI2) " + str(topic_id) + " : " + topic
    topic_id+=1


#for doc in corpus_lsi: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
#    print "Doc " + str(doc)


corpus_lsi_dense = corpus2dense(corpus_lsi, topics)
print "Dense Matrix Shape " + str(corpus_lsi_dense.shape)


#attempt scikit integration
km = KMeans(k=num_clusters, init='random', max_iter=100, n_init=1, verbose=1)
km.fit(corpus_lsi_dense)
Example #21
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# We're now done with a very important part of any text analysis - the data cleaning and setting up of corpus. It must be kept in mind that we created the corpus the way we did because that's how gensim requires it - most algorithms still require one to clean the data set the way we did, by removing stop words and numbers, adding the lemmatized form of the word, and using bigrams.

# ### LSI
#
# LSI stands for Latent Semantic Indexing - it is a popular information retrieval method which works by decomposing the original matrix of words to maintain key topics. Gensim's implementation uses an SVD.

# In[11]:

lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)

# In[12]:

lsimodel.show_topics(num_topics=5)  # Showing only the top 5 topics

# ### HDP
#
# HDP, the Hierarchical Dirichlet Process, is an unsupervised topic model which figures out the number of topics on its own.

# In[13]:

hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)

# In[14]:

hdpmodel.show_topics()

# ### LDA
#
Example #22
# for i in model.show_topics():
#     print(i)

from gensim.models import LsiModel
from gensim import corpora, models
import jieba
file_dir = "../corpora/test1"
documents = []
with open(file_dir, "r", encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        seg_list = jieba.cut(line, cut_all=False)
        sentence = [word for word in seg_list]
        documents.append(sentence)

Dict = corpora.Dictionary(documents)

corpus = [Dict.doc2bow(doc) for doc in documents]

tf_idf = models.TfidfModel(corpus)

lsimodel = LsiModel(corpus=tf_idf[corpus], id2word=Dict, num_topics=4)

# for i in lsimodel[tf_idf[corpus]]:
#     print(i)
for i in lsimodel.show_topics():
    print(i)

# add new documents to the model
lsimodel.add_documents([[(1, 2), (2, 1)]])
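# add_documents() folds the new bag-of-words vectors into the existing decomposition,
# so the LSI model is updated incrementally instead of being retrained from scratch.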
Example #23
print(len(texts_list))
#
print("TDF Vectorizer, matrix and texts loaded from file")

######

bigram = gensim.models.Phrases(texts_list)  # for bigram collocation detection
stops = set(stopwords.words('english'))  # nltk stopwords list

texts_list = process_texts(texts_list)
dictionary = Dictionary(texts_list)
corpus = [dictionary.doc2bow(text) for text in texts_list if text !=[] and text != [[]]]
print(len(corpus))

### LSI

lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)
print(lsimodel.show_topics(num_topics=5))  # Showing only the top 5 topics
lsitopics = lsimodel.show_topics(formatted=False)

### HDP
hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
print(hdpmodel.show_topics())
hdptopics = hdpmodel.show_topics(formatted=False)

### LDA
ldamodel = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary)
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

Example #24
# In[ ]:

texts = [bigram[line] for line in texts]

# In[ ]:

dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# In[ ]:

lsimodel = LsiModel(corpus=corpus, num_topics=5, id2word=dictionary)

# In[ ]:

lsimodel.show_topics(num_topics=5)  # Showing only the top 5 topics

# In[ ]:

hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)

# In[ ]:

hdpmodel.show_topics()

# In[ ]:

ldamodel = LdaModel(corpus=corpus, num_topics=5, id2word=dictionary)

# In[ ]:
Example #25
# In[100]:

# create dictionary and corpus
dictionary = Dictionary(cleaned_tweets)
corpus = [dictionary.doc2bow(clean_tween) for clean_tween in cleaned_tweets]

# In[101]:

#### LSI MODEL basically SVD / Principal component analysis

lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)

# In[102]:

lsimodel.show_topics(num_topics=5)

# In[103]:

# HDP - Hierarchical Dirichlet process
hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
hdpmodel.show_topics()

# In[123]:

# LDA
ldamodel = LdaModel(corpus=corpus, num_topics=20, id2word=dictionary)
ldamodel.show_topics()

# In[105]:
Example #26
results = hdp.retrieveText(pn)

bigram = gensim.models.Phrases(results)
#train_texts = process_texts(train_texts)

train_texts = process_texts(results)

preProcsText(results)

dictionary = Dictionary(train_texts)
corpus = [dictionary.doc2bow(text) for text in train_texts]

for i in range(10, 100, 10):
    lsimodel = LsiModel(corpus=corpus, num_topics=i, id2word=dictionary)

    lsitopics = lsimodel.show_topics(num_topics=i)

    result_dict = addTotalTermResults(lsitopics)
    addToResults(result_dict)
    printResults(i, 'lsi')

    del listResults[:]
    hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)

    hdpmodel.show_topics()

    hdptopics = hdpmodel.show_topics(num_topics=i)

    result_dict = addTotalTermResults(hdptopics)

    #add results to total kept in a list
Example #28
        def topicmodiling():
            l=[]
            text=''
            for i in range(len(dfs)):
                for j in dfs[i]:
                    if(j=='\n'):
                        j=' '
                        text=text+j
                    else:
                        text=text + j
                l.append(text)
                text=''
            for i in l:
                text=text+i+"\n"
            nlp=English()
            doc = nlp(text)
            texts, article = [], []
            for w in doc:
            # if it's not a stop word, punctuation mark, number, or email address, add it to our article!
                if w.is_stop == False and w.is_punct == False and w.like_num == False  and w.like_email ==False :
                # we add the lematized version of the word
                     article.append(w.lemma_)
            # if it's a new line, it means we're onto our next document
                if w.text == '\n':
                    texts.append(article)
                    article = []
            bigram = gensim.models.Phrases(texts)
            texts = [bigram[line] for line in texts]
            for i in texts:
                for j in i:
                    if(j=='\n'): 
                        i.remove(j)
            dictionary = Dictionary(texts)

            corpus = [dictionary.doc2bow(text) for text in texts]
            dictionary.token2id
            dictionary 
            lsimodel = LsiModel(corpus=corpus, num_topics=5, id2word=dictionary)
            a=lsimodel.show_topics(num_topics=5)  # Showing only the top 5 topics
            b=[]
            for i in range(0,len(a)):
                b.append(a[i][1].split('+'))   
            k=[]
            for i in range(0,len(b)):
                k.append(b[i][0:5])
            top1=[]
            for i in range(0,5):
                top1.append(k[0][i].split('*'))   
            top2=[]
            for i in range(0,5):
                top2.append(k[1][i].split('*'))   
            top3=[]
            for i in range(0,5):
                top3.append(k[2][i].split('*'))  
            top4=[]
            for i in range(0,5):
                top4.append(k[3][i].split('*'))   
            df1 = DataFrame (top1,columns=['Topic 1 weight','Topic 1 words'])
            df2 = DataFrame (top2,columns=['Topic 2 weight','Topic 2 words'])
            df3 = DataFrame (top3,columns=['Topic 3 weight','Topic 3 words'])
            df4 = DataFrame (top4,columns=['Topic 4 weight','Topic 4 words'])
            result = pd.concat([df1, df2,df3,df4], axis=1)
            for col in result.columns:
                result[col]=result[col].str.replace('"','')
                result[col]=result[col].str.replace('-','')
            return result
Example #29
def main():
    # --- arguments ---
    (dataset, version, _, _, nbs_topics, _, _, cache_in_memory, use_callbacks,
     tfidf, args) = parse_args()

    model_class = 'LSImodel'
    _split_ = "_split" if use_callbacks else ""

    data_name = f'{dataset}_{version}_{tfidf}'
    data_dir = join(LDA_PATH, version, tfidf)

    # --- logging ---
    logger = init_logging(name=data_name,
                          basic=False,
                          to_stdout=True,
                          to_file=True)
    logg = logger.info
    log_args(logger, args)

    # --- load dict ---
    logg('Loading dictionary')
    data_file = join(data_dir, f'{data_name}.dict')
    dictionary = Dictionary.load(data_file)

    # --- load corpus ---
    logg('Loading corpus')
    data_file = join(data_dir, f'{data_name}.mm')
    corpus = MmCorpus(data_file)
    if cache_in_memory:
        logg('Reading corpus into RAM')
        corpus = list(corpus)
    if use_callbacks:
        train, test = split_corpus(corpus)
    else:
        train, test = corpus, []
    logg(f'size of... train_set={len(train)}, test_set={len(test)}')

    # --- train ---
    topn = 20
    columns = [f'term{x}'
               for x in range(topn)] + [f'weight{x}' for x in range(topn)]
    for nbtopics in nbs_topics:
        gc.collect()

        logg(f'Running {model_class} with {nbtopics} topics')
        model = LsiModel(corpus=train, num_topics=nbtopics, id2word=dictionary)

        model_dir = join(LSI_PATH, version, tfidf, f'{_split_}')
        model_path = join(model_dir,
                          f'{dataset}_{model_class}{_split_}_{nbtopics}')
        if not exists(model_dir):
            makedirs(model_dir)

        # --- save topics ---
        topics = model.show_topics(num_words=topn, formatted=False)
        topics = [list(chain(*zip(*topic[1]))) for topic in topics]
        topics = pd.DataFrame(topics, columns=columns)
        logg(f'Saving topics to {model_path}.csv')
        topics.to_csv(f'{model_path}.csv')

        # --- save model ---
        logg(f'Saving model to {model_path}')
        model.save(model_path)

    # --- done ---
    logg(f'\n'
         f'----- end -----\n'
         f'----- {dataset.upper()} -----\n'
         f'{"#" * 50}\n')
Example #30
# combine bigrams and add to corpus
bigram = gensim.models.Phrases(texts)

texts = [bigram[line] for line in texts]

dictionary = Dictionary(texts)

# (word id, number of times word appears in document)
corpus = [dictionary.doc2bow(text) for text in texts]

# latent semantic indexing, a popular information retrieval method,
# which works by decomposing the original matrix of words to
# maintain key topics. Gensim's implementation uses an SVD.
lsi_model = LsiModel(corpus=corpus, num_topics=2, id2word=dictionary)
lsi_topics = lsi_model.show_topics(num_topics=5)
print(lsi_topics)

# hierarchical dirichlet process is an unsupervised topic model which
# determines the number of topics on its own
hdp_model = HdpModel(corpus=corpus, id2word=dictionary)
hdp_topics = hdp_model.show_topics()
print(hdp_topics)

# latent dirichlet allocation
lda_model = LdaModel(corpus=corpus, num_topics=2, id2word=dictionary)
lda_topics = lda_model.show_topics()
print(lda_topics)

lsi_topics_clean = [[word for word, prob in topic]
                    for topicid, topic in lsi_model.show_topics(formatted=False)]
# HDP Hierarchical Dirichlet Process - unsupervised method that determines number of topics itself
print('HDP Hierarchical Dirichlet Process')
hdp_model = HdpModel(corpus=corpus, id2word=dictionary)
with open('./tm_results.txt', 'w') as f:
    f.write('Without Spelling Correction\nHDP\n')
    for topic in hdp_model.show_topics(formatted=True):
        f.write('{}\t{}\n'.format(topic[0], topic[1]))

num_topics = len(hdp_model.show_topics())

# LSI Latent Semantic Indexing
print('LSI Latent Semantic Indexing')
lsi_model = LsiModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
with open('./tm_results.txt', 'a') as f:
    f.write('LSI\n')
    for topic in lsi_model.show_topics(formatted=True):
        f.write('{}\t{}\n'.format(topic[0], topic[1]))

# LDA Latent Dirichlet Allocation
print('LDA Latent Dirichlet Allocation')
lda_model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
with open('./tm_results.txt', 'a') as f:
    f.write('LDA\n')
    for topic in lda_model.show_topics(formatted=True):
        f.write('{}\t{}\n'.format(topic[0], topic[1]))

# HDP Hierarchical Dirichlet Process - unsupervised method that determines number of topics itself
print('HDP Hierarchical Dirichlet Process')
hdp_model = HdpModel(corpus=corpus_spell, id2word=dictionary_spell)
with open('./tm_results.txt', 'a') as f:
    f.write('\nWith Spelling Correction\nHDP\n')
train_corpus = norm_corpus.apply(remove_stopwords)

# Bigrams would be necessary for joining words like new_york so they don't affect the model
# bigram = gensim.models.Phrases(train_corpus)
# train_corpus = [bigram[line] for line in train_corpus]

dictionary = Dictionary(train_corpus)
final_corpus = [dictionary.doc2bow(text) for text in train_corpus]

# Unsupervised learning approach to get the number of topics in this dataset
hdpmodel = HdpModel(corpus=final_corpus, id2word=dictionary)
print(hdpmodel.show_topics())

# Latent Semantic Indexing, a popular information retrieval method which works by decomposing the original matrix of words to maintain key topics
lsimodel = LsiModel(corpus=final_corpus, num_topics=10, id2word=dictionary)
print(lsimodel.show_topics())

#Latent Dirichlet Allocation - famous topic modelling algorithm out there
ldamodel = LdaModel(corpus=final_corpus,
                    num_topics=10,
                    chunksize=100,
                    update_every=1,
                    id2word=dictionary,
                    minimum_probability=0)
print(ldamodel.show_topics())

#Topic Coherence to identify which model is doing better
lsitopics = [[word for word, prob in topic]
             for topicid, topic in lsimodel.show_topics(formatted=False)]
hdptopics = [[word for word, prob in topic]
             for topicid, topic in hdpmodel.show_topics(formatted=False)]
Example #33
########################################
print(corpus)
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

########################################
## Applying LSI
########################################
lsi = LsiModel(corpus_tfidf,
               id2word=dictionary,
               num_topics=400,
               decay=1,
               onepass=False,
               extra_samples=20)
corpus_lsi = lsi[corpus_tfidf]
print(lsi.show_topics(num_topics=10))
lsitopics = lsi.show_topics(formatted=False)

########################################
## Applying LDA
########################################
lda = LdaModel(corpus=corpus,
               id2word=dictionary,
               num_topics=400,
               update_every=1,
               chunksize=100,
               passes=1)
print(lda.show_topics(num_topics=10))
ldatopics = lda.show_topics(formatted=False)

########################################
config_file = "/home/rohola/Codes/Python/topic_modeling_visualization-master/configs/lsi_config.json"
config = LSIConfig.from_json_file(config_file)

corpus_manager = CorpusManager()
corpus, dictionary = corpus_manager.read_corpus(config.dataset_dir)

tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

lsi = LsiModel(
    corpus_tfidf,
    id2word=dictionary,
    num_topics=config.num_topics,
    power_iters=config.power_iters)  # initialize an LSI transformation

topic_words = lsi.show_topics(config.num_topics_to_show,
                              num_words=config.num_words,
                              formatted=False)
topic_words = [j for (i, j) in topic_words]

visualize_method = ""
if config.dimension == 2:
    visualize_method = 'plotly'
elif config.dimension == 3:
    visualize_method = 'plotly3d'
else:
    raise ("Wrong dimension, can accept only 2 or 3")

topic_modeling_semantic_network.visualize_semantic_netwrok(
    config, topic_words, visualize_method=visualize_method)
Example #35
texts = [bigram[line] for line in texts]


texts[10]


dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]


corpus[100]

lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)

lsimodel.show_topics(num_topics=5)  # Showing only the top 5 topics


hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)

hdpmodel.show_topics()

ldamodel = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary)
ldamodel.show_topics()

pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)


lsitopics = [[word for word, prob in topic] for topicid, topic in lsimodel.show_topics(formatted=False)]