def create_evaluation_perplexity(config, Kind):
    model_fname = config.model_fname % Kind.__name__
    corpus_fname = config.corpus_fname % Kind.__name__

    try:
        id2word = Dictionary.load(corpus_fname + '.dict')
        corpus = MalletCorpus(corpus_fname, id2word=id2word)
    except:
        error('Corpora not built yet -- cannot evaluate')

    held_out = list()
    training = list()
    target_len = int(0.1 * len(corpus))
    logger.info('Calculating perplexity with held-out %d of %d documents' %
                (target_len, len(corpus)))

    ids = set()
    while len(ids) < target_len:
        ids.add(random.randrange(len(corpus)))

    for doc_id, doc in enumerate(corpus):
        if doc_id in ids:
            held_out.append(doc)
        else:
            training.append(doc)

    model = LdaModel(training,
                     id2word=corpus.id2word,
                     alpha=config.alpha,
                     passes=config.passes,
                     num_topics=config.num_topics)

    pwb = model.log_perplexity(held_out)

    with open(config.path + 'evaluate-perplexity-results.csv', 'a') as f:
        w = csv.writer(f)
        w.writerow([model_fname, pwb])
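For reference, a minimal self-contained sketch of the same measurement on a hypothetical toy corpus; it assumes gensim's convention that log_perplexity returns a per-word likelihood bound whose perplexity is 2 ** (-bound):

import numpy as np
from gensim.corpora import Dictionary
from gensim.models import LdaModel

# Hypothetical toy corpus, only to exercise the call.
texts = [['human', 'interface', 'computer'], ['graph', 'minors', 'trees']]
dictionary = Dictionary(texts)
bow = [dictionary.doc2bow(t) for t in texts]

model = LdaModel(bow, id2word=dictionary, num_topics=2, passes=5)
bound = model.log_perplexity(bow)  # per-word likelihood bound (log base 2)
print(bound, np.exp2(-bound))      # np.exp2(-bound) is the corresponding perplexity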
class TestLdaCallback(unittest.TestCase):

    def setUp(self):
        self.corpus = MmCorpus(datapath('testcorpus.mm'))
        self.ch_umass = CoherenceMetric(corpus=self.corpus, coherence="u_mass", logger="visdom", title="Coherence")
        self.callback = [self.ch_umass]
        self.model = LdaModel(id2word=common_dictionary, num_topics=2, passes=10, callbacks=self.callback)

        self.host = "http://localhost"
        self.port = 8097

    def testCallbackUpdateGraph(self):

        # Popen has no context manager in 2.7, hence the try/finally.
        try:
            # spawn visdom.server
            proc = subprocess.Popen(['python', '-m', 'visdom.server', '-port', str(self.port)])

            # wait for visdom server startup (any better way?)
            time.sleep(3)

            viz = Visdom(server=self.host, port=self.port)
            assert viz.check_connection()

            # clear screen
            viz.close()

            self.model.update(self.corpus)
        finally:
            proc.kill()
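On Python 3, where Popen does support the context-manager protocol, the try/finally pattern above can be written as a with block; a small sketch, with the port and startup sleep taken from the test:

import subprocess
import time

def spawn_visdom_briefly(port=8097):
    # Popen is a context manager on Python 3.2+, so its pipes are cleaned up automatically;
    # the server process itself is still killed explicitly.
    with subprocess.Popen(['python', '-m', 'visdom.server', '-port', str(port)]) as proc:
        try:
            time.sleep(3)  # crude wait for visdom server startup, as in the test above
            # ... exercise the running server here ...
        finally:
            proc.kill()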
def create_lda_model(project, corpus, id2word, name, use_level=True, force=False):
    model_fname = project.full_path + name + str(project.num_topics)
    if use_level:
        model_fname += project.level

    model_fname += '.lda.gz'


    if not os.path.exists(model_fname) or force:
        if corpus:
            update_every=None # run in batch if we have a pre-supplied corpus
        else:
            update_every=1

        model = LdaModel(corpus=corpus,
                         id2word=id2word,
                         alpha=project.alpha,
                         eta=project.eta,
                         passes=project.passes,
                         num_topics=project.num_topics,
                         iterations=project.iterations,
                         eval_every=None, # disable perplexity tests for speed
                         update_every=update_every,
                         )

        if corpus:
            model.save(model_fname)
    else:
        model = LdaModel.load(model_fname)

    return model, model_fname
Example #4
def perform_lda(dictionary, corpus, num_topics, wiki_path=None, passes=1, iterations=50, chunksize=200):
    """


    :param dictionary:
    :param corpus:
    :param wiki_path:
    :param num_topics:
    :param passes:
    :param iterations:
    :param chunksize:
    :return:
    """
    if wiki_path is not None:
        logging.info('Generating wiki corpus...')
        wikis = unpickle(wiki_path)
        wiki_corpus = [dictionary.doc2bow(wiki) for wiki in wikis]

        logging.info('Combining original corpus and wiki corpus...')
        corpus = corpus + wiki_corpus  # wiki_corpus is merged after the original corpus

    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=passes,
                         iterations=iterations, alpha='auto', chunksize=chunksize)
    corpus_ids = get_corpus_ids(dictionary.corpus_id2orig_id)
    # doc_vector_ids = dictionary.corpus_id2orig_id[corpus_ids]
    doc_vector_ids = [dictionary.corpus_id2orig_id[corpus_id] for corpus_id in corpus_ids]
    doc_vectors = lda_model.inference(corpus)[0]
    doc_vectors = doc_vectors[corpus_ids, :]
    doc_vectors = doc_vectors / doc_vectors.sum(axis=1).reshape(doc_vectors.shape[0], 1)

    return lda_model, doc_vectors, doc_vector_ids
Example #5
def lda(docs, k):
    """Latent Dirichlet allocation topic model.

    Uses Gensim's LdaModel after tokenizing using scikit-learn's
    TfidfVectorizer.

    Parameters
    ----------
    k : integer
        Number of topics.
    """
    from gensim.matutils import Sparse2Corpus
    from gensim.models import LdaModel

    # Use a scikit-learn vectorizer rather than Gensim's equivalent
    # for speed and consistency with LSA and k-means.
    vect = _vectorizer()
    corpus = vect.fit_transform(fetch(d) for d in docs)
    corpus = Sparse2Corpus(corpus)

    model = LdaModel(corpus=corpus, num_topics=k)

    topics = model.show_topics(formatted=False)
    vocab = vect.get_feature_names()
    #return [(vocab[int(idx)], w) for topic in topics for w, idx in topic]
    return [[(vocab[int(idx)], w) for w, idx in topic] for topic in topics]
Example #6
class TestLdaDiff(unittest.TestCase):
    def setUp(self):
        self.dictionary = common_dictionary
        self.corpus = common_corpus
        self.num_topics = 5
        self.n_ann_terms = 10
        self.model = LdaModel(corpus=self.corpus, id2word=self.dictionary, num_topics=self.num_topics, passes=10)

    def testBasic(self):
        # test for matrix case
        mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms)

        self.assertEqual(mdiff.shape, (self.num_topics, self.num_topics))
        self.assertEqual(len(annotation), self.num_topics)
        self.assertEqual(len(annotation[0]), self.num_topics)

        # test for diagonal case
        mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms, diagonal=True)

        self.assertEqual(mdiff.shape, (self.num_topics,))
        self.assertEqual(len(annotation), self.num_topics)

    def testIdentity(self):
        for dist_name in ["hellinger", "kullback_leibler", "jaccard"]:
            # test for matrix case
            mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms, distance=dist_name)

            for row in annotation:
                for (int_tokens, diff_tokens) in row:
                    self.assertEqual(diff_tokens, [])
                    self.assertEqual(len(int_tokens), self.n_ann_terms)

            self.assertTrue(np.allclose(np.diag(mdiff), np.zeros(mdiff.shape[0], dtype=mdiff.dtype)))

            if dist_name == "jaccard":
                self.assertTrue(np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype)))

            # test for diagonal case
            mdiff, annotation = \
                self.model.diff(self.model, n_ann_terms=self.n_ann_terms, distance=dist_name, diagonal=True)

            for (int_tokens, diff_tokens) in annotation:
                self.assertEqual(diff_tokens, [])
                self.assertEqual(len(int_tokens), self.n_ann_terms)

            self.assertTrue(np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype)))

            if dist_name == "jaccard":
                self.assertTrue(np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype)))

    def testInput(self):
        self.assertRaises(ValueError, self.model.diff, self.model, n_ann_terms=self.n_ann_terms, distance='something')
        self.assertRaises(ValueError, self.model.diff, [], n_ann_terms=self.n_ann_terms, distance='something')
Example #7
def extract_topics(words):
    word_id_map=Dictionary([words])
    word_id_map.filter_tokens([id for id, occurance in word_id_map.dfs.iteritems() if occurance == 2])
    word_id_map.compactify()
    deals_corpus=[word_id_map.doc2bow(words)]
    lda=LdaModel(corpus=deals_corpus, id2word=word_id_map, num_topics=15, update_every=1, chunksize=1000,passes=1)
    topics=[]
    for i in range(15):
        tokens=lda.print_topic(i).split('+')
        topic_scores=[]
        for token in tokens:
            score,token_val=token.split('*')
            topic_scores.append((token_val,score))
        topics.append(topic_scores)
    return topics
Example #8
    def __init__(self, destination, fileName, modelName='', ldaPasses='', topicNum=''):
        '''
        Constructor
        '''
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        
        self.__destination = destination
        self.__fileName = fileName
        self.__modelName = modelName
        self.__ldaPasses = ldaPasses
        self.__topicNum = topicNum
                
        #=======================================================================
        # STOP WORDS AND CHARACTERS
        #=======================================================================
        self.__stopwords = stopwords.words('english')# + string.punctuation
        self.__chars_to_remove = [u'[', u']', u'(', u')', u'*', u'%', u'{', u'}', u'\n', u'\n\n', u'\t', u';',u'/',u'^',u'--',u'\\',u'+',u'-',u'.',u'?',u'&',u'#',u'',u'']
        self.__stopwords.extend(self.__chars_to_remove)
        self.__stopwords.extend([item for item in string.punctuation])

        #=======================================================================
        # DATABASE
        #=======================================================================
        self.__db = connectMySQL(db='xpath', port=3366)
        self.__queryResults = None
        self.__cleanedCorpus = []
        

        if modelName != '' and os.path.exists(self.__destination+modelName+'.lda'):
            self.__ldaModel = LdaModel.load(self.__destination+modelName+'.lda', mmap='r') 
            
        if fileName != '' and os.path.exists(self.__destination+fileName+'.dict'):
            self.__modelDict = corpora.Dictionary.load(self.__destination+fileName+'.dict')
Example #9
def write_topics(model_path, csv_name, k):
    model = LdaModel.load(model_path)
    topics = []
    for topic_id in range(model.num_topics):
        topics.append(model.return_topic(topicid=topic_id))

    dictionary = Dictionary.load('data/dictionary/tweets.dict')
    word_indices = dictionary.id2token
    writer = csv.writer(file(csv_name, 'w'))

    output = [[0 for i in range(model.num_topics)] for j in range(k)]
    for topic_id, topic in enumerate(topics):
        for rank, index in enumerate(topic.argsort()[::-1]):
            output[rank][topic_id] = {}
            output[rank][topic_id]['word'] = word_indices[index]
            output[rank][topic_id]['p'] = topic[index]
            rank += 1
            if rank >= k:
                break

    for topic_id in range(model.num_topics):
        row = ['z = ' + str(topic_id)]

        for rank in range(k):
            row.append(output[rank][topic_id]['word'] + ':' + str(output[rank][topic_id]['p']))

        writer.writerow(row)
Example #10
File: dmp.py  Project: npiaq/dmp
 def load(self):
     '''Load the lda model and the dic dictionary.
     '''
     lda_file = config.get('dmp', 'lda_file')
     dic_file = config.get('dmp', 'dic_file')
     self.lda = LdaModel.load(lda_file)
     self.dic = Dictionary.load(dic_file)
def create_evaluation_distinctiveness(config, Kind):
    model_fname = config.model_fname % Kind.__name__

    try:
        model = LdaModel.load(model_fname)
        logger.info('Opened previously created model at file %s' % model_fname)
    except:
        error('Cannot evaluate LDA models not built yet!')

    scores = utils.score(model, utils.kullback_leibler_divergence)
    total = sum([x[1] for x in scores])

    logger.info("%s model KL: %f" % (model_fname, total))
    with open(config.path + 'evaluate-results.csv', 'a') as f:
        w = csv.writer(f)
        w.writerow([model_fname, total])

    etas = list()
    for topic in model.state.get_lambda():
        topic_eta = list()
        for p_w in topic:
            topic_eta.append(p_w * numpy.log2(p_w))
        etas.append(-sum(topic_eta))

    entropy = sum(etas) / len(etas)

    logger.info("%s model entropy mean: %f" % (model_fname, entropy))
    with open(config.path + 'evaluate-entropy-results.csv', 'a') as f:
        w = csv.writer(f)
        w.writerow([model_fname, entropy])
Example #12
class CorpusLdaModelWrapper:
    def __init__(self, corpus, dictionary, doc_labels, preprocessing_pipeline, numtopics):
        self.corpus = corpus
        self.dictionary = dictionary
        self.doc_labels = doc_labels
        self.pipeline = preprocessing_pipeline
        self.numtopics = numtopics
        self.trained = False

    def train(self):
        # training
        self.model = LdaModel(self.corpus, id2word=self.dictionary, num_topics=self.numtopics)
        self.index = MatrixSimilarity(self.model[self.corpus])

        # flag
        self.trained = True

    def convertTextToReducedVector(self, text):
        if not self.trained:
            raise exceptions.ModelNotTrainedException()
        tokens = word_tokenize(prep.preprocess_text(text, self.pipeline))
        tokens = filter(lambda token: self.dictionary.token2id.has_key(token), tokens)
        bow = self.dictionary.doc2bow(tokens)
        return self.model[bow]

    def queryDoc(self, text):
        reducedVec = self.convertTextToReducedVector(text)
        sims = self.index[reducedVec]
        simtuples = zip(range(len(sims)), sims) if self.doc_labels==None else zip(self.doc_labels, sims)
        simtuples = sorted(simtuples, key=lambda item: item[1], reverse=True)
        return simtuples

    def show_topic(self, id):
        return self.model.show_topic(id)
Example #14
    def calculateLDADistance(self, modelName='', topNSimilar='', topicList=''):
        
        if modelName=='':
            modelName=self.__fileName
    
        if topNSimilar=='':
            topNSimilar=5       
            
        write2file = self.__destination+modelName+"_results_LDA_similarTopics.csv"
        resultsCSV = open(write2file, "wb")
        
        print 'Reading model data'
        gensimDict = corpora.Dictionary.load(self.__destination+self.__fileName+'.dict')
        ldaModel = LdaModel.load(self.__destination+modelName+'.lda',  mmap=None)
        topics = ldaModel.show_topics(num_topics=ldaModel.num_topics, num_words=len(gensimDict),formatted=False)
        #=======================================================================
        # num_topics=ldaModel.num_topics                             
        # num_words=len(gensimDict)
        #=======================================================================
        
        #=======================================================================
        # GET SIMILARITY VECTORS
        #=======================================================================
        print 'Extracting vectors'
        topicsSorted = [sorted(x,  key=lambda x: x[1]) for x in topics]
        vectors = []
            
        for topic in topicsSorted:
            vector = [item[0] for item in topic]
            vectors.append(vector)

        #=======================================================================    
        # CALCULATE SIMILARITIES BETWEEN TOPICS
        #=======================================================================
        print 'Calculating distances between LDA topics\n'
        results = []
        for topicListItem in topicList:
            distances = []
            for j in range (0, len(vectors)):
                dist = euclidean(vectors[topicListItem], vectors[j])
                #===============================================================
                # print topicListItem, j, dist
                #===============================================================
                distances.append(dist)
            results.append(distances)

        #=======================================================================
        # EXPORT TOP N SIMILAR TOPICS AND PRINT OUT QUERY TERMS
        #=======================================================================
        print 'Writing found similar topics to file\n'
        for resultItem in range(0,len(results)):
            similarLDATopics = np.argsort(results[resultItem])[::-1]
              
            for similarItem in similarLDATopics[:topNSimilar]:
                #===============================================================
                # print topicList[resultItem],similarItem
                #===============================================================
                resultsCSV.write(str(topicList[resultItem])+'; '+str(similarItem)+'; '+', '.join(x[1].lstrip().rstrip() for x in topics[similarItem][:100])+'\n\n')
            resultsCSV.write('*******************************************\n\n')
def evaluate_log(context, config):
    logger.info('Evaluating models for: %s' % config.project.name)

    model_fname = config.model_fname % ChangesetCorpus.__name__
    changeset_fname = config.corpus_fname % ChangesetCorpus.__name__
    commit_fname = config.corpus_fname % CommitLogCorpus.__name__

    try:
        commit_id2word = Dictionary.load(commit_fname + '.dict')
        commit_corpus = MalletCorpus(commit_fname,
                                     id2word=commit_id2word)
        changeset_id2word = Dictionary.load(changeset_fname + '.dict')
        changeset_corpus = MalletCorpus(changeset_fname,
                                        id2word=changeset_id2word)
    except:
        error('Corpora not built yet -- cannot evaluate')

    try:
        model = LdaModel.load(model_fname)
        logger.info('Opened previously created model at file %s' % model_fname)
    except:
        error('Cannot evaluate LDA models not built yet!')

    changeset_doc_topic = get_doc_topic(changeset_corpus, model)
    commit_doc_topic = get_doc_topic(commit_corpus, model)

    first_shared = dict()
    for id_ in commit_doc_topic:
        i = 0
        commit_topics = [topic[0] for topic in commit_doc_topic[id_]]
        try:
            changeset_topics = [topic[0] for topic in changeset_doc_topic[id_]]
        except:
            continue

        maximum = 101
        minimum = maximum

        for i, topic in enumerate(commit_topics):
            if topic in changeset_topics:
                j = changeset_topics.index(topic)
                minimum = min(minimum, max(i, j))

        for i, topic in enumerate(changeset_topics):
            if topic in commit_topics:
                j = commit_topics.index(topic)
                minimum = min(minimum, max(i, j))

        first_shared[id_] = minimum

        if minimum == maximum:
            logger.info('No common topics found for %s' % str(id_))
            del first_shared[id_]

    mean = sum(first_shared.values()) / len(first_shared)

    with open('data/evaluate-log-results.csv', 'a') as f:
        w = csv.writer(f)
        w.writerow([model_fname, mean] + list(first_shared.values()))
Example #16
File: predict.py  Project: nkman/Raiden
  def __init__(self):

    cwd = os.path.dirname(__file__)
    dictionary_path = os.path.abspath(os.path.join(cwd, 'models/dictionary.dict'))
    lda_model_path = os.path.abspath(os.path.join(cwd, 'models/lda_model_10_topics.lda'))

    self.dictionary = corpora.Dictionary.load(dictionary_path)
    self.lda = LdaModel.load(lda_model_path)
Example #17
def load_lda_model(lda_model_name=None, mallet=False):
    if os.path.isfile(lda_model_name):
        if mallet:
            lda_model = LdaMallet.load(lda_model_name)
        else:
            lda_model = LdaModel.load(lda_model_name)
        return lda_model
    return None
Example #18
 def topicsLDA(self, num_topics=10, num_iterations=10000, num_words=10):
     # LdaModel(corpus=None, num_topics=100, id2word=None, distributed=False, chunksize=2000, passes=1, update_every=1, alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001)
     try:
         lda = LdaModel(corpus=self.corpus, num_topics=num_topics, id2word=self.id2word, iterations=num_iterations)
         result = {}
         tpd = lda[self.corpus] # topic probability distribution
         for topics in tpd:
             for elem in topics:
                 if result.get(elem[0], -1) == -1:
                     words = lda.show_topic(elem[0], topn=num_words)
                     result[elem[0]] = {'weight': elem[1], 'words': words}
                 else:
                     result[elem[0]]['weight'] += elem[1]
         return result
     except Exception as e:
         print e
         return None
Example #20
    def analyzeLDA(self, modelName='', numberOfTerms=''):
        '''
        modelName -> name of model to read into memory without the extension
        '''
        
        if modelName=='':
            modelName=self.__fileName
            
        if numberOfTerms == '':
            numberOfTerms=100
            
        write2file = self.__destination+modelName+"_results_%s_SW.csv"%(numberOfTerms)
        #=======================================================================
        # allTopicsFile = self.__destination+modelName+"_results_AllTopics.csv"
        #=======================================================================
        
        resultsCSV = open(write2file, "wb")
        model = LdaModel.load(self.__destination+modelName+'.lda',  mmap=None)
         
        #and another way, only prints top words 
        for t in range(0, model.num_topics-1):
            #===================================================================
            # print 'topic {}: '.format(t) + ', '.join([v[1] for v in model.show_topic(t, numberOfTerms)])
            #===================================================================

            topicSet = [v[1].lstrip().rstrip() for v in model.show_topic(t, numberOfTerms) if v[1] not in self.__stopwords]
            listSet = set(topicSet)

            for key in self.__queryWords:  
                difference = set(topicSet).intersection(self.__queryWords[key])
                 
                if len(difference) > 0:
                    self.__overlapingTopics[key][t]=topicSet
        
        try:
            for key in self.__overlapingTopics:
                if self.__overlapingTopics[key]:
                    for topicKey in self.__overlapingTopics[key]:
                        topicTerms = [w.lstrip().rstrip() for w in self.__overlapingTopics[key][topicKey] if w not in self.__stopwords][:100]
                        #=======================================================
                        # topicTerms = [w.translate(None, ''.join(self.__chars_to_remove)) for w in topicTerms if w !='']
                        #=======================================================
                        resultsCSV.write(key+';'+str(topicKey)+';'+', '.join(topicTerms)+'\n\n')
                        print key,'\t',topicKey,'\t', topicTerms
                    resultsCSV.write('***************************************\n')
                print '*************************\n'
                
            write2fileJSON = self.__destination+modelName+"_results_%s_SW.json"%(numberOfTerms)
            with open(write2fileJSON, 'w') as fp:
                json.dump(self.__overlapingTopics, fp)
     
        except KeyError as e: 
            print e
            pass 
        
        resultsCSV.close()
Example #21
    def analyzeUniqueLDA(self, modelName='', numberOfTerms=''):
        '''
        modelName -> name of model to read into memory without the extension
        '''
        
        if modelName=='':
            modelName=self.__fileName
            
        if numberOfTerms=='':
            numberOfTerms=100
            
        write2File = self.__destination+modelName+"_results_unique_%sTerms.csv"%(numberOfTerms)
        resultsCSV = open(write2File, "wb")
        
        model = LdaModel.load(self.__destination+modelName+'.lda',  mmap=None)

         
        #and another way, only prints top words 
        for t in range(0, model.num_topics-1):
            #===================================================================
            # print 'topic {}: '.format(t) + ', '.join([v[1] for v in model.show_topic(t, 500)])
            #===================================================================
            # raw_input('prompt')
            topicSet = [v[1].lstrip().rstrip() for v in model.show_topic(t, numberOfTerms) if v[1] not in self.__stopwords]
            #===================================================================
            # print type(topicSet), topicSet
            #===================================================================
            listSet = set(topicSet)
            #print listSet
            #print type(topicSet), topicSet
            for key in self.__queryWords:  
                #print self.__queryWords[key]
                difference = set(topicSet).intersection(self.__queryWords[key])
                 
                if len(difference) > 0:
                    self.__overlapingTopics[key][t]=topicSet
        
        try:
            for key in self.__overlapingTopics:
                uniqueQueryTerms = []
                if self.__overlapingTopics[key]:
                    for topicKey in self.__overlapingTopics[key]:
                        topicTerms = [w for w in self.__overlapingTopics[key][topicKey] if w not in self.__stopwords]
                        uniqueQueryTerms.extend(topicTerms)
                        
                uniqueQueryTerms = [x for x in set(uniqueQueryTerms)]
                resultsCSV.write(key+';'+str(topicKey)+';'+', '.join(uniqueQueryTerms)+'\n\n')
                resultsCSV.write('***************************************\n')
                print key, uniqueQueryTerms
                print '*************************\n'

        except KeyError as e: 
            print e
            pass 
        
        resultsCSV.close()
Example #22
class TestLdaDiff(unittest.TestCase):
    def setUp(self):
        texts = [
            ['human', 'interface', 'computer'],
            ['survey', 'user', 'computer', 'system', 'response', 'time'],
            ['eps', 'user', 'interface', 'system'],
            ['system', 'human', 'system', 'eps'],
            ['user', 'response', 'time'],
            ['trees'],
            ['graph', 'trees'],
            ['graph', 'minors', 'trees'],
            ['graph', 'minors', 'survey'],
        ]
        self.dictionary = Dictionary(texts)
        self.corpus = [self.dictionary.doc2bow(text) for text in texts]
        self.num_topics = 5
        self.n_ann_terms = 10
        self.model = LdaModel(corpus=self.corpus, id2word=self.dictionary, num_topics=self.num_topics, passes=10)

    def testBasic(self):
        mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms)

        self.assertEqual(mdiff.shape, (self.num_topics, self.num_topics))
        self.assertEqual(len(annotation), self.num_topics)
        self.assertEqual(len(annotation[0]), self.num_topics)

    def testIdentity(self):
        for dist_name in ["hellinger", "kullback_leibler", "jaccard"]:
            mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms, distance=dist_name)

            for row in annotation:
                for (int_tokens, diff_tokens) in row:
                    self.assertEqual(diff_tokens, [])
                    self.assertEqual(len(int_tokens), self.n_ann_terms)

            self.assertTrue(np.allclose(np.diag(mdiff), np.zeros(mdiff.shape[0], dtype=mdiff.dtype)))

            if dist_name == "jaccard":
                self.assertTrue(np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype)))

    def testInput(self):
        self.assertRaises(ValueError, self.model.diff, self.model, n_ann_terms=self.n_ann_terms, distance='something')
        self.assertRaises(ValueError, self.model.diff, [], n_ann_terms=self.n_ann_terms, distance='something')
Example #23
def get_topics_lda(tokens, n_topics=10):
    """
    Using the `gensim` package for LDA.
    LDA is a little better than LSA as it provides a reasonable mixture of topics (Wikipedia).
    `gensim` is a package for topic modeling only, so for a particular topic modeling task
    it is a lighter option to install and run. It can also be run distributed and updated over an existing model.

    :param tokens: Preprocessed tokens for faster dictionary building
    :param n_topics: Number of topics to decompose data to
    :return: list() of topics
    """
    dict_file = 'resources/deals.dict'
    if not os.path.isfile(dict_file):
        print "Dictionary file does not exist. Creating one"
        dictionary = Dictionary(tokens)
        freq1 = [id for id, freq in dictionary.dfs.iteritems() if freq == 1]
        dictionary.filter_tokens(freq1)
        dictionary.compactify()
        dictionary.save(dict_file)
    dictionary = Dictionary.load(dict_file)
    # print dictionary

    corpus_file = 'resources/deals.mm'
    if not os.path.isfile(corpus_file):
        print "Corpus file does not exist. Creating one"
        corpus = [dictionary.doc2bow(token) for token in tokens]
        MmCorpus.serialize(corpus_file, corpus)
    mm = MmCorpus(corpus_file)
    # print mm
    # tfidf = TfidfModel(mm)
    # corpus_tfidf = tfidf[mm]

    lda = LdaModel(corpus=mm, id2word=dictionary, num_topics=n_topics, update_every=1, chunksize=1000,
                   passes=1)
    topics = []
    for i in range(0, n_topics):
        words = lda.print_topic(i).split('+')
        topic = []
        for word in words:
            score, w = word.split('*')
            topic.append((w, score))
        topics.append(topic)
    return topics
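The docstring above also notes that a gensim model can be "updated over an existing model"; a minimal self-contained sketch of that online update, using hypothetical documents:

from gensim.corpora import Dictionary
from gensim.models import LdaModel

# Hypothetical tokenized deals, only to illustrate LdaModel.update().
docs = [['deal', 'coupon', 'discount'], ['pizza', 'deal'], ['coupon', 'pizza']]
dictionary = Dictionary(docs)
lda = LdaModel([dictionary.doc2bow(d) for d in docs], id2word=dictionary, num_topics=2)

# Fold new documents into the already-trained model instead of retraining from scratch.
new_docs = [['discount', 'pizza', 'coupon']]
lda.update([dictionary.doc2bow(d) for d in new_docs])
print(lda.print_topics(2))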
def create_model(config, Kind):
    model_fname = config.model_fname % Kind.__name__
    corpus_fname = config.corpus_fname % Kind.__name__

    if not os.path.exists(model_fname):
        try:
            id2word = Dictionary.load(corpus_fname + '.dict')
            corpus = MalletCorpus(corpus_fname, id2word=id2word)
            logger.info('Opened previously created corpus: %s' % corpus_fname)
        except:
            error('Corpora for building file models not found!')

        file_model = LdaModel(corpus,
                              id2word=corpus.id2word,
                              alpha=config.alpha,
                              passes=config.passes,
                              num_topics=config.num_topics)

        file_model.save(model_fname)
def get_keywords(threshold=0.01, model_path='result/model.lda'):
    lda_model = LdaModel.load(model_path)
    topic_num = lda_model.num_topics
    keywords = set()
    for topic_id in range(topic_num):
        topic = lda_model.state.get_lambda()[topic_id]
        topic = topic / topic.sum()  # normalize to probability dist
        signif_word_ids = np.where(topic > threshold)[0]
        keywords = keywords.union([lda_model.id2word[word_id] for word_id in signif_word_ids])

    return keywords
Example #26
File: dmp.py  Project: npiaq/dmp
    def train(self):
        '''Train the model, producing two objects: the dictionary (dic) and the model (lda).

        dic: stores the words; each word has an id, and a word can be retrieved via dic[id].
        lda: the model, holding the list of topics. Each topic has an id, and the list of
             words in a topic can be retrieved via lda.print_topic(id).
        '''
        docs = self.__load_corpus()
        self.dic = Dictionary(docs)
        bow = [self.dic.doc2bow(doc) for doc in docs]
        self.lda = LdaModel(bow, id2word=self.dic,
                            num_topics=self.topic_num)
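A short usage sketch of the two objects the docstring describes (word lookup via dic[id], topic words via lda.print_topic(id)); the corpus below is a hypothetical stand-in for __load_corpus():

from gensim.corpora import Dictionary
from gensim.models import LdaModel

docs = [['topic', 'model', 'word'], ['word', 'id', 'lookup']]  # hypothetical documents
dic = Dictionary(docs)
lda = LdaModel([dic.doc2bow(d) for d in docs], id2word=dic, num_topics=2)

print(dic[0])              # fetch a word by its id
print(lda.print_topic(0))  # word distribution of topic 0, e.g. '0.17*"word" + 0.17*"id" + ...'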
Example #27
    def __init__(self):
        # current_working_dir = '/home/etu/eason/nodejs/Semantic_Aware_RecSys'
        current_working_dir = '.'
        os.chdir(current_working_dir)
        lda_model_path = "./LDAmodel/final_ldamodel"

        self.lda = LdaModel.load(lda_model_path)
        self.no_of_recommendation = 10
        self.omit_topic_below_this_fraction = 0.1
        self.mapping = self.__init_mapping()
        self.linkMapping = self.__init_Link_mapping()
        self.doc_topic_matrix = loadPickleFile('doc_topic_matrix')
Example #28
 def getAllTopics(self, modelName='', numberOfTerms=100):
     '''
     modelName -> name of model to read into memory without the extension
     '''
     
     returningData = {}
     
     if modelName=='':
         modelName=self.__fileName
         
     model = LdaModel.load(self.__destination+modelName+'.lda',  mmap=None)
     
     return model.show_topics(num_topics=model.num_topics,num_words=numberOfTerms, formatted=False)
def get_topics(cv, train_data):
    """
    Uses gensim to perform topic modeling.

    Parameters
    ---------
    cv: A TfidfVectorizer instance.
    train_data: A scipy csr_matrix.

    Returns
    -------
    A list of strings (functions of the most important terms in each topic).
    """

    td_gensim = Sparse2Corpus(train_data, documents_columns=False)
    tmp_dct = dict((idv, word) for word, idv in cv.vocabulary_.items())
    dct = Dictionary.from_corpus(td_gensim, id2word=tmp_dct)

    lda = LdaModel(corpus=td_gensim, id2word=dct, num_topics=20)
    topics = lda.top_topics(corpus=td_gensim, num_words=5)

    return topics
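A hedged usage sketch for get_topics, assuming a scikit-learn TfidfVectorizer fitted on a few hypothetical documents (matching the cv and train_data parameters documented above):

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['the cat sat on the mat', 'dogs chase cats', 'matrix factorization of text data']  # hypothetical
cv = TfidfVectorizer()
train_data = cv.fit_transform(docs)  # scipy csr_matrix with one row per document

for terms, coherence in get_topics(cv, train_data):
    # each topic is a list of (probability, word) pairs plus its coherence score
    print(coherence, terms[:3])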
Example #30
    def train(self):
        data = []
        entity2id = {}
        id2entity = []

        for obj in self.data:
            doc = []
            obj_sents = obj["text_data"]
            entity = obj["prod"]
            if entity not in entity2id:
                entity2id[entity] = len(entity2id)
                id2entity.append(entity)
            doc_id = entity2id[entity]

            for obj_sent in obj_sents:
                for pair in obj_sent:
                    if pair[0] >= 0:
                        doc.append((pair[0], doc_id))
            data.append(doc)



        self.ldamodel = LdaModel(corpus=data, id2word=self.idx2word, num_topics=self.n_topic)

        f_entity = open("lda/prod.txt", "w")
        f_model = open("lda/model.txt", "w")
        f_model.write(str(len(entity2id)))
        f_model.write(" ")
        f_model.write(str(self.n_topic))
        f_model.write("\n")

        for entity in id2entity:
            f_entity.write(entity)
            f_entity.write("\n")

            f_model.write(entity)
            f_model.write(" ")

            distr = self.ldamodel.get_document_topics(data[entity2id[entity]], minimum_phi_value=0, minimum_probability=0)
            distr = [pair[1] for pair in distr]

            for prod in distr:
                f_model.write(str(prod))
                f_model.write(" ")

            f_model.write("\n")

        self.ldamodel.save("lda/model_200")
Example #31
 def LDALoad(self):
     self.ldamodel = LdaModel.load("fixed_time_window_lda.model")
     self.dictionary = Dictionary.load("lda_dictionary.model")
     print(self.dictionary)
Example #32
from gensim import models

train = []
stopwords = codecs.open('../../corpus/English_StopWords.txt',
                        'r',
                        encoding='utf8').readlines()
stopwords = [w.strip() for w in stopwords]
fp = codecs.open('../../corpus/test.lsnp', 'r', encoding='utf8')
for line in fp:
    line = line.split()
    train.append([w for w in line if w not in stopwords])
print(train)
dictionary = Dictionary(train)
corpus = [dictionary.doc2bow(text) for text in train]
print(corpus[0])
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)
# print the word distributions of the first 20 topics
print(lda.print_topics(20))
# print the word distribution of the topic with id 20
print(lda.print_topic(20))
# save / load the model
lda.save('zhwiki_lda.model')
lda = models.ldamodel.LdaModel.load('zhwiki_lda.model')

# tt = 'loss of energy , motivation and no interest in work anymore - be it time to through it all in'
#
# test_doc = list(i for i in tt.split())
#
# doc_bow = id2word.doc2bow(test_doc)      # convert the document to BoW
# doc_lda = lda[doc_bow]                   # topic distribution of the new document
# # print the topic distribution of the new document
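A working version of the commented-out inference block above, reusing the stopwords, dictionary and lda objects defined in this script (the sample sentence is hypothetical):

# infer the topic distribution of a new, unseen document
tt = 'loss of energy and no interest in work anymore'
test_doc = [w for w in tt.split() if w not in stopwords]
doc_bow = dictionary.doc2bow(test_doc)  # convert the document to BoW
doc_lda = lda[doc_bow]                  # topic distribution of the new document
print(doc_lda)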
Example #33
def gensim_lda_topic_modelling(path,
                               documents,
                               num_of_topics=6,
                               passes=50,
                               verbose=True,
                               plotTopicsResults=True):
    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    if verbose:
        print("Cleaned documents:\n", documents)
        print("\nDictionary:\n", dictionary)
        print("\nCorpus in BoW form: \n", corpus)
    start = time.time()
    ldamodel = LdaModel(corpus=corpus,
                        num_topics=num_of_topics,
                        passes=passes,
                        id2word=dictionary)
    end = time.time()
    print("Completion time for building LDA model: %.3f s = %.3f min" %
          ((end - start), (end - start) / 60.0))

    ldatopics = ldamodel.show_topics(formatted=False)
    ldatopics_words = [[[word, prob] for word, prob in topic]
                       for topicid, topic in ldatopics]

    if verbose:
        print("\nList of words associated with each topic:\n")
        for i in range(len(ldatopics_words)):
            print("\nTopic %d:\n" % i)
            for w, p in ldatopics_words[i]:
                print(p, " - ", w)

    if plotTopicsResults:
        plot_top_10_words_per_topic(path,
                                    ldatopics_words,
                                    num_topics=num_of_topics,
                                    num_top_words=10)

    all_documents_topics = [
        (doc_topics, word_topics, word_phis)
        for doc_topics, word_topics, word_phis in ldamodel.get_document_topics(
            corpus, per_word_topics=True)
    ]
    all_doc_topics = []
    for i in range(len(all_documents_topics)):
        doc_topics, word_topics, phi_values = all_documents_topics[i]
        all_doc_topics.append(
            [doc_topics[i][1] for i in range(len(doc_topics))])
        if verbose:
            print('Document topics:', doc_topics)
            print('Word topics:', word_topics)
            print('Phi values:', phi_values)
            print('-------------- \n')

    if plotTopicsResults:
        plot_share_of_topics(path, all_doc_topics, no_random_tweets=10)

    # Plot words coloured differently depending on the topic
    for doc in documents[0:100]:
        if len(doc) > 4:
            color_words(ldamodel, doc)
Example #34
    # we add some words to the stop word list
    texts, article = [], []
    for w in doc:
        # if it's not a stop word or punctuation mark, add it to our article!
        if w.text != '\n' and not w.is_stop and not w.is_punct and not w.like_num and w.text != 'I':
            # we add the lematized version of the word
            article.append(w.lemma_)
        # if it's a new line, it means we're onto our next document
        if w.text == '\n':
            texts.append(article)
            article = []
    # for i in texts:
    #     print(i)

    bigram = gensim.models.Phrases(texts)
    texts = [bigram[line] for line in texts]
    # print(texts)
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    if len(corpus) == 0:
        print("fadssf")
    else:
        ldamodel = LdaModel(corpus=corpus, num_topics=3, id2word=dictionary)

        for i in ldamodel.show_topics():
            ans = ' '.join(i[1].split(" + "))
            # ans=' '.join(ans.split('*'))
            ans = ''.join(ans.split('"'))

            print(a + "~" + ans)
Example #35
class LDATagger:
    _lda_model = None
    _dictionary = None
    _lda_model_path = None
    _dictionary_path = None
    DEFAULT_MODEL_PATH = os.path.join(os.path.dirname(__file__), "model")
    DEFAULT_NUM_TOPICS = 1000

    def __init__(self,
                 model_path=DEFAULT_MODEL_PATH,
                 num_topics=DEFAULT_NUM_TOPICS,
                 lock=threading.Lock()):
        self.save_model_lock = lock

        if os.path.isfile(model_path):
            raise Exception("Invalid Model Path; Should Be a Directory")
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        self._lda_model_path = os.path.join(model_path, "lda.model")
        self._dictionary_path = os.path.join(model_path, "tokens.dict")
        self.num_topics = num_topics
        self.model_folder_lock = FileLock(model_path)

    def topics_for_documents(self, doc_tokens_map):
        self.check_and_load_model()
        doc_topics_map = defaultdict(list)
        for document_id, document_tokens in doc_tokens_map.iteritems():
            doc_topics_map[document_id] = self.topics_for_document(
                document_tokens)
        return doc_topics_map

    def topics_for_document(self, tokens):
        self.check_and_load_model()
        bow_tokens = self._dictionary.doc2bow(tokens)
        topics = self._lda_model[bow_tokens]
        return topics

    def build_topics(self, tokens_list):
        self._dictionary = Dictionary(tokens_list)
        corpus = [
            self._dictionary.doc2bow(document_tokens)
            for document_tokens in tokens_list
        ]
        self._lda_model = LdaModel(corpus=corpus,
                                   id2word=self._dictionary,
                                   num_topics=self.num_topics,
                                   passes=100)
        self.save_model()

    def save_model(self, sleep_for_test=False, mock_datastruct=None):
        self.save_model_lock.acquire()
        self.model_folder_lock.acquire()
        if mock_datastruct: mock_datastruct.acquire()
        if sleep_for_test:
            import time
            time.sleep(1)
        print("Acquired Lock")
        try:
            self._lda_model.save(self._lda_model_path)
            self._dictionary.save(self._dictionary_path)
        finally:
            print("Released Lock")
            if mock_datastruct: mock_datastruct.release()
            self.model_folder_lock.release()
            self.save_model_lock.release()

    def check_and_load_model(self):
        if self._lda_model and self._dictionary:
            return
        if os.path.exists(self._lda_model_path):
            self._lda_model = LdaModel.load(self._lda_model_path)
        else:
            raise Exception("LDA Model Not found in the path")
        if os.path.exists(self._dictionary_path):
            self._dictionary = Dictionary.load(self._dictionary_path)
        else:
            raise Exception("Tokens Dictionary Not found in the path")

    def update_model(self, tokens_list):
        self.check_and_load_model()
        corpus = [
            self._dictionary.doc2bow(document_tokens)
            for document_tokens in tokens_list
        ]
        self._lda_model.update(corpus=corpus)
        self.save_model()

    def build_or_update_model(self, tokens_list):
        if not self.does_model_exist():
            self.build_topics(tokens_list)
        else:
            self.update_model(tokens_list)

    def does_model_exist(self):
        if os.path.exists(self._lda_model_path) and os.path.exists(
                self._dictionary_path):
            return True
        return False

    def get_model(self):
        self.check_and_load_model()
        model_hash = {
            "lda_model": cPickle.dumps(self._lda_model),
            "dictionary": cPickle.dumps(self._dictionary)
        }
        return model_hash

    def restore_model(self, model_hash):
        self._lda_model = cPickle.loads(
            model_hash["lda_model"].encode('utf-8'))
        self._dictionary = cPickle.loads(
            model_hash["dictionary"].encode('utf-8'))
        self.save_model()

    def topics_to_tokens(self):
        topics_tokens_map = defaultdict(list)
        if not self.does_model_exist():
            return []
        else:
            model = self._lda_model
            topics_to_tokens = model.show_topics(
                topics=self.DEFAULT_NUM_TOPICS,
                topn=25,
                log=False,
                formatted=False)

            for topic_id, tokens in enumerate(topics_to_tokens):
                topics_tokens_map[topic_id] = self.list_of_tuples_to_hash(
                    tokens)

            return topics_tokens_map

    def list_of_tuples_to_hash(self, tokens):
        tokens_hash = defaultdict(float)
        for token_probability, token in tokens:
            tokens_hash[token] = token_probability
        return tokens_hash
    dtm = vectorizer.fit_transform(docs)
    sparse.save_npz(dtm_path, dtm)
    tokens = vectorizer.get_feature_names()
    vocab_size = len(tokens)
    pd.Series(tokens).to_csv(token_path, index=False)

    id2word = pd.Series(tokens).to_dict()
    corpus = Sparse2Corpus(dtm, documents_columns=False)

    # dictionary = Dictionary.from_corpus(corpus=train_corpus, id2word=id2word)

    # for n_topics in [3, 5, 7, 10, 15, 20, 25, 30, 35, 40, 45, 50, 60, 75, 100]:
    for n_topics in [5, 10, 15, 20, 30]:
        print(n_topics, end=' ', flush=True)
        lda = LdaModel(corpus=corpus, num_topics=n_topics, id2word=id2word)

        doc_topics = pd.DataFrame()
        for i, topics in enumerate(lda.get_document_topics(corpus)):
            doc_topics = pd.concat([
                doc_topics,
                pd.DataFrame(topics, columns=['topic', 'value']).assign(doc=i)
            ])
        doc_topics.to_csv(model_path / f'doc_topics_{key}_{n_topics}.csv',
                          index=False)

        model_file = datapath((model_path / f'{key}_{n_topics}').resolve())
        lda.save(model_file)
        train_lda = LdaModel(corpus=train_corpus,
                             num_topics=n_topics,
                             id2word=pd.Series(train_tokens).to_dict())
id2word_nouns = dict_nouns.id2token

# Display
#pp.pprint(id2word_nouns)

# Display results of Corpus
# print(corpus_nouns)
# print('Number of unique tokens: {}'.format(len(dict_nouns)))
# print('Number of documents: {}'.format(len(corpus_nouns)))

# TODO: save corpus and dictionary to disk and load them back
# save to path_lda_data

lda_nouns = LdaModel(corpus=corpus_nouns,
                     id2word=id2word_nouns,
                     num_topics=10,
                     iterations=300,
                     eval_every=1)

lda_nouns.print_topics(-1)

# Print the Keyword in the 10 topics
pp.pprint(lda_nouns.print_topics())

########################
########################

#u_mass coherence measure
from gensim.models.coherencemodel import CoherenceModel
lda_nouns_cm = CoherenceModel(model=lda_nouns,
                              corpus=corpus_nouns,
def run_lda_with_entropy(industry_lda, token_dict, max_k=5):
    common_dictionary = corpora.Dictionary(industry_lda)
    common_corpus = [common_dictionary.doc2bow(text) for text in industry_lda]
    ldamodel = LdaModel(corpus=common_corpus, num_topics=max_k + 1, id2word=common_dictionary)
    result = ldamodel.print_topics(num_topics=max_k + 1, num_words=10)
    center_lst = []
    for i in range(max_k + 1):
        result2 = ldamodel.get_topic_terms(topicid=i)
        sum_word = 0
        center = 0
        length = len(result2)
        for v in result2:
            if common_dictionary[v[0]] in token_dict.keys():
                center += token_dict[common_dictionary[v[0]]]
        center_lst.append(center / length)

    industry_with_center_distance = []
    sum_temp5_lst = []
    for i in industry_lda:
        temp2 = []
        for k in i:
            temp = []
            if k in token_dict.keys():
                for j in center_lst:
                    temp.append((cal_sim(np.array(token_dict[k]), j)))
            if len(temp) > 0:
                temp2.append(temp)
        if len(temp2) > 0:
            temp3 = np.array(temp2)
            temp4 = np.mean(temp3, axis=0)
            temp5 = np.sum(temp3)
        else:
            temp4 = []
            for i in range(0, max_k + 1):
                temp4.append(0.0)
            temp5 = temp4

        industry_with_center_distance.append(temp4)
        sum_temp5_lst.append(temp5)

    entro_result_final = {}

    for number, i in enumerate(industry_lda):
        entro_result_2 = []
        for k in i:
            entro_result = []
            if k in token_dict.keys():
                for j in center_lst:
                    temp = cal_sim(np.array(token_dict[k]), j)
                    temp_value = temp / sum_temp5_lst[number]
                    entro_result.append(temp_value * math.log(temp_value))
            entro_result_2.append(entro_result)
        if len(entro_result_2) > 0:
            temp5 = np.zeros(shape=(1, max_k + 1), dtype=float)
            for w in entro_result_2:
                if len(w) > 0:
                    temp4 = np.array(w)
                    temp5 += temp4

            list_temp5 = list(temp5[0])
            entro_result_final[number] = list_temp5.index(max(list_temp5))

    final_result = defaultdict(list)
    for i in range(0, max_k + 1):
        for key, value in entro_result_final.items():
            if value == i:
                final_result[i].append(industry_lda[key])
Example #39
import numpy as np
from gensim.models import LdaMulticore as LdaModel
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train LDA model')
    parser.add_argument('tweet_file', help=('path to twitter downloader dump where each line is a cleaned tweet'))
    parser.add_argument('out_dir', help=('path output file to save the model'))
    parser.add_argument('--include_list', nargs='?', default=None)  # see preprocess for default list
    parser.add_argument('--exclude_list', nargs='?', default=None)  # see preprocess for default list

    parser.add_argument('num_topics', type=int)
    parser.add_argument('--npasses', type=int, default=50)
    parser.add_argument('--decay', type=float, default=.5)
    parser.add_argument('--chunksize', type=int, default=2000)

    lda_filename    = 'lda/middle_east_100.lda'
    args = parser.parse_args()
    corpus = corpora.MmCorpus('lda/lda_middle_east.mm')
    dictionary = corpora.Dictionary.load('lda/lda_middle_east.dict')

    lda = LdaModel(corpus, num_topics=100,
                   alpha=1./100, eta=.2, chunksize=10000,
                   workers=5, passes=100, decay=0.75,
                   id2word=dictionary)

    print('Saving model')
    lda.print_topics()
    lda.save(lda_filename)
    print("lda saved in %s " % lda_filename)
Example #40
def main():
    global args

    taskname = args.taskname
    no_below = args.no_below
    no_above = args.no_above
    num_iters = args.num_iters
    n_topic = args.n_topic
    n_cpu = cpu_count() - 2 if cpu_count() > 2 else 2
    bkpt_continue = args.bkpt_continue
    use_tfidf = args.use_tfidf
    rebuild = args.rebuild
    auto_adj = args.auto_adj

    docSet = DocDataset(taskname,
                        no_below=no_below,
                        no_above=no_above,
                        rebuild=rebuild)
    if auto_adj:
        no_above = docSet.topk_dfs(topk=20)
        docSet = DocDataset(taskname,
                            no_below=no_below,
                            no_above=no_above,
                            rebuild=rebuild,
                            use_tfidf=False)

    model_name = 'LDA'
    msg = 'bow' if not use_tfidf else 'tfidf'
    run_name = '{}_K{}_{}_{}'.format(model_name, n_topic, taskname, msg)
    if not os.path.exists('logs'):
        os.mkdir('logs')
    if not os.path.exists('ckpt'):
        os.mkdir('ckpt')
    loghandler = [
        logging.FileHandler(filename=f'logs/{run_name}.log', encoding="utf-8")
    ]
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(message)s',
                        handlers=loghandler)
    logger = logging.getLogger(__name__)

    if bkpt_continue:
        print('loading model ckpt ...')
        lda_model = gensim.models.ldamodel.LdaModel.load(
            'ckpt/{}.model'.format(run_name))

    # Training
    print('Start Training ...')

    if use_tfidf:
        tfidf = TfidfModel(docSet.bows)
        corpus_tfidf = tfidf[docSet.bows]
        #lda_model = LdaMulticore(list(corpus_tfidf),num_topics=n_topic,id2word=docSet.dictionary,alpha='asymmetric',passes=num_iters,workers=n_cpu,minimum_probability=0.0)
        lda_model = LdaModel(list(corpus_tfidf),
                             num_topics=n_topic,
                             id2word=docSet.dictionary,
                             alpha='asymmetric',
                             passes=num_iters)
    else:
        #lda_model = LdaMulticore(list(docSet.bows),num_topics=n_topic,id2word=docSet.dictionary,alpha='asymmetric',passes=num_iters,workers=n_cpu)
        lda_model = LdaModel(list(docSet.bows),
                             num_topics=n_topic,
                             id2word=docSet.dictionary,
                             alpha='asymmetric',
                             passes=num_iters)

    save_name = f'./ckpt/LDA_{taskname}_tp{n_topic}_{time.strftime("%Y-%m-%d-%H-%M", time.localtime())}.ckpt'
    lda_model.save(save_name)

    # Evaluation
    print('Evaluation ...')
    topic_words = get_topic_words(model=lda_model,
                                  n_topic=n_topic,
                                  topn=15,
                                  vocab=docSet.dictionary)

    (cv_score, w2v_score, c_uci_score,
     c_npmi_score), _ = calc_topic_coherence(topic_words,
                                             docs=docSet.docs,
                                             dictionary=docSet.dictionary)

    topic_diversity = calc_topic_diversity(topic_words)

    result_dict = {
        'cv': cv_score,
        'w2v': w2v_score,
        'c_uci': c_uci_score,
        'c_npmi': c_npmi_score
    }
    logger.info('Topics:')

    for idx, words in enumerate(topic_words):
        logger.info(f'##{idx:>3d}:{words}')
        print(f'##{idx:>3d}:{words}')

    for measure, score in result_dict.items():
        logger.info(f'{measure} score: {score}')
        print(f'{measure} score: {score}')

    logger.info(f'topic diversity: {topic_diversity}')
    print(f'topic diversity: {topic_diversity}')
Example #41
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and training the LDA model on the document-term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=10, id2word=dictionary, passes=50)
print('used: {:.2f}s'.format(time() - start))
print(ldamodel.print_topics(num_topics=2, num_words=4))

for i in ldamodel.print_topics():
    for j in i:
        print(j)

ldamodel.save(MODEL_FILE)

from gensim.models import LdaModel
loading = LdaModel.load(MODEL_FILE)

print(loading.print_topics(num_topics=2, num_words=4))


def pre_new(doc):
    one = cleaning(doc).split()
    two = dictionary.doc2bow(one)
    return two


pre_new('new article that to be classified by trained model!')

belong = loading[(
    pre_new('new article that to be classified by trained model!'))]
print(belong)
Example #42
import logging

from gensim.models import LdaModel
from gensim import corpora


logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

dictionary_path = "topics_labels/models/dictionary.dict"
corpus_path = "topics_labels/models/corpus.lda-c"
lda_num_topics = 25
lda_model_path = "topics_labels/models/lda_model_50_topics.lda"

dictionary = corpora.Dictionary.load(dictionary_path)
corpus = corpora.BleiCorpus(corpus_path)
lda = LdaModel.load(lda_model_path)

topics = [5,10,15,20,25]
for num_topics in topics:
	print "number of topics:", num_topics
	i = 0
	for topic in lda.show_topics(num_topics):
	    print '#' + str(i) + ': ' + topic
	    i += 1

Example #43
"""
Classifies all articles according to the LDA model
"""
import os
from multiprocessing.pool import Pool

import psycopg2
from gensim import corpora
from gensim.models import LdaModel

from src.features.text import article_tokenizer
from src.visualization.console import StatusVisualization

dictionary = corpora.Dictionary.load(os.environ['MODEL_PATH'] +
                                     'articles.dict')
model = LdaModel.load(os.environ['MODEL_PATH'] + 'articles.lda')


def classify(article):
    source_url, text = article
    tokens = article_tokenizer.tokenize(text)
    """
    doc_bow = [dictionary.doc2bow(token) for token in [tokens]]
    doc_lda = model.get_document_topics(doc_bow,
                                        minimum_probability=None,
                                        minimum_phi_value=None,
                                        per_word_topics=False)
    topics = doc_lda[0]
    topics_ret = dict()
    for topic in topics:
        print(topics)
for i, (min_df, max_df, binary) in enumerate(dtm_params, 1):

    print(min_df, max_df, binary)

    vocab_path = experiment_path / str(min_df) / str(max_df) / str(int(binary))
    try:
        dtm = sparse.load_npz(vocab_path / f'dtm.npz')
        tokens = pd.read_csv(vocab_path / f'tokens.csv', header=None, squeeze=True)
    except FileNotFoundError:
        print('missing')
        continue
    corpus = Sparse2Corpus(dtm, documents_columns=False)
    id2word = tokens.to_dict()
    dictionary = Dictionary.from_corpus(corpus, id2word)

    for num_topics in topics:
        print(num_topics, end=' ')
        model_path = vocab_path / str(num_topics) / str(passes) / 'lda'
        if model_path.exists():
            lda = LdaModel.load(model_path.as_posix())
        else:
            continue
        start = time()
        vis = prepare(lda, corpus, dictionary, mds='tsne')
        terms = vis.topic_info
        terms = terms[terms.Category != 'Default']
        pyLDAvis.save_html(vis, (model_path / 'ldavis.html').as_posix())
        terms.to_csv(model_path / 'relevant_terms.csv', index=False)
        duration = time() - start
        print(format_time(duration))
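# Hedged sketch of the format_time helper used above (assumption: it renders a
# duration in seconds as mm:ss; the project's real helper may differ).
def format_time_sketch(seconds):
    minutes, secs = divmod(int(seconds), 60)
    return f'{minutes:02d}:{secs:02d}'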
Example #45
    type=int,
)
args = parser.parse_args()

print('Reading dataset')
data = pd.read_parquet(args.input_filepath)

print('Normalizing text')
data.text = data.text.map(nlp.normalize_text)

print('Building docterm matrix')
docterm, dictionary = nlp.get_docterm_matrix(data.text)
doclength = np.array([sum(x[1] for x in doc) for doc in docterm])
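# Hedged sketch of what a helper like nlp.get_docterm_matrix could do, assuming it
# builds a gensim Dictionary and bag-of-words corpus from the normalized texts
# (the project's actual implementation may differ).
from gensim.corpora import Dictionary

def get_docterm_matrix_sketch(texts):
    tokenized = [text.split() for text in texts]
    dictionary = Dictionary(tokenized)
    docterm = [dictionary.doc2bow(tokens) for tokens in tokenized]
    return docterm, dictionary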

print('Training LDA model')
lda = LdaModel(docterm, num_topics=args.n_topics)

print('Getting document topics')
doctopics = corpus2csc([lda.get_document_topics(doc) for doc in docterm])
termtopics = lda.get_topics()

print('Computing topic volume time series')
topic_volume_over_time = nlp.get_topic_volume_over_time(data, doctopics, 20)

print('Computing topic coordinates')
topic_coordinates = nlp.get_topic_coordinates(termtopics, method='mds')
topic_proportions = nlp.get_topic_proportions(doctopics, doclength)

print('Computing term frequencies')
term_frequencies = nlp.get_term_frequencies(docterm, termtopics,
                                            topic_proportions, doclength)
class TestLdaDiff(unittest.TestCase):
    def setUp(self):
        self.dictionary = common_dictionary
        self.corpus = common_corpus
        self.num_topics = 5
        self.n_ann_terms = 10
        self.model = LdaModel(corpus=self.corpus,
                              id2word=self.dictionary,
                              num_topics=self.num_topics,
                              passes=10)

    def testBasic(self):
        # test for matrix case
        mdiff, annotation = self.model.diff(self.model,
                                            n_ann_terms=self.n_ann_terms)

        self.assertEqual(mdiff.shape, (self.num_topics, self.num_topics))
        self.assertEqual(len(annotation), self.num_topics)
        self.assertEqual(len(annotation[0]), self.num_topics)

        # test for diagonal case
        mdiff, annotation = self.model.diff(self.model,
                                            n_ann_terms=self.n_ann_terms,
                                            diagonal=True)

        self.assertEqual(mdiff.shape, (self.num_topics, ))
        self.assertEqual(len(annotation), self.num_topics)

    def testIdentity(self):
        for dist_name in ["hellinger", "kullback_leibler", "jaccard"]:
            # test for matrix case
            mdiff, annotation = self.model.diff(self.model,
                                                n_ann_terms=self.n_ann_terms,
                                                distance=dist_name)

            for row in annotation:
                for (int_tokens, diff_tokens) in row:
                    self.assertEqual(diff_tokens, [])
                    self.assertEqual(len(int_tokens), self.n_ann_terms)

            self.assertTrue(
                np.allclose(np.diag(mdiff),
                            np.zeros(mdiff.shape[0], dtype=mdiff.dtype)))

            if dist_name == "jaccard":
                self.assertTrue(
                    np.allclose(mdiff, np.zeros(mdiff.shape,
                                                dtype=mdiff.dtype)))

            # test for diagonal case
            mdiff, annotation = \
                self.model.diff(self.model, n_ann_terms=self.n_ann_terms, distance=dist_name, diagonal=True)

            for (int_tokens, diff_tokens) in annotation:
                self.assertEqual(diff_tokens, [])
                self.assertEqual(len(int_tokens), self.n_ann_terms)

            self.assertTrue(
                np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype)))

            if dist_name == "jaccard":
                self.assertTrue(
                    np.allclose(mdiff, np.zeros(mdiff.shape,
                                                dtype=mdiff.dtype)))

    def testInput(self):
        self.assertRaises(ValueError,
                          self.model.diff,
                          self.model,
                          n_ann_terms=self.n_ann_terms,
                          distance='something')
        self.assertRaises(ValueError,
                          self.model.diff, [],
                          n_ann_terms=self.n_ann_terms,
                          distance='something')
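# Outside a test, diff() can also compare two separately trained models; a hedged
# sketch (the model names here are illustrative, not defined in this file):
#   mdiff, annotation = lda_2020.diff(lda_2021, distance='jaccard', num_words=50)
#   mdiff[i, j] is then the distance between topic i of lda_2020 and topic j of lda_2021.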
Example #47
class TopicModel:
    def __init__(self, topicCollection, string):
        if string.lower() == "nmf":
            self.model = "NMF"
            print("Topic Extraction Model: sklearn.NMF")
        else:
            self.model = "LDA"
            print("Topic Extraction Model: gensim.LDAModel")
        self.stemmer = PorterStemmer()

    #Train the LDA model on the current discussion
    def train(self, sentences):
        if self.model == "NMF":
            self.sentenceData = []
            for sentence in sentences:
                self.sentenceData.append(preprocess(sentence, self.stemmer))
            self.tfidf_vectorizer = TfidfVectorizer(
                max_features=1500,
                ngram_range=(1, 2),
                preprocessor=' '.join,
                stop_words='english'
            )
            tfidf = self.tfidf_vectorizer.fit_transform(self.sentenceData)
            self.nmf = NMF(n_components=2, solver="mu")
            self.W = self.nmf.fit_transform(tfidf)
            self.H = self.nmf.components_
        else:
            sentenceData = []
            for sentence in sentences:
                sentenceData.append(preprocess(sentence, self.stemmer))
            self.dictionary = Dictionary(sentenceData)
            bow_corpus = [self.dictionary.doc2bow(doc) for doc in sentenceData]
            self.lda_model = LdaModel(bow_corpus, num_topics=2, id2word=self.dictionary, passes=10)

    #Classify a given sentence to one of the topics found in training
    def classify(self, sentence):
        if self.model == "NMF":
            index = self.sentenceData.index(preprocess(sentence, self.stemmer))
            topic = self.W.argmax(axis=1)[index]
            return "Topic " + str(topic)
        else:
            bow_vector = self.dictionary.doc2bow(preprocess(sentence, self.stemmer))
            return "Topic " + str(sorted(self.lda_model[bow_vector], key=lambda tup: -1*tup[1])[0][0])

    #Shows the terms of a given topic
    def showTerms(self, topic):
        if self.model == "NMF":
            terms = ""
            top_features = []
            tfidf_feature_names = self.tfidf_vectorizer.get_feature_names()
            for topic_idx, topicID in enumerate(self.H):
                if topic_idx == int(topic.split(' ')[-1]):
                    top_features_ind = topicID.argsort()[:-20 - 1:-1]
                    top_features = [tfidf_feature_names[i] for i in top_features_ind]
                    weights = topicID[top_features_ind]
            for term in top_features:
                terms += term + ", "
            print(topic.split(' ')[-1] + " " + terms)
            return terms
        else:
            terms = ""
            topic = int(topic.split(" ")[-1])
            for term in self.lda_model.show_topic(topic):
                terms += term[0] + ", "
            print(str(topic) + " " + terms)
            return terms

    #Gets the probability or the coefficient of the given term in the topic
    def getCoeff(self, topic, term):
        if self.model == "NMF":
            weights = []
            top_features = []
            tfidf_feature_names = self.tfidf_vectorizer.get_feature_names()
            for topic_idx, topicID in enumerate(self.H):
                if topic_idx == int(topic.split(' ')[-1]):
                    top_features_ind = topicID.argsort()[:-20 - 1:-1]
                    top_features = [tfidf_feature_names[i] for i in top_features_ind]
                    weights = topicID[top_features_ind]
            for coeff, terms in zip(weights, top_features):
                if terms == term:
                    return coeff
        else:
            topic = int(topic.split(" ")[-1])
            for terms in self.lda_model.show_topic(topic):
                if terms[0] == term:
                    return terms[1]

    #Shows all the topics found in training
    def showTopics(self):
        if self.model == "NMF":
            ret = []
            for topic_idx, topicID in enumerate(self.H):
                ret.append("Topic " + str(topic_idx))
            return ret
        else:
            topics = self.lda_model.print_topics()
            ret = []
            for topic in topics:
                ret.append("Topic " + str(topic[0]))
            return ret

    #Returns a flag to check what model is deployed at the moment
    def getModel(self):
        return self.model
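# Hedged usage sketch for the TopicModel class above (the `sentences` list and the
# preprocess()/PorterStemmer helpers it relies on are assumed to be defined elsewhere):
#   tm = TopicModel(topicCollection=None, string='lda')
#   tm.train(sentences)
#   for topic in tm.showTopics():
#       tm.showTerms(topic)
#   print(tm.classify(sentences[0]))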
    def __init__(self):
        self.dictionary = corpora.Dictionary.load("models/dictionary.dict")
        self.lda = LdaModel.load("models/lda_model.lda")
Example #49
#to check
print("Wordlist from the dictionary lookup:", dictionary[21], dictionary[22],
      dictionary[23], dictionary[24], dictionary[25], dictionary[26],
      dictionary[27])

# In[ ]:

#scale it to all text
corpus = [dictionary.doc2bow(text) for text in all_text]
end_corpus = time.time()
print("Time till corpus creation:", end_clean - start_time, "s")

# In[ ]:

#create the LDA model
ldamodel = LdaModel(corpus=corpus, num_topics=15, id2word=dictionary)
end_lda = time.time()
print("Time till LDA model creation:", end_lda - start_time, "s")

# In[ ]:

pyLDAvis.enable_notebook()

# In[ ]:

pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

# In[ ]:

end_viz = time.time()
print("Time till viz:", end_viz - start_time, "s")
Example #50
                                        coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return topic_list, coherence_values
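# Hedged reconstruction of the compute_coherence_values helper whose tail appears
# above, assuming it trains one LdaModel per topic count and scores each with c_v
# coherence (the original function is truncated here and may differ in detail).
from gensim.models import CoherenceModel, LdaModel

def compute_coherence_values_sketch(dictionary, corpus, texts, start, limit, step):
    topic_list, coherence_values = [], []
    for num_topics in range(start, limit, step):
        model = LdaModel(corpus=corpus, id2word=dictionary,
                         num_topics=num_topics, random_state=0)
        topic_list.append(num_topics)
        coherencemodel = CoherenceModel(model=model, texts=texts,
                                        dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return topic_list, coherence_values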


# Code starts here
topic_list, coherence_value_list = compute_coherence_values(
    dictionary=dictionary,
    corpus=doc_term_matrix,
    texts=doc_clean,
    start=1,
    limit=41,
    step=5)

# Finding the index associated with maximum coherence value
max_index = coherence_value_list.index(max(coherence_value_list))
opt_topic = topic_list[max_index]

print("optimum no of topics:", opt_topic)

# Implementing LDA with the optimum number of topics
lda_model = LdaModel(corpus=doc_term_matrix,
                     num_topics=opt_topic,
                     id2word=dictionary,
                     iterations=10,
                     passes=30,
                     random_state=0)

lda_model.print_topics(5)
Example #51
from TechDashAPI.mysqlUtilities import connectMySQL
from TechDashAPI.ContentExtractor import ContentExtractor
from TechDashAPI.ContentExtractorTrainer import ContentExtractorTrainer
from TechDashAPI.createDOM import createDom
from TechDashAPI.util import utilities
from TechDashAPI.topicModeling import techDashTopicModel

from gensim.models import LdaModel

db = connectMySQL(db='xpath', port=3366)
filesFolder = '/Users/jurica/Documents/workspace/eclipse/TechDashboard/xpathModels/'
utilitiesFunctions = utilities()

modelDestination = '/Users/jurica/Documents/workspace/eclipse/TechDashboard/modelsLDA/'
modelName = 'fullModel_100P_20T'
model = LdaModel.load(modelDestination+modelName+'.lda',  mmap=None)
topicModel = techDashTopicModel(destination='/Users/jurica/Documents/workspace/eclipse/TechDashboard/modelsLDA/', fileName='fullModel', modelName='fullModel_100P_20T')

#===============================================================================
# UPDATE ALL ARTICLES TO NEW TOPICS
#===============================================================================

sqlQuery = """SELECT `xpathValuesXPath`.`xpathValuesID`, `xpathValuesXPath`.`xpathValuesContent` FROM `xpath`.`xpathValuesXPath`; """

db.executeQuery(sqlQuery)

for item in db._connectMySQL__results:
    #===========================================================================
    # print item
    #===========================================================================
    topicModelCat = topicModel.getDocumentTopics(item[1])
Example #52
corpus = load_obj('LDABOWcorpus-application')
#%%
# lda model training  (ETA 10 mins)

num_topics = 6  # number of topics (chosen following the MATLAB tutorial)
chunk_size = 300
t1 = time.time()
# low alpha => each document is represented by only a few topics, and vice versa
# low eta   => each topic is represented by only a few words, and vice versa
lda = LdaModel(
    corpus=corpus,
    num_topics=num_topics,
    id2word=dct,
    alpha='auto',
    random_state=100,
    # eta=None,
    update_every=1,
    chunksize=chunk_size,
    minimum_probability=0.0,
    # iterations=100,
    # gamma_threshold=0.001,
    passes=10,
    per_word_topics=True)

lda.get_document_topics(bow=corpus, per_word_topics=True)
tpl = lda.print_topics(num_topics=6, num_words=5)
topic, contrib = zip(*tpl)

t2 = time.time()
print("Time to train LDA model on", len(df), "articles:", (t2 - t1) / 60,
      "min")
Example #53
from gensim import corpora, similarities
from gensim.models import LdaModel, LsiModel

lda_model = LdaModel.load('./data/lda_model')
lsi_model = LsiModel.load('./data/lsi_model')
id2word = corpora.Dictionary.load('./data/id2word')
index = similarities.MatrixSimilarity.load('./data/index')
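# Hedged usage sketch for the loaded objects above, assuming `index` was built over
# LSI vectors of the corpus (the query text below is purely illustrative).
query_bow = id2word.doc2bow('example query about topic models'.split())
query_lsi = lsi_model[query_bow]
sims = index[query_lsi]
print(sorted(enumerate(sims), key=lambda pair: -pair[1])[:5])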