# Assumed module-level imports (the enclosing class and its other methods are not shown):
import pandas as pd
import numpy as np
from pprint import pprint
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore, TfidfModel


def main(self):
    print('Loading data')
    # error_bad_lines was deprecated in pandas 1.3; use on_bad_lines='skip' on newer versions
    data = pd.read_csv('../../resources/abcnews-date-text.csv', error_bad_lines=False)
    data_text = data[['headline_text']].copy()
    data_text['index'] = data_text.index
    documents = data_text
    np.random.seed(2018)

    print('Preprocessing text')
    preprocessed_docs = documents['headline_text'].map(self.preprocess)

    print('Building bag of words corpus')
    dictionary = Dictionary(preprocessed_docs)  # maps token_id -> token
    dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
    bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_docs]  # lists of (token_id, token_count)
    print(documents[documents['index'] == 4310].values[0][0])
    print(bow_corpus[4310])
    print(bow_corpus[:100])

    print('Building lda model from bag of words')
    lda_model_bow = LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary,
                                 workers=self.workers)
    for idx, topic in lda_model_bow.print_topics(-1):
        print('Topic: {}\nWords: {}'.format(idx, topic))
    for index, score in sorted(lda_model_bow[bow_corpus[4310]], key=lambda tup: -tup[1]):
        print("\nScore: {}\nTopic: {}".format(score, lda_model_bow.print_topic(index, 10)))

    print('Building tfidf corpus from bag of words corpus')
    tfidf = TfidfModel(bow_corpus)
    tfidf_corpus = tfidf[bow_corpus]
    for doc in tfidf_corpus:
        pprint(doc)
        break

    print('Building lda model from tfidf')
    lda_model_tfidf = LdaMulticore(tfidf_corpus, num_topics=10, id2word=dictionary,
                                   workers=self.workers)
    for idx, topic in lda_model_tfidf.print_topics(-1):
        print('Topic: {} Words: {}'.format(idx, topic))
    for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -tup[1]):
        print("\nScore: {}\nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))

    print('Testing on unseen document')
    unseen_document = 'Facebook’s global lobbying against data privacy laws'
    bow_vector = dictionary.doc2bow(self.preprocess(unseen_document))
    print('Bow:')
    for index, score in sorted(lda_model_bow[bow_vector], key=lambda tup: -tup[1]):
        print("Score: {}\t Topic: {}".format(score, lda_model_bow.print_topic(index, 5)))
    print('TfIdf:')
    for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -tup[1]):
        print("Score: {}\t Topic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))
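# `main` assumes a `preprocess` method and a `workers` attribute on the same
# class. A minimal sketch of what `preprocess` might look like (an assumption,
# not the original implementation): gensim's simple_preprocess for tokenization,
# NLTK lemmatization plus Snowball stemming, and stopword/short-token filtering.
def preprocess(self, text):
    from gensim.utils import simple_preprocess
    from gensim.parsing.preprocessing import STOPWORDS
    from nltk.stem import WordNetLemmatizer, SnowballStemmer

    stemmer = SnowballStemmer('english')
    lemmatizer = WordNetLemmatizer()
    return [stemmer.stem(lemmatizer.lemmatize(token, pos='v'))
            for token in simple_preprocess(text)
            if token not in STOPWORDS and len(token) > 3]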
            hue='Dominant_Topic', palette=sns.color_palette("hls", 10),
            data=topics_en, legend="full", alpha=0.8)
# plt.xlim(xmin=-10, xmax=10)
# plt.ylim(ymin=-10, ymax=10)
plt.show()

## see what each topic looks like
for item in topics_en['Dominant_Topic'].sort_values().unique():
    part = topics_en[topics_en['Dominant_Topic'] == item]
    print('\n Topic:', item, 'with', len(part), 'elements',
          round(len(part) / len(topics_en) * 100, 2), '%')
    print(lda_tfidf_en.print_topic(item, 10))

    ## wordcloud
    wordcloud = WordCloud(max_font_size=50, max_words=100,
                          background_color="white").generate(' '.join(part['clean3'].to_list()))
    plt.figure()
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

    ## word2vec tsne
    model = Word2Vec((part['clean3'].str.split(' ')).to_list(), size=100, seed=0,
# predict a topic for a document
important_words = docs[2]
print(important_words)
print(len(important_words))

ques_vec = dictionary.doc2bow(important_words)
print("ques_vec", ques_vec)
topic_vec = model[ques_vec]
print("topic_vec", topic_vec)

# numpy.object was removed in NumPy 1.24; the builtin object works everywhere
word_count_array = numpy.empty((len(topic_vec), 2), dtype=object)
for i in range(len(topic_vec)):
    word_count_array[i, 0] = topic_vec[i][0]
    word_count_array[i, 1] = topic_vec[i][1]
print("word count array")
print(word_count_array)

# sort topics by probability, descending
idx = numpy.argsort(word_count_array[:, 1])[::-1]
word_count_array = word_count_array[idx]

final = model.print_topic(word_count_array[0, 0], 5)
question_topic = final.split('*')  # format is "probability*word + ..."
print(question_topic)
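# A shorter equivalent (a sketch using gensim's documented LdaModel API):
# get_document_topics returns (topic_id, probability) pairs, so the most
# probable topic can be taken directly instead of sorting a numpy array.
doc_topics = model.get_document_topics(dictionary.doc2bow(important_words))
best_topic_id, best_prob = max(doc_topics, key=lambda tp: tp[1])
print("best topic:", best_topic_id, "prob:", best_prob)
print(model.print_topic(best_topic_id, 5))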
--> if not specified, all parameters = 0.1
'''

############### 6. PRINT OUT DETECTED TOPICS & ASSOCIATED WORDS ###############
# The following prints the words occurring in each of the 10 topics and their relative weights
for i, topic in BOW_lda_model.print_topics(-1):
    print("Topic {}: \n{}\n".format(i, topic))

############### 7. PREDICT A TOPIC CLASS FOR A SAMPLE DOCUMENT ################
# Use BOW_lda_model to predict which topic this document belongs to:
sample_doc_i = 827
for i, score in sorted(BOW_lda_model[BOW_corpus[sample_doc_i]], key=lambda tup: -1 * tup[1]):
    print("\nScore: {}\nTopic: {}".format(score, BOW_lda_model.print_topic(i, 10)))

################# 8. PREDICT A TOPIC CLASS FOR A NEW DOCUMENT #################
# Use BOW_lda_model to predict which topic a new document belongs to:
new_doc = "Syria gets terrorist attack kills 22 people"
new_BOW_vector = dictionary.doc2bow(preprocess(new_doc))
for i, score in sorted(BOW_lda_model[new_BOW_vector], key=lambda tup: -1 * tup[1]):
    print("Score: {}\nTopic: {}\n".format(score, BOW_lda_model.print_topic(i, 10)))

######### 9. CREATE TF-IDF REPRESENTATION OF HEADLINES & UNIQUE WORDS #########
from gensim.models import TfidfModel
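# The fragment ends at the import above; a sketch of how step 9 might continue
# (an assumption, not the original code): build a TF-IDF corpus from the BOW
# corpus and train a second LDA model on it, mirroring the 10-topic setup above.
from gensim.models import LdaMulticore

tfidf_model = TfidfModel(BOW_corpus)
TFIDF_corpus = tfidf_model[BOW_corpus]
TFIDF_lda_model = LdaMulticore(TFIDF_corpus, num_topics=10, id2word=dictionary, workers=2)
for i, topic in TFIDF_lda_model.print_topics(-1):
    print("Topic {}: \n{}\n".format(i, topic))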
class LDAMWBase:
    def __init__(self, mtype='multiple', resource=None, lda_work_folder=None,
                 lda_model_filename=None, lda_dict_filename=None,
                 lda_topic_word_count=0, lda_topics_count=0,
                 resource_language=None, data_type=None):
        # todo: German lemmatizer / stemmer !!!
        # self.p_stemmer = PorterStemmer()
        self.wn_lemmatizer = WordNetLemmatizer()
        if resource is not None:
            # 'en' is the default; we assume the resource is correct and exists
            resource_lang = 'en'
            if data_type == 'db':
                resource_lang = Resources.select(Resources.lang).where(
                    Resources.resource == resource).get()
                resource_lang = resource_lang.__data__['lang'].lower()
            elif data_type == 'csv':
                if resource_language is None:
                    raise Exception(
                        "Resource language must be defined for the csv data type.")
                resource_lang = resource_language
            self.stop_words = get_stop_words(resource_lang)
        self.resource_identifier_name = resource

        def _create_model_deps(model_name, twordscount, tcount, mini=False, mini_path=None):
            if not mini:
                mp = DEFAULT_PROJECT_PATH + 'topics_extractor/lda_data' + '/' + model_name
            else:
                mp = DEFAULT_PROJECT_PATH + 'topics_extractor/lda_data' + '/' + mini_path
            mn = 'lda_model' + '_' + model_name
            md = 'dictionary' + '_' + model_name
            ltwordscount = twordscount
            ltcount = tcount
            _short_model_report = "{}{}: {} \n{}{}: {}\n{}{}: {}\n{}{}: {}\n{}{}: {}\n{}".format(
                INFO_FLAG, colored("Model path", 'red', None, ['bold']), mp,
                INFO_FLAG, colored("Model name", 'red', None, ['bold']), mn,
                INFO_FLAG, colored("Model dictionary", 'red', None, ['bold']), md,
                INFO_FLAG, colored("Topic words count", 'red', None, ['bold']), ltwordscount,
                INFO_FLAG, colored("Topics count", 'red', None, ['bold']), ltcount,
                "-" * 88)
            if model_name != 'mini':
                print(_short_model_report)
            return mp, mn, md, ltwordscount, ltcount

        if mtype == 'multiple':
            if resource is not None:
                mpath, mname, mdict, lda_topic_word_count, lda_topics_count = _create_model_deps(
                    self.resource_identifier_name, LDA_TOPIC_WORD_COUNT, LDA_TOPICS_COUNT)
            else:
                raise Exception(
                    "{}Resource must be defined. Exiting... \n".format(EXCEPTION_FLAG))
        elif mtype == 'single_ltc':
            mpath, mname, mdict, lda_topic_word_count, lda_topics_count = _create_model_deps(
                "mini", MINI_LDA_TOPIC_WORD_COUNT, MINI_LDA_TOPICS_COUNT,
                mini=True, mini_path=self.resource_identifier_name + "/mini")

        if lda_work_folder is None:
            self.lda_work_folder = mpath
        else:
            self.lda_work_folder = lda_work_folder
        if not os.path.exists(self.lda_work_folder):
            os.mkdir(self.lda_work_folder)

        if lda_model_filename is None:
            self.lda_model_filename = os.path.join(self.lda_work_folder, mname)
        else:
            self.lda_model_filename = os.path.join(self.lda_work_folder, lda_model_filename)
        if lda_dict_filename is None:
            self.lda_dict_filename = os.path.join(self.lda_work_folder, mdict)
        else:
            self.lda_dict_filename = os.path.join(self.lda_work_folder, lda_dict_filename)

        self.lda_topics_count = lda_topics_count
        self.lda_topic_word_count = lda_topic_word_count
        self.dictionary = None
        self.lda_model = None
        self.lda_topics = []

    @staticmethod
    def load_csv_data(csv_file):
        df = pd.read_csv(csv_file)
        return df['content'].values

    @staticmethod
    def load_single_ltc(ltc_data):
        # split the text into sentences, avoiding common abbreviation patterns
        return re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', ltc_data)

    @staticmethod
    def load_db_data(resource=None):
        # if resource is None:
        #     art_content_stream = Articles.select()
        # else:
        art_content_stream = Articles.select().where(Articles.resource == resource)
        return (acs.content for acs in art_content_stream if acs.content is not None)

    def save_model(self, as_name=None, save_on_disk=True, save_topics_into_db=False):
        if save_on_disk:
            if as_name is None:
                as_name = self.lda_model_filename
            self.lda_model.save(as_name)
            print(" \t-> Model was saved as [ {} ]".format(as_name))
        if save_topics_into_db:
            truncate_topics_tables(resource=self.resource_identifier_name)
            print(" \t-> Topics will be saved in the database for [ {} ]".format(
                self.resource_identifier_name))
            model_numbers_topics = self._get_topics()
            try:
                for tnum, tresourceid, tname in model_numbers_topics:
                    _topic = {
                        'ident_number': tnum,
                        'value': tname,
                        'created_at': dt.datetime.today().date()
                    }
                    t = Topics.create(**_topic)
                    t_id = t.__data__['topic']
                    _topic_resource = {
                        'resource': tresourceid,
                        'topic': t_id,
                        'created_at': dt.datetime.today().date()
                    }
                    TopicsResources.create(**_topic_resource)
                print("{}[ {} ]".format(SUCCESS_FLAG, self.resource_identifier_name))
            except Exception as e:
                print("{}{}".format(EXCEPTION_FLAG, e))
                print("{}Failure: [ {} ]".format(ERROR_FLAG, self.resource_identifier_name))

    def train_model(self, data_type, resource, single_ltc_data=None,
                    data_file_path=None, train_corpus=None, train_dictionary=None,
                    save_model_as=None, chunksize=LDA_CHUNKSIZE, passes=LDA_PASSES):
        if train_corpus is not None:
            corpus = train_corpus
        elif data_type == 'db':
            corpus = self._make_corpus(data_type=data_type, resource=resource)
        elif data_type == 'single_ltc' and single_ltc_data is not None:
            corpus = self._make_corpus(data_type=data_type, ltc=single_ltc_data,
                                       resource=resource)
        elif data_type == 'csv' and data_file_path is not None:
            corpus = self._make_corpus(data_type=data_type,
                                       data_file_path=data_file_path,
                                       resource=resource)
        else:
            raise Exception("{}Corpus is None".format(EXCEPTION_FLAG))

        if train_dictionary is not None:
            dictionary = train_dictionary
        else:
            dictionary = self.dictionary

        # the id2word parameter is needed so topics show words instead of dictionary indexes
        _tcount = self.lda_topics_count
        # self.lda_model = LdaModel(corpus=corpus, num_topics=_tcount,
        #                           id2word=dictionary, passes=passes, chunksize=chunksize)
        self.lda_model = LdaMulticore(corpus=corpus, num_topics=_tcount,
                                      id2word=dictionary, passes=passes,
                                      chunksize=chunksize)

        if save_model_as is not None and not single_ltc_data:
            self.save_model(save_model_as, save_on_disk=True, save_topics_into_db=False)
        elif single_ltc_data:
            self.save_model(self.lda_model_filename, save_on_disk=True,
                            save_topics_into_db=False)
        elif data_type == 'csv':
            self.save_model(self.lda_model_filename, save_on_disk=True,
                            save_topics_into_db=False)
        else:
            self.save_model(self.lda_model_filename, save_on_disk=True,
                            save_topics_into_db=True)
        print("{}Trained".format(SUCCESS_FLAG))

    def load_model(self, model_file_path=None, dict_file_path=None):
        """
        Load the model and dictionary from files (they must have been saved
        during training). Also used to update the model on another corpus.
        """
        if model_file_path is not None and os.path.exists(model_file_path):
            self.lda_model = LdaMulticore.load(model_file_path)
            # self.lda_model = LdaModel.load(model_file_path)
            self.dictionary = Dictionary.load(dict_file_path)
            print(" \t-> Loaded: [ {} ]".format(model_file_path))
        elif model_file_path is None and os.path.exists(self.lda_model_filename):
            self.lda_model = LdaMulticore.load(self.lda_model_filename)
            # self.lda_model = LdaModel.load(self.lda_model_filename)
            self.dictionary = Dictionary.load(self.lda_dict_filename)
            print(" \t-> Loaded: [ {} ]".format(self.lda_model_filename))
        else:
            print("{}The file path you gave is incorrect.\n Give another one and retry."
                  "\n Exiting...".format(ERROR_FLAG))
            exit()
        for i in range(self.lda_model.num_topics):
            terms_id = self.lda_model.get_topic_terms(i, self.lda_topic_word_count)
            terms = [self.dictionary.get(x[0]) for x in terms_id]
            self.lda_topics.append(' '.join(terms))

    def update_model(self, ondata_file_path=None, resource=None, data_type='db'):
        if ondata_file_path is not None and data_type == 'csv':
            corpus = self._make_corpus(data_file_path=ondata_file_path,
                                       data_type=data_type, resource=resource)
        elif data_type == 'db':
            corpus = self._make_corpus(data_file_path=None, data_type=data_type,
                                       resource=resource)
        else:
            raise Exception("{}Corpus is None".format(EXCEPTION_FLAG))
        self.lda_model.update(corpus)

    def process_record(self, text, data_type):
        """ data_type - db / csv / single_ltc """
        if data_type == 'single_ltc' or self.lda_model is None:
            try:
                self.load_model()
            except Exception as e:
                print("{}{}".format(EXCEPTION_FLAG, e))

        if data_type == 'db':
            if self.lda_model is None:
                return dict()
            doc = self._prepare_single_document(text)
            if doc is not None:
                topics = self._get_document_topics(doc)
                top_topic = topics[0]
                return [('topic', self.lda_topics[top_topic])]
            return [('topic', "")]
        elif data_type == 'csv':
            doc = self._prepare_single_document(text)
            topics_in_count_by_ids = self._get_document_topics(doc)
            current_doc_topic_id = topics_in_count_by_ids[0]
            current_doc_other_topics = topics_in_count_by_ids[1:]
            result_topic_word_descr = re.sub(
                '[^A-Za-z]+', ' ', self._get_topic_by_id(current_doc_topic_id))
            return [('topic', result_topic_word_descr),
                    ('other_topics', current_doc_other_topics)]
        elif data_type == 'single_ltc':
            doc = self._prepare_single_document(text)
            topics_in_count_by_ids = self._get_document_topics(doc)
            if topics_in_count_by_ids is not None:
                current_doc_topic_id = topics_in_count_by_ids[0]
                current_doc_other_topics = topics_in_count_by_ids[1:]
                result_topic_word_descr = re.sub(
                    '[^A-Za-z]+', ' ', self._get_topic_by_id(current_doc_topic_id))
                return result_topic_word_descr, current_doc_other_topics
            return "", []

    def _get_metric_fields(self):
        if self.lda_model is None:
            return []
        return ['topic']

    def _get_document_topics(self, doc, count=5):
        if doc is None:
            return None
        bow = self.dictionary.doc2bow(doc)
        topics = self.lda_model.get_document_topics(bow, minimum_probability=0.0)
        # topic ids ordered by descending probability, top `count` only
        return [ident_number for (ident_number, prob) in
                sorted(topics, key=itemgetter(1), reverse=True)[:count]]

    def _get_document_topic(self, doc_topics):
        topic_id_probs = {t_prob[0]: t_prob[1] for t_prob in doc_topics}
        doc_topic_id = sorted(topic_id_probs, key=topic_id_probs.get, reverse=True)[0]
        doc_topic_prob = topic_id_probs[doc_topic_id]
        return [doc_topic_id, doc_topic_prob]

    def _prepare_single_document(self, sd):
        # NaN cells from pandas arrive as float, hence the isinstance check
        if sd is None or isinstance(sd, float):
            return None
        try:
            sd = sd.lower()
            sd = nltk.tokenize.word_tokenize(sd)
            sd = (word for word in sd if word.isalpha() and len(word) > 2)
            stopped_sd = (word for word in sd if word not in self.stop_words)
            return [self.wn_lemmatizer.lemmatize(word) for word in stopped_sd]
        except AttributeError as e:
            print("{}{}".format(EXCEPTION_FLAG, e))
            return None

    def _make_bow(self, text):
        if text is not None:
            d = self._prepare_single_document(text)
            return self.dictionary.doc2bow(d)

    def _make_corpus(self, data_type, resource, data_file_path=None,
                     save_train_dict=True, save_dict_as=None, ltc=None):
        """ data_type can be csv, db or single_ltc """
        if data_type == 'db':
            documents = self.load_db_data(resource=resource)
        elif data_type == 'csv' and data_file_path is not None:
            documents = self.load_csv_data(data_file_path)
        elif data_type == 'single_ltc' and ltc is not None:
            ltc_text = " ".join(e if type(e) is str else "" for e in ltc)
            documents = self.load_single_ltc(ltc_text)
        else:
            print("{}documents is None. Exiting ... \n".format(ERROR_FLAG))
            exit()
        with Pool() as pool:
            processed_docs = pool.imap(self._prepare_single_document, documents)
            pool.close()
            pool.join()
            # materialize the results: a generator would be exhausted after
            # building the Dictionary, leaving the corpus below empty
            processed_docs = [doc for doc in processed_docs if doc is not None]
        self.dictionary = Dictionary(processed_docs)
        if save_dict_as is not None:
            self.dictionary.save(save_dict_as)
        elif save_train_dict:
            self.dictionary.save(self.lda_dict_filename)
        corpus = [self.dictionary.doc2bow(proc_doc) for proc_doc in processed_docs]
        return corpus

    def _get_topic_by_id(self, topic_id):
        if self.lda_topic_word_count is not None:
            return self.lda_model.print_topic(topic_id, self.lda_topic_word_count)
        return self.lda_model.print_topic(topic_id, 6)

    def _get_topics(self, default_view=False, for_db=True):
        """
        Returns 2-tuples (probability * word) of the most probable words per topic;
        num_topics=-1 prints all topics.
        """
        def _get_words(probabilities_words_string):
            _pre_topic_with_digits_trash = " ".join(
                re.findall(ALL_CHARS, probabilities_words_string))
            probably_clean_topic = re.sub(r'\b\d+(?:\.\d+)?\s+', "",
                                          _pre_topic_with_digits_trash)
            return probably_clean_topic
            # " ".join(re.findall('[a-zA-Z]+', probabilities_words_string))

        if default_view:
            return self.lda_model.print_topics(num_topics=-1)
        if for_db:
            resource_id = Resources.select().where(
                Resources.resource == self.resource_identifier_name).first()
            resource_id = resource_id.__data__['resource']
            return [(elem[0], resource_id, _get_words(elem[1]))
                    for elem in self.lda_model.print_topics(
                        num_topics=self.lda_topics_count,
                        num_words=self.lda_topic_word_count)]
        return [(elem[0], _get_words(elem[1]))
                for elem in self.lda_model.print_topics(
                    num_topics=self.lda_topics_count,
                    num_words=self.lda_topic_word_count)]
# Load the 20 Newsgroups training split
news_train = fetch_20newsgroups(subset='train')

# Tokenization and lemmatization
wnl = WordNetLemmatizer()
news_train_lemma = [tokenize_lemmatize(article, wnl.lemmatize)
                    for article in news_train.data]

# Build gensim corpora structures
dict_train = Dictionary(news_train_lemma)
mmCorpus_train = [dict_train.doc2bow(article) for article in news_train_lemma]

# Latent Semantic Analysis
lsi_train = LsiModel(corpus=mmCorpus_train, num_topics=40, id2word=dict_train)
for i in range(40):
    print('topic' + str(i) + ' :')
    print(lsi_train.print_topic(i))

# Latent Dirichlet Allocation
lda_train = LdaMulticore(corpus=mmCorpus_train, num_topics=40,
                         id2word=dict_train, workers=5)
for i in range(40):
    print('topic' + str(i) + ' :')
    print(lda_train.print_topic(i))
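# A sketch (an addition, not from the original script) of how the two models
# could be compared quantitatively with gensim's CoherenceModel; 'c_v'
# coherence is computed over the lemmatized texts built above.
from gensim.models import CoherenceModel

for name, topic_model in [('LSI', lsi_train), ('LDA', lda_train)]:
    coherence = CoherenceModel(model=topic_model, texts=news_train_lemma,
                               dictionary=dict_train, coherence='c_v').get_coherence()
    print('{} c_v coherence: {:.4f}'.format(name, coherence))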