chunksize=batch_size,
                             iterations=max_e_steps,
                             eval_every=eval_every)
gn_time = time.time() - start

log_perp_gensim_mc = lda_gensim_mc.log_perplexity(gensim_te_corpus)
perplexity_gensim_mc = np.exp(-1. * log_perp_gensim_mc)

print("gensim run time and perplexity: {}, {}".format(gn_time,
                                                      perplexity_gensim_mc))
print("sklearn run time and perplexity: {}, {}".format(sk_time,
                                                       sklearn_perplexity))

# Let's have a look at the topics
topic_words = dict()
gensim_topics = lda_gensim_mc.show_topics(formatted=False)


def sklearn_show_topics(model, feature_names, n_top_words):
    sk_topics = []
    for topic_idx, topic in enumerate(model.components_):
        tot_score = np.sum(topic)
        top_words = [(feature_names[i], topic[i] / tot_score)
                     for i in topic.argsort()[:-n_top_words - 1:-1]]
        sk_topics.append([topic_idx, top_words])
    return sk_topics


feature_names = vectorizer.get_feature_names()  # use get_feature_names_out() on newer scikit-learn
sklearn_topics = sklearn_show_topics(lda_sklearn, feature_names, 10)
topic_words['gensim'] = gensim_topics
Example #2
import gensim
import json
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore

input_dict_fname = '../outputs/20news_18828/output_text_preprocessed.json'

# Retrieve gensim corpus & dictionary data...

with open(input_dict_fname, "r") as f:
    json_word_list = json.load(f)

json_word_list = list(json_word_list.values())
print(json_word_list[1])
dictionary = Dictionary(json_word_list)
corpus = [dictionary.doc2bow(x) for x in json_word_list]

newsdata_topics = LdaMulticore(corpus, id2word=dictionary, num_topics=10)

print(newsdata_topics.show_topics(num_words=5))
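
# To see how an individual document maps onto the learned topics (sketch; reuses
# the corpus and model built above):
print(newsdata_topics.get_document_topics(corpus[0]))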
Example #3
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt

# more colors: 'mcolors.XKCD_COLORS'
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]

cloud = WordCloud(stopwords=stop_words,
                  background_color='white',
                  width=1200,
                  height=1200,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)

topics = lda_model.show_topics(formatted=False)

fig, axes = plt.subplots(2, 3, figsize=(16, 10), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16), y=1.05)
    plt.gca().axis('off')

plt.subplots_adjust(wspace=.3, hspace=.2)
plt.axis('off')
plt.margins(x=0, y=0)
#plt.tight_layout()
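
# Render the assembled figure (assumption: running as a plain script; in a
# notebook the figure displays inline without this call)
plt.show()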
            tfidf_corpus.append(tfidf[doc])
        tfidf_mat = matutils.corpus2dense(tfidf_corpus,
                                          num_terms=len(id2word.token2id))
        tfidf_mat_transpose = tfidf_mat.transpose()
        dfTFIDF = pd.DataFrame(
            data=tfidf_mat_transpose[0:, 0:],
            index=[i for i in range(tfidf_mat_transpose.shape[0])],
            columns=['' + str(i) for i in range(tfidf_mat_transpose.shape[1])])
        dfTFIDF['id'] = ids.tolist()

        ef.deleteIndex(credentials, "tfidf")
        ef.saveTFIDF(credentials, dfTFIDF)

# Keyword weights
    x = lda_model.show_topics(num_topics=args.number_of_topics,
                              num_words=50,
                              formatted=False)
    keywordWeights = []
    topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
    for tp in x:
        words = []
        weights = []
        for pair in tp[1]:
            words.append(pair[0])
            weights.append(int(pair[1] * 10000))
        keywordWeights.append(weights)

# Top topics per paragraph
    df = pd.DataFrame()
    df['referenceId'] = referenceIds
    df['paragraph'] = raw_paragraphs
                                 chunksize=1000,
                                 batch=False,
                                 alpha='asymmetric',
                                 decay=0.5,
                                 offset=64,
                                 eta=None,
                                 eval_every=0,
                                 iterations=100,
                                 gamma_threshold=0.001,
                                 per_word_topics=True)

        #get document topic distribution:
        doc_topic_dist = get_corpus_topics(_texts, lda_model)

        #print(lda_model.show_topics(num_words=20))
        topic_terms = lda_model.show_topics(num_words=50)
        #get top words for each topic:
        topic_term_dict = {}
        rel_terms = []
        for topic_dist in topic_terms:
            topic_id = topic_dist[0]
            topic_term_dict[topic_id] = {}
            terms_str = topic_dist[1]  # avoid shadowing the list being iterated
            for _split in terms_str.split('+'):
                topic_term_prob = _split.split('*')[0]
                topic_term = str(_split.split('*')[1]).replace('"', '').strip()
                topic_term_dict[topic_id][topic_term] = float(topic_term_prob)
                #rel_terms.append(topic_term)

        #print(topic_term_dict)
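        # A less brittle sketch of the same parse: request unformatted topics so
        # the (word, probability) pairs come back directly, with no string
        # splitting (assumes the same lda_model as above).
        # topic_term_dict = {
        #     topic_id: {word: float(prob) for word, prob in word_probs}
        #     for topic_id, word_probs in lda_model.show_topics(num_words=50,
        #                                                       formatted=False)
        # }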
        picked_sentences[key] = {}
class GensimMalletTopicExtractor:
    def __init__(self, language='english', stopwords_extent=None):
        self.language2la = {
            'english': 'en',
            'french': 'fr',
            'spanish': 'es'
        }
        if language not in self.language2la:
            raise ValueError('Language must be "english", "french" or "spanish"')
        self.language = language
        self.stop_words = stopwords.words(self.language)
        if isinstance(stopwords_extent, (str, list)):
            self.stop_words.extend(stopwords_extent)
        self.df_topic_sents_keywords = None
        self.bigram = None
        self.bigram_phraser = None
        self.trigram = None
        self.trigram_phraser = None
        self.vis = None
        self.data = None
        self.data_words = None
        self.data_words_nostops = None
        self.data_words_bigrams = None
        self.data_words_trigrams = None
        self.nlp = None
        self.data_lemmatized = None
        self.id2word = None
        self.texts = None
        self.corpus = None
        self.mallet_path = None
        self.lda_model = None
        self.coherence_model_lda = None
        self.coherence_lda = None
        self.coherence_values = []
        self.model_list = []
        self.optimal_number_of_topics = None
        self.optimal_model = None
        self.optimal_topics = None

    @staticmethod
    def sent_to_words(sentences, remove_punctuation=True):
        for sentence in sentences:
            # deacc=True removes punctuations
            yield(simple_preprocess(str(sentence), deacc=remove_punctuation))

    def remove_stopwords(self, texts):
        return [[word for word in simple_preprocess(str(doc)) if word not in self.stop_words] for doc in texts]

    def make_bigrams(self, texts):
        self.bigram = Phrases(self.data_words, min_count=5, threshold=100)
        self.bigram_phraser = Phraser(self.bigram)
        return [self.bigram_phraser[doc] for doc in texts]

    def make_trigrams(self, texts):
        tokens_ = self.bigram_phraser[texts]
        self.trigram = Phrases(tokens_, threshold=100)
        self.trigram_phraser = Phraser(self.trigram)
        return [self.trigram_phraser[self.bigram_phraser[doc]] for doc in texts]

    def lemmatization(self, texts, allowed_postags=None):
        """https://spacy.io/api/annotation"""
        if allowed_postags is None:
            allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']
        texts_out = []
        for sent in texts:
            doc = self.nlp(" ".join(sent))
            texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
        return texts_out

    def view_terms_frequency(self, text_id, first_words=20):
        # Human readable format of one document's corpus entry (term, frequency)
        list_ = [(self.id2word[id_], freq)
                 for id_, freq in self.corpus[text_id][:first_words]]
        pprint(list_)

    def visualize_lda(self):
        # Visualize the topics
        # pyLDAvis.enable_notebook()
        self.vis = pyLDAvis.gensim.prepare(self.lda_model, self.corpus, self.id2word)
        print(self.vis)
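        # Printing the PreparedData object only shows its repr; sketches of the
        # usual ways to actually view it:
        # pyLDAvis.save_html(self.vis, 'lda_vis.html')  # standalone HTML file
        # pyLDAvis.display(self.vis)                    # inline in a notebook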

    def instanciate_model(self, num_topics, passes, iterations,
                          enable_mallet, optimize_interval, topic_threshold, show_topics_on_creation=False):
        if enable_mallet is True:
            # Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
            os.environ.update({'MALLET_HOME': r'C:/mallet-2.0.8/'})
            self.mallet_path = 'C:\\mallet-2.0.8\\bin\\mallet'  # update this path
            self.lda_model = LdaMallet(self.mallet_path,
                                       corpus=self.corpus,
                                       num_topics=num_topics,
                                       id2word=self.id2word,
                                       iterations=iterations,
                                       optimize_interval=optimize_interval,
                                       topic_threshold=topic_threshold)
            print('Mallet LDA model built\n')
            if show_topics_on_creation is True:
                pprint(self.lda_model.show_topics(formatted=False))
        else:
            self.lda_model = LdaMulticore(corpus=self.corpus,
                                          id2word=self.id2word,
                                          num_topics=num_topics,
                                          random_state=100,
                                          chunksize=500,
                                          passes=passes,
                                          iterations=iterations,
                                          per_word_topics=True)
            print('LDA_MultiCore model built\n')
            if show_topics_on_creation is True:
                pprint(self.lda_model.print_topics())

    def extract_topics(self, data, num_topics, passes=10, iterations=500,
                       enable_mallet=True, optimize_interval=0,
                       topic_threshold=0.0):
        self.data = data
        print('\nEXTRACTING ' + str(num_topics) + ' TOPICS')
        self.data_words = list(self.sent_to_words(self.data, True))
        # Remove Stop Words
        print('\nRemoving stopwords')
        self.data_words_nostops = self.remove_stopwords(self.data_words)
        # Form Bigrams
        print('Looking for bigrams')
        self.data_words_bigrams = self.make_bigrams(self.data_words_nostops)
        # Form Trigrams
        print('Looking for trigrams')
        self.data_words_trigrams = self.make_trigrams(self.data_words_nostops)
        # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
        # python3 -m spacy download en
        print('Loading Spacy with ' + self.language + ' dictionary')
        self.nlp = spacy.load(self.language2la[self.language], disable=['parser', 'ner'])
        # Do lemmatization keeping only noun, adj, vb, adv
        print('Lemmatizing')
        self.data_lemmatized = self.lemmatization(self.data_words_trigrams,
                                                  allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
        # Create Dictionary
        print('Creating dictionary')
        self.id2word = corpora.Dictionary(self.data_lemmatized)
        # Create Corpus
        print('Creating corpus')
        self.texts = self.data_lemmatized
        # Term Document Frequency
        print('Computing document frequency')
        self.corpus = [self.id2word.doc2bow(text) for text in self.texts]
        # Build LDA model
        print('\nEnable_mallet is', enable_mallet, '\n')
        self.instanciate_model(num_topics, passes, iterations,
                               enable_mallet, optimize_interval, topic_threshold,
                               show_topics_on_creation=True)
        # print(self.lda_model[self.corpus])
        # Compute Perplexity
        # a measure of how good the model is. lower the better.
        if hasattr(self.lda_model, 'log_perplexity'):
            print('\nPerplexity: ', self.lda_model.log_perplexity(self.corpus))

        # Compute Coherence Score
        print('\nComputing coherence model')
        self.coherence_model_lda = CoherenceModel(model=self.lda_model,
                                                  texts=self.data_lemmatized,
                                                  dictionary=self.id2word,
                                                  coherence='c_v')
        print('Getting coherence')
        self.coherence_lda = self.coherence_model_lda.get_coherence()
        print('\nCoherence Score: ', self.coherence_lda)

        if enable_mallet is False:
            self.visualize_lda()

    def view_optimal_topics(self, num_words=20):
        pprint(self.optimal_model.print_topics(num_words=num_words))

    def compute_coherence_values(self, limit, start=2, step=3, passes=10,
                                 iterations=500, enable_mallet=True,
                                 optimize_interval=0, topic_threshold=0.0):
        """
        Compute c_v coherence for various number of topics

        Parameters:
        ----------
        limit : Max num of topics

        Returns:
        -------
        model_list : List of LDA topic models
        coherence_values : Coherence values corresponding to the LDA model with respective number of topics
        """
        for num_topics in range(start, limit, step):
            print('\n' + '*'*10 + ' COMPUTING COHERENCE FOR ' + str(num_topics) + ' TOPICS ' + '*'*10)
            self.instanciate_model(num_topics, passes, iterations,
                                   enable_mallet, optimize_interval, topic_threshold,
                                   show_topics_on_creation=False)
            self.model_list.append(self.lda_model)
            coherence_model = CoherenceModel(model=self.lda_model,
                                             texts=self.data_lemmatized,
                                             dictionary=self.id2word,
                                             coherence='c_v')
            self.coherence_values.append(coherence_model.get_coherence())

        # Show graph
        x = range(start, limit, step)
        plt.plot(x, self.coherence_values)
        plt.xlabel("Num Topics")
        plt.ylabel("Coherence score")
        plt.legend("coherence_values", loc='best')
        plt.show()

        # Print the coherence scores
        for m, cv in zip(x, self.coherence_values):
            print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

        optimal_model_index = self.coherence_values.index(max(self.coherence_values))
        self.optimal_number_of_topics = start + optimal_model_index * step
        self.optimal_model = self.model_list[optimal_model_index]
        print('\nOptimal number of topics is ' + str(self.optimal_number_of_topics) +
              ' with coherence score : ' + str(self.coherence_values[optimal_model_index]))
        self.optimal_topics = self.optimal_model.show_topics(num_topics=self.optimal_number_of_topics,
                                                             num_words=20, formatted=False)
        self.view_optimal_topics()

    def format_topics_sentences(self, ldamodel=None):
        if ldamodel is None and self.optimal_model is not None:
            ldamodel = self.optimal_model
        elif ldamodel is None and self.lda_model is not None:
            ldamodel = self.lda_model
        # Init output
        sent_topics_df = pd.DataFrame()

        # Get main topic in each document
        for i, row_list in enumerate(ldamodel[self.corpus]):
            # LdaMulticore with per_word_topics=True yields a
            # (topic_dist, word_topics, phi) tuple; Mallet yields the
            # topic distribution directly.
            row = row_list[0] if isinstance(row_list, tuple) else row_list
            row = sorted(row, key=lambda x: (x[1]), reverse=True)
            # Get the Dominant topic, Perc Contribution and Keywords for each document
            for j, (topic_num, prop_topic) in enumerate(row):
                if j == 0:  # => dominant topic
                    wp = ldamodel.show_topic(topic_num)
                    topic_keywords = ", ".join([word for word, prop in wp])
                    sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num),
                                                                      round(prop_topic, 4),
                                                                      topic_keywords]),
                                                           ignore_index=True)
                else:
                    break
        sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

        # Add original text to the end of the output
        contents = pd.Series(self.data)
        sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
        return sent_topics_df

    def get_most_representative_documents(self):
        # Group top 5 sentences under each topic
        sent_topics_sorteddf_mallet = pd.DataFrame()

        if self.df_topic_sents_keywords is None:
            self.df_topic_sents_keywords = self.format_topics_sentences()
        # Format
        df_dominant_topic = self.df_topic_sents_keywords.reset_index()
        df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
        sent_topics_outdf_grpd = self.df_topic_sents_keywords.groupby('Dominant_Topic')

        for i, grp in sent_topics_outdf_grpd:
            sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet,
                                                     grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)],
                                                    axis=0)

        # Reset Index
        sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)
        # Format
        sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]
        # Show
        sent_topics_sorteddf_mallet.head()

        for i in range(len(sent_topics_sorteddf_mallet)):
            print(i, sent_topics_sorteddf_mallet.loc[i, 'Text'])

    def get_topic_distribution(self):
        if self.df_topic_sents_keywords is None:
            self.df_topic_sents_keywords = self.format_topics_sentences()
        # Number of Documents for Each Topic
        topic_counts = self.df_topic_sents_keywords['Dominant_Topic'].value_counts()
        # Percentage of Documents for Each Topic
        topic_contribution = round(topic_counts/topic_counts.sum(), 4)
        # Topic Number and Keywords
        topic_num_keywords = self.df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]
        # Concatenate Column wise
        df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)
        # Change Column names
        df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']
        # Show
        print(df_dominant_topics)
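
    # Usage sketch for this class (assumption: `documents` is a list of raw text strings):
    #     extractor = GensimMalletTopicExtractor(language='english')
    #     extractor.extract_topics(documents, num_topics=10, enable_mallet=False)
    #     extractor.compute_coherence_values(limit=20, start=2, step=3,
    #                                        enable_mallet=False)
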
    def run(self, args):

        # mlflow logs
        experiment_name = "dev-LessonsClustering"
        if args.environment == "production":
            experiment_name = "LessonsClustering"
        elif args.environment == "staging":
            experiment_name = "staging-LessonsClustering"
        mlflow.set_experiment(experiment_name)
        client = mlflow.tracking.MlflowClient()

        with mlflow.start_run():
            log_param("environment", args.environment)
            log_param("mode", args.mode)
            log_param("update_related_lessons", args.update_related_lessons)

            # Get lessons data from database

            df = ef.getLessons(self.credentials)

            # Pre Processing
            lessonsData = df[df['isLesson'] == True]
            # keep only rows whose summary is not NaN (NaN != NaN)
            lessonsData = lessonsData[lessonsData['summary'] ==
                                      lessonsData['summary']]
            raw_paragraphs = lessonsData['paragraph']
            urls = lessonsData['urlToFile']
            raw_sentences = raw_paragraphs
            ids = lessonsData['_id']

            sentences = [line.split(' ') for line in raw_sentences]
            stop_words = stopwords.words('english')
            stop_words.extend(
                ['from', 'subject', 're', 'edu', 'use', 'äô', 'äù', 'äì'])
            words_to_remove = ['iii', 'project']

            def remove_stopwords(texts):
                return [[
                    word for word in simple_preprocess(str(doc))
                    if word not in stop_words
                ] for doc in texts]

            def remove_words(texts):
                return [[
                    word for word in simple_preprocess(str(doc))
                    if word not in words_to_remove
                ] for doc in texts]

            def remove_word_length_2(texts):
                allSentences = []
                for doc in texts:
                    newWords = []
                    for word in doc:
                        if len(word) > 2:
                            newWords.append(word)
                    allSentences.append(newWords)
                return allSentences

            def replace_adb_special_characters(texts):
                return [[
                    word.replace('’s',
                                 "'s ").replace('O’Smach', "0").replace(
                                     'äù', "").replace('äô',
                                                       "").replace('äì', "")
                    for word in doc
                ] for doc in texts]

            def get_wordnet_pos(word):
                tag = nltk.pos_tag([word])[0][1][0].upper()
                tag_dict = {
                    "J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV
                }

                return tag_dict.get(tag, wordnet.NOUN)

            sentences = replace_adb_special_characters(sentences)
            data_words_nostops = remove_stopwords(sentences)
            lemmatizer = WordNetLemmatizer()
            lemmatized_output = []
            for paragraph in data_words_nostops:
                lemmatized_output.append([
                    lemmatizer.lemmatize(word, get_wordnet_pos(word))
                    for word in paragraph
                ])
            sentences = remove_words(lemmatized_output)
            sentences_no_length_2 = remove_word_length_2(sentences)
            sentences = sentences_no_length_2

            id2word = corpora.Dictionary(sentences)
            texts = sentences
            corpus = [id2word.doc2bow(text) for text in texts]

            def compute_coherence_values(corpus, dictionary, k, a, b):
                lda_model = LdaMulticore(corpus=corpus,
                                         id2word=id2word,
                                         num_topics=k,
                                         random_state=100,
                                         chunksize=100,
                                         passes=10,
                                         alpha=a,
                                         eta=b,
                                         per_word_topics=True)
                coherence_model_lda = CoherenceModel(model=lda_model,
                                                     texts=sentences,
                                                     dictionary=id2word,
                                                     coherence='c_v')
                return coherence_model_lda.get_coherence()

    # Fine Tuning

            if args.mode == "fine_tuning":
                grid = {}
                grid['Validation_Set'] = {}

                # Topics range
                min_topics = 2
                max_topics = args.max_number_of_topics
                step_size = 1
                topics_range = range(min_topics, max_topics + 1, step_size)

                # Alpha parameter
                alpha = list(np.arange(0.01, 1, 0.3))
                # alpha.append('symmetric')
                # alpha.append('asymmetric')

                # Beta parameter
                beta = list(np.arange(0.01, 1, 0.3))
                # beta.append('symmetric')

                # Validation sets
                # num_of_docs = len(corpus)
                corpus_sets = [
                    # ClippedCorpus(corpus, int(num_of_docs*0.25)),
                    # ClippedCorpus(corpus, int(num_of_docs*0.5)),
                    # ClippedCorpus(corpus, int(num_of_docs*0.75)),
                    corpus
                ]
                # corpus_title = [
                #                 '25% Corpus'
                #                 '50% Corpus',
                #                 '75% Corpus'
                #                 '100% Corpus'
                # ]
                model_results = {
                    # 'Validation_Set': [],
                    'Number Of Topics': [],
                    'Alpha': [],
                    'Beta': [],
                    'Coherence': []
                }
                model_results_2 = {
                    'Number Of Topics': [],
                    'Average Coherence': []
                }
                maxCoherence = 0
                maxCoherenceK = 2
                maxCoherenceA = 0.01
                maxCoherenceB = 0.01
                for i in range(len(corpus_sets)):
                    for k in topics_range:
                        k_coherences = []
                        for a in alpha:
                            for b in beta:
                                cv = compute_coherence_values(
                                    corpus=corpus_sets[i],
                                    dictionary=id2word,
                                    k=k,
                                    a=a,
                                    b=b)
                                if cv > maxCoherence:
                                    maxCoherence = cv
                                    maxCoherenceK = k
                                    maxCoherenceA = a
                                    maxCoherenceB = b
                                # model_results['Validation_Set'].append(corpus_title[i])
                                model_results['Number Of Topics'].append(k)
                                model_results['Alpha'].append(a)
                                model_results['Beta'].append(b)
                                model_results['Coherence'].append(cv)
                                k_coherences.append(cv)
                                customStep = int(
                                    str(k) +
                                    "{:.2f}".format(a).replace(".", "") +
                                    "{:.2f}".format(b).replace(".", ""))
                                log_metric("coherence", cv, step=customStep)

                        model_results_2['Number Of Topics'].append(k)
                        avg_cv = sum(k_coherences) / len(k_coherences)
                        model_results_2['Average Coherence'].append(avg_cv)
                        log_metric("average_coherence", avg_cv, step=k)
                log_metric("max_coherence", maxCoherence)
                log_metric("number_of_topics_of_max_coherence", maxCoherenceK)
                log_metric("alpha_of_max_coherence", maxCoherenceA)
                log_metric("beta_of_max_coherence", maxCoherenceB)
                pd.DataFrame(model_results).to_csv(defaults.DATA_PATH +
                                                   "fine-tuning.csv",
                                                   index=False)
                pd.DataFrame(model_results_2).to_csv(defaults.DATA_PATH +
                                                     "fine-tuning-2.csv",
                                                     index=False)
                log_artifact(defaults.DATA_PATH + "fine-tuning.csv", "data/")
                log_artifact(defaults.DATA_PATH + "fine-tuning-2.csv", "data/")

    # Train LDA model
            elif args.mode == "train":
                log_metric("number_of_topics", args.number_of_topics)
                log_metric("alpha", args.alpha)
                log_metric("beta", args.beta)
                lda_model = LdaMulticore(corpus=corpus,
                                         id2word=id2word,
                                         num_topics=args.number_of_topics,
                                         random_state=200,
                                         chunksize=100,
                                         passes=10,
                                         alpha=args.alpha,
                                         eta=args.beta,
                                         per_word_topics=True)
                cv = compute_coherence_values(corpus=corpus,
                                              dictionary=id2word,
                                              k=args.number_of_topics,
                                              a=args.alpha,
                                              b=args.beta)
                log_metric("coherence", cv)
                lda_model.save(defaults.MODEL_PATH + "lda.model")
                log_artifact(defaults.MODEL_PATH + "lda.model", "models/")

    # Predict LDA model
            elif args.mode == "predict":
                log_param("run_id_model", args.run_id_model)
                number_of_topics = int(args.number_of_topics)
                if not args.run_id_model == "":
                    data = client.get_run(args.run_id_model).data
                    number_of_topics = int(data.params['number_of_topics'])
                    alpha = float(data.params['alpha'])
                    beta = float(data.params['beta'])
                    log_metric("number_of_topics", number_of_topics)
                    log_metric("alpha", alpha)
                    log_metric("beta", beta)
                    cv = compute_coherence_values(corpus=corpus,
                                                  dictionary=id2word,
                                                  k=number_of_topics,
                                                  a=alpha,
                                                  b=beta)
                    log_metric("coherence", cv)

    # Download and load the LDA model
                modelFilePath = defaults.MODEL_PATH + "lda.model"
                af.downloadLDAModel(args, modelFilePath)
                lda_model = LdaModel.load(modelFilePath)
                # lda_model.save(defaults.MODEL_PATH + "lda.model")
                # log_artifact(defaults.MODEL_PATH + "lda.model", "models/")

                # Keyword weights

                x = lda_model.show_topics(num_topics=number_of_topics,
                                          num_words=50,
                                          formatted=False)
                keywordWeights = []
                topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
                for tp in x:
                    words = []
                    weights = []
                    for pair in tp[1]:
                        words.append(pair[0])
                        weights.append(int(pair[1] * 10000))
                    keywordWeights.append(weights)

    # Top topics per paragraph
                topicNumbers = []
                for c in range(len(corpus)):
                    topTopics = []
                    topTopicProbabilities = []
                    for topicNumber in lda_model.get_document_topics(
                            corpus[c]):
                        topTopics.append(topicNumber[0])
                        topTopicProbabilities.append(topicNumber[1])
                    topTopicsSorted = [
                        x for _, x in sorted(zip(topTopicProbabilities,
                                                 topTopics),
                                             reverse=True)
                    ]
                    topicNumbers.append(topTopicsSorted)
                lessonsData['newTopTopics'] = topicNumbers
                lessonsData['topTopics'] = topicNumbers

                # Most probable topic per paragraph
                topTopics = []
                for index, row in lessonsData.iterrows():
                    if (row['topTopics']):
                        topTopics.append(row['topTopics'][0])
                    else:
                        topTopics.append(-1)
                lessonsData['topic'] = topTopics

                # Frequencies of topic keywords and number of PCRs per topic
                topics = pd.DataFrame()
                topicKeywords = []
                allKeywords = []
                topicIds = []
                for topic, words in topics_words:
                    allKeywords.append(words)
                    topicIds.append(topic)
                topics['key'] = topicIds
                topics['keywords'] = allKeywords
                topics['oldFrequencies'] = [[0] * len(keywords)
                                            for keywords in allKeywords]
                topics['numberOfLessons'] = 0
                topics['PCRs'] = [[] for i in range(len(topics))]
                topics['numberOfPCRs'] = 0

                for sentenceTopicNumbers, sentenceURL in zip(
                        topicNumbers, urls):
                    for topicNumber in sentenceTopicNumbers:
                        topics.at[topicNumber, 'numberOfLessons'] = topics.at[
                            topicNumber, 'numberOfLessons'] + 1
                        topics.at[topicNumber, 'PCRs'].append(sentenceURL)
                for index, row in topics.iterrows():
                    topics.at[index, 'numberOfPCRs'] = len(
                        set(topics.at[index, 'PCRs']))
                topics = topics.drop(columns=['PCRs'])

                # Frequencies of words per sentence per topic
                topics['oldFrequencies'] = [[0] * len(keywords)
                                            for keywords in allKeywords]
                for index, row in topics.iterrows():
                    topicNumber = topics.at[index, 'key']
                    topicKeywords = topics.at[index, 'keywords']
                    topicKeywordsFrequencies = topics.at[index,
                                                         'oldFrequencies']
                    for sentence, sentenceTopicNumbers in zip(
                            sentences, topicNumbers):
                        for sentenceTopicNumber in sentenceTopicNumbers:
                            if topicNumber == sentenceTopicNumber:
                                for word in sentence:
                                    if word in topicKeywords:
                                        indexOfWord = topicKeywords.index(word)
                                        topicKeywordsFrequencies[
                                            indexOfWord] = topicKeywordsFrequencies[
                                                indexOfWord] + 1
                    topics.at[index,
                              'oldFrequencies'] = topicKeywordsFrequencies
                topics['frequencies'] = keywordWeights

                # Top word per topic
                topicTopWords = []
                for index, row in topics.iterrows():
                    topicTopWords.append(row['keywords'][0])
                topics['topWord'] = topicTopWords

                # Adjacent topics
                # pyLDAvis.enable_notebook()
                vis = pyLDAvis.gensim.prepare(lda_model,
                                              corpus,
                                              dictionary=lda_model.id2word)
                topics['x'] = 1.0
                topics['y'] = 1.0
                for topic, x in zip(list(vis.topic_coordinates.index),
                                    list(vis.topic_coordinates.x)):
                    topics.at[topic, 'x'] = float(x)
                for topic, y in zip(list(vis.topic_coordinates.index),
                                    list(vis.topic_coordinates.y)):
                    topics.at[topic, 'y'] = float(y)

                import math

                def calculateDistance(x1, y1, x2, y2):
                    dist = math.sqrt((x2 - x1)**2 + (y2 - y1)**2)
                    return dist

                distanceMatrix = []
                allDistances = []
                c1 = 0
                topicsX = topics['x'].tolist()
                topicsY = topics['y'].tolist()
                for tx1, ty1 in zip(topicsX, topicsY):
                    distances = []
                    for tx2, ty2 in zip(topicsX, topicsY):
                        distance = calculateDistance(tx1, ty1, tx2, ty2)
                        if not distance:
                            # zero distance = the topic itself; use a large
                            # sentinel so it never counts as adjacent
                            distance = 999
                        else:
                            allDistances.append(distance)
                        distances.append(distance)
                    distanceMatrix.append(distances)
                    c1 = c1 + 1

                percentile20 = np.percentile(allDistances, 20)
                numberOfAdjacent = 0
                numberOfNodes = len(distanceMatrix)
                allAdjacentTopics = []
                for distances in distanceMatrix:
                    adjacentTopics = []
                    for index, distance in zip(range(len(distances)),
                                               distances):
                        if distance <= percentile20:
                            adjacentTopics.append(index)
                    allAdjacentTopics.append(adjacentTopics)
                    numberOfAdjacent = numberOfAdjacent + len(adjacentTopics)
                numberOfAdjacent = numberOfAdjacent / 2
                pairs = []
                for index, adjacentTopicList in zip(
                        range(len(allAdjacentTopics)), allAdjacentTopics):
                    for adjacentTopic in adjacentTopicList:
                        pairs.append(sorted([index, adjacentTopic]))
                pairs.sort()
                dedupedPairs = list(pairs
                                    for pairs, _ in itertools.groupby(pairs))
                topWordPairs = []
                for pair in dedupedPairs:
                    topWordPairs.append(
                        [topicTopWords[pair[0]], topicTopWords[pair[1]]])
                topics['adjacentTopics'] = allAdjacentTopics

                # Save topics data
                ef.deleteIndex(self.credentials, "topics")
                ef.saveTopics(self.credentials, topics)

                # Lesson strength
                maxLessonStrength = topics['numberOfPCRs'].sum()
                lessonStrengths = []
                for index, row in lessonsData.iterrows():
                    topicNumbers = row['topTopics']
                    lessonStrength = 0
                    for topicNumber in topicNumbers:
                        lessonStrength = lessonStrength + topics.at[
                            topicNumber, 'numberOfPCRs']
                    lessonStrengths.append(lessonStrength / maxLessonStrength)
                lessonsData['lessonStrength'] = lessonStrengths

                # Save lessons data
                ef.updateSentences(self.credentials, lessonsData)
                mf.backupIndex(self.credentials, "sentences")
                mf.backupIndex(self.credentials, "topics")

    # Update related lessons

    # Get TFIDF model
        if args.update_related_lessons == "True":
            tfidf = TfidfModel(corpus, smartirs='ntc')
            tfidf_corpus = []
            for doc in corpus:
                tfidf_corpus.append(tfidf[doc])
            tfidf_mat = matutils.corpus2dense(tfidf_corpus,
                                              num_terms=len(id2word.token2id))
            tfidf_mat_transpose = tfidf_mat.transpose()
            tfidfDF = pd.DataFrame(
                data=tfidf_mat_transpose[0:, 0:],
                index=[i for i in range(tfidf_mat_transpose.shape[0])],
                columns=[
                    '' + str(i) for i in range(tfidf_mat_transpose.shape[1])
                ])
            tfidfDF['id'] = ids.tolist()

            # Save related lessons
            cf.updateRelatedLessons(self.credentials, tfidfDF)
# Build another model using multicore LDA implementation and compare the coherence score
from gensim.models import LdaMulticore
ldamulticore = LdaMulticore(corpus=corpus,
                            num_topics=num_topics,
                            id2word=id2word,
                            workers=4,
                            eval_every=None,
                            passes=20,
                            batch=True,
                            per_word_topics=True)

# In[ ]:

# Display topics
from pprint import pprint
pprint(ldamulticore.show_topics(num_words=5, formatted=False))

# In[ ]:

# Compute Coherence Score for the multicore model
processed_data = pickle.load(open("processed_data_100_QAT.pkl", "rb"))
coherence_model_ldamulticore = CoherenceModel(model=ldamulticore,
                                              texts=processed_data,
                                              dictionary=id2word,
                                              coherence='c_v')
coherence_ldamulticore = coherence_model_ldamulticore.get_coherence()
print('Coherence Score: ', coherence_ldamulticore)

# In[ ]:

# Build another model using LDA implementation and compare the coherence score with the two previous models
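
# In[ ]:

# Sketch of the step described above (assumes the same corpus, id2word,
# num_topics and processed_data objects used for the multicore model)
from gensim.models import LdaModel
lda_single = LdaModel(corpus=corpus, num_topics=num_topics, id2word=id2word,
                      eval_every=None, passes=20)
coherence_model_lda = CoherenceModel(model=lda_single,
                                     texts=processed_data,
                                     dictionary=id2word,
                                     coherence='c_v')
print('Coherence Score: ', coherence_model_lda.get_coherence())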
Example #9
### Create BOW corpus ###
corpus = [dictionary.doc2bow(text) for text in text_list]

print("--- Corpus made: %s minutes ---" % round(((time.time() - start_time)/60),2)) 



start_lda_time = time.time()

#################################
######### Train LDA  ############
#################################

lda_model = LdaMulticore(corpus, num_topics=4, id2word=dictionary, passes=150, workers = 3)
final_topics = lda_model.show_topics()

print("--- LDA trained : %s minutes ---" % round(((time.time() - start_lda_time)/60),2)) 


#################################
##### Display WordCloud #########
#################################
curr_topic = 0
wc = WordCloud(background_color="black", max_words=2000,max_font_size=40, width=120, height=120, random_state=42)
for line in final_topics:
    line = line[1]
    scores = [float(x.split("*")[0]) for x in line.split(" + ")]
    words = [x.split("*")[1] for x in line.split(" + ")]
    freqs = []
    for word, score in zip(words, scores):
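        # (assumed completion; the original snippet is truncated at this point)
        freqs.append((word.strip('" '), score))

    # draw one cloud per topic (matplotlib.pyplot assumed imported as plt)
    wc.generate_from_frequencies(dict(freqs))
    plt.imshow(wc)
    plt.axis("off")
    plt.title("Topic #" + str(curr_topic))
    plt.show()
    curr_topic += 1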