Пример #1
0
def build_model(dictionary, corpus, n_topics, lemmatized_notes):
    """Train a gensim LDA and a MALLET LDA model for every requested topic count.

    Args:
        dictionary: Mapping passed as ``id2word`` to the models and as
            ``dictionary`` to ``CoherenceModel``.
        corpus: Corpus handed to both model constructors.
        n_topics: Iterable of topic counts; one pair of models is trained
            per element.
        lemmatized_notes: Texts used by ``CoherenceModel`` for the ``c_v``
            coherence measure.

    Returns:
        Tuple ``(model_mallet, coh_val_lda_mallet, model_lda, coh_val_lda)``
        of four parallel lists, one entry per element of ``n_topics``.
    """
    mallet_path = 'mallet/bin/mallet'
    coh_val_lda = []
    coh_val_lda_mallet = []
    model_lda = []
    model_mallet = []
    for topic in n_topics:
        # Build LDA model
        lda_model = LdaModel(corpus=corpus,
                             id2word=dictionary,
                             num_topics=topic,
                             random_state=100,
                             update_every=1,
                             chunksize=100,
                             passes=10,
                             alpha='auto',
                             per_word_topics=True)
        # Score the model trained in this iteration, not the accumulator list.
        coh_lda_model = CoherenceModel(model=lda_model,
                                       texts=lemmatized_notes,
                                       dictionary=dictionary,
                                       coherence='c_v')
        coh_val_lda.append(coh_lda_model.get_coherence())
        model_lda.append(lda_model)

        # Build LDA Mallet model with the current topic count (a single
        # number), not the whole iterable of candidates.
        lda_mallet = LdaMallet(mallet_path,
                               corpus=corpus,
                               num_topics=topic,
                               id2word=dictionary)
        coh_mallet_model = CoherenceModel(model=lda_mallet,
                                          texts=lemmatized_notes,
                                          dictionary=dictionary,
                                          coherence='c_v')
        model_mallet.append(lda_mallet)
        coh_val_lda_mallet.append(coh_mallet_model.get_coherence())
    return model_mallet, coh_val_lda_mallet, model_lda, coh_val_lda
Пример #2
0
def topic_count_selection(dictionary: Dictionary, corpus: list,
                          tokenized_docs: list, test_range: tuple) -> tuple:
    """
    Train one MALLET LDA model per topic count and score its coherence.

    Topic counts run over ``range(test_range[0], test_range[1])``, so the
    upper bound is exclusive.

    Returns:
    -------
    lm_list : List of LDA topic models, in topic-count order
    c_v : ``c_v`` coherence value of each model, in the same order
    """
    lower, upper = test_range[0], test_range[1]
    trained_models = []
    coherence_scores = []

    for topic_count in range(lower, upper):
        mallet_model = LdaMallet('/home/hadoop/Mallet-master/bin/mallet',
                                 corpus=corpus,
                                 num_topics=topic_count,
                                 id2word=dictionary,
                                 iterations=1000,
                                 prefix=f'{os.getcwd()}/models/MALLET/',
                                 random_seed=42)
        trained_models.append(mallet_model)

        scorer = CoherenceModel(model=mallet_model,
                                texts=tokenized_docs,
                                dictionary=dictionary,
                                coherence='c_v')
        coherence_scores.append(scorer.get_coherence())

    return trained_models, coherence_scores
Пример #3
0
    def gensim_mallet_lda(self, num_topics=5, num_words=15):
        """Train a MALLET LDA model through gensim and print its topics.

        Reads ``self.gensim_corpus``, ``self.id2word`` and
        ``self.gensim_words_nostops``; the original docstring states these
        come from gensim_preprocessing() for the selected column.

        Args:
            num_topics (int): Desired number of topics to model.
            num_words (int): Number of words to print for each topic.
        """
        model = LdaMallet(self.mallet_path,
                          corpus=self.gensim_corpus,
                          num_topics=num_topics,
                          id2word=self.id2word)

        column_label = self.data_frame.columns.to_numpy()[self.col_num]
        print(f"Column {self.col_num} - Label: {column_label}\n")
        print(f"MALLET LDA Topic Modeling via Gensim with {num_topics} topics:\n")

        # Unformatted topics: each entry holds the topic id followed by its
        # (word, weight) pairs; only the words are printed.
        raw_topics = model.show_topics(num_topics=num_topics,
                                       num_words=num_words,
                                       log=False,
                                       formatted=False)
        for entry in raw_topics:
            topic_id = entry[0]
            words = [pair[0] for pair in entry[1]]
            print(f"Topic {str(topic_id)}:\n{str(words)}\n")

        score = self.coherence_score(model,
                                     self.gensim_words_nostops,
                                     self.id2word)
        print(f"Coherence: {score}")
Пример #4
0
    def lda(self, column, method='mallet', save_model=None, load_model=None):
        """Train a MALLET LDA model on ``column`` and store document-topic features.

        Results are written to ``self.features["lda"]`` (one topic-weight
        vector per document) and ``self.feature_names["lda"]`` (the model's
        topic-term matrix from ``get_topics()``).

        Args:
            column: Column whose bag-of-words representation is modelled.
            method: Only ``'mallet'`` is supported.
            save_model: Accepted for interface compatibility; not used here.
            load_model: Accepted for interface compatibility; not used here.

        Raises:
            ValueError: If ``method`` is anything other than ``'mallet'``.
        """
        if method != 'mallet':
            raise ValueError("Invalid parameter for LDA.method: {}".format(method))
        print("Mallet LDA")

        tmp_dir = os.path.join(tempfile.gettempdir(), "mallet_lda/")
        # exist_ok avoids the check-then-create race of a separate exists().
        os.makedirs(tmp_dir, exist_ok=True)

        if not hasattr(self, "vocab"):
            self.__learn_vocab(column)

        if len(self.__bag_of_words) != 0:
            docs, id2word = self.__bag_of_words[column]
        else:
            docs, id2word = self.__get_bag_of_words(column)

        model = LdaMallet(mallet_path=self.mallet_path,
                          id2word=id2word,
                          prefix=tmp_dir,
                          num_topics=self.num_topics,
                          iterations=self.lda_max_iter,
                          optimize_interval=20)
        model.train(docs)

        # Each doc_vec is a sequence of (topic_id, weight) pairs; keep only
        # the weights.
        doc_topics = [
            np.array([weight for _, weight in doc_vec])
            for doc_vec in model.read_doctopics(model.fdoctopics())
        ]
        self.features["lda"] = np.array(doc_topics)
        self.feature_names["lda"] = model.get_topics()
Пример #5
0
def lda(filename):
    """Train MALLET LDA models on a tweet CSV and dump the top words per topic.

    For each topic count in ``[20, 30, 50]`` a model is trained and a file
    ``lda<num_topics>.txt`` is written to the current directory with one
    line per topic holding its 20 top words, each followed by a space.

    Args:
        filename: Path to a CSV file with a ``text`` column; each value is
            tokenised by splitting on single spaces.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    path_to_mallet_binary = "/home/xiu-xiu/Mallet/bin/mallet"

    with open(filename, newline='') as csvfile:
        tweets = [row['text'].split(' ') for row in csv.DictReader(csvfile)]

    dictionary = Dictionary(tweets)
    corpus = [dictionary.doc2bow(tweet) for tweet in tweets]

    for num_topics in [20, 30, 50]:
        model = LdaMallet(path_to_mallet_binary,
                          corpus=corpus,
                          num_topics=num_topics,
                          id2word=dictionary)
        with open('lda' + str(num_topics) + '.txt', 'w') as result:
            for topic in range(num_topics):
                # One write per topic line; content matches word-by-word
                # writing (every word is followed by a space).
                line = ''.join(entry[0] + ' '
                               for entry in model.show_topic(topic, topn=20))
                result.write(line + '\n')
 def instanciate_model(self, num_topics, passes, iterations,
                       enable_mallet, optimize_interval, topic_threshold, show_topics_on_creation=False):
     """Build ``self.lda_model`` as either a MALLET or a multicore gensim LDA model.

     ``enable_mallet`` and ``show_topics_on_creation`` are compared with
     ``is True``, so only the literal ``True`` enables them.

     :param num_topics: number of topics for either model
     :param passes: training passes (LdaMulticore only)
     :param iterations: iteration count for either model
     :param enable_mallet: ``True`` selects the MALLET wrapper
     :param optimize_interval: MALLET hyperparameter optimisation interval
     :param topic_threshold: MALLET topic threshold
     :param show_topics_on_creation: ``True`` pretty-prints the topics
     """
     preview_topics = show_topics_on_creation is True

     if enable_mallet is not True:
         self.lda_model = LdaMulticore(corpus=self.corpus,
                                       id2word=self.id2word,
                                       num_topics=num_topics,
                                       random_state=100,
                                       chunksize=500,
                                       passes=passes,
                                       iterations=iterations,
                                       per_word_topics=True)
         print('LDA_MultiCore model built\n')
         if preview_topics:
             pprint(self.lda_model.print_topics())
         return

     # Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
     os.environ['MALLET_HOME'] = r'C:/mallet-2.0.8/'
     self.mallet_path = 'C:\\mallet-2.0.8\\bin\\mallet'  # update this path
     self.lda_model = LdaMallet(self.mallet_path,
                                corpus=self.corpus,
                                num_topics=num_topics,
                                id2word=self.id2word,
                                iterations=iterations,
                                optimize_interval=optimize_interval,
                                topic_threshold=topic_threshold)
     print('Mallet LDA model built\n')
     if preview_topics:
         pprint(self.lda_model.show_topics(formatted=False))
    def topic_modelling(data_object_name):
        """
        Perform topic modeling for a given set of posts (data object) and
        pretty-print the top topics of the resulting model.

        :param data_object_name: name of the serialized raw data to load
        """
        posts = Serialization.load_obj(data_object_name)

        english_stop_words = stopwords.words('english')
        print('removing stopwords and unfrequent words...')
        word_ranks = Serialization.load_obj('dict.ranks')
        posts = Utils.remove_noncontent_words(posts, english_stop_words,
                                              word_ranks)

        id2word = corpora.Dictionary(posts)
        corpus = [id2word.doc2bow(post) for post in posts]

        topic_count = CS_TOPICS
        print('performing topic modeling with', topic_count, 'topics')
        mallet_model = LdaMallet(mallet_path,
                                 corpus=corpus,
                                 num_topics=topic_count,
                                 id2word=id2word)
        # top_topics is called on the converted gensim model, not on the
        # MALLET wrapper itself.
        converted = malletmodel2ldamodel(mallet_model)
        pprint(converted.top_topics(corpus, posts, id2word))
        '''
 def setUp(self):
     """Create the topic fixtures and the LDA / MALLET / Vowpal Wabbit models.

     The MALLET model is built only when ``MALLET_HOME`` is set and the
     Vowpal Wabbit model only when ``VOWPAL_WABBIT_PATH`` is set.
     """
     # Two hand-written topic sets. `topics1` separates system-human
     # interaction from graphs clearly, so both coherence measures are
     # expected to rate it higher than `topics2`.
     self.topics1 = [['human', 'computer', 'system', 'interface'],
                     ['graph', 'minors', 'trees', 'eps']]
     self.topics2 = [['user', 'graph', 'minors', 'system'],
                     ['time', 'graph', 'survey', 'minors']]

     self.ldamodel = LdaModel(corpus=corpus,
                              id2word=dictionary,
                              num_topics=2,
                              passes=0,
                              iterations=0)

     mallet_home = os.environ.get('MALLET_HOME', None)
     if mallet_home:
         self.mallet_path = os.path.join(mallet_home, 'bin', 'mallet')
     else:
         self.mallet_path = None
     if self.mallet_path:
         self.malletmodel = LdaMallet(mallet_path=self.mallet_path,
                                      corpus=corpus,
                                      id2word=dictionary,
                                      num_topics=2,
                                      iterations=0)

     vw_path = os.environ.get('VOWPAL_WABBIT_PATH', None)
     if vw_path:
         self.vw_path = vw_path
         self.vwmodel = LdaVowpalWabbit(self.vw_path,
                                        corpus=corpus,
                                        id2word=dictionary,
                                        num_topics=2,
                                        passes=0)
     else:
         msg = "Environment variable 'VOWPAL_WABBIT_PATH' not specified, skipping sanity checks for LDA Model"
         logging.info(msg)
         self.vw_path = None
Пример #9
0
def fit_lda(prefix, tokenized_docs, id2word,
            mallet_path=os.environ.get("MALLET_PATH"),
            num_topics=500, iterations=500):
    """Load a cached MALLET LDA model from ``prefix`` or train and cache one.

    Args:
        prefix: Directory holding the model artefacts; created if missing.
        tokenized_docs: Object exposing ``.values.tolist()`` that yields one
            token list per document (presumably a pandas Series - confirm
            against callers). ``None`` means "load only".
        id2word: Dictionary used for ``doc2bow`` and as the model vocabulary.
        mallet_path: Path to the MALLET binary. Defaults to the
            ``MALLET_PATH`` environment variable read at import time; a
            missing variable yields ``None`` instead of failing the import.
        num_topics: Number of topics to train.
        iterations: Number of MALLET iterations.

    Returns:
        The loaded or newly trained model.

    Raises:
        ValueError: If no cached model exists and ``tokenized_docs`` is
            ``None``, or if no MALLET path is available for training.
    """
    model_file = os.path.join(prefix, "saved_model.pkl")

    if not os.path.isdir(prefix):
        os.makedirs(prefix)

    if os.path.exists(model_file):
        return utils.SaveLoad.load(model_file)
    elif tokenized_docs is None:
        raise ValueError("LDA model not found at {}/{}".format(prefix, "saved_model.pkl"))

    if mallet_path is None or mallet_path == "":
        raise ValueError("No mallet path specified")

    corpus = [id2word.doc2bow(tokens) for tokens in tokenized_docs.values.tolist()]

    lda_model = LdaMallet(mallet_path=mallet_path,
                          prefix=prefix,
                          corpus=corpus,
                          id2word=id2word,
                          iterations=iterations,
                          workers=4,
                          num_topics=num_topics,
                          optimize_interval=20)
    lda_model.save(model_file)
    id2word.save_as_text(os.path.join(prefix, "id2word"))

    # save clean lda weights for later analysis
    W = lda_model.get_topics()
    W = pd.DataFrame(W).rename(columns=id2word)
    W.index = pd.Series(["lda.{}".format(i) for i in range(len(W))], name="topic_id")
    W.to_csv(os.path.join(prefix, "lda_weights.csv"))
    return lda_model
Пример #10
0
 def learn_lda_model(self, corpus, dictionary, k, iterations=100):
     """
     Train an LDA model with ``k`` topics and report its ``u_mass`` coherence.

     Uses the MALLET wrapper when ``self.use_mallet`` is truthy, otherwise
     gensim's ``LdaMulticore``. The line ``"<k>: <coherence>"`` is printed.

     :param corpus: corpus created by gensim
     :param dictionary: dictionary created by gensim
     :param k: number of topics
     :param iterations: number of iterations
     :return: tuple ``(coherence, model)``
     """
     if self.use_mallet:
         model = LdaMallet(self.path_to_mallet_binary,
                           corpus=corpus,
                           id2word=dictionary,
                           workers=self.cpu_count,
                           num_topics=k,
                           random_seed=42,
                           iterations=iterations,
                           optimize_interval=10)
     else:
         model = LdaMulticore(corpus,
                              id2word=dictionary,
                              workers=self.cpu_count,
                              num_topics=k,
                              random_state=42,
                              iterations=iterations,
                              per_word_topics=False,
                              eval_every=None)

     score = CoherenceModel(model=model, corpus=corpus,
                            coherence='u_mass').get_coherence()
     print('{}: {}'.format(k, score))
     return score, model
Пример #11
0
 def create_mallet_lda_model(self,
                             no_topics,
                             random_state=42,
                             workers=None,
                             mallet_path="mallet-2.0.8/bin/mallet",
                             iterations=1000,
                             custom_prefix=None):
     """
     Train a MALLET LDA model via the gensim wrapper and keep it in
     ``self.lda_model``.

     :param no_topics: Number of topics for the lda model
     :param random_state: Seed forwarded as ``random_seed`` for reproducibility
     :param workers: Number of workers; ``None`` falls back to ``self.processes``
     :param mallet_path: path to mallet binary, e.g. "mallet-2.0.8/bin/mallet"
     :param iterations: iteration count forwarded to the wrapper
     :param custom_prefix: optional tag inserted into the temp-file prefix
     """
     worker_count = self.processes if workers is None else workers

     # Build the bag of words lazily on first use.
     if self.bag_of_words is None:
         self.create_bag_of_words()

     prefix_tag = "" if custom_prefix is None else f"{custom_prefix}_"
     file_prefix = f"{self.path}mallet_temp_{prefix_tag}"

     self.lda_model = LdaMallet(num_topics=no_topics,
                                mallet_path=mallet_path,
                                corpus=self.bag_of_words,
                                id2word=self.id2word,
                                random_seed=random_state,
                                iterations=iterations,
                                workers=worker_count,
                                prefix=file_prefix)
Пример #12
0
    def run_lda(self, processed_sentences):
        """Train MALLET LDA on the sentences and extract noun keywords per topic.

        Every kept topic word is also appended to ``self.total_topic_word``.

        Args:
            processed_sentences: Iterable of whitespace-separated strings;
                falsy entries are skipped.

        Returns:
            Tuple ``(topic_top_words, topic_words_and_probs)``: per topic the
            lemmatized noun keywords (at most
            ``self.number_of_lda_keywords``) and the ``show_topic`` pairs
            with weight greater than 0.
        """
        lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

        tokenized = [
            sentence.split() for sentence in processed_sentences if sentence
        ]
        vocabulary = corpora.Dictionary(tokenized)
        # Drop words seen in fewer than 5 comments or in more than half of them.
        vocabulary.filter_extremes(no_below=5, no_above=0.5)
        bow_corpus = [vocabulary.doc2bow(tokens) for tokens in tokenized]

        lda = LdaMallet('C:\\Mallet-2.0.8\\bin\\mallet',
                        bow_corpus,
                        num_topics=self.number_of_topics,
                        id2word=vocabulary,
                        optimize_interval=50,
                        alpha=1)

        # Top words with their weights for each topic.
        topic_words_and_probs = [
            lda.show_topic(topic_id,
                           topn=self.number_of_lda_keywords_for_assignment)
            for topic_id in range(self.number_of_topics)
        ]

        # Keep only entries with a strictly positive weight.
        for topic_id, pairs in enumerate(topic_words_and_probs):
            kept = []
            for pair in pairs:
                if pair[1] > 0.0:
                    kept.append(pair)
                    self.total_topic_word.append(pair[0])
            topic_words_and_probs[topic_id] = kept

        noun_tags = ('NN', 'NNS', 'NP', 'NPS')
        topic_top_words = []
        for topic_id in range(self.number_of_topics):
            nouns = []
            for pair in topic_words_and_probs[topic_id]:
                if len(nouns) >= self.number_of_lda_keywords:
                    break
                # Tag the surface form first, then store the lemma.
                tagged = nltk.pos_tag([pair[0]])
                lemma = lemmatizer.lemmatize(pair[0])
                if lemma not in nouns and tagged[0][1] in noun_tags:
                    nouns.append(lemma)
            topic_top_words.append(nouns)

        return topic_top_words, topic_words_and_probs
Пример #13
0
def get_lda_mallet_model(doc_term_matrix, id2word, fname):
    """Return a MALLET LDA model, reusing one stored at ``fname`` when possible.

    Args:
        doc_term_matrix: Bag-of-words corpus used for training.
        id2word: Dictionary mapping ids to words.
        fname: Path the model is saved to via ``_save_model``; when not
            ``None`` a load from this path is attempted first.

    Returns:
        The loaded model, or a newly trained 10-topic model.
    """
    mallet_path = '../../model/mallet-2.0.8/bin/mallet'

    if fname is not None:
        # NOTE(review): assumes _save_model writes a file that
        # LdaMallet.load can read - confirm against _save_model.
        try:
            return LdaMallet.load(fname)
        except Exception:
            # Best effort: any load failure (missing or unreadable file)
            # falls back to training a fresh model below.
            pass

    lda_mallet = LdaMallet(mallet_path=mallet_path,
                           corpus=doc_term_matrix,
                           id2word=id2word,
                           num_topics=10)

    _save_model(lda_mallet, fname=fname)

    return lda_mallet
def main():
    """Run MALLET LDA on the CSV corpus and export topic-word / topic-doc tables.

    Returns:
        ``0`` on completion.
    """
    print("\n-----LDA CONCEPT DETECITON-----")
    raw_corpus = load_from_csv(CORPUS_PATH)

    # CountVectorizer used by remove_short_segs
    stop_words = load_stop_words("data/stopwords-fr.txt")
    vectorizer = CountVectorizer(lowercase=True,
                                 max_df=MAX_DF,
                                 min_df=MIN_DF,
                                 token_pattern=r"(?u)\b\w\w\w+\b")

    _, segment_texts = remove_short_segs(raw_corpus, vectorizer)
    tokenized_segments = [segment.split() for segment in segment_texts]

    # Drop stop words and tokens shorter than three characters, keeping one
    # (possibly empty) token list per segment.
    filtered_segments = [
        [token for token in tokens
         if token not in stop_words and len(token) >= 3]
        for tokens in tokenized_segments
    ]

    id2word = Dictionary(filtered_segments)
    bow_corpus = [id2word.doc2bow(tokens) for tokens in filtered_segments]

    # initialize model
    mallet_model = LdaMallet("Mallet/bin/mallet",
                             corpus=bow_corpus,
                             num_topics=14,
                             id2word=id2word,
                             optimize_interval=1,
                             random_seed=9,
                             iterations=5)

    def run_suffix():
        # "<seed>_<iterations>_<num_topics>.csv" for the trained model
        return (str(mallet_model.random_seed) + "_" +
                str(mallet_model.iterations) + "_" +
                str(mallet_model.num_topics) + ".csv")

    # The document-topic file is read in full; the result itself is unused.
    list(mallet_model.read_doctopics(mallet_model.fdoctopics(), renorm=False))

    topic_word = TopicWord(mallet_model)
    topic_word.get_topic_word()
    topic_word.write_to_csv("../output/topic_" + run_suffix())

    topic_doc = TopicDoc(mallet_model)
    topic_doc.get_topic_doc()
    topic_doc.write_to_csv("output/topic_doc" + run_suffix(), num_docs=50)

    return 0
Пример #15
0
 def fit(self, X, y=None):
     """Convert ``X`` to gensim inputs and train a MALLET LDA model on it.

     :param X: data passed to ``self.vect2gensim`` with ``self.vectorizer``
     :param y: ignored
     :return: ``self``, with the trained model stored in ``self.model``
     """
     print('vect2gensim')
     gensim_corpus, gensim_dictionary = self.vect2gensim(self.vectorizer, X)
     trained = LdaMallet(self.mallet_path,
                         iterations=self.iterations,
                         corpus=gensim_corpus,
                         num_topics=self.n_components,
                         id2word=gensim_dictionary)
     self.model = trained
     return self
Пример #16
0
def LDA(dictionary, corpus, k_topics, iterations):
    """Train a MALLET LDA model using the binary located next to this file.

    Args:
        dictionary: id-to-word mapping.
        corpus: bag-of-words corpus.
        k_topics: number of topics.
        iterations: number of MALLET iterations.

    Returns:
        The trained ``LdaMallet`` model.
    """
    print("Iniciando LDA...")
    module_dir = os.path.dirname(os.path.abspath(__file__))
    mallet_binary = module_dir + '/mallet-2.0.8/bin/mallet'
    return LdaMallet(mallet_binary,
                     corpus=corpus,
                     num_topics=k_topics,
                     id2word=dictionary,
                     iterations=iterations)
Пример #17
0
    def mallet_lda(self, num):
        """Train a MALLET LDA model on ``self.data['token']``.

        Args:
            num: Number of topics to train and to print.

        Returns:
            Result of ``print_topics(num, num_words=6)`` on the model.
        """
        token_lists = self.data['token']
        id2word = corpora.Dictionary(token_lists)
        bow_corpus = [id2word.doc2bow(tokens) for tokens in token_lists]

        os.environ['Mallet_HOME'] = 'C:\\Mallet'
        model = LdaMallet('C:\\Mallet\\bin\\mallet',
                          corpus=bow_corpus,
                          num_topics=num,
                          id2word=id2word)
        return model.print_topics(num, num_words=6)
Пример #18
0
def run():
    """Score the preprocessed dataset with a MALLET topic model.

    If ``./models/MALLET/mallet_model.pkl`` exists, the stored model and
    dictionary are loaded and the data is treated as unseen. Otherwise
    models for 5..49 topics are trained, the topic count with the highest
    coherence is selected, and the model, corpus and dictionary are saved.
    The scored frame is written to ``./data/tmp/scored.pkl``.
    """
    # Get the Preprocessed Dataset
    df = pd.read_pickle('./data/tmp/preprocessed.pkl')

    if os.path.isfile('./models/MALLET/mallet_model.pkl'):
        # Let's not do any model retraining without building in topic stability constraints
        #     e.g. number of docs or tokens now in different topics

        seen = False  # Data we provide is new and unseen for the model
        # NOTE: pickle.load can execute arbitrary code; these files must
        # only ever be artefacts written by this pipeline.
        with open('./models/MALLET/mallet_model.pkl', 'rb') as modelfile:
            topic_model = pickle.load(modelfile)

        with open('./models/MALLET/mallet_dict.pkl', 'rb') as dictfile:
            dictionary = pickle.load(dictfile)
        df['bow'] = df['tokens'].apply(dictionary.doc2bow)

    else:
        seen = True  # any data we provide is used to train the model
        with Timer('Train the LDA Model'):
            test_range = (5, 50)
            df, corpus, dictionary = get_corpus_and_dict(df, 'tokens')
            list_of_models, scores = topic_count_selection(
                dictionary, corpus, list(df['tokens']), test_range)

            plot_coherence(
                test_range,
                scores).savefig('./models/MALLET/ModelCoherence.png')

            # Let's save the model with highest coherence.
            # scores[i] is taken to belong to test_range[0] + i topics, as
            # in a range(test_range[0], test_range[1]) loop, so no extra
            # offset is added.
            num_topics = test_range[0] + scores.index(max(scores))
            topic_model = LdaMallet('/home/hadoop/Mallet-master/bin/mallet',
                                    corpus=corpus,
                                    num_topics=num_topics,
                                    id2word=dictionary,
                                    iterations=1000,
                                    prefix=f'{os.getcwd()}/models/MALLET/',
                                    random_seed=42)

            print(f"* Chosen Model with {num_topics} topics")
            with open('./models/MALLET/mallet_model.pkl', 'wb') as modelfile:
                topic_model.save(modelfile)
            with open('./models/MALLET/mallet_corpus.pkl', 'wb') as corpusfile:
                pickle.dump(corpus, corpusfile)
            with open('./models/MALLET/mallet_dict.pkl', 'wb') as dictfile:
                pickle.dump(dictionary, dictfile)

    df = get_topic_model_scores(df, topic_model, seen=seen)
    df.to_pickle('./data/tmp/scored.pkl')

    print("\nSample")
    print(df.head(), "\n")
    def find_best_number_of_topics(data):
        """Print the ``c_v`` coherence of MALLET LDA models with 2 to 9 topics.

        :param data: tokenized documents used for the dictionary, the
            corpus and the coherence texts
        """
        vocabulary = Dictionary(data)
        bow_corpus = [vocabulary.doc2bow(tokens) for tokens in data]

        coherence_by_topics = {}
        for topic_count in range(2, 10):
            print('performing topic modeling with', topic_count, 'topics')
            model = LdaMallet(TopicModeling.MALLET_PATH, corpus=bow_corpus,
                              num_topics=topic_count, id2word=vocabulary)
            scorer = CoherenceModel(model=model, texts=data, coherence='c_v')
            coherence_by_topics[topic_count] = scorer.get_coherence()

        print('coherence scores: the higher, the better:', coherence_by_topics)
Пример #20
0
def get_lda_mallet_model(doc_term_matrix, id2word, fname):
    """Train and save a MALLET LDA model, or load a stored one.

    The module-level ``params`` decides the mode: a truthy
    ``params['training']`` trains with ``params['num_topics']`` topics and
    saves the model; otherwise the model is loaded from ``fname``.

    Returns:
        The trained or loaded model.
    """
    if not params['training']:
        return _load_model('mallet', fname)

    trained = LdaMallet(mallet_path='../model/mallet/bin/mallet',
                        corpus=doc_term_matrix,
                        id2word=id2word,
                        workers=6,
                        num_topics=params['num_topics'])
    _save_model('mallet', trained, fname=fname)
    return trained
 def model_topic(data_words, topics):
     """
     Build a dictionary and corpus from the data and train a MALLET topic model.

     :param data_words: data for topic modeling (e.g., a set of posts)
     :param topics: number of desired topics
     :return: topic model
     """
     vocabulary = corpora.Dictionary(data_words)
     bow_corpus = [vocabulary.doc2bow(tokens) for tokens in data_words]
     print('performing topic modeling with', topics, 'topics')
     model = LdaMallet(mallet_path,
                       corpus=bow_corpus,
                       num_topics=topics,
                       id2word=vocabulary)
     return model
Пример #22
0
 def train(self, train_filename):
     """Load a cached MALLET LDA model for the training file or train a new one.

     The cache file is ``<basename>.lda_model`` in the current directory.
     After fresh training the model is saved there and its topics are
     written to ``<basename>.lda_model.topics``.

     :param train_filename: path of the training data file
     """
     print("train LDA")
     train_name = os.path.basename(train_filename)
     model_filename = train_name + ".lda_model"
     if os.path.isfile(model_filename):
         self.model = LdaMallet.load(model_filename)
         return

     self.corpus = preprocessing.GensimCorpus(train_filename)
     self.model = LdaMallet(mallet_path,
                            self.corpus,
                            num_topics=100,
                            id2word=self.corpus.dictionary)
     self.model.save(model_filename)
     topics_str = self.model.show_topics(num_topics=-1)
     # Context manager guarantees the topics file is flushed and closed.
     with open(train_name + ".lda_model.topics", 'w') as topics_file:
         topics_file.write(str(topics_str))
Пример #23
0
def train_model(num_topics, documents):
    """Train a MALLET LDA model and convert it to a gensim LDA model.

    Args:
        num_topics: Number of topics to train.
        documents: Tokenized documents.

    Returns:
        Tuple ``(lda_model, corpus_bow, dictionary)``.
    """
    dictionary = corpora.Dictionary(documents)
    # keep_n equals the unfiltered vocabulary size, so only the
    # no_below / no_above thresholds remove tokens.
    vocabulary_size = len(dictionary.keys())
    dictionary.filter_extremes(no_below=10, no_above=0.7,
                               keep_n=vocabulary_size)

    corpus_bow = [dictionary.doc2bow(tokens) for tokens in documents]
    mallet_model = LdaMallet(mallet_path=MALLET_BINARY_PATH,
                             corpus=corpus_bow,
                             id2word=dictionary,
                             num_topics=num_topics)
    converted = ldamallet.malletmodel2ldamodel(mallet_model)
    return converted, corpus_bow, dictionary
Пример #24
0
 def LdaModel(self, num_topics, corpus, dictionary):
     """Create an LDA topic model with the MALLET wrapper (seed 123).
         Input:
                 num_topics: number of topics for the model
                 corpus: gensim corpus
                 dictionary: gensim dictionary
         Output:
                 lda_model: a topic model using Latent Dirichlet Allocation (LDA)
         """
     return LdaMallet(mallet_path=self.path_to_mallet_bin,
                      num_topics=num_topics,
                      corpus=corpus,
                      id2word=dictionary,
                      random_seed=123)
Пример #25
0
def compute_coherence_values(dnary, corpus, texts, limit, start=2, step=1):
    """Train MALLET LDA models over a range of topic counts and score them.

    Args:
        dnary: gensim dictionary.
        corpus: bag-of-words corpus.
        texts: tokenized texts used for the ``c_v`` coherence.
        limit: exclusive upper bound of the topic-count range.
        start: first topic count.
        step: increment between topic counts.

    Returns:
        Tuple ``(model_list, coherence_values)`` of parallel lists.
    """
    trained_models = []
    scores = []
    for topic_count in range(start, limit, step):
        candidate = LdaMallet(mallet_path,
                              corpus=corpus,
                              id2word=dnary,
                              num_topics=topic_count,
                              workers=3)
        trained_models.append(candidate)
        scorer = CoherenceModel(model=candidate,
                                texts=texts,
                                dictionary=dnary,
                                coherence='c_v')
        scores.append(scorer.get_coherence())

    return trained_models, scores
Пример #26
0
def lda(bow, df, vocab):
    """Train a 5-topic MALLET LDA model and print its topics and document topics.

    Args:
        bow: Input passed to ``text_to_corpus``.
        df: Unused.
        vocab: id-to-word mapping for the model.
    """
    # The return value is not used; the corpus is read from 'corpus.npy'.
    # NOTE(review): presumably text_to_corpus writes that file - confirm.
    text_to_corpus(bow)
    corpus = np.load('corpus.npy')

    model = LdaMallet('./mallet-2.0.8/bin/mallet',
                      corpus=corpus,
                      num_topics=5,
                      workers=4,
                      id2word=vocab)

    # print response
    for topic in model.print_topics(num_topics=-1, num_words=50):
        print(topic)
    for doc_topics in model[corpus]:
        print(doc_topics)
    def set_model(self, lang: str, data_version: int, dictionary_version: float, model_version: str, param_name: str, param_version: int,
                  model_file_path: str, language_processed_data: list):
        """Train a MALLET LDA model, save it and record its evaluation metrics.

        The model is stored in ``self.model`` and saved to
        ``model_file_path``. Metrics computed from
        ``language_processed_data`` are written together with the model
        parameters and the given version identifiers.
        """
        module_dir = os.path.abspath(os.path.dirname(__file__))
        logging.info("---- Creating LDA Mallet model")
        logging.info("------ Getting LDA Mallet model file")
        mallet_binary = os.path.join(module_dir, "../../statics/mallet-2.0.8/bin/mallet")

        # Index the dictionary once before reading id2token.
        # NOTE(review): presumably needed to populate the lazily built
        # id2token mapping - confirm.
        self.essentials.dictionary[0]

        trained = LdaMallet(mallet_binary,
                            corpus=self.essentials.corpus,
                            num_topics=self.number_of_topics,
                            id2word=self.essentials.dictionary.id2token)
        trained.save(model_file_path)
        self.model = trained
        logging.info("---- LDA Mallet model is created")

        metrics = self.get_model_evaluation_metrics(language_processed_data)
        parameters = self.get_model_parameters()
        self.write_model_evaluation_metrics(lang, data_version,
                                            dictionary_version, model_version,
                                            param_name, param_version,
                                            metrics, parameters)
Пример #28
0
def get_LDA_mallet_model(paths, num_topics, iterations, minimum_probability):
    """Train a MALLET LDA model from a pickled corpus and a saved dictionary.

    Args:
        paths: Indexable of paths; ``paths[1]`` is the pickled corpus,
            ``paths[9]`` the saved dictionary and ``paths[16]`` the base of
            the MALLET file prefix.
        num_topics: Number of topics; also appended to the prefix.
        iterations: Number of MALLET iterations.
        minimum_probability: Unused.

    Returns:
        The trained ``LdaMallet`` model.
    """
    with open(paths[1], 'rb') as corpus_file:
        # sparse terms (sparse matrix form of corpus)
        corpus = pickle.load(corpus_file)

    dictionary = Dictionary.load(paths[9])
    dictionary[0]  # This is only to "load" the dictionary.

    return LdaMallet(
        mallet_path=os.getenv('MALLET_BIN'),
        corpus=corpus,
        num_topics=num_topics,
        prefix=f'{paths[16]}{num_topics}',
        id2word=dictionary.id2token,
        workers=3,
        iterations=iterations,  # topic_threshold=minimum_probability
    )
Пример #29
0
def get_lda_model(corpus, id2word, model_type, num_topics, mallet_path):
    """Train either a gensim LDA model or a MALLET LDA model.

    Args:
        corpus: bag-of-words corpus.
        id2word: id-to-word mapping.
        model_type: ``'lda'`` for gensim's LdaModel, ``'mallet'`` for the
            MALLET wrapper.
        num_topics: number of topics.
        mallet_path: path to the MALLET binary (used for ``'mallet'`` only).

    Returns:
        The trained model.

    Raises:
        ValueError: If ``model_type`` is neither ``'lda'`` nor ``'mallet'``.
    """
    if model_type == 'lda':
        return gensim.models.ldamodel.LdaModel(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=num_topics,
                                               random_state=100,
                                               update_every=1,
                                               chunksize=100,
                                               passes=4,
                                               alpha='auto',
                                               per_word_topics=True)
    if model_type == 'mallet':
        return LdaMallet(mallet_path,
                         corpus=corpus,
                         id2word=id2word,
                         num_topics=num_topics)
    raise ValueError(
        "Unknown model type. Available types: 'lda', 'mallet'")
Пример #30
0
def train_lda_mallet(corpus, id2word, num_topics, params: dict):
    """Train a MALLET LDA model configured through ``params``.

    Recognised keys, each with a module-level or literal fallback:
    ``mallet_path``, ``prefix_path``, ``prefix``, ``iterations``, ``alpha``
    (default 50) and ``random_state``. A non-empty ``prefix`` gets a
    trailing underscore before it is joined onto ``prefix_path``.

    Returns:
        The trained ``LdaMallet`` model.
    """
    binary_path = params.get('mallet_path', MALLET_PATH)

    default_prefix_dir = str(Path(ARTEFACTS_PATH) / PREFIX_BASE_PATH)
    prefix_dir = params.get('prefix_path', default_prefix_dir)

    file_tag = params.get('prefix', '')
    if file_tag:
        file_tag = file_tag + '_'
    full_prefix = str(Path(prefix_dir) / file_tag)

    n_iterations = params.get('iterations', ITERATIONS)
    alpha_value = params.get('alpha', 50)
    seed = params.get('random_state', RANDOM_STATE)

    return LdaMallet(mallet_path=binary_path,
                     prefix=full_prefix,
                     corpus=corpus,
                     id2word=id2word,
                     num_topics=num_topics,
                     alpha=alpha_value,
                     iterations=n_iterations,
                     random_seed=seed)