def main(self):
    print('Loading data')
    data = pd.read_csv('../../resources/abcnews-date-text.csv', error_bad_lines=False)
    data_text = data[['headline_text']]
    data_text['index'] = data_text.index
    documents = data_text
    np.random.seed(2018)

    print('Preprocessing text')
    preprocessed_docs = documents['headline_text'].map(self.preprocess)

    print('Building bag of words corpus')
    dictionary = Dictionary(preprocessed_docs)  # list: token_id, token
    dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
    bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_docs]  # list: token_id, token_count
    print(documents[documents['index'] == 4310].values[0][0])
    print(bow_corpus[4310])
    print(bow_corpus[:100])

    print('Building lda model from bag of words')
    lda_model_bow = LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, workers=self.workers)
    for idx, topic in lda_model_bow.print_topics(-1):
        print('Topic: {} \nWords: {}'.format(idx, topic))
    for index, score in sorted(lda_model_bow[bow_corpus[4310]], key=lambda tup: -1 * tup[1]):
        print("\nScore: {}\t \nTopic: {}".format(score, lda_model_bow.print_topic(index, 10)))

    print('Building tfidf corpus from bag of words corpus')
    tfidf = TfidfModel(bow_corpus)
    tfidf_corpus = tfidf[bow_corpus]
    from pprint import pprint
    for doc in tfidf_corpus:
        pprint(doc)
        break

    print('Building lda model from tfidf')
    lda_model_tfidf = LdaMulticore(tfidf_corpus, num_topics=10, id2word=dictionary, workers=self.workers)
    for idx, topic in lda_model_tfidf.print_topics(-1):
        print('Topic: {} Word: {}'.format(idx, topic))
    for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1 * tup[1]):
        print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))

    print('Testing on unseen document')
    unseen_document = 'Facebook’s global lobbying against data privacy laws'
    bow_vector = dictionary.doc2bow(self.preprocess(unseen_document))
    print('Bow:')
    for index, score in sorted(lda_model_bow[bow_vector], key=lambda tup: -1 * tup[1]):
        print("Score: {}\t Topic: {}".format(score, lda_model_bow.print_topic(index, 5)))
    print('TfIdf:')
    for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1 * tup[1]):
        print("Score: {}\t Topic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))
def train(file=DATA_FILE, type=JSON):
    delete_previous_models()
    faq_df = get_dataframe(os.path.join(DATA_DIR, file), type=type)
    faq_df = clean_data(faq_df)
    faq_df[PROCESSED_QUESTION] = faq_df[CLEAN_QUESTION].apply(preprocess)
    faq_df[PROCESSED_ANSWER] = faq_df[CLEAN_ANSWER].apply(preprocess)
    print('Preprocessing Done')
    if DEBUG:
        print(faq_df.head())
    for mode in modes:
        model = modes[mode]
        dictionary = corpora.Dictionary(faq_df[model.column])
        dictionary.save(os.path.join(MODEL_DIR, model.dictionary))
        corpus = faq_df[model.column].map(dictionary.doc2bow)
        if DEBUG:
            print(f'{model.corpus} generated')
            print(corpus.head())
        corpora.MmCorpus.serialize(os.path.join(MODEL_DIR, model.corpus), corpus)
        tfidf_model = TfidfModel(corpus)
        if DEBUG:
            print(f'{model.tfidf} generated')
        tfidf_model.save(os.path.join(MODEL_DIR, model.tfidf))
        tfidf = tfidf_model[corpus]
        lda_model = LdaMulticore(corpus=tfidf, id2word=dictionary, num_topics=30)
        lda_model.save(os.path.join(MODEL_DIR, model.model))
        if DEBUG:
            print(f'{model.model} generated')
            print(lda_model.print_topics(5))
    print('Training completed')
def load_model(self, phrase):
    processed_phrase = self.preprocessing(phrase)
    self.all_phrases.append(processed_phrase)
    # print(self.all_phrases)
    # dct = Dictionary(common_texts)
    dct = Dictionary(self.all_phrases)
    corpus = [dct.doc2bow(line) for line in self.all_phrases]
    lda_model = LdaMulticore(corpus=corpus, id2word=dct, random_state=100, num_topics=3,
                             passes=10, chunksize=1000, batch=False, alpha="asymmetric",
                             decay=0.5, offset=64, eta=None, eval_every=0, iterations=100,
                             gamma_threshold=0.001, per_word_topics=True)
    topic_keywords = []
    topics = lda_model.print_topics(-1)
    for topic in topics[:3]:
        topics_str = topic[1]
        pattern = r"[^a-zA-Z+]"
        topics_list = re.sub(pattern, "", topics_str).split("+")
        topic_keywords += topics_list[:5]
    return topic_keywords
def updateLDA():
    api_file = "./newsapi.key"
    categories = ['business', 'entertainment', 'general', 'health',
                  'science', 'sports', 'technology']
    with open(api_file, "r") as apikey:
        newsapi = NewsApiClient(api_key=apikey.read().strip())
    headlines = {cat: newsapi.get_top_headlines(category=cat, language='en', country='in')
                 for cat in categories}
    pp_docs = []
    for category in headlines:
        for article in headlines[category]['articles']:
            # print(lemma_pp(article['title']))
            pp_docs.append(lemma_pp(article['title']))
    if os.path.exists(MODEL_DIR + "corpus_dict.model"):
        corp_d = Dictionary.load(MODEL_DIR + "corpus_dict.model")
        corp_d.add_documents(pp_docs)
    else:
        corp_d = Dictionary(pp_docs)
    corp_d.filter_extremes(no_below=2, no_above=0.5)
    dtm = [corp_d.doc2bow(doc) for doc in pp_docs]
    tfidf = TfidfModel(dtm)
    corp_tfidf = tfidf[dtm]
    lda = LdaMulticore(corp_tfidf, num_topics=5, id2word=corp_d, passes=60, workers=3)
    print(lda.print_topics(num_topics=5, num_words=5))
    checkdir(MODEL_DIR)
    corp_d.save(MODEL_DIR + "corpus_dict.model")
    # corp_tfidf.save(MODEL_DIR + "corpus_tfidf.model")
    lda.save(MODEL_DIR + "lda.model")
def extract_topics_from_text(text_tokenized):
    try:
        dictionary = Dictionary(text_tokenized)
        bow_corpus = [dictionary.doc2bow(doc) for doc in text_tokenized]
        lda_model = LdaMulticore(bow_corpus, num_topics=1, random_state=42,
                                 id2word=dictionary, workers=2)
        topics, topics_list = pretty_print_topics(
            lda_model.print_topics(num_topics=1, num_words=5))
    except ValueError:
        return 'No topics extracted'
    return topics
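# A minimal usage sketch for extract_topics_from_text above. The toy documents are
# hypothetical, and pretty_print_topics is assumed to be this project's own helper,
# available in the same module.
sample_docs = [['data', 'privacy', 'law'],
               ['privacy', 'regulation', 'lobbying'],
               ['data', 'law', 'lobbying']]
print(extract_topics_from_text(sample_docs))  # one 5-word topic, or 'No topics extracted'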
def LDA(corpus, dictionary):
    lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, random_state=100,
                             num_topics=250, passes=10, chunksize=1000, batch=False,
                             alpha='asymmetric', decay=0.5, offset=64, eta=None,
                             eval_every=0, iterations=100, gamma_threshold=0.001,
                             per_word_topics=True)
    return lda_model.print_topics(5, num_words=10)
def create_topics(context_groups, num_topics):
    topics = {}
    topics_dist = defaultdict(lambda: {})
    word_counts = defaultdict(lambda: 0)
    for key, citation in context_groups.items():
        try:
            citations = citation.values()
            if type(citations) == str:
                citations = [citations]
            dictionary = Dictionary(citations)
            bow_corpus = [dictionary.doc2bow(doc) for doc in citations]
            lda_model = LdaMulticore(bow_corpus, num_topics=num_topics,
                                     id2word=dictionary, passes=2, workers=2)
            topics[key], topics_list = pretty_print_topics(
                lda_model.print_topics(num_topics=num_topics, num_words=5))
            topics_d = []
            probs = []
            topics_counts = defaultdict(lambda: [])
            for topic in topics_list:
                topic_words = topic.split(', ')
                for word in topic_words:
                    word_counts[word] += 1
            s = 0
            for i in range(len(bow_corpus)):
                pretty_output, probs_, topics_counts = print_topics_by_ids(
                    lda_model[bow_corpus[i]], topics_list,
                    list(citation.keys())[i], topics_counts)
                s += len(pretty_output)
                topics_d.extend(pretty_output)
                probs.extend(probs_)
            topics_counts_ = []
            for key_, value_ in topics_counts.items():
                temp_dict = {}
                temp_dict['topic'] = key_
                temp_dict['number'] = str(len(value_))
                temp_dict['probability_average'] = str(round(np.average(value_), 3))
                temp_dict['probability_std'] = str(round(np.std(value_), 3))
                topics_counts_.append(temp_dict)
            topics_dist[key]['topics'] = sorted(topics_counts_,
                                                key=lambda k: int(k['number']), reverse=True)
            topics_dist[key]['contexts'] = sorted(topics_d,
                                                  key=lambda k: float(k['probability']), reverse=True)
            # visdata = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)
            # pyLDAvis.save_html(visdata, path.join('..', 'data', 'new_vis', '{}_vis.html'.format(key)))
        except ValueError:
            continue
    return dict(topics_dist)
def main():
    """ Main file for Discere """
    if len(sys.argv) < 2 or not isinstance(sys.argv[1], str):
        print("You need to provide a path for a pdf file")
        sys.exit(1)  # bail out here; sys.argv[1] below would otherwise raise IndexError
    path = sys.argv[1]
    raw_pdf = parse_pdf(path)
    segments = segment(raw_pdf)
    processed_docs = clean_segments(segments)
    dictionary = Dictionary(processed_docs)
    bow = [dictionary.doc2bow(doc) for doc in processed_docs]
    lda_model = LdaMulticore(bow, num_topics=20, id2word=dictionary, passes=10, workers=2)
    print(lda_model.print_topics())
def lda3(corpus, dictionary):
    lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, random_state=22,
                             num_topics=100, passes=10, chunksize=1000, batch=False,
                             alpha='asymmetric', decay=0.5, offset=64, eta=None,
                             eval_every=0, iterations=100, gamma_threshold=0.001,
                             per_word_topics=True)
    # save the model
    lda_model.save('lda_model.model')
    # See the topics
    for topic in lda_model.print_topics(100, 20):
        print(topic)
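# Hedged sketch of reloading the model saved by lda3 above and scoring an unseen
# document. The dictionary is assumed to be the same one used for training (lda3
# does not persist it, so in practice it must be saved and loaded separately), and
# the tokens are hypothetical.
from gensim.models import LdaMulticore

lda_model = LdaMulticore.load('lda_model.model')
unseen_bow = dictionary.doc2bow(['example', 'tokens', 'here'])
for topic_id, prob in sorted(lda_model.get_document_topics(unseen_bow),
                             key=lambda t: -t[1]):
    print(topic_id, round(prob, 3))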
# (snippet begins mid-call: the LdaMulticore(...) opening is not shown)
                         num_topics=200, passes=50, chunksize=1000, batch=False,
                         alpha='asymmetric', decay=0.5, offset=64, eta=None,
                         eval_every=0, iterations=100, gamma_threshold=0.001,
                         per_word_topics=True)
corpus_lda = lda_model[bow_corpus]
topics = lda_model.print_topics(5, num_words=10)
for topic in topics:
    print(topic)

speech = ''
decade = []
for i in range(data.shape[0]):
    year = int(data['year'][i])
    speech = speech + " " + data['speech'][i]
    if (year + 1) % 10 == 0:
        decade.append(speech)
        speech = ''
decade.append(speech)
len(decade)
    'parking_lot', 'trail_starts', 'mile_turn', 'north_south', 'mountain_bike',
    'mountain_biking', 'single_track', 'mountain_bike_trail', 'trail_head'
])
second_stopwords = my_stopwords.union(STOPWORDS).union(bitri_stops)

# Gensim LDA
st_featurizer = Featurizer(first_stopwords=first_stopwords,
                           second_stopwords=second_stopwords,
                           bigrams=True, trigrams=True)
processed_docs = st_featurizer.featurize(X)
bow_corpus, id2word = make_gensim_bow(processed_docs, no_below=3, no_above=0.6, keep_n=10000)
k = 6
lda_model = LdaMulticore(bow_corpus, num_topics=k, id2word=id2word,
                         passes=5, workers=2, iterations=100)
perplexity, coherence = get_perplexity_coherence(lda_model, bow_corpus, processed_docs, id2word)
print(f'LDA with {k} topics: Perplexity is {perplexity:0.2} and coherence is {coherence:0.2}.')
pprint(lda_model.print_topics())
with open("temp_corpus.pickle", "rb") as f: comments_corpus, dictionary = np.array(pickle.load(f)) print("created corpus") print('Number of unique tokens: %d' % len(dictionary)) print('Number of documents: %d' % len(comments_corpus)) num_topics = 150 if args.load: model = LdaMulticore.load("topic_models/model_comments") else: model = LdaMulticore(comments_corpus, id2word=dictionary, num_topics=num_topics) print("model done") model.save("topic_models/model_comments") print(model.print_topics(20)) top_topics = model.top_topics(comments_corpus) #, num_words=20) # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics. avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics print('Average topic coherence: %.4f.' % avg_topic_coherence) #from pprint import pprint #pprint(top_topics) for _ in range(10): idx = np.random.randint(0, len(comments_text)) print("comment: {} - topics: {}".format(comments_text[idx], [(model.show_topic(tid, topn=10), v) for tid, v
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

# Train LDA model.
# Set training parameters.
num_topics = 15
chunksize = 20000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make an index-to-word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaMulticore(corpus=corpus, id2word=id2word, chunksize=chunksize,
                     # LdaMulticore does not support alpha='auto' (that is LdaModel-only);
                     # 'asymmetric' is the closest supported prior
                     alpha='asymmetric', eta='auto',
                     iterations=iterations, num_topics=num_topics,
                     passes=passes, eval_every=eval_every, workers=3)
# model = LdaModel(corpus, num_topics=num_topics, id2word=id2word, passes=passes)
model.save('reddit_autism_pass20_topic15_iter400_lda.model')
# hdp = HdpModel(corpus, dictionary)
# hdp.save('reddit_autism_hdp.model')
print(model.print_topics(num_topics=num_topics, num_words=12))
print("Program Ended at: " + str(datetime.now()))
class model:
    def __init__(self, data):
        print('model class instantiating')
        self.__data = data
        self.__modelfilename = 'topicmodel.pkl'

    def createbasemodel(self):
        print('Creating base model')
        # Topics  Alpha       Beta       Coherence
        # 6       asymmetric  symmetric  0.723863804
        self.__model = LdaMulticore(corpus=self.__data.corpus_tfidf,
                                    id2word=self.__data.id2word, num_topics=6,
                                    alpha='asymmetric', eta='symmetric', workers=2,
                                    random_state=100, chunksize=100, passes=10,
                                    per_word_topics=True)
        print(self.__model.print_topics())
        print(self.__model[self.__data.gensim_bow])
        print('calculating coherence')
        __cohe_model = CoherenceModel(model=self.__model,
                                      texts=self.__data.processeddata,
                                      dictionary=self.__data.id2word,
                                      coherence='c_v')
        __cohe = __cohe_model.get_coherence()
        print('coherence :', __cohe)
        # print('hyper param tuning')
        # self.__hyperparamtunning()
        print('saving model')
        self.__savemodel()

    def __savemodel(self):
        with open(self.__modelfilename, 'wb') as file:
            pickle.dump(self.__model, file)

    def __getcoh(self, corpus, dictionary, k, a, b):
        __model = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=k,
                               alpha=a, eta=b, random_state=100, chunksize=100,
                               passes=10, per_word_topics=True)
        __cohe_model = CoherenceModel(model=__model,
                                      texts=self.__data.processeddata,
                                      dictionary=dictionary, coherence='c_v')
        return __cohe_model.get_coherence()

    def __hyperparamtunning(self):
        print('hyper param tuning')
        topics_range = list(np.arange(2, 10, 1))
        alpha_range = list(np.arange(0.01, 1, 0.3))
        alpha_range.extend(['symmetric', 'asymmetric'])
        beta_range = list(np.arange(0.01, 1, 0.3))
        beta_range.extend(['symmetric'])
        noofdocs = len(self.__data.processeddata)
        corpus = self.__data.corpus_tfidf
        print('no of docs : ', noofdocs)
        print('dividing corpus 0.25, 0.5, 0.75, 1 shares for testing ')
        corpus_sets = [
            # gensim.utils.ClippedCorpus(corpus, noofdocs * 0.25),
            # gensim.utils.ClippedCorpus(corpus, noofdocs * 0.5),
            # gensim.utils.ClippedCorpus(corpus, noofdocs * 0.75),
            corpus
        ]
        corpus_title = ['100% corpus']
        model_results = {'Validation_Set': [], 'Topics': [], 'Alpha': [],
                         'Beta': [], 'Coherence': []}
        pbar = tqdm.tqdm(total=540)
        for i in range(len(corpus_sets)):
            for k in topics_range:
                for a in alpha_range:
                    for b in beta_range:
                        cv = self.__getcoh(corpus_sets[i], self.__data.id2word, k, a, b)
                        model_results['Validation_Set'].append(corpus_title[i])
                        model_results['Topics'].append(k)
                        model_results['Alpha'].append(a)
                        model_results['Beta'].append(b)
                        model_results['Coherence'].append(cv)
                        pbar.update(1)
        results = pd.DataFrame(model_results)
        results.to_csv('lda_tuning_results.csv', index=False)
        print(results)
        pbar.close()
###############################################################################
### LDA Code
###############################################################################
# LDA using bag of words
dictionary = corpora.Dictionary(processed_text)
corpus = [dictionary.doc2bow(doc) for doc in processed_text]
ldamodel = LdaMulticore(corpus, num_topics=3, id2word=dictionary, passes=2, workers=2)
for idx, topic in ldamodel.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

# LDA using TFIDF
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
ldamodel = LdaMulticore(corpus_tfidf, num_topics=3, id2word=dictionary, passes=2, workers=2)
for idx, topic in ldamodel.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))
###############################################################################
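# Hedged sketch: comparing the BoW and TFIDF models above quantitatively instead of
# by eye. The script reuses the name `ldamodel` for both, so the two names below
# (ldamodel_bow, ldamodel_tfidf) are hypothetical stand-ins for keeping both models
# around; processed_text is the same tokenized corpus used to build the dictionary.
from gensim.models import CoherenceModel

for name, m in [('bow', ldamodel_bow), ('tfidf', ldamodel_tfidf)]:
    cm = CoherenceModel(model=m, texts=processed_text,
                        dictionary=dictionary, coherence='c_v')
    print(name, 'coherence:', round(cm.get_coherence(), 4))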
import numpy as np
from gensim import corpora  # needed for MmCorpus / Dictionary below
from gensim.models import LdaMulticore as LdaModel
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train LDA model')
    parser.add_argument('tweet_file',
                        help='path to twitter downloader dump where each line is a cleaned tweet')
    parser.add_argument('out_dir', help='path output file to save the model')
    parser.add_argument('--include_list', nargs='?', default=None)  # see preprocess for default list
    parser.add_argument('--exclude_list', nargs='?', default=None)  # see preprocess for default list
    parser.add_argument('num_topics', type=int)
    parser.add_argument('--npasses', type=int, default=50)
    parser.add_argument('--decay', type=float, default=.5)
    parser.add_argument('--chunksize', type=int, default=2000)
    lda_filename = 'lda/middle_east_100.lda'
    args = parser.parse_args()  # note: the parsed args are never used below; the training call hard-codes its parameters
    corpus = corpora.MmCorpus('lda/lda_middle_east.mm')
    dictionary = corpora.Dictionary.load('lda/lda_middle_east.dict')
    lda = LdaModel(corpus, num_topics=100, alpha=1. / 100, eta=.2, chunksize=10000,
                   workers=5, passes=100, decay=0.75, id2word=dictionary)
    print('Saving model')
    lda.print_topics()
    lda.save(lda_filename)
    print("lda saved in %s " % lda_filename)
class ModelTraining(NewsPipeline):
    def __init__(self):
        super().__init__()
        self.__modelfilename = 'BaseModel.pkl'
        print('ModelTraining instantiated')

    @mlflowtimed
    def _process(self, features):
        self.__config = PipelineConfig.getPipelineConfig(self)
        self.id2word, self.gensim_bow, self.tfidfmodel, self.processeddata = (
            features[0], features[1], features[2], features[3])
        self.corpus_tfidf = self.tfidfmodel[self.gensim_bow]
        self.__createbasemodel()
        self._storeMLflowData()
        return features

    def __createbasemodel(self):
        print('Creating base model')
        # Topics  Alpha       Beta       Coherence
        # 6       asymmetric  symmetric  0.723863804
        self.__model = LdaMulticore(corpus=self.corpus_tfidf, id2word=self.id2word,
                                    num_topics=6, alpha='asymmetric', eta='symmetric',
                                    workers=2, random_state=100, chunksize=100,
                                    passes=10, per_word_topics=True)
        if self.__config['Storemodel']:
            self.__savemodel()
        print(self.__model.print_topics())
        print(self.__model[self.gensim_bow])
        print('calculating coherence')
        # __cohe_model = CoherenceModel(model=self.__model, texts=self.processeddata,
        #                               dictionary=self.id2word, coherence='c_v')
        __cohe_model = CoherenceModel(model=self.__model, corpus=self.corpus_tfidf,
                                      coherence='u_mass')
        __cohe = __cohe_model.get_coherence()
        print('coherence :', __cohe)
        self._addMLflowMetric('BaseModel.Coherence', __cohe)
        return self.__model

    def fit(self, x, y=None):
        print('ModelTraining.fit')
        return self

    def transform(self, x):
        print('ModelTraining.transform')
        return self._process(x)

    def __savemodel(self):
        print('storing topic model')
        try:
            today = datetime.today().strftime('%Y-%m-%d')
            topicmodelfile = os.path.join(DATA_PATH, today, self.__modelfilename)
            if os.path.isfile(topicmodelfile):
                os.remove(topicmodelfile)
            with open(topicmodelfile, 'wb') as plkfile:
                pickle.dump(self.__model, plkfile)
            return True
        except Exception as ex:
            print(ex)
            return False
class ReviewLDA():
    def __init__(self):
        self.__tokenizer_type = None
        self.dataset = None
        self.context = []
        self.vocab_dict = None
        self.lda_model = None
        self.regex = re.compile('[^ 가-힣]+')

    def __check_dataset(self, dataset):
        if type(dataset) != pd.core.frame.DataFrame:
            raise TypeError('Check the data format (pandas dataframe expected)')
        if '리뷰' not in dataset.columns:
            raise ValueError('The data has no "리뷰" (review) column.')

    def load_pkl(self, dataset_path):
        """Read data in pickle format."""
        with open(dataset_path, 'rb') as f:
            dataset = pickle.load(f)
        self.__check_dataset(dataset)
        self.dataset = dataset['리뷰'].values

    def load_excel(self, dataset_path):
        dataset = pd.read_excel(dataset_path)
        self.__check_dataset(dataset)
        self.dataset = dataset['리뷰'].values

    def load_csv(self, dataset_path):
        dataset = pd.read_csv(dataset_path)
        self.__check_dataset(dataset)
        self.dataset = dataset['리뷰'].values

    def load_tokenizer(self, method, spm_path=None):
        """method(str) : spm, mecab, okt
        spm_path(str) : path to the spm model, if method is spm"""
        if method not in ['spm', 'mecab', 'okt']:
            raise ValueError('Invalid method given')
        self.__tokenizer_type = method
        if method == 'spm':
            if not spm_path:
                raise ValueError('spm_path does not exist.')
            self.tokenizer = spm.SentencePieceProcessor()
            self.tokenizer.load(spm_path)
        elif method == 'okt':
            self.tokenizer = Okt()
        else:
            self.tokenizer = Mecab()

    def __tokenize(self, text):
        text = self.regex.sub('', text)
        if not self.__tokenizer_type:
            raise ValueError('Load a tokenizer first')
        if self.__tokenizer_type == 'spm':
            return [repeat_normalize(token.replace("▁", ""))
                    for token in self.tokenizer.EncodeAsPieces(text)]
        elif self.__tokenizer_type == 'okt':
            tag_list = ['Noun', 'Verb', 'Adjective', 'Adverb']
            return [repeat_normalize(token)
                    for token, pos in self.tokenizer.pos(text) if pos in tag_list]
        else:
            return [repeat_normalize(token) for token in self.tokenizer.nouns(text)]

    def kor_preprocess(self):
        for review in tqdm(self.dataset):
            self.context.append(self.__tokenize(review))

    def make_ngram(self, n, min_count=10):
        result = list()
        bigram = Phrases(self.context, min_count=min_count, threshold=10)
        if n == 2:
            for doc in tqdm(self.context):
                result.append(bigram[doc])
        elif n == 3:
            trigram = Phrases(bigram[self.context])
            for doc in tqdm(self.context):
                result.append(trigram[doc])
        else:
            raise ValueError('n-gram value too large (use 2 or 3)')
        self.context = result

    def make_vocab(self):
        count_dict = Counter()
        for review in tqdm(self.context):
            count_dict.update(Counter(review))
        self.vocab_dict = dict(
            sorted(count_dict.items(), key=(lambda x: x[1]), reverse=True))

    def get_vocab(self, reverse=True):
        if reverse:
            return self.vocab_dict
        else:
            return dict(
                sorted(self.vocab_dict.items(), key=(lambda x: x[1]), reverse=False))

    def filter_vocab(self, min_count=0, max_count=None):
        """min_count (int) : only keep words that appear more than this many times
        max_count (int) : only keep words that appear fewer than this many times"""
        if not self.vocab_dict:
            raise ValueError("Build the vocab first (make_vocab method)")
        if not max_count:
            max_count = len(self.context)
        self.vocab_dict = dict([(k, v) for k, v in self.vocab_dict.items()
                                if v > min_count if v < max_count])

    def do_lda(self, num_topics, workers=8, iterations=400, passes=15):
        self.id2word = dict([(i, k) for i, k in enumerate(self.vocab_dict.keys())])
        self.word2id = dict([(k, i) for i, k in enumerate(self.vocab_dict.keys())])
        self.corpus = list()
        for review in tqdm(self.context):
            self.corpus.append(self.__get_doc2bow(review))
        print('Fitting Start.')
        self.lda_model = LdaMulticore(corpus=self.corpus, num_topics=num_topics,
                                      id2word=Dictionary().from_corpus(self.corpus,
                                                                       self.id2word),
                                      workers=workers,
                                      iterations=iterations, passes=passes)
        print('Model Fitted.')

    def print_lda(self):
        if not self.lda_model:
            raise ValueError('Train the model first')
        pprint.pprint(self.lda_model.print_topics())

    def __get_doc2bow(self, review):
        counter = dict()
        for token in review:
            if token in self.word2id:  # membership test, so word id 0 is not dropped
                # start from 0, so the first occurrence counts as 1 (not 2)
                counter[self.word2id[token]] = counter.get(self.word2id[token], 0) + 1
        return list(counter.items())
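# Hedged usage sketch for ReviewLDA above; the CSV path is hypothetical and 'mecab'
# assumes a Korean MeCab installation. The call order mirrors the class's own API.
review_lda = ReviewLDA()
review_lda.load_csv('reviews.csv')  # hypothetical file with a '리뷰' (review) column
review_lda.load_tokenizer('mecab')
review_lda.kor_preprocess()
review_lda.make_vocab()
review_lda.filter_vocab(min_count=5)
review_lda.do_lda(num_topics=10, workers=4)
review_lda.print_lda()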
class LDAMWBase:
    def __init__(self, mtype='multiple', resource=None, lda_work_folder=None,
                 lda_model_filename=None, lda_dict_filename=None,
                 lda_topic_word_count=0, lda_topics_count=0,
                 resource_language=None, data_type=None):
        # # todo Deutsch Lemmatizer / Stemmer !!!
        # self.p_stemmer = PorterStemmer()
        self.wn_lemmatizer = WordNetLemmatizer()
        if resource is not None:
            # resource_lang == 'en' as default
            resource_lang = 'en'
            # hope that resource is correct and exists
            if data_type == 'db':
                resource_lang = Resources.select(Resources.lang).where(
                    Resources.resource == resource).get()
                resource_lang = resource_lang.__data__['lang'].lower()
            elif data_type == 'csv':
                if resource_language is None:
                    raise Exception("Resource language must be defined for csv data type.")
                else:
                    resource_lang = resource_language
            else:
                pass
            self.stop_words = get_stop_words(resource_lang)
        self.resource_identifier_name = resource

        def _create_model_deps(model_name, twordscount, tcount, mini=False, mini_path=None):
            if not mini:
                mp = DEFAULT_PROJECT_PATH + 'topics_extractor/lda_data' + '/' + model_name
            else:
                mp = DEFAULT_PROJECT_PATH + 'topics_extractor/lda_data' + '/' + mini_path
            mn = 'lda_model' + '_' + model_name
            md = 'dictionary' + '_' + model_name
            ltwordscount = twordscount
            ltcount = tcount
            _short_model_report = "{}{}: {} \n{}{}: {}\n{}{}: {}\n{}{}: {}\n{}{}: {}\n{}".format(
                INFO_FLAG, colored("Model path", 'red', None, ['bold']), mp,
                INFO_FLAG, colored("Model name", 'red', None, ['bold']), mn,
                INFO_FLAG, colored("Model dictionary", 'red', None, ['bold']), md,
                INFO_FLAG, colored("Topic words count", 'red', None, ['bold']), ltwordscount,
                INFO_FLAG, colored("Topics count", 'red', None, ['bold']), ltcount,
                "-" * 88)
            if model_name != 'mini':
                print(_short_model_report)
            return mp, mn, md, ltwordscount, ltcount

        if mtype == 'multiple':
            if resource is not None:
                mpath, mname, mdict, lda_topic_word_count, lda_topics_count = _create_model_deps(
                    self.resource_identifier_name, LDA_TOPIC_WORD_COUNT, LDA_TOPICS_COUNT)
            else:
                raise Exception("{}Resource must be defined. Exiting... \n".format(EXCEPTION_FLAG))
        elif mtype == 'single_ltc':
            mpath, mname, mdict, lda_topic_word_count, lda_topics_count = _create_model_deps(
                "mini", MINI_LDA_TOPIC_WORD_COUNT, MINI_LDA_TOPICS_COUNT,
                mini=True, mini_path=self.resource_identifier_name + "/mini")

        if lda_work_folder is None:
            self.lda_work_folder = mpath
        else:
            self.lda_work_folder = lda_work_folder
        if not os.path.exists(self.lda_work_folder):
            os.mkdir(self.lda_work_folder)
        if lda_model_filename is None:
            self.lda_model_filename = os.path.join(self.lda_work_folder, mname)
        else:
            self.lda_model_filename = os.path.join(self.lda_work_folder, lda_model_filename)
        if lda_dict_filename is None:
            self.lda_dict_filename = os.path.join(self.lda_work_folder, mdict)
        else:
            self.lda_dict_filename = os.path.join(self.lda_work_folder, lda_dict_filename)
        self.lda_topics_count = lda_topics_count
        self.lda_topic_word_count = lda_topic_word_count
        self.dictionary = None
        self.lda_model = None
        self.lda_topics = []

    @staticmethod
    def load_csv_data(csv_file):
        df = pd.read_csv(csv_file)
        train_documents = df['content'].values
        return train_documents

    @staticmethod
    def load_single_ltc(ltc_data):
        train_documents = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', ltc_data)
        return train_documents

    @staticmethod
    def load_db_data(resource=None):
        # if resource is None:
        #     art_content_stream = Articles.select()
        # else:
        art_content_stream = Articles.select().where(Articles.resource == resource)
        train_documents = (acs.content for acs in art_content_stream
                           if acs.content is not None)
        return train_documents

    def save_model(self, as_name=None, save_on_disk=True, save_topics_into_db=False):
        if save_on_disk:
            print(" \t-> Model was saved as [ {} ]".format(as_name))
            if as_name is not None:
                self.lda_model.save(as_name)
            else:
                self.save_model(self.lda_model_filename)
        if save_topics_into_db:
            truncate_topics_tables(resource=self.resource_identifier_name)
            print(" \t-> Topics will be saved in database for [ {} ]".format(
                self.resource_identifier_name))
            model_numbers_topics = self._get_topics()
            try:
                for topic_info in model_numbers_topics:
                    tnum = topic_info[0]
                    tresourceid = topic_info[1]
                    tname = topic_info[2]
                    _topic = {'ident_number': tnum, 'value': tname,
                              'created_at': dt.datetime.today().date()}
                    t = Topics.create(**_topic)
                    t_id = t.__data__['topic']
                    _topic_resource = {'resource': tresourceid, 'topic': t_id,
                                       'created_at': dt.datetime.today().date()}
                    tr = TopicsResources.create(**_topic_resource)
                print("{}[ {} ]".format(SUCCESS_FLAG, self.resource_identifier_name))
            except Exception as e:
                print("{}{}".format(EXCEPTION_FLAG, e))
                print("{}Failure: [ {} ]".format(ERROR_FLAG, self.resource_identifier_name))

    def train_model(self, data_type, resource, single_ltc_data=None, data_file_path=None,
                    train_corpus=None, train_dictionary=None, save_model_as=None,
                    chunksize=LDA_CHUNKSIZE, passes=LDA_PASSES):
        if train_corpus is not None:
            corpus = train_corpus
        elif data_type == 'db':
            corpus = self._make_corpus(data_type=data_type, resource=resource)
        elif data_type == 'single_ltc' and single_ltc_data is not None:
            corpus = self._make_corpus(data_type=data_type, ltc=single_ltc_data,
                                       resource=resource)
        elif data_type == 'csv' and data_file_path is not None:
            corpus = self._make_corpus(data_type=data_type, data_file_path=data_file_path,
                                       resource=resource)
        else:
            raise Exception("{}Corpus is None".format(EXCEPTION_FLAG))
        if train_dictionary is not None:
            dictionary = train_dictionary
        else:
            dictionary = self.dictionary
        # the id2word parameter is needed to get words in topics instead of their
        # indexes in the dict
        _tcount = self.lda_topics_count
        # self.lda_model = LdaModel(corpus=corpus, num_topics=_tcount, id2word=dictionary,
        #                           passes=passes, chunksize=chunksize)
        self.lda_model = LdaMulticore(corpus=corpus, num_topics=_tcount, id2word=dictionary,
                                      passes=passes, chunksize=chunksize)
        if save_model_as is not None and not single_ltc_data:
            self.save_model(save_model_as, save_on_disk=True, save_topics_into_db=False)
        elif single_ltc_data:
            self.save_model(self.lda_model_filename, save_on_disk=True,
                            save_topics_into_db=False)
        elif data_type == 'csv':
            self.save_model(self.lda_model_filename, save_on_disk=True,
                            save_topics_into_db=False)
        else:
            self.save_model(self.lda_model_filename, save_on_disk=True,
                            save_topics_into_db=True)
        print("{}Trained".format(SUCCESS_FLAG))

    def load_model(self, model_file_path=None, dict_file_path=None):
        """ load model and dictionary from file (they must be saved in the train function);
        used to update the model on another corpus """
        if model_file_path is not None and os.path.exists(model_file_path):
            self.lda_model = LdaMulticore.load(model_file_path)
            # self.lda_model = LdaModel.load(model_file_path)
            self.dictionary = Dictionary.load(dict_file_path)
            print(" \t-> Loaded: [ {} ]".format(model_file_path))
        elif model_file_path is None and os.path.exists(self.lda_model_filename):
            self.lda_model = LdaMulticore.load(self.lda_model_filename)
            # self.lda_model = LdaModel.load(self.lda_model_filename)
            self.dictionary = Dictionary.load(self.lda_dict_filename)
            print(" \t-> Loaded: [ {} ]".format(self.lda_model_filename))
        else:
            print("{}Filepath you gave is incorrect. \n Give another one and retry."
                  "\n Exiting...".format(ERROR_FLAG))
            exit()
        for i in range(self.lda_model.num_topics):
            terms_id = self.lda_model.get_topic_terms(i, self.lda_topic_word_count)
            terms = [self.dictionary.get(x[0]) for x in terms_id]
            self.lda_topics.append(' '.join(terms))

    def update_model(self, ondata_file_path=None, resource=None, data_type='db'):
        if ondata_file_path is not None and data_type == 'csv':
            corpus = self._make_corpus(data_file_path=ondata_file_path,
                                       data_type=data_type, resource=resource)
        elif data_type == 'db':
            corpus = self._make_corpus(data_file_path=None, data_type=data_type,
                                       resource=resource)
        else:
            raise Exception("{}Corpus is None".format(EXCEPTION_FLAG))
        self.lda_model.update(corpus)

    def process_record(self, text, data_type):
        """ data_type - db / csv / single_ltc """
        if data_type == 'single_ltc':
            try:
                self.load_model()
            except Exception as e:
                print("{}{}".format(EXCEPTION_FLAG, e))
        elif self.lda_model is None:
            try:
                self.load_model()
            except Exception as e:
                print("{}{}".format(EXCEPTION_FLAG, e))
        if data_type == 'db':
            if self.lda_model is None:
                return dict()
            doc = self._prepare_single_document(text)
            if doc is not None:
                topics = self._get_document_topics(doc)
                top_topic = topics[0]
                return [('topic', self.lda_topics[top_topic])]
            return [('topic', "")]
        elif data_type == 'csv':
            doc = self._prepare_single_document(text)
            topics_in_count_by_ids = self._get_document_topics(doc)
            current_doc_topic_id, current_doc_other_topics = (
                topics_in_count_by_ids[0], topics_in_count_by_ids[1:])
            result_topic_word_descr = re.sub(
                '[^A-Za-z]+', ' ', self._get_topic_by_id(current_doc_topic_id))
            return [('topic', result_topic_word_descr),
                    ('other_topics', current_doc_other_topics)]
        elif data_type == 'single_ltc':
            doc = self._prepare_single_document(text)
            topics_in_count_by_ids = self._get_document_topics(doc)
            if topics_in_count_by_ids is not None:
                current_doc_topic_id, current_doc_other_topics = (
                    topics_in_count_by_ids[0], topics_in_count_by_ids[1:])
                result_topic_word_descr = re.sub(
                    '[^A-Za-z]+', ' ', self._get_topic_by_id(current_doc_topic_id))
                return result_topic_word_descr, current_doc_other_topics
            else:
                return "", []

    def _get_metric_fields(self):
        if self.lda_model is None:
            return []
        else:
            return ['topic']

    def _get_document_topics(self, doc, count=5):
        if doc is not None:
            bow = self.dictionary.doc2bow(doc)
            topics = self.lda_model.get_document_topics(bow, minimum_probability=0.0)
            topics_in_count = list(ident_number for (ident_number, prob) in sorted(
                topics, key=itemgetter(1), reverse=True)[:count])
            return topics_in_count

    def _get_document_topic(self, doc_topics):
        topic_id_probs = {}
        for t_prob in doc_topics:
            topic_id_probs[t_prob[0]] = t_prob[1]
        doc_topic_id = sorted(topic_id_probs, key=topic_id_probs.get, reverse=True)[0]
        doc_topic_prob = topic_id_probs[doc_topic_id]
        return [doc_topic_id, doc_topic_prob]

    def _prepare_single_document(self, sd):
        # a bare float here is a NaN cell; np.float is deprecated in modern numpy
        if sd is None or isinstance(sd, float):
            return None
        try:
            sd = sd.lower()
            sd = nltk.tokenize.word_tokenize(sd)
            sd = (word for word in sd if word.isalpha() and len(word) > 2)
            stopped_sd = (word for word in sd if word not in self.stop_words)
            lemmatized_doc = [self.wn_lemmatizer.lemmatize(word) for word in stopped_sd]
            return lemmatized_doc
        except AttributeError as e:
            print("{}{}".format(EXCEPTION_FLAG, e))
            return None

    def _make_bow(self, text):
        if text is not None:
            d = self._prepare_single_document(text)
            return self.dictionary.doc2bow(d)

    def _make_corpus(self, data_type, resource, data_file_path=None, save_train_dict=True,
                     save_dict_as=None, ltc=None):
        """ data type can be csv or db # or new - single_ltc """
        if data_type == 'db':
            documents = self.load_db_data(resource=resource)
        elif data_type == 'csv' and data_file_path is not None:
            documents = self.load_csv_data(data_file_path)
        elif data_type == 'single_ltc' and ltc is not None:
            ltc_text = " ".join(e if type(e) is str else "" for e in ltc)
            documents = self.load_single_ltc(ltc_text)
        else:
            documents = None
            print("{}documents is None. Exiting ... \n".format(ERROR_FLAG))
            exit()
        with Pool() as pool:
            processed_docs = pool.imap(self._prepare_single_document, documents)
            pool.close()
            pool.join()
            # materialize the results: the original generator was exhausted by the
            # Dictionary() call below, leaving the doc2bow comprehension with nothing
            processed_docs = [i for i in processed_docs if i is not None]
        self.dictionary = Dictionary(processed_docs)
        if save_train_dict and save_dict_as is None:
            self.dictionary.save(self.lda_dict_filename)
        else:
            self.dictionary.save(save_dict_as)
        corpus = [self.dictionary.doc2bow(proc_doc) for proc_doc in processed_docs]
        return corpus

    def _get_topic_by_id(self, topic_id):
        if self.lda_topic_word_count is not None:
            return self.lda_model.print_topic(topic_id, self.lda_topic_word_count)
        else:
            return self.lda_model.print_topic(topic_id, 6)

    def _get_topics(self, default_view=False, for_db=True):
        """ 2-tuples (probability * word) of most probable words in topics
        num_topics=-1 <--- to print all topics """
        def _get_words(probabilities_words_string):
            _pre_topic_with_digits_trash = " ".join(
                re.findall(ALL_CHARS, probabilities_words_string))
            probaply_clean_topic = re.sub(r'\b\d+(?:\.\d+)?\s+', "",
                                          _pre_topic_with_digits_trash)
            return probaply_clean_topic
            # " ".join(re.findall('[a-zA-Z]+', probabilities_words_string))

        if default_view:
            return self.lda_model.print_topics(num_topics=-1)
        if for_db:
            resource_id = Resources.select().where(
                Resources.resource == self.resource_identifier_name).first()
            resource_id = resource_id.__data__['resource']
            return [(elem[0], resource_id, _get_words(elem[1]))
                    for elem in self.lda_model.print_topics(
                        num_topics=self.lda_topics_count,
                        num_words=self.lda_topic_word_count)]
        return [(elem[0], _get_words(elem[1]))
                for elem in self.lda_model.print_topics(
                    num_topics=self.lda_topics_count,
                    num_words=self.lda_topic_word_count)]
passes = random.randint(100, 120)
eval_every = None
seed = np.random.randint(0, 999999)
print("Seed:", seed, "\n")
ldaModel = LdaMulticore(corpus, num_topics=numberOfTopics, id2word=dictionary,
                        passes=passes, alpha='asymmetric', eval_every=eval_every,
                        workers=3, random_state=seed)

# Check resulting topics.
listOfTopics = ldaModel.print_topics(num_topics=numberOfTopics, num_words=15)
for index, i in enumerate(listOfTopics):
    string = str(i[1])
    for c in "0123456789+*\".":
        string = string.replace(c, "")
    string = string.replace("  ", " ")
    print(string)

# calculate & display perplexity
# a measure of how good the model is; the lower the better
print('\nPerplexity: ', ldaModel.log_perplexity(corpus))

# calculate & display coherence
coherenceModel = CoherenceModel(model=ldaModel, texts=document,
                                dictionary=dictionary, coherence='c_v')
                        eta=None)  # LDA params for topic-words (all = 0.1)
'''
: param corpus     : corpus to perform the LDA on
: param num_topics : assumed number of topics present in the corpus
: param id2word    : dictionary mapping word ids (int) to actual words (str)
: param alpha      : list of parameters for Dirichlet Distribution of topics per document
                     --> # of parameters = num_topics (# of topics)
                     --> if 'symmetric', all parameters = 0.1
: param eta        : list of parameters for Dirichlet Distribution of words per topic
                     --> # of parameters = len(id2word) (# of unique words)
                     --> if not specified, all parameters = 0.1
'''

############### 6. PRINT OUT DETECTED TOPICS & ASSOCIATED WORDS ###############
# The following prints out words occurring in each of the 10 topics & their relative weight
for i, topic in BOW_lda_model.print_topics(-1):
    print("Topic {}: \n{}\n".format(i, topic))

############### 7. PREDICT A TOPIC CLASS FOR A SAMPLE DOCUMENT ################
# Use BOW_lda_model to predict which topic this document belongs to:
sample_doc_i = 827
for i, score in sorted(BOW_lda_model[BOW_corpus[sample_doc_i]], key=lambda tup: -1 * tup[1]):
    print("\nScore: {}\nTopic: {}".format(score, BOW_lda_model.print_topic(i, 10)))

################# 8. PREDICT A TOPIC CLASS FOR A NEW DOCUMENT #################
# Use BOW_lda_model to predict which topic a new document belongs to:
new_doc = "Syria gets terrorist attack kills 22 people"
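# Hedged continuation of step 8 above: the snippet stops after defining new_doc, so
# this sketch shows one plausible way to score it. The `preprocess` tokenizer and the
# `BOW_dictionary` name are assumptions standing in for whatever this script actually
# uses to build its bag-of-words corpus.
new_doc_bow = BOW_dictionary.doc2bow(preprocess(new_doc))
for i, score in sorted(BOW_lda_model[new_doc_bow], key=lambda tup: -1 * tup[1]):
    print("\nScore: {}\nTopic: {}".format(score, BOW_lda_model.print_topic(i, 5)))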
def lda(self, cat_list: list, below: int = 100, above: float = 0.1, eta: float = 0.9):
    assert set(cat_list).issubset(set(self.table.category.unique()))
    df_topic2 = self.table[self.table.category.isin(cat_list)].reset_index().iloc[:, 1:]
    instances = df_topic2.clean_text.apply(str.split)
    d = Dictionary(instances)
    print("Dictionary is:", d)
    d.filter_extremes(no_below=below, no_above=above)
    print("Dictionary after filtering:", d)
    ldacorpus = [d.doc2bow(text) for text in instances]
    tfidfmodel = TfidfModel(ldacorpus)
    model_corpus = tfidfmodel[ldacorpus]
    num_topics = len(df_topic2.groupby(['category']).count())
    temp = df_topic2.groupby(['category']).count()
    prior_probabilities = temp["app"] / temp["app"].sum()
    alpha = prior_probabilities.values
    print("Prior probabilities of the topics -alpha- are:", alpha)
    num_passes = 10
    chunk_size = int(len(model_corpus) * num_passes / 200)  # ensure an int chunksize
    print("Preliminary steps to prepare the model done")
    model = LdaMulticore(
        num_topics=num_topics,    # number of topics
        corpus=model_corpus,      # what to train on
        id2word=d,                # mapping from IDs to words
        workers=min(10, multiprocessing.cpu_count() - 1),  # 10 cores, or whatever the machine has
        passes=num_passes,        # make this many passes over the data
        chunksize=chunk_size,     # update after this many instances
        alpha=alpha, eta=eta, random_state=5)
    print("Model is ready")
    topic_corpus = model[model_corpus]
    topic_sep = re.compile(r"0\.[0-9]{3}\*")
    model_topics = [(topic_no, re.sub(topic_sep, '', model_topic).split(' + '))
                    for topic_no, model_topic in model.print_topics(
                        num_topics=num_topics, num_words=5)]
    descriptors = []
    for i, m in model_topics:
        print(i + 1, ", ".join(m[:3]))
        descriptors.append(", ".join(m[:2]).replace('"', ''))
    print(descriptors)
    scores = [[t[1] for t in topic_corpus[entry]] for entry in range(len(instances))]
    topic_distros = pd.DataFrame(data=scores, columns=descriptors)
    topic_distros['category'] = df_topic2['category']
    # %matplotlib inline
    print("Preparing graph")
    sns.set_context('poster')
    fig, ax = plt.subplots(figsize=(20, 10))
    aggregate_by_category = topic_distros.groupby(topic_distros.category).mean()
    aggregate_by_category[descriptors].plot.bar(ax=ax)
    fig.set_size_inches(30, 30)
    plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), prop={'size': 25})
def train_LDA_model(data, num_topics, CPUs):
    # Pre-processing
    sentences = [nltk.tokenize.sent_tokenize(doc) for doc in data]
    sentences = [val for sublist in sentences for val in sublist]
    data_words = list(sent_to_words(sentences))
    # Remove Stop Words
    data_words_nostops = remove_stopwords(data_words)
    # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
    # python3 -m spacy download en
    nlp = spacy.load('en', disable=['parser', 'ner'])
    # Do lemmatization keeping only noun, adj, vb, adv
    data_lemmatized = lemmatization(data_words_nostops,
                                    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    # Create Dictionary
    id2word = corpora.Dictionary(data_lemmatized)
    # Create Corpus
    texts = data_lemmatized
    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]

    # ## Train LDA Model
    # Build LDA model
    lda_model = LdaMulticore(corpus=corpus, id2word=id2word, num_topics=num_topics,
                             random_state=50, chunksize=100, passes=10,
                             per_word_topics=True, workers=CPUs)
    model_dest = lda_data_dir + 'LDA_model/all_years_2007_2017/lda_model_all_years.model'
    lda_model.save(model_dest)
    # Print the Keyword in the 10 topics
    pprint(lda_model.print_topics())
    doc_lda = lda_model[corpus]

    # Visualize the topics
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    storage_dest_lda_html = (lda_data_dir +
                             'LDA_model/all_years_2007_2017/all_years_2007_2017_local_lda.html')
    pyLDAvis.save_html(vis, storage_dest_lda_html)

    wordcloud_dest = lda_data_dir + 'LDA_model/all_years_2007_2017/wordclouds/'  # set location on server
    for t in range(lda_model.num_topics):
        plt.figure()
        dictionary = {}
        plt.imshow(WordCloud().fit_words(Convert(lda_model.show_topic(t, 30), dictionary)))
        plt.axis("off")
        plt.title("Topic_" + str(t))
        # save before show: calling plt.show() first leaves an empty figure on disk
        plt.savefig(wordcloud_dest + "Topic #" + str(t) + '.png')
        plt.show()
    return lda_model
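# Hedged usage sketch for train_LDA_model above; `docs` is a hypothetical list of raw
# documents, and lda_data_dir plus the helper functions (sent_to_words,
# remove_stopwords, lemmatization, Convert) are assumed to be defined at module level.
docs = ["First document text about one subject.",
        "Second document text about another subject."]
lda_model = train_LDA_model(docs, num_topics=10, CPUs=4)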
# (snippet begins mid-call: the LdaMulticore(...) opening is not shown)
                         chunksize=1000, batch=False, alpha='asymmetric',
                         decay=0.5, offset=64, eta=None, eval_every=0,
                         iterations=100, gamma_threshold=0.001, per_word_topics=True)
# save the model
lda_model.save('tmp/lda_model.model')
# See the topics
lda_model.print_topics(-1)
for c in lda_model[corpus[5:8]]:
    print("Document Topics      : ", c[0])      # [(Topics, Perc Contrib)]
    print("Word id, Topics      : ", c[1][:3])  # [(Word id, [Topics])]
    print("Phi Values (word id) : ", c[2][:2])  # [(Word id, [(Topic, Phi Value)])]
    print("Word, Topics         : ", [(dct[wd], topic) for wd, topic in c[1][:2]])  # [(Word, [Topics])]
    print("Phi Values (word)    : ", [(dct[wd], topic) for wd, topic in c[2][:2]])  # [(Word, [(Topic, Phi Value)])]
    print("------------------------------------------------------\n")
train_vecs = []
class TopicModel:
    """
    Create a topic model.

    Filtering parts of speech is currently done using tools.Farasa.
    """

    def __init__(self,
                 pos_to_use: List[str],
                 stop_words: Union[Set[str], List[str], str],
                 min_df: Union[int, float] = 5,
                 max_df: Union[int, float] = 0.85,
                 num_workers: int = 1):
        """
        Initialize model.

        :param pos_to_use: Parts of speech to use, possible values are (Farasa-specific)
            ['S', 'E', 'V', 'NOUN', 'PRON', 'ADJ', 'NUM', 'CONJ', 'PART', 'NSUFF',
             'CASE', 'FOREIGN', 'DET', 'PREP', 'ABBREV', 'PUNC']
        :param stop_words: list/set of stop words or filepath to the file containing
            the stop words.
        :param max_df: When building the vocabulary, ignore terms that have a document
            frequency strictly higher than the given threshold (corpus-specific stop words).
        :param min_df: When building the vocabulary, ignore terms that have a document
            frequency strictly lower than the given threshold. This value is also called
            cut-off in the literature.
        :param num_workers: Number of workers to use for preprocessing and training.
        """
        if isinstance(stop_words, str):
            stop_words = open(stop_words).read().split('\n')
        if isinstance(stop_words, list):
            stop_words = set(stop_words)
        self.pos_to_use = pos_to_use
        self.num_workers = num_workers
        self.min_df = min_df
        self.max_df = max_df
        self.stop_words = stop_words
        self.vectorizer = TfidfVectorizer(stop_words=stop_words, max_df=self.max_df)
        self.bigram_model: Optional[Phraser] = None
        self.trigram_model: Optional[Phraser] = None
        self.id2word: Optional[Dict] = None
        self._farasa: Optional[Farasa] = None

    @staticmethod
    def _init_pool():
        """ Initialize pool. Run only once. """
        global farasa
        farasa = Farasa(singelton=False)

    def preprocess_document(self, document: str) -> str:
        """
        Preprocess document.

        :param document: document to preprocess.
        """
        return _preprocess_arabic_text(document,
                                       remove_non_arabic=True,
                                       remove_punctuation=True,
                                       remove_numbers=True,
                                       remove_emails_urls_html=True,
                                       remove_hashtags_mentions=True)

    def _unit_of_work(self, pos_to_use: List[str], document: str) -> str:
        """
        Apply unit of work.

        :param pos_to_use: Parts of speech to keep.
        :param document: Document to process.
        """
        global farasa
        return farasa.filter_pos(  # type: ignore
            self.preprocess_document(document),
            parts_of_speech_to_keep=pos_to_use)

    def preprocess_documents(self, documents: Sequence[str]) -> Generator[str, None, None]:
        """
        Preprocess documents.

        :param documents: documents to preprocess.
        """
        progress = tqdm(total=len(documents))
        LOGGER.info('Launching %d workers..', self.num_workers)
        pool = Pool(self.num_workers, initializer=self._init_pool)
        LOGGER.info('Preprocessing documents using %d workers..', self.num_workers)
        results = []
        for document in documents:
            result = pool.apply_async(self._unit_of_work, (self.pos_to_use, document),
                                      callback=lambda *args: progress.update())
            results.append(result)
        for result in results:
            document = result.get()
            if document != '':
                yield document
        LOGGER.info('Preprocessing is done.')

    def tokenize(self, document: str) -> List[str]:
        """ Tokenize a document. Uses NLTK word tokenizer. """
        tokens = word_tokenize(document)
        return [token for token in tokens if token not in self.stop_words]

    def create_trigrams(self, tokens: List[str]) -> List[str]:
        """
        Create trigrams.

        :param tokens: list of tokens.
        :returns: n-gram where n is between 1-3.
        """
        if self.trigram_model and self.bigram_model:
            return self.trigram_model[self.bigram_model[tokens]]
        raise ValueError('trigram model is not fitted yet!')

    def build_vocab(self, documents_tokens: List[List[str]]) -> Tuple[List[List[str]], Dict]:
        """
        Build vocabulary.

        :param documents_tokens: documents as list of tokens, e.g.
            [ ['the', 'brown', 'fox'], ['another', 'word', ..], ... ]
        :returns: a tuple consisting of list of documents as word counts (Bag-of-words),
            and Id2Word dictionary.
        """
        LOGGER.info('Fitting bigram model..')
        bigram = Phrases(documents_tokens, min_count=self.min_df, threshold=100,
                         progress_per=100, common_terms=self.stop_words)
        self.bigram_model = Phraser(bigram)
        LOGGER.info('Fitting trigram model..')
        self.trigram_model = Phraser(Phrases(bigram[documents_tokens], threshold=100))
        documents_trigrams = []
        LOGGER.info('Creating trigrams..')
        for index in range(len(documents_tokens) - 1, -1, -1):
            documents_trigrams.append(self.create_trigrams(documents_tokens[index]))
            documents_tokens.pop()
        id2word = Dictionary(documents_trigrams)
        return [id2word.doc2bow(text) for text in documents_trigrams], id2word

    def fit(self, documents: Sequence[str], preprocess: bool, passes: int,
            random_state: int, num_topics: int, chunksize: int = 1000):
        """
        Fit model.

        :param documents: documents to fit the model on.
        :param preprocess: whether to preprocess documents before training the model.
        :param passes: number of passes over the training dataset, 1 is enough if
            dataset is large.
        :param random_state: random state seed for reproducibility.
        :param num_topics: number of topics.
        :param chunksize: number of documents to use per update.
        """
        self.vectorizer = self.vectorizer.fit(documents)
        self.stop_words |= self.vectorizer.stop_words_
        documents_iter: Iterable = (documents if not preprocess
                                    else self.preprocess_documents(documents))
        LOGGER.info('Building vocab..')
        corpus, self.id2word = self.build_vocab([self.tokenize(x) for x in documents_iter])
        LOGGER.info('Fitting lda..')
        self._lda_model = LdaMulticore(
            corpus=corpus,
            id2word=self.id2word,
            num_topics=num_topics,
            random_state=random_state,
            chunksize=chunksize,
            passes=passes,
            per_word_topics=True,
            workers=self.num_workers,
        )
        self.topics = self._lda_model.print_topics(num_topics=num_topics, num_words=100)

    def predict(self, document, topics_map: Dict[int, str], num_topics: int) -> List[str]:
        """
        Predict topics distribution for a document.

        :params document: document to predict topics for.
        :params topics_map: a mapping of topic number to topic name.
        :params num_topics: return the top num_topics.
        :returns: a list of topic numbers sorted by their probabilities.
        """
        tokens = (seq([document])
                  .map(self.preprocess_document)
                  .map(lemmatize)  # type: ignore
                  .map(self.tokenize)
                  .map(self.create_trigrams)
                  .flat_map(self.id2word.doc2bow)  # type: ignore
                  .to_list())
        topics = (seq(self._lda_model[tokens][0])
                  .sorted(key=lambda x: -x[1])
                  .map(get(0))
                  .filter(None)
                  .distinct()
                  .take(num_topics))
        if topics_map:
            topics = topics.map(lambda topic: topics_map[topic])
        return topics.to_list()

    @staticmethod
    def load(path: str) -> 'TopicModel':
        """
        Load model.

        :param path: path to the model.
        """
        return dill.load(open(path, 'rb'))

    def save(self, path: str):
        """
        Save model.

        :param path: path to save the model to.
        """
        farasa: Farasa = self.__dict__.pop('_farasa')
        dill.dump(self, open(path, 'wb'))
        self._farasa = farasa
class GensimMalletTopicExtractor:
    def __init__(self, language='english', stopwords_extent=None):
        self.language2la = {'english': 'en', 'french': 'fr', 'spanish': 'es'}
        if language not in self.language2la:
            raise ValueError('Language must be "english", "french" or "spanish"')
        self.language = language
        self.stop_words = stopwords.words(self.language)
        # the original `stopwords_extent is str` test compared a value to a type and
        # was always False; an isinstance check is what was intended
        if isinstance(stopwords_extent, (str, list)):
            self.stop_words.extend(stopwords_extent)
        self.df_topic_sents_keywords = None
        self.bigram = None
        self.bigram_phraser = None
        self.trigram = None
        self.trigram_phraser = None
        self.vis = None
        self.data = None
        self.data_words = None
        self.data_words_nostops = None
        self.data_words_bigrams = None
        self.data_words_trigrams = None
        self.nlp = None
        self.data_lemmatized = None
        self.id2word = None
        self.texts = None
        self.corpus = None
        self.mallet_path = None
        self.lda_model = None
        self.coherence_model_lda = None
        self.coherence_lda = None
        self.coherence_values = []
        self.model_list = []
        self.optimal_number_of_topics = None
        self.optimal_model = None
        self.optimal_topics = None

    @staticmethod
    def sent_to_words(sentences, remove_punctuation=True):
        for sentence in sentences:
            # deacc=True removes punctuations
            yield simple_preprocess(str(sentence), deacc=remove_punctuation)

    def remove_stopwords(self, texts):
        return [[word for word in simple_preprocess(str(doc))
                 if word not in self.stop_words] for doc in texts]

    def make_bigrams(self, texts):
        self.bigram = Phrases(self.data_words, min_count=5, threshold=100)
        self.bigram_phraser = Phraser(self.bigram)
        return [self.bigram_phraser[doc] for doc in texts]

    def make_trigrams(self, texts):
        tokens_ = self.bigram_phraser[texts]
        self.trigram = Phrases(tokens_, threshold=100)
        self.trigram_phraser = Phraser(self.trigram)
        return [self.trigram_phraser[self.bigram_phraser[doc]] for doc in texts]

    def lemmatization(self, texts, allowed_postags=None):
        """https://spacy.io/api/annotation"""
        if allowed_postags is None:
            allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']
        texts_out = []
        for sent in texts:
            doc = self.nlp(" ".join(sent))
            texts_out.append([token.lemma_ for token in doc
                              if token.pos_ in allowed_postags])
        return texts_out

    def view_terms_frequency(self, text_id, first_words=20):
        # Human readable format of corpus (term-frequency)
        list_ = [[(self.id2word[id_], freq) for id_, freq in text[:first_words]]
                 for text in self.corpus[text_id]]
        pprint(list_)

    def visualize_lda(self):
        # Visualize the topics
        # pyLDAvis.enable_notebook()
        self.vis = pyLDAvis.gensim.prepare(self.lda_model, self.corpus, self.id2word)
        print(self.vis)

    def instanciate_model(self, num_topics, passes, iterations, enable_mallet,
                          optimize_interval, topic_threshold,
                          show_topics_on_creation=False):
        if enable_mallet is True:
            # Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
            os.environ.update({'MALLET_HOME': r'C:/mallet-2.0.8/'})
            self.mallet_path = 'C:\\mallet-2.0.8\\bin\\mallet'  # update this path
            self.lda_model = LdaMallet(self.mallet_path, corpus=self.corpus,
                                       num_topics=num_topics, id2word=self.id2word,
                                       iterations=iterations,
                                       optimize_interval=optimize_interval,
                                       topic_threshold=topic_threshold)
            print('Mallet LDA model built\n')
            if show_topics_on_creation is True:
                pprint(self.lda_model.show_topics(formatted=False))
        else:
            self.lda_model = LdaMulticore(corpus=self.corpus, id2word=self.id2word,
                                          num_topics=num_topics, random_state=100,
                                          chunksize=500, passes=passes,
                                          iterations=iterations, per_word_topics=True)
            print('LDA_MultiCore model built\n')
            if show_topics_on_creation is True:
                pprint(self.lda_model.print_topics())

    def extract_topics(self, data, num_topics, passes=10, iterations=500,
                       enable_mallet=True, optimize_interval=0, topic_threshold=0.0):
        self.data = data
        print('\nEXTRACTING ' + str(num_topics) + ' TOPICS')
        self.data_words = list(self.sent_to_words(self.data, True))
        # Remove Stop Words
        print('\nRemoving stopwords')
        self.data_words_nostops = self.remove_stopwords(self.data_words)
        # Form Bigrams
        print('Looking for bigrams')
        self.data_words_bigrams = self.make_bigrams(self.data_words_nostops)
        # Form Trigrams
        print('Looking for trigrams')
        self.data_words_trigrams = self.make_trigrams(self.data_words_nostops)
        # Initialize spacy model, keeping only tagger component (for efficiency)
        # python3 -m spacy download en
        print('Loading Spacy with ' + self.language + ' dictionary')
        self.nlp = spacy.load(self.language2la[self.language], disable=['parser', 'ner'])
        # Do lemmatization keeping only noun, adj, vb, adv
        print('Lemmatizing')
        self.data_lemmatized = self.lemmatization(
            self.data_words_trigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
        # Create Dictionary
        print('Creating dictionary')
        self.id2word = corpora.Dictionary(self.data_lemmatized)
        # Create Corpus
        print('Creating corpus')
        self.texts = self.data_lemmatized
        # Term Document Frequency
        print('Computing document frequency')
        self.corpus = [self.id2word.doc2bow(text) for text in self.texts]
        # Build LDA model
        print('\nEnable_mallet is', enable_mallet, '\n')
        self.instanciate_model(num_topics, passes, iterations, enable_mallet,
                               optimize_interval, topic_threshold,
                               show_topics_on_creation=True)
        # print(self.lda_model[self.corpus])
        # Compute Perplexity: a measure of how good the model is; the lower the better
        if hasattr(self.lda_model, 'log_perplexity'):
            print('\nPerplexity: ', self.lda_model.log_perplexity(self.corpus))
        # Compute Coherence Score
        print('\nComputing coherence model')
        self.coherence_model_lda = CoherenceModel(model=self.lda_model,
                                                  texts=self.data_lemmatized,
                                                  dictionary=self.id2word,
                                                  coherence='c_v')
        print('Getting coherence')
        self.coherence_lda = self.coherence_model_lda.get_coherence()
        print('\nCoherence Score: ', self.coherence_lda)
        if enable_mallet is False:
            self.visualize_lda()

    def view_optimal_topics(self, num_words=20):
        pprint(self.optimal_model.print_topics(num_words=num_words))

    def compute_coherence_values(self, limit, start=2, step=3, passes=10,
                                 iterations=500, enable_mallet=True,
                                 optimize_interval=0, topic_threshold=0.0):
        """
        Compute c_v coherence for various number of topics

        Parameters:
        ----------
        limit : Max num of topics

        Returns:
        -------
        model_list : List of LDA topic models
        coherence_values : Coherence values corresponding to the LDA model with
                           respective number of topics
        """
        for num_topics in range(start, limit, step):
            print('\n' + '*' * 10 + ' COMPUTING COHERENCE FOR ' +
                  str(num_topics) + ' TOPICS ' + '*' * 10)
            self.instanciate_model(num_topics, passes, iterations, enable_mallet,
                                   optimize_interval, topic_threshold,
                                   show_topics_on_creation=False)
            self.model_list.append(self.lda_model)
            coherence_model = CoherenceModel(model=self.lda_model,
                                             texts=self.data_lemmatized,
                                             dictionary=self.id2word,
                                             coherence='c_v')
            self.coherence_values.append(coherence_model.get_coherence())
        # Show graph
        x = range(start, limit, step)
        plt.plot(x, self.coherence_values)
        plt.xlabel("Num Topics")
        plt.ylabel("Coherence score")
        plt.legend("coherence_values", loc='best')
        plt.show()
        # Print the coherence scores
        for m, cv in zip(x, self.coherence_values):
            print("Num Topics =", m, " has Coherence Value of", round(cv, 4))
        optimal_model_index = self.coherence_values.index(max(self.coherence_values))
        # account for the step size; the original `start + optimal_model_index` was
        # only correct when step == 1
        self.optimal_number_of_topics = start + optimal_model_index * step
        self.optimal_model = self.model_list[optimal_model_index]
        print('\nOptimal number of topics is ' + str(self.optimal_number_of_topics) +
              ' with coherence score : ' + str(self.coherence_values[optimal_model_index]))
        self.optimal_topics = self.optimal_model.show_topics(
            num_topics=self.optimal_number_of_topics, num_words=20, formatted=False)
        self.view_optimal_topics()

    def format_topics_sentences(self, ldamodel=None):
        if ldamodel is None and self.optimal_model is not None:
            ldamodel = self.optimal_model
        elif ldamodel is None and self.lda_model is not None:
            ldamodel = self.lda_model
        # Init output
        sent_topics_df = pd.DataFrame()
        # Get main topic in each document
        for i, row in enumerate(ldamodel[self.corpus]):
            row = sorted(row, key=lambda x: (x[1]), reverse=True)
            # Get the Dominant topic, Perc Contribution and Keywords for each document
            for j, (topic_num, prop_topic) in enumerate(row):
                if j == 0:  # => dominant topic
                    wp = ldamodel.show_topic(topic_num)
                    topic_keywords = ", ".join([word for word, prop in wp])
                    sent_topics_df = sent_topics_df.append(
                        pd.Series([int(topic_num), round(prop_topic, 4), topic_keywords]),
                        ignore_index=True)
                else:
                    break
        sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
        # Add original text to the end of the output
        contents = pd.Series(self.data)
        sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
        return sent_topics_df

    def get_most_representative_documents(self):
        # Group top 5 sentences under each topic
        sent_topics_sorteddf_mallet = pd.DataFrame()
        if self.df_topic_sents_keywords is None:
            self.df_topic_sents_keywords = self.format_topics_sentences()
        # Format
        df_dominant_topic = self.df_topic_sents_keywords.reset_index()
        df_dominant_topic.columns = ['Document_No', 'Dominant_Topic',
                                     'Topic_Perc_Contrib', 'Keywords', 'Text']
        sent_topics_outdf_grpd = self.df_topic_sents_keywords.groupby('Dominant_Topic')
        for i, grp in sent_topics_outdf_grpd:
            sent_topics_sorteddf_mallet = pd.concat(
                [sent_topics_sorteddf_mallet,
                 grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)],
                axis=0)
        # Reset Index
        sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)
        # Format
        sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib",
                                               "Keywords", "Text"]
        # Show
        sent_topics_sorteddf_mallet.head()
        for i in range(len(sent_topics_sorteddf_mallet)):
            print(i, sent_topics_sorteddf_mallet.loc[i, 'Text'])

    def get_topic_distribution(self):
        if self.df_topic_sents_keywords is None:
            self.df_topic_sents_keywords = self.format_topics_sentences()
        # Number of Documents for Each Topic
        topic_counts = self.df_topic_sents_keywords['Dominant_Topic'].value_counts()
        # Percentage of Documents for Each Topic
        topic_contribution = round(topic_counts / topic_counts.sum(), 4)
        # Topic Number and Keywords
        topic_num_keywords = self.df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]
        # Concatenate Column wise
        df_dominant_topics = pd.concat([topic_num_keywords, topic_counts,
                                        topic_contribution], axis=1)
        # Change Column names
        df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords',
                                      'Num_Documents', 'Perc_Documents']
        # Show
        print(df_dominant_topics)
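# Hedged usage sketch for GensimMalletTopicExtractor above; `docs` is a hypothetical
# list of raw documents, and enable_mallet=False sidesteps the Windows-specific
# Mallet path baked into instanciate_model.
extractor = GensimMalletTopicExtractor(language='english')
extractor.extract_topics(docs, num_topics=10, enable_mallet=False)
extractor.compute_coherence_values(limit=20, start=5, step=5, enable_mallet=False)
extractor.get_topic_distribution()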
# (snippet begins mid-call: the LdaMulticore(...) opening is not shown)
                     id2word=dictionary_big,
                     workers=min(10, multiprocessing.cpu_count() - 1),
                     passes=num_passes,
                     chunksize=chunk_size,
                     alpha=0.5)
print("done in {}".format(time.time() - start), flush=True)
topic_corpus = model[model_corpus]
# topic_corpus[0]

# Print the topics in a more readable format, transforming them using RegEx
topic_sep = re.compile(r"0\.[0-9]{3}\*")
model_topics = [(topic_no, re.sub(topic_sep, '', model_topic).split(' + '))
                for topic_no, model_topic in model.print_topics(
                    num_topics=num_topics, num_words=5)]
descriptors = []
for i, m in model_topics:
    print(i + 1, ", ".join(m[:5]))
    descriptors.append(", ".join(m[:2]).replace('"', ''))

# #### 2.5.2 - DYNAMIC TOPIC MODELING -- LdaSeqModel
# <a id="dynamic"></a>

# In[41]:

'''Analyzing the changes of three topics between the two halves of Harry Potter 1
and the two halves of Harry Potter 7'''
# Create an object toklist_17 containing the tokens for Book 1 and Book 7 together
toklist_17 = corpustot.loc[corpustot['Book'].isin(
    ['Harry Potter 1', 'Harry Potter 7'])]['Tokens'].to_list()
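# Hedged sketch of the LdaSeqModel step the notebook heading announces: a dynamic
# topic model over toklist_17. The dictionary construction and the two time slices
# (a hypothetical half/half split of the combined document list) are assumptions;
# LdaSeqModel is single-core and considerably slower than LdaMulticore.
from gensim.corpora import Dictionary
from gensim.models import LdaSeqModel

dict_17 = Dictionary(toklist_17)
corpus_17 = [dict_17.doc2bow(doc) for doc in toklist_17]
# time_slice gives the number of documents per period
half = len(corpus_17) // 2
ldaseq = LdaSeqModel(corpus=corpus_17, id2word=dict_17,
                     time_slice=[half, len(corpus_17) - half], num_topics=3)
print(ldaseq.print_topics(time=0))  # topics in the first period
print(ldaseq.print_topics(time=1))  # ... and how they shifted in the second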