def get_lda_score_eval2(self, dictionary: typing.Dict[str, str], bow_corpus) -> tuple:
    """Train an LDA model and return it with per-document topic vectors.

    Parameters
    ----------
    dictionary : id2word mapping handed to the LDA model.
        NOTE(review): annotated Dict[str, str], but gensim expects an
        id -> word mapping (or a gensim Dictionary) — confirm with callers.
    bow_corpus : iterable of bag-of-words documents to fit on.

    Returns
    -------
    tuple
        ``(lda_model, docvecs)`` — the fitted transformer and the topic
        distribution for each input document.
        (BUG FIX: the annotation previously claimed ``list``.)
    """
    # Earlier direct-gensim variant kept for reference:
    # lda_model = gensim.models.ldamodel.LdaModel(bow_corpus, num_topics=self.topic_num,
    #     id2word=dictionary, passes=10, update_every=1, random_state=300,
    #     alpha=self.alpha, eta=self.eta)
    lda_model = LdaTransformer(
        num_topics=self.topic_num,
        id2word=dictionary,
        iterations=10,
        random_state=300,
        alpha=self.alpha,
        eta=self.eta,
        # NOTE(review): gensim documents the scorer values as
        # 'perplexity'/'u_mass' in some releases; verify 'mass_u' is valid
        # for the installed gensim version before .score() is ever called.
        scorer='mass_u',
    )
    # Topic distribution for each input document.
    docvecs = lda_model.fit_transform(bow_corpus)
    # pprint(lda_model.print_topics())
    return lda_model, docvecs
def train_lda_model():
    """Fit a 100-topic LDA model on the prepared corpus and persist it to disk."""
    frame = pd.read_csv("corpus/part-00000-8274d92c-217e-4ce7-80c7-50c52a899545-c000.csv", header=None)
    frame.columns = ["sentence"]
    # Whitespace-tokenize every sentence.
    tokenized = [row.split() for row in frame["sentence"].values.tolist()]
    # Reuse the dictionary built on a previous run instead of rebuilding it.
    dct = Dictionary.load('data/lda_dict')
    # First-run alternative: dct = Dictionary(tokenized); dct.save('./data/lda_dict')
    bows = [dct.doc2bow(tokens) for tokens in tokenized]
    model = LdaTransformer(num_topics=100, id2word=dct, random_state=1)
    model.fit(bows)
    joblib.dump(model, './data/lda.model')
def lda_gensim_to_sci(data, sections, n_topics, **kw):
    """Wrap gensim LDA model for scikit-learn.

    Combines the requested sections of the data, builds the dictionary /
    bag-of-words representation, and returns an sklearn-compatible
    LdaTransformer configured from ``lda_gensim_defaults`` overridden by
    ``kw`` and ``n_topics``.
    """
    dat = get_nips_combined(sections, data)
    # BUG FIX: the original passed the not-yet-defined name ``d`` to
    # lda_get_dictionary (NameError); the combined data ``dat`` was computed
    # but never used.
    d, bow = lda_get_dictionary(dat, **kw)
    args = {**lda_gensim_defaults, **kw}
    # per_word_topics is a gensim-only option the sklearn wrapper rejects;
    # drop it defensively even when the defaults stop providing it.
    args.pop('per_word_topics', None)
    args['num_topics'] = n_topics
    return LdaTransformer(id2word=d, **args)
def fit_model(corpora, dictionary, topicNum, beta):
    """Fit an LDA model and record its coherence metrics.

    Parameters
    ----------
    corpora : list of tokenized documents (list of list of str).
    dictionary : gensim Dictionary used to build the bag-of-words corpus.
    topicNum : number of topics for the model.
    beta : eta prior passed to the LDA model.

    Returns
    -------
    The trained underlying gensim LDA model.

    Side effects
    ------------
    Writes model configuration and metrics via ``saveModelConfigs`` to the
    module-level ``config_path``.
    """
    corpus = [dictionary.doc2bow(text) for text in corpora]
    model = LdaTransformer(id2word=dictionary, num_topics=topicNum, alpha='auto',
                           eta=beta, iterations=100, random_state=2019)
    lda = model.fit(corpus)
    # docvecs = lda.transform(corpus)
    coherence = evaluateModel(lda.gensim_model, corpora)
    try:
        cm = CoherenceModel(model=lda.gensim_model, corpus=corpus,
                            dictionary=dictionary, coherence='u_mass')
        u_mass = cm.get_coherence()
        cm = CoherenceModel(model=lda.gensim_model, texts=corpora, coherence='c_uci')
        c_uci = cm.get_coherence()
        cm = CoherenceModel(model=lda.gensim_model, texts=corpora, coherence='c_npmi')
        c_npmi = cm.get_coherence()
        saveModelConfigs(lda, coherence, u_mass, c_uci, c_npmi, config_path)
    except Exception:
        # BUG FIX: was a bare ``except:`` which also swallowed
        # KeyboardInterrupt/SystemExit. Coherence computation can legitimately
        # fail, so keep the best-effort fallback, but only for real errors.
        saveModelConfigs(lda, coherence, "Invalid", "Invalid", "Invalid", config_path)
    # return lda.gensim_model, docvecs
    return lda.gensim_model
# NOTE(review): this writerow call appears to be the tail of a function
# (presumably write_results_to_file) whose definition precedes this chunk —
# `writer`, `lda_model` and the max_* names are not defined at this level.
writer.writerow([str(lda_model.num_topics), str(lda_model.eta), str(max_bleu), str(max_jaccard), str(max_cos), str(max_fscore)])


def evaluateModel(lda_model, topic_info, term_emb, mode):
    """Score an LDA model with the evaluation strategy selected by `mode`.

    mode 1: topic-distribution comparison; 2/3: LDA average/max embeddings
    (these two are the only modes that use `term_emb`); 4/5: Word2Vec
    average/max embeddings. Results are appended to the simulation CSV.

    NOTE(review): there is no `else` branch — any other mode value leaves
    max_bleu/max_jaccard/max_cos/max_fscore undefined and the call to
    write_results_to_file raises NameError.
    """
    if mode == 1:
        max_bleu, max_jaccard, max_cos, max_fscore = main_topicDistr(lda_model, topic_info)
    elif mode == 2:
        max_bleu, max_jaccard, max_cos, max_fscore = main_LDA_avgEmb(lda_model, topic_info, term_emb)
    elif mode == 3:
        max_bleu, max_jaccard, max_cos, max_fscore = main_LDA_maxEmb(lda_model, topic_info, term_emb)
    elif mode == 4:
        max_bleu, max_jaccard, max_cos, max_fscore = main_Word2Vec_AvgEmb(lda_model, topic_info)
    elif mode == 5:
        max_bleu, max_jaccard, max_cos, max_fscore = main_Word2Vec_MaxEmb(lda_model, topic_info)
    write_results_to_file("/home/norberteke/PycharmProjects/Thesis/data/SO_simulation_results_2.csv", lda_model, max_bleu, max_jaccard, max_cos, max_fscore)


# Grid search over topic counts k in [10, 50] and eta priors `beta`.
# Relies on module-level `dictionary`, `corpus`, `terms` and `get_topic_info`
# defined elsewhere in the file.
k = []
for i in range(10,51):
    k.append(i)
beta = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]
for topic_num in k:
    print("----- Progress: k= ", topic_num ,"----")
    for b in beta:
        model = LdaTransformer(id2word=dictionary, num_topics = topic_num, alpha='auto', eta = b, iterations=100, random_state=2019)
        lda = model.fit(corpus)
        # Term-by-topic weight matrix, reshaped into a DataFrame keyed by vocabulary.
        term_topic_matrix = lda.gensim_model.get_topics()
        term_emb = pd.DataFrame(term_topic_matrix, columns=terms)
        topic_info = get_topic_info(lda.gensim_model, corpus, dictionary)
        # mode=2: evaluate via LDA average-embedding strategy.
        evaluateModel(lda.gensim_model, topic_info, term_emb, mode = 2)
# Persist the gensim dictionary and dump the raw token lists for inspection.
dictionary.save(
    "/home/norberteke/PycharmProjects/Thesis/data/GH_past_full_activity_gensimDictionary.dict"
)
with open(
        "/home/norberteke/PycharmProjects/Thesis/data/GH_past_full_activity_corpus.txt",
        'w') as f:
    for text in texts:
        f.write(str(text) + "\n")

# Bag-of-words corpus used by every fit below.
corpus = [dictionary.doc2bow(text) for text in texts]
# output_fname = get_tmpfile("/home/norberteke/PycharmProjects/Thesis/data/SO_recent_full_activity_gensimCorpus.mm")
# MmCorpus.serialize(output_fname, corpus)

model = LdaTransformer(id2word=dictionary, alpha='auto', iterations=100, random_state=2019)

# The list of hyper-parameters to optimize. For each one define the bounds.
space = [Integer(20, 500, name='num_topics'),
         Real(0.001, 200, name='eta')]


# This decorator (scikit-optimize) lets the objective receive the
# hyper-parameters as keyword arguments.
@use_named_args(space)
def objective(**params):
    """Refit the model with `params` and return a coherence-based score."""
    model.set_params(**params)
    lda = model.fit(corpus)
    coherence = evaluateModel(lda.gensim_model)
    try:
        # NOTE(review): this chunk is truncated here in the source view —
        # the CoherenceModel call (and the rest of objective) continues
        # beyond the visible text.
        cm = CoherenceModel(model=lda.gensim_model,
def __init__(self, columns, n_topics=20, n_iter=10, random_state=0,
             lib='sklearn', trained_model=None, start=2, stop=21, step=1,
             stop_words='english', max_features=None):
    """Configure a topic-model wrapper around sklearn or gensim LDA.

    Args:
        columns: list of column names (strings) to vectorize. Required.
        n_topics: number of topics (int >= 2), or the string 'auto' to
            search the range defined by start/stop/step.
        n_iter: number of training iterations (int >= 1).
        random_state: RNG seed (int >= 0).
        lib: 'sklearn' or 'gensim' — selects the backing implementation.
        trained_model: optional pre-trained model; when given, its
            hyper-parameters are read back instead of the arguments above
            and no new model is constructed.
        start, stop, step: topic-count search range, used only when
            n_topics == 'auto'.
        stop_words, max_features: vectorizer options, stored as given.

    Raises:
        TypeError: on wrongly-typed arguments.
        ValueError: on out-of-range arguments or an unsupported lib.
    """
    self.model = None
    self.trained_model = None
    self.lib = None
    self.columns = None
    self.stop_words = stop_words
    self.max_features = max_features

    # --- columns ------------------------------------------------------
    if not columns:
        raise ValueError(
            "You have to specify which columns you want to vectorize")
    if isinstance(columns, list) and all(
            isinstance(col, str) for col in columns):
        self.columns = columns
    else:
        raise TypeError(
            "Columns has to be list of strings . Column {} is of type {}"
            .format(columns, type(columns)))

    if trained_model:
        # Pre-trained models are not re-trained; read their settings back.
        warnings.warn(
            "Trained models are not trained again. Please make sure to only input the column(s) "
            "that the model was trained on", UserWarning)
        self.trained_model = trained_model
        self.random_state = self.trained_model.random_state
        # BUG FIX: was isinstance(..., type(LDA_skl())), which needlessly
        # instantiated a throwaway estimator just to get its class.
        if isinstance(self.trained_model, LDA_skl):
            self.n_topics = self.trained_model.n_components
            self.n_iter = self.trained_model.max_iter
        else:
            self.n_topics = self.trained_model.num_topics
            self.n_iter = self.trained_model.iterations
    else:
        if n_topics == 'auto':
            self.n_topics = n_topics
            # BUG FIX: the original loop `break`-ed after the first value,
            # so only `start` was ever validated — and its shared `val < 2`
            # check would have rejected the default step=1 had it run.
            # Validate each bound with its own lower limit instead.
            for name, number, minimum in (('Start', start, 2),
                                          ('Stop', stop, 2),
                                          ('Step', step, 1)):
                try:
                    val = int(number)
                except TypeError:
                    raise TypeError(
                        "That's not an int! Received: {}".format(
                            type(number)))
                if val < minimum:
                    raise ValueError(
                        "{} has to be at least {}. Received: {}"
                        .format(name, minimum, number))
            if stop < start:
                raise ValueError(
                    "Stop value has to be higher than the start value. Received: {}"
                    .format((start, stop)))
            self.start = start
            self.stop = stop
            self.step = step
        else:
            if not isinstance(n_topics, int):
                raise TypeError(
                    "Number of topic has to be an integer. Received: {}".
                    format(type(n_topics)))
            if n_topics < 2:
                raise ValueError(
                    "Number of topics has to be at least 2. Received: {}".
                    format(n_topics))
            self.n_topics = n_topics

        # BUG FIX: these two messages said "Random_state" (copy-paste from
        # the checks below) while actually validating n_iter.
        if not isinstance(n_iter, int):
            raise TypeError(
                "Number of iterations has to be an integer. Received: {}".format(
                    type(n_iter)))
        if n_iter < 1:
            raise ValueError(
                "Number of iterations has to be at least 1. Received: {}".format(
                    n_iter))
        self.n_iter = n_iter

        if not isinstance(random_state, int):
            raise TypeError(
                "Random_state has to be a integer. Received: {}".format(
                    type(random_state)))
        if random_state < 0:
            raise ValueError(
                "Random_state has to be positive or zero. Received: {}".
                format(random_state))
        self.random_state = random_state

        if not isinstance(lib, str):
            raise TypeError("Lib has to be a string. Received: {}".format(
                type(lib)))
        # NOTE(review): reconstructed from a whitespace-collapsed source —
        # the lib check / model construction are placed on the
        # no-trained-model path, matching `self.lib = None` staying in
        # effect when a trained model is supplied. Confirm against callers.
        if lib == 'sklearn':
            self.model = \
                LDA_skl(n_components=self.n_topics,
                        max_iter=self.n_iter,
                        random_state=self.random_state)
        elif lib == 'gensim':
            self.model = \
                LdaTransformer(num_topics=self.n_topics,
                               iterations=self.n_iter,
                               random_state=self.random_state)
        else:
            raise ValueError(
                "The supported libraries are sklearn and gensim. Received: {}"
                .format(lib))
        self.lib = lib