import typing

import numpy as np
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore
from tqdm import tqdm

# InputEmbedding and TweetNormalisation come from the surrounding project.


class LDAembedding(InputEmbedding):
    def __init__(self, workdir="./embedding-models", name="lda-embedding"):
        """
        Builds a vocabulary when pretrain is called.

        :param workdir: directory in which the model files are stored
        :param name: name embedded in the model file names
        """
        super(LDAembedding, self).__init__(workdir=workdir, name=name)
        self._normalizer = TweetNormalisation()

    def _load(self):
        modeldir = self._workdir.joinpath("ldamodel_{}".format(self._name))
        if not modeldir.exists():
            return False
        self._lda = LdaMulticore.load(str(modeldir))
        self._dictionary = Dictionary.load(
            str(self._workdir.joinpath("dictionary_{}.gz".format(self._name))))
        return True

    def pretrain(self, texts: typing.Iterable[typing.Text]):
        texts = [self._normalizer(text).split() for text in tqdm(texts)]
        self._dictionary = Dictionary(texts, prune_at=200000)
        corpus = [self._dictionary.doc2bow(text) for text in tqdm(texts)]
        self._lda = LdaMulticore(corpus=corpus,
                                 id2word=self._dictionary,
                                 workers=15,
                                 num_topics=50)
        self._dictionary.save(
            str(self._workdir.joinpath("dictionary_{}.gz".format(self._name))))
        self._lda.save(
            str(self._workdir.joinpath("ldamodel_{}".format(self._name))))

    def get_train_data(self, texts: typing.Iterable[typing.Text]) -> np.ndarray:
        # minimum_probability=0 returns every topic, so each document yields
        # a dense vector of length num_topics.
        to_array = lambda x: np.array([
            v for _, v in self._lda.get_document_topics(x, minimum_probability=0)
        ])
        return np.stack([
            to_array(self._dictionary.doc2bow(self._normalizer(text).split()))
            for text in texts
        ])
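# Minimal usage sketch for LDAembedding; `raw_tweets` is a hypothetical
# list of tweet strings.
# emb = LDAembedding(workdir="./embedding-models", name="lda-embedding")
# emb.pretrain(raw_tweets)                   # fits dictionary + 50-topic LDA and saves both
# features = emb.get_train_data(raw_tweets)  # array of shape (len(raw_tweets), 50)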
import glob

import pyLDAvis
import pyLDAvis.gensim
from gensim import corpora
from gensim.models import LdaMulticore

# pre_process comes from the surrounding project.


def make_ldamodel(pre_processed, num_topics=5, pylda=False):
    dictionary = corpora.Dictionary(pre_processed)
    corpus = [dictionary.doc2bow(text) for text in pre_processed]
    model = LdaMulticore(corpus,
                         num_topics=num_topics,
                         id2word=dictionary,
                         passes=30,
                         random_state=1)
    if pylda:
        lda_display = pyLDAvis.gensim.prepare(model, corpus, dictionary)
        with open('topic_associations.txt', 'w') as outfile:
            for f in sorted(glob.glob('letters/*.txt')):
                with open(f, 'r') as fp:
                    text = pre_process([fp.read()])
                # Use the training dictionary so the bag-of-words ids
                # match the ids the model was trained on.
                outfile.write('%s - %s\n' % (
                    f, model.get_document_topics(dictionary.doc2bow(text[0]))))
        pyLDAvis.show(lda_display)
    return model, dictionary, corpus
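# Minimal call sketch; `docs` is a hypothetical list of token lists
# produced by the project's pre_process().
# model, dictionary, corpus = make_ldamodel(docs, num_topics=5)
# print(model.print_topics(num_words=10))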
# This notebook cell assumes `model`, `corpus`, `texts`, `dictionary`,
# and `id2word` from earlier cells.

# Build LDA model
# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                             id2word=id2word,
#                                             num_topics=35,
#                                             random_state=100,
#                                             update_every=1,
#                                             chunksize=100,
#                                             passes=10,
#                                             alpha='auto',
#                                             per_word_topics=True)

pprint(model.print_topics())
doc_lda = model[corpus]
doc_lda[4]
model.get_document_topics(corpus)[1]

# Compute Perplexity
# log_perplexity returns a per-word likelihood bound, not the perplexity
# itself; values closer to zero indicate a better model.
print('\nPerplexity: ', model.log_perplexity(corpus))

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=model,
                                     texts=texts,
                                     dictionary=dictionary,
                                     coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(model, corpus, dictionary)
vis

mallet_path = '/home/ubuntu/Signal/mallet-2.0.8/bin/mallet'
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path,
                                             corpus=corpus,
                                             num_topics=35,
                                             id2word=id2word)
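# Optional follow-up sketch: score the Mallet model with the same c_v
# coherence as above, so both models are compared on one metric
# (assumes the gensim 3.x wrappers API used in this cell).
coherence_model_mallet = CoherenceModel(model=ldamallet,
                                        texts=texts,
                                        dictionary=dictionary,
                                        coherence='c_v')
print('\nMallet Coherence Score: ', coherence_model_mallet.get_coherence())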
# Assumed module-level imports for this method: numpy as np, scipy.sparse,
# gensim, TfidfVectorizer (sklearn), and Dictionary / LdaMulticore (gensim).
def generate_embedings(self, method="tf-idf", tag=None, tag_column=None,
                       return_model=False):
    # Check that the tag arguments are consistent before doing any work.
    if tag is not None and tag_column is None:
        raise ValueError("if passing tag must pass tag_column as well")
    if tag_column is not None and tag is None:
        raise ValueError("if passing tag_column must pass tag as well")

    # Select the texts and dump them to a file for the multiprocess steps.
    if tag is not None and tag_column is not None:
        if tag_column not in self.df.columns:
            raise ValueError(f"Tag {tag_column} not found in dataset")
        elif tag not in self.df[tag_column].to_list():
            raise ValueError(f"Tag {tag} not found in dataset column {tag_column}")
        texts = self.df[self.df[tag_column] == tag][self.text_column]
    else:
        texts = self.df[self.text_column]
    with open('storage/texts.txt', 'w', encoding='utf8') as file:
        for sentence in texts:
            file.write(" ".join([tok for tok in sentence]) + "\n")

    # Return the cached vectors when the whole corpus was already embedded.
    if method in self.embedings and tag is None:
        if return_model:
            return self.embedings[method]
        else:
            return self.embedings[method][0]

    # tf-idf
    if method == "tf-idf":
        model = TfidfVectorizer(min_df=5,
                                max_df=0.9,
                                max_features=5000,
                                sublinear_tf=False,
                                analyzer=lambda x: x)
        vectors = model.fit_transform(texts)

    # Word2Vec (also the first stage of cbow)
    elif method == "word2vec" or method == "cbow":
        model = gensim.models.Word2Vec(corpus_file='storage/texts.txt',
                                       window=5,
                                       size=200,
                                       min_count=5,
                                       iter=100,
                                       workers=4)
        vectors = model.wv
        if tag is None:
            # cache the raw word vectors too, in the (vectors, model) format
            self.embedings["word2vec"] = (model.wv, model)

        # cbow: average the word vectors of each text and L2-normalise.
        if method == "cbow":
            vectors = []
            for text in texts:
                vec = np.zeros(model.wv.vector_size)
                for word in text:
                    if word in model.wv.vocab:
                        vec += model.wv.get_vector(word)
                norm = np.linalg.norm(vec)
                if norm > np.finfo(float).eps:
                    vec /= norm
                vectors.append(vec)
            vectors = scipy.sparse.csr_matrix(vectors)

    # Doc2Vec
    elif method == "doc2vec":
        model = gensim.models.Doc2Vec(corpus_file='storage/texts.txt',
                                      vector_size=200,
                                      window=5,
                                      min_count=5,
                                      workers=12,
                                      epochs=100)
        vectors = scipy.sparse.csr_matrix(model.docvecs.vectors_docs)

    # LDA ("lda" or "lda_<num_topics>")
    elif "lda" in method:
        if "_" in method:
            NUM_TOPICS = int(method.split("_")[-1])
        else:
            NUM_TOPICS = 20
        dictionary = Dictionary(texts)
        doc2bow = [dictionary.doc2bow(text) for text in texts]
        ldamodel = LdaMulticore(doc2bow,
                                num_topics=NUM_TOPICS,
                                id2word=dictionary,
                                passes=30)
        # minimum_probability=0 yields one entry per topic, so dense topic
        # vectors can be built without any index bookkeeping.
        raw_vecs = [ldamodel.get_document_topics(text, minimum_probability=0)
                    for text in doc2bow]
        lda_vecs = [[prob for _, prob in vec] for vec in raw_vecs]
        vectors = scipy.sparse.csr_matrix(lda_vecs)
        model = [ldamodel, doc2bow, dictionary]
    else:
        raise ValueError(f"Method {method} is not recognized")

    # If not building a tagged subset, cache the results.
    if tag is None and not self.low_memory:
        self.embedings[method] = (vectors, model)
    if return_model:
        return vectors, model
    else:
        return vectors
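# Hypothetical usage sketch; the enclosing class is not shown above, so
# `pipeline` stands in for an instance of it (with df, text_column,
# embedings and low_memory already set up).
# tfidf_vectors = pipeline.generate_embedings(method="tf-idf")
# lda_vectors, (ldamodel, bows, dictionary) = pipeline.generate_embedings(
#     method="lda_30", return_model=True)  # the trailing _30 sets num_topics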
import glob
import os
import pickle
from collections import defaultdict

import numpy as np
import pyLDAvis
import pyLDAvis.gensim
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import LdaMulticore

# Utilities is the project's own logging/tokenising helper module.


class LDA():
    corpus = None
    model = None
    dictionary = None
    util = None
    loaded = False
    topicLabelling = defaultdict(int)

    def __init__(self, utilObj=None, logfilename=None):
        if utilObj is not None:
            self.util = utilObj
        elif logfilename is not None:
            self.util = Utilities.Utility()
            self.util.setupLogFileLoc(logfilename)
        self.util.startTimeTrack()

    def labelTopics(self, modelFilename):
        if os.path.exists(modelFilename + '.label'):
            with open(modelFilename + '.label', "rb") as f:
                self.topicLabelling = pickle.load(f)
        else:
            # Label file not available, performing manual labelling. (One-time operation)
            topics = self.model.show_topics(num_topics=100, num_words=20)
            print('You will be shown a series of words and asked to label the topic in the form of an integer\n')
            for topic in topics:
                print('The words affiliated to this topic are as follows\n', topic[1])
                print('\033[92m' + 'Please label as one of these \n(0) EDUCATION\n(1) SKILLS\n(2) PERSONAL DETAILS\n(3) WORK EXPERIENCE' + '\033[0m')
                mappedTopicInt = input('Please enter a new integer for this topic: ')
                self.topicLabelling[topic[0]] = mappedTopicInt
            with open(modelFilename + '.label', "wb") as f:
                pickle.dump(self.topicLabelling, f)

    def buildCorpus(self, folderListOfCorpus=None, maxdocs=-1):
        """
        For each folder,
            for each cvd2v file in the folder,
                tokenise the content with Utility and join the tokens into a string.
                Append the string to a list (this forms one document).
        """
        self.util.logDebug('LDA', 'Building and fitting corpus ')
        documentList = []
        maxDocPerFolder = int(maxdocs / len(folderListOfCorpus.split(',')))
        docCounter = 0
        for folder in folderListOfCorpus.split(','):
            self.util.logDebug('LDA', 'Processing ' + folder)
            for filename in sorted(glob.iglob(folder + '/*.cvd2v')):
                if docCounter <= maxDocPerFolder:
                    fileContent = self.util.tokensToStr(
                        self.util.tokenize(
                            self.util.readFileContent(filename=filename),
                            removeStopwords=True,
                            toLowercase=True,
                            replaceSlash=True,
                            flatEmail=True,
                            flatMonth=True,
                            flatNumber=True,
                            lemmatize=True), ' ')
                    documentList.append(fileContent)
                    docCounter = docCounter + 1
                else:
                    # Folder quota reached; reset the counter and move to the next folder.
                    docCounter = 0
                    break
        self.util.logDebug('LDA', str(len(documentList)) + ' documents loaded in ' + self.util.stopTimeTrack())
        texts = [[word for word in document.lower().split()] for document in documentList]
        self.util.logDebug('LDA', 'No of documents: ' + str(len(texts)))
        self.util.logDebug('LDA', 'Text example: ' + str(texts[0]))
        self.dictionary = Dictionary(texts)
        self.corpus = [self.dictionary.doc2bow(text) for text in texts]
        self.util.logDebug('LDA', 'Corpus built in ' + self.util.stopTimeTrack())

    def trainModel(self, noOfTopics=4, dstFilename=None):
        workers = 30
        iterations = 400
        passes = 20
        self.util.logDebug('LDA', 'Training model...')
        self.model = LdaMulticore(self.corpus,
                                  workers=workers,
                                  num_topics=noOfTopics,
                                  id2word=self.dictionary,
                                  eval_every=None,
                                  iterations=iterations,
                                  passes=passes)
        self.util.logDebug('LDA', 'Model trained in ' + self.util.stopTimeTrack())
        print(self.model.print_topics())
        self.saveModel(dstFilename)
        self.loaded = True

    def saveModel(self, filename):
        self.util.logDebug('LDA', 'Saving model to ' + filename)
        self.model.save(filename)
        self.dictionary.save(filename + '.dict')
        MmCorpus.serialize(filename + '.corpus', self.corpus)
        self.util.logDebug('LDA', 'Saved in ' + self.util.stopTimeTrack())

    def loadModel(self, filename):
        self.util.logDebug('LDA', 'Loading model from ' + filename)
        self.model = LdaMulticore.load(fname=filename)
        self.dictionary = Dictionary.load(fname=filename + '.dict')
        self.corpus = MmCorpus(filename + '.corpus')
        print(self.dictionary)
        print(self.model.print_topic(0, topn=5))
        print(self.model.print_topic(1, topn=5))
        print(self.model.print_topic(2, topn=5))
        print(self.model.print_topic(3, topn=5))
        self.loaded = True
        self.util.logDebug('LDA', 'Model loaded in ' + self.util.stopTimeTrack())
        self.labelTopics(filename)

    def getTopTopic(self, inferenceOutput):
        # Return the topic id with the highest probability.
        mostLikelyTopic, _ = max(inferenceOutput, key=lambda pair: pair[1])
        return mostLikelyTopic

    def infer_topic_proba(self, string):
        prediction = [0.0, 0.0, 0.0, 0.0]
        if self.loaded:
            bow = self.dictionary.doc2bow(self.util.tokenize(string))
            results = self.model.get_document_topics(bow)
            for result in results:
                prediction[result[0]] = result[1]
        else:
            self.util.logError('LDA', 'Model is not loaded, cannot infer')
        return np.array(prediction)

    def infer_topic(self, string):
        if not self.loaded:
            self.util.logError('LDA', 'Model is not loaded, cannot infer')
            return None
        bow = self.dictionary.doc2bow(self.util.tokenize(string))
        results = self.model.get_document_topics(bow)
        return self.getTopTopic(results)

    def visualizeLDA(self, filename):
        dictionary = Dictionary.load(filename + '.dict')
        corpus = MmCorpus(filename + '.corpus')
        lda = LdaMulticore.load(filename)
        self.util.logDebug('LDA', 'Preparing HTML ')
        ldavis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
        self.util.logDebug('LDA', 'HTML prepared in ' + self.util.stopTimeTrack())
        pyLDAvis.save_html(ldavis, filename + '.html')
        self.util.logDebug('LDA', 'HTML saved in ' + self.util.stopTimeTrack())

# lda = LDA(logfilename='/home/kah1/test.log')
# lda.loadModel('/u01/bigdata/02d_d2vModel1/CvLda4TopicModel.model')
# lda.labelTopics('/u01/bigdata/02d_d2vModel1/CvLda4TopicModel.model')
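# Training-flow sketch (paths and folder list are illustrative):
# lda = LDA(logfilename='/home/kah1/train.log')
# lda.buildCorpus(folderListOfCorpus='/data/cvs_a,/data/cvs_b', maxdocs=10000)
# lda.trainModel(noOfTopics=4, dstFilename='/u01/bigdata/02d_d2vModel1/CvLda4TopicModel.model')
# print(lda.infer_topic('bachelor of computer science java developer'))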
import gc
import os
from time import time

import ndjson
import pyLDAvis
import pyLDAvis.gensim
from gensim import corpora
from gensim.models import CoherenceModel, LdaMulticore

# make_folders is defined elsewhere in this module.


def lda_MULTICORE(texts, n_topics_range, iterations, passes, workers,
                  out_dir, verbose=True, save_doc_top=True):
    '''Fit topic models and search for optimal hyperparameters.

    Dirtier, multicore version for faster running of HOPE stuff.

    Parameters
    ----------
    texts : list
        preprocessed corpus, where texts[0] is a document
        and texts[0][0] is a token.

    n_topics_range : range of int
        range of integers to use as the number of topics
        in iterations of the topic model.

    iterations : int
        maximum number of iterations for each topic model

    passes : int
        maximum number of passes (start iterations again)
        for each topic model

    workers : int
        number of CPU workers to use

    out_dir : str
        path to a directory, where results will be saved
        (in a child directory).

    verbose : bool
        give comments about the progress?

    save_doc_top : bool
        save document-topic matrices from models?

    Exports
    -------
    out_dir/report_lines/*
        ndjson with model information
        (n topics, alpha, eta, training time, model coherence,
        per-topic coherence)

    out_dir/trained_lda/*
        gensim objects, where the model is saved.

    out_dir/plots/*
        pyLDAvis visualizations of the model

    out_dir/doctop_mats/*
        document-topic matrices as ndjson (if save_doc_top)
    '''
    # check how legit out_dir is
    make_folders(out_dir)

    # if a single model is to be fitted,
    # make sure it can be "iterated"
    if isinstance(n_topics_range, int):
        n_topics_range = [n_topics_range]

    # input texts to gensim format
    dictionary = corpora.Dictionary(texts)
    bows = [dictionary.doc2bow(tl) for tl in texts]

    # iterate
    report_list = []
    for n_top in n_topics_range:
        if verbose:
            print("{} topics".format(n_top))

        start_time = time()

        # paths for saving
        # (it's not very elegant defining the paths here,
        # after there already is a function make_folders)
        filename = str(n_top) + "T_" + 'ASM'
        report_path = os.path.join(out_dir, 'report_lines', filename + '.ndjson')
        trained_path = os.path.join(out_dir, 'trained_lda', filename + '.model')
        pyldavis_path = os.path.join(out_dir, 'plots', filename + '_pyldavis.html')
        doctop_path = os.path.join(out_dir, 'doctop_mats', filename + '_mat.ndjson')

        model = LdaMulticore(
            corpus=bows,
            num_topics=n_top,
            id2word=dictionary,
            workers=workers,
            chunksize=2000,
            passes=passes,
            batch=False,
            alpha='symmetric',
            eta=None,
            decay=0.5,
            offset=1.0,
            eval_every=10,
            iterations=iterations,
            gamma_threshold=0.001,
            random_state=None,
            minimum_probability=0.01,
            minimum_phi_value=0.01,
            per_word_topics=False,
        )

        # track time usage
        training_time = time() - start_time
        if verbose:
            print('    Time: {}'.format(training_time))

        # coherence
        coherence_model = CoherenceModel(model=model,
                                         texts=texts,
                                         corpus=bows,
                                         coherence='c_v')
        coh_score = coherence_model.get_coherence()
        coh_topics = coherence_model.get_coherence_per_topic()
        if verbose:
            print('    Coherence: {}'.format(coh_score.round(2)))

        # save priors
        alpha = model.alpha.tolist()
        eta = model.eta.tolist()

        # save report
        report = (n_top, alpha, eta, training_time, coh_score, coh_topics)
        report_list.append(report)
        with open(report_path, 'w') as f:
            ndjson.dump(report, f)

        # save model
        model.save(trained_path)

        # produce a visualization
        # it is imperative that sort_topics should never be turned on!
        vis = pyLDAvis.gensim.prepare(model, bows, dictionary, sort_topics=False)
        pyLDAvis.save_html(vis, pyldavis_path)

        # save document-topic matrix
        if save_doc_top:
            # keep minimum_probability at 0 for a complete matrix;
            # get_document_topics takes the bag-of-words documents directly
            doc_top = [
                model.get_document_topics(doc, minimum_probability=0)
                for doc in bows
            ]
            # unnest (topic, prob) tuples;
            # float() converts from np.float32, which is not JSON serializable
            doc_top_prob = [[float(prob) for _, prob in doc] for doc in doc_top]

            # save the matrix as ndjson
            with open(doctop_path, 'w') as f:
                ndjson.dump(doc_top_prob, f)

        gc.collect()

    return None
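# Hypothetical driver; `tokenised_docs` would be a list of token lists.
# lda_MULTICORE(texts=tokenised_docs,
#               n_topics_range=range(10, 51, 5),
#               iterations=400,
#               passes=10,
#               workers=4,
#               out_dir='lda_out')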