def train_lda(args):
    print "[LDA > n_topics: %d ]" % args.dim
    lda_reader = LDAReader(args.ds, max_sent=args.max_sent)
    ldazito = LdaMulticore(lda_reader, id2word=lda_reader.idx2wrd,
                           num_topics=args.dim, workers=args.workers)
    ldazito.save(args.out)
def train_lda(): """ Train the LDA model. generate_dictionary() must be called before this method. """ print("------------------") print("Training LDA model") print("------------------") # load dictionary, as generated by generate_dictionary() print("Loading dictionary...") dictionary = gensim.corpora.dictionary.Dictionary.load(cfg.LDA_DICTIONARY_FILEPATH) # generate a mapping from word id to word print("Generating id2word...") id2word = {} for word in dictionary.token2id: id2word[dictionary.token2id[word]] = word # initialize LDA print("Initializing LDA...") lda_model = LdaMulticore(corpus=None, num_topics=cfg.LDA_COUNT_TOPICS, id2word=id2word, workers=LDA_COUNT_WORKERS, chunksize=LDA_CHUNK_SIZE) # Train the LDA model print("Training...") examples = [] update_every_n_windows = 25000 windows = load_windows(load_articles(cfg.ARTICLES_FILEPATH), cfg.LDA_WINDOW_SIZE, only_labeled_windows=True) for i, window in enumerate(windows): tokens_str = [token.word.lower() for token in window.tokens] bow = dictionary.doc2bow(tokens_str) # each window as bag of words examples.append(bow) if len(examples) >= update_every_n_windows: print("Updating (at window %d of max %d)..." % (i, COUNT_EXAMPLES_FOR_LDA)) # this is where the LDA model is trained lda_model.update(examples) examples = [] if i >= COUNT_EXAMPLES_FOR_LDA: print("Reached max of %d windows." % (COUNT_EXAMPLES_FOR_LDA,)) break # i don't update here with the remainder of windows, because im not sure if each update step's # results are heavily influenced/skewed by the the number of examples #if len(examples) > 0: # print("Updating with remaining windows...") # lda_model.update(examples) # save trained model to HDD print("Saving...") lda_model.save(cfg.LDA_MODEL_FILEPATH)
class LdaProcessor(object):
    def __init__(self, token_docs, **filter_extremes_args):
        """
        token_docs : a list of lists of word or n-gram or sentence tokens.
            Eg, [['the','crazy','cat'],['that','doggone','dog']]
        """
        self.token_docs = token_docs
        self.id2word = corpora.Dictionary(token_docs)
        if filter_extremes_args:
            print 'filtering words with extreme frequencies'
            self.id2word.filter_extremes(**filter_extremes_args)
        # initialize the bow_corpus
        self.reset_bow_corpus(token_docs)
        print 'Got %i total tokens (words)' % len(self.id2word)

    def reset_bow_corpus(self, documents):
        """set or reset the corpus with the given documents"""
        self.bow_corpus = [self.id2word.doc2bow(doc) for doc in documents]
        return None

    def train_lda(self, num_topics, **kwargs):
        print 'training LDA...'
        self.lda = LdaMulticore(self.bow_corpus, id2word=self.id2word,
                                num_topics=num_topics, **kwargs)
        return self

    def word_topics(self, num_words=10):
        return [topic[1] for topic in
                self.lda.print_topics(num_topics=self.lda.num_topics, num_words=num_words)]

    # utility functions
    def significant_topic_terms(self, topicid):
        raise NotImplementedError()
def test_lda(sentence):
    """Tests the trained LDA model on an example sentence, i.e. returns the topics of
    that sentence. May only be called after train_lda().

    Args:
        sentence: A sentence to test on as string.
    """
    # validate and process the sentence
    if sentence is None or len(sentence) < 1:
        raise Exception("Missing or empty 'sentence' argument.")
    sentence = sentence.decode("utf-8").lower().strip().split(" ")
    if len(sentence) != cfg.LDA_WINDOW_SIZE:
        print("[INFO] the token size of your sentence does not match the defined window "
              "size (%d vs %d)." % (len(sentence), cfg.LDA_WINDOW_SIZE))

    # load dictionary and trained model
    dictionary = gensim.corpora.dictionary.Dictionary.load(cfg.LDA_DICTIONARY_FILEPATH)
    lda_model = LdaMulticore.load(cfg.LDA_MODEL_FILEPATH)

    # sentence to bag of words
    bow = dictionary.doc2bow(sentence)

    # print topics of sentence
    print(lda_model[bow])
class Lda(BaseEstimator, TransformerMixin):
    def __init__(self, id2word=None, num_topics=25, passes=1):
        self.lda = None
        self.id2word = id2word
        self.num_topics = num_topics
        self.passes = passes

    def fit(self, X, y=None):
        """
        Parameter
        ---------
        X : [sp.csr_matrix]

        Returns
        -------
        self
        """
        if self.lda is None:
            self.lda = LdaMulticore(id2word=self.id2word, num_topics=self.num_topics,
                                    passes=self.passes)
        X_flat = sp.vstack(X)
        self.lda.update(Sparse2Corpus(X_flat, documents_columns=False))
        return self

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

    def transform(self, X):
        """
        Parameter
        ---------
        X : [sp.csr_matrix]

        Returns
        -------
        topic_vectors : [np.ndarray]
            each matrix is of shape (sent_count, topic_count)
        """
        topic_vectors = []
        for doc in X:
            sents_bow = Sparse2Corpus(doc, documents_columns=False)
            gamma, _ = self.lda.inference(sents_bow)
            # divide row by row sum
            topic_dist = (gamma.T / np.sum(gamma, axis=1)).T
            topic_vectors.append(topic_dist)
        return topic_vectors
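# A minimal usage sketch for the transformer above, assuming scikit-learn-style input: a list of
# per-document sparse term-count matrices (one row per sentence). The toy matrices and id2word
# mapping below are illustrative only, not part of the original code.
import scipy.sparse as sp

docs = [sp.csr_matrix([[1, 0, 2], [0, 3, 1]]),   # document 1: two "sentences", three terms
        sp.csr_matrix([[0, 1, 0], [2, 0, 1]])]   # document 2
id2word = {0: 'cat', 1: 'dog', 2: 'fish'}

lda_transformer = Lda(id2word=id2word, num_topics=2, passes=1)
topic_vectors = lda_transformer.fit_transform(docs)
# topic_vectors[i] has shape (sent_count_i, num_topics)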
def load(self):
    """Load previously saved LdaProcessor results."""
    try:
        return LdaMulticore.load(self.lda_out_file_name)
    except:
        return None
def perform(self, option="load"):
    """Perform LDA analysis to generate topics and the topic distribution for each app."""
    logging.info("Start Lda analysis")
    ldamodel = LdaMulticore(self.corpus, num_topics=self.ntopic,
                            id2word=self.dictionary, passes=self.iteration)
    logging.info("LDA multicore modeling done")
    ldamodel.save(self.lda_out_file_name)
    self.topics = {}
    for i in range(0, self.ntopic, 1):
        self.topics["topic{}".format(i)] = ldamodel.show_topic(i, topn=self.nword)
        logging.info("Topic{}".format(i))
        # NOTE: in current gensim, show_topic() returns (word, probability) pairs, so w[1]
        # is the probability; this indexing assumes the older (probability, word) ordering.
        words = [w[1] for w in self.topics["topic{}".format(i)]]
        logging.info(words)
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    args = parse_args()

    dictionary = corpora.Dictionary.load(os.path.join(args.prefix, 'review.dict'))

    logging.info('Pruning dictionary')
    dictionary.filter_extremes(no_below=args.no_below, no_above=args.no_above)

    corpus = ReviewCorpus(os.path.join(args.prefix, 'review.json'), dictionary)

    logging.info('Computing LDA model')
    lda = LdaMulticore(corpus, num_topics=args.num_topics, id2word=dictionary,
                       workers=args.workers)

    logging.info('Persisting LDA model')
    lda.save(os.path.join(args.prefix, 'review.ldamodel'))
def build_model(self, fname=None, save_to=None):
    id2word = self.id2word or self.build_id2word()
    corpus = self.corpus or self.build_corpus()
    # read model.lda file
    if not fname:
        fname = click.prompt('model file name', type=str, default='model.lda')
    fname = self.__dest(fname)
    # if there is no model file or the user wants to rebuild, build .model
    if not os.path.isfile(fname) or click.confirm('There already is %s. Do you want to re run lda?' % fname):
        num_procs = click.prompt('Number of processes to launch', type=int,
                                 default=multiprocessing.cpu_count())
        num_epochs = click.prompt('Number of epochs to run', type=int, default=20)
        num_topics = click.prompt('Number of topics', type=int, default=100)
        print 'start building model'
        start = time()
        model = LdaMulticore(corpus, id2word=id2word, num_topics=num_topics,
                             workers=num_procs, passes=num_epochs)
        model.save(fname)  # save
        print 'building model takes: %s' % LdaUtils.human_readable_time(time() - start)
    self.model = LdaMulticore.load(fname)
    return self.model
def __init__(self, lda_filepath, dictionary_filepath, cache_filepath=None):
    """Initialize the LDA wrapper.

    Args:
        lda_filepath: Filepath to the trained LDA model.
        dictionary_filepath: Filepath to the dictionary of the LDA.
        cache_filepath: Optional filepath to a shelve cache for the LDA results.
    """
    self.lda = LdaMulticore.load(lda_filepath)
    self.dictionary = gensim.corpora.dictionary.Dictionary.load(dictionary_filepath)
    self.cache_synch_prob = 2  # in percent, 1 to 100
    self.cache_filepath = cache_filepath
    self.cache = shelve.open(cache_filepath) if cache_filepath is not None else None
def fit_numtopics(train_corpus, test_corpus, id2word, num_topics_list, iters,
                  workers, chunksize, logfilename, save=True):
    """
    Args:
        num_topics_list = list of number of topics, a model will be fitted for each
        save: indicates whether model should be saved
    Returns:
        topics_dict = a dictionary of topics lists, where the key is the number of topics
    """
    topics_dict = {}
    logfile = open(logfilename, 'w')
    for num_topics in num_topics_list:
        print('training', num_topics)
        np.random.seed(NUM)
        start_time = time.time()
        model = LdaMulticore(corpus=train_corpus, id2word=id2word,
                             num_topics=num_topics, iterations=iters,
                             eval_every=None, workers=workers, chunksize=chunksize)
        end_time = time.time()

        if save:
            fname = 'data\\orig_' + str(num_topics) + 'topics.lda'
            model.save(fname)

        per_word_bound = model.log_perplexity(test_corpus)
        perplexity = np.exp2(-1.0 * per_word_bound)

        logfile.write('\n' + 'num_topics: ' + str(num_topics) + '\n')
        logfile.write('perplexity: ' + str(perplexity) + '\n')
        logfile.write('train_time: ' + str(end_time - start_time) + '\n' + 'Topics: \n')

        topics = model.show_topics(num_topics=num_topics, num_words=20)
        topics_dict[str(num_topics)] = topics
        for topic in topics:
            logfile.write('\n\t' + topic.encode('ascii', 'ignore') + '\n')

    logfile.close()
    return topics_dict
def train(self, comments):
    """
    Build the topic model from a list of documents (strings).
    Assumes documents have been pre-processed (e.g. stripped of HTML, etc).
    """
    docs = [c.body for c in comments]
    vecs = self.vectr.vectorize(docs, train=True)
    corp = Scipy2Corpus(vecs)
    self.m = LdaMulticore(corp, num_topics=self.n_topics, iterations=1000, workers=3)

    if self.verbose:
        self.print_topics()
def build_lda_model(self, topics: int = 20):
    ignore_words = [
        'like', 'know', 'f**k', 'f*****g', 'want', 'shit', 'know', 'sure', 'isn',
        'CHANBOARD', 'think', 'people', 'good', 'time', 'going', 'WEBLINK',
        'got', 'way', ''
    ]

    filename = op.join(self.input_dir, f'{self.board}.dictionary')
    dictionary: Dictionary = Dictionary.load(filename)

    documents = ReadThreads(
        self.board, input_dir=self.input_dir, file_type='phrases',
        return_func=lambda x, y: dictionary.doc2bow(
            [w for w in y.split() if w not in ignore_words]
        )
    )

    lda = LdaMulticore(documents, id2word=dictionary, num_topics=topics, iterations=2)

    filename = op.join(self.input_dir, f'{self.board}.lda')
    lda.save(filename)

    return lda
def show_topics():
    """Shows all topics of the trained LDA model. May only be called after train_lda()."""
    # load trained model
    lda_model = LdaMulticore.load(cfg.LDA_MODEL_FILEPATH)

    # list the topics
    topics = lda_model.show_topics(num_topics=cfg.LDA_COUNT_TOPICS, num_words=10,
                                   log=False, formatted=True)
    print("List of topics:")
    for i, topic in enumerate(topics):
        # not adding topic to the tuple here prevents unicode errors
        print("%3d:" % (i,), topic)
def fit(self, X, y=None):
    """
    Parameter
    ---------
    X : [sp.csr_matrix]

    Returns
    -------
    self
    """
    if self.lda is None:
        self.lda = LdaMulticore(id2word=self.id2word, num_topics=self.num_topics,
                                passes=self.passes)
    X_flat = sp.vstack(X)
    self.lda.update(Sparse2Corpus(X_flat, documents_columns=False))
    return self
def main():
    options = {
        'corpus_file': 'data\\origtweets_dtm.pkl',
        'id_file': 'data\\row_origtweets.csv',
        'model_file': 'data\\orig_10topics.lda',
        'meta_file': 'data\\origtweets_meta.csv',
        'output_file': 'data\\origtweets_topics.csv'
    }

    start_time = time.time()

    id_df = pd.read_csv(options['id_file'], usecols=['row'], dtype='float')
    meta_df = pd.read_csv(options['meta_file'])
    with open(options['corpus_file']) as corpus_file:
        corpus = pickle.load(corpus_file)
    lda = LdaMulticore.load(options['model_file'])

    if len(meta_df) != len(corpus):
        print ('Warning: Some documents may have been deleted during processing.\n')
        print ('metadata size - corpus size = ' + str(len(meta_df) - len(corpus)))

    topic_features = [to_dense(lda[bow], lda.num_topics) for bow in corpus]
    topic_colname = 'topic{0}'.format
    topic_colnames = [topic_colname(t+1) for t in xrange(lda.num_topics)]
    topic_df = pd.DataFrame.from_records(topic_features, columns=topic_colnames)

    with open('data\\topic_df.pkl', 'wb') as pkl_file:
        pickle.dump(topic_df, pkl_file)

    print ('topic size - id size = ' + str(len(id_df) - len(topic_df)))
    if len(id_df) != len(topic_df):
        raise Exception()

    topic_df = pd.concat([id_df, topic_df], axis=1)
    merged_df = pd.merge(meta_df, topic_df, on='row', how='right', sort=False)
    merged_df.to_csv(options['output_file'], index=False)

    end_time = time.time()
    print ('running time: ' + str((end_time - start_time)/60) + ' minutes')
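# to_dense() is referenced above but not shown; a minimal sketch of such a helper, assuming it
# maps gensim's sparse (topic_id, probability) pairs onto a fixed-length dense vector.
def to_dense(topic_pairs, num_topics):
    """Convert a sparse topic distribution into a dense list of probabilities."""
    dense = [0.0] * num_topics
    for topic_id, prob in topic_pairs:
        dense[topic_id] = prob
    return dense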
class LDA(Pipe):
    """
    LDA (Latent Dirichlet Allocation) model for unsupervised topic modeling.

    Takes vectors and returns topic vectors, which can be used for clustering.
    """
    input = Pipe.type.vecs
    output = Pipe.type.vecs

    def __init__(self, n_topics=5):
        self.n_topics = n_topics
        self.trained = False

    def __call__(self, vecs):
        """
        Return topic vectors.
        """
        if not self.trained:
            self.train(vecs)
            self.trained = True

        distribs = []
        for distrib in self.m[Scipy2Corpus(vecs)]:
            distribs.append([t[1] for t in distrib])
        distribs = np.array(distribs)
        return distribs

    def train(self, vecs):
        """
        Build the topic model.
        """
        corp = Scipy2Corpus(vecs)
        self.m = LdaMulticore(corp, num_topics=self.n_topics, iterations=1000, workers=3)

    def print_topics(self, vectorizer):
        vocab = vectorizer.vocabulary
        for topic in self.m.show_topics(num_topics=self.n_topics, num_words=10, formatted=False):
            print([vocab[int(ix)] for prob, ix in topic])
def get_topics():
    '''Computes distribution over topics for each abstract'''
    dictionary = Dictionary.load('lda.dict')
    lda = LdaMulticore.load('lda.gensim')
    base = 'datasets/dspace'
    new_base = 'datasets/dspace_topics'
    for filename in tqdm(os.listdir(base)):
        path = os.path.join(base, filename)
        with open(path, 'r') as f:
            d = json.load(f)
            abstract = d['abstract']
            if abstract is not None:
                words = tokenize(abstract.split())
                bow = dictionary.doc2bow(words)
                topics = lda.get_document_topics(bow, minimum_probability=0)
                topics = to_vec(topics)
                d['topics'] = topics
                new_path = os.path.join(new_base, filename)
                with open(new_path, 'w') as new_f:
                    json.dump(d, new_f)
#==============================================================================
# Train LDA model -- takes 1655 seconds to train the model
#==============================================================================
# No need to run LDA every time; the model has been stored.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(res)
vocab = vectorizer.get_feature_names()

# single LDA
topic_number = 3
start_time = time.time()
model = LdaMulticore(
    matutils.Sparse2Corpus(X, documents_columns=False),
    num_topics=topic_number,
    passes=10,
    chunksize=5000,
    id2word=dict([(i, s) for i, s in enumerate(vocab)]),
    workers=7,
)
print("--- %s seconds ---" % (time.time() - start_time))

fname = folder_name + 'LDA' + str(topic_number) + 'topics'
model.save(fname)

# Load a pretrained model
model = LdaModel.load(fname, mmap='r')
type(model)

# perplexity
perplexity = model.log_perplexity(matutils.Sparse2Corpus(X, documents_columns=False), total_docs=None)
def learn(corpus):
    dictionary = Dictionary.load('lda.dict')
    lda = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=NUM_TOPICS,
                       chunksize=10000, passes=5)
    for line in lda.print_topics(NUM_TOPICS):
        print line
    lda.save('lda.gensim')
def generate_embedings(self, method="tf-idf", tag=None, tag_column=None, return_model=False):
    # Collect the embedding data and save it to a file for multiprocessing
    if tag != None and tag_column != None:
        if (tag_column not in self.df.columns):
            raise ValueError(f"Tag {tag_column} not found in dataset")
        elif tag not in self.df[tag_column].to_list():
            raise ValueError(f"Tag {tag} not found in dataset column {tag_column}")
        texts = self.df[self.df[tag_column] == tag][self.text_column]
    else:
        texts = self.df[self.text_column]

    with open('storage/texts.txt', 'w', encoding='utf8') as file:
        for sentence in texts:
            file.write(" ".join([tok for tok in sentence]) + "\n")

    # Check whether the user made a mistake in the tag inputs
    if tag != None and tag_column == None:
        raise ValueError("if passing tag must pass tag_column as well")
    if tag_column != None and tag == None:
        raise ValueError("if passing tag_column must pass tag as well")

    # Check whether the vectors were already generated and the target is the whole corpus
    if method in self.embedings and tag == None:
        if return_model:
            return self.embedings[method]
        else:
            return self.embedings[method][0]

    # Compute TF-IDF
    if method == "tf-idf":
        model = TfidfVectorizer(min_df=5, max_df=0.9, max_features=5000,
                                sublinear_tf=False, analyzer=lambda x: x)
        vectors = model.fit_transform(texts)

    # Compute Word2Vec
    elif method == "word2vec" or method == "cbow":
        model = gensim.models.Word2Vec(corpus_file='storage/texts.txt', window=5,
                                       size=200, min_count=5, iter=100, workers=4)
        vectors = model.wv
        if tag == None:
            self.embedings["word2vec"] = vectors
        # Compute CBOW
        if method == "cbow":
            vectors = []
            for text in texts:
                vec = np.zeros(model.wv.vector_size)
                for word in text:
                    if word in model.wv.vocab:
                        vec += model.wv.get_vector(word)
                norm = np.linalg.norm(vec)
                if norm > np.finfo(float).eps:
                    vec /= norm
                vectors.append(vec)
            vectors = scipy.sparse.csr.csr_matrix(vectors)

    # Compute Doc2Vec
    elif method == "doc2vec":
        model = gensim.models.Doc2Vec(corpus_file='storage/texts.txt', vector_size=200,
                                      window=5, min_count=5, workers=12, epochs=100)
        vectors = scipy.sparse.csr.csr_matrix(model.docvecs.vectors_docs)

    # Compute LDA
    elif "lda" in method:
        if "_" in method:
            NUM_TOPICS = int(method.split("_")[-1])
        else:
            NUM_TOPICS = 20
        dictionary = Dictionary(texts)
        doc2bow = [dictionary.doc2bow(text) for text in texts]
        ldamodel = LdaMulticore(doc2bow, num_topics=NUM_TOPICS, id2word=dictionary, passes=30)
        raw_vecs = [ldamodel.get_document_topics(text) for text in doc2bow]
        lda_vecs = []
        for vec in raw_vecs:
            this_vec = []
            curr = 0
            for i in range(ldamodel.num_topics):
                if (i == vec[curr][0]):
                    this_vec.append(vec[curr][1])
                    curr += 1
                    if curr == len(vec):
                        curr = -1
                else:
                    this_vec.append(0)
            lda_vecs.append(this_vec)
        vectors = scipy.sparse.csr.csr_matrix(lda_vecs)
        model = [ldamodel, doc2bow, dictionary]

    else:
        raise ValueError(f"Method {method} is not recognized")

    # If not running a tag-specific version, cache the results
    if tag == None and not self.low_memory:
        self.embedings[method] = (vectors, model)

    if return_model:
        return vectors, model
    else:
        return vectors
# with open(io_file) as t:
#     print(t.read())

# io_file.seek(0)
# print(io_file.read())
# print(os.path.getsize(io_file))
# io_file.close()

# test
import uuid

model_guid = uuid.uuid4()
trained_model = LdaMulticore.load(model_dir)
# save_model(engine, 'model', 'repository', trained_model, model_guid, 'LDA_model', 'AKA - Requests Topic finder')
lda_model = get_model(engine, 'model', 'repository', '55644D1D-D187-4347-8DCD-C94A67F5D5A5')
print(lda_model)
print('COMPLETE')


def test():
    import pickle
    # from sqlalchemy.dialects.mssql import BINARY
    ## Create a semi-complex list to pickle
class Model():
    """
    LDA (Latent Dirichlet Allocation) model for unsupervised topic modeling.

    TO DO:
        - this model has to be rebuilt for each comment section as new comments come in -
          what's the best way to manage that?

    Notes:
        - tried LDA on individual sentences, doesn't work as well.
    """
    def __init__(self, n_topics=5, verbose=False):
        self.verbose = verbose
        self.n_topics = n_topics
        self.vectr = Vectorizer()

    def train(self, comments):
        """
        Build the topic model from a list of documents (strings).
        Assumes documents have been pre-processed (e.g. stripped of HTML, etc).
        """
        docs = [c.body for c in comments]
        vecs = self.vectr.vectorize(docs, train=True)
        corp = Scipy2Corpus(vecs)
        self.m = LdaMulticore(corp, num_topics=self.n_topics, iterations=1000, workers=3)

        if self.verbose:
            self.print_topics()

    def featurize(self, docs):
        """
        Return topic vectors for documents.
        """
        vecs = self.vectr.vectorize(docs)
        dists = []
        for dist in self.m[Scipy2Corpus(vecs)]:
            dists.append([t[1] for t in dist])
        dists = np.array(dists)
        return dists

    def cluster(self, comments):
        """
        Build clusters out of most likely topics.
        """
        # If no model exists, train it.
        if not hasattr(self, 'm'):
            self.train(comments)

        clusters = [[] for _ in range(self.n_topics)]
        dists = self.featurize([c.body for c in comments])
        for i, comment in enumerate(comments):
            topic = dists[i].argmax()
            clusters[topic].append(comment)
        return clusters

    def identify(self, docs):
        """
        Labels a list of documents with their topic and probability for that topic.
        """
        vecs = self.vectr.vectorize(docs)
        dists = self.featurize(docs)
        for i, doc in enumerate(docs):
            topic = dists[i].argmax()
            proba = dists[i][topic]
            yield doc, topic, proba

    def print_topics(self):
        vocab = self.vectr.vocabulary
        for topic in self.m.show_topics(num_topics=self.n_topics, num_words=10, formatted=False):
            print([vocab[int(ix)] for prob, ix in topic])
def train_lda(dataset_path: str = None,
              print_stats: bool = True,
              save_pp_file: bool = True,
              pp_file_path: str = None,
              run_preprocess: bool = True):
    """ Train search engine """
    if dataset_path is None:
        dataset_path = DATA_PATH
    if pp_file_path is None:
        pp_file_path = PP_DATA_PATH
    if not os.path.isfile(pp_file_path):
        run_preprocess = True

    df = pd.read_csv(dataset_path)
    df.rename({"class": "class_", "name": "sentencia"}, axis=1, inplace=True)
    df = df[[isinstance(x, str) for x in df.text.values]]

    # ## **Create preprocessed corpus for FastText**
    # #### **Text preprocessing (for TF-IDF)**
    if run_preprocess:
        logging.info("Preprocessing...")
        with Pool(POOL_SIZE) as my_pool:
            pp_list = my_pool.map(preprocessor_sentences, df.text.values)
        df["pp"] = pp_list
        if save_pp_file:
            df.to_csv(pp_file_path)

    # df["class_"] = pd.Categorical(df["class_"])
    # df.to_csv("data/cc_dump_full_pp_20210508.csv")
    # df = pd.read_csv("data/cc_dump_full_pp_20210508.csv")

    df[df.text.isnull()]
    df[df.pp == ""]
    df = df[df.text.notnull()].copy()
    df[~np.array([isinstance(x, str) for x in df.pp.values])]
    df = df[[isinstance(x, str) for x in df.pp.values]].copy()

    doc_list = df.sentencia.unique().tolist()
    doc_sample = random.sample(doc_list, k=int(len(doc_list) * LDA_SAMPLE_SIZE))
    df_sample = df[df.sentencia.isin(doc_sample)].copy()
    logging.info(f"Sample size: {len(df_sample)} elements (paragraphs)")

    # FREE MEMORY!
    df = None

    # df_sample["lemma_stop"] = df_sample["pp"]
    df_sample = df_sample[[isinstance(a, str) for a in df_sample.pp.values]]
    df_sample["pp"] = [
        doc.replace("\n", " ").strip() for doc in df_sample.pp.values
    ]

    # create tmp corpus file for yielding
    df_sample.pp.to_csv(LDA_CORPUS_PATH, index=False, header=False)

    # Build dict
    cc_dict = corpora.Dictionary(line.lower().strip().split()
                                 for line in open(LDA_CORPUS_PATH))
    logging.info("Dictionary initial size: {}".format(len(cc_dict)))

    # Filter common words in paragraphs and docs
    # paragraph level
    cc_dict.filter_extremes(no_above=0.5, keep_n=200000)
    cc_dict.compactify()  # remove gaps in id sequence after words that were removed

    # document level
    doc_dict = {
        doc: ' '.join(df_sample[df_sample.sentencia == doc]["pp"].values)
        for doc in df_sample.sentencia.unique()
    }
    doc_word_dict = {d: list(set(doc_dict[d].strip().split())) for d in doc_dict}
    doc_df = pd.DataFrame([{
        "name": doc,
        "words": ' '.join(doc_word_dict[doc])
    } for doc in doc_word_dict])
    cv = CountVectorizer(max_df=0.5)
    cv.fit(doc_df.words)
    logging.info(f"Document-level stopwords: {len(cv.stop_words_)}")
    stop_ids = [cc_dict.token2id[w] for w in cv.stop_words_]
    cc_dict.filter_tokens(bad_ids=stop_ids)
    cc_dict.compactify()

    # short words
    word_set = list(set([w for doc in doc_word_dict for w in doc_word_dict[doc]]))
    word_len = [(w, len(w)) for w in word_set]
    short_words = [word for word, length in word_len if length < 3]
    logging.info(f"Short words to remove: {len(short_words)}")
    short_words = [w for w in short_words if w in cc_dict.token2id.keys()]
    short_ids = [cc_dict.token2id[w] for w in short_words]
    cc_dict.filter_tokens(bad_ids=short_ids)
    cc_dict.compactify()

    # Train
    light_corpus = CorpusIterator(LDA_CORPUS_PATH, cc_dict)
    tfidf = models.TfidfModel(light_corpus)
    corpus_tfidf = TfidfCorpus(light_corpus, tfidf)

    # Train the model on the corpus.
    lda = LdaMulticore(corpus_tfidf,
                       id2word=cc_dict,
                       num_topics=LDA_NUM_TOPICS,
                       workers=LDA_WORKERS,
                       per_word_topics=True,
                       random_state=RANDOM_STATE,
                       passes=1)

    # temp_file = datapath("model")
    cc_dict.save_as_text(LDA_DICT_PATH)
    tfidf.save(LDA_TFIDF_PATH)
    lda.save(LDA_MODEL_PATH)
def train_lda(self, num_topics, **kwargs):
    print 'training LDA...'
    self.lda = LdaMulticore(self.bow_corpus, id2word=self.id2word,
                            num_topics=num_topics, **kwargs)
    return self
# load cleaned corpus
with open('data/cleaned_corpus_broad.pkl', 'rb') as f:
    corpus = pkl.load(f)

with open("data/id2word_broad.pkl", 'rb') as f:
    id2word = pkl.load(f)

# Choose the number of topics
nTopics = 40

# Train the LDA model with a prespecified number of topics
lda_model = LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=nTopics,
    random_state=100,
    chunksize=200,
    passes=5000,
    # iterations=10000,
    # minimum_probability=0,
    per_word_topics=True)

# Save the trained LDA model
lda_model.save(f"trained_models/trained_lda_model_new_{lda_model.num_topics}")

# Run the model
doc_lda = lda_model[corpus]

# Extract the topic distributions for each paper as numpy array
hm = np.zeros([len(corpus), lda_model.num_topics])
for i in range(len(doc_lda)):
    for topic_pair in doc_lda[i][0]:
if __name__ == "__main__": # # GENSIM TOPIC APPROACH # dictionary = Dictionary(token_stream(NOVELS_DIRPATH)) dictionary.filter_extremes(no_below=10, no_above=0.66) # excludes terms like "the", "to", "and", "of", "i", etc. print("-------------") print("TOKENS", len(dictionary.token2id), list(dictionary.token2id.items())[0:4], "...") bags_of_words = [dictionary.doc2bow(tokens) for tokens in token_stream(NOVELS_DIRPATH)] print("-------------") print("BAGS OF WORDS (CORPUS)", len(bags_of_words), bags_of_words[0]) lda = LdaMulticore(corpus=bags_of_words, id2word=dictionary, random_state=723812, num_topics=15, passes=10, workers=4) print("-------------") print("LDA MODEL", type(lda)) results = lda.print_topics() print("-------------") print("TOPICS (RAW RESULTS)...") print(results) parsed_topics = parse_topics(lda) print("-------------") print("TOPICS (PARSED RESULTS)...") pprint(parsed_topics) # h/t: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#11createthedictionaryandcorpusneededfortopicmodeling topics = lda[bags_of_words]
if MODE == 'distributed':
    lda = LdaModel(corpus=corpus, num_topics=i, id2word=dictionary,
                   distributed=False, update_every=1, chunksize=args.chunksize,
                   passes=1, iterations=args.iterations, random_state=args.seed,
                   eval_every=None)
elif MODE == 'multicore':
    lda = LdaMulticore(corpus=corpus, num_topics=i, id2word=dictionary,
                       chunksize=args.chunksize, workers=3, passes=1,
                       iterations=args.iterations, random_state=args.seed)
lda_lst.append(lda)

# Model Evaluation
cm = CoherenceModel(model=lda, texts=texts, corpus=corpus, dictionary=dictionary,
                    coherence='c_v')
coherence = cm.get_coherence()
coherence_lst.append(coherence)
wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
wiki.save(outp + '_corpus.pkl.bz2')

# load back the id->word mapping directly from file
# this seems to save more memory, compared to keeping the wiki.dictionary object from above
dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

# build tfidf
if os.path.exists(outp + '_tfidf.mm'):
    mm = gensim.corpora.MmCorpus(outp + '_tfidf.mm')
else:
    tfidf = TfidfModel(wiki, id2word=dictionary, normalize=True)
    #tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    mm = tfidf[wiki]
    MmCorpus.serialize(outp + '_tfidf.mm', mm, progress_cnt=10000)

logger.info("finished pre-processing, starting LDA %s", program)

lda = LdaMulticore(mm, id2word=dictionary, workers=10, num_topics=ntopics)
lda.save(model_name)

topics = lda.show_topics(num_topics=ntopics, num_words=30)
print(topics)
logger.info("finished LDA %s", program)

toptopics = lda.top_topics(corpus=wiki, dictionary=lda.id2word, coherence='u_mass')
logger.info("top topics: %s", 'u_mass')
print(toptopics)
def main():
    df = read_forum_json('json/levergunscommunity.com.json')
    corpus, dictionary = generate_corpus(df)
    lda = LdaMulticore(corpus, num_topics=20, id2word=dictionary, workers=3)
    lda.print_topics(num_topics=20, num_words=20)
def LDA_topics(corpus, dictionary, num_topics):
    lda_model = LdaMulticore(corpus,
                             id2word=dictionary,
                             num_topics=num_topics,
                             passes=10)
    return lda_model
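# A minimal usage sketch for LDA_topics(); the token lists below are illustrative only.
from gensim.corpora import Dictionary

docs = [['cats', 'purr', 'softly'], ['dogs', 'bark', 'loudly'], ['cats', 'and', 'dogs']]
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
model = LDA_topics(corpus, dictionary, num_topics=2)
print(model.print_topics(num_topics=2, num_words=3))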
print('------>', p[0])

print('3 random reviews with the highest Negative sentiment polarity: \n')
neg = carReviews.loc[carReviews.Vader_Rating <= 2.5, ['EntireReview']].sample(3).values
for n in neg:
    print('------>', n[0])

# LDA Topic Modelling

# Approach 1
reviews = carReviews["ReviewTokens"]
dictionary = corpora.Dictionary(reviews)

# Term document frequency
doc_term_matrix = [dictionary.doc2bow(rev) for rev in reviews]

# perform LDA
ldamodel = LdaMulticore(corpus=doc_term_matrix, num_topics=8, id2word=dictionary,
                        chunksize=2000, passes=20, per_word_topics=True)

# get highlighted topics
topics = ldamodel.show_topics()
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False)

# show HTML view
pyLDAvis.save_html(lda_display, open("lda_8_topics.html", "w"))

pprint(ldamodel.show_topics(formatted=False))

# Calculate coherence score
def compute_coherence_score(lda_model, reviews):
    coherence = CoherenceModel(lda_model, texts=reviews, dictionary=dictionary, coherence="c_v")
    return coherence.get_coherence(), coherence.get_coherence_per_topic()
class LdaPipeline(Pipeline):
    """Pipeline for creating and updating a gensim LDA model.

    This is a data sink; it does not return any new data.
    """

    def __init__(self, num_topics, *args, **kwargs):
        """Initialises the LDA pipeline

        Args:
            num_topics (int): The number of topics in the LDA model
            workers (int): The number of workers to use for training.
                Defaults to num_cores - 1
        """
        super().__init__(*args, **kwargs)
        self._num_topics = num_topics
        self._workers = get_workers()
        # This is only used for lazy loading. Use self.get_model() to ensure it
        # is not None. And to create a new model if one does not exist.
        self._model = None

    @property
    def file_path(self):
        """str: the name of the model's file.

        It is of the form lda-num-topics.model
        """
        file_name = "lda-{num_topics}.model".format(num_topics=self._num_topics)
        return get_training_file_path(file_name)

    async def get_model(self):
        """This function is used to get an instance of an LdaModel. It will load
        the model from file if it finds one, otherwise it will create a new one
        using a saved dictionary if one exists. Lastly it will create a new
        dictionary from the corpus if it finds no pre-saved dictionary to use and
        no pre-saved LDA model to load.

        Returns:
            :obj:`gensim.models.ldamodel.LdaModel`: A gensim LdaModel
        """
        self._model = self._model or self._load_model()
        if not self._model:
            print("No previous model found. Creating a new one for training")
            dictionary = await DictionaryPipeline().get_dictionary()
            self._model = LdaMulticore(id2word=dictionary, workers=self._workers)
        return self._model

    def _load_model(self):
        """This function is used to load a gensim LdaModel from the models folder,
        or `None` if one does not exist.

        Returns:
            :obj:`gensim.models.ldamodel.LdaModel`: The model found in
                ucla_topic_analysis/model/lda.model or None if there was no lda
                model saved or the number of topics does not match.
        """
        if os.path.isfile(self.file_path):
            return LdaMulticore.load(self.file_path)
        return None

    def save_model(self, file_path=None):
        """Saves the updated model to file"""
        if self._model is not None:
            path = file_path or self.file_path
            self._model.save(path)
        else:
            raise Exception("Can not save. No model has been loaded.")

    @log_time
    def get_log_perplexity(self, mode):
        """Used to get the log perplexity for the LDA model.

        Args:
            mode (str): The label associated with the files for which to
                calculate the log perplexity.

        Returns:
            int: The log perplexity for the LDA model
        """
        # Get corpus
        corpus = LdaCorpusPipeline(mode=mode)
        # Make sure corpus data has been prepared
        if not os.path.isfile(corpus.get_file_path()):
            raise Exception("No corpus has been prepared.")
        model = self._model or self._load_model()
        if not model:
            raise Exception("No saved model found. Please train one first.")
        return model.log_perplexity(corpus)

    @log_async_time
    async def train(self):
        """This function trains an LDA model from the data in the corpus file.
        It will overwrite any existing model, creating a new one if one does
        not exist.
        """
        # Get the dictionary
        dictionary = await DictionaryPipeline().get_dictionary()
        # Get corpus
        corpus = LdaCorpusPipeline()
        # Make sure corpus data has been prepared
        if not os.path.isfile(corpus.get_file_path()):
            await corpus.prepare_data()
        print("Training model. This might take some time")
        model = LdaMulticore(corpus=corpus, num_topics=self._num_topics,
                             id2word=dictionary, workers=self._workers)
        self._model = model
        self.save_model()

    async def coroutine(self, data):
        """Updates the model with the documents in the data.

        This is a data sink; it does not return any new data.

        Args:
            data (:obj:`list` of :obj:`list` of :obj:`(int, int)`): A list of
                documents, in bag of words representation
        """
        model = await self.get_model()
        model.update(data)
MmCorpus.serialize(trigram_bow_filepath,
                   trigram_bow_generator(trigram_records_filepath))

# load the finished bag-of-words corpus from disk
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

lda_model_filepath = os.path.join('.', 'lda_model_all_diags')

if rerun:
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=50,
                           id2word=trigram_dictionary,
                           workers=3)
        lda.save(lda_model_filepath)

# load the finished LDA model from disk
lda = LdaMulticore.load(lda_model_filepath)


def explore_topic(topic_number, topn=10):
    """
    accept a user-supplied topic number and
    print out a formatted list of the top terms
    """
    print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
]

# loop through input matrices of cancer types and perform LDA
for path in paths:
    project = "/".join(path.split("/")[:-1]) + "ground.truth.syn.sigs.csv"
    print("Executing LDA algorithm on " + project)
    # pd.read_csv()  # stray call in the original; it needs a filepath to do anything
    num_sigs = projectToSigs[project]
    bagOfMutations, idToChannel = matrixToBag(path)
    classification = list(pd.read_csv(path, sep="\t", usecols=[0]).iloc[:, 0])
    print("Extracting Bayes Signatures")
    ldamodel = LdaMulticore(bagOfMutations, num_topics=num_sigs, id2word=idToChannel,
                            passes=100, iterations=100, minimum_probability=0)
    print("Now Extracting Gibbs Signatures")
    ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=bagOfMutations,
                                                 num_topics=num_sigs, id2word=idToChannel,
                                                 iterations=100, topic_threshold=0.0)
    # hdpmodel = HdpModel(bagOfMutations, idToChannel, K=20, T=48)
    pickle.dump(ldamodel, open(output_path + project + '_lda_model.pickle', 'wb'))
    pickle.dump(ldamallet,
#         # if x in idmap:
#         #     return x
#         # else:
#         #     return -1

#     for idx, (doc_id, document) in enumerate(corpus.documents.items()):
#         if idx % 1000 == 0:
#             logger.info("remapping: %d documents finished" % idx)
#         # corpus.documents[doc_id] = [check_and_replace(oldid) for oldid in document]
#         corpus.documents[doc_id] = [idmap[oldid] for oldid in document if oldid in idmap]

corpus.save_tbmm_corpus(args.corpus_filename)

if args.train_lda:
    # from gensim.models.ldamodel import LdaModel
    from gensim.models.ldamulticore import LdaMulticore
    # setting metadata to False is required because of the way logperplexity code requires the
    # output of get_texts to be.
    corpus.metadata = False
    lda = LdaMulticore(workers=19, corpus=corpus, id2word=corpus.dictionary,
                       num_topics=20, eval_every=100, chunksize=100, passes=5)
    lda.print_topics(20)
    lda.save(args.corpus_filename + ".tbmm_lda.model")
start_time = time.time()
score = cosine_similarity(X[0:10], X)
print("--- %s seconds ---" % (time.time() - start_time))

# calculate similarity and output title and error
x_array = X.toarray()
len(x_array[0])
len(X)

# Train LDA model -- takes 327 seconds to train the model
start_time = time.time()
model = LdaMulticore(
    matutils.Sparse2Corpus(X, documents_columns=False),
    num_topics=7,
    passes=10,
    id2word=dict([(i, s) for i, s in enumerate(vocab)]),
    workers=7,
)
print("--- %s seconds ---" % (time.time() - start_time))

# Get all topics from training
doc_list = []
for var in matutils.Sparse2Corpus(X, documents_columns=False):
    doc_list.append(var)

topic = model.print_topics(num_topics=7, num_words=10)

fin_sum = []
for i in range(len(doc_list)):
from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus
from gensim.models import TfidfModel, LdaModel
from gensim.utils import smart_open, simple_preprocess
from gensim.corpora.wikicorpus import _extract_pages, filter_wiki
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore

wiki_corpus = MmCorpus('Wiki_Corpus.mm')  # Loading the corpus
print (".... successfully loaded the corpus")

wiki_dict = Dictionary.load('WikiDictionary200k.dict')  # Loading the dictionary
print (".... successfully loaded the dictionary")

lda = LdaMulticore(corpus=wiki_corpus, id2word=wiki_dict, num_topics=300,
                   chunksize=10000, passes=2)
print ".... successfully extracted the topics; saving the model"

lda.save('WikiLDA_300.lda')
print "finished ...."
doc_tokens = []
doc_tokens = prepare_text_for_lda(doc)
tokens.append(doc_tokens)

# Makes tokens column
df['tokens'] = tokens

id2word = Dictionary(df['tokens'])
id2word.filter_extremes(no_below=2, no_above=.99)
corpus = [id2word.doc2bow(d) for d in df['tokens']]

# Instantiating a Base LDA model
base_model = LdaMulticore(corpus=corpus, num_topics=10, id2word=id2word, workers=12, passes=5)

words = [re.findall(r'"([^"]*)"', t[1]) for t in base_model.print_topics()]
topics = [' '.join(t[0:10]) for t in words]

# Getting the topics
for id, t in enumerate(topics):
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

p = pyLDAvis.gensim.prepare(base_model, corpus, id2word)
pyLDAvis.save_html(p, 'biden_lda.html')

# the original saved `ldamodel`, which is not defined here; `base_model` is the trained model
base_model.save('biden_model.gensim')
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
import pyLDAvis
import pyLDAvis.gensim


def bow(filepath, d):
    # output bag of words representation
    for review in LineSentence(filepath):
        yield d.doc2bow(review)


fake_sent = LineSentence('fake.txt')
fake_dict = Dictionary(fake_sent)
fake_dict.filter_extremes(no_below=5, no_above=0.2)
fake_dict.compactify()
fake_dict.save('fake.dict')
fake_dict = Dictionary.load('fake.dict')

MmCorpus.serialize('fake.mm', bow('fake.txt', fake_dict))
fake_corpus = MmCorpus('fake.mm')

fake_lda = LdaMulticore(fake_corpus, num_topics=10, id2word=fake_dict, workers=2)
fake_lda.save('./fake_lda_model')
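# pyLDAvis is imported above but not used in this snippet; a minimal sketch of how the trained
# model could be visualised with it (mirroring the pyLDAvis calls used in other snippets here).
fake_vis = pyLDAvis.gensim.prepare(fake_lda, fake_corpus, fake_dict, sort_topics=False)
pyLDAvis.save_html(fake_vis, 'fake_lda.html')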
# LDA Model Training
# We want to maximize the probability of the corpus in the training set.
corpus = scope_lda_sample.bow
print(('LDA Model based on {3} dataset.\n\tSample Size: {0},\n\tTop {1} Words,\n\tNo of Topics {2}'
       .format(sample_size, len(dictionary.values()), num_topics, data_scope_name)))

LDAmodel_scope = LdaMulticore(
    corpus=corpus,  # mm,
    id2word=dictionary,
    num_topics=num_topics,
    workers=4,
    chunksize=5000,
    passes=50,
    alpha='asymmetric',
    random_state=random_state)

dictionary.save('data/model/{0}_dictionary.pkl'.format(research_scope))  # data_scope_name
LDAmodel_scope.save('data/model/{0}'.format(research_scope))  # data_scope_name

# pickle the model here and insert in SQL
LDAmodel_scope = LdaMulticore.load('data/model/{0}'.format(research_scope))  # data_scope_name

# Feature vector
df_lda_features(LDAmodel_scope, scope_lda_sample)
stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
# add tokens to list
other_texts.append(stemmed_tokens)

other_corpus = [dictionary.doc2bow(text) for text in other_texts]
# unseen_doc = other_corpus[2]
# vector = ldamodel[unseen_doc]
# print(vector)

# generate LDA model-------------------------------------------------------------------------
my_loop_num_topics = [2, 5, 8, 10, 15, 20, 25, 30, 35, 40, 45, 50, 100]
for i in my_loop_num_topics:
    my_num_topics = i
    print(my_num_topics)
    # ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=my_num_topics, id2word=dictionary, passes=20)
    myldamodel = LdaMulticore(corpus, num_topics=my_num_topics, id2word=dictionary,
                              workers=3, alpha=1e-5, eta=5e-1)
    print(myldamodel.print_topics(num_topics=my_num_topics, num_words=5))
    print(myldamodel.log_perplexity(corpus))
    print(myldamodel.log_perplexity(other_corpus))
# add tokens to list
texts.append(stemmed_tokens)

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)

# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

# generate LDA model
my_num_topics = 30
# ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=my_num_topics, id2word=dictionary, passes=20)
ldamodel = LdaMulticore(corpus, num_topics=my_num_topics, id2word=dictionary,
                        workers=3, alpha=1e-5, eta=5e-1)
print(ldamodel.print_topics(num_topics=my_num_topics, num_words=5))

print(corpus[0])
print(corpus[1])
print(corpus[2])
print(ldamodel[corpus[0]])
print(ldamodel[corpus[1]])
print(ldamodel[corpus[2]])

print(ldamodel.print_topics(20))

#----------------------------------------------------------------------
new_texts_set = [
article_dict = Dictionary.load('articles.dict')


def bow(filepath, d):
    # output bag of words representation
    for review in LineSentence(filepath):
        yield d.doc2bow(review)


# generate bag-of-words representations for all reviews and save them as a matrix
MmCorpus.serialize('articles.mm', bow('articles.txt', article_dict))

# load the finished bag-of-words corpus from disk
corpus = MmCorpus('articles.mm')

# Create LDA model
lda = LdaMulticore(corpus, num_topics=10, id2word=article_dict, workers=2)
lda.save('./lda_model')
lda = LdaMulticore.load('./lda_model')

# It's really slow when they're all together for some reason
# For real and fake dataframes
# fake_sent = LineSentence('fake.txt')
# fake_dict = Dictionary(fake_sent)
# fake_dict.filter_extremes(no_below=5, no_above=0.2)
# fake_dict.compactify()
# fake_dict.save('fake.dict')
# fake_dict = Dictionary.load('fake.dict')
#
# real_sent = LineSentence('real.txt')
# real_dict = Dictionary(real_sent)
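# A small follow-on sketch: once the model is reloaded, the topic mixture of a new document can
# be read off by converting its tokens to a bag of words (the token list here is illustrative only).
new_doc_tokens = ['economy', 'election', 'policy']
new_doc_bow = article_dict.doc2bow(new_doc_tokens)
print(lda[new_doc_bow])  # list of (topic_id, probability) pairs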
print prod_cat_name_prefix

# load and build user products
# products correspond to corpus
df_user_prods = common.load_df("../data/", user_corpus_name,
                               converters={"user_corpus": literal_eval})
df_user_prods = df_user_prods.set_index('user_id', drop=False)
user_prods = list(df_user_prods.user_corpus)

# build lda model
if __debug__:
    print "In debug mode, None-debug mode command : python -O " + __file__ + "\n\n"

print "Processing 200 users, 10 topics"
up_lda = LdaMulticore(corpus=user_prods[0:1000], id2word=id2prod, workers=3, num_topics=20)
up_lda.save('/tmp/up.lda')

loaded_up_lda = LdaModel.load('/tmp/up.lda')
loaded_up_lda.show_topics()
loaded_up_lda.get_document_topics(user_prods[0])
loaded_up_lda.get_term_topics(13176)

# get user cats
df_user_cat = get_user_cat(loaded_up_lda, df_user_prods.iloc[100:120])

# get prod cats
i = 3
cat_prods = [x[0] for x in loaded_up_lda.get_topic_terms(i, topn=20)]
df_prod_cat = get_prod_cat(loaded_up_lda, products.loc[cat_prods])
print df_prod_cat
    )
    sentence_ids, text_corpus = bow_util.filter_words(
        keys=sentence_ids,
        text_corpus=text_corpus,
        stopwords=stopwords_under,
    )
    print(f"\t- Reduced to {len(text_corpus)} documents")
    assert len(sentence_ids) == len(text_corpus)

    print("Computing topics")
    word_index = Dictionary(text_corpus)
    int_corpus = [word_index.doc2bow(t) for t in text_corpus]
    topic_model = LdaMulticore(
        corpus=int_corpus,
        id2word=word_index,
        num_topics=config.topic_model.num_topics,
        random_state=config.topic_model.random_seed,
        iterations=config.topic_model.iterations,
    )

    #####################################################
    # Store results

    print("Interpreting")
    result = qpb.TopicQueryResult()
    result.source = config.source
    result.target = config.target

    # Add path
    for p in path:
        result.path.append(p)
def main(argv): """ The main function of the script. Flags: --tokenize : Apply tokenizing and stemming. --load-tokenized : Load existing tokenized and stemmed data. --load-lda : Load existing trained LDA (with --num-topics X topics). --load-features : Load saved features (of type --topic/term/char-features or raw). --filter-extremes : Remove extreme tokens from corpus. --filter-types : Remove only MBTI tokens. --topic-features : Use topic distribution for document as feature vector. --term-features : Use term topics as feature vector, together with TF. --char-features : Use char frequency as feature vector. --num-topics X : X: int, follow after --num-topics arg with space between. --normalize : TODO: Write more here. """ setup_logging() mbti = { 'I': 'Introversion', 'E': 'Extroversion', 'N': 'Intuition', 'S': 'Sensing', 'T': 'Thinking', 'F': 'Feeling', 'J': 'Judging', 'P': 'Perceiving' } if not "--load-tokenized" in argv: data_set = get_data(argv) types = sorted(set(data_set["type"])) #create_wordclouds(train) data_set["posts"].apply(handle_delimiter) data_set["posts"] = [post.lower() for post in data_set["posts"]] if "--tokenize" in argv or "-t" in argv: if "--filter-extremes" in argv: filter_level = 'extremes' elif "--filter-types" in argv: filter_level = 'types' else: filter_level = None data_set["posts"] = tokenize_and_stem(data_set["posts"], types, create_corpus=True, filter_level=filter_level) logging.info("Saving tokenized and stemmed data.") data_set.to_csv(TOKENIZED_DATA_FILE, index=False) else: logging.info("Loading tokenized data.") data_set = load_csv(TOKENIZED_DATA_FILE) data_set["posts"] = [str(post) for post in data_set["posts"]] types = sorted(set(data_set["type"])) scoring = { 'acc': 'accuracy', 'prec_micro': 'precision_micro', 'rec_micro': 'recall_micro', 'f1_micro': 'f1_micro' } prob_scoring = {'neg_log_loss': 'neg_log_loss'} X_train, X_test, y_train, y_test = train_test_split( data_set["posts"], data_set["type"], test_size=0.3, stratify=data_set["type"], random_state=1773) print(Counter(y_train), len(y_train)) corpus = corpora.MmCorpus(CORPUS_FILE) dictionary = corpora.Dictionary.load(DICTIONARY_FILE) if "--load-lda" in argv: logging.info("Loading LDA for %s topics...", NUM_TOPICS) lda = LdaMulticore.load(LDA_FOLDER + "lda_model_{}".format(NUM_TOPICS)) else: logging.info("Generating LDA for %s topics...", NUM_TOPICS) lda = LdaMulticore(corpus, num_topics=NUM_TOPICS, id2word=dictionary, workers=3, passes=50, batch=True, iterations=500) lda.save(LDA_FOLDER + "lda_model_{}".format(NUM_TOPICS)) #print(lda.print_topics(num_topics=NUM_TOPICS, num_words=30)) if "--term-features" in argv: feature_type = "terms" elif "--topic-features" in argv: feature_type = "topics" elif "--char-features" in argv: feature_type = "chars" else: logging.warning("Default features used (TF-IDF, etc.)") feature_type = "raw" if not "--load-features" in argv: logging.info("Extracting X_train features...") X_train_features, save_name = get_features(X_train, feature_type, lda, dictionary) np.save(X_TRAIN_FOLDER + "X_train_features_{}.npy".format(save_name), X_train_features) logging.info("Extracting X_test features...") X_test_features, save_name = get_features(X_test, feature_type, lda, dictionary) np.save(X_TEST_FOLDER + "X_test_features_{}.npy".format(save_name), X_test_features) else: logging.info("Loading feature vectors...") if feature_type == "chars" or feature_type == "raw": file_ending = feature_type elif feature_type == "topics" or feature_type == "terms": file_ending = 
"{}_{}".format(feature_type, NUM_TOPICS) else: raise Exception("Feature type {} is unknown!".format(feature_type)) X_train_features = np.load( X_TRAIN_FOLDER + "X_train_features_{}.npy".format(file_ending)) X_test_features = np.load(X_TEST_FOLDER + "X_test_features_{}.npy".format(file_ending)) if "--normalize" in argv: X_train_features = normalize(X_train_features) X_test_features = normalize(X_test_features) #y_train = [mbti_type[2:4] for mbti_type in y_train] #y_test = [mbti_type[2:4] for mbti_type in y_test] if feature_type == "raw": #"extra_trees" "sgd" clf = get_model("etc", argv, X_train_features, y_train, scoring, prob_scoring) elif feature_type == "terms": clf = get_terms_classifier("logit", scoring, prob_scoring) elif feature_type == "topics": clf = get_topics_classifier("gradboost", scoring, prob_scoring) elif feature_type == "chars": clf = get_chars_classifier("linear", scoring, prob_scoring) else: raise Exception("Feature type {} is unknown".format(feature_type)) #clf.fit(X_train_features, y_train) #logging.info("Best: %s, %s, %s", clf.best_index_, clf.best_score_, clf.best_params_) #clf = get_model("lr", argv, X_train, y_train, scoring) # extra_trees sgd lr logging.info("Testing clf: %s", clf) logging.info("Test set score: %s", clf.score(X_test_features, y_test)) y_pred = clf.predict(X_test_features) logging.info( precision_recall_fscore_support(y_test, y_pred, average="micro")) print(Counter(y_pred)) logging.info(classification_report(y_test, y_pred)) my_own_data = [ "I'm eagerly waiting for the next development on Social Media Platforms: being able to like likes on your/others' Social Media posts.", "The autumn semester was beyond all of my expectations. I've learned a lot of new things and gained new friends from all over the world. A big thank you to all of you for making my time here amazing. I'll never forget you. I hope the spring semester at ETH will be as memorable as the autumn one. <URL>", "Month of finals: 6 out of 7 exams done so far with varying performance and 6 seasons of series completed. It's all about balance in life!", "Why do I always eat food while cooking? I'm always full by the time the dish is finished!", "Reliving my childhood #harrypotter #game <URL>", "Peridot loves Steven #stevenuniverse #pfefferkuchen #pepparkakor #gingerbreadcookies #selfmade #happyholidays <URL>", "Guess who's back on twitter? - this neeerd", "Java Lecture: When you feel like taking a nap.", "A bunch of friends on a friday night playin' #PropHunt! Awesome!", "Interesting day :D", "#dhopen @QuanticHyuN GG! you're the best!", "I stand by TotalBiscuit and the Terran Republic in the PlanetSide 2 Ultimate Showdown! #PS2showdown", "longing for P tutorials with @ApolloSC2. in the mean time I'll ladder against High Gold/Diamond players! Thanks to you I'm now in gold!! :D", "Loosing hard in #SC2, MMMVG and Broodlords are really hard to deal with :(", "I uploaded a @YouTube video <URL> [CoD: WaW] Nostalgia! Quick Match! 30 - 2", "I uploaded a @YouTube video <URL> BF3 Test Footage", "Thanks! Now I know! <URL> #ComputerPowerTest", "One does not simply make games without passion.", "I uploaded a @YouTube video <URL> Swetrox- - MW3 Game Clip", "I uploaded a @YouTube video <URL> Shatterhand Audio School Project", "Finally some spare time! Time for #BF3!!! :D", "I uploaded a @YouTube video <URL> Swetrox- - MW3 Game Clip", "I nominate @totalbiscuit for a Shorty Award in #gaming because he delivers entertaining top-quality gaming videos. 
<URL>", "#MW3 released a new game mode for FREE, and MIGHT release some free DLCs! What's that, #BF3? Right, you guys already do that!! :D", "Our first duet coming up soon! #LAN and #Singstar <3 The song: The Killers, When we were young!", "Land of Confusion! #Singstar", "I forgot: I also bought some APELSIN KROKANT! :D", "Bought some pizza, 1 Grape Tonic, 1 Grappo and 2 Ciders! #LAN", "We're up and running! #LAN time!! Gonna warm up with some #BF3 Wanna join? :D", "Would be awesome if I had any spare time to work on my #XNA game! Maybe do some bugfixing or animating the player? :D", "Enthusiastic about tomorrow's 18 hour LAN-Party! Gonna play soo many games! #StarCraft2 #BF3 #Sanctum being a few of 'em!", "#SC2 Time!", "Time to sleep. Tomorrow is a new day, filled with #Skyrim #StarCraft2 #XNA and #Floorball Good night! :D", "XNA Time!", "Just picked up mw3 :D" ] my_own_data = [" ".join(my_own_data)] my_own_data = [post.lower() for post in my_own_data] if feature_type != "raw": my_own_data = tokenize_and_stem(my_own_data, types) my_own_data = [" ".join(my_own_data)] logging.info("Extracting features from my own data...") my_own_data_features = get_features(my_own_data, feature_type, lda, dictionary)[0] if "--normalize" in argv: my_own_data_features = normalize(my_own_data_features) print(my_own_data_features) print(clf.predict(my_own_data_features)) try: predicted_classes = clf.predict_proba(my_own_data_features) pprint(list(zip(types, predicted_classes[0]))) except Exception: print("Predict probabilities not supported...") return 0
def train(self, vecs):
    """
    Build the topic model.
    """
    corp = Scipy2Corpus(vecs)
    self.m = LdaMulticore(corp, num_topics=self.n_topics, iterations=1000, workers=3)
# N = len(alldata)
# ii = 800000
# ff = ii + 20000
# while ff < N:
#     aa = reviewPreProcess(alldata['FullReview'][ii:ff])
#     ii = ff
#     ff = ii + 20000
#     print(ff)
# else:
#     aa = reviewPreProcess(alldata['FullReview'][ii:N])

d = reviewPreProcess(alldata['FullReview'])
# bigram_model, trigram_model, trigram_dictionary = reviewPreProcess(alldata['FullReview'])

trigram_bow_corpus, lda = LDA_Model(15)

import pickle

trigram_dictionary = Dictionary.load('./models2/trigram_dict_all.dict')
trigram_bow_corpus = MmCorpus('./models2/trigram_bow_corpus.nm')
lda = LdaMulticore.load('./models2/lda_model')

LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus, trigram_dictionary)

# Save pre-prepared pyLDAvis data to disk:
with open('./models2/ldavis_prepared', 'wb') as f:
    pickle.dump(LDAvis_prepared, f)

# load the pre-prepared pyLDAvis data from disk:
with open('./models2/ldavis_prepared', 'rb') as f:
    LDAvis_prepared = pickle.load(f)

# pyLDAvis.display(LDAvis_prepared)
pyLDAvis.save_html(LDAvis_prepared, './models2/lda.html')
# Imports required by this class (not shown in the original excerpt).
import re
import codecs
import urllib.request as ur

import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from gensim.corpora import Dictionary
from gensim.models import Phrases
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from sklearn.metrics import pairwise_distances


class recommendationsys_LDA:
    def __init__(self, ngram):
        # load the spaCy English model
        self.nlp = spacy.load('en')
        self.extrawords = ["'s", "st", "th", "’s", "-PRON-", "’", "htt", "ht", "km", "pm", "am"]

        # parse the latest emoji codes
        html = str(ur.urlopen('http://www.unicode.org/Public/emoji/5.0/emoji-data.txt').read())
        codes = list(map(lambda x: '-'.join(['\\U' + a.zfill(8) for a in x.split('..')]).encode().decode('unicode-escape'),
                         re.findall(r'(?<=\\n)[\w.]+', html)))
        self.emojiPattern = re.compile('[' + ','.join(codes) + ']', flags=re.UNICODE)

        # project_name is assumed to be defined at module level
        PROJECT_DIRECTORY = 'output/project/' + project_name
        self.f_titles = PROJECT_DIRECTORY + '/titlesLF_target.txt'
        self.f_authors = PROJECT_DIRECTORY + '/authors_target.txt'

        self.authorcontent_clean = {}
        self.ngram_bow_corpus = []
        self.ldavec = {}
        self.ngram_dictionary = None
        self.ngram = ngram
        self.num_topics = None

    def clean_text(self, text):
        # lowercase, replace '\n' with '.', and strip mentions, hashtags, urls and emojis
        text = text.lower()
        #text = text.replace('RT', ' ')
        text = text.replace('\n', ' . ')
        # this is for UCS-2
        # remove emojis
        myre = re.compile(u'('
                          '@\S*\s?|#|'           # remove @mention names and the hashtag sign
                          'http[s]?[:…]+\S*\s|'  # remove urls
                          '[-~\^\$\*\+\{\}\[\]\\\|\(\)/“"]|'
                          'rt[:]? |'
                          '…'
                          ')+', re.UNICODE)
        text = myre.sub(' ', text)
        text = self.emojiPattern.sub(' ', text)
        text = text.replace('&', 'and')
        #text = ' '.join(text)
        return text

    #---------------------------
    # make the recommendations
    #---------------------------
    def recomendation(self, username, topicn=0, list=[]):
        similaritis = self.ldacosinesimilarity(username, topicn)
        result = []
        # if list is empty, run on the whole dataset
        if not list:
            for key, value in sorted(similaritis.items(), key=lambda x: x[1]):
                result.append((key, value))
        else:
            for i in list:
                result.append((i, similaritis[i]))
            # sort the result by similarity (smaller cosine distance = more similar)
            result = sorted(result, key=lambda x: x[1])
        return result

    #---------------------------
    # load and clean the data
    #---------------------------
    def loadandclean(self, n=-1):
        #authorcontent = {}
        with codecs.open(self.f_titles, encoding='utf_8') as f_t:
            with codecs.open(self.f_authors, encoding='utf_8') as f_a:
                for l_a, l_t in zip(f_a, f_t):
                    # remove the '\n' at the end
                    key = l_a[:-1].lower()
                    l_t = self.clean_text(l_t)
                    if key in self.authorcontent_clean:
                        self.authorcontent_clean[key].append(l_t)
                        #self.authorcontent_clean[key] = self.clean_text(value)
                    else:
                        self.authorcontent_clean[key] = [l_t]
                        #self.authorcontent_clean[key] = self.clean_text(value)
                    if n != -1 and len(self.authorcontent_clean) == n:
                        break
        # lemmatize every author's titles, sentence by sentence
        for key, value in self.authorcontent_clean.items():
            self.authorcontent_clean[key] = self.lemmatized_sentence_corpus(self.authorcontent_clean[key])

    #------------------------------------------------------
    # build the n-gram content based on the clean content
    #------------------------------------------------------
    def punct_space(self, token):
        """
        helper function to flag tokens that should be dropped: punctuation, whitespace,
        stop words, the extra words above, and tokens shorter than two characters
        """
        #return token.pos_ == 'NOUN' or token.is_punct or token.is_space or token.lemma_ in STOP_WORDS or token.lemma_ in self.extrawords or len(str(token)) < 2
        return (token.is_punct or token.is_space
                or token.lemma_ in STOP_WORDS
                or token.lemma_ in self.extrawords
                or len(str(token)) < 2)

    def lemmatized_sentence_corpus(self, contents):
        """
        use spaCy to parse the contents, lemmatize the text, and return a list of
        token lists, one per sentence
        """
        sentents = []
        for content in self.nlp.pipe(contents, batch_size=500, n_threads=8):
            for sent in content.sents:
                #sentents.append(u' '.join([token.lemma_ for token in sent
                #                           if not punct_space(token)]))
                #sentents.append([token.lemma_ for token in sent
                #                 if not punct_space(token)])
                tokens = []
                for token in sent:
                    if self.punct_space(token):
                        continue
                    #if token.lemma_ == '-PRON-':
                    #    token.lemma_ = token.lower_
                    tokens.append(token.lemma_)
                sentents.append(tokens)
        return sentents

    def ldainit(self):
        """
        prepare the sentence corpora for LDA
        """
        # self.num_topics = num_topics
        # ngram = self.ngram
        # # if ngram_bow_corpus is empty, build it first
        # if not self.ngram_bow_corpus:
        self.user_sentences = self.authorcontent_clean
        self.user_bigramsentences = {}
        self.all_sentences = []
        self.all_bigram_sentences = []

        sentences = list(self.authorcontent_clean.values())
        self.all_sentences = [item for sublist in sentences for item in sublist]

        # build the bigram model
        if self.ngram == 2:
            self.bigram_model = Phrases(self.all_sentences)
            for user, content in self.user_sentences.items():
                bigram_s = []
                for s in content:
                    bigram_s.append(self.bigram_model[s])
                self.user_bigramsentences[user] = bigram_s
                self.all_bigram_sentences += self.user_bigramsentences[user]

    def trainlda(self, topics_n=10):
        self.num_topics = topics_n
        alltexts = []
        for name, sentences in self.user_sentences.items():
            sentences = [item for sublist in sentences for item in sublist]
            alltexts.append(sentences)

        # if self.ngram_dictionary == None:
        #     if self.ngram == 1:
        #         self.ngram_dictionary = Dictionary(self.all_sentences)
        #     elif self.ngram == 2:
        #         self.ngram_dictionary = Dictionary(self.all_bigram_sentences)
        # if self.ngram_dictionary == None:
        if self.ngram == 1:
            self.ngram_dictionary = Dictionary(alltexts)
        elif self.ngram == 2:
            self.ngram_dictionary = Dictionary(alltexts)

        # filter tokens that are very rare or too common from
        # the dictionary (filter_extremes) and reassign integer ids (compactify)
        self.ngram_dictionary.filter_extremes(no_below=10, no_above=0.8)
        self.ngram_dictionary.compactify()

        # if self.ngram == 1:
        #     sentences = self.all_sentences
        # elif self.ngram == 2:
        #     sentences = self.all_bigram_sentences
        # ngram_bow_corpus = []
        # for sentence in sentences:
        #     ngram_bow_corpus.append(self.ngram_dictionary.doc2bow(sentence))
        #
        # self.lda = LdaMulticore(ngram_bow_corpus,
        #                         num_topics=topics_n,
        #                         id2word=self.ngram_dictionary,
        #                         workers=3)

        ngram_bow_corpus = []
        for sentence in alltexts:
            ngram_bow_corpus.append(self.ngram_dictionary.doc2bow(sentence))

        self.lda = LdaMulticore(ngram_bow_corpus,
                                num_topics=topics_n,
                                id2word=self.ngram_dictionary,
                                workers=3)

        # calculate the coherence scores for the trained topics
        topics = []
        for i in range(self.lda.num_topics):
            terms = []
            for n in self.lda.show_topic(i):
                terms.append(n[0])
            topics.append(terms)

        cm_umass = CoherenceModel(topics=topics, corpus=ngram_bow_corpus,
                                  dictionary=self.ngram_dictionary, coherence='u_mass')
        cm_cv = CoherenceModel(topics=topics, texts=alltexts,
                               dictionary=self.ngram_dictionary, coherence='c_v')
        cm_cuci = CoherenceModel(topics=topics, texts=alltexts,
                                 dictionary=self.ngram_dictionary, coherence='c_uci')
        cm_cnpmi = CoherenceModel(topics=topics, texts=alltexts,
                                  dictionary=self.ngram_dictionary, coherence='c_npmi')

        return (topics_n, cm_umass.get_coherence(), cm_cv.get_coherence(),
                cm_cuci.get_coherence(), cm_cnpmi.get_coherence())

    def explore_topic(self, topic_number, topn=25):
        """
        accept a user-supplied topic number and print out a formatted list of the top terms
        """
        print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
        for term, frequency in self.lda.show_topic(topic_number, topn):
            print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))

    def runlda(self, username):
        if self.ngram == 1:
            user_sentences = self.user_sentences[username]
        elif self.ngram == 2:
            user_sentences = self.user_bigramsentences[username]
        # flatten the list of lists into a single list
        user_sentences = [item for sublist in user_sentences for item in sublist]
        user_bow = self.ngram_dictionary.doc2bow(user_sentences)
        user_lda = self.lda[user_bow]
        #user_lda = sorted(user_lda, key=lambda x: -x[1])
        return user_lda

    def runldavec(self):
        """
        compute the LDA topic vector for every user
        """
        if not self.ldavec:
            for key, value in self.user_sentences.items():
                vec = np.zeros(self.num_topics)
                result = self.runlda(key)
                for i in result:
                    vec[i[0]] = i[1]
                self.ldavec[key] = vec

    def runtopntopic(self, n):
        """
        collect the indices of each user's top-n topics
        """
        self.topntopics = []
        for key, value in self.ldavec.items():
            idx = value.argsort()
            self.topntopics += list(idx[-n:])
        self.topntopics = list(set(self.topntopics))

    def ldacosinesimilarity(self, username, topn=0):
        """
        compute the LDA cosine distance between a given user and all other users
        (smaller values mean more similar topic profiles)
        """
        if username not in self.authorcontent_clean:
            print('The user cannot be found.')
            return
        if topn < 0:
            print('topn should be >= 0')
            return
        topn = int(topn)
        cosinesimilaritydic = {}
        if not self.ldavec:
            self.runldavec()
        if topn == 0:
            usertopicvec = self.ldavec[username]
        else:
            self.runtopntopic(topn)
            usertopicvec = self.ldavec[username][self.topntopics]
        for key, value in self.ldavec.items():
            if key != username:
                if topn == 0:
                    pairtopicvec = value
                else:
                    pairtopicvec = value[self.topntopics]
                # pairwise_distances with metric='cosine' returns the cosine distance (1 - similarity)
                cosinesimilarity = pairwise_distances(np.array(usertopicvec).reshape(1, -1),
                                                      np.array(pairtopicvec).reshape(1, -1),
                                                      metric='cosine')[0][0]
                cosinesimilaritydic[key] = cosinesimilarity
        return cosinesimilaritydic
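# A hedged usage sketch (not part of the original source) showing how the class above is
# presumably driven end to end. It assumes project_name is defined and the title/author
# files exist; 'alice', 500 authors and 15 topics are illustrative values only.
rec = recommendationsys_LDA(ngram=2)
rec.loadandclean(n=500)            # load, clean and lemmatize up to 500 authors
rec.ldainit()                      # build the per-user (bigram) sentence corpora
print(rec.trainlda(topics_n=15))   # -> (n_topics, u_mass, c_v, c_uci, c_npmi)
rec.explore_topic(0)               # top terms of topic 0
ranked = rec.recomendation('alice', topicn=5)
print(ranked[:10])                 # ten closest authors by cosine distance on the top-5 topics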
import sys
import requests
import gensim
from gensim.models.ldamulticore import LdaMulticore
from gensim.corpora import Dictionary

# nlp_utils lives two directories up; a quoted path cannot be used in an import statement
sys.path.append('../../')
from nlp_utils import lemmatize_stemming, preprocess, text_from_html, tag_visible

#link = "https://www.nytimes.com/2018/06/12/opinion/earth-will-survive-we-may-not.html"
#link = "https://www.slowtwitch.com/Products/Tri_Bike_by_brand/Specialized/S-Works_Shiv_Disc_7053.html"
#html = requests.get(link).text
#article = text_from_html(html)

article = "How a Pentagon deal became an identity crisis for Google"

lda_model_tfidf = LdaMulticore.load("kaggle_lda_tfidf")
dictionary = Dictionary.load('kaggle_dict')

bow_vector = dictionary.doc2bow(preprocess(article))
for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1 * tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))
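# A hedged sketch (not in the original script): wrap the scoring step above in a helper
# that returns only the dominant topic for a document, reusing preprocess(), dictionary
# and lda_model_tfidf from above. The second headline is made up for illustration.
def dominant_topic(text):
    bow = dictionary.doc2bow(preprocess(text))
    scored = sorted(lda_model_tfidf[bow], key=lambda tup: tup[1], reverse=True)
    if not scored:
        return None, 0.0
    index, score = scored[0]
    return lda_model_tfidf.print_topic(index, 5), score

for headline in ["How a Pentagon deal became an identity crisis for Google",
                 "Disc brakes and aero frames dominate this year's triathlon bikes"]:
    topic, score = dominant_topic(headline)
    print("{:.3f}  {}".format(score, topic))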
import warnings
import _pickle as pickle

import pandas as pd
import pyLDAvis
import pyLDAvis.gensim
from gensim.corpora import Dictionary
from gensim.models import Phrases
from gensim.models.ldamulticore import LdaMulticore


# In[26]:

lda_model_filepath = 'lda_model_eat_30'
trigram_dictionary_filepath = 'trigram_dict_eat_30.dict'
trigram_model_filepath = 'trigram_model_all_eat_30'
bigram_model_filepath = 'bigram_model_all_eat_30'


# In[27]:

lda = LdaMulticore.load(lda_model_filepath)
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)
trigram_model = Phrases.load(trigram_model_filepath)
bigram_model = Phrases.load(bigram_model_filepath)

# build a one-column DataFrame holding the candidate topic numbers 0..49
all_numbers = list(range(0, 50))
df_all_numbers = pd.DataFrame(columns=["topic_number"])
for topic_number in all_numbers:
    df_all_numbers = df_all_numbers.append({"topic_number": topic_number}, ignore_index=True)


# In[32]:

#output_file = open('review_bus_lda.txt', 'w')


# In[ ]:
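# A hedged sketch (not from the original notebook) of the step the commented-out
# output_file hints at: push one raw review through the loaded phrase models and
# dictionary to get its topic mixture. simple_preprocess is only a stand-in tokenizer;
# the notebook's real preprocessing pipeline is not shown in this cell.
from gensim.utils import simple_preprocess

review = "The ramen here is fantastic and the broth is incredibly rich."
tokens = simple_preprocess(review)
trigram_tokens = trigram_model[bigram_model[tokens]]    # apply the learned phrase models
review_bow = trigram_dictionary.doc2bow(trigram_tokens)
print(lda[review_bow])                                  # [(topic_id, probability), ...]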
import os
import datetime

eval_every = None
workers = 6
random_state_list = [7, 14, 21, 28]
# random_state = 42

# dataset, num_topics_list, passes, iterations and load_model() are defined earlier in the script
for data in dataset:
    for n_topics in num_topics_list:
        for random_state in random_state_list:
            starttime = datetime.datetime.now()
            print('dataset:', data, 'num_topics:', n_topics)

            data_dir = './%s_data' % data
            dictionary = Dictionary.load(os.path.join(data_dir, 'ne_weighting.dict'))
            bow_news = load_model(os.path.join(data_dir, 'ne_weighting.bow'))
            dict_id2token = dict(dictionary.items())

            lda = LdaMulticore(bow_news, id2word=dict_id2token, num_topics=n_topics,
                               passes=passes, iterations=iterations,
                               eval_every=eval_every, workers=workers,
                               random_state=random_state)
            #print(lda.show_topics(num_topics=num_topics, num_words=20))

            name = 'ne_topic%s_passes%s_iteration%s_random%s' % (n_topics, passes, iterations, random_state)
            result_dir = os.path.join(data_dir, name)
            if not os.path.exists(result_dir):
                os.mkdir(result_dir)
            lda.save(os.path.join(result_dir, 'lda_model'))

            topics = lda.show_topics(num_topics=n_topics, num_words=20, log=False, formatted=False)
            # write the topics out to a text file
            with open(os.path.join(result_dir, 'topics.txt'), 'w', encoding='utf-8') as f:
                for topic in topics:
                    f.write('topic ' + str(topic[0]) + ':\n')
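# A hedged follow-up sketch (not in the original script): reload each saved model for a
# given data_dir and rank the runs by u_mass coherence on the same bag-of-words corpus,
# so the grid over num_topics and random_state can be compared quantitatively.
from gensim.models import CoherenceModel

scores = []
for n_topics in num_topics_list:
    for random_state in random_state_list:
        name = 'ne_topic%s_passes%s_iteration%s_random%s' % (n_topics, passes, iterations, random_state)
        model = LdaMulticore.load(os.path.join(data_dir, name, 'lda_model'))
        cm = CoherenceModel(model=model, corpus=bow_news, dictionary=dictionary, coherence='u_mass')
        scores.append((n_topics, random_state, cm.get_coherence()))

for n_topics, random_state, score in sorted(scores, key=lambda x: x[2], reverse=True):
    print(n_topics, random_state, score)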
def load_model(self, model_path, wrd2idx_path):
    self.model = LdaMulticore.load(model_path)
    # pickled vocabularies must be read in binary mode
    with open(wrd2idx_path, 'rb') as fid:
        wrd2idx = cPickle.load(fid)
    self.load_vocabulary(wrd2idx)
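# A hedged sketch (not part of the original class): once load_model() has run, a tokenized
# document can be scored by mapping tokens through the word-to-id vocabulary into the
# (id, count) pairs gensim expects. self.wrd2idx is an assumption about what
# load_vocabulary() stores; adjust to the real attribute name.
from collections import Counter

def topics_for_tokens(self, tokens):
    counts = Counter(t for t in tokens if t in self.wrd2idx)
    bow = [(self.wrd2idx[t], c) for t, c in counts.items()]
    return self.model[bow]   # [(topic_id, probability), ...]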
res = fin_res
len(res)
res[0]

#==============================================================================
# Train the LDA model (takes about 1655 seconds)
#==============================================================================
# No need to run LDA every time; the model has been stored
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(res)
vocab = vectorizer.get_feature_names()

start_time = time.time()
model = LdaMulticore(
    matutils.Sparse2Corpus(X, documents_columns=False),
    num_topics=9,
    passes=10,
    chunksize=5000,
    id2word=dict([(i, s) for i, s in enumerate(vocab)]),
    workers=7,
)
print("--- %s seconds ---" % (time.time() - start_time))

fname = '/Users/royyang/Desktop/trending_project/re_categorization_ls/LDA_9topics'
model.save(fname)

# Load the pretrained model
model = LdaModel.load(fname, mmap='r')
type(model)

#==============================================================================
# Get all topics from training:
# topic_number, number_of_articles, top_words
#==============================================================================
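# A hedged sketch (not in the original script) of what the header above describes: take
# each training document's most probable topic, count documents per topic, and pair the
# counts with each topic's top words.
from collections import Counter

corpus = matutils.Sparse2Corpus(X, documents_columns=False)
dominant = Counter()
for bow in corpus:
    topic_probs = model.get_document_topics(bow)
    if topic_probs:
        dominant[max(topic_probs, key=lambda tp: tp[1])[0]] += 1

for topic_number in range(model.num_topics):
    top_words = [word for word, _ in model.show_topic(topic_number, topn=10)]
    print(topic_number, dominant[topic_number], ' '.join(top_words))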