def test_lda_default_prior_params():
    # default prior parameter should be `1 / topics`
    # and verbose params should not affect result
    n_topics, X = _build_sparse_mtx()
    prior = 1. / n_topics
    lda_1 = LatentDirichletAllocation(n_topics=n_topics, doc_topic_prior=prior, topic_word_prior=prior, random_state=0)
    lda_2 = LatentDirichletAllocation(n_topics=n_topics, random_state=0)
    topic_distr_1 = lda_1.fit_transform(X)
    topic_distr_2 = lda_2.fit_transform(X)
    assert_almost_equal(topic_distr_1, topic_distr_2)
def test_lda_score():
    # Test LDA score for batch training
    # score should be higher after each iteration
    n_topics, X = _build_sparse_mtx()
    for method in ('online', 'batch'):
        lda_1 = LatentDirichletAllocation(n_topics=n_topics, max_iter=1, learning_method=method, total_samples=100, random_state=0)
        lda_2 = LatentDirichletAllocation(n_topics=n_topics, max_iter=10, learning_method=method, total_samples=100, random_state=0)
        lda_1.fit_transform(X)
        score_1 = lda_1.score(X)
        lda_2.fit_transform(X)
        score_2 = lda_2.score(X)
        assert_greater_equal(score_2, score_1)
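# Several of the tests in this collection call a _build_sparse_mtx() helper that is not
# shown here. A minimal sketch of such a helper (an assumption, not necessarily the
# original): it returns a topic count together with a small block-diagonal document-term
# matrix in which every document draws its words from exactly one topic.
import numpy as np
from scipy.linalg import block_diag
from scipy.sparse import csr_matrix

def _build_sparse_mtx():
    # 3 topics, 3 distinct words per topic, one block of documents per topic.
    n_topics = 3
    block = n_topics * np.ones((3, 3))
    X = csr_matrix(block_diag(*([block] * n_topics)))
    return n_topics, X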
def basic_lda(df, n_topics=200, max_df=0.5, min_df=5):
    '''
    Basic LDA model for album recommendations

    Args:
        df: dataframe with Pitchfork reviews
        n_topics: number of lda topics
        max_df: max_df in CountVectorizer
        min_df: min_df in CountVectorizer
    Returns:
        cv: sklearn fitted CountVectorizer
        cv_trans: sparse matrix with count-transformed data
        lda: sklearn fitted LatentDirichletAllocation
        lda_trans: dense array with lda transformed data
    '''
    X = df['review']
    cv = CountVectorizer(stop_words='english', min_df=min_df, max_df=max_df)
    cv_trans = cv.fit_transform(X)
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=7)
    lda_trans = lda.fit_transform(cv_trans)
    return cv, cv_trans, lda, lda_trans
def _get_model_LDA(self, corpus):
    # lda = models.LdaModel(corpus, id2word=self.corpus.dictionary, num_topics=5, alpha='auto', eval_every=50)
    lda = LatentDirichletAllocation(n_topics=self.num_of_clusters, max_iter=20, learning_method='online', learning_offset=50., random_state=1)
    return lda.fit_transform(corpus)
def produceLDATopics():
    '''
    Takes description of each game and uses sklearn's latent dirichlet allocation
    and count vectorizer to extract topics.
    :return: pandas data frame with topic weights for each game (rows) and topic (columns)
    '''
    data_samples, gameNames = create_game_profile_df(game_path)
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english')
    tf = tf_vectorizer.fit_transform(data_samples)
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0)
    topics = lda.fit_transform(tf)
    # for i in range(50):
    #     gameTopics = []
    #     for j in range(len(topics[0])):
    #         if topics[i, j] > 1.0 / float(n_topics):
    #             gameTopics.append(j)
    #     print(gameNames[i], gameTopics)
    topicsByGame = pandas.DataFrame(topics)
    topicsByGame.index = gameNames
    print(topicsByGame)

    tf_feature_names = tf_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(lda.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    return topicsByGame
def score_lda(src, dst):
    ## read sentence pairs into two lists
    b1 = []
    b2 = []
    lines = 0
    with open(src) as p:
        for i, line in enumerate(p):
            s = line.split('\t')
            b1.append(s[0])
            b2.append(s[1][:-1])  # remove \n
            lines = i + 1

    vectorizer = CountVectorizer()
    vectors = vectorizer.fit_transform(b1 + b2)
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0)
    X = lda.fit_transform(vectors)
    print(X.shape)

    b1_v = vectorizer.transform(b1)
    b2_v = vectorizer.transform(b2)
    b1_vecs = lda.transform(b1_v)
    b2_vecs = lda.transform(b2_v)

    res = [round(5 * (1 - spatial.distance.cosine(b1_vecs[i], b2_vecs[i])), 2) for i in range(lines)]
    with open(dst, 'w') as thefile:
        thefile.write("\n".join(str(i) for i in res))
def get_features(vocab):
    vectorizer_head = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
    X_train_head = vectorizer_head.fit_transform(headlines)

    vectorizer_body = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
    X_train_body = vectorizer_body.fit_transform(bodies)

    # calculates n most important topics of the bodies. Each topic contains all words but ordered by importance.
    # The more important topic words a body contains of a certain topic, the higher its value for this topic
    lda_body = LatentDirichletAllocation(n_topics=n_topics, learning_method='online', random_state=0, n_jobs=3)

    print("latent_dirichlet_allocation_cos: fit and transform body")
    t0 = time()
    lda_body_matrix = lda_body.fit_transform(X_train_body)
    print("done in %0.3fs." % (time() - t0))

    print("latent_dirichlet_allocation_cos: transform head")
    # use the lda trained for body topics on the headlines => if the headlines and bodies share topics,
    # their vectors should be similar
    lda_head_matrix = lda_body.transform(X_train_head)

    # print_top_words(lda_body, vectorizer_body.get_feature_names(), 100)

    print('latent_dirichlet_allocation_cos: calculating cosine distance between head and body')
    # calculate cosine distance between the body and head
    X = []
    for i in range(len(lda_head_matrix)):
        X_head_vector = np.array(lda_head_matrix[i]).reshape((1, -1))  # 1d array is deprecated
        X_body_vector = np.array(lda_body_matrix[i]).reshape((1, -1))
        cos_dist = cosine_distances(X_head_vector, X_body_vector).flatten()
        X.append(cos_dist.tolist())
    return X
def latdirall(content):
    lda = LatentDirichletAllocation(n_topics=10)
    tf_vectorizer = TfidfVectorizer(max_df=0.99, min_df=1, stop_words='english')
    tf = tf_vectorizer.fit_transform(content)
    lolz = lda.fit_transform(tf)
    tfidf_feature_names = tf_vectorizer.get_feature_names()
    return top_topics(lda, tfidf_feature_names, 10)
def test_lda_transform():
    # Test LDA transform.
    # Transform result cannot be negative
    rng = np.random.RandomState(0)
    X = rng.randint(5, size=(20, 10))
    n_topics = 3
    lda = LatentDirichletAllocation(n_topics=n_topics, random_state=rng)
    X_trans = lda.fit_transform(X)
    assert_true((X_trans > 0.0).any())
def test_lda_transform():
    # Test LDA transform.
    # Transform result cannot be negative and should be normalized
    rng = np.random.RandomState(0)
    X = rng.randint(5, size=(20, 10))
    n_topics = 3
    lda = LatentDirichletAllocation(n_topics=n_topics, random_state=rng)
    X_trans = lda.fit_transform(X)
    assert_true((X_trans > 0.0).any())
    assert_array_almost_equal(np.sum(X_trans, axis=1), np.ones(X_trans.shape[0]))
def test_lda_perplexity():
    # Test LDA perplexity for batch training
    # perplexity should be lower after each iteration
    n_topics, X = _build_sparse_mtx()
    for method in ('online', 'batch'):
        lda_1 = LatentDirichletAllocation(n_topics=n_topics, max_iter=1, learning_method=method, total_samples=100, random_state=0)
        lda_2 = LatentDirichletAllocation(n_topics=n_topics, max_iter=10, learning_method=method, total_samples=100, random_state=0)
        distr_1 = lda_1.fit_transform(X)
        perp_1 = lda_1.perplexity(X, distr_1, sub_sampling=False)
        distr_2 = lda_2.fit_transform(X)
        perp_2 = lda_2.perplexity(X, distr_2, sub_sampling=False)
        assert_greater_equal(perp_1, perp_2)
        perp_1_subsampling = lda_1.perplexity(X, distr_1, sub_sampling=True)
        perp_2_subsampling = lda_2.perplexity(X, distr_2, sub_sampling=True)
        assert_greater_equal(perp_1_subsampling, perp_2_subsampling)
def test_lda_fit_transform(method):
    # Test LDA fit_transform & transform
    # fit_transform and transform result should be the same
    rng = np.random.RandomState(0)
    X = rng.randint(10, size=(50, 20))
    lda = LatentDirichletAllocation(n_components=5, learning_method=method, random_state=rng)
    X_fit = lda.fit_transform(X)
    X_trans = lda.transform(X)
    assert_array_almost_equal(X_fit, X_trans, 4)
def test_lda_score_perplexity():
    # Test the relationship between LDA score and perplexity
    n_topics, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=10, random_state=0)
    distr = lda.fit_transform(X)
    perplexity_1 = lda.perplexity(X, distr, sub_sampling=False)
    score = lda.score(X)
    perplexity_2 = np.exp(-1. * (score / np.sum(X.data)))
    assert_almost_equal(perplexity_1, perplexity_2)
def test_doc_topic_distr_deprecation():
    # Test that the appropriate warning message is displayed when a user
    # attempts to pass the doc_topic_distr argument to the perplexity method
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=1, learning_method='batch', total_samples=100, random_state=0)
    distr1 = lda.fit_transform(X)
    distr2 = None
    assert_warns(DeprecationWarning, lda.perplexity, X, distr1)
    assert_warns(DeprecationWarning, lda.perplexity, X, distr2)
def test_perplexity_input_format():
    # Test LDA perplexity for sparse and dense input
    # score should be the same for both dense and sparse input
    n_topics, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=1, learning_method='batch', total_samples=100, random_state=0)
    distr = lda.fit_transform(X)
    perp_1 = lda.perplexity(X)
    perp_2 = lda.perplexity(X, distr)
    perp_3 = lda.perplexity(X.toarray(), distr)
    assert_almost_equal(perp_1, perp_2)
    assert_almost_equal(perp_1, perp_3)
def fit_lda(tf, vectorizer):
    n_topics = 20
    n_top_words = 20
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0)
    tf_lda = lda.fit_transform(tf)
    f_print = True
    if f_print:
        tf_feature_names = vectorizer.get_feature_names()
        print_top_words(lda, tf_feature_names, n_top_words)
    return [tf_lda, lda]
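# fit_lda above (and several other snippets in this collection) call print_top_words
# without defining it. A minimal sketch of that helper, assuming the usual
# scikit-learn topic-extraction example style (the exact original helper is not shown):
def print_top_words(model, feature_names, n_top_words):
    # For every fitted topic, print the n_top_words terms with the largest weights.
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join(feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]))
    print()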
def latdirall(content):
    lda = LatentDirichletAllocation(n_topics=5)
    tf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
    tf = tf_vectorizer.fit_transform(content)
    lolz = lda.fit_transform(tf)
    tfidf_feature_names = tf_vectorizer.get_feature_names()
    tops = top_topics(lda, tfidf_feature_names, 10)
    wordlist = []
    for topic in tops:
        wordlist += topic
    return wordlist
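# latdirall relies on a top_topics helper that is not included in these snippets. A minimal
# sketch under that assumption: return, for each topic, the n highest-weighted feature
# names, so that the caller can flatten them into a single word list.
def top_topics(model, feature_names, n_words):
    # One list of top terms per topic, strongest terms first.
    return [[feature_names[i] for i in topic.argsort()[:-n_words - 1:-1]]
            for topic in model.components_]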
def infer_topics(self, num_topics=10, algorithm='variational', **kwargs):
    self.nb_topics = num_topics
    lda_model = None
    topic_document = None
    if algorithm == 'variational':
        lda_model = LDA(n_topics=num_topics, learning_method='batch')
        topic_document = lda_model.fit_transform(self.corpus.sklearn_vector_space)
    elif algorithm == 'gibbs':
        lda_model = lda.LDA(n_topics=num_topics, n_iter=500)
        topic_document = lda_model.fit_transform(self.corpus.sklearn_vector_space)
    else:
        raise ValueError("algorithm must be either 'variational' or 'gibbs', got '%s'" % algorithm)
    self.topic_word_matrix = []
    self.document_topic_matrix = []
    vocabulary_size = len(self.corpus.vocabulary)
    row = []
    col = []
    data = []
    for topic_idx, topic in enumerate(lda_model.components_):
        for i in range(vocabulary_size):
            row.append(topic_idx)
            col.append(i)
            data.append(topic[i])
    self.topic_word_matrix = coo_matrix((data, (row, col)), shape=(self.nb_topics, len(self.corpus.vocabulary))).tocsr()
    row = []
    col = []
    data = []
    doc_count = 0
    for doc in topic_document:
        topic_count = 0
        for topic_weight in doc:
            row.append(doc_count)
            col.append(topic_count)
            data.append(topic_weight)
            topic_count += 1
        doc_count += 1
    self.document_topic_matrix = coo_matrix((data, (row, col)), shape=(self.corpus.size, self.nb_topics)).tocsr()
def text_transformation(initial_data, y, categorical_feature=['project_description']):
    tf = CountVectorizer(token_pattern='[a-zA-Z]{3,}', max_df=0.95, min_df=0.002, max_features=2000, stop_words='english')
    serie = initial_data[categorical_feature[0]]
    articles_words = tf.fit_transform(serie.to_dict().values(), y)
    word_index = tf.get_feature_names()
    K = 20
    lda = LatentDirichletAllocation(n_topics=K, max_iter=10, learning_method='online', learning_offset=10., random_state=0, n_jobs=-1)
    t0 = time()
    new_feature = lda.fit_transform(articles_words)
    print("done in %0.3fs." % (time() - t0))
    new_feature = pd.DataFrame(new_feature)
    return re_assemble_dataset(initial_data, new_feature, categorical_feature), lda
def lda_topics_modeling(n_topics: int, id2text, corpus, id2word, n_top_features=10, dump_to_db=True):
    """
    Returns the probabilistic distribution of documents over topics

    :param n_topics: number of possible topics
    :param id2text: list of text names (indices)
    :param corpus: corpus of texts
    :param id2word: list of words
    :param n_top_features: number of words printed to characterize a cluster
    :param dump_to_db: True - write the topics to the database, False - do not write the topics to the database
    :return: tuple with the topic-word and document-topic distributions
    """
    t0 = time()
    lda = LatentDirichletAllocation(n_topics=n_topics)
    logging.info('LDA created in {:.3} sec'.format(time() - t0))
    t0 = time()
    doc_topic_dist = lda.fit_transform(corpus)
    logging.info('LDA model fit-transformed in {:.3} sec'.format(time() - t0))

    # Load the resulting topics into the database
    if dump_to_db:
        lda_topics = connect_to_db()['lda_clusters']
        lda_topics.drop()
        for topic_idx, topic_dist in enumerate(lda.components_):
            doc = {'_id': int(topic_idx),
                   'terms': [id2word[i] for i in np.argsort(topic_dist)[:-n_top_features - 1:-1]]}
            lda_topics.insert(doc)
            logging.info('Topic {} dumped to database'.format(topic_idx))

    # Normalize the weights (obtain probabilities)
    topic_word_dist = np.apply_along_axis(_normalize_weights, 1, lda.components_)
    doc_topic_dist = np.apply_along_axis(_normalize_weights, 1, doc_topic_dist)
    topic_word_dist = pd.DataFrame(topic_word_dist, index=id2word)
    doc_topic_dist = pd.DataFrame(doc_topic_dist, columns=id2text)
    return topic_word_dist, doc_topic_dist, lda
def __init__(self, path, corpusName, query=None):
    self.query = query
    documents = (line.lower().split() for line in codecs.open(corpusName + ".txt", mode='r', encoding='utf-8', errors='ignore'))
    self.corpus = [' '.join(i) for i in documents]
    if self.query is not None:
        self.corpus.append(' '.join(query.getTokens()))

    # Make models
    t0 = time()
    print("Creating SciKit TF-IDF Model")
    self.tfidfModel = TfidfVectorizer().fit_transform(self.corpus)
    print("Done in %0.3fs." % (time() - t0))

    print("Creating SciKit LSA Model")
    t0 = time()
    lsa = TruncatedSVD(n_components=300)
    self.lsaModel = lsa.fit_transform(self.tfidfModel)
    self.lsaModel = Normalizer(copy=False).fit_transform(self.lsaModel)
    print("Done in %0.3fs." % (time() - t0))

    print("Creating SciKit LDA Model")
    # Use tf (raw term count) features for LDA.
    print("Extracting tf features for LDA")
    tf_vectorizer = CountVectorizer(max_features=2000)
    t0 = time()
    tf = tf_vectorizer.fit_transform(self.corpus)
    print("Done in %0.3fs." % (time() - t0))
    print("Fitting LDA model")
    lda = LatentDirichletAllocation(n_topics=300, max_iter=5, learning_method='online', learning_offset=50., random_state=0)
    t0 = time()
    self.ldaModel = lda.fit_transform(tf)
    self.ldaModel = Normalizer(copy=False).fit_transform(self.ldaModel)
    print("Done in %0.3fs." % (time() - t0))
def lda_viz(docs, lengths, n_features, n_topics, n_top_words):
    n_samples = len(docs)
    norm = lambda data: pandas.DataFrame(data).div(data.sum(1), axis=0).values
    vect = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english')
    vected = vect.fit_transform(docs)
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0)
    doc_topic_dists = norm(lda.fit_transform(vected))
    prepared = pyLDAvis.prepare(
        doc_lengths=lengths,
        vocab=vect.get_feature_names(),
        term_frequency=vected.sum(axis=0).tolist()[0],
        topic_term_dists=norm(lda.components_),
        doc_topic_dists=doc_topic_dists,
    )
    # print(doc_topic_dists)
    # print(n_samples)
    return prepared, doc_topic_dists
def extended_lda(df, n_topics=200):
    '''
    Trains an extended LDA model with custom text preprocessor and custom tokenizer

    Args:
        df: dataframe with Pitchfork reviews
        n_topics: number of topics in LDA model
    Returns:
        tfidf: sklearn fitted TfidfVectorizer
        tfidf_trans: sparse matrix with tfidf transformed data
        lda: sklearn fitted LatentDirichletAllocation
        lda_trans: dense array with lda transformed data
    '''
    print('Starting TfIdf')
    # for LDA, use raw counts; that is, tfidf with appropriate parameters
    tfidf, tfidf_trans = extended_tfidf(df, use_idf=False, norm=None)
    print('Starting LDA')
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5)
    lda_trans = lda.fit_transform(tfidf_trans)
    return tfidf, tfidf_trans, lda, lda_trans
class Featurizer(): def __init__(self, plot_vectorizer = 'count', tokenizer = None, lda = False, use_genre_vecs = False): t = None if tokenizer is 'named_entity': t = NETokenizer() elif tokenizer is 'lemma': t = LemmaTokenizer() self.use_genre_vecs = use_genre_vecs self.binary = plot_vectorizer is 'binary' if plot_vectorizer is 'tfidf': self.vectorizer = TfidfVectorizer(analyzer = "word", \ tokenizer = t, \ preprocessor = None, \ stop_words = 'english') elif plot_vectorizer is 'binary': self.vectorizer = CountVectorizer(analyzer = "word", \ tokenizer = t, \ preprocessor = None, \ stop_words = 'english', \ binary = True) else: self.vectorizer = CountVectorizer(analyzer = "word", \ tokenizer = t, \ preprocessor = None, \ stop_words = 'english') if lda: self.lda = LatentDirichletAllocation(n_topics=20, max_iter=2, \ learning_method='online', learning_offset=10., \ random_state=0) else: self.lda = None def find_movie(self, title, year = None): """ Finds a movie with the given name substring. """ return [movie for movie in self.movies.keys() if title in movie[0] and (year is None or year == movie[1])] def load(self, path): """ Loads the data into memory. """ with io.open(path, 'r', encoding = 'latin-1') as f: movies = json.load(f) od = OrderedDict({(movie['title'],movie['year']):{'plot':movie['plot'],'cast':set(movie['cast']), \ 'genres':set(movie['genres'])} \ for movie in movies}.items()) return od def train(self, movies): """ Trains the featurizer. """ movie_keys = list(movies.keys()) self.movies = dict(zip(movie_keys, range(0, len(movie_keys)))) self.movie_indices = dict([reversed(i) for i in self.movies.items()]) plots = [movie['plot'] for movie in movies.values()] self.plots = self.vectorizer.fit_transform(plots) self.casts = [movie['cast'] for movie in movies.values()] self.genres = [movie['genres'] for movie in movies.values()] if self.lda is not None: self.plot_topics = self.lda.fit_transform(feat_vec) else: self.plot_topics = None if self.use_genre_vecs: genre_lis = set([]) for g in self.genres: genre_lis.update(g) self.genre_lis = dict(zip(genre_lis, range(0, len(genre_lis)))) self.genre_indices = dict([reversed(i) for i in self.genre_lis.items()]) genre_plots = np.zeros((len(genre_lis),self.plots.shape[1])) for i in range(len(self.genres)): gl = self.genres[i] for g in gl: genre_plots[self.genre_lis[g],:] += self.plots[i,:] if self.binary: genre_plots = np.minimum(np.ones((len(genre_lis),self.plots.shape[1])),genre_plots) self.genre_plots = cosine_simil(self.plots, genre_plots) def load_train(self, path): """ Loads the data into memory and trains the featurizer. """ self.train(self.load(path)) def plot_features(self, base_movie, plots, plot_topics = None): """ Returns a feature matrix derived from the plots. The # of rows returned matches the length of the parameter plots. """ if self.use_genre_vecs: plot = self.genre_plots[self.movies[base_movie]] pv = cosine_simil(plots, plot) return pv else: plot = self.plots[self.movies[base_movie]] pv = cosine_simil(plots, plot) return pv def cast_features(self, base_movie, casts): """ Returns a feature matrix derived from the casts. The # of rows returned matches the length of the parameter casts. """ cv = np.array([jaccard(cast_set, self.casts[self.movies[base_movie]]) for cast_set in casts]) return cv.reshape((cv.shape[0],1)) # Reshape into column vector def genre_features(self, base_movie, genres): """ Returns a feature matrix derived from the genres. The # of rows returned matches the length of the parameter genres. 
""" gv = np.array([jaccard(genre_set, self.genres[self.movies[base_movie]]) for genre_set in genres]) return gv.reshape((gv.shape[0],1)) # Reshape into column vector def single_features(self, base_movie, trial_movie): """ Returns a feature matrix for a single movie. """ ind = self.movies[trial_movie] return self.features(base_movie, movies = ((self.genre_plots[ind] if self.use_genre_vecs else self.plots[ind], self.plot_topics[ind] if self.lda is not None else None), [self.casts[ind]], [self.genres[ind]])) def features(self, base_movie, movies = None): """ Returns the feature set for the given movies, when compared to the base movie. When movies is None, uses the whole list of movies. Parameter movies must be a 3-tuple, representing the plots, casts and genres. The # of rows of each should match. Returns an AxB matrix where A is the # of rows for plots and B is the total number of features. """ plots = (self.genre_plots if self.use_genre_vecs else self.plots) if movies is None else movies[0][0] plot_topics = self.plot_topics if movies is None else movies[0][1] casts = self.casts if movies is None else movies[1] genres = self.genres if movies is None else movies[2] pv = self.plot_features(base_movie, plots, plot_topics) cv = self.cast_features(base_movie, casts) gv = self.genre_features(base_movie, genres) return hstack((pv,cv,gv)) if issparse(pv) else np.hstack((pv,cv,gv)) def similar_movies(self, weights, base_movie, movies = None, n = 6): """ Gets the n similar movies to a base movie. """ fv = self.features(base_movie, movies = movies) wv = weights.reshape((weights.shape[1],1)) scores = fv.dot(wv) inds = np.argpartition(scores,-n, axis = 0)[-n:].reshape(n) return [self.movie_indices[i]for i in inds]
class LDA():
    def __init__(self, n_topics=10, n_features=5000, max_df=.75, min_df=2, max_iter=5, alpha=None, eta=None):
        '''
        '''
        self.n_topics = n_topics
        self.n_features = n_features
        self.max_df = max_df
        self.min_df = min_df
        self.max_iter = max_iter
        self.lda = None
        self.tf = None
        self.topics = None
        self.alpha = alpha
        self.eta = eta

    def vectorizecounts(self, docs):
        '''
        '''
        # Use tf (raw term count) features for LDA.
        print("Extracting tf features for LDA...")
        self.tf_vectorizer = CountVectorizer(max_df=self.max_df, min_df=self.min_df, max_features=self.n_features)
        t0 = time()
        self.tf = self.tf_vectorizer.fit_transform(docs)
        self.n_samples = len(docs)
        print("done in %0.3fs." % (time() - t0))

    def fitLDA(self):
        '''
        '''
        print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
              % (self.n_samples, self.n_features))
        self.lda = LatentDirichletAllocation(doc_topic_prior=self.alpha, topic_word_prior=self.eta,
                                             n_topics=self.n_topics, max_iter=self.max_iter,
                                             learning_method='online', learning_offset=10.,
                                             random_state=0, n_jobs=6)
        t0 = time()
        self.topics = self.lda.fit(self.tf)
        print("done in %0.3fs." % (time() - t0))

    def print_top_words(self, n_top_words):
        '''
        '''
        tf_feature_names = self.tf_vectorizer.get_feature_names()
        for topic_idx, topic in enumerate(self.lda.components_):
            print("Topic #%d:" % topic_idx)
            print(" ".join([tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print()

    def get_topic_content(self, topic):
        '''
        Parameters
        --------------
        topic: int
            Topic index

        Returns
        -----------
        feature_names : list
            Array of words corresponding to the given feature.
        topic_content : np.array(n_features)
            Topic vector over the feature space
        '''
        return self.tf_vectorizer.get_feature_names(), self.lda.components_

    def get_doc_topics(self, docs):
        # Convert the document into feature space.
        feature_vec = self.tf_vectorizer.fit_transform(docs)
        return self.lda.fit_transform(feature_vec)
def preprocess_data(data, method):
    print("[EDEN I/O -- preprocess_data] Preprocessing data...")

    def format_entities(norm_ent):
        ents = []
        for ent in norm_ent:
            try:
                ents.append(porter.stem(ent['surface-form'].lower()))
            except:
                continue
        return " ".join(ents)

    stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()

    def nlp_prepro(doc, porter, stop_words):
        return " ".join([porter.stem(i.lower()) for i in wordpunct_tokenize(doc) if i.lower() not in stop_words])

    d = [{"id": doc["_id"],
          "first-published": doc["_source"]["first-published"],
          "title": doc["_source"]["title"],
          "summary": doc["_source"]["title"],
          "content": doc["_source"]["content"],
          "entities": format_entities(doc["_source"]["normalised-entities"]),
          "content_prepro": nlp_prepro(doc["_source"]["content"], porter, stop_words)}
         for doc in data]

    df_story = pd.DataFrame(d)
    df_story['first-published-epoch'] = df_story['first-published'].apply(
        lambda x: int(datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ").strftime("%s")))
    df_story = df_story.sort_values(by='first-published-epoch')
    df_story = df_story.reset_index()

    if method == 'ltc':
        vect = TfidfVectorizer(sublinear_tf=True, use_idf=True, norm='l2')
        vsm = vect.fit_transform(df_story['content_prepro'].values)
        vsm_arr = vsm.toarray()
        print("[EDEN I/O -- preprocess_data] VSM shape: ", vsm.shape)
        print("[EDEN I/O -- preprocess_data] VSM type: ", type(vsm))
        df_story['vsm'] = [r for r in vsm_arr]
    elif method == 'ltc_ent':
        vect = TfidfVectorizer(use_idf=True, norm='l2', sublinear_tf=True)
        vsm = vect.fit_transform(df_story['entities'].values)
        vsm_arr = vsm.toarray()
        print("[EDEN I/O -- preprocess_data] VSM shape: ", vsm.shape)
        print("[EDEN I/O -- preprocess_data] VSM type: ", type(vsm))
        df_story['vsm'] = [r for r in vsm_arr]
    elif method == 'word2vec':
        with open('../datasets/word2vec_signal/word2vec_signal.p', 'rb') as fin:
            word2vec_signal = pickle.load(fin)
        vecs = [word2vec_signal[id_] for id_ in df_story['id']]
        df_story['vsm'] = vecs
    elif method == 'LatentDirichlet':
        vect = CountVectorizer(max_df=0.90, min_df=2).fit_transform(df_story['content_prepro'].values)
        lda = LatentDirichletAllocation(n_topics=10, max_iter=5, learning_method='online', learning_offset=50., random_state=0)
        vsm_arr = lda.fit_transform(vect, None)
        df_story['vsm'] = [r for r in vsm_arr]
    return df_story
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)
'''

print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=10, learning_method='batch')
t0 = time()
result = lda.fit_transform(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

# number of clusters
clusters = 2
km = KMeans(n_clusters=clusters, init='k-means++', max_iter=100, n_init=10, verbose=opts.verbose)
                                token_pattern='[A-Za-z]{3,}|[A-Z]{2,}',
                                stop_words='english')
tf_matrix = tf_vectorizer.fit_transform(df['ProcessedContents'])

print("lda model done now...")
lda = LatentDirichletAllocation(n_topics=20, max_iter=15, doc_topic_prior=0.4, topic_word_prior=0.4,
                                learning_method='online', learning_offset=50., verbose=1, random_state=1)
lda_result = lda.fit_transform(tf_matrix)

print("dbscan model done now...")
dbscan_model = DBSCAN(eps=0.1, min_samples=3)
dbscan_model.fit(lda_result)
df['dbscan_labels'] = dbscan_model.labels_
max_cluster = max(df['dbscan_labels'])
print("There are total of " + str(max_cluster + 1) + " clusters...")

for cluster_id in range(0, max_cluster + 1):
    # cluster_id = 1
    print("Cluster ID: " + str(cluster_id) + "...")
    subset_df = df[df['dbscan_labels'] == cluster_id]
    subset_df = subset_df.reset_index()
df = pd.read_csv("movie_data.csv", encoding="utf-8")

# Use CountVectorizer to create the bag-of-words matrix as input to the LDA.
count = CountVectorizer(stop_words="english", max_df=0.1, max_features=5000)
X = count.fit_transform(df["review"].values)

lda = LatentDirichletAllocation(n_components=10, random_state=123, learning_method="batch", n_jobs=-1)
# Let the lda estimator do its estimation based on all the available training
# data (bag-of-words matrix) in one iteration.
X_topics = lda.fit_transform(X)  # fit to the data, then transform it.

components = lda.components_
print(components[0, 0:10])

n_top_words = 5
feature_names = count.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    print("Topic {:.0f}:".format(topic_idx + 1))
    print(" ".join([feature_names[i] for i in topic.argsort()[-1:-n_top_words - 1:-1]]))
    print("\n")

music = X_topics[:, 7].argsort()[::-1]
for iter_idx, movie_idx in enumerate(music[:3]):
    print("\nMusic Movie {:.0f}:".format(iter_idx + 1))
def main(filename): global data_vectorized global lda_output global plot_df df = pd.read_csv(filename) # CHANGE THIS df = df.sample(frac=0.2, replace=False, random_state=1) N_NGRAM_RANGE = 2 # CHANGE HERE my_additional_stop_words = pd.read_csv( r'C:\Users\noel.alexander\Documents\Fullstack\Topic Modelling\Stopwords\custom_stopwords.csv' ).values.flatten().tolist() #CHANGE THIS stop_words = text.ENGLISH_STOP_WORDS.union(my_additional_stop_words) data = df.content.values.tolist() # Remove Emails data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data] # Remove new line characters data = [re.sub(r'\s+', ' ', sent) for sent in data] # Remove distracting single quotes data = [re.sub("\'", "", sent) for sent in data] data_words = list(sent_to_words(data)) nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner']) # Do lemmatization keeping only Noun, Adj, Verb, Adverb data_lemmatized = lemmatization( n=nlp, texts=data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']) vectorizer = CountVectorizer( analyzer='word', min_df=0.05, # ignore terms that appear in less than 5% of the documents stop_words=stop_words, # remove stop words lowercase=True, # convert all words to lowercase token_pattern='[a-zA-Z0-9]{3,}', # num chars > 3 ngram_range=(1, N_NGRAM_RANGE)) data_vectorized = vectorizer.fit_transform(data_lemmatized) space = { 'n_topics': hp.quniform("n_topics", 6, 10, 1), # search n_topics from 2-20 'learning_decay': hp.uniform('learning_decay', 0.5, 0.9), # search learning_decay from 0.5-0.9 } trials = Trials() best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=5, trials=trials) LEARNING_DECAY = best['learning_decay'] #0.84529 #best['learning_decay'] N_TOPICS = best['n_topics'] #9 #best['n_topics'] print('starting lda') # Build LDA Model lda_model = LatentDirichletAllocation( n_components=int(N_TOPICS), # number of topics learning_decay= LEARNING_DECAY, # control learning rate in the online learning method max_iter=10, # max learning iterations learning_method='online', # use mini-batch of training data batch_size=128, # n docs in each learning iter n_jobs=-1, # use all available CPUs ) lda_output = lda_model.fit_transform(data_vectorized) lda_output = lda_model.transform(data_vectorized) # column names topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)] # index names docnames = ["Doc" + str(i) for i in range(len(data))] # Make the pandas dataframe df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames) # Get dominant topic for each document dominant_topic = np.argmax(df_document_topic.values, axis=1) df_document_topic['dominant_topic'] = dominant_topic # Apply Style df_document_topics = df_document_topic.head(15).style.applymap( color_green).applymap(make_bold) df_topic_distribution = df_document_topic['dominant_topic'].value_counts( ).reset_index(name="Num Documents") df_topic_distribution.columns = ['Topic Num', 'Num Documents'] df_topic_distribution['Percent of Total'] = round( df_topic_distribution['Num Documents'] / np.sum(df_topic_distribution['Num Documents'].values), 2) topic_keywords = show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=15) # Topic - Keywords Dataframe df_topic_keywords = pd.DataFrame(topic_keywords) df_topic_keywords.columns = [ 'Word ' + str(i) for i in range(df_topic_keywords.shape[1]) ] df_topic_keywords.index = [ 'Topic ' + str(i) for i in range(df_topic_keywords.shape[0]) ] #pyLDAvis.enable_notebook() panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, 
mds='tsne') """ topics_dic ={} for i in range(int(N_TOPICS)): topics_dic[i] = 'topic ' + str(i) plot_df = pd.DataFrame({'topics':labels}) plot_df['topics'] = plot_df['topics'].map(topics_dic) labels = [] for doc in lda_output: labels.append(np.argmax(doc)) labels = np.array(labels) embedding = umap.UMAP(n_neighbors=100, min_dist=0.9).fit_transform(lda_output) plot_df['axis_1'] = embedding[:, 0] plot_df['axis_2'] = embedding[:, 1] """ html = pyLDAvis.prepared_data_to_html(panel) Html_file = open("html_output", "w") Html_file.write(html) Html_file.close() return html
# print("\n\nmyinput : ", myinput)
vectorizer = CountVectorizer(min_df=1, max_df=0.95, stop_words='english', lowercase=True,
                             token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}', ngram_range=(1, 3))
data_vectorized = vectorizer.fit_transform(myinput)

# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_components=4, max_iter=50, learning_method='online', random_state=0)
lda_Z = lda_model.fit_transform(data_vectorized, num_titles)
print("\n\nNO_DOCUMENTS, NO_TOPICS (n_components) : ", lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Let's see how the first document in the corpus looks like in different topic spaces
print("\n\nlda_z : ", lda_Z[0])

model = (vectorizer, data_vectorized, lda_model.components_,
         lda_model.exp_dirichlet_component_, lda_model.doc_topic_prior_)

print("Start pickling LDA Model")
import pickle
pickle.dump(model, open("LDAModel_Pickle.p", "wb"))
print("Done pickling LDA Model")
from infant_pipe import pdf_extract, process
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd
import mglearn
import numpy as np
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

vect = CountVectorizer(ngram_range=(1, 1), stop_words='english')
circular_content = process(pdf_extract('001.pdf'))
dtm = vect.fit_transform([circular_content])
# print(pd.DataFrame(dtm.toarray(), columns=vect.get_feature_names()))
print(type(circular_content))

lda = LatentDirichletAllocation(n_components=5)
lda_dtf = lda.fit_transform(dtm)
sorting = np.argsort(lda.components_)[:, ::-1]
features = np.array(vect.get_feature_names())
# print(mglearn.tools.print_topics(topics=range(5), feature_names=features, sorting=sorting, topics_per_chunk=5, n_words=10))
# str_circular_content = str(circular_content)
# documentId, similarity = similarities[0]
# print(data[documentId][0:1000])

vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True,
                             token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
dataVectorized = vectorizer.fit_transform(data)

# Build LDA Model : Sklearn
ldaModel1 = LatentDirichletAllocation(n_components=numOfTopics, max_iter=10, learning_method='online')
ldaZ = ldaModel1.fit_transform(dataVectorized)

# x = ldaModel1.transform(vectorizer.transform([text]))[0]
# print(x, x.sum())

# Visualize LDA Sklearn Results
panel = pyLDAvis.sklearn.prepare(ldaModel1, dataVectorized, vectorizer, mds="tsne")
pyLDAvis.save_html(
    panel, "C:/xampp/htdocs/SpeechArt/LDA_visualizations/" + doc_name + ".html")
sentences = pd.DataFrame(sentenses, columns=['Sentences'])

countVectorizer = CountVectorizer(strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                                  stop_words='english', ngram_range=(1, 1))
vectorizedText = countVectorizer.fit_transform(sentences['Sentences'])

ldaModel = LatentDirichletAllocation(n_components=10, learning_method='online', random_state=0, verbose=0)
lda_topics = ldaModel.fit_transform(vectorizedText)

from collections import Counter
lda_keys = lda_topics.argmax(axis=1)
lda_categories, lda_counts = zip(*Counter(lda_keys).items())

tsne_Model = TSNE(n_components=2, perplexity=50, learning_rate=100, n_iter=2000,
                  verbose=1, random_state=0, angle=0.75)
tsne_vector = tsne_Model.fit_transform(lda_topics)
term_idf_scores = []
for i in range(len(terms)):
    term_idf_scores.append([terms[i], term_idf_sums[0, i]])
print("The Term/Frequency matrix has", tf.shape[0], " rows, and", tf.shape[1], " columns.")
print("The Term list has", len(terms), " terms.")
term_idf_scores.sort(key=sortSecond, reverse=True)
print("\nTerms with Highest TF-IDF Scores:")
for i in range(10):
    j = i
    print('{:<15s}{:>8.2f}'.format(term_idf_scores[j][0], term_idf_scores[j][1]))

uv = LatentDirichletAllocation(n_components=n_topics, max_iter=max_iter,
                               learning_method=learning_method,
                               learning_offset=learning_offset,
                               random_state=12345)
U = uv.fit_transform(tf)

# Display the topic selections
print("\n********** GENERATED TOPICS **********")
TextAnalytics.display_topics(uv.components_, terms, n_terms=15, mask=None)

# Store topic selection for each doc in topics[]
topics = [0] * n_reviews
for i in range(n_reviews):
    max = abs(U[i][0])
    topics[i] = 0
    for j in range(n_topics):
        x = abs(U[i][j])
        if x > max:
            max = x
            topics[i] = j
for indx, freq in line:
    rows.append(i)
    cols.append(indx)
    data.append(freq)

dtm = csr_matrix((data, (rows, cols)), shape=(Nrow, Ncol), dtype=int)

# Materialize the sparse data
# data_dense = dtm.todense()
# Compute Sparsicity = Percentage of Non-Zero cells
# print("Sparsicity: ", ((data_dense > 0).sum()/data_dense.size)*100, "%")

n_topics = list(range(50, 150, 10))  # + list(range(50, 200, 50)) + list(range(200, 500, 100))
for NTopic in n_topics:
    # Build LDA Model
    lda_model = LatentDirichletAllocation(
        n_components=NTopic,        # Number of topics
        max_iter=10,                # Max learning iterations
        learning_method='online',
        batch_size=500000,          # n docs in each learning iter
        evaluate_every=-1,          # compute perplexity every n iters, default: Don't
        n_jobs=-1,                  # Use all available CPUs
        verbose=1)
    lda_output = lda_model.fit_transform(dtm)

    from joblib import dump, load
    model_fname = './sklearnlda/lda_n_' + str(NTopic) + '.joblib'
    dump(lda_model, model_fname)
    X_fname = './sklearnlda/transformedX_n_' + str(NTopic) + '.joblib'
    dump(lda_output, X_fname)
    print(lda_model)  # Model attributes
# instantiate the vectorizer
vect_LDA = CountVectorizer(stop_words=stop_words, strip_accents='ascii')

# vectorize the documents - get dtm
data_vect = vect_LDA.fit_transform(documents)
data_vect.shape

# create a dataframe
corpus_df_LDA = pd.DataFrame(data_vect.toarray(), columns=vect_LDA.get_feature_names())
corpus_df_LDA.shape

# filter digits from column names
corpus_df_LDA = corpus_df_LDA[corpus_df_LDA.columns.drop(list(corpus_df_LDA.filter(regex=r'(\d+)')))]
corpus_df_LDA

# LDA MODEL: n_components = number of topics
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online')

# fit the model to the vectorized data (dtm)
lda_Z = lda_model.fit_transform(data_vect)
print(lda_Z.shape)  # 218 docs, 5 topics

# fit the model to the dataframe
lda_Z_DF = lda_model.fit_transform(corpus_df_LDA)

# Build a Non-Negative Matrix Factorization Model
nmf_model = NMF(n_components=NUM_TOPICS)
nmf_Z = nmf_model.fit_transform(corpus_df_LDA)
print(nmf_Z.shape)  # 218 docs, 5 topics

# Build a Latent Semantic Indexing Model
lsi_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_Z = lsi_model.fit_transform(corpus_df_LDA)
print(lsi_Z.shape)  # 40 docs, 10 topics
def run_kmeans(n,save=False,filename=""): tf_data_features = count_vectorizer.fit_transform([review[1] for review in reviews]) #tf_data_features_array = tf_data_features.toarray() tf_vocab = count_vectorizer.get_feature_names() #to check that has same vocab from sklearn.decomposition import LatentDirichletAllocation lda = LatentDirichletAllocation(n_topics=17, max_iter=2, learning_method='online',learning_offset=10., random_state=5) topic_transformed_features = lda.fit_transform(tf_data_features) #topic_transformed_features is array of topic composition of reviews #code taken from example on scikit learn to print for topic_idx, topic in enumerate(lda.components_): print("Topic #%d:" % topic_idx) print(" ".join([tf_vocab[i] for i in topic.argsort()[:-50 - 1:-1]])) #normalize LDA topic score vectors topic_transformed_features = Normalizer(copy=False).fit_transform(topic_transformed_features) num_clusters = n k_means = cluster.KMeans(n_clusters=num_clusters) #run k means clustering algorithm k_means.fit(topic_transformed_features) original_space_centroids = k_means.cluster_centers_ #print out the centroid topic scores for topics that scored above 0.10 for i in range(num_clusters): print("Cluster %d: " % i, end='\n') for x in range(0,original_space_centroids[i].size): if original_space_centroids[i,x] > .1: print("Topic ",x,": ",original_space_centroids[i,x]) print("") print("") #reduce dimensionality for visualization and silhouette score calculation svd = TruncatedSVD(2) data_features = svd.fit_transform(topic_transformed_features) space_centroids = svd.transform(k_means.cluster_centers_) data_features_array = data_features.tolist() firstArray = [] secondArray = [] thirdArray = [] fourthArray = [] for i in range(len(data_features_array)): #print(coord_pair) if k_means.labels_[i] == 0: firstArray.append((data_features_array[i][0],data_features_array[i][1])) elif k_means.labels_[i] == 1: secondArray.append((data_features_array[i][0],data_features_array[i][1])) elif k_means.labels_[i] == 2: thirdArray.append((data_features_array[i][0],data_features_array[i][1])) else: fourthArray.append((data_features_array[i][0],data_features_array[i][1])) plt.plot([x[0] for x in firstArray],[y[1] for y in firstArray], 'ro',label="Cluster 0") if n >= 2: plt.plot([x[0] for x in secondArray],[y[1] for y in secondArray], 'go',label="Cluster 1") if n >= 3: plt.plot([x[0] for x in thirdArray],[y[1] for y in thirdArray], 'bo',label="Cluster 2") if n >= 4: plt.plot([x[0] for x in fourthArray],[y[1] for y in fourthArray], 'mo',label="Cluster 3") plt.plot([centroid[0] for centroid in space_centroids], [centroid[1] for centroid in space_centroids],'ko') plt.title('K-Means Clustering with LDA Feature Vectorization ('+str(n)+' Clusters)') plt.legend(loc='upper right',shadow=True, fontsize='medium') figure = plt.gcf() figure.set_size_inches(8,6) if save is True and filename != "": plt.savefig(filename+'.png', dpi=100) plt.show() #calculate silhouette score silhouette_score = metrics.silhouette_score(data_features,k_means.labels_,metric='euclidean',sample_size=len(reviews)) print(num_clusters, ': silhouette score: ',silhouette_score) return silhouette_score
tfidf = transformer.fit_transform(cntTf)
word = vectorizer.get_feature_names()
weight = tfidf.toarray()
df_weight = pd.DataFrame(weight)
feature = df_weight.columns
df_weight['sum'] = 0
for f in tqdm(feature):
    df_weight['sum'] += df_weight[f]
deviceid_packages['tfidf_sum'] = df_weight['sum']

# In[10]:

lda = LatentDirichletAllocation(n_topics=5, learning_offset=50., random_state=666)
docres = lda.fit_transform(cntTf)

# In[11]:

deviceid_packages = pd.concat([deviceid_packages, pd.DataFrame(docres)], axis=1)

# In[12]:

temp = deviceid_packages.drop('apps', axis=1)
deviceid_train = pd.merge(deviceid_train, temp, on='device_id', how='left')

# In[13]:

# parse out all of the device_app_pairs
device_id_arr = []
# X = document-term matrix
X = vectorizer.fit_transform(total_text)
t2 = time.time()
print('time for count vectorizer: ' + str((t2 - t1)))
# vocab = vectorizer.get_feature_names()

n_top_words = 5
lda_model = LatentDirichletAllocation(n_components=17, random_state=100)
lda_model.fit_transform(X)
t3 = time.time()
print('time for LDA: ' + str((t3 - t2)))

prepared_data = prepare(lda_model, X, vectorizer, mds='tsne', plot_opts={'xlab': '', 'ylab': ''})
t4 = time.time()
print('time for pyLDAvis: ' + str((t4 - t3)))
print('total time: ' + str((t4 - t0)))
def main(data_samples, lang, n_features, n_topics, n_top_words):
    # Use tf (raw term count) features for LDA.
    print("Extracting tf features for LDA...")
    # Get the top 1000 tokens in order to correct and lemmatize them
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words=stopword_spec(lang))
    t0 = time.time()
    tf = tf_vectorizer.fit_transform(data_samples)
    print("done in %0.3fs." % (time.time() - t0))

    # extract the top 1000 words for later use
    words_list = list(tf_vectorizer.vocabulary_.keys())

    print("Initialization of the spell checker on tokens...")
    t0 = time.time()
    # Check spelling of the top 1000 words
    corrected_words = spell_checker(words_list)
    print("done in %0.3fs." % (time.time() - t0))

    print("Initialization of the Lemmatizer...")
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')
    words_lemmatized = []
    for word in corrected_words:
        lemma = treetaggerwrapper.make_tags(tagger.tag_text(word), exclude_nottags=False)[0].lemma
        words_lemmatized.append(lemma)

    # Dict containing {unmodified words: words corrected and lemmatized}
    word_to_lemma_dict = dict(zip(words_list, words_lemmatized))

    # Transform the matrix to take into account spell check and lemmatization
    # 1 - Convert sparse matrix to dataframe to change the column names
    tf_df = pd.DataFrame(tf.A, columns=tf_vectorizer.get_feature_names())
    # 2 - Change the name of the columns to the corrected and lemmatized words
    tf_df.rename(index=str, columns=word_to_lemma_dict, inplace=True)
    # 3 - Groupby columns with same name and sum the counts
    tf_df = tf_df.groupby(by=tf_df.columns, axis=1).sum()
    # 4 - Convert df back to sparse matrix
    tf = sps.csr_matrix(tf_df)

    print("Fitting LDA models with tf features, "
          "n_samples=%d and n_features=%d..."
          % (len(data_samples), n_features))
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0)
    t0 = time.time()
    doc_topics = lda.fit_transform(tf)
    print("done in %0.3fs." % (time.time() - t0))

    tf_feature_names = list(tf_df.columns)
    Topics = pd.DataFrame()
    for topic_idx, topic in enumerate(lda.components_):
        topic_words = [tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        for count, word in enumerate(topic_words):
            probabilite = sorted(lda.components_[topic_idx][:-n_top_words - 1:-1], reverse=True)
            total = sum(probabilite)
            # print(probabilite)
            topic_words[count] = word + " ( %0.3f )" % (probabilite[count] / total * 100)
            # topic_words[count] = word + " ( %0.3f )" % (lda.components_[topic_idx, count]/lda.components_[topic_idx][:-n_top_words - 1:-1].sum()*100)
            # topic_words[count] = word + " (" + str(lda.components_[topic_idx, count]/lda.components_[topic_idx].sum()*100) + ")"
            # print(topic_words[i])
        Topics[topic_idx] = topic_words
    Topics = Topics.transpose()
    # frequence = pd.DataFrame(lda.components_)
    # frequence = frequence.transpose()
    print("end LDA")
    return doc_topics, Topics
class Cluster: """cluster input data using K-means, Minibatch-Kmeans or LDA. Input to clustering algorithms must be either a Tf-Idf vector or a hashing vector. tuning parameters can be configured in default.cfg file.""" def __init__(self, config): self.config = config self.model = None self.svd = None # log_file = self.config.LOG_DIR + self.config.LOGFILE # logging.basicConfig(format='%(asctime)s::%(levelname)s::%(message)s', level=logging.INFO, filename=log_file) def do_kmeans(self, dataset): """vanilla k-means - Llyod's algorithm. Input: :parameter dataset: input data in the form of a term document matrix Output: :returns labels_: a list of cluster identifiers - 1 per input document :rtype list""" # # normalization # self.svd = TruncatedSVD(self.config.NCLUSTERS) # normalizer = Normalizer(copy=False) # lsa = make_pipeline(self.svd, normalizer) # dataset = lsa.fit_transform(dataset) # finish normalization,start k-means self.model = KMeans(n_clusters=self.config.NCLUSTERS, n_init=self.config.NINIT, n_jobs=self.config.INIT_PCNT) self.model.fit_transform(dataset) return self.model.labels_ def do_minibatch_kmeans(self, dataset): """scalable version of k-means. used for large datasets. same input/output as k-means function Input: :parameter dataset: input data in the form of a term document matrix Output: :returns labels_: a list of cluster identifiers - 1 per input document :rtype list""" self.model = MiniBatchKMeans(n_clusters=self.config.NCLUSTERS, n_init=self.config.NINIT, batch_size=self.config.BATCHSIZE, max_iter=self.config.NITER, verbose=self.config) self.model.fit(dataset) return self.model.predict(dataset) def print_top_terms(self, features, model='kmeans'): """print top 'n' features(cluster centers) of each cluster Inputs: :parameter features: list of features returned by the vectorizer :parameter model: name of the model. default - kmeans""" if model == 'kmeans': for ind, term in enumerate(self.get_top_cluster_terms(features, model='kmeans')): print("Cluster #: {0} Top terms: {1}".format(ind, term)) elif model == 'lda': for ind, term in enumerate(self.get_top_cluster_terms(features, model='lda')): print("Topic #: {0} Top terms: {1}".format(ind, term)) def get_top_cluster_terms(self, features, model='kmeans', num_terms=15): """get top 'n' cluster features that constitute cluster centroids Input: :parameter features: list of features returned by the vectorizer :parameter model: name of the model. default - kmeans :parameter num_terms: # of terms to return. 
default - 15 Output: :returns cluster centroids :rtype list""" top_terms = [] if model == 'kmeans': # original_space_centroids = self.svd.inverse_transform(self.model.cluster_centers_) # order_centroids = original_space_centroids.argsort()[:, ::-1] order_centroids = self.model.cluster_centers_.argsort()[:, ::-1] for cluster_num in range(self.config.NCLUSTERS): top_terms.append(", ".join([features[i] for i in order_centroids[cluster_num, :num_terms]])) elif model == 'lda': for topic in self.model.components_: top_terms.append(", ".join([features[i] for i in topic.argsort()[:-num_terms - 1:-1]])) return top_terms def do_lda(self, dataset): """Latent Dirichlet Allocation Input: :parameter dataset: input data in the form of a term-document matrix Output: :return components_: list of topic labels for each topic :rtype list""" self.model = LatentDirichletAllocation(n_topics=self.config.NTOPICS, max_iter=self.config.NITER) self.model.fit(dataset) return self.model.components_ def do_h2o_kmeans(self, dataset, server_url): """use the h2o module to perform k-means clustering. This method delegates clustering to a H2O server instance(local or remote). A connection attempt will be made to the provided server_url before clustering is initiated. input: :param dataset: input data - term document matrix :param server_url: URL of the H2O server instance on which clustering would run output: labels_: a list of cluster identifiers - 1 per input document :raises ConnectionError""" # establish connection to H20 server try: h2o.connect(url=server_url, verbose=False) logging.info("connected to H2O server") h2o_dataframe = h2o.H2OFrame(python_obj=dataset) self.model = H2OKMeansEstimator(max_iterations=self.config.NITER, k=self.config.NCLUSTERS, init="PlusPlus", standardize=False) self.model.train(training_frame=h2o_dataframe) logging.info("modelling complete. predicting cluster membership") return self.model.predict(h2o_dataframe)["predict"].as_data_frame(use_pandas=False, header=False) except H2OConnectionError: logging.error("unable to connect to H2O server @ {0}".format(server_url)) raise ConnectionError("unable to connect to H2O server. check if server is running at specified URL")
# test_doc = [doc7, doc8]
NUM_TOPICS = 1

vectorizer = CountVectorizer(min_df=1, max_df=6, stop_words='english', lowercase=True,
                             token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(documents)

# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_topics=NUM_TOPICS, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Build a Non-Negative Matrix Factorization Model
nmf_model = NMF(n_components=NUM_TOPICS)
nmf_Z = nmf_model.fit_transform(data_vectorized)
print(nmf_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Build a Latent Semantic Indexing Model
lsi_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_Z = lsi_model.fit_transform(data_vectorized)
print(lsi_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Let's see how the first document in the corpus looks like in different topic spaces
print("LDA")
print(lda_Z[0])
# Build LDA Model
lda_model = LatentDirichletAllocation(n_components=20,         # Number of topics
                                      max_iter=10,             # Max learning iterations
                                      learning_method='online',
                                      random_state=100,        # Random state
                                      batch_size=128,          # n docs in each learning iter
                                      evaluate_every=-1,       # compute perplexity every n iters, default: Don't
                                      n_jobs=-1,               # Use all available CPUs
                                      )
lda_output = lda_model.fit_transform(data_vectorized)
print(lda_model)  # Model attributes

# LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
#                           evaluate_every=-1, learning_decay=0.7,
#                           learning_method="online", learning_offset=10.0,
#                           max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
#                           n_components=10, n_jobs=-1, n_topics=20, perp_tol=0.1,
#                           random_state=100, topic_word_prior=None,
#                           total_samples=1000000.0, verbose=0)

# Log Likelihood: Higher the better
print("Log Likelihood: ", lda_model.score(data_vectorized))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))

# See model parameters
km = KMeans(n_clusters=2)
km.fit_transform(similarity_df)
cluster_labels = km.labels_
cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel'])
pd.concat([corpus_df, cluster_labels], axis=1)

# # Topic models

# In[11]:

from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_topics=2, max_iter=100, random_state=42)
dt_matrix = lda.fit_transform(tv_matrix)
features = pd.DataFrame(dt_matrix, columns=['T1', 'T2'])
features

# ## Show topics and their weights

# In[12]:

tt_matrix = lda.components_
for topic_weights in tt_matrix:
    topic = [(token, weight) for token, weight in zip(vocab, topic_weights)]
    topic = sorted(topic, key=lambda x: -x[1])
    topic = [item for item in topic if item[1] > 0.6]
    print(topic)
    print()
    'foo bar bar bar baz foo',
    'foo foo foo bar baz',
    'blah banana',
    'cookies candy',
    'more text please',
    'hey there are more words here',
    'bananas',
    'i am a real boy',
    'boy',
    'girl'
]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data)
vocab = vectorizer.get_feature_names()

n_top_words = 5
k = 2

model = LatentDirichletAllocation(n_topics=k, random_state=100)
id_topic = model.fit_transform(X)

topic_words = {}

for topic, comp in enumerate(model.components_):
    # for the n-dimensional array "arr":
    # argsort() returns a ranked n-dimensional array of arr, call it "ranked_array",
    # which contains the indices that would sort arr in an ascending fashion
    # for the ith element in ranked_array, ranked_array[i] represents the index of the
    # element in arr that should be at the ith index in ranked_array
    # ex. arr = [3,7,1,0,3,6]
    # np.argsort(arr) -> [3, 2, 0, 4, 5, 1]
    # word_idx contains the indices in "topic" of the top n_top_words most relevant
    # to a given topic ... it is sorted ascending to begin with and then reversed (desc. now)
    word_idx = np.argsort(comp)[::-1][:n_top_words]
# First Create Term-Frequency/Inverse Doc Frequency by Review Matrix
# This requires constructing Term Freq. x Doc. matrix first
tf_idf = TfidfTransformer()
print("\nTF-IDF Parameters\n", tf_idf.get_params(), "\n")
tf_idf = tf_idf.fit_transform(tf)

# Or you can construct the TF/IDF matrix from the data
tfidf_vect = TfidfVectorizer(max_df=max_df, min_df=2, max_features=m_features,
                             analyzer=my_analyzer, ngram_range=ngram)
tf_idf = tfidf_vect.fit_transform(discussions)
print("\nTF_IDF Vectorizer Parameters\n", tfidf_vect, "\n")

lda = LatentDirichletAllocation(n_components=n_topics, max_iter=max_iter,
                                learning_method=learning_method,
                                learning_offset=learning_offset,
                                random_state=12345)
lda.fit_transform(tf_idf)
print('{:.<22s}{:>6d}'.format("Number of Reviews", tf.shape[0]))
print('{:.<22s}{:>6d}'.format("Number of Terms", tf.shape[1]))

print("\nTopics Identified using LDA with TF_IDF")
tf_features = cv.get_feature_names()
max_words = 15
desc = []
for topic_idx, topic in enumerate(lda.components_):
    message = "Topic #%d: " % topic_idx
    message += " ".join([tf_features[i] for i in topic.argsort()[:-max_words - 1:-1]])
    print(message)
    print()
    desc.append([tf_features[i] for i in topic.argsort()[:-max_words - 1:-1]])

# Extract topic probabilities
class TopicModeller(object):
    '''
    Wrapper for NMF and LDA topic models
    '''

    def __init__(self, model_type='NMF', vectorizer_type='tfidf', k_topics=8,
                 max_vocab_size=5000, min_df=20, max_df=1.0,
                 ngram_range=(1, 1), **kwargs):
        '''
        Input:
            max_vocab_size - upper bound limit to the number of features/terms
            min_df - vectorizer: min document frequency
            max_df - vectorizer: ignore words with a document frequency above %
            ngram_range - number of ngrams to search for, starting from 1.
                The lower and upper boundary of the range of n-values for
                different n-grams to be extracted
        '''
        # self.x_train = X_train  # document set as Pandas Series
        self.vectorizer = None
        self.vectorizer_type = vectorizer_type.lower()
        self.document_term_mat = None
        self.model = None
        self.model_type = model_type.lower()
        self.W = None  # Populated only for NMF model
        self.H = None  # Populated only for NMF model
        self.d2v_model = None

        token_pattern = nlp_utils.get_token_pattern()
        stop_words = nlp_utils.get_stop_words()

        for key in ('vectorizer', 'model_type'):
            if key in kwargs:
                setattr(self, key, kwargs[key])

        # for NMF
        if self.model_type == 'nmf':
            if self.vectorizer_type == 'tfidf':
                self.vectorizer = TfidfVectorizer(token_pattern=token_pattern,
                                                  min_df=min_df, max_df=max_df,
                                                  max_features=max_vocab_size,
                                                  stop_words=stop_words,
                                                  ngram_range=ngram_range)
            else:
                self.vectorizer = nlp_utils.LocalwiseVectorizer(
                    max_features=max_vocab_size, min_df=min_df,
                    max_df=max_df, ngram_range=ngram_range)
        # For LDA
        elif self.model_type == 'lda':
            # Use tf (raw term count) features for LDA.
            self.vectorizer = CountVectorizer(token_pattern=token_pattern,
                                              max_df=max_df, min_df=min_df,
                                              max_features=max_vocab_size,
                                              stop_words=stop_words,
                                              ngram_range=ngram_range)

    def vectorize(self, docs):
        '''
        Vectorize the document content: fit the vectorizer and build the
        document-term matrix.
        Input
            docs - Training document set
        Output: the fit vectorizer and cached document-term matrix
        '''
        # list of document content
        # eg, resume content for each user or job posting description content
        print('Number of documents to process: %s\n' % docs.shape)
        print("Extracting Vectorizer features...")
        t1 = time.time()
        self.document_term_mat = self.vectorizer.fit_transform(docs)
        print("- Time: %0.3fs.\n" % (time.time() - t1))

    def fit(self, docs, k_topics):
        '''
        Input
            docs - Documents to topic model
            k_topics - k number of topics to generate
        '''
        if self.document_term_mat is None:
            print('Vectorizer wasn\'t fitted. '
                  'Call your TopicModeller.vectorize first.')
            return
        print("Fitting %s model with %d documents.\n"
              "Vectorizer: \n%s" % (self.model_type, docs.shape[0],
                                    self.vectorizer))
        # for NMF
        if self.model_type == 'nmf':
            self.model = NMF(n_components=k_topics, alpha=.1, l1_ratio=.5,
                             init='nndsvd')
        # For LDA
        elif self.model_type == 'lda':
            self.model = LatentDirichletAllocation(n_components=k_topics,
                                                   max_iter=5,
                                                   learning_method='online',
                                                   learning_offset=50.,
                                                   random_state=0)
        else:
            print('Unsupported model type \'%s\'' % self.model_type)
            return

        t1 = time.time()
        W = self.model.fit_transform(self.document_term_mat)
        H = self.model.components_
        if self.model.__class__.__name__.upper() == 'NMF':
            self.W = W
            self.H = H
        print("- Time: %0.3fs.\n" % (time.time() - t1))

        self.describe_matrix_factorization_results(self.document_term_mat,
                                                   W, H, n_top_words=20)

        if self.d2v_model is None:
            self.d2v_model = nlp_utils.get_doc2vec_model(docs)

    # # Place holder
    # def fit_lda():
    #     t1 = time.time()
    #     lda.fit(document_term_mat)
    #     print("- Time: %0.3fs.\n" % (time.time() - t1))
    #
    #     print("Topics in LDA model:")
    #     tf_feature_names = self.vectorizer.get_feature_names()
    #     self.print_top_words(lda, tf_feature_names)

    def document_term_mat_toframe(self):
        all_feature_names = self.vectorizer.get_feature_names()
        dtm = self.document_term_mat.todense()
        dfv = pd.DataFrame(dtm, columns=all_feature_names)
        return dfv

    # def print_top_words(self, model, feature_names, n_top=15):
    #     for topic_idx, topic in enumerate(model.components_):
    #         message = 'Topic #%d: ' % topic_idx
    #         message += ' '.join([feature_names[i]
    #                              for i in topic.argsort()[:-n_top - 1:-1]])
    #         print(message)
    #     print()

    def reconst_mse(self, target, left, right):
        '''
        Calculate the mean squared error between the source matrix and the
        reconstruction of the matrix with W*H
        '''
        return (np.array(target - left.dot(right))**2).mean()

    def describe_matrix_factorization_results(self, document_term_mat, W, H,
                                              n_top_words=15):
        '''
        For each latent topic print the top n words associated with that topic
        TODO: print probabilities
        '''
        feature_words = self.vectorizer.get_feature_names()
        print("Reconstruction mse: %f" % (self.reconst_mse(document_term_mat,
                                                           W, H)))
        for topic_num, topic in enumerate(H):
            top_features = ', '.join([
                feature_words[i]
                for i in topic.argsort()[:-n_top_words - 1:-1]
            ])
            print("Topic %d: %s\n" % (topic_num, top_features))
        return

    def rank_terms(self):
        # get the sums over each column/term
        sums = self.document_term_mat.sum(axis=0)
        terms = self.vectorizer.get_feature_names()
        # map weights to the terms
        weights = {}
        for col, term in enumerate(terms):
            weights[term] = sums[0, col]
        # rank the terms by their weight over all documents
        return sorted(list(weights.items()), key=operator.itemgetter(1),
                      reverse=True)

    def get_doc_terms_and_scores(self, doc_index):
        '''
        Return the tfidf values for vectorized terms for a document.
        Input:
            doc_index: The row index number for the fitted document_term_matrix
        Output:
            dictionary of {doc terms: tfidf scores}
        Hint: A sorted print to use:
            for key, value in sorted(tfidf_scores.items(),
                                     key=lambda kv: (kv[1], kv[0]),
                                     reverse=True):
                print("{:<10}: {:<10}".format(key, value))
        '''
        all_feature_names = self.vectorizer.get_feature_names()
        dtm = self.document_term_mat.todense()
        doc_terms_indicies = dtm[doc_index, :].nonzero()[1]
        tfidf_scores = {
            all_feature_names[term_idx]: dtm[doc_index, term_idx]
            for term_idx in doc_terms_indicies
        }
        return tfidf_scores

    def print_W_probs(self, W):
        '''
        Input
            W  NMF matrix
        '''
        probs = (W / W.sum(axis=1, keepdims=True)).flatten()
        ordered = np.argsort(probs)[::-1]
        for idx in ordered:
            print('Topic %s: %0.3f' % (idx, probs[idx]))

    def get_normalized_probs(self, topic_weights):
        '''
        Return the normalized topic cluster weights for a given row vector
        '''
        topic_weights = topic_weights.flatten()
        probs = (topic_weights / topic_weights.sum())
        return probs

    def get_top_topics_and_topic_probs(self):
        '''
        Generate the probability of each topic for each row (eg, job posting)
        in W, and add the top topic and probability and return each as a list,
        (for example to be used as new columns added to a dataframe)
        '''
        # For each row, get the topic weights, normalize, order by
        # weight value, and store in a list to add to the dataframe
        top_topics = []
        top_topic_weights = []
        for row_idx in range(self.W.shape[0]):
            W = self.W[row_idx]
            probs = self.get_normalized_probs(W)
            ordered_idxs = np.argsort(probs)[::-1]
            top_topics.append(ordered_idxs[0])
            top_topic_weights.append(probs[ordered_idxs[0]])
        return (top_topics, top_topic_weights)

    def custom_nmf(self, document_term_mat, k_topics=15, n_iterations=50,
                   max_rows=20000, eps=1e-6):
        '''
        Build the W and H matrix with least squares, clip negative values to 0
        k_topics is also known as the number of components
        '''
        # n_rows = document_term_mat.shape[0]
        n_rows = max_rows
        n_cols = document_term_mat.shape[1]
        W = rand(n_rows * k_topics).reshape([n_rows, k_topics])
        H = rand(k_topics * n_cols).reshape([k_topics, n_cols])
        # linalg.lstsq doesn't work on sparse mats
        dense_document_term_mat = document_term_mat[0:n_rows].todense()
        print('dense_document_term_mat shape: ',
              dense_document_term_mat.shape)
        for i in range(n_iterations):
            print('iteration', i)
            H = np.linalg.lstsq(W, dense_document_term_mat)[0].clip(eps)
            W = np.linalg.lstsq(H.T, dense_document_term_mat.T)[0].clip(eps).T
        return np.array(W), np.array(H)

    def classify_training_docs(self, doc, display=True):
        '''
        Using the trained model to label each source doc used in training
        '''
        pass

    def classify_new_doc(self, doc, display=True):
        '''
        Classify a new document using the fit model (NMF, LDA, or other).
        Input
            doc - string
        Output
            Dictionary of topics and their weights
            Optional output on (True) by default
        '''
        if not self.model:
            print('A model has not been fit yet.')
            return None
        if not isinstance(doc, str):
            print('Input document must be a string')
            return None

        # Using NMF
        # TODO word2Vec
        document_term_mat = self.vectorizer.transform([doc])
        W = self.model.transform(document_term_mat)
        # H = self.model.components_
        probs = (W / W.sum(axis=1, keepdims=True)).flatten()
        ordered = np.argsort(probs)[::-1]
        topic_dict = {}
        for idx in ordered:
            topic_dict[idx] = probs[idx]
            if display:
                print('Topic %s: %0.3f' % (idx, probs[idx]))
        return topic_dict
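# Illustrative usage of the TopicModeller defined above (not part of the original
# source). It assumes the same environment as the class: pandas available and the
# project's nlp_utils module importable, since TopicModeller calls it internally.
docs = pd.Series([
    "first example document about topic modelling",
    "second example document about matrix factorization",
])
tm = TopicModeller(model_type='lda', vectorizer_type='tfidf',
                   max_vocab_size=5000, min_df=1)
tm.vectorize(docs)                # fit the vectorizer, build the document-term matrix
tm.fit(docs, k_topics=2)          # fit LDA and print the top words per topic
print(tm.classify_new_doc("an unseen document about topics"))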
lda.fit(corpBodySpaCy)
score = lda.score(corpBodySpaCy)
perplexity = lda.perplexity(corpBodySpaCy)
print(n, score, perplexity)
lda_eval.append({'topics': n, 'score': score, 'perplexity': perplexity})

for item in lda_eval:
    print(item)

# Best number of topics from the best vectorizer
lda15 = LatentDirichletAllocation(n_topics=15, max_iter=5,
                                  learning_method='online',
                                  learning_offset=50., random_state=0)
tf_trans = lda15.fit_transform(corpBodySpaCy)
topics = pd.DataFrame(tf_trans)
ldaTopics = topics.idxmax(axis=1)
blogs['ldaTopics'] = ldaTopics
topics.iloc[1]

%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

topics.iloc[1]
def compute_latent_vectors(self, col2, df) -> np.ndarray: document_term_matrix = self.create_document_term_matrix(df, col2) transformer = LatentDirichletAllocation(n_components=5, learning_method="online", random_state=99) return transformer.fit_transform(document_term_matrix)
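# For context (not from the original source): create_document_term_matrix is not shown
# in this excerpt. A hypothetical stand-in, assuming it simply count-vectorizes the
# text column named by `col2`, might look like this:
from sklearn.feature_extraction.text import CountVectorizer

def create_document_term_matrix_sketch(df, col2):
    # Bag-of-words counts are the kind of input LatentDirichletAllocation expects.
    vectorizer = CountVectorizer(stop_words="english")
    return vectorizer.fit_transform(df[col2].astype(str))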
def gene_var_text_relation(): print('Loading gene/variation text...') train_gene_text = pd.read_csv('data/intermediate/train_gene_text', sep='|') train_var_text = pd.read_csv('data/intermediate/train_variation_text', sep='|') test_gene_text = pd.read_csv('data/intermediate/test_gene_text', sep='|') test_var_text = pd.read_csv('data/intermediate/test_variation_text', sep='|') print('train_gene_text.shape:', train_gene_text.shape, 'train_var_text.shape:', train_var_text.shape, 'test_gene_text.shape:', test_gene_text.shape, 'test_var_text.shape:', test_var_text.shape) gene_text = pd.concat((train_gene_text, test_gene_text), axis=0, ignore_index=True) gene_text.columns = ['Entity', 'Text'] gps = gene_text.groupby('Entity') entity = [] text = [] for val, gp in gps: if gp.shape[0] != 1: entity.append(gp['Entity'].values[0]) text.append(gp['Text'].sum()) for e in entity: gene_text = gene_text[gene_text['Entity'] != e] gene_text = gene_text.append(pd.DataFrame({ 'Entity': entity, 'Text': text }), ignore_index=True) del entity, text, gps var_text = pd.concat((train_var_text, test_var_text), axis=0, ignore_index=True) var_text.columns = ['Entity', 'Text'] gps = var_text.groupby('Entity') entity = [] text = [] for val, gp in gps: if gp.shape[0] != 1: entity.append(gp['Entity'].values[0]) text.append(gp['Text'].sum()) for e in entity: var_text = var_text[var_text['Entity'] != e] var_text = var_text.append(pd.DataFrame({ 'Entity': entity, 'Text': text }), ignore_index=True) del entity, text, gps gene_var_text = pd.concat((gene_text, var_text), axis=0, ignore_index=True) print(gene_text.shape, var_text.shape, gene_var_text.shape) print('Applying document level tfidf + svd...') tfidf_word_vector = TfidfVectorizer(strip_accents='unicode', ngram_range=(1, 3), stop_words='english') tfidf_svd = TruncatedSVD(n_components=50, n_iter=25, random_state=12) text_tfidf = tfidf_word_vector.fit_transform(gene_var_text['Text'].values) print('text_tfidf:', text_tfidf.shape) text_doc_svd = tfidf_svd.fit_transform(text_tfidf) print('text_doc_svd:', text_doc_svd.shape) pd.concat( (gene_var_text['Entity'], pd.DataFrame(data=text_doc_svd)), axis=1, ignore_index=True).to_csv('data/intermediate/gv_text_doc_svd.csv', header=False, index=False) print('Applying document level tfidf + nmf...') tfidf_nmf = NMF(n_components=60) text_doc_nmf = tfidf_nmf.fit_transform(text_tfidf) print('text_doc_nmf:', text_doc_nmf.shape) pd.concat( (gene_var_text['Entity'], pd.DataFrame(data=text_doc_nmf)), axis=1, ignore_index=True).to_csv('data/intermediate/gv_text_doc_nmf.csv', header=False, index=False) del text_tfidf print('Applying sentence level tfidf(word/char) + svd...') sent_win = np.zeros((gene_var_text.shape[0], ), dtype=object) for i, text in enumerate(gene_var_text['Text'].tolist()): sent_win[i] = ' '.join([sent for sent in sent_tokenize(text)]) tfidf_char_vector = TfidfVectorizer(strip_accents='unicode', analyzer='char', ngram_range=(1, 8), stop_words='english') word = tfidf_word_vector.fit_transform(sent_win) print('word.shape:', word.shape) word_svd = tfidf_svd.fit_transform(word) print('word_svd.shape:', word_svd.shape) del word char = tfidf_char_vector.fit_transform(sent_win) print('char.shape:', char.shape) del sent_win char_svd = tfidf_svd.fit_transform(char) print('char_svd.shape:', char_svd.shape) del char sent_tfidf_word_char_svd = np.concatenate((word_svd, char_svd), axis=1) del word_svd, char_svd print('sent_tfidf_word_char_svd:', sent_tfidf_word_char_svd.shape) pd.concat( (gene_var_text['Entity'], 
pd.DataFrame(data=sent_tfidf_word_char_svd)), axis=1, ignore_index=True).to_csv( 'data/intermediate/gv_sent_tfidf_word_char_svd.csv', header=False, index=False) print('Extracting tf features on gene text for LDA...') count_vector = CountVectorizer(analyzer='word', stop_words='english') gene_train_tf = count_vector.fit_transform(train_gene_text['Text'].values) gene_test_tf = count_vector.transform(test_gene_text['Text'].values) print('gene_train_tf:', gene_train_tf.shape) print('gene_test_tf:', gene_test_tf.shape) print('Applying Latent Dirichlet Allocation on gene text...') lda_vector = LatentDirichletAllocation(n_components=50) gene_train_lda = lda_vector.fit_transform(gene_train_tf) gene_test_lda = lda_vector.transform(gene_test_tf) print('gene_train_lda:', gene_train_lda.shape) print('gene_test_lda:', gene_test_lda.shape) del gene_train_tf, gene_test_tf gene_lda = np.concatenate((gene_train_lda, gene_test_lda), axis=0) del gene_train_lda, gene_test_lda gene_lda_df = pd.concat( (pd.concat([train_gene_text['Gene'], test_gene_text['Gene']], axis=0, ignore_index=True), pd.DataFrame(data=gene_lda)), axis=1, ignore_index=True) # merge same entities gps = gene_lda_df.groupby(0) entity = [] vec = [] for val, gp in gps: if gp.shape[0] != 1: entity.append(gp[0].values[0]) vec.append(sum(gp.values[:, 1:].astype(float))) for e in entity: gene_lda_df = gene_lda_df[gene_lda_df[0] != e] gene_lda_df = gene_lda_df.append( pd.concat([pd.DataFrame(data=entity), pd.DataFrame(data=vec)], axis=1, ignore_index=True)) del entity, vec, gps gene_lda_df.to_csv('data/intermediate/gv_gene_tf_lda50.csv', header=False, index=False) del gene_lda print('Extracting tf features on variation text for LDA...') var_train_tf_feats = count_vector.fit_transform( train_var_text['Text'].values) var_test_tf_feats = count_vector.transform(test_var_text['Text'].values) print('var_train_tf_feats:', var_train_tf_feats.shape) print('var_test_tf_feats:', var_test_tf_feats.shape) print('Applying Latent Dirichlet Allocation on variation text...') var_train_lda_feats = lda_vector.fit_transform(var_train_tf_feats) var_test_lda_feats = lda_vector.transform(var_test_tf_feats) print('var_train_lda_feats:', var_train_lda_feats.shape) print('var_test_lda_feats:', var_test_lda_feats.shape) del var_train_tf_feats, var_test_tf_feats var_lda = np.concatenate((var_train_lda_feats, var_test_lda_feats), axis=0) del var_train_lda_feats, var_test_lda_feats var_lda_df = pd.concat( (pd.concat([train_var_text['Variation'], test_var_text['Variation']], axis=0, ignore_index=True), pd.DataFrame(data=var_lda)), axis=1, ignore_index=True) # merge same entities gps = var_lda_df.groupby(0) entity = [] vec = [] for val, gp in gps: if gp.shape[0] != 1: entity.append(gp[0].values[0]) vec.append(sum(gp.values[:, 1:].astype(float))) for e in entity: var_lda_df = var_lda_df[var_lda_df[0] != e] var_lda_df = var_lda_df.append( pd.concat([pd.DataFrame(data=entity), pd.DataFrame(data=vec)], axis=1, ignore_index=True)) del entity, vec, gps var_lda_df.to_csv('data/intermediate/gv_var_tf_lda50.csv', header=False, index=False) del var_lda print('Applying TF custom idf feature on gene text...') gene_dic = _get_tf_dic(train_gene_text['Text'].values, test_gene_text['Text'].values, flag='gene') _, gene_idf_list = document_mining._word_occur_cls(gene_dic) gene_tfidf = document_mining._get_tfidf(gene_text['Text'].values, gene_dic, gene_idf_list) pd.concat((gene_text['Entity'], pd.DataFrame(data=gene_tfidf)), axis=1, ignore_index=True).to_csv( 
'data/intermediate/gv_gene_tf_custom_idf.csv', header=False, index=False) print('Applying TF custom idf feature on variation text...') var_dic = _get_tf_dic(train_var_text['Text'].values, test_var_text['Text'].values, flag='variation') _, var_idf_list = document_mining._word_occur_cls(var_dic) var_tfidf = document_mining._get_tfidf(var_text['Text'].values, var_dic, var_idf_list) pd.concat( (var_text['Entity'], pd.DataFrame(data=var_tfidf)), axis=1, ignore_index=True).to_csv('data/intermediate/gv_var_tf_custom_idf.csv', header=False, index=False) del gene_dic, var_dic, gene_idf_list, var_idf_list, gene_tfidf, var_tfidf print('Applying TF custom idf feature on built gene/var dictionary...') gene_dic = set([ line.rstrip('\n') for line in open('data/intermediate/gene_tf_unique_dict_all.txt', 'r') ]) var_dic = set([ line.rstrip('\n') for line in open( 'data/intermediate/variation_tf_unique_dict_all.txt', 'r') ]) gene_var_dic_intxn = set(gene_dic).intersection(set(var_dic)) gene_unique_dic = list(gene_dic - gene_var_dic_intxn) _, gene_idf_list = document_mining._word_occur_cls(gene_unique_dic) gene_tfidf = document_mining._get_tfidf(gene_text['Text'].values, gene_unique_dic, gene_idf_list) pd.concat((gene_text['Entity'], pd.DataFrame(data=gene_tfidf)), axis=1, ignore_index=True).to_csv( 'data/intermediate/gv_gene_unique_tf_custom_idf.csv', header=False, index=False) var_unique_dic = list(var_dic - gene_var_dic_intxn) _, var_idf_list = document_mining._word_occur_cls(var_unique_dic) var_tfidf = document_mining._get_tfidf(var_text['Text'].values, var_unique_dic, var_idf_list) pd.concat((var_text['Entity'], pd.DataFrame(data=var_tfidf)), axis=1, ignore_index=True).to_csv( 'data/intermediate/gv_var_unique_tf_custom_idf.csv', header=False, index=False) _, idf_list = document_mining._word_occur_cls(list(gene_var_dic_intxn)) tfidf = document_mining._get_tfidf(gene_var_text['Text'].values, list(gene_var_dic_intxn), idf_list) pd.concat((gene_var_text['Entity'], pd.DataFrame(data=tfidf)), axis=1, ignore_index=True).to_csv( 'data/intermediate/gv_gene_var_intxn_tf_custom_idf.csv', header=False, index=False)
if len(line) == 0:
    print("File read finished")
    break
line = re.sub("[^a-zA-Z ]", "", line)
tokenized = word_tokenize(line)
# Filter unwanted tokens; removing items from a list while iterating over it
# skips elements, so build a new list instead.
tokenized = [word for word in tokenized if word not in ("CNN", "highlight")]
cnnTokenized.append(tokenized)
sent = ""
for w in tokenized:
    sent += (w + " ")
cnnDoc.append(sent)

cnnTF_IDF = vectorizer.fit_transform(cnnDoc)
with open("cnn-tf-idf.pkl", 'wb') as handle:
    pickle.dump(cnnTF_IDF, handle)
vocabs = vectorizer.get_feature_names()
with open("cnn-terms.pkl", 'wb') as handle:
    pickle.dump(vocabs, handle)

lda = LatentDirichletAllocation(n_components=20)
lda.fit_transform(cnnTF_IDF)
topics = lda.components_
for index, topic in enumerate(topics):
    print("Topic %d : " % (index + 1), end="")
    print([(vocabs[i], topic[i].round(5)) for i in topic.argsort()[:-6:-1]])
def generate_lda_feature(x, topic) -> pd.DataFrame: log.info(f"Generating lda features from x:{x.shape} topics:{topic}") lda = LatentDirichletAllocation(n_components=topic, max_iter=10, random_state=0) dt_matrix = lda.fit_transform(x) return pd.DataFrame(dt_matrix)
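# Illustrative call for generate_lda_feature above (not from the original source).
# The function references a module-level `log`, assumed here to be a standard logger.
import logging
from sklearn.feature_extraction.text import CountVectorizer

log = logging.getLogger(__name__)
texts = ["lda turns word counts into topics", "topics summarize documents"]
counts = CountVectorizer().fit_transform(texts)
topic_features = generate_lda_feature(counts, topic=2)  # one DataFrame column per topic
print(topic_features.shape)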
train_svd = svd2.fit_transform(train_tfidf)
train_svd = pd.DataFrame(train_svd)
train_svd = sc.fit_transform(train_svd)
test_svd = svd2.transform(test_tfidf)
test_svd = pd.DataFrame(test_svd)
# Scale the test split with the scaler fitted on the training split;
# re-fitting on test data would leak test statistics.
test_svd = sc.transform(test_svd)

# Words - LDiA
Counter = CountVectorizer(tokenizer=italian_tokenizer)
cv_bow = pd.DataFrame(Counter.fit_transform(raw_documents=cv_text).toarray())
dev_bow = pd.DataFrame(Counter.transform(raw_documents=dev_text).toarray())

ldia = LDiA(n_components=32, learning_method="batch")
cv_ldia = ldia.fit_transform(cv_bow)
cv_ldia = pd.DataFrame(cv_ldia)
cv_ldia = sc.fit_transform(cv_ldia)
dev_ldia = ldia.transform(dev_bow)
dev_ldia = pd.DataFrame(dev_ldia)
dev_ldia = sc.transform(dev_ldia)

Counter2 = CountVectorizer(tokenizer=italian_tokenizer)
train_bow = pd.DataFrame(
    Counter2.fit_transform(raw_documents=df_train).toarray())
test_bow = pd.DataFrame(Counter2.transform(raw_documents=test_text).toarray())
ldia2 = LDiA(n_components=32, learning_method="batch")
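# The LDiA name used above is presumably an alias for sklearn's
# LatentDirichletAllocation; the import is not shown in this excerpt, but it
# would typically read:
from sklearn.decomposition import LatentDirichletAllocation as LDiA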
                                stop_words='english')
tf = tf_vectorizer.fit_transform(text)
# todo rationalize this heuristic for the expected number of clusters
true_k = int(tf.shape[0] * tf.shape[1] / tf.nnz)
logger.debug(
    'using the TF/CountVectorizer data we expect %d clusters' % true_k)
n_topics = true_k
lda = LatentDirichletAllocation(learning_method='online', learning_offset=50.,
                                max_iter=5, n_topics=n_topics,
                                random_state=random_state)
# fit_transform both fits the model and returns the document-topic matrix,
# so a separate fit() call is not needed.
lda_results = lda.fit_transform(tf)
tf_feature_names = tf_vectorizer.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    logger.debug('Topic #%d:' % topic_idx)
    logger.debug(' '.join([
        '[' + tf_feature_names[i] + ']'
        for i in topic.argsort()[:-n_top_words - 1:-1]
    ]))

# let's make a grid of topics and words
if False:
    values = lda.components_.copy()
    # todo find a good threshold
    # threshold = 0.75
    # values[values < threshold] = 0
    t4 = values.min()
    t5 = values.max()
fr = open("Med5Sept.csv", "r") if fr.mode == 'r': contenidofiltrado = fr.readlines() print(contenidofiltrado) count_vect = CountVectorizer(max_df=0.1, min_df=0, stop_words=spanish_stopwords) # 'spanish') # doc_term_matrix = count_vect.fit_transform(reviews_datasets['description'].values.astype('U')) # doc_term_matrix = count_vect.fit_transform(reviews_datasets['text'].values.astype('U')) doc_term_matrix = count_vect.fit_transform(contenidofiltrado) NUM_TOPICS = 6 LDA = LatentDirichletAllocation(n_components=NUM_TOPICS, random_state=42) lda_Z = LDA.fit_transform(doc_term_matrix) sentencia = [] def print_topics(model, vectorizer, top_n=11): # 11 Sintagmas for idx, topic in enumerate(model.components_): # print("Topic %d:" % (idx)) oracion = ' '.join([(vectorizer.get_feature_names()[i]) for i in topic.argsort()[:-top_n - 1:-1]]) # print([(vectorizer.get_feature_names()[i] ) # for i in topic.argsort()[:-top_n - 1:-1]]) # print(oracion) sentencia.append(oracion)
#%% Import libs and frameworks import pandas as pd import numpy as np from sklearn.feature_extraction.text import CountVectorizer from sklearn.decomposition import LatentDirichletAllocation #%% Latent Dirichlet Allocation df = pd.read_csv("movie_data.csv", encoding="utf-8") count = CountVectorizer(stop_words="english", max_df=0.1, max_features=5000) X = count.fit_transform(df["review"].values) lda = LatentDirichletAllocation(n_components=10, random_state=123, learning_method="online") X_topics = lda.fit_transform(X) print(lda.components_.shape) n_top_words = 5 feature_names = count.get_feature_names() for topic_idx, topic in enumerate(lda.components_): print("Topic %d" % (topic_idx + 1)) print(" ".join( [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
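# A short follow-on sketch (not part of the original snippet): X_topics holds the
# per-review topic distribution, so the reviews most representative of a given topic
# can be pulled out by sorting on that topic's column. Assumes `df` and `X_topics`
# from the block above.
topic_of_interest = 0                                   # e.g. the first topic
top_reviews = X_topics[:, topic_of_interest].argsort()[::-1]
for rank, review_idx in enumerate(top_reviews[:3]):
    print("\nReview #%d (topic %d weight %.3f):" %
          (rank + 1, topic_of_interest + 1, X_topics[review_idx, topic_of_interest]))
    print(df["review"].iloc[review_idx][:300], "...")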
print("") print("==> DFS", prefix) print_results_for_field(dataset, Xt_full, "DFS", prefix) print("") print("") print("==> posOutcome", prefix) print_results_for_field(dataset, Xt_full, "posOutcome", prefix) print("") print("") treat_dataset = read_treat_dataset() combat_dataset = read_combat_dataset() X_full, _ = prepare_full_dataset(drop_trea(combat_dataset)) pam_types_cat_dataset = read_pam_types_cat_dataset() assert all(pam_types_cat_dataset['patient_ID'] == combat_dataset['patient_ID']) for n_cluster in [1, 5,10,20, 100, 200]: lda = LatentDirichletAllocation(n_components=n_cluster) Xt_full = lda.fit_transform(X_full - np.min(X_full)) print_results(pam_types_cat_dataset, Xt_full, "nc" + str(n_cluster))
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(textVector)

from sklearn.decomposition import LatentDirichletAllocation

n_topics = 9
lda = LatentDirichletAllocation(n_topics=n_topics)
lda.fit(textVector)
topicWordMatrix = lda.components_

import numpy
prefixMatrix = numpy.where(topicWordMatrix >= 0, "", "^")
sort = numpy.argsort(-1 * numpy.abs(topicWordMatrix), axis=1)[:, 0:10]
prefixs = []
for i in range(n_topics):
    prefixs.append(prefixMatrix[i, sort[i]])
keywords = pandas.Index(countVectorizer.get_feature_names())[sort].values
print(prefixs + keywords)

# Use transform() on the already-fitted model; calling fit_transform() here would
# refit LDA from a new random initialization, so the topic columns would no longer
# match the keywords printed above.
textTopicMatrix = lda.transform(textVector)
corpos['topic'] = textTopicMatrix.argmax(axis=1)
pandas.crosstab(corpos['class'], corpos['topic'])
df["content_cutted"] = df.content.apply(chinese_word_cut) print(type(df.content_cutted)) #numpy.savetxt('new1.csv',df.content_cutted, delimiter = ',') n_features = 10000 #特征词数量 tf_vectorizer = CountVectorizer(strip_accents='unicode', max_features=n_features, stop_words='english', max_df=0.5, min_df=10) tf = tf_vectorizer.fit_transform(df.content_cutted) n_topics = 30 #话题数量 lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=50, learning_method='online', learning_offset=50., random_state=0) cm = lda.fit_transform(tf) n_top_words = 30 #打印的话题数量 tf_feature_names = tf_vectorizer.get_feature_names() print_top_words(lda, tf_feature_names, n_top_words) numpy.savetxt('new2.csv', cm, delimiter=',') #聚类20个 kmeans = KMeans(n_clusters=32, random_state=0).fit(cm) kresults = pd.DataFrame(data=numpy.array(kmeans.labels_)) print(type(names), type(kresults)) newre = pd.concat([names, kresults], axis=1) #newre = names.append(kresults) newre.to_csv('new3.csv', encoding="utf-8", index=False)