def run_lsi_gensim(pp_descriptions, filtered_dcm, verbose=False):
    """as in [VISR12: 4.2.1]"""
    # TODO options here:
    # * if it should filter AFTER the LSI
    if verbose:
        filtered_dcm.show_info(descriptions=pp_descriptions)
    if get_setting("DCM_QUANT_MEASURE") != "binary":
        logger.warning("VISR12 say it works best with binary!")
    filtered_dcm.add_pseudo_keyworddocs()
    dictionary = corpora.Dictionary([list(filtered_dcm.all_terms.values())])
    print("Start creating the LSA-Model with MORE topics than terms...")
    lsamodel_manytopics = LsiModel(filtered_dcm.dtm, num_topics=len(filtered_dcm.all_terms) * 2, id2word=dictionary)
    print("Start creating the LSA-Model with FEWER topics than terms...")
    lsamodel_lesstopics = LsiModel(filtered_dcm.dtm, num_topics=len(filtered_dcm.all_terms) // 10, id2word=dictionary)
    print()
    import matplotlib.cm
    import matplotlib.pyplot as plt
    # TODO use the mpl_tools here as well to also save the plot!
    plt.imshow(lsamodel_lesstopics.get_topics()[:100, :200],
               vmin=lsamodel_lesstopics.get_topics().min(),
               vmax=lsamodel_lesstopics.get_topics().max(),
               cmap=matplotlib.cm.get_cmap("coolwarm"))
    plt.show()
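# Minimal standalone sketch of the same many-vs-few-topics comparison using
# plain gensim objects, without the project-specific DCM classes above.
# `toy_docs` is a made-up corpus used purely for illustration.
from gensim.corpora import Dictionary
from gensim.models import LsiModel

toy_docs = [["cat", "dog", "pet"], ["dog", "bone"], ["cat", "milk"]]
toy_dict = Dictionary(toy_docs)
toy_bow = [toy_dict.doc2bow(d) for d in toy_docs]
n_terms = len(toy_dict)
# More topics than terms keeps (almost) all variance; fewer compresses.
lsi_many = LsiModel(toy_bow, num_topics=n_terms * 2, id2word=toy_dict)
lsi_few = LsiModel(toy_bow, num_topics=max(1, n_terms // 10), id2word=toy_dict)
print(lsi_few.get_topics().shape)  # (num_topics, n_terms)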
class GensimLatentSemanticAnalysis(GensimProjectionsWordEmbeddingLearner):
    """
    Implementation of the abstract class GensimProjectionsWordEmbeddingLearner
    that performs latent semantic analysis using Gensim.
    """

    def __init__(self, reference: str = None, auto_save: bool = True, **kwargs):
        super().__init__(reference, auto_save, ".model", **kwargs)

    def fit_model(self, corpus: List):
        """
        Creates the model using Gensim Latent Semantic Analysis.
        The model is not returned; it is stored in the 'model' class attribute.
        """
        dictionary = Dictionary(corpus)
        word_docs_matrix = [dictionary.doc2bow(doc) for doc in corpus]
        self.model = LsiModel(word_docs_matrix, id2word=dictionary, **self.additional_parameters)

    def load_model(self):
        return LsiModel.load(self.reference)

    def get_vector_size(self) -> int:
        return len(self.model.get_topics())

    def __str__(self):
        return "GensimLatentSemanticAnalysis"

    def __repr__(self):
        return "< GensimLatentSemanticAnalysis : model = " + str(self.model) + " >"
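# Hedged usage sketch for the class above. It assumes the
# GensimProjectionsWordEmbeddingLearner base class accepts a save path as
# `reference` and exposes extra kwargs (here `num_topics`) to LsiModel via
# `additional_parameters`; both are assumptions about the base class.
learner = GensimLatentSemanticAnalysis(reference="lsa_model", num_topics=100)
learner.fit_model([["hello", "world"], ["world", "peace"]])
print(learner.get_vector_size())  # number of fitted topics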
def build_and_save_lsi_model():
    print('Connecting to the database...')
    sentences = SentencesIterator(tokens_generator)
    # Corpus as lists of dictionary ids, in memory.
    # Can be turned into an iterable as done with the others, if needed.
    dct = Dictionary(sentences)
    print('Calculating the LSI model...')
    bow_corpus = [dct.doc2bow(s) for s in sentences]
    model = LsiModel(bow_corpus, id2word=dct)
    model.print_debug()
    model.save(LSI_MODEL_FILE)
    for t in range(model.get_topics().shape[0]):
        print(t)
        print(model.print_topic(t))
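# Hedged companion sketch: loading the model saved above and projecting a new
# document. `LSI_MODEL_FILE` is carried over from the snippet; note that `dct`
# (the training-time Dictionary) must also be persisted and reloaded, which
# the function above does not do -- that step is an assumption here.
from gensim.models import LsiModel

model = LsiModel.load(LSI_MODEL_FILE)
bow = dct.doc2bow(["some", "new", "tokens"])
print(model[bow])  # list of (topic_id, weight) pairs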
def train_lsa(docs: Iterable, outputFolder: str):
    docs = list(docs)
    id2word = Dictionary(docs)
    id2word.filter_extremes(no_below=20, no_above=0.1, keep_n=1000000)
    corpus = [id2word.doc2bow(doc) for doc in docs]
    corpus = log_entropy_norm(corpus)
    print("Starting training...")
    lsa = LsiModel(corpus=corpus, id2word=id2word, num_topics=300)
    path = outputFolder + "/lsa.model"
    lsa.save(outputFolder + "/lsa.bin")  # native gensim binary
    # Also export the term-topic matrix as a word2vec-style text file.
    matrix = np.transpose(lsa.get_topics())
    with open(path, "wt", encoding='utf-8') as f:
        f.write("{} {}\n".format(np.size(matrix, 0), np.size(matrix, 1)))
        for idx in range(np.size(matrix, 0)):
            f.write(id2word[idx] + " " + " ".join([str(x) for x in matrix[idx]]) + "\n")
    print("Model saved to", path)
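# `log_entropy_norm` above is project-specific and not shown. A plausible
# stand-in using gensim's own LogEntropyModel would look like this; it is an
# assumption, not necessarily the original implementation.
from gensim.models import LogEntropyModel

def log_entropy_norm(bow_corpus):
    # Fit log-entropy weighting on the BoW corpus and apply it to every doc.
    log_ent = LogEntropyModel(bow_corpus)
    return [log_ent[doc] for doc in bow_corpus]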
def __init__(self, embedding_dictionary_file, word_to_index_file, docs_tokens, doc_len, word_len, iters):
    self.time = 0.
    if embedding_dictionary_file is not None and word_to_index_file is not None:
        super(LSAEmbedding, self).get_from_files(embedding_dictionary_file, word_to_index_file, doc_len, self)
    else:
        self.time = time()
        word_dictionary = Dictionary(docs_tokens)
        word_to_index = word_dictionary.token2id
        docs_term_matrix = [word_dictionary.doc2bow(tokens) for tokens in docs_tokens]
        tfidfmodel = TfidfModel(docs_term_matrix, id2word=word_dictionary)
        corpus = [tfidfmodel[doc] for doc in docs_term_matrix]
        lsamodel = LsiModel(corpus, num_topics=word_len, id2word=word_dictionary, power_iters=iters)
        self.time = time() - self.time
        embedding_matrix = lsamodel.get_topics().transpose()
        embedding_dictionary = {}
        embedding_dim = None
        for word, i in word_to_index.items():
            embedding_dictionary[word] = embedding_matrix[i]
            if embedding_dim is None:
                embedding_dim = len(embedding_matrix[i])
        # print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words))
        # one_hot = OneHot(docs_tokens, max_doc_len=self.doc_len)
        # word_to_index = one_hot.get_word_indexes()
        super(LSAEmbedding, self).get_from_data(embedding_dictionary, embedding_dim, word_to_index, doc_len, self)
    self.name = 'lsa'
    self.iters = iters
def __init__(self, docs_tokens, emb_dim, iters):
    self.time = 0.
    self.time = time()
    word_dictionary = Dictionary(docs_tokens)
    word_to_index = word_dictionary.token2id
    docs_term_matrix = [word_dictionary.doc2bow(tokens) for tokens in docs_tokens]
    tfidfmodel = TfidfModel(docs_term_matrix, id2word=word_dictionary)
    corpus = [tfidfmodel[doc] for doc in docs_term_matrix]
    lsamodel = LsiModel(corpus, num_topics=emb_dim, id2word=word_dictionary, power_iters=iters)
    self.time = time() - self.time
    embedding_matrix = lsamodel.get_topics().transpose()
    embedding_dictionary = {}
    embedding_dim = None
    for word, i in word_to_index.items():
        embedding_dictionary[word] = embedding_matrix[i]
        if embedding_dim is None:
            embedding_dim = len(embedding_matrix[i])
    super(EmbeddingModel, self).get_from_data(embedding_dictionary, embedding_dim, word_to_index, self)
    self.name = 'lsa'
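# The two constructors above share the same core recipe: TF-IDF -> LSI ->
# transpose(get_topics()) as a word-embedding matrix. A minimal standalone
# sketch of just that recipe; the toy corpus is an assumption.
from gensim.corpora import Dictionary
from gensim.models import LsiModel, TfidfModel

docs_tokens = [["cat", "dog"], ["dog", "bone"], ["cat", "milk"]]
d = Dictionary(docs_tokens)
bow = [d.doc2bow(t) for t in docs_tokens]
tfidf = TfidfModel(bow, id2word=d)
lsi = LsiModel([tfidf[x] for x in bow], num_topics=2, id2word=d, power_iters=2)
emb = lsi.get_topics().T  # shape: (vocab_size, num_topics)
print({d[i]: emb[i] for i in range(len(d))})  # word -> embedding vector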
    labels_topics[label] = topic_with_id

with open("./data/20180101_0815/labels_ids.dump", "wb") as f:
    pickle.dump(labels_ids, f)
with open("./data/20180101_0815/labels_corpus.dump", "wb") as f:
    pickle.dump(labels_corpus, f)
with open("./data/20180101_0815/labels_topics.dump", "wb") as f:
    pickle.dump(labels_topics, f)

labels_topic_vec = {}
for label, many_topic in labels_topics.items():
    if label not in labels_topic_vec.keys():
        labels_topic_vec[label] = []
    topic_vec_list = []
    for topic_id, weight in many_topic:
        w_vector = lsi_model.get_topics()[topic_id] * weight
        topic_vec_list.append(w_vector)
    labels_topic_vec[label] = np.average(topic_vec_list, axis=0)

with open("./data/20180101_0815/labels_topic_vec.dump", "wb") as f:
    pickle.dump(labels_topic_vec, f)

labels_words_freq = {}
for label, vec in labels_topic_vec.items():
    if label not in labels_words_freq.keys():
        labels_words_freq[label] = {}
    for id_, val in enumerate(vec):
        labels_words_freq[label][dict_2d[id_]] = abs(val)

with open("./data/20180101_0815/labels_words_freq.dump", "wb") as f:
    pickle.dump(labels_words_freq, f)
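# Context sketch for the dangling first line above (an assumption about the
# missing loop): `topic_with_id` is presumably the list of (topic_id, weight)
# pairs gensim returns when a label's BoW vector is mapped through the model.
# The shape of `labels_corpus` is likewise an assumption here.
for label, doc_bow in labels_corpus.items():
    topic_with_id = lsi_model[doc_bow]  # [(topic_id, weight), ...]
    labels_topics[label] = topic_with_id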
class AnnStream:
    def __init__(self, data, k: int, n_cluster: int, reduction_method: str,
                 dims: int, loadings: np.ndarray, use_for_pca: np.ndarray,
                 mu: np.ndarray, sigma: np.ndarray, ann_metric: str,
                 ann_efc: int, ann_ef: int, ann_m: int, nthreads: int,
                 ann_parallel: bool, rand_state: int, do_kmeans_fit: bool,
                 disable_scaling: bool, ann_idx):
        self.data = data
        self.k = k
        if self.k >= self.data.shape[0]:
            self.k = self.data.shape[0] - 1
        self.nClusters = max(n_cluster, 2)
        self.dims = dims
        self.loadings = loadings
        if self.dims is None and self.loadings is None:
            raise ValueError(
                "ERROR: Provide a value for at least one of 'dims' or 'loadings'")
        self.annMetric = ann_metric
        self.annEfc = ann_efc
        self.annEf = ann_ef
        self.annM = ann_m
        self.nthreads = nthreads
        if ann_parallel:
            self.annThreads = self.nthreads
        else:
            self.annThreads = 1
        self.randState = rand_state
        self.batchSize = self._handle_batch_size()
        self.method = reduction_method
        self.nCells, self.nFeats = self.data.shape
        self.clusterLabels: np.ndarray = np.repeat(-1, self.nCells)
        disable_reduction = False
        if self.dims < 1:
            disable_reduction = True
        with threadpool_limits(limits=self.nthreads):
            if self.method == 'pca':
                self.mu, self.sigma = mu, sigma
                if self.loadings is None or len(self.loadings) == 0:
                    if len(use_for_pca) != self.nCells:
                        raise ValueError(
                            "ERROR: `use_for_pca` does not have the same length as nCells")
                    if disable_reduction is False:
                        self._fit_pca(disable_scaling, use_for_pca)
                else:
                    # Even though dims might already have been adjusted according to
                    # the loadings before calling AnnStream, it could still be
                    # overwritten by _handle_batch_size. Hence we hard-set it here.
                    self.dims = self.loadings.shape[1]
                # It is okay for dimensions to be larger than the batch size here
                # because we will not fit the PCA.
                if disable_scaling:
                    if disable_reduction:
                        self.reducer = lambda x: x
                    else:
                        self.reducer = lambda x: x.dot(self.loadings)
                else:
                    if disable_reduction:
                        self.reducer = lambda x: self.transform_z(x)
                    else:
                        self.reducer = lambda x: self.transform_z(x).dot(self.loadings)
            elif self.method == 'lsi':
                if self.loadings is None or len(self.loadings) == 0:
                    if disable_reduction is False:
                        self._fit_lsi()
                else:
                    self.dims = self.loadings.shape[1]
                if disable_reduction:
                    self.reducer = lambda x: x
                else:
                    self.reducer = lambda x: x.dot(self.loadings)
            elif self.method == 'custom':
                if self.loadings is None or len(self.loadings) == 0:
                    logger.warning("No loadings provided for manual dimension reduction")
                else:
                    self.dims = self.loadings.shape[1]
                if disable_reduction:
                    self.reducer = lambda x: x
                else:
                    self.reducer = lambda x: x.dot(self.loadings)
            else:
                raise ValueError(f"ERROR: Unknown reduction method: {self.method}")
            if ann_idx is None:
                self.annIdx = self._fit_ann()
            else:
                self.annIdx = ann_idx
                self.annIdx.set_ef(self.annEf)
                self.annIdx.set_num_threads(1)
            self.kmeans = self._fit_kmeans(do_kmeans_fit)

    def _handle_batch_size(self):
        if self.dims > self.data.shape[0]:
            self.dims = self.data.shape[0]
        batch_size = self.data.chunksize[0]  # assuming all chunks are the same size
        if self.dims >= batch_size:
            self.dims = batch_size - 1  # -1 because we fit the PCA with dims+1 components
            logger.info(f"Number of PCA/LSI components reduced to batch size of {batch_size}")
        if self.nClusters > batch_size:
            self.nClusters = batch_size
            logger.info(f"Cluster number reduced to batch size of {batch_size}")
        return batch_size

    def iter_blocks(self, msg: str = '') -> np.ndarray:
        for i in tqdm(self.data.blocks, desc=msg, total=self.data.numblocks[0]):
            yield controlled_compute(i, self.nthreads)

    def transform_z(self, a: np.ndarray) -> np.ndarray:
        return (a - self.mu) / self.sigma

    def transform_ann(self, a: np.ndarray, k: int = None,
                      self_indices: np.ndarray = None) -> tuple:
        if k is None:
            k = self.k
        # Adding +1 to k because the first neighbour will be the query itself
        if self_indices is None:
            i, d = self.annIdx.knn_query(a, k=k)
            return i, d
        else:
            i, d = self.annIdx.knn_query(a, k=k + 1)
            return fix_knn_query(i, d, self_indices)

    def _fit_pca(self, disable_scaling, use_for_pca) -> None:
        from sklearn.decomposition import IncrementalPCA

        # We fit one extra PC dim than specified and then ignore the last PC.
        self._pca = IncrementalPCA(n_components=self.dims + 1, batch_size=self.batchSize)
        do_sample_subset = False if use_for_pca.sum() == self.nCells else True
        s, e = 0, 0
        # We store the first block of values in `end_reservoir`. If we end up
        # with fewer than dims+1 cells left to fit, those cells can be added to
        # the reservoir for fitting; if no such cells remain, the reservoir is
        # fitted by itself after the rest of the cells. The first batch itself
        # may have fewer than dims+1 cells; in that case we keep adding cells
        # to the `carry_over` pile until it is big enough.
        end_reservoir = []
        # carry_over stores cells that cannot yet be added to end_reservoir or
        # be used for fitting the PCA directly.
        carry_over = []
        for i in self.iter_blocks(msg='Fitting PCA'):
            if do_sample_subset:
                e = s + i.shape[0]
                i = i[use_for_pca[s:e]]
                s = e
            if disable_scaling is False:
                i = self.transform_z(i)
            if len(carry_over) > 0:
                i = np.vstack((carry_over, i))
                carry_over = []
            if len(i) < (self.dims + 1):
                carry_over = i
                continue
            if len(end_reservoir) == 0:
                end_reservoir = i
                continue
            try:
                self._pca.partial_fit(i, check_input=False)
            except LinAlgError:
                # TODO: add a retry counter so memory consumption doesn't escalate
                carry_over = i
        if len(carry_over) > 0:
            i = np.vstack((end_reservoir, carry_over))
        else:
            i = end_reservoir
        try:
            self._pca.partial_fit(i, check_input=False)
        except LinAlgError:
            logger.warning(f"{i.shape[0]} samples were not used in PCA fitting due to LinAlgError")
        self.loadings = self._pca.components_[:-1, :].T

    def _fit_lsi(self) -> None:
        from gensim.models import LsiModel
        from gensim.matutils import Dense2Corpus

        self._lsiModel = LsiModel(
            Dense2Corpus(controlled_compute(self.data.blocks[0], self.nthreads).T),
            num_topics=self.dims,
            chunksize=self.data.chunksize[0],
            id2word={x: x for x in range(self.data.shape[1])},
            extra_samples=0)
        for n, i in enumerate(self.iter_blocks(msg="Fitting LSI model")):
            if n == 0:
                continue
            self._lsiModel.add_documents(Dense2Corpus(i.T))
        self.loadings = self._lsiModel.get_topics().T

    def _fit_ann(self):
        import hnswlib

        dims = self.dims
        if dims < 1:
            dims = self.data.shape[1]
        ann_idx = hnswlib.Index(space=self.annMetric, dim=dims)
        ann_idx.init_index(max_elements=self.nCells, ef_construction=self.annEfc,
                           M=self.annM, random_seed=self.randState)
        ann_idx.set_ef(self.annEf)
        ann_idx.set_num_threads(self.annThreads)
        for i in self.iter_blocks(msg='Fitting ANN'):
            ann_idx.add_items(self.reducer(i))
        return ann_idx

    def _fit_kmeans(self, do_ann_fit):
        from sklearn.cluster import MiniBatchKMeans

        if do_ann_fit is False:
            return None
        kmeans = MiniBatchKMeans(n_clusters=self.nClusters,
                                 random_state=self.randState,
                                 batch_size=self.batchSize)
        with threadpool_limits(limits=self.nthreads):
            for i in self.iter_blocks(msg='Fitting kmeans'):
                kmeans.partial_fit(self.reducer(i))
            temp = []
            for i in self.iter_blocks(msg='Estimating seed partitions'):
                temp.extend(kmeans.predict(self.reducer(i)))
            self.clusterLabels = np.array(temp)
        return kmeans
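# The streaming trick in _fit_lsi above, in isolation: gensim's LsiModel can
# be seeded with one dense block and then extended block-by-block with
# add_documents. A minimal sketch with random data; block shapes and the
# integer id2word mapping are assumptions for illustration.
import numpy as np
from gensim.matutils import Dense2Corpus
from gensim.models import LsiModel

blocks = [np.random.rand(100, 50) for _ in range(3)]  # 3 blocks of 100 cells x 50 features
lsi = LsiModel(Dense2Corpus(blocks[0].T), num_topics=10,
               id2word={x: x for x in range(50)}, extra_samples=0)
for block in blocks[1:]:
    lsi.add_documents(Dense2Corpus(block.T))  # columns are documents
loadings = lsi.get_topics().T  # (50 features, 10 topics)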
def main():
    conf = SparkConf().setAppName("Program Number 1")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("ERROR")
    # Create the Spark session.
    spark = SparkSession.builder.appName("Program Number 1").getOrCreate()
    # Tweets folder address on the HDFS server; ignore files with .tmp
    # extensions (Flume active files).
    inputpath = "hdfs://hdfs input path"
    spark.conf.set("spark.sql.shuffle.partitions", 1)
    # Get the raw tweets from HDFS.
    raw_tweets = spark.read.format("json").option("inferSchema", "true").option(
        "mode", "dropMalformed").load(inputpath)
    # Get the tweet text from the raw data: lower-case it, drop re-tweets
    # (duplicates), and add an index for each tweet.
    tweets = raw_tweets.select(
        functions.lower(functions.col("text"))).withColumnRenamed(
            "lower(text)", "text").distinct().withColumn(
                "id", functions.monotonically_increasing_id())
    # Create a tokenizer that filters away tokens with length < 4 and strips
    # symbols like $, #, ...
    tokenizer = RegexTokenizer().setPattern("[\\W_]+").setMinTokenLength(
        4).setInputCol("text").setOutputCol("tokens")
    # Tokenize tweets.
    tokenized_tweets = tokenizer.transform(tweets)
    remover = StopWordsRemover().setInputCol("tokens").setOutputCol("cleaned")
    # Remove stopwords.
    cleaned_tweets = remover.transform(tokenized_tweets)
    # Create a vector of words that appeared in at least two different tweets,
    # with a maximum vocabulary size of 20000.
    vectorizer = CountVectorizer().setInputCol("cleaned").setOutputCol(
        "features").setVocabSize(20000).setMinDF(2).fit(cleaned_tweets)
    wordVectors = vectorizer.transform(cleaned_tweets).select("id", "features")

    # LDA
    # Create a Latent Dirichlet Allocation model and run it on our data with
    # 25 iterations and 5 topics.
    lda = LDA(k=5, maxIter=25)
    # Fit the model on the data.
    ldaModel = lda.fit(wordVectors)
    # Create topics based on LDA.
    lda_topics = ldaModel.describeTopics()

    # LSA
    clean_tweets_list = []
    tweet_list = []
    # Build the document-term matrix as input for the LsiModel; LSI needs
    # tuples of (vocabulary_index, frequency) form.
    for tweet_row in wordVectors.select('features').collect():
        tweet_list.clear()
        # Read the SparseVector of the 'features' column (hence the 0 index)
        # and zip its parts into a list:
        # idx = vocabulary_index, val = frequency of that word in that tweet.
        for idx, val in zip(tweet_row[0].indices, tweet_row[0].values):
            # Convert the frequency from float to integer.
            tweet_list.append((idx, int(val)))
        clean_tweets_list.append(tweet_list[:])
    # Call the LsiModel, passing the number of topics as 5.
    lsa_model = LsiModel(clean_tweets_list, num_topics=5)

    # Comparison
    # Get the weights and indices of words from LDA topics as List[list[]].
    lda_wordIndices = [row['termIndices'] for row in lda_topics.collect()]
    lda_wordWeights = [row['termWeights'] for row in lda_topics.collect()]
    # Get the word weights of LSA topics as a numpy array of shape
    # 5 * wordCount; each element is the weight of the corresponding word in
    # that specific topic.
    lsa_weightsMatrix = lsa_model.get_topics()

    # Function to calculate the similarity between an LSA topic and an LDA topic.
    def topic_similarity_calculator(lsa_t, lda_t):
        (lda_index, lda_weight) = lda_t
        total = 0
        for index, weight in zip(lda_index, lda_weight):
            total = total + np.abs(lsa_t[index] * weight)
        return total

    # Run the similarity function on the 25 possibilities (5 LSA * 5 LDA).
    similarity = []
    eachLSA = []
    for i in range(0, 5):
        eachLSA.clear()
        for j in range(0, 5):
            temp = topic_similarity_calculator(
                lsa_weightsMatrix[i], (lda_wordIndices[j], lda_wordWeights[j]))
            eachLSA.append(temp)
        similarity.append(eachLSA[:])

    # Print the similarity table:
    # each row is an LDA topic and each column is an LSA topic.
    print(" ")
    print("Similarity table")

    def similarity_print(s):
        i = 1
        print("|--------------------------------------------------------|")
        print("|      | LSA 1   | LSA 2   | LSA 3   | LSA 4   | LSA 5   |")
        print("|--------------------------------------------------------|")
        for one, two, three, four, five in zip(*s):
            print('|LDA {} | {:+1.4f} | {:+1.4f} | {:+1.4f} | {:+1.4f} | {:+1.4f} |'
                  .format(i, one, two, three, four, five))
            print("|--------------------------------------------------------|")
            i = i + 1

    # Create the similarity matrix.
    similarity_print(similarity)

    # Final result table
    # Manually found the following topics to be similar:
    # (LSA1 - LDA1)
    # (LSA5 - LDA2)
    # the rest stand alone.
    lsa_words_idx = []
    for idx, curr_topic in enumerate(lsa_weightsMatrix):
        lsa_words_idx.append(np.abs(curr_topic).argsort()[-10:][::-1])
    lsa_topics_bow = {}
    lda_topics_bow = {}
    lsa_bow_list = []
    lda_bow_list = []
    for curr_idx, (lda_topic, lsa_topic) in enumerate(zip(lda_wordIndices, lsa_words_idx)):
        lsa_bow_list.clear()
        lda_bow_list.clear()
        for idx in range(10):
            lsa_bow_list.append(vectorizer.vocabulary[lsa_topic[idx]])
            lda_bow_list.append(vectorizer.vocabulary[lda_topic[idx]])
        lsa_topics_bow[curr_idx] = lsa_bow_list[:]
        lda_topics_bow[curr_idx] = lda_bow_list[:]
    results = []
    names = []
    # Create the word dictionary for LDA2 and LSA5.
    lda2_lsa5 = lda_topics_bow[1][:]
    for word in lsa_topics_bow[4]:
        if word not in lda2_lsa5:
            lda2_lsa5.append(word)
    # Create the word dictionary for LDA1 and LSA1.
    lda1_lsa1 = lda_topics_bow[0][:]
    for word in lsa_topics_bow[0]:
        if word not in lda1_lsa1:
            lda1_lsa1.append(word)
    results.append(lda1_lsa1)
    names.append("LDA1 - LSA1 ")
    results.append(lda2_lsa5)
    names.append("LDA2 - LSA5 ")
    results.append(lda_topics_bow[2])
    names.append("LDA3        ")
    results.append(lda_topics_bow[3])
    names.append("LDA4        ")
    results.append(lda_topics_bow[4])
    names.append("LDA5        ")
    results.append(lsa_topics_bow[1])
    names.append("LSA2        ")
    results.append(lsa_topics_bow[2])
    names.append("LSA3        ")
    results.append(lsa_topics_bow[3])
    names.append("LSA4        ")
    # Print the topics and their related words.
    print(" ")
    print("Topics Table")
    print("|------------------------------------------------------------------------------------------|")
    print("| Topic        | Significant Words                                                         |")
    print("|------------------------------------------------------------------------------------------|")
    for name, r in zip(names, results):
        print('| {} | {} |'.format(name, r))
    print("|------------------------------------------------------------------------------------------|")
    print(" ")
    print(" ")
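# Toy check of the similarity measure above, with made-up numbers (would run
# inside main()'s scope, where the function is defined): one LSA topic vector
# over a 4-word vocabulary against an LDA topic's top-2 terms.
lsa_t = np.array([0.9, -0.4, 0.1, 0.0])
lda_t = ([0, 1], [0.7, 0.2])  # (termIndices, termWeights)
# |0.9 * 0.7| + |-0.4 * 0.2| = 0.63 + 0.08 = 0.71
print(topic_similarity_calculator(lsa_t, lda_t))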
# -*- coding: utf-8 -*-
import pickle

from gensim.corpora import Dictionary
from gensim.models import LsiModel

with open("../data/corpus_test.pkl", "rb") as f:
    corpus = pickle.load(f)

corpus_dictionary = Dictionary(corpus)
corpus = [corpus_dictionary.doc2bow(text) for text in corpus]

CORPUS = corpus
ID2WORD = corpus_dictionary
NUM_TOPICS = 200

lsi = LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=ID2WORD)
topic_word_matrix = lsi.get_topics()
# The sigma matrix of singular values from the SVD; for convenience it is
# kept as a plain length-k vector.
sigma_matrix = lsi.projection.s
# gensim does not hand us the document-topic matrix directly.

new_doc = ["영화/Noun", "재미/Noun"]  # a new document
new_doc_bow = corpus_dictionary.doc2bow(new_doc)  # bag-of-words of the new document
vec_lsi = lsi[new_doc_bow]  # project the new document into LSI space
print(vec_lsi)
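# Since gensim does not expose the document-topic matrix, a hedged sketch of
# building it yourself by pushing every BoW document through the model:
import numpy as np
from gensim import matutils

doc_topic_matrix = np.array(
    [matutils.sparse2full(lsi[bow], NUM_TOPICS) for bow in CORPUS])
print(doc_topic_matrix.shape)  # (n_docs, NUM_TOPICS)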