import re
import string
from typing import List, Set

from gensim.corpora import Dictionary
from gensim.models import LsiModel, TfidfModel
from gensim.similarities import MatrixSimilarity
from nltk.stem.porter import PorterStemmer


class IR:
    def __init__(self, documents: List, stop_words: Set, stemming: bool = True):
        self.documents = documents
        self.stop_words = stop_words
        self.stemming = stemming
        self.dictionary = None
        self.corpus = None
        self.tfidf_model = None
        self.tfidf_corpus = None
        self.tfidf_similarity = None
        self.lsi_model = None
        self.lsi_corpus = None
        self.lsi_similarity = None
        self.build_dictionary()
        self.build_models()

    """ 1. Data loading and preprocessing """

    def process_documents(self, documents: List) -> List:
        # Tokenize documents
        result = self.tokenize(documents)
        # Lowercase all words
        result = list(map(self.lowercase, result))
        # Remove stop words
        result = self.filter_stopwords(result)
        # Remove text punctuation
        result = self.remove_text_punctuation(result)
        # Stem words
        if self.stemming:
            result = self.port_stem(result)
        # Remove empty words from all documents
        return self.filter_empty_words(result)

    """ 2. Dictionary building """

    def build_dictionary(self):
        documents = self.process_documents(self.documents)
        self.dictionary = Dictionary(documents)
        self.corpus = [self.dictionary.doc2bow(doc) for doc in documents]

    """ 3. Retrieval models """

    def build_models(self):
        # Create the tf-idf model
        self.tfidf_model = TfidfModel(self.corpus)
        # Map each bag of words to (word-index, word-weight) pairs
        self.tfidf_corpus = [self.tfidf_model[c] for c in self.corpus]
        self.tfidf_similarity = MatrixSimilarity(self.tfidf_corpus)
        self.lsi_model = LsiModel(
            self.tfidf_corpus, id2word=self.dictionary, num_topics=100)
        self.lsi_corpus = [self.lsi_model[c] for c in self.tfidf_corpus]
        self.lsi_similarity = MatrixSimilarity(self.lsi_corpus)

    def filter_stopwords(self, paragraphs: List) -> List:
        return [[w for w in p if w not in self.stop_words]
                for p in paragraphs]

    """ 4. Querying """

    def process_query(self, query: str) -> List:
        tokenized = self.tokenize([query])
        lowered = list(map(self.lowercase, tokenized))
        stop_word_filtered = self.filter_stopwords(lowered)
        punctuation_filtered = self.remove_text_punctuation(stop_word_filtered)
        if self.stemming:
            return self.port_stem(punctuation_filtered)
        return punctuation_filtered

    def tfidf_query(self, query: str, number_of_results: int = 3) -> None:
        # Process the query the same way as the documents
        processed_query = self.process_query(query)
        query_corpus = self.dictionary.doc2bow(processed_query[0])
        query_tfidf = self.tfidf_model[query_corpus]
        similarity = enumerate(self.tfidf_similarity[query_tfidf])
        # Retrieve the most relevant paragraphs using the tf-idf model
        query_result = sorted(similarity,
                              key=lambda kv: -kv[1])[:number_of_results]
        # Print the search result
        for number, _ in query_result:
            print("Paragraph:", number)
            print(self.documents[number], "\n")

    def lsi_query(self, query: str, number_of_results: int = 3) -> None:
        # Process the query the same way as the documents
        processed_query = self.process_query(query)
        query_corpus = self.dictionary.doc2bow(processed_query[0])
        query_tfidf = self.tfidf_model[query_corpus]
        query_lsi = self.lsi_model[query_tfidf]
        # Fetch the most relevant topics
        relevant_topics = sorted(
            query_lsi, key=lambda kv: -abs(kv[1]))[:number_of_results]
        for number, _ in relevant_topics:
            print("Topic:", number)
            print(self.lsi_model.show_topic(number))
            print()
        # Find the most relevant paragraphs using LSI similarity
        doc2similarity = enumerate(self.lsi_similarity[query_lsi])
        query_result = sorted(doc2similarity,
                              key=lambda kv: -kv[1])[:number_of_results]
        # Print the query result
        for number, _ in query_result:
            print("Paragraph:", number)
            print(self.documents[number], "\n")

    """ All methods below are helpers that preprocess both documents and queries. """

    @staticmethod
    def filter_empty_words(paragraphs: List) -> List:
        return [[w for w in p if w != ""] for p in paragraphs]

    @staticmethod
    def tokenize(documents: List) -> List:
        return [doc.split() for doc in documents]

    @staticmethod
    def lowercase(words: List) -> List:
        return [w.lower() for w in words]

    @staticmethod
    def port_stem(documents: List) -> List:
        stemmer = PorterStemmer()
        return [[stemmer.stem(w) for w in p] for p in documents]

    @staticmethod
    def remove_text_punctuation(documents: List) -> List:
        # re.escape keeps the regex metacharacters in string.punctuation
        # from breaking the character class
        pattern = "[" + re.escape(string.punctuation) + "\n\r\t]"
        return [[re.sub(pattern, "", w) for w in p] for p in documents]
# (Earlier cells define `lsi`, `lsi2`, `bows`, and `DATA_PATH`.)
lsi.save(os.path.join(DATA_PATH, 'lsi100'))
lsi2.save(os.path.join(DATA_PATH, 'lsi2'))


# In[16]:

lsi2.show_topics()


# In[23]:

# for topic in lsi.show_topics():
#     print(topic)
lsi.show_topic(0, 100)


# ## Hold onto your hat

# This will take a lot of RAM (and CPU)!

# In[31]:

tweetids = pd.Series(range(len(bows)), name='tweet')
topicids = pd.Series(range(lsi.num_topics), name='topic')
# `dict()` keeps track of the columns for each topic, in case the LSI model
# shuffles or skips topics for odd tweets
df = pd.DataFrame([pd.Series(dict(lsi[bows[i]]), name='tweet') for i in tweetids],
                  columns=topicids, index=tweetids)
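

# In[ ]:

# A lighter alternative sketch using the same `lsi` and `bows` as above:
# gensim's corpus2dense builds the dense tweets-by-topics matrix in one pass
# instead of allocating one pandas Series per tweet; topics the model skips
# for a tweet simply become zeros. (`df_alt` is a hypothetical name.)
from gensim.matutils import corpus2dense

dense = corpus2dense(lsi[bows], num_terms=lsi.num_topics).T  # (n_tweets, n_topics)
df_alt = pd.DataFrame(dense, index=tweetids, columns=topicids)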