def preclean_xlsxParagraph(text_list):
    """Clean and tokenize a list of texts, returning one space-joined string per text."""
    preprocessor = TextPreprocessor()
    word_list = list()
    for text in text_list:
        clean_text = preprocessor.clean_sentence(text)
        word_list.append(' '.join(preprocessor.tokenize_text(clean_text)))
    return word_list
def extract_dictionary(document_paths, max_words):
    """
    Extracts a gensim Dictionary object from a set of documents.

    Parameters
    ----------
    document_paths : [str]
        List of document paths that make up the corpus.
    max_words : int
        Maximum vocabulary size; once exceeded, the dictionary is filtered down.

    Returns
    -------
    dictionary : gensim.corpora.Dictionary
        Extracted dictionary (or tokenizer).
    """
    print("Extracting dictionary from corpus")
    dictionary = Dictionary(prune_at=None)
    preprocessor = TextPreprocessor()
    for document_path in tqdm(document_paths):
        with open(document_path, "r") as f:
            document = f.read()
        document = preprocessor.clean_sentence(document, alphabetic_only=True)
        words = preprocessor.tokenize_text(document)
        dictionary.add_documents([words])
        # Keep the vocabulary bounded while streaming over the corpus
        if len(dictionary) > max_words:
            start = time()
            dictionary.filter_extremes(no_below=10,
                                       no_above=0.5,
                                       keep_n=int(max_words * 0.9))
            print("Dictionary filtered in {} seconds".format(time() - start))
    return dictionary
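# Example usage of extract_dictionary -- a minimal sketch; the file paths,
# corpus id and vocabulary cap below are illustrative, not repository values.
def _example_extract_dictionary():
    paths = ["data/raw/demo/doc_001.txt", "data/raw/demo/doc_002.txt"]
    vocab = extract_dictionary(paths, max_words=2 * 10**6)
    print("{} tokens kept".format(len(vocab)))
    # Persist in the layout BM25Engine.load expects for corpus id "demo"
    vocab.save("data/processed/demo/tokenizer")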
class BM25Engine(SearchEngine):
    tokenizer_file = "data/processed/{id}/tokenizer"
    corpus_file = "data/processed/{id}/corpus"
    doc_idxs_file = "data/processed/{id}/doc_idxs.json"

    def __init__(self, tokenizer, corpus, idxs2id):
        """
        Parameters
        ----------
        tokenizer : gensim.corpora.Dictionary
            Word tokenizer.
        corpus : gensim.corpora.mmcorpus.MmCorpus
            Bag-of-words formatted corpus of documents.
        idxs2id : dict
            Mapping from corpus index (as a string) to document id.
        """
        self.preprocessor = TextPreprocessor()
        self.tokenizer = tokenizer
        self.corpus = corpus
        self.internal_engine = BM25(self.corpus)
        self.idxs2id = idxs2id
        print("BM25 engine loaded")

    def top_k_matches(self, query, k):
        # Clean and tokenize the query, then score it against every document
        clean = self.preprocessor.clean_sentence(query, alphabetic_only=True)
        word_list = self.preprocessor.tokenize_text(clean)
        bow_representation = self.tokenizer.doc2bow(word_list)
        scores = self.internal_engine.get_scores(bow_representation)
        top_k_idxs = np.argsort(scores)[::-1][:k]
        return [self.idxs2id[str(idx)] for idx in top_k_idxs]

    @staticmethod
    def load(id):
        tokenizer = Dictionary.load(BM25Engine.tokenizer_file.format(id=id))
        corpus = MmCorpus(BM25Engine.corpus_file.format(id=id))
        with open(BM25Engine.doc_idxs_file.format(id=id), "r") as f:
            idxs2id = json.load(f)
        return BM25Engine(tokenizer, corpus, idxs2id)
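# Example usage of BM25Engine -- a minimal sketch assuming the tokenizer,
# corpus and doc_idxs.json for a corpus id "demo" already exist under
# data/processed/demo/; the corpus id and query string are illustrative.
def _example_bm25_search():
    engine = BM25Engine.load("demo")
    # Ids of the 5 documents with the highest BM25 score for the query
    top_doc_ids = engine.top_k_matches("contract termination clause", k=5)
    print(top_doc_ids)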
class BoWHighlighter(SegmentHighlighter):
    def __init__(self):
        self.preprocessor = TextPreprocessor()
        self.counter = CountVectorizer()

    def highlight(self, document, query, precision):
        clean_query = self.preprocessor.clean_sentence(query)
        word_list = self.preprocessor.tokenize_text(clean_query)
        # Convert the tokenized query back into a single string
        query_mod = ' '.join(word_list)

        # Normalise whitespace and mark paragraph boundaries with " \n \n "
        document = re.sub(" +", " ", document)
        document = re.sub(r"\n\s+", " \n \n ", document)
        document = re.sub(r"\n+", "\n", document)
        document = document.strip()

        highlights = []
        scores = []
        for paragraph in document.split(" \n \n "):
            if len(paragraph) == 0:
                continue
            clean_paragraph = self.preprocessor.clean_sentence(paragraph)
            corpus = self.preprocessor.tokenize_text(clean_paragraph)
            paragraph_mod = ' '.join(corpus)

            # Bag-of-words vectors for the query and the paragraph
            vectorizer = self.counter.fit([query_mod, paragraph_mod])
            vectors = vectorizer.transform([query_mod, paragraph_mod]).toarray()
            norm_vec_query = np.linalg.norm(vectors[0])
            norm_vec_paragraph = np.linalg.norm(vectors[1])
            if norm_vec_paragraph == 0:
                continue

            cosine_similarity = np.dot(vectors[0], vectors[1]) / (
                norm_vec_query * norm_vec_paragraph)
            if cosine_similarity > precision:
                print("The cosine similarity is: ", cosine_similarity,
                      paragraph, "\n \n")
                highlights.append(paragraph)
                scores.append(cosine_similarity)
        return highlights, scores

    @staticmethod
    def load(id):
        return BoWHighlighter()
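# Example usage of BoWHighlighter -- a minimal sketch; the document text,
# query and the 0.1 similarity threshold are illustrative values only.
def _example_bow_highlight():
    highlighter = BoWHighlighter()
    doc = ("First paragraph about data processing pipelines.\n\n  "
           "Second paragraph about something unrelated.")
    highlights, scores = highlighter.highlight(doc, "data processing",
                                               precision=0.1)
    for score, text in zip(scores, highlights):
        print(round(score, 3), text)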
def highlight(self, document, query, precision):
    '''document must be the json document to be able to extract page'''
    highlights = []
    scores = []
    pages = []
    if isinstance(document, dict):
        for page_num, text in document.items():
            page_num = page_num.split("_")[1]
            for sentence in text.split("\n\n"):
                sentence = re.sub("\n", " ", sentence)
                sentence = re.sub(" +", " ", sentence)
                sentence = sentence.strip()
                # Skip very short fragments
                if len(sentence) < 60:
                    continue
                score = self._model.get_similarity(sentence, query)
                if score > precision:
                    highlights.append(sentence)
                    scores.append(score)
                    pages.append(page_num)
        # Sort highlights by descending similarity score
        sorted_idxs = np.argsort(scores)[::-1]
        highlights = [highlights[idx] for idx in sorted_idxs]
        scores = [scores[idx] for idx in sorted_idxs]
        pages = [pages[idx] for idx in sorted_idxs]
        return highlights, scores, pages
    else:
        preprocessor = TextPreprocessor()
        clean_text = preprocessor.clean_sentence(document)
        paragraphs = preprocessor.split_into_paragraphs(document)
        for paragraph in paragraphs:
            paragraph = re.sub("\n", " ", paragraph)
            paragraph = re.sub(" +", " ", paragraph)
            paragraph = paragraph.strip()
            # Skip very short fragments
            if len(paragraph) < 60:
                continue
            score = self._model.get_similarity(paragraph, query)
            if score > precision:
                highlights.append(paragraph)
                scores.append(score)
        # Sort highlights by descending similarity score
        sorted_idxs = np.argsort(scores)[::-1]
        highlights = [highlights[idx] for idx in sorted_idxs]
        scores = [scores[idx] for idx in sorted_idxs]
        return highlights, scores, None
def preclean_entireDoc(text):
    """Clean and tokenize a whole document, returning its tokens as one space-joined string."""
    preprocessor = TextPreprocessor()
    clean_text = preprocessor.clean_sentence(text)
    word_list = preprocessor.tokenize_text(clean_text)
    words = ' '.join(word_list)
    return words
class IterativeCorpusBuilder():
    def __init__(self, document_paths, max_words):
        self.tokenizer = IterativeCorpusBuilder.extract_dictionary(
            document_paths, max_words=max_words)
        self.document_paths = iter(document_paths)
        self.preprocessor = TextPreprocessor()
        self.clock = time()
        self.iterations = 0
        self.inform_frequency = 1000
        self.max_words = 2 * 10**6

    def __next__(self):
        # Read, clean and tokenize the next document, then convert it to bag-of-words
        with open(next(self.document_paths), "r") as f:
            document = f.read()
        document = self.preprocessor.clean_sentence(document, alphabetic_only=True)
        words = self.preprocessor.tokenize_text(document)
        bow_representation = self.tokenizer.doc2bow(words)

        # Inform progress as specified
        self.iterations += 1
        if self.iterations % self.inform_frequency == 0:
            print("{} iterations took {} seconds. {} done.".format(
                self.inform_frequency, time() - self.clock, self.iterations))
            self.clock = time()
        return bow_representation

    def __iter__(self):
        print("Building corpus term-document matrices")
        return self

    @staticmethod
    def extract_dictionary(document_paths, max_words):
        """
        Extracts a gensim Dictionary object from a set of documents.

        Parameters
        ----------
        document_paths : [str]
            List of document paths that make up the corpus.
        max_words : int
            Maximum vocabulary size; once exceeded, the dictionary is filtered down.

        Returns
        -------
        dictionary : gensim.corpora.Dictionary
            Extracted dictionary (or tokenizer).
        """
        print("Extracting dictionary from corpus")
        dictionary = Dictionary(prune_at=None)
        preprocessor = TextPreprocessor()
        for document_path in tqdm(document_paths):
            with open(document_path, "r") as f:
                document = f.read()
            document = preprocessor.clean_sentence(document, alphabetic_only=True)
            words = preprocessor.tokenize_text(document)
            dictionary.add_documents([words])
            # Keep the vocabulary bounded while streaming over the corpus
            if len(dictionary) > max_words:
                start = time()
                dictionary.filter_extremes(no_below=10,
                                           no_above=0.5,
                                           keep_n=int(max_words * 0.9))
                print("Dictionary filtered in {} seconds".format(time() - start))
        return dictionary
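# Example of wiring IterativeCorpusBuilder to the file layout BM25Engine.load
# expects -- a sketch only: the glob pattern, corpus id "demo" and using the
# file path as the document id are illustrative assumptions, not repo behaviour.
def _example_build_corpus():
    from glob import glob

    document_paths = sorted(glob("data/raw/demo/*.txt"))
    builder = IterativeCorpusBuilder(document_paths, max_words=2 * 10**6)

    # Stream the bag-of-words vectors straight to disk in Matrix Market format;
    # the builder is consumed in a single pass.
    MmCorpus.serialize("data/processed/demo/corpus", builder)
    builder.tokenizer.save("data/processed/demo/tokenizer")

    # Map corpus row index (as a string) to a document identifier
    idxs2id = {str(idx): path for idx, path in enumerate(document_paths)}
    with open("data/processed/demo/doc_idxs.json", "w") as f:
        json.dump(idxs2id, f)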