Example #1
def preclean_xlsxParagraph(text_list):
    # Clean and tokenize each paragraph, returning one whitespace-joined string per entry.
    preprocessor = TextPreprocessor()
    word_list = []
    for text in text_list:
        clean_text = preprocessor.clean_sentence(text)
        word_list.append(' '.join(preprocessor.tokenize_text(clean_text)))
    return word_list
def extract_dictionary(document_paths, max_words):
    """
    Extracts a gensim Dictionary object from a set of documents.

    Parameters
    ----------
    document_paths : [str]
        List of document paths that make up the corpus.
    max_words : int
        Approximate upper bound on the vocabulary size; the dictionary is
        filtered whenever it grows beyond this limit.

    Returns
    -------
    dictionary : gensim.corpora.Dictionary
        Extracted dictionary (or tokenizer).
    """
    print("Extracting dictionary from corpus")
    dictionary = Dictionary(prune_at=None)
    preprocessor = TextPreprocessor()
    for document_path in tqdm(document_paths):
        with open(document_path, "r") as f:
            document = f.read()
        document = preprocessor.clean_sentence(document,
                                               alphabetic_only=True)
        words = preprocessor.tokenize_text(document)
        dictionary.add_documents([words])
        # Periodically prune the vocabulary so it never grows far beyond max_words.
        if len(dictionary) > max_words:
            start = time()
            dictionary.filter_extremes(no_below=10,
                                       no_above=0.5,
                                       keep_n=int(max_words * 0.9))
            print("Dictionary filtered in {} seconds".format(time() - start))
    return dictionary
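A minimal usage sketch for extract_dictionary as shown above; the file paths and max_words value are hypothetical placeholders, and gensim, tqdm, and the project's TextPreprocessor are assumed to be importable.

# Hypothetical usage sketch: build a vocabulary from a small corpus on disk.
document_paths = ["data/raw/doc_0.txt", "data/raw/doc_1.txt"]  # placeholder paths
dictionary = extract_dictionary(document_paths, max_words=50000)
print(len(dictionary), "unique tokens kept")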
class BM25Engine(SearchEngine):
    tokenizer_file = "data/processed/{id}/tokenizer"
    corpus_file = "data/processed/{id}/corpus"
    doc_idxs_file = "data/processed/{id}/doc_idxs.json"

    def __init__(self, tokenizer, corpus, idxs2id):
        """
        Parameters
        ----------
        tokenizer : gensim.corpora.Dictionary
            Word tokenizer.
        corpus : gensim.corpora.mmcorpus.MmCorpus
            Bag-of-words formatted corpus of documents.
        idxs2id : dict
            Mapping from corpus position (stringified index) to document id.
        """
        self.preprocessor = TextPreprocessor()
        self.tokenizer = tokenizer
        self.corpus = corpus
        self.internal_engine = BM25(self.corpus)
        self.idxs2id = idxs2id
        print("BM25 engine loaded")

    def top_k_matches(self, query, k):
        # Clean and tokenize the query, score it against every document in the
        # corpus, and return the ids of the k highest-scoring documents.
        clean = self.preprocessor.clean_sentence(query, alphabetic_only=True)
        word_list = self.preprocessor.tokenize_text(clean)
        bow_representation = self.tokenizer.doc2bow(word_list)
        scores = self.internal_engine.get_scores(bow_representation)
        top_k_idxs = np.argsort(scores)[::-1][:k]
        return [self.idxs2id[str(idx)] for idx in top_k_idxs]

    @staticmethod
    def load(id):
        # Restore the persisted tokenizer, corpus and index mapping for the given id.
        tokenizer = Dictionary.load(BM25Engine.tokenizer_file.format(id=id))
        corpus = MmCorpus(BM25Engine.corpus_file.format(id=id))
        with open(BM25Engine.doc_idxs_file.format(id=id), "r") as f:
            idxs2id = json.load(f)
        return BM25Engine(tokenizer, corpus, idxs2id)
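A hedged usage sketch for BM25Engine; the dataset id and query below are placeholders, and loading assumes the data/processed/{id}/ artifacts were written beforehand.

# Hypothetical usage sketch: restore a persisted engine and query it.
engine = BM25Engine.load("my_dataset")                        # placeholder dataset id
doc_ids = engine.top_k_matches("flood risk insurance", k=5)   # placeholder query
print(doc_ids)  # ids of the 5 highest-scoring documents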
Example #4
class BoWHighlighter(SegmentHighlighter):
    def __init__(self):
        self.preprocessor = TextPreprocessor()
        self.counter = CountVectorizer()

    def highlight(self, document, query, precision):
        # Clean and tokenize the query, then re-join it into a single string.
        clean_query = self.preprocessor.clean_sentence(query)
        word_list = self.preprocessor.tokenize_text(clean_query)
        query_mod = ' '.join(word_list)
        # Normalise whitespace so paragraphs are separated by " \n \n ".
        document = re.sub(" +", " ", document)
        document = re.sub(r"\n\s+", " \n \n ", document)
        document = re.sub(r"\n+", "\n", document)
        document = document.strip()
        highlights = []
        scores = []
        for paragraph in document.split(" \n \n "):
            if len(paragraph) == 0:
                continue
            clean_paragraph = self.preprocessor.clean_sentence(paragraph)
            corpus = self.preprocessor.tokenize_text(clean_paragraph)
            paragraph_mod = ' '.join(corpus)
            # Score the paragraph against the query with a bag-of-words cosine similarity.
            vectorizer = self.counter.fit([query_mod, paragraph_mod])
            vectors = vectorizer.transform([query_mod, paragraph_mod]).toarray()
            norm_vec_query = np.linalg.norm(vectors[0])
            norm_vec_paragraph = np.linalg.norm(vectors[1])
            if norm_vec_query == 0 or norm_vec_paragraph == 0:
                continue
            cosine_similarity = np.dot(
                vectors[0], vectors[1]) / (norm_vec_query * norm_vec_paragraph)
            if cosine_similarity > precision:
                print("The cosine similarity is: ", cosine_similarity, paragraph,
                      "\n \n")
                highlights.append(paragraph)
                scores.append(cosine_similarity)
        return highlights, scores

    @staticmethod
    def load(id):
        return BoWHighlighter()
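A hedged usage sketch for BoWHighlighter; the document text, query, and precision threshold below are illustrative placeholders, and TextPreprocessor is assumed to be importable from the project.

# Hypothetical usage sketch: highlight paragraphs similar to a query.
highlighter = BoWHighlighter()
doc_text = "First paragraph about flood insurance.\n\n  Second paragraph about something else."  # placeholder text
highlights, scores = highlighter.highlight(doc_text, "flood insurance", precision=0.2)
for paragraph, score in zip(highlights, scores):
    print(round(score, 3), paragraph)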
Example #5
    def highlight(self, document, query, precision):
        '''document must be the JSON document (a page_num -> text mapping) so that the page of each highlight can be extracted'''
        highlights = []
        scores = []
        pages = []
        if isinstance(document, dict):
            for page_num, text in document.items():
                page_num = page_num.split("_")[1]
                for sentence in text.split("\n\n"):
                    sentence = re.sub("\n", " ", sentence)
                    sentence = re.sub(" +", " ", sentence)
                    sentence = sentence.strip()
                    if len(sentence) < 60:
                        continue
                    score = self._model.get_similarity(sentence, query)
                    if score > precision:
                        highlights.append(sentence)
                        scores.append(score)
                        pages.append(page_num)
            sorted_idxs = np.argsort(scores)[::-1]
            highlights = [highlights[idx] for idx in sorted_idxs]
            scores = [scores[idx] for idx in sorted_idxs]
            pages = [pages[idx] for idx in sorted_idxs]

            return highlights, scores, pages

        else:
            preprocessor = TextPreprocessor()
            clean_text = preprocessor.clean_sentence(document)
            paragraphs = preprocessor.split_into_paragraphs(document)

            for paragraph in paragraphs:
                paragraph = re.sub("\n", " ", paragraph)
                paragraph = re.sub(" +", " ", paragraph)
                paragraph = paragraph.strip()
                if len(paragraph) < 60:
                    continue
                score = self._model.get_similarity(paragraph, query)
                if score > precision:
                    highlights.append(paragraph)
                    scores.append(score)
            sorted_idxs = np.argsort(scores)[::-1]
            highlights = [highlights[idx] for idx in sorted_idxs]
            scores = [scores[idx] for idx in sorted_idxs]

            return highlights, scores, None
Example #6
def preclean_entireDoc(text):
    # Clean and tokenize an entire document, returning a single whitespace-joined string.
    preprocessor = TextPreprocessor()
    clean_text = preprocessor.clean_sentence(text)
    word_list = preprocessor.tokenize_text(clean_text)
    words = ' '.join(word_list)
    return words
class IterativeCorpusBuilder:
    def __init__(self, document_paths, max_words):
        self.tokenizer = IterativeCorpusBuilder.extract_dictionary(
            document_paths, max_words=max_words)
        self.document_paths = iter(document_paths)
        self.preprocessor = TextPreprocessor()
        self.clock = time()
        self.iterations = 0
        self.inform_frequency = 1000
        self.max_words = 2 * 10**6

    def __next__(self):
        with open(next(self.document_paths), "r") as f:
            document = f.read()
        document = self.preprocessor.clean_sentence(document,
                                                    alphabetic_only=True)
        words = self.preprocessor.tokenize_text(document)
        bow_representation = self.tokenizer.doc2bow(words)
        # Inform progress as specified
        self.iterations += 1
        if self.iterations % self.inform_frequency == 0:
            print("{} iterations took {} seconds. {} done.".format(
                self.inform_frequency,
                time() - self.clock, self.iterations))
            self.clock = time()
        return bow_representation

    def __iter__(self):
        print("Building corpus term-document matrices")
        return self

    @staticmethod
    def extract_dictionary(document_paths, max_words):
        """
        Extracts a gensim Dictionary object from a set of documents.

        Parameters
        ----------
        document_paths : [str]
            List of document paths that make up the corpus.
        max_words : int
            Approximate upper bound on the vocabulary size; the dictionary is
            filtered whenever it grows beyond this limit.

        Returns
        -------
        dictionary : gensim.corpora.Dictionary
            Extracted dictionary (or tokenizer).
        """
        print("Extracting dictionary from corpus")
        dictionary = Dictionary(prune_at=None)
        preprocessor = TextPreprocessor()
        for document_path in tqdm(document_paths):
            with open(document_path, "r") as f:
                document = f.read()
            document = preprocessor.clean_sentence(document,
                                                   alphabetic_only=True)
            words = preprocessor.tokenize_text(document)
            dictionary.add_documents([words])
            if len(dictionary) > max_words:
                start = time()
                dictionary.filter_extremes(no_below=10,
                                           no_above=0.5,
                                           keep_n=int(max_words * 0.9))
                print("Dictionary filtered in {} seconds".format(time() -
                                                                 start))
        return dictionary
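A hedged sketch of how a streaming builder like this is typically consumed; the paths and max_words value are placeholders, and gensim's MmCorpus.serialize is assumed to be the intended sink, since it accepts any iterable of bag-of-words documents.

# Hypothetical usage sketch: stream the corpus to disk and keep the tokenizer.
from gensim.corpora import MmCorpus

document_paths = ["data/raw/doc_0.txt", "data/raw/doc_1.txt"]    # placeholder paths
builder = IterativeCorpusBuilder(document_paths, max_words=50000)
MmCorpus.serialize("data/processed/example/corpus", builder)     # consumes the iterator
builder.tokenizer.save("data/processed/example/tokenizer")       # persist the Dictionary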