Пример #1
0
    def get_doc_text_hyper_linked_titles_for_articles(self, doc_id):
        """
        fetch all of the paragraphs with their corresponding hyperlink titles.

        Args:
            doc_id: id key of the document in the ``documents`` table.

        Returns:
            ``(paragraphs, hyper_linked_titles)`` — parallel lists (one entry
            per paragraph); both are empty when ``doc_id`` is not found.

        e.g., 
        >>> paras, links = db.get_doc_text_hyper_linked_titles_for_articles("Tokyo Imperial Palace_0")
        >>> paras[2]
        'It is built on the site of the old Edo Castle. The total area including the gardens is . During the height of the 1980s Japanese property bubble, the palace grounds were valued by some to be more than the value of all of the real estate in the state of California.'
        >>> links[2]
        ['Edo Castle', 'Japanese asset price bubble', 'Real estate', 'California']
        """
        cursor = self.connection.cursor()
        try:
            cursor.execute("SELECT text FROM documents WHERE id = ?", (doc_id, ))
            result = cursor.fetchone()
        finally:
            # Close the cursor even when the query raises; the previous
            # version leaked it on error.
            cursor.close()
        if result is None:
            return [], []
        # The stored text is "\n\n"-separated paragraphs carrying inline
        # hyperlink markup (handled by remove_tags / find_hyper_linked_titles).
        paragraphs, hyper_linked_titles = [], []
        for hyper_linked_paragraph in result[0].split("\n\n"):
            paragraphs.append(remove_tags(hyper_linked_paragraph))
            hyper_linked_titles.append([
                normalize(title)
                for title in find_hyper_linked_titles(hyper_linked_paragraph)
            ])
        return paragraphs, hyper_linked_titles
Пример #2
0
    def text2spvec(self, query):
        """Create a sparse tfidf-weighted word vector from query.

        tfidf = log(tf + 1) * log((N - Nt + 0.5) / (Nt + 0.5))
        """
        # Tokenize the (normalized) query and hash tokens into feature ids.
        # TODO: do we need to have normalize?
        tokens = self.parse(normalize(query))
        token_ids = [hash(tok, self.hash_size) for tok in tokens]

        if not token_ids:
            if self.strict:
                raise RuntimeError('No valid word in: %s' % query)
            logger.warning('No valid word in: %s' % query)
            return sp.csr_matrix((1, self.hash_size))

        # Log-scaled term frequencies over the unique hashed ids.
        unique_ids, id_counts = np.unique(token_ids, return_counts=True)
        tf = np.log1p(id_counts)

        # Inverse document frequencies, clipped at zero so very common
        # terms cannot contribute negative weight.
        freqs = self.doc_freqs[unique_ids]
        idf = np.log((self.num_docs - freqs + 0.5) / (freqs + 0.5))
        idf[idf < 0] = 0

        # Assemble the single-row CSR matrix directly from
        # (data, indices, indptr).
        weights = np.multiply(tf, idf)
        row_ptr = np.array([0, len(unique_ids)])
        return sp.csr_matrix((weights, unique_ids, row_ptr),
                             shape=(1, self.hash_size))
Пример #3
0
    def load_sampled_tagged_para_text(self, question, pruning_l, tagme_api_key):
        """Return sampled paragraph context for the articles TagMe links to
        `question` (linked-title part of the result is discarded)."""
        titles = self.retrieve_titles_w_tag_me(question, tagme_api_key)
        doc_names = [normalize(t) for t in titles]
        context, _ = self.load_sampled_para_text_and_linked_titles(
            doc_names, question, pruning_l)
        return context
Пример #4
0
 def get_hyper_linked(self, doc_id):
     """Fetch the hyper-linked titles of the doc for 'doc_id'.

     Returns:
         A list of normalized linked titles, or None when the document is
         missing or has no linked titles stored.
     """
     cursor = self.connection.cursor()
     try:
         cursor.execute("SELECT linked_title FROM documents WHERE id = ?",
                        (doc_id, ))
         result = cursor.fetchone()
     finally:
         # Close the cursor even when the query raises; the previous
         # version leaked it on error.
         cursor.close()
     # The previous version returned the raw row tuple (e.g. ('',)) when
     # linked_title was empty; callers only check for None, so normalize
     # both "nothing linked" cases to None for a consistent Optional[list].
     if result is None or len(result[0]) == 0:
         return None
     return [normalize(title) for title in result[0].split("\t")]
Пример #5
0
    def get_hyperlinked_abstract_paragraphs(self,
                                            title: str,
                                            question: str = None):
        """Collect the paragraphs of every article hyper-linked from `title`.

        Returns a dict of linked paragraphs (empty when the title is unknown
        or has no recorded hyperlinks). In full-article mode the candidate
        paragraphs are additionally pruned by TF-IDF similarity to `question`.
        """
        # Full-article mode requires the precomputed title->hyperlinks dict;
        # compute the mode flag once instead of re-testing it below.
        full_article_mode = (self.use_full_article is True
                             and self.title2hyperlink_dic is not None)

        if full_article_mode:
            if title not in self.title2hyperlink_dic:
                return {}
            hyper_linked_titles = self.title2hyperlink_dic[title]
        elif self.use_full_article is True:
            # Full-article version without a stored title2hyperlink_dic is
            # not supported.
            raise NotImplementedError()
        else:
            hyper_linked_titles = self.db.get_hyper_linked(normalize(title))

        if hyper_linked_titles is None:
            return {}

        # Accumulate paragraph info from every hyperlinked article.
        all_linked_paras_dic = {}
        if full_article_mode:
            for linked_title in hyper_linked_titles:
                paras_dict, _ = load_para_and_linked_titles_dict_from_tfidf_id(
                    linked_title, self.db)
                # Article titles are sometimes renamed while hyperlinks are
                # not (e.g., Winds <--> Wind); the database keeps no redirect
                # mapping, so unresolvable links are simply discarded.
                if len(paras_dict) == 0:
                    continue
                tfidf_vectorizer = TopTfIdf(n_to_select=self.pruning_l,
                                            filter_dist_one=True, rank=True)
                pruned_para_dict = prune_top_k_paragraphs(
                    question, paras_dict, tfidf_vectorizer, self.pruning_l)
                all_linked_paras_dic.update(pruned_para_dict)
        else:
            for linked_title in hyper_linked_titles:
                pairs = load_para_collections_from_tfidf_id_intro_only(
                    linked_title, self.db)
                # Same redirect caveat as above: skip unresolvable links.
                if len(pairs) == 0:
                    continue
                all_linked_paras_dic.update(
                    {pair[0]: "".join(pair[1]) for pair in pairs})

        return all_linked_paras_dic