def get_doc_text_hyper_linked_titles_for_articles(self, doc_id):
    """Fetch all of the paragraphs with their corresponding hyperlink titles.

    e.g.,
    >>> paras, links = db.get_doc_text_hyper_linked_titles_for_articles("Tokyo Imperial Palace_0")
    >>> paras[2]
    'It is built on the site of the old Edo Castle. The total area including the gardens is . During the height of the 1980s Japanese property bubble, the palace grounds were valued by some to be more than the value of all of the real estate in the state of California.'
    >>> links[2]
    ['Edo Castle', 'Japanese asset price bubble', 'Real estate', 'California']
    """
    cursor = self.connection.cursor()
    cursor.execute("SELECT text FROM documents WHERE id = ?", (doc_id,))
    result = cursor.fetchone()
    cursor.close()
    if result is None:
        return [], []
    # Paragraphs are stored as a single blob, separated by blank lines.
    hyper_linked_paragraphs = result[0].split("\n\n")
    paragraphs, hyper_linked_titles = [], []
    for hyper_linked_paragraph in hyper_linked_paragraphs:
        paragraphs.append(remove_tags(hyper_linked_paragraph))
        hyper_linked_titles.append([
            normalize(title)
            for title in find_hyper_linked_titles(hyper_linked_paragraph)
        ])
    return paragraphs, hyper_linked_titles
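# A minimal sketch of the two helpers assumed above. It assumes the stored
# text marks hyperlinks with HTML-style anchors whose href is the
# URL-encoded target title, e.g. '<a href="Edo%20Castle">Edo Castle</a>';
# the actual markup in the database may differ.
import re
from urllib.parse import unquote

_ANCHOR_RE = re.compile(r'<a href="([^"]*)">(.*?)</a>')

def find_hyper_linked_titles(paragraph):
    """Return the URL-decoded target titles of all anchors in the paragraph."""
    return [unquote(href) for href, _ in _ANCHOR_RE.findall(paragraph)]

def remove_tags(paragraph):
    """Strip anchor tags, keeping only the visible anchor text."""
    return _ANCHOR_RE.sub(r'\2', paragraph)

# Quick demo:
# >>> demo = 'built on the site of the old <a href="Edo%20Castle">Edo Castle</a>.'
# >>> remove_tags(demo)               -> 'built on the site of the old Edo Castle.'
# >>> find_hyper_linked_titles(demo)  -> ['Edo Castle']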
def text2spvec(self, query):
    """Create a sparse tfidf-weighted word vector from query.

    tfidf = log(tf + 1) * log((N - Nt + 0.5) / (Nt + 0.5))
    """
    # Get hashed ngrams
    # TODO: do we need to have normalize?
    words = self.parse(normalize(query))
    wids = [hash(w, self.hash_size) for w in words]

    if len(wids) == 0:
        if self.strict:
            raise RuntimeError('No valid word in: %s' % query)
        else:
            logger.warning('No valid word in: %s' % query)
            return sp.csr_matrix((1, self.hash_size))

    # Count TF
    wids_unique, wids_counts = np.unique(wids, return_counts=True)
    tfs = np.log1p(wids_counts)

    # Count IDF
    Ns = self.doc_freqs[wids_unique]
    idfs = np.log((self.num_docs - Ns + 0.5) / (Ns + 0.5))
    idfs[idfs < 0] = 0

    # TF-IDF
    data = np.multiply(tfs, idfs)

    # One row, sparse csr matrix
    indptr = np.array([0, len(wids_unique)])
    spvec = sp.csr_matrix((data, wids_unique, indptr),
                          shape=(1, self.hash_size))
    return spvec
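# A self-contained toy illustration of the weighting above, decoupled from
# the ranker's state. The hashed ngram ids, counts, and document frequencies
# are made up for the example: two ngrams with raw counts [2, 1] over a
# 10-document collection in which they appear in 3 and 8 documents.
import numpy as np
import scipy.sparse as sp

hash_size = 16
wids_unique = np.array([3, 7])     # hashed ngram ids (columns of the vector)
wids_counts = np.array([2, 1])     # raw term frequencies in the query
doc_freqs = np.array([3, 8])       # Nt for each ngram
num_docs = 10                      # N

tfs = np.log1p(wids_counts)                                      # log(tf + 1)
idfs = np.log((num_docs - doc_freqs + 0.5) / (doc_freqs + 0.5))  # BM25-style idf
idfs[idfs < 0] = 0                 # very common ngrams get clamped to zero
data = tfs * idfs
spvec = sp.csr_matrix((data, wids_unique, np.array([0, len(wids_unique)])),
                      shape=(1, hash_size))
print(spvec.toarray())             # nonzero only at column 3; column 7's idf
                                   # (log(2.5/8.5) < 0) was clamped to 0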
def load_sampled_tagged_para_text(self, question, pruning_l, tagme_api_key):
    """Retrieve paragraphs for the entities TagMe detects in the question,
    pruned to the top pruning_l paragraphs."""
    tagged_titles = self.retrieve_titles_w_tag_me(question, tagme_api_key)
    tagged_doc_names = [normalize(title) for title in tagged_titles]
    context, _ = self.load_sampled_para_text_and_linked_titles(
        tagged_doc_names, question, pruning_l)
    return context
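# A hedged sketch of what retrieve_titles_w_tag_me could look like. The
# endpoint, parameter names, and response shape follow the public TagMe REST
# API documentation, not this repository, and may not match the actual
# implementation.
import requests

TAGME_ENDPOINT = "https://tagme.d4science.org/tagme/tag"

def retrieve_titles_w_tag_me_sketch(question, tagme_api_key):
    response = requests.get(TAGME_ENDPOINT, params={
        "gcube-token": tagme_api_key,
        "text": question,
        "lang": "en",
    })
    # Each annotation carries the Wikipedia title of one tagged entity.
    return [ann["title"] for ann in response.json().get("annotations", [])
            if "title" in ann]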
def get_hyper_linked(self, doc_id):
    """Fetch the hyper-linked titles of the doc for 'doc_id'.

    Returns None when the doc is missing or has no links.
    """
    cursor = self.connection.cursor()
    cursor.execute("SELECT linked_title FROM documents WHERE id = ?",
                   (doc_id,))
    result = cursor.fetchone()
    cursor.close()
    # Return None (rather than the raw row) when there are no links, so
    # callers only ever see None or a list of normalized titles. Previously
    # an empty linked_title column leaked the row tuple itself to callers
    # that only check for None.
    if result is None or len(result[0]) == 0:
        return None
    return [normalize(title) for title in result[0].split("\t")]
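# A fully runnable toy of the storage format this method reads: the
# linked_title column holds tab-separated titles. An in-memory SQLite
# database stands in for the real one; the class wraps a sqlite connection
# in the same way (self.connection).
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE documents (id PRIMARY KEY, linked_title)")
conn.execute("INSERT INTO documents VALUES (?, ?)",
             ("Tokyo Imperial Palace_0", "Edo Castle\tCalifornia"))
cursor = conn.cursor()
cursor.execute("SELECT linked_title FROM documents WHERE id = ?",
               ("Tokyo Imperial Palace_0",))
row = cursor.fetchone()
cursor.close()
print(row[0].split("\t"))  # ['Edo Castle', 'California']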
def get_hyperlinked_abstract_paragraphs(self, title: str, question: str = None):
    if self.use_full_article is True and self.title2hyperlink_dic is not None:
        if title not in self.title2hyperlink_dic:
            return {}
        hyper_linked_titles = self.title2hyperlink_dic[title]
    elif self.use_full_article is True and self.title2hyperlink_dic is None:
        # For the full-article version, title2hyperlink_dic must be built beforehand.
        raise NotImplementedError()
    else:
        hyper_linked_titles = self.db.get_hyper_linked(normalize(title))

    if hyper_linked_titles is None:
        return {}

    # If there are any hyperlinked titles, add their paragraphs to all_linked_paras_dic.
    all_linked_paras_dic = {}
    if self.use_full_article is True and self.title2hyperlink_dic is not None:
        for hyper_linked_para_title in hyper_linked_titles:
            paras_dict, _ = load_para_and_linked_titles_dict_from_tfidf_id(
                hyper_linked_para_title, self.db)
            # Article titles are sometimes updated over time while the hyperlinked
            # titles are not (e.g., Winds <--> Wind). Our current database does not
            # handle these "redirect" cases, so when we cannot retrieve a
            # hyperlinked article we simply discard it.
            if len(paras_dict) == 0:
                continue
            tfidf_vectorizer = TopTfIdf(n_to_select=self.pruning_l,
                                        filter_dist_one=True,
                                        rank=True)
            pruned_para_dict = prune_top_k_paragraphs(
                question, paras_dict, tfidf_vectorizer, self.pruning_l)
            all_linked_paras_dic.update(pruned_para_dict)
    else:
        for hyper_linked_para_title in hyper_linked_titles:
            para_title_text_pairs = load_para_collections_from_tfidf_id_intro_only(
                hyper_linked_para_title, self.db)
            # Same redirect caveat as above: unresolvable hyperlinked articles
            # are discarded.
            if len(para_title_text_pairs) == 0:
                continue
            para_title_text_pairs = {para[0]: "".join(para[1])
                                     for para in para_title_text_pairs}
            all_linked_paras_dic.update(para_title_text_pairs)
    return all_linked_paras_dic
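# Hypothetical usage, assuming `retriever` is an instance of this class
# constructed with use_full_article=True and a precomputed
# title2hyperlink_dic; the instance name and question are illustrative,
# not taken from this repository.
linked_paras = retriever.get_hyperlinked_abstract_paragraphs(
    "Tokyo Imperial Palace",
    question="On whose castle site was the Tokyo Imperial Palace built?")
# Keys are paragraph ids of hyperlinked articles, values their text,
# pruned to the top pruning_l paragraphs by TF-IDF similarity to the question.
for para_id, para_text in linked_paras.items():
    print(para_id, para_text[:80])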