def extract_json_values(json_pages):
    """Deserialize raw Wikipedia-page JSON dicts into search-result objects.

    Args:
        json_pages: iterable of dicts, each describing one page; expected keys
            include "description", "pageid", "orig_phrase"/"orig_phrase_norm",
            "wiki_title"/"wiki_title_norm" and a nested "relations" dict.

    Returns:
        set of WikipediaSearchPageResult, one per input page, each wrapping a
        WikipediaPage built from the dict (rank fixed to 0).
    """
    pages = set()
    for json_page in json_pages:
        description = json_page.get("description")
        pageid = int(json_page.get("pageid", 0))
        orig_phrase = json_page.get("orig_phrase")
        orig_phrase_norm = json_page.get("orig_phrase_norm")
        wiki_title = json_page.get("wiki_title")
        wiki_title_norm = json_page.get("wiki_title_norm")
        # Bug fix: "relations" may be absent or null in the JSON; the original
        # code defaulted to None and then crashed with AttributeError on the
        # first relations_json.get(...) call. Fall back to an empty dict so
        # every relation field simply resolves to None.
        relations_json = json_page.get("relations") or {}
        rel_is_part_name = relations_json.get("isPartName")
        rel_is_disambiguation = relations_json.get("isDisambiguation")
        rel_disambiguation = relations_json.get("disambiguationLinks")
        rel_disambiguation_norm = relations_json.get("disambiguationLinksNorm")
        rel_parenthesis = relations_json.get("titleParenthesis")
        rel_parenthesis_norm = relations_json.get("titleParenthesisNorm")
        rel_categories = relations_json.get("categories")
        rel_categories_norm = relations_json.get("categoriesNorm")
        rel_be_comp = relations_json.get("beCompRelations")
        rel_be_comp_norm = relations_json.get("beCompRelationsNorm")
        rel_aliases = relations_json.get("aliases")
        rel_aliases_norm = relations_json.get("aliasesNorm")
        relations = WikipediaPageExtractedRelations(
            rel_is_part_name,
            rel_is_disambiguation,
            rel_parenthesis,
            rel_disambiguation,
            rel_categories,
            rel_aliases,
            rel_be_comp,
            rel_disambiguation_norm,
            rel_categories_norm,
            rel_aliases_norm,
            rel_parenthesis_norm,
            rel_be_comp_norm,
        )
        page = WikipediaPage(
            orig_phrase,
            orig_phrase_norm,
            wiki_title,
            wiki_title_norm,
            0,
            pageid,
            description,
            relations,
        )
        pages.add(WikipediaSearchPageResult(orig_phrase, page))
    return pages
def get_pages(self, phrase):
    """Search the Elastic index for pages matching *phrase*, following redirects.

    Results are memoized in ``self.cache`` keyed by the raw (unnormalized)
    phrase. Hyphens are treated as spaces and whitespace runs are collapsed
    before querying.

    Args:
        phrase: surface form to search for.

    Returns:
        set of WikipediaSearchPageResult. On an unexpected search failure the
        traceback is printed and whatever was collected so far is returned
        (possibly an empty set) — never None.
    """
    if phrase in self.cache:
        return self.cache[phrase]
    pages = set()
    try:
        # Normalize: hyphens become spaces, then collapse whitespace runs.
        phrase_strip = ' '.join(phrase.replace('-', ' ').split())
        best_results = self.get_best_elastic_results(phrase_strip)
        for result in best_results:
            _id = result['_id']
            if _id != 0:
                result_source = result['_source']
                if 'redirectTitle' in result_source:
                    redirect_title = result_source['redirectTitle']
                    red_result = None
                    # Follow the redirect chain until it resolves to a page
                    # with no further redirect, dead-ends, or returns to the
                    # original title. Bug fix: result_source['title'] never
                    # changes inside this loop, so a redirect cycle that does
                    # not include the original title spun forever — cap hops.
                    hops = 0
                    while redirect_title and result_source['title'] != redirect_title and hops < 50:
                        hops += 1
                        red_result = self.get_redirect_result(redirect_title)
                        if red_result is None or len(red_result) == 0:
                            print('could not find redirect title=' +
                                  redirect_title +
                                  ', does not exist in data')
                            redirect_title = None
                        elif 'redirectTitle' in red_result[0]['_source']:
                            redirect_title = red_result[0]['_source']['redirectTitle']
                        else:
                            redirect_title = None
                    if red_result is not None and len(red_result) > 0:
                        result = red_result[0]
                        _id = result['_id']
                elastic_page_result = self.get_page_from_result_v1(
                    phrase_strip, result, _id)
                pages.add(WikipediaSearchPageResult(phrase, elastic_page_result))
        self.cache[phrase] = pages
        return pages
    except Exception:
        # Bug fix: the original printed the traceback and then fell off the
        # end, implicitly returning None; callers that iterate the result
        # crashed. Return the (possibly partial) set instead; it is NOT
        # cached so a later retry can still succeed.
        traceback.print_exc()
        return pages
def get_pages(self, phrase):
    """Search the Elastic index for pages matching *phrase*, following redirects.

    Results are memoized in ``self.cache`` keyed by the raw (unnormalized)
    phrase. Hyphens are treated as spaces and whitespace runs are collapsed
    before querying.

    Args:
        phrase: surface form to search for.

    Returns:
        set of WikipediaSearchPageResult. On an unexpected search failure the
        traceback is printed and whatever was collected so far is returned
        (possibly an empty set) — never None.
    """
    if phrase in self.cache:
        return self.cache[phrase]
    pages = set()
    try:
        # Normalize: hyphens become spaces, then collapse whitespace runs.
        phrase_strip = " ".join(phrase.replace("-", " ").split())
        best_results = self.get_best_elastic_results(phrase_strip)
        for result in best_results:
            _id = result["_id"]
            if _id != 0:
                result_source = result["_source"]
                if "redirectTitle" in result_source:
                    redirect_title = result_source["redirectTitle"]
                    red_result = None
                    # Follow the redirect chain until it resolves to a page
                    # with no further redirect, dead-ends, or returns to the
                    # original title. Bug fix: result_source["title"] never
                    # changes inside this loop, so a redirect cycle that does
                    # not include the original title spun forever — cap hops.
                    hops = 0
                    while redirect_title and result_source["title"] != redirect_title and hops < 50:
                        hops += 1
                        red_result = self.get_redirect_result(redirect_title)
                        if red_result is None or len(red_result) == 0:
                            print("could not find redirect title=" +
                                  redirect_title +
                                  ", does not exist in data")
                            redirect_title = None
                        elif "redirectTitle" in red_result[0]["_source"]:
                            redirect_title = red_result[0]["_source"]["redirectTitle"]
                        else:
                            redirect_title = None
                    if red_result is not None and len(red_result) > 0:
                        result = red_result[0]
                        _id = result["_id"]
                elastic_page_result = self.get_page_from_result_v1(
                    phrase_strip, result, _id)
                pages.add(WikipediaSearchPageResult(phrase, elastic_page_result))
        self.cache[phrase] = pages
        return pages
    except Exception:
        # Bug fix: the original printed the traceback and then fell off the
        # end, implicitly returning None; callers that iterate the result
        # crashed. Return the (possibly partial) set instead; it is NOT
        # cached so a later retry can still succeed.
        traceback.print_exc()
        return pages
def get_pages(self, phrase):
    """Resolve *phrase* to Wikipedia pages, trying several capitalizations.

    The phrase is looked up as-is plus dehyphenated lower/UPPER/Title-case
    variants; each variant that redirects to a real page (pageid != 0) yields
    one result. Per-variant failures are printed and skipped (best effort).
    Results are memoized in ``self.cache`` keyed by the raw phrase.

    Args:
        phrase: surface form to resolve.

    Returns:
        set of WikipediaSearchPageResult for every variant that resolved.
    """
    if phrase in self.cache:
        return self.cache[phrase]

    dehyphenated = phrase.replace('-', ' ')
    variants = {
        phrase,
        dehyphenated,
        dehyphenated.lower(),
        dehyphenated.upper(),
        dehyphenated.title(),
    }

    found = set()
    for variant in variants:
        try:
            redirect = self.get_page_redirect(variant)
            # pageid 0 means the variant resolved to nothing — skip it.
            if redirect.pageid == 0:
                continue
            page = self.get_wiki_page_with_items(phrase, redirect)
            found.add(WikipediaSearchPageResult(variant, page))
        except Exception as e:
            print(e)

    self.cache[phrase] = found
    return found
def extract_json_values(json_pages):
    """Build WikipediaSearchPageResult objects from raw page JSON dicts.

    Args:
        json_pages: iterable of per-page dicts with keys such as
            'description', 'pageid', 'orig_phrase', 'orig_phrase_norm',
            'wiki_title', 'wiki_title_norm' and a nested 'relations' dict.

    Returns:
        set of WikipediaSearchPageResult, one per input page (rank fixed to 0).
    """
    pages = set()
    for json_page in json_pages:
        description = json_page.get('description')
        pageid = int(json_page.get('pageid', 0))
        orig_phrase = json_page.get('orig_phrase')
        orig_phrase_norm = json_page.get('orig_phrase_norm')
        wiki_title = json_page.get('wiki_title')
        wiki_title_norm = json_page.get('wiki_title_norm')
        # Bug fix: 'relations' may be missing or null; the original defaulted
        # to None and raised AttributeError on the first .get() below. Use an
        # empty dict so every relation field just comes back as None.
        relations_json = json_page.get('relations') or {}
        rel_is_part_name = relations_json.get('isPartName')
        rel_is_disambiguation = relations_json.get('isDisambiguation')
        rel_disambiguation = relations_json.get('disambiguationLinks')
        rel_disambiguation_norm = relations_json.get('disambiguationLinksNorm')
        rel_parenthesis = relations_json.get('titleParenthesis')
        rel_parenthesis_norm = relations_json.get('titleParenthesisNorm')
        rel_categories = relations_json.get('categories')
        rel_categories_norm = relations_json.get('categoriesNorm')
        rel_be_comp = relations_json.get('beCompRelations')
        rel_be_comp_norm = relations_json.get('beCompRelationsNorm')
        rel_aliases = relations_json.get('aliases')
        rel_aliases_norm = relations_json.get('aliasesNorm')
        relations = WikipediaPageExtractedRelations(
            rel_is_part_name, rel_is_disambiguation, rel_parenthesis,
            rel_disambiguation, rel_categories, rel_aliases, rel_be_comp,
            rel_disambiguation_norm, rel_categories_norm, rel_aliases_norm,
            rel_parenthesis_norm, rel_be_comp_norm)
        page = WikipediaPage(orig_phrase, orig_phrase_norm, wiki_title,
                             wiki_title_norm, 0, pageid, description,
                             relations)
        pages.add(WikipediaSearchPageResult(orig_phrase, page))
    return pages