Пример #1
0
    def extract_json_values(json_pages):
        """Build search-result objects from an iterable of page dictionaries.

        Every record is read with ``dict.get``, so absent keys resolve to
        ``None`` (``"pageid"`` falls back to ``0`` and is converted with
        ``int``). The record's ``"relations"`` mapping supplies the
        arguments for ``WikipediaPageExtractedRelations``.

        Args:
            json_pages: iterable of dict-like page records.

        Returns:
            set: the ``WikipediaSearchPageResult`` objects, one added per
            input record.
        """
        # JSON keys listed in the positional order in which their values are
        # handed to WikipediaPageExtractedRelations.
        relation_keys = (
            "isPartName",
            "isDisambiguation",
            "titleParenthesis",
            "disambiguationLinks",
            "categories",
            "aliases",
            "beCompRelations",
            "disambiguationLinksNorm",
            "categoriesNorm",
            "aliasesNorm",
            "titleParenthesisNorm",
            "beCompRelationsNorm",
        )

        pages = set()
        for json_page in json_pages:
            # A record without relations (missing key or explicit null) used
            # to fail with AttributeError on None; treat it as an empty
            # mapping so every relation value simply becomes None.
            relations_json = json_page.get("relations") or {}
            relations = WikipediaPageExtractedRelations(
                *(relations_json.get(key) for key in relation_keys)
            )

            orig_phrase = json_page.get("orig_phrase")
            page = WikipediaPage(
                orig_phrase,
                json_page.get("orig_phrase_norm"),
                json_page.get("wiki_title"),
                json_page.get("wiki_title_norm"),
                0,  # fifth positional slot is always passed as a fixed 0 here
                int(json_page.get("pageid", 0)),
                json_page.get("description"),
                relations,
            )
            pages.add(WikipediaSearchPageResult(orig_phrase, page))

        return pages
Пример #2
0
 def is_name_description(text, item, is_disambiguation):
     """Tell whether a page should be treated as a name description.

     Without an ``item`` the answer is always ``False``. For a
     disambiguation page the decision is delegated to
     ``WikipediaPageExtractedRelations.is_name_part(text)``. Otherwise the
     English entry of ``item.get()['descriptions']`` is lower-cased and
     searched for any entry of ``NAME_DESCRIPTIONS``.

     Args:
         text: page text; only consulted on the disambiguation branch.
         item: object exposing ``get()``, or ``None``.
         is_disambiguation: truthy when the page is a disambiguation page.

     Returns:
         bool: ``True`` on a match, ``False`` in every other case.
     """
     if item is None:
         return False

     if is_disambiguation:
         return bool(WikipediaPageExtractedRelations.is_name_part(text))

     item_data = item.get()
     if item_data is None or 'descriptions' not in item_data:
         return False

     descriptions = item_data['descriptions']
     if descriptions is None or 'en' not in descriptions:
         return False

     hits = [marker for marker in NAME_DESCRIPTIONS
             if marker in descriptions['en'].lower()]
     return bool(hits)
Пример #3
0
    def get_page_from_result_v1(self, phrase, result, result_id):
        """Convert one search hit into a ``WikipediaPage``.

        Args:
            phrase: the searched phrase; passed as the page's first
                constructor argument.
            result: hit mapping exposing ``"_source"`` and ``"_score"``
                (looks like an Elasticsearch hit -- confirm with the caller).
            result_id: identifier of the hit; ``0`` is treated as "no hit".

        Returns:
            WikipediaPage: a page built from the hit, or a default-constructed
            ``WikipediaPage()`` when ``result_id`` is 0 or ``result`` is None.

        Raises:
            KeyError: for dict-like input, when ``result`` lacks
                ``"_source"``/``"_score"``, a non-null source lacks
                ``"title"``/``"relations"``, or non-null relations lack
                ``"isPartName"``/``"isDisambiguation"``.
        """
        if result_id == 0 or result is None:
            return WikipediaPage()

        # Both start as None so a hit whose "_source" is null still yields a
        # page. Previously `title` was left unbound on that path and the
        # return statement raised UnboundLocalError.
        title = None
        relations = None

        result_source = result["_source"]
        result_score = result["_score"]
        if result_source is not None:
            title = result_source["title"]
            relations_source = result_source["relations"]

            if relations_source is not None:
                is_part = relations_source["isPartName"]
                is_disambig = relations_source["isDisambiguation"]

                # The remaining fields go through the class helper
                # (presumably tolerant of absent keys -- verify).
                disambig_links = self.safe_extract_field_from_dict(
                    "disambiguationLinks", relations_source)
                disambig_links_norm = self.safe_extract_field_from_dict(
                    "disambiguationLinksNorm", relations_source)
                categories = self.safe_extract_field_from_dict(
                    "categories", relations_source)
                categories_norm = self.safe_extract_field_from_dict(
                    "categoriesNorm", relations_source)
                title_parent = self.safe_extract_field_from_dict(
                    "titleParenthesis", relations_source)
                title_parent_norm = self.safe_extract_field_from_dict(
                    "titleParenthesisNorm", relations_source)
                be_comp = self.safe_extract_field_from_dict(
                    "beCompRelations", relations_source)
                be_comp_norm = self.safe_extract_field_from_dict(
                    "beCompRelationsNorm", relations_source)

                # The sixth and tenth positional arguments are not read from
                # the hit and are passed as None.
                relations = WikipediaPageExtractedRelations(
                    is_part,
                    is_disambig,
                    title_parent,
                    disambig_links,
                    categories,
                    None,
                    be_comp,
                    disambig_links_norm,
                    categories_norm,
                    None,
                    title_parent_norm,
                    be_comp_norm,
                )

        return WikipediaPage(phrase, None, title, None, result_score,
                             result_id, None, relations)
Пример #4
0
    def get_wiki_page_with_items(self, phrase, page):
        """Assemble a ``WikipediaPage`` for ``phrase`` from a fetched page.

        The page's item is obtained through ``self.get_wiki_page_item`` and
        used to derive aliases, description and the disambiguation flag. A
        ``WikipediaPageExtractedRelations`` object is then filled with those
        values, the part-name flag, the be-comp relations taken from the page
        text, and whatever ``extract_relations_from_text_v0`` adds.

        Args:
            phrase: the phrase the page was looked up for.
            page: page object exposing ``pageid``, ``text`` and
                ``_link._title``.

        Returns:
            WikipediaPage: the assembled page (also logged at debug level).
        """
        wiki_item = self.get_wiki_page_item(page)
        page_id = page.pageid
        item_aliases = self.get_aliases(wiki_item)
        item_description = self.get_description(wiki_item)
        page_text = page.text
        # NOTE: the title is read from private attributes of the page object.
        title = page._link._title

        extracted = WikipediaPageExtractedRelations()
        extracted.is_disambiguation = self.is_disambiguation_page(wiki_item)
        extracted.is_part_name = self.is_name_description(
            page_text,
            wiki_item,
            extracted.is_disambiguation,
        )
        extracted.aliases = item_aliases
        be_comp, be_comp_norm = self.extract_be_comp(page_text)
        extracted.be_comp = be_comp
        extracted.be_comp_norm = be_comp_norm
        extracted.extract_relations_from_text_v0(page_text)

        result_page = WikipediaPage(
            phrase,
            None,
            title,
            None,
            0,
            page_id,
            item_description,
            extracted,
        )

        logger.debug("Page: {}. Extracted successfully".format(result_page))

        return result_page
Пример #5
0
    def get_wiki_page_with_items(self, phrase, page):
        """Assemble a ``WikipediaPage`` for ``phrase`` from a fetched page.

        The page's item is obtained through ``self.get_wiki_page_item`` and
        used to derive aliases, description and the disambiguation flag. A
        ``WikipediaPageExtractedRelations`` object is then filled with those
        values, the part-name flag, and whatever
        ``extract_relations_from_text_v0`` adds from the page text.

        Args:
            phrase: the phrase the page was looked up for.
            page: page object exposing ``pageid``, ``text`` and
                ``_link._title``.

        Returns:
            WikipediaPage: the assembled page (a status line is printed to
            standard output before returning).
        """
        wiki_item = self.get_wiki_page_item(page)
        page_id = page.pageid
        item_aliases = self.get_aliases(wiki_item)
        item_description = self.get_description(wiki_item)
        page_text = page.text
        # NOTE: the title is read from private attributes of the page object.
        title = page._link._title

        extracted = WikipediaPageExtractedRelations()
        extracted.is_disambiguation = self.is_disambiguation_page(wiki_item)
        extracted.is_part_name = self.is_name_description(
            page_text,
            wiki_item,
            extracted.is_disambiguation,
        )
        extracted.aliases = item_aliases
        extracted.extract_relations_from_text_v0(page_text)

        result_page = WikipediaPage(
            phrase,
            None,
            title,
            None,
            0,
            page_id,
            item_description,
            extracted,
        )

        print('Page:' + str(result_page) + ". Extracted successfully")

        return result_page
Пример #6
0
    def extract_json_values(json_pages):
        """Build search-result objects from an iterable of page dictionaries.

        Every record is read with ``dict.get``, so absent keys resolve to
        ``None`` (``'pageid'`` falls back to ``0`` and is converted with
        ``int``). The record's ``'relations'`` mapping supplies the
        arguments for ``WikipediaPageExtractedRelations``.

        Args:
            json_pages: iterable of dict-like page records.

        Returns:
            set: the ``WikipediaSearchPageResult`` objects, one added per
            input record.
        """
        # JSON keys listed in the positional order in which their values are
        # handed to WikipediaPageExtractedRelations.
        relation_keys = (
            'isPartName', 'isDisambiguation', 'titleParenthesis',
            'disambiguationLinks', 'categories', 'aliases', 'beCompRelations',
            'disambiguationLinksNorm', 'categoriesNorm', 'aliasesNorm',
            'titleParenthesisNorm', 'beCompRelationsNorm')

        pages = set()
        for json_page in json_pages:
            # A record without relations (missing key or explicit null) used
            # to fail with AttributeError on None; treat it as an empty
            # mapping so every relation value simply becomes None.
            relations_json = json_page.get('relations') or {}
            relations = WikipediaPageExtractedRelations(
                *(relations_json.get(key) for key in relation_keys))

            orig_phrase = json_page.get('orig_phrase')
            # The fifth positional slot is always passed as a fixed 0 here.
            page = WikipediaPage(orig_phrase,
                                 json_page.get('orig_phrase_norm'),
                                 json_page.get('wiki_title'),
                                 json_page.get('wiki_title_norm'),
                                 0,
                                 int(json_page.get('pageid', 0)),
                                 json_page.get('description'),
                                 relations)
            pages.add(WikipediaSearchPageResult(orig_phrase, page))

        return pages
Пример #7
0
    def get_page_from_result_v1(self, phrase, result, result_id):
        """Convert one search hit into a ``WikipediaPage``.

        Args:
            phrase: the searched phrase; passed as the page's first
                constructor argument.
            result: hit mapping exposing ``'_source'`` and ``'_score'``
                (looks like an Elasticsearch hit -- confirm with the caller).
            result_id: identifier of the hit; ``0`` is treated as "no hit".

        Returns:
            WikipediaPage: a page built from the hit, or a default-constructed
            ``WikipediaPage()`` when ``result_id`` is 0 or ``result`` is None.

        Raises:
            KeyError: for dict-like input, when ``result`` lacks
                ``'_source'``/``'_score'``, a non-null source lacks
                ``'title'``/``'relations'``, or non-null relations lack
                ``'isPartName'``/``'isDisambiguation'``.
        """
        if result_id == 0 or result is None:
            return WikipediaPage()

        # Both start as None so a hit whose '_source' is null still yields a
        # page. Previously `title` was left unbound on that path and the
        # return statement raised UnboundLocalError.
        title = None
        relations = None

        result_source = result['_source']
        result_score = result['_score']
        if result_source is not None:
            title = result_source['title']
            relations_source = result_source['relations']

            if relations_source is not None:
                is_part = relations_source['isPartName']
                is_disambig = relations_source['isDisambiguation']

                # The remaining fields go through the class helper
                # (presumably tolerant of absent keys -- verify).
                disambig_links = self.safe_extract_field_from_dict(
                    'disambiguationLinks', relations_source)
                disambig_links_norm = self.safe_extract_field_from_dict(
                    'disambiguationLinksNorm', relations_source)
                categories = self.safe_extract_field_from_dict(
                    'categories', relations_source)
                categories_norm = self.safe_extract_field_from_dict(
                    'categoriesNorm', relations_source)
                title_parent = self.safe_extract_field_from_dict(
                    'titleParenthesis', relations_source)
                title_parent_norm = self.safe_extract_field_from_dict(
                    'titleParenthesisNorm', relations_source)
                be_comp = self.safe_extract_field_from_dict(
                    'beCompRelations', relations_source)
                be_comp_norm = self.safe_extract_field_from_dict(
                    'beCompRelationsNorm', relations_source)

                # The sixth and tenth positional arguments are not read from
                # the hit and are passed as None.
                relations = WikipediaPageExtractedRelations(
                    is_part, is_disambig, title_parent, disambig_links,
                    categories, None, be_comp, disambig_links_norm,
                    categories_norm, None, title_parent_norm, be_comp_norm)

        return WikipediaPage(phrase, None, title, None, result_score,
                             result_id, None, relations)