Exemplo n.º 1
0
    def __init__(self,
                 load_path: str,
                 wiki_filename: str,
                 entities_filename: str,
                 inverted_index_filename: str,
                 id_to_name_file: str,
                 lemmatize: bool = True,
                 debug: bool = False,
                 rule_filter_entities: bool = True,
                 use_inverted_index: bool = True,
                 language: str = 'rus',
                 *args,
                 **kwargs) -> None:
        """

        Args:
            load_path: path to folder with wikidata files
            wiki_filename: file with Wikidata triplets
            entities_filename: file with dict of entity titles (keys) and entity ids (values)
            inverted_index_filename: file with dict of words (keys) and entities containing these words (values)
            id_to_name_file: file with dict of entity ids (keys) and entities names and aliases (values)
            lemmatize: whether to lemmatize tokens of extracted entity
            debug: whether to print entities extracted from Wikidata
            rule_filter_entities: whether to filter entities which do not fit the question
            use_inverted_index: whether to use inverted index for entity linking
            language - the language of the linker (used for filtration of some questions to improve overall performance)
            *args:
            **kwargs:
        """
        super().__init__(save_path=None, load_path=load_path)
        self.morph = pymorphy2.MorphAnalyzer()
        self.lemmatize = lemmatize
        self.debug = debug
        self.rule_filter_entities = rule_filter_entities
        self.use_inverted_index = use_inverted_index
        self._language = language
        if language not in self.LANGUAGES:
            log.warning(
                f'EntityLinker supports only the following languages: {self.LANGUAGES}'
            )

        self._wiki_filename = wiki_filename
        self._entities_filename = entities_filename
        self.inverted_index_filename = inverted_index_filename
        self.id_to_name_file = id_to_name_file

        self.name_to_q: Optional[Dict[str, List[Tuple[str]]]] = None
        self.wikidata: Optional[Dict[str, List[List[str]]]] = None
        self.inverted_index: Optional[Dict[str, List[Tuple[str]]]] = None
        self.id_to_name: Optional[Dict[str, Dict[List[str]]]] = None
        self.load()
        if self.use_inverted_index:
            alphabet = "abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщъыьэюя1234567890-_()=+!?.,/;:&@<>|#$%^*"
            dictionary_words = list(self.inverted_index.keys())
            self.searcher = LevenshteinSearcher(alphabet, dictionary_words)
Exemplo n.º 2
0
    def __init__(self, load_path: str,
                 inverted_index_filename: str,
                 entities_list_filename: str,
                 q2name_filename: str,
                 save_path: str = None,
                 q2descr_filename: str = None,
                 rel_ranker: RelRankerBertInfer = None,
                 build_inverted_index: bool = False,
                 kb_format: str = "hdt",
                 kb_filename: str = None,
                 label_rel: str = None,
                 descr_rel: str = None,
                 aliases_rels: List[str] = None,
                 sql_table_name: str = None,
                 sql_column_names: List[str] = None,
                 lang: str = "en",
                 use_descriptions: bool = False,
                 lemmatize: bool = False,
                 use_prefix_tree: bool = False,
                 **kwargs) -> None:
        """

        Args:
            load_path: path to folder with inverted index files
            save_path: path where to save inverted index files
            inverted_index_filename: file with dict of words (keys) and entities containing these words
            entities_list_filename: file with the list of entities from the knowledge base
            q2name_filename: name of file which maps entity id to name
            q2descr_filename: name of file which maps entity id to description
            rel_ranker: component deeppavlov.models.kbqa.rel_ranker_bert_infer
            build_inverted_index: if "true", inverted index of entities of the KB will be built
            kb_format: "hdt" or "sqlite3"
            kb_filename: file with the knowledge base, which will be used for building of inverted index
            label_rel: relation in the knowledge base which connects entity ids and entity titles
            descr_rel: relation in the knowledge base which connects entity ids and entity descriptions
            aliases_rels: list of relations which connect entity ids and entity aliases
            sql_table_name: name of the table with the KB if the KB is in sqlite3 format
            sql_column_names: names of columns with subject, relation and object
            lang: language used
            use_descriptions: whether to use context and descriptions of entities for entity ranking
            lemmatize: whether to lemmatize tokens of extracted entity
            use_prefix_tree: whether to use prefix tree for search of entities with typos in entity labels
            **kwargs:
        """
        super().__init__(save_path=save_path, load_path=load_path)
        self.morph = pymorphy2.MorphAnalyzer()
        self.lemmatize = lemmatize
        self.use_prefix_tree = use_prefix_tree
        self.inverted_index_filename = inverted_index_filename
        self.entities_list_filename = entities_list_filename
        self.build_inverted_index = build_inverted_index
        self.q2name_filename = q2name_filename
        self.q2descr_filename = q2descr_filename
        self.kb_format = kb_format
        self.kb_filename = kb_filename
        self.label_rel = label_rel
        self.aliases_rels = aliases_rels
        self.descr_rel = descr_rel
        self.sql_table_name = sql_table_name
        self.sql_column_names = sql_column_names
        self.inverted_index: Optional[Dict[str, List[Tuple[str]]]] = None
        self.entities_index: Optional[List[str]] = None
        self.q2name: Optional[List[Tuple[str]]] = None
        self.lang_str = f"@{lang}"
        if self.lang_str == "@en":
            self.stopwords = set(stopwords.words("english"))
        elif self.lang_str == "@ru":
            self.stopwords = set(stopwords.words("russian"))
        self.re_tokenizer = re.compile(r"[\w']+|[^\w ]")
        self.rel_ranker = rel_ranker
        self.use_descriptions = use_descriptions

        if self.use_prefix_tree:
            alphabet = "!#%\&'()+,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz½¿ÁÄ" + \
                       "ÅÆÇÉÎÓÖ×ÚßàáâãäåæçèéêëíîïðñòóôöøùúûüýāăąćČčĐėęěĞğĩīİıŁłńňŌōőřŚśşŠšťũūůŵźŻżŽžơưșȚțəʻ" + \
                       "ʿΠΡβγБМавдежикмностъяḤḥṇṬṭầếờợ–‘’Ⅲ−∗"
            dictionary_words = list(self.inverted_index.keys())
            self.searcher = LevenshteinSearcher(alphabet, dictionary_words)

        if self.build_inverted_index:
            if self.kb_format == "hdt":
                self.doc = HDTDocument(str(expand_path(self.kb_filename)))
            if self.kb_format == "sqlite3":
                self.conn = sqlite3.connect(str(expand_path(self.kb_filename)))
                self.cursor = self.conn.cursor()
            self.inverted_index_builder()
            self.save()
        else:
            self.load()
Exemplo n.º 3
0
    def __init__(self,
                 load_path: str,
                 inverted_index_filename: str,
                 entities_list_filename: str,
                 q2name_filename: str,
                 types_dict_filename: Optional[str] = None,
                 who_entities_filename: Optional[str] = None,
                 save_path: str = None,
                 q2descr_filename: str = None,
                 descr_rank_score_thres: float = 0.01,
                 freq_dict_filename: Optional[str] = None,
                 entity_ranker: RelRankerBertInfer = None,
                 build_inverted_index: bool = False,
                 kb_format: str = "hdt",
                 kb_filename: str = None,
                 label_rel: str = None,
                 descr_rel: str = None,
                 aliases_rels: List[str] = None,
                 sql_table_name: str = None,
                 sql_column_names: List[str] = None,
                 lang: str = "en",
                 use_descriptions: bool = False,
                 include_mention: bool = False,
                 num_entities_to_return: int = 5,
                 lemmatize: bool = False,
                 use_prefix_tree: bool = False,
                 **kwargs) -> None:
        """

        Args:
            load_path: path to folder with inverted index files
            inverted_index_filename: file with dict of words (keys) and entities containing these words
            entities_list_filename: file with the list of entities from the knowledge base
            q2name_filename: file which maps entity id to name
            types_dict_filename: file with types of entities
            who_entities_filename: file with the list of entities in Wikidata, which can be answers to questions
                with "Who" pronoun, i.e. humans, literary characters etc.
            save_path: path where to save inverted index files
            q2descr_filename: name of file which maps entity id to description
            descr_rank_score_thres: if the score of the entity description is less than threshold, the entity is not
                added to output list
            freq_dict_filename: filename with frequences dictionary of Russian words
            entity_ranker: component deeppavlov.models.kbqa.rel_ranker_bert_infer
            build_inverted_index: if "true", inverted index of entities of the KB will be built
            kb_format: "hdt" or "sqlite3"
            kb_filename: file with the knowledge base, which will be used for building of inverted index
            label_rel: relation in the knowledge base which connects entity ids and entity titles
            descr_rel: relation in the knowledge base which connects entity ids and entity descriptions
            aliases_rels: list of relations which connect entity ids and entity aliases
            sql_table_name: name of the table with the KB if the KB is in sqlite3 format
            sql_column_names: names of columns with subject, relation and object
            lang: language used
            use_descriptions: whether to use context and descriptions of entities for entity ranking
            include_mention: whether to leave or delete entity mention from the sentence before passing to BERT ranker
            num_entities_to_return: how many entities for each substring the system returns
            lemmatize: whether to lemmatize tokens of extracted entity
            use_prefix_tree: whether to use prefix tree for search of entities with typos in entity labels
            **kwargs:
        """
        super().__init__(save_path=save_path, load_path=load_path)
        self.morph = pymorphy2.MorphAnalyzer()
        self.lemmatize = lemmatize
        self.use_prefix_tree = use_prefix_tree
        self.inverted_index_filename = inverted_index_filename
        self.entities_list_filename = entities_list_filename
        self.build_inverted_index = build_inverted_index
        self.q2name_filename = q2name_filename
        self.types_dict_filename = types_dict_filename
        self.who_entities_filename = who_entities_filename
        self.q2descr_filename = q2descr_filename
        self.descr_rank_score_thres = descr_rank_score_thres
        self.freq_dict_filename = freq_dict_filename
        self.kb_format = kb_format
        self.kb_filename = kb_filename
        self.label_rel = label_rel
        self.aliases_rels = aliases_rels
        self.descr_rel = descr_rel
        self.sql_table_name = sql_table_name
        self.sql_column_names = sql_column_names
        self.inverted_index: Optional[Dict[str, List[Tuple[str]]]] = None
        self.entities_index: Optional[List[str]] = None
        self.q2name: Optional[List[Tuple[str]]] = None
        self.types_dict: Optional[Dict[str, List[str]]] = None
        self.lang_str = f"@{lang}"
        if self.lang_str == "@en":
            self.stopwords = set(stopwords.words("english"))
        elif self.lang_str == "@ru":
            self.stopwords = set(stopwords.words("russian"))
        self.re_tokenizer = re.compile(r"[\w']+|[^\w ]")
        self.entity_ranker = entity_ranker
        self.use_descriptions = use_descriptions
        self.include_mention = include_mention
        self.num_entities_to_return = num_entities_to_return
        if self.use_descriptions and self.entity_ranker is None:
            raise ValueError("No entity ranker is provided!")

        if self.use_prefix_tree:
            alphabet = "!#%\&'()+,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz½¿ÁÄ" + \
                       "ÅÆÇÉÎÓÖ×ÚßàáâãäåæçèéêëíîïðñòóôöøùúûüýāăąćČčĐėęěĞğĩīİıŁłńňŌōőřŚśşŠšťũūůŵźŻżŽžơưșȚțəʻ" + \
                       "ʿΠΡβγБМавдежикмностъяḤḥṇṬṭầếờợ–‘’Ⅲ−∗"
            dictionary_words = list(self.inverted_index.keys())
            self.searcher = LevenshteinSearcher(alphabet, dictionary_words)

        if self.build_inverted_index:
            if self.kb_format == "hdt":
                self.doc = HDTDocument(str(expand_path(self.kb_filename)))
            elif self.kb_format == "sqlite3":
                self.conn = sqlite3.connect(str(expand_path(self.kb_filename)))
                self.cursor = self.conn.cursor()
            else:
                raise ValueError(
                    f'unsupported kb_format value {self.kb_format}')
            self.inverted_index_builder()
            self.save()
        else:
            self.load()