Example #1
    def _compute_tokens(self, ent):
        """
        Compute tokens from given entity
        :param ent:
        :return:
        """
        name_tokens = string_utils.tokenize_string(ent['canonical_name'],
                                                   self.tokenizer, self.STOP)
        stemmed_tokens = tuple([self.stemmer.stem(w) for w in name_tokens])
        lemmatized_tokens = tuple(
            [self.lemmatizer.lemmatize(w) for w in name_tokens])
        character_tokens = tuple(
            string_utils.get_character_n_grams(ent['canonical_name'],
                                               constants.NGRAM_SIZE))

        alias_tokens = [
            string_utils.tokenize_string(a, self.tokenizer, self.STOP)
            for a in ent['aliases']
        ]

        def_tokens = string_utils.tokenize_string(ent['definition'],
                                                  self.tokenizer, self.STOP)

        return [
            name_tokens, stemmed_tokens, lemmatized_tokens, character_tokens,
            alias_tokens, def_tokens
        ]
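These examples lean on a small string_utils module. Below is a minimal sketch of what those helpers appear to do, inferred only from the call sites in the examples (the real implementations in the repository may differ), and assuming tokenizer exposes an NLTK-style tokenize method:

import re

def normalize_string(s):
    # assumption: lowercase and replace punctuation with spaces
    return re.sub(r'[^a-z0-9 ]+', ' ', s.lower()).strip()

def tokenize_string(s, tokenizer, stop_words):
    # assumption: word-tokenize and drop stop words, returning a list of tokens
    return [t for t in tokenizer.tokenize(s) if t not in stop_words]

def get_character_n_grams(s, n):
    # returns character n-grams as tuples of characters, which is why example #5
    # re-joins each one with ''.join(c)
    return [tuple(s[i:i + n]) for i in range(len(s) - n + 1)]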
Example #2
    def _compute_tokens(self, ent):
        """
        Compute tokens from given entity
        :param ent:
        :return:
        """
        name_string = string_utils.normalize_string(ent['canonical_name'])
        name_tokens = string_utils.tokenize_string(name_string, self.tokenizer,
                                                   self.STOP)
        stemmed_tokens = tuple([self.stemmer.stem(w) for w in name_tokens])
        lemmatized_tokens = tuple(
            [self.lemmatizer.lemmatize(w) for w in name_tokens])
        character_tokens = tuple(
            string_utils.get_character_n_grams(name_string,
                                               constants.NGRAM_SIZE))

        alias_tokens = []

        for a in ent['aliases']:
            alias_tokens.append(
                string_utils.tokenize_string(string_utils.normalize_string(a),
                                             self.tokenizer, self.STOP))

        parent_names = ent['par_relations']
        child_names = ent['chd_relations']

        return [
            name_tokens, stemmed_tokens, lemmatized_tokens, character_tokens,
            alias_tokens,
            set(parent_names),
            set(child_names)
        ]
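Compared to example #1, this variant normalizes strings before tokenizing and replaces the definition tokens with the sets of parent and child relation names. A hypothetical input dict that exercises every field the method reads (the field names come from the code above; the values are invented):

ent = {
    'canonical_name': 'Diabetes Mellitus, Type 2',
    'aliases': ['type 2 diabetes', 'T2DM'],
    'par_relations': {'diabetes mellitus'},   # parent entity names
    'chd_relations': set(),                   # child entity names
}

# _compute_tokens(ent) would then return, in order: name tokens, stems, lemmas,
# character n-grams, per-alias token lists, and the parent and child name sets.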
Example #3
    def _tokenize(self, s):
        """
        Tokenize string s
        :param s: input string
        :return: list of tokens of s with stop words removed
        """
        return string_utils.tokenize_string(s, self.tokenizer, self.STOP)
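This thin wrapper lets other methods on the class tokenize with the instance's configured tokenizer and stop-word list without repeating those arguments at every call site.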
Example #4
    def _get_ent_names_from_relations(self, ent, kb, rel_types):
        """
        fetch the set of entity names that are related to the given entity
        :param ent:
        :param kb:
        :param rel_types: set of relations to extract
        :return:
        """
        matching_rels = [kb.relations[rel_id] for rel_id in ent.relation_ids]

        ent_ids = [
            rel.entity_ids[1] for rel in matching_rels
            if rel.relation_type in rel_types
            and rel.entity_ids[1] in kb.research_entity_id_to_entity_index
        ]

        ent_names = []
        for ent_id in ent_ids:
            # avoid shadowing the `ent` parameter while looking up related entities
            rel_ent = kb.get_entity_by_research_entity_id(ent_id)
            if rel_ent:
                ent_names.append(
                    tuple(
                        string_utils.tokenize_string(
                            string_utils.normalize_string(rel_ent.canonical_name),
                            self.tokenizer, self.STOP)))

        return ent_names
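A hypothetical call, tying this back to the parent/child name sets of example #2 (the variable names and the 'parent' relation type string are invented for illustration, not taken from the repository):

# parent_names = matcher._get_ent_names_from_relations(ent, kb, rel_types={'parent'})
# -> e.g. [('diabetes', 'mellitus')]: each related entity's canonical name,
#    normalized, tokenized, and wrapped in a tuple so it stays hashable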
Example #5
    def _generate_token_map(self, ents: List[KBEntity]):
        """
        Generates token-to-entity and entity-to-token map for an input list
        of KBEntity objects
        :param ents: list of KBEntity objects
        :return: token-to-entity dict and entity-to-token dict
        """
        # maps entity id key to word tokens in entity
        ent_to_tokens = dict()

        # maps token key to entities that have that token
        token_to_ents = defaultdict(set)

        for ent in ents:
            ent_id = ent.research_entity_id

            # tokenize all names and definitions
            name_tokens = []
            char_tokens = []
            for name in ent.aliases:
                name_tokens += string_utils.tokenize_string(
                    name, self.tokenizer, self.STOP)
                char_tokens += [
                    ''.join(c) for c in string_utils.get_character_n_grams(
                        string_utils.normalize_string(name),
                        constants.NGRAM_SIZE)
                ]

            def_tokens = string_utils.tokenize_string(ent.definition,
                                                      self.tokenizer,
                                                      self.STOP)

            # combine tokens
            tokens = set(name_tokens).union(set(char_tokens)).union(
                set(def_tokens))

            # add to ent-to-token map
            ent_to_tokens[ent_id] = tokens

            # add to token-to-ent map
            for tok in tokens:
                token_to_ents[tok].add(ent_id)

            # the character n-grams are already part of `tokens`, so this pass only
            # re-adds them to the token-to-ent map (redundant but harmless)
            for ng in char_tokens:
                token_to_ents[ng].add(ent_id)
        return token_to_ents, ent_to_tokens
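The returned token_to_ents map is an inverted index from token to the ids of entities containing that token. A common use for such an index, sketched here rather than taken from the repository, is cheap candidate retrieval: look up every token of a query and union the matching entity ids.

def get_candidates(query_tokens, token_to_ents):
    """Collect ids of entities that share at least one token with the query."""
    candidates = set()
    for tok in query_tokens:
        candidates |= token_to_ents.get(tok, set())
    return candidates

# e.g. get_candidates(['diabetes', 'mellitus'], token_to_ents)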