Example #1
    def _compute_tokens(self, ent):
        """
        Compute tokens from given entity
        :param ent:
        :return:
        """
        name_tokens = string_utils.tokenize_string(ent['canonical_name'],
                                                   self.tokenizer, self.STOP)
        stemmed_tokens = tuple([self.stemmer.stem(w) for w in name_tokens])
        lemmatized_tokens = tuple(
            [self.lemmatizer.lemmatize(w) for w in name_tokens])
        character_tokens = tuple(
            string_utils.get_character_n_grams(ent['canonical_name'],
                                               constants.NGRAM_SIZE))

        alias_tokens = [
            string_utils.tokenize_string(a, self.tokenizer, self.STOP)
            for a in ent['aliases']
        ]

        def_tokens = string_utils.tokenize_string(ent['definition'],
                                                  self.tokenizer, self.STOP)

        return [
            name_tokens, stemmed_tokens, lemmatized_tokens, character_tokens,
            alias_tokens, def_tokens
        ]
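Note: `ent` is a dict-like entity record; stemming and lemmatization are delegated to `self.stemmer` and `self.lemmatizer`. The sketch below reproduces the tokenization flow with standard-library stand-ins only (`simple_tokenize`, `char_n_grams`, the `STOP` set, and `NGRAM_SIZE` are hypothetical, not the project's `string_utils`/`constants` members, and the stemming/lemmatization steps are omitted):

import re

NGRAM_SIZE = 3  # assumed value; the real size comes from constants.NGRAM_SIZE
STOP = frozenset({'the', 'of', 'and'})  # toy stop word list

def simple_tokenize(text):
    # lowercase, split on non-word characters, drop stop words
    return [t for t in re.findall(r'\w+', text.lower()) if t not in STOP]

def char_n_grams(text, n=NGRAM_SIZE):
    # overlapping character n-grams of the raw string
    return tuple(text[i:i + n] for i in range(max(len(text) - n + 1, 1)))

ent = {
    'canonical_name': 'Acute myocardial infarction',
    'aliases': ['heart attack', 'AMI'],
    'definition': 'Necrosis of heart muscle caused by loss of blood supply.',
}

name_tokens = simple_tokenize(ent['canonical_name'])
alias_tokens = [simple_tokenize(a) for a in ent['aliases']]
def_tokens = simple_tokenize(ent['definition'])
character_tokens = char_n_grams(ent['canonical_name'])

print(name_tokens)   # ['acute', 'myocardial', 'infarction']
print(alias_tokens)  # [['heart', 'attack'], ['ami']]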
Example #2
    def _compute_tokens(self, ent):
        """
        Compute tokens from given entity
        :param ent:
        :return:
        """
        name_string = string_utils.normalize_string(ent['canonical_name'])
        name_tokens = string_utils.tokenize_string(name_string, self.tokenizer,
                                                   self.STOP)
        stemmed_tokens = tuple([self.stemmer.stem(w) for w in name_tokens])
        lemmatized_tokens = tuple(
            [self.lemmatizer.lemmatize(w) for w in name_tokens])
        character_tokens = tuple(
            string_utils.get_character_n_grams(name_string,
                                               constants.NGRAM_SIZE))

        alias_tokens = []

        for a in ent['aliases']:
            alias_tokens.append(
                string_utils.tokenize_string(string_utils.normalize_string(a),
                                             self.tokenizer, self.STOP))

        parent_names = ent['par_relations']
        child_names = ent['chd_relations']

        return [
            name_tokens, stemmed_tokens, lemmatized_tokens, character_tokens,
            alias_tokens,
            set(parent_names),
            set(child_names)
        ]
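Compared with Example #1, this version normalizes strings before tokenizing and returns parent/child relation name sets instead of definition tokens. A minimal sketch of the extra steps, assuming a hypothetical `normalize` helper (the real `string_utils.normalize_string` may behave differently):

import re

def normalize(text):
    # hypothetical stand-in: lowercase and collapse non-alphanumerics to single spaces
    return re.sub(r'[^a-z0-9]+', ' ', text.lower()).strip()

ent = {
    'canonical_name': 'Vitamin C (ascorbic acid)',
    'aliases': ['L-ascorbic acid'],
    'par_relations': ['vitamin'],  # hypothetical parent entity names
    'chd_relations': [],           # hypothetical child entity names
}

name_string = normalize(ent['canonical_name'])          # 'vitamin c ascorbic acid'
alias_strings = [normalize(a) for a in ent['aliases']]  # ['l ascorbic acid']
parent_names = set(ent['par_relations'])                # {'vitamin'}
child_names = set(ent['chd_relations'])                 # set()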
Example #3
def _char_tokenize(s, ngram_size):
    """
    Generate character n-grams over string s
    :param s: input string
    :param ngram_size: length of each character n-gram
    :return: character n-grams of s
    """
    return string_utils.get_character_n_grams(s, ngram_size)
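As an illustration of what character n-grams look like, here is a self-contained stand-in (not the project's `string_utils.get_character_n_grams`, whose exact output format may differ):

def char_n_grams(s, n):
    # overlapping character n-grams as tuples of characters
    return list(zip(*[s[i:] for i in range(n)]))

print(char_n_grams('token', 3))
# [('t', 'o', 'k'), ('o', 'k', 'e'), ('k', 'e', 'n')]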
Example #4
    def _generate_token_map(self, ents: List[KBEntity]):
        """
        Generates token-to-entity and entity-to-token map for an input list
        of KBEntity objects
        :param ents: list of KBEntity objects
        :return: token-to-entity dict and entity-to-token dict
        """
        # maps entity id key to word tokens in entity
        ent_to_tokens = dict()

        # maps token key to entities that have that token
        token_to_ents = defaultdict(set)

        for ent in ents:
            ent_id = ent.research_entity_id

            # tokenize all names and definitions
            name_tokens = []
            char_tokens = []
            for name in ent.aliases:
                name_tokens += string_utils.tokenize_string(
                    name, self.tokenizer, self.STOP)
                char_tokens += [
                    ''.join(c) for c in string_utils.get_character_n_grams(
                        string_utils.normalize_string(name),
                        constants.NGRAM_SIZE)
                ]

            def_tokens = string_utils.tokenize_string(ent.definition,
                                                      self.tokenizer,
                                                      self.STOP)

            # combine tokens
            tokens = set(name_tokens).union(set(char_tokens)).union(
                set(def_tokens))

            # add to ent-to-token map
            ent_to_tokens[ent_id] = tokens

            # add to token-to-ent map
            for tok in tokens:
                token_to_ents[tok].add(ent_id)

            # index alias character n-grams (already included in tokens above)
            for ng in char_tokens:
                token_to_ents[ng].add(ent_id)
        return token_to_ents, ent_to_tokens
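The two maps form a simple inverted index from tokens to candidate entities. A stripped-down sketch of the same idea with plain dicts and a hypothetical tokenizer (no `KBEntity` or `string_utils`):

import re
from collections import defaultdict

def tokenize(text):
    # hypothetical word tokenizer: lowercase word tokens
    return re.findall(r'\w+', text.lower())

entities = [
    {'id': 'E1', 'aliases': ['heart attack'], 'definition': 'necrosis of heart muscle'},
    {'id': 'E2', 'aliases': ['stroke'], 'definition': 'loss of blood flow to the brain'},
]

ent_to_tokens = {}
token_to_ents = defaultdict(set)

for ent in entities:
    tokens = set()
    for alias in ent['aliases']:
        tokens.update(tokenize(alias))
    tokens.update(tokenize(ent['definition']))

    ent_to_tokens[ent['id']] = tokens
    for tok in tokens:
        token_to_ents[tok].add(ent['id'])

print(token_to_ents['heart'])  # {'E1'} -- entities sharing the token 'heart'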