def _compute_tokens(self, ent):
    """Build the token representations used to compare entities.

    :param ent: entity dict with 'canonical_name', 'aliases',
        'par_relations', and 'chd_relations' keys
    :return: list of [name tokens, stemmed tokens, lemmatized tokens,
        character n-grams, per-alias token lists, parent-name set,
        child-name set]
    """
    norm_name = string_utils.normalize_string(ent['canonical_name'])
    name_tokens = string_utils.tokenize_string(norm_name, self.tokenizer,
                                               self.STOP)
    stemmed_tokens = tuple(self.stemmer.stem(t) for t in name_tokens)
    lemmatized_tokens = tuple(
        self.lemmatizer.lemmatize(t) for t in name_tokens)
    character_tokens = tuple(
        string_utils.get_character_n_grams(norm_name, constants.NGRAM_SIZE))
    # each alias contributes its own token list (list of lists)
    alias_tokens = [
        string_utils.tokenize_string(
            string_utils.normalize_string(alias), self.tokenizer, self.STOP)
        for alias in ent['aliases']
    ]
    return [
        name_tokens,
        stemmed_tokens,
        lemmatized_tokens,
        character_tokens,
        alias_tokens,
        set(ent['par_relations']),
        set(ent['chd_relations']),
    ]
def _normalize_ent(ent):
    """Return a normalized copy of an entity dict.

    Every string field is passed through string normalization; the
    parent/child relation lists become sets of normalized names.

    :param ent: entity dict with 'canonical_name', 'aliases',
        'definition', 'par_relations', and 'chd_relations' keys
    :return: new dict with the same keys and normalized values
    """
    normalize = string_utils.normalize_string
    return {
        'canonical_name': normalize(ent['canonical_name']),
        'aliases': [normalize(a) for a in ent['aliases']],
        'definition': normalize(ent['definition']),
        'par_relations': {normalize(i) for i in ent['par_relations']},
        'chd_relations': {normalize(i) for i in ent['chd_relations']},
    }
def _get_ent_names_from_relations(self, ent, kb, rel_types):
    """
    fetch the set of entity names that are related to the given entity

    :param ent: source entity whose relations are scanned
    :param kb: knowledge base holding relations and entities
    :param rel_types: set of relation types to extract
    :return: list of tokenized, normalized canonical names (one tuple of
        tokens per related entity found in the kb)
    """
    matching_rels = [kb.relations[rel_id] for rel_id in ent.relation_ids]
    # entity_ids[1] is the target end of each relation; keep only targets
    # that are actually indexed in the kb
    related_ids = [
        rel.entity_ids[1] for rel in matching_rels
        if rel.relation_type in rel_types
        and rel.entity_ids[1] in kb.research_entity_id_to_entity_index
    ]
    ent_names = []
    # fix: the original rebound the `ent` parameter inside this loop,
    # shadowing the source entity; use a distinct local name instead
    for related_id in related_ids:
        related_ent = kb.get_entity_by_research_entity_id(related_id)
        if related_ent:
            ent_names.append(
                tuple(
                    string_utils.tokenize_string(
                        string_utils.normalize_string(
                            related_ent.canonical_name),
                        self.tokenizer, self.STOP)))
    return ent_names
def _generate_token_map(self, ents: List[KBEntity]):
    """
    Generates token-to-entity and entity-to-token map for an input list
    of KBEntity objects

    :param ents: list of KBEntity objects
    :return: token-to-entity dict (token -> set of entity ids) and
        entity-to-token dict (entity id -> set of tokens)
    """
    # maps entity id key to word tokens in entity
    ent_to_tokens = dict()
    # maps token key to entities that have that token
    token_to_ents = defaultdict(set)
    for ent in ents:
        ent_id = ent.research_entity_id
        # tokenize all alias names and collect their character n-grams
        name_tokens = []
        char_tokens = []
        for name in ent.aliases:
            name_tokens += string_utils.tokenize_string(
                name, self.tokenizer, self.STOP)
            char_tokens += [
                ''.join(c) for c in string_utils.get_character_n_grams(
                    string_utils.normalize_string(name),
                    constants.NGRAM_SIZE)
            ]
        # tokenize the definition text
        def_tokens = string_utils.tokenize_string(ent.definition,
                                                  self.tokenizer,
                                                  self.STOP)
        # combine word tokens, character n-grams, and definition tokens
        tokens = set(name_tokens) | set(char_tokens) | set(def_tokens)
        # add to ent-to-token map
        ent_to_tokens[ent_id] = tokens
        # add to token-to-ent map; `tokens` already contains every
        # character n-gram, so the original's second pass over
        # char_tokens was redundant and has been removed
        for tok in tokens:
            token_to_ents[tok].add(ent_id)
    return token_to_ents, ent_to_tokens
def normalize_kb(self):
    """
    Normalize all strings in kb in place.

    Normalizes canonical names, aliases, definitions, and the wiki/mesh/
    dbpedia synonym lists, then caches de-duplicated parent, child,
    sibling, and synonym relation target ids in ``additional_details``.

    :return: None
    """
    for ent in self.entities:
        ent.canonical_name = string_utils.normalize_string(
            ent.canonical_name)
        ent.aliases = [
            string_utils.normalize_string(a) for a in ent.aliases
        ]
        ent.definition = string_utils.normalize_string(ent.definition)
        ent.additional_details['wiki_entities'] = [
            string_utils.normalize_string(i)
            for i in ent.additional_details['wiki_entities']
        ] if 'wiki_entities' in ent.additional_details else []
        # NOTE(review): upstream data apparently stores this list under
        # the misspelled key 'mesh_synonynms' (the original read only
        # that spelling). Prefer the misspelled key to preserve current
        # behavior, but fall back to the correct spelling so such input
        # is no longer silently dropped.
        mesh_key = ('mesh_synonynms'
                    if 'mesh_synonynms' in ent.additional_details
                    else 'mesh_synonyms')
        ent.additional_details['mesh_synonyms'] = [
            string_utils.normalize_string(i)
            for i in ent.additional_details[mesh_key]
        ] if mesh_key in ent.additional_details else []
        ent.additional_details['dbpedia_synonyms'] = [
            string_utils.normalize_string(i)
            for i in ent.additional_details['dbpedia_synonyms']
        ] if 'dbpedia_synonyms' in ent.additional_details else []
        all_rels = [self.relations[r_id] for r_id in ent.relation_ids]

        # collect relation target ids (entity_ids[1]) whose type is in
        # the given label set; de-duplicated by the caller via set()
        def _targets(labels):
            return [
                r.entity_ids[1] for r in all_rels
                if r.relation_type in labels
            ]

        ent.additional_details['par_relations'] = list(
            set(_targets(constants.UMLS_PARENT_REL_LABELS)))
        ent.additional_details['chd_relations'] = list(
            set(_targets(constants.UMLS_CHILD_REL_LABELS)))
        ent.additional_details['sib_relations'] = list(
            set(_targets(constants.UMLS_SIBLING_REL_LABELS)))
        ent.additional_details['syn_relations'] = list(
            set(_targets(constants.UMLS_SYNONYM_REL_LABELS)))
    return
def get_synonyms_to_entity(self, aliases: List):
    """
    Return synonyms of entity

    :param aliases: entity aliases
    :return: tuple of (unique mesh synonyms, unique dbpedia synonyms)
    """
    # look up synonyms keyed by each normalized alias
    # (self.mesh_synonyms / self.dbpedia_synonyms are presumably
    # defaultdicts -- a plain dict would raise KeyError on a miss;
    # TODO confirm against their construction site)
    mesh_syns = []
    dbpedia_syns = []
    for alias in aliases:
        norm_alias = string_utils.normalize_string(alias)
        mesh_syns.extend(self.mesh_synonyms[norm_alias])
        dbpedia_syns.extend(self.dbpedia_synonyms[norm_alias])
    return list(set(mesh_syns)), list(set(dbpedia_syns))