def load_kb(kb_path) -> KnowledgeBase:
    """
    Load the KnowledgeBase specified at kb_path.

    The loader is chosen from the file extension:
      * .json / .pickle / .pkl  -> KnowledgeBase.load
      * .obo / .OBO             -> KBLoader.import_obo_kb
      * .owl / .rdf (+ upper)   -> KBLoader.import_owl_kb
      * .msh                    -> KBLoader.load_mesh
      * .nci                    -> KBLoader.load_nci
      * .ttl / .n3              -> not supported
    Anything else is treated as a URL: it is validated, downloaded to a
    temporary file and parsed as OWL.

    :param kb_path: path (or URL) to knowledge base
    :return: the loaded KnowledgeBase
    :raises NotImplementedError: for .ttl / .n3 files
    :raises ValidationError: if kb_path has no known extension and is not a valid URL
    """
    sys.stdout.write("\tLoading %s...\n" % kb_path)

    assert kb_path is not None
    assert kb_path != ''

    kb_name = os.path.basename(kb_path)

    # load kb
    if kb_path.endswith(('.json', '.pickle', '.pkl')):
        kb = KnowledgeBase().load(kb_path)
    elif kb_path.endswith(('.obo', '.OBO')):
        kb = KBLoader.import_obo_kb(kb_name, kb_path)
    elif kb_path.endswith(('.owl', '.rdf', '.OWL', '.RDF')):
        kb = KBLoader.import_owl_kb(kb_name, kb_path)
    elif kb_path.endswith('.msh'):
        kb = KBLoader.load_mesh(kb_name, kb_path)
    elif kb_path.endswith('.nci'):
        kb = KBLoader.load_nci(kb_name, kb_path)
    elif kb_path.endswith(('.ttl', '.n3')):
        sys.stdout.write('This program cannot parse your file type.\n')
        raise NotImplementedError()
    else:
        import tempfile

        # Not a known extension: must be a URL. A ValidationError raised
        # here simply propagates to the caller.
        URLValidator()(kb_path)

        # Use a unique temporary file (instead of a fixed name in the
        # working directory) and always remove it, even when the download
        # or the OWL parse fails. The '.owl' suffix is kept because
        # import_owl_kb checks the file extension.
        tmp = tempfile.NamedTemporaryFile(suffix='.owl', delete=False)
        temp_file = tmp.name
        try:
            with tmp:
                response = requests.get(kb_path, stream=True)
                try:
                    response.raise_for_status()
                    for block in response.iter_content(1024):
                        tmp.write(block)
                finally:
                    # streamed responses hold the connection until closed
                    response.close()
            kb = KBLoader.import_owl_kb('', temp_file)
        finally:
            os.remove(temp_file)

    sys.stdout.write("\tEntities: %i\n" % len(kb.entities))

    return kb
def query_all_kb(self, kb: KnowledgeBase):
    """
    Enrich every entity of the KB with synonyms and a definition, then dump the KB.

    Synonyms are queried from the entity aliases and the definition from the
    canonical name. The results are stored under 'mesh_synonyms',
    'dbpedia_synonyms' and 'wiki_entities' in the entity's additional_details.
    The queried definition replaces the entity's own definition only when the
    latter is shorter than 5 characters. The KB is written to self.out_path.
    :param kb: knowledge base whose entities are enriched in place
    :return: None
    """
    all_entities = kb.entities
    for entity in tqdm.tqdm(all_entities, total=len(all_entities)):
        mesh_synonyms, dbpedia_synonyms = self.syn_enricher.get_synonyms_to_entity(
            entity.aliases
        )
        wiki_entities, wiki_definition = self.wiki_enricher.get_definition_to_entity(
            entity.canonical_name
        )

        entity.additional_details.update({
            'mesh_synonyms': mesh_synonyms,
            'dbpedia_synonyms': dbpedia_synonyms,
            'wiki_entities': wiki_entities,
        })

        # keep an existing definition unless it is (nearly) empty
        if len(entity.definition) < 5:
            entity.definition = wiki_definition

    kb.dump(kb, self.out_path)
def load_mesh(
    kb_name='mesh',
    path='C:\\Users\\EDMISML\\Desktop\\ont_align_data\\disease_subtrees\\mesh_dis.xml'
):
    """
    Create a KnowledgeBase from an RDF/XML MeSH file.

    Each top-level rdf:Description becomes a KBEntity: meshv:identifier is the
    entity id, skos:prefLabel the canonical name, skos:altLabel the aliases and
    every meshv:broaderDescriptor a 'subClassOf' relation to the id found after
    the last '/' of its rdf:resource.

    Descriptions lacking an identifier or a preferred label are skipped with a
    message; a broaderDescriptor without rdf:resource only skips that relation.

    :param kb_name: name given to the resulting KB
    :param path: path to the RDF/XML file. NOTE: the default is a
        developer-machine path; callers should pass the path explicitly.
    :return: the populated KnowledgeBase
    """
    kb = KnowledgeBase()
    kb.name = kb_name

    # parse the file, retrying with huge_tree for very large documents
    try:
        tree = etree.parse(path)
    except etree.XMLSyntaxError:
        tree = etree.parse(path, parser=etree.XMLParser(huge_tree=True))

    root = tree.getroot()
    ns = root.nsmap

    descriptions = root.findall('rdf:Description', ns)
    resource_attr = '{' + ns['rdf'] + '}resource'

    for desc in descriptions:
        # get mesh id
        id_elem = desc.find('meshv:identifier', ns)
        if id_elem is None:
            print('skipping rdf:Description without meshv:identifier in load_mesh')
            continue
        entity_id = id_elem.text

        name_elem = desc.find('skos:prefLabel', ns)
        if name_elem is None:
            print(f'skipping {entity_id} in load_mesh')
            continue

        entity = KBEntity(entity_id, None, [], '')
        entity.canonical_name = name_elem.text

        # get alt labels
        entity.aliases.extend(
            label.text
            for label in desc.findall('skos:altLabel', ns)
            if label.text is not None
        )

        # get broader-descriptor relations
        relations = []
        for sc_rel in desc.findall('meshv:broaderDescriptor', ns):
            target_uri = sc_rel.get(resource_attr)
            if target_uri is None:
                print(f'skipping element {sc_rel.attrib}')
                continue
            target_research_entity_id = target_uri.split('/')[-1].strip()
            relations.append(
                KBRelation(
                    relation_type='subClassOf',
                    entity_ids=[
                        entity.research_entity_id,
                        target_research_entity_id
                    ],
                    symmetric=False
                )
            )

        # add relations to entity and to kb
        for rel in relations:
            kb.add_relation(rel)
            rel_index = len(kb.relations) - 1
            entity.relation_ids.append(rel_index)

        # add entity to kb
        kb.add_entity(entity)

    return kb
def import_mesh(name, mesh_filename):
    """
    Create a KnowledgeBase object with entities from MeSH file.

    The file is read with file_util.read_lines and split into per-entity
    chunks at KBLoader.MESH_ENTITY_START_TAG; each chunk of 'KEY = value'
    lines becomes one KBEntity.
    :param name: name given to the resulting KB
    :param mesh_filename: path to the MeSH file
    :return: the populated KnowledgeBase
    """
    # initialize the KB
    kb = KnowledgeBase()
    kb.name = name

    def _make_mesh_entity(entity_chunk):
        """
        Make a KBEntity from each MeSH chunk
        :param entity_chunk: iterable of 'KEY = value' lines for one entity
        :return: the KBEntity built from the recognised keys
        """
        entity = KBEntity()
        for line in entity_chunk:
            # Split on the FIRST ' = ' only, so a value that itself contains
            # ' = ' is kept instead of the whole line being dropped.
            key, separator, value = line.partition(" = ")
            if not separator:
                continue
            if key == 'UI':
                entity.research_entity_id = value
            elif key in ('MH', 'SH'):
                entity.canonical_name = value
                entity.aliases.append(value)
            elif key in ('ENTRY', 'PRINT ENTRY'):
                # only the part before the first '|' is the alias text
                entity.aliases.append(value.split("|")[0])
            elif key == 'MS':
                entity.definition = value
        return entity

    for chunk in KBLoader._chunkify(
        file_util.read_lines(mesh_filename), KBLoader.MESH_ENTITY_START_TAG
    ):
        kb.add_entity(_make_mesh_entity(chunk))

    return kb
def import_dbpedia(kb_name, kb_filename, entities_count=0):
    """
    Build a KnowledgeBase from a dbpedia turtle (.ttl) file.

    Every triple whose subject starts with 'http://dbpedia.org/resource/'
    yields one entity: the subject URI is the id, the part after the prefix
    (underscores turned into spaces) is the canonical name and only alias, and
    the triple's object is the definition.
    :param kb_name: name given to the resulting KB
    :param kb_filename: turtle file; passed through file_util.cache_file first
    :param entities_count: stop after this many entities when > 0
    :return: the populated KnowledgeBase
    """
    resource_prefix = 'http://dbpedia.org/resource/'

    # initialize the KB.
    kb = KnowledgeBase()
    kb.name = kb_name

    # only the "turtle" format is allowed for this kb.
    assert '.ttl' in kb_filename
    kb_filename = file_util.cache_file(kb_filename)

    # parse the turtle file
    graph = rdflib.Graph()
    graph.parse(kb_filename, format='turtle')
    logging.warning('done parsing dbpedia .ttl files.')

    added = 0
    for subject, _predicate, obj in graph:
        entity = KBEntity()
        entity.research_entity_id = str(subject)
        if entity.research_entity_id.startswith(resource_prefix):
            title = entity.research_entity_id[len(resource_prefix):]
            entity.canonical_name = title.replace('_', ' ')
            entity.aliases.append(entity.canonical_name)
            entity.definition = str(obj)

            # verify and add entity to the KB.
            kb.add_entity(entity)

            added += 1
            if entities_count > 0 and added >= entities_count:
                break

    return kb
def create_umls_kbs(self, entities):
    """
    Build one KnowledgeBase per name in constants.TRAINING_KBS and write it out.

    For every KB name the matching entity records are turned into KBEntity and
    KBRelation objects, the KB is dumped to 'kb-<name>.json' under
    self.OUTPUT_KB_DIR, and self.add_context_to_kb is then called on it.
    :param entities: dict mapping kb name -> dict of entity records
    :return: None
    """
    for kb_name in constants.TRAINING_KBS:
        sys.stdout.write("\tCreating KB %s\n" % kb_name)

        kb = KnowledgeBase()
        kb.name = kb_name

        for record in entities[kb_name].values():
            entity = KBEntity(
                record['research_entity_id'],
                record['canonical_name'],
                record['aliases'],
                ' '.join(record['definition']),
            )

            for source_id, target_id, rel_type, symmetric in record['relations']:
                # relation endpoints are '<part0>:<part1>' of each id
                endpoints = [
                    '{0[0]}:{0[1]}'.format(source_id),
                    '{0[0]}:{0[1]}'.format(target_id),
                ]
                kb.add_relation(KBRelation(rel_type, endpoints, symmetric))
                # index of the relation that was just appended
                entity.relation_ids.append(len(kb.relations) - 1)

            kb.add_entity(entity)

        # write plain KB to json
        json_path = os.path.join(
            self.OUTPUT_KB_DIR, 'kb-{}.json'.format(kb_name)
        )
        kb.dump(kb, json_path)

        # add context to kb and write to file
        self.add_context_to_kb(kb)
def import_kb(kb_name, kb_filename):
    """
    Returns a KnowledgeBase object loaded from kb_filename. The KB must be one
    of the supported ones below.

    Files whose name starts with 's3' are first copied locally through
    file_util.cache_file; that local copy is removed again when this function
    exits, whether the import succeeded or raised.
    :param kb_name: one of the KBLoader KB-name constants; selects the importer
    :param kb_filename: path (or s3 location) of the raw KB file
    :return: the imported KnowledgeBase
    :raises LookupError: if kb_name is not a supported KB
    """
    # if needed, copy the file locally and update kb_filename.
    delete_local_copy = False
    if kb_filename.startswith('s3'):
        delete_local_copy = True
        kb_filename = file_util.cache_file(kb_filename)

    kb = None
    try:
        if kb_name in {
            KBLoader.SEQUENCE_ONTOLOGY,
            KBLoader.NCBI_TAXONOMY,
            KBLoader.CHEBI_TAXONOMY,
            KBLoader.GO_TAXONOMY,
            KBLoader.PR_TAXONOMY,
            KBLoader.CL_TAXONOMY,
            KBLoader.UNK_OBO_TAXONOMY
        }:
            kb = KBLoader.import_obo_kb(kb_name, kb_filename)
        elif kb_name == KBLoader.MESH_TAXONOMY:
            kb = KBLoader.import_mesh(kb_name, kb_filename)
        elif kb_name == KBLoader.DBPEDIA:
            kb = KBLoader.import_dbpedia(kb_name, kb_filename)
        elif kb_name == KBLoader.MERGED:
            kb = KnowledgeBase.load(kb_filename)
        else:
            raise LookupError("Unknown kb_name: {}".format(kb_name))
    finally:
        # remove the local copy of the raw kb file(s), even when the import
        # failed, so a bad file or unknown kb_name does not leak the copy.
        if delete_local_copy:
            os.remove(kb_filename)

    # return the imported kb.
    assert (kb is not None)
    return kb
def extract_negative_mappings(self):
    """
    Sample negative mappings for every KB pair in self.umls_training_data.

    For each pair the two KBs are loaded from 'kb-<name>.json' under
    self.OUTPUT_KB_DIR and negatives are sampled with
    self.sample_negative_mappings. When negatives were produced, positives
    plus negatives are written to '<kb1>-<kb2>.tsv' under
    <self.OUTPUT_DIR>/training and that path is appended to self.done_file.
    :return: None
    """
    for kb_names, positive_mappings in self.umls_training_data.items():
        source_name, target_name = kb_names[0], kb_names[1]

        # Format file names
        source_path = os.path.join(
            self.OUTPUT_KB_DIR, 'kb-{}.json'.format(source_name)
        )
        target_path = os.path.join(
            self.OUTPUT_KB_DIR, 'kb-{}.json'.format(target_name)
        )
        training_path = os.path.join(
            self.OUTPUT_DIR, 'training',
            '{}-{}.tsv'.format(source_name, target_name)
        )

        # initialize KBs
        source_kb, target_kb = KnowledgeBase(), KnowledgeBase()

        # load KBs
        sys.stdout.write("\tLoading %s and %s\n" % kb_names)
        source_kb = source_kb.load(source_path)
        target_kb = target_kb.load(target_path)

        # sample negatives using candidate selection module
        sys.stdout.write("\t\tSampling negatives between %s and %s\n" % kb_names)
        negatives = self.sample_negative_mappings(
            source_kb, target_kb, positive_mappings
        )

        if not negatives:
            continue

        # write positive and negative training mappings to disk
        self.write_mapping_to_file(training_path, positive_mappings + negatives)

        # record the written training file in the done file
        # NOTE(review): assumed to happen only when negatives were written —
        # confirm against the intended nesting.
        with open(self.done_file, 'a') as done_out:
            done_out.write('%s\n' % training_path)
def load_nci(
    kb_name='nci',
    path="C:\\Users\\EDMISML\\Desktop\\ont_align_data\\disease_subtrees\\nci_dis_subset.rdf"
):
    """
    Create a KnowledgeBase from an RDF/XML NCI file.

    Each top-level rdf:Description becomes a KBEntity: ns1:NHC0 is the entity
    id, rdfs:label the canonical name, ns1:P97 the definition, ns1:P90 the
    aliases and every rdfs:subClassOf a 'subClassOf' relation to the id found
    after the last '#' of its rdf:resource.

    Descriptions lacking an id or label are skipped with a message; a
    subClassOf element without rdf:resource only skips that relation.

    :param kb_name: name given to the resulting KB
    :param path: path to the RDF/XML file. NOTE: the default is a
        developer-machine path; callers should pass the path explicitly.
    :return: the populated KnowledgeBase
    """
    # initialize the KB
    kb = KnowledgeBase()
    kb.name = kb_name

    # parse the file, retrying with huge_tree for very large documents
    try:
        tree = etree.parse(path)
    except etree.XMLSyntaxError:
        tree = etree.parse(path, parser=etree.XMLParser(huge_tree=True))

    root = tree.getroot()
    ns = root.nsmap

    # get descriptions (queried once and reused for the loop below)
    desc_iter = root.findall('rdf:Description', ns)
    print('LEN OF RESOURCES', len(desc_iter))

    resource_attr = '{' + ns['rdf'] + '}resource'

    for desc in desc_iter:
        # get nci id
        id_elem = desc.find('ns1:NHC0', ns)
        if id_elem is None or id_elem.text is None:
            print('skipping rdf:Description without ns1:NHC0 in load_nci')
            continue
        entity_id = str(id_elem.text.strip())

        label_elem = desc.find('rdfs:label', ns)
        if label_elem is None:
            print(f'skipping {entity_id} in load_nci')
            continue

        entity = KBEntity(entity_id, None, [], '')
        entity.canonical_name = label_elem.text

        # get definition
        definition = desc.find('ns1:P97', ns)
        if definition is not None:
            entity.definition = definition.text

        # get alt labels
        entity.aliases.extend(
            label.text
            for label in desc.findall('ns1:P90', ns)
            if label.text is not None
        )

        # get subclass relations
        relations = []
        for sc_rel in desc.findall('rdfs:subClassOf', ns):
            target_uri = sc_rel.get(resource_attr)
            if target_uri is None:
                # e.g. a subClassOf that points at a nested/anonymous node
                print(f'skipping element {sc_rel.attrib}')
                continue
            target_research_entity_id = str(target_uri.split('#')[-1].strip())
            relations.append(
                KBRelation(
                    relation_type='subClassOf',
                    entity_ids=[
                        entity.research_entity_id,
                        target_research_entity_id
                    ],
                    symmetric=False
                )
            )

        # add relations to entity and to kb
        for rel in relations:
            kb.add_relation(rel)
            rel_index = len(kb.relations) - 1
            entity.relation_ids.append(rel_index)

        # add entity to kb
        kb.add_entity(entity)

    return kb
def import_owl_kb(kb_name, kb_filename):
    """
    Create a KnowledgeBase object with entities and relations from an OWL file.

    Top-level rdf:Description elements are read first to build a map from
    resource id to labels. Every top-level owl:Class with a non-empty
    rdf:about then becomes a KBEntity whose labels come from rdfs:label, the
    description map, skos labels and oboInOwl synonyms (skos / oboInOwl only
    when those prefixes are declared). The first label is the canonical name,
    all labels lower-cased are the aliases, skos:definition / obo:IAO_0000115
    give the definition and rdfs:subClassOf with an rdf:resource gives
    'subClassOf' relations.

    :param kb_name: name given to the resulting KB
    :param kb_filename: path to a .owl or .rdf file (extension checked
        case-insensitively)
    :return: the populated KnowledgeBase
    """
    # Case-insensitive so '.OWL' / '.RDF' file names are accepted too.
    assert kb_filename.lower().endswith(('.owl', '.rdf'))

    # initialize the KB
    kb = KnowledgeBase()
    kb.name = kb_name

    # parse the file, retrying with huge_tree for very large documents
    try:
        tree = etree.parse(kb_filename)
    except etree.XMLSyntaxError:
        p = etree.XMLParser(huge_tree=True)
        tree = etree.parse(kb_filename, parser=p)

    root = tree.getroot()
    ns = root.nsmap

    # drop the default (un-prefixed) namespace entry
    if None in ns:
        del ns[None]

    about_attr = '{' + ns['rdf'] + '}about'
    resource_attr = '{' + ns['rdf'] + '}resource'

    # Tags may only be queried when their prefix is declared in the document.
    skos_declared = 'skos' in ns
    obo_in_owl_declared = 'oboInOwl' in ns
    synonym_tags = [
        'oboInOwl:hasExactSynonym',
        'oboInOwl:hasRelatedSynonym',
        'oboInOwl:hasNarrowSynonym',
        'oboInOwl:hasBroadSynonym',
    ] if obo_in_owl_declared else []

    descriptions = dict()

    # get the description label for this resource id
    def get_label(l):
        if l.text is not None:
            return l.text
        r_id = l.get(resource_attr)
        if r_id is not None and r_id in descriptions:
            return descriptions[r_id][0]
        return None

    def collect(elem, tags, resolve):
        """Return the non-None resolve(child) of every child matching tags, in tag order."""
        found = []
        for tag in tags:
            for child in elem.findall(tag, ns):
                text = resolve(child)
                if text is not None:
                    found.append(text)
        return found

    # get description dict
    description_tags = ['rdfs:label']
    if skos_declared:
        description_tags.append('skos:prefLabel')
    description_tags += synonym_tags

    for desc in root.findall('rdf:Description', ns):
        resource_id = desc.get(about_attr)
        if resource_id is None:
            # nothing can refer to a description without rdf:about
            continue
        desc_labels = collect(desc, description_tags, lambda e: e.text)
        if desc_labels:
            descriptions[resource_id] = desc_labels

    class_skos_tags = [
        'skos:prefLabel', 'skos:altLabel', 'skos:hiddenLabel'
    ] if skos_declared else []

    # parse OWL classes
    for cl in root.findall('owl:Class', ns):
        # instantiate an entity.
        research_entity_id = cl.get(about_attr)
        entity = KBEntity(research_entity_id, None, [], '')

        # NOTE(review): classes without an rdf:about id are not added to the
        # KB — confirm this matches the intended nesting.
        if entity.research_entity_id is None or entity.research_entity_id == '':
            continue

        # list of KBRelations to add
        relations = []

        try:
            # get rdfs labels
            labels = collect(cl, ['rdfs:label'], get_label)

            # add labels from description
            if entity.research_entity_id in descriptions:
                labels += descriptions[entity.research_entity_id]

            # get skos labels and oboInOwl synonyms
            labels += collect(cl, class_skos_tags + synonym_tags, get_label)

            # set canonical_name and aliases
            if labels:
                entity.canonical_name = labels[0]
                # dict.fromkeys de-duplicates while keeping first-seen order,
                # so the alias list is reproducible between runs.
                entity.aliases = list(
                    dict.fromkeys(lab.lower() for lab in labels)
                )

            # if no name available (usually entity from external KB), replace name with id
            if entity.canonical_name is None:
                entity.canonical_name = entity.research_entity_id

            # get definition
            definition_tags = []
            if skos_declared:
                definition_tags.append('skos:definition')
            if 'obo' in ns:
                definition_tags.append('obo:IAO_0000115')
            for tag in definition_tags:
                for definition in cl.findall(tag, ns):
                    if definition.text is not None:
                        entity.definition += definition.text.lower() + ' '
            entity.definition = entity.definition.strip()

            # get subclass relations; subClassOf elements without an
            # rdf:resource attribute are ignored
            for sc_rel in cl.findall('rdfs:subClassOf', ns):
                target_research_entity_id = sc_rel.get(resource_attr)
                if target_research_entity_id is not None:
                    relations.append(
                        KBRelation(
                            relation_type='subClassOf',
                            entity_ids=[
                                entity.research_entity_id,
                                target_research_entity_id
                            ],
                            symmetric=False
                        )
                    )
        except AttributeError:
            # best effort: keep whatever was collected for this class
            pass

        # add relations to entity and to kb
        for rel in relations:
            kb.add_relation(rel)
            rel_index = len(kb.relations) - 1
            entity.relation_ids.append(rel_index)

        # add entity to kb
        kb.add_entity(entity)

    return kb
def import_obo_kb(kb_name, kb_filename):
    """
    Create a KnowledgeBase object with entities and relations from an OBO file.

    The file is read with file_util.read_lines and split into per-entity
    chunks at KBLoader.OBO_ENTITY_START_TAG. Within a chunk, 'id', 'name',
    'def' and 'synonym' lines fill the entity; 'is_a' and 'relationship' lines
    become KBRelations; a fixed list of other properties is ignored.

    :param kb_name: name given to the resulting KB
    :param kb_filename: OBO file where KB is located
    :return: the populated KnowledgeBase
    :raises AssertionError: on an unknown OBO property or relation type, on a
        relation line that precedes the entity id, or on a relation line with
        too few fields. Raised explicitly so the checks also run under -O.
    """
    # properties that don't map naturally to the unified schema.
    ignored_prefixes = (
        'intersection_of: ',
        'is_obsolete: ',
        'comment: ',
        'disjoint_from: ',
        'alt_id: ',
        'xref: ',
        'property_value: has_rank',
        'subset: ',
        'xref_analog',
        'xylem',
        'related_synonym',
        'exact_synonym',
        'broad_synonym',
        'narrow_synonym',
        'namespace',
        'consider',
        'replaced_by',
        'union_of',
    )

    # initialize the KB
    kb = KnowledgeBase()
    kb.name = kb_name

    for chunk in KBLoader._chunkify(
        file_util.read_lines(kb_filename), KBLoader.OBO_ENTITY_START_TAG
    ):
        # instantiate an empty entity.
        entity = KBEntity()

        # list of KBRelations to add
        relations = []

        for line in chunk:
            if line.startswith('id: '):
                # research_entity_id
                entity.research_entity_id = line[len('id: '):]
            elif line.startswith('name: '):
                # canonical_name
                entity.canonical_name = line[len('name: '):].replace('_', ' ')
                entity.aliases.append(entity.canonical_name)
            elif line.startswith('def: '):
                # definition: text between the first and last double quote
                start_offset, end_offset = line.index('"') + 1, line.rindex('"')
                entity.definition = line[start_offset:end_offset]
            elif line.startswith('synonym: '):
                # other aliases: text between the first and last double quote
                start_offset, end_offset = line.index('"') + 1, line.rindex('"')
                entity.aliases.append(line[start_offset:end_offset])
            elif line.startswith('is_a: '):
                # is_a relationships
                if not entity.research_entity_id:
                    raise AssertionError('is_a line before entity id: ' + line)
                splits = line.strip().split(' ')
                if len(splits) <= 1:
                    raise AssertionError('malformed is_a line: ' + line)
                target_research_entity_id = splits[1]
                # NOTE(review): is_a is stored as symmetric=True although
                # it reads like a directed relation — confirm downstream
                # code relies on this before changing it.
                relations.append(
                    KBRelation(
                        relation_type='is_a',
                        entity_ids=[
                            entity.research_entity_id,
                            target_research_entity_id
                        ],
                        symmetric=True
                    )
                )
            elif line.startswith('relationship: '):
                # other relationships
                if not entity.research_entity_id:
                    raise AssertionError(
                        'relationship line before entity id: ' + line
                    )
                splits = line.split(' ')
                if len(splits) <= 2:
                    raise AssertionError('malformed relationship line: ' + line)
                relation_type = splits[1]
                target_research_entity_id = splits[2]

                # is the relation symmetric?
                if relation_type in KBLoader.OBO_ASYM_RELATION_SET:
                    symmetric = False
                elif relation_type in KBLoader.OBO_SYM_RELATION_SET:
                    symmetric = True
                else:
                    # unknown relation type. Raise explicitly: a bare
                    # `assert False` is removed under -O, which would leave
                    # `symmetric` unbound or stale from a previous line.
                    logging.info('unknown relation type: ' + relation_type)
                    raise AssertionError(
                        'unknown relation type: ' + relation_type
                    )

                relations.append(
                    KBRelation(
                        relation_type=relation_type,
                        entity_ids=[
                            entity.research_entity_id,
                            target_research_entity_id
                        ],
                        symmetric=symmetric
                    )
                )
            elif line.startswith(ignored_prefixes):
                # properties don't map naturally to the unified schema.
                pass
            else:
                # unknown obo property.
                logging.info('unknown OBO property: ' + line)
                raise AssertionError('unknown OBO property: ' + line)

        # add relations to entity and to kb
        for rel in relations:
            kb.add_relation(rel)
            rel_index = len(kb.relations) - 1
            entity.relation_ids.append(rel_index)

        # add entity to kb
        kb.add_entity(entity)

    return kb