def load_mesh(
        kb_name='mesh',
        path='C:\\Users\\EDMISML\\Desktop\\ont_align_data\\disease_subtrees\\mesh_dis.xml'
):
    """
    Create a KnowledgeBase object from an RDF/XML file of MeSH descriptors

    Each rdf:Description directly under the root becomes one KBEntity:
    meshv:identifier is the id, skos:prefLabel the canonical name,
    skos:altLabel the aliases and every meshv:broaderDescriptor a
    'subClassOf' relation whose target is the last '/'-separated piece of
    its rdf:resource attribute.
    :param kb_name: name given to the returned KnowledgeBase
    :param path: location of the RDF/XML file to read
    :return: the populated KnowledgeBase
    :raises AttributeError: if a description has no meshv:identifier or no
        skos:prefLabel element
    """
    kb = KnowledgeBase()
    kb.name = kb_name

    # parse the file; on a syntax error retry with lxml's huge_tree parser
    try:
        tree = etree.parse(path)
    except etree.XMLSyntaxError:
        p = etree.XMLParser(huge_tree=True)
        tree = etree.parse(path, parser=p)

    root = tree.getroot()
    ns = root.nsmap

    desc_iter = root.findall('rdf:Description', ns)

    # fully qualified name of the rdf:resource attribute, built once
    resource_attr = '{' + ns['rdf'] + '}resource'

    for desc in desc_iter:
        # get mesh id
        entity_id = desc.find('meshv:identifier', ns).text
        entity = KBEntity(entity_id, None, [], '')
        entity.canonical_name = desc.find('skos:prefLabel', ns).text

        try:
            # get alt labels
            for label in desc.findall('skos:altLabel', ns):
                if label.text is not None:
                    entity.aliases.append(label.text)

            relations = []
            for sc_rel in desc.findall('meshv:broaderDescriptor', ns):
                # `get` returns None when rdf:resource is absent; the
                # AttributeError raised by `.split` is handled below and
                # skips the whole description.
                target_research_entity_id = sc_rel.get(
                    resource_attr).split('/')[-1].strip()
                relations.append(
                    KBRelation(relation_type='subClassOf',
                               entity_ids=[
                                   entity.research_entity_id,
                                   target_research_entity_id
                               ],
                               symmetric=False))
        except AttributeError:
            print(f'skipping {entity_id} in load_mesh')
            continue

        # add relations to entity and to kb
        for rel in relations:
            kb.add_relation(rel)
            entity.relation_ids.append(len(kb.relations) - 1)

        # add entity to kb
        kb.add_entity(entity)

    return kb
def _make_mesh_entity(entity_chunk):
    """
    Make a KBEntity from each MeSH chunk

    Lines are expected in "KEY = value" form. Recognised keys:
    UI (id), MH / SH (canonical name, also added as alias),
    ENTRY / PRINT ENTRY (alias, text before the first '|'),
    MS (definition). Other keys and lines without " = " are ignored.
    :param entity_chunk: iterable of lines belonging to one record
    :return: KBEntity filled from the recognised lines
    """
    entity = KBEntity()

    for line in entity_chunk:
        # Split on the first separator only: a value that itself contains
        # " = " must stay intact rather than cause the line to be dropped.
        key, separator, value = line.partition(" = ")
        if not separator:
            continue

        if key == 'UI':
            entity.research_entity_id = value
        elif key in ('MH', 'SH'):
            entity.canonical_name = value
            entity.aliases.append(value)
        elif key in ('ENTRY', 'PRINT ENTRY'):
            entity.aliases.append(value.split("|")[0])
        elif key == 'MS':
            entity.definition = value

    return entity
def import_dbpedia(kb_name, kb_filename, entities_count=0):
    """
    Build a KnowledgeBase from the triples of a dbpedia turtle file

    One entity is added per triple whose subject starts with
    'http://dbpedia.org/resource/'; the rest of the subject (underscores
    turned into spaces) is the canonical name and only alias, and the
    triple's object is the definition.
    :param kb_name: name given to the returned KnowledgeBase
    :param kb_filename: file name containing '.ttl'; it is passed through
        file_util.cache_file before being parsed
    :param entities_count: stop once this many entities were added; a value
        that is not positive means no limit
    :return: the populated KnowledgeBase
    """
    resource_prefix = 'http://dbpedia.org/resource/'

    # initialize the KB.
    kb = KnowledgeBase()
    kb.name = kb_name

    # only the "turtle" format is allowed for this kb.
    assert ('.ttl' in kb_filename)
    kb_filename = file_util.cache_file(kb_filename)

    # parse the turtle file
    graph = rdflib.Graph()
    graph.parse(kb_filename, format='turtle')
    logging.warning('done parsing dbpedia .ttl files.')

    added = 0
    for subj, _pred, obj in graph:
        entity = KBEntity()
        entity.research_entity_id = str(subj)

        # subjects outside the dbpedia resource namespace are ignored
        if not entity.research_entity_id.startswith(resource_prefix):
            continue

        readable_name = entity.research_entity_id[len(resource_prefix):]
        entity.canonical_name = readable_name.replace('_', ' ')
        entity.aliases.append(entity.canonical_name)
        entity.definition = str(obj)

        # verify and add entity to the KB.
        kb.add_entity(entity)
        added += 1

        if entities_count > 0 and added >= entities_count:
            break

    return kb
def create_umls_kbs(self, entities):
    """
    Build and write one KnowledgeBase per name in constants.TRAINING_KBS

    Each KB is dumped as 'kb-<name>.json' inside self.OUTPUT_KB_DIR and then
    handed to self.add_context_to_kb.
    :param entities: dict keyed by KB name; each value maps an entity id to a
        dict with 'research_entity_id', 'canonical_name', 'aliases',
        'definition' (strings joined with a space) and 'relations'
        (4-tuples of ent1_id, ent2_id, rel_type, symmetric, where each id is
        indexed at positions 0 and 1)
    :return: None
    """
    for kb_name in constants.TRAINING_KBS:
        sys.stdout.write("\tCreating KB %s\n" % kb_name)

        kb = KnowledgeBase()
        kb.name = kb_name

        for record in entities[kb_name].values():
            definition_text = ' '.join(record['definition'])
            new_ent = KBEntity(record['research_entity_id'],
                               record['canonical_name'],
                               record['aliases'],
                               definition_text)

            for source_id, target_id, rel_type, symmetric in record['relations']:
                endpoints = [
                    '{}:{}'.format(source_id[0], source_id[1]),
                    '{}:{}'.format(target_id[0], target_id[1]),
                ]
                kb.add_relation(KBRelation(rel_type, endpoints, symmetric))
                # the relation just added sits at the end of kb.relations
                new_ent.relation_ids.append(len(kb.relations) - 1)

            kb.add_entity(new_ent)

        # write plain KB to json
        out_path = os.path.join(self.OUTPUT_KB_DIR,
                                'kb-{}.json'.format(kb_name))
        kb.dump(kb, out_path)

        # add context to kb and write to file
        self.add_context_to_kb(kb)
def load_nci(
        kb_name='nci',
        path="C:\\Users\\EDMISML\\Desktop\\ont_align_data\\disease_subtrees\\nci_dis_subset.rdf"
):
    """
    Create a KnowledgeBase object from an RDF/XML file of NCI concepts

    Each rdf:Description directly under the root becomes one KBEntity:
    ns1:NHC0 is the id, rdfs:label the canonical name, ns1:P97 the
    definition, ns1:P90 the aliases and every rdfs:subClassOf carrying an
    rdf:resource attribute a 'subClassOf' relation whose target is the text
    after the last '#'.
    :param kb_name: name given to the returned KnowledgeBase
    :param path: location of the RDF/XML file to read
    :return: the populated KnowledgeBase
    :raises AttributeError: if a description has no (or an empty) ns1:NHC0
        element, or no rdfs:label element
    """
    # initialize the KB
    kb = KnowledgeBase()
    kb.name = kb_name

    # parse the file; on a syntax error retry with lxml's huge_tree parser
    try:
        tree = etree.parse(path)
    except etree.XMLSyntaxError:
        p = etree.XMLParser(huge_tree=True)
        tree = etree.parse(path, parser=p)

    root = tree.getroot()
    ns = root.nsmap

    # collect the descriptions once and reuse the list for the loop below
    desc_iter = root.findall('rdf:Description', ns)
    print('LEN OF RESOURCES', len(desc_iter))

    # fully qualified name of the rdf:resource attribute, built once
    resource_attr = '{' + ns['rdf'] + '}resource'

    for desc in desc_iter:
        # get nci id
        entity_id = str(desc.find('ns1:NHC0', ns).text.strip())
        entity = KBEntity(entity_id, None, [], '')
        entity.canonical_name = desc.find('rdfs:label', ns).text

        try:
            # get definition
            definition = desc.find('ns1:P97', ns)
            if definition is not None:
                entity.definition = definition.text

            # get alt labels
            for label in desc.findall('ns1:P90', ns):
                if label.text is not None:
                    entity.aliases.append(label.text)

            relations = []
            for sc_rel in desc.findall('rdfs:subClassOf', ns):
                resource = sc_rel.get(resource_attr)
                if resource is None:
                    # no rdf:resource attribute: nothing to link to
                    print(f'skipping element {sc_rel.attrib}')
                    continue
                target_research_entity_id = resource.split('#')[-1].strip()
                relations.append(
                    KBRelation(relation_type='subClassOf',
                               entity_ids=[
                                   entity.research_entity_id,
                                   target_research_entity_id
                               ],
                               symmetric=False))
        except AttributeError:
            print(f'skipping {entity_id} in load_nci')
            continue

        # add relations to entity and to kb
        for rel in relations:
            kb.add_relation(rel)
            entity.relation_ids.append(len(kb.relations) - 1)

        # add entity to kb
        kb.add_entity(entity)

    return kb
def import_owl_kb(kb_name, kb_filename):
    """
    Create a KnowledgeBase object with entities and relations from an OWL file

    Labels found on rdf:Description elements directly under the root are
    first indexed by their rdf:about id. Every owl:Class directly under the
    root that has a non-empty rdf:about id is then turned into a KBEntity
    with labels, definition and 'subClassOf' relations.
    :param kb_name: name given to the returned KnowledgeBase
    :param kb_filename: file to parse; its name must end in '.owl' or '.rdf'
    :return: the populated KnowledgeBase
    """
    assert kb_filename.endswith('.owl') or kb_filename.endswith('.rdf')

    # Tags are read in exactly this order. The first label collected for a
    # class becomes its canonical name, so the order is significant.
    skos_label_tags = ('skos:prefLabel', 'skos:altLabel', 'skos:hiddenLabel')
    obo_synonym_tags = (
        'oboInOwl:hasExactSynonym',
        'oboInOwl:hasRelatedSynonym',
        'oboInOwl:hasNarrowSynonym',
        'oboInOwl:hasBroadSynonym',
    )

    # initialize the KB
    kb = KnowledgeBase()
    kb.name = kb_name

    # parse the file; on a syntax error retry with lxml's huge_tree parser
    try:
        tree = etree.parse(kb_filename)
    except etree.XMLSyntaxError:
        p = etree.XMLParser(huge_tree=True)
        tree = etree.parse(kb_filename, parser=p)

    root = tree.getroot()
    ns = root.nsmap

    # remove the unprefixed namespace entry before the map is used for lookups
    if None in ns:
        del ns[None]

    description_nodes = root.findall('rdf:Description', ns)

    # fully qualified rdf attribute names, built once
    about_attr = '{' + ns['rdf'] + '}about'
    resource_attr = '{' + ns['rdf'] + '}resource'

    # resource id -> list of label strings
    descriptions = dict()

    def get_label(elem):
        # use the element's own text, else the first label of the
        # description its rdf:resource attribute points to
        if elem.text is not None:
            return elem.text
        r_id = elem.get(resource_attr)
        if r_id in descriptions:
            return descriptions[r_id][0]
        return None

    def resolved_labels(node, tags):
        # labels of all children of `node` matching `tags`, in tag order
        found = []
        for tag in tags:
            for elem in node.findall(tag, ns):
                l_text = get_label(elem)
                if l_text is not None:
                    found.append(l_text)
        return found

    # get description dict
    description_tags = ['rdfs:label']
    if 'skos' in ns:
        description_tags.append('skos:prefLabel')
    if 'oboInOwl' in ns:
        description_tags.extend(obo_synonym_tags)

    for desc in description_nodes:
        resource_id = desc.get(about_attr)
        try:
            # only literal text is taken here, never a resolved reference
            labels = [
                elem.text
                for tag in description_tags
                for elem in desc.findall(tag, ns)
                if elem.text is not None
            ]
            if labels:
                descriptions[resource_id] = labels
        except AttributeError:
            continue

    # tags consulted for every class, depending on the declared prefixes
    class_label_tags = []
    if 'skos' in ns:
        class_label_tags.extend(skos_label_tags)
    if 'oboInOwl' in ns:
        class_label_tags.extend(obo_synonym_tags)

    definition_tags = []
    if 'skos' in ns:
        definition_tags.append('skos:definition')
    if 'obo' in ns:
        definition_tags.append('obo:IAO_0000115')

    # parse OWL classes
    for cl in root.findall('owl:Class', ns):
        # instantiate an entity.
        research_entity_id = cl.get(about_attr)
        entity = KBEntity(research_entity_id, None, [], '')

        # classes without an rdf:about id are not added to the KB
        if entity.research_entity_id is None or entity.research_entity_id == '':
            continue

        # list of KBRelations to add
        relations = []

        try:
            # rdfs labels, then labels from the description, then skos/obo
            labels = resolved_labels(cl, ['rdfs:label'])
            if entity.research_entity_id in descriptions:
                labels += descriptions[entity.research_entity_id]
            labels += resolved_labels(cl, class_label_tags)

            # set canonical_name and aliases
            if labels:
                entity.canonical_name = labels[0]
                # de-duplicate lower-cased labels, keeping first-seen order
                # so the alias list is reproducible between runs
                entity.aliases = list(
                    dict.fromkeys(lab.lower() for lab in labels))

            # if no name available (usually entity from external KB), replace name with id
            if entity.canonical_name is None:
                entity.canonical_name = entity.research_entity_id

            # get definition
            for tag in definition_tags:
                for definition in cl.findall(tag, ns):
                    if definition.text is not None:
                        entity.definition += definition.text.lower() + ' '
            entity.definition = entity.definition.strip()

            # get subclass relations; elements without rdf:resource are ignored
            for sc_rel in cl.findall('rdfs:subClassOf', ns):
                target_research_entity_id = sc_rel.get(resource_attr)
                if target_research_entity_id is not None:
                    relations.append(
                        KBRelation(relation_type='subClassOf',
                                   entity_ids=[
                                       entity.research_entity_id,
                                       target_research_entity_id
                                   ],
                                   symmetric=False))
        except AttributeError:
            # NOTE(review): swallowed silently; the entity is still added
            # below with whatever was collected before the error.
            pass

        # add relations to entity and to kb
        for rel in relations:
            kb.add_relation(rel)
            entity.relation_ids.append(len(kb.relations) - 1)

        # add entity to kb
        kb.add_entity(entity)

    return kb
def import_obo_kb(kb_name, kb_filename):
    """
    Create a KnowledgeBase object with entities and relations from an OBO file

    The file's lines are grouped into chunks by KBLoader._chunkify using
    KBLoader.OBO_ENTITY_START_TAG; every chunk yields one KBEntity.
    :param kb_name: name given to the returned KnowledgeBase
    :param kb_filename: OBO file where KB is located
    :return: the populated KnowledgeBase
    :raises AssertionError: on a relationship type found in neither
        KBLoader.OBO_ASYM_RELATION_SET nor KBLoader.OBO_SYM_RELATION_SET, on
        an unrecognised OBO property, or (unless asserts are disabled) on a
        relation line that precedes the entity id or has too few fields
    """
    # properties that don't map naturally to the unified schema.
    ignored_prefixes = (
        'intersection_of: ',
        'is_obsolete: ',
        'comment: ',
        'disjoint_from: ',
        'alt_id: ',
        'xref: ',
        'property_value: has_rank',
        'subset: ',
        'xref_analog',
        'xylem',
        'related_synonym',
        'exact_synonym',
        'broad_synonym',
        'narrow_synonym',
        'namespace',
        'consider',
        'replaced_by',
        'union_of',
    )

    # initialize the KB
    kb = KnowledgeBase()
    kb.name = kb_name

    for chunk in KBLoader._chunkify(file_util.read_lines(kb_filename),
                                    KBLoader.OBO_ENTITY_START_TAG):
        # instantiate an empty entity.
        entity = KBEntity()

        # list of KBRelations to add
        relations = []

        for line in chunk:
            if line.startswith('id: '):
                # research_entity_id
                entity.research_entity_id = line[len('id: '):]
            elif line.startswith('name: '):
                # canonical_name
                entity.canonical_name = line[len('name: '):].replace('_', ' ')
                entity.aliases.append(entity.canonical_name)
            elif line.startswith('def: '):
                # definition: text between the first and the last quote
                start_offset, end_offset = line.index('"') + 1, line.rindex('"')
                entity.definition = line[start_offset:end_offset]
            elif line.startswith('synonym: '):
                # other aliases: text between the first and the last quote
                start_offset, end_offset = line.index('"') + 1, line.rindex('"')
                entity.aliases.append(line[start_offset:end_offset])
            elif line.startswith('is_a: '):
                # is_a relationships
                assert entity.research_entity_id
                splits = line.strip().split(' ')
                assert (len(splits) > 1)
                target_research_entity_id = splits[1]
                # NOTE(review): is_a is stored as symmetric although the
                # relation reads as directional - confirm this is intended.
                relations.append(
                    KBRelation(relation_type='is_a',
                               entity_ids=[
                                   entity.research_entity_id,
                                   target_research_entity_id
                               ],
                               symmetric=True))
            elif line.startswith('relationship: '):
                # other relationships
                assert entity.research_entity_id
                # strip first (as for is_a) so the target id cannot carry
                # trailing whitespace
                splits = line.strip().split(' ')
                assert (len(splits) > 2)
                relation_type = splits[1]
                target_research_entity_id = splits[2]

                # is the relation symmetric?
                if relation_type in KBLoader.OBO_ASYM_RELATION_SET:
                    symmetric = False
                elif relation_type in KBLoader.OBO_SYM_RELATION_SET:
                    symmetric = True
                else:
                    # unknown relation type
                    logging.info('unknown relation type: ' + relation_type)
                    # raised explicitly: a bare `assert False` disappears
                    # under `python -O`, leaving `symmetric` stale or unbound
                    raise AssertionError(
                        'unknown relation type: ' + relation_type)

                relations.append(
                    KBRelation(relation_type=relation_type,
                               entity_ids=[
                                   entity.research_entity_id,
                                   target_research_entity_id
                               ],
                               symmetric=symmetric))
            elif line.startswith(ignored_prefixes):
                # properties don't map naturally to the unified schema.
                pass
            else:
                # unknown obo property.
                logging.info('unknown OBO property: ' + line)
                raise AssertionError('unknown OBO property: ' + line)

        # add relations to entity and to kb
        for rel in relations:
            kb.add_relation(rel)
            entity.relation_ids.append(len(kb.relations) - 1)

        # add entity to kb
        kb.add_entity(entity)

    return kb