Пример #1
0
    def load_kb(kb_path) -> KnowledgeBase:
        """
        Load the KnowledgeBase found at kb_path.

        The loader is picked from the file extension. A path without a
        recognised extension is treated as a URL: it is validated, downloaded
        to a temporary file and parsed as OWL.

        :param kb_path: path (or URL) of the knowledge base
        :return: the loaded KnowledgeBase
        :raises NotImplementedError: for .ttl and .n3 files
        :raises ValidationError: if kb_path has no known extension and is not
            a valid URL
        """
        # Imported locally so the block does not rely on a module-level import
        # that is not visible from here.
        import tempfile

        sys.stdout.write("\tLoading %s...\n" % kb_path)

        assert kb_path is not None
        assert kb_path != ''

        kb_name = os.path.basename(kb_path)

        kb = KnowledgeBase()

        # dispatch on the file extension
        if kb_path.endswith(('.json', '.pickle', '.pkl')):
            kb = kb.load(kb_path)
        elif kb_path.endswith(('.obo', '.OBO')):
            kb = KBLoader.import_obo_kb(kb_name, kb_path)
        elif kb_path.endswith(('.owl', '.rdf', '.OWL', '.RDF')):
            kb = KBLoader.import_owl_kb(kb_name, kb_path)
        elif kb_path.endswith('.msh'):
            kb = KBLoader.load_mesh(kb_name, kb_path)
        elif kb_path.endswith('.nci'):
            kb = KBLoader.load_nci(kb_name, kb_path)
        elif kb_path.endswith(('.ttl', '.n3')):
            sys.stdout.write('This program cannot parse your file type.\n')
            raise NotImplementedError()
        else:
            # No known extension: the path must be a URL. The validator raises
            # ValidationError itself, so no try/except wrapper is needed.
            URLValidator()(kb_path)

            response = requests.get(kb_path, stream=True)
            response.raise_for_status()

            # Download to a uniquely named file instead of a fixed name in the
            # working directory, so concurrent runs cannot clobber each other.
            # The '.owl' suffix is kept so the OWL importer accepts the file.
            download = tempfile.NamedTemporaryFile(suffix='.owl', delete=False)
            temp_file = download.name
            try:
                with download:
                    for block in response.iter_content(1024):
                        download.write(block)
                kb = KBLoader.import_owl_kb('', temp_file)
            finally:
                # Always clean up, even when the download or the parse fails.
                os.remove(temp_file)

        sys.stdout.write("\tEntities: %i\n" % len(kb.entities))

        return kb
Пример #2
0
    def query_all_kb(self, kb: KnowledgeBase):
        """
        Enrich every entity of the KB with synonyms and a definition, then
        dump the KB to self.out_path.

        For each entity the synonym enricher is queried with its aliases and
        the wiki enricher with its canonical name. The results are stored in
        additional_details; the wiki definition replaces the entity's own
        definition only when the latter is shorter than 5 characters.

        :param kb: knowledge base whose entities are enriched in place
        :return: None
        """
        progress = tqdm.tqdm(kb.entities, total=len(kb.entities))
        for entity in progress:
            mesh_synonyms, dbpedia_synonyms = \
                self.syn_enricher.get_synonyms_to_entity(entity.aliases)
            wiki_entities, wiki_definition = \
                self.wiki_enricher.get_definition_to_entity(
                    entity.canonical_name)

            entity.additional_details['mesh_synonyms'] = mesh_synonyms
            entity.additional_details['dbpedia_synonyms'] = dbpedia_synonyms
            entity.additional_details['wiki_entities'] = wiki_entities

            # keep an existing definition unless it is (almost) empty
            if len(entity.definition) < 5:
                entity.definition = wiki_definition

        kb.dump(kb, self.out_path)
Пример #3
0
    def load_mesh(
        kb_name='mesh',
        path='C:\\Users\\EDMISML\\Desktop\\ont_align_data\\disease_subtrees\\mesh_dis.xml'
    ):
        """
        Build a KnowledgeBase from the rdf:Description elements of a MeSH
        RDF/XML file.

        Each description becomes one KBEntity (id from meshv:identifier, name
        from skos:prefLabel, aliases from skos:altLabel) and one 'subClassOf'
        relation per meshv:broaderDescriptor.

        :param kb_name: name given to the resulting KB
        :param path: path of the XML file to parse
        :return: the populated KnowledgeBase
        """
        mesh_kb = KnowledgeBase()
        mesh_kb.name = kb_name

        # parse the file; on a syntax error retry with a huge_tree parser
        try:
            tree = etree.parse(path)
        except etree.XMLSyntaxError:
            tree = etree.parse(path, parser=etree.XMLParser(huge_tree=True))

        root = tree.getroot()
        ns = root.nsmap

        for desc in root.findall('rdf:Description', ns):
            # get mesh id and preferred label (not guarded: a description
            # without either element raises AttributeError to the caller)
            entity_id = desc.find('meshv:identifier', ns).text
            entity = KBEntity(entity_id, None, [], '')
            entity.canonical_name = desc.find('skos:prefLabel', ns).text

            try:
                # alternative labels become aliases
                for alt_label in desc.findall('skos:altLabel', ns):
                    alt_text = alt_label.text
                    if alt_text is not None:
                        entity.aliases.append(alt_text)

                # broader descriptors become subClassOf relations
                pending_relations = []
                for broader in desc.findall('meshv:broaderDescriptor', ns):
                    # ns is passed as the default value, so a missing
                    # rdf:resource attribute yields a non-string whose
                    # .split fails and the whole entity is skipped below
                    resource = broader.get('{' + ns['rdf'] + '}resource', ns)
                    parent_id = resource.split('/')[-1].strip()
                    if isinstance(parent_id, str):
                        pending_relations.append(
                            KBRelation(relation_type='subClassOf',
                                       entity_ids=[
                                           entity.research_entity_id,
                                           parent_id
                                       ],
                                       symmetric=False))
            except AttributeError:
                print(f'skipping {entity_id} in load_mesh')
                continue

            # register relations; the entity records their index in the kb
            for relation in pending_relations:
                mesh_kb.add_relation(relation)
                entity.relation_ids.append(len(mesh_kb.relations) - 1)

            mesh_kb.add_entity(entity)

        return mesh_kb
Пример #4
0
    def import_mesh(name, mesh_filename):
        """
        Create a KnowledgeBase object with entities from a MeSH file.

        The file is read line by line and cut into per-entity chunks; every
        chunk of "KEY = value" lines is turned into one KBEntity.

        :param name: name given to the resulting KB
        :param mesh_filename: path of the MeSH file
        :return: the populated KnowledgeBase
        """
        # initialize the KB
        kb = KnowledgeBase()
        kb.name = name

        def _make_mesh_entity(entity_chunk):
            """
            Make a KBEntity from one MeSH chunk.

            :param entity_chunk: iterable of "KEY = value" lines
            :return: the KBEntity built from the recognised keys
            """
            entity = KBEntity()
            for line in entity_chunk:
                # Split on the FIRST " = " only. Splitting on every occurrence
                # discarded any field whose value itself contains " = ".
                key, separator, value = line.partition(" = ")
                if not separator:
                    # not a "KEY = value" line
                    continue
                if key == 'UI':
                    entity.research_entity_id = value
                elif key in ('MH', 'SH'):
                    # the heading is both the canonical name and an alias
                    entity.canonical_name = value
                    entity.aliases.append(value)
                elif key in ('ENTRY', 'PRINT ENTRY'):
                    # only the text before the first "|" is the alias
                    entity.aliases.append(value.split("|")[0])
                elif key == 'MS':
                    entity.definition = value
            return entity

        for chunk in KBLoader._chunkify(file_util.read_lines(mesh_filename),
                                        KBLoader.MESH_ENTITY_START_TAG):
            kb.add_entity(_make_mesh_entity(chunk))
        return kb
Пример #5
0
    def import_dbpedia(kb_name, kb_filename, entities_count=0):
        """
        Build a KnowledgeBase from a dbpedia turtle (.ttl) file.

        Every triple whose subject is a dbpedia resource URI yields one
        entity: the URI is the id, the URI tail (underscores replaced by
        spaces) is the canonical name and only alias, and the triple's object
        is the definition.

        :param kb_name: name given to the resulting KB
        :param kb_filename: turtle file; must contain '.ttl' in its name
        :param entities_count: stop after this many entities when positive;
            0 (the default) means no limit
        :return: the populated KnowledgeBase
        """
        dbpedia_kb = KnowledgeBase()
        dbpedia_kb.name = kb_name

        # only the "turtle" format is allowed for this kb.
        assert ('.ttl' in kb_filename)
        kb_filename = file_util.cache_file(kb_filename)

        # parse the turtle file
        graph = rdflib.Graph()
        graph.parse(kb_filename, format='turtle')
        logging.warning('done parsing dbpedia .ttl files.')

        resource_prefix = 'http://dbpedia.org/resource/'
        added = 0
        for subject, _predicate, obj in graph:
            entity = KBEntity()
            entity.research_entity_id = str(subject)
            if not entity.research_entity_id.startswith(resource_prefix):
                continue

            readable_name = entity.research_entity_id[len(resource_prefix):]
            entity.canonical_name = readable_name.replace('_', ' ')
            entity.aliases.append(entity.canonical_name)
            entity.definition = str(obj)

            dbpedia_kb.add_entity(entity)
            added += 1
            # a non-positive entities_count disables the limit
            if entities_count > 0 and added >= entities_count:
                break
        return dbpedia_kb
    def create_umls_kbs(self, entities):
        """
        Build one KnowledgeBase per name in constants.TRAINING_KBS, dump it as
        'kb-<name>.json' under self.OUTPUT_KB_DIR, then pass it to
        self.add_context_to_kb.

        :param entities: dict keyed by KB name; each value is a dict of entity
            records with 'research_entity_id', 'canonical_name', 'aliases',
            'definition' and 'relations' entries
        :return: None
        """
        for kb_name in constants.TRAINING_KBS:
            sys.stdout.write("\tCreating KB %s\n" % kb_name)
            umls_kb = KnowledgeBase()
            umls_kb.name = kb_name

            for record in entities[kb_name].values():
                definition_text = ' '.join(record['definition'])
                kb_entity = KBEntity(record['research_entity_id'],
                                     record['canonical_name'],
                                     record['aliases'],
                                     definition_text)

                for head, tail, rel_type, symmetric in record['relations']:
                    # each endpoint is indexable; its first two items are
                    # joined with ':' to form the id used in the relation
                    endpoints = [
                        '{}:{}'.format(head[0], head[1]),
                        '{}:{}'.format(tail[0], tail[1]),
                    ]
                    umls_kb.add_relation(
                        KBRelation(rel_type, endpoints, symmetric))
                    # the entity stores the index of the relation in the kb
                    kb_entity.relation_ids.append(len(umls_kb.relations) - 1)

                umls_kb.add_entity(kb_entity)

            # write plain KB to json
            json_name = 'kb-{}.json'.format(kb_name)
            umls_kb.dump(umls_kb, os.path.join(self.OUTPUT_KB_DIR, json_name))

            # add context to kb and write to file
            self.add_context_to_kb(umls_kb)
Пример #7
0
    def import_kb(kb_name, kb_filename):
        """
        Return a KnowledgeBase object loaded from kb_filename. The KB name
        must be one of the supported ones handled below.

        When kb_filename starts with 's3' it is first cached locally; that
        local copy is removed again before returning, whether or not the
        import succeeded.

        :param kb_name: one of the KBLoader KB name constants
        :param kb_filename: path of the raw KB file (local or s3)
        :return: the imported KnowledgeBase
        :raises LookupError: if kb_name is not a supported KB
        """
        # if needed, copy the file locally and update kb_filename.
        delete_local_copy = False
        if kb_filename.startswith('s3'):
            delete_local_copy = True
            kb_filename = file_util.cache_file(kb_filename)

        kb = None
        try:
            if kb_name in {
                    KBLoader.SEQUENCE_ONTOLOGY, KBLoader.NCBI_TAXONOMY,
                    KBLoader.CHEBI_TAXONOMY, KBLoader.GO_TAXONOMY,
                    KBLoader.PR_TAXONOMY, KBLoader.CL_TAXONOMY,
                    KBLoader.UNK_OBO_TAXONOMY
            }:
                kb = KBLoader.import_obo_kb(kb_name, kb_filename)
            elif kb_name == KBLoader.MESH_TAXONOMY:
                kb = KBLoader.import_mesh(kb_name, kb_filename)
            elif kb_name == KBLoader.DBPEDIA:
                kb = KBLoader.import_dbpedia(kb_name, kb_filename)
            elif kb_name == KBLoader.MERGED:
                kb = KnowledgeBase.load(kb_filename)
            else:
                raise LookupError("Unknown kb_name: {}".format(kb_name))
        finally:
            # remove the local copy of the raw kb file(s), even when the
            # import failed, so a cached download is never left behind.
            if delete_local_copy:
                os.remove(kb_filename)

        # return the imported kb.
        assert (kb is not None)
        return kb
    def extract_negative_mappings(self):
        """
        Sample negative pairings for every KB pair in self.umls_training_data.

        For each pair, both KBs are loaded from self.OUTPUT_KB_DIR and
        negatives are sampled with self.sample_negative_mappings. When any
        negatives are found, positives plus negatives are written to
        '<kb1>-<kb2>.tsv' under the 'training' directory of self.OUTPUT_DIR
        and that path is appended to self.done_file.

        :return: None
        """
        for kb_names, positives in self.umls_training_data.items():
            source_name, target_name = kb_names[0], kb_names[1]

            # locations of the two KB dumps and of the training file
            source_path = os.path.join(self.OUTPUT_KB_DIR,
                                       'kb-{}.json'.format(source_name))
            target_path = os.path.join(self.OUTPUT_KB_DIR,
                                       'kb-{}.json'.format(target_name))
            training_path = os.path.join(
                self.OUTPUT_DIR, 'training',
                '{}-{}.tsv'.format(source_name, target_name))

            # initialize KBs
            source_kb = KnowledgeBase()
            target_kb = KnowledgeBase()

            # load KBs
            sys.stdout.write("\tLoading %s and %s\n" % kb_names)
            source_kb = source_kb.load(source_path)
            target_kb = target_kb.load(target_path)

            # sample negatives using candidate selection module
            sys.stdout.write("\t\tSampling negatives between %s and %s\n" %
                             kb_names)
            negatives = self.sample_negative_mappings(source_kb, target_kb,
                                                      positives)

            # nothing is written for a pair that yields no negatives
            if not negatives:
                continue

            # write positive and negative training mappings to disk
            self.write_mapping_to_file(training_path, positives + negatives)

            # append kb pair to done file
            with open(self.done_file, 'a') as done_log:
                done_log.write('%s\n' % training_path)
Пример #9
0
    def load_nci(
        kb_name='nci',
        path="C:\\Users\\EDMISML\\Desktop\\ont_align_data\\disease_subtrees\\nci_dis_subset.rdf"
    ):
        """
        Build a KnowledgeBase from the rdf:Description elements of an NCI
        RDF/XML file.

        Each description becomes one KBEntity (id from ns1:NHC0, name from
        rdfs:label, definition from ns1:P97, aliases from ns1:P90) and one
        'subClassOf' relation per rdfs:subClassOf that carries a resource.

        :param kb_name: name given to the resulting KB
        :param path: path of the RDF file to parse
        :return: the populated KnowledgeBase
        """
        # initialize the KB
        nci_kb = KnowledgeBase()
        nci_kb.name = kb_name

        # parse the file; on a syntax error retry with a huge_tree parser
        try:
            tree = etree.parse(path)
        except etree.XMLSyntaxError:
            tree = etree.parse(path, parser=etree.XMLParser(huge_tree=True))

        root = tree.getroot()
        ns = root.nsmap

        descriptions = root.findall('rdf:Description', ns)
        print('LEN OF RESOURCES', len(descriptions))

        for desc in descriptions:
            # get nci id and label (not guarded: a description without either
            # element raises AttributeError to the caller)
            entity_id = str(desc.find('ns1:NHC0', ns).text.strip())
            entity = KBEntity(entity_id, None, [], '')
            entity.canonical_name = desc.find('rdfs:label', ns).text

            try:
                # get definition
                definition_node = desc.find('ns1:P97', ns)
                if definition_node is not None:
                    entity.definition = definition_node.text

                # get alt labels
                for alt_label in desc.findall('ns1:P90', ns):
                    alt_text = alt_label.text
                    if alt_text is not None:
                        entity.aliases.append(alt_text)

                # superclass links become subClassOf relations
                pending_relations = []
                for parent_link in desc.findall('rdfs:subClassOf', ns):
                    try:
                        # ns is passed as the default value, so a link without
                        # rdf:resource yields a non-string whose .split fails;
                        # only that link is skipped
                        resource = parent_link.get(
                            '{' + ns['rdf'] + '}resource', ns)
                        parent_id = str(resource.split('#')[-1].strip())
                    except AttributeError:
                        print(f'skipping element {parent_link.attrib}')
                        continue
                    if isinstance(parent_id, str):
                        pending_relations.append(
                            KBRelation(relation_type='subClassOf',
                                       entity_ids=[
                                           entity.research_entity_id,
                                           parent_id
                                       ],
                                       symmetric=False))
            except AttributeError:
                print(f'skipping {entity_id} in load_nci')
                continue

            # register relations; the entity records their index in the kb
            for relation in pending_relations:
                nci_kb.add_relation(relation)
                entity.relation_ids.append(len(nci_kb.relations) - 1)

            # add entity to kb
            nci_kb.add_entity(entity)

        return nci_kb
Пример #10
0
    def import_owl_kb(kb_name, kb_filename):
        """
        Create a KnowledgeBase with entities and relations from an OWL file.

        Labels found on rdf:Description elements are indexed by resource id
        first. Every owl:Class with a non-empty rdf:about then becomes one
        entity whose labels come, in this order, from rdfs:label, the indexed
        description, the skos labels and the oboInOwl synonyms. The first
        label is the canonical name; the lower-cased label set forms the
        aliases. rdfs:subClassOf links become 'subClassOf' relations.

        :param kb_name: name given to the resulting KB
        :param kb_filename: file to parse; must end with '.owl' or '.rdf'
        :return: the populated KnowledgeBase
        """
        assert kb_filename.endswith('.owl') or kb_filename.endswith('.rdf')

        # initialize the KB
        owl_kb = KnowledgeBase()
        owl_kb.name = kb_name

        # parse the file; on a syntax error retry with a huge_tree parser
        try:
            tree = etree.parse(kb_filename)
        except etree.XMLSyntaxError:
            tree = etree.parse(kb_filename,
                               parser=etree.XMLParser(huge_tree=True))

        root = tree.getroot()
        ns = root.nsmap

        # drop the default (un-prefixed) namespace entry
        if None in ns:
            del ns[None]

        # resource id -> list of label texts found on rdf:Description nodes
        descriptions = dict()

        synonym_tags = (
            'oboInOwl:hasExactSynonym',
            'oboInOwl:hasRelatedSynonym',
            'oboInOwl:hasNarrowSynonym',
            'oboInOwl:hasBroadSynonym',
        )

        def rdf_attr(local_name):
            # fully qualified name of an attribute in the rdf namespace
            return '{' + ns['rdf'] + '}' + local_name

        def get_label(node):
            # text of the node, else the first indexed label of the resource
            # it points to, else None
            if node.text is not None:
                return node.text
            target = node.get(rdf_attr('resource'))
            if target in descriptions:
                return descriptions[target][0]
            return None

        def resolved_labels(node, tags):
            # labels of all children of node matching tags, in tag order
            found = []
            for tag in tags:
                for child in node.findall(tag, ns):
                    text = get_label(child)
                    if text is not None:
                        found.append(text)
            return found

        # tags read from descriptions; optional vocabularies are only
        # queried when their prefix is declared in the document
        description_tags = ['rdfs:label']
        if 'skos' in ns:
            description_tags.append('skos:prefLabel')
        if 'oboInOwl' in ns:
            description_tags.extend(synonym_tags)

        # get description dict
        for desc in root.findall('rdf:Description', ns):
            resource_id = desc.get(rdf_attr('about'))
            try:
                texts = []
                for tag in description_tags:
                    for child in desc.findall(tag, ns):
                        if child.text is not None:
                            texts.append(child.text)
                if texts:
                    descriptions[resource_id] = texts
            except AttributeError:
                continue

        # tags read from classes after rdfs:label and the description labels
        extra_class_tags = []
        if 'skos' in ns:
            extra_class_tags.extend(
                ['skos:prefLabel', 'skos:altLabel', 'skos:hiddenLabel'])
        if 'oboInOwl' in ns:
            extra_class_tags.extend(synonym_tags)

        definition_sources = (('skos', 'skos:definition'),
                              ('obo', 'obo:IAO_0000115'))

        # parse OWL classes
        for cl in root.findall('owl:Class', ns):
            # instantiate an entity.
            entity = KBEntity(cl.get(rdf_attr('about')), None, [], '')

            # classes without an id are ignored entirely
            if entity.research_entity_id is None \
                    or entity.research_entity_id == '':
                continue

            # list of KBRelations to add
            pending_relations = []

            try:
                # rdfs labels first, then labels from the description index,
                # then skos labels and synonyms
                labels = resolved_labels(cl, ['rdfs:label'])
                if entity.research_entity_id in descriptions:
                    labels += descriptions[entity.research_entity_id]
                labels += resolved_labels(cl, extra_class_tags)

                # set canonical_name and aliases
                if labels:
                    entity.canonical_name = labels[0]
                    entity.aliases = list({lab.lower() for lab in labels})

                # if no name available (usually entity from external KB),
                # replace name with id
                if entity.canonical_name is None:
                    entity.canonical_name = entity.research_entity_id

                # get definition: lower-cased texts joined by single spaces
                for prefix, tag in definition_sources:
                    if prefix in ns:
                        for node in cl.findall(tag, ns):
                            if node.text is not None:
                                entity.definition += node.text.lower() + ' '
                entity.definition = entity.definition.strip()

                # get subclass relations
                for parent_link in cl.findall('rdfs:subClassOf', ns):
                    # ns is passed as the default value, so links without
                    # rdf:resource yield a non-string and are filtered out
                    parent_id = parent_link.get(rdf_attr('resource'), ns)
                    if isinstance(parent_id, str):
                        pending_relations.append(
                            KBRelation(relation_type='subClassOf',
                                       entity_ids=[
                                           entity.research_entity_id,
                                           parent_id
                                       ],
                                       symmetric=False))
            except AttributeError:
                # best effort: the entity is still added with whatever was
                # populated before the error
                pass

            # add relations to entity and to kb
            for relation in pending_relations:
                owl_kb.add_relation(relation)
                entity.relation_ids.append(len(owl_kb.relations) - 1)

            # add entity to kb
            owl_kb.add_entity(entity)

        return owl_kb
Пример #11
0
    def import_obo_kb(kb_name, kb_filename):
        """
        Create a KnowledgeBase with entities and relations from an OBO file.

        The file is cut into per-entity chunks; each chunk yields one
        KBEntity. 'is_a' and 'relationship' lines become KBRelations. A line
        with an unrecognised property, or a relationship of unknown type, is
        logged and fails an assertion.

        :param kb_name: name given to the resulting KB
        :param kb_filename: OBO file where KB is located
        :return: the populated KnowledgeBase
        """
        # properties that don't map naturally to the unified schema.
        ignored_prefixes = (
            'intersection_of: ',
            'is_obsolete: ',
            'comment: ',
            'disjoint_from: ',
            'alt_id: ',
            'xref: ',
            'property_value: has_rank',
            'subset: ',
            'xref_analog',
            'xylem',
            'related_synonym',
            'exact_synonym',
            'broad_synonym',
            'narrow_synonym',
            'namespace',
            'consider',
            'replaced_by',
            'union_of',
        )

        # initialize the KB
        obo_kb = KnowledgeBase()
        obo_kb.name = kb_name

        chunks = KBLoader._chunkify(file_util.read_lines(kb_filename),
                                    KBLoader.OBO_ENTITY_START_TAG)
        for chunk in chunks:
            # instantiate an empty entity.
            entity = KBEntity()

            # list of KBRelations to add
            pending_relations = []

            for line in chunk:
                if line.startswith('id: '):
                    # research_entity_id
                    entity.research_entity_id = line[len('id: '):]
                elif line.startswith('name: '):
                    # canonical_name, which is also the first alias
                    raw_name = line[len('name: '):]
                    entity.canonical_name = raw_name.replace('_', ' ')
                    entity.aliases.append(entity.canonical_name)
                elif line.startswith('def: '):
                    # definition: text between the first and last quote
                    entity.definition = line[line.index('"') +
                                             1:line.rindex('"')]
                elif line.startswith('synonym: '):
                    # other aliases: text between the first and last quote
                    entity.aliases.append(line[line.index('"') +
                                               1:line.rindex('"')])
                elif line.startswith('is_a: '):
                    # is_a relationships
                    assert entity.research_entity_id
                    tokens = line.strip().split(' ')
                    assert (len(tokens) > 1)
                    pending_relations.append(
                        KBRelation(relation_type='is_a',
                                   entity_ids=[
                                       entity.research_entity_id,
                                       tokens[1]
                                   ],
                                   symmetric=True))
                elif line.startswith('relationship: '):
                    # other relationships
                    assert entity.research_entity_id
                    tokens = line.split(' ')
                    assert (len(tokens) > 2)
                    relation_type = tokens[1]
                    target_id = tokens[2]
                    # is the relation symmetric?
                    if relation_type in KBLoader.OBO_ASYM_RELATION_SET:
                        symmetric = False
                    elif relation_type in KBLoader.OBO_SYM_RELATION_SET:
                        symmetric = True
                    else:
                        # unknown relation type
                        logging.info('unknown relation type: ' + relation_type)
                        assert False
                    pending_relations.append(
                        KBRelation(relation_type=relation_type,
                                   entity_ids=[
                                       entity.research_entity_id,
                                       target_id
                                   ],
                                   symmetric=symmetric))
                elif line.startswith(ignored_prefixes):
                    continue
                else:
                    # unknown obo property.
                    logging.info('unknown OBO property: ' + line)
                    assert False

            # add relations to entity and to kb
            for relation in pending_relations:
                obo_kb.add_relation(relation)
                entity.relation_ids.append(len(obo_kb.relations) - 1)

            # add entity to kb
            obo_kb.add_entity(entity)

        return obo_kb