Example #1
# imports inferred from usage of this snippet
import rdflib

from pyontutils.config import auth
from pyontutils.core import OntGraph, OntResIri, OntResPath
from pyontutils.closed_namespaces import rdf, rdfs, owl


def main():
    olr = auth.get_path('ontology-local-repo')
    ori = OntResIri('http://purl.obolibrary.org/obo/doid.owl')
    orp = OntResPath(olr / 'ttl/external/doid.owl')  # local copy of the same ontology
    ort = ori  # pick the remote resource; swap in orp to work from the local file
    g = ort.graph
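    # select every class under DOID:4 (disease) along with its direct
    # superclass and label; rdfs:subClassOf* is a SPARQL 1.1 property path
    # matching zero or more subClassOf hops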
    query = """
    SELECT DISTINCT ?s ?o ?l
    WHERE {
        ?s a owl:Class .
        ?s rdfs:subClassOf* <http://purl.obolibrary.org/obo/DOID_4> .
        ?s rdfs:subClassOf ?o .
        ?s rdfs:label ?l .
    }"""
    res = list(g.query(query))
    filt = [r for r in res if not isinstance(r[1], rdflib.BNode)]
    spath = 'ttl/generated/doidslim.ttl'
    go = OntGraph(path=olr / spath)
    # TODO prov record like the one we have for chebi
    go.bind('DOID', 'http://purl.obolibrary.org/obo/DOID_')
    s = rdflib.URIRef('http://ontology.neuinfo.org/NIF/' + spath)
    go.populate_from_triples(
        ((s, p, o) for p, o in
         ((rdf.type, owl.Ontology),
          (rdfs.label, rdflib.Literal("NIF DOID slim")),)))
    ds = rdflib.URIRef('http://purl.obolibrary.org/obo/DOID_4')
    go.add((ds, rdf.type, owl.Class))
    go.add((ds, rdfs.label, rdflib.Literal('disease')))
    go.populate_from_triples(
        (t for s, o, l in filt for t in
         ((s, rdf.type, owl.Class),
          (s, rdfs.subClassOf, o),
          (s, rdfs.label, l))))
    go.write()
Example #2
    def sneechReviewGraph(self,
                          source_graph,
                          namespaces,
                          sneech_file=None,
                          path_out=None):
        # TODO cache
        (already, cannot, maybe, sneeches,
         maybe_sneeches) = self.preSneech(source_graph, namespaces)
        # TODO not entirely sure about the best place to put this ...
        self.reView(source_graph, maybe_sneeches)  # FIXME dump and commit

        review_graph = OntGraph(path=path_out)
        oq.OntCuries.populate(review_graph)
        review_graph.bind('snchn', str(snchn))  # FIXME -> curies probably
        review_graph.bind('sncho', str(sncho))  # FIXME -> curies probably
        review_graph.bind('h', str(sghashes))  # FIXME -> curies probably
        if sneech_file:
            sneech_file.populate(review_graph)

        gen = self.triples_review(already, cannot, maybe, sneeches,
                                  sneech_file)
        [review_graph.add(t) for t in gen]
        # TODO hasReport -> maybe_sneeches report / reView
        # TODO snchn predicate ordering
        return review_graph, maybe_sneeches
Example #3
    def build_instances(self, instances, dids):
        folder = Path(self.config.out_graph_path()).parent
        # WOW do I need to implement the new/better way of
        # managing writing collections of neurons to graphs
        neuron_uri = next(NeuronACT.out_graph[:rdf.type:owl.Ontology])
        name = 'allen-cell-instances.ttl'
        base, _ = neuron_uri.rsplit('/', 1)
        uri = rdflib.URIRef(base + '/' + name)
        metadata = ((uri, rdf.type, owl.Ontology),)
        instance_graph = OntGraph(path=folder / name)
        instance_graph.bind('AIBSSPEC', AIBSSPEC)
        instance_graph.bind('npokb', npokb)
        [instance_graph.add(t) for t in metadata]
        [instance_graph.add(t) for t in instances]
        # owl:AllDifferent axiom: assert that the dids are pairwise distinct individuals
        [instance_graph.add(t) for t in allDifferent(None, distinctMembers(*dids))]
        instance_graph.write()
Example #4
    def test_part_of(self):
        eeeee = self.OntTerm('UBERON:0008933',
                             label='primary somatosensory cortex')
        g = OntGraph()
        [g.add(t) for t in eeeee.triples_simple]
        g.debug()
        po = [t for t in eeeee.triples_simple if partOf in t]
        assert po, 'sadness'
Example #5
# imports inferred from usage of this snippet
from ontquery import OntCuries
from pyontutils.core import OntGraph
from pyontutils.closed_namespaces import rdf, rdfs


def normalize_prefixes(graph, curies):
    new_graph = OntGraph()
    oc = OntCuries.new()
    curies.pop('', None)
    curies['rdf'] = str(rdf)
    curies['rdfs'] = str(rdfs)
    oc(curies)
    oc.populate(new_graph)
    [new_graph.add(t) for t in graph]
    return new_graph
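
A usage sketch (hedged: parse is the standard rdflib API, and the curies
argument is a prefix -> namespace-IRI dict, here harvested from the graph
itself):

    g = OntGraph().parse('some.ttl', format='turtle')
    curies = {p: str(n) for p, n in g.namespaces()}
    clean = normalize_prefixes(g, curies)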
Example #6
    def populate(self, graph=None):
        """ Populate a graph, or if no graph is provided
            populate a new empty graph from the current
            content. (Also useful for debug) """

        if graph is None:
            graph = OntGraph()

        [graph.add(t) for t in self.triples]
        OntCuries.populate(graph)
        return graph
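
A minimal usage sketch (holder is hypothetical; any object exposing this
populate method and a triples property behaves the same way):

    g = holder.populate()   # new OntGraph filled from holder.triples
    sink = OntGraph()
    holder.populate(sink)   # or add the same triples to an existing graph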
Example #7
    def make_import_chain(self, ontology='nif.ttl'):
        itrips = self.get_itrips()
        if not any(ontology in t[0] for t in itrips):
            return None, None

        ontologies = ontology,  # hack around bad code in ontload
        import_graph = OntGraph()
        [import_graph.add(t) for t in itrips]

        self.tree, self.extra = next(import_tree(import_graph, ontologies))
        return self.tree, self.extra
Example #8
def main():
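    # assumed context for this snippet: OntGraph and populateFromJsonLd from
    # pyontutils.core, rdf/rdfs/schema namespaces from the pyontutils
    # namespace modules, augpathlib imported as aug, and a path_yaml helper
    # that loads a YAML file into a JSON-LD shaped dict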
    dandi_terms_path = aug.LocalPath.cwd()
    g = OntGraph()

    _ = [
        populateFromJsonLd(g, path_yaml(p))
        for p in dandi_terms_path.rglob('*.yaml')
    ]
    g.write('dandi-raw.ttl')
    remove = [(s, p, o) for p in (schema.domainIncludes, schema.rangeIncludes,
                                  rdfs.subClassOf, rdf.type)
              for s, o in g[:p:]]
    add = [(s, p, (g.namespace_manager.expand(o.toPython())
                   if isinstance(o, rdflib.Literal) else o))
           for s, p, o in remove]
    _ = [g.remove(t) for t in remove]
    _ = [g.add(t) for t in add]
    # TODO ontology metadata header section
    g.write('dandi.ttl')
Example #9
def swanson():
    """ not really a parcellation scheme
        NOTE: the defining information up here is now deprecated
        it is kept around to keep the code further down happy """

    source = auth.get_path('resources') / 'swanson_aligned.txt'
    ONT_PATH = 'http://ontology.neuinfo.org/NIF/ttl/generated/'
    filename = 'swanson_hierarchies'
    ontid = ONT_PATH + filename + '.ttl'
    PREFIXES = SwansonLabels.prefixes
    new_graph = makeGraph(filename, PREFIXES, writeloc='/tmp/')
    new_graph.add_ont(ontid,
                      'Swanson brain partomies',
                      'Swanson 2014 Partonomies',
                      'This file is automatically generated from ' + source.as_posix() + '.' + '**FIXME**',
                      'now')

    # FIXME citations should really go on the ... anatomy? scheme artifact
    definingCitation = 'Swanson, Larry W. Neuroanatomical Terminology: a lexicon of classical origins and historical foundations. Oxford University Press, USA, 2014.'
    definingCitationID = 'ISBN:9780195340624'
    new_graph.add_trip(ontid, 'NIFRID:definingCitation', definingCitation)
    new_graph.add_trip(ontid, 'NIFRID:definingCitationID', definingCitationID)

    with open(source, 'rt') as f:
        lines = [l.strip() for l in f.readlines()]

    # join header on page 794
    lines[635] += ' ' + lines.pop(636)
    # fix capitalization since this header is reused
    fixed = ' or '.join(
        [' ('.join([n.capitalize() for n in _.split(' (')])
         for _ in lines[635].lower().split(' or ')]).replace('human', 'HUMAN')
    lines[635] = fixed

    data = []
    for l in lines:
        if not l.startswith('#'):
            level = l.count('.'*5)
            l = l.strip('.')
            if ' (' in l:
                if ') or' in l:
                    n1, l = l.split(') or')
                    area_name, citationP = n1.strip().split(' (')
                    citation = citationP.rstrip(')')
                    d = (level, area_name, citation, 'NEXT SYN')
                    data.append(d)
                    #print(tc.red(tc.bold(repr(d))))

                area_name, citationP = l.strip().split(' (')
                citation = citationP.rstrip(')')
            else:
                area_name = l
                citation = None

            d = (level, area_name, citation, None)
            #print(d)
            data.append(d)
    results = Async()(deferred(sgv.findByTerm)(d[1]) for d in data)
    #results = [None] * len(data)
    curies = [[r['curie'] for r in _ if 'curie' in r and 'UBERON' in r['curie']] if _ else [] for _ in results]
    output = [_[0] if _ else None for _ in curies]

    header = ['Depth', 'Name', 'Citation', 'NextSyn', 'Uberon']
    zoop = [header] + [r for r in zip(*zip(*data), output)] + \
            [(0, 'Appendix END None', None, None, None)]  # needed to add last appendix

    # TODO annotate the appendicies and the classes with these
    appendix_root_mapping = (1, 1, 1, 1, 30, 83, 69, 70, 74, 1)  # should generate?

    class SP(rowParse):
        def __init__(self):
            self.nodes = defaultdict(dict)
            self._appendix = 0
            self.appendicies = {}
            self._last_at_level = {}
            self.names = defaultdict(set)
            self.children = defaultdict(set)
            self.parents = defaultdict(set)
            self.next_syn = False
            super().__init__(zoop)

        def Depth(self, value):
            if self.next_syn:
                self.synonym = self.next_syn
            else:
                self.synonym = False
            self.depth = value

        def Name(self, value):
            self.name = value

        def Citation(self, value):
            self.citation = value

        def NextSyn(self, value):
            if value:
                self.next_syn = self._rowind
            else:
                self.next_syn = False

        def Uberon(self, value):
            self.uberon = value

        def _row_post(self):
            # check if we are in the next appendix
            # may want to xref ids between appendicies as well...
            if self.depth == 0:
                if self.name.startswith('Appendix'):
                    if self._appendix:
                        self.appendicies[self._appendix]['children'] = dict(self.children)
                        self.appendicies[self._appendix]['parents'] = dict(self.parents)
                        self._last_at_level = {}
                        self.children = defaultdict(set)
                        self.parents = defaultdict(set)
                    _, num, apname = self.name.split(' ', 2)
                    if num == 'END':
                        return
                    self._appendix = int(num)
                    self.appendicies[self._appendix] = {
                        'name':apname.capitalize(),
                        'type':self.citation.capitalize() if self.citation else None}
                    return
                else:
                    if ' [' in self.name:
                        name, taxonB = self.name.split(' [')
                        self.name = name
                        self.appendicies[self._appendix]['taxon'] = taxonB.rstrip(']').capitalize()
                    else:  # top level is animalia
                        self.appendicies[self._appendix]['taxon'] = 'ANIMALIA'.capitalize()

                    self.name = self.name.capitalize()
                    self.citation = self.citation.capitalize()
            # nodes
            if self.synonym:
                self.nodes[self.synonym]['synonym'] = self.name
                self.nodes[self.synonym]['syn-cite'] = self.citation
                self.nodes[self.synonym]['syn-uberon'] = self.uberon
                return
            else:
                if self.citation:  # Transverse Longitudinal etc all @ lvl4
                    self.names[self.name + ' ' + self.citation].add(self._rowind)
                else:
                    self.name += str(self._appendix) + self.nodes[self._last_at_level[self.depth - 1]]['label']
                    #print(level, self.name)
                    # can't return here because they are their own level
                # replace with actually doing something...
                self.nodes[self._rowind]['label'] = self.name
                self.nodes[self._rowind]['citation'] = self.citation
                self.nodes[self._rowind]['uberon'] = self.uberon
            # edges
            self._last_at_level[self.depth] = self._rowind
            # TODO will need something to deal with the Lateral/
            if self.depth > 0:
                try:
                    parent = self._last_at_level[self.depth - 1]
                except KeyError:  # no parent recorded at the level above
                    breakpoint()
                self.children[parent].add(self._rowind)
                self.parents[self._rowind].add(parent)

        def _end(self):
            replace = {}
            for asdf in [sorted(n) for k, n in self.names.items() if len(n) > 1]:
                replace_with, to_replace = asdf[0], asdf[1:]
                for r in to_replace:
                    replace[r] = replace_with

            for r, rw in replace.items():
                #print(self.nodes[rw])
                o = self.nodes.pop(r)
                #print(o)

            for vals in self.appendicies.values():
                children = vals['children']
                parents = vals['parents']
                # need reversed so children are corrected before swap
                for r, rw in reversed(sorted(replace.items())):
                    if r in parents:
                        child = r
                        new_child = rw
                        parent = parents.pop(child)
                        parents[new_child] = parent
                        parent = list(parent)[0]
                        children[parent].remove(child)
                        children[parent].add(new_child)
                    if r in children:
                        parent = r
                        new_parent = rw
                        childs = children.pop(parent)
                        children[new_parent] = childs
                        for child in childs:
                            parents[child] = {new_parent}

            self.nodes = dict(self.nodes)

    sp = SP()
    tp = [_ for _ in sorted(['{: <50}'.format(n['label']) + n['uberon'] if n['uberon'] else n['label'] for n in sp.nodes.values()])]
    #print('\n'.join(tp))
    #print(sp.appendicies[1].keys())
    #print(sp.nodes[1].keys())
    nbase = PREFIXES['SWAN'] + '%s'
    json_ = {'nodes':[],'edges':[]}
    parent = ilxtr.swansonBrainRegionConcept

    og = OntGraph()
    for node, anns in sp.nodes.items():
        nid = nbase % node
        new_graph.add_class(nid, parent, label=anns['label'])
        new_graph.add_trip(nid, 'NIFRID:definingCitation', anns['citation'])
        json_['nodes'].append({'lbl':anns['label'],'id':'SWA:' + str(node)})
        #if anns['uberon']:
            #new_graph.add_trip(nid, owl.equivalentClass, anns['uberon'])  # issues arrise here...
        [og.add(t) for t in map_term(rdflib.URIRef(nid), anns['label'], prefix='UBERON')]

    og.write(auth.get_path('ontology-local-repo') /
             'ttl/generated/swanson-uberon-mapping.ttl')
    #hrm = [(anns['label'], gn(anns['label'])) for node, anns in sp.nodes.items()]
    #ok = [(h, test, term_source(h, test)) for h, test in hrm if test]
    #notok = [h for h, test in hrm if not test]

    for appendix, data in sp.appendicies.items():
        aid = PREFIXES['SWAA'] + str(appendix)
        new_graph.add_class(aid, label=data['name'].capitalize())
        new_graph.add_trip(aid, 'ilxtr:hasTaxonRank', data['taxon'])  # FIXME appendix is the data artifact...
        children = data['children']
        ahp = 'swanr:hasPart' + str(appendix)
        apo = 'swanr:partOf' + str(appendix)
        new_graph.add_op(ahp, transitive=True)
        new_graph.add_op(apo, inverse=ahp, transitive=True)
        for parent, childs in children.items():  # FIXME does this give complete coverage?
            pid = nbase % parent
            for child in childs:
                cid = nbase % child
                new_graph.add_restriction(pid, ahp, cid)  # note hierarchy inverts direction
                new_graph.add_restriction(cid, apo, pid)
                json_['edges'].append({'sub':'SWA:' + str(child),'pred':apo,'obj':'SWA:' + str(parent)})

    return new_graph
Example #10
def run(args):
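    # args is a docopt-style dict: '<positional>' arguments, '--option' values,
    # and boolean mode flags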
    # modes
    graph = args['graph']
    scigraph = args['scigraph']
    config = args['config']
    imports = args['imports']
    chain = args['chain']
    extra = args['extra']

    # required
    repo_name = args['<repo>']
    remote_base = args['<remote_base>']
    ontologies = args['<ontologies>']

    # options
    git_remote = args['--git-remote']
    git_local = Path(args['--git-local']).resolve()
    zip_location = Path(args['--zip-location']).resolve()
    graphload_config = Path(args['--graphload-config']).resolve()
    graphload_config_template = graphload_config  # NOTE XXX
    if args['--graphload-ontologies'] is not None:
        graphload_ontologies = Path(args['--graphload-ontologies']).resolve()
    else:
        graphload_ontologies = None

    org = args['--org']
    branch = args['--branch']
    commit = args['--commit']
    scp = args['--scp-loc']
    sorg = args['--scigraph-org']
    sbranch = args['--scigraph-branch']
    scommit = args['--scigraph-commit']
    sscp = args['--scigraph-scp-loc']
    scigraph_quiet = args['--scigraph-quiet']
    patch_config = args['--patch-config']
    curies_location = args['--curies']
    patch = args['--patch']
    check_built = args['--check-built']
    debug = args['--debug']
    log = args['--logfile']  # TODO
    fix_imports_only = args['--fix-imports-only']

    load_base = 'scigraph-load -c {config_path}'  # now _this_ is easier

    if args['--view-defaults']:
        for k, v in defaults.items():
            print(f'{k:<22} {v}')
        return

    # post parse mods
    if remote_base == 'NIF':
        remote_base = 'http://ontology.neuinfo.org/NIF'

    itrips = None

    if repo_name is not None:
        local_base = jpth(git_local, repo_name)

    if graph:
        if args['--path-build-scigraph']:  # path-build-scigraph
            path_build_scigraph = Path(args['--path-build-scigraph'])
            (scigraph_commit, services_zip,
             scigraph_reset_state) = scigraph_build(path_build_scigraph,
                                                    git_remote,
                                                    sorg,
                                                    path_build_scigraph,
                                                    sbranch,
                                                    scommit,
                                                    check_built=check_built,
                                                    cleanup_later=True,
                                                    quiet=scigraph_quiet)
        else:
            scigraph_commit = 'dev-9999'
            services_zip = 'None'
            scigraph_reset_state = lambda: None

        with execute_regardless(scigraph_reset_state):
            rl = ReproLoader(
                zip_location,
                git_remote,
                org,
                git_local,
                repo_name,
                branch,
                commit,
                remote_base,
                load_base,
                graphload_config_template,
                graphload_ontologies,
                patch_config,
                patch,
                scigraph_commit,
                fix_imports_only=fix_imports_only,
                check_built=check_built,
            )

        if not fix_imports_only:
            FILE_NAME_ZIP = Path(rl.zip_path).name
            LATEST = Path(zip_location) / 'LATEST'
            if LATEST.exists() and LATEST.is_symlink():
                LATEST.unlink()

            LATEST.symlink_to(FILE_NAME_ZIP)

            itrips, config = rl.itrips, rl.config

            if not ontologies:
                ontologies = rl.ontologies

            print(services_zip)
            print(rl.zip_path)
            if args.get('--local'):  # test the option value, not mere key presence
                return

    elif scigraph:
        (scigraph_commit, services_zip,
         _) = scigraph_build(zip_location,
                             git_remote,
                             sorg,
                             git_local,
                             sbranch,
                             scommit,
                             check_built=check_built,
                             quiet=scigraph_quiet)
        print(services_zip)
        if args.get('--local'):  # test the option value, not mere key presence
            return

    elif config:
        #graph_path = Path(args['<graph_path>']).resolve()
        config_path = Path(args['--graph-config-out']).resolve()
        #local_base = Path(git_local, repo_name).resolve()
        date_today = TODAY()
        ReproLoader.make_graphload_config(graphload_config_template,
                                          graphload_ontologies, zip_location,
                                          date_today, config_path)

    elif imports:
        # TODO mismatch between import name and file name needs a better fix
        itrips = local_imports(remote_base, local_base, ontologies)
    elif chain:
        itrips = local_imports(remote_base,
                               local_base,
                               ontologies,
                               readonly=True)
    elif extra:
        from nifstd_tools.utils import memoryCheck
        curies = getCuries(curies_location)
        curie_prefixes = set(curies.values())
        memoryCheck(2665488384)
        graph = loadall(git_local, repo_name)
        new_graph = normalize_prefixes(graph, curies)
        for_burak(new_graph)
        debug = True
    elif patch:
        local_base = jpth(git_local, repo_name)
        local_versions = tuple(do_patch(patch_config, local_base))
    else:
        raise BaseException('How did we possibly get here docopt?')

    if itrips:
        import_graph = OntGraph()
        [import_graph.add(t) for t in itrips]
        for tree, extra in import_tree(import_graph, ontologies):
            name = Path(next(iter(tree.keys()))).name
            with open(jpth(zip_location, f'{name}-import-closure.html'),
                      'wt') as f:
                f.write(extra.html.replace('NIFTTL:',
                                           ''))  # much more readable

    if debug:
        breakpoint()
Example #11
    def inner(local_filepath, remote=False):
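        # note: bigleaves, dobig, oo, triples, done, p (the imports predicate),
        # remote_base, local_base, revert, and readonly are free variables
        # closed over from the enclosing function (local_imports)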
        if noneMembers(local_filepath, *bigleaves) or dobig:
            ext = os.path.splitext(local_filepath)[-1]
            if ext == '.ttl':
                infmt = 'turtle'
            else:
                log.info((ext, local_filepath))
                infmt = None
            if remote:
                resp = requests.get(
                    local_filepath
                )  # TODO nonblocking pull these out, fetch, run inner again until done
                raw = resp.text.encode()
            else:
                try:
                    with open(local_filepath, 'rb') as f:
                        raw = f.read()
                except FileNotFoundError as e:
                    if local_filepath.startswith('file://'):
                        log.info(
                            f'local_imports has already been run, skipping {local_filepath}'
                        )
                        return
                        #raise ValueError('local_imports has already been run') from e
                    else:
                        log.exception(
                            e
                        )  # TODO raise a warning if the file cannot be matched
                        # seems like good practice to have any imported ontology under
                        # version control so all imports are guaranteed to have good
                        # provenance and not split the prior information between the
                        # scigraph config and the repository, the repository remains
                        # the source of truth, load.yaml files can then pick a subset
                        # of the properly tracked files to load as they see fit, but
                        # not add to them (at least in pyontutils land)
                        raw = b''

            if oo in raw:  # we only care if there are imports or an ontology iri
                scratch = OntGraph()
                if infmt == 'turtle':
                    data, rest = raw.split(b'###', 1)
                elif infmt is None:  # assume xml
                    xml_tree = etree.parse(BytesIO(raw))
                    xml_root = xml_tree.getroot()
                    xml_ontology = xml_tree.xpath(
                        "/*[local-name()='RDF']/*[local-name()='Ontology']")
                    xml_root.clear()
                    xml_root.append(xml_ontology[0])
                    data = etree.tostring(xml_root)
                    rest = b''  # keep rest bound so the turtle write-back below cannot NameError
                scratch.parse(data=data, format=infmt)
                for s in scratch.subjects(rdf.type, owl.Ontology):
                    triples.add((s, owl.sameAs, rdflib.URIRef(local_filepath)))
                    # somehow this breaks computing the chain
                    #for p in (rdfs.comment, skos.definition, definition, dc.title, rdfs.label):
                    #for o in scratch[s:p]:
                    #triples.add((s, p, o))
                for s, o in sorted(scratch.subject_objects(p)):
                    if revert:
                        raise NotImplementedError('TODO')
                    nlfp = o.replace(remote_base, local_base)
                    triples.add((s, p, o))
                    if 'http://' in local_filepath or 'external' in local_filepath:  # FIXME what to do about https used inconsistently :/
                        if 'external' in local_filepath:
                            imported_iri = rdflib.URIRef(
                                local_filepath.replace(
                                    local_base, remote_base))  # inefficient
                        else:
                            imported_iri = rdflib.URIRef(local_filepath)
                        if s != imported_iri:
                            imported_iri_vs_ontology_iri[
                                imported_iri] = s  # kept for the record
                            triples.add((imported_iri, p,
                                         s))  # bridge imported != ontology iri
                    if local_base in nlfp and 'file://' not in o:  # FIXME file:// should not be slipping through here...
                        scratch.add((s, p, rdflib.URIRef('file://' + nlfp)))
                        scratch.remove((s, p, o))
                    if nlfp not in done:
                        done.append(nlfp)
                        if local_base in nlfp and 'external' not in nlfp:  # skip externals TODO
                            inner(nlfp)
                        elif readonly:  # read external imports
                            if 'external' in nlfp:
                                inner(nlfp)
                            else:
                                inner(nlfp, remote=True)
                if not readonly:
                    _orp = CustomTurtleSerializer.roundtrip_prefixes  # FIXME awful hack :/
                    CustomTurtleSerializer.roundtrip_prefixes = True
                    ttl = scratch.serialize(format='nifttl', encoding='utf-8')
                    CustomTurtleSerializer.roundtrip_prefixes = _orp
                    ndata, comment = ttl.split(b'###', 1)
                    out = ndata + b'###' + rest
                    with open(local_filepath, 'wb') as f:
                        f.write(out)
Example #12
    def new_index(self, referenceIndex, *, commit=True):
        """ reference hosts have a single incrementing primary key index
            to which everything is mapped

            in theory these indexes could also be per 'prefix', i.e.
            the sandboxed uri path or external uri path to which
            something is mapped; I don't see any reason not to do this
            for this kind of implementation since a regular pattern
            can be developed
        """
        '''
            QUESTION: do we force a remapping of external id sequences
            into uris/ first? this seems like a bad idea? or rather,
            it is actually a good idea, but it will have to be done with
            a pattern based redirect instead of an actual materialization
            the alternative is to do what ontobee does and pass the external
            iri as a query parameter ... hrm tradeoffs, well we certainly
            can't make a nice /uberon/uris/obo/{UBERON_} folder if we include
            the whole uri ... so this seems a reasonable tradeoff
            http://purl.obolibrary.org/obo/ can wind up being mapped into
            multiple uri spaces ... /obo/uris/obo/ would seem to make more sense
            but how to indicate that other organizations/projects map there ...
            /uberon/uris/obo/UBERON_ could indicate the latest sequence
            ah, and of course in theory this gets us out of the very annoying
            situation where /uberon/uris/obo/UBERON_ really IS different than
            /doid/uris/obo/UBERON_ for some identifiers (sigh) and if they are
            all mapped and masking based on presence then we can detect the issues
            HOWEVER how do we enforce that in reality the _mapping_ is all to
            /obo/uris/obo/ ??
        '''

        path = self.path_index(referenceIndex)

        rrp = path.repo_relative_path
        s = sncho[rrp.with_suffix('').as_posix()]  # TODO check ownership

        if path.exists():
            raise FileExistsError(path)

        g = OntGraph(path=path)
        OntCuries.populate(g)
        # TODO these are really identified by the follow:
        # base/readable/
        # {group}/uris/
        # base/ontologies/
        # {group}/ontologies/uris/
        pos = (
            (rdf.type, snchn.IndexGraph),
            (rdfs.label, rdflib.Literal(f'IndexGraph for {referenceIndex}')),
            (snchn.referenceIndex, rdflib.Literal(referenceIndex)),  # TODO HRM
            #(snchn.indexRemote, )
        )

        for po in pos:
            g.add((s, *po))  # FIXME

        g.path.parent.mkdir(parents=True)
        g.write()

        if commit:
            path.commit(f'add new index for {referenceIndex}')

        return path
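
A hypothetical call (the receiver and the reference index name are
illustrative only):

    path = host.new_index('uberon', commit=False)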
Example #13
def main():
    olr = auth.get_path('ontology-local-repo')
    resources = auth.get_path('resources')
    if not olr.exists():
        raise FileNotFoundError(f'{olr} does not exist cannot continue')
    if not resources.exists():
        raise FileNotFoundError(f'{resources} does not exist cannot continue')

    PREFIXES = makePrefixes('definition', 'replacedBy', 'hasRole', 'oboInOwl',
                            'CHEBI', 'owl', 'skos')
    ug = makeGraph('utilgraph', prefixes=PREFIXES)
    file = resources / 'chebi-subset-ids.txt'
    with open(file.as_posix(), 'rt') as f:
        ids_raw = set((_.strip() for _ in f.readlines()))
        ids = sorted(set((ug.expand(_.strip()) for _ in ids_raw)))

    def check_chebis(g):
        a = []
        for id_ in ids:
            l = sorted(g.triples((id_, None, None)))
            ll = len(l)
            a.append(ll)
        return a

    def fixIons(g):
        # there is a series of atom/ion confusions to deal with; the solution is
        # to add the atom name ('iron') as a synonym on the charged form, since
        # that is what the biologists are usually referring to...
        ng = makeGraph('', graph=g, prefixes=makePrefixes('CHEBI'))
        # atom           ion
        None, 'CHEBI:29108'  # calcium is ok
        ng.replace_uriref('CHEBI:30145', 'CHEBI:49713')  # lithium
        ng.replace_uriref('CHEBI:18248', 'CHEBI:29033')  # iron
        ng.replace_uriref('CHEBI:26216', 'CHEBI:29103')  # potassium
        ng.replace_uriref('CHEBI:26708', 'CHEBI:29101')  # sodium
        None, 'CHEBI:29105'  # zinc is ok

    g = OntGraph()
    cg = OntGraph()
    cd = OntGraph()
    chemg = OntGraph()
    molg = OntGraph()

    cg.parse(olr / 'ttl/generated/chebislim.ttl', format='turtle')
    list(g.add(t) for t in cg)
    a1 = check_chebis(g)

    cd.parse(olr / 'ttl/generated/chebi-dead.ttl', format='turtle')
    list(g.add(t) for t in cd)
    a2 = check_chebis(g)

    chemg.parse(olr / 'ttl/NIF-Chemical.ttl', format='turtle')
    chemgg = makeGraph('NIF-Chemical', graph=chemg)
    fixIons(chemg)
    list(g.add(t) for t in chemg)
    a3 = check_chebis(g)

    molg.parse(olr / 'ttl/NIF-Molecule.ttl', format='turtle')
    molgg = makeGraph('NIF-Molecule', graph=molg)
    fixIons(molg)
    list(g.add(t) for t in molg)
    a4 = check_chebis(g)

    replacedBy = ug.expand('replacedBy:')
    deads = {s: o for s, o in cd.subject_objects(replacedBy)}  # dead id -> replacement

    def switch_dead(g):
        ng = makeGraph('', graph=g, prefixes=makePrefixes('oboInOwl'))
        for f, r in deads.items():
            ng.replace_uriref(f, r)
            ng.add_trip(r, 'oboInOwl:hasAlternateId',
                        rdflib.Literal(f, datatype=rdflib.XSD.string))
            g.remove(
                (r, replacedBy, r))  # in case the replaced by was already in

    switch_dead(g)
    switch_dead(cg)
    switch_dead(chemg)
    switch_dead(molg)

    def fixHasAltId(g):
        ng = makeGraph('',
                       graph=g,
                       prefixes=makePrefixes('oboInOwl', 'NIFCHEM', 'NIFRID'))
        ng.replace_uriref('NIFCHEM:hasAlternativeId',
                          'oboInOwl:hasAlternativeId')
        # ng.replace_uriref('NIFRID:ChEBIid', 'oboInOwl:id')  # :id does not exist, do we need an alternative?

    list(map(fixHasAltId, (g, cg, chemg)))

    def fixAltIdIsURIRef(g):
        hai = ug.expand('oboInOwl:hasAlternativeId')
        # i = ug.expand('oboInOwl:id')  # :id does not exist
        makeGraph('', graph=g, prefixes=makePrefixes(
            'CHEBI'))  # amazingly sometimes this is missing...

        def inner(s, p, o):
            if type(o) == rdflib.URIRef:
                qn = g.namespace_manager.qname(o)
                g.add((s, p, rdflib.Literal(qn, datatype=rdflib.XSD.string)))
                if 'ns' in qn:
                    print('WARNING UNKNOWN NAMESPACE BEING SHORTENED', str(o),
                          qn)
                g.remove((s, p, o))

        for s, o in g.subject_objects(hai):
            inner(s, hai, o)
        #for s, o in g.subject_objects(i):  # :id does not exist
        #inner(s, i, o)

    list(map(fixAltIdIsURIRef, (g, cg, chemg)))

    matches = [_ for _ in zip(a1, a2, a3, a4)]
    changed = [len(set(_)) != 1 for _ in matches]
    review = [(id_, m) for id_, changed, m in zip(ids, changed, matches)
              if changed and m[0]]
    # for reasons currently lost to implementation details this returns a list of empty lists if run from ipython
    wat_c = [
        set([(s, str(o.toPython()))
             for s, p, o in cg.triples((u, None, None))]) for u, _ in review
    ]
    wat_a = [
        set([(s, str(o.toPython())) for s, p, o in g.triples((u, None, None))])
        for u, _ in review
    ]
    wat_c_ = [
        set(cg.triples((u, None, None))) for u, _ in review
    ]  # for reasons currently lost to implementation details this returns a list of empty lists if run from ipython
    wat_a_ = [
        set(g.triples((u, None, None))) for u, _ in review
    ]  # for reasons currently lost to implementation details this returns a list of empty lists if run from ipython
    diff = [a - c for a, c in zip(wat_a, wat_c)]
    diff_ = [a - c for a, c in zip(wat_a_, wat_c_)]

    cb = createOntology(
        'chebi-bridge',
        'NIF ChEBI bridge',
        makePrefixes('CHEBI', 'BFO1SNAP', 'owl', 'skos', 'dc', 'hasRole',
                     'NIFCHEM', 'oboInOwl', 'NIFMOL', 'NIFRID'),
        'chebibridge',
        ('This bridge file contains additional annotations'
         ' on top of CHEBI identifiers that were originally'
         ' included in NIF-Chemical or NIF-Molecule that have'
         ' not since been added to CHEBI upstream'),
        path='ttl/bridge/',
        #imports=('https://raw.githubusercontent.com/SciCrunch/NIF-Ontology/master/ttl/generated/chebislim.ttl',
        #'https://raw.githubusercontent.com/SciCrunch/NIF-Ontology/master/ttl/generated/chebi-dead.ttl'))
        imports=(
            'http://ontology.neuinfo.org/NIF/ttl/generated/chebislim.ttl',
            'http://ontology.neuinfo.org/NIF/ttl/generated/chebi-dead.ttl'))

    out = []
    for set_ in diff:
        for sub, string in sorted(set_):
            for t in g.triples((sub, None, None)):
                # note that this process will do things like remove the
                # hasStreetName 'ecstasy' from CHEBI:1391 since chebislim
                # has it listed as a synonym
                py = t[-1].toPython()
                if py == string and not py.startswith(
                        'ub'
                ):  # ignore restrictions... this is safe because nifmol and nifchem dont have any restrictions...
                    cb.add_recursive(t, g)
        cb.add_class(
            sub
        )  # only need to go at the end because sub is the same for each set

    def hasImplicitSuperclass(s, o):
        for super_ in cg.objects(s, rdflib.RDFS.subClassOf):
            if super_ == o:
                return True
            elif hasImplicitSuperclass(super_, o):
                return True

    # curation decisions after review (see outtc for full list)
    curatedOut = []

    def curateOut(*t):
        curatedOut.append(
            tuple(
                ug.expand(_) if type(_) is not rdflib.Literal else _
                for _ in t))
        cb.del_trip(*t)

    curateOut(
        'CHEBI:6887', 'rdfs:subClassOf', 'CHEBI:23367'
    )  # defer to the chebi choice of chemical substance over molecular entity since it is classified as a racemate which doesn't quite match the mol ent def
    curateOut(
        'CHEBI:26519', 'rdfs:subClassOf', 'CHEBI:24870'
    )  # some ions may also be free radicals, but all free radicals are not ions!
    #natural product removal since natural product should probably be a role if anything...
    curateOut('CHEBI:18059', 'rdfs:subClassOf', 'CHEBI:33243')
    curateOut('CHEBI:24921', 'rdfs:subClassOf', 'CHEBI:33243')
    curateOut('CHEBI:37332', 'rdfs:subClassOf', 'CHEBI:33243')

    curateOut('CHEBI:50906', 'rdfs:label',
              rdflib.Literal('Chemical role', datatype=rdflib.XSD.string)
              )  # chebi already has a chemical role...
    curateOut(
        'CHEBI:22586', 'rdfs:subClassOf', 'CHEBI:24432'
    )  # antioxidant is already modelled as a chemical role instead of a biological role, the distinction is that the biological roles affect biological processes/property, not chemical processes/property
    curateOut('CHEBI:22720', 'rdfs:subClassOf',
              'CHEBI:27171')  # not all children are bicyclic
    curateOut(
        'CHEBI:23447', 'rdfs:subClassOf', 'CHEBI:17188'
    )  # this one seems obviously false... all cyclic nucleotides are not nucleoside 5'-monophosphate...
    curateOut(
        'CHEBI:24922', 'rdfs:subClassOf', 'CHEBI:27171'
    )  # not all children are bicyclic, some may be poly, therefore removing
    curateOut(
        'CHEBI:48706', 'rdfs:subClassOf', 'CHEBI:33232'
    )  # removing since antagonist is more incidental and pharmacological role is more appropriate (as chebi has it)
    curateOut('CHEBI:51064', 'rdfs:subClassOf',
              'CHEBI:35338')  # removing since chebi models this with has part
    curateOut(
        'CHEBI:8247', 'rdfs:subClassOf', 'CHEBI:22720'
    )  # the structure is 'fused to' a benzo, but it is not a benzo, chebi has the correct
    #curateOut('CHEBI:9463', 'rdfs:subClassOf', 'CHEBI:50786')  # not sure what to make of this wikipedia says one thing, but chebi says another, very strange... not an anabolic agent?!??! wat no idea

    # review hold over subClassOf statements
    intc = []
    outtc = []
    for s, o in cb.g.subject_objects(rdflib.RDFS.subClassOf):
        if str(
                o
        ) == 'http://ontology.neuinfo.org/NIF/Backend/BIRNLex_annotation_properties.owl#_birnlex_retired_class' or str(
                o
        ) == 'http://ontology.neuinfo.org/nif/nifstd/readable/birnlexRetiredClass':
            # we need to remove any of the cases where deprecation was misused
            cb.g.remove((s, rdflib.RDFS.subClassOf, o))
        elif hasImplicitSuperclass(s, o):
            cb.g.remove((s, rdflib.RDFS.subClassOf, o))
            intc.append((s, rdflib.RDFS.subClassOf, o))
        else:
            outtc.append((s, rdflib.RDFS.subClassOf, o))

    def qname(trips):
        return tuple(
            tuple(cb.g.namespace_manager.qname(_) for _ in t) for t in trips)

    for a, p, b in sorted(qname(outtc)):
        if 'NIFMOL' in b:
            continue  # not considering cases where NIFMOL/NIFCHEM ids are used, that can come later
        s = sgv.findById(a)
        o = sgv.findById(b)
        if s is None or o is None:
            print(a, '=>', s)
            print(b, '=>', o)
        else:
            print(s['labels'], s['curie'])
            print('subClassOf')
            print(o['labels'], o['curie'])
            print((a, p, b))
        print('---------------------')

    cb.write(
    )  # re-add only the missing edges so that we can zap them from NIF-Molecule and NIF-Chemical (recurse is needed...)

    # validation
    diff2 = set(cb.g) - set(cg)
    diff3 = set(cb.g) - diff2  # should just be all the owl:Class entries
    diff4 = set(cb.g) - set(chemg) | set(cb.g) - set(molg)  # not informative
    diff5 = set(cb.g) - diff4  # not informative
    both = set(chemg) & set(
        molg)  # there is no overlap beyond the owl:Class declarations

    def getChebis(set_):
        return set(t for t in set_ if 'CHEBI_' in t[0])

    def nodt(graph):
        return set((s, str(o) if type(o) is rdflib.Literal else o)
                   for s, p, o in graph)

    cmc = getChebis((((
        (nodt(chemg) - nodt(cb.g)) - nodt(cg)) - nodt(cd)) - nodt(intc)) -
                    nodt(curatedOut))
    cmc = sorted(t for s, o in cmc for t in chemg.triples((s, None, o)))
    mmc = getChebis((((
        (nodt(molg) - nodt(cb.g)) - nodt(cg)) - nodt(cd)) - nodt(intc)) -
                    nodt(curatedOut))
    mmc = sorted(t for s, o in mmc for t in molg.triples((s, None, o)))

    # remove chebi classes from nifchem and nifmol
    def remstuff(sources, targets):
        for source in sources:
            for id_ in source.subjects(rdflib.RDF.type, rdflib.OWL.Class):
                for target in targets:
                    target.del_class(id_)

    remstuff((cg, cd), (chemgg, molgg))

    chemgg.write()
    molgg.write()

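    # when run as a script (not imported), end in the debugger for inspection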
    if __name__ == '__main__':
        breakpoint()