Example #1
def repeat(dobig=dobig):  # we don't really know when to stop, so just adjust
    for s, o in graph.subject_objects(owl.imports):
        if os.path.basename(o) not in done and o not in done:
        #if (o, rdf.type, owl.Ontology) not in graph:
            print(o)
            done.append(o)
            ext = os.path.splitext(o)[1]
            fmt = 'turtle' if ext == '.ttl' else 'xml'
            if noneMembers(o, *bigleaves) or dobig:
                graph.parse(o, format=fmt)
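These snippets lean on two small pyontutils helpers, noneMembers and anyMembers. A minimal sketch of what they are assumed to do, inferred from how they are called here rather than copied from the library:

def anyMembers(container, *members):
    # True if any member occurs in the container (substring or element test)
    return any(m in container for m in members)

def noneMembers(container, *members):
    # True only if no member occurs in the container
    return not anyMembers(container, *members)

# e.g. noneMembers('.../go.owl', 'go.owl', 'uberon.owl') is False,
# so that import would be skipped above unless dobig is set.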
Example #2
def repeat(dobig=False):  # we don't really know when to stop, so just adjust
    for s, o in graph.subject_objects(rdflib.OWL.imports):
        if os.path.basename(o) not in done and o not in done:
            #if (o, rdflib.RDF.type, rdflib.OWL.Ontology) not in graph:
            print(o)
            done.append(o)
            ext = os.path.splitext(o)[1]
            fmt = 'turtle' if ext == '.ttl' else 'xml'
            if noneMembers(o, 'go.owl', 'uberon.owl', 'pr.owl', 'doid.owl',
                           'taxslim.owl') or dobig:
                graph.parse(o, format=fmt)
Example #3
def graph_todo(graph, curie_prefixes, get_values):
    ug = makeGraph('big-graph', graph=graph)
    ug.add_known_namespaces('NIFRID')
    fragment_prefixes, ureps = get_values(ug)
    #all_uris = sorted(set(_ for t in graph for _ in t if type(_) == rdflib.URIRef))  # this snags a bunch of other URIs
    #all_uris = sorted(set(_ for _ in graph.subjects() if type(_) != rdflib.BNode))
    #all_uris = set(spo for t in graph.subject_predicates() for spo in t if isinstance(spo, rdflib.URIRef))
    all_uris = set(spo for t in graph for spo in t if isinstance(spo, rdflib.URIRef))
    prefs = set(_.rsplit('#', 1)[0] + '#' if '#' in _
                       else (_.rsplit('_',1)[0] + '_' if '_' in _
                             else _.rsplit('/',1)[0] + '/') for _ in all_uris)
    nots = set(_ for _ in prefs if _ not in curie_prefixes)  # TODO
    sos = set(prefs) - set(nots)
    all_uris = [u if u not in ureps
                else ureps[u]
                for u in all_uris]
    #to_rep = set(_.rsplit('#', 1)[-1].split('_', 1)[0] for _ in all_uris if 'ontology.neuinfo.org' in _)
    #to_rep = set(_.rsplit('#', 1)[-1] for _ in all_uris if 'ontology.neuinfo.org' in _)

    ignore = (
        # deprecated and only in as annotations
        'NIFGA:birnAnatomy_011',
        'NIFGA:birnAnatomy_249',
        'NIFORG:birnOrganismTaxon_19',
        'NIFORG:birnOrganismTaxon_20',
        'NIFORG:birnOrganismTaxon_21',
        'NIFORG:birnOrganismTaxon_390',
        'NIFORG:birnOrganismTaxon_391',
        'NIFORG:birnOrganismTaxon_56',
        'NIFORG:birnOrganismTaxon_68',
        'NIFINV:birnlexInvestigation_174',
        'NIFINV:birnlexInvestigation_199',
        'NIFINV:birnlexInvestigation_202',
        'NIFINV:birnlexInvestigation_204',
    )
    ignore = tuple(ug.expand(i) for i in ignore)


    non_normal_identifiers = sorted(u for u in all_uris
                                    if 'ontology.neuinfo.org' in u
                                    and noneMembers(u, *fragment_prefixes)
                                    and not u.endswith('.ttl')
                                    and not u.endswith('.owl')
                                    and u not in ignore)
    print(len(prefs))
    embed()
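The prefs set above derives a namespace-like prefix from each URI by splitting on '#', then '_', then '/'. An equivalent standalone sketch of that heuristic (the URIs below are illustrative, not taken from the graph):

def uri_prefix(u):
    # keep everything up to and including the last '#', else '_', else '/'
    if '#' in u:
        return u.rsplit('#', 1)[0] + '#'
    elif '_' in u:
        return u.rsplit('_', 1)[0] + '_'
    else:
        return u.rsplit('/', 1)[0] + '/'

assert uri_prefix('http://www.w3.org/2002/07/owl#Class') == 'http://www.w3.org/2002/07/owl#'
assert uri_prefix('http://purl.obolibrary.org/obo/UBERON_0000955') == 'http://purl.obolibrary.org/obo/UBERON_'
assert uri_prefix('http://example.org/terms/brain') == 'http://example.org/terms/'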
Example #4
def uri_normalization(uri):
    """ NOTE: this does NOT produce uris """
    try:
        # strip hypothesis extension prefix
        if uri.startswith('chrome-extension://bjfhmglciegochdpefhhlphglcehbmek/content/web/viewer.html?file='):
            junk, uri = uri.split('=', 1)

        # universal fixes
        no_fragment, *_frag = uri.rsplit('#', 1)
        no_trailing_slash = no_fragment.rstrip('/')  # annoying
        _scheme, no_scheme = no_trailing_slash.split('://', 1)

        # special cases
        if 'frontiersin.org' in no_scheme:
            # og:url on frontiers is incorrect
            no_scheme = no_scheme.replace('article/', 'articles/')
        elif 'fasebj.org' in no_scheme:  # FIXME this one has _all_ the variants :/
            no_scheme = (no_scheme
                         .replace('.abstract', '')
                         .replace('.full', '')
                         .replace('.pdf', '')
            )
        elif no_scheme.endswith('?needAccess=true'):
            no_scheme = no_scheme[:-len('?needAccess=true')]
        elif '?systemMessage' in no_scheme:
            no_scheme, junk = no_scheme.rsplit('?systemMessage', 1)

        # specific fixes
        if anyMembers(no_scheme,
                      'acs.org',
                      'ahajournals.org',
                      'biologicalpsychiatryjournal.com',
                      'ebiomedicine.com',
                      'fasebj.org',
                      'frontiersin.org',
                      'future-science.com',
                      'hindawi.com',
                      'ieee.org',
                      'jclinepi.com',
                      'jpeds.com',
                      'liebertpub.com',
                      'mitpressjournals.org',
                      'molbiolcell.org',
                      'molmetab.com',
                      'neurobiologyofaging.org',
                      'physiology.org',
                      'sagepub.com',
                      'sciencedirect.com',
                      'tandfonline.com',
                      'theriojournal.com',
                      'wiley.com',):
            # NOTE not all the above hit all of these
            # almost all still resolve
            normalized = (no_scheme
                          .replace('/abstract', '')
                          .replace('/abs', '')
                          .replace('/fulltext', '')
                          .replace('/full', '')
                          .replace('/pdf', ''))
        #elif ('sciencedirect.com' in no_scheme):
            #normalized = (no_scheme
                          #.replace('/abs', ''))
        elif ('cell.com' in no_scheme):
            normalized = (no_scheme  # FIXME looks like cell uses /abstract in og:url
                          .replace('/abstract', '/XXX')
                          .replace('/fulltext', '/XXX'))
        elif 'jneurosci.org' in no_scheme:
            # TODO content/early -> resolution_chain(doi)
            normalized = (no_scheme
                          .replace('.short', '')
                          .replace('.long', '')
                          .replace('.full', '')
                          .replace('.pdf', '')
                          # note .full.pdf is a thing
                          )
        elif 'pnas.org' in no_scheme:
            normalized = (no_scheme
                          .replace('.short', '')
                          .replace('.long', '')
                          .replace('.full', ''))
        elif 'mdpi.com' in no_scheme:
            normalized = (no_scheme
                          .replace('/htm', ''))
        elif 'f1000research.com' in no_scheme:
            # you should be ashamed of yourselves for being in here for this reason
            normalized, *maybe_version = no_scheme.rsplit('/v', 1)
        elif 'academic.oup.com' in no_scheme:
            normalized, *maybesr = no_scheme.rsplit('?searchresult=', 1)
            _normalized, maybe_junk = normalized.rsplit('/', 1)
            numbers = '0123456789'
            if (maybe_junk[0] not in numbers or  # various ways to detect the human readable junk after the id
                maybe_junk[-1] not in numbers or
                '-' in maybe_junk or
                len(maybe_junk) > 20):
                normalized = _normalized
        elif anyMembers(no_scheme,
                        'jci.org',
                        'nature.com'):
            # cases where safe to remove query fragment
            normalized, *_query = no_scheme.rsplit('?', 1)
            normalized, *table_number = normalized.rsplit('/tables/', 1)
        elif 'pubmed/?term=' in no_scheme and noneMembers(no_scheme, ' ', '+'):
            normalized = no_scheme.replace('?term=', '')
        elif 'nih.gov/pubmed/?' in no_scheme:
            # FIXME scibot vs client norm?
            normalized = no_scheme.replace(' ', '+')
        elif 'govhttp' in no_scheme:
            # lol oh dear
            hrm, oops = no_scheme.split('govhttp')
            ded, wat = oops.split('//', 1)
            blargh, suffix = wat.split('/', 1)
            normalized = hrm + 'gov/pmc/' + suffix
        elif 'table/undtbl' in no_scheme:
            normalized, table_number = no_scheme.rsplit('table/undtbl')
        elif anyMembers(no_scheme,
                        'index.php?',
                       ):
            # cases where we just use hypothes.is normalization
            _scheme, normalized = uri_normalize(uri).split('://')  # FIXME h dependency
        else:
            normalized = no_scheme

        'onlinelibrary.wiley.com/doi/10.1002/cne.23727?wol1URL=/doi/10.1002/cne.23727&regionCode=US-CA&identityKey=e2523300-b934-48c9-b08e-940de05d7335'
        'www.jove.com/video/55441/?language=Japanese'
        'www.nature.com/neuro/journal/v19/n5/full/nn.4282.html'
        'www.nature.com/cr/journal/vaop/ncurrent/full/cr201669a.html'
        'https://www.nature.com/articles/cr201669'

        #{'www.ingentaconnect.com/content/umrsmas/bullmar/2017/00000093/00000002/art00006':
         #[OntId('DOI:10.5343/bms.2016.1044'), OntId('DOI:info:doi/10.5343/bms.2016.1044')]}

        # pmid extract from pmc
        #<meta name="citation_pmid" content="28955177">
        return normalized


    except ValueError as e:  # split fail
        pdf_prefix = 'urn:x-pdf:'
        if uri.startswith(pdf_prefix):
            return uri
        elif uri in bad_uris:
            print('AAAAAAAAAAAAAAAAAAAAAAAAAAA', uri)
            return 'THIS URI IS GARBAGE AND THIS IS ITS NORMALIZED FORM'
        else:
            raise TypeError(uri) from e
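A hand-traced usage sketch for uri_normalization, assuming the function and its helpers (anyMembers, noneMembers, uri_normalize, bad_uris) are in scope. The first input comes from the notes above, the second is hypothetical; the expected outputs follow the branches in the code, not observed behavior:

expected = {
    # nature.com: scheme stripped, no query string or /tables/ suffix to remove
    'https://www.nature.com/articles/cr201669':
        'www.nature.com/articles/cr201669',
    # frontiersin.org: og:url 'article/' corrected to 'articles/' and '/full' dropped
    'https://www.frontiersin.org/article/10.3389/fnins.2016.00001/full':
        'www.frontiersin.org/articles/10.3389/fnins.2016.00001',
}
for uri, norm in expected.items():
    assert uri_normalization(uri) == norm, (uri, norm)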
Example #5
def main():
    branch = auth.get('neurons-branch')
    remote = OntId('NIFTTL:') if branch == 'master' else OntId(f'NIFRAW:{branch}/')

    ont_config = ontneurons(remote)
    ont_neurons = ont_config.neurons()

    bn_config = Config('basic-neurons',
                       # FIXME this should probably be pulled in automatically
                       # from the import statements, and it doesn't work even as is
                       # also a chicken and an egg problem here
                       imports=[remote.iri + 'ttl/generated/swanson.ttl'])

    #RDFL = oq.plugin.get('rdflib')  # FIXME ick
    #rdfl = RDFL(bn_config.core_graph, OntId)
    #OntTerm.query.ladd(rdfl)  # FIXME ick
    bn_config.load_existing()
    bn_neurons = bn_config.neurons()
    #OntTerm.query._services = OntTerm.query._services[:-1]  # FIXME ick

    ndl_config = Config('neuron_data_lifted')
    ndl_config.load_existing()  # FIXME this is extremely slow
    ndl_neurons = sorted(ndl_config.neurons())

    resources = auth.get_path('resources')
    cutcsv = resources / 'cut-development.csv'
    with open(cutcsv.as_posix(), 'rt') as f:
        rows = [l for l in csv.reader(f)]

    bc = byCol(rows)

    (_, *labels), *_ = zip(*bc)
    labels_set0 = set(labels)
    ns = []
    skipped = []
    bamscok = (NIFSTD.BAMSC1125,)
    for n in (ont_neurons + ndl_neurons):
        if n.id_ and 'BAMSC' in n.id_:
            if n.id_ not in bamscok:
                skipped.append(n)
                continue

        l = str(n.origLabel)
        if l is not None:
            for replace, match in rename_rules.items():  # HEH
                l = l.replace(match, replace)

        if l in labels:
            n._origLabel = l
            ns.append(n)

    ns = sorted(ns)
    sns = set(n.origLabel for n in ns)

    labels_set1 = labels_set0 - sns

    agen = [c.label for c in bc if c.autogenerated]
    sagen = set(agen)
    added = [c.label for c in bc if c.added]
    sadded = set(added)
    ans = []
    sans = set()
    missed = set()
    _bl = []  # XXX NOTE THE CONTINUE BELOW
    for n in bn_neurons:
        continue  # we actually get all of these with uberon, will map between them later
        # can't use capitalize here because there are proper names that stay uppercase
        l = n.label.replace('(swannt) ',
                            '').replace('Intrinsic',
                                        'intrinsic').replace('Projection',
                                                             'projection')

        for replace, match in rename_rules.items():  # HEH
            l = l.replace(match, replace)

        if l in agen:
            n._origLabel = l
            ans.append(n)
            sans.add(l)

        else:
            missed.add(l)

        _bl.append(l)

    agen_missing = sagen - sans
    labels_set2 = labels_set1 - sans

    nlx_labels = [c.label for c in bc if c.neurolex]
    snlx_labels = set(nlx_labels)

    class SourceCUT(resSource):
        sourceFile = 'nifstd/resources/cut-development.csv'  # FIXME relative to git workingdir...
        source_original = True

    sources = SourceCUT(),
    swanr = rdflib.Namespace(interlex_namespace('swanson/uris/readable/'))
    SWAN = interlex_namespace('swanson/uris/neuroanatomical-terminology/terms/')
    SWAA = interlex_namespace('swanson/uris/neuroanatomical-terminology/appendix/')
    config = Config('cut-development-raw', sources=sources, source_file=relative_path(__file__),
                    prefixes={'swanr': swanr,
                              'SWAN': SWAN,
                              'SWAA': SWAA,})
    ins = [None if OntId(n.id_).prefix == 'TEMP' else n.id_ for n in ns]
    ians = [None] * len(ans)

    with NeuronCUT(CUT.Mammalia):
        mamns = [NeuronCUT(*zap(n.pes), id_=i, label=n._origLabel, override=bool(i)).adopt_meta(n)
                 for i, n in zip(ins + ians, ns + ans)]

    smatch, rem = get_smatch(labels_set2)

    labels_set3 = labels_set2 - smatch
    added_unmapped = sadded & labels_set3

    # TODO preserve the names from neuronlex on import ...
    Neuron.write()
    Neuron.write_python()
    raw_neurons = config.neurons()
    # do this before creating the new config
    # even though we are in theory tripling number of neurons in the current config graph
    # it won't show up in the next config (and this is why we need to reengineer)
    raw_neurons_ind_undep = [n.asUndeprecated().asIndicator() for n in raw_neurons]
    config = Config('cut-development', sources=sources, source_file=relative_path(__file__),
                    prefixes={'swanr': swanr,
                              'SWAN': SWAN,
                              'SWAA': SWAA,})
    # FIXME the call to asUndeprecated currently triggers addition
    # to the current config and output graph as a side effect (ick!)
    ids_updated_neurons = [n.asUndeprecated() for n in raw_neurons]
    assert len(ids_updated_neurons) == len(raw_neurons)
    Neuron.write()
    Neuron.write_python()
    progress = (len(labels_set0), len(sns), len(sans), len(smatch),
                len(labels_set1), len(labels_set2), len(labels_set3))
    prog_report = ('\nProgress:\n'
                   f'total:            {progress[0]}\n'
                   f'from nlx:         {progress[1]}\n'
                   f'from basic:       {progress[2]}\n'
                   f'from match:       {progress[3]}\n'
                   f'TODO after nlx:   {progress[4]}\n'
                   f'TODO after basic: {progress[5]}\n'
                   f'TODO after match: {progress[6]}\n')
    print(prog_report)
    assert progress[0] == progress[1] + progress[4], 'neurolex does not add up'
    assert progress[4] == progress[2] + progress[5], 'basic does not add up'

    lnlx = set(n.lower() for n in snlx_labels)
    sos = set(n.origLabel.lower() if n.origLabel else None for n in ndl_neurons)  # FIXME load origLabel
    nlx_review = lnlx - sos
    nlx_missing = sorted(nlx_review)
    print(f'\nNeuroLex listed as source but no mapping (n = {len(nlx_review)}):')
    _ = [print(l) for l in nlx_missing]

    partial = {k:v for k, v in rem.items() if v and v not in terminals}
    print(f'\nPartially mapped (n = {len(partial)}):')
    if partial:
        mk = max((len(k) for k in partial.keys())) + 2
        for k, v in sorted(partial.items()):
            print(f'{k:<{mk}} {v!r}')
            #print(f'{k!r:<{mk}}{v!r}')
        #pprint(partial, width=200)
    unmapped = sorted(labels_set3)
    print(f'\nUnmapped (n = {len(labels_set3)}):')
    _ = [print(l) for l in unmapped]

    no_location = [n for n in Neuron.neurons()
                   if noneMembers((ilxtr.hasSomaLocatedIn, ilxtr.hasSomaLocatedInLayer), *n.unique_predicates)]
    if __name__ == '__main__':
        review_rows = export_for_review(config, unmapped, partial, nlx_missing)
        breakpoint()

    return config, unmapped, partial, nlx_missing
Example #6
def get_smatch(labels_set2):
    contains_rules = make_contains_rules()
    skip = set()
    smatch = set()
    rem = {}
    for l in labels_set2:
        pes = tuple()
        l_rem = l
        for match, pheno in sorted(contains_rules.items(), key=lambda ab:-len(ab[0])):
            if not l_rem:
                break

            if len(match) > len(l_rem):
                continue

            t = None
            if match not in skip and pheno == OntTerm:
                try:
                    t = OntTerm(term=match)
                    log.debug(f'WTF {match} {t}')
                    if t.validated:
                        pheno = Phenotype(t.u, ilxtr.hasSomaLocatedIn)
                    else:
                        pheno = None
                except oq.exceptions.NotFoundError:
                    skip.add(match)
                    pheno = None

            if match in skip and pheno == OntTerm:
                pheno = None

            if match in l_rem and pheno:
                l_rem = l_rem.replace(match, '').strip()
                pes += (pheno if isinstance(pheno, tuple) else (pheno,))

        if l_rem in exact_rules:
            pes += (exact_rules[l_rem],)
            l_rem = ''

        if l_rem == '  neuron':
            l_rem = ''
        elif l_rem.endswith('  cell'):
            l_rem = l_rem[:-len('  cell')]
            #print('l_rem no cell:', l_rem)
        elif l_rem.endswith('  neuron'):
            l_rem = l_rem[:-len('  neuron')]
            #print('l_rem no neuron:', l_rem)

        hrm = [pe for pe in pes if pe.e == ilxtr.hasSomaLocatedIn]
        if '  ' in l_rem:
            #print('l_rem:', l_rem)
            #breakpoint()
            maybe_region, rest = l_rem.split('  ', 1)
        elif noneMembers(l_rem, *terminals) and not hrm:
            maybe_region, rest = l_rem, ''
            #print('MR:', maybe_region)
        else:
            #print(hrm)
            maybe_region = None

        if maybe_region:
            prefix_rank = ('UBERON', 'SWAN', 'BIRNLEX', 'SAO', 'NLXANAT', 'NLX')
            def key(ot):
                ranked = ot.prefix in prefix_rank
                qargs = ot._query_result._QueryResult__query_args
                if 'term' in qargs and qargs['term'] is not None:
                    arg = qargs['term'].lower()
                else:
                    arg = None
                return (not ranked,
                        prefix_rank.index(ot.prefix) if ranked else 0,
                        not (arg == ot.label.lower()))

            #ots = sorted((term for term in OntTerm.query(term=maybe_region,
                                                         #exclude_prefix=('FMA', 'NLX'))), key=key)

            #if not ots:
            ots = sorted((term for term in OntTerm.query(term=maybe_region,
                                                         exclude_prefix=('FMA',))), key=key)
            if not ots:
                log.error(f'No match for {maybe_region!r}')
            else:
                t = ots[0]
                if 'oboInOwl:id' in t.predicates:  # uberon replacement
                    t = OntTerm(t.predicates['oboInOwl:id'])

                t.set_next_repr('curie', 'label')
                log.info(f'Match for {maybe_region!r} was {t!r}')
                if t.validated:
                    l_rem = rest
                    pheno = Phenotype(t.u, ilxtr.hasSomaLocatedIn)  # FIXME
                    pes += (pheno,)

        if pes:
            smatch.add(l)

            if not l_rem or l_rem in ('neuron', 'neurons', 'cell', 'Cell', 'positive cell'):
                with NeuronCUT(CUT.Mammalia):
                    NeuronCUT(*zap(pes), id_=make_cut_id(l), label=l, override=True)
            else:
                rem[l] = l_rem

    return smatch, rem
Example #7
    def default(self):
        out_path = self.options.out_path
        BUILD = self.options.BUILD

        glb = Path(auth.get_path('git-local-base'))
        theme_repo = glb / 'org-html-themes'
        theme = theme_repo / 'setup/theme-readtheorg-local.setup'
        prepare_paths(BUILD, out_path, theme_repo, theme)

        doc_config = self._doc_config
        names = tuple(doc_config['repos']) + tuple(
            self.options.repo)  # TODO fetch if missing ?
        repo_paths = [(glb / name).resolve() for name in names]
        repos = [p.repo for p in repo_paths]
        skip_folders = doc_config.get('skip-folders', tuple())
        rskip = doc_config.get('skip', {})

        # TODO move this into run_all
        docstring_kwargs = makeDocstrings(BUILD, repo_paths, skip_folders,
                                          rskip)
        wd_docs_kwargs = [docstring_kwargs]
        if self.options.docstring_only:
            [
                kwargs.update({'theme': theme})
                for _, _, kwargs in wd_docs_kwargs
            ]
            outname, rendered = render_docs(wd_docs_kwargs,
                                            out_path,
                                            titles=None,
                                            n_jobs=1,
                                            debug=self.options.debug)[0]
            if not outname.parent.exists():
                outname.parent.mkdir(parents=True)
            with open(outname.as_posix(), 'wt') as f:
                f.write(rendered)
            return

        et = tuple()
        wd_docs_kwargs += [
            (rp, rp / f, makeKwargs(rp, f)) for rp in repo_paths
            for f in rp.repo.git.ls_files().split('\n')
            if Path(f).suffix in suffixFuncs and only(rp, f) and noneMembers(
                f, *skip_folders) and f not in rskip.get(rp.name, et)
        ]

        [kwargs.update({'theme': theme}) for _, _, kwargs in wd_docs_kwargs]

        if self.options.spell:
            spell((f.as_posix() for _, f, _ in wd_docs_kwargs))
            return

        titles = doc_config['titles']

        outname_rendered = render_docs(wd_docs_kwargs,
                                       out_path,
                                       titles,
                                       self.options.jobs,
                                       debug=self.options.debug)

        index = [
            f'<b class="{heading}">{heading}</b>'
            for heading in doc_config['index']
        ]

        _NOTITLE = object()
        for outname, rendered in outname_rendered:
            apath = outname.relative_to(self.options.out_path)
            title = titles.get(apath.as_posix(), _NOTITLE)
            # TODO parse out/add titles
            if title is not None:
                value = (hfn.atag(apath) if title is _NOTITLE else hfn.atag(
                    apath, title))
                index.append(value)

            if not outname.parent.exists():
                outname.parent.mkdir(parents=True)

            with open(outname.as_posix(), 'wt') as f:
                f.write(rendered)

        lt = list(titles)

        def title_key(a):
            title = a.split('"')[1]
            if title not in lt:
                msg = (f'{title} missing from {self.options.config}')
                raise ValueError(msg)
            return lt.index(title)

        index_body = '<br>\n'.join(['<h1>Documentation Index</h1>'] +
                                   sorted(index, key=title_key))
        with open((out_path / 'index.html').as_posix(), 'wt') as f:
            f.write(hfn.htmldoc(index_body, title=doc_config['title']))
Example #8
    def inner(local_filepath, remote=False):
        if noneMembers(local_filepath, *bigleaves) or dobig:
            ext = os.path.splitext(local_filepath)[-1]
            if ext == '.ttl':
                infmt = 'turtle'
            else:
                log.info((ext, local_filepath))
                infmt = None
            if remote:
                resp = requests.get(
                    local_filepath
                )  # TODO nonblocking pull these out, fetch, run inner again until done
                raw = resp.text.encode()
            else:
                try:
                    with open(local_filepath, 'rb') as f:
                        raw = f.read()
                except FileNotFoundError as e:
                    if local_filepath.startswith('file://'):
                        log.info(
                            f'local_imports has already been run, skipping {local_filepath}'
                        )
                        return
                        #raise ValueError('local_imports has already been run') from e
                    else:
                        log.exception(
                            e
                        )  # TODO raise a warning if the file cannot be matched
                        # seems like good practice to have any imported ontology under
                        # version control so all imports are guaranteed to have good
                        # provenance and not split the prior information between the
                        # scigraph config and the repository, the repository remains
                        # the source of truth, load.yaml files can then pick a subset
                        # of the properly tracked files to load as they see fit, but
                        # not add to them (at least in pyontutils land)
                        raw = b''

            if oo in raw:  # we only care if there are imports or an ontology iri
                scratch = OntGraph()
                if infmt == 'turtle':
                    data, rest = raw.split(b'###', 1)
                elif infmt == None:  # assume xml
                    xml_tree = etree.parse(BytesIO(raw))
                    xml_root = xml_tree.getroot()
                    xml_ontology = xml_tree.xpath(
                        "/*[local-name()='RDF']/*[local-name()='Ontology']")
                    xml_root.clear()
                    xml_root.append(xml_ontology[0])
                    data = etree.tostring(xml_root)
                scratch.parse(data=data, format=infmt)
                for s in scratch.subjects(rdf.type, owl.Ontology):
                    triples.add((s, owl.sameAs, rdflib.URIRef(local_filepath)))
                    # somehow this breaks computing the chain
                    #for p in (rdfs.comment, skos.definition, definition, dc.title, rdfs.label):
                    #for o in scratch[s:p]:
                    #triples.add((s, p, o))
                for s, o in sorted(scratch.subject_objects(p)):
                    if revert:
                        raise NotImplementedError('TODO')
                    nlfp = o.replace(remote_base, local_base)
                    triples.add((s, p, o))
                    if 'http://' in local_filepath or 'external' in local_filepath:  # FIXME what to do about https used inconsistently :/
                        if 'external' in local_filepath:
                            imported_iri = rdflib.URIRef(
                                local_filepath.replace(
                                    local_base, remote_base))  # inefficient
                        else:
                            imported_iri = rdflib.URIRef(local_filepath)
                        if s != imported_iri:
                            imported_iri_vs_ontology_iri[
                                imported_iri] = s  # kept for the record
                            triples.add((imported_iri, p,
                                         s))  # bridge imported != ontology iri
                    if local_base in nlfp and 'file://' not in o:  # FIXME file:// should not be slipping through here...
                        scratch.add((s, p, rdflib.URIRef('file://' + nlfp)))
                        scratch.remove((s, p, o))
                    if nlfp not in done:
                        done.append(nlfp)
                        if local_base in nlfp and 'external' not in nlfp:  # skip externals TODO
                            inner(nlfp)
                        elif readonly:  # read external imports
                            if 'external' in nlfp:
                                inner(nlfp)
                            else:
                                inner(nlfp, remote=True)
                if not readonly:
                    _orp = CustomTurtleSerializer.roundtrip_prefixes  # FIXME awful hack :/
                    CustomTurtleSerializer.roundtrip_prefixes = True
                    ttl = scratch.serialize(format='nifttl', encoding='utf-8')
                    CustomTurtleSerializer.roundtrip_prefixes = _orp
                    ndata, comment = ttl.split(b'###', 1)
                    out = ndata + b'###' + rest
                    with open(local_filepath, 'wb') as f:
                        f.write(out)
Example #9
def main():
    from docopt import docopt
    args = docopt(__doc__)

    patch_theme_setup(theme)

    BUILD = working_dir / 'doc_build'
    if not BUILD.exists():
        BUILD.mkdir()

    docs_dir = BUILD / 'docs'
    if not docs_dir.exists():
        docs_dir.mkdir()

    theme_styles_dir = theme_repo / 'styles'
    doc_styles_dir = docs_dir / 'styles'
    if doc_styles_dir.exists():
        shutil.rmtree(doc_styles_dir)

    shutil.copytree(theme_styles_dir, doc_styles_dir)

    docstring_kwargs = docstrings()
    wd_docs_kwargs = [docstring_kwargs]
    if args['--docstring-only']:
        outname, rendered = render_docs(wd_docs_kwargs, BUILD, 1)[0]
        if not outname.parent.exists():
            outname.parent.mkdir(parents=True)
        with open(outname.as_posix(), 'wt') as f:
            f.write(rendered)
        return

    repos = (Repo(Path(devconfig.ontology_local_repo).resolve().as_posix()),
             Repo(working_dir.as_posix()),
             *(Repo(Path(devconfig.git_local_base, repo_name).as_posix())
               for repo_name in ('ontquery', 'sparc-curation')))

    skip_folders = 'notebook-testing', 'complete', 'ilxutils', 'librdflib'
    rskip = {
        'pyontutils': (
            'docs/NeuronLangExample.ipynb',  # exact skip due to moving file
            'ilxutils/ilx-playground.ipynb'),
        'sparc-curation': ('README.md', ),
    }

    et = tuple()
    # TODO move this into run_all
    #wd_docs_kwargs = [(Path(repo.working_dir).resolve(),
    wd_docs_kwargs += [
        (Path(repo.working_dir).resolve(), Path(repo.working_dir, f).resolve(),
         makeKwargs(repo, f)) for repo in repos
        for f in repo.git.ls_files().split('\n')
        if Path(f).suffix in suffixFuncs
        #and Path(repo.working_dir).name == 'NIF-Ontology' and f == 'README.md'  # DEBUG
        #and Path(repo.working_dir).name == 'pyontutils' and f == 'README.md'  # DEBUG
        #and Path(repo.working_dir).name == 'sparc-curation' and f == 'docs/setup.org'  # DEBUG
        and noneMembers(f, *skip_folders) and f not in rskip.get(
            Path(repo.working_dir).name, et)
    ]

    # doesn't work because read-from-minibuffer cannot block
    #compile_org_forever = ['emacs', '-q', '-l',
    #Path(devconfig.git_local_base,
    #'orgstrap/init.el').resolve().as_posix(),
    #'--batch', '-f', 'compile-org-forever']
    #org_compile_process = subprocess.Popen(compile_org_forever,
    #stdin=subprocess.PIPE,
    #stdout=subprocess.PIPE,
    #stderr=subprocess.PIPE)

    if args['--spell']:
        spell((f.as_posix() for _, f, _ in wd_docs_kwargs))
        return

    outname_rendered = render_docs(wd_docs_kwargs, BUILD, int(args['--jobs']))

    titles = {
        'Components': 'Components',
        'NIF-Ontology/README.html': 'Introduction to the NIF Ontology',
        'ontquery/README.html': 'Introduction to ontquery',
        'pyontutils/README.html': 'Introduction to pyontutils',
        'pyontutils/nifstd/README.html': 'Introduction to nifstd-tools',
        'pyontutils/neurondm/README.html': 'Introduction to neurondm',
        'pyontutils/ilxutils/README.html': 'Introduction to ilxutils',
        'Developer docs': 'Developer docs',
        'NIF-Ontology/docs/processes.html':
        'Ontology development processes (START HERE!)',  # HOWTO
        'NIF-Ontology/docs/development-setup.html':
        'Ontology development setup',  # HOWTO
        'sparc-curation/docs/setup.html':
        'Developer and curator setup (broader scope but extremely detailed)',
        'NIF-Ontology/docs/import-chain.html':
        'Ontology import chain',  # Documentation
        'pyontutils/nifstd/resolver/README.html': 'Ontology resolver setup',
        'pyontutils/nifstd/scigraph/README.html': 'Ontology SciGraph setup',
        'sparc-curation/resources/scigraph/README.html':
        'SPARC SciGraph setup',
        'pyontutils/docstrings.html': 'Command line programs',
        'NIF-Ontology/docs/external-sources.html':
        'External sources for the ontology',  # Other
        'ontquery/docs/interlex-client.html':
        'InterLex client library documentation',
        'Contributing': 'Contributing',
        'pyontutils/nifstd/development/README.html':
        'Contributing to the ontology',
        'pyontutils/nifstd/development/community/README.html':
        'Contributing term lists to the ontology',
        'pyontutils/neurondm/neurondm/models/README.html':
        'Contributing neuron terminology to the ontology',
        'Ontology content': 'Ontology content',
        'NIF-Ontology/docs/brain-regions.html':
        'Parcellation schemes',  # Ontology Content
        'pyontutils/nifstd/development/methods/README.html':
        'Methods and techniques',  # Ontology content
        'NIF-Ontology/docs/Neurons.html': 'Neuron Lang overview',
        'pyontutils/neurondm/docs/NeuronLangExample.html':
        'Neuron Lang examples',
        'pyontutils/neurondm/docs/neurons_notebook.html': 'Neuron Lang setup',
        'Specifications': 'Specifications',
        'NIF-Ontology/docs/interlex-spec.html':
        'InterLex specification',  # Documentation
        'pyontutils/ttlser/docs/ttlser.html':
        'Deterministic turtle specification',
        'Other': 'Other',
        'pyontutils/htmlfn/README.html': 'htmlfn readme',
        'pyontutils/ttlser/README.html': 'ttlser readme',
        'sparc-curation/docs/background.html':
        '',  # present but not visibly listed
    }

    titles_sparc = {  # TODO abstract this out ...
        'Background': 'Background',
        'sparc-curation/docs/background.html': 'SPARC curation background',
        'Other': 'Other',
        'sparc-curation/README.html': 'sparc-curation readme',
    }

    index = [
        '<b class="Components">Components</b>',
        '<b class="Developer docs">Developer docs</b>',
        '<b class="Contributing">Contributing</b>',
        '<b class="Ontology content">Ontology content</b>',
        '<b class="Specifications">Specifications</b>',
        '<b class="Other">Other</b>',
    ]
    for outname, rendered in outname_rendered:
        apath = outname.relative_to(BUILD / 'docs')
        title = titles.get(apath.as_posix(), None)
        # TODO parse out/add titles
        value = atag(apath) if title is None else atag(apath, title)
        index.append(value)
        if not outname.parent.exists():
            outname.parent.mkdir(parents=True)
        with open(outname.as_posix(), 'wt') as f:
            f.write(rendered)

    lt = list(titles)

    def title_key(a):
        return lt.index(a.split('"')[1])

    index_body = '<br>\n'.join(['<h1>Documentation Index</h1>'] +
                               sorted(index, key=title_key))
    with open((BUILD / 'docs/index.html').as_posix(), 'wt') as f:
        f.write(htmldoc(index_body, title='NIF Ontology documentation index'))
Example #10
def main():
    DB_URI = 'mysql+mysqlconnector://{user}:{password}@{host}:{port}/{db}'
    if socket.gethostname() != 'orpheus':
        config = mysql_conn_helper('localhost', 'nif_eelg', 'nif_eelg_secure', 33060)  # see .ssh/config
    else:
        config = mysql_conn_helper('nif-mysql.crbs.ucsd.edu', 'nif_eelg', 'nif_eelg_secure')
    engine = create_engine(DB_URI.format(**config), echo=True)
    config = None
    del(config)

    insp = inspect(engine)
    terms = [c['name'] for c in insp.get_columns('terms')]
    term_existing_ids = [c['name'] for c in insp.get_columns('term_existing_ids')]
    #breakpoint()
    #sys.exit()

    query = engine.execute('SELECT * FROM term_existing_ids as teid JOIN terms as t ON t.id = teid.tid WHERE t.type != "cde"')
    header = term_existing_ids + terms

    data = query.fetchall()
    cdata = list(zip(*data))

    def datal(head):
        return cdata[header.index(head)]

    ilx_labels = {ilxb[ilx_fragment]:label for ilx_fragment, label in zip(datal('ilx'), datal('label'))}

    mapping_no_sao = [p for p in zip(datal('iri'), datal('ilx')) if 'neuinfo' in p[0]]  # 9446
    mapping = [p for p in zip(datal('iri'), datal('ilx')) if 'neuinfo' in p[0] or '/sao' in p[0]]  # 9883
    done = [ilx for iri, ilx in mapping]
    obo_mapping = [p for p in zip(datal('iri'), datal('ilx')) if 'obolibrary' in p[0] and p[1] not in done]
    done = done + [ilx for iri, ilx in obo_mapping]
    db_mapping = [p for p in zip(datal('iri'), datal('ilx')) if 'drugbank' in p[0] and p[1] not in done]
    done = done + [ilx for iri, ilx in db_mapping]
    t3db_mapping = [p for p in zip(datal('iri'), datal('ilx')) if 't3db' in p[0] and p[1] not in done]
    done = done + [ilx for iri, ilx in t3db_mapping]

    wiki_mapping = [p for p in zip(datal('iri'), datal('ilx')) if 'neurolex' in p[0] and p[1] not in done]

    sao_mapping = {o.toPython():s for s, o in Graph().parse((gitf / 'nlxeol/sao-nlxwiki-fixes.ttl').as_posix(), format='ttl').subject_objects(oboInOwl.hasAlternativeId)}

    scr = Graph().parse((gitf / 'NIF-Ontology/scicrunch-registry.ttl').as_posix(), format='turtle')
    moved_to_scr = {}
    #PROBLEM = set()
    for s, o in scr.subject_objects(oboInOwl.hasDbXref):
        if 'SCR_' in o:
            print(f'WARNING Registry identifier listed as alt id! {s} hasDbXref {o}')
            continue
        uri = NIFSTD[o]
        #try:
        assert uri not in moved_to_scr, f'utoh {uri} was mapped to more than one registry entry! {s} {moved_to_scr[uri]}'
        #except AssertionError:
            #PROBLEM.add(uri)

        moved_to_scr[uri] = s

    to_scr = [(k, v) for k, v in moved_to_scr.items()
           if noneMembers(k, 'SciEx_', 'OMICS_', 'rid_', 'SciRes_',
                          'biodbcore-', 'C0085410', 'doi.org', 'C43960',
                          'doi:10.', 'GAZ:',
                          # 'birnlex_', 'nlx_', 'nif-'
                         )]

    replacement_graph = createOntology(filename='NIFSTD-ILX-mapping',
                        name='NLX* to ILX equivalents',
                        prefixes=makePrefixes('ILX'),)

    scr_rep_graph = createOntology(filename='NIFSTD-SCR-mapping',
                                   name='NLX* to SCR equivalents',
                                   prefixes=makePrefixes('SCR'),)

    _existing = {}
    def dupes(this, other, set_, dupes_):
        if this not in set_:
            set_.add(this)
            _existing[this] = other
        elif _existing[this] != other:
            dupes_[this].add(_existing[this])
            dupes_[this].add(other)

    iri_done = set()
    ilx_done = set()
    iri_dupes = defaultdict(set)
    ilx_dupes = defaultdict(set)
    def check_dupes(iri, ilx):
        dupes(iri, ilx, iri_done, iri_dupes)
        dupes(ilx, iri, ilx_done, ilx_dupes)

    BIRNLEX = Namespace(uPREFIXES['BIRNLEX'])
    trouble = [  # some are _2 issues :/
               # in interlex -- YES WE KNOW THEY DONT MATCH SOME IDIOT DID THIS IN THE PAST
               BIRNLEX['1006'],  # this one appears to be entirely novel despite a note that it was created in 2006...
               BIRNLEX['1152'],  # this was used in uberon ;_;
               BIRNLEX['2476'],  # can be owl:sameAs ed -> _2 version
               BIRNLEX['2477'],  # can be owl:sameAs ed -> _2 version
               BIRNLEX['2478'],  # can be owl:sameAs ed -> _2 version
               BIRNLEX['2479'],  # can be owl:sameAs ed -> _2 version
               BIRNLEX['2480'],  # can be owl:sameAs ed -> _2 version
               BIRNLEX['2533'],  # This is in interlex as a wiki id http://uri.interlex.org/base/ilx_0109349 since never used in the ontology, we could add it to the list of 'same as' for cosmetic purposes which will probably happen...
               BIRNLEX['3074'],  # -> CHEBI:26848  # add to slim and bridge...
               BIRNLEX['3076'],  # -> CHEBI:26195  # XXX when we go to load chebi make sure we don't dupe this...
    ]

    aaaaaaaaaaaaaaaaaaaaaaaaaaaaa = [t + '_2' for t in trouble]  # _never_ do this

    # TODO check for cases where there is an ilx and scr for the same id >_<

    sao_help = set()
    for iri, ilx_fragment in chain(mapping, to_scr):  # XXX core loop
        if iri in sao_mapping:
            uri = sao_mapping[iri]
            sao_help.add(uri)
        else:
            uri = URIRef(iri)

        if uri in trouble:
            #print('TROUBLE', iri, ilxb[ilx_fragment])
            print('TROUBLE', ilxb[ilx_fragment])

        if uri in moved_to_scr:  # TODO I think we need to have _all_ the SCR redirects here...
            s, p, o = uri, ilxtr.hasScrId, moved_to_scr[uri]
            scr_rep_graph.g.add((s, p, o))
        else:
            s, p, o = uri, ilxtr.hasIlxId, ilxb[ilx_fragment]
            #s, p, o = o, ilxtr.ilxIdFor, s
            replacement_graph.g.add((s, p, o))

        check_dupes(s, o)

    dupes = {k:v for k, v in iri_dupes.items()}
    idupes = {k:v for k, v in ilx_dupes.items()}
    assert not dupes, f'there are duplicate mappings for an external id {dupes}'
    #print(ilx_dupes)  # there are none yet

    ng = cull_prefixes(replacement_graph.g, prefixes=uPREFIXES)
    ng.filename = replacement_graph.filename

    sng = cull_prefixes(scr_rep_graph.g, prefixes=uPREFIXES)
    sng.filename = scr_rep_graph.filename


    _ = [print(k.toPython(), ' '.join(sorted(ng.qname(_.toPython()) for _ in v))) for k, v in idupes.items()]

    # run `resolver_uris = sorted(set(e for t in graph for e in t if 'uri.neuinfo.org' in e))` on a graph with everything loaded to get this file...
    resources = Path(__file__).resolve().absolute().parent / 'resources'
    with open((resources / 'all-uri.neuinfo.org-uris.pickle').as_posix(), 'rb') as f:
        all_uris = pickle.load(f)  # come in as URIRefs...
    with open((resources / 'all-uri.neuinfo.org-uris-old.pickle').as_posix(), 'rb') as f:
        all_uris_old = pickle.load(f)  # come in as URIRefs...
    with open((resources / 'all-uri.neuinfo.org-uris-old2.pickle').as_posix(), 'rb') as f:
        all_uris_old2 = pickle.load(f)  # come in as URIRefs...

    resolver_uris = set(e for t in chain(ng.g, sng.g) for e in t if 'uri.neuinfo.org' in e)
    ilx_only = resolver_uris - all_uris  # aka nlxonly
    resolver_not_ilx_only = resolver_uris - ilx_only
    problem_uris = all_uris - resolver_uris
    old_uris = all_uris_old - all_uris
    old_uris2 = all_uris_old2 - all_uris
    dold_uris = all_uris_old - all_uris_old2

    #idold_uris = all_uris_old2 - all_uris_old  # empty as expected
    #nxrefs = Graph().parse((gitf / 'NIF-Ontology/ttl/generated/nlx-xrefs.ttl').as_posix(), format='turtle')
    nxrefs = Graph().parse((gitf / 'nlxeol/nlx-xrefs.ttl').as_posix(), format='turtle')
    xrefs_uris = set(e for t in nxrefs for e in t if 'uri.neuinfo.org' in e)
    test_old_uris = old_uris2 - xrefs_uris

    diff_uris = test_old_uris - ilx_only
    #diff_uris.remove(URIRef('http://uri.neuinfo.org/nif/nifstd/nlx_149160'))  # ORNL was included in an old bad version of the xrefs file and was pulled in in the old all-uris  # now dealt with by the scr mapping
    diff_uris.remove(URIRef('http://uri.neuinfo.org/nif/nifstd/nlx_40280,birnlex_1731'))  # one of the doubled neurolex ids
    diff_uris.remove(URIRef('http://uri.neuinfo.org/nif/nifstd'))  # i have zero idea how this snuck in
    assert not diff_uris, 'old uris and problem uris should be identical'

    _ilx = set(e for t in ng.g for e in t)
    _scr = set(e for t in sng.g for e in t)
    for uri in ilx_only:
        if uri in _ilx and uri in _scr:
            raise BaseException('AAAAAAAAAAAAAAAAAAAAAAAAAAAAA')
        elif uri in _ilx:
            g = ng.g
        elif uri in _scr:
            g = sng.g
        else:
            raise BaseException('????????????')
        g.add((uri, ilxtr.isDefinedBy, URIRef('http://neurolex.org')))

    # XXX write the graphs
    ng.write()
    sng.write()

    nsuris = set(uri for uri, ilx in mapping_no_sao)
    auris = set(_.toPython() for _ in all_uris)
    iuris = set(_.toPython() for _ in resolver_uris)
    #sao_missing = iuris - nsuris  # now fixed and cannot run due to addition of scr ids to resolver_uris
    #assert not sao_missing, f'whoops {sao_missing}'
    ilx_missing = auris - iuris
    all_missing = iuris - auris
    #assert not all_missing, f'all is not all! {all_missing}'  # XXX have to deal with ilx_only separately as NLX-ILX or something

    # fixed
    #sao_add = {o.toPython():s.toPython() for s, p, o in ng.g if s.toPython() in sao_missing}
    #assert len(sao_add) == len(sao_missing), 'EEEEEEEEEEEEEEE'
    #with open('/tmp/please-add-these-sao-ids-as-existing-ids-to-the-listed-interlex-record.json', 'wt') as f:
        #json.dump(sao_add, f, indent=2)

    to_review = sorted(ilx_missing)

    # not relevant anymore
    #with open('thought-to-be-missing.json', 'rt') as f:
        #thought_to_be_missing = json.load(f)

    # from troy has issues
    #with open('nifext-duplicates-and-new.json', 'rt') as f:
        #nifext_data = json.load(f)

    #nifext_dupes = {v['current_nifext_id']:v['dropped_nifext_ids'][-1] if v['dropped_nifext_ids'] else None for v in nifext_data.values()}

    sgv = Vocabulary(cache=True)
    trts = [(v, (sgv.findById(v)['labels'][0]
                 if sgv.findById(v)['labels']
                 else '<--NO-LABEL-->')
             if sgv.findById(v)
             else '<------>')
            for v in to_review]

    sgg = sGraph(cache=True)
    SGG = Namespace(sgg._basePath.rstrip('/') + '/graph/')
    rg = Graph().parse((gitf / 'NIF-Ontology/ttl/unused/NIF-Retired.ttl').as_posix(), format='turtle')
    retired = set(e.toPython() for t in rg for e in t if 'uri.neuinfo.org' in e)
    retfile = '<ttl/unused/NIF-Retired.ttl>'
    help_graph = createOntology(filename='NIFSTD-BLACKHOLE-mapping',
                        name='HELPPPPPPPP!!!!',
                        prefixes=uPREFIXES,)
    def make_rt(to_review_tuples, retired=retired):
        def inner(u, l, retired=retired):
            ne = sgg.getNeighbors(u, relationshipType="isDefinedBy", depth=1)
            if ne:
                curie = help_graph.qname(u)
                help_graph.g.add((URIRef(u), ilxtr.SciGraphLookup, URIRef(f'http://scigraph.olympiangods.org/scigraph/graph/{curie}')))
            if ne and ne['edges']:
                src = ' '.join([f'<{e["obj"]}>' for e in ne["edges"]])
            elif u in retired:
                src = retfile
            else:
                src = '<>'
            return f'{u:<70} {l:<50} {src}'
        out = Async(rate=3000)(deferred(inner)(u, l) for u, l in sorted(to_review_tuples, key=lambda a:a[-1]))
        return '\n'.join(out)

    review_text = make_rt(trts)
    trts2 = [(u, l) for u, l in trts if 'nifext' not in u]
    not_nifext = make_rt(trts2)

    hng = cull_prefixes(help_graph.g, prefixes=uPREFIXES)
    hng.filename = help_graph.filename
    hng.write()

    ###
    #   Accounting of uri.neuinfo.org ids that do not resolve
    ###

    not_in_interlex = set(s for s, o in hng.g.subject_objects(ilxtr.SciGraphLookup))
    bh_deprecated = set(s for s in hng.g.subjects() if sgv.findById(s) and sgv.findById(s)['deprecated'])
    bh_not_deprecated = set(s for s in hng.g.subjects() if sgv.findById(s) and not sgv.findById(s)['deprecated'])
    bh_nifexts = set(s for s in bh_not_deprecated if 'nifext' in s)
    bh_readable = set(s for s in bh_not_deprecated if 'readable' in s)
    unaccounted = not_in_interlex - bh_readable - bh_nifexts - bh_deprecated
    namedinds = set(s for s in unaccounted
                    if sgv.findById(s) and
                    sgg.getNode(s)['nodes'][0]['meta']['types'] and
                    sgg.getNode(s)['nodes'][0]['meta']['types'][0] == 'NamedIndividual')
    unaccounted = unaccounted - namedinds
    ual = sorted(o for s in unaccounted for o in hng.g.objects(s, ilxtr.SciGraphLookup))
    report = (
        f'Total       {len(not_in_interlex)}\n'
        f'deprecated  {len(bh_deprecated)}\n'
        f'nd nifext   {len(bh_nifexts)}\n'
        f'nd readable {len(bh_readable)}\n'
        f'nd namedind {len(namedinds)}\n'
        f'unaccounted {len(unaccounted)}\n'
             )
    print(report)

    def reverse_report():
        ilx = Graph()
        ilx.parse('/tmp/interlex.ttl', format='turtle')
        not_in_ontology = set()
        annotations = set()
        relations = set()
        drugbank = set()
        t3db = set()
        for subject in ilx.subjects(rdf.type, owl.Class):
            ok = False
            for object in ilx.objects(subject, oboInOwl.hasDbXref):
                if anyMembers(object, 'uri.neuinfo.org', 'GO_', 'CHEBI_', 'PR_',
                              'PATO_', 'HP_', 'OBI_', 'DOID_', 'COGPO_', 'CAO_',
                              'UBERON_', 'NCBITaxon_', 'SO_', 'IAO_'):
                    # FIXME do we really import HP?
                    ok = True

                if (subject, rdf.type, owl.AnnotationProperty) in ilx:  # FIXME for troy these need to be cleared up
                    annotations.add(subject)
                elif (subject, rdf.type, owl.ObjectProperty) in ilx:
                    relations.add(subject)
                elif 'drugbank' in object:
                    drugbank.add(subject)
                elif 't3db.org' in object:
                    t3db.add(subject)

            if not ok:
                not_in_ontology.add(subject)


        drugbank = drugbank & not_in_ontology
        t3db = t3db & not_in_ontology
        annotations = annotations & not_in_ontology
        relations = relations & not_in_ontology
        unaccounted = not_in_ontology - drugbank - t3db - annotations - relations
        report = (
            f'Total       {len(not_in_ontology)}\n'
            f'annotations {len(annotations)}\n'
            f'relations   {len(relations)}\n'
            f'drugbank    {len(drugbank)}\n'
            f't3db        {len(t3db)}\n'
            f'unaccounted {len(unaccounted)}\n'
        )
        print(report)
        return (not_in_ontology, drugbank, unaccounted)

    _, _, un = reverse_report()

    h_uris = set(e for t in hng.g for e in t if 'uri.neuinfo.org' in e)
    real_problems = problem_uris - h_uris

    ###
    #   Missing neurons
    ###

    with open((gitf / 'nlxeol/neuron_data_curated.csv').as_posix()) as f:
        r = csv.reader(f)
        nheader = next(r)
        rows = list(r)

    ndata = list(zip(*rows))

    def datan(head):
        return ndata[nheader.index(head)]

    if __name__ == '__main__':
        breakpoint()
Example #11
def main():
    ndl_config = Config('neuron_data_lifted')
    ndl_config.load_existing()
    ndl_neurons = ndl_config.neurons()
    bn_config = Config('basic-neurons')
    bn_config.load_existing()
    bn_neurons = bn_config.neurons()

    resources = Path(devconfig.resources)
    cutcsv = resources / 'common-usage-types.csv'
    with open(cutcsv.as_posix(), 'rt') as f:
        rows = [l for l in csv.reader(f)]

    bc = byCol(rows)

    (_, *labels), *_ = zip(*bc)
    labels_set0 = set(labels)
    ns = []
    for n in ndl_neurons:
        l = str(n.origLabel)
        if l is not None:
            for replace, match in rename_rules.items():  # HEH
                l = l.replace(match, replace)

        if l in labels:
            n._origLabel = l
            ns.append(n)

    sns = set(n.origLabel for n in ns)

    labels_set1 = labels_set0 - sns

    agen = [c.label for c in bc if c.autogenerated]
    sagen = set(agen)
    added = [c.label for c in bc if c.added]
    sadded = set(added)
    ans = []
    sans = set()
    missed = set()
    for n in bn_neurons:
        continue  # we actually get all of these with uberon, will map between them later
        # can't use capitalize here because there are proper names that stay uppercase
        l = n.label.replace('(swannt) ',
                            '').replace('Intrinsic',
                                        'intrinsic').replace('Projection',
                                                             'projection')

        for replace, match in rename_rules.items():  # HEH
            l = l.replace(match, replace)

        if l in agen:
            n._origLabel = l
            ans.append(n)
            sans.add(l)

        else:
            missed.add(l)

    agen_missing = sagen - sans
    labels_set2 = labels_set1 - sans

    nlx_labels = [c.label for c in bc if c.neurolex]
    snlx_labels = set(nlx_labels)

    class SourceCUT(resSource):
        sourceFile = 'nifstd/resources/common-usage-types.csv'  # FIXME relative to git workingdir...
        source_original = True

    sources = SourceCUT(),
    swanr = rdflib.Namespace(interlex_namespace('swanson/uris/readable/'))
    config = Config('common-usage-types-raw', sources=sources, source_file=relative_path(__file__),
                    prefixes={'swanr':swanr,
                              'SWAN':interlex_namespace('swanson/uris/neuroanatomical-terminology/terms/'),
                              'SWAA':interlex_namespace('swanson/uris/neuroanatomical-terminology/appendix/'),})
    ins = [None if OntId(n.id_).prefix == 'TEMP' else n.id_ for n in ns]
    ians = [None] * len(ans)
    def zap(pes):
        for pe in pes:
            if pe not in (Phenotype('BIRNLEX:212', ilxtr.hasTaxonRank),
                          Phenotype('NCBITaxon:7742', ilxtr.hasTaxonRank),
                          Phenotype('BIRNLEX:252', ilxtr.hasTaxonRank),
                          Phenotype('BIRNLEX:516', ilxtr.hasTaxonRank),):
                yield pe

    with Neuron(CUT.Mammalia):
        mamns = [NeuronCUT(*zap(n.pes), id_=i, label=n._origLabel, override=bool(i)).adopt_meta(n)
                 for i, n in zip(ins + ians, ns + ans)]

    contains_rules = make_contains_rules()

    skip = set()
    smatch = set()
    rem = {}
    for l in labels_set2:
        pes = tuple()
        l_rem = l
        for match, pheno in contains_rules.items():
            t = None
            if match not in skip and pheno == OntTerm:
                try:
                    t = OntTerm(term=match)
                    print('WTF', match, t)
                    if t.validated:
                        pheno = Phenotype(t.u, ilxtr.hasSomaLocatedIn)
                    else:
                        pheno = None
                except oq.exceptions.NotFoundError:
                    skip.add(match)
                    pheno = None
            if match in skip and pheno == OntTerm:
                pheno = None

            if match in l_rem and pheno:
                l_rem = l_rem.replace(match, '').strip()
                pes += (pheno,)
            
        if l_rem in exact_rules:
            pes += (exact_rules[l_rem],)
            l_rem = ''

        if l_rem == '  neuron':
            l_rem = ''
        elif l_rem.endswith('  cell'):
            l_rem = l_rem[:-len('  cell')]
            #print('l_rem no cell:', l_rem)
        elif l_rem.endswith('  neuron'):
            l_rem = l_rem[:-len('  neuron')]
            #print('l_rem no neuron:', l_rem)

        hrm = [pe for pe in pes if pe.e == ilxtr.hasSomaLocatedIn]
        if '  ' in l_rem:
            #print('l_rem:', l_rem)
            #embed()
            maybe_region, rest = l_rem.split('  ', 1)
        elif noneMembers(l_rem, *terminals) and not hrm:
            maybe_region, rest = l_rem, ''
            #print('MR:', maybe_region)
        else:
            #print(hrm)
            maybe_region = None

        if maybe_region:
            prefix_rank = ('UBERON', 'SWAN', 'BIRNLEX', 'SAO', 'NLXANAT')
            def key(ot):
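                # sort candidates so that known prefixes come first (in prefix_rank
                # order) and, within those, terms whose label exactly matches the
                # queried string are preferred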
                ranked = ot.prefix in prefix_rank
                arg = ot._query_result._QueryResult__query_args['term'].lower()
                return (not ranked,
                        prefix_rank.index(ot.prefix) if ranked else 0,
                        not (arg == ot.label.lower()))

            #t = OntTerm(term=maybe_region)
            # using query avoids the NoExplicitIdError
            ots = sorted((qr.OntTerm for qr in OntTerm.query(term=maybe_region,
                                                             exclude_prefix=('FMA',))), key=key)
            if not ots:
                log.error(f'No match for {maybe_region!r}')
            else:
                t = ots[0]
                if 'oboInOwl:id' in t.predicates:  # uberon replacement
                    t = OntTerm(t.predicates['oboInOwl:id'])

                t.set_next_repr('curie', 'label')
                log.info(f'Match for {maybe_region!r} was {t!r}')
                if t.validated:
                    l_rem = rest
                    pheno = Phenotype(t.u, ilxtr.hasSomaLocatedIn)  # FIXME
                    pes += (pheno,)

        if pes:
            smatch.add(l)
            rem[l] = l_rem

            with Neuron(CUT.Mammalia):
                NeuronCUT(*zap(pes), id_=make_cut_id(l), label=l, override=True)

    labels_set3 = labels_set2 - smatch
    added_unmapped = sadded & labels_set3

    # TODO preserve the names from neuronlex on import ...
    Neuron.write()
    Neuron.write_python()
    raw_neurons = config.neurons()
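    # re-serialize under the final 'common-usage-types' config; asUndeprecated
    # presumably swaps deprecated identifiers for their replacements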
    config = Config('common-usage-types', sources=sources, source_file=relative_path(__file__),
                    prefixes={'swanr':swanr,
                              'SWAN':interlex_namespace('swanson/uris/neuroanatomical-terminology/terms/'),
                              'SWAA':interlex_namespace('swanson/uris/neuroanatomical-terminology/appendix/'),})
    ids_updated_neurons = [n.asUndeprecated() for n in raw_neurons]
    assert len(ids_updated_neurons) == len(raw_neurons)
    Neuron.write()
    Neuron.write_python()
    progress = len(labels_set0), len(sns), len(sans), len(smatch), len(labels_set1), len(labels_set2), len(labels_set3)
    print('\nProgress:\n'
          f'total:            {progress[0]}\n'
          f'from nlx:         {progress[1]}\n'
          f'from basic:       {progress[2]}\n'
          f'from match:       {progress[3]}\n'
          f'TODO after nlx:   {progress[4]}\n'
          f'TODO after basic: {progress[5]}\n'
          f'TODO after match: {progress[6]}\n')
    assert progress[0] == progress[1] + progress[4], 'neurolex does not add up'
    assert progress[4] == progress[2] + progress[5], 'basic does not add up'

    lnlx = set(n.lower() for n in snlx_labels)
    sos = set(n.origLabel.lower() if n.origLabel else None for n in ndl_neurons)  # FIXME load origLabel
    nlx_review = lnlx - sos
    nlx_missing = sorted(nlx_review)
    print(f'\nNeuroLex listed as source but no mapping (n = {len(nlx_review)}):')
    _ = [print(l) for l in nlx_missing]

    partial = {k:v for k, v in rem.items() if v and v not in terminals}
    print(f'\nPartially mapped (n = {len(partial)}):')
    if partial:
        mk = max((len(k) for k in partial.keys())) + 2
        for k, v in sorted(partial.items()):
            print(f'{k:<{mk}} {v!r}')
            #print(f'{k!r:<{mk}}{v!r}')
        #pprint(partial, width=200)
    unmapped = sorted(labels_set3)
    print(f'\nUnmapped (n = {len(labels_set3)}):')
    _ = [print(l) for l in unmapped]

    if __name__ == '__main__':
        rows = export_for_review(config, unmapped, partial, nlx_missing)
        embed()

    return config, unmapped, partial, nlx_missing
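
noneMembers itself is never defined in these snippets; judging from calls such as noneMembers(local_filepath, *bigleaves), it appears to return True only when none of the given members occur in the first argument. A minimal sketch consistent with that usage (not necessarily the original implementation):

def noneMembers(container, *members):
    # True when no member appears in container, e.g. no big leaf ontology name in the path
    return not any(m in container for m in members)
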
Example #12
0
 def inner(local_filepath, remote=False):
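     # Apparent intent: rewrite owl:imports targets under remote_base to local
     # file:// paths and recurse into each imported file; names such as bigleaves,
     # oo, p, triples, done, local_base and remote_base come from the enclosing scope.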
     if noneMembers(local_filepath, *bigleaves) or dobig:
         ext = os.path.splitext(local_filepath)[-1]
         if ext == '.ttl':
             infmt = 'turtle'
         else:
             print(ext, local_filepath)
             infmt = None
         if remote:
             # TODO nonblocking pull these out, fetch, run inner again until done
             resp = requests.get(local_filepath)
             raw = resp.text.encode()
         else:
             try:
                 with open(local_filepath, 'rb') as f:
                     raw = f.read()
             except FileNotFoundError as e:
                 if local_filepath.startswith('file://'):
                     print('local_imports has already been run, skipping',
                           local_filepath)
                     return
                     #raise ValueError('local_imports has already been run') from e
                 else:
                     print(e)
                     raw = b''
         if oo in raw:  # we only care if there are imports or an ontology iri
             scratch = rdflib.Graph()
             if infmt == 'turtle':
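                 # keep only the header before the first '###' marker; 'rest' is
                 # reattached on write-back below, so the write path assumes turtle input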
                 data, rest = raw.split(b'###', 1)
             elif infmt is None:  # assume xml
                 xml_tree = etree.parse(BytesIO(raw))
                 xml_root = xml_tree.getroot()
                 xml_ontology = xml_tree.xpath(
                     "/*[local-name()='RDF']/*[local-name()='Ontology']")
                 xml_root.clear()
                 xml_root.append(xml_ontology[0])
                 data = etree.tostring(xml_root)
             scratch.parse(data=data, format=infmt)
             for s in scratch.subjects(rdf.type, owl.Ontology):
                 triples.add((s, owl.sameAs, rdflib.URIRef(local_filepath)))
             for s, o in sorted(scratch.subject_objects(p)):
                 nlfp = o.replace(remote_base, local_base)
                 triples.add((s, p, o))
                 if 'http://' in local_filepath or 'external' in local_filepath:  # FIXME what to do about https used inconsistently :/
                     if 'external' in local_filepath:
                         imported_iri = rdflib.URIRef(
                             local_filepath.replace(local_base, remote_base))  # inefficient
                     else:
                         imported_iri = rdflib.URIRef(local_filepath)
                     if s != imported_iri:
                         imported_iri_vs_ontology_iri[imported_iri] = s  # kept for the record
                         triples.add((imported_iri, p, s))  # bridge imported != ontology iri
                 if local_base in nlfp and 'file://' not in o:  # FIXME file:// should not be slipping through here...
                     scratch.add((s, p, rdflib.URIRef('file://' + nlfp)))
                     scratch.remove((s, p, o))
                 if nlfp not in done:
                     done.append(nlfp)
                     if local_base in nlfp and 'external' not in nlfp:  # skip externals TODO
                         inner(nlfp)
                     elif readonly:  # read external imports
                         if 'external' in nlfp:
                             inner(nlfp)
                         else:
                             inner(nlfp, remote=True)
             if not readonly:
                 ttl = scratch.serialize(format='nifttl')
                 ndata, comment = ttl.split(b'###', 1)
                 out = ndata + b'###' + rest
                 with open(local_filepath, 'wb') as f:
                     f.write(out)