def flag_dep(json_):
    for node in json_['nodes']:
        if DEP in node['meta']:
            curie = node['id']
            label = node['lbl']
            node['id'] = tc.red(curie)
            node['lbl'] = tc.red(label)
            for edge in json_['edges']:
                if edge['sub'] == curie:
                    edge['sub'] = tc.red(curie)
                elif edge['obj'] == curie:
                    edge['obj'] = tc.red(curie)

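# Hedged usage sketch (not from the original source): flag_dep expects an
# OBO-graph style dict with 'nodes' and 'edges' and mutates it in place,
# wrapping deprecated ids/labels in terminal red via tc.red. The record below
# is hypothetical; DEP and tc are assumed to be the module-level names used by
# flag_dep above.
def _example_flag_dep():
    blob = {'nodes': [{'id': 'NIFGA:birnlex_796',
                       'lbl': 'Brain',
                       'meta': {DEP: ['true']}}],
            'edges': [{'sub': 'NIFGA:birnlex_796',
                       'pred': 'rdfs:subClassOf',
                       'obj': 'UBERON:0000955'}]}
    flag_dep(blob)  # the id, label, and matching edge endpoints now render in red
    return blob
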
def spell(filenames, debug=False):
    if hunspell is None:
        raise ImportError('hunspell is not installed on your system. If you want '
                          'to run `ontutils spell` please run pipenv install --dev --skip-lock. '
                          'You will need the development libs for hunspell on your system.')

    spell_objects = (u for r in Parallel(n_jobs=9)(delayed(get_spells)(f) for f in filenames) for u in r)
    hobj = hunspell.HunSpell('/usr/share/hunspell/en_US.dic', '/usr/share/hunspell/en_US.aff')
    #nobj = hunspell.HunSpell(os.path.expanduser('~/git/domain_wordlists/neuroscience-en.dic'), '/usr/share/hunspell/en_US.aff')  # segfaults without aff :x
    collect = set()
    for filename, s, p, o in spell_objects:
        missed = False
        no = []
        for line in o.split('\n'):
            nline = []
            for tok in line.split(' '):
                prefix, tok, suffix = tokstrip(tok)
                #print((prefix, tok, suffix))
                if not hobj.spell(tok):  # and not nobj.spell(tok):
                    missed = True
                    collect.add(tok)
                    nline.append(prefix + tc.red(tok) + suffix)
                else:
                    nline.append(prefix + tok + suffix)

            line = ' '.join(nline)
            no.append(line)

        o = '\n'.join(no)
        if missed:
            #print(filename, s, o)
            print('>>>', o)

    if debug:
        [print(_) for _ in sorted(collect)]
        breakpoint()

def _ontology_local_repo(self):
    try:
        stated_repo = Path(self.config['ontology_local_repo'])
    except (KeyError, TypeError, FileNotFoundError) as e:
        stated_repo = Path('/dev/null/does-not-exist')

    maybe_repo = self._maybe_repo

    if stated_repo.exists():
        return stated_repo
    elif maybe_repo.exists():
        return maybe_repo
    else:
        maybe_start = Path(__file__).parent.parent.parent.absolute()
        maybe_base = maybe_start
        fsroot = Path('/')
        while maybe_base != fsroot:
            maybe_repo = maybe_base / self.ontology_repo
            if maybe_repo.exists():
                log.info(tc.blue('INFO:') +
                         f' Ontology repository found at {maybe_repo}')
                return maybe_repo
            else:
                maybe_base = maybe_base.parent
        else:
            log.warning(tc.red('WARNING:') +
                        f' No repository found in any parent directory of {maybe_start}')

    return Path('/dev/null/does-not-exist')  # seems reasonable ...

def parents(self):
    if not self.orphaned:
        for aid in self.anno.references:
            try:
                yield self._getAATById(aid)
            except KeyError:
                self.orphaned = True
                print(tc.red('WARNING:'), f'parent annotation was deleted from {self.id}')

def fetch_and_save(url, loc):
    resp = requests.get(url)
    saveloc = (path / loc).as_posix()
    if resp.ok:
        with open(saveloc, 'wb') as f:
            f.write(resp.content)

        print(tc.blue('INFO:'), f'{url:<60} written to {loc}')
    else:
        print(tc.red('WARNING:'), f'failed to fetch {url}')

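# Hedged usage sketch (not from the original source): fetch_and_save writes the
# body of a successful GET to `path / loc`. The url and filename below are
# hypothetical; `path` and `requests` are assumed to be the module-level names
# used by fetch_and_save above.
def _example_fetch_and_save():
    fetch_and_save('https://example.org/ontology.ttl', 'ontology.ttl')
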
def sv(asdf, start, ind, warn=False):
    if type(asdf) not in (bool, int) and asdf.startswith('http'):
        for iri, short in scigPrint.shorten.items():
            if iri in asdf:
                return scigPrint.wrap(asdf.replace(iri, short + ':'), start, ind)

        print(tc.red('WARNING:'), 'Shorten failed for', tc.ltyellow(asdf))
        return scigPrint.wrap(repr(asdf), start, ind)
    else:
        return scigPrint.wrap(repr(asdf), start, ind)

def import_tree(graph, ontologies, **kwargs):
    for ontology in ontologies:
        thisfile = Path(ontology).name
        print(thisfile)
        OntCuries.populate(graph)
        j = graph.asOboGraph('owl:imports', restriction=False)
        try:
            t, te = creatTree(*Query(f'NIFTTL:{thisfile}', 'owl:imports', 'OUTGOING', 30),
                              json=j,
                              prefixes=dict(graph.namespace_manager),
                              **kwargs)
            #print(t)
            yield t, te
        except KeyError:
            print(tc.red('WARNING:'), 'could not find', ontology, 'in import chain')  # TODO zap onts w/o imports

def build_transgenic_lines(self):
    """
    init class     |  "transgenic_line_source_name":"stock_number" a Class
    add superClass |  rdfs:subClassOf ilxtr:transgenicLine
    add *order*    |  ilxtr:useObjectProperty ilxtr:<order>
    add name       |  rdfs:label "name"
    add def        |  definition: "description"
    add transtype  |  rdfs:hasTransgenicType "transgenic_line_type_name"
    """
    triples = []
    for cell_specimen in self.neuron_data:
        for tl in cell_specimen['donor']['transgenic_lines']:
            _id = tl['stock_number'] if tl['stock_number'] else tl['id']
            prefix = tl['transgenic_line_source_name']
            line_type = tl['transgenic_line_type_name']
            if line_type == 'driver' and 'CreERT2' in tl['name']:
                line_type = 'inducibleDriver'

            if prefix not in ['JAX', 'MMRRC', 'AIBS']:
                print(tc.red('WARNING:'), 'unknown prefix', prefix, json.dumps(tl, indent=4))
                continue
            elif prefix == 'AIBS':
                prefix = 'AllenTL'

            _class = self.ns[prefix][str(_id)]
            triples.append((_class, rdf.type, owl.Class))
            triples.append((_class, rdfs.label, rdflib.Literal(tl['name'])))
            triples.append((_class, definition, rdflib.Literal(tl['description'])))
            triples.append((_class, rdfs.subClassOf, ilxtr.transgenicLine))
            triples.append((_class, ilxtr.hasTransgenicType, ilxtr[line_type + 'Line']))

    # TODO aspects.ttl?
    transgenic_lines = simpleOnt(filename='allen-transgenic-lines',
                                 local_base=graphBase.local_base,
                                 path='ttl/generated/',
                                 prefixes=self.prefixes,
                                 triples=triples,
                                 comment='Allen transgenic lines for cell types',
                                 branch=self.branch,
                                 calling__file__=__file__,)
    transgenic_lines._graph.write()

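# Hedged, standalone sketch (not the original implementation): the triple
# pattern documented in the docstring of build_transgenic_lines, built with
# plain rdflib for a single hypothetical JAX record. The JAX and ilxtr
# namespace IRIs and the use of IAO_0000115 for `definition` are assumptions,
# not taken from this module.
def _example_transgenic_line_triples():
    import rdflib
    from rdflib.namespace import OWL, RDF, RDFS
    JAX = rdflib.Namespace('http://jaxmice.jax.org/strain/')                   # assumed
    ilxtr = rdflib.Namespace('http://uri.interlex.org/tgbugs/uris/readable/')  # assumed
    definition = rdflib.URIRef('http://purl.obolibrary.org/obo/IAO_0000115')   # assumed
    tl = {'stock_number': '012345',  # hypothetical record
          'name': 'Example-Cre',
          'description': 'hypothetical driver line',
          'transgenic_line_type_name': 'driver'}
    _class = JAX[tl['stock_number']]
    return [(_class, RDF.type, OWL.Class),
            (_class, RDFS.label, rdflib.Literal(tl['name'])),
            (_class, definition, rdflib.Literal(tl['description'])),
            (_class, RDFS.subClassOf, ilxtr.transgenicLine),
            (_class, ilxtr.hasTransgenicType,
             ilxtr[tl['transgenic_line_type_name'] + 'Line'])]
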
def spell(filenames, debug=False):
    if hunspell is None:
        raise ImportError('hunspell is not installed on your system. If you want '
                          'to run `ontutils spell` please run pipenv install --dev --skip-lock. '
                          'You will need the development libs for hunspell on your system.')

    hobj = hunspell.HunSpell('/usr/share/hunspell/en_US.dic', '/usr/share/hunspell/en_US.aff')
    #nobj = hunspell.HunSpell(os.path.expanduser('~/git/domain_wordlists/neuroscience-en.dic'), '/usr/share/hunspell/en_US.aff')  # segfaults without aff :x
    collect = set()
    for filename in filenames:
        missed = False
        no = []
        with open(filename, 'rt') as f:
            for line_ in f.readlines():
                line = line_.rstrip()
                nline = []
                #print(tc.blue(line))
                for pattern in _bads:
                    line = line.replace(pattern, ' ' * len(pattern))

                #print(line)
                for tok in line.split(' '):
                    prefix, tok, suffix = tokstrip(tok)
                    #print((prefix, tok, suffix))
                    if not hobj.spell(tok):  # and not nobj.spell(tok):
                        missed = True
                        collect.add(tok)
                        nline.append(prefix + tc.red(tok) + suffix)
                    else:
                        nline.append(prefix + tok + suffix)

                line = ' '.join(nline)
                no.append(line)

        o = '\n'.join(no)
        if missed:
            #print(filename, s, o)
            print('>>>', o)
            pass

    if debug:
        [print(_) for _ in sorted(collect)]
        breakpoint()

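# Hedged usage sketch (not from the original source): this variant of spell
# reads each file, blanks out the substrings listed in the module-level _bads,
# and checks the remaining tokens against the system en_US hunspell dictionary,
# echoing any line with misses and highlighting bad tokens via tc.red. The
# filenames below are hypothetical.
def _example_spell():
    spell(['ttl/nif.ttl', 'ttl/resources.ttl'], debug=False)
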
def import_tree(graph, ontologies, **kwargs):
    for ontology in ontologies:
        thisfile = Path(ontology).name
        print(thisfile)
        mg = makeGraph('', graph=graph)
        mg.add_known_namespaces('owl', 'obo', 'dc', 'dcterms', 'dctypes', 'skos', 'NIFTTL')
        j = mg.make_scigraph_json('owl:imports', direct=True)
        try:
            t, te = creatTree(*Query(f'NIFTTL:{thisfile}', 'owl:imports', 'OUTGOING', 30),
                              json=j,
                              prefixes=mg.namespaces,
                              **kwargs)
            #print(t)
            yield t, te
        except KeyError:
            print(tc.red('WARNING:'), 'could not find', ontology, 'in import chain')  # TODO zap onts w/o imports

def print_trees(graph, bridge):
    PPO = 'ro:proper_part_of'
    HPP = 'ro:has_proper_part'
    hpp = HPP.replace('ro:', graph.namespaces['ro'])
    ppo = PPO.replace('ro:', graph.namespaces['ro'])
    a, b = creatTree(*Query(tc.red('birnlex_796'), HPP, 'OUTGOING', 10),
                     # FIXME seems to be a last one wins bug here with birnlex_796
                     # vs NIFGA:birnlex_796 depending on the hash seed...
                     json=graph.make_scigraph_json(HPP))
    c, d = creatTree(*Query('NIFGA:birnlex_796', hpp, 'OUTGOING', 10), graph=sgg)

    j = bridge.make_scigraph_json(HPP)  # issue https://github.com/RDFLib/rdflib/pull/661
    e, f = creatTree(*Query('UBERON:0000955', HPP, 'OUTGOING', 10), json=j)
    k_, l_ = creatTree(*Query('NIFGA:nlx_anat_101177', ppo, 'INCOMING', 10), graph=sgg)

    merge = dict(d[-1])  # full tree with ppo converted to hpp
    merge['nodes'].extend(l_[-1]['nodes'])
    merge['edges'].extend([{'sub': e['obj'],
                            'pred': hpp,
                            'obj': e['sub']}
                           for e in l_[-1]['edges']])
    m_, n_ = creatTree(*Query('NIFGA:birnlex_796', hpp, 'OUTGOING', 10), json=merge)

    print('nifga dep')
    print(a)
    print('nifga live')
    print(c)
    print('new bridge')
    print(e)
    print('nifga total (both directions)')
    print(m_)
    print('nifga white matter')
    print(k_)

    return a, b, c, d, e, f, k_, l_, m_, n_

def main():
    from docopt import docopt, parse_defaults
    args = docopt(__doc__, version='ontutils 0.0.1')
    defaults = {o.name: o.value if o.argcount else None
                for o in parse_defaults(__doc__)}

    verbose = args['--verbose']
    debug = args['--debug']

    repo_name = args['<repo>']
    git_local = os.path.expanduser(args['--git-local'])
    epoch = args['--epoch']

    curies_location = args['--curies']
    curies = getCuries(curies_location)
    curie_prefixes = set(curies.values())

    filenames = args['<file>']
    filenames.sort(key=lambda f: os.path.getsize(f), reverse=True)  # make sure the big boys go first
    refactor_skip = ('nif.ttl',
                     'resources.ttl',
                     'generated/chebislim.ttl',
                     'unused/ro_bfo_bridge.ttl',
                     'generated/ncbigeneslim.ttl',
                     'generated/NIF-NIFSTD-mapping.ttl')
    rfilenames = [f for f in filenames if f not in refactor_skip]

    if args['set']:
        from pyontutils.config import auth
        uc = auth.user_config

        def set_uc(var, value):
            with open(uc._path, 'rt') as f:
                text = f.read()

            if '#' in text:
                msg = f'Comments detected! Not writing config! {uc._path}'
                raise ValueError(msg)

            blob = uc.load()
            # XXX NEVER DUMP A CONFIG THIS WAY, YOU _WILL_ KLOBBER IT
            # BY ACCIDENT AT SOME POINT AND WILL ERASE ANY/ALL COMMENTS
            # THERE IS NO SAFETY WITH THIS IMPLEMENTATION
            # USERS SHOULD EDIT THEIR CONFIGS DIRECTLY
            # except that it makes giving instructions for
            # setting values a bit more complicated
            blob['auth-variables'][var] = value
            uc.dump(blob)

        if args['ontology-local-repo']:
            var = 'ontology-local-repo'
            olr = Path(args['<path>']).expanduser().resolve()
            olr_string = olr.as_posix()
            set_uc(var, olr_string)
            value2 = auth.get_path(var)
            if not value2.exists():
                msg = f'{var} path does not exist! {value2}'
                print(tc.red('WARNING'), msg)

            msg = f'{var} path {value2} written to {uc._path}'
            print(msg)
            assert olr == value2

        elif args['scigraph-api-key']:
            # FIXME this is a hack on top of orthauth, which will not
            # be implementing programmatic modification of user config
            # files any time soon, though it might make sense to have a
            # "machine config path" in addition to auth and user config
            path = ['scigraph', 'api', 'key']
            # check the secrets path first to make sure it is ok
            spath = auth._pathit(uc.get_blob('auth-stores', 'secrets')['path'])
            if not spath.parent.exists():
                spath.parent.mkdir(parents=True)
                spath.parent.chmod(0o0700)

            if spath.suffix != '.yaml':
                msg = f"Can't write secrets file of type {spath.suffix}"
                args = None
                raise NotImplementedError(msg)

            v = None
            try:
                s = uc.secrets
                v = s(*path)
            except:
                pass

            if v is not None:
                v = None
                raise ValueError(f'Path already in secrets! {path} in {spath}')

            # safely append to the secrets file
            key = args['<key>']
            path_key = f'\nscigraph:\n  api:\n    key: {key}'
            if not spath.exists():
                spath.touch()
                spath.chmod(0o0600)

            with open(spath, 'a+') as f:
                f.write(path_key)

            # set the config var
            var = 'scigraph-api-key'
            value = {'path': ' '.join(path)}
            set_uc(var, value)  # set the path
            # XXX NOTE yes, it is correct to do this only after secrets succeeds
            # otherwise it is possible to get into a state where secrets does
            # not exist but there is a path pointing to it, so loading this
            # ontutils file will fail during import time

            # test that we got the value we expected
            value2 = auth.get(var)
            msg = (f'Key written to secrets. {spath} and path to '
                   f'key was written to config {uc._path}')
            print(msg)
            assert key == value2, 'Key retrieved does not match key set!'
    elif args['devconfig']:
        if args['--write']:
            file = devconfig.write(args['--output-file'])
            print(f'config written to {file}')
        elif args['<field>']:
            for f in args['<field>']:
                print(getattr(devconfig, f, ''))
        else:
            print(devconfig)
    elif args['catalog-extras']:
        catalog_extras(args['--fetch'])
    elif args['version-iri']:
        version_iris(*filenames, epoch=epoch)
    elif args['scigraph-stress']:
        scigraph_stress(int(args['--rate']), int(args['--timeout']), verbose, debug)
    elif args['deadlinks']:
        deadlinks(filenames, int(args['--rate']), int(args['--timeout']), verbose, debug)
    elif args['spell']:
        spell(filenames, debug)
    elif args['iri-commit']:
        make_git_commit_command(git_local, repo_name)
    elif args['uri-switch']:
        uri_switch(rfilenames, uri_switch_values)
    elif args['backend-refactor']:
        backend_refactor(rfilenames, backend_refactor_values)
    elif args['todo']:
        graph = loadall(git_local, repo_name, local=True)
        graph_todo(graph, curie_prefixes, uri_switch_values)
        breakpoint()
    elif args['expand']:
        curies['NLXWIKI'] = 'http://legacy.neurolex.org/wiki/'
        for curie in args['<curie>']:
            prefix, suffix = curie.split(':')
            print(curies[prefix] + suffix)

def equiv(curie, label):
    if curie in manual:
        replaced_by[curie] = manual[curie]
        return manual[curie]

    ec = sgg.getNeighbors(curie, relationshipType='equivalentClass')
    nodes = [n for n in ec['nodes'] if n['id'] != curie]
    if len(nodes) > 1:
        #print('wtf node', [n['id'] for n in nodes], curie)
        for node in nodes:
            id_ = node['id']
            label_ = node['lbl']
            if id_.startswith('UBERON'):
                if curie in replaced_by:
                    one = replaced_by[curie]
                    replaced_by[curie] = one, id_
                    print('WE GOT DUPES', curie, label, one, id_)  # TODO
                else:
                    replaced_by[curie] = id_
            else:
                internal_equivs[curie] = id_
    elif not nodes:
        node = sgg.getNode(curie)['nodes'][0]
        if OWL.deprecated.toPython() in node['meta']:
            print('THIS CLASS IS DEPRECATED', curie)
            lbl = node['lbl']
            if (lbl.startswith('Predominantly white regional') or
                lbl.startswith('Predominantly gray regional')):
                print('\tHE\'S DEAD JIM!', lbl, node['id'])
                replaced_by[curie] = 'NOREP'

            if IRBC in node['meta']:
                existing_replaced = node['meta'][IRBC][0]
                ec2 = sgg.getNeighbors(existing_replaced, relationshipType='equivalentClass')
                print('\tFOUND ONE', existing_replaced)
                #scigPrint.pprint_node(sgg.getNode(existing_replaced))
                if ec2['edges']:  # pass the buck if we can
                    print('\t', end='')
                    scigPrint.pprint_edge(ec2['edges'][0])
                    rb = ec2['edges'][0]['obj']
                    print('\tPASSING BUCK : (%s -> %s -> %s)' % (curie, existing_replaced, rb))
                    irbcs[curie] = (existing_replaced, rb)
                    replaced_by[curie] = rb
                    return nodes
                else:
                    er_node = sgv.findById(existing_replaced)
                    if not er_node['deprecated']:
                        if not er_node['curie'].startswith('NIFGA:'):
                            print('\tPASSING BUCK : (%s -> %s)' % (curie, er_node['curie']))
                            return nodes

                    print('\tERROR: could not pass buck, we are at a dead end at', er_node)  # TODO

                print()

        moar = [t for t in sgv.findByTerm(label) if t['curie'].startswith('UBERON')]
        if moar:
            #print(moar)
            #replaced_by[curie] = moar[0]['curie']
            if len(moar) > 1:
                print('WARNING', curie, label, [(m['curie'], m['labels'][0]) for m in moar])

            for node in moar:
                #if node['curie'] in uberon_obsolete:  # node['deprecated']?
                    #continue
                ns = sgg.getNode(node['curie'])
                assert len(ns['nodes']) == 1, "WTF IS GOING ON %s" % node['curie']
                ns = ns['nodes'][0]
                if _doprint:
                    print('Found putative replacement in moar: (%s -> %s)' % (curie, ns['id']))
                    if DBX in ns['meta']:
                        print(' ' * 8, node['curie'], ns['meta'][DBX],
                              node['labels'][0], node['synonyms'])
                    if AID in ns['meta']:
                        print(' ' * 8, node['curie'], ns['meta'][AID],
                              node['labels'][0], node['synonyms'])
                    if CON in ns['meta']:
                        print(' ' * 8, node['curie'], ns['meta'][CON],
                              node['labels'][0], node['synonyms'])

                replaced_by[curie] = ns['id']
        else:
            replaced_by[curie] = None
            if False:  # review
                print('NO FORWARD EQUIV', tc.red(curie), label)  # TODO
                for k, v in sorted(sgg.getNode(curie)['nodes'][0]['meta'].items()):
                    if type(v) == iter:
                        print(' ' * 4, k)
                        for _ in v:
                            print(' ' * 8, _)
                    else:
                        print(' ' * 4, k, v)
    else:
        node = nodes[0]
        replaced_by[curie] = node['id']
        exact[curie] = node['id']

    return nodes

def main():
    for filename in ('mbaslim', 'hbaslim', 'paxinos-rat-labels', 'waxholm-rat-labels'):
        filepath = gitf / 'NIF-Ontology/ttl/generated/parcellation' / (filename + '.ttl')
        dir_ = filepath.parent.as_posix()
        print(dir_)
        file_commit = subprocess.check_output(['git', 'log', '-n', '1',
                                               '--pretty=format:%H', '--',
                                               filepath.name],
                                              cwd=dir_,
                                              stderr=subprocess.DEVNULL).decode().rstrip()
        graph = rdflib.Graph().parse(filepath.as_posix(), format='ttl')
        g = makeGraph('', graph=graph)

        annos = defaultdict(set)
        anno_trips = defaultdict(set)
        for triple, predicate_objects in annotation.parse(graph=graph):
            for a_p, a_o in predicate_objects:
                annos[a_p, a_o].add(triple)
                anno_trips[triple].add((a_p, a_o))

        anno_trips = {k: v for k, v in anno_trips.items()}

        for lifted_triple in restriction.parse(graph=graph):
            graph.add(lifted_triple)

        out_header = 'label|abbrev|curie|superPart curie\n'
        out = []
        editions_header = 'edition|label|abbrev|curie\n'
        editions = []
        for s in graph.subjects(rdf.type, owl.Class):
            rdfsLabel = next(graph.objects(s, rdfs.label))
            try:
                prefLabel = next(graph.objects(s, skos.prefLabel))
            except StopIteration:
                print(tc.red('WARNING:'), f'skipping {s} {rdfsLabel} since it has no prefLabel')
                continue

            syns = sorted(graph.objects(s, NIFRID.synonym))  # TODO are there cases where we need to recapitulate what we are doing for abbrevs?
            abbrevs = sorted(graph.objects(s, NIFRID.abbrev))  # FIXME paxinos has more than one
            try:
                if annos:
                    if len(abbrevs) > 1:
                        print(tc.blue('INFO:'), g.qname(s), repr(prefLabel.value),
                              'has multiple abbrevs', [a.value for a in abbrevs])

                    # prefer latest
                    current_edition = ''
                    for a in abbrevs:
                        for a_p, edition in anno_trips[s, NIFRID.abbrev, a]:
                            if a_p == ilxtr.literalUsedBy:
                                if current_edition < edition:
                                    current_edition = edition
                                    abbrev = a
                else:
                    abbrev = abbrevs[0]
            except IndexError:
                abbrev = ''

            try:
                superPart = next(graph.objects(s, ilxtr.labelPartOf))
            except StopIteration:
                superPart = ''

            out.append(f'{prefLabel}|{abbrev}|{g.qname(s)}|{g.qname(superPart)}')

            if annos:
                #asdf = {'ed':{'label':, 'abbrev':, 'curie':}}
                asdf = defaultdict(dict)
                triple = s, skos.prefLabel, prefLabel
                eds = anno_trips[triple]
                for a_p, a_o in eds:
                    asdf[a_o]['curie'] = g.qname(s)
                    asdf[a_o]['label'] = prefLabel

                for syn in graph.objects(s, NIFRID.synonym):
                    triple = s, NIFRID.synonym, syn
                    eds = anno_trips[triple]
                    for a_p, a_o in eds:
                        asdf[a_o]['curie'] = g.qname(s)
                        if 'label' in asdf[a_o]:
                            print(tc.red('WARNING:'),
                                  f'{a_o} already has a label "{asdf[a_o]["label"]}" for "{syn}"')

                        asdf[a_o]['label'] = syn

                for abbrev in graph.objects(s, NIFRID.abbrev):
                    triple = s, NIFRID.abbrev, abbrev
                    eds = anno_trips[triple]
                    #print('aaaaaaaaaaa', g.qname(s), )
                    for a_p, a_o in eds:
                        asdf[a_o]['curie'] = g.qname(s)
                        if 'abbrev' in asdf[a_o]:
                            print(tc.red('WARNING:'),
                                  f'{a_o} already has an abbrev "{asdf[a_o]["abbrev"]}" for "{abbrev}"')

                        asdf[a_o]['abbrev'] = abbrev

                #print(asdf)
                for ed, kwargs in sorted(asdf.items()):
                    if 'abbrev' not in kwargs:
                        print('Skipping', ed, 'for\n', kwargs)
                        continue

                    editions.append('{ed}|{label}|{abbrev}|{curie}'.format(ed=g.qname(ed), **kwargs))

        with open('/tmp/' + filename + f'-{file_commit[:8]}.psv', 'wt') as f:
            f.write(out_header + '\n'.join(sorted(out, key=labelkey)))

        if editions:
            with open('/tmp/' + filename + f'-editions-{file_commit[:8]}.psv', 'wt') as f:
                f.write(editions_header + '\n'.join(sorted(editions, key=edkey)))

def printer(*args, **kwargs):
    printe(*(tc.red(repr(a)) for a in args), **kwargs)

def would_you_like_to_know_more_question_mark():

    # resolving differences between classes
    more_ids = set((
        'http://uri.neuinfo.org/nif/nifstd/readable/ChEBIid',
        'http://uri.neuinfo.org/nif/nifstd/readable/GOid',
        'http://uri.neuinfo.org/nif/nifstd/readable/MeshUid',
        'http://uri.neuinfo.org/nif/nifstd/readable/PMID',
        'http://uri.neuinfo.org/nif/nifstd/readable/UmlsCui',
        'http://uri.neuinfo.org/nif/nifstd/readable/bamsID',
        'http://uri.neuinfo.org/nif/nifstd/readable/bonfireID',
        'http://uri.neuinfo.org/nif/nifstd/readable/cell_ontology_ID',
        'http://uri.neuinfo.org/nif/nifstd/readable/definingCitationID',
        'http://uri.neuinfo.org/nif/nifstd/readable/definingCitationURI',
        'http://uri.neuinfo.org/nif/nifstd/readable/emapMouseStageDataID',
        'http://uri.neuinfo.org/nif/nifstd/readable/emapMouseStageDiagramID',
        'http://uri.neuinfo.org/nif/nifstd/readable/externalSourceId',
        'http://uri.neuinfo.org/nif/nifstd/readable/externalSourceURI',
        'http://uri.neuinfo.org/nif/nifstd/readable/gbifID',
        'http://uri.neuinfo.org/nif/nifstd/readable/gbifTaxonKeyID',
        'http://uri.neuinfo.org/nif/nifstd/readable/gene_Ontology_ID',
        #'http://uri.neuinfo.org/nif/nifstd/readable/hasExternalSource',
        'http://uri.neuinfo.org/nif/nifstd/readable/hasGenbankAccessionNumber',
        'http://uri.neuinfo.org/nif/nifstd/readable/imsrStandardStrainName',
        'http://uri.neuinfo.org/nif/nifstd/readable/isReplacedByClass',
        'http://uri.neuinfo.org/nif/nifstd/readable/jaxMiceID',
        'http://uri.neuinfo.org/nif/nifstd/readable/ncbiTaxID',
        'http://uri.neuinfo.org/nif/nifstd/readable/neuronamesID',
        'http://uri.neuinfo.org/nif/nifstd/readable/nifID',
        'http://uri.neuinfo.org/nif/nifstd/readable/sao_ID',
        'http://uri.neuinfo.org/nif/nifstd/readable/umls_ID',
        'http://www.geneontology.org/formats/oboInOwl#id',
    ))

    outside = []
    eee = {}
    resolver_not_ilx_only_but_not_in_scigraph = set()  # resources.ttl
    _res = Graph().parse((gitf / 'NIF-Ontology/ttl/resources.ttl').as_posix(), format='turtle')
    reslookup = {uri: [l] for uri, l in _res.subject_objects(rdfs.label)}
    for uri in chain(h_uris, resolver_not_ilx_only):
        if 'uri.neuinfo.org' in uri:
            try:
                meta = sgg.getNode(uri.toPython())['nodes'][0]['meta']
                asdf = {hng.qname(k): v for k, v in meta.items() if k in more_ids}
            except TypeError:
                resolver_not_ilx_only_but_not_in_scigraph.add(uri)  # resources.ttl ;)
                if uri in reslookup:  # no differentia
                    asdf = False
                else:
                    asdf = False
                    print('WTF', uri)

            if asdf:
                #print(uri, asdf)
                eee[uri] = asdf
                for l in asdf.values():
                    for e in l:
                        outside.append(e)

    outside_dupes = [v for v, c in Counter(outside).most_common() if c > 1]
    eee_dupes = {k: v for k, v in eee.items()
                 if anyMembers(outside_dupes, *(e for l in v.values() for e in l))}

    #for uri, meta in sorted(eee_dupes.items(), key=lambda a:sorted(a[1].values())):
        #print(uri.toPython(), sorted((e.replace('PMID: ', 'PMID:'), k) for k, l in meta.items() for e in l))

    # attempt to deal with label mappings
    iexisting = defaultdict(set)
    iiexisting = {}
    for i, existing in zip(datal('ilx'), datal('iri')):
        #if 'uri.neuinfo.org' in existing:
        if 'interlex.org' not in existing and 'neurolex.org' not in existing:
            iexisting[i].add(URIRef(existing))
            iiexisting[URIRef(existing)] = i

    iexisting = {**iexisting}

    _ilabs = {k: l for k, l in zip(datal('ilx'), datal('label'))}
    def inner(iri):
        resp = sgv.findById(iri)
        if resp is not None:
            l = resp['labels']
        else:
            l = []  #_ilabs[iiexisting[iri]] + '** already in ilx **']
            #print('trouble?', iri)  # ilx only
        return iri, l

    #labs = {k:v[0] if v else '<--NO-LABEL-->' for k, v in Async()(deferred(inner)(id_) for id_ in chain(h_uris, (e for s in iexisting.values() for e in s)))}
    labs = {k: v[0] if v else '<--NO-LABEL-->'
            for k, v in Async()(deferred(inner)(id_) for id_ in h_uris)}
    ilabs = {k: l.lower() for k, l in zip(datal('ilx'), datal('label'))}
    iilabs = {v: k for k, v in ilabs.items()}
    assert len(ilabs) == len(iilabs)
    missing_map = {k: iilabs[v.lower()] for k, v in labs.items()
                   if v and v.lower() in iilabs}  # XXX this is not valid

    missing_existing = {i: [m, *iexisting[i]] for m, i in missing_map.items() if i in iexisting}

    missing_equivs = {next(iter(iexisting[i])): i for m, i in missing_map.items() if i in iexisting}

    eid = NIFRID.externalSourceId.toPython()
    ded = owl.deprecated.toPython()
    # SP: -> swissprot vs uniprot
    mmr = []
    proto_mmr_1_to_1 = {}
    arrr = defaultdict(set)
    uniprot_iuphar = set()
    for uri, ilx_frag in {**missing_equivs, **missing_map}.items():
        uri = URIRef(uri)
        try:
            meta = sgg.getNode(uri.toPython())['nodes'][0]['meta']
        except TypeError:
            # just ignore these, they are ilx only :/
            meta = {}

        if eid in meta:
            src = meta[eid][0]
            if src.startswith('SP:'):
                src = tc.yellow(src.replace('SP:', 'http://www.uniprot.org/uniprot/'))
            #elif src.startswith('IUPHAR:'):
                #pass
            #else:
                #src = 'TODO'
        elif ded in meta and meta[ded]:
            src = tc.red('ded ')
        else:
            src = 'TODO'

        val = labs[uri] if uri in labs else _ilabs[ilx_frag] + ' **'

        if uri in eee:
            differentia = str(eee[uri])
            for v in eee[uri].values():
                for e in v:
                    arrr[e].add(uri)
                    if 'SP:' in e or 'IUPHAR:' in e:
                        uniprot_iuphar.add(uri)
        else:
            differentia = ''

        if uri in _ilx and uri in all_uris:
            ruri = SGG[hng.qname(uri)]
            ruri = tc.blue(f'{ruri:<60}')
        else:
            ruri = uri
            ruri = f'{ruri:<60}'

        v = ' '.join((f'{val:<60}', src, ruri, ilxb[ilx_frag], differentia))
        mmr.append(v)
        proto_mmr_1_to_1[uri] = v
        src = None

    arrr = {**arrr}
    arrr_not_1_to_1 = {k: v for k, v in arrr.items() if len(v) > 1}
    #arrr_n11_uris = set((u.toPython() for v in arrr_not_1_to_1.values() for u in v))
    arrr_n11_uris = set.union(*arrr_not_1_to_1.values())
    mmr_1_to_1 = {k: v for k, v in proto_mmr_1_to_1.items() if k not in arrr_n11_uris}
    no_uniprot = {k: v for k, v in proto_mmr_1_to_1.items() if k not in uniprot_iuphar}
    arrr_n11_text = '\n'.join(f'{k:<15} {sorted(_.toPython() for _ in v)}'
                              for k, v in arrr_not_1_to_1.items())

    mmr.sort()
    mmr_text = '\n'.join(mmr)

    mmr_1_to_1_text = '\n'.join(sorted(mmr_1_to_1.values()))

    no_uniprot_text = '\n'.join(sorted(no_uniprot.values()))

def as_pretty_diff(self, pathmeta, othermeta, pathobject=None, title=None, human=False):
    if title is None:
        if pathobject is not None:
            title = pathobject.relative_to(pathobject.cwd()).as_posix()
        else:
            title = ''

    if pathmeta.content_different(othermeta):
        title = tc.red(title)

    def merge(a, b):
        keys = [k for k, _ in a]
        okeys = [ok for ok, _ in b]
        kind = [(k, okeys.index(k) - i, k) if k in okeys else (None, 0, k)
                for i, k in enumerate(keys)]
        okind = [(ok, keys.index(ok) - i, ok) if ok in keys else (None, 0, ok)
                 for i, ok in enumerate(okeys)]
        long, short = (kind, okind) if len(kind) > len(okind) else (okind, kind)
        start_align = None
        aligned = []
        offset = 0
        for i, (k, v, av) in enumerate(long):
            i -= offset
            if k is None and start_align is None:
                aligned.append(av)
                aligned.append(short[i + v][-1])
            elif k is not None:
                if v <= 0:
                    start_align = i
                    aligned.append(av)
                else:
                    for j in range(v):
                        aligned.append(short[i + j][-1])

                    offset += v
                    aligned.append(av)
            elif k is None:
                aligned.append(av)

        # FIXME what to do if short has a key that long does not?
        lkv = len(a[0])
        lokv = len(b[0])
        kv = {k: [k, v] for k, v in a}
        okv = {k: [k, v] for k, v in b}
        for k in aligned:
            l = kv[k] if k in kv else ([''] * lkv)
            r = okv[k] if k in okv else ([''] * lokv)
            yield l + r

    h = [['Key', 'Value', 'Key Other', 'Value Other']]
    rows = h + list(merge(self.rows(pathmeta, human=human),
                          self.rows(othermeta, human=human)))
    try:
        table = AsciiTable(rows, title=title).table
    except TypeError as e:
        breakpoint()
        raise e

    return table

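# Hedged, simplified sketch (not the original algorithm): the merge generator
# above aligns two (key, value) row lists into a four column table so rows
# sharing a key sit side by side. This standalone version unions the keys in
# order of first appearance instead of reproducing the offset bookkeeping, but
# yields rows of the same shape as merge.
def _example_align_rows(a, b):
    kv, okv = dict(a), dict(b)
    seen = []
    for k, _ in a + b:
        if k not in seen:
            seen.append(k)

    for k in seen:
        left = [k, kv[k]] if k in kv else ['', '']
        right = [k, okv[k]] if k in okv else ['', '']
        yield left + right
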