def _completeness(self, data):
    accessor = JT(data)  # can go direct if elements are always present
    #organ = accessor.query('meta', 'organ')
    try:
        organ = adops.get(data, ['meta', 'organ'])
    except Exception:  # narrowed from a bare except; a missing key is the expected failure
        organ = None

    if isinstance(organ, (list, tuple)):
        if len(organ) == 1:
            organ, = organ
            organ = OntTerm(organ)
        else:
            organ = [OntTerm(o) for o in organ]

    elif organ == 'othertargets':
        pass
    elif organ:
        organ = OntTerm(organ)

    return (accessor.status.submission_index,
            accessor.status.curation_index,
            accessor.status.error_index,
            #accessor.submission_completeness_index,
            #dataset.name,  # from filename (do we not have that in meta!?)
            accessor.query('meta', 'folder_name'),
            accessor.id,  #if 'id' in dowe else None,
            accessor.query('meta', 'award_number'),
            organ,)
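The adops.get call wrapped in try/except above is just a safe nested lookup with a fallback. A minimal stdlib sketch of the same idea over plain dicts; safe_get is a hypothetical helper, not part of this codebase:

# Hypothetical helper illustrating the safe nested lookup that adops.get
# provides in _completeness above; not part of this codebase.
def safe_get(blob, path, default=None):
    """Walk a nested dict along path, returning default on any missing key."""
    cur = blob
    for key in path:
        try:
            cur = cur[key]
        except (KeyError, TypeError, IndexError):
            return default

    return cur

# usage: mirrors the try/except around adops.get
organ = safe_get({'meta': {'organ': 'UBERON:0000948'}}, ['meta', 'organ'])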
def award_number(self, value):
    _, s = self.c.award_number(value)
    yield s, a, owl.NamedIndividual
    yield s, a, TEMP.FundedResearchProject
    return
    # NOTE everything below the bare return is intentionally unreachable;
    # the organ-by-award lookup is disabled here
    o = self.integrator.organ(value)
    if o:
        if o != 'othertargets':
            o = OntId(o)
            if o.prefix == 'FMA':
                ot = OntTerm(o)
                o = next(OntTerm.query(label=ot.label, prefix='UBERON')).OntTerm

        yield s, isAbout, o.u
def added(self):
    data = super().added
    # FIXME conditional lifts ...
    if 'award_number' not in data['meta']:
        am = self.lifters.award_manual
        if am:
            data['meta']['award_number'] = am

    if 'modality' not in data['meta']:
        m = self.lifters.modality
        if m:
            data['meta']['modality'] = m

    if 'organ' not in data['meta']:
        if 'award_number' in data['meta']:
            an = data['meta']['award_number']
            o = self.lifters.organ(an)
            if o:
                if o != 'othertargets':
                    o = OntId(o)
                    if o.prefix == 'FMA':
                        ot = OntTerm(o)
                        o = next(OntTerm.query(label=ot.label, prefix='UBERON')).OntTerm

                data['meta']['organ'] = o

    if 'organ' not in data['meta'] or data['meta']['organ'] == 'othertargets':
        o = self.lifters.organ_term
        if o:
            if isinstance(o, str):
                o = o,

            out = tuple()
            for _o in o:
                _o = OntId(_o)
                if _o.prefix == 'FMA':
                    ot = OntTerm(_o)
                    _o = next(OntTerm.query(label=ot.label, prefix='UBERON')).OntTerm

                out += (_o,)

            data['meta']['organ'] = out

    return data
def triples_protcur(self, protocol_subject):
    # XXX deprecated and extremely slow
    # triples from the protcur pipeline are exported only once as part of
    # protcur.ttl
    ps = list(self._protcur(str(protocol_subject)))
    anatomy = [(p, OntId('UBERON:' + str(p).split('UBERON:', 1)[-1].split(' ', 1)[0]))
               for p in ps
               if p.astType == 'protc:input' and '(protc:input (term UBERON' in str(p)]
    #breakpoint()
    dataset_subject = rdflib.URIRef(self.uri_api)
    yield protocol_subject, TEMP.hasNumberOfProtcurAnnotations, rdflib.Literal(len(ps))
    done = set()
    for anno, term in anatomy:
        if term in done:
            continue

        done.add(term)
        yield from OntTerm(term).triples_simple
        o = term.u
        t = dataset_subject, TEMP.involvesAnatomicalRegion, o
        sl = rdflib.URIRef(anno.shareLink)
        av = (((ilxtr.annotationValue, rdflib.Literal(anno.value)),)
              if anno.value != o else tuple())
        notes = [(ilxtr.curatorNote, rdflib.Literal(n)) for n in anno.curatorNotes]
        prov = [(ilxtr.hasAnnotation, sl)]
        yield t
        yield from cmb.annotation(t, *av, *notes, *prov)()
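cmb.annotation above reifies each involvesAnatomicalRegion triple so notes and provenance can hang off it. A minimal rdflib sketch of the standard owl:Axiom annotation pattern, which is one plausible reading of what the combinator emits; nothing here is from this codebase:

import rdflib
from rdflib.namespace import OWL, RDF, RDFS

def annotated_axiom(graph, triple, *annotations):
    """Assert triple and attach (predicate, object) annotation pairs to it
    via the standard owl:Axiom reification pattern."""
    s, p, o = triple
    graph.add(triple)
    axiom = rdflib.BNode()
    graph.add((axiom, RDF.type, OWL.Axiom))
    graph.add((axiom, OWL.annotatedSource, s))
    graph.add((axiom, OWL.annotatedProperty, p))
    graph.add((axiom, OWL.annotatedTarget, o))
    for ap, av in annotations:
        graph.add((axiom, ap, av))

g = rdflib.Graph()
ex = rdflib.Namespace('http://example.org/')
annotated_axiom(g,
                (ex.dataset, ex.involvesAnatomicalRegion, ex.heart),
                (RDFS.comment, rdflib.Literal('from a protc:input annotation')))
print(len(g))  # 1 asserted triple + 5 axiom-annotation triples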
def process_dict(self, dict_):
    """ deal with multiple fields """
    out = {k: v for k, v in dict_.items() if k not in self.skip}
    for h_unit, h_value in zip(self.h_unit, self.h_value):
        if h_value not in dict_:
            # we drop null cells so if one of these was null
            # then we have to skip it here too
            continue

        dhv = dict_[h_value]
        if isinstance(dhv, str):
            try:
                dhv = ast.literal_eval(dhv)
            except ValueError as e:
                raise exc.UnhandledTypeError(f'{h_value} {dhv!r} was not parsed!') from e

        compose = dhv * pyru.ur.parse_units(dict_[h_unit])
        #_, v, rest = parameter_expression(compose)
        #out[h_value] = str(UnitsParser(compose).for_text)  # FIXME sparc repr
        #breakpoint()
        out[h_value] = compose  #UnitsParser(compose).asPython()

    if 'gender' in out and 'species' in out:
        if out['species'] != OntTerm('NCBITaxon:9606'):
            out['sex'] = out.pop('gender')

    return out
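pyru.ur above behaves like a unit registry: a bare magnitude from a value column is multiplied by the unit parsed from its paired unit column. A sketch of the same composition using pint as a stand-in for pyru's registry (an assumption; the project's own registry may differ):

import ast
import pint

ureg = pint.UnitRegistry()

def compose_cell(value_cell, unit_cell):
    """Parse a spreadsheet value cell and attach the unit from its paired
    unit cell, mirroring the h_value/h_unit loop in process_dict."""
    magnitude = ast.literal_eval(value_cell) if isinstance(value_cell, str) else value_cell
    return magnitude * ureg.parse_units(unit_cell)

print(compose_cell('4.2', 'mm'))  # -> 4.2 millimeter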
def normv(v):
    if is_list_or_tuple(v):
        return [normv(_) for _ in v]

    if isinstance(v, dict):
        return {k: normv(v) for k, v in v.items()}

    if isinstance(v, str) and v.startswith('http'):
        # needed for loading from json that has been serialized
        # rather than from our internal representation
        # probably better to centralize the reload ...
        v = OntTerm(v)
        return v.asCell()

    if isinstance(v, rdflib.URIRef):  # FIXME why is this getting converted early?
        ot = OntTerm(v)
        return ot.asCell()

    if isinstance(v, ProtcurExpression):
        return str(v)  # FIXME for xml?

    if isinstance(v, Quantity):
        return str(v)
    elif isinstance(v, pathlib.Path):
        return str(v)
    elif isinstance(v, idlib.Stream):
        return v.asCell()
    #elif isinstance(v, list) or isinstance(v, str):
        #return v
    elif isinstance(v, BaseException):
        return repr(v)
    else:
        #loge.debug(repr(v))
        return v
def _update_technique(self, cell):
    # NOTE some rows won't update if the dataset no longer exists
    value = cell.value
    if value:
        try:
            term = next(OntTerm.query(label=value))
            cell.value = term.asCellHyperlink()
        except StopIteration:
            log.info(f'no term for technique {value}')
def map(self, anno):
    row = self._annotation_row(anno)
    mapping_ok = row.mapping_ok().value == 'TRUE'  # FIXME
    not_input = row.not_input_().value
    bad_for_mapping = row.bad_for_mapping_().value
    manual_mapping = row.manual_mapping().value
    if mapping_ok and not not_input:
        pass

    if manual_mapping and ' ' in manual_mapping:
        log.error(f'Why does a manual mapping have a space in it {manual_mapping!r}')
    elif manual_mapping:
        return OntTerm(manual_mapping)
    elif mapping_ok:
        # FIXME anno.astValue can drift from auto_mapping
        # this is so hilariously inefficient, we parse the same stuff
        # 3 times or something
        return OntTerm(anno.asPython().asPython().black_box.curie)
def normv(v):
    if isinstance(v, str) and v.startswith('http'):
        # needed for loading from json that has been serialized
        # rather than from our internal representation
        # probably better to centralize the reload ...
        oid = OntId(v)
        if oid.prefix in want_prefixes:
            return OntTerm(v).tabular()
        else:
            return oid.iri

    if isinstance(v, OntId):
        if not isinstance(v, OntTerm):
            v = OntTerm(v)

        v = v.tabular()

    if isinstance(v, (list, tuple)):
        v = ','.join(json.dumps(_, cls=JEncode)
                     if isinstance(_, dict)
                     else normv(_)
                     for _ in v)
        v = v.replace('\n', ' ').replace('\t', ' ')
    elif any(isinstance(v, c) for c in (int, float, str)):
        v = str(v)
        v = v.replace('\n', ' ').replace('\t', ' ')  # FIXME tests to catch this
    elif isinstance(v, dict):
        v = json.dumps(v, cls=JEncode)

    return v
def yield_from_id(s, matid, predicate=predicate):
    mat = rm[matid]
    if 'external' in mat:
        mat_s = OntTerm(mat['external'][0])
        yield s, predicate, mat_s.u
        yield mat_s.u, a, owl.Class
        yield mat_s.u, rdfs.label, rdflib.Literal(mat_s.label)
        if 'materials' in mat:
            for submat_id in mat['materials']:
                yield from yield_from_id(mat_s, submat_id, TEMP.hasConstituent)

    else:
        log.warning(f'no external id for {mat}')
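The recursion above walks a resource map where each material may list constituent materials. A self-contained sketch of the same traversal over plain dicts; the rm shape and predicate strings here are assumptions for illustration:

# Illustrative only: a plain-dict stand-in for the resource map walked above.
rm_demo = {
    'm1': {'external': ['CHEBI:15377'], 'materials': ['m2']},
    'm2': {'external': ['CHEBI:26710']},
    'm3': {},  # no external id -> skipped with a warning in the real code
}

def walk_materials(subject, matid, predicate='hasMaterial'):
    """Yield (subject, predicate, external_id) edges, recursing into
    constituent materials, mirroring yield_from_id."""
    mat = rm_demo[matid]
    if 'external' in mat:
        ext = mat['external'][0]
        yield subject, predicate, ext
        for submat_id in mat.get('materials', ()):
            yield from walk_materials(ext, submat_id, 'hasConstituent')

print(list(walk_materials('sample-1', 'm1')))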
def query(value, prefix):
    for query_type in ('term', 'search'):
        terms = [q.OntTerm for q in OntTerm.query(prefix=prefix, **{query_type: value})]
        if terms:
            #print('matching', terms[0], value)
            #print('extra terms for', value, terms[1:])
            return terms[0]
        else:
            continue

    else:
        # for/else: only reached when neither query type produced a match
        log.warning(f'No ontology id found for {value}')
        return value
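The for/else above is a fallback chain: try an exact term query, then a looser search, and give up only when both miss. A self-contained sketch of the pattern with stub strategies; the strategy names and sample ids are hypothetical:

import logging

log = logging.getLogger(__name__)

def lookup_with_fallbacks(value, strategies):
    """Return the first hit from a sequence of (name, fn) strategies,
    or the input value unchanged if every strategy misses."""
    for name, strategy in strategies:
        hits = strategy(value)
        if hits:
            return hits[0]
    else:  # loop finished without returning: every strategy came up empty
        log.warning(f'No ontology id found for {value}')
        return value

exact = lambda v: ['UBERON:0000948'] if v == 'heart' else []
fuzzy = lambda v: ['UBERON:0000029'] if 'lymph' in v else []
print(lookup_with_fallbacks('heart', [('term', exact), ('search', fuzzy)]))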
def normv(v):
    if is_list_or_tuple(v):
        return [normv(_) for _ in v]
    elif isinstance(v, dict):
        return {k: normv(v) for k, v in v.items()}
    elif isinstance(v, str) and v.startswith('http'):
        # needed for loading from json that has been serialized
        # rather than from our internal representation
        # probably better to centralize the reload ...
        # XXX NOTE these days this will only happen if someone
        # supplies us with a uri in a field where we aren't
        # expecting one, in which case we should just return it
        try:
            v = OntTerm(v)
            return v.asCell()
        except Exception as e:
            loge.error(f'something went wrong with {v}')
            loge.exception(e)
            return v
            #raise e

    elif isinstance(v, rdflib.URIRef):  # FIXME why is this getting converted early?
        ot = OntTerm(v)
        return ot.asCell()
    elif isinstance(v, ProtcurExpression):
        return str(v)  # FIXME for xml?
    elif isinstance(v, Quantity):
        return str(v)
    elif isinstance(v, AsJson):
        # XXX returns value not tested, may be extremely strange
        return str(v)
    elif isinstance(v, pathlib.Path):
        return str(v)
    elif isinstance(v, idlib.Stream):
        return v.asCell()
    #elif isinstance(v, list) or isinstance(v, str):
        #return v
    elif isinstance(v, BaseException):
        return repr(v)
    else:
        #loge.debug(repr(v))
        return v
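These normv variants are type-dispatch tables written as if/elif ladders. A sketch of the same dispatch using functools.singledispatch, restricted to stdlib types; the project types (Quantity, idlib.Stream, ...) would register the same way. This is an alternative style, not how the codebase does it:

import pathlib
from functools import singledispatch

@singledispatch
def norm(v):
    """Fallback: pass unhandled values through unchanged."""
    return v

@norm.register
def _(v: list):
    return [norm(_) for _ in v]

@norm.register
def _(v: dict):
    return {k: norm(val) for k, val in v.items()}

@norm.register
def _(v: pathlib.Path):
    return str(v)

@norm.register
def _(v: BaseException):
    return repr(v)

print(norm({'path': pathlib.Path('/tmp/x'), 'n': [1, ValueError('bad')]}))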
def normv(v):
    if isinstance(v, str) and v.startswith('http'):
        # needed for loading from json that has been serialized
        # rather than from our internal representation
        # probably better to centralize the reload ...
        v = OntTerm(v)
        return v.tabular()

    if isinstance(v, rdflib.URIRef):  # FIXME why is this getting converted early?
        ot = OntTerm(v)
        return ot.tabular()

    if isinstance(v, Expr):
        return str(v)  # FIXME for xml?

    if isinstance(v, Quantity):
        return str(v)
    else:
        #log.debug(repr(v))
        return v
def _term(self):
    if not hasattr(self, '_c_term'):
        self._c_term = OntTerm(self.id)

    return self._c_term
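_term computes once and memoizes on the instance by hand via the hasattr/_c_term dance. A sketch of the same behavior with functools.cached_property (stdlib since Python 3.8); the Record class and its term resolution are stand-ins, not project code:

from functools import cached_property

class Record:
    def __init__(self, id):
        self.id = id

    @cached_property
    def term(self):
        """Computed on first access, then stored on the instance,
        matching the hasattr/_c_term pattern above."""
        print(f'resolving {self.id}')  # runs once
        return ('term-for', self.id)

r = Record('UBERON:0000948')
r.term  # prints 'resolving ...'
r.term  # cached, no print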
def disco(self):
    #dsh = sorted(MetaOutSchema.schema['allOf'][0]['properties'])
    dsh = ['acknowledgements',
           'additional_links',
           'award_number',
           'completeness_of_data_set',
           'contributor_count',
           'description',
           'dirs',
           'errors',
           'examples',
           'files',
           'funding',
           'keywords',
           'links',
           'modality',
           'name',  # -> title
           'organ',
           'originating_article_doi',
           'principal_investigator',
           'prior_batch_number',
           'protocol_url_or_doi',
           'sample_count',
           'size',
           'species',
           'subject_count',
           'title_for_complete_data_set',
           'uri_api',
           'uri_human',
           'error_index',  # (sum *_index)
           'dataset_completeness_index',  # dead
           'is_about',
           'involves_anatomical_region',
           'title',
           'folder_name',
           ]
    chs = ['contributor_affiliation',
           'contributor_orcid_id',
           'contributor_role',
           'is_contact_person',
           'name',
           'first_name',
           'last_name',
           'middle_name',
           'id',
           'blackfynn_user_id',
           ]

    datasets = [['id', 'submission_index', 'curation_index'] + dsh]
    contributors = [['id'] + chs]
    subjects = [['id', 'blob']]
    errors = [['id', 'blob']]
    resources = [['id', 'blob']]

    #cje = JEncode()
    def normv(v):
        if isinstance(v, str) and v.startswith('http'):
            # needed for loading from json that has been serialized
            # rather than from our internal representation
            # probably better to centralize the reload ...
            oid = OntId(v)
            if oid.prefix in want_prefixes:
                return OntTerm(v).tabular()
            else:
                return oid.iri

        if isinstance(v, OntId):
            if not isinstance(v, OntTerm):
                v = OntTerm(v)

            v = v.tabular()

        if isinstance(v, (list, tuple)):
            v = ','.join(json.dumps(_, cls=JEncode)
                         if isinstance(_, dict)
                         else normv(_)
                         for _ in v)
            v = v.replace('\n', ' ').replace('\t', ' ')
        elif any(isinstance(v, c) for c in (int, float, str)):
            v = str(v)
            v = v.replace('\n', ' ').replace('\t', ' ')  # FIXME tests to catch this
        elif isinstance(v, dict):
            v = json.dumps(v, cls=JEncode)

        return v

    for dataset_blob in self:
        id = dataset_blob['id']
        dowe = dataset_blob
        graph = rdflib.Graph()
        TriplesExportDataset(dataset_blob).populate(graph)
        is_about = [OntTerm(o) for s, o in graph[:isAbout:]
                    if isinstance(o, rdflib.URIRef)]
        involves = [OntTerm(o) for s, o in graph[:TEMP.involvesAnatomicalRegion:]]
        inv = ','.join(i.tabular() for i in involves)
        ia = ','.join(a.tabular() for a in is_about)
        #row = [id, dowe['error_index'], dowe['submission_completeness_index']]  # FIXME this doubles up on the row
        row = [id,
               dowe['status']['submission_index'],
               dowe['status']['curation_index']]  # FIXME this doubles up on the row
        if 'meta' in dowe:
            meta = dowe['meta']
            for k in dsh:
                if k in meta:
                    v = meta[k]
                    v = normv(v)
                elif k == 'is_about':
                    v = ia
                elif k == 'involves_anatomical_region':
                    v = inv
                else:
                    v = None

                row.append(v)

        else:
            row += [None for k in sc.MetaOutSchema.schema['properties']]

        datasets.append(row)

        # contribs
        if 'contributors' in dowe:
            cs = dowe['contributors']
            for c in cs:
                row = [id]
                for k in chs:
                    if k in c:
                        v = c[k]
                        v = normv(v)
                        row.append(v)
                    else:
                        row.append(None)

                contributors.append(row)

        if 'subjects' in dowe:
            for subject in dowe['subjects']:
                row = [id]
                row.append(json.dumps(subject, cls=JEncode))
                subjects.append(row)

        # moved to resources if exists already
        #if 'software' in sbs:
            #for software in sbs['software']:
                #row = [id]
                #row.append(json.dumps(software, cls=JEncode))
                #resources.append(row)

        if 'resources' in dowe:
            for res in dowe['resources']:
                row = [id]
                row.append(json.dumps(res, cls=JEncode))
                resources.append(row)

        if 'errors' in dowe:
            ers = get_all_errors(dowe)
            for er in ers:
                row = [id]
                row.append(json.dumps(er, cls=JEncode))
                errors.append(row)

    # TODO samples resources
    return (('datasets', datasets),
            ('contributors', contributors),
            ('subjects', subjects),
            ('resources', resources),
            ('errors', errors))
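disco returns (name, rows) pairs ready for tabular export. A sketch of writing them out with the stdlib csv module; the output directory and .tsv layout are assumptions, not what the pipeline necessarily does with the result:

import csv
import pathlib

def write_tables(tables, outdir='/tmp/disco'):
    """Write each (name, rows) pair from disco to <outdir>/<name>.tsv."""
    out = pathlib.Path(outdir)
    out.mkdir(parents=True, exist_ok=True)
    for name, rows in tables:
        with open(out / f'{name}.tsv', 'w', newline='') as f:
            csv.writer(f, delimiter='\t').writerows(rows)

write_tables((('datasets', [['id', 'submission_index'], ['ds-1', 0]]),
              ('errors', [['id', 'blob'], ['ds-1', '{}']])))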
def map(self, anno):
    row = self._annotation_row(anno)
    oid = row.ontology_id().value
    label = row.ontology_label().value
    ilx_curie = row.interlex_id().value
    # NameError fix: the original returned OntTerm(interlex_curie), which is undefined
    return OntTerm(oid, label=label), OntTerm(ilx_curie)
def triples_gen(self):
    rm = self._source

    # FIXME there doesn't seem to be a section that tells me the name
    # of top level model so I have to know its name beforehand
    # the id is in the model, having the id in the resource map
    # prevents issues if these things get sent decoupled
    id = rm['id']
    mid = id.replace(' ', '-')

    links = rm[id]['links']
    #linknodes = [n for n in rm[id]['nodes'] if n['class'] == 'Link']  # visible confusion

    st = []
    from_to = []
    ot = None
    yield from self.apinatbase()
    for link in links:
        if 'conveyingType' in link:
            if link['conveyingType'] == 'ADVECTIVE':
                p_is = TEMP.isAdvectivelyConnectedTo
                p_from = TEMP.advectivelyConnectsFrom
                p_to = TEMP.advectivelyConnectsTo
                p_cmat = TEMP.advectivelyConnectsMaterial
                diffusive = False
            elif link['conveyingType'] == 'DIFFUSIVE':
                p_is = TEMP.isDiffusivelyConnectedTo
                p_from = TEMP.diffusivelyConnectsFrom
                p_to = TEMP.diffusivelyConnectsTo
                p_cmat = TEMP.diffusivelyConnectsMaterial
                diffusive = True
            else:
                log.critical(f'unhandled conveying type {link}')
                continue

            source = link['source']
            target = link['target']
            ok = True
            if len(from_to) == 2:  # otherwise
                st = []
                from_to = []

            for i, e in enumerate((source, target)):
                ed = rm[e]
                if 'external' not in ed:
                    if not i and from_to:
                        # TODO make sure the intermediate ids match
                        pass
                    else:
                        ok = False
                        break

                else:
                    st.append(e)
                    from_to.append(OntId(ed['external'][0]))

            conveying = link['conveyingLyph']
            cd = rm[conveying]
            if 'external' in cd:
                old_ot = ot
                ot = OntTerm(cd['external'][0])
                yield ot.u, rdf.type, owl.Class
                yield ot.u, TEMP.internalId, rdflib.Literal(conveying)
                yield ot.u, rdfs.label, rdflib.Literal(ot.label)

                yield from self.materialTriples(ot.u, link, p_cmat)  # FIXME locate this correctly

                if ok:
                    u, d = from_to
                    if st[0] == source:
                        yield u, rdfs.label, rdflib.Literal(OntTerm(u).label)
                        yield u, rdf.type, owl.Class
                        yield from cmb.restriction.serialize(ot.u, p_from, u)

                    if st[1] == target:
                        yield d, rdfs.label, rdflib.Literal(OntTerm(d).label)
                        yield d, rdf.type, owl.Class
                        yield from cmb.restriction.serialize(ot.u, p_to, d)

                if old_ot is not None and old_ot != ot:
                    yield from cmb.restriction.serialize(ot.u, p_from, old_ot.u)

            if diffusive:
                # we can try to hack this using named individuals
                # but it is not going to do exactly what is desired
                s_link = TEMP[f'ApiNATOMY/{mid}/{link["id"]}']
                s_cd = TEMP[f'ApiNATOMY/{mid}/{cd["id"]}']
                yield s_link, rdf.type, owl.NamedIndividual
                yield s_link, rdf.type, TEMP.diffusiveLink  # FIXME I'm not sure these go in the model ...
                yield s_cd, rdf.type, owl.NamedIndividual
                if 'external' in cd and cd['external']:
                    oid = OntId(cd['external'][0])
                    yield s_cd, rdf.type, oid.u
                    ot = oid.asTerm()
                    if ot.label:
                        yield oid.u, rdfs.label, ot.label

                else:
                    yield s_cd, rdf.type, TEMP.conveyingLyph
                    for icd in cd['inCoalescences']:
                        dcd = rm[icd]
                        log.info(lj(dcd))
                        s_icd = TEMP[f'ApiNATOMY/{mid}/{dcd["id"]}']
                        yield s_cd, TEMP.partOfCoalescence, s_icd
                        yield s_icd, rdf.type, owl.NamedIndividual
                        yield s_icd, rdf.type, TEMP['ApiNATOMY/Coalescence']
                        if 'external' in dcd and dcd['external']:
                            oid = OntId(dcd['external'][0])
                            yield s_icd, rdf.type, oid.u
                            ot = oid.asTerm()
                            if ot.label:
                                yield oid.u, rdfs.label, ot.label

                        for lyphid in dcd['lyphs']:
                            ild = rm[lyphid]
                            log.info(lj(ild))
                            if 'external' in ild and ild['external']:
                                yield s_icd, TEMP.hasLyphWithMaterial, OntId(ild['external'][0])

            if not ok:
                logd.info(f'{source} {target} issue')
                continue

            for inid, e in zip(st, from_to):
                yield e.u, rdf.type, owl.Class
                yield e.u, rdfs.label, rdflib.Literal(OntTerm(e).label)
                yield e.u, TEMP.internalId, rdflib.Literal(inid)

            f, t = from_to
            yield from cmb.restriction.serialize(f.u, p_is, t.u)
def added(self):
    data = super().added
    if data['meta'] == {'techniques': []}:
        breakpoint()

    # FIXME conditional lifts ...
    if 'award_number' not in data['meta']:
        am = self.lifters.award_manual
        if am:
            data['meta']['award_number'] = am

    if 'modality' not in data['meta']:
        m = self.lifters.modality
        if m:
            data['meta']['modality'] = m

    if False and 'organ' not in data['meta']:
        # skip here, now attached directly to award
        if 'award_number' in data['meta']:
            an = data['meta']['award_number']
            o = self.lifters.organ(an)
            if o:
                if o != 'othertargets':
                    o = OntId(o)
                    if o.prefix == 'FMA':
                        ot = OntTerm(o)
                        o = next(OntTerm.query(label=ot.label, prefix='UBERON'))

                data['meta']['organ'] = o

    if 'organ' not in data['meta'] or data['meta']['organ'] == 'othertargets':
        o = self.lifters.organ_term
        if o:
            if isinstance(o, str):
                o = o,

            out = tuple()
            for _o in o:
                _o = OntId(_o)
                if _o.prefix == 'FMA':
                    ot = OntTerm(_o)
                    _o = next(OntTerm.query(label=ot.label, prefix='UBERON'))

                out += (_o,)

            data['meta']['organ'] = out

    if 'protocol_url_or_doi' not in data['meta']:
        if self.lifters.protocol_uris:
            data['meta']['protocol_url_or_doi'] = tuple(self.lifters.protocol_uris)

    else:
        if not isinstance(data['meta']['protocol_url_or_doi'], tuple):
            _test_path = deque(['meta', 'protocol_url_or_doi'])
            if not [e for e in data['errors'] if e['path'] == _test_path]:
                raise ext.ShouldNotHappenError('urg')

        else:
            data['meta']['protocol_url_or_doi'] += tuple(self.lifters.protocol_uris)
            data['meta']['protocol_url_or_doi'] = tuple(sorted(set(
                data['meta']['protocol_url_or_doi'])))  # ick

    return data
def added(self):
    data = super().added
    if data['meta'] == {'techniques': []}:
        breakpoint()

    # FIXME conditional lifts ...
    if 'award_number' not in data['meta']:
        am = self.lifters.award_manual
        if am:
            data['meta']['award_number'] = am

    if 'modality' not in data['meta']:
        m = self.lifters.modality
        if m:
            data['meta']['modality'] = m

    if False and 'organ' not in data['meta']:
        # skip here, now attached directly to award
        if 'award_number' in data['meta']:
            an = data['meta']['award_number']
            o = self.lifters.organ(an)
            if o:
                if o != 'othertargets':
                    o = OntId(o)
                    if o.prefix == 'FMA':
                        ot = OntTerm(o)
                        o = next(OntTerm.query(label=ot.label, prefix='UBERON'))

                data['meta']['organ'] = o

    if 'organ' not in data['meta'] or data['meta']['organ'] == 'othertargets':
        o = self.lifters.organ_term
        if o:
            if isinstance(o, str):
                o = o,

            out = tuple()
            for _o in o:
                _o = OntId(_o)
                if _o.prefix == 'FMA':
                    ot = OntTerm(_o)
                    _o = next(OntTerm.query(label=ot.label, prefix='UBERON'))

                out += (_o,)

            data['meta']['organ'] = out

    if 'protocol_url_or_doi' not in data['meta']:
        if self.lifters.protocol_uris:
            data['meta']['protocol_url_or_doi'] = tuple(self.lifters.protocol_uris)

    else:
        if not isinstance(data['meta']['protocol_url_or_doi'], tuple):
            _test_path = deque(['meta', 'protocol_url_or_doi'])
            if not [e for e in data['errors']
                    if 'path' in e and e['path'] == _test_path]:
                raise ext.ShouldNotHappenError('urg')

        else:
            data['meta']['protocol_url_or_doi'] += tuple(self.lifters.protocol_uris)
            data['meta']['protocol_url_or_doi'] = tuple(sorted(set(
                data['meta']['protocol_url_or_doi'])))  # ick

    # FIXME this is a really bad way to do this :/ maybe stick the folder in data['prov'] ?
    # and indeed, when we added PipelineStart this shifted and broke everything
    local = (self
             .previous_pipeline.pipelines[0]
             .previous_pipeline.pipelines[0]
             .previous_pipeline.pipelines[0]
             .path)
    remote = local.remote
    if 'doi' not in data['meta']:
        doi = remote.doi
        if doi is not None:
            try:
                metadata = doi.metadata()
                if metadata is not None:
                    data['meta']['doi'] = doi.identifier
            except requests.exceptions.HTTPError:
                data['meta']['doi'] = None

        else:
            data['meta']['doi'] = None

    if 'status' not in data:
        data['status'] = {}

    if 'status_on_platform' not in data['status']:
        data['status']['status_on_platform'] = remote.bfobject.status

    return data