def _indexes(cls, data):
    """ compute submission and curation error indexes """
    errors = get_all_errors(data)
    submission_errors = []
    curation_errors = []
    for error in reversed(errors):
        if error in submission_errors or error in curation_errors:
            log.debug('error detected multiple times not counting '
                      'subsequent occurrences' + lj(error))
            continue

        stage = error['pipeline_stage']
        message = error['message']
        if stage in cls._submission:
            submission_errors.append(error)
        elif stage in cls._curation:
            curation_errors.append(error)
        else:
            raise ValueError(f'Unhandled stage {stage} {message}')

    si = len(submission_errors)
    ci = len(curation_errors)
    data['status'] = {}
    data['status']['submission_index'] = si
    data['status']['curation_index'] = ci
    data['status']['error_index'] = si + ci
    data['status']['submission_errors'] = submission_errors
    data['status']['curation_errors'] = curation_errors

    return si + ci
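
# Hedged usage sketch for _indexes (not from the real codebase): `Status` is a
# hypothetical stand-in for the class that defines the _submission and
# _curation stage collections, and we assume get_all_errors(data) returns the
# list stored under data['errors'].
def _example_indexes_usage():
    class Status:
        _submission = ('SPARCBIDSPipeline.data',)
        _curation = ('PipelineExtras.data',)

    data = {'errors': [{'pipeline_stage': 'SPARCBIDSPipeline.data',
                        'message': 'missing submission metadata'}]}
    total = _indexes(Status, data)  # in the real code: cls._indexes(data)
    # expected: total == 1 and data['status']['submission_index'] == 1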
def xml(self):
    #datasets = []
    #contributors = []
    subjects = []
    errors = []
    resources = []

    def normv(v):
        if isinstance(v, str) and v.startswith('http'):
            # needed for loading from json that has been serialized
            # rather than from our internal representation
            # probably better to centralize the reload ...
            v = OntTerm(v)
            return v.tabular()
        if isinstance(v, rdflib.URIRef):  # FIXME why is this getting converted early?
            ot = OntTerm(v)
            return ot.tabular()
        if isinstance(v, Expr):
            return str(v)  # FIXME for xml?
        if isinstance(v, Quantity):
            return str(v)
        else:
            #log.debug(repr(v))
            return v

    for dataset_blob in self:
        id = dataset_blob['id']
        dowe = dataset_blob
        #id = dataset.id
        #dowe = dataset.data
        if 'subjects' in dowe:
            for subject in dowe['subjects']:
                subject['dataset_id'] = id
                subject = {k:normv(v) for k, v in subject.items()}
                subjects.append(subject)

        if 'resources' in dowe:
            for res in dowe['resources']:
                res['dataset_id'] = id
                res = {k:normv(v) for k, v in res.items()}
                resources.append(res)

        if 'errors' in dowe:
            ers = get_all_errors(dowe)
            for er in ers:
                if er['pipeline_stage'] == 'SPARCBIDSPipeline.data':
                    continue

                er['dataset_id'] = id
                er = {k:normv(v) for k, v in er.items()}
                errors.append(er)

    xs = dicttoxml.dicttoxml({'subjects': subjects})
    xr = dicttoxml.dicttoxml({'resources': resources})
    xe = dicttoxml.dicttoxml({'errors': errors})
    return (('subjects', xs), ('resources', xr), ('errors', xe))
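
# Hedged usage sketch: xml() returns (name, xml_bytes) pairs, since
# dicttoxml.dicttoxml returns utf-8 encoded bytes. `exporter` and
# `export_path` are hypothetical names used only for illustration.
def _example_write_xml(exporter, export_path):
    import pathlib
    base = pathlib.Path(export_path)
    for name, xml_bytes in exporter.xml():
        (base / f'{name}.xml').write_bytes(xml_bytes)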
def disco(self):
    #dsh = sorted(MetaOutSchema.schema['allOf'][0]['properties'])
    dsh = [
        'acknowledgements',
        'additional_links',
        'award_number',
        'completeness_of_data_set',
        'contributor_count',
        'description',
        'dirs',
        'errors',
        'examples',
        'files',
        'funding',
        'keywords',
        'links',
        'modality',
        'name',  # -> title
        'organ',
        'originating_article_doi',
        'principal_investigator',
        'prior_batch_number',
        'protocol_url_or_doi',
        'sample_count',
        'size',
        'species',
        'subject_count',
        'title_for_complete_data_set',
        'uri_api',
        'uri_human',
        'error_index',  # (sum *_index)
        'dataset_completeness_index',  # dead
        'is_about',
        'involves_anatomical_region',
        'title',
        'folder_name',
    ]
    chs = [
        'contributor_affiliation',
        'contributor_orcid_id',
        'contributor_role',
        'is_contact_person',
        'name',
        'first_name',
        'last_name',
        'middle_name',
        'id',
        'blackfynn_user_id',
    ]

    datasets = [['id', 'submission_index', 'curation_index'] + dsh]
    contributors = [['id'] + chs]
    subjects = [['id', 'blob']]
    errors = [['id', 'blob']]
    resources = [['id', 'blob']]

    #cje = JEncode()
    def normv(v):
        if isinstance(v, str) and v.startswith('http'):
            # needed for loading from json that has been serialized
            # rather than from our internal representation
            # probably better to centralize the reload ...
            oid = OntId(v)
            if oid.prefix in want_prefixes:
                return OntTerm(v).tabular()
            else:
                return oid.iri

        if isinstance(v, OntId):
            if not isinstance(v, OntTerm):
                v = OntTerm(v)

            v = v.tabular()
        if isinstance(v, list) or isinstance(v, tuple):
            v = ','.join(json.dumps(_, cls=JEncode)
                         if isinstance(_, dict)
                         else normv(_)
                         for _ in v)
            v = v.replace('\n', ' ').replace('\t', ' ')
        elif any(isinstance(v, c) for c in (int, float, str)):
            v = str(v)
            v = v.replace('\n', ' ').replace('\t', ' ')  # FIXME tests to catch this
        elif isinstance(v, dict):
            v = json.dumps(v, cls=JEncode)

        return v

    for dataset_blob in self:
        id = dataset_blob['id']
        dowe = dataset_blob
        graph = rdflib.Graph()
        TriplesExportDataset(dataset_blob).populate(graph)
        is_about = [OntTerm(o) for s, o in graph[:isAbout:]
                    if isinstance(o, rdflib.URIRef)]
        involves = [OntTerm(o) for s, o in graph[:TEMP.involvesAnatomicalRegion:]]
        inv = ','.join(i.tabular() for i in involves)
        ia = ','.join(a.tabular() for a in is_about)
        #row = [id, dowe['error_index'], dowe['submission_completeness_index']]  # FIXME this doubles up on the row
        row = [id,
               dowe['status']['submission_index'],
               dowe['status']['curation_index']]  # FIXME this doubles up on the row
        if 'meta' in dowe:
            meta = dowe['meta']
            for k in dsh:
                if k in meta:
                    v = meta[k]
                    v = normv(v)
                elif k == 'is_about':
                    v = ia
                elif k == 'involves_anatomical_region':
                    v = inv
                else:
                    v = None

                row.append(v)

        else:
            row += [None for k in sc.MetaOutSchema.schema['properties']]

        datasets.append(row)

        # contribs
        if 'contributors' in dowe:
            cs = dowe['contributors']
            for c in cs:
                row = [id]
                for k in chs:
                    if k in c:
                        v = c[k]
                        v = normv(v)
                        row.append(v)
                    else:
                        row.append(None)

                contributors.append(row)

        if 'subjects' in dowe:
            for subject in dowe['subjects']:
                row = [id]
                row.append(json.dumps(subject, cls=JEncode))
                subjects.append(row)

            # moved to resources if exists already
            #if 'software' in sbs:
                #for software in sbs['software']:
                    #row = [id]
                    #row.append(json.dumps(software, cls=JEncode))
                    #resources.append(row)

        if 'resources' in dowe:
            for res in dowe['resources']:
                row = [id]
                row.append(json.dumps(res, cls=JEncode))
                resources.append(row)

        if 'errors' in dowe:
            ers = get_all_errors(dowe)
            for er in ers:
                row = [id]
                row.append(json.dumps(er, cls=JEncode))
                errors.append(row)

    # TODO samples resources
    return (('datasets', datasets),
            ('contributors', contributors),
            ('subjects', subjects),
            ('resources', resources),
            ('errors', errors))
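
# Hedged usage sketch: disco() returns (name, rows) pairs where row 0 of each
# table is the header, so the output can be dumped to TSV with the stdlib csv
# module (None cells serialize as empty fields). `exporter` and `export_path`
# are hypothetical names.
def _example_write_disco(exporter, export_path):
    import csv
    import pathlib
    base = pathlib.Path(export_path)
    for name, rows in exporter.disco():
        with open(base / f'{name}.tsv', 'w', newline='') as f:
            csv.writer(f, delimiter='\t').writerows(rows)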
def xml(dataset_blobs):
    #datasets = []
    #contributors = []
    subjects = []
    resources = []
    errors = []
    error_reports = []

    def normv(v):
        if is_list_or_tuple(v):
            return [normv(_) for _ in v]
        elif isinstance(v, dict):
            return {k:normv(v) for k, v in v.items()}
        elif isinstance(v, str) and v.startswith('http'):
            # needed for loading from json that has been serialized
            # rather than from our internal representation
            # probably better to centralize the reload ...
            # XXX NOTE these days this will only happen if someone
            # supplies us with a uri in a field where we aren't
            # expecting one, in which case we should just return it
            try:
                v = OntTerm(v)
                return v.asCell()
            except Exception as e:
                loge.error(f'something went wrong with {v}')
                loge.exception(e)
                return v
                #raise e
        elif isinstance(v, rdflib.URIRef):  # FIXME why is this getting converted early?
            ot = OntTerm(v)
            return ot.asCell()
        elif isinstance(v, ProtcurExpression):
            return str(v)  # FIXME for xml?
        elif isinstance(v, Quantity):
            return str(v)
        elif isinstance(v, AsJson):
            # XXX returns value not tested, may be extremely strange
            return str(v)
        elif isinstance(v, pathlib.Path):
            return str(v)
        elif isinstance(v, idlib.Stream):
            return v.asCell()
        #elif isinstance(v, list) or isinstance(v, str):
            #return v
        elif isinstance(v, BaseException):
            return repr(v)
        else:
            #loge.debug(repr(v))
            return v

    for dataset_blob in dataset_blobs:
        id = dataset_blob['id']
        dowe = dataset_blob
        #id = dataset.id
        #dowe = dataset.data
        if 'subjects' in dowe:
            for subject in dowe['subjects']:
                subject['dataset_id'] = id
                subject = {k:normv(v) for k, v in subject.items()}
                subjects.append(subject)

        if 'resources' in dowe:
            for res in dowe['resources']:
                res['dataset_id'] = id
                res = {k:normv(v) for k, v in res.items()}
                resources.append(res)

        if 'errors' in dowe:
            ers = get_all_errors(dowe)
            for path, er in ers:
                if not isinstance(er, dict):
                    #breakpoint()
                    loge.critical(er)
                    continue

                if er['pipeline_stage'] in pipes.PipelineEnd._shadowed:
                    continue

                er['dataset_id'] = id
                er = {k:normv(v) for k, v in er.items()}
                errors.append(er)

        if 'status' in dowe:
            if 'path_error_report' in dowe['status']:
                error_reports.append(dowe['status']['path_error_report'])

    xs = dicttoxml.dicttoxml({'subjects': subjects})
    xr = dicttoxml.dicttoxml({'resources': resources})
    xe = dicttoxml.dicttoxml({'errors': errors})
    xer = dicttoxml.dicttoxml({'error_reports': error_reports})
    return (('subjects', xs),
            ('resources', xr),
            ('errors', xe),
            ('error_reports', xer),)
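
# Hedged sketch of the minimal dataset blob shape this xml() consumes; every
# value below is illustrative, not real data.
_example_blob = {
    'id': 'N:dataset:00000000-0000-0000-0000-000000000000',  # hypothetical id
    'subjects': [{'subject_id': 'sub-1', 'species': 'Rattus norvegicus'}],
    'status': {'path_error_report': {}},
}
# xml([_example_blob]) would return four (name, bytes) pairs:
# ('subjects', ...), ('resources', ...), ('errors', ...), ('error_reports', ...)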
def xml(dataset_blobs):
    #datasets = []
    #contributors = []
    subjects = []
    resources = []
    errors = []
    error_reports = []

    def normv(v):
        if is_list_or_tuple(v):
            return [normv(_) for _ in v]
        if isinstance(v, dict):
            return {k: normv(v) for k, v in v.items()}
        if isinstance(v, str) and v.startswith('http'):
            # needed for loading from json that has been serialized
            # rather than from our internal representation
            # probably better to centralize the reload ...
            v = OntTerm(v)
            return v.asCell()
        if isinstance(v, rdflib.URIRef):  # FIXME why is this getting converted early?
            ot = OntTerm(v)
            return ot.asCell()
        if isinstance(v, ProtcurExpression):
            return str(v)  # FIXME for xml?
        if isinstance(v, Quantity):
            return str(v)
        elif isinstance(v, pathlib.Path):
            return str(v)
        elif isinstance(v, idlib.Stream):
            return v.asCell()
        #elif isinstance(v, list) or isinstance(v, str):
            #return v
        elif isinstance(v, BaseException):
            return repr(v)
        else:
            #loge.debug(repr(v))
            return v

    for dataset_blob in dataset_blobs:
        id = dataset_blob['id']
        dowe = dataset_blob
        #id = dataset.id
        #dowe = dataset.data
        if 'subjects' in dowe:
            for subject in dowe['subjects']:
                subject['dataset_id'] = id
                subject = {k: normv(v) for k, v in subject.items()}
                subjects.append(subject)

        if 'resources' in dowe:
            for res in dowe['resources']:
                res['dataset_id'] = id
                res = {k: normv(v) for k, v in res.items()}
                resources.append(res)

        if 'errors' in dowe:
            ers = get_all_errors(dowe)
            for path, er in ers:
                if er['pipeline_stage'] == 'SPARCBIDSPipeline.data':
                    continue

                er['dataset_id'] = id
                er = {k: normv(v) for k, v in er.items()}
                errors.append(er)

        if 'status' in dowe:
            if 'path_error_report' in dowe['status']:
                error_reports.append(dowe['status']['path_error_report'])

    xs = dicttoxml.dicttoxml({'subjects': subjects})
    xr = dicttoxml.dicttoxml({'resources': resources})
    xe = dicttoxml.dicttoxml({'errors': errors})
    xer = dicttoxml.dicttoxml({'error_reports': error_reports})
    return (
        ('subjects', xs),
        ('resources', xr),
        ('errors', xe),
        ('error_reports', xer),
    )
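
# Hedged sketch: both xml() variants above unpack get_all_errors(dowe) as
# (path, error) pairs, so its return shape is assumed to look like this;
# the path tuple locating each error within the blob is purely illustrative.
_example_errors = [
    (('meta', 'errors', 0),
     {'pipeline_stage': 'SPARCBIDSPipeline.data',
      'message': 'could not normalize field'}),
]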
def _indexes(cls, data):
    """ compute submission and curation error indexes """
    errors = get_all_errors(data)
    submission_errors = []
    curation_errors = []
    for error in reversed(errors):
        if error in submission_errors or error in curation_errors:
            log.debug('error detected multiple times not counting '
                      'subsequent occurrences' + lj(error))
            continue

        if 'blame' not in error:
            breakpoint()  # XXX debug: every error is expected to carry a 'blame' key

        blame = error['blame']
        stage = error['pipeline_stage']
        message = error['message']

        blamed = False
        if blame is not None:
            if blame in cls._blame:
                blame_target = cls._blame[blame]
                if blame_target == cls._blame_stage:
                    pass
                elif blame_target == cls._blame_everyone:
                    submission_errors.append(error)
                    curation_errors.append(error)
                    blamed = True
                elif blame_target == cls._blame_submission:
                    submission_errors.append(error)
                    blamed = True
                elif blame_target == cls._blame_curation:
                    curation_errors.append(error)
                    blamed = True
                else:
                    raise ValueError(f'Unhandled blame target {blame_target}\n{message}')

            else:
                raise ValueError(f'Unhandled blame type {blame}\n{message}')

        if stage in cls._submission:
            if not blamed:
                submission_errors.append(error)
        elif stage in cls._curation:
            if not blamed:
                curation_errors.append(error)
        else:
            if blame not in ('pipeline', 'submission', 'debug'):
                raise ValueError(f'Unhandled stage {stage}\n{message}')

    si = len(submission_errors)
    ci = len(curation_errors)
    if 'status' not in data:
        data['status'] = {}

    data['status']['submission_index'] = si
    data['status']['curation_index'] = ci
    data['status']['error_index'] = si + ci
    data['status']['submission_errors'] = submission_errors
    data['status']['curation_errors'] = curation_errors

    return si + ci
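
# Hedged usage sketch for the blame-aware variant: the _blame* attributes are
# stand-ins for the real mapping, and we again assume get_all_errors(data)
# returns the list under data['errors']. An 'everyone' blame target counts
# the error in both indexes, so error_index comes out as 2 here.
def _example_blame_indexes_usage():
    class Status:
        _blame_stage = 'stage'
        _blame_everyone = 'everyone'
        _blame_submission = 'submission'
        _blame_curation = 'curation'
        _blame = {'stage': _blame_stage,
                  'everyone': _blame_everyone,
                  'submission': _blame_submission,
                  'curation': _blame_curation}
        _submission = ('SPARCBIDSPipeline.data',)
        _curation = ('PipelineExtras.data',)

    data = {'errors': [{'blame': 'everyone',
                        'pipeline_stage': 'SPARCBIDSPipeline.data',
                        'message': 'affects both indexes'}]}
    total = _indexes(Status, data)  # in the real code: cls._indexes(data)
    # expected: total == 2, submission_index == 1, curation_index == 1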