Example #1
    def _indexes(cls, data):
        """ compute submission and curation error indexes """
        errors = get_all_errors(data)
        submission_errors = []
        curation_errors = []
        for error in reversed(errors):
            if error in submission_errors or error in curation_errors:
                log.debug('error detected multiple times not counting '
                          'subsequent occurrences' + lj(error))
                continue

            stage = error['pipeline_stage']
            message = error['message']
            if stage in cls._submission:
                submission_errors.append(error)
            elif stage in cls._curation:
                curation_errors.append(error)
            else:
                raise ValueError(f'Unhandled stage {stage} {message}')

        si = len(submission_errors)
        ci = len(curation_errors)
        data['status'] = {}
        data['status']['submission_index'] = si
        data['status']['curation_index'] = ci
        data['status']['error_index'] = si + ci
        data['status']['submission_errors'] = submission_errors
        data['status']['curation_errors'] = curation_errors

        return si + ci
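
The function deduplicates repeated error dicts and buckets each one by its
pipeline_stage. A minimal sketch of the resulting status block, assuming
_indexes is a classmethod on a hypothetical Pipeline class whose _curation
set contains the stage used below:

    # hypothetical blob; get_all_errors is assumed to collect the
    # nested 'errors' entries from it
    data = {'id': 'N:dataset:example',
            'errors': [{'pipeline_stage': 'SPARCBIDSPipeline.data',
                        'message': 'missing subjects file'}]}
    total = Pipeline._indexes(data)
    assert total == 1
    assert data['status']['curation_index'] == 1
    assert data['status']['error_index'] == total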
Example #2
    def xml(self):
        #datasets = []
        #contributors = []
        subjects = []
        errors = []
        resources = []

        def normv(v):
            if isinstance(v, str) and v.startswith('http'):
                # needed for loading from json that has been serialized
                # rather than from our internal representation
                # probably better to centralize the reload ...
                v = OntTerm(v)
                return v.tabular()
            elif isinstance(v, rdflib.URIRef):  # FIXME why is this getting converted early?
                ot = OntTerm(v)
                return ot.tabular()
            elif isinstance(v, Expr):
                return str(v)  # FIXME for xml?
            elif isinstance(v, Quantity):
                return str(v)
            else:
                #log.debug(repr(v))
                return v

        for dataset_blob in self:
            id = dataset_blob['id']
            dowe = dataset_blob
            #id = dataset.id
            #dowe = dataset.data
            if 'subjects' in dowe:
                for subject in dowe['subjects']:
                    subject['dataset_id'] = id
                    subject = {k:normv(v) for k, v in subject.items()}
                    subjects.append(subject)

            if 'resources' in dowe:
                for res in dowe['resources']:
                    res['dataset_id'] = id
                    res = {k:normv(v) for k, v in res.items()}
                    resources.append(res)

            if 'errors' in dowe:
                ers = get_all_errors(dowe)
                for er in ers:
                    if er['pipeline_stage'] == 'SPARCBIDSPipeline.data':
                        continue

                    er['dataset_id'] = id
                    er = {k:normv(v) for k, v in er.items()}
                    errors.append(er)

        xs = dicttoxml.dicttoxml({'subjects': subjects})
        xr = dicttoxml.dicttoxml({'resources': resources})
        xe = dicttoxml.dicttoxml({'errors': errors})
        return (('subjects', xs),
                ('resources', xr),
                ('errors', xe))
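
dicttoxml.dicttoxml serializes each collection to bytes, so the pairs
returned here can be written straight to disk. A minimal consumer sketch
(the export instance and the file names are hypothetical):

    for name, xml_bytes in export.xml():
        with open(f'{name}.xml', 'wb') as f:
            f.write(xml_bytes)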
Example #3
    def disco(self):
        #dsh = sorted(MetaOutSchema.schema['allOf'][0]['properties'])
        dsh = [
            'acknowledgements',
            'additional_links',
            'award_number',
            'completeness_of_data_set',
            'contributor_count',
            'description',
            'dirs',
            'errors',
            'examples',
            'files',
            'funding',
            'keywords',
            'links',
            'modality',
            'name',  # -> title
            'organ',
            'originating_article_doi',
            'principal_investigator',
            'prior_batch_number',
            'protocol_url_or_doi',
            'sample_count',
            'size',
            'species',
            'subject_count',
            'title_for_complete_data_set',
            'uri_api',
            'uri_human',
            'error_index',  # (sum *_index)
            'dataset_completeness_index',  # dead
            'is_about',
            'involves_anatomical_region',
            'title',
            'folder_name',
        ]
        chs = [
            'contributor_affiliation',
            'contributor_orcid_id',
            'contributor_role',
            'is_contact_person',
            'name',
            'first_name',
            'last_name',
            'middle_name',
            'id',
            'blackfynn_user_id',
        ]

        datasets = [['id', 'submission_index', 'curation_index'] + dsh]
        contributors = [['id'] + chs]
        subjects = [['id', 'blob']]
        errors = [['id', 'blob']]
        resources = [['id', 'blob']]

        #cje = JEncode()
        def normv(v):
            if isinstance(v, str) and v.startswith('http'):
                # needed for loading from json that has been serialized
                # rather than from our internal representation
                # probably better to centralize the reload ...
                oid = OntId(v)
                if oid.prefix in want_prefixes:
                    return OntTerm(v).tabular()
                else:
                    return oid.iri

            if isinstance(v, OntId):
                if not isinstance(v, OntTerm):
                    v = OntTerm(v)

                v = v.tabular()
            if isinstance(v, (list, tuple)):
                v = ','.join(json.dumps(_, cls=JEncode)
                             if isinstance(_, dict) else normv(_)
                             for _ in v)
                v = v.replace('\n', ' ').replace('\t', ' ')
            elif isinstance(v, (int, float, str)):
                v = str(v)
                v = v.replace('\n', ' ').replace('\t', ' ')  # FIXME tests to catch this

            elif isinstance(v, dict):
                v = json.dumps(v, cls=JEncode)

            return v

        for dataset_blob in self:
            id = dataset_blob['id']
            dowe = dataset_blob
            graph = rdflib.Graph()
            TriplesExportDataset(dataset_blob).populate(graph)
            is_about = [
                OntTerm(o) for s, o in graph[:isAbout:]
                if isinstance(o, rdflib.URIRef)
            ]
            involves = [
                OntTerm(o) for s, o in graph[:TEMP.involvesAnatomicalRegion:]
            ]

            inv = ','.join(i.tabular() for i in involves)
            ia = ','.join(a.tabular() for a in is_about)
            #row = [id, dowe['error_index'], dowe['submission_completeness_index']]  # FIXME this doubles up on the row
            row = [
                id, dowe['status']['submission_index'],
                dowe['status']['curation_index']
            ]  # FIXME this doubles up on the row
            if 'meta' in dowe:
                meta = dowe['meta']
                for k in dsh:
                    if k in meta:
                        v = meta[k]
                        v = normv(v)
                    elif k == 'is_about':
                        v = ia
                    elif k == 'involves_anatomical_region':
                        v = inv
                    else:
                        v = None

                    row.append(v)

            else:
                row += [None for k in sc.MetaOutSchema.schema['properties']]

            datasets.append(row)

            # contribs
            if 'contributors' in dowe:
                cs = dowe['contributors']
                for c in cs:
                    row = [id]
                    for k in chs:
                        if k in c:
                            v = c[k]
                            v = normv(v)
                            row.append(v)
                        else:
                            row.append(None)

                    contributors.append(row)

            if 'subjects' in dowe:
                for subject in dowe['subjects']:
                    row = [id]
                    row.append(json.dumps(subject, cls=JEncode))
                    subjects.append(row)

                # moved to resources if exists already
                #if 'software' in sbs:
                #for software in sbs['software']:
                #row = [id]
                #row.append(json.dumps(software, cls=JEncode))
                #resources.append(row)

            if 'resources' in dowe:
                for res in dowe['resources']:
                    row = [id]
                    row.append(json.dumps(res, cls=JEncode))
                    resources.append(row)

            if 'errors' in dowe:
                ers = get_all_errors(dowe)
                for er in ers:
                    row = [id]
                    row.append(json.dumps(er, cls=JEncode))
                    errors.append(row)

        # TODO samples resources
        return (('datasets', datasets),
                ('contributors', contributors),
                ('subjects', subjects),
                ('resources', resources),
                ('errors', errors))
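
disco returns header-plus-rows tables, with normv flattening embedded
newlines and tabs in the datasets table, so each one can be dumped as a
TSV file. A sketch, assuming a hypothetical curation_export object that
exposes this method:

    import csv

    for name, rows in curation_export.disco():
        with open(f'{name}.tsv', 'w', newline='') as f:
            csv.writer(f, delimiter='\t').writerows(rows)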
Example #4
def xml(dataset_blobs):
    #datasets = []
    #contributors = []
    subjects = []
    resources = []
    errors = []
    error_reports = []

    def normv(v):
        if is_list_or_tuple(v):
            return [normv(_) for _ in v]
        elif isinstance(v, dict):
            return {k:normv(v) for k, v in v.items()}
        elif isinstance(v, str) and v.startswith('http'):
            # needed for loading from json that has been serialized
            # rather than from our internal representation
            # probably better to centralize the reload ...

            # XXX NOTE these days this will only happen if someone
            # supplies us with a uri in a field where we aren't
            # expecting one, in which case we should just return it
            try:
                v = OntTerm(v)
                return v.asCell()
            except Exception as e:
                loge.error(f'something went wrong with {v}')
                loge.exception(e)
                return v
                #raise e
        elif isinstance(v, rdflib.URIRef):  # FIXME why is this getting converted early?
            ot = OntTerm(v)
            return ot.asCell()
        elif isinstance(v, ProtcurExpression):
            return str(v)  # FIXME for xml?
        elif isinstance(v, Quantity):
            return str(v)
        elif isinstance(v, AsJson):  # XXX returns value not tested, may be extremely strange
            return str(v)
        elif isinstance(v, pathlib.Path):
            return str(v)
        elif isinstance(v, idlib.Stream):
            return v.asCell()
        #elif isinstance(v, list) or isinstance(v, str):
            #return v
        elif isinstance(v, BaseException):
            return repr(v)
        else:
            #loge.debug(repr(v))
            return v

    for dataset_blob in dataset_blobs:
        id = dataset_blob['id']
        dowe = dataset_blob
        #id = dataset.id
        #dowe = dataset.data
        if 'subjects' in dowe:
            for subject in dowe['subjects']:
                subject['dataset_id'] = id
                subject = {k:normv(v) for k, v in subject.items()}
                subjects.append(subject)

        if 'resources' in dowe:
            for res in dowe['resources']:
                res['dataset_id'] = id
                res = {k:normv(v) for k, v in res.items()}
                resources.append(res)

        if 'errors' in dowe:
            ers = get_all_errors(dowe)
            for path, er in ers:
                if not isinstance(er, dict):
                    #breakpoint()
                    loge.critical(er)
                    continue

                if er['pipeline_stage'] in pipes.PipelineEnd._shadowed:
                    continue

                er['dataset_id'] = id
                er = {k:normv(v) for k, v in er.items()}
                errors.append(er)

        if 'status' in dowe:
            if 'path_error_report' in dowe['status']:
                error_reports.append(dowe['status']['path_error_report'])

    xs = dicttoxml.dicttoxml({'subjects': subjects})
    xr = dicttoxml.dicttoxml({'resources': resources})
    xe = dicttoxml.dicttoxml({'errors': errors})
    xer = dicttoxml.dicttoxml({'error_reports': error_reports})
    return (('subjects', xs),
            ('resources', xr),
            ('errors', xe),
            ('error_reports', xer),)
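
The first two branches of this normv recurse into containers, so nested
metadata normalizes depth-first, and exceptions stored in a blob survive
as their repr. An illustrative round trip, written as if normv were
importable rather than nested inside xml:

    import pathlib

    normv({'paths': [pathlib.Path('a'), pathlib.Path('b')],
           'err': ValueError('bad unit')})
    # -> {'paths': ['a', 'b'], 'err': "ValueError('bad unit')"}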
Example #5
def xml(dataset_blobs):
    #datasets = []
    #contributors = []
    subjects = []
    resources = []
    errors = []
    error_reports = []

    def normv(v):
        if is_list_or_tuple(v):
            return [normv(_) for _ in v]
        elif isinstance(v, dict):
            return {k: normv(v) for k, v in v.items()}
        elif isinstance(v, str) and v.startswith('http'):
            # needed for loading from json that has been serialized
            # rather than from our internal representation
            # probably better to centralize the reload ...
            v = OntTerm(v)
            return v.asCell()
        elif isinstance(v, rdflib.URIRef):  # FIXME why is this getting converted early?
            ot = OntTerm(v)
            return ot.asCell()
        elif isinstance(v, ProtcurExpression):
            return str(v)  # FIXME for xml?
        elif isinstance(v, Quantity):
            return str(v)
        elif isinstance(v, pathlib.Path):
            return str(v)
        elif isinstance(v, idlib.Stream):
            return v.asCell()
        #elif isinstance(v, list) or isinstance(v, str):
            #return v
        elif isinstance(v, BaseException):
            return repr(v)
        else:
            #loge.debug(repr(v))
            return v

    for dataset_blob in dataset_blobs:
        id = dataset_blob['id']
        dowe = dataset_blob
        #id = dataset.id
        #dowe = dataset.data
        if 'subjects' in dowe:
            for subject in dowe['subjects']:
                subject['dataset_id'] = id
                subject = {k: normv(v) for k, v in subject.items()}
                subjects.append(subject)

        if 'resources' in dowe:
            for res in dowe['resources']:
                res['dataset_id'] = id
                res = {k: normv(v) for k, v in res.items()}
                resources.append(res)

        if 'errors' in dowe:
            ers = get_all_errors(dowe)
            for path, er in ers:
                if er['pipeline_stage'] == 'SPARCBIDSPipeline.data':
                    continue

                er['dataset_id'] = id
                er = {k: normv(v) for k, v in er.items()}
                errors.append(er)

        if 'status' in dowe:
            if 'path_error_report' in dowe['status']:
                error_reports.append(dowe['status']['path_error_report'])

    xs = dicttoxml.dicttoxml({'subjects': subjects})
    xr = dicttoxml.dicttoxml({'resources': resources})
    xe = dicttoxml.dicttoxml({'errors': errors})
    xer = dicttoxml.dicttoxml({'error_reports': error_reports})
    return (
        ('subjects', xs),
        ('resources', xr),
        ('errors', xe),
        ('error_reports', xer),
    )
Example #6
    def _indexes(cls, data):
        """ compute submission and curation error indexes """
        errors = get_all_errors(data)
        submission_errors = []
        curation_errors = []
        for error in reversed(errors):
            if error in submission_errors or error in curation_errors:
                log.debug('error detected multiple times not counting '
                          'subsequent occurrences' + lj(error))
                continue

            if 'blame' not in error:
                breakpoint()

            blame = error['blame']
            stage = error['pipeline_stage']
            message = error['message']

            blamed = False
            if blame is not None:
                if blame in cls._blame:
                    blame_target = cls._blame[blame]
                    if blame_target == cls._blame_stage:
                        pass
                    elif blame_target == cls._blame_everyone:
                        submission_errors.append(error)
                        curation_errors.append(error)
                        blamed = True
                    elif blame_target == cls._blame_submission:
                        submission_errors.append(error)
                        blamed = True
                    elif blame_target == cls._blame_curation:
                        curation_errors.append(error)
                        blamed = True
                    else:
                        raise ValueError(f'Unhandled blame target {blame_target}\n{message}')

                else:
                    raise ValueError(f'Unhandled blame type {blame}\n{message}')

            if stage in cls._submission:
                if not blamed:
                    submission_errors.append(error)
            elif stage in cls._curation:
                if not blamed:
                    curation_errors.append(error)
            else:
                if blame not in ('pipeline', 'submission', 'debug'):
                    raise ValueError(f'Unhandled stage {stage}\n{message}')

        si = len(submission_errors)
        ci = len(curation_errors)
        if 'status' not in data:
            data['status'] = {}

        data['status']['submission_index'] = si
        data['status']['curation_index'] = ci
        data['status']['error_index'] = si + ci
        data['status']['submission_errors'] = submission_errors
        data['status']['curation_errors'] = curation_errors

        return si + ci
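
This later revision of _indexes honors an explicit blame assignment before
falling back to stage classification. The class-level tables it consults
look roughly like the following sketch; the concrete keys and sentinel
values are hypothetical, only the shape is implied by the code:

    # hypothetical sentinels and lookup table consumed by _indexes
    _blame_stage = 'stage'            # defer to stage classification
    _blame_everyone = 'everyone'      # count against both indexes
    _blame_submission = 'submission'  # count against submitters only
    _blame_curation = 'curation'      # count against curators only
    _blame = {'submission': _blame_submission,
              'curation': _blame_curation,
              'everyone': _blame_everyone,
              'pipeline': _blame_stage,
              'debug': _blame_stage}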