예제 #1
0
    def _completeness(self, data):
        """Return a completeness summary tuple for a dataset blob.

        Produces (submission_index, curation_index, error_index,
        folder_name, id, award_number, organ) where organ is normalized
        to an OntTerm (or list of OntTerms) when possible.
        """
        accessor = JT(data)  # can go direct if elements are always present
        try:
            organ = adops.get(data, ['meta', 'organ'])
        except Exception:
            # was a bare except:; narrowed so SystemExit/KeyboardInterrupt
            # are no longer swallowed — any lookup failure means no organ
            organ = None

        if isinstance(organ, (list, tuple)):
            if len(organ) == 1:
                # unwrap a singleton container to a single term
                organ, = organ
                organ = OntTerm(organ)
            else:
                organ = [OntTerm(o) for o in organ]

        elif organ == 'othertargets':
            pass  # sentinel string; deliberately left unconverted
        elif organ:
            organ = OntTerm(organ)

        return (
            accessor.status.submission_index,
            accessor.status.curation_index,
            accessor.status.error_index,
            accessor.query('meta', 'folder_name'),
            accessor.id,  #if 'id' in dowe else None,
            accessor.query('meta', 'award_number'),
            organ,
        )
예제 #2
0
        def award_number(self, value):
            """Yield triples typing the award subject as a funded research
            project.

            The original body contained organ/isAbout emission code after an
            unconditional ``return`` — it could never execute (dead code)
            and has been removed; the generator's observable behavior is
            unchanged.
            """
            _, s = self.c.award_number(value)
            yield s, a, owl.NamedIndividual
            yield s, a, TEMP.FundedResearchProject
예제 #3
0
    def added(self):
        """Backfill award_number, modality, and organ in ``data['meta']``.

        Values missing from the extracted metadata are lifted from manual
        curation sources (``self.lifters``).  FMA organ identifiers are
        remapped to the UBERON term with the same label.
        """
        data = super().added

        def _uberonize(o):
            # one place for the FMA -> UBERON label-based remap that was
            # previously duplicated in both organ branches below
            o = OntId(o)
            if o.prefix == 'FMA':
                ot = OntTerm(o)
                o = next(OntTerm.query(label=ot.label,
                                       prefix='UBERON')).OntTerm

            return o

        # FIXME conditional lifts ...
        if 'award_number' not in data['meta']:
            am = self.lifters.award_manual
            if am:
                data['meta']['award_number'] = am

        if 'modality' not in data['meta']:
            m = self.lifters.modality
            if m:
                data['meta']['modality'] = m

        if 'organ' not in data['meta']:
            # organ can sometimes be derived from the award number
            if 'award_number' in data['meta']:
                an = data['meta']['award_number']
                o = self.lifters.organ(an)
                if o:
                    if o != 'othertargets':
                        o = _uberonize(o)

                    # 'othertargets' is stored verbatim as a sentinel
                    data['meta']['organ'] = o

        if ('organ' not in data['meta']
                or data['meta']['organ'] == 'othertargets'):
            o = self.lifters.organ_term
            if o:
                if isinstance(o, str):
                    o = o,

                data['meta']['organ'] = tuple(_uberonize(_o) for _o in o)

        return data
예제 #4
0
    def triples_protcur(self, protocol_subject):
        """Yield rdf triples linking a protocol to the UBERON anatomical
        regions mentioned in its protc:input annotations, plus an
        annotation-count triple for the protocol itself."""
        # XXX deprecated and extremely slow
        # triples from the protcur pipeline are exported only once as part of
        # protcur.ttl
        ps = list(self._protcur(str(protocol_subject)))
        # extract the UBERON curie from the string form of each annotation;
        # NOTE(review): fragile string surgery — assumes the curie is the
        # first space-delimited token after 'UBERON:'
        anatomy = [(p,
                    OntId('UBERON:' +
                          str(p).split('UBERON:', 1)[-1].split(' ', 1)[0]))
                   for p in ps if p.astType == 'protc:input'
                   and '(protc:input (term UBERON' in str(p)]
        #breakpoint()
        dataset_subject = rdflib.URIRef(self.uri_api)
        yield protocol_subject, TEMP.hasNumberOfProtcurAnnotations, rdflib.Literal(
            len(ps))
        done = set()  # emit each anatomical term at most once
        for anno, term in anatomy:
            if term in done:
                continue

            done.add(term)
            yield from OntTerm(term).triples_simple
            o = term.u
            t = dataset_subject, TEMP.involvesAnatomicalRegion, o
            sl = rdflib.URIRef(anno.shareLink)
            # keep the raw annotation value only when it differs from the id
            av = (((ilxtr.annotationValue, rdflib.Literal(anno.value)), )
                  if anno.value != o else tuple())
            notes = [(ilxtr.curatorNote, rdflib.Literal(n))
                     for n in anno.curatorNotes]
            prov = [(ilxtr.hasAnnotation, sl)]
            yield t
            # reify t with the value/notes/provenance as owl annotations
            yield from cmb.annotation(t, *av, *notes, *prov)()
예제 #5
0
    def process_dict(self, dict_):
        """Deal with multiple fields: combine paired unit/value columns into
        quantities and normalize 'gender' to 'sex' for non-human species."""
        out = {key: value
               for key, value in dict_.items()
               if key not in self.skip}

        for unit_key, value_key in zip(self.h_unit, self.h_value):
            # null cells are dropped upstream, so a missing value here
            # simply means there is nothing to combine
            if value_key not in dict_:
                continue

            raw = dict_[value_key]
            if isinstance(raw, str):
                try:
                    raw = ast.literal_eval(raw)
                except ValueError as e:
                    raise exc.UnhandledTypeError(
                        f'{value_key} {raw!r} was not parsed!') from e

            # attach the units from the paired column to the numeric value
            out[value_key] = raw * pyru.ur.parse_units(dict_[unit_key])

        # 'gender' only applies to humans (NCBITaxon:9606); everything else
        # gets 'sex'
        if 'gender' in out and 'species' in out:
            if out['species'] != OntTerm('NCBITaxon:9606'):
                out['sex'] = out.pop('gender')

        return out
예제 #6
0
 def normv(v):
     """Normalize a value for cell-oriented export.

     Containers recurse; iri-like strings and URIRefs are reloaded as
     OntTerms and rendered via asCell; expressions, quantities, and paths
     stringify; streams render as cells; exceptions repr; anything else
     passes through unchanged.
     """
     if is_list_or_tuple(v):
         return [normv(_) for _ in v]
     elif isinstance(v, dict):
         return {k: normv(v) for k, v in v.items()}
     elif isinstance(v, str) and v.startswith('http'):
         # a serialized iri coming back in from json rather than from the
         # internal representation; reload it as a term
         # probably better to centralize the reload ...
         return OntTerm(v).asCell()
     elif isinstance(
             v,
             rdflib.URIRef):  # FIXME why is this getting converted early?
         return OntTerm(v).asCell()
     elif isinstance(v, (ProtcurExpression, Quantity)):
         return str(v)  # FIXME for xml?
     elif isinstance(v, pathlib.Path):
         return str(v)
     elif isinstance(v, idlib.Stream):
         return v.asCell()
     elif isinstance(v, BaseException):
         return repr(v)
     else:
         return v
예제 #7
0
 def _update_technique(self, cell):
     """Replace a technique cell's free text with an ontology-term hyperlink
     when a term with that label can be found."""
     # NOTE some rows won't update if the dataset no longer exists
     label = cell.value
     if not label:
         return

     try:
         match = next(OntTerm.query(label=label))
         cell.value = match.asCellHyperlink()
     except StopIteration:
         # no match; leave the cell untouched and just record the miss
         log.info(f'no term for technique {label}')
예제 #8
0
    def map(self, anno):
        """Map an annotation to an OntTerm, preferring an explicit manual
        mapping over the automatically derived one; returns None when the
        manual mapping is malformed or no mapping applies."""
        row = self._annotation_row(anno)
        mapping_ok = row.mapping_ok().value == 'TRUE'  # FIXME
        not_input = row.not_input_().value
        # NOTE(review): read but never used below — confirm whether this
        # accessor call has side effects or can be dropped
        bad_for_mapping = row.bad_for_mapping_().value
        manual_mapping = row.manual_mapping().value
        # NOTE(review): this branch is a no-op; looks like a placeholder or
        # debugging leftover — confirm before removing
        if mapping_ok and not not_input:
            pass

        if manual_mapping and ' ' in manual_mapping:
            # a space means the manual mapping is not a single curie/iri
            log.error(
                f'Why does a manual mapping have a space in it {manual_mapping!r}'
            )

        elif manual_mapping:
            return OntTerm(manual_mapping)

        elif mapping_ok:  # FIXME anno.astValue can drift from auto_mapping
            # this is so hilariously inefficient, we parse the same stuff
            # 3 times or something
            return OntTerm(anno.asPython().asPython().black_box.curie)
예제 #9
0
        def normv(v):
            """Flatten a metadata value into a single tab/newline-free string
            suitable for one spreadsheet cell."""
            if isinstance(v, str) and v.startswith('http'):
                # a uri serialized into json rather than our internal
                # representation; reload it, but only render prefixes we
                # actually want as full terms
                oid = OntId(v)
                if oid.prefix in want_prefixes:
                    return OntTerm(v).tabular()
                else:
                    return oid.iri

            if isinstance(v, OntId):
                if not isinstance(v, OntTerm):
                    v = OntTerm(v)

                v = v.tabular()
            if isinstance(v, (list, tuple)):
                # dicts embed as json, everything else recurses
                joined = ','.join(json.dumps(cell, cls=JEncode)
                                  if isinstance(cell, dict)
                                  else normv(cell)
                                  for cell in v)
                v = joined.replace('\n', ' ').replace('\t', ' ')
            elif isinstance(v, (int, float, str)):
                # FIXME tests to catch this
                v = str(v).replace('\n', ' ').replace('\t', ' ')
            elif isinstance(v, dict):
                v = json.dumps(v, cls=JEncode)

            return v
예제 #10
0
        def yield_from_id(s, matid, predicate=predicate):
            # Recursively yield triples describing material matid (typed as
            # an owl:Class with a label) linked to subject s via predicate,
            # then walk its constituent materials.
            mat = rm[matid]
            if 'external' in mat:
                mat_s = OntTerm(mat['external'][0])
                yield s, predicate, mat_s.u
                yield mat_s.u, a, owl.Class
                yield mat_s.u, rdfs.label, rdflib.Literal(mat_s.label)
                # NOTE(review): constituents are only walked when the parent
                # has an external id, and the recursion passes the OntTerm
                # itself (mat_s, not mat_s.u) as the next subject — confirm
                # both are intended
                if 'materials' in mat:
                    for submat_id in mat['materials']:
                        yield from yield_from_id(mat_s, submat_id, TEMP.hasConstituent)

            else:
                # material without an external identifier cannot be linked
                log.warning(f'no external id for {mat}')
예제 #11
0
    def query(value, prefix):
        """Return the first ontology term matching *value* under *prefix*,
        trying an exact term match before a search; falls back to returning
        the input value when nothing matches."""
        for query_type in ('term', 'search'):
            kwargs = {query_type: value}
            matches = [r.OntTerm
                       for r in OntTerm.query(prefix=prefix, **kwargs)]
            if matches:
                # additional matches beyond the first are ignored
                return matches[0]

        log.warning(f'No ontology id found for {value}')
        return value
예제 #12
0
    def normv(v):
        """Normalize a single metadata value for export.

        Containers recurse, iri-like strings and URIRefs become OntTerm
        cells, expressions/quantities/paths stringify, streams render as
        cells, exceptions repr, and anything else passes through unchanged.
        """
        if is_list_or_tuple(v):
            return [normv(_) for _ in v]
        elif isinstance(v, dict):
            return {k:normv(v) for k, v in v.items()}
        elif isinstance(v, str) and v.startswith('http'):
            # needed for loading from json that has been serialized
            # rather than from our internal representation
            # probably better to centralized the reload ...

            # XXX NOTE these days this will only happen if someone
            # supplies us with a uri in a field where we aren't
            # expecting one, in which case we should just return it
            try:
                v = OntTerm(v)
                return v.asCell()
            except Exception as e:
                # best effort: an unloadable uri is passed through unchanged
                loge.error(f'something went wrong with {v}')
                loge.exception(e)
                return v
                #raise e
        elif isinstance(v, rdflib.URIRef):  # FIXME why is this getting converted early?
            ot = OntTerm(v)
            return ot.asCell()
        elif isinstance(v, ProtcurExpression):
            return str(v)  # FIXME for xml?
        elif isinstance(v, Quantity):
            return str(v)
        elif isinstance(v, AsJson):  # XXX returns value not tested, may be extremely strange
            return str(v)
        elif isinstance(v, pathlib.Path):
            return str(v)
        elif isinstance(v, idlib.Stream):
            return v.asCell()
        #elif isinstance(v, list) or isinstance(v, str):
            #return v
        elif isinstance(v, BaseException):
            return repr(v)
        else:
            #loge.debug(repr(v))
            return v
예제 #13
0
        def normv(v):
            """Render a value for tabular output: iri strings and URIRefs
            become term cells, expressions and quantities stringify, and
            everything else passes through unchanged."""
            if isinstance(v, str) and v.startswith('http'):
                # a serialized iri coming back in from json rather than
                # from the internal representation; reload it as a term
                return OntTerm(v).tabular()

            if isinstance(v, rdflib.URIRef):  # FIXME why is this getting converted early?
                return OntTerm(v).tabular()
            if isinstance(v, (Expr, Quantity)):
                return str(v)  # FIXME for xml?
            return v
예제 #14
0
    def _term(self):
        if not hasattr(self, '_c_term'):
            self._c_term = OntTerm(self.id)

        return self._c_term
예제 #15
0
    def disco(self):
        """Flatten all dataset blobs into tabular exports.

        Returns a tuple of (name, rows) pairs for datasets, contributors,
        subjects, resources, and errors; the first row of each table is its
        header.  Dataset rows include is_about / involves_anatomical_region
        columns derived from the exported rdf graph.
        """
        #dsh = sorted(MetaOutSchema.schema['allOf'][0]['properties'])
        # column order for the datasets sheet (meta fields plus derived ones)
        dsh = [
            'acknowledgements',
            'additional_links',
            'award_number',
            'completeness_of_data_set',
            'contributor_count',
            'description',
            'dirs',
            'errors',
            'examples',
            'files',
            'funding',
            'keywords',
            'links',
            'modality',
            'name',  # -> title
            'organ',
            'originating_article_doi',
            'principal_investigator',
            'prior_batch_number',
            'protocol_url_or_doi',
            'sample_count',
            'size',
            'species',
            'subject_count',
            'title_for_complete_data_set',
            'uri_api',
            'uri_human',
            'error_index',  # (sum *_index)
            'dataset_completeness_index',  # dead
            'is_about',
            'involves_anatomical_region',
            'title',
            'folder_name',
        ]
        # column order for the contributors sheet
        chs = [
            'contributor_affiliation',
            'contributor_orcid_id',
            'contributor_role',
            'is_contact_person',
            'name',
            'first_name',
            'last_name',
            'middle_name',
            'id',
            'blackfynn_user_id',
        ]

        # each table starts with its header row
        datasets = [['id', 'submission_index', 'curation_index'] + dsh]
        contributors = [['id'] + chs]
        subjects = [['id', 'blob']]
        errors = [['id', 'blob']]
        resources = [['id', 'blob']]

        #cje = JEncode()
        def normv(v):
            # flatten a metadata value to a single tab/newline-free string
            if isinstance(v, str) and v.startswith('http'):
                # needed for loading from json that has been serialized
                # rather than from our internal representation
                # probably better to centralized the reload ...
                oid = OntId(v)
                if oid.prefix in want_prefixes:
                    return OntTerm(v).tabular()
                else:
                    return oid.iri

            if isinstance(v, OntId):
                if not isinstance(v, OntTerm):
                    v = OntTerm(v)

                v = v.tabular()
            if isinstance(v, list) or isinstance(v, tuple):
                v = ','.join(
                    json.dumps(_, cls=JEncode) if isinstance(_, dict
                                                             ) else normv(_)
                    for _ in v)
                v = v.replace('\n', ' ').replace('\t', ' ')
            elif any(isinstance(v, c) for c in (int, float, str)):
                v = str(v)
                v = v.replace('\n',
                              ' ').replace('\t',
                                           ' ')  # FIXME tests to catch this
 
            elif isinstance(v, dict):
                v = json.dumps(v, cls=JEncode)

            return v

        for dataset_blob in self:
            id = dataset_blob['id']
            dowe = dataset_blob
            # export the dataset to rdf so we can pull derived relations
            graph = rdflib.Graph()
            TriplesExportDataset(dataset_blob).populate(graph)
            is_about = [
                OntTerm(o) for s, o in graph[:isAbout:]
                if isinstance(o, rdflib.URIRef)
            ]
            involves = [
                OntTerm(o) for s, o in graph[:TEMP.involvesAnatomicalRegion:]
            ]

            inv = ','.join(i.tabular() for i in involves)
            ia = ','.join(a.tabular() for a in is_about)
            #row = [id, dowe['error_index'], dowe['submission_completeness_index']]  # FIXME this doubles up on the row
            row = [
                id, dowe['status']['submission_index'],
                dowe['status']['curation_index']
            ]  # FIXME this doubles up on the row
            if 'meta' in dowe:
                meta = dowe['meta']
                for k in dsh:
                    if k in meta:
                        v = meta[k]
                        v = normv(v)
                    elif k == 'is_about':
                        v = ia
                    elif k == 'involves_anatomical_region':
                        v = inv
                    else:
                        v = None

                    row.append(v)

            else:
                # no meta at all: pad the row so columns stay aligned
                row += [None for k in sc.MetaOutSchema.schema['properties']]

            datasets.append(row)

            # contribs
            if 'contributors' in dowe:
                cs = dowe['contributors']
                for c in cs:
                    row = [id]
                    for k in chs:
                        if k in c:
                            v = c[k]
                            v = normv(v)
                            row.append(v)
                        else:
                            row.append(None)

                    contributors.append(row)

            if 'subjects' in dowe:
                for subject in dowe['subjects']:
                    row = [id]
                    row.append(json.dumps(subject, cls=JEncode))
                    subjects.append(row)

                # moved to resources if exists already
                #if 'software' in sbs:
                #for software in sbs['software']:
                #row = [id]
                #row.append(json.dumps(software, cls=JEncode))
                #resources.append(row)

            if 'resources' in dowe:
                for res in dowe['resources']:
                    row = [id]
                    row.append(json.dumps(res, cls=JEncode))
                    resources.append(row)

            if 'errors' in dowe:
                ers = get_all_errors(dowe)
                for er in ers:
                    row = [id]
                    row.append(json.dumps(er, cls=JEncode))
                    errors.append(row)

        # TODO samples resources
        return (('datasets', datasets), ('contributors', contributors),
                ('subjects', subjects), ('resources', resources), ('errors',
                                                                   errors))
예제 #16
0
 def map(self, anno):
     """Map an annotation row to a pair of terms.

     Returns (ontology term with its curated label, interlex term).

     Bug fix: the second OntTerm was constructed from the undefined name
     ``interlex_curie`` (a NameError at runtime); it now uses the
     ``ilx_curie`` value actually read from the row.
     """
     row = self._annotation_row(anno)
     oid = row.ontology_id().value
     label = row.ontology_label().value
     ilx_curie = row.interlex_id().value
     return OntTerm(oid, label=label), OntTerm(ilx_curie)
예제 #17
0
    def triples_gen(self):
        """Yield rdf triples describing an ApiNATOMY resource map.

        Walks the model's links, emitting advective/diffusive connectivity
        between externally-identified source/target nodes, conveying-lyph
        classes with labels and materials, and (for diffusive links)
        named individuals for links, lyphs, and coalescences.
        """
        rm = self._source

        # FIXME there doesn't seem to be a section that tells me the name
        # of top level model so I have to know its name beforhand
        # the id is in the model, having the id in the resource map
        # prevents issues if these things get sent decoupled
        id = rm['id']
        mid = id.replace(' ', '-')

        links = rm[id]['links']
        #linknodes = [n for n in rm[id]['nodes'] if n['class'] == 'Link']  # visible confusion

        # st / from_to carry state across iterations so that partially
        # resolved endpoint pairs can be completed by a later link
        st = []
        from_to = []
        ot = None
        yield from self.apinatbase()
        for link in links:
            if 'conveyingType' in link:
                # pick the predicate family for this conveying type
                if link['conveyingType'] == 'ADVECTIVE':
                    p_is = TEMP.isAdvectivelyConnectedTo
                    p_from = TEMP.advectivelyConnectsFrom
                    p_to = TEMP.advectivelyConnectsTo
                    p_cmat = TEMP.advectivelyConnectsMaterial
                    diffusive = False
                elif link['conveyingType'] == 'DIFFUSIVE':
                    p_is = TEMP.isDiffusivelyConnectedTo
                    p_from = TEMP.diffusivelyConnectsFrom
                    p_to = TEMP.diffusivelyConnectsTo
                    p_cmat = TEMP.diffusivelyConnectsMaterial
                    diffusive = True
                else:
                    log.critical(f'unhandled conveying type {link}')
                    continue

                source = link['source']
                target = link['target']
                ok = True
                if len(from_to) == 2:  # otherwise
                    # previous pair fully resolved; start a fresh one
                    st = []
                    from_to = []
                for i, e in enumerate((source, target)):
                    ed = rm[e]
                    if 'external' not in ed:
                        if not i and from_to:
                            # TODO make sure the intermediate ids match
                            pass
                        else:
                            ok = False
                            break
                    else:
                        st.append(e)
                        from_to.append(OntId(ed['external'][0]))

                conveying = link['conveyingLyph']
                cd = rm[conveying]
                if 'external' in cd:
                    old_ot = ot
                    ot = OntTerm(cd['external'][0])
                    yield ot.u, rdf.type, owl.Class
                    yield ot.u, TEMP.internalId, rdflib.Literal(conveying)
                    yield ot.u, rdfs.label, rdflib.Literal(ot.label)

                    yield from self.materialTriples(
                        ot.u, link, p_cmat)  # FIXME locate this correctly

                    if ok:
                        u, d = from_to
                        if st[0] == source:
                            yield u, rdfs.label, rdflib.Literal(
                                OntTerm(u).label)
                            yield u, rdf.type, owl.Class
                            yield from cmb.restriction.serialize(
                                ot.u, p_from, u)

                        if st[1] == target:
                            yield d, rdfs.label, rdflib.Literal(
                                OntTerm(d).label)
                            yield d, rdf.type, owl.Class
                            yield from cmb.restriction.serialize(ot.u, p_to, d)

                    # chain consecutive conveying lyphs together
                    if old_ot is not None and old_ot != ot:
                        yield from cmb.restriction.serialize(
                            ot.u, p_from, old_ot.u)

                if diffusive:
                    # we can try to hack this using named individuals
                    # but it is not going to do exactly what is desired
                    s_link = TEMP[f'ApiNATOMY/{mid}/{link["id"]}']
                    s_cd = TEMP[f'ApiNATOMY/{mid}/{cd["id"]}']
                    yield s_link, rdf.type, owl.NamedIndividual
                    yield s_link, rdf.type, TEMP.diffusiveLink  # FIXME I'm not sure these go in the model ...
                    yield s_cd, rdf.type, owl.NamedIndividual
                    if 'external' in cd and cd['external']:
                        oid = OntId(cd['external'][0])
                        yield s_cd, rdf.type, oid.u
                        ot = oid.asTerm()
                        if ot.label:
                            yield oid.u, rdfs.label, ot.label

                    else:
                        # no external id: type as a bare conveying lyph and
                        # walk the coalescences it participates in
                        yield s_cd, rdf.type, TEMP.conveyingLyph
                        for icd in cd['inCoalescences']:
                            dcd = rm[icd]
                            log.info(lj(dcd))
                            s_icd = TEMP[f'ApiNATOMY/{mid}/{dcd["id"]}']
                            yield s_cd, TEMP.partOfCoalescence, s_icd
                            yield s_icd, rdf.type, owl.NamedIndividual
                            yield s_icd, rdf.type, TEMP[
                                'ApiNATOMY/Coalescence']
                            if 'external' in dcd and dcd['external']:
                                oid = OntId(dcd['external'][0])
                                yield s_icd, rdf.type, oid.u
                                ot = oid.asTerm()
                                if ot.label:
                                    yield oid.u, rdfs.label, ot.label

                            for lyphid in dcd['lyphs']:
                                ild = rm[lyphid]
                                log.info(lj(ild))
                                if 'external' in ild and ild['external']:
                                    yield s_icd, TEMP.hasLyphWithMaterial, OntId(
                                        ild['external'][0])

                if not ok:
                    # endpoints lacked external ids; skip connectivity
                    logd.info(f'{source} {target} issue')
                    continue

                for inid, e in zip(st, from_to):
                    yield e.u, rdf.type, owl.Class
                    yield e.u, rdfs.label, rdflib.Literal(OntTerm(e).label)
                    yield e.u, TEMP.internalId, rdflib.Literal(inid)

                f, t = from_to
                yield from cmb.restriction.serialize(f.u, p_is, t.u)
예제 #18
0
    def added(self):
        """Backfill award_number, modality, organ, and protocol urls in
        ``data['meta']`` from manual curation sources (self.lifters)."""
        data = super().added
        # NOTE(review): leftover debug trap — breakpoint() halts execution
        # when meta collapsed to just empty techniques; confirm whether this
        # should be an error or be removed
        if data['meta'] == {'techniques': []}:
            breakpoint()

        # FIXME conditional lifts ...
        if 'award_number' not in data['meta']:
            am = self.lifters.award_manual
            if am:
                data['meta']['award_number'] = am

        if 'modality' not in data['meta']:
            m = self.lifters.modality
            if m:
                data['meta']['modality'] = m

        # intentionally disabled branch (see comment below)
        if False and 'organ' not in data['meta']:
            # skip here, now attached directly to award
            if 'award_number' in data['meta']:
                an = data['meta']['award_number']
                o = self.lifters.organ(an)
                if o:
                    if o != 'othertargets':
                        o = OntId(o)
                        if o.prefix == 'FMA':
                            ot = OntTerm(o)
                            o = next(
                                OntTerm.query(label=ot.label, prefix='UBERON'))

                    data['meta']['organ'] = o

        if 'organ' not in data['meta'] or data['meta'][
                'organ'] == 'othertargets':
            o = self.lifters.organ_term
            if o:
                if isinstance(o, str):
                    o = o,

                out = tuple()
                for _o in o:
                    _o = OntId(_o)
                    # map FMA ids to the UBERON term with the same label
                    if _o.prefix == 'FMA':
                        ot = OntTerm(_o)
                        _o = next(
                            OntTerm.query(label=ot.label, prefix='UBERON'))

                    out += (_o, )

                data['meta']['organ'] = out

        if 'protocol_url_or_doi' not in data['meta']:
            if self.lifters.protocol_uris:
                data['meta']['protocol_url_or_doi'] = tuple(
                    self.lifters.protocol_uris)

        else:
            if not isinstance(data['meta']['protocol_url_or_doi'], tuple):
                # a non-tuple here should already have been recorded as an
                # error by an earlier pipeline stage; if not, something is
                # badly wrong
                _test_path = deque(['meta', 'protocol_url_or_doi'])
                if not [e for e in data['errors'] if e['path'] == _test_path]:
                    raise ext.ShouldNotHappenError('urg')

            else:
                # merge lifted uris with the existing ones, deduplicated
                data['meta']['protocol_url_or_doi'] += tuple(
                    self.lifters.protocol_uris)
                data['meta']['protocol_url_or_doi'] = tuple(
                    sorted(set(data['meta']['protocol_url_or_doi'])))  # ick

        return data
예제 #19
0
    def added(self):
        """Backfill meta fields (award_number, modality, organ, protocol
        urls, doi) and platform status from manual curation sources and the
        remote object."""
        data = super().added
        # NOTE(review): leftover debug trap — breakpoint() halts execution
        # when meta collapsed to just empty techniques; confirm whether this
        # should be an error or be removed
        if data['meta'] == {'techniques': []}:
            breakpoint()

        # FIXME conditional lifts ...
        if 'award_number' not in data['meta']:
            am = self.lifters.award_manual
            if am:
                data['meta']['award_number'] = am

        if 'modality' not in data['meta']:
            m = self.lifters.modality
            if m:
                data['meta']['modality'] = m

        # intentionally disabled branch (see comment below)
        if False and 'organ' not in data['meta']:
            # skip here, now attached directly to award
            if 'award_number' in data['meta']:
                an = data['meta']['award_number']
                o = self.lifters.organ(an)
                if o:
                    if o != 'othertargets':
                        o = OntId(o)
                        if o.prefix == 'FMA':
                            ot = OntTerm(o)
                            o = next(OntTerm.query(label=ot.label, prefix='UBERON'))

                    data['meta']['organ'] = o

        if 'organ' not in data['meta'] or data['meta']['organ'] == 'othertargets':
            o = self.lifters.organ_term
            if o:
                if isinstance(o, str):
                    o = o,

                out = tuple()
                for _o in o:
                    _o = OntId(_o)
                    # map FMA ids to the UBERON term with the same label
                    if _o.prefix == 'FMA':
                        ot = OntTerm(_o)
                        _o = next(OntTerm.query(label=ot.label, prefix='UBERON'))

                    out += (_o,)

                data['meta']['organ'] = out

        if 'protocol_url_or_doi' not in data['meta']:
            if self.lifters.protocol_uris:
                data['meta']['protocol_url_or_doi'] = tuple(self.lifters.protocol_uris)

        else:
            if not isinstance(data['meta']['protocol_url_or_doi'], tuple):
                # a non-tuple here should already have been recorded as an
                # error by an earlier pipeline stage
                _test_path = deque(['meta', 'protocol_url_or_doi'])
                if not [e for e in data['errors']
                        if 'path' in e and e['path'] == _test_path]:
                    raise ext.ShouldNotHappenError('urg')

            else:
                # merge lifted uris with the existing ones, deduplicated
                data['meta']['protocol_url_or_doi'] += tuple(self.lifters.protocol_uris)
                data['meta']['protocol_url_or_doi'] = tuple(sorted(set(data['meta']['protocol_url_or_doi'])))  # ick


        # FIXME this is a really bad way to do this :/ maybe stick the folder in data['prov'] ?
        # and indeed, when we added PipelineStart this shifted and broke everything
        local = (self
                 .previous_pipeline.pipelines[0]
                 .previous_pipeline.pipelines[0]
                 .previous_pipeline.pipelines[0]
                 .path)
        remote = local.remote
        if 'doi' not in data['meta']:
            doi = remote.doi
            if doi is not None:
                try:
                    # only record the doi when its metadata resolves
                    metadata = doi.metadata()
                    if metadata is not None:
                        data['meta']['doi'] = doi.identifier
                except requests.exceptions.HTTPError:
                    data['meta']['doi'] = None
                    pass
            else:
                data['meta']['doi'] = None

        if 'status' not in data:
            data['status'] = {}

        if 'status_on_platform' not in data['status']:
            data['status']['status_on_platform'] = remote.bfobject.status

        return data