示例#1
0
def _json_identifier_expansion(obj, *args, **kwargs):
    """Expand an identifier-like object into its dict / literal form.

    ``rdflib.URIRef`` values are wrapped in ``OntId`` and ``oq.OntId``
    values are converted with ``asInstrumented()`` before dispatch.
    Anything that is neither an ``oq.OntTerm`` nor an ``idlib.Stream``
    is returned unchanged.  ``*args`` and ``**kwargs`` are accepted and
    ignored.

    Returns ``None`` when ``asDict()`` on a stream raises
    ``idlib.exc.RemoteError`` (the error is logged, not re-raised).
    """
    if not isinstance(obj, oq.OntTerm):
        # normalize toward an instrumented term where possible
        if isinstance(obj, rdflib.URIRef):
            obj = OntId(obj)

        if isinstance(obj, oq.OntId):
            obj = obj.asInstrumented()

    if isinstance(obj, oq.OntTerm):
        original_class = obj.__class__
        # temporarily masquerade as the local OntTerm so that its asDict
        # is the one that runs; always restore the real class afterwards
        obj.__class__ = OntTerm  # that this works is amazing/terrifying
        try:
            return obj.asDict()
        finally:
            obj.__class__ = original_class

    if not isinstance(obj, idlib.Stream):
        return obj

    if obj._id_class is str:
        return obj.identifier

    try:
        return obj.asDict()
    except idlib.exc.RemoteError as remote_error:
        logd.error(remote_error)

    return None
示例#2
0
    def get(self, uri):
        """GET ``uri`` using ``self._pio_header`` and return the decoded JSON.

        Returns the parsed JSON body when the response is ok.  When the
        response is not ok the error details are logged, recorded through
        ``self.addError`` when the body can be parsed, and ``None`` is
        returned.

        Raises whatever ``resp.json()`` raises when an ok response cannot
        be decoded (the exception is logged first).
        """
        #juri = uri + '.json'
        logd.info(uri)
        log.debug('going to network for protocols')
        resp = requests.get(uri, headers=self._pio_header)
        #log.info(str(resp.request.headers))
        if resp.ok:
            try:
                return resp.json()  # the api is reasonably consistent
            except Exception as e:
                # no breakpoint() here: it would hang non-interactive runs
                log.exception(e)
                raise

        try:
            j = resp.json()
            sc = j['status_code']
            em = j['error_message']
            msg = f'protocol issue {uri} {resp.status_code} {sc} {em} {self.id!r}'
            logd.error(msg)
            self.addError(msg)
            # can't return here because of the cache
        except Exception as e:
            # best effort only: the error body may be missing or malformed,
            # but KeyboardInterrupt/SystemExit must still propagate
            log.exception(e)

        logd.error(f'protocol no access {uri} {self.id!r}')
        return None
示例#3
0
 def autoid_report_error(id, blob):
     """Build ``idlib.Auto(id)``, logging and returning ``None`` if malformed.

     On ``idlib.exc.MalformedIdentifierError`` the failure is reported to
     ``logd`` together with ``blob["id"]``.
     """
     try:
         identifier = idlib.Auto(id)
     except idlib.exc.MalformedIdentifierError:
         logd.error(f'{blob["id"]} bad id: {id}')
         return None

     return identifier
示例#4
0
        def protocol_url_or_doi(self, value):
            """Generate triples for the protocol identified by ``value``.

            ``value`` is handled in one of three ways:

            * an ``idlib.Pio`` is used directly,
            * an ``idlib.Doi`` first has its own ``triples_gen`` triples
              yielded, is then dereferenced as an ``idlib.Pio``, and the
              two subjects are linked with ``TEMP.dereferencesTo`` and
              ``TEMP.hasDoi``,
            * anything else is passed to ``idlib.Pio(...)``.

            After that a ``sparc.Protocol`` type triple is always yielded,
            label and step-count triples are yielded when ``pioid.data()``
            returned something truthy, and finally everything from
            ``self.integrator.triples_protcur(s)``.

            The generator stops early (yielding nothing further) when the
            Doi produced no triples before a ``RemoteError``, or when
            dereferencing the Doi raises ``MalformedIdentifierError``.
            """
            #_, s = self.c.protocol_url_or_doi(value)
            #yield s, rdf.type, owl.NamedIndividual
            #yield s, rdf.type, sparc.Protocol
            log.debug(value)
            if not isinstance(value, idlib.Pio):
                if isinstance(value, idlib.Doi):
                    try:
                        # t keeps the last triple seen so the Doi subject
                        # can be recovered after the loop
                        t = None
                        for t in value.triples_gen:
                            yield t
                    except idlib.exc.RemoteError as e:
                        if t is None:
                            # we already logged this error during id dereferencing
                            return
                        # otherwise continue with the last triple that was
                        # produced before the remote error

                    # NOTE(review): if triples_gen is exhausted without
                    # yielding anything and without raising, t is still
                    # None here and this unpack raises TypeError — confirm
                    # whether an empty triples_gen is possible
                    ds, _, _ = t
                    try:
                        pioid = value.dereference(asType=idlib.Pio)
                        s = self.c.l(pioid)
                        yield ds, TEMP.dereferencesTo, s
                        yield s, TEMP.hasDoi, ds
                    except idlib.exc.MalformedIdentifierError as e:
                        log.warning(e)
                        return
                else:
                    pioid = idlib.Pio(
                        value
                    )  # FIXME :/ should be handled in Pio directly probably?
            else:
                pioid = value

            try:
                # the subject is built from the integer api uri when it is
                # available
                pioid_int = pioid.uri_api_int
                s = self.c.l(pioid_int)
                # FIXME needs to be a pipeline so that we can export errors
                try:
                    data = pioid.data()
                except OntId.BadCurieError as e:
                    loge.error(e)  # FIXME export errors ...
                    data = None
            except idlib.exc.RemoteError as e:  # FIXME sandbox violation
                # fall back to a subject built from pioid itself and emit
                # no data-derived triples
                loge.exception(e)
                s = self.c.l(pioid)
                data = None

            yield s, rdf.type, sparc.Protocol

            if data:
                yield s, rdfs.label, rdflib.Literal(pioid.label)
                nsteps = len(data['steps'])
                yield s, TEMP.protocolHasNumberOfSteps, rdflib.Literal(nsteps)

            try:
                yield from self.integrator.triples_protcur(s)
            except OntId.BadCurieError as e:
                # triples already yielded from triples_protcur stay yielded;
                # the remainder is dropped
                logd.error(e)  # FIXME export errors ...
示例#5
0
 def fetch(id):  # FIXME error proof version ...
     """Return ``id.metadata()`` with ``'id'`` set to ``id.identifier``.

     HTTP errors are logged to ``logd``; connection and SSL errors are
     logged to ``log``.  In both failure cases ``None`` is returned.
     """
     try:
         record = id.metadata()
         record['id'] = id.identifier  # FIXME normalization ...
     except requests.exceptions.HTTPError as http_error:
         logd.error(http_error)
     except (requests.exceptions.ConnectionError,
             requests.exceptions.SSLError) as network_error:
         log.error(network_error)
     else:
         return record

     return None
示例#6
0
 def fetch(id):  # FIXME error proof version ...
     """Return ``id.metadata()`` with ``'id'`` set to ``id`` itself.

     HTTP and remote errors are logged to ``logd``; connection, SSL and
     resolution errors are logged to ``log``.  In both failure cases
     ``None`` is returned.
     """
     try:
         record = id.metadata()
         record['id'] = id
     except (requests.exceptions.HTTPError,
             idlib.exc.RemoteError) as remote_error:
         logd.error(remote_error)
     except (requests.exceptions.ConnectionError,
             requests.exceptions.SSLError,
             idlib.exc.ResolutionError) as network_error:
         log.error(network_error)
     else:
         return record

     return None
示例#7
0
def dereference_all_identifiers(obj, stage, *args, path=None, addError=None, **kwargs):
    """Run ``_json_identifier_expansion`` on ``obj`` and report any failure.

    The expansion result itself is discarded; this function only surfaces
    errors.  ``*args`` and ``**kwargs`` are accepted and ignored.

    Returns:
        * ``None`` when expansion succeeds, or when a failure was handed
          to ``addError``,
        * the result of ``obj._cooldown()`` when a ``RemoteError`` or
          ``ResolutionError`` occurs and ``obj`` has a ``_cooldown``
          attribute,
        * ``{'errors': [error]}`` when a failure occurs and no
          ``addError`` callable was supplied.
    """
    def _report(error_value, blame, logfunc):
        # one place to build the error record so all handlers stay in sync
        error = dict(error=error_value,
                     pipeline_stage=stage.__class__.__name__,
                     blame=blame,
                     # path defaults to None and tuple(None) would raise a
                     # TypeError from inside the handler, hiding the real
                     # error; fall back to an empty path instead
                     path=tuple(path) if path is not None else ())

        if not addError:
            return {'errors': [error]}

        # addError's truthy return is what gates the logging here
        if addError(**error):
            logfunc(error_value)

        return None

    try:
        _json_identifier_expansion(obj)
    except idlib.exc.RemoteError as e:
        if hasattr(obj, '_cooldown'):
            return obj._cooldown()  # trigger cooldown to simplify issues down the line

        return _report(e, 'submission', log.exception)

    except idlib.exc.ResolutionError:
        if hasattr(obj, '_cooldown'):
            return obj._cooldown()  # trigger cooldown to simplify issues down the line

        oops = json_export_type_converter(obj)
        msg = (f'{stage.lifters.id} could not resolve '  # FIXME lifters sigh
               f'{type(obj)}: {oops} {obj.asUri()}')
        return _report(msg, 'submission', logd.error)

    except Exception as e:
        log.critical(f'Unhandled exception {e} in {path}')
        return _report(e, 'stage', log.exception)

    return None
示例#8
0
            def schema_wrapped_property(_self):
                # Call the wrapped function and validate its result against
                # the schema from the enclosing scope.
                #
                # On validation failure: raise the validation error when
                # fail is set, otherwise append the error's json form to
                # data['errors'] and return data.
                # On success: return the normalized value when
                # self.normalize is set, otherwise return data.
                data = function(_self)
                ok, norm_or_error, data = schema.validate(data)
                if not ok:
                    if fail:
                        logd.error(
                            'schema validation has failed and fail=True')
                        # no breakpoint() here: dropping into a debugger
                        # would hang non-interactive runs
                        raise norm_or_error

                    if 'errors' not in data:
                        data['errors'] = []

                    data['errors'] += norm_or_error.json(pipeline_stage_name)
                    # TODO make sure the step is noted even if the schema is the same
                elif self.normalize:
                    return norm_or_error

                return data
示例#9
0
    def _protocol_uris_resolved(self):
        """Yield the dereferenced form of each entry in ``self.protocol_uris``.

        Entries without a ``dereference`` attribute are wrapped in
        ``idlib.StreamUri`` first.  After each yield the status code of
        the result's progenitor is checked and an error is recorded via
        ``self.addError`` for any value of 400 or above.

        Failures for one uri never stop iteration over the rest:
        resolution errors are skipped, missing-schema and bad-curie errors
        are recorded with ``blame='submission'``, and anything else is
        logged.
        """
        # FIXME quite slow ...
        for start_uri in self.protocol_uris:
            log.debug(start_uri)
            try:
                if not hasattr(start_uri, 'dereference'):
                    start_uri = idlib.StreamUri(start_uri)

                end_uri = start_uri.dereference()
                yield end_uri
                sc = end_uri.progenitor.status_code
                # 400 itself is an error status too, so the bound is inclusive
                if sc >= 400:
                    msg = f'error accessing {end_uri} {sc}'
                    if self.addError(msg, blame='submission'):
                        logd.error(msg)

            except idlib.exceptions.ResolutionError:
                pass  # FIXME I think we already log this error?
            except self._MissingSchema as e:
                if self.addError(e, blame='submission'):
                    logd.error(e)
            except OntId.BadCurieError as e:
                if self.addError(e, blame='submission'):
                    logd.error(e)
            except Exception as e:
                # Exception, not BaseException: the yield above sits inside
                # this try, so catching GeneratorExit would make the
                # generator keep running after close() and raise
                # RuntimeError; KeyboardInterrupt must propagate as well
                log.exception(e)
                log.critical('see exception above')
示例#10
0
            def schema_wrapped_property(_self, *args, **kwargs):
                # Call the wrapped function with the caller's arguments and
                # validate its result against the schema from the enclosing
                # scope.
                #
                # On validation failure: raise the validation error when
                # fail is set, otherwise append the error's json form to
                # data['errors'] and return data.
                # On success: return the normalized value when
                # self.normalize is set, otherwise return data.
                data = function(_self, *args, **kwargs)
                ok, norm_or_error, data = schema.validate(data)
                if not ok:
                    if fail:
                        logd.error('schema validation has failed and fail=True')
                        raise norm_or_error

                    try:
                        if 'errors' not in data:
                            data['errors'] = []
                    except Exception as e:
                        # data did not behave like a mapping; name the
                        # producer so the failure can be traced.  Exception
                        # rather than BaseException so KeyboardInterrupt and
                        # SystemExit are not rewrapped.
                        raise exc.SparCurError(
                            f'Error from {_self.__class__.__name__}.'
                            f'{function.__name__}') from e

                    data['errors'] += norm_or_error.json(pipeline_stage_name)
                    # TODO make sure the step is noted even if the schema is the same
                elif self.normalize:
                    return norm_or_error

                return data
示例#11
0
        def protocol_url_or_doi(self, value):
            """Generate triples for the protocol identified by ``value``.

            Always yields the ``owl.NamedIndividual`` and ``sparc.Protocol``
            type triples for the subject.  When protocol data could be
            loaded, the title is yielded as ``rdfs.label`` along with the
            number of steps.  Finishes with everything from
            ``self.integrator.triples_protcur``; a ``BadCurieError`` from
            either lookup is logged instead of raised.
            """
            _, subject = self.c.protocol_url_or_doi(value)
            for rdf_class in (owl.NamedIndividual, sparc.Protocol):
                yield subject, rdf.type, rdf_class

            loader = ProtocolData(self.integrator.id)
            # FIXME needs to be a pipeline so that we can export errors
            try:
                # FIXME a bit opaque, needs to move to a pipeline, clean up init etc.
                blob = loader(value)
            except OntId.BadCurieError as curie_error:
                logd.error(curie_error)  # FIXME export errors ...
                blob = None

            if blob:
                yield subject, rdfs.label, rdflib.Literal(blob['protocol']['title'])
                # the step count is looked up only after the label was consumed
                yield (subject,
                       TEMP.protocolHasNumberOfSteps,
                       rdflib.Literal(len(blob['protocol']['steps'])))

            try:
                yield from self.integrator.triples_protcur(subject)
            except OntId.BadCurieError as curie_error:
                logd.error(curie_error)  # FIXME export errors ...
示例#12
0
    def _get_protocol_json(self, uri):
        """Fetch the protocols.io api v3 JSON for the protocol at ``uri``.

        ``uri`` is normalized with ``get_right_id``; when the result does
        not contain ``'protocols.io'`` an error is logged and recorded via
        ``self.addError`` and ``None`` is returned without any request.

        Returns the parsed JSON body when the response is ok.  When the
        response is not ok the error details are logged, recorded through
        ``self.addError`` when the body can be parsed, and ``None`` is
        returned.

        Raises whatever ``resp.json()`` raises when an ok response cannot
        be decoded (the exception is logged first).
        """
        #juri = uri + '.json'
        logd.info(uri)
        pi = get_right_id(uri)
        if 'protocols.io' in pi:
            pioid = pi.slug  # FIXME normalize before we ever get here ...
            log.info(pioid)
        else:
            msg = f'protocol uri is not from protocols.io {pi} {self.id}'
            logd.error(msg)
            self.addError(msg)
            return

        #uri_path = uri.rsplit('/', 1)[-1]
        apiuri = 'https://protocols.io/api/v3/protocols/' + pioid
        #'https://www.protocols.io/api/v3/groups/sparc/protocols'
        #apiuri = 'https://www.protocols.io/api/v3/filemanager/folders?top'
        #print(apiuri, header)
        log.debug('going to network for protocols')
        resp = requests.get(apiuri, headers=self._pio_header)
        #log.info(str(resp.request.headers))
        if resp.ok:
            try:
                return resp.json()  # the api is reasonably consistent
            except Exception as e:
                # no breakpoint() here: it would hang non-interactive runs
                log.exception(e)
                raise

        try:
            j = resp.json()
            sc = j['status_code']
            em = j['error_message']
            msg = f'protocol issue {uri} {resp.status_code} {sc} {em} {self.id!r}'
            logd.error(msg)
            self.addError(msg)
            # can't return here because of the cache
        except Exception as e:
            # best effort only: the error body may be missing or malformed,
            # but KeyboardInterrupt/SystemExit must still propagate
            log.exception(e)

        logd.error(f'protocol no access {uri} {self.id!r}')
        return None
示例#13
0
    def validate_path_json_metadata(cls, path_meta_blob):
        """Assign a ``'status'`` to every entry of ``path_meta_blob['data']``.

        Each entry is mutated in place and ends up with one of:

        * ``'banned'`` when its basename is in ``cls._banned_basenames``,
        * the value from the mimetype lookup for its mimetype (which may
          itself be ``'banned'``),
        * ``'known'`` when its mimetype is absent from the lookup,
        * ``'unknown'`` when no mimetype could be determined.

        ``magic_mimetype`` is preferred over ``mimetype`` when both are
        present, and a mismatch between the two is recorded as an error.
        Problems are collected in a ``HasErrors`` instance and embedded
        into ``path_meta_blob`` at the end if any were recorded.  Returns
        ``None``.
        """
        from sparcur.core import HasErrors  # FIXME
        he = HasErrors(pipeline_stage=cls.__name__ +
                       '.validate_path_json_metadata')
        # only the mimetype lookup is needed here, the suffix lookup is not
        mimetypes, _ = cls._file_type_status_lookup(
        )  # SIGH this overhead is 2 function calls and a branch
        for i, path_meta in enumerate(path_meta_blob['data']):
            if path_meta['basename'] in cls._banned_basenames:
                msg = f'illegal file detect {path_meta["basename"]}'
                dsrp = path_meta['dataset_relative_path']
                if he.addError(msg, path=dsrp, json_path=('data', i)):
                    logd.error(msg)
                status = 'banned'
                path_meta['status'] = status
                continue

            if 'magic_mimetype' in path_meta and 'mimetype' in path_meta:
                # FIXME NOT clear whether magic_mimetype should be used by itself
                # usually magic and file extension together work, magic by itself
                # can give some completely bonkers results
                source = 'magic_mimetype'
                mimetype = path_meta['magic_mimetype']
                muggle_mimetype = path_meta['mimetype']
                if mimetype != muggle_mimetype:
                    msg = f'mime types do not match {mimetype} != {muggle_mimetype}'
                    dsrp = path_meta['dataset_relative_path']
                    if he.addError(msg, path=dsrp, json_path=('data', i)):
                        log.error(msg)
            elif 'magic_mimetype' in path_meta:
                source = 'magic_mimetype'
                mimetype = path_meta['magic_mimetype']
            elif 'mimetype' in path_meta:
                source = 'mimetype'
                mimetype = path_meta['mimetype']
            else:
                mimetype = None

            if mimetype is not None:
                # keep the try limited to the lookup itself: a KeyError
                # raised while reporting a banned mimetype must not be
                # mistaken for an unclassified mimetype and downgrade the
                # status from 'banned' to 'known'
                try:
                    status = mimetypes[mimetype]
                except KeyError:
                    status = 'known'
                    if mimetype not in cls._unclassified_mimes:
                        # remember it so each unclassified type is logged once
                        cls._unclassified_mimes.add(mimetype)
                        log.info(f'unclassified mimetype {mimetype}')
                else:
                    if status == 'banned':
                        msg = f'banned mimetype detected {mimetype}'
                        dsrp = path_meta['dataset_relative_path']
                        if he.addError(msg,
                                       path=dsrp,
                                       json_path=('data', i, source)):
                            logd.error(msg)
            else:
                status = 'unknown'
                dsrp = path_meta['dataset_relative_path']
                if isinstance(dsrp, str):
                    if not dsrp:
                        msg = 'FIXME top level folder needs a mimetype!'
                    else:
                        msg = f'unknown mimetype {path_meta["basename"]}'
                else:
                    # non-string paths expose .suffixes, which is also
                    # recorded on the class
                    msg = f'unknown mimetype {"".join(dsrp.suffixes)}'
                    cls._unknown_suffixes.add(tuple(dsrp.suffixes))
                if he.addError(msg, path=dsrp, json_path=('data', i)):
                    logd.warning(msg)

            path_meta['status'] = status

        if he._errors_set:
            he.embedErrors(path_meta_blob)