Example #1
    def refresh(self,
                update_cache=False,
                update_data=False,
                update_data_on_cache=False,
                size_limit_mb=2,
                force=False):
        """ use force if you have a file from packages """
        try:
            old_meta = self.meta
        except exc.NoMetadataRetrievedError as e:
            log.error(
                f'{e}\nYou will need to individually refresh {self.local}')
            return
        except exc.NoRemoteFileWithThatIdError as e:
            log.exception(e)
            return

        if self.is_file() and not force:  # this will trigger a fetch
            pass
        else:
            self._bfobject = self._api.get(self.id)

        if update_cache or update_data:
            file_is_different = self.update_cache()
            update_existing = file_is_different and self.cache.exists()
            udoc = update_data_on_cache and file_is_different
            if update_existing or udoc:
                size_limit_mb = None

            update_data = update_data or update_existing or udoc

        if update_data and self.is_file():
            self.cache.fetch(size_limit_mb=size_limit_mb)

        return self.cache  # when a cache calls refresh it needs to know if it no longer exists
Example #2
def populate_existing_redis(conn):
    """ Set the initial state for exports from the file system. """
    # we intentionally do not go to network here because that will
    # be done by check_for_updates
    datasets_export_base = Path(options.export_path) / 'datasets'
    uuids = [c.name for c in datasets_export_base.children if c.is_dir()]
    for uuid in uuids:
        dataset_id = 'N:dataset:' + uuid
        try:
            # catch potentially malformed ids
            did = PennsieveId(dataset_id)
        except idlib.exc.MalformedIdentifierError as e:
            log.error(f'strange dir in dataset export: {uuid}\n{e}')
            continue

        # FIXME hardcoded convention
        latest = (datasets_export_base /
                  uuid / 'LATEST' / 'curation-export.json')
        if latest.exists():
            with open(latest, 'rt') as f:
                # we don't bother to use fromJson here because we just
                # need the raw values not the sparcur ir
                blob = json.load(f)
            updated = blob['meta']['timestamp_updated']
            #prov_commit = blob['prov']['commit']  # TODO need to be able to detect software changes and rerun
            sid = 'state-' + dataset_id
            uid = 'updated-' + dataset_id
            fid = 'failed-' + dataset_id
            conn.set(sid, _none)
            conn.set(uid, updated)
            conn.set(fid, '')

    log.info(pprint.pformat({k:conn.get(k) for k in
                             sorted(conn.keys()) if b'N:dataset' in k},
                            width=120))
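A minimal invocation sketch for populate_existing_redis, assuming a local Redis instance and that options.export_path is already configured elsewhere; any connection object exposing the set/get/keys calls used above would do.

import redis  # assumed client; only set/get/keys are needed

conn = redis.Redis()           # defaults to localhost:6379
populate_existing_redis(conn)  # seeds the state-/updated-/failed- keys per exported dataset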
Example #3
    def update_cache(self):
        log.debug(f'maybe updating cache for {self.name}')
        file_is_different = self.cache._meta_updater(self.meta)
        # update the cache first
        # then move to the new name if relevant
        # prevents moving partial metadata onto existing files
        parent_changed = (hasattr(self._bfobject, 'parent')
                          and self._bfobject.parent != self.cache.parent.id)
        if self.cache.name != self.name or parent_changed:  # this is locally correct
            # the issue is that move is now smarter
            # and will detect if a parent path has changed
            try:
                self.cache.move(remote=self)
            except exc.WhyDidntThisGetMovedBeforeError as e:
                # AAAAAAAAAAAAAAAAAAAAAAAAAAAAA
                # deal with the sadness that is non-unique filenames
                # I am 99.999999999999999% certain that users do not
                # expect this behavior ...
                log.error(e)
                if self.bfobject.package.name != self.bfobject.name:
                    argh = self.bfobject.name
                    self.bfobject.name = self.bfobject.package.name
                    try:
                        log.critical(
                            f'Non unique filename :( '
                            f'{self.cache.name} -> {argh} -> {self.bfobject.name}'
                        )
                        self.cache.move(remote=self)
                    finally:
                        self.bfobject.name = argh
                else:
                    raise e

        return file_is_different
Example #4
    def _protcur(
        self,
        protocol_uri,
        filter=lambda p: True
    ):  # FIXME deprecated and replaced by spc export protcur + recombine graphs
        self.lazy_setup()
        protocol_uri = idlib.get_right_id(protocol_uri)
        if isinstance(protocol_uri, idlib.Pio):
            gen = (p for p in self.protc
                   if p._anno.uri_api_int == protocol_uri and filter(p))
        else:
            gen = (p for p in self.protc
                   if p.uri.startswith(protocol_uri.identifier) and filter(p))

        try:
            p = next(gen)
            yield p
            yield from gen
        except StopIteration:
            log.error(
                f'could not find annotations for {protocol_uri.identifier}')
            return

        if p.document.otherVersionUri:  # FIXME also maybe check /abstract?
            other_uri = p.document.otherVersionUri
            yield from (p for p in self.protc
                        if p.uri.startswith(other_uri) and filter(p))
Example #5
    def tabular(self, sep='|'):
        if self.label is None:
            if self.prefix not in self._known_no_label:
                log.error(f'No label {self.curie if self.curie else self.iri}')

            return self.curie if self.curie else self.iri

        return self.label + sep + self.curie
Example #6
    def _derive(data, derives, source_key_optional=True, allow_empty=False):
        # OLD
        """ derives is a list with the following structure
            [[[source-path, ...], derive-function, [target-path, ...]], ...]

        """
        # TODO this is an implementation of copy that has semantics for handling lists
        for source_path, function, target_paths in derives:
            source_prefixes = source_path[:-1]
            source_key = source_path[-1]
            source = data
            failed = False
            for i, node_key in enumerate(source_prefixes):
                log.debug(lj(source))
                if node_key in source:
                    source = source[node_key]
                else:
                    msg = f'did not find {node_key} in {source.keys()}'
                    if not i:
                        log.error(msg)
                        failed = True
                        break
                    raise exc.NoSourcePathError(msg)
                if isinstance(source, list) or isinstance(source, tuple):
                    new_source_path = source_prefixes[i + 1:] + [source_key]
                    new_target_paths = [tp[i + 1:] for tp in target_paths]
                    new_derives = [(new_source_path, function, new_target_paths)]
                    for sub_source in source:
                        _DictTransformer.derive(sub_source, new_derives,
                                                source_key_optional=source_key_optional)

                    return  # no more to do here

            if failed:
                continue  # sometimes things are missing we continue to others

            if source_key not in source:
                msg = f'did not find {source_key} in {source.keys()}'
                if source_key_optional:
                    return logd.info(msg)
                else:
                    raise exc.NoSourcePathError(msg)

            source_value = source[source_key]

            new_values = function(source_value)
            if len(new_values) != len(target_paths):
                log.debug(f'{source_path} {target_paths}')
                raise TypeError(f'wrong number of values returned for {function}\n'
                                f'was {len(new_values)} expect {len(target_paths)}')
            #temp = b'__temporary'
            #data[temp] = {}  # bytes ensure no collisions
            for target_path, value in zip(target_paths, new_values):
                if (not allow_empty and
                    (value is None or
                     hasattr(value, '__iter__') and not len(value))):
                    raise ValueError(f'value to add to {target_path} may not be empty!')
                adops.add(data, target_path, value, fail_on_exists=True)
Example #7
    def derive(cls,
               data,
               derives,
               source_key_optional=True,
               empty='CULL',
               cheaty_face=None):
        """ [[[source-path, ...], function, [target-path, ...]], ...] """
        # if you have source_key_optional=True and empty='OK' you will get loads of junk
        allow_empty = empty == 'OK' and not empty == 'CULL'
        error_empty = empty == 'ERROR'

        def empty(value):
            empty = (value is None
                     or hasattr(value, '__iter__') and not len(value))
            if empty and error_empty:
                raise ValueError(f'value to add may not be empty!')
            return empty or allow_empty and not empty

        failure_value = tuple()
        for source_paths, derive_function, target_paths in derives:
            # FIXME zipeq may cause adds to modify in place in error?
            # except that this is really a type checking thing on the function
            def defer_get(*get_args):
                """ if we fail to get args then we can't gurantee that
                    derive_function will work at all so we wrap the lot """
                args = cls.get(*get_args)
                return derive_function(*args)

            def express_zip(*zip_args):
                return tuple(zipeq(*zip_args))

            try:
                if not target_paths:
                    # allows nesting
                    adops.apply(defer_get,
                                data,
                                source_paths,
                                source_key_optional=source_key_optional)
                    continue

                cls.add(data, ((tp, v) for tp, v in adops.apply(
                    express_zip,
                    target_paths,
                    adops.apply(defer_get,
                                data,
                                source_paths,
                                source_key_optional=source_key_optional),
                    source_key_optional=source_key_optional,
                    extra_error_types=(TypeError, ),
                    failure_value=tuple()) if not empty(v)))
            except TypeError as e:
                log.error('wat')
                idmsg = data['id'] if 'id' in data else ''
                raise TypeError(f'derive failed\n{source_paths}\n'
                                f'{derive_function}\n{target_paths}\n'
                                f'{idmsg}\n') from e
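A hypothetical derives specification to illustrate the [[[source-path, ...], function, [target-path, ...]], ...] shape from the docstring; the key names and sample data are placeholders, and the call mirrors the _DictTransformer.derive usage seen in Example #6. The derive function receives one argument per source path and must return one value per target path.

data = {'contributors': {'first_name': 'Ada', 'last_name': 'Lovelace'}}
derives = [
    [[['contributors', 'first_name'], ['contributors', 'last_name']],  # source paths
     lambda first, last: (f'{first} {last}',),                         # one value per target path
     [['contributors', 'name']]],                                      # target paths
]
_DictTransformer.derive(data, derives)  # expected to add data['contributors']['name']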
Example #8
    def triples(self):
        crossref_doi_pred = rdflib.term.URIRef('http://prismstandard.org/namespaces/basic/2.1/doi')
        for blob in self.data['identifier_metadata']:
            id = blob['id']
            if not isinstance(id, idlib.Stream):
                id = idlib.Auto(id)

            if not hasattr(id, 'asUri'):
                breakpoint()

            s = id.asUri(rdflib.URIRef)
            if 'source' in blob:
                source = blob['source']  # FIXME we need to wrap this in our normalized representation
                if source == 'Crossref':  # FIXME CrossrefConvertor etc. OR put it in idlib as an alternate ttl
                    pos = (
                        (rdf.type, owl.NamedIndividual),
                        (rdf.type, TEMP[blob['type']]),
                        (dc.publisher, blob['publisher']),
                        #(dc.type, blob['type']),  # FIXME semantify
                        (dc.title, blob['title']),
                        (dc.date, self.published_online(blob)),  # FIXME .... dangerzone
                    )
                    g = OntGraph()
                    doi = idlib.Doi(id) if not isinstance(id, idlib.Doi) else id  # FIXME idlib streams need to recognize their own type in __new__
                    data = doi.ttl()
                    if data is None:  # blackfynn has some bad settings on their doi records ...
                        return

                    try:
                        g.parse(data=data, format='ttl')  # FIXME network bad
                    except BaseException as e:
                        loge.exception(e)

                    _tr = [s for s, p, o in g if p == crossref_doi_pred]
                    if _tr:
                        _their_record_s = _tr[0]
                        yield s, owl.sameAs, _their_record_s
                        yield from g
                    else:
                        g.debug()
                        log.critical('No crossref doi section in graph!')
                else:
                    msg = f'dont know what to do with {source}'
                    log.error(msg)
                    #raise NotImplementedError(msg)
                    return
            else:
                msg = f'dont know what to do with {blob} for {id.identifier}'
                log.error(msg)
                #raise NotImplementedError(msg)
                return

            for p, oraw in pos:
                if oraw is not None:
                    o = rdflib.Literal(oraw) if not isinstance(oraw, rdflib.URIRef) else oraw
                    yield s, p, o
Example #9
    def asCell(self, sep='|'):
        if self.label is None:
            _id = self.curie if self.curie else self.iri
            if self.prefix not in self._known_no_label:
                if not self._already_logged(_id):
                    log.error(f'No label {_id}')

            return _id

        return self.label + sep + self.curie
Example #10
    def organ_term(self, dataset_id):
        row = self._lookup(dataset_id)
        organ_term = self.byCol.header.index('organ_term')
        if row:
            ot = row[organ_term] if row[organ_term] else None
            if ot:
                try:
                    ts = tuple(
                        OntId(t) for t in ot.split(' ')
                        if t and t.lower() != 'na')
                    return ts
                except OntId.BadCurieError:
                    log.error(ot)
Example #11
    def organ_term(self, dataset_id):
        row = self._lookup(dataset_id)
        if row:
            organ_term = row.organ_term()
            otv = organ_term.value
            ot = otv if otv else None
            if ot:
                try:
                    ts = tuple(
                        OntId(t) for t in ot.split(' ')
                        if t and t.lower() != 'na')
                    return ts
                except OntId.BadCurieError:
                    log.error(ot)
Example #12
    def generate_manifest(self, include_directories=False):
        """ generate a tabular manifest of all contents of a directory
            serialization is handled by the caller if it is required """

        if not self.is_dir():
            log.error('Can only generate manifests for directories!')
            raise NotADirectoryError(self)

        if include_directories:
            return [c.manifest_record(self) for c in self.rchildren]
        else:
            return [
                c.manifest_record(self) for c in self.rchildren
                if not c.is_dir()
            ]
Example #13
    def triples(self):
        for blob in self.data['identifier_metadata']:
            id = blob['id']
            if not isinstance(id, idlib.Stream):
                id = idlib.Auto(id)

            s = id.asType(rdflib.URIRef)
            if 'source' in blob:
                source = blob['source']  # FIXME we need to wrap this in our normalized representation
                if source == 'Crossref':  # FIXME CrossrefConvertor etc. OR put it in idlib as an alternate ttl
                    pos = (
                        (rdf.type, owl.NamedIndividual),
                        (rdf.type, TEMP[blob['type']]),
                        (dc.publisher, blob['publisher']),
                        #(dc.type, blob['type']),  # FIXME semantify
                        (dc.title, blob['title']),
                        (dc.date, self.published_online(blob)),  # FIXME .... dangerzone
                    )
                    g = OntGraph()
                    doi = idlib.Doi(id) if not isinstance(id, idlib.Doi) else id  # FIXME idlib streams need to recognize their own type in __new__
                    g.parse(data=doi.ttl(), format='ttl')  # FIXME network bad
                    _their_record_s = [
                        s for s, p, o in g if p == rdflib.term.URIRef(
                            'http://prismstandard.org/namespaces/basic/2.1/doi')
                    ][0]
                    yield s, owl.sameAs, _their_record_s
                    yield from g
                else:
                    msg = f'dont know what to do with {source}'
                    log.error(msg)
                    #raise NotImplementedError(msg)
                    return
            else:
                msg = f'dont know what to do with {blob} for {id.identifier}'
                log.error(msg)
                #raise NotImplementedError(msg)
                return

            for p, oraw in pos:
                if oraw is not None:
                    o = rdflib.Literal(oraw) if not isinstance(oraw, rdflib.URIRef) else oraw
                    yield s, p, o
Example #14
def route(name, args, kwargs, options, task=None, **kw):
    if name == 'sparcron.check_for_updates':
        out = {'exchange': 'cron', 'routing_key': 'task.cron', 'priority': 10, 'queue': 'cron'}
    elif name == 'sparcron.check_sheet_updates':
        out = {'exchange': 'cron', 'routing_key': 'task.cron', 'priority': 10, 'queue': 'cron'}
    elif name == 'sparcron.heartbeat':
        out = {'exchange': 'cron', 'routing_key': 'task.cron', 'priority': 3, 'queue': 'cron'}
    elif name == 'sparcron.export_single_dataset':
        out = {'exchange': 'export', 'routing_key': 'task.export', 'priority': 1, 'queue': 'export'}
    elif 'celery' in name:
        out = options
    else:
        oops = (name, args, kwargs, options, task, kw)
        log.error(oops)
        raise NotImplementedError(oops)

    #print('wat', out)
    return out
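The signature above matches what Celery expects from a custom task router, so a minimal registration sketch (the app instance and its name are assumptions) would be:

from celery import Celery

cel = Celery('sparcron')         # hypothetical app instance
cel.conf.task_routes = (route,)  # Celery accepts a router function or a list/tuple of routers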
Example #15
    def _protcur(self, protocol_uri, filter=lambda p: True):
        self.lazy_setup()
        protocol_uri = get_right_id(protocol_uri)
        gen = (p for p in protc
               if p.uri.startswith(protocol_uri) and filter(p))

        try:
            p = next(gen)
            yield p
            yield from gen
        except StopIteration:
            log.error(f'could not find annotations for {protocol_uri}')
            return

        if p.document.otherVersionUri:  # FIXME also maybe check /abstract?
            other_uri = p.document.otherVersionUri
            yield from (p for p in protc
                        if p.uri.startswith(other_uri) and filter(p))
Example #16
    def data(self):
        """ get the 'cached' data which isn't really cached at the moment
            once we implement an index for local files then we can hit that
            first from here """
        # we don't keep two copies of the local data
        # unless we are doing a git-like thing
        if self.is_dir():
            raise TypeError('can\'t retrieve data for a directory')

        meta = self.meta
        if meta.file_id is None:
            raise NotImplementedError('can\'t fetch data without a file id')

        #cands = list(self.local_object_cache_dir.glob(self.cache_key))
        # FIXME this does not play well with old_id ...
        # can probably get away with just globbing for the old_id in
        # most cases
        # TODO where to store the chain of prior versions? i.e. do
        # we just keep the xattrs in the object cache? how about file moves?
        # sigh git ...
        if self.local_object_cache_path.exists():
            gen = chain((f'from local cache {self.local_object_cache_path}', ),
                        self.local_object_cache_path.data)
        else:
            gen = self._remote_class.get_file_by_id(meta.id, meta.file_id)

        try:
            self.data_headers = next(gen)
        except exc.NoRemoteFileWithThatIdError as e:
            log.error(f'{self} {e}')
            raise FileNotFoundError(
                f'{self}'
            ) from e  # have to raise so that we don't overwrite the file

        log.debug(self.data_headers)
        if self.local_object_cache_path.exists():
            yield from gen
        else:
            yield from self.local_object_cache_path._data_setter(gen)
            self.local_object_cache_path.cache_init(
                self.meta)  # FIXME might self.meta be stale here?!
Example #17
    def map(self, anno):
        row = self._annotation_row(anno)
        mapping_ok = row.mapping_ok().value == 'TRUE'  # FIXME
        not_input = row.not_input_().value
        bad_for_mapping = row.bad_for_mapping_().value
        manual_mapping = row.manual_mapping().value
        if mapping_ok and not not_input:
            pass

        if manual_mapping and ' ' in manual_mapping:
            log.error(
                f'Why does a manual mapping have a space in it {manual_mapping!r}'
            )

        elif manual_mapping:
            return OntTerm(manual_mapping)

        elif mapping_ok:  # FIXME anno.astValue can drift from auto_mapping
            # this is so hilariously inefficient, we parse the same stuff
            # 3 times or something
            return OntTerm(anno.asPython().asPython().black_box.curie)
Example #18
def upload_fileobj(
        file,  # aka Path
        s3_host,
        s3_port,
        s3_bucket,
        s3_keybase,
        region,
        access_key_id,
        secret_access_key,
        session_token,
        encryption_key_id,
        upload_session_id=None,
        ):
    """ streaming upload
        the object passed in as 'file'
        doesn't have to be Path at all
        it just needs to implement the following methods
        `name`, `size`, and `data`
    """
    local_path = file

    try:
        # account for dev connections
        resource_args = {}
        config_args = dict(signature_version='s3v4')
        if 'amazon' not in s3_host.lower() and len(s3_host) != 0:
            resource_args = dict(endpoint_url="http://{}:{}".format(s3_host, s3_port))
            config_args = dict(s3=dict(addressing_style='path'))

        # connect to s3
        session = boto3.session.Session()
        s3 = session.client(
            's3',
            region_name=region,
            aws_access_key_id=access_key_id,
            aws_secret_access_key=secret_access_key,
            aws_session_token=session_token,
            config=botocore.client.Config(**config_args),
            **resource_args)

        # s3 key
        s3_key = '{}/{}'.format(s3_keybase, local_path.name)

        # override seek to raise an IOError so
        # we don't get a TypeError
        # FIXME IterIO stores a buffer of the whole generator >_<
        f = IterIO(local_path.data, sentinel=b'')
        def _seek(self, *args):
            raise IOError('nope')
        f.seek = _seek

        # upload file to s3
        s3.upload_fileobj(
            Fileobj=f,  # FIXME checksumming wrapper probably ...
            Bucket=s3_bucket,
            Key=s3_key,
            #Callback=progress,
            ExtraArgs=dict(
                ServerSideEncryption="aws:kms",
                SSEKMSKeyId=encryption_key_id,
                #Metadata=checksums,  # hca does it this way
                # annoyingly this means that you have to read the file twice :/
            ))

        return s3_key

    except Exception as e:
        log.error(e)
        raise e
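Per the docstring, file only needs to expose name, size, and data, where data is consumed as a chunk generator by IterIO; a minimal in-memory stand-in, purely illustrative, could look like this:

class BytesUpload:
    """ hypothetical duck-typed 'file' for upload_fileobj """

    def __init__(self, name, blob):
        self.name = name       # used to build the s3 key
        self.size = len(blob)  # advertised size in bytes
        self._blob = blob

    @property
    def data(self):
        # generator of byte chunks, as IterIO(local_path.data, sentinel=b'') expects
        yield self._blob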
Example #19
    def validate_path_json_metadata(cls, path_meta_blob):
        from sparcur.core import HasErrors  # FIXME
        he = HasErrors(pipeline_stage=cls.__name__ +
                       '.validate_path_json_metadata')
        # SIGH this overhead is 2 function calls and a branch
        mimetypes, suffixes = cls._file_type_status_lookup()
        for i, path_meta in enumerate(path_meta_blob['data']):
            if path_meta['basename'] in cls._banned_basenames:
                msg = f'illegal file detected {path_meta["basename"]}'
                dsrp = path_meta['dataset_relative_path']
                if he.addError(msg, path=dsrp, json_path=('data', i)):
                    logd.error(msg)
                status = 'banned'
                path_meta['status'] = status
                continue

            if 'magic_mimetype' in path_meta and 'mimetype' in path_meta:
                # FIXME NOT clear whether magic_mimetype should be used by itself
                # usually magic and file extension together work, magic by itself
                # can give some completely bonkers results
                source = 'magic_mimetype'
                mimetype = path_meta['magic_mimetype']
                muggle_mimetype = path_meta['mimetype']
                if mimetype != muggle_mimetype:
                    msg = f'mime types do not match {mimetype} != {muggle_mimetype}'
                    dsrp = path_meta['dataset_relative_path']
                    if he.addError(msg, path=dsrp, json_path=('data', i)):
                        log.error(msg)
            elif 'magic_mimetype' in path_meta:
                source = 'magic_mimetype'
                mimetype = path_meta['magic_mimetype']
            elif 'mimetype' in path_meta:
                source = 'mimetype'
                mimetype = path_meta['mimetype']
            else:
                mimetype = None

            if mimetype is not None:
                try:
                    status = mimetypes[mimetype]
                    if status == 'banned':
                        msg = f'banned mimetype detected {mimetype}'
                        dsrp = path_meta['dataset_relative_path']
                        if he.addError(msg,
                                       path=dsrp,
                                       json_path=('data', i, source)):
                            logd.error(msg)
                except KeyError as e:
                    status = 'known'
                    if mimetype not in cls._unclassified_mimes:
                        cls._unclassified_mimes.add(mimetype)
                        log.info(f'unclassified mimetype {mimetype}')
            else:
                status = 'unknown'
                dsrp = path_meta['dataset_relative_path']
                if isinstance(dsrp, str):
                    if not dsrp:
                        msg = f'FIXME top level folder needs a mimetype!'
                    else:
                        msg = f'unknown mimetype {path_meta["basename"]}'
                else:
                    msg = f'unknown mimetype {"".join(dsrp.suffixes)}'
                    cls._unknown_suffixes.add(tuple(dsrp.suffixes))
                if he.addError(msg, path=dsrp, json_path=('data', i)):
                    logd.warning(msg)

            path_meta['status'] = status

        if he._errors_set:
            he.embedErrors(path_meta_blob)
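A hypothetical minimal path_meta_blob shaped after the keys the validator reads (basename, dataset_relative_path, and the two mimetype fields); the values are illustrative only, and the deliberate mismatch would exercise the "mime types do not match" branch above.

path_meta_blob = {
    'data': [
        {'basename': 'manifest.xlsx',
         'dataset_relative_path': 'manifest.xlsx',
         'mimetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
         'magic_mimetype': 'application/zip'},  # mismatch triggers the error path
    ],
}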
Example #20
    def data(self):
        """ get the 'cached' data which isn't really cached at the moment
            once we implement an index for local files then we can hit that
            first from here """
        # we don't keep two copies of the local data
        # unless we are doing a git-like thing
        if self.is_dir():
            raise TypeError('can\'t retrieve data for a directory')

        meta = self.meta
        if meta.file_id is None:
            raise NotImplementedError('can\'t fetch data without a file id')

        #cands = list(self.local_object_cache_dir.glob(self.cache_key))
        # FIXME this does not play well with old_id ...
        # can probably get away with just globbing for the old_id in
        # most cases
        # TODO where to store the chain of prior versions? i.e. do
        # we just keep the xattrs in the object cache? how about file moves?
        # sigh git ...
        rgen = None
        if self.local_object_cache_path.exists():
            locsize = self.local_object_cache_path.size
            if locsize != meta.size:
                msg = (f'Partial download detected {locsize} != {meta.size} at'
                       f'\n{self.local_object_cache_path}')
                log.info(msg)
                size = self.local_object_cache_path.size
                kwargs = {}
                if size > 0:
                    if (self.local == self.local_object_cache_path
                            and size > 4096):  # FIXME hardcoded chunksize
                        # XXX there is a fantastic edge case where if
                        # you try to read and write from the same file
                        # only the first chunk will be written and if
                        # you are retrieving from remote then the offset
                        # would be greater than the chunksize so there
                        # will be a gap, so we set chunksize here and
                        # issue a critical log
                        msg = ('You probably did not mean to do this. '
                               f'Refetching {size - 4096} bytes.')
                        log.critical(msg)
                        kwargs['ranges'] = ((4096, ), )
                    else:
                        kwargs['ranges'] = ((size, ), )

                if not hasattr(self._remote_class, '_api'):
                    # see note below
                    self._remote_class.anchorToCache(self.anchor)

                rgen = self._remote_class.get_file_by_id(
                    meta.id, meta.file_id, **kwargs)
                gen = chain((next(rgen), ), self.local_object_cache_path.data)
            else:
                gen = chain(
                    (f'from local cache {self.local_object_cache_path}', ),
                    self.local_object_cache_path.data)
        else:
            if not hasattr(self._remote_class, '_api'):
                # NOTE we do not want to dereference self.remote
                # in this situation because we just want the file
                # not the FS metadata, so we have to ensure that _api
                # is bound
                self._remote_class.anchorToCache(self.anchor)

            gen = self._remote_class.get_file_by_id(meta.id, meta.file_id)

        try:
            self.data_headers = next(gen)
        except exc.NoRemoteFileWithThatIdError as e:
            log.error(f'{self} {e}')
            raise exc.CacheNotFoundError(
                f'{self}'
            ) from e  # have to raise so that we don't overwrite the file

        log.log(9, self.data_headers)
        if self.local_object_cache_path.exists():
            yield from gen
            if rgen is None:
                return

            yield from self.local_object_cache_path._data_setter(rgen,
                                                                 append=True)

        else:
            # FIXME we MUST write the metadata first so that we know the expected size
            # so that in the event that the generator is only partially run out we know
            # that we can pick up where we left off with the fetch, this also explains
            # why all the cases where the cached data size did not match were missing
            # xattrs entirely
            if not self.local_object_cache_path.parent.exists():
                # FIXME sigh, no obvious way around having to check
                # every time other than creating all the cache
                # subfolders in advance
                self.local_object_cache_path.parent.mkdir()

            self.local_object_cache_path.touch()
            self.local_object_cache_path.cache_init(meta)

            yield from self.local_object_cache_path._data_setter(gen)

        ls = self.local_object_cache_path.size
        if ls != meta.size:
            self.local_object_cache_path.unlink()
            msg = f'{ls} != {meta.size} for {self}'
            raise ValueError(msg)  # FIXME TODO
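A consumption sketch for the generator above, assuming cache is an instance exposing it; elsewhere in these examples data is read as an attribute (e.g. self.local_object_cache_path.data), so it is likely wrapped in a property in the real class.

with open('retrieved.bin', 'wb') as f:  # hypothetical destination path
    for chunk in cache.data():          # use cache.data instead if data is a property
        f.write(chunk)                  # data_headers is captured internally via next(gen)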
Example #21
    def data(self):
        """ get the 'cached' data which isn't really cached at the moment
            once we implement an index for local files then we can hit that
            first from here """
        # we don't keep two copies of the local data
        # unless we are doing a git-like thing
        if self.is_dir():
            raise TypeError('can\'t retrieve data for a directory')

        meta = self.meta
        if meta.file_id is None:
            raise NotImplementedError('can\'t fetch data without a file id')

        #cands = list(self.local_object_cache_dir.glob(self.cache_key))
        # FIXME this does not play well with old_id ...
        # can probably get away with just globbing for the old_id in
        # most cases
        # TODO where to store the chain of prior versions? i.e. do
        # we just keep the xattrs in the object cache? how about file moves?
        # sigh git ...
        if self.local_object_cache_path.exists():
            locsize = self.local_object_cache_path.size
            if locsize != meta.size:
                raise NotImplementedError(
                    'TODO yield from local then fetch the rest starting at offset'
                )

            gen = chain((f'from local cache {self.local_object_cache_path}', ),
                        self.local_object_cache_path.data)
        else:
            if not hasattr(self._remote_class, '_api'):
                # NOTE we do not want to dereference self.remote
                # in this situation because we just want the file
                # not the FS metadata, so we have to ensure that _api
                # is bound
                self._remote_class.anchorToCache(self.anchor)

            gen = self._remote_class.get_file_by_id(meta.id, meta.file_id)

        try:
            self.data_headers = next(gen)
        except exc.NoRemoteFileWithThatIdError as e:
            log.error(f'{self} {e}')
            raise exc.CacheNotFoundError(
                f'{self}'
            ) from e  # have to raise so that we don't overwrite the file

        log.debug(self.data_headers)
        if self.local_object_cache_path.exists():
            yield from gen
        else:
            # FIXME we MUST write the metadata first so that we know the expected size
            # so that in the event that the generator is only partially run out we know
            # that we can pick up where we left off with the fetch, this also explains
            # why all the cases where the cached data size did not match were missing
            # xattrs entirely

            self.local_object_cache_path.touch()
            self.local_object_cache_path.cache_init(meta)

            yield from self.local_object_cache_path._data_setter(gen)

            ls = self.local_object_cache_path.size
            if ls != meta.size:
                self.local_object_cache_path.unlink()
                msg = f'{ls} != {meta.size} for {self}'
                raise ValueError(msg)  # FIXME TODO
Example #22
    def setup(cls, *, local_only=False):
        # FIXME this is a mess
        """ make sure we have all datasources
            calling this again will refresh helpers
        """
        if hasattr(Integrator, '__setup') and Integrator.__setup:
            return  # already setup

        Integrator.__setup = True

        for _cls in cls.mro():
            if _cls != cls:
                if hasattr(_cls, 'setup'):
                    _cls.setup()

        dat.DatasetStructure.rate = cls.rate

        class FakeOrganSheet:
            modality = lambda v: None
            organ_term = lambda v: None
            award_manual = lambda v: None
            byCol = _byCol([['award', 'award_manual', 'organ_term'], []])
            techniques = lambda v: []
            protocol_uris = lambda v: []

        class FakeAffilSheet:
            def __call__(self, *args, **kwargs):
                return

        class FakeOverviewSheet:
            def __call__(self, *args, **kwargs):
                return

        # unanchored helpers
        if cls.no_google or local_only:
            log.critical('no google no organ data')
            cls.organs_sheet = FakeOrganSheet
            cls.affiliations = FakeAffilSheet()
            cls.overview_sheet = FakeOverviewSheet()
        else:
            # ipv6 resolution issues :/ also issues with pickling
            #cls.organs_sheet = sheets.Organs(fetch_grid=True)  # this kills parallelism
            cls.organs_sheet = sheets.Organs()  # if fetch_grid = False @ class level ok
            cls.affiliations = sheets.Affiliations()
            cls.overview_sheet = sheets.Overview()

            # zap all the services (apparently doesn't help)
            # yep, its just the organ sheet, these go in and out just fine
            #if hasattr(sheets.Sheet, '_Sheet__spreadsheet_service'):
            #delattr(sheets.Sheet, '_Sheet__spreadsheet_service')
            #if hasattr(sheets.Sheet, '_Sheet__spreadsheet_service_ro'):
            #delattr(sheets.Sheet, '_Sheet__spreadsheet_service_ro')

            #for s in (cls.organs_sheet, cls.affiliations, cls.overview_sheet):
            #if hasattr(s, '_spreadsheet_service'):
            #delattr(s, '_spreadsheet_service')

            # YOU THOUGHT IT WAS GOOGLE IT WAS ME ORGANS ALL ALONG!
            #cls.organs_sheet = FakeOrganSheet  # organs is BAD

            #cls.affiliations = FakeAffilSheet()  # affiliations is OK
            #cls.overview_sheet = FakeOverviewSheet()  # overview is OK

            #breakpoint()
            # remove byCol which is unpickleable (super duper sigh)
            #for s in (cls.organs_sheet, cls.affiliations, cls.overview_sheet):
            #if hasattr(s, 'byCol'):
            #delattr(s, 'byCol')

        if cls.no_google:
            cls.organ = lambda award: None

        if local_only:
            cls.organ = lambda award: None
            cls.member = lambda first, last: None
        else:
            cls.organ = OrganData()
            if hasattr(State, 'member'):
                cls.member = State.member
            else:
                log.error('State missing member, using State seems '
                          'like a good idea until you go to multiprocessing')
                cls.member = lambda first, last: None