def refresh(self, update_cache=False, update_data=False,
            update_data_on_cache=False, size_limit_mb=2, force=False):
    """ use force if you have a file from packages """
    try:
        old_meta = self.meta
    except exc.NoMetadataRetrievedError as e:
        log.error(f'{e}\nYou will need to individually refresh {self.local}')
        return
    except exc.NoRemoteFileWithThatIdError as e:
        log.exception(e)
        return

    if self.is_file() and not force:  # this will trigger a fetch
        pass
    else:
        self._bfobject = self._api.get(self.id)

    if update_cache or update_data:
        file_is_different = self.update_cache()
        update_existing = file_is_different and self.cache.exists()
        udoc = update_data_on_cache and file_is_different
        if update_existing or udoc:
            size_limit_mb = None

        update_data = update_data or update_existing or udoc

    if update_data and self.is_file():
        self.cache.fetch(size_limit_mb=size_limit_mb)

    # when a cache calls refresh it needs to know if it no longer exists
    return self.cache
def populate_existing_redis(conn):
    """ Set the initial state for exports from the file system. """
    # we intentionally do not go to network here because that will
    # be done by check_for_updates
    datasets_export_base = Path(options.export_path) / 'datasets'
    uuids = [c.name for c in datasets_export_base.children if c.is_dir()]
    for uuid in uuids:
        dataset_id = 'N:dataset:' + uuid
        try:
            # catch potentially malformed ids
            did = PennsieveId(dataset_id)
        except idlib.exc.MalformedIdentifierError as e:
            log.error(f'strange dir in dataset export: {uuid}\n{e}')
            continue

        # FIXME hardcoded convention
        latest = (datasets_export_base / uuid / 'LATEST' / 'curation-export.json')
        if latest.exists():
            with open(latest, 'rt') as f:
                # we don't bother to use fromJson here because we just
                # need the raw values not the sparcur ir
                blob = json.load(f)

            updated = blob['meta']['timestamp_updated']
            #prov_commit = blob['prov']['commit']  # TODO need to be able to detect software changes and rerun

            sid = 'state-' + dataset_id
            uid = 'updated-' + dataset_id
            fid = 'failed-' + dataset_id
            conn.set(sid, _none)
            conn.set(uid, updated)
            conn.set(fid, '')

    log.info(pprint.pformat({k: conn.get(k) for k in sorted(conn.keys())
                             if b'N:dataset' in k},
                            width=120))
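# A minimal sketch (hypothetical helper, not part of the module above) of
# the redis key convention populate_existing_redis relies on: each dataset
# id fans out into state-, updated-, and failed- keys.
def _dataset_keys(uuid):
    dataset_id = 'N:dataset:' + uuid
    return ('state-' + dataset_id,    # pipeline state, initialized to _none
            'updated-' + dataset_id,  # timestamp_updated of the latest export
            'failed-' + dataset_id)   # failure marker, empty string when clean

# e.g. (uuid below is made up)
sid, uid, fid = _dataset_keys('0f8e9bf2-0000-4000-8000-000000000000')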
def update_cache(self):
    log.debug(f'maybe updating cache for {self.name}')
    file_is_different = self.cache._meta_updater(self.meta)
    # update the cache first
    # then move to the new name if relevant
    # prevents moving partial metadata onto existing files
    parent_changed = (hasattr(self._bfobject, 'parent')
                      and self._bfobject.parent != self.cache.parent.id)
    if self.cache.name != self.name or parent_changed:  # this is locally correct
        # the issue is that move is now smarter
        # and will detect if a parent path has changed
        try:
            self.cache.move(remote=self)
        except exc.WhyDidntThisGetMovedBeforeError as e:
            # AAAAAAAAAAAAAAAAAAAAAAAAAAAAA
            # deal with the sadness that is non-unique filenames
            # I am 99.999999999999999% certain that users do not
            # expect this behavior ...
            log.error(e)
            if self.bfobject.package.name != self.bfobject.name:
                argh = self.bfobject.name
                self.bfobject.name = self.bfobject.package.name
                try:
                    log.critical(
                        f'Non unique filename :( '
                        f'{self.cache.name} -> {argh} -> {self.bfobject.name}')
                    self.cache.move(remote=self)
                finally:
                    self.bfobject.name = argh
            else:
                raise e

    return file_is_different
def _protcur(self, protocol_uri, filter=lambda p: True):
    # FIXME deprecated and replaced by spc export protcur + recombine graphs
    self.lazy_setup()
    protocol_uri = idlib.get_right_id(protocol_uri)
    if isinstance(protocol_uri, idlib.Pio):
        gen = (p for p in self.protc
               if p._anno.uri_api_int == protocol_uri and filter(p))
    else:
        gen = (p for p in self.protc
               if p.uri.startswith(protocol_uri.identifier) and filter(p))

    try:
        p = next(gen)
        yield p
        yield from gen
    except StopIteration:
        log.error(f'could not find annotations for {protocol_uri.identifier}')
        return

    if p.document.otherVersionUri:  # FIXME also maybe check /abstract?
        other_uri = p.document.otherVersionUri
        yield from (p for p in self.protc
                    if p.uri.startswith(other_uri) and filter(p))
def tabular(self, sep='|'):
    if self.label is None:
        if self.prefix not in self._known_no_label:
            log.error(f'No label {self.curie if self.curie else self.iri}')

        return self.curie if self.curie else self.iri

    return self.label + sep + self.curie
def _derive(data, derives, source_key_optional=True, allow_empty=False):  # OLD
    """ derives is a list with the following structure
        [[[source-path, ...], derive-function, [target-path, ...]], ...]
    """
    # TODO this is an implementation of copy that has semantics for handling lists
    for source_path, function, target_paths in derives:
        source_prefixes = source_path[:-1]
        source_key = source_path[-1]
        source = data
        failed = False
        for i, node_key in enumerate(source_prefixes):
            log.debug(lj(source))
            if node_key in source:
                source = source[node_key]
            else:
                msg = f'did not find {node_key} in {source.keys()}'
                if not i:
                    log.error(msg)
                    failed = True
                    break

                raise exc.NoSourcePathError(msg)

        if isinstance(source, list) or isinstance(source, tuple):
            new_source_path = source_prefixes[i + 1:] + [source_key]
            new_target_paths = [tp[i + 1:] for tp in target_paths]
            new_derives = [(new_source_path, function, new_target_paths)]
            for sub_source in source:
                _DictTransformer.derive(sub_source, new_derives,
                                        source_key_optional=source_key_optional)

            return  # no more to do here

        if failed:
            continue  # sometimes things are missing we continue to others

        if source_key not in source:
            msg = f'did not find {source_key} in {source.keys()}'
            if source_key_optional:
                return logd.info(msg)
            else:
                raise exc.NoSourcePathError(msg)

        source_value = source[source_key]

        new_values = function(source_value)
        if len(new_values) != len(target_paths):
            log.debug(f'{source_path} {target_paths}')
            raise TypeError(f'wrong number of values returned for {function}\n'
                            f'was {len(new_values)} expect {len(target_paths)}')
        #temp = b'__temporary'
        #data[temp] = {}  # bytes ensure no collisions
        for target_path, value in zip(target_paths, new_values):
            if (not allow_empty and
                (value is None or
                 hasattr(value, '__iter__') and not len(value))):
                raise ValueError(f'value to add to {target_path} may not be empty!')

            adops.add(data, target_path, value, fail_on_exists=True)
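# A worked example (hypothetical data and paths) of the derives structure
# documented in _derive above: read the value at the source path, apply the
# derive function, and add one returned value per target path.
data = {'contributor': {'name': 'Last, First'}}
derives = [
    (['contributor', 'name'],               # source-path
     lambda name: tuple(name.split(', ')),  # derive-function
     [['contributor', 'last_name'],         # target-paths, one per value
      ['contributor', 'first_name']]),
]
# after _derive(data, derives) the blob would also contain
# {'contributor': {..., 'last_name': 'Last', 'first_name': 'First'}}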
def derive(cls, data, derives, source_key_optional=True, empty='CULL',
           cheaty_face=None):
    """ [[[source-path, ...], function, [target-path, ...]], ...] """

    # if you have source key option True and empty='OK' you will get loads of junk
    allow_empty = empty == 'OK' and not empty == 'CULL'
    error_empty = empty == 'ERROR'

    def empty(value):
        empty = (value is None or
                 hasattr(value, '__iter__') and not len(value))
        if empty and error_empty:
            raise ValueError(f'value to add may not be empty!')

        return empty or allow_empty and not empty

    failure_value = tuple()
    for source_paths, derive_function, target_paths in derives:
        # FIXME zipeq may cause adds to modify in place in error?
        # except that this is really a type checking thing on the function
        def defer_get(*get_args):
            """ if we fail to get args then we can't guarantee that
                derive_function will work at all so we wrap the lot """
            args = cls.get(*get_args)
            return derive_function(*args)

        def express_zip(*zip_args):
            return tuple(zipeq(*zip_args))

        try:
            if not target_paths:
                # allows nesting
                adops.apply(defer_get, data, source_paths,
                            source_key_optional=source_key_optional)
                continue

            cls.add(data,
                    ((tp, v) for tp, v in
                     adops.apply(express_zip, target_paths,
                                 adops.apply(defer_get, data, source_paths,
                                             source_key_optional=source_key_optional),
                                 source_key_optional=source_key_optional,
                                 extra_error_types=(TypeError,),
                                 failure_value=tuple())
                     if not empty(v)))
        except TypeError as e:
            log.error('wat')
            idmsg = data['id'] if 'id' in data else ''
            raise TypeError(f'derive failed\n{source_paths}\n'
                            f'{derive_function}\n{target_paths}\n'
                            f'{idmsg}\n') from e
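# A hedged demonstration (hypothetical values, standalone helper) of the
# empty policy used by derive above: with empty='CULL' empty derived values
# are dropped before cls.add, empty='OK' keeps them, and empty='ERROR'
# raises ValueError.
def _is_empty(value):
    return value is None or (hasattr(value, '__iter__') and not len(value))

derived = {'a': (), 'b': None, 'c': 'x'}
culled = {k: v for k, v in derived.items() if not _is_empty(v)}  # {'c': 'x'}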
def triples(self):
    crossref_doi_pred = rdflib.term.URIRef(
        'http://prismstandard.org/namespaces/basic/2.1/doi')
    for blob in self.data['identifier_metadata']:
        id = blob['id']
        if not isinstance(id, idlib.Stream):
            id = idlib.Auto(id)

        if not hasattr(id, 'asUri'):
            breakpoint()

        s = id.asUri(rdflib.URIRef)
        if 'source' in blob:
            source = blob['source']  # FIXME we need to wrap this in our normalized representation
            if source == 'Crossref':  # FIXME CrossrefConvertor etc. OR put it in idlib as an alternate ttl
                pos = (
                    (rdf.type, owl.NamedIndividual),
                    (rdf.type, TEMP[blob['type']]),
                    (dc.publisher, blob['publisher']),
                    #(dc.type, blob['type']),  # FIXME semantify
                    (dc.title, blob['title']),
                    (dc.date, self.published_online(blob)),  # FIXME .... dangerzone
                )
                g = OntGraph()
                doi = idlib.Doi(id) if not isinstance(id, idlib.Doi) else id  # FIXME idlib streams need to recognize their own type in __new__
                data = doi.ttl()
                if data is None:  # blackfynn has some bad settings on their doi records ...
                    return

                try:
                    g.parse(data=data, format='ttl')  # FIXME network bad
                except BaseException as e:
                    loge.exception(e)

                _tr = [s for s, p, o in g if p == crossref_doi_pred]
                if _tr:
                    _their_record_s = _tr[0]
                    yield s, owl.sameAs, _their_record_s
                    yield from g
                else:
                    g.debug()
                    log.critical('No crossref doi section in graph!')
            else:
                msg = f"don't know what to do with {source}"
                log.error(msg)
                #raise NotImplementedError(msg)
                return
        else:
            msg = f"don't know what to do with {blob} for {id.identifier}"
            log.error(msg)
            #raise NotImplementedError(msg)
            return

        for p, oraw in pos:
            if oraw is not None:
                o = (rdflib.Literal(oraw)
                     if not isinstance(oraw, rdflib.URIRef)
                     else oraw)
                yield s, p, o
def asCell(self, sep='|'):
    if self.label is None:
        _id = self.curie if self.curie else self.iri
        if self.prefix not in self._known_no_label:
            if not self._already_logged(_id):
                log.error(f'No label {_id}')

        return _id

    return self.label + sep + self.curie
def organ_term(self, dataset_id):
    row = self._lookup(dataset_id)
    organ_term = self.byCol.header.index('organ_term')
    if row:
        ot = row[organ_term] if row[organ_term] else None
        if ot:
            try:
                ts = tuple(OntId(t) for t in ot.split(' ')
                           if t and t.lower() != 'na')
                return ts
            except OntId.BadCurieError:
                log.error(ot)
def organ_term(self, dataset_id):
    row = self._lookup(dataset_id)
    if row:
        organ_term = row.organ_term()
        otv = organ_term.value
        ot = otv if otv else None
        if ot:
            try:
                ts = tuple(OntId(t) for t in ot.split(' ')
                           if t and t.lower() != 'na')
                return ts
            except OntId.BadCurieError:
                log.error(ot)
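# A self-contained sketch of the cell parsing shared by both organ_term
# variants above (returning raw curie strings instead of OntId so the
# example runs standalone): split on spaces, drop empties and 'na'.
def _split_organ_terms(ot):
    return tuple(t for t in ot.split(' ') if t and t.lower() != 'na')

assert _split_organ_terms('UBERON:0000948 na UBERON:0002048') == (
    'UBERON:0000948', 'UBERON:0002048')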
def generate_manifest(self, include_directories=False):
    """ generate a tabular manifest of all contents of a directory
        serialization is handled by the caller if it is required """
    if not self.is_dir():
        log.error('Can only generate manifests for directories!')
        raise NotADirectoryError(self)

    if include_directories:
        return [c.manifest_record(self) for c in self.rchildren]
    else:
        return [c.manifest_record(self) for c in self.rchildren
                if not c.is_dir()]
def triples(self):
    for blob in self.data['identifier_metadata']:
        id = blob['id']
        if not isinstance(id, idlib.Stream):
            id = idlib.Auto(id)

        s = id.asType(rdflib.URIRef)
        if 'source' in blob:
            source = blob['source']  # FIXME we need to wrap this in our normalized representation
            if source == 'Crossref':  # FIXME CrossrefConvertor etc. OR put it in idlib as an alternate ttl
                pos = (
                    (rdf.type, owl.NamedIndividual),
                    (rdf.type, TEMP[blob['type']]),
                    (dc.publisher, blob['publisher']),
                    #(dc.type, blob['type']),  # FIXME semantify
                    (dc.title, blob['title']),
                    (dc.date, self.published_online(blob)),  # FIXME .... dangerzone
                )
                g = OntGraph()
                doi = idlib.Doi(id) if not isinstance(id, idlib.Doi) else id  # FIXME idlib streams need to recognize their own type in __new__
                g.parse(data=doi.ttl(), format='ttl')  # FIXME network bad
                _their_record_s = [
                    s for s, p, o in g
                    if p == rdflib.term.URIRef(
                        'http://prismstandard.org/namespaces/basic/2.1/doi')][0]
                yield s, owl.sameAs, _their_record_s
                yield from g
            else:
                msg = f"don't know what to do with {source}"
                log.error(msg)
                #raise NotImplementedError(msg)
                return
        else:
            msg = f"don't know what to do with {blob} for {id.identifier}"
            log.error(msg)
            #raise NotImplementedError(msg)
            return

        for p, oraw in pos:
            if oraw is not None:
                o = (rdflib.Literal(oraw)
                     if not isinstance(oraw, rdflib.URIRef)
                     else oraw)
                yield s, p, o
def route(name, args, kwargs, options, task=None, **kw):
    if name == 'sparcron.check_for_updates':
        out = {'exchange': 'cron', 'routing_key': 'task.cron',
               'priority': 10, 'queue': 'cron'}
    elif name == 'sparcron.check_sheet_updates':
        out = {'exchange': 'cron', 'routing_key': 'task.cron',
               'priority': 10, 'queue': 'cron'}
    elif name == 'sparcron.heartbeat':
        out = {'exchange': 'cron', 'routing_key': 'task.cron',
               'priority': 3, 'queue': 'cron'}
    elif name == 'sparcron.export_single_dataset':
        out = {'exchange': 'export', 'routing_key': 'task.export',
               'priority': 1, 'queue': 'export'}
    elif 'celery' in name:
        out = options
    else:
        oops = (name, args, kwargs, options, task, kw)
        log.error(oops)
        raise NotImplementedError(oops)

    #print('wat', out)
    return out
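# A sketch of how a router function with this signature is wired into
# Celery (the app name and broker url below are hypothetical): task_routes
# accepts a tuple of router callables that are tried in order for each task.
from celery import Celery

app = Celery('sparcron', broker='redis://localhost:6379/0')
app.conf.task_routes = (route,)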
def _protcur(self, protocol_uri, filter=lambda p: True):
    self.lazy_setup()
    protocol_uri = get_right_id(protocol_uri)
    gen = (p for p in protc if p.uri.startswith(protocol_uri) and filter(p))

    try:
        p = next(gen)
        yield p
        yield from gen
    except StopIteration:
        log.error(f'could not find annotations for {protocol_uri}')
        return

    if p.document.otherVersionUri:  # FIXME also maybe check /abstract?
        other_uri = p.document.otherVersionUri
        yield from (p for p in protc
                    if p.uri.startswith(other_uri) and filter(p))
def data(self):
    """ get the 'cached' data which isn't really cached at the moment
        once we implement an index for local files then we can hit that
        first from here """
    # we don't keep two copies of the local data
    # unless we are doing a git-like thing
    if self.is_dir():
        raise TypeError("can't retrieve data for a directory")

    meta = self.meta
    if meta.file_id is None:
        raise NotImplementedError("can't fetch data without a file id")

    #cands = list(self.local_object_cache_dir.glob(self.cache_key))
    # FIXME this does not play well with old_id ...
    # can probably get away with just globbing for the old_id in
    # most cases
    # TODO where to store the chain of prior versions? i.e. do
    # we just keep the xattrs in the object cache? how about file moves?
    # sigh git ...
    if self.local_object_cache_path.exists():
        gen = chain((f'from local cache {self.local_object_cache_path}',),
                    self.local_object_cache_path.data)
    else:
        gen = self._remote_class.get_file_by_id(meta.id, meta.file_id)

    try:
        self.data_headers = next(gen)
    except exc.NoRemoteFileWithThatIdError as e:
        log.error(f'{self} {e}')
        # have to raise so that we don't overwrite the file
        raise FileNotFoundError(f'{self}') from e

    log.debug(self.data_headers)
    if self.local_object_cache_path.exists():
        yield from gen
    else:
        yield from self.local_object_cache_path._data_setter(gen)
        self.local_object_cache_path.cache_init(self.meta)  # FIXME self.meta be stale here?!
def map(self, anno):
    row = self._annotation_row(anno)
    mapping_ok = row.mapping_ok().value == 'TRUE'  # FIXME
    not_input = row.not_input_().value
    bad_for_mapping = row.bad_for_mapping_().value
    manual_mapping = row.manual_mapping().value
    if mapping_ok and not not_input:
        pass

    if manual_mapping and ' ' in manual_mapping:
        log.error(f'Why does a manual mapping have a space in it {manual_mapping!r}')
    elif manual_mapping:
        return OntTerm(manual_mapping)
    elif mapping_ok:
        # FIXME anno.astValue can drift from auto_mapping
        # this is so hilariously inefficient, we parse the same stuff
        # 3 times or something
        return OntTerm(anno.asPython().asPython().black_box.curie)
def upload_fileobj(
        file,  # aka Path
        s3_host,
        s3_port,
        s3_bucket,
        s3_keybase,
        region,
        access_key_id,
        secret_access_key,
        session_token,
        encryption_key_id,
        upload_session_id=None,
):
    """ streaming upload
        the object passed in as 'file'
        doesn't have to be Path at all
        it just needs to implement the following methods
        `name`, `size`, and `data`
    """
    local_path = file

    try:
        # account for dev connections
        resource_args = {}
        config_args = dict(signature_version='s3v4')
        if 'amazon' not in s3_host.lower() and len(s3_host) != 0:
            resource_args = dict(endpoint_url="http://{}:{}".format(s3_host, s3_port))
            config_args = dict(s3=dict(addressing_style='path'))

        # connect to s3
        session = boto3.session.Session()
        s3 = session.client(
            's3',
            region_name=region,
            aws_access_key_id=access_key_id,
            aws_secret_access_key=secret_access_key,
            aws_session_token=session_token,
            config=botocore.client.Config(**config_args),
            **resource_args)

        # s3 key
        s3_key = '{}/{}'.format(s3_keybase, local_path.name)

        # override seek to raise an IOError so
        # we don't get a TypeError
        # FIXME IterIO stores a buffer of the whole generator >_<
        f = IterIO(local_path.data, sentinel=b'')
        def _seek(self, *args):
            raise IOError('nope')
        f.seek = _seek

        # upload file to s3
        s3.upload_fileobj(
            Fileobj=f,  # FIXME checksumming wrapper probably ...
            Bucket=s3_bucket,
            Key=s3_key,
            #Callback=progress,
            ExtraArgs=dict(
                ServerSideEncryption="aws:kms",
                SSEKMSKeyId=encryption_key_id,
                #Metadata=checksums,  # hca does it this way
                # annoyingly this means that you have to read the file twice :/
            ))

        return s3_key

    except Exception as e:
        log.error(e)
        raise e
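# A minimal sketch (hypothetical class standing in for IterIO plus the seek
# override above) of exposing a bytes generator as a non-seekable file-like
# object: boto3's upload_fileobj only needs read(), and refusing to seek
# keeps it from trying to rewind a stream that cannot be replayed.
import io

class _GenReader(io.RawIOBase):
    def __init__(self, gen):
        self._gen = gen
        self._buf = b''

    def readable(self):
        return True

    def seekable(self):
        return False  # forces the streaming path, like the IOError-raising seek

    def readinto(self, b):
        # fill the scratch buffer until we can satisfy the read
        while len(self._buf) < len(b):
            try:
                self._buf += next(self._gen)
            except StopIteration:
                break
        n = min(len(b), len(self._buf))
        b[:n], self._buf = self._buf[:n], self._buf[n:]
        return n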
def validate_path_json_metadata(cls, path_meta_blob):
    from sparcur.core import HasErrors  # FIXME
    he = HasErrors(pipeline_stage=cls.__name__ + '.validate_path_json_metadata')
    mimetypes, suffixes = cls._file_type_status_lookup()  # SIGH this overhead is 2 function calls and a branch
    for i, path_meta in enumerate(path_meta_blob['data']):
        if path_meta['basename'] in cls._banned_basenames:
            msg = f'illegal file detected {path_meta["basename"]}'
            dsrp = path_meta['dataset_relative_path']
            if he.addError(msg, path=dsrp, json_path=('data', i)):
                logd.error(msg)

            status = 'banned'
            path_meta['status'] = status
            continue

        if 'magic_mimetype' in path_meta and 'mimetype' in path_meta:
            # FIXME NOT clear whether magic_mimetype should be used by itself
            # usually magic and file extension together work, magic by itself
            # can give some completely bonkers results
            source = 'magic_mimetype'
            mimetype = path_meta['magic_mimetype']
            muggle_mimetype = path_meta['mimetype']
            if mimetype != muggle_mimetype:
                msg = f'mime types do not match {mimetype} != {muggle_mimetype}'
                dsrp = path_meta['dataset_relative_path']
                if he.addError(msg, path=dsrp, json_path=('data', i)):
                    log.error(msg)

        elif 'magic_mimetype' in path_meta:
            source = 'magic_mimetype'
            mimetype = path_meta['magic_mimetype']
        elif 'mimetype' in path_meta:
            source = 'mimetype'
            mimetype = path_meta['mimetype']
        else:
            mimetype = None

        if mimetype is not None:
            try:
                status = mimetypes[mimetype]
                if status == 'banned':
                    msg = f'banned mimetype detected {mimetype}'
                    dsrp = path_meta['dataset_relative_path']
                    if he.addError(msg, path=dsrp, json_path=('data', i, source)):
                        logd.error(msg)

            except KeyError as e:
                status = 'known'
                if mimetype not in cls._unclassified_mimes:
                    cls._unclassified_mimes.add(mimetype)
                    log.info(f'unclassified mimetype {mimetype}')
        else:
            status = 'unknown'
            dsrp = path_meta['dataset_relative_path']
            if isinstance(dsrp, str):
                if not dsrp:
                    msg = 'FIXME top level folder needs a mimetype!'
                else:
                    msg = f'unknown mimetype {path_meta["basename"]}'
            else:
                msg = f'unknown mimetype {"".join(dsrp.suffixes)}'
                cls._unknown_suffixes.add(tuple(dsrp.suffixes))

            if he.addError(msg, path=dsrp, json_path=('data', i)):
                logd.warning(msg)

        path_meta['status'] = status

    if he._errors_set:
        he.embedErrors(path_meta_blob)
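# Hypothetical shape of the two lookup tables returned by
# _file_type_status_lookup above, consistent with how they are indexed in
# validate_path_json_metadata: mimetype -> status and suffix tuple -> status,
# with statuses drawn from {'banned', 'known', 'unknown'}.
mimetypes = {'application/x-msdownload': 'banned',
             'image/tiff': 'known'}
suffixes = {('.tif',): 'known'}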
def data(self):
    """ get the 'cached' data which isn't really cached at the moment
        once we implement an index for local files then we can hit that
        first from here """
    # we don't keep two copies of the local data
    # unless we are doing a git-like thing
    if self.is_dir():
        raise TypeError("can't retrieve data for a directory")

    meta = self.meta
    if meta.file_id is None:
        raise NotImplementedError("can't fetch data without a file id")

    #cands = list(self.local_object_cache_dir.glob(self.cache_key))
    # FIXME this does not play well with old_id ...
    # can probably get away with just globbing for the old_id in
    # most cases
    # TODO where to store the chain of prior versions? i.e. do
    # we just keep the xattrs in the object cache? how about file moves?
    # sigh git ...

    rgen = None
    if self.local_object_cache_path.exists():
        locsize = self.local_object_cache_path.size
        if locsize != meta.size:
            msg = (f'Partial download detected {locsize} != {meta.size} at'
                   f'\n{self.local_object_cache_path}')
            log.info(msg)
            size = self.local_object_cache_path.size
            kwargs = {}
            if size > 0:
                if (self.local == self.local_object_cache_path
                        and size > 4096):  # FIXME hardcoded chunksize
                    # XXX there is a fantastic edge case where if
                    # you try to read and write from the same file
                    # only the first chunk will be written and if
                    # you are retrieving from remote then the offset
                    # would be greater than the chunksize so there
                    # will be a gap, so we set chunksize here and
                    # issue a critical log
                    msg = ('You probably did not mean to do this. '
                           f'Refetching {size - 4096} bytes.')
                    log.critical(msg)
                    kwargs['ranges'] = ((4096,),)
                else:
                    kwargs['ranges'] = ((size,),)

            if not hasattr(self._remote_class, '_api'):
                # see note below
                self._remote_class.anchorToCache(self.anchor)

            rgen = self._remote_class.get_file_by_id(meta.id, meta.file_id, **kwargs)
            gen = chain((next(rgen),), self.local_object_cache_path.data)
        else:
            gen = chain((f'from local cache {self.local_object_cache_path}',),
                        self.local_object_cache_path.data)
    else:
        if not hasattr(self._remote_class, '_api'):
            # NOTE we do not want to dereference self.remote
            # in this situation because we just want the file
            # not the FS metadata, so we have to ensure that _api
            # is bound
            self._remote_class.anchorToCache(self.anchor)

        gen = self._remote_class.get_file_by_id(meta.id, meta.file_id)

    try:
        self.data_headers = next(gen)
    except exc.NoRemoteFileWithThatIdError as e:
        log.error(f'{self} {e}')
        # have to raise so that we don't overwrite the file
        raise exc.CacheNotFoundError(f'{self}') from e

    log.log(9, self.data_headers)
    if self.local_object_cache_path.exists():
        yield from gen
        if rgen is None:
            return

        yield from self.local_object_cache_path._data_setter(rgen, append=True)
    else:
        # FIXME we MUST write the metadata first so that we know the expected size
        # so that in the event that the generator is only partially run out we know
        # that we can pick up where we left off with the fetch, this also explains
        # why all the cases where the cached data size did not match were missing
        # xattrs entirely
        if not self.local_object_cache_path.parent.exists():
            # FIXME sigh, no obvious way around having to check
            # every time other than creating all the cache
            # subfolders in advance
            self.local_object_cache_path.parent.mkdir()

        self.local_object_cache_path.touch()
        self.local_object_cache_path.cache_init(meta)

        yield from self.local_object_cache_path._data_setter(gen)

    ls = self.local_object_cache_path.size
    if ls != meta.size:
        self.local_object_cache_path.unlink()
        msg = f'{ls} != {meta.size} for {self}'
        raise ValueError(msg)  # FIXME TODO
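# A sketch (hypothetical helper) of the resume logic in data above: a
# partial cache of n bytes resumes with a single open-ended range starting
# at n, matching the ((offset,),) ranges convention, except when reading
# and writing the same file, where only the first chunk can be trusted.
def _resume_ranges(cached_size, chunksize=4096, same_file=False):
    if cached_size <= 0:
        return None  # nothing usable cached, fetch from the start
    if same_file and cached_size > chunksize:
        return ((chunksize,),)  # refetch everything past the first chunk
    return ((cached_size,),)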
def data(self):
    """ get the 'cached' data which isn't really cached at the moment
        once we implement an index for local files then we can hit that
        first from here """
    # we don't keep two copies of the local data
    # unless we are doing a git-like thing
    if self.is_dir():
        raise TypeError("can't retrieve data for a directory")

    meta = self.meta
    if meta.file_id is None:
        raise NotImplementedError("can't fetch data without a file id")

    #cands = list(self.local_object_cache_dir.glob(self.cache_key))
    # FIXME this does not play well with old_id ...
    # can probably get away with just globbing for the old_id in
    # most cases
    # TODO where to store the chain of prior versions? i.e. do
    # we just keep the xattrs in the object cache? how about file moves?
    # sigh git ...
    if self.local_object_cache_path.exists():
        locsize = self.local_object_cache_path.size
        if locsize != meta.size:
            raise NotImplementedError(
                'TODO yield from local then fetch the rest starting at offset')

        gen = chain((f'from local cache {self.local_object_cache_path}',),
                    self.local_object_cache_path.data)
    else:
        if not hasattr(self._remote_class, '_api'):
            # NOTE we do not want to dereference self.remote
            # in this situation because we just want the file
            # not the FS metadata, so we have to ensure that _api
            # is bound
            self._remote_class.anchorToCache(self.anchor)

        gen = self._remote_class.get_file_by_id(meta.id, meta.file_id)

    try:
        self.data_headers = next(gen)
    except exc.NoRemoteFileWithThatIdError as e:
        log.error(f'{self} {e}')
        # have to raise so that we don't overwrite the file
        raise exc.CacheNotFoundError(f'{self}') from e

    log.debug(self.data_headers)
    if self.local_object_cache_path.exists():
        yield from gen
    else:
        # FIXME we MUST write the metadata first so that we know the expected size
        # so that in the event that the generator is only partially run out we know
        # that we can pick up where we left off with the fetch, this also explains
        # why all the cases where the cached data size did not match were missing
        # xattrs entirely
        self.local_object_cache_path.touch()
        self.local_object_cache_path.cache_init(meta)

        yield from self.local_object_cache_path._data_setter(gen)

        ls = self.local_object_cache_path.size
        if ls != meta.size:
            self.local_object_cache_path.unlink()
            msg = f'{ls} != {meta.size} for {self}'
            raise ValueError(msg)  # FIXME TODO
def setup(cls, *, local_only=False):  # FIXME this is a mess
    """ make sure we have all datasources
        calling this again will refresh helpers """
    if hasattr(Integrator, '__setup') and Integrator.__setup:
        return  # already setup

    Integrator.__setup = True

    for _cls in cls.mro():
        if _cls != cls:
            if hasattr(_cls, 'setup'):
                _cls.setup()

    dat.DatasetStructure.rate = cls.rate

    class FakeOrganSheet:
        modality = lambda v: None
        organ_term = lambda v: None
        award_manual = lambda v: None
        byCol = _byCol([['award', 'award_manual', 'organ_term'], []])
        techniques = lambda v: []
        protocol_uris = lambda v: []

    class FakeAffilSheet:
        def __call__(self, *args, **kwargs):
            return

    class FakeOverviewSheet:
        def __call__(self, *args, **kwargs):
            return

    # unanchored helpers
    if cls.no_google or local_only:
        log.critical('no google no organ data')
        cls.organs_sheet = FakeOrganSheet
        cls.affiliations = FakeAffilSheet()
        cls.overview_sheet = FakeOverviewSheet()
    else:
        # ipv6 resolution issues :/ also issues with pickling
        #cls.organs_sheet = sheets.Organs(fetch_grid=True)  # this kills parallelism
        cls.organs_sheet = sheets.Organs()  # if fetch_grid = False @ class level ok
        cls.affiliations = sheets.Affiliations()
        cls.overview_sheet = sheets.Overview()

        # zap all the services (apparently doesn't help)
        # yep, it's just the organ sheet, these go in and out just fine
        #if hasattr(sheets.Sheet, '_Sheet__spreadsheet_service'):
            #delattr(sheets.Sheet, '_Sheet__spreadsheet_service')
        #if hasattr(sheets.Sheet, '_Sheet__spreadsheet_service_ro'):
            #delattr(sheets.Sheet, '_Sheet__spreadsheet_service_ro')

        #for s in (cls.organs_sheet, cls.affiliations, cls.overview_sheet):
            #if hasattr(s, '_spreadsheet_service'):
                #delattr(s, '_spreadsheet_service')

        # YOU THOUGHT IT WAS GOOGLE IT WAS ME ORGANS ALL ALONG!
        #cls.organs_sheet = FakeOrganSheet  # organs is BAD
        #cls.affiliations = FakeAffilSheet()  # affiliations is OK
        #cls.overview_sheet = FakeOverviewSheet()  # overview is OK

    #breakpoint()
    # remove byCol which is unpickleable (super duper sigh)
    #for s in (cls.organs_sheet, cls.affiliations, cls.overview_sheet):
        #if hasattr(s, 'byCol'):
            #delattr(s, 'byCol')

    if cls.no_google:
        cls.organ = lambda award: None

    if local_only:
        cls.organ = lambda award: None
        cls.member = lambda first, last: None
    else:
        cls.organ = OrganData()
        if hasattr(State, 'member'):
            cls.member = State.member
        else:
            log.error('State missing member, using State seems '
                      'like a good idea until you go to multiprocessing')
            cls.member = lambda first, last: None