def update_cache(self):
    log.debug(f'maybe updating cache for {self.name}')
    file_is_different = self.cache._meta_updater(self.meta)
    # update the cache first
    # then move to the new name if relevant
    # prevents moving partial metadata onto existing files
    parent_changed = (hasattr(self._bfobject, 'parent')
                      and self._bfobject.parent != self.cache.parent.id)
    if self.cache.name != self.name or parent_changed:  # this is locally correct
        # the issue is that move is now smarter
        # and will detect if a parent path has changed
        try:
            self.cache.move(remote=self)
        except exc.WhyDidntThisGetMovedBeforeError as e:
            # AAAAAAAAAAAAAAAAAAAAAAAAAAAAA
            # deal with the sadness that is non-unique filenames
            # I am 99.999999999999999% certain that users do not
            # expect this behavior ...
            log.error(e)
            if self.bfobject.package.name != self.bfobject.name:
                argh = self.bfobject.name
                self.bfobject.name = self.bfobject.package.name
                try:
                    log.critical(
                        f'Non unique filename :( '
                        f'{self.cache.name} -> {argh} -> {self.bfobject.name}')
                    self.cache.move(remote=self)
                finally:
                    self.bfobject.name = argh
            else:
                raise e

    return file_is_different
def json_identifier_expansion(obj, *args, path=None, **kwargs):
    """ expand identifiers to json literal form """
    try:
        return _json_identifier_expansion(obj, *args, **kwargs)
    except idlib.exceptions.RemoteError as e:
        oops = json_export_type_converter(obj)
        msg = f'remote error {e} for {type(obj)}: {oops}'
        out = {'id': obj,
               'type': 'identifier',
               'system': obj.__class__.__name__,
               'errors': [{'message': msg, 'path': path}]}
        return out
    except idlib.exceptions.ResolutionError as e:
        oops = json_export_type_converter(obj)
        msg = f'could not resolve {type(obj)}: {oops}'
        out = {'id': obj,
               'type': 'identifier',
               'system': obj.__class__.__name__,
               'errors': [{'message': msg, 'path': path}]}
        return out
    except Exception as e:
        oops = json_export_type_converter(obj)
        msg = f'Unhandled exception {e} in {path}'
        out = {'id': obj,
               'type': 'identifier',
               'system': obj.__class__.__name__,
               'errors': [{'message': msg, 'path': path}]}
        log.critical(msg)
        return out
def _protocol_uris_resolved(self):
    # FIXME quite slow ...
    for start_uri in self.protocol_uris:
        log.debug(start_uri)
        try:
            if not hasattr(start_uri, 'dereference'):
                start_uri = idlib.StreamUri(start_uri)

            end_uri = start_uri.dereference()
            yield end_uri
            sc = end_uri.progenitor.status_code
            if sc > 400:
                msg = f'error accessing {end_uri} {sc}'
                if self.addError(msg, blame='submission'):
                    logd.error(msg)

        except idlib.exceptions.ResolutionError as e:
            pass  # FIXME I think we already log this error?
        except self._MissingSchema as e:
            if self.addError(e, blame='submission'):
                logd.error(e)
        except OntId.BadCurieError as e:
            if self.addError(e, blame='submission'):
                logd.error(e)
        except BaseException as e:
            #breakpoint()
            log.exception(e)
            log.critical('see exception above')
def _name(self):
    name = self.bfobject.name
    if isinstance(self.bfobject, File) and not self.from_packages:
        realname = os.path.basename(self.bfobject.s3_key)
        if name != realname:  # mega weirdness
            if realname.startswith(name):
                name = realname
            else:
                realpath = PurePath(realname)
                namepath = PurePath(name)
                if namepath.suffixes:
                    log.critical(f'sigh {namepath!r} -?-> {realpath!r}')
                else:
                    path = namepath
                    for suffix in realpath.suffixes:
                        path = path.with_suffix(suffix)

                    old_name = name
                    name = path.as_posix()
                    log.info(f'name {old_name} -> {name}')

    if '/' in name:
        bads = ','.join(f'{i}' for i, c in enumerate(name) if c == '/')
        self._errors.append(f'slashes {bads}')
        log.critical(f'GO AWAY {self}')
        name = name.replace('/', '_')
        self.bfobject.name = name  # AND DON'T BOTHER US AGAIN

    return name
def _lookup(self, index_column, value, fail=False, raw=True):
    try:
        row = self.byCol.searchIndex(index_column, value, raw=raw)
        return row
    except KeyError as e:
        # TODO update the sheet automatically
        log.critical(f'No match on {index_column} for: {value}')
        if fail:
            raise e
def _jm_common(self, do_expensive_operations=False):
    # FIXME WARNING resolution only works if we were relative to
    # the current working directory
    if self.is_broken_symlink():
        self = self.absolute()
    else:
        self = self.resolve()  # safety since we have to go hunting paths

    project_path = self.find_cache_root()

    if project_path is None:
        # FIXME TODO I think we used dataset_description as a hint?
        project_path = self.__class__('/')  # FIXME FIXME
        log.critical(f'No dataset path found for {self}!')
        #raise NotImplementedError('current implementation cant anchor with current info')

    dataset_path = [p for p in chain((self, ), self.parents)
                    if p.parent == project_path][0]
    drp = self.relative_path_from(dataset_path)  # FIXME ...
    dsid = dataset_path.cache_identifier
    blob = {
        'type': 'path',
        'dataset_id': dsid.curie,
        'dataset_relative_path': drp,
        'basename': self.name,  # for sanity's sake
    }

    mimetype = self.mimetype
    if mimetype:
        blob['mimetype'] = mimetype

    if do_expensive_operations:
        blob['magic_mimetype'] = self._magic_mimetype

    if not (self.is_broken_symlink() or self.exists()):
        # TODO search for closest match
        cands = self._closest_existing_matches()
        msg = f'Path does not exist!\n{self}'
        if cands:
            _fcands = [(r, n) for r, n in cands if r < 10]
            fcands = _fcands if _fcands else cands
            msg += f'\n{0: <4} {self.name}\n'
            msg += '\n'.join([f'{r: <4} {n}' for r, n in fcands])

        # do not log the error here, we won't have
        # enough context to know where we got a bad
        # path, but the caller should, maybe a case for
        # inversion of control here
        blob['errors'] = [{'message': msg,
                           'candidates': cands,}]

    return blob, project_path, dsid
def _lookup(self, dataset_id, fail=False, raw=True):
    try:
        row = self.byCol.searchIndex('id', dataset_id, raw=raw)
        return row
    except KeyError as e:
        # TODO update the sheet automatically
        log.critical(f'New dataset! {dataset_id}')
        if fail:
            raise e
def ret_val_exp(dataset_id, updated, time_now):
    log.info(f'START {dataset_id}')
    did = PennsieveId(dataset_id)
    uid = 'updated-' + dataset_id
    fid = 'failed-' + dataset_id
    # FIXME detect cases where we have already pulled the latest and don't pull again
    # FIXME TODO smart retrieve so we don't pull if we failed during
    # export instead of pull, should be able to get it from the
    # cached metadata on the dataset
    # FIXME getting file exists errors for pull in here
    # in upstream.mkdir()
    # FIXME we need to track/check the state here too in the event
    # that retrieve succeeds but validate or export fails
    # FIXME getting no paths to fetch errors
    # FIXME detect cases where it appears that a new dataset is in the process of being
    # uploaded and don't run for a while if it is being continually modified
    try:
        try:
            p1 = subprocess.Popen(argv_simple_retrieve(dataset_id))
            out1 = p1.communicate()
            if p1.returncode != 0:
                raise Exception(f'oops return code was {p1.returncode}')
        except KeyboardInterrupt as e:
            p1.send_signal(signal.SIGINT)
            raise e

        dataset_path = (path_source_dir / did.uuid / 'dataset').resolve()
        try:
            p2 = subprocess.Popen(argv_spc_find_meta, cwd=dataset_path)
            out2 = p2.communicate()
            if p2.returncode != 0:
                raise Exception(f'oops return code was {p2.returncode}')
        except KeyboardInterrupt as e:
            p2.send_signal(signal.SIGINT)
            raise e

        try:
            p3 = subprocess.Popen(argv_spc_export, cwd=dataset_path)
            out3 = p3.communicate()
            if p3.returncode != 0:
                raise Exception(f'oops return code was {p3.returncode}')
        except KeyboardInterrupt as e:
            p3.send_signal(signal.SIGINT)
            raise e

        conn.set(uid, updated)
        conn.delete(fid)
        log.info(f'DONE: u: {uid} {updated}')
    except Exception as e:
        log.critical(f'FAIL: {fid} {updated}')
        conn.set(fid, updated)
        log.exception(e)
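# Sketch (not part of the original module): the three subprocess stanzas above
# repeat the same Popen / communicate / returncode / SIGINT-forwarding pattern,
# so it could be expressed once. The helper name _run_step and its signature
# are illustrative, not from the codebase.
import signal
import subprocess

def _run_step(argv, cwd=None):
    p = subprocess.Popen(argv, cwd=cwd)
    try:
        p.communicate()
        if p.returncode != 0:
            raise Exception(f'oops return code was {p.returncode}')
    except KeyboardInterrupt:
        p.send_signal(signal.SIGINT)  # forward the interrupt to the child
        raise

# roughly equivalent usage for the steps above
# _run_step(argv_simple_retrieve(dataset_id))
# _run_step(argv_spc_find_meta, cwd=dataset_path)
# _run_step(argv_spc_export, cwd=dataset_path)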
def triples(self):
    crossref_doi_pred = rdflib.term.URIRef('http://prismstandard.org/namespaces/basic/2.1/doi')
    for blob in self.data['identifier_metadata']:
        id = blob['id']
        if not isinstance(id, idlib.Stream):
            id = idlib.Auto(id)

        if not hasattr(id, 'asUri'):
            breakpoint()

        s = id.asUri(rdflib.URIRef)
        if 'source' in blob:
            source = blob['source']  # FIXME we need to wrap this in our normalized representation
            if source == 'Crossref':  # FIXME CrossrefConvertor etc. OR put it in idlib as an alternate ttl
                pos = (
                    (rdf.type, owl.NamedIndividual),
                    (rdf.type, TEMP[blob['type']]),
                    (dc.publisher, blob['publisher']),
                    #(dc.type, blob['type']),  # FIXME semantify
                    (dc.title, blob['title']),
                    (dc.date, self.published_online(blob)),  # FIXME .... dangerzone
                )
                g = OntGraph()
                doi = idlib.Doi(id) if not isinstance(id, idlib.Doi) else id  # FIXME idlib streams need to recognize their own type in __new__
                data = doi.ttl()
                if data is None:  # blackfynn has some bad settings on their doi records ...
                    return

                try:
                    g.parse(data=data, format='ttl')  # FIXME network bad
                except BaseException as e:
                    loge.exception(e)

                _tr = [s for s, p, o in g if p == crossref_doi_pred]
                if _tr:
                    _their_record_s = _tr[0]
                    yield s, owl.sameAs, _their_record_s
                    yield from g
                else:
                    g.debug()
                    log.critical('No crossref doi section in graph!')
            else:
                msg = f'dont know what to do with {source}'
                log.error(msg)
                #raise NotImplementedError(msg)
                return
        else:
            msg = f'dont know what to do with {blob} for {id.identifier}'
            log.error(msg)
            #raise NotImplementedError(msg)
            return

        for p, oraw in pos:
            if oraw is not None:
                o = rdflib.Literal(oraw) if not isinstance(oraw, rdflib.URIRef) else oraw
                yield s, p, o
def get_file(package, file_id):
    files = package.files
    if len(files) > 1:
        log.critical(f'MORE THAN ONE FILE IN PACKAGE {package.id}')
    for file in files:
        if file.id == file_id:
            return file
    else:
        raise FileNotFoundError(f'{package} has no file with id {file_id} but has:\n{files}')
def bfobject(self):
    if hasattr(self, '_bfobject'):
        return self._bfobject

    if isinstance(self._seed, self.__class__):
        bfobject = self._seed.bfobject

    elif isinstance(self._seed, BaseNode):
        bfobject = self._seed

    elif isinstance(self._seed, str):
        bfobject = self._api.get(self._seed)

    elif isinstance(self._seed, PathMeta):
        bfobject = self._api.get(self._seed.id)

    else:
        raise TypeError(self._seed)

    if hasattr(bfobject, '_json'):
        # constructed from a packages query
        # which we need in order for things to be fastish
        self._bfobject = bfobject
        return self._bfobject

    if isinstance(bfobject, DataPackage):
        def transfer(file, bfobject):
            file.parent = bfobject.parent
            file.dataset = bfobject.dataset
            file.state = bfobject.state
            file.package = bfobject
            return file

        files = bfobject.files
        parent = bfobject.parent
        if files:
            if self._file_id is not None:
                for file in files:
                    if file.id == self._file_id:
                        bfobject = transfer(file, bfobject)

            elif len(files) > 1:
                log.critical(f'MORE THAN ONE FILE IN PACKAGE {bfobject.id}')
            else:
                file = files[0]
                bfobject = transfer(file, bfobject)

            bfobject.parent = parent  # sometimes we will just reset a parent to itself
        else:
            log.warning(f'No files in package {bfobject.id}')

    self._bfobject = bfobject
    return self._bfobject
def institutionTypes(self):
    if 'types' in self.data:
        for t in self.data['types']:
            if t == 'Other':
                log.info(self.label)

            yield self._type_map[t]

    else:
        log.critical(self.data)
        raise TypeError('wat')
def _lookup(self, dataset_id, fail=False, raw=True):
    try:
        row, iv = self._row_from_index('id', dataset_id)
        return row
    except AttributeError as e:
        # TODO update the sheet automatically
        if dataset_id not in self._news:
            log.critical(f'New dataset! {dataset_id}')
            self._news.append(dataset_id)

        if fail:
            raise e
def dereference_all_identifiers(obj, stage, *args, path=None, addError=None, **kwargs):
    try:
        dict_literal = _json_identifier_expansion(obj)
    except idlib.exc.RemoteError as e:
        if hasattr(obj, '_cooldown'):
            return obj._cooldown()  # trigger cooldown to simplify issues down the line

        error = dict(error=e,
                     pipeline_stage=stage.__class__.__name__,
                     blame='submission',
                     path=tuple(path))
        if addError:
            if addError(**error):
                log.exception(e)
                #logd.error(msg)
        else:
            return {'errors': [error]}

    except idlib.exc.ResolutionError as e:
        if hasattr(obj, '_cooldown'):
            return obj._cooldown()  # trigger cooldown to simplify issues down the line

        oops = json_export_type_converter(obj)
        msg = (f'{stage.lifters.id} could not resolve '  # FIXME lifters sigh
               f'{type(obj)}: {oops} {obj.asUri()}')
        error = dict(error=msg,
                     pipeline_stage=stage.__class__.__name__,
                     blame='submission',
                     path=tuple(path))
        if addError:
            if addError(**error):
                logd.error(msg)
        else:
            return {'errors': [error]}

    except Exception as e:
        log.critical(f'Unhandled exception {e} in {path}')
        error = dict(error=e,
                     pipeline_stage=stage.__class__.__name__,
                     blame='stage',
                     path=tuple(path))
        if addError:
            if addError(**error):
                log.exception(e)
                #logd.error(msg)
        else:
            return {'errors': [error]}
def encode(self, field, value):
    if field == 'file_id':
        if not value:
            if value is not None:
                log.critical(f'{value!r} for file_id empty but not None!')

            value = None

    if value is None:
        return self.empty

    if field in ('errors', ):
        return self.subfieldsep.join(value)

    if field == 'checksum':
        return value.hex()  # raw hex may contain field separators :/

    return _str_encode(field, value)
def __new__(cls, cache_anchor, local_class):
    if isinstance(cache_anchor, BlackfynnCache):
        try:
            blackfynn_local_instance = BFLocal(cache_anchor.id)
        except (requests.exceptions.ConnectionError, exc.MissingSecretError) as e:
            log.critical(f'Could not connect to blackfynn {e!r}')
            #blackfynn_local_instance = FakeBFLocal(anchor.id, anchor)  # WARNING pollutes things!
            blackfynn_local_instance = 'CONNECTION ERROR'
    else:
        raise TypeError(f'{type(cache_anchor)} is not BFLocal or BlackfynnCache!')

    cache_class = cache_anchor.__class__
    self = super().__new__(cls, local_class, cache_class, _api=blackfynn_local_instance)
    cls._cache_anchor = cache_anchor
    self._errors = []
    self.root = self._api.root
    return self
def _jsonMetadata(self, do_expensive_operations=False):
    # FIXME WARNING resolution only works if we were relative to
    # the current working directory
    if self.is_broken_symlink():
        self = self.absolute()
    else:
        self = self.resolve()  # safety since we have to go hunting paths

    project_path = self.find_cache_root()

    if project_path is None:
        # FIXME TODO I think we used dataset_description as a hint?
        project_path = Path('/')  # FIXME FIXME
        log.critical(f'No dataset path found for {self}!')
        #raise NotImplementedError('current implementation cant anchor with current info')

    dataset_path = [p for p in chain((self, ), self.parents)
                    if p.parent == project_path][0]
    drp = self.relative_path_from(dataset_path)  # FIXME ...
    blob = {
        'type': 'path',
        'dataset_id': dataset_path.cache_id,
        'dataset_relative_path': drp,
    }

    mimetype = self.mimetype
    if mimetype:
        blob['mimetype'] = mimetype

    if do_expensive_operations:
        blob['magic_mimetype'] = self._magic_mimetype

    if not (self.is_broken_symlink() or self.exists()):
        msg = f'Path does not exist! {self}'
        # do not log the error here, we won't have
        # enough context to know where we got a bad
        # path, but the caller should, maybe a case for
        # inversion of control here
        blob['errors'] = [{'message': msg}]

    return blob
def __call__(self, affiliation_string):
    if not isinstance(affiliation_string, str):
        logd.critical(str(affiliation_string))
        return self(affiliation_string[0] + 'ERROR ERROR')

    m = self.mapping

    if not isinstance(affiliation_string, str):
        log.critical('sigh')
        return None

    if affiliation_string in m:
        return m[affiliation_string]
    else:
        # FIXME super inefficient
        las = len(affiliation_string)
        for l, s in sorted([(len(k), k) for k in m.keys()], reverse=True):
            if l <= las and s in affiliation_string:
                return m[s]
def published_online(self, blob):
    try:
        dpl = blob['issued']['date-parts']
    except KeyError as e:
        log.critical(e)
        return None

    dp = dpl[0]
    if len(dp) == 3:
        y, m, d = dp
        return f'{y}-{m:0>2}-{d:0>2}'
    elif len(dp) == 2:
        y, m = dp
        return f'{y}-{m:0>2}'
    elif len(dp) == 1:
        y, = dp
        return f'{y}'
    else:
        raise NotImplementedError(f'what the? {dp}')
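# Sketch (not in the original source): the same Crossref 'issued' 'date-parts'
# formatting rule as published_online above, pulled out as a standalone function
# so the zero padding and truncated-date cases are easy to check in isolation.
def _format_date_parts(dp):
    # dp is the inner list from blob['issued']['date-parts'][0]
    if len(dp) == 3:
        y, m, d = dp
        return f'{y}-{m:0>2}-{d:0>2}'
    elif len(dp) == 2:
        y, m = dp
        return f'{y}-{m:0>2}'
    elif len(dp) == 1:
        y, = dp
        return f'{y}'

assert _format_date_parts([2021, 3, 5]) == '2021-03-05'
assert _format_date_parts([2021, 3]) == '2021-03'
assert _format_date_parts([2021]) == '2021'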
def _has_remote_files(self):
    """ this will fetch """
    bfobject = self.bfobject
    if not isinstance(bfobject, DataPackage):
        return False

    files = bfobject.files
    if not files:
        return False

    if len(files) > 1:
        log.critical(f'{self} has more than one file! Not switching bfobject!')
        return True

    file, = files
    file.parent = bfobject.parent
    file.dataset = bfobject.dataset
    file.package = bfobject
    self._bfobject = file
    return True
def _lookup(self, dataset_id):
    try:
        return self.byCol.searchIndex('id', dataset_id)
    except KeyError as e:
        # TODO update the sheet automatically
        log.critical(f'New dataset! {dataset_id}')
def data(self):
    """ get the 'cached' data which isn't really cached at the moment
        once we implement an index for local files then we can hit that
        first from here """
    # we don't keep two copies of the local data
    # unless we are doing a git-like thing
    if self.is_dir():
        raise TypeError('can\'t retrieve data for a directory')

    meta = self.meta
    if meta.file_id is None:
        raise NotImplementedError('can\'t fetch data without a file id')

    #cands = list(self.local_object_cache_dir.glob(self.cache_key))
    # FIXME this does not play well with old_id ...
    # can probably get away with just globing for the old_id in
    # most cases
    # TODO where to store the chain of prior versions? i.e. do
    # we just keep the xattrs in the object cache? how about file moves?
    # sigh git ...
    rgen = None
    if self.local_object_cache_path.exists():
        locsize = self.local_object_cache_path.size
        if locsize != meta.size:
            msg = (f'Partial download detected {locsize} != {meta.size} at'
                   f'\n{self.local_object_cache_path}')
            log.info(msg)
            size = self.local_object_cache_path.size
            kwargs = {}
            if size > 0:
                if (self.local == self.local_object_cache_path
                        and size > 4096):  # FIXME hardcoded chunksize
                    # XXX there is a fantastic edge case where if
                    # you try to read and write from the same file
                    # only the first chunk will be written and if
                    # you are retrieving from remote then the offset
                    # would be greater than the chunksize so there
                    # will be a gap, so we set chunksize here and
                    # issue a critical log
                    msg = ('You probably did not mean to do this. '
                           f'Refetching {size - 4096} bytes.')
                    log.critical(msg)
                    kwargs['ranges'] = ((4096, ), )
                else:
                    kwargs['ranges'] = ((size, ), )

            if not hasattr(self._remote_class, '_api'):  # see note below
                self._remote_class.anchorToCache(self.anchor)

            rgen = self._remote_class.get_file_by_id(meta.id, meta.file_id, **kwargs)
            gen = chain((next(rgen), ), self.local_object_cache_path.data)
        else:
            gen = chain((f'from local cache {self.local_object_cache_path}', ),
                        self.local_object_cache_path.data)
    else:
        if not hasattr(self._remote_class, '_api'):
            # NOTE we do not want to dereference self.remote
            # in this situation because we just want the file
            # not the FS metadata, so we have to ensure that _api
            # is bound
            self._remote_class.anchorToCache(self.anchor)

        gen = self._remote_class.get_file_by_id(meta.id, meta.file_id)

    try:
        self.data_headers = next(gen)
    except exc.NoRemoteFileWithThatIdError as e:
        log.error(f'{self} {e}')
        raise exc.CacheNotFoundError(f'{self}') from e  # have to raise so that we don't overwrite the file

    log.log(9, self.data_headers)
    if self.local_object_cache_path.exists():
        yield from gen
        if rgen is None:
            return

        yield from self.local_object_cache_path._data_setter(rgen, append=True)
    else:
        # FIXME we MUST write the metadata first so that we know the expected size
        # so that in the event that the generator is only partially run out we know
        # that we can pick up where we left off with the fetch, this also explains
        # why all the cases where the cached data size did not match were missing
        # xattrs entirely
        if not self.local_object_cache_path.parent.exists():
            # FIXME sigh, no obvious way around having to check
            # every time other than creating all the cache
            # subfolders in advance
            self.local_object_cache_path.parent.mkdir()

        self.local_object_cache_path.touch()
        self.local_object_cache_path.cache_init(meta)

        yield from self.local_object_cache_path._data_setter(gen)

    ls = self.local_object_cache_path.size
    if ls != meta.size:
        self.local_object_cache_path.unlink()
        msg = f'{ls} != {meta.size} for {self}'
        raise ValueError(msg)  # FIXME TODO
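# Sketch (assumptions, not from the codebase): the resume branch above in plain
# terms. Given a partially written cache file, only the bytes from the current
# size onward are requested and appended; fetch_ranges is a hypothetical
# stand-in for _remote_class.get_file_by_id(id, file_id, ranges=...), whose
# first yield is treated as the response headers, as in the code above.
import os

def resume_fetch(cache_path, expected_size, fetch_ranges):
    size = os.path.getsize(cache_path)
    if size >= expected_size:
        return  # already complete, nothing to fetch

    gen = fetch_ranges(((size, ), ))   # open-ended byte range starting at `size`
    headers = next(gen)                # header yield, analogous to data_headers
    with open(cache_path, 'ab') as f:  # append, like _data_setter(..., append=True)
        for chunk in gen:
            f.write(chunk)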
def triples_gen(self):
    rm = self._source

    # FIXME there doesn't seem to be a section that tells me the name
    # of the top level model so I have to know its name beforehand
    # the id is in the model, having the id in the resource map
    # prevents issues if these things get sent decoupled
    id = rm['id']
    mid = id.replace(' ', '-')

    links = rm[id]['links']
    #linknodes = [n for n in rm[id]['nodes'] if n['class'] == 'Link']  # visible confusion

    st = []
    from_to = []
    ot = None
    yield from self.apinatbase()
    for link in links:
        if 'conveyingType' in link:
            if link['conveyingType'] == 'ADVECTIVE':
                p_is = TEMP.isAdvectivelyConnectedTo
                p_from = TEMP.advectivelyConnectsFrom
                p_to = TEMP.advectivelyConnectsTo
                p_cmat = TEMP.advectivelyConnectsMaterial
                diffusive = False
            elif link['conveyingType'] == 'DIFFUSIVE':
                p_is = TEMP.isDiffusivelyConnectedTo
                p_from = TEMP.diffusivelyConnectsFrom
                p_to = TEMP.diffusivelyConnectsTo
                p_cmat = TEMP.diffusivelyConnectsMaterial
                diffusive = True
            else:
                log.critical(f'unhandled conveying type {link}')
                continue

            source = link['source']
            target = link['target']
            ok = True
            if len(from_to) == 2:  # otherwise
                st = []
                from_to = []

            for i, e in enumerate((source, target)):
                ed = rm[e]
                if 'external' not in ed:
                    if not i and from_to:
                        # TODO make sure the intermediate ids match
                        pass
                    else:
                        ok = False
                        break
                else:
                    st.append(e)
                    from_to.append(OntId(ed['external'][0]))

            conveying = link['conveyingLyph']
            cd = rm[conveying]
            if 'external' in cd:
                old_ot = ot
                ot = OntTerm(cd['external'][0])
                yield ot.u, rdf.type, owl.Class
                yield ot.u, TEMP.internalId, rdflib.Literal(conveying)
                yield ot.u, rdfs.label, rdflib.Literal(ot.label)

                yield from self.materialTriples(ot.u, link, p_cmat)  # FIXME locate this correctly

                if ok:
                    u, d = from_to
                    if st[0] == source:
                        yield u, rdfs.label, rdflib.Literal(OntTerm(u).label)
                        yield u, rdf.type, owl.Class
                        yield from cmb.restriction.serialize(ot.u, p_from, u)

                    if st[1] == target:
                        yield d, rdfs.label, rdflib.Literal(OntTerm(d).label)
                        yield d, rdf.type, owl.Class
                        yield from cmb.restriction.serialize(ot.u, p_to, d)

                if old_ot is not None and old_ot != ot:
                    yield from cmb.restriction.serialize(ot.u, p_from, old_ot.u)

            if diffusive:
                # we can try to hack this using named individuals
                # but it is not going to do exactly what is desired
                s_link = TEMP[f'ApiNATOMY/{mid}/{link["id"]}']
                s_cd = TEMP[f'ApiNATOMY/{mid}/{cd["id"]}']
                yield s_link, rdf.type, owl.NamedIndividual
                yield s_link, rdf.type, TEMP.diffusiveLink  # FIXME I'm not sure these go in the model ...
                yield s_cd, rdf.type, owl.NamedIndividual
                if 'external' in cd and cd['external']:
                    oid = OntId(cd['external'][0])
                    yield s_cd, rdf.type, oid.u
                    ot = oid.asTerm()
                    if ot.label:
                        yield oid.u, rdfs.label, ot.label

                else:
                    yield s_cd, rdf.type, TEMP.conveyingLyph

                for icd in cd['inCoalescences']:
                    dcd = rm[icd]
                    log.info(lj(dcd))
                    s_icd = TEMP[f'ApiNATOMY/{mid}/{dcd["id"]}']
                    yield s_cd, TEMP.partOfCoalescence, s_icd
                    yield s_icd, rdf.type, owl.NamedIndividual
                    yield s_icd, rdf.type, TEMP['ApiNATOMY/Coalescence']
                    if 'external' in dcd and dcd['external']:
                        oid = OntId(dcd['external'][0])
                        yield s_icd, rdf.type, oid.u
                        ot = oid.asTerm()
                        if ot.label:
                            yield oid.u, rdfs.label, ot.label

                    for lyphid in dcd['lyphs']:
                        ild = rm[lyphid]
                        log.info(lj(ild))
                        if 'external' in ild and ild['external']:
                            yield s_icd, TEMP.hasLyphWithMaterial, OntId(ild['external'][0])

            if not ok:
                logd.info(f'{source} {target} issue')
                continue

            for inid, e in zip(st, from_to):
                yield e.u, rdf.type, owl.Class
                yield e.u, rdfs.label, rdflib.Literal(OntTerm(e).label)
                yield e.u, TEMP.internalId, rdflib.Literal(inid)

            f, t = from_to
            yield from cmb.restriction.serialize(f.u, p_is, t.u)
def add(data, target_path, value, fail_on_exists=True, update=False):
    """ Note on semantics when target_path contains the type int.

        Normally when adding a path all the parents are added because
        we are expecting a direct path down. However, if the path
        contains int then it implicitly expects the list to already
        exist. Therefore any failure on the way TO a list will
        immediately abort and not add the keys to the non-existent
        list. This is consistent with the approach where keys are not
        required but if their value is a list it must not be empty.
        Thus we abort so that we don't go around creating a bunch of
        empty lists that will show up later as errors when validating
        the schema. """
    # type errors can occur here ...
    # e.g. you try to go to a string
    if not [_ for _ in (list, tuple) if isinstance(target_path, _)]:
        msg = f'target_path is not a list or tuple! {type(target_path)}'
        raise TypeError(msg)

    if False and target_path == ['@context', '@base']:
        # use to debug TargetPathExistsError issues
        if '@tracker' not in data:
            data['@tracker'] = []
        try:
            raise BaseException('tracker')
        except BaseException as e:
            data['@tracker'].append(e)

        if '@context' in data and '@base' in data['@context']:
            log.critical(f'target present {data["id"]}')
        else:
            log.critical(f'target not present {data["id"]}')

    target_prefixes = target_path[:-1]
    target_key = target_path[-1]
    target = data
    is_subpath_add = int in target_path
    for i, target_name in enumerate(target_prefixes):
        if target_name is int:  # add same value to all objects in list
            if not is_list_or_tuple(target):
                msg = (f'attempt to add to all elements of not a list '
                       f'{type(target)} target_path was {target_path} '
                       f'target_name was {target_name}')
                raise TypeError(msg)
            # LOL PYTHON namespaces
            [AtomicDictOperations.add(subtarget, target_path[i + 1:], value)
             for subtarget in target]
            return  # int terminates this level of an add

        if target_name not in target:  # TODO list indicies XXX that is really append though ...
            if is_subpath_add:
                # if we are targeting objects in a list for addition
                # abort the first time we would have to create a key
                # because we will eventually create an empty list
                # which we won't be able to add anything to and will
                # likely cause schema validation errors
                return

            target[target_name] = {}

        target = target[target_name]

    if update:
        pass
    elif fail_on_exists and target_key in target:
        msg = f'A value already exists at path {target_path} in\n{lj(data)}'
        raise exc.TargetPathExistsError(msg)

    target[target_key] = value
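# Sketch (not part of the original module): how `add` behaves when target_path
# contains the type `int`, per the docstring above. Assumes it runs in a context
# where the staticmethod referenced above (AtomicDictOperations.add) and its
# helper is_list_or_tuple are importable; the example data is made up.
def _example_int_path_add():
    data = {'contributors': [{'name': 'a'}, {'name': 'b'}]}
    # int in the path means "add the same value to every element of the existing list"
    AtomicDictOperations.add(data, ['contributors', int, 'role'], 'author')
    assert all(c['role'] == 'author' for c in data['contributors'])

    # when the list on the way to int does not exist, the add aborts early
    # and creates no empty parent keys
    empty = {}
    AtomicDictOperations.add(empty, ['contributors', int, 'role'], 'author')
    assert empty == {}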
def datame(d, ca, timestamp, helpers=None, log_level=logging.INFO,
           dp=_p, evil=[False], dumb=False):
    """ sigh, pickles """
    log_names = ('sparcur', 'idlib', 'protcur', 'orthauth', 'ontquery',
                 'augpathlib', 'pyontutils')
    for log_name in log_names:
        log = logging.getLogger(log_name)
        if not log.handlers:
            log = makeSimpleLogger(log_name)
            log.setLevel(log_level)
            log.info(f'{log_name} had no handler')
        else:
            if log.level != log_level:
                log.setLevel(log_level)

    rc = d.path._cache_class._remote_class
    if not hasattr(rc, '_cache_anchor'):
        rc._setup()
        rc.anchorTo(ca)

    if not hasattr(BlackfynnCache, '_anchor'):
        # the fact that we only needed this much later in time
        # tells me that we had actually done an excellent job
        # of firewalling the validation pipeline from anything
        # related to the cache beyond the xattr data
        # can't use ca.__class__ because it is the posix variant of
        # _cache_class
        BlackfynnCache._anchor = ca

    prp = d.path.project_relative_path
    if helpers is not None:
        d.add_helpers(helpers)

    out_path = (dp / d.id).with_suffix('.json')
    if out_path.exists() and dumb:
        if not evil[0]:  # FIXME this is SO DUMB to do in here, but ...
            from pysercomb.pyr import units as pyru
            [register_type(c, c.tag) for c in (pyru._Quant, pyru.Range)]
            pyru.Term._OntTerm = OntTerm  # the tangled web grows ever deeper :x
            evil[0] = True

        log.warning(f'loading from path {out_path}')
        # FIXME this is _idiotically_ slow with joblib
        # multiple orders of magnitude faster just using listcomp
        with open(out_path, 'rt') as f:
            return fromJson(json.load(f))

    blob_dataset = d.data_for_export(timestamp)
    with open(out_path.with_suffix('.raw.json'), 'wt') as f:
        # FIXME XXXXXXXXXXXXXXXXXXXXXXXXXXXX
        json.dump(blob_dataset, f, sort_keys=True, indent=2, cls=JEncode)

    try:
        pipe = pipes.IrToExportJsonPipeline(blob_dataset)  # FIXME network sandbox violation
        blob_export = pipe.data
        with open(out_path, 'wt') as f:
            # FIXME XXXXXXXXXXXXXXXXXXXXXXXXXXXX
            json.dump(blob_export, f, sort_keys=True, indent=2, cls=JEncode)
    except Exception as e:
        log.exception(e)
        log.critical('error during fancy json export, see previous log entry')

    return blob_dataset
def setup(cls, *, local_only=False):  # FIXME this is a mess
    """ make sure we have all datasources
        calling this again will refresh helpers """
    if hasattr(Integrator, '__setup') and Integrator.__setup:
        return  # already setup

    Integrator.__setup = True

    for _cls in cls.mro():
        if _cls != cls:
            if hasattr(_cls, 'setup'):
                _cls.setup()

    dat.DatasetStructure.rate = cls.rate

    class FakeOrganSheet:
        modality = lambda v: None
        organ_term = lambda v: None
        award_manual = lambda v: None
        byCol = _byCol([['award', 'award_manual', 'organ_term'], []])
        techniques = lambda v: []
        protocol_uris = lambda v: []

    class FakeAffilSheet:
        def __call__(self, *args, **kwargs):
            return

    class FakeOverviewSheet:
        def __call__(self, *args, **kwargs):
            return

    # unanchored helpers
    if cls.no_google or local_only:
        log.critical('no google no organ data')
        cls.organs_sheet = FakeOrganSheet
        cls.affiliations = FakeAffilSheet()
        cls.overview_sheet = FakeOverviewSheet()
    else:
        # ipv6 resolution issues :/ also issues with pickling
        #cls.organs_sheet = sheets.Organs(fetch_grid=True)  # this kills parallelism
        cls.organs_sheet = sheets.Organs()  # if fetch_grid = False @ class level ok
        cls.affiliations = sheets.Affiliations()
        cls.overview_sheet = sheets.Overview()

        # zap all the services (apparently doesn't help)
        # yep, its just the organ sheet, these go in and out just fine
        #if hasattr(sheets.Sheet, '_Sheet__spreadsheet_service'):
            #delattr(sheets.Sheet, '_Sheet__spreadsheet_service')
        #if hasattr(sheets.Sheet, '_Sheet__spreadsheet_service_ro'):
            #delattr(sheets.Sheet, '_Sheet__spreadsheet_service_ro')

        #for s in (cls.organs_sheet, cls.affiliations, cls.overview_sheet):
            #if hasattr(s, '_spreadsheet_service'):
                #delattr(s, '_spreadsheet_service')

        # YOU THOUGHT IT WAS GOOGLE IT WAS ME ORGANS ALL ALONG!
        #cls.organs_sheet = FakeOrganSheet  # organs is BAD
        #cls.affiliations = FakeAffilSheet()  # affiliations is OK
        #cls.overview_sheet = FakeOverviewSheet()  # overview is OK

    #breakpoint()
    # remove byCol which is unpickleable (super duper sigh)
    #for s in (cls.organs_sheet, cls.affiliations, cls.overview_sheet):
        #if hasattr(s, 'byCol'):
            #delattr(s, 'byCol')

    if cls.no_google:
        cls.organ = lambda award: None

    if local_only:
        cls.organ = lambda award: None
        cls.member = lambda first, last: None
    else:
        cls.organ = OrganData()
        if hasattr(State, 'member'):
            cls.member = State.member
        else:
            log.error('State missing member, using State seems '
                      'like a good idea until you go to multiprocessing')
            cls.member = lambda first, last: None
def bfobject(self):
    if hasattr(self, '_bfobject'):
        return self._bfobject

    if isinstance(self._seed, self.__class__):
        bfobject = self._seed.bfobject

    elif isinstance(self._seed, BaseNode):
        bfobject = self._seed

    elif isinstance(self._seed, str):
        try:
            bfobject = self._api.get(self._seed)
        except Exception as e:  # sigh
            if self._local_only:
                _class = id_to_type(self._seed)
                if issubclass(_class, Dataset):
                    bfobject = _class(self._local_dataset_name)
                    bfobject.id = self._seed
                else:
                    raise NotImplementedError(f'{_class}') from e
            else:
                raise e

    elif isinstance(self._seed, PathMeta):
        bfobject = self._api.get(self._seed.id)

    else:
        raise TypeError(self._seed)

    if hasattr(bfobject, '_json'):
        # constructed from a packages query
        # which we need in order for things to be fastish
        self._bfobject = bfobject
        return self._bfobject

    if isinstance(bfobject, DataPackage):
        def transfer(file, bfobject):
            file.parent = bfobject.parent
            file.dataset = bfobject.dataset
            file.state = bfobject.state
            file.package = bfobject
            return file

        files = bfobject.files
        parent = bfobject.parent
        if files:
            if self._file_id is not None:
                for file in files:
                    if file.id == self._file_id:
                        bfobject = transfer(file, bfobject)

            elif len(files) > 1:
                log.critical(f'MORE THAN ONE FILE IN PACKAGE {bfobject.id}')
                if (len(set(f.size for f in files)) == 1
                        and len(set(f.name for f in files)) == 1):
                    log.critical('Why are there multiple files with the same name and size here?')
                    file = files[0]
                    bfobject = transfer(file, bfobject)
                else:
                    log.critical(f'There are actually multiple files ...\n{files}')

            else:
                file = files[0]
                bfobject = transfer(file, bfobject)

            bfobject.parent = parent  # sometimes we will just reset a parent to itself
        else:
            log.warning(f'No files in package {bfobject.id}')

    self._bfobject = bfobject
    return self._bfobject