def refresh(self, update_cache=False, update_data=False,
            update_data_on_cache=False, size_limit_mb=2, force=False):
    """ use force if you have a file from packages """
    try:
        old_meta = self.meta
    except exc.NoMetadataRetrievedError as e:
        log.error(f'{e}\nYou will need to individually refresh {self.local}')
        return
    except exc.NoRemoteFileWithThatIdError as e:
        log.exception(e)
        return

    if self.is_file() and not force:  # this will trigger a fetch
        pass
    else:
        self._bfobject = self._api.get(self.id)

    if update_cache or update_data:
        file_is_different = self.update_cache()
        update_existing = file_is_different and self.cache.exists()
        udoc = update_data_on_cache and file_is_different
        if update_existing or udoc:
            size_limit_mb = None

        update_data = update_data or update_existing or udoc

    if update_data and self.is_file():
        self.cache.fetch(size_limit_mb=size_limit_mb)

    # when a cache calls refresh it needs to know if it no longer exists
    return self.cache
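
# Hedged usage sketch for refresh above. The caller name `remote` and this helper are
# illustrative assumptions, not part of the original class; only the refresh signature
# and its cache return value come from the code itself.
def _example_refresh_usage(remote):
    # pull fresh metadata and file contents, lifting the default 2 MB fetch limit
    cache = remote.refresh(update_cache=True, update_data=True, size_limit_mb=None)
    if cache is None or not cache.exists():
        # refresh returns the cache so callers can detect that it no longer exists
        pass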
def condense(self):
    marked_as_done = '-'
    mapping = defaultdict(list)

    def make_key(row):
        return tuple(c.value for c in
                     [row.tag(), row.value(), row.text(), row.exact()])

    create = []
    for mt_cell in self.row_object(0).map_to().column.cells[1:]:
        if (mt_cell.value and
            mt_cell.value != marked_as_done and
            mt_cell.value != mt_cell.row.value().value):
            try:
                row, iv = self._row_from_index(value=mt_cell.value)
                key = make_key(row)
            except AttributeError as e:  # value not in index
                key = ('protc:executor-verb', mt_cell.value, '', '')
                if key not in create:
                    create.append(key)

                log.exception(e)

            mapping[key].append(mt_cell.row.value().value)  # cells don't move so we're ok

    mapping = dict(mapping)
    value_to_map_to = {value: k
                       for k, values in mapping.items()
                       for value in values}
    #breakpoint()
    return value_to_map_to, create  # old -> new, original -> correct
def _protocol_uris_resolved(self):
    # FIXME quite slow ...
    for start_uri in self.protocol_uris:
        log.debug(start_uri)
        try:
            if not hasattr(start_uri, 'dereference'):
                start_uri = idlib.StreamUri(start_uri)

            end_uri = start_uri.dereference()
            yield end_uri
            sc = end_uri.progenitor.status_code
            if sc > 400:
                msg = f'error accessing {end_uri} {sc}'
                if self.addError(msg, blame='submission'):
                    logd.error(msg)

        except idlib.exceptions.ResolutionError as e:
            pass  # FIXME I think we already log this error?
        except self._MissingSchema as e:
            if self.addError(e, blame='submission'):
                logd.error(e)
        except OntId.BadCurieError as e:
            if self.addError(e, blame='submission'):
                logd.error(e)
        except BaseException as e:
            #breakpoint()
            log.exception(e)
            log.critical('see exception above')
def get(self, uri):
    #juri = uri + '.json'
    logd.info(uri)
    log.debug('going to network for protocols')
    resp = requests.get(uri, headers=self._pio_header)
    #log.info(str(resp.request.headers))
    if resp.ok:
        try:
            j = resp.json()  # the api is reasonably consistent
        except BaseException as e:
            log.exception(e)
            breakpoint()
            raise e

        return j
    else:
        try:
            j = resp.json()
            sc = j['status_code']
            em = j['error_message']
            msg = f'protocol issue {uri} {resp.status_code} {sc} {em} {self.id!r}'
            logd.error(msg)
            self.addError(msg)
            # can't return here because of the cache
        except BaseException as e:
            log.exception(e)

        logd.error(f'protocol no access {uri} {self.id!r}')
def ret_val_exp(dataset_id, updated, time_now):
    log.info(f'START {dataset_id}')
    did = PennsieveId(dataset_id)
    uid = 'updated-' + dataset_id
    fid = 'failed-' + dataset_id
    # FIXME detect cases where we have already pulled the latest and don't pull again
    # FIXME TODO smart retrieve so we don't pull if we failed during
    # export instead of pull, should be able to get it from the
    # cached metadata on the dataset
    # FIXME getting file exists errors for pull in here
    # in upstream.mkdir()
    # FIXME we need to track/check the state here too in the event
    # that retrieve succeeds but validate or export fails
    # FIXME getting no paths to fetch errors
    # FIXME detect cases where it appears that a new dataset is in the process of being
    # uploaded and don't run for a while if it is being continually modified
    try:
        try:
            p1 = subprocess.Popen(argv_simple_retrieve(dataset_id))
            out1 = p1.communicate()
            if p1.returncode != 0:
                raise Exception(f'oops return code was {p1.returncode}')
        except KeyboardInterrupt as e:
            p1.send_signal(signal.SIGINT)
            raise e

        dataset_path = (path_source_dir / did.uuid / 'dataset').resolve()
        try:
            p2 = subprocess.Popen(argv_spc_find_meta, cwd=dataset_path)
            out2 = p2.communicate()
            if p2.returncode != 0:
                raise Exception(f'oops return code was {p2.returncode}')
        except KeyboardInterrupt as e:
            p2.send_signal(signal.SIGINT)
            raise e

        try:
            p3 = subprocess.Popen(argv_spc_export, cwd=dataset_path)
            out3 = p3.communicate()
            if p3.returncode != 0:
                raise Exception(f'oops return code was {p3.returncode}')
        except KeyboardInterrupt as e:
            p3.send_signal(signal.SIGINT)
            raise e

        conn.set(uid, updated)
        conn.delete(fid)
        log.info(f'DONE: u: {uid} {updated}')
    except Exception as e:
        log.critical(f'FAIL: {fid} {updated}')
        conn.set(fid, updated)
        log.exception(e)
def dereference_all_identifiers(obj, stage, *args, path=None, addError=None, **kwargs):
    try:
        dict_literal = _json_identifier_expansion(obj)
    except idlib.exc.RemoteError as e:
        if hasattr(obj, '_cooldown'):
            # trigger cooldown to simplify issues down the line
            return obj._cooldown()

        error = dict(error=e,
                     pipeline_stage=stage.__class__.__name__,
                     blame='submission',
                     path=tuple(path))
        if addError:
            if addError(**error):
                log.exception(e)
                #logd.error(msg)
        else:
            return {'errors': [error]}

    except idlib.exc.ResolutionError as e:
        if hasattr(obj, '_cooldown'):
            # trigger cooldown to simplify issues down the line
            return obj._cooldown()

        oops = json_export_type_converter(obj)
        msg = (f'{stage.lifters.id} could not resolve '  # FIXME lifters sigh
               f'{type(obj)}: {oops} {obj.asUri()}')
        error = dict(error=msg,
                     pipeline_stage=stage.__class__.__name__,
                     blame='submission',
                     path=tuple(path))
        if addError:
            if addError(**error):
                logd.error(msg)
        else:
            return {'errors': [error]}

    except Exception as e:
        log.critical(f'Unhandled exception {e} in {path}')
        error = dict(error=e,
                     pipeline_stage=stage.__class__.__name__,
                     blame='stage',
                     path=tuple(path))
        if addError:
            if addError(**error):
                log.exception(e)
                #logd.error(msg)
        else:
            return {'errors': [error]}
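
# Hedged sketch of how the fallback return value above might be consumed when no
# addError callback is supplied. The helper name and the example path are assumptions
# for illustration only; the {'errors': [...]} shape comes from the function itself.
def _collect_errors_sketch(obj, stage):
    result = dereference_all_identifiers(obj, stage, path=('contributors', 0))
    if isinstance(result, dict) and 'errors' in result:
        for error in result['errors']:
            logd.error(error['error'])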
def decode(self, field, value):
    if field in ('created', 'updated'):  # FIXME human readable vs integer :/
        try:  # needed for legacy cases
            value, = struct.unpack('d', value)
            return datetime.fromtimestamp(value)
        except struct.error:
            pass

        return parser.parse(value.decode())  # FIXME with timezone vs without ...

    elif field == 'checksum':
        return value

    elif field == 'etag':
        # struct pack this sucker so the count can fit as well?
        value = value.decode()  # FIXME
        checksum, strcount = value.rsplit('-', 1)
        count = int(strcount)
        return bytes.fromhex(checksum), count

    elif field == 'errors':
        value = value.decode()
        return tuple(_ for _ in value.split(';') if _)

    elif field == 'user_id':
        try:
            return int(value)
        except ValueError:  # FIXME :/ uid vs owner_id etc ...
            return value.decode()

    elif field in ('id', 'mode', 'old_id'):
        return value.decode()

    elif field not in self.fields:
        log.warning(f'Unhandled field {field}')
        return value

    else:
        try:
            return int(value)
        except ValueError as e:
            log.exception(f'{field} {value}')
            raise e
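
# Hedged sketch of the inverse of the 'etag' branch above; this helper is not part of
# the original class. It only assumes what decode shows: an etag is stored as
# b'<hex checksum>-<part count>' and decodes to a (bytes, int) pair.
def _encode_etag_sketch(checksum: bytes, count: int) -> bytes:
    return (checksum.hex() + '-' + str(count)).encode()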
def _get_protocol_json(self, uri):
    #juri = uri + '.json'
    logd.info(uri)
    pi = get_right_id(uri)
    if 'protocols.io' in pi:
        pioid = pi.slug  # FIXME normalize before we ever get here ...
        log.info(pioid)
    else:
        msg = f'protocol uri is not from protocols.io {pi} {self.id}'
        logd.error(msg)
        self.addError(msg)
        return

    #uri_path = uri.rsplit('/', 1)[-1]
    apiuri = 'https://protocols.io/api/v3/protocols/' + pioid
    #'https://www.protocols.io/api/v3/groups/sparc/protocols'
    #apiuri = 'https://www.protocols.io/api/v3/filemanager/folders?top'
    #print(apiuri, header)
    log.debug('going to network for protocols')
    resp = requests.get(apiuri, headers=self._pio_header)
    #log.info(str(resp.request.headers))
    if resp.ok:
        try:
            j = resp.json()  # the api is reasonably consistent
        except BaseException as e:
            log.exception(e)
            breakpoint()
            raise e

        return j
    else:
        try:
            j = resp.json()
            sc = j['status_code']
            em = j['error_message']
            msg = f'protocol issue {uri} {resp.status_code} {sc} {em} {self.id!r}'
            logd.error(msg)
            self.addError(msg)
            # can't return here because of the cache
        except BaseException as e:
            log.exception(e)

        logd.error(f'protocol no access {uri} {self.id!r}')
def __init__(self, *args, **kwargs):
    self._cache_path = auth.get_path('cache-path') / 'google_sheets'
    if not self._only_cache:
        try:
            if 'readonly' not in kwargs or kwargs['readonly']:
                # readonly=True is default so we take this branch if not set
                self._saf = auth.get_path(
                    'google-api-service-account-file-readonly')
            else:
                self._saf = auth.get_path(
                    'google-api-service-account-file-rw')
        except KeyError as e:
            log.warning(e)
        except Exception as e:
            log.exception(e)

    try:
        super().__init__(*args, **kwargs)
    finally:
        self._saf = None
def triples_objects_multi(self):
    for key in self.objects_multi:
        if key in self.blob:
            values = self.blob[key]
            assert not isinstance(values, str), f'{values} in {key}'
            for value in values:
                if key == 'external':
                    try:
                        o = OntId(value).URIRef
                        yield o, readable.annotates, self.s
                    except OntId.UnknownPrefixError as e:
                        log.exception(e)
                        continue
                elif key == 'inheritedExternal':
                    try:
                        o = OntId(value).URIRef
                    except OntId.UnknownPrefixError as e:
                        log.exception(e)
                        continue
                else:
                    value = value.replace(' ', '-')  # FIXME require no spaces in internal ids
                    o = self.context[value]

                yield self.s, readable[key], o
def datame(d, ca, timestamp, helpers=None, log_level=logging.INFO,
           dp=_p, evil=[False], dumb=False):
    """ sigh, pickles """
    log_names = ('sparcur', 'idlib', 'protcur', 'orthauth', 'ontquery',
                 'augpathlib', 'pyontutils')
    for log_name in log_names:
        log = logging.getLogger(log_name)
        if not log.handlers:
            log = makeSimpleLogger(log_name)
            log.setLevel(log_level)
            log.info(f'{log_name} had no handler')
        else:
            if log.level != log_level:
                log.setLevel(log_level)

    rc = d.path._cache_class._remote_class
    if not hasattr(rc, '_cache_anchor'):
        rc._setup()
        rc.anchorTo(ca)

    if not hasattr(BlackfynnCache, '_anchor'):
        # the fact that we only needed this much later in time
        # tells me that we had actually done an excellent job
        # of firewalling the validation pipeline from anything
        # related to the cache beyond the xattr data

        # can't use ca.__class__ because it is the posix variant of
        # _cache_class
        BlackfynnCache._anchor = ca

    prp = d.path.project_relative_path
    if helpers is not None:
        d.add_helpers(helpers)

    out_path = (dp / d.id).with_suffix('.json')
    if out_path.exists() and dumb:
        if not evil[0]:  # FIXME this is SO DUMB to do in here, but ...
            from pysercomb.pyr import units as pyru
            [register_type(c, c.tag) for c in (pyru._Quant, pyru.Range)]
            pyru.Term._OntTerm = OntTerm  # the tangled web grows ever deeper :x
            evil[0] = True

        log.warning(f'loading from path {out_path}')
        # FIXME this is _idiotically_ slow with joblib
        # multiple orders of magnitude faster just using listcomp
        with open(out_path, 'rt') as f:
            return fromJson(json.load(f))

    blob_dataset = d.data_for_export(timestamp)
    with open(out_path.with_suffix('.raw.json'), 'wt') as f:
        # FIXME XXXXXXXXXXXXXXXXXXXXXXXXXXXX
        json.dump(blob_dataset, f, sort_keys=True, indent=2, cls=JEncode)

    try:
        pipe = pipes.IrToExportJsonPipeline(blob_dataset)  # FIXME network sandbox violation
        blob_export = pipe.data
        with open(out_path, 'wt') as f:
            # FIXME XXXXXXXXXXXXXXXXXXXXXXXXXXXX
            json.dump(blob_export, f, sort_keys=True, indent=2, cls=JEncode)
    except Exception as e:
        log.exception(e)
        log.critical('error during fancy json export, see previous log entry')

    return blob_dataset
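
# Hedged sketch of how datame might be driven in parallel; the joblib Parallel/delayed
# pattern and this helper are assumptions suggested only by the "sigh, pickles"
# docstring and the joblib FIXME above, not confirmed by the code itself.
def _example_datame_driver(datasets, ca, timestamp):
    from joblib import Parallel, delayed
    # one export blob per dataset, processed in separate worker processes
    return Parallel(n_jobs=4)(delayed(datame)(d, ca, timestamp) for d in datasets)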