def machines(expired_for=0):
    if expired_for != 'all':
        adjusted_time = time.time() - units.parse_time_interval(expired_for)
    else:
        adjusted_time = None
    for root, dirs, files in os.walk(path.MACHINES, topdown=False):
        for d in (os.path.join(root, x) for x in dirs):
            lock_path = os.path.join(root,
                                     '.' + os.path.basename(d) + '-lock')
            lock = fasteners.process_lock.InterProcessLock(lock_path)
            lock.acquire()
            try:
                remove = fingertip.machine.needs_a_rebuild(d, by=adjusted_time)
            except Exception as ex:
                log.warning(f'while processing {d}: {ex}')
                remove = True
            if expired_for == 'all' or remove:
                assert os.path.realpath(d).startswith(
                    os.path.realpath(path.MACHINES)
                )
                log.info(f'removing {os.path.realpath(d)}')
                if not os.path.islink(d):
                    shutil.rmtree(d)
                else:
                    os.unlink(d)
            else:
                log.debug(f'keeping {os.path.realpath(d)}')
            os.unlink(lock_path)
            lock.release()
def machines(expired_for=0):
    if expired_for != 'all':
        adjusted_time = time.time() - fingertip.expiration._parse(expired_for)
    for root, dirs, files in os.walk(path.MACHINES, topdown=False):
        for d in (os.path.join(root, x) for x in dirs):
            lock_path = os.path.join(root,
                                     '.' + os.path.basename(d) + '-lock')
            lock = fasteners.process_lock.InterProcessLock(lock_path)
            lock.acquire()
            try:
                remove = fingertip.machine.needs_a_rebuild(d, by=adjusted_time)
            except (FileNotFoundError, EOFError, UnboundLocalError):
                # UnboundLocalError happens when expired_for == 'all' and
                # adjusted_time was never set; the machine gets removed anyway
                remove = True
            if expired_for == 'all' or remove:
                assert os.path.realpath(d).startswith(
                    os.path.realpath(path.MACHINES)
                )
                log.info(f'removing {os.path.realpath(d)}')
                if not os.path.islink(d):
                    shutil.rmtree(d)
                else:
                    os.unlink(d)
            else:
                log.debug(f'keeping {os.path.realpath(d)}')
            os.unlink(lock_path)
            lock.release()
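# A hedged usage sketch for the cleanup above (illustrative, not part of the
# module); it assumes the interval parsers accept shorthand strings like '7d'.
def _cleanup_example():
    machines()       # remove machines that have already expired
    machines('7d')   # remove machines that have been expired for 7+ days
    machines('all')  # remove every cached machine unconditionally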
def schedule():
    # Do this only if fingertip is in PATH
    if not shutil.which("fingertip"):
        log.debug('No `fingertip` found in PATH. Not scheduling '
                  'automatic cleanup.')
        return

    # Skip if systemd is not available
    if not shutil.which('systemd-run') or not shutil.which('systemctl'):
        log.warning('It looks like systemd is not available. '
                    'No cleanup is scheduled! If you are running out of '
                    'disk space, run `fingertip cleanup periodic` manually.')
        return

    # If the timer is already installed, skip installation too
    p = subprocess.run(['systemctl', '--user', 'is-active', '--quiet',
                        'fingertip-cleanup.timer'])
    if p.returncode == 0:
        log.debug('The systemd timer handling cleanup is already installed '
                  'and running.')
        return

    # Run every two hours
    log.info('Scheduling cleanup to run every two hours')
    subprocess.run(['systemd-run', '--unit=fingertip-cleanup', '--user',
                    '--on-calendar=0/2:00:00',
                    'fingertip', 'cleanup', 'periodic'])
def translate_path(self, http_path):
    # directly serve local files
    if http_path in http_cache._local_files_to_serve:
        local_path = http_cache._local_files_to_serve[http_path]
    else:
        local_path = super().translate_path(http_path)
    log.debug(f'serving {os.path.basename(http_path)} '
              f'directly from {local_path}')
    return local_path
def report(self, scale):
    """
    Make note of a time desync of a specified scale
    """
    scale = self._parse_scale(scale)
    if scale > self._current_scale:
        log.debug(f'time desync scale increased to {scale}')
        self._current_scale = scale
def file_has_not_changed(self, path):
    log.debug(f'checking that {path} has not changed...')
    mtime, hash_ = self._deps[path]
    if mtime != os.stat(path).st_mtime:
        if hash_ != weak_hash.of_file(path):
            log.warning(f'{path} has changed, set '
                        'FINGERTIP_IGNORE_CODE_CHANGES=1 to ignore')
            return False
    return True
def __enter__(self):
    log.debug(f'state={self._state}')
    assert (self._state == 'loaded' and not self._up_counter or
            self._state == 'spun_up' and self._up_counter)
    if not self._up_counter:
        assert self._state == 'loaded'
        self.hooks.up()
        self._state = 'spun_up'
    self._up_counter += 1
    return self
def mount_supported_fs(backing_file, tgt):
    log.info('mounting a reflink-supported filesystem for image storage...')
    tgt_uid, tgt_gid = os.stat(tgt).st_uid, os.stat(tgt).st_gid
    subprocess.run(['sudo', 'mount', '-o', 'loop', backing_file, tgt],
                   check=True)
    mount_uid, mount_gid = os.stat(tgt).st_uid, os.stat(tgt).st_gid
    if (tgt_uid, tgt_gid) != (mount_uid, mount_gid):
        log.debug(f'fixing owner:group ({tgt_uid}:{tgt_gid})')
        subprocess.run(['sudo', 'chown', f'{tgt_uid}:{tgt_gid}', tgt],
                       check=True)
def tighten(self, allowed_scale):
    """
    Request that this and all the further stages of the pipeline
    must maintain desynchronization at or below the specified scale
    """
    allowed_scale = self._parse_scale(allowed_scale)
    if allowed_scale < self._allowed_scale:
        log.debug(f'time desync requirement tightened to {allowed_scale}')
        self._allowed_scale = allowed_scale
    self.fix_if_needed()
def needs_a_rebuild(mpath, by=None):
    with open(os.path.join(mpath, 'machine.clpickle'), 'rb') as f:
        m = cloudpickle.load(f)
    if not m.expiration.files_have_not_changed():
        return True
    expired = m.expiration.is_expired(by)
    if expired:
        log.debug(f'{mpath} has expired at {m.expiration.pretty()}')
    else:
        log.debug(f'{mpath} is valid until {m.expiration.pretty()}')
    return expired
def files_have_not_changed(self):
    if os.getenv('FINGERTIP_IGNORE_CODE_CHANGES', '0') != '0':
        return True
    for path, (mtime, hash_) in self._deps.items():
        log.debug(f'checking that {path} has not changed...')
        if mtime != os.stat(path).st_mtime:
            if hash_ != weak_hash.of_file(path):
                log.warning(f'{path} has changed, set '
                            'FINGERTIP_IGNORE_CODE_CHANGES=1 to ignore')
                return False
    return True
def _load_from_path(data_dir_path):
    log.debug(f'load from {data_dir_path}')
    with open(os.path.join(data_dir_path, 'machine.clpickle'), 'rb') as f:
        m = cloudpickle.load(f)
    assert m._state == 'saving'
    m._state = 'loading'
    m.log = log.sublogger('<unknown>')
    assert m.path == data_dir_path
    assert m._parent_path == os.path.realpath(os.path.dirname(data_dir_path))
    m.hooks.load()
    m._state = 'loaded'
    return m
def c_r_offline(self, request):
    cache_url = self.cache_url(request.url)
    log.debug(f'looking up {cache_url} in the cache')
    cache_data = self.cache.get(cache_url)
    if cache_data is None:
        log.error(f'{cache_url} not in cache and fingertip is offline')
        return False
    resp = self.serializer.loads(request, cache_data)
    if not resp:
        log.error(f'{cache_url} cache entry deserialization failed, ignored')
        return False
    log.warning(f'Using {cache_url} from offline cache')
    return resp
def fix_if_needed(self, at_least_if_scale=None):
    """
    Perform one-off time synchronization
    if the desync is larger than the specified scale
    or the scale previously set with ``.desync.tighten``
    """
    scale = at_least_if_scale or TimeDesync.LARGE
    if min(scale, self._allowed_scale) < self._current_scale:
        log.debug('timesync '
                  f'(requested: {scale},'
                  f' allowed: {self._allowed_scale},'
                  f' current: {self._current_scale})')
        self.fix()
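# A hedged sketch of how report(), tighten() and fix_if_needed() above
# interact; the scale names 'small'/'large' and the variable name `desync`
# are assumptions about how _parse_scale() and callers look, used only
# for illustration.
def _timesync_example(desync):
    desync.report('large')    # a step noticed a large clock drift
    desync.tighten('small')   # later steps need tight sync; tighten() lowers
                              # the allowed scale and calls fix_if_needed(),
                              # which performs a one-off fix because the
                              # recorded desync now exceeds what is allowed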
def dir_listing_has_not_changed(self, path):
    log.debug(f'checking that {path} listing has not changed...')
    mtime, hash_ = self._deps_dirs[path]
    try:
        if mtime != os.stat(path).st_mtime:
            listing = repr(sorted(os.listdir(path)))
            if hash_ != weak_hash.of_string(listing):
                log.warning(f'{path} listing has changed, set '
                            'FINGERTIP_IGNORE_FILE_CHANGES=1 to ignore')
                return False
    except FileNotFoundError:
        return False
    return True
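# The checks above read (mtime, weak hash) pairs out of self._deps and
# self._deps_dirs.  A hedged sketch of the recording side; the method name
# depend_on_a_file() is an assumption for illustration, not necessarily the
# real fingertip expiration API.
def depend_on_a_file(self, path):
    # remember both the mtime (cheap check) and a weak hash (slower fallback)
    self._deps[path] = (os.stat(path).st_mtime, weak_hash.of_file(path))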
def clone_and_load(from_path, name_hint=None):
    log.debug(f'clone {from_path}')
    temp_path = temp.disappearing_dir(from_path, hint=name_hint)
    log.debug(f'temp = {temp_path}')
    os.makedirs(temp_path, exist_ok=True)
    with open(os.path.join(from_path, 'machine.clpickle'), 'rb') as f:
        m = cloudpickle.load(f)
    m.log = log.Sublogger('<cloning>')
    m.hooks.clone(temp_path)
    m._parent_path = os.path.realpath(from_path)
    m.path = temp_path
    with open(os.path.join(m.path, 'machine.clpickle'), 'wb') as f:
        cloudpickle.dump(m, f)
    return _load_from_path(temp_path)
def _finalize(self, link_as=None, name_hint=None):
    log.debug(f'finalize hint={name_hint} link_as={link_as} {self._state}')
    self.log.disable_hint()
    if link_as and self._state == 'spun_down':
        self.hooks.save.in_reverse()
        temp_path = self.path
        self.path = temp.unique_dir(self._parent_path, hint=name_hint)
        log.debug(f'saving to temp {temp_path}')
        self._state = 'saving'
        self.expiration.depend_on_loaded_python_modules()
        del self.log
        with open(os.path.join(temp_path, 'machine.clpickle'), 'wb') as f:
            cloudpickle.dump(self, f)
        log.debug(f'moving {temp_path} to {self.path}')
        os.rename(temp_path, self.path)
        self._state = 'saved'
        link_this = self.path
    else:
        assert self._state in ('spun_down', 'loaded', 'dropped')
        log.info(f'discarding {self.path}')
        temp.remove(self.path)
        link_this = self._parent_path
        self._state = 'dropped'
    if (link_this and link_as and
            os.path.realpath(link_as) != os.path.realpath(link_this)):
        log.debug(f'linking {link_this} to {link_as}')
        if os.path.lexists(link_as):
            if os.path.exists(link_as) and not needs_a_rebuild(link_as):
                log.critical(f'Refusing to overwrite fresh {link_as}')
                raise RuntimeError(f'Not overwriting fresh {link_as}')
            os.unlink(link_as)
        os.symlink(link_this, link_as)
        return link_as
def needs_a_rebuild(mpath, by=None):
    with open(os.path.join(mpath, 'machine.clpickle'), 'rb') as f:
        m = cloudpickle.load(f)
    if not m.expiration.files_have_not_changed():
        return True
    expired = m.expiration.is_expired(by)
    if expired:
        log.debug(f'{mpath} has expired at {m.expiration.pretty()}')
    else:
        log.debug(f'{mpath} is valid until {m.expiration.pretty()}')
    if OFFLINE and expired:
        log.warning(f'{mpath} expired at {m.expiration.pretty()}, '
                    'but offline mode is enabled, so reusing it')
    return expired and not OFFLINE
def _serve(self, uri, headers, meth='GET'):
    sess = http_cache._get_requests_session()
    headers = {k: v for k, v in headers.items()
               if not (k in STRIP_HEADERS or k.startswith('Proxy-'))}
    log.debug(f'{meth} {uri}')
    for k, v in headers.items():
        log.debug(f'{k}: {v}')
    try:
        if meth == 'GET' and not OFFLINE:
            # direct streaming might be required...
            preview = sess.head(uri, headers=headers, allow_redirects=False)
            direct = None
            if int(preview.headers.get('Content-Length', 0)) > BIG:
                direct = f'file bigger than {BIG}'
            if 'Range' in headers:
                # There seems to be a bug in CacheControl
                # that serves contents in full if a range request
                # hits a non-ranged cached entry.
                direct = 'ranged request, playing safe'
            if direct:
                # Don't cache, don't reencode, stream it as is
                log.warning(f'streaming {uri} directly ({direct})')
                r = requests.get(uri, headers=headers, stream=True)
                self._status_and_headers(r.status_code, r.headers)
                self.copyfile(r.raw, self.wfile)
                return
        # fetch with caching
        m_func = getattr(sess, meth.lower())
        r = m_func(uri if '://' in uri else 'http://self' + uri,
                   headers=headers, allow_redirects=False)
        data = r.content
        length = int(r.headers.get('Content-Length', 0))
        if len(data) != length:
            data = hack_around_unpacking(uri, headers, data)
            assert len(data) == length
        self._status_and_headers(r.status_code, r.headers)
        if meth == 'GET':
            self.wfile.write(data)
        log.info(f'{meth} {uri} served {length}')
    except BrokenPipeError:
        log.warning(f'Broken pipe for {meth} {uri}')
    except ConnectionResetError:
        log.warning(f'Connection reset for {meth} {uri}')
    except requests.exceptions.ConnectionError:
        log.warning(f'Connection error for {meth} {uri}')
def mount_supported_fs(backing_file, tgt):
    log.info('mounting a reflink-supported filesystem for image storage...')
    tgt_uid, tgt_gid = os.stat(tgt).st_uid, os.stat(tgt).st_gid
    subprocess.run(['sudo', 'mount', '-o', 'loop', backing_file, tgt],
                   check=True)
    mount_uid, mount_gid = os.stat(tgt).st_uid, os.stat(tgt).st_gid
    if (tgt_uid, tgt_gid) != (mount_uid, mount_gid):
        log.debug(f'fixing owner:group ({tgt_uid}:{tgt_gid})')
        subprocess.run(['sudo', 'chown', f'{tgt_uid}:{tgt_gid}', tgt],
                       check=True)
    if tgt.startswith('/home'):
        # label the loop-mounted filesystem like a home directory (SELinux)
        subprocess.run(['sudo', 'semanage', 'fcontext', '-a',
                        '-t', 'user_home_dir_t', tgt + '(/.*)?'],
                       check=False)
        subprocess.run(['sudo', 'restorecon', '-v', tgt], check=False)
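# mount_supported_fs() above loop-mounts a backing file as a reflink-capable
# image store.  A minimal sketch of preparing such a backing file; the use of
# XFS, the 25G size and this exact flow are assumptions for illustration, not
# necessarily how fingertip itself creates it.
def prepare_backing_file(backing_file, size='25G'):
    # allocate a sparse file and format it with reflink support enabled
    subprocess.run(['truncate', '-s', size, backing_file], check=True)
    subprocess.run(['mkfs.xfs', '-q', '-m', 'reflink=1', backing_file],
                   check=True)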
def fetch(self, url, out_path):
    sources = saviour_sources()
    for i, (source, cache) in enumerate(sources):
        if is_fetcheable(source, url) or i == len(sources) - 1:
            if source == 'local':
                reflink.auto(path.saviour(url), out_path)
                return
            sess = self._get_requests_session(direct=not cache)
            if source == 'direct':
                surl = url
            else:
                surl = source + '/' + url
                surl = 'http://' + surl if '://' not in source else surl
            log.debug(f'fetching{"/caching" if cache else ""} '
                      f'{os.path.basename(url)} from {surl}')
            r = sess.get(surl)  # not raw because that punctures cache
            with open(out_path, 'wb') as f:
                f.write(r.content)
            return
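# A hedged illustration of the (source, cache) pairs saviour_sources() could
# hand to fetch() above; the exact values and the mirror hostname are invented.
# The last source is tried even if is_fetcheable() rejects it, 'local' is
# served via reflinks (its cache flag is never consulted), and cache=False
# makes fetch() use a direct, non-caching requests session.
_EXAMPLE_SAVIOUR_SOURCES = [
    ('local', True),                       # reflink from path.saviour(url)
    ('mirror.example.org/saviour', True),  # prefixed onto the url, cached
    ('direct', False),                     # fall back to the original url
]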
def _cache_aware_apply(self, step, tag, func, args, kwargs):
    assert self._state == 'loaded'

    # Could there already be a cached result?
    log.debug(f'PATH {self.path} {tag}')
    new_mpath = os.path.join(self._parent_path, tag)
    end_goal = self._link_as

    lock_path = os.path.join(self._parent_path, '.' + tag + '-lock')
    do_lock = not hasattr(func, 'transient')
    if do_lock:
        log.info(f'acquiring lock for {tag}...')
    with lock.MaybeLock(lock_path, lock=do_lock):
        prev_log = self.log
        if os.path.exists(new_mpath) and not needs_a_rebuild(new_mpath):
            # sweet, scratch this instance, fast-forward to cached result
            log.info(f'reusing {step} @ {new_mpath}')
            self._finalize()
            clone_from_path = new_mpath
        else:
            # loaded, not spun up, step not cached: perform step, cache
            log.info(f'applying (and, possibly, caching) {tag}')
            prev_log.disable_hint()
            self.log = log.sublogger('plugins.' + tag.split(':', 1)[0],
                                     os.path.join(self.path, 'log.txt'))
            m = func(self, *args, **kwargs)
            prev_log.enable_hint()
            if m:
                assert not m._transient
                m._finalize(link_as=new_mpath, name_hint=tag)
                clone_from_path = new_mpath
                log.info(f'successfully applied and saved {tag}')
            else:  # transient step
                clone_from_path = self._parent_path
                log.info(f'successfully applied and dropped {tag}')
        m = clone_and_load(clone_from_path, link_as=end_goal)
        m.log = prev_log
        return m
def apply(self, step, *args, **kwargs):
    func, tag = step_loader.func_and_autotag(step, *args, **kwargs)
    log.debug(f'apply {self.path} {step} {func} {args} {kwargs}')
    if self._state == 'spun_up':
        log.debug('applying to unclean')
        return func(self, *args, **kwargs)
    elif self._state == 'loaded':
        log.debug('applying to clean')
        return self._cache_aware_apply(step, tag, func, args, kwargs)
    else:
        log.critical(f'apply to state={self._state}')
        raise RuntimeError(f'State machine error, apply to {self._state}')
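# A hedged sketch of driving apply() from a script or plugin; the step name
# 'exec' and its argument are illustrative assumptions, not a claim about
# which step plugins actually exist.
def _apply_example(m):
    # on a 'loaded' machine this goes through _cache_aware_apply() and may
    # fast-forward to a cached result; on a 'spun_up' machine the step
    # function is executed directly and nothing is cached
    return m.apply('exec', 'uname -a')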
def _event_loop():
    log.debug('inotify blocks')
    for event in self._inotify.read():
        log.debug(f'inotify {event}')
        if not _is_event_rerun_worthy(event):
            continue
        log.debug('that was rerun-worthy')

        # exhausting the events queue / debouncing
        debounce_end_time = time.time() + WATCHER_DEBOUNCE_TIMEOUT
        while True:
            time_left = debounce_end_time - time.time()
            if time_left < 0:
                break
            _ = self._inotify.read(timeout=time_left)

        # finally, set the flag and cease functioning
        self.rewind_needed.set()
def mirror(config, *what_to_mirror):
    total_failures = []
    failures = collections.defaultdict(list)

    with open(config) as f:
        config = ruamel.yaml.YAML(typ='safe').load(f)
    hows, whats = config['how'], config['what']
    if not what_to_mirror:
        what_to_mirror = whats.keys()
    else:
        what_to_mirror = [k for k in whats.keys()
                          if any(fnmatch.fnmatch(k, req)
                                 for req in what_to_mirror)]

    for resource_name in what_to_mirror or whats.keys():
        s = whats[resource_name]
        log.debug(f'processing {resource_name}...')

        if s is None:
            how, suffix = resource_name, ''
        elif '/' in s:
            how, suffix = s.split('/', 1)
            suffix = '/' + suffix
        else:
            how, suffix = s, ''

        try:
            how = hows[how]
        except KeyError:
            log.error(f'missing how section on {how}')
            raise SystemExit()

        url = how['url'] + suffix
        method = how['method']
        sources = how['sources'] if 'sources' in how else [how['url']]
        sources = [s + suffix for s in sources]
        extra_args = {k: v for k, v in how.items()
                      if k not in ('url', 'sources', 'method')}

        if f'method_{method}' not in globals():
            log.error(f'unsupported method {method}')
            raise SystemExit()
        meth = globals()[f'method_{method}']
        symlink = path.saviour(url.rstrip('/'))
        # usually symlink points to data, but while we're working on it,
        # it temporarily points to a consistent snapshot of it named `snap`
        data = path.saviour('_', resource_name, 'data')
        snap = path.saviour('_', resource_name, 'snap')
        temp = path.saviour('_', resource_name, 'temp')
        lockfile = path.saviour('_', resource_name) + '-lock'
        assert data.startswith(path.SAVIOUR)
        assert snap.startswith(path.SAVIOUR)
        assert temp.startswith(path.SAVIOUR)

        sublog = log.Sublogger(f'{method} {resource_name}')
        sublog.info('locking...')
        with lock.Lock(lockfile):
            os.makedirs(os.path.dirname(snap), exist_ok=True)

            if os.path.exists(temp):
                sublog.info('removing stale temp...')
                _remove(temp)
            if os.path.exists(symlink):  # it's already published
                if os.path.exists(data) and not os.path.exists(snap):
                    # `data` is present and is the best we have to publish
                    sublog.info('snapshotting...')
                    reflink.always(data, temp, preserve=True)
                    os.rename(temp, snap)
                if os.path.exists(snap):
                    # link to a consistent snapshot while we work on `data`
                    _symlink(snap, symlink)

            for source in sources:
                sublog.info(f'trying {source}...')
                try:
                    meth(sublog, source, snap, data, **extra_args)
                    assert os.path.exists(data)
                    break
                except Exception as _:
                    traceback.print_exc()
                    failures[resource_name].append(source)
                    fingertip.util.log.warning(f'failed to mirror {source}')

            if len(failures[resource_name]) == len(sources):
                sublog.error(f'failed to mirror '
                             f'from all {len(sources)} sources')
                total_failures.append(resource_name)
                continue

            _symlink(data, symlink)
            if os.path.exists(snap):
                os.rename(snap, temp)  # move it out of the way asap
                sublog.info('removing now obsolete snapshot...')
                _remove(temp)

        try:
            deduplicate(sublog, resource_name, timeout=1)
        except lock.LockTimeout:
            log.warning('skipped deduplication, db was locked')

    if total_failures:
        fingertip.util.log.error(f'failed: {", ".join(total_failures)}')
        raise SystemExit()
    log.info('saviour has completed mirroring')
def _cache_aware_apply(self, step, tag, func, args, kwargs, last_step):
    assert self._state == 'loaded'

    transient_hint = func.transient if hasattr(func, 'transient') else None
    if callable(transient_hint):
        transient_hint = supply_last_step_if_requested(transient_hint,
                                                       last_step)
        transient_hint = transient_hint(self, *args, **kwargs)

    return_as_transient = self._transient
    exec_as_transient = (transient_hint in ('always', True) or
                         transient_hint == 'last' and last_step)
    log.debug(f'transient: {transient_hint}')
    log.debug(f'exec_as_transient: {exec_as_transient}')
    log.debug(f'return_as_transient: {return_as_transient}')
    self._transient = exec_as_transient

    # Could there already be a cached result?
    log.debug(f'PATH {self.path} {tag}')
    new_mpath = os.path.join(self._parent_path, tag)

    lock_path = os.path.join(self._parent_path, '.' + tag + '-lock')
    do_lock = not self._transient
    if do_lock:
        log.info(f'acquiring lock for {tag}...')
    prev_log_name = self.log.name
    self.log.finalize()
    with lock.Lock(lock_path) if do_lock else lock.NoLock():
        if (os.path.exists(new_mpath) and not needs_a_rebuild(new_mpath) and
                not exec_as_transient):
            # sweet, scratch this instance, fast-forward to cached result
            log.info(f'reusing {step} @ {new_mpath}')
            self._finalize()
            clone_from_path = new_mpath
        else:
            # loaded, not spun up, step not cached: perform step, cache
            log.info(f'applying (and, possibly, caching) {tag}')
            self.log = log.Sublogger('plugins.' + tag.split(':', 1)[0],
                                     os.path.join(self.path, 'log.txt'))
            func = supply_last_step_if_requested(func, last_step)
            m = func(self, *args, **kwargs)

            if m:
                if m._transient and transient_hint == 'last' and last_step:
                    assert m._state == 'dropped'
                    # transient-when-last step returned m
                    # just in case it's not the last, but it was.
                    # m is dropped already, only log contents is preserved.
                    fname = f'{datetime.datetime.utcnow().isoformat()}.txt'
                    t = path.logs(fname, makedirs=True)
                    with open(t, 'w') as f:
                        f.write(m.log_contents)
                    return t
                assert not m._transient, 'transient step returned a value'
                m._finalize(link_as=new_mpath, name_hint=tag)
                clone_from_path = new_mpath
                log.info(f'successfully applied and saved {tag}')
            else:  # transient step, either had hints or just returned None
                clone_from_path = self._parent_path
                log.info(f'successfully applied and dropped {tag}')

    if last_step:
        return os.path.join(clone_from_path, 'log.txt')

    m = clone_and_load(clone_from_path)
    m.log = log.Sublogger(prev_log_name,
                          os.path.join(m.path, 'log.txt'))
    m._transient = return_as_transient
    return m
def mirror(config, *what_to_mirror, deduplicate=None):
    total_failures = []
    failures = collections.defaultdict(list)

    with open(config) as f:
        config = ruamel.yaml.YAML(typ='safe').load(f)
    if 'mirror' in config and not config['mirror']:
        log.warning('mirroring is disabled in config')
        return

    hows, whats = config['how'], config['what']
    if not what_to_mirror:
        what_to_mirror = whats.keys()
    else:
        what_to_mirror = ([k for k in whats.keys()
                           if any(fnmatch.fnmatch(k, req)
                                  for req in what_to_mirror)] +
                          [k for k in what_to_mirror if '=' in k])
    if not what_to_mirror:
        log.error('nothing to mirror')
        return

    for resource in what_to_mirror:
        log.debug(f'processing {resource}...')

        if '=' not in resource:
            resource_name, tail = resource, ''
            s = whats[resource_name]
        else:  # example: alpine-3.13=alpine/v3.13/main/x86
            resource_name, s = resource.split('=', 1)
            # FIXME UGLY: config overrides are stronger than = (more syntax?)
            # TODO: whats shouldn't be a dict, I think, just a list of strings
            if resource_name in whats:
                s = whats[resource_name]
        if s is None:
            s = resource_name

        if '/' in s:
            how_name, suffix = s.split('/', 1)
            suffix = '/' + suffix
        else:
            how_name, suffix = s, ''

        try:
            how = hows[how_name]
        except KeyError:
            log.error(f'missing how section on {how_name}')
            raise SystemExit()

        url = how['url'] + suffix
        method = how['method']
        sources = how['sources'] if 'sources' in how else [how['url']]
        sources = [s + suffix for s in sources]
        extra_args = {k: v for k, v in how.items()
                      if k not in ('url', 'sources', 'method',
                                   'validate', 'deduplicate')}

        if f'method_{method}' not in globals():
            log.error(f'unsupported method {method}')
            raise SystemExit()
        meth = globals()[f'method_{method}']
        symlink = path.saviour(url.rstrip('/'))
        # usually symlink points to data, but while we're working on it,
        # it temporarily points to a consistent snapshot of it named `snap`
        data = os.path.realpath(path.saviour('_', resource_name, 'data'))
        snap = os.path.realpath(path.saviour('_', resource_name, 'snap'))
        temp = os.path.realpath(path.saviour('_', resource_name, 'temp'))
        lockfile = path.saviour('_', resource_name) + '-lock'
        assert data.startswith(os.path.realpath(path.SAVIOUR))
        assert snap.startswith(os.path.realpath(path.SAVIOUR))
        assert temp.startswith(os.path.realpath(path.SAVIOUR))

        sublog = log.Sublogger(f'{method} {resource_name}')
        sublog.info('locking...')
        with lock.Lock(lockfile):
            os.makedirs(os.path.dirname(snap), exist_ok=True)

            if os.path.exists(temp):
                sublog.info('removing stale temp...')
                _remove(temp)
            if os.path.exists(symlink):  # it's already published
                if os.path.exists(data) and not os.path.exists(snap):
                    # `data` is present and is the best we have to publish
                    sublog.info('snapshotting...')
                    reflink.always(data, temp, preserve=True)
                    os.rename(temp, snap)
                if os.path.exists(snap):
                    # link to a consistent snapshot while we work on `data`
                    _symlink(snap, symlink)

            for source in sources:
                sublog.info(f'trying {source}...')
                try:
                    meth(sublog, source, snap, data, **extra_args)
                    assert os.path.exists(data)
                    if 'validate' in how:
                        sublog.info(f'validating with {how["validate"]}...')
                        validator = globals()[f'validate_{how["validate"]}']
                        validator(sublog, source, data)
                        sublog.info('validated')
                    break
                except Exception as _:
                    traceback.print_exc()
                    failures[resource_name].append(source)
                    fingertip.util.log.warning(f'failed to mirror {source}')

            if len(failures[resource_name]) == len(sources):
                sublog.error(f'failed to mirror '
                             f'from all {len(sources)} sources')
                total_failures.append(resource_name)
                continue

            _symlink(data, symlink)
            if os.path.exists(snap):
                os.rename(snap, temp)  # move it out of the way asap
                sublog.info('removing now obsolete snapshot...')
                _remove(temp)

        how_deduplicate = how.get('deduplicate', True)
        db_name = how_deduplicate if how_deduplicate is not True else how_name
        if how_deduplicate and deduplicate is not False:
            try:
                _deduplicate(sublog, db_name, resource_name, timeout=1)
            except lock.LockTimeout:
                log.warning(f'skipped deduplication of {resource_name}, '
                            f'db {db_name} was locked')

    if total_failures:
        fingertip.util.log.error(f'failed: {", ".join(total_failures)}')
        raise FailureToMirrorError(", ".join(total_failures))
    log.info('saviour has completed mirroring')
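# A hedged sketch of the configuration shape mirror() expects once the YAML is
# parsed: 'how' maps recipe names to url/method/sources (plus optional
# 'validate' and 'deduplicate' keys), and 'what' maps resource names to
# '<how>/<suffix>' strings (or None to reuse the resource name as the how).
# Every name and URL below is invented for illustration.
_EXAMPLE_SAVIOUR_CONFIG = {
    'how': {
        'fedora': {
            'url': 'http://primary.example.org/fedora',
            'sources': ['http://mirror.example.org/fedora',
                        'http://primary.example.org/fedora'],
            'method': 'rsync',    # dispatched to a method_rsync() function
        },
    },
    'what': {
        'fedora-releases': 'fedora/linux/releases',  # how 'fedora' + suffix
    },
}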
def _serve_http(self, uri, headers, meth='GET', cache=True,
                retries=RETRIES_MAX):
    sess = http_cache._get_requests_session(direct=not cache)
    sess_dir = http_cache._get_requests_session(direct=True)

    basename = os.path.basename(uri)
    headers = {k: v for k, v in headers.items()
               if not (k in STRIP_HEADERS or k.startswith('Proxy-'))}
    headers['Accept-Encoding'] = 'identity'
    log.debug(f'{meth} {basename} ({uri})')
    for k, v in headers.items():
        log.debug(f'{k}: {v}')

    error = None
    try:
        if meth == 'GET':
            # direct streaming or trickery might be required...
            preview = sess.head(uri, headers=headers, allow_redirects=False)
            if (300 <= preview.status_code < 400 and
                    'Location' in preview.headers):
                nu = preview.headers['Location']
                if nu.startswith('https://'):
                    # no point in serving that, we have to pretend
                    # that never happened
                    log.debug(f'suppressing HTTPS redirect {nu}')
                    return self._serve_http(nu, headers, meth=meth,
                                            cache=cache, retries=retries)
            direct = []
            if not cache:
                direct.append('caching disabled for this source')
            if int(preview.headers.get('Content-Length', 0)) > BIG:
                direct.append(f'file bigger than {BIG}')
            if 'Range' in headers:
                # There seems to be a bug in CacheControl
                # that serves contents in full if a range request
                # hits a non-ranged cached entry.
                direct.append('ranged request, playing safe')
            if direct:
                # Don't cache, don't reencode, stream it as is
                log.debug(f'streaming {basename} directly '
                          f'from {uri} ({", ".join(direct)})')
                r = sess_dir.get(uri, headers=headers, stream=True)
                self._status_and_headers(r.status_code, r.headers)
                shutil.copyfileobj(r.raw, self.wfile)
                return

        # fetch with caching
        m_func = getattr(sess, meth.lower())
        r = m_func(uri if '://' in uri else 'http://self' + uri,
                   headers=headers, allow_redirects=False)
        data = r.content
        if 'Content-Length' in r.headers:
            length = int(r.headers['Content-Length'])
            if len(data) != length:
                data = hack_around_unpacking(uri, headers, data)
                assert len(data) == length
    except BrokenPipeError:
        error = f'Upwards broken pipe for {meth} {uri}'
    except ConnectionResetError:
        error = f'Upwards connection reset for {meth} {uri}'
    except requests.exceptions.ConnectionError:
        error = f'Upwards connection error for {meth} {uri}'
    if error:
        # delay a re-request
        if retries:
            log.warning(f'{error} (will retry x{retries})')
            t = (RETRIES_MAX - retries) / RETRIES_MAX * COOLDOWN
            time.sleep(t)
            return self._serve_http(uri, headers, meth=meth, cache=cache,
                                    retries=retries - 1)
        else:
            log.error(f'{error} (out of retries)')
            self.send_error(http.HTTPStatus.SERVICE_UNAVAILABLE)
            return

    log.debug(f'{meth} {basename} fetched {r.status_code} ({uri})')
    try:
        self._status_and_headers(r.status_code, r.headers)
        if meth == 'GET':
            self.wfile.write(data)
    except BrokenPipeError:
        log.warning(f'Downwards broken pipe for {meth} {uri}')
    except ConnectionResetError:
        log.warning(f'Downwards connection reset for {meth} {uri}')
    except requests.exceptions.ConnectionError:
        log.warning(f'Downwards connection error for {meth} {uri}')
    log.debug(f'{meth} {basename} served ({uri})')
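# Worked example of the retry cooldown arithmetic used in _serve_http() above.
# RETRIES_MAX and COOLDOWN are module constants not shown here; the values
# below are assumptions for illustration only.
def _example_retry_delays(retries_max=4, cooldown=10):
    # mirrors t = (RETRIES_MAX - retries) / RETRIES_MAX * COOLDOWN
    # for each successive retry attempt
    return [(retries_max - r) / retries_max * cooldown
            for r in range(retries_max, 0, -1)]
    # -> [0.0, 2.5, 5.0, 7.5]; a further failure answers 503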