def run_daemon(script, kwargs, executable=sys.executable):
    # nohup plus os.setpgrp detaches the child into its own process group,
    # so it survives the parent's exit.
    args = ['nohup', executable, script] + cmd_args(**kwargs)
    logger.debug(' '.join(args))
    return subprocess.Popen(args,
                            stdout=subprocess.DEVNULL,  # avoids leaking file handles
                            stderr=subprocess.DEVNULL,
                            preexec_fn=os.setpgrp)
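
# A minimal usage sketch (hedged: the script path and kwargs are hypothetical,
# and cmd_args is assumed to turn kwargs into '--key value' style arguments):
#
#   proc = run_daemon('train.py', {'lr': 0.01, 'epochs': 10})
#   # proc is a subprocess.Popen; the daemon keeps running after we return.
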
def custom_getter(self, getter, name, *args, **kwargs):
    # Resolve the requested variable name relative to this module's scope and
    # reuse a cached variable if one was registered under that name.
    n = relpath(name + ':0', self._scope.name)
    v = self.reuse_vars.get(n)
    if v:
        logger.debug("reuse " + n + " - " + v.name)
    else:
        v = getter(name, *args, **kwargs)
        logger.debug("create {} - {}".format(n, v.name))
    # Sanity check: every variable has to live in one of the standard
    # collections, otherwise saving/initialization would silently miss it.
    col = in_collections(v)
    if not any([tf.GraphKeys.VARIABLES in col, tf.GraphKeys.LOCAL_VARIABLES in col]):
        raise Exception("Error: collections have to contain tf.GraphKeys.VARIABLES "
                        "or tf.GraphKeys.LOCAL_VARIABLES")
    return v
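
# Sketch of weight sharing via reuse_vars (hedged: the Module construction and
# variable names below are illustrative, not a verified call site):
#
#   shared = {'dense/kernel:0': w, 'dense/bias:0': b}  # relative name -> variable
#   twin = Module('encoder_copy', reuse_vars=shared)   # constructor assumed from below
#   # Inside twin's scope, custom_getter resolves 'dense/kernel' relative to the
#   # scope name and returns w instead of creating a fresh variable.
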
def on_found(self, is_dir, path):
    # A directory counts as an experiment iff it contains a config file.
    if is_dir and os.path.exists(os.path.join(path, CONFIG_NAME)):
        e = Experiment(path, self.host, self.port, self)
        self.exps.update({path: e})
        if self.socketio:
            logger.debug(f'{len(self.exps)} experiments')
def on_created(self, event):
    sleep(3)  # give a new experiment time to write its config file
    ex = os.path.exists(os.path.join(event.src_path, CONFIG_NAME))
    logger.debug('Folder created ' + str(event))
    logger.debug('is exp ' + str(ex))
    self.on_found(event.is_directory, event.src_path)
def __init__(self, path):
    import json
    self._path = path
    try:
        with open(path) as f:
            old_data = json.load(f)
    except json.JSONDecodeError:
        logger.warning('Could not decode config')
        old_data = {}
    except OSError:
        logger.debug('No config file')
        old_data = {}
    self._locked = False  # initialize so the assert below cannot raise AttributeError
    for i in range(10):
        try:
            self._f = open(path, 'w+')
            fcntl.flock(self._f, fcntl.LOCK_EX | fcntl.LOCK_NB)
            self._locked = True
            break
        except BlockingIOError:
            import signal
            pid = old_data.get('pid')
            if pid:
                logger.info(f'Config file is locked (try {i}). '
                            f'Killing previous instance {pid}')
                os.kill(pid, signal.SIGTERM)
                time.sleep(.05)
            else:
                logger.error('Config file is locked and no pid to kill')
    assert self._locked
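
# The locking pattern above in isolation (a sketch; the path is illustrative).
# flock with LOCK_NB raises BlockingIOError when another process holds the lock:
#
#   import fcntl
#   f = open('/tmp/example.lock', 'w+')
#   try:
#       fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)  # non-blocking exclusive lock
#   except BlockingIOError:
#       ...  # someone else holds the lock
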
def command(self, cmd):
    import signal
    if cmd == "kill":
        slurm = self.data.get('slurm')
        if slurm:
            # Cancel the slurm job instead of killing a local process.
            jid = slurm.get('SLURM_JOB_ID') or slurm.get('SLURM_JOBID')
            subprocess.check_call(('scancel', str(jid)))  # raises on failure
            return {}
        pid = self.data.get('pid')
        if pid:
            logger.debug('send kill to ' + str(pid))
            os.kill(pid, signal.SIGTERM)
        return dict()
    if cmd == "run":
        from chi.util import run_daemon
        e = self.data.get('sys_executable')
        a = self.data.get('sys_argv')
        k = self.data.get('args')
        if e and a and k:
            run_daemon(a[0], k, e)
        else:
            logger.debug('run failed because of exec ' + str(e) +
                         ' argv ' + str(a) + ' args ' + str(k))
        return dict()
def on_moved(self, event):
    """
    event.event_type: 'modified' | 'created' | 'moved' | 'deleted'
    event.is_directory: True | False
    event.src_path: path/to/observed/file
    """
    logger.debug(str(event))
def __init__(self, variables_dict, tau=0.001):
    vs = tf.get_variable_scope().name
    with tf.name_scope(""):
        self.vars = variables_dict
        name = "{}/ExponentialMovingAverage".format(vs) if vs else "ExponentialMovingAverage"
        self.ema = tf.train.ExponentialMovingAverage(decay=1 - tau, name=name)
        self.upd = self.ema.apply(list(self.vars.values()))  # also creates shadow vars
        self.averages = {n: self.ema.average(v) for n, v in variables_dict.items()}
        # dict views are not subscriptable in Python 3; use next(iter(...))
        logger.debug(next(iter(self.averages.values())).name)
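
# Usage sketch (hedged: the owning class name is not shown here, so
# ExponentialMovingAverage below stands in for it; the variable is illustrative):
#
#   w = tf.get_variable('w', shape=[3])
#   ema = ExponentialMovingAverage({'w': w}, tau=0.001)
#   # sess.run(ema.upd) after each training step moves the shadow copy
#   # towards w with decay 1 - tau; ema.averages['w'] reads the shadow value.
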
def __init__(self, paths):
    super().__init__()
    self.watches = set()
    for p in paths:
        p = os.path.expanduser(p)
        logger.debug('watch ' + p)
        self.watches.add(Repo.observer.schedule(self, p))
        for f in os.scandir(p):
            # f is an os.DirEntry; is_dir is a method, so it has to be called
            self.on_found(f.is_dir(), f.path)
def tb_killer(self):
    # Terminate tensorboard once no heartbeat arrived for 60 seconds.
    tb = self.tb
    # poll() returns None while the process is running; the original
    # `not tb.poll()` also looped after a clean exit (returncode 0)
    while tb and tb.poll() is None:
        if time() - self.tb_t > 60:
            assert isinstance(tb, subprocess.Popen)
            tb.terminate()
            logger.debug('tensorboard for {} killed because of timeout'.format(self.path))
        sleep(5)
    logger.debug('killer finished')
def tb_watcher(self):
    # Block until the tensorboard subprocess exits, then log the result.
    assert isinstance(self.tb, subprocess.Popen)
    outs, errs = self.tb.communicate()
    returncode = self.tb.returncode
    self.tb = None
    msg = 'tensorboard on {} for {} returned with code {}'.format(
        self.tb_port, self.path, returncode)
    if returncode == 0:
        logger.debug(msg)
    else:
        logger.warning(f'{msg}\n out: {outs}\n err: {errs}')
    logger.debug('tb watcher finished')
def get_free(pool):
    # Poll the pool until at least one port is free, for up to ~2 seconds.
    av = []
    for i in range(20):
        av = [p for p in pool if check_free(p)]
        logger.debug('Free ports: ' + str(av))
        if av:
            break
        sleep(.1)
    if not av:
        logger.error('No ports available')
        return None
    return av[0]
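
# Usage sketch (hedged: check_free is not shown in this section; it is assumed
# to test whether a port can be bound, roughly like this):
#
#   import socket
#   def check_free(port):
#       with socket.socket() as s:
#           try:
#               s.bind(('', port))  # raises OSError if the port is taken
#               return True
#           except OSError:
#               return False
#
#   port = get_free(range(6007, 6030))
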
def tensorboard(self):
    # join ensures the recursive '**' is its own path component
    has_event_files = glob.glob(os.path.join(self.path, '**', '*.tfevents*'),
                                recursive=True)
    if not has_event_files:
        return dict(no_event_files=True)
    elif not self.tb:
        self.tb_port = get_free(self.server.port_pool)
        cmds = ['tensorboard',
                '--logdir', "{}".format(self.path),
                '--host', '0.0.0.0',
                '--port', str(self.tb_port)]
        logger.debug('Start tensorboard with: ' + ' '.join(cmds))
        self.tb = subprocess.Popen(cmds,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   universal_newlines=True)
        Thread(target=self.tb_watcher, daemon=True).start()

        @repeat_until(timeout=6.)
        def check_tb():
            try:
                url = "http://{}:{}".format(self.host, self.tb_port)
                r = requests.get(url)  # requests.head not supported by tensorboard
                available = r.status_code == 200
                sleep(.3)
                logger.debug('tb on {} status {}, {}'.format(url, r.status_code, r.reason))
                return available
            except requests.ConnectionError:
                return False

        # repeat_until presumably evaluates the function repeatedly until it
        # returns truthy or the timeout elapses, binding the final result
        if not check_tb:
            logger.warning('tb could not be started')

        self.tb_t = time()
        Thread(target=self.tb_killer, daemon=True).start()
        return dict(host=self.host, port=self.tb_port, new=True,
                    available=check_tb, no_event_files=False)
    else:
        self.tb_t = time()  # heartbeat
        return dict(host=self.host, port=self.tb_port, new=False,
                    available=True, no_event_files=False)
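
# Client-side view of the protocol above (a sketch): the board page polls this
# endpoint; the first call launches tensorboard and returns new=True, and every
# subsequent call acts as a heartbeat (new=False) that refreshes self.tb_t so
# tb_killer does not terminate the process after its 60-second idle window.
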
def on_connect(self):
    self.emit('info', dict(jupyter_port=self.jupyter_port,
                           user=os.environ.get('USER'),
                           bashrc=self.bashrc))
    self.experiments()  # poll file system
    self.connections += 1
    logger.debug(f'connect ({self.connections})')
def wrap(*args, **kwargs):
    key = args
    r = store.get(key)
    if not r:
        r = Resource()
        r.key = key
        r.v = f(*args, **kwargs)
        if hasattr(r.v, 'release'):
            # preserve the wrapped object's own release method
            r.release = r._release = r.v.release
        r.to = timeout
        logger.debug('resource stored')
        store.update({key: r})
    r.t = time()  # refresh the last-access timestamp
    return r.v
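
# Usage sketch (hedged: the enclosing decorator's public name is not shown in
# this section, so `resource` and the handle below are hypothetical):
#
#   @resource(timeout=60)
#   def connect(host):
#       return make_handle(host)  # hypothetical object, may expose .release()
#
#   a = connect('db1')
#   b = connect('db1')  # same args -> same cached object (b is a)
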
def __init__(self, host, port, rootdir, port_pool, polling_interval=20):
    self.port_pool = port_pool
    self.rootdir = rootdir
    self.host = host
    self.port = port
    self.exps = {}

    # Start jupyter
    jpt = shutil.which('jupyter')
    self.jupyter_port = p = get_free(self.port_pool) if jpt else -1
    if jpt:
        csp = str(dict(headers={'Content-Security-Policy':
                                f"frame-ancestors 'self' http://localhost:{self.port}/"}))
        logger.debug(f'Start jupyter ({jpt}) on port {p}')
        self.jupyter = subprocess.Popen(
            [jpt, 'notebook', '--port=' + str(p), '--no-browser', '/',
             "--NotebookApp.token=''",
             f"--NotebookApp.tornado_settings={csp}",
             f"--FileContentsManager.hide_globs=['']"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL)

    Namespace.__init__(self, '/experiments')

    # Init Repo
    alt = '/tmp/chi_' + getpass.getuser()
    if os.path.lexists(alt):  # lexists also catches broken symlinks
        os.remove(alt)
    root = os.path.expanduser('~/.chi')
    os.symlink(root, alt, target_is_directory=True)

    roots = [rootdir, join(root, 'experiments'), join(root, 'board'), join(root, 'apps')]
    for p in roots:
        mkdirs(os.path.expanduser(p))

    bashrc = os.path.expanduser('~/.chi') + '/bashrc.sh'
    if os.path.lexists(bashrc):
        os.remove(bashrc)
    os.symlink(os.path.expanduser('~/.bashrc'), bashrc)
    self.bashrc = bashrc

    self.connections = 0

    Repo.__init__(self, roots)
    Repo.observer.start()
def remote_install_dependency(address, module):
    user, host = address.split('@')
    rem = f"/home/{user}/.chi/cache"
    # the repository root is two levels above the module's __file__
    repo = join('/', *module.__file__.split('/')[:-2])
    target = address + ':' + rem + repo
    logger.debug(f"Uploading {repo} to {target}")
    copydir(repo, target, with_src=False)
    cmd = f'pip3 install --user -e {rem + repo}'
    try:
        subprocess.check_output(['ssh', address, f'echo "{cmd}"; {cmd}'],
                                universal_newlines=True)
    except subprocess.CalledProcessError as e:
        logger.error(f'Install failed with code {e.returncode} and output:\n{e.output}')
        raise
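
# Usage sketch (hedged: the address and module are illustrative):
#
#   import chi
#   remote_install_dependency('alice@gpu-server', chi)
#   # Uploads the package's repository root to the remote ~/.chi/cache mirror
#   # and runs an editable pip install there over ssh.
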
def __init__(self, name, reuse_vars=None, session=None, logging_path=None, **kwargs):
    self._finalized = False
    self.output = None
    self.inputs = None
    self.reuse_vars = reuse_vars or {}
    # Enter the scope once so it is created, then attach the custom getter
    # that implements variable reuse.
    with tf.variable_scope(name, reuse=False) as self._scope:
        pass
    logger.debug("module: " + self._scope.name)
    self._scope.set_custom_getter(self.custom_getter)
    self._session = session or tf.get_default_session()
    self._logging_path = logging_path
    self._writer = None
def on_deleted(self, event):
    logger.debug(str(event))
    if event.is_directory:
        p = event.src_path
        e = self.exps.get(p)
        if e:
            e.delete()
            del self.exps[p]
            logger.debug('actually deleted exp')
        if self.socketio:
            logger.debug(f'{len(self.exps)} experiments')
def on_disconnect(self):
    self.connections -= 1
    logger.debug(f'disconnect ({self.connections})')
def on_modified(self, event):
    if event.src_path == os.path.join(self.path, CONFIG_NAME):
        logger.debug(f'{event.src_path} modified')
        self.server.upd()
def kill():
    f.close()
    # poll() returns None while the process is still running; the original
    # `not p.poll()` would also fire after a clean exit (returncode 0)
    if p.poll() is None:
        logger.debug('Killing process started with ' + str(cmd))
        p.kill()
def __init__(self, f, logdir=None, *args, **kwargs):
    """
    :param f: function
    :param logdir: string
    :param args:
    :param kwargs:
    """
    self._step = 0

    # Init Model parent class
    Model.__init__(self, f, *args, **kwargs)

    # Process inputs
    import collections
    # for each parameter, inputs maps the parameter name to its placeholder
    self.inputs = collections.OrderedDict()
    # for each parameter, auto_wrap records whether its shape is a list
    self.auto_wrap = collections.OrderedDict()
    for name, dtype, shape, default in parse_signature(f):
        # If a default value is given we don't need to feed this placeholder;
        # the default becomes its output. (Assumes parse_signature yields None
        # when no default is present; a bare truthiness test would misfire on
        # falsy defaults such as 0.)
        if default is not None:
            p = tf.placeholder_with_default(default, shape)
        else:
            p = tf.placeholder(dtype, shape, name)
        self.auto_wrap.update({name: isinstance(shape, list)})
        self.inputs.update({name: p})

    # whether any parameter has a list shape
    self.use_wrap = any(self.auto_wrap.values())

    # Build graph
    out = super().__call__(**self.inputs)  # build Model
    # make SubGraph properties available on self
    self.__dict__.update(self._last_graph.__dict__)

    # Process outputs
    if out is None:
        self.output = tf.no_op()
    # TODO: auto-unwrap single-element list outputs when self.use_wrap is set

    # If a logdir is specified, create a FileWriter for the graph just defined
    if logdir:
        current_app = chi.App.current_app
        if not logdir.startswith('/'):
            if current_app and current_app.logdir:
                logger.debug('logdir path relative to app: {}, app logdir: {}'.format(
                    current_app, current_app.logdir))
                logdir = current_app.logdir + '/' + logdir
            else:
                logger.debug('fall back to logdir path relative to working dir')
                logdir = os.path.abspath('./' + logdir)  # original discarded this result
        # Write the session graph
        self.writer = tf.summary.FileWriter(logdir, graph=chi.chi.get_session().graph)
    else:
        self.writer = None

    # Collect activation tensors (histogram summaries currently disabled)
    activations = self.get_tensors_by_optype('Relu')  # TODO: generalize to non-Relu

    summaries = self.summaries()
    # If there are summaries and a writer, merge them into a single op
    if summaries and self.writer:
        self._summary_op = tf.summary.merge(summaries)

    super().initialize()
def chiboard(self: chi.Experiment, host='localhost', port=MAGIC_PORT, rootdir='',
             loglevel='debug', timeout=24 * 60 * 60, port_pool=""):
    from flask import Flask, jsonify, send_from_directory, send_file
    from chi.board.server import Server
    from chi.board.util import rcollect
    from chi.board.util import get_free_port
    from chi.logger import logger
    import os
    import signal
    from time import time, sleep
    from threading import Thread
    from os.path import expanduser as expandu
    from flask_socketio import SocketIO

    def expanduser(p):
        pa = expandu(p)
        return pa if pa.startswith('/') else '/' + pa

    chi.set_loglevel(loglevel)

    if port == 0:
        port = get_free_port(host)
        print(f'{port}')
    self.config.port = port

    p = os.path.dirname(os.path.realpath(__file__))
    app = Flask(__name__, root_path=p, static_url_path='/')
    socketio = SocketIO(app)

    if rootdir == '':
        rootdir = os.environ.get('CHI_EXPERIMENTS') or '~'
    logger.debug('Rootdir: ' + rootdir)

    if port_pool:
        port_pool = [int(p) for p in port_pool.split(',')]
    else:
        port_pool = range(port + 1, port + 30)

    server = Server(host, port, rootdir, port_pool)

    remotes = []
    p = expanduser('~/.chi/board/remotes.json')
    if os.path.exists(p):
        with open(p) as f:
            remotes = json.load(f)

    state = dict(last_request=time())

    def killer():
        # Shut the board down after `timeout` seconds without requests.
        while time() - state['last_request'] < timeout:
            sleep(2)
        logger.error('timeout')
        os.kill(os.getpid(), signal.SIGINT)  # kill self

    Thread(target=killer, daemon=True).start()

    @app.before_request
    def tick():
        state.update(last_request=time())

    @app.route("/")
    def index():
        return send_file("components/index.html")

    @app.route("/favicon")
    def favicon():
        return send_file("components/favicon.png")

    @app.route('/bower_components/<path:path>')
    def bower(path):
        return send_from_directory('bower_components', path)

    @app.route('/components/<path:path>')
    def comp(path):
        return send_from_directory('components', path)

    @app.route("/exp/")
    def exp():
        return send_file("components/experiment.html")

    @app.route("/info/<string:host>/<path:path>")  # experiment page
    def info(host, path):
        if host == 'local':
            return jsonify(server.info(expanduser(path)))
        else:
            # request scripts info, update urls
            raise Exception('Remote not yet supported')

    @app.route("/logs/<path:path>")
    def logs(path):
        data = []

        def key(x):
            return '_' if x == 'stdout' else x

        path = expanduser(path) + '/logs'
        for p in sorted(os.listdir(path), key=key):
            with open(path + '/' + p, 'r') as f:
                # read at most the last 50000 bytes of each log file
                f.seek(0, os.SEEK_END)
                size = f.tell()
                f.seek(max((0, size - 50000)), 0)
                c = f.read()
            c = c.rstrip('\n')
            data.append({'name': os.path.basename(p), 'content': c})
        return jsonify(data)

    @app.route("/tb/<string:host>/<path:path>")
    def tb(host, path):
        if host == 'local':
            return jsonify(server.tensorboard(expanduser(path)))
        else:
            # make local port forward, request scripts tensorboard, update urls
            raise Exception('Remote not yet supported')

    @app.route("/delete/<path:path>")
    def delete(path):
        return jsonify(server.delete(expanduser(path)))

    @app.route("/trend/<path:path>")
    def trend(path):
        sio = server.trend('/' + path)
        return send_file(sio, attachment_filename='trend.png', mimetype='image/png')

    @app.route("/<string:cmd>/<path:path>")
    def command(cmd, path):
        return jsonify(server.command(cmd, expanduser(path)))

    try:
        socketio.on_namespace(server)
        socketio.run(app, host=host, port=port, log_output=loglevel == 'debug')
    finally:
        server.shutdown()
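
# Invocation sketch (hedged: chiboard takes a chi.Experiment as self, so it is
# presumably launched through chi's experiment machinery or CLI rather than
# called directly; the command name below is an assumption):
#
#   $ chiboard --port 0 --rootdir ~/experiments
#   # port=0 picks a free port; an empty rootdir falls back to
#   # $CHI_EXPERIMENTS or '~'.
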
def release(self):
    logger.debug('release rsrc')
    if hasattr(self, '_release'):
        self._release()
    del store[self.key]