def submitTaskWithFiles(self, clientCodeFile, args, files={}, job_id=None):
    old_cwd = os.getcwd()
    cwd = os.path.dirname(os.path.realpath(__file__))
    os.chdir(cwd)

    # Derive a stable experiment name from the caller-supplied job_id,
    # or fall back to a random UUID.
    if job_id is not None:
        experiment_name = self.project_name + "_" + str(job_id)
    else:
        experiment_name = self.project_name + "_" + str(uuid.uuid4())

    tmpdir = tempfile.gettempdir()
    args_file = os.path.join(tmpdir, experiment_name + "_args.pkl")

    workspace_orig = os.getcwd()
    ignore_arg = ''
    ignore_filepath = os.path.join(workspace_orig, ".studioml_ignore")
    if os.path.exists(ignore_filepath) and \
            not os.path.isdir(ignore_filepath):
        ignore_arg = "--exclude-from=%s" % ignore_filepath

    # Snapshot the workspace into the artifact cache, honoring
    # .studioml_ignore exclusions.
    workspace_new = fs_tracker.get_artifact_cache(
        'workspace', experiment_name)
    rsync_cp(workspace_orig, workspace_new, ignore_arg, self.logger)

    distpath = os.path.join(old_cwd, 'dist')
    if os.path.exists(distpath):
        self.logger.info(
            'dist folder found at {}, copying into workspace'
            .format(distpath))
        rsync_cp(distpath, os.path.join(workspace_new, 'dist'))

    self.logger.info('Created workspace ' + workspace_new)

    artifacts = self._create_artifacts(
        clientCodeFile, args_file, workspace_new, files)

    # Pickle with protocol 2 so that both Python 2 and Python 3
    # workers can unpickle the arguments.
    with open(args_file, 'wb') as f:
        f.write(pickle.dumps(args, protocol=2))

    experiment = create_experiment(
        'completion_service_client.py',
        [self.config['verbose']],
        experiment_name=experiment_name,
        project=self.project_name,
        artifacts=artifacts,
        resources_needed=self.resources_needed)

    tic = time.time()
    runner.submit_experiments(
        [experiment],
        config=self.config,
        logger=self.logger,
        cloud=self.cloud,
        queue_name=self.queue_name)

    self.submitted[experiment.key] = time.time()
    os.chdir(old_cwd)
    toc = time.time()

    self.logger.info('Submitted experiment ' + experiment.key +
                     ' in ' + str(toc - tic) + ' s')
    return experiment_name
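# Usage sketch (assumptions flagged): submitting work through the
# completion service and collecting return values. CompletionService,
# its constructor arguments, and getResults() are recalled from the
# surrounding codebase, not confirmed by this excerpt; treat the exact
# names and signatures as illustrative.
#
# from studio.completion_service import CompletionService
#
# with CompletionService('my_project') as cs:
#     key = cs.submitTaskWithFiles(
#         'my_client.py',                  # script with the client function
#         {'lr': 0.01},                    # pickled into the 'args' artifact
#         files={'data': 's3://bucket/train.csv'},  # becomes 'qualified'
#         job_id='trial_0')                # deterministic experiment name
#     for result in cs.getResults(blocking=True):
#         print(result)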
def create_experiments(hyperparam_tuples):
    experiments = []
    # experiment_names = {}
    for hyperparam_tuple in hyperparam_tuples:
        # Make each experiment name unique and filesystem-safe.
        experiment_name = experiment_name_base
        experiment_name += "__opt__%s__%s" % (util.rand_string(32),
                                              int(time.time()))
        experiment_name = experiment_name.replace('.', '_')

        workspace_new = fs_tracker.get_artifact_cache(
            'workspace', experiment_name)

        current_artifacts = artifacts.copy()
        current_artifacts.update({
            'workspace': {
                'local': workspace_new,
                'mutable': True
            }
        })

        rsync_cp(workspace_orig, workspace_new, ignore_arg, logger)
        # shutil.copytree(workspace_orig, workspace_new)

        for param_name, param_value in six.iteritems(hyperparam_tuple):
            if isinstance(param_value, np.ndarray):
                # Arrays are too large to splice into the script text;
                # pass them as immutable .npy artifacts instead.
                array_filepath = '/tmp/%s.npy' % util.rand_string(32)
                np.save(array_filepath, param_value)
                assert param_name not in current_artifacts
                current_artifacts[param_name] = {
                    'local': array_filepath,
                    'mutable': False
                }
            else:
                # Scalar values are substituted directly into the script:
                # replace whole-word occurrences of the parameter name,
                # but only where no '=' follows on the same line, so that
                # assignment left-hand sides stay intact.
                with open(os.path.join(workspace_new, exec_filename),
                          'r') as f:
                    script_text = f.read()

                script_text = re.sub(
                    '\\b' + param_name + '\\b(?=[^=]*\\n)',
                    str(param_value),
                    script_text)

                with open(os.path.join(workspace_new, exec_filename),
                          'w') as f:
                    f.write(script_text)

        experiments.append(create_experiment(
            filename=exec_filename,
            args=other_args,
            experiment_name=experiment_name,
            project=project,
            artifacts=current_artifacts,
            resources_needed=resources_needed,
            metric=runner_args.metric,
            max_duration=runner_args.max_duration,
            dependency_policy=StudioDependencyPolicy()
        ))

    return experiments
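# A minimal, self-contained sketch of the substitution rule above: a
# hyperparameter name is replaced only where it appears as a whole word
# with no '=' between it and the end of its line, which leaves
# assignment left-hand sides untouched. The sample script is
# illustrative only.
import re

script_text = "lr = 0.1\nstep = base * lr\n"
patched = re.sub('\\blr\\b(?=[^=]*\\n)', '0.01', script_text)
print(patched)  # -> "lr = 0.1\nstep = base * 0.01\n"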
def submitTaskWithFiles(self, clientCodeFile, args, files={}):
    old_cwd = os.getcwd()
    cwd = os.path.dirname(os.path.realpath(__file__))
    os.chdir(cwd)

    experiment_name = self.project_name + "_" + str(uuid.uuid4())

    tmpdir = tempfile.gettempdir()
    args_file = os.path.join(tmpdir, experiment_name + "_args.pkl")

    workspace_orig = os.getcwd()
    ignore_arg = ''
    ignore_filepath = os.path.join(workspace_orig, ".studioml_ignore")
    if os.path.exists(ignore_filepath) and \
            not os.path.isdir(ignore_filepath):
        ignore_arg = "--exclude-from=%s" % ignore_filepath

    workspace_new = fs_tracker.get_artifact_cache(
        'workspace', experiment_name)
    rsync_cp(workspace_orig, workspace_new, ignore_arg, self.logger)

    distpath = os.path.join(old_cwd, 'dist')
    if os.path.exists(distpath):
        self.logger.info(
            'dist folder found at {}, copying into workspace'
            .format(distpath))
        rsync_cp(distpath, os.path.join(workspace_new, 'dist'))

    self.logger.info('Created workspace ' + workspace_new)

    artifacts = {
        'retval': {
            'mutable': True
        },
        'clientscript': {
            'mutable': False,
            'local': clientCodeFile
        },
        'args': {
            'mutable': False,
            'local': args_file
        },
        'workspace': {
            'mutable': False,
            'local': workspace_new
        }
    }

    # Classify each extra file by its scheme: http(s) URLs, qualified
    # cloud-storage paths (s3:// or gs://), or local filesystem paths.
    url_schema = re.compile('^https?://')
    s3_schema = re.compile('^s3://')
    gcs_schema = re.compile('^gs://')
    for tag, name in six.iteritems(files):
        artifacts[tag] = {}
        if url_schema.match(name):
            artifacts[tag]['url'] = name
        elif s3_schema.match(name) or gcs_schema.match(name):
            artifacts[tag]['qualified'] = name
        else:
            artifacts[tag]['local'] = os.path.abspath(
                os.path.expanduser(name))
        artifacts[tag]['mutable'] = False

    with open(args_file, 'wb') as f:
        f.write(pickle.dumps(args, protocol=2))

    experiment = create_experiment(
        'completion_service_client.py',
        [self.config['verbose']],
        experiment_name=experiment_name,
        project=self.project_name,
        artifacts=artifacts,
        resources_needed=self.resources_needed)

    tic = time.time()
    runner.submit_experiments(
        [experiment],
        config=self.config,
        logger=self.logger,
        cloud=self.cloud,
        queue_name=self.queue_name)

    self.submitted.add(experiment.key)
    os.chdir(old_cwd)
    toc = time.time()

    self.logger.info('Submitted experiment ' + experiment.key +
                     ' in ' + str(toc - tic) + ' s')
    return experiment_name
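# Self-contained restatement of the classification rule above as a
# hypothetical helper, useful for testing the scheme dispatch in
# isolation. classify_artifact() is illustrative, not part of the
# original API.
import os
import re

def classify_artifact(name):
    if re.match('^https?://', name):
        return {'url': name, 'mutable': False}
    if re.match('^(s3|gs)://', name):
        return {'qualified': name, 'mutable': False}
    return {'local': os.path.abspath(os.path.expanduser(name)),
            'mutable': False}

assert 'url' in classify_artifact('https://example.com/data.zip')
assert 'qualified' in classify_artifact('s3://bucket/train.csv')
assert 'local' in classify_artifact('~/data/train.csv')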
def studio_run(self, line, cell=None):
    script_text = []
    pickleable_ns = {}

    # Split the notebook namespace: modules become import statements in
    # the generated script; everything else is kept only if it pickles.
    for varname, var in six.iteritems(self.shell.user_ns):
        if not varname.startswith('__'):
            if isinstance(var, ModuleType) and \
                    var.__name__ != 'studio.magics':
                script_text.append(
                    'import {} as {}'.format(var.__name__, varname)
                )
            else:
                try:
                    pickle.dumps(var)
                    pickleable_ns[varname] = var
                except BaseException:
                    pass

    script_text.append(cell)
    script_text = '\n'.join(script_text)

    stub_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        'run_magic.py.stub')

    with open(stub_path) as f:
        script_stub = f.read()

    script = script_stub.format(script=script_text)
    experiment_key = str(int(time.time())) + \
        "_jupyter_" + str(uuid.uuid4())

    print('Running studio with experiment key ' + experiment_key)

    config = model.get_config()
    if config['database']['type'] == 'http':
        print("Experiment progress can be viewed/shared at:")
        print("{}/experiment/{}".format(
            config['database']['serverUrl'],
            experiment_key))

    workspace_new = fs_tracker.get_artifact_cache(
        'workspace', experiment_key)

    rsync_cp('.', workspace_new)

    with open(os.path.join(workspace_new, '_script.py'), 'w') as f:
        f.write(script)

    ns_path = fs_tracker.get_artifact_cache('_ns', experiment_key)

    with gzip.open(ns_path, 'wb') as f:
        f.write(pickle.dumps(pickleable_ns))

    if line and line.strip():
        runner_args = line.strip().split(' ')
    else:
        runner_args = []

    runner_args.append('--capture={}:_ns'.format(ns_path))
    runner_args.append('--capture-once=.:workspace')
    runner_args.append('--force-git')
    runner_args.append('--experiment=' + experiment_key)

    notebook_cwd = os.getcwd()
    os.chdir(workspace_new)
    print(runner_args + ['_script.py'])
    runner_main(runner_args + ['_script.py'])
    os.chdir(notebook_cwd)

    # Poll until the experiment finishes, then pull the captured
    # namespace back into the notebook.
    with model.get_db_provider() as db:
        while True:
            experiment = db.get_experiment(experiment_key)
            if experiment and experiment.status == 'finished':
                break
            time.sleep(10)

        new_ns_path = db.get_artifact(experiment.artifacts['_ns'])

    # The namespace was written with gzip in binary mode above, so read
    # it back the same way (text-mode open would break pickle.loads).
    with gzip.open(new_ns_path, 'rb') as f:
        new_ns = pickle.loads(f.read())

    self.shell.user_ns.update(new_ns)
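# Minimal, self-contained sketch of the namespace filter above.
# split_namespace() is a hypothetical helper restating the loop for
# clarity; it is not part of the original API.
import os
import pickle
from types import ModuleType

def split_namespace(user_ns):
    imports, picklable = [], {}
    for name, value in user_ns.items():
        if name.startswith('__'):
            continue
        if isinstance(value, ModuleType):
            # Modules cannot be pickled; re-import them by name instead.
            imports.append('import {} as {}'.format(value.__name__, name))
        else:
            try:
                pickle.dumps(value)
                picklable[name] = value
            except BaseException:
                pass  # drop unpicklable values (open files, locks, ...)
    return imports, picklable

imports, picklable = split_namespace({'os': os, 'x': 42, '__builtins__': {}})
print(imports)    # ['import os as os']
print(picklable)  # {'x': 42}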
def run(self, experiment):
    if isinstance(experiment, six.string_types):
        experiment = self.db.get_experiment(experiment)
    elif not isinstance(experiment, Experiment):
        raise ValueError("Unknown type of experiment: " +
                         str(type(experiment)))

    self.logger.info("Experiment key: " + experiment.key)

    with model.get_db_provider(self.config) as db:
        db.start_experiment(experiment)

        # Override env variables with those inside the queued message.
        env = dict(os.environ)
        if 'env' in self.config.keys():
            for k, v in six.iteritems(self.config['env']):
                if v is not None:
                    env[str(k)] = str(v)

        env['PYTHONUNBUFFERED'] = 'TRUE'

        fs_tracker.setup_experiment(env, experiment, clean=False)
        log_path = fs_tracker.get_artifact_cache('output', experiment.key)

        self.logger.debug('Child process environment:')
        self.logger.debug(str(env))

        sched = BackgroundScheduler()
        sched.start()

        with open(log_path, 'w') as output_file:
            python = 'python'
            if experiment.pythonver[0] == '3':
                python = 'python3'
            python = which(python)

            cmd = [python, experiment.filename] + experiment.args
            cwd = experiment.artifacts['workspace'].local_path

            # If a singularity container is attached, wrap the command
            # and expose immutable artifacts as symlinks next to the
            # workspace.
            container_artifact = experiment.artifacts.get('_singularity')
            if container_artifact:
                container = container_artifact.get('local')
                if not container:
                    container = container_artifact.get('qualified')

                cwd = fs_tracker.get_artifact_cache(
                    'workspace', experiment.key)

                for tag, art in six.iteritems(experiment.artifacts):
                    local_path = art.get('local')
                    if not art['mutable'] and local_path and \
                            os.path.exists(local_path):
                        os.symlink(local_path,
                                   os.path.join(os.path.dirname(cwd), tag))

                if experiment.filename is not None:
                    cmd = ['singularity', 'exec', container] + cmd
                else:
                    cmd = ['singularity', 'run', container]

            self.logger.info('Running cmd: {0} in {1}'.format(cmd, cwd))

            p = subprocess.Popen(cmd,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT,
                                 env=env,
                                 cwd=cwd,
                                 text=True)

            def kill_subprocess():
                p.kill()

            def get_duration(tag: str):
                value = self.config.get(tag, '0m')
                return int(str2duration(value).total_seconds() / 60)

            def checkpoint():
                try:
                    db.checkpoint_experiment(experiment)
                except BaseException as e:
                    self.logger.info(e)
                    check_for_kb_interrupt()

            minutes = get_duration('saveWorkspaceFrequency')
            sched.add_job(checkpoint, 'interval', minutes=minutes)

            metrics_path = fs_tracker.get_artifact_cache(
                '_metrics', experiment.key)
            minutes = get_duration('saveMetricsFrequency')
            sched.add_job(lambda: save_metrics(metrics_path),
                          'interval', minutes=minutes)

            def kill_if_stopped():
                try:
                    db_expr = db.get_experiment(experiment.key,
                                                getinfo=False)
                except BaseException:
                    check_for_kb_interrupt()
                    db_expr = None

                # Transient issues with fetching experiment data may
                # return None; in that case leave the experiment
                # running, since nothing useful can be done even if it
                # was stopped. If it runs too long, it will exceed its
                # allocated time and be killed below anyway.
                if db_expr is not None:
                    if db_expr.status == 'stopped':
                        kill_subprocess()
                        return

                if experiment.max_duration is not None and \
                        time.time() > experiment.time_started + \
                        int(str2duration(experiment.max_duration)
                            .total_seconds()):
                    kill_subprocess()
                    return

                # If our task queue is signalled inactive during
                # execution, drop the current run and exit.
                if not self.task_queue.is_active():
                    kill_subprocess()

            sched.add_job(kill_if_stopped, 'interval', seconds=10)

            while True:
                output = p.stdout.readline()
                if output == '' and p.poll() is not None:
                    break
                if output:
                    line_out = output.strip()
                    print(line_out)
                    # Re-add the newline stripped above so the log file
                    # keeps one line per line of child output.
                    output_file.write(line_out + '\n')

            try:
                p.wait()
            finally:
                save_metrics(metrics_path)
                sched.shutdown()
                db.checkpoint_experiment(experiment)
                db.finish_experiment(experiment)

    return p.returncode
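# A minimal, self-contained sketch of the watchdog pattern above,
# assuming only apscheduler and the standard library: a background job
# polls a stop condition and kills the child process when it fires.
# should_stop() is a hypothetical stand-in for the database/status
# checks in kill_if_stopped(); 'sleep' assumes a POSIX system.
import subprocess
import time
from apscheduler.schedulers.background import BackgroundScheduler

p = subprocess.Popen(['sleep', '60'])
deadline = time.time() + 5  # stand-in for max_duration

def should_stop():
    return time.time() > deadline

def watchdog():
    if should_stop():
        p.kill()

sched = BackgroundScheduler()
sched.start()
sched.add_job(watchdog, 'interval', seconds=1)
try:
    p.wait()  # returns once the watchdog kills the child
finally:
    sched.shutdown()
print('child exited with', p.returncode)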