def copy_paths_rel_cached(self, paths_to_dump, dst_dir):
    """Content-addressed variant of copy_paths_rel: tar the sources, keep a copy
    named by the tarball's sha256 in self.scratch_space, and reuse it on the
    remote side instead of re-uploading.

    NOTE(review): the method is disabled — the unconditional raise below makes
    everything that follows dead code, kept only as a sketch of the intended
    implementation.
    """
    raise NotImplementedError
    # ---- dead code below (never executed) ----
    tar_filename = '{id}.tar.gz'.format(id=id_generator(20))
    tar_path = os.path.join('/tmp/', tar_filename)
    print('tmp_path', tar_path)
    tar = tarfile.open(tar_path, 'w:gz')
    for d in paths_to_dump:
        # Each dump entry is a dict with 'src' (local path) and 'dst_rel'
        # (relative remote path); non-empty dst_rel was never implemented.
        remote_rel_path, local_path = d['dst_rel'], d['src']
        if remote_rel_path != '':
            raise NotImplementedError
        print('adding_path', local_path)
        # Archive entries are flattened to their basenames.
        tar.add(local_path, arcname=os.path.basename(local_path))
    tar.close()
    # Cache key: sha256 of the finished tarball.
    sha_digest = self._sha256_file(tar_path)
    print(sha_digest)
    print('dst_dir', dst_dir)
    # WARNING(maciek): this is not concurrent safe
    remote_tar_filename = sha_digest + '.tar.gz'
    # Upload to the scratch cache only on a cache miss, then copy remotely
    # from scratch into dst_dir and unpack there.
    if not exists(osp.join(self.scratch_space, remote_tar_filename)):
        self.copy_path(osp.join(self.scratch_space, remote_tar_filename), tar_path)
    run('cp {path_src} {path_dst}'.format(
        path_src=osp.join(self.scratch_space, remote_tar_filename),
        path_dst=osp.join(dst_dir, remote_tar_filename)))
    with cd(dst_dir):
        run('ls *')
        run('tar xfz {remote_tar_filename}'.format(
            remote_tar_filename=remote_tar_filename))
def create_pod(self, yaml_str):
    """Create a Kubernetes pod by writing *yaml_str* to a temp manifest file
    and invoking ``kubectl create -f``.

    Uses self.kube_config (if set) as the --kubeconfig argument.
    """
    import tempfile  # local import: not guaranteed at file top in this chunk

    # Securely-created temp file instead of a predictable /tmp/<id>.yaml name,
    # and remove it once kubectl has consumed it (the original leaked the file).
    with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f:
        f.write(yaml_str)
        manifest_path = f.name
    try:
        command = ['kubectl', 'create']
        if self.kube_config is not None:
            command += ['--kubeconfig', self.kube_config]
        command += ['-f', manifest_path]
        print(' '.join(command))
        # NOTE(review): the exit code is deliberately ignored, matching the
        # original behavior; switch to subprocess.run(command, check=True)
        # to fail fast on kubectl errors.
        subprocess.call(command)
    finally:
        os.remove(manifest_path)
def copy_paths_rel(self, paths_to_dump, dst_dir):
    # TODO(maciek): describe the semantics of this!!!
    """Tar the local 'src' paths from *paths_to_dump*, upload the archive to
    the remote *dst_dir* via self.copy_path, and unpack it there.

    Each entry of paths_to_dump is a dict with 'src' (local path) and
    'dst_rel' (relative remote path). NOTE(review): 'dst_rel' is currently
    ignored — entries are flattened to their basenames (the original guard
    that rejected non-empty dst_rel was commented out).
    """
    print('copy_paths_rel')
    for d in paths_to_dump:
        print(d)
    tar_filename = '{id}.tar.gz'.format(id=id_generator(20))
    tar_tmp_path = os.path.join('/tmp/', tar_filename)
    print('tmp_path', tar_tmp_path)
    # Context manager guarantees the archive is closed (and flushed) even if
    # tar.add raises — the original leaked the handle on error.
    with tarfile.open(tar_tmp_path, 'w:gz') as tar:
        for d in paths_to_dump:
            remote_rel_path, local_path = d['dst_rel'], d['src']
            print('adding_path', local_path)
            tar.add(local_path, arcname=os.path.basename(local_path))
    print('dst_dir', dst_dir)
    self.copy_path(dst_dir, tar_tmp_path)
    # Unpack remotely inside dst_dir; 'pwd'/'ls *' are debug output only.
    with cd(dst_dir):
        run('pwd')
        run('ls *')
        run('tar xfz {tar_filename}'.format(tar_filename=tar_filename))
def main(self, argv):
    """Entry point for the PLGrid/slurm launcher: stage sources and config to
    the remote storage, build the run command (neptune or yaml flavor), wrap
    it in a PlgridTask, and submit via srun or sbatch.

    NOTE(review): reconstructed from whitespace-mangled source; branch nesting
    (neptune / with_yaml as sibling ifs) inferred from the flat text — confirm
    against the original file.
    """
    self.argv = argv
    mrunner_args, rest_argv = self.parse_argv()
    if mrunner_args.storage_url is not None:
        # INFO(maciek): random noise added is for purpose!
        # Timestamp + 4 random chars keeps concurrent launches from colliding.
        exp_dir_path = os.path.join(
            mrunner_args.storage_url,
            datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '_' + id_generator(4))
        print('exp_dir_path', exp_dir_path)
    else:
        print('Warning! no exp_dir_path set')
        exp_dir_path = '.'
    # Exactly one submission mode must be chosen.
    if int(mrunner_args.srun) + int(mrunner_args.sbatch) != 1:
        raise RuntimeError('Please provide exactly one of --srun, --sbatch')
    resource_dir_path = os.path.join(exp_dir_path, 'src')
    paths_to_dump = self._parse_paths_to_dump(resource_dir_path,
                                              mrunner_args.paths_to_dump_conf,
                                              mrunner_args.paths_to_dump)
    print(paths_to_dump)
    remote_config_path = os.path.join(resource_dir_path, 'config.yaml')
    if mrunner_args.neptune:
        if mrunner_args.config is None:
            raise RuntimeError('Please supply --config!')
        # Stage sources and the rendered config on the remote side.
        self.prometheus_api.mkdir(resource_dir_path)
        self.prometheus_api.copy_paths_rel(paths_to_dump, resource_dir_path)
        new_local_config_path = self.mrunner_api.config_to_yaml(mrunner_args.config,
                                                                mrunner_args.name,
                                                                mrunner_args.project)
        self.prometheus_api.copy_path(remote_config_path, new_local_config_path)
        paths_to_dump_for_neptune = [os.path.join(p['dst'], os.path.basename(p['src']))
                                     for p in paths_to_dump]
        print(paths_to_dump_for_neptune)
        local_task = self.mrunner_api.create_neptune_run_command(
            config_path=remote_config_path,
            paths_to_dump=paths_to_dump_for_neptune,
            storage_url=mrunner_args.storage_url,
            tags=mrunner_args.tags,
            neptune_conf_path=mrunner_args.neptune_conf,
            rest_argv=rest_argv)
        command_list = local_task.command
        # Prepend every line of the neptune conf file to the command list.
        if mrunner_args.neptune_conf is not None:
            with open(mrunner_args.neptune_conf) as f:
                for line in f.readlines():
                    command_list = [line] + command_list
        command = ' '.join(command_list)
        import random
        # Random pre-sleep (0-5s) — presumably to stagger simultaneous
        # job starts; TODO confirm.
        sleep_command = "sleep {0:.4f}".format(random.random() * 5)
        command = sleep_command + " ; " + command
        print(command)
        env = local_task.env
        env['EXPERIMENT_ID'] = mrunner_args.experiment_id
        env['STORAGE_URL'] = mrunner_args.storage_url
        env['RESOURCE_DIR_PATH'] = resource_dir_path
        if mrunner_args.pythonpath:
            env['PYTHONPATH'] = mrunner_args.pythonpath
        log_path = '/dev/null'
        modules_to_load = []
        if mrunner_args.modules_to_load:
            # Colon-separated module list, e.g. "plgrid/tools/python:cuda".
            modules_to_load = mrunner_args.modules_to_load.split(":")
            modules_to_load = [x for x in modules_to_load if x]  # remove empty strings
        print("Modules to load:{}".format(modules_to_load))
        task = PlgridTask(command=command, cwd=resource_dir_path, env=env,
                          venv_path=mrunner_args.venv_path,
                          after_module_load_cmd=mrunner_args.after_module_load_cmd,
                          script_name=mrunner_args.script_name,
                          modules_to_load=modules_to_load)
    if mrunner_args.with_yaml:
        # Same staging dance as the neptune branch, but builds a
        # yaml-run command instead.
        self.prometheus_api.mkdir(resource_dir_path)
        self.prometheus_api.copy_paths_rel(paths_to_dump, resource_dir_path)
        paths_to_dump_for_neptune = [os.path.join(p['dst'], os.path.basename(p['src']))
                                     for p in paths_to_dump]
        new_local_config_path = self.mrunner_api.config_to_yaml(mrunner_args.config,
                                                                mrunner_args.name,
                                                                mrunner_args.project)
        self.prometheus_api.copy_path(remote_config_path, new_local_config_path)
        local_task = self.mrunner_api.create_yaml_run_command(
            config_path=remote_config_path,
            paths_to_dump=paths_to_dump_for_neptune,
            storage_url=mrunner_args.storage_url,
            tags=mrunner_args.tags,
            exp_dir_path=exp_dir_path,
            rest_argv=rest_argv)
        # parms_argv = rest_argv
        # if mrunner_args.with_yaml:
        #     parms_argv.append(" --yaml {}".format(remote_config_path))
        # new_local_config_path = self.mrunner_api.config_to_yaml(mrunner_args.config,
        #                                                         mrunner_args.name,
        #                                                         mrunner_args.project)
        #
        # self.prometheus_api.copy_path(remote_config_path, new_local_config_path)
        #
        # local_task = self.mrunner_api.create_normal_run_command(rest_argv, exp_dir_path=exp_dir_path)
        command_list = local_task.command
        if mrunner_args.neptune_conf is not None:
            with open(mrunner_args.neptune_conf) as f:
                for line in f.readlines():
                    command_list = [line] + command_list
        command = ' '.join(command_list)
        env = local_task.env
        env['EXPERIMENT_ID'] = mrunner_args.experiment_id
        env['STORAGE_URL'] = mrunner_args.storage_url
        env['RESOURCE_DIR_PATH'] = resource_dir_path
        if mrunner_args.pythonpath:
            # Unlike the neptune branch, PYTHONPATH is appended here,
            # not replaced.
            env['PYTHONPATH'] = "{}:$PYTHONPATH".format(mrunner_args.pythonpath)
        log_path = os.path.join(resource_dir_path, "job_logs.txt")
        modules_to_load = []
        if mrunner_args.modules_to_load:
            modules_to_load = mrunner_args.modules_to_load.split(":")
            modules_to_load = [x for x in modules_to_load if x]  # remove empty strings
        print("Modules to load:{}".format(modules_to_load))
        task = PlgridTask(command=command, cwd=resource_dir_path, env=env,
                          venv_path=mrunner_args.venv_path,
                          after_module_load_cmd=mrunner_args.after_module_load_cmd,
                          script_name=mrunner_args.script_name,
                          modules_to_load=modules_to_load)
    # Submit: interactive srun or batch sbatch (log_path only used by sbatch).
    if mrunner_args.srun:
        self.prometheus_api.srun(task, partition=mrunner_args.partition,
                                 cores=mrunner_args.cores,
                                 ntasks=mrunner_args.ntasks,
                                 account=mrunner_args.A,
                                 gres=mrunner_args.gres)
    elif mrunner_args.sbatch:
        self.prometheus_api.sbatch(task, partition=mrunner_args.partition,
                                   cores=mrunner_args.cores,
                                   time=mrunner_args.time,
                                   stdout_path=log_path,
                                   ntasks=mrunner_args.ntasks,
                                   account=mrunner_args.A,
                                   gres=mrunner_args.gres)
def _make_script_name(self, script_name):
    """Return *script_name* extended with a random 20-character suffix
    and an ``.sh`` extension, to make the generated script name unique."""
    random_suffix = id_generator(20)
    return script_name + '_' + random_suffix + '.sh'
def main(self, argv):
    """Entry point for the Kubernetes launcher: stage sources/config into a
    temp experiment dir on shared storage, build the run command (neptune or
    plain), and launch it in a pod via self.kubernetes_backend.

    NOTE(review): reconstructed from whitespace-mangled source — confirm the
    formatting against the original file.
    """
    self.argv = argv
    mrunner_args, rest_argv = self.parse_argv()
    # INFO(maciek): This is a directory where we now copy needed files, but then
    # we will run 'neptune run', and it itself will create new exp_dir
    # This is f****d!
    subdir = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '_' + id_generator(4)
    temp_exp_dir_path = os.path.join(mrunner_args.storage_url, subdir)
    print('temp_exp_dir_path', temp_exp_dir_path)
    dump_dir = os.path.join(temp_exp_dir_path, DUMP_SUBDIRECTORY_NAME)
    paths_to_dump = self._parse_paths_to_dump(dump_dir, None, mrunner_args.paths_to_dump)
    mkdir_p(dump_dir)
    self.copy_paths(paths_to_dump)
    if mrunner_args.neptune:
        # Render the neptune experiment config and place it in the dump dir.
        tmp_config_path = self.mrunner_helper.config_to_yaml(mrunner_args.neptune_exp_config,
                                                             mrunner_args.name,
                                                             mrunner_args.project)
        config_path = os.path.join(dump_dir, 'config.yaml')
        self.copy_path(config_path, tmp_config_path)
        print('config_path', config_path)
        paths_to_dump_for_neptune = [p['dst'] for p in paths_to_dump]
        # We will be constructing neptune run command here
        command_with_env = self.mrunner_helper.create_neptune_run_command(
            config_path=config_path,
            paths_to_dump=paths_to_dump_for_neptune,
            storage_url=mrunner_args.storage_url,
            tags=mrunner_args.tags,
            neptune_host=mrunner_args.neptune_host,
            neptune_port=mrunner_args.neptune_port,
            neptune_username=mrunner_args.neptune_username,
            neptune_password=mrunner_args.neptune_password,
            rest_argv=rest_argv)
    else:
        command_with_env = self.mrunner_helper.create_normal_run_command(
            rest_argv=rest_argv,
            exp_dir_path=temp_exp_dir_path)
    print('command_with_env', command_with_env.command, command_with_env.env)
    pod_name = 'mrunner-pod-{id}'.format(id=id_generator(10))
    # TODO(maciek): temporary!!!
    # Hard-coded host-path mount; the commented-out variant below would mount
    # mrunner_args.storage_url instead.
    volume_mounts = [
        KubeVolumeMount(name='storage',
                        mountPath='/mnt/ml-team/rl/kubernetes_storage',
                        hostPath={'path': '/mnt/ml-team/rl/kubernetes_storage'}
                        )
    ]
    # volume_mounts = [
    #     KubeVolumeMount(name='storage',
    #                     mountPath=mrunner_args.storage_url,
    #                     hostPath={'path': mrunner_args.storage_url}
    #                     )
    # ]
    if mrunner_args.dry_run is True:
        print('only dry-run, not executing!!!')
    else:
        # INFO(maciek): kube's semantics is not obvious at all what args, and command mean
        # https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#define-a-command-and-arguments-when-you-create-a-pod
        # see the table that comparse Docker and Kube
        # INFO(maciek): this will use container's default entrypoint
        args = command_with_env.command
        command = None
        env = {}
        env.update(command_with_env.env)
        if mrunner_args.pythonpath is not None:
            env['PYTHONPATH'] = mrunner_args.pythonpath
        print('Pod env will be {}'.format(env))
        self.kubernetes_backend.run_command_in_pod(pod_name=pod_name,
                                                   image=mrunner_args.docker_image,
                                                   nr_gpus=mrunner_args.nr_gpus,
                                                   args=args,
                                                   command=command,
                                                   volume_mounts=volume_mounts,
                                                   interactive=mrunner_args.interactive,
                                                   workingDir=dump_dir,
                                                   node_selector_key=mrunner_args.node_selector_key,
                                                   node_selector_value=mrunner_args.node_selector_value,
                                                   env=env,
                                                   )