def manage(jobs, only_initialization=True, sleep_duration=20):
    # copy the input list so the caller's list is not mutated by the scheduling loop
    jobs_waiting_previous_jobs = list(jobs)  # type: list[JobMeta]
    jobs_waiting_max_default_jobs = []  # type: list[JobMeta]

    # initialize all jobs
    for job in jobs_waiting_previous_jobs:
        job.initialization()

    if not only_initialization:
        while jobs_waiting_previous_jobs or jobs_waiting_max_default_jobs:
            # jobs waiting because of previous jobs: move the ones whose
            # dependencies have ended into the max-default-jobs queue
            selected_jobs = []
            for job in jobs_waiting_previous_jobs:
                if job.previous_jobs_ended:
                    selected_jobs.append(job)
            for job in selected_jobs:
                jobs_waiting_previous_jobs.remove(job)
                jobs_waiting_max_default_jobs.append(job)

            # waiting jobs are sorted by decreasing priority
            jobs_waiting_max_default_jobs.sort(key=lambda x: x.priority_level, reverse=True)

            # jobs waiting because of the per-machine limit on default jobs:
            # launch as many as the limit allows
            selected_jobs = []
            for job in jobs_waiting_max_default_jobs:
                if run_available(job.machine_name, selected_jobs):
                    selected_jobs.append(job)
            for job in selected_jobs:
                job.run()
                jobs_waiting_max_default_jobs.remove(job)

            # sleep between scheduling loops
            cmd('sleep %i' % sleep_duration)
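# A minimal usage sketch of manage(). The JobMeta constructor arguments below
# are hypothetical (the real class is defined elsewhere in this project); only
# the attributes manage() relies on (initialization(), previous_jobs_ended,
# priority_level, machine_name, run()) are taken from the code above.
if __name__ == '__main__':
    preprocess = JobMeta(job_name='preprocess', machine_name='edgar')  # hypothetical args
    train = JobMeta(job_name='train', machine_name='edgar',
                    previous_jobs=[preprocess])                        # hypothetical args
    # initialize both jobs, then poll every 60 seconds until all of them have run
    manage([preprocess, train], only_initialization=False, sleep_duration=60)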
def job_study(self):
    # check whether the job ended well (i.e., the script should have deleted itself)
    # TODO: check that the script is not left behind when a besteffort job is killed
    if os.path.exists(self.script_filename):
        # the script is still there: the job crashed before reaching its last command
        # delete the bash script
        cmd('rm ' + self.script_filename)
        # send a crash report by mail
        command_mail = 'cat ' + os.path.join(self.oarsub_dirname, self.job_id + '_stderr.txt')
        command_mail += ' | mail -s "Failure report of ' + self.job_name + '" ' + EMAIL
        cmd(command_mail)
        # declare the job as crashed so that the following jobs are not run
        self.job_crashed = True
    else:
        self.job_done = True
def getMachineSummary(machine, keywords):
    try:
        jobs = cmd("ssh -x " + machine + " 'oarstat | grep " + LOGIN + "'")
    except sp.CalledProcessError:
        # grep exits with a non-zero code when no job matches
        return []
    machine_summary = list(keywords)
    for job in sorted(jobs):
        # monitor only non-interactive jobs (oarstat marks interactive jobs with J=I)
        if (job.split(' ')[-2]).split('J=')[-1] == 'I':
            continue
        # extract information and initialize a list for printing
        job_id = job.split(' ')[0]
        job_name = ''
        if len(job.split('N=')) > 1 and len((job.split('N=')[1]).split(' (')) > 0:
            job_name = (job.split('N=')[1]).split(' (')[0]
        if len(job_name.split(',')) > 1:
            job_name = job_name.split(',')[0]
        duration = (job.split(' R=')[0]).split(' ')[-1]
        job_list = [job_name, job_id, duration]
        # parse the last lines of the stderr log where the job reports its progress
        # (use the stdout log instead: job_id + '_stdout.txt')
        oarerr = os.path.join(OARSUB_DIRNAME, job_name, job_id + '_stderr.txt')
        try:
            oarerr_list = tail(open(oarerr, 'r'), 140)
        except IOError:
            # the log file may not exist yet if the job has just started
            continue
        oarerr_string = ' '.join(oarerr_list)
        job_list.append(cut_step(oarerr_string))
        for keyword in keywords:
            job_list.append(cut_value(oarerr_string, keyword))
        # TODO: fix cutting when a job has just started
        machine_summary.append(job_list)
    return machine_summary
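# The tail() helper used above is not shown in this section; here is a minimal
# sketch of the behavior getMachineSummary() assumes: return (up to) the last
# n lines of an open file as a list of strings.
from collections import deque

def tail(f, n):
    # a deque with maxlen keeps only the last n lines while streaming the file
    return list(deque(f, maxlen=n))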
def run_available(machine_name, selected_runs):
    oarstat_lines = cmd("ssh " + machine_name + " ' oarstat ' ")
    jobs_nb = 0
    # count the jobs already running on the cluster
    for line in oarstat_lines:
        if LOGIN in line:
            jobs_nb += 1
    # count the jobs already selected in this scheduling iteration
    for run in selected_runs:
        if run.machine_name == machine_name:
            jobs_nb += 1
    return jobs_nb < MAX_DEFAULT_JOBS[machine_name]
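# run_available() relies on module-level configuration that is not shown in
# this section; a sketch of the shape these globals are assumed to have (the
# login, machine names, and limits here are hypothetical placeholders, not the
# author's actual configuration):
LOGIN = 'apashevi'          # cluster login used to grep oarstat output
MAX_DEFAULT_JOBS = {
    'edgar': 10,            # at most 10 concurrent default-queue jobs on edgar
    'clear': 5,
}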
def create_temp_dir(args, exp_name, overwrite):
    temp_dir = os.path.join("/scratch/gpuhost7/apashevi/Temp/", exp_name)
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)
    else:
        if not overwrite:
            ans = input(
                "Overwrite is set to False. Do you want to overwrite previous experiments? (y/n) ")
            if ans != 'y':
                sys.exit(0)
            overwrite = True
            # strings are immutable: reassign the result of replace()
            if '--overwrite=False' in args:
                args = args.replace('--overwrite=False', '--overwrite=True')
            else:
                args += ' --overwrite=True'
        for folder in os.listdir(temp_dir):
            folder_path = os.path.join(temp_dir, folder)
            shutil.rmtree(folder_path)
    cmd('cp -R /home/thoth/apashevi/Code/rlgrasp {}/'.format(temp_dir))
    cmd('cp -R /home/thoth/apashevi/Code/rllabplusplus {}/'.format(temp_dir))
    return args, overwrite
def generate_script(self):
    """
    Generate an executable bash script containing the list of commands to run.
    """
    # build script_dirname if it has not yet been created
    if not os.path.exists(self.script_dirname):
        os.makedirs(self.script_dirname)
    # create the script_filename file
    cmd('touch ' + self.script_filename)
    # build the list of commands for the script
    commands = list()
    # install the libraries that have been specified
    for library in self.librairies_to_install:
        commands.append('sudo apt-get install ' + library + ' --yes')
    # TO IMPROVE: copy the whole project into a local directory and launch it from there
    # if not os.path.exists(self.code_dirname):
    #     os.makedirs(self.code_dirname)
    # new_global_path = os.path.join(self.code_dirname, os.path.basename(self.global_path_project))
    # if not os.path.exists(new_global_path):
    #     command_copy_dir = 'cp -r ' + self.global_path_project + ' ' + self.code_dirname
    #     print(command_copy_dir)
    #     cmd(command_copy_dir)
    # new_path_exe = os.path.join(new_global_path, self.local_path_exe)
    path_exe = os.path.join(self.global_path_project, self.local_path_exe)
    # launch the main executable (with its interpreter if one is given)
    if self.interpreter == '':
        command_script = path_exe
    else:
        command_script = self.interpreter + ' ' + path_exe
    commands.append(' '.join([command_script] + self.run_argv))
    # the script deletes itself when it finishes; job_study() uses this
    # to detect whether the job ended well
    commands.append('rm ' + self.script_filename + '\n')
    # write the commands into the bash script
    with open(self.script_filename, 'w') as f:
        for command in commands:
            f.write('{0} \n'.format(command))
    # make the bash script executable
    cmd('chmod +x ' + self.script_filename)
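# For illustration, with hypothetical attribute values (interpreter='python',
# one library to install, run_argv=['--seed', '1']), the generated script
# would contain something like:
#
#   sudo apt-get install libopenmpi-dev --yes
#   python /home/thoth/apashevi/Code/rlgrasp/train.py --seed 1
#   rm /path/to/script.sh
#
# where the paths are placeholders; the self-deleting 'rm' line at the end is
# the crash-detection mechanism checked by job_study().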
def cache_code_dir(args_file, commit_agents, commit_grasp_env, gridargs=None, sym_link=False):
    _, exp_name, _ = read_args(args_file, gridargs)
    cache_dir = os.path.join("/scratch/gpuhost7/apashevi/Cache/Code/", exp_name)
    # use lexists so that a broken symlink is also cleaned up
    if os.path.lexists(cache_dir):
        if not os.path.islink(cache_dir):
            shutil.rmtree(cache_dir)
        else:
            os.unlink(cache_dir)
    if not sym_link:
        os.makedirs(cache_dir)
        cmd('cp -R /home/thoth/apashevi/Code/rlgrasp {}/'.format(cache_dir))
        cmd('cp -R /home/thoth/apashevi/Code/agents {}/'.format(cache_dir))
    else:
        os.symlink('/home/thoth/apashevi/Code', cache_dir)
    if commit_agents is not None:
        checkout_repo(os.path.join(cache_dir, 'agents'), commit_agents)
    if commit_grasp_env is not None:
        checkout_repo(os.path.join(cache_dir, 'rlgrasp'), commit_grasp_env)
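# checkout_repo() is used above but not shown in this section; a minimal
# sketch of the behavior cache_code_dir() relies on, pinning a cached copy of
# a repository to a given commit:
def checkout_repo(repo_dir, commit):
    # run the checkout inside the cached copy of the repository
    cmd('git -C {} checkout {}'.format(repo_dir, commit))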
def run(self):
    """
    General pipeline of the run method:
    - if the previous jobs have not crashed:
        - a bash script is generated
        - a job is launched to process the bash script we just generated
    """
    # check whether any of the previous jobs has crashed
    for job in self.previous_jobs:
        if job.job_crashed:
            self.job_crashed = True
            break
    if not self.job_crashed:
        # run a job with oarsub and retrieve its job_id from the last output
        # line, which is expected to end with 'OAR_JOB_ID=<id>'
        print(self.oarsub_command)
        self.job_id = cmd(self.oarsub_command)[-1].split('=')[-1]
def job_ended(self):
    # TODO: this is where to detect whether a job crashed; note that this function is called often
    # the job has crashed, thus it has ended
    if self.job_crashed:
        print('job crashed')
        ended = True
    # the job has not been started yet
    elif self.job_id is None:
        ended = False
    # the job has been launched: check whether oarstat still lists it
    else:
        ended = True
        # -x disables X11 forwarding, which is not needed to run oarstat
        oarstat_lines = cmd("ssh -x " + self.machine_name + " ' oarstat ' ")
        for line in oarstat_lines:
            if self.job_id in line:
                ended = False
                break
    return ended
from pytools.tools import cmd

if __name__ == '__main__':
    cmd('sleep 5')
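# cmd() from pytools.tools is used throughout this project but not shown here;
# a minimal sketch of the behavior the callers assume: run a shell command,
# raise on a non-zero exit code, and return stdout as a list of lines.
import subprocess as sp

def cmd(command):
    # check_output raises sp.CalledProcessError on failure, matching the
    # exception handled in getMachineSummary()
    output = sp.check_output(command, shell=True)
    return output.decode('utf-8').splitlines()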
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('logdir', type=str,
                        help='Logdir with checkpoints (seed folder).')
    parser.add_argument('-s0', '--render_seed0', type=utils.str2bool,
                        default=False, required=False,
                        help='Whether to render the seed0 policies.')
    parser.add_argument('-o', '--only_seeds', type=str, default=None, required=False,
                        help='List of seeds to render in JSON format (default = all).')
    args = parser.parse_args()
    # clear the path from the local copies of the agents and rlgrasp code
    sys_path_clean = utils.get_sys_path_clean()
    # set the correct python path
    utils.change_sys_path(sys_path_clean, args.logdir)
    import agents.scripts.visualize as visualizer
    outdirs = []
    if args.only_seeds:
        seed_list = json.loads(args.only_seeds)
    else:
        seed_list = None
    for seed_folder in next(os.walk(args.logdir))[1]:
        if 'seed' not in seed_folder:
            continue
        if seed_folder == 'seed0' and not args.render_seed0:
            continue
        if seed_list is not None and int(seed_folder.replace('seed', '')) not in seed_list:
            continue
        # sort the timestamps so that [-1] is really the latest run
        timestamp_folders = sorted(next(os.walk(os.path.join(args.logdir, seed_folder)))[1])
        if len(timestamp_folders) > 1:
            print('WARNING: will render from {} and ignore {}'.format(
                timestamp_folders[-1], timestamp_folders[:-1]))
        finallogdir = os.path.join(args.logdir, seed_folder, timestamp_folders[-1])
        assert os.path.exists(finallogdir)
        finaloutdir = finallogdir.replace('Logs/agents', 'Logs/renders')
        if os.path.exists(finaloutdir):
            shutil.rmtree(finaloutdir)
        os.makedirs(finaloutdir)
        outdirs.append(finaloutdir)
        visualizer.visualize(finallogdir, finaloutdir, num_agents=4, num_episodes=8,
                             checkpoint=None, env_processes=True)
        # remove the manifest files produced alongside the videos
        cmd('rm {}/*.manifest.json'.format(finaloutdir))
        cmd('rm {}/*.meta.json'.format(finaloutdir))
        tf.reset_default_graph()
    print('Videos are written to:')
    for outdir in outdirs:
        print(outdir)
    print('Hope that policies do grasp :)')
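# Example invocation of this script (the script name and logdir path are
# hypothetical placeholders): render all seeds except seed0, restricted to
# seeds 1 and 2 via the JSON list passed to --only_seeds.
#
#   python render.py /scratch/gpuhost7/apashevi/Logs/agents/my_exp -o '[1, 2]'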