示例#1
0
def manage(jobs, only_initialization=True, sleep_duration=20):
    """Initialize the given jobs and, optionally, run the scheduling loop.

    :param jobs: list of JobMeta objects to manage
    :param only_initialization: if True, only initialize jobs and return
    :param sleep_duration: seconds to wait between two scheduling passes
    """
    # Work on a copy: the loop below removes elements, and the original
    # code aliased the caller's list, destructively emptying it.
    jobs_waiting_previous_jobs = list(jobs)  # type: list[JobMeta]
    jobs_waiting_max_default_jobs = []  # type: list[JobMeta]
    # initialize every job before any of them is launched
    for job in jobs_waiting_previous_jobs:
        job.initialization()
    if only_initialization:
        return
    while jobs_waiting_previous_jobs or jobs_waiting_max_default_jobs:
        # move jobs whose previous jobs have ended into the ready queue
        selected_jobs = [job for job in jobs_waiting_previous_jobs
                         if job.previous_jobs_ended]
        for job in selected_jobs:
            jobs_waiting_previous_jobs.remove(job)
            jobs_waiting_max_default_jobs.append(job)
        # ready jobs are sorted by the inverse order of priority
        jobs_waiting_max_default_jobs.sort(key=lambda x: x.priority_level,
                                           reverse=True)
        # launch jobs while their machine still has free default-job slots
        selected_jobs = []
        for job in jobs_waiting_max_default_jobs:
            if run_available(job.machine_name, selected_jobs):
                selected_jobs.append(job)
        for job in selected_jobs:
            job.run()
            jobs_waiting_max_default_jobs.remove(job)
        # some sleeping between each scheduling pass
        cmd('sleep %i' % sleep_duration)
示例#2
0
 def job_study(self):
     """Inspect the outcome of a finished job.

     The generated bash script deletes itself on success, so finding it
     still on disk means the job crashed: the leftover script is removed,
     a crash report is mailed, and the job is flagged as crashed so that
     dependent jobs are not launched.
     """
     # TODO: check if in  a killed besteffort, that the script is not deleted
     if not os.path.exists(self.script_filename):
         # script deleted itself: the job ran to completion
         self.job_done = True
         return
     # remove the leftover bash script
     cmd('rm ' + self.script_filename)
     # mail the stderr log of the job as a failure report
     stderr_path = os.path.join(self.oarsub_dirname,
                                self.job_id + '_stderr.txt')
     mail_command = ('cat ' + stderr_path
                     + ' | mail -s "Failure report of '
                     + self.job_name + '" ' + EMAIL)
     cmd(mail_command)
     # declare job as crashed to avoid running following jobs
     self.job_crashed = True
示例#3
0
def getMachineSummary(machine, keywords):
    """Collect a printable summary of this user's jobs on one machine.

    :param machine: ssh-reachable machine name, queried with oarstat
    :param keywords: header entries copied to the front of the summary
    :return: list starting with the keywords, followed by one
             [name, id, duration, step, value, ...] list per running
             non-interactive job; empty list if the machine is unreachable
    """
    try:
        jobs = cmd("ssh -x " + machine + " 'oarstat | grep " + LOGIN + "'")
    except sp.CalledProcessError:
        # ssh failed or grep matched nothing: nothing to report
        return []

    machine_summary = list(keywords)
    for job in sorted(jobs):
        # Monitoring only non interactive jobs
        if (job.split(' ')[-2]).split('J=')[-1] == 'I':
            continue
        # Extracting information and initializing a list for printing
        job_id = job.split(' ')[0]
        job_name = ''
        if len(job.split('N=')) > 1 and len((job.split('N=')[1]).split(' (')) > 0:
            job_name = (job.split('N=')[1]).split(' (')[0]
            if len(job_name.split(',')) > 1:
                job_name = job_name.split(',')[0]
        duration = (job.split(' R=')[0]).split(' ')[-1]
        job_list = [job_name, job_id, duration]
        # despite the variable name, this reads the job's stderr log
        oarout = os.path.join(OARSUB_DIRNAME, job_name, job_id + '_stderr.txt')
        try:
            # close the handle instead of leaking it (was: tail(open(...)))
            with open(oarout, 'r') as oarout_file:
                oarout_list = tail(oarout_file, 140)
        except Exception:
            # log missing or unreadable (job just started): skip this job
            continue

        oarout_string = ' '.join(oarout_list)
        job_list.append(cut_step(oarout_string))
        # NOTE(review): iterates the global KEYWORDS, not the `keywords`
        # parameter — looks deliberate but verify against callers
        for keyword in KEYWORDS:
            job_list.append(cut_value(oarout_string, keyword))
            # TODO: fix cutting when a job has just started
        machine_summary.append(job_list)
    return machine_summary
示例#4
0
def run_available(machine_name, selected_runs):
    """Tell whether one more default job may be launched on a machine.

    Counts the jobs this login already has on the machine (via a remote
    oarstat) plus the runs tentatively selected for it in the current
    pass, and compares the total with the machine's default-job limit.
    """
    oarstat_lines = cmd("ssh " + machine_name + " ' oarstat ' ")
    # jobs currently known to OAR for this login
    cluster_jobs = sum(1 for line in oarstat_lines if LOGIN in line)
    # jobs already picked for this machine in the current scheduling pass
    pending_jobs = sum(1 for run in selected_runs
                       if run.machine_name == machine_name)
    return cluster_jobs + pending_jobs < MAX_DEFAULT_JOBS[machine_name]
示例#5
0
def create_temp_dir(args, exp_name, overwrite):
    """Create (or clean) the temporary experiment directory and copy code.

    :param args: command-line argument string, possibly rewritten so the
                 experiment runs with --overwrite=True
    :param exp_name: experiment name used as the directory suffix
    :param overwrite: whether previous experiments may be overwritten
    :return: the (possibly updated) args string and overwrite flag
    """
    temp_dir = os.path.join("/scratch/gpuhost7/apashevi/Temp/", exp_name)
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)
    else:
        if not overwrite:
            ans = input(
                "Overwrite is set to False. You want to overwrite previous experiments?' (y/n)"
            )
            if ans != 'y':
                sys.exit(0)
            overwrite = True
            # str.replace returns a new string; the original code discarded
            # the result, so --overwrite never actually flipped to True
            if '--overwrite=False' in args:
                args = args.replace('--overwrite=False', '--overwrite=True')
            else:
                args += '--overwrite=True '
        # wipe the leftovers of the previous experiment
        for folder in os.listdir(temp_dir):
            folder_path = os.path.join(temp_dir, folder)
            shutil.rmtree(folder_path)
    cmd('cp -R /home/thoth/apashevi/Code/rlgrasp {}/'.format(temp_dir))
    cmd('cp -R /home/thoth/apashevi/Code/rllabplusplus {}/'.format(temp_dir))
    return args, overwrite
示例#6
0
 def generate_script(self):
     """
     Generate an executable bash script containing a list of commands
     :param argv: parameters for the script to run
     """
     # make sure the directory holding the script exists
     if not os.path.exists(self.script_dirname):
         os.makedirs(self.script_dirname)
     # create the (empty) script file
     cmd('touch ' + self.script_filename)
     # assemble the list of commands the script will run
     commands = []
     # first, install any system libraries that were requested
     commands.extend('sudo apt-get install ' + library + ' --yes'
                     for library in self.librairies_to_install)
     # TODO: copy the whole project to a local directory and launch from
     # there instead of running it in place
     path_exe = os.path.join(self.global_path_project, self.local_path_exe)
     # main command: interpreter (if any) followed by the executable
     if self.interpreter == '':
         command_script = self.interpreter + path_exe
     else:
         command_script = self.interpreter + ' ' + path_exe
     commands.append(' '.join([command_script] + self.run_argv))
     # the script deletes itself once it has finished
     commands.append('rm ' + self.script_filename + '\n')
     # write every command into the bash script
     with open(self.script_filename, 'w') as f:
         f.writelines('{0} \n'.format(command) for command in commands)
     # give the permission to the bash script to execute
     cmd('chmod +x ' + self.script_filename)
示例#7
0
def cache_code_dir(args_file,
                   commit_agents,
                   commit_grasp_env,
                   gridargs=None,
                   sym_link=False):
    """Populate (or symlink) the per-experiment code cache directory.

    Copies the rlgrasp and agents repositories into a cache directory
    named after the experiment (or symlinks the whole Code directory),
    then checks out the requested commits when they are given.
    """
    _, exp_name, _ = read_args(args_file, gridargs)
    cache_dir = os.path.join("/scratch/gpuhost7/apashevi/Cache/Code/",
                             exp_name)
    # drop any stale cache, whether it is a real directory or a symlink
    if os.path.exists(cache_dir):
        if os.path.islink(cache_dir):
            os.unlink(cache_dir)
        else:
            shutil.rmtree(cache_dir)
    if sym_link:
        os.symlink('/home/thoth/apashevi/Code', cache_dir)
    else:
        os.makedirs(cache_dir)
        cmd('cp -R /home/thoth/apashevi/Code/rlgrasp {}/'.format(cache_dir))
        cmd('cp -R /home/thoth/apashevi/Code/agents {}/'.format(cache_dir))
    # pin the cached repositories to the requested commits
    if commit_agents is not None:
        checkout_repo(os.path.join(cache_dir, 'agents'), commit_agents)
    if commit_grasp_env is not None:
        checkout_repo(os.path.join(cache_dir, 'rlgrasp'), commit_grasp_env)
示例#8
0
 def run(self):
     """
     General pipeline of the run method:
         -If previous jobs have not crashed:
             -A bash script is generated
             -A job is launched to process the bash script we just generated
     """
     # a crash in any dependency marks this job as crashed as well
     if any(job.job_crashed for job in self.previous_jobs):
         self.job_crashed = True
     if not self.job_crashed:
         # submit with oarsub and keep the job id parsed from the
         # last line of its output
         print(self.oarsub_command)
         self.job_id = cmd(self.oarsub_command)[-1].split('=')[-1]
示例#9
0
 def job_ended(self):
     """Return True when this job is no longer running.

     A crashed job counts as ended and a never-launched one does not;
     otherwise oarstat is polled on the job's machine and the job is
     considered alive iff its id still appears in the output.
     """
     # TODO: this is the place where I will see if a job crashed or not. But this function will be called often
     if self.job_crashed:
         # the job has crashed thus it has ended
         print('job crashed')
         return True
     if self.job_id is None:
         # the job has not been started
         return False
     # the job has been launched, we check if it is still running
     oarstat_lines = cmd("ssh -X -Y " + self.machine_name + " ' oarstat ' ")
     for line in oarstat_lines:
         if self.job_id in line:
             return False
     return True
示例#10
0
from pytools.tools import cmd


if __name__ == '__main__':
    # smoke test: run a trivial shell command through the cmd wrapper
    cmd('sleep 5')
示例#11
0
def main():
    """Render policies for every seed folder below a log directory.

    Parses the CLI arguments, points the python path at the experiment's
    cached code, renders videos for each requested seed, and prints the
    output directories at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('logdir',
                        type=str,
                        help='Logdir with checkpoints (seed folder).')
    parser.add_argument('-s0',
                        '--render_seed0',
                        type=utils.str2bool,
                        default=False,
                        required=False,
                        help='Whether to render the seed0 policies.')
    parser.add_argument(
        '-o',
        '--only_seeds',
        type=str,
        default=None,
        required=False,
        help='List of seeds to render in Json format (default = all).')

    args = parser.parse_args()

    # clear path from local codes of agents and rlgrasp
    sys_path_clean = utils.get_sys_path_clean()
    # set correct python path
    utils.change_sys_path(sys_path_clean, args.logdir)
    import agents.scripts.visualize as visualizer

    outdirs = []
    if args.only_seeds:
        seed_list = json.loads(args.only_seeds)
    else:
        seed_list = None

    for seed_folder in next(os.walk(args.logdir))[1]:
        if 'seed' not in seed_folder:
            continue
        if seed_folder == 'seed0' and not args.render_seed0:
            continue
        if seed_list is not None and int(seed_folder.replace(
                'seed', '')) not in seed_list:
            continue

        # os.walk yields directories in arbitrary order; sort so that
        # [-1] really is the latest timestamp (the original relied on
        # filesystem ordering, which is not guaranteed)
        timestamp_folders = sorted(
            next(os.walk(os.path.join(args.logdir, seed_folder)))[1])
        if len(timestamp_folders) > 1:
            print('WARNING: will render from {} and ignore {}'.format(
                timestamp_folders[-1], timestamp_folders[:-1]))
        finallogdir = os.path.join(args.logdir, seed_folder,
                                   timestamp_folders[-1])
        assert (os.path.exists(finallogdir))

        # renders are written next to the logs, under Logs/renders
        finaloutdir = finallogdir.replace('Logs/agents', 'Logs/renders')
        if os.path.exists(finaloutdir):
            shutil.rmtree(finaloutdir)
        os.makedirs(finaloutdir)
        outdirs.append(finaloutdir)
        visualizer.visualize(finallogdir,
                             finaloutdir,
                             num_agents=4,
                             num_episodes=8,
                             checkpoint=None,
                             env_processes=True)
        # drop the json manifests the renderer leaves next to the videos
        cmd('rm {}/*.manifest.json'.format(finaloutdir))
        cmd('rm {}/*.meta.json'.format(finaloutdir))
        tf.reset_default_graph()
    print('Videos are written to:')
    for outdir in outdirs:
        print(outdir)
    print('Hope that policies do grasp :)')