import os

from haven import haven_jobs as hjb
from haven import haven_results as hr
from haven import haven_utils as hu


def test_get_job_stats_logs_errors(self):
    # return
    exp_list = [{'model': {'name': 'mlp', 'n_layers': 30},
                 'dataset': 'mnist',
                 'batch_size': 1}]
    savedir_base = '/mnt/datasets/public/issam/tmp'
    job_config = {
        'volume': ['/mnt:/mnt'],
        'image': 'images.borgy.elementai.net/issam.laradji/main',
        'bid': '1',
        'restartable': '1',
        'gpu': '1',
        'mem': '20',
        'cpu': '2',
    }
    run_command = ('python example.py -ei <exp_id> -sb %s' % savedir_base)

    # launch the experiment jobs and wait for them to register
    hjb.run_exp_list_jobs(exp_list,
                          savedir_base=savedir_base,
                          workdir=os.path.dirname(os.path.realpath(__file__)),
                          run_command=run_command,
                          job_config=job_config,
                          force_run=True,
                          wait_seconds=0)
    assert os.path.exists(os.path.join(savedir_base,
                                       hu.hash_dict(exp_list[0]),
                                       'borgy_dict.json'))

    # the job manager's summary should match the result manager's
    jm = hjb.JobManager(exp_list=exp_list, savedir_base=savedir_base)
    jm_summary_list = jm.get_summary()

    rm = hr.ResultManager(exp_list=exp_list, savedir_base=savedir_base)
    rm_summary_list = rm.get_job_summary()
    assert rm_summary_list['table'].equals(jm_summary_list['table'])

    # killed jobs should show up as cancelled
    jm.kill_jobs()
    assert 'CANCELLED' in jm.get_summary()['status'][0]
def run_wizard(func, exp_list=None, exp_groups=None, job_config=None,
               savedir_base=None, reset=None, args=None, use_threads=True,
               exp_id=None):
    if args is None:
        args = get_args()

    # Asserts
    # =======
    savedir_base = savedir_base or args.savedir_base
    reset = reset or args.reset
    exp_id = exp_id or args.exp_id
    assert savedir_base is not None

    # Collect experiments
    # ===================
    if exp_id is not None:
        # select one experiment
        savedir = os.path.join(savedir_base, exp_id)
        exp_dict = hu.load_json(os.path.join(savedir, "exp_dict.json"))
        exp_list = [exp_dict]
    elif exp_list is None:
        # select exp group
        exp_list = []
        for exp_group_name in args.exp_group_list:
            exp_list += exp_groups[exp_group_name]

    # save results folder
    if exp_id is None:
        results_fname = args.visualize_notebook
        if len(results_fname):
            if '.ipynb' not in results_fname:
                results_fname += '.ipynb'
            create_jupyter_file(fname=results_fname, savedir_base=savedir_base)

    hu.check_duplicates(exp_list)
    print('\nRunning %d experiments' % len(exp_list))

    # Run experiments
    # ===============
    if not args.run_jobs:
        for exp_dict in exp_list:
            savedir = create_experiment(exp_dict, savedir_base,
                                        reset=reset, verbose=True)
            # do trainval
            func(exp_dict=exp_dict, savedir=savedir, args=args)
    else:
        # launch jobs
        from haven import haven_jobs as hjb

        assert job_config is not None
        assert 'account_id' in job_config

        jm = hjb.JobManager(
            exp_list=exp_list,
            savedir_base=savedir_base,
            workdir=os.getcwd(),
            job_config=job_config,
        )

        command = ('python trainval.py -ei <exp_id> -sb %s -d %s' %
                   (savedir_base, args.datadir))
        print(command)
        jm.launch_menu(command=command, in_parallel=use_threads)
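# A minimal usage sketch for run_wizard above. `trainval`, `EXP_GROUPS`, and
# the savedir path are hypothetical placeholders; only the keyword names come
# from the signature above, so treat this as a sketch rather than canonical
# usage.
def trainval(exp_dict, savedir, args):
    # train/validate a single experiment; outputs go under savedir
    pass


EXP_GROUPS = {'mnist': [{'model': {'name': 'mlp', 'n_layers': 30},
                         'dataset': 'mnist', 'batch_size': 1}]}

if __name__ == '__main__':
    # with no job_config this runs locally when args.run_jobs is off;
    # passing a job_config dict containing 'account_id' enables the
    # job-launch branch above
    run_wizard(func=trainval,
               exp_groups=EXP_GROUPS,
               savedir_base='/path/to/results')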
}
exp_list = [{'model': {'name': 'mlp', 'n_layers': 20},
             'dataset': 'mnist',
             'batch_size': 1}]
savedir_base = '/mnt/results/test'

jm = hjb.JobManager(
    exp_list=exp_list,
    savedir_base=savedir_base,
    workdir=os.path.dirname(os.path.realpath(__file__)),
    job_config=job_config,
    account_id='75ce4cee-6829-4274-80e1-77e89559ddfb',
)

# get jobs
job_list_old = jm.get_jobs()

# run single command
savedir_logs = '%s/%s' % (savedir_base, np.random.randint(1000))
os.makedirs(savedir_logs, exist_ok=True)
command = 'echo 2'
job_id = jm.submit_job(command, workdir=jm.workdir,
                       savedir_logs=savedir_logs)

# get jobs
        savedir_base=args.savedir_base,
        datadir_base=args.datadir_base,
        reset=args.reset,
        num_workers=args.num_workers,
        pin_memory=args.pin_memory,
        ngpu=args.ngpu,
        cuda_deterministic=args.cuda_deterministic,
    )
else:
    # launch jobs
    from haven import haven_jobs as hjb
    import job_configs as jc

    jm = hjb.JobManager(exp_list=exp_list,
                        savedir_base=args.savedir_base,
                        account_id=jc.ACCOUNT_ID,
                        workdir=os.path.dirname(os.path.realpath(__file__)),
                        job_config=jc.JOB_CONFIG,
                        )
    command = ("python trainval.py "
               "-ei <exp_id> "
               "-sb {savedir_base} "
               "-d {datadir_base} "
               "-ng {ngpu} "
               "-cd {cuda_deterministic} "
               "-pm {pin_memory} "
               "-nw {num_workers}".format(savedir_base=args.savedir_base,
                                          ngpu=args.ngpu,
                                          cuda_deterministic=args.cuda_deterministic,
                                          pin_memory=args.pin_memory,
                                          num_workers=args.num_workers,
                                          datadir_base=args.datadir_base))
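# A minimal sketch of the job_configs module imported above as `jc`. The key
# names mirror the job_config dicts used elsewhere in this file; every value
# is a placeholder for a real cluster account and image.
# --- job_configs.py (sketch) ---
# ACCOUNT_ID = '<account_id>'
#
# JOB_CONFIG = {
#     'account_id': ACCOUNT_ID,
#     'image': '<registry/image>',
#     'data': ['<volume>:<mount_point>'],
#     'gpu': '1',
#     'mem': '20',
#     'cpu': '2',
# }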
# 'dataset':'mnist', 'batch_size':1}]
# savedir_base = '/home/toolkit/home_mnt/data/experiments'
# job_config = {
#     'image': 'registry.console.elementai.com/mila.mattie_sandbox.fewshotgan/fewshot-gan',
#     'data': ['mila.mattie_sandbox.fewshotgan.home:/home/toolkit/home_mnt'],

import os

import numpy as np

from haven import haven_jobs as hjb

if __name__ == '__main__':
    # return
    exp_list = [{'model': {'name': 'mlp', 'n_layers': 20},
                 'dataset': 'mnist',
                 'batch_size': 1}]
    savedir_base = '.tmp'

    # job_config mirrors the commented-out config above so the JobManager
    # call below has a definition to use
    job_config = {
        'image': 'registry.console.elementai.com/mila.mattie_sandbox.fewshotgan/fewshot-gan',
        'data': ['mila.mattie_sandbox.fewshotgan.home:/home/toolkit/home_mnt'],
    }

    jm = hjb.JobManager(exp_list=exp_list,
                        savedir_base=savedir_base,
                        workdir=os.path.dirname(os.path.realpath(__file__)),
                        job_config=job_config,
                        )

    # get jobs
    job_list_old = jm.get_jobs()

    # run single command
    savedir_logs = '%s/%s' % (savedir_base, np.random.randint(1000))
    os.makedirs(savedir_logs, exist_ok=True)
    command = 'echo 2'
    job_id = jm.submit_job(command, workdir=jm.workdir,
                           savedir_logs=savedir_logs)

    # get jobs
    job_list = jm.get_jobs()
    job = jm.get_job(job_id)
    assert job_list[0].id == job_id
def run_wizard(func, exp_list=None, exp_groups=None, job_config=None,
               savedir_base=None, reset=None, args=None, use_threads=False,
               exp_id=None, python_binary_path='python',
               python_file_path=None, workdir=None):
    if args is None:
        args = get_args()
        custom_args = {}
    else:
        custom_args = vars(args).copy()
        for k, v in vars(get_args()).items():
            setattr(args, k, v)

    # Asserts
    # =======
    savedir_base = savedir_base or args.savedir_base
    reset = reset or args.reset
    exp_id = exp_id or args.exp_id
    assert savedir_base is not None

    # Collect experiments
    # ===================
    if exp_id is not None:
        # select one experiment
        savedir = os.path.join(savedir_base, exp_id)
        exp_dict = hu.load_json(os.path.join(savedir, "exp_dict.json"))
        exp_list = [exp_dict]
    elif exp_list is None:
        # select exp group
        exp_list = []
        for exp_group_name in args.exp_group_list:
            exp_list += exp_groups[exp_group_name]

    # save results folder
    if exp_id is None:
        results_fname = args.visualize_notebook
        if len(results_fname):
            if '.ipynb' not in results_fname:
                results_fname += '.ipynb'
            create_jupyter_file(fname=results_fname, savedir_base=savedir_base)

    hu.check_duplicates(exp_list)
    print('\nRunning %d experiments' % len(exp_list))

    # Run experiments
    # ===============
    if not args.run_jobs:
        for exp_dict in exp_list:
            savedir = create_experiment(exp_dict, savedir_base,
                                        reset=reset, verbose=True)
            # do trainval
            func(exp_dict=exp_dict, savedir=savedir, args=args)
    else:
        # launch jobs
        from haven import haven_jobs as hjb

        assert job_config is not None
        assert 'account_id' in job_config

        if workdir is None:
            workdir = os.getcwd()

        jm = hjb.JobManager(exp_list=exp_list,
                            savedir_base=savedir_base,
                            workdir=workdir,
                            job_config=job_config,
                            )

        if python_file_path is None:
            python_file_path = os.path.split(sys.argv[0])[-1]

        command = (f'{python_binary_path} {python_file_path} '
                   f'-ei <exp_id> -sb {savedir_base}')
        for k, v in custom_args.items():
            if k not in ['savedir_base', 'sb', 'ei', 'exp_id', 'e',
                         'exp_group_list', 'j', 'run_jobs', 'r', 'reset',
                         'v', 'visualize_notebook']:
                command += f" --{k} {v}"
        print(command)
        jm.launch_menu(command=command, in_parallel=use_threads)
def run_wizard(
    func,
    exp_list=None,
    exp_groups=None,
    job_config=None,
    savedir_base=None,
    reset=None,
    args=None,
    use_threads=False,
    exp_id=None,
    python_binary_path="python",
    python_file_path=None,
    workdir=None,
    job_scheduler=None,
    save_logs=True,
    filter_duplicates=False,
    results_fname=None,
):
    if args is None:
        args = get_args()
        custom_args = {}
    else:
        custom_args = vars(args).copy()
        for k, v in vars(get_args()).items():
            if k in custom_args:
                continue
            setattr(args, k, v)

    # Asserts
    # =======
    savedir_base = savedir_base or args.savedir_base
    reset = reset or args.reset
    exp_id = exp_id or args.exp_id
    assert savedir_base is not None

    # Collect experiments
    # ===================
    if exp_id is not None:
        # select one experiment
        savedir = os.path.join(savedir_base, exp_id)
        exp_dict = hu.load_json(os.path.join(savedir, "exp_dict.json"))
        exp_list = [exp_dict]
    elif exp_list is None:
        # select exp group
        exp_list = []
        for exp_group_name in args.exp_group_list:
            exp_list += exp_groups[exp_group_name]

    if filter_duplicates:
        n_total = len(exp_list)
        exp_list = hu.filter_duplicates(exp_list)
        print(f"Filtered {len(exp_list)}/{n_total}")

    hu.check_duplicates(exp_list)
    print("\nRunning %d experiments" % len(exp_list))

    # save results folder
    if exp_id is None and results_fname is not None:
        if len(results_fname):
            if ".ipynb" not in results_fname:
                raise ValueError(".ipynb should be the file extension")
            hj.create_jupyter_file(fname=results_fname, savedir_base=savedir_base)

    # Run experiments
    # ===============
    if job_scheduler is None:
        job_scheduler = args.job_scheduler

    if job_scheduler in [None, "None", "0"]:
        job_scheduler = None
    elif job_scheduler in ["toolkit", "slurm", "gcp"]:
        job_scheduler = args.job_scheduler
    elif job_scheduler in ["1"]:
        job_scheduler = "toolkit"
    else:
        raise ValueError(f"{job_scheduler} does not exist")

    if job_scheduler is None:
        for exp_dict in exp_list:
            savedir = create_experiment(exp_dict, savedir_base, reset=reset, verbose=True)
            # do trainval
            func(exp_dict=exp_dict, savedir=savedir, args=args)
    else:
        # launch jobs
        print(f"Using Job Scheduler: {job_scheduler}")
        from haven import haven_jobs as hjb

        assert job_config is not None
        assert "account_id" in job_config

        if workdir is None:
            workdir = os.getcwd()

        jm = hjb.JobManager(
            exp_list=exp_list,
            savedir_base=savedir_base,
            workdir=workdir,
            job_config=job_config,
            job_scheduler=job_scheduler,
            save_logs=save_logs,
        )

        if python_file_path is None:
            python_file_path = os.path.split(sys.argv[0])[-1]

        command = f"{python_binary_path} {python_file_path} --exp_id <exp_id> --savedir_base {savedir_base}"
        for k, v in custom_args.items():
            if k not in [
                "savedir_base",
                "sb",
                "ei",
                "exp_id",
                "e",
                "exp_group_list",
                "j",
                "job_scheduler",
                "r",
                "reset",
            ]:
                command += f" --{k} {v}"
        print(command)
        jm.launch_menu(command=command, in_parallel=use_threads)
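# A minimal usage sketch for the run_wizard variant above. All values are
# placeholders: 'account_id' is the only job_config key this function asserts
# on, and `trainval` is a hypothetical entry point.
JOB_CONFIG = {
    "account_id": "<account_id>",
    "image": "<registry/image>",
    "gpu": "1",
    "mem": "20",
    "cpu": "2",
}


def trainval(exp_dict, savedir, args):
    # called once per experiment with its config dict and save directory
    pass


if __name__ == "__main__":
    run_wizard(
        func=trainval,
        exp_groups={"mnist": [{"dataset": "mnist", "batch_size": 1}]},
        job_config=JOB_CONFIG,
        savedir_base="/path/to/results",
        # "1" maps to the toolkit scheduler in the dispatch logic above;
        # named schedulers defer to args.job_scheduler instead
        job_scheduler="1",
        filter_duplicates=True,
    )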
    savedir = os.path.join(args.savedir_base, args.exp_id)
    exp_dict = hu.load_json(os.path.join(savedir, "exp_dict.json"))
    exp_list = [exp_dict]
else:
    # select exp group
    exp_list = []
    for exp_group_name in args.exp_group_list:
        exp_list += exp_configs.EXP_GROUPS[exp_group_name]

# Run experiments
# ===============
if args.run_jobs:
    from haven import haven_jobs as hjb

    jm = hjb.JobManager(exp_list=exp_list, savedir_base=args.savedir_base)
    jm_summary_list = jm.get_summary()
    print(jm.get_summary()['status'])

    import usr_configs as uc
    uc.run_jobs(exp_list, args.savedir_base, args.datadir)
else:
    for exp_dict in exp_list:
        # do trainval
        trainval(exp_dict=exp_dict,
                 savedir_base=args.savedir_base,
                 datadir=args.datadir,
                 reset=args.reset,
                 num_workers=args.num_workers)
import os

import numpy as np

from haven import haven_jobs as hjb
from haven import haven_jupyter as hj
from haven import haven_results as hr
from haven import haven_utils as hu


def test_toolkit():
    # toolkit tests
    import job_configs

    exp_list = [{"model": {"name": "mlp", "n_layers": 20},
                 "dataset": "mnist",
                 "batch_size": 1}]
    savedir_base = os.path.realpath(".tmp")
    os.makedirs(savedir_base, exist_ok=True)

    jm = hjb.JobManager(
        exp_list=exp_list,
        savedir_base=savedir_base,
        workdir=os.path.dirname(os.path.realpath(__file__)),
        job_config=job_configs.JOB_CONFIG,
    )

    # get jobs
    job_list_old = jm.get_jobs()

    # run single command
    savedir_logs = "%s/%s" % (savedir_base, np.random.randint(1000))
    os.makedirs(savedir_logs, exist_ok=True)
    command = "echo 2"
    job_id = jm.submit_job(command, workdir=jm.workdir, savedir_logs=savedir_logs)

    # get jobs
    job_list = jm.get_jobs()
    job = jm.get_job(job_id)
    assert job_list[0]["id"] == job_id
    # jm.kill_job(job_list[0].id)

    # run
    print("jobs:", len(job_list_old), len(job_list))
    assert (len(job_list_old) + 1) == len(job_list)

    # command_list = []
    # for exp_dict in exp_list:
    #     command_list += []
    # hjb.run_command_list(command_list)
    # jm.launch_menu(command=command)

    jm.launch_exp_list(command="echo 2 -e <exp_id>", reset=1, in_parallel=False)
    assert os.path.exists(os.path.join(savedir_base,
                                       hu.hash_dict(exp_list[0]),
                                       "job_dict.json"))

    summary_list = jm.get_summary_list()
    print(hu.filter_list(summary_list, {"job_state": "SUCCEEDED"}))
    print(hu.group_list(summary_list, key="job_state", return_count=True))

    rm = hr.ResultManager(exp_list=exp_list, savedir_base=savedir_base)
    rm_summary_list = rm.get_job_summary()

    db = hj.get_dashboard(rm, wide_display=True)
    db.display()
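# A minimal sketch for running the test above as a script and inspecting
# scores outside a notebook. get_score_df is assumed here from haven's
# ResultManager API; if the installed version lacks it, rm.get_job_summary()
# as used above serves the same purpose.
if __name__ == "__main__":
    test_toolkit()

    rm = hr.ResultManager(
        exp_list=[{"model": {"name": "mlp", "n_layers": 20},
                   "dataset": "mnist", "batch_size": 1}],
        savedir_base=os.path.realpath(".tmp"),
    )
    print(rm.get_score_df())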