def hydra_main(cfg: DictConfig) -> None:
    # Set up python logging.
    logger = logging.getLogger()
    logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s',
                        level=cfg.log_level,
                        datefmt='%Y-%m-%d %H:%M:%S')
    logging.info(cfg.pretty())
    if 'slurm' in cfg.train:
        slurm_dir = Path.cwd() / 'slurm'
        slurm_dir.mkdir()
        executor = submitit.AutoExecutor(slurm_dir)
        executor.update_parameters(
            slurm_gpus_per_node=cfg.train.slurm.gpus_per_node,
            slurm_nodes=cfg.train.slurm.nodes,
            slurm_ntasks_per_node=cfg.train.slurm.gpus_per_node,
            slurm_cpus_per_task=cfg.train.slurm.cpus_per_task,
            slurm_time=cfg.train.slurm.time,
            slurm_additional_parameters={
                'constraint': 'gpu',
                'account': cfg.train.slurm.account
            })
        job = executor.submit(train, cfg=cfg)
        logging.info(f'submitted job {job.job_id}.')
    else:
        train(cfg)

def main(args):
    if not os.path.exists('./results'):
        os.mkdir('./results')
    run_IDs = list(range(20))
    errors = {r: None for r in run_IDs}
    jobs = {r: None for r in run_IDs}

    # initialize job executor
    executor = submitit.AutoExecutor(folder="./logs")
    executor.update_parameters(
        nodes=1,
        tasks_per_node=1,
        cpus_per_task=3,
        slurm_mem='20GB',
        slurm_gres='gpu:1',
        slurm_time='8:00:00',
        slurm_job_name='osc',
        slurm_array_parallelism=20)

    # execute 3-step process sequentially
    print('step 1: parsing')
    fn = lambda r: get_base_parses(r, reverse=args.reverse)
    jobs, errors = array_step(executor, fn, jobs, run_IDs, errors)
    save_errors(errors)

    print('step 2: optimization')
    fn = lambda r: optimize_parses(r, reverse=args.reverse)
    jobs, errors = array_step(executor, fn, jobs, run_IDs, errors)
    save_errors(errors)

    print('step 3: re-fitting')
    executor.update_parameters(slurm_time='48:00:00')  # more compute time needed for this step
    fn = lambda r: refit_parses_multi(r, reverse=args.reverse)
    jobs, errors = array_step(executor, fn, jobs, run_IDs, errors)
    save_errors(errors)

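# Hedged sketch (not from the source): `array_step` above is an external helper
# whose definition is not shown. A plausible implementation, assuming each step
# maps `fn` over the run IDs as one SLURM job array and records per-run
# failures instead of raising:
def array_step(executor, fn, jobs, run_IDs, errors):
    # submit all runs as a single job array (bounded by slurm_array_parallelism)
    with executor.batch():
        for r in run_IDs:
            jobs[r] = executor.submit(fn, r)
    # block until every run finishes, collecting exceptions for later reporting
    for r in run_IDs:
        try:
            jobs[r].result()
        except Exception as err:
            errors[r] = err
    return jobs, errors
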
def launch():
    executor = submitit.AutoExecutor(folder=args.folder)
    executor.update_parameters(
        slurm_partition=args.partition,
        slurm_constraint=args.device,
        slurm_comment='comms release April 30',
        slurm_mem='450G',
        timeout_min=args.time,
        nodes=args.nodes,
        tasks_per_node=args.tasks_per_node,
        cpus_per_task=10,
        gpus_per_node=args.tasks_per_node)

    config_fnames = [args.fname]
    if args.batch_launch:
        with open(args.fname, 'r') as y_file:
            config_fnames = yaml.load(y_file, Loader=yaml.FullLoader)

    jobs, trainers = [], []
    with executor.batch():
        for cf in config_fnames:
            fb_trainer = Trainer(args.sel, cf)
            job = executor.submit(fb_trainer)
            trainers.append(fb_trainer)
            jobs.append(job)
    for job in jobs:
        print(job.job_id)

def launch_benchmark_suite_scheduler(config_file):
    assert g_pathmgr.exists(config_file), "Slurm evaluator config file must exist"

    user_config = load_file(config_file)
    config = _DEFAULT_CONFIG.copy()
    recursive_dict_merge(config, user_config)

    benchmark_suite_scheduler = BenchmarkSuiteScheduler(**config["params"])
    benchmark_suite_scheduler_job = SlurmEvaluatorJob(
        benchmark_suite_scheduler=benchmark_suite_scheduler)
    executor = submitit.AutoExecutor(
        folder=benchmark_suite_scheduler.evaluation_dir())

    assert "slurm_options" in config, "slurm_options must be specified"
    assert (
        "PARTITION" in config["slurm_options"]
    ), "slurm_options.PARTITION is a required field to launch the benchmark suite on slurm"

    slurm_options = AttrDict(config["slurm_options"])
    executor.update_parameters(
        name=slurm_options.NAME,
        slurm_comment=slurm_options.COMMENT,
        slurm_partition=slurm_options.PARTITION,
        slurm_constraint=slurm_options.CONSTRAINT,
        timeout_min=slurm_options.TIMEOUT_MIN,
        nodes=1,
        cpus_per_task=slurm_options.CPUS_PER_TASK,
        tasks_per_node=1,
        mem_gb=slurm_options.MEM_GB,
        slurm_additional_parameters=slurm_options.ADDITIONAL_PARAMETERS,
    )

    job = executor.submit(benchmark_suite_scheduler_job)
    print(f"SUBMITTED EVALUATION JOB: {job.job_id}")

def main():
    args = opts.parse_args()

    # Note that the folder will depend on the job_id, to easily track experiments
    executor = submitit.AutoExecutor(
        folder=utils.get_shared_folder(args.name) / "%j")

    num_gpus_per_node = 8
    args.batch_size = args.batch_size * num_gpus_per_node
    executor.update_parameters(
        mem_gb=45 * num_gpus_per_node,
        gpus_per_node=num_gpus_per_node,
        # tasks_per_node=1,  # one task per GPU
        cpus_per_task=80,
        nodes=1,
        timeout_min=60 * 16,
        # Below are cluster dependent parameters
        slurm_partition="dev",
        slurm_signal_delay_s=120,
    )
    executor.update_parameters(name=args.name)

    args.dist_url = utils.get_init_file(args.name).as_uri()
    args.output_dir = str(utils.get_shared_folder(args.name))

    trainer = Trainer(args)
    job = executor.submit(trainer)

def launch(
    self, job_overrides: Sequence[Sequence[str]], initial_job_idx: int
) -> Sequence[JobReturn]:
    # lazy import to ensure plugin discovery remains fast
    import submitit

    num_jobs = len(job_overrides)
    assert num_jobs > 0
    params = self.params

    # build executor
    init_params = {"folder": self.params["submitit_folder"]}
    specific_init_keys = {"max_num_timeout"}
    init_params.update(
        **{
            f"{self._EXECUTOR}_{x}": y
            for x, y in params.items()
            if x in specific_init_keys
        }
    )
    init_keys = specific_init_keys | {"submitit_folder"}
    executor = submitit.AutoExecutor(cluster=self._EXECUTOR, **init_params)

    # specify resources/parameters
    baseparams = set(dataclasses.asdict(BaseTarget()).keys())
    params = {
        x if x in baseparams else f"{self._EXECUTOR}_{x}": y
        for x, y in params.items()
        if x not in init_keys
    }
    executor.update_parameters(**params)

    log.info(
        f"Submitit '{self._EXECUTOR}' sweep output dir : "
        f"{self.config.hydra.sweep.dir}"
    )
    sweep_dir = Path(str(self.config.hydra.sweep.dir))
    sweep_dir.mkdir(parents=True, exist_ok=True)
    if "mode" in self.config.hydra.sweep:
        mode = int(str(self.config.hydra.sweep.mode), 8)
        os.chmod(sweep_dir, mode=mode)

    params = []
    for idx, overrides in enumerate(job_overrides):
        idx = initial_job_idx + idx
        lst = " ".join(filter_overrides(overrides))
        log.info(f"\t#{idx} : {lst}")
        params.append(
            (
                list(overrides),
                "hydra.sweep.dir",
                idx,
                f"job_id_for_{idx}",
                Singleton.get_state(),
            )
        )

    jobs = executor.map_array(self, *zip(*params))
    return [j.results()[0] for j in jobs]

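# Hedged usage note (not from the source): the `launch` method above is the
# core of the hydra-submitit-launcher plugin. Assuming that plugin is
# installed, a sweep is typically driven from the command line rather than by
# calling `launch` directly, e.g.:
#
#   python my_app.py --multirun hydra/launcher=submitit_slurm \
#       hydra.launcher.timeout_min=60 hydra.launcher.gpus_per_node=1 \
#       db=mysql,postgresql
#
# Each override combination becomes one element of a submitit job array.
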
def init_executor(executor, args):
    # note: the incoming `executor` argument is never used; it is replaced
    # with a fresh AutoExecutor logging under a per-job folder ("%j")
    log_folder = f"{args.log_dir}/%j"
    executor = submitit.AutoExecutor(folder=log_folder)
    executor.update_parameters(timeout_min=4,
                               slurm_partition="dev",
                               gpus_per_node=args.ngpus)
    return executor

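# Hedged usage sketch (not from the source); `run_fn`, `args.log_dir` and
# `args.ngpus` are assumptions standing in for the caller's objects:
#
#   executor = init_executor(None, args)
#   job = executor.submit(run_fn)      # any picklable callable
#   print(job.job_id, job.result())    # result() blocks until completion
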
def main():
    args = parse_args()
    if args.job_dir == "":
        args.job_dir = get_shared_folder() / "%j"

    # Note that the folder will depend on the job_id, to easily track experiments
    executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30)

    # cluster setup is defined by environment variables
    num_gpus_per_node = args.ngpus
    nodes = args.nodes
    timeout_min = args.timeout

    executor.update_parameters(
        mem_gb=40 * num_gpus_per_node,
        gpus_per_node=num_gpus_per_node,
        tasks_per_node=num_gpus_per_node,  # one task per GPU
        cpus_per_task=10,
        nodes=nodes,
        timeout_min=timeout_min,  # max is 60 * 72
    )
    executor.update_parameters(name="detr")

    args.dist_url = get_init_file().as_uri()
    args.output_dir = args.job_dir

    trainer = Trainer(args)
    job = executor.submit(trainer)
    print("Submitted job_id:", job.job_id)

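# Hedged sketch (not from the source): several launchers in this file submit a
# `Trainer(args)` object together with `slurm_max_num_timeout`. For submitit to
# requeue after preemption or timeout, the callable should expose a
# `checkpoint` method returning a `DelayedSubmission`. A minimal shape, where
# `main_worker` is a hypothetical training entry point:
import submitit


class Trainer:
    def __init__(self, args):
        self.args = args

    def __call__(self):
        main_worker(self.args)  # hypothetical: the actual training loop

    def checkpoint(self):
        # resubmit a fresh copy of this callable after preemption/timeout
        return submitit.helpers.DelayedSubmission(Trainer(self.args))
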
def main(args):
    files = Path(args.alignment_dir).glob("*.a3m")
    output_dir = Path(args.output_dir)

    def commands():
        for file in files:
            base_command = [
                "bash",
                "run_training_in_conda_env.sh",
                str(file),
                str(output_dir),
            ]
            yield base_command

    executor = submitit.AutoExecutor(
        folder=f"/checkpoint/{os.environ['USER']}/deepsequence-timing-logs")
    executor.update_parameters(
        timeout_min=3000,
        slurm_partition="learnfair",
        gpus_per_node=1,
        mem_gb=64,
        cpus_per_task=10,
        slurm_constraint="volta32gb",
        slurm_array_parallelism=32,
    )
    runfunc = partial(timed_run, output_dir=output_dir)
    with executor.batch():
        # call the generator function; iterating the function object would fail
        for command in commands():
            executor.submit(runfunc, command)

def create_submitit_executor(cfg: AttrDict):
    """
    Utility function to create a SLURM submitit executor, which is able
    to schedule arbitrary functions on a SLURM cluster.

    The configuration of the executor is derived from the SLURM part of
    the VISSL configuration provided as parameter.
    """
    import submitit

    log_folder = cfg.SLURM.LOG_FOLDER
    makedir(log_folder)
    assert g_pathmgr.exists(
        log_folder
    ), f"Specified config.SLURM.LOG_FOLDER={log_folder} doesn't exist"
    assert cfg.SLURM.PARTITION, "SLURM.PARTITION must be set when using SLURM"

    executor = submitit.AutoExecutor(folder=log_folder)
    timeout_min = cfg.SLURM.TIME_HOURS * 60 + cfg.SLURM.TIME_MINUTES
    executor.update_parameters(
        name=cfg.SLURM.NAME,
        slurm_comment=cfg.SLURM.COMMENT,
        slurm_partition=cfg.SLURM.PARTITION,
        slurm_constraint=cfg.SLURM.CONSTRAINT,
        timeout_min=timeout_min,
        nodes=cfg.DISTRIBUTED.NUM_NODES,
        cpus_per_task=cfg.SLURM.NUM_CPU_PER_PROC * cfg.DISTRIBUTED.NUM_PROC_PER_NODE,
        tasks_per_node=1,
        gpus_per_node=cfg.DISTRIBUTED.NUM_PROC_PER_NODE,
        mem_gb=cfg.SLURM.MEM_GB,
        slurm_additional_parameters=cfg.SLURM.ADDITIONAL_PARAMETERS,
    )
    return executor

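# Hedged usage sketch (not from the source); `cfg` and `train_main` are
# assumptions standing in for a real VISSL config and entry point:
#
#   executor = create_submitit_executor(cfg)
#   job = executor.submit(train_main, cfg)
#   print(f"SUBMITTED: {job.job_id}")
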
def hydra_main(cfg: DictConfig) -> None:
    # Set up python logging.
    logger = logging.getLogger()
    if is_rank_zero():
        logger.setLevel(cfg.log_level)
        logging.info(OmegaConf.to_yaml(cfg))
        wandb_version = wandb.util.generate_id()
        add_wandb_version(cfg, wandb_version)
    if cfg.cluster.name == 'slurm':
        slurm_dir = Path.cwd() / 'slurm'
        slurm_dir.mkdir()
        logging.info(f'Slurm logs: {slurm_dir}')
        executor = submitit.AutoExecutor(slurm_dir)
        executor.update_parameters(
            slurm_gpus_per_node=cfg.cluster.gpus_per_node,
            slurm_nodes=cfg.cluster.nodes,
            slurm_ntasks_per_node=cfg.cluster.gpus_per_node,
            slurm_cpus_per_task=cfg.cluster.cpus_per_task,
            slurm_time=cfg.cluster.time,
            slurm_additional_parameters={
                'constraint': 'gpu',
                'account': cfg.cluster.account,
                'requeue': True
            })
        job = executor.submit(train, cfg=cfg)
        logging.info(f'submitted job {job.job_id}.')
    else:
        train(cfg)

def main():
    args = parse()
    out_sents = []
    with open(args.data_path, "r") as fp:
        sent_list = [x.strip() for x in fp.readlines()]

    # initialize to None so the check below is safe even when only one job is requested
    submitit = None
    if args.parallel_process_num > 1:
        try:
            import submitit
        except ImportError:
            logger.warning(
                "submitit is not found and only one job is used to process the data"
            )

    if args.parallel_process_num == 1 or submitit is None:
        out_sents = process_sents(sent_list, args)
    else:
        # process sentences with parallel computation
        lsize = len(sent_list) // args.parallel_process_num + 1
        executor = submitit.AutoExecutor(folder=args.logdir)
        executor.update_parameters(timeout_min=1000, cpus_per_task=4)
        jobs = []
        for i in range(args.parallel_process_num):
            job = executor.submit(
                process_sents, sent_list[lsize * i:lsize * (i + 1)], args)
            jobs.append(job)
        is_running = True
        while is_running:
            time.sleep(5)
            is_running = sum(job.done() for job in jobs) < len(jobs)
        out_sents = list(
            itertools.chain.from_iterable(job.result() for job in jobs))

    with open(args.out_path, "w") as fp:
        fp.write("\n".join(out_sents) + "\n")

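# Hedged aside (not from the source): the polling loop above can be dropped
# entirely, since `job.result()` already blocks until the job finishes:
#
#   out_sents = list(
#       itertools.chain.from_iterable(job.result() for job in jobs))
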
def main():
    t0 = time.time()
    # Cleanup log folder.
    # This folder may grow rapidly especially if you have large checkpoints,
    # or submit a lot of jobs. You should think about an automated way of cleaning it.
    folder = Path(__file__).parent / "mnist_logs"
    if folder.exists():
        for file in folder.iterdir():
            file.unlink()

    ex = submitit.AutoExecutor(folder)
    if ex.cluster == "slurm":
        print("Executor will schedule jobs on Slurm.")
    else:
        print(
            f"!!! Slurm executable `srun` not found. Will execute jobs on '{ex.cluster}'"
        )

    model_path = folder / "model.pkl"
    trainer = MnistTrainer(
        LogisticRegression(penalty="l1", solver="saga", tol=0.1, multi_class="auto"))

    # Specify the job requirements.
    # Reserving only as much resource as you need ensures the cluster resources are
    # efficiently allocated.
    ex.update_parameters(mem_gb=1, cpus_per_task=4, timeout_min=5)
    job = ex.submit(trainer, 5000, model_path=model_path)
    print(f"Scheduled {job}.")

    # Wait for the job to be running.
    while job.state != "RUNNING":
        time.sleep(1)
    print("Run the following command to see what's happening")
    print(f"  less +F {job.paths.stdout}")

    # Simulate preemption: try to stop the job after the first stage.
    # If the job is preempted before the end of the first stage, increase this delay.
    # If the job is never preempted, decrease it.
    time.sleep(25)
    print(f"preempting {job} after {time.time() - t0:.0f}s")
    job._interrupt()

    score = job.result()
    print(f"Finished training. Final score: {score}.")
    print("---------------- Job output ---------------------")
    print(job.stdout())
    print("-------------------------------------------------")

    assert model_path.exists()
    with open(model_path, "rb") as f:
        (scaler, clf) = pickle.load(f)
    sparsity = np.mean(clf.coef_ == 0)  # fraction of zero coefficients
    print(f"Sparsity with L1 penalty: {sparsity:.2%}")

def test_slurm_through_auto(params: tp.Dict[str, int], tmp_path: Path) -> None:
    with mocked_slurm():
        executor = submitit.AutoExecutor(folder=tmp_path)
        executor.update_parameters(
            **params, slurm_additional_parameters={"mem_per_gpu": 12})
        job = executor.submit(test_core.do_nothing, 1, 2, blublu=3)
        text = job.paths.submission_file.read_text()
        mem_lines = [x for x in text.splitlines() if "#SBATCH --mem" in x]
        assert len(mem_lines) == 1, f"Unexpected lines: {mem_lines}"

def launch(
    self, job_overrides: Sequence[Sequence[str]], initial_job_idx: int
) -> Sequence[JobReturn]:
    # lazy import to ensure plugin discovery remains fast
    import submitit

    num_jobs = len(job_overrides)
    assert num_jobs > 0

    # make sure you don't change inplace
    queue_parameters = self.queue_parameters.copy()
    OmegaConf.set_struct(queue_parameters, True)

    if self.queue == "auto":
        max_num_timeout = self.queue_parameters.auto.max_num_timeout
        with open_dict(queue_parameters):
            del queue_parameters.auto["max_num_timeout"]
        executor = submitit.AutoExecutor(
            folder=self.folder, max_num_timeout=max_num_timeout
        )
    elif self.queue == "slurm":
        max_num_timeout = self.queue_parameters.slurm.max_num_timeout
        with open_dict(queue_parameters):
            del queue_parameters.slurm["max_num_timeout"]
        executor = submitit.SlurmExecutor(
            folder=self.folder, max_num_timeout=max_num_timeout
        )
    elif self.queue == "local":
        executor = submitit.LocalExecutor(folder=self.folder)
    else:
        raise RuntimeError("Unsupported queue type {}".format(self.queue))

    executor.update_parameters(**queue_parameters[self.queue])

    log.info("Sweep output dir : {}".format(self.config.hydra.sweep.dir))
    sweep_dir = Path(str(self.config.hydra.sweep.dir))
    sweep_dir.mkdir(parents=True, exist_ok=True)
    if "mode" in self.config.hydra.sweep:
        mode = int(str(self.config.hydra.sweep.mode), 8)
        os.chmod(sweep_dir, mode=mode)

    params = []
    for idx, overrides in enumerate(job_overrides):
        idx = initial_job_idx + idx
        lst = " ".join(filter_overrides(overrides))
        log.info(f"\t#{idx} : {lst}")
        params.append(
            (
                list(overrides),
                "hydra.sweep.dir",
                idx,
                f"job_id_for_{idx}",
                Singleton.get_state(),
            )
        )

    jobs = executor.map_array(self, *zip(*params))
    return [j.results()[0] for j in jobs]

def get_executor(job_name, timeout_hour=60, n_gpus=1,
                 project='fastmri', no_force_32=False, torch=False):
    executor = submitit.AutoExecutor(folder=job_name)
    if timeout_hour > 20:
        qos = 't4'
    elif timeout_hour > 2:
        qos = 't3'
    else:
        qos = 'dev'
    multi_node = n_gpus > 8
    if multi_node:
        assert n_gpus % 4 == 0, 'Use multiple of 4 GPUs for multi node training'
        assert timeout_hour <= 20, 'Use t3 qos for multi node training'
        n_nodes = n_gpus // 4
        n_gpus = n_gpus // n_nodes
    cpu_per_gpu = 3 if n_gpus > 4 else 10
    tasks_per_node = 1
    cpus_per_task = cpu_per_gpu * n_gpus
    slurm_params = {
        'ntasks-per-node': tasks_per_node,
        'cpus-per-task': cpus_per_task,
        'account': 'hih@gpu',
        'qos': f'qos_gpu-{qos}',
        'distribution': 'block:block',
        'hint': 'nomultithread',
    }
    slurm_setup = [
        '#SBATCH -C v100-32g',
        'cd $WORK/submission-scripts/jean_zay/env_configs',
        f'. {project}.sh',
    ]
    if 4 < n_gpus < 8:
        slurm_params.update({'partition': 'gpu_p2'})
    if (n_gpus > 4 or no_force_32) and n_gpus < 8:
        slurm_setup = slurm_setup[1:]
    if multi_node:
        slurm_params.update({'nodes': n_nodes})
        slurm_setup.append('unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY')
    executor.update_parameters(
        timeout_min=60,
        tasks_per_node=tasks_per_node,
        cpus_per_task=cpus_per_task,
        slurm_job_name=job_name,
        slurm_time=f'{timeout_hour}:00:00',
        slurm_gres=f'gpu:{n_gpus}',
        slurm_additional_parameters=slurm_params,
        slurm_setup=slurm_setup,
    )
    return executor

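# Hedged usage sketch (not from the source); assumes a Jean Zay-style cluster
# and a picklable `train_fn`:
#
#   executor = get_executor('fastmri_train', timeout_hour=10, n_gpus=4)
#   job = executor.submit(train_fn)
#   print(job.job_id)
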
def get_executor(local=False, batch=None):
    if local:
        return submitit.LocalExecutor(folder="/tmp/submitit-logs")
    executor = submitit.AutoExecutor(folder="/checkpoint/jjgo/submitit-logs/")
    if batch is not None:
        assert isinstance(batch, int)
        executor.update_parameters(slurm_array_parallelism=batch)
    return executor

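# Hedged usage sketch (not from the source): with `batch` set, jobs submitted
# inside `executor.batch()` form a SLURM array capped at that parallelism.
# `work` and `inputs` are assumptions:
#
#   executor = get_executor(batch=8)
#   with executor.batch():
#       jobs = [executor.submit(work, x) for x in inputs]
#   results = [j.result() for j in jobs]
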
def get_submitit_executor(n_jobs=10, comment="", partition='learnfair'):
    if not is_submitit_available():
        raise Exception('Submitit not installed')
    executor = submitit.AutoExecutor(folder='PAQ_embedding_jobs')
    executor.update_parameters(timeout_min=120,
                               slurm_partition=partition,
                               slurm_nodes=1,
                               slurm_ntasks_per_node=1,
                               slurm_cpus_per_task=10,
                               slurm_constraint='volta32gb',
                               slurm_gpus_per_node='volta:1',
                               slurm_array_parallelism=n_jobs,
                               slurm_comment=comment,
                               slurm_mem='64G')
    return executor

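# Hedged usage sketch (not from the source); `embed_shard` and `shards` are
# assumptions standing in for the real work function and its inputs:
#
#   executor = get_submitit_executor(n_jobs=10)
#   jobs = executor.map_array(embed_shard, shards)  # one array task per shard
#   embeddings = [j.result() for j in jobs]
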
def hydra_main(cfg: DictConfig) -> None:
    # Set up python logging.
    logger = logging.getLogger()
    logger.setLevel(cfg.log_level)
    logging.info(cfg.pretty())
    if 'slurm' in cfg.train:
        slurm_dir = Path.cwd() / 'slurm'
        slurm_dir.mkdir()
        executor = submitit.AutoExecutor(slurm_dir)
        executor.update_parameters(
            slurm_gpus_per_node=cfg.train.slurm.gpus_per_node,
            slurm_nodes=cfg.train.slurm.nodes,
            slurm_ntasks_per_node=cfg.train.slurm.gpus_per_node,
            slurm_cpus_per_task=cfg.train.slurm.cpus_per_task,
            slurm_time=cfg.train.slurm.time,
            slurm_additional_parameters={
                'constraint': 'gpu',
                'account': cfg.train.slurm.account
            })
        job = executor.submit(train, cfg=cfg, output_dir=Path.cwd())
        logging.info(f'submitted job {job.job_id}.')
    else:
        train(cfg)

def main():
    args = parse_args()
    if args.job_dir == "":
        args.job_dir = get_shared_folder(args) / "%j"

    # Note that the folder will depend on the job_id, to easily track experiments
    executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30)

    # cluster setup is defined by environment variables
    num_gpus_per_node = args.ngpus
    nodes = args.nodes
    timeout_min = args.timeout

    kwargs = {}
    if args.use_volta32:
        kwargs['constraint'] = 'volta32gb'
    if args.comment:
        kwargs['comment'] = args.comment

    executor.update_parameters(
        mem_gb=40 * num_gpus_per_node,
        tasks_per_node=num_gpus_per_node,  # one task per GPU
        cpus_per_task=10,
        nodes=nodes,
        timeout_min=10080,  # 60 * 24 * 7 minutes; check your partition's limit
        # Below are cluster dependent parameters
        slurm_gres=f"gpu:rtx8000:{num_gpus_per_node}",  # comment out or change to v100 as needed
        slurm_signal_delay_s=120,
        **kwargs)
    executor.update_parameters(name="detectransformer")
    if args.mail:
        executor.update_parameters(additional_parameters={
            'mail-user': args.mail,
            'mail-type': 'END'
        })
    executor.update_parameters(
        slurm_additional_parameters={'gres-flags': 'enforce-binding'})

    args.dist_url = get_init_file(args).as_uri()
    args.output_dir = args.job_dir

    trainer = Trainer(args)
    job = executor.submit(trainer)
    print("Submitted job_id:", job.job_id)

def main():
    args = parse_args()
    if args.job_dir == "":
        args.job_dir = get_shared_folder() / "%j"

    # Note that the folder will depend on the job_id, to easily track experiments
    executor = submitit.AutoExecutor(folder=args.job_dir, max_num_timeout=30)
    # executor = submitit.LocalExecutor(folder=get_shared_folder() / "%j")

    # cluster setup is defined by environment variables
    num_gpus_per_node = args.ngpus
    nodes = args.nodes
    partition = args.partition
    timeout_min = args.timeout

    kwargs = {}
    if args.use_volta32:
        kwargs["constraint"] = "volta32gb"
    if args.comment:
        kwargs["comment"] = args.comment

    executor.update_parameters(
        mem_gb=40 * num_gpus_per_node,
        gpus_per_node=num_gpus_per_node,
        tasks_per_node=num_gpus_per_node,  # one task per GPU
        cpus_per_task=10,
        nodes=nodes,
        timeout_min=timeout_min,  # max is 60 * 72
        # Below are cluster dependent parameters
        hostgroup="fblearner_ash_bigsur_fair",
        partition=partition,
        signal_delay_s=120,
        **kwargs,
    )
    executor.update_parameters(name="detectransformer")
    if args.mail:
        executor.update_parameters(additional_parameters={
            "mail-user": args.mail,
            "mail-type": "END"
        })

    args.dist_url = get_init_file().as_uri()
    args.output_dir = args.job_dir

    trainer = Trainer(args)
    job = executor.submit(trainer)
    print("Submitted job_id:", job.job_id)

def get_slurm_executor(slurm_config, timeout=100, job_name="benchopt_run"):
    with open(slurm_config, "r") as f:
        config = yaml.safe_load(f)

    # If the job timeout is not specified in the config file, use 1.5x the
    # benchopt timeout. This value is a trade-off between helping the
    # scheduler (a low slurm_time allows for faster acceptance) and avoiding
    # killing the job too early.
    if 'slurm_time' not in config:
        # Timeout is in seconds in benchopt
        config['slurm_time'] = f"00:{int(1.5 * timeout)}"

    executor = submitit.AutoExecutor(job_name)
    executor.update_parameters(**config)
    return executor

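# Hedged usage sketch (not from the source): the YAML keys simply mirror
# submitit's update_parameters kwargs. A hypothetical slurm.yml:
#
#   slurm_partition: gpu
#   slurm_gres: "gpu:1"
#   mem_gb: 16
#
#   executor = get_slurm_executor("slurm.yml", timeout=600)
#   job = executor.submit(run_benchmark)  # `run_benchmark` is an assumption
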
def set_it_up(self, experiment_directory):
    # create the submitit executor for creating and managing jobs
    executor = submitit.AutoExecutor(
        folder=os.path.join(experiment_directory, "Logs"))

    # setup the executor parameters based on the cluster location
    if executor.cluster == "slurm":
        executor.update_parameters(
            mem_gb=8,
            cpus_per_task=4,
            timeout_min=1000,
            tasks_per_node=1,
            nodes=1,
            slurm_partition="long",
            # gres="gpu:rtx8000:1",
        )
    return executor

def main():
    args = parse_args()

    # Note that the folder will depend on the job_id, to easily track experiments
    if args.job_dir == "":
        args.job_dir = get_shared_folder() / "%j"

    executor = submitit.AutoExecutor(
        folder=args.job_dir, cluster=args.cluster, slurm_max_num_timeout=30)

    # cluster setup is defined by environment variables
    num_gpus_per_node = args.num_gpus
    nodes = args.nodes
    timeout_min = args.timeout

    if args.slurm_gres:
        slurm_gres = args.slurm_gres
    else:
        slurm_gres = f'gpu:{num_gpus_per_node},VRAM:{args.vram}'

    executor.update_parameters(
        mem_gb=args.mem_per_gpu * num_gpus_per_node,
        # gpus_per_node=num_gpus_per_node,
        tasks_per_node=num_gpus_per_node,  # one task per GPU
        cpus_per_task=2,
        nodes=nodes,
        timeout_min=timeout_min,
        slurm_partition=args.slurm_partition,
        slurm_constraint=args.slurm_constraint,
        slurm_comment=args.slurm_comment,
        slurm_exclude=args.slurm_exclude,
        slurm_gres=slurm_gres
    )
    executor.update_parameters(name="fair_track")

    args.dist_url = get_init_file().as_uri()
    # args.output_dir = args.job_dir

    trainer = Trainer(args)
    job = executor.submit(trainer)
    print("Submitted job_id:", job.job_id)

    if args.cluster == 'debug':
        job.wait()

def main():
    args = parse_args()
    if args.name == "":
        cfg_name = os.path.splitext(os.path.basename(args.cfg_file))[0]
        args.name = '_'.join([cfg_name, args.postfix])
    assert args.job_dir != ""
    args.output_dir = str(args.job_dir)
    args.job_dir = Path(args.job_dir) / "%j"

    # Note that the folder will depend on the job_id, to easily track experiments
    executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30)

    # cluster setup is defined by environment variables
    num_gpus_per_node = args.num_gpus
    nodes = args.num_shards
    partition = args.partition
    timeout_min = args.timeout

    kwargs = {}
    if args.use_volta32:
        kwargs['slurm_constraint'] = 'volta32gb,ib4'
    if args.comment:
        kwargs['slurm_comment'] = args.comment

    executor.update_parameters(
        mem_gb=60 * num_gpus_per_node,
        gpus_per_node=num_gpus_per_node,
        tasks_per_node=1,
        cpus_per_task=10 * num_gpus_per_node,
        nodes=nodes,
        timeout_min=timeout_min,  # max is 60 * 72
        slurm_partition=partition,
        slurm_signal_delay_s=120,
        **kwargs)
    print(args.name)
    executor.update_parameters(name=args.name)

    trainer = Trainer(args)
    job = executor.submit(trainer)
    print("Submitted job_id:", job.job_id)

def cli(
    num_tables,
    num_embeddings,
    embedding_dim,
    batch_size,
    bag_size,
    iters,
    remote,
    fp16,
    managed,
    mixed,
):
    def f():
        import torch

        benchmark_forward(
            batch_size,
            num_embeddings,
            num_tables,
            bag_size,
            embedding_dim,
            iters,
            fp16,
            managed,
            mixed,
        )

    if remote:
        import submitit

        executor = submitit.AutoExecutor(folder="sparse_embedding_perf")
        executor.update_parameters(
            timeout_min=10, partition="dev", constraint="volta32gb", gpus_per_node=1
        )
        job = executor.submit(f)
        job.wait()
        job.result()
        logging.info("Finished")
        import time

        time.sleep(1)
        print(job.stdout())
        print(job.stderr(), file=sys.stderr)
        logging.info("Finished")
    else:
        f()

def main():
    args = parse_args()
    if args.job_dir == "":
        args.job_dir = get_shared_folder() / "%j"

    # Note that the folder will depend on the job_id, to easily track experiments
    executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30)

    num_gpus_per_node = args.ngpus
    nodes = args.nodes
    timeout_min = args.timeout
    partition = args.partition

    kwargs = {}
    if args.use_volta32:
        kwargs['slurm_constraint'] = 'volta32gb'
    if args.comment:
        kwargs['slurm_comment'] = args.comment

    executor.update_parameters(
        # mem_gb=40 * num_gpus_per_node,
        # gpus_per_node=num_gpus_per_node,
        tasks_per_node=num_gpus_per_node,  # one task per GPU
        # cpus_per_task=10,
        nodes=nodes,
        timeout_min=60 * 24 * 10,  # 10 days; check your partition's limit
        # Below are cluster dependent parameters
        slurm_gres="gpu:%d" % num_gpus_per_node,
        slurm_partition=partition,
        slurm_signal_delay_s=120,
        slurm_additional_parameters={
            'qos': 'non-preemptable',
            'mpi': 'pmi2'
        },
        **kwargs)
    executor.update_parameters(name="deit")

    args.dist_url = get_init_file().as_uri()
    args.output_dir = args.job_dir

    trainer = Trainer(args)
    job = executor.submit(trainer)
    print("Submitted job_id:", job.job_id)

def main():
    args = parse_args()
    if args.job_dir == "":
        args.job_dir = get_shared_folder(args) / "%j"

    # Note that the folder will depend on the job_id, to easily track experiments
    executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30)

    # cluster setup is defined by environment variables
    num_gpus_per_node = args.ngpus
    nodes = args.nodes
    partition = args.partition
    timeout_min = args.timeout

    kwargs = {}
    if partition is not None:
        kwargs["slurm_partition"] = partition

    executor.update_parameters(
        mem_gb=62 * num_gpus_per_node,
        gpus_per_node=num_gpus_per_node,
        tasks_per_node=num_gpus_per_node,  # one task per GPU
        cpus_per_task=10,
        nodes=nodes,
        timeout_min=timeout_min,  # max is 60 * 72
        # Below are cluster dependent parameters
        slurm_signal_delay_s=120,
        **kwargs,
    )
    executor.update_parameters(name="detectransformer")
    if args.mail:
        executor.update_parameters(additional_parameters={
            "mail-user": args.mail,
            "mail-type": "END"
        })

    args.dist_url = get_init_file(args).as_uri()
    args.output_dir = args.job_dir

    trainer = Trainer(args)
    job = executor.submit(trainer)
    print("Submitted job_id:", job.job_id)

def main():
    args = parse_args()
    if args.job_dir == '':
        args.job_dir = get_shared_folder() / '%j'

    executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30)

    num_gpus_per_node = args.ngpus
    nodes = args.nodes
    timeout_min = args.timeout

    executor.update_parameters(
        mem_gb=40 * num_gpus_per_node,
        gpus_per_node=num_gpus_per_node,
        tasks_per_node=num_gpus_per_node,
        cpus_per_task=10,
        nodes=nodes,
        timeout_min=timeout_min)
    executor.update_parameters(name='detr')

    args.dist_url = get_init_file().as_uri()
    args.output_dir = args.job_dir

    trainer = Trainer(args)
    job = executor.submit(trainer)
    print('Submitted job_id:', job.job_id)

def cli(num_tables, num_embeddings, embedding_dim, dense_features_dim,
        batch_size, bag_size, iters, remote):
    def f():
        benchmark_torch_snn_forward("dlrm", num_tables, num_embeddings,
                                    embedding_dim, dense_features_dim,
                                    batch_size, bag_size, iters)
        benchmark_torch_uniform_snn_forward("fused", num_tables, num_embeddings,
                                            embedding_dim, dense_features_dim,
                                            batch_size, bag_size, iters)
        benchmark_torch_uniform_snn_forward("fused-fp16", num_tables,
                                            num_embeddings, embedding_dim,
                                            dense_features_dim, batch_size,
                                            bag_size, iters, fp16=1)

    if remote:
        import submitit
        import sys

        executor = submitit.AutoExecutor(folder="dlrm_perf")
        executor.update_parameters(timeout_min=10,
                                   partition="dev",
                                   constraint="volta",
                                   gpus_per_node=1)
        job = executor.submit(f)
        job.wait()
        job.result()
        logging.info("Finished")
        import time

        time.sleep(1)
        print(job.stdout())
        print(job.stderr(), file=sys.stderr)
        logging.info("Finished")
    else:
        f()