def main():
    # parse config
    parser = flags.get_parser()
    args, override_args = parser.parse_known_args()
    config = build_config(args, override_args)

    # add parameters to tune using grid or random search
    config["lr"] = tune.loguniform(0.0001, 0.01)

    # define scheduler
    scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric="val_loss",
        mode="min",
        perturbation_interval=1,
        hyperparam_mutations={
            "lr": tune.loguniform(0.000001, 0.01),
        },
    )

    # ray init
    ray.init(
        address="auto",
        _node_ip_address=os.environ["ip_head"].split(":")[0],
        _redis_password=os.environ["redis_password"],
    )

    # define command line reporter
    reporter = CLIReporter(
        print_intermediate_tables=True,
        metric="val_loss",
        mode="min",
        metric_columns={
            "act_lr": "act_lr",
            "steps": "steps",
            "epochs": "epochs",
            "training_iteration": "training_iteration",
            "val_loss": "val_loss",
            "val_forces_mae": "val_forces_mae",
        },
    )

    # define run parameters
    analysis = tune.run(
        ocp_trainable,
        resources_per_trial={"cpu": 8, "gpu": 1},
        config=config,
        stop={"epochs": 12},
        # time_budget_s=28200,
        fail_fast=False,
        local_dir=config.get("run_dir", "./"),
        num_samples=8,
        progress_reporter=reporter,
        scheduler=scheduler,
    )

    print(
        "Best config is:",
        analysis.get_best_config(
            metric="val_forces_mae", mode="min", scope="last"
        ),
    )
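# Note: the tuning scripts in this section hand an `ocp_trainable` callable to
# tune.run() that is not shown in these excerpts. The block below is only a
# hedged sketch of the Ray Tune function-trainable contract they rely on, with
# a dummy loss in place of the real OCP trainer; `example_trainable`, the step
# counter, and the dummy metric values are illustrative assumptions, not
# project code.
import random

from ray import tune


def example_trainable(config, checkpoint_dir=None):
    # A function trainable receives the sampled hyperparameters in `config`
    # and periodically reports the metric names that the CLIReporter and the
    # PBT/ASHA schedulers above key on ("val_loss", "val_forces_mae", ...).
    for epoch in range(1, 13):
        val_loss = random.random()  # dummy stand-in for one training epoch
        tune.report(
            epochs=epoch,
            steps=epoch * 1000,  # illustrative step counter
            val_loss=val_loss,
            val_forces_mae=val_loss,
        )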
def load_config_from_yaml_and_cmd(self, args):
    self.config = build_config(args)

    # AMP Scaler
    self.scaler = (
        torch.cuda.amp.GradScaler() if self.config["amp"] else None
    )

    # device
    self.device = torch.device(
        "cuda" if (torch.cuda.is_available() and not self.cpu) else "cpu"
    )

    # Are we just running sanity checks?
    self.is_debug = args.debug
    self.is_vis = args.vis

    # timestamps and directories
    args.timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    if args.identifier:
        args.timestamp += "-{}".format(args.identifier)

    args.checkpoint_dir = os.path.join("checkpoints", args.timestamp)
    args.results_dir = os.path.join("results", args.timestamp)
    args.logs_dir = os.path.join(
        "logs", self.config["logger"], args.timestamp
    )

    print(yaml.dump(self.config, default_flow_style=False))
    for arg in vars(args):
        print("{:<20}: {}".format(arg, getattr(args, arg)))

    # TODO(abhshkdz): Handle these parameters better. Maybe move to yaml.
    self.config["cmd"] = args.__dict__
    del args

    if not self.is_debug:
        os.makedirs(self.config["cmd"]["checkpoint_dir"], exist_ok=True)
        os.makedirs(self.config["cmd"]["results_dir"], exist_ok=True)
        os.makedirs(self.config["cmd"]["logs_dir"], exist_ok=True)

        # Dump config parameters
        json.dump(
            self.config,
            open(
                os.path.join(
                    self.config["cmd"]["checkpoint_dir"], "config.json"
                ),
                "w",
            ),
        )
import logging

from tqdm import trange

from ocpmodels.models.gemnet.layers.scaling import AutomaticFit
from ocpmodels.models.gemnet.utils import write_json
from ocpmodels.common.flags import flags
from ocpmodels.common.registry import registry
from ocpmodels.common.utils import build_config, setup_imports, setup_logging

if __name__ == "__main__":
    setup_logging()

    num_batches = 16  # number of batches to use to fit a single variable

    parser = flags.get_parser()
    args, override_args = parser.parse_known_args()
    config = build_config(args, override_args)

    assert config["model"]["name"].startswith("gemnet")
    config["logger"] = "tensorboard"

    if args.distributed:
        raise ValueError(
            "I don't think this works with DDP (race conditions)."
        )

    setup_imports()

    scale_file = config["model"]["scale_file"]

    logging.info(f"Run fitting for model: {args.identifier}")
    logging.info(f"Target scale file: {scale_file}")

    def initialize_scale_file(scale_file):
        trainer.run_relaxations()

        distutils.synchronize()
        if distutils.is_master():
            print("Total time taken = ", time.time() - start_time)
    finally:
        if args.distributed:
            distutils.cleanup()


if __name__ == "__main__":
    parser = flags.get_parser()
    args = parser.parse_args()
    config = build_config(args)

    if args.submit:  # Run on cluster
        if args.sweep_yml:  # Run grid search
            configs = create_grid(config, args.sweep_yml)
        else:
            configs = [config]

        print(f"Submitting {len(configs)} jobs")
        executor = submitit.AutoExecutor(folder=args.logdir / "%j")
        executor.update_parameters(
            name=args.identifier,
            mem_gb=args.slurm_mem,
            timeout_min=args.slurm_timeout * 60,
            slurm_partition=args.slurm_partition,
            gpus_per_node=args.num_gpus,
def main():
    # parse config
    parser = flags.get_parser()
    args, override_args = parser.parse_known_args()
    config = build_config(args, override_args)

    # add parameters to tune using grid or random search
    config["model"].update(
        hidden_channels=tune.choice([256, 384, 512, 640, 704]),
        decoder_hidden_channels=tune.choice([256, 384, 512, 640, 704]),
        depth_mlp_edge=tune.choice([1, 2, 3, 4, 5]),
        depth_mlp_node=tune.choice([1, 2, 3, 4, 5]),
        num_interactions=tune.choice([3, 4, 5, 6]),
    )

    # define scheduler
    scheduler = ASHAScheduler(
        time_attr="steps",
        metric="val_loss",
        mode="min",
        max_t=100000,
        grace_period=2000,
        reduction_factor=4,
        brackets=1,
    )

    # ray init
    # for debug
    # ray.init(local_mode=True)
    # for slurm cluster
    ray.init(
        address="auto",
        _node_ip_address=os.environ["ip_head"].split(":")[0],
        _redis_password=os.environ["redis_password"],
    )

    # define command line reporter
    reporter = CLIReporter(
        print_intermediate_tables=True,
        metric="val_loss",
        mode="min",
        metric_columns={
            "steps": "steps",
            "epochs": "epochs",
            "training_iteration": "training_iteration",
            "val_loss": "val_loss",
            "val_forces_mae": "val_forces_mae",
        },
    )

    # define run parameters
    analysis = tune.run(
        ocp_trainable,
        resources_per_trial={"cpu": 8, "gpu": 1},
        config=config,
        fail_fast=False,
        local_dir=config.get("run_dir", "./"),
        num_samples=500,
        progress_reporter=reporter,
        scheduler=scheduler,
    )

    print(
        "Best config is:",
        analysis.get_best_config(
            metric="val_forces_mae", mode="min", scope="last"
        ),
    )
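# Hedged usage note (the entry point is not shown in this excerpt): like the
# PBT script above, this module is presumably executed directly on the Ray
# head node once `ip_head` and `redis_password` have been exported by the
# cluster start-up step, i.e. it ends with the standard entry-point guard:
if __name__ == "__main__":
    main()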