Example #1
def main():
    # parse config
    parser = flags.get_parser()
    args, override_args = parser.parse_known_args()
    config = build_config(args, override_args)
    # add parameters to tune using grid or random search
    config["lr"] = tune.loguniform(0.0001, 0.01)
    # define scheduler
    scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric="val_loss",
        mode="min",
        perturbation_interval=1,
        hyperparam_mutations={
            "lr": tune.loguniform(0.000001, 0.01),
        },
    )
    # ray init
    ray.init(
        address="auto",
        _node_ip_address=os.environ["ip_head"].split(":")[0],
        _redis_password=os.environ["redis_password"],
    )
    # define command line reporter
    reporter = CLIReporter(
        print_intermediate_tables=True,
        metric="val_loss",
        mode="min",
        metric_columns={
            "act_lr": "act_lr",
            "steps": "steps",
            "epochs": "epochs",
            "training_iteration": "training_iteration",
            "val_loss": "val_loss",
            "val_forces_mae": "val_forces_mae",
        },
    )
    # define run parameters
    analysis = tune.run(
        ocp_trainable,
        resources_per_trial={"cpu": 8, "gpu": 1},
        config=config,
        stop={"epochs": 12},
        # time_budget_s=28200,
        fail_fast=False,
        local_dir=config.get("run_dir", "./"),
        num_samples=8,
        progress_reporter=reporter,
        scheduler=scheduler,
    )

    print(
        "Best config is:",
        analysis.get_best_config(metric="val_forces_mae",
                                 mode="min",
                                 scope="last"),
    )
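
The snippet above omits its module-level imports. A minimal sketch of what it relies on, assuming Ray Tune's standard entry points and the ocpmodels helpers (ocp_trainable is a user-defined trainable defined elsewhere in the same script):

import os

import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import PopulationBasedTraining

from ocpmodels.common.flags import flags
from ocpmodels.common.utils import build_config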
Example #2
    def load_config_from_yaml_and_cmd(self, args):
        self.config = build_config(args)

        # AMP Scaler
        self.scaler = (
            torch.cuda.amp.GradScaler() if self.config["amp"] else None
        )

        # device
        self.device = torch.device(
            "cuda" if (torch.cuda.is_available() and not self.cpu) else "cpu"
        )

        # Are we just running sanity checks?
        self.is_debug = args.debug
        self.is_vis = args.vis

        # timestamps and directories
        args.timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        if args.identifier:
            args.timestamp += "-{}".format(args.identifier)

        args.checkpoint_dir = os.path.join("checkpoints", args.timestamp)
        args.results_dir = os.path.join("results", args.timestamp)
        args.logs_dir = os.path.join(
            "logs", self.config["logger"], args.timestamp
        )

        print(yaml.dump(self.config, default_flow_style=False))
        for arg in vars(args):
            print("{:<20}: {}".format(arg, getattr(args, arg)))

        # TODO(abhshkdz): Handle these parameters better. Maybe move to yaml.
        self.config["cmd"] = args.__dict__
        del args

        if not self.is_debug:
            os.makedirs(self.config["cmd"]["checkpoint_dir"], exist_ok=True)
            os.makedirs(self.config["cmd"]["results_dir"], exist_ok=True)
            os.makedirs(self.config["cmd"]["logs_dir"], exist_ok=True)

            # Dump config parameters
            config_path = os.path.join(
                self.config["cmd"]["checkpoint_dir"], "config.json"
            )
            with open(config_path, "w") as f:
                json.dump(self.config, f)
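
Because the full config (including the cmd block assembled above) is dumped to config.json inside the checkpoint directory, a later process can restore it. A minimal sketch, using a hypothetical run directory:

import json
import os

checkpoint_dir = "checkpoints/2021-03-01-12-00-00-my_run"  # hypothetical run dir
with open(os.path.join(checkpoint_dir, "config.json")) as f:
    restored = json.load(f)
print(restored["cmd"]["timestamp"])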
Example #3
import logging

from tqdm import trange

from ocpmodels.models.gemnet.layers.scaling import AutomaticFit
from ocpmodels.models.gemnet.utils import write_json
from ocpmodels.common.flags import flags
from ocpmodels.common.registry import registry
from ocpmodels.common.utils import build_config, setup_imports, setup_logging

if __name__ == "__main__":
    setup_logging()

    num_batches = 16  # number of batches to use to fit a single variable

    parser = flags.get_parser()
    args, override_args = parser.parse_known_args()
    config = build_config(args, override_args)
    assert config["model"]["name"].startswith("gemnet")
    config["logger"] = "tensorboard"

    if args.distributed:
        raise ValueError(
            "I don't think this works with DDP (race conditions).")

    setup_imports()

    scale_file = config["model"]["scale_file"]

    logging.info(f"Run fitting for model: {args.identifier}")
    logging.info(f"Target scale file: {scale_file}")

    def initialize_scale_file(scale_file):
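        # Sketch only -- the original snippet is truncated here. This assumed
        # initializer (not necessarily the OCP implementation) just seeds the
        # target JSON file so the automatic fitting machinery can update it
        # (write_json is assumed to take a path and a dict).
        logging.info(f"Initializing {scale_file}")
        write_json(scale_file, {"comment": args.identifier})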
Example #4
            trainer.run_relaxations()

        distutils.synchronize()

        if distutils.is_master():
            print("Total time taken = ", time.time() - start_time)

    finally:
        if args.distributed:
            distutils.cleanup()


if __name__ == "__main__":
    parser = flags.get_parser()
    args = parser.parse_args()
    config = build_config(args)

    if args.submit:  # Run on cluster
        if args.sweep_yml:  # Run grid search
            configs = create_grid(config, args.sweep_yml)
        else:
            configs = [config]

        print(f"Submitting {len(configs)} jobs")
        executor = submitit.AutoExecutor(folder=args.logdir / "%j")
        executor.update_parameters(
            name=args.identifier,
            mem_gb=args.slurm_mem,
            timeout_min=args.slurm_timeout * 60,
            slurm_partition=args.slurm_partition,
            gpus_per_node=args.num_gpus,
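            # Sketch only -- the original snippet is truncated inside this
            # call; any remaining executor parameters are omitted. Closing the
            # call and submitting via map_array below are assumptions, with
            # `runner` standing in for whatever callable runs one config.
        )
        jobs = executor.map_array(runner, configs)
        print(f"Submitted {len(jobs)} jobs:", [job.job_id for job in jobs])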
Example #5
def main():
    # parse config
    parser = flags.get_parser()
    args, override_args = parser.parse_known_args()
    config = build_config(args, override_args)
    # add parameters to tune using grid or random search
    config["model"].update(
        hidden_channels=tune.choice([256, 384, 512, 640, 704]),
        decoder_hidden_channels=tune.choice([256, 384, 512, 640, 704]),
        depth_mlp_edge=tune.choice([1, 2, 3, 4, 5]),
        depth_mlp_node=tune.choice([1, 2, 3, 4, 5]),
        num_interactions=tune.choice([3, 4, 5, 6]),
    )
    # define scheduler
    scheduler = ASHAScheduler(
        time_attr="steps",
        metric="val_loss",
        mode="min",
        max_t=100000,
        grace_period=2000,
        reduction_factor=4,
        brackets=1,
    )
    # ray init
    # for debug
    # ray.init(local_mode=True)
    # for slurm cluster
    ray.init(
        address="auto",
        _node_ip_address=os.environ["ip_head"].split(":")[0],
        _redis_password=os.environ["redis_password"],
    )
    # define command line reporter
    reporter = CLIReporter(
        print_intermediate_tables=True,
        metric="val_loss",
        mode="min",
        metric_columns={
            "steps": "steps",
            "epochs": "epochs",
            "training_iteration": "training_iteration",
            "val_loss": "val_loss",
            "val_forces_mae": "val_forces_mae",
        },
    )

    # define run parameters
    analysis = tune.run(
        ocp_trainable,
        resources_per_trial={"cpu": 8, "gpu": 1},
        config=config,
        fail_fast=False,
        local_dir=config.get("run_dir", "./"),
        num_samples=500,
        progress_reporter=reporter,
        scheduler=scheduler,
    )

    print(
        "Best config is:",
        analysis.get_best_config(metric="val_forces_mae",
                                 mode="min",
                                 scope="last"),
    )
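
    # Sketch (not in the original snippet): the analysis object can also
    # surface the best trial itself, e.g. its logdir and final metrics.
    best_trial = analysis.get_best_trial(
        metric="val_forces_mae", mode="min", scope="last"
    )
    print("Best trial logdir:", best_trial.logdir)
    print("Best val_forces_mae:", best_trial.last_result["val_forces_mae"])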