Example #1
import os

import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import PopulationBasedTraining

from ocpmodels.common.flags import flags
from ocpmodels.common.utils import build_config


def main():
    # parse config
    parser = flags.get_parser()
    args, override_args = parser.parse_known_args()
    config = build_config(args, override_args)
    # add hyperparameters to tune (here: a log-uniform search space for the learning rate)
    config["lr"] = tune.loguniform(0.0001, 0.01)
    # define scheduler: PBT periodically clones top-performing trials onto
    # underperformers and perturbs "lr" within the mutation range below
    scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric="val_loss",
        mode="min",
        perturbation_interval=1,
        hyperparam_mutations={
            "lr": tune.loguniform(0.000001, 0.01),
        },
    )
    # connect to the running Ray cluster; the head node address and password
    # are exported as env vars by the launch script (e.g. on a SLURM cluster)
    ray.init(
        address="auto",
        _node_ip_address=os.environ["ip_head"].split(":")[0],
        _redis_password=os.environ["redis_password"],
    )
    # define command line reporter
    reporter = CLIReporter(
        print_intermediate_tables=True,
        metric="val_loss",
        mode="min",
        metric_columns={
            "act_lr": "act_lr",
            "steps": "steps",
            "epochs": "epochs",
            "training_iteration": "training_iteration",
            "val_loss": "val_loss",
            "val_forces_mae": "val_forces_mae",
        },
    )
    # define run parameters; `ocp_trainable` is the per-trial training
    # function (a sketch of it follows this example)
    analysis = tune.run(
        ocp_trainable,
        resources_per_trial={
            "cpu": 8,
            "gpu": 1
        },
        config=config,
        stop={"epochs": 12},
        # time_budget_s=28200,
        fail_fast=False,
        local_dir=config.get("run_dir", "./"),
        num_samples=8,
        progress_reporter=reporter,
        scheduler=scheduler,
    )

    print(
        "Best config is:",
        analysis.get_best_config(metric="val_forces_mae",
                                 mode="min",
                                 scope="last"),
    )
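For reference, a minimal sketch of what the `ocp_trainable` passed to `tune.run` could look like, using Ray Tune's function API. `make_trainer` is a hypothetical helper, and the trainer methods are assumptions; a real PBT-compatible trainable would also need to save and restore checkpoints so that top trials can be cloned:

from ray import tune


def ocp_trainable(config):
    trainer = make_trainer(config)  # hypothetical: builds an OCP trainer from the sampled config
    for epoch in range(config.get("max_epochs", 12)):
        trainer.train_one_epoch()  # assumed trainer API
        metrics = trainer.validate()  # assumed to return a dict of validation metrics
        # report the metrics that the scheduler, stopper, and CLIReporter consume
        tune.report(
            epochs=epoch + 1,
            val_loss=metrics["val_loss"],
            val_forces_mae=metrics["val_forces_mae"],
        )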
Example #2
import logging

import torch
from tqdm import trange

from ocpmodels.models.gemnet.layers.scaling import AutomaticFit
from ocpmodels.models.gemnet.utils import write_json
from ocpmodels.common.flags import flags
from ocpmodels.common.registry import registry
from ocpmodels.common.utils import build_config, setup_imports, setup_logging

if __name__ == "__main__":
    setup_logging()

    num_batches = 16  # number of batches to use to fit a single variable

    parser = flags.get_parser()
    args, override_args = parser.parse_known_args()
    config = build_config(args, override_args)
    assert config["model"]["name"].startswith("gemnet")
    config["logger"] = "tensorboard"

    if args.distributed:
        raise ValueError(
            "Scale fitting is not supported with DDP (risk of race conditions)."
        )

    setup_imports()

    scale_file = config["model"]["scale_file"]

    logging.info(f"Run fitting for model: {args.identifier}")
    logging.info(f"Target scale file: {scale_file}")
Example #3
import os

import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

from ocpmodels.common.flags import flags
from ocpmodels.common.utils import build_config


def main():
    # parse config
    parser = flags.get_parser()
    args, override_args = parser.parse_known_args()
    config = build_config(args, override_args)
    # add model hyperparameters to tune (each tune.choice defines a discrete search space)
    config["model"].update(
        hidden_channels=tune.choice([256, 384, 512, 640, 704]),
        decoder_hidden_channels=tune.choice([256, 384, 512, 640, 704]),
        depth_mlp_edge=tune.choice([1, 2, 3, 4, 5]),
        depth_mlp_node=tune.choice([1, 2, 3, 4, 5]),
        num_interactions=tune.choice([3, 4, 5, 6]),
    )
    # define scheduler: ASHA stops underperforming trials early; every trial
    # runs for at least `grace_period` steps, and only the top
    # 1/reduction_factor advance to each successive rung, up to max_t steps
    scheduler = ASHAScheduler(
        time_attr="steps",
        metric="val_loss",
        mode="min",
        max_t=100000,
        grace_period=2000,
        reduction_factor=4,
        brackets=1,
    )
    # ray init
    # for debug
    # ray.init(local_mode=True)
    # for slurm cluster
    ray.init(
        address="auto",
        _node_ip_address=os.environ["ip_head"].split(":")[0],
        _redis_password=os.environ["redis_password"],
    )
    # define command line reporter
    reporter = CLIReporter(
        print_intermediate_tables=True,
        metric="val_loss",
        mode="min",
        metric_columns={
            "steps": "steps",
            "epochs": "epochs",
            "training_iteration": "training_iteration",
            "val_loss": "val_loss",
            "val_forces_mae": "val_forces_mae",
        },
    )

    # define run parameters; `ocp_trainable` is the same per-trial training
    # function sketched after Example #1
    analysis = tune.run(
        ocp_trainable,
        resources_per_trial={
            "cpu": 8,
            "gpu": 1
        },
        config=config,
        fail_fast=False,
        local_dir=config.get("run_dir", "./"),
        num_samples=500,
        progress_reporter=reporter,
        scheduler=scheduler,
    )

    print(
        "Best config is:",
        analysis.get_best_config(metric="val_forces_mae",
                                 mode="min",
                                 scope="last"),
    )
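A short usage sketch for the `analysis` object returned by `tune.run` (still inside `main()`; assuming the Ray 1.x `ExperimentAnalysis` API that matches the `tune.run` call above):

    # last reported result of every trial as a pandas DataFrame
    df = analysis.dataframe(metric="val_forces_mae", mode="min")
    df.to_csv("tune_results.csv", index=False)

    # the best trial's logdir contains its checkpoints and full config
    best_trial = analysis.get_best_trial(
        metric="val_forces_mae", mode="min", scope="last"
    )
    print("Best trial logdir:", best_trial.logdir)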