perturbation_interval=1, hyperparam_mutations={ # distribution for resampling "lr": lambda: np.random.uniform(0.001, 1), # allow perturbations within this set of categorical values "momentum": [0.8, 0.9, 0.99], }) reporter = CLIReporter() reporter.add_metric_column("val_loss", "loss") reporter.add_metric_column("val_accuracy", "acc") analysis = tune.run( NoFaultToleranceTrainable, num_samples=4, config={ "lr": tune.choice([0.001, 0.01, 0.1]), "momentum": 0.8, "head_location": None, "worker_locations": None }, max_failures=-1, # used for fault tolerance checkpoint_freq=2, # used for fault tolerance progress_reporter=reporter, scheduler=pbt_scheduler, callbacks=[FailureInjectorCallback()], queue_trials=True, stop={"training_iteration": 1} if args.smoke_test else None) print(analysis.get_best_config(metric="val_loss", mode="min"))
# distribution for resampling "lr": lambda: np.random.uniform(0.001, 1), # allow perturbations within this set of categorical values "momentum": [0.8, 0.9, 0.99], }) reporter = CLIReporter() reporter.add_metric_column("val_loss", "loss") reporter.add_metric_column("val_accuracy", "acc") analysis = tune.run( TorchTrainable, num_samples=4, config={ "lr": tune.choice([0.001, 0.01, 0.1]), "momentum": 0.8, "head_location": None, "worker_locations": None }, max_failures=-1, # used for fault tolerance checkpoint_freq=2, # used for fault tolerance progress_reporter=reporter, scheduler=pbt_scheduler, callbacks=[ FailureInjectorCallback(time_between_checks=90), ProgressCallback() ], stop={"training_iteration": 1} if args.smoke_test else None) print(analysis.get_best_config(metric="val_loss", mode="min"))
hyperparam_mutations={ # distribution for resampling "lr": lambda: np.random.uniform(0.001, 1), # allow perturbations within this set of categorical values "momentum": [0.8, 0.9, 0.99], }, ) reporter = CLIReporter() reporter.add_metric_column("val_loss", "loss") reporter.add_metric_column("val_accuracy", "acc") analysis = tune.run( TorchTrainable, num_samples=4, config={ "lr": tune.choice([0.001, 0.01, 0.1]), "momentum": 0.8, "head_location": None, "worker_locations": None, }, max_failures=-1, # used for fault tolerance checkpoint_freq=2, # used for fault tolerance progress_reporter=reporter, scheduler=pbt_scheduler, callbacks=[FailureInjectorCallback(time_between_checks=90), ProgressCallback()], stop={"training_iteration": 1} if args.smoke_test else None, ) print(analysis.get_best_config(metric="val_loss", mode="min"))