def tune_mnist_asha(num_samples=10, num_epochs=10, gpus_per_trial=0):
    config = {
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([32, 64, 128]),
    }

    scheduler = ASHAScheduler(
        max_t=num_epochs,
        grace_period=1,
        reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=["layer_1_size", "layer_2_size", "lr", "batch_size"],
        metric_columns=["loss", "mean_accuracy", "training_iteration"])

    analysis = tune.run(
        tune.with_parameters(
            train_mnist_tune,
            num_epochs=num_epochs,
            num_gpus=gpus_per_trial),
        resources_per_trial={
            "cpu": 1,
            "gpu": gpus_per_trial
        },
        metric="loss",
        mode="min",
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name="tune_mnist_asha")

    print("Best hyperparameters found were: ", analysis.best_config)
def tune_xgboost():
    search_space = {
        # You can mix constants with search space objects.
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "max_depth": tune.randint(1, 9),
        "min_child_weight": tune.choice([1, 2, 3]),
        "subsample": tune.uniform(0.5, 1.0),
        "eta": tune.loguniform(1e-4, 1e-1)
    }

    # This will enable aggressive early stopping of bad trials.
    scheduler = ASHAScheduler(
        max_t=10,  # 10 training iterations
        grace_period=1,
        reduction_factor=2)

    analysis = tune.run(
        train_breast_cancer,
        metric="eval-logloss",
        mode="min",
        # You can add "gpu": 0.1 to allocate GPUs
        resources_per_trial={"cpu": 1},
        config=search_space,
        num_samples=10,
        scheduler=scheduler)

    return analysis
def tune_from_existing(start_model,
                       start_config,
                       num_samples=10,
                       num_epochs=10,
                       gpus_per_trial=0.0,
                       day=0):
    data_interface = MNISTDataInterface("/tmp/mnist_data", max_days=10)
    num_examples = data_interface._get_day_slice(
        day) - data_interface._get_day_slice(day - 1)

    config = start_config.copy()
    config.update({
        "batch_size": tune.choice([16, 32, 64]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "momentum": tune.uniform(0.1, 0.9),
    })

    scheduler = ASHAScheduler(
        metric="mean_accuracy",
        mode="max",
        max_t=num_epochs,
        grace_period=1,
        reduction_factor=2,
    )

    reporter = CLIReporter(
        parameter_columns=["lr", "momentum", "batch_size"],
        metric_columns=["mean_accuracy", "training_iteration"],
    )

    analysis = tune.run(
        partial(
            train_mnist,
            start_model=start_model,
            data_fn=data_interface.get_incremental_data,
            num_epochs=num_epochs,
            use_gpus=gpus_per_trial > 0,
            day=day,
        ),
        resources_per_trial={
            "cpu": 1,
            "gpu": gpus_per_trial
        },
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        verbose=0,
        name="tune_serve_mnist_from_existing",
    )

    best_trial = analysis.get_best_trial("mean_accuracy", "max", "last")
    best_accuracy = best_trial.metric_analysis["mean_accuracy"]["last"]
    best_trial_config = best_trial.config
    best_checkpoint = best_trial.checkpoint.value

    return best_accuracy, best_trial_config, best_checkpoint, num_examples
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--X_dir", type=str)
    parser.add_argument("--y_dir", type=str)
    parser.add_argument("--epoch", type=int)
    parser.add_argument("--config_dir", type=str)
    parser.add_argument("--model", type=str)
    parser.add_argument("--n_sample", type=int)
    args = parser.parse_args()

    X_train = torch.load(args.X_dir)
    y_train = torch.load(args.y_dir)

    config = {
        "n_hidden": tune.sample_from(lambda _: 2**np.random.randint(2, 9)),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([16, 32, 64, 128]),
    }
    CL = CustomLoss(1, 2)

    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=args.epoch,
        grace_period=1,
        reduction_factor=2)
    reporter = CLIReporter(metric_columns=["loss", "training_iteration"])

    def train_func(config):
        train_model(
            X=X_train,
            y=y_train,
            num_epochs=args.epoch,
            loss_func=CL.custom_loss_1,
            model_name=args.model,
            config=config,
        )

    result = tune.run(
        train_func,
        resources_per_trial={
            "cpu": 2,
            "gpu": 2
        },
        config=config,
        num_samples=args.n_sample,
        scheduler=scheduler,
        progress_reporter=reporter,
    )

    best_trial = result.get_best_trial("loss", "min", "last")
    with open(args.config_dir, "w") as json_file:
        json.dump(best_trial.last_result["config"], json_file)

    last_loss = best_trial.last_result["loss"]
    print(f"Validation Loss of best model was {last_loss}.")
def tune4_withLabel(
        model,
        train_set: Dataset,
        val_set: Dataset,
        dims: list,
        config: dict,
        EPOCHS: int = 300,
        extra_feature_len: int = 0,
        extra_feature_len2: int = 0,
        n_gpu=1,
        n_samples=20,
        model_name="model",
):
    dim1, dim2, dim3, dim4 = dims[0], dims[1], dims[2], dims[3]

    scheduler = ASHAScheduler(max_t=EPOCHS, grace_period=1, reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=["k", "lr", "batch_size", "hidden_dim"],
        metric_columns=["loss", "training_iteration"],
        max_error_rows=5,
        max_progress_rows=5,
        max_report_frequency=10)

    analysis = tune.run(
        tune.with_parameters(
            train4_withLabel,
            model=model,
            dim1=dim1,
            dim2=dim2,
            dim3=dim3,
            dim4=dim4,
            extra_feature_len=extra_feature_len,
            extra_feature_len2=extra_feature_len2,
            train_set=train_set,
            val_set=val_set,
            num_epochs=EPOCHS,
            num_gpus=n_gpu,
            model_name=model_name),
        resources_per_trial={
            "cpu": 1,
            "gpu": n_gpu
        },
        metric="loss",
        mode="min",
        config=config,
        num_samples=n_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name=model_name,
        verbose=False)

    print("-" * 70)
    print("Done")
    print("Best hyperparameters found were: ", analysis.best_config)
    print("Best achieved loss was: ", analysis.best_result)
    print("-" * 70)
def start_training(name):
    Epochs = 1000
    Samples = 50
    ModelName = name

    pose_autoencoder = MLP_withLabel.load_checkpoint(
        "/home/nuoc/Documents/MEX/models/MLP4_withLabel_best/M3/0.00324857.512.pbz2"
    )
    # pose_autoencoder = MLP_withLabel.load_checkpoint("/home/nuoc/Documents/MEX/models/MLP_withLabel/0.0013522337.512.pbz2")

    pose_encoder_out_dim = pose_autoencoder.dimensions[-1]

    scheduler = ASHAScheduler(max_t=Epochs, grace_period=15, reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=["k", "lr", "batch_size", "loss_fn"],
        metric_columns=["loss", "training_iteration"],
        max_error_rows=5,
        max_progress_rows=5,
        max_report_frequency=1)

    analysis = tune.run(
        tune.with_parameters(
            tuning,
            MODEL=MotionGenerationModel,
            pose_autoencoder=pose_autoencoder,
            cost_dim=cost_dim,
            phase_dim=phase_dim,
            input_slices=[phase_dim, pose_dim, cost_dim],
            output_slices=[phase_dim, phase_dim, pose_encoder_out_dim],
            train_set=train_set,
            val_set=val_set,
            num_epochs=Epochs,
            model_name=ModelName),
        resources_per_trial={
            "cpu": 2,
            "gpu": 1
        },
        metric="loss",
        mode="min",
        config=config,
        num_samples=Samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name=ModelName,
        verbose=False)

    print("-" * 70)
    print("Done")
    print("Best hyperparameters found were: ", analysis.best_config)
    print("Best achieved loss was: ", analysis.best_result)
    print("-" * 70)

    ray.shutdown()
def hyperparameter_tuning_initializer(loss_type='SL',
                                      learning_rate_scheduler='CARM'):
    # defining the hyperparameters
    if loss_type == 'FL':
        config = {
            'gamma': tune.choice([0.5, 1, 2]),
            'lr': tune.loguniform(1e-4, 1e-3)
        }
    elif loss_type == 'CEDL':
        config = {
            'dice_loss': tune.uniform(0, 3),
            'lr': tune.loguniform(1e-4, 1e-3)
        }
    elif loss_type == 'CEDIL':
        config = {
            'dice_loss': tune.uniform(0, 3),
            'inverse_dice_loss': tune.uniform(0, 3),
            'lr': tune.loguniform(1e-4, 1e-3)
        }
    elif loss_type == 'SL':
        config = {
            'lambda': tune.uniform(0, 1),
            'tau': tune.uniform(0.02, 0.04),
            'lr': tune.loguniform(1e-4, 1e-3)
        }
    elif loss_type == 'VCE':
        config = {
            'var_loss': tune.uniform(0.5, 5.5),
            'lr': tune.loguniform(1e-4, 1e-3)
        }

    # hyperparameters for the learning rate scheduler
    if learning_rate_scheduler == 'CARM':
        config['T_0'] = tune.choice([5, 10, 20, 40, 50])
        config['eta_min_factor'] = tune.loguniform(1e2, 1e4)
    if learning_rate_scheduler == 'SLR':
        config['step_size'] = tune.choice([5, 10, 20, 40, 50])
    if learning_rate_scheduler == 'MLR':
        config['lr_lambda'] = tune.uniform(0.8, 0.99)

    # defining the scheduler
    scheduler = ASHAScheduler(
        metric='loss',
        mode='min',
        max_t=conf['max_epochs'] // 20,
        grace_period=1,
        reduction_factor=2
    )

    # defining the reporter
    reporter = CLIReporter(
        metric_columns=['loss', 'avg_dice_coefficient', 'epoch'])

    return config, scheduler, reporter
def ray_tune_interface(data_settings, cross_validation_settings,
                       parameter_settings, model_settings):
    cat_features = data_settings['cat_features']
    weights = data_settings['weights']
    data = data_settings['data']
    target = data_settings['target']
    model = model_settings.pop('model', None)
    n_folds = cross_validation_settings['n_folds']
    metric_to_use = cross_validation_settings['metric_to_use']
    mode = cross_validation_settings['mode']
    n_samples = parameter_settings['tune_parameters']['n_samples']
    num_threads = parameter_settings['tune_parameters']['num_threads']
    max_concurrent = parameter_settings['tune_parameters']['max_concurrent']

    space = {
        k: var_type_sample(k, v, parameter_settings['parameters_types'])
        for k, v in parameter_settings['parameter_range'].items()
    }

    algo = HyperOptSearch(
        space,
        max_concurrent=max_concurrent,
        metric="metric_ave",
        mode=mode,
    )
    scheduler = ASHAScheduler(metric="metric_ave", mode=mode)

    config = {
        'num_samples': n_samples,
        'config': {
            'non_hp': {
                'data': {
                    'X': data,
                    'y': target,
                    'cat_features': cat_features,
                    'weights': weights,
                },
                'modelIns': model,
                'n_folds': n_folds,
                'metric_to_use': metric_to_use,
            }
        }
    }
    config['config'].update(parameter_settings)
    config['config']['model_settings'] = model_settings

    ray_experiment = tune.run(
        cross_validation_catboost_ray,
        resources_per_trial={"gpu": 1},
        search_alg=algo,
        scheduler=scheduler,
        keep_checkpoints_num=0,
        verbose=1,
        **config)

    results = ray_experiment.dataframe(metric="metric_ave", mode="min")
    results.to_csv('results_experiment.csv')
def test_custom_scheduler_default_time_attr():
    try:
        from ray.tune.schedulers import ASHAScheduler
    except ImportError:
        print("skip the test as ray tune cannot be imported.")
        return
    my_scheduler = ASHAScheduler(max_t=10)
    best_config = test_scheduler(scheduler=my_scheduler)
    print(
        "Custom ASHA scheduler (with ASHA default time attr), test error:",
        abs(10 / 2 - best_config["z"] / 2),
    )
def start(self, config: dict = None, **kwargs):
    if config is None:
        raise ValueError("Argument 'config' must not be None")

    # Better to save results in the home directory: paths may become very
    # long, possibly exceeding the max path limit, if stored elsewhere.
    results_dir = (os.environ.get("TEST_TMPDIR")
                   or os.environ.get("TUNE_RESULT_DIR")
                   or os.path.expanduser("~/ray_results"))
    tune_log_path = os.path.join(results_dir, self.session_id)
    os.makedirs(tune_log_path, exist_ok=True)
    os.makedirs(self.out_path, exist_ok=True)

    metric = kwargs.get("metric", "mean_accuracy")
    mode = kwargs.get("mode", "max")
    num_gpus = kwargs.get("num_gpus", 1)
    num_cpus = kwargs.get("num_cpus", 1)

    ray.init(include_dashboard=False,
             local_mode=True,
             num_gpus=num_gpus,
             num_cpus=num_cpus)

    tensorboard_url = launch_tensorboard(tune_log_path)
    print(f"TensorBoard launched: {tensorboard_url}.")

    scheduler = kwargs.get("scheduler", None)
    if scheduler is None:
        scheduler = ASHAScheduler(metric, mode=mode)

    analysis = tune.run(self._start,
                        name=self.session_id,
                        config=config,
                        scheduler=scheduler,
                        checkpoint_freq=20,
                        local_dir=results_dir)

    result = analysis.get_best_trial(metric)
    print("Best trial config: {}".format(result.config))
    print("Best trial final validation loss: {}".format(
        result.last_result["mean_loss"]))
    print("Best trial final validation accuracy: {}".format(
        result.last_result["mean_accuracy"]))

    # TODO search algorithm?
    df = analysis.dataframe(metric, mode)
    df.to_pickle(os.path.join(self.out_path, "results.pkl"))
    try:
        with pd.ExcelWriter(os.path.join(self.out_path, "results.xlsx")) as writer:
            df.to_excel(writer)
    except ModuleNotFoundError as e:
        print("Failed to write tuning result:", e)
def main(num_samples=10, max_num_epochs=50, gpus_per_trial=2):
    data_dir = os.path.abspath("./data")
    checkpoint_dir = os.path.abspath("./checkpoints")

    config = {
        "l1": tune.choice([32, 64, 128, 256]),
        "l2": tune.choice([32, 64, 128, 256]),
        "lr": tune.loguniform(1e-5, 1e-3),
        "batch_size": tune.choice([64, 128, 256])
    }

    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=5,
        reduction_factor=2)

    reporter = CLIReporter(
        # parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss", "accuracy", "training_iteration"])

    st_time = time.time()

    result = tune.run(
        partial(train_regression,
                checkpoint_dir=checkpoint_dir,
                data_dir=data_dir),
        resources_per_trial={
            "cpu": 2,
            "gpu": gpus_per_trial
        },
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter)

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))

    best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"])
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
    best_trained_model.to(device)

    best_checkpoint_dir = best_trial.checkpoint.value
    model_state, optimizer_state = torch.load(
        os.path.join(best_checkpoint_dir, "checkpoint"))
    best_trained_model.load_state_dict(model_state)

    test_acc = test_accuracy(best_trained_model, device)
    print("Best trial test set accuracy: {}".format(test_acc))
    print("Total Time", time.time() - st_time)
def main(cmdl):
    base_cfg = namespace_to_dict(read_config(Path(cmdl.cfg) / "default.yaml"))
    search_cfg = namespace_to_dict(read_config(Path(cmdl.cfg) / "search.yaml"))

    print(config_to_string(cmdl))
    print(config_to_string(dict_to_namespace(search_cfg)))

    # the search space
    good_init, search_space = get_search_space(search_cfg)

    search_name = "{timestep}_tune_{experiment_name}{dev}".format(
        timestep="{:%Y%b%d-%H%M%S}".format(datetime.now()),
        experiment_name=base_cfg["experiment"],
        dev="_dev" if cmdl.dev else "",
    )

    # search algorithm
    hyperopt_search = HyperOptSearch(
        search_space,
        metric="criterion",
        mode="max",
        max_concurrent=cmdl.workers,
        points_to_evaluate=good_init,
    )

    # early stopping
    scheduler = ASHAScheduler(
        time_attr="train_step",
        metric="criterion",
        mode="max",
        max_t=base_cfg["training_steps"],  # max length of the experiment
        grace_period=cmdl.grace_steps,  # minimum train_steps before a trial can be stopped
        brackets=3,  # number of ASHA brackets, each with its own halving rate
    )

    analysis = tune.run(
        lambda x: tune_trial(x, base_cfg=base_cfg, get_objective=None),
        name=search_name,
        # config=search_space,
        search_alg=hyperopt_search,
        scheduler=scheduler,
        local_dir="./results",
        num_samples=cmdl.trials,
        trial_name_creator=trial2string,
        resources_per_trial={"cpu": 3},
    )

    dfs = analysis.trial_dataframes
    for i, (key, df) in enumerate(dfs.items()):
        print("saving: ", key)
        df.to_pickle(f"{key}/trial_df.pkl")
def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2): data_dir = os.path.abspath("./data") load_data(data_dir) config = { "l1": tune.sample_from(lambda _: 2**np.random.randint(2, 9)), "l2": tune.sample_from(lambda _: 2**np.random.randint(2, 9)), "lr": tune.loguniform(1e-4, 1e-1), "batch_size": tune.choice([2, 4, 8, 16]) } scheduler = ASHAScheduler(metric="loss", mode="min", max_t=max_num_epochs, grace_period=1, reduction_factor=2) reporter = CLIReporter( # parameter_columns=["l1", "l2", "lr", "batch_size"], metric_columns=["loss", "accuracy", "training_iteration"]) result = tune.run(partial(train_cifar, data_dir=data_dir), resources_per_trial={ "cpu": 2, "gpu": gpus_per_trial }, config=config, num_samples=num_samples, scheduler=scheduler, progress_reporter=reporter) best_trial = result.get_best_trial("loss", "min", "last") print("Best trial config: {}".format(best_trial.config)) print("Best trial final validation loss: {}".format( best_trial.last_result["loss"])) print("Best trial final validation accuracy: {}".format( best_trial.last_result["accuracy"])) best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"]) device = "cpu" if torch.cuda.is_available(): device = "cuda:0" if gpus_per_trial > 1: best_trained_model = nn.DataParallel(best_trained_model) best_trained_model.to(device) best_checkpoint_dir = best_trial.checkpoint.value model_state, optimizer_state = torch.load( os.path.join(best_checkpoint_dir, "checkpoint")) best_trained_model.load_state_dict(model_state) test_acc = test_accuracy(best_trained_model, device) print("Best trial test set accuracy: {}".format(test_acc))
def main():
    logging.basicConfig(level=logging.INFO)

    # Ray Tune parameters
    num_samples = 10
    envname = 'AdversarialAntBulletEnv-v0'
    trainingconfig = Path.cwd() / 'trainingconfig.json'
    evaluate_mean_n = 1000  # Number of timesteps over which to evaluate the mean reward
    name_fmt = 'million-bucks_{adv_force}'

    config = {
        # TODO: sample from control once, then different adversarial strengths
        # Range is centered on the force that achieves the closest reward to
        # the control (7.5)
        "adv_force": tune.qrandn(7.5, 2.5, 0.1),
    }

    # https://docs.ray.io/en/master/tune/tutorials/overview.html#which-search-algorithm-scheduler-should-i-choose
    # Use BOHB for larger problems with a small number of hyperparameters
    # search = TuneBOHB(max_concurrent=4, metric="mean_loss", mode="min")
    # sched = HyperBandForBOHB(
    #     time_attr="training_iteration",
    #     max_t=100,
    # )

    # Random search is used implicitly if no search algorithm is specified.
    sched = ASHAScheduler(
        time_attr='training_iteration',
        max_t=100,
        grace_period=1,  # Unit is iterations, not timesteps. TODO configure
    )

    # Pass in a Trainable class or function to tune.run.
    local_dir = str(Path.cwd() / "ray")
    logging.info(f'{local_dir=}')
    anal = tune.run(
        tune.with_parameters(trainable,
                             envname=envname,
                             trainingconfig=trainingconfig,
                             evaluate_mean_n=evaluate_mean_n,
                             name_fmt=name_fmt),
        config=config,
        num_samples=num_samples,
        scheduler=sched,
        local_dir=local_dir,
        metric="robustness",
        mode="max",
        log_to_file=True)

    logging.info(f'best config: {anal.best_config}')
    logging.info(f'best result: {anal.best_result}')
def main(args):
    device = config_cuda(args.use_cuda)

    data_pth = Path(Path.cwd() / args.data_path)
    out_pth = Path(Path.cwd() / args.out_path)

    train_set, valid_set, test_set = get_datasets(data_pth)

    config = {
        'eta': tune.loguniform(1e-5, 1e-1),
        'batch_size': tune.choice([2, 4, 8, 16])
    }

    scheduler = ASHAScheduler(
        metric='loss',
        mode='min',
        max_t=args.max_epochs,
        grace_period=1,
        reduction_factor=2
    )

    reporter = CLIReporter(
        parameter_columns=['eta', 'batch_size'],
        metric_columns=['loss', 'accuracy', 'training_iteration']
    )

    num_gpus = 1 if device == 'cuda' else 0
    result = tune.run(
        partial(train_model,
                model_name=args.model,
                device=device,
                max_epochs=args.max_epochs,
                num_workers=args.num_workers,
                data_pth=data_pth),
        resources_per_trial={'cpu': args.num_workers, 'gpu': num_gpus},
        config=config,
        num_samples=args.num_samples,
        scheduler=scheduler,
        progress_reporter=reporter
    )

    best_trial = result.get_best_trial('loss', 'min', 'last')
    print(f'Best trial config: {best_trial.config}')
    print(
        f'Best trial final validation loss: {best_trial.last_result["loss"]}'
    )

    model = load_model(args.model, len(train_set.classes))
    best_checkpoint_dir = best_trial.checkpoint.value
    model_state, optimizer_state = torch.load(os.path.join(
        best_checkpoint_dir, 'checkpoint'))
    model.load_state_dict(model_state)

    test_acc = test_model(model, device, test_set, out_pth, args.num_workers)
    print(f'Best trial test set accuracy: {test_acc}')
def setup_tune_scheduler():
    search_space = workload.create_search_space()
    scheduler = ASHAScheduler(
        # Set a large max_t so that ASHA will always promote to the next rung
        # until something reaches the target accuracy.
        max_t=int(1000),
        reduction_factor=3,
        **workload.exp_metric(),
    )
    return dict(
        search_alg=VariantGenerator(),
        scheduler=scheduler,
        config=search_space,
        resources_per_trial=com.detect_baseline_resource(),
    )
def tune_model(data):
    logfile = open("/tmp/ray/session_latest/custom.log", "w")

    def write(msg):
        logfile.write(f"{msg}\n")
        logfile.flush()

    search_space = {
        # You can mix constants with search space objects.
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "max_depth": tune.randint(1, 9),
        "min_child_weight": tune.choice([1, 2, 3]),
        "subsample": tune.uniform(0.5, 1.0),
        "eta": tune.loguniform(1e-4, 1e-1),
    }

    print("enabling aggressive early stopping of bad trials")
    # This will enable aggressive early stopping of bad trials.
    scheduler = ASHAScheduler(
        max_t=4,  # 4 training iterations
        grace_period=1,
        reduction_factor=2
    )

    print("Tuning")
    analysis = tune.run(
        tune.with_parameters(
            train_model,
            data=data
            # checkpoint_dir=f"{LOCAL_DIR}/checkpoints",
            # data_dir=f"{LOCAL_DIR}/data",
        ),
        metric="eval-logloss",
        mode="min",
        local_dir=LOCAL_DIR,
        # You can add "gpu": 0.1 to allocate GPUs
        resources_per_trial=RAY_PARAMS.get_tune_resources(),
        config=search_space,
        num_samples=4,
        scheduler=scheduler,
    )
    print("Done Tuning")

    return analysis
def tunerTrain():
    ray.init(_memory=4000000000, num_cpus=5)
    searchSpace = {
        'lr': tune.loguniform(1e-4, 9e-1),
        # minimum of 2; otherwise 1 // 2 = 0 activation maps
        'finalOutput': tune.randint(2, 50),
        'stride1': tune.grid_search(np.arange(1, 4).tolist()),
        'stride2': tune.grid_search(np.arange(1, 4).tolist()),
        'batchSize': tune.grid_search([2, 4, 8, 16, 32, 64, 128, 256]),
        'finalChannel': tune.randint(1, 50),
    }

    analysis = tune.run(
        train,
        num_samples=1,
        scheduler=ASHAScheduler(metric='score', mode='max'),
        config=searchSpace)

    print(f"Best Config: {analysis.get_all_configs(metric='score', mode='max')}")
    df = analysis.results_df
    logdir = analysis.get_best_logdir("mean_accuracy", mode="max")
    print(f"dir of best: {logdir}")
def main(args, reproducible: bool):
    if reproducible:
        seed_everything(42)

    datamodule = TwoDomainMMEDM(
        dataPath=args.dataPath,
        augment=True,
        batch_size=32,
        num_workers=8)

    config = {
        "log_lr": tune.uniform(-4, -2),
        "log_lrRatio": tune.uniform(-3, 0),
        "log_decay": tune.uniform(-8, -1),
    }

    search_alg = BayesOptSearch(
        metric='mean_iou',
        mode='max',
    )

    scheduler = ASHAScheduler(grace_period=25)

    reporter = CLIReporter(
        parameter_columns=["log_lr", "log_lrRatio", "log_decay"],
        metric_columns=["loss", "mean_iou", "training_iteration"])

    analysis = tune.run(
        tune.with_parameters(
            trainWithTune,
            datamodule=datamodule,
            num_epochs=175,
            num_gpus=1,
        ),
        resources_per_trial={
            "cpu": 5,
            "gpu": 0.5,
        },
        metric="mean_iou",
        mode="max",
        config=config,
        num_samples=20,
        scheduler=scheduler,
        search_alg=search_alg,
        progress_reporter=reporter,
        name="tune_minimax_segmenter")

    print("Best hyperparameters found were: ", analysis.best_config)
def optimize():
    name_dir = os.path.join('saved', 'hyper-lstm')
    hyperparam_config = {
        'name': 'lstm',
        'num_hidden': tune.sample_from(lambda _: np.random.randint(1, 10)),
        'num_layers': tune.sample_from(lambda _: np.random.randint(1, 5)),
        'opt': tune.choice(['adam', 'sgd', 'adamw', 'lbfgs']),
        'lr': tune.loguniform(1e-10, 1),
        'epoch': tune.sample_from(lambda _: np.random.randint(5, 25)),
        'beta_1': tune.loguniform(1e-8, 1e-2),
        'beta_2': tune.loguniform(1e-8, 1e-2),
        'weight_decay': tune.loguniform(1e-8, 1e-2),
        'max_iter': tune.sample_from(lambda _: np.random.randint(10, 100)),
        'momentum': tune.uniform(0.5, 0.9),
        'patience': tune.sample_from(lambda _: np.random.randint(5, 25)),
        'batch_size': 16
    }
    if not os.path.isdir(name_dir):
        os.mkdir(name_dir)

    scheduler = ASHAScheduler(
        metric='accuracy',
        mode='max',
        max_t=25,
        grace_period=1,
        reduction_factor=2)
    reporter = CLIReporter(metric_columns=["loss", "accuracy"])

    result = tune.run(
        partial(train, checkpoint_dir=name_dir, cwd=os.getcwd(), tuning=True),
        resources_per_trial={
            "cpu": 1,
            "gpu": 0.5
        },
        config=hyperparam_config,
        num_samples=200,
        scheduler=scheduler,
        progress_reporter=reporter)

    best_trial = result.get_best_trial("accuracy", "max", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))
    print("Best Checkpoint Dir: " + str(best_trial.checkpoint.value))

    return best_trial.config
def test_main(gpus_per_trial=1):
    data_dir = os.path.abspath("../playground/data")
    load_data(data_dir)  # Download data for all trials before starting the run

    config = {
        "lr": 1e-2,
        "conv1_l2": 4e-3,
        "conv2_l2": 4e-3,
        "conv3_l2": 4e-3,
        "fc1_l2": 1,
        "lr_reductions": 2,
        "rnorm_scale": 0.00005,
        "rnorm_power": 0.01,
        "max_num_epochs": 200,
        "batch_size": 128,
        "steps_per_epoch": 1000,
        "data_dir": data_dir
    }

    scheduler = ASHAScheduler(
        max_t=config["max_num_epochs"],
        grace_period=1,
        reduction_factor=4)

    result = tune.run(
        train_cifar,
        resources_per_trial={
            "cpu": 2,
            "gpu": gpus_per_trial
        },
        config={
            **config,
            "wandb": {
                "project": "SoTL_Cifar",
                "api_key_file": "~" + os.sep + ".wandb" + os.sep + "nas_key.txt"
            }
        },
        metric="loss",
        mode="min",
        num_samples=1,
        scheduler=scheduler,
        loggers=DEFAULT_LOGGERS + (WandbLogger, ),
    )

    train_cifar(config)


# if __name__ == "__main__":
#     fire.Fire(main)
def _tune(
        model,
        train_set: Dataset,
        val_set: Dataset,
        dim: int,
        config: dict,
        EPOCHS: int = 300,
        n_gpu=1,
        n_samples=20,
        model_name="model",
):
    scheduler = ASHAScheduler(max_t=EPOCHS, grace_period=1, reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=["k", "lr", "batch_size", "loss_fn"],
        metric_columns=["loss", "training_iteration"],
        max_error_rows=5,
        max_progress_rows=5,
        max_report_frequency=10)

    analysis = tune.run(
        tune.with_parameters(
            train,
            model=model,
            dim=dim,
            train_set=train_set,
            val_set=val_set,
            num_epochs=EPOCHS,
            num_gpus=n_gpu,
            model_name=model_name),
        resources_per_trial={
            "cpu": 1,
            "gpu": n_gpu
        },
        metric="loss",
        mode="min",
        config=config,
        num_samples=n_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name=model_name,
        verbose=False)

    print("-" * 70)
    print("Done")
    print("Best hyperparameters found were: ", analysis.best_config)
    print("Best achieved loss was: ", analysis.best_result)
    print("-" * 70)
def main():
    parser = argparse.ArgumentParser(description='Train a strategy using CFR')
    parser.add_argument('-in', '--input', type=str, help='the data')
    parser.add_argument('-out', '--output', type=str,
                        help='where to store the model')
    args = parser.parse_args()

    config = {
        "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "l3": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "lr": tune.loguniform(1e-4, 1e-1),
    }

    ray.init(num_cpus=1, include_dashboard=False, local_mode=True)
    scheduler = ASHAScheduler(metric='mean_accuracy', mode='max')
    analysis = tune.run(
        train_figgie,
        num_samples=4,
        scheduler=scheduler,
        config=config)
def tune_mnist_asha(num_samples=10,
                    num_epochs=50,
                    gpus_per_trial=0,
                    cpus_per_trial=4):
    data_dir = os.path.join(tempfile.gettempdir(), "mnist_data_")
    LightningMNISTClassifier.download_data(data_dir)

    config = {
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "layer_3_size": tune.choice([128, 256, 512]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([32, 64, 128]),
    }

    scheduler = ASHAScheduler(
        max_t=num_epochs,
        grace_period=1,
        reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=[
            "layer_1_size", "layer_2_size", "layer_3_size", "lr", "batch_size"
        ],
        metric_columns=["loss", "mean_accuracy", "training_iteration"])

    analysis = tune.run(
        tune.with_parameters(
            train_mnist_tune,
            data_dir=data_dir,
            num_epochs=num_epochs,
            num_gpus=gpus_per_trial),
        resources_per_trial={
            "cpu": cpus_per_trial,
            "gpu": gpus_per_trial
        },
        metric="loss",
        mode="min",
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name="tune_mnist_asha")

    print("Best hyperparameters found were: ", analysis.best_config)

    shutil.rmtree(data_dir)
def ray_schedule():
    gn = 10
    dim = 10
    if not os.path.isfile(f"/home/malattia/Workspace/Tesi/G2G/dataset/gn:{gn}-dim:{dim}-dataset-x") \
            or not os.path.isfile(f"/home/malattia/Workspace/Tesi/G2G/dataset/gn:{gn}-dim:{dim}-dataset-y") \
            or not os.path.isfile(f"/home/malattia/Workspace/Tesi/G2G/dataset/val-gn:{gn}-dim:{dim}-dataset-x") \
            or not os.path.isfile(f"/home/malattia/Workspace/Tesi/G2G/dataset/val-gn:{gn}-dim:{dim}-dataset-y"):
        save_on_hdd(*generate_dataset(gn, dim),
                    path="/home/malattia/Workspace/Tesi/G2G/dataset/",
                    name=f"gn:{gn}-dim:{dim}-dataset")
        save_on_hdd(*generate_dataset(gn, dim),
                    path="/home/malattia/Workspace/Tesi/G2G/dataset/",
                    name=f"val-gn:{gn}-dim:{dim}-dataset")

    max_iter = 50
    search_space = {
        "lr": 0.001,
        "max_iter": max_iter,
        "gn": gn,
        "dim": dim,
        "hidden": tune.sample_from(lambda _: np.random.randint(dim, dim * 3)),
        "k": tune.sample_from(lambda _: np.random.choice(
            (dim - 2) // 2 if dim <= 10 else dim // 4) + 2),
        "dropout": tune.choice([0.2, 0.3, 0.4])
    }

    analysis = tune.run(
        train_tune,
        num_samples=50,
        scheduler=ASHAScheduler(
            metric="mean_accuracy", mode="max", grace_period=1),
        config=search_space,
        verbose=2)

    print("Best config is", analysis.get_best_config(metric="mean_accuracy"))
def run_test(num_samples=2, max_num_epochs=2, gpus_per_trial=1):
    config = {
        "lr": tune.loguniform(1e-4, 1),
        "batch_size": tune.choice([16, 32, 64, 128]),
        "optim": tune.choice(['ADAM', 'SGD']),
        "mm": tune.choice([0.0]),
        "wd": tune.loguniform(1e-5, 5e-3)
    }

    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)
    reporter = CLIReporter(
        metric_columns=["loss", "accuracy", "training_iteration"])

    result = tune.run(
        partial(train, num_epochs=max_num_epochs, device=device),
        resources_per_trial={"gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter)

    best_trial = result.get_best_trial("loss", "min", "last")
    print(f'Best trial config: {best_trial.config}')
    print(f'Best trial final validation loss: {best_trial.last_result["loss"]}')
    print(f'Best trial final validation accuracy: {best_trial.last_result["accuracy"]}')

    best_trained_model = WasteClassifier()
    best_trained_model.to(device)

    best_checkpoint_dir = best_trial.checkpoint.value
    model_state, optimizer_state = torch.load(os.path.join(
        best_checkpoint_dir, "checkpoint"))
    best_trained_model.load_state_dict(model_state)

    dfs = result.trial_dataframes
    return dfs, best_trained_model
def tune_cifar_asha(num_samples=10, num_epochs=15, gpus_per_trial=1):
    data_dir = 'test_tune'
    config = {
        "lr": tune.choice([0.001, 0.01, 0.1]),
        "act_func": "relu",
        "model": "resnet18",
        "opt": "ranger",
        "epochs": num_epochs,
        "momentum": 0.9,
        "weight_decay": 1e-4,
        "batch_size": 64,
        "nworkers": 4
    }

    scheduler = ASHAScheduler(
        max_t=num_epochs,
        grace_period=1,
        reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=[
            "lr", "act_func", "model", "opt", "epochs", "num_classes",
            "momentum", "weight_decay", "batch_size", "nworkers"
        ],
        metric_columns=["loss", "accuracy", "training_iteration"])

    analysis = tune.run(
        tune.with_parameters(
            tune_main,
            num_epochs=num_epochs,
            num_gpus=gpus_per_trial),
        resources_per_trial={
            "cpu": 4,
            "gpu": gpus_per_trial
        },
        metric="loss",
        mode="min",
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name="tune_cifar10_asha")

    print("Best hyperparameters found were: ", analysis.best_config)
def tune_mnist_asha(num_samples=10, num_epochs=10, gpus_per_trial=0):
    data_dir = mkdtemp(prefix="mnist_data_")
    LightningMNISTClassifier.download_data(data_dir)

    config = {
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([32, 64, 128]),
    }

    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=num_epochs,
        grace_period=1,
        reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=["layer_1_size", "layer_2_size", "lr", "batch_size"],
        metric_columns=["loss", "mean_accuracy", "training_iteration"])

    tune.run(
        partial(
            train_mnist_tune,
            data_dir=data_dir,
            num_epochs=num_epochs,
            num_gpus=gpus_per_trial),
        resources_per_trial={
            "cpu": 1,
            "gpu": gpus_per_trial
        },
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name="tune_mnist_asha")

    shutil.rmtree(data_dir)
def tune_model(num_samples=10, num_epochs=10, gpus_per_trial=1):
    # configure the search space
    config = {
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([32, 64, 128]),
    }

    # add the scheduler
    scheduler = ASHAScheduler(
        max_t=num_epochs,
        grace_period=1,
        reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=["lr", "batch_size"],
        metric_columns=["loss", "mean_accuracy", "training_iteration"])

    analysis = tune.run(
        tune.with_parameters(
            train_model_tune,
            data_dir='pruebas_tuning',
            num_epochs=num_epochs,
            num_gpus=gpus_per_trial),
        resources_per_trial={
            "cpu": 1,
            "gpu": gpus_per_trial
        },
        metric="loss",
        mode="min",
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name="tune_mnist_asha")

    tune.run(
        partial(train_model_tune, epochs=10, gpus=0),
        config=config,
        num_samples=10)

    print("Best hyperparameters found were: ", analysis.best_config)

    shutil.rmtree('pruebas_tuning')
def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2):
    config = {
        "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([2, 4, 8, 16])
    }

    scheduler = ASHAScheduler(
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)

    result = tune.run(
        tune.with_parameters(train_cifar),
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        metric="loss",
        mode="min",
        num_samples=num_samples,
        scheduler=scheduler
    )

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))

    if ray.util.client.ray.is_connected():
        # If using Ray Client, we want to make sure checkpoint access
        # happens on the server. So we wrap `test_best_model` in a Ray task.
        # We have to make sure it gets executed on the same node that
        # `tune.run` is called on.
        from ray.util.ml_utils.node import force_on_current_node
        remote_fn = force_on_current_node(ray.remote(test_best_model))
        ray.get(remote_fn.remote(best_trial))
    else:
        test_best_model(best_trial)