Example #1
def tune_mnist_asha(num_samples=10, num_epochs=10, gpus_per_trial=0):
    config = {
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([32, 64, 128]),
    }

    scheduler = ASHAScheduler(
        max_t=num_epochs,
        grace_period=1,
        reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=["layer_1_size", "layer_2_size", "lr", "batch_size"],
        metric_columns=["loss", "mean_accuracy", "training_iteration"])

    analysis = tune.run(
        tune.with_parameters(
            train_mnist_tune,
            num_epochs=num_epochs,
            num_gpus=gpus_per_trial),
        resources_per_trial={
            "cpu": 1,
            "gpu": gpus_per_trial
        },
        metric="loss",
        mode="min",
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name="tune_mnist_asha")

    print("Best hyperparameters found were: ", analysis.best_config)
Example #2
def tune_xgboost():
    search_space = {
        # You can mix constants with search space objects.
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "max_depth": tune.randint(1, 9),
        "min_child_weight": tune.choice([1, 2, 3]),
        "subsample": tune.uniform(0.5, 1.0),
        "eta": tune.loguniform(1e-4, 1e-1)
    }
    # This will enable aggressive early stopping of bad trials.
    scheduler = ASHAScheduler(
        max_t=10,  # 10 training iterations
        grace_period=1,
        reduction_factor=2)

    analysis = tune.run(
        train_breast_cancer,
        metric="eval-logloss",
        mode="min",
        # You can add "gpu": 0.1 to allocate GPUs
        resources_per_trial={"cpu": 1},
        config=search_space,
        num_samples=10,
        scheduler=scheduler)

    return analysis
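Example #2 minimizes "eval-logloss", a metric name XGBoost produces when the validation DMatrix is registered under the name "eval". A rough sketch of what train_breast_cancer could look like, assuming the Ray 1.x XGBoost integration callback and scikit-learn's breast-cancer dataset:

import sklearn.datasets
import xgboost as xgb
from sklearn.model_selection import train_test_split
from ray.tune.integration.xgboost import TuneReportCheckpointCallback

def train_breast_cancer(config):
    data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True)
    train_x, test_x, train_y, test_y = train_test_split(data, labels, test_size=0.25)
    train_set = xgb.DMatrix(train_x, label=train_y)
    test_set = xgb.DMatrix(test_x, label=test_y)
    # Naming the eval set "eval" yields metrics such as "eval-logloss" and
    # "eval-error"; the callback reports them to Tune after every boosting round.
    xgb.train(
        config,
        train_set,
        evals=[(test_set, "eval")],
        verbose_eval=False,
        callbacks=[TuneReportCheckpointCallback(filename="model.xgb")])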
Example #3
def tune_from_existing(start_model,
                       start_config,
                       num_samples=10,
                       num_epochs=10,
                       gpus_per_trial=0.0,
                       day=0):
    data_interface = MNISTDataInterface("/tmp/mnist_data", max_days=10)
    num_examples = data_interface._get_day_slice(
        day) - data_interface._get_day_slice(day - 1)

    config = start_config.copy()
    config.update({
        "batch_size": tune.choice([16, 32, 64]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "momentum": tune.uniform(0.1, 0.9),
    })

    scheduler = ASHAScheduler(
        metric="mean_accuracy",
        mode="max",
        max_t=num_epochs,
        grace_period=1,
        reduction_factor=2,
    )

    reporter = CLIReporter(
        parameter_columns=["lr", "momentum", "batch_size"],
        metric_columns=["mean_accuracy", "training_iteration"],
    )

    analysis = tune.run(
        partial(
            train_mnist,
            start_model=start_model,
            data_fn=data_interface.get_incremental_data,
            num_epochs=num_epochs,
            use_gpus=gpus_per_trial > 0,
            day=day,
        ),
        resources_per_trial={
            "cpu": 1,
            "gpu": gpus_per_trial
        },
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        verbose=0,
        name="tune_serve_mnist_fromsexisting",
    )

    best_trial = analysis.get_best_trial("mean_accuracy", "max", "last")
    best_accuracy = best_trial.metric_analysis["mean_accuracy"]["last"]
    best_trial_config = best_trial.config
    best_checkpoint = best_trial.checkpoint.value

    return best_accuracy, best_trial_config, best_checkpoint, num_examples
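The checkpoint path returned above is only useful once a model is restored from it. A hedged sketch of that step (load_best_model is a hypothetical helper; it mirrors how the other examples in this listing store a (model_state, optimizer_state) tuple in a file named "checkpoint"):

import os
import torch

def load_best_model(start_model, best_checkpoint):
    # Hypothetical helper: restore the weights saved by the winning trial.
    model_state, optimizer_state = torch.load(
        os.path.join(best_checkpoint, "checkpoint"))
    start_model.load_state_dict(model_state)
    return start_model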
Example #4
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument("--X_dir", type=str)
    parser.add_argument("--y_dir", type=str)
    parser.add_argument("--epoch", type=int)
    parser.add_argument("--config_dir", type=str)
    parser.add_argument("--model", type=str)
    parser.add_argument("--n_sample", type=int)

    args = parser.parse_args()

    X_train = torch.load(args.X_dir)
    y_train = torch.load(args.y_dir)

    config = {
        "n_hidden": tune.sample_from(lambda _: 2**np.random.randint(2, 9)),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([16, 32, 64, 128]),
    }

    CL = CustomLoss(1, 2)
    scheduler = ASHAScheduler(metric="loss",
                              mode="min",
                              max_t=args.epoch,
                              grace_period=1,
                              reduction_factor=2)
    reporter = CLIReporter(metric_columns=["loss", "training_iteration"])

    def train_func(config):
        train_model(
            X=X_train,
            y=y_train,
            num_epochs=args.epoch,
            loss_func=CL.custom_loss_1,
            model_name=args.model,
            config=config,
        )

    result = tune.run(
        train_func,
        resources_per_trial={
            "cpu": 2,
            "gpu": 2
        },
        config=config,
        num_samples=args.n_sample,
        scheduler=scheduler,
        progress_reporter=reporter,
    )
    best_trial = result.get_best_trial("loss", "min", "last")

    with open(args.config_dir, "w") as json_file:
        json.dump(best_trial.last_result["config"], json_file)

    last_loss = best_trial.last_result["loss"]
    print(f"Validation Loss of best model was {last_loss}.")
Example #5
def tune4_withLabel(
    model,
    train_set: Dataset,
    val_set: Dataset,
    dims: list,
    config: dict,
    EPOCHS: int = 300,
    extra_feature_len: int = 0,
    extra_feature_len2: int = 0,
    n_gpu=1,
    n_samples=20,
    model_name="model",
):

    dim1, dim2, dim3, dim4 = dims[0], dims[1], dims[2], dims[3]

    scheduler = ASHAScheduler(max_t=EPOCHS, grace_period=1, reduction_factor=2)
    reporter = CLIReporter(
        parameter_columns=["k", "lr", "batch_size", "hidden_dim"],
        metric_columns=["loss", "training_iteration"],
        max_error_rows=5,
        max_progress_rows=5,
        max_report_frequency=10)

    analysis = tune.run(tune.with_parameters(
        train4_withLabel,
        model=model,
        dim1=dim1,
        dim2=dim2,
        dim3=dim3,
        dim4=dim4,
        extra_feature_len=extra_feature_len,
        extra_feature_len2=extra_feature_len2,
        train_set=train_set,
        val_set=val_set,
        num_epochs=EPOCHS,
        num_gpus=n_gpu,
        model_name=model_name),
                        resources_per_trial={
                            "cpu": 1,
                            "gpu": n_gpu
                        },
                        metric="loss",
                        mode="min",
                        config=config,
                        num_samples=n_samples,
                        scheduler=scheduler,
                        progress_reporter=reporter,
                        name=model_name,
                        verbose=False)

    print("-" * 70)
    print("Done")
    print("Best hyperparameters found were: ", analysis.best_config)
    print("Best achieved loss was: ", analysis.best_result)
    print("-" * 70)
Example #6
def start_training(name):
    Epochs = 1000
    Samples = 50
    ModelName = name

    pose_autoencoder = MLP_withLabel.load_checkpoint(
        "/home/nuoc/Documents/MEX/models/MLP4_withLabel_best/M3/0.00324857.512.pbz2"
    )
    # pose_autoencoder = MLP_withLabel.load_checkpoint("/home/nuoc/Documents/MEX/models/MLP_withLabel/0.0013522337.512.pbz2")

    pose_encoder_out_dim = pose_autoencoder.dimensions[-1]

    scheduler = ASHAScheduler(max_t=Epochs,
                              grace_period=15,
                              reduction_factor=2)
    reporter = CLIReporter(
        parameter_columns=["k", "lr", "batch_size", "loss_fn"],
        metric_columns=["loss", "training_iteration"],
        max_error_rows=5,
        max_progress_rows=5,
        max_report_frequency=1)

    analysis = tune.run(tune.with_parameters(
        tuning,
        MODEL=MotionGenerationModel,
        pose_autoencoder=pose_autoencoder,
        cost_dim=cost_dim,
        phase_dim=phase_dim,
        input_slices=[phase_dim, pose_dim, cost_dim],
        output_slices=[phase_dim, phase_dim, pose_encoder_out_dim],
        train_set=train_set,
        val_set=val_set,
        num_epochs=Epochs,
        model_name=ModelName),
                        resources_per_trial={
                            "cpu": 2,
                            "gpu": 1
                        },
                        metric="loss",
                        mode="min",
                        config=config,
                        num_samples=Samples,
                        scheduler=scheduler,
                        progress_reporter=reporter,
                        name=ModelName,
                        verbose=False)

    print("-" * 70)
    print("Done")
    print("Best hyperparameters found were: ", analysis.best_config)
    print("Best achieved loss was: ", analysis.best_result)
    print("-" * 70)

    ray.shutdown()
Example #7
def hyperparameter_tuning_initializer(loss_type='SL', learning_rate_scheduler='CARM'):
    
    # defining the hyperparameters
    if loss_type == 'FL':
        config = {
            'gamma': tune.choice([0.5, 1, 2]),
            'lr': tune.loguniform(1e-4, 1e-3)
        }
    elif loss_type == 'CEDL':
        config = {
            'dice_loss': tune.uniform(0, 3),
            'lr': tune.loguniform(1e-4, 1e-3)
        }
    elif loss_type == 'CEDIL':
        config = {
            'dice_loss': tune.uniform(0, 3),
            'inverse_dice_loss': tune.uniform(0, 3),
            'lr': tune.loguniform(1e-4, 1e-3)
        }
    elif loss_type == 'SL':
        config = {
            'lambda': tune.uniform(0, 1),
            'tau': tune.uniform(0.02, 0.04),
            'lr': tune.loguniform(1e-4, 1e-3)
        }
    elif loss_type == 'VCE':
        config = {
            'var_loss':  tune.uniform(0.5, 5.5),
            'lr': tune.loguniform(1e-4, 1e-3)
        }

    # hyperparameters for learning rate scheduler
    if learning_rate_scheduler == 'CARM':
        config['T_0'] = tune.choice([5, 10, 20, 40, 50])
        config['eta_min_factor'] = tune.loguniform(1e2, 1e4)
    if learning_rate_scheduler == 'SLR':
        config['step_size'] = tune.choice([5, 10, 20, 40, 50])
    if learning_rate_scheduler == 'MLR':
        config['lr_lambda'] = tune.uniform(0.8, 0.99)

        
    # defining the scheduler
    scheduler = ASHAScheduler(
        metric='loss',
        mode='min',
        max_t=conf['max_epochs'] // 20,
        grace_period=1,
        reduction_factor=2
    )
    
    # defining the reporter
    reporter = CLIReporter(metric_columns=['loss', 'avg_dice_coefficient', 'epoch'])
    
    return config, scheduler, reporter
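Example #7 only builds the three objects. A typical call site would unpack them into tune.run, roughly as below; train_fn and the sample counts are assumptions, not part of the original source.

config, scheduler, reporter = hyperparameter_tuning_initializer(
    loss_type='SL', learning_rate_scheduler='CARM')
analysis = tune.run(
    train_fn,                       # assumed training function
    config=config,
    scheduler=scheduler,
    progress_reporter=reporter,
    num_samples=20,
    resources_per_trial={'cpu': 1})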
Example #8
def ray_tune_interface(data_settings, cross_validation_settings,
                       parameter_settings, model_settings):
    cat_features = data_settings['cat_features']
    weights = data_settings['weights']
    data = data_settings['data']
    target = data_settings['target']

    model = model_settings.pop('model', None)

    n_folds = cross_validation_settings['n_folds']
    metric_to_use = cross_validation_settings['metric_to_use']
    mode = cross_validation_settings['mode']

    n_samples = parameter_settings['tune_parameters']['n_samples']
    num_threads = parameter_settings['tune_parameters']['num_threads']
    max_concurrent = parameter_settings['tune_parameters']['max_concurrent']
    space = ({
        k: var_type_sample(k, v, parameter_settings['parameters_types'])
        for k, v in parameter_settings['parameter_range'].items()
    })
    algo = HyperOptSearch(
        space,
        max_concurrent=max_concurrent,
        metric="metric_ave",
        mode=mode,
    )
    scheduler = ASHAScheduler(metric="metric_ave", mode=mode)
    config = {
        'num_samples': n_samples,
        'config': {
            'non_hp': {
                'data': {
                    'X': data,
                    'y': target,
                    'cat_features': cat_features,
                    'weights': weights,
                },
                'modelIns': model,
                'n_folds': n_folds,
                'metric_to_use': metric_to_use,
            }
        }
    }
    config['config'].update(parameter_settings)
    config['config']['model_settings'] = model_settings
    ray_experiment = tune.run(cross_validation_catboost_ray,
                              resources_per_trial={"gpu": 1},
                              search_alg=algo,
                              scheduler=scheduler,
                              keep_checkpoints_num=0,
                              verbose=1,
                              **config)
    results = ray_experiment.dataframe(metric="metric_ave", mode="min")
    results.to_csv('results_experiment.csv')
Example #9
def test_custom_scheduler_default_time_attr():
    try:
        from ray.tune.schedulers import ASHAScheduler
    except ImportError:
        print("skip the test as ray tune cannot be imported.")
        return
    my_scheduler = ASHAScheduler(max_t=10)
    best_config = test_scheduler(scheduler=my_scheduler)
    print(
        "Custom ASHA scheduler (with ASHA default time attr), test error:",
        abs(10 / 2 - best_config["z"] / 2),
    )
Example #10
    def start(self, config: dict = None, **kwargs):
        if config is None:
            raise ValueError("Argument 'config' must not be None")

        # better save results in home dir because paths may become very long
        # possibly exceeding the max path limit if stored in different paths
        results_dir = (os.environ.get("TEST_TMPDIR")
                       or os.environ.get("TUNE_RESULT_DIR")
                       or os.path.expanduser("~/ray_results"))

        tune_log_path = os.path.join(results_dir, self.session_id)
        os.makedirs(tune_log_path, exist_ok=True)
        os.makedirs(self.out_path, exist_ok=True)
        metric = kwargs.get("metric", "mean_accuracy")
        mode = kwargs.get("mode", "max")

        num_gpus = kwargs.get("num_gpus", 1)
        num_cpus = kwargs.get("num_cpus", 1)

        ray.init(include_dashboard=False,
                 local_mode=True,
                 num_gpus=num_gpus,
                 num_cpus=num_cpus)

        tensorboard_url = launch_tensorboard(tune_log_path)
        print(f"TensorBoard launched: {tensorboard_url}.")

        scheduler = kwargs.get("scheduler", None)
        if scheduler is None:
            scheduler = ASHAScheduler(metric, mode=mode)
        analysis = tune.run(self._start,
                            self.session_id,
                            config=config,
                            scheduler=scheduler,
                            checkpoint_freq=20,
                            local_dir=results_dir)
        result = analysis.get_best_trial(metric)
        print("Best trial config: {}".format(result.config))
        print("Best trial final validation loss: {}".format(
            result.last_result["mean_loss"]))
        print("Best trial final validation accuracy: {}".format(
            result.last_result["mean_accuracy"]))

        # TODO search algorithm?
        df = analysis.dataframe(metric, mode)
        df.to_pickle(os.path.join(self.out_path, "results.pkl"))

        try:
            with pd.ExcelWriter(os.path.join(self.out_path,
                                             "results.xlsx")) as writer:
                df.to_excel(writer)
        except ModuleNotFoundError as e:
            print("Failed to write tuning result:", e)
Example #11
def main(num_samples=10, max_num_epochs=50, gpus_per_trial=2):
    data_dir = os.path.abspath("./data")
    checkpoint_dir = os.path.abspath("./checkpoints")
    config = {
        "l1": tune.choice([32, 64, 128, 256]),
        "l2": tune.choice([32, 64, 128, 256]),
        "lr": tune.loguniform(1e-5, 1e-3),
        "batch_size": tune.choice([64, 128, 256])
    }

    scheduler = ASHAScheduler(metric="loss",
                              mode="min",
                              max_t=max_num_epochs,
                              grace_period=5,
                              reduction_factor=2)
    reporter = CLIReporter(
        # parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss", "accuracy", "training_iteration"])

    st_time = time.time()
    result = tune.run(partial(train_regression,
                              checkpoint_dir=checkpoint_dir,
                              data_dir=data_dir),
                      resources_per_trial={
                          "cpu": 2,
                          "gpu": gpus_per_trial
                      },
                      config=config,
                      num_samples=num_samples,
                      scheduler=scheduler,
                      progress_reporter=reporter)

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))

    best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"])
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
    best_trained_model.to(device)

    best_checkpoint_dir = best_trial.checkpoint.value
    model_state, optimizer_state = torch.load(
        os.path.join(best_checkpoint_dir, "checkpoint"))
    best_trained_model.load_state_dict(model_state)

    test_acc = test_accuracy(best_trained_model, device)
    print("Best trial test set accuracy: {}".format(test_acc))
    print("Total Time", st_time - time.time())
Example #12
def main(cmdl):
    base_cfg = namespace_to_dict(read_config(Path(cmdl.cfg) / "default.yaml"))
    search_cfg = namespace_to_dict(read_config(Path(cmdl.cfg) / "search.yaml"))

    print(config_to_string(cmdl))
    print(config_to_string(dict_to_namespace(search_cfg)))

    # the search space
    good_init, search_space = get_search_space(search_cfg)

    search_name = "{timestep}_tune_{experiment_name}{dev}".format(
        timestep="{:%Y%b%d-%H%M%S}".format(datetime.now()),
        experiment_name=base_cfg["experiment"],
        dev="_dev" if cmdl.dev else "",
    )

    # search algorithm
    hyperopt_search = HyperOptSearch(
        search_space,
        metric="criterion",
        mode="max",
        max_concurrent=cmdl.workers,
        points_to_evaluate=good_init,
    )

    # early stopping
    scheduler = ASHAScheduler(
        time_attr="train_step",
        metric="criterion",
        mode="max",
        max_t=base_cfg["training_steps"],  # max length of the experiment
        grace_period=cmdl.grace_steps,  # stops after 20 logged steps
        brackets=3,  # number of ASHA brackets to use
    )

    analysis = tune.run(
        lambda x: tune_trial(x, base_cfg=base_cfg, get_objective=None),
        name=search_name,
        # config=search_space,
        search_alg=hyperopt_search,
        scheduler=scheduler,
        local_dir="./results",
        num_samples=cmdl.trials,
        trial_name_creator=trial2string,
        resources_per_trial={"cpu": 3},
    )

    dfs = analysis.trial_dataframes
    for i, (key, df) in enumerate(dfs.items()):
        print("saving: ", key)
        df.to_pickle(f"{key}/trial_df.pkl")
Example #13
def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2):
    data_dir = os.path.abspath("./data")
    load_data(data_dir)
    config = {
        "l1": tune.sample_from(lambda _: 2**np.random.randint(2, 9)),
        "l2": tune.sample_from(lambda _: 2**np.random.randint(2, 9)),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([2, 4, 8, 16])
    }
    scheduler = ASHAScheduler(metric="loss",
                              mode="min",
                              max_t=max_num_epochs,
                              grace_period=1,
                              reduction_factor=2)
    reporter = CLIReporter(
        # parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss", "accuracy", "training_iteration"])
    result = tune.run(partial(train_cifar, data_dir=data_dir),
                      resources_per_trial={
                          "cpu": 2,
                          "gpu": gpus_per_trial
                      },
                      config=config,
                      num_samples=num_samples,
                      scheduler=scheduler,
                      progress_reporter=reporter)

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))

    best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"])
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if gpus_per_trial > 1:
            best_trained_model = nn.DataParallel(best_trained_model)
    best_trained_model.to(device)

    best_checkpoint_dir = best_trial.checkpoint.value
    model_state, optimizer_state = torch.load(
        os.path.join(best_checkpoint_dir, "checkpoint"))
    best_trained_model.load_state_dict(model_state)

    test_acc = test_accuracy(best_trained_model, device)
    print("Best trial test set accuracy: {}".format(test_acc))
Example #14
def main():
    logging.basicConfig(level=logging.INFO)

    # Raylib parameters
    num_samples = 10
    envname = 'AdversarialAntBulletEnv-v0'
    trainingconfig = Path.cwd() / 'trainingconfig.json'
    evaluate_mean_n = 1000  # Number of timesteps over which to evaluate the mean reward
    name_fmt = 'million-bucks_{adv_force}'

    config = {
        # TODO: sample from control once, then different adversarial strengths
        # Range is centered on the force that achieves the closest reward to the control (7.5)
        "adv_force": tune.qrandn(7.5, 2.5, 0.1),
    }

    # https://docs.ray.io/en/master/tune/tutorials/overview.html#which-search-algorithm-scheduler-should-i-choose
    # Use BOHB for larger problems with a small number of hyperparameters
    # search = TuneBOHB(max_concurrent=4, metric="mean_loss", mode="min")
    # sched = HyperBandForBOHB(
    #     time_attr="training_iteration",
    #     max_t=100,
    # )

    # Implicitly use random search if search algo is not specified
    sched = ASHAScheduler(
        time_attr='training_iteration',
        max_t=100,
        grace_period=1,  # Unit is iterations, not timesteps. TODO configure
    )

    # Pass in a Trainable class or function to tune.run.
    local_dir = str(Path.cwd() / "ray")
    logging.info(f'{local_dir=}')
    anal = tune.run(tune.with_parameters(trainable,
                                         envname=envname,
                                         trainingconfig=trainingconfig,
                                         evaluate_mean_n=evaluate_mean_n,
                                         name_fmt=name_fmt),
                    config=config,
                    num_samples=num_samples,
                    scheduler=sched,
                    local_dir=local_dir,
                    metric="robustness",
                    mode="max",
                    log_to_file=True)
    logging.info(f'best config: {anal.best_config}')
    logging.info(f'best result: {anal.best_result}')
Example #15
def main(args):
    device = config_cuda(args.use_cuda)

    data_pth = Path(Path.cwd() / args.data_path)
    out_pth = Path(Path.cwd() / args.out_path)

    train_set, valid_set, test_set = get_datasets(data_pth)

    config = {
        'eta': tune.loguniform(1e-5, 1e-1),
        'batch_size': tune.choice([2, 4, 8, 16])
    }
    scheduler = ASHAScheduler(
        metric='loss',
        mode='min',
        max_t=args.max_epochs,
        grace_period=1,
        reduction_factor=2
    )
    reporter = CLIReporter(
        parameter_columns=['eta', 'batch_size'],
        metric_columns=['loss', 'accuracy', 'training_iteration']
    )
    num_gpus = 1 if device == 'cuda' else 0
    result = tune.run(
        partial(train_model, model_name=args.model, device=device,
                max_epochs=args.max_epochs, num_workers=args.num_workers,
                data_pth=data_pth),
        resources_per_trial={'cpu': args.num_workers, 'gpu': num_gpus},
        config=config,
        num_samples=args.num_samples,
        scheduler=scheduler,
        progress_reporter=reporter
    )
    best_trial = result.get_best_trial('loss', 'min', 'last')
    print(f'Best trial config {best_trial.config}')
    print(
          f'Best trial final validation loss: {best_trial.last_result["loss"]}'
         )
    model = load_model(args.model, len(train_set.classes))

    best_checkpoint_dir = best_trial.checkpoint.value
    model_state, optimizer_state = torch.load(os.path.join(
        best_checkpoint_dir, 'checkpoint'))
    model.load_state_dict(model_state)

    test_acc = test_model(model, device, test_set, out_pth, args.num_workers)
    print(f'Best trial test set accuracy: {test_acc}')
Example #16
def setup_tune_scheduler():
    search_space = workload.create_search_space()

    scheduler = ASHAScheduler(
        # set a large max_t such that ASHA will always promote to the next rung,
        # until something reaches the target accuracy
        max_t=int(1000),
        reduction_factor=3,
        **workload.exp_metric(),
    )
    return dict(
        search_alg=VariantGenerator(),
        scheduler=scheduler,
        config=search_space,
        resources_per_trial=com.detect_baseline_resource(),
    )
Example #17
def tune_model(data):
    logfile = open("/tmp/ray/session_latest/custom.log", "w")

    def write(msg):
        logfile.write(f"{msg}\n")
        logfile.flush()

    search_space = {
        # You can mix constants with search space objects.
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "max_depth": tune.randint(1, 9),
        "min_child_weight": tune.choice([1, 2, 3]),
        "subsample": tune.uniform(0.5, 1.0),
        "eta": tune.loguniform(1e-4, 1e-1),
    }

    print("enabling aggressive early stopping of bad trials")
    # This will enable aggressive early stopping of bad trials.
    scheduler = ASHAScheduler(
        max_t=4,  # 4 training iterations
        grace_period=1,
        reduction_factor=2
    )

    print("Tuning")

    analysis = tune.run(
        tune.with_parameters(
            train_model,
            data=data
            # checkpoint_dir=f"{LOCAL_DIR}/checkpoints",
            # data_dir=f"{LOCAL_DIR}/data",
        ),
        metric="eval-logloss",
        mode="min",
        local_dir=LOCAL_DIR,
        # You can add "gpu": 0.1 to allocate GPUs
        resources_per_trial=RAY_PARAMS.get_tune_resources(),
        config=search_space,
        num_samples=4,
        scheduler=scheduler,
    )

    print("Done Tuning")

    return analysis
Example #18
def tunerTrain():
    ray.init(_memory=4000000000, num_cpus=5)
    searchSpace = {
        'lr': tune.loguniform(1e-4, 9e-1),
        'finalOutput': tune.randint(2, 50),  # minimum of 2, since otherwise 1 // 2 = 0 activation maps
        'stride1': tune.grid_search(np.arange(1, 4).tolist()),
        'stride2': tune.grid_search(np.arange(1, 4).tolist()),
        'batchSize': tune.grid_search([2, 4, 8, 16, 32, 64, 128, 256]),
        'finalChannel': tune.randint(1, 50),
    }

    analysis = tune.run(train, num_samples=1, scheduler=ASHAScheduler(metric='score', mode='max'),
                        config=searchSpace)
    print(f"Best Config: {analysis.get_all_configs(metric='score', mode='max')}")
    df = analysis.results_df
    logdir = analysis.get_best_logdir("mean_accuracy", mode="max")
    print(f"dir of best: {logdir}")
Example #19
def main(args, reproducible: bool):
    if reproducible:
        seed_everything(42)

    datamodule = TwoDomainMMEDM(dataPath=args.dataPath,
                                augment=True,
                                batch_size=32,
                                num_workers=8)

    config = {
        "log_lr": tune.uniform(-4, -2),
        "log_lrRatio": tune.uniform(-3, 0),
        "log_decay": tune.uniform(-8, -1),
    }

    search_alg = BayesOptSearch(
        metric='mean_iou',
        mode='max',
    )

    scheduler = ASHAScheduler(grace_period=25, )

    reporter = CLIReporter(
        parameter_columns=["log_lr", "log_lrRatio", "log_decay"],
        metric_columns=["loss", "mean_iou", "training_iteration"])

    analysis = tune.run(tune.with_parameters(
        trainWithTune,
        datamodule=datamodule,
        num_epochs=175,
        num_gpus=1,
    ),
                        resources_per_trial={
                            "cpu": 5,
                            "gpu": 0.5,
                        },
                        metric="mean_iou",
                        mode="max",
                        config=config,
                        num_samples=20,
                        scheduler=scheduler,
                        search_alg=search_alg,
                        progress_reporter=reporter,
                        name="tune_minimax_segmenter")

    print("Best hyperparameters found were: ", analysis.best_config)
Example #20
def optimize():
    name_dir = os.path.join('saved', 'hyper-lstm')
    hyperparam_config = {
        'name': 'lstm',
        'num_hidden': tune.sample_from(lambda _: np.random.randint(1, 10)),
        'num_layers': tune.sample_from(lambda _: np.random.randint(1, 5)),
        'opt': tune.choice(['adam', 'sgd', 'adamw', 'lbfgs']),
        'lr': tune.loguniform(1e-10, 1),
        'epoch': tune.sample_from(lambda _: np.random.randint(5, 25)),
        'beta_1': tune.loguniform(1e-8, 1e-2),
        'beta_2': tune.loguniform(1e-8, 1e-2),
        'weight_decay': tune.loguniform(1e-8, 1e-2),
        'max_iter': tune.sample_from(lambda _: np.random.randint(10, 100)),
        'momentum': tune.uniform(0.5, 0.9),
        'patience': tune.sample_from(lambda _: np.random.randint(5, 25)),
        'batch_size': 16
    }
    if not os.path.isdir(name_dir):
        os.mkdir(name_dir)
    scheduler = ASHAScheduler(metric='accuracy',
                              mode='max',
                              max_t=25,
                              grace_period=1,
                              reduction_factor=2)
    reporter = CLIReporter(metric_columns=["loss", "accuracy"])
    result = tune.run(partial(train,
                              checkpoint_dir=name_dir,
                              cwd=os.getcwd(),
                              tuning=True),
                      resources_per_trial={
                          "cpu": 1,
                          "gpu": 0.5
                      },
                      config=hyperparam_config,
                      num_samples=200,
                      scheduler=scheduler,
                      progress_reporter=reporter)
    best_trial = result.get_best_trial("accuracy", "max", "last")

    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))
    print("Best Checkpoint Dir: " + str(best_trial.checkpoint.value))
    return best_trial.config
Example #21
def test_main(gpus_per_trial=1):
    data_dir = os.path.abspath("../playground/data")
    load_data(data_dir)  # Download data for all trials before starting the run
    config = {
        "lr": 1e-2,
        "conv1_l2": 4e-3,
        "conv2_l2": 4e-3,
        "conv3_l2": 4e-3,
        "fc1_l2": 1,
        "lr_reductions": 2,
        "rnorm_scale": 0.00005,
        "rnorm_power": 0.01,
        "max_num_epochs": 200,
        "batch_size": 128,
        "steps_per_epoch": 1000,
        "data_dir": data_dir
    }
    scheduler = ASHAScheduler(max_t=config["max_num_epochs"],
                              grace_period=1,
                              reduction_factor=4)

    result = tune.run(
        train_cifar,
        resources_per_trial={
            "cpu": 2,
            "gpu": gpus_per_trial
        },
        config={
            **config, "wandb": {
                "project": "SoTL_Cifar",
                "api_key_file":
                "~" + os.sep + ".wandb" + os.sep + "nas_key.txt"
            }
        },
        metric="loss",
        mode="min",
        num_samples=1,
        scheduler=scheduler,
        loggers=DEFAULT_LOGGERS + (WandbLogger, ),
    )
    train_cifar(config)


# if __name__ == "__main__":
#     fire.Fire(main)
Example #22
def _tune(
    model,
    train_set: Dataset,
    val_set: Dataset,
    dim: int,
    config: dict,
    EPOCHS: int = 300,
    n_gpu=1,
    n_samples=20,
    model_name="model",
):

    scheduler = ASHAScheduler(max_t=EPOCHS, grace_period=1, reduction_factor=2)
    reporter = CLIReporter(
        parameter_columns=["k", "lr", "batch_size", "loss_fn"],
        metric_columns=["loss", "training_iteration"],
        max_error_rows=5,
        max_progress_rows=5,
        max_report_frequency=10)
    analysis = tune.run(tune.with_parameters(train,
                                             model=model,
                                             dim=dim,
                                             train_set=train_set,
                                             val_set=val_set,
                                             num_epochs=EPOCHS,
                                             num_gpus=n_gpu,
                                             model_name=model_name),
                        resources_per_trial={
                            "cpu": 1,
                            "gpu": n_gpu
                        },
                        metric="loss",
                        mode="min",
                        config=config,
                        num_samples=n_samples,
                        scheduler=scheduler,
                        progress_reporter=reporter,
                        name=model_name,
                        verbose=False)

    print("-" * 70)
    print("Done")
    print("Best hyperparameters found were: ", analysis.best_config)
    print("Best achieved loss was: ", analysis.best_result)
    print("-" * 70)
Example #23
def main():
    parser = argparse.ArgumentParser(description='Train a strategy using CFR')
    parser.add_argument('-in', '--input', type=str, help='the data')
    parser.add_argument('-out', '--output', type=str, help='where to store the model')
    args = parser.parse_args()

    config = {
        "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "l3": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "lr": tune.loguniform(1e-4, 1e-1),
    }
    ray.init(num_cpus=1, include_dashboard=False, local_mode=True)
    scheduler = ASHAScheduler(metric='mean_accuracy', mode='max')
    analysis = tune.run(train_figgie,
                        num_samples=4,
                        scheduler=scheduler,
                        config=config)
Example #24
def tune_mnist_asha(num_samples=10,
                    num_epochs=50,
                    gpus_per_trial=0,
                    cpus_per_trial=4):
    data_dir = os.path.join(tempfile.gettempdir(), "mnist_data_")
    LightningMNISTClassifier.download_data(data_dir)

    config = {
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "layer_3_size": tune.choice([128, 256, 512]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([32, 64, 128]),
    }

    scheduler = ASHAScheduler(max_t=num_epochs,
                              grace_period=1,
                              reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=[
            "layer_1_size", "layer_2_size", "layer_3_size", "lr", "batch_size"
        ],
        metric_columns=["loss", "mean_accuracy", "training_iteration"])

    analysis = tune.run(tune.with_parameters(train_mnist_tune,
                                             data_dir=data_dir,
                                             num_epochs=num_epochs,
                                             num_gpus=gpus_per_trial),
                        resources_per_trial={
                            "cpu": cpus_per_trial,
                            "gpu": gpus_per_trial
                        },
                        metric="loss",
                        mode="min",
                        config=config,
                        num_samples=num_samples,
                        scheduler=scheduler,
                        progress_reporter=reporter,
                        name="tune_mnist_asha")

    print("Best hyperparameters found were: ", analysis.best_config)

    shutil.rmtree(data_dir)
Example #25
def ray_schedule():
    gn = 10
    dim = 10

    if not os.path.isfile(f"/home/malattia/Workspace/Tesi/G2G/dataset/gn:{gn}-dim:{dim}-dataset-x") \
            or not os.path.isfile(f"/home/malattia/Workspace/Tesi/G2G/dataset/gn:{gn}-dim:{dim}-dataset-y") \
            or not os.path.isfile(f"/home/malattia/Workspace/Tesi/G2G/dataset/val-gn:{gn}-dim:{dim}-dataset-x") \
            or not os.path.isfile(f"/home/malattia/Workspace/Tesi/G2G/dataset/val-gn:{gn}-dim:{dim}-dataset-y"):
        save_on_hdd(*generate_dataset(gn, dim),
                    path=f"/home/malattia/Workspace/Tesi/G2G/dataset/",
                    name=f"gn:{gn}-dim:{dim}-dataset")
        save_on_hdd(*generate_dataset(gn, dim),
                    path=f"/home/malattia/Workspace/Tesi/G2G/dataset/",
                    name=f"val-gn:{gn}-dim:{dim}-dataset")

    max_iter = 50
    search_space = {
        "lr": 0.001,
        "max_iter": max_iter,
        "gn": gn,
        "dim": dim,
        "hidden": tune.sample_from(lambda _: np.random.randint(dim, dim * 3)),
        "k": tune.sample_from(lambda _: np.random.choice(
            (dim - 2) // 2 if dim <= 10 else dim // 4) + 2),
        "dropout": tune.choice([0.2, 0.3, 0.4])
    }

    analysis = tune.run(train_tune,
                        num_samples=50,
                        scheduler=ASHAScheduler(metric="mean_accuracy",
                                                mode="max",
                                                grace_period=1),
                        config=search_space,
                        verbose=2)

    print("Best config is", analysis.get_best_config(metric="mean_accuracy"))
Example #26
def run_test(num_samples=2, max_num_epochs=2, gpus_per_trial=1):
    config = {
        "lr": tune.loguniform(1e-4, 1),
        "batch_size": tune.choice([16, 32, 64, 128]),
        "optim": tune.choice(['ADAM', 'SGD']),
        "mm": tune.choice([0.0]),
        "wd": tune.loguniform(1e-5, 5e-3)
    }

    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)
    
    reporter = CLIReporter(metric_columns=["loss", "accuracy", "training_iteration"])
    
    result = tune.run(
        partial(train, num_epochs=max_num_epochs, device=device),
        resources_per_trial={"gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter)

    best_trial = result.get_best_trial("loss", "min", "last")
    print(f'Best trial config: {best_trial.config}')
    print(f'Best trial final validation loss: {best_trial.last_result["loss"]}')
    print(f'Best trial final validation accuracy: {best_trial.last_result["accuracy"]}')

    best_trained_model = WasteClassifier()
    best_trained_model.to(device)

    best_checkpoint_dir = best_trial.checkpoint.value
    model_state, optimizer_state = torch.load(os.path.join(
        best_checkpoint_dir, "checkpoint"))
    best_trained_model.load_state_dict(model_state)

    dfs = result.trial_dataframes
    return dfs, best_trained_model
Example #27
def tune_cifar_asha(num_samples=10, num_epochs=15, gpus_per_trial=1):
    data_dir = 'test_tune'

    config = {"lr": tune.choice([0.001, 0.01, 0.1]), 
              "act_func": "relu",
              "model": "resnet18",
              "opt": "ranger",
              "epochs": num_epochs,
              "momentum": 0.9,
              "weight_decay": 1e-4,
              "batch_size": 64,
              "nworkers": 4}

    scheduler = ASHAScheduler(
        max_t=num_epochs,
        grace_period=1,
        reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=["lr", "act_func", "model", "opt", "epochs", "num_classes",
                            "momentum", "weight_decay", "batch_size", "nworkers"],
        metric_columns=["loss", "accuracy", "training_iteration"])

    analysis = tune.run(
        tune.with_parameters(
            tune_main,
            num_epochs=num_epochs,
            num_gpus=gpus_per_trial),
        resources_per_trial={
            "cpu": 4,
            "gpu": gpus_per_trial
        },
        metric="loss",
        mode="min",
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name="tune_cifar10_asha")

    print("Best hyperparameters found were: ", analysis.best_config)
Example #28
def tune_mnist_asha(num_samples=10, num_epochs=10, gpus_per_trial=0):
    data_dir = mkdtemp(prefix="mnist_data_")
    LightningMNISTClassifier.download_data(data_dir)

    config = {
        "layer_1_size": tune.choice([32, 64, 128]),
        "layer_2_size": tune.choice([64, 128, 256]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([32, 64, 128]),
    }

    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=num_epochs,
        grace_period=1,
        reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=["layer_1_size", "layer_2_size", "lr", "batch_size"],
        metric_columns=["loss", "mean_accuracy", "training_iteration"])

    tune.run(
        partial(
            train_mnist_tune,
            data_dir=data_dir,
            num_epochs=num_epochs,
            num_gpus=gpus_per_trial),
        resources_per_trial={
            "cpu": 1,
            "gpu": gpus_per_trial
        },
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        name="tune_mnist_asha")

    shutil.rmtree(data_dir)
Example #29
def tune_model(num_samples=10, num_epochs=10, gpus_per_trial=1):
    # config the parameters
    config = {
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([32, 64, 128]),
    }

    # add the scheduler
    scheduler = ASHAScheduler(max_t=num_epochs,
                              grace_period=1,
                              reduction_factor=2)

    reporter = CLIReporter(
        parameter_columns=["lr", "batch_size"],
        metric_columns=["loss", "mean_accuracy", "training_iteration"])

    analysis = tune.run(tune.with_parameters(train_model_tune,
                                             data_dir='pruebas_tuning',
                                             num_epochs=num_epochs,
                                             num_gpus=gpus_per_trial),
                        resources_per_trial={
                            "cpu": 1,
                            "gpu": gpus_per_trial
                        },
                        metric="loss",
                        mode="min",
                        config=config,
                        num_samples=num_samples,
                        scheduler=scheduler,
                        progress_reporter=reporter,
                        name="tune_mnist_asha")

    tune.run(partial(train_model_tune, epochs=10, gpus=0),
             config=config,
             num_samples=10)

    print("Best hyperparameters found were: ", analysis.best_config)

    shutil.rmtree('pruebas_tuning')
Example #30
def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2):
    config = {
        "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([2, 4, 8, 16])
    }
    scheduler = ASHAScheduler(
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)
    result = tune.run(
        tune.with_parameters(train_cifar),
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        metric="loss",
        mode="min",
        num_samples=num_samples,
        scheduler=scheduler
    )

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))

    if ray.util.client.ray.is_connected():
        # If using Ray Client, we want to make sure checkpoint access
        # happens on the server. So we wrap `test_best_model` in a Ray task.
        # We have to make sure it gets executed on the same node that
        # ``tune.run`` is called on.
        from ray.util.ml_utils.node import force_on_current_node
        remote_fn = force_on_current_node(ray.remote(test_best_model))
        ray.get(remote_fn.remote(best_trial))
    else:
        test_best_model(best_trial)
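test_best_model is not shown in Example #30. A plausible sketch is given below, reusing the checkpoint-loading pattern seen elsewhere in this listing; Net, test_accuracy, torch, and os are assumed to come from the same source file and are not defined here.

def test_best_model(best_trial):
    # Hypothetical reconstruction: rebuild the model from the winning config,
    # load its last checkpoint, and evaluate it on the held-out test set.
    best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"])
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    best_trained_model.to(device)

    checkpoint_path = os.path.join(best_trial.checkpoint.value, "checkpoint")
    model_state, optimizer_state = torch.load(checkpoint_path)
    best_trained_model.load_state_dict(model_state)

    test_acc = test_accuracy(best_trained_model, device)
    print("Best trial test set accuracy: {}".format(test_acc))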